# In [None]:
# Ques_4.ipynb — Ensuring Consistency in Multi-source Data Integration

import pandas as pd

def compare_categories(a_df: pd.DataFrame, b_df: pd.DataFrame) -> pd.DataFrame:
    """Join two product tables on ``product_id`` and flag category agreement.

    Both inputs must carry ``product_id`` and ``category`` columns. The join
    uses ``pd.merge``'s default (inner) behaviour, so only product ids present
    in both tables appear in the result.

    Returns:
        The merged frame with ``category_A`` / ``category_B`` columns and a
        boolean ``category_consistent`` column that is True where the two
        category values compare equal.
    """
    # NOTE(review): duplicate product_id values in either source would yield
    # one row per pairing and inflate the counts — confirm ids are unique.
    merged = pd.merge(a_df, b_df, on="product_id", suffixes=('_A', '_B'))
    merged["category_consistent"] = merged["category_A"] == merged["category_B"]
    return merged


def summarize_consistency(merged: pd.DataFrame) -> tuple:
    """Return ``(total, consistent, percent)`` for a frame from ``compare_categories``.

    ``percent`` is reported as 0.0 when there are no matched rows, instead of
    dividing by zero.
    """
    total = len(merged)
    consistent = int(merged["category_consistent"].sum())
    percent = (consistent / total) * 100 if total else 0.0
    return total, consistent, percent


# The guard keeps the file-reading steps out of plain imports; in a notebook
# cell or a direct script run ``__name__`` is "__main__", so everything below
# still executes and the same global names remain available afterwards.
if __name__ == "__main__":
    # Load datasets
    a_df = pd.read_csv("products_A.csv")
    b_df = pd.read_csv("products_B.csv")

    # Display sample data (optional)
    print("Products A Sample:\n", a_df.head())
    print("\nProducts B Sample:\n", b_df.head())

    # Merge on product_id and check consistency in category information
    merged_df = compare_categories(a_df, b_df)

    # Calculate consistency rate (safe when nothing matched)
    total_products, consistent_count, consistency_percent = summarize_consistency(merged_df)

    # Print results
    print(f"\nTotal Matched Products: {total_products}")
    print(f"Consistent Categories: {consistent_count}")
    print(f"Category Consistency: {consistency_percent:.2f}%")

    # Display inconsistent records; build the three-column view once and reuse it
    inconsistencies = merged_df[~merged_df["category_consistent"]]
    inconsistency_report = inconsistencies[["product_id", "category_A", "category_B"]]
    print("\nInconsistent Category Records:\n", inconsistency_report)

    # Optional: Save inconsistent records to CSV
    inconsistency_report.to_csv("category_inconsistencies.csv", index=False)