## Ensuring Consistency in Multi-source Data Integration

**Description**: Validate the integration of two datasets, `products_A.csv` and `products_B.csv`, by ensuring that each product's "category" value is consistent across both files.

In [2]:
# Write your code from here
import pandas as pd

# Sample records for source A: five products, one category each.
data_A = {
    'product_id': list(range(1, 6)),
    'category': ['Electronics', 'Clothing', 'Furniture', 'Toys', 'Electronics'],
}

# Sample records for source B: same product ids, but ids 2 and 4 carry
# different category labels than in source A.
data_B = {
    'product_id': list(range(1, 6)),
    'category': ['Electronics', 'Apparel', 'Furniture', 'Kids', 'Electronics'],
}

# Materialise both sources as DataFrames.
df_A = pd.DataFrame(data_A)
df_B = pd.DataFrame(data_B)

# Persist each source to its own CSV; index=False keeps the row index
# out of the files so they contain only the two data columns.
df_A.to_csv('products_A.csv', index=False)
df_B.to_csv('products_B.csv', index=False)

print("CSV files 'products_A.csv' and 'products_B.csv' have been created.")


CSV files 'products_A.csv' and 'products_B.csv' have been created.


In [3]:
import pandas as pd

# Load the datasets
def load_product_data(file_path):
    """Read a product CSV and confirm it has the columns needed downstream.

    Args:
        file_path: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame when the file is readable and contains both a
        'product_id' and a 'category' column; otherwise ``None``.

    Errors are not propagated: a missing file, a missing required column,
    or any other exception is reported with ``print`` and results in
    ``None``, so callers must check the return value.
    """
    required_columns = ('product_id', 'category')
    try:
        frame = pd.read_csv(file_path)
        if not all(column in frame.columns for column in required_columns):
            raise ValueError("Both 'product_id' and 'category' columns must be present.")
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
    except ValueError as err:
        # Covers the schema check above as well as ValueErrors raised by read_csv.
        print(f"Error: {err}")
    except Exception as err:
        print(f"An unexpected error occurred: {err}")
    else:
        return frame
    return None

# Validate consistency in product categories between two datasets
def validate_category_consistency(df_A, df_B):
    """Report products whose 'category' differs between two product datasets.

    The two frames are inner-joined on 'product_id', so only products that
    appear in both inputs are compared. A category that is missing in both
    sources is treated as consistent; a category that is missing in exactly
    one source is treated as a discrepancy.

    Args:
        df_A: DataFrame containing 'product_id' and 'category' columns.
        df_B: DataFrame containing 'product_id' and 'category' columns.

    Returns:
        DataFrame with columns 'product_id', 'category_A' and 'category_B'
        holding the inconsistent rows (empty when everything matches). A
        summary is also printed, as before.

    Raises:
        KeyError: If either frame lacks 'product_id' or 'category'.
    """
    # Merge datasets on 'product_id'
    merged_df = pd.merge(df_A[['product_id', 'category']], df_B[['product_id', 'category']],
                         on='product_id', suffixes=('_A', '_B'), how='inner')

    cat_a = merged_df['category_A']
    cat_b = merged_df['category_B']
    a_missing = cat_a.isna()
    b_missing = cat_b.isna()

    # A plain `!=` reports NaN != NaN as a mismatch, which would flag products
    # whose category is absent from BOTH sources. Handle missing values with
    # explicit masks instead of relying on the comparison result for them.
    # fillna/astype normalise the <NA> results nullable dtypes can produce.
    values_differ = (cat_a != cat_b).fillna(False).astype(bool)
    both_present = ~a_missing & ~b_missing
    missing_on_one_side = a_missing ^ b_missing
    mismatch = missing_on_one_side | (both_present & values_differ)

    # Find discrepancies where categories don't match
    discrepancies = merged_df[mismatch]

    if not discrepancies.empty:
        print(f"Found {len(discrepancies)} products with inconsistent categories.")
        print(discrepancies)
    else:
        print("All product categories are consistent across both datasets.")

    return discrepancies

# Entry point: locate the two source files to reconcile.
file_A, file_B = 'products_A.csv', 'products_B.csv'

# Read each source; a failed load yields None instead of raising.
df_A = load_product_data(file_A)
df_B = load_product_data(file_B)

# Run the category comparison only when both sources loaded successfully.
if not (df_A is None or df_B is None):
    validate_category_consistency(df_A, df_B)


Found 2 products with inconsistent categories.
   product_id category_A category_B
1           2   Clothing    Apparel
3           4       Toys       Kids
