In [10]:
import pandas as pd
import json

# Maps the raw export's column names onto the project's standard names.
# Columns that already carry the standard name are left untouched by rename().
column_mapping = {
    'Invoice': 'InvoiceNo',
    'Price': 'UnitPrice',
    'Customer ID': 'CustomerID',
}


def standardize_columns(df):
    """Return a copy of *df* with standard column names and derived fields.

    Steps, in order:
      1. Rename raw column names using ``column_mapping``.
      2. Add ``TotalPrice`` (``Quantity * UnitPrice``) if it is absent.
      3. Cast ``CustomerID`` to integer.

    The cast happens *after* the rename so that a file whose header is still
    ``'Customer ID'`` is handled instead of failing on a missing column.

    Raises:
        KeyError: a required column is missing even after renaming.
        ValueError: ``CustomerID`` cannot be cast to int (e.g. it holds NaN).
    """
    standardized = df.rename(columns=column_mapping)

    if 'TotalPrice' not in standardized.columns:
        standardized['TotalPrice'] = (
            standardized['Quantity'] * standardized['UnitPrice']
        )

    standardized['CustomerID'] = standardized['CustomerID'].astype(int)
    return standardized


def run_validation_checks(df):
    """Run the four data-quality checks on *df*, printing each pass.

    Explicit ``raise`` is used instead of ``assert`` so the checks still run
    when Python is started with ``-O`` (which strips assert statements).

    Raises:
        ValueError: the first check that fails, with a descriptive message.
        KeyError: a column needed by a check is missing.
    """
    # CHECK 1: No missing values
    missing_sum = df.isnull().sum().sum()
    if missing_sum != 0:
        raise ValueError(f"Missing values found: {missing_sum}")
    print("✅ CHECK 1 Passed: No missing values.")

    # CHECK 2: All quantities positive
    if not (df['Quantity'] > 0).all():
        raise ValueError("Negative quantities found!")
    print("✅ CHECK 2 Passed: All quantities are positive.")

    # CHECK 3: All prices positive
    if not (df['UnitPrice'] > 0).all():
        raise ValueError("Invalid prices found!")
    print("✅ CHECK 3 Passed: All unit prices are positive.")

    # CHECK 4: CustomerID is integer
    if not pd.api.types.is_integer_dtype(df['CustomerID']):
        raise ValueError(
            f"CustomerID is {df['CustomerID'].dtype}, expected int"
        )
    print("✅ CHECK 4 Passed: CustomerID is integer.")


def build_validation_report(df):
    """Return a JSON-serializable summary of an already-validated *df*.

    Values are converted to built-in ``int``/``float`` because NumPy scalars
    are not serializable by the ``json`` module.

    Raises:
        KeyError: ``CustomerID``, ``StockCode`` or ``TotalPrice`` is missing.
    """
    return {
        'total_rows': int(len(df)),
        'total_columns': int(len(df.columns)),
        'unique_customers': int(df['CustomerID'].nunique()),
        'unique_products': int(df['StockCode'].nunique()),
        'total_revenue': float(df['TotalPrice'].sum()),
        'validation_passed': True,
    }


# The guard is true when this runs as a notebook cell or a script, so
# df_clean / validation_report / output_path are still defined as globals for
# later cells; it only prevents file I/O when the helpers are imported.
if __name__ == "__main__":
    # 1. Load the data
    df_clean = pd.read_csv('../data/processed/cleaned_transactions.csv')
    print(f"Data Loaded: {len(df_clean)} rows")
    print(f"Original Columns: {df_clean.columns.tolist()}")

    try:
        # 2-3. Standardize column names, add TotalPrice, cast CustomerID
        df_clean = standardize_columns(df_clean)
        print(f"Fixed Columns: {df_clean.columns.tolist()}")

        # 4. RUN VALIDATION CHECKS
        run_validation_checks(df_clean)

        # 5. GENERATE & SAVE REPORT
        validation_report = build_validation_report(df_clean)

        output_path = '../data/processed/validation_report.json'
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(validation_report, f, indent=4)

        print(f"\n✅ SUCCESS: Validation Report saved to {output_path}")
        print(json.dumps(validation_report, indent=4))

    except ValueError as e:
        print(f"\n❌ VALIDATION FAILED: {str(e)}")
    except KeyError as e:
        print(f"\n❌ COLUMN ERROR: Still missing column {e}. Check the renaming step above.")

Data Loaded: 779425 rows
Original Columns: ['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']
Fixed Columns: ['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country', 'TotalPrice']
✅ CHECK 1 Passed: No missing values.
✅ CHECK 2 Passed: All quantities are positive.
✅ CHECK 3 Passed: All unit prices are positive.
✅ CHECK 4 Passed: CustomerID is integer.

✅ SUCCESS: Validation Report saved to ../data/processed/validation_report.json
{
    "total_rows": 779425,
    "total_columns": 9,
    "unique_customers": 5878,
    "unique_products": 4631,
    "total_revenue": 17374804.268,
    "validation_passed": true
}
