In [5]:
import pandas as pd
import numpy as np
import json
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

print("Libraries loaded successfully")


Libraries loaded successfully


In [15]:
df_clean = pd.read_csv('../data/processed/clean_data_final.csv',
                       parse_dates=['Invoice'])

print(f"Dataset Shape: {df_clean.shape}")
df_clean.head()



Dataset Shape: (392733, 8)


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [17]:
df_clean.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392733 entries, 0 to 392732
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Invoice      392733 non-null  object 
 1   StockCode    392733 non-null  object 
 2   Description  392733 non-null  object 
 3   Quantity     392733 non-null  int64  
 4   InvoiceDate  392733 non-null  object 
 5   Price        392733 non-null  float64
 6   Customer ID  392733 non-null  float64
 7   Country      392733 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 24.0+ MB


In [27]:
# --- Fix CustomerID column ---

# 1. Standardize column name
df_clean.rename(columns={'Customer ID': 'CustomerID'}, inplace=True)

# 2. Convert to integer (only safe if no missing values)
df_clean['CustomerID'] = df_clean['CustomerID'].astype(int)

print(df_clean[['CustomerID']].head())
print(df_clean['CustomerID'].dtype)


   CustomerID
0       17850
1       17850
2       17850
3       17850
4       17850
int32


In [47]:
# --- Create TotalPrice if missing ---
if 'Price' not in df_clean.columns:
    df_clean['TotalPrice'] = df_clean['Quantity'] * df_clean['Price']
    print("‚ÑπÔ∏è TotalPrice column created.")
else:
    print("‚úì TotalPrice already exists.")


‚úì TotalPrice already exists.


In [61]:
print("Running validation checks...")

# CHECK 1 ‚Äî no missing values
assert df_clean.isnull().sum().sum() == 0, "‚ùå Missing values found!"

# CHECK 2 ‚Äî positive quantities
assert (df_clean['Quantity'] > 0).all(), "‚ùå Negative quantities found!"

# CHECK 3 ‚Äî positive prices
assert (df_clean['Price'] >= 0).all(), "‚ùå Invalid prices found!"

# CHECK 4 ‚Äî CustomerID must be int
assert str(df_clean['CustomerID'].dtype).startswith('int'), "‚ùå CustomerID not integer!"

# CHECK 5 ‚Äî TotalPrice exists
assert 'TotalPrice' in df_clean.columns, "‚ùå TotalPrice column missing!"

# CHECK 6 ‚Äî Invoice column exists
assert 'Invoice' in df_clean.columns, "‚ùå Invoice column missing!"

# CHECK 7 ‚Äî Revenue positive
assert (df_clean['TotalPrice'] >= 0).all(), "‚ùå Invalid revenue detected!"

print("‚úÖ All validation checks passed!")


Running validation checks...
‚úÖ All validation checks passed!


In [65]:
df_clean.to_csv('../data/processed/cleaned_transactions.csv', index=False)
print("üíæ Cleaned dataset updated")


üíæ Cleaned dataset updated


In [67]:
validation_report = {
    'total_rows': len(df_clean),
    'total_columns': len(df_clean.columns),
    'date_range': f"{df_clean['InvoiceDate'].min()} to {df_clean['InvoiceDate'].max()}",
    'unique_customers': df_clean['CustomerID'].nunique(),
    'unique_products': df_clean['StockCode'].nunique(),
    'unique_countries': df_clean['Country'].nunique(),
    'total_revenue': float(df_clean['TotalPrice'].sum()),
    'average_order_value': float(df_clean.groupby('Invoice')['TotalPrice'].sum().mean()),
    'validation_passed': True
}


In [69]:
import json

with open('../data/processed/validation_report.json', 'w') as f:
    json.dump(validation_report, f, indent=4, default=str)

print("üìÑ Validation report saved successfully")


üìÑ Validation report saved successfully
