# 3.0 Data Cleaning

In [2]:
import pandas as pd 
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os

In [3]:
# Location of the merged e-commerce extract that this notebook cleans.
FILE_PATH = '../data/processed/ecommerce_merged_initial.csv'

# Read the single combined CSV; df_combined is kept as the untouched reference.
df_combined = pd.read_csv(FILE_PATH)

# Report what was loaded before any cleaning happens.
print(f"Successfully loaded combined data from: {FILE_PATH}")
print(f"Starting Rows: {len(df_combined)}")

# All cleaning steps below operate on an independent (deep) copy, so the
# loaded frame stays available for before/after comparisons.
df_clean = df_combined.copy(deep=True)

Successfully loaded combined data from: ../data/processed/ecommerce_merged_initial.csv
Starting Rows: 118434


### Standardize Formats (Date Conversion)

In [29]:

# Timestamp columns that should be parsed to datetime64.
date_cols = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
    'shipping_limit_date',
]

# Record how many values are already missing, so that anything turned into NaT
# by errors='coerce' can be reported instead of disappearing silently.
nulls_before = df_clean[date_cols].isna().sum()

for col in date_cols:
    # errors='coerce' replaces unparseable values with NaT rather than raising.
    df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')

coerced_to_nat = int((df_clean[date_cols].isna().sum() - nulls_before).sum())

# Validation Check
print("\n--- Date Conversion Validation ---")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
df_clean[date_cols].info()
print(f"Unparseable values coerced to NaT: {coerced_to_nat}")


--- Date Conversion Validation ---
<class 'pandas.core.frame.DataFrame'>
Index: 113387 entries, 0 to 118433
Data columns (total 6 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_purchase_timestamp       113387 non-null  datetime64[ns]
 1   order_approved_at              113373 non-null  datetime64[ns]
 2   order_delivered_carrier_date   113385 non-null  datetime64[ns]
 3   order_delivered_customer_date  113379 non-null  datetime64[ns]
 4   order_estimated_delivery_date  113387 non-null  datetime64[ns]
 5   shipping_limit_date            113387 non-null  datetime64[ns]
dtypes: datetime64[ns](6)
memory usage: 6.1 MB
None


Successfully converted all date columns to the correct datetime type.

### Handle Missing Values: Filter Unsuccessful Orders (Most Critical)

In [30]:
# Snapshot the row count so the effect of the filter can be reported.
original_count = len(df_clean)

# Keep only rows whose order_status is exactly 'delivered'. The mask is built
# first, then applied with .loc; .copy() makes the result an independent frame.
is_delivered = df_clean['order_status'].eq('delivered')
df_clean = df_clean.loc[is_delivered].copy()

removed_rows = original_count - len(df_clean)

# Validation: rows dropped, rows kept, and the statuses still present.
print(f"\n--- Order Status Filter Validation ---")
print(f"Rows Removed (Incomplete Orders): {removed_rows}")
print(f"Remaining Rows (Delivered): {len(df_clean)}")
print(f"New Order Statuses:\n{df_clean['order_status'].value_counts()}")


--- Order Status Filter Validation ---
Rows Removed (Incomplete Orders): 0
Remaining Rows (Delivered): 113387
New Order Statuses:
order_status
delivered    113387
Name: count, dtype: int64


In [31]:
# Snapshot the row count so the effect of the null drop can be reported.
original_count = len(df_clean)

# A row is unusable for the later calculations if any of these is missing.
critical_cols = [
    'price',
    'freight_value',
    'seller_id',
    'product_category_name_english',
]

# Rebind df_clean to the filtered result instead of mutating it in place.
df_clean = df_clean.dropna(subset=critical_cols)

removed_rows = original_count - len(df_clean)

# Validation: rows dropped, rows kept, and a spot check on the category column.
print(f"\n--- Critical Null Drop Validation ---")
print(f"Rows removed (Missing Price/Category): {removed_rows}")
print(f"Remaining Rows: {len(df_clean)}")
print(f"Missing Categories: {df_clean['product_category_name_english'].isnull().sum()}")


--- Critical Null Drop Validation ---
Rows removed (Missing Price/Category): 0
Remaining Rows: 113387
Missing Categories: 0


Successfully removed rows with missing values in "price", "freight_value", "seller_id", or "product_category_name_english" to ensure accurate calculations.

### Consolidate Payment Duplicates

In [32]:
# Snapshot the row count so the number of consolidated rows can be reported.
original_count = len(df_clean)

# One output row per order line item.
# NOTE(review): groupby drops rows whose key is NaN by default; any such rows
# would be counted as "consolidated" below — confirm the keys are never null.
item_key = ['order_id', 'order_item_id']

# Financial aggregation.
# NOTE(review): payment_value is summed over every row of an item. If payments
# are recorded per order rather than per item, each item of a multi-item order
# would carry the whole order's payment — confirm before summing total_paid
# across items.
agg_spec = {
    'total_paid': ('payment_value', 'sum'),
    'price': ('price', 'mean'),
    'freight_value': ('freight_value', 'mean'),
}

# Identifying/categorical and date/time columns: take the first value in each
# group. Order here determines the column order of df_final.
carried_cols = [
    'customer_unique_id',
    'product_category_name_english',
    'customer_state',
    'seller_id',
    'order_purchase_timestamp',
    'order_approved_at',              # fulfillment metrics
    'order_delivered_carrier_date',   # fulfillment metrics
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
    'shipping_limit_date',            # SLA metrics
]
agg_spec.update({col: (col, 'first') for col in carried_cols})

# Named aggregation: each keyword is an output column mapped to (source, func).
df_final = df_clean.groupby(item_key, as_index=False).agg(**agg_spec)

removed_rows = original_count - len(df_final)

# Validation: rows collapsed, rows remaining, and fully duplicated rows left.
print(f"\n--- Duplicate Aggregation Validation ---")
print(f"Rows Removed (Consolidated Payments): {removed_rows}")
print(f"Final Cleaned Rows: {len(df_final)}")
print(f"Duplicates: {df_final.duplicated().sum()}")


--- Duplicate Aggregation Validation ---
Rows Removed (Consolidated Payments): 4749
Final Cleaned Rows: 108638
Duplicates: 0


* **Aggregate rows** based on the unique transaction key (`order_id` + `order_item_id`) to fix the 5,009 payment duplicates (the run shown above consolidated 4,749 rows).

* **SUM:** `payment_value` to create `total_paid` (since values were split).
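* **FIRST/MEAN:** Used for all other columns since their values were identical across duplicated rows: `mean` for `price` and `freight_value`, `first` for the identifying columns (`customer_unique_id`, `seller_id`, category, state) and the date columns.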

### New Useful Columns (Feature Engineering Prep)

In [33]:
# Transaction revenue: item price plus its freight charge.
df_final['revenue'] = df_final['price'].add(df_final['freight_value'])

# Calendar features, all derived from the purchase timestamp through one
# shared .dt accessor.
purchase_dt = df_final['order_purchase_timestamp'].dt
df_final['order_year_month'] = purchase_dt.to_period('M')  # monthly Period
df_final['order_month'] = purchase_dt.month
df_final['order_year'] = purchase_dt.year

# Validation: eyeball a few rows of the inputs next to the derived columns.
print("\n--- Revenue & Time Feature Validation ---")
print(f"Sample Revenue Check:\n{df_final[['price', 'freight_value', 'revenue']].head()}")
print(f"Sample Time Feature Check:\n{df_final[['order_purchase_timestamp', 'order_year_month']].head()}")


--- Revenue & Time Feature Validation ---
Sample Revenue Check:
    price  freight_value  revenue
0   58.90          13.29    72.19
1  239.90          19.93   259.83
2  199.00          17.87   216.87
3   12.99          12.79    25.78
4  199.90          18.14   218.04
Sample Time Feature Check:
  order_purchase_timestamp order_year_month
0      2017-09-13 08:59:02          2017-09
1      2017-04-26 10:53:06          2017-04
2      2018-01-14 14:33:31          2018-01
3      2018-08-08 10:00:35          2018-08
4      2017-02-04 13:57:51          2017-02


The new columns were created for future LTV calculations.

### Saved the Cleaned DataFrame

In [34]:
# Destination of the cleaned dataset. Defined once so the file that is written
# and the path reported in the success message cannot drift apart.
OUTPUT_PATH = '../data/processed/ecommerce_clean.csv'

# index=False: the positional index is not written to the file.
# NOTE(review): CSV stores datetime/period columns as text, so a downstream
# loader presumably has to re-parse them — confirm in the consuming notebook.
df_final.to_csv(OUTPUT_PATH, index=False)
print(f"\nSUCCESS: Cleaned data saved to '{OUTPUT_PATH}'")


SUCCESS: Cleaned data saved to '../data/processed/ecommerce_clean.csv'


In [38]:
# Show the final column index of the cleaned frame as a last sanity check.
final_columns = df_final.columns
print(final_columns)

Index(['order_id', 'order_item_id', 'total_paid', 'price', 'freight_value',
       'customer_unique_id', 'product_category_name_english', 'customer_state',
       'seller_id', 'order_purchase_timestamp', 'order_approved_at',
       'order_delivered_carrier_date', 'order_delivered_customer_date',
       'order_estimated_delivery_date', 'shipping_limit_date', 'revenue',
       'order_year_month', 'order_month', 'order_year'],
      dtype='object')


Final columns in the dataset