In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Read the raw CSV into a DataFrame and report its (rows, columns) dimensions.
raw_csv_path = '../data/raw/data.csv'
# The file is decoded as ISO-8859-1 instead of pandas' default UTF-8.
df = pd.read_csv(raw_csv_path, encoding='ISO-8859-1')
print("Initial shape:", df.shape)

Initial shape: (541909, 8)


In [3]:
# Discard every row that has a null in any of the fields listed below.
required_columns = ['CustomerID', 'StockCode', 'UnitPrice']
df = df.dropna(subset=required_columns)

In [4]:
# Exclude cancelled/refunded orders, identified by an InvoiceNo that begins with 'C'.
# InvoiceNo is cast to str first so the prefix test also works on non-string values.
invoice_ids = df['InvoiceNo'].astype(str)
is_cancellation = invoice_ids.str.startswith('C')
df = df[~is_cancellation]

In [5]:
# Keep only rows where both Quantity and UnitPrice are strictly greater than zero.
has_positive_quantity = df['Quantity'].gt(0)
has_positive_price = df['UnitPrice'].gt(0)
df = df[has_positive_quantity & has_positive_price]

In [6]:
# Parse the InvoiceDate column into pandas datetime values and store it back.
# NOTE(review): no explicit format= is passed, so parsing relies on pandas'
# inference — confirm the day/month order of the raw strings is read as intended.
parsed_invoice_dates = pd.to_datetime(df['InvoiceDate'])
df['InvoiceDate'] = parsed_invoice_dates

In [7]:
# Feature engineering: add a per-row revenue figure and calendar fields
# derived from InvoiceDate (which must already be a datetime column).
invoice_dt = df['InvoiceDate'].dt
df = df.assign(
    Revenue=df['Quantity'] * df['UnitPrice'],  # row total = quantity x unit price
    Year=invoice_dt.year,
    Month=invoice_dt.month,
    Weekday=invoice_dt.day_name(),  # weekday as a name string rather than a number
)


In [8]:
# Preview cleaned data: report the frame's dimensions and the null count per column.
print("Cleaned shape:", df.shape)
# sep="\n" places the header on its own line. The previous form ended the header
# with "\n" and relied on print's default sep=" ", which inserted a stray space
# in front of the first column name and misaligned the first row of the listing.
print("Remaining nulls:", df.isnull().sum(), sep="\n")


Cleaned shape: (397884, 12)
Remaining nulls:
 InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
Revenue        0
Year           0
Month          0
Weekday        0
dtype: int64


In [9]:
# Write the cleaned frame to disk, creating the destination folder first if needed.
output_csv = Path('../data/processed/cleaned_retail_data.csv')
output_csv.parent.mkdir(parents=True, exist_ok=True)  # no error if it already exists
df.to_csv(output_csv, index=False)  # index=False: the row index is not written out
print("Cleaned dataset saved.")

Cleaned dataset saved.
