In [3]:
# Imports
from pathlib import Path

import pandas as pd

# Build one row of purchase metrics per customer from the raw transaction
# export and write the result to the processed-data folder.

# Load the raw dataset (read with ISO-8859-1 rather than the UTF-8 default)
df_raw = pd.read_csv('../data/raw/OnlineRetail.csv', encoding='ISO-8859-1')

# Remove rows with missing CustomerID; they cannot be attributed to a customer
df_raw = df_raw[df_raw['CustomerID'].notnull()]

# Convert InvoiceDate to datetime. `assign` returns a new frame, so nothing is
# ever written into the result of the boolean filter above.
df_raw = df_raw.assign(InvoiceDate=pd.to_datetime(df_raw['InvoiceDate']))

# Remove rows where Quantity or UnitPrice is less than or equal to zero
df_raw = df_raw[(df_raw['Quantity'] > 0) & (df_raw['UnitPrice'] > 0)]

# Create a 'TotalPrice' column (line value = Quantity * UnitPrice) for each row
df_raw = df_raw.assign(TotalPrice=df_raw['Quantity'] * df_raw['UnitPrice'])

# Aggregate to customer level. Named aggregation sets the output column names
# directly, so no separate rename step is needed.
customer_data = (
    df_raw.groupby('CustomerID')
    .agg(
        NumPurchases=('InvoiceNo', 'nunique'),  # Number of unique invoices
        TotalQuantity=('Quantity', 'sum'),      # Total quantity purchased
        TotalPrice=('TotalPrice', 'sum'),       # Total amount spent
    )
    .reset_index()
)

# Save the aggregated customer data to the processed folder as a CSV.
# The folder is created first because to_csv does not create missing parents.
output_path = '../data/processed/customer_data.csv'
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
customer_data.to_csv(output_path, index=False)

print(f"Customer data successfully saved to {output_path}")

Customer data successfully saved to ../data/processed/customer_data.csv
