In [1]:
# 📦 Step 1: Import Required Libraries
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')



In [2]:
# 🗃️ Step 2: Load the Dataset
df = pd.read_csv("online_retail.csv", encoding='ISO-8859-1')

df = df.dropna(subset=['Customer ID'])



In [3]:
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15Cm Christmas Glass Ball 20 Lights,12,01-12-2009 07:45,6.95,13085,United Kingdom
1,489434,79323P,Pink Cherry Lights,12,01-12-2009 07:45,6.75,13085,United Kingdom
2,489434,79323W,White Cherry Lights,12,01-12-2009 07:45,6.75,13085,United Kingdom
3,489434,22041,Record Frame 7 Single Size,48,01-12-2009 07:45,2.1,13085,United Kingdom
4,489434,21232,Strawberry Ceramic Trinket Box,24,01-12-2009 07:45,1.25,13085,United Kingdom


In [4]:
# 🧹 Step 3: Preprocessing
df['CustomerID'] = df['Customer ID'].astype(str)
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce', dayfirst=True)
df['TotalPrice'] = df['Quantity'] * df['Price']
df['IsCancelled'] = df['Invoice'].astype(str).str.startswith('C').astype(int)



In [5]:
# 📅 Step 4: Set Reference Date
snapshot_date = df['InvoiceDate'].max() + timedelta(days=1)



In [6]:
# 🎯 Step 5: Group and Feature Extraction
customer_gp = df.groupby('CustomerID')

features = pd.DataFrame()
features['Recency'] = customer_gp['InvoiceDate'].max().apply(lambda x: (snapshot_date - x).days)
features['FirstPurchaseDaysAgo'] = customer_gp['InvoiceDate'].min().apply(lambda x: (snapshot_date - x).days)
features['Frequency'] = customer_gp['Invoice'].nunique()
features['TotalQuantity'] = customer_gp['Quantity'].sum()
features['Monetary'] = customer_gp['TotalPrice'].sum()
features['AvgOrderValue'] = features['Monetary'] / features['Frequency']

# 🧾 Return related
features['TotalReturns'] = customer_gp['IsCancelled'].sum()
features['ReturnRate'] = features['TotalReturns'] / features['Frequency']

# 🧮 Quantity & Price Stats
features['AvgQuantityPerInvoice'] = customer_gp['Quantity'].mean()
features['MaxQuantity'] = customer_gp['Quantity'].max()
features['MinQuantity'] = customer_gp['Quantity'].min()
features['StdQuantity'] = customer_gp['Quantity'].std()

features['MeanPrice'] = customer_gp['Price'].mean()
features['MaxPrice'] = customer_gp['Price'].max()
features['MinPrice'] = customer_gp['Price'].min()
features['StdPrice'] = customer_gp['Price'].std()

# 🕒 Time Features
df['Hour'] = df['InvoiceDate'].dt.hour
df['Weekday'] = df['InvoiceDate'].dt.dayofweek
features['MostCommonHour'] = customer_gp['Hour'].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
features['MostCommonWeekday'] = customer_gp['Weekday'].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)

# 🕐 Days Between Orders
def avg_days_between(dates):
    dates = sorted(dates)
    if len(dates) < 2:
        return np.nan
    diffs = np.diff(dates)
    return np.mean(diffs).days

features['AvgDaysBetweenOrders'] = customer_gp['InvoiceDate'].agg(lambda x: avg_days_between(x.tolist()))

# 🌍 Country (optional)
# features['Country'] = customer_gp['Country'].first()  # uncomment for label encoding later





In [13]:
from sklearn.preprocessing import StandardScaler
import joblib
import numpy as np
import pandas as pd

# Assume 'features' already has 'CustomerID' column retained
# If 'CustomerID' is in index, reset it
if features.index.name == "CustomerID" or features.index.name is not None:
    features.reset_index(inplace=True)

# Fill Missing
features.fillna(0, inplace=True)

# Normalize all columns *except* CustomerID
non_id_cols = features.columns.drop("CustomerID")
scaler = StandardScaler()
scaled_array = scaler.fit_transform(features[non_id_cols])

# Combine back into DataFrame
scaled_features = pd.DataFrame(scaled_array, columns=non_id_cols)
scaled_features["CustomerID"] = features["CustomerID"]

# Save in multiple formats
scaled_features.to_parquet("full_customer_features.parquet", index=False)
joblib.dump(scaled_features, "full_customer_features.pkl")
np.savez_compressed("full_customer_features.npz", data=scaled_array)

print(" Saved features in multiple formats:")
print(" - full_customer_features.parquet")
print(" - full_customer_features.pkl")
print(" - full_customer_features.npz")

# Preview
scaled_features.head()


 Saved features in multiple formats:
 - full_customer_features.parquet
 - full_customer_features.pkl
 - full_customer_features.npz


Unnamed: 0,Recency,FirstPurchaseDaysAgo,Frequency,TotalQuantity,Monetary,AvgOrderValue,TotalReturns,ReturnRate,AvgQuantityPerInvoice,MaxQuantity,MinQuantity,StdQuantity,MeanPrice,MaxPrice,MinPrice,StdPrice,MostCommonHour,MostCommonWeekday,AvgDaysBetweenOrders,CustomerID
0,0.595584,1.126453,0.438998,8.168099,5.166378,5.005717,0.0,0.0,5.915978,49.887133,-0.103968,19.928639,-0.014072,-0.09844,-0.022933,-0.077289,0.187925,-1.370474,0.415278,12346
1,-0.952279,-0.31698,0.131502,0.132759,0.136127,0.189436,0.0,0.0,-0.032748,0.079053,-0.083459,-0.015794,-0.034503,-0.081082,-0.027273,-0.069616,0.631193,-0.836032,-0.219649,12347
2,-0.603532,-0.164568,-0.09912,0.104253,-0.064857,0.015401,0.0,0.0,0.076422,0.014415,-0.103968,0.033421,-0.027196,0.008847,-0.027041,0.116943,0.631193,-1.370474,0.126675,12348
3,-0.871064,0.512322,-0.175994,-0.018556,0.101996,0.594632,0.0,0.0,-0.043939,-0.050223,-0.103968,-0.030806,-0.000688,0.866882,-0.026289,0.605748,-1.585148,0.232852,-0.104207,12349
4,0.519146,-0.738354,-0.406616,-0.179335,-0.181549,-0.041823,0.0,0.0,-0.037616,-0.066382,-0.103968,-0.036181,-0.026885,0.008847,-0.023801,0.087294,1.51773,-0.30159,-0.277369,12350
