In [None]:
import os
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Directory that holds the e-commerce CSV exports. Set the ECOMMERCE_DATA_DIR
# environment variable to run the notebook on another machine; when it is
# unset the original location is used, so existing runs behave as before.
DATA_DIR = Path(os.environ.get('ECOMMERCE_DATA_DIR', '/Users/keshavsaraogi/data/e-commerce'))

data_custom_ratios = pd.read_csv(DATA_DIR / 'ecommerce_customer_data_custom_ratios.csv')
data_large = pd.read_csv(DATA_DIR / 'ecommerce_customer_data_large.csv')

In [None]:
# A missing 'Returns' entry is replaced with 0 in both datasets.
for frame in (data_custom_ratios, data_large):
    frame['Returns'] = frame['Returns'].fillna(0)

In [None]:
def engineer_features(df):
    """Aggregate transaction rows into one feature row per customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction-level data containing the columns 'Customer ID',
        'Total Purchase Amount', 'Returns', 'Purchase Date', 'Age' and
        'Gender'. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per 'Customer ID' (default integer index) with the columns
        'Customer ID', 'Avg_Purchase_Amount', 'Total_Purchase_Amount',
        'Purchase_Frequency', 'Return_Rate', 'Purchase_Span_Days', 'Age'
        and 'Gender', in that order. 'Gender' is encoded as Male -> 0,
        Female -> 1; any other value becomes NaN.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Parse the dates BEFORE taking min/max. On raw strings min()/max()
    # compare lexicographically, which picks the wrong first/last purchase
    # for any non-ISO format (e.g. '12/01/2019' sorts after '01/15/2020').
    # assign() returns a new frame, so the caller's data stays untouched.
    transactions = df.assign(**{'Purchase Date': pd.to_datetime(df['Purchase Date'])})

    # Named aggregation ties every output column to its source explicitly,
    # instead of relying on positional renaming of a MultiIndex.
    customer_metrics = (
        transactions
        .groupby('Customer ID')
        .agg(
            Avg_Purchase_Amount=('Total Purchase Amount', 'mean'),
            Total_Purchase_Amount=('Total Purchase Amount', 'sum'),
            Purchase_Frequency=('Total Purchase Amount', 'count'),
            Return_Rate=('Returns', 'mean'),
            first_purchase=('Purchase Date', 'min'),
            last_purchase=('Purchase Date', 'max'),
            Age=('Age', 'first'),
            Gender=('Gender', 'first'),
        )
        .reset_index()
    )

    # Whole days between a customer's first and last purchase (0 for a
    # customer with a single purchase).
    purchase_span = customer_metrics['last_purchase'] - customer_metrics['first_purchase']
    customer_metrics['Purchase_Span_Days'] = purchase_span.dt.days

    customer_metrics['Gender'] = customer_metrics['Gender'].map({'Male': 0, 'Female': 1})

    # Drop the helper date columns and restore the established column order.
    return customer_metrics[[
        'Customer ID',
        'Avg_Purchase_Amount',
        'Total_Purchase_Amount',
        'Purchase_Frequency',
        'Return_Rate',
        'Purchase_Span_Days',
        'Age',
        'Gender',
    ]]

# Per-customer feature table built from the custom-ratios dataset.
customer_features = engineer_features(df=data_custom_ratios)

In [None]:
# Columns passed on to clustering; 'Customer ID' and 'Total_Purchase_Amount'
# are deliberately not in this list.
features_for_clustering = [
    'Avg_Purchase_Amount',
    'Purchase_Frequency',
    'Return_Rate',
    'Purchase_Span_Days',
    'Age',
    'Gender',
]

# Fit the scaler on the selected columns, then transform those same columns.
clustering_inputs = customer_features[features_for_clustering]
scaler = StandardScaler()
scaler.fit(clustering_inputs)
scaled_features = scaler.transform(clustering_inputs)

In [None]:
# Sweep k = 2 .. max_clusters and record, for every k, the k-means inertia
# and the silhouette score of the resulting labelling.
max_clusters = 10
inertias, silhouette_scores = [], []

for k in range(2, max_clusters + 1):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(scaled_features)
    inertias.append(kmeans.inertia_)
    # NOTE(review): silhouette_score is likely the slow step on large inputs;
    # consider its sample_size argument if runtime becomes a problem.
    silhouette_scores.append(silhouette_score(scaled_features, cluster_labels))