# K-Means Clustering for Customer Segmentation

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style globally so every plot below shares it.
sns.set(style="whitegrid")


## Load and Prepare Data

In [None]:
# Columns left out of the clustering feature set.
excluded_columns = [
    'CLV', 'Email', 'List SKU', 'Last Source New', 'Initial Source New',
    'Days since Date Added', 'Max Amount Orders', 'Max item amount',
    'Days since Profile Created On', 'PayMeth_Card', 'PayMeth_Other',
    'Always Free Shipping', 'Never Free Shipping',
]

# Read the cleaned customer-level dataset and discard the excluded columns
# in a single step.
df = pd.read_csv(
    'data/clean/order_klav_merge_customerLevel.csv'
).drop(columns=excluded_columns)


## Encode Top Cities and Countries

In [None]:
# Keep only the most frequent locations (5 cities, 3 countries); every other
# value, including missing ones, is pooled into a single 'Other' level.
top_cities = df['Recent City'].value_counts().nlargest(5).index
top_countries = df['Recent Country'].value_counts().nlargest(3).index

# Vectorised membership test instead of a row-by-row Python lambda:
# values found in the frequent set are kept, the rest become 'Other'.
for column, frequent_values in (
    ('Recent City', top_cities),
    ('Recent Country', top_countries),
):
    is_frequent = df[column].isin(frequent_values)
    df[column] = df[column].where(is_frequent, 'Other')

# One-hot encode both columns. drop_first=True removes the first level (in
# sorted order) of each column so the dummies are not perfectly collinear.
df = pd.get_dummies(df, columns=['Recent City', 'Recent Country'], drop_first=True)


## PCA and K-Means Clustering

In [None]:
# Build the feature matrix from every column EXCEPT the cluster assignment
# columns. 'cluster' is written by this cell and 'cluster_label' by the
# labeling cell; without this exclusion, re-running the cell would feed its
# own previous output back in as a feature (or fail on the string labels).
# errors='ignore' makes the first run, where neither column exists, a no-op.
assignment_columns = ['cluster', 'cluster_label']
X = df.drop(columns=assignment_columns, errors='ignore').values

# Standardize each feature, then project onto the first 5 principal components.
X_scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=5)
X_reduced = pca.fit_transform(X_scaled)

# Fit K-Means with 5 clusters in the reduced space; random_state pins the
# initialization so cluster ids are repeatable for a given library version.
# NOTE: n_init is left at the library default, which differs between
# scikit-learn releases; cluster ids may change after an upgrade.
kmeans = KMeans(n_clusters=5, random_state=42)
df['cluster'] = kmeans.fit_predict(X_reduced)

# Silhouette score, computed on the same PCA-reduced features used for fitting.
score = silhouette_score(X_reduced, df['cluster'])
print(f"K-Means Silhouette Score: {score:.2f}")


## Cluster Summary and Labeling

In [None]:
# Per-cluster mean of every numeric feature, rounded to two decimals.
# numeric_only=True skips the string 'cluster_label' column that this cell
# adds below, so the cell can be re-run without the mean() raising TypeError
# on pandas >= 2.0.
cluster_summary = (
    df.groupby("cluster")
    .mean(numeric_only=True)
    .round(2)
    .reset_index()
)

# Human-readable names for the cluster ids produced by K-Means.
# NOTE(review): this mapping is hard-coded, presumably from inspecting
# cluster_summary for one particular fit — re-check it whenever the
# clustering is refit, since cluster ids carry no inherent meaning.
cluster_labels = {
    0: "VIP / Loyal Customers",
    1: "Dormant Shoppers",
    2: "Occasional Buyers",
    3: "Promo-Engaged Buyers",
    4: "Inactive Customers"
}

# Attach the names to both the customer-level table and the summary table.
df['cluster_label'] = df['cluster'].map(cluster_labels)
cluster_summary['Cluster Label'] = cluster_summary['cluster'].map(cluster_labels)


## Feature Visualization by Cluster

In [None]:
# Features to chart, one bar plot per feature with one bar per cluster.
key_metrics = [
    'Avg Amount Orders', 'Nb items', 'Avg Nb items', 'Avg item amount',
    'PayMeth_Bancontact', 'PayMeth_Ideal', 'PayMeth_Klarna',
    'PayMeth_Pay Later', 'PayMeth_shopify payments',
    'Always Discount', 'Never Discount', 'Max Discount Percentage',
    'Same SKU more than once', 'Email Marketing Consent', 'Accepts Marketing',
    'click', 'open', 'Days since First Active', 'Days since Last Active',
    'Recent City_amsterdam', 'Recent City_den haag', 'Recent City_haarlem',
    'Recent City_rotterdam', 'Recent City_utrecht',
    'Recent Country_be', 'Recent Country_de', 'Recent Country_nl'
]

# Only features actually present in the summary are plotted; the rest are
# skipped silently.
plottable_metrics = [name for name in key_metrics if name in cluster_summary.columns]

for metric in plottable_metrics:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.barplot(
        data=cluster_summary,
        x="Cluster Label",
        y=metric,
        palette="Set2",
        ax=ax,
    )
    ax.set_title(f"{metric} by Cluster")
    # Tilt the cluster names so the longer labels do not overlap.
    plt.xticks(rotation=20)
    fig.tight_layout()
    plt.show()
