# Hierarchical Clustering for Customer Segmentation

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style globally so every plot below shares it.
sns.set(style="whitegrid")


## Load and Prepare Data

In [None]:
# Columns treated as irrelevant for clustering; they are removed right after
# loading so they never reach the feature matrix.
columns_to_drop = [
    'CLV',
    'Email',
    'List SKU',
    'Last Source New',
    'Initial Source New',
    'Days since Date Added',
]

# Load the customer-level table and discard the columns listed above.
df = pd.read_csv('order_klav_merge_customerLevel.csv').drop(columns=columns_to_drop)


## Encode Top Cities and Countries

In [None]:
# Most frequent values: 5 cities and 3 countries (missing values are not
# counted by value_counts, so they can never be selected here).
top_cities = df['Recent City'].value_counts().nlargest(5).index
top_countries = df['Recent Country'].value_counts().nlargest(3).index

# Keep a value only when it is one of the frequent ones; everything else,
# including missing entries, is collapsed into a single 'Other' bucket.
city_is_top = df['Recent City'].isin(top_cities)
country_is_top = df['Recent Country'].isin(top_countries)
df['Recent City'] = df['Recent City'].where(city_is_top, 'Other')
df['Recent Country'] = df['Recent Country'].where(country_is_top, 'Other')

# One-hot encode both columns, keeping every level (no reference level dropped).
df = pd.get_dummies(df, columns=['Recent City', 'Recent Country'], drop_first=False)


## Scale Features and Apply PCA

In [None]:
# Feature matrix: every remaining column of df.
# NOTE(review): assumes all columns are numeric/boolean at this point - confirm.
X = df.to_numpy()

# Standardise each feature to zero mean and unit variance.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction (95%) of the variance.
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_scaled)


## Fit Hierarchical Clustering Model

In [None]:
# Agglomerative (bottom-up) clustering into 5 groups, using average linkage
# over Manhattan distances in the PCA-reduced feature space.
#
# scikit-learn 1.2 renamed the `affinity` argument to `metric`, and 1.4
# removed `affinity` entirely, so the original call raises TypeError on
# current releases. Try the current keyword first and fall back to the old
# one only when the installed version does not know `metric`.
try:
    hc = AgglomerativeClustering(n_clusters=5, metric='manhattan', linkage='average')
except TypeError:
    hc = AgglomerativeClustering(n_clusters=5, affinity='manhattan', linkage='average')

# Store the cluster id assigned to each customer alongside its features.
df['hc_cluster'] = hc.fit_predict(X_reduced)


## Evaluate Clustering

In [None]:
# Silhouette score of the cluster assignment, measured with the same
# Manhattan distance that was used to fit the clustering.
silhouette = silhouette_score(
    X=X_reduced,
    labels=df['hc_cluster'],
    metric='manhattan',
)
print(f"Silhouette Score: {silhouette:.4f}")


## Cluster Summary and Business Labeling

In [None]:
# Per-cluster mean of every numeric column, with the cluster id restored as
# an ordinary column instead of the index.
per_cluster_means = df.groupby('hc_cluster').mean(numeric_only=True)
cluster_summary = per_cluster_means.reset_index()

# Business-facing segment names, keyed by cluster id (position in the list).
# NOTE(review): the ids come from the fitted model, so this mapping presumably
# reflects one specific run on one specific dataset - re-check after refitting.
cluster_labels = dict(enumerate([
    "Dormant Shoppers",
    "Churned Casual Buyers",
    "New & Engaged Buyers",
    "Loyal Core Customers",
    "Promo-Responsive Customers",
]))
cluster_summary['Cluster Label'] = cluster_summary['hc_cluster'].map(cluster_labels)


## Feature Visualization by Cluster

In [None]:
# Metrics to chart per segment: order value and basket size, payment methods,
# discount behaviour, marketing consent/engagement and activity recency.
key_metrics = [
    'Avg Amount Orders', 'Nb items', 'Avg Nb items', 'Avg item amount',
    'PayMeth_Bancontact', 'PayMeth_Ideal', 'PayMeth_Klarna',
    'PayMeth_Pay Later', 'PayMeth_shopify payments',
    'Always Discount', 'Never Discount', 'Max Discount Percentage',
    'Same SKU more than once', 'Email Marketing Consent', 'Accepts Marketing',
    'click', 'open', 'Days since First Active', 'Days since Last Active'
]

# Metrics that are absent from the summary table are skipped silently.
plottable_metrics = [name for name in key_metrics if name in cluster_summary.columns]

# One bar chart per metric: per-cluster mean on the y axis, segment on the x axis.
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is passed
# without `hue`; the call is kept as-is to preserve the existing output.
for feature in plottable_metrics:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.barplot(data=cluster_summary, x="Cluster Label", y=feature, palette="Set2", ax=ax)
    ax.set_title(f"{feature} by Cluster")
    plt.xticks(rotation=20)  # tilt the segment names so they do not overlap
    fig.tight_layout()
    plt.show()


## City and Country Distribution by Cluster

In [None]:
# The city/country dummy columns are produced from a data-driven top-N
# selection, so their names depend on the dataset. Discover them from the
# summary table by prefix instead of hard-coding specific names, which would
# silently plot nothing whenever the data (or its casing) differs. The
# catch-all 'Other' bucket is left out, as it was in the hard-coded lists.
city_cols = [
    col for col in cluster_summary.columns
    if col.startswith('Recent City_') and col != 'Recent City_Other'
]
country_cols = [
    col for col in cluster_summary.columns
    if col.startswith('Recent Country_') and col != 'Recent Country_Other'
]

# Each value is the per-cluster mean of a 0/1 dummy, i.e. the share of the
# segment's customers whose most recent city/country is the one plotted.
for col in city_cols + country_cols:
    plt.figure(figsize=(8, 4))
    sns.barplot(data=cluster_summary, x="Cluster Label", y=col, palette="pastel")
    plt.title(f"{col.replace('_', ' ').title()} by Cluster")
    plt.xlabel("Customer Segment")
    plt.ylabel("Proportion")
    plt.tight_layout()
    plt.show()
