In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn.metrics import silhouette_score
import numpy as np

In [None]:
# Load the customer-level table and keep a numeric-only view of it
# (used by the correlation heatmap).
customers_csv = 'data//clean//order_klav_merge_customerLevel.csv'
df = pd.read_csv(customers_csv)
numeric_df = df.select_dtypes(include=[np.number])

In [None]:
# Pairwise correlation between the numeric customer features.
corr_matrix = numeric_df.corr()

# Draw the correlation matrix as a heatmap (no cell annotations, thin grid lines).
heatmap_options = dict(cmap='coolwarm', annot=False, linewidths=0.5)
plt.figure(figsize=(16, 10))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title("Correlation Heatmap of Customer Features")
plt.show()

In [None]:
# Creating dummy columns
def collapse_rare_categories(values, top_n=10, other_label='Other'):
    """Keep the ``top_n`` most frequent values of a Series and relabel the rest.

    Parameters
    ----------
    values : pd.Series
        Categorical column to simplify.
    top_n : int, default 10
        Number of most frequent distinct values to keep unchanged.
    other_label : str, default 'Other'
        Replacement for every value outside the ``top_n`` most frequent ones.
        Missing values are replaced too, because ``value_counts`` does not
        count them and they therefore never rank among the frequent values.

    Returns
    -------
    pd.Series
        Same index as ``values``, with infrequent values replaced by
        ``other_label``.
    """
    frequent_values = values.value_counts().head(top_n).index
    return values.where(values.isin(frequent_values), other_label)


# Categorical columns that are reduced to "top 10 + Other" and one-hot encoded.
categorical_columns = [
    'Recent City',
    'Recent Country',
    'Initial Source New',
    'Last Source New',
]

# 10 most frequent cities, kept under this name for inspection.
top_cities = df['Recent City'].value_counts().head(10)
#print(top_cities)

# Each column must be ranked on its OWN frequencies. Testing countries or
# sources against the city ranking would turn (almost) every value into
# 'Other', and drop_first=True would then discard the only remaining dummy,
# removing the feature from the model altogether.
for column in categorical_columns:
    df[column] = collapse_rare_categories(df[column])

# One-hot encode; drop_first=True drops one level per column to avoid a
# redundant (perfectly collinear) dummy.
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)


In [None]:
# Columns left out of the clustering feature matrix.
excluded_columns = ['CLV', 'Email', 'List SKU']
df = df.drop(columns=excluded_columns)

In [None]:
# Build the feature matrix: every remaining column of df, standardised
# column-wise by StandardScaler.
scaler = StandardScaler()
X = scaler.fit_transform(df.values)

In [None]:
# inertia = []
# K_range = range(1, 11)

# for k in K_range:
#     kmeans = KMeans(n_clusters=k, random_state=42)
#     kmeans.fit(X)
#     inertia.append(kmeans.inertia_)

# plt.plot(K_range, inertia, marker='o')
# plt.xlabel('Number of clusters (K)')
# plt.ylabel('Inertia')
# plt.title('Elbow Method for Optimal K')
# plt.grid(True)
# plt.show()


In [None]:
# Fit K-Means (4 clusters, fixed seed) on the scaled features and keep the
# cluster index assigned to each row.
kmeans = KMeans(n_clusters=4, random_state=42).fit(X)
kmeans_labels = kmeans.labels_

In [None]:
# Project the scaled feature matrix onto its first two principal components
# so the K-Means result can be drawn as a 2-D scatter plot.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# plt.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans_labels, cmap='viridis', s=50)
# plt.title("K-Means Clustering with PCA (K=4)")
# plt.xlabel("PCA 1")
# plt.ylabel("PCA 2")
# plt.grid(True)
# plt.show()

In [None]:
# Attach each row's K-Means cluster index to the customer table.
df = df.assign(cluster=kmeans_labels)
#df.to_csv('Github//BigDataAI_Project//Train_models//k_means//clustered_data.csv', index=False)

In [None]:
# Project the cluster centroids into the same 2-D PCA space as the data points.
centroids = kmeans.cluster_centers_
centers = pca.transform(centroids)

# plt.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans_labels, cmap='viridis', s=50)
# plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, marker='X', label='Centroids')
# plt.title("K-Means Clustering with Centroids (PCA)")
# plt.xlabel("PCA 1")
# plt.ylabel("PCA 2")
# plt.legend()
# plt.grid(True)
# plt.show()

In [None]:
# Explore cluster characteristics
# cluster_summary = df.groupby('cluster').mean()
# print(cluster_summary)

# print(df['cluster'].value_counts())

# # Exploring clusters for business overview
# sns.boxplot(x='cluster', y='DaysSinceRecentOrder', data=df)
# plt.title("DaysSinceRecentOrder")
# plt.show()

In [None]:
# Per-cluster mean of every numeric column.
rows_by_cluster = df.groupby("cluster")
cluster_profiles = rows_by_cluster.mean(numeric_only=True)

# Number of rows in each cluster, ordered by cluster index.
cluster_counts = df["cluster"].value_counts().sort_index()

print(cluster_profiles)

print(cluster_counts)

# Means plus a 'count' column (aligned on the cluster index).
cluster_profiles_with_counts = cluster_profiles.assign(count=cluster_counts)


In [None]:
# Write the per-cluster summary (means + counts) to a CSV file; the path is
# relative to the current working directory.
summary_output_path = "cluster_summary_profiles.csv"
cluster_profiles_with_counts.to_csv(path_or_buf=summary_output_path)


In [None]:
# Human-readable segment name for each K-Means cluster index (0..3).
# NOTE(review): the names are tied to the cluster indices of the fitted model;
# re-check them against the cluster profiles if the data or model settings change.
cluster_labels = dict(enumerate([
    "Dormant Shoppers",
    "VIP / Loyal Customers",
    "Occasional Buyers",
    "Inactive Customers",
]))

# Translate the numeric cluster index into its segment name.
df['cluster_label'] = df['cluster'].map(cluster_labels)

In [None]:
# Silhouette score of the K-Means assignment on the scaled features,
# printed with two decimals.
sil_score = silhouette_score(X, labels=kmeans_labels)
print(f"K-Means Silhouette Score: {sil_score:.2f}")