In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Scale RFM features
#
# Load the raw RFM table, standardise the Recency / Frequency / Monetary
# columns with StandardScaler, and save the scaled values together with the
# customer key so later steps can reload them from disk.

rfm = pd.read_csv('data/rfm/rfm_table.csv')
rfm_features = rfm[['Recency', 'Frequency', 'Monetary']]
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm_features)

# Show exactly as many rows as the message announces (the slice used to be
# [:100], which contradicted the "first 5 rows" label).
print("Scaled RFM data preview (first 5 rows):")
print(rfm_scaled[:5])

# The scaled values are wrapped in a DataFrame so the columns are named.
rfm_scaled_df = pd.DataFrame(rfm_scaled, columns=['Recency_scaled', 'Frequency_scaled', 'Monetary_scaled'])
# .values assigns by row position, so the key lines up with the scaled rows
# regardless of the index labels on `rfm`.
rfm_scaled_df['Customer ID'] = rfm['Customer ID'].values
rfm_scaled_df.to_csv('data/rfm/rfm_scaled.csv', index=False)
print("Scaled RFM data saved to 'data/rfm/rfm_scaled.csv'.")

# Load the scaled file once: it serves both as a sanity check of what was
# written to disk and as the input to clustering. (The file used to be read
# twice in a row into the same variable.)
rfm_scaled_df = pd.read_csv('data/rfm/rfm_scaled.csv')
print(rfm_scaled_df.head())
print(rfm_scaled_df.describe())

# Extract features for clustering
X = rfm_scaled_df[['Recency_scaled', 'Frequency_scaled', 'Monetary_scaled']]

# Optional: Keep Customer ID for reference
customer_ids = rfm_scaled_df['Customer ID']


# Elbow method: fit K-Means for each candidate k and record the inertia
# (within-cluster sum of squared distances) to plot SSE against k.
sse = []
k_range = range(2, 11)  # Typical range; adjust if needed

for k in k_range:
    # n_init is pinned explicitly: scikit-learn's default changed from 10 to
    # 'auto' in version 1.4, so leaving it implicit makes the curve depend on
    # the installed version (and emits a FutureWarning on 1.2/1.3).
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X)
    sse.append(kmeans.inertia_)

plt.figure(figsize=(8, 4))
plt.plot(k_range, sse, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of clusters (k)')
plt.ylabel('SSE (Inertia)')
plt.show()

# Final model: fit K-Means with the chosen number of clusters and label
# every customer row.
k = 4  # chosen by inspecting the elbow plot; change here to re-segment

# n_init is pinned explicitly so the segmentation does not depend on the
# scikit-learn version (the default changed from 10 to 'auto' in 1.4).
# NOTE(review): cluster label numbers are arbitrary; re-check any hand-written
# interpretation of cluster IDs after re-running.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X)

# Add cluster labels back to dataframe
rfm_scaled_df['Cluster'] = clusters

# Count customers per cluster
print(rfm_scaled_df['Cluster'].value_counts())

# Attach the cluster labels to the unscaled RFM table so the per-cluster
# averages below are expressed in the original units.
# NOTE(review): labels are attached by row position, so this assumes the table
# is read back in the same row order as when the features were scaled.
rfm_original = pd.read_csv('data/rfm/rfm_table.csv').assign(Cluster=clusters)

# Profile clusters: mean of each RFM metric plus the cluster size.
cluster_profile = rfm_original.groupby('Cluster').agg(
    Recency=('Recency', 'mean'),
    Frequency=('Frequency', 'mean'),
    Monetary=('Monetary', 'mean'),
    Num_Customers=('Customer ID', 'count'),  # Cluster size
)

print("Cluster profile summary:")
print(cluster_profile)
##Here's a summary of the results obtained after clustering the RFM data into 4 clusters:
##| Cluster | Num_Customers  | Avg Recency (days) | Avg Frequency (orders) | Avg Monetary (total spend) | Interpretation                                                       |
##| ------- | -------------- | ------------------ | ---------------------- | -------------------------- | -------------------------------------------------------------------- |
##| 0       | 3054 customers | 43.7 days          | 3.68 orders            | 1,354 currency             | ⭐️ **Active, moderate-value repeat customers**                       |
##| 1       | 1067 customers | 248 days           | 1.55 orders            | 479 currency               | 💤 **Inactive, low-value churned customers**                         |
##| 2       | 13 customers   | 7.4 days           | 82.5 orders            | 127,188 currency           | 👑 **VIP / ultra-premium customers (super-loyal, huge spenders)**    |
##| 3       | 204 customers  | 15.5 days          | 22.3 orders            | 12,690 currency            | 🔥 **High-value loyal customers (recent + frequent + big spenders)** |

rfm_original.to_csv('data/rfm/rfm_clustered.csv', index=False)
#create a bar plot to visualize the number of customers in each cluster
!pip install seaborn
import seaborn as sns
rfm_clustered = pd.read_csv('data/rfm/rfm_clustered.csv', encoding='latin1')
plt.figure(figsize=(6,4))
sns.countplot(x='Cluster', data=rfm_clustered)
plt.title('Number of Customers per Cluster')
plt.xlabel('Cluster')
plt.ylabel('Count')
plt.show()
# One bar chart per RFM metric, each showing the per-cluster mean.
# Order matters only for the sequence in which the figures appear.
bar_specs = (
    ('Recency', 'Average Recency by Cluster'),
    ('Frequency', 'Average Frequency by Cluster'),
    ('Monetary', 'Average Monetary by Cluster'),
)
for metric, title in bar_specs:
    plt.figure(figsize=(6, 4))
    sns.barplot(data=rfm_clustered, x='Cluster', y=metric, estimator='mean')
    plt.title(title)
    plt.show()
# Scatter of Recency against Monetary, with points coloured by cluster label.
plt.figure(figsize=(8, 6))
scatter_ax = sns.scatterplot(
    data=rfm_clustered,
    x='Recency',
    y='Monetary',
    hue='Cluster',
    palette='tab10',
)
scatter_ax.set_title('Recency vs Monetary by Cluster')
plt.show()
