In [18]:
# Customer Segmentation using K-Means Clustering

## 1. Data Preparation
## Load and merge customer, transaction, and product data.

## 2. Feature Engineering
## Create meaningful features from the raw data to capture customer profiles, transaction behaviors, and product preferences.

## 3. Data Preprocessing
## Standardize the features so that they contribute equally to the clustering algorithm.

## 4. Determine Optimal Clusters
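## Compute the Davies-Bouldin Index and Silhouette Score for k = 2 to 10; the optimal number of clusters is the k with the lowest Davies-Bouldin Index (the Silhouette Score is reported alongside it).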

## 5. Final Clustering
## Apply K-Means clustering with the optimal number of clusters and assign a cluster label to each customer.

## 6. Visualization
## Reduce dimensionality using PCA and visualize the customer segments in a 2D space.

## 7. Cluster Analysis
## Analyze the characteristics of each cluster to understand the distinct customer segments.

## 8. Save Results & Metrics (Optional)
## Optionally save the clustering report and visualization for future reference.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score

In [2]:
# Read the three input tables from the working directory. The date columns
# are parsed while loading so that date arithmetic can be done on them later.
csv_read_options = {
    'Customers.csv': {'parse_dates': ['SignupDate']},
    'Transactions.csv': {'parse_dates': ['TransactionDate']},
    'Products.csv': {},
}
customers, transactions, products = (
    pd.read_csv(csv_path, **read_kwargs)
    for csv_path, read_kwargs in csv_read_options.items()
)

In [4]:
# Enrich every transaction row with its customer's attributes and its
# product's attributes. Both joins use pandas' default join type (inner),
# so rows without a match on the join key are dropped.
merged_data = pd.merge(
    pd.merge(transactions, customers, on='CustomerID'),
    products,
    on='ProductID',
)

In [5]:
# Customer profile features: tenure plus one-hot encoded region.
# Tenure is measured in whole days from signup to the most recent
# transaction date found in the transactions table.
last_date = transactions['TransactionDate'].max()
time_since_signup = last_date - customers['SignupDate']
# NOTE: this writes the new column onto `customers` itself.
customers['Tenure'] = time_since_signup.dt.days

profile_columns = ['CustomerID', 'Region', 'Tenure']
profile_features = pd.get_dummies(customers[profile_columns], columns=['Region'])

In [6]:
def _first_to_last_span_days(dates):
    """Return the number of whole days between the earliest and latest date."""
    return (dates.max() - dates.min()).days


# One row per customer summarising their transaction behaviour.
# Despite its name, Purchase_Frequency is the span in days between a
# customer's first and last transaction, not a rate.
transaction_features = (
    transactions
    .groupby('CustomerID')
    .agg(
        Total_Transactions=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        Total_Spend=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        Avg_Quantity=pd.NamedAgg(column='Quantity', aggfunc='mean'),
        Purchase_Frequency=pd.NamedAgg(
            column='TransactionDate', aggfunc=_first_to_last_span_days
        ),
    )
    .reset_index()
)

In [7]:
# Number of transaction rows per customer in each product category,
# pivoted so that every category becomes its own column (0 when absent).
category_counts = merged_data.groupby(['CustomerID', 'Category']).size()
category_features = category_counts.unstack(fill_value=0).reset_index()

# Combine all features into one table indexed by CustomerID.
# NOTE(review): both merges use the default inner join, so a customer with
# no rows in transaction_features / category_features is dropped here
# rather than kept with zeros -- confirm this is intended.
combined = profile_features.merge(transaction_features, on='CustomerID')
combined = combined.merge(category_features, on='CustomerID')
final_features = combined.set_index('CustomerID').fillna(0)


In [8]:
# Standardize every feature to zero mean / unit variance so that no single
# feature dominates the distance computations used by K-Means.
#
# A later cell stores the assigned labels in final_features['Cluster'].
# If this cell is re-run after that, the label column must not be fed back
# in as a feature, so it is dropped here when present. On a fresh
# top-to-bottom run the column does not exist yet and errors='ignore'
# makes the drop a no-op.
scaler = StandardScaler()
feature_matrix = final_features.drop(columns=['Cluster'], errors='ignore')
scaled_data = scaler.fit_transform(feature_matrix)

In [11]:
# Sweep candidate cluster counts and record two quality metrics for each.
# db_scores[i] and silhouette_scores[i] belong to the i-th entry of k_values.
k_values = range(2, 11)
db_scores, silhouette_scores = [], []

for k in k_values:
    # Fixed seed and n_init so the sweep is repeatable.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    clusters = kmeans.fit(scaled_data).labels_

    db_for_k = davies_bouldin_score(scaled_data, clusters)
    silhouette_for_k = silhouette_score(scaled_data, clusters)
    db_scores.append(db_for_k)
    silhouette_scores.append(silhouette_for_k)

In [12]:
# The chosen k is the candidate with the lowest Davies-Bouldin score;
# the silhouette scores are not used for the selection.
lowest_db_position = np.argmin(db_scores)
optimal_k = k_values[lowest_db_position]

In [13]:
# Refit K-Means with the selected number of clusters (same seed and n_init
# as the sweep) and store each customer's label next to their features.
final_model = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
clusters = final_model.fit(scaled_data).labels_
# NOTE: this adds a 'Cluster' column to final_features in place.
final_features['Cluster'] = clusters

In [14]:
# Project the standardized features onto their first two principal
# components so the clusters can be drawn in 2D.
pca = PCA(n_components=2)
pca_features = pca.fit_transform(scaled_data)
first_component, second_component = pca_features[:, 0], pca_features[:, 1]

# One point per customer, coloured by assigned cluster label.
fig, ax = plt.subplots(figsize=(10,6))
scatter = ax.scatter(first_component, second_component, c=clusters, cmap='viridis')
ax.set_title(f'Customer Segments (k={optimal_k})', fontsize=14)
ax.set_xlabel('PCA Component 1', fontsize=12)
ax.set_ylabel('PCA Component 2', fontsize=12)
ax.legend(*scatter.legend_elements(), title='Clusters')
ax.grid(alpha=0.3)

# The figure is written to disk and then closed without being shown.
fig.savefig('clusters_visualization.png', bbox_inches='tight')
plt.close(fig)

In [15]:
# Per-cluster mean of every feature column (one row per cluster label).
rows_by_cluster = final_features.groupby(by='Cluster')
cluster_profile = rows_by_cluster.mean()

In [16]:
# Write the model-selection metrics and the per-cluster feature means to a
# plain-text report in the working directory.
#
# The scores for the chosen k are looked up by the position of optimal_k in
# k_values, so the lookup stays correct if the sweep is changed to start at
# a value other than 2 or to use a step.
optimal_position = list(k_values).index(optimal_k)

with open('Clustering_Report.txt', 'w') as f:
    f.write("Clustering Metrics:\n")
    f.write(f"Optimal Clusters: {optimal_k}\n")
    f.write(f"Davies-Bouldin Index: {db_scores[optimal_position]:.3f}\n")
    f.write(f"Silhouette Score: {silhouette_scores[optimal_position]:.3f}\n\n")

    f.write("Cluster Characteristics:\n")
    f.write(str(cluster_profile))

print("Clustering analysis completed successfully!")

Clustering analysis completed successfully!


In [None]:
Clustering Metrics:
Optimal Clusters: 6
Davies-Bouldin Index: 1.539
Silhouette Score: 0.207

Cluster Characteristics:
             Tenure  Region_Asia  Region_Europe  Region_North America  \
Cluster                                                                 
0        614.750000     0.541667       0.083333              0.125000   
1        596.940000     0.000000       0.000000              0.000000   
2        403.235294     0.411765       0.176471              0.235294   
3        531.511111     0.000000       1.000000              0.000000   
4        424.538462     0.000000       0.000000              1.000000   
5        548.666667     1.000000       0.000000              0.000000   

         Region_South America  Total_Transactions  Total_Spend  Avg_Quantity  \
Cluster                                                                        
0                    0.250000            8.416667  6176.439583      2.668013   
1                    1.000000            5.080000  3573.492200      2.543825   
2                    0.176471            1.294118   743.462941      2.323529   
3                    0.000000            4.688889  3297.754222      2.545423   
4                    0.000000            5.358974  3345.774615      2.442949   
5                    0.000000            4.250000  2981.807500      2.628571   

         Purchase_Frequency     Books  Clothing  Electronics  Home Decor  
Cluster                                                                   
0                283.333333  2.166667  2.500000     2.625000    1.125000  
1                243.480000  1.520000  0.900000     1.280000    1.380000  
2                 17.823529  0.117647  0.294118     0.235294    0.647059  
3                227.977778  1.311111  0.911111     1.155556    1.311111  
4                245.128205  1.435897  1.230769     1.256410    1.435897  
5                232.041667  1.041667  1.208333     0.916667    1.083333  