In [60]:
# Import libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from itertools import combinations


## Data import and pre-processing

In [46]:
# Read the customer segmentation dataset from the CSV file (path is relative to the working directory)
customersdata = pd.read_csv(filepath_or_buffer="segmentation data.csv")


In [47]:
# Preview the first five rows using the notebook's rich rendering
print("\n=== Data Head ===")
display(customersdata.head(n=5))

# Optional diagnostics, currently disabled:
#   print("\n=== Data Info ===")
#   customersdata.info()
#
#   print("\n=== Statistical Summary ===")
#   display(customersdata.describe())

# Pairwise Pearson correlations between the columns. Being the last expression
# of the cell, the resulting matrix is what the notebook renders as output.
# NOTE(review): the identifier column "ID" is part of the frame, so it appears
# in the matrix as well; consider excluding it before interpreting the values.
print("\n=== Correlations ===")
customersdata.corr(method="pearson")


=== Data Head ===


Unnamed: 0,ID,Sex,Marital status,Age,Education,Income,Occupation,Settlement size
0,100000001,0,0,67,2,124670,1,2
1,100000002,1,1,22,1,150773,1,2
2,100000003,0,0,49,1,89210,0,0
3,100000004,0,0,45,1,171565,1,1
4,100000005,0,0,53,1,149031,1,1



=== Correlations ===


Unnamed: 0,ID,Sex,Marital status,Age,Education,Income,Occupation,Settlement size
ID,1.0,0.328262,0.074403,-0.085246,0.012543,-0.303217,-0.291958,-0.378445
Sex,0.328262,1.0,0.566511,-0.182885,0.244838,-0.195146,-0.202491,-0.300803
Marital status,0.074403,0.566511,1.0,-0.213178,0.374017,-0.073528,-0.02949,-0.097041
Age,-0.085246,-0.182885,-0.213178,1.0,0.654605,0.34061,0.108388,0.119751
Education,0.012543,0.244838,0.374017,0.654605,1.0,0.233459,0.064524,0.034732
Income,-0.303217,-0.195146,-0.073528,0.34061,0.233459,1.0,0.680357,0.490881
Occupation,-0.291958,-0.202491,-0.02949,0.108388,0.064524,0.680357,1.0,0.571795
Settlement size,-0.378445,-0.300803,-0.097041,0.119751,0.034732,0.490881,0.571795,1.0


In [48]:
# Pre-processing data

# Alternative that is currently disabled: one-hot encode the categorical columns
# customersdata = pd.get_dummies(customersdata, columns=['Education', 'Occupation', 'Settlement size'], dtype=int)

# Columns that get min-max scaled; 'Sex' and 'Marital status' are not in this list
features_to_scale = [
    'Age',
    'Income',
    'Education',
    'Occupation',
    'Settlement size',
]

# Learn the per-column minimum/maximum first, then overwrite the same columns
# of customersdata in place with their scaled values.
scaler = MinMaxScaler().fit(customersdata[features_to_scale])
customersdata[features_to_scale] = scaler.transform(customersdata[features_to_scale])
print(customersdata.head(n=5))

          ID  Sex  Marital status       Age  Education    Income  Occupation  \
0  100000001    0               0  0.844828   0.666667  0.324781         0.5   
1  100000002    1               1  0.068966   0.333333  0.420210         0.5   
2  100000003    0               0  0.534483   0.333333  0.195144         0.0   
3  100000004    0               0  0.465517   0.333333  0.496223         0.5   
4  100000005    0               0  0.603448   0.333333  0.413842         0.5   

   Settlement size  
0              1.0  
1              1.0  
2              0.0  
3              0.5  
4              0.5  


## Implementing K-means clustering

In [51]:
# Define K-means model
# n_clusters is not passed here, so scikit-learn's default cluster count applies.
kmeans_model = KMeans(
    init='k-means++',
    max_iter=400,
    random_state=42,
)

# Train the model on the seven feature columns (the ID column is left out).
# fit() is the last expression of the cell, so its return value is what the
# notebook displays.
kmeans_model.fit(
    customersdata.loc[:, ['Sex', 'Marital status', 'Age', 'Education',
                          'Income', 'Occupation', 'Settlement size']]
)


## Find an optimal value of K

In [53]:
# Fit one K-means model per candidate K and collect the inertias for the elbow curve
def try_different_clusters(K, data, max_iter=400, random_state=42):
    """Fit K-means for every cluster count from 1 to K and return the inertias.

    Parameters
    ----------
    K : int
        Largest number of clusters to try; the counts 1..K (inclusive) are fitted.
    data : array-like or DataFrame
        Feature matrix, passed unchanged to ``KMeans.fit``.
    max_iter : int, default 400
        Iteration limit forwarded to every ``KMeans`` instance.
    random_state : int or None, default 42
        Seed forwarded to every ``KMeans`` instance so repeated runs give the
        same curve.

    Returns
    -------
    list
        ``inertia_`` of each fitted model, ordered by cluster count: entry
        ``i`` belongs to ``i + 1`` clusters. Empty when ``K`` is below 1.
    """
    inertias = []

    for n_clusters in range(1, K + 1):
        model = KMeans(
            n_clusters=n_clusters,
            init='k-means++',
            max_iter=max_iter,
            random_state=random_state,
        )
        model.fit(data)
        inertias.append(model.inertia_)

    return inertias

# Find output for k values between 1 to 12
outputs = try_different_clusters(
    12,
    customersdata.loc[:, ['Sex', 'Marital status', 'Age', 'Education',
                          'Income', 'Occupation', 'Settlement size']],
)

# Pair every inertia with its cluster count (1, 2, ... one per entry in outputs)
distances = pd.DataFrame({
    "clusters": list(range(1, len(outputs) + 1)),
    "sum of squared distances": outputs,
})

In [54]:
# Elbow plot: sum of squared distances against the number of clusters
figure = go.Figure(
    data=[go.Scatter(x=distances["clusters"], y=distances["sum of squared distances"])]
)

# Integer ticks starting at 1 so every tried cluster count gets its own tick
figure.update_layout(
    xaxis={"tick0": 1, "dtick": 1, "tickmode": 'linear'},
    xaxis_title="Number of clusters",
    yaxis_title="Sum of squared distances",
    title_text="Finding optimal number of clusters using elbow method",
)
figure.show()

## Re-train the K-means model with the optimal K

In [56]:
# Re-train the K-means model with the K read off the elbow plot
cluster_features = ['Sex', 'Marital status', 'Age', 'Education', 'Income', 'Occupation', 'Settlement size']
kmeans_model_new = KMeans(n_clusters = 4,init='k-means++',max_iter=400,random_state=42)

# fit() is sufficient: the cluster assignments are read from labels_ below,
# so the array returned by fit_predict() was never used.
kmeans_model_new.fit(customersdata[cluster_features])

# Create data arrays
cluster_centers = kmeans_model_new.cluster_centers_

# Cluster centres expressed in the original units of the data.
# The features were min-max scaled (not log-transformed), so the scaled columns
# are mapped back with the fitted scaler rather than with np.expm1; 'Sex' and
# 'Marital status' were never scaled and are kept as they are.
# NOTE: this relies on `scaler` still holding the fit from the pre-processing
# cell on the raw data, i.e. on the notebook being run top to bottom once.
centers_original = pd.DataFrame(cluster_centers, columns=cluster_features)
centers_original[features_to_scale] = scaler.inverse_transform(centers_original[features_to_scale])
data = centers_original.to_numpy()

# One row per cluster: [centre in original units | centre in scaled units | cluster id]
points = np.append(data, cluster_centers, axis=1)
cluster_ids = np.arange(cluster_centers.shape[0]).reshape(-1, 1)  # follows n_clusters instead of a literal 0..3
points = np.append(points, cluster_ids, axis=1)

# Add "clusters" to customers data
customersdata["clusters"] = kmeans_model_new.labels_


## Visualizing customer segments

In [62]:
# visualize clusters
features = ['Sex', 'Marital status', 'Age', 'Education', 'Income', 'Occupation', 'Settlement size']

# Generate all 3-feature combinations
feature_combos = list(combinations(features, 3))

# Plotly Express maps a numeric colour column to a continuous colour scale,
# which presents the cluster ids as if they were ordered quantities. Plot from
# a copy whose cluster labels are strings so each cluster gets its own discrete
# colour and legend entry; customersdata itself is not modified.
plot_data = customersdata.assign(clusters=customersdata['clusters'].astype(str))
cluster_order = sorted(plot_data['clusters'].unique(), key=int)  # legend order 0, 1, 2, ...

# Loop and plot: one 3D scatter per feature triple
for combo in feature_combos:
    fig = px.scatter_3d(plot_data,
                        x=combo[0],
                        y=combo[1],
                        z=combo[2],
                        color='clusters',
                        category_orders={'clusters': cluster_order},
                        title=f"3D Cluster Plot: {combo[0]}, {combo[1]}, {combo[2]}")
    fig.show()