This notebook builds a K-means clustering model that segments customers based on several features. The main objective of the model is to help determine how credit limits are allocated to customers.

The mock data was generated with [Mockaroo](https://mockaroo.com/).

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from IPython.display import clear_output

In [2]:
# Load the mock customer records. shift() moves every row down by one position
# and the slice then discards the resulting all-NaN first row, so the frame is
# indexed from 1 and the final CSV record is not kept.
# NOTE(review): the NaN row introduced by the shift is presumably why the
# numeric columns are cast back to int further down - confirm that dropping the
# last record is intended.
data = pd.read_csv("MOCK_DATA.csv").shift(periods=1).iloc[1:]

In [3]:
# Give the tenure column an identifier-friendly name (no embedded space).
data.rename({"customer since": "customer_since"}, axis="columns", inplace=True)

In [4]:
# Remove the id column; it is not among the columns selected for clustering.
data.drop(columns=["id"], inplace=True)

In [5]:
# Cast the three numeric columns to integers.
for numeric_column in ("customer_since", "orders", "num_order"):
    data[numeric_column] = data[numeric_column].astype(int)

In [6]:
# Preview the first rows of the cleaned frame.
data.head()

Unnamed: 0,shop_name,customer_since,orders,num_order
1,Dablist,2,692261,3
2,Dynava,3,2102690,20
3,Fivespan,2,4653475,12
4,Demivee,1,2950648,17
5,Tazz,2,2247424,19


The main steps include:
- standardize data values
- label each data point
- create centroids
- visualize
- iterate until centroids stop changing
- analyse results

In [7]:
# Keep only the numeric columns used for clustering, as an independent copy
# so that later transformations leave `data` untouched.
relevant_data = ["customer_since", "orders", "num_order"]
df = data.loc[:, relevant_data].copy()
df.head()

Unnamed: 0,customer_since,orders,num_order
1,2,692261,3
2,3,2102690,20
3,2,4653475,12
4,1,2950648,17
5,2,2247424,19


In [8]:
# Min-max scale every column onto the range 1 to 10 (the scale chosen for this
# notebook) so that no single feature dominates the distance calculation.
df = df.sub(df.min()).div(df.max() - df.min()).mul(9).add(1)

In [9]:
# Sanity check: after scaling, every column's minimum should be 1 and its maximum 10.
df.describe()

Unnamed: 0,customer_since,orders,num_order
count,999.0,999.0,999.0
mean,5.522523,5.427679,5.510669
std,3.595079,2.622226,2.686067
min,1.0,1.0,1.0
25%,1.0,3.180848,3.368421
50%,5.5,5.40497,5.263158
75%,10.0,7.616115,7.631579
max,10.0,10.0,10.0


In [10]:
# Build k starting centroids by sampling each feature column independently
def get_random_centroids(df, k, random_state=None):
    """Return ``k`` randomly initialised centroids for the features in ``df``.

    Each centroid coordinate is a single value drawn at random from the
    matching column. Columns are sampled independently of one another, so a
    centroid is not necessarily an existing row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix with one row per data point and one column per feature.
    k : int
        Number of centroids to generate.
    random_state : int, optional
        Seed that makes the draw reproducible. When omitted, sampling falls
        back to NumPy's global random state, exactly as before this parameter
        existed.

    Returns
    -------
    pandas.DataFrame
        One row per feature and one column per centroid (columns ``0..k-1``).
    """
    # A single shared generator: successive draws differ from each other while
    # the whole sequence stays repeatable for a given seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    def draw_centroid():
        # .iloc[0] extracts the scalar first; calling float() directly on a
        # one-element Series is deprecated in pandas 2.x.
        return df.apply(
            lambda column: float(column.sample(n=1, random_state=rng).iloc[0])
        )

    return pd.concat([draw_centroid() for _ in range(k)], axis=1)


In [11]:
# Draw the initial random centroids for k = 4 clusters
centroids = get_random_centroids(df, 4)

In [12]:
# Inspect the initial centroids: one column per cluster, one row per feature.
centroids

Unnamed: 0,0,1,2,3
customer_since,1.0,5.5,5.5,5.5
orders,8.848751,6.44202,2.231436,4.511388
num_order,4.315789,8.578947,4.315789,9.526316


In [13]:
# Assign every data point to the centroid with the smallest Euclidean distance
def get_data_labels(df, centroids):
    """Return, for each row of ``df``, the label of its nearest centroid.

    ``centroids`` holds one column per centroid and one row per feature. The
    result is a Series aligned with ``df``'s index whose values are the
    column labels of ``centroids``.
    """

    def distance_to(centroid):
        # Row-wise Euclidean distance between every data point and this centroid.
        squared_gap = (df - centroid) ** 2
        return np.sqrt(squared_gap.sum(axis=1))

    # One distance column per centroid; the closest centroid wins for each row.
    distance_table = centroids.apply(distance_to)
    return distance_table.idxmin(axis=1)

In [14]:
# Assign every data point to its closest initial centroid.
labels = get_data_labels(df=df, centroids=centroids)

In [15]:
# Cluster label assigned to each data point.
labels

1      2
2      3
3      1
4      1
5      3
      ..
995    3
996    0
997    3
998    1
999    3
Length: 999, dtype: int64

In [16]:
# Number of data points assigned to each cluster label
labels.value_counts()

2    352
1    261
0    225
3    161
dtype: int64

In [20]:
# Group by labels and calculate new centroids
def get_new_centroids(df, centroids, k, point_labels=None):
    """Recompute every centroid as the geometric mean of its assigned points.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix with one row per data point and one column per feature.
        Values must be strictly positive because a logarithm is taken
        (presumably guaranteed by the 1-to-10 scaling applied earlier - verify
        if the preprocessing changes).
    centroids : object
        Not used by the computation; kept so existing calls keep working.
    k : int
        Not used by the computation; kept so existing calls keep working.
    point_labels : pandas.Series, optional
        Cluster label for each row of ``df``, aligned on the index. When
        omitted, the notebook-level ``labels`` variable is used, which was the
        only behaviour before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        One row per feature and one column per cluster label that has at least
        one assigned point; a cluster with no points yields no column.
    """
    # Pass the labels explicitly where possible; the global fallback depends on
    # which cell last assigned `labels`.
    assignments = labels if point_labels is None else point_labels
    # Geometric mean per cluster: exp(mean(log(x))), computed column-wise.
    log_means = np.log(df).groupby(assignments).mean()
    return np.exp(log_means).T

In [23]:
# Project the data and the centroids to 2-D with PCA and draw the current clustering
def plot_clusters(df, labels, centroids, iteration):
    """Plot the data points, coloured by cluster, together with the centroids.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix with one row per data point.
    labels : array-like
        Cluster label per row of ``df``; used as the colour of each point.
    centroids : pandas.DataFrame
        One column per centroid and one row per feature.
    iteration : int
        Iteration number shown in the plot title.
    """
    pca = PCA(n_components=2)
    data_2d = pca.fit_transform(df)
    # Centroids are stored one per column; transpose so each centroid is a row,
    # matching the layout the PCA was fitted on.
    centroids_2d = pca.transform(centroids.T)
    # Replace the previously displayed figure instead of stacking a new one below it.
    clear_output(wait=True)
    plt.title(f"Iteration {iteration}")
    plt.xlabel("Principal component 1")
    plt.ylabel("Principal component 2")
    plt.scatter(x=data_2d[:, 0], y=data_2d[:, 1], c=labels)
    # Draw centroids with a distinct marker so they stand out from the data points.
    plt.scatter(
        x=centroids_2d[:, 0],
        y=centroids_2d[:, 1],
        c="red",
        marker="X",
        s=120,
        edgecolors="black",
    )
    plt.show()