# Data Modeling

In this notebook I will cluster both the customer and seasonality datasets.

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import hdbscan

from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA


In [2]:
# Load in the pre-scaled datasets: customer features first, then seasonality.
# NOTE(review): the previews below show an `Unnamed: 0` header. If that is a
# real column (an index saved into the CSV) rather than a display artifact, it
# should be excluded before clustering - confirm.

ds_cust, ds_sea = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/Customer_scaled.csv', '../data/Seasonality_scaled.csv')
)


In [3]:
# Checking that ds_cust loaded properly: leaving the bare name as the cell's
# last expression makes the notebook render the frame as the cell output.

ds_cust

Unnamed: 0,CustomerTotalSpend,CustomerOrderCount,AvgOrderValue,AvgDiscountRate,AvgQuantity,AvgSunkCost
0,-1.214174,-1.044505,-1.356249,0.412296,-1.860116,-1.393696
1,-0.337423,0.546457,-0.671998,0.094130,-0.000458,-0.476196
2,-1.028223,-1.044505,-0.833763,1.207713,1.859200,-0.177815
3,-0.361546,0.546457,-0.694592,1.684962,-0.623443,-0.579382
4,-0.367946,-1.044505,1.021492,2.003129,0.929371,1.173068
...,...,...,...,...,...,...
43228,3.185470,-0.249024,4.722165,-1.178536,1.859200,4.042512
43229,0.181128,-1.044505,2.564284,-1.178536,-0.000458,2.687330
43230,-0.468757,-0.249024,-0.411639,-1.178536,-0.930287,-0.664511
43231,-1.021769,-1.044505,-0.815627,2.003129,-0.930287,-0.925917


In [4]:
# Checking that ds_sea loaded properly: leaving the bare name as the cell's
# last expression makes the notebook render the frame as the cell output.

ds_sea

Unnamed: 0,CustomerTotalSpend,CustomerOrderCount,AvgOrderValue,AvgDiscountRate,AvgQuantity,AvgSunkCost,OrdersInMonth_1,OrdersInMonth_2,OrdersInMonth_3,OrdersInMonth_4,...,PctOrdersInMonth_OrdersInMonth_3,PctOrdersInMonth_OrdersInMonth_4,PctOrdersInMonth_OrdersInMonth_5,PctOrdersInMonth_OrdersInMonth_6,PctOrdersInMonth_OrdersInMonth_7,PctOrdersInMonth_OrdersInMonth_8,PctOrdersInMonth_OrdersInMonth_9,PctOrdersInMonth_OrdersInMonth_10,PctOrdersInMonth_OrdersInMonth_11,PctOrdersInMonth_OrdersInMonth_12
0,-1.214174,-1.044505,-1.356249,0.412296,-1.860116,-1.393696,-0.452415,-0.428299,-0.446383,-0.442042,...,-0.399174,-0.394401,-0.402466,-0.397179,4.288746,-0.404826,-0.395376,-0.394766,-0.390965,-0.398805
1,-0.337423,0.546457,-0.671998,0.094130,-0.000458,-0.476196,-0.452415,2.002348,1.840707,-0.442042,...,1.168632,-0.394401,-0.402466,-0.397179,1.148283,-0.404826,-0.395376,-0.394766,-0.390965,-0.398805
2,-1.028223,-1.044505,-0.833763,1.207713,1.859200,-0.177815,-0.452415,-0.428299,-0.446383,-0.442042,...,-0.399174,-0.394401,-0.402466,-0.397179,-0.398512,-0.404826,-0.395376,-0.394766,-0.390965,4.376985
3,-0.361546,0.546457,-0.694592,1.684962,-0.623443,-0.579382,-0.452415,-0.428299,-0.446383,-0.442042,...,-0.399174,-0.394401,-0.402466,2.806876,1.148283,-0.404826,-0.395376,-0.394766,-0.390965,-0.398805
4,-0.367946,-1.044505,1.021492,2.003129,0.929371,1.173068,-0.452415,-0.428299,1.840707,-0.442042,...,4.351753,-0.394401,-0.402466,-0.397179,-0.398512,-0.404826,-0.395376,-0.394766,-0.390965,-0.398805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43228,3.185470,-0.249024,4.722165,-1.178536,1.859200,4.042512,1.811913,-0.428299,-0.446383,-0.442042,...,-0.399174,-0.394401,-0.402466,-0.397179,-0.398512,-0.404826,-0.395376,-0.394766,-0.390965,1.989090
43229,0.181128,-1.044505,2.564284,-1.178536,-0.000458,2.687330,-0.452415,-0.428299,-0.446383,-0.442042,...,-0.399174,-0.394401,-0.402466,4.384993,-0.398512,-0.404826,-0.395376,-0.394766,-0.390965,-0.398805
43230,-0.468757,-0.249024,-0.411639,-1.178536,-0.930287,-0.664511,-0.452415,-0.428299,-0.446383,-0.442042,...,-0.399174,-0.394401,-0.402466,-0.397179,-0.398512,1.924733,1.986182,-0.394766,-0.390965,-0.398805
43231,-1.021769,-1.044505,-0.815627,2.003129,-0.930287,-0.925917,-0.452415,-0.428299,-0.446383,-0.442042,...,-0.399174,-0.394401,-0.402466,-0.397179,-0.398512,4.254291,-0.395376,-0.394766,-0.390965,-0.398805


### Customer clustering

In [None]:
# Sweep k = 2..10: fit KMeans on the customer data for each k and record the
# silhouette score of the resulting labels, keyed by k, so the candidate
# cluster counts can be compared.
# NOTE(review): n_init is left at the library default, which presumably
# differs between scikit-learn releases - consider pinning it so the scores
# are reproducible across environments.

cust_sil_scores = {
    n_clusters: silhouette_score(
        ds_cust,
        KMeans(n_clusters=n_clusters, random_state=42).fit_predict(ds_cust),
    )
    for n_clusters in range(2, 11)
}
cust_sil_scores

In [None]:
# Tabulate the silhouette scores (one row per k) and rank them best-first
cs = pd.DataFrame(
    {
        'k': list(cust_sil_scores.keys()),
        'silhouette_score': list(cust_sil_scores.values()),
    }
)
cs_sorted = cs.sort_values(by='silhouette_score', ascending=False, ignore_index=True)
print(cs_sorted)

Looking at the silhouette scores for the customer data, the optimal number of clusters is 2. However, that just splits the dataset in half. Therefore, I will use the next-best option, which is 5 clusters.

In [None]:
# Fit the final 5-cluster KMeans model on the customer data, then read the
# per-row cluster assignment off the fitted estimator.
c_kmeans = KMeans(n_clusters=5, random_state=42).fit(ds_cust)
c_kmeans_labels = c_kmeans.labels_

In [None]:
# Visualizing the clusters with PCA
#
# Project the customer data onto its first two principal components so the
# KMeans assignment can be inspected in 2-D. The projection (c_cust_PCA) is
# kept as a notebook-level variable.

# random_state is pinned because PCA's default solver choice may be the
# randomized SVD on some scikit-learn versions; 42 matches the seed already
# used for KMeans in this notebook.
c_PCA = PCA(n_components=2, random_state=42)
c_cust_PCA = c_PCA.fit_transform(ds_cust)

fig, ax = plt.subplots()
points = ax.scatter(c_cust_PCA[:, 0], c_cust_PCA[:, 1], c=c_kmeans_labels, cmap='viridis', s=5)
ax.set_title("Customer KMeans Clusters (PCA Projection)")
ax.set_xlabel("Principal component 1")
ax.set_ylabel("Principal component 2")
# Key mapping each colour to its cluster id. A fixed location is used because
# loc="best" is slow with this many points.
ax.legend(*points.legend_elements(), title="Cluster", loc="upper right")
plt.show()

In [None]:
# Build a labelled copy of the customer frame (ds_cust itself is not modified)
# with the KMeans assignment in a 'cluster' column, then average every column
# within each cluster.
ds_kmeans = ds_cust.assign(cluster=c_kmeans_labels)
kmeans_summary = ds_kmeans.groupby(by='cluster').mean()
print("KMeans Cluster Summary:", kmeans_summary, sep="\n")

#### KMeans Interpretation

- Cluster 0 : Customers in Cluster 0 are sensitive to discounts. They are more likely to purchase an item if it has a discount.
- Cluster 1 : Customers in Cluster 1 are low-value, low-engagement buyers. They spend very little and don't buy a lot or use discounts.
- Cluster 2 : Customers in Cluster 2 are frequent buyers who spend above average.
- Cluster 3 : Customers in Cluster 3 are high-order-value buyers and have a high sunk cost. Most likely premium buyers.
- Cluster 4 : Customers in Cluster 4 are bulk buyers: they buy a high quantity of goods, but at a lower price, and try to avoid high sunk costs.


#### DBSCAN

In [None]:
# Cluster the customer data with DBSCAN and keep the per-row labels.
# NOTE(review): eps=0.5 / min_samples=5 look like the library defaults -
# confirm they were chosen deliberately for this data.
c_db = DBSCAN(eps=0.5, min_samples=5).fit(ds_cust)
c_db_labels = c_db.labels_

In [None]:
# Visualizing the DBSCAN labels on the 2-D PCA projection of the customer data
# (c_cust_PCA), coloured by label.

fig, ax = plt.subplots()
points = ax.scatter(c_cust_PCA[:, 0], c_cust_PCA[:, 1], c=c_db_labels, cmap='viridis', s=5)
ax.set_title("Customer DBScan Clusters (PCA Projection)")
ax.set_xlabel("Principal component 1")
ax.set_ylabel("Principal component 2")
# Colour key: DBSCAN marks noise points with the label -1, so say so explicitly
# rather than letting them read as just another cluster.
fig.colorbar(points, ax=ax, label="DBSCAN label (-1 = noise)")
plt.show()

In [None]:
# Build a labelled copy of the customer frame (ds_cust itself is not modified)
# with the DBSCAN assignment in a 'cluster' column, then average every column
# within each label.
ds_dbscan = ds_cust.assign(cluster=c_db_labels)
dbscan_summary = ds_dbscan.groupby(by='cluster').mean()
print("DBScan Cluster Summary:", dbscan_summary, sep="\n")

#### HDBSCAN

In [None]:
# Cluster the customer data with HDBSCAN and keep the per-row labels.
# NOTE(review): min_cluster_size=2645 is an unexplained magic number -
# presumably chosen relative to the ~43k rows; confirm and document.
c_hdb = hdbscan.HDBSCAN(min_cluster_size=2645, min_samples=10).fit(ds_cust)
c_hdb_labels = c_hdb.labels_

In [None]:
# Visualizing the HDBSCAN labels on the 2-D PCA projection of the customer data
# (c_cust_PCA), coloured by label.

fig, ax = plt.subplots()
points = ax.scatter(c_cust_PCA[:, 0], c_cust_PCA[:, 1], c=c_hdb_labels, cmap='viridis', s=5)
ax.set_title("Customer HDBScan Clusters (PCA Projection)")
ax.set_xlabel("Principal component 1")
ax.set_ylabel("Principal component 2")
# Colour key: HDBSCAN marks noise points with the label -1, so say so
# explicitly rather than letting them read as just another cluster.
fig.colorbar(points, ax=ax, label="HDBSCAN label (-1 = noise)")
plt.show()

In [None]:
# Build a labelled copy of the customer frame (ds_cust itself is not modified)
# with the HDBSCAN assignment in a 'cluster' column, then average every column
# within each label.
ds_hdbscan = ds_cust.assign(cluster=c_hdb_labels)
hdbscan_summary = ds_hdbscan.groupby(by='cluster').mean()
print("HDBScan Cluster Summary:", hdbscan_summary, sep="\n")