In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from kmodes.kprototypes import KPrototypes
from sklearn.metrics import silhouette_score

In [5]:
data = pd.read_csv('data_cleaned.csv',index_col = 'InvoiceDate')

In [6]:
data.index = pd.to_datetime(data.index, format = '%Y-%m-%d %H:%M')

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 532820 entries, 2010-12-01 08:26:00 to 2011-12-09 12:50:00
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   InvoiceNo     532820 non-null  object 
 1   StockCode     532820 non-null  object 
 2   Description   532820 non-null  object 
 3   Quantity      532820 non-null  int64  
 4   UnitPrice     532820 non-null  float64
 5   CustomerID    403890 non-null  float64
 6   Country       532820 non-null  object 
 7   FinalPrice    532820 non-null  float64
 8   InvoiceMonth  532820 non-null  object 
 9   Day of week   532820 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 44.7+ MB


In [6]:
data.head()

Unnamed: 0_level_0,InvoiceNo,StockCode,Description,Quantity,UnitPrice,CustomerID,Country,FinalPrice,InvoiceMonth,Day of week
InvoiceDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2010-12-01 08:26:00,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2.95,17850.0,UNITED KINGDOM,17.7,December,Wednesday
2010-12-01 08:26:00,536365,71053,WHITE METAL LANTERN,6,3.75,17850.0,UNITED KINGDOM,22.5,December,Wednesday
2010-12-01 08:26:00,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,4.15,17850.0,UNITED KINGDOM,33.2,December,Wednesday
2010-12-01 08:26:00,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,3.75,17850.0,UNITED KINGDOM,22.5,December,Wednesday
2010-12-01 08:26:00,536365,84029E,RED WOOLLY HOTTIE WHITE HEART,6,4.25,17850.0,UNITED KINGDOM,25.5,December,Wednesday


In [8]:
data_copy = data.copy()
data_copy['Country'] = data_copy['Country'].map(lambda x: 'UK' if x=='UNITED KINGDOM' else 'non-UK')
Customer_Data = data_copy.groupby(['CustomerID','Country'], sort=False).agg({'Quantity':'mean','UnitPrice':'mean','InvoiceNo':'nunique','Description':'nunique'})
Customer_Data.reset_index(inplace=True)
Customer_Data.columns = ['CustomerID', 'UK?', 'Average Quantity', 'Average Price', 'Repeats', 'Product Variety']
Customer_Data.head()

Unnamed: 0,CustomerID,UK?,Average Quantity,Average Price,Repeats,Product Variety
0,17850.0,UK,5.513158,3.740428,35,24
1,13047.0,UK,6.984536,4.016289,16,105
2,12583.0,non-UK,21.182979,2.182894,17,116
3,13748.0,UK,15.678571,4.053571,5,24
4,15100.0,UK,9.666667,12.75,6,1


To eliminate the negative impacts of feature magnitudes and to speed up the process, it would be a good idea to scale the numerical features before we start.

The client's nationality is a categorical variable in the customer data. For this reason, we choose the k-prototypes technique, which can also account for categorical variables. In these kinds of approaches, we have to tell the algorithm exactly how many clusters there are. Since we are unsure of the quantity, we will examine the silhouette score and clustering cost to establish the ideal number of clusters. How close the data points are to their own cluster in comparison to other data points is determined by the average silhouette value.

Feature scaling

In [10]:
std_slr = StandardScaler()
Customer_Data.iloc[:,2:] = std_slr.fit_transform(Customer_Data.iloc[:,2:])
cst_syms = Customer_Data.iloc[:,0].values.astype(str)
cst_X = Customer_Data.iloc[:,1:].values.astype(object)

Finding number of optimal clusters

In [22]:
for n in range(2,10):
    algo = KPrototypes(n_clusters = n, init = 'Cao')
    clusters = algo.fit_predict(cst_X, categorical = [0])
    silhouette = silhouette_score(cst_X[:,1:],clusters)
    print('number of clusters:', n)
    print('  cost: ',algo.cost_)
    print('  average silhouette score: ',silhouette)

number of clusters: 2
  cost:  14380.190053147257
  average silhouette score:  0.5520788140635532
number of clusters: 3
  cost:  11496.152292547846
  average silhouette score:  0.5676629473885375
number of clusters: 4
  cost:  9064.557730716979
  average silhouette score:  0.40546904294705916
number of clusters: 5
  cost:  6847.479936487132
  average silhouette score:  0.4130000199916561
number of clusters: 6
  cost:  6057.401073355808
  average silhouette score:  0.28912607055254697
number of clusters: 7
  cost:  5303.616059182098
  average silhouette score:  0.3219038281196845
number of clusters: 8
  cost:  4697.681576660268
  average silhouette score:  0.3469220225592841
number of clusters: 9
  cost:  4291.676690255557
  average silhouette score:  0.302191030945364


Three clusters appear like they might adequately represent our data, therefore we use k = 3 to construct the technique.

In [23]:
algo = KPrototypes(n_clusters = 3, init = 'Cao')
clstr = algo.fit_predict(cst_X, categorical = [0])
print('Cluster Centers:\n', algo.cluster_centroids_)

Cluster Centers:
 [['-0.07704860211195215' '0.023470766180950706' '-0.16440775957960074'
  '-0.18651333298636244' 'UK']
 ['8.621149665116166' '-0.8840143724917177' '0.011304893142306646'
  '-0.6524409030503906' 'UK']
 ['-0.06351092657884515' '-0.1742292268528959' '1.9072199403263699'
  '2.2375751060578093' 'UK']]


We need to scale the data back to their original values since we scaled them for clustering, and then we put all of the original data and their cluster labels in a dataframe.

In [26]:
data_clustered = pd.DataFrame(data=std_slr.inverse_transform(Customer_Data.iloc[:,2:]),columns=['Average Quantity', 'Average Price', 'Repeats', 'Product Variety'])
data_clustered['UK?'] = cst_X[:,0]
data_clustered['CustomerID'] = cst_syms
data_clustered['Cluster'] = clstr
print(data_clustered.groupby(['Cluster','UK?']).count()['Repeats'],'\n\n\n')
print(data_clustered.groupby(['Cluster','UK?']).mean())

Cluster  UK?   
0        UK        3588
         non-UK     382
1        UK          35
         non-UK       3
2        UK         307
         non-UK      35
Name: Repeats, dtype: int64 



                Average Quantity  Average Price   Repeats  Product Variety
Cluster UK?                                                               
0       UK             -0.080805       0.027902 -0.163079        -0.189476
        non-UK         -0.041766      -0.018147 -0.176890        -0.158687
1       UK              8.586698      -0.863763  0.046750        -0.650664
        non-UK          9.023088      -1.120279 -0.402226        -0.673172
2       UK             -0.074433      -0.176283  1.862915         2.186079
        non-UK          0.032292      -0.156212  2.295838         2.689270


  print(data_clustered.groupby(['Cluster','UK?']).mean())


In [25]:
# check the above output and modify the below statement.

In [None]:
#Now we can see how our clusters are distributed along each variable in pairwise scatterplots. In the first cluster there are customers (mostly from UK) who bought very specific products with low prices in large volumes but not very often, one-time customers are more likely to be in this segment. The second cluster represents our regular customers who visit quite often buying various products in small quantities. The third cluster consists of customers who bought different products with higher price and they occasionally make a purchase, the majority of our customers (both from UK and non-UK countries) are within this cluster.

