In [2]:
#importing necessary libraries and the dataset
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from sklearn.preprocessing import StandardScaler
from kmodes.kprototypes import KPrototypes
from sklearn.metrics import silhouette_score
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.patches as mpatches
%matplotlib inline

# CSV file this notebook reads its input from.
CleanDataset = 'data_cleaned.csv'

# Load the file with the InvoiceDate column as the row index ...
Data_Cleaned = pd.read_csv(
    CleanDataset,
    index_col='InvoiceDate',
)
# ... then parse that index into timestamps using an explicit minute-resolution format.
Data_Cleaned.index = pd.to_datetime(
    Data_Cleaned.index,
    format='%Y-%m-%d %H:%M',
)

In [3]:
# Work on a copy so the loaded frame itself is left untouched.
Data = Data_Cleaned.copy()

# Collapse Country into a two-level flag: 'UK' for 'UNITED KINGDOM', 'non-UK' otherwise.
is_united_kingdom = Data['Country'] == 'UNITED KINGDOM'
Data['Country'] = is_united_kingdom.map({True: 'UK', False: 'non-UK'})

# One row per (CustomerID, Country) pair, in order of first appearance (sort=False).
per_customer_stats = {
    'Quantity': 'mean',        # mean quantity per transaction line
    'UnitPrice': 'mean',       # mean unit price per transaction line
    'InvoiceNo': 'nunique',    # number of distinct invoices
    'Description': 'nunique',  # number of distinct product descriptions
}
CustomerData = (
    Data
    .groupby(['CustomerID', 'Country'], sort=False)
    .agg(per_customer_stats)
    .reset_index()
)

# Friendlier column labels; later cells address these columns by position.
CustomerData.columns = [
    'CustomerID',
    'UK?',
    'Average Quantity',
    'Average Price',
    'Repeats',
    'Product Variety',
]
CustomerData.head()

Unnamed: 0,CustomerID,UK?,Average Quantity,Average Price,Repeats,Product Variety
0,17850.0,UK,5.513158,3.740428,35,24
1,13047.0,UK,6.984536,4.016289,16,105
2,12583.0,non-UK,21.182979,2.182894,17,116
3,13748.0,UK,15.678571,4.053571,5,24
4,15100.0,UK,9.666667,12.75,6,1


In [5]:
# Scale the four numeric customer features (columns 2 onward) for clustering.
# NOTE: run this cell exactly once after CustomerData is built. CustomerData is
# overwritten with the scaled values, so re-running would refit Scaler on
# already-scaled data and a later Scaler.inverse_transform would no longer
# recover the original units.
Scaler = StandardScaler()
numeric_cols = list(CustomerData.columns[2:])
# Cast to float and assign by column label: the scaled values are floats, and
# writing them through .iloc into the integer count columns ('Repeats',
# 'Product Variety') relies on silent dtype upcasting, which newer pandas
# versions deprecate.
CustomerData[numeric_cols] = Scaler.fit_transform(CustomerData[numeric_cols].astype(float))
# Customer identifiers as strings, kept aside so they can be re-attached later.
syms = CustomerData.iloc[:,0].values.astype(str)
# Feature matrix as an object array: column 0 is the categorical 'UK?' flag,
# the remaining columns are the scaled numeric features.
X = CustomerData.iloc[:,1:].values.astype(object)

In [6]:
# Fit a 3-cluster k-prototypes model on the mixed-type matrix X; column 0 of X
# is declared categorical, the remaining columns are treated as numeric.
# random_state is fixed because the fit involves random initialisation:
# without a seed, cluster labels and centroids can change between runs, which
# breaks "Restart & Run All" reproducibility.
kproto = KPrototypes(n_clusters = 3, init = 'Cao', random_state = 42)
clusters = kproto.fit_predict(X, categorical = [0])
print('Cluster Centers:\n', kproto.cluster_centroids_)

Cluster Centers:
 [['-0.07704860211195202' '0.023470766180950592' '-0.16440775957959944'
  '-0.18651333298636238' 'UK']
 ['-0.06351092657884302' '-0.17422922685289516' '1.9072199403263699'
  '2.2375751060578195' 'UK']
 ['8.62114966511616' '-0.8840143724917177' '0.011304893142306713'
  '-0.6524409030503906' 'UK']]


In [7]:
# Undo the scaling so the numeric features are back in their original units,
# then re-attach the UK flag, the customer id and the assigned cluster label.
feature_names = ['Average Quantity', 'Average Price', 'Repeats', 'Product Variety']
Clustered = pd.DataFrame(data=Scaler.inverse_transform(CustomerData.iloc[:,2:]),columns=feature_names)
Clustered['UK?'] = X[:,0]
Clustered['CustomerID'] = syms
Clustered['Cluster'] = clusters

by_cluster_and_uk = Clustered.groupby(['Cluster','UK?'])
# Number of customers in each (cluster, UK flag) group.
print(by_cluster_and_uk.count()['Repeats'],'\n\n\n')
# Per-group feature means. numeric_only=True is required because the frame also
# holds the string 'CustomerID' column: averaging it triggers a pandas warning
# on older versions and a TypeError on pandas 2.x.
print(by_cluster_and_uk.mean(numeric_only=True))

Cluster  UK?   
0        UK        3588
         non-UK     382
1        UK         307
         non-UK      35
2        UK          35
         non-UK       3
Name: Repeats, dtype: int64 



                Average Quantity  Average Price    Repeats  Product Variety
Cluster UK?                                                                
0       UK             13.127527       3.116832   3.499443        45.214047
        non-UK         14.734203       3.051530   3.374346        47.858639
1       UK             13.389770       2.827273  21.850163       249.263844
        non-UK         17.782164       2.855736  25.771429       292.485714
2       UK            369.849336       1.852343   5.400000         5.600000
        non-UK        387.809524       1.488571   1.333333         3.666667


  print(Clustered.groupby(['Cluster','UK?']).mean())
