## K-Prototype Clustering

In [10]:
#Importing the Packages required
import numpy as np # for mathematical calculation
import pandas as pd # for importing and exporting / data manipulation
import matplotlib.pyplot as plt # for visualisation
import seaborn as sns # for visualisation along with statistics


In [9]:
!pip install kmodes

Collecting kmodes
  Downloading kmodes-0.11.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: kmodes
Successfully installed kmodes-0.11.0


In [12]:
from kmodes.kprototypes import KPrototypes

In [13]:
#Assigning file path
import os

In [14]:
#Choosing Working Directory
os.chdir('D:\\DS Project\\Ecommerce Analytics')

In [None]:
#Importing Dataset (Reading Excel Format)
rfm_df = pd.read_excel(r'E-com_Data.xlsx')

In [None]:
#Head to see the 1st 5 rows of our dataset
rfm_df.head(10)

In [6]:
rfm_df = rfm_df.drop(['Item Code','Quantity','Time','price per Unit','Cancelled_status','Reason of return','Sold as set'],axis = 1)

In [7]:
rfm_df

Unnamed: 0,CustomerID,InvoieNo,Date of purchase,Price,Shipping Location
0,4355.0,398177,2017-10-29,1926.0,Location 1
1,4352.0,394422,2017-10-05,1740.0,Location 1
2,4352.0,394422,2017-10-12,1866.0,Location 1
3,4352.0,388633,2017-08-22,1869.0,Location 1
4,4352.0,394422,2017-10-10,1888.0,Location 1
...,...,...,...,...,...
537974,,372313,2017-03-23,1188.0,Location 8
537975,,367605,2017-02-06,1522.0,Location 8
537976,,368246,2017-02-06,2283.0,Location 8
537977,,366891,2017-01-29,2970.0,Location 8


In [10]:
rfm = rfm_df.values

In [11]:
rfm[:,3] = rfm[:,3].astype(float)

In [12]:

rfm

array([[4355.0, 398177, Timestamp('2017-10-29 00:00:00'), 1926.0,
        'Location 1'],
       [4352.0, 394422, Timestamp('2017-10-05 00:00:00'), 1740.0,
        'Location 1'],
       [4352.0, 394422, Timestamp('2017-10-12 00:00:00'), 1866.0,
        'Location 1'],
       ...,
       [nan, 368246, Timestamp('2017-02-06 00:00:00'), 2283.0,
        'Location 8'],
       [nan, 366891, Timestamp('2017-01-29 00:00:00'), 2970.0,
        'Location 8'],
       [nan, 391243, Timestamp('2017-09-17 00:00:00'), 8340.0,
        'Location 8']], dtype=object)

In [17]:
#Missing Value Count in each variable columns
pd.DataFrame(rfm_df).isnull().sum()

CustomerID           0
InvoieNo             0
Date of purchase     0
Price                0
Shipping Location    0
dtype: int64

In [15]:
#Removing Blank Customer IDs
rfm_df = rfm_df[rfm_df['CustomerID'].isna()==False]

In [19]:
kproto = KPrototypes(n_clusters=5, verbose=2,max_iter=100)
clusters = kproto.fit_predict(rfm_df, categorical=[0, 1, 2, 4])

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 27374, ncost: 2270770863806.7476
Run: 1, iteration: 2/100, moves: 147034, ncost: 1908978290972.8093
Run: 1, iteration: 3/100, moves: 53512, ncost: 1707193486158.843
Run: 1, iteration: 4/100, moves: 31018, ncost: 1578662853604.4456
Run: 1, iteration: 5/100, moves: 35161, ncost: 1460529467399.3162
Run: 1, iteration: 6/100, moves: 33800, ncost: 1380348076724.408
Run: 1, iteration: 7/100, moves: 26853, ncost: 1327063233996.5298
Run: 1, iteration: 8/100, moves: 22730, ncost: 1274112027280.6294
Run: 1, iteration: 9/100, moves: 17469, ncost: 1234184655517.1797
Run: 1, iteration: 10/100, moves: 12418, ncost: 1204327628433.8237
Run: 1, iteration: 11/100, moves: 8753, ncost: 1182813485968.1316
Run: 1, iteration: 12/100, moves: 5054, ncost: 1172040846382.1091
Run: 1, iteration: 13/100, moves: 3443, ncost: 1165635811759.668
Run: 1, iteration: 14/100, moves: 2383, ncost: 1162464589299.38

In [21]:
print(kproto.cluster_centroids_)

[[25237.293676495654 1704.0 381981 Timestamp('2017-09-23 00:00:00')
  'Location 36']
 [824.7415114147444 4043.0 404260 Timestamp('2017-11-24 00:00:00')
  'Location 36']
 [58208.37254901961 1704.0 388678 Timestamp('2017-11-11 00:00:00')
  'Location 36']
 [3037.2644503358906 1896.0 396717 Timestamp('2017-11-18 00:00:00')
  'Location 36']
 [10276.89274216323 1704.0 381981 Timestamp('2017-11-17 00:00:00')
  'Location 36']]


In [22]:
cluster_dict=[]
for c in clusters:
    cluster_dict.append(c)

In [23]:
cluster_dict

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 3,
 3,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 3,
 3,
 3,
 4,
 4,
 3,
 1,
 3,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 3,
 1,
 3,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 3,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 3,
 3,
 1,
 1,
 1,
 1,
 3,
 3,
 3,
 4,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 3,
 3,
 1,
 3,
 1,
 1,
 3,
 3,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 1,
 1,


In [24]:
rfm_df['Cluster']=cluster_dict

In [25]:
rfm_df

Unnamed: 0,CustomerID,InvoieNo,Date of purchase,Price,Shipping Location,Cluster
0,4355.0,398177,2017-10-29,1926.0,Location 1,1
1,4352.0,394422,2017-10-05,1740.0,Location 1,1
2,4352.0,394422,2017-10-12,1866.0,Location 1,1
3,4352.0,388633,2017-08-22,1869.0,Location 1,1
4,4352.0,394422,2017-10-10,1888.0,Location 1,1
...,...,...,...,...,...,...
537945,37.0,402292,2017-11-28,384.0,Location 8,1
537946,37.0,402292,2017-11-27,398.0,Location 8,1
537947,21.0,363890,2016-12-21,2464.0,Location 8,3
537948,21.0,363890,2016-12-21,4068.0,Location 8,3


In [26]:
rfm_df[rfm_df['Cluster']== 0].head(10)

Unnamed: 0,CustomerID,InvoieNo,Date of purchase,Price,Shipping Location,Cluster
466,3717.0,385008,2017-07-18,19008.0,Location 1,0
467,3717.0,385008,2017-07-16,19152.0,Location 1,0
468,3717.0,385008,2017-07-23,24150.0,Location 1,0
494,3655.0,400210,2017-11-12,18840.0,Location 1,0
649,3326.0,404985,2017-12-09,20352.0,Location 1,0
650,3326.0,401836,2017-11-20,20544.0,Location 1,0
1278,2244.0,405093,2017-12-10,18864.0,Location 1,0
1279,2244.0,397402,2017-10-29,20760.0,Location 1,0
1348,2148.0,391284,2017-09-17,19848.0,Location 1,0
1455,2000.0,397257,2017-10-22,27750.0,Location 1,0


In [27]:
rfm_df[rfm_df['Cluster']== 1].head(10)

Unnamed: 0,CustomerID,InvoieNo,Date of purchase,Price,Shipping Location,Cluster
0,4355.0,398177,2017-10-29,1926.0,Location 1,1
1,4352.0,394422,2017-10-05,1740.0,Location 1,1
2,4352.0,394422,2017-10-12,1866.0,Location 1,1
3,4352.0,388633,2017-08-22,1869.0,Location 1,1
4,4352.0,394422,2017-10-10,1888.0,Location 1,1
5,4349.0,397122,2017-10-27,256.0,Location 1,1
6,4343.0,368432,2017-02-13,-3688.0,Location 1,1
11,4339.0,361897,2016-12-11,1872.0,Location 1,1
12,4333.0,375503,2017-04-24,1750.0,Location 1,1
13,4331.0,394146,2017-10-01,1702.0,Location 1,1


In [28]:
rfm_df[rfm_df['Cluster']== 2].head(10)

Unnamed: 0,CustomerID,InvoieNo,Date of purchase,Price,Shipping Location,Cluster
119,4126.0,390304,2017-09-09,85320.0,Location 1,2
202,4010.0,393455,2017-10-02,87156.0,Location 1,2
782,3203.0,386115,2017-07-28,54432.0,Location 1,2
783,3203.0,394634,2017-10-06,78768.0,Location 1,2
2270,1006.0,370646,2017-03-05,56928.0,Location 1,2
2629,459.0,388138,2017-08-16,72090.0,Location 1,2
2703,319.0,367693,2017-02-05,48480.0,Location 1,2
2704,319.0,367693,2017-02-05,50592.0,Location 1,2
2705,319.0,396928,2017-10-26,54528.0,Location 1,2
5351,1346.0,361954,2016-12-04,82080.0,Location 11,2


In [29]:
rfm_df[rfm_df['Cluster']== 3].head(10)

Unnamed: 0,CustomerID,InvoieNo,Date of purchase,Price,Shipping Location,Cluster
7,4341.0,377109,2017-05-14,2031.0,Location 1,3
8,4341.0,377109,2017-05-12,2076.0,Location 1,3
9,4341.0,390217,2017-09-07,4020.0,Location 1,3
10,4341.0,389462,2017-09-04,4044.0,Location 1,3
24,4331.0,385252,2017-07-25,3440.0,Location 1,3
25,4331.0,390869,2017-09-08,3460.0,Location 1,3
26,4331.0,375055,2017-04-18,3660.0,Location 1,3
27,4331.0,395889,2017-10-16,3688.0,Location 1,3
30,4328.0,365323,2017-01-10,2048.0,Location 1,3
32,4325.0,366343,2017-01-18,2076.0,Location 1,3


In [30]:
rfm_df[rfm_df['Cluster']== 4].head(10)

Unnamed: 0,CustomerID,InvoieNo,Date of purchase,Price,Shipping Location,Cluster
28,4331.0,382695,2017-06-26,10248.0,Location 1,4
29,4331.0,377010,2017-05-11,10296.0,Location 1,4
149,4060.0,395716,2017-10-12,10788.0,Location 1,4
367,3889.0,394481,2017-10-13,8172.0,Location 1,4
398,3862.0,393594,2017-09-30,6936.0,Location 1,4
426,3802.0,406084,2017-12-16,7800.0,Location 1,4
517,3580.0,399418,2017-11-07,8520.0,Location 1,4
518,3580.0,395507,2017-10-20,12480.0,Location 1,4
555,3484.0,385265,2017-07-22,10200.0,Location 1,4
720,3250.0,401580,2017-11-24,9340.0,Location 1,4
