In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
%matplotlib inline
import seaborn as sns

In [2]:
# Figures are saved with relative file names, so the working directory decides where they land.
# Use a local "images" folder (override with the IMAGE_DIR environment variable) instead of an
# absolute path that only exists on the original author's machine.
if 'IMAGE_DIR' not in globals():
    # Resolve once per kernel so re-running this cell does not nest images/images.
    IMAGE_DIR = os.path.abspath(os.environ.get('IMAGE_DIR', 'images'))
os.makedirs(IMAGE_DIR, exist_ok=True)
os.chdir(IMAGE_DIR)


In [3]:
# Location of the "Online Retail" Excel workbook on the UCI archive; it is read directly from this URL.
dataset = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx'

In [None]:
# Load the workbook, forcing the two identifier columns to text so they are never treated as numbers.
# `encoding` is not a read_excel option (pandas >= 1.0 raises TypeError on it), so it is not passed.
df_initial = pd.read_excel(dataset, converters={'CustomerID': str, 'InvoiceNo': str})
df_initial.head()

In [None]:
# Column dtypes and non-null counts right after loading.
df_initial.info()

In [None]:
# Summary statistics of the freshly loaded data.
df_initial.describe()

For the purpose of this exercise, let's first check whether invoices beginning with 'C' (i.e. cancelled purchases) are in the dataset. The giftware shop is interested in segmenting customers based on products they actually purchased, not cancelled purchases, so we can remove all cancelled purchases from the dataset.

Let's also remove null records, especially where a customer id is not assigned.

In [None]:
# Keep only invoices whose number does not start with 'C' (cancellations).
# na=False treats a missing invoice number as "not cancelled", so those rows are kept here.
is_cancelled = df_initial['InvoiceNo'].str.startswith('C', na=False)
df_initial = df_initial.loc[~is_cancelled]

In [None]:
# Number of missing values per column.
df_initial.isnull().sum()
# Observation from the output: Description & CustomerID contain null records.

In [None]:
# Share of rows that have no CustomerID.
# Series.count() skips nulls, so dividing by it would give missing / non-missing rather than a
# percentage of all rows; the mean of the null mask is missing / total.
missing_customerIDs = df_initial['CustomerID'].isnull().mean()

print('Percentage of Customer IDs missing is {:.1%}'.format(missing_customerIDs))

In [None]:
# Rows with a missing value (notably a missing CustomerID) cannot be attributed to a customer,
# so drop every row containing any null, then confirm that no nulls remain.
df_initial = df_initial.dropna()
df_initial.isnull().sum()


In [None]:
# Line-level amount: quantity on the invoice line multiplied by the unit price.
df_initial['Total Line Amount'] = df_initial['Quantity'].mul(df_initial['UnitPrice'])

In [None]:
# Day of week of each purchase (pandas convention: Monday=0 ... Sunday=6), then list the days that occur.
invoice_timestamps = pd.to_datetime(df_initial['InvoiceDate'])
df_initial['PurchaseDayofWeek'] = invoice_timestamps.dt.dayofweek
df_initial['PurchaseDayofWeek'].unique()

In [None]:
# Hour of day (0-23) of each purchase, then list the hours that occur.
# No `format` is passed: the earlier '%H:%m' meant "hour:month" and does not describe a full
# timestamp, so it would fail if InvoiceDate ever arrived as text. Let pandas infer the format.
df_initial['PurchaseTimeofDay'] = pd.to_datetime(df_initial['InvoiceDate']).dt.hour
df_initial['PurchaseTimeofDay'].unique()

In [None]:
# Calendar date of each invoice with the time of day stripped off.
invoice_timestamps = pd.to_datetime(df_initial['InvoiceDate'])
df_initial['InvoiceDate_noTime'] = invoice_timestamps.dt.date

In [None]:
# Headline counts: distinct customers, distinct invoices and distinct stock codes.
distinct_counts = df_initial[['CustomerID', 'InvoiceNo', 'StockCode']].nunique()
print('Total no. of customers: ', distinct_counts['CustomerID'])
print('Total transactions : ', distinct_counts['InvoiceNo'])
print('Products sold are : ', distinct_counts['StockCode'])

In [None]:
# Renumber rows 0..n-1 after the filtering above; the old index is discarded.
df_initial = df_initial.reset_index(drop=True)

In [None]:
# Re-check dtypes and non-null counts after cleaning and adding the derived columns.
df_initial.info()

Since we are looking at the data on a customer level, let's aggregate & perform feature engineering on the dataset:

1. Total Purchase Amount by Customer 
1. Number of Transactions by Customer
1. Distinct Product Types Purchased by Customer
1. Tenure (in days) of being a customer (based on last transaction in the dataset)
1. Apply getdummies to country feature; 1 = UK, 0 = Non-UK
1. Customers who, on average, purchase more products during the business week (Monday - Friday). Will denote with 1
1. Customers who, on average, purchase more products during normal business hours (to keep our analysis consistent, let's assume the dates and times are recorded in British time; therefore, we will check for times between 9am and 5pm British time). We will also factor in when a customer calls during the weekend, as I want to make this indicator mutually exclusive from the workweek attribute above. Will denote with 1

Once we have aggregated our measures, let's create a new dataframe that groups all features by customer 


In [None]:
# Total purchase amount per customer: sum of the line amounts, one row per CustomerID.
total_amount = (
    df_initial.groupby('CustomerID')['Total Line Amount']
    .sum()
    .reset_index()
    .rename(columns={"Total Line Amount": "TotlAmtbyCust"})
)

In [None]:
# Number of "transactions" per customer, one row per CustomerID.
# NOTE(review): count() tallies invoice *lines*, so an invoice with 10 items contributes 10.
# The summary cell earlier counts transactions as distinct InvoiceNo (nunique) - confirm which is intended.
transactions = (
    df_initial.groupby('CustomerID')['InvoiceNo']
    .count()
    .reset_index()
    .rename(columns={"InvoiceNo": "TotalCustTransactions"})
)

In [None]:
# Number of distinct stock codes each customer has bought, one row per CustomerID.
product_types = (
    df_initial.groupby('CustomerID')['StockCode']
    .nunique()
    .reset_index()
    .rename(columns={"StockCode": "DistProductsbyCust"})
)

In [None]:
# "Tenure" in days, measured back from the latest invoice date in the data.
# NOTE(review): taking the per-customer minimum gap yields days since the customer's MOST RECENT
# purchase (recency); days since the first purchase would need max() - confirm the intent.
final = df_initial['InvoiceDate_noTime'].max()
df_initial['TenureofCustomer'] = final - df_initial['InvoiceDate_noTime']
CustomerTenure = df_initial.groupby('CustomerID')['TenureofCustomer'].min().dt.days

In [None]:
# Number of invoice lines per country, largest first.
df_initial['Country'].value_counts()
# Observation from the output: the biggest customer base is in the UK.

Since a good portion of the online retailer's customers are from the UK, let's create a new feature where we assign a customer from the UK with 1 and everyone else 0

In [None]:
# UK indicator per customer: 1 if any of the customer's invoice lines is from the United Kingdom, else 0.
# Built directly from a boolean mask instead of get_dummies so that:
#   - no positional `axis` is passed to pd.concat (keyword-only in pandas 2),
#   - the first row is no longer dropped by a stray `.loc[1:]`,
#   - the cell still works when every (or no) customer is from the UK.
# Result: DataFrame with columns CustomerID and IsUK.
is_uk = (df_initial['Country'] == 'United Kingdom').astype(int).rename('IsUK')
Countries = is_uk.groupby(df_initial['CustomerID']).max().reset_index()

In [None]:
# Weekday indicator per customer: 1 when more than half of the customer's invoice lines fall on
# Monday-Friday (PurchaseDayofWeek 0-4), else 0.
# The earlier expression `(day == 0) & (day == 6).any()` bound `.any()` to the second comparison
# only, so it really tested "average day is Monday"; it then assigned a two-column get_dummies
# frame to a single column, which current pandas rejects.
# Result: DataFrame with columns CustomerID and IsWeekDay.
on_weekday = (df_initial['PurchaseDayofWeek'] < 5).rename('IsWeekDay')
weekday_share = on_weekday.groupby(df_initial['CustomerID']).mean()
IsWeekday = (weekday_share > 0.5).astype(int).reset_index()


In [None]:
# Business-hours indicator per customer: 1 when more than half of the customer's invoice lines
# were placed between 09:00 and 16:59 (i.e. 9am-5pm), else 0.
# The earlier version capped the window at hour 14 despite the stated 9am-5pm, and relied on a
# double negation (np.where(..., False, True) followed by the first get_dummies column).
# NOTE(review): the narrative also mentions excluding weekends; that is not applied here - confirm.
# Result: DataFrame with columns CustomerID and IsWorkDay.
in_business_hours = df_initial['PurchaseTimeofDay'].between(9, 16).rename('IsWorkDay')
business_hours_share = in_business_hours.groupby(df_initial['CustomerID']).mean()
IsWorkDay = (business_hours_share > 0.5).astype(int).reset_index()

In [None]:
# Assemble the customer-level feature table: inner-join every per-customer feature on CustomerID,
# then drop the identifier so only model features remain.
df_new = total_amount
for feature_table in (transactions, product_types, CustomerTenure, Countries, IsWeekday, IsWorkDay):
    df_new = pd.merge(df_new, feature_table, how='inner', on='CustomerID')
df_new = df_new.drop(columns='CustomerID')

In [None]:
# Box plots of the four continuous customer features to eyeball outliers.
attributes = ['TotlAmtbyCust', 'TotalCustTransactions', 'DistProductsbyCust', 'TenureofCustomer']
plt.rcParams['figure.figsize'] = [10, 8]  # note: changes the default size for later figures too
ax = sns.boxplot(data=df_new[attributes], orient="v", palette="Set2", whis=1.5, saturation=1, width=0.7)
ax.set_title("Outliers Variable Distribution", fontsize=14, fontweight='bold')
ax.set_ylabel("Range", fontweight='bold')
ax.set_xlabel("Attributes", fontweight='bold')

Whoa! BIG outliers in TotlAmtbyCust; this could throw off our model. Let's reduce the dataset to only include customers whose total amount & number of transactions fall between the 4th and 95th percentiles to help reduce the <b>outliers</b>. 

In [None]:
# Trim outliers: keep customers strictly between the 4th and 95th percentile, first on total
# amount and then (on the already-trimmed data) on number of transactions.
# Re-running this cell trims the data again, because df_new is overwritten.
for trimmed_col in ("TotlAmtbyCust", "TotalCustTransactions"):
    q_low = df_new[trimmed_col].quantile(0.04)
    q_hi = df_new[trimmed_col].quantile(0.95)
    df_new = df_new[(df_new[trimmed_col] < q_hi) & (df_new[trimmed_col] > q_low)]
df_new = df_new.reset_index(drop=True)
df_new.describe()


Now that we have an aggregated dataset with features important to customers & removed outliers, let's perform some EDA & visualize the dataset.

Let's first look at the continuous features to see if we can obtain any details related to customer spending habits

In [None]:
# One 100-bin histogram per continuous feature, laid out on a 3x2 grid.
continuous_features = ['TotlAmtbyCust', 'TotalCustTransactions', 'DistProductsbyCust', 'TenureofCustomer']
plt.figure(1, figsize=(10, 15))
for n, x in enumerate(continuous_features, start=1):
    plt.subplot(3, 2, n)
    plt.hist(df_new[x], bins=100)
    plt.title('Histogram of {}'.format(x))
plt.show()


In [None]:
# Count plot of each 0/1 indicator feature, side by side.
plt.figure(1, figsize=(15, 6))
plt.subplots_adjust(hspace=0.5, wspace=0.5)
for n, col in enumerate(['IsUK', 'IsWeekDay', 'IsWorkDay'], start=1):
    plt.subplot(1, 3, n)
    # Pass the column as `x=`: a positional Series is taken as `data` by current seaborn
    # (and triggers a FutureWarning on older releases), which does not count the categories.
    sns.countplot(x=df_new[col])
    plt.title('Histogram of {}'.format(col))
plt.show()


1. Majority of customers have purchased between \\$0-\\$1000 in total from Giftware Shop
1. Majority of customers have purchased between 0-50 or so times
1. Customers tend to only purchase less than 50 distinct products from Giftware Shop in total
1. The tenure is a little more spread out, but the majority of Giftware Shop's customer base falls between 0 (new) and 100 days of transactions. This might be a good item to flag to Giftware Shop: they should focus on customer retention.
1. As expected, the majority of the customers are from the UK
1. Most customers purchase products during the week
1. Most customers purchase products during the work day

Since we know that most of the customers are in the UK, let's go ahead and remove anyone outside of the UK as a customer. There's not a ton of data to delineate customers from outside of the UK so let's focus this to reflect only UK customers.

Let's keep customers who purchase during the week & workday for now since they do not appear to have much presence in the dataset

In [None]:
# Restrict the analysis to UK customers and renumber the rows so positions match later label arrays.
uk_customers = df_new["IsUK"].eq(1)
df_new = df_new.loc[uk_customers].reset_index(drop=True)
df_new.info()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the four continuous features (zero mean, unit variance) before clustering.
cluster_features = ['TotlAmtbyCust', 'TotalCustTransactions', 'DistProductsbyCust', 'TenureofCustomer']
sc = StandardScaler()
X_scaled = sc.fit_transform(df_new[cluster_features])

In [None]:
from sklearn.cluster import KMeans
import sys
from yellowbrick.cluster import KElbowVisualizer
# Unfitted, seeded KMeans estimator; it is handed to KElbowVisualizer in the next cell.
kmeans = KMeans(random_state=42)

In [None]:
# Elbow plots over the k=(2,10) range, scored once with Calinski-Harabasz and once with silhouette.
elbow_runs = (
    {'metric': 'calinski_harabasz', 'timings': False},
    {'metric': 'silhouette', 'timings': False, 'locate_elbow': True},
)
for elbow_kwargs in elbow_runs:
    visualizer = KElbowVisualizer(kmeans, k=(2,10), **elbow_kwargs)
    visualizer.fit(X_scaled)
    visualizer.show()

In [None]:
# Fit the final 3-cluster KMeans model and keep the per-customer cluster ids.
k_means = KMeans(n_clusters=3, init='k-means++', random_state=0)
k_means.fit(X_scaled)
clusters = k_means.predict(X_scaled)
cl_labels_k = k_means.labels_

Use PCA to reduce our dimensionality to see how KMeans clusters our data

In [None]:
from sklearn.decomposition import PCA

# Project the scaled features onto two principal components and colour the points by KMeans cluster.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

for cluster_id in range(3):
    members = cl_labels_k == cluster_id
    plt.scatter(X_pca[members, 0], X_pca[members, 1], label='Cluster {}'.format(cluster_id + 1))

plt.xlabel('Feature-1')
plt.ylabel('Feature-2')
plt.legend()
plt.title('PCA KMeans Clustering Scatter Plot')
plt.savefig('KMEANS.png')

In [None]:
# Attach a readable KMeans cluster name ('Cluster 1', 'Cluster 2', ...) to every customer.
# The names are built in one pass instead of overwriting an integer column with strings row by
# row (the earlier `astype(...)` result was discarded, and current pandas warns or fails on that
# dtype-changing assignment). Indexing by df_new.index keeps labels and customers aligned.
df_labels = pd.DataFrame(
    {'labels': ['Cluster {}'.format(cluster_id + 1) for cluster_id in k_means.labels_]},
    index=df_new.index,
)
df_new['Kmeans_labels'] = df_labels['labels'].astype('category')

In [None]:
# Number of customers in each KMeans cluster, ordered by cluster name.
df_new['Kmeans_labels'].value_counts().sort_index()

In [None]:

# Per-feature histograms with one series per KMeans cluster, saved to KMEANSHistograms.png.
hist_features = ['TotlAmtbyCust', 'TotalCustTransactions', 'DistProductsbyCust', 'TenureofCustomer']
label = df_new['Kmeans_labels'].unique()

plt.figure(1, figsize=(15, 15))
for n, var in enumerate(hist_features, start=1):
    plt.subplot(3, 2, n)
    per_cluster_values = [df_new.loc[df_new['Kmeans_labels'] == x, var] for x in label]
    plt.hist(per_cluster_values, label=label)
    plt.title('K Means Histogram of {}'.format(var))
    plt.legend()
plt.suptitle('K Means Cluster Results')
plt.savefig('KMEANSHistograms.png')
plt.show()


### Conclusions on K-Means Clustering with k=3

K-Means Clustering identified 3 primary clusters in the dataset:

- Customers tagged to __Cluster Id 2__ have a high amount of transactions as compared to the other customer clusters.
- Customers tagged to __Cluster Id 2__ are also frequent buyers & tend to buy a wider array of products.
- Customers tagged to __Cluster Id 3__ are newer buyers and, while they buy more than cluster 1, they don't really spend that much from a total amount & transaction perspective.
- Customers tagged to __Cluster Id 1__ are not recent buyers and tend to purchase less (in total amount spent, transactions, as well as the type of products they buy) and hence least of importance from business point of view.

But based on the dimensionality reduction we performed to visualize our clusters, does this really reflect our customer base? It looks like one big cluster, which suggests that KMeans isn't doing the best job of fitting our data. Let's try DBSCAN & Spectral Clustering to see if they do a better job of identifying our customer segments.


In [None]:
from sklearn.cluster import DBSCAN

# Fit DBSCAN with its default parameters; label -1 marks points it treats as noise.
db = DBSCAN()
db.fit(X_scaled)
cl_labels_d = db.labels_

In [None]:
from sklearn.decomposition import PCA

# Project the scaled features onto two principal components and colour the points by DBSCAN label.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# DBSCAN chooses the number of clusters itself, so plot every id it produced rather than
# assuming exactly three (any further clusters were silently left off the chart before).
for cluster_id in np.unique(cl_labels_d):
    if cluster_id == -1:
        continue  # noise is drawn separately below, with smaller markers
    members = cl_labels_d == cluster_id
    plt.scatter(X_pca[members, 0], X_pca[members, 1], label='Cluster {}'.format(cluster_id + 1))
noise = cl_labels_d == -1
plt.scatter(X_pca[noise, 0], X_pca[noise, 1], s=6, label='Noisy')

plt.xlabel('Feature-1')
plt.ylabel('Feature-2')
plt.legend()
plt.title('PCA DBScan Scatter Plot')
plt.savefig('DBSCAN.png')

In [None]:
# Attach a readable DBSCAN label to every customer: -1 becomes 'Noisy', cluster id k becomes
# 'Cluster k+1'. Every id DBSCAN produced is named (previously ids above 2 stayed as raw
# integers), and the names are built in one pass instead of overwriting an integer column with
# strings row by row. Indexing by df_new.index keeps labels and customers aligned.
df_labels = pd.DataFrame(
    {'labels': ['Noisy' if cluster_id == -1 else 'Cluster {}'.format(cluster_id + 1)
                for cluster_id in cl_labels_d]},
    index=df_new.index,
)
df_new['DBSCAN_labels'] = df_labels['labels'].astype('category')

In [None]:
# Number of customers per DBSCAN label, ordered by label name.
df_new['DBSCAN_labels'].value_counts().sort_index()

In [None]:
# Per-feature histograms with one series per DBSCAN label, saved to DBSCANHistograms.png.
hist_features = ['TotlAmtbyCust', 'TotalCustTransactions', 'DistProductsbyCust', 'TenureofCustomer']
label = df_new['DBSCAN_labels'].unique()

plt.figure(1, figsize=(15, 15))
for n, var in enumerate(hist_features, start=1):
    plt.subplot(3, 2, n)
    per_cluster_values = [df_new.loc[df_new['DBSCAN_labels'] == x, var] for x in label]
    plt.hist(per_cluster_values, label=label)
    plt.title('DBSCAN Label Histogram of {}'.format(var))
    plt.legend()
plt.suptitle('DBSCAN Cluster Results')
plt.savefig('DBSCANHistograms.png')
plt.show()


### Conclusions on DBSCAN Clustering

This algorithm looks promising! It was able to identify that the majority of our customers fit in the same cluster based on their behaviors.

DBSCAN Clustering identified 3 primary clusters & 1 noisy label:

- DBSCAN grouped the majority of our customers in __Cluster Id 1__: It recognized that most of the customers spend and transact relatively in the same manner. This aligns with the 'single' cluster presented in the lower dimensionality visualization.
- Customers tagged to __Cluster Id 2__ & __Cluster Id 3__ are more frequent buyers & tend to buy a wider array of products.
- We can disregard the __Noisy__ label as the algorithm detected noise in our dataset

In [None]:
from sklearn.cluster import SpectralClustering

# Fit 3-cluster spectral clustering. A fixed random_state makes the cluster assignment
# reproducible across runs (it was unseeded before), matching the seeded KMeans cells.
# Note: `sc` previously held the StandardScaler and is rebound to the clustering model here.
sc = SpectralClustering(n_clusters=3, random_state=42).fit(X_scaled)
cl_labels_s = sc.labels_
np.unique(cl_labels_s)

In [None]:
from sklearn.decomposition import PCA

# Project the scaled features onto two principal components and colour the points by spectral cluster.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

for cluster_id in range(3):
    members = cl_labels_s == cluster_id
    plt.scatter(X_pca[members, 0], X_pca[members, 1], label='Cluster {}'.format(cluster_id + 1))

plt.xlabel('Feature-1')
plt.ylabel('Feature-2')
plt.title('PCA Spectral Clustering Scatter Plot')
plt.legend()
plt.savefig('SPECTRALCLUSTER.png')

In [None]:
# Attach a readable spectral-clustering name ('Cluster 1', 'Cluster 2', ...) to every customer.
# The names are built in one pass instead of overwriting an integer column with strings row by
# row (the earlier `astype(...)` result was discarded, and current pandas warns or fails on that
# dtype-changing assignment). Indexing by df_new.index keeps labels and customers aligned.
df_labels = pd.DataFrame(
    {'labels': ['Cluster {}'.format(cluster_id + 1) for cluster_id in cl_labels_s]},
    index=df_new.index,
)
df_new['SpectralClustering_labels'] = df_labels['labels'].astype('category')

In [None]:
# Number of customers in each spectral cluster, ordered by cluster name.
df_new['SpectralClustering_labels'].value_counts().sort_index()

In [None]:
# Per-feature histograms with one series per spectral cluster, saved to SCHistograms.png.
hist_features = ['TotlAmtbyCust', 'TotalCustTransactions', 'DistProductsbyCust', 'TenureofCustomer']
label = df_new['SpectralClustering_labels'].unique()

plt.figure(1, figsize=(15, 15))
for n, var in enumerate(hist_features, start=1):
    plt.subplot(3, 2, n)
    per_cluster_values = [df_new.loc[df_new['SpectralClustering_labels'] == x, var] for x in label]
    plt.hist(per_cluster_values, label=label)
    plt.title('Spectral Clustering Histogram of {}'.format(var))
    plt.legend()
plt.suptitle('Spectral Clustering Cluster Results')
plt.savefig('SCHistograms.png')
plt.show()

### Conclusions on Spectral Clustering

Spectral Clustering appears to be the 'middle ground' between KMeans & DBSCAN:
- Similar to DBSCAN, Spectral Clustering identified that majority of customers have similar behaviors and grouped them into a single cluster
- Like KMeans, it still segregated customers into __Cluster 2__ & __Cluster 3__ for those groups that tend to be more frequent buyers & tend to buy a wider array of products. 
- Another interesting observation is that __Cluster 1__ tends to have fewer transactions, a narrower product selection, and fewer purchases than __Cluster 2__ & __Cluster 3__. Seems like Giftware could really benefit from a marketing campaign to bring in new clients!

### Final Observations

When we reduce the dimensionality of the data to visualize the clusters, DBSCAN identified a single cluster, aligning with the majority of customers transacting in the same manner. Another important consideration is that all 3 models identified that a good portion of customers tend to purchase only minimally & not buy a ton of products. Perhaps Giftware Shop should ignite customer engagement by conducting additional marketing campaigns to bring more traffic (and thereby purchases) to the online retail site. 