In [3]:
# importing necessary Python libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
#import plotly.offline as pyoff
import plotly.graph_objs as go 
#import plotly.figure_factory as ff

# avoid displaying warnings
import warnings
warnings.filterwarnings("ignore")

#import machine learning related libraries
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV, cross_validate

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.cluster import KMeans
import xgboost as xgb
import time 

# Read the raw transactions from the CSV file (path is relative to the working directory).
raw_csv_path = 'online_retail_II.csv'
df = pd.read_csv(raw_csv_path)

DATA WRANGLING


In [4]:
# Rename the raw columns to the identifiers the later cells reference:
#    Invoice     -> InvoiceNo
#    Customer ID -> CustomerID
#    Price       -> UnitPrice
column_renames = {
    'Invoice': 'InvoiceNo',
    'Customer ID': 'CustomerID',
    'Price': 'UnitPrice',
}
df.rename(columns=column_renames, inplace=True)

Dropping rows with a missing CustomerID

In [5]:
# Only a missing CustomerID makes a row unusable for the per-customer features
# built below, so restrict the null check to that column instead of discarding
# rows that merely have a gap in some other column (e.g. Description).
df.dropna(subset=['CustomerID'], inplace=True)

In [6]:
# Summarise column dtypes and non-null counts of df after the rows with missing values were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 824364 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    824364 non-null  object 
 1   StockCode    824364 non-null  object 
 2   Description  824364 non-null  object 
 3   Quantity     824364 non-null  int64  
 4   InvoiceDate  824364 non-null  object 
 5   UnitPrice    824364 non-null  float64
 6   CustomerID   824364 non-null  float64
 7   Country      824364 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 56.6+ MB


In [7]:
# Convert InvoiceDate to datetime values so it can be compared against pd.Timestamp bounds.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

In [8]:
# Behaviour window: invoices dated from 2009-12-01 (inclusive) up to 2011-09-01 (exclusive).
bhvr_window_start = pd.Timestamp(2009, 12, 1)
bhvr_window_end = pd.Timestamp(2011, 9, 1)
in_bhvr_window = (df['InvoiceDate'] < bhvr_window_end) & (df['InvoiceDate'] >= bhvr_window_start)
ctm_bhvr_dt = df[in_bhvr_window].reset_index(drop=True)

In [9]:
# Next-quarter window: invoices dated from 2011-09-01 (inclusive) up to 2011-12-01 (exclusive).
next_quarter_start = pd.Timestamp(2011, 9, 1)
next_quarter_end = pd.Timestamp(2011, 12, 1)
in_next_quarter = (df['InvoiceDate'] < next_quarter_end) & (df['InvoiceDate'] >= next_quarter_start)
ctm_next_quarter = df[in_next_quarter].reset_index(drop=True)

In [10]:
# One row per distinct customer seen in ctm_bhvr_dt, in order of first
# appearance, stored in a single column named CustomerID.
distinct_customer_ids = ctm_bhvr_dt['CustomerID'].unique()
cust_dt = pd.DataFrame({'CustomerID': distinct_customer_ids})

In [11]:
# Display the distinct-customer frame (the rendered output elides the middle rows).
cust_dt

Unnamed: 0,CustomerID
0,13085.0
1,13078.0
2,15362.0
3,18102.0
4,12682.0
...,...
5309,15866.0
5310,14660.0
5311,13726.0
5312,15690.0


In [24]:
# For every customer present in ctm_next_quarter, keep the earliest
# InvoiceDate and expose it as the column MinPurchaseDate.
first_purchase_by_customer = ctm_next_quarter.groupby('CustomerID')['InvoiceDate'].min()
ctm_1st_purchase_in_next_quarter = first_purchase_by_customer.reset_index(name='MinPurchaseDate')


In [25]:
# Inspect each customer's first purchase timestamp within the next-quarter window.
ctm_1st_purchase_in_next_quarter

Unnamed: 0,CustomerID,MinPurchaseDate
0,12347.0,2011-10-31 12:25:00
1,12348.0,2011-09-25 13:13:00
2,12349.0,2011-11-21 09:51:00
3,12352.0,2011-09-20 14:34:00
4,12356.0,2011-11-17 08:40:00
...,...,...
2867,18276.0,2011-10-27 10:54:00
2868,18277.0,2011-10-12 15:22:00
2869,18278.0,2011-09-27 11:58:00
2870,18283.0,2011-09-05 12:35:00


In [26]:
# For every customer present in ctm_bhvr_dt, keep the latest InvoiceDate
# and expose it as the column MaxPurchaseDate.
last_purchase_by_customer = ctm_bhvr_dt.groupby('CustomerID')['InvoiceDate'].max()
ctm_last_purchase_bhvr_dt = last_purchase_by_customer.rename('MaxPurchaseDate').reset_index()


In [27]:
# Inspect each customer's last purchase timestamp within the behaviour window.
ctm_last_purchase_bhvr_dt

Unnamed: 0,CustomerID,MaxPurchaseDate
0,12346.0,2011-01-18 10:17:00
1,12347.0,2011-08-02 08:48:00
2,12348.0,2011-04-05 10:47:00
3,12349.0,2010-10-28 08:23:00
4,12350.0,2011-02-02 16:01:00
...,...,...
5309,18283.0,2011-07-14 13:20:00
5310,18284.0,2010-10-06 12:31:00
5311,18285.0,2010-02-17 10:24:00
5312,18286.0,2010-08-20 11:57:00


In [28]:
# Left-join the first next-quarter purchase onto the last behaviour-window
# purchase; customers without a next-quarter purchase keep a missing MinPurchaseDate.
ctm_purchase_dates = ctm_last_purchase_bhvr_dt.merge(
    ctm_1st_purchase_in_next_quarter,
    how='left',
    on='CustomerID',
)

In [33]:
# Whole days between a customer's last purchase in the behaviour window
# (MaxPurchaseDate) and their first purchase in the next quarter (MinPurchaseDate).
gap_to_next_purchase = ctm_purchase_dates['MinPurchaseDate'] - ctm_purchase_dates['MaxPurchaseDate']
ctm_purchase_dates['NextPurchaseDay'] = gap_to_next_purchase.dt.days

In [38]:

# Sentinel stored for customers that have no NextPurchaseDay after the left
# merge (no purchase found in the next-quarter window).
NO_NEXT_PURCHASE_DAYS = 9999

# Remove a NextPurchaseDay column left behind by an earlier execution of this
# cell; otherwise re-running it would produce NextPurchaseDay_x / NextPurchaseDay_y.
cust_dt = cust_dt.drop(columns=['NextPurchaseDay'], errors='ignore')
cust_dt = pd.merge(cust_dt, ctm_purchase_dates[['CustomerID', 'NextPurchaseDay']], on='CustomerID', how='left')
# Fill only the merged column so the sentinel cannot leak into any other feature column.
cust_dt['NextPurchaseDay'] = cust_dt['NextPurchaseDay'].fillna(NO_NEXT_PURCHASE_DAYS)
cust_dt.head()

Unnamed: 0,CustomerID,NextPurchaseDay
0,13085.0,9999.0
1,13078.0,13.0
2,15362.0,9999.0
3,18102.0,27.0
4,12682.0,15.0


In [40]:
# Latest InvoiceDate per customer inside the behaviour window, as a two-column
# frame (CustomerID, MaxPurchaseDate) that the recency feature is derived from.
ctm_max_purchase = (
    ctm_bhvr_dt
    .groupby('CustomerID', as_index=False)['InvoiceDate']
    .max()
    .rename(columns={'InvoiceDate': 'MaxPurchaseDate'})
)
ctm_max_purchase



Unnamed: 0,CustomerID,MaxPurchaseDate
0,12346.0,2011-01-18 10:17:00
1,12347.0,2011-08-02 08:48:00
2,12348.0,2011-04-05 10:47:00
3,12349.0,2010-10-28 08:23:00
4,12350.0,2011-02-02 16:01:00
...,...,...
5309,18283.0,2011-07-14 13:20:00
5310,18284.0,2010-10-06 12:31:00
5311,18285.0,2010-02-17 10:24:00
5312,18286.0,2010-08-20 11:57:00


In [41]:


# Find the recency in days, measured back from the most recent MaxPurchaseDate
# of any customer in ctm_max_purchase.
latest_purchase = ctm_max_purchase['MaxPurchaseDate'].max()
ctm_max_purchase['Recency'] = (latest_purchase - ctm_max_purchase['MaxPurchaseDate']).dt.days

# Remove a Recency column left behind by an earlier execution of this cell;
# otherwise re-running it would produce Recency_x / Recency_y after the merge.
cust_dt = cust_dt.drop(columns=['Recency'], errors='ignore')

# Merge the dataframes cust_dt and ctm_max_purchase[['CustomerID', 'Recency']] on the CustomerID column.
cust_dt = pd.merge(cust_dt, ctm_max_purchase[['CustomerID', 'Recency']], on='CustomerID')
cust_dt.head()



Unnamed: 0,CustomerID,NextPurchaseDay,Recency
0,13085.0,9999.0,57
1,13078.0,13.0,0
2,15362.0,9999.0,348
3,18102.0,27.0,26
4,12682.0,15.0,0


In [42]:
# Summary statistics of Recency, rendered as a one-column table.
cust_dt['Recency'].describe().to_frame()

Unnamed: 0,Recency
count,5314.0
mean,204.675574
std,173.569622
min,0.0
25%,48.0
50%,161.0
75%,320.0
max,638.0


A Small Visualisation

In [47]:
# Histogram of the Recency column of cust_dt.
hist_fig = px.histogram(
    cust_dt,
    x="Recency",
    title='Customer Recency in Days',
    template="plotly_dark",
)

# Centre the title and label both axes.
hist_layout = dict(
    title_x=0.5,
    xaxis_title="Recency in groups of 20 days",
    yaxis_title="No of Custs",
)
hist_fig.update_layout(**hist_layout)

hist_fig.show(config={'displaylogo': False})

In [48]:
# Elbow analysis: record the KMeans inertia for k = 1..9 clusters fitted on
# the Recency column, then plot inertia against k.
my_dict = {}
# Take an explicit copy so that writing the label column below does not
# assign into a slice of cust_dt.
ctm_recency = cust_dt[['Recency']].copy()
for idx in range(1, 10):
    # Fit on Recency only. Fitting on the whole ctm_recency frame would feed
    # the "clusters" labels written by the previous iteration back into the
    # model as a second feature and distort the inertia of the later fits.
    # A fixed random_state makes the curve reproducible between runs.
    kmeans = KMeans(n_clusters=idx, max_iter=1000, random_state=42).fit(ctm_recency[['Recency']])
    ctm_recency["clusters"] = kmeans.labels_
    my_dict[idx] = kmeans.inertia_

line_fig = px.line(x=list(my_dict.keys()),
                   y=list(my_dict.values()),
                   template="plotly_dark"
                  )

line_fig.update_layout(title_x=0,
                       xaxis_title="Number of cluster",
                       yaxis_title=""
                      )

line_fig.show(config={'displaylogo': False})

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fef76d13e20>
Traceback (most recent call last):
  File "/home/gesskay/anaconda3/lib/python3.10/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/gesskay/anaconda3/lib/python3.10/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/gesskay/anaconda3/lib/python3.10/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/gesskay/anaconda3/lib/python3.10/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<loc

In [49]:
# Number of clusters passed to the KMeans fit on Recency in the next cell.
# NOTE(review): presumably read off the elbow plot above — confirm.
number_of_clusters = 4

In [51]:
# Assign every customer to a Recency cluster. The fixed random_state keeps
# the fitted centroids, and therefore the cluster label numbering, the same on
# every run; without it the numbering can change between executions.
recency_features = cust_dt[['Recency']]
kmeans = KMeans(n_clusters=number_of_clusters, random_state=42)
kmeans.fit(recency_features)
cust_dt['RecencyCluster'] = kmeans.predict(recency_features)
cust_dt.head()

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fef74adc820>
Traceback (most recent call last):
  File "/home/gesskay/anaconda3/lib/python3.10/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/gesskay/anaconda3/lib/python3.10/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/gesskay/anaconda3/lib/python3.10/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/gesskay/anaconda3/lib/python3.10/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<loc

Unnamed: 0,CustomerID,NextPurchaseDay,Recency,RecencyCluster
0,13085.0,9999.0,57,0
1,13078.0,13.0,0,0
2,15362.0,9999.0,348,1
3,18102.0,27.0,26,0
4,12682.0,15.0,0,0
