In [3]:
# import required libraries for dataframe and visualization

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import plotly as py 
import plotly.graph_objs as go

from plotly.offline import init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# import required libraries for clustering
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

In [4]:
pd.set_option('display.max_columns', 999)

In [5]:
df = pd.read_csv('OnlineRetail.csv', encoding = 'unicode_escape')
df.head(3)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,01-12-2010 08:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,01-12-2010 08:26,2.75,17850.0,United Kingdom


### 1. Data Preprocessing


In [6]:
df.shape

(541909, 8)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [8]:
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


In [9]:
df.describe(include="O")

Unnamed: 0,InvoiceNo,StockCode,Description,InvoiceDate,Country
count,541909,541909,540455,541909,541909
unique,25900,4070,4223,23260,38
top,573585,85123A,WHITE HANGING HEART T-LIGHT HOLDER,31-10-2011 14:41,United Kingdom
freq,1114,2313,2369,1114,495478


In [11]:
len(df)

541909

In [10]:
len(df[df["CustomerID"].isna()])

135080

In [12]:
df = df.dropna(subset="CustomerID")
len(df)

406829

In [16]:
df.sample(10)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
182289,552533,22697,GREEN REGENCY TEACUP AND SAUCER,6,10-05-2011 09:41,2.95,13767.0,United Kingdom
510178,579432,23316,RED REFECTORY CLOCK,1,29-11-2011 13:12,9.95,15311.0,United Kingdom
354782,567899,23245,SET OF 3 REGENCY CAKE TINS,4,22-09-2011 16:26,4.95,14911.0,EIRE
199543,554098,22577,WOODEN HEART CHRISTMAS SCANDINAVIAN,6,22-05-2011 13:01,0.85,14769.0,United Kingdom
228352,556926,21172,PARTY METAL SIGN,1,15-06-2011 14:26,1.45,16324.0,United Kingdom
266331,560251,23196,VINTAGE LEAF MAGNETIC NOTEPAD,5,17-07-2011 12:32,1.45,17774.0,United Kingdom
392459,570693,23265,SET OF 3 WOODEN TREE DECORATIONS,2,11-10-2011 16:50,1.25,16161.0,United Kingdom
387468,C570290,82001S,VINYL RECORD FRAME SILVER,-4,10-10-2011 11:54,3.75,14665.0,United Kingdom
276245,561036,22728,ALARM CLOCK BAKELIKE PINK,1,24-07-2011 11:54,3.75,13137.0,United Kingdom
79155,542911,22938,CUPCAKE LACE PAPER SET 6,12,01-02-2011 15:41,1.95,12676.0,Sweden


In [18]:
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,406829.0,406829.0,406829.0
mean,12.061303,3.460471,15287.69057
std,248.69337,69.315162,1713.600303
min,-80995.0,0.0,12346.0
25%,2.0,1.25,13953.0
50%,5.0,1.95,15152.0
75%,12.0,3.75,16791.0
max,80995.0,38970.0,18287.0


In [19]:
df.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [20]:
df.sample(3)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
444486,574733,84406B,CREAM CUPID HEARTS COAT HANGER,16,06-11-2011 15:17,4.15,17082.0,United Kingdom
152954,549716,22966,GINGERBREAD MAN COOKIE CUTTER,12,11-04-2011 14:43,1.25,14628.0,United Kingdom
149003,549262,22908,PACK OF 20 NAPKINS RED APPLES,1,07-04-2011 12:38,0.85,14465.0,United Kingdom


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 406829 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    406829 non-null  object 
 1   StockCode    406829 non-null  object 
 2   Description  406829 non-null  object 
 3   Quantity     406829 non-null  int64  
 4   InvoiceDate  406829 non-null  object 
 5   UnitPrice    406829 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      406829 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 27.9+ MB


In [27]:
df[df.CustomerID % 1 != 0]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country


In [28]:
df.head(3)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,01-12-2010 08:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,01-12-2010 08:26,2.75,17850.0,United Kingdom


In [32]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='mixed')

In [33]:
df.head(3)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-01-12 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-01-12 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-01-12 08:26:00,2.75,17850.0,United Kingdom


In [41]:
len(df)

406829

In [43]:
len(df["StockCode"].str.match("^\\d{5}[a-zA-Z]+$") == False)

406829

In [42]:
len(df[(df["StockCode"].str.match("^\\d{5}$") == False) | (df["StockCode"].str.match("^\\d{5}[a-zA-Z]+$") == False)]["StockCode"].unique())

3684

In [38]:
df[(df["StockCode"].str.match("^\\d{5}$") == False) | (df["StockCode"].str.match("^\\d{5}[a-zA-Z]+$") == False)]["StockCode"].unique()

array(['85123A', '71053', '84406B', ..., '90214Z', '90089', '23843'],
      dtype=object)

In [44]:
df[df["StockCode"].str.match("^\\d{5}[a-zA-Z]+$")]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-01-12 08:26:00,2.55,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-01-12 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-01-12 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-01-12 08:26:00,3.39,17850.0,United Kingdom
49,536373,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-01-12 09:02:00,2.55,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541778,581578,84997C,CHILDRENS CUTLERY POLKADOT BLUE,8,2011-09-12 12:16:00,4.15,12713.0,Germany
541809,581579,85099C,JUMBO BAG BAROQUE BLACK WHITE,10,2011-09-12 12:19:00,1.79,17581.0,United Kingdom
541838,581580,84993A,75 GREEN PETIT FOUR CASES,2,2011-09-12 12:20:00,0.42,12748.0,United Kingdom
541844,581580,85049A,TRADITIONAL CHRISTMAS RIBBONS,1,2011-09-12 12:20:00,1.25,12748.0,United Kingdom


In [57]:
df[df["StockCode"].str.startswith("85123")]["StockCode"].unique()

array(['85123A'], dtype=object)

In [36]:
df[df["StockCode"] == "POST"]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
45,536370,POST,POSTAGE,3,2010-01-12 08:45:00,18.0,12583.0,France
386,536403,POST,POSTAGE,1,2010-01-12 11:27:00,15.0,12791.0,Netherlands
1123,536527,POST,POSTAGE,1,2010-01-12 13:04:00,18.0,12662.0,Germany
5073,536840,POST,POSTAGE,1,2010-02-12 18:27:00,18.0,12738.0,Germany
5258,536852,POST,POSTAGE,1,2010-03-12 09:51:00,18.0,12686.0,France
...,...,...,...,...,...,...,...,...
541198,581493,POST,POSTAGE,1,2011-09-12 10:10:00,15.0,12423.0,Belgium
541216,581494,POST,POSTAGE,2,2011-09-12 10:13:00,18.0,12518.0,Germany
541730,581570,POST,POSTAGE,1,2011-09-12 11:59:00,18.0,12662.0,Germany
541767,581574,POST,POSTAGE,2,2011-09-12 12:09:00,18.0,12526.0,Germany
