In [59]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules
import re
from scipy import stats

# Load Dataset

In [60]:
df = pd.read_csv("Salinan Online Retail Data.csv")
print(df.shape)
print(df.info())
df.head(10)

(461773, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 461773 entries, 0 to 461772
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   order_id      461773 non-null  object 
 1   product_code  461773 non-null  object 
 2   product_name  459055 non-null  object 
 3   quantity      461773 non-null  int64  
 4   order_date    461773 non-null  object 
 5   price         461773 non-null  float64
 6   customer_id   360853 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 24.7+ MB
None


Unnamed: 0,order_id,product_code,product_name,quantity,order_date,price,customer_id
0,493410,TEST001,This is a test product.,5,2010-01-04 09:24:00,4.5,12346.0
1,C493411,21539,RETRO SPOTS BUTTER DISH,-1,2010-01-04 09:43:00,4.25,14590.0
2,493412,TEST001,This is a test product.,5,2010-01-04 09:53:00,4.5,12346.0
3,493413,21724,PANDA AND BUNNIES STICKER SHEET,1,2010-01-04 09:54:00,0.85,
4,493413,84578,ELEPHANT TOY WITH BLUE T-SHIRT,1,2010-01-04 09:54:00,3.75,
5,493413,21723,ALPHABET HEARTS STICKER SHEET,1,2010-01-04 09:54:00,0.85,
6,493414,21844,RETRO SPOT MUG,36,2010-01-04 10:28:00,2.55,14590.0
7,493414,21533,RETRO SPOT LARGE MILK JUG,12,2010-01-04 10:28:00,4.25,14590.0
8,493414,37508,NEW ENGLAND CERAMIC CAKE SERVER,2,2010-01-04 10:28:00,2.55,14590.0
9,493414,35001G,HAND OPEN SHAPE GOLD,2,2010-01-04 10:28:00,4.25,14590.0


# Data Cleansing

In [61]:
# Drop N/A values
df.dropna(how='any',inplace=True)

# Remove Unnecessary rows
df = df[~df['order_id'].str.contains('C')]
df = df[(~df['product_code'].str.contains('TEST')) | (~df['product_name'].str.contains('test'))]
df = df[df['price'] > 0]

# Remove Outlier
df = df[(np.abs(stats.zscore(df[['quantity','price']])) < 3).all(axis=1)]

# Change Dtype
df['order_date'] = df['order_date'].astype('datetime64[ns]')
df['customer_id'] = df['customer_id'].astype('str')

# Fixing different product_name on the same product_code issue
temp = df.groupby(['product_code','product_name'],as_index=False).agg(count=('order_id','nunique')).sort_values(['product_code','count'],ascending=[True,False])
temp['rank'] = temp.groupby('product_code')['count'].rank(method='first',ascending=False).astype('int')
temp = temp[temp['rank'] == 1].drop(columns=['rank','count'])
temp.rename(columns={'product_name':'frequent_product_name'},inplace=True)
df = df.merge(temp,how='left',on='product_code')
df['product_name'] = df['frequent_product_name']
df.drop('frequent_product_name',axis=1,inplace=True)

df.reset_index(drop=True,inplace=True)
df

Unnamed: 0,order_id,product_code,product_name,quantity,order_date,price,customer_id
0,493414,21844,RED RETROSPOT MUG,36,2010-01-04 10:28:00,2.55,14590.0
1,493414,21533,RETRO SPOT LARGE MILK JUG,12,2010-01-04 10:28:00,4.25,14590.0
2,493414,37508,NEW ENGLAND CERAMIC CAKE SERVER,2,2010-01-04 10:28:00,2.55,14590.0
3,493414,35001G,HAND OPEN SHAPE GOLD,2,2010-01-04 10:28:00,4.25,14590.0
4,493414,21527,RED RETROSPOT TRADITIONAL TEAPOT,12,2010-01-04 10:28:00,6.95,14590.0
...,...,...,...,...,...,...,...
351278,539988,84380,SET OF 3 BUTTERFLY COOKIE CUTTERS,1,2010-12-23 16:06:00,1.25,18116.0
351279,539988,84849D,HOT BATHS SOAP HOLDER,1,2010-12-23 16:06:00,1.69,18116.0
351280,539988,84849B,FAIRY SOAP SOAP HOLDER,1,2010-12-23 16:06:00,1.69,18116.0
351281,539988,22854,CREAM SWEETHEART EGG HOLDER,2,2010-12-23 16:06:00,4.95,18116.0


In [62]:
print(df.info())
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351283 entries, 0 to 351282
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   order_id      351283 non-null  object        
 1   product_code  351283 non-null  object        
 2   product_name  351283 non-null  object        
 3   quantity      351283 non-null  int64         
 4   order_date    351283 non-null  datetime64[ns]
 5   price         351283 non-null  float64       
 6   customer_id   351283 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(4)
memory usage: 18.8+ MB
None


Unnamed: 0,quantity,order_date,price
count,351283.0,351283,351283.0
mean,9.636723,2010-07-22 13:27:39.280636928,2.969548
min,1.0,2010-01-04 10:28:00,0.001
25%,2.0,2010-04-25 12:34:00,1.25
50%,4.0,2010-08-03 12:13:00,1.95
75%,12.0,2010-10-25 10:41:00,3.75
max,212.0,2010-12-23 16:06:00,95.0
std,17.129233,,3.236739
