## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

## Data Preprocessing & Cleaning

In [2]:
# Relative path: resolved against the kernel's current working directory.
dataset_path = "online_retail_II.xlsx"
# With no `sheet_name` argument, read_excel loads only the first worksheet.
# NOTE(review): if the workbook has further sheets they are ignored here — confirm intended.
df = pd.read_excel(dataset_path)

# Preview the first rows to check that the columns parsed as expected.
df.head(10)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom
5,489434,22064,PINK DOUGHNUT TRINKET POT,24,2009-12-01 07:45:00,1.65,13085.0,United Kingdom
6,489434,21871,SAVE THE PLANET MUG,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom
7,489434,21523,FANCY FONT HOME SWEET HOME DOORMAT,10,2009-12-01 07:45:00,5.95,13085.0,United Kingdom
8,489435,22350,CAT BOWL,12,2009-12-01 07:46:00,2.55,13085.0,United Kingdom
9,489435,22349,"DOG BOWL , CHASING BALL DESIGN",12,2009-12-01 07:46:00,3.75,13085.0,United Kingdom


In [3]:
# Dtypes and non-null counts per column; used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      525461 non-null  object        
 1   StockCode    525461 non-null  object        
 2   Description  522533 non-null  object        
 3   Quantity     525461 non-null  int64         
 4   InvoiceDate  525461 non-null  datetime64[ns]
 5   Price        525461 non-null  float64       
 6   Customer ID  417534 non-null  float64       
 7   Country      525461 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 32.1+ MB


In [4]:
# Count of missing values in each column.
df.isna().sum()

Invoice             0
StockCode           0
Description      2928
Quantity            0
InvoiceDate         0
Price               0
Customer ID    107927
Country             0
dtype: int64

In [5]:
# Drop every row that has a missing value in any column. Per the null counts
# shown earlier, that means rows lacking a Description or a Customer ID.
# NOTE(review): the recommender cells visible further down only use Invoice and
# Description, so dropping on Customer ID discards ~108k rows that they could
# still use — confirm this is intentional.
df = df.dropna()
# Re-check: every column should now report the same non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 417534 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      417534 non-null  object        
 1   StockCode    417534 non-null  object        
 2   Description  417534 non-null  object        
 3   Quantity     417534 non-null  int64         
 4   InvoiceDate  417534 non-null  datetime64[ns]
 5   Price        417534 non-null  float64       
 6   Customer ID  417534 non-null  float64       
 7   Country      417534 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 28.7+ MB


In [6]:
# Trim leading/trailing whitespace so the same product text always maps to one label.
df = df.assign(Description=df['Description'].str.strip())

In [7]:
# List the column labels before deciding which ones to keep.
df.columns

Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country'],
      dtype='object')

In [8]:
# Keep the transaction fields and discard StockCode, InvoiceDate and Country.
# Selecting the columns to keep (instead of dropping the unwanted ones) makes
# this cell safe to re-run: a second `drop` of already-removed columns would
# raise KeyError, whereas this selection yields the same frame every time.
columns_to_keep = ['Invoice', 'Description', 'Quantity', 'Price', 'Customer ID']
df = df.loc[:, columns_to_keep]
df.head(15)

Unnamed: 0,Invoice,Description,Quantity,Price,Customer ID
0,489434,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,6.95,13085.0
1,489434,PINK CHERRY LIGHTS,12,6.75,13085.0
2,489434,WHITE CHERRY LIGHTS,12,6.75,13085.0
3,489434,"RECORD FRAME 7"" SINGLE SIZE",48,2.1,13085.0
4,489434,STRAWBERRY CERAMIC TRINKET BOX,24,1.25,13085.0
5,489434,PINK DOUGHNUT TRINKET POT,24,1.65,13085.0
6,489434,SAVE THE PLANET MUG,24,1.25,13085.0
7,489434,FANCY FONT HOME SWEET HOME DOORMAT,10,5.95,13085.0
8,489435,CAT BOWL,12,2.55,13085.0
9,489435,"DOG BOWL , CHASING BALL DESIGN",12,3.75,13085.0


In [9]:
# Work on the first 50,000 remaining rows only, to keep the item matrices small.
df_50k = df.head(50000)

In [10]:
# Number of distinct invoices, then distinct product descriptions, in the sample;
# these are the row and column counts of the invoice x item matrix built next.
print(
    df_50k['Invoice'].nunique(),
    df_50k['Description'].nunique(),
    sep='\n',
)

2957
2983


## Recommender System Using Co-occurrence

In [36]:
# Build the invoice x item "basket" matrix: one row per invoice, one column per
# product description, holding 1 where the invoice contains the product.
#
# pd.factorize assigns each label an integer position in order of first
# appearance (the same order .unique() yields), so the matrix can be filled
# with one vectorized assignment instead of one .loc write per source row.
# Assumes df_50k has no missing Invoice/Description values (factorize encodes
# those as -1); the earlier dropna() cell removes them.
#
# NOTE(review): the resulting columns include non-product labels such as
# "Adjustment by john on ..." — consider filtering those rows out upstream.
invoice_codes, invoice_labels = pd.factorize(df_50k['Invoice'])
item_codes, item_labels = pd.factorize(df_50k['Description'])

basket_flags = np.zeros((len(invoice_labels), len(item_labels)), dtype='int16')
# Repeated (invoice, item) pairs simply rewrite the same cell with 1.
basket_flags[invoice_codes, item_codes] = 1

item_invoice = pd.DataFrame(
    basket_flags,
    index=invoice_labels,
    columns=item_labels,
)

item_invoice.head(10)

Unnamed: 0,15CM CHRISTMAS GLASS BALL 20 LIGHTS,PINK CHERRY LIGHTS,WHITE CHERRY LIGHTS,"RECORD FRAME 7"" SINGLE SIZE",STRAWBERRY CERAMIC TRINKET BOX,PINK DOUGHNUT TRINKET POT,SAVE THE PLANET MUG,FANCY FONT HOME SWEET HOME DOORMAT,CAT BOWL,"DOG BOWL , CHASING BALL DESIGN",...,BLUE/YELLOW CERAMIC CANDLE HOLDER,RED SPOTTY BEAKER,WHITE ROHMBIC BLOCK TABLE LAMP,SILVER FISHING GNOME,BOTTLE BAG RETROSPOT,PINK BABY GIRL'S MEMORY BOX,BLUE BABY BOY'S MEMORY BOX,Adjustment by john on 26/01/2010 16,DOOR MAT BLUE FLOCK,Adjustment by john on 26/01/2010 17
489434,1,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
489435,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
489436,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489437,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489438,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489439,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489440,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
489441,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489442,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489443,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Item x item co-occurrence counts: entry (a, b) is the number of invoices that
# contain both a and b, and the diagonal is the number of invoices containing
# the item at all. The product keeps the int16 dtype of item_invoice.
item_item_matrix = item_invoice.T @ item_invoice

item_item_matrix.head(10)

Unnamed: 0,15CM CHRISTMAS GLASS BALL 20 LIGHTS,PINK CHERRY LIGHTS,WHITE CHERRY LIGHTS,"RECORD FRAME 7"" SINGLE SIZE",STRAWBERRY CERAMIC TRINKET BOX,PINK DOUGHNUT TRINKET POT,SAVE THE PLANET MUG,FANCY FONT HOME SWEET HOME DOORMAT,CAT BOWL,"DOG BOWL , CHASING BALL DESIGN",...,BLUE/YELLOW CERAMIC CANDLE HOLDER,RED SPOTTY BEAKER,WHITE ROHMBIC BLOCK TABLE LAMP,SILVER FISHING GNOME,BOTTLE BAG RETROSPOT,PINK BABY GIRL'S MEMORY BOX,BLUE BABY BOY'S MEMORY BOX,Adjustment by john on 26/01/2010 16,DOOR MAT BLUE FLOCK,Adjustment by john on 26/01/2010 17
15CM CHRISTMAS GLASS BALL 20 LIGHTS,46,3,5,1,3,1,3,1,0,1,...,0,0,0,0,0,0,0,0,0,0
PINK CHERRY LIGHTS,3,121,80,1,13,8,8,23,1,1,...,0,0,1,0,0,0,0,0,0,0
WHITE CHERRY LIGHTS,5,80,148,2,16,8,7,24,2,3,...,0,0,1,0,0,0,0,0,0,0
"RECORD FRAME 7"" SINGLE SIZE",1,1,2,14,3,3,3,1,0,0,...,0,0,0,0,0,0,0,0,0,0
STRAWBERRY CERAMIC TRINKET BOX,3,13,16,3,229,42,14,8,2,4,...,0,0,0,0,0,0,0,0,0,0
PINK DOUGHNUT TRINKET POT,1,8,8,3,42,65,7,4,5,3,...,0,0,0,0,0,0,0,0,0,0
SAVE THE PLANET MUG,3,8,7,3,14,7,79,11,4,3,...,0,0,0,0,0,0,0,0,0,0
FANCY FONT HOME SWEET HOME DOORMAT,1,23,24,1,8,4,11,111,1,1,...,0,0,0,0,0,0,0,0,0,0
CAT BOWL,0,1,2,0,2,5,4,1,40,32,...,0,0,0,0,0,0,0,0,0,0
"DOG BOWL , CHASING BALL DESIGN",1,1,3,0,4,3,3,1,32,52,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Sanity check for the int16 storage: the largest count (487 on this sample)
# must stay below the int16 ceiling of 32767.
column_maxima = item_item_matrix.max()
max_value = column_maxima.max()
print(max_value)

487


In [39]:
# How sparse is the co-occurrence matrix? Count the item pairs never bought together.
print(item_item_matrix.eq(0).sum().sum())

7809212


In [40]:
def recommender1(item_name, item_item_matrix, top_n=2):
    """Recommend the items most often bought together with ``item_name``.

    Parameters
    ----------
    item_name : hashable
        Label of the query item; must be both a column and an index label of
        ``item_item_matrix``.
    item_item_matrix : pandas.DataFrame
        Square item-by-item matrix of co-occurrence counts whose index and
        columns carry the same item labels.
    top_n : int, default 2
        Number of recommendations to return. The default reproduces the
        original behaviour of returning two items.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` co-occurrence counts indexed by item label, highest
        first, with the query item itself excluded.

    Raises
    ------
    KeyError
        If ``item_name`` is not a label of ``item_item_matrix``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    sim_score = item_item_matrix[item_name].sort_values(ascending=False)
    # An item always co-occurs with itself (the diagonal), so remove it
    # before taking the top entries.
    recommended = sim_score.drop(item_name).head(top_n)

    return recommended


# Spot check: the two items most often bought together with one door mat.
test = recommender1("DOOR MAT BLACK FLOCK", item_item_matrix)
# The "\n " in the f-string puts a single leading space before the first
# line of the printed Series only; later lines are not indented.
print(f"recommended item is:\n {test}")

recommended item is:
 RED SPOTTY COIR DOORMAT    21
DOOR MAT NEW ENGLAND       15
Name: DOOR MAT BLACK FLOCK, dtype: int16


## Recommender System Using Cosine Similarity

In [41]:
# Pairwise cosine similarity between the rows of the co-occurrence matrix,
# i.e. between each item's vector of co-purchase counts. scikit-learn returns
# a plain NumPy array without labels.
cosine_sim = cosine_similarity(item_item_matrix)

# Wrap the array in a DataFrame and re-attach the item labels on both axes so
# that similarities can be looked up by item name.
cosine_sim_df = pd.DataFrame(
    data=cosine_sim,
    index=item_item_matrix.index,
    columns=item_item_matrix.columns,
)

In [42]:
def recommender2(item_name, cosine_sim_df, top_n=2):
    """Recommend the items whose similarity score to ``item_name`` is highest.

    Parameters
    ----------
    item_name : hashable
        Label of the query item; must be both a column and an index label of
        ``cosine_sim_df``.
    cosine_sim_df : pandas.DataFrame
        Square item-by-item similarity matrix whose index and columns carry
        the same item labels.
    top_n : int, default 2
        Number of recommendations to return. The default reproduces the
        original behaviour of returning two items.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` similarity scores indexed by item label, highest
        first, with the query item itself excluded.

    Raises
    ------
    KeyError
        If ``item_name`` is not a label of ``cosine_sim_df``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    sim_score = cosine_sim_df[item_name].sort_values(ascending=False)
    # Drop the query item itself before taking the top entries, so it is
    # never returned as its own recommendation.
    recommended = sim_score.drop(item_name).head(top_n)

    return recommended


# Same spot check as for the co-occurrence recommender, now ranked by cosine
# similarity; note that `test` is rebound here and no longer holds the earlier result.
test = recommender2("DOOR MAT BLACK FLOCK", cosine_sim_df)
# The "\n " in the f-string puts a single leading space before the first
# line of the printed Series only; later lines are not indented.
print(f"recommended item is:\n {test}")

recommended item is:
 DOOR MAT ENGLISH ROSE      0.702425
RED SPOTTY COIR DOORMAT    0.698638
Name: DOOR MAT BLACK FLOCK, dtype: float64
