In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Importing the dataset
df = pd.read_csv('./data/retail_clean.csv')
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Date,Time,Hour,DayOfWeek,Month,Value
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,2010-12-01,08:26:00,8,Wednesday,12,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010-12-01,08:26:00,8,Wednesday,12,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,2010-12-01,08:26:00,8,Wednesday,12,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010-12-01,08:26:00,8,Wednesday,12,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010-12-01,08:26:00,8,Wednesday,12,20.34


In [3]:
# Checking the shape of the dataset
df.shape

(528140, 14)

In [4]:
# Checking the info of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528140 entries, 0 to 528139
Data columns (total 14 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    528140 non-null  int64  
 1   StockCode    528140 non-null  object 
 2   Description  528140 non-null  object 
 3   Quantity     528140 non-null  int64  
 4   InvoiceDate  528140 non-null  object 
 5   UnitPrice    528140 non-null  float64
 6   CustomerID   396370 non-null  float64
 7   Country      528140 non-null  object 
 8   Date         528140 non-null  object 
 9   Time         528140 non-null  object 
 10  Hour         528140 non-null  int64  
 11  DayOfWeek    528140 non-null  object 
 12  Month        528140 non-null  int64  
 13  Value        528140 non-null  float64
dtypes: float64(3), int64(4), object(7)
memory usage: 56.4+ MB


# Create the Rating Matrix

In [5]:
# Create a unique identifier by combining 'InvoiceNo' and 'Description'
df['InNo_Desc'] = df['InvoiceNo'].astype('str') + '_' + df['Description']

# Filter out duplicates based on the unique identifier and drop the identifier column
df = df.drop_duplicates(subset=['InNo_Desc']).drop('InNo_Desc', axis=1)

len(df)

517455

In [12]:
# Create the Rating Matrix
from scipy.sparse import csr_matrix

# Create a pivot table with 'CustomerID' as the index and 'Description' as the columns
# Fill the missing values with 0
# Convert the pivot table into a sparse matrix

df_pivot = df.pivot(index='InvoiceNo', columns='Description', values='Quantity').fillna(0)
df_pivot_sparse = csr_matrix(df_pivot.values)

df_pivot_sparse

<19793x4008 sparse matrix of type '<class 'numpy.float64'>'
	with 517455 stored elements in Compressed Sparse Row format>

In [8]:
df_pivot.shape

(19793, 4008)

In [9]:
ratings_matrix = df[['InvoiceNo', 'Description']]

# Add a column of 1s
ratings_matrix['value'] = 1

# Spread into user-item format
ratings_matrix = ratings_matrix.pivot(index='InvoiceNo', columns='Description', values='value').fillna(0)

# Convert to matrix
ratings_matrix = csr_matrix(ratings_matrix.values)

ratings_matrix

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_matrix['value'] = 1


<19793x4008 sparse matrix of type '<class 'numpy.float64'>'
	with 517455 stored elements in Compressed Sparse Row format>

In [10]:
ratings_matrix.shape

(19793, 4008)

# Evaluation Scheme and Model Validation

In [38]:
# Import the required functions
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt


In [39]:
# Create training and test sets
train, test = train_test_split(ratings_matrix, test_size=0.2, random_state=42)

In [40]:
# Check the shapes of the resultant DataFrames
train.shape, test.shape

((15834, 4008), (3959, 4008))

In [41]:
# Import the required functions
from sklearn.metrics.pairwise import cosine_similarity

In [42]:
# Create the item-item similarity matrix
item_similarity = cosine_similarity(X=train.T, dense_output=False)
item_similarity

<4008x4008 sparse matrix of type '<class 'numpy.float64'>'
	with 7110855 stored elements in Compressed Sparse Row format>

In [43]:
# Check the shape of the item-item similarity matrix
item_similarity.shape

(4008, 4008)

In [44]:
# Create the user-user similarity matrix
user_similarity = cosine_similarity(X=train, dense_output=False)
user_similarity

<15834x15834 sparse matrix of type '<class 'numpy.float64'>'
	with 58666366 stored elements in Compressed Sparse Row format>

In [45]:
# Check the shape of the user-user similarity matrix
user_similarity.shape

(15834, 15834)

# Estimate the Models

In [46]:
# Import the required functions
from scipy.sparse.linalg import svds

In [47]:
# Create the ratings matrix
ratings_matrix = train.copy()

In [None]:
# Mean of the ratings
