# 3. Recommender System

In [1]:
# We'll use the collaborative filtering method here.

# Collaborative filtering
- Find the degree of similarity between users, and use it to predict the quantity a user would buy of products they haven't bought yet.

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the raw transactions from the Excel workbook; the path is relative to
# the notebook's working directory.
original_data=pd.read_excel("Online_shopping.xlsx")

In [4]:
# Work on a copy so the frame loaded from disk stays untouched and later
# cleaning steps can be re-run without re-reading the workbook.
df=original_data.copy()
df.head()

Unnamed: 0,InvoiceNo,StockCode,lower,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,white hanging heart t-light holder,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,white metal lantern,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,cream cupid hearts coat hanger,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,knitted union flag hot water bottle,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,red woolly hottie white heart.,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [5]:
df.isnull().sum()

InvoiceNo           0
StockCode           0
lower          540093
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [6]:
# "lower" is almost entirely null (see the null counts above), so discard it.
df = df.drop(columns=["lower"])

In [7]:
# Discard every transaction row that still has at least one missing value.
df = df.dropna(axis="index", how="any")

In [8]:
df.shape

(406829, 8)

In [9]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [10]:
# We need a pivot table (customer x product matrix) to implement collaborative filtering

In [11]:
# One row per customer, one column per stock code; each cell holds the total
# quantity for that pair and is NaN where the pair never occurs.
custID_matrix = df.pivot_table(
    values="Quantity",
    index="CustomerID",
    columns="StockCode",
    aggfunc="sum",
)

In [12]:
custID_matrix.head()

StockCode,10002,10080,10120,10125,10133,10135,11001,15030,15034,15036,...,90214Y,90214Z,BANK CHARGES,C2,CRUK,D,DOT,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,,,,,,,,,,,...,,,,,,,,,,
12347.0,,,,,,,,,,,...,,,,,,,,,,
12348.0,,,,,,,,,,,...,,,,,,,,,,9.0
12349.0,,,,,,,,,,,...,,,,,,,,,,1.0
12350.0,,,,,,,,,,,...,,,,,,,,,,1.0


In [13]:
# We'll normalize each row so that each user's average rating is 0, which helps remove per-user bias

In [14]:
def normalize(row):
    """Centre ``row`` on zero by subtracting the row's own mean from each entry.

    Returns a new object of the same kind as ``row``; the input is not modified.
    """
    row_mean = row.mean()
    return row - row_mean


In [15]:
# Mean-centre every customer row in one vectorised step: mean(axis=1) gives the
# per-row mean (NaN cells are skipped) and sub(..., axis=0) aligns those means
# on the row index. This yields the same values as applying normalize() to each
# row, without a Python-level call per customer.
norm_df = custID_matrix.sub(custID_matrix.mean(axis=1), axis=0)

In [16]:
# We'll replace NaN values with 0.

In [17]:
# Cells with no recorded quantity become 0 so the matrix is fully numeric.
custID_matrix1 = norm_df.fillna(value=0)

In [18]:
# We can apply the cosine_similarity function to calculate the degree of similarity between users

In [19]:
# Pairwise cosine similarity between the customer rows of custID_matrix1; the
# result is a square array with one row and one column per customer.
similarity = cosine_similarity(custID_matrix1)

In [20]:
similarity.shape

(4372, 4372)

In [21]:
similarity

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  1.00000000e+00, -9.13385091e-02, ...,
         1.72756967e-02,  2.85099877e-02,  8.53810998e-02],
       [ 0.00000000e+00, -9.13385091e-02,  1.00000000e+00, ...,
         0.00000000e+00,  5.42046334e-02,  9.76917186e-03],
       ...,
       [ 0.00000000e+00,  1.72756967e-02,  0.00000000e+00, ...,
         1.00000000e+00,  9.86118006e-05,  0.00000000e+00],
       [ 0.00000000e+00,  2.85099877e-02,  5.42046334e-02, ...,
         9.86118006e-05,  1.00000000e+00, -3.36525954e-03],
       [ 0.00000000e+00,  8.53810998e-02,  9.76917186e-03, ...,
         0.00000000e+00, -3.36525954e-03,  1.00000000e+00]])

In [22]:
# Wrap the similarity array in a DataFrame. Rows and columns carry positional
# labels (0..n-1) at this point, not CustomerIDs.
final=pd.DataFrame(similarity)

In [23]:
final.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4362,4363,4364,4365,4366,4367,4368,4369,4370,4371
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,-0.091339,0.008536,-0.001821,0.025436,0.0,0.011149,0.574726,0.173346,...,0.0,0.0,0.394338,0.011502,-0.012847,-0.00235,0.0,0.017276,0.02851,0.085381
2,0.0,-0.091339,1.0,0.048655,0.22975,0.010098,0.0,-0.009706,0.048722,0.060961,...,0.0,0.0,0.029611,0.0,0.0,0.0,0.0,0.0,0.054205,0.009769
3,0.0,0.008536,0.048655,1.0,0.079928,0.111437,0.0,0.010867,0.021912,0.101352,...,0.0,0.0,0.0,0.051687,0.005834,0.0,0.0,0.020927,0.019202,-0.044551
4,0.0,-0.001821,0.22975,0.079928,1.0,0.01631,0.0,0.0,0.0,0.035553,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.00114,0.0


In [24]:
# Relabel the columns with CustomerIDs; the similarity array was computed from
# a matrix that shares custID_matrix's row order.
final.columns=custID_matrix.index

In [25]:
final.head()

CustomerID,12346.0,12347.0,12348.0,12349.0,12350.0,12352.0,12353.0,12354.0,12355.0,12356.0,...,18273.0,18274.0,18276.0,18277.0,18278.0,18280.0,18281.0,18282.0,18283.0,18287.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,-0.091339,0.008536,-0.001821,0.025436,0.0,0.011149,0.574726,0.173346,...,0.0,0.0,0.394338,0.011502,-0.012847,-0.00235,0.0,0.017276,0.02851,0.085381
2,0.0,-0.091339,1.0,0.048655,0.22975,0.010098,0.0,-0.009706,0.048722,0.060961,...,0.0,0.0,0.029611,0.0,0.0,0.0,0.0,0.0,0.054205,0.009769
3,0.0,0.008536,0.048655,1.0,0.079928,0.111437,0.0,0.010867,0.021912,0.101352,...,0.0,0.0,0.0,0.051687,0.005834,0.0,0.0,0.020927,0.019202,-0.044551
4,0.0,-0.001821,0.22975,0.079928,1.0,0.01631,0.0,0.0,0.0,0.035553,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.00114,0.0


In [26]:
# Relabel the rows with CustomerIDs as well, so final.loc[a, b] is the
# similarity between customers a and b.
final=final.set_index(custID_matrix.index)

In [27]:
final.head()

CustomerID,12346.0,12347.0,12348.0,12349.0,12350.0,12352.0,12353.0,12354.0,12355.0,12356.0,...,18273.0,18274.0,18276.0,18277.0,18278.0,18280.0,18281.0,18282.0,18283.0,18287.0
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12347.0,0.0,1.0,-0.091339,0.008536,-0.001821,0.025436,0.0,0.011149,0.574726,0.173346,...,0.0,0.0,0.394338,0.011502,-0.012847,-0.00235,0.0,0.017276,0.02851,0.085381
12348.0,0.0,-0.091339,1.0,0.048655,0.22975,0.010098,0.0,-0.009706,0.048722,0.060961,...,0.0,0.0,0.029611,0.0,0.0,0.0,0.0,0.0,0.054205,0.009769
12349.0,0.0,0.008536,0.048655,1.0,0.079928,0.111437,0.0,0.010867,0.021912,0.101352,...,0.0,0.0,0.0,0.051687,0.005834,0.0,0.0,0.020927,0.019202,-0.044551
12350.0,0.0,-0.001821,0.22975,0.079928,1.0,0.01631,0.0,0.0,0.0,0.035553,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.00114,0.0


In [28]:
final.loc[12348].sort_values(ascending=False)

CustomerID
12348.0    1.000000
16174.0    0.462112
14163.0    0.443668
12628.0    0.362972
17589.0    0.347290
             ...   
17306.0   -0.123998
12740.0   -0.134821
13027.0   -0.139666
12763.0   -0.148741
14413.0   -0.194928
Name: 12348.0, Length: 4372, dtype: float64

In [29]:
# Ten most similar customers to 12348. The customer is removed by label rather
# than by skipping the first sorted entry: a customer whose vector is all zeros
# has self-similarity 0 and would not sort first.
final.loc[12348.0].drop(12348.0).sort_values(ascending=False).head(10)

CustomerID
16174.0    0.462112
14163.0    0.443668
12628.0    0.362972
17589.0    0.347290
12442.0    0.321736
14778.0    0.317873
15579.0    0.313273
12738.0    0.310974
17788.0    0.307264
12547.0    0.300524
Name: 12348.0, dtype: float64

In [45]:
# Find the users most similar to a given user among those who have a non-zero
# entry for a given product.
def similar_users_for_product(user_index, interactions_matrix, similarity_matrix, product_id, l=5):
    """Return the ``l`` most similar users that have a non-zero entry for ``product_id``.

    Parameters
    ----------
    user_index : label
        Row label of the target user in ``similarity_matrix``.
    interactions_matrix : pandas.DataFrame
        Users as rows, products as columns. A value of 0 is treated as
        "no interaction".
    similarity_matrix : pandas.DataFrame
        Square user-by-user similarity frame labelled with the same user ids.
    product_id : label
        Column label of the product in ``interactions_matrix``.
    l : int, default 5
        Maximum number of neighbours to return.

    Returns
    -------
    (list, list)
        ``most_similar_users`` and ``similarity_score``, aligned element by
        element and ordered from most to least similar.

    Notes
    -----
    NOTE(review): when ``interactions_matrix`` is mean-centred and its missing
    cells are filled with 0 (as the caller in this notebook does), a real
    purchase equal to the user's mean cannot be told apart from "never bought".
    Confirm this is acceptable.
    """
    # Rank all *other* users by similarity, best first. The target user is
    # removed by label: skipping the first sorted entry is wrong when the
    # user's self-similarity is not the maximum (e.g. an all-zero vector gives
    # a self-similarity of 0).
    ranked = (
        similarity_matrix.loc[user_index]
        .drop(labels=user_index, errors="ignore")
        .sort_values(ascending=False)
    )

    # Keep only neighbours with a non-zero entry for the product, evaluated in
    # one vectorised lookup instead of one .loc call per user.
    product_values = interactions_matrix.loc[ranked.index, product_id]
    eligible = ranked[(product_values != 0).to_numpy()]

    # Already sorted by score, so truncating keeps the best `l` neighbours.
    top = eligible.iloc[: max(l, 0)]

    most_similar_users = top.index.tolist()
    similarity_score = top.tolist()
    return most_similar_users, similarity_score

In [46]:
# Rating predictions for the products a user has no (non-zero) entry for.
def ratings(user_id, interactions_matrix,similarity_matrix):
    """Predict a rating for every product whose entry for ``user_id`` is 0.

    Each prediction is the similarity-weighted average of the entries of the
    nearest users returned by ``similar_users_for_product``.

    Parameters
    ----------
    user_id : label
        Row label of the target user.
    interactions_matrix : pandas.DataFrame
        Users as rows, products as columns; 0 means "no interaction".
    similarity_matrix : pandas.DataFrame
        Square user-by-user similarity frame.

    Returns
    -------
    list of (float, product_id)
        One tuple per product that could be predicted. Products with no
        eligible neighbour, or whose neighbour weights sum to zero, are omitted
        because their weighted average is undefined (it would be NaN or inf).

    Notes
    -----
    NOTE(review): with a mean-centred matrix, an entry of exactly 0 can also be
    a real purchase equal to the user's mean, so such products are treated as
    unrated here. Verify against the caller.
    """
    # Look the user's row up once instead of once per product.
    user_row = interactions_matrix.loc[user_id]

    rat_pred = []
    for product_id in interactions_matrix:
        if user_row.loc[product_id] != 0:
            continue  # the user already has an entry for this product

        sim_users, sim_scores = similar_users_for_product(user_id,interactions_matrix,similarity_matrix, product_id)
        weight_total = np.sum(sim_scores)
        # No neighbours (0/0) or cancelling weights (x/0) would produce
        # NaN/inf; those values cannot be ranked, so leave the product out.
        if len(sim_users) == 0 or weight_total == 0 or not np.isfinite(weight_total):
            continue

        rat_list = interactions_matrix.loc[sim_users,product_id]
        # Weighted average of the neighbours' entries
        weighted_sum = np.sum(np.multiply(sim_scores,rat_list))
        rat_pred.append((weighted_sum/weight_total,product_id))

    return rat_pred
    

In [47]:
def recommend_product(user_id,interactions_matrix,similarity_matrix, n=10):
    """Return the ids of the ``n`` products with the highest predicted rating.

    Parameters
    ----------
    user_id : label
        Row label of the user to recommend for.
    interactions_matrix : pandas.DataFrame
        Users as rows, products as columns.
    similarity_matrix : pandas.DataFrame
        Square user-by-user similarity frame.
    n : int, default 10
        Number of products to return (the previous hard-coded value was 10).

    Returns
    -------
    list
        Product ids ordered from highest to lowest predicted rating.
    """
    ratings_pred = ratings(user_id,interactions_matrix,similarity_matrix)

    # NaN predictions compare as neither greater nor smaller than anything, so
    # they would leave the sort order arbitrary; drop them first.
    scored = [(score, product) for score, product in ratings_pred if not np.isnan(score)]

    # Sort on the score only. Sorting the whole tuple falls through to
    # comparing product ids on ties, and the ids mix int and str codes, which
    # raises TypeError in Python 3.
    scored.sort(key=lambda pair: pair[0], reverse=True)

    return [product for _, product in scored[:n]]
    

In [48]:
recommend_product(12348.0, custID_matrix1,final)

[16049, 16008, 16045, 21192, 21200, 21201, 22741, 22339, 23310, 23186]