In [1]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.impute import SimpleImputer
from sklearn.decomposition import TruncatedSVD
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
#This is the surprise library Randomized Search CV
from surprise.model_selection.search import RandomizedSearchCV 
from surprise import accuracy
from surprise import KNNBasic
from sklearn.ensemble import RandomForestClassifier
from surprise.model_selection import KFold
df=pd.read_csv('7817_1.csv')
df.head(3)

Unnamed: 0,id,asins,brand,categories,colors,dateAdded,dateUpdated,dimension,ean,keys,...,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.userCity,reviews.userProvince,reviews.username,sizes,upc,weight
0,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,5.0,https://www.amazon.com/Kindle-Paperwhite-High-...,I initially had trouble deciding between the p...,"Paperwhite voyage, no regrets!",,,Cristina M,,,205 grams
1,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,5.0,https://www.amazon.com/Kindle-Paperwhite-High-...,Allow me to preface this with a little history...,One Simply Could Not Ask For More,,,Ricky,,,205 grams
2,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,4.0,https://www.amazon.com/Kindle-Paperwhite-High-...,I am enjoying it so far. Great for reading. Ha...,Great for those that just want an e-reader,,,Tedd Gardiner,,,205 grams


In [2]:
print(df['reviews.rating'].value_counts())

reviews.rating
5.0    741
4.0    236
3.0    124
1.0     42
2.0     34
Name: count, dtype: int64


In [3]:
df.isnull().sum()

id                         0
asins                      0
brand                      0
categories                 0
colors                   823
dateAdded                  0
dateUpdated                0
dimension               1032
ean                      699
keys                       0
manufacturer             632
manufacturerNumber       695
name                       0
prices                     0
reviews.date             380
reviews.doRecommend     1058
reviews.numHelpful       697
reviews.rating           420
reviews.sourceURLs         0
reviews.text               0
reviews.title             17
reviews.userCity        1597
reviews.userProvince    1597
reviews.username          17
sizes                   1597
upc                      699
weight                   911
dtype: int64

In [4]:
df.shape

(1597, 27)

In [5]:
#using regex to remove all the special charecters
#lamba performs the respective operation on each row
#apply performs it on the entire column
df['reviews.text']=df['reviews.text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '',x))

In [6]:
#now we are coverting all the reviews into lowercase for easy analysis
# str.lower affects the whole column and is also faster when compared to lambda functions
df['reviews.text']=df['reviews.text'].str.lower()
df['reviews.text'].head(10)

0    i initially had trouble deciding between the p...
1    allow me to preface this with a little history...
2    i am enjoying it so far great for reading had ...
3    i bought one of the first paperwhites and have...
4    i have to say upfront  i dont like coroporate ...
5    my previous kindle was a dx this is my second ...
6    allow me to preface this with a little history...
7    just got mine right now looks the same as the ...
8    i initially had trouble deciding between the p...
9    i am enjoying it so far great for reading had ...
Name: reviews.text, dtype: object

In [7]:
#applying TDF-IF and storing the sparse matrix into a new variable
#by default the TDF-IF is applied on the entire column
#TF-IDF also expects whole sentences or documents as input
vectorizer=TfidfVectorizer()
df_tf_idf_matrix=vectorizer.fit_transform(df['reviews.text'])
print(df_tf_idf_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 143268 stored elements and shape (1597, 7480)>
  Coords	Values
  (0, 3345)	0.09563030671363099
  (0, 2905)	0.08345633171946584
  (0, 6805)	0.09121597170669409
  (0, 1584)	0.09694742471749919
  (0, 634)	0.07483601346287423
  (0, 6564)	0.27110941536841765
  (0, 4679)	0.2597084099818659
  (0, 262)	0.14029627804639896
  (0, 7113)	0.08773884031347202
  (0, 573)	0.04470917898454384
  (0, 5538)	0.0561724831717947
  (0, 4170)	0.029665877531807102
  (0, 4546)	0.030394206127855983
  (0, 3716)	0.058518385560671994
  (0, 5624)	0.07556747568457363
  (0, 5629)	0.04938580679067498
  (0, 6602)	0.05167038694076653
  (0, 3465)	0.045200521503484005
  (0, 2849)	0.03373904225595677
  (0, 868)	0.05498893646691517
  (0, 3212)	0.10166066588402228
  (0, 7444)	0.08844912060184575
  (0, 2966)	0.08087836213982427
  (0, 6140)	0.10168128802659403
  (0, 4158)	0.07046381725108337
  :	:
  (1596, 5041)	0.09067004121467519
  (1596, 1838)	0.13036172959882
  (1

In [8]:
#using cosine similarity to find the co-relation between words
cosine_sim_matrix=cosine_similarity(df_tf_idf_matrix)

In [9]:
cosine_sim_matrix

array([[1.        , 0.22917487, 0.17843075, ..., 0.10461446, 0.10477761,
        0.17136213],
       [0.22917487, 1.        , 0.17368846, ..., 0.04460697, 0.12893309,
        0.16984777],
       [0.17843075, 0.17368846, 1.        , ..., 0.02644085, 0.05143964,
        0.1072023 ],
       ...,
       [0.10461446, 0.04460697, 0.02644085, ..., 1.        , 0.11391033,
        0.12805379],
       [0.10477761, 0.12893309, 0.05143964, ..., 0.11391033, 1.        ,
        0.15334813],
       [0.17136213, 0.16984777, 0.1072023 , ..., 0.12805379, 0.15334813,
        1.        ]])

In [10]:
cosine_sim_matrix.shape

(1597, 1597)

In [11]:
# Dropping columns that are not relevant or contain missing values
columns_to_drop = [
    'colors', 'dimension', 'ean', 'manufacturer', 'manufacturerNumber',
    'sizes', 'upc', 'weight', 'reviews.date', 'reviews.doRecommend', 
    'reviews.numHelpful', 'reviews.rating', 'reviews.title', 'reviews.userCity', 
    'reviews.userProvince', 'reviews.username'
]

df_cleaned = df.drop(columns=columns_to_drop)


In [12]:
print(df_cleaned.columns)


Index(['id', 'asins', 'brand', 'categories', 'dateAdded', 'dateUpdated',
       'keys', 'name', 'prices', 'reviews.sourceURLs', 'reviews.text'],
      dtype='object')


In [13]:
print(df_cleaned.isnull().sum())

id                    0
asins                 0
brand                 0
categories            0
dateAdded             0
dateUpdated           0
keys                  0
name                  0
prices                0
reviews.sourceURLs    0
reviews.text          0
dtype: int64


In [14]:
#here we used the cosine similarity matrix rows as index without having an explicit index in
#original table. this is the reason product ID is able to map to the correct row

In [15]:
#function to get the top recommendations
def get_top_recommendations(product_id, cosine_sim_matrix, N=5):
    similarity_scores=cosine_sim_matrix[product_id]
    #sorting the scores based on descending order
    #remember to even keep track of the indices of the scores to get the similar products
    #np.argsort() returns the indices of the sorted array elements
    similar_indices=np.argsort(similarity_scores)[::-1]
    #removing the product itself
    #it creates a boolean array removing all the false (elements which do not satisfy the condition)
    similar_indices=similar_indices[similar_indices != product_id]
    #get the indices
    top_indices=similar_indices[:N]
    #return the top N recommendations and their similarity scores
    return [(index, similarity_scores[index]) for index in top_indices]


In [16]:
#calling the above function to give recommendations
a=int(input('Enter the product ID'))
n=5
model_recommendations=get_top_recommendations(a-1,cosine_sim_matrix,n)
#printing the recommendation
for idx,score in model_recommendations:
    print(f'Product : {idx} \t Similarity Score : {score}')

Product : 850 	 Similarity Score : 0.38982353576875034
Product : 786 	 Similarity Score : 0.38982353576875034
Product : 790 	 Similarity Score : 0.3852937737524592
Product : 851 	 Similarity Score : 0.3852937737524592
Product : 14 	 Similarity Score : 0.37552068677194


In [17]:
df.dtypes

id                       object
asins                    object
brand                    object
categories               object
colors                   object
dateAdded                object
dateUpdated              object
dimension                object
ean                     float64
keys                     object
manufacturer             object
manufacturerNumber       object
name                     object
prices                   object
reviews.date             object
reviews.doRecommend      object
reviews.numHelpful      float64
reviews.rating          float64
reviews.sourceURLs       object
reviews.text             object
reviews.title            object
reviews.userCity        float64
reviews.userProvince    float64
reviews.username         object
sizes                   float64
upc                     float64
weight                   object
dtype: object

In [18]:
#do not fill the missing values as it is critical for making recommendations
#because it could lead to distortion of user patterns
df['id'].value_counts()

id
AVpfpK8KLJeJML43BCuD    542
AVpfLiCSilAPnD_xWpk_    166
AVpge-anilAPnD_xtDVf    133
AVpjWh8e1cnluZ0-Vy0O     87
AVpe7LD5LJeJML43ybWA     70
                       ... 
AVphEeRyilAPnD_x035P      3
AV1T1nRwvKc47QAVgf3I      3
AVpg4dIu1cnluZ0-7fWM      3
AV1T19jBvKc47QAVgf3S      3
AV1T0_ti-jtxr-f31CoV      3
Name: count, Length: 66, dtype: int64

In [19]:
#here we are removing the null value rows which do not have a username
df.dropna(axis=0, subset=['reviews.username'], how='any', inplace=True)


In [20]:
#checking if null values are removed
df['reviews.username'].isnull().sum()

0

In [21]:
#here factorize function assigns a unique integer to each user 
#multiple rows assigned same id as the same user has done multiple reviews
#here we are assigning one unique integer for each customer
#here we are indexing from 1, so it will be User 1,2,3,4,....
df['user_id']=df['reviews.username'].factorize()[0]+1
print(df[['reviews.username','user_id']].head())


     reviews.username  user_id
0          Cristina M        1
1               Ricky        2
2       Tedd Gardiner        3
3              Dougal        4
4  Miljan David Tanic        5


In [22]:
#same we are doing same for the products
df['product_id']=df['asins'].factorize()[0]+1
print(df[['product_id','user_id']].head(5))


   product_id  user_id
0           1        1
1           1        2
2           1        3
3           1        4
4           1        5


In [23]:
df['user_id'].value_counts()

user_id
921    59
38     58
923    43
924    30
335    29
       ..
370     1
371     1
372     1
373     1
949     1
Name: count, Length: 949, dtype: int64

In [24]:
#number of reviews each 
df['product_id'].value_counts()

product_id
48    542
27    166
52    133
34     87
4      56
47     53
6      43
40     41
3      38
2      32
28     27
1      23
51     22
18     22
29     20
7      18
30     18
53     17
5      15
32     13
43     13
54     12
41     12
49     12
35     12
45      9
16      8
15      7
50      7
17      7
26      6
38      6
9       6
24      6
39      5
33      5
8       4
12      4
23      4
14      4
46      4
37      4
42      4
13      3
44      3
36      3
31      3
11      3
25      3
22      3
21      3
20      3
19      3
10      3
Name: count, dtype: int64

In [25]:
df

Unnamed: 0,id,asins,brand,categories,colors,dateAdded,dateUpdated,dimension,ean,keys,...,reviews.text,reviews.title,reviews.userCity,reviews.userProvince,reviews.username,sizes,upc,weight,user_id,product_id
0,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,i initially had trouble deciding between the p...,"Paperwhite voyage, no regrets!",,,Cristina M,,,205 grams,1,1
1,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,allow me to preface this with a little history...,One Simply Could Not Ask For More,,,Ricky,,,205 grams,2,1
2,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,i am enjoying it so far great for reading had ...,Great for those that just want an e-reader,,,Tedd Gardiner,,,205 grams,3,1
3,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,i bought one of the first paperwhites and have...,Love / Hate relationship,,,Dougal,,,205 grams,4,1
4,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,i have to say upfront i dont like coroporate ...,I LOVE IT,,,Miljan David Tanic,,,205 grams,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1592,AVpfo9ukilAPnD_xfhuj,B00NO8JJZW,Amazon,"Amazon Devices & Accessories,Amazon Device Acc...",,2016-04-02T14:40:43Z,2017-08-13T08:28:46Z,,,alexavoiceremoteforamazonfiretvfiretvstick/b00...,...,this is not the same remote that i got for my ...,I would be disappointed with myself if i produ...,,,GregAmandawith4,,,4 ounces,947,54
1593,AVpfo9ukilAPnD_xfhuj,B00NO8JJZW,Amazon,"Amazon Devices & Accessories,Amazon Device Acc...",,2016-04-02T14:40:43Z,2017-08-13T08:28:46Z,,,alexavoiceremoteforamazonfiretvfiretvstick/b00...,...,i have had to change the batteries in this rem...,Battery draining remote!!!!,,,Amazon Customer,,,4 ounces,55,54
1594,AVpfo9ukilAPnD_xfhuj,B00NO8JJZW,Amazon,"Amazon Devices & Accessories,Amazon Device Acc...",,2016-04-02T14:40:43Z,2017-08-13T08:28:46Z,,,alexavoiceremoteforamazonfiretvfiretvstick/b00...,...,remote did not activate nor did it connect to ...,replacing an even worse remote. Waste of time,,,Amazon Customer,,,4 ounces,55,54
1595,AVpfo9ukilAPnD_xfhuj,B00NO8JJZW,Amazon,"Amazon Devices & Accessories,Amazon Device Acc...",,2016-04-02T14:40:43Z,2017-08-13T08:28:46Z,,,alexavoiceremoteforamazonfiretvfiretvstick/b00...,...,it does the job but is super over priced i fee...,Overpriced,,,Meg Ashley,,,4 ounces,948,54


In [26]:
# Finding the total number of unique products
total_products_b = df['product_id'].nunique()
print(f"Total number of unique products: {total_products_b}")

Total number of unique products: 54


In [27]:
#filtering the products with reviews less than 5 to prevent distortation
min_review=5
#this series contains number of reviews for each product
product_count=df['product_id'].value_counts()
#then it is compared to see it has minimum review threshold and it is stored
filtered_product=product_count[product_count >= min_review].index
#the products which have the review threshold are now compared to the dataframe
#product id's 
df_filtered=df[df['product_id'].isin(filtered_product)]

In [28]:
#above we are removing the  products which have total min reviews as 5

In [29]:
#now we are filtering the users who interacted with min 2 products
#thus further refining our user item matrix
#this gives the number of products user has interacted with
user_counts_1=df_filtered['user_id'].value_counts()
#filter out the user_id which do not satisfy the condition 
#the ones which do we store their indices 
users_to_keep_1=user_counts_1[user_counts_1>=2].index
#compare with all the user_id and remove the ones which do not satisfy the condition
df_filtered_updated=df_filtered[df_filtered['user_id'].isin(users_to_keep_1)]

In [30]:
print(df_filtered_updated.head())
print(df_filtered_updated.shape)

                     id       asins   brand                  categories  \
0  AVpe7AsMilAPnD_xQ78G  B00QJDU3KY  Amazon  Amazon Devices,mazon.co.uk   
1  AVpe7AsMilAPnD_xQ78G  B00QJDU3KY  Amazon  Amazon Devices,mazon.co.uk   
2  AVpe7AsMilAPnD_xQ78G  B00QJDU3KY  Amazon  Amazon Devices,mazon.co.uk   
6  AVpe7AsMilAPnD_xQ78G  B00QJDU3KY  Amazon  Amazon Devices,mazon.co.uk   
8  AVpe7AsMilAPnD_xQ78G  B00QJDU3KY  Amazon  Amazon Devices,mazon.co.uk   

  colors             dateAdded           dateUpdated  \
0    NaN  2016-03-08T20:21:53Z  2017-07-18T23:52:58Z   
1    NaN  2016-03-08T20:21:53Z  2017-07-18T23:52:58Z   
2    NaN  2016-03-08T20:21:53Z  2017-07-18T23:52:58Z   
6    NaN  2016-03-08T20:21:53Z  2017-07-18T23:52:58Z   
8    NaN  2016-03-08T20:21:53Z  2017-07-18T23:52:58Z   

                  dimension  ean                         keys  ...  \
0  169 mm x 117 mm x 9.1 mm  NaN  kindlepaperwhite/b00qjdu3ky  ...   
1  169 mm x 117 mm x 9.1 mm  NaN  kindlepaperwhite/b00qjdu3ky  ...   
2 

In [31]:
# Finding the total number of unique products after filtering
total_products_after = df_filtered_updated['product_id'].nunique()
print(f"Total number of unique products after filtering: {total_products_after}")

Total number of unique products after filtering: 35


In [32]:
#user item matrix
#it is a pandas dataframe
user_item_matrix=df_filtered_updated.pivot_table(index='user_id',columns='product_id',values='reviews.rating')
user_item_matrix.columns

Index([ 1,  2,  3,  4,  5,  6,  7,  9, 15, 16, 17, 18, 24, 26, 27, 28, 29, 30,
       32, 33, 34, 35, 38, 39, 40, 41, 43, 45, 47, 48, 49, 51, 52, 53, 54],
      dtype='int64', name='product_id')

In [33]:
print(user_item_matrix)

product_id   1    2   3   4   5   6   7   9   15  16  ...  41  43  45  47  48  \
user_id                                               ...                       
1           5.0  NaN NaN NaN NaN NaN NaN NaN NaN NaN  ... NaN NaN NaN NaN NaN   
2           5.0  NaN NaN NaN NaN NaN NaN NaN NaN NaN  ... NaN NaN NaN NaN NaN   
3           4.0  NaN NaN NaN NaN NaN NaN NaN NaN NaN  ... NaN NaN NaN NaN NaN   
20          NaN  3.0 NaN NaN NaN NaN NaN NaN NaN NaN  ... NaN NaN NaN NaN NaN   
21          NaN  4.0 NaN NaN NaN NaN NaN NaN NaN NaN  ... NaN NaN NaN NaN NaN   
...         ...  ...  ..  ..  ..  ..  ..  ..  ..  ..  ...  ..  ..  ..  ..  ..   
916         NaN  NaN NaN NaN NaN NaN NaN NaN NaN NaN  ... NaN NaN NaN NaN NaN   
917         NaN  NaN NaN NaN NaN NaN NaN NaN NaN NaN  ... NaN NaN NaN NaN NaN   
921         NaN  NaN NaN NaN NaN NaN NaN NaN NaN NaN  ... NaN NaN NaN NaN NaN   
923         NaN  NaN NaN NaN NaN NaN NaN NaN NaN NaN  ... NaN NaN NaN NaN NaN   
924         NaN  NaN NaN NaN

In [34]:
#SVD model will train and test on the ratings that are given
#removing rows where ratings are not given
df_given_ratings=df.dropna(subset=['reviews.rating'])


In [35]:
#SVD as cosine similarity cannot take NaN values
#Fill NaN values with 0 approach should be avoided
#as users who did not rate will be treated as dislike(0) which may skew data
reader=Reader(rating_scale=(1,5))
data=Dataset.load_from_df(df_given_ratings[['user_id','product_id','reviews.rating']],reader)
#splitting the data into training and testing sets
train_set,test_set=train_test_split(data,test_size=0.2)


In [36]:
#applying the SVD
svd=SVD()
svd.fit(train_set)
#making predictions on the test set
prediction_svd=svd.test(test_set)
#measuring the accuracy of predictiions
accuracy.rmse(prediction_svd)



RMSE: 0.9029


0.902863938227926

In [37]:
#now we are using another method KNNBasic to predict the missing values
sim_options={
    'name': 'cosine', 'user_based': False
}

In [38]:
#randomized search will take care of parameters
knnb=KNNBasic(sim_options=sim_options) 

In [39]:
#parameter grid for hypertuning
param_grid={
    'k':[10,20,30,40],
    'k_min':[1,2,3,4]
}

In [40]:
#the algo uses KFolds internally for consistent splits
#applying the 'randomized search CV ' of the surprise library here
random_state=RandomizedSearchCV(algo_class=knnb,param_distributions=param_grid,n_iter=10,measures=['rmse','mae'],cv =3,refit=True,n_jobs=-1,random_state=0)

In [41]:
#training the model
knnb.fit(train_set)
#making predictions
prediction_knnb=knnb.test(test_set)
#measuring the accuracy
accuracy.rmse(prediction_knnb)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9247


0.9247211617852136

In [42]:
user_item_matrix_knnb=user_item_matrix.copy()
#now we will fill the missing values using KNNBasic
for user_id in user_item_matrix_knnb.index:
    for product_id in user_item_matrix_knnb.columns:
        if pd.isna(user_item_matrix_knnb.loc[user_id,product_id]):
            answer_knnb=knnb.predict(user_id,product_id)
            user_item_matrix_knnb.loc[user_id,product_id]=answer_knnb.est


In [43]:
#now we will fill the missing values using the trained SVD
for user_id in user_item_matrix.index:
    for product_id in user_item_matrix.columns:
        if pd.isna(user_item_matrix.loc[user_id,product_id]):
            answer=svd.predict(user_id,product_id)
            user_item_matrix.loc[user_id,product_id]=answer.est
    

In [44]:
print(user_item_matrix.head())

product_id        1         2         3         4         5         6   \
user_id                                                                  
1           5.000000  4.240146  4.377272  4.576921  4.600434  3.784259   
2           5.000000  4.354822  4.595043  4.608260  4.703783  3.958073   
3           4.000000  4.287041  4.326878  4.392772  4.329375  3.758298   
20          4.453754  3.000000  4.392064  4.416130  4.254766  3.743777   
21          4.509754  4.000000  4.250743  4.326595  4.463872  4.069127   

product_id        7         9         15        16  ...        41        43  \
user_id                                             ...                       
1           4.055346  4.491800  4.301547  4.516625  ...  4.325812  4.630516   
2           4.045811  4.646854  4.343316  4.468310  ...  4.310567  4.603111   
3           3.916991  4.203263  4.438226  4.410634  ...  4.313396  4.573323   
20          3.995586  4.017425  4.059350  4.300894  ...  4.052725  4.443172   
21     

In [45]:
print(user_item_matrix_knnb)

product_id        1         2         3         4         5         6   \
user_id                                                                  
1           5.000000  4.364506  4.364506  4.364506  4.364506  4.364506   
2           5.000000  4.364506  4.364506  4.364506  4.364506  4.364506   
3           4.000000  4.364506  4.364506  4.364506  4.364506  4.364506   
20          4.364506  3.000000  4.364506  4.364506  4.364506  4.364506   
21          4.364506  4.000000  4.364506  4.364506  4.364506  4.364506   
...              ...       ...       ...       ...       ...       ...   
916         4.364506  4.364506  4.364506  4.364506  4.364506  4.364506   
917         4.364506  4.364506  4.364506  4.364506  4.364506  4.364506   
921         4.364506  4.364506  4.364506  4.364506  4.364506  4.364506   
923         4.364506  4.364506  4.364506  4.364506  4.364506  4.364506   
924         4.364506  4.364506  4.364506  4.364506  4.364506  4.364506   

product_id        7         9        

In [46]:
#Created based on SVD
#now creating the cosine similarity matrix for collaborative filtering
#it is item based as we are recommending products or items 
cosine_item_similarity_svd = cosine_similarity(user_item_matrix.T)
print(cosine_item_similarity_svd)

[[1.         0.99868023 0.99928328 ... 0.9988438  0.99655097 0.99710477]
 [0.99868023 1.         0.99880949 ... 0.99875048 0.99602055 0.9971404 ]
 [0.99928328 0.99880949 1.         ... 0.99876014 0.99716287 0.99772975]
 ...
 [0.9988438  0.99875048 0.99876014 ... 1.         0.99626122 0.99724069]
 [0.99655097 0.99602055 0.99716287 ... 0.99626122 1.         0.9983063 ]
 [0.99710477 0.9971404  0.99772975 ... 0.99724069 0.9983063  1.        ]]


In [47]:
cosine_item_similarity_knnb=cosine_similarity(user_item_matrix_knnb.T)
print(cosine_item_similarity_knnb)

[[1.         0.99902681 0.99513541 ... 0.99894075 0.99570636 0.99633168]
 [0.99902681 1.         0.99466889 ... 0.99850647 0.99528063 0.99591007]
 [0.99513541 0.99466889 1.         ... 0.99460679 0.99139913 0.99202953]
 ...
 [0.99894075 0.99850647 0.99460679 ... 1.         0.99519792 0.9958251 ]
 [0.99570636 0.99528063 0.99139913 ... 0.99519792 1.         0.9999676 ]
 [0.99633168 0.99591007 0.99202953 ... 0.9958251  0.9999676  1.        ]]


In [48]:
print(type(cosine_item_similarity_svd))
df_filtered.duplicated().sum()

<class 'numpy.ndarray'>


0

In [49]:
def get_collab_recommendation(product_id, cosine_item_similarity,user_item_matrix,top_r):
    recommendations={}
    #checking if the product id exists or not5
    if product_id not in user_item_matrix.columns:
        recommendations['1']=('Product {product_id} not found')
    else:
        #find out the product index 
        #we are finding 5this as numpy array can only be accessed by index positions
        product_index=user_item_matrix.columns.get_loc(product_id)
        #getting the similarity scores 
        product_similarity=cosine_item_similarity[product_index]
        #sorting in descending order
        product_similarity_sorted=np.argsort(product_similarity)[::-1]
        #removing the product itself and keeping the recommended products
        product_similarity_sorted=product_similarity_sorted[1:top_r+1]
        #map the indices with the product id
    for i in product_similarity_sorted:
        product_id_similar=user_item_matrix.columns[i]
        similarity_score=product_similarity[i]
        recommendations[product_id_similar]=similarity_score
    return recommendations

In [50]:
#we are putting the product id label directly here
#if label not correct , it will give error
top_r=5
b=int(input('Enter the Product ID for SVD approach'))
recommedations=get_collab_recommendation(b,cosine_item_similarity_svd,user_item_matrix,top_r)
for k,v in recommedations.items():
    print(f'{k} : {v}')

24 : 0.9995552376604376
26 : 0.9994499934046776
3 : 0.9994247220910829
34 : 0.9994089703709677
29 : 0.9994076921503902


In [51]:
top_r=5
c=int(input('Enter the product ID for KNN Basic approach'))
recommedations_knnb=get_collab_recommendation(c,cosine_item_similarity_knnb,user_item_matrix_knnb,top_r)
for k,v in recommedations_knnb.items():
    print(f'{k} : {v}')

26 : 0.9996900975222236
24 : 0.9996055842964732
17 : 0.9996055842964732
35 : 0.9996055842964731
29 : 0.999567346968912


In [52]:
#here what we are basically doing is if a user has given rating on some
#product we recommend similar products to him based on the similarity of 
#the other products which is found by the cosine similarity matrix.
#missing ratings are filled by svd and the input is the product id label
#to my future self to not forget 😅 

In [53]:
df_given_ratings.shape

(1177, 29)

In [54]:
import pickle

# Assuming cosine_item_similarity_svd and user_item_matrix_svd are your matrices

# Export cosine similarity matrix
with open('cosine_item_similarity_svd.pkl', 'wb') as file:
    pickle.dump(cosine_item_similarity_svd, file)

# Export user-item matrix
with open('user_item_matrix_svd.pkl', 'wb') as file:
    pickle.dump(user_item_matrix, file)


print('This is the end')

This is the end
