In [1]:
# Getting Dependencies

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Reading the files for the recommendation system

file_a = pd.read_csv('Groceries_dataset.csv')
#file_b = pd.read_csv('groceries.csv', error_bad_lines=False)
file_c = pd.read_csv('ratings.csv')
file_d = pd.read_csv('Movies (1).csv')

# The collaborative filtering method

Collaborative filtering is based on the notion of similarity (or distance). For example, if two users A and B have purchased the same products and have rated them similarly on common rating scale, then A and B can be considered in similar in nature and their buying behaviour. Hence, if A buys a new product and rate high, then that product can be recommended to B and vice-versa.
Collborative filtering comes in two variations:

(1) User-Based Similarity
(2) Item-Based Similarity

# 1. User-based similarity

In [3]:
file_c.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
file_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [5]:
file_c = file_c.drop('timestamp', axis=1)
len(file_c['userId'].unique()), len(file_c['movieId'].unique())

(610, 9724)

In [9]:
# Create a pivot dataframe with index as userid and column as movieid
um_df = file_c.pivot(index='userId',
                    columns='movieId',
                    values='rating').reset_index(drop=True)
um_df.index = file_c.userId.unique()
um_df.iloc[:5,:15]

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
1,4.0,,4.0,,,4.0,,,,,,,,,
2,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,
5,4.0,,,,,,,,,,,,,,


In [10]:
# use fillna method to convert NaN to zeros
um_df.fillna(0, inplace=True)
um_df.iloc[:5,:15]

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# calculating cosine similarity between users
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

user_sim = 1 - pairwise_distances(um_df.values, 
                                  metric = 'cosine')
# store the results in a dataframe
user_sim_df = pd.DataFrame(user_sim)
# set the index and columns of the dataframe
user_sim_df.index = file_c.userId.unique()
user_sim_df.columns = file_c.userId.unique()
user_sim_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
1,1.0,0.027283,0.05972,0.194395,0.12908,0.128152,0.158744,0.136968,0.064263,0.016875,...,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
2,0.027283,1.0,0.0,0.003726,0.016614,0.025333,0.027585,0.027257,0.0,0.067445,...,0.202671,0.016866,0.011997,0.0,0.0,0.028429,0.012948,0.046211,0.027565,0.102427
3,0.05972,0.0,1.0,0.002251,0.00502,0.003936,0.0,0.004941,0.0,0.0,...,0.005048,0.004892,0.024992,0.0,0.010694,0.012993,0.019247,0.021128,0.0,0.032119
4,0.194395,0.003726,0.002251,1.0,0.128659,0.088491,0.11512,0.062969,0.011361,0.031163,...,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
5,0.12908,0.016614,0.00502,0.128659,1.0,0.300349,0.108342,0.429075,0.0,0.030611,...,0.068048,0.418747,0.110148,0.258773,0.148758,0.106435,0.152866,0.135535,0.261232,0.060792


In [12]:
# remove the diagonal values of similarity with itself
np.fill_diagonal(user_sim, 0)
user_sim_df.loc[:5, :10]

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
1,0.0,0.027283,0.05972,0.194395,0.12908,0.128152,0.158744,0.136968,0.064263,0.016875
2,0.027283,0.0,0.0,0.003726,0.016614,0.025333,0.027585,0.027257,0.0,0.067445
3,0.05972,0.0,0.0,0.002251,0.00502,0.003936,0.0,0.004941,0.0,0.0
4,0.194395,0.003726,0.002251,0.0,0.128659,0.088491,0.11512,0.062969,0.011361,0.031163
5,0.12908,0.016614,0.00502,0.128659,0.0,0.300349,0.108342,0.429075,0.0,0.030611


In [13]:
# Filtering similar users of first 5 user id
user_sim_df.idxmax(axis=1)[:5]

1    266
2    366
3    313
4    391
5    470
dtype: int64

For user id 1 most similar user is user id no. 266 and so no..

In [14]:
file_d.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [16]:
# find similar movies users are watching
def get_user_similar_movies( user1, user2):
    # innner join between the movies they watched
    common_movies = file_c[file_c.userId == user1].merge(file_c[file_c.userId == user2],
                                                  on = 'movieId',
                                                  how= 'inner')
    # return the common movies with movies they watched from movies dataframe
    return common_movies.merge(file_d, on = 'movieId')

In [17]:
# example of merge method of pandas
file_c[file_c.userId == 2].merge(file_c[file_c.userId == 366], on = 'movieId', how='inner')

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y
0,2,3578,4.0,366,4.5
1,2,6874,4.0,366,4.0
2,2,48516,4.0,366,4.5
3,2,58559,4.5,366,4.0
4,2,68157,4.5,366,4.5
5,2,79132,4.0,366,4.0
6,2,91529,3.5,366,4.0
7,2,109487,3.0,366,5.0
8,2,122882,5.0,366,2.0


In [18]:
# find the common movies of user id 2 and 336
common_movies = get_user_similar_movies(2, 366)
common_movies

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y,title,genres
0,2,3578,4.0,366,4.5,Gladiator (2000),Action|Adventure|Drama
1,2,6874,4.0,366,4.0,Kill Bill: Vol. 1 (2003),Action|Crime|Thriller
2,2,48516,4.0,366,4.5,"Departed, The (2006)",Crime|Drama|Thriller
3,2,58559,4.5,366,4.0,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
4,2,68157,4.5,366,4.5,Inglourious Basterds (2009),Action|Drama|War
5,2,79132,4.0,366,4.0,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
6,2,91529,3.5,366,4.0,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX
7,2,109487,3.0,366,5.0,Interstellar (2014),Sci-Fi|IMAX
8,2,122882,5.0,366,2.0,Mad Max: Fury Road (2015),Action|Adventure|Sci-Fi|Thriller


In [19]:
# find most highly commonaly rated movies 
common_movies[(common_movies.rating_x >= 4.0) & (common_movies.rating_y >= 4.0)]

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y,title,genres
0,2,3578,4.0,366,4.5,Gladiator (2000),Action|Adventure|Drama
1,2,6874,4.0,366,4.0,Kill Bill: Vol. 1 (2003),Action|Crime|Thriller
2,2,48516,4.0,366,4.5,"Departed, The (2006)",Crime|Drama|Thriller
3,2,58559,4.5,366,4.0,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
4,2,68157,4.5,366,4.5,Inglourious Basterds (2009),Action|Drama|War
5,2,79132,4.0,366,4.0,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX


# Challanges with user-based similarity

It is only used after users watch few movies and rate them. for new users it is not suitable to recommend movies with user-based approch

# 2. Item-based similarity

In [20]:
# pivot based on movie rating
rating_mat = file_c.pivot(index= 'movieId',
                            columns = 'userId',
                            values = 'rating').reset_index(drop = True)
#rating_mat.index = movies_df.movieId
rating_mat.loc[:5, :15]

userId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,4.0,,,,4.0,,4.5,,,,,,,,2.5
1,,,,,,4.0,,4.0,,,,,,,
2,4.0,,,,,5.0,,,,,,,,,
3,,,,,,3.0,,,,,,,,3.0,
4,,,,,,5.0,,,,,,,,,
5,4.0,,,,,4.0,,,,,5.0,,,,


In [21]:
rating_mat.fillna(0, inplace=True)
# find the correlation between the movies
movie_sim = 1 - pairwise_distances(rating_mat.values,
                                  metric='correlation')
movie_sim_df = pd.DataFrame(movie_sim)
movie_sim_df.loc[:5, :15]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1.0,0.231327,0.173213,-0.028917,0.192474,0.192686,0.143743,0.085477,0.177245,0.183382,0.172799,0.159352,0.106217,0.099645,0.031566,0.111011
1,0.231327,1.0,0.191945,0.071269,0.200526,0.158341,0.127569,0.14154,-0.021045,0.285086,0.21709,0.11529,0.163556,0.033185,0.191785,0.108676
2,0.173213,0.191945,1.0,0.067143,0.370171,0.196442,0.351513,0.296897,0.275812,0.136916,0.174251,0.168038,0.118157,0.136819,0.111644,0.216929
3,-0.028917,0.071269,0.067143,1.0,0.16791,0.053755,0.258075,0.148726,-0.016025,0.056,0.128247,-0.016306,0.142266,0.095113,0.145606,0.082152
4,0.192474,0.200526,0.370171,0.16791,1.0,0.215503,0.42989,0.265777,0.308085,0.110833,0.201002,0.17363,0.089913,0.220718,0.07017,0.108118
5,0.192686,0.158341,0.196442,0.053755,0.215503,1.0,0.148109,0.114707,0.167909,0.251343,0.182082,0.115893,-0.013484,0.24288,0.091079,0.408483


In [22]:
np.fill_diagonal(movie_sim, 0)
movie_sim_df.loc[:5,:15]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,0.231327,0.173213,-0.028917,0.192474,0.192686,0.143743,0.085477,0.177245,0.183382,0.172799,0.159352,0.106217,0.099645,0.031566,0.111011
1,0.231327,0.0,0.191945,0.071269,0.200526,0.158341,0.127569,0.14154,-0.021045,0.285086,0.21709,0.11529,0.163556,0.033185,0.191785,0.108676
2,0.173213,0.191945,0.0,0.067143,0.370171,0.196442,0.351513,0.296897,0.275812,0.136916,0.174251,0.168038,0.118157,0.136819,0.111644,0.216929
3,-0.028917,0.071269,0.067143,0.0,0.16791,0.053755,0.258075,0.148726,-0.016025,0.056,0.128247,-0.016306,0.142266,0.095113,0.145606,0.082152
4,0.192474,0.200526,0.370171,0.16791,0.0,0.215503,0.42989,0.265777,0.308085,0.110833,0.201002,0.17363,0.089913,0.220718,0.07017,0.108118
5,0.192686,0.158341,0.196442,0.053755,0.215503,0.0,0.148109,0.114707,0.167909,0.251343,0.182082,0.115893,-0.013484,0.24288,0.091079,0.408483


In [31]:
#  finding most similar movies
def get_similar_movies( movieid, topN):
    # get the index of the movie record in movies_df
    movieidx = file_d[file_d.movieId == movieid].index[0]
    file_d['similarity'] = movie_sim_df.iloc[movieidx]
    top_n = file_d.sort_values(['similarity'], ascending=False)[0:topN]
    return top_n

In [32]:
# recommendation of 5 movies based on similarity for movie id on 858
get_similar_movies(858, 5)

Unnamed: 0,movieId,title,genres,similarity
921,1220,"Blues Brothers, The (1980)",Action|Comedy|Musical,0.76939
913,1212,"Third Man, The (1949)",Film-Noir|Mystery|Thriller,0.560246
895,1192,Paris Is Burning (1990),Documentary,0.496048
827,1088,Dirty Dancing (1987),Drama|Musical|Romance,0.442128
520,608,Fargo (1996),Comedy|Crime|Drama|Thriller,0.418042
