# USER BASED RECOMMENDER SYSTEM

Steps in a user-based recommendation system:

1. Select a user with the movies the user has watched
2. Based on his rating to movies, find the top x neighbours
3. Get the watched movie record of the user for each neighbour.
4. Calculate a similarity score using some formula
5. Recommend the items with the highest score

In [1]:
import pandas as pd
import numpy as np
from math import sqrt

# Movie Lens - Movie Recommendation Data Sets


In [2]:
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')
print(movies_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None


In [3]:
userInput = [{'title':'City of Lost Children, The (Cité des enfants perdus, La) (1995)', 'rating':5},
             {'title':'Indian in the Cupboard, The (1995)', 'rating':5},
             {'title':'Juror, The (1996)', 'rating':5},
             {'title':'Broken Arrow (1996)', 'rating':4},
             {'title':'Mr. Wrong (1996)', 'rating':5}]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                                               title  rating
0  City of Lost Children, The (Cité des enfants p...       5
1                 Indian in the Cupboard, The (1995)       5
2                                  Juror, The (1996)       5
3                                Broken Arrow (1996)       4
4                                   Mr. Wrong (1996)       5


In [4]:
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
print(inputMovies)

   movieId                                              title  \
0       29  City of Lost Children, The (Cité des enfants p...   
1       60                 Indian in the Cupboard, The (1995)   
2       79                                  Juror, The (1996)   
3       95                                Broken Arrow (1996)   
4      102                                   Mr. Wrong (1996)   

                                   genres  rating  
0  Adventure|Drama|Fantasy|Mystery|Sci-Fi       5  
1              Adventure|Children|Fantasy       5  
2                          Drama|Thriller       5  
3               Action|Adventure|Thriller       4  
4                                  Comedy       5  


In [5]:
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
print(userSubset.groupby('movieId').count())

         userId  rating  timestamp
movieId                           
29           38      38         38
60           34      34         34
79           17      17         17
95           84      84         84
102           5       5          5


In [6]:
userSubsetGroup = userSubset.groupby(['userId'])
def take_5_elem(x):
    return len(x[1])
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)
userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[(43,       userId  movieId  rating  timestamp
6323      43       29     5.0  848994937
6328      43       60     5.0  848994424
6329      43       79     5.0  848994617
6330      43       95     4.0  848993983
6331      43      102     5.0  848994916), (6,      userId  movieId  rating  timestamp
591       6       60     4.0  845554263
597       6       79     3.0  845554907
604       6       95     4.0  845553559
606       6      102     1.0  845555436), (599,        userId  movieId  rating   timestamp
92642     599       29     3.5  1498500987
92655     599       60     2.0  1519118310
92663     599       79     2.0  1519336237
92669     599       95     2.0  1498510588), (160,        userId  movieId  rating  timestamp
23236     160       60     2.0  971619579
23238     160       79     1.0  971196754
23241     160       95     1.0  971112529), (274,        userId  movieId  rating   timestamp
39241     274       29     4.0  1238050945
39248     274       60     3.5  1171827419
39257 

  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)


In [7]:
pearsonCorrelationDict = {}
for name, group in userSubsetGroup:
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    nRatings = len(group)
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    tempRatingList = temp_df['rating'].tolist()
    tempGroupList = group['rating'].tolist()
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [8]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  userId
0         1.000000      43
1        -0.471405       6
2         0.333333     599
3         0.500000     160
4         0.944911     274


In [9]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex  userId
0               1.0      43
22              1.0     437
19              1.0     373
23              1.0     480
24              1.0     492


In [10]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating  timestamp
0               1.0      43        1     5.0  848993983
1               1.0      43        3     5.0  848994405
2               1.0      43        5     5.0  848994281
3               1.0      43        7     5.0  848994392
4               1.0      43        8     5.0  848994814
..              ...     ...      ...     ...        ...
95              1.0      43      595     5.0  848993693
96              1.0      43      596     5.0  848994332
97              1.0      43      597     5.0  848993817
98              1.0      43      609     5.0  848994863
99              1.0      43      610     5.0  848994332

[100 rows x 5 columns]


In [11]:
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating  timestamp  weightedRating
0              1.0      43        1     5.0  848993983             5.0
1              1.0      43        3     5.0  848994405             5.0
2              1.0      43        5     5.0  848994281             5.0
3              1.0      43        7     5.0  848994392             5.0
4              1.0      43        8     5.0  848994814             5.0


In [12]:
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                   9.778245           38.779645
2                   5.778245           20.140522
3                   5.333333           19.000000
5                   4.500000           14.500000
6                  10.278245           43.279645


In [13]:
recommendation_df = pd.DataFrame()
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     3.965911        1
2                                     3.485578        2
3                                     3.562500        3
5                                     3.222222        5
6                                     4.210801        6
7                                     3.739130        7
8                                     4.028325        8
9                                     3.150000        9
10                                    3.680350       10
11                                    3.550000       11


In [14]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

         weighted average recommendation score  movieId
movieId                                                
4450                                       5.0     4450
238                                        5.0      238
3677                                       5.0     3677
84944                                      5.0    84944
2394                                       5.0     2394
...                                        ...      ...
182715                                     NaN   182715
184245                                     NaN   184245
188675                                     NaN   188675
188833                                     NaN   188833
189381                                     NaN   189381

[4993 rows x 2 columns]


In [15]:
recommended_movie=movies_df.loc[movies_df['movieId'].isin(recommendation_df['movieId'])]
recommended_movie=recommended_movie.loc[~recommended_movie.movieId.isin(userSubset['movieId'])]
print(recommended_movie)

      movieId                                  title  \
0           1                       Toy Story (1995)   
1           2                         Jumanji (1995)   
2           3                Grumpier Old Men (1995)   
4           5     Father of the Bride Part II (1995)   
5           6                            Heat (1995)   
...       ...                                    ...   
9710   187595         Solo: A Star Wars Story (2018)   
9713   188301            Ant-Man and the Wasp (2018)   
9714   188675                          Dogman (2018)   
9717   188833  The Man Who Killed Don Quixote (2018)   
9721   189381                        SuperFly (2018)   

                                           genres  
0     Adventure|Animation|Children|Comedy|Fantasy  
1                      Adventure|Children|Fantasy  
2                                  Comedy|Romance  
4                                          Comedy  
5                           Action|Crime|Thriller  
...            