In [3]:
import pandas as pd
from math import sqrt
import numpy as np

In [4]:
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')
print(movies_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None


In [5]:
userInput = [{'title':'Pocahontas (1995)', 'rating':5},
             {'title':'Mortal Kombat (1995)', 'rating':1},
             {'title':'Home for the Holidays (1995)', 'rating':1},
             {'title':'Eye for an Eye (1996)', 'rating':5},
             {'title':'To Die For (1995)', 'rating':4.5}]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                          title  rating
0             Pocahontas (1995)     5.0
1          Mortal Kombat (1995)     1.0
2  Home for the Holidays (1995)     1.0
3         Eye for an Eye (1996)     5.0
4             To Die For (1995)     4.5


In [6]:
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies[['movieId','title','rating']]
print(inputMovies)

   movieId                         title  rating
0       44          Mortal Kombat (1995)     1.0
1       45             To Die For (1995)     4.5
2       48             Pocahontas (1995)     5.0
3       57  Home for the Holidays (1995)     1.0
4       61         Eye for an Eye (1996)     5.0


In [7]:
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
print(userSubset.groupby('movieId').count())

         userId  rating  timestamp
movieId                           
44           46      46         46
45           32      32         32
48           68      68         68
57            8       8          8
61            6       6          6


In [8]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    

#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[(414,        userId  movieId  rating  timestamp
62319     414       44     2.0  961516249
62320     414       45     3.0  961438476
62323     414       48     3.0  961437741
62327     414       57     3.0  961517912), (599,        userId  movieId  rating   timestamp
92649     599       44     2.5  1498517161
92650     599       45     2.5  1498516640
92654     599       57     2.5  1519240604
92656     599       61     2.5  1519327817), (6,      userId  movieId  rating  timestamp
586       6       45     3.0  845553907
592       6       61     4.0  845555454), (19,       userId  movieId  rating  timestamp
2285      19       44     3.0  965710019
2287      19       48     1.0  965709172), (21,       userId  movieId  rating   timestamp
3224      21       44     1.0  1376822969
3225      21       48     3.0  1376822924)]


In [9]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
   
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
   
    
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [10]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  userId
0         0.574801     414
1         0.000000     599
2         1.000000       6
3        -1.000000      19
4         1.000000      21


In [11]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex  userId
29              1.0     476
23              1.0     386
2               1.0       6
32              1.0     603
4               1.0      21


In [12]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating  timestamp
0               1.0     476        1     4.0  835021447
1               1.0     476        2     4.0  835021693
2               1.0     476       10     3.0  835021420
3               1.0     476       11     3.0  835021635
4               1.0     476       13     3.0  835022487
..              ...     ...      ...     ...        ...
95              1.0     386      204     1.0  842613737
96              1.0     386      208     2.0  842610246
97              1.0     386      223     3.0  842613794
98              1.0     386      288     1.0  842610269
99              1.0     386      292     1.0  842610228

[100 rows x 5 columns]


In [13]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating  timestamp  weightedRating
0              1.0     476        1     4.0  835021447             4.0
1              1.0     476        2     4.0  835021693             4.0
2              1.0     476       10     3.0  835021420             3.0
3              1.0     476       11     3.0  835021635             3.0
4              1.0     476       13     3.0  835022487             3.0


In [14]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                   9.574801           35.299205
2                   6.574801           21.724404
3                   4.574801           15.299205
4                   2.000000            6.000000
5                   7.574801           22.649602


In [15]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     3.686678        1
2                                     3.304192        2
3                                     3.344234        3
4                                     3.000000        4
5                                     2.990125        5
6                                     3.520104        6
7                                     3.396050        7
8                                     3.000000        8
9                                          NaN        9
10                                    3.215986       10


In [16]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

         weighted average recommendation score  movieId
movieId                                                
43376                                      5.0    43376
495                                        5.0      495
2843                                       5.0     2843
6791                                       5.0     6791
3473                                       5.0     3473
...                                        ...      ...
185435                                     NaN   185435
187593                                     NaN   187593
188675                                     NaN   188675
188833                                     NaN   188833
189381                                     NaN   189381

[5886 rows x 2 columns]


In [17]:
recommended_movie=movies_df.loc[movies_df['movieId'].isin(recommendation_df['movieId'])]

#we don't want to recommend the same movie
recommended_movie=recommended_movie.loc[~recommended_movie.movieId.isin(userSubset['movieId'])]

print(recommended_movie)

      movieId                                  title  \
0           1                       Toy Story (1995)   
1           2                         Jumanji (1995)   
2           3                Grumpier Old Men (1995)   
3           4               Waiting to Exhale (1995)   
4           5     Father of the Bride Part II (1995)   
...       ...                                    ...   
9709   187593                      Deadpool 2 (2018)   
9710   187595         Solo: A Star Wars Story (2018)   
9714   188675                          Dogman (2018)   
9717   188833  The Man Who Killed Don Quixote (2018)   
9721   189381                        SuperFly (2018)   

                                           genres  
0     Adventure|Animation|Children|Comedy|Fantasy  
1                      Adventure|Children|Fantasy  
2                                  Comedy|Romance  
3                            Comedy|Drama|Romance  
4                                          Comedy  
...            