In [2]:
import pandas as pd
from math import sqrt
import numpy as np

In [3]:
mdf = pd.read_csv('movies.csv')
rdf = pd.read_csv('ratings.csv')
mdf.info()
rdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [4]:
userInput = [{'title':'RoboCop (1987)', 'rating':3},
             {'title':'Who Framed Roger Rabbit? (1988)', 'rating':5},
             {'title':'House on Haunted Hill (1999)', 'rating':2},
             {'title':'Being John Malkovich (1999)', 'rating':4},
             {'title':'Bone Collector, The (1999)', 'rating':2}]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                             title  rating
0                   RoboCop (1987)       3
1  Who Framed Roger Rabbit? (1988)       5
2     House on Haunted Hill (1999)       2
3      Being John Malkovich (1999)       4
4       Bone Collector, The (1999)       2


In [5]:
inputId = mdf[mdf['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies.drop('genres', 1) #we don't really need this at the moment
inputMovies = inputMovies[['movieId','title','rating']]
print(inputMovies)

   movieId                            title  rating
0     2985                   RoboCop (1987)       3
1     2987  Who Framed Roger Rabbit? (1988)       5
2     2995     House on Haunted Hill (1999)       2
3     2997      Being John Malkovich (1999)       4
4     3005       Bone Collector, The (1999)       2


  inputMovies = inputMovies.drop('genres', 1) #we don't really need this at the moment


In [6]:
userSubset = rdf[rdf['movieId'].isin(inputMovies['movieId'].tolist())]
print(userSubset.groupby('movieId').count())

         userId  rating  timestamp
movieId                           
2985         70      70         70
2987         97      97         97
2995         14      14         14
2997         99      99         99
3005         25      25         25


In [7]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
  # print (len(x[1]))
  return len(x[1])
    

#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[(19,       userId  movieId  rating  timestamp
2831      19     2985     3.0  965703785
2832      19     2987     5.0  965709047
2833      19     2995     2.0  965705322
2834      19     2997     4.0  965705619
2835      19     3005     2.0  965711873), (68,        userId  movieId  rating   timestamp
10832      68     2985     3.0  1233381010
10833      68     2987     2.5  1158531994
10836      68     2995     2.0  1269123657
10837      68     2997     4.5  1158530999
10839      68     3005     4.0  1158535433), (274,        userId  movieId  rating   timestamp
39722     274     2985     3.5  1171756231
39724     274     2987     4.0  1171428374
39725     274     2995     3.5  1171828813
39726     274     2997     4.0  1222883767
39728     274     3005     3.5  1285027059), (608,        userId  movieId  rating   timestamp
99108     608     2985     2.0  1117408311
99110     608     2987     3.0  1117490688
99111     608     2995     3.0  1117507097
99112     608     2997     2.0  11175

In [8]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:
  
  #Let's start by sorting the input and current user group so the values aren't mixed up later on
  group = group.sort_values(by='movieId')
  inputMovies = inputMovies.sort_values(by='movieId')
  
  #Get the N for the formula
  nRatings = len(group)
  
  #Get the review scores for the movies that they both have in common
  temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
  
  #And then store them in a temporary buffer variable in a list format to facilitate future calculations
  tempRatingList = temp_df['rating'].tolist()
  
  #Let's also put the current user group reviews in a list format
  tempGroupList = group['rating'].tolist()
  
  
  #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
  Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
  Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
  Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
  
  #If the denominator is different than zero, then divide, else, 0 correlation.
  if Sxx != 0 and Syy != 0:
    pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
  else:
      pearsonCorrelationDict[name] = 0

In [9]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  userId
0         1.000000      19
1         0.055480      68
2         0.910182     274
3        -0.342997     608
4         0.774597     135


In [10]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex  userId
0               1.0      19
64              1.0     328
80              1.0     577
39              1.0     101
40              1.0     115


In [11]:
topUsersRating=topUsers.merge(rdf, left_on='userId', right_on='userId', how='inner')
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating  timestamp
0               1.0      19        1     4.0  965705637
1               1.0      19        2     3.0  965704331
2               1.0      19        3     3.0  965707636
3               1.0      19        7     2.0  965706657
4               1.0      19       10     2.0  965709556
..              ...     ...      ...     ...        ...
95              1.0      19      466     3.0  965707518
96              1.0      19      468     3.0  965706967
97              1.0      19      472     2.0  965707518
98              1.0      19      474     3.0  965710914
99              1.0      19      480     2.0  965703785

[100 rows x 5 columns]


In [12]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating  timestamp  weightedRating
0              1.0      19        1     4.0  965705637             4.0
1              1.0      19        2     3.0  965704331             3.0
2              1.0      19        3     3.0  965707636             3.0
3              1.0      19        7     2.0  965706657             2.0
4              1.0      19       10     2.0  965709556             2.0


In [13]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                  23.672737           96.735466
2                  17.474596           54.847297
3                   8.479583           25.002818
4                   0.397360            0.596040
5                   5.345766           12.239764


In [14]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']



recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     4.086366        1
2                                     3.138688        2
3                                     2.948591        3
4                                     1.500000        4
5                                     2.289618        5
6                                     3.865589        6
7                                     2.618817        7
8                                     3.000000        8
9                                     1.500000        9
10                                    3.343711       10


In [15]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
4495                                       5.0     4495
170705                                     5.0   170705
44193                                      5.0    44193
26073                                      5.0    26073
72171                                      5.0    72171
88448                                      5.0    88448
2654                                       5.0     2654
89118                                      5.0    89118
26326                                      5.0    26326
67618                                      5.0    67618


In [16]:
recommended_movie=mdf.loc[mdf['movieId'].isin(recommendation_df['movieId'])]

#we don't want to recommend the same movie
recommended_movie=recommended_movie.loc[~recommended_movie.movieId.isin(userSubset['movieId'])]

print(recommended_movie.loc[1000:1006,:])

      movieId                                              title  \
1000     1302                             Field of Dreams (1989)   
1001     1303                  Man Who Would Be King, The (1975)   
1002     1304          Butch Cassidy and the Sundance Kid (1969)   
1003     1305                                Paris, Texas (1984)   
1004     1306  Until the End of the World (Bis ans Ende der W...   
1005     1307                     When Harry Met Sally... (1989)   

                      genres  
1000  Children|Drama|Fantasy  
1001         Adventure|Drama  
1002          Action|Western  
1003           Drama|Romance  
1004  Adventure|Drama|Sci-Fi  
1005          Comedy|Romance  
