In [1]:
import pandas as pd
from math import sqrt
import numpy as np

In [2]:
movies_df = pd.read_csv(r'C:\Users\tito\Downloads\ml-latest-small\ml-latest-small\movies.csv')
ratings_df = pd.read_csv(r'C:\Users\tito\Downloads\ml-latest-small\ml-latest-small\ratings.csv')
print(movies_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None


In [3]:
userInput = [{'title':'Seven (a.k.a. Se7en) (1995)', 'rating':5},
             {'title':'Rob Roy (1995)', 'rating':5.0},
             {'title':'Imperium (2016)', 'rating':2.5},
             {'title':'Return of the Pink Panther (1995)', 'rating':5.0},
             {'title':'Top Gun (1986)', 'rating':4.0}]

inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                               title  rating
0        Seven (a.k.a. Se7en) (1995)     5.0
1                     Rob Roy (1995)     5.0
2                    Imperium (2016)     2.5
3  Return of the Pink Panther (1995)     5.0
4                     Top Gun (1986)     4.0


In [6]:
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies.drop('genres', 1) #we don't really need this at the moment
inputMovies = inputMovies[['movieId','title','rating',]]
print(inputMovies)

   movieId                        title  rating
0       47  Seven (a.k.a. Se7en) (1995)     5.0
1      151               Rob Roy (1995)     5.0
2     1101               Top Gun (1986)     4.0
3   162828              Imperium (2016)     2.5


  inputMovies = inputMovies.drop('genres', 1) #we don't really need this at the moment


In [7]:
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
print(userSubset.groupby('movieId').count())

         userId  rating  timestamp
movieId                           
47          203     203        203
151          44      44         44
1101         83      83         83
162828        1       1          1


In [8]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    

#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])


[(64,       userId  movieId  rating   timestamp
9440      64       47     4.5  1161520185
9449      64      151     3.0  1161529309
9539      64     1101     3.0  1161521245), (91,        userId  movieId  rating   timestamp
14136      91       47     4.5  1112712832
14147      91      151     3.0  1112710950
14276      91     1101     3.0  1112711132), (140,        userId  movieId  rating   timestamp
21091     140       47     4.0   942842215
21101     140      151     4.0   949667175
21206     140     1101     4.5  1055093098), (249,        userId  movieId  rating   timestamp
36379     249       47     5.0  1346757700
36391     249      151     4.0  1357685694
36447     249     1101     3.5  1416930822), (288,        userId  movieId  rating   timestamp
42126     288       47     3.5  1054568985
42136     288      151     4.0  1079984467
42274     288     1101     3.0   976138347)]


  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)


In [9]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
   
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
   
    
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [10]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  userId
0         0.500000      64
1         0.500000      91
2        -1.000000     140
3         0.755929     249
4         0.866025     288


In [11]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex  userId
50              1.0     469
31              1.0     239
29              1.0     222
33              1.0     268
35              1.0     307


In [12]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating  timestamp
0               1.0     469        1     4.0  965336888
1               1.0     469        6     3.0  965336673
2               1.0     469       10     2.0  965334356
3               1.0     469       11     3.0  965425831
4               1.0     469       29     4.0  965335401
..              ...     ...      ...     ...        ...
95              1.0     469     1080     5.0  965336888
96              1.0     469     1082     4.0  965333459
97              1.0     469     1086     4.0  965846225
98              1.0     469     1089     5.0  965336673
99              1.0     469     1090     4.0  965846279

[100 rows x 5 columns]


In [13]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating  timestamp  weightedRating
0              1.0     469        1     4.0  965336888             4.0
1              1.0     469        6     3.0  965336673             3.0
2              1.0     469       10     2.0  965334356             2.0
3              1.0     469       11     3.0  965425831             3.0
4              1.0     469       29     4.0  965335401             4.0


In [14]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                  22.424909           83.935605
2                  17.424909           51.414632
3                   7.668980           20.622967
4                   0.000000            0.000000
5                   5.168980           11.140916


In [15]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     3.742963        1
2                                     2.950640        2
3                                     2.689141        3
4                                          NaN        4
5                                     2.155341        5
6                                     4.089286        6
7                                     2.727273        7
9                                     2.000000        9
10                                    3.337616       10
11                                    3.309524       11


In [16]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

         weighted average recommendation score  movieId
movieId                                                
86504                                      5.0    86504
65261                                      5.0    65261
5034                                       5.0     5034
108795                                     5.0   108795
67618                                      5.0    67618
...                                        ...      ...
171023                                     NaN   171023
177593                                     NaN   177593
180095                                     NaN   180095
180777                                     NaN   180777
185135                                     NaN   185135

[6816 rows x 2 columns]


In [17]:
recommended_movie=movies_df.loc[movies_df['movieId'].isin(recommendation_df['movieId'])]

#we don't want to recommend the same movie
recommended_movie=recommended_movie.loc[~recommended_movie.movieId.isin(userSubset['movieId'])]

print(recommended_movie)

      movieId                                  title  \
0           1                       Toy Story (1995)   
1           2                         Jumanji (1995)   
2           3                Grumpier Old Men (1995)   
3           4               Waiting to Exhale (1995)   
4           5     Father of the Bride Part II (1995)   
...       ...                                    ...   
9707   187031  Jurassic World: Fallen Kingdom (2018)   
9709   187593                      Deadpool 2 (2018)   
9710   187595         Solo: A Star Wars Story (2018)   
9713   188301            Ant-Man and the Wasp (2018)   
9716   188797                             Tag (2018)   

                                           genres  
0     Adventure|Animation|Children|Comedy|Fantasy  
1                      Adventure|Children|Fantasy  
2                                  Comedy|Romance  
3                            Comedy|Drama|Romance  
4                                          Comedy  
...            