In [83]:
import pandas as pd
from math import sqrt
import numpy as np

In [84]:
movies_data = pd.read_csv('movies.csv')
ratings_data = pd.read_csv('ratings.csv')

In [85]:
print(movies_data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None


In [86]:
print(ratings_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None


In [101]:
userInput = [{'title':'Breakfast Club, The', 'rating':5},
             {'title':'Toy Story', 'rating':1},
             {'title':'Jumanji', 'rating':1},
             {'title':'Pulp Fiction', 'rating':5},
             {'title':'Akira', 'rating':4.5}]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                 title  rating
0  Breakfast Club, The     5.0
1            Toy Story     1.0
2              Jumanji     1.0
3         Pulp Fiction     5.0
4                Akira     4.5


In [100]:
inputId = movies_data[movies_data['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
# inputMovies = inputMovies.drop('year', 1) #we don't really need this at the moment
inputMovies = inputMovies[['movieId','title','rating']]
print(inputMovies)


Empty DataFrame
Columns: [movieId, title, rating]
Index: []


In [60]:
userSubset = ratings_data[ratings_data['movieId'].isin(inputMovies['movieId'].tolist())]
print(userSubset.groupby('movieId').count())

Empty DataFrame
Columns: [userId, rating, timestamp]
Index: []


In [58]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])


#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[]


  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)


In [61]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()

    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()


    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [63]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

ValueError: ignored

In [None]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

In [None]:
topUsersRating=topUsers.merge(ratings_data, left_on='userId', right_on='userId', how='inner')
print(topUsersRating.head(100))

In [None]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

In [None]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

In [None]:
#Creates an empty dataframe
recommendation_data = pd.DataFrame()

#Now we take the weighted average
recommendation_data['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_data['movieId'] = tempTopUsersRating.index
print(recommendation_data.head(10))

In [93]:
recommendation_data = recommendation_data.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_data)

NameError: ignored

In [None]:
recommended_movie=movies_data.loc[movies_data['movieId'].isin(recommendation_df['movieId'])]

#we don't want to recommend the same movie
recommended_movie=recommended_movie.loc[~recommended_movie.movieId.isin(userSubset['movieId'])]

print(recommended_movie)