## Collaborative Filtering

In [1]:
# import all the libraries
import pandas as pd

from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Preprocessing 

In [4]:
movies_df = pd.read_csv('movies.csv')

ratings_df = pd.read_csv('ratings.csv')

In [5]:
#Using regular expressions to find a year stored between parentheses
#We specify the parantheses so we don't conflict with movies that have years in their titles
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)
#Removing the parentheses
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)
#Removing the years from the 'title' column
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')
#Applying the strip function to get rid of any ending whitespace characters that may have appeared
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())

In [6]:
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [7]:
# dropping the genres column
movies_df = movies_df.drop('genres',1)

In [8]:
# drop timestamp from ratings
ratings_df = ratings_df.drop('timestamp', 1)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0



The process for creating a User Based recommendation system is as follows:

*   Select a user with the movies the user has watched
*   Based on his rating of the movies, find the top X neighbours
*   Get the watched movie record of the user for each neighbour
*   Calculate a similarity score using some formula
*   Recommend the items with the highest score

In [9]:
userInput = [
            {'title':'Breakfast Club, The', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [10]:
#Filtering out the movies by title
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
#Then merging it so we can get the movieId. It's implicitly merging it by title.
inputMovies = pd.merge(inputId, inputMovies)
#Dropping information we won't use from the input dataframe
inputMovies = inputMovies.drop('year', 1)
#Final input dataframe
#If a movie you added in above isn't here, then it might not be in the original 
#dataframe or it might spelled differently, please check capitalisation.
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [11]:
#Filtering out users that have watched movies that the input has watched and storing it
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
userSubset.head()

Unnamed: 0,userId,movieId,rating
19,4,296,4.0
441,12,1968,3.0
479,13,2,2.0
531,13,1274,5.0
681,14,296,2.0


In [12]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

In [14]:
userSubsetGroup.get_group(1130)

Unnamed: 0,userId,movieId,rating
104167,1130,1,0.5
104168,1130,2,4.0
104214,1130,296,4.0
104363,1130,1274,4.5
104443,1130,1968,4.5


In [15]:
#Sorting it so users with movies most in common with the input will have priority
userSubsetGroup =sorted(userSubsetGroup, key=lambda x : len(x[1]), reverse =True)

In [16]:
userSubsetGroup = userSubsetGroup[0:100]

In [19]:
pearsonCorrelationDict = {}

#for every user group in our subset
for name, group in userSubsetGroup:
    group = group.sort_values(by = 'movieId')
    inputMovies = inputMovies.sort_values(by = 'movieId')
    
    #get the N for the formula
    nRatings = len(group)
    
    #get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    
    #storing the temporary buffer variable in a list format to facilitate futute calculation
    tempRatingList = group['rating'].tolist()
    
    tempGroupList =group['rating'].tolist()
    
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0
    


In [20]:
pearsonCorrelationDict.items()

dict_items([(75, 1.0), (106, 1.0), (686, 1.0), (815, 1.0), (1040, 1.0), (1130, 1.0), (1502, 1.0), (1599, 1.0), (1625, 1.0), (1950, 1.0), (2065, 1.0), (2128, 1.0), (2432, 1.0), (2791, 1.0), (2839, 1.0), (2948, 1.0), (3025, 1.0), (3040, 1.0), (3186, 1.0), (3271, 1.0), (3429, 1.0), (3734, 1.0), (4099, 1.0), (4208, 1.0), (4282, 1.0), (4292, 1.0), (4415, 1.0), (4586, 1.0), (4725, 1.0), (4818, 1.0), (5104, 1.0), (5165, 1.0), (5547, 1.0), (6082, 1.0), (6207, 1.0), (6366, 1.0), (6482, 1.0), (6530, 1.0), (7235, 1.0), (7403, 1.0), (7641, 1.0), (7996, 1.0), (8008, 1.0), (8086, 1.0), (8245, 1.0), (8572, 1.0), (8675, 1.0), (9101, 1.0), (9358, 1.0), (9663, 1.0), (9994, 1.0), (10248, 1.0), (10315, 1.0), (10368, 1.0), (10607, 1.0), (10707, 1.0), (10863, 1.0), (11314, 1.0), (11399, 1.0), (11769, 1.0), (11827, 1.0), (12069, 1.0), (12120, 1.0), (12211, 1.0), (12325, 1.0), (12916, 1.0), (12921, 1.0), (13053, 1.0), (13142, 1.0), (13260, 1.0), (13366, 1.0), (13768, 1.0), (13888, 1.0), (13923, 1.0), (13934, 

In [21]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,1.0,75
1,1.0,106
2,1.0,686
3,1.0,815
4,1.0,1040


In [22]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
0,1.0,75
63,1.0,12211
73,1.0,13923
72,1.0,13888
71,1.0,13768


In [23]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.0,75,1,5.0
1,1.0,75,2,3.5
2,1.0,75,16,3.0
3,1.0,75,19,2.5
4,1.0,75,21,4.5


In [24]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,75,1,5.0,5.0
1,1.0,75,2,3.5,3.5
2,1.0,75,16,3.0,3.0
3,1.0,75,19,2.5,2.5
4,1.0,75,21,4.5,4.5


In [25]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.0,75,1,5.0
1,1.0,75,2,3.5
2,1.0,75,16,3.0
3,1.0,75,19,2.5
4,1.0,75,21,4.5


In [26]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,75,1,5.0,5.0
1,1.0,75,2,3.5,3.5
2,1.0,75,16,3.0,3.0
3,1.0,75,19,2.5,2.5
4,1.0,75,21,4.5,4.5


In [27]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,50.0,192.0
2,50.0,142.0
3,17.0,47.0
4,1.0,3.0
5,12.0,31.0


In [28]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.84,1
2,2.84,2
3,2.764706,3
4,3.0,4
5,2.583333,5


In [29]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
3759,5.0,3759
2848,5.0,2848
2732,5.0,2732
2733,5.0,2733
91199,5.0,91199
1900,5.0,1900
139098,5.0,139098
6918,5.0,6918
106109,5.0,106109
27834,5.0,27834


In [30]:
movies_df.loc[movies_df['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title,year
1817,1900,"Children of Heaven, The (Bacheha-Ye Aseman)",1997
2647,2732,Jules and Jim (Jules et Jim),1961
2648,2733,Vibes,1988
2763,2848,Othello (Tragedy of Othello: The Moor of Venic...,1952
3669,3759,Fun and Fancy Free,1947
6808,6918,"Unvanquished, The (Aparajito)",1957
9523,27834,"Return, The (Vozvrashcheniye)",2003
18272,91199,Weekend,2011
22105,106109,"Masquerade (Gwanghai, Wangyidoen namja)",2012
30704,139098,Four Days in October,2010


In [32]:
import tensorflow
import keras

ModuleNotFoundError: No module named 'tensorflow'