In [17]:
# importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt



#### Loading the necessary tables first

In [18]:
movies_list = pd.read_csv('E:\MAID\ADSA_pro\code\movies.csv')
popular_movies = pd.read_csv('E:\MAID\ADSA_pro\code\popular_movies.csv')
movies_stats = pd.read_csv('E:\MAID\ADSA_pro\code\movies_stats.csv')


<hr>

<a id="ref3"></a>
# Collaborative Filtering

We are using here the technique of  __Collaborative Filtering__(__User-User Filtering__.). It uses other users to recommend movies to the user, finding users that have similar preferences as the input user and then recommends movies that they have liked to the input user which he didn't see. There are several methods of finding similar users, here we will be using the __Pearson Correlation __.


The steps to create recommendation system are:
- Select a user.
- Find the movies the user has watched
- Based on his rating to movies, find the top N neighbours 
- Get the watched movie record of the user for each neighbour.
- Calculate a similarity score using some formula
- Recommend the items with the highest score

In [19]:
recommend_userid = 55


In [20]:
inputMovies = popular_movies.loc[popular_movies['userId'] == recommend_userid]
inputMovies.reset_index(drop=True, inplace=True)
inputMovies.head()

Unnamed: 0,userId,movieId,rating
0,55,73,4.5
1,55,610,4.0
2,55,665,4.5
3,55,668,5.0
4,55,670,5.0


#### Add movieId to selected user
extract the input movies's ID's from the movies dataframe and add them into it.

We can achieve this by first filtering out the rows that contain the input movies' ID and then merging this subset with the input dataframe. We also drop unnecessary columns for the input to save memory space.

In [21]:
#Filtering out the movies by movieId
#Then merging it so we can get the movieId. It's implicitly merging it by title.

inputMovies = pd.merge(inputMovies, movies_list, how='inner', on=['movieId'])

inputMovies.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,55,73,4.5,"Misérables, Les (1995)",Drama|War
1,55,610,4.0,Heavy Metal (1981),Action|Adventure|Animation|Horror|Sci-Fi
2,55,665,4.5,Underground (1995),Comedy|Drama|War
3,55,668,5.0,Song of the Little Road (Pather Panchali) (1955),Drama
4,55,670,5.0,"World of Apu, The (Apur Sansar) (1959)",Drama


#### The users who has seen the same movies
Now with the movie ID's in our input, we can now get the subset of users that have watched and reviewed the movies in our input.


In [22]:
#Filtering out users that have watched movies that the input has watched and storing it
userSubset = popular_movies[popular_movies['movieId'].isin(inputMovies['movieId'].tolist())]
userSubset.head()

Unnamed: 0,userId,movieId,rating
2,1,665,5.0
23,1,6016,5.0
29,1,7361,5.0
123,2,4995,5.0
249,3,4995,4.0


In [23]:
userSubset.shape


(129162, 3)

In [24]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])


sort these groups so the users that share the most movies in common with the input have higher priority. This provides a richer recommendation since we won't go through every single user. Then We will select a subset of users to iterate through.

In [25]:
#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)


In [26]:
userSubsetGroup = userSubsetGroup[1:100]


#### Pearson Correlation
Pearson correlation is used to measure the strength of a linear association between two variables. The formula for finding this coefficient between sets X and Y with N values can be seen in the image below. 
Pearson correlation is invariant to scaling, i.e. multiplying all elements by a nonzero constant or adding any constant to all elements. For example, if you have two vectors X and Y, then, pearson(X, Y) = pearson(X, 2 * Y + 3). This is a pretty important property in recommendation systems because for example two users might rate two movies different in terms of absolute rates, but they would be similar users (with similar ideas) with similar rates in various scales .


![alt text](pearson_formula.png)

The values given by the formula vary from r = -1 to r = 1, where 1 forms a direct 
correlation between the two entities (it means a perfect positive correlation) 
and -1 forms a perfect negative correlation.

In case of “1” means that the two users have similar taste
while in case of “-1” means totally the opposite.

In [28]:
def get_pearson(subset,movies):
    pearsonCorrelationDict = {}

    #For every user group in our subset
    for ID, group in subset:
        #Let's start by sorting the input and current user group so the values aren't mixed up later on
        group = group.sort_values(by='movieId')
        inputMovies = movies.sort_values(by='movieId')
        #Get the N for the formula
        nRatings = len(group)
        #Get the ratings for the movies that they both have in common
        temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
        #store them in a temporary buffer variable in a list format to facilitate future calculations
        tempRatingList = temp_df['rating'].tolist()
        #Let's also put the current user group reviews in a list format
        tempGroupList = group['rating'].tolist()
        #Now let's calculate the pearson correlation between two users, so called, x and y
        Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
        Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
        Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
        
        #If the denominator is different than zero, then divide, else, 0 correlation.
        if Sxx != 0 and Syy != 0:
            pearsonCorrelationDict[ID] = Sxy/sqrt(Sxx*Syy)
        else:
            pearsonCorrelationDict[ID] = 0


    return pearsonCorrelationDict


In [29]:
pearsonCorrelationDict = get_pearson(userSubsetGroup,inputMovies)
pearsonCorrelationDict.items()

dict_items([(115900, 0.16386755713853005), (20055, 0.07787611974767762), (72315, 0.28941438070815306), (80974, -0.3072549338995154), (97994, 0.14170505031628433), (77657, 0.4558794171340812), (12593, 0.31640862263215547), (92046, -0.4031892562490727), (107249, 0.4313146063237124), (110971, 0.11546055130293431), (71403, -0.0851599433705518), (144322, -0.05564148840746776), (146925, 0.2964520160618152), (120783, -0.2117411343886713), (11539, -0.24258614802928366), (93424, -0.0769567651749761), (115037, 0.1259633739264735), (10077, -0.2430862174022012), (49998, 0.13898526132014835), (55704, 0.19518001458970666), (81403, 0.10660035817781706), (142226, -0.5345224838248488), (148115, 0.30714755841698893), (150342, -0.06662966046528439), (2003, 0.5004010588597086), (18306, -0.3440414906637061), (24869, 0.0), (29021, 0.21367393083174768), (85757, -0.10307764064044159), (104992, 0.305967061431563), (120544, 0.09249844854030241), (122803, -0.07909557182242624), (133921, 0.2175844552560761), (145

In [32]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.drop(pearsonDF.index[0], inplace=True)
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head(10)

Unnamed: 0,similarityIndex,userId
0,0.077876,20055
1,0.289414,72315
2,-0.307255,80974
3,0.141705,97994
4,0.455879,77657
5,0.316409,12593
6,-0.403189,92046
7,0.431315,107249
8,0.115461,110971
9,-0.08516,71403


In [33]:
#get the top 50 users that are most similar to the input.
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head(5)

Unnamed: 0,similarityIndex,userId
86,0.575829,118647
41,0.5547,65150
23,0.500401,2003
58,0.461593,145626
75,0.458906,48453


In [34]:
#Get the ratings of the selected users to their movies
topUsersRating=topUsers.merge(popular_movies, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,0.575829,118647,1,5.0
1,0.575829,118647,6,4.0
2,0.575829,118647,10,4.0
3,0.575829,118647,11,4.0
4,0.575829,118647,16,4.0


multiply the movie rating by its weight (The similarity index), then sum up the new weighted ratings and divide it by the sum of the weights (normalize the ratings).

In [45]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,0.575829,118647,1,5.0,2.879145
1,0.575829,118647,6,4.0,2.303316
2,0.575829,118647,10,4.0,2.303316
3,0.575829,118647,11,4.0,2.303316
4,0.575829,118647,16,4.0,2.303316


In [46]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,7.505383,33.008019
2,0.739346,3.207762
3,1.530094,6.120375
4,0.240259,0.961035
5,0.355227,1.420908


In [51]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()


In [52]:
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score
movieId,Unnamed: 1_level_1
1,4.397913
2,4.338648
3,4.0
4,4.0
5,4.0


In [55]:
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.397913,1
2,4.338648,2
3,4.0,3
4,4.0,4
5,4.0,5


Now let's sort it and see the top 10 movies that the algorithm recommended!

In [56]:
recommendation_df = movies_list.loc[movies_list['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]
recommendation_df.reset_index(drop=True, inplace=True)
recommendation_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,10,GoldenEye (1995),Action|Adventure|Thriller
9,11,"American President, The (1995)",Comedy|Drama|Romance
