# Recommending movies based on user-based Collaborative Filtering!

## Downloading the dataset!

In [None]:
# Download the dataset archive; wget's -O option writes it to moviedataset.zip.
!wget -O moviedataset.zip https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/moviedataset.zip
print('unziping ...')
# Extract into the working directory: -o overwrites existing files without prompting,
# -j discards the directory structure stored in the archive.
!unzip -o -j moviedataset.zip 

## Importing the required libraries

In [1]:
# Dataframe manipulation library
import pandas as pd

# Math functions, we'll only need the sqrt function so let's import only that
from math import sqrt

# Numerical python library
import numpy as np

# Visualization Library
import matplotlib.pyplot as plt


## Data Preprocessing! 

In [22]:
# Read both CSV files from the working directory, movies first and ratings second.
movies, ratings = (pd.read_csv(csv_name) for csv_name in ("movies.csv", "ratings.csv"))

In [23]:
# Preview the first rows of the raw movies table (movieId, title, genres).
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [24]:
# Preview the first rows of the raw ratings table (userId, movieId, rating, timestamp).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [35]:
# The timestamp is not used by the recommender, so drop it.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 no longer accepts;
# errors="ignore" plus rebinding makes the cell safe to re-run after the column is gone.
ratings = ratings.drop(columns="timestamp", errors="ignore")

In [25]:
# Genres are not used by user-based collaborative filtering, so drop them.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 no longer accepts;
# errors="ignore" plus rebinding makes the cell safe to re-run after the column is gone.
movies = movies.drop(columns="genres", errors="ignore")

In [26]:
# Confirm that the genres column is gone and only movieId and title remain.
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [27]:
# Capture the parenthesised four-digit year, e.g. "(1995)", from each title.
# A raw string keeps the backslashes intact without relying on invalid escape
# sequences (which newer Python versions warn about); the pattern itself is unchanged.
movies["year"] = movies.title.str.extract(r"(\(\d\d\d\d\))", expand=False)

In [28]:
# The new year column still carries the surrounding parentheses, e.g. "(1995)".
movies.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story (1995),(1995)
1,2,Jumanji (1995),(1995)
2,3,Grumpier Old Men (1995),(1995)
3,4,Waiting to Exhale (1995),(1995)
4,5,Father of the Bride Part II (1995),(1995)


In [29]:
# Remove the "(YYYY)" part from the titles.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave every title unchanged.
movies.title = movies.title.str.replace(r"(\(\d\d\d\d\))", "", regex=True)

In [30]:
# Titles should no longer contain the "(YYYY)" part; surrounding whitespace is stripped in a later step.
movies.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,(1995)
1,2,Jumanji,(1995)
2,3,Grumpier Old Men,(1995)
3,4,Waiting to Exhale,(1995)
4,5,Father of the Bride Part II,(1995)


In [31]:
# Trim leading/trailing whitespace left behind after removing the year.
# The vectorised .str accessor replaces the per-row lambda and passes missing
# titles through as NaN instead of raising AttributeError on a float.
movies.title = movies.title.str.strip()

In [32]:
# Preview the titles after whitespace stripping.
movies.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,(1995)
1,2,Jumanji,(1995)
2,3,Grumpier Old Men,(1995)
3,4,Waiting to Exhale,(1995)
4,5,Father of the Bride Part II,(1995)


In [33]:
# Keep only the four digits of the year, dropping the parentheses.
# Raw string: same pattern, without the invalid "\d" escape in a normal string.
# The year stays a string; rows without a match become NaN.
movies.year = movies.year.str.extract(r"(\d\d\d\d)", expand=False)

In [34]:
# The year column now holds the bare four-digit year, e.g. 1995.
movies.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [36]:
# Confirm that ratings now only carries userId, movieId and rating.
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


## Implementing user-based collaborative filtering

In [37]:
# Ratings supplied by the user we want recommendations for, as (title, rating) pairs.
# Titles are written the way they appear in the cleaned movies table.
userInput = [
    {'title': movie_title, 'rating': movie_rating}
    for movie_title, movie_rating in (
        ('Breakfast Club, The', 5),
        ('Toy Story', 3.5),
        ('Jumanji', 2),
        ("Pulp Fiction", 5),
        ('Akira', 4.5),
    )
]
inputMovies = pd.DataFrame(data=userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [49]:
# Rows of the movies table whose title matches one of the user's input titles.
title_matches = movies["title"].isin(inputMovies["title"])
input_id = movies.loc[title_matches]

In [50]:
# Show the movies-table rows (movieId, title, year) matched by the input titles.
input_id

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
293,296,Pulp Fiction,1994
1246,1274,Akira,1988
1885,1968,"Breakfast Club, The",1985


In [51]:
# Attach movieId and year to the user's ratings. With no `on=` given, pandas joins on
# the columns both frames share (the title, when inputMovies holds only title and rating).
inputMovies = input_id.merge(inputMovies)

In [52]:
# The user's ratings, now carrying movieId and year alongside title and rating.
inputMovies

Unnamed: 0,movieId,title,year,rating
0,1,Toy Story,1995,3.5
1,2,Jumanji,1995,2.0
2,296,Pulp Fiction,1994,5.0
3,1274,Akira,1988,4.5
4,1968,"Breakfast Club, The",1985,5.0


## Selecting the users who have rated at least one of the movies in `inputMovies`

In [53]:
# Keep only the rating rows that refer to one of the input movies.
rated_input_movie = ratings["movieId"].isin(inputMovies["movieId"])
subset_of_users = ratings.loc[rated_input_movie]

In [55]:
# Preview the ratings other users gave to the input movies.
subset_of_users.head()

Unnamed: 0,userId,movieId,rating
19,4,296,4.0
441,12,1968,3.0
479,13,2,2.0
531,13,1274,5.0
681,14,296,2.0


In [56]:
# One group per user, each holding that user's ratings of the input movies.
grouped_users_subset = subset_of_users.groupby(by="userId")

In [66]:
# Inspect one group as an example: the ratings userId 1130 gave to the input movies.
grouped_users_subset.get_group(1130)

Unnamed: 0,userId,movieId,rating
104167,1130,1,0.5
104168,1130,2,4.0
104214,1130,296,4.0
104363,1130,1274,4.5
104443,1130,1968,4.5


In [67]:
# Order the per-user groups by size, largest first, so that the users who rated the
# most input movies come first. Each element is a (userId, ratings DataFrame) tuple.
grouped_users_subset = list(grouped_users_subset)
grouped_users_subset.sort(key=lambda user_group: user_group[1].shape[0], reverse=True)

In [73]:
# Each element is a tuple: the userId first, then that user's ratings as a DataFrame.
# Show the first two.
grouped_users_subset[:2]

[(75,
        userId  movieId  rating
  7507      75        1     5.0
  7508      75        2     3.5
  7540      75      296     5.0
  7633      75     1274     4.5
  7673      75     1968     5.0),
 (106,
        userId  movieId  rating
  9083     106        1     2.5
  9084     106        2     3.0
  9115     106      296     3.5
  9198     106     1274     3.0
  9238     106     1968     3.5)]

In [80]:
# Only the first MAX_CANDIDATE_USERS groups are compared against the input user
# (presumably to keep the similarity loop cheap; tune as needed). The list was
# previously sorted by group size, so these are the users with the largest overlap.
MAX_CANDIDATE_USERS = 100
grouped_users_subset = grouped_users_subset[:MAX_CANDIDATE_USERS]

In [85]:
# Print only the first (userId, ratings) pair as a sanity check.
for name, group in grouped_users_subset[:1]:
    print(name, group)

75       userId  movieId  rating
7507      75        1     5.0
7508      75        2     3.5
7540      75      296     5.0
7633      75     1274     4.5
7673      75     1968     5.0


In [84]:
def _pearson_similarity(input_ratings, user_ratings):
    """Return the Pearson correlation between two paired lists of ratings.

    Parameters
    ----------
    input_ratings, user_ratings : list of float
        Ratings of the same movies, in the same order, by the input user and by
        the candidate user respectively.

    Returns
    -------
    float
        Sxy / sqrt(Sxx * Syy), or 0.0 when either list has no variance (which
        includes lists with fewer than two ratings).
    """
    n_ratings = len(user_ratings)
    if n_ratings == 0:
        return 0.0
    sum_x = sum(input_ratings)
    sum_y = sum(user_ratings)
    Sxx = sum(i**2 for i in input_ratings) - pow(sum_x, 2)/float(n_ratings)
    Syy = sum(i**2 for i in user_ratings) - pow(sum_y, 2)/float(n_ratings)
    Sxy = sum(i*j for i, j in zip(input_ratings, user_ratings)) - sum_x*sum_y/float(n_ratings)
    # Require strictly positive variances: rounding can leave a tiny negative value
    # where the true variance is zero, and sqrt() of a negative product would raise.
    if Sxx > 0 and Syy > 0:
        return Sxy/sqrt(Sxx*Syy)
    return 0.0


Pearson_Correlation_Dict = {}

# Sorting the input once is enough; it does not change between users.
inputMovies = inputMovies.sort_values(by='movieId')

# For every user group in our subset
for name, group in grouped_users_subset:
    # Pair each of this user's ratings with the input user's rating of the same movie.
    # Joining on movieId guarantees the pairs line up; sorting first keeps the pairs
    # in movieId order.
    paired = group.sort_values(by='movieId').merge(
        inputMovies[['movieId', 'rating']],
        on='movieId',
        suffixes=('_user', '_input'),
    )
    Pearson_Correlation_Dict[name] = _pearson_similarity(
        paired['rating_input'].tolist(),
        paired['rating_user'].tolist(),
    )

In [91]:
# Display the mapping userId -> similarity with the input user.
Pearson_Correlation_Dict

{75: 0.8272781516947562,
 106: 0.5860090386731182,
 686: 0.8320502943378437,
 815: 0.5765566601970551,
 1040: 0.9434563530497265,
 1130: 0.2891574659831201,
 1502: 0.8770580193070299,
 1599: 0.4385290096535153,
 1625: 0.716114874039432,
 1950: 0.179028718509858,
 2065: 0.4385290096535153,
 2128: 0.5860090386731196,
 2432: 0.1386750490563073,
 2791: 0.8770580193070299,
 2839: 0.8204126541423674,
 2948: -0.11720180773462392,
 3025: 0.45124262819713973,
 3040: 0.89514359254929,
 3186: 0.6784622064861935,
 3271: 0.26989594817970664,
 3429: 0.0,
 3734: -0.15041420939904673,
 4099: 0.05860090386731196,
 4208: 0.29417420270727607,
 4282: -0.4385290096535115,
 4292: 0.6564386345361464,
 4415: -0.11183835382312353,
 4586: -0.9024852563942795,
 4725: -0.08006407690254357,
 4818: 0.4885967564883424,
 5104: 0.7674257668936507,
 5165: -0.4385290096535153,
 5547: 0.17200522903844556,
 6082: -0.04728779924109591,
 6207: 0.9615384615384616,
 6366: 0.6577935144802716,
 6482: 0.0,
 6530: -0.351605423203

In [95]:
# orient='index' turns the dictionary keys (user ids) into the row index; the single
# value stored per key goes into the "similarityIndex" column.
pearsonDF = pd.DataFrame.from_dict(Pearson_Correlation_Dict, orient='index', columns = ["similarityIndex"])

In [98]:
# At this point the user ids form the index and similarityIndex is the only column.
pearsonDF.head()

Unnamed: 0,similarityIndex
75,0.827278
106,0.586009
686,0.83205
815,0.576557
1040,0.943456


In [99]:
# Copy the user ids from the index into a regular "userID" column and give the rows a
# plain 0..n-1 index. The guard makes the cell safe to re-run: without it, a second
# run would overwrite userID with the positional index and lose the real user ids.
if "userID" not in pearsonDF.columns:
    pearsonDF["userID"] = pearsonDF.index
    pearsonDF = pearsonDF.reset_index(drop=True)

In [100]:
# The user ids now sit in the userID column and the index is a plain 0..n-1 range.
pearsonDF.head()

Unnamed: 0,similarityIndex,userID
0,0.827278,75
1,0.586009,106
2,0.83205,686
3,0.576557,815
4,0.943456,1040


In [101]:
# Rank the users from most to least similar and keep the first 50 rows.
ranked_users = pearsonDF.sort_values(by='similarityIndex', ascending=False)
topUsers = ranked_users.head(50)
topUsers.head()

Unnamed: 0,similarityIndex,userID
64,0.961678,12325
34,0.961538,6207
55,0.961538,10707
67,0.960769,13053
4,0.943456,1040


In [105]:
# Pull in every rating made by the top users (not only their ratings of the input
# movies) by joining on the user id, which is spelled userID on the left and userId
# on the right.
topUsersRating = pd.merge(
    topUsers,
    ratings,
    how='inner',
    left_on='userID',
    right_on='userId',
)
topUsersRating.head()

Unnamed: 0,similarityIndex,userID,userId,movieId,rating
0,0.961678,12325,12325,1,3.5
1,0.961678,12325,12325,2,1.5
2,0.961678,12325,12325,3,3.0
3,0.961678,12325,12325,5,0.5
4,0.961678,12325,12325,6,2.5


In [106]:
# Weight each rating by how similar its author is to the input user.
topUsersRating['weightedRating'] = topUsersRating['similarityIndex'].mul(topUsersRating['rating'])
topUsersRating.head()

Unnamed: 0,similarityIndex,userID,userId,movieId,rating,weightedRating
0,0.961678,12325,12325,1,3.5,3.365874
1,0.961678,12325,12325,2,1.5,1.442517
2,0.961678,12325,12325,3,3.0,2.885035
3,0.961678,12325,12325,5,0.5,0.480839
4,0.961678,12325,12325,6,2.5,2.404196


In [107]:
# Per movie (the grouping key is movieId, not userId), add up the similarity indices
# and the weighted ratings of the top users who rated it. The two columns are selected
# before aggregating so the unused columns are not summed and then thrown away.
tempTopUsersRating = topUsersRating.groupby('movieId')[['similarityIndex','weightedRating']].sum()
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,38.376281,140.800834
2,38.376281,96.656745
3,10.253981,27.254477
4,0.929294,2.787882
5,11.723262,27.151751


In [108]:
# Weighted average score per movie: summed weighted ratings divided by summed similarities.
weighted_average = tempTopUsersRating['sum_weightedRating'].div(tempTopUsersRating['sum_similarityIndex'])
# Build the frame from that series (keeping its movieId index) and repeat the
# movie ids as an ordinary column.
recommendation_df = (
    weighted_average
    .to_frame(name='weighted average recommendation score')
    .assign(movieId=tempTopUsersRating.index)
)
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.668955,1
2,2.518658,2
3,2.657941,3
4,3.0,4
5,2.316058,5


In [109]:
# Highest weighted-average score first; show the ten best-scoring movies.
score_column = 'weighted average recommendation score'
recommendation_df = recommendation_df.sort_values(score_column, ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
5073,5.0,5073
3329,5.0,3329
2284,5.0,2284
26801,5.0,26801
6776,5.0,6776
6672,5.0,6672
3759,5.0,3759
3769,5.0,3769
3775,5.0,3775
90531,5.0,90531


In [110]:
# Look up title and year for the ten best-scoring movie ids. Rows come back in the
# order of the movies table, not in ranking order.
top_movie_ids = recommendation_df['movieId'].head(10).tolist()
movies.loc[movies['movieId'].isin(top_movie_ids)]

Unnamed: 0,movieId,title,year
2200,2284,Bandit Queen,1994
3243,3329,"Year My Voice Broke, The",1987
3669,3759,Fun and Fancy Free,1947
3679,3769,Thunderbolt and Lightfoot,1974
3685,3775,Make Mine Music,1946
4978,5073,"Son's Room, The (Stanza del figlio, La)",2001
6563,6672,War Photographer,2001
6667,6776,Lagaan: Once Upon a Time in India,2001
9064,26801,Dragon Inn (Sun lung moon hak chan),1992
18106,90531,Shame,2011


# Yeah, Another Project Completed!