In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Read the movies table and the ratings table from the local data folder.
movies_df, ratings_df = (
    pd.read_csv(csv_path) for csv_path in ("data/movies.csv", "data/ratings.csv")
)

In [4]:
# Preview the first three rows of the movies table.
movies_df.head(n=3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [5]:
# Preview the first three rows of the ratings table.
ratings_df.head(n=3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [6]:
# Capture the parenthesised four-digit year (e.g. "(1995)") from each title.
# The pattern is a raw string: "\(" and "\d" are regex escapes, not Python
# string escapes, and a non-raw literal raises an invalid-escape warning on
# recent Python versions.
movies_df['year'] = movies_df.title.str.extract(r'(\(\d\d\d\d\))', expand=False)

In [7]:
# Check the freshly added year column (still wrapped in parentheses).
movies_df.head(n=3)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,(1995)
1,2,Jumanji (1995),Adventure|Children|Fantasy,(1995)
2,3,Grumpier Old Men (1995),Comedy|Romance,(1995)


In [8]:
# Keep only the four digits of the year, dropping the surrounding parentheses.
# Raw string so "\d" reaches the regex engine without an invalid-escape warning.
movies_df['year'] = movies_df.year.str.extract(r'(\d\d\d\d)', expand=False)

In [9]:
# Check that the year column now holds bare four-digit strings.
movies_df.head(n=3)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995


In [10]:
# Remove the "(year)" suffix from every title.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the titles unchanged.
# Raw string so the regex escapes are not parsed as Python string escapes.
movies_df['title'] = movies_df.title.str.replace(r'(\(\d\d\d\d\))', '', regex=True)

In [11]:
# Check that the titles no longer carry the "(year)" suffix.
movies_df.head(n=3)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995


In [12]:
# Trim the leading/trailing whitespace of every title.
# The vectorised .str accessor replaces the per-row Python lambda and passes
# missing titles through as NaN instead of raising AttributeError.
movies_df['title'] = movies_df['title'].str.strip()
movies_df.head(3)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995


In [13]:
# Remove the genres column from the movies table.
movies_df = movies_df.drop(columns=['genres'])
movies_df.head(n=3)

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995


In [15]:
# Ratings supplied by the user we want recommendations for,
# one {'title', 'rating'} record per movie.
userInput = [
    {'title': movie_title, 'rating': movie_rating}
    for movie_title, movie_rating in (
        ('Breakfast Club, The', 5),
        ('Toy Story', 3.5),
        ('Jumanji', 2),
        ('Pulp Fiction', 5),
        ('Akira', 4.5),
    )
]
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [16]:
# Keep only the catalogue rows whose title appears in the user's input.
inputId = movies_df.loc[movies_df['title'].isin(inputMovies['title'])]
inputId

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
257,296,Pulp Fiction,1994
973,1274,Akira,1988
1445,1968,"Breakfast Club, The",1985


In [17]:
# Attach the user's ratings to the matched catalogue rows; with no explicit
# key, pandas joins on the columns the two frames share.
inputMovies = inputId.merge(inputMovies)
inputMovies

Unnamed: 0,movieId,title,year,rating
0,1,Toy Story,1995,3.5
1,2,Jumanji,1995,2.0
2,296,Pulp Fiction,1994,5.0
3,1274,Akira,1988,4.5
4,1968,"Breakfast Club, The",1985,5.0


In [18]:
# Remove the year column from the input ratings.
inputMovies = inputMovies.drop(columns=['year'])
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [19]:
# Find the ratings other users gave to the movies the input user has rated.
userSubset = ratings_df.loc[ratings_df['movieId'].isin(inputMovies['movieId'])]
userSubset.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
16,1,296,3.0,964982967
320,4,296,1.0,945173350
422,4,1968,4.0,986934786
516,5,1,4.0,847434962


In [34]:
# Look at the last five overlapping ratings.
userSubset.tail(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
99510,609,296,4.0,847220802
99534,610,1,5.0,1479542900
99552,610,296,5.0,1479545817
99636,610,1274,5.0,1493846688
99664,610,1968,4.0,1493850238


In [22]:
# Group the overlapping ratings per user.
# The key is the scalar 'userId' rather than the one-element list ['userId']:
# with a list, pandas 2.x yields 1-tuples such as (91,) as group keys when
# iterating and expects a tuple in get_group, which breaks the scalar user
# ids the later cells rely on.
userSubsetGroup = userSubset.groupby('userId')
userSubsetGroup

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001F132435F70>

In [35]:
# Show the overlapping ratings of a single user (userId 610).
userSubsetGroup.get_group(name=610)

Unnamed: 0,userId,movieId,rating,timestamp
99534,610,1,5.0,1479542900
99552,610,296,5.0,1479545817
99636,610,1274,5.0,1493846688
99664,610,1968,4.0,1493850238


In [36]:
# Materialise the (userId, ratings) pairs and order them so that users sharing
# the most movies with the input come first; the sort is stable, so ties keep
# their original order.
userSubsetGroup = list(userSubsetGroup)
userSubsetGroup.sort(key=lambda user_and_ratings: len(user_and_ratings[1]), reverse=True)

In [37]:
# Show the three users with the largest overlap.
userSubsetGroup[:3]

[(91,
         userId  movieId  rating   timestamp
  14121      91        1     4.0  1112713037
  14122      91        2     3.0  1112713392
  14173      91      296     4.5  1112711264
  14316      91     1274     5.0  1112713057
  14383      91     1968     3.0  1112713409),
 (177,
         userId  movieId  rating   timestamp
  24900     177        1     5.0  1435533535
  24901     177        2     3.5  1435534109
  24930     177      296     5.0  1435530409
  25069     177     1274     2.0  1435535036
  25129     177     1968     3.5  1435534080),
 (219,
         userId  movieId  rating   timestamp
  31524     219        1     3.5  1194681084
  31525     219        2     2.5  1194740185
  31554     219      296     4.0  1198522553
  31628     219     1274     2.5  1194686351
  31680     219     1968     3.0  1194931899)]

In [38]:
# Only the first 100 users (largest overlap) are compared against the input.
userSubsetGroup = userSubsetGroup[:100]

In [39]:
# Maps each compared userId to its Pearson correlation with the input user.
pearsonCorrelationDict = dict()

In [43]:
from math import sqrt

# Pearson correlation between the input user's ratings and each candidate
# user's ratings, stored in pearsonCorrelationDict keyed by the group name.

# The input ratings do not change inside the loop, so order them once here
# instead of re-sorting on every iteration.
inputMovies = inputMovies.sort_values(by='movieId')

for name, group in userSubsetGroup:
    # Both sides are ordered by movieId so the two rating lists line up
    # position by position.
    group = group.sort_values(by='movieId')
    nRatings = len(group)
    # Input ratings restricted to the movies this user has also rated.
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    tempRatingList = temp_df['rating'].tolist()
    tempGroupList = group['rating'].tolist()

    # Totals are reused by all three terms below.
    inputTotal = sum(tempRatingList)
    groupTotal = sum(tempGroupList)

    # Sums of squares / cross-products about the mean (computational form).
    Sxx = sum(i**2 for i in tempRatingList) - pow(inputTotal, 2)/float(nRatings)
    Syy = sum(i**2 for i in tempGroupList) - pow(groupTotal, 2)/float(nRatings)
    Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList)) - inputTotal*groupTotal/float(nRatings)

    # A non-positive product means at least one side has no variance (or a
    # rounding error pushed it below zero); the correlation is then undefined,
    # so record 0 instead of dividing by zero or taking sqrt of a negative.
    denominator = Sxx*Syy
    if denominator > 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(denominator)
    else:
        pearsonCorrelationDict[name] = 0

# converting Dict to DataFrame

In [44]:
# Turn the {userId: correlation} dict into a two-column table with a fresh
# 0..n-1 index.
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF = pearsonDF.assign(userId=pearsonDF.index).reset_index(drop=True)
pearsonDF.head(n=5)

Unnamed: 0,similarityIndex,userId
0,0.438529,91
1,0.0,177
2,0.451243,219
3,0.716115,274
4,0.959271,298


In [46]:
# Keep the 50 users with the highest similarity to the input user.
topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False).iloc[:50]
topUsers.head(n=5)

Unnamed: 0,similarityIndex,userId
43,1.0,132
34,1.0,18
63,1.0,305
82,1.0,489
86,1.0,525


In [48]:
# Pull in every rating made by the most similar users.
topUsersRating = pd.merge(topUsers, ratings_df, on='userId', how='inner')
topUsersRating.head(n=5)

Unnamed: 0,similarityIndex,userId,movieId,rating,timestamp
0,1.0,132,1,2.0,1157921785
1,1.0,132,17,3.0,1157922698
2,1.0,132,29,2.0,1157924165
3,1.0,132,32,3.0,1329983726
4,1.0,132,34,1.5,1157921395


In [49]:
# Weight each rating by the similarity of the user who gave it.
topUsersRating['weightedRating'] = topUsersRating['similarityIndex'].mul(topUsersRating['rating'])
topUsersRating.head(n=5)

Unnamed: 0,similarityIndex,userId,movieId,rating,timestamp,weightedRating
0,1.0,132,1,2.0,1157921785,2.0
1,1.0,132,17,3.0,1157922698,3.0
2,1.0,132,29,2.0,1157924165,2.0
3,1.0,132,32,3.0,1329983726,3.0
4,1.0,132,34,1.5,1157921395,1.5


In [52]:
# Per movie: total similarity of the users who rated it and total of their
# similarity-weighted ratings.
# The two needed columns are selected before aggregating, so pandas does not
# also sum userId, rating and timestamp only to throw them away.
tempTopUsersRating = topUsersRating.groupby('movieId')[['similarityIndex', 'weightedRating']].sum()
tempTopUsersRating.columns = ['sum_similarityIndex', 'sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,36.354096,133.167946
2,31.005292,94.904257
3,8.783859,26.381456
4,0.866025,1.732051
5,7.165336,19.775255


In [53]:
# Weighted average score per movie: summed weighted ratings divided by the
# summed similarity. The frame keeps the movieId index and also carries
# movieId as a regular column.
recommendation_df = (
    tempTopUsersRating['sum_weightedRating'] / tempTopUsersRating['sum_similarityIndex']
).to_frame(name='weighted average recommendation score')
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head(n=5)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.66308,1
2,3.060905,2
3,3.003402,3
4,2.0,4
5,2.75985,5


In [54]:
# Rank the movies from highest to lowest weighted average score.
recommendation_df = recommendation_df.sort_values(
    by='weighted average recommendation score',
    ascending=False,
)
recommendation_df.head(n=10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
3310,5.0,3310
7579,5.0,7579
905,5.0,905
1211,5.0,1211
140627,5.0,140627
4298,5.0,4298
152711,5.0,152711
633,5.0,633
5537,5.0,5537
5485,5.0,5485


In [55]:
# Look up title and year for the ten best-scoring movie ids
# (rows come back in movies_df order, not in score order).
movies_df[movies_df['movieId'].isin(recommendation_df['movieId'].head(10))]

Unnamed: 0,movieId,title,year
536,633,Denise Calls Up,1995
687,905,It Happened One Night,1934
912,1211,"Wings of Desire (Himmel über Berlin, Der)",1987
2484,3310,"Kid, The",1921
3189,4298,Rififi (Du rififi chez les hommes),1955
3905,5485,Tadpole,2002
3936,5537,Satin Rouge,2002
4969,7579,Pride and Prejudice,1940
9022,140627,Battle For Sevastopol,2015
9234,152711,Who Killed Chea Vichea?,2010
