# Imports

In [1]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Dataset

In [2]:
# Load the movie catalogue from movies.csv (path is relative to the notebook's working directory).
movies_df = pd.read_csv('movies.csv')

In [3]:
# Load the per-user movie ratings from ratings.csv (path is relative to the notebook's working directory).
ratings_df = pd.read_csv('ratings.csv')

In [4]:
# Preview the first rows of the movie catalogue to check that the file loaded as expected.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [13]:
# The timestamp column is not used by any later cell, so drop it.
# errors='ignore' makes the cell safe to re-run: a second execution would otherwise
# raise KeyError because the column is already gone.
ratings_df = ratings_df.drop(columns='timestamp', errors='ignore')
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


### Separating the Year from the Title

In [7]:
# Split "Title (YYYY)" into a clean title and a separate release-year column.
#
# An end-anchored regex replaces the original fixed-width slicing, which corrupted
# every title that did not end in exactly " (YYYY)" (no year, trailing whitespace, ...):
# such titles now stay intact and simply get a missing year.
year_suffix = r'\s*\((\d{4})\)\s*$'

# Extract the year first, while the suffix is still part of the title.
years = movies_df['title'].str.extract(year_suffix, expand=False)
if 'year' in movies_df.columns:
    # Cell is being re-run: titles were already stripped, so keep the years found earlier.
    years = years.fillna(movies_df['year'])

titles = movies_df['title'].str.replace(year_suffix, '', regex=True).str.strip()

movies_df['title'] = titles
movies_df['year'] = years  # kept as strings, as before; NaN where no year was found


In [12]:
# Genres are not used by any later cell, so drop the column.
# errors='ignore' makes the cell safe to re-run: a second execution would otherwise
# raise KeyError because the column is already gone.
movies_df = movies_df.drop(columns='genres', errors='ignore')
movies_df.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


# Modelling

### Input

In [88]:
# Movies rated by the user we want recommendations for, as (title, rating) pairs.
# Titles must match movies_df['title'] exactly (i.e. without the year suffix).
rated_titles = [
    ('Janky Promoters', 4.5),
    ('Never Make It Home', 2.5),
    ('Ace High', 1.5),
    ('Best Men', 4),
    ('Toy Story', 3.5),
    ('Under Fire', 2),
    ('Pulp Fiction', 5),
    ('Akira', 4.5),
]

# Same list-of-dicts shape as before, so `user_input` keeps its structure.
user_input = [{'title': movie_title, 'rating': stars} for movie_title, stars in rated_titles]
user_input_df = pd.DataFrame(user_input)
user_input_df

Unnamed: 0,title,rating
0,Janky Promoters,4.5
1,Never Make It Home,2.5
2,Ace High,1.5
3,Best Men,4.0
4,Toy Story,3.5
5,Under Fire,2.0
6,Pulp Fiction,5.0
7,Akira,4.5


In [89]:
# Look up the catalogue rows whose title matches one of the user's rated titles.
# Titles that are not in the catalogue are silently left out of the result.
wanted_titles = list(user_input_df['title'])
inputId = movies_df.loc[movies_df['title'].isin(wanted_titles)]

# Attach the movieId to each rating (joined on the shared columns); the year is not needed.
user_input_df = inputId.merge(user_input_df).drop(columns='year')
user_input_df

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,296,Pulp Fiction,5.0
2,1274,Akira,4.5
3,1473,Best Men,4.0
4,7480,Under Fire,2.0
5,62385,Ace High,1.5
6,113816,Never Make It Home,2.5


### Similarity

In [90]:
# Keep only the ratings given to movies that the input user has also rated.
input_movie_ids = user_input_df['movieId'].tolist()
users = ratings_df.loc[ratings_df['movieId'].isin(input_movie_ids)]
users.head()

Unnamed: 0,userId,movieId,rating
19,4,296,4.0
531,13,1274,5.0
681,14,296,2.0
749,15,1,4.0
776,15,296,3.0


In [91]:
# Group the overlapping ratings by user: one group per userId.
user_groups = users.groupby('userId')

# Note: GroupBy.head() returns the first rows of *every* group, so the output is long.
user_groups.head()

Unnamed: 0,userId,movieId,rating
19,4,296,4.0
531,13,1274,5.0
681,14,296,2.0
749,15,1,4.0
776,15,296,3.0
...,...,...,...
3899005,42117,296,2.0
3899036,42118,1,3.0
3899049,42118,296,3.0
3899433,42127,296,4.5


In [92]:
# Turn the groups into a list of (userId, ratings) pairs, ordered so that users
# sharing the most movies with the input user come first.
user_groups = list(user_groups)
user_groups.sort(key=lambda pair: pair[1].shape[0], reverse=True)
user_groups

[(9944,
          userId  movieId  rating
  918465    9944        1     4.0
  918515    9944      296     5.0
  918721    9944     1274     3.0
  918777    9944     1473     2.0),
 (12921,
           userId  movieId  rating
  1209144   12921        1     4.0
  1209219   12921      296     5.0
  1209410   12921     1274     4.0
  1209446   12921     1473     2.5),
 (16957,
           userId  movieId  rating
  1588654   16957        1     3.5
  1588696   16957      296     4.0
  1588848   16957     1274     4.5
  1589658   16957    62385     4.0),
 (19495,
           userId  movieId  rating
  1820651   19495        1     3.5
  1820797   19495      296     4.0
  1821188   19495     1473     4.0
  1822759   19495     7480     2.0),
 (22084,
           userId  movieId  rating
  2043681   22084        1     4.0
  2043701   22084      296     5.0
  2043749   22084     1274     5.0
  2043759   22084     1473     5.0),
 (24692,
           userId  movieId  rating
  2292655   24692        1     2

In [93]:
# Limit the similarity computation to the first 100 groups (those with the largest overlap).
user_groups = user_groups[:100]

In [94]:
# Pearson correlation between the input user and each candidate user,
# computed over the movies both of them rated. Result: {userId: correlation}.
PC = {}

# Sorting the input does not depend on the loop variable, so do it once up front
# (the original re-sorted it on every iteration).
user_input_df = user_input_df.sort_values(by='movieId')
input_ratings = user_input_df[['movieId', 'rating']]

for name, group in user_groups:
    # Pair the two ratings of each shared movie explicitly by movieId, instead of
    # relying on two independently sorted lists happening to line up by position.
    paired = group.merge(input_ratings, on='movieId', suffixes=('_user', '_input'))
    if paired.empty:
        # No overlap: no correlation can be computed.
        PC[name] = 0
        continue

    x = np.asarray(paired['rating_input'], dtype=float)
    y = np.asarray(paired['rating_user'], dtype=float)

    # Centred sums of squares / cross-products. Mathematically the same as
    # sum(x^2) - (sum x)^2 / N, but non-negative by construction, so sqrt()
    # can never receive a negative argument because of float rounding.
    dx = x - x.mean()
    dy = y - y.mean()
    Sxx = float(np.dot(dx, dx))
    Syy = float(np.dot(dy, dy))
    Sxy = float(np.dot(dx, dy))

    if Sxx > 0 and Syy > 0:
        PC[name] = Sxy / sqrt(Sxx * Syy)
    else:
        # One side has zero variance (e.g. a single shared movie): correlation is undefined, use 0.
        PC[name] = 0


In [46]:
# Display the {userId: Pearson correlation} mapping computed above.
PC

{9944: -0.258007843557668,
 12921: 0.30181636371957526,
 24692: -0.029977225959120522,
 36946: 0.04728779924109591,
 75: 0.8017837257372732,
 106: 0.4629100498862757,
 217: 0.8819171036881969,
 393: 0.592156525463792,
 686: 0.8819171036881969,
 815: 0.5194624816493197,
 1040: 0.9456108576893003,
 1066: 0.9650595721775965,
 1130: 0.17039954414774924,
 1414: 0.90694099390581,
 1502: 0.8510644963469901,
 1599: 0.629940788348712,
 1625: 0.8510644963469901,
 1824: 0.8017837257372732,
 1950: 0.4364357804719848,
 1966: 0.8819171036881969,
 2065: 0.629940788348712,
 2128: 0.4795122238161564,
 2432: 0.1543033499620919,
 2473: 0.7237468644557459,
 2726: 0.7715167498104595,
 2791: 0.8819171036881969,
 2839: 0.9869275424396534,
 2948: 0.4364357804719848,
 3025: 0.3563483225498992,
 3040: 0.8728715609439696,
 3051: 0.9341484842923421,
 3116: 0.7122123116119197,
 3186: 0.6351073488299558,
 3271: 0.24209101306752098,
 3388: 0.5509080804800992,
 3392: -0.3779644730092272,
 3429: 0.2182178902359924,
 3

In [109]:
# Turn the {userId: correlation} mapping into a two-column frame with a fresh 0..n-1 index.
similarity = pd.Series(PC, name='similarityIndex')
PC_df = pd.DataFrame({
    'similarityIndex': similarity.values,
    'userId': similarity.index,
})

# Keep the 50 most similar users.
PC_df = PC_df.sort_values(by='similarityIndex', ascending=False).head(50)
PC_df.head(10)

Unnamed: 0,similarityIndex,userId
98,1.0,5104
92,1.0,4938
71,0.995871,3962
40,0.995871,2164
75,0.995871,4079
9,0.981981,106
93,0.944911,4947
32,0.944911,1669
72,0.944911,3995
47,0.944911,2839


In [110]:
# Attach every rating made by the most similar users to their similarity score.
top_users = pd.merge(PC_df, ratings_df, on='userId', how='inner')

top_users.shape

(33449, 4)

In [111]:
# Weight each rating by how similar its author is to the input user.
top_users['weighted'] = top_users['similarityIndex'].mul(top_users['rating'])

# Spot-check a slice of rows from the middle of the frame.
top_users.iloc[25000:25010]

Unnamed: 0,similarityIndex,userId,movieId,rating,weighted
25000,0.654654,3699,303,2.0,1.309307
25001,0.654654,3699,304,3.0,1.963961
25002,0.654654,3699,305,1.0,0.654654
25003,0.654654,3699,306,4.0,2.618615
25004,0.654654,3699,307,3.0,1.963961
25005,0.654654,3699,308,4.0,2.618615
25006,0.654654,3699,312,1.0,0.654654
25007,0.654654,3699,318,5.0,3.273268
25008,0.654654,3699,337,3.0,1.963961
25009,0.654654,3699,344,3.0,1.963961


### Recommendation

In [113]:
# Per movie: total similarity of the users who rated it, and the total of their weighted ratings.
temp_top_users = (
    top_users
    .groupby('movieId')[['similarityIndex', 'weighted']]
    .sum()
    .rename(columns={
        'similarityIndex': 'sum_similarityIndex',
        'weighted': 'sum_weightedRating',
    })
)
temp_top_users.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,38.747198,131.687667
2,21.877176,59.284165
3,9.430269,24.938983
4,2.558375,7.469796
5,7.899335,18.533953


In [114]:
# Similarity-weighted average rating per movie: sum(similarity * rating) / sum(similarity).
weighted_average = temp_top_users['sum_weightedRating'] / temp_top_users['sum_similarityIndex']

recom_df = weighted_average.to_frame('weighted average recommendation score')
# Also expose the movieId (currently the index) as a regular column for later lookups.
recom_df['movieId'] = temp_top_users.index
recom_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.398637,1
2,2.709864,2
3,2.644568,3
4,2.919742,4
5,2.346268,5


In [115]:
# Rank the movies from the highest to the lowest recommendation score.
score_column = 'weighted average recommendation score'
recom_df = recom_df.sort_values(by=score_column, ascending=False)
recom_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
59018,5.0,59018
1361,5.0,1361
87304,5.0,87304
5772,5.0,5772
93790,5.0,93790
1189,5.0,1189
125535,5.0,125535
824,5.0,824
2894,5.0,2894
2905,5.0,2905


In [116]:
# Look up title and year for the ten best-scoring movies.
# The rows come back in catalogue order, not in score order.
top_movie_ids = recom_df['movieId'].head(10).tolist()
recom_list = movies_df[movies_df['movieId'].isin(top_movie_ids)]

recom_list

Unnamed: 0,movieId,title,year
809,824,Kaspar Hauser,1993
1165,1189,"Thin Blue Line, The",1988
1331,1361,Paradise Lost: The Child Murders at Robin Hood...,1996
2809,2894,Romance,1999
2820,2905,Sanjuro (Tsubaki Sanjûrô),1962
5674,5772,My Dinner with André,1981
12602,59018,"Visitor, The",2007
17337,87304,Beginners,2010
18895,93790,Detachment,2011
26996,125535,Fist of Jesus,2012
