In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import re, os, math, sklearn, datetime, pickle

In [2]:
df_movies = pd.read_csv('movies.csv')
df_ratings = pd.read_csv('ratings.csv')
df_tags = pd.read_csv('tags.csv')

In [3]:
df_movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
df_ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


### 1. Filter movies with 5 or more ratings

#### Create a dataframe (df_movie_ratings_count_min5) that stores each movie that has more than 4 ratings along with the number of its ratings

Group the ratings_df by the movieId and count the rows with the same movieId

In [5]:
df_movie_ratings_count = df_ratings.groupby('movieId').count()

Keep only the index and the first column and rename the first column for beautifying purposes

In [6]:
df_movie_ratings_count = df_movie_ratings_count.iloc[:,:1]

df_movie_ratings_count.rename(columns = {'userId':'count'}, inplace = True)

In [7]:
df_movie_ratings_count.head(33)

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
6,102
7,54
8,8
9,16
10,132


In [8]:
df_movie_ratings_count_min5= df_movie_ratings_count.loc[df_movie_ratings_count['count'] > 4]

In [9]:
df_movie_ratings_count_min5.head(40)

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
6,102
7,54
8,8
9,16
10,132


#### Remove movies with fewer than 5 ratings

In [10]:
df_ratings_min5 = df_ratings.loc[df_ratings['movieId'].isin(df_movie_ratings_count_min5.index)]

In [11]:
df_movies_min5 = df_movies.loc[df_movies['movieId'].isin(df_movie_ratings_count_min5.index)]

In [12]:
print(df_ratings_min5.shape)
print(df_ratings.shape)
print(df_movies_min5.shape)
print(df_movies.shape)
print(df_movies.shape[0]-df_movies_min5.shape[0], "movies have been removed")

(90274, 4)
(100836, 4)
(3650, 3)
(9742, 3)
6092 movies have been removed


#### Check that movies with fewer than 5 ratings have indeed been removed

In [13]:
df_movies.loc[df_movies['movieId']==40]

Unnamed: 0,movieId,title,genres
36,40,"Cry, the Beloved Country (1995)",Drama


In [14]:
df_ratings.loc[df_ratings['movieId']==40]

Unnamed: 0,userId,movieId,rating,timestamp
47991,311,40,3.5,1057854804
84607,544,40,5.0,850688776


In [15]:
df_movies_min5.loc[df_movies_min5['movieId']==40]

Unnamed: 0,movieId,title,genres


In [16]:
df_ratings_min5.loc[df_ratings_min5['movieId']==40]

Unnamed: 0,userId,movieId,rating,timestamp


In [17]:
merged = df_ratings_min5.merge(df_movies_min5, left_on = 'movieId', right_on = 'movieId', suffixes= ['_user', ''])
merged.drop(columns = ['timestamp','genres'], inplace = True)

In [18]:
merged2 = df_movies_min5.merge(df_ratings_min5, left_on = 'movieId', right_on = 'movieId', suffixes= ['_user', ''])

In [19]:
merged[300:309]

Unnamed: 0,userId,movieId,rating,title
300,202,6,5.0,Heat (1995)
301,217,6,2.0,Heat (1995)
302,219,6,3.5,Heat (1995)
303,220,6,3.5,Heat (1995)
304,239,6,5.0,Heat (1995)
305,244,6,5.0,Heat (1995)
306,266,6,4.0,Heat (1995)
307,269,6,5.0,Heat (1995)
308,270,6,3.0,Heat (1995)


In [20]:
merged.isnull().values.any()

False

In [21]:
piv = merged.pivot_table(index=['userId'], columns=['title'], values='rating')

In [22]:
piv.iloc[99:109, 325:335]

title,Beautiful Girls (1996),"Beautiful Mind, A (2001)",Beauty and the Beast (1991),Beauty and the Beast (La belle et la bête) (1946),Beauty of the Day (Belle de jour) (1967),Beavis and Butt-Head Do America (1996),Becoming Jane (2007),Bed of Roses (1996),Bedazzled (2000),Bedknobs and Broomsticks (1971)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100,,4.0,,,,,,4.0,,
101,,,,,,,,,,
102,,,,,,,,,,
103,,,3.0,,,,,,,
104,,,5.0,,,,,,,
105,,4.0,,,,,,,3.0,
106,,,,,,,,,,
107,,,,,,,,,,
108,,5.0,,,,,,,,
109,,,,,3.0,,,,,


In [23]:
piv.iloc[10:20]

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),[REC] (2007),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,,,,,,,,,,,...,,,,,,,,,,
12,5.0,,,,5.0,,,,,,...,,,,,,,,,,
13,,,,,,,,,,,...,,,,,,,,,,
14,,,,,,,,,,,...,,,,,,,,,,
15,,4.0,,5.0,,,,1.5,,,...,,,,,3.0,,,,,
16,,,,,,,,,,4.0,...,,,,,,,,,,
17,,,,,,,,,,,...,,,,,,,,,,
18,,4.0,,,,,,,,5.0,...,,4.5,,3.0,,,,,,
19,2.0,,,,3.0,,1.0,,,,...,3.0,,,,,,,,,2.0
20,,,,,,,3.0,4.0,,,...,,,,3.5,,,,0.5,,


#### Calculate cosine similarity of table
In order to do, we need to create a sparse matrix of the pivot table 

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

piv_cos = piv.copy()
piv_cos.fillna(0, inplace=True)
piv_cos = piv_cos.T
piv_cos_sparse = sp.sparse.csr_matrix(piv_cos.to_numpy())

In [25]:
piv_cos

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
*batteries not included (1987),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Cloverfield Lane (2016),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
[REC] (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
eXistenZ (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,2.0
xXx: State of the Union (2005),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5


#### Define similarity functions

For the calculation of adjusted cosine similarity the sklearn cosine similarity will again be used but with an adjusted dataset

In [26]:
"""
Given a pivot table P (dataframe) with users as rows and items as columns and ratings on items as P(i,j)
Removes from every user column the user's average rating and then replaces NaN with 0s. 
Transposes P.
Creates the sparse matrix from the transposed P and calculates the cosine similarith of this matrix
"""
def adjusted_cosine_similarity(P):
    
    P_adj = P.apply(lambda x: (x-np.mean(x)), axis=1)
    P_adj.fillna(0, inplace=True)
    P_adj = P_adj.T
    P_adj_sparse = sp.sparse.csr_matrix(P_adj.to_numpy())
    return cosine_similarity(P_adj_sparse)

adjusted_cosine_similarity(piv)

array([[ 1.        , -0.01346345, -0.02928447, ...,  0.05352441,
         0.        , -0.07110765],
       [-0.01346345,  1.        ,  0.09604104, ...,  0.00836394,
        -0.03587317,  0.00543719],
       [-0.02928447,  0.09604104,  1.        , ...,  0.19826153,
         0.        , -0.02097782],
       ...,
       [ 0.05352441,  0.00836394,  0.19826153, ...,  1.        ,
         0.45506245,  0.00891434],
       [ 0.        , -0.03587317,  0.        , ...,  0.45506245,
         1.        ,  0.        ],
       [-0.07110765,  0.00543719, -0.02097782, ...,  0.00891434,
         0.        ,  1.        ]])

#### Showcase how rating values are changed inside the function of adjusted cosine similarity

In [27]:
piv_cos.iloc[:, 10:20]

userId,11,12,13,14,15,16,17,18,19,20
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"'burbs, The (1989)",0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0
*batteries not included (1987),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Cloverfield Lane (2016),0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
10 Things I Hate About You (1999),0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...
[REC] (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eXistenZ (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
xXx: State of the Union (2005),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
piv_adj = piv.apply(lambda x: (x-np.mean(x)), axis=1).T
piv_adj.iloc[:5, 10:20]

userId,11,12,13,14,15,16,17,18,19,20
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"'burbs, The (1989)",,0.609375,,,,,,,-0.661367,
(500) Days of Summer (2009),,,,,0.551852,,,0.286813,,
*batteries not included (1987),,,,,,,,,,
10 Cloverfield Lane (2016),,,,,1.551852,,,,,
10 Things I Hate About You (1999),,0.609375,,,,,,,0.338633,


In [29]:
item_similarity_adjusted = adjusted_cosine_similarity(piv)

In [30]:
user_similarity_adjusted = adjusted_cosine_similarity(piv.T)

In [31]:
item_sim_adj_df = pd.DataFrame(item_similarity_adjusted, index = piv_adj.index, columns = piv_adj.index)
item_sim_adj_df.head()

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),[REC] (2007),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",1.0,-0.013463,-0.029284,0.0,-0.013758,0.032021,0.15909,-0.028624,0.0,-0.05677,...,-0.133334,-0.077794,0.006028,-0.019345,-0.018213,0.0,0.011224,0.053524,0.0,-0.071108
(500) Days of Summer (2009),-0.013463,1.0,0.096041,0.052195,0.019758,-0.086357,-0.033997,-0.011918,-0.031756,0.071598,...,0.033554,0.25098,-0.002781,0.024014,-0.001126,-0.054452,-0.12795,0.008364,-0.035873,0.005437
*batteries not included (1987),-0.029284,0.096041,1.0,0.137916,-0.042369,0.0,-0.101073,0.013752,0.0,0.008479,...,-0.019781,-0.08151,-0.036965,0.104383,0.0,0.0,-0.027369,0.198262,0.0,-0.020978
10 Cloverfield Lane (2016),0.0,0.052195,0.137916,1.0,-0.00381,-0.053141,-0.004067,-0.225013,0.0,0.096075,...,0.0,-0.039538,0.010682,0.009534,-0.028213,0.010827,0.042195,0.055918,-0.061019,0.005202
10 Things I Hate About You (1999),-0.013758,0.019758,-0.042369,-0.00381,1.0,0.019754,-0.084563,-0.015927,-0.048479,-0.047686,...,-0.033594,0.007322,0.116893,-0.073864,-0.048665,-0.006352,-0.16062,0.035357,0.025081,0.194862


In [32]:
user_sim_adj_df = pd.DataFrame(user_similarity_adjusted, index = piv_adj.columns, columns = piv_adj.columns)
user_sim_adj_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.005356,-0.047354,0.0052,0.018511,0.010302,-0.011764,0.027983,0.03455,-0.019557,...,0.030594,0.00017,-0.084563,-0.016493,-0.067884,-0.019646,0.036946,-0.095194,-0.005878,0.058422
2,0.005356,1.0,0.0,-0.019007,0.09035,-0.009844,0.008776,-0.035974,0.0,-0.002071,...,-0.064156,-0.020064,-0.004198,0.0,0.0,0.026022,-0.015414,0.000677,0.052549,0.058009
3,-0.047354,0.0,1.0,-0.01393,-0.035853,0.015929,0.0,-0.035729,0.0,0.0,...,-0.067234,-0.013533,0.028565,0.0,0.001178,-0.061151,-0.000765,0.007509,0.0,0.014367
4,0.0052,-0.019007,-0.01393,1.0,-0.059798,0.013888,0.071245,0.025394,-0.011,0.10423,...,-0.083475,0.058255,-0.01532,-0.005828,0.037585,-0.02191,0.011648,-0.030031,0.00717,-0.038193
5,0.018511,0.09035,-0.035853,-0.059798,1.0,-0.001414,0.003465,-0.128999,0.0,-0.029641,...,-0.010443,-0.064583,0.015001,-0.029221,-0.059299,0.011099,-0.001007,-0.024429,0.122453,-0.013359


In [33]:
user_similarity = cosine_similarity(piv_cos_sparse)
user_sim_df = pd.DataFrame(user_similarity, index = piv_adj.index, columns = piv_adj.index)

In [34]:
def top_movies(title):
    count = 1
    print('Similar shows to {} include:\n'.format(title))
    for item in item_sim_adj_df.sort_values(by = title, ascending = False).index[1:11]:
        print('No. {}: {}'.format(count, item))
        count +=1 

In [35]:
top_movies('10 Cloverfield Lane (2016)')

Similar shows to 10 Cloverfield Lane (2016) include:

No. 1: Inherent Vice (2014)
No. 2: Bicentennial Man (1999)
No. 3: Limitless (2011)
No. 4: Kung Fury (2015)
No. 5: Enemy (2013)
No. 6: Mary and Max (2009)
No. 7: Sicario (2015)
No. 8: Untitled Spider-Man Reboot (2017)
No. 9: Pacific Rim (2013)
No. 10: Man Bites Dog (C'est arrivé près de chez vous) (1992)


In [36]:
# This function constructs a list of lists containing the highest rated shows per similar user
# and returns the name of the show along with the frequency it appears in the list
import operator

def similar_user_recs(user):
    
    if user not in piv_adj.columns:
        return('No data available on user {}'.format(user))
    
    sim_users =user_sim_adj_df.sort_values(by=user, ascending=False).index[1:5]
    print(sim_users)
    best = []
    most_common = {}
    
    for i in sim_users:
        max_score = piv_adj.loc[:, i].max()
        print(max_score)
        best.append(piv_adj[piv_adj.loc[:, i]==max_score].index.tolist())
    for i in range(len(best)):
        for j in best[i]:
            if j in most_common:
                most_common[j] += 1
            else:
                most_common[j] = 1
    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[:20]    

In [37]:
piv.loc[41,'Lost in Translation (2003)']

3.5

In [163]:
"""
Item-based prediction
calculates the weighted average of k similar movies to determine a potential rating for an input user and movie
movie_values: list of the similarities (with values 0-1) of the k nearest movies 
"""
def ib_predicted_rating(movie_name, user, k):
    sim_movies = item_sim_adj_df.sort_values(by=movie_name, ascending=False).index[1:k+1]  #start index at 1 to avoid including the searched movie in its neighbors
    movie_values = item_sim_adj_df.sort_values(by=movie_name, ascending=False).loc[:,movie_name].tolist()[1:k+1]
    #print (movie_values)
    #print(sim_movies)
    rating_list = []
    weight_list = []
    for j, i in enumerate(sim_movies):
        rating = piv.loc[user, i]
        similarity = movie_values[j]
        if np.isnan(rating):
            continue
        elif not np.isnan(rating):
            #print("rat",rating)
            #print("sim",similarity)
            rating_list.append(rating*similarity)
            weight_list.append(similarity)
    #print("wl",weight_list)
    
    #sum(weight_list)==0 means that the user has not rated any of the k nearest movies 
    if (sum(weight_list)==0):  
        return np.nan
    else:
        return sum(rating_list)/sum(weight_list)    

In [157]:
ib_predicted_rating('Lost in Translation (2003)',17,5)

Index(['3:10 to Yuma (2007)', 'Straight Story, The (1999)',
       'Mulholland Drive (2001)', 'Frida (2002)', 'Sideways (2004)'],
      dtype='object', name='title')


nan

In [165]:
"""
User-based prediction
calculates the weighted average of k similar users to determine a potential rating for an input user and movie
"""
def ub_predicted_rating(movie_name, user, k):
    sim_users = user_sim_adj_df.sort_values(by=user, ascending=False).index[1:k+1]
    user_values = user_sim_adj_df.sort_values(by=user, ascending=False).loc[:,user].tolist()[1:k+1]
    #print(sim_users)
    rating_list = []
    weight_list = []
    for j, i in enumerate(sim_users):
        rating = piv.loc[i, movie_name]
        similarity = user_values[j]
        if np.isnan(rating):
            continue
        elif not np.isnan(rating):
            #print("rat",rating)
            #print("sim",similarity)
            rating_list.append(rating*similarity)
            weight_list.append(similarity)
            
    #print("wl",weight_list)
    if (sum(weight_list)==0):
        return np.nan
    else:
        return sum(rating_list)/sum(weight_list)  

In [159]:
ub_predicted_rating('Lost in Translation (2003)',331,5)

Int64Index([88, 610, 416, 63, 300], dtype='int64', name='userId')
wl [0.10747421910767509, 0.08799788828156915]


5.0

In [42]:
piv.loc[331,'Lost in Translation (2003)']

5.0

In [43]:
piv.loc[1,'10 Cloverfield Lane (2016)']

nan

In [44]:
piv.loc[:,'12 Angry Men (1957)']

userId
1      NaN
2      NaN
3      NaN
4      5.0
5      NaN
      ... 
606    NaN
607    NaN
608    NaN
609    NaN
610    NaN
Name: 12 Angry Men (1957), Length: 610, dtype: float64

In [45]:
def true_ratings_as_list(movie_list, user_list):
    movie_ratings = [[]]
    for i, movie in enumerate(movie_list):
        print(movie)
        for user in user_list:
            movie_ratings[i].append(piv.loc[user,movie])
        if (i!=len(movie_list)-1):
            movie_ratings.append([]) # Initialize the list for the next movie
    return movie_ratings

In [46]:
y_test = true_ratings_as_list(['12 Angry Men (1957)','Lost in Translation (2003)'],[1,2,3,4,5,6,7,8,9])

12 Angry Men (1957)
Lost in Translation (2003)


In [47]:
y_test

[[nan, nan, nan, 5.0, nan, nan, nan, nan, nan],
 [nan, nan, nan, nan, nan, nan, nan, nan, nan]]

In [160]:
def make_predictions(movie_list, user_list, k):
    movie_pred = [[]]
    for i, movie in enumerate(movie_list):
        print(movie)
        for user in user_list:
            if (ib_predicted_rating(movie, user, k)>2.5):
                movie_pred[i].append(ub_predicted_rating(movie, user, k))
            else:
                movie_pred[i].append(np.nan)
        if (i!=len(movie_list)-1):
            movie_pred.append([])
    return movie_pred


In [49]:
yes = make_predictions(test.columns[:4],[1,2,3,4,5,6,7])

NameError: name 'test' is not defined

In [None]:
print(yes)

### 2. Split data 90/10

In [50]:
train = piv.sample(frac=0.1)
test = piv.drop(train.index).sample(frac=0.1)

In [51]:
print(train.shape)
print(test.shape)

(61, 3650)
(55, 3650)


In [52]:
piv.iloc[:,40:80]

title,30 Days of Night (2007),30 Minutes or Less (2011),300 (2007),3000 Miles to Graceland (2001),"39 Steps, The (1935)",3:10 to Yuma (2007),40 Days and 40 Nights (2002),"40-Year-Old Virgin, The (2005)","400 Blows, The (Les quatre cents coups) (1959)",47 Ronin (2013),...,Abraham Lincoln: Vampire Hunter (2012),"Absent-Minded Professor, The (1961)",Absolute Power (1997),"Abyss, The (1989)",Accepted (2006),"Accidental Tourist, The (1988)","Accused, The (1988)",Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),Across the Universe (2007)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,4.0,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,3.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,4.0,3.5,,2.0,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,5.0,,,,,,,,...,,,,3.0,,,,3.5,2.0,
609,,,,,,,,,,,...,,,,,,,,,,


In [53]:
y_test = true_ratings_as_list(['12 Angry Men (1957)','Abyss, The (1989)','Fight Club (1999)','Lost in Translation (2003)'],list(test.index))

12 Angry Men (1957)
Abyss, The (1989)
Fight Club (1999)
Lost in Translation (2003)


In [166]:
y_pred = make_predictions(['12 Angry Men (1957)','Abyss, The (1989)','Fight Club (1999)','Lost in Translation (2003)'],list(test.index),50)

12 Angry Men (1957)
Abyss, The (1989)
Fight Club (1999)
Lost in Translation (2003)


In [168]:
y_test_watch = []
y_pred_watch = []
for i in range(len(y_test)):
    for j in range(len(y_test[i])):
        if (math.isnan(y_test[i][j])==False and math.isnan(y_pred[i][j])==False):
            y_test_watch.append(y_test[i][j])
            y_pred_watch.append(y_pred[i][j])

In [170]:
binary_y_test_watch = []
binary_y_pred_watch = []
for i in range(len(y_test_watch)):
    if (y_test_watch[i]>3.5):
        binary_y_test_watch.append(1)
    else:
        binary_y_test_watch.append(0)
    if (y_pred_watch[i]>3.5):
        binary_y_pred_watch.append(1)
    else:
        binary_y_pred_watch.append(0)

In [None]:
df_nonull = piv.iloc[3].dropna()
np.mean(list(df_nonull))
df_nonull.shape

In [259]:
"""
Given a pivot table P (dataframe) with users as rows and items as columns and ratings on items as P(i,j)
Removes from every user column the user's average rating and then replaces NaN with 0s. 
Transposes P.
Creates the sparse matrix from the transposed P and calculates the cosine similarith of this matrix
"""
def adjusted_cosine_similarity(piv):
    
    piv_adj = piv.copy()
    piv_adj.apply(lambda x: (x-np.mean(x)), axis=1)
    piv_adj.fillna(0, inplace=True)
    #P_adj = P_adj.T
    piv_adj_sparse = sp.sparse.csr_matrix(piv_adj.to_numpy())
    return cosine_similarity(piv_adj_sparse)

adjusted_cosine_similarity(piv)

array([[1.        , 0.0285007 , 0.10303152, ..., 0.29849003, 0.10354441,
        0.1695671 ],
       [0.0285007 , 1.        , 0.        , ..., 0.04834093, 0.03111888,
        0.12192864],
       [0.10303152, 0.        , 1.        , ..., 0.03650235, 0.        ,
        0.04510303],
       ...,
       [0.29849003, 0.04834093, 0.03650235, ..., 1.        , 0.1351822 ,
        0.3754171 ],
       [0.10354441, 0.03111888, 0.        , ..., 0.1351822 , 1.        ,
        0.06711619],
       [0.1695671 , 0.12192864, 0.04510303, ..., 0.3754171 , 0.06711619,
        1.        ]])

In [256]:
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import pdist, squareform

def jaccard_similarity(piv):
    piv_jac = piv.copy()
    piv_jac.fillna(0, inplace=True)
    piv_jac.to_numpy()
    jaccard_distances = pdist(piv_jac, metric='jaccard')
    jaccard_distances = squareform(jaccard_distances)
    jaccard_similarity = 1-jaccard_distances
    return jaccard_similarity

In [272]:
"""
Parameters: piv (pivot table), sim_func ('adjusted cosine','jaccard')
Defines: two dataframes; 1.showing the similarity between items and 2.showing the similarity between users
"""
def define_similarity(piv,sim_func):
    
    if (sim_func=='adjusted cosine'):
        user_similarity_adjusted = adjusted_cosine_similarity(piv.T)
        item_similarity_adjusted = adjusted_cosine_similarity(piv)
        user_sim_adj_df = pd.DataFrame(user_similarity_adjusted, index = piv.columns, columns = piv.columns)
        item_sim_adj_df = pd.DataFrame(item_similarity_adjusted, index = piv.index, columns = piv.index)
        print("adj user",user_sim_adj_df.shape)
        print("adj item",item_sim_adj_df.shape)
    elif (sim_func=='jaccard'):
        user_similarity_jaccard = jaccard_similarity(piv)
        item_similarity_jaccard = jaccard_similarity(piv.T)
        user_sim_jac_df = pd.DataFrame(user_similarity_jaccard, index = piv.T.columns, columns = piv.T.columns)
        item_sim_jac_df = pd.DataFrame(item_similarity_jaccard, index = piv.T.index, columns = piv.T.index)
        print("jac user",user_sim_jac_df.shape)
        print("jac item",item_sim_jac_df.shape)


In [274]:
dev_piv=piv.iloc[:,0:200]

In [91]:
"""
Parameters: movie list, user list
Returns: movie ratings list
"""
def true_ratings_as_list(movie_list, user_list):
    movie_ratings = [[]]
    for i, movie in enumerate(movie_list):
        print(movie)
        for user in user_list:
            movie_ratings[i].append(piv.loc[user,movie])
        if (i!=len(movie_list)-1):
            movie_ratings.append([]) # Initialize the list for the next movie
    return movie_ratings

In [194]:
"""
Parameters: piv (pivot table - users on rows, movies on columns), train percentage (from total users) , k nearest neighbours,
sim_func ('adjusted cosine','jaccard')
Returns: prediction list, real test list
"""
def create_model(piv, train_percentage, k, sim_func):
    train = piv.sample(frac=train_percentage)
    test = piv.drop(train.index).sample(frac=0.1)
    
    define_similarity(piv,sim_func) 
    
    movie_list = piv.columns
    
    user_list = train.index
    print(user_list)
    y_test = true_ratings_as_list(movie_list, user_list)
    y_pred = make_predictions(movie_list, user_list, k)
    return (y_pred, y_test)
    

In [277]:
y_pred, y_test = create_model(dev_piv, 0.008, 20,'adjusted cosine')

adj user (200, 200)
adj item (610, 610)
Int64Index([370, 486, 371, 398, 157], dtype='int64', name='userId')
'burbs, The (1989)
(500) Days of Summer (2009)
*batteries not included (1987)
10 Cloverfield Lane (2016)
10 Things I Hate About You (1999)
10,000 BC (2008)
101 Dalmatians (1996)
101 Dalmatians (One Hundred and One Dalmatians) (1961)
102 Dalmatians (2000)
12 Angry Men (1957)
12 Years a Slave (2013)
127 Hours (2010)
13 Going on 30 (2004)
13 Hours (2016)
13th Warrior, The (1999)
1408 (2007)
15 Minutes (2001)
16 Blocks (2006)
17 Again (2009)
1984 (Nineteen Eighty-Four) (1984)
2 Days in the Valley (1996)
2 Fast 2 Furious (Fast and the Furious 2, The) (2003)
20 Dates (1998)
20,000 Leagues Under the Sea (1954)
200 Cigarettes (1999)
2001: A Space Odyssey (1968)
2010: The Year We Make Contact (1984)
2012 (2009)
2046 (2004)
21 (2008)
21 Grams (2003)
21 Jump Street (2012)
22 Jump Street (2014)
24 Hour Party People (2002)
25th Hour (2002)
27 Dresses (2008)
28 Days (2000)
28 Days Later (2002)

Aladdin and the King of Thieves (1996)
Alexander (2004)
Ali (2001)
Alice in Wonderland (1951)
Alice in Wonderland (2010)
Alien (1979)
Alien Nation (1988)
Alien: Covenant (2017)
Alien: Resurrection (1997)
Aliens (1986)
Alien³ (a.k.a. Alien 3) (1992)
Alive (1993)
All About Eve (1950)
All About My Mother (Todo sobre mi madre) (1999)
All Dogs Go to Heaven (1989)
All Dogs Go to Heaven 2 (1996)
All Quiet on the Western Front (1930)
All That Jazz (1979)
All the President's Men (1976)
Allan Quatermain and the Lost City of Gold (1987)
Almost Famous (2000)
Along Came Polly (2004)
Along Came a Spider (2001)
Alpha Dog (2007)
Altered States (1980)
Always (1989)
Amadeus (1984)
Amazing Spider-Man, The (2012)
Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
America's Sweethearts (2001)
American Beauty (1999)
American Gangster (2007)
American Gigolo (1980)
American Graffiti (1973)
American History X (1998)
American Hustle (2013)
American Movie (1999)
American Pie (1999)
American Pie 2 (2001)
Americ

In [247]:
len(y_pred)

150

In [248]:
len(y_test)

150

In [249]:
"""
From 2 lists that their indices correspond creates 2 new 'same' lists with no NaN values,
e.g. if it is found that predicted rating for movie1 by user1 is y_pred[0][0]==5 and 
real rating is y_test[0][0]==NaN, y_pred_watch and y_test_watch must not contain this rating

Parameters: predictions list, true test list
Returns: 2 lists
"""
def remove_nan(y_pred,y_test):
    y_test_watch = []
    y_pred_watch = []
    for i in range(len(y_test)):
        for j in range(len(y_test[i])):
            if ( math.isnan(y_test[i][j])==False and math.isnan(y_pred[i][j])==False):
                y_test_watch.append(y_test[i][j])
                y_pred_watch.append(y_pred[i][j])
                
    return (y_pred_watch, y_test_watch)

In [250]:
y_pred_watch, y_test_watch = remove_nan(y_pred,y_test)

In [252]:
"""
Create 2 lists in order be compared with sklearns's binary classification metrics
Parameters: predictions list, true test list
Returns: 2 lists like the given but with values in the binary range
"""

def prepare_for_binary_evaluation(y_pred,y_test):
    binary_y_test_watch = []
    binary_y_pred_watch = []
    for i in range(len(y_test_watch)):
        if (y_test_watch[i]>3.5):
            binary_y_test_watch.append(1)
        else:
            binary_y_test_watch.append(0)
        if (y_pred_watch[i]>3.5):
            binary_y_pred_watch.append(1)
        else:
            binary_y_pred_watch.append(0)
            
    return (binary_y_pred_watch, binary_y_test_watch)

In [253]:
binary_y_pred_watch, binary_y_test_watch = prepare_for_binary_evaluation(y_pred_watch,y_test_watch)

In [254]:
from sklearn.metrics import mean_absolute_error, precision_score, recall_score

print(mean_absolute_error(y_test_watch, y_pred_watch))

print(precision_score(binary_y_test_watch, binary_y_pred_watch, average='weighted'))
print(recall_score(binary_y_test_watch, binary_y_pred_watch, average='weighted'))

0.6768462327122758
0.7203007518796992
0.7142857142857143


#### Implementing the zero rule in order to make an assesment about the MAE value

In [255]:
zero_pred = [np.mean(y_test_watch)] * len(y_test_watch)
print(mean_absolute_error(y_test_watch, zero_pred))

0.5318720080624845


#### K = 20, train = 10%