In [23]:
import pandas as pd
import numpy as np
import scipy as sp
import re, os, math, sklearn, datetime, pickle

In [24]:
df_movies = pd.read_csv('movies.csv')
df_ratings = pd.read_csv('ratings.csv')
df_tags = pd.read_csv('tags.csv')

In [25]:
df_movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [26]:
df_ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


### 1. Filter movies with 5 or more ratings

#### Create a dataframe (df_movie_ratings_count_min5) that stores each movie that has more than 4 ratings along with the number of its ratings

Group the ratings_df by the movieId and count the rows with the same movieId

In [5]:
df_movie_ratings_count = df_ratings.groupby('movieId').count()

Keep only the index and the first column and rename the first column for beautifying purposes

In [6]:
df_movie_ratings_count = df_movie_ratings_count.iloc[:,:1]

df_movie_ratings_count.rename(columns = {'userId':'count'}, inplace = True)

In [7]:
df_movie_ratings_count.head(33)

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
6,102
7,54
8,8
9,16
10,132


In [8]:
df_movie_ratings_count_min5= df_movie_ratings_count.loc[df_movie_ratings_count['count'] > 4]

In [9]:
df_movie_ratings_count_min5.head(40)

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
6,102
7,54
8,8
9,16
10,132


#### Remove movies with fewer than 5 ratings

In [10]:
df_ratings_min5 = df_ratings.loc[df_ratings['movieId'].isin(df_movie_ratings_count_min5.index)]

In [11]:
df_movies_min5 = df_movies.loc[df_movies['movieId'].isin(df_movie_ratings_count_min5.index)]

In [12]:
print(df_ratings_min5.shape)
print(df_ratings.shape)
print(df_movies_min5.shape)
print(df_movies.shape)
print(df_movies.shape[0]-df_movies_min5.shape[0], "movies have been removed")

(90274, 4)
(100836, 4)
(3650, 3)
(9742, 3)
6092 movies have been removed


#### Check that movies with fewer than 5 ratings have indeed been removed

In [13]:
df_movies.loc[df_movies['movieId']==40]

Unnamed: 0,movieId,title,genres
36,40,"Cry, the Beloved Country (1995)",Drama


In [14]:
df_ratings.loc[df_ratings['movieId']==40]

Unnamed: 0,userId,movieId,rating,timestamp
47991,311,40,3.5,1057854804
84607,544,40,5.0,850688776


In [15]:
df_movies_min5.loc[df_movies_min5['movieId']==40]

Unnamed: 0,movieId,title,genres


In [16]:
df_ratings_min5.loc[df_ratings_min5['movieId']==40]

Unnamed: 0,userId,movieId,rating,timestamp


In [17]:
merged = df_ratings_min5.merge(df_movies_min5, left_on = 'movieId', right_on = 'movieId', suffixes= ['_user', ''])
merged.drop(columns = ['timestamp','genres'], inplace = True)

In [18]:
merged2 = df_movies_min5.merge(df_ratings_min5, left_on = 'movieId', right_on = 'movieId', suffixes= ['_user', ''])

In [19]:
merged[300:309]

Unnamed: 0,userId,movieId,rating,title
300,202,6,5.0,Heat (1995)
301,217,6,2.0,Heat (1995)
302,219,6,3.5,Heat (1995)
303,220,6,3.5,Heat (1995)
304,239,6,5.0,Heat (1995)
305,244,6,5.0,Heat (1995)
306,266,6,4.0,Heat (1995)
307,269,6,5.0,Heat (1995)
308,270,6,3.0,Heat (1995)


In [20]:
merged.isnull().values.any()

False

In [21]:
piv = merged.pivot_table(index=['userId'], columns=['title'], values='rating')

In [22]:
piv.iloc[99:109, 325:335]

title,Beautiful Girls (1996),"Beautiful Mind, A (2001)",Beauty and the Beast (1991),Beauty and the Beast (La belle et la bête) (1946),Beauty of the Day (Belle de jour) (1967),Beavis and Butt-Head Do America (1996),Becoming Jane (2007),Bed of Roses (1996),Bedazzled (2000),Bedknobs and Broomsticks (1971)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100,,4.0,,,,,,4.0,,
101,,,,,,,,,,
102,,,,,,,,,,
103,,,3.0,,,,,,,
104,,,5.0,,,,,,,
105,,4.0,,,,,,,3.0,
106,,,,,,,,,,
107,,,,,,,,,,
108,,5.0,,,,,,,,
109,,,,,3.0,,,,,


In [23]:
piv.iloc[10:20]

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),[REC] (2007),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,,,,,,,,,,,...,,,,,,,,,,
12,5.0,,,,5.0,,,,,,...,,,,,,,,,,
13,,,,,,,,,,,...,,,,,,,,,,
14,,,,,,,,,,,...,,,,,,,,,,
15,,4.0,,5.0,,,,1.5,,,...,,,,,3.0,,,,,
16,,,,,,,,,,4.0,...,,,,,,,,,,
17,,,,,,,,,,,...,,,,,,,,,,
18,,4.0,,,,,,,,5.0,...,,4.5,,3.0,,,,,,
19,2.0,,,,3.0,,1.0,,,,...,3.0,,,,,,,,,2.0
20,,,,,,,3.0,4.0,,,...,,,,3.5,,,,0.5,,


#### Calculate cosine similarity of table
In order to do, we need to create a sparse matrix of the pivot table 

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

piv_cos = piv.copy()
piv_cos.fillna(0, inplace=True)
piv_cos = piv_cos.T
piv_cos_sparse = sp.sparse.csr_matrix(piv_cos.to_numpy())

In [25]:
piv_cos

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
*batteries not included (1987),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Cloverfield Lane (2016),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
[REC] (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
eXistenZ (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,2.0
xXx: State of the Union (2005),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5


#### Define similarity functions

For the calculation of adjusted cosine similarity the sklearn cosine similarity will again be used but with an adjusted dataset

In [26]:
"""
Given a pivot table P (dataframe) with users as rows and items as columns and ratings on items as P(i,j)
Removes from every user column the user's average rating and then replaces NaN with 0s. 
Transposes P.
Creates the sparse matrix from the transposed P and calculates the cosine similarith of this matrix
"""
def adjusted_cosine_similarity(P):
    
    P_adj = P.apply(lambda x: (x-np.mean(x)), axis=1)
    P_adj.fillna(0, inplace=True)
    P_adj = P_adj.T
    P_adj_sparse = sp.sparse.csr_matrix(P_adj.to_numpy())
    return cosine_similarity(P_adj_sparse)

adjusted_cosine_similarity(piv)

array([[ 1.        , -0.01346345, -0.02928447, ...,  0.05352441,
         0.        , -0.07110765],
       [-0.01346345,  1.        ,  0.09604104, ...,  0.00836394,
        -0.03587317,  0.00543719],
       [-0.02928447,  0.09604104,  1.        , ...,  0.19826153,
         0.        , -0.02097782],
       ...,
       [ 0.05352441,  0.00836394,  0.19826153, ...,  1.        ,
         0.45506245,  0.00891434],
       [ 0.        , -0.03587317,  0.        , ...,  0.45506245,
         1.        ,  0.        ],
       [-0.07110765,  0.00543719, -0.02097782, ...,  0.00891434,
         0.        ,  1.        ]])

#### Showcase how rating values are changed inside the function of adjusted cosine similarity

In [33]:
user_similarity = cosine_similarity(piv_cos_sparse)
user_sim_df = pd.DataFrame(user_similarity, index = piv_adj.index, columns = piv_adj.index)

In [34]:
def top_movies(title):
    count = 1
    print('Similar shows to {} include:\n'.format(title))
    for item in item_sim_adj_df.sort_values(by = title, ascending = False).index[1:11]:
        print('No. {}: {}'.format(count, item))
        count +=1 

In [35]:
top_movies('10 Cloverfield Lane (2016)')

Similar shows to 10 Cloverfield Lane (2016) include:

No. 1: Inherent Vice (2014)
No. 2: Bicentennial Man (1999)
No. 3: Limitless (2011)
No. 4: Kung Fury (2015)
No. 5: Enemy (2013)
No. 6: Mary and Max (2009)
No. 7: Sicario (2015)
No. 8: Untitled Spider-Man Reboot (2017)
No. 9: Pacific Rim (2013)
No. 10: Man Bites Dog (C'est arrivé près de chez vous) (1992)


In [36]:
# This function constructs a list of lists containing the highest rated shows per similar user
# and returns the name of the show along with the frequency it appears in the list
import operator

def similar_user_recs(user):
    
    if user not in piv_adj.columns:
        return('No data available on user {}'.format(user))
    
    sim_users =user_sim_adj_df.sort_values(by=user, ascending=False).index[1:5]
    print(sim_users)
    best = []
    most_common = {}
    
    for i in sim_users:
        max_score = piv_adj.loc[:, i].max()
        print(max_score)
        best.append(piv_adj[piv_adj.loc[:, i]==max_score].index.tolist())
    for i in range(len(best)):
        for j in best[i]:
            if j in most_common:
                most_common[j] += 1
            else:
                most_common[j] = 1
    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[:20]    

In [37]:
piv.loc[41,'Lost in Translation (2003)']

3.5

In [52]:
"""
Item-based prediction
calculates the weighted average of k similar movies to determine a potential rating for an input user and movie
movie_values: list of the similarities (with values 0-1) of the k nearest movies 
"""
def ib_predicted_rating(movie_name, user, k, item_sim_df):
    sim_movies = item_sim_df.sort_values(by=movie_name, ascending=False).index[1:k+1]  #start index at 1 to avoid including the searched movie in its neighbors
    movie_values = item_sim_df.sort_values(by=movie_name, ascending=False).loc[:,movie_name].tolist()[1:k+1]
    #print (movie_values)
    #print(sim_movies)
    rating_list = []
    weight_list = []
    for j, i in enumerate(sim_movies):
        rating = piv.loc[user, i]
        similarity = movie_values[j]
        if np.isnan(rating):
            continue
        elif not np.isnan(rating):
            #print("rat",rating)
            #print("sim",similarity)
            rating_list.append(rating*similarity)
            weight_list.append(similarity)
    #print("wl",weight_list)
    
    #sum(weight_list)==0 means that the user has not rated any of the k nearest movies 
    if (sum(weight_list)==0):  
        return np.nan
    else:
        return sum(rating_list)/sum(weight_list)    

In [28]:
ib_predicted_rating('Lost in Translation (2003)',17,5)

NameError: name 'item_sim_adj_df' is not defined

In [315]:
"""
User-based prediction
calculates the weighted average of k similar users to determine a potential rating for an input user and movie
"""
def ub_predicted_rating(movie_name, user, k, user_sim_df):
    print('ub user',user)
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:k+1]
    print ('sim',sim_users)
    user_values = user_sim_df.sort_values(by=user, ascending=False).loc[:,user].tolist()[1:k+1]
    #print(sim_users)
    rating_list = []
    weight_list = []
    for j, i in enumerate(sim_users):
        rating = piv.loc[i, movie_name]
        similarity = user_values[j]
        if np.isnan(rating):
            continue
        elif not np.isnan(rating):
            #print("rat",rating)
            #print("sim",similarity)
            rating_list.append(rating*similarity)
            weight_list.append(similarity)
            
    #print("wl",weight_list)
    if (sum(weight_list)==0):
        return np.nan
    else:
        return sum(rating_list)/sum(weight_list)  

In [159]:
ub_predicted_rating('Lost in Translation (2003)',331,5)

Int64Index([88, 610, 416, 63, 300], dtype='int64', name='userId')
wl [0.10747421910767509, 0.08799788828156915]


5.0

In [42]:
piv.loc[331,'Lost in Translation (2003)']

5.0

In [43]:
piv.loc[1,'10 Cloverfield Lane (2016)']

nan

In [339]:
def make_predictions(movie_list, user_list, k, item_sim_df, user_sim_df):
    movie_pred = [[]]
    for i, movie in enumerate(movie_list):
        print(movie)
        for user in user_list:
            print("u ",user)
            if (ib_predicted_rating(movie, user, k, item_sim_df)>2.5):
                print("in")
                movie_pred[i].append(ub_predicted_rating(movie, user, k, user_sim_df))
            else:
                print("not in")
                movie_pred[i].append(np.nan)
        if (i!=len(movie_list)-1):
            movie_pred.append([])
    return movie_pred


### 2. Split data 90/10

In [50]:
train = piv.sample(frac=0.1)
test = piv.drop(train.index).sample(frac=0.1)

In [51]:
print(train.shape)
print(test.shape)

(61, 3650)
(55, 3650)


In [52]:
piv.iloc[:,40:80]

title,30 Days of Night (2007),30 Minutes or Less (2011),300 (2007),3000 Miles to Graceland (2001),"39 Steps, The (1935)",3:10 to Yuma (2007),40 Days and 40 Nights (2002),"40-Year-Old Virgin, The (2005)","400 Blows, The (Les quatre cents coups) (1959)",47 Ronin (2013),...,Abraham Lincoln: Vampire Hunter (2012),"Absent-Minded Professor, The (1961)",Absolute Power (1997),"Abyss, The (1989)",Accepted (2006),"Accidental Tourist, The (1988)","Accused, The (1988)",Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),Across the Universe (2007)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,4.0,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,3.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,4.0,3.5,,2.0,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,5.0,,,,,,,,...,,,,3.0,,,,3.5,2.0,
609,,,,,,,,,,,...,,,,,,,,,,


In [53]:
y_test = true_ratings_as_list(['12 Angry Men (1957)','Abyss, The (1989)','Fight Club (1999)','Lost in Translation (2003)'],list(test.index))

12 Angry Men (1957)
Abyss, The (1989)
Fight Club (1999)
Lost in Translation (2003)


In [166]:
y_pred = make_predictions(['12 Angry Men (1957)','Abyss, The (1989)','Fight Club (1999)','Lost in Translation (2003)'],list(test.index),50)

12 Angry Men (1957)
Abyss, The (1989)
Fight Club (1999)
Lost in Translation (2003)


In [196]:
print(piv.T.iloc[:,11])

title
'burbs, The (1989)                   5.0
(500) Days of Summer (2009)          NaN
*batteries not included (1987)       NaN
10 Cloverfield Lane (2016)           NaN
10 Things I Hate About You (1999)    5.0
                                    ... 
[REC] (2007)                         NaN
eXistenZ (1999)                      NaN
xXx (2002)                           NaN
xXx: State of the Union (2005)       NaN
¡Three Amigos! (1986)                NaN
Name: 12, Length: 3650, dtype: float64


In [117]:
 piv_adj = piv.copy()
piv2=piv.copy()

In [204]:
index = [1, 2, 3, 4]
a = [np.nan, np.nan, np.nan, 0.1]
b = [0.2, np.nan, 0.2, 0.2]
c = [np.nan, 0.5, 0.5, np.nan]
d = [0.2, np.nan, 0.9, 0.8]
df = pd.DataFrame({'A': a, 'B': b, 'C': c, 'D':d}, index=index)

In [206]:
df_c=df.copy()
df_c=df_c.apply(lambda x: (x-np.mean(x)), axis=1)

In [249]:
"""
Given a pivot table P (dataframe) with users as rows and items as columns and ratings on items as P(i,j)
Removes from every user column the user's average rating and then replaces NaN with 0s. 
Transposes P.
Creates the sparse matrix from the transposed P and calculates the cosine similarith of this matrix
"""
from sklearn.metrics.pairwise import cosine_similarity

def adjusted_cosine_similarity(piv):
    
    piv_adj = piv.copy()
    piv_adj = piv_adj.apply(lambda x: (x-np.mean(x)), axis=1)
    piv_adj.fillna(0, inplace=True)
    piv_adj_sparse = sp.sparse.csr_matrix(piv_adj.to_numpy())
    return cosine_similarity(piv_adj_sparse)

adjusted_cosine_similarity(dev_piv).shape


(610, 610)

In [72]:
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import pdist, squareform

def jaccard_similarity(piv):
    piv_jac = piv.copy()
    piv_jac.fillna(0, inplace=True)
    piv_jac.to_numpy()
    jaccard_distances = pdist(piv_jac, metric='jaccard')
    jaccard_distances = squareform(jaccard_distances)
    jaccard_similarity = 1-jaccard_distances
    return jaccard_similarity

In [220]:
"""
Parameters: piv (pivot table), experiment ('jaccard|adjusted','adjusted|adjusted')
Defines: two dataframes; 1.showing the similarity between items and 2.showing the similarity between users
"""
def define_similarity(piv,experiment):
    
    if (experiment=='jaccard|adjusted'):
    
        item_similarity_jaccard = jaccard_similarity(piv.T)
        item_sim_jac_df = pd.DataFrame(item_similarity_jaccard, index = piv.columns, columns = piv.columns)
        user_similarity_adjusted = adjusted_cosine_similarity(piv)
        user_sim_adj_df = pd.DataFrame(user_similarity_adjusted, index = piv.index, columns = piv.index)
        print("jac item df shape",item_sim_jac_df.shape)
        print("adj user df shape",user_sim_adj_df.shape)
        
        return (item_sim_jac_df, user_sim_adj_df)
        
    elif (experiment=='adjusted|adjusted'):
        
        item_similarity_adjusted = adjusted_cosine_similarity(piv.T)
        item_sim_adj_df = pd.DataFrame(item_similarity_adjusted, index = piv.columns, columns = piv.columns)
        user_similarity_adjusted = adjusted_cosine_similarity(piv)
        user_sim_adj_df = pd.DataFrame(user_similarity_adjusted, index = piv.index, columns = piv.index)
        
        print("adj user df shape",user_sim_adj_df.shape)
        print("adj item df shape",item_sim_adj_df.shape)
        
        return (item_sim_adj_df, user_sim_adj_df)


In [224]:
dev_piv=piv.iloc[:,0:150]

In [287]:
"""
Parameters: movie list, user list
Returns: movie ratings list
"""
def true_ratings_as_list(movie_list, user_list, piv):
    movie_ratings = [[]]
    for i, movie in enumerate(movie_list):
        print(movie)
        for user in user_list:
            movie_ratings[i].append(piv.loc[user,movie])
        if (i!=len(movie_list)-1):
            movie_ratings.append([]) # Initialize the list for the next movie
    return movie_ratings

In [410]:
"""
Parameters: piv (pivot table - users on rows, movies on columns), train percentage (from total users) , k nearest neighbours,
sim_func ('adjusted cosine','jaccard')
Returns: prediction list, real test list
"""
def create_model(piv, train_percentage, k, experiment):
    train = piv.sample(frac=train_percentage)
    test = piv.drop(train.index).sample(frac=0.1)
    
    item_sim_df, user_sim_df = define_similarity(train,experiment) 
    print("item_sim df shape",item_sim_df.shape)
    print("user_sim df shape",user_sim_df.shape)
    print("user_sim df",user_sim_df)
    movie_list = piv.columns
    test_user_list = test.index
    print(user_sim_df.index)
    print("test",test_user_list)
    
    y_test = true_ratings_as_list(movie_list, test_user_list, piv)
    y_pred = make_predictions(movie_list, test_user_list, k, item_sim_df, user_sim_df)
    return (y_pred, y_test)
    

In [411]:
y_pred, y_test = create_model(dev_piv, 0.02, 20,'adjusted|adjusted')

adj user df shape (12, 12)
adj item df shape (100, 100)
item_sim df shape (100, 100)
user_sim df shape (12, 12)
user_sim df userId  218  576  162  268  568  96        599       22   75        562  \
userId                                                                    
218     0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.000000   
576     0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.000000   
162     0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.000000   
268     0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.000000   
568     0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.000000   
96      0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.000000   
599     0.0  0.0  0.0  0.0  0.0  0.0  1.000000  0.058274  0.0 -0.028332   
22      0.0  0.0  0.0  0.0  0.0  0.0  0.058274  1.000000  0.0 -0.027986   
75      0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.000000   
562     0.0  0.0  0.0  0.0  0.0  0.0 -0.028332 -0.0

KeyError: 305

In [231]:
"""
From 2 lists that their indices correspond creates 2 new 'same' lists with no NaN values,
e.g. if it is found that predicted rating for movie1 by user1 is y_pred[0][0]==5 and 
real rating is y_test[0][0]==NaN, y_pred_watch and y_test_watch must not contain this rating

Parameters: predictions list, true test list
Returns: 2 lists
"""
def remove_nan(y_pred,y_test):
    y_test_watch = []
    y_pred_watch = []
    for i in range(len(y_test)):
        for j in range(len(y_test[i])):
            if ( math.isnan(y_test[i][j])==False and math.isnan(y_pred[i][j])==False):
                y_test_watch.append(y_test[i][j])
                y_pred_watch.append(y_pred[i][j])
                
    return (y_pred_watch, y_test_watch)

In [238]:
"""
Create 2 lists in order be compared with sklearns's binary classification metrics
Parameters: predictions list, true test list
Returns: 2 lists like the given but with values in the binary range
"""

def prepare_for_binary_evaluation(y_pred,y_test):
    binary_y_test_watch = []
    binary_y_pred_watch = []
    for i in range(len(y_test_watch)):
        if (y_test_watch[i]>3.5):
            binary_y_test_watch.append(1)
        else:
            binary_y_test_watch.append(0)
        if (y_pred_watch[i]>3.5):
            binary_y_pred_watch.append(1)
        else:
            binary_y_pred_watch.append(0)
            
    return (binary_y_pred_watch, binary_y_test_watch)

In [295]:
binary_y_pred_watch, binary_y_test_watch = prepare_for_binary_evaluation(y_pred_watch,y_test_watch)

In [297]:
binary_y_pred_watch

[1, 0, 0, 0]

In [298]:
binary_y_test_watch

[0, 0, 0, 0]

### 1) S1 with jaccard and S2 with adjusted cosine

#### 1A) K = 20, train = 10%

In [399]:
y_pred_1A, y_test_1A = create_model(piv, 0.1, 20,'jaccard|adjusted')

jac item df shape (3650, 3650)
adj user df shape (61, 61)
item_sim df shape (3650, 3650)
user_sim df shape (61, 61)
Int64Index([217,  57, 269,  81, 204, 202, 440,  87, 533, 174,  90, 459, 549,
            457, 439, 178, 574,  59,  92, 374,  70, 150, 564, 529, 220, 569,
            608, 362, 196, 162, 239, 590, 517, 209,  37, 415, 115,  80, 116,
            403,  32, 380, 410, 310, 577, 393, 444, 423, 536, 456, 361, 276,
            429, 114,  24,  63, 522, 417, 388, 436, 357],
           dtype='int64', name='userId')
'burbs, The (1989)
(500) Days of Summer (2009)
*batteries not included (1987)
10 Cloverfield Lane (2016)
10 Things I Hate About You (1999)
10,000 BC (2008)
101 Dalmatians (1996)
101 Dalmatians (One Hundred and One Dalmatians) (1961)
102 Dalmatians (2000)
12 Angry Men (1957)
12 Years a Slave (2013)
127 Hours (2010)
13 Going on 30 (2004)
13 Hours (2016)
13th Warrior, The (1999)
1408 (2007)
15 Minutes (2001)
16 Blocks (2006)
17 Again (2009)
1984 (Nineteen Eighty-Four) (1984)


Cabin in the Woods, The (2012)
Cabinet of Dr. Caligari, The (Cabinet des Dr. Caligari., Das) (1920)
Cable Guy, The (1996)
Caddyshack (1980)
Calendar Girls (2003)
Campaign, The (2012)
Can't Hardly Wait (1998)
Canadian Bacon (1995)
Candidate, The (1972)
Candyman (1992)
Cannonball Run, The (1981)
Cape Fear (1962)
Cape Fear (1991)
Capote (2005)
Captain America: Civil War (2016)
Captain America: The First Avenger (2011)
Captain America: The Winter Soldier (2014)
Captain Fantastic (2016)
Captain Phillips (2013)
Captain Ron (1992)
Capturing the Friedmans (2003)
Carlito's Way (1993)
Carrie (1976)
Cars (2006)
Casablanca (1942)
Casino (1995)
Casino Royale (1967)
Casino Royale (2006)
Casper (1995)
Cast Away (2000)
Castle, The (1997)
Cat Ballou (1965)
Cat People (1982)
Cat Returns, The (Neko no ongaeshi) (2002)
Cat in the Hat, The (2003)
Cat on a Hot Tin Roof (1958)
Cat's Eye (1985)
Catch Me If You Can (2002)
Catch-22 (1970)
Cats & Dogs (2001)
Catwoman (2004)
Cecil B. DeMented (2000)
Celebration, 

Fantastic Four: Rise of the Silver Surfer (2007)
Fantastic Mr. Fox (2009)
Fantastic Voyage (1966)
Far and Away (1992)
Far from Heaven (2002)
Farewell My Concubine (Ba wang bie ji) (1993)
Fargo (1996)
Farinelli: il castrato (1994)
Fast & Furious (Fast and the Furious 4, The) (2009)
Fast & Furious 6 (Fast and the Furious 6, The) (2013)
Fast Five (Fast and the Furious 5, The) (2011)
Fast Times at Ridgemont High (1982)
Fast and the Furious, The (2001)
Fast and the Furious: Tokyo Drift, The (Fast and the Furious 3, The) (2006)
Faster Pussycat! Kill! Kill! (1965)
Fatal Attraction (1987)
Fatal Instinct (1993)
Father of the Bride (1950)
Father of the Bride (1991)
Father of the Bride Part II (1995)
Fear (1996)
Fear and Loathing in Las Vegas (1998)
Fearless (1993)
Feeling Minnesota (1996)
Femme Nikita, La (Nikita) (1990)
Ferngully: The Last Rainforest (1992)
Ferris Bueller's Day Off (1986)
Fever Pitch (2005)
Few Good Men, A (1992)
Fiddler on the Roof (1971)
Fido (2006)
Field of Dreams (1989)
Fie

Human Nature (2001)
Hunchback of Notre Dame, The (1996)
Hunt for Red October, The (1990)
Hunt, The (Jagten) (2012)
Hurricane, The (1999)
Hurt Locker, The (2008)
Husbands and Wives (1992)
Hustler, The (1961)
I Am Legend (2007)
I Am Number Four (2011)
I Am Sam (2001)
I Heart Huckabees (2004)
I Know What You Did Last Summer (1997)
I Love Trouble (1994)
I Love You, Man (2009)
I Now Pronounce You Chuck and Larry (2007)
I Spy (2002)
I Still Know What You Did Last Summer (1998)
I'll Do Anything (1994)
I, Robot (2004)
I.Q. (1994)
Ice Age (2002)
Ice Age 2: The Meltdown (2006)
Ice Age 4: Continental Drift (2012)
Ice Age: Dawn of the Dinosaurs (2009)
Ice Storm, The (1997)
Ichi the Killer (Koroshiya 1) (2001)
Ideal Husband, An (1999)
Identity (2003)
Ides of March, The (2011)
Idiocracy (2006)
Idle Hands (1999)
Igby Goes Down (2002)
Ikiru (1952)
Illusionist, The (2006)
Imaginarium of Doctor Parnassus, The (2009)
Immortal Beloved (1994)
Importance of Being Earnest, The (2002)
In & Out (1997)
In Ameri

Man Who Knew Too Little, The (1997)
Man Who Knew Too Much, The (1956)
Man Who Shot Liberty Valance, The (1962)
Man Who Wasn't There, The (2001)
Man Who Would Be King, The (1975)
Man Without a Face, The (1993)
Man for All Seasons, A (1966)
Man from Earth, The (2007)
Man in the Iron Mask, The (1998)
Man of Steel (2013)
Man of the House (1995)
Man on Fire (2004)
Man on Wire (2008)
Man on the Moon (1999)
Man with Two Brains, The (1983)
Man with the Golden Gun, The (1974)
Manchester by the Sea (2016)
Manchurian Candidate, The (1962)
Manchurian Candidate, The (2004)
Manhattan (1979)
Manhattan Murder Mystery (1993)
Manhunter (1986)
Mannequin (1987)
Manon of the Spring (Manon des sources) (1986)
Mansfield Park (1999)
Marathon Man (1976)
March of the Penguins (Marche de l'empereur, La) (2005)
Margin Call (2011)
Maria Full of Grace (Maria, Llena eres de gracia) (2004)
Mariachi, El (1992)
Marie Antoinette (2006)
Marley & Me (2008)
Married to the Mob (1988)
Mars Attacks! (1996)
Marvin's Room (1996

Police Academy (1984)
Police Academy 2: Their First Assignment (1985)
Police Academy 3: Back in Training (1986)
Police Academy 4: Citizens on Patrol (1987)
Police Academy 5: Assignment: Miami Beach (1988)
Police Academy: Mission to Moscow (1994)
Pollock (2000)
Pollyanna (1960)
Poltergeist (1982)
Poltergeist II: The Other Side (1986)
Ponyo (Gake no ue no Ponyo) (2008)
Popeye (1980)
Porco Rosso (Crimson Pig) (Kurenai no buta) (1992)
Porky's (1982)
Porky's Revenge (1985)
Poseidon Adventure, The (1972)
Postman Always Rings Twice, The (1981)
Postman, The (1997)
Postman, The (Postino, Il) (1994)
Powder (1995)
Practical Magic (1998)
Prairie Home Companion, A (2006)
Preacher's Wife, The (1996)
Predator (1987)
Predator 2 (1990)
Predators (2010)
Predestination (2014)
Prefontaine (1997)
Premium Rush (2012)
Presidio, The (1988)
Prestige, The (2006)
Presumed Innocent (1990)
Pretty Woman (1990)
Pretty in Pink (1986)
Pride & Prejudice (2005)
Pride and Prejudice (1995)
Primal Fear (1996)
Primary Color

Star Trek: First Contact (1996)
Star Trek: Generations (1994)
Star Trek: Insurrection (1998)
Star Trek: Nemesis (2002)
Star Trek: The Motion Picture (1979)
Star Wars: Episode I - The Phantom Menace (1999)
Star Wars: Episode II - Attack of the Clones (2002)
Star Wars: Episode III - Revenge of the Sith (2005)
Star Wars: Episode IV - A New Hope (1977)
Star Wars: Episode V - The Empire Strikes Back (1980)
Star Wars: Episode VI - Return of the Jedi (1983)
Star Wars: Episode VII - The Force Awakens (2015)
Star Wars: The Clone Wars (2008)
Star Wars: The Last Jedi (2017)
Stardust (2007)
Stardust Memories (1980)
Stargate (1994)
Starman (1984)
Starship Troopers (1997)
Starsky & Hutch (2004)
Startup.com (2001)
State and Main (2000)
Station Agent, The (2003)
Stealing Beauty (1996)
Stealing Harvard (2002)
Stealth (2005)
Steel Magnolias (1989)
Step Brothers (2008)
Stepford Wives, The (1975)
Stepford Wives, The (2004)
Stepmom (1998)
Steve Jobs (2015)
Stick It (2006)
Stigmata (1999)
Still Alice (2014)

Where the Heart Is (2000)
Where the Wild Things Are (2009)
While You Were Sleeping (1995)
Whip It (2009)
Whiplash (2014)
Whisper of the Heart (Mimi wo sumaseba) (1995)
White Chicks (2004)
White Christmas (1954)
White House Down (2013)
White Men Can't Jump (1992)
White Noise (2005)
White Squall (1996)
Who Framed Roger Rabbit? (1988)
Who Killed the Electric Car? (2006)
Who's Afraid of Virginia Woolf? (1966)
Who's Harry Crumb? (1989)
Whole Nine Yards, The (2000)
Whole Ten Yards, The (2004)
Wicker Man, The (1973)
Wild (2014)
Wild America (1997)
Wild Bunch, The (1969)
Wild Hogs (2007)
Wild Tales (2014)
Wild Things (1998)
Wild Wild West (1999)
Wild at Heart (1990)
William Shakespeare's A Midsummer Night's Dream (1999)
William Shakespeare's Romeo + Juliet (1996)
Willow (1988)
Willy Wonka & the Chocolate Factory (1971)
Wimbledon (2004)
Win Win (2011)
Win a Date with Tad Hamilton! (2004)
Wind River (2017)
Windtalkers (2002)
Wing Commander (1999)
Winged Migration (Peuple migrateur, Le) (2001)
Wi

KeyError: 27

In [370]:
y_pred_watch, y_test_watch = remove_nan(y_pred_1A,y_test_1A)
binary_y_pred_watch, binary_y_test_watch = prepare_for_binary_evaluation(y_pred_watch,y_test_watch)

In [375]:
len(y_test_watch)==len(y_pred_watch)

True

In [373]:
from sklearn.metrics import mean_absolute_error, precision_score, recall_score

print(mean_absolute_error(y_test_watch, y_pred_watch))
print(precision_score(binary_y_test_watch, binary_y_pred_watch, average='weighted'))
print(recall_score(binary_y_test_watch, binary_y_pred_watch, average='weighted'))

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

#### Implementing the zero rule in order to make an assesment about the MAE value

In [272]:
zero_pred = [np.mean(y_test_watch)] * len(y_test_watch)
print(mean_absolute_error(y_test_watch, zero_pred))

0.4583333333333333


In [348]:
dev2_piv=dev_piv.sample(frac=0.02)

In [349]:
aitem_sim_df, auser_sim_df = define_similarity(dev2_piv,'jaccard|adjusted') 

jac item df shape (150, 150)
adj user df shape (12, 12)


In [350]:
auser_sim_df.columns

Int64Index([585, 143, 173, 9, 83, 564, 277, 56, 374, 362, 316, 539], dtype='int64', name='userId')

In [355]:
for user in auser_sim_df.columns:
    
    sim_users = auser_sim_df.sort_values(by=user, ascending=False).index[1:5+1]
    #print (sim_users)

In [357]:
sim_users

Int64Index([585, 564, 143, 173, 9], dtype='int64', name='userId')

In [358]:
(auser_sim_df.iloc[:,:])

userId,585,143,173,9,83,564,277,56,374,362,316,539
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
585,1.0,0.0,0.0,0.0,0.014921,0.0,0.0,0.0,0.0,-0.319801,-0.023262,0.319801
143,0.0,1.0,0.0,0.0,0.0,0.18601,0.0,0.0,0.0,0.0,0.0,0.0
173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83,0.014921,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.265949,-0.015272,0.0
564,0.0,0.18601,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.378494,0.135742
277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.288675,0.0,0.251976,0.0
374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.288675,1.0,0.0,0.436436,0.0
362,-0.319801,0.0,0.0,0.0,0.265949,0.0,0.0,0.0,0.0,1.0,0.0,-0.5


In [415]:
auser_sim_df

userId,585,143,173,9,83,564,277,56,374,362,316,539
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
585,1.0,0.0,0.0,0.0,0.014921,0.0,0.0,0.0,0.0,-0.319801,-0.023262,0.319801
143,0.0,1.0,0.0,0.0,0.0,0.18601,0.0,0.0,0.0,0.0,0.0,0.0
173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83,0.014921,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.265949,-0.015272,0.0
564,0.0,0.18601,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.378494,0.135742
277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.288675,0.0,0.251976,0.0
374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.288675,1.0,0.0,0.436436,0.0
362,-0.319801,0.0,0.0,0.0,0.265949,0.0,0.0,0.0,0.0,1.0,0.0,-0.5
