In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import re, os, math, sklearn, datetime, pickle

In [2]:
df_movies = pd.read_csv('movies.csv')
df_ratings = pd.read_csv('ratings.csv')
df_tags = pd.read_csv('tags.csv')

In [3]:
df_movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
df_ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


### 1. Filter movies with 5 or more ratings

#### Create a dataframe (df_movie_ratings_count_min5) that stores each movie that has more than 4 ratings along with the number of its ratings

Group the ratings_df by the movieId and count the rows with the same movieId

In [5]:
df_movie_ratings_count = df_ratings.groupby('movieId').count()

Keep only the index and the first column and rename the first column for beautifying purposes

In [6]:
df_movie_ratings_count = df_movie_ratings_count.iloc[:,:1]

df_movie_ratings_count.rename(columns = {'userId':'count'}, inplace = True)

In [7]:
df_movie_ratings_count.head(33)

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
6,102
7,54
8,8
9,16
10,132


In [8]:
df_movie_ratings_count_min5= df_movie_ratings_count.loc[df_movie_ratings_count['count'] > 4]

In [9]:
df_movie_ratings_count_min5.head(40)

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
6,102
7,54
8,8
9,16
10,132


#### Remove movies with fewer than 5 ratings

In [10]:
df_ratings_min5 = df_ratings.loc[df_ratings['movieId'].isin(df_movie_ratings_count_min5.index)]

In [11]:
df_movies_min5 = df_movies.loc[df_movies['movieId'].isin(df_movie_ratings_count_min5.index)]

In [12]:
print(df_ratings_min5.shape)
print(df_ratings.shape)
print(df_movies_min5.shape)
print(df_movies.shape)
print(df_movies.shape[0]-df_movies_min5.shape[0], "movies have been removed")

(90274, 4)
(100836, 4)
(3650, 3)
(9742, 3)
6092 movies have been removed


#### Check that movies with fewer than 5 ratings have indeed been removed

In [13]:
df_movies.loc[df_movies['movieId']==40]

Unnamed: 0,movieId,title,genres
36,40,"Cry, the Beloved Country (1995)",Drama


In [14]:
df_ratings.loc[df_ratings['movieId']==40]

Unnamed: 0,userId,movieId,rating,timestamp
47991,311,40,3.5,1057854804
84607,544,40,5.0,850688776


In [15]:
df_movies_min5.loc[df_movies_min5['movieId']==40]

Unnamed: 0,movieId,title,genres


In [16]:
df_ratings_min5.loc[df_ratings_min5['movieId']==40]

Unnamed: 0,userId,movieId,rating,timestamp


In [17]:
merged = df_ratings_min5.merge(df_movies_min5, left_on = 'movieId', right_on = 'movieId', suffixes= ['_user', ''])
merged.drop(columns = ['timestamp','genres'], inplace = True)

In [18]:
merged2 = df_movies_min5.merge(df_ratings_min5, left_on = 'movieId', right_on = 'movieId', suffixes= ['_user', ''])

In [19]:
merged[300:309]

Unnamed: 0,userId,movieId,rating,title
300,202,6,5.0,Heat (1995)
301,217,6,2.0,Heat (1995)
302,219,6,3.5,Heat (1995)
303,220,6,3.5,Heat (1995)
304,239,6,5.0,Heat (1995)
305,244,6,5.0,Heat (1995)
306,266,6,4.0,Heat (1995)
307,269,6,5.0,Heat (1995)
308,270,6,3.0,Heat (1995)


In [20]:
merged.isnull().values.any()

False

In [21]:
piv = merged.pivot_table(index=['userId'], columns=['title'], values='rating')

In [22]:
piv.iloc[99:109, 325:335]

title,Beautiful Girls (1996),"Beautiful Mind, A (2001)",Beauty and the Beast (1991),Beauty and the Beast (La belle et la bête) (1946),Beauty of the Day (Belle de jour) (1967),Beavis and Butt-Head Do America (1996),Becoming Jane (2007),Bed of Roses (1996),Bedazzled (2000),Bedknobs and Broomsticks (1971)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100,,4.0,,,,,,4.0,,
101,,,,,,,,,,
102,,,,,,,,,,
103,,,3.0,,,,,,,
104,,,5.0,,,,,,,
105,,4.0,,,,,,,3.0,
106,,,,,,,,,,
107,,,,,,,,,,
108,,5.0,,,,,,,,
109,,,,,3.0,,,,,


In [23]:
piv.iloc[10:20]

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),[REC] (2007),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,,,,,,,,,,,...,,,,,,,,,,
12,5.0,,,,5.0,,,,,,...,,,,,,,,,,
13,,,,,,,,,,,...,,,,,,,,,,
14,,,,,,,,,,,...,,,,,,,,,,
15,,4.0,,5.0,,,,1.5,,,...,,,,,3.0,,,,,
16,,,,,,,,,,4.0,...,,,,,,,,,,
17,,,,,,,,,,,...,,,,,,,,,,
18,,4.0,,,,,,,,5.0,...,,4.5,,3.0,,,,,,
19,2.0,,,,3.0,,1.0,,,,...,3.0,,,,,,,,,2.0
20,,,,,,,3.0,4.0,,,...,,,,3.5,,,,0.5,,


#### Calculate cosine similarity of table
In order to do, we need to create a sparse matrix of the pivot table 

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

piv_cos = piv.copy()
piv_cos.fillna(0, inplace=True)
piv_cos = piv_cos.T
piv_cos_sparse = sp.sparse.csr_matrix(piv_cos.to_numpy())

In [25]:
piv_cos

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
*batteries not included (1987),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Cloverfield Lane (2016),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
[REC] (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
eXistenZ (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,2.0
xXx: State of the Union (2005),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5


#### Define similarity functions

For the calculation of adjusted cosine similarity the sklearn cosine similarity will again be used but with an adjusted dataset

In [26]:
"""
Given a pivot table P (dataframe) with users as rows and items as columns and ratings on items as P(i,j)
Removes from every user column the user's average rating and then replaces NaN with 0s. 
Transposes P.
Creates the sparse matrix from the transposed P and calculates the cosine similarith of this matrix
"""
def adjusted_cosine_similarity(P):
    
    P_adj = P.apply(lambda x: (x-np.mean(x)), axis=1)
    P_adj.fillna(0, inplace=True)
    P_adj = P_adj.T
    P_adj_sparse = sp.sparse.csr_matrix(P_adj.to_numpy())
    return cosine_similarity(P_adj_sparse)

adjusted_cosine_similarity(piv)

array([[ 1.        , -0.01346345, -0.02928447, ...,  0.05352441,
         0.        , -0.07110765],
       [-0.01346345,  1.        ,  0.09604104, ...,  0.00836394,
        -0.03587317,  0.00543719],
       [-0.02928447,  0.09604104,  1.        , ...,  0.19826153,
         0.        , -0.02097782],
       ...,
       [ 0.05352441,  0.00836394,  0.19826153, ...,  1.        ,
         0.45506245,  0.00891434],
       [ 0.        , -0.03587317,  0.        , ...,  0.45506245,
         1.        ,  0.        ],
       [-0.07110765,  0.00543719, -0.02097782, ...,  0.00891434,
         0.        ,  1.        ]])

#### Showcase how rating values are changed inside the function of adjusted cosine similarity

In [27]:
piv_cos.iloc[:, 10:20]

userId,11,12,13,14,15,16,17,18,19,20
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"'burbs, The (1989)",0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0
*batteries not included (1987),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Cloverfield Lane (2016),0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
10 Things I Hate About You (1999),0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...
[REC] (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eXistenZ (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
xXx: State of the Union (2005),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
piv_adj = piv.apply(lambda x: (x-np.mean(x)), axis=1).T
piv_adj.iloc[:5, 10:20]

userId,11,12,13,14,15,16,17,18,19,20
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"'burbs, The (1989)",,0.609375,,,,,,,-0.661367,
(500) Days of Summer (2009),,,,,0.551852,,,0.286813,,
*batteries not included (1987),,,,,,,,,,
10 Cloverfield Lane (2016),,,,,1.551852,,,,,
10 Things I Hate About You (1999),,0.609375,,,,,,,0.338633,


In [29]:
item_similarity_adjusted = adjusted_cosine_similarity(piv)

In [30]:
user_similarity_adjusted = adjusted_cosine_similarity(piv.T)

In [31]:
item_sim_adj_df = pd.DataFrame(item_similarity_adjusted, index = piv_adj.index, columns = piv_adj.index)
item_sim_adj_df.head()

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),[REC] (2007),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",1.0,-0.013463,-0.029284,0.0,-0.013758,0.032021,0.15909,-0.028624,0.0,-0.05677,...,-0.133334,-0.077794,0.006028,-0.019345,-0.018213,0.0,0.011224,0.053524,0.0,-0.071108
(500) Days of Summer (2009),-0.013463,1.0,0.096041,0.052195,0.019758,-0.086357,-0.033997,-0.011918,-0.031756,0.071598,...,0.033554,0.25098,-0.002781,0.024014,-0.001126,-0.054452,-0.12795,0.008364,-0.035873,0.005437
*batteries not included (1987),-0.029284,0.096041,1.0,0.137916,-0.042369,0.0,-0.101073,0.013752,0.0,0.008479,...,-0.019781,-0.08151,-0.036965,0.104383,0.0,0.0,-0.027369,0.198262,0.0,-0.020978
10 Cloverfield Lane (2016),0.0,0.052195,0.137916,1.0,-0.00381,-0.053141,-0.004067,-0.225013,0.0,0.096075,...,0.0,-0.039538,0.010682,0.009534,-0.028213,0.010827,0.042195,0.055918,-0.061019,0.005202
10 Things I Hate About You (1999),-0.013758,0.019758,-0.042369,-0.00381,1.0,0.019754,-0.084563,-0.015927,-0.048479,-0.047686,...,-0.033594,0.007322,0.116893,-0.073864,-0.048665,-0.006352,-0.16062,0.035357,0.025081,0.194862


In [32]:
user_sim_adj_df = pd.DataFrame(user_similarity_adjusted, index = piv_adj.columns, columns = piv_adj.columns)
user_sim_adj_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.005356,-0.047354,0.0052,0.018511,0.010302,-0.011764,0.027983,0.03455,-0.019557,...,0.030594,0.00017,-0.084563,-0.016493,-0.067884,-0.019646,0.036946,-0.095194,-0.005878,0.058422
2,0.005356,1.0,0.0,-0.019007,0.09035,-0.009844,0.008776,-0.035974,0.0,-0.002071,...,-0.064156,-0.020064,-0.004198,0.0,0.0,0.026022,-0.015414,0.000677,0.052549,0.058009
3,-0.047354,0.0,1.0,-0.01393,-0.035853,0.015929,0.0,-0.035729,0.0,0.0,...,-0.067234,-0.013533,0.028565,0.0,0.001178,-0.061151,-0.000765,0.007509,0.0,0.014367
4,0.0052,-0.019007,-0.01393,1.0,-0.059798,0.013888,0.071245,0.025394,-0.011,0.10423,...,-0.083475,0.058255,-0.01532,-0.005828,0.037585,-0.02191,0.011648,-0.030031,0.00717,-0.038193
5,0.018511,0.09035,-0.035853,-0.059798,1.0,-0.001414,0.003465,-0.128999,0.0,-0.029641,...,-0.010443,-0.064583,0.015001,-0.029221,-0.059299,0.011099,-0.001007,-0.024429,0.122453,-0.013359


In [33]:
user_similarity = cosine_similarity(piv_cos_sparse)
user_sim_df = pd.DataFrame(user_similarity, index = piv_adj.index, columns = piv_adj.index)

In [34]:
def top_movies(title):
    count = 1
    print('Similar shows to {} include:\n'.format(title))
    for item in item_sim_adj_df.sort_values(by = title, ascending = False).index[1:11]:
        print('No. {}: {}'.format(count, item))
        count +=1 

In [35]:
top_movies('10 Cloverfield Lane (2016)')

Similar shows to 10 Cloverfield Lane (2016) include:

No. 1: Inherent Vice (2014)
No. 2: Bicentennial Man (1999)
No. 3: Limitless (2011)
No. 4: Kung Fury (2015)
No. 5: Enemy (2013)
No. 6: Mary and Max (2009)
No. 7: Sicario (2015)
No. 8: Untitled Spider-Man Reboot (2017)
No. 9: Pacific Rim (2013)
No. 10: Man Bites Dog (C'est arrivé près de chez vous) (1992)


In [36]:
# This function constructs a list of lists containing the highest rated shows per similar user
# and returns the name of the show along with the frequency it appears in the list
import operator

def similar_user_recs(user):
    
    if user not in piv_adj.columns:
        return('No data available on user {}'.format(user))
    
    sim_users =user_sim_adj_df.sort_values(by=user, ascending=False).index[1:5]
    print(sim_users)
    best = []
    most_common = {}
    
    for i in sim_users:
        max_score = piv_adj.loc[:, i].max()
        print(max_score)
        best.append(piv_adj[piv_adj.loc[:, i]==max_score].index.tolist())
    for i in range(len(best)):
        for j in best[i]:
            if j in most_common:
                most_common[j] += 1
            else:
                most_common[j] = 1
    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[:20]    

In [37]:
piv_adj.loc[:, 5]

title
'burbs, The (1989)                  NaN
(500) Days of Summer (2009)         NaN
*batteries not included (1987)      NaN
10 Cloverfield Lane (2016)          NaN
10 Things I Hate About You (1999)   NaN
                                     ..
[REC] (2007)                        NaN
eXistenZ (1999)                     NaN
xXx (2002)                          NaN
xXx: State of the Union (2005)      NaN
¡Three Amigos! (1986)               NaN
Name: 5, Length: 3650, dtype: float64

In [38]:
si3rs = item_sim_adj_df.sort_values(by='10 Cloverfield Lane (2016)', ascending=False)
si3rs.shape

(3650, 3650)

In [39]:
piv_adj.iloc[3,3]

nan

In [40]:
piv.head(3)

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),[REC] (2007),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,4.0
2,,,,,,,,,,,...,,,3.0,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,


In [41]:
piv_adj.loc['10 Cloverfield Lane (2016)':'Lost in Translation (2003)',3]

title
10 Cloverfield Lane (2016)                               NaN
10 Things I Hate About You (1999)                        NaN
10,000 BC (2008)                                         NaN
101 Dalmatians (1996)                                    NaN
101 Dalmatians (One Hundred and One Dalmatians) (1961)   NaN
                                                          ..
Lost Highway (1997)                                      NaN
Lost World: Jurassic Park, The (1997)                    NaN
Lost in La Mancha (2002)                                 NaN
Lost in Space (1998)                                     NaN
Lost in Translation (2003)                               NaN
Name: 3, Length: 1958, dtype: float64

In [42]:
piv.loc[41,'Lost in Translation (2003)']

3.5

In [43]:
"""
Item-based prediction
calculates the weighted average of k similar movies to determine a potential rating for an input user and movie
"""
def ib_predicted_rating(movie_name, user, k):
    sim_movies = item_sim_adj_df.sort_values(by=movie_name, ascending=False).index[1:k]
    movie_values = item_sim_adj_df.sort_values(by=movie_name, ascending=False).loc[:,movie_name].tolist()[1:k]
    #print(sim_movies)
    rating_list = []
    weight_list = []
    for j, i in enumerate(sim_movies):
        rating = piv.loc[user, i]
        similarity = movie_values[j]
        if np.isnan(rating):
            continue
        elif not np.isnan(rating):
            #print("rat",rating)
            #print("sim",similarity)
            rating_list.append(rating*similarity)
            weight_list.append(similarity)
            #print("r",(rating_list))
            #print("w",weight_list)
    if (sum(weight_list)==0):
        return -1
    else:
        return sum(rating_list)/sum(weight_list)    

In [44]:
ib_predicted_rating('Lost in Translation (2003)',17,50)

4.349833819586118

In [45]:
"""
User-based prediction
calculates the weighted average of k similar users to determine a potential rating for an input user and movie
"""
def ub_predicted_rating(movie_name, user, k):
    sim_users = user_sim_adj_df.sort_values(by=user, ascending=False).index[1:k]
    user_values = user_sim_adj_df.sort_values(by=user, ascending=False).loc[:,user].tolist()[1:k]
    #print(sim_users)
    rating_list = []
    weight_list = []
    for j, i in enumerate(sim_users):
        rating = piv.loc[i, movie_name]
        similarity = user_values[j]
        if np.isnan(rating):
            continue
        elif not np.isnan(rating):
            #print("rat",rating)
            #print("sim",similarity)
            rating_list.append(rating*similarity)
            weight_list.append(similarity)
            #print("r",(rating_list))
    if (sum(weight_list)==0):
        return -1
    else:
        return sum(rating_list)/sum(weight_list)  

In [46]:
ub_predicted_rating('Lost in Translation (2003)',331,100)

4.309352467412565

In [253]:
movie_pr[0].append(2)
movie_pr[1].append(23)
print(movie_pr)

[[4, 5, 4, 3, 4, 3, 2], [3, 4, 3, 2, 3, 1, 23]]


In [47]:
movie_list = (list(piv.columns))

In [264]:
movie_pr = list([[4,5,4,3,4,3],[3,4,3,2,3,1]])

In [301]:
movie_pr.append([])
print(movie_pr)

[[], []]


In [308]:
movies = 'brie', 'leian', 'gkavo'

In [342]:
movie_pr =[[]]

In [343]:
users = [1,4,5,8,12]
for i,movie in enumerate(movies):
    print(movie)
    print(movie_pr)
    for user in (users):
        print(i)
        if (user>3):
            movie_pr[i].append(user)
        else:
            movie_pr[i].append('nan')
    if (i!=len(movies)-1):
         movie_pr.append([])
   


brie
[[]]
0
0
0
0
0
leian
[['nan', 4, 5, 8, 12], []]
1
1
1
1
1
gkavo
[['nan', 4, 5, 8, 12], ['nan', 4, 5, 8, 12], []]
2
2
2
2
2


In [344]:
movie_pr

[['nan', 4, 5, 8, 12], ['nan', 4, 5, 8, 12], ['nan', 4, 5, 8, 12]]

In [282]:
def make_predictions(movie_list, user_list):
    
    for movie_name in movie_list:
        movie_pred = [[]]
        print(movie_name)
        for i,user in enumerate(user_list)
        for user in user_list:
            
            if (ib_predicted_rating(movie_name, user, 1000)>2.5):
                movie_pred[user-1].append(ub_predicted_rating(movie_name, user, 200))
                print(movie_pred)
            else:
                movie_pred[user-1].append('nan')
        movie_pred.append([])
    return movie_pred


In [49]:
make_predictions(['Kiss Kiss Bang Bang (2005)'], [78])

4.116989160352662

### 2. Split data 90/10

In [52]:
train = piv.sample(frac=0.1).reset_index(drop=True)


In [60]:
mov = list(train.columns)

In [61]:
usr = list(train.index)

In [232]:
from random import sample

sample1 = mov[:90]
sample2 = usr[:40]
print(sample1)

["'burbs, The (1989)", '(500) Days of Summer (2009)', '*batteries not included (1987)', '10 Cloverfield Lane (2016)', '10 Things I Hate About You (1999)', '10,000 BC (2008)', '101 Dalmatians (1996)', '101 Dalmatians (One Hundred and One Dalmatians) (1961)', '102 Dalmatians (2000)', '12 Angry Men (1957)', '12 Years a Slave (2013)', '127 Hours (2010)', '13 Going on 30 (2004)', '13 Hours (2016)', '13th Warrior, The (1999)', '1408 (2007)', '15 Minutes (2001)', '16 Blocks (2006)', '17 Again (2009)', '1984 (Nineteen Eighty-Four) (1984)', '2 Days in the Valley (1996)', '2 Fast 2 Furious (Fast and the Furious 2, The) (2003)', '20 Dates (1998)', '20,000 Leagues Under the Sea (1954)', '200 Cigarettes (1999)', '2001: A Space Odyssey (1968)', '2010: The Year We Make Contact (1984)', '2012 (2009)', '2046 (2004)', '21 (2008)', '21 Grams (2003)', '21 Jump Street (2012)', '22 Jump Street (2014)', '24 Hour Party People (2002)', '25th Hour (2002)', '27 Dresses (2008)', '28 Days (2000)', '28 Days Later (

In [283]:
y_pred = make_predictions(sample1, [1,2,3,4])

'burbs, The (1989)
[[3.7412897271814507]]


IndexError: list index out of range

In [281]:
y_pred

[[4.495429294140831], [3.6883366655406165], ['nan'], [3.2897843065861156]]

In [113]:
y_pred[:]

[3.393363796717929, 5.0, 3.0651940103745887]

In [86]:
piv.iloc[:,78]

userId
1      NaN
2      NaN
3      NaN
4      NaN
5      NaN
      ... 
606    2.0
607    NaN
608    2.0
609    NaN
610    NaN
Name: Ace Ventura: When Nature Calls (1995), Length: 610, dtype: float64

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

precision_score(y_test, y_test_pred, average='weighted')

In [62]:
make_predictions(mov, usr)

KeyError: 0

In [None]:
print(X_train.shape)
print(X_test.shape)

In [None]:
# initialize
a = []

# create the table (name, age, job)
a.append(["Nick", 30, "Doctor"])
a.append(["John",  8, "Student"])
a.append(["Paul", 22, "Car Dealer"])
a.append(["Mark", 66, "Retired"])    

# sort the table by age
import operator
a.sort(key=operator.itemgetter(2)) 
#a.sort

# print the table
print(a)

In [None]:
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import pdist, squareform
 
piv_jac = piv.copy()
piv_jac.fillna(0, inplace=True)
piv_jac.to_numpy()

In [None]:
# Calculate all pairwise distances
jaccard_distances = pdist(piv_jac, metric='jaccard')
 
# Convert the distances to a square matrix
jaccard_distances = squareform(jaccard_distances)
jaccard_similarity = 1-jaccard_distances
jaccard_similarity

In [None]:
(piv_jac.to_numpy()[0])

In [None]:
k=np.round(piv_jac.to_numpy()[0],0)
j=np.round(piv_jac.to_numpy()[1],0)
l=np.round([3,2,5.67],0)

In [None]:
l

In [None]:
jaccard_score(k,j, average='weighted')

In [None]:
x = [0,1,0,0,0,1,0,0,1]
y = [0,0,1,0,0,0,0,0,1]
z = [1,1,0,0,0,1.0,0,0,0]

jaccard_score(x,y)

In [None]:
from sklearn.metrics.pairwise import pairwise_distances
jac_sim = 1 - pairwise_distances(piv_jac.T, metric = "hamming")
# optionally convert it to a DataFrame
jac_sim = pd.DataFrame(jac_sim, index=piv_jac.columns, columns=piv_jac.columns)

In [None]:
jac_sim

In [None]:
def jaccard_binary(x,y):
    """A function for finding the similarity between two binary vectors"""
    intersection = np.logical_and(x, y)
    union = np.logical_or(x, y)
    similarity = intersection.sum() / float(union.sum())
    return similarity

In [None]:
np.random.seed(0)
df = pd.DataFrame(np.random.binomial(1, 0.5, size=(100, 5)), columns=list('ABCDE'))
print(df.head())



In [None]:
from sklearn.metrics import jaccard_score
print(jaccard_score(round(piv_jac.iloc[:,1]), round(piv_jac.iloc[:,0]), average='micro'))


In [None]:
jaccard_binary(df['B'], df['A'])

In [None]:
print(1 - pairwise_distances(df.T, metric = "hamming"))
