In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
plotly.offline
%matplotlib inline

In [7]:
# Column labels for the ratings table; the leading "user" column becomes the index.
movies = [
    "user",
    "Friday the 13th",
    "Nightmare on Elm St",
    "Dawn of the Dead",
    "Hiro Dreams of Sushi",
    "180 South",
    "Exit Through the Giftshop",
]

# One row per user, in the same order as `movies`; None marks a movie the user has not rated.
rating_rows = [
    ("Chuck", 5, 4, None, None, None, 1),
    ("Nancy", 5, None, 4, None, 2, None),
    ("Anya", 4, 5, 5, None, 1, None),
    ("Divya", 1, None, 2, 5, 4, 5),
    ("Pat", 1, 1, 1, None, 3, 4),
]

# users x movies rating matrix, indexed by user name
users = pd.DataFrame(rating_rows, columns=movies).set_index("user")

In [8]:
users

Unnamed: 0_level_0,Friday the 13th,Nightmare on Elm St,Dawn of the Dead,Hiro Dreams of Sushi,180 South,Exit Through the Giftshop
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Chuck,5,4.0,,,,1.0
Nancy,5,,4.0,,2.0,
Anya,4,5.0,5.0,,1.0,
Divya,1,,2.0,5.0,4.0,5.0
Pat,1,1.0,1.0,,3.0,4.0


In [16]:
# user-based collaborative filtering
# step 1: calculate similarity between users

# center the ratings per user
def mean_center_rows(df):
    """Subtract each row's mean from every entry of that row.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame. Missing entries (NaN) are skipped when the row mean is
        computed and remain NaN in the result.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df`` whose rows each
        have their own mean removed.
    """
    row_means = df.mean(axis=1)
    # axis=0 aligns the per-row means with the row labels of df
    return df.sub(row_means, axis=0)

# Mean-center every user's ratings, then replace the remaining NaN (unrated
# movies) with 0, i.e. "no deviation from that user's own average".
# fillna is chained instead of called with inplace=True so user_mc is built in
# a single step and never exists in a half-processed state.
user_mc = mean_center_rows(users).fillna(0)
user_mc

Unnamed: 0_level_0,Friday the 13th,Nightmare on Elm St,Dawn of the Dead,Hiro Dreams of Sushi,180 South,Exit Through the Giftshop
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Chuck,1.666667,0.666667,0.0,0.0,0.0,-2.333333
Nancy,1.333333,0.0,0.333333,0.0,-1.666667,0.0
Anya,0.25,1.25,1.25,0.0,-2.75,0.0
Divya,-2.4,0.0,-1.4,1.6,0.6,1.6
Pat,-1.0,-1.0,-1.0,0.0,1.0,2.0


In [21]:
# sanity check: cosine similarity between the first two users (Chuck and Nancy)
from sklearn.metrics.pairwise import cosine_similarity

# list-style iloc keeps each selection as a 1-row, 2-D frame, which is the
# shape cosine_similarity expects
first_user = user_mc.iloc[[0]]
second_user = user_mc.iloc[[1]]
cosine_similarity(first_user, second_user)

array([[0.34942828]])

In [25]:
# pairwise cosine similarity between all users (one row of user_mc per user)
sim_mat = cosine_similarity(user_mc)

# label both axes with the user names so entries can be looked up by name
user_labels = users.index
users_sim = pd.DataFrame(data=sim_mat, index=user_labels, columns=user_labels)
users_sim

user,Chuck,Nancy,Anya,Divya,Pat
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Chuck,1.0,0.349428,0.129503,-0.723026,-0.840673
Nancy,0.349428,1.0,0.752993,-0.594588,-0.545545
Anya,0.129503,0.752993,1.0,-0.335791,-0.59308
Divya,-0.723026,-0.594588,-0.335791,1.0,0.739574
Pat,-0.840673,-0.545545,-0.59308,0.739574,1.0


In [26]:
# Q: what would Nancy rate "Nightmare on Elm St"?
# idea: ask the people most similar to Nancy and summarize their ratings.

# Nancy's similarity to every other user (her own entry removed), keeping only
# the positively similar ones (> 0)
nancy_sim = (
    users_sim['Nancy']
    .drop(index='Nancy')
    .loc[lambda sim: sim > 0]
)
nancy_sim

user
Chuck    0.349428
Anya     0.752993
Name: Nancy, dtype: float64

In [28]:
# normalize the similarity scores so they sum to 1 and can be used as weights
nancy_scores = nancy_sim.to_numpy()
nancy_weights = nancy_scores / nancy_scores.sum()
nancy_weights

array([0.3169643, 0.6830357])

In [34]:
# get the "Nightmare on Elm St" ratings of exactly the users kept in nancy_sim.
# Selecting by label (rather than taking the first two rows) keeps the ratings
# in the same order as nancy_sim, which nancy_weights is derived from, even if
# the set of similar users changes.
# NOTE: a similar user who has not rated this movie would contribute NaN here.
elm_ratings = users['Nightmare on Elm St'].loc[nancy_sim.index]
elm_ratings

user
Chuck    4.0
Anya     5.0
Name: Nightmare on Elm St, dtype: float64

In [35]:
# Nancy's predicted score: her neighbours' ratings weighted by similarity
neighbour_ratings = elm_ratings.to_numpy()
np.dot(neighbour_ratings, nancy_weights)

4.683035701380843

In [39]:
# item-based variant: transpose the ratings so that each row is a movie
movies = users.T

# center each movie's ratings, zero-fill the gaps, then compare movies pairwise.
# A movie with a single rating centers to an all-zero row, so every similarity
# involving it (including with itself) comes out as 0.
movies_mc = mean_center_rows(movies).fillna(0)
sim_mat = cosine_similarity(movies_mc)

movie_labels = movies.index
movies_sim = pd.DataFrame(data=sim_mat, index=movie_labels, columns=movie_labels)
movies_sim

Unnamed: 0,Friday the 13th,Nightmare on Elm St,Dawn of the Dead,Hiro Dreams of Sushi,180 South,Exit Through the Giftshop
Friday the 13th,1.0,0.635369,0.771517,0.0,-0.709208,-0.773492
Nightmare on Elm St,0.635369,1.0,0.859338,0.0,-0.557007,-0.358974
Dawn of the Dead,0.771517,0.859338,1.0,0.0,-0.848528,-0.322252
Hiro Dreams of Sushi,0.0,0.0,0.0,0.0,0.0,0.0
180 South,-0.709208,-0.557007,-0.848528,0.0,1.0,0.430414
Exit Through the Giftshop,-0.773492,-0.358974,-0.322252,0.0,0.430414,1.0


In [42]:
# movies positively similar (> 0) to "Nightmare on Elm St", its own entry removed
elm_title = 'Nightmare on Elm St'
elm_sim = (
    movies_sim[elm_title]
    .drop(index=elm_title)
    .loc[lambda sim: sim > 0]
)
elm_sim


Friday the 13th     0.635369
Dawn of the Dead    0.859338
Name: Nightmare on Elm St, dtype: float64

In [43]:
# normalize the movie similarities so they sum to 1 and can be used as weights
elm_scores = elm_sim.to_numpy()
elm_weights = elm_scores / elm_scores.sum()
elm_weights

array([0.42507927, 0.57492073])

In [45]:
# Nancy's own ratings for the movies similar to "Nightmare on Elm St",
# selected by label so they stay in the same order as elm_sim
nancy_ratings = movies.loc[elm_sim.index, 'Nancy']
nancy_ratings

Friday the 13th     5.0
Dawn of the Dead    4.0
Name: Nancy, dtype: float64

In [46]:
# Nancy's predicted rating for "Nightmare on Elm St": her ratings of the
# similar movies, weighted by how similar each movie is
similar_movie_ratings = nancy_ratings.to_numpy()
np.dot(similar_movie_ratings, elm_weights)

4.425079267585091