In [1]:
# Idea: find similar users and recommend movies based on that

In [100]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt
%matplotlib inline

In [3]:
# Read the movie catalogue and the per-user ratings from the CSV files
# that sit next to this notebook.
movies, ratings = (
    pd.read_csv(csv_path) for csv_path in ("./movies.csv", "./ratings.csv")
)

In [6]:
# Let's start with user-based collaborative filtering. The point is to find users similar to us
# and then to give recommendations based on what THEY liked.

In [7]:
# Peek at the first rows of the movie catalogue (head() shows 5 rows by default)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Peek at the first rows of the raw ratings (head() shows 5 rows by default)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [10]:
# Attach title/genres to every rating row.
# Only the raw rating columns are fed into the merge so that re-running this cell
# is idempotent: merging the already-merged frame again would otherwise produce
# `title_x`/`title_y` columns and break every later cell that reads `title`.
ratings = pd.merge(
    movies,
    ratings[['userId', 'movieId', 'rating', 'timestamp']],
    on='movieId',
)

In [11]:
# Peek at the ratings now that each row also carries the movie title and genres
ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [12]:
# User x movie matrix: one row per user, one column per movie title,
# cells hold the rating (NaN where the user has not rated the movie).
user_ratings = pd.pivot_table(
    ratings,
    values='rating',
    index='userId',
    columns='title',
)

In [13]:
 # Show the first ten users of the user x movie matrix
 # (`v` is not defined at notebook level; the matrix is called `user_ratings`).
 user_ratings.head(10)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,1.0,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [16]:
# Average rating of each user, taken over the movies that user actually rated
# (NaN cells are skipped by mean()).
users_avg_rating = user_ratings.mean(axis='columns')

# Short alias (mu) used by the similarity function below
mu = users_avg_rating

In [58]:
# Sanity check: average rating of the user whose ID is 5 (label lookup, not position)
users_avg_rating.loc[5]

3.6363636363636362

In [159]:
def pearson_coeff(u_userId, u, v_userId, v, user_means=None):
    '''
    Calculates the Pearson correlation coefficient between two users.

    Only the movies rated by BOTH users (the intersection) take part in the sums.

    Note that the users' overall average ratings are used as the means of "u" and "v".
    The stricter approach would be to average over the co-rated movies only, but,
    according to the Book (page 35), this generalized approach is acceptable.

    Parameters
    ----------
    u_userId : ID of the user "u"
    u : Series with ratings by user "u" (NaN where the movie is not rated)
    v_userId : ID of the user "v"
    v : Series with ratings by user "v", indexed like "u"
    user_means : optional mapping/Series from user ID to that user's average rating.
        Defaults to the notebook-level "mu" Series when omitted.

    Returns
    -------
    The Pearson coefficient, or 0 when it is undefined: either the users share no
    rated movies, or one of them has no deviation from their mean on the shared movies.
    '''

    means = mu if user_means is None else user_means
    u_mu = means[u_userId]
    v_mu = means[v_userId]

    # Labels of the movies rated by both users
    shared_movies = u[u.notna() & v.notna()].index

    if shared_movies.size == 0:
        return 0

    # Mean-centered ratings restricted to the shared movies
    u_dev = u[shared_movies] - u_mu
    v_dev = v[shared_movies] - v_mu

    # Numerator of Pearson's formula: sum of the products of the deviations
    numerator = (u_dev * v_dev).sum()

    # Denominator: product of the root sums of squared deviations
    denominator = sqrt((u_dev ** 2).sum()) * sqrt((v_dev ** 2).sum())

    # A zero denominator would turn the coefficient into NaN, which then pollutes
    # the ranking of similar users; treat it as "no measurable correlation".
    if denominator == 0:
        return 0

    return numerator / denominator

In [163]:
# ID of the user for whom we want to recommend movies
target_userId = 1

# That user's row of the user x movie matrix (a Series indexed by movie title)
target_user_ratings = user_ratings.loc[target_userId, :]

In [169]:
def _similarity_to_target(candidate_ratings):
    '''Pearson coefficient between the target user and one row of user_ratings.'''
    return pearson_coeff(
        target_userId,
        target_user_ratings,
        candidate_ratings.name,  # the row label is the candidate's userId
        candidate_ratings,
    )


# Series indexed by userId holding each user's correlation with the target user
similar_users = user_ratings.apply(_similarity_to_target, axis=1)



In [170]:
# Five users with the highest correlation to the target user
ranked_similar_users = similar_users.sort_values(ascending=False)
ranked_similar_users.head(5)

userId
1      1.0
77     1.0
12     1.0
388    1.0
291    1.0
dtype: float64

In [171]:
# Construct a dataframe where index are userIds, and columns are all the same movies rated by these users

In [172]:
# Reminder of what the user x movie matrix looks like (first three users)
user_ratings.head(n=3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,


In [175]:
# Now assume that we want to predict the target user's rating for a single movie
# (the next cell uses "Terminator 2: Judgment Day (1991)").
# Among the most similar users, we pick those who HAVE rated this movie.

In [268]:
# Movie whose rating we want to predict for the target user
target_movie_name = "Terminator 2: Judgment Day (1991)"

# Rows of the merged ratings frame that belong to that title
target_movie_rows = ratings[ratings['title'] == target_movie_name]

# Derive the ID from the title instead of hard-coding it, so the two can never
# drift apart (previously an unused ID was stored while another one was used inline).
target_movie = target_movie_rows['movieId'].iloc[0]

# Neighbourhood size: how many of the most correlated users are considered
k_neighbours = 100
top_k_similar_users = similar_users.sort_values(ascending=False).head(k_neighbours)

# Users among the top-k most correlated ones who rated the target movie
uids_who_rated_target_movie = list(
    set(target_movie_rows['userId']) & set(top_k_similar_users.index)
)

# Ratings those users gave to the target movie
P = user_ratings.loc[uids_who_rated_target_movie, target_movie_name]

# Mean-centering: subtract each user's own average rating.
# The subtraction aligns on userId, so no row-wise apply is needed.
P = P - users_avg_rating.loc[P.index]

In [272]:
# Mean-centered ratings of the target movie, indexed by the neighbour's userId
P

userId
450    1.000000
72     0.344444
297    0.402778
75     1.268116
173   -0.480000
301   -2.820175
302    0.000000
366    0.048387
494    0.772727
178   -0.090909
210   -0.079710
596   -0.495134
597    1.022573
476    1.318841
477    1.263333
dtype: float64

In [293]:
# One row per neighbour (userId):
#   rating      - the neighbour's mean-centered rating of the target movie
#   correlation - the neighbour's correlation with the target user
similar_user_ratings_and_correlation_coeff = pd.DataFrame(
    {
        'rating': P,
        'correlation': similar_users[P.index],
    },
    index=P.index,
)

# Quick look at the result
similar_user_ratings_and_correlation_coeff.head(5)

Unnamed: 0_level_0,rating,correlation
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
450,1.0,0.53682
72,0.344444,0.579855
297,0.402778,0.58368
75,1.268116,0.41554
173,-0.48,0.440353


In [296]:
# Correlation-weighted average of the neighbours' mean-centered ratings.
# This is the predicted OFFSET from the target user's own average rating,
# not yet the final predicted rating.
neighbours = similar_user_ratings_and_correlation_coeff
weighted_offsets = (neighbours['rating'] * neighbours['correlation']).sum()

# Normalise by the absolute correlations: with a plain sum, negatively
# correlated neighbours would shrink (or even flip the sign of) the denominator.
weighted_offsets / neighbours['correlation'].abs().sum()

0.1733218872568476

In [194]:
def calc_predicted_rating(u_uid, target_movie_id, correlated_users):
    '''
    Unfinished helper intended to predict a user's rating for a movie.

    At the moment it only looks up the average rating of user "u_uid" in the
    notebook-level "mu" Series and implicitly returns None.

    Parameters
    ----------
    u_uid : ID of the user the prediction is for
    target_movie_id : ID of the movie to predict (not used yet)
    correlated_users : users correlated with "u_uid" (not used yet)

    TODO: combine the user's average with the correlation-weighted, mean-centered
    ratings of the correlated users and return the result.
    '''
    # The lookup previously used the undefined name `u`, which raised a NameError
    u_mu = mu[u_uid]
    

In [208]:
# Index the movie catalogue by movieId.
# The guard keeps the cell re-runnable: once `movieId` is the index it is no longer
# a column, so calling set_index a second time would raise a KeyError.
if movies.index.name != 'movieId':
    movies = movies.set_index('movieId')