# Chapter 2 - NEIGHBORHOOD-BASED COLLABORATIVE FILTERING

The code below is my attempt at implementing the simple rating predictor that is based on the ratings of similar users.
It covers Chapter 2.3 of the book.

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt
%matplotlib inline

In [147]:
# Recreate the ratings table from the book.
# Each row is [userId, m_1, ..., m_6]; np.nan marks a movie the user has not rated.
# (np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.)

data = [
    [1, 7, 6, 7, 4, 5, 4],
    [2, 6, 7, np.nan, 4, 3, 4],
    [3, np.nan, 3, 3, 1, 1, np.nan],
    [4, 1, 2, 2, 3, 3, 4],
    [5, 1, np.nan, 1, 2, 3, 3]
]

In [148]:
# Build the user x movie ratings matrix: one row per user (indexed by userId), one column per movie.
ratings = pd.DataFrame(
    data,
    columns=['userId', 'm_1', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6'],
).set_index('userId')
ratings

Unnamed: 0_level_0,m_1,m_2,m_3,m_4,m_5,m_6
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,7.0,6.0,7.0,4,5,4.0
2,6.0,7.0,,4,3,4.0
3,,3.0,3.0,1,1,
4,1.0,2.0,2.0,3,3,4.0
5,1.0,,1.0,2,3,3.0


In [149]:
# Mean rating of every user, taken across the movie columns.
# Series.mean skips NaN by default, so only the movies a user actually rated are averaged.

users_avg_rating = ratings.mean(axis='columns')
users_avg_rating

userId
1    5.5
2    4.8
3    2.0
4    2.5
5    2.0
dtype: float64

In [150]:
# Mean-centre the ratings (each user's mean rating becomes 0); this is optional but according to the book,
# predictions made from mean-centred ratings tend to be a bit better.
# DataFrame.sub with axis=0 aligns users_avg_rating with the userId index and subtracts it from every column,
# which replaces the slower row-by-row apply.

normalized_ratings = ratings.sub(users_avg_rating, axis=0)
normalized_ratings

Unnamed: 0_level_0,m_1,m_2,m_3,m_4,m_5,m_6
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.5,0.5,1.5,-1.5,-0.5,-1.5
2,1.2,2.2,,-0.8,-1.8,-0.8
3,,1.0,1.0,-1.0,-1.0,
4,-1.5,-0.5,-0.5,0.5,0.5,1.5
5,-1.0,,-1.0,0.0,1.0,1.0


In [219]:
def pearson_coeff(u, v):
    '''
    Calculates the Pearson coefficient (i.e. similarity) between two users.

    The code is based on formula 2.2 from the book. Both inputs are expected to be
    mean-centred already (each user's own average rating subtracted), so the
    coefficient reduces to the cosine of the two vectors restricted to the movies
    rated by both users.

    Parameters
    ----------
    u : Series with (mean-centred) ratings by user "u"
    v : Series with (mean-centred) ratings by user "v"

    Returns
    -------
    The coefficient in [-1, 1], or 0 when it is undefined: either the users have no
    movie in common, or one of them has only zero deviations on the common movies.
    '''

    # Restrict both users to the movies that each of them has rated
    u_common = u[u.notna() & v.notna()]

    if u_common.size == 0:
        return 0

    v_common = v.loc[u_common.index]

    # Numerator of formula 2.2: sum of the products of the paired deviations
    numerator = (u_common * v_common).sum()

    # Denominator: product of the two vectors' Euclidean norms
    denominator = sqrt((u_common ** 2).sum()) * sqrt((v_common ** 2).sum())

    if denominator == 0:
        # A user whose common ratings all equal their mean carries no similarity
        # information; report "no correlation" instead of dividing by zero (NaN).
        return 0

    return numerator / denominator

In [216]:
# Sanity check against the book -- the Pearson coefficient between users 1 and 3 should be about 0.89
pearson_coeff(u=normalized_ratings.loc[1], v=normalized_ratings.loc[3])

0.8944271909999159

We're going to define the three main functions now:

    - get_similarities() -- returns the Pearson coefficients between the given user and every user

    - get_similar_users_who_rated_given_movie() -- returns the set of users
            who are positively correlated with the target user and who also
            rated the movie that we are trying to get a recommendation for

    - predict_rating() -- the main function that we will be calling. Predicts
            the rating of the given movie for the given user.


In [210]:
def get_similarities(user_id):
    '''
    Calculates similarities between given user ID and other users.

    Parameters
    ----------
    user_id : index label of the target user in normalized_ratings

    Returns
    -------
    Series indexed like normalized_ratings holding the Pearson coefficient between
    the target user and each user (the target user's own row is included).
    '''
    # Look the target row up once instead of once per compared user
    target_ratings = normalized_ratings.loc[user_id]

    # With axis=1, apply passes each user's row as a Series, so it can be compared directly
    return normalized_ratings.apply(
        lambda other_ratings: pearson_coeff(target_ratings, other_ratings),
        axis=1
    )

def get_similar_users_who_rated_given_movie(similarities, movie_id, k = 50):
    '''
    Returns set of Users who are positively correlated (according to similarities array)
    and who rated the given movie.

    Parameters
    ----------
    similarities : Series of similarity scores indexed by user ID
    movie_id : column label of the movie in normalized_ratings
    k : maximum number of users to return (the most similar ones are kept)

    Returns
    -------
    set with the IDs of at most k users, chosen as the most similar among the users
    that have a positive similarity and a rating for movie_id.
    '''

    # Users that have a rating for this movie
    rated_movie = normalized_ratings.index[normalized_ratings[movie_id].notna()]

    # Only positive correlations (NaN similarities fail the comparison and are dropped too)
    positive = similarities[similarities > 0]

    # Filter to raters BEFORE taking the top k, so that the k closest users *who rated
    # the movie* are kept; isin() is used because set operations via `&` on Index
    # objects are no longer supported in current pandas.
    candidates = positive[positive.index.isin(rated_movie)]

    return set(candidates.sort_values(ascending=False).head(k).index)

def predict_rating(user_id, movie_id):
    '''
    Predicts the rating of given movie for the given user.

    Parameters
    ----------
    user_id : index label of the user in ratings
    movie_id : column label of the movie in ratings

    Returns
    -------
    The existing rating if the user already rated the movie. Otherwise the user's mean
    rating plus the similarity-weighted average of the neighbours' mean-centred ratings
    (formula 2.4 from the book). If no suitable neighbour exists, the user's mean rating
    is returned.
    '''

    known_rating = ratings.loc[user_id, movie_id]
    if not np.isnan(known_rating):
        return known_rating

    similarities = get_similarities(user_id)

    # .loc does not accept a set as an indexer in current pandas, so convert to a list
    P = list(get_similar_users_who_rated_given_movie(similarities, movie_id))

    mu = users_avg_rating.loc[user_id]

    if not P:
        # No positively correlated user rated this movie: fall back to the user's
        # own average instead of dividing by zero.
        return mu

    neighbour_similarities = similarities.loc[P]
    neighbour_ratings = normalized_ratings.loc[P, movie_id]

    # This is basically using the formula 2.4 from the book
    return mu + (neighbour_similarities * neighbour_ratings).sum() / neighbour_similarities.sum()

Finally, let's test what we've got. According to the book, the predicted rating of user 3 should be about 3.35 for movie 1 and about 0.86 for movie 6.

In [217]:
predict_rating(user_id=3, movie_id='m_1') # Predicted rating of movie 1 for user 3

3.3463952993809016

In [218]:
predict_rating(user_id=3, movie_id='m_6') # Predicted rating of movie 6 for user 3

0.8584109681112306

Looks like we've got the correct results. Woohoo!