In [10]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt, ceil
import math
from sklearn.metrics.pairwise import cosine_similarity

%matplotlib inline
%config Completer.use_jedi = False

In this notebook, we're going to do a quick evaluation of item-based collaborative filtering techniques.

The idea is to predict a user's rating for an item based on the ratings that same user gave to similar items.

In [3]:
# Recreate the data from the book
# Each row is [userId, m_1, m_2, m_3, m_4, m_5, m_6]; np.nan marks a missing rating.
# (np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.)

data = [
    [1, 7, 6, 7, 4, 5, 4],
    [2, 6, 7, np.nan, 4, 3, 4],
    [3, np.nan, 3, 3, 1, 1, np.nan],
    [4, 1, 2, 2, 3, 3, 4],
    [5, 1, np.nan, 1, 2, 3, 3]
]

In [4]:
# Build the user x movie rating table and use the user id as the row index
ratings = (
    pd.DataFrame(data, columns=['userId', 'm_1', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6'])
    .set_index('userId')
)
ratings

Unnamed: 0_level_0,m_1,m_2,m_3,m_4,m_5,m_6
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,7.0,6.0,7.0,4,5,4.0
2,6.0,7.0,,4,3,4.0
3,,3.0,3.0,1,1,
4,1.0,2.0,2.0,3,3,4.0
5,1.0,,1.0,2,3,3.0


In [7]:
# Mean-center every user's ratings: subtract each row's own average rating.
# mean(axis=1) skips missing ratings, and sub(..., axis=0) aligns the per-user
# means with the row index, so this is the vectorized form of a row-wise apply.
normalized_ratings = ratings.sub(ratings.mean(axis=1), axis=0)
normalized_ratings

Unnamed: 0_level_0,m_1,m_2,m_3,m_4,m_5,m_6
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.5,0.5,1.5,-1.5,-0.5,-1.5
2,1.2,2.2,,-0.8,-1.8,-0.8
3,,1.0,1.0,-1.0,-1.0,
4,-1.5,-0.5,-0.5,0.5,0.5,1.5
5,-1.0,,-1.0,0.0,1.0,1.0


In [8]:
# Replace missing (NaN) entries with 0; after mean-centering, 0 corresponds
# to "exactly the user's average rating".
normalized_ratings = normalized_ratings.fillna(value=0)
normalized_ratings

Unnamed: 0_level_0,m_1,m_2,m_3,m_4,m_5,m_6
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.5,0.5,1.5,-1.5,-0.5,-1.5
2,1.2,2.2,0.0,-0.8,-1.8,-0.8
3,0.0,1.0,1.0,-1.0,-1.0,0.0
4,-1.5,-0.5,-0.5,0.5,0.5,1.5
5,-1.0,0.0,-1.0,0.0,1.0,1.0


In [16]:
# Item-to-item cosine similarity on the mean-centered ratings.
# cosine_similarity compares rows, so the table is transposed to put movies on the rows.
# NOTE: because missing ratings were filled with 0, each movie's norm covers every user
# who rated it, not only the users who rated both movies, so the values can differ from
# an adjusted cosine that is restricted to co-rating users.
similarity_coefficients = cosine_similarity(normalized_ratings.transpose())
similarity_coefficients[:, 0:5]

array([[ 1.        ,  0.62413132,  0.71577084, -0.73878026, -0.73832952],
       [ 0.62413132,  1.        ,  0.3744373 , -0.73391041, -0.90509063],
       [ 0.71577084,  0.3744373 ,  1.        , -0.81088939, -0.59028134],
       [-0.73878026, -0.73391041, -0.81088939,  1.        ,  0.70567109],
       [-0.73832952, -0.90509063, -0.59028134,  0.70567109,  1.        ],
       [-0.9896203 , -0.522503  , -0.76097353,  0.72196647,  0.66367597]])

In [17]:
# Wrap the similarity matrix in a DataFrame labelled by movie on both axes,
# so that similarities can be looked up by movie name
similarity_coefficients = pd.DataFrame(
    similarity_coefficients,
    index=normalized_ratings.columns,
    columns=normalized_ratings.columns,
)
similarity_coefficients

Unnamed: 0,m_1,m_2,m_3,m_4,m_5,m_6
m_1,1.0,0.624131,0.715771,-0.73878,-0.73833,-0.98962
m_2,0.624131,1.0,0.374437,-0.73391,-0.905091,-0.522503
m_3,0.715771,0.374437,1.0,-0.810889,-0.590281,-0.760974
m_4,-0.73878,-0.73391,-0.810889,1.0,0.705671,0.721966
m_5,-0.73833,-0.905091,-0.590281,0.705671,1.0,0.663676
m_6,-0.98962,-0.522503,-0.760974,0.721966,0.663676,1.0


In [32]:
# Similarities of every other movie to m_1, most similar first
(
    similarity_coefficients
    .loc['m_1']
    .drop('m_1')
    .sort_values(ascending=False)
    .head(11)
)


m_3    0.715771
m_2    0.624131
m_5   -0.738330
m_4   -0.738780
m_6   -0.989620
Name: m_1, dtype: float64

In [35]:
def get_similar_movies(target_movie, k=10):
    '''Returns up to k movies with a positive similarity to the target movie.

    The result is a Series of similarity coefficients indexed by movie label,
    sorted from most to least similar; the target movie itself is excluded.
    '''
    # Row of the similarity table for the target movie, without its self-similarity
    target_row = similarity_coefficients.loc[target_movie]
    ranked = target_row.drop(target_movie).sort_values(ascending=False)

    # Keep only positively correlated movies, then cap the list at k entries
    is_positive = ranked.gt(0)
    return ranked.loc[is_positive].head(k)

In [41]:
# Movies with a positive similarity to m_1 (default k=10)
get_similar_movies('m_1')

m_3    0.715771
m_2    0.624131
Name: m_1, dtype: float64

In [83]:
def predict_rating(target_user, target_movie):
    '''Predicts rating of target user for target movie (explicit-loop version).

    The prediction is the similarity-weighted average of the ratings the target
    user gave to the movies returned by get_similar_movies(target_movie).

    Returns NaN when the user has rated none of the similar movies, because the
    weighted average is undefined in that case (previously a ZeroDivisionError).

    NOTE: a later cell defines predict_rating again with a vectorized
    implementation; once that cell runs, it shadows this definition.
    '''
    similar_movies = get_similar_movies(target_movie)

    # Out of the similar movies, keep only the ones the target user actually rated
    similar_movies_that_were_rated_by_user = ratings.loc[target_user, similar_movies.index].dropna()

    numerator = 0.0
    denominator = 0.0

    for movie in similar_movies_that_were_rated_by_user.index:
        # Similarity coefficient between this movie and target movie
        movie_similarity = similarity_coefficients.loc[target_movie, movie]

        target_users_rating_of_current_movie = similar_movies_that_were_rated_by_user.loc[movie]

        numerator += movie_similarity * target_users_rating_of_current_movie
        # Sum of similarity coefficients between target movie and similar ones rated by target user
        denominator += movie_similarity

    # No rated neighbours: nothing to average over
    if denominator == 0:
        return float('nan')

    return numerator / denominator

In [114]:
def predict_rating(target_user, target_movie):
    '''Predicts rating of target user for target movie.

    The prediction is the similarity-weighted average of the ratings the target
    user gave to movies that are positively similar to the target movie.

    Returns NaN when the user has rated none of the similar movies, because the
    weighted average is undefined in that case (previously a ZeroDivisionError).
    '''

    # First, find the movies that are similar to target movie
    similar_movies = get_similar_movies(target_movie, k=100)

    # Out of those that are similar, leave only the ones that were rated by given user.
    # These are also exactly the user's ratings that enter the weighted average.
    ratings_for_similar_movies = ratings.loc[target_user, similar_movies.index].dropna()

    # Similarity coefficients between the target movie and the similar movies the user rated
    cosine_similarities = similarity_coefficients.loc[target_movie, ratings_for_similar_movies.index]

    # Numerator of formula 2.15: each rating weighted by how similar its movie is
    # to the target movie, summed over all rated similar movies
    numerator = sum(cosine_similarities * ratings_for_similar_movies)

    # Denominator is just the sum of similarity coefficients
    denominator = sum(cosine_similarities)

    # No rated neighbours: nothing to average over
    if denominator == 0:
        return float('nan')

    return numerator / denominator

In [75]:
# Step-by-step walkthrough for m_1: start with the movies positively similar to m_1
similar_movies = get_similar_movies('m_1')
similar_movies

m_3    0.715771
m_2    0.624131
Name: m_1, dtype: float64

In [76]:
# All ratings given by user 3 (NaN = not rated)
ratings.loc[3]

m_1    NaN
m_2    3.0
m_3    3.0
m_4    1.0
m_5    1.0
m_6    NaN
Name: 3, dtype: float64

In [84]:
# User 3's ratings for the movies similar to m_1, dropping the ones user 3 has not rated
similar_movies_that_were_rated_by_user = (
    ratings
    .loc[3, similar_movies.index]
    .dropna()
)
similar_movies_that_were_rated_by_user

m_3    3.0
m_2    3.0
Name: 3, dtype: float64

In [87]:
# Weighted sum: user 3's ratings times the similarity of each rated movie to m_1
sum(
    similar_movies_that_were_rated_by_user
    * similarity_coefficients.loc['m_1', similar_movies.index]
)

4.019706469108971

In [78]:
# Movie whose rating is being predicted in the manual walkthrough cells
target_movie = 'm_1'

In [79]:
# Look up the similarity between the target movie and each similar movie the user rated.
# A bare expression inside a loop is discarded by the notebook, so print each value
# to make this cell actually show the lookups.
for movie in similar_movies_that_were_rated_by_user.index:
    print(movie, similarity_coefficients.loc[target_movie, movie])

In [80]:
# Re-display the raw ratings table for reference
ratings

Unnamed: 0_level_0,m_1,m_2,m_3,m_4,m_5,m_6
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,7.0,6.0,7.0,4,5,4.0
2,6.0,7.0,,4,3,4.0
3,,3.0,3.0,1,1,
4,1.0,2.0,2.0,3,3,4.0
5,1.0,,1.0,2,3,3.0


In [115]:
# Predict user 3's rating for m_3; the ratings table already holds 3.0 for this
# user/movie pair, so the prediction can be compared against the known rating
predict_rating(3, 'm_3')

3.0000000000000004

In [116]:
# Predict user 3's rating for m_6, which user 3 has not rated (NaN in the ratings table)
predict_rating(3, 'm_6')

1.0

In [95]:
# Similarity of the target movie to each similar movie that the user rated
similarity_coefficients.loc[
    target_movie,
    similar_movies_that_were_rated_by_user.index,
]

m_3    0.715771
m_2    0.624131
Name: m_1, dtype: float64

In [98]:
# Numerator by hand: user 3's ratings of the similar movies, weighted by similarity to the target movie
sum(
    ratings.loc[3, similar_movies_that_were_rated_by_user.index]
    * similarity_coefficients.loc[target_movie, similar_movies_that_were_rated_by_user.index]
)

4.019706469108971

In [99]:
# Denominator by hand: total similarity of the target movie to the similar movies the user rated
sum(
    similarity_coefficients.loc[
        target_movie,
        similar_movies_that_were_rated_by_user.index,
    ]
)

1.3399021563696567

In [100]:
# Hand-check of the prediction: numerator / denominator, using the two values
# printed by the preceding cells
4.019706469108971 / 1.3399021563696567

3.0000000000000004