In [228]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt, ceil
import math

%matplotlib inline
%config Completer.use_jedi = False

In this notebook, we're going to do a quick evaluation of item-based collaborative filtering techniques.

The idea is to predict a user's rating for an item based on that user's ratings for similar items.

In [229]:
# Read the movie and rating CSV files that sit next to the notebook.
# NOTE: `ratings` is rebound to a small hand-built example a few cells below,
# so the frame loaded here is not the one the rest of the notebook works on.
movies = pd.read_csv("./movies.csv")
ratings = pd.read_csv(
    "./ratings.csv",
    usecols=['userId', 'movieId', 'rating'],
)

In [230]:
# Recreate the data from the book.
# Each row is [userId, m_1, m_2, m_3, m_4, m_5, m_6]; np.nan marks a movie the
# user has not rated. (`np.nan` is used because the `np.NaN` alias no longer
# exists in NumPy 2.0+.)

data = [
    [1, 7, 6, 7, 4, 5, 4],
    [2, 6, 7, np.nan, 4, 3, 4],
    [3, np.nan, 3, 3, 1, 1, np.nan],
    [4, 1, 2, 2, 3, 3, 4],
    [5, 1, np.nan, 1, 2, 3, 3]
]

In [231]:
# Turn the raw rows into a user x movie table indexed by userId.
ratings = pd.DataFrame(
    data,
    columns=['userId', 'm_1', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6'],
).set_index('userId')
ratings

Unnamed: 0_level_0,m_1,m_2,m_3,m_4,m_5,m_6
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,7.0,6.0,7.0,4,5,4.0
2,6.0,7.0,,4,3,4.0
3,,3.0,3.0,1,1,
4,1.0,2.0,2.0,3,3,4.0
5,1.0,,1.0,2,3,3.0


In [232]:
# Step 1 of the normalisation: compute each user's mean rating (one value per
# row). Missing ratings are NaN and are left out of the mean.

users_mean_values = ratings.mean(axis=1)
users_mean_values

userId
1    5.5
2    4.8
3    2.0
4    2.5
5    2.0
dtype: float64

In [233]:
# Mean-centre every user's row: subtract that user's mean from each of their
# ratings (axis=0 lines the means up with the row index, i.e. userId).
normalized_ratings = ratings.sub(users_mean_values, axis=0)
normalized_ratings

Unnamed: 0_level_0,m_1,m_2,m_3,m_4,m_5,m_6
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.5,0.5,1.5,-1.5,-0.5,-1.5
2,1.2,2.2,,-0.8,-1.8,-0.8
3,,1.0,1.0,-1.0,-1.0,
4,-1.5,-0.5,-0.5,0.5,0.5,1.5
5,-1.0,,-1.0,0.0,1.0,1.0


In [234]:
# Both targets are the same movie here, so the similarity worked out in the
# next cells is that of a movie with itself.
target_movie_i = 'm_1'
target_movie_j = 'm_1'

# Users who have rated the target_movie_i
U_i = normalized_ratings[target_movie_i].dropna().index
U_i
    

Int64Index([1, 2, 4, 5], dtype='int64', name='userId')

In [235]:
# Users who have rated the target_movie_j
U_j = normalized_ratings[target_movie_j].dropna().index
U_j

Int64Index([1, 2, 4, 5], dtype='int64', name='userId')

In [236]:
# Users who rated both movies I and J.
# Use Index.intersection explicitly: the `&` operator between two Index
# objects is no longer a set intersection in current pandas releases.
U_ij = U_i.intersection(U_j)
U_ij

Int64Index([1, 2, 4, 5], dtype='int64', name='userId')

In [237]:
# Collect, for every user who rated both movies, the per-user terms of the
# adjusted cosine similarity: the product of the two mean-centred ratings
# (numerator) and each rating squared (the two norms in the denominator).
numerator, denominator_i, denominator_j = [], [], []

for user_id in U_ij:
    user_row = normalized_ratings.loc[user_id]
    s_ui, s_uj = user_row[target_movie_i], user_row[target_movie_j]

    numerator.append(s_ui * s_uj)
    denominator_i.append(s_ui ** 2)
    denominator_j.append(s_uj ** 2)

sum(numerator) / (sqrt(sum(denominator_i)) * sqrt(sum(denominator_j)))

1.0

In [238]:
def calc_adjusted_cosine_similarity(movie_i, movie_j, ratings_matrix=None):
    '''Calculates the adjusted cosine similarity between movies I and J.

    The similarity is the cosine of the angle between the two movies'
    mean-centred rating columns, restricted to users who rated both movies.

    Parameters
    ----------
    movie_i, movie_j : column labels of the two movies to compare.
    ratings_matrix : optional DataFrame of mean-centred ratings (users on the
        rows, movies on the columns, NaN for "not rated"). Defaults to the
        notebook-level `normalized_ratings`.

    Returns
    -------
    -5 when no user rated both movies (sentinel kept for compatibility; it is
    outside the valid [-1, 1] range), 0.0 when either movie's centred ratings
    are all zero over the common users (the cosine is undefined there),
    otherwise the similarity as a float.
    '''
    if ratings_matrix is None:
        # Same data source as before when the caller passes nothing.
        ratings_matrix = normalized_ratings

    ratings_i = ratings_matrix[movie_i]
    ratings_j = ratings_matrix[movie_j]

    # Users who rated both movies (boolean mask, aligned on the user index)
    rated_both = ratings_i.notna() & ratings_j.notna()

    if not rated_both.any():
        # No users who rated both movies, apparently
        return -5

    s_i = ratings_i[rated_both]
    s_j = ratings_j[rated_both]

    numerator = float((s_i * s_j).sum())
    denominator = sqrt(float((s_i ** 2).sum())) * sqrt(float((s_j ** 2).sum()))

    if denominator == 0:
        # A zero norm would otherwise produce NaN, which slips through
        # `coeff <= 0` style filters downstream; report "no correlation".
        return 0.0

    return numerator / denominator
    

In [239]:
# Try the function on one pair of different movies.
calc_adjusted_cosine_similarity('m_1', 'm_5')

-0.812488103301994

In [240]:
# Every movie label in the table (the column names), used as the candidate
# set when searching for similar movies.
movies_ids = ratings.columns.to_numpy()

In [241]:
import operator

def get_movies_similar_to_target_movie(target_movie_id):
    '''Returns list of movie IDs that are similar to target movie.

    Every movie in the notebook-level `movies_ids` other than the target is
    scored with `calc_adjusted_cosine_similarity`; only movies with a strictly
    positive similarity are kept.

    Returns
    -------
    list of movie IDs, ordered from most to least similar.
    '''
    coefficients = {}

    for movie_id in movies_ids:
        if movie_id == target_movie_id:
            # Skip the target movie, as the similarity is 1 for sure
            continue

        coeff = calc_adjusted_cosine_similarity(target_movie_id, movie_id)

        # `not coeff > 0` rejects non-positive values (including the -5
        # "no common users" sentinel) and also NaN, which `coeff <= 0`
        # would have let through.
        if not coeff > 0:
            continue

        coefficients[movie_id] = coeff

    # Highest similarity first; ties keep their insertion order.
    return sorted(coefficients, key=coefficients.get, reverse=True)

In [242]:
# Movies with a positive similarity to m_1, most similar first.
get_movies_similar_to_target_movie('m_1')

['m_3', 'm_2']

In [243]:
# Example user / movie pair. predict_rating below takes parameters with the
# same names, so inside the function these module-level values are shadowed.

target_user = 3
target_movie = 'm_1'

def predict_rating(target_user, target_movie):
    '''Predicts rating of user for specified movie.

    The prediction is the similarity-weighted average of the user's raw
    ratings for the movies that are positively similar to `target_movie`
    and that the user has actually rated.

    Parameters
    ----------
    target_user : row label (userId) in `ratings` / `normalized_ratings`.
    target_movie : column label of the movie to predict.

    Returns
    -------
    The predicted rating, or NaN when the user has rated none of the similar
    movies (there is nothing to average, and dividing would fail).
    '''
    # Movies similar to target movie
    similar_movies_to_target_movie = get_movies_similar_to_target_movie(target_movie)

    # Movies similar to target movie that were rated by target user
    Qt_u = normalized_ratings.loc[target_user, similar_movies_to_target_movie].dropna().index

    weighted_sum = 0.0
    weight_total = 0.0

    for j in Qt_u:
        # Compute each similarity once and reuse it as the weight.
        weight = calc_adjusted_cosine_similarity(j, target_movie)
        weighted_sum += weight * ratings.loc[target_user, j]
        weight_total += weight

    if weight_total == 0:
        # No usable neighbours: no prediction can be made.
        return np.nan

    return weighted_sum / weight_total
    

In [244]:
# User 3 has no rating for m_1 in the example data, so predict it.
predict_rating(3, 'm_1')

2.9999999999999996

In [245]:
# User 3 has no rating for m_6 in the example data either.
predict_rating(3, 'm_6')

1.0

In [248]:
# Replace missing ratings with 0 so the table can be used as a dense matrix.
# NOTE(review): a 0 here is indistinguishable from a real rating of 0, so the
# row means computed from this matrix further down differ from
# users_mean_values (which skips missing ratings).
ratings_zerofilled = ratings.fillna(0)
ratings_zerofilled

Unnamed: 0_level_0,m_1,m_2,m_3,m_4,m_5,m_6
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,7.0,6.0,7.0,4,5,4.0
2,6.0,7.0,0.0,4,3,4.0
3,0.0,3.0,3.0,1,1,0.0
4,1.0,2.0,2.0,3,3,4.0
5,1.0,0.0,1.0,2,3,3.0


In [250]:
# Dense numpy view of the zero-filled ratings (users on rows, movies on columns).
# DataFrame.as_matrix() no longer exists in current pandas; to_numpy() is the
# supported replacement.
M = ratings_zerofilled.to_numpy()
M

  """Entry point for launching an IPython kernel.


array([[7., 6., 7., 4., 5., 4.],
       [6., 7., 0., 4., 3., 4.],
       [0., 3., 3., 1., 1., 0.],
       [1., 2., 2., 3., 3., 4.],
       [1., 0., 1., 2., 3., 3.]])

In [254]:
# Mean of every row of M, i.e. one value per user. Because M is zero-filled,
# the filled-in zeros count towards these means.
M_u = M.mean(axis=1)
M_u

array([5.5       , 4.        , 1.33333333, 2.5       , 1.66666667])

In [257]:
# Add a trailing axis so the per-row means form a column that broadcasts
# against the rows of M.
M_u[:, np.newaxis]

array([[5.5       ],
       [4.        ],
       [1.33333333],
       [2.5       ],
       [1.66666667]])

In [261]:
# Subtract each row's mean from that row of M.
# NOTE(review): despite the name, M_u holds row (user) means, not item means,
# and the zero-filled entries are shifted too — confirm this is intended.
item_mean_subtracted = M - M_u[:, np.newaxis]

In [262]:
# NOTE(review): cosine_similarity is not imported in any visible cell —
# presumably sklearn.metrics.pairwise.cosine_similarity; confirm and add the
# import to the top cell so the notebook survives Restart & Run All.
# Rows of the input are users here.
cosine_similarity(item_mean_subtracted)

array([[ 1.        , -0.05923489,  0.4247954 , -0.8992288 , -0.71885076],
       [-0.05923489,  1.        , -0.23904572, -0.23354968, -0.33709993],
       [ 0.4247954 , -0.23904572,  1.        , -0.27914526, -0.64465837],
       [-0.8992288 , -0.23354968, -0.27914526,  1.        ,  0.78729582],
       [-0.71885076, -0.33709993, -0.64465837,  0.78729582,  1.        ]])

In [265]:
# Inspect the centred matrix.
item_mean_subtracted

array([[ 1.5       ,  0.5       ,  1.5       , -1.5       , -0.5       ,
        -1.5       ],
       [ 2.        ,  3.        , -4.        ,  0.        , -1.        ,
         0.        ],
       [-1.33333333,  1.66666667,  1.66666667, -0.33333333, -0.33333333,
        -1.33333333],
       [-1.5       , -0.5       , -0.5       ,  0.5       ,  0.5       ,
         1.5       ],
       [-0.66666667, -1.66666667, -0.66666667,  0.33333333,  1.33333333,
         1.33333333]])

In [266]:
# Transposed input: movies are now the rows, so this is presumably the
# movie-to-movie comparison — TODO confirm against the cosine_similarity
# helper, which is not imported in the visible cells.
cosine_similarity(item_mean_subtracted.T)

array([[ 1.        ,  0.50284547, -0.4441123 , -0.51415411, -0.65435704,
        -0.38855339],
       [ 0.50284547,  1.        , -0.39322122, -0.32976223, -0.87887858,
        -0.53977794],
       [-0.4441123 , -0.39322122,  1.        , -0.42625118,  0.18130289,
        -0.46197695],
       [-0.51415411, -0.32976223, -0.42625118,  1.        ,  0.51214752,
         0.8304548 ],
       [-0.65435704, -0.87887858,  0.18130289,  0.51214752,  1.        ,
         0.71240324],
       [-0.38855339, -0.53977794, -0.46197695,  0.8304548 ,  0.71240324,
         1.        ]])

In [268]:
# Movie x user view of the zero-filled ratings.
ratings_zerofilled.T

userId,1,2,3,4,5
m_1,7.0,6.0,0.0,1.0,1.0
m_2,6.0,7.0,3.0,2.0,0.0
m_3,7.0,0.0,3.0,2.0,1.0
m_4,4.0,4.0,1.0,3.0,2.0
m_5,5.0,3.0,1.0,3.0,3.0
m_6,4.0,4.0,0.0,4.0,3.0


In [274]:
# Zero-fill the mean-centred ratings. After centring, 0 equals the user's own
# mean, so a missing rating is treated as "exactly average for that user".
normalized_ratings_zerofilled = normalized_ratings.fillna(0)
normalized_ratings_zerofilled

Unnamed: 0_level_0,m_1,m_2,m_3,m_4,m_5,m_6
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.5,0.5,1.5,-1.5,-0.5,-1.5
2,1.2,2.2,0.0,-0.8,-1.8,-0.8
3,0.0,1.0,1.0,-1.0,-1.0,0.0
4,-1.5,-0.5,-0.5,0.5,0.5,1.5
5,-1.0,0.0,-1.0,0.0,1.0,1.0


In [278]:
# Movies on the rows (transposed input); `[:, :4]` keeps only the first four
# columns of the result for display.
# NOTE(review): cosine_similarity is not imported in any visible cell — confirm
# its source.
cosine_similarity(normalized_ratings_zerofilled.T)[:, :4]

array([[ 1.        ,  0.62413132,  0.71577084, -0.73878026],
       [ 0.62413132,  1.        ,  0.3744373 , -0.73391041],
       [ 0.71577084,  0.3744373 ,  1.        , -0.81088939],
       [-0.73878026, -0.73391041, -0.81088939,  1.        ],
       [-0.73832952, -0.90509063, -0.59028134,  0.70567109],
       [-0.9896203 , -0.522503  , -0.76097353,  0.72196647]])