In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt, ceil
import math

%matplotlib inline
%config Completer.use_jedi = False

This notebook holds the data used for chapter 4.3, specifically for evaluating data for user with ID 148

In [2]:
movies = pd.read_csv("./movies.csv")
ratings = pd.read_csv("./ratings.csv", usecols=['userId', 'movieId', 'rating'])

In [3]:
ratings = ratings.pivot_table('rating', index='userId', columns='movieId')

In [5]:
# Calculate mean ratings for each user
users_mean_values = ratings.apply(lambda x: x.mean(), axis=1)
users_mean_values.head(5)

userId
1    4.366379
2    3.948276
3    2.435897
4    3.555556
5    3.636364
dtype: float64

In [6]:
# Normalize the data by making the mean value to be 0
normalized_ratings = ratings.apply(lambda x: x - users_mean_values[x.name], axis=1)
normalized_ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,,-0.366379,,,-0.366379,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,0.363636,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-1.157399,,,,,,-1.157399,,,,...,,,,,,,,,,
607,0.213904,,,,,,,,,,...,,,,,,,,,,
608,-0.634176,-1.134176,-1.134176,,,,,,,0.865824,...,,,,,,,,,,
609,-0.270270,,,,,,,,,0.729730,...,,,,,,,,,,


In [7]:
similarity_coefficients = normalized_ratings.T.corr()
similarity_coefficients.loc[:5, :10]

userId,1,2,3,4,5,6,7,8,9,10
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1.0,,0.079819,0.207983,0.268749,-0.2916358,-0.118773,0.469668,0.918559,-0.037987
2,,1.0,,,,,-0.991241,,,0.037796
3,0.079819,,1.0,,,7.850462000000001e-17,,,,
4,0.207983,,,1.0,-0.336525,0.1484982,0.542861,0.117851,,0.485794
5,0.268749,,,-0.336525,1.0,0.0431659,0.158114,0.028347,,-0.777714


In [9]:
ratings_data_transposed_corr = ratings.T.corr()

In [10]:
def get_similar_users(target_uid, k=10):
    '''Gets K users most similar to target UID'''
    
    # So what we're doing is that we're transposing the matrix so that rows are movie IDs and columns are user IDs
    # Then we are finding correlation coefficients, sorting the values, and returning the top K elements
    
    if target_uid not in ratings_data_transposed_corr:
        return []
    
    # TODO: Exclude the target_uid from array
    return ratings_data_transposed_corr[target_uid].sort_values(ascending=False).index[:k]

In [11]:
get_similar_users(148)

Int64Index([148, 361, 526, 202, 506, 577, 424, 196, 84, 108], dtype='int64', name='userId')

In [12]:
def predict_rating(target_uid, target_movieId):
    if target_movieId not in normalized_ratings:
        return -2

    users_who_rated_target_movie = normalized_ratings.loc[get_similar_users(target_uid, 30)][target_movieId].dropna()

    if len(users_who_rated_target_movie) == 0:
        return -5
    
    predicted = users_mean_values.loc[target_uid] + (sum(similarity_coefficients.loc[target_uid][users_who_rated_target_movie.index] * users_who_rated_target_movie) / len(users_who_rated_target_movie))
    
    return predicted

In [13]:
predict_rating(148, 73) # 73 == Les Miserables

-5

In [16]:
predict_rating(148, 4308) # 4308 == Moulin Rouge

4.294699980056343

In [17]:
predict_rating(148, 52975) # 52975 == Hairspray

3.4776785714285716