In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt
%matplotlib inline
%config Completer.use_jedi = False

In [6]:
movies = pd.read_csv("./movies.csv")
ratings = pd.read_csv("./ratings.csv", usecols=['userId', 'movieId', 'rating'])

In [7]:
user_ids_series = ratings.userId.unique()
movie_ids_series = ratings.movieId.unique()

print("There are, in total {} unique users and {} unique movies".format(user_ids_series.shape[0], movie_ids_series.shape[0]))

There are, in total 610 unique users and 9724 unique movies


In [10]:
ratings.head(10)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
5,1,70,3.0
6,1,101,5.0
7,1,110,4.0
8,1,151,5.0
9,1,157,5.0


Since there's a lot of data which my laptop can't support, I'm going to load only part of the data

In [11]:
small_data = ratings.sample(frac=0.2)

small_data.shape

(20167, 3)

In [12]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(small_data, test_size=0.2)

In [13]:
train_data_matrix = train_data.to_numpy()
test_data_matrix = test_data.to_numpy()

In [14]:
train_data

Unnamed: 0,userId,movieId,rating
84343,541,196,3.0
4891,31,588,5.0
65716,422,4925,3.0
26697,182,3895,1.5
28405,198,1221,3.0
...,...,...,...
83225,527,1372,4.0
10077,66,1243,5.0
82330,522,44191,4.5
68500,446,32,4.0


In [20]:
users_ratings_matrix_df = train_data.pivot_table('rating', index='userId', columns='movieId')
users_ratings_matrix_df.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,184791,185029,185135,185585,187593,187595,188301,188675,189333,191005
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
6,,,,3.0,,4.0,,3.0,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [21]:
users_ratings_matrix_df.corr()

movieId,1,2,3,4,5,6,7,8,9,10,...,184791,185029,185135,185585,187593,187595,188301,188675,189333,191005
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,-1.0,1.0,,,,,,,,...,,,,,,,,,,
2,-1.0,1.0,,,,,,,,,...,,,,,,1.0,,,,
3,1.0,,1.0,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187595,,1.0,,,,,,,,,...,,,,,,1.0,,,,
188301,,,,,,,,,,,...,,,,,,,1.0,,,
188675,,,,,,,,,,,...,,,,,,,,,,
189333,,,,,,,,,,,...,,,,,,,,,,


In [22]:
data = [
    [1, 7, 6, 7, 4, 5, 4],
    [2, 6, 7, np.NaN, 4, 3, 4],
    [3, np.NaN, 3, 3, 1, 1, np.NaN],
    [4, 1, 2, 2, 3, 3, 4],
    [5, 1, np.NaN, 1, 2, 3, 3]
]

ratings = pd.DataFrame(data, columns=['userId', 'm_1', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6'])
ratings = ratings.set_index('userId')
ratings

Unnamed: 0_level_0,m_1,m_2,m_3,m_4,m_5,m_6
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,7.0,6.0,7.0,4,5,4.0
2,6.0,7.0,,4,3,4.0
3,,3.0,3.0,1,1,
4,1.0,2.0,2.0,3,3,4.0
5,1.0,,1.0,2,3,3.0


In [41]:
users_mean_values = ratings.apply(lambda x: x.mean(), axis=1)
users_mean_values

userId
1    5.5
2    4.8
3    2.0
4    2.5
5    2.0
dtype: float64

In [47]:
normalized_ratings = ratings.apply(lambda x: x - users_mean_values[x.name], axis=1)
normalized_ratings

Unnamed: 0_level_0,m_1,m_2,m_3,m_4,m_5,m_6
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.5,0.5,1.5,-1.5,-0.5,-1.5
2,1.2,2.2,,-0.8,-1.8,-0.8
3,,1.0,1.0,-1.0,-1.0,
4,-1.5,-0.5,-0.5,0.5,0.5,1.5
5,-1.0,,-1.0,0.0,1.0,1.0


In [50]:
normalized_ratings.loc[1].to_numpy()

array([ 1.5,  0.5,  1.5, -1.5, -0.5, -1.5])

In [51]:
normalized_ratings.loc[2].to_numpy()

array([ 1.2,  2.2,  nan, -0.8, -1.8, -0.8])

In [52]:
np.corrcoef(normalized_ratings.loc[1].to_numpy(), normalized_ratings.loc[2].to_numpy())

array([[ 1., nan],
       [nan, nan]])

In [61]:
normalized_ratings.loc[2].dropna().index

Index(['m_1', 'm_2', 'm_4', 'm_5', 'm_6'], dtype='object')

In [60]:
normalized_ratings.loc[1][normalized_ratings.loc[2].dropna().index]

m_1    1.5
m_2    0.5
m_4   -1.5
m_5   -0.5
m_6   -1.5
Name: 1, dtype: float64

In [70]:
def corr(a, b):
    movies_rated_by_a_indexes = a.dropna().index
    
    movies_rated_by_a_and_b = b[movies_rated_by_a_indexes].dropna()
    
    print(movies_rated_by_a_and_b)
    
    return np.corrcoef(a[movies_rated_by_a_and_b.index], movies_rated_by_a_and_b)

In [72]:
corr(normalized_ratings.loc[1], normalized_ratings.loc[3])

m_2    1.0
m_3    1.0
m_4   -1.0
m_5   -1.0
Name: 3, dtype: float64


array([[1.        , 0.89442719],
       [0.89442719, 1.        ]])

In [None]:
def get_similarities(target_uid):
    '''Returns similarities between target user ID and others'''
    
    normalized_ratings.apply(
        lambda
    )

In [81]:
np.corrcoef(normalized_ratings.loc[1], normalized_ratings.loc[3])

array([[ 1., nan],
       [nan, nan]])

In [96]:
normalized_ratings.T.corr()[1].sort_values(ascending=False).index[1:]

Int64Index([3, 2, 5, 4], dtype='int64', name='userId')

In [109]:
similarity_coefficients_sample_data = normalized_ratings.T.corr()
similarity_coefficients_sample_data

userId,1,2,3,4,5
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.0,0.723478,0.894427,-0.899229,-0.824226
2,0.723478,1.0,0.970725,-0.720577,-0.899229
3,0.894427,0.970725,1.0,-1.0,-0.866025
4,-0.899229,-0.720577,-1.0,1.0,0.877058
5,-0.824226,-0.899229,-0.866025,0.877058,1.0


In [110]:
def get_similar_users_sample_data(target_uid, k=10):
    '''Gets K users most similar to target UID'''
    
    # So what we're doing is that we're transposing the matrix so that rows are movie IDs and columns are user IDs
    # Then we are finding correlation coefficients, sorting the values, and returning the top K elements, excluding
    # the first one which is assumed to be the target_uid (i.e. it'd have highest similarity)
    return similarity_coefficients_sample_data[target_uid].sort_values(ascending=False).index[1:k]

def get_similar_users_movielens(target_uid, k=10):
    '''Gets K users most similar to target UID'''
    
    return train_data.pivot_table('rating', index='userId', columns='movieId').T.corr()[target_uid].sort_values(ascending=False).index[1:k]

In [133]:
def predict_rating(target_uid, target_movieId):
    users_who_rated_target_movie = normalized_ratings.loc[get_similar_users_sample_data(target_uid)][target_movieId].dropna()
    
    return users_mean_values.loc[target_uid] + (sum(similarity_coefficients_sample_data.loc[target_uid][users_who_rated_target_movie.index] * users_who_rated_target_movie) / len(users_who_rated_target_movie))

In [None]:
def predict_rating_for_movielens(target_uid, target_movieId):
    users_who_rated_target_movie = normalized_ratings.loc[get_similar_users_sample_data(target_uid)][target_movieId].dropna()

In [137]:
predict_rating(3, 'm_6')

0.8789383837500919

In [112]:
get_similar_users_sample_data(3)

Int64Index([2, 1, 5, 4], dtype='int64', name='userId')

In [130]:
target_movieId = 'm_6'

In [131]:
users_who_rated_target_movie = normalized_ratings.loc[get_similar_users_sample_data(3)][target_movieId].dropna()
users_who_rated_target_movie

userId
2   -0.8
1   -1.5
5    1.0
4    1.5
Name: m_6, dtype: float64

In [129]:
users_mean_values.loc[3] + (sum(similarity_coefficients_sample_data.loc[3][users_who_rated_target_movie.index] * users_who_rated_target_movie) / len(users_who_rated_target_movie))

3.2181341505893233

In [132]:
users_mean_values.loc[3] + (sum(similarity_coefficients_sample_data.loc[3][users_who_rated_target_movie.index] * users_who_rated_target_movie) / len(users_who_rated_target_movie))

0.8789383837500919

In [108]:
normalized_ratings.loc[get_similar_users_sample_data(1)]

Unnamed: 0_level_0,m_1,m_2,m_3,m_4,m_5,m_6
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,,1.0,1.0,-1.0,-1.0,
2,1.2,2.2,,-0.8,-1.8,-0.8
5,-1.0,,-1.0,0.0,1.0,1.0
4,-1.5,-0.5,-0.5,0.5,0.5,1.5


In [105]:
get_similar_users_movielens(1)

Int64Index([336, 458, 21, 226, 244, 246, 287, 312, 318], dtype='int64', name='userId')

In [None]:
users_mean_values_movielens = ratings.apply(lambda x: x.mean(), axis=1)
users_mean_values

In [None]:
# So lets apply what we learned to movielens now ...
# I'll use the data from test_set to find predictions ...

In [89]:
train_data.pivot_table('rating', index='userId', columns='movieId').T.corr()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,,,,,,-1.000000,,,,...,,,,,,0.845154,,0.083045,,
2,,1.0,,,,,,,,,...,,,,,,,,,,
3,,,1.0,,,,,,,,...,,,,,,,,,,
4,,,,1.000000,,,,,,,...,,,,,,0.511408,,-0.960769,,
5,,,,,1.0,,,,,,...,,,,,,-1.000000,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.845154,,,0.511408,-1.0,1.000000,0.389831,,,,...,,1.000000,0.029001,,-1.0,1.000000,,0.544949,1.0,0.608922
607,,,,,,-0.866025,,,,,...,,,-0.944911,,,,1.0,-1.000000,,
608,0.083045,,,-0.960769,,0.277350,-1.000000,,,,...,,,0.608581,,,0.544949,-1.0,1.000000,,0.153093
609,,,,,,,,,,,...,,0.866025,,,,1.000000,,,1.0,


In [78]:
# Ok so user ID 111 is in 

In [None]:
#

In [40]:
from sklearn.metrics import pairwise_distances

user_correlation = 1 - pairwise_distances(train_data, metric='correlation')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation[:4, :4])

[[1.         0.99331109 0.98709497 0.98468507]
 [0.99331109 1.         0.96200166 0.95796746]
 [0.98709497 0.96200166 1.         0.99989618]
 [0.98468507 0.95796746 0.99989618 1.        ]]


In [41]:
# Item Similarity Matrix
item_correlation = 1 - pairwise_distances(train_data_matrix.T, metric='correlation')
item_correlation[np.isnan(item_correlation)] = 0
print(item_correlation[:4, :4])

[[ 1.          0.00839972 -0.04196469]
 [ 0.00839972  1.         -0.00198547]
 [-0.04196469 -0.00198547  1.        ]]


In [None]:
def predict_user_user(ratings, similarity):
    mean_user_rating = ratings.mean(axis=1)
    
    # Use np.newaxis so that mean_user_rating has same format as ratings
    ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
    
    pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T

In [1]:
# Function to predict ratings
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        # Use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [3]:
X = [
    [1, 1, 1],
    [2, 2, 2]
]