In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt, ceil
import math

%matplotlib inline
%config Completer.use_jedi = False

In [2]:
movies = pd.read_csv("./movies.csv")
ratings = pd.read_csv("./ratings.csv", usecols=['userId', 'movieId', 'rating'])

In [3]:
user_ids_series = ratings.userId.unique()
movie_ids_series = ratings.movieId.unique()

print("There are, in total {} unique users and {} unique movies".format(user_ids_series.shape[0], movie_ids_series.shape[0]))

There are, in total 610 unique users and 9724 unique movies


In [4]:
# Let's take a small fraction of data as there's really a bunch of ratings there ...

small_data = ratings.sample(frac=0.2)

# Convert to m x n matrix
# small_data = small_data.pivot_table('rating', index='userId', columns='movieId')

small_data

Unnamed: 0,userId,movieId,rating
86769,561,2640,4.0
40547,274,78041,3.0
25238,177,3189,3.0
90611,590,903,4.5
72081,464,1391,2.0
...,...,...,...
51451,332,2019,4.5
84215,538,671,5.0
4750,28,65982,3.0
94092,599,5970,3.0


In [50]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

In [51]:
print("Train data has {} samples, while test data set has {} samples".format(train_data.shape[0], test_data.shape[0]))

Train data has 80668 samples, while test data set has 20168 samples


In [52]:
test_data.shape

(20168, 3)

In [53]:
# Convert both train and test data to m x n matrix
train_data = train_data.pivot_table('rating', index='userId', columns='movieId')
test_data = test_data.pivot_table('rating', index='userId', columns='movieId')

In [54]:
train_data

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [55]:
train_data.loc[148].dropna()

movieId
356       4.0
1197      3.0
4308      4.0
5816      4.0
5952      3.0
6377      3.0
7153      3.0
8368      4.0
30816     5.0
31658     4.0
40629     5.0
44191     4.0
50872     3.0
60069     4.5
68954     4.0
69844     4.0
72998     4.0
76093     3.0
79702     4.0
88125     4.0
89745     4.0
98243     4.5
98491     5.0
99149     3.0
108932    4.0
110102    4.0
112175    2.5
112852    3.5
116797    4.5
122882    4.0
122886    3.5
122920    3.5
134853    4.0
152081    4.0
157296    3.0
160718    4.5
Name: 148, dtype: float64

In [56]:
test_data.loc[148].dropna()

movieId
4886      3.0
4896      4.0
4993      3.0
5618      3.0
40815     4.0
54001     4.0
69757     3.5
79091     3.5
79132     1.5
81834     4.0
81847     4.5
115617    3.5
Name: 148, dtype: float64

In [57]:
# Calculate mean ratings for each user
users_mean_values = train_data.apply(lambda x: x.mean(), axis=1)
users_mean_values.head(5)

userId
1    4.331606
2    3.920000
3    2.580645
4    3.464706
5    3.657895
dtype: float64

In [58]:
# Normalize the data by making the mean value to be 0
normalized_ratings = train_data.apply(lambda x: x - users_mean_values[x.name], axis=1)
normalized_ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.331606,,-0.331606,,,-0.331606,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-1.157002,,,,,,-1.157002,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,-0.617820,-1.11782,-1.117820,,,,,,,0.882180,...,,,,,,,,,,
609,-0.290323,,,,,,,,,0.709677,...,,,,,,,,,,


In [59]:
# Looks like everything is NaN but it's just because there's bunch of data ...
# Here's a proof that for user id 148, ratings have been perfectly normalized
normalized_ratings.loc[148].dropna()

movieId
356       0.166667
1197     -0.833333
4308      0.166667
5816      0.166667
5952     -0.833333
6377     -0.833333
7153     -0.833333
8368      0.166667
30816     1.166667
31658     0.166667
40629     1.166667
44191     0.166667
50872    -0.833333
60069     0.666667
68954     0.166667
69844     0.166667
72998     0.166667
76093    -0.833333
79702     0.166667
88125     0.166667
89745     0.166667
98243     0.666667
98491     1.166667
99149    -0.833333
108932    0.166667
110102    0.166667
112175   -1.333333
112852   -0.333333
116797    0.666667
122882    0.166667
122886   -0.333333
122920   -0.333333
134853    0.166667
152081    0.166667
157296   -0.833333
160718    0.666667
Name: 148, dtype: float64

In [60]:
similarity_coefficients = normalized_ratings.T.corr()
similarity_coefficients.loc[:5, :10]

userId,1,2,3,4,5,6,7,8,9,10
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1.0,,0.085749,0.207172,0.18503,-0.24132,-0.064002,0.450287,0.904534,-0.356348
2,,1.0,,,,,-1.0,,,0.695701
3,0.085749,,1.0,,,,,,,
4,0.207172,,,1.0,-0.273483,0.274795,0.647345,0.0,,0.593666
5,0.18503,,,-0.273483,1.0,0.123476,0.437237,-0.053872,,


In [61]:
train_data_transposed_corr = train_data.T.corr()

In [62]:
def get_similar_users(target_uid, k=10):
    '''Gets K users most similar to target UID'''
    
    # So what we're doing is that we're transposing the matrix so that rows are movie IDs and columns are user IDs
    # Then we are finding correlation coefficients, sorting the values, and returning the top K elements
    
    if target_uid not in train_data_transposed_corr:
        return []
    
    # TODO: Exclude the target_uid from array
    return train_data_transposed_corr[target_uid].sort_values(ascending=False).index[:k]

In [63]:
get_similar_users(148)

Int64Index([190, 497, 526, 439, 562, 84, 75, 577, 248, 506], dtype='int64', name='userId')

In [64]:
def predict_rating(target_uid, target_movieId):
    if target_movieId not in normalized_ratings:
        return -2

    users_who_rated_target_movie = normalized_ratings.loc[get_similar_users(target_uid, 30)][target_movieId].dropna()

    if len(users_who_rated_target_movie) == 0:
        return -5
    
    predicted = users_mean_values.loc[target_uid] + (sum(similarity_coefficients.loc[target_uid][users_who_rated_target_movie.index] * users_who_rated_target_movie) / len(users_who_rated_target_movie))
    
    return predicted

In [65]:
ratings[(ratings['userId'] == 148) & (ratings['rating'] >= 4)].sort_values(by='rating', ascending=False).merge(movies)[['title', 'genres', 'rating']].head(5)

Unnamed: 0,title,genres,rating
0,"Phantom of the Opera, The (2004)",Drama|Musical|Romance,5.0
1,Pride & Prejudice (2005),Drama|Romance,5.0
2,Paperman (2012),Animation|Comedy|Romance,5.0
3,Piper (2016),Animation,4.5
4,The Imitation Game (2014),Drama|Thriller|War,4.5


In [66]:
predict_rating(148, 73) # 73 == Les Miserables

-5

In [67]:
predict_rating(148, 4308) # 4308 == Mouling ROuge

3.880465878413574

In [68]:
predict_rating(148, 52975) # 52975 == Hairspray

3.4826569138850565

In [69]:
predict_rating(148, 4993)

3.9627716440862546

In [70]:
users_who_rated_target_movie = normalized_ratings.loc[get_similar_users(148, 30)][4993].dropna()
users_who_rated_target_movie

userId
439    0.894737
75     1.677966
65    -0.482759
330   -0.188725
82     0.579235
182   -0.501911
254    0.495283
551   -1.707921
477    0.754737
63     0.334091
189   -0.076923
123   -0.521277
307    0.322663
Name: 4993, dtype: float64

In [71]:
similarity_coefficients.loc[148]

userId
1     -1.000000
2           NaN
3           NaN
4     -1.000000
5           NaN
         ...   
606   -0.338241
607         NaN
608   -0.488094
609         NaN
610   -0.418030
Name: 148, Length: 610, dtype: float64

In [72]:
test_data.loc[148].dropna()

movieId
4886      3.0
4896      4.0
4993      3.0
5618      3.0
40815     4.0
54001     4.0
69757     3.5
79091     3.5
79132     1.5
81834     4.0
81847     4.5
115617    3.5
Name: 148, dtype: float64

In [87]:
# Frankly, I actually don't know how to unpivot the data (seems like it should be "melted" but I can't seem to do it)
# Therefore, I'll just rerun train test split in order to get the original data back

train_data_original, test_data_original = train_test_split(ratings, test_size=0.2, random_state=42)

test_data_original

Unnamed: 0,userId,movieId,rating
67037,432,77866,4.5
42175,288,474,3.0
93850,599,4351,3.0
6187,42,2987,4.0
12229,75,1610,4.0
...,...,...,...
57416,380,5048,2.0
67290,434,54272,3.5
33423,226,5989,4.5
98552,607,1320,3.0


In [88]:
predict_rating(279, 93838)

4.0

In [89]:
# Now let's try predicting the data ;-)

test_data_original['predicted'] = test_data_original.apply(lambda x: predict_rating(x['userId'], x['movieId']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [90]:
test_data_original

Unnamed: 0,userId,movieId,rating,predicted
67037,432,77866,4.5,-5.000000
42175,288,474,3.0,3.511106
93850,599,4351,3.0,-5.000000
6187,42,2987,4.0,-5.000000
12229,75,1610,4.0,3.911320
...,...,...,...,...
57416,380,5048,2.0,2.085178
67290,434,54272,3.5,-5.000000
33423,226,5989,4.5,-5.000000
98552,607,1320,3.0,3.965926


In [91]:
from sklearn.metrics import mean_squared_error

In [92]:
test_data_original.shape

(20168, 4)

In [93]:
# We'll take into consideration only the rows where we actually WERE able to find the predicted rating
test_data_to_calculate_rmse = test_data_original[test_data_original['predicted'] >= 0]
test_data_to_calculate_rmse

Unnamed: 0,userId,movieId,rating,predicted
42175,288,474,3.0,3.511106
12229,75,1610,4.0,3.911320
65098,416,750,4.5,3.689905
46319,305,55276,5.0,5.308861
86670,561,1278,3.5,3.803558
...,...,...,...,...
62853,414,1587,5.0,3.356389
55901,369,1262,3.0,2.872641
57416,380,5048,2.0,2.085178
98552,607,1320,3.0,3.965926


In [94]:
test_data_to_calculate_rmse.rating.values

array([3. , 4. , 4.5, ..., 2. , 3. , 3. ])

In [95]:
test_data_to_calculate_rmse.predicted.values

array([3.51110637, 3.91131961, 3.6899045 , ..., 2.08517841, 3.96592622,
       3.8442691 ])

In [104]:
sqrt(mean_squared_error(test_data_to_calculate_rmse.rating.values, test_data_to_calculate_rmse.predicted.values))

1.1832961797798442

In [97]:
test_data_to_calculate_rmse.shape

(6756, 4)

In [98]:
train_data_original['predicted'] = train_data_original.apply(lambda x: predict_rating(x['userId'], x['movieId']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [99]:
train_data_to_calculate_rmse = train_data_original[train_data_original['predicted'] >= 0]
train_data_to_calculate_rmse

Unnamed: 0,userId,movieId,rating,predicted
80568,509,7347,3.0,3.000000
50582,326,71462,4.0,4.000000
8344,57,2115,3.0,3.000000
99603,610,1127,4.0,3.769196
71701,462,2409,2.0,2.326930
...,...,...,...,...
6265,42,4005,4.0,4.000000
54886,364,141,4.0,3.878969
76820,480,6867,4.0,4.000000
860,6,981,3.0,3.000000


In [101]:
sqrt(mean_squared_error(train_data_to_calculate_rmse.rating.values, train_data_to_calculate_rmse.predicted.values))

0.28636826052751335

In [102]:
train_data_to_calculate_rmse.shape

(80286, 4)

In [103]:
test_data_to_calculate_rmse

Unnamed: 0,userId,movieId,rating,predicted
42175,288,474,3.0,3.511106
12229,75,1610,4.0,3.911320
65098,416,750,4.5,3.689905
46319,305,55276,5.0,5.308861
86670,561,1278,3.5,3.803558
...,...,...,...,...
62853,414,1587,5.0,3.356389
55901,369,1262,3.0,2.872641
57416,380,5048,2.0,2.085178
98552,607,1320,3.0,3.965926


# Item-based collaborative filtering