In [1]:
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds

%config Completer.use_jedi = False

In [2]:
raw_ratings = pd.read_csv('./ratings.csv')

In [3]:
raw_ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


We need to pivot the data so that we get m x n matrix (rows = users, columns = movies)

In [4]:
ratings = raw_ratings.pivot_table(index='userId', columns='movieId', values='rating')
ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


Perfect. Let's do mean-centering now. We'll use the row-wise mean value.

In [5]:
users_mean_ratings = ratings.mean(axis=1)
users_mean_ratings.head(3)

userId
1    4.366379
2    3.948276
3    2.435897
dtype: float64

In [6]:
users_mean_ratings

userId
1      4.366379
2      3.948276
3      2.435897
4      3.555556
5      3.636364
         ...   
606    3.657399
607    3.786096
608    3.134176
609    3.270270
610    3.688556
Length: 610, dtype: float64

Perfect. Let's normalize our ratings now.

In [7]:
normalized_ratings = ratings.apply(lambda x: x - users_mean_ratings[x.name], axis=1)
normalized_ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,,-0.366379,,,-0.366379,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,0.363636,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-1.157399,,,,,,-1.157399,,,,...,,,,,,,,,,
607,0.213904,,,,,,,,,,...,,,,,,,,,,
608,-0.634176,-1.134176,-1.134176,,,,,,,0.865824,...,,,,,,,,,,
609,-0.270270,,,,,,,,,0.729730,...,,,,,,,,,,


Finally, let's fill in the NaN values with zeros.

In [8]:
normalized_ratings = normalized_ratings.fillna(0)
normalized_ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,0.000000,-0.366379,0.0,0.0,-0.366379,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.363636,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-1.157399,0.000000,0.000000,0.0,0.0,0.000000,-1.157399,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.213904,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,-0.634176,-1.134176,-1.134176,0.0,0.0,0.000000,0.000000,0.0,0.0,0.865824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,-0.270270,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.729730,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Perfect. We're ready to decompose and recompose our matrix now ;-)

In [9]:
U, sigma, Vt = svds(normalized_ratings.values, k = 2)

In [10]:
sigma = np.diag(sigma)

In [11]:
np.dot(np.dot(U, sigma), Vt)[0, 0:6] + users_mean_ratings[1]

array([4.44371748, 4.3414349 , 4.35635076, 4.3602882 , 4.3101328 ,
       4.41549084])

In [12]:
U, sigma, Vt = svds(normalized_ratings.values, k = 6)

In [13]:
sigma = np.diag(sigma)

In [14]:
np.dot(np.dot(U, sigma), Vt)[0, 0:6] + users_mean_ratings[1]

array([4.46231366, 4.34918476, 4.38913168, 4.35582835, 4.3048888 ,
       4.3799541 ])

In [15]:
U, sigma, Vt = svds(normalized_ratings.values, k = 50)

In [16]:
sigma = np.diag(sigma)

In [17]:
predicted = np.dot(np.dot(U, sigma), Vt) + users_mean_ratings.values.reshape(-1, 1)
predicted

array([[4.30616055, 4.29639677, 4.40595086, ..., 4.36676653, 4.36676653,
        4.36724272],
       [3.94598477, 3.93225963, 3.9367516 , ..., 3.94819836, 3.94819836,
        3.94793259],
       [2.42520627, 2.4731044 , 2.40444357, ..., 2.43590095, 2.43590095,
        2.43595447],
       ...,
       [2.40823265, 1.92951625, 1.90144494, ..., 3.13490895, 3.13490895,
        3.11973288],
       [3.31689671, 3.27012154, 3.27617457, ..., 3.270299  , 3.270299  ,
        3.27067519],
       [5.03909959, 3.68865446, 3.63611918, ..., 3.6879992 , 3.6879992 ,
        3.70389295]])

In [18]:
predicted[0][20:30]

array([4.41941476, 4.29271834, 4.32466399, 4.28243484, 4.39922809,
       4.35227507, 4.36769508, 4.33414686, 4.44916955, 4.35175148])

In [20]:
# Let's try to evaluate all this ... First, split into train and test sets. Then train on TRAIN set, and then 
# use that to validate against test set

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
raw_train_data, raw_test_data = train_test_split(raw_ratings, test_size=0.2)

In [23]:
raw_train_data.shape

(80668, 4)

In [24]:
raw_test_data.shape

(20168, 4)

Rebuild a matrix out of train data.

In [25]:
train_data = raw_train_data.pivot_table(index='userId', columns='movieId', values='rating')
train_data

movieId,1,2,3,4,5,6,7,8,9,10,...,190221,191005,193565,193567,193571,193573,193581,193583,193585,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [26]:
users_mean_ratings = train_data.mean(axis=1)

In [27]:
normalized_train_data = train_data.apply(lambda x: x - users_mean_ratings[x.name], axis=1).fillna(0)
normalized_train_data

movieId,1,2,3,4,5,6,7,8,9,10,...,190221,191005,193565,193567,193571,193573,193581,193583,193585,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.000000,-0.423729,0.0,0.0,-0.423729,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-1.175629,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.187500,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,-0.628959,-1.128959,-1.128959,0.0,0.0,0.000000,0.0,0.0,0.0,0.871041,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,-0.321429,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.678571,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
U, sigma, Vt = svds(normalized_train_data.values, k = 50)
sigma = np.diag(sigma)
predicted = np.dot(np.dot(U, sigma), Vt) + users_mean_ratings.values.reshape(-1, 1)

In [29]:
train_data_predicted = pd.DataFrame(index=train_data.index, columns=train_data.columns, data=predicted)
train_data_predicted

movieId,1,2,3,4,5,6,7,8,9,10,...,190221,191005,193565,193567,193571,193573,193581,193583,193585,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.418646,4.309349,4.370153,4.439421,4.408837,4.250286,4.250275,4.433034,4.435792,4.373848,...,4.427197,4.421502,4.424118,4.425425,4.422810,4.422810,4.422810,4.424118,4.424118,4.424118
2,3.919609,3.909721,3.911045,3.926690,3.928896,3.893973,3.911957,3.924326,3.928178,3.948567,...,3.924566,3.923241,3.923048,3.922952,3.923145,3.923145,3.923145,3.923048,3.923048,3.923048
3,2.419388,2.435030,2.350165,2.430527,2.418325,2.438244,2.401472,2.428237,2.418392,2.484711,...,2.426737,2.428395,2.428602,2.428706,2.428499,2.428499,2.428499,2.428602,2.428602,2.428602
4,3.928861,3.440996,3.567070,3.531352,3.543575,3.235284,3.614601,3.535194,3.569790,3.279098,...,3.572098,3.539860,3.551427,3.557210,3.545644,3.545644,3.545644,3.551427,3.551427,3.551427
5,3.665293,3.585110,3.582510,3.580138,3.590368,3.672654,3.601084,3.592007,3.584943,3.578667,...,3.588058,3.594560,3.594601,3.594621,3.594580,3.594580,3.594580,3.594601,3.594601,3.594601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.064986,3.274306,3.761356,3.717391,3.571952,3.728055,3.719854,3.682296,3.705681,3.927611,...,3.672592,3.676410,3.675493,3.675034,3.675952,3.675952,3.675952,3.675493,3.675493,3.675493
607,3.826359,3.974816,3.959094,3.780682,3.812230,3.847824,3.799769,3.787900,3.806231,3.788344,...,3.812167,3.814433,3.812163,3.811027,3.813298,3.813298,3.813298,3.812163,3.812163,3.812163
608,2.568271,2.028331,1.925144,3.143413,3.146038,3.280650,3.127067,3.123169,3.111283,4.024695,...,3.119296,3.131076,3.128590,3.127346,3.129833,3.129833,3.129833,3.128590,3.128590,3.128590
609,3.319145,3.310736,3.324348,3.314081,3.311063,3.364288,3.329400,3.319395,3.313955,3.356078,...,3.317136,3.321484,3.321419,3.321387,3.321451,3.321451,3.321451,3.321419,3.321419,3.321419


In [30]:
train_data.head(3)

movieId,1,2,3,4,5,6,7,8,9,10,...,190221,191005,193565,193567,193571,193573,193581,193583,193585,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,


In [31]:
train_data_predicted.head(3)

movieId,1,2,3,4,5,6,7,8,9,10,...,190221,191005,193565,193567,193571,193573,193581,193583,193585,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.418646,4.309349,4.370153,4.439421,4.408837,4.250286,4.250275,4.433034,4.435792,4.373848,...,4.427197,4.421502,4.424118,4.425425,4.42281,4.42281,4.42281,4.424118,4.424118,4.424118
2,3.919609,3.909721,3.911045,3.92669,3.928896,3.893973,3.911957,3.924326,3.928178,3.948567,...,3.924566,3.923241,3.923048,3.922952,3.923145,3.923145,3.923145,3.923048,3.923048,3.923048
3,2.419388,2.43503,2.350165,2.430527,2.418325,2.438244,2.401472,2.428237,2.418392,2.484711,...,2.426737,2.428395,2.428602,2.428706,2.428499,2.428499,2.428499,2.428602,2.428602,2.428602


So, let's try evaluating the model now. Do note that I will be skipping movies that were not in train data. I'm aware this introduces a certain bias in my results but I was more interested in seeing how this works.

In [32]:
# These two will hold predicted vs actual data

actual_values = []
predicted_values = []

In [34]:
raw_test_data.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
45902,304,1747,2.0,916237654
23155,159,1961,3.0,1508641205
78586,489,904,4.5,1385823606


In [35]:
from math import sqrt
from sklearn.metrics import mean_squared_error

In [36]:
i = 0

for key, row in raw_test_data.iterrows():
    userId = row['userId']
    movieId = row['movieId']
    rating = row['rating']
    
    # First, let's see if this movie exists in train data
    
    if movieId not in train_data.columns:
        continue
        
    predicted = train_data_predicted.loc[userId][movieId]
    
    #print("User ID: {}, Movie ID: {}, Predicted: {}, Actual: {}".format(userId, movieId, predicted, rating))
    
    predicted_values.append(predicted)
    actual_values.append(rating)

In [37]:
sqrt(mean_squared_error(actual_values, predicted_values))

0.9253234123373119

Well, that's the best MSE I've ever got so far ... Pretty nice!

Finally, let's build a predicter system, just for the fun of doing it. I'll use the full data set for the sake of "wht the hell not" :)

In [43]:
users_mean_ratings = ratings.mean(axis=1)
normalized_ratings = ratings.apply(lambda x: x - users_mean_ratings[x.name], axis=1).fillna(0)
normalized_ratings.head(3)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,0.0,-0.366379,0.0,0.0,-0.366379,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
U, sigma, Vt = svds(normalized_ratings.values, k = 2)
sigma = np.diag(sigma)
predicted = np.dot(np.dot(U, sigma), Vt) + users_mean_ratings.values.reshape(-1, 1)

predicted = pd.DataFrame(index=ratings.index, columns=ratings.columns, data=predicted)
predicted.head(3)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.443717,4.341435,4.356351,4.360288,4.310133,4.415491,4.339169,4.357135,4.354477,4.357889,...,4.366355,4.366297,4.366414,4.366414,4.366355,4.366414,4.366355,4.366355,4.366355,4.366679
2,3.950691,3.950173,3.948489,3.948707,3.953105,3.948302,3.950168,3.948705,3.948743,3.954024,...,3.948268,3.94825,3.948287,3.948287,3.948268,3.948287,3.948268,3.948268,3.948268,3.948285
3,2.433803,2.442137,2.437264,2.437355,2.451117,2.431433,2.442337,2.43764,2.437965,2.448633,...,2.435884,2.435851,2.435917,2.435917,2.435884,2.435917,2.435884,2.435884,2.435884,2.435889


In [80]:
movies = pd.read_csv('./movies.csv')

In [84]:
def get_movie_title(id):
    return movies.loc[id]['title']

def get_movie_genre(id):
    return movies.loc[id]['genres']

In [75]:
def get_movies_to_watch(userId, k=10):
    # Get a list of movies that this user DID NOT watch
    unrated_movies = ratings.loc[userId][pd.isna(ratings.loc[1])].index.values
    
    # Get predicted ratings for unrated movies, sort and return the top-k
    sorted_unrated_movies = predicted.loc[userId][unrated].sort_values(ascending=False)
    
    return sorted_unrated_movies[:k]

In [86]:
target_user = 1 # user for whom we want to predict movies to watch

Let's first list some of the movies that he liked the most 

In [92]:
for movieId in ratings.loc[target_user].dropna().sort_values(ascending=False).index[0:20]:
    print("{} ({})".format(get_movie_title(movieId), get_movie_genre(movieId)))

Dark Days (2000) (Documentary)
Artemisia (1997) (Drama)
NeverEnding Story, The (1984) (Adventure|Children|Fantasy)
Snake Eyes (1998) (Action|Crime|Mystery|Thriller)
Man with Two Brains, The (1983) (Comedy)
Great Mouse Detective, The (1986) (Action|Animation|Children|Crime)
Big Lebowski, The (1998) (Comedy|Crime)
Titanic (1997) (Drama|Romance)
Flubber (1997) (Children|Comedy|Fantasy)
Alien: Resurrection (1997) (Action|Horror|Sci-Fi)
Tales from the Darkside: The Movie (1990) (Fantasy|Horror|Thriller)
I Love You, I Love You Not (1996) (Drama|Romance)
Deceiver (1997) (Crime|Drama|Thriller)
Fast, Cheap & Out of Control (1997) (Documentary)
U Turn (1997) (Crime|Drama|Mystery)
Game, The (1997) (Drama|Mystery|Thriller)
Wishmaster (1997) (Horror)
Kiss the Girls (1997) (Crime|Drama|Mystery|Thriller)
Wrongfully Accused (1998) (Action|Comedy)
Rambo III (1988) (Action|Adventure|Thriller|War)


Judging by looking at movie list, looks like he's a fan of Actio and Comedy I guess? Let's see what the predicted stuff is

In [93]:
for movieId in get_movies_to_watch(target_user, 20).index:
    print("{} ({})".format(get_movie_title(movieId), get_movie_genre(movieId)))

I Love Trouble (1994) (Action|Comedy)
Escape from New York (1981) (Action|Adventure|Sci-Fi|Thriller)
Zapped! (1982) (Comedy|Sci-Fi)
Soul Food (1997) (Drama)
Animal Crackers (1930) (Comedy|Musical)
Last Dance (1996) (Drama)
Jack and Sarah (1995) (Romance)
Pompatus of Love, The (1996) (Comedy|Drama)
Babe (1995) (Children|Drama)
Oklahoma! (1955) (Musical|Romance|Western)
Grand Day Out with Wallace and Gromit, A (1989) (Adventure|Animation|Children|Comedy|Sci-Fi)
TiMER (2009) (Comedy|Drama|Fantasy|Romance)
Georgy Girl (1966) (Comedy)
Surrogates (2009) (Action|Sci-Fi|Thriller)
Desperate Measures (1998) (Crime|Drama|Thriller)
Pride and Glory (2008) (Crime|Drama)
Underneath (1995) (Mystery|Thriller)
Wings of Desire (Himmel über Berlin, Der) (1987) (Drama|Fantasy|Romance)
King Solomon's Mines (1985) (Adventure|Comedy)
Withnail & I (1987) (Comedy)


Well, I hope that he'd like these :-)