In [1]:
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from scipy.sparse import coo_matrix

In [2]:
#importing data
data=pd.read_csv('ratings.csv')

In [3]:
data

Unnamed: 0,userId;movieId;rating;timestamp
0,1;1193;5;978300760
1,1;661;3;978302109
2,1;914;3;978301968
3,1;3408;4;978300275
4,1;2355;5;978824291
...,...
1000204,6040;1091;1;956716541
1000205,6040;1094;5;956704887
1000206,6040;562;5;956704746
1000207,6040;1096;4;956715648


In [4]:
#data cleaning
data[['userId', 'movieId', 'rating', 'timestamp']] = data['userId;movieId;rating;timestamp'].str.split(';', expand=True)
data = data.astype({'userId': int, 'movieId': int, 'rating': int, 'timestamp': int})

In [5]:
data.drop('userId;movieId;rating;timestamp', axis=1, inplace=True)
data.drop('timestamp', axis=1, inplace=True)

In [6]:

print(data.dtypes)
data

userId     int32
movieId    int32
rating     int32
dtype: object


Unnamed: 0,userId,movieId,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [7]:
ratings_matrix = pd.pivot_table(data, values='rating', index='userId', columns='movieId', fill_value=0)

In [8]:
#original ratings matrix
ratings_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0,0,0,2,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
U, sigma, Vt = np.linalg.svd(ratings_matrix)

In [10]:
print(Vt.shape)
print(U.shape)
print(sigma.shape)

(3706, 3706)
(6040, 6040)
(3706,)


In [11]:
print(Vt)

[[-7.01371394e-02 -2.35438150e-02 -1.37658393e-02 ... -2.61526344e-03
  -1.16635687e-03 -1.32565863e-02]
 [-2.09401541e-02 -2.97924550e-02 -1.67038987e-02 ...  1.87440015e-03
   2.26511244e-03  5.02213333e-03]
 [ 3.01647236e-02 -1.01890706e-02  1.25724193e-02 ...  1.78319162e-03
   3.52091700e-03  2.23576838e-02]
 ...
 [ 0.00000000e+00 -2.34524318e-17 -2.09874404e-17 ...  5.54718602e-17
   3.25183148e-17  1.64842192e-17]
 [ 0.00000000e+00 -9.47143690e-19  4.09864954e-18 ...  1.21750576e-17
   5.28965941e-17  5.84795322e-18]
 [ 0.00000000e+00  2.65291448e-17  3.55560463e-17 ... -9.08096433e-17
  -9.27671336e-18 -4.28091557e-17]]


In [12]:
sigma_matrix = np.diag(sigma)
sigma_matrix.shape

(3706, 3706)

In [13]:
k = 3706
ratings_matrix_approx = np.dot(np.dot(U[:, :k], sigma_matrix[:k, :k]), Vt[:k, :])

In [14]:
#Predicted ratings matrix
print(ratings_matrix_approx.shape)

(6040, 3706)


In [15]:
ratings_matrix_approx=pd.DataFrame(ratings_matrix_approx)

In [16]:
ratings_matrix_approx

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3696,3697,3698,3699,3700,3701,3702,3703,3704,3705
0,5.000000e+00,2.946202e-13,1.101945e-14,-1.297898e-14,-5.230025e-14,-3.760261e-14,-2.195661e-14,-1.003928e-13,8.642989e-15,6.445799e-15,...,-1.530893e-16,4.559612e-16,-1.609715e-15,-2.242781e-15,1.277407e-15,-2.190997e-14,-1.303862e-15,-1.588356e-16,-1.403825e-15,-1.361172e-14
1,-1.520454e-14,-2.535577e-13,-7.171897e-14,6.747641e-15,1.510971e-13,-5.451990e-14,3.363493e-14,-2.584231e-14,-3.297406e-14,7.324225e-14,...,2.088824e-15,4.649059e-16,2.404760e-16,-2.468891e-15,1.605703e-15,-1.051305e-14,5.983115e-15,-8.862269e-16,4.513263e-16,-2.965971e-15
2,-2.159031e-14,-1.578539e-14,-1.117688e-13,-9.325375e-14,6.020011e-14,3.406733e-16,-5.882881e-16,-1.194108e-14,1.058902e-14,2.802102e-14,...,2.142167e-15,1.919038e-16,-2.666704e-15,-4.781332e-16,2.035698e-15,-6.570265e-16,2.536057e-15,1.823899e-15,3.125321e-15,8.055514e-15
3,-5.279170e-15,3.154771e-15,-1.676210e-14,8.272278e-14,-1.429830e-14,-5.843145e-15,-8.009381e-15,7.359673e-15,1.384553e-15,-1.858987e-15,...,-2.230204e-16,2.289835e-16,1.973248e-16,2.401291e-15,1.416619e-15,3.855857e-15,1.664603e-15,-5.048587e-16,-1.183244e-15,5.431107e-15
4,-9.841520e-15,7.347253e-14,-8.786374e-15,7.418843e-15,-8.418125e-15,2.000000e+00,-3.554448e-15,-6.964481e-15,-3.650075e-15,-3.272156e-14,...,8.337515e-16,-2.300243e-15,3.155028e-16,-7.009367e-16,3.010613e-15,2.773972e-15,1.946143e-15,5.282233e-16,-2.537033e-16,1.273938e-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,-4.485995e-15,9.034770e-15,2.381103e-14,2.000000e+00,-4.665864e-15,3.000000e+00,-1.310233e-14,1.740936e-14,4.585819e-15,1.155876e-14,...,2.258718e-15,9.211382e-16,6.969252e-16,-1.143264e-15,1.804695e-15,1.348390e-14,-1.143400e-15,-1.551276e-15,-8.669823e-16,4.100534e-15
6036,-7.615080e-15,3.926807e-14,1.151683e-14,3.243608e-15,3.901285e-15,-2.577772e-15,3.279332e-15,4.038883e-15,1.994390e-16,6.460788e-15,...,2.233456e-16,4.631712e-16,1.138412e-17,2.217085e-15,2.068875e-15,1.385960e-14,-1.152941e-15,1.884560e-15,2.565222e-16,4.413895e-15
6037,-2.802744e-15,4.878910e-17,-4.036403e-15,-1.208560e-15,-1.205741e-15,4.066707e-16,2.054563e-17,4.071132e-15,-2.448129e-16,4.417148e-15,...,-5.130445e-16,3.295975e-16,-2.087631e-16,6.111648e-16,5.102255e-16,-3.299363e-16,7.118939e-16,1.395314e-15,-3.567025e-16,-4.306044e-16
6038,-5.529797e-15,5.961702e-15,-6.217764e-16,-5.858568e-15,9.489751e-16,-5.607616e-15,1.037365e-15,2.173310e-15,1.898980e-15,5.913577e-15,...,-7.811406e-16,1.899522e-16,-1.986258e-16,2.183583e-15,1.642458e-15,-2.289672e-15,-1.858323e-16,-8.847090e-16,3.439957e-15,-9.318176e-16


In [17]:
#recommendation system
def recommend_high_rated(predictions_df, userID, original_ratings_df):
    user_row_number = userID  # UserID starts at 1, not 0
    user_data = original_ratings_df[userID]
    predicted_data = predictions_df[userID]
    movie_id=0
    rating=0
    for i in range(predictions_df.shape[1]):
      if(user_data[i+1]== 0).any():
        if(rating<predicted_data[i]).any():
          movie_id=i+1
          rating=predicted_data[i]
    
    return movie_id


In [18]:
print(recommend_high_rated(ratings_matrix_approx, 2,ratings_matrix ))

543
