In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from scipy.sparse.linalg import svds

In [4]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv('DataSet/ml-100k/ua.base', sep='\t', names=r_cols,encoding='latin-1')
ratings_train.head(5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [5]:
totalRatingCount_train=  ratings_train.groupby(by =("movie_id"))["rating"].count().reset_index().rename(columns ={"rating": "totalRatingCount"})
ratings_train = pd.merge(ratings_train,totalRatingCount_train, on = "movie_id", how = "left")
ratings_train.head(5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,totalRatingCount
0,1,1,5,874965758,392
1,1,2,3,876893171,121
2,1,3,4,878542960,85
3,1,4,3,876893119,198
4,1,5,3,889751712,79


In [6]:
ratingCountThreshold = 50
ratings_train = ratings_train.query("totalRatingCount >= @ratingCountThreshold")
pivot_matrix_train = ratings_train.pivot_table(index = "user_id", columns = "movie_id", values = "rating").fillna(0)
pivot_matrix_train.head(5)

movie_id,1,2,3,4,5,7,8,9,10,11,...,1028,1035,1039,1041,1047,1048,1073,1074,1101,1119
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,4.0,1.0,5.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
matrix_train = pivot_matrix_train.values
ratings_mean_train = np.mean(matrix_train, axis = 1).reshape(-1, 1)
matrix_train_demeaned = matrix_train - ratings_mean_train

In [8]:
def rmse(true, pred):
    # this will be used towards the end
    x = true - pred
    x_sqr = x*x
    return  np.sqrt(mean_squared_error( true, pred))

In [9]:
def SVD(matrix_train , k =20):
    
    ratings_mean = np.mean(matrix_train, axis = 1).reshape(-1, 1)
    matrix_demeaned = matrix_train - ratings_mean
     
    U, sigma, Vt = svds(matrix_demeaned, k = k)
    sigma = np.diag(sigma)
    predicted_ratings = np.dot(np.dot(U, sigma), Vt) + ratings_mean.reshape(-1, 1)
    preds_df = pd.DataFrame(predicted_ratings, columns = pivot_matrix_train.columns)
    
    print(rmse(matrix_train,preds_df))
    
    return preds_df

In [10]:
preds_df = SVD(matrix_train, 20)

0.9440954805696053


In [11]:
def getRecommendationForUser(userID, num_recommendations=5 ):
    
    user_row_number = userID - 1
    user_predictions = preds_df.iloc[user_row_number]  
    user_full  = pivot_matrix_train.iloc[user_row_number]
    non_rated_indexes = np.where(user_full == 0)[0]
    recomendation_rating = user_predictions.iloc[non_rated_indexes].reset_index().rename(columns = {2:"Rating"}).sort_values(by = "Rating", ascending=False)
    print(f"Recommendated Movie IDs for user {userID} are following")
    print(recomendation_rating["movie_id"][0:10].to_string(index=False))
    

In [12]:
getRecommendationForUser(3,5)

Recommendated Movie IDs for user 3 are following
 313
 328
 286
 294
 315
 269
 332
  50
 690
 270
