In [15]:
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds

# Load the raw ratings and reshape them into a userId x movieId matrix.
file = 'ratings.csv'
df = pd.read_csv(file)
ratings_matrix = (
    df.pivot(index='userId', columns='movieId', values='rating')
    .dropna(thresh=10, axis=0)  # keep rows (users) with at least 10 non-NaN ratings
    .dropna(thresh=20, axis=1)  # then keep columns (movies) with at least 20 non-NaN ratings
)
# Replace the remaining gaps with a constant 2.5 so the matrix is dense for the SVD.
ratings_matrix_without_na = ratings_matrix.fillna(2.5)
print(ratings_matrix)

# Centre every row on its own mean (computed over the filled matrix) before factorising.
R = ratings_matrix_without_na.values
user_ratings_mean = R.mean(axis=1)
R_demeaned = R - user_ratings_mean[:, np.newaxis]

# Truncated SVD with 300 singular triplets; rebuild the matrix and add the row means back.
U, sigma, Vt = svds(R_demeaned, k=300)
sigma = np.diag(sigma)
all_user_predicted_ratings = U.dot(sigma).dot(Vt) + user_ratings_mean[:, np.newaxis]

movieId  1       2       3       5       6       7       10      11      \
userId                                                                    
1           4.0     NaN     4.0     NaN     4.0     NaN     NaN     NaN   
2           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
3           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
4           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
5           4.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
...         ...     ...     ...     ...     ...     ...     ...     ...   
606         2.5     NaN     NaN     NaN     NaN     2.5     NaN     2.5   
607         4.0     NaN     NaN     NaN     NaN     NaN     NaN     3.0   
608         2.5     2.0     2.0     NaN     NaN     NaN     4.0     NaN   
609         3.0     NaN     NaN     NaN     NaN     NaN     4.0     NaN   
610         5.0     NaN     NaN     NaN     5.0     NaN     NaN     NaN   

movieId  16      17     

In [29]:
# Wrap the reconstructed matrix in a DataFrame that carries the same userId / movieId
# labels as the filled ratings matrix it was computed from.
preds_df = pd.DataFrame(
    all_user_predicted_ratings,
    index=ratings_matrix_without_na.index,
    columns=ratings_matrix_without_na.columns,
)
# Write the userId index to the file: without it the rows of the CSV cannot be mapped
# back to users. DataFrame.to_csv returns None when given a path, so its result is
# deliberately not bound to a name.
preds_df.to_csv("preds_df.csv")
print(preds_df)

movieId    1         2         3         5         6         7         10      \
userId                                                                          
1        4.109035  2.478175  4.018092  2.540547  4.038038  2.562586  2.424897   
2        2.472785  2.553877  2.435443  2.652260  2.493768  2.489383  2.593776   
3        2.345767  2.362979  2.612378  2.532266  2.717807  2.325320  2.498705   
4        2.610368  2.568247  2.528285  2.550830  2.596556  2.480706  2.491853   
5        4.050518  2.787768  2.652472  2.694362  2.645796  2.498920  2.238768   
...           ...       ...       ...       ...       ...       ...       ...   
606      2.517974  2.563921  2.538516  2.525910  2.455976  2.496923  2.445914   
607      4.030515  2.504351  2.558206  2.514530  2.471593  2.540678  2.680623   
608      2.517856  1.958807  1.985810  2.519315  2.412509  2.423381  4.011740   
609      2.972722  2.609247  2.549427  2.412548  2.475535  2.469227  3.254849   
610      5.033734  2.545382 

In [30]:
# Blank out (set to NaN) every user/movie cell that already has a real rating in
# ratings_matrix, leaving predictions only for the cells that were missing.
new_table = preds_df.mask(ratings_matrix.notna())
# Keep the userId index in the file so each row stays attributable to a user.
# DataFrame.to_csv returns None when given a path, so nothing is assigned from it.
new_table.to_csv("new_table.csv")
print(new_table)

movieId    1         2         3         5         6         7         10      \
userId                                                                          
1             NaN  2.478175       NaN  2.540547       NaN  2.562586  2.424897   
2        2.472785  2.553877  2.435443  2.652260  2.493768  2.489383  2.593776   
3        2.345767  2.362979  2.612378  2.532266  2.717807  2.325320  2.498705   
4        2.610368  2.568247  2.528285  2.550830  2.596556  2.480706  2.491853   
5             NaN  2.787768  2.652472  2.694362  2.645796  2.498920  2.238768   
...           ...       ...       ...       ...       ...       ...       ...   
606           NaN  2.563921  2.538516  2.525910  2.455976       NaN  2.445914   
607           NaN  2.504351  2.558206  2.514530  2.471593  2.540678  2.680623   
608           NaN       NaN       NaN  2.519315  2.412509  2.423381       NaN   
609           NaN  2.609247  2.549427  2.412548  2.475535  2.469227       NaN   
610           NaN  2.545382 

In [28]:
def recommend(id, n=10):
    """Return title and genres of the highest-predicted movies for one user.

    Reads two module-level frames: ``new_table`` (rows labelled by user,
    columns by movieId, NaN where no prediction should be offered) and
    ``df_movies`` (indexed by movieId, with ``title`` and ``genres`` columns).

    Parameters
    ----------
    id
        Row label in ``new_table`` identifying the user. The name shadows the
        built-in ``id`` inside this function; it is kept so existing callers
        (including keyword callers) continue to work.
    n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``n`` rows of ``df_movies[['title', 'genres']]``, ordered by
        descending predicted rating.

    Raises
    ------
    KeyError
        If ``id`` is not a row label of ``new_table``, or if a selected movieId
        is absent from ``df_movies``.
    """
    # Drop NaN cells first so only real predictions compete for the top spots.
    recommendations = new_table.loc[id].dropna().sort_values(ascending=False)
    top_recommendations_indexes = recommendations.head(n).index
    return df_movies.loc[top_recommendations_indexes, ['title', 'genres']]

# Load the movie catalogue keyed by movieId so recommend() can look rows up by id.
movies_file = 'movies.csv'
df_movies = pd.read_csv(movies_file).set_index('movieId')

top_recommendations = recommend(8)
print(f'Recommendations:{top_recommendations}')
# DataFrame.to_csv returns None when writing to a path; do not rebind
# top_recommendations to that result, or the DataFrame is lost for later cells.
top_recommendations.to_csv("top.csv", index=False, header=True)

Recommendations:                                                     title  \
movieId                                                      
300                                       Quiz Show (1994)   
595                            Beauty and the Beast (1991)   
288                            Natural Born Killers (1994)   
161                                    Crimson Tide (1995)   
265      Like Water for Chocolate (Como agua para choco...   
19                   Ace Ventura: When Nature Calls (1995)   
266                             Legends of the Fall (1994)   
62                               Mr. Holland's Opus (1995)   
111                                     Taxi Driver (1976)   
509                                      Piano, The (1993)   

                                                  genres  
movieId                                                   
300                                                Drama  
595      Animation|Children|Fantasy|Musical|Romance|IMAX  
288

In [26]:
def experiment_with_different_k(k):
    """Print the truncated-SVD reconstruction of the global ``R_demeaned`` for ``k`` singular values.

    The matrix is factorised with ``svds`` using ``k`` singular triplets, the
    three factors are multiplied back together, and the result is printed
    followed by a dotted separator line. Nothing is returned.
    """
    left_vectors, singular_values, right_vectors_t = svds(R_demeaned, k=k)
    approximation = left_vectors.dot(np.diag(singular_values)).dot(right_vectors_t)
    print(approximation)
    print('....................................................')
# Show the original demeaned matrix first, then its reconstruction at decreasing k.
print(R_demeaned)
print('....................................................')
for rank in (600, 500, 300, 200, 100, 50, 10, 1):
    experiment_with_different_k(rank)

[[ 1.23361604 -0.26638396  1.23361604 ... -0.26638396 -0.26638396
  -0.26638396]
 [-0.02544333 -0.02544333 -0.02544333 ... -0.02544333 -0.02544333
  -0.02544333]
 [ 0.01580571  0.01580571  0.01580571 ...  0.01580571  0.01580571
   0.01580571]
 ...
 [-0.34772552 -0.84772552 -0.84772552 ... -0.34772552 -0.34772552
  -0.34772552]
 [ 0.4849653  -0.0150347  -0.0150347  ... -0.0150347  -0.0150347
  -0.0150347 ]
 [ 1.90593678 -0.59406322 -0.59406322 ...  1.90593678  0.90593678
   1.90593678]]
....................................................
[[ 1.23329268 -0.26661711  1.23377098 ... -0.26583092 -0.26619474
  -0.2662601 ]
 [-0.02536603 -0.02498092 -0.02641406 ... -0.02466658 -0.02482583
  -0.02609371]
 [ 0.01704084  0.01477683  0.01309043 ...  0.01479848  0.01480508
   0.01624865]
 ...
 [-0.34778188 -0.847174   -0.84765106 ... -0.34792605 -0.34771211
  -0.3474324 ]
 [ 0.47472869 -0.04610165  0.02346295 ... -0.00672356 -0.03475209
   0.00582828]
 [ 1.90600886 -0.59406869 -0.59384533 ...  1.9