In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [44]:
df_movies = pd.read_csv('movies.csv')
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [29]:
df_ratings = pd.read_csv('ratings.csv')
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [30]:
# Converting the rows to one row per user
R_df = df_ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)
R_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# SVD
## R = U Σ V^T

where 
R
 is user ratings matrix, 
U
 is the user “features” matrix, 
Σ
 is the diagonal matrix of singular values (essentially weights), and 
V
T
 is the movie “features” matrix. 
U
 and 
V^T
 are orthogonal, and represent different things. 
U
 represents how much users “like” each feature and 
V^T
 represents how relevant each feature is to each movie.

In [31]:
R = R_df.values
user_ratings_mean = np.mean(R, axis = 1)# ROw Mean
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [32]:
print(R.shape)

print(user_ratings_mean.shape)

print(R_demeaned.shape)

(610, 9724)
(610,)
(610, 9724)


In [33]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R_demeaned, k = 50) #k is no of singular values or vectors
print(f"U shape: {U.shape}, sigma shape: {sigma.shape}, Vt shape: {Vt.shape}")

# Rank 1 + Rank 1 + ,,,,,,,, = Total 50 Ranks(k)

U shape: (610, 50), sigma shape: (50,), Vt shape: (50, 9724)


In [34]:
print(sigma)
sigma = np.diag(sigma)#convert sigma to diagnol matrix
print(sigma)

[ 67.86628347  68.1967072   69.02678246  69.4170401   69.91863747
  70.02091789  70.19408599  71.67445157  72.43371861  73.21879553
  73.43760593  74.02644882  74.28978377  74.9207733   75.17528213
  75.59325141  76.70227225  77.35717925  78.39405157  79.04344482
  79.21217131  80.56747647  81.5467832   82.1973482   83.04447645
  85.11688914  85.74871886  86.51711471  87.91550637  90.33575237
  90.9340682   92.26271695  93.39976829  97.10067118  99.28906754
  99.82361796 101.84794614 105.97367358 107.04782929 109.20838712
 112.80840902 120.61532345 122.64724436 134.58721632 139.637245
 153.93097112 163.73084057 184.86187801 231.22453421 474.20606204]
[[ 67.86628347   0.           0.         ...   0.           0.
    0.        ]
 [  0.          68.1967072    0.         ...   0.           0.
    0.        ]
 [  0.           0.          69.02678246 ...   0.           0.
    0.        ]
 ...
 [  0.           0.           0.         ... 184.86187801   0.
    0.        ]
 [  0.           0. 

In [35]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
print(all_user_predicted_ratings.shape)

(610, 9724)


In [36]:
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = R_df.columns)
preds_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
0,2.167328,0.402751,0.840184,-0.076281,-0.551337,2.504091,-0.890114,-0.026443,0.196974,1.593259,...,-0.023453,-0.019967,-0.026939,-0.026939,-0.023453,-0.026939,-0.023453,-0.023453,-0.023453,-0.058732
1,0.211459,0.006658,0.033455,0.017419,0.18343,-0.062473,0.083037,0.024158,0.04933,-0.15253,...,0.019498,0.016777,0.022219,0.022219,0.019498,0.022219,0.019498,0.019498,0.019498,0.032281
2,0.003588,0.030518,0.046393,0.008176,-0.006247,0.107328,-0.012416,0.003779,0.007297,-0.059362,...,0.005909,0.006209,0.00561,0.00561,0.005909,0.00561,0.005909,0.005909,0.005909,0.008004
3,2.051549,-0.387104,-0.252199,0.087562,0.130465,0.27021,0.477835,0.040313,0.025858,-0.017365,...,0.004836,0.004172,0.0055,0.0055,0.004836,0.0055,0.004836,0.004836,0.004836,-0.023311
4,1.344738,0.778511,0.065749,0.111744,0.273144,0.584426,0.25493,0.128788,-0.085541,1.023455,...,-0.008042,-0.007419,-0.008664,-0.008664,-0.008042,-0.008664,-0.008042,-0.008042,-0.008042,-0.010127


In [70]:
def recommend_movies(predictions_df, userID, movies_df, original_ratings_df, num_recommendations=4):
    
    
    user_row_number = userID - 1 # Actual userID in R_df = (Index-1) in predictions_df
    sorted_user_predictions = preds_df.iloc[user_row_number].sort_values(ascending=False) #descending order
    
    
    user_data = original_ratings_df[original_ratings_df.userId == userID]
    user_full = (user_data.merge(movies_df, how = 'left', left_on = 'movieId', right_on = 'movieId').
                     sort_values(['rating'], ascending=False))
    
   
    recommendations = (movies_df[~movies_df['movieId'].isin(user_full['movieId'])]. 
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'movieId',
               right_on = 'movieId').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )

    return user_full, recommendations

# movies from movies_df which is not watched by specific user(UserID:15)

In [50]:
sorted_user_predictions = preds_df.iloc[15-1].sort_values(ascending=False) 
print(sorted_user_predictions)

movieId
2571    5.546076
318     4.838602
260     4.364714
356     4.069895
1196    3.989045
          ...   
441    -0.514335
2395   -0.514545
3253   -0.516573
1288   -0.525468
1148   -0.579187
Name: 14, Length: 9724, dtype: float64


In [52]:
#print(df_ratings[df_ratings.userId == 15].shape)
#print(df_movies)
user_full = df_ratings[df_ratings.userId == 15].merge(df_movies, how = 'left', left_on = 'movieId', right_on = 'movieId').sort_values(['rating'], ascending=False)
print(user_full)

     userId  movieId  rating   timestamp                           title  \
32       15     2001     5.0  1299424826          Lethal Weapon 2 (1989)   
98       15    84152     5.0  1510572834                Limitless (2011)   
46       15     3147     5.0  1510571797          Green Mile, The (1999)   
38       15     2150     5.0  1299425040  Gods Must Be Crazy, The (1980)   
47       15     3156     5.0  1510573324         Bicentennial Man (1999)   
..      ...      ...     ...         ...                             ...   
125      15   122924     1.0  1510573305        X-Men: Apocalypse (2016)   
10       15      355     1.0  1299425002         Flintstones, The (1994)   
4        15      172     1.0  1299424762          Johnny Mnemonic (1995)   
3        15      158     1.0  1299424840                   Casper (1995)   
107      15    99114     1.0  1510571958         Django Unchained (2012)   

                              genres  
32         Action|Comedy|Crime|Drama  
98       

In [69]:
#print(df_movies['movieId'].isin(user_full['movieId']))
#print(~df_movies['movieId'].isin(user_full['movieId']))
#df_movies[~df_movies['movieId'].isin(user_full['movieId'])]
'''print(df_movies[~df_movies['movieId'].isin(user_full['movieId'])]. 
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'movieId',
               right_on = 'movieId').rename(columns = {user_row_number: 'Predictions'}))'''

recommendations = (df_movies[~df_movies['movieId'].isin(user_full['movieId'])]. 
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'movieId',
               right_on = 'movieId').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )
print(recommendations)

NameError: name 'user_row_number' is not defined

In [71]:
full, predictions = recommend_movies(preds_df, 15, df_movies, df_ratings, 10)

In [72]:
full.head(15)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
32,15,2001,5.0,1299424826,Lethal Weapon 2 (1989),Action|Comedy|Crime|Drama
98,15,84152,5.0,1510572834,Limitless (2011),Sci-Fi|Thriller
46,15,3147,5.0,1510571797,"Green Mile, The (1999)",Crime|Drama
38,15,2150,5.0,1299425040,"Gods Must Be Crazy, The (1980)",Adventure|Comedy
47,15,3156,5.0,1510573324,Bicentennial Man (1999),Drama|Romance|Sci-Fi
49,15,3510,5.0,1299425097,Frequency (2000),Drama|Thriller
33,15,2011,5.0,1510572060,Back to the Future Part II (1989),Adventure|Comedy|Sci-Fi
65,15,4995,5.0,1510571789,"Beautiful Mind, A (2001)",Drama|Romance
27,15,1270,5.0,1510571953,Back to the Future (1985),Adventure|Comedy|Sci-Fi
52,15,3578,5.0,1510571768,Gladiator (2000),Action|Adventure|Drama


In [73]:
predictions

Unnamed: 0,movieId,title,genres
494,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
4726,7153,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy
94,110,Braveheart (1995),Action|Drama|War
1153,1580,Men in Black (a.k.a. MIB) (1997),Action|Comedy|Sci-Fi
1253,1704,Good Will Hunting (1997),Drama|Romance
6920,68157,Inglourious Basterds (2009),Action|Drama|War
120,150,Apollo 13 (1995),Adventure|Drama|IMAX
405,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
816,1097,E.T. the Extra-Terrestrial (1982),Children|Drama|Sci-Fi
962,1291,Indiana Jones and the Last Crusade (1989),Action|Adventure
