In [1]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds

In [31]:
# Load the MovieLens "ml-latest-small" ratings table.
# NOTE(review): the path is an absolute location on the author's F: drive, so
# this cell only runs on a machine that has the dataset at that exact path.
ratings_df = pd.read_csv(
    'F:\\RecommendationEngine\\dataset\\ml-latest-small\\ratings.csv'
)
print('Ratings Dataset Shape:', ratings_df.shape)
print('Unique Users:', ratings_df['userId'].nunique())
ratings_df.head()

Ratings Dataset Shape: (100836, 4)
Unique Users: 610


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [32]:
# Load the MovieLens "ml-latest-small" movie catalogue (movieId, title, genres).
# NOTE(review): the path is an absolute location on the author's F: drive, so
# this cell only runs on a machine that has the dataset at that exact path.
movies_df = pd.read_csv(
    'F:\\RecommendationEngine\\dataset\\ml-latest-small\\movies.csv'
)
print('Movies Dataset Shape:', movies_df.shape)
print('Unique Movies are:', movies_df['movieId'].nunique())
movies_df.head()

Movies Dataset Shape: (9742, 3)
Unique Movies are: 9742


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [33]:
# Build the user x movie rating matrix: one row per userId, one column per
# movieId, each cell holding that user's rating for that movie.
# - aggfunc is given as the string 'max' rather than np.max: the result is the
#   same, but newer pandas releases warn when a NumPy reduction callable is
#   passed here.
# - (user, movie) pairs with no rating come out of the pivot as NaN and are
#   filled with 0 in the same expression, so the frame is never mutated in
#   place and the cell can be re-run safely.
A_df = ratings_df.pivot_table(
    index=['userId'],
    columns=['movieId'],
    values='rating',
    aggfunc='max',
).fillna(0)
A_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Plain ndarray view of the rating matrix (rows follow A_df's index, columns
# follow A_df's columns) for the NumPy / SciPy steps below.
A = A_df.to_numpy()
A

array([[4. , 0. , 4. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 2. , 2. , ..., 0. , 0. , 0. ],
       [3. , 0. , 0. , ..., 0. , 0. , 0. ],
       [5. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [35]:
# Per-row (per-user) mean over every column of A. The zero-filled "no rating"
# cells are included in the average, which is why the values are so small.
user_rating_mean = A.mean(axis=1)
user_rating_mean

array([0.10417524, 0.01177499, 0.00976964, 0.07897984, 0.01645413,
       0.11281366, 0.05049362, 0.01727684, 0.01542575, 0.0472028 ,
       0.02488688, 0.01444879, 0.01162073, 0.01676265, 0.04787125,
       0.03753599, 0.04545455, 0.19266763, 0.18850267, 0.08936652,
       0.14854998, 0.03146853, 0.04540313, 0.04128959, 0.01285479,
       0.00699301, 0.04925956, 0.1770362 , 0.03450226, 0.01655697,
       0.02015631, 0.03938708, 0.06077746, 0.03023447, 0.0096668 ,
       0.01624846, 0.00894694, 0.02581242, 0.04113534, 0.03990128,
       0.07260387, 0.16135335, 0.0533731 , 0.01655697, 0.15903949,
       0.01727684, 0.04396339, 0.0136775 , 0.00920403, 0.08864665,
       0.13939737, 0.05985191, 0.01028383, 0.01028383, 0.00730152,
       0.01799671, 0.16608392, 0.04494035, 0.04792267, 0.00843274,
       0.01624846, 0.15364048, 0.10119292, 0.2003805 , 0.01408885,
       0.14263677, 0.01470588, 0.41901481, 0.02067051, 0.02756067,
       0.01295763, 0.01923077, 0.08016248, 0.07774578, 0.02293

In [36]:
# Centre each user's row by subtracting that user's mean; the new axis turns
# the 1-D means into a column so the subtraction broadcasts across movies.
A_normalized = A - user_rating_mean[:, np.newaxis]
A_normalized

array([[ 3.89582476, -0.10417524,  3.89582476, ..., -0.10417524,
        -0.10417524, -0.10417524],
       [-0.01177499, -0.01177499, -0.01177499, ..., -0.01177499,
        -0.01177499, -0.01177499],
       [-0.00976964, -0.00976964, -0.00976964, ..., -0.00976964,
        -0.00976964, -0.00976964],
       ...,
       [ 2.23215755,  1.73215755,  1.73215755, ..., -0.26784245,
        -0.26784245, -0.26784245],
       [ 2.98755656, -0.01244344, -0.01244344, ..., -0.01244344,
        -0.01244344, -0.01244344],
       [ 4.50611888, -0.49388112, -0.49388112, ..., -0.49388112,
        -0.49388112, -0.49388112]])

In [37]:
# Truncated SVD of the mean-centred rating matrix, keeping k=50 singular
# values/vectors. `sigma` is used as the input to np.diag in the next cell
# before the three factors are multiplied back together.
U,sigma,Vt = svds(A_normalized,k=50)

In [38]:
# Expand the 1-D vector of singular values into a diagonal matrix so it can be
# used in U . sigma . Vt.
# np.diag builds a matrix from a 1-D input but *extracts* the diagonal from a
# 2-D input, so an unguarded `sigma = np.diag(sigma)` flips sigma back to a
# vector when the cell is run a second time. The guard makes the cell
# idempotent.
if sigma.ndim == 1:
    sigma = np.diag(sigma)

In [39]:
# Rebuild the low-rank approximation U . sigma . Vt and add each user's mean
# back onto their row (the mean was subtracted before factorising).
predicted_rating = U.dot(sigma).dot(Vt) + user_rating_mean[:, np.newaxis]
# Columns carry the movieId labels from A_df; rows keep the default positional
# index, so row i corresponds to the i-th row of A_df.
predicted_rating_df = pd.DataFrame(data=predicted_rating, columns=A_df.columns)

In [52]:
def recommend_movies(prediction_df,userID, movies_df, original_ratings_df,num_recommendations):
    """Return a user's rated movies and their top predicted unrated movies.

    Parameters
    ----------
    prediction_df : pandas.DataFrame
        Predicted ratings with one row per user and one column per movieId
        (column labels are the movieIds). Rows are addressed by position:
        the row for ``userID`` is ``userID - 1``, so the rows must be ordered
        by userId starting at 1 with no gaps.
    userID : int
        1-based id of the user to recommend for.
    movies_df : pandas.DataFrame
        Movie catalogue; must contain a ``movieId`` column. Its other columns
        are carried through to both returned frames.
    original_ratings_df : pandas.DataFrame
        Observed ratings; must contain ``userId``, ``movieId`` and ``rating``.
    num_recommendations : int
        Maximum number of recommendations to return.

    Returns
    -------
    (user_full, recommendations) : tuple of pandas.DataFrame
        ``user_full`` holds the user's own ratings joined with the movie
        details, sorted by rating (highest first). ``recommendations`` holds
        the catalogue rows of movies the user has not rated, ordered by
        predicted rating (highest first) and cut to ``num_recommendations``
        rows; the predicted score itself is not included.

    Raises
    ------
    IndexError
        If ``userID - 1`` is not a valid row position in ``prediction_df``.
    """
    user_row_number = userID-1
    # Reject ids <= 0 explicitly: a negative position would otherwise wrap
    # around and silently return a different user's predictions.
    if not 0 <= user_row_number < len(prediction_df):
        raise IndexError(
            'userID {} is out of range for a prediction table with {} rows'.format(
                userID, len(prediction_df)))

    # Use the table passed in by the caller (not a notebook global) and give
    # the two columns fixed names, so nothing below depends on the row label
    # or on how the columns axis happens to be named.
    user_predictions = pd.DataFrame({
        'movieId': prediction_df.columns,
        'Predictions': prediction_df.iloc[user_row_number].values,
    })

    # Everything this user has rated, with movie details, best rating first.
    user_data = original_ratings_df[original_ratings_df.userId==(userID)]
    user_full = (user_data
                 .merge(movies_df,how='left',left_on='movieId',right_on='movieId')
                 .sort_values(['rating'],ascending=False))
    print('user {} has already rated {} movies'.format(userID, user_full.shape[0]))
    print('Recommending highest {} predicted ratings movies not already rated.'.format(num_recommendations))

    # Candidate movies are the ones the user has not rated. A left merge keeps
    # movies that have no prediction column; their score is NaN and they sort
    # to the end.
    unrated_movies = movies_df[~movies_df['movieId'].isin(user_full['movieId'])]
    recommendations = (unrated_movies
                       .merge(user_predictions,how='left',left_on='movieId',right_on='movieId')
                       .sort_values('Predictions',ascending=False)
                       # 'Predictions' is the last column after the merge;
                       # ':-1' drops it so only catalogue columns are returned.
                       .iloc[:num_recommendations,:-1])
    return user_full, recommendations

# Example run: ask for 10 recommendations for userId 2. The call returns the
# movies that user has already rated and the recommended movies.
already_rated, predictions = recommend_movies(predicted_rating_df, 2, movies_df, ratings_df,10)

user 2 has already rated 29 movies
Recommending highest 10 predicted ratings movies not already rated.


In [57]:
# First ten rows of user 2's rated movies; the frame is sorted by rating,
# highest first, so these are the user's top-rated titles.
already_rated.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
28,2,131724,5.0,1445714851,The Jinx: The Life and Deaths of Robert Durst ...,Documentary
27,2,122882,5.0,1445715272,Mad Max: Fury Road (2015),Action|Adventure|Sci-Fi|Thriller
22,2,106782,5.0,1445714966,"Wolf of Wall Street, The (2013)",Comedy|Crime|Drama
18,2,89774,5.0,1445715189,Warrior (2011),Drama
9,2,60756,5.0,1445714980,Step Brothers (2008),Comedy
16,2,80906,5.0,1445715172,Inside Job (2010),Documentary
2,2,1704,4.5,1445715228,Good Will Hunting (1997),Drama|Romance
8,2,58559,4.5,1445715141,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
10,2,68157,4.5,1445715154,Inglourious Basterds (2009),Action|Drama|War
15,2,80489,4.5,1445715340,"Town, The (2010)",Crime|Drama|Thriller


In [58]:
# Recommended movies for user 2, ordered by predicted rating (highest first).
predictions

Unnamed: 0,movieId,title,genres
2223,2959,Fight Club (1999),Action|Crime|Drama|Thriller
1936,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
7398,80463,"Social Network, The (2010)",Drama
312,356,Forrest Gump (1994),Comedy|Drama|Romance|War
8850,134130,The Martian (2015),Adventure|Drama|Sci-Fi
508,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
6323,48780,"Prestige, The (2006)",Drama|Mystery|Sci-Fi|Thriller
3634,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
7032,69122,"Hangover, The (2009)",Comedy|Crime
4795,7153,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy
