In [1]:
import pandas as pd
import numpy as np

from surprise import Dataset, Reader, accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise.prediction_algorithms import BaselineOnly, SVDpp, SVD

In [2]:
df_col = pd.read_csv('./data/review_all_clean.csv')
df_col.head()

Unnamed: 0,rating,user_id,movie_id,reviews
0,5,A2VHSG6TZHU1OB,1527665,Having lived in West New Guinea (Papua) during...
1,5,A1KM9FNEJ8Q171,1527665,"More than anything, I've been challenged to fi..."
2,4,A38LY2SSHVHRYB,1527665,This is a great movie for a missionary going i...
3,5,AHTYUW2H1276L,1527665,This movie was in ENGLISH....it was a great su...
4,5,A3M3HCZLXW0YLF,1527665,"This is a fascinating true story, well acted b..."


In [3]:
df_col.drop(columns='reviews', axis=1, inplace=True)
df_col.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3013831 entries, 0 to 3013830
Data columns (total 3 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   rating    int64 
 1   user_id   object
 2   movie_id  object
dtypes: int64(1), object(2)
memory usage: 69.0+ MB


In [36]:
df_col['user_id'].value_counts().head(50)

user_id
A1L2DLWYRNHKDR    428
A2YUA3H1LLU53Z    400
A1TA5QYECZP1L1    398
A3DZT870KCFD1     328
A1M0YAZ6JV488Z    313
A1IWR4YH4ZA9BM    284
A1I1U2M5KSPY1R    248
AJNIACIPB2I5P     242
A2K4PH68WESPT4    227
A3I4C4LS3ID7Y4    221
A30M9ME5AT8K12    216
AIMR915K4YCN      214
A3RPLCEFOTTVO6    208
A3OU8T2DWM38EX    201
A3VO05LD5WSO9F    190
ABVQ4VI0UHIK2     188
A3MPTLMGS9DHVZ    183
A1XMHK9HN5MW2H    177
A2GIBFKJS5FXKX    177
AMM20XSJM32PA     174
AZQ8EITRKV9GS     171
A38X3820MO0TAN    166
A2K62IHNYD9CBR    164
A1PGC1GEBY9CZU    163
A3956O40ZZYFMY    160
A1TJLSSL0J0ZHA    157
A3PHL18RYME2UB    156
A2EBHVSUECXI3I    156
A101IGU6UDKW3X    155
A35TR9OOETQ170    154
AB5M1SO1RUMP8     153
AB6QZWTQR54XU     152
A34PHU91ZPOFL7    151
A3CWLNCVLI0F64    150
AQ7AA5G9G224E     149
A19SBIW8TFMZ5S    148
A39MC854HO5GE6    147
A25XWFC8NMM08D    139
A2TA5Q3W87GP78    135
A3SS6VRWCTB7V     134
AQXG8FJANG281     131
A155LCIZE7CS51    131
A6T098A9B0GZC     131
A1W5RUAJRI8ZD3    129
A31UN01C95N0TV    129
A1

In [60]:
df = df_col[df_col['user_id'].isin(df_col['user_id'].value_counts()[df_col['user_id'].value_counts() >= 3].index)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1274639 entries, 7 to 3013830
Data columns (total 3 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   rating    1274639 non-null  int64 
 1   user_id   1274639 non-null  object
 2   movie_id  1274639 non-null  object
dtypes: int64(1), object(2)
memory usage: 38.9+ MB


In [23]:
df['user_id'].value_counts().mean()

5.203683711191808

In [24]:
reader = Reader()
data = Dataset.load_from_df(df[['user_id', 'movie_id', 'rating']], reader)

In [25]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [31]:
baselinee = BaselineOnly()
baselinee.fit(trainset)
predictions = baselinee.test(testset)
base_pred = accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 1.0556


In [20]:
svd_cv = SVD()
cv_svd = cross_validate(svd_cv, data, measures=['RMSE'], n_jobs=-1, verbose=True)

for i in cv_svd.items():
    print(i)
print('-----------------------')
print(np.mean(cv_svd['test_rmse']))

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0420  1.0405  1.0426  1.0394  1.0433  1.0416  0.0014  
Fit time          6.17    6.30    6.32    6.10    5.82    6.14    0.18    
Test time         2.24    2.13    2.01    1.88    1.77    2.00    0.17    
('test_rmse', array([1.04197293, 1.04046707, 1.04260458, 1.0394358 , 1.04334289]))
('fit_time', (6.165508270263672, 6.297786712646484, 6.319845199584961, 6.096203327178955, 5.822165250778198))
('test_time', (2.2359492778778076, 2.1276607513427734, 2.0060012340545654, 1.8811333179473877, 1.7694106101989746))
-----------------------
1.0415646551472708


In [23]:
params = {'n_factors': [10, 20, 30],
          'n_epochs':[10, 20, 40],
         'reg_all': [0.01, 0.02, 0.05]}
g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1, joblib_verbose=10)
g_s_svd.fit(data)

print(g_s_svd.best_score)
print(g_s_svd.best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   31.9s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:   48.0s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 118 out of 135 | elapsed:  3.0min remaining:   26.0s
[Parallel(n_jobs=-1)]: Done 132 out of 135 | elapsed:  3.4min remaining:    4.6s


{'rmse': 1.023629580258, 'mae': 0.7576699431970669}
{'rmse': {'n_factors': 10, 'n_epochs': 40, 'reg_all': 0.05}, 'mae': {'n_factors': 10, 'n_epochs': 40, 'reg_all': 0.02}}


[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:  3.5min finished


In [9]:
SVD_base = SVD(random_state=42)
SVD_base.fit(trainset)
predictions = SVD_base.test(testset)
kn_first = accuracy.rmse(predictions)

RMSE: 1.0412


In [27]:
SVD_base = SVD(n_factors=10, n_epochs=40, random_state=42)
SVD_base.fit(trainset)
predictions = SVD_base.test(testset)
kn_first = accuracy.rmse(predictions)

RMSE: 1.0334


In [28]:
svd_pp_cv = SVDpp()
cv_svdpp = cross_validate(svd_pp_cv, data, measures=['RMSE'], n_jobs=-1, verbose=True)

for i in cv_svdpp.items():
    print(i)
print('-----------------------')
print(np.mean(cv_svdpp['test_rmse']))

Evaluating RMSE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0399  1.0368  1.0360  1.0367  1.0411  1.0381  0.0020  
Fit time          11.27   11.24   11.18   11.06   11.03   11.16   0.10    
Test time         4.14    4.08    3.92    4.01    3.99    4.03    0.08    
('test_rmse', array([1.0399138 , 1.03676999, 1.03604482, 1.03666629, 1.04105292]))
('fit_time', (11.270710229873657, 11.243730545043945, 11.18043565750122, 11.059995174407959, 11.027558088302612))
('test_time', (4.143270969390869, 4.081752777099609, 3.9197046756744385, 4.014470815658569, 3.9861934185028076))
-----------------------
1.0380895628011755


In [30]:
params = {'n_factors': [10, 20, 30],
          'n_epochs': [20, 40, 60],
         'reg_all': [0.02, 0.05, 0.1],
          'cache_ratings': [True, False]
         }
g_s_svdpp = GridSearchCV(SVDpp,param_grid=params,n_jobs=-1, measures=['RMSE'], joblib_verbose=10)
g_s_svdpp.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   30.6s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   48.7s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 233 tasks      | elapsed: 1

In [32]:
print(g_s_svdpp.best_score)
print(g_s_svdpp.best_params)

{'rmse': 1.0258731469254718}
{'rmse': {'n_factors': 30, 'n_epochs': 40, 'reg_all': 0.1, 'cache_ratings': False}}


In [61]:
SVDpp_final = SVDpp(n_factors=30, n_epochs=40, reg_all=0.1, cache_ratings=False)
SVDpp_final.fit(trainset)
predictions = SVDpp_final.test(testset)
kn_first = accuracy.rmse(predictions)

RMSE: 1.0248


In [76]:
def recommend_movies(user_id, trained_model, movie_df, N=10):
    # Get a list of all movies that the user hasn't seen yet
    user_movies = movie_df[movie_df['user_id'] == user_id]['movie_id'].tolist()
    all_movies = movie_df['movie_id'].tolist()
    unseen_movies = set(all_movies) - set(user_movies)

    # Create a dataframe of predictions for all unseen movies
    predictions = []
    for movie_id in unseen_movies:
        predicted_rating = trained_model.predict(user_id, movie_id).est
        predictions.append({'movie_id': movie_id, 'predicted_rating': predicted_rating})
    predictions_df = pd.DataFrame(predictions)

    # Sort predictions by rating and return top N
    top_N = predictions_df.sort_values('predicted_rating', ascending=False).head(N)
    top_N_movie_ids = top_N['movie_id'].tolist()

    # Get the details of the top N movies, including title and rating
    top_N_movies = movie_df[movie_df['movie_id'].isin(top_N_movie_ids)]
    top_N_movies.drop_duplicates(subset=['title'], inplace=True)

    # Merge top N movie ratings with the details dataframe
    top_N_ratings = pd.merge(top_N, top_N_movies, on='movie_id')

    # Return the top N movie details (excluding title)
    return top_N_ratings.drop('title', axis=1)

In [46]:
movies_df = pd.read_csv('./data/meta_all_clean.csv')
movies_df.rename(columns={'asin': 'movie_id'}, inplace=True)
movies_df.drop(columns =['english', 'rank'], inplace=True)
movies_df.head()

Unnamed: 0,genre,description,title,starring,movie_id
0,Christian Video,An early movie edition of the life of Jesus.,Where Jesus Walked VHS,Various,5000009
1,Movies,"In Depression-era New England, a miserly busin...",An American Christmas Carol VHS,Various,5019281
2,Documentary,This documentary takes you on a journey...from...,A NATION ADRIFT A Chronicle of America's Prov...,Tom Kane,5092663
3,Science Fiction & Fantasy Science Fiction Anim...,This is The VHS Movie: SANTA CLAUS IS COMIN TO...,Santa Claus Is Comin' To Town VHS,Fred Astaire,307142493
4,Sony Pictures Home Entertainment,"Arthur, the hapless, hugely popular star of hi...",Arthur's Perfect Christmas VHS,Various,375810331


In [63]:
df_movies = pd.merge(df, movies_df, on='movie_id', how='right')
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1125298 entries, 0 to 1125297
Data columns (total 7 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   rating       1122318 non-null  float64
 1   user_id      1122318 non-null  object 
 2   movie_id     1125298 non-null  object 
 3   genre        1120623 non-null  object 
 4   description  1125298 non-null  object 
 5   title        1125295 non-null  object 
 6   starring     1125298 non-null  object 
dtypes: float64(1), object(6)
memory usage: 60.1+ MB


In [64]:
df_movies.head()

Unnamed: 0,rating,user_id,movie_id,genre,description,title,starring
0,4.0,A526JEFWQZ03V,5000009,Christian Video,An early movie edition of the life of Jesus.,Where Jesus Walked VHS,Various
1,5.0,A3K9C2YKOYV3W,5000009,Christian Video,An early movie edition of the life of Jesus.,Where Jesus Walked VHS,Various
2,3.0,A2PVV525II3GIB,5000009,Christian Video,An early movie edition of the life of Jesus.,Where Jesus Walked VHS,Various
3,1.0,AL1HSCICK2KJL,5000009,Christian Video,An early movie edition of the life of Jesus.,Where Jesus Walked VHS,Various
4,4.0,A2DAKQ91LTZMB2,5000009,Christian Video,An early movie edition of the life of Jesus.,Where Jesus Walked VHS,Various


In [78]:
user_id = 'A526JEFWQZ03V'
recommendations = recommend_movies(user_id, SVDpp_final, df_movies)
recommendations

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_N_movies.drop_duplicates(subset=['title'], inplace=True)


Unnamed: 0,movie_id,predicted_rating,rating,user_id,genre,description,starring
0,B01ELI9E9O,4.349621,5.0,A3SWMVAKRE22Q2,Action,"During a manned mission to Mars, Astronaut Mar...",Various
1,B000EMGIDC,4.321811,5.0,A1QWX2MYHZQHPG,Science Fiction & Fantasy Science Fiction Anim...,"<![CDATA[ Venture Bros, The: Season One (DVD) ...",Christopher McCulloch
2,0783226853,4.316688,5.0,A3NGCB9QQMM9SM,Action & Adventure,Based on Frederick Forsyth's best-selling nove...,Edward Fox
3,6303637493,4.315063,5.0,AW97ZK5FDSELS,Art House & International France,This interesting Belgian film from 1994 has a ...,Stefano Dionisi
4,B009FLZL04,4.310209,5.0,A3EDIPCXT15C3T,Documentary,We follow 94 year old 82nd Airborne veteran Ja...,Various
5,B00450AG1Y,4.308032,5.0,A3JFVVVZN7RI1H,Military & War,Christianity,Various
6,B00005JPH2,4.305267,5.0,A1J0GRQCGZWZDE,Science Fiction & Fantasy Fantasy,UPC:786936735437<br>DESCRIPTION: The magical w...,Ben Barnes
7,B00OLII2DO,4.298429,5.0,A3CDOQMJTHINPT,Documentary,Set against the chaotic backdrop of recent eve...,Mosab Hassan Yousef
8,B00GDEKMGO,4.288402,5.0,AD27YH6IRSCVU,Action & Adventure,<b>SEASON 4 OF THE ONLY SHOW FOR REAL AVIATION...,Anthony Nalli
9,B001CK7ONS,4.286119,5.0,A102W7P70UV0OH,All Universal Studios Titles,A boatload of beloved VeggieTales pals embarks...,Mike Nawrocki


In [79]:
user_id = 'A3NGCB9QQMM9SM'
recommendations = recommend_movies(user_id, SVDpp_final, df_movies)
recommendations

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_N_movies.drop_duplicates(subset=['title'], inplace=True)


Unnamed: 0,movie_id,predicted_rating,rating,user_id,genre,description,starring
0,B0001FT50A,5.0,5.0,A1ZHY00KZSGHKS,Drama,"The Gospel of John, the best-loved of the four...",Christopher Plummer
1,B00KQJRZ8U,5.0,4.0,A3JIPFRO667EW7,Documentary,Rich Hill intimately chronicles the turbulent ...,Alyssa Jewell
2,B0024F08TW,5.0,5.0,AO73V8IQ0R5CD,Fox TV,<br><b>Genre: </b>Television: Series<br><b>Rat...,Reba McEntire
3,B003R0MF5I,5.0,4.0,A1G2WFEXLVBPN,Warner Home Video,<![CDATA[ Chuck: The Complete Fourth Season Th...,Zachary Levi
4,B004QOB8SO,5.0,5.0,AR9Z4LT28HV2Q,BBC,<![CDATA[ Sherlock: Season Two (BBC/DVD) Nomin...,Benedict Cumberbatch
5,B01ELI9E9O,5.0,5.0,A3SWMVAKRE22Q2,Action,"During a manned mission to Mars, Astronaut Mar...",Various
6,6305962596,5.0,5.0,A2J61ONLQX1C8D,Kids & Family,"With its unforgettably heroic story, its stunn...",Kirk Douglas
7,B009999YD2,5.0,5.0,ACMNJZOGHV119,Action & Adventure,"Each year, through the Extreme Mustang Makeove...",Carlos Chee
8,6304176287,5.0,5.0,A3BDXJRB6M1ZLN,Warner Home Video,A poor boy wins the opportunity to tour the mo...,Gene Wilder
9,B000X1CMXO,5.0,5.0,A2UM4VBPFMHZL9,All Universal Studios Titles,Laserdisc version of the movie Jaws.,Roy Scheider
