In [1]:
import pandas as pd
import numpy as np

from surprise import Dataset, Reader, accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise.prediction_algorithms import BaselineOnly, SVDpp, SVD

In [2]:
df_col = pd.read_csv('./data/review_all_clean.csv')
df_col.head()

Unnamed: 0,rating,user_id,movie_id,reviews
0,5,A2VHSG6TZHU1OB,1527665,Having lived in West New Guinea (Papua) during...
1,5,A1KM9FNEJ8Q171,1527665,"More than anything, I've been challenged to fi..."
2,4,A38LY2SSHVHRYB,1527665,This is a great movie for a missionary going i...
3,5,AHTYUW2H1276L,1527665,This movie was in ENGLISH....it was a great su...
4,5,A3M3HCZLXW0YLF,1527665,"This is a fascinating true story, well acted b..."


In [3]:
df_col.drop(columns='reviews', axis=1, inplace=True)
df_col.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3013831 entries, 0 to 3013830
Data columns (total 3 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   rating    int64 
 1   user_id   object
 2   movie_id  object
dtypes: int64(1), object(2)
memory usage: 69.0+ MB


In [24]:
df = df_col[df_col['user_id'].isin(df_col['user_id'].value_counts()[df_col['user_id'].value_counts() >= 4].index)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 951722 entries, 7 to 3013830
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   rating    951722 non-null  int64 
 1   user_id   951722 non-null  object
 2   movie_id  951722 non-null  object
dtypes: int64(1), object(2)
memory usage: 29.0+ MB


In [25]:
reader = Reader()
data = Dataset.load_from_df(df[['user_id', 'movie_id', 'rating']], reader)

In [26]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [7]:
baselinee = BaselineOnly()
baselinee.fit(trainset)
predictions = baselinee.test(testset)
base_pred = accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 1.0554


In [8]:
svd_cv = SVD()
cv_svd = cross_validate(svd_cv, data, measures=['RMSE'], n_jobs=-1, verbose=True)

for i in cv_svd.items():
    print(i)
print('-----------------------')
print(np.mean(cv_svd['test_rmse']))

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0387  1.0452  1.0436  1.0401  1.0409  1.0417  0.0024  
Fit time          6.17    6.22    6.58    6.21    5.89    6.21    0.22    
Test time         2.19    2.13    1.93    1.88    1.76    1.98    0.16    
('test_rmse', array([1.0387223 , 1.04524995, 1.04355342, 1.04011017, 1.04093426]))
('fit_time', (6.168604612350464, 6.215359210968018, 6.58399224281311, 6.210144758224487, 5.889847755432129))
('test_time', (2.1911733150482178, 2.1289987564086914, 1.932830572128296, 1.8806607723236084, 1.7608122825622559))
-----------------------
1.0417140194753844


In [None]:
params = {'n_factors': [10, 20, 30],
          'n_epochs':[10, 20, 40],
         'reg_all': [0.01, 0.02, 0.05]}
g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1, joblib_verbose=10)
g_s_svd.fit(data)

print(g_s_svd.best_score)
print(g_s_svd.best_params)

In [9]:
SVD_base = SVD(random_state=42)
SVD_base.fit(trainset)
predictions = SVD_base.test(testset)
kn_first = accuracy.rmse(predictions)

RMSE: 1.0412


In [None]:
SVD_base = SVD(n_factors=10, n_epochs=40, random_state=42)
SVD_base.fit(trainset)
predictions = SVD_base.test(testset)
kn_first = accuracy.rmse(predictions)

In [None]:
svd_pp_cv = SVDpp()
cv_svdpp = cross_validate(svd_pp_cv, data, measures=['RMSE'], n_jobs=-1, verbose=True)

for i in cv_svdpp.items():
    print(i)
print('-----------------------')
print(np.mean(cv_svdpp['test_rmse']))

In [28]:
params = {'n_factors': [10, 20, 30, 50],
          'n_epochs': [10, 20, 30, 50, 100],
          'reg_all': [0.02, 0.05, 0.1],
          'cache_ratings': [True, False],
          'lr_all': [0.002, 0.005, 0.01],
          'verbose':[True]
         }
g_s_svdpp = GridSearchCV(SVDpp,param_grid=params,n_jobs=-1, measures=['RMSE'], joblib_verbose=10)
g_s_svdpp.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:   39.4s


KeyboardInterrupt: 

In [None]:
print(g_s_svdpp.best_score)
print(g_s_svdpp.best_params)

In [10]:
# SVDpp_final = SVDpp(n_factors=30, n_epochs=40, reg_all=0.1, cache_ratings=False)
# SVDpp_final.fit(trainset)
# predictions = SVDpp_final.test(testset)
# kn_first = accuracy.rmse(predictions)


#output for user_id with >3 reviews = 1.0246

RMSE: 1.0246


In [19]:
#USE THIS FUNCTION TO CONFIRM THAT THE REC'S ARE MOVIES THAT HAVE NOT BEEN SEEN BY THE USER



# def recommend_movies(user_id, trained_model, movie_df, N=10):
#     # Get a list of all movies that the user hasn't seen yet
#     user_movies = movie_df[movie_df['user_id'] == user_id]['movie_id'].tolist()
#     all_movies = movie_df['movie_id'].tolist()
#     unseen_movies = set(all_movies) - set(user_movies)

#     # Create a dataframe of predictions for all unseen movies
#     predictions = []
#     for movie_id in unseen_movies:
#         predicted_rating = trained_model.predict(user_id, movie_id).est
#         predictions.append({'movie_id': movie_id, 'predicted_rating': predicted_rating})
#     predictions_df = pd.DataFrame(predictions)

#     # Sort predictions by rating and return top N
#     top_N = predictions_df.sort_values('predicted_rating', ascending=False).head(N)
#     top_N_movie_ids = top_N['movie_id'].tolist()

#     # Get the details of the top N movies, including title and rating
#     top_N_movies = movie_df[movie_df['movie_id'].isin(top_N_movie_ids)]
#     top_N_movies.drop_duplicates(subset=['title'], inplace=True)

#     # Merge top N movie ratings with the details dataframe
#     top_N_ratings = pd.merge(top_N, top_N_movies, on='movie_id')

#     # Return the top N movie details (excluding title)
#     return top_N_ratings.drop('title', axis=1)

In [21]:
def recommend_movies(user_id, trained_model, movie_df, N=10):
    # Get a list of all movies that the user hasn't seen yet
    user_movies = movie_df[movie_df['user_id'] == user_id]['movie_id'].tolist()
    all_movies = movie_df['movie_id'].tolist()
    unseen_movies = set(all_movies) - set(user_movies)

    # Create a dataframe of predictions for all unseen movies
    predictions = []
    for movie_id in unseen_movies:
        predicted_rating = trained_model.predict(user_id, movie_id).est
        predictions.append({'movie_id': movie_id, 'predicted_rating': predicted_rating})
    predictions_df = pd.DataFrame(predictions)

    # Sort predictions by rating and return top N
    top_N = predictions_df.sort_values('predicted_rating', ascending=False).head(N)
    top_N_movie_ids = top_N['movie_id'].tolist()

    # Get the details of the top N movies, including title and rating
    top_N_movies = movie_df[movie_df['movie_id'].isin(top_N_movie_ids)]
    top_N_movies.drop_duplicates(subset=['title'], inplace=True)

    # Merge top N movie ratings with the details dataframe
    top_N_ratings = pd.merge(top_N, top_N_movies, on='movie_id')

    # Return the top N movie details (excluding title)
    return top_N_ratings.drop(columns=['user_id', 'rating'], axis=1)

In [12]:
movies_df = pd.read_csv('./data/meta_all_clean.csv')
movies_df.rename(columns={'asin': 'movie_id'}, inplace=True)
movies_df.drop(columns =['english', 'rank'], inplace=True)
movies_df.head()

Unnamed: 0,genre,description,title,starring,movie_id
0,Christian Video,An early movie edition of the life of Jesus.,Where Jesus Walked VHS,Various,5000009
1,Movies,"In Depression-era New England, a miserly busin...",An American Christmas Carol VHS,Various,5019281
2,Documentary,This documentary takes you on a journey...from...,A NATION ADRIFT A Chronicle of America's Prov...,Tom Kane,5092663
3,Science Fiction & Fantasy Science Fiction Anim...,This is The VHS Movie: SANTA CLAUS IS COMIN TO...,Santa Claus Is Comin' To Town VHS,Fred Astaire,307142493
4,Sony Pictures Home Entertainment,"Arthur, the hapless, hugely popular star of hi...",Arthur's Perfect Christmas VHS,Various,375810331


In [13]:
df_movies = pd.merge(df, movies_df, on='movie_id', how='right')
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1125298 entries, 0 to 1125297
Data columns (total 7 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   rating       1122318 non-null  float64
 1   user_id      1122318 non-null  object 
 2   movie_id     1125298 non-null  object 
 3   genre        1120623 non-null  object 
 4   description  1125298 non-null  object 
 5   title        1125295 non-null  object 
 6   starring     1125298 non-null  object 
dtypes: float64(1), object(6)
memory usage: 60.1+ MB


In [22]:
user_id = 'A526JEFWQZ03V'
recommendations = recommend_movies(user_id, SVDpp_final, df_movies)
recommendations

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_N_movies.drop_duplicates(subset=['title'], inplace=True)


Unnamed: 0,movie_id,predicted_rating,genre,description,title,starring
0,6303637493,4.471995,Art House & International France,This interesting Belgian film from 1994 has a ...,Farinelli: Il Castrato VHS,Stefano Dionisi
1,B0000040EK,4.452566,Warner Home Video,Absolutely one of the most hilarious movies ev...,A Night at the Opera VHS,Groucho Marx
2,B0053O89QU,4.421754,Warner Home Video,"<![CDATA[ Middle, The: The Complete Third Seas...","Middle, The: S3 (DVD)",Patricia Heaton
3,B000BNTM46,4.420633,Comedy,<![CDATA[ Everybody Loves Raymond: The Complet...,Everybody Loves Raymond: Season 5,Ray Romano
4,B000BVN0ZA,4.410929,Kids & Family,8 animated classics! <i>When sold by Amazon.co...,The Snowman plus 7 Holiday Classics,Various
5,6303968651,4.407045,Art House & International French,"One of the most famous short films ever made, ...",The Red Balloon VHS,Pascal Lamorisse
6,B000I3ONLG,4.406605,Drama,*** SEASON SIX *** VOLUME SIX *** CONTAINS THR...,Dr. Quinn Medicine Woman: Season Six - Volume ...,Various
7,B01CO70KX8,4.402719,Mystery & Thrillers,<B>IN FRENCH WITH ENGLISH SUBTITLES</b> <br>Th...,A French Village: Season 3,Thierry Godard
8,B00K7NGSDI,4.397017,Action & Adventure,"Its 1990, a new era for Martin Moone (David Ra...","Moone Boy series two [UK import, Region 2 PAL ...",Chris O'Dowd
9,B001RTSPVY,4.389187,Sony Pictures Home Entertainment,Walt (two-time Emmy award winner* Bryan Cranst...,Breaking Bad: Season 2,Bryan Cranston


In [23]:
df_movies.loc[df_movies['user_id'] == 'A526JEFWQZ03V']

Unnamed: 0,rating,user_id,movie_id,genre,description,title,starring
0,4.0,A526JEFWQZ03V,0005000009,Christian Video,An early movie edition of the life of Jesus.,Where Jesus Walked VHS,Various
118639,5.0,A526JEFWQZ03V,6305837325,Music Videos & Concerts,Natural sounds...natural beauty...naturally re...,Moodtapes: Pacific Surf,Various
640151,1.0,A526JEFWQZ03V,B00BGI07NK,HBO,HBO delivers a new season of the Emmy-winning ...,Veep: S2 (DVD),Various
811454,3.0,A526JEFWQZ03V,B00LO1SYIO,Kids & Family,Strawberry S B Bes Fr,Strawberry Shortcake: Berry Best Friends,Various
