In [59]:
import pandas as pd
import numpy as np

In [61]:
dfLinks = pd.read_pickle('LinksProcessed.pkl')
dfTags = pd.read_pickle('TagsProcessed.pkl')
dfRatings = pd.read_pickle('RatingsProcessed.pkl')
dfMovies = pd.read_pickle('MoviesProcessed.pkl')

In [3]:
temp = dfRatings[dfRatings['userId'] == int(1)]
watched_movies = dfMovies[dfMovies['movieId'].isin(temp['movieId'])]
watched_movies = temp.merge(watched_movies, on='movieId').sort_values('rating', ascending=False)
watched_movies.columns

Index(['userId', 'movieId', 'rating', 'timestamp', 'dateTime', 'year', 'month',
       'dayName', 'day', 'title', 'original_language', 'popularity', 'runtime',
       'release_date', 'poster_path', 'weightedVoteAverage', 'Adventure',
       'Animation', 'Children', 'Comedy', 'Fantasy', 'Romance', 'Drama',
       'Action', 'Crime', 'Thriller', 'Horror', 'Mystery', 'Sci-Fi', 'IMAX',
       'Documentary', 'War', 'Musical', 'Western', 'Film-Noir', 'Unknown'],
      dtype='object')

In [6]:
temp = dfMovies.drop(['original_language', 'popularity', 'runtime', 'release_date', 'poster_path', 'weightedVoteAverage', 'genres'], axis = 'columns')

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from tqdm.notebook import tqdm
from sklearnex import patch_sklearn
from sklearn.neighbors import NearestNeighbors
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from imblearn.pipeline import Pipeline
from surprise import accuracy
import joblib
patch_sklearn()
import gc

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [417]:
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=11)

In [425]:
pl = Pipeline((
    ("L2 normalization", Normalizer(norm='l2')),
    ("KNN", NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=11))
))

In [427]:
pl

In [429]:
features = temp.drop(['title', 'movieId'], axis='columns')
pl.fit(features)

In [573]:
title_to_index = pd.Series(temp.index, index=temp['title']).drop_duplicates()
index_to_title = pd.Series(temp['title'], index=temp.index)

0                          Toy Story (1995)
1                            Jumanji (1995)
2                   Grumpier Old Men (1995)
3                  Waiting to Exhale (1995)
4        Father of the Bride Part II (1995)
                        ...                
86466             The Monroy Affaire (2022)
86467            Shelter in Solitude (2023)
86468                           Orca (2023)
86469                The Angry Breed (1968)
86470             Race to the Summit (2023)
Name: title, Length: 86471, dtype: object

In [268]:
temp.drop(['movieId'], axis = 'columns').iloc[0].to_frame().T

Unnamed: 0,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,Unknown
0,Toy Story (1995),1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [280]:
features[0].reshape(1, -1) 

array([[0.4472136, 0.4472136, 0.4472136, 0.4472136, 0.4472136, 0.       ,
        0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.       ]])

In [445]:
pl.named_steps['KNN']

In [None]:
pd.Series().to_numpy

In [555]:
def get_content_recommendation(movieName, numberOfMovies = 10):
    index = title_to_index[movieName]
    vector = features.iloc[[index]]
    vector = pl.named_steps['L2 normalization'].transform(vector)
    distances, indices = pl.named_steps.KNN.kneighbors(vector.reshape(1, -1), numberOfMovies + 1)
    indices = list(filter(lambda i :True if i != index else False, indices[0]))
    return index_to_title[indices].tolist()

In [561]:
joblib.dump(pl, 'KNN.joblib')

['KNN.joblib']

In [457]:
distances, indices = pl.named_steps["KNN"].kneighbors(np.array(features.iloc[0]).reshape(1, -1) , n_neighbors=11)

In [459]:
indices.flatten()[1:]

array([ 2194,  3903,  3011,     0,  4769,  9921, 11569,  3644, 10741,
       17346], dtype=int64)

In [461]:
index_to_title[indices.flatten()[1:]].to_list()

['Antz (1998)',
 "Emperor's New Groove, The (2000)",
 'Toy Story 2 (1999)',
 'Toy Story (1995)',
 'Monsters, Inc. (2001)',
 'DuckTales: The Movie - Treasure of the Lost Lamp (1990)',
 'Shrek the Third (2007)',
 'Adventures of Rocky and Bullwinkle, The (2000)',
 'Wild, The (2006)',
 'Asterix and the Vikings (Astérix et les Vikings) (2006)']

In [12]:
gc.collect()

0

In [65]:
reader = Reader(rating_scale=(0.5, 5.0))

# Load the data from the pandas dataframe
data = Dataset.load_from_df(dfRatings[['userId', 'movieId', 'weightedRating']], reader)
del dfRatings

In [8]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [13]:
algo = SVD(verbose = True)

algo.fit(trainset)

predictions = algo.test(testset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x136384ecd40>

In [565]:
cv_results = cross_validate(algo, data, measures=['RMSE', 'MAE', 'FCP'], cv=5, verbose=True)
# Extract results
mean_rmse = np.mean(cv_results['test_rmse'])
mean_mae = np.mean(cv_results['test_mae'])
mean_fcp = np.mean(cv_results['test_fcp'])

print(f"Mean RMSE: {mean_rmse:.4f}")
print(f"Mean MAE: {mean_mae:.4f}")
print(f"Mean FCP: {mean_fcp:.4f}")

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing

In [567]:
cv_results

{'test_rmse': array([0.77179572, 0.7715578 , 0.7719257 , 0.77179901, 0.77167764]),
 'test_mae': array([0.5801739 , 0.579832  , 0.58012903, 0.57998294, 0.57993305]),
 'test_fcp': array([0.74305129, 0.74235785, 0.74311619, 0.74239881, 0.74300165]),
 'fit_time': (333.0295264720917,
  467.79617953300476,
  456.57896876335144,
  461.28045892715454,
  415.5925350189209),
 'test_time': (1143.837121963501,
  1068.1797683238983,
  1858.6542365550995,
  1134.586153268814,
  1780.470967054367)}

In [413]:
joblib.dump(algo, 'SVD.joblib', compress=0)

['SVD.joblib']

In [14]:
joblib.dump(algo, 'SVDWeighted.joblib', compress=0)

['SVDWeighted.joblib']

In [405]:
def get_cf_recommendations(user_id, top_n=25):
    user_ratings = dfRatings[dfRatings['userId'] == user_id]
    user_movies = user_ratings['movieId'].tolist()
    all_movies = dfMovies['movieId'].tolist()
    movies_to_predict = list(set(all_movies) - set(user_movies))
    
    predictions = [algo.predict(user_id, movie_id) for movie_id in movies_to_predict]
    predictions = sorted(predictions, key=lambda x: x.est, reverse=True)
    
    top_predictions = predictions[:top_n]
    top_movie_ids = [pred.iid for pred in top_predictions]
    return dfMovies[dfMovies['movieId'].isin(top_movie_ids)]['title']
get_cf_recommendations(200948)

49                              Usual Suspects, The (1995)
726                 Wallace & Gromit: A Close Shave (1995)
1162     Star Wars: Episode V - The Empire Strikes Back...
1164     Raiders of the Lost Ark (Indiana Jones and the...
1205                                          Glory (1989)
1827                   There's Something About Mary (1998)
1932                            Saving Private Ryan (1998)
2224            Life Is Beautiful (La Vita è bella) (1997)
3044                                Green Mile, The (1999)
9974                                  Batman Begins (2005)
10925                     Pursuit of Happyness, The (2006)
11082    Borat: Cultural Learnings of America for Make ...
12180                              Dark Knight, The (2008)
14869                                     Inception (2010)
16239                         Louis C.K.: Hilarious (2010)
16244                         Louis C.K.: Shameless (2007)
17379                        Dark Knight Rises, The (201