In [18]:
import pandas as pd
import heapq
import random
import torch
import tensorflow as tf
import joblib
import numpy as np
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer


In [19]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

Using device: cuda



In [27]:
movies = pd.read_csv('./data/preprocessed_data.csv')

In [28]:
movies.head()

Unnamed: 0,genres,id,imdb_id,poster_path,release_date,runtime,title,description,keywords,preprocessed_description,weighted_rating,features
0,Adventure Fantasy Family,8844,tt0113497,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,1995-12-15,104.0,Jumanji,Roll the dice and unleash the excitement!When ...,"board game, disappearance, based on children's...",roll dice unleash excitement sibling judy pete...,6.86723,adventure fantasy family roll dice unleash exc...
1,Romance Comedy,15602,tt0113228,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,1995-12-22,101.0,Grumpier Old Men,Still Yelling. Still Fighting. Still Ready for...,"fishing, best friend, duringcreditsstinger, ol...",still yelling still fighting still ready famil...,6.170573,romance comedy still yelling still fighting st...
2,Comedy Drama Romance,31357,tt0114885,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,1995-12-22,127.0,Waiting to Exhale,Friends are the people who let you be yourself...,"based on novel, interracial relationship, sing...",friend people let never let forget mistreated ...,5.856086,comedy drama romance friend people let never l...
3,Comedy,11862,tt0113041,/e64sOI48hQXyru7naBFyssKFxVd.jpg,1995-02-10,106.0,Father of the Bride Part II,Just When His World Is Back To Normal... He's ...,"baby, midlife crisis, confidence, aging, daugh...",world back normal surprise life george bank re...,5.710835,comedy world back normal surprise life george ...
4,Action Crime Drama Thriller,949,tt0113277,/zMyfPUelumio3tiDKPffaUpsQTD.jpg,1995-12-15,170.0,Heat,A Los Angeles Crime SagaObsessive master thief...,"robbery, detective, bank, obsession, chase, sh...",los angeles crime sagaobsessive master thief n...,7.629771,action crime drama thriller los angeles crime ...


In [29]:
movies = movies.drop_duplicates(subset=['id'])
len(movies)

20392

In [255]:
ratings = pd.read_csv('./data/ratings.csv')

# Content based recommender

In [8]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8, min_df=0.05)
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['features']).astype(np.float32)
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [9]:
cosine_similarities.shape

(20392, 20392)

In [251]:
class ContentBasedRecommender:
    def __init__(self, movie_ids, cosine_similarities):
        self.movie_ids = list(set(movie_ids))
        self.cosine_similarities = cosine_similarities

    def get_content_based_recommendations(self, liked_movie_ids, disliked_movie_ids):
        if not disliked_movie_ids:
            seen_movies = liked_movie_ids
        else:
            seen_movies = liked_movie_ids + disliked_movie_ids

        liked_movies_indices = [self.movie_ids.index(movie_id) for movie_id in liked_movie_ids]

        all_recommendations = []

        N = 10

        for index in liked_movies_indices:
            similarity_scores = self.cosine_similarities[index]
            valid_top_indices = [i for i in heapq.nlargest(N, range(len(similarity_scores)), key=similarity_scores.__getitem__) if i != index and similarity_scores[i] != 1.0]
            all_recommendations.extend(valid_top_indices)

        random_10_recommendations = random.sample(all_recommendations, N)
        
        recommended_movie_ids = [self.movie_ids[index] for index in random_10_recommendations]

        return recommended_movie_ids


In [250]:
liked_movie_ids = [13, 12, 24, 65, 127380] 
disliked_movie_ids = [74, 87, 137, 15854, 28368, 11778] 
content_based_recommender = ContentBasedRecommender(movie_ids=movies['id'], cosine_similarities=cosine_similarities)
recommendations = content_based_recommender.get_content_based_recommendations(liked_movie_ids, disliked_movie_ids)
print(recommendations)

[30307, 211166, 41417, 18764, 43795, 38662, 324245, 44629, 212747, 53010]


In [252]:
recommended_movie_titles = movies[movies['id'].isin(recommendations)]['title'].tolist()
print(recommended_movie_titles)

['Penny Serenade', 'Firestorm', 'Twin Dragons', 'The Devil Commands', 'The Hurricane Express', 'Animal Kingdom', '7 Chinese Brothers', 'Rio, I Love You', 'La Chance de ma vie', 'All the Wrong Reasons']


In [207]:
def evaluate_recommender(recommender, user_ratings_df, liked_movies_per_user=3, disliked_movies_per_user=3, num_recommendations=5):
    precision_scores = []
    recall_scores = []
    f1_scores = []
    user_metrics = {} 

    for user_id in user_ratings_df['userId'].unique():
        
        user_ratings = user_ratings_df[user_ratings_df['userId'] == user_id]
        liked_movies = user_ratings[user_ratings['binary_rating'] == 1]['movieId'].tolist()
        disliked_movies = user_ratings[user_ratings['binary_rating'] == 0]['movieId'].tolist()

        sampled_liked_movies = np.random.choice(liked_movies, size=min(liked_movies_per_user, len(liked_movies)), replace=False).tolist()
        sampled_disliked_movies = np.random.choice(disliked_movies, size=min(disliked_movies_per_user, len(disliked_movies)), replace=False).tolist()
        
        recommendations = recommender.get_content_based_recommendations(sampled_liked_movies, sampled_disliked_movies)
        
        true_positives = len(set(recommendations) & set(liked_movies))
        false_positives = len(set(recommendations) - set(liked_movies))
        false_negatives = len(set(sampled_liked_movies) - set(recommendations))
        
        precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
        
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
        
        user_metrics[user_id] = {'precision': precision, 'recall': recall, 'f1': f1}

    mean_precision = np.mean(precision_scores)
    mean_recall = np.mean(recall_scores)
    mean_f1 = np.mean(f1_scores)
    
    best_precision_user = max(user_metrics, key=lambda x: user_metrics[x]['precision'])
    worst_precision_user = min(user_metrics, key=lambda x: user_metrics[x]['precision'])

    best_recall_user = max(user_metrics, key=lambda x: user_metrics[x]['recall'])
    worst_recall_user = min(user_metrics, key=lambda x: user_metrics[x]['recall'])

    worst_f1_user = min(user_metrics, key=lambda x: user_metrics[x]['f1'])
    best_f1_user = max(user_metrics, key=lambda x: user_metrics[x]['f1'])
    
    return mean_precision, mean_recall, mean_f1, user_metrics, best_precision_user, worst_precision_user, best_recall_user, worst_recall_user, worst_f1_user, best_f1_user


In [200]:
eval_ratings_count = ratings.groupby(['userId', 'binary_rating']).size().unstack(fill_value=0)
eval_ratings_users = eval_ratings_count[(eval_ratings_count[1] >= 10) & (eval_ratings_count[0] >= 5)].index
eval_ratings = ratings[ratings['userId'].isin(eval_ratings_users)]


common_items = set(movies['id']).intersection(set(ratings['movieId']))
eval_movies = movies[movies['id'].isin(common_items)]
eval_ratings = eval_ratings[eval_ratings['movieId'].isin(common_items)]

eval_movie_ids = eval_movies['id']
eval_indices = np.where(np.isin(eval_movie_ids, eval_movie_ids))[0]
eval_cosine_similarities = cosine_similarities[eval_indices][:, eval_indices]

In [232]:
content_based_recommender = ContentBasedRecommender(movie_ids=eval_movies['id'], cosine_similarities=eval_cosine_similarities)
mean_precision, mean_recall, mean_f1, user_metrics, best_precision_user, worst_precision_user, best_recall_user, worst_recall_user, worst_f1_user, best_f1_user = evaluate_recommender(content_based_recommender, eval_ratings, liked_movies_per_user=10, disliked_movies_per_user=5, num_recommendations=5)
print("Mean Precision:", mean_precision)
print("Mean Recall:", mean_recall)
print("Mean F1:", mean_f1)
print("Best precision user:", best_precision_user)
print("worst precision user:", worst_precision_user)
print("best recall user:", best_recall_user)
print("worst recall user:", worst_recall_user)
print("best f1 user:", best_f1_user)
print("worst f1 user:", worst_f1_user)


Mean Precision: 0.031879543094496365
Mean Recall: 0.028117346049589043
Mean F1: 0.02973527240777459
Best precision user: 73
worst precision user: 2
best recall user: 73
worst recall user: 2
best f1 user: 73
worst f1 user: 2


In [254]:
sorted_user_metrics = sorted(user_metrics.items(), key=lambda x: x[1]['precision'], reverse=True)
best_user_id, best_user_metrics = sorted_user_metrics[0]
print("Metrics for the best user (User ID:", best_user_id, "):", best_user_metrics)

Metrics for the best user (User ID: 73 ): {'precision': 0.6, 'recall': 0.375, 'f1': 0.4615384615384615}


In [23]:
joblib.dump(content_based_recommender, './dependencies/content_based_recommender.joblib')

['./dependencies/content_based_recommender.joblib']

# Collaborative

In [234]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD, accuracy
from surprise.model_selection import GridSearchCV
import joblib

class CollaborativeFilteringRecommender:
    def __init__(self, ratings_df):
        self.ratings_df = ratings_df
        self.reader = Reader(rating_scale=(0, 1))
        self.data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'binary_rating']], self.reader)
        self.trainset, self.testset = train_test_split(self.data, test_size=0.2, random_state=42)
        self.model = None
        self.movie_mapping = None

    def create_movie_mapping(self):
        unique_movie_ids = self.ratings_df['movieId'].unique()

        self.movie_mapping = pd.DataFrame({'movieId': unique_movie_ids, 'movie_column': range(len(unique_movie_ids))})

        joblib.dump(self.movie_mapping, 'movie_mapping.joblib')

    def tune_hyperparameters(self):
        param_grid = {'n_epochs': [5, 15], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6], 'n_factors': [50, 100]}

        svd = SVD()

        grid_search = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae', 'fcp'], cv=5)

        grid_search.fit(self.data)

        best_params = grid_search.best_params
        print(f'Best Parameters: {best_params}')

    def train_model(self, hyperparameters=None):
        if hyperparameters:
            self.model = SVD(**hyperparameters)
        else:
            self.model = SVD()

        self.model.fit(self.trainset)

    def evaluate_model(self):
        predictions = self.model.test(self.testset)

        rmse = accuracy.rmse(predictions)
        mae = accuracy.mae(predictions)
        mse = accuracy.mse(predictions)
        fcp = accuracy.fcp(predictions)
        
    def recommend_for_user(self, user_id, n=10):
        if self.movie_mapping is None:
            self.create_movie_mapping()

        all_movie_ids = self.ratings_df['movieId'].unique()

        rated_movie_ids = self.ratings_df[self.ratings_df['userId'] == user_id]['movieId'].tolist()

        unrated_movie_ids = list(set(all_movie_ids) - set(rated_movie_ids))

        predictions = [self.model.predict(user_id, movie_id) for movie_id in unrated_movie_ids]

        sorted_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)

        top_n_recommendations = [prediction.iid for prediction in sorted_predictions[:n]]
        
        return top_n_recommendations

collabRecommender = CollaborativeFilteringRecommender(ratings)


In [235]:
collabRecommender.tune_hyperparameters()


Best Parameters: {'rmse': {'n_epochs': 15, 'lr_all': 0.005, 'reg_all': 0.4, 'n_factors': 50}, 'mae': {'n_epochs': 15, 'lr_all': 0.005, 'reg_all': 0.4, 'n_factors': 50}, 'fcp': {'n_epochs': 15, 'lr_all': 0.002, 'reg_all': 0.6, 'n_factors': 100}}


In [249]:
collabRecommender.train_model(hyperparameters={'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4, 'n_factors': 50})
collabRecommender.evaluate_model()
recommendations = collabRecommender.recommend_for_user(user_id=518, n=10)
print(recommendations)

RMSE: 0.3558
MAE:  0.2589
MSE: 0.1266
FCP:  0.6781
[50, 1213, 44191, 1252, 2692, 68157, 1203, 1953, 908, 48516]


In [73]:
joblib.dump(collabRecommender, './dependencies/collaborative_filtering_model.joblib')

['./dependencies/collaborative_filtering_model.joblib']