<a href="https://colab.research.google.com/github/IKosovych/ucu-recommender-system-2023/blob/master/ContentBasedFiltering_and_BaseLine_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from sklearn.preprocessing import LabelBinarizer
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


After inspection of four datases I didn't found severe outliers that can cause problems in our tasks.

###Content-Based Filtering model implementation

In [17]:
class ContentBasedFiltering:
    """
    A content-based filtering algorithm based on movie genres
    """

    def __init__(self, train_size=0.8):
        self.df_movies = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/ml-latest-small/movies.csv')
        self.df_ratings = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/ml-latest-small/ratings.csv')
        self.df_movies['genres'] = self.df_movies['genres'].fillna('')
        self.indices = pd.Series(self.df_movies.index, index=self.df_movies['movieId']).drop_duplicates()
        self.train_size = train_size
        self.train_set, self.test_set = self.train_test_split()

    def train_test_split(self):
        train, test = train_test_split(self.df_ratings, test_size=1-self.train_size, random_state=42, stratify=self.df_ratings['userId'])
        return train, test

    def fit(self):
        # Use TF-IDF to convert the genres into vectors
        tfidf = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf.fit_transform(self.df_movies['genres'])

        # Compute the cosine similarity matrix
        self.cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

    def predict_on_testset(self, user_id):
        # Get the movies that the user has watched
        watched_movies = self.train_set[self.train_set['userId'] == user_id]['movieId'].tolist()

        # Get the pairwise similarity scores of all movies with each movie the user has watched
        sim_scores = [list(enumerate(self.cosine_sim[self.indices[movie]])) for movie in watched_movies]

        # Flatten the list and sort it based on the similarity scores
        sim_scores = sorted([score for sublist in sim_scores for score in sublist], key=lambda x: x[1], reverse=True)

        # Get the scores of the 10 most similar movies
        sim_scores = sim_scores[1:11]

        # Get the movie indices
        movie_indices = [i[0] for i in sim_scores]

        # Return the top 10 most similar movies
        return self.df_movies['title'].iloc[movie_indices]

    def evaluate(self):
        users = self.test_set['userId'].unique()
        average_ndcg = []

        for user in users:
            actual = self.test_set[self.test_set['userId'] == user]['movieId'].tolist()
            predicted = self.predict_on_testset(user)

            # Convert movie titles back to ids for evaluation
            predicted_ids = self.df_movies[self.df_movies['title'].isin(predicted)]['movieId'].tolist()

            lb = LabelBinarizer()
            lb.fit(actual)

            if len(predicted_ids) <= len(actual):
                actual = actual[:len(predicted_ids)]
            else:
                predicted_ids = predicted_ids[:len(actual)]

            binary_actual = lb.transform(actual)
            binary_predicted = lb.transform(predicted_ids)

            average_ndcg.append(ndcg_score(binary_actual, binary_predicted))

        return np.mean(average_ndcg)


In [None]:
cbf = ContentBasedFiltering()
cbf.fit()

recommendations = cbf.predict_on_testset(2)

print("Recommendations for User {}:".format(2))
for idx, title in enumerate(recommendations):
    print("{}: {}".format(idx+1, title))





Recommendations for User 2:
1: Apocalypse Now (1979)
2: Boot, Das (Boat, The) (1981)
3: All Quiet on the Western Front (1930)
4: Saving Private Ryan (1998)
5: Thin Red Line, The (1998)
6: Dirty Dozen, The (1967)
7: Longest Day, The (1962)
8: Tora! Tora! Tora! (1970)
9: Red Dawn (1984)
10: Force 10 from Navarone (1978)


###Content-Based Filtering model Evaluation

In [20]:
average_ndcg = cbf.evaluate()
print(f'Average NDCG: {average_ndcg}')

Average NDCG: 0.40424739400007437


###Baseline model implementation

In [4]:
from sklearn.model_selection import train_test_split

class BaselineModel:
    """
    A baseline recommendation model that recommends movies
    based on their popularity and average ratings
    """
    def __init__(self, train_size=0.8):
        self.df_links = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/ml-latest-small/links.csv')
        self.df_tags = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/ml-latest-small/tags.csv')
        self.df_ratings = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/ml-latest-small/ratings.csv')
        self.df_movies = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/ml-latest-small/movies.csv')
        self.train_size = train_size
        self.train_set, self.test_set = self.train_test_split()

    def train_test_split(self):
        train, test = train_test_split(self.df_ratings, test_size=1-self.train_size, random_state=42, stratify=self.df_ratings['userId'])
        return train, test

    def fit(self):
        self.train_set['count'] = 1
        df_ratings_aggr = self.train_set.groupby('movieId')[['rating', 'count']].sum()
        df_ratings_aggr['rating_avg'] = round(df_ratings_aggr['rating'] / df_ratings_aggr['count'],1)
        df_ratings_aggr['rank'] = np.log(df_ratings_aggr['rating']) * df_ratings_aggr['rating_avg']

        self.df_ranked_films = pd.merge(self.df_movies, df_ratings_aggr[['rank', 'rating_avg']], on='movieId', how='outer') \
            .sort_values(by=['rank'], ascending=False) \
            .fillna(0)

    def predict_on_testset(self, user_id):
        filtered_films = self.df_ranked_films.merge(self.train_set[self.train_set['userId'] == user_id]['movieId'],
                                                     on='movieId',
                                                     how='left',
                                                     indicator=True)

        filtered_films = filtered_films[filtered_films['_merge'] == 'left_only']
        return filtered_films[['movieId', 'title', 'rank']].head(10)


In [5]:
bm = BaselineModel()
bm.fit()
print(bm.predict_on_testset(1)) # Assuming 1 is a valid user_id

    movieId                                              title       rank
0       318                   Shawshank Redemption, The (1994)  31.437469
1       296                                Pulp Fiction (1994)  29.331918
7       858                              Godfather, The (1972)  28.029161
13    58559                            Dark Knight, The (2008)  26.887327
14      110                                  Braveheart (1995)  26.608871
15     5952      Lord of the Rings: The Two Towers, The (2002)  26.599504
17     7153  Lord of the Rings: The Return of the King, The...  26.552432
19     1193             One Flew Over the Cuckoo's Nest (1975)  26.515811
20     4993  Lord of the Rings: The Fellowship of the Ring,...  26.511193
22      589                  Terminator 2: Judgment Day (1991)  26.210032


###BaseLine model evaluation

In [7]:
from typing import List
from sklearn.metrics import f1_score

model = BaselineModel()
model.fit()

from sklearn.metrics import f1_score

def precision_at_k(predicted: List[int], actual: List[int], k: int) -> float:
    assert k <= len(predicted)
    return len(set(predicted[:k]) & set(actual)) / k

def recall_at_k(predicted: List[int], actual: List[int], k: int) -> float:
    assert k <= len(predicted)
    return len(set(predicted[:k]) & set(actual)) / len(set(actual))

def f1_at_k(predicted: List[int], actual: List[int], k: int) -> float:
    precision = precision_at_k(predicted, actual, k)
    recall = recall_at_k(predicted, actual, k)
    return 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
# For each user in the test set, generate the top-N predictions and evaluate them
for user_id in model.test_set['userId'].unique():
    actual_movies = model.test_set[model.test_set['userId'] == user_id]['movieId'].tolist()
    predicted_movies = model.predict_on_testset(user_id)['movieId'].tolist()

    precision = precision_at_k(predicted_movies, actual_movies, k=10)
    recall = recall_at_k(predicted_movies, actual_movies, k=10)
    f1 = f1_at_k(predicted_movies, actual_movies, k=10)

    print(f'User ID: {user_id}, Precision@10: {precision:.2f}, Recall@10: {recall:.2f}, F1@10: {f1:.2f}')


User ID: 318, Precision@10: 0.40, Recall@10: 0.02, F1@10: 0.04
User ID: 288, Precision@10: 0.40, Recall@10: 0.02, F1@10: 0.04
User ID: 314, Precision@10: 0.20, Recall@10: 0.08, F1@10: 0.11
User ID: 184, Precision@10: 0.00, Recall@10: 0.00, F1@10: 0.00
User ID: 414, Precision@10: 1.00, Recall@10: 0.02, F1@10: 0.04
User ID: 182, Precision@10: 0.60, Recall@10: 0.03, F1@10: 0.06
User ID: 126, Precision@10: 0.20, Recall@10: 0.25, F1@10: 0.22
User ID: 462, Precision@10: 0.20, Recall@10: 0.02, F1@10: 0.04
User ID: 483, Precision@10: 0.60, Recall@10: 0.04, F1@10: 0.08
User ID: 234, Precision@10: 0.00, Recall@10: 0.00, F1@10: 0.00
User ID: 63, Precision@10: 0.70, Recall@10: 0.13, F1@10: 0.22
User ID: 387, Precision@10: 0.50, Recall@10: 0.02, F1@10: 0.05
User ID: 176, Precision@10: 0.00, Recall@10: 0.00, F1@10: 0.00
User ID: 28, Precision@10: 0.90, Recall@10: 0.08, F1@10: 0.15
User ID: 607, Precision@10: 0.20, Recall@10: 0.05, F1@10: 0.09
User ID: 367, Precision@10: 0.20, Recall@10: 0.05, F1@10: