# 🎉 User Rating Predictions using Matrix Factorization for the MovieLens Dataset 🎉

TeamMHL: Emmanouil Chatzakis, Hind El-Bouchrifi, Lluka Stojollari (emmanouil.chatzakis@epfl.ch, hind.el-bouchrifi@epfl.ch, lluka.stojollari@epfl.ch)

This notebook implements an optimized Matrix Factorization recommendation system for the MovieLens dataset.

November 2023, Distributed Information Systems, Project 2, EPFL

## Imports and Libraries 🚨

Here, we import all the needed libraries. Make sure to install all the frameworks and libraries used below.

In [1]:
import torch
import time
import os
import random

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import torch.optim as optim

from collections import defaultdict

from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau

from tqdm import tqdm

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

seed_value = 42
np.random.seed(seed_value)
torch.manual_seed(seed_value)
random.seed(seed_value)

## Object and Dataset definitions 💡

Here we define the Matrix Factorization modules and datasets, as well as helper functions to be used later.

In [2]:
# Define the movielens dataser representation
class MovieRatingDataset(Dataset):
    """Dataset representing a movie for Matrix Factorization"""
    def __init__(self, dataframe, movie_to_genre_vector):
        self.dataframe = dataframe
        self.movie_to_genre_vector = movie_to_genre_vector
        self.x_user_movie = list(zip(self.dataframe.userId.values, self.dataframe.movieId.values))
        self.y_rating = self.dataframe.rating.values

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        movie_id = self.x_user_movie[idx][1]
        genre_vector = self.movie_to_genre_vector[movie_id.item()]

        return (
            self.x_user_movie[idx],
            self.y_rating[idx],
            genre_vector,
        ) 

In [3]:
# Matrix Factorization modules and functions

def sigmoid_range(x, low, high):
    """Sigmoid function with range (low, high)"""
    return torch.sigmoid(x) * (high - low) + low


class MatrixFactorization(nn.Module):
    """Matrix Factorization module class"""
    def __init__(self, n_users, n_movies, n_factors, n_genre):
        super(MatrixFactorization, self).__init__()

        self.user_factors = nn.Embedding(n_users, n_factors)
        self.movie_factors = nn.Embedding(n_movies, n_factors)
        self.genre_factors = nn.Linear(n_genre, n_factors)  # as Linear

        self.user_bias = nn.Parameter(torch.zeros(n_users).normal_(0, 0.01))
        self.movie_bias = nn.Parameter(torch.zeros(n_movies).normal_(0, 0.01))

        self.offset = nn.Parameter(torch.zeros(1).normal_(0, 0.01))

        self.user_factors.weight.data.uniform_(0.0, 0.05)
        self.movie_factors.weight.data.uniform_(0.0, 0.05)
        self.genre_factors.weight.data.uniform_(0.0, 0.05)

    def forward(self, user, item, genre_vector):
        user_emb = self.user_factors(user)
        item_emb = self.movie_factors(item)

        genre_emb = self.genre_factors(genre_vector)

        item_emb += genre_emb

        element_product = (user_emb * item_emb).sum(1)

        user_b = self.user_bias[user]
        item_b = self.movie_bias[item]

        element_product += user_b + item_b + self.offset

        return sigmoid_range(element_product, 0, 5.5)

## Section 1: Data Loading and Preprocessing 📈

In this section we load MovieLens, preprocess and merge the data.

### Basic Data Loading and Processing

Here we load the raw MovieLens data.

In [4]:
# Get all the data paths
current_directory = os.getcwd()
general_path = "/kaggle/input/dis-project-2-recommender-systems"

movies_path = os.path.join(general_path, "movies.csv")
links_path = os.path.join(general_path, "links.csv")
tags_path = os.path.join(general_path, "tags.csv")

In [5]:
# Training and test ratings sets
training_path = os.path.join(general_path, "train_ratings.csv")
test_path = os.path.join(general_path, "test_set_no_ratings.csv")

In [6]:
# Load the movie and link data into dataframes
movies = pd.read_csv(movies_path)
links = pd.read_csv(links_path)

movies = pd.concat([movies, links], axis=1)  # concatenate the two infromation dataframes for the movies
movies = movies.loc[:, ~movies.columns.duplicated()]  # remove the duplicate columns

movies.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0


In [7]:
# Load and process the tag data
tags = pd.read_csv(tags_path)
tags.rename(columns={"userId": "user_tag_id"}, inplace=True)

grouped_tags = tags.groupby(["user_tag_id", "movieId"]).agg(list).reset_index()
grouped_tags.drop(columns="timestamp", inplace=True)
grouped_tags.head()

Unnamed: 0,user_tag_id,movieId,tag
0,2,60756,"[funny, Highly quotable, will ferrell]"
1,2,89774,"[Boxing story, MMA, Tom Hardy]"
2,2,106782,"[drugs, Leonardo DiCaprio, Martin Scorsese]"
3,7,48516,[way too long]
4,18,431,"[Al Pacino, gangster, mafia]"


In [8]:
# Prepare the training dataset
train = pd.read_csv(training_path)
train.drop(columns="timestamp", inplace=True)
test = pd.read_csv(test_path)

test.drop(columns="Id", inplace=True)
test["rating"] = -1

frames = [train, test]
ratings = pd.concat(frames)

merged_train = ratings.merge(movies, on="movieId", how="left")

# Prepare the testing dataset
test = pd.read_csv(test_path)

# Add the corresponding tags for each user-movieid combination
merged_full = pd.merge(
    merged_train,
    grouped_tags,
    how="left",
    left_on=["userId", "movieId"],
    right_on=["user_tag_id", "movieId"],
)

# Minor modifications to the merged dataframe
merged_full.drop(columns="user_tag_id", inplace=True)
merged_full["rating_count_per_user"] = merged_full.groupby("userId")["rating"].transform("count")
merged_full.drop(columns=["genres"], inplace=True)

merged_full.head()

Unnamed: 0,userId,movieId,rating,title,imdbId,tmdbId,tag,rating_count_per_user
0,509,7347,3.0,Secret Window (2004),363988,1586.0,,467
1,326,71462,4.0,"Cove, The (2009)",1313104,23128.0,,152
2,57,2115,3.0,Indiana Jones and the Temple of Doom (1984),87469,87.0,,476
3,610,1127,4.0,"Abyss, The (1989)",96754,2756.0,,1302
4,462,2409,2.0,Rocky II (1979),79817,1367.0,,455


### Genre Incorporation

Now, we include the available genres per movie to our data.

In [9]:
# Load the full_movielens_genres
full_movies_path = "/kaggle/input/movielens-custom/full_movielens_genres.csv"
full_movies = pd.read_csv(full_movies_path)
full_movies = full_movies.groupby("imdbId")["genres"].first().reset_index()

merged_full = pd.merge(merged_full, full_movies, on="imdbId", how="left")

print(f"Number of unique user IDs: {merged_full['userId'].unique().size}")
print(f"Number of unique movie IDs: {merged_full['movieId'].unique().size}")

merged_full["rating_count_per_user"].describe()

Number of unique user IDs: 610
Number of unique movie IDs: 9724


count    100836.000000
mean        603.892816
std         653.007108
min          20.000000
25%         148.000000
50%         385.000000
75%         836.000000
max        2698.000000
Name: rating_count_per_user, dtype: float64

### Calculate the average rating per user

In [10]:
average_ratings_per_user = merged_full["rating_count_per_user"].mean()
print(f"Average number of ratings per user: {average_ratings_per_user:.2f}")

Average number of ratings per user: 603.89


## Section 2: Implementing Matrix Factorization 🪛

In this section we prepare the data for Matrix Factorization, and then we implement and train the Matrix Factorization model.


### Prepare the data for the model

In [11]:
# Data preparation for Matrix Factorization
test.drop(columns="Id", inplace=True)
test["rating"] = -1

frames = [train, test]
ratings = pd.concat(frames)  # concatenate train and test in order to create the matrix later
ratings = ratings.merge(movies, on="movieId", how="left")  # merge with movies
ratings.drop(columns=["title", "imdbId", "tmdbId"], inplace=True)

ratings.head()

Unnamed: 0,userId,movieId,rating,genres
0,509,7347,3.0,Mystery|Thriller
1,326,71462,4.0,Documentary
2,57,2115,3.0,Action|Adventure|Fantasy
3,610,1127,4.0,Action|Adventure|Sci-Fi|Thriller
4,462,2409,2.0,Action|Drama


In [12]:
no_genres_rows = ratings[ratings["genres"] == "(no genres listed)"]

# Iterate through rows and update 'genres' based on 'merged_full'
for index, row in no_genres_rows.iterrows():
    movie_id = row["movieId"]
    corresponding_genre = merged_full.loc[merged_full["movieId"] == movie_id, "genres"].values
    
    new_genre = (
        corresponding_genre[0]
        if len(corresponding_genre) > 0 and not pd.isna(corresponding_genre[0])
        else "Action"
    )
    
    ratings.at[index, "genres"] = new_genre
    
ratings["genres"] = ratings["genres"].str.lower()

d = defaultdict(LabelEncoder)

cols_cat = ["userId", "movieId"]
for c in cols_cat:
    d[c].fit(ratings[c].unique())
    ratings[c] = d[c].transform(ratings[c])

grouped_rating = ratings.groupby("movieId")["genres"].first().reset_index()
n_genres = set("|".join(grouped_rating["genres"]).split("|"))

### Create the one-hot representation of the genres to use them in Matrix Factorization

In [13]:
# Create a dictionary mapping movie IDs to genre vectors using one-hot encoding
movie_to_genre_vector = {}

# Convert genres column to a list of lists
genres_list = [genres.split("|") for genres in grouped_rating["genres"]]

# Use MultiLabelBinarizer for one-hot encoding
mlb = MultiLabelBinarizer()
genre_vectors = mlb.fit_transform(genres_list)

for idx, row in grouped_rating.iterrows():
    movie_id = row["movieId"]
    genre_vector = torch.FloatTensor(genre_vectors[idx])

    movie_to_genre_vector[movie_id] = genre_vector

train_ratings = ratings[ratings.rating != -1]
test_ratings = ratings[ratings.rating == -1].copy()

train_ratings.reset_index(drop=True, inplace=True)
test_ratings.reset_index(drop=True, inplace=True)

df_train = train_ratings
df_train.head()

Unnamed: 0,userId,movieId,rating,genres
0,508,4893,3.0,mystery|thriller
1,325,7127,4.0,documentary
2,56,1575,3.0,action|adventure|fantasy
3,609,855,4.0,action|adventure|sci-fi|thriller
4,461,1808,2.0,action|drama


### Initialize the MovieDataset and the Dataloaders

In [14]:
# Create the DataLoader
batch_size = 32

train_dataset = MovieRatingDataset(df_train, movie_to_genre_vector)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = MovieRatingDataset(test_ratings, movie_to_genre_vector)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

### Training Loop: Train the Matrix Factorization model

In [15]:
# Training loop parameter setting
n_users = len(ratings["userId"].unique())
n_movies = len(ratings["movieId"].unique())
n_genres = len(set("|".join(grouped_rating["genres"]).split("|")))

n_factors = 15

model = MatrixFactorization(n_users, n_movies, n_factors, n_genres)

criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.06)
scheduler = ReduceLROnPlateau(optimizer, verbose=True)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

MatrixFactorization(
  (user_factors): Embedding(610, 15)
  (movie_factors): Embedding(9724, 15)
  (genre_factors): Linear(in_features=22, out_features=15, bias=True)
)

In [16]:
# Training loop routine
epoch_train_losses, epoch_val_losses, epoch_val_rmse = [], [], []

num_epochs = 7

for epoch in tqdm(range(num_epochs), desc="Training", unit="epoch"):
    train_losses, val_losses = [], []
    predicted_ratings, true_ratings = [], []

    for batch_x, batch_y, batch_genres in train_dataloader:
        # Get the data
        users = batch_x[0].to(device)
        movies = batch_x[1].to(device)

        genres = batch_genres.to(device)
        batch_y = batch_y.to(device, dtype=torch.float)
        
        # Set the gradients to zero
        optimizer.zero_grad()

        # Forward pass
        outputs = model(users, movies, genres)

        # Calculate the loss
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Keep track of the loss
        train_losses.append(loss.item())

    # Calculate statistics
    epoch_train_loss = np.mean(train_losses)
    epoch_train_losses.append(epoch_train_loss)

    # Print the results for this epoch:
    s = f"Epoch: {epoch}, Train Loss: {epoch_train_loss:0.3f}, "
    print(s)

print("Training complete!")

Training:  14%|█▍        | 1/7 [00:04<00:25,  4.27s/epoch]

Epoch: 0, Train Loss: 0.871, 


Training:  29%|██▊       | 2/7 [00:08<00:19,  3.97s/epoch]

Epoch: 1, Train Loss: 0.736, 


Training:  43%|████▎     | 3/7 [00:11<00:15,  3.86s/epoch]

Epoch: 2, Train Loss: 0.665, 


Training:  57%|█████▋    | 4/7 [00:15<00:11,  3.89s/epoch]

Epoch: 3, Train Loss: 0.618, 


Training:  71%|███████▏  | 5/7 [00:19<00:07,  3.80s/epoch]

Epoch: 4, Train Loss: 0.580, 


Training:  86%|████████▌ | 6/7 [00:23<00:03,  3.78s/epoch]

Epoch: 5, Train Loss: 0.548, 


Training: 100%|██████████| 7/7 [00:26<00:00,  3.83s/epoch]

Epoch: 6, Train Loss: 0.521, 
Training complete!





At this point, we have a trained Matrix Factorization model and we are ready to generate the predictions!

## Section 3: User Rating Prediction Generation 🔧

Now that we completed the training of the Matrix Factorization model, we generate the predictions to be submitted to Kaggle.

### Generate the predictions by running the testing data.

In [17]:
# Create a list to store the predicted ratings
predicted_ratings = []

model.eval()
for user_movie_Id, _, genre_vector in test_dataloader:
    # Fetch the data
    user_id = user_movie_Id[0].to(device)
    movie_id = user_movie_Id[1].to(device)
    genre_vector_ = genre_vector.to(device)

    # Generate the rating prediction
    predicted_rating = model(user_id, movie_id, genre_vector_)
    
    # Keep the results
    predicted_ratings.append(predicted_rating.item())

# Add the data to the final dataframe
test_ratings["rating"] = predicted_ratings
test_ratings.reset_index(inplace=True)
test_ratings.rename(columns={"index": "Id"}, inplace=True)

# Keep only the columns needed for the submission
test_ratings.drop(columns=["userId", "movieId", "genres"], inplace=True)

test_ratings.head()

Unnamed: 0,Id,rating
0,0,3.348304
1,1,3.296681
2,2,2.482054
3,3,3.548125
4,4,3.535537


### Save the generated predictions for submission

In [18]:
# Save the predictions to a csv file
filename = "final_predictions.csv"
test_ratings.to_csv(filename, index=False)

print(f"Predictions saved to {filename}")

Predictions saved to final_predictions.csv
