# Matrix Factorization

This notebook implements matrix factorization for the MovieLens dataset. 

## Imports and Libraries

In [1]:
import torch
import pandas as pd
import numpy as np
import time
import os
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
import time
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from torch import nn
import matplotlib.pyplot as plt

seed_value = 42
np.random.seed(seed_value)

## Section 1: Load the Movielens data, do the needed preprocessing and merging

In [2]:
#Get all the data paths 
current_directory = os.getcwd()
general_path = os.path.join(current_directory, 'project2_Data')

movies_path = os.path.join(general_path, 'movies.csv')
links_path = os.path.join(general_path, 'links.csv')
tags_path = os.path.join(general_path, 'tags.csv')

In [3]:
# Training and test ratings sets 
training_path = os.path.join(general_path, 'train_ratings.csv')
test_path = os.path.join(general_path, 'test_set_no_ratings.csv')

In [4]:
movies = pd.read_csv(movies_path)
links = pd.read_csv(links_path)

movies = pd.concat([movies,links],axis=1)   # concatenate the two infromation dataframes for the movies
movies = movies.loc[:, ~movies.columns.duplicated()] # remove the duplicate columns 
movies.head(1)

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0


In [5]:
tags = pd.read_csv(tags_path)
tags.rename(columns={'userId':'user_tag_id'},inplace=True)
grouped_tags = tags.groupby(['user_tag_id', 'movieId']).agg(list).reset_index()
grouped_tags.drop(columns='timestamp',inplace=True)
grouped_tags.head(1)

Unnamed: 0,user_tag_id,movieId,tag
0,2,60756,"[funny, Highly quotable, will ferrell]"


In [6]:
#Prepare the training dataset 
train = pd.read_csv(training_path)
train.drop(columns='timestamp',inplace=True)
merged_train = train.merge(movies,on='movieId',how='left')

test = pd.read_csv(test_path)

#Add the corresponding tags for each user-movieid combination
merged_full = pd.merge(merged_train, grouped_tags, how='left', left_on=['userId', 'movieId'], right_on=['user_tag_id', 'movieId'])
merged_full.drop(columns='user_tag_id',inplace=True)
merged_full['rating_count_per_user']=merged_full.groupby('userId')['rating'].transform("count")

In [7]:
merged_full.head(5)

Unnamed: 0,userId,movieId,rating,title,genres,imdbId,tmdbId,tag,rating_count_per_user
0,509,7347,3.0,Secret Window (2004),Mystery|Thriller,363988,1586.0,,374
1,326,71462,4.0,"Cove, The (2009)",Documentary,1313104,23128.0,,126
2,57,2115,3.0,Indiana Jones and the Temple of Doom (1984),Action|Adventure|Fantasy,87469,87.0,,383
3,610,1127,4.0,"Abyss, The (1989)",Action|Adventure|Sci-Fi|Thriller,96754,2756.0,,1044
4,462,2409,2.0,Rocky II (1979),Action|Drama,79817,1367.0,,366


In [8]:
print(f"Number of unique user IDs: {merged_full['userId'].unique().size}")
print(f"Number of unique movie IDs: {merged_full['movieId'].unique().size}")

Number of unique user IDs: 610
Number of unique movie IDs: 8983


In [9]:
merged_full['rating_count_per_user'].describe()

count    80668.000000
mean       483.686914
std        520.633681
min         13.000000
25%        115.000000
50%        301.000000
75%        679.000000
max       2122.000000
Name: rating_count_per_user, dtype: float64

In [10]:
average_ratings_per_user = merged_full['rating_count_per_user'].mean()
print(f"Average number of ratings per user: {average_ratings_per_user:.2f}")

Average number of ratings per user: 483.69


## Section 2: Matrix Factorization Implementation 

In [11]:
class MatrixFactorization():
    
    def __init__(self, rating_matrix, scale=0.03, num_features=400, l_rate=0.005, lambda_=0.1, num_iterations=400):
        
        '''
        Parameters:
        rating_matrix: The user-item rating matrix
        scale: Scaling factor for initializing user and item vectors
        num_features: Number of latent factors for user and item vectors
        l_rate: Learning rate for gradient descent
        lambda_: Regularization parameter to prevent overfitting
        num_iterations: Number of iterations for training the model
            
        '''
        self.rating_matrix = rating_matrix
        self.scale = scale
        self.num_features = num_features
        self.l_rate = l_rate  
        self.lambda_ = lambda_  
        self.num_iterations = num_iterations
        #Indices of non-zero elements in the rating matrix
        self.non_zero_row_ind, self.non_zero_col_ind = rating_matrix.nonzero()   #Non-zero indices
        self.num_pairs= len(rating_matrix[np.where(rating_matrix != 0)])  # Number of non-zero entries in the rating matrix 
        self.indices = list(range(self.num_pairs))   # Indices of the pairs of rating matrix 
        self.losses = []                             # To store training losses
        
        self.wait = 10
        self.no_change = 10       
        self.early_stopping = False

        # Initialize Bias Values
        self.user_biases = np.zeros(self.rating_matrix.shape[0])
        self.item_biases = np.zeros(self.rating_matrix.shape[1])
        
        # Initialize user & item vectors        
        self.user_features = np.random.normal(scale=self.scale, size=(self.rating_matrix.shape[0], self.num_features))
        self.item_features = np.random.normal(scale=self.scale, size=(self.rating_matrix.shape[1], self.num_features))
        # Compute global bias
        self.global_bias = np.mean(self.rating_matrix[np.where(self.rating_matrix != 0)])        
        
    def predict(self, u, i):
        return  self.global_bias + self.user_biases[u] + self.item_biases[i] + self.user_features[u] @ self.item_features[i]
    
    def fit(self, ):
        '''
        Train the matrix factorization model using stochastic gradient descent.
        '''
        self.now = time.time()
        
        for epoch in range(1, self.num_iterations):
            
            epoch_loss = 0
            np.random.shuffle(self.indices)

            
            if self.early_stopping == False:
                
                for index in self.indices:
                    # Extracting user item information indices in which we have a rating
                    u, i = self.non_zero_row_ind[index], self.non_zero_col_ind[index]
                    pred_rat = self.predict(u, i)
                    
                    error = self.rating_matrix[u, i] - pred_rat
                    
                    # Update biases
                    self.user_biases[u] += self.l_rate * (error - self.lambda_ * self.user_biases[u])
                    self.item_biases[i] += self.l_rate * (error - self.lambda_ * self.item_biases[i])

                    # Update User and item Vectors
                    self.user_features[u, :] += self.l_rate * (error * self.item_features[i, :] - self.lambda_ * self.user_features[u, :])
                    self.item_features[i, :] += self.l_rate * (error * self.user_features[u, :] - self.lambda_ * self.item_features[i, :])
                
                # Calculate epoch error 
                for index in self.indices:
                    # Extracting user item information indices in which we have a rating
                    u, i = self.non_zero_row_ind[index], self.non_zero_col_ind[index]
                    pred_rat = self.predict(u, i)
                    epoch_loss += (self.rating_matrix[u, i] - pred_rat) ** 2
                avg_loss = epoch_loss / self.num_pairs
                self.losses.append(avg_loss)
                
                # Check for early stopping 
                if len(self.losses)>1 and (self.losses[-2] - self.losses[-1]) <= 1e-3:
                    if self.wait == self.no_change:
                        self.early_stopping = True
                    self.wait += 1
                else:
                    self.wait = 0
                    
        temp = np.round(time.time() - self.now, 3)            
        print(f"Fitted in {temp} seconds.")
        
    def loss_plot(self, ):
        '''
        Plot the training Loss vs. num_iterations.
        '''
        plt.figure(figsize=(18, 6))
        plt.plot(range(1, 1 + len(self.losses)), self.losses, marker='o')
        plt.title("Training Loss vs. Iterations", fontsize=20)
        plt.xlabel('Number of iterations', fontsize=18)
        plt.ylabel('Error', fontsize=18)
        plt.xticks(range(1, self.conv_epoch_num + 5), fontsize=15, rotation=90)  # There's no variable named 'conv_epoch_num'
        plt.yticks(np.linspace(min(self.losses), max(self.losses), 15), fontsize=15)
        plt.grid()
        plt.show()

## Section 3: Create a validation set by masking the training set and retaining all user IDs in both sets.

In [13]:
### For the validation set we masked 40 last ratings for each userId 
def get_mask_indexes(data, mask_rate=40):
    mask_indexes = []
    users = data['userId'].unique()
    
    for user_id in users:
        #If one user have less than 40 ratings which is the case for around 7% entries we just take the last 10 ratings 
        #So this way we ensure that in validation set we dont have unseen user Ids
        if len(data[data['userId'] == user_id])<=40:
            mask_indexes += data[data['userId'] == user_id].iloc[-5:].index.tolist()         
        else:
            mask_indexes += data[data['userId'] == user_id].iloc[-mask_rate:].index.tolist()
    
    return mask_indexes

masked_indexes = get_mask_indexes(merged_full)
training_set =  merged_full.drop(masked_indexes)
validation_set = merged_full.loc[masked_indexes]
print(f"Training set length: {len(training_set)}") 
print(f"Validation set length: {len(validation_set)}")


Training set length: 64423
Validation set length: 16245


In [14]:
validation_set.head(2)

Unnamed: 0,userId,movieId,rating,title,genres,imdbId,tmdbId,tag,rating_count_per_user
71664,509,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,,374
71775,509,28,3.5,Persuasion (1995),Drama|Romance,114117,17015.0,,374


In [15]:
training_set.head(2)

Unnamed: 0,userId,movieId,rating,title,genres,imdbId,tmdbId,tag,rating_count_per_user
0,509,7347,3.0,Secret Window (2004),Mystery|Thriller,363988,1586.0,,374
1,326,71462,4.0,"Cove, The (2009)",Documentary,1313104,23128.0,,126


In [16]:
set(validation_set.userId.values ).issubset(training_set.userId.values)

True

## Section 4: Generate the Training Matrix and Fine-tune the Hyperparameters

In [17]:
# Create user_movie_matrix for the training/validation set

def create_matrix(X_train,X_val):

    user_ids = np.unique(np.concatenate([X_train['userId'].values, X_val['userId'].values]))
    movie_ids = np.unique(np.concatenate([X_train['movieId'].values, X_val['movieId'].values]))
    user_movie_matrix = np.zeros((len(user_ids), len(movie_ids)))

    user_id_to_index = {user_id: index for index, user_id in enumerate(user_ids)}
    movie_id_to_index = {movie_id: index for index, movie_id in enumerate(movie_ids)}

    for index, row in X_train.iterrows():
        user_index = user_id_to_index[row['userId']]
        movie_index = movie_id_to_index[row['movieId']]
        user_movie_matrix[user_index, movie_index] = row['rating']
    return user_id_to_index,movie_id_to_index,user_movie_matrix

train_user_id_to_index,train_movie_id_to_index,train_user_movie_matrix = create_matrix(training_set,validation_set)


In [None]:
'  
# Define the hyperparameter values to explore
num_features_values = [300,350,400,250,200]
lambda_values = [0.1]
lr_rate_factors = [0.005]
scale_values = [0.03,0.035]
best_rmse = float('inf')
best_params = None
        
for num_features in num_features_values:
    for l_rate in lr_rate_factors:
        for lambda_ in lambda_values:
            for scale in scale_values:

                # Create and fit the MatrixFactorization model with current hyperparameters
                model = MatrixFactorization(train_user_movie_matrix,scale=scale,l_rate=l_rate, num_features=num_features, lambda_=lambda_)
                model.fit()

                # Make predictions on the validation set
                preds = []
                for index, row in validation_set.iterrows():
                    user_id = row['userId']
                    movie_id = row['movieId']

                    user_index = train_user_id_to_index[user_id]
                    movie_index = train_movie_id_to_index[movie_id]

                    predicted_rating = model.predict(user_index, movie_index)
                    preds.append(predicted_rating)

                # Calculate RMSE for the current hyperparameters
                rmse = np.sqrt(mean_squared_error(validation_set['rating'], preds))
                print(f"Hyperparameters: num_features={num_features}, lambda_={lambda_},lr={l_rate},scale={scale} - RMSE: {rmse}")

                # Update the best parameters if the current RMSE is lower
                if rmse < best_rmse:
                    best_rmse = rmse
                    best_params = {'num_features': num_features,'lr':l_rate, 'lambda_': lambda_}
        
print(f"Best Hyperparameters: {best_params}")
print(f"Best RMSE: {best_rmse}")

## Generate the predictions for the Testing set

In [18]:
#Use the function instead 
all_user_ids = np.unique(np.concatenate([train['userId'].values, test['userId'].values]))
all_movie_ids = np.unique(np.concatenate([train['movieId'].values, test['movieId'].values]))
user_movie_matrix = np.zeros((len(all_user_ids), len(all_movie_ids)))

user_id_to_index = {user_id: index for index, user_id in enumerate(all_user_ids)}
movie_id_to_index = {movie_id: index for index, movie_id in enumerate(all_movie_ids)}

for index, row in train.iterrows():
    user_index = user_id_to_index[row['userId']]
    movie_index = movie_id_to_index[row['movieId']]
    user_movie_matrix[user_index, movie_index] = row['rating']
user_movie_matrix.shape

(610, 9724)

In [19]:
test_matrix = MatrixFactorization(user_movie_matrix)
test_matrix.fit()

Fitted in 246.984 seconds.


#### Create the final csv file

In [20]:
test['rating']=0.0
for index, row in test.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']

    user_index = user_id_to_index[user_id]
    movie_index = movie_id_to_index[movie_id]

    # Assuming predict is a function that takes user and movie indices
    predicted_rating = test_matrix.predict(user_index, movie_index)

    # Update the 'predicted_rating' column with the predicted rating
    test.at[index, 'rating'] = predicted_rating
test.drop(columns=['userId','movieId'],inplace=True)
test.head(5)

Unnamed: 0,Id,rating
0,0,3.064759
1,1,3.393295
2,2,2.605893
3,3,3.83508
4,4,3.733241


In [None]:
test.to_csv('predictions.csv', index=False)
print(f"CSV file '{csv_file}' has been created.")