In [31]:
import pandas as pd
import torch
from torch import nn
from torch import optim
import numpy as np
import matplotlib.pyplot as plt
from torch.nn.init import xavier_uniform_
from sklearn.model_selection import train_test_split
import seaborn as sns

sns.set_theme()


# Seed both random number generators so the notebook is reproducible:
# the torch seed drives the embedding initialisation of the model, the NumPy
# seed covers any NumPy-based randomness.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1e30a95ab50>

In [32]:

# Training interactions; the columns used below are user_id, book_id and rating.
df_train = pd.read_csv('./data/train.csv')

# Distinct raw ids, in order of first appearance.
user_ids = df_train['user_id'].unique()
book_ids = df_train['book_id'].unique()

n_users = len(user_ids)
n_books = len(book_ids)

# Map every raw id to a contiguous row index of its embedding matrix.
user_idx = dict(zip(user_ids, range(n_users)))
book_idx = dict(zip(book_ids, range(n_books)))

# Tensors over the WHOLE training file: used for the final fit once the
# hyperparameters have been selected.
# Embedding row of the user / book of every interaction.
user_assigned_idx = torch.LongTensor(df_train['user_id'].map(user_idx).values)
book_assigned_idx = torch.LongTensor(df_train['book_id'].map(book_idx).values)

# Targets for the training loss.
ratings = torch.FloatTensor(df_train['rating'].values)



In [33]:
# Hold-out split used only for hyperparameter selection.
# The embedding row indices are attached as columns so they follow the rows
# through the split.
df_train['user_idx'] = df_train['user_id'].map(user_idx)
df_train['book_idx'] = df_train['book_id'].map(book_idx)

# Only 1% is held out so that most user-book interactions remain in training.
train_data, val_data = train_test_split(df_train, test_size=0.01, random_state=42)

# Training part of the split.
user_assigned_idx_train, book_assigned_idx_train, ratings_train = (
    torch.LongTensor(train_data['user_idx'].to_numpy()),
    torch.LongTensor(train_data['book_idx'].to_numpy()),
    torch.FloatTensor(train_data['rating'].to_numpy()),
)

# Validation part of the split.
user_assigned_idx_val, book_assigned_idx_val, ratings_val = (
    torch.LongTensor(val_data['user_idx'].to_numpy()),
    torch.LongTensor(val_data['book_idx'].to_numpy()),
    torch.FloatTensor(val_data['rating'].to_numpy()),
)




In [34]:
class MatrixFactorization(nn.Module):
    def __init__(self, n_users, n_books, embedding_size, var, init):
        """
        Initializes a MatrixFactorization model
        P contains the embedded vectors of users
        Q contains the embedded vectors of books

        input:
            n_users: int, number of users (rows of P)
            n_books: int, number of books (rows of Q)
            embedding_size: int, dimension of embedded space
            var: float, scale of the initial weights: upper bound of the
                 [0, var) range for 'uniform', standard deviation for 'normal',
                 ignored otherwise
            init: string, method used to initialize weights, one of
                  {'uniform', 'normal', 'xavier'}; any other value keeps the
                  default initialization of nn.Embedding
        """
        super().__init__()
        self.P = nn.Embedding(n_users, embedding_size)
        self.Q = nn.Embedding(n_books, embedding_size)

        # Override the default nn.Embedding initialization when requested.
        # nn.init operates in-place without tracking gradients, which avoids
        # going through the discouraged `.data` attribute.
        if init == 'uniform':
            nn.init.uniform_(self.P.weight, 0, var)
            nn.init.uniform_(self.Q.weight, 0, var)
        elif init == 'normal':
            nn.init.normal_(self.P.weight, mean=0, std=var)
            nn.init.normal_(self.Q.weight, mean=0, std=var)
        elif init == 'xavier':
            xavier_uniform_(self.P.weight)
            xavier_uniform_(self.Q.weight)

    def forward(self, user_id, book_id):
        """
        Forward pass to predict ratings

        inputs:
            user_id: integer tensor, embedding row indices of the users
            book_id: integer tensor, embedding row indices of the books
                     (same shape as user_id)
        output:
            out: tensor with the shape of the inputs, predicted ratings of
                 the (user, book) pairs
        """
        user_vec = self.P(user_id)
        book_vec = self.Q(book_id)
        # Dot product over the embedding axis. Reducing over the LAST axis
        # (instead of axis 1) also supports a single scalar pair and index
        # tensors with extra leading dimensions.
        out = (user_vec * book_vec).sum(dim=-1)
        return out


    


In [35]:

# Mean-squared-error criterion used by both training and validation.
mse_metric = nn.MSELoss(reduction='mean')

def train(user_assigned_idx, book_assigned_idx, ratings, embedding_size, var, init, decay, lr, lambda_, N_EPOCH, verbose):
    """
    Trains a MF model given the hyperparameters, using full-batch Adam steps

    Input:
        user_assigned_idx: tensor of embedding row indices of the users to train on
        book_assigned_idx: tensor of embedding row indices of the books to train on
        ratings: tensor of ratings to train on
        embedding_size: int, dimension of embedded space
        var: float, scale of the values used for weight initialization
        init: string, type of weight initialization
        decay: float, l2 regularization implemented through Adam's weight_decay
        lr: float, optimizer's learning rate
        lambda_: float, weight of the penalty pulling the mean predicted rating
                 towards 2.5; it is part of the loss, pass 0 to disable it
        N_EPOCH: int, number of epochs (one gradient step each) to train for
        verbose: boolean, whether to print rmse and mean rating every 100 epochs

    Output:
        model: a trained MF model (left in training mode)
    """

    # The embedding tables are sized with the notebook-level n_users / n_books
    # so that every known user and book has a row, whatever subset is trained on.
    model = MatrixFactorization(n_users, n_books, embedding_size, var, init)

    # weight decay acts as a l2 regularizer
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=decay)

    model.train()

    for epoch in range(N_EPOCH):

        optimizer.zero_grad()

        # prediction on the whole training set
        r_hat = model(user_assigned_idx, book_assigned_idx)

        mse = mse_metric(r_hat, ratings)
        # adds regularization term to aim for a mean predicted rating of 2.5
        loss = mse + lambda_*(torch.mean(r_hat) - 2.5)**2

        # update
        loss.backward()
        optimizer.step()

        if verbose and (epoch+1) % 100 == 0:
            # Reporting only: computed outside of autograd and only when it is
            # actually printed. The values describe the predictions made
            # BEFORE this epoch's parameter update.
            with torch.no_grad():
                rmse = torch.sqrt(mse)
                mean_rating = torch.mean(r_hat)
            print(f'epoch {epoch+1}, RMSE: {rmse.item()}, mean rating: {mean_rating.item()}')

    return model


def validation(model, user_assigned_idx_val, book_assigned_idx_val, ratings):
    """
    Estimates error of a trained model on unseen data

    Input:
        model: MF object, a trained model (switched to evaluation mode here)
        user_assigned_idx_val: tensor of embedding row indices of the validation users
        book_assigned_idx_val: tensor of embedding row indices of the validation books
        ratings: tensor of ratings of the validation set

    Output:
        rmse: float, rmse of the clipped predictions on the validation set
        mean_rating: 0-dim tensor (no gradient attached), mean clipped
                     predicted rating
    """

    # evaluation mode as training is done
    model.eval()

    # No gradients are needed to score the model: skip building the autograd
    # graph so the returned values do not keep it alive.
    with torch.no_grad():
        # predictions
        r_hat = model(user_assigned_idx_val, book_assigned_idx_val)
        # get ratings between 0 and 5 to lower possible error
        r_hat_clipped = torch.clamp(r_hat, 0, 5)

        err = mse_metric(r_hat_clipped, ratings)
        rmse = torch.sqrt(err)
        mean_rating = torch.mean(r_hat_clipped)

    return rmse.item(), mean_rating




In [36]:
"""
#current best:
d_star = 150
lr_star = 1e-4
var_star = 1e-4
decay_star = 1e-6
N_EPOCH_STAR = 1000
lambda_star = 0
init_star = 'uniform'

"""

"\n#current best:\nd_star = 150\nlr_star = 1e-4\nvar_star = 1e-4\ndecay_star = 1e-6\nN_EPOCH_STAR = 1000\nlambda_star = 0\ninit_star = 'uniform'\n\n"

In [None]:

#optimal hyperparameters
d_star = 250            # embedding dimension
lr_star = 1e-4          # Adam learning rate
var_star = 1e-4         # scale of the initial weights
decay_star = 1e-6       # Adam weight decay (l2 regularization)
N_EPOCH_STAR = 1500     # number of full-batch epochs
lambda_star = 0         # weight of the mean-rating penalty (0 = disabled)
init_star = 'uniform'   # weight initialization method

#train on whole dataset
# Hyperparameters are passed by keyword so they cannot be swapped silently.
model_star = train(
    user_assigned_idx,
    book_assigned_idx,
    ratings,
    embedding_size=d_star,
    var=var_star,
    init=init_star,
    decay=decay_star,
    lr=lr_star,
    lambda_=lambda_star,
    N_EPOCH=N_EPOCH_STAR,
    verbose=False,
)

epoch 100, RMSE: 2.5679476261138916, mean rating: 0.04473259672522545
epoch 200, RMSE: 2.411073684692383, mean rating: 0.2195410132408142
epoch 300, RMSE: 2.1536216735839844, mean rating: 0.5107437968254089
epoch 400, RMSE: 1.8381308317184448, mean rating: 0.8762753009796143
epoch 500, RMSE: 1.5162923336029053, mean rating: 1.2596404552459717
epoch 600, RMSE: 1.232742190361023, mean rating: 1.6027053594589233
epoch 700, RMSE: 1.0099505186080933, mean rating: 1.868487000465393
epoch 800, RMSE: 0.8495232462882996, mean rating: 2.052623987197876
epoch 900, RMSE: 0.7430964708328247, mean rating: 2.1715707778930664
epoch 1000, RMSE: 0.6784493923187256, mean rating: 2.245534896850586
epoch 1100, RMSE: 0.6421830654144287, mean rating: 2.290447473526001
epoch 1200, RMSE: 0.622795820236206, mean rating: 2.3171491622924805
epoch 1300, RMSE: 0.6124729514122009, mean rating: 2.3326828479766846
epoch 1400, RMSE: 0.6067605018615723, mean rating: 2.3415334224700928
epoch 1500, RMSE: 0.603381395339965

In [None]:

def write_submission(model, df_test):
    """
    Predicts the ratings of the test pairs, writes them to ./submission.csv
    for the Kaggle competition and returns them

    input:
        model: trained MatrixFactorization object (switched to evaluation mode here)
        df_test: pandas dataframe containing the (user, book) pairs we want to
                 predict, in columns user_id and book_id

    output:
        submission: pandas dataframe with columns id (row position in df_test)
                    and rating (prediction clipped to [0, 5])

    raises:
        KeyError: if df_test contains a user or book id missing from the
                  notebook-level user_idx / book_idx mappings
    """

    #put model in evaluating mode as training is done
    model.eval()

    #all users and book are already in train.csv -> no cold start problem
    # The dtype is given explicitly so the index tensors are valid embedding
    # indices on every platform, including for an empty test frame.
    test_user_assigned_idx = torch.tensor([user_idx[i] for i in df_test['user_id'].values], dtype=torch.long)
    test_book_assigned_idx = torch.tensor([book_idx[i] for i in df_test['book_id'].values], dtype=torch.long)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        predicted_ratings = model(test_user_assigned_idx, test_book_assigned_idx)
        predicted_ratings_clipped = torch.clamp(predicted_ratings, 0, 5)

    #cast in list of Python floats for saving
    final = predicted_ratings_clipped.tolist()

    submission = pd.DataFrame({
            'id': range(len(df_test)),
            'rating': final
        })

    submission.to_csv('./submission.csv', index=False)

    return submission


# Score the test pairs with the final model (this also writes ./submission.csv).
df_test = pd.read_csv('./data/test.csv')
submission = write_submission(model=model_star, df_test=df_test)



2.1348255927120983


Unnamed: 0,id,rating
0,0,2.030217
1,1,1.779391
2,2,1.078360
3,3,1.772655
4,4,2.313256
...,...,...
29362,29362,1.845475
29363,29363,1.748242
29364,29364,2.704699
29365,29365,2.521550
