In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import utils

### (1) Load Artifacts

In [None]:
# Load the id-mapping artifacts; their lengths size the model configuration below.
# NOTE(review): these are .pkl files, so utils.open_object presumably unpickles them —
# only load artifacts from a trusted source (confirm against utils).
user_id_map_dict = utils.open_object("./artifacts/user_id_map_dict.pkl")
movie_id_map_dict = utils.open_object("./artifacts/movie_id_map_dict.pkl")
genres_map_dict = utils.open_object("./artifacts/genres_map_dict.pkl")

In [None]:
# Entry counts of the three id maps (users, movies, genres), in that order.
num_user, num_movie, num_genere = map(
    len, (user_id_map_dict, movie_id_map_dict, genres_map_dict)
)

### (2) Modeling

In [None]:
# Hyper-parameter container: filled in by the next cell, then wrapped by Config.
config_dict={}

In [None]:
# Model hyper-parameters; NeuMF reads them as attributes once wrapped in Config.
config_dict['num_user'] = num_user
config_dict['num_item'] = num_movie
config_dict['num_genere'] = num_genere  # stored in the config, but NeuMF does not read it
config_dict['latent_dim_mlp'] = 128
config_dict['latent_dim_mf'] = config_dict['latent_dim_mlp']
# The first MLP layer consumes the concatenated user + item *MLP* embeddings, so its
# input width is 2 * latent_dim_mlp. Deriving it from latent_dim_mf only worked while
# both dims happened to be equal.
config_dict['layers'] = [config_dict['latent_dim_mlp'] * 2, 64, 32]
config_dict['dropout_rate_mf'] = 0.2
config_dict['dropout_rate_mlp'] = 0.2

In [None]:
class Config:
    """Attribute-style view of a settings mapping.

    Every key of ``dictionary`` becomes an instance attribute holding the
    corresponding value, e.g. ``Config({'a': 1}).a == 1``.
    """

    def __init__(self, dictionary):
        for name in dictionary:
            setattr(self, name, dictionary[name])

In [None]:
# Expose the hyper-parameters as attributes (config.num_user, config.layers, ...),
# which is how NeuMF accesses them.
config = Config(dictionary=config_dict)

In [None]:
%%writefile model.py

import torch

class NeuMF(torch.nn.Module):
    """Neural matrix factorization: a GMF branch fused with an MLP branch.

    Each branch owns its own user and item embedding tables. The GMF branch
    multiplies the two embeddings element-wise; the MLP branch concatenates
    them and passes them through a stack of ``Linear -> ReLU`` layers. The
    two branch outputs are concatenated and projected to one sigmoid score.

    Args:
        config: object exposing the attributes ``num_user``, ``num_item``,
            ``latent_dim_mf``, ``latent_dim_mlp``, ``layers``,
            ``dropout_rate_mf`` and ``dropout_rate_mlp``. ``layers`` lists the
            MLP widths; ``layers[0]`` must equal ``2 * latent_dim_mlp``
            because the first layer consumes the concatenated embeddings.
    """

    def __init__(self, config):
        super(NeuMF, self).__init__()
        self.config = config

        # matrix factorization part
        self.embedding_user_mf = torch.nn.Embedding(
            num_embeddings=self.config.num_user, embedding_dim=self.config.latent_dim_mf)
        self.embedding_item_mf = torch.nn.Embedding(
            num_embeddings=self.config.num_item, embedding_dim=self.config.latent_dim_mf)

        # multilayer perceptron part
        self.embedding_user_mlp = torch.nn.Embedding(
            num_embeddings=self.config.num_user, embedding_dim=self.config.latent_dim_mlp)
        self.embedding_item_mlp = torch.nn.Embedding(
            num_embeddings=self.config.num_item, embedding_dim=self.config.latent_dim_mlp)

        # One Linear per consecutive pair of widths in config.layers.
        self.fc_layers = torch.nn.ModuleList(
            torch.nn.Linear(in_size, out_size)
            for in_size, out_size in zip(self.config.layers[:-1], self.config.layers[1:])
        )

        self.logits = torch.nn.Linear(
            in_features=self.config.layers[-1] + self.config.latent_dim_mf, out_features=1)
        self.sigmoid = torch.nn.Sigmoid()

        # Dropout must be a registered sub-module: a Dropout created inside
        # forward() is always in training mode, so model.eval() could never
        # switch it off. These modules hold no parameters, so the state_dict
        # layout is unchanged.
        self.dropout_mf = torch.nn.Dropout(self.config.dropout_rate_mf)
        self.dropout_mlp = torch.nn.Dropout(self.config.dropout_rate_mlp)
        self.relu = torch.nn.ReLU()

    def forward(self, user_indices, item_indices):
        """Score user/item pairs.

        Args:
            user_indices: integer tensor of user embedding ids.
            item_indices: integer tensor of item embedding ids, same shape as
                ``user_indices``.

        Returns:
            Tensor of sigmoid scores in (0, 1) whose last dimension has size 1.
        """
        user_embedding_mlp = self.embedding_user_mlp(user_indices)
        item_embedding_mlp = self.embedding_item_mlp(item_indices)

        user_embedding_mf = self.embedding_user_mf(user_indices)
        item_embedding_mf = self.embedding_item_mf(item_indices)

        # mf part: element-wise product
        mf_vector = torch.mul(user_embedding_mf, item_embedding_mf)
        mf_vector = self.dropout_mf(mf_vector)

        # mlp part
        # the concat latent vector
        mlp_vector = torch.cat(
            [user_embedding_mlp, item_embedding_mlp], dim=-1)

        # ReLU follows every hidden layer. Rationale kept from the original
        # notes (apparently quoted from the paper this model is based on):
        #   1) The sigmoid function restricts each neuron to be in (0, 1), which
        #      may limit the model's performance; and it is known to suffer from
        #      saturation, where neurons stop learning when their output is near
        #      either 0 or 1.
        #   2) Even though tanh is a better choice and has been widely adopted
        #      [6, 44], it only alleviates the issues of sigmoid to a certain
        #      extent, since it can be seen as a rescaled version of sigmoid.
        #   3) As such, we opt for ReLU, which is more biologically plausible
        #      and proven to be non-saturated [9]; moreover, it encourages
        #      sparse activations, being well-suited for sparse data and making
        #      the model less likely to be overfitting. Our empirical results
        #      show that ReLU yields slightly better performance than tanh,
        #      which in turn is significantly better than sigmoid.
        for fc_layer in self.fc_layers:
            mlp_vector = self.relu(fc_layer(mlp_vector))

        mlp_vector = self.dropout_mlp(mlp_vector)

        vector = torch.cat([mlp_vector, mf_vector], dim=-1)
        logits = self.logits(vector)
        output = self.sigmoid(logits)
        return output

In [None]:
from model import NeuMF

In [None]:
# recommender(user_indices=sample_dataset['user_embed_id'],
#             item_indices=sample_dataset['movie_embed_id'])

### (3) Load Processed Data

In [None]:
# Preprocessed frame; columns used below: user_embed_id, movie_embed_id,
# genres_embed_ids and rating.
df_processed = pd.read_parquet("./data/processed.parquet")

In [None]:
# Length of the longest genre-id list in the data.
# NOTE(review): this value is not passed to RatingDataset below, which falls back
# to its own default — confirm whether it should be.
max_genres = max(len(genre_ids) for genre_ids in df_processed['genres_embed_ids'])

In [None]:
%%writefile dataset.py
import torch
from torch.utils.data import Dataset
# from torch.nn.utils.rnn import pad_sequence


class RatingDataset(Dataset):
    """Map-style dataset yielding user / movie / rating samples from a DataFrame.

    Args:
        data: DataFrame with the columns ``user_embed_id``, ``movie_embed_id``
            and ``rating``; rows are addressed by position.
        max_genres: stored on the instance, reserved for padding/truncating
            genre-id lists; ``__getitem__`` does not use it yet.
    """

    def __init__(self, data, max_genres=10):
        self.data = data
        self.max_genres = max_genres

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample at positional ``index`` as a dict of tensors.

        Returns:
            dict with ``user_embed_id`` and ``movie_embed_id`` (``torch.long``)
            and ``rating`` (``torch.float``).
        """
        # One positional lookup; the row is reused for every field below.
        row = self.data.iloc[index]

        # TODO: genre features ("genres_embed_ids", padded/truncated to
        # self.max_genres) are not emitted yet.
        return {
            "user_embed_id": torch.tensor(row["user_embed_id"], dtype=torch.long),
            "movie_embed_id": torch.tensor(row["movie_embed_id"], dtype=torch.long),
            "rating": torch.tensor(row["rating"], dtype=torch.float),
        }

In [None]:
from dataset import RatingDataset
from sklearn.model_selection import train_test_split

In [None]:
# Reuse the frame already loaded into df_processed instead of reading
# ./data/processed.parquet from disk a second time; nothing in between mutates it.
df = df_processed

In [None]:
# Hold out 10% of the rows for testing; the fixed random_state keeps the
# shuffled split repeatable across runs.
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    shuffle=True,
    random_state=33,
)

In [None]:
# Renumber the rows of both splits after the shuffle.
# NOTE(review): without drop=True the previous index is kept as an extra column,
# which is then written to the parquet files below, and re-running this cell adds
# another one — confirm this is intended.
df_train, df_test = (split.reset_index() for split in (df_train, df_test))

In [None]:
# Persist both splits next to the processed data.
df_train.to_parquet("./data/train.parquet")
df_test.to_parquet("./data/test.parquet")

In [None]:
# Wrap the training split; max_genres is left at the RatingDataset default.
train_dataset = RatingDataset(data=df_train)

In [None]:
# Quick look at the first rows.
# NOTE(review): __getitem__ forwards the slice straight to DataFrame.iloc, so this
# presumably yields ONE dict of 10-element tensors rather than 10 separate samples —
# confirm this is the intended check.
train_dataset[:10]

In [None]:
from torch.utils.data import DataLoader

In [None]:
# Shuffled mini-batches of 32 samples from the training dataset.
loader = DataLoader(train_dataset,batch_size=32,shuffle=True)

In [None]:
# Pull a single batch for inspection. next(iter(...)) raises StopIteration right
# here if the loader is empty, instead of leaving `sample` undefined for later cells.
sample = next(iter(loader))

In [None]:
sample

In [None]:
# Instantiate the model with freshly initialised (untrained) weights.
recommender = NeuMF(config=config)

In [None]:
sample

In [None]:
# Smoke test: one forward pass on the sampled batch.
# NOTE(review): the model is still in training mode here (no .eval() call), so
# dropout is active and the scores vary between runs.
recommender(user_indices=sample['user_embed_id'],
            item_indices=sample['movie_embed_id'])