In [1]:
# Reload imported modules before each cell runs, so edits to local .py files
# (utils, and the model.py / dataset.py files written further down) are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import utils

### (1) Load Artifacts

In [3]:
# Load the three id-mapping artifacts in one pass (user ids, movie ids, genres).
# NOTE(review): the .pkl suffix suggests pickle files - only load trusted artifacts.
user_id_map_dict, movie_id_map_dict, genres_map_dict = map(
    utils.open_object,
    (
        "./artifacts/user_id_map_dict.pkl",
        "./artifacts/movie_id_map_dict.pkl",
        "./artifacts/genres_map_dict.pkl",
    ),
)

In [4]:
# Number of entries in each mapping; these become the embedding-table sizes in the config.
num_user, num_movie, num_genre = map(
    len, (user_id_map_dict, movie_id_map_dict, genres_map_dict)
)

### (2) Modeling

In [5]:
# Empty placeholder: the next cell re-creates config_dict from scratch before filling it.
config_dict={}

In [6]:
# Model / training settings. Key names and insertion order are kept exactly as before.
config_dict = {
    'num_user': num_user,
    'num_item': num_movie,
    'num_genre': num_genre,
    'latent_dim_mlp': 128,
}
# The MF branch uses the same embedding width as the MLP branch.
config_dict['latent_dim_mf'] = config_dict['latent_dim_mlp']
# First MLP width is twice the embedding width, followed by 64 and 32.
config_dict['layers'] = [2 * config_dict['latent_dim_mf'], 64, 32]
config_dict['num_layers'] = len(config_dict['layers'])
config_dict.update({
    'dropout_rate_mf': 0.6,
    'dropout_rate_mlp': 0.6,
    'batch_size': 32,
    'epoches': 8,
    'eval_steps': 500,
    'use_xavier_uniform': False,
    'learning_rate': 0.001,
})

In [7]:
class Config:
    """Attribute-style wrapper: every key of ``dictionary`` becomes an instance attribute."""

    def __init__(self, dictionary):
        # Copy each entry onto the instance so it can be read as ``config.<key>``.
        for name in dictionary:
            setattr(self, name, dictionary[name])

In [8]:
# Wrap the dict so settings can be read as attributes (config.num_user, config.layers, ...).
config = Config(dictionary=config_dict)

In [9]:
%%writefile model.py

import torch

class NeuMF(torch.nn.Module):
    """NeuMF scorer that fuses a matrix-factorization branch with an MLP branch.

    Each branch owns its own user and item embedding tables:

    * MF branch: element-wise product of the user and item embeddings.
    * MLP branch: the user and item embeddings are concatenated and passed
      through a stack of ``Linear`` + ReLU layers.

    The two branch outputs are concatenated, projected to a single logit and
    passed through a sigmoid.

    Args:
        config: object exposing the attributes ``num_user``, ``num_item``,
            ``latent_dim_mf``, ``latent_dim_mlp``, ``layers``,
            ``dropout_rate_mf`` and ``dropout_rate_mlp``. ``layers[0]`` must
            equal ``2 * latent_dim_mlp`` because the MLP input is the
            concatenation of the user and item MLP embeddings.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        # NOTE(review): the notebook config carries a `use_xavier_uniform` flag, but
        # this class has always applied Xavier init unconditionally; kept as-is.

        # matrix factorization part
        self.embedding_user_mf = self._xavier_embedding(
            config.num_user, config.latent_dim_mf)
        self.embedding_item_mf = self._xavier_embedding(
            config.num_item, config.latent_dim_mf)

        # multilayer perceptron part
        self.embedding_user_mlp = self._xavier_embedding(
            config.num_user, config.latent_dim_mlp)
        self.embedding_item_mlp = self._xavier_embedding(
            config.num_item, config.latent_dim_mlp)

        # One Linear per consecutive pair of widths in config.layers.
        self.fc_layers = torch.nn.ModuleList(
            torch.nn.Linear(in_size, out_size)
            for in_size, out_size in zip(config.layers[:-1], config.layers[1:])
        )

        # The fusion layer sees the last MLP width plus the MF embedding width.
        self.logits = torch.nn.Linear(
            in_features=config.layers[-1] + config.latent_dim_mf, out_features=1)
        self.sigmoid = torch.nn.Sigmoid()

        # Dropout must be a registered sub-module so that train()/eval() toggles it.
        # Building it inside forward() left it permanently in training mode.
        self.dropout_mf = torch.nn.Dropout(config.dropout_rate_mf)
        self.dropout_mlp = torch.nn.Dropout(config.dropout_rate_mlp)

    @staticmethod
    def _xavier_embedding(num_embeddings, embedding_dim):
        """Create an embedding table and re-initialise its weights with Xavier-uniform."""
        embedding = torch.nn.Embedding(
            num_embeddings=num_embeddings, embedding_dim=embedding_dim)
        torch.nn.init.xavier_uniform_(embedding.weight)
        return embedding

    def forward(self, user_indices, item_indices):
        """Score (user, item) pairs.

        Args:
            user_indices: integer tensor of user embedding ids.
            item_indices: integer tensor of item embedding ids; presumably the
                same shape as ``user_indices`` (the notebook passes 1-D batches).

        Returns:
            Tensor whose last dimension has size 1, holding sigmoid scores in (0, 1).
        """
        user_embedding_mlp = self.embedding_user_mlp(user_indices)
        item_embedding_mlp = self.embedding_item_mlp(item_indices)

        user_embedding_mf = self.embedding_user_mf(user_indices)
        item_embedding_mf = self.embedding_item_mf(item_indices)

        # mf part: element-wise product
        mf_vector = torch.mul(user_embedding_mf, item_embedding_mf)
        mf_vector = self.dropout_mf(mf_vector)

        # mlp part: start from the concatenated latent vector
        mlp_vector = torch.cat(
            [user_embedding_mlp, item_embedding_mlp], dim=-1)

        # ReLU is used instead of sigmoid/tanh: it does not saturate and it
        # encourages sparse activations, which suits sparse interaction data.
        for fc_layer in self.fc_layers:
            mlp_vector = torch.relu(fc_layer(mlp_vector))

        mlp_vector = self.dropout_mlp(mlp_vector)

        vector = torch.cat([mlp_vector, mf_vector], dim=-1)
        logits = self.logits(vector)
        return self.sigmoid(logits)

Overwriting model.py


In [10]:
from model import NeuMF

### (3) Load Processed Data

In [11]:
# Processed ratings table; read below for its `genres_embed_ids` column.
df_processed = pd.read_parquet("./data/processed.parquet")

In [12]:
# Largest number of genre ids attached to any single row.
max_genres = max(len(genre_ids) for genre_ids in df_processed['genres_embed_ids'])

In [13]:
%%writefile dataset.py
import torch
from torch.utils.data import Dataset
# from torch.nn.utils.rnn import pad_sequence


class RatingDataset(Dataset):
    """Map-style dataset that serves (user id, movie id, rating) samples from a DataFrame.

    Args:
        data: pandas DataFrame with the columns ``"user_embed_id"``,
            ``"movie_embed_id"`` and ``"rating"``.
        max_genres: stored on the instance for future genre padding; it is not
            used when building samples, because genre features are not emitted yet.
    """

    def __init__(self, data, max_genres=10):
        self.data = data
        self.max_genres = max_genres

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample(s) at positional ``index`` as a dict of tensors.

        ``index`` is forwarded to ``DataFrame.iloc``. The returned dict has the
        keys ``"user_embed_id"`` (long), ``"movie_embed_id"`` (long) and
        ``"rating"`` (float).
        """
        # One positional lookup is enough; every field is read from the same row.
        row = self.data.iloc[index]

        # TODO: emit padded "genres_embed_ids" (truncated to self.max_genres) once
        # genre features are wired into the model.
        return {
            "user_embed_id": torch.tensor(row["user_embed_id"], dtype=torch.long),
            "movie_embed_id": torch.tensor(row["movie_embed_id"], dtype=torch.long),
            "rating": torch.tensor(row["rating"], dtype=torch.float),
        }

Overwriting dataset.py


In [14]:
from dataset import RatingDataset

In [15]:
# NOTE(review): this reads the same parquet file that an earlier cell already
# loaded into `df_processed`; consider reusing that frame instead of re-reading.
df = pd.read_parquet("./data/processed.parquet")

In [16]:
# Pass the genre count measured from the data instead of silently keeping the
# class default (10). The dataset only stores this value for now.
train_dataset = RatingDataset(data=df, max_genres=max_genres)

In [17]:
# Slicing works because __getitem__ forwards the index to DataFrame.iloc; the
# result is a single dict whose tensors hold the first 10 rows.
train_dataset[:10]

{'user_embed_id': tensor([ 1,  5,  7, 15, 17, 18, 19, 21, 27, 31]),
 'movie_embed_id': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'rating': tensor([0.7778, 0.7778, 0.8889, 0.4444, 0.8889, 0.6667, 0.7778, 0.6667, 0.5556,
         1.0000])}

In [18]:
from torch.utils.data import DataLoader

In [19]:
# Take the batch size from the shared config (currently 32) instead of repeating
# the literal, so that the loader and the rest of the run cannot drift apart.
loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)

In [20]:
# Grab a single batch for inspection. Unlike a `for ... break` loop, this raises
# StopIteration on an empty loader instead of leaving `sample` undefined or stale.
sample = next(iter(loader))

In [21]:
# Inspect the batch: a dict of tensors keyed by field name, one entry per sample.
sample

{'user_embed_id': tensor([448, 119, 249, 534, 526,  88, 476, 279, 376,  31, 307, 144, 603, 346,
         564, 387,   7, 313, 220, 312, 346, 561, 203, 337, 352, 182, 226, 599,
         232, 410, 380, 419]),
 'movie_embed_id': tensor([1125, 4707, 2874,  241, 2538,  754, 2141,   71, 1322,  505,  580,  204,
         9360, 1609, 1993,   86,  447,  134,  198,  167, 6996,    5, 1343,  498,
          273, 2188, 1270,  935, 2405, 2764, 6122, 3625]),
 'rating': tensor([0.7778, 0.7778, 0.5556, 0.7778, 1.0000, 1.0000, 0.7778, 0.7778, 0.7778,
         0.7778, 0.3333, 0.7778, 0.5556, 0.5556, 0.7778, 0.3333, 0.3333, 0.5556,
         1.0000, 0.5556, 0.5556, 0.8889, 1.0000, 1.0000, 1.0000, 0.7778, 0.7778,
         0.5556, 0.6667, 1.0000, 0.3333, 1.0000])}

In [22]:
# Build the model from the attribute-style config; weights are freshly initialised.
recommender = NeuMF(config=config)

In [23]:
# Same batch shown again, for reference next to the forward pass below.
sample

{'user_embed_id': tensor([448, 119, 249, 534, 526,  88, 476, 279, 376,  31, 307, 144, 603, 346,
         564, 387,   7, 313, 220, 312, 346, 561, 203, 337, 352, 182, 226, 599,
         232, 410, 380, 419]),
 'movie_embed_id': tensor([1125, 4707, 2874,  241, 2538,  754, 2141,   71, 1322,  505,  580,  204,
         9360, 1609, 1993,   86,  447,  134,  198,  167, 6996,    5, 1343,  498,
          273, 2188, 1270,  935, 2405, 2764, 6122, 3625]),
 'rating': tensor([0.7778, 0.7778, 0.5556, 0.7778, 1.0000, 1.0000, 0.7778, 0.7778, 0.7778,
         0.7778, 0.3333, 0.7778, 0.5556, 0.5556, 0.7778, 0.3333, 0.3333, 0.5556,
         1.0000, 0.5556, 0.5556, 0.8889, 1.0000, 1.0000, 1.0000, 0.7778, 0.7778,
         0.5556, 0.6667, 1.0000, 0.3333, 1.0000])}

In [24]:
# Smoke test: one forward pass on the sampled batch, returning one score per pair.
# The freshly built module is still in training mode here, so dropout is active
# and the exact scores vary between runs.
recommender(user_indices=sample['user_embed_id'],
            item_indices=sample['movie_embed_id'])

tensor([[0.5117],
        [0.5055],
        [0.5059],
        [0.5039],
        [0.5158],
        [0.4991],
        [0.5021],
        [0.5065],
        [0.5117],
        [0.5054],
        [0.5023],
        [0.5033],
        [0.5062],
        [0.5020],
        [0.5057],
        [0.4935],
        [0.5007],
        [0.4983],
        [0.5031],
        [0.5048],
        [0.5032],
        [0.5098],
        [0.5105],
        [0.5067],
        [0.5006],
        [0.5002],
        [0.5077],
        [0.5015],
        [0.5110],
        [0.4970],
        [0.5012],
        [0.5022]], grad_fn=<SigmoidBackward0>)