In [470]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch import nn, optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [471]:
DATA_DIR = '../data/internal'

In [472]:
class Config:
    device = 'cpu'
    epochs = 20
    seed = 0
    batch_size = 128
    embedding_dim = 32
    hidden_size = 32
    lr = 1e-3


config = Config()

In [473]:
def set_seed(seed_value=0):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True


set_seed(config.seed)

In [474]:
train_data = pd.read_csv(os.path.join(DATA_DIR, 'data_train.csv'))
val_data = pd.read_csv(os.path.join(DATA_DIR, 'data_val.csv'))
user_data = pd.read_csv(os.path.join(DATA_DIR, 'user.csv'), index_col=0)
item_data = pd.read_csv(os.path.join(DATA_DIR, 'item.csv'), index_col=0)

In [475]:
n_user_features = user_data.shape[1]
n_item_features = item_data.shape[1]
n_user_features, n_item_features

(22, 19)

In [490]:
class MovieDataset(Dataset):
    def __init__(self, ratings, users, items):
        self.users = users
        self.items = items
        self.ratings = ratings

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, ids):
        ratings = self.ratings.iloc[ids]

        user_ids = ratings.user_id.astype('int')
        item_ids = ratings.item_id.astype('int')

        users = self.users.iloc[user_ids]
        items = self.items.iloc[item_ids]

        return {
            "ratings": torch.tensor(ratings.rating, dtype=torch.long),
            "user_ids": torch.tensor(user_ids, dtype=torch.long),
            "item_ids": torch.tensor(item_ids, dtype=torch.long),
            "users_info": torch.tensor(users.to_numpy(), dtype=torch.float),
            "items_info": torch.tensor(items.to_numpy(), dtype=torch.float),
        }

In [491]:
train_dataset = MovieDataset(train_data, user_data, item_data)
val_dataset = MovieDataset(val_data, user_data, item_data)

In [492]:
train_dataloader = DataLoader(
    train_dataset, batch_size=config.batch_size, shuffle=True)
val_dataloader = DataLoader(
    val_dataset, batch_size=config.batch_size, shuffle=False)

In [493]:
for data in train_dataloader:
    print(data['ratings'].shape)
    print(data['user_ids'].shape)
    print(data['item_ids'].shape)
    print(data['users_info'].shape)
    print(data['items_info'].shape)
    break

torch.Size([128])
torch.Size([128])
torch.Size([128])
torch.Size([128, 22])
torch.Size([128, 19])


In [494]:
class RecSysModel(nn.Module):
    def __init__(self, n_users, n_items, embedding_dim, hidden_size):
        super().__init__()

        self.user_embed = nn.Embedding(n_users, embedding_dim=embedding_dim)
        self.item_embed = nn.Embedding(n_items, embedding_dim=embedding_dim)

        self.out = nn.Sequential(
            nn.Linear(embedding_dim * 2 + n_item_features + n_user_features, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        )

    def forward(self, user_ids, item_ids, users_info, items_info):
        user_embeds = self.user_embed(user_ids)
        item_embeds = self.item_embed(item_ids)

        x = torch.cat([user_embeds, item_embeds, users_info, items_info], dim=1)

        x = self.out(x)

        return x

In [495]:
model = RecSysModel(n_items=len(item_data), n_users=len(
    user_data), embedding_dim=config.embedding_dim, hidden_size=config.hidden_size).to(config.device)

optimizer = optim.Adam(model.parameters())
loss_fn = nn.MSELoss()

In [496]:
def test_model(model: nn.Module, criterion, test_dataloader: DataLoader, device='cuda'):
    """
    Function that evaluates model on specified dataloader
    by specified loss function.

    Parameter
    ---------
    model : nn.Module
      Model to train.
    criterion
      The loss function from pytorch
    test_dataloader: DataLoader
      The dataset for testing model

    Returns
    -------
    float: loss of model on given dataset
    """

    model.eval()
    model.to(device)

    # Test loss value
    test_loss = 0.0

    with torch.no_grad():
        for data in test_dataloader:
            
            user_ids = data['user_ids'].to(device)
            item_ids = data['item_ids'].to(device)
            users_info = data['users_info'].to(device)
            items_info = data['items_info'].to(device)
            ratings = data['ratings'].to(device)
            
            ratings = ratings.view(-1, 1).to(torch.float)            
            
            # Forward pass
            outputs = model(user_ids=user_ids, item_ids=item_ids, users_info=users_info, items_info=items_info)
            test_loss += criterion(outputs, ratings)

    # Computation of test loss
    test_loss /= len(test_dataloader)

    return test_loss.item()

In [497]:
def train_model(model: nn.Module, epochs: int, criterion, train_dataloader, validation_dataloader, load_ckpt: bool = False, load_ckpt_path: str or None = None, save_ckpt_path: str = 'best.pt', device: torch.device = 'cuda'):
    """
    Function that trains model using number of epochs, loss function, optimizer.
    Can use validation or test data set for evaluation.
    Calculates f1 score.

    Parameter
    ---------
    model : nn.Module
        Model to train.
    epochs: int
        Number of train epochs
    criterion
        The loss function from pytorch
    train_dataloader
        Dataloader of the train dataset
    train_dataloader
        Dataloader of the validation dataset
    load_ckpt: bool
        load model from checkpoint if true or train from scratch if false
    load_ckpt_path: str
        Path of already existing checkpoint to load model from
    save_ckpt_path: str
        Path of where to store the best model checkpoint
    device: torch.device
        Pytroch device
    """

    if load_ckpt_path is None:
        load_ckpt_path = save_ckpt_path

    model.train()
    model.to(device)

    # best score for checkpointing
    best = 1000000000.0
    train_losses = []
    val_losses = []

    first_epoch = 1

    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

    isCkptExists = os.path.isfile(load_ckpt_path)

    if (load_ckpt and not isCkptExists):
        print('Checkpoint file does not exist. Training model from scratch!')

    if (load_ckpt and isCkptExists):
        checkpoint = torch.load(load_ckpt_path)
        best = checkpoint['best_score']
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        train_losses = checkpoint['train_losses']
        first_epoch = checkpoint['epoch'] + 1

    # Train the model
    for epoch in range(first_epoch, epochs + first_epoch):
        model.train()

        train_loss = 0.0

        bar = tqdm(train_dataloader)
        iterations = 0

        for data in bar:

            user_ids = data['user_ids'].to(device)
            item_ids = data['item_ids'].to(device)
            users_info = data['users_info'].to(device)
            items_info = data['items_info'].to(device)
            ratings = data['ratings'].to(device)
            
            ratings = ratings.view(-1, 1).to(torch.float)            
            
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(user_ids=user_ids, item_ids=item_ids, users_info=users_info, items_info=items_info)
            loss = criterion(outputs, ratings)

            train_loss += loss.item()

            # Backward pass
            loss.backward()
            optimizer.step()

            iterations += 1
            bar.set_postfix(
                ({"loss": f"{train_loss/(iterations*train_dataloader.batch_size)}"}))

        # Computing loss
        train_loss /= len(train_dataloader.dataset)

        # Printing information in the end of train loop
        val_loss = test_model(model, criterion, validation_dataloader)

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        if val_loss < best:
            best = val_loss
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_losses': train_losses,
                'val_losses': val_losses,
                'best_score': best,
            }, save_ckpt_path)

        print(f"Epoch {epoch}: \ntrain:\t\t(loss: {train_loss:.4f}) \nvalidation:\t(loss: {val_loss:.4f})\n")

In [498]:
train_model(model=model, epochs=config.epochs, criterion=loss_fn, train_dataloader=train_dataloader, validation_dataloader=val_dataloader)

100%|██████████| 522/522 [00:17<00:00, 30.62it/s, loss=0.021048668559669192]


Epoch 1: 
train:		(loss: 0.0211) 
validation:	(loss: 1.1788)



100%|██████████| 522/522 [00:16<00:00, 31.87it/s, loss=0.008463003031260038]


Epoch 2: 
train:		(loss: 0.0085) 
validation:	(loss: 1.0417)



100%|██████████| 522/522 [00:16<00:00, 31.81it/s, loss=0.007578821201591085] 


Epoch 3: 
train:		(loss: 0.0076) 
validation:	(loss: 0.9764)



100%|██████████| 522/522 [00:16<00:00, 31.49it/s, loss=0.007097773362750409] 


Epoch 4: 
train:		(loss: 0.0071) 
validation:	(loss: 0.9420)



100%|██████████| 522/522 [00:16<00:00, 31.39it/s, loss=0.006810437480230384] 


Epoch 5: 
train:		(loss: 0.0068) 
validation:	(loss: 0.9189)



100%|██████████| 522/522 [00:16<00:00, 31.53it/s, loss=0.006616681487695582] 


Epoch 6: 
train:		(loss: 0.0066) 
validation:	(loss: 0.9137)



100%|██████████| 522/522 [00:16<00:00, 31.91it/s, loss=0.006482206551729206] 


Epoch 7: 
train:		(loss: 0.0065) 
validation:	(loss: 0.9057)



100%|██████████| 522/522 [00:16<00:00, 31.19it/s, loss=0.00637349854196668]  


Epoch 8: 
train:		(loss: 0.0064) 
validation:	(loss: 0.8961)



100%|██████████| 522/522 [00:16<00:00, 30.76it/s, loss=0.006280743651238352] 


Epoch 9: 
train:		(loss: 0.0063) 
validation:	(loss: 0.9002)



100%|██████████| 522/522 [00:16<00:00, 31.76it/s, loss=0.006202981312191863] 


Epoch 10: 
train:		(loss: 0.0062) 
validation:	(loss: 0.8966)



100%|██████████| 522/522 [00:16<00:00, 30.98it/s, loss=0.0061419250218775765]


Epoch 11: 
train:		(loss: 0.0061) 
validation:	(loss: 0.8981)



100%|██████████| 522/522 [00:16<00:00, 31.52it/s, loss=0.006071755331601962] 


Epoch 12: 
train:		(loss: 0.0061) 
validation:	(loss: 0.8971)



100%|██████████| 522/522 [00:16<00:00, 31.15it/s, loss=0.005995386254695383] 


Epoch 13: 
train:		(loss: 0.0060) 
validation:	(loss: 0.9032)



100%|██████████| 522/522 [00:16<00:00, 31.47it/s, loss=0.005937170921075532] 


Epoch 14: 
train:		(loss: 0.0059) 
validation:	(loss: 0.8983)



100%|██████████| 522/522 [00:16<00:00, 31.18it/s, loss=0.005875687514096356] 


Epoch 15: 
train:		(loss: 0.0059) 
validation:	(loss: 0.9012)



100%|██████████| 522/522 [00:16<00:00, 31.84it/s, loss=0.005817360370473444] 


Epoch 16: 
train:		(loss: 0.0058) 
validation:	(loss: 0.9030)



100%|██████████| 522/522 [00:16<00:00, 31.37it/s, loss=0.005755030774418622] 


Epoch 17: 
train:		(loss: 0.0058) 
validation:	(loss: 0.9032)



100%|██████████| 522/522 [00:23<00:00, 22.51it/s, loss=0.005699188189787997] 


Epoch 18: 
train:		(loss: 0.0057) 
validation:	(loss: 0.9060)



100%|██████████| 522/522 [00:33<00:00, 15.79it/s, loss=0.00563815895198591]  


Epoch 19: 
train:		(loss: 0.0056) 
validation:	(loss: 0.9056)



100%|██████████| 522/522 [00:33<00:00, 15.49it/s, loss=0.005584300182567075] 


Epoch 20: 
train:		(loss: 0.0056) 
validation:	(loss: 0.9135)



In [499]:
checkpoint = torch.load('best.pt')

In [500]:
model.to(config.device)
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [501]:
def dup_rows(a, indx, num_dups=1):
    return np.insert(a,[indx+1]*num_dups,a[indx],axis=0)

In [502]:
def get_top_k_movies(model, user_id, k=10):
    model.eval()

    watched_movies = val_data[val_data.user_id == user_id].item_id

    unwatched_movies = item_data[~item_data.index.isin(watched_movies)]

    unwatched_movie_ids = unwatched_movies.index.to_list()

    users_info_one_row = user_data[user_data.index == user_id]
    users_info = dup_rows(users_info_one_row.to_numpy(),
                          0, len(unwatched_movie_ids)-1)

    user_ids = [user_id] * len(unwatched_movie_ids)

    items_info = np.zeros((len(unwatched_movie_ids), n_item_features))
    for i, movie_id in enumerate(unwatched_movie_ids):
        items_info[i] = item_data[item_data.index == movie_id].to_numpy()

    user_ids = torch.tensor(user_ids).to(dtype=torch.long)
    item_ids = torch.tensor(unwatched_movie_ids).to(dtype=torch.long)
    users_info = torch.tensor(users_info).to(dtype=torch.float)
    items_info = torch.tensor(items_info).to(dtype=torch.float)

    with torch.no_grad():
        output = model(user_ids=user_ids, item_ids=item_ids,
                       users_info=users_info, items_info=items_info).squeeze()

    unwatched_movies_with_rating = list(
        zip(unwatched_movie_ids, output.numpy()))

    top_k = sorted(unwatched_movies_with_rating,
                   key=lambda x: x[1], reverse=True)[:k]

    return top_k

In [503]:
get_top_k_movies(model, 0, 10)

[(349, 4.719418),
 (98, 4.6398897),
 (56, 4.637362),
 (10, 4.592761),
 (38, 4.547897),
 (152, 4.503314),
 (428, 4.4787703),
 (401, 4.469755),
 (166, 4.4374886),
 (669, 4.4250584)]

In [504]:
get_top_k_movies(model, 1, 10)

[(165, 4.8964577),
 (56, 4.8113103),
 (161, 4.774931),
 (10, 4.734867),
 (428, 4.7292366),
 (224, 4.696537),
 (109, 4.6899176),
 (76, 4.656498),
 (241, 4.65567),
 (430, 4.644674)]

In [505]:
get_top_k_movies(model, 2, 10)

[(486, 4.1330075),
 (521, 4.0649877),
 (109, 3.9641457),
 (164, 3.928926),
 (424, 3.9173942),
 (56, 3.9163256),
 (426, 3.908978),
 (20, 3.894853),
 (143, 3.8934903),
 (698, 3.8702784)]

In [506]:
get_top_k_movies(model, 3, 10)

[(349, 4.4354954),
 (56, 4.2890596),
 (144, 4.2192116),
 (669, 4.200879),
 (12, 4.1977253),
 (283, 4.1760993),
 (109, 4.173289),
 (428, 4.16583),
 (73, 4.148289),
 (143, 4.1428227)]

In [507]:
get_top_k_movies(model, 4, 10)

[(283, 4.397307),
 (432, 4.2847853),
 (279, 4.28288),
 (146, 4.276494),
 (397, 4.256823),
 (669, 4.248531),
 (487, 4.244829),
 (486, 4.227861),
 (42, 4.210864),
 (426, 4.172885)]

In [508]:
get_top_k_movies(model, 5, 10)

[(349, 5.272317),
 (56, 5.25202),
 (556, 5.1639433),
 (130, 5.138515),
 (12, 5.1369667),
 (442, 5.1151314),
 (163, 5.0343633),
 (439, 5.030911),
 (386, 5.0135493),
 (73, 4.985412)]