In [None]:
import torch

from torch import nn
from torch.utils.data import Dataset, DataLoader

# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# are repeatable after Restart & Run All.
SEED = 4
torch.manual_seed(SEED)

# Set up

In [None]:
import numpy as np

class MovieDataset(Dataset):
    """Dataset yielding ``(stats, description, rating)`` triples for one movie.

    Args:
        movie_df: DataFrame with a ``'description'`` column, an
            ``'IMDB_Rating'`` column, and any number of further columns that
            are treated as numeric features.

    Attributes are kept as in the original notebook:
        movie_stats: float32 tensor of shape ``(n_movies, n_feature_columns)``.
        movie_des: the ``'description'`` Series.
        movie_ratings: the ``'IMDB_Rating'`` Series cast to float32.
    """

    def __init__(self, movie_df):
        feature_df = movie_df.drop(['description', 'IMDB_Rating'], axis=1)
        # Cast explicitly to float32: letting pandas/NumPy infer the dtype
        # yields float64 for float columns (which mismatches float32 model
        # weights once concatenated with the text embeddings) and an object
        # array for mixed bool/float columns (which torch.tensor rejects).
        self.movie_stats = torch.tensor(feature_df.to_numpy(dtype=np.float32))
        self.movie_des = movie_df['description']
        self.movie_ratings = movie_df['IMDB_Rating'].astype(np.float32)

    def __len__(self):
        """Number of movies (rows of the feature matrix)."""
        return self.movie_stats.shape[0]

    def __getitem__(self, idx: int):
        """Return ``(stats_tensor, description, rating)`` for positional row ``idx``."""
        # .iloc keeps the lookup positional even when the DataFrame index is
        # not 0..n-1 (the notebook reads its CSVs with index_col=0).
        return self.movie_stats[idx], self.movie_des.iloc[idx], self.movie_ratings.iloc[idx]

In [None]:
class MLPClassifier(nn.Module):
    """MLP that regresses a single rating from a concatenated feature vector.

    Despite the name, the network has one output unit and is trained with a
    regression loss in this notebook.

    Args:
        embedding_dim: Width of the text-embedding part of the input.
        n_stats: Number of numeric feature columns in the input.
        hidden_dim: Width of every hidden layer.
        n_hidden_layers: Number of ``Linear -> LeakyReLU -> BatchNorm1d`` stages.

    The defaults rebuild the original hard-coded network exactly
    (``768 + 25`` inputs, four hidden stages of 512 units), with the same
    layer order and therefore the same ``state_dict`` keys.
    # NOTE(review): 768 / 25 are presumably the DistilBERT hidden size and the
    # number of stats columns in the CSVs — confirm against the data.
    """

    def __init__(
        self,
        embedding_dim: int = 768,
        n_stats: int = 25,
        hidden_dim: int = 512,
        n_hidden_layers: int = 4,
    ):
        super().__init__()

        self.relu6 = nn.ReLU6()

        layers = []
        in_features = embedding_dim + n_stats
        for _ in range(n_hidden_layers):
            layers += [
                nn.Linear(in_features, hidden_dim), nn.LeakyReLU(.4),
                nn.BatchNorm1d(hidden_dim),
            ]
            in_features = hidden_dim
        layers.append(nn.Linear(in_features, 1))
        self.mlp = nn.Sequential(*layers)

    def forward(self, movie):
        """Map a ``(batch, embedding_dim + n_stats)`` tensor to ``(batch, 1)`` ratings."""
        logits = self.mlp(movie)
        # ReLU6 clamps to [0, 6]; scaling by 5/3 stretches that to [0, 10].
        return self.relu6(logits) * 5 / 3

In [None]:
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

# One checkpoint name for both objects so tokenizer and encoder cannot drift apart.
DISBERT_CHECKPOINT = 'distilbert-base-cased'

disbert_tokenizer = AutoTokenizer.from_pretrained(DISBERT_CHECKPOINT)
disbert = AutoModel.from_pretrained(DISBERT_CHECKPOINT)

# DistilBERT is used as a frozen feature extractor. `requires_grad_` is a
# method: assigning `False` to it only shadows the method with an attribute
# and leaves every parameter trainable, so it has to be *called*.
disbert.requires_grad_(False)
# Make inference mode explicit (dropout off) rather than relying on the loader default.
disbert.eval()


def tokenize(description):
    """Tokenize one description or a batch of descriptions for DistilBERT.

    Args:
        description: A string or a list of strings.

    Returns:
        The tokenizer output holding PyTorch tensors (``input_ids`` and
        ``attention_mask`` are the entries the loops below read), padded to
        the longest sequence in the batch and truncated to the tokenizer's
        maximum length.
    """
    return disbert_tokenizer(
        description, return_tensors='pt',
        padding=True, truncation=True
    )

def train_loop(
    model, loss_fn,
    optimizer, dataloader,
    use_gpu: bool = False
):
    """Train ``model`` for one pass over ``dataloader``.

    Each batch is ``(stats, descriptions, ratings)``. Descriptions are encoded
    with the module-level ``tokenize`` / ``disbert`` pair, the hidden state at
    token position 0 (the [CLS] token for BERT-style tokenizers) is
    concatenated with ``stats``, and the result is fed to ``model``.

    Args:
        model: Network mapping ``(batch, n_features)`` to ``(batch, 1)``.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        optimizer: Optimizer holding ``model``'s parameters.
        dataloader: Iterable of ``(stats, descriptions, ratings)`` batches.
        use_gpu: Run the model on CUDA when True, on CPU otherwise.

    Returns:
        List with one float loss per batch, in iteration order.
    """
    device = torch.device('cuda' if use_gpu else 'cpu')
    # The inputs are sent to `device` below, so the model has to live there
    # too; this is a no-op when the caller already placed it.
    model.to(device)
    model.train()

    losses = []
    pbar = tqdm(dataloader, total=len(dataloader))
    for stats, des, ratings in pbar:
        # The encoder is a fixed feature extractor (the optimizer only holds
        # `model`'s parameters), so skip building its autograd graph:
        # otherwise every backward() also backpropagates through DistilBERT.
        with torch.no_grad():
            tokens = tokenize(des)
            encoded = disbert(tokens['input_ids'], tokens['attention_mask'])
            des_embeddings = encoded['last_hidden_state'][:, 0, :]

        # .float() keeps the concatenation in float32 even if the dataset
        # delivers float64/integer stats, matching the model's weights.
        movie_input = torch.concat([stats.float(), des_embeddings], dim=1).to(device)
        ratings = ratings.float().to(device)

        rating_preds = model(movie_input)
        # squeeze(-1), not squeeze(): a batch of one must stay 1-D so it
        # still lines up with `ratings`.
        loss = loss_fn(rating_preds.squeeze(-1), ratings)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        pbar.set_postfix_str(f'Loss: {losses[-1]}')
    return losses

@torch.no_grad()
def eval_loop(
    model, loss_fn,
    test_loader,
    use_gpu: bool = False
):
    """Evaluate ``model`` on ``test_loader`` without tracking gradients.

    Batches are ``(stats, descriptions, ratings)`` and are turned into model
    inputs the same way as during training: DistilBERT hidden state at token
    position 0, concatenated with ``stats``.

    Args:
        model: Network mapping ``(batch, n_features)`` to ``(batch, 1)``.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        test_loader: Iterable of ``(stats, descriptions, ratings)`` batches.
        use_gpu: Run the model on CUDA when True, on CPU otherwise.

    Returns:
        List with one float loss per batch, in iteration order.
    """
    device = torch.device('cuda' if use_gpu else 'cpu')
    # Inputs are sent to `device` below, so make sure the model is there as
    # well (no-op when it already is).
    model.to(device)
    model.eval()

    losses = []
    for stats, des, ratings in test_loader:
        tokens = tokenize(des)
        encoded = disbert(tokens['input_ids'], tokens['attention_mask'])
        des_embeddings = encoded['last_hidden_state'][:, 0, :]

        # .float() keeps the concatenation in float32 regardless of the
        # dtype the dataset delivers for the stats.
        movie_input = torch.concat([stats.float(), des_embeddings], dim=1).to(device)
        ratings = ratings.float().to(device)

        rating_preds = model(movie_input)
        # squeeze(-1) keeps a single-sample batch 1-D so it matches `ratings`.
        loss = loss_fn(rating_preds.squeeze(-1), ratings)

        losses.append(loss.item())

    return losses

# Load data and model

In [None]:
import pandas as pd

# Train and test splits, in that order; the first CSV column is the index.
csv_paths = (
    '../dataset/train_data/train_data.csv',
    '../dataset/train_data/test_data.csv',
)
dfs = [pd.read_csv(csv_path, index_col=0) for csv_path in csv_paths]

# Wrap each frame in a MovieDataset, keeping the train/test order of `dfs`.
train_data, test_data = map(MovieDataset, dfs)

In [None]:
batch_size = 32

# Shuffle only the training data.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
# Evaluation must be deterministic: the epoch loss is a mean of per-batch
# means, so with a partial last batch a shuffled loader would report a
# different value on every pass even for identical weights (and it would also
# consume global RNG state between training epochs).
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [None]:
mlp_model = MLPClassifier()
# The training cell passes torch.cuda.is_available() as `use_gpu`, which makes
# the loops send their inputs to CUDA. The model has to be there as well, and
# it is moved *before* the optimizer is built so the optimizer tracks the
# parameters that are actually trained.
if torch.cuda.is_available():
    mlp_model = mlp_model.cuda()

loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(mlp_model.parameters(), lr=1e-4)

# Training

In [None]:
# Run configuration: number of passes over the training set and whether a
# CUDA device can be used.
epochs = 5
gpu_avail = torch.cuda.is_available()

# Loss history; each key collects one list of per-batch losses per epoch.
global_loss = {split: [] for split in ('train', 'eval')}

In [None]:
for epoch in range(epochs):
    print(f"Epoch {epoch}")

    # One optimisation pass over the training split.
    losses = train_loop(
        model=mlp_model, loss_fn=loss_fn,
        optimizer=optimizer, dataloader=train_loader,
        use_gpu=gpu_avail,
    )
    train_mean = sum(losses) / len(losses)
    print(f"  Train loss: {train_mean}")

    # Score the freshly updated model on the held-out split.
    eval_losses = eval_loop(
        model=mlp_model, loss_fn=loss_fn,
        test_loader=test_loader,
        use_gpu=gpu_avail,
    )
    eval_mean = sum(eval_losses) / len(eval_losses)
    print(f"  Eval loss: {eval_mean}")

    # Keep the raw per-batch losses of this epoch for later inspection.
    global_loss['train'].append(losses)
    global_loss['eval'].append(eval_losses)