In [None]:
from pathlib import Path

import torch

from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

from classifiers import MLPClassifierWithPhoBERT

## Utility Functions

In [None]:
import math
# Tokenizer loaded from the 'vinai/phobert-base-v2' checkpoint.
phobert_tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base-v2')


def apply_tokenization(minibatch):
    """
    Tokenize one minibatch with `phobert_tokenizer`.

    The batch is padded, truncated to at most 256 tokens and returned as
    PyTorch tensors. The training and evaluation loops in this notebook read
    the `'input_ids'` and `'attention_mask'` entries of the result.

    :param minibatch: The raw inputs handed to the tokenizer
        (presumably a batch of review strings -- confirm against the DataLoader).
    :return: Whatever `phobert_tokenizer` returns for the batch.
    """
    return phobert_tokenizer(
        minibatch,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=256,
    )

def train_model(
    model: nn.Module,
    train_loader: DataLoader,
    loss_fn: nn.Module,
    optimizer: torch.optim.Optimizer,
    track_loss: bool = False,
    use_gpu: bool = False
) -> float:
    """
    Run one training pass over `train_loader`, updating `model` with `optimizer`.

    :param nn.Module model: The model to train. It is called as
        `model(input_ids, attention_mask)`.
    :param torch.utils.data.DataLoader train_loader: A DataLoader yielding
        `(X, y)` batches; `X` is tokenized with `apply_tokenization`.
    :param nn.Module loss_fn: The loss function, called as `loss_fn(prediction, y)`.
    :param torch.optim.Optimizer optimizer: The optimizer used for the update step.
    :param bool track_loss: Whether to accumulate the loss of every batch.
        This is `False` by default.
    :param bool use_gpu: Whether to move the tokenized inputs and the labels to
        CUDA before the forward pass. This is `False` by default.

    :return: The average loss per batch if `track_loss=True`, else `0.0`.
        `0.0` is also returned when the loader yields no batches.
    :rtype: float
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for X, y in train_loader:
        tokenized_X = apply_tokenization(X)

        X_input_ids = tokenized_X['input_ids']
        X_att_mask = tokenized_X['attention_mask']

        if use_gpu:
            X_input_ids = X_input_ids.cuda()
            X_att_mask = X_att_mask.cuda()
            y = y.cuda()
        pred_value = model(X_input_ids, X_att_mask)
        loss = loss_fn(pred_value, y)

        # Gradients accumulate across backward passes, so clear them first,
        # then compute the new gradients and apply the update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # `.item()` extracts a plain Python float. Summing the loss tensors
        # themselves would keep every batch's autograd history alive.
        if track_loss:
            total_loss += loss.item()
        num_batches += 1

    # Guard against an empty loader instead of dividing by zero.
    return total_loss / num_batches if num_batches else 0.0

@torch.no_grad()
def test_model(
    model: nn.Module,
    test_loader: DataLoader,
    loss_fn: nn.Module,
    return_true_preds: bool,
    use_gpu: bool = False
) -> tuple[float, torch.Tensor, torch.Tensor]:
    """
    Evaluate `model` with `loss_fn` and return the average loss along with
    the per-class counts of correct predictions and of true labels.

    :param nn.Module model: The model to evaluate. It is called as
        `model(input_ids, attention_mask)`.
    :param torch.utils.data.DataLoader test_loader: A DataLoader yielding
        `(X, y)` batches; `X` is tokenized with `apply_tokenization`.
    :param nn.Module loss_fn: The loss function, called as `loss_fn(prediction, y)`.
    :param bool return_true_preds: Whether to collect statistics on correctly
        classified labels. This is only meaningful when `model` is a classifier.
    :param bool use_gpu: Whether to move the tokenized inputs and the labels to
        CUDA before the forward pass. This is `False` by default.

    :return: The average loss per batch (`0.0` if the loader yields no batches).
        If `return_true_preds=True`, the number of correctly classified labels and
        the total number of labels for each of the three classes are returned as
        `torch.Tensor`. If not, zero tensors are returned instead.
    :rtype: tuple[float, torch.Tensor, torch.Tensor]
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0

    # The counters are fixed at three classes; labels are expected to be 0, 1 or 2.
    num_classes = 3
    correct_labels = torch.zeros(num_classes, dtype=torch.long)
    total_labels = torch.zeros(num_classes, dtype=torch.long)

    for X, y in test_loader:
        tokenized_X = apply_tokenization(X)

        X_input_ids = tokenized_X['input_ids']
        X_att_mask = tokenized_X['attention_mask']

        if use_gpu:
            X_input_ids = X_input_ids.cuda()
            X_att_mask = X_att_mask.cuda()
            y = y.cuda()

        pred = model(X_input_ids, X_att_mask)
        # Accumulate a plain float so the documented return type holds.
        total_loss += loss_fn(pred, y).item()
        num_batches += 1

        if return_true_preds:
            pred_labels = pred.argmax(dim=1)
            # `minlength` pads the histograms to one slot per class, so they
            # can be added to the running counters in a single step.
            correct_labels += (
                pred_labels[pred_labels == y].bincount(minlength=num_classes).cpu()
            )
            total_labels += y.bincount(minlength=num_classes).cpu()

    # Guard against an empty loader instead of dividing by zero.
    average_loss = total_loss / num_batches if num_batches else 0.0
    return average_loss, correct_labels, total_labels

def run_epochs(
    epochs: int,
    model: nn.Module,
    train_loader: DataLoader,
    test_loader: DataLoader,
    loss_fn: nn.Module,
    optimizer: nn.Module, *,
    update_rate: int | None = None
):
    num_dig = int(math.log10(epochs)) + 1
    if update_rate is None:
        update_rate = 1 if epochs <= 20 else 10
    losses = {'train': [], 'test': []}

    for epoch in range(epochs):
        if not epoch % update_rate:
            print(f"\033[102;30;1mEpoch {epoch + 1:>{num_dig}}/{epochs}\033[0m", end=' || ')

        training_loss = train_model(
            model, train_loader,
            loss_fn, optimizer, track_loss=True,
            use_gpu=torch.cuda.is_available()
        )
        if not epoch % update_rate:
            print(f"\033[94;1mTraining loss: {training_loss:<10.6f}\033[0m", end=' | ')

        loss, true_labels, total_labels = test_model(
            model, test_loader, loss_fn, True,
            use_gpu=torch.cuda.is_available()
        )

        if not epoch % update_rate:
            acc_by_class = (true_labels / total_labels) * 100
            avg_acc = (true_labels.sum() / total_labels.sum()) * 100
            print(f"""\033[94;1mEval Loss: {loss:<10.6f}\033[0m
  \033[1mAverage Accuracy: {avg_acc:.4f}%\033[0m
  \033[32;10mPos: {acc_by_class[0]:<7.4f}%\033[0m | Neu: {acc_by_class[1]:<7.4f}% | \033[31;10mNeg: {acc_by_class[2]:<7.4f}%\033[0m
""")
        losses['train'].append(training_loss)
        losses['test'].append(loss)
    return losses

# Load training data

In [2]:
from pyspark.sql import SparkSession
# Get (or create) a Spark session with a local master and the UI on port 4040.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config('spark.ui.port', '4040')
    .getOrCreate()
)
# Display the session as the cell output.
spark

## Building torch's Dataset for training

In [3]:
class _ReviewIndexError(IndexError, ValueError):
    """
    Raised by `ReviewDataset` for an out-of-range index.

    It is an `IndexError` so that plain iteration over the dataset
    (`for item in dataset`, `list(dataset)`) stops at the end, and also a
    `ValueError` so code catching the previously raised type keeps working.
    """


class ReviewDataset(Dataset):
    """
    A map-style dataset backed by a Spark DataFrame of `(review, sentiment)` rows.

    Each item is a `(review, class_index)` pair, where the class index is
    looked up in `sentiment_as_index`.
    """

    # Maps the sentiment label stored in the data to its class index.
    sentiment_as_index = {
        'positive': 0,
        'neutral': 1,
        'negative': 2
    }

    def __init__(self, data_as_spark_df):
        """
        :param data_as_spark_df: A Spark DataFrame whose rows unpack into
            `(review, sentiment)`.
        """
        # Pair every row with its position. The result is cached because
        # every `__getitem__` call runs a filter over it; without caching the
        # whole lineage would be recomputed on each lookup.
        self.data_as_rdd = data_as_spark_df.rdd.zipWithIndex().cache()
        self.len = data_as_spark_df.count()

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return self.len

    def __getitem__(self, index: int):
        """
        Return the `(review, class_index)` pair stored at position `index`.

        :raises IndexError: If `index` is negative or not smaller than the
            dataset length. The raised error is also a `ValueError`.
        :raises KeyError: If the row's sentiment is not in `sentiment_as_index`.
        """
        if index < 0 or index > self.len - 1:
            raise _ReviewIndexError('index exceeded length of dataframe')

        # Elements of the RDD are (row, position) pairs; pick the matching row.
        nth_row = (self.data_as_rdd
                   .filter(lambda data: data[1] == index)
                   .take(1)[0][0]
        )
        review, sentiment = nth_row

        return review, ReviewDataset.sentiment_as_index[sentiment]

## Load train and test sets
The per-class sample counts of the training set are also computed here; they are used below to derive the class weights.

In [4]:
# Read the train and test splits from HDFS as Spark DataFrames.
train_set = spark.read.parquet('hdfs://namenode:9000/training_data/train_set')
test_set = spark.read.parquet('hdfs://namenode:9000/training_data/test_set')

# Row count per sentiment in the training split. This is collected now,
# while `train_set` is still a DataFrame, for the class-weight computation.
class_counts = train_set.groupBy('sentiment').count().collect()

# From here on the two names refer to torch datasets wrapping the DataFrames.
train_set = ReviewDataset(train_set)
test_set = ReviewDataset(test_set)

**Compute the class weights for loss function**

Here the class weight $C_i$ for the $i$-th class is computed by:
$$
    C_i = \frac{\text{n\_samples}}{\text{class\_counts}_i\cdot\text{n\_classes}}
$$
where:
- n_samples: the number of samples in the dataset considered, i.e. the train_set above.
- $\text{class\_counts}_i$: the number of samples belonging to class $i$.
- n_classes: the total number of classes present in the dataset.

In [5]:
# Weight per sentiment label: n_samples / (class_count * n_classes).
sentiment_weights = {
    label: len(train_set) / (n_rows * len(class_counts))
    for label, n_rows in class_counts
}
# Arrange the weights in the key order of `sentiment_as_index`.
sentiment_weights = torch.tensor(
    [sentiment_weights[label] for label in train_set.sentiment_as_index],
    dtype=torch.float32,
)

# Model Training

In [None]:
batch_size = 64
learning_rate = 1.5e-5

train_loader = DataLoader(train_set, batch_size, shuffle=True)
# Evaluation does not need a random order. A fixed order keeps the batch
# composition identical between epochs, so the averaged per-batch losses
# reported by `test_model` are comparable from one epoch to the next.
test_loader = DataLoader(test_set, batch_size, shuffle=False)

# NOTE(review): `[512, 512]` is presumably the hidden layer sizes -- confirm
# against `classifiers.MLPClassifierWithPhoBERT`.
review_model = MLPClassifierWithPhoBERT([512, 512], nn.LeakyReLU(.02))
if torch.cuda.is_available():
    # The class weights must live on the same device as the model outputs.
    sentiment_weights = sentiment_weights.cuda()
    review_model.cuda()

# Class-weighted cross entropy, using the class weights computed above.
cross_entropy = nn.CrossEntropyLoss(weight=sentiment_weights)
optimizer = torch.optim.Adam(review_model.parameters(), learning_rate)

In [None]:
# Number of passes over the training data.
epochs = 25

# Train and evaluate; `losses` holds the loss history returned by `run_epochs`.
losses = run_epochs(
    epochs,
    model=review_model,
    train_loader=train_loader,
    test_loader=test_loader,
    loss_fn=cross_entropy,
    optimizer=optimizer,
)

# Saving the model

In [None]:
# Everything needed to resume or inspect this run: the weights, the optimizer
# state, the loss history and the hyperparameters used.
states = {
    'model_param': review_model.state_dict(),
    'optimizer_param': optimizer.state_dict(),
    'losses': losses,
    'lr': learning_rate,
    'epochs': epochs,
    'batch_size': batch_size,
}

checkpoint_path = Path('work/models/03_05_25-epoch25-model.tar')
# Create the target directory first, so a missing folder cannot make the save
# fail after the whole training run has finished.
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(states, checkpoint_path)