In [1]:
import numpy as np
import torch

from sklearn.model_selection import train_test_split

from torch import nn
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import AutoTokenizer

from classifiers import ReviewClassifierWithPhoBERT

In [10]:
# Tokenizer that matches the PhoBERT checkpoint; `transformers` fetches its
# vocabulary files the first time this line runs.
phobert_tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base-v2')


def apply_tokenization(minibatch):
    """
    Tokenize a minibatch of raw review texts with `phobert_tokenizer`.

    Every text is truncated to at most 256 tokens and the batch is padded to
    the length of its longest member, so the result can be fed to the model
    as rectangular PyTorch tensors.

    :param minibatch: The review texts of one batch, as yielded by a
        `DataLoader` (presumably a list/tuple of `str` -- confirm against
        the dataset in use).

    :return: The tokenizer output; the training and evaluation loops read its
        `'input_ids'` and `'attention_mask'` entries.
    """
    return phobert_tokenizer(
        minibatch,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=256,
    )

def train_model(
    model: nn.Module,
    train_loader: DataLoader,
    loss_fn: nn.Module,
    optimizer: torch.optim.Optimizer,
    track_loss: bool = False,
    use_gpu: bool = False
) -> float:
    """
    Trains `model` for one pass over `train_loader` using `optimizer`.

    :param nn.Module model: The model to train. It is called as
        `model(input_ids, attention_mask)`.
    :param torch.utils.data.DataLoader train_loader: Yields `(texts, labels)` batches.
        The texts are tokenized with `apply_tokenization` before the forward pass.
    :param nn.Module loss_fn: The loss function from which gradients are computed.
    :param torch.optim.Optimizer optimizer: The optimizer that updates the
        parameters of `model`.
    :param bool track_loss: Whether or not to accumulate the loss of each batch.
        This is `False` by default.
    :param bool use_gpu: Whether or not to move each batch to the CUDA device before
        the forward pass. This is `False` by default.

    :return: The average loss per batch if `track_loss=True`, else `0.0`.
        `0.0` is also returned when `train_loader` yields no batches.
    :rtype: float
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for X, y in train_loader:
        tokenized_X = apply_tokenization(X)

        X_input_ids = tokenized_X['input_ids']
        X_att_mask = tokenized_X['attention_mask']

        if use_gpu:
            X_input_ids = X_input_ids.cuda()
            X_att_mask = X_att_mask.cuda()
            y = y.cuda()
        pred_value = model(X_input_ids, X_att_mask)
        loss = loss_fn(pred_value, y)

        # Clear the gradients left over from the previous batch, compute the
        # new ones with loss.backward(), then update the parameters with
        # optimizer.step().
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # .item() extracts a plain Python float. Accumulating the tensor
        # itself would keep autograd history alive across the whole epoch.
        if track_loss:
            total_loss += loss.item()
        num_batches += 1

    # Guard against an empty loader instead of dividing by zero.
    return total_loss / num_batches if num_batches else 0.0

@torch.no_grad()
def test_model(
    model: nn.Module,
    test_loader: DataLoader,
    loss_fn: nn.Module,
    return_true_preds: bool,
    use_gpu: bool = False
) -> tuple[float, torch.Tensor, torch.Tensor]:
    """
    Evaluate `model` based on `loss_fn` and return the average loss along with
    true predictions and the total labels corresponding to each class.

    :param nn.Module model: The model on which to perform evaluation.
    :param nn.utils.data.DataLoader test_loader: A DataLoader containing test data.
    :param nn.Module loss_fn: The loss function to based on which to compute metrics.
    :param bool return_true_preds: Whether or not to store statistics on correctly
        classified labels. This is only meaningful in the case the `model` is a classifier.

    :return: The average loss (per batch). If `return_true_preds=True` then the number of
        correctly classified labels and the total labels corresponding to each class are returned as
        `torch.Tensor`. If not, zero tensors are returned instead.
    :rtype: tuple[float, torch.Tensor, torch.Tensor]
    """
    model.eval()
    total_loss = 0

    correct_labels = torch.tensor([0, 0, 0])
    total_labels = torch.tensor([0, 0, 0])

    for X, y in test_loader:
        tokenized_X = apply_tokenization(X)

        X_input_ids = tokenized_X['input_ids']
        X_att_mask = tokenized_X['attention_mask']

        if use_gpu:
            X_input_ids = X_input_ids.cuda()
            X_att_mask = X_att_mask.cuda()
            y = y.cuda()

        pred = model(X_input_ids, X_att_mask)
        total_loss += loss_fn(pred, y)

        if return_true_preds:
            pred_labels = pred.argmax(dim=1)
            correct_preds = pred_labels[pred_labels == y].bincount().cpu()
            true_counts = y.bincount().cpu()

            for i, count in enumerate(correct_preds):
                correct_labels[i] += count
            for i, count in enumerate(true_counts):
                total_labels[i] += count

    return total_loss / len(test_loader), correct_labels, total_labels

config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

# Load training data

In [2]:
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session that runs locally on every available core,
# with the Spark UI bound to port 4040.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config('spark.ui.port', '4040')
    .getOrCreate()
)

In [3]:
# Location of the preprocessed review data on HDFS.
preprocessed_fp = 'hdfs://namenode:9000/review_data/preprocessed'

# Read the CSV (first line is the header, column types inferred) and discard
# the `rating` and `place_index` columns in the same expression.
preprocessed_df = (
    spark.read
    .csv(preprocessed_fp, header=True, inferSchema=True)
    .drop('rating', 'place_index')
)

# Quick sanity check: schema, row count and a small preview.
preprocessed_df.printSchema()
print(f'Total reviews: {preprocessed_df.count()}')
preprocessed_df.show(5)

root
 |-- review: string (nullable = true)
 |-- sentiment: string (nullable = true)

Total reviews: 3425
+--------------------+---------+
|              review|sentiment|
+--------------------+---------+
|3 miếng gà 105k n...| negative|
|Gà ướp vừa vị , m...| positive|
|Thật tuyệt với gà...| positive|
|Quán sạch , đẹp ,...| positive|
|Nhân_viên bự con ...| positive|
+--------------------+---------+
only showing top 5 rows



## Building a torch Dataset for training

In [4]:
class _ReviewIndexError(IndexError, ValueError):
    """
    Raised for an out-of-range dataset index.

    It is an `IndexError` so that plain iteration over the dataset terminates
    correctly, and also a `ValueError` so that code written against the
    previous behaviour keeps working.
    """


class ReviewDataset(Dataset):
    """
    A map-style torch `Dataset` over a Spark DataFrame of reviews.

    Each row of the DataFrame must consist of exactly two columns, the review
    text followed by its sentiment label. The rows are collected to the driver
    once, at construction time, so that indexing is a list lookup instead of a
    Spark job per sample. The whole DataFrame must therefore fit in memory.
    """

    # Class index assigned to each sentiment label found in the data.
    sentiment_as_index = {
        'positive': 0,
        'neutral': 1,
        'negative': 2
    }

    def __init__(self, data_as_spark_df):
        """
        :param data_as_spark_df: The Spark DataFrame holding `(review, sentiment)` rows.
        """
        # A single collect() replaces the per-item filter over the whole RDD.
        self.rows = list(data_as_spark_df.collect())
        self.len = len(self.rows)

    def __len__(self): return self.len

    def __getitem__(self, index: int):
        """
        Return the `index`-th sample as `(review, class_index)`.

        :param int index: Zero-based position of the sample. Negative indices
            are not supported.

        :raises IndexError: If `index` is outside `[0, len(self) - 1]`. The raised
            error is also an instance of `ValueError`.
        :raises KeyError: If the sentiment label of the row is not a key of
            `sentiment_as_index`.
        """
        if index < 0 or index > self.len - 1:
            raise _ReviewIndexError('index exceeded length of dataframe')

        review, sentiment = self.rows[index]

        return review, ReviewDataset.sentiment_as_index[sentiment]

# Train/test split

In [5]:
# Wrap the Spark DataFrame in a `ReviewDataset` so that samples can be looked
# up by integer position (as `Subset` and `DataLoader` require).
dataset = ReviewDataset(preprocessed_df)

In [6]:
# Split on positions rather than on the data itself, so the resulting index
# arrays can be used to build `Subset` views over `dataset`.
all_positions = np.arange(len(dataset))

# One sentiment label per row; stratifying on it keeps the class proportions
# the same in both splits.
stratify_labels = [row['sentiment'] for row in preprocessed_df.collect()]

train_idx, test_idx = train_test_split(
    all_positions,
    test_size=.2,
    random_state=0,
    stratify=stratify_labels,
)

train_set = Subset(dataset, train_idx)
test_set = Subset(dataset, test_idx)

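Compute the class weights for the loss function. Each class is weighted by `n_samples / (n_classes * class_count)`, so that rarer sentiments contribute more to the loss.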

In [7]:
# One (sentiment, count) row per distinct sentiment value in the data.
class_counts = preprocessed_df.groupBy('sentiment').count().collect()

num_samples = len(dataset)
num_classes = len(class_counts)

# weight = n_samples / (n_classes * class_count)
weight_by_sentiment = {
    label: num_samples / (label_count * num_classes)
    for label, label_count in class_counts
}

# Order the weights by class index: iterating `sentiment_as_index` yields its
# keys in the order of the indices they map to.
sentiment_weights = torch.tensor(
    [weight_by_sentiment[label] for label in dataset.sentiment_as_index],
    dtype=torch.float32,
)

# Model Training

In [9]:
import math
def run_epochs(
    epochs: int,
    model: nn.Module,
    train_loader: DataLoader,
    test_loader: DataLoader,
    loss_fn: nn.Module,
    optimizer: nn.Module, *,
    update_rate: int | None = None
):
    num_dig = int(math.log10(epochs)) + 1
    if update_rate is None:
        update_rate = 1 if epochs <= 20 else 10

    for epoch in range(epochs):
        if not epoch % update_rate:
            print(f"\033[102;30;1mEpoch {epoch + 1:>{num_dig}}/{epochs}\033[0m", end=' || ')

        training_loss = train_model(
            model, train_loader,
            loss_fn, optimizer, track_loss=True,
            use_gpu=torch.cuda.is_available()
        )
        if not epoch % update_rate:
            print(f"\033[94;1mTraining loss: {training_loss * 100:<10.6f}\033[0m", end=' | ')

        loss, true_labels, total_labels = test_model(
            model, test_loader, loss_fn, True,
            use_gpu=torch.cuda.is_available()
        )

        if not epoch % update_rate:
            acc_by_class = (true_labels / total_labels) * 100
            avg_acc = (true_labels.sum() / total_labels.sum()) * 100
            print(f"""\033[94;1mEval Loss: {loss:<10.6f}\033[0m
  Average Accuracy: {avg_acc:.4f}%
  Pos: {acc_by_class[0]:<7.4f}% | Neu: {acc_by_class[1]:<7.4f}% | Neg: {acc_by_class[2]:<7.4f}%""")

In [11]:
# Hyper-parameters
batch_size = 32
learning_rate = 5e-5

# Batch iterators over the two splits; both are reshuffled on every pass.
trainloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
testloader = DataLoader(test_set, batch_size=batch_size, shuffle=True)

review_model = ReviewClassifierWithPhoBERT()

# The loss weights must live on the same device as the model outputs, so both
# are moved to the GPU together when one is available.
if torch.cuda.is_available():
    sentiment_weights = sentiment_weights.cuda()
    review_model.cuda()

# Class-weighted cross entropy, optimised with Adam.
cross_entropy = nn.CrossEntropyLoss(weight=sentiment_weights)
optimizer = torch.optim.Adam(review_model.parameters(), lr=learning_rate)

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]

In [12]:
# Fine-tune for a single epoch and report training/evaluation metrics.
epochs = 1
run_epochs(
    epochs=epochs,
    model=review_model,
    train_loader=trainloader,
    test_loader=testloader,
    loss_fn=cross_entropy,
    optimizer=optimizer,
)

[102;30;1mEpoch 1/1[0m || [94;1mTraining loss: 100.860954[0m | [94;1mEval Loss: 0.874189  [0m
  Average Accuracy: 84.8175%
  Pos: 87.4317% | Neu: 0.0000 % | Neg: 89.3805%


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 52252)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
                           ^^^^^^
  File "/usr/local/spark/python/pyspark/accumulators.py", line 271, in accum_updates
    num_updates =