In [35]:
import numpy as np
import torch

from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

from classifiers import ReviewClassifierWithPhoBERT

In [None]:
# Tokenizer for the 'vinai/phobert-base-v2' checkpoint. It is kept at module
# level because train_model() and test_model() below call it on every batch
# of raw review text.
phobert_tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base-v2')

def train_model(
    model: nn.Module,
    train_loader: DataLoader,
    loss_fn: nn.Module,
    optimizer: torch.optim.Optimizer,
    track_loss: bool = False,
    use_gpu: bool = False
) -> list[float]:
    """
    Run one pass over `train_loader`, updating `model` with `optimizer`.

    Every batch of raw review text is tokenized with the module-level
    `phobert_tokenizer` (padded, truncated to at most 256 tokens) and the
    resulting input ids and attention mask are fed to the model.

    :param nn.Module model: The model to train. It is called as
        `model(input_ids, attention_mask)`.
    :param torch.utils.data.DataLoader train_loader: A DataLoader yielding
        `(X, y)` batches, where `X` is the review text and `y` the targets.
    :param nn.Module loss_fn: The loss function, called as `loss_fn(pred, y)`,
        from which gradients are computed.
    :param torch.optim.Optimizer optimizer: The optimizer that applies the
        gradient step to the model's parameters.
    :param bool track_loss: Whether or not to record the loss of every batch.
        This is `False` by default.
    :param bool use_gpu: If `True`, the tokenized inputs and the targets are
        moved to the GPU with `.cuda()` before the forward pass. The model is
        not moved here. This is `False` by default.
    :return: A list of loss values per batch if `track_loss=True` else an empty list.
    :rtype: list[float]
    """
    model.train()
    losses = []

    for X, y in train_loader:
        tokenized_X = phobert_tokenizer(X, return_tensors='pt',
                                        padding=True, truncation=True,
                                        max_length=256)

        X_input_ids = tokenized_X['input_ids']
        X_att_mask = tokenized_X['attention_mask']

        if use_gpu:
            X_input_ids = X_input_ids.cuda()
            X_att_mask = X_att_mask.cuda()
            y = y.cuda()

        pred_value = model(X_input_ids, X_att_mask)
        loss = loss_fn(pred_value, y)

        # Gradients accumulate across backward passes, so clear the ones left
        # over from the previous batch before computing new ones, then let the
        # optimizer apply the update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if track_loss:
            losses.append(loss.item())
    return losses

@torch.no_grad()
def test_model(
    model: nn.Module,
    test_loader: DataLoader,
    loss_fn: nn.Module,
    compute_accuracy: bool,
    use_gpu: bool = False
) -> tuple[float, float]:
    """
    Evaluate `model` based on `loss_fn` and return the average score(s).

    Runs with gradient tracking disabled and with the model in eval mode.
    Every batch of raw review text is tokenized with the module-level
    `phobert_tokenizer` (padded, truncated to at most 256 tokens).

    :param nn.Module model: The model to evaluate. It is called as
        `model(input_ids, attention_mask)`.
    :param torch.utils.data.DataLoader test_loader: A DataLoader containing test data,
        yielding `(X, y)` batches.
    :param nn.Module loss_fn: The loss function used as the evaluation metric.
        It must return a scalar (single-element) tensor per batch.
    :param bool compute_accuracy: Whether or not to compute accuracy. This is only
        meaningful in the case the `model` is a classifier.
    :param bool use_gpu: If `True`, the tokenized inputs and the targets are
        moved to the GPU with `.cuda()` before the forward pass. The model is
        not moved here. This is `False` by default.
    :return: The average loss (per batch) and average accuracy (per sample),
        both as plain Python floats. If `compute_accuracy=False` then average
        accuracy returned is 0.
    :rtype: tuple[float, float]
    """
    model.eval()
    total_loss, total_correct = 0.0, 0
    for X, y in test_loader:
        tokenized_X = phobert_tokenizer(X, return_tensors='pt',
                                        padding=True, truncation=True,
                                        max_length=256)

        X_input_ids = tokenized_X['input_ids']
        X_att_mask = tokenized_X['attention_mask']

        if use_gpu:
            X_input_ids = X_input_ids.cuda()
            X_att_mask = X_att_mask.cuda()
            y = y.cuda()

        pred = model(X_input_ids, X_att_mask)
        # .item() keeps the running total a Python float; without it the
        # function would return a tensor instead of the documented float.
        total_loss += loss_fn(pred, y).item()
        if compute_accuracy:
            # Count the samples whose highest-scoring class equals the target.
            total_correct += (pred.argmax(dim=1) == y).sum().item()
    return total_loss / len(test_loader), total_correct / len(test_loader.dataset)

# Load training data

In [3]:
# SparkSession is the entry point used below to read the review data.
from pyspark.sql import SparkSession
# Build (or reuse, via getOrCreate) a session with master 'local[*]' and the
# Spark UI port set to 4040.
spark = SparkSession.builder.master('local[*]').config('spark.ui.port', '4040').getOrCreate()

In [10]:
# Location of the preprocessed review data on HDFS.
preprocessed_fp = 'hdfs://namenode:9000/review_data/preprocessed'

# Read the CSV (first row is the header, column types are inferred) and drop
# the two columns that are not used here, leaving the review text and its
# sentiment label.
preprocessed_df = (
    spark.read
    .csv(preprocessed_fp, header=True, inferSchema=True)
    .drop('rating', 'place_index')
)

# Sanity check: schema, number of reviews and a short preview.
preprocessed_df.printSchema()
print(f'Total reviews: {preprocessed_df.count()}')
preprocessed_df.show(5)

root
 |-- review: string (nullable = true)
 |-- sentiment: string (nullable = true)

Total reviews: 3425
+--------------------+---------+
|              review|sentiment|
+--------------------+---------+
|3 miếng gà 105k n...| negative|
|Gà ướp vừa vị , m...| positive|
|Thật tuyệt với gà...| positive|
|Quán sạch , đẹp ,...| positive|
|Nhân_viên bự con ...| positive|
+--------------------+---------+
only showing top 5 rows



## Building a torch Dataset for training

In [28]:
class _ReviewIndexError(IndexError, ValueError):
    """
    Raised by `ReviewDataset.__getitem__` for an out-of-range index.

    It is an `IndexError` so that the dataset follows Python's sequence
    protocol (a plain `for` loop over the dataset stops at the end), and also
    a `ValueError` so that code catching the previously raised `ValueError`
    keeps working.
    """


class ReviewDataset(Dataset):
    """
    Map-style dataset of `(review, sentiment_index)` pairs built from a Spark
    DataFrame.

    Each row of the DataFrame must unpack into exactly two values, the review
    and its sentiment label, in that order. The label is translated to an
    integer through `sentiment_as_index`.

    All rows are collected to the driver once, when the dataset is created,
    so that fetching an item is a list lookup instead of a Spark job per
    sample. The whole DataFrame must therefore fit in driver memory.
    """

    # Maps the sentiment label found in the data to its class index.
    sentiment_as_index = {
        'positive': 0,
        'neutral': 1,
        'negative': 2
    }

    def __init__(self, data_as_spark_df):
        """
        :param data_as_spark_df: Spark DataFrame whose rows are
            `(review, sentiment)` pairs.
        """
        # Kept as a public attribute for backward compatibility; item access
        # no longer goes through it.
        self.data_as_rdd = data_as_spark_df.rdd.zipWithIndex()

        # Materialise the (row, index) pairs once and order them by the index
        # Spark assigned, so position i in the list is the row with index i.
        indexed_rows = sorted(self.data_as_rdd.collect(), key=lambda pair: pair[1])
        self._rows = [row for row, _ in indexed_rows]
        self.len = len(self._rows)

    def __len__(self): return self.len

    def __getitem__(self, index: int):
        """
        Return the `(review, sentiment_index)` pair at position `index`.

        :raises IndexError: if `index` is negative or not smaller than the
            dataset length (the exception is also a `ValueError`).
        :raises KeyError: if the row's sentiment label is not a key of
            `sentiment_as_index`.
        """
        if not 0 <= index < self.len:
            raise _ReviewIndexError('index exceeded length of dataframe')

        review, sentiment = self._rows[index]
        return review, ReviewDataset.sentiment_as_index[sentiment]