A note before running: the checkpoint for the PyTorch model is too large to push to GitHub. Please download the saved checkpoint from [Google Drive](https://drive.google.com/file/d/1Eg4ZGp1hS-EcDB7LfCEUbPvtLzSjxc8f/view?usp=sharing) first and place it at `work/models/03_05_25-epoch25-model.tar`, which is the path the model-loading cell below reads from.

In [None]:
import torch

from pyspark.sql import SparkSession
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

from tqdm import tqdm

from classifiers import MLPClassifierWithPhoBERT, SENTIMENTS_AS_INDEX

# Obtain (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

In [None]:
class _ReviewIndexError(IndexError, ValueError):
    """Raised by `ReviewDataset` for an out-of-range index.

    It is an `IndexError` so that plain iteration over the dataset (which
    relies on the `__getitem__` protocol) stops cleanly at the end, and it is
    also a `ValueError` so that callers catching the `ValueError` raised by
    earlier versions keep working.
    """


class ReviewDataset(Dataset):
    """
    Map-style dataset that serves rows of a Spark DataFrame by position.

    :param data_as_spark_df: The Spark DataFrame to read from. Each row is
        unpacked as ``(review, sentiment)``, so it is assumed to have exactly
        two fields in that order.
    """

    def __init__(self, data_as_spark_df):
        # Pair every row with its position so rows can be looked up by index.
        # The indexed RDD is cached: without it, every `__getitem__` call
        # would recompute the whole lineage (re-read the source and redo
        # `zipWithIndex`) just to fetch a single row.
        self.data_as_rdd = data_as_spark_df.rdd.zipWithIndex().cache()
        self.len = data_as_spark_df.count()

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return self.len

    def __getitem__(self, index: int):
        """
        Return the ``index``-th row as ``(review, sentiment_index)``.

        :param int index: Zero-based row position; negative indices are
            not supported.
        :return: The review and its sentiment mapped through
            `SENTIMENTS_AS_INDEX`.
        :raises IndexError: If ``index`` is outside ``[0, len(self) - 1]``.
            The raised exception is also a `ValueError`.
        """
        if index < 0 or index > self.len - 1:
            raise _ReviewIndexError('index exceeded length of dataframe')

        # Each element of the RDD is a (row, position) pair.
        nth_row = (self.data_as_rdd
                   .filter(lambda data: data[1] == index)
                   .take(1)[0][0]
        )
        review, sentiment = nth_row

        return review, SENTIMENTS_AS_INDEX[sentiment]

# Loading test data and model

In [7]:
# Read the test set from HDFS and wrap it so PyTorch can index it.
test_set = ReviewDataset(
    spark.read.parquet('hdfs://namenode:9000/training_data/test_set')
)
# Serve the test set in batches of 64.
test_loader = DataLoader(test_set, batch_size=64)

## Loading MLP model

In [8]:
# Load the saved checkpoint, mapping every tensor onto the CPU.
# NOTE(review): `torch.load` unpickles the file, and this checkpoint is
# downloaded from an external link -- only load it if you trust the source.
checkpoint = torch.load(
    'work/models/03_05_25-epoch25-model.tar',
    map_location=torch.device('cpu'),
)

# Rebuild the classifier and restore its parameters from the checkpoint.
# NOTE(review): [512, 512] is presumably the list of hidden-layer sizes --
# confirm against `MLPClassifierWithPhoBERT`.
review_model = MLPClassifierWithPhoBERT(
    [512, 512],
    nn.LeakyReLU(negative_slope=.02),
)
review_model.load_state_dict(checkpoint['model_param'])

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [None]:
phobert_tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base-v2')


def apply_tokenization(minibatch):
    """
    Tokenize a minibatch of reviews with the PhoBERT tokenizer.

    :param minibatch: The batch of texts handed to the tokenizer.
    :return: The tokenizer output as PyTorch tensors, padded within the
        batch and truncated to at most 256 tokens.
    """
    return phobert_tokenizer(
        minibatch,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=256,
    )

@torch.no_grad()
def get_cm(
    model: nn.Module,
    data_loader: DataLoader,
    n_labels: int,
    use_gpu: bool = False
) -> torch.Tensor:
    """
    Make inference with a `torch.nn.Module` and return the confusion matrix.

    The model is switched to evaluation mode and gradients are disabled for
    the whole call.

    :param nn.Module model: The model to make inference with. It is called as
        ``model(input_ids, attention_mask)`` and the predicted label is the
        argmax over dimension 1 of its output.
    :param DataLoader data_loader: The data to make inference on. It must
        yield ``(texts, labels)`` batches.
    :param int n_labels: The number of labels within the dataset. Note that 
        this should be the number of labels on the WHOLE dataset. The `data_loader`
        must have at maximum `n_labels`.
    :param bool use_gpu: Whether or not to do computations on GPU. Only the
        tokenized inputs are moved to the GPU here; the model must already
        live there.
    :return: A 2-d tensor of integers. Each row represents the predictions made and
        each column represents the ground truth.
    :rtype: torch.Tensor
    :raises ValueError: If a ground-truth or predicted label falls outside
        ``[0, n_labels)``.
    """
    model.eval()
    flattened_dim = n_labels ** 2
    confusion_mat = torch.zeros(flattened_dim, dtype=torch.long)

    for X, y in tqdm(data_loader):
        tokenized_X = apply_tokenization(X)

        X_input_ids = tokenized_X['input_ids']
        X_att_mask = tokenized_X['attention_mask']

        if use_gpu:
            X_input_ids = X_input_ids.cuda()
            X_att_mask = X_att_mask.cuda()

        pred = model(X_input_ids, X_att_mask).argmax(dim=1).cpu()

        # An out-of-range label would alias another cell of the flattened
        # matrix (or overflow it), so reject it instead of miscounting.
        for kind, labels in (('ground-truth', y), ('predicted', pred)):
            if labels.numel() and (labels.min() < 0 or labels.max() >= n_labels):
                raise ValueError(
                    f'{kind} labels must lie in [0, {n_labels}), got values '
                    f'from {int(labels.min())} to {int(labels.max())}'
                )

        # Encode each (pred, truth) pair as one flat index: row = pred,
        # column = truth. `minlength` pads the counts up to the full matrix.
        count_as_idx = y + n_labels * pred
        confusion_mat += torch.bincount(count_as_idx, minlength=flattened_dim)
    return confusion_mat.reshape((n_labels, n_labels))

# Inference

In [None]:
# Evaluate the restored model on the test loader, counting over 3 labels.
cm = get_cm(
    model=review_model,
    data_loader=test_loader,
    n_labels=3,
)

100%|██████████| 13/13 [05:08<00:00, 23.74s/it]
