# PhoBERT v2 for Shopee reviews

In [None]:
# Mount Google Drive at /content/drive; later cells read data and
# checkpoints from paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the notebook's dependencies: Hugging Face transformers (plain and
# with the torch extra), the vncorenlp wrapper used for word segmentation,
# and underthesea.
# NOTE(review): none of these are version-pinned, so a fresh run may pull
# different releases than the ones that produced the saved outputs.
!pip install -q transformers
!pip install transformers[torch]
! pip install --quiet vncorenlp
! pip install underthesea

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

# 1. Generate dataset and preprocess data for training PhoBERT:

In [None]:


from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
try:
    from distutils.dir_util import copy_tree
except ImportError:
    # distutils was removed from the standard library in Python 3.12, so fall
    # back to shutil while keeping the `copy_tree` name available.
    import shutil

    def copy_tree(src, dst):
        """Copy directory `src` into `dst`, merging into `dst` if it already exists."""
        return shutil.copytree(src, dst, dirs_exist_ok=True)

# Copy the project's helper package from Drive next to the notebook so that
# `utils` can be imported below.
copy_tree("/content/drive/MyDrive/Thesis: Topic Modelling/Code/utils", "./utils/")

from utils.data_preprocessing_v2 import *
from vncorenlp import VnCoreNLP
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

class Phobertv2:
  """
  A class used to interact with the PhoBERT model for multi-label text classification.

  This class provides methods for loading the PhoBERT model, preprocessing text data, and generating datasets
  suitable for training or inference.
  """
  # Aspect labels, in the column order used for the multi-hot label matrix.
  LABELS = ("Quality", "Serve", "Pack", "Shipping", "Price", "Other")
  # Hugging Face checkpoint used for both the tokenizer and the classifier.
  PRETRAINED_NAME = "vinai/phobert-base-v2"
  # Location of the VnCoreNLP jar used for word segmentation.
  VNCORENLP_JAR = "/content/drive/MyDrive/transformers/vncorenlp/VnCoreNLP-1.1.1.jar"

  def __init__(self, data = None):
    """
    Initializes the Phobertv2 class, loads the PhoBERT model and tokenizer, and sets up the Vietnamese word segmenter.

    Parameters
    ----------
    data : any, optional
        Input data to keep on the instance as ``self.data``, by default None.
    """
    self.tokenizer, self.model  = self.get_model()
    # Keep the caller's data; previously this was unconditionally reset to None.
    self.data = data
    self.rdrsegmenter = VnCoreNLP(self.VNCORENLP_JAR, annotators="wseg", max_heap_size='-Xmx500m')


  def get_model(self):
    """
    Loads the PhoBERT model and tokenizer, and sets up the label mappings for multi-label classification.

    The loaded objects are also stored on ``self.tokenizer`` and ``self.model``.

    Returns
    -------
    tuple
        A tuple containing the tokenizer and model for the PhoBERT-based text classification.
    """
    labels = list(self.LABELS)
    id2label = {idx:label for idx, label in enumerate(labels)}
    label2id = {label:idx for idx, label in enumerate(labels)}
    self.tokenizer = AutoTokenizer.from_pretrained(self.PRETRAINED_NAME)
    self.model = AutoModelForSequenceClassification.from_pretrained(self.PRETRAINED_NAME,
                                                              problem_type="multi_label_classification",
                                                              num_labels=len(labels),
                                                              id2label=id2label,
                                                              label2id=label2id)
    return (self.tokenizer, self.model)

  def preprocess(self, data, max_length = 125):
    """
    Preprocesses the input data by segmenting Vietnamese text, tokenizing it, and encoding it for the PhoBERT model.

    Parameters
    ----------
    data : dict-like
        A mapping (e.g. a dict or DataFrame) with a "comment" entry holding the texts and one entry
        per label in ``LABELS`` holding that label's values.
    max_length : int, optional
        Length every sequence is padded or truncated to, by default 125.

    Returns
    -------
    dict
        The tokenizer's encoding with an added "labels" entry: one list of floats per text,
        ordered like ``LABELS``.

    Raises
    ------
    KeyError
        If any label column is missing from ``data``.
    """
    # Reload when EITHER component is missing (the old `and` check only fired when both were).
    if getattr(self, "tokenizer", None) is None or getattr(self, "model", None) is None:
      self.tokenizer, self.model = self.get_model()
    labels = list(self.LABELS)
    available = set(data.keys())
    missing = [label for label in labels if label not in available]
    if missing:
      raise KeyError(f"missing label column(s): {missing}")

    # Assumes tokenize() returns a list of token lists (one per sentence); the nested
    # join flattens them into a single space-separated string per comment.
    text_list = [
      ' '.join(' '.join(sentence) for sentence in self.rdrsegmenter.tokenize(text))
      for text in data["comment"]
    ]

    encoding = self.tokenizer(text_list, padding = "max_length", truncation = True, max_length = max_length)
    # Build the (n_texts, n_labels) multi-hot matrix column by column.
    labels_matrix = np.zeros((len(text_list), len(labels)))
    for idx, label in enumerate(labels):
      labels_matrix[:, idx] = data[label]
    encoding["labels"] = labels_matrix.tolist()
    return encoding

  def generate_dataset(self, processed_data, batch_size = 32, shuffle = False):
    """
    Converts the processed data into a PyTorch dataset and creates a data loader for model training or inference.

    Parameters
    ----------
    processed_data : dict
        The processed data, including "input_ids", "attention_mask", and "labels".
    batch_size : int, optional
        The batch size for the data loader, by default 32.
    shuffle : bool, optional
        When True, batches are drawn with a RandomSampler (useful for training);
        by default False, which keeps the original sequential order.

    Returns
    -------
    torch.utils.data.DataLoader
        A data loader that yields (input_ids, attention_mask, labels) batches.
    """
    inputs = torch.tensor(processed_data["input_ids"])
    labels = torch.tensor(processed_data["labels"])
    masks = torch.tensor(processed_data["attention_mask"])
    dataset = TensorDataset(inputs, masks, labels)
    sampler_cls = RandomSampler if shuffle else SequentialSampler
    dataset_sampler = sampler_cls(dataset)
    data_loader = DataLoader(dataset, sampler=dataset_sampler, batch_size=batch_size)
    return data_loader

No module named 'fasttext'


In [None]:

import re
from sklearn.model_selection import train_test_split
import pandas as pd

def cleaning_for_phobert(sentence):
    """
    Run a raw sentence through the cleaning pipeline used before PhoBERT training.

    The sentence is passed, in order, through `text_normalize`, `remove_all_tag`
    and `remove_ending_letters_in_sentence`; each step receives the previous
    step's output.

    Parameters
    ----------
    sentence : str
         The input sentence to be cleaned and preprocessed.

    Returns
    -------
    str
        Cleaned sentence for training PhoBERT.
    """
    pipeline = (text_normalize, remove_all_tag, remove_ending_letters_in_sentence)
    for step in pipeline:
        sentence = step(sentence)
    return sentence

# Label columns plus the rating and the raw comment text.
cols = ["Quality", "Serve", "Pack", "Shipping", "Price", "Other", "rating", "comment"]


def _read_split(path):
    """Read one Excel split, keep only `cols`, and clean its comment column."""
    frame = pd.read_excel(path)[cols]
    # Comments are cast to str first, then run through the cleaning pipeline.
    frame["comment"] = frame["comment"].astype(str).apply(cleaning_for_phobert)
    return frame


train = _read_split("/content/drive/MyDrive/Thesis: Topic Modelling/Data/Splitted data/train.xlsx")

test = _read_split("/content/drive/MyDrive/Thesis: Topic Modelling/Data/Splitted data/test.xlsx")

valid = _read_split("/content/drive/MyDrive/Thesis: Topic Modelling/Data/Splitted data/valid.xlsx")

In [None]:
# Sanity check: (rows, columns) of the training split.
train.shape

(12240, 8)

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

def multi_label_metrics(predictions, labels, threshold=0.5):
    """
    Computes multi-label classification metrics such as F1 score, ROC-AUC, and accuracy.

    Args:
        predictions (array-like): The raw prediction scores (logits), typically of shape (batch_size, num_labels).
        labels (array-like): The true binary labels, of the same shape as predictions.
        threshold (float, optional): The threshold for converting predicted probabilities into binary predictions.
                                     Defaults to 0.5.

    Returns:
        dict: A dictionary containing the computed metrics:
              - 'f1': F1 score with micro average (on thresholded predictions)
              - 'roc_auc': ROC-AUC score with micro average, computed from the sigmoid
                probabilities; NaN when it is undefined for the given labels
              - 'accuracy': Accuracy score (on thresholded predictions)
    """
    # Logits -> independent per-label probabilities.
    logits = torch.as_tensor(predictions, dtype=torch.float32)
    probs = torch.sigmoid(logits).detach().cpu().numpy()
    y_pred = (probs >= threshold).astype(float)
    y_true = np.asarray(labels)

    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC-AUC is a ranking metric: it must see the continuous scores. Feeding it the
    # thresholded 0/1 predictions (as before) collapses the curve to a single point.
    try:
        roc_auc = roc_auc_score(y_true, probs, average='micro')
    except ValueError:
        # Raised when y_true contains a single class (possible on a small batch);
        # report NaN rather than aborting the caller's loop.
        roc_auc = float('nan')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def get_prediction(predictions, threshold=0.5):
    """
    Turn raw prediction scores into a 0/1 matrix.
    ---
    Args:
        predictions (array-like): The raw prediction scores, typically of shape (batch_size, num_labels).
        threshold (float, optional): Probability cut-off; entries whose sigmoid is at or above it become 1.
                                     Defaults to 0.5.
    ---
    Returns:
        array: A float64 NumPy array of zeros and ones with the same shape as predictions.

    """
    # Squash the scores to probabilities, then compare against the cut-off.
    probabilities = torch.sigmoid(torch.Tensor(predictions))
    is_positive = (probabilities >= threshold).numpy()
    return is_positive.astype(np.float64)


In [None]:
# Build the model wrapper, then encode each split before wrapping it in a loader.
phobert = Phobertv2()
train_data_encoding, test_data_encoding, valid_data_encoding = (
    phobert.preprocess(split) for split in (train, test, valid)
)
# Train/valid use the default batch size; the test loader yields one example per batch.
train_data = phobert.generate_dataset(train_data_encoding)
test_data = phobert.generate_dataset(test_data_encoding, batch_size=1)
valid_data = phobert.generate_dataset(valid_data_encoding)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Initialize segmenter, tokenizer and model
# Reuse the segmenter that Phobertv2() already created instead of constructing a
# second VnCoreNLP object for the same jar (the wrapper presumably starts a
# separate Java process per object — confirm).
segmenter = phobert.rdrsegmenter
tokenizer = phobert.tokenizer
# Move the classifier to the GPU; the training loop below expects CUDA.
PHO_BERT = phobert.model.cuda()

# 2. Training Loop

In [None]:
import os
import random
from transformers import RobertaForSequenceClassification, RobertaConfig
# transformers' own AdamW is deprecated in favour of the PyTorch implementation.
from torch.optim import AdamW
from tqdm.auto import tqdm
device = 'cuda'
epochs = 10

# Apply weight decay to everything except biases and LayerNorm parameters.
param_optimizer = list(PHO_BERT.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# Save training weights:
save_best_path =  "/content/drive/MyDrive/THESIS DSEB62: Product review analysis/Baseline-model/Phobertv2" + "/processed_best.pth"
save_last_path = "/content/drive/MyDrive/THESIS DSEB62: Product review analysis/Baseline-model/Phobertv2" + "/processed_last.pth"
# Make sure the checkpoint directory exists before the first save.
os.makedirs(os.path.dirname(save_best_path), exist_ok=True)

# Per-epoch history, summarised into a DataFrame in a later cell.
train_loss_list, eval_loss_list = [], []
train_accuracy_list, eval_accuracy_list = [], []
train_f1_list, eval_f1_list = [], []
saved_status = []
max_acc = 0
# torch.optim.AdamW has no `correct_bias` switch; bias correction is always applied.
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)
best_epoch = 0
for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    total_loss = 0
    PHO_BERT.train()
    train_accuracy = 0
    nb_train_steps = 0
    train_f1 = 0

    # Wrap the loader itself (not enumerate(...)) so tqdm knows the total.
    for batch in tqdm(train_data):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        PHO_BERT.zero_grad()
        outputs = PHO_BERT(b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels)
        # With labels supplied, outputs[0] is the loss and outputs[1] the logits.
        loss = outputs[0]
        total_loss += loss.item()

        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # NOTE: metrics are computed per batch and averaged over batches below.
        train_eval = multi_label_metrics(logits, label_ids)
        train_accuracy += train_eval["accuracy"]
        train_f1 += train_eval["f1"]
        nb_train_steps += 1

        loss.backward()
        torch.nn.utils.clip_grad_norm_(PHO_BERT.parameters(), 1.0)
        optimizer.step()

    avg_train_loss = total_loss / len(train_data)
    print(" Accuracy: {0:.4f}".format(train_accuracy/nb_train_steps))
    print(" F1 score: {0:.4f}".format(train_f1/nb_train_steps))
    print(" Average training loss: {0:.4f}".format(avg_train_loss))
    train_loss_list.append(avg_train_loss)
    train_accuracy_list.append(train_accuracy/nb_train_steps)
    train_f1_list.append(train_f1/nb_train_steps)
    print("Running Validation...")
    PHO_BERT.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_f1 = 0
    with torch.no_grad():
        for batch in tqdm(valid_data):
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

            outputs = PHO_BERT(b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels)
            loss = outputs[0]
            eval_loss += loss.item()
            logits = outputs[1].detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            valid_eval = multi_label_metrics(logits, label_ids)
            eval_accuracy += valid_eval["accuracy"]
            eval_f1 += valid_eval["f1"]
            nb_eval_steps += 1
    avg_eval_loss = eval_loss / len(valid_data)
    print(" Accuracy: {0:.4f}".format(eval_accuracy/nb_eval_steps))
    print(" F1 score: {0:.4f}".format(eval_f1/nb_eval_steps))
    print(" Average validation loss: {0:.4f}".format(avg_eval_loss))
    eval_loss_list.append(avg_eval_loss)
    eval_accuracy_list.append(eval_accuracy/nb_eval_steps)
    eval_f1_list.append(eval_f1/nb_eval_steps)

    if (eval_accuracy/nb_eval_steps) > max_acc:
        max_acc = eval_accuracy/nb_eval_steps
        best_epoch = epoch_i
        saved_status.append(1)
        # Actually write the checkpoint; previously the save was commented out while
        # the message below was still printed.
        # NOTE(review): the evaluation cells load ".../best.pth", not this
        # "processed_best.pth" — confirm which filename is intended.
        torch.save(PHO_BERT, save_best_path)
        print("new model saved")
    else:
        saved_status.append(0)
    # Early stopping: give up after more than 5 epochs without a new best accuracy.
    if epoch_i - best_epoch > 5:
        break
print("Training complete!")
# The last-epoch model is saved by a separate cell further down.

Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in tqdm_notebook(enumerate(train_data)):


0it [00:00, ?it/s]

 Accuracy: 0.8757
 F1 score: 0.9595
 Average training loss: 0.0784
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm_notebook(valid_data):


  0%|          | 0/43 [00:00<?, ?it/s]

 Accuracy: 0.8228
 F1 score: 0.9386
 Average training loss: 0.1181
new model saved
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in tqdm_notebook(enumerate(train_data)):


0it [00:00, ?it/s]

 Accuracy: 0.9065
 F1 score: 0.9705
 Average training loss: 0.0601
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm_notebook(valid_data):


  0%|          | 0/43 [00:00<?, ?it/s]

 Accuracy: 0.8316
 F1 score: 0.9414
 Average training loss: 0.1165
new model saved
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in tqdm_notebook(enumerate(train_data)):


0it [00:00, ?it/s]

 Accuracy: 0.9224
 F1 score: 0.9763
 Average training loss: 0.0490
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm_notebook(valid_data):


  0%|          | 0/43 [00:00<?, ?it/s]

 Accuracy: 0.8308
 F1 score: 0.9397
 Average training loss: 0.1225
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in tqdm_notebook(enumerate(train_data)):


0it [00:00, ?it/s]

 Accuracy: 0.9386
 F1 score: 0.9812
 Average training loss: 0.0408
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm_notebook(valid_data):


  0%|          | 0/43 [00:00<?, ?it/s]

 Accuracy: 0.8258
 F1 score: 0.9389
 Average training loss: 0.1272
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in tqdm_notebook(enumerate(train_data)):


0it [00:00, ?it/s]

 Accuracy: 0.9514
 F1 score: 0.9853
 Average training loss: 0.0343
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm_notebook(valid_data):


  0%|          | 0/43 [00:00<?, ?it/s]

 Accuracy: 0.8251
 F1 score: 0.9372
 Average training loss: 0.1390
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in tqdm_notebook(enumerate(train_data)):


0it [00:00, ?it/s]

 Accuracy: 0.9585
 F1 score: 0.9872
 Average training loss: 0.0294
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm_notebook(valid_data):


  0%|          | 0/43 [00:00<?, ?it/s]

 Accuracy: 0.8251
 F1 score: 0.9376
 Average training loss: 0.1381
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in tqdm_notebook(enumerate(train_data)):


0it [00:00, ?it/s]

 Accuracy: 0.9635
 F1 score: 0.9889
 Average training loss: 0.0257
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm_notebook(valid_data):


  0%|          | 0/43 [00:00<?, ?it/s]

 Accuracy: 0.8251
 F1 score: 0.9368
 Average training loss: 0.1489
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in tqdm_notebook(enumerate(train_data)):


0it [00:00, ?it/s]

 Accuracy: 0.9700
 F1 score: 0.9909
 Average training loss: 0.0212
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm_notebook(valid_data):


  0%|          | 0/43 [00:00<?, ?it/s]

 Accuracy: 0.8287
 F1 score: 0.9377
 Average training loss: 0.1531
Training complete!


In [None]:
# Collect the per-epoch history lists into one table (one row per epoch)
# and export it as a spreadsheet.
df = pd.DataFrame(dict(
    train_loss=train_loss_list,
    eval_loss=eval_loss_list,
    train_accuracy=train_accuracy_list,
    eval_accuracy=eval_accuracy_list,
    train_f1=train_f1_list,
    eval_f1=eval_f1_list,
    saved_status=saved_status,
))
df.to_excel("/content/result.xlsx", index=False)

In [None]:
# Display the per-epoch training summary table.
df

Unnamed: 0,train_loss,eval_loss,train_accuracy,eval_accuracy,train_f1,eval_f1,saved_status
0,0.078386,0.118057,0.875734,0.822845,0.959548,0.938575,1
1,0.060071,0.116476,0.906495,0.831566,0.970533,0.94144,1
2,0.049003,0.122496,0.922405,0.83084,0.976257,0.93966,0
3,0.040808,0.12718,0.938642,0.825838,0.981188,0.938939,0
4,0.034305,0.13898,0.951371,0.825111,0.985305,0.937246,0
5,0.029402,0.138124,0.958469,0.825111,0.987161,0.937649,0
6,0.025717,0.148905,0.963528,0.825111,0.988946,0.936841,0
7,0.021191,0.153056,0.969974,0.828745,0.990872,0.9377,0


In [None]:
# Persist the model as it stands after the training loop. This pickles the
# whole module object (not just a state_dict); the evaluation cell below
# reloads it with torch.load.
torch.save(PHO_BERT, "/content/drive/MyDrive/THESIS DSEB62: Product review analysis/Baseline-model/Phobertv2/last.pth")

# 3. Evaluation

## 3.1. Evaluate the model from the final epoch on the test set:

In [None]:
# Get test-set predictions for the model saved after the final epoch
import random
# The unused `AdamW` import is dropped: transformers deprecated that class in
# favour of torch.optim.AdamW and nothing in this cell needs an optimizer.
from transformers import RobertaForSequenceClassification, RobertaConfig
from tqdm.auto import tqdm
device = "cuda"
# The checkpoint is a fully pickled model (see torch.save above), so it must be
# loaded with weights_only=False. Unpickling executes arbitrary code: only load
# checkpoints you created yourself.
model = torch.load("/content/drive/MyDrive/THESIS DSEB62: Product review analysis/Baseline-model/Phobertv2/last.pth",
                   map_location=device, weights_only=False)
# Be explicit about inference mode instead of relying on the pickled `training` flag.
model.eval()
predictions = []
labels_l = []
with torch.no_grad():
    for batch in tqdm(test_data):
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)
        # No labels are passed, so outputs[0] holds the logits.
        logits = outputs[0].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.append(get_prediction(logits))
        labels_l.append(label_ids)

In [None]:
# Stack the per-batch arrays along the batch axis. np.concatenate also works when
# batches differ in size (e.g. a smaller last batch), where np.array(...).reshape
# would fail on the ragged list.
prediction = pd.DataFrame(np.concatenate(predictions, axis=0), columns = ["Quality", "Serve", "Pack", "Shipping", "Price", "Other"])
labels = pd.DataFrame(np.concatenate(labels_l, axis=0), columns = ["Quality", "Serve", "Pack", "Shipping", "Price", "Other"])

In [None]:
# Classification report on last model
from sklearn.metrics import classification_report, hamming_loss
import pandas as pd

label_cols = ["Quality", "Serve", "Pack", "Shipping", "Price", "Other"]
phobert_pred = prediction

# Overall multi-label scores first ...
print("Classification report from phobertv2")
overall_hamming = hamming_loss(labels[label_cols], phobert_pred[label_cols])
print('Hamming Loss: ', round(overall_hamming, 3))
print(classification_report(labels[label_cols],  phobert_pred[label_cols]))
# ... then one binary report per aspect.
for aspect in label_cols:
  print(f"classification report of {aspect}")
  print(classification_report(labels[aspect], phobert_pred[aspect]))

Classification report from phobertv2
Hamming Loss:  0.038
              precision    recall  f1-score   support

           0       0.95      0.96      0.95      2721
           1       0.85      0.94      0.89       518
           2       0.93      0.98      0.95       581
           3       0.91      0.92      0.92      1366
           4       0.95      0.95      0.95       509
           5       0.94      0.86      0.90       309

   micro avg       0.93      0.94      0.94      6004
   macro avg       0.92      0.93      0.93      6004
weighted avg       0.93      0.94      0.94      6004
 samples avg       0.93      0.94      0.93      6004

classification report of Quality
              precision    recall  f1-score   support

         0.0       0.81      0.79      0.80       680
         1.0       0.95      0.96      0.95      2721

    accuracy                           0.92      3401
   macro avg       0.88      0.87      0.88      3401
weighted avg       0.92      0.92      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 3.2. Evaluate the best model on the test set:

In [None]:
# Get predictions and evaluate the best model on the test set
# (the loop below iterates `test_data`).
import random
# The unused `AdamW` import is dropped: transformers deprecated that class in
# favour of torch.optim.AdamW and nothing in this cell needs an optimizer.
from transformers import RobertaForSequenceClassification, RobertaConfig
from tqdm.auto import tqdm
device = "cuda"
# The checkpoint is a fully pickled model, so it must be loaded with
# weights_only=False. Unpickling executes arbitrary code: only load checkpoints
# you created yourself.
model = torch.load("/content/drive/MyDrive/THESIS DSEB62: Product review analysis/Baseline-model/Phobertv2/best.pth",
                   map_location=device, weights_only=False)
# Be explicit about inference mode instead of relying on the pickled `training` flag.
model.eval()
predictions = []
labels_l = []
with torch.no_grad():
    for batch in tqdm(test_data):
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)
        # No labels are passed, so outputs[0] holds the logits.
        logits = outputs[0].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.append(get_prediction(logits))
        labels_l.append(label_ids)

label_cols = ["Quality", "Serve", "Pack", "Shipping", "Price", "Other"]
# Stack per-batch arrays along the batch axis (robust to a smaller last batch).
prediction = pd.DataFrame(np.concatenate(predictions, axis=0), columns = label_cols)
labels = pd.DataFrame(np.concatenate(labels_l, axis=0), columns = label_cols)
phobert_pred = prediction

# hamming_loss / classification_report are imported by the earlier report cell.
print("Classification report from phobertv2")
print('Hamming Loss: ', round(hamming_loss(labels[label_cols], phobert_pred[label_cols]),3))
print(classification_report(labels[label_cols],  phobert_pred[label_cols]))
for i in range(len(label_cols)):
  print(f"classification report of {label_cols[i]}")
  print(classification_report(labels[label_cols[i]], phobert_pred[label_cols[i]]))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm_notebook(test_data):


  0%|          | 0/3401 [00:00<?, ?it/s]

Classification report from phobertv2
Hamming Loss:  0.032
              precision    recall  f1-score   support

           0       0.95      0.96      0.96      2721
           1       0.88      0.93      0.90       518
           2       0.97      0.97      0.97       581
           3       0.91      0.96      0.94      1366
           4       0.95      0.96      0.96       509
           5       0.94      0.90      0.92       309

   micro avg       0.94      0.96      0.95      6004
   macro avg       0.93      0.95      0.94      6004
weighted avg       0.94      0.96      0.95      6004
 samples avg       0.94      0.95      0.94      6004

classification report of Quality
              precision    recall  f1-score   support

         0.0       0.84      0.79      0.81       680
         1.0       0.95      0.96      0.96      2721

    accuracy                           0.93      3401
   macro avg       0.89      0.88      0.88      3401
weighted avg       0.93      0.93      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 4. Generate features from PhoBERT model

In [None]:
# Load best model as embedder
save_best_path =  "/content/drive/MyDrive/THESIS DSEB62: Product review analysis/Baseline-model/Phobertv2" + "/best.pth"
# The file is a fully pickled model, so recent PyTorch versions need
# weights_only=False to load it. Unpickling executes arbitrary code: only load
# checkpoints you created yourself.
model = torch.load(save_best_path, weights_only=False)
# Keep only the encoder backbone; the classification head is not needed for features.
embedder = model.roberta

In [None]:
# Generate embedding for train and test set by mean pooling
import random
from tqdm.auto import tqdm
device = 'cuda'
# NOTE(review): this rebinds `phobert`, which earlier held the Phobertv2 wrapper;
# cells above that use the wrapper will not work if re-run after this one.
phobert = embedder.to(device)


def _embed_loader(encoder, loader, device):
    """Run `encoder` over `loader` and collect per-batch embeddings.

    Returns two lists with one tensor per batch:
    - the attention-masked mean of the encoder's first output over the sequence axis;
    - the encoder's first output itself (one vector per token position).
    """
    mean_embeddings, token_embeddings = [], []
    encoder.eval()
    with torch.no_grad():
        for batch in tqdm(loader):
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
            outputs = encoder(b_input_ids,
                              token_type_ids=None,
                              attention_mask=b_input_mask)
            # assumes outputs[0] is (batch, seq_len, hidden) — the mean over dim=1
            # yields the 768-wide features saved below.
            hidden = outputs[0]
            # Inputs are padded to a fixed length, so average only over real tokens;
            # a plain mean over dim=1 would also average the padding positions.
            mask = b_input_mask.unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            mean_embeddings.append((hidden * mask).sum(dim=1) / token_counts)
            # NOTE(review): keeping every per-token tensor on the device is memory
            # hungry and these lists are not used by the cells below — consider dropping.
            token_embeddings.append(hidden)
    return mean_embeddings, token_embeddings


train_embedded_mean, train_embedded_pooling = _embed_loader(phobert, train_data, device)
test_embedded_mean, test_embedded_pooling = _embed_loader(phobert, test_data, device)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm_notebook(train_data):


  0%|          | 0/383 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm_notebook(test_data):


  0%|          | 0/3401 [00:00<?, ?it/s]

In [None]:

# Join the per-batch mean embeddings along the batch axis and move them to
# host memory as NumPy arrays.
train_feature = torch.cat(train_embedded_mean, dim=0).detach().cpu().numpy()
test_feature = torch.cat(test_embedded_mean, dim=0).detach().cpu().numpy()

In [None]:
# Wrap both feature matrices in DataFrames, then write each one to Drive as CSV.
phobert_test_feature = pd.DataFrame(data=test_feature)
phobert_train_feature = pd.DataFrame(data=train_feature)

phobert_test_feature.to_csv(path_or_buf="/content/drive/MyDrive/Thesis: Topic Modelling/Code/Phobert result/phobert_test_feature.csv")
phobert_train_feature.to_csv(path_or_buf="/content/drive/MyDrive/Thesis: Topic Modelling/Code/Phobert result/phobert_train_feature.csv")

In [None]:
# Sanity check: (rows, feature width) of the exported training features.
phobert_train_feature.shape

(12240, 768)

In [None]:
# Sanity check: (rows, feature width) of the exported test features.
phobert_test_feature.shape

(3401, 768)