# XLM-RoBERTa
*Time for run all (GPU): ~4 hours*

## 1. General Settings and Import Libraries

In [1]:
# !pip install unicodedata2
# !git clone https://github.com/vncorenlp/VnCoreNLP

In [2]:
# # Thư viện vncorenlp và wordsegmenter dùng để Word Segmentation cho tiếng Việt

# # Install the vncorenlp python wrapper
# !pip install vncorenlp

# # Download VnCoreNLP-1.1.1.jar & its word segmentation component (i.e. RDRSegmenter) 
# !mkdir -p vncorenlp/models/wordsegmenter
# !wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
# !wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
# !wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
# !mv VnCoreNLP-1.1.1.jar vncorenlp/ 
# !mv vi-vocab vncorenlp/models/wordsegmenter/
# !mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/

In [3]:
import pprint
import numpy as np
import pandas as pd
import re
import unicodedata

from sklearn import preprocessing, metrics

import torch
from torch import Tensor
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torchtext
import torchtext.transforms as T
import torchtext.functional as F
from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER

import pickle

import nltk
from nltk.corpus import stopwords

from vncorenlp import VnCoreNLP

## 2. Data Preparation 

In [4]:
df = pd.read_csv("./datasetVie.csv")
# columns_to_remove = ['Unnamed: 2','Unnamed: 3','Unnamed: 4']
# df = df.drop(columns=columns_to_remove)
df

Unnamed: 0,Comment,Label
0,Chúc mừng các cô gái vàng của đội tuyển bóng đ...,2
1,Hay,2
2,Đội tuyển nữ Việt Nam thật là tuyệt vời,2
3,Các gái cam biết thế nào là đội dự wc chưa,2
4,Thanh nhã xinh,2
...,...,...
2263,"Đó mọi người thấy chưa ghê quá, ai thấy t nói ...",0
2264,Lam vay moi giau cat nha lau,1
2265,Cho di tù,0
2266,từ bỏ nóm này nha mọi người,0


In [5]:
# lowercasing
df['Comment'] = df['Comment'].str.lower()

# removing urls
df['Comment'] = df['Comment'].str.replace('http\S+|www.\S+', '', case=False)

# removing commas "\n"
df['Comment'] = df['Comment'].replace('\n','', regex=True)

# removing all the punctuations
df['Comment'] = df['Comment'].str.replace('[^\w\s]','')

# # removing integers
# df['Comment'] = df['Comment'].replace('\d','', regex=True)

# # removing emojis
# df['Comment'] = df['Comment'].str.replace('[^\w\s#@/:%.,_-]', '', flags=re.UNICODE)

In [6]:
df['Comment'] = df['Comment'].astype(str)

In [7]:
from autocorrect import Speller

spell = Speller(lang='vi')

def typo_corrector(text):
    return spell(text)

df['Comment'] = df['Comment'].apply(typo_corrector)

In [8]:
# Removing stopwords

def remove_stopwords(text, stopwords):
    # Create a regular expression pattern for stopwords
    pattern = r'\b(?:{})\b'.format('|'.join(stopwords))
    
    # Remove stopwords using regex
    filtered_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    
    return filtered_text

# Load the stopwords from the file
with open('./vietnamese-stopwords.txt', 'r', encoding='utf-8') as file:
    stopwords = [line.strip() for line in file]

df['Comment'] = df['Comment'].apply(lambda x: remove_stopwords(x, stopwords))


In [9]:
rdrsegmenter = VnCoreNLP("./vncorenlp/VnCoreNLP-1.2.jar", annotators="wseg,pos,ner", max_heap_size='-Xmx2g')
for i in range(len(df)):
    df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
    df['Comment'][i] = ' '.join([' '.join(x) for x in df['Comment'][i]])
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = ' '.join([' '.join(x) for x in df['Comment'][i]])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = ' '.join([' '.join(x) for x in df['Comment'][i]])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = ' '.join([' '.join(x) for x in df['Comment'][i]])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = ' '.join([' '.join(x) for x in df['Comment'][i]])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = ' '.join([' '.join(x) for x in df['Comment'][i]])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = ' '.join([' '.join(x) for x in df['Comment'][i]])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = ' '.join([' '.join(x) for x in df['Comment'][i]])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = ' '.join([' '.join(x) for x in df['Comment'][i]])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = ' '.join([' '.join(x) for x in df['Comment'][i]])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = ' '.join([' '.join(x) for x in df['Comment'][i]])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = ' '.join([' '.join(x) for x in df['Comment'][i]])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = ' '.join([' '.join(x) for x in df['Comment'][i]])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comment'][i] = rdrsegmenter.tokenize(df['Comment'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

Unnamed: 0,Comment,Label
0,chúc_mừng gái vàng đội_tuyển bóng_đá nữ việt_n...,2
1,hay,2
2,đội tuyển nữ việt nam thật là tuyệt vời,2
3,gái cam đội dự wc,2
4,nhã xinh,2
...,...,...
2263,"ghê , t like",0
2264,lam vay moi giau cat nha lau,1
2265,di tù,0
2266,nha,0


In [10]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [11]:
train_df

Unnamed: 0,Comment,Label
2192,"ngon ngon lắm , hành phi dc ngon , dầu kia thể...",1
2103,am kinh,0
450,ảo đấy,2
316,trị . ta kỉ_niệm chiến_thắng giặc tàu . tưởng ...,1
1681,co cuc kho t van lam vi dam me .,1
...,...,...
1638,kính chúc tài thương lộ bình_an nẻo đường,2
1095,ion ma_trận . thực : ),1
1130,phê ! !,2
1294,camera man sống,1


In [12]:
test_df

Unnamed: 0,Comment,Label
188,nồi cơm 🤤🤤,2
1320,tội_ác mỹ ko thể dung,0
2221,bơm tiền êm ...,0
1087,đầu detroit : become human,1
674,clip hay hơi gượng : (,2
...,...,...
1050,tao mơi biet . chu tao coi hinh nhu 5.6 lan ma...,1
1281,lính mỹ ác_độc,0
891,all hoàn_thiện ma_trận,1
1567,nhân chiến_tranh cướp đi mong hoà_bình mãi_mãi,1


In [13]:
label_encoder = preprocessing.LabelEncoder()
train_df['Label'] = label_encoder.fit_transform(train_df['Label'])

In [14]:
test_df['Label'] = label_encoder.transform(test_df['Label'])

In [15]:
class PadTransform(torch.nn.Module):
    """Pad tensor to a fixed length with given padding value.
    :param max_length: Maximum length to pad to
    :type max_length: int
    :param pad_value: Value to pad the tensor with
    :type pad_value: bool
    """

    def __init__(self, max_length: int, pad_value: int) -> None:
        super().__init__()
        self.max_length = max_length
        self.pad_value = float(pad_value)
        
    def forward(self, x: Tensor) -> Tensor:
        """
        :param x: The tensor to pad
        :type x: Tensor
        :return: Tensor padded up to max_length with pad_value
        :rtype: Tensor
        """
        max_encoded_length = x.size(-1)
        if max_encoded_length < self.max_length:
            pad_amount = self.max_length - max_encoded_length
            x = torch.nn.functional.pad(x, (0, pad_amount), value=self.pad_value)
        return x

In [16]:
padding_idx = 1
bos_idx = 0
eos_idx = 2
max_seq_len = 256

In [17]:
text_transform = torchtext.models.XLMR_LARGE_ENCODER.transform()

In [18]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        return (
            torch.tensor(text_transform(self.df.iloc[idx, 0])),
            torch.tensor(self.df.iloc[idx, 1])
        )

In [19]:
train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)

In [20]:
SMALL_BATCH_SIZE = 16
K = 2
LARGE_BATCH_SIZE = K * SMALL_BATCH_SIZE

In [21]:
def batch_collate_fn(batch):
    inp_list = list()
    tar_list = list()
    
    for sample in batch:
        inp_list.append(sample[0].tolist())
        tar_list.append(sample[1])
        
    padded_tensor = F.to_tensor(inp_list, padding_value=padding_idx)
    target_tensor = torch.stack(tar_list).type(torch.LongTensor)
    
    return padded_tensor, target_tensor

In [22]:
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=SMALL_BATCH_SIZE,
    shuffle=True,
    collate_fn=batch_collate_fn
)

In [23]:
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=SMALL_BATCH_SIZE,
    shuffle=False,
    collate_fn=batch_collate_fn
)

In [24]:
num_classes = 3
input_dim = 768

In [25]:
classifier_head = RobertaClassificationHead(num_classes=num_classes, input_dim=input_dim)
model = XLMR_BASE_ENCODER.get_model(head=classifier_head)

In [26]:
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [27]:
model.to(DEVICE)
pass  

In [28]:
learning_rate = 1.2e-5 
optim = AdamW(model.parameters(), lr=learning_rate)
criteria = torch.nn.CrossEntropyLoss()

## 3. Model Training and Evaluation

In [29]:
def train_step(input, target, small_batch_no):
    output = model(input)
    loss = criteria(output, target)
    loss.backward()
    
    if (small_batch_no + 1) % K == 0 or (small_batch_no + 1) == len(train_dataloader):
        optim.step()
        optim.zero_grad()
    
    return loss.item() / input.size(dim=0)

def evaluate():
    model.eval()
    total_loss = 0
    counter = 0
    
    with torch.no_grad():
        for i, batch in enumerate(test_dataloader):
            input = batch[0].clone().detach().to(DEVICE)
            output = model(input)
            target = batch[1].clone().detach().to(DEVICE)
            
            if i == 0:
                class_output = torch.argmax(output, dim=1)
                class_target = target
            else:
                class_output = torch.cat([class_output, torch.argmax(output, dim=1)])
                class_target = torch.cat([class_target, target])
            
            loss = criteria(output, target).item()
            total_loss += loss
            counter += input.size(dim=0)
            
            
        confusion_matrix = metrics.confusion_matrix(
            class_target.cpu().numpy().flatten(),
            class_output.cpu().numpy().flatten(),
            labels=[0, 1, 2]
        ) 
        classification_report = metrics.classification_report(
            class_target.cpu().numpy().flatten(),
            class_output.cpu().numpy().flatten(),
            labels=[0, 1, 2],
            output_dict=True
        )

    return (
        total_loss,
        counter,
        confusion_matrix,
        classification_report
    )

In [30]:
num_epochs = 15

In [31]:
def save_log(epoch, **kwargs):
    with open(f'log_{epoch}.pkl', 'wb') as f:
        pickle.dump(kwargs, f)

def save_model(message):
    torch.save(model.state_dict(), f'model_{message}.pth')
    print('Model saved successfully')

In [32]:
max_accuracy = float('-inf')
max_macro_f1 = float('-inf')
max_weighted_f1 = float('-inf')

In [33]:
for epoch in range(num_epochs):
    model.train()
    avg_training_losses = list()
    
    for small_batch_no, small_batch in enumerate(train_dataloader):
        input = small_batch[0].clone().detach().to(DEVICE)
        target = small_batch[1].clone().detach().to(DEVICE)
        avg_training_losses.append(
            train_step(input, target, small_batch_no)
        )
        torch.cuda.empty_cache()

    total_loss, counter, confusion_matrix, classification_report = evaluate()
    print(f'EPOCH {epoch}')
    print(f'Mean of avg_training_losses={np.mean(avg_training_losses)}')
    print(f'total_loss={total_loss}')
    print(f'counter={counter}')
    print(f'loss=total_loss/counter={total_loss/counter}')
    print(f'confusion_matrix=\n{confusion_matrix}')
    print('classification_report=')
    pprint.pprint(classification_report)
    
    save_log(
        epoch,
        avg_training_losses=avg_training_losses,
        total_loss=total_loss,
        counter=counter,
        loss=total_loss/counter,
        confusion_matrix=confusion_matrix,
        classification_report=classification_report
    )
    
    if classification_report['accuracy'] > max_accuracy:
        print(f'New max_accuracy')
        max_accuracy = classification_report['accuracy']
        max_accuracy_index = epoch
        save_model('max_accuracy')
        
    elif classification_report['macro avg']['f1-score'] > max_macro_f1:
        print(f'New max_macro_f1')
        max_macro_f1 = classification_report['macro avg']['f1-score']
        max_macro_f1_index = epoch
        save_model('max_macro_f1')
    
    elif classification_report['weighted avg']['f1-score'] > max_weighted_f1:
        print(f'New max_weighted_f1')
        max_weighted_f1 = classification_report['weighted avg']['f1-score']
        max_weighted_f1_index = epoch
        save_model('max_weighted_f1')
    
    elif epoch == num_epochs - 1:
        save_model(f'{epoch}_last')
    
    elif epoch % 40 == 0:
        save_model(f'{epoch}_checkpoint')
        
    else:
        continue

save_log(
    'post_train_info',
    max_accuracy=max_accuracy,
    max_accuracy_index=max_accuracy_index,
    max_macro_f1=max_macro_f1,
    max_macro_f1_index=max_macro_f1_index,
    max_weighted_f1=max_weighted_f1,
    max_weighted_f1_index=max_weighted_f1_index
)

  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


EPOCH 0
Mean of avg_training_losses=0.06784324568004636
total_loss=29.971278965473175
counter=454
loss=total_loss/counter=0.0660160329635973
confusion_matrix=
[[  0  86   0]
 [  0 238   0]
 [  0 130   0]]
classification_report=
{'0': {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0, 'support': 86},
 '1': {'f1-score': 0.6878612716763006,
       'precision': 0.5242290748898678,
       'recall': 1.0,
       'support': 238},
 '2': {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0, 'support': 130},
 'accuracy': 0.5242290748898678,
 'macro avg': {'f1-score': 0.22928709055876687,
               'precision': 0.17474302496328928,
               'recall': 0.3333333333333333,
               'support': 454},
 'weighted avg': {'f1-score': 0.36059687810343516,
                  'precision': 0.27481612295988667,
                  'recall': 0.5242290748898678,
                  'support': 454}}
New max_accuracy
Model saved successfully


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


EPOCH 1
Mean of avg_training_losses=0.06679463532497311
total_loss=29.739292919635773
counter=454
loss=total_loss/counter=0.06550505048377923
confusion_matrix=
[[  0  86   0]
 [  0 238   0]
 [  0 130   0]]
classification_report=
{'0': {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0, 'support': 86},
 '1': {'f1-score': 0.6878612716763006,
       'precision': 0.5242290748898678,
       'recall': 1.0,
       'support': 238},
 '2': {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0, 'support': 130},
 'accuracy': 0.5242290748898678,
 'macro avg': {'f1-score': 0.22928709055876687,
               'precision': 0.17474302496328928,
               'recall': 0.3333333333333333,
               'support': 454},
 'weighted avg': {'f1-score': 0.36059687810343516,
                  'precision': 0.27481612295988667,
                  'recall': 0.5242290748898678,
                  'support': 454}}
New max_macro_f1
Model saved successfully


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


EPOCH 2
Mean of avg_training_losses=0.06545887702302626
total_loss=27.91561597585678
counter=454
loss=total_loss/counter=0.0614881409159841
confusion_matrix=
[[  0  85   1]
 [  0 219  19]
 [  0 107  23]]
classification_report=
{'0': {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0, 'support': 86},
 '1': {'f1-score': 0.674884437596302,
       'precision': 0.5328467153284672,
       'recall': 0.9201680672268907,
       'support': 238},
 '2': {'f1-score': 0.2658959537572254,
       'precision': 0.5348837209302325,
       'recall': 0.17692307692307693,
       'support': 130},
 'accuracy': 0.5330396475770925,
 'macro avg': {'f1-score': 0.31359346378450914,
               'precision': 0.35591014541956656,
               'recall': 0.3656970480499892,
               'support': 454},
 'weighted avg': {'f1-score': 0.4299316522827295,
                  'precision': 0.4324942774649899,
                  'recall': 0.5330396475770925,
                  'support': 454}}
New max_accuracy
Model saved 



EPOCH 3
Mean of avg_training_losses=0.06134344881389574
total_loss=26.157998740673065
counter=454
loss=total_loss/counter=0.05761673731425785
confusion_matrix=
[[ 11  74   1]
 [  6 230   2]
 [  1 106  23]]
classification_report=
{'0': {'f1-score': 0.21153846153846154,
       'precision': 0.6111111111111112,
       'recall': 0.12790697674418605,
       'support': 86},
 '1': {'f1-score': 0.7098765432098766,
       'precision': 0.5609756097560976,
       'recall': 0.9663865546218487,
       'support': 238},
 '2': {'f1-score': 0.2948717948717949,
       'precision': 0.8846153846153846,
       'recall': 0.17692307692307693,
       'support': 130},
 'accuracy': 0.5814977973568282,
 'macro avg': {'f1-score': 0.405428933206711,
               'precision': 0.6855673684941977,
               'recall': 0.4237388694297039,
               'support': 454},
 'weighted avg': {'f1-score': 0.49664374077002565,
                  'precision': 0.663144825280852,
                  'recall': 0.58149779735682



EPOCH 4
Mean of avg_training_losses=0.05742001162068537
total_loss=24.284067302942276
counter=454
loss=total_loss/counter=0.05348913502850722
confusion_matrix=
[[ 32  43  11]
 [ 27 159  52]
 [  8  25  97]]
classification_report=
{'0': {'f1-score': 0.4183006535947712,
       'precision': 0.47761194029850745,
       'recall': 0.37209302325581395,
       'support': 86},
 '1': {'f1-score': 0.6838709677419355,
       'precision': 0.7004405286343612,
       'recall': 0.6680672268907563,
       'support': 238},
 '2': {'f1-score': 0.6689655172413793,
       'precision': 0.60625,
       'recall': 0.7461538461538462,
       'support': 130},
 'accuracy': 0.6343612334801763,
 'macro avg': {'f1-score': 0.5903790461926953,
               'precision': 0.5947674896442895,
               'recall': 0.5954380321001388,
               'support': 454},
 'weighted avg': {'f1-score': 0.629296616240331,
                  'precision': 0.6312598517194926,
                  'recall': 0.6343612334801763,
        



EPOCH 5
Mean of avg_training_losses=0.05383527384558964
total_loss=22.581658840179443
counter=454
loss=total_loss/counter=0.049739336652377626
confusion_matrix=
[[ 46  37   3]
 [ 35 175  28]
 [ 11  36  83]]
classification_report=
{'0': {'f1-score': 0.5168539325842696,
       'precision': 0.5,
       'recall': 0.5348837209302325,
       'support': 86},
 '1': {'f1-score': 0.720164609053498,
       'precision': 0.7056451612903226,
       'recall': 0.7352941176470589,
       'support': 238},
 '2': {'f1-score': 0.680327868852459,
       'precision': 0.7280701754385965,
       'recall': 0.6384615384615384,
       'support': 130},
 'accuracy': 0.6696035242290749,
 'macro avg': {'f1-score': 0.6391154701634089,
               'precision': 0.6445717789096397,
               'recall': 0.63621312567961,
               'support': 454},
 'weighted avg': {'f1-score': 0.6702450178585889,
                  'precision': 0.6731116105597232,
                  'recall': 0.6696035242290749,
                



EPOCH 6
Mean of avg_training_losses=0.05042059987522008
total_loss=22.005897223949432
counter=454
loss=total_loss/counter=0.04847113925980051
confusion_matrix=
[[ 50  29   7]
 [ 41 167  30]
 [ 14  28  88]]
classification_report=
{'0': {'f1-score': 0.5235602094240838,
       'precision': 0.47619047619047616,
       'recall': 0.5813953488372093,
       'support': 86},
 '1': {'f1-score': 0.722943722943723,
       'precision': 0.7455357142857143,
       'recall': 0.7016806722689075,
       'support': 238},
 '2': {'f1-score': 0.6901960784313725,
       'precision': 0.704,
       'recall': 0.676923076923077,
       'support': 130},
 'accuracy': 0.6718061674008811,
 'macro avg': {'f1-score': 0.645566670266393,
               'precision': 0.6419087301587302,
               'recall': 0.6533330326763979,
               'support': 454},
 'weighted avg': {'f1-score': 0.6757979609408715,
                  'precision': 0.6826208831550241,
                  'recall': 0.6718061674008811,
             



EPOCH 7
Mean of avg_training_losses=0.04773999849737388
total_loss=23.039173901081085
counter=454
loss=total_loss/counter=0.050747079077271114
confusion_matrix=
[[ 54  26   6]
 [ 46 157  35]
 [ 16  28  86]]
classification_report=
{'0': {'f1-score': 0.5346534653465347,
       'precision': 0.46551724137931033,
       'recall': 0.627906976744186,
       'support': 86},
 '1': {'f1-score': 0.6993318485523385,
       'precision': 0.7440758293838863,
       'recall': 0.6596638655462185,
       'support': 238},
 '2': {'f1-score': 0.669260700389105,
       'precision': 0.6771653543307087,
       'recall': 0.6615384615384615,
       'support': 130},
 'accuracy': 0.6541850220264317,
 'macro avg': {'f1-score': 0.6344153380959927,
               'precision': 0.6289194750313017,
               'recall': 0.6497031012762887,
               'support': 454},
 'weighted avg': {'f1-score': 0.6595265837573617,
                  'precision': 0.6721498374779246,
                  'recall': 0.6541850220264317



EPOCH 8
Mean of avg_training_losses=0.04578248964764221
total_loss=25.22150269150734
counter=454
loss=total_loss/counter=0.055553970686139514
confusion_matrix=
[[ 47  24  15]
 [ 25 156  57]
 [ 11  17 102]]
classification_report=
{'0': {'f1-score': 0.5562130177514794,
       'precision': 0.5662650602409639,
       'recall': 0.5465116279069767,
       'support': 86},
 '1': {'f1-score': 0.7172413793103449,
       'precision': 0.7918781725888325,
       'recall': 0.6554621848739496,
       'support': 238},
 '2': {'f1-score': 0.6710526315789473,
       'precision': 0.5862068965517241,
       'recall': 0.7846153846153846,
       'support': 130},
 'accuracy': 0.6718061674008811,
 'macro avg': {'f1-score': 0.6481690095469239,
               'precision': 0.6481167097938402,
               'recall': 0.6621963991321036,
               'support': 454},
 'weighted avg': {'f1-score': 0.6735123566250055,
                  'precision': 0.6902482308559232,
                  'recall': 0.6718061674008811



EPOCH 9
Mean of avg_training_losses=0.04483857091341974
total_loss=20.90896213054657
counter=454
loss=total_loss/counter=0.046054982666402136
confusion_matrix=
[[ 51  33   2]
 [ 34 185  19]
 [ 14  36  80]]
classification_report=
{'0': {'f1-score': 0.5513513513513513,
       'precision': 0.5151515151515151,
       'recall': 0.5930232558139535,
       'support': 86},
 '1': {'f1-score': 0.7520325203252034,
       'precision': 0.7283464566929134,
       'recall': 0.7773109243697479,
       'support': 238},
 '2': {'f1-score': 0.6926406926406927,
       'precision': 0.7920792079207921,
       'recall': 0.6153846153846154,
       'support': 130},
 'accuracy': 0.6960352422907489,
 'macro avg': {'f1-score': 0.6653415214390824,
               'precision': 0.6785257265884068,
               'recall': 0.661906265189439,
               'support': 454},
 'weighted avg': {'f1-score': 0.6970115552795256,
                  'precision': 0.7062109780300588,
                  'recall': 0.6960352422907489,



EPOCH 10
Mean of avg_training_losses=0.04244635685221755
total_loss=20.25366058945656
counter=454
loss=total_loss/counter=0.044611587201446166
confusion_matrix=
[[ 46  35   5]
 [ 25 185  28]
 [  9  24  97]]
classification_report=
{'0': {'f1-score': 0.5542168674698795,
       'precision': 0.575,
       'recall': 0.5348837209302325,
       'support': 86},
 '1': {'f1-score': 0.7676348547717842,
       'precision': 0.7581967213114754,
       'recall': 0.7773109243697479,
       'support': 238},
 '2': {'f1-score': 0.7461538461538462,
       'precision': 0.7461538461538462,
       'recall': 0.7461538461538462,
       'support': 130},
 'accuracy': 0.7224669603524229,
 'macro avg': {'f1-score': 0.6893351894651699,
               'precision': 0.6931168558217738,
               'recall': 0.6861161638179422,
               'support': 454},
 'weighted avg': {'f1-score': 0.7210567093350094,
                  'precision': 0.720045858308659,
                  'recall': 0.7224669603524229,
           



EPOCH 11
Mean of avg_training_losses=0.040667671492399526
total_loss=21.138729572296143
counter=454
loss=total_loss/counter=0.04656107835307521
confusion_matrix=
[[ 43  40   3]
 [ 15 203  20]
 [  8  41  81]]
classification_report=
{'0': {'f1-score': 0.5657894736842106,
       'precision': 0.6515151515151515,
       'recall': 0.5,
       'support': 86},
 '1': {'f1-score': 0.7777777777777778,
       'precision': 0.7147887323943662,
       'recall': 0.8529411764705882,
       'support': 238},
 '2': {'f1-score': 0.6923076923076924,
       'precision': 0.7788461538461539,
       'recall': 0.6230769230769231,
       'support': 130},
 'accuracy': 0.7202643171806168,
 'macro avg': {'f1-score': 0.6786249812565602,
               'precision': 0.7150500125852238,
               'recall': 0.6586726998491704,
               'support': 454},
 'weighted avg': {'f1-score': 0.7131475899734653,
                  'precision': 0.7211454214541018,
                  'recall': 0.7202643171806168,
           



EPOCH 12
Mean of avg_training_losses=0.038782642707664364
total_loss=22.85859540104866
counter=454
loss=total_loss/counter=0.05034932907719969
confusion_matrix=
[[ 63  17   6]
 [ 50 162  26]
 [ 16  24  90]]
classification_report=
{'0': {'f1-score': 0.586046511627907,
       'precision': 0.4883720930232558,
       'recall': 0.7325581395348837,
       'support': 86},
 '1': {'f1-score': 0.7346938775510204,
       'precision': 0.7980295566502463,
       'recall': 0.680672268907563,
       'support': 238},
 '2': {'f1-score': 0.7142857142857143,
       'precision': 0.7377049180327869,
       'recall': 0.6923076923076923,
       'support': 130},
 'accuracy': 0.6938325991189427,
 'macro avg': {'f1-score': 0.678342034488214,
               'precision': 0.6747021892354298,
               'recall': 0.7018460335833797,
               'support': 454},
 'weighted avg': {'f1-score': 0.7006922592825677,
                  'precision': 0.7220984005000461,
                  'recall': 0.6938325991189427,




EPOCH 13
Mean of avg_training_losses=0.03755047177481982
total_loss=22.125170558691025
counter=454
loss=total_loss/counter=0.04873385585614763
confusion_matrix=
[[ 55  26   5]
 [ 32 179  27]
 [ 10  25  95]]
classification_report=
{'0': {'f1-score': 0.6010928961748634,
       'precision': 0.5670103092783505,
       'recall': 0.6395348837209303,
       'support': 86},
 '1': {'f1-score': 0.7649572649572649,
       'precision': 0.7782608695652173,
       'recall': 0.7521008403361344,
       'support': 238},
 '2': {'f1-score': 0.7392996108949417,
       'precision': 0.7480314960629921,
       'recall': 0.7307692307692307,
       'support': 130},
 'accuracy': 0.724669603524229,
 'macro avg': {'f1-score': 0.7017832573423567,
               'precision': 0.6977675583021866,
               'recall': 0.7074683182754318,
               'support': 454},
 'weighted avg': {'f1-score': 0.7265699725709466,
                  'precision': 0.7295882556005481,
                  'recall': 0.724669603524229,



EPOCH 14
Mean of avg_training_losses=0.038079933513892675
total_loss=21.02610546350479
counter=454
loss=total_loss/counter=0.04631300762886518
confusion_matrix=
[[ 52  31   3]
 [ 27 194  17]
 [ 11  41  78]]
classification_report=
{'0': {'f1-score': 0.5909090909090908,
       'precision': 0.5777777777777777,
       'recall': 0.6046511627906976,
       'support': 86},
 '1': {'f1-score': 0.7698412698412698,
       'precision': 0.7293233082706767,
       'recall': 0.8151260504201681,
       'support': 238},
 '2': {'f1-score': 0.6842105263157895,
       'precision': 0.7959183673469388,
       'recall': 0.6,
       'support': 130},
 'accuracy': 0.7136563876651982,
 'macro avg': {'f1-score': 0.68165362902205,
               'precision': 0.701006484465131,
               'recall': 0.6732590710702886,
               'support': 454},
 'weighted avg': {'f1-score': 0.7114268115891116,
                  'precision': 0.7196855154458415,
                  'recall': 0.7136563876651982,
               

In [34]:
max_accuracy
# max_accuracy_index
# max_macro_f1
# max_macro_f1_index
# max_weighted_f1
# max_weighted_f1_index

0.724669603524229

*Maximum recorded accuracy = 89.7%*

### 4. Demonstration

In [35]:
def prepare_model():
    num_classes = 3
    input_dim = 768

    classifier_head = RobertaClassificationHead(num_classes=num_classes, input_dim=input_dim)
    model = XLMR_BASE_ENCODER.get_model(head=classifier_head)
    
    DEMO_MODEL_PATH = 'model_max_weighted_f1.pth'
    model.load_state_dict(torch.load(DEMO_MODEL_PATH))
    model.to(DEVICE)
    
    print(f'Loaded model to [{DEVICE}] in [{DEMO_MODEL_PATH}]')
    return model

In [36]:
def prepare_text_transform():
    text_transform = torchtext.models.XLMR_LARGE_ENCODER.transform()
    return text_transform

In [37]:
def predict(sentence, model, text_transform, label_map):
    transformed_text = text_transform(sentence)
    out = model(torch.tensor([transformed_text]).to(DEVICE))
    return label_map[torch.argmax(out).item()]

In [38]:
label_map = {
    0.0: 'negative',
    1.0: 'neutral',
    2.0: 'positive'
}

In [39]:
model = prepare_model()
text_transform = prepare_text_transform()

Loaded model to [cpu] in [model_max_weighted_f1.pth]


In [55]:
sample_text = 'ngu vl'
predict(sample_text, model, text_transform, label_map)

'neutral'