# DLNLP - Model Training & Testing

#### 0. Setup

In [1]:
# Mount Google Drive inside the Colab runtime; the project paths used below
# all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
"""
 @Time : 05/05/2024 20:31
 @Author : SN23064343
 """

# Import Dependencies
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer,AutoModel
from transformers import AdamW

import sys
sys.path.append('/content/drive/MyDrive/DLNLP_24_SN23064343')
from A.Model import HateDetector
from A.Dataset import MyDataset

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device  # bare expression so the notebook displays the selected device

device(type='cuda')

In [4]:
# Make path to save model
out_path = '/content/drive/MyDrive/DLNLP_24_SN23064343/B/result'
# exist_ok=True creates the directory (and any missing parents) only when
# needed, without a separate check-then-create step.
os.makedirs(out_path, exist_ok=True)

In [20]:
# Define Hyperparameters
batch_size = 32  # samples per mini-batch for every DataLoader below
epochs = 10  # number of full passes over the training set
# NOTE(review): the optimizer is built over model.parameters(), which includes
# the BERTweet encoder; 1e-3 is unusually high for fine-tuning a pretrained
# transformer -- confirm this value is intended.
lr = 0.001
# CrossEntropyLoss applies log-softmax internally, so it expects raw
# (unnormalised) class scores from the model.
criterion = nn.CrossEntropyLoss().to(device)

In [14]:
from torch.utils.data import Dataset

# Prepare Dataset (with defined tokenizer and cleaned data)
class MyDataset(Dataset):
    """Map-style dataset that tokenizes one cleaned tweet per item.

    Args:
        data_csv: DataFrame with a ``cleaned`` text column and a ``class``
            label column.
        tokenizer: Object exposing a Hugging Face style ``encode_plus``.
        max_length: Fixed length every sequence is padded/truncated to.
            Defaults to 64, the value that was previously hard-coded.
    """

    def __init__(self, data_csv, tokenizer, max_length=64):
        self.df = data_csv
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, index):
        """Return ``(input_ids, token_type_ids, attention_mask, label)`` tensors."""
        row = self.df.iloc[index]
        tweet, label = row['cleaned'], row['class']
        # A tweet that became empty during cleaning is read back from CSV as
        # NaN (a float), which the tokenizer cannot encode; treat it as "".
        tweet = "" if pd.isna(tweet) else str(tweet)
        tokenized_dict = self.tokenizer.encode_plus(tweet,
                                                    max_length=self.max_length,
                                                    truncation=True,
                                                    padding='max_length',
                                                    return_token_type_ids=True)
        input_ids = torch.tensor(tokenized_dict['input_ids'])
        token_type_ids = torch.tensor(tokenized_dict['token_type_ids'])
        attention_mask = torch.tensor(tokenized_dict['attention_mask'])
        # CrossEntropyLoss needs integer class indices, so force int64.
        label = torch.tensor(int(label), dtype=torch.long)
        return input_ids, token_type_ids, attention_mask, label

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

In [15]:
# Prepare Data for Training (8:1:1)
print("Data Splitting...")
tweets = pd.read_csv('/content/drive/MyDrive/DLNLP_24_SN23064343/Datasets/clean.csv')
# A fixed seed makes the split reproducible between runs, and stratifying on
# the label column keeps the class ratio the same in every partition.
split_seed = 42
train_df, holdout_df = train_test_split(tweets,
                                        test_size=0.2,
                                        shuffle=True,
                                        random_state=split_seed,
                                        stratify=tweets['class'])
val_df, test_df = train_test_split(holdout_df,
                                   test_size=0.5,
                                   shuffle=True,
                                   random_state=split_seed,
                                   stratify=holdout_df['class'])

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
print("Loading Training Data...")
train_data = MyDataset(train_df,tokenizer)
train_dataloader = DataLoader(dataset=train_data,
                              batch_size=batch_size,
                              shuffle=True)
print("Size of Training Data: ",len(train_data))

print("Loading Validation Data...")
val_data = MyDataset(val_df,tokenizer)
# Evaluation data does not need shuffling; a fixed order keeps validation
# results comparable from epoch to epoch.
val_dataloader = DataLoader(dataset=val_data,
                            batch_size=batch_size,
                            shuffle=False)
print("Size of Validation Data: ",len(val_data))

Data Splitting...


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


Loading Training Data...
Size of Training Data:  19826
Loading Validation Data...
Size of Validation Data:  2478


In [17]:
# Model Architecture
# Classifier_1
class HateDetector(nn.Module):
    """Classifier_1: a transformer encoder followed by an MLP head.

    Args:
        encoder: Pretrained encoder (BERTweet in this notebook) whose pooled
            output has 768 features.
        label_num: Number of target classes.

    ``forward`` returns raw, unnormalised class scores (logits). The training
    loss ``nn.CrossEntropyLoss`` applies log-softmax itself, so adding a
    softmax layer here would normalise twice and flatten the gradients.
    ``argmax`` over the logits gives the same prediction a softmax would; call
    ``torch.softmax(out, dim=1)`` when probabilities are needed.
    """

    def __init__(self, encoder, label_num):
        super().__init__()
        self.label_num = label_num
        # 1. BERTweet is used as the feature extractor for embedding.
        self.feature = encoder

        # 2. Define the classifier. The layer indices (0, 3, 6) are kept so
        #    previously saved state_dict checkpoints still load.
        self.classifier = nn.Sequential(
            nn.Linear(768, 256),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Linear(128, self.label_num),
        )

    def forward(self, input_ids, token_type_ids, attention_mask, labels=None):
        """Return one row of ``label_num`` logits per input sequence.

        ``labels`` is accepted for call-site compatibility and is not used.
        """
        outputs = self.feature(input_ids=input_ids,
                               token_type_ids=token_type_ids,
                               attention_mask=attention_mask)

        # Index 1 of the encoder output is the pooled sentence representation
        # (one 768-d vector per sequence, derived from the first token).
        pooler_output = outputs[1]

        return self.classifier(pooler_output)
    
# Classifier_2
class HateDetector_CNN(nn.Module):
    """Classifier_2: a CNN head over the stacked hidden states of the encoder.

    Args:
        encoder: Pretrained encoder (BERTweet in this notebook) with hidden
            size 768 that can return all hidden states.
        label_num: Number of target classes.
        seq_len: Token length of every input sequence. Defaults to 64, the
            padding length used by the dataset in this notebook.
        num_layers: Number of hidden-state tensors the encoder returns
            (13 here: the embedding output plus 12 layers).

    ``forward`` returns raw logits; ``nn.CrossEntropyLoss`` applies
    log-softmax itself, so no softmax layer is added here.
    """

    def __init__(self, encoder, label_num, seq_len=64, num_layers=13):
        super().__init__()
        if seq_len < 3:
            raise ValueError("seq_len must be at least 3 for the 3x3 max-pool")
        self.label_num = label_num
        # 1. BERTweet is used as the feature extractor for embedding.
        self.feature = encoder

        # Shape bookkeeping for an input of [batch, num_layers, seq_len, 768]:
        #   Conv2d (kernel (3, 768), padding 1) -> [batch, num_layers, seq_len, 3]
        #   MaxPool2d (kernel 3, stride 1)      -> [batch, num_layers, seq_len - 2, 1]
        #   Flatten                             -> [batch, num_layers * (seq_len - 2)]
        flat_features = num_layers * (seq_len - 2)

        # 2. Define a CNN classifier. The Linear layer stays at index 8 so
        #    the state_dict key names are unchanged.
        self.classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Conv2d(in_channels=num_layers, out_channels=num_layers,
                      kernel_size=(3, 768), padding=1),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.MaxPool2d(kernel_size=3, stride=1),
            nn.Dropout(0.1),
            nn.Flatten(),
            nn.Dropout(0.1),
            nn.Linear(flat_features, self.label_num),
        )

    def forward(self, input_ids, token_type_ids, attention_mask, labels=None):
        """Return one row of ``label_num`` logits per input sequence.

        ``labels`` is accepted for call-site compatibility and is not used.
        """
        outputs = self.feature(input_ids=input_ids,
                               token_type_ids=token_type_ids,
                               attention_mask=attention_mask,
                               output_hidden_states=True)

        # Tuple-unpacking a dict-like ModelOutput yields its keys, not its
        # tensors, so read the hidden states by attribute and fall back to
        # position 2 when the encoder returns a plain tuple.
        all_layers = getattr(outputs, "hidden_states", None)
        if all_layers is None:
            all_layers = outputs[2]

        # Stack the per-layer [batch, seq_len, 768] tensors into
        # [batch, num_layers, seq_len, 768] so layers act as conv channels.
        x = torch.stack(tuple(all_layers), dim=1)

        return self.classifier(x)

In [None]:
# Initialize Model
# Each classifier wraps its own encoder instance. Sharing one encoder object
# would mean that training Classifier_1 also changes the weights Classifier_2
# starts from, so the two could not be compared fairly.
encoder = AutoModel.from_pretrained("vinai/bertweet-base")
print("Loading Model...")
model = HateDetector(encoder,2)
print("Classifier_1 Loaded!")

encoder2 = AutoModel.from_pretrained("vinai/bertweet-base")
model2 = HateDetector_CNN(encoder2,2)
print("Classifier_2 Loaded!")

#### 1. Training

In [26]:
import warnings
# Silences every warning for the rest of the session.
# NOTE(review): this also hides warnings that can point at real problems
# (e.g. scikit-learn's undefined-metric warnings) -- consider filtering by
# category instead.
warnings.filterwarnings('ignore')

In [None]:
from transformers import AdamW

# Move model to GPU
model = model.to(device)
# Define the Optimizer
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW; eps
# and weight_decay are set to the defaults transformers.AdamW used.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)


def _run_epoch(model, dataloader, criterion, device, optimizer=None, desc=None):
    """Run one pass over ``dataloader``; return ``(mean_loss, (acc, pre, rec, f1))``.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch; otherwise it is put in eval mode and gradients are
    disabled. Metrics are computed once over all predictions of the pass:
    precision, recall and F1 are not additive across batches, so averaging
    per-batch values gives a biased figure.
    """
    training = optimizer is not None
    model.train(training)
    batch_losses, y_true, y_pred = [], [], []
    with torch.set_grad_enabled(training):
        for step, batch in enumerate(tqdm(dataloader, desc=desc), start=1):
            # Push the batch to the target device
            input_ids, token_type_ids, attention_mask, labels = [t.to(device) for t in batch]
            if training:
                optimizer.zero_grad()
            outputs = model(input_ids=input_ids.long(),
                            token_type_ids=token_type_ids.long(),
                            attention_mask=attention_mask.long())

            loss = criterion(outputs, labels)
            batch_losses.append(loss.item())

            if training:
                loss.backward()
                # Clip the gradient norm to 1.0 to guard against exploding gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                if step % 500 == 0:
                    print("loss:"+str(np.array(batch_losses).mean()))

            y_true.extend(labels.cpu().tolist())
            y_pred.extend(outputs.argmax(dim=1).cpu().tolist())

    # zero_division=0 returns 0 (the value sklearn already falls back to)
    # without emitting a warning when a class is never predicted.
    metrics = (accuracy_score(y_true, y_pred),
               precision_score(y_true, y_pred, zero_division=0),
               recall_score(y_true, y_pred, zero_division=0),
               f1_score(y_true, y_pred, zero_division=0))
    return np.array(batch_losses).mean(), metrics


print("Start Training...")
# Per-epoch history of losses and metrics
train_epoch_loss,val_epoch_loss = [],[]
train_acc,train_pre,train_rec,train_f1 = [],[],[],[]
val_acc,val_pre,val_rec,val_f1 = [],[],[],[]

for epoch in range(epochs):
    # ========================================
    #               Training
    # ========================================
    print(f"\nEpoch: {epoch+1}")
    avg_loss, (avg_acc, avg_pre, avg_rec, avg_f1) = _run_epoch(
        model, train_dataloader, criterion, device, optimizer=optimizer)
    train_epoch_loss.append(avg_loss)
    print(f'Training Loss: {avg_loss:.4f}')

    train_acc.append(avg_acc)
    train_pre.append(avg_pre)
    train_rec.append(avg_rec)
    train_f1.append(avg_f1)
    print(f'Accuracy: {avg_acc:.4f}, Precision: {avg_pre:.4f}, Recall: {avg_rec:.4f}, F1 Score: {avg_f1:.4f}')

    # Save a checkpoint every second epoch
    if (epoch+1)%2==0:
        save_path = out_path+"/Epoch_"+str(epoch+1)+"_model.pth"
        torch.save(model.state_dict(),save_path)

    # ========================================
    #               Validation
    # ========================================
    print('Evaluate on Validation set...')
    val_avg_loss, (avg_acc, avg_pre, avg_rec, avg_f1) = _run_epoch(
        model, val_dataloader, criterion, device, desc="Evaluating")
    val_epoch_loss.append(val_avg_loss)
    print(f'Validation Loss: {val_avg_loss:.4f}')

    val_acc.append(avg_acc)
    val_pre.append(avg_pre)
    val_rec.append(avg_rec)
    val_f1.append(avg_f1)
    print(f'Accuracy: {avg_acc:.4f}, Precision: {avg_pre:.4f}, Recall: {avg_rec:.4f}, F1 Score: {avg_f1:.4f}')


#### 2. Testing

In [None]:
# Load Data for Testing
print("Loading Testing Data...")
test_data = MyDataset(test_df,tokenizer)
# Evaluation data is not shuffled: the order has no benefit at test time and
# a fixed order keeps the reported numbers repeatable between runs.
test_dataloader = DataLoader(dataset=test_data,
                             batch_size=batch_size,
                             shuffle=False)
print("Size of Testing Data: ",len(test_data))

# ========================================
#               Testing
# ========================================
def test(model,test_dataloader):
    """Evaluate ``model`` on ``test_dataloader`` and print/return its metrics.

    Args:
        model: Module called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` keyword arguments, returning per-class scores.
        test_dataloader: Iterable of ``(input_ids, token_type_ids,
            attention_mask, labels)`` tensor batches.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` computed once over all
        test predictions. Precision, recall and F1 are not additive across
        batches, so averaging per-batch values would give a biased figure.
    """
    # Run on the device the model already lives on, so inputs and weights can
    # never end up on different devices; fall back to CUDA detection only
    # for a parameter-free model.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Start Testing...")
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Evaluating"):
            input_ids, token_type_ids, attention_mask, labels = [t.to(device) for t in batch]
            outputs = model(input_ids=input_ids.long(),
                            token_type_ids=token_type_ids.long(),
                            attention_mask=attention_mask.long())

            y_true.extend(labels.cpu().tolist())
            y_pred.extend(outputs.argmax(dim=1).cpu().tolist())

    # zero_division=0 returns 0 (the value sklearn already falls back to)
    # without emitting a warning when a class is never predicted.
    avg_acc = accuracy_score(y_true, y_pred)
    avg_pre = precision_score(y_true, y_pred, zero_division=0)
    avg_rec = recall_score(y_true, y_pred, zero_division=0)
    avg_f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f'Accuracy: {avg_acc:.4f}, Precision: {avg_pre:.4f}, Recall: {avg_rec:.4f}, F1 Score: {avg_f1:.4f}')
    return avg_acc, avg_pre, avg_rec, avg_f1


# Evaluate the trained Classifier_1 on the held-out test split
test(model,test_dataloader)