### References:
* https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py
* https://peterbloem.nl/blog/transformers
* https://arxiv.org/pdf/1810.04805
* https://www.kaggle.com/code/chayan8/sentiment-analysis-using-bert-pytorch

## 1. Installing new dependencies 

In [None]:
! pip install transformers datasets
! pip install pytorch-transformers
! pip install pandas seaborn matplotlib numpy

## 2. Importing Libraries and Load Data

In [1]:
# import libraries
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
from torch.utils.data import TensorDataset
import torch
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


In [2]:
# Load the GoEmotions dataset
datasets = load_dataset("go_emotions")

# Print some information about the dataset
print(datasets)
train_df = pd.DataFrame(datasets['train'])
print(train_df.head())

print("Train dataset's shape:",datasets['train'].shape)
print("Validation dataset's shape:",datasets['validation'].shape)
print("Test dataset's shape:",datasets['test'].shape)


DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})
                                                text labels       id
0  My favourite food is anything I didn't have to...   [27]  eebbqej
1  Now if he does off himself, everyone will thin...   [27]  ed00q6i
2                     WHY THE FUCK IS BAYLESS ISOING    [2]  eezlygj
3                        To make her feel threatened   [14]  ed7ypvh
4                             Dirty Southern Wankers    [3]  ed0bdzj
Train dataset's shape: (43410, 3)
Validation dataset's shape: (5426, 3)
Test dataset's shape: (5427, 3)


In [3]:
# OPTIONAL
# checking if my GPU is ready to be used for training

gpu_available = torch.cuda.is_available()

if gpu_available:
    print("CUDA is available. GPU detected.")
    # Get the number of GPUs
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs: {num_gpus}")
    # Print details about each GPU
    for i in range(num_gpus):
        gpu_name = torch.cuda.get_device_name(i)
        print(f"GPU {i}: {gpu_name}")
        gpu_properties = torch.cuda.get_device_properties(i)
        print(f"Properties of GPU {i}: {gpu_properties}")
else:
    print("CUDA is not available. No GPU detected.")

CUDA is available. GPU detected.
Number of GPUs: 1
GPU 0: NVIDIA GeForce RTX 3060 Laptop GPU
Properties of GPU 0: _CudaDeviceProperties(name='NVIDIA GeForce RTX 3060 Laptop GPU', major=8, minor=6, total_memory=6143MB, multi_processor_count=30)


## 3. Preprocessing Data

In [None]:
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dataset
def tokenize_function(data):
    return tokenizer(data['text'], padding=True, truncation=True, return_tensors="pt")

tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Verify the columns
print(tokenized_datasets)


DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5427
    })
})


In [None]:
# Binarize the labels
def binarize_labels(examples):
    binarized_labels = torch.zeros((len(examples["labels"]), len(dataset["train"].features["labels"].feature.names)), dtype=torch.float)
    for i, labels in enumerate(examples["labels"]):
        binarized_labels[i, labels] = 1.0
    examples["labels"] = binarized_labels
    return examples

tokenized_datasets = tokenized_datasets.map(binarize_labels, batched=True)


In [None]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

tokenized_datasets.set_format("torch")
data_collator = DataCollatorWithPadding(tokenizer)

# Create DataLoaders
train_dataloader = DataLoader(tokenized_datasets['train'], shuffle=True, batch_size=16, collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=16, collate_fn=data_collator)

In [None]:
import torch
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):
    def __init__(self, num_classes):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(768, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_output = self.softmax(linear_output)
        return final_output

# Instantiate the model
model = BertClassifier(num_classes=28)  # GoEmotions has 28 emotion labels


In [None]:
from tqdm import tqdm
import torch.nn.functional as F
CUDA_LAUNCH_BLOCKING=1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs} completed.")

print("Training completed.")



## 3. Preprocessing and training BERT
### Overview steps:
1. Encoding label into hot encoders 
2. Convert dataframe dictionary into dataframe using pandas
3. Import BERT tokenizer to tokenize/encode the text 
5. Created tensor dataset with input ids (originally text), attention masks (from tokenizer/encoder), and label tensors (originally labels) 
6. Importing pre-trained pretrained BERT model to train machine
7. Creatin optimizer and scheduler for training 
8. Setting up and checking GPU for training
9. Train model and evaluate every epoch

### Tokenize and Encode
#### Description:
BERT model  

In [4]:
# importing 
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True
)

In [5]:
df_train = pd.DataFrame(datasets['train'])
df_validation = pd.DataFrame(datasets['validation'])
df_test = pd.DataFrame(datasets['test'])

# print(df_train['labels'])

In [6]:
type_of_emotion = datasets['train'].features['labels'].feature.names

# Function to encode numeric labels to multi-hot vectors 
def encode_labels(labels):
    encoding = [0] * len(type_of_emotion)
    for label in labels:
        encoding[label] = 1
    return encoding

# Apply the encoding labels to the DataFrame
df_train['encoded_labels'] = df_train['labels'].apply(encode_labels)
df_validation['encoded_labels'] = df_validation['labels'].apply(encode_labels)
#df_test['encoded_labels'] = df_test['labels'].apply(encode_labels)

# Convert the labels to a list of lists
labels_train_list = df_train['encoded_labels'].tolist()
labels_validation_list = df_validation['encoded_labels'].tolist()
#labels_test_list = df_test['encoded_labels'].tolist()

# Convert to PyTorch tensors
labels_train_tensor = torch.tensor(labels_train_list, dtype=torch.float32)
labels_validation_tensor = torch.tensor(labels_validation_list, dtype=torch.float32)
#labels_test_tensor = torch.tensor(labels_test_list, dtype=torch.float32)

# Print the resulting tensor to ensure correctness
print("\nTrain Labels:\n", df_train['labels'])
print("\nTrain Labels Tensor:\n", labels_train_tensor)
print("\nValidation Labels Tensor:\n", labels_validation_tensor)
#print("\nTest Labels Tensor:\n", labels_test_tensor)




Train Labels:
 0        [27]
1        [27]
2         [2]
3        [14]
4         [3]
         ... 
43405    [18]
43406     [6]
43407     [3]
43408    [13]
43409    [17]
Name: labels, Length: 43410, dtype: object

Train Labels Tensor:
 tensor([[0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

Validation Labels Tensor:
 tensor([[0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


In [7]:
encoded_data_train = tokenizer.batch_encode_plus(
    df_train.text.values,
    add_special_tokens=True, # addin [CLS] and [SEP]
    return_attention_mask=True, # changr input to 1 with actual words and 0 to none
    pad_to_max_length=True, 
    max_length=	512,
    truncation=True,  # length that is longer than max_length will be trucated
    return_tensors='pt' # returning tensor in pytorch form
)

encoded_data_val = tokenizer.batch_encode_plus(
    df_validation.text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=	512,
    truncation=True,  
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']




In [8]:
# turning each dataset into tensor dataset to be processed

dataset_train = TensorDataset(input_ids_train, 
                              attention_masks_train,
                              labels_train_tensor)

dataset_val = TensorDataset(input_ids_val, 
                            attention_masks_val,
                           labels_validation_tensor)

dataset_train.tensors

(tensor([[ 101, 2026, 8837,  ...,    0,    0,    0],
         [ 101, 2085, 2065,  ...,    0,    0,    0],
         [ 101, 2339, 1996,  ...,    0,    0,    0],
         ...,
         [ 101, 2054, 2024,  ...,    0,    0,    0],
         [ 101, 2062, 2066,  ...,    0,    0,    0],
         [ 101, 5959, 1996,  ...,    0,    0,    0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([[0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]))

In [9]:
from transformers import BertForSequenceClassification

# passing in bert pretrained classification model to fine tune

model = BertForSequenceClassification.from_pretrained(
                                      'bert-base-uncased', 
                                      num_labels = len(type_of_emotion),
                                      output_attentions = False,
                                      output_hidden_states = False
                                     )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# creating data loaders
# batch size reference https://huggingface.co/docs/transformers/model_doc/bert
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 16

dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    num_workers=4, # subprocesses to use for data loading
    batch_size=batch_size,
    pin_memory=True,  #data loadwe will copy tensor to CUDa pinned memory before return (improve GPU stransfer speed)
    prefetch_factor=2 # need google more
)

dataloader_val = DataLoader(
    dataset_val,
    sampler=RandomSampler(dataset_val),
    batch_size=32,
    num_workers=4, 
    pin_memory=True, 
    prefetch_factor=2
)

In [11]:
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.cuda.amp import GradScaler, autocast
# optimizer type: 
# Stochastic Gradient Descent (SGD): The basic optimization algorithm that updates parameters based on the gradient of the loss function.
# Adam: A popular variant of SGD that combines adaptive learning rates with momentum.
# AdamW: A variant of Adam that also incorporates weight decay to prevent overfitting.
# Adagrad: Adapts the learning rate for each parameter based on the historical gradient information.
# RMSprop: Root Mean Square Propagation, similar to Adagrad but with an exponentially decaying average of squared gradients.

optimizer = AdamW(
    model.parameters(), # optimizing model
    lr = 1e-5 ,
    eps = 1e-8, # need a further verification
)

scaler= GradScaler()

epochs = 10

# Scheduler: a learning rate schediler that adjust learning rate during training to improve performance
# type of scheduler:
# StepLR: decreases the learning rate by a factor of the fized number of epochs
# MultiStepLR: similar to StepLR but allows specifying multicle milestones for decreasing learning rate
# ExponentialLR: decays the learning rate expotentially over time
# Reduce LROnPlateau: decrease learning rate when specific metric stops improving -
# CosineAnnealingLR: gradually decreases the learning rate base on cosine
# Linear Warmup: linearly increases the learning rate from zero to the specified value over warmup period -

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps = len(dataloader_train)*epochs
)



In [12]:
import subprocess
import torch

def get_gpu_usage():
    try:
        result = subprocess.run(['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader,nounits'], stdout=subprocess.PIPE)
        gpu_usage = int(result.stdout.decode('utf-8').strip())
        return gpu_usage
    except Exception as e:
        print(f"Error: {e}")
        return None

if __name__ == "__main__":
    gpu_usage = get_gpu_usage()
    if gpu_usage is not None:
        print(f"GPU Usage: {gpu_usage}%")
    else:
        print("Failed to retrieve GPU usage.")

torch.cuda.empty_cache()



GPU Usage: 0%


In [13]:
import random


torch.cuda.empty_cache()

# seeding / setting seeds for various random number to ensure that the result is the same
seed_val = 17
random.seed(seed_val) # sets seed for python's build in libraries
np.random.seed(seed_val) # set seeds for numpy random generator
torch.manual_seed(seed_val) # sets seed for pytorch CPU operations
torch.cuda.manual_seed_all(seed_val) # set seeds for CUDA operation

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.cuda()
print(device)
print(model)

cuda
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), e

In [14]:
from tqdm import tqdm
import numpy as np

def evaluate(dataloader_val):
    model.eval()  # sets model to evaluation mode, no dropout and batch normalization layer
    
    loss_val_total = 0
    predictions, true_vals = [], []  # to store predictions and true labels
    
    for batch in tqdm(dataloader_val):  # iterate over batches of data in validation loader
        
        batch = tuple(b.to(device) for b in batch)  # move batch tensors to the appropriate device
        
        # prepare model inputs tensors 
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        
        # forward pass
        # performing forward pass without gradient computation
        with torch.no_grad():       
            # passing input tensors to the model and obtain the outputs (loss and logits - raw output scores) 
            outputs = model(**inputs)
            
        # calculate raw values 
        loss = outputs[0] 
        logits = outputs[1]
        loss_val_total += loss.item()

        # processing predictions and true labels
        logits = logits.detach().cpu().numpy()  # detach logits from computational graph -> to CPU -> to numpy array
        label_ids = inputs['labels'].cpu().numpy()  # same concept
        
        # Ensure that the shapes are consistent before appending
        if logits.shape[0] == label_ids.shape[0]:
            predictions.append(logits)
            true_vals.append(label_ids)
        else:
            print(f"Shape mismatch: logits {logits.shape}, labels {label_ids.shape}")
    
    loss_val_avg = loss_val_total / len(dataloader_val) 
    
    # Debugging: Check the shapes before concatenation
    print(f"Predictions shapes: {[pred.shape for pred in predictions]}")
    print(f"True values shapes: {[val.shape for val in true_vals]}")
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
    
    # Final shape check
    print(f"Final shape of predictions: {predictions.shape}")
    print(f"Final shape of true values: {true_vals.shape}")
    
    return loss_val_avg, predictions, true_vals




In [26]:
# setting up performance metrics
import numpy as np
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    assert preds.shape[0] == labels.shape[0], f"Shape mismatch: preds {preds.shape}, labels {labels.shape}"
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.argmax(axis=1).flatten()
    
    # Check the shapes after flattening
    print(f"Shapes after flattening - preds_flat: {preds_flat.shape}, labels_flat: {labels_flat.shape}")
    return f1_score(labels_flat, preds_flat, average = 'weighted')




In [20]:
def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in type_of_emotion.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy:{len(y_preds[y_preds==label])}/{len(y_true)}\n')

https://pytorch.org/docs/stable/amp.html

Libraries used for training:
* tqdm: adding progress bars to loops so we can see training progress
* GradScaler: dynamically adjust gradient scale before backward propagation. Ensuring that there is no overflow or underflow by rescalling them to similar as ROC curve (scale 0 to 1)  (https://youtu.be/IkeEadgSy6w)
* Autocast: Automatic Mixed Precision (AMP) feature. Accelerate training by leveraging tensor cores on NVIDA GPUs. used around foward pass and loss calculation

P.S. gradient scaler and autocast is just used when we want to utilize GPU for training
Underflow:  value too small to represent or compute (close to zero)
Overflow: values exceed numerical computations, causing memory leackage. can caused infinity

underflow and overflow can cause numerical instabilities due numbers being too small and big, causing inaccurate computational.

In [27]:
import os
from tqdm.notebook import tqdm
from torch.cuda.amp import GradScaler, autocast

epochs_to_save = [4, 6, 8, 10]

save_dir = 'Models'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

scaler = GradScaler()

for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0
    
    progress_bar = tqdm(dataloader_train, 
                        desc=f'Epoch {epoch}', 
                        leave=False, 
                        disable=False)
    
    for batch in progress_bar:
        optimizer.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }
        with autocast():
            outputs = model(**inputs)
            loss = outputs[0]
        
        # Scale gradients
        scaler.scale(loss).backward()
        # Update optimizer
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        loss_train_total += loss.item()
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
    
    if epoch in epochs_to_save:
        model_save_path = os.path.join(save_dir, f'BERT_ft_Epoch{epoch}.model')
        torch.save(model.state_dict(), model_save_path)
    
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_val)
    
    # Check shapes before computing F1 score
    print(f"Shapes before F1 computation - Predictions: {predictions.shape}, True values: {true_vals.shape}")
    

    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')

    # has quite stable and not improving validation loss and f-1 score


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/2714 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.06657235673199803


  0%|          | 0/170 [00:00<?, ?it/s]

Shapes before F1 computation - Predictions: (5426, 28), True values: (5426, 28)
Shapes after flattening - preds_flat: (5426,), labels_flat: (5426,)
Validation loss: 0.0874639522503404
F1 Score (weighted): 0.5603494697969168


Epoch 2:   0%|          | 0/2714 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.06068420308651011


  0%|          | 0/170 [00:00<?, ?it/s]

Shapes before F1 computation - Predictions: (5426, 28), True values: (5426, 28)
Shapes after flattening - preds_flat: (5426,), labels_flat: (5426,)
Validation loss: 0.09000845696119701
F1 Score (weighted): 0.5632054252509165


Epoch 3:   0%|          | 0/2714 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.05560672562981166


  0%|          | 0/170 [00:00<?, ?it/s]

Shapes before F1 computation - Predictions: (5426, 28), True values: (5426, 28)
Shapes after flattening - preds_flat: (5426,), labels_flat: (5426,)
Validation loss: 0.09243338590159136
F1 Score (weighted): 0.5604165131958991


Epoch 4:   0%|          | 0/2714 [00:00<?, ?it/s]

KeyboardInterrupt: 