### A lot of the code (evaluation) was taken from https://github.com/baotramduong/Twitter-Sentiment-Analysis-with-Deep-Learning-using-BERT/blob/main/Notebook.ipynb

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer

from transformers import DataCollatorWithPadding
from torch.utils.data import TensorDataset, DataLoader

import numpy as np

import torch.nn as nn

from datasets import load_dataset
imdb = load_dataset("imdb")

device = device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
train_dataset = imdb["train"].shuffle(seed=42)
test_dataset = imdb["test"].shuffle(seed=42)

In [3]:
num_classes = len(np.unique(np.array(train_dataset['label'])))

In [4]:
#load pre-trained BERT

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize(samples):
    return tokenizer(samples['text'], truncation=True)

tokenized_imdb = imdb.map(tokenize, batched=True)
     

bert = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels = num_classes,
                                                      output_attentions = False,
                                                      output_hidden_states = False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [5]:
# First, properly tokenize your dataset with explicit padding and truncation settings
tokenized_imdb = {
    'train': tokenizer(
        imdb['train']['text'],
        padding=True,
        truncation=True,
        max_length=512,  # Adjust this value based on your needs
        return_tensors=None  # Important: don't convert to tensors yet
    ),
    'test': tokenizer(
        imdb['test']['text'],
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors=None
    )
}

# Add labels to the tokenized datasets
tokenized_imdb['train']['labels'] = imdb['train']['label']
tokenized_imdb['test']['labels'] = imdb['test']['label']

# Convert to Dataset objects if they aren't already
from datasets import Dataset
train_dataset = Dataset.from_dict(tokenized_imdb['train'])
test_dataset = Dataset.from_dict(tokenized_imdb['test'])

# Now create the DataLoader with the DataCollator
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# Create the data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Create PyTorch DataLoaders
train_dataloader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator
)

validation_dataloader = DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=False,
    collate_fn=data_collator
)

In [6]:
class sentimentClassifier(nn.Module):
    def __init__(self, bert_model, dropout = 0.2, input_dim = 768, classifier_dims = None):
        super(sentimentClassifier, self).__init__()
        
        self.bert = bert_model
        self.dropout = nn.Dropout(dropout)
        if classifier_dims == None:
            self.classifier_dims = [768, 768, 512, 512]
        else:
            self.classifier_dims = classifier_dims
        
        self.num_classifier_layers = len(self.classifier_dims)
        
        classifier_layers = []
        for classifier_dim in self.classifier_dims:
            classifier_layers.extend([
                nn.Linear(input_dim, classifier_dim),
                nn.ReLU(),
                nn.BatchNorm1d(classifier_dim),
                nn.Dropout(dropout)
            ])
            input_dim = classifier_dim
        
        classifier_layers.append(nn.Linear(classifier_dim, 2))
        self.classifier = nn.Sequential(*classifier_layers)
        self.freeze_pretrained()
    
    def freeze_pretrained(self):
        for param in self.bert.parameters():
            param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(encoded[1])
        return self.classifier(pooled_output)


def analyze_state_dict_shapes_and_names(model):
    """Analyze model's state dictionary"""
    print("\n===== MODEL STATE DICT ANALYSIS =====")
    
    # Get state dict
    state_dict = model.state_dict()
    
    # Print keys and shapes
    for name, param in state_dict.items():
        print(f"{name}: {param.shape}")
    
    # Check trainable vs. non-trainable parameters
    trainable_params = {name: param for name, param in model.named_parameters() if param.requires_grad}
    non_trainable_params = {name: param for name, param in model.named_parameters() if not param.requires_grad}
    
    print(f"\nTotal parameters: {sum(p.numel() for p in model.parameters())}")
    print(f"Trainable parameters: {sum(p.numel() for p in trainable_params.values())}")
    print(f"Non-trainable parameters: {sum(p.numel() for p in non_trainable_params.values())}")
    
    # Check for any parameters that aren't trainable
    #if non_trainable_params:
    #    print("\nNon-trainable parameter names:")
    #    for name in non_trainable_params.keys():
    #        print(f"- {name}")
    #else:
    #    print("\nAll parameters are trainable")



In [7]:
model = sentimentClassifier(bert_model = bert.bert)

In [8]:
analyze_state_dict_shapes_and_names(model)


===== MODEL STATE DICT ANALYSIS =====
bert.embeddings.word_embeddings.weight: torch.Size([30522, 768])
bert.embeddings.position_embeddings.weight: torch.Size([512, 768])
bert.embeddings.token_type_embeddings.weight: torch.Size([2, 768])
bert.embeddings.LayerNorm.weight: torch.Size([768])
bert.embeddings.LayerNorm.bias: torch.Size([768])
bert.encoder.layer.0.attention.self.query.weight: torch.Size([768, 768])
bert.encoder.layer.0.attention.self.query.bias: torch.Size([768])
bert.encoder.layer.0.attention.self.key.weight: torch.Size([768, 768])
bert.encoder.layer.0.attention.self.key.bias: torch.Size([768])
bert.encoder.layer.0.attention.self.value.weight: torch.Size([768, 768])
bert.encoder.layer.0.attention.self.value.bias: torch.Size([768])
bert.encoder.layer.0.attention.output.dense.weight: torch.Size([768, 768])
bert.encoder.layer.0.attention.output.dense.bias: torch.Size([768])
bert.encoder.layer.0.attention.output.LayerNorm.weight: torch.Size([768])
bert.encoder.layer.0.attention

In [9]:
batch = next(iter(train_dataloader))

output = model( batch['input_ids'], batch['attention_mask'])


In [19]:
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

epochs = 10

#load optimizer
optimizer = AdamW(model.parameters(),
                 lr = 1e-5,
                 eps = 1e-8)
#load scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps = 0,
                                           num_training_steps = len(train_dataloader)*epochs)

criterion = nn.CrossEntropyLoss()

import numpy as np
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average = 'weighted')

def accuracy_per_class(preds, labels):
    
    #make prediction
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label}')
        print(f'Accuracy:{len(y_preds[y_preds==label])}/{len(y_true)}\n')
        
def evaluate(validation_dataloader):

    #evaluation mode disables the dropout layer 
    model.eval()
    
    #tracking variables
    loss_val_total = 0
    predictions, true_vals = [], []
    with torch.no_grad():
        for batch in tqdm(validation_dataloader):

            #load into GPU
            batch = tuple(batch[b].to(device) for b in batch.keys())

            #define inputs
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1]}
            labels = batch[2]
            outputs = model(**inputs)
            loss = criterion(outputs, labels)
            loss_val_total += loss.item()

            #compute accuracy
            logits = outputs.detach().cpu().numpy()
            label_ids = labels.cpu().numpy()
            predictions.append(logits)
            true_vals.append(label_ids)
    
    #compute average loss
    loss_val_avg = loss_val_total/len(validation_dataloader) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals



In [11]:
import random
from tqdm import tqdm 
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cuda


In [14]:
# Early stopping setup
best_val = 0
patience = 3
counter = 0
    
# Training history
history = {
    'accuracy': [],
    'val_accuracy': []
}
    
# Training loop
epochs = 25

for epoch in range(epochs):

    model.train()
    true_vals = []
    loss_train_total = 0
    for batch in tqdm(train_dataloader):
        #set gradient to 0
        optimizer.zero_grad()

        batch = tuple(batch[b].to(device) for b in batch.keys())

        #define inputs
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1]}
        labels = batch[2]
        outputs = model(**inputs)
        loss = criterion(outputs, labels)
        loss_train_total +=loss.item()

        loss.backward()
        
        #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        
        #update optimizer
        optimizer.step()

        #update scheduler
        scheduler.step()        
    #print training result
    loss_train_avg = loss_train_total/len(train_dataloader)    
    
    #evaluate
    val_loss, predictions, true_vals = evaluate(validation_dataloader)
    val_f1 = f1_score_func(predictions, true_vals)
    print(f'Epoch {epoch+1}/{epochs} - Loss: {loss_train_avg:.4f} - Val Accuracy: {val_f1:.4f}')
    
    if val_f1 > best_val:
            best_val = val_f1
            # Save the best model
            torch.save(model.state_dict(), 'best_model_sentiment.pt')
            counter = 0
    else:
        counter += 1
        if counter >= patience:
            print(f'Early stopping after {epoch+1} epochs')
            # Load best model
            model.load_state_dict(torch.load('best_model_sentiment.pt'))
            break

100%|██████████| 1563/1563 [05:31<00:00,  4.72it/s]
100%|██████████| 1563/1563 [05:14<00:00,  4.97it/s]


Epoch 1/25 - Loss: 0.6789 - Val Accuracy: 0.7266


100%|██████████| 1563/1563 [05:30<00:00,  4.72it/s]
100%|██████████| 1563/1563 [05:19<00:00,  4.90it/s]


Epoch 2/25 - Loss: 0.6472 - Val Accuracy: 0.7473


100%|██████████| 1563/1563 [05:15<00:00,  4.96it/s]
100%|██████████| 1563/1563 [04:16<00:00,  6.09it/s]


Epoch 3/25 - Loss: 0.6335 - Val Accuracy: 0.7602


100%|██████████| 1563/1563 [04:31<00:00,  5.76it/s]
100%|██████████| 1563/1563 [04:16<00:00,  6.09it/s]


Epoch 4/25 - Loss: 0.6155 - Val Accuracy: 0.7630


100%|██████████| 1563/1563 [04:31<00:00,  5.75it/s]
100%|██████████| 1563/1563 [04:16<00:00,  6.08it/s]


Epoch 5/25 - Loss: 0.6096 - Val Accuracy: 0.7703


100%|██████████| 1563/1563 [04:31<00:00,  5.76it/s]
100%|██████████| 1563/1563 [04:26<00:00,  5.87it/s]


Epoch 6/25 - Loss: 0.5952 - Val Accuracy: 0.7760


100%|██████████| 1563/1563 [04:31<00:00,  5.76it/s]
100%|██████████| 1563/1563 [04:16<00:00,  6.09it/s]


Epoch 7/25 - Loss: 0.5932 - Val Accuracy: 0.7743


100%|██████████| 1563/1563 [04:32<00:00,  5.74it/s]
100%|██████████| 1563/1563 [04:16<00:00,  6.09it/s]


Epoch 8/25 - Loss: 0.5888 - Val Accuracy: 0.7777


100%|██████████| 1563/1563 [04:31<00:00,  5.75it/s]
100%|██████████| 1563/1563 [04:16<00:00,  6.09it/s]


Epoch 9/25 - Loss: 0.5889 - Val Accuracy: 0.7737


100%|██████████| 1563/1563 [04:31<00:00,  5.76it/s]
100%|██████████| 1563/1563 [04:16<00:00,  6.08it/s]


Epoch 10/25 - Loss: 0.5887 - Val Accuracy: 0.7721


100%|██████████| 1563/1563 [04:31<00:00,  5.76it/s]
100%|██████████| 1563/1563 [04:16<00:00,  6.09it/s]


Epoch 11/25 - Loss: 0.5887 - Val Accuracy: 0.7743
Early stopping after 11 epochs


### Test

In [12]:
model.load_state_dict(torch.load('best_model_sentiment.pt'))
val_loss, predictions, true_vals = evaluate(validation_dataloader)

100%|██████████| 1563/1563 [04:18<00:00,  6.04it/s]


In [13]:
val_f1 = f1_score_func(predictions, true_vals)
print(f'Validation loss: {val_loss}')
print(f'F1 Score (weighted): {val_f1}')

Validation loss: 0.5036351951417142
F1 Score (weighted): 0.7777134890738397


In [20]:
accuracy_per_class(predictions, true_vals)

Class: 0
Accuracy:9343/12500

Class: 1
Accuracy:10105/12500

