# BERT Transfer Learning for Multiclass Sentiment Analysis

## Imports and Setup

In [None]:


import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import classification_report
import numpy as np
from tqdm import tqdm

# Run on the GPU when CUDA is visible, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Fix the NumPy and PyTorch RNG seeds so repeated runs start from the same state
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Seed every visible CUDA device explicitly
    torch.cuda.manual_seed_all(SEED)

2026-02-09 13:57:23.159671: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770645443.377236      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770645443.432173      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770645443.972785      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770645443.972823      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770645443.972825      24 computation_placer.cc:177] computation placer alr

Using device: cuda


## Configuration

In [None]:
# Hyper-parameters and switches read by the cells below
CONFIG = dict(
    model_name='bert-base-uncased',
    max_length=64,             # tokens per example; reduced from 128 because sentiment texts are usually short
    batch_size=128,
    epochs=4,
    learning_rate=2e-5,
    use_mixed_precision=True,  # FP16 mixed-precision training (author's note: roughly 2x faster)
    freeze_bert_layers=7,      # freeze the first 7 encoder layers (0 = train all, 12 = freeze all)
)

## Load Dataset

In [3]:
print("\nLoading dataset...")
dataset = load_dataset("Sp1786/multiclass-sentiment-analysis-dataset")

# Show which splits the loaded dataset exposes
print(f"Available splits: {dataset.keys()}")

# One pandas frame per split, for quick inspection
train_df, val_df, test_df = (
    pd.DataFrame(dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

print(f"\nTrain samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Test samples: {len(test_df)}")
print(f"\nTrain class distribution:\n{train_df['label'].value_counts()}")

# Plain Python lists of the 'text' and 'label' columns of each split
train_texts, train_labels = train_df['text'].tolist(), train_df['label'].tolist()
val_texts, val_labels = val_df['text'].tolist(), val_df['label'].tolist()
test_texts, test_labels = test_df['text'].tolist(), test_df['label'].tolist()

# Number of distinct label values present in the training split
num_labels = len(set(train_labels))
print(f"\nNumber of classes: {num_labels}")


Loading dataset...


README.md: 0.00B [00:00, ?B/s]

train_df.csv: 0.00B [00:00, ?B/s]

val_df.csv: 0.00B [00:00, ?B/s]

test_df.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/31232 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5205 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5206 [00:00<?, ? examples/s]

Available splits: dict_keys(['train', 'validation', 'test'])

Train samples: 31232
Validation samples: 5205
Test samples: 5206

Train class distribution:
label
1    11649
2    10478
0     9105
Name: count, dtype: int64

Number of classes: 3


## Define Custom Model Class

In [4]:
class BERTSentimentClassifier(nn.Module):
    """
    Custom BERT-based sentiment classifier.

    Architecture: pre-trained BERT encoder -> Dropout -> Linear classification head.

    Args:
        num_classes: Number of output classes (size of the linear head's output).
        dropout: Dropout probability applied to the pooled BERT output.
        freeze_layers: Number of leading encoder layers to freeze. Any value > 0
            also freezes the embedding layer; 0 leaves every parameter trainable.
        model_name: Checkpoint name passed to ``BertModel.from_pretrained``.
            When omitted, ``CONFIG['model_name']`` is used, so existing callers
            behave exactly as before.
    """

    def __init__(self, num_classes, dropout=0.3, freeze_layers=0, model_name=None):
        super().__init__()

        # Resolve the checkpoint lazily so the class is usable without editing CONFIG
        if model_name is None:
            model_name = CONFIG['model_name']

        # Load pre-trained BERT
        self.bert = BertModel.from_pretrained(model_name)

        # Optionally freeze the embeddings and the first `freeze_layers` encoder layers
        if freeze_layers > 0:
            for param in self.bert.embeddings.parameters():
                param.requires_grad = False

            # Slicing never runs past the end, so this may hold fewer layers than requested
            frozen_layers = self.bert.encoder.layer[:freeze_layers]
            for layer in frozen_layers:
                for param in layer.parameters():
                    param.requires_grad = False

            # Report how many layers were actually frozen, not merely how many were requested
            print(f"Frozen first {len(frozen_layers)} BERT layers")

        # Classification head
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """
        Compute class logits for a batch.

        Args:
            input_ids: Token-id tensor forwarded to BERT as ``input_ids``.
            attention_mask: Mask tensor forwarded to BERT as ``attention_mask``.

        Returns:
            Logits produced by the linear head (last dimension is ``num_classes``).
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # BERT's pooled sequence representation (derived from the [CLS] token
        # according to the transformers documentation)
        pooled_output = outputs.pooler_output

        # Regularise, then project to class logits
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        return logits

## Define Dataset Class

In [5]:
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        # Handle None or empty text
        if text is None or (isinstance(text, float) and pd.isna(text)):
            text = ""
        
        # Tokenize
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

## Initialize Tokenizer and Create Datasets

In [6]:
print("\nInitializing tokenizer...")
tokenizer = BertTokenizer.from_pretrained(CONFIG['model_name'])

# Wrap each split in a SentimentDataset sharing the tokenizer and max length
train_dataset, val_dataset, test_dataset = (
    SentimentDataset(split_texts, split_labels, tokenizer, CONFIG['max_length'])
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

# Only the training loader shuffles; the evaluation loaders use DataLoader's default order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=CONFIG['batch_size'])
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=CONFIG['batch_size'])
    for eval_dataset in (val_dataset, test_dataset)
)

print(f"Train batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")
print(f"Test batches: {len(test_loader)}")


Initializing tokenizer...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Train batches: 244
Validation batches: 41
Test batches: 41


## Initialize Model, Loss, and Optimizer

In [7]:
print("\nInitializing model...")
model = BERTSentimentClassifier(
    freeze_layers=CONFIG['freeze_bert_layers'],
    num_classes=num_labels,
)

# Cross-entropy over the class logits
criterion = nn.CrossEntropyLoss()

# Replicate the model across GPUs when more than one is visible
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Move the (possibly wrapped) model to the selected device before building the optimizer
model = model.to(device)

# AdamW receives every parameter, including the frozen ones
optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG['learning_rate'])

print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")


Initializing model...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Frozen first 7 BERT layers
Using 2 GPUs!
Model parameters: 109,484,547
Trainable parameters: 36,032,259


## Training Function

In [8]:
def train_epoch(model, dataloader, criterion, optimizer, device, use_amp=False, scaler=None):
    """
    Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` returning logits.
        dataloader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        criterion: Loss called as ``criterion(logits, labels)``. Assumed to return
            the mean over the batch (the default reduction); the epoch loss is
            weighted by batch size on that assumption.
        optimizer: Optimizer stepped once per batch.
        device: ``torch.device`` or device string the batch tensors are moved to.
        use_amp: Request CUDA mixed-precision training. Ignored unless ``device``
            is a CUDA device.
        scaler: Optional ``torch.amp.GradScaler`` to reuse across calls so the loss
            scale is not reset every epoch. A fresh one is created when omitted.

    Returns:
        ``(avg_loss, accuracy)`` averaged over all samples seen, or ``(0.0, 0.0)``
        when the dataloader yields no samples.
    """
    model.train()

    # The autocast/GradScaler path below is CUDA-specific, so only take it on CUDA
    amp_enabled = bool(use_amp) and torch.device(device).type == 'cuda'
    if amp_enabled and scaler is None:
        scaler = torch.amp.GradScaler('cuda')

    loss_sum = 0.0
    correct = 0
    total = 0

    for batch in tqdm(dataloader, desc="Training"):
        # Move data to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        if amp_enabled:
            # Forward pass and loss under autocast; backward/step through the scaler
            with torch.amp.autocast('cuda'):
                logits = model(input_ids, attention_mask)
                loss = criterion(logits, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            # Full-precision training
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

        batch_size = labels.size(0)
        predictions = torch.argmax(logits, dim=1)
        correct += (predictions == labels).sum().item()
        total += batch_size
        # Weight by batch size so a short final batch does not skew the epoch mean
        loss_sum += loss.item() * batch_size

    if total == 0:
        # Nothing was processed; avoid dividing by zero
        return 0.0, 0.0

    return loss_sum / total, correct / total

## Validation Function

In [9]:
def validate(model, dataloader, criterion, device):
    """
    Evaluate ``model`` on ``dataloader`` without computing gradients.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` returning logits.
        dataloader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        criterion: Loss called as ``criterion(logits, labels)``. Assumed to return
            the mean over the batch (the default reduction); the overall loss is
            weighted by batch size on that assumption.
        device: ``torch.device`` or device string the batch tensors are moved to.

    Returns:
        ``(avg_loss, accuracy, all_preds, all_labels)`` where the first two are
        averaged over all samples (``0.0`` each when no samples were seen) and the
        last two are lists of Python ints in iteration order.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Forward pass
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)

            batch_size = labels.size(0)
            predictions = torch.argmax(logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += batch_size
            # Weight by batch size so a short final batch does not skew the mean
            loss_sum += loss.item() * batch_size

            # Collect plain Python ints rather than numpy scalars
            all_preds.extend(predictions.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    if total == 0:
        # Nothing was processed; avoid dividing by zero
        return 0.0, 0.0, all_preds, all_labels

    return loss_sum / total, correct / total, all_preds, all_labels

## Training Loop

In [10]:
print("\n" + "="*60, "Starting Training", "="*60, sep="\n")

# Highest validation accuracy seen so far; drives checkpointing below
best_val_accuracy = 0

for epoch in range(CONFIG['epochs']):
    print(f"\nEpoch {epoch + 1}/{CONFIG['epochs']}")
    print("-" * 40)

    # One optimisation pass over the training split
    train_loss, train_acc = train_epoch(
        model=model,
        dataloader=train_loader,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
        use_amp=CONFIG['use_mixed_precision'],
    )
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")

    # Score the current weights on the validation split
    val_loss, val_acc, val_preds, val_true = validate(
        model=model, dataloader=val_loader, criterion=criterion, device=device
    )
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    # Checkpoint only when validation accuracy strictly improves
    if not (val_acc > best_val_accuracy):
        continue

    best_val_accuracy = val_acc
    # NOTE(review): when `model` is wrapped in nn.DataParallel the saved keys carry a
    # 'module.' prefix, so the file loads cleanly only into an identically wrapped model
    torch.save(model.state_dict(), 'best_model.pt')
    print(f"✓ New best model saved! (Accuracy: {val_acc:.4f})")



Starting Training

Epoch 1/4
----------------------------------------


Training: 100%|██████████| 244/244 [01:04<00:00,  3.81it/s]


Train Loss: 0.7459 | Train Acc: 0.6605


Validating: 100%|██████████| 41/41 [00:13<00:00,  3.07it/s]


Val Loss: 0.6379 | Val Acc: 0.7256
✓ New best model saved! (Accuracy: 0.7256)

Epoch 2/4
----------------------------------------


Training: 100%|██████████| 244/244 [01:04<00:00,  3.79it/s]


Train Loss: 0.6035 | Train Acc: 0.7452


Validating: 100%|██████████| 41/41 [00:15<00:00,  2.63it/s]


Val Loss: 0.5983 | Val Acc: 0.7433
✓ New best model saved! (Accuracy: 0.7433)

Epoch 3/4
----------------------------------------


Training: 100%|██████████| 244/244 [01:05<00:00,  3.74it/s]


Train Loss: 0.5571 | Train Acc: 0.7666


Validating: 100%|██████████| 41/41 [00:14<00:00,  2.76it/s]


Val Loss: 0.5953 | Val Acc: 0.7499
✓ New best model saved! (Accuracy: 0.7499)

Epoch 4/4
----------------------------------------


Training: 100%|██████████| 244/244 [01:05<00:00,  3.72it/s]


Train Loss: 0.5186 | Train Acc: 0.7857


Validating: 100%|██████████| 41/41 [00:14<00:00,  2.77it/s]


Val Loss: 0.5962 | Val Acc: 0.7525
✓ New best model saved! (Accuracy: 0.7525)


## Load Best Model and Final Evaluation

In [11]:
print("\n" + "="*60, "Final Evaluation", "="*60, sep="\n")

# Restore the weights stored in 'best_model.pt', then re-score the validation split
model.load_state_dict(torch.load('best_model.pt'))
_, final_acc, final_preds, final_labels = validate(
    model=model, dataloader=val_loader, criterion=criterion, device=device
)

print(f"\nBest Validation Accuracy: {final_acc:.4f}")
print("\nClassification Report (Validation):")
# Per-class precision/recall/F1 on the validation predictions
print(classification_report(final_labels, final_preds))


Final Evaluation


Validating: 100%|██████████| 41/41 [00:14<00:00,  2.79it/s]


Best Validation Accuracy: 0.7525

Classification Report (Validation):
              precision    recall  f1-score   support

           0       0.77      0.75      0.76      1517
           1       0.69      0.69      0.69      1928
           2       0.80      0.82      0.81      1760

    accuracy                           0.75      5205
   macro avg       0.76      0.75      0.75      5205
weighted avg       0.75      0.75      0.75      5205






## Test Set Evaluation

In [12]:
print("\n" + "="*60, "Test Set Evaluation", "="*60, sep="\n")

# Score the current model weights on the held-out test split
test_loss, test_acc, test_preds, test_true = validate(
    model=model, dataloader=test_loader, criterion=criterion, device=device
)
print(f"\nTest Accuracy: {test_acc:.4f}")
print("\nClassification Report (Test):")
# Per-class precision/recall/F1 on the test predictions
print(classification_report(test_true, test_preds))


Test Set Evaluation


Validating: 100%|██████████| 41/41 [00:14<00:00,  2.82it/s]


Test Accuracy: 0.7539

Classification Report (Test):
              precision    recall  f1-score   support

           0       0.77      0.76      0.77      1546
           1       0.70      0.68      0.69      1930
           2       0.79      0.83      0.81      1730

    accuracy                           0.75      5206
   macro avg       0.76      0.76      0.76      5206
weighted avg       0.75      0.75      0.75      5206






## Conclusion

### Score Breakdown
At the end of training, the model achieves strong performance on the validation set, with an overall accuracy of around **0.75**. Compared to the BiLSTM, which scored around 0.70, this is a significant improvement.

We can see that the model learns well and generalizes beyond the training data: the validation accuracy (0.7525) and the test accuracy (0.7539) are nearly identical.

But we can also see that the model struggles more with the "neutral" class, which is common in sentiment analysis tasks. This is likely due to the inherent ambiguity of neutral sentiment and the fact that it can be easily confused with both positive and negative sentiments.