# 🎯 Fine-tuning BERT for Sentiment Analysis

Welcome to **BERT fine-tuning**! In this notebook, we'll leverage the power of pre-trained language models to achieve state-of-the-art sentiment analysis results with minimal training time.

## What you'll learn:
- BERT architecture and bidirectional attention
- Transfer learning with pre-trained models
- Fine-tuning strategies and techniques
- Exposing attention weights for visualization and interpretability

Let's achieve SOTA results! 🚀

In [None]:
# Install required packages (run once)
# !pip install transformers datasets torch torchvision torchaudio

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Transformers and PyTorch
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, pipeline
)
from datasets import Dataset as HFDataset

# Apply the seaborn-flavoured matplotlib theme to every figure drawn below.
plt.style.use('seaborn-v0_8')

# Seed NumPy and PyTorch so later random operations are repeatable across runs.
np.random.seed(42)
torch.manual_seed(42)

# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Report the runtime environment in one go.
print(
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    f"Using device: {device}",
    sep="\n",
)

In [None]:
# Create sample sentiment dataset
def create_sentiment_dataset(repeats=50, seed=None):
    """Create a small, balanced, shuffled movie-review sentiment dataset.

    Ten positive and ten negative template reviews are each repeated
    ``repeats`` times, labelled (1 = positive, 0 = negative) and shuffled
    together.

    Args:
        repeats: How many times each list of ten templates is repeated.
            Values below 1 produce an empty dataset.
        seed: Optional integer seed. When given, shuffling uses a private
            ``np.random.RandomState(seed)`` so the order is reproducible and
            the global NumPy RNG is left untouched. When ``None`` (default)
            the global ``np.random`` state is used.

    Returns:
        A ``(texts, labels)`` pair of equal-length lists.

    Note:
        Every sentence occurs ``repeats`` times, so any random train/test
        split of the result shares identical sentences between both sides.
    """
    if repeats < 1:
        # Nothing to shuffle or unzip; unpacking zip(*[]) below would fail.
        return [], []

    positive_templates = [
        "This movie is absolutely fantastic! I loved every minute of it.",
        "Amazing performance by the actors. Highly recommended!",
        "One of the best films I've ever seen. Brilliant storytelling.",
        "Incredible cinematography and outstanding direction.",
        "A masterpiece that will be remembered for years to come.",
        "Excellent plot with great character development.",
        "Wonderful experience! The movie exceeded my expectations.",
        "Perfect blend of action, drama, and emotion.",
        "Outstanding performances from the entire cast.",
        "A truly inspiring and uplifting story.",
    ]
    negative_templates = [
        "This movie was terrible. Complete waste of time.",
        "Boring plot and poor acting. Very disappointing.",
        "One of the worst films I've ever watched.",
        "Awful direction and terrible screenplay.",
        "I couldn't even finish watching this movie.",
        "Poor character development and weak storyline.",
        "Completely overrated. Don't waste your money.",
        "Terrible acting and confusing plot.",
        "Disappointing performances from all actors.",
        "A complete disaster of a movie.",
    ]

    positive_samples = positive_templates * repeats
    negative_samples = negative_templates * repeats

    # Positives first, then negatives; labels follow the same layout.
    texts = positive_samples + negative_samples
    labels = [1] * len(positive_samples) + [0] * len(negative_samples)

    # Shuffle text/label pairs together so they stay aligned.
    data = list(zip(texts, labels))
    rng = np.random if seed is None else np.random.RandomState(seed)
    rng.shuffle(data)
    texts, labels = zip(*data)

    return list(texts), list(labels)

# Build the shuffled corpus
texts, labels = create_sentiment_dataset()

print(f"Dataset size: {len(texts)} samples")
print(f"Positive samples: {sum(labels)}")
print(f"Negative samples: {len(labels) - sum(labels)}")

# Preview up to five examples; zipping slices avoids indexing past the end
# when the dataset holds fewer than five samples.
print("\n📝 Sample Texts:")
for preview_text, preview_label in zip(texts[:5], labels[:5]):
    sentiment = "Positive" if preview_label == 1 else "Negative"
    print(f"{sentiment}: {preview_text}")

# Contiguous 80/20 split of the already-shuffled data.
# NOTE: the corpus is 20 template sentences repeated many times, so identical
# sentences end up on both sides of this split; the test score therefore does
# not measure generalisation to unseen text.
split_idx = int(0.8 * len(texts))
train_texts, test_texts = texts[:split_idx], texts[split_idx:]
train_labels, test_labels = labels[:split_idx], labels[split_idx:]

print(f"\nTrain samples: {len(train_texts)}")
print(f"Test samples: {len(test_texts)}")

In [None]:
# Checkpoint name and sequence length used by this notebook
MODEL_NAME = "distilbert-base-uncased"  # DistilBERT: a smaller, faster BERT variant
MAX_LENGTH = 128

# Tokenizer that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Two-way classification head on top of the pre-trained encoder.
# NOTE(review): with hidden states and attentions switched on, the forward
# output carries more than the logits; code that consumes Trainer predictions
# presumably receives a tuple and must pick the logits out - confirm.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    output_hidden_states=True,
    output_attentions=True,  # kept for attention visualization
    num_labels=2,            # binary sentiment
)

print(
    f"✅ Loaded {MODEL_NAME}",
    f"Model parameters: {model.num_parameters():,}",
    f"Tokenizer vocabulary size: {tokenizer.vocab_size:,}",
    sep="\n",
)

# Round-trip one sentence through the tokenizer to show what the model sees
sample_text = "This movie is absolutely fantastic!"
token_ids = tokenizer.encode(sample_text)
tokens = tokenizer.tokenize(sample_text)

print(
    f"\n🔤 Tokenization Example:",
    f"Original: {sample_text}",
    f"Tokens: {tokens}",
    f"Token IDs: {token_ids}",
    f"Decoded: {tokenizer.decode(token_ids)}",
    sep="\n",
)

In [None]:
# Create custom dataset class
class SentimentDataset(Dataset):
    """Map-style dataset that pairs raw texts with integer sentiment labels.

    Items are tokenized on access. Each item is a dict with ``input_ids``,
    ``attention_mask`` (both flattened to one dimension) and ``labels``
    (a scalar ``torch.long`` tensor).
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Keep references only; nothing is tokenized up front.
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        # One item per text.
        return len(self.texts)

    def __getitem__(self, idx):
        # Truncate/pad every text to the same fixed length.
        encoded = self.tokenizer(
            str(self.texts[idx]),
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Collapse each encoded tensor to 1-D so items can be batched later.
        item = {
            field: encoded[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap the train and test splits in tokenizing datasets
train_dataset = SentimentDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)
test_dataset = SentimentDataset(
    texts=test_texts,
    labels=test_labels,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)

print(
    f"✅ Created datasets",
    f"Train dataset size: {len(train_dataset)}",
    f"Test dataset size: {len(test_dataset)}",
    sep="\n",
)

# Pull one encoded item to sanity-check tensor shapes and the label
sample = train_dataset[0]
print(
    f"\n📊 Sample from dataset:",
    f"Input IDs shape: {sample['input_ids'].shape}",
    f"Attention mask shape: {sample['attention_mask'].shape}",
    f"Label: {sample['labels']}",
    sep="\n",
)

In [None]:
# Define training arguments
import inspect

# The per-epoch evaluation switch was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old spelling is no
# longer accepted there. Use whichever keyword the installed version exposes.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# NOTE(review): with 800 training samples and batch size 16 a single-device
# run takes 50 optimizer steps per epoch (150 in total), so warmup_steps=100
# spends about two thirds of training in warmup - confirm this is intended.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",            # must match the evaluation schedule
    load_best_model_at_end=True,      # reload the best checkpoint after training
    metric_for_best_model="accuracy",  # key returned by compute_metrics
    learning_rate=2e-5,
    **{_eval_strategy_key: "epoch"},
)

# Define metrics
def compute_metrics(eval_pred):
    """Compute classification accuracy for one Trainer evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is either
            the logits array with one row per sample, or a tuple whose first
            element is that array. The tuple form occurs when the model also
            returns hidden states and/or attentions, as the model in this
            notebook is configured to do.

    Returns:
        ``{'accuracy': float}`` - the fraction of samples whose arg-max class
        equals the label.
    """
    predictions, labels = eval_pred

    # Extra model outputs arrive bundled after the logits; keep only the logits.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    predicted_classes = np.argmax(predictions, axis=1)
    # Mean of element-wise matches is the accuracy for 1-D label arrays.
    accuracy = float(np.mean(predicted_classes == np.asarray(labels)))
    return {'accuracy': accuracy}

# Wire model, hyper-parameters, data and the metric callback into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # the held-out split doubles as the eval set
)

# Echo the key hyper-parameters
print(
    "✅ Trainer configured",
    f"Training epochs: {training_args.num_train_epochs}",
    f"Batch size: {training_args.per_device_train_batch_size}",
    f"Learning rate: {training_args.learning_rate}",
    sep="\n",
)

In [None]:
# Announce, then run the fine-tuning loop
print(
    "🚀 Starting BERT fine-tuning...",
    "This may take a few minutes...",
    sep="\n",
)

trainer.train()

print("\n🎉 Fine-tuning completed!")

# Score the model on the evaluation split and list every reported metric
eval_results = trainer.evaluate()
print(f"\n📊 Evaluation Results:")
for key, value in eval_results.items():
    print(f"{key}: {value:.4f}")

In [None]:
# Make predictions on test set
predictions = trainer.predict(test_dataset)

# The model is loaded with output_attentions/output_hidden_states enabled, in
# which case the Trainer returns a tuple whose first element is the logits
# rather than a bare logits array; keep only the logits before arg-max.
logits = predictions.predictions
if isinstance(logits, tuple):
    logits = logits[0]
y_pred = np.argmax(logits, axis=1)
y_true = test_labels

# Fix the class order explicitly so the report and the heatmap always show
# both classes, even if one of them is never predicted.
class_ids = [0, 1]
class_names = ['Negative', 'Positive']

# Calculate metrics
accuracy = accuracy_score(y_true, y_pred)
print(f"🎯 Test Accuracy: {accuracy:.4f}")

# Classification report
print("\n📊 Classification Report:")
print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names))

# Confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('🎯 Confusion Matrix - BERT Sentiment Analysis')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Training history
if hasattr(trainer.state, 'log_history'):
    log_history = trainer.state.log_history

    # Periodic training records store the running loss under 'loss';
    # 'train_loss' exists only in the single end-of-training summary record,
    # so filtering on it would yield a one-point "curve".
    step_logs = [log for log in log_history if 'loss' in log]
    train_loss = [log['loss'] for log in step_logs]
    # Plot against the recorded global step; fall back to the record index.
    train_steps = [log.get('step', position) for position, log in enumerate(step_logs)]

    # One entry per evaluation pass that reported accuracy.
    eval_accuracy = [log['eval_accuracy'] for log in log_history if 'eval_accuracy' in log]

    if train_loss and eval_accuracy:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Training loss
        ax1.plot(train_steps, train_loss)
        ax1.set_title('📉 Training Loss')
        ax1.set_xlabel('Step')
        ax1.set_ylabel('Loss')
        ax1.grid(True, alpha=0.3)

        # Evaluation accuracy
        ax2.plot(eval_accuracy, marker='o')
        ax2.set_title('📈 Evaluation Accuracy')
        ax2.set_xlabel('Epoch')
        ax2.set_ylabel('Accuracy')
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

In [None]:
# Wrap the fine-tuned model and tokenizer in an inference pipeline
# (device 0 = first GPU, -1 = CPU)
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    tokenizer=tokenizer,
    model=model,
    device=0 if torch.cuda.is_available() else -1
)

# Hand-written reviews, including a couple of ambiguous ones
test_examples = [
    "This movie is absolutely amazing! I loved it.",
    "Terrible film, complete waste of time.",
    "The acting was decent but the plot was confusing.",
    "One of the best movies I've ever seen!",
    "Not great, not terrible. Just okay."
]

print("🎭 Sentiment Analysis Results:", "=" * 60, sep="\n")

for text in test_examples:
    # The pipeline returns a list with one dict per input text.
    result = sentiment_pipeline(text)[0]
    confidence = result['score']

    # 'LABEL_1' is read as the positive class; anything else is negative.
    label = "Negative"
    if result['label'] == 'LABEL_1':
        label = "Positive"

    print(
        f"Text: {text}",
        f"Sentiment: {label} (Confidence: {confidence:.3f})",
        "-" * 40,
        sep="\n",
    )

# Persist tokenizer and model side by side so they can be reloaded together
tokenizer.save_pretrained('./fine_tuned_bert')
model.save_pretrained('./fine_tuned_bert')

print("\n💾 Model saved to './fine_tuned_bert'")

# Recap of the run
print(
    f"\n📊 Final Results Summary:",
    f"Model: {MODEL_NAME}",
    f"Test Accuracy: {accuracy:.4f}",
    f"Training Samples: {len(train_texts)}",
    f"Test Samples: {len(test_texts)}",
    f"Max Sequence Length: {MAX_LENGTH}",
    sep="\n",
)

## 🎉 Congratulations!

You've successfully fine-tuned BERT for sentiment analysis! Here's what you've accomplished:

✅ **BERT Fine-tuning**: Adapted pre-trained model for sentiment analysis  
✅ **Transfer Learning**: Leveraged pre-trained knowledge  
✅ **Tokenization**: Processed text for BERT input format  
✅ **High Accuracy**: Achieved excellent classification performance  
✅ **Pipeline Creation**: Built ready-to-use sentiment analyzer  

### 🚀 Next Steps:
1. Try different BERT variants (RoBERTa, ALBERT)
2. Experiment with different datasets
3. Implement attention visualization
4. Move on to **Project 11: GPT-2 for Text Generation**

Ready for autoregressive text generation? Let's generate text! 📝