### In this section, I fine-tune a pretrained BERT model on the IMDB dataset for sentiment analysis, evaluate it with accuracy and F1, and run inference using the Hugging Face Transformers library.

In [0]:
# pip install transformers datasets evaluate accelerate scikit-learn
import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          DataCollatorWithPadding, Trainer, TrainingArguments,
                          pipeline)

In [0]:
# --- Device and mixed-precision detection ---
# Exactly one of bf16 / fp16 is enabled when CUDA is present; neither on CPU.
use_cuda = torch.cuda.is_available()
use_bf16 = use_cuda and torch.cuda.is_bf16_supported()     # preferred when the GPU supports it (Ampere+)
use_fp16 = use_cuda and not use_bf16                       # otherwise fall back to fp16
print(
    f"CUDA available: {use_cuda} | bf16: {use_bf16} | fp16: {use_fp16} | "
    f"GPUs: {torch.cuda.device_count()}"
)

In [0]:
MODEL_NAME = "bert-base-uncased"  # checkpoint name passed to every from_pretrained call below
NUM_LABELS = 2  # binary classification

# 1) Data: IMDB (binary sentiment)
dataset = load_dataset("imdb")  # splits used in this notebook: train / test / unsupervised

In [0]:
# Show the column names of each split.
dataset.column_names

In [0]:
# Peek at the first three rows of the test split.
dataset["test"][:3]

In [0]:
# Peek at rows 50-52 of the train split.
dataset["train"][50:53]

In [0]:
# Peek at the first three rows of the unsupervised split.
dataset["unsupervised"][:3]

##### Supervised fine-tuning (SFT) needs labeled data, so we remove the unlabeled `unsupervised` split.

In [0]:
# Drop the unlabeled split. The membership check keeps this cell safe to re-run:
# a bare `del` raises KeyError once the key is already gone.
if "unsupervised" in dataset:
    del dataset["unsupervised"]

In [0]:
# 2) Tokenizer: load the tokenizer that belongs to the MODEL_NAME checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [0]:
def preprocess(batch):
    """Tokenize the "text" field of a batch.

    Args:
        batch: mapping with a "text" entry holding the raw review text(s).

    Returns:
        Whatever the module-level `tokenizer` returns for those texts, with
        truncation enabled so over-long inputs are cut instead of rejected.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

##### Tokenize the text with the model's vocabulary so that the machine can process it.
##### Models like BERT or GPT don’t read English words the way humans do. They can only process numbers, so we must first convert the text into numbers that the model can understand. That is what tokenization and token IDs do.

In [0]:
# Apply `preprocess` to every split, handing it batches of rows (batched=True).
tokenized = dataset.map(preprocess, batched=True)

In [0]:
# Padding collator built from the tokenizer; passed to the Trainer to assemble batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [0]:
# 3) Model
# Store human-readable class names in the model config so the saved checkpoint
# and any pipeline built from it report them instead of LABEL_0 / LABEL_1.
# The names are read from the dataset's own label feature rather than hard-coded.
label_names = dataset["train"].features["label"].names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id,
)

In [0]:
# Optional: gradient checkpointing to reduce memory on large batches
# (only switched on when a CUDA device is available)
if use_cuda:
    model.gradient_checkpointing_enable()

In [0]:
# 4) Metrics: load the accuracy and F1 scorers once; compute_metrics reuses them
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

In [0]:
def compute_metrics(eval_pred):
    """Turn raw evaluation output into accuracy and weighted F1.

    Args:
        eval_pred: a (logits, labels) pair; the predicted class is the argmax
            over the last axis of the logits.

    Returns:
        dict with the keys "accuracy" and "f1" (F1 uses average="weighted").
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    accuracy_result = accuracy.compute(predictions=predicted, references=gold)
    f1_result = f1.compute(predictions=predicted, references=gold, average="weighted")

    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

In [0]:
# 5) Training config (GPU-aware)
# Feature-detect torch.compile instead of string-matching the version number, so
# the check keeps working on releases whose version does not start with "2.".
use_torch_compile = use_cuda and hasattr(torch, "compile")

args = TrainingArguments(
    output_dir="bert-imdb",
    eval_strategy="epoch",            # evaluate once per epoch ...
    save_strategy="epoch",            # ... and checkpoint on the same schedule
    learning_rate=2e-5,
    per_device_train_batch_size=16,   # adjust up if you have more GPU memory
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_steps=50,
    load_best_model_at_end=True,      # reload the best checkpoint after training
    metric_for_best_model="f1",       # "f1" is the key returned by compute_metrics
    bf16=use_bf16,                    # mixed precision on Ampere+ (preferred)
    fp16=use_fp16,                    # else use fp16
    dataloader_pin_memory=True,
    optim="adamw_torch",              # standard PyTorch AdamW ("adamw_torch_fused" selects the fused variant)
    torch_compile=use_torch_compile,
)

In [0]:
# 6) Trainer (Trainer uses GPU automatically if available)
# Hold out a slice of the training split for per-epoch evaluation and for
# best-checkpoint selection. Evaluating on tokenized["test"] here would choose the
# checkpoint on the test set and make the reported test score optimistic.
# The split is seeded from the training arguments so re-running gives the same slice.
# NOTE: stratify_by_column needs "label" to be a ClassLabel feature.
train_val = tokenized["train"].train_test_split(
    test_size=0.1,
    seed=args.seed,
    stratify_by_column="label",
)

# tokenized["test"] stays unseen during training; score it afterwards with
# trainer.evaluate(eval_dataset=tokenized["test"]).
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_val["train"],
    eval_dataset=train_val["test"],   # validation slice carved out of the train split
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [0]:
# Fine-tune, then print the training summary returned by the Trainer.
train_output = trainer.train()
print(train_output)

# Explicit final evaluation on the test split, shown as the cell's result.
# With load_best_model_at_end=True the model scored here is the best checkpoint.
test_metrics = trainer.evaluate(eval_dataset=tokenized["test"], metric_key_prefix="test")
test_metrics

In [0]:
# 7) Save + quick inference
# Write the model and the tokenizer into the same directory so the pair can be
# reloaded together from that one path.
best_dir = "bert-imdb/best"
trainer.save_model(best_dir)
tokenizer.save_pretrained(best_dir)

In [0]:
def show_predictions(title, classifier, texts):
    """Print `title`, then one "<text> -> <prediction>" line per input text.

    Args:
        title: header line printed before the predictions.
        classifier: callable that takes one string and returns its prediction.
        texts: iterable of input strings.
    """
    print(title)
    for text in texts:
        print(f"{text} -> {classifier(text)}")


# Sample sentences for comparison
sentences = [
    "This movie was absolutely wonderful!",
    "The story was slow and boring.",
    "I didn’t expect it to be so emotional.",
    "Terrible plot and wooden acting.",
    "I hope the actors can work harder.",
    "I hope to watch next season.",
    "It is waste of money to watch this movie!"
]

# 1) Baseline: pretrained BERT (before fine-tuning).
# NOTE: "bert-base-uncased" has no trained classification head, so these
# predictions come from freshly initialised weights and only serve as a
# "before" reference point.
base_clf = pipeline("text-classification", model="bert-base-uncased", tokenizer="bert-base-uncased", device_map="auto")
show_predictions("=== Predictions from base BERT (before fine-tuning) ===", base_clf, sentences)

# 2) Fine-tuned model, reloaded from the directory saved earlier in the notebook.
fine_tuned_clf = pipeline("text-classification", model="bert-imdb/best", tokenizer="bert-imdb/best", device_map="auto")
show_predictions("\n=== Predictions from fine-tuned BERT (after fine-tuning) ===", fine_tuned_clf, sentences)
