In [None]:
!pip install transformers datasets --quiet
!pip install -U scikit-learn


In [None]:
# Step 1: Import Libraries
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Step 2: Load Dataset (IMDb reviews, used here for sentiment classification)
dataset = load_dataset("imdb")

# Shuffle each split with a fixed seed so the subsample is reproducible,
# then keep only a small slice of each to keep the run fast.
shuffled_train = dataset["train"].shuffle(seed=42)
shuffled_test = dataset["test"].shuffle(seed=42)
train_dataset = shuffled_train.select(range(2000))
test_dataset = shuffled_test.select(range(1000))

# Step 3: Load Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Step 4: Tokenize Text
def tokenize_function(example):
    """Run the module-level ``tokenizer`` over the ``"text"`` entry of ``example``.

    The tokenizer is called with truncation enabled and
    ``padding="max_length"``; its return value is passed back unchanged.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Apply the tokenizer to both splits (train first, then test) in batched mode.
tokenized_train, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Step 5: Format for PyTorch
# Only these columns are exposed as torch tensors on each split.
TORCH_COLUMNS = ("input_ids", "attention_mask", "label")
for tokenized_split in (tokenized_train, tokenized_test):
    # Hand each split its own list so the two never share one object.
    tokenized_split.set_format("torch", columns=list(TORCH_COLUMNS))

# Step 6: Load Pretrained BERT with Classification Head
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

# Step 7: Define Metrics
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for a prediction object.

    ``pred`` must expose ``label_ids`` (the reference labels) and
    ``predictions`` (scores whose argmax over the last axis is taken as the
    predicted class). Precision, recall and F1 use ``average='binary'``.

    Returns a dict with the keys ``"accuracy"``, ``"precision"``,
    ``"recall"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element (support) is not reported.
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )

    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Step 8: Training Arguments
import dataclasses

# Recent transformers releases renamed the `evaluation_strategy` argument to
# `eval_strategy` and later removed the old keyword entirely. The install cell
# does not pin a version, so choose whichever spelling the installed
# TrainingArguments dataclass actually declares instead of hard-coding one.
_training_arg_names = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",          # checkpoints are written here
    save_strategy="epoch",           # same cadence as evaluation (set below)
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # Evaluate once per epoch, using the keyword this transformers version accepts.
    **{_eval_strategy_key: "epoch"},
)

# Step 9: Initialize Trainer
# Collect the Trainer inputs in one place before constructing it.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_test,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Step 10: Train
trainer.train()

# Step 11: Evaluate
# Runs evaluation on the eval dataset given to the Trainer and prints the result.
results = trainer.evaluate()
print(results)
