# 4️⃣ RoBERTa Fine-tuning on TREC

### 🧠 Model: RoBERTa Fine-tuning
**Dataset:** TREC  
**Classes:** 6  
**Technique:** We fine-tune RoBERTa-base on the TREC question dataset.  
This model excels at capturing subtle semantic nuances across short queries. Label smoothing helps generalization on small, imbalanced datasets.


In [None]:
!pip install -q transformers accelerate scikit-learn

In [None]:
!pip install -U -q datasets

In [3]:
from datasets import load_dataset
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

In [None]:
# Load Dataset
dataset = load_dataset("trec")

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'coarse_label', 'fine_label'],
        num_rows: 5452
    })
    test: Dataset({
        features: ['text', 'coarse_label', 'fine_label'],
        num_rows: 500
    })
})

In [None]:
# Tokenization
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

In [None]:
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

tokenized_ds = dataset.map(tokenize, batched=True)
tokenized_ds = tokenized_ds.rename_column("coarse_label", "labels")
tokenized_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [17]:
# Load Model
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=6)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

In [22]:
# Training Arguments
training_args = TrainingArguments(
    output_dir="./roberta-trec",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to="none",
    label_smoothing_factor=0.1
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"].shuffle(seed=42).select(range(2000)),
    eval_dataset=tokenized_ds["test"].select(range(500)),
    compute_metrics=compute_metrics,
)

In [23]:
# Train
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.541851,0.962,0.96272,0.962,0.961131
2,No log,0.534279,0.96,0.960416,0.96,0.959075


TrainOutput(global_step=250, training_loss=0.534156982421875, metrics={'train_runtime': 136.3803, 'train_samples_per_second': 29.33, 'train_steps_per_second': 1.833, 'total_flos': 263120504832000.0, 'train_loss': 0.534156982421875, 'epoch': 2.0})

In [24]:
# Evaluate
results = trainer.evaluate()
print(results)

{'eval_loss': 0.5342788696289062, 'eval_accuracy': 0.96, 'eval_precision': 0.9604155305441681, 'eval_recall': 0.96, 'eval_f1': 0.9590752853395819, 'eval_runtime': 2.7419, 'eval_samples_per_second': 182.358, 'eval_steps_per_second': 5.835, 'epoch': 2.0}
