In [None]:
!pip install -q -U bitsandbytes transformers tensorboard==2.11 accelerate datasets scipy einops evaluate trl rouge_score hf_xet
!sudo apt-get install git-lfs --yes

In [1]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig,
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    AutoConfig
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np
import os
from huggingface_hub import notebook_login, HfFolder

# interpreter_login()
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Token has not been saved to git credential helper.


# INTRO

I am finetuning the roberta-base model from FacebookAI. HuggingFace link: http://huggingface.co/FacebookAI/roberta-base

The finetuned model can be found here (the dataset it was trained on can be found here aswell): https://huggingface.co/nicolasacosta/roberta-base_bbc-news

Dataset on HuggingFace: https://huggingface.co/datasets/SetFit/bbc-news

Link to the original dataset (paper is there also): http://mlg.ucd.ie/datasets/bbc.html

The code has been adapted from the following guide: https://achimoraites.medium.com/fine-tuning-roberta-for-topic-classification-with-hugging-face-transformers-and-datasets-library-c6f8432d0820

### Setting IDs for model and dataset repositories from HuggingFace

In [2]:
model_id = "roberta-base"
dataset_id = "SetFit/bbc-news"
# relace the value with your model: ex <hugging-face-user>/<model-name>
repository_id = "nicolasacosta/roberta-base_bbc-news"

### Load and adjust dataset 

In [3]:
from datasets import ClassLabel

label_names = ["tech", "business", "sport", "entertainment", "politics"]

# Load dataset
dataset = load_dataset(dataset_id)

# Training and testing datasets
train_dataset = dataset['train']
test_dataset = dataset["test"].shard(num_shards=2, index=0)

# Validation dataset
val_dataset = dataset['test'].shard(num_shards=2, index=1)

# Ensure label column aligns with label_text (in case it's not yet properly aligned)
def align_label(example):
    example["label"] = label_names.index(example["label_text"])
    return example

# Create the ClassLabel feature
class_label = ClassLabel(num_classes=len(label_names), names=label_names)

# Map and cast on each split
train_dataset = train_dataset.map(align_label)
train_dataset = train_dataset.cast_column("label", class_label)

test_dataset = test_dataset.map(align_label)
test_dataset = test_dataset.cast_column("label", class_label)

val_dataset = val_dataset.map(align_label)
val_dataset = val_dataset.cast_column("label", class_label)

print(train_dataset.features)

{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['tech', 'business', 'sport', 'entertainment', 'politics'], id=None), 'label_text': Value(dtype='string', id=None)}


### Preprocess text (data was cleaned by original authors of the dataset)

In [4]:
# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

# This function tokenizes the input text using the RoBERTa tokenizer. 
# It applies padding and truncation to ensure that all sequences have the same length (256 tokens).
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=256)

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [5]:
# Set dataset format
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

### Add label mapping to Model Config (will make model inference easier)

In [6]:
# We will need this to directly output the class names when using the pipeline without mapping the labels later.
# Extract the number of classes and their names
num_labels = train_dataset.features['label'].num_classes
class_names = train_dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
id2label = {i: label for i, label in enumerate(class_names)}
print(id2label)

# Update the model's configuration with the id2label mapping
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label})

number of labels: 5
the labels: ['tech', 'business', 'sport', 'entertainment', 'politics']
{0: 'tech', 1: 'business', 2: 'sport', 3: 'entertainment', 4: 'politics'}


### Helper function for model training metrics

In [7]:
def compute_metrics(pred):
    logits, labels = pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    
    # Calculate all metrics
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")
    precision = precision_score(labels, predictions, average="weighted")
    recall = recall_score(labels, predictions, average="weighted")
    
    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall
    } 

### Load Model with config and define Trainer and training hyperparameters

In [8]:
# Model
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

# TrainingArguments
training_args = TrainingArguments(
    output_dir=repository_id,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Train the model

In [9]:
# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1976,0.155735,0.956,0.95625,0.958922,0.956
2,0.2669,0.148644,0.968,0.967924,0.968387,0.968
3,0.0181,0.2332,0.97,0.969995,0.970477,0.97
4,0.0957,0.137774,0.978,0.978119,0.978484,0.978
5,0.0009,0.18259,0.974,0.974012,0.974281,0.974


TrainOutput(global_step=770, training_loss=0.29517584381528295, metrics={'train_runtime': 1424.1757, 'train_samples_per_second': 4.301, 'train_steps_per_second': 0.541, 'total_flos': 805799311296000.0, 'train_loss': 0.29517584381528295, 'epoch': 5.0})

### Evaluate the model after training

In [10]:
# Evaluate the model
trainer.evaluate()

{'eval_loss': 0.13777443766593933,
 'eval_accuracy': 0.978,
 'eval_f1': 0.9781194073686014,
 'eval_precision': 0.9784841219575017,
 'eval_recall': 0.978,
 'eval_runtime': 30.6127,
 'eval_samples_per_second': 16.333,
 'eval_steps_per_second': 2.058,
 'epoch': 5.0}

### Save tokenizer and push to HuggingFace repo

In [11]:
# Save our tokenizer and create a model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub()

events.out.tfevents.1746045283.LAPTOP-LUNKGTUI.27355.1:   0%|          | 0.00/512 [00:00<?, ?B/s]

events.out.tfevents.1746043714.LAPTOP-LUNKGTUI.27355.0:   0%|          | 0.00/24.0k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/nicolasacosta/roberta-base_bbc-news/commit/bf81750a223e0c9d657efc4dc5cba46d6e39f1ae', commit_message='End of training', commit_description='', oid='bf81750a223e0c9d657efc4dc5cba46d6e39f1ae', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nicolasacosta/roberta-base_bbc-news', endpoint='https://huggingface.co', repo_type='model', repo_id='nicolasacosta/roberta-base_bbc-news'), pr_revision=None, pr_num=None)

### Model Inference

In [None]:
# TEST MODEL

# from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

# Alternative way to conduct model inference, will return the same result
config = AutoConfig.from_pretrained(repository_id)

# Load the pre-trained model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(repository_id, config=config)
tokenizer = AutoTokenizer.from_pretrained(repository_id)

text = "Traditional TV Outlets Will Begin to Be Rolled Up by Private Equity in 2026, Analyst Predicts  Despite the initial view in Hollywood and on Wall Street that the Trump administration would accelerate consolidation in the media industry, that hasn’t happened. Instead, the president has taken aim… [+3241 chars]"
        
# Tokenize and prepare input for model
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

# Perform classification
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    confidence, predicted_class = torch.max(predictions, dim=1)
    
    # Get the predicted label
    predicted_label = model.config.id2label[predicted_class.item()]
    confidence = confidence.item()

from transformers import pipeline

classifier = pipeline('text-classification', repository_id)

text = "Traditional TV Outlets Will Begin to Be Rolled Up by Private Equity in 2026, Analyst Predicts  Despite the initial view in Hollywood and on Wall Street that the Trump administration would accelerate consolidation in the media industry, that hasn’t happened. Instead, the president has taken aim… [+3241 chars]"
result = classifier(text)

predicted_label = result[0]["label"]
print(f"Predicted label: {predicted_label}")
predicted_score = result[0]["score"]
print(f"Predicted score: {predicted_score}")

Device set to use cuda:0


Predicted label: business
Predicted score: 0.9955815672874451


: 