# Fine-Tuning RoBERTa with Hugging Face Transformers: Unveiling Sentiments in Political News from the 2016 US Presidential Campaign

https://colab.research.google.com/drive/1zOCMZkZlo88Fst9P6CgoHUYaPdqUdqGA?usp=sharing

In [2]:
# This cell is only needed on Google Colab. Running the notebook on Colab is recommended,
# because the code might not run on your local computer.

# !pip install transformers datasets huggingface_hub tensorboard==2.11
# !sudo apt-get install git-lfs --yes
# !pip install transformers[torch]
# !pip install accelerate -U

In [None]:
import torch
from datasets import load_dataset
from datasets import Dataset
import datasets as ds
from datasets import DatasetDict
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login
import pandas as pd

In [None]:
# Interactive Hugging Face login. A token is required further down because the
# training configuration pushes checkpoints to the Hub (`push_to_hub=True`).
notebook_login()

In [5]:
# Pretrained checkpoint that is fine-tuned (used for tokenizer, config and model).
model_id = "roberta-base"
# Hub dataset identifier passed to `load_dataset`.
dataset_id = "fhamborg/news_sentiment_newsmtsc"
# Used both as the local output directory and as the Hub model repository id.
repository_id = "MaxG1/roberta_fine_tuned_on_newsmstc_02_split"

In [None]:
# Load dataset (all splits) from the Hugging Face Hub.
dataset_raw = load_dataset(dataset_id)

# Raw polarity values are signed; class ids used for training must be the
# contiguous, non-negative integers 0 .. num_classes - 1.
polarity_to_class_id = {-1: 0, 0: 1, 1: 2}

# Human-readable class names; list position == class id produced by the mapping above.
feat_sentiment = ds.ClassLabel(num_classes=3, names=["negative", "neutral", "positive"])


def encode_polarity(batch):
    """Build the two training columns from a batch of raw rows.

    Args:
        batch: dict of column name -> list of values, as passed by
            `map(..., batched=True)`. Must contain "sentence" and "polarity".

    Returns:
        dict with "text" (the sentences, unchanged) and "label" (class ids).

    Raises:
        KeyError: if a polarity value is not one of -1, 0, 1. Failing loudly is
            deliberate so an unexpected label cannot slip through unmapped.
    """
    return {
        "text": batch["sentence"],
        "label": [polarity_to_class_id[polarity] for polarity in batch["polarity"]],
    }


# One pass over every split: keep only text + label and drop all raw columns.
# Only the label column is remapped (no whole-table value replacement).
dataset_encoded = dataset_raw.map(
    encode_polarity,
    batched=True,
    remove_columns=dataset_raw["train"].column_names,
)

# Attach the class names to the integer label column of every split.
dataset_sent = dataset_encoded.cast_column("label", feat_sentiment)

# Final per-split datasets used by the following cells.
train_dataset = dataset_sent["train"]
test_dataset = dataset_sent["test"]
val_dataset = dataset_sent["validation"]

In [None]:
# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

# Columns exposed to the model as torch tensors after tokenisation.
_MODEL_COLUMNS = ["input_ids", "attention_mask", "label"]


def tokenize(batch):
    """Tokenise the "text" column of a batch.

    Sequences are truncated to at most 512 tokens and padded to the longest
    sequence in the batch they are tokenised with.
    """
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)


def _encode_split(split):
    """Tokenise one split as a single batch and switch it to torch format.

    `batch_size=len(split)` makes the whole split one batch, so every example
    in the split is padded to the same length.
    """
    encoded = split.map(tokenize, batched=True, batch_size=len(split))
    encoded.set_format("torch", columns=_MODEL_COLUMNS)
    return encoded


train_dataset = _encode_split(train_dataset)
val_dataset = _encode_split(val_dataset)
test_dataset = _encode_split(test_dataset)

# Extract the number of classes and their names from the label feature
label_feature = dataset_sent["train"].features["label"]
num_labels = label_feature.num_classes
class_names = label_feature.names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create the id <-> label mappings from the dataset itself (single source of truth).
# They let a pipeline output class names directly instead of LABEL_0 / LABEL_1 / ...
id2label = dict(enumerate(class_names))
label2id = {label: idx for idx, label in id2label.items()}

# Update the model's configuration with BOTH mappings. Setting only `id2label`
# would leave the default `label2id` in place, so the saved config would carry
# two mappings that disagree with each other.
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label, "label2id": label2id})

In [None]:
# Model
# Seed torch's global RNG *before* the model is built: the classification head
# added on top of the pretrained encoder is freshly initialised here, and the
# Trainer applies its own seed only when it is constructed, i.e. after this line.
# 42 is also the default `seed` of TrainingArguments.
SEED = 42
torch.manual_seed(SEED)
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

# TrainingArguments
# `repository_id` is reused as local output directory, log directory root and Hub repo id.
training_args = TrainingArguments(
    # --- output locations and reporting ---
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    report_to="tensorboard",
    # --- optimisation ---
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # --- logging / evaluation / checkpoint cadence ---
    logging_strategy="steps",
    logging_steps=10,
    # evaluation and saving use the same strategy, as needed by load_best_model_at_end
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # --- Hugging Face Hub ---
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: object with `predictions` (model logits, or a tuple whose
            first element is the logits) and `label_ids` (reference class ids),
            as handed over by the Trainer.

    Returns:
        dict with a single "accuracy" entry (fraction of correct predictions).
    """
    logits = eval_pred.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted_ids = logits.argmax(axis=-1)
    return {"accuracy": float((predicted_ids == eval_pred.label_ids).mean())}


# Trainer
# Without `compute_metrics` evaluation would report the loss only; with it,
# every evaluation additionally reports accuracy on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model on `train_dataset` using the settings in `training_args`
# (evaluates and saves once per epoch, pushing each saved checkpoint to the Hub).
trainer.train()

In [None]:
# Evaluate on the Trainer's `eval_dataset` (the validation split); the cell output
# is the returned metrics dict. The test split is not evaluated here.
trainer.evaluate()

In [None]:
# Save the tokenizer files into the local output directory (`repository_id` is also
# the Trainer's `output_dir`), then create the model card.
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the Hub repository configured in `training_args`.
trainer.push_to_hub()

Bibliography

Moraites, A. (2023, March 24). Fine-tuning RoBERTa for Topic Classification with Hugging Face Transformers and Datasets Library. Medium. https://medium.com/@achillesmoraites/fine-tuning-roberta-for-topic-classification-with-hugging-face-transformers-and-datasets-library-c6f8432d0820
