<a href="https://colab.research.google.com/github/MaxIG1/Sentiment_Analysis_with_Roberta/blob/main/RoBERTa_Finetuning_Sentiment_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-Tuning RoBERTa with Hugging Face Transformers: Unveiling Sentiments in Political News from the 2016 USD Presidential Campaign

In [None]:
!pip install transformers datasets huggingface_hub tensorboard==2.11
!sudo apt-get install git-lfs --yes
!pip install transformers[torch]
!pip install accelerate -U


Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard==2.11
  Downloading tensorboard-2.11.0-py3-none-any.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
Collecting google-auth-oauthlib<0.5,>=0.4.1 (from tensorboard==2.11)
  Downloading google_auth_oauthlib-0.4.6-py2.py3-none-any.whl (18 kB)
Collecting tensorboard-data-server<0.7.0,>=0.6.0 (from tensorboard==2.11)
  Downloading tensorboard_data_server-0.6.1-py3-none-manylinux2010_x86_64.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorboard-plugin-wit>=1.6.0 (from tensorboard==2.11)
  Downloading tensorboard_plugin_wit-1.8.1-py3-none-any.whl (781 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
from datasets import load_dataset
from datasets import Dataset
import datasets as ds
from datasets import DatasetDict
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model_id = "roberta-base"
dataset_id = "fhamborg/news_sentiment_newsmtsc"
repository_id = "superHans1/roberta_fine_tuned_on_newsmstc_02_split"

In [None]:
# Load dataset

dataset_sent = load_dataset(dataset_id)

# Define the columns to keep
columns_to_keep = ["polarity", "sentence"]

# Remove columns from each split
for split in dataset_sent.keys():
    dataset_sent[split] = dataset_sent[split].remove_columns(
        [col for col in dataset_sent[split].column_names if col not in columns_to_keep]
    )
# rename, so that datasets understands what is what
dataset_sent = dataset_sent.rename_columns({"polarity": "label", "sentence": "text"})

# rename because the models only support Z as numbers
df_train = pd.DataFrame(dataset_sent["train"]).replace({-1: 0, 0: 1, 1: 2})
df_test = pd.DataFrame(dataset_sent["test"]).replace({-1: 0, 0: 1, 1: 2})
df_val = pd.DataFrame(dataset_sent["validation"]).replace({-1: 0, 0: 1, 1: 2})

# Concatenate vertically
result_df = pd.concat([df_train, df_test, df_val], axis=0, ignore_index=True)

# Split into train (80%), validation (10%), and test (10%)
train_df, temp_df = train_test_split(result_df, test_size=0.2, random_state=42)
df_val, df_test = train_test_split(temp_df, test_size=0.5, random_state=42)


# recreation the dataset dictionary
dataset_sent = ds.DatasetDict(
    {
        "train": Dataset.from_pandas(df_train),
        "test": Dataset.from_pandas(df_test),
        "validation": Dataset.from_pandas(df_val),
    }
)

# encode as label column
dataset_sent["train"] = dataset_sent["train"].class_encode_column("label")
dataset_sent["validation"] = dataset_sent["validation"].class_encode_column("label")
dataset_sent["test"] = dataset_sent["test"].class_encode_column("label")

# creating feature names
feat_sentiment = ds.ClassLabel(num_classes=3, names=["negative", "neutral", "positive"])
# casting the feature names to the columns

dataset_sent["train"] = dataset_sent["train"].cast_column("label", feat_sentiment)
dataset_sent["validation"] = dataset_sent["validation"].cast_column(
    "label", feat_sentiment
)
dataset_sent["test"] = dataset_sent["test"].cast_column("label", feat_sentiment)


# final, creation of the dataset
train_dataset = dataset_sent["train"]
test_dataset = dataset_sent["test"]
val_dataset = dataset_sent["validation"]

Stringifying the column:   0%|          | 0/8739 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/8739 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/988 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/988 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/989 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/989 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8739 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/988 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/989 [00:00<?, ? examples/s]

In [None]:
# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

# Creatino of the tokenizer, truncation true, because the datasets is not truncated at some single positions.
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)


train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Extract the number of classess and their names
num_labels = dataset_sent['train'].features['label'].num_classes
class_names = dataset_sent["train"].features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
# We will need this to directly output the class names when using the pipeline without needing to map the labels later.
id2label = {i: label for i, label in enumerate(class_names)}
id2label = {0: 'negative', 1: "neutral", 2: "positive"}

# 3. Update the model's configuration with the id2label mapping
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label})

Map:   0%|          | 0/8739 [00:00<?, ? examples/s]

Map:   0%|          | 0/988 [00:00<?, ? examples/s]

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

number of labels: 3
the labels: ['negative', 'neutral', 'positive']


In [None]:
# Model
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

# TrainingArguments
training_args = TrainingArguments(
    output_dir=repository_id,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.6353,0.495326
2,0.5258,0.375376


TrainOutput(global_step=2186, training_loss=0.6605266946668703, metrics={'train_runtime': 1603.001, 'train_samples_per_second': 10.903, 'train_steps_per_second': 1.364, 'total_flos': 4598696315049984.0, 'train_loss': 0.6605266946668703, 'epoch': 2.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.3753761947154999,
 'eval_runtime': 8.097,
 'eval_samples_per_second': 122.021,
 'eval_steps_per_second': 15.314,
 'epoch': 2.0}

In [None]:
# Save our tokenizer and create model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub()

events.out.tfevents.1702997423.a3e41d820b2e.1085.7:   0%|          | 0.00/311 [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1702995793.a3e41d820b2e.1085.6:   0%|          | 0.00/39.5k [00:00<?, ?B/s]

'https://huggingface.co/superHans1/roberta_fine_tuned_on_newsmstc_02_split/tree/main/'

Bibliography

Moraites, A. (2023, March 24). Fine-tuning RoBERTa for Topic Classification with Hugging Face Transformers and Datasets Library. Medium. https://medium.com/@achillesmoraites/fine-tuning-roberta-for-topic-classification-with-hugging-face-transformers-and-datasets-library-c6f8432d0820
