**Tweet Emotion Multi-Class Classification using Transformer Model**

In [None]:
# import necessary libraries
import numpy as np  # Matrix and vector computation package
import pandas as pd
import matplotlib.pyplot as plt  # Plotting library
from tqdm import tqdm_notebook

In [None]:
# Transformers installation
! pip install transformers datasets --q
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

In [None]:
!git clone https://github.com/huggingface/transformers
!pip install /content/transformers

fatal: destination path 'transformers' already exists and is not an empty directory.
Processing ./transformers
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.49.0.dev0-py3-none-any.whl size=10770282 sha256=60357fde731aead7dc7f1403d82633d6ee4f54d94a4ffa6ac30b90d7f7fc07e2
  Stored in directory: /tmp/pip-ephem-wheel-cache-x2wyd4nq/wheels/9f/62/72/77fdff469e8308ad837268261590df9cabff9926cc4ab177c0
Successfully built transformers
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.49.0.dev0
    Uninstalling transformers-4.49.0.dev0:
      Successfully uninstalled transformers-4.49.0.dev0
Successfully installed tra

In [None]:
!pip install accelerate -U



In [None]:
# Install necessary libraries
!pip install datasets transformers evaluate --q

# Import required libraries
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate
from sklearn.metrics import accuracy_score


**Load the dataset**

In [None]:
# Load the dataset
dataset = load_dataset("argilla/twitter-coronavirus")

# Split dataset into 90% train, 10% test
split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


**Tokenizer**

In [None]:
# Load RoBERTa tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base")


In [None]:

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Apply tokenization to datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

In [None]:
#  Extract labels and convert text labels to numeric labels
label_mapping = {
    "Extremely Negative": 0,
    "Negative": 1,
    "Neutral": 2,
    "Positive": 3,
    "Extremely Positive": 4
}

def extract_label(example):
    if example["prediction"]:
        label_text = example["prediction"][0]["label"]
        return {"label": label_mapping[label_text]}
    else:
        return {"label": 2}

# Apply label extraction
tokenized_train_dataset = tokenized_train_dataset.map(extract_label)
tokenized_eval_dataset = tokenized_eval_dataset.map(extract_label)


**Testing model with samller sample**

In [None]:
# Smaller Dataset for Faster Training
small_train_dataset = tokenized_train_dataset.shuffle(seed=42).select(range(200))
small_eval_dataset = tokenized_eval_dataset.shuffle(seed=42).select(range(50))

**Initialize model**

In [None]:
# Import necessary libraries
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import torch


In [None]:
# Convert dataset to PyTorch format
small_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
small_eval_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])


In [None]:
# Load pre-trained RoBERTa model for classification
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=5)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
import os

os.environ["WANDB_DISABLED"] = "true"

**Initialize Trainer**

In [11]:
#  Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir="./logs",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


**Define metrics for evaluation**

In [12]:
# Local Accuracy Metric
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}


In [13]:
# Load accuracy metric
metric = evaluate.load("accuracy")

In [14]:
# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

**Train and Eval**

In [None]:
# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.594128,0.28


Evaluation Results: {'eval_loss': 1.5941276550292969, 'eval_accuracy': 0.28, 'eval_runtime': 98.0884, 'eval_samples_per_second': 0.51, 'eval_steps_per_second': 0.071, 'epoch': 1.0}


**Testing different hyperparameters**

In [15]:
# Define different hyperparameter settings
test_configs = [
    {"epochs": 3, "lr": 2e-5, "batch_size": 8},
    {"epochs": 1, "lr": 5e-5, "batch_size": 8},
]
# Loop through different settings and train models
for config in test_configs:
    print(f"\n Running with epochs={config['epochs']}, lr={config['lr']}, batch_size={config['batch_size']}")

    training_args = TrainingArguments(
        output_dir=f"./results_epochs{config['epochs']}_lr{config['lr']}_batch{config['batch_size']}",
        num_train_epochs=config["epochs"],
        per_device_train_batch_size=config["batch_size"],
        per_device_eval_batch_size=config["batch_size"],
        learning_rate=config["lr"],
        evaluation_strategy="epoch",
        logging_dir="./logs",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Evaluate after training
    results = trainer.evaluate()
    print(f"Results: {results}")


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



 Running with epochs=3, lr=2e-05, batch_size=8


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.572971,0.2
2,No log,1.557245,0.3
3,No log,1.541921,0.32


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Results: {'eval_loss': 1.5419211387634277, 'eval_accuracy': 0.32, 'eval_runtime': 93.1913, 'eval_samples_per_second': 0.537, 'eval_steps_per_second': 0.075, 'epoch': 3.0}

 Running with epochs=1, lr=5e-05, batch_size=8


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.532626,0.32


Results: {'eval_loss': 1.5326259136199951, 'eval_accuracy': 0.32, 'eval_runtime': 93.7599, 'eval_samples_per_second': 0.533, 'eval_steps_per_second': 0.075, 'epoch': 1.0}
