# Fine-tuning Transformer with HuggingFace Trainer: Tweet emotion Multi-class Classification

In [None]:
!pip install transformers evaluate tqdm datasets

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.3.0-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [None]:
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import pandas as pd
import numpy as np
import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

### Data Exploration and Class distribution

In [None]:
df = pd.read_csv('data.csv', encoding='latin-1')
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [None]:
# Clean the dataset, removing link and mention:
import re

def remove_links(text):
    # Regex pattern to match URLs (http, https, www)
    url_pattern = r"http[s]?://\S+|www\.\S+"
    return re.sub(url_pattern, "", text)

def remove_mentions(text):
    # Regex pattern to match mentions (@username)
    mention_pattern = r"@\w+"
    return re.sub(mention_pattern, "", text)

df['OriginalTweet'] = df['OriginalTweet'].apply(remove_links)
df['OriginalTweet'] = df['OriginalTweet'].apply(remove_mentions)
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,and and,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [None]:
# Class distribution
import plotly.express as px

px.bar(df['Sentiment'].value_counts(ascending=True),template='plotly_white')

**Imbalaned classes, should be taken into consideration when doing train test split**

In [None]:
df["Words Per Tweet"] = df["OriginalTweet"].str.split().apply(len)

px.box(df,y='Words Per Tweet',
       color='Sentiment',
       template='plotly_white')

**For applications using DistilBERT, the maximum context size is 512 tokens
Most tweets are around 10-40 words long, which falls well within this limit**

In [None]:
## Selecting and remaning columns
df = df[['OriginalTweet', 'Sentiment']].rename(columns={'OriginalTweet': 'text', 'Sentiment':'label_name'})
# Define a dictionary to map sentiment strings to integer labels
sentiment_mapping = {
    'Extremely Negative': 0,
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3,
    'Extremely Positive': 4
}

# Create a new 'label' column using the map function
df['label'] = df['label_name'].map(sentiment_mapping)
df.head()

Unnamed: 0,text,label_name,label
0,and and,Neutral,2
1,advice Talk to your neighbours family to excha...,Positive,3
2,Coronavirus Australia: Woolworths to give elde...,Positive,3
3,My food stock is not the only one which is emp...,Positive,3
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative,0


### Train-Test split

In [None]:
# Use part of the dataset only, maintaining original distribution
# Stratified sampling to maintain class distribution
df_subset = df.groupby('label_name', group_keys=False).apply(lambda x: x.sample(frac=0.1, random_state=42))
len(df_subset)






4495

In [None]:
# Using stratify to maintain the class distribution
train_df, eval_df = train_test_split(df_subset, test_size=0.2, stratify=df_subset['label'], random_state=42)
# Convert to Dataset
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

### Tokenizer

In [None]:
pretrained_model_name = 'distilbert-base-uncased'
number_of_label = 5

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=128  # instead of using 512, lower the limit to reduce memory usage and fater training
    )

# Tokenize datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3596 [00:00<?, ? examples/s]

Map:   0%|          | 0/899 [00:00<?, ? examples/s]

### Initialize model

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name,
    num_labels=number_of_label
    )

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Initialize Trainer

In [None]:
# 1. Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=0.005,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"
)


# 2. Define metrics for evaluation
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # Compute accuracy
    accuracy = accuracy_score(labels, predictions)

    # Compute precision, recall, and F1-score (macro & weighted for multi-class)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    }


# 3. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
)



### Train and Eval

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Score
1,No log,1.576734,0.27475,0.075487,0.27475,0.118435


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=225, training_loss=1.6677117241753472, metrics={'train_runtime': 1790.126, 'train_samples_per_second': 2.009, 'train_steps_per_second': 0.126, 'total_flos': 119094562698240.0, 'train_loss': 1.6677117241753472, 'epoch': 1.0})

In [None]:
print(trainer.evaluate())

{'eval_loss': 1.576734185218811, 'eval_accuracy': 0.27474972191323693, 'eval_precision': 0.07548740969140103, 'eval_recall': 0.27474972191323693, 'eval_f1_score': 0.11843487140064489, 'eval_runtime': 123.5726, 'eval_samples_per_second': 7.275, 'eval_steps_per_second': 0.461, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Initialize
pretrained_model_name = 'distilbert-base-uncased'
number_of_label = 5

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True
    )

# Tokenize datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)


model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name,
    num_labels=number_of_label
    )

# 2. Define metrics for evaluation
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # Compute accuracy
    accuracy = accuracy_score(labels, predictions)

    # Compute precision, recall, and F1-score (macro & weighted for multi-class)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    }

# 2. Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"
)


# 3. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
)





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3596 [00:00<?, ? examples/s]

Map:   0%|          | 0/899 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

`evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead



In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Score
1,No log,1.26678,0.458287,0.569134,0.458287,0.419523


TrainOutput(global_step=225, training_loss=1.4181065538194444, metrics={'train_runtime': 11783.4784, 'train_samples_per_second': 0.305, 'train_steps_per_second': 0.019, 'total_flos': 476378250792960.0, 'train_loss': 1.4181065538194444, 'epoch': 1.0})

In [None]:
trainer.push_to_hub('finetuned-distilbert-tweet-emotion')

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/minh221/results/commit/4c42cad1b0106a6cde4af356e9dc243117d36c52', commit_message='finetuned-distilbert-tweet-emotion', commit_description='', oid='4c42cad1b0106a6cde4af356e9dc243117d36c52', pr_url=None, repo_url=RepoUrl('https://huggingface.co/minh221/results', endpoint='https://huggingface.co', repo_type='model', repo_id='minh221/results'), pr_revision=None, pr_num=None)