# Fine Tune BERT for sentiment analysis

This file fine-tunes `bert-base-uncased` on `argilla/twitter-coronavirus`. The dataset can be found at [here](https://huggingface.co/datasets/argilla/twitter-coronavirus)

This model is used for generating sentiment scores.

In [1]:
!pip install datasets transformers accelerate evaluate

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310

In [2]:
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
import evaluate
import numpy as np

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

## Load Data

In [3]:
data = load_dataset("argilla/twitter-coronavirus")
data = data.remove_columns(['inputs', 'prediction_agent', 'annotation', 'annotation_agent', 'multi_label', 'explanation', 'id', 'metadata', 'status', 'event_timestamp', 'metrics'])
data = data["train"].train_test_split(test_size=0.3)
data['train'] = data['train'].train_test_split(test_size=0.3)
data['dev'] = data['train']['test']
data['train'] = data['train']['train']
data

Downloading readme:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/15.7M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/44955 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'prediction'],
        num_rows: 22027
    })
    test: Dataset({
        features: ['text', 'prediction'],
        num_rows: 13487
    })
    dev: Dataset({
        features: ['text', 'prediction'],
        num_rows: 9441
    })
})

## Pre-processing

In [4]:
label2idx = {'Extremely Negative': 0, 'Negative': 1, 'Neutral':2, 'Positive':3, 'Extremely Positive':4}
idx2label = {0: 'Extremely Negative', 1: 'Negative', 2: 'Neutral', 3: 'Positive', 4: 'Extremely Positive'}

In [5]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def preprocessing_func(examples):
    inputs = tokenizer(examples['text'], truncation=True)
    inputs['label'] = [label2idx[each[0]['label']] for each in examples['prediction']]
    return inputs

tokenized_data = data.map(preprocessing_func, batched=True, remove_columns=data['test'].column_names)
tokenized_data

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/22027 [00:00<?, ? examples/s]

Map:   0%|          | 0/13487 [00:00<?, ? examples/s]

Map:   0%|          | 0/9441 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 22027
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 13487
    })
    dev: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 9441
    })
})

In [6]:
data_collator = DataCollatorWithPadding(tokenizer)

## Training

In [7]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    matrix = accuracy.compute(predictions=predictions, references=labels)
    matrix['weighted f1'] = f1_score(labels, predictions, average='weighted')
    matrix['macro f1'] = f1_score(labels, predictions, average='macro')
    return matrix

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=5, id2label=idx2label, label2id=label2idx
)

training_args = TrainingArguments(
    output_dir="model-output",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [None]:
trainer.save_model('bert-base-sentiment-analysis')

## Evaluation

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(
    "kwang123/bert-sentiment-analysis", num_labels=5
)

training_args = TrainingArguments(
    output_dir="model-output",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.evaluate(tokenized_data["test"])

config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.3354743421077728,
 'eval_accuracy': 0.8924890635426708,
 'eval_weighted f1': 0.8926163493705788,
 'eval_macro f1': 0.8966626784065721,
 'eval_runtime': 72.4304,
 'eval_samples_per_second': 186.206,
 'eval_steps_per_second': 11.639}