In [8]:
# Unpack the tokenized_train / tokenized_test archives into their own
# directories under datasets/. `-o` overwrites existing files without
# prompting, so this cell can be re-run safely.
!unzip -o datasets/tokenized_train.zip -d datasets/tokenized_train

!unzip -o datasets/tokenized_test.zip -d datasets/tokenized_test

Archive:  datasets/tokenized_train.zip
  inflating: datasets/tokenized_train/dataset_info.json  
  inflating: datasets/tokenized_train/data-00000-of-00001.arrow  
  inflating: datasets/tokenized_train/state.json  
Archive:  datasets/tokenized_test.zip
  inflating: datasets/tokenized_test/dataset_info.json  
  inflating: datasets/tokenized_test/data-00000-of-00001.arrow  
  inflating: datasets/tokenized_test/state.json  


In [1]:
from datasets import load_from_disk

# Load the train and test datasets from the unzipped directories
# (each directory holds dataset_info.json, state.json and one Arrow data file).
train_dataset = load_from_disk('datasets/tokenized_train/')
test_dataset = load_from_disk('datasets/tokenized_test/')

In [2]:
from transformers import pipeline, RobertaTokenizerFast, RobertaForSequenceClassification, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

In [3]:
# Use a real torch.device instead of the pipeline-style integer (0 / -1).
# `device` is passed to `.to(...)` for both the model and the input tensors,
# and `.to(-1)` raises on CPU-only hosts; torch.device("cpu") works everywhere.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
# The checkpoint ships a 4-class head; requesting num_labels=3 with
# ignore_mismatched_sizes=True makes classifier.out_proj be re-initialised
# randomly. Seed torch first so that initialisation (and therefore the whole
# fine-tuning run) is reproducible after a kernel restart.
torch.manual_seed(42)

model = RobertaForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base-emotion",
    num_labels=3,
    ignore_mismatched_sizes=True,
).to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(p):
    """Compute accuracy for one evaluation pass.

    Args:
        p: An object that unpacks into ``(predictions, labels)``. The
            predicted class for each row is the arg-max of ``predictions``
            along axis 1.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the result of
        ``accuracy_score(labels, predicted_classes)``.
    """
    scores, gold_labels = p
    predicted_classes = np.argmax(scores, axis=1)
    return {"accuracy": accuracy_score(gold_labels, predicted_classes)}

In [10]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

# Training configuration: evaluate and checkpoint once per epoch so that the
# best checkpoint (by accuracy) can be restored when training ends.
training_args = TrainingArguments(
    output_dir="./results",            # Output directory for checkpoints
    eval_strategy="epoch",             # Evaluate after each epoch
    save_strategy="epoch",             # Must match eval_strategy for load_best_model_at_end
    save_total_limit=2,                # Cap disk usage; the best checkpoint is still retained
    learning_rate=2e-5,                # Learning rate
    per_device_train_batch_size=8,     # Batch size for training
    per_device_eval_batch_size=8,      # Batch size for evaluation
    num_train_epochs=15,               # Upper bound; early stopping may end sooner
    weight_decay=0.01,                 # Strength of weight decay
    load_best_model_at_end=True,       # Restore the best checkpoint after training
    metric_for_best_model="accuracy",  # Key returned by compute_metrics
    greater_is_better=True,
)

# NOTE(review): test_dataset is used as eval_dataset, so it drives both early
# stopping and best-checkpoint selection. Accuracy reported on it afterwards is
# therefore optimistically biased; consider carving a validation split out of
# train_dataset and keeping test_dataset for the final report only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    # Stop when accuracy has not improved for 3 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [11]:
# Run fine-tuning. The returned TrainOutput (displayed as the cell result)
# carries global_step, training_loss and a metrics dict.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.9562,0.921654,0.581187
2,0.832,0.857479,0.614765
3,0.6706,0.907373,0.628795
4,0.5137,1.212114,0.612925
5,0.3891,1.511429,0.614765
6,0.3228,2.221519,0.612236


TrainOutput(global_step=13050, training_loss=0.6155856198277967, metrics={'train_runtime': 3724.9329, 'train_samples_per_second': 70.04, 'train_steps_per_second': 8.759, 'total_flos': 2.7457990047258624e+16, 'train_loss': 0.6155856198277967, 'epoch': 6.0})

In [16]:
# Score the model on test_dataset (the same split the Trainer used as its
# eval_dataset). evaluate() reports the "accuracy" value from compute_metrics
# under the key "eval_accuracy".
eval_results = trainer.evaluate(eval_dataset=test_dataset)
print(f"Validation Accuracy: {eval_results['eval_accuracy']:.4f}")

# Score on the training split as well, for comparison with the number above.
train_results = trainer.evaluate(eval_dataset=train_dataset)
print(f"Training Accuracy: {train_results['eval_accuracy']:.4f}")

Validation Accuracy: 0.6288
Training Accuracy: 0.8461


In [13]:
from transformers import RobertaTokenizerFast

# NOTE(review): the tokenizer comes from 'roberta-base' while the model was
# initialised from "cardiffnlp/twitter-roberta-base-emotion" — confirm this is
# the tokenizer that produced the tokenized train/test datasets.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
text = "Poor enterpreneurs working hard and suffer from damn taxes!"
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
# Move the encoded tensors to wherever the model's weights actually live,
# rather than to the global `device`, so inputs and model can never end up on
# different devices.
inputs = {key: value.to(model.device) for key, value in inputs.items()}

# Switch to evaluation mode before running inference.
model.eval()

# Make predictions (no gradients are needed for a forward-only pass)
with torch.no_grad():
    outputs = model(**inputs)

# Index of the highest logit for the single input sequence.
predicted_label = torch.argmax(outputs.logits, dim=1).item()

# NOTE(review): assumes class ids 0/1/2 were encoded as left/center/right when
# the datasets were built — verify against the labelling step.
label_mapping = {0: "left", 1: "center", 2: "right"}
print(f"Predicted class: {label_mapping[predicted_label]}")

Predicted class: right


In [15]:
# Save the fine-tuned model together with the tokenizer used for inference,
# so the directory can be reloaded on its own via from_pretrained().

model.save_pretrained("./fine-tuned-model/v1")
tokenizer.save_pretrained("./fine-tuned-model/v1")