In [1]:
import os
from pathlib import Path

import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, Trainer, TrainingArguments

2025-04-29 15:43:18.752360: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-29 15:43:18.763583: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-29 15:43:18.776105: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8473] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-29 15:43:18.779897: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1471] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-29 15:43:18.790113: I tensorflow/core/platform/cpu_feature_guar

In [2]:
class EmotionDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    ``encodings`` is a mapping from feature name to an indexable collection of
    per-example values; ``labels`` is an indexable collection of labels.
    The dataset length is taken from ``labels`` alone.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels, not the encodings.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including a ``"labels"`` entry."""
        example = {}
        for feature_name, feature_values in self.encodings.items():
            example[feature_name] = torch.tensor(feature_values[idx])
        example["labels"] = torch.tensor(self.labels[idx])
        return example

In [3]:
# Data preparation: load the labelled tweets, split them, tokenize, and wrap in datasets.
SEED = 42  # fixed so the train/validation split is identical on every run
MAX_LENGTH = 128  # token cap passed to the tokenizer; longer texts are truncated
# Same checkpoint name the model cell passes to BertForSequenceClassification.from_pretrained,
# so the tokenizer and the model (and the copies saved together later) come from one checkpoint.
TOKENIZER_CHECKPOINT = "bert-base-uncased"

training_path = Path("Twitter Emotions Classification Dataset/data.csv")

df = pd.read_csv(training_path)
# Everything below reads the 'text' and 'label' columns; fail early with a clear message.
missing_columns = {"text", "label"} - set(df.columns)
if missing_columns:
    raise KeyError(f"{training_path} is missing required column(s): {sorted(missing_columns)}")

# 80/20 split, stratified on the label column and seeded for reproducibility.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(),
    df["label"].tolist(),
    test_size=0.2,
    stratify=df["label"],
    random_state=SEED,
)

tokenizer = BertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_LENGTH)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=MAX_LENGTH)

train_dataset = EmotionDataset(train_encodings, train_labels)
val_dataset = EmotionDataset(val_encodings, val_labels)

# Smaller subsets for quick experiments. EmotionDataset takes its length from the labels
# and indexes the encodings by position, so slicing only the labels yields the first
# 2000 training / 500 validation examples.
mini_train_dataset = EmotionDataset(train_encodings, train_labels[:2000])
mini_val_dataset = EmotionDataset(val_encodings, val_labels[:500])

In [None]:
# Fine-tune BERT for emotion classification, report validation metrics, then save the result.
NUM_LABELS = 6  # size of the classification head; assumes labels are integer ids 0-5 — TODO confirm against data.csv
MODEL_OUTPUT_DIR = "./emotion_model"  # model and tokenizer are saved side by side here

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,  # training loss is logged every 10 steps
    disable_tqdm=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

# Score the validation split: take the highest-scoring class per example and compare
# against the held-out labels.
preds = trainer.predict(val_dataset)
y_pred = preds.predictions.argmax(-1)
print(classification_report(val_labels, y_pred))

# Persist the fine-tuned weights together with the tokenizer used to encode the data.
model.save_pretrained(MODEL_OUTPUT_DIR)
tokenizer.save_pretrained(MODEL_OUTPUT_DIR)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,1.7342
20,1.7749
30,1.5351
40,1.6985
50,1.6256
60,1.6522
70,1.5618
80,1.5316
90,1.804
100,1.5268
