In [1]:
from sklearn.metrics import f1_score, precision_score, recall_score
import torch
import torch.nn as nn
import pandas as pd
import numpy as np

from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
import random
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from torch.nn.parallel import DataParallel
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [2]:
# Load the emotions CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv("/kaggle/input/emotions/text.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


In [3]:
# (rows, columns) of the frame as loaded, before any balancing.
df.shape

(416809, 3)

In [4]:
# Tally rows per class and reshape the tally into a two-column
# (Labels, Count) frame that plotly can consume directly.
counts = (
    df['label']
    .value_counts()
    .rename_axis('Labels')
    .reset_index(name='Count')
)

# One bar per label, coloured by label.
fig = px.bar(
    counts,
    x='Labels',
    y='Count',
    color='Labels',
    title='Label Counts',
)
fig.update_layout(
    xaxis_title='Labels',
    yaxis_title='Count',
)
fig.show()

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_metric

import os
# Turn off the Weights & Biases logging integration for the Trainer.
# NOTE(review): the recorded run output warns that this environment variable is
# deprecated and slated for removal in v5 in favour of the `report_to` option.
os.environ["WANDB_DISABLED"] = "true"

class TextStratifiedData(Dataset):
    """Class-balanced view of a labelled text DataFrame.

    Every class found in the ``label`` column is down-sampled to the size of
    the rarest class. If ``length`` is given, that many rows are then drawn
    at random from the balanced frame (capped at its size). That second draw
    is a plain random sample, so a length-limited dataset is only
    approximately balanced.

    Args:
        df: DataFrame containing a ``label`` column.
        length: Optional number of rows to keep after balancing.
        random_state: Optional seed forwarded to the pandas samplers so the
            selection is reproducible. ``None`` (the default) keeps the
            previous non-deterministic behaviour.

    Raises:
        ValueError: If ``length`` is larger than the number of rows in ``df``.
    """

    def __init__(self, df, length=None, random_state=None):
        if length is not None and length > df.shape[0]:
            raise ValueError("Length parameter cannot be greater than the size of the dataset.")
        self.length = length  # May be None, meaning "keep the whole balanced frame".
        self.random_state = random_state
        self.df = self.stratify(df)

    def stratify(self, df):
        """Return a balanced (and optionally sub-sampled) copy of ``df``."""
        # Nothing to balance; also avoids sampling with an undefined class size.
        if df.empty:
            return df.reset_index(drop=True)

        # Step 1: down-sample every class to the size of the rarest one.
        # DataFrameGroupBy.sample keeps the ``label`` column in the result,
        # which groupby.apply on the grouping column does not guarantee on
        # recent pandas versions.
        min_count = int(df['label'].value_counts().min())
        stratified_df = (
            df.groupby('label')
            .sample(n=min_count, random_state=self.random_state)
            .reset_index(drop=True)
        )

        # Step 2: only sub-sample when a target length was requested.
        if self.length is None:
            return stratified_df
        return stratified_df.sample(
            n=min(self.length, len(stratified_df)),
            random_state=self.random_state,
        )

    def __len__(self):
        """Number of rows in the balanced frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the row at positional index ``idx`` as a pandas Series."""
        return self.df.iloc[idx, :]

    def get_all(self):
        """Return the whole balanced DataFrame."""
        return self.df

# Swap the raw frame for its class-balanced counterpart; later cells read `df`.
df = TextStratifiedData(df).get_all()

2025-05-22 12:38:45.614202: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-22 12:38:45.614315: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-22 12:38:45.764255: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered




In [None]:
# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class proportions of the balanced frame as close as possible in both splits;
# a plain random split lets them drift.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

# Pretrained BERT tokenizer plus a sequence classifier with a 6-way output head.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6)


# Tokenise each split in one call: truncate to at most 128 tokens, pad, and
# return PyTorch tensors.
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=128, return_tensors="pt")
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=128, return_tensors="pt")


train_labels = train_labels.to_numpy()
val_labels = val_labels.to_numpy()


# Each sample is an (input_ids, attention_mask, label) triple.
train_dataset = torch.utils.data.TensorDataset(train_encodings["input_ids"], train_encodings["attention_mask"], torch.tensor(train_labels, dtype=torch.int64))
val_dataset = torch.utils.data.TensorDataset(val_encodings["input_ids"], val_encodings["attention_mask"], torch.tensor(val_labels, dtype=torch.int64))



### Hyper-parameters and checkpointing policy for the fine-tuning run
training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir="./results",
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    # Batch sizes (per device).
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # with the best accuracy when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
)


# Accuracy scorer handed to compute_metrics at evaluation time.
# NOTE(review): `datasets.load_metric` is reportedly deprecated in favour of the
# separate `evaluate` package — confirm against the installed datasets version.
accuracy_metric = load_metric("accuracy")


def compute_metrics(eval_pred, metric=None):
    """Score a batch of predictions produced during evaluation.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` may also be a tuple
            whose first element holds the class scores; the remaining
            elements are ignored.
        metric: Optional object exposing
            ``compute(predictions=..., references=...)``. When omitted,
            accuracy is computed directly with NumPy so no external metric
            loader is required.

    Returns:
        Whatever ``metric.compute`` returns, or ``{"accuracy": float}`` when
        no metric object is supplied.
    """
    logits, labels = eval_pred
    # Some models return extra outputs alongside the logits.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    if metric is not None:
        return metric.compute(predictions=predictions, references=labels)

    # Fallback: fraction of predictions that match the reference labels.
    matches = predictions == np.asarray(labels)
    return {"accuracy": float(np.mean(matches))}


class CustomDataCollator:
    """Turn a list of ``(input_ids, attention_mask, label)`` samples into a batch dict."""

    def __call__(self, data):
        """Assemble the batch.

        Args:
            data: Sequence of indexable samples; positions 0 and 1 hold
                tensors to be stacked, position 2 holds the label.

        Returns:
            Dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        """
        def field(position):
            # Gather one component from every sample, preserving order.
            return [sample[position] for sample in data]

        return {
            "input_ids": torch.stack(field(0)),
            "attention_mask": torch.stack(field(1)),
            "labels": torch.tensor(field(2)),
        }


# Collator that converts TensorDataset triples into the keyword batch the model expects.
data_collator = CustomDataCollator()

# Wire the model, data, collator and metric callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Bind the accuracy scorer so the Trainer can call the hook with one argument.
    compute_metrics=lambda prediction_output: compute_metrics(prediction_output, accuracy_metric),
)


#### Fine-tune the model with the Trainer built earlier in this cell
trainer.train()


# Run a final evaluation pass on the validation dataset and report its accuracy.
eval_result = trainer.evaluate()
print(f"Accuracy: {eval_result['eval_accuracy']}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]


Passing the following arguments to `Accelerator` is deprecated and will be removed in version 1.0 of Accelerate: dict_keys(['dispatch_batches', 'split_batches', 'even_batches', 'use_seedable_sampler']). Please pass an `accelerate.DataLoaderConfiguration` instead: 
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.



Epoch,Training Loss,Validation Loss,Accuracy
1,0.164,0.136788,0.949741
2,0.1124,0.112128,0.953693
3,0.1011,0.11317,0.953025



Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.


Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.


Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.



Accuracy: 0.9536928813936661


events.out.tfevents.1747917551.9bdd49b64e5c.26.0:   0%|          | 0.00/7.40k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

events.out.tfevents.1747920525.9bdd49b64e5c.26.1:   0%|          | 0.00/411 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nadchan/results/commit/78de5fd061bda2a848babeb0cea846afc97c1a19', commit_message='bert-emot', commit_description='', oid='78de5fd061bda2a848babeb0cea846afc97c1a19', pr_url=None, pr_revision=None, pr_num=None)