In [2]:
# Colab-only step: mount Google Drive into the runtime. Later cells read the
# dataset from, and save the trained model to, paths under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
!pip install -q transformers datasets torch accelerate scikit-learn

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/75.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.1/75.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.7/119.7 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.0/201.0 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.3/150.3 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.9/193.9 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

**Import All Requirements**

In [4]:
import pandas as pd
import numpy as np
import torch

from datasets import Dataset
from transformers import (AutoTokenizer,AutoModelForSequenceClassification,TrainingArguments,Trainer)

from sklearn.metrics import accuracy_score, f1_score, classification_report




# Pre-Processing

In [5]:
# Location of the labelled CSV on the mounted Drive.
DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/Sentiment and Emotion Analysis.csv"

# Read the file, keep only the text and label columns, and drop any row
# that is missing either of them.
df = pd.read_csv(DATA_PATH).loc[:, ['sentence', 'emotion']].dropna()

# Sanity check: first rows, then the number of examples per emotion.
print(df.head())
emotion_counts = df['emotion'].value_counts()
print(emotion_counts)


                                            sentence emotion
0      i just feel really helpless and heavy hearted    fear
1  ive enjoyed being able to slouch about relax a...     sad
2  i gave up my internship with the dmrg and am f...    fear
3                         i dont know i feel so lost     sad
4  i am a kindergarten teacher and i am thoroughl...    fear
emotion
joy        3395
sad        2935
anger      1367
fear       1149
love        789
suprise     364
Name: count, dtype: int64


# Encode Labels

In [6]:
# Sorting the distinct emotion names gives a stable name -> id assignment.
label_list = sorted(df['emotion'].unique())

# Lookups in both directions between emotion names and integer class ids.
id2label = dict(enumerate(label_list))
label2id = {name: class_id for class_id, name in id2label.items()}

# Integer target column derived from the emotion name.
df['label'] = df['emotion'].map(label2id)


In [7]:
# Work on a fixed-seed subsample of at most 5000 rows to fit a low-spec
# runtime. Capping at len(df) avoids the ValueError that DataFrame.sample
# raises when more rows are requested than the frame contains.
df = df.sample(n=min(5000, len(df)), random_state=42).reset_index(drop=True)


# Splitting

In [8]:
from datasets import Dataset, Features, ClassLabel, Value

# Declare 'label' as a ClassLabel over the known emotion names; the split
# below stratifies on this column.
features = Features({
    'sentence': Value('string'),
    'label': ClassLabel(names=label_list),
})

# Build the dataset from the two needed columns and split it 80/20,
# stratified by label, with a fixed seed.
dataset = Dataset.from_pandas(
    df[['sentence', 'label']],
    features=features,
).train_test_split(test_size=0.2, stratify_by_column='label', seed=42)

train_ds, test_ds = dataset['train'], dataset['test']

# Tokenizer

In [9]:
# Checkpoint name used for both the tokenizer and the classification model.
MODEL_NAME = "microsoft/deberta-v3-base"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_NAME)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [10]:
def tokenize(batch):
    """Tokenize the 'sentence' field of a batch.

    Every sequence is truncated to, and padded up to, 128 tokens, so all
    encoded examples share the same length.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 128,
    }
    return tokenizer(batch['sentence'], **encode_options)

# Tokenize both splits in batches.
train_ds, test_ds = (
    split.map(tokenize, batched=True) for split in (train_ds, test_ds)
)

# Expose only the model inputs and the target, as torch tensors.
for split in (train_ds, test_ds):
    split.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

# Load DeBERTa-v3-base Model

In [11]:
# Load the checkpoint with a sequence-classification head that has one output
# per emotion, passing both label lookups along with it.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_list),
    label2id=label2id,
    id2label=id2label,
)


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Evaluation Metrics

In [12]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each example is the argmax of its logits along axis 1.

    Returns a dict with the keys ``"accuracy"`` and ``"macro_f1"``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)

    accuracy = accuracy_score(references, predicted)
    macro_f1 = f1_score(references, predicted, average='macro')
    return {"accuracy": accuracy, "macro_f1": macro_f1}


# Training Arguments

In [13]:
training_args = TrainingArguments(
    output_dir="./deberta_emotion_model",
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy="epoch",
    save_strategy="epoch",

    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,

    weight_decay=0.01,
    warmup_ratio=0.1,

    # After training, restore the checkpoint with the best macro-F1
    # (the key returned by compute_metrics).
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",

    fp16=False,
    # Was " torch" (leading space, not a valid optimizer name); use the
    # PyTorch AdamW implementation explicitly.
    optim="adamw_torch",
    report_to="none"
)


# Trainer Setup & Train The model

In [14]:
# Wire the model, data splits and metric function into a Trainer.
# `processing_class` replaces the deprecated `tokenizer` argument, which
# emits a FutureWarning on recent transformers releases and is slated for
# removal. NOTE(review): requires a transformers version that accepts
# `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

  trainer = Trainer(


model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,0.767967,0.758,0.522131
2,0.961400,0.442753,0.86,0.720416
3,0.961400,0.353631,0.891,0.822757




TrainOutput(global_step=750, training_loss=0.7518279215494792, metrics={'train_runtime': 353.1756, 'train_samples_per_second': 33.977, 'train_steps_per_second': 2.124, 'total_flos': 789375670272000.0, 'train_loss': 0.7518279215494792, 'epoch': 3.0})

# Saving the model

In [16]:
# Destination directory on the mounted Drive.
SAVE_PATH = "/content/drive/MyDrive/emotion_model_final"

# Write the trained model and the tokenizer into the same directory.
trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)

# Also store both label lookups as standalone pickle files next to the model.
import pickle
for filename, mapping in (("label2id.pkl", label2id), ("id2label.pkl", id2label)):
    with open(f"{SAVE_PATH}/{filename}", "wb") as handle:
        pickle.dump(mapping, handle)

# Test

In [17]:
# Run the model over the held-out split.
predictions = trainer.predict(test_ds)
y_true = predictions.label_ids
# Predicted class = index of the largest logit for each example.
y_pred = predictions.predictions.argmax(axis=1)

# Per-class precision / recall / F1, labelled with the emotion names.
report = classification_report(y_true, y_pred, target_names=label_list)
print(report)




              precision    recall  f1-score   support

       anger       0.88      0.92      0.90       138
        fear       0.83      0.91      0.87       118
         joy       0.90      0.94      0.92       338
        love       0.80      0.64      0.71        77
         sad       0.93      0.94      0.93       292
     suprise       0.89      0.46      0.61        37

    accuracy                           0.89      1000
   macro avg       0.87      0.80      0.82      1000
weighted avg       0.89      0.89      0.89      1000



# Emotion Prediction Function

In [18]:
def predict_emotion(text: str) -> str:
    """Predict the emotion label for a single piece of text.

    Args:
        text: The sentence to classify. It is truncated to 128 tokens.

    Returns:
        The emotion name looked up in ``id2label`` for the highest-scoring
        class.
    """
    # Put the model in evaluation mode explicitly so the result does not
    # depend on whichever cell (training, evaluation) touched the model last.
    model.eval()

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    )

    # Move the encoded tensors to the device the model lives on.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(**inputs)

    pred_id = torch.argmax(outputs.logits, dim=1).item()
    return id2label[pred_id]


# Example

In [58]:
# Smoke test: classify one short sentence and display the predicted emotion.
predict_emotion(text="happy momment for us")


'joy'