In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import os

from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import random
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report
from torch.nn.parallel import DataParallel
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoModel
from datasets import load_metric

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download the 'punkt' and 'stopwords' NLTK resources (no-op when already cached).
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

from huggingface_hub import login

# Login to Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable instead of
# being written into the notebook, so it cannot leak when the notebook is
# shared. When the variable is unset, `login` receives token=None
# (presumably falling back to its interactive prompt - confirm against the
# installed huggingface_hub version).
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Read the emotions CSV from the Kaggle input directory and preview the first rows.
emotions_csv = "/kaggle/input/emotions/text.csv"
df = pd.read_csv(emotions_csv)
df.head()

In [None]:
# Display the (rows, columns) size of the loaded frame.
df.shape

In [None]:
# Tabulate how many rows carry each label, as a two-column frame (Labels, Count).
label_frequencies = df['label'].value_counts()
counts = label_frequencies.rename_axis('Labels').reset_index(name='Count')

# One bar per label, coloured by label.
fig = px.bar(
    counts,
    x='Labels',
    y='Count',
    color='Labels',
    title='Label Counts',
)
fig.update_layout(xaxis_title='Labels', yaxis_title='Count')
fig.show()

In [None]:
# Set WANDB_DISABLED for this process before any training objects are created
# (presumably read by the Weights & Biases integration to switch logging off - confirm).
os.environ["WANDB_DISABLED"] = "true"

class TextStratifiedData(Dataset):
    """Class-balanced view of a labelled DataFrame.

    Every class found in the ``label`` column is down-sampled to the size of
    the rarest class. Optionally the balanced frame is then reduced to at
    most ``length`` randomly chosen rows.

    Args:
        df: DataFrame containing a ``label`` column.
        length: Optional cap on the number of rows kept after balancing.
        random_state: Seed forwarded to every pandas ``sample`` call so the
            balanced subset is the same on every run. Pass ``None`` to get
            unseeded (non-reproducible) sampling.

    Raises:
        ValueError: If ``length`` is larger than the number of rows in ``df``.
    """

    def __init__(self, df, length=None, random_state=42):
        if length is not None and length > df.shape[0]:
            raise ValueError("too big")
        self.length = length
        # Must be set before stratify() runs, which reads it.
        self.random_state = random_state
        self.df = self.stratify(df)

    def stratify(self, df):
        """Return ``df`` down-sampled to ``min_count`` rows per label."""
        # Nothing to balance; also avoids taking min() of an empty count table.
        if df.empty:
            return df.reset_index(drop=True)

        # balance the dataset (min_count per class)
        min_count = int(df['label'].value_counts().min())
        # GroupBy.sample keeps every column (including 'label') and does not
        # rely on GroupBy.apply operating on the grouping column.
        stratified_df = (
            df.groupby('label')
            .sample(n=min_count, random_state=self.random_state)
            .reset_index(drop=True)
        )

        if self.length is not None:
            return stratified_df.sample(
                min(self.length, len(stratified_df)),
                random_state=self.random_state,
            )
        return stratified_df

    def __len__(self):
        """Number of rows in the balanced frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the row at integer position ``idx`` as a pandas Series."""
        return self.df.iloc[idx, :]

    def get_all(self):
        """Return the whole balanced DataFrame."""
        return self.df

# Replace df with its class-balanced version.
df = TextStratifiedData(df).get_all()


In [None]:
# Preview the first rows of the balanced frame.
df.head()

In [None]:
# Text clean-up: each step rewrites df['text'].

# 1. remove special characters and punctuation
#    (anything that is neither a word character nor whitespace)
df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)

# 2. remove stop words
#    `stop` stays a list as before; the set is only for O(1) membership tests,
#    instead of scanning the whole list for every token.
stop = stopwords.words('english')
stop_lookup = set(stop)
df['text'] = df['text'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_lookup])
)

# 3. remove every character that is not an ASCII letter or whitespace
#    (this also strips digits and underscores, not just symbols)
df['text'] = df['text'].str.replace(r'[^a-zA-Z\s]', '', regex=True)

# 4. remove extra whitespace
#    Done last, because step 3 can leave double, leading or trailing spaces
#    where a token consisted only of removed characters.
df['text'] = df['text'].str.replace(r'\s+', ' ', regex=True).str.strip()

df.head()

In [None]:
# split into 80% train, 10% validation, and 10% test
# Both splits are stratified on the label so each subset keeps the class
# balance of df; a plain random split can skew the smaller subsets.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42, stratify=temp_labels
)

# Tokenizer and sequence-classification model come from the same checkpoint;
# the model is created with 6 output labels.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=6)

def _tokenize(texts):
    """Tokenize a pandas Series of strings into padded, truncated PyTorch tensors."""
    return tokenizer(
        texts.tolist(),
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt",
    )


def _as_tensor_dataset(encodings, labels):
    """Bundle input ids, attention mask and int64 labels into a TensorDataset."""
    return torch.utils.data.TensorDataset(
        encodings["input_ids"],
        encodings["attention_mask"],
        torch.tensor(labels, dtype=torch.int64),
    )


# tokenize the data
train_encodings, val_encodings, test_encodings = (
    _tokenize(split_texts) for split_texts in (train_texts, val_texts, test_texts)
)

# convert labels to numpy arrays
train_labels, val_labels, test_labels = (
    split_labels.to_numpy() for split_labels in (train_labels, val_labels, test_labels)
)

# create datasets
train_dataset = _as_tensor_dataset(train_encodings, train_labels)
val_dataset = _as_tensor_dataset(val_encodings, val_labels)
test_dataset = _as_tensor_dataset(test_encodings, test_labels)

# Hyper-parameters and checkpointing policy handed to the Trainer.
trainer_config = dict(
    output_dir="./results",
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # evaluate and checkpoint once per epoch, then reload the checkpoint
    # that scored best on the "accuracy" metric
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
training_args = TrainingArguments(**trainer_config)

# Metric object whose .compute(predictions=..., references=...) is called by compute_metrics.
# NOTE(review): `datasets.load_metric` is reported as deprecated in favour of the
# separate `evaluate` package - confirm against the installed `datasets` version.
accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred, metric=None):
    """Turn raw model outputs into evaluation metrics.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.
        metric: Optional object exposing
            ``compute(predictions=..., references=...)``. When omitted, plain
            accuracy is computed with numpy, so the function can be used
            directly as a one-argument callback.

    Returns:
        Whatever ``metric.compute`` returns, or ``{"accuracy": float}`` when
        no metric object is supplied.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    if metric is None:
        # Fraction of predictions that equal the reference label.
        return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}
    return metric.compute(predictions=predictions, references=labels)

class CustomDataCollator:
    """Collate ``(input_ids, attention_mask, label)`` triples into a batch dict."""

    def __call__(self, data):
        """Stack the per-sample tensors and return them under the model's keyword names."""
        stacked_fields = {"input_ids": 0, "attention_mask": 1}
        batch = {
            key: torch.stack([sample[position] for sample in data])
            for key, position in stacked_fields.items()
        }
        # Labels are rebuilt as a fresh 1-D tensor from the per-sample values.
        batch["labels"] = torch.tensor([sample[2] for sample in data])
        return batch

data_collator = CustomDataCollator()


def _accuracy_on_eval(eval_pred):
    """One-argument metrics callback: score predictions with accuracy_metric."""
    return compute_metrics(eval_pred, accuracy_metric)


# Trains on train_dataset and evaluates on val_dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=_accuracy_on_eval,
)

# Fine-tune, then score the resulting model on the validation set.
trainer.train()

val_results = trainer.evaluate()
val_accuracy = val_results['eval_accuracy']
print(f"Validation Accuracy: {val_accuracy}")

In [None]:
# Run inference over the test set once: `predict` returns both the raw
# predictions and the metrics, so a separate `evaluate` pass over the same data
# is not needed. The "eval" prefix keeps the metric keys (e.g. 'eval_accuracy')
# identical to what `evaluate` would have produced.
predictions = trainer.predict(test_dataset, metric_key_prefix="eval")
test_results = predictions.metrics
print(f"Test Accuracy: {test_results['eval_accuracy']}")

# Predicted class = argmax over the class axis of the returned predictions.
pred_labels = np.argmax(predictions.predictions, axis=1)

true_labels = test_labels

print("\nClassification Report:")
print(classification_report(true_labels, pred_labels))

# confusion matrix
# Tick labels for class ids 0..5, in this order.
# NOTE(review): assumes label id i corresponds to class_names[i], as the
# original tick labels did - verify against the dataset's label mapping.
class_names = ['Sadness', 'Joy', 'Love', 'Anger', 'Fear', 'Surprise']

# Passing `labels` pins the matrix to one row/column per class id, so it always
# lines up with the tick labels even if a class is absent from the test
# labels or predictions.
conf_matrix = confusion_matrix(
    true_labels, pred_labels, labels=list(range(len(class_names)))
)
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()


# Pushing to Hugging Face Hub
# The first positional argument of `Trainer.push_to_hub` is the commit message,
# not the repository name, so the target repository is set through
# `hub_model_id` and the commit message is passed explicitly.
trainer.args.hub_model_id = "Sentalysis"
trainer.push_to_hub(commit_message="End of training")