In [3]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split

import torch 
from torch import nn
from torch import optim

In [4]:
emotions = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']

# Read your data (replace with your actual dataset)
train = pd.read_csv('eng_train.csv')

# Split the dataset into training and validation sets
train, val = train_test_split(train, test_size=0.05, random_state=42)

In [6]:
vectorizer = CountVectorizer(ngram_range=(1,2))
X_train = vectorizer.fit_transform(train['text'].str.lower()).toarray()
X_val = vectorizer.transform(val['text'].str.lower()).toarray()

y_train = train[emotions].values
y_val = val[emotions].values

X_train_t = torch.Tensor(X_train)
y_train_t = torch.Tensor(y_train)

X_val_t = torch.Tensor(X_val)
y_val_t = torch.Tensor(y_val)

In [7]:
print(f'Shape of X: {X_train.shape}')
print(f'Shape of y: {y_train.shape}')
print(f'Number of positives per emotion class:')
_ = [print(f' - {e}: {v} ({round(100*v/len(y_train))}%)') for e,v in zip(emotions, y_train.sum(axis=0))]

Shape of X: (2629, 27998)
Shape of y: (2629, 5)
Number of positives per emotion class:
 - Anger: 319 (12%)
 - Fear: 1531 (58%)
 - Joy: 645 (25%)
 - Sadness: 832 (32%)
 - Surprise: 795 (30%)


In [8]:
def get_predictions(X_val, model, threshold=0.5):
    sig = nn.Sigmoid() 
    yhat = sig(model(X_val)).detach().numpy()
    y_pred = yhat > threshold
    
    return y_pred

def evaluate(y_val, y_pred):
    for average in ['micro', 'macro']:
        recall = recall_score(y_val, y_pred, average=average, zero_division=0)
        precision = precision_score(y_val, y_pred, average=average, zero_division=0)
        f1 = f1_score(y_val, y_pred, average=average, zero_division=0)
    
        print(f'{average.upper()} recall: {round(recall, 4)}, precision: {round(precision, 4)}, f1: {round(f1, 4)}')

def evaluate_per_class(y_val, y_pred):
    for i, emotion in enumerate(emotions):
        print(f'*** {emotion} ***')
    
        recall = recall_score(y_val[:,i], y_pred[:,i], zero_division=0)
        precision = precision_score(y_val[:,i], y_pred[:,i], zero_division=0)
        f1 = f1_score(y_val[:,i], y_pred[:,i], zero_division=0)
        
        print(f'recall: {round(recall, 4)}, precision: {round(precision, 4)}, f1: {round(f1, 4)}\n')

In [9]:
weights = y_train.sum(axis=0)/y_train.sum()
weights = max(weights)/weights

In [18]:
model = nn.Sequential(
    nn.Linear(X_train.shape[1], 128),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(128, 64),
    nn.BatchNorm1d(64),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(64, y_train.shape[1])
)


criterion = nn.BCEWithLogitsLoss(pos_weight=torch.Tensor(weights))
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)

best_val_loss = float('inf')
patience = 10
patience_counter = 0

for epoch in range(1000):
    optimizer.zero_grad()
    output = model(X_train_t)
    loss = criterion(output, y_train_t)
    loss.backward()
    optimizer.step()
    scheduler.step()
    
    val_loss = criterion(model(X_val_t), y_val_t)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping.")
            break

# maybe optimize threshold
y_pred = get_predictions(X_val_t, model, 0.37)
evaluate(y_val, y_pred)


print('\nPER CLASS BREAKDOWN\n')
evaluate_per_class(y_val, y_pred)

Early stopping.
MICRO recall: 0.7653, precision: 0.4766, f1: 0.5874
MACRO recall: 0.7125, precision: 0.4315, f1: 0.5305

PER CLASS BREAKDOWN

*** Anger ***
recall: 0.5, precision: 0.1707, f1: 0.2545

*** Fear ***
recall: 0.8375, precision: 0.6381, f1: 0.7243

*** Joy ***
recall: 0.6897, precision: 0.3448, f1: 0.4598

*** Sadness ***
recall: 0.7174, precision: 0.4583, f1: 0.5593

*** Surprise ***
recall: 0.8182, precision: 0.5455, f1: 0.6545

