# Read Data

In [1]:
import numpy as np
import pandas as pd

In [242]:
X = pd.read_pickle('data.pickle')
X_embed = pd.read_pickle('precomputed_embeddings.pkl')
y = pd.read_pickle('labels.pickle')
with open('vocab.txt', 'r') as f:
	vocab = f.read().split(" ")
	vocab.pop()

In [243]:
len(X), len(X_embed), len(y), len(vocab)

(2784, 2784, 2784, 9210)

# Split Data

In [244]:
import torch
from sklearn.model_selection import train_test_split

In [245]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=777)
train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size=0.15, random_state=777)

In [246]:
train_X_embed, test_X_embed, train_y_embed, test_y_embed = train_test_split(X_embed, y, test_size=0.2, random_state=777)
train_X_embed, val_X_embed, train_y_embed, val_y_embed = train_test_split(train_X_embed, train_y_embed, test_size=0.15, random_state=777)

In [248]:
train_X.shape, val_X.shape, test_X.shape, train_X_embed.shape, val_X_embed.shape, test_X_embed.shape, 

((1892,), (335,), (557,), (1892,), (335,), (557,))

In [249]:
train_y.shape, val_y.shape, test_y.shape

((1892, 4), (335, 4), (557, 4))

In [279]:
from torch.utils.data import Dataset, DataLoader
class ClassificationDataset(Dataset):
    def __init__(self, X, y, is_embeddings):
        self.X = X
        self.y = y
        self.is_embeddings = is_embeddings
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        if self.is_embeddings:
            return torch.tensor(self.X[idx], dtype=torch.float), torch.tensor(self.y[idx], dtype=torch.float)
        return torch.tensor(self.X[idx], dtype=torch.int), torch.tensor(self.y[idx], dtype=torch.float)


In [280]:
train_dataset_embed = ClassificationDataset(train_X_embed, train_y_embed, True)
val_dataset_embed = ClassificationDataset(val_X_embed, val_y_embed, True)
test_dataset_embed = ClassificationDataset(test_X_embed, test_y_embed, True)

train_loader_embed = DataLoader(train_dataset_embed, batch_size=16, shuffle=True)
val_loader_embed = DataLoader(val_dataset_embed, batch_size=16, shuffle=False)
test_loader_embed = DataLoader(test_dataset_embed, batch_size=16, shuffle=False)

In [281]:
train_dataset = ClassificationDataset(train_X, train_y, False)
val_dataset = ClassificationDataset(val_X, val_y, False)
test_dataset = ClassificationDataset(test_X, test_y, False)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# RNN

In [282]:
from torch import nn
from torch.optim import Adam
from torch.functional import F

In [303]:
class LSTM(nn.Module):
    def __init__(self,
              vocab_size,
              embedding_dim,
              hidden_dim,
              num_layers,
              num_classes,
              max_len,
              bidirectional,
              dropout,
              is_embedding_layer = True,
    ):
        super().__init__()
        self.is_embedding_layer = is_embedding_layer
        # Embeddings, which can be pretrained or normally trained
        if (self.is_embedding_layer):
            self.embeddings = nn.Embedding(
                num_embeddings=vocab_size,
                embedding_dim=embedding_dim
            )
        # LSTM Layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, bidirectional=bidirectional, dropout=dropout)
        # Linear Layer
        if bidirectional:
            self.linear = nn.Linear(max_len * hidden_dim * 2, num_classes)
        else:
            self.linear = nn.Linear(max_len * hidden_dim, num_classes)
       
        self.softmax = nn.Softmax()
    def forward(self, x):
        if self.is_embedding_layer:
            x = self.embeddings(x)
        lstm_out, _ = self.lstm(x)
        if self.is_embedding_layer:
            lstm_out = lstm_out.reshape(lstm_out.shape[0], -1)
        linear = self.linear(lstm_out)
        return linear

# Train & Validate 

In [284]:
import optuna
import itertools
from tqdm import tqdm
from torcheval.metrics.functional import multiclass_f1_score

In [285]:
VOCAB_SIZE = len(vocab)
NUM_CLASSES = 4
MAX_LEN = 37
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [364]:
def objective(trial, epochs=3):
    # Hyperparameter search space
    embedding_dim = trial.suggest_categorical("embedding_dim", [128, 256, 512, 1024])
    hidden_dim = trial.suggest_categorical("hidden_dim", [64, 128, 256, 512])
    num_layers = trial.suggest_int("num_layers", 1, 3, 5)
    bidirectional = trial.suggest_categorical("bidirectional", [True, False])
    dropout = trial.suggest_categorical("dropout", [0.1, 0.25, 0.5])
    weights_choice = trial.suggest_categorical("weights", [None, [0.9521203830369357, 1.6893203883495145, 1.86096256684492, 0.5493291239147593]])

    weights = (
        None if weights_choice is None
        else torch.tensor(weights_choice).to(DEVICE)
    )
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    model = LSTM(
        vocab_size=VOCAB_SIZE,
        embedding_dim=embedding_dim,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        num_classes=NUM_CLASSES,
        max_len=MAX_LEN,
        bidirectional=bidirectional,
        dropout=dropout,
        is_embedding_layer=True
    ).to(DEVICE)
    criterion = nn.CrossEntropyLoss(weight=weights)
    optimizer = Adam(model.parameters(), lr=lr)
    
	
    model.train()
    for epoch in range(epochs):
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
            optimizer.zero_grad()
            y_preds = model(X_batch)
            loss = criterion(y_preds, y_batch)
            loss.backward()
            optimizer.step()

    # Validation
    model.eval()
    y_preds_list = []
    y_true_list = []
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
            y_preds = model(X_batch)
            y_preds_list.extend(y_preds.cpu().numpy())
            y_true_list.extend(torch.argmax(y_batch, dim=1).cpu().numpy())

    f1 = multiclass_f1_score(torch.Tensor(np.array(y_preds_list)), torch.Tensor(np.array(y_true_list)), num_classes=4)
    return f1


In [373]:
def objective_embed(trial, epochs=10):
    # Hyperparameter search space
    embedding_dim = trial.suggest_categorical("embedding_dim", [768])
    hidden_dim = trial.suggest_categorical("hidden_dim", [256, 512, 1024, 2048])
    num_layers = trial.suggest_int("num_layers", 1, 3, 5)
    bidirectional = trial.suggest_categorical("bidirectional", [True, False])
    dropout = trial.suggest_categorical("dropout", [0.1, 0.25, 0.5])
    weights_choice = trial.suggest_categorical("weights", [None, [0.9521203830369357, 1.6893203883495145, 1.86096256684492, 0.5493291239147593]])

    weights = (
        None if weights_choice is None
        else torch.tensor(weights_choice).to(DEVICE)
    )
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    model = LSTM(
        vocab_size=VOCAB_SIZE,
        embedding_dim=embedding_dim,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        num_classes=NUM_CLASSES,
        max_len=1,
        bidirectional=bidirectional,
        dropout=dropout,
        is_embedding_layer=False
    ).to(DEVICE)
    criterion = nn.CrossEntropyLoss(weight=weights)
    optimizer = Adam(model.parameters(), lr=lr)
    
	
    model.train()
    for epoch in range(epochs):
        for X_batch, y_batch in train_loader_embed:
            X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
            optimizer.zero_grad()
            y_preds = model(X_batch)
            loss = criterion(y_preds, y_batch)
            loss.backward()
            optimizer.step()

    # Validation
    model.eval()
    y_preds_list = []
    y_true_list = []
    with torch.no_grad():
        for X_batch, y_batch in val_loader_embed:
            X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
            y_preds = model(X_batch)
            y_preds_list.extend(y_preds.cpu().numpy())
            y_true_list.extend(torch.argmax(y_batch, dim=1).cpu().numpy())

    f1 = multiclass_f1_score(torch.Tensor(np.array(y_preds_list)), torch.Tensor(np.array(y_true_list)), num_classes=4)
    return f1


In [366]:
# ---- Run the Optuna Study ----
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

print("Best hyperparameters:", study.best_trial.params)

[I 2025-05-09 17:00:49,497] A new study created in memory with name: no-name-e0564f13-1c94-407a-997e-fbbe18300dc0
  num_layers = trial.suggest_int("num_layers", 1, 3, 5)
[I 2025-05-09 17:00:51,693] Trial 0 finished with value: 0.6000000238418579 and parameters: {'embedding_dim': 1024, 'hidden_dim': 512, 'num_layers': 1, 'bidirectional': False, 'dropout': 0.5, 'weights': None, 'lr': 0.0034419395608958924}. Best is trial 0 with value: 0.6000000238418579.
[I 2025-05-09 17:00:52,682] Trial 1 finished with value: 0.5701492428779602 and parameters: {'embedding_dim': 512, 'hidden_dim': 64, 'num_layers': 1, 'bidirectional': False, 'dropout': 0.25, 'weights': [0.9521203830369357, 1.6893203883495145, 1.86096256684492, 0.5493291239147593], 'lr': 0.0071210106040826195}. Best is trial 0 with value: 0.6000000238418579.
[I 2025-05-09 17:00:53,475] Trial 2 finished with value: 0.48059701919555664 and parameters: {'embedding_dim': 256, 'hidden_dim': 64, 'num_layers': 1, 'bidirectional': False, 'dropout

Best hyperparameters: {'embedding_dim': 1024, 'hidden_dim': 256, 'num_layers': 1, 'bidirectional': True, 'dropout': 0.25, 'weights': None, 'lr': 0.001049532082790948}


In [374]:
# ---- Run the Optuna Study ----
study_embed = optuna.create_study(direction="maximize")
study_embed.optimize(objective_embed, n_trials=100)

print("Best hyperparameters:", study_embed.best_trial.params)

[I 2025-05-09 17:06:06,310] A new study created in memory with name: no-name-7e256b3e-a9a6-4918-9afc-df1e4abc1247
  num_layers = trial.suggest_int("num_layers", 1, 3, 5)
[I 2025-05-09 17:06:10,903] Trial 0 finished with value: 0.6507462859153748 and parameters: {'embedding_dim': 768, 'hidden_dim': 1024, 'num_layers': 1, 'bidirectional': False, 'dropout': 0.5, 'weights': [0.9521203830369357, 1.6893203883495145, 1.86096256684492, 0.5493291239147593], 'lr': 0.005053478988768809}. Best is trial 0 with value: 0.6507462859153748.
[I 2025-05-09 17:06:13,744] Trial 1 finished with value: 0.6865671873092651 and parameters: {'embedding_dim': 768, 'hidden_dim': 512, 'num_layers': 1, 'bidirectional': False, 'dropout': 0.1, 'weights': None, 'lr': 0.00017391786259928437}. Best is trial 1 with value: 0.6865671873092651.
[I 2025-05-09 17:06:18,105] Trial 2 finished with value: 0.6716417670249939 and parameters: {'embedding_dim': 768, 'hidden_dim': 1024, 'num_layers': 1, 'bidirectional': False, 'dropou

Best hyperparameters: {'embedding_dim': 768, 'hidden_dim': 1024, 'num_layers': 1, 'bidirectional': True, 'dropout': 0.25, 'weights': None, 'lr': 0.00010132935513803993}


In [380]:
def train_val(
        model: LSTM,
        optim: Adam,
        criterion: nn.CrossEntropyLoss,
        epochs: int,
        train_dataloader: DataLoader,
        val_dataloader: DataLoader,
        device
    ):
    best_f1 = 0
    best_model = None
    model.to(device)
    for epoch in tqdm(range(epochs)):
        model.train()
        train_loss = 0
        y_preds_list_train = []
        y_true_list_train = []
        for train_X, train_y in train_dataloader:
            train_X, train_y = train_X.to(device), train_y.to(device)
    
            y_preds = model(train_X)
            loss = criterion(y_preds, train_y)
    
            optim.zero_grad()
            loss.backward()
            optim.step()
    
            train_loss += loss.item()
            y_preds_list_train.extend(y_preds.detach().cpu().numpy())
            y_true_list_train.extend(torch.argmax(train_y, dim=1).cpu().numpy())

        train_f1 = multiclass_f1_score(torch.Tensor(np.array(y_preds_list_train)), torch.Tensor(np.array(y_true_list_train)), num_classes=4)

        # Validation
        model.eval()
        val_loss = 0
        y_preds_list_val = []
        y_true_list_val = []
        with torch.no_grad():
            for val_X, val_y in val_dataloader:
                val_X, val_y = val_X.to(device), val_y.to(device)
    
                y_preds = model(val_X)
                loss = criterion(y_preds, val_y)
                val_loss += loss.item()
                predicted = torch.argmax(y_preds, dim=1)
                y_preds_list_val.extend(y_preds.cpu().numpy())
                y_true_list_val.extend(torch.argmax(val_y, dim=1).cpu().numpy())
    
        val_f1 = multiclass_f1_score(torch.Tensor(np.array(y_preds_list_val)), torch.Tensor(np.array(y_true_list_val)), num_classes=4)
        if val_f1 > best_f1:
            best_f1 = val_f1
            best_model = model.state_dict()
        print(
            f"Epoch {epoch+1}/{epochs}, "
            f"Train Loss: {train_loss/len(train_dataloader):.4f}, "
            f"Val Loss: {val_loss/len(val_dataloader):.4f}, "
            f"Train F1: {train_f1:.2f}%, "
            f"Val F1: {val_f1:.2f}%"
        )
    return best_model

In [390]:
model = LSTM(
			vocab_size=VOCAB_SIZE,
			embedding_dim=study.best_trial.params['embedding_dim'],
			hidden_dim=study.best_trial.params['hidden_dim'],
			num_layers=study.best_trial.params['num_layers'],
			num_classes=NUM_CLASSES,
			max_len=MAX_LEN,
			bidirectional=study.best_trial.params['bidirectional'],
            dropout = study.best_trial.params['dropout'],
            is_embedding_layer=True
		).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=study.best_trial.params['lr'])
params = train_val(
    model,
    optimizer,
    criterion,
    epochs = 3,
    train_dataloader = train_loader,
    val_dataloader = val_loader,
    device=DEVICE
)

 33%|████████████████████████████▎                                                        | 1/3 [00:01<00:02,  1.05s/it]

Epoch 1/3, Train Loss: 1.1578, Val Loss: 1.0200, Train F1: 0.53%, Val F1: 0.59%


 67%|████████████████████████████████████████████████████████▋                            | 2/3 [00:01<00:00,  1.10it/s]

Epoch 2/3, Train Loss: 0.3097, Val Loss: 0.9747, Train F1: 0.93%, Val F1: 0.62%


100%|█████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.14it/s]

Epoch 3/3, Train Loss: 0.0569, Val Loss: 1.0268, Train F1: 0.99%, Val F1: 0.63%





In [391]:
model_embed = LSTM(
			vocab_size=VOCAB_SIZE,
			embedding_dim=study_embed.best_trial.params['embedding_dim'],
			hidden_dim=study_embed.best_trial.params['hidden_dim'],
			num_layers=study_embed.best_trial.params['num_layers'],
			num_classes=NUM_CLASSES,
			max_len=1,
			bidirectional=study_embed.best_trial.params['bidirectional'],
            dropout = study_embed.best_trial.params['dropout'],
            is_embedding_layer=False
		).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer_embed = Adam(model_embed.parameters(), lr=study.best_trial.params['lr'])

embed_params=train_val(
    model_embed,
    optimizer_embed,
    criterion,
    epochs = 10,
    train_dataloader = train_loader_embed,
    val_dataloader = val_loader_embed,
    device=DEVICE
)

 10%|████████▍                                                                           | 1/10 [00:00<00:08,  1.06it/s]

Epoch 1/10, Train Loss: 0.8310, Val Loss: 0.7498, Train F1: 0.68%, Val F1: 0.70%


 20%|████████████████▊                                                                   | 2/10 [00:01<00:07,  1.04it/s]

Epoch 2/10, Train Loss: 0.5597, Val Loss: 0.7759, Train F1: 0.78%, Val F1: 0.69%


 30%|█████████████████████████▏                                                          | 3/10 [00:02<00:06,  1.05it/s]

Epoch 3/10, Train Loss: 0.3702, Val Loss: 0.8296, Train F1: 0.87%, Val F1: 0.69%


 40%|█████████████████████████████████▌                                                  | 4/10 [00:03<00:05,  1.05it/s]

Epoch 4/10, Train Loss: 0.2194, Val Loss: 0.9395, Train F1: 0.93%, Val F1: 0.67%


 50%|██████████████████████████████████████████                                          | 5/10 [00:04<00:04,  1.03it/s]

Epoch 5/10, Train Loss: 0.1180, Val Loss: 1.0492, Train F1: 0.96%, Val F1: 0.67%


 60%|██████████████████████████████████████████████████▍                                 | 6/10 [00:05<00:03,  1.03it/s]

Epoch 6/10, Train Loss: 0.0754, Val Loss: 1.1637, Train F1: 0.98%, Val F1: 0.70%


 70%|██████████████████████████████████████████████████████████▊                         | 7/10 [00:06<00:02,  1.02it/s]

Epoch 7/10, Train Loss: 0.0427, Val Loss: 1.1341, Train F1: 0.99%, Val F1: 0.68%


 80%|███████████████████████████████████████████████████████████████████▏                | 8/10 [00:07<00:01,  1.03it/s]

Epoch 8/10, Train Loss: 0.0342, Val Loss: 1.2302, Train F1: 0.99%, Val F1: 0.69%


 90%|███████████████████████████████████████████████████████████████████████████▌        | 9/10 [00:08<00:00,  1.04it/s]

Epoch 9/10, Train Loss: 0.0236, Val Loss: 1.3062, Train F1: 0.99%, Val F1: 0.68%


100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:09<00:00,  1.02it/s]

Epoch 10/10, Train Loss: 0.0121, Val Loss: 1.3204, Train F1: 1.00%, Val F1: 0.68%





# Test

In [336]:
from sklearn.metrics import classification_report

In [431]:
def test(
		model: LSTM,
		optim: Adam,
		criterion: nn.CrossEntropyLoss,
		test_dataloader: DataLoader,
		device
    ):
    model.to(device)
    model.eval()
    test_loss = 0
    test_correct = 0
    test_total = 0
    y_preds_list = [] 
    with torch.no_grad():
        for test_X, test_y in test_dataloader:
            test_X, test_y = test_X.to(device), test_y.to(device)
    
            y_preds = model(test_X)
            y_preds_list.extend(y_preds.numpy())
            loss = criterion(y_preds, test_y)
            test_loss += loss.item()
            predicted = torch.argmax(y_preds, dim=1)
            test_correct += (predicted == np.argmax(test_y, axis=1)).sum().item()
            test_total += test_y.size(0)
    
    print(
        f"test Loss: {test_loss/len(test_dataloader):.4f}, "
        f"test Acc: {100 * test_correct/test_total:.2f}%"
    )
    return np.array(y_preds_list)

In [432]:
best_model = LSTM(
			vocab_size=VOCAB_SIZE,
			embedding_dim=study.best_trial.params['embedding_dim'],
			hidden_dim=study.best_trial.params['hidden_dim'],
			num_layers=study.best_trial.params['num_layers'],
			num_classes=NUM_CLASSES,
			max_len=MAX_LEN,
			bidirectional=study.best_trial.params['bidirectional'],
            dropout = study.best_trial.params['dropout'],
            is_embedding_layer=True
		)
best_model.load_state_dict(params)
y_preds=test(best_model, optimizer, criterion, test_loader, 'cpu')

test Loss: 1.0433, test Acc: 59.96%


In [434]:
best_model_embed = LSTM(
			vocab_size=VOCAB_SIZE,
			embedding_dim=study_embed.best_trial.params['embedding_dim'],
			hidden_dim=study_embed.best_trial.params['hidden_dim'],
			num_layers=study_embed.best_trial.params['num_layers'],
			num_classes=NUM_CLASSES,
			max_len=1,
			bidirectional=study_embed.best_trial.params['bidirectional'],
            dropout = study_embed.best_trial.params['dropout'],
            is_embedding_layer=False
		)
best_model_embed.load_state_dict(embed_params)
y_preds_embed=test(best_model_embed, optimizer_embed, criterion, test_loader_embed, 'cpu')

test Loss: 1.2217, test Acc: 71.99%


In [435]:
print(classification_report(np.argmax(test_y, axis=1), np.argmax(y_preds, axis=1)))
print(classification_report(np.argmax(test_y_embed, axis=1), np.argmax(y_preds_embed, axis=1)))

              precision    recall  f1-score   support

           0       0.59      0.70      0.64       131
           1       0.51      0.32      0.39        87
           2       0.40      0.10      0.16        82
           3       0.63      0.80      0.71       257

    accuracy                           0.60       557
   macro avg       0.53      0.48      0.48       557
weighted avg       0.57      0.60      0.56       557

              precision    recall  f1-score   support

           0       0.77      0.75      0.76       131
           1       0.54      0.63      0.59        87
           2       0.67      0.45      0.54        82
           3       0.77      0.82      0.79       257

    accuracy                           0.72       557
   macro avg       0.69      0.66      0.67       557
weighted avg       0.72      0.72      0.72       557

