# Read Data

In [59]:
import numpy as np
import pandas as pd

In [60]:
X = pd.read_pickle('precomputed_embeddings.pkl')
y = pd.read_pickle('multilabel.pkl')
with open('vocab.txt', 'r') as f:
	vocab = f.read().split(" ")
	vocab.pop()

In [61]:
len(X), len(y), len(vocab)

(2784, 2784, 9210)

In [84]:
y

array([[1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 1, 0, 0],
       ...,
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1]], dtype=int32)

# Split Data

In [62]:
import torch
from sklearn.model_selection import train_test_split

In [63]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=777)
train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size=0.15, random_state=777)

In [64]:
train_X.shape, val_X.shape, test_X.shape

((1892,), (335,), (557,))

In [65]:
train_y.shape, val_y.shape, test_y.shape

((1892, 4), (335, 4), (557, 4))

In [111]:
from torch.utils.data import Dataset, DataLoader
class ClassificationDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return torch.tensor(self.X[idx], dtype=torch.float), torch.tensor(self.y[idx], dtype=torch.float)


In [112]:
train_dataset = ClassificationDataset(train_X, train_y)
val_dataset = ClassificationDataset(val_X, val_y)
test_dataset = ClassificationDataset(test_X, test_y)

train_loader = DataLoader(train_dataset, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

# RNN

In [113]:
from torch import nn
from torch.optim import Adam
from torch.functional import F
from sklearn.metrics import f1_score

In [114]:
class MultiLabelLSTM(nn.Module):
    def __init__(self,
              vocab_size,
              embedding_dim,
              hidden_dim,
              num_layers,
              num_classes,
              bidirectional,
              dropout,
    ):
        super().__init__()
        # LSTM Layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, bidirectional=bidirectional, dropout=dropout)
        # Linear Layer
        if bidirectional:
            self.linear = nn.Linear(hidden_dim * 2, num_classes)
        else:
            self.linear = nn.Linear(hidden_dim, num_classes)
        self.sigmoid = nn.Sigmoid()
    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        linear = self.linear(lstm_out)
        return self.sigmoid(linear)

# Train & Validate 

In [115]:
import optuna
import itertools
from tqdm import tqdm
from torcheval.metrics.functional import multiclass_f1_score

In [None]:
def get_prediction(row):
    row[row >=0.5] = 1
    row[row <0.5] = 0
    row.astype(np.float32)
    return row

In [150]:
def objective(trial, epochs=3):
    # Hyperparameter search space
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    embedding_dim = trial.suggest_categorical("embedding_dim", [768])
    hidden_dim = trial.suggest_categorical("hidden_dim", [64, 128, 256, 512])
    num_layers = trial.suggest_int("num_layers", 1, 5, step=1)
    bidirectional = trial.suggest_categorical("bidirectional", [True, False])
    dropout = trial.suggest_categorical("dropout", [0.1, 0.25, 0.5])
    VOCAB_SIZE = len(vocab)
    NUM_CLASSES = 4
    MAX_LEN = 1
    # weights = (
    #     None if weights_choice is None
    #     else torch.tensor(weights_choice).to(DEVICE)
    # )
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    model = MultiLabelLSTM(
        vocab_size=VOCAB_SIZE,
        embedding_dim=embedding_dim,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        num_classes=NUM_CLASSES,
        bidirectional=bidirectional,
        dropout=dropout,
    ).to(DEVICE)
    criterion = nn.BCELoss()
    optimizer = Adam(model.parameters(), lr=lr)
    
	
    model.train()
    for epoch in range(epochs):
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
            optimizer.zero_grad()
            y_preds = model(X_batch)
            loss = criterion(y_preds, y_batch)
            loss.backward()
            optimizer.step()
    # Validation
    model.eval()
    y_preds_list = []
    y_true_list = []
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
            y_preds = model(X_batch)
            y_preds_list.extend(y_preds.detach().cpu().numpy())
            y_true_list.extend(y_batch.detach().cpu().numpy())
    y_preds = np.array([get_prediction(pred) for pred in y_preds_list])
    y_true = np.array(y_true_list)

    f1 = f1_score(y_true, y_preds, average='weighted')
    return f1


In [151]:
# ---- Run the Optuna Study ----
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

print("Best hyperparameters:", study.best_trial.params)

[I 2025-05-09 23:15:57,676] A new study created in memory with name: no-name-b115b0c0-aa55-4c0e-856a-59fbf45230bf
[I 2025-05-09 23:15:59,247] Trial 0 finished with value: 0.6650916317758602 and parameters: {'embedding_dim': 768, 'hidden_dim': 512, 'num_layers': 2, 'bidirectional': False, 'dropout': 0.25, 'lr': 0.00033287359690745496}. Best is trial 0 with value: 0.6650916317758602.
[I 2025-05-09 23:16:00,413] Trial 1 finished with value: 0.6644502500430883 and parameters: {'embedding_dim': 768, 'hidden_dim': 128, 'num_layers': 2, 'bidirectional': True, 'dropout': 0.1, 'lr': 0.003921736302176072}. Best is trial 0 with value: 0.6650916317758602.
[I 2025-05-09 23:16:04,056] Trial 2 finished with value: 0.5809203732722411 and parameters: {'embedding_dim': 768, 'hidden_dim': 512, 'num_layers': 4, 'bidirectional': True, 'dropout': 0.25, 'lr': 0.0002526469994742172}. Best is trial 0 with value: 0.6650916317758602.
[I 2025-05-09 23:16:04,920] Trial 3 finished with value: 0.658605780620706 and 

Best hyperparameters: {'embedding_dim': 768, 'hidden_dim': 128, 'num_layers': 2, 'bidirectional': True, 'dropout': 0.25, 'lr': 0.0011093011094073127}


In [163]:
def train_val(
        model: LSTM,
        optim: Adam,
        criterion: nn.BCELoss,
        epochs: int,
        train_dataloader: DataLoader,
        val_dataloader: DataLoader,
        device
    ):
    best_f1 = 0
    best_model = None
    model.to(device)
    for epoch in tqdm(range(epochs)):
        model.train()
        train_loss = 0
        y_preds_list_train = []
        y_true_list_train = []
        for train_X, train_y in train_dataloader:
            train_X, train_y = train_X.to(device), train_y.to(device)
    
            y_preds = model(train_X)
            loss = criterion(y_preds, train_y)
    
            optim.zero_grad()
            loss.backward()
            optim.step()
    
            train_loss += loss.item()
            y_preds_list_train.extend(y_preds.detach().cpu().numpy())
            y_true_list_train.extend(train_y.detach().cpu().numpy())
    
        y_preds = np.array([get_prediction(pred) for pred in y_preds_list_train])
        y_true = np.array(y_true_list_train) 
        train_f1 =f1_score(y_true, y_preds, average='weighted')
        # Validation
        model.eval()
        val_loss = 0
        y_preds_list_val = []
        y_true_list_val = []
        with torch.no_grad():
            for val_X, val_y in val_dataloader:
                val_X, val_y = val_X.to(device), val_y.to(device)
    
                y_preds = model(val_X)
                loss = criterion(y_preds, val_y)
                val_loss += loss.item()
                predicted = torch.argmax(y_preds, dim=1)
                y_preds_list_val.extend(y_preds.detach().cpu().numpy())
                y_true_list_val.extend(val_y.detach().cpu().numpy())
    
        y_preds = np.array([get_prediction(pred) for pred in y_preds_list_val])
        y_true = np.array(y_true_list_val)
        val_f1 = f1_score(y_true, y_preds, average='weighted')
        if val_f1 > best_f1:
            best_f1 = val_f1
            best_model = model.state_dict()
        print(
            f"Epoch {epoch+1}/{epochs}, "
            f"Train Loss: {train_loss/len(train_dataloader):.4f}, "
            f"Val Loss: {val_loss/len(val_dataloader):.4f}, "
            f"Train F1: {train_f1:.2f}%, "
            f"Val F1: {val_f1:.2f}%"
        )
    return best_model

In [165]:
model = MultiLabelLSTM(
			vocab_size=VOCAB_SIZE,
			embedding_dim=study.best_trial.params['embedding_dim'],
			hidden_dim=study.best_trial.params['hidden_dim'],
			num_layers=study.best_trial.params['num_layers'],
			num_classes=NUM_CLASSES,
			bidirectional=study.best_trial.params['bidirectional'],
            dropout = study.best_trial.params['dropout'],
		).to(DEVICE)
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr=study.best_trial.params['lr'])
params = train_val(
    model,
    optimizer,
    criterion,
    epochs = 10,
    train_dataloader = train_loader,
    val_dataloader = val_loader,
    device=DEVICE
)

 10%|████████▍                                                                           | 1/10 [00:00<00:06,  1.46it/s]

Epoch 1/10, Train Loss: 0.4139, Val Loss: 0.3441, Train F1: 0.51%, Val F1: 0.61%


 20%|████████████████▊                                                                   | 2/10 [00:01<00:04,  1.67it/s]

Epoch 2/10, Train Loss: 0.2620, Val Loss: 0.3433, Train F1: 0.77%, Val F1: 0.67%


 30%|█████████████████████████▏                                                          | 3/10 [00:01<00:03,  1.79it/s]

Epoch 3/10, Train Loss: 0.1522, Val Loss: 0.3935, Train F1: 0.89%, Val F1: 0.67%


 40%|█████████████████████████████████▌                                                  | 4/10 [00:02<00:03,  1.80it/s]

Epoch 4/10, Train Loss: 0.0709, Val Loss: 0.5041, Train F1: 0.96%, Val F1: 0.64%


 50%|██████████████████████████████████████████                                          | 5/10 [00:02<00:02,  1.82it/s]

Epoch 5/10, Train Loss: 0.0338, Val Loss: 0.6250, Train F1: 0.98%, Val F1: 0.65%


 60%|██████████████████████████████████████████████████▍                                 | 6/10 [00:03<00:02,  1.83it/s]

Epoch 6/10, Train Loss: 0.0305, Val Loss: 0.5847, Train F1: 0.98%, Val F1: 0.66%


 70%|██████████████████████████████████████████████████████████▊                         | 7/10 [00:03<00:01,  1.84it/s]

Epoch 7/10, Train Loss: 0.0125, Val Loss: 0.6364, Train F1: 0.99%, Val F1: 0.65%


 80%|███████████████████████████████████████████████████████████████████▏                | 8/10 [00:04<00:01,  1.83it/s]

Epoch 8/10, Train Loss: 0.0032, Val Loss: 0.6664, Train F1: 1.00%, Val F1: 0.65%


 90%|███████████████████████████████████████████████████████████████████████████▌        | 9/10 [00:04<00:00,  1.86it/s]

Epoch 9/10, Train Loss: 0.0019, Val Loss: 0.6933, Train F1: 1.00%, Val F1: 0.66%


100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:05<00:00,  1.82it/s]

Epoch 10/10, Train Loss: 0.0012, Val Loss: 0.6952, Train F1: 1.00%, Val F1: 0.65%





# Test

In [177]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [178]:
def test(
		model: LSTM,
		optim: Adam,
		criterion: nn.BCEWithLogitsLoss,
		test_dataloader: DataLoader,
		device
    ):
    model.to(device)
    model.eval()
    test_loss = 0
    y_preds_list = []
    y_true_list = []
    with torch.no_grad():
        for test_X, test_y in test_dataloader:
            test_X, test_y = test_X.to(device), test_y.to(device)
    
            y_preds = model(test_X)
            y_preds_list.extend(y_preds.detach().cpu().numpy())
            y_true_list.extend(test_y.detach().cpu().numpy())
            loss = criterion(y_preds, test_y)
            test_loss += loss.item()

        y_preds_list = [get_prediction(pred) for pred in y_preds_list]
    print(
        f"test Loss: {test_loss/len(test_dataloader):.4f}, "
        f"test Acc: {accuracy_score(y_true_list, y_preds_list)}%"
    )
    return np.array(y_preds_list)

In [179]:
best_model = MultiLabelLSTM(
			vocab_size=VOCAB_SIZE,
			embedding_dim=study.best_trial.params['embedding_dim'],
			hidden_dim=study.best_trial.params['hidden_dim'],
			num_layers=study.best_trial.params['num_layers'],
			num_classes=NUM_CLASSES,
			bidirectional=study.best_trial.params['bidirectional'],
            dropout = study.best_trial.params['dropout'],
		)
best_model.load_state_dict(params)
y_preds=test(best_model, optimizer, criterion, test_loader, 'cuda')

test Loss: 0.5416, test Acc: 0.6804308797127468%


In [180]:
print(classification_report(test_y, y_preds))

              precision    recall  f1-score   support

           0       0.76      0.73      0.74       131
           1       0.63      0.62      0.62        87
           2       0.66      0.50      0.57        82
           3       0.83      0.76      0.79       257

   micro avg       0.76      0.69      0.72       557
   macro avg       0.72      0.65      0.68       557
weighted avg       0.75      0.69      0.72       557
 samples avg       0.69      0.69      0.69       557



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
