In [None]:
!pip install optuna --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/386.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/231.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/231.9 kB[0m [31m126.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from torch.optim import AdamW

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score, recall_score, precision_score
import optuna
import matplotlib.pyplot as plt
import os
import pickle

In [None]:
from google.colab import drive

# Attach Google Drive at /content/drive, forcing a remount.
drive.mount("/content/drive", force_remount=True)

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using:", device)

Mounted at /content/drive
Using: cuda


In [None]:
# SQLite file on Drive that backs the Optuna study
db_path = "/content/drive/My Drive/ADL Final Project/optuna_study.db"

# Open the study called "optuna_study" from that database; with
# load_if_exists=True an existing study of that name is reused instead of
# raising. The study direction is "maximize".
study = optuna.create_study(
    direction="maximize",
    storage="sqlite:///" + db_path,
    study_name="optuna_study",
    load_if_exists=True,
)

[I 2025-04-14 06:01:17,549] A new study created in RDB with name: optuna_study


In [None]:
# Hyperparameters for the final training run (hard-coded in this notebook).
best_params = dict(
    lstm_units=64,
    learning_rate=0.009885647542627366,
    weight_decay=0.00959695196216962,
)
print(best_params)

# Expose each hyperparameter under its own name for the cells below.
best_lstm_units, best_learning_rate, best_weight_decay = (
    best_params[name] for name in ("lstm_units", "learning_rate", "weight_decay")
)

{'lstm_units': 64, 'learning_rate': 0.009885647542627366, 'weight_decay': 0.00959695196216962}


In [None]:
# Tokenisation settings
vocab_size = 20000  # handed to the Tokenizer as num_words
max_length = 512    # every review is padded / truncated to this many tokens

# Load the train and test splits from Drive
train_df = pd.read_csv("/content/drive/My Drive/ADL Final Project/yelp_dataset_train.csv")
test_df = pd.read_csv("/content/drive/My Drive/ADL Final Project/yelp_dataset_test.csv")

texts = train_df['review_text'].tolist()
labels = train_df['class_index'].tolist()

# Fit the tokenizer on the training reviews only; "<OOV>" is its
# out-of-vocabulary token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

# Reviews -> integer sequences -> fixed-length arrays, padding and truncating
# at the end of each sequence ('post').
train_sequences = tokenizer.texts_to_sequences(train_df['review_text'])
test_sequences = tokenizer.texts_to_sequences(test_df['review_text'])
train_padded, test_padded = (
    pad_sequences(seqs, maxlen=max_length, padding='post', truncating='post')
    for seqs in (train_sequences, test_sequences)
)

# Stratified 80/20 train/validation split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    train_padded,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:
def _as_long(values):
    """Copy `values` into a new int64 (torch.long) tensor."""
    return torch.tensor(values, dtype=torch.long)


# Token-id matrices and label vectors as int64 tensors
X_train_tensor = _as_long(X_train)
X_val_tensor = _as_long(X_val)
X_test_tensor = _as_long(test_padded)
y_train_tensor = _as_long(y_train)
y_val_tensor = _as_long(y_val)

# Pair inputs with labels; the test labels come straight from test_df
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, _as_long(test_df['class_index'].values))

# Batches of 128; only the training loader shuffles
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=128)
val_loader, test_loader = (
    DataLoader(dataset, shuffle=False, batch_size=128)
    for dataset in (val_dataset, test_dataset)
)

In [None]:
class BiLSTMClassifier(nn.Module):
    """Bidirectional-LSTM sequence classifier with mean pooling over time.

    Token ids are embedded, passed through a bidirectional LSTM, averaged
    over the sequence dimension, regularised with dropout and projected to
    ``output_dim`` class scores.

    ``forward`` returns raw, unnormalised logits. ``nn.CrossEntropyLoss``
    applies log-softmax internally, so handing it softmax probabilities
    normalises twice: the loss becomes bounded and the gradients shrink.
    The argmax of the logits equals the argmax of the probabilities, so
    predicted classes are unaffected. Call ``predict_proba`` when actual
    probabilities are needed.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM, per direction.
        output_dim: Number of classes.
        dropout: Dropout probability applied to the pooled features.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout):
        super(BiLSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first = True, bidirectional = True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # *2 for bidirectional
        # Parameter-free; kept as an attribute and used only by predict_proba.
        self.softmax = nn.Softmax(dim = 1)

    def forward(self, x):
        """Return class logits of shape (batch, output_dim).

        Args:
            x: Integer token ids of shape (batch, seq_len).
        """
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # Average over every time step (dim 1 with batch_first=True); any
        # padded positions in x are included in this mean.
        pooled = torch.mean(lstm_out, dim = 1)
        out = self.dropout(pooled)
        # No softmax here: the loss function expects unnormalised scores.
        return self.fc(out)

    def predict_proba(self, x):
        """Return class probabilities (softmax of the logits) for ``x``."""
        return self.softmax(self.forward(x))

In [None]:
# Build the classifier and move it to the selected device. The embedding
# table is sized from `vocab_size` (the value given to the Tokenizer as
# num_words) rather than a repeated literal, so the two cannot drift apart.
model = BiLSTMClassifier(
    vocab_size = vocab_size,
    embedding_dim = 100,
    hidden_dim = best_lstm_units,
    output_dim = 3,
    dropout = 0.3,
).to(device)

In [None]:
# AdamW over all model parameters, configured with the chosen learning rate
# and weight decay.
optimizer = AdamW(
    model.parameters(),
    weight_decay = best_weight_decay,
    lr = best_learning_rate,
)

# Multi-class cross-entropy objective (default settings).
criterion = nn.CrossEntropyLoss()

In [None]:
# Train for a fixed number of epochs, evaluating on the validation set after
# each one and recording loss, weighted classification metrics and timing.
num_epochs = 10

# Per-epoch history (one entry appended per epoch)
train_losses = []
val_losses = []
val_accuracies = []
val_f1s = []
val_recalls = []
val_precisions = []
epoch_times = []

total_start_time = time.time()

for epoch in range(num_epochs):
    epoch_start_time = time.time()

    # ---------- Training ----------
    model.train()
    epoch_train_losses = []  # mean loss of each batch
    epoch_train_sizes = []   # number of samples in each batch

    for batch in train_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        epoch_train_losses.append(loss.item())
        epoch_train_sizes.append(targets.size(0))

    # The criterion returns a per-batch mean, and the last batch can be
    # smaller than the rest. Weighting every batch mean by its size yields the
    # true per-sample average instead of over-counting the short batch.
    avg_train_loss = (
        sum(batch_loss * batch_size
            for batch_loss, batch_size in zip(epoch_train_losses, epoch_train_sizes))
        / sum(epoch_train_sizes)
    )
    train_losses.append(avg_train_loss)

    # ---------- Validation ----------
    model.eval()
    epoch_val_losses = []  # mean loss of each batch
    epoch_val_sizes = []   # number of samples in each batch
    all_preds = []
    all_targets = []

    # No gradients are needed while evaluating
    with torch.no_grad():
        for batch in val_loader:
            inputs, targets = batch
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, targets)
            epoch_val_losses.append(loss.item())
            epoch_val_sizes.append(targets.size(0))

            # Predicted class = index of the highest score per sample
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    # Same size-weighted average as for the training loss
    avg_val_loss = (
        sum(batch_loss * batch_size
            for batch_loss, batch_size in zip(epoch_val_losses, epoch_val_sizes))
        / sum(epoch_val_sizes)
    )
    val_accuracy = accuracy_score(all_targets, all_preds)
    val_f1 = f1_score(all_targets, all_preds, average="weighted")
    val_recall = recall_score(all_targets, all_preds, average="weighted")
    val_precision = precision_score(all_targets, all_preds, average="weighted")

    val_losses.append(avg_val_loss)
    val_accuracies.append(val_accuracy)
    val_f1s.append(val_f1)
    val_recalls.append(val_recall)
    val_precisions.append(val_precision)

    epoch_time = time.time() - epoch_start_time
    epoch_times.append(epoch_time)

    print(f"\nEpoch {epoch+1}/{num_epochs} - Time: {epoch_time:.2f} sec")
    print(f"Train Loss      : {avg_train_loss:.4f}")
    print(f"Val Loss        : {avg_val_loss:.4f}")
    print(f"Val Accuracy    : {val_accuracy:.4f}")
    print(f"Val F1 Score    : {val_f1:.4f}")
    print(f"Val Recall      : {val_recall:.4f}")
    print(f"Val Precision   : {val_precision:.4f}")

total_training_time = time.time() - total_start_time
print(f"\nTotal Training Time: {total_training_time:.2f} seconds")


Epoch 1/10 - Time: 53.76 sec
Train Loss      : 0.7534
Val Loss        : 0.7138
Val Accuracy    : 0.8341
Val F1 Score    : 0.8344
Val Recall      : 0.8341
Val Precision   : 0.8365

Epoch 2/10 - Time: 56.36 sec
Train Loss      : 0.7066
Val Loss        : 0.7178
Val Accuracy    : 0.8319
Val F1 Score    : 0.8313
Val Recall      : 0.8319
Val Precision   : 0.8313

Epoch 3/10 - Time: 57.61 sec
Train Loss      : 0.7011
Val Loss        : 0.7040
Val Accuracy    : 0.8449
Val F1 Score    : 0.8451
Val Recall      : 0.8449
Val Precision   : 0.8458

Epoch 4/10 - Time: 58.11 sec
Train Loss      : 0.6951
Val Loss        : 0.6988
Val Accuracy    : 0.8487
Val F1 Score    : 0.8477
Val Recall      : 0.8487
Val Precision   : 0.8478

Epoch 5/10 - Time: 57.62 sec
Train Loss      : 0.6901
Val Loss        : 0.6957
Val Accuracy    : 0.8519
Val F1 Score    : 0.8523
Val Recall      : 0.8519
Val Precision   : 0.8534

Epoch 6/10 - Time: 58.30 sec
Train Loss      : 0.6848
Val Loss        : 0.6944
Val Accuracy    : 0.

In [None]:
# Directory on Drive for the final model artefacts; created when missing
final_model_dir = "/content/drive/My Drive/ADL Final Project/lstm_final_model"
os.makedirs(final_model_dir, exist_ok = True)

# Write the model's state_dict to bilstm_model.pth inside that directory
model_path = os.path.join(final_model_dir, "bilstm_model.pth")
torch.save(obj = model.state_dict(), f = model_path)

In [None]:
# Destination for the fitted tokenizer, inside the lstm_final_model directory
tokenizer_path = "/content/drive/My Drive/ADL Final Project/lstm_final_model/tokenizer.pkl"

# Serialise the tokenizer with pickle and write the bytes to that file
with open(tokenizer_path, mode = 'wb') as tokenizer_file:
    tokenizer_file.write(pickle.dumps(tokenizer))