In [1]:
# Imports
import os
import pandas as pd
from collections import Counter
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import json

In [2]:
# Parameters
# Hyperparameters and output locations shared by the cells below.

# Fixed token-id sequence length: SentimentDataset pads or truncates to this.
max_len = 100
# Size of each token embedding vector (nn.Embedding output / LSTM input).
embedding_dim = 64
# LSTM hidden size per direction; the classifier head sees hidden_dim * 2.
hidden_dim = 120
# Number of stacked LSTM layers.
num_layers = 1
# Dropout probability passed to both nn.LSTM and the nn.Dropout before the head.
dropout = 0.5
# Mini-batch size used by both the training and validation DataLoaders.
batch_size = 18
# Upper bound on training epochs (early stopping may end training sooner).
epochs = 15
# Consecutive epochs without a new best validation accuracy before stopping.
patience_limit = 4

# Directory that receives the vocabulary JSON and the best model checkpoint;
# created here if missing. The path is relative to the working directory.
save_path = '../../models/sentiment/'
os.makedirs(save_path, exist_ok=True)
vocab_path = os.path.join(save_path, 'vocab.json')

In [3]:
# Read the preprocessed training and validation splits from CSV.
train_df, val_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/sentiment-analysis/processed_train.csv',
        '../../data/sentiment-analysis/processed_val.csv',
    )
)

In [4]:
# Build vocabulary
def build_vocab(texts, min_freq=2):
    """Build a token -> index mapping from whitespace-tokenised texts.

    Args:
        texts: Iterable of strings. Non-string entries (for example NaN for a
            missing value) are skipped instead of raising.
        min_freq: Minimum number of occurrences a token needs to be kept.

    Returns:
        dict mapping '<PAD>' to 0, '<UNK>' to 1, and every kept token to a
        contiguous index starting at 2, in order of first appearance. Indices
        always cover ``range(len(result))``, so ``len(result)`` is a valid
        embedding-table size.
    """
    counter = Counter()
    for text in texts:
        # A missing value is not a str and has no .split(); ignore it.
        if isinstance(text, str):
            counter.update(text.split())

    word2idx = {'<PAD>': 0, '<UNK>': 1}
    for word, freq in counter.items():
        # A corpus token that is literally '<PAD>' or '<UNK>' must not claim a
        # second slot: that would leave a gap and push the largest index past
        # len(word2idx) - 1.
        if freq >= min_freq and word not in word2idx:
            word2idx[word] = len(word2idx)
    return word2idx

# Fit the vocabulary on the training split only, then persist it as JSON so
# the same token ids can be reused outside this notebook.
vocab = build_vocab(train_df['clean_text'])
vocab_size = len(vocab)

with open(vocab_path, 'w') as f:
    f.write(json.dumps(vocab))

print(f"Vocab saved at: {vocab_path}")
print(f"Vocab size: {vocab_size}")

Vocab saved at: ../../models/sentiment/vocab.json
Vocab size: 9298


In [5]:
# Dataset Class
class SentimentDataset(Dataset):
    """Map-style dataset yielding ``(token_ids, label)`` tensor pairs.

    Texts come from the ``clean_text`` column of ``df`` (missing values are
    replaced by the empty string) and labels from its ``label`` column.
    Every text is encoded to exactly ``max_len`` ids using ``word2idx``.
    """

    def __init__(self, df, word2idx, max_len=max_len):
        self.word2idx = word2idx
        self.max_len = max_len
        self.texts = df['clean_text'].fillna('').values
        self.labels = df['label'].values

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as two ``torch.long`` tensors."""
        token_ids = self.encode(self.texts[idx])
        features = torch.tensor(token_ids, dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, target

    def encode(self, text):
        """Turn ``text`` into a list of exactly ``max_len`` token ids.

        Tokens are split on whitespace; unknown tokens map to the '<UNK>' id.
        Short sequences are right-padded with the '<PAD>' id, long ones are
        cut off after ``max_len`` tokens.
        """
        ids = [
            self.word2idx.get(token, self.word2idx['<UNK>'])
            for token in text.split()
        ]
        shortfall = self.max_len - len(ids)
        if shortfall > 0:
            ids.extend([self.word2idx['<PAD>']] * shortfall)
            return ids
        return ids[:self.max_len]

In [6]:
# Wrap each split in a dataset that shares the training vocabulary.
train_dataset = SentimentDataset(df=train_df, word2idx=vocab)
val_dataset = SentimentDataset(df=val_df, word2idx=vocab)

# Only the training batches are shuffled; validation keeps file order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [10]:
# Model
class BiLSTMClassifier(nn.Module):
    """Embedding -> bidirectional LSTM -> dropout -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of output classes (logits per example).
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied before the linear head and,
            when ``num_layers > 1``, between LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                 output_dim=3, num_layers=num_layers, dropout=dropout):
        super().__init__()
        # Index 0 is the padding id; its embedding stays zero and gets no gradient.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # nn.LSTM applies dropout only between stacked layers. With a single
        # layer a non-zero value does nothing except raise a UserWarning, so
        # pass it through only when there is more than one layer.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers,
                            batch_first=True, dropout=lstm_dropout, bidirectional=True)
        # Forward and backward final states are concatenated -> hidden_dim * 2.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: Long tensor of token ids, batch-first.

        Returns:
            Tensor of logits with ``output_dim`` values per example.
        """
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        # hidden[-2] / hidden[-1] are the last layer's forward / backward
        # final states. NOTE(review): the input is not packed, so the forward
        # state is read after any trailing padding steps; consider
        # pack_padded_sequence if padded tails hurt accuracy.
        hidden_cat = torch.cat((hidden[-2], hidden[-1]), dim=1)
        out = self.dropout(hidden_cat)
        return self.fc(out)

In [11]:
# Training setup
# Seed PyTorch before the model is built so weight initialisation, dropout
# masks and DataLoader shuffling are repeatable when this cell and the
# training cell are re-run. (Some CUDA kernels can still be non-deterministic.)
seed = 42
torch.manual_seed(seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTMClassifier(vocab_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
# mode='max' because the scheduler is stepped with validation accuracy:
# the learning rate is halved after `patience` epochs without improvement.
scheduler = ReduceLROnPlateau(optimizer, mode='max', patience=2, factor=0.5, min_lr=1e-5)



In [12]:
# Training loop with early stopping
# Each epoch: one pass over train_loader with gradient updates, then an
# accuracy-only pass over val_loader. The weights with the best validation
# accuracy are checkpointed; training stops after `patience_limit` epochs
# in a row without a new best.
best_val_acc = 0
patience_counter = 0
best_model_path = os.path.join(save_path, 'best_sentiment_model.pt')
checkpoint_saved = False

for epoch in range(epochs):
    # Train
    model.train()
    total, correct = 0, 0
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        output = model(x_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        preds = output.argmax(dim=1)
        correct += (preds == y_batch).sum().item()
        total += y_batch.size(0)
    train_acc = correct / total

    # Validation
    model.eval()
    val_correct, val_total = 0, 0
    with torch.no_grad():
        for x_batch, y_batch in val_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            output = model(x_batch)
            preds = output.argmax(dim=1)
            val_correct += (preds == y_batch).sum().item()
            val_total += y_batch.size(0)
    val_acc = val_correct / val_total
    print(f"Epoch {epoch+1}: Train acc {train_acc:.3f}, Val acc {val_acc:.3f}")

    # The scheduler was built with mode='max', so it is fed the accuracy.
    scheduler.step(val_acc)
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        patience_counter = 0
        torch.save(model.state_dict(), best_model_path)
        checkpoint_saved = True
    else:
        patience_counter += 1
        if patience_counter >= patience_limit:
            print("Early stopping!")
            break

# When the loop ends, `model` holds the weights of the last epoch, which may
# be several epochs past the best one. Restore the best checkpoint so that
# later cells using `model` work with the weights that were actually saved.
if checkpoint_saved:
    model.load_state_dict(torch.load(best_model_path, map_location=device))

print("Finished training. Best val acc:", best_val_acc)

Epoch 1: Train acc 0.538, Val acc 0.636
Epoch 2: Train acc 0.678, Val acc 0.684
Epoch 3: Train acc 0.741, Val acc 0.695
Epoch 4: Train acc 0.777, Val acc 0.707
Epoch 5: Train acc 0.807, Val acc 0.706
Epoch 6: Train acc 0.830, Val acc 0.703
Epoch 7: Train acc 0.855, Val acc 0.700
Epoch 8: Train acc 0.893, Val acc 0.704
Early stopping!
Finished training. Best val acc: 0.7069383794274624
