In [1]:
# Imports
import os
import pandas as pd
import numpy as np
from collections import Counter
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

In [2]:
# Load data
# Output locations for the saved vocabulary and the model checkpoint.
vocab_path = '../../models/sentiment/vocab.json'
save_path = '../../models/sentiment/'

# Shared hyperparameters: max_len is the default sequence length of
# SentimentDataset, embedding_dim the default embedding size of BiLSTMClassifier.
max_len = 100
embedding_dim = 256

# Prep text column safety: drop rows with a missing clean_text, then coerce the
# remaining values to str so that .split() always works downstream.
train_df = (
    pd.read_csv('../../data/sentiment-analysis/processed_train.csv')
    .dropna(subset=['clean_text'])
    .assign(clean_text=lambda frame: frame['clean_text'].astype(str))
)
val_df = (
    pd.read_csv('../../data/sentiment-analysis/processed_val.csv')
    .dropna(subset=['clean_text'])
    .assign(clean_text=lambda frame: frame['clean_text'].astype(str))
)

In [3]:
# Build Vocabulary from training data (simple word-to-index)
def build_vocab(texts, min_freq=2):
    """Build a word-to-index mapping from whitespace-tokenised texts.

    Index 0 is reserved for '<PAD>' and index 1 for '<UNK>'. Every other token
    that occurs at least ``min_freq`` times receives the next free index
    (starting at 2), in first-seen order.

    Args:
        texts: iterable of strings; each one is split on whitespace.
        min_freq: minimum number of occurrences for a token to be kept.

    Returns:
        dict mapping token -> int index. The indices are contiguous
        (0 .. len(result) - 1), so len(result) is a valid embedding-table size.
    """
    counter = Counter()
    for text in texts:
        counter.update(text.split())

    word2idx = {'<PAD>': 0, '<UNK>': 1}
    for word, freq in counter.items():
        # A corpus token that happens to be spelled '<PAD>' or '<UNK>' must not
        # claim its own slot: it would be overwritten by the reserved index and
        # leave a hole, pushing the largest index past len(word2idx) - 1.
        if freq >= min_freq and word not in word2idx:
            word2idx[word] = len(word2idx)
    return word2idx

vocab = build_vocab(train_df['clean_text'])
# An embedding table needs (largest index + 1) rows. This equals len(vocab)
# whenever the indices are contiguous, and stays safe if they are not.
vocab_size = max(vocab.values()) + 1

# Save
import json
# The target directory may not exist yet on a fresh checkout (the training
# loop only creates it later), so create it before opening the file.
vocab_dir = os.path.dirname(vocab_path)
if vocab_dir:
    os.makedirs(vocab_dir, exist_ok=True)
with open(vocab_path, 'w', encoding='utf-8') as f:
    json.dump(vocab, f)

print("Vocab saved.")
print("Vocab size:", vocab_size)

Vocab saved.
Vocab size: 9298


In [4]:
# Dataset class
class SentimentDataset(Dataset):
    """Map-style dataset yielding (token_ids, label) tensor pairs.

    Args:
        df: DataFrame providing a 'clean_text' column (whitespace-tokenisable
            strings) and a 'label' column.
        word2idx: token -> index mapping containing '<PAD>' and '<UNK>' entries.
        max_len: every encoded sequence is padded or truncated to this length.
    """

    def __init__(self, df, word2idx, max_len=max_len):
        self.texts = df['clean_text'].values
        self.labels = df['label'].values
        self.word2idx = word2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        token_ids = self.encode(self.texts[idx])
        return (
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.long),
        )

    def encode(self, text):
        """Turn ``text`` into a list of exactly ``max_len`` vocabulary indices.

        Unknown tokens map to the '<UNK>' index; short sequences are padded on
        the right with the '<PAD>' index and long ones are cut off at the end.
        """
        lookup = self.word2idx
        ids = [lookup.get(tok, lookup['<UNK>']) for tok in text.split()]
        shortfall = self.max_len - len(ids)
        if shortfall > 0:
            ids += [lookup['<PAD>']] * shortfall
            return ids
        return ids[:self.max_len]

In [5]:
# DataLoaders
# Both splits are encoded with the vocabulary built from the training data.
train_dataset = SentimentDataset(df=train_df, word2idx=vocab)
val_dataset = SentimentDataset(df=val_df, word2idx=vocab)

# Only the training split is shuffled; validation is read in dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)

In [7]:
# LSTM classifier
class LSTMClassifier(nn.Module):
    """Unidirectional LSTM text classifier: embedding -> LSTM -> dropout -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of output classes (logits per example).
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability, used between stacked LSTM layers and on
            the final hidden state before the linear layer.
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=128, output_dim=3, num_layers=2, dropout=0.3):
        super().__init__()
        # Index 0 is the padding token; its embedding stays fixed at zero.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # nn.LSTM only applies dropout between stacked layers and warns when a
        # non-zero value is passed with a single layer, so pass 0 in that case.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, dropout=inter_layer_dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: integer tensor of token ids, expected as (batch, seq_len) since
                the LSTM is built with batch_first=True.

        Returns:
            Tensor of shape (batch, output_dim) with unnormalised scores.
        """
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        # hidden[-1] is the top layer's state after the last time step.
        # NOTE(review): sequences are not packed, so for padded input this
        # state is taken after the pad positions have been processed.
        out = self.dropout(hidden[-1])
        return self.fc(out)

# Bi-LSTM
class BiLSTMClassifier(nn.Module):
    """Bidirectional LSTM text classifier: embedding -> BiLSTM -> dropout -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding (defaults to the
            module-level ``embedding_dim``).
        hidden_dim: size of the LSTM hidden state per direction.
        output_dim: number of output classes (logits per example).
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability, used between stacked LSTM layers and on
            the concatenated final hidden states before the linear layer.
    """

    def __init__(self, vocab_size, embedding_dim=embedding_dim, hidden_dim=128, output_dim=3, num_layers=2, dropout=0.3):
        super().__init__()
        # Index 0 is the padding token; its embedding stays fixed at zero.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # nn.LSTM only applies dropout between stacked layers and warns when a
        # non-zero value is passed with a single layer, so pass 0 in that case.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, dropout=inter_layer_dropout, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # 2 for bidirectional
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: integer tensor of token ids, expected as (batch, seq_len) since
                the LSTM is built with batch_first=True.

        Returns:
            Tensor of shape (batch, output_dim) with unnormalised scores.
        """
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        # The last two entries of `hidden` are the top layer's forward and
        # backward final states; join them along the feature dimension.
        hidden_cat = torch.cat((hidden[-2], hidden[-1]), dim=1)
        out = self.dropout(hidden_cat)
        return self.fc(out)

# Instantiate the model
# NOTE(review): the training-setup cell rebinds `model` to a BiLSTMClassifier,
# so this LSTM instance is not the one that ends up being trained.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = LSTMClassifier(vocab_size=vocab_size).to(device=device)

In [8]:
# Training setup
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = BiLSTMClassifier(vocab_size=vocab_size).to(device=device)

# Adam over all model parameters, with cross-entropy on the raw logits.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [9]:
# Training loop with best model save
num_epochs = 7
best_val_acc = 0.0
best_model_path = os.path.join(save_path, 'best_sentiment_model.pt')

# Create the checkpoint directory once, before training starts, so a problem
# with the path surfaces immediately rather than after the first epoch.
os.makedirs(save_path, exist_ok=True)

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    correct, total = 0, 0
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        output = model(x_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        # Training accuracy uses the logits computed before this step's update.
        preds = output.argmax(dim=1)
        correct += (preds == y_batch).sum().item()
        total += y_batch.size(0)
    # Guard against an empty loader instead of raising ZeroDivisionError.
    train_acc = correct / total if total else 0.0

    # ---- validation pass (no gradients, dropout disabled via eval()) ----
    model.eval()
    val_correct, val_total = 0, 0
    with torch.no_grad():
        for x_batch, y_batch in val_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            output = model(x_batch)
            preds = output.argmax(dim=1)
            val_correct += (preds == y_batch).sum().item()
            val_total += y_batch.size(0)
    val_acc = val_correct / val_total if val_total else 0.0

    print(f"Epoch {epoch+1}: Train acc {train_acc:.3f}, Val acc {val_acc:.3f}")
    # Keep only the weights of the epoch with the highest validation accuracy.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), best_model_path)

print('Finished training. Best val acc:', best_val_acc)

Epoch 1: Train acc 0.586, Val acc 0.665
Epoch 2: Train acc 0.720, Val acc 0.699
Epoch 3: Train acc 0.785, Val acc 0.707
Epoch 4: Train acc 0.835, Val acc 0.701
Epoch 5: Train acc 0.882, Val acc 0.699
Epoch 6: Train acc 0.914, Val acc 0.683
Epoch 7: Train acc 0.939, Val acc 0.691
Finished training. Best val acc: 0.7074235807860262
