In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import pandas as pd

# Location of the training CSV.
# NOTE(review): absolute path under a user home directory; adjust it when running elsewhere.
DATA_CSV = "/Users/nguyennguyen/Desktop/github_repos/personal/rag_medical/src/data/emotion_data/training_small.csv"

# Load the table, then coerce the `label` column to integers.
df = pd.read_csv(DATA_CSV)
df = df.astype({"label": int})

# Simple tokenizer and vocab builder
def build_vocab(texts):
    """Assign an integer id to every whitespace-separated token in `texts`.

    Id 0 is reserved for the ``"<pad>"`` entry; all other tokens receive
    consecutive ids in order of first appearance.

    Args:
        texts: Iterable of strings.

    Returns:
        dict mapping each token (plus ``"<pad>"``) to its integer id.
    """
    vocab = {"<pad>": 0}
    for sentence in texts:
        for token in sentence.split():
            # len(vocab) is evaluated before any insertion, so an unseen token
            # gets the next free id and a known token is left untouched.
            vocab.setdefault(token, len(vocab))
    return vocab

class TextDataset(Dataset):
    """Dataset of token-id sequences paired with class labels.

    Each text is split on whitespace and every token is looked up in `vocab`;
    tokens that are missing from `vocab` are encoded as 0.

    Args:
        texts: Sequence of strings.
        labels: Sequence of labels convertible by ``torch.tensor``; expected to
            align one-to-one with `texts`.
        vocab: Mapping from token to integer id.
    """

    def __init__(self, texts, labels, vocab):
        # dtype is pinned to long: for a text with no tokens, torch.tensor([])
        # would otherwise infer float32, which cannot be used as embedding
        # indices and can dictate the dtype of a padded batch.
        self.data = [
            torch.tensor([vocab.get(word, 0) for word in text.split()], dtype=torch.long)
            for text in texts
        ]
        self.labels = torch.tensor(labels)

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(token_ids, label)`` pair stored at position `idx`."""
        return self.data[idx], self.labels[idx]

def collate(batch):
    """Combine ``(token_ids, label)`` pairs into one padded batch.

    Args:
        batch: Sequence of ``(1-D tensor, label)`` pairs.

    Returns:
        Tuple ``(padded, targets)``: `padded` stacks the sequences batch-first,
        right-padded with zeros to the longest one; `targets` is a tensor built
        from the labels.
    """
    sequences, targets = zip(*batch)
    padded = pad_sequence(sequences, batch_first=True)
    return padded, torch.tensor(targets)

class LSTMClassifier(nn.Module):
    """Embedding -> single-layer LSTM -> linear classifier over token-id batches.

    The LSTM is run on packed sequences so that the hidden state used for
    classification is the one after the last non-padding token of each row,
    regardless of how much right-padding the batch adds.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        num_classes: Number of output logits.
        pad_idx: Token id treated as padding. Its embedding is fixed at zero and
            trailing occurrences are excluded from the recurrence.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128, num_classes=6, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)`` for ids `x` of shape ``(batch, seq)``."""
        # 1-based position of the last non-pad token in every row. Using the
        # last position (rather than a count) keeps interior pad-valued tokens
        # inside the sequence; rows that are entirely padding are clamped to
        # length 1 because packing rejects zero-length sequences.
        steps = torch.arange(1, x.size(1) + 1, device=x.device)
        lengths = ((x != self.pad_idx).long() * steps).max(dim=1).values.clamp(min=1)
        packed = nn.utils.rnn.pack_padded_sequence(
            self.embed(x),
            lengths.cpu(),  # pack_padded_sequence expects lengths on the CPU
            batch_first=True,
            enforce_sorted=False,  # batches are not sorted by length
        )
        _, (h_n, _) = self.lstm(packed)
        # h_n[-1]: final hidden state of the last LSTM layer, in original batch order.
        return self.fc(h_n[-1])

# Prepare data
# A fixed seed keeps the train/test partition, and therefore the vocabulary
# built from the training split, identical across kernel restarts.
SPLIT_SEED = 42
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=SPLIT_SEED
)

# The vocabulary comes from the training split only, so words that appear
# solely in the test split have no entry in it.
vocab = build_vocab(train_texts)

train_ds = TextDataset(train_texts.tolist(), train_labels.tolist(), vocab)
test_ds = TextDataset(test_texts.tolist(), test_labels.tolist(), vocab)

# Only the training loader shuffles; evaluation order does not matter.
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate)
test_dl = DataLoader(test_ds, batch_size=32, collate_fn=collate)

# Train
TRAIN_SEED = 42
NUM_EPOCHS = 200

# Seed torch's global RNG before the model is created so that weight
# initialisation and the DataLoader shuffle order repeat across runs.
# NOTE(review): some GPU kernels may still be nondeterministic.
torch.manual_seed(TRAIN_SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(len(vocab)).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

for epoch in range(NUM_EPOCHS):
    model.train()
    running_loss = 0.0
    seen = 0
    for x, y in train_dl:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward()
        optimizer.step()
        # Weight each batch loss by its size so the epoch figure is a true
        # per-example mean even when the final batch is smaller.
        running_loss += loss.item() * y.size(0)
        seen += y.size(0)
    # max(..., 1) avoids a division by zero if the training loader is empty.
    print(f"Epoch {epoch+1} done - mean train loss: {running_loss / max(seen, 1):.4f}")

# Evaluate
# Put the model in evaluation mode and count correct predictions on the
# held-out loader without tracking gradients.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for x, y in test_dl:
        x = x.to(device)
        y = y.to(device)
        logits = model(x)
        preds = logits.argmax(dim=1)  # index of the highest logit per row
        correct += preds.eq(y).sum().item()
        total += y.shape[0]
accuracy = correct / total
print(f"Accuracy: {accuracy:.2%}")


Epoch 1 done
Epoch 2 done
Epoch 3 done
Epoch 4 done
Epoch 5 done
Epoch 6 done
Epoch 7 done
Epoch 8 done
Epoch 9 done
Epoch 10 done
Epoch 11 done
Epoch 12 done
Epoch 13 done
Epoch 14 done
Epoch 15 done
Epoch 16 done
Epoch 17 done
Epoch 18 done
Epoch 19 done
Epoch 20 done
Epoch 21 done
Epoch 22 done
Epoch 23 done
Epoch 24 done
Epoch 25 done
Epoch 26 done
Epoch 27 done
Epoch 28 done
Epoch 29 done
Epoch 30 done
Epoch 31 done
Epoch 32 done
Epoch 33 done
Epoch 34 done
Epoch 35 done
Epoch 36 done
Epoch 37 done
Epoch 38 done
Epoch 39 done
Epoch 40 done
Epoch 41 done
Epoch 42 done
Epoch 43 done
Epoch 44 done
Epoch 45 done
Epoch 46 done
Epoch 47 done
Epoch 48 done
Epoch 49 done
Epoch 50 done
Epoch 51 done
Epoch 52 done
Epoch 53 done
Epoch 54 done
Epoch 55 done
Epoch 56 done
Epoch 57 done
Epoch 58 done
Epoch 59 done
Epoch 60 done
Epoch 61 done
Epoch 62 done
Epoch 63 done
Epoch 64 done
Epoch 65 done
Epoch 66 done
Epoch 67 done
Epoch 68 done
Epoch 69 done
Epoch 70 done
Epoch 71 done
Epoch 72 done
E