In [19]:
!pip install torch torchvision torchaudio
!pip install pandas
!pip install pyarrow



In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn

In [21]:
from google.colab import drive
drive.mount('/content/drive')

train_df = pd.read_parquet('/content/drive/MyDrive/train.parquet')
test_df = pd.read_parquet('/content/drive/MyDrive/test.parquet')

train_df, val_df = train_test_split(train_df, test_size=0.1, stratify=train_df['label'], random_state=42)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
def build_vocab(texts, min_freq=2):
    counter = Counter()
    for text in texts:
        tokens = text.lower().split()
        counter.update(tokens)
    vocab = {"<unk>": 0, "<pad>": 1}
    idx = 2
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = idx
            idx += 1
    return vocab

vocab = build_vocab(train_df['text'])

In [23]:
def text_to_indices(text, vocab):
    return [vocab.get(token, vocab["<unk>"]) for token in text.lower().split()]

class TextDataset(Dataset):
    def __init__(self, df, vocab):
        self.df = df
        self.vocab = vocab

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx]['text']
        label = self.df.iloc[idx]['label']
        tokens = torch.tensor(text_to_indices(text, self.vocab), dtype=torch.long)
        return tokens, label

def collate_batch(batch):
    texts, labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True, padding_value=vocab["<pad>"])
    labels = torch.tensor(labels, dtype=torch.long)
    return texts, labels


In [24]:
train_dataset = TextDataset(train_df, vocab)
val_dataset = TextDataset(val_df, vocab)
test_dataset = TextDataset(test_df, vocab)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_batch)

In [25]:
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        _, hidden = self.rnn(x)
        return self.fc(hidden.squeeze(0))

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = RNNClassifier(
    vocab_size=len(vocab),
    embed_dim=100,
    hidden_dim=128,
    output_dim=len(train_df['label'].unique()),
    pad_idx=vocab["<pad>"]
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def train_epoch(model, loader):
    model.train()
    total_loss, total_correct = 0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * len(y)
        total_correct += (output.argmax(1) == y).sum().item()
    return total_loss / len(loader.dataset), total_correct / len(loader.dataset)

def evaluate(model, loader):
    model.eval()
    total_loss, total_correct = 0, 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            output = model(x)
            loss = criterion(output, y)
            total_loss += loss.item() * len(y)
            total_correct += (output.argmax(1) == y).sum().item()
    return total_loss / len(loader.dataset), total_correct / len(loader.dataset)

In [27]:
for epoch in range(1, 6):
    train_loss, train_acc = train_epoch(model, train_loader)
    val_loss, val_acc = evaluate(model, val_loader)
    print(f"Epoch {epoch}: Train loss {train_loss:.4f}, acc {train_acc:.4f} | Val loss {val_loss:.4f}, acc {val_acc:.4f}")

Epoch 1: Train loss 1.3851, acc 0.2511 | Val loss 1.3826, acc 0.2528
Epoch 2: Train loss 1.3832, acc 0.2533 | Val loss 1.3833, acc 0.2507
Epoch 3: Train loss 1.3903, acc 0.2538 | Val loss 1.3979, acc 0.2529
Epoch 4: Train loss 1.3892, acc 0.2653 | Val loss 1.3810, acc 0.2809
Epoch 5: Train loss 1.3892, acc 0.2596 | Val loss 1.3866, acc 0.2712


In [28]:
test_loss, test_acc = evaluate(model, test_loader)
print(f"Final Test loss: {test_loss:.4f}, accuracy: {test_acc:.4f}")

Final Test loss: 1.3871, accuracy: 0.2722
