In [31]:
!pip install pyarrow
!pip install pandas scikit-learn



In [32]:
import warnings
warnings.filterwarnings('ignore',category=UserWarning)

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import math

In [34]:
from google.colab import drive
drive.mount('/content/drive')
train_df = pd.read_parquet('/content/drive/MyDrive/train.parquet')
test_df = pd.read_parquet('/content/drive/MyDrive/test.parquet')
train_df, val_df = train_test_split(train_df, test_size=0.1, stratify=train_df['label'], random_state=42)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
def build_vocab(texts, min_freq=2):
    counter = Counter()
    for text in texts:
        tokens = text.lower().split()
        counter.update(tokens)
    vocab = {"<unk>": 0, "<pad>": 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

vocab = build_vocab(train_df['text'])

In [36]:
def text_to_indices(text, vocab):
    return [vocab.get(token, vocab["<unk>"]) for token in text.lower().split()]

class TextDataset(Dataset):
    def __init__(self, df, vocab):
        self.texts = df['text'].tolist()
        self.labels = df['label'].tolist()
        self.vocab = vocab

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = torch.tensor(text_to_indices(self.texts[idx], self.vocab), dtype=torch.long)
        return tokens, self.labels[idx]

def collate_batch(batch):
    texts, labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True, padding_value=vocab["<pad>"])
    return texts, torch.tensor(labels)

In [37]:
train_loader = DataLoader(TextDataset(train_df, vocab), batch_size=64, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(TextDataset(val_df, vocab), batch_size=64, shuffle=False, collate_fn=collate_batch)
test_loader = DataLoader(TextDataset(test_df, vocab), batch_size=64, shuffle=False, collate_fn=collate_batch)

In [38]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div_term)
        pe[:, 1::2] = torch.cos(pos * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)

class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers, output_dim, pad_idx,max_len=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.pos_encoder = PositionalEncoding(embed_dim,max_len=max_len)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim,batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, output_dim)
        self.pad_idx=pad_idx

    def forward(self, x):
        mask = (x == vocab["<pad>"])
        x = self.embedding(x)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x, src_key_padding_mask=mask).mean(dim=1)
        return self.fc(x)

In [39]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TransformerClassifier(
    vocab_size=len(vocab),
    embed_dim=128,
    num_heads=4,
    hidden_dim=256,
    num_layers=2,
    output_dim=4,
    pad_idx=vocab["<pad>"]
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

def train_epoch(model, loader):
    model.train()
    total_loss, total_correct = 0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * len(y)
        total_correct += (output.argmax(1) == y).sum().item()
    return total_loss / len(loader.dataset), total_correct / len(loader.dataset)

def evaluate(model, loader):
    model.eval()
    total_loss, total_correct = 0, 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            output = model(x)
            loss = criterion(output, y)
            total_loss += loss.item() * len(y)
            total_correct += (output.argmax(1) == y).sum().item()
    return total_loss / len(loader.dataset), total_correct / len(loader.dataset)

In [40]:
for epoch in range(1, 6):
    train_loss, train_acc = train_epoch(model, train_loader)
    val_loss, val_acc = evaluate(model, val_loader)
    print(f"Epoch {epoch}: Train loss {train_loss:.4f}, acc {train_acc:.4f} | Val loss {val_loss:.4f}, acc {val_acc:.4f}")

test_loss, test_acc = evaluate(model, test_loader)
print(f"Final Test loss: {test_loss:.4f}, accuracy: {test_acc:.4f}")

Epoch 1: Train loss 0.5881, acc 0.7747 | Val loss 0.5388, acc 0.8627
Epoch 2: Train loss 0.3423, acc 0.8788 | Val loss 0.4559, acc 0.8876
Epoch 3: Train loss 0.2684, acc 0.9065 | Val loss 0.3940, acc 0.8984
Epoch 4: Train loss 0.2193, acc 0.9238 | Val loss 0.3605, acc 0.8982
Epoch 5: Train loss 0.1805, acc 0.9373 | Val loss 0.3519, acc 0.8971
Final Test loss: 0.3625, accuracy: 0.8978
