In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models.fasttext import load_facebook_model
from torch.utils.data import DataLoader, Dataset
import numpy as np

FASTTEXT_PATH = "FastText.bin"
TRAIN_PATH = "TRAIN.tsv"
TEST_PATH = "Test-2.tsv"

print("Učitavanje FastText modela...")
ft_model = load_facebook_model(FASTTEXT_PATH)
embedding_dim = ft_model.vector_size

def tokenize(text):
    return text.lower().split()

class FastTextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = [tokenize(text) for text in texts]
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        tokens = self.texts[idx]
        vectors = [ft_model.wv[token] if token in ft_model.wv else np.zeros(embedding_dim) for token in tokens]
        max_len = 50
        if len(vectors) > max_len:
            vectors = vectors[:max_len]
        else:
            vectors += [np.zeros(embedding_dim)] * (max_len - len(vectors))
        return torch.tensor(vectors, dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.long)

class CNNClassifier(nn.Module):
    def __init__(self, embedding_dim, num_classes):
        super(CNNClassifier, self).__init__()
        self.conv1 = nn.Conv1d(embedding_dim, 100, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(100, num_classes)

    def forward(self, x):
        x = x.permute(0, 2, 1)
        x = self.relu(self.conv1(x))
        x = self.pool(x).squeeze(2)
        x = self.dropout(x)
        return self.fc(x)

class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_classes):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        x = torch.cat((hn[-2], hn[-1]), dim=1)  # concatenate forward and backward hidden states
        x = self.dropout(x)
        return self.fc(x)

print("Učitavanje podataka...")
train_df = pd.read_csv(TRAIN_PATH, sep="\t").rename(columns={"Sentence": "text", "Label": "label"})
test_df = pd.read_csv(TEST_PATH, sep="\t").rename(columns={"Sentence": "text", "Label": "label"})
train_df["label"] = train_df["label"].astype(int)
test_df["label"] = test_df["label"].astype(int)

num_classes = train_df["label"].nunique()

train_dataset = FastTextDataset(train_df["text"].tolist(), train_df["label"].tolist())
test_dataset = FastTextDataset(test_df["text"].tolist(), test_df["label"].tolist())

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader):
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            output = model(x)
            preds = torch.argmax(output, dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)
    return correct / total

for model_type in ["LSTM", "CNN"]:
    print(f"\n==============================")
    print(f"Treniramo model: {model_type}")
    print(f"==============================")

    if model_type == "LSTM":
        model = LSTMClassifier(embedding_dim, hidden_dim=256, num_classes=num_classes)
    else:
        model = CNNClassifier(embedding_dim, num_classes=num_classes)

    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(1, 11):
        train_loss = train(model, train_loader, optimizer, criterion)
        test_acc = evaluate(model, test_loader)
        print(f"{model_type} | Epoch {epoch} | Loss: {train_loss:.4f} | Test Accuracy: {test_acc:.4f}")

    model_path = f"fasttext_{model_type.lower()}.pt"
    torch.save(model.state_dict(), model_path)
    print(f"{model_type} model spremljen kao: {model_path}")

Učitavanje FastText modela...
Učitavanje podataka...

Treniramo model: LSTM


  return torch.tensor(vectors, dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.long)


LSTM | Epoch 1 | Loss: 1.0857 | Test Accuracy: 0.2578
LSTM | Epoch 2 | Loss: 1.0200 | Test Accuracy: 0.5385
LSTM | Epoch 3 | Loss: 0.9697 | Test Accuracy: 0.5236
LSTM | Epoch 4 | Loss: 0.9375 | Test Accuracy: 0.4764
LSTM | Epoch 5 | Loss: 0.9093 | Test Accuracy: 0.6194
LSTM | Epoch 6 | Loss: 0.9120 | Test Accuracy: 0.6505
LSTM | Epoch 7 | Loss: 0.8765 | Test Accuracy: 0.5843
LSTM | Epoch 8 | Loss: 0.8600 | Test Accuracy: 0.4494
LSTM | Epoch 9 | Loss: 0.8354 | Test Accuracy: 0.5776
LSTM | Epoch 10 | Loss: 0.8195 | Test Accuracy: 0.5263
LSTM model spremljen kao: fasttext_lstm.pt

Treniramo model: CNN
CNN | Epoch 1 | Loss: 1.0576 | Test Accuracy: 0.5466
CNN | Epoch 2 | Loss: 0.9960 | Test Accuracy: 0.5452
CNN | Epoch 3 | Loss: 0.9448 | Test Accuracy: 0.6032
CNN | Epoch 4 | Loss: 0.9036 | Test Accuracy: 0.5587
CNN | Epoch 5 | Loss: 0.8664 | Test Accuracy: 0.5439
CNN | Epoch 6 | Loss: 0.8334 | Test Accuracy: 0.6100
CNN | Epoch 7 | Loss: 0.8104 | Test Accuracy: 0.6100
CNN | Epoch 8 | Loss: 0