In [3]:
# !pip install gensim scikit-learn pandas numpy torch tqdm

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter
import gensim

# --- LOAD DATA ---
# Both splits are read from CSV files located in the current working directory.
train_df, test2_df = (
    pd.read_csv(csv_path) for csv_path in ('train-2.csv', 'test-2.csv')
)

def get_text_column(df):
    """Return the name of the first column called 'sentence' or 'text'.

    The comparison is case-insensitive and columns are scanned in the order
    given by ``df.columns``.

    Raises:
        ValueError: if no column name matches.
    """
    accepted = ('sentence', 'text')
    found = next((name for name in df.columns if name.lower() in accepted), None)
    if found is None:
        raise ValueError("Nema stupca 'Sentence' ili 'Text'!")
    return found

# Resolve the text column name independently for each split.
train_text_col, test2_text_col = (
    get_text_column(frame) for frame in (train_df, test2_df)
)

# --- TOKENIZATION AND VOCABULARY ---
def tokenize(text):
    """Lowercase ``text`` and split it on whitespace, returning the token list."""
    lowered = text.lower()
    return lowered.split()

# Token frequencies are counted over the training texts only.
counter = Counter()
counter.update(
    token
    for sentence in train_df[train_text_col]
    for token in tokenize(sentence)
)
# Words are numbered from 2 in descending frequency order; ids 0 and 1 are
# reserved for the unknown-word and padding tokens, added after the words.
vocab = {
    word: idx
    for idx, (word, _) in enumerate(counter.most_common(), start=2)
}
vocab.update({'<unk>': 0, '<pad>': 1})

# --- EMBEDDING ---
# Pretrained vectors are read from a word2vec-format file through gensim.
embedding_path = 'cc.hr.300.vec'
embeddings = gensim.models.KeyedVectors.load_word2vec_format(embedding_path)
embedding_dim = embeddings.vector_size
# One row per vocabulary entry, addressed by the ids stored in `vocab`.
embedding_matrix = np.zeros((len(vocab), embedding_dim))
for word, idx in vocab.items():
    # Words absent from the pretrained vectors get a random Gaussian row.
    embedding_matrix[idx] = (
        embeddings[word]
        if word in embeddings
        else np.random.normal(scale=0.6, size=(embedding_dim, ))
    )

# --- DATASET ---
class TextDataset(Dataset):
    """Dataset yielding fixed-length token-id tensors paired with their labels.

    Texts are taken from ``df[text_col]`` and labels from ``df['Label']``.
    Each item is tokenized on access, mapped to ids through ``vocab``
    (unknown tokens map to the ``'<unk>'`` id), cut to ``max_len`` ids and
    right-padded with the ``'<pad>'`` id.
    """

    def __init__(self, df, text_col, vocab, max_len=50):
        self.texts = df[text_col].tolist()
        self.labels = df['Label'].tolist()
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Cut to max_len tokens first, then translate the survivors to ids.
        kept_tokens = tokenize(self.texts[idx])[:self.max_len]
        ids = [self.vocab.get(tok, self.vocab['<unk>']) for tok in kept_tokens]
        padding = [self.vocab['<pad>']] * (self.max_len - len(ids))
        return torch.tensor(ids + padding), torch.tensor(self.labels[idx])

# Sequence length and batch size shared by both splits.
max_len = 50
batch_size = 16

train_ds = TextDataset(df=train_df, text_col=train_text_col, vocab=vocab, max_len=max_len)
test2_ds = TextDataset(df=test2_df, text_col=test2_text_col, vocab=vocab, max_len=max_len)

# Only the training loader shuffles its samples.
train_dl = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
test2_dl = DataLoader(dataset=test2_ds, batch_size=batch_size)

# --- MODELS ---
class GRUClassifier(nn.Module):
    """GRU text classifier on top of frozen, pre-initialised embeddings.

    The embedding table is filled with ``embedding_matrix`` (a NumPy array of
    shape ``(num_embeddings, embedding_dim)``) and excluded from gradient
    updates. The last entry of the GRU's final hidden state goes through
    dropout and a linear layer that produces ``num_classes`` logits.
    """

    def __init__(self, embedding_matrix, hidden_dim=128, num_classes=3, dropout=0.7):
        super().__init__()
        vocab_size, embed_dim = embedding_matrix.shape
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Overwrite the default initialisation with the supplied vectors.
        with torch.no_grad():
            self.embedding.weight.copy_(torch.from_numpy(embedding_matrix))
        self.embedding.weight.requires_grad_(False)
        self.gru = nn.GRU(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        # Per-step outputs are discarded; only the final hidden state is used.
        _, final_hidden = self.gru(embedded)
        features = self.dropout(final_hidden[-1])
        return self.fc(features)

class CNNClassifier(nn.Module):
    """Convolutional text classifier on top of frozen, pre-initialised embeddings.

    The embedding table is filled with ``embedding_matrix`` (a NumPy array of
    shape ``(num_embeddings, embedding_dim)``) and excluded from gradient
    updates. For every kernel size ``k`` a convolution spans ``k`` consecutive
    token positions and the full embedding width; each feature map is
    ReLU-activated and max-pooled over positions. The pooled features are
    concatenated, passed through dropout and a linear layer that produces
    ``num_classes`` logits.

    Args:
        embedding_matrix: initial embedding weights.
        num_filters: number of output channels per kernel size.
        kernel_sizes: iterable of window heights, one convolution each.
        num_classes: number of output logits.
        dropout: dropout probability applied before the final layer.

    Note:
        Input sequences must contain at least ``max(kernel_sizes)`` positions,
        otherwise the widest convolution cannot be applied.
    """

    def __init__(self, embedding_matrix, num_filters=128, kernel_sizes=(3, 4, 5), num_classes=3, dropout=0.7):
        super().__init__()
        # The default is an immutable tuple so no list object is shared between
        # instances; materialising also lets callers pass any iterable.
        kernel_sizes = tuple(kernel_sizes)
        num_embeddings, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        self.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.embedding.weight.requires_grad = False
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embedding_dim)) for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, x):
        x = self.embedding(x)
        # Add a singleton channel axis so Conv2d sees one input channel.
        x = x.unsqueeze(1)
        # Each kernel covers the whole embedding width, so the last axis is 1.
        x = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]
        # Max over positions keeps the strongest response per filter.
        x = [torch.max(pool, dim=2)[0] for pool in x]
        x = torch.cat(x, dim=1)
        x = self.dropout(x)
        return self.fc(x)

# --- TRAINING AND EVALUATION ---
def train_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    The model is switched to training mode. Every batch is moved to
    ``device`` and followed by one optimizer step. The result is the sum of
    the per-batch loss values divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0
    for inputs, labels in dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss / len(dataloader)

def eval_model(model, dataloader, device):
    """Collect argmax predictions and true labels over ``dataloader``.

    The model is switched to evaluation mode and run without gradient
    tracking. Inputs are moved to ``device``; labels are read on the CPU.

    Returns:
        A ``(preds, targets)`` pair of NumPy arrays covering all batches in order.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for inputs, labels in dataloader:
            batch_logits = model(inputs.to(device))
            # The class index with the highest logit is the prediction.
            predicted.extend(batch_logits.argmax(1).cpu().numpy())
            expected.extend(labels.numpy())
    return np.array(predicted), np.array(expected)

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def run_training(model_class, name, epochs=30, dropout=0.7, lr=5e-4):
    """Train a fresh ``model_class`` instance and evaluate it on the test loader.

    The model is built from the module-level ``embedding_matrix``, trained
    with Adam and cross-entropy on ``train_dl`` for ``epochs`` epochs (the
    training loss is printed after each epoch), then evaluated once on
    ``test2_dl``. The classification report and confusion matrix are printed.

    Args:
        model_class: callable accepting ``(embedding_matrix, dropout=...)``.
        name: label used as a prefix in the printed progress lines.
        epochs: number of passes over the training loader.
        dropout: dropout probability forwarded to the model.
        lr: learning rate for the Adam optimizer.

    Returns:
        A dict with macro-averaged ``precision``, ``recall`` and ``f1``, the
        overall ``accuracy``, the ``confusion_matrix`` as nested lists, and
        ``full_report`` holding the formatted text report.
    """
    # NOTE(review): assumes label ids 0/1/2 mean positive/neutral/negative —
    # confirm against the dataset's label encoding.
    target_names = ["positive", "neutral", "negative"]

    print(f"\n{name} training...")
    model = model_class(embedding_matrix, dropout=dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    for epoch in range(epochs):
        train_loss = train_epoch(model, train_dl, optimizer, criterion, device)
        print(f"{name} Epoch {epoch+1}: Train loss {train_loss:.4f}")

    preds, targets = eval_model(model, test2_dl, device)
    report = classification_report(targets, preds, digits=4, output_dict=True, target_names=target_names)
    # The text report is built once and reused for printing and the result dict.
    full_report = classification_report(targets, preds, digits=4, target_names=target_names)
    matrix = confusion_matrix(targets, preds)
    print("\nClassification report:\n", full_report)
    print("Confusion matrix:\n", matrix)
    return {
        'precision': report['macro avg']['precision'],
        'recall': report['macro avg']['recall'],
        'f1': report['macro avg']['f1-score'],
        'accuracy': report['accuracy'],
        'confusion_matrix': matrix.tolist(),
        'full_report': full_report,
    }

gru_results = run_training(GRUClassifier, "GRU", epochs=30, dropout=0.7, lr=5e-4)
cnn_results = run_training(CNNClassifier, "CNN", epochs=30, dropout=0.7, lr=5e-4)

# --- Save to results.md ---
# One markdown section per model, written in the order GRU then CNN.
with open('results.md', 'w', encoding='utf-8') as f:
    for model_name, results in (('GRU', gru_results), ('CNN', cnn_results)):
        f.write(
            f"## {model_name}\n\n"
            f"- Precision: {results['precision']:.4f}\n"
            f"- Recall: {results['recall']:.4f}\n"
            f"- F1: {results['f1']:.4f}\n"
            f"- Accuracy: {results['accuracy']:.4f}\n"
            f"- Confusion matrix: {results['confusion_matrix']}\n\n"
            f"Full classification report:\n{results['full_report']}\n\n"
        )



GRU training...
GRU Epoch 1: Train loss 0.9762
GRU Epoch 2: Train loss 0.9636
GRU Epoch 3: Train loss 0.9614
GRU Epoch 4: Train loss 0.9509
GRU Epoch 5: Train loss 0.9424
GRU Epoch 6: Train loss 0.9390
GRU Epoch 7: Train loss 0.9315
GRU Epoch 8: Train loss 0.9138
GRU Epoch 9: Train loss 0.8970
GRU Epoch 10: Train loss 0.8567
GRU Epoch 11: Train loss 0.7876
GRU Epoch 12: Train loss 0.6921
GRU Epoch 13: Train loss 0.5870
GRU Epoch 14: Train loss 0.4855
GRU Epoch 15: Train loss 0.3708
GRU Epoch 16: Train loss 0.3034
GRU Epoch 17: Train loss 0.2592
GRU Epoch 18: Train loss 0.2131
GRU Epoch 19: Train loss 0.1887
GRU Epoch 20: Train loss 0.1441
GRU Epoch 21: Train loss 0.1493
GRU Epoch 22: Train loss 0.1389
GRU Epoch 23: Train loss 0.1099
GRU Epoch 24: Train loss 0.1019
GRU Epoch 25: Train loss 0.1018
GRU Epoch 26: Train loss 0.1170
GRU Epoch 27: Train loss 0.0990
GRU Epoch 28: Train loss 0.0695
GRU Epoch 29: Train loss 0.0631
GRU Epoch 30: Train loss 0.0704

Classification report:
        