## Module loading, data files upload

In [40]:
import gzip
import shutil
import pandas as pd
from gensim.models import KeyedVectors
import numpy as np
import re
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
train = pd.read_csv('train/train.tsv', sep='\t', header=None, names=['label', 'text'])
test = pd.read_csv('dev-0/in.tsv', sep='\t', header=None, names=['text'])
test_results = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None, names=['label'])

## Polish Word2Vec

In [42]:
model_path = './../word2vec_polish.bin'
w2v_model = KeyedVectors.load(model_path)
vector_size = w2v_model.vector_size

## Helper functions

In [43]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-ząćęłńóśźż ]', '', text)
    return text


def text_to_vec(text, model, vector_size):
    words = clean_text(text).split()
    vectors = [model[w] for w in words if w in model]
    if len(vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

## Datasets and DataLoaders creation

In [45]:
class BallDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [44]:
X_train = np.array([text_to_vec(t, w2v_model, vector_size) for t in train['text']])
y_train = train['label'].values

X_test = np.array([text_to_vec(t, w2v_model, vector_size) for t in test['text']])
y_test = test_results['label'].values

In [46]:
train_dataset = BallDataset(X_train, y_train)
test_dataset = BallDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

## Super Simple NN

In [47]:
class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        return x.squeeze()


## Helper functions to train and evaluate model 

In [48]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleNN(vector_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCELoss()

In [49]:
def train_model(model, train_loader, optimizer, criterion, device, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs} | Loss: {total_loss/len(train_loader):.4f}")

In [50]:
def evaluate_model(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            X_batch = X_batch.to(device)
            outputs = model(X_batch)
            preds = (outputs > 0.5).float()
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())
    return np.array(all_preds), np.array(all_labels)

In [51]:
train_model(model, train_loader, optimizer, criterion, device, epochs=10)

Epoch 1/10 | Loss: 0.1921
Epoch 2/10 | Loss: 0.1446
Epoch 3/10 | Loss: 0.1340
Epoch 4/10 | Loss: 0.1273
Epoch 5/10 | Loss: 0.1230
Epoch 6/10 | Loss: 0.1196
Epoch 7/10 | Loss: 0.1163
Epoch 8/10 | Loss: 0.1123
Epoch 9/10 | Loss: 0.1094
Epoch 10/10 | Loss: 0.1060


In [52]:
preds, labels = evaluate_model(model, test_loader, device)
print("Test accuracy:", accuracy_score(labels, preds))

Test accuracy: 0.9499266324284666
