In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import re

# Dataset file definitions.
# Each training configuration maps a name to the CSV files loaded for it.
train_files = {}
train_files["train2"] = ["train-2.csv"]
train_files["TRAIN"] = ["train-1.csv", "train-2.csv", "train-3.csv"]

# Each test set maps its display name to a single CSV file.
test_files = dict([
    ("Test 1", "test-1.csv"),
    ("Test 2", "test-2.csv"),
    ("Test 3", "test-3.csv"),
])

# Names of the CSV columns holding the text and the class label.
text_column, target_column = "Sentence", "Label"

# Label mapping: integer class id -> sentiment name.
label_map = dict(zip((0, 1, 2), ('positive', 'neutral', 'negative')))

# Text preprocessing helper (no NLTK dependency).
# Matches any single character that is neither a word character nor whitespace.
_NON_WORD_CHARS = re.compile(r'[^\w\s]')


def preprocess_text(text):
    """Return *text* lower-cased with punctuation and symbols removed.

    Word characters (letters, digits and the underscore, as defined by the
    regex class ``\\w``) and whitespace are kept; every other character is
    deleted without inserting a replacement.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    return _NON_WORD_CHARS.sub('', text.lower())

def load_and_filter(files):
    """Load one or more CSV files and return their preprocessed texts and labels.

    Rows whose ``target_column`` value is not 0, 1 or 2 are dropped. The
    remaining ``text_column`` values are converted to ``str`` and passed
    through ``preprocess_text``. Results from all files are concatenated in
    the order the files are given.

    Args:
        files: A single CSV path, or an iterable of CSV paths.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists.
    """
    # Accept a bare path as shorthand for a one-element list.
    paths = [files] if isinstance(files, str) else files
    X_all, y_all = [], []
    for path in paths:
        frame = pd.read_csv(path)
        labelled = frame[frame[target_column].isin([0, 1, 2])]
        # Text preprocessing (no NLTK dependency).
        # NOTE(review): astype(str) turns a missing cell into the text "nan";
        # confirm the input files contain no empty sentences.
        X_all.extend(map(preprocess_text, labelled[text_column].astype(str)))
        y_all.extend(labelled[target_column])
    return X_all, y_all

# Estimators under evaluation.
# NOTE(review): only predict() is called on these in the evaluation loop below;
# probability=True is not needed for that and, per the scikit-learn docs,
# slows fit() down — confirm nothing else calls predict_proba.
svm_classifier = SVC(kernel='rbf', class_weight='balanced', probability=True, random_state=42)
# NOTE: the 'XGB' row label refers to scikit-learn's GradientBoostingClassifier,
# not to the separate XGBoost library.
gradient_boosting_classifier = GradientBoostingClassifier(random_state=42)

# Each entry: (experiment code, method, algorithm label, estimator instance).
models = [
    ('1.a.i', 'Machine learning', 'SVM', svm_classifier),
    ('1.b.i', 'Machine learning', 'XGB', gradient_boosting_classifier),
]

# Result rows, one per (model, training set) combination.
table = []

# Load every test set once; the raw texts are re-vectorized for each training set.
X_tests_raw, y_tests = {}, {}
for test_name, test_file in test_files.items():
    X_tests_raw[test_name], y_tests[test_name] = load_and_filter(test_file)

# Prepare the features once per training set. The TF-IDF features depend only
# on the training data, not on the classifier, so loading and vectorizing
# inside the model loop would repeat identical work for every model.
prepared = {}
for train_name, train_sources in train_files.items():
    # Load the training set.
    X_train_text, y_train = load_and_filter(train_sources)
    # Vectorize: the vectorizer is fitted on the training texts only.
    vectorizer = TfidfVectorizer(max_features=7000, ngram_range=(1, 4), sublinear_tf=True, norm='l2')
    X_train = vectorizer.fit_transform(X_train_text)
    # Transform every test set with this training set's vectorizer.
    X_tests = {name: vectorizer.transform(texts) for name, texts in X_tests_raw.items()}
    prepared[train_name] = (X_train, y_train, X_tests)

# Evaluate every model on every training set; rows are appended in
# model-major order (all training sets of the first model, then the next).
for code, method, algorithm, model in models:
    for train_name, (X_train, y_train, X_tests) in prepared.items():
        # Train the model; the same estimator object is fitted again for each training set.
        model.fit(X_train, y_train)
        # Compute the metrics for each test set. Iterating test_files keeps
        # this loop in sync with the configured test sets instead of relying
        # on a second hard-coded list of names.
        metrics = []
        for test_name in test_files:
            y_true = y_tests[test_name]
            y_pred = model.predict(X_tests[test_name])
            # Support-weighted averages; zero_division=0 scores a class with
            # no predictions as 0 instead of emitting a warning.
            # Note: support-weighted recall is mathematically equal to
            # accuracy, so those two values always match.
            precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
            recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
            accuracy = accuracy_score(y_true, y_pred)
            metric_str = f"Precision={precision:.3f}, Recall={recall:.3f}, F1={f1:.3f}, Accuracy={accuracy:.3f}"
            metrics.append(metric_str)
        # Append the row to the results table.
        row = [code, method, algorithm, train_name] + metrics
        table.append(row)

# Output the results table in markdown format.
def _format_markdown_table(headers, rows, min_widths):
    """Return the lines of a left-aligned, column-padded markdown table.

    Each column is as wide as its longest cell (header included), but never
    narrower than the matching entry of *min_widths*. Sizing from the content
    keeps the header, separator and data rows aligned even when a cell is
    longer than the minimum width.

    Args:
        headers: Column titles.
        rows: Sequences of cell values; only the first ``len(headers)`` cells
            of each row are rendered.
        min_widths: Minimum width of each column, one entry per header.

    Returns:
        A list of strings: header line, separator line, then one line per row.
    """
    widths = [
        max([floor, len(title)] + [len(str(row[col])) for row in rows])
        for col, (title, floor) in enumerate(zip(headers, min_widths))
    ]

    def render(cells):
        padded = (f"{str(cell):<{width}}" for cell, width in zip(cells, widths))
        return "| " + " | ".join(padded) + " |"

    # The separator spans each column's width plus the two padding spaces.
    separator = "|" + "|".join("-" * (width + 2) for width in widths) + "|"
    return [render(headers), separator] + [render(row) for row in rows]


table_lines = _format_markdown_table(
    ["#", "method", "algorithm", "train", "Test 1", "Test 2", "Test 3"],
    table,
    [6, 16, 19, 7, 48, 48, 48],
)
header, sep = table_lines[0], table_lines[1]

# Console output starts with a blank line; the file does not.
print("\n" + "\n".join(table_lines))

with open('results.md', 'w', encoding='utf-8') as f:
    f.write("\n".join(table_lines) + "\n")
    


| #      | method           | algorithm           | train   | Test 1                                           | Test 2                                           | Test 3                                           |
|--------|------------------|---------------------|---------|--------------------------------------------------|--------------------------------------------------|--------------------------------------------------|
| 1.a.i  | Machine learning | SVM                 | train2  | Precision=0.579, Recall=0.608, F1=0.590, Accuracy=0.608 | Precision=0.583, Recall=0.618, F1=0.574, Accuracy=0.618 | Precision=0.526, Recall=0.435, F1=0.366, Accuracy=0.435 |
| 1.a.i  | Machine learning | SVM                 | TRAIN   | Precision=0.653, Recall=0.640, F1=0.646, Accuracy=0.640 | Precision=0.617, Recall=0.628, F1=0.619, Accuracy=0.628 | Precision=0.766, Recall=0.739, F1=0.732, Accuracy=0.739 |
| 1.b.i  | Machine learning | XGB                 | train2  | Precision=0.585, Recall=0.649, F1=0