In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import os

# Make sure the output folder for the confusion-matrix images exists
os.makedirs('confusion_matrices', exist_ok=True)

# Input files: one training set plus the test sets; every test set gets a
# human-readable name ("Test 1", "Test 2", ...) in the same order
train_file = 'train-2.csv'
test_files = ['test-1.csv', 'test-2.csv', 'test-3.csv']
test_names = [f'Test {number}' for number, _ in enumerate(test_files, start=1)]

# CSV columns holding the raw text and the class label
text_column, target_column = 'Sentence', 'Label'

# Data-loading helper
def load_data(file):
    """Read a CSV file and return its text and label columns.

    Args:
        file: Path (or any object accepted by ``pd.read_csv``) of the CSV.

    Returns:
        A ``(X_text, y)`` tuple: the ``text_column`` values cast to ``str``
        and the ``target_column`` values as read by pandas.
    """
    frame = pd.read_csv(file)
    return frame[text_column].astype(str), frame[target_column]

# Models to evaluate; each entry is (row code, method, algorithm name, estimator)
models = [
    (
        '1.a',
        'Machine learning',
        'Logistic regression',
        LogisticRegression(max_iter=1000, solver='liblinear', class_weight='balanced'),
    ),
    (
        '1.b',
        'Machine learning',
        'Decision tree classifier',
        DecisionTreeClassifier(class_weight='balanced', random_state=42),
    ),
]

# Build the results table: for every model one "Train2" row followed by one
# "Test" row, each holding one metric summary per test set.
table = []


def _evaluate_and_plot(model, X_eval, y_true, algorithm, train_label, test_name):
    """Score an already fitted model and save its confusion matrix as a PNG.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_eval: Feature matrix to predict on.
        y_true: True labels matching ``X_eval``.
        algorithm: Algorithm name used in the plot title and file name.
        train_label: Name of the data the model was fitted on ("Train2" or
            "Test"); used in the plot title and file name.
        test_name: Display name of the evaluated test set (e.g. "Test 1").

    Returns:
        The metric summary string stored in the results table.
    """
    y_pred = model.predict(X_eval)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    accuracy = accuracy_score(y_true, y_pred)

    # Confusion matrix image, one file per (algorithm, training data, test set)
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap=plt.cm.Blues)
    plt.title(f'Confusion Matrix: {algorithm}\nTrain: {train_label} Test: {test_name}')
    plt.savefig(f'confusion_matrices/cm_{algorithm.replace(" ", "_")}_{train_label}_{test_name.replace(" ", "")}.png')
    plt.close()

    return f"Precision={precision:.3f}, Recall={recall:.3f}, F1={f1:.3f}, Accuracy={accuracy:.3f}"


# Everything below this comment and above the model loop is independent of the
# model, so it is done once instead of once per model / per test file.
X_train_text, y_train = load_data(train_file)
train_vectorizer = TfidfVectorizer(max_features=5000)
X_train = train_vectorizer.fit_transform(X_train_text)

eval_sets = []
for test_name, test_file in zip(test_names, test_files):
    X_test_text, y_test = load_data(test_file)
    # Features in the vocabulary learned from the training set (Train2 rows)
    X_test_train_vocab = train_vectorizer.transform(X_test_text)
    # Features from a vectorizer fitted on this test set alone (Test rows);
    # a separate vectorizer keeps the training vocabulary untouched
    X_test_own_vocab = TfidfVectorizer(max_features=5000).fit_transform(X_test_text)
    eval_sets.append((test_name, y_test, X_test_train_vocab, X_test_own_vocab))

for code, method, algorithm, model in models:
    # --- Train2 row: fit once on the training set, evaluate on every test set ---
    model.fit(X_train, y_train)
    row_train = [f"{code}.i", method, algorithm, "Train2"]
    for test_name, y_test, X_test_train_vocab, _ in eval_sets:
        row_train.append(
            _evaluate_and_plot(model, X_test_train_vocab, y_test, algorithm, "Train2", test_name)
        )
    table.append(row_train)

    # --- Test row: fit on each test set and evaluate on that same set ---
    # NOTE: the model is scored on the data it was fitted on, so these numbers
    # are optimistic and not a measure of generalisation.
    row_test = [f"{code}.ii", method, algorithm, "Test"]
    for test_name, y_test, _, X_test_own_vocab in eval_sets:
        model.fit(X_test_own_vocab, y_test)
        row_test.append(
            _evaluate_and_plot(model, X_test_own_vocab, y_test, algorithm, "Test", test_name)
        )
    table.append(row_test)

# Render the results table in the desired markdown format
header = "| #      | method           | algorithm                | skup   | Test 1                                                  | Test 2                                                  | Test 3                                                  |"
sep =    "|--------|------------------|--------------------------|--------|---------------------------------------------------------|---------------------------------------------------------|---------------------------------------------------------|"

# One format for every row, so the console output and the file cannot drift apart
row_template = "| {:<6} | {:<16} | {:<24} | {:<6} | {:<55} | {:<55} | {:<55} |"
table_lines = [header, sep]
table_lines.extend(row_template.format(*row) for row in table)
table_text = "\n".join(table_lines)

print(table_text)

# (Optional) also save the table to a .md file
with open('results_group2.md', 'w', encoding='utf-8') as f:
    f.write(table_text + "\n")

print("\nTablica je ispisana i spremljena u 'results_group2.md'.")
print("Sve confusion matrice su spremljene u folder 'confusion_matrices/'.")


| #      | method           | algorithm                | skup   | Test 1                                                  | Test 2                                                  | Test 3                                                  |
|--------|------------------|--------------------------|--------|---------------------------------------------------------|---------------------------------------------------------|---------------------------------------------------------|
| 1.a.i  | Machine learning | Logistic regression      | Train2 | Precision=0.583, Recall=0.587, F1=0.584, Accuracy=0.587 | Precision=0.572, Recall=0.606, F1=0.573, Accuracy=0.606 | Precision=0.528, Recall=0.465, F1=0.427, Accuracy=0.465 |
| 1.a.ii | Machine learning | Logistic regression      | Test   | Precision=0.945, Recall=0.940, F1=0.938, Accuracy=0.940 | Precision=0.965, Recall=0.964, F1=0.963, Accuracy=0.964 | Precision=0.982, Recall=0.982, F1=0.982, Accuracy=0.982 |
| 1.b.i  | Machine learning | Decision t