# 1. Inštalácia knižníc a prostredia

In [None]:
# Trainer does not work correctly with the newer library version, so the
# incompatible packages are removed completely first.
# Note: sentence-transformers and peft are uninstalled here and are not
# reinstalled by the install command below.
!pip uninstall -y transformers huggingface-hub tokenizers sentence-transformers peft

# Clear the Hugging Face cache directory.
!rm -rf /root/.cache/huggingface

# Install the pinned, stable versions.
!pip install transformers==4.40.1 huggingface-hub==0.30.2 tokenizers==0.19.1


In [None]:
#  Vyhodnocovacia funkcia – metriky pre binárnu klasifikáciu
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

def compute_metrics(p):
    """Compute binary-classification metrics for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (one row of per-class scores per
            example) and ``label_ids`` (the gold labels).

    Returns:
        dict with accuracy, precision, recall and F1 under ``eval_``-prefixed
        keys.
    """
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    # zero_division=0 keeps the score at 0.0 when a metric is undefined
    # (e.g. the model predicts only one class) but without emitting an
    # UndefinedMetricWarning on every such evaluation.
    return {
        "eval_accuracy": accuracy_score(labels, preds),
        "eval_precision": precision_score(labels, preds, zero_division=0),
        "eval_recall": recall_score(labels, preds, zero_division=0),
        # Key referenced by metric_for_best_model and by the F1 plot.
        "eval_f1": f1_score(labels, preds, zero_division=0),
    }


In [None]:
import os
# Set the WANDB_DISABLED switch before training starts — presumably so the
# Trainer does not try to report to Weights & Biases; confirm against the
# installed transformers version.
os.environ["WANDB_DISABLED"] = "true"

# 2. Načítanie a filtrovanie dát

In [None]:
# The full dataset (3M rows) is too large; on Kaggle it ran out of memory.
# Only 20k comments are used, as with ToxicBERT, so the results stay comparable.
# Only Slovak (sk) and English (en) rows are kept.
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the whole training split, then keep just the sk / en rows.
ds = load_dataset(
    "FredZhang7/toxi-text-3M",
    split="train",
    verification_mode="no_checks",
)
df = ds.to_pandas()
df = df.loc[df["lang"].isin(["sk", "en"])].reset_index(drop=True)

# Binary label column, copied from is_toxic.
df = df.assign(label=df["is_toxic"])

# Stratified draw of 20,000 comments (same size as for ToxicBERT).
df_sample, _ = train_test_split(
    df,
    train_size=20000,
    stratify=df["label"],
    random_state=42,
)

# Stratified 80 / 10 / 10 split into train, validation and test sets.
train_df, temp_df = train_test_split(
    df_sample,
    test_size=0.2,
    stratify=df_sample["label"],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["label"],
    random_state=42,
)

# Save every split as CSV (without the pandas index).
for csv_path, split_frame in {
    "train_multilingual.csv": train_df,
    "val_multilingual.csv": val_df,
    "test_multilingual.csv": test_df,
}.items():
    split_frame.to_csv(csv_path, index=False)

print("Stratifikovanie a rozdelenie hotové:")
print("Train:", train_df.shape, "| Val:", val_df.shape, "| Test:", test_df.shape)


In [None]:
import random


def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random generators.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Python's own generator was previously left unseeded, so anything
    # drawing from `random` was not reproducible between runs.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


set_seed(42)


# 3. Tokenizácia a konverzia na Dataset

In [None]:
checkpoint = "xlm-roberta-base"
# No trailing comma after the call: a trailing comma binds `tokenizer` to a
# 1-tuple, and every later `tokenizer(...)` call then fails with
# "TypeError: 'tuple' object is not callable".
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# The large variant ("xlm-roberta-large") was also tried, but it was
# noticeably slower.


def tokenize(batch):
    """Tokenize ``batch["text"]``, padded and truncated to 128 tokens."""
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(batch["text"], **encode_options)

# Imported here because the only other visible import of Dataset sits in a
# later cell, so running the notebook top to bottom would hit a NameError.
from datasets import Dataset


def _to_model_dataset(frame):
    """Turn one split DataFrame into a tokenized dataset with a ``labels`` column."""
    # preserve_index=False: the splits keep their shuffled pandas index, which
    # would otherwise be stored as an extra "__index_level_0__" column.
    dataset = Dataset.from_pandas(frame, preserve_index=False)
    dataset = dataset.map(tokenize, batched=True)
    dataset = dataset.remove_columns(["text", "lang"])
    return dataset.rename_column("label", "labels")


train_ds = _to_model_dataset(train_df)
valid_ds = _to_model_dataset(val_df)
test_ds = _to_model_dataset(test_df)


In [None]:
# Module-level histories that EpochMetricsTracker appends to.
training_losses = []  # training loss: first value logged in each integer epoch
validation_losses = []  # eval_loss of each evaluation
eval_accuracies = []  # eval_accuracy of each evaluation
epochs_logged = []  # int(state.epoch) at each evaluation

class EpochMetricsTracker(TrainerCallback):
    """Trainer callback that records per-epoch loss and accuracy histories.

    Values are appended to the module-level lists ``training_losses``,
    ``validation_losses``, ``eval_accuracies`` and ``epochs_logged``.
    """

    def __init__(self):
        super().__init__()
        # Integer epoch of the last recorded training loss; -1 = none yet.
        self.last_logged_epoch = -1

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record the first training loss logged within each integer epoch."""
        if not logs or 'loss' not in logs or 'epoch' not in logs:
            return
        epoch = int(logs['epoch'])
        if epoch != self.last_logged_epoch:
            training_losses.append(logs['loss'])
            self.last_logged_epoch = epoch

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Record validation loss / accuracy and the epoch of an evaluation."""
        if not metrics:
            return
        if 'eval_loss' in metrics:
            validation_losses.append(metrics['eval_loss'])
        if 'eval_accuracy' in metrics:
            eval_accuracies.append(metrics['eval_accuracy'])
        # state.epoch may be None (e.g. evaluation requested outside a
        # training run); count that as epoch 0 instead of failing on int(None).
        epoch = state.epoch if state.epoch is not None else 0
        epochs_logged.append(int(epoch))


# 4. Tréningové argumenty + trénovanie modelu

In [None]:
from transformers import AutoModelForSequenceClassification

# `model` is handed to the Trainer below but is not created in any visible
# cell, so it is built here from the same checkpoint as the tokenizer. Two
# output classes match the argmax / softmax-column-1 handling used by the
# metric and evaluation code.
# NOTE(review): if an earlier, unseen cell already defines `model` with a
# different configuration, drop this assignment.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,  # with 3e-5 the model overfit quickly
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=7,
    weight_decay=0.01,
    # Keep the checkpoint with the best validation F1, not the last one.
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    logging_dir="./logs",
    logging_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Track per-epoch metrics and stop after 2 evaluations without improvement.
    callbacks=[EpochMetricsTracker(), EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()


In [None]:
from datasets import Dataset
# Rebinds `checkpoint` and `tokenizer`; the checkpoint name is the same one
# used in the tokenization section earlier in the notebook.
checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize(example):
    """Encode ``example["text"]``, padded and truncated to 128 tokens."""
    texts = example["text"]
    return tokenizer(
        texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

def _prepare_split(frame):
    """Tokenize one split and expose it as torch tensors with a ``labels`` column."""
    dataset = Dataset.from_pandas(frame).map(tokenize, batched=True)
    # rename_column returns a NEW dataset. The previous loop assigned that
    # result only to its loop variable, so train_ds / val_ds / test_ds were
    # never renamed or formatted (and the global `ds` was overwritten).
    dataset = dataset.rename_column("label", "labels")
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return dataset


train_ds = _prepare_split(train_df)
val_ds = _prepare_split(val_df)
test_ds = _prepare_split(test_df)


# 5. Vyhodnotenie výsledkov a vizualizácia

In [None]:
# Model evaluation
from sklearn.metrics import classification_report, confusion_matrix, matthews_corrcoef, roc_auc_score, precision_recall_curve, auc, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

# Predict on the test set.
preds_output = trainer.predict(test_ds)
labels = preds_output.label_ids
# Hard predictions: index of the highest-scoring class in each row.
preds = np.argmax(preds_output.predictions, axis=1)
# Softmax across the class axis; column 1 is kept as the score of the class
# that the reports below label "Toxic".
probs = torch.nn.functional.softmax(torch.tensor(preds_output.predictions), dim=1)[:, 1].numpy()

# Output folder for the saved reports and figures.
os.makedirs("results", exist_ok=True)

# Classification report as plain text.
report = classification_report(labels, preds, target_names=["Non-toxic", "Toxic"], digits=4)
with open("results/classification_report.txt", "w") as f:
    f.write(report)

# Classification report as CSV.
report_dict = classification_report(labels, preds, target_names=["Non-toxic", "Toxic"], digits=4, output_dict=True)
pd.DataFrame(report_dict).transpose().to_csv("results/classification_report_full.csv")


# Matthews correlation coefficient (MCC), printed rounded to 4 decimals.
mcc = matthews_corrcoef(labels, preds)
print("Matthews Correlation Coefficient (MCC):", round(mcc, 4))

# Confusion matrix, drawn as an annotated heatmap and saved as PNG.
cm = confusion_matrix(labels, preds)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Non-toxic", "Toxic"], yticklabels=["Non-toxic", "Toxic"])
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.savefig("results/confusion_matrix.png")
plt.close()

# ROC curve with its AUC in the legend; the dashed diagonal is the
# reference line from (0, 0) to (1, 1).
fpr, tpr, _ = roc_curve(labels, probs)
roc_auc = roc_auc_score(labels, probs)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"ROC AUC = {roc_auc:.4f}", color="darkorange")
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("results/roc_curve.png")
plt.close()

# Precision-recall curve; the area under it is computed with auc().
precision, recall, _ = precision_recall_curve(labels, probs)
pr_auc = auc(recall, precision)
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f"PR AUC = {pr_auc:.4f}", color="blue")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.grid(True)
plt.legend()
plt.tight_layout()



plt.savefig("results/pr_curve.png")
plt.close()


In [None]:
# Training metrics, pulled out of the Trainer's log history.
log_history = trainer.state.log_history

# Only entries carrying an "epoch" are considered: those with "eval_loss"
# are evaluation records, those with "loss" are training records.
epoch_logs = []
train_loss_epoch = []
for record in log_history:
    if "epoch" not in record:
        continue
    if "eval_loss" in record:
        epoch_logs.append(record)
    if "loss" in record:
        train_loss_epoch.append(record["loss"])

# Per-evaluation series; F1 is collected only where it was reported.
val_loss_epoch = []
f1_epoch = []
epochs = []
for record in epoch_logs:
    val_loss_epoch.append(record["eval_loss"])
    epochs.append(record["epoch"])
    if "eval_f1" in record:
        f1_epoch.append(record["eval_f1"])

def smooth(values, alpha=0.3):
    """Exponentially smooth a sequence of numbers.

    The running value starts at ``values[0]`` and is updated for every item
    (including the first) as ``alpha * item + (1 - alpha) * running``.

    Args:
        values: Sequence of numbers; an empty sequence gives ``[]``.
        alpha: Weight of the newest item in each update.

    Returns:
        list with one smoothed value per input item.
    """
    def _running_average():
        state = values[0]
        for sample in values:
            state = alpha * sample + (1 - alpha) * state
            yield state

    return list(_running_average()) if values else []



# Loss curves (smoothed), drawn only when both loss series are non-empty.
# NOTE(review): both series are plotted against `epochs`, so this assumes one
# training-loss entry per evaluated epoch — TODO confirm.
if train_loss_epoch and val_loss_epoch:
    plt.figure(figsize=(10, 6))
    plt.plot(epochs, smooth(train_loss_epoch), label="Train Loss", linewidth=2)
    plt.plot(epochs, smooth(val_loss_epoch), label="Validation Loss", linewidth=2)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.title("Training and Validation Loss")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.savefig("results/loss_clean_graph.png")
    plt.show()

# F1 curve (smoothed), drawn only when F1 values were collected.
if f1_epoch:
    plt.figure(figsize=(10, 6))
    plt.plot(epochs, smooth(f1_epoch), marker='o', label="F1 Score", linewidth=2)
    plt.xlabel("Epochs")
    plt.ylabel("F1")
    plt.title("F1 Score Over Epochs")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    # NOTE(review): this is the F1 plot, yet the file name says "accuracy" —
    # confirm whether the name is intentional before renaming.
    plt.savefig("results/accuracy_graph.png")


    plt.show()


In [None]:
import zipfile

# Bundle every regular file from results/ into one compressed archive.
# Sub-directories are skipped: ZipFile.write() on a directory only adds an
# empty folder entry, and results/ is also the Trainer's output_dir, so it
# presumably contains checkpoint folders as well.
with zipfile.ZipFile(
    "final_results_multilingual.zip", "w", compression=zipfile.ZIP_DEFLATED
) as zipf:
    # sorted() gives a stable member order regardless of the file system.
    for entry in sorted(os.listdir("results")):
        entry_path = os.path.join("results", entry)
        if os.path.isfile(entry_path):
            zipf.write(entry_path, arcname=entry)
print("✅ Výstupy boli zozipované.")
