# 1. Inštalácia knižníc a prostredia

In [None]:
# The Trainer does not work correctly with newer library versions.
# Completely remove the incompatible versions first.
!pip uninstall -y transformers huggingface-hub tokenizers sentence-transformers peft

# Clear the Hugging Face cache
!rm -rf /root/.cache/huggingface

# Install the stable, pinned versions
!pip install transformers==4.40.1 huggingface-hub==0.30.2 tokenizers==0.19.1


In [None]:
# Check which transformers version is installed
import transformers
print(transformers.__version__)


In [None]:
# Install the remaining libraries (quiet mode).
# NOTE(review): transformers is listed here without a version pin — confirm
# this does not replace the 4.40.1 release pinned in the first install cell.
!pip install -q transformers datasets scikit-learn matplotlib seaborn


# 2. Import knižníc

In [None]:
#Import knižníc a inicializácia premenných
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
import os

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, TrainerCallback, EarlyStoppingCallback
)
from datasets import load_dataset
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, classification_report,
    confusion_matrix, precision_recall_curve, auc, f1_score
)

# Fix the torch and NumPy random seeds so runs are repeatable.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Module-level histories that the training callback appends to and the
# plotting cells read from. Each one is its own, initially empty, list.
training_losses, validation_losses, eval_accuracies, epochs_logged = [], [], [], []


# 3. Definovanie metrík a callbackov

In [None]:
# Metric computation
def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 for an evaluation result.

    Args:
        p: object exposing ``predictions`` (per-class scores; the argmax over
            axis 1 is taken as the predicted class) and ``label_ids``.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision/recall/F1 use ``average='binary'`` and ``zero_division=1``.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary', zero_division=1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": prf[0],
        "recall": prf[1],
        "f1": prf[2],
    }


In [None]:
# Callback that collects training and validation statistics
class EpochMetricsTracker(TrainerCallback):
    """Trainer callback that fills the module-level history lists.

    ``on_log`` appends to ``training_losses``; ``on_evaluate`` appends to
    ``validation_losses``, ``eval_accuracies`` and ``epochs_logged``.
    """

    def __init__(self):
        # -1 means that no epoch has been recorded yet.
        self.last_logged_epoch = -1

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Store the first logged training loss seen for each integer epoch."""
        if not logs or 'loss' not in logs or 'epoch' not in logs:
            return
        current_epoch = int(logs['epoch'])
        if current_epoch == self.last_logged_epoch:
            # A loss was already recorded for this epoch.
            return
        training_losses.append(logs['loss'])
        self.last_logged_epoch = current_epoch

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Store the evaluation loss/accuracy (when present) and the epoch number."""
        if not metrics:
            return
        tracked = (
            ('eval_loss', validation_losses),
            ('eval_accuracy', eval_accuracies),
        )
        for key, history in tracked:
            if key in metrics:
                history.append(metrics[key])
        epochs_logged.append(int(state.epoch))


# 4. Načítanie a tokenizácia dát

In [None]:
# Load and tokenize the dataset
print("Nacitavam dataset")
dataset = load_dataset("TUKE-KEMT/hate_speech_slovak")

model_name = "gerulata/slovakbert"
slovak_tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize the "text" column, padding/truncating every example to 128 tokens."""
    texts = batch["text"]
    return slovak_tokenizer(texts, truncation=True, padding="max_length", max_length=128)


# Apply the tokenizer to both splits in batches.
train_dataset = dataset['train'].map(tokenize, batched=True)
test_dataset = dataset['test'].map(tokenize, batched=True)


# 5. Architektúra modelu SlovakBERT s váhovaným lossom

In [None]:
# Model wrapper with a class-weighted loss
class SlovakBERTWeighted(nn.Module):
    """Sequence classifier whose loss is a class-weighted BCE-with-logits.

    Args:
        model_name: checkpoint name passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        num_labels: number of output classes.
        pos_weight: optional weight of the positive class. For the binary case
            a single value ``w`` is expanded to the per-class vector ``[1, w]``
            so that only the positive (index 1) column is up-weighted. A tensor
            that already has one entry per class is used unchanged.
    """

    def __init__(self, model_name, num_labels=2, pos_weight=None):
        super().__init__()
        self.num_labels = num_labels
        self.bert = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        self.loss_fn = nn.BCEWithLogitsLoss(
            pos_weight=self._per_class_pos_weight(pos_weight, num_labels)
        )

    @staticmethod
    def _per_class_pos_weight(pos_weight, num_labels):
        """Return a ``pos_weight`` tensor with one entry per class (or None).

        A one-element ``pos_weight`` would otherwise broadcast over both one-hot
        columns, scaling the positive term of every sample by the same factor
        and therefore not rebalancing the classes at all.
        """
        if pos_weight is None:
            return None
        weight = torch.as_tensor(pos_weight, dtype=torch.float).flatten()
        if weight.numel() == 1 and num_labels == 2:
            # Column 0 (negative class) keeps weight 1, column 1 gets w.
            weight = torch.cat([torch.ones_like(weight), weight])
        return weight

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the wrapped classifier and, when labels are given, compute the loss.

        Returns:
            dict with ``"loss"`` (None when ``labels`` is None) and ``"logits"``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = None
        if labels is not None:
            # One-hot targets sized from the logits instead of a hard-coded 2.
            labels_one_hot = nn.functional.one_hot(labels, num_classes=logits.size(-1)).float()
            loss = self.loss_fn(logits, labels_one_hot)
        return {"loss": loss, "logits": logits}


In [None]:
# Class-weight computation
# Read the label column once instead of iterating over every tokenized row twice.
train_labels = list(train_dataset["label"])
tox_count = train_labels.count(1)
non_tox_count = train_labels.count(0)
if tox_count == 0:
    raise ValueError("No toxic (label == 1) examples in the training split; cannot compute pos_weight.")

# Ratio of negative to positive examples, placed on the GPU when one is available.
pos_weight = torch.tensor([non_tox_count / tox_count]).to("cuda" if torch.cuda.is_available() else "cpu")

# Model initialisation
model = SlovakBERTWeighted(model_name=model_name, pos_weight=pos_weight)


# 6. Tréningové argumenty

In [None]:
# Training-parameter setup
results_dir = "./results_slovakbert"
os.makedirs(results_dir, exist_ok=True)

training_args = TrainingArguments(
    # TrainingArguments has no `results_dir` parameter; the checkpoint/output
    # folder is passed as `output_dir`.
    output_dir=results_dir,
    num_train_epochs=15,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,  # learning_rate=2e-5 was tried but was not stable
    weight_decay=0.05,
    lr_scheduler_type="cosine",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    # Enable mixed precision only when a CUDA device is present, matching the
    # CPU fallback used when pos_weight is created.
    fp16=torch.cuda.is_available()
)


# 7. Tréning modelu

In [None]:
# Start training
# NOTE(review): the test split is used as the evaluation set, so early stopping
# and best-model selection see the same data the final metrics are reported on
# — confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    # Trainer has no `slovak_tokenizer` parameter; the keyword is `tokenizer`.
    tokenizer=slovak_tokenizer,
    callbacks=[EpochMetricsTracker(), EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Spúšťam tréning")
trainer.train()


# 8. Optimalizácia prahu

In [None]:
# Search for the decision threshold that gives the best F1.
# NOTE(review): the threshold is selected on the same test split that the final
# metrics are computed on — confirm this is intended.
predictions = trainer.predict(test_dataset)
labels = predictions.label_ids
logits = torch.tensor(predictions.predictions)
# Softmax score of class index 1, as a NumPy array.
probs = torch.softmax(logits, dim=1)[:, 1].numpy()

# Candidate thresholds from 0.30 up to (about) 0.70 in steps of 0.01.
thresholds = np.arange(0.3, 0.71, 0.01)
f1_scores = [f1_score(labels, (probs >= t).astype(int)) for t in thresholds]

best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
print(f"Najlepší threshold: {best_threshold:.2f} (F1 = {f1_scores[best_idx]:.4f})")
final_predictions = (probs >= best_threshold).astype(int)


# 9. Vizualizácia výsledkov (grafy)

In [None]:
# Classification report for the current `final_predictions`, as a dict.
class_names = ["Netoxický", "Toxický"]
report = classification_report(
    labels, final_predictions, target_names=class_names, digits=4, output_dict=True
)

# Plain-text version
report_text = classification_report(labels, final_predictions, target_names=class_names, digits=4)
with open(f"{results_dir}/classification_report.txt", "w") as f:
    f.write(report_text)

# CSV version, for the table in the bachelor's thesis
report_frame = pd.DataFrame(report).T
report_frame.to_csv(f"{results_dir}/classification_report_full.csv")

# Print the report dict in the notebook output
import pprint
pprint.pprint(report)

# Confusion matrix: saved to disk and closed without being shown here
# (a later cell draws and shows it).
cm = confusion_matrix(labels, final_predictions)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title("Confusion matrix")
ax.set_xlabel("Predikované")
ax.set_ylabel("Skutočné")
fig.tight_layout()
fig.savefig(f"{results_dir}/confusion_matrix.png")
plt.close()

In [None]:
# Plot the training curves
# Use only as many points as every history list has an entry for.
min_len = min(len(training_losses), len(validation_losses), len(eval_accuracies))
epochs = list(range(1, min_len + 1))
train_loss_epoch = training_losses[:min_len]
val_loss_epoch = validation_losses[:min_len]
acc_epoch = eval_accuracies[:min_len]

# Loss plot
plt.figure(figsize=(10, 6))
plt.plot(epochs, train_loss_epoch, label="Training Loss", color="blue", linewidth=2)
plt.plot(epochs, val_loss_epoch, label="Validation Loss", color="orange", linewidth=2)
plt.xlabel("Epoche", fontsize=14)
plt.ylabel("Loss", fontsize=14)
plt.title("Training and Validation Loss", fontsize=16)
plt.legend(fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.savefig(f"{results_dir}/loss_clean_graph.png", dpi=300)
plt.show()

# Accuracy plot
plt.figure(figsize=(10, 6))
# The original call was cut off mid-argument; a label is required because
# plt.legend() is called below.
plt.plot(epochs, acc_epoch, label="Validation Accuracy", color="green", linewidth=2)
plt.xlabel("Epoche", fontsize=14)
plt.ylabel("Presnosť", fontsize=14)
plt.title("Presnosť modelu počas tréningu", fontsize=16)
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.savefig(f"{results_dir}/accuracy_graph.png", dpi=300)
plt.show()

# Precision-recall curve
precision, recall, _ = precision_recall_curve(labels, probs)
pr_auc = auc(recall, precision)
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f"PR-AUC = {pr_auc:.2f}", color="blue")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall krivka")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.savefig(f"{results_dir}/precision_recall_curve.png", dpi=300)
plt.show()


In [None]:
##Export natrénovaného modelu a zazipovanie celej zložky výsledkov
##trainer.save_model(f"{results_dir}/fine_tuned_model")
#!zip -r /kaggle/working/results_slovakbert.zip {results_dir} > /dev/null


In [None]:
# Run prediction on the test split
preds_output = trainer.predict(test_dataset)

# Unpack the predictions.
# Note: `final_predictions` is reassigned here to the plain argmax class,
# replacing any value assigned by an earlier cell.
test_logits = torch.tensor(preds_output.predictions)
probs = torch.nn.functional.softmax(test_logits, dim=1)[:, 1].numpy()
final_predictions = np.argmax(preds_output.predictions, axis=1)
labels = preds_output.label_ids


In [None]:
# Metric computation
from sklearn.metrics import matthews_corrcoef, roc_auc_score, precision_recall_curve, auc, roc_curve

# Predictions: true labels, argmax class and softmax score of class index 1
labels = preds_output.label_ids
final_predictions = np.argmax(preds_output.predictions, axis=1)
score_tensor = torch.nn.functional.softmax(torch.tensor(preds_output.predictions), dim=1)
probs = score_tensor[:, 1].numpy()

# MCC (computed on the hard class predictions)
mcc = matthews_corrcoef(labels, final_predictions)
print("Matthews Correlation Coefficient (MCC):", round(mcc, 4))

# ROC AUC (computed on the class-1 scores)
roc_auc = roc_auc_score(labels, probs)
print("ROC AUC:", round(roc_auc, 4))

# PR AUC: area under the precision-recall curve
pr_curve = precision_recall_curve(labels, probs)
precision, recall, _ = pr_curve
pr_auc = auc(recall, precision)
print("PR AUC:", round(pr_auc, 4))


In [None]:
# ROC curve
import matplotlib.pyplot as plt

fpr, tpr, _ = roc_curve(labels, probs)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f"ROC AUC = {roc_auc:.4f}", color="darkorange")
# Dashed diagonal from (0, 0) to (1, 1) as a reference line.
ax.plot([0, 1], [0, 1], "k--")
ax.set_xlabel("False Positive Rate", fontsize=12)
ax.set_ylabel("True Positive Rate", fontsize=12)
ax.set_title("ROC Curve", fontsize=14)
ax.legend(fontsize=12)
ax.grid(True)
fig.tight_layout()
fig.savefig(f"{results_dir}/roc_curve.png", dpi=300)
plt.show()


In [None]:
# Precision-recall curve (uses `precision`, `recall` and `pr_auc` from an earlier cell)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, label=f"PR AUC = {pr_auc:.4f}", color="blue")
ax.set_xlabel("Recall", fontsize=12)
ax.set_ylabel("Precision", fontsize=12)
ax.set_title("Precision-Recall Curve", fontsize=14)
ax.grid(True)
ax.legend(fontsize=12)
fig.tight_layout()
# Written to the same file name as the precision-recall plot saved earlier.
fig.savefig(f"{results_dir}/precision_recall_curve.png", dpi=300)
plt.show()


In [None]:
# Confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

tick_labels = ["Non-toxic", "Toxic"]

cm = confusion_matrix(labels, final_predictions)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
ax.set_title("Confusion Matrix", fontsize=14)
ax.set_xlabel("Predicted", fontsize=12)
ax.set_ylabel("Actual", fontsize=12)
fig.tight_layout()
# Written to the same file name as the confusion matrix saved earlier.
fig.savefig(f"{results_dir}/confusion_matrix.png", dpi=300)
plt.show()


In [None]:
# Save the classification report
from sklearn.metrics import classification_report

# Written to the same file name as the text report saved earlier.
report_path = f"{results_dir}/classification_report.txt"
report_txt = classification_report(
    labels,
    final_predictions,
    target_names=["Non-toxic", "Toxic"],
    digits=4,
)
with open(report_path, "w") as f:
    f.write(report_txt)
