# 1. Inštalácia knižníc a prostredia

In [None]:
# Trainer does not work correctly with the newer library version, so pinned releases are installed below.
# Completely remove the incompatible versions (sentence-transformers and peft are removed and not reinstalled here)
!pip uninstall -y transformers huggingface-hub tokenizers sentence-transformers peft

# Clear the Hugging Face cache
!rm -rf /root/.cache/huggingface

# Install the pinned, stable versions
!pip install transformers==4.40.1 huggingface-hub==0.30.2 tokenizers==0.19.1


# 2. Príprava dát a tokenizácia

In [None]:


from datasets import load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the Civil Comments training split and binarise the toxicity score:
# a comment is labelled 1 when its score is at least 0.5, otherwise 0.
dataset = load_dataset("civil_comments", split="train")
df = dataset.to_pandas()
df["toxicity_label"] = df["toxicity"].ge(0.5).astype(int)

# Stratified sample of 20,000 comments (the remainder is discarded)
df_sample, _ = train_test_split(
    df,
    train_size=20000,
    stratify=df["toxicity_label"],
    random_state=42,
)

# Stratified train / validation / test split: 20 % is held out and then
# halved into validation and test.
# (test_size=0.3 was tried, but the class imbalance was larger.)
train_df, temp_df = train_test_split(
    df_sample,
    test_size=0.2,
    stratify=df_sample["toxicity_label"],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["toxicity_label"],
    random_state=42,
)

# Print the size and the class balance (relative and absolute) of every split
split_frames = {"Train": train_df, "Val": val_df, "Test": test_df}
for split_name, split_df in split_frames.items():
    label_column = split_df["toxicity_label"]
    print(f"{split_name} množina: {len(split_df)} komentárov")
    print(label_column.value_counts(normalize=True).rename("proportion"))
    print(label_column.value_counts().rename("počet"), "\n")

# Persist each split as CSV without the pandas index
for csv_path, split_df in (
    ("train_civil.csv", train_df),
    ("val_civil.csv", val_df),
    ("test_civil.csv", test_df),
):
    split_df.to_csv(csv_path, index=False)


# 2.1 Import knižníc

In [None]:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from datasets import Dataset
# from sklearn.preprocessing import StandardScaler  # zvažoval som škálovanie dĺžky komentárov ako vstup
import random

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

def set_seed(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices).

    Args:
        seed: Integer seed passed unchanged to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


set_seed(42)

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

# First wrap each pandas split in a Hugging Face Dataset object
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

# Initialise the tokenizer that belongs to the pretrained checkpoint
checkpoint = "unitary/toxic-bert"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenization function
def tokenize_function(example):
    """Tokenize the ``text`` field of ``example`` with the module-level tokenizer.

    Inputs are truncated and padded to a fixed length of 128 tokens.
    Returns whatever the tokenizer call returns.
    """
    texts = example["text"]
    encoded = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    return encoded

# Tokenize every split in batches, then rename `toxicity_label` to `label`,
# which is the column name the Trainer expects.
train_ds, val_ds, test_ds = (
    raw_split.map(tokenize_function, batched=True).rename_column("toxicity_label", "label")
    for raw_split in (train_ds, val_ds, test_ds)
)

# Return PyTorch tensors for the columns the model consumes
model_columns = ['input_ids', 'attention_mask', 'label']
for prepared_split in (train_ds, val_ds, test_ds):
    prepared_split.set_format(type='torch', columns=model_columns)


# 3. Tréning a vyhodnotenie

In [None]:
!pip install evaluate

from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
import evaluate
import numpy as np

# Checkpoint to fine-tune
checkpoint = "unitary/toxic-bert"

# Tokenizer that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Reconfigure the checkpoint for binary, single-label classification
config = AutoConfig.from_pretrained(checkpoint)
config.num_labels = 2
config.problem_type = "single_label_classification"

# Load the model with the adjusted configuration; ignore_mismatched_sizes lets
# the output layer be replaced by one that matches the new number of labels.
toxicbert_model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, config=config, ignore_mismatched_sizes=True
)
toxicbert_model = toxicbert_model.to(device)

# Metrics
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from a ``(logits, labels)`` pair.

    The predicted class is the arg-max over the last axis of the logits.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predicted_labels = np.argmax(logits, axis=-1)
    shared_inputs = {"predictions": predicted_labels, "references": labels}

    accuracy_result = accuracy.compute(**shared_inputs)
    f1_result = f1.compute(average="macro", **shared_inputs)
    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

# Training arguments – adjusted to counter overfitting.
# Evaluation, checkpointing and logging all happen once per epoch so that the
# best checkpoint (by validation F1) can be restored at the end of training.
training_args = TrainingArguments(
    output_dir="./results",  # checkpoint directory; `results_dir` is not a TrainingArguments parameter and raised TypeError
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=5,  # reduced from 10 to 5
    learning_rate=1e-5,  # reduced from 2e-5 to 1e-5
    weight_decay=0.01,
    label_smoothing_factor=0.1,  # added
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none"
)


# Trainer with early stopping: training stops once the monitored metric has
# not improved for two consecutive evaluations.
trainer = Trainer(
    model=toxicbert_model,      # Trainer's keyword is `model`; `toxicbert_model=` raised TypeError
    args=training_args,
    train_dataset=train_ds,     # train_ds must already be tokenized and formatted
    eval_dataset=val_ds,        # the same holds for val_ds
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Training
train_result = trainer.train()

In [None]:

# ZLEPŠENÉ VYHODNOTENIE 

from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, roc_curve, roc_auc_score, matthews_corrcoef, auc
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import torch
import numpy as np
import os



# Directory that receives every report and figure produced below
results_dir = "./results"
os.makedirs(results_dir, exist_ok=True)

# Predictions on the test split
preds_output = trainer.predict(test_ds)
labels = preds_output.label_ids
# Hard predictions: arg-max over the class axis of the raw model outputs
final_preds = np.argmax(preds_output.predictions, axis=1)
# Soft predictions: softmax over the class axis, keeping column 1 (reported as "Toxic" below)
probs = torch.tensor(preds_output.predictions).softmax(dim=1)[:, 1].numpy()

# Classification report: once as a table (CSV) and once as plain text
report_options = dict(target_names=["Non-toxic", "Toxic"], digits=4)

report = classification_report(labels, final_preds, output_dict=True, **report_options)
report_table = pd.DataFrame(report).transpose()
report_table.to_csv(f"{results_dir}/classification_report_full.csv")

report_text = classification_report(labels, final_preds, **report_options)
with open(f"{results_dir}/classification_report.txt", "w") as f:
    f.write(report_text)

# Matthews Correlation Coefficient
mcc = matthews_corrcoef(labels, final_preds)
print("Matthews Correlation Coefficient (MCC):", round(mcc, 4))

# Confusion matrix of the hard test predictions, drawn as an annotated heatmap
cm = confusion_matrix(labels, final_preds)
class_names = ["Non-toxic", "Toxic"]

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title("Confusion Matrix – Test Set")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
fig.tight_layout()
fig.savefig(f"{results_dir}/confusion_matrix.png", dpi=300)
plt.show()



# ROC curve and ROC AUC computed from the class-1 probabilities
fpr, tpr, _ = roc_curve(labels, probs)
roc_auc = roc_auc_score(labels, probs)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f"ROC AUC = {roc_auc:.4f}", color="darkorange")
ax.plot([0, 1], [0, 1], "k--", label="Random")  # diagonal reference line
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
ax.grid(True)
fig.tight_layout()
fig.savefig(f"{results_dir}/roc_curve.png", dpi=300)
plt.show()

# Precision-recall curve and the area under it
precision, recall, _ = precision_recall_curve(labels, probs)
pr_auc = auc(recall, precision)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, label=f"PR-AUC = {pr_auc:.4f}", color="blue")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve")
ax.grid(True)
ax.legend()
fig.tight_layout()
fig.savefig(f"{results_dir}/precision_recall_curve.png", dpi=300)
plt.show()

# Improved plots: keep only the log records that carry both an epoch and an evaluation loss
epoch_logs = [
    record
    for record in trainer.state.log_history
    if "epoch" in record and "eval_loss" in record
]


def smooth(values, alpha=0.3):
    """Smooth out fluctuations in ``values`` with an exponential moving average.

    Each output is ``alpha * v + (1 - alpha) * previous_output``; the running
    value is initialised with the first element of ``values``.

    Args:
        values: Iterable of numbers (list, tuple, array, generator, ...).
        alpha: Weight given to the current value.

    Returns:
        A list with one smoothed value per input value; an empty list when
        ``values`` is empty (previously this raised ``IndexError``).
    """
    values = list(values)  # accept any iterable, not only indexable sequences
    if not values:
        return []

    smoothed = []
    last = values[0]
    for v in values:
        smoothed_val = alpha * v + (1 - alpha) * last
        smoothed.append(smoothed_val)
        last = smoothed_val
    return smoothed

# train_loss_epoch holds the loss logged during training
train_loss_epoch = [
    record["loss"]
    for record in trainer.state.log_history
    if "loss" in record and "epoch" in record
]
# val_loss_epoch tracks the loss on the validation data
val_loss_epoch = [record["eval_loss"] for record in epoch_logs]
# acc_epoch is the classification accuracy on the validation data
acc_epoch = [record["eval_accuracy"] for record in epoch_logs]
# epochs is the x-axis shared by the plots below
epochs = [record["epoch"] for record in epoch_logs]

# Smoothed training and validation loss per epoch
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(epochs, smooth(train_loss_epoch), label="Train Loss", color="blue", linewidth=2)
ax.plot(epochs, smooth(val_loss_epoch), label="Validation Loss", color="orange", linewidth=2)
ax.set_xlabel("Epochs", fontsize=14)
ax.set_ylabel("Loss", fontsize=14)
ax.set_title("Training and Validation Loss", fontsize=16)
ax.legend(fontsize=12)
ax.grid(True)
fig.tight_layout()
fig.savefig(f"{results_dir}/loss_clean_graph.png", dpi=300)
plt.show()

# Smoothed validation accuracy per epoch
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    epochs,
    smooth(acc_epoch),
    label="Validation Accuracy",
    marker='o',
    color="green",
    linewidth=2,
)
ax.set_xlabel("Epochs", fontsize=14)
ax.set_ylabel("Accuracy", fontsize=14)
ax.set_title("Model Accuracy Over Epochs", fontsize=16)
ax.grid(True)
ax.legend()
fig.tight_layout()
fig.savefig(f"{results_dir}/accuracy_graph.png", dpi=300)
plt.show()

# Error analysis: print the five misclassified test comments whose class-1
# probability lies closest to 0.5, i.e. the least confident mistakes.
try:
    test_texts = test_ds["text"]
    # (index, predicted label, true label, class-1 probability) for every wrong prediction
    errors = [(i, p, l, prob) for i, (p, l, prob) in enumerate(zip(final_preds, labels, probs)) if p != l]
    errors_sorted = sorted(errors, key=lambda x: abs(x[3] - 0.5))[:5]

    for idx, pred, label, prob in errors_sorted:
        print(f"Text: {test_texts[idx][:100]}...")
        print(f"Predicted: {pred}, Actual: {label}, Confidence: {prob:.4f}")
        print("-" * 80)
except Exception as exc:
    # Still best effort, but no longer a bare `except:` — KeyboardInterrupt and
    # SystemExit propagate, and the actual cause is reported instead of hidden.
    print("Chybová analýza: Texty nie sú dostupné.")
    print(f"({type(exc).__name__}: {exc})")

In [None]:

from sklearn.metrics import precision_score, recall_score, f1_score

# Micro, macro and weighted averages of F1, precision and recall
averages = ["micro", "macro", "weighted"]

f1_micro, f1_macro, f1_weighted = [
    f1_score(labels, final_preds, average=avg) for avg in averages
]
precision_micro, precision_macro, precision_weighted = [
    precision_score(labels, final_preds, average=avg) for avg in averages
]
recall_micro, recall_macro, recall_weighted = [
    recall_score(labels, final_preds, average=avg) for avg in averages
]

# One column per metric, one row per averaging mode
scores_by_metric = {
    "F1": [f1_micro, f1_macro, f1_weighted],
    "Precision": [precision_micro, precision_macro, precision_weighted],
    "Recall": [recall_micro, recall_macro, recall_weighted],
}

for metric_name, scores in scores_by_metric.items():
    for average_name, score in zip(averages, scores):
        print(f"{metric_name} {average_name}: {score:.4f}")

# Save to CSV for the bachelor's thesis
import pandas as pd
metrics_summary = pd.DataFrame(scores_by_metric, index=averages)

metrics_summary.to_csv(f"{results_dir}/micro_macro_metrics.csv", float_format="%.4f")
metrics_summary

In [None]:
import zipfile
import os

output_zip = "toxicbert_bin_results.zip"
results_dir = "results"

# Pack every figure, table and text report from the results directory into a
# single archive; entries are stored under their bare file names.
exported_suffixes = (".png", ".csv", ".txt")
with zipfile.ZipFile(output_zip, "w") as archive:
    for entry_name in os.listdir(results_dir):
        if not entry_name.endswith(exported_suffixes):
            continue
        archive.write(os.path.join(results_dir, entry_name), arcname=entry_name)

print(f"ZIP súbor vytvorený: {output_zip}")
