In [3]:
%pip install tf-keras
%pip install "numpy<2"
%pip install transformers[torch]
%pip install 'accelerate>=0.26.0

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
/bin/ba

In [6]:
# === Cell 2: Imports & config ===
import os
import logging
from datetime import datetime
import random
import pandas as pd
import numpy as np
import torch
import optuna
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, StratifiedKFold
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    confusion_matrix,
)
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)

# Environment & seed
os.environ["TOKENIZERS_PARALLELISM"] = "false"
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Logger setup
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
logger.info("Cell 2: Imports and global configuration loaded")


2025-04-29 11:46:03,124 - INFO - Cell 2: Imports and global configuration loaded


In [7]:
# === Cell 3: load_data() ===
def load_data(path="data/sentences_final.csv"):
    logger.info(f"Loading data from {path}")
    df = pd.read_csv(path)
    keep_columns = [
        "sentence",
        "model",
        "noun_gender",
        "adjective_gender",
        "temperature",
    ]
    df = df.loc[:, [c for c in keep_columns if c in df.columns]]
    df["label"] = df.apply(
        lambda r: "MM" if (r.noun_gender == "male" and r.adjective_gender == "male")
        else "FF" if (r.noun_gender == "female" and r.adjective_gender == "female")
        else "MF" if (r.noun_gender == "male" and r.adjective_gender == "female")
        else "FM",
        axis=1,
    )
    df["stereotype"] = df["label"].isin(["MM", "FF"]).astype(int)
    df["stereotype_type"] = df["stereotype"].map({1: "S", 0: "S_bar"})
    df["stratify_group"] = df["stereotype"].astype(str)
    logger.info(f"Loaded {len(df)} rows")
    return df

logger.info("Cell 3: load_data() defined")


2025-04-29 11:46:03,131 - INFO - Cell 3: load_data() defined


In [8]:
# === Cell 4: compute_metrics & compute_tpr_gap ===

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", pos_label=1, zero_division=0
    )
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}

def compute_detailed_metrics(pred, metadata):
    """
    Berekent gedetailleerde metrics inclusief TPR Gap per gender
    """
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    
    # Basis metrics
    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", pos_label=1, zero_division=0 
    )
    
    results = {
        "accuracy": acc, 
        "precision": prec, 
        "recall": rec, 
        "f1": f1
    }
    
    # Gender-specifieke metrics berekenen als we gender info hebben
    if 'noun_gender' in metadata.columns:
        # Maak subsets per gender en class
        male_indices = metadata['noun_gender'] == 'male'
        female_indices = metadata['noun_gender'] == 'female'
        
        # Stereotype klasse (positieve klasse = 1)
        stereotype_indices = metadata['stereotype'] == 1
        contra_indices = metadata['stereotype'] == 0
        
        # TPR voor mannelijke stereotypes (male + stereotype = 1)
        male_stereotype = male_indices & stereotype_indices
        if sum(male_stereotype) > 0:
            tpr_male_s = accuracy_score(
                labels[male_stereotype], 
                preds[male_stereotype]
            )
        else:
            tpr_male_s = 0
            
        # TPR voor vrouwelijke stereotypes (female + stereotype = 1)
        female_stereotype = female_indices & stereotype_indices
        if sum(female_stereotype) > 0:
            tpr_female_s = accuracy_score(
                labels[female_stereotype], 
                preds[female_stereotype]
            )
        else:
            tpr_female_s = 0
            
        # TPR Gap voor stereotype klasse (S)
        tpr_gap_s = tpr_male_s - tpr_female_s
        
        # TPR voor mannelijke contra-stereotypes (male + contra-stereotype)
        male_contra = male_indices & contra_indices
        if sum(male_contra) > 0:
            tpr_male_contra = accuracy_score(
                labels[male_contra], 
                preds[male_contra]
            )
        else:
            tpr_male_contra = 0
            
        # TPR voor vrouwelijke contra-stereotypes (female + contra-stereotype)
        female_contra = female_indices & contra_indices
        if sum(female_contra) > 0:
            tpr_female_contra = accuracy_score(
                labels[female_contra], 
                preds[female_contra]
            )
        else:
            tpr_female_contra = 0
            
        # TPR Gap voor contra-stereotype klasse (contra-S)
        tpr_gap_contra = tpr_male_contra - tpr_female_contra
        
        # Voeg gender-specifieke metrics toe
        results.update({
            "tpr_male_s": tpr_male_s,
            "tpr_female_s": tpr_female_s,
            "tpr_gap_s": tpr_gap_s,
            "tpr_male_contra": tpr_male_contra,
            "tpr_female_contra": tpr_female_contra,
            "tpr_gap_contra": tpr_gap_contra
        })
    
    return results

logger.info("Cell 4: metric functions defined")
# === Cell 5: TorchDataset ===
class TorchDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels.tolist()
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {k: v[idx] for k,v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

logger.info("Cell 5: TorchDataset defined")


2025-04-29 11:46:03,143 - INFO - Cell 4: metric functions defined
2025-04-29 11:46:03,145 - INFO - Cell 5: TorchDataset defined


In [9]:
def run_cv_hp_search(model_name, tokenizer_name, df, n_splits=5, n_trials=5):
    logger.info(f"Starting CV+HPO for {model_name}")
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    all_results = []
    all_details = []
    all_grouped = []

    def tokenize(df_):
        return tokenizer(
            df_["sentence"].tolist(),
            padding=True, truncation=True,
            max_length=128, return_tensors="pt"
        )

    for fold, (train_i, test_i) in enumerate(skf.split(df, df["stratify_group"]), 1):
        logger.info(f"Fold {fold}/{n_splits}")
        train_full = df.iloc[train_i]
        test_df = df.iloc[test_i].reset_index(drop=True)

        # inner train/val split
        try:
            tr_df, val_df = train_test_split(
                train_full, test_size=0.2, random_state=SEED,
                stratify=train_full["stratify_group"]
            )
        except ValueError:
            tr_df, val_df = train_test_split(
                train_full, test_size=0.2, random_state=SEED,
                stratify=train_full["stereotype"]
            )
        tr_df, val_df = tr_df.reset_index(drop=True), val_df.reset_index(drop=True)

        # datasets
        ds_tr = TorchDataset(tokenize(tr_df), tr_df["stereotype"])
        ds_val = TorchDataset(tokenize(val_df), val_df["stereotype"])
        ds_te = TorchDataset(tokenize(test_df), test_df["stereotype"])

        # trainer setup
        model_init = lambda: AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=2
        )
        out_b = f"./results/{model_name.replace('/', '_')}/fold_{fold}"
        log_b = f"./logs/{model_name.replace('/', '_')}/fold_{fold}"

        args = TrainingArguments(
            output_dir=out_b,
            eval_strategy="epoch",
            save_strategy="no",  # No automatic saving
            logging_strategy="epoch",
            logging_dir=log_b,
            report_to="none",
            fp16=True,
            num_train_epochs=3,
            load_best_model_at_end=False,  # No best model loading
            metric_for_best_model="accuracy",
            greater_is_better=True,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=32,
            seed=SEED,
        )

        trainer = Trainer(
            model_init=model_init,
            args=args,
            train_dataset=ds_tr,
            eval_dataset=ds_val,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
        )

        # Hyperparameter search
        best = trainer.hyperparameter_search(
            direction="maximize",
            backend="optuna",
            hp_space=lambda t: {
                "learning_rate": t.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
                "per_device_train_batch_size": t.suggest_categorical("per_device_train_batch_size", [8, 16, 32]),
                "weight_decay": t.suggest_float("weight_decay", 1e-6, 1e-2, log=True),
            },
            n_trials=n_trials,
            sampler=optuna.samplers.TPESampler(seed=SEED),
        )
        best_hyperparams = best.hyperparameters
        logger.info(f"Best HPO on fold {fold}: {best_hyperparams}")

        # Retrain met de beste hyperparameters
        trainer.args.per_device_train_batch_size = best_hyperparams["per_device_train_batch_size"]
        trainer.args.learning_rate = best_hyperparams["learning_rate"]
        trainer.args.weight_decay = best_hyperparams.get("weight_decay", 0.0)

        trainer.train()

        # Handmatig opslaan
        trainer.save_model(f"{out_b}/final_model")

        # Predicties en evaluatie
        test_output = trainer.predict(test_dataset=ds_te)
        metrics = trainer.evaluate(eval_dataset=ds_te)
        logger.info(f"Fold {fold} metrics: {metrics}")
        metrics.update({"fold": fold, "model": model_name, "hyperparams": best_hyperparams})
        all_results.append(metrics)

        # Gedetailleerde metrics
        detailed = compute_detailed_metrics(test_output, test_df)
        detailed.update({"fold": fold, "model": model_name})
        all_details.append({**metrics, **detailed})

        # Subset-analyse per model en temperatuur
        subset_records = []
        for model_type in test_df['model'].unique():
            for temp in test_df['temperature'].unique():
                subset = test_df[(test_df['model'] == model_type) & (test_df['temperature'] == temp)]
                if len(subset) < 10:
                    continue
                idx = subset.index.to_numpy() - min(test_df.index)
                preds = np.argmax(test_output.predictions[idx], axis=1)
                labels = test_output.label_ids[idx]
                acc = accuracy_score(labels, preds)
                prec, rec, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0)
                sub_det = compute_detailed_metrics(
                    type('O', (), {'predictions': test_output.predictions[idx], 'label_ids': test_output.label_ids[idx]}),
                    subset
                )
                record = {
                    'fold': fold,
                    'classifier_model': model_name,
                    'llm_model': model_type,
                    'temperature': temp,
                    'accuracy': acc,
                    'precision': prec,
                    'recall': rec,
                    'f1': f1,
                    'tpr_gap_s': sub_det.get('tpr_gap_s'),
                    'tpr_gap_contra': sub_det.get('tpr_gap_contra'),
                    'sample_size': len(subset)
                }
                subset_records.append(record)
        all_grouped.extend(subset_records)

    # Samenvatten en opslaan
    results_df = pd.DataFrame(all_results)
    detailed_df = pd.DataFrame(all_details)
    grouped_df = pd.DataFrame(all_grouped).groupby(['llm_model', 'temperature']).agg({
        'accuracy': ['mean', 'std'],
        'f1': ['mean', 'std'],
        'tpr_gap_s': ['mean', 'std'],
        'tpr_gap_contra': ['mean', 'std'],
        'sample_size': 'mean'
    }).reset_index()

    results_df.to_csv(f"results_base_{model_name.replace('/', '_')}.csv", index=False)
    detailed_df.to_csv(f"results_detailed_{model_name.replace('/', '_')}.csv", index=False)
    grouped_df.to_csv(f"results_by_model_temp_{model_name.replace('/', '_')}.csv", index=False)

    avg_metrics = {
        'avg_accuracy': results_df['eval_accuracy'].mean(),
        'std_accuracy': results_df['eval_accuracy'].std(),
        'avg_f1': results_df['eval_f1'].mean(),
        'std_f1': results_df['eval_f1'].std(),
        'model': model_name,
        'hyperparams': best_hyperparams
    }
    logger.info(f"Gemiddelde resultaten voor {model_name}: {avg_metrics}")

    return results_df, detailed_df, grouped_df, avg_metrics

logger.info("Cell 6: run_cv_hp_search gereviseerd en gedefinieerd (minimal memory usage)")




2025-04-29 11:46:05,186 - INFO - Cell 6: run_cv_hp_search gereviseerd en gedefinieerd (minimal memory usage)


In [None]:
# === Cell 8: Main ===
if __name__ == "__main__":
    df = load_data()
    models = [
        #("GroNLP/bert-base-dutch-cased", "GroNLP/bert-base-dutch-cased"),
        ("distilbert/distilbert-base-multilingual-cased", "distilbert/distilbert-base-multilingual-cased"),
        #("DTAI-KULeuven/robbert-2023-dutch-large", "DTAI-KULeuven/robbert-2023-dutch-large"),
    ]
    # Resultaten opslaan per model
    all_summary_metrics = []
    all_detailed_results_dfs = []
    all_grouped_results = []
    
    # Voor elk model
    for model_name, tokenizer_name in models:
        logger.info(f"Starting model {model_name}")
        _, detailed_results_df, grouped_results, summary_metrics = run_cv_hp_search(
            model_name, tokenizer_name, df, n_splits=5, n_trials=3
        )
        all_summary_metrics.append(summary_metrics)
        all_detailed_results_dfs.append(detailed_results_df)
        all_grouped_results.append(grouped_results)
    
    # Alle resultaten combineren en vergelijken
    summary_df = pd.DataFrame(all_summary_metrics)
    summary_df.to_csv("all_models_comparison.csv", index=False)
    
    # Combineer alle gedetailleerde resultaten
    combined_detailed = pd.concat(all_detailed_results_dfs)
    combined_detailed.to_csv("all_detailed_results.csv", index=False)

    logger.info("All models finished")


2025-04-29 11:46:09,388 - INFO - Loading data from data/sentences_final.csv
2025-04-29 11:46:09,744 - INFO - Loaded 14295 rows
2025-04-29 11:46:09,745 - INFO - Starting model DTAI-KULeuven/robbert-2023-dutch-large
2025-04-29 11:46:09,746 - INFO - Starting CV+HPO for DTAI-KULeuven/robbert-2023-dutch-large


tokenizer_config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/841k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/502k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.19M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

2025-04-29 11:46:15,724 - INFO - Fold 1/5


config.json:   0%|          | 0.00/867 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-04-29 11:46:24,485] A new study created in memory with name: no-name-ca537256-59b7-4a22-b61b-cd962012ecde
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will 

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5317,0.294956,0.899038,0.875,0.930884,0.902077
2,0.2916,0.297277,0.903846,0.882353,0.931759,0.906383
3,0.2328,0.294645,0.908654,0.878444,0.948381,0.912074


[I 2025-04-29 11:57:11,404] Trial 0 finished with value: 3.6475534256623066 and parameters: {'learning_rate': 1.827226177606625e-05, 'per_device_train_batch_size': 8, 'weight_decay': 4.2079886696066345e-06}. Best is trial 0 with value: 3.6475534256623066.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7022,0.693205,0.500437,0.0,0.0,0.0
2,0.6964,0.693368,0.499563,0.499563,1.0,0.666278


[I 2025-04-29 12:02:12,943] Trial 1 finished with value: 2.665403927171633 and parameters: {'learning_rate': 1.2853916978930139e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.0006796578090758161}. Best is trial 0 with value: 3.6475534256623066.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4238,0.28306,0.896416,0.875,0.924759,0.899192
2,0.2561,0.288049,0.906906,0.881773,0.939633,0.909784
3,0.2061,0.325744,0.904283,0.885,0.929134,0.90653


[I 2025-04-29 12:12:59,365] Trial 2 finished with value: 3.6249471646796145 and parameters: {'learning_rate': 1.0336843570697396e-05, 'per_device_train_batch_size': 8, 'weight_decay': 5.337032762603957e-06}. Best is trial 0 with value: 3.6475534256623066.
2025-04-29 12:12:59,366 - INFO - Best HPO on fold 1: {'learning_rate': 1.827226177606625e-05, 'per_device_train_batch_size': 8, 'weight_decay': 4.2079886696066345e-06}
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5317,0.294956,0.899038,0.875,0.930884,0.902077
2,0.2916,0.297277,0.903846,0.882353,0.931759,0.906383
3,0.2328,0.294645,0.908654,0.878444,0.948381,0.912074






2025-04-29 12:24:16,977 - INFO - Fold 1 metrics: {'eval_loss': 0.2525213062763214, 'eval_accuracy': 0.919202518363064, 'eval_precision': 0.8951187335092349, 'eval_recall': 0.9496151154653604, 'eval_f1': 0.9215619694397283, 'eval_runtime': 13.1847, 'eval_samples_per_second': 216.842, 'eval_steps_per_second': 3.413, 'epoch': 3.0}
2025-04-29 12:24:17,105 - INFO - Fold 2/5
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-04-29 12:24:19,769] A new study created in memory with name: no-name-2618dad1-f01c-4877-a3cc-fd29c14e34c0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-d

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7042,0.693448,0.499563,0.499563,1.0,0.666278
2,0.6978,0.694686,0.499563,0.499563,1.0,0.666278


[I 2025-04-29 12:31:30,982] Trial 0 finished with value: 2.665403927171633 and parameters: {'learning_rate': 1.827226177606625e-05, 'per_device_train_batch_size': 8, 'weight_decay': 4.2079886696066345e-06}. Best is trial 0 with value: 2.665403927171633.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6415,0.348088,0.870192,0.8525,0.895013,0.873239
2,0.3006,0.262508,0.894668,0.870279,0.927384,0.897925
3,0.226,0.268492,0.905157,0.869219,0.953631,0.90947


[I 2025-04-29 12:39:02,712] Trial 1 finished with value: 3.637476810652414 and parameters: {'learning_rate': 1.2853916978930139e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.0006796578090758161}. Best is trial 1 with value: 3.637476810652414.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4316,0.273219,0.905594,0.868149,0.956255,0.910075
2,0.2574,0.269004,0.903409,0.878489,0.936133,0.906396


[I 2025-04-29 12:46:14,145] Trial 2 finished with value: 3.6244269961381854 and parameters: {'learning_rate': 1.0336843570697396e-05, 'per_device_train_batch_size': 8, 'weight_decay': 5.337032762603957e-06}. Best is trial 1 with value: 3.637476810652414.
2025-04-29 12:46:14,147 - INFO - Best HPO on fold 2: {'learning_rate': 1.2853916978930139e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.0006796578090758161}
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6415,0.348088,0.870192,0.8525,0.895013,0.873239
2,0.3006,0.262508,0.894668,0.870279,0.927384,0.897925
3,0.226,0.268492,0.905157,0.869219,0.953631,0.90947






2025-04-29 12:54:13,109 - INFO - Fold 2 metrics: {'eval_loss': 0.234727680683136, 'eval_accuracy': 0.9160545645330536, 'eval_precision': 0.8862158647594278, 'eval_recall': 0.9544817927170869, 'eval_f1': 0.9190829399865138, 'eval_runtime': 12.009, 'eval_samples_per_second': 238.072, 'eval_steps_per_second': 3.747, 'epoch': 3.0}
2025-04-29 12:54:13,235 - INFO - Fold 3/5
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-04-29 12:54:16,010] A new study created in memory with name: no-name-203386e8-fb47-420a-9d29-a4015911f4ad
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-du

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7026,0.693718,0.499563,0.499563,1.0,0.666278
2,0.6973,0.693348,0.499563,0.499563,1.0,0.666278


[I 2025-04-29 13:01:27,388] Trial 0 finished with value: 2.665403927171633 and parameters: {'learning_rate': 1.827226177606625e-05, 'per_device_train_batch_size': 8, 'weight_decay': 4.2079886696066345e-06}. Best is trial 0 with value: 2.665403927171633.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.699,0.693877,0.499563,0.499563,1.0,0.666278
2,0.6963,0.694649,0.499563,0.499563,1.0,0.666278


[I 2025-04-29 13:06:29,083] Trial 1 finished with value: 2.665403927171633 and parameters: {'learning_rate': 1.2853916978930139e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.0006796578090758161}. Best is trial 0 with value: 2.665403927171633.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5168,0.293107,0.905594,0.884647,0.932633,0.908007
2,0.271,0.3106,0.908654,0.892437,0.929134,0.910416
3,0.2166,0.322743,0.910839,0.894207,0.931759,0.912596


[I 2025-04-29 13:17:15,523] Trial 2 finished with value: 3.649400641169554 and parameters: {'learning_rate': 1.0336843570697396e-05, 'per_device_train_batch_size': 8, 'weight_decay': 5.337032762603957e-06}. Best is trial 2 with value: 3.649400641169554.
2025-04-29 13:17:15,524 - INFO - Best HPO on fold 3: {'learning_rate': 1.0336843570697396e-05, 'per_device_train_batch_size': 8, 'weight_decay': 5.337032762603957e-06}
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5168,0.293107,0.905594,0.884647,0.932633,0.908007
2,0.271,0.3106,0.908654,0.892437,0.929134,0.910416
3,0.2166,0.322743,0.910839,0.894207,0.931759,0.912596






2025-04-29 13:28:34,356 - INFO - Fold 3 metrics: {'eval_loss': 0.32218459248542786, 'eval_accuracy': 0.906610703043022, 'eval_precision': 0.8757281553398059, 'eval_recall': 0.9474789915966386, 'eval_f1': 0.9101917255297679, 'eval_runtime': 13.1438, 'eval_samples_per_second': 217.517, 'eval_steps_per_second': 3.424, 'epoch': 3.0}
2025-04-29 13:28:34,482 - INFO - Fold 4/5
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-04-29 13:28:37,016] A new study created in memory with name: no-name-551eb41a-9c6d-47d5-a45c-2f60fc738b6e
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7031,0.694589,0.499563,0.499563,1.0,0.666278
2,0.6995,0.693252,0.499563,0.499563,1.0,0.666278


[I 2025-04-29 13:35:48,733] Trial 0 finished with value: 2.665403927171633 and parameters: {'learning_rate': 1.827226177606625e-05, 'per_device_train_batch_size': 8, 'weight_decay': 4.2079886696066345e-06}. Best is trial 0 with value: 2.665403927171633.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4977,0.25161,0.904283,0.883085,0.931759,0.906769
2,0.2477,0.243133,0.910839,0.876504,0.956255,0.914644
3,0.194,0.24276,0.912587,0.887428,0.944882,0.915254


[I 2025-04-29 13:43:21,252] Trial 1 finished with value: 3.660151641529221 and parameters: {'learning_rate': 1.2853916978930139e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.0006796578090758161}. Best is trial 1 with value: 3.660151641529221.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4351,0.244851,0.908654,0.895763,0.924759,0.91003
2,0.2464,0.256299,0.913462,0.881356,0.955381,0.916877
3,0.206,0.289387,0.914336,0.890354,0.944882,0.916808


[I 2025-04-29 13:54:08,685] Trial 2 finished with value: 3.666380196497797 and parameters: {'learning_rate': 1.0336843570697396e-05, 'per_device_train_batch_size': 8, 'weight_decay': 5.337032762603957e-06}. Best is trial 2 with value: 3.666380196497797.
2025-04-29 13:54:08,687 - INFO - Best HPO on fold 4: {'learning_rate': 1.0336843570697396e-05, 'per_device_train_batch_size': 8, 'weight_decay': 5.337032762603957e-06}
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4351,0.244851,0.908654,0.895763,0.924759,0.91003
2,0.2464,0.256299,0.913462,0.881356,0.955381,0.916877
3,0.206,0.289387,0.914336,0.890354,0.944882,0.916808






2025-04-29 14:05:25,122 - INFO - Fold 4 metrics: {'eval_loss': 0.3055974841117859, 'eval_accuracy': 0.9118572927597062, 'eval_precision': 0.8894039735099337, 'eval_recall': 0.9404761904761905, 'eval_f1': 0.9142273655547992, 'eval_runtime': 13.1521, 'eval_samples_per_second': 217.38, 'eval_steps_per_second': 3.422, 'epoch': 3.0}
2025-04-29 14:05:25,264 - INFO - Fold 5/5
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-04-29 14:05:27,875] A new study created in memory with name: no-name-5004b9d8-59b8-4d10-98b0-daa9b2eb161b
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-d

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7019,0.693463,0.499563,0.499563,1.0,0.666278
2,0.6976,0.693334,0.499563,0.499563,1.0,0.666278


[I 2025-04-29 14:12:41,373] Trial 0 finished with value: 2.665403927171633 and parameters: {'learning_rate': 1.827226177606625e-05, 'per_device_train_batch_size': 8, 'weight_decay': 4.2079886696066345e-06}. Best is trial 0 with value: 2.665403927171633.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4936,0.272941,0.905157,0.865719,0.95888,0.909921
2,0.2473,0.22541,0.920017,0.915225,0.925634,0.9204
3,0.1912,0.233855,0.919143,0.907313,0.933508,0.920224


[I 2025-04-29 14:20:14,605] Trial 1 finished with value: 3.6801888278583643 and parameters: {'learning_rate': 1.2853916978930139e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.0006796578090758161}. Best is trial 1 with value: 3.6801888278583643.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4254,0.284589,0.909965,0.8745,0.95713,0.913952
2,0.2515,0.233662,0.923951,0.905439,0.946632,0.925577
3,0.2262,0.30949,0.91958,0.906012,0.936133,0.920826


[I 2025-04-29 14:31:02,974] Trial 2 finished with value: 3.682551419108226 and parameters: {'learning_rate': 1.0336843570697396e-05, 'per_device_train_batch_size': 8, 'weight_decay': 5.337032762603957e-06}. Best is trial 2 with value: 3.682551419108226.
2025-04-29 14:31:02,976 - INFO - Best HPO on fold 5: {'learning_rate': 1.0336843570697396e-05, 'per_device_train_batch_size': 8, 'weight_decay': 5.337032762603957e-06}
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4254,0.284589,0.909965,0.8745,0.95713,0.913952
2,0.2515,0.233662,0.923951,0.905439,0.946632,0.925577
3,0.2262,0.30949,0.91958,0.906012,0.936133,0.920826






2025-04-29 14:42:16,283 - INFO - Fold 5 metrics: {'eval_loss': 0.37825778126716614, 'eval_accuracy': 0.9024134312696747, 'eval_precision': 0.8767213114754099, 'eval_recall': 0.9362745098039216, 'eval_f1': 0.9055198103623434, 'eval_runtime': 11.3128, 'eval_samples_per_second': 252.722, 'eval_steps_per_second': 3.978, 'epoch': 3.0}
2025-04-29 14:42:16,434 - INFO - Gemiddelde resultaten voor DTAI-KULeuven/robbert-2023-dutch-large: {'avg_accuracy': 0.9112277019937041, 'std_accuracy': 0.0068246012766040465, 'avg_f1': 0.9141167621746306, 'std_f1': 0.006508067292090506, 'model': 'DTAI-KULeuven/robbert-2023-dutch-large', 'hyperparams': {'learning_rate': 1.0336843570697396e-05, 'per_device_train_batch_size': 8, 'weight_decay': 5.337032762603957e-06}}
2025-04-29 14:42:16,527 - INFO - All models finished
