In [None]:
%pip install tf-keras
%pip install "numpy<2"
%pip install transformers[torch]
%pip install 'accelerate>=0.26.0

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
# === Cell 2: Imports & config ===
import os
import logging
from datetime import datetime
import random
import pandas as pd
import numpy as np
import torch
import optuna
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
)
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)

# Environment & seed
SEED = 42
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})
# Seed Python's, NumPy's and PyTorch's random number generators with the
# same value, in that order.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Logger setup
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
logger.info("Cell 2: Imports and global configuration loaded")


2025-04-29 10:16:44.260948: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-29 10:16:44.264331: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-29 10:16:44.275156: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745921804.295778   24006 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745921804.302258   24006 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745921804.318059   24006 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

In [3]:
# === Cell 3: load_data() ===
def load_data(path="data/sentences_final.csv"):
    """
    Load the sentence CSV and derive the label columns used downstream.

    Only the columns listed in ``keep_columns`` are retained (those that are
    present in the file). The following columns are then added:

    * ``label``: "MM", "FF" or "MF" for the matching (noun_gender,
      adjective_gender) pair, and "FM" for every other row.
    * ``stereotype``: 1 when ``label`` is "MM" or "FF", else 0.
    * ``stereotype_type``: "S" for stereotype 1, "S_bar" for 0.
    * ``stratify_group``: ``stereotype`` as a string.

    Args:
        path: Location of the CSV file to read.

    Returns:
        pandas.DataFrame with the retained and derived columns.

    Raises:
        KeyError: if ``noun_gender`` or ``adjective_gender`` is missing from
            the file, since the labels cannot be derived without them.
    """
    logger.info(f"Loading data from {path}")
    raw = pd.read_csv(path)
    keep_columns = [
        "sentence",
        "model",
        "noun_gender",
        "adjective_gender",
        "temperature",
    ]
    # Explicit copy so the derived columns are added to an independent frame.
    df = raw.loc[:, [c for c in keep_columns if c in raw.columns]].copy()

    missing = [c for c in ("noun_gender", "adjective_gender") if c not in df.columns]
    if missing:
        raise KeyError(f"Cannot derive labels, missing column(s): {missing}")

    noun_male = df["noun_gender"] == "male"
    noun_female = df["noun_gender"] == "female"
    adj_male = df["adjective_gender"] == "male"
    adj_female = df["adjective_gender"] == "female"

    # Rows outside the male/female x male/female grid fall through to "FM"
    # (unchanged fallback); report them so they do not go unnoticed.
    n_unexpected = int((~((noun_male | noun_female) & (adj_male | adj_female))).sum())
    if n_unexpected:
        logger.warning(
            f"{n_unexpected} rows have unexpected gender values and are labelled 'FM'"
        )

    # Vectorised labelling; conditions are evaluated in order and the first
    # match wins.
    df["label"] = np.select(
        [noun_male & adj_male, noun_female & adj_female, noun_male & adj_female],
        ["MM", "FF", "MF"],
        default="FM",
    )
    df["stereotype"] = df["label"].isin(["MM", "FF"]).astype(int)
    df["stereotype_type"] = df["stereotype"].map({1: "S", 0: "S_bar"})
    df["stratify_group"] = df["stereotype"].astype(str)
    logger.info(f"Loaded {len(df)} rows")
    return df

logger.info("Cell 3: load_data() defined")


2025-04-29 10:16:46,261 - INFO - Cell 3: load_data() defined


In [4]:
# === Cell 4: compute_metrics & compute_tpr_gap ===

def compute_metrics(pred):
    """
    Turn a prediction object into accuracy / precision / recall / F1.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over axis 1 is the predicted class).

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 are binary scores for class 1 and are 0
        where they would otherwise be undefined.
    """
    y_true = pred.label_ids
    y_hat = np.argmax(pred.predictions, axis=1)
    accuracy = accuracy_score(y_true, y_hat)
    binary_scores = precision_recall_fscore_support(
        y_true,
        y_hat,
        average="binary",
        pos_label=1,
        zero_division=0,
    )
    precision, recall, f_score = binary_scores[0], binary_scores[1], binary_scores[2]
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f_score,
    }

def compute_detailed_metrics(pred, metadata):
    """
    Compute overall metrics plus per-gender rates and male-minus-female gaps.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions`` (per-class
            scores; the arg-max over axis 1 is the predicted class).
        metadata: DataFrame with one row per prediction, in the same order.
            When it has a ``noun_gender`` column, its ``stereotype`` column
            is also read to form the subgroups.

    Returns:
        dict with "accuracy", "precision", "recall" and "f1". When
        ``noun_gender`` is available it additionally contains
        "tpr_male_s", "tpr_female_s", "tpr_gap_s", "tpr_male_contra",
        "tpr_female_contra" and "tpr_gap_contra". Each ``tpr_*`` rate is the
        fraction of correct predictions inside that subgroup; a subgroup
        with no rows yields NaN (and therefore a NaN gap) rather than 0, so
        an absent group is not mistaken for a group that was always wrong.

    Raises:
        ValueError: if ``metadata`` and the labels differ in length.
    """
    labels = np.asarray(pred.label_ids)
    preds = np.argmax(pred.predictions, axis=1)

    # Overall metrics
    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", pos_label=1, zero_division=0
    )

    results = {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1
    }

    # Gender-specific metrics are only possible when gender info is present.
    if 'noun_gender' not in metadata.columns:
        return results

    if len(metadata) != len(labels):
        raise ValueError(
            f"metadata has {len(metadata)} rows but there are {len(labels)} labels"
        )

    # Positional boolean masks: the metadata index is deliberately ignored so
    # that row i of metadata always refers to prediction i.
    is_male = (metadata['noun_gender'] == 'male').to_numpy()
    is_female = (metadata['noun_gender'] == 'female').to_numpy()
    is_stereotype = (metadata['stereotype'] == 1).to_numpy()
    is_contra = (metadata['stereotype'] == 0).to_numpy()

    def _group_rate(mask):
        # Fraction of correct predictions among the rows selected by mask.
        # NOTE(review): this equals the class-wise true-positive rate only if
        # label_ids match metadata['stereotype'] - confirm against the caller.
        if not mask.any():
            return float("nan")
        return accuracy_score(labels[mask], preds[mask])

    # Stereotype class (S): male vs. female
    tpr_male_s = _group_rate(is_male & is_stereotype)
    tpr_female_s = _group_rate(is_female & is_stereotype)
    tpr_gap_s = tpr_male_s - tpr_female_s

    # Contra-stereotype class (contra-S): male vs. female
    tpr_male_contra = _group_rate(is_male & is_contra)
    tpr_female_contra = _group_rate(is_female & is_contra)
    tpr_gap_contra = tpr_male_contra - tpr_female_contra

    results.update({
        "tpr_male_s": tpr_male_s,
        "tpr_female_s": tpr_female_s,
        "tpr_gap_s": tpr_gap_s,
        "tpr_male_contra": tpr_male_contra,
        "tpr_female_contra": tpr_female_contra,
        "tpr_gap_contra": tpr_gap_contra
    })

    return results

logger.info("Cell 4: metric functions defined")
# === Cell 5: TorchDataset ===
class TorchDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable container;
    ``labels`` is any object with a ``tolist()`` method. Item ``i`` is a dict
    holding the i-th entry of every encoding field plus a ``"labels"`` tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Stored as a plain Python list.
        self.labels = labels.tolist()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

logger.info("Cell 5: TorchDataset defined")


2025-04-29 10:16:46,274 - INFO - Cell 4: metric functions defined
2025-04-29 10:16:46,276 - INFO - Cell 5: TorchDataset defined


In [None]:
def run_cv_hp_search(model_name, tokenizer_name, df, n_splits=5, n_trials=5):
    """
    Stratified k-fold cross-validation with a per-fold Optuna search.

    For every outer fold the training part is split 80/20 into train and
    validation data, ``n_trials`` hyperparameter trials are run, the model is
    retrained with the best trial's hyperparameters, saved to
    ``./results/<model>/fold_<k>/final_model`` and scored on the held-out
    fold. Three CSV files (base, detailed, and per LLM model/temperature
    results) are written to the working directory.

    Args:
        model_name: Checkpoint name passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        tokenizer_name: Checkpoint name passed to
            ``AutoTokenizer.from_pretrained``.
        df: DataFrame with at least the columns ``sentence``, ``stereotype``,
            ``stratify_group``, ``model`` and ``temperature``.
        n_splits: Number of outer cross-validation folds.
        n_trials: Number of Optuna trials per fold.

    Returns:
        Tuple ``(results_df, detailed_df, grouped_df, avg_metrics)``.
        ``avg_metrics["hyperparams"]`` holds the best hyperparameters of the
        LAST fold only (kept for backward compatibility); the selection of
        every fold is available under ``avg_metrics["hyperparams_per_fold"]``.
    """
    # Local import so the block does not depend on the notebook's import cell.
    from types import SimpleNamespace

    logger.info(f"Starting CV+HPO for {model_name}")
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    safe_name = model_name.replace('/', '_')
    # Mixed precision is only requested when a CUDA device is present, so the
    # function also runs on CPU-only machines.
    use_fp16 = torch.cuda.is_available()

    all_results = []
    all_details = []
    all_grouped = []
    fold_hyperparams = []

    def tokenize(df_):
        return tokenizer(
            df_["sentence"].tolist(),
            padding=True, truncation=True,
            max_length=128, return_tensors="pt"
        )

    def model_init():
        # Fresh model for every trial and for the final retrain.
        return AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=2
        )

    def hp_space(trial):
        return {
            "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
            "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32]),
            "weight_decay": trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True),
        }

    for fold, (train_i, test_i) in enumerate(skf.split(df, df["stratify_group"]), 1):
        logger.info(f"Fold {fold}/{n_splits}")
        train_full = df.iloc[train_i]
        # Reset so that index labels equal row positions; the subset analysis
        # below relies on this to index into the prediction arrays.
        test_df = df.iloc[test_i].reset_index(drop=True)

        # Inner train/validation split
        try:
            tr_df, val_df = train_test_split(
                train_full, test_size=0.2, random_state=SEED,
                stratify=train_full["stratify_group"]
            )
        except ValueError:
            # Fallback: stratify on the raw stereotype label instead.
            tr_df, val_df = train_test_split(
                train_full, test_size=0.2, random_state=SEED,
                stratify=train_full["stereotype"]
            )
        tr_df, val_df = tr_df.reset_index(drop=True), val_df.reset_index(drop=True)

        # Datasets
        ds_tr = TorchDataset(tokenize(tr_df), tr_df["stereotype"])
        ds_val = TorchDataset(tokenize(val_df), val_df["stereotype"])
        ds_te = TorchDataset(tokenize(test_df), test_df["stereotype"])

        # Trainer setup
        out_b = f"./results/{safe_name}/fold_{fold}"
        log_b = f"./logs/{safe_name}/fold_{fold}"

        # NOTE(review): with save_strategy="no" and load_best_model_at_end=False
        # the model evaluated below is the one from the last epoch that ran,
        # not the best validation epoch - confirm this is intended.
        args = TrainingArguments(
            output_dir=out_b,
            eval_strategy="epoch",
            save_strategy="no",
            logging_strategy="epoch",
            logging_dir=log_b,
            report_to="none",
            fp16=use_fp16,
            num_train_epochs=3,
            load_best_model_at_end=False,
            metric_for_best_model="accuracy",
            greater_is_better=True,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=32,
            seed=SEED,
        )

        trainer = Trainer(
            model_init=model_init,
            args=args,
            train_dataset=ds_tr,
            eval_dataset=ds_val,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
        )

        # Hyperparameter search
        # NOTE(review): no compute_objective is passed, so the Trainer's
        # default objective is maximised; the recorded trial values (~3.6)
        # suggest it is a sum of the eval metrics rather than accuracy alone
        # - confirm.
        best = trainer.hyperparameter_search(
            direction="maximize",
            backend="optuna",
            hp_space=hp_space,
            n_trials=n_trials,
            sampler=optuna.samplers.TPESampler(seed=SEED),
        )
        best_hyperparams = best.hyperparameters
        fold_hyperparams.append(best_hyperparams)
        logger.info(f"Best HPO on fold {fold}: {best_hyperparams}")

        # Retrain with the best hyperparameters
        trainer.args.per_device_train_batch_size = best_hyperparams["per_device_train_batch_size"]
        trainer.args.learning_rate = best_hyperparams["learning_rate"]
        trainer.args.weight_decay = best_hyperparams.get("weight_decay", 0.0)

        trainer.train()

        # Save manually (automatic checkpointing is disabled above)
        trainer.save_model(f"{out_b}/final_model")

        # Predictions and evaluation on the held-out fold
        test_output = trainer.predict(test_dataset=ds_te)
        metrics = trainer.evaluate(eval_dataset=ds_te)
        logger.info(f"Fold {fold} metrics: {metrics}")
        metrics.update({"fold": fold, "model": model_name, "hyperparams": best_hyperparams})
        all_results.append(metrics)

        # Detailed metrics
        detailed = compute_detailed_metrics(test_output, test_df)
        detailed.update({"fold": fold, "model": model_name})
        all_details.append({**metrics, **detailed})

        # Subset analysis per generating LLM and temperature. Rows with a
        # missing model/temperature are left out, as are groups with fewer
        # than 10 rows.
        for (model_type, temp), subset in test_df.groupby(['model', 'temperature'], sort=False):
            if len(subset) < 10:
                continue
            idx = subset.index.to_numpy()
            sub_det = compute_detailed_metrics(
                SimpleNamespace(
                    predictions=test_output.predictions[idx],
                    label_ids=test_output.label_ids[idx],
                ),
                subset,
            )
            all_grouped.append({
                'fold': fold,
                'classifier_model': model_name,
                'llm_model': model_type,
                'temperature': temp,
                'accuracy': sub_det['accuracy'],
                'precision': sub_det['precision'],
                'recall': sub_det['recall'],
                'f1': sub_det['f1'],
                'tpr_gap_s': sub_det.get('tpr_gap_s'),
                'tpr_gap_contra': sub_det.get('tpr_gap_contra'),
                'sample_size': len(subset)
            })

    # Summarise and save
    results_df = pd.DataFrame(all_results)
    detailed_df = pd.DataFrame(all_details)
    if all_grouped:
        grouped_df = pd.DataFrame(all_grouped).groupby(['llm_model', 'temperature']).agg({
            'accuracy': ['mean', 'std'],
            'f1': ['mean', 'std'],
            'tpr_gap_s': ['mean', 'std'],
            'tpr_gap_contra': ['mean', 'std'],
            'sample_size': 'mean'
        }).reset_index()
    else:
        # No (model, temperature) subset reached 10 rows in any fold.
        logger.warning(f"No subset results collected for {model_name}")
        grouped_df = pd.DataFrame(columns=['llm_model', 'temperature'])

    results_df.to_csv(f"results_base_{safe_name}.csv", index=False)
    detailed_df.to_csv(f"results_detailed_{safe_name}.csv", index=False)
    grouped_df.to_csv(f"results_by_model_temp_{safe_name}.csv", index=False)

    avg_metrics = {
        'avg_accuracy': results_df['eval_accuracy'].mean(),
        'std_accuracy': results_df['eval_accuracy'].std(),
        'avg_f1': results_df['eval_f1'].mean(),
        'std_f1': results_df['eval_f1'].std(),
        'model': model_name,
        # Last fold's selection only; see 'hyperparams_per_fold' for all folds.
        'hyperparams': best_hyperparams,
        'hyperparams_per_fold': fold_hyperparams
    }
    logger.info(f"Gemiddelde resultaten voor {model_name}: {avg_metrics}")

    return results_df, detailed_df, grouped_df, avg_metrics

logger.info("Cell 6: run_cv_hp_search")




2025-04-29 10:17:44,966 - INFO - Cell 6: run_cv_hp_search gereviseerd en gedefinieerd (minimal memory usage)


In [None]:
# === Cell 7: Main ===
# Entry point: run the cross-validated hyperparameter search for every
# configured model and write the combined result files.
if __name__ == "__main__":
    df = load_data()
    # Each entry is a (model checkpoint, tokenizer checkpoint) pair.
    models = [
        ("GroNLP/bert-base-dutch-cased", "GroNLP/bert-base-dutch-cased"),
        #("bert-base-multilingual-cased", "bert-base-multilingual-cased"),
        #("DTAI-KULeuven/robbert-2023-dutch-large", "DTAI-KULeuven/robbert-2023-dutch-large"),
    ]
    # Collect the results per model
    all_summary_metrics = []
    all_detailed_results_dfs = []
    # NOTE(review): all_grouped_results is filled below but not written to
    # disk in this cell; run_cv_hp_search writes its own per-model CSV.
    all_grouped_results = []
    
    # For each model
    for model_name, tokenizer_name in models:
        logger.info(f"Starting model {model_name}")
        _, detailed_results_df, grouped_results, summary_metrics = run_cv_hp_search(
            model_name, tokenizer_name, df, n_splits=5, n_trials=3
        )
        all_summary_metrics.append(summary_metrics)
        all_detailed_results_dfs.append(detailed_results_df)
        all_grouped_results.append(grouped_results)
    
    # Combine the summaries of all models for comparison
    summary_df = pd.DataFrame(all_summary_metrics)
    summary_df.to_csv("all_models_comparison.csv", index=False)
    
    # Combine all detailed results
    combined_detailed = pd.concat(all_detailed_results_dfs)
    combined_detailed.to_csv("all_detailed_results.csv", index=False)

    logger.info("All models finished")


2025-04-29 10:17:49,011 - INFO - Loading data from data/sentences_final.csv


14295
14295


2025-04-29 10:17:49,339 - INFO - Loaded 14295 rows
2025-04-29 10:17:49,339 - INFO - Starting model GroNLP/bert-base-dutch-cased
2025-04-29 10:17:49,340 - INFO - Starting CV+HPO for GroNLP/bert-base-dutch-cased
2025-04-29 10:17:49,505 - INFO - Fold 1/5


14295
14295


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-04-29 10:17:52,307] A new study created in memory with name: no-name-fb0f31d9-4165-4605-8211-c2879af1f433
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4058,0.245543,0.903846,0.87429,0.943132,0.907407
2,0.2157,0.289819,0.904283,0.87931,0.937008,0.907243


[I 2025-04-29 10:28:10,883] Trial 2 finished with value: 3.6302842630428467 and parameters: {'learning_rate': 1.0336843570697396e-05, 'per_device_train_batch_size': 8, 'weight_decay': 5.337032762603957e-06}. Best is trial 0 with value: 3.652381713770728.
2025-04-29 10:28:10,885 - INFO - Best HPO on fold 1: {'learning_rate': 1.827226177606625e-05, 'per_device_train_batch_size': 8, 'weight_decay': 4.2079886696066345e-06}
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4058,0.245543,0.903846,0.87429,0.943132,0.907407
2,0.2157,0.289819,0.904283,0.87931,0.937008,0.907243


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

[I 2025-04-29 10:41:15,372] Trial 2 finished with value: 3.6274512628435613 and parameters: {'learning_rate': 1.0336843570697396e-05, 'per_device_train_batch_size': 8, 'weight_decay': 5.337032762603957e-06}. Best is trial 1 with value: 3.6293247919326226.
2025-04-29 10:41:15,373 - INFO - Best HPO on fold 2: {'learning_rate': 1.2853916978930139e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.0006796578090758161}
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5044,0.304341,0.885927,0.851675,0.934383,0.891114
2,0.2529,0.258221,0.90472,0.881286,0.935258,0.90747
3,0.1903,0.266516,0.905157,0.883914,0.932633,0.90762






2025-04-29 10:43:52,683 - INFO - Fold 2 metrics: {'eval_loss': 0.2523389458656311, 'eval_accuracy': 0.9073102483385799, 'eval_precision': 0.8884435537742151, 'eval_recall': 0.9313725490196079, 'eval_f1': 0.9094017094017094, 'eval_runtime': 3.6904, 'eval_samples_per_second': 774.708, 'eval_steps_per_second': 12.194, 'epoch': 3.0}
2025-04-29 10:43:52,808 - INFO - Fold 3/5
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-04-29 10:43:55,103] A new study created in memory with name: no-name-64c57eee-cf96-421b-94bd-63810b225228
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['b

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3869,0.284705,0.902972,0.893926,0.914261,0.903979
2,0.218,0.313696,0.907343,0.900947,0.915136,0.907986
3,0.151,0.387812,0.906906,0.900862,0.914261,0.907512


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

[I 2025-04-29 10:47:51,306] Trial 0 finished with value: 3.6295403217280264 and parameters: {'learning_rate': 1.827226177606625e-05, 'per_device_train_batch_size': 8, 'weight_decay': 4.2079886696066345e-06}. Best is trial 0 with value: 3.6295403217280264.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Onc

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4889,0.292796,0.891171,0.874372,0.913386,0.893453
2,0.2441,0.269039,0.902972,0.889924,0.91951,0.904475
3,0.1894,0.273472,0.904283,0.889545,0.92301,0.905968


[I 2025-04-29 10:50:20,911] Trial 1 finished with value: 3.6228057553139648 and parameters: {'learning_rate': 1.2853916978930139e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.0006796578090758161}. Best is trial 0 with value: 3.6295403217280264.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4543,0.291581,0.895542,0.88765,0.905512,0.896492
2,0.2384,0.293848,0.902972,0.891915,0.916885,0.904228
3,0.1789,0.30367,0.902972,0.890585,0.918635,0.904393


[I 2025-04-29 10:54:16,581] Trial 2 finished with value: 3.6165852051638634 and parameters: {'learning_rate': 1.0336843570697396e-05, 'per_device_train_batch_size': 8, 'weight_decay': 5.337032762603957e-06}. Best is trial 0 with value: 3.6295403217280264.
2025-04-29 10:54:16,582 - INFO - Best HPO on fold 3: {'learning_rate': 1.827226177606625e-05, 'per_device_train_batch_size': 8, 'weight_decay': 4.2079886696066345e-06}
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3869,0.284705,0.902972,0.893926,0.914261,0.903979


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

[I 2025-04-29 11:03:30,611] Trial 1 finished with value: 3.6305681075673295 and parameters: {'learning_rate': 1.2853916978930139e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.0006796578090758161}. Best is trial 0 with value: 3.651764062235709.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Onc

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4611,0.269857,0.901224,0.880498,0.928259,0.903748
2,0.2446,0.264416,0.909528,0.881729,0.945757,0.912621
3,0.1894,0.280305,0.908217,0.883951,0.939633,0.910941


[I 2025-04-29 11:07:27,704] Trial 2 finished with value: 3.6427414222594647 and parameters: {'learning_rate': 1.0336843570697396e-05, 'per_device_train_batch_size': 8, 'weight_decay': 5.337032762603957e-06}. Best is trial 0 with value: 3.651764062235709.
2025-04-29 11:07:27,705 - INFO - Best HPO on fold 4: {'learning_rate': 1.827226177606625e-05, 'per_device_train_batch_size': 8, 'weight_decay': 4.2079886696066345e-06}
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3994,0.240284,0.912587,0.884898,0.948381,0.915541
2,0.216,0.260476,0.910402,0.885057,0.943132,0.913172






2025-04-29 11:10:13,244 - INFO - Fold 4 metrics: {'eval_loss': 0.26728877425193787, 'eval_accuracy': 0.9122070654074851, 'eval_precision': 0.8859016393442622, 'eval_recall': 0.946078431372549, 'eval_f1': 0.9150016931933627, 'eval_runtime': 4.0132, 'eval_samples_per_second': 712.408, 'eval_steps_per_second': 11.213, 'epoch': 2.0}
2025-04-29 11:10:13,370 - INFO - Fold 5/5
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-04-29 11:10:15,598] A new study created in memory with name: no-name-79bfca5e-a66e-41bd-8a4a-633be6334f75
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['b

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4016,0.260767,0.907343,0.876923,0.947507,0.910849
2,0.2143,0.246083,0.916958,0.899413,0.938758,0.918664
3,0.158,0.347054,0.914336,0.896899,0.936133,0.916096


[I 2025-04-29 11:14:09,559] Trial 0 finished with value: 3.663463113144657 and parameters: {'learning_rate': 1.827226177606625e-05, 'per_device_train_batch_size': 8, 'weight_decay': 4.2079886696066345e-06}. Best is trial 0 with value: 3.663463113144657.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4952,0.299901,0.887238,0.853717,0.934383,0.892231
2,0.2456,0.253327,0.903846,0.884263,0.929134,0.906143
3,0.1872,0.261273,0.903409,0.884167,0.928259,0.905676


[I 2025-04-29 11:16:38,676] Trial 1 finished with value: 3.621511208346076 and parameters: {'learning_rate': 1.2853916978930139e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.0006796578090758161}. Best is trial 0 with value: 3.663463113144657.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4679,0.281028,0.894231,0.864188,0.935258,0.898319
2,0.2378,0.253637,0.909528,0.886139,0.939633,0.912102
3,0.1851,0.289729,0.907343,0.885029,0.936133,0.909864


[I 2025-04-29 11:20:32,439] Trial 2 finished with value: 3.638368535843045 and parameters: {'learning_rate': 1.0336843570697396e-05, 'per_device_train_batch_size': 8, 'weight_decay': 5.337032762603957e-06}. Best is trial 0 with value: 3.663463113144657.
2025-04-29 11:20:32,440 - INFO - Best HPO on fold 5: {'learning_rate': 1.827226177606625e-05, 'per_device_train_batch_size': 8, 'weight_decay': 4.2079886696066345e-06}
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4016,0.260767,0.907343,0.876923,0.947507,0.910849
2,0.2143,0.246083,0.916958,0.899413,0.938758,0.918664
3,0.158,0.347054,0.914336,0.896899,0.936133,0.916096






2025-04-29 11:24:35,717 - INFO - Fold 5 metrics: {'eval_loss': 0.4026331603527069, 'eval_accuracy': 0.9076600209863589, 'eval_precision': 0.885430463576159, 'eval_recall': 0.9362745098039216, 'eval_f1': 0.910142954390742, 'eval_runtime': 3.4412, 'eval_samples_per_second': 830.822, 'eval_steps_per_second': 13.077, 'epoch': 3.0}
2025-04-29 11:24:35,866 - INFO - Gemiddelde resultaten voor GroNLP/bert-base-dutch-cased: {'avg_accuracy': 0.9084295208114727, 'std_accuracy': 0.008551208043239668, 'avg_f1': 0.9107731026739634, 'std_f1': 0.00838172754220645, 'model': 'GroNLP/bert-base-dutch-cased', 'hyperparams': {'learning_rate': 1.827226177606625e-05, 'per_device_train_batch_size': 8, 'weight_decay': 4.2079886696066345e-06}}
2025-04-29 11:24:35,958 - INFO - All models finished
