In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# === Load the aggregated source dataset (expanded to per-unit rows by transform_dataset) ===
file_path = "vroegtijdig-schoolverlaten-woonplaats-dataset-2022-2023.xlsx"
bad_df = pd.read_excel(io=file_path)

def transform_dataset(bad_df):
    """Expand aggregated count rows into one row per counted unit.

    Each input row is repeated ``vsv_noemer`` times. In the output the
    ``vsv_teller`` column becomes a 0/1 label: the first ``vsv_teller``
    copies of a row get 1, the remaining copies get 0. The ``vsv_noemer``
    column is dropped. All other columns are copied unchanged and the
    original index labels are repeated.

    Parameters
    ----------
    bad_df : pandas.DataFrame
        Must contain the columns ``vsv_noemer`` and ``vsv_teller``.

    Returns
    -------
    pandas.DataFrame
        The expanded frame (a new object; ``bad_df`` is not modified).

    Raises
    ------
    KeyError
        If either required column is missing.
    ValueError
        If ``vsv_noemer`` contains missing values (raised by the int cast).
    """
    # Negative counts yield no rows, matching range() on a negative number.
    counts = bad_df['vsv_noemer'].astype('int64').to_numpy().clip(min=0)
    positives = bad_df['vsv_teller'].to_numpy()

    # Positional repeat (rather than label-based) stays correct even when the
    # input index contains duplicate labels.
    row_positions = np.repeat(np.arange(len(bad_df)), counts)

    # Position of every expanded row inside its own source row: 0, 1, ..., count-1.
    group_starts = np.cumsum(counts) - counts
    within_group = np.arange(counts.sum()) - np.repeat(group_starts, counts)

    df = bad_df.iloc[row_positions].copy()
    df['vsv_teller'] = (within_group < np.repeat(positives, counts)).astype(int)
    df = df.drop(columns=['vsv_noemer'])
    return df

df = transform_dataset(bad_df)

# Drop unnecessary column (no-op when the column is absent; avoids in-place mutation)
df = df.drop(columns=['schooljaar'], errors='ignore')

# Separate features and target
X = df.drop(columns=['vsv_teller'])
y = df['vsv_teller']

# Stratified train-test split
# NOTE(review): transform_dataset repeats each aggregated row, so feature-identical
# rows can land in both the train and the test split. Reported scores may therefore
# be optimistic; consider a group-aware split if that matters for the analysis.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Define columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['number']).columns

# Define preprocessor: impute + one-hot encode categoricals, pass everything else through
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_cols)
], remainder='passthrough')

# Define resampling steps
# NOTE(review): SMOTE runs on the one-hot encoded matrix here; confirm that
# interpolated (fractional) category indicators are acceptable for this study.
over = SMOTE(sampling_strategy=0.5, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.8, random_state=42)

# Define models (templates; each pipeline below receives its own clone)
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
logreg = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
# `use_label_encoder` is no longer accepted by XGBoost and only triggered a warning.
xgb_model = xgb.XGBClassifier(scale_pos_weight=3, eval_metric='logloss', random_state=42)

# Pipelines
# Pipelines fit their steps in place, so sharing one estimator instance between
# two pipelines would let the second fit overwrite the first fitted model.
# clone() gives every pipeline independent, unfitted copies.
models = {
    "Baseline_RF": Pipeline([
        ('preprocess', clone(preprocessor)),
        ('clf', clone(rf)),
    ]),
    "BiasMitigated_RF": ImbPipeline([
        ('preprocess', clone(preprocessor)),
        ('over', clone(over)),
        ('under', clone(under)),
        ('clf', clone(rf)),
    ]),
    "BiasMitigated_LogReg": ImbPipeline([
        ('preprocess', clone(preprocessor)),
        ('over', clone(over)),
        ('under', clone(under)),
        ('clf', clone(logreg)),
    ]),
    "BiasMitigated_XGBoost": Pipeline([
        ('preprocess', clone(preprocessor)),
        ('clf', clone(xgb_model)),
    ]),
}

# Evaluate all models
def evaluate_all_models(models, X_train, X_test, y_train, y_test):
    """Fit each pipeline on the training split and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator/pipeline exposing ``fit`` and
        ``predict`` (and optionally ``predict_proba``). Each one is fitted
        in place.
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels used for all metrics.

    Returns
    -------
    dict
        ``name -> {'model', 'y_pred', 'y_proba', 'accuracy', 'roc_auc', 'report'}``.
        ``y_proba`` and ``roc_auc`` are ``None`` when the model has no
        ``predict_proba``.
    """
    results = {}
    for name, pipeline in models.items():
        print(f"\n📦 Training model: {name}")
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        # Column 1 holds the probability of the second class.
        y_proba = pipeline.predict_proba(X_test)[:, 1] if hasattr(pipeline, "predict_proba") else None

        acc = np.mean(y_pred == y_test)
        auc = roc_auc_score(y_test, y_proba) if y_proba is not None else None
        report = classification_report(y_test, y_pred, output_dict=True)

        results[name] = {
            'model': pipeline,
            'y_pred': y_pred,
            'y_proba': y_proba,
            'accuracy': acc,
            'roc_auc': auc,
            'report': report
        }

        print(f"→ Accuracy: {acc:.4f}")
        # Compare against None explicitly: an AUC of 0.0 is falsy but still a valid score.
        if auc is not None:
            print(f"→ ROC AUC: {auc:.4f}")
        print(classification_report(y_test, y_pred))
    return results

# Train and score every configured pipeline; keep the results for later cells.
model_results = evaluate_all_models(
    models=models,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)



📦 Training model: Baseline_RF
→ Accuracy: 0.9468
→ ROC AUC: 0.9303
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     12444
           1       0.85      0.73      0.78      1890

    accuracy                           0.95     14334
   macro avg       0.90      0.85      0.88     14334
weighted avg       0.94      0.95      0.95     14334


📦 Training model: BiasMitigated_RF
→ Accuracy: 0.9436
→ ROC AUC: 0.9361
              precision    recall  f1-score   support

           0       0.96      0.97      0.97     12444
           1       0.81      0.75      0.78      1890

    accuracy                           0.94     14334
   macro avg       0.88      0.86      0.87     14334
weighted avg       0.94      0.94      0.94     14334


📦 Training model: BiasMitigated_LogReg
→ Accuracy: 0.9103
→ ROC AUC: 0.9400
              precision    recall  f1-score   support

           0       0.97      0.92      0.95     12444
           1       

Parameters: { "use_label_encoder" } are not used.



→ Accuracy: 0.9419
→ ROC AUC: 0.9450
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     12444
           1       0.78      0.78      0.78      1890

    accuracy                           0.94     14334
   macro avg       0.87      0.87      0.87     14334
weighted avg       0.94      0.94      0.94     14334



In [2]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# Feature columns whose groups are compared in the fairness breakdown.
protected_attrs = [
    'geslacht',
    'belg_nietbelg',
    'eu_nieteu',
    'woonplaats_provincie_naam',
    'leeftijdscategorie',
    'opleidingsniveau_moeder',
    'schoolse_vordering',
]

def fairness_analysis(X_test, y_test, y_pred, protected_features):
    """Print per-group confusion counts and error rates for each protected feature.

    For every column named in ``protected_features`` the test rows are split by
    the column's distinct values. Groups with fewer than 10 rows are skipped.
    For the remaining groups TP/FP/FN/TN, precision, recall, F1, false-positive
    rate and false-negative rate are printed.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Test features; must contain every column in ``protected_features``.
    y_test : array-like
        True 0/1 labels, aligned with ``X_test`` by position.
    y_pred : array-like
        Predicted 0/1 labels, aligned with ``X_test`` by position.
    protected_features : iterable of str
        Column names to break the metrics down by.

    Returns
    -------
    None
        Results are printed only.
    """
    X_test_copy = X_test.copy()
    # np.asarray strips any index so values are attached by position, and also
    # lets plain arrays/lists be passed for y_test.
    X_test_copy['true'] = np.asarray(y_test)
    X_test_copy['pred'] = np.asarray(y_pred)

    for feature in protected_features:
        print(f"\n Fairness Analysis by: {feature}")
        for group in X_test_copy[feature].unique():
            # NaN never compares equal to itself, so missing values need isna().
            if pd.isna(group):
                mask = X_test_copy[feature].isna()
            else:
                mask = X_test_copy[feature] == group
            subset = X_test_copy[mask]
            if subset.shape[0] < 10:
                print(f"  Skipping group '{group}' due to too few samples.")
                continue

            y_true = subset['true']
            y_group_pred = subset['pred']

            # Fix the label set so the matrix is always 2x2, even when a group
            # contains only one class; otherwise the 4-way unpack would fail.
            tn, fp, fn, tp = confusion_matrix(y_true, y_group_pred, labels=[0, 1]).ravel()
            precision = precision_score(y_true, y_group_pred, zero_division=0)
            recall = recall_score(y_true, y_group_pred, zero_division=0)
            f1 = f1_score(y_true, y_group_pred, zero_division=0)
            # Rates are reported as 0 when their denominator is empty.
            fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
            fnr = fn / (fn + tp) if (fn + tp) > 0 else 0

            print(f"  Group '{group}' (n={len(subset)}):")
            print(f"    TP={tp}, FP={fp}, FN={fn}, TN={tn}")
            print(f"    Precision: {precision:.3f} | Recall: {recall:.3f} | F1: {f1:.3f}")
            print(f"    FPR: {fpr:.3f} | FNR: {fnr:.3f}")

def compare_fairness(model_results, X_test, y_test, protected_attrs):
    """Run the per-group fairness report once for every evaluated model.

    ``model_results`` maps a model name to a dict holding that model's test
    predictions under the key ``'y_pred'``. A header line is printed for each
    model before its report.
    """
    for model_name, entry in model_results.items():
        header = f"\n=== Fairness Evaluation for: {model_name} ==="
        print(header)
        predictions = entry['y_pred']
        fairness_analysis(X_test, y_test, predictions, protected_attrs)

In [3]:
# Print the fairness breakdown for every trained model.
compare_fairness(
    model_results=model_results,
    X_test=X_test,
    y_test=y_test,
    protected_attrs=protected_attrs,
)


=== Fairness Evaluation for: Baseline_RF ===

 Fairness Analysis by: geslacht
  Group 'Vrouwelijk' (n=6900):
    TP=487, FP=89, FN=203, TN=6121
    Precision: 0.845 | Recall: 0.706 | F1: 0.769
    FPR: 0.014 | FNR: 0.294
  Group 'Mannelijk' (n=7434):
    TP=885, FP=155, FN=315, TN=6079
    Precision: 0.851 | Recall: 0.738 | F1: 0.790
    FPR: 0.025 | FNR: 0.263

 Fairness Analysis by: belg_nietbelg
  Group 'Belg' (n=13089):
    TP=1046, FP=198, FN=431, TN=11414
    Precision: 0.841 | Recall: 0.708 | F1: 0.769
    FPR: 0.017 | FNR: 0.292
  Group 'niet-Belg' (n=1245):
    TP=326, FP=46, FN=87, TN=786
    Precision: 0.876 | Recall: 0.789 | F1: 0.831
    FPR: 0.055 | FNR: 0.211

 Fairness Analysis by: eu_nieteu
  Group 'EU' (n=13807):
    TP=1197, FP=223, FN=470, TN=11917
    Precision: 0.843 | Recall: 0.718 | F1: 0.776
    FPR: 0.018 | FNR: 0.282
  Group 'niet-EU' (n=527):
    TP=175, FP=21, FN=48, TN=283
    Precision: 0.893 | Recall: 0.785 | F1: 0.835
    FPR: 0.069 | FNR: 0.215

 Fair