In [26]:
import pandas as pd
import numpy as np

def eurostat_json_to_tidy(json_data):
    """
    Convert a Eurostat JSON (statistics 1.0 API, JSON-stat layout) dataset
    into a tidy Pandas DataFrame.

    Each row in the result is one observation.
    Columns are the dimensions (e.g. freq, iccs, unit, geo, time)
    plus one column "value".

    Parameters
    ----------
    json_data : dict
        Parsed response with the keys "id" (dimension names), "size"
        (dimension lengths, same order as "id"), "dimension" (per-dimension
        details holding ["category"]["index"]) and "value".
        "value" may be the sparse form (dict: flat index as string -> number)
        or the dense form (list with None for missing cells).
        A category "index" may be a dict (code -> position) or a list of codes
        already in position order.

    Returns
    -------
    pandas.DataFrame
        One column per dimension (category codes) plus a float "value" column.
        A dataset without observations yields a frame with zero rows.
    """

    # 1. Read the list of dimension ids and their sizes
    # Example: id = ["freq", "iccs", "unit", "geo", "time"]
    dim_ids = json_data["id"]           # list of dimension names
    dim_sizes = json_data["size"]       # list of sizes for each dimension
    dims = json_data["dimension"]       # details for each dimension

    # 2. Read the observations as (flat position, value) pairs
    raw_values = json_data["value"]
    if isinstance(raw_values, dict):
        # sparse form: keys are strings "0", "1", ..., convert to int
        flat_positions = [int(key) for key in raw_values]
        flat_values = list(raw_values.values())
    else:
        # dense form: the list position is the flat index; None marks a cell
        # without an observation, which the sparse form simply leaves out
        present = [(pos, val) for pos, val in enumerate(raw_values) if val is not None]
        flat_positions = [pos for pos, _ in present]
        flat_values = [val for _, val in present]

    # An explicit integer dtype is required: an empty list would otherwise
    # become a float64 array, which np.unravel_index rejects.
    obs_idx = np.array(flat_positions, dtype=np.intp)
    obs_values = np.array(flat_values, dtype=float)

    # 3. Convert linear indices to multi-dimensional coordinates
    # This uses the dimension sizes in the order of dim_ids.
    coords = np.vstack(np.unravel_index(obs_idx, tuple(dim_sizes))).T
    # coords shape: (n_obs, n_dims)

    # 4. Build code lists for each dimension (array: position -> code)
    code_lists = {}
    for dim_name in dim_ids:
        index_map = dims[dim_name]["category"]["index"]
        if isinstance(index_map, dict):
            # dict: code -> position; order the codes by their position
            codes_sorted = sorted(index_map, key=index_map.get)
        else:
            # list: codes are already stored in position order
            codes_sorted = list(index_map)
        code_lists[dim_name] = np.array(codes_sorted)

    # 5. Map numeric positions in coords to actual codes
    data = {
        dim_name: code_lists[dim_name][coords[:, dim_i]]
        for dim_i, dim_name in enumerate(dim_ids)
    }

    # 6. Add the numeric values
    data["value"] = obs_values

    # 7. Build the final tidy DataFrame
    return pd.DataFrame(data)


# ---------- Example usage ----------

# NOTE(review): `json_data` is not created in any visible cell. It is expected to
# already hold the parsed API response (json_data = response.json()) in the kernel.
df_tidy = eurostat_json_to_tidy(json_data)

# Show the first rows, then the overall (rows, columns) shape on its own line
print(df_tidy.head(), f"Shape: {df_tidy.shape}", sep="\n")


  freq      iccs unit geo  time  value
0    A  ICCS0101   NR  AL  2008   88.0
1    A  ICCS0101   NR  AL  2009   82.0
2    A  ICCS0101   NR  AL  2010  118.0
3    A  ICCS0101   NR  AL  2011  124.0
4    A  ICCS0101   NR  AL  2012  125.0
Shape: (19582, 6)


In [42]:
import os
from sklearn.model_selection import train_test_split

# Make sure the task folder is there (no-op when it already exists)
task_dir = r"C:\TUB\RDEP\crime_task"
os.makedirs(task_dir, exist_ok=True)

# 70/30 split, stratified on the label column.
# NOTE(review): `df` (with a "label" column) is not built in any visible cell —
# it must already exist in the kernel; confirm how it is derived from df_tidy.
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df["label"],
)

# Write both parts side by side, without the pandas index column
train_csv = os.path.join(task_dir, "train.csv")
test_csv = os.path.join(task_dir, "test.csv")
train_df.to_csv(train_csv, index=False)
test_df.to_csv(test_csv, index=False)

print("Saved to:", task_dir)


Saved to: C:\TUB\RDEP\crime_task


In [43]:
import os
import yaml

task_dir = r"C:\TUB\RDEP\crime_task"
metadata_path = os.path.join(task_dir, "metadata.yaml")

# Task description stored next to train.csv / test.csv
metadata = dict(
    task_type="classification",
    label="label",
    id_column=None,
    description="Predict HIGH or LOW crime rate from Eurostat crim_off_cat dataset.",
)

# sort_keys=False keeps the keys in the order they are listed above
with open(metadata_path, mode="w", encoding="utf-8") as f:
    yaml.dump(metadata, stream=f, sort_keys=False)

print("metadata.yaml saved to:", metadata_path)


metadata.yaml saved to: C:\TUB\RDEP\crime_task\metadata.yaml


In [None]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

from joblib import dump, load

try:
    import yaml
except ImportError:
    yaml = None

try:
    import aide
except ImportError:
    aide = None


# =========================
# 1. Path settings
# =========================

BASE_ROOT = r"C:\TUB\RDEP"

# Source task (its CSVs still contain "value") and the derived task without it
TASK_WITH_VALUE = os.path.join(BASE_ROOT, "crime_task")
TASK_NOVALUE = os.path.join(BASE_ROOT, "crime_task_novalue")

# Folder that receives the fairness reports and the saved models
WORK_DIR = os.path.join(TASK_NOVALUE, "working")

# Create the output folders up front so later writes cannot fail on a missing path
os.makedirs(TASK_NOVALUE, exist_ok=True)
os.makedirs(WORK_DIR, exist_ok=True)


# =========================
# 2. Build new task (without "value")
# =========================

def build_task_without_value():
    """
    Derive the "no value" task from the original task folder.

    Reads train.csv / test.csv from TASK_WITH_VALUE, keeps only geo, iccs and
    time (plus "label" for the training file) and writes the reduced files to
    TASK_NOVALUE. When PyYAML is importable a metadata.yaml is written as well;
    otherwise a warning is printed and no metadata file is created.
    """
    feature_cols = ["geo", "iccs", "time"]
    # The training file keeps the target, the test file is features only
    wanted_columns = {
        "train.csv": feature_cols + ["label"],
        "test.csv": feature_cols,
    }

    # Load and reduce both files first, so nothing is written if a column is missing
    reduced = {}
    for file_name, cols in wanted_columns.items():
        full_frame = pd.read_csv(os.path.join(TASK_WITH_VALUE, file_name))
        reduced[file_name] = full_frame[cols].copy()

    for file_name, frame in reduced.items():
        frame.to_csv(os.path.join(TASK_NOVALUE, file_name), index=False)

    print("[STEP] New task created: crime_task_novalue")
    print("Train shape:", reduced["train.csv"].shape)
    print("Test shape :", reduced["test.csv"].shape)

    # metadata.yaml is optional: it needs PyYAML
    if yaml is None:
        print("[WARN] PyYAML not installed. metadata.yaml not created.")
        return

    metadata = dict(
        task_type="classification",
        label="label",
        id_column=None,
        description="Predict crime level using geo, iccs, and time.",
    )
    with open(os.path.join(TASK_NOVALUE, "metadata.yaml"), "w") as f:
        yaml.dump(metadata, f)
    print("[STEP] metadata.yaml created.")


# =========================
# 3. Fairness helper functions
# =========================

def compute_rates(y_true, y_pred):
    """
    Compute fairness metrics:
    TPR, FPR, FNR, PositiveRate and confusion counts.

    Labels are expected to be 0 (negative) and 1 (positive); the confusion
    matrix is built with labels=[0, 1].

    A rate whose denominator is zero (e.g. TPR for a sample without any
    positive) is returned as NaN rather than 0. A fake 0 would claim
    TPR = 0 and FNR = 0 at the same time and would distort max-min gaps
    computed across groups; pandas max()/min() skip NaN by default.

    Returns
    -------
    dict
        Keys "TPR", "FPR", "FNR", "PositiveRate", "N", "TP", "FP", "TN", "FN".
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    def _ratio(numerator, denominator):
        # explicit guard instead of an epsilon: undefined stays undefined
        return numerator / denominator if denominator > 0 else np.nan

    n_pos = tp + fn          # actual positives
    n_neg = fp + tn          # actual negatives
    n_total = n_pos + n_neg

    return {
        "TPR": _ratio(tp, n_pos),
        "FPR": _ratio(fp, n_neg),
        "FNR": _ratio(fn, n_pos),
        "PositiveRate": _ratio(tp + fp, n_total),
        "N": n_total,
        "TP": tp, "FP": fp, "TN": tn, "FN": fn
    }


def evaluate_fairness_by_group(model, X, y, groups, group_name="geo", save_prefix="model"):
    """
    Calculate fairness metrics for each group.
    For example, groups = geo.

    Parameters
    ----------
    model : fitted estimator exposing predict(X)
    X : features passed to model.predict
    y : true labels, one per row of X (Series, list or array)
    groups : group membership, one per row of X (Series, list or array)
    group_name : str
        Only used in the names of the CSV files that are written.
    save_prefix : str
        File name prefix, e.g. "rf" -> rf_fairness_by_geo.csv

    Returns
    -------
    (df_group, df_gaps)
        Per-group metrics indexed by group, and the max-min gap of
        PositiveRate / TPR / FPR / FNR across groups.

    Side effects: prints both tables and writes them as CSV into WORK_DIR.
    """
    # Work on plain arrays so every mask is applied by position. Mixing a
    # label-aligned Series (y) with a positional array (y_pred) only works
    # while both happen to share the same index.
    y_pred = np.asarray(model.predict(X))
    y_arr = np.asarray(y)
    groups_arr = np.asarray(groups)

    result_list = []
    for g in sorted(pd.unique(groups_arr)):
        mask = groups_arr == g
        rates = compute_rates(y_arr[mask], y_pred[mask])
        rates["group"] = g
        result_list.append(rates)

    df_group = pd.DataFrame(result_list).set_index("group")

    # Calculate fairness gaps (largest minus smallest value over the groups)
    gaps = {}
    for metric in ["PositiveRate", "TPR", "FPR", "FNR"]:
        gaps[metric + "_gap"] = df_group[metric].max() - df_group[metric].min()

    df_gaps = pd.DataFrame.from_dict(gaps, orient="index", columns=["value"])

    print("\n[FAIRNESS] Per-group results:")
    print(df_group)

    print("\n[FAIRNESS] Gaps:")
    print(df_gaps)

    df_group.to_csv(os.path.join(WORK_DIR, f"{save_prefix}_fairness_by_{group_name}.csv"))
    df_gaps.to_csv(os.path.join(WORK_DIR, f"{save_prefix}_fairness_gaps_{group_name}.csv"))

    return df_group, df_gaps


# =========================
# 4. Baseline Models
# =========================

def _make_preprocessor():
    """Return a fresh, unfitted preprocessor: one-hot geo/iccs, standard-scale time."""
    return ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), ["geo", "iccs"]),
            ("num", StandardScaler(), ["time"])
        ]
    )


def run_baseline_models():
    """
    Train baseline models (RandomForest and LogisticRegression)
    and test their fairness.

    Reads train.csv from TASK_NOVALUE, holds out 30% (stratified on the label)
    for validation, and for each model prints the validation accuracy, runs
    evaluate_fairness_by_group on the hold-out part and saves the fitted
    pipeline to WORK_DIR as <prefix>_model.joblib.
    """
    train = pd.read_csv(os.path.join(TASK_NOVALUE, "train.csv"))
    X = train[["geo", "iccs", "time"]]
    y = train["label"]
    groups = train["geo"]

    # Split for validation (the training-side group labels are not needed)
    X_train, X_val, y_train, y_val, _, groups_val = train_test_split(
        X, y, groups, test_size=0.3, random_state=42, stratify=y
    )

    # (file prefix, tag used in the printed output, classifier)
    baselines = [
        ("rf", "RF", RandomForestClassifier(n_estimators=100, random_state=42)),
        ("log", "LOG", LogisticRegression(max_iter=1000)),
    ]

    for prefix, tag, clf in baselines:
        # Every pipeline gets its own preprocessor. Pipeline does not clone its
        # steps, so a shared ColumnTransformer would be refitted in place by the
        # second pipeline while the first one still points at it.
        pipe = Pipeline([
            ("preproc", _make_preprocessor()),
            ("clf", clf)
        ])

        pipe.fit(X_train, y_train)
        print(f"\n[{tag}] Accuracy:", accuracy_score(y_val, pipe.predict(X_val)))

        evaluate_fairness_by_group(
            pipe, X_val, y_val, groups_val, save_prefix=prefix
        )

        dump(pipe, os.path.join(WORK_DIR, f"{prefix}_model.joblib"))


# =========================
# 5. AIDE experiment
# =========================

def run_aide_experiment():
    """
    Run AIDE automatic model search.
    You may need to adjust the API for your own AIDE version.

    Does nothing (apart from a warning) when the `aide` package could not be
    imported. Otherwise runs a 5-step search on TASK_NOVALUE, prints the best
    metric and code, then refits the returned model on a 70/30 stratified
    split, prints its validation accuracy, evaluates fairness per geo group and
    saves the model to WORK_DIR/aide_model.joblib.
    """
    if aide is None:
        print("\n[WARN] AIDE is not installed. Skip this step.")
        return

    print("\n[AIDE] Start search...")

    exp = aide.Experiment(
        data_dir=TASK_NOVALUE,
        goal="Predict crime level.",
        eval="Classification accuracy"
    )

    best_solution = exp.run(steps=5)

    print("\n[AIDE] Best accuracy:", best_solution.valid_metric)
    print("\n[AIDE] Best code:\n", best_solution.code)

    # You may need to change this line to match your own AIDE version.
    # NOTE(review): whether the solution object exposes `.model` depends on the
    # installed AIDE version — confirm against its API.
    try:
        aide_model = best_solution.model
    except Exception as exc:
        # `except Exception` (not a bare except) so Ctrl-C / SystemExit still
        # propagate; the reason is printed instead of being discarded.
        print("[ERROR] Could not load AIDE model. Adjust API manually.")
        print(f"        Reason: {type(exc).__name__}: {exc}")
        return

    # Train AIDE model on train split
    train = pd.read_csv(os.path.join(TASK_NOVALUE, "train.csv"))
    X = train[["geo", "iccs", "time"]]
    y = train["label"]
    groups = train["geo"]

    # Same split parameters as the baselines, so the results are comparable
    X_train, X_val, y_train, y_val, _, groups_val = train_test_split(
        X, y, groups, test_size=0.3, random_state=42, stratify=y
    )

    aide_model.fit(X_train, y_train)
    print("\n[AIDE] Accuracy:", accuracy_score(y_val, aide_model.predict(X_val)))

    evaluate_fairness_by_group(
        aide_model, X_val, y_val, groups_val, save_prefix="aide"
    )

    dump(aide_model, os.path.join(WORK_DIR, "aide_model.joblib"))


# =========================
# MAIN
# =========================

if __name__ == "__main__":
    # Banner printed before each stage, paired with the function that runs it
    pipeline_steps = (
        ("=== Build new task ===", build_task_without_value),
        ("\n=== Baseline models ===", run_baseline_models),
        ("\n=== AIDE search ===", run_aide_experiment),
    )
    for banner, step in pipeline_steps:
        print(banner)
        step()

    print("\nAll steps completed.")


=== Build new task ===
[STEP] New task created: crime_task_novalue
Train shape: (13707, 4)
Test shape : (5875, 3)
[STEP] metadata.yaml created.

=== Baseline models ===

[RF] Accuracy: 0.6100170192073912

[FAIRNESS] Per-group results:
            TPR       FPR       FNR  PositiveRate    N  TP  FP  TN  FN
group                                                                 
AL     0.431818  0.303030  0.568182      0.354545  110  19  20  46  25
AT     0.600000  0.327586  0.400000      0.466102  118  36  19  39  24
BA     0.852941  0.171429  0.147059      0.507246   69  29   6  29   5
BE     0.712329  0.738095  0.287671      0.721739  115  52  31  11  21
BG     0.510204  0.328571  0.489796      0.403361  119  25  23  47  24
CH     0.694915  0.444444  0.305085      0.575221  113  41  24  30  18
CY     0.657143  0.148649  0.342857      0.311927  109  23  11  63  12
CZ     0.452830  0.358209  0.547170      0.400000  120  24  24  43  29
DE     0.500000  0.697674  0.500000      0.577982  109 

  layout = (layout - layout.min(axis=0)) / (layout.max(axis=0) - layout.min(axis=0))



[AIDE] Best accuracy: 0.7145

[AIDE] Best code:
 import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb

# Load data
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")

# Features and label
features = ["geo", "iccs", "time"]
X = train[features].copy()
y = train["label"].copy()
X_test = test[features].copy()

# Label encode categorical features
for col in ["geo", "iccs"]:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    X_test[col] = le.transform(X_test[col])

# Prepare cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs = []

for train_idx, valid_idx in skf.split(X, y):
    X_tr, X_val = X.iloc[train_idx], X.iloc[valid_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[valid_idx]

    model = lgb.LGBMClassifier(random_state=42)
    model.fit(X_tr, y_tr)
    pre

In [None]:
# =====================================================
# 0. Prompt for AIDE-ML (to drive code generation)
# =====================================================
# NOTE(review): PROMPT_FOR_AIDE is only assigned in this cell; nothing in the
# visible code below reads it. Presumably it was handed to AIDE-ML manually to
# generate the script that follows — confirm before relying on it.

PROMPT_FOR_AIDE = """
You are an AI assistant that automatically builds, trains and evaluates a fairness-aware
machine learning model for a crime prediction task.

Goal:
- Improve group fairness across countries (feature `geo`) while keeping classification accuracy high.
- Implement an end-to-end workflow that can be run as a single Python script.

Data description:
- Input files: `train.csv` and `test.csv` in a given BASE_DIR.
- Target column in train: `label` (binary).
- Candidate feature columns: "geo", "iccs", "time", "value" (some of them may be missing).
- Sensitive attribute: "geo" (country / region).
- Test.csv has the same feature columns as train, but without `label`.

Your generated code must do the following:

1. Load data and prepare features:
   - Set a BASE_DIR (e.g. "C:\\\\TUB\\\\RDEP\\\\crime_task").
   - Load `train.csv` and `test.csv` with pandas.
   - From ["geo", "iccs", "time", "value"], select only those columns that exist in train.
   - Define:
       X      = train[features]
       y      = train["label"]
       X_test = test[features]
   - Extract the sensitive attribute:
       groups = X["geo"].values

2. Create a train/validation split:
   - Use sklearn.model_selection.train_test_split
   - 80% train, 20% validation
   - stratify by y
   - Keep track of group labels for both sets (g_train, g_val).

3. Implement group fairness metrics:
   - Write a function `compute_group_fairness(y_true, y_pred, group_values)` that:
       * For each group g:
           - Computes TP, FP, TN, FN.
           - Computes:
               TPR = TP / (TP + FN)
               FPR = FP / (FP + TN)
               FNR = FN / (TP + FN)
               PositiveRate = (TP + FP) / N_group
       * Returns:
           - A per-group metrics DataFrame.
           - A DataFrame (or dict) with disparities:
               TPR_gap          = max(TPR) - min(TPR)
               FPR_gap          = max(FPR) - min(FPR)
               FNR_gap          = max(FNR) - min(FNR)
               PositiveRate_gap = max(PositiveRate) - min(PositiveRate).

4. Implement Reweighing (Kamiran & Calders, 2012):
   - Write a function `compute_reweighing_weights(df, group_col, label_col)` that:
       * Estimates empirical probabilities:
           P(a)   for each group a,
           P(y)   for each label y,
           P(a,y) joint probability of group a and label y.
       * Computes sample weights:
           w(a,y) = P(a) * P(y) / P(a,y).
       * Returns a numpy array of sample weights, aligned with the rows of df.
   - Construct a DataFrame from the training split:
       train_for_weights = X_train.copy()
       train_for_weights["label"] = y_train
   - Call `compute_reweighing_weights` to get `w_train`.

5. Define preprocessing and baseline classifier:
   - Use a ColumnTransformer that:
       * One-hot encodes categorical columns (dtype == object), e.g. "geo", "iccs".
       * Standard-scales numerical columns, e.g. "time", "value".
   - Wrap this in a Pipeline with:
       ("preproc", ColumnTransformer),
       ("clf", LogisticRegression(max_iter=1000, random_state=42)).

6. Baseline logistic regression (no fairness):
   - Fit the pipeline on (X_train, y_train) without sample weights.
   - Predict on X_val.
   - Compute:
       * Overall accuracy (sklearn.metrics.accuracy_score).
       * Per-group fairness metrics and disparities using `compute_group_fairness`.
   - Print:
       "=== Baseline logistic regression ==="
       - accuracy
       - per-group metrics
       - disparities (TPR_gap, FPR_gap, etc.).

7. Fairness-aware training with Reweighing:
   - Refit the same pipeline on (X_train, y_train), but now pass:
       sample_weight = w_train
     to the classifier in the pipeline via the correct parameter name
       (e.g. clf__sample_weight in sklearn Pipeline).
   - Predict on X_val.
   - Compute accuracy and fairness metrics again.
   - Print:
       "=== Fairness-aware logistic regression (reweighing) ==="
       - accuracy
       - per-group metrics
       - disparities.
   - Compare fairness gaps with the baseline to show improvement.

8. Post-processing fairness improvement (group-wise thresholds, Hardt-style):
   - Implement a function:
       search_group_thresholds(model, X, y_true, groups, n_grid=21)
     that:
       * Uses model.predict_proba(X)[:, 1] to get positive-class scores.
       * For each group g:
           - Loops over thresholds t in np.linspace(0.1, 0.9, n_grid).
           - For each t, creates binary predictions y_hat = (proba >= t).
           - Computes accuracy and FPR on that group.
           - Defines a score = accuracy - FPR (simple trade-off).
           - Keeps the threshold t with the best score.
       * Returns a dictionary: {group_value: best_threshold}.
   - Implement a function:
       predict_with_thresholds(model, X, groups, thresholds, proba_pos_index=1)
     that:
       * Uses predict_proba to get scores.
       * Applies group-specific thresholds from the dict.
   - Apply these thresholds on the validation set:
       * Compute new accuracy and fairness metrics.
       * Print results as:
           "=== Post-processing thresholds ==="
           - thresholds per group
           - accuracy
           - disparities.

9. Final fairness-aware model and submission:
   - Recompute reweighing weights on the full training data (X, y).
   - Fit the pipeline on the full (X, y) with these weights.
   - Predict labels for X_test.
   - Save a CSV file:
       working/submission_fair_logreg.csv
     with a single column "label".

10. Requirements:
   - The script must be fully runnable as a single file, with all imports included.
   - It must print baseline vs. reweighing vs. post-processing metrics so that fairness improvements are visible.
   - It must save the final CSV file for submission.

Now, generate a complete Python script that implements all the steps above.
"""


# =====================================================
# 1. Imports and paths
# =====================================================

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# =====================================================
# 2. Paths and data loading
# =====================================================

BASE_DIR = r"C:\TUB\RDEP\crime_task"
working_dir = os.path.join(BASE_DIR, "working")
os.makedirs(working_dir, exist_ok=True)

# Input files of the task
train_path, test_path = (
    os.path.join(BASE_DIR, file_name) for file_name in ("train.csv", "test.csv")
)
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Use whichever of the candidate columns the training file really contains
# (the task may come with or without "value")
possible_features = ["geo", "iccs", "time", "value"]
features = [name for name in possible_features if name in train.columns]

print("Using features:", features)

# Copies, so later edits never touch the frames that were loaded
X, y = train[features].copy(), train["label"].copy()
X_test = test[features].copy()

# Group fairness is measured across the values of this column
sensitive_col = "geo"
groups = X[sensitive_col].values


# =====================================================
# 3. Train / validation split
# =====================================================

# 80/20 hold-out, stratified on the label; the group labels are split along
# with the rows so that g_train / g_val stay aligned with X_train / X_val
X_train, X_val, y_train, y_val, g_train, g_val = train_test_split(
    X, y, groups, test_size=0.2, random_state=42, stratify=y
)


# =====================================================
# 4. Fairness helper functions
# =====================================================

def compute_group_fairness(y_true, y_pred, group_values):
    """
    Compute per-group TPR, FPR, FNR and positive prediction rate.
    Also compute gaps (max - min) across groups.

    Parameters
    ----------
    y_true, y_pred : array-like of 0/1 labels, same length
    group_values : array-like of group labels, same length

    The three inputs are matched by position. pandas Series are converted to
    plain arrays first, so differing indexes cannot silently re-align rows.

    Returns
    -------
    (df_metrics, disparities)
        df_metrics: one row per group (index "group") with the columns
        N, TP, FP, TN, FN, TPR, FPR, FNR, PositiveRate. A rate whose
        denominator is zero is NaN.
        disparities: DataFrame with a single "value" column and the rows
        TPR_gap, FPR_gap, FNR_gap, PositiveRate_gap (NaN groups are skipped;
        all gaps are NaN when there are no rows at all).
    """
    data = pd.DataFrame({
        "y_true": np.asarray(y_true),
        "y_pred": np.asarray(y_pred),
        "group": np.asarray(group_values)
    })

    def _rate(numerator, denominator):
        # undefined (not zero) when the group has no sample of that kind
        return numerator / denominator if denominator > 0 else np.nan

    rows = []

    for g, df_g in data.groupby("group"):
        N = len(df_g)
        actual_pos = df_g["y_true"] == 1
        actual_neg = df_g["y_true"] == 0
        pred_pos = df_g["y_pred"] == 1
        pred_neg = df_g["y_pred"] == 0

        tp = (actual_pos & pred_pos).sum()
        fp = (actual_neg & pred_pos).sum()
        tn = (actual_neg & pred_neg).sum()
        fn = (actual_pos & pred_neg).sum()

        rows.append({
            "group": g,
            "N": N,
            "TP": tp,
            "FP": fp,
            "TN": tn,
            "FN": fn,
            "TPR": _rate(tp, tp + fn),
            "FPR": _rate(fp, fp + tn),
            "FNR": _rate(fn, tp + fn),
            "PositiveRate": _rate(tp + fp, N)
        })

    # Naming the columns keeps the frame well-formed even without any group
    metric_columns = ["group", "N", "TP", "FP", "TN", "FN",
                      "TPR", "FPR", "FNR", "PositiveRate"]
    df_metrics = pd.DataFrame(rows, columns=metric_columns).set_index("group")

    def _gap(column):
        values = df_metrics[column]
        return values.max() - values.min() if len(values) > 0 else np.nan

    disparities = pd.DataFrame({
        "value": {
            "TPR_gap": _gap("TPR"),
            "FPR_gap": _gap("FPR"),
            "FNR_gap": _gap("FNR"),
            "PositiveRate_gap": _gap("PositiveRate"),
        }
    })

    return df_metrics, disparities


def compute_reweighing_weights(df, group_col, label_col):
    """
    Sample weights for Reweighing (Kamiran & Calders, 2012).

    Every row receives

        w(a, y) = P(y) * P(a) / P(a, y)

    where a is the row's value in `group_col`, y its value in `label_col`,
    and all probabilities are relative frequencies within `df`.

    Returns a numpy array with one weight per row, in row order.
    """
    group_series = df[group_col]
    label_series = df[label_col]
    total = len(df)

    # Relative frequencies: labels, groups, and (group, label) pairs
    label_prob = label_series.value_counts() / total
    group_prob = group_series.value_counts() / total
    joint_prob = df.groupby([group_col, label_col]).size() / total

    # One weight per (group, label) combination that actually occurs
    weight_table = {
        (grp, lbl): (label_prob[lbl] * group_prob[grp]) / p_joint
        for (grp, lbl), p_joint in joint_prob.items()
    }

    # Look the weight up for every row
    return np.array([weight_table[pair] for pair in zip(group_series, label_series)])


# =====================================================
# 5. Compute reweighing weights on training split
# =====================================================

# Weights are derived from the training part only, so the validation rows
# do not influence them
train_for_weights = X_train.copy()
train_for_weights["label"] = y_train.values

w_train = compute_reweighing_weights(
    train_for_weights,
    group_col=sensitive_col,
    label_col="label"
)

print("Reweighing weights range:", w_train.min(), "to", w_train.max())


# =====================================================
# 6. Preprocessing + logistic regression pipeline
# =====================================================

# Numeric columns are scaled, everything else is one-hot encoded.
# Testing for "numeric" instead of `dtype == "object"` keeps text columns
# categorical when pandas stores them with a string / category / Arrow dtype;
# such columns would otherwise be sent to StandardScaler.
num_cols = [c for c in features if pd.api.types.is_numeric_dtype(train[c])]
cat_cols = [c for c in features if c not in num_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", StandardScaler(), num_cols),
    ]
)

log_reg = LogisticRegression(max_iter=1000, random_state=42)

pipe_log = Pipeline(
    steps=[
        ("preproc", preprocessor),
        ("clf", log_reg),
    ]
)


# =====================================================
# 7. Baseline logistic regression (no fairness)
# =====================================================

print("\n=== Baseline logistic regression ===")
pipe_log.fit(X_train, y_train)
y_val_pred_base = pipe_log.predict(X_val)
acc_base = accuracy_score(y_val, y_val_pred_base)
print(f"Baseline accuracy: {acc_base:.4f}")

fair_base, disp_base = compute_group_fairness(y_val, y_val_pred_base, g_val)
print("Baseline per-group metrics:")
print(fair_base)
print("\nBaseline disparities:")
print(disp_base)


# =====================================================
# 8. Fairness-aware training (reweighing)
# =====================================================

print("\n=== Fairness-aware logistic regression (reweighing) ===")
# The same pipeline object is refitted, so from here on pipe_log is the
# reweighed model; the baseline survives only through the predictions and
# metrics stored above.
pipe_log.fit(X_train, y_train, clf__sample_weight=w_train)
y_val_pred_fair = pipe_log.predict(X_val)
acc_fair = accuracy_score(y_val, y_val_pred_fair)
print(f"Fairness-aware accuracy: {acc_fair:.4f}")

fair_fair, disp_fair = compute_group_fairness(y_val, y_val_pred_fair, g_val)
print("Reweighing per-group metrics:")
print(fair_fair)
print("\nReweighing disparities:")
print(disp_fair)


# =====================================================
# 9. Post-processing: group-wise thresholds (Hardt-style)
# =====================================================

def predict_with_thresholds(model, X, groups, thresholds, proba_pos_index=1):
    """
    Turn predicted probabilities into 0/1 labels with one cut-off per group.

    The score of a row is column `proba_pos_index` of model.predict_proba(X).
    A row is labelled 1 when its score is at least the threshold stored for
    its group in `thresholds`; groups missing from the dict use 0.5.

    Returns a numpy array of 0/1 predictions in row order.
    """
    scores = model.predict_proba(X)[:, proba_pos_index]
    return np.array([
        1 if score >= thresholds.get(grp, 0.5) else 0
        for score, grp in zip(scores, groups)
    ])


def search_group_thresholds(model, X, y_true, groups, n_grid=21):
    """
    Pick, separately for every group, the probability cut-off that maximises
    accuracy minus false-positive rate on the given data.

    Candidates are `n_grid` evenly spaced values from 0.1 to 0.9. Ties keep
    the smallest candidate; a group where no candidate scores above -1.0
    keeps the default 0.5.

    Returns a dict {group_value: best_threshold}.
    """
    scores = model.predict_proba(X)[:, 1]
    frame = pd.DataFrame({
        "proba": scores,
        "y_true": y_true,
        "group": groups
    })
    candidates = np.linspace(0.1, 0.9, n_grid)

    def _tradeoff(group_scores, truth, cut):
        # accuracy - FPR for one group at one cut-off
        flagged = group_scores >= cut
        is_pos = truth == 1
        is_neg = truth == 0

        tp = (is_pos & flagged).sum()
        fp = (is_neg & flagged).sum()
        tn = (is_neg & ~flagged).sum()

        n_rows = len(truth)
        accuracy = (tp + tn) / n_rows if n_rows > 0 else 0.0
        false_pos_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0
        return accuracy - false_pos_rate

    thresholds = {}
    for grp, part in frame.groupby("group"):
        group_scores = part["proba"].values
        truth = part["y_true"].values

        chosen, top_score = 0.5, -1.0
        for cut in candidates:
            current = _tradeoff(group_scores, truth, cut)
            # strict comparison: the first (lowest) of equally good cut-offs wins
            if current > top_score:
                chosen, top_score = cut, current

        thresholds[grp] = chosen

    return thresholds


print("\n=== Post-processing thresholds (group-wise) ===")
# NOTE(review): the thresholds are searched on the validation split and then
# scored on that same split, so the numbers printed below are optimistic.
thresholds = search_group_thresholds(pipe_log, X_val, y_val, g_val, n_grid=21)
print("Example thresholds (first 10 groups):", list(thresholds.items())[:10])

y_val_pred_post = predict_with_thresholds(pipe_log, X_val, g_val, thresholds)
acc_post = accuracy_score(y_val, y_val_pred_post)
print(f"Post-processed accuracy: {acc_post:.4f}")

fair_post, disp_post = compute_group_fairness(y_val, y_val_pred_post, g_val)
print("Post-processing per-group metrics:")
print(fair_post)
print("\nPost-processing disparities:")
print(disp_post)


# =====================================================
# 10. Train final fairness-aware model on full train
#     and save submission for test set
# =====================================================

print("\n=== Train final model on full train (with reweighing) ===")

# Recompute the weights on all labelled rows, then refit on all of them
train_full = X.copy()
train_full["label"] = y.values
w_full = compute_reweighing_weights(
    train_full, group_col=sensitive_col, label_col="label"
)

pipe_log.fit(X, y, clf__sample_weight=w_full)

# Plain 0.5-threshold predictions: the group-wise thresholds found above are
# not applied to the test set.
test_pred = pipe_log.predict(X_test)
submission = pd.DataFrame({"label": test_pred})

out_path = os.path.join(working_dir, "submission_fair_logreg.csv")
submission.to_csv(out_path, index=False)
print("Saved final submission to:", out_path)


Using features: ['geo', 'iccs', 'time', 'value']
Reweighing weights range: 0.6431338295772507 to 2.2461064639351735

=== Baseline logistic regression ===
Baseline accuracy: 0.7786
Baseline per-group metrics:
        N  TP  FP  TN  FN       TPR       FPR       FNR  PositiveRate
group                                                                
AL     70  13   2  40  15  0.464286  0.047619  0.535714      0.214286
AT     84  40   3  35   6  0.869565  0.078947  0.130435      0.511905
BA     41  16   5  14   6  0.727273  0.263158  0.272727      0.512195
BE     80  41  10  20   9  0.820000  0.333333  0.180000      0.637500
BG     82  21   6  44  11  0.656250  0.120000  0.343750      0.329268
CH     71  27   2  31  11  0.710526  0.060606  0.289474      0.408451
CY     74  10   1  47  16  0.384615  0.020833  0.615385      0.148649
CZ     77  15   6  43  13  0.535714  0.122449  0.464286      0.272727
DE     69  35   1  23  10  0.777778  0.041667  0.222222      0.521739
DK     67  31   3  23 