<a   href="https://colab.research.google.com/github/N-Nieto/OHBM_SEA-SIG_Educational_Course/blob/master/03_pitfalls/03_02_data_leakage.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Leakage exploration

In [None]:
# Import modules
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeClassifierCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from pathlib import Path

In [None]:
# Prepare the data
# Features: Cortical + Subcortical
features = ["cortical", "subcortical"]

# Target: Sex
target = ["SEX_ID (1=m, 2=f)"]
# Confounding variables: No for this example
confounding = []

data_path = Path("../data/")
df_data = pd.read_csv(data_path / "cleaned_IXI_behavioural.csv", index_col=0)
columns_features = []
for feature in features:
    if feature == "cortical":
        df_feature = pd.read_csv(
            data_path / "cleaned_VBM_GM_Schaefer100x17_mean_aggregation.csv",
            index_col=0,
        )
    elif feature == "subcortical":
        df_feature = pd.read_csv(
            data_path
            / "cleaned_VBM_GM_TianxS1x3TxMNI6thgeneration_mean_aggregation.csv",
            index_col=0,
        )
    else:
        print("feature not recognized")

    df_data = df_data.join(df_feature, how="inner")
    columns_features = columns_features + df_feature.columns.to_list()


print(f"Initial data shape: {df_data.shape}")

# Check for NaNs in confounding columns
confounding_cols = target + confounding
for col in confounding_cols:
    if df_data[col].isna().sum() > 0:
        print(f"{df_data[col].isna().sum()} NaNs in column {col}.")
        print("Drop NaNs and align subjects")

        # Drop NaNs from the brain dataframe (which contains all columns)
        df_data = df_data.dropna(subset=[col])
        print(f"New data shape: {df_data.shape}")
    else:
        print(f"No NaNs in column {col}.")

print(f"Final data shape: {df_data.shape}")

y = df_data[target].values.ravel()
if target == ["SEX_ID (1=m, 2=f)"]:
    y = np.where(y == 2, 0, 1)  # 1


X = df_data.loc[:, columns_features].values  # only brain features

print("X shape")
print(X.shape)


Final data shape: (588, 122)


In [34]:
# Set parameters
random_state = 50
test_size = 0.3


# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, shuffle=True, random_state=random_state
)

In [18]:
df_data["SEX_ID (1=m, 2=f)"].value_counts()  # the classes look reasonably balanced


# random samply majority class to have balanced classes
def random_undersample(X, y, random_state=42):
    # Find the indices of each class
    class_0_indices = np.where(y == 1)[0]
    class_1_indices = np.where(y == 2)[0]

    # Determine the size of the minority class
    min_class_size = min(len(class_0_indices), len(class_1_indices))

    # Randomly sample from each class to match the minority class size
    np.random.seed(random_state)
    sampled_class_0_indices = np.random.choice(
        class_0_indices, min_class_size, replace=False
    )
    sampled_class_1_indices = np.random.choice(
        class_1_indices, min_class_size, replace=False
    )

    # Combine the sampled indices and shuffle them
    combined_indices = np.concatenate(
        (sampled_class_0_indices, sampled_class_1_indices)
    )
    np.random.shuffle(combined_indices)

    # Return the undersampled X and y
    return X[combined_indices], y[combined_indices]


X, y = random_undersample(X, y, random_state=random_state)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, shuffle=True, random_state=random_state
)


In [35]:
# Check size of data
print("X shape", X.shape)
print("y shape", y.shape)
print("X_train shape", X_train.shape)
print("X_test shape", X_test.shape)
print("y_train shape", y_train.shape)
print("y_test shape", y_test.shape)


X shape (588, 116)
y shape (588,)
X_train shape (411, 116)
X_test shape (177, 116)
y_train shape (411,)
y_test shape (177,)


#  Leakage example 1:
### Train on whole data:

In [36]:
# Train our model on the whole data (Fig. 2 in Sasse et al., 2025)
dt_raw_lkg1 = DecisionTreeClassifier(max_depth=10, random_state=random_state)
dt_raw_lkg1.fit(X, y)

print("Raw Data - Train accuracy:", dt_raw_lkg1.score(X, y))
print("Raw Data - Test accuracy:", dt_raw_lkg1.score(X_test, y_test))


Raw Data - Train accuracy: 0.9795918367346939
Raw Data - Test accuracy: 0.9830508474576272


### Correct procedure:

In [37]:
# Train our model on the train set and test on the test set
dt_raw = DecisionTreeClassifier(max_depth=10, random_state=random_state)
dt_raw.fit(X_train, y_train)

print("Raw Data - Train accuracy:", dt_raw.score(X_train, y_train))
print("Raw Data - Test accuracy:", dt_raw.score(X_test, y_test))

Raw Data - Train accuracy: 0.9829683698296837
Raw Data - Test accuracy: 0.6666666666666666


When the model was trained in train set and tested on test set the test performance dropped.
When the model was trained in the whole dataset it performed well in both, train and test datasets. 
This is because the model learned patterns of the test set during training.

# Leakage example 2:
### Feature selection on whole dataset:

In [38]:
# Define reproducible cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# ========== Case 1: Data Leakage ==========
# Feature selection on entire dataset (leakage)
selector = SelectKBest(score_func=f_classif, k=5)
X_selected_leak = selector.fit_transform(X, y)

# Scale and reduce dimensionality on entire dataset (leakage)
scaler = StandardScaler()
X_scaled_leak = scaler.fit_transform(X_selected_leak)

# Evaluate using fixed CV
model = RidgeClassifierCV()
scores_leakage = []
for train, test in cv.split(X, y):
    model.fit(X_scaled_leak[train, :], y[train])
    pred = model.predict(X_scaled_leak[test, :])
    scores_leakage.append(roc_auc_score(y[test], pred))

### Correct procedure:

In [None]:
# ========== Case 2: No Leakage ==========
# Pipeline with feature selection inside each fold
selector = SelectKBest(score_func=f_classif, k=5)
scaler = StandardScaler()
model = RidgeClassifierCV()
scores_no_leakage = []

for train, test in cv.split(X, y):
    X_train = X[train, :]
    y_train = y[train]
    X_test = X[test, :]
    y_test = y[test]
    
    # Scale and reduce dimensionality on entire dataset (leakage)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Fit feature selector
    X_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Fit ML model
    model.fit(X_selected, y_train)

    pred = model.predict(X_test_selected)
    scores_no_leakage.append(roc_auc_score(y[test], pred))

In [41]:
# ========== Compare Results ==========
results_df = pd.DataFrame(
    {
        "Fold": np.arange(1, 6),
        "Accuracy with Leakage": scores_leakage,
        "Accuracy without Leakage": scores_no_leakage,
        "Difference": np.array(scores_leakage) - np.array(scores_no_leakage),
    }
)

print(results_df)
print("\nMean Accuracy with Leakage: ", round(np.mean(scores_leakage), 4))
print("Mean Accuracy without Leakage: ", round(np.mean(scores_no_leakage), 4))
print("Mean Difference: ", round(np.mean(results_df["Difference"]), 4))

   Fold  Accuracy with Leakage  Accuracy without Leakage  Difference
0     1               0.733974                  0.733974    0.000000
1     2               0.722902                  0.715326    0.007576
2     3               0.757867                  0.757867    0.000000
3     4               0.813462                  0.813462    0.000000
4     5               0.717308                  0.738462   -0.021154

Mean Accuracy with Leakage:  0.7491
Mean Accuracy without Leakage:  0.7518
Mean Difference:  -0.0027


The approach causing leakage generally yielded better performance than the correct approach. Even though in this example the effect of leakage is not huge, in bigger and complex datasets its effect is much severe.

It is important to note that the results and the effect of leakage might change based on the use of different models, seeds, samples, features, etc.
Also leakage is complex and it is often unclear where it might or might not show. However, it is always important to avoid it in order to yield valid estimations of model performance.
