In [5]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Problem size: far more features than samples, so some pure-noise features
# are bound to look predictive by chance.
n_samples = 50
n_features = 5000
n_classes = 2

# Seed a dedicated generator so that "Restart Kernel -> Run All" regenerates
# exactly the same dataset (the global NumPy RNG was previously left unseeded).
SEED = 2025
rng = np.random.default_rng(SEED)

# Features are i.i.d. standard-normal noise and the labels are drawn
# independently of X, so there is no real signal for a model to learn.
X = rng.standard_normal((n_samples, n_features))
y = rng.integers(0, n_classes, size=n_samples)

# DataFrame views with named columns, used only for inspection below.
X_df = pd.DataFrame(X, columns=[f"Feature_{i+1}" for i in range(n_features)])
y_df = pd.DataFrame(y, columns=["Class"])

print(f"Dimensiones de X: {X_df.shape}")
print(f"Dimensiones de y: {y_df.shape}")

# Class counts of the randomly drawn labels.
y_df.value_counts()



Dimensiones de X: (50, 5000)
Dimensiones de y: (50, 1)


Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
1,28
0,22


# Wrong Way

In [14]:
# Number of features kept by the correlation filter.
top_k = 100

# INTENTIONALLY WRONG (this cell demonstrates the pitfall): features are ranked
# using ALL samples, including the ones that later act as test folds, so label
# information leaks into the evaluation.
#
# Absolute Pearson correlation of every feature with the target, computed
# directly. np.corrcoef(X.T, y) would build the full
# (n_features + 1) x (n_features + 1) matrix only to read its last row.
X_centered = X - X.mean(axis=0)
y_centered = y - y.mean()
correlations = np.abs(
    (X_centered.T @ y_centered)
    / (np.linalg.norm(X_centered, axis=0) * np.linalg.norm(y_centered))
)
# argsort is ascending, so the last top_k indices are the strongest correlations.
top_features = np.argsort(correlations)[-top_k:]

X_filtered = X[:, top_features]

kf = KFold(n_splits=5, shuffle=True, random_state=2025)
accuracies_incorrect = []

for train_idx, test_idx in kf.split(X_filtered):
    # The split happens AFTER feature selection, which is the source of the leak.
    X_train, X_test = X_filtered[train_idx], X_filtered[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracies_incorrect.append(accuracy_score(y_test, y_pred))

print(f"The accuracy obtained with the incorrect approach is: {np.mean(accuracies_incorrect):.3f}")

The accuracy obtained with the incorrect appraoch is: 1.000


# Correct Way

In [15]:
# Same splitter configuration as the incorrect approach, so both evaluations
# use identical folds and differ only in where feature selection happens.
kf = KFold(n_splits=5, shuffle=True, random_state=2025)
accuracies_correct = []

for train_idx, test_idx in kf.split(X):

    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Rank features using ONLY the training fold, so the held-out fold has no
    # influence on which features are selected.
    # Absolute Pearson correlation of every feature with the target, computed
    # directly; np.corrcoef(X_train.T, y_train) would build the full
    # (n_features + 1) x (n_features + 1) matrix in every fold to read one row.
    X_centered = X_train - X_train.mean(axis=0)
    y_centered = y_train - y_train.mean()
    correlations = np.abs(
        (X_centered.T @ y_centered)
        / (np.linalg.norm(X_centered, axis=0) * np.linalg.norm(y_centered))
    )
    # argsort is ascending, so the last top_k indices are the strongest correlations.
    top_features = np.argsort(correlations)[-top_k:]

    # Apply the training-fold selection to both splits.
    X_train_filtered = X_train[:, top_features]
    X_test_filtered = X_test[:, top_features]

    model = LogisticRegression()
    model.fit(X_train_filtered, y_train)
    y_pred = model.predict(X_test_filtered)

    accuracies_correct.append(accuracy_score(y_test, y_pred))



# Reality vs Expectation

In [13]:
# Mean fold accuracy of both evaluation strategies, side by side.
print(f"The accuracy obtained with the incorrect approach is: {np.mean(accuracies_incorrect):.3f}")
print(f"The accuracy obtained with the correct approach is: {np.mean(accuracies_correct):.3f}")

The accuracy obtained with the incorrect appraoch is: 1.000
The accuracy obtained with the correct approach is: 0.580


# Why does this happen?

This happens because, in the incorrect approach, feature selection (the `top_features` filtering) is performed on the full dataset before it is split into training and test sets. As a result, information from the test set is used when selecting features, which causes data leakage. The features are effectively 'optimized' with knowledge of the test data, leading to overly optimistic model performance during cross-validation.

In this example, we have 5k features and only 50 samples. With so many features, there is a high probability that some of them will appear correlated with the target variable purely by chance. These correlations do not represent real patterns, only noise. Selecting the features that happen to look best on the data you later evaluate on is a form of selection bias, often called cherry-picking.

If you have a large amount of data, you can split the dataset into a training and a test set at the beginning and use only the training set for feature selection. However, when data is limited, a single split leaves few samples for selecting features and training, and the resulting performance estimate depends heavily on which samples happen to land in the test set, so the model can end up tuned too much to one specific set of features. In this case, using K-fold cross-validation is essential, provided that feature selection is repeated inside each fold using only that fold's training data. This ensures that the whole feature selection and model training process is evaluated across different subsets of the data, providing a more robust estimate of model performance and reducing the risk of being misled by overfitting.

**Disadvantages of the Correct Approach.**

In the K-Fold approach, a different subset of 100 features is selected in each fold. This means that the specific features included in the model may change from fold to fold, and when the features are mostly noise, as in this example, they can change considerably. As a result, interpreting the model becomes more challenging, because the selected features and their associated coefficients vary across the folds. This variability can make it harder to draw clear conclusions about which features are truly important for the model’s predictions.