In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score

Design dataset

In [2]:
num_samples = 50
num_features = 1000

x = np.random.rand(num_samples, num_features)
y = np.round(np.random.rand(num_samples))

Build "Wrong" CV Model

In [3]:
# Select top predictors
df = pd.DataFrame(columns=['column', 'corr', 'corr_abs'])
for i in range(num_features):
    corr = np.corrcoef(x[:, i], y)[0, 1]
    df.loc[len(df)] = [i, corr, np.abs(corr)]
top_i = df.sort_values('corr_abs', ascending=False)['column'].astype(int).values[:20]
xx = x[:, top_i]

# Build model
scores = []
for train_index, test_index in StratifiedKFold(n_splits=10).split(xx, y):
    x_train, x_test = x[train_index, :], x[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    scores.append(roc_auc_score(y_test, lr.predict_proba(x_test)[:, 1]))
print(np.mean(scores))

0.4833333333333333


Build "Right" CV Model

In [4]:
# Build model
scores = []
for train_index, test_index in StratifiedKFold(n_splits=10).split(x, y):
    # Select training and test sets
    x_train, x_test = x[train_index, :], x[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    
    # Select top predictors
    df = pd.DataFrame(columns=['column', 'corr', 'corr_abs'])
    for i in range(num_features):
        corr = np.corrcoef(x_train[:, i], y_train)[0, 1]
        df.loc[len(df)] = [i, corr, np.abs(corr)]
    top_i = df.sort_values('corr_abs', ascending=False)['column'].astype(int).values[:20]
    xx_train = x_train[:, top_i]
    xx_test = x_test[:, top_i]
    
    # Build model
    lr = LogisticRegression()
    lr.fit(xx_train, y_train)
    scores.append(roc_auc_score(y_test, lr.predict_proba(xx_test)[:, 1]))
print(np.mean(scores))

    




0.6
