In [62]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (cross_val_score, cross_val_predict,
                                     StratifiedKFold, permutation_test_score)
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import roc_auc_score

seed = 243452
n_inform = 100
n_samples = 1000
n_samples_signal = 400
n_samples_no_signal = n_samples - n_samples_signal

In [47]:
X_1, y = make_classification(n_samples=n_samples,
                             n_features=1000,
                             n_informative=n_inform,
                             n_redundant=0,
                             shuffle=True,
                             random_state=seed)

In [48]:
clf = LogisticRegression()
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

In [49]:
scores = cross_val_score(clf, X_1, y, cv=cv, scoring='roc_auc')

In [50]:
print(scores)
print(scores.mean(), scores.std())


[0.86208621 0.89378938 0.90459046 0.90079008 0.8949895 ]
0.8912491249124912 0.015099271350604432


In [51]:
kbest = SelectKBest(k=n_inform)
kbest.fit(X_1, y)

SelectKBest(k=100, score_func=<function f_classif at 0x1a11eceae8>)

In [52]:
# remove information from X
X_2 = X_1.copy()
X_2 = X_2[:, kbest.get_support() != True]
X_2 = np.concatenate((np.random.permutation(X_2[:, -n_inform:]), X_2), axis=1)

In [63]:
n_samples_no_signal

600

In [64]:
# Make combine X
X_mixed = np.concatenate((X_1[:n_samples_signal], X_2[n_samples_signal:]))

In [67]:
scores_mixed = cross_val_score(clf, X_mixed, y, cv=cv, scoring='roc_auc')

In [68]:
print(scores_mixed)
print(scores_mixed.mean(), scores_mixed.std())

[0.6039604  0.65786579 0.58135814 0.57235724 0.63876388]
0.610861086108611 0.03282863891679688


In [69]:
scores_2 = cross_val_score(clf, X_2, y, cv=cv, scoring='roc_auc')

In [70]:
print(scores_2)
print(scores_2.mean(), scores_2.std())

[0.4460446  0.36683668 0.44714471 0.40164016 0.44264426]
0.42086208620862087 0.03189866917041709


In [84]:
clf = LogisticRegression()
clf.fit(X_1, y)
roc_auc_score(y, clf.predict(X_2))

0.5091609160916092