# 避免 data leakage

在数据预处理（如标准化、归一化）过程中，如果在划分训练集和测试集之前就用全部数据来拟合变换参数（例如均值和方差），测试集的信息就会泄漏到训练过程中，造成 data leakage，使评估结果过于乐观。

在特征提取（如 PCA、特征选择）中，同样存在这个问题。

解决办法：使用 Pipeline，把数据变换和模型封装在一起，使变换只在交叉验证每一折的训练数据上拟合。

In [1]:
# Data Preparation and Modeling Pipeline

from pandas import read_csv

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Load the dataset from the remote CSV.
# NOTE(review): this UCI URL may no longer be served — confirm it still
# resolves, or point it at a local copy of the file.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
# Column labels are supplied explicitly through `names=`.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv(url, names=names)

# The first eight columns are the inputs; the ninth ('class') is the target.
X = df.iloc[:, 0:8]
Y = df.iloc[:, 8]

# Create the pipeline: the scaler and the classifier are chained so that
# cross-validation refits the scaler on each training fold only, which is
# what prevents statistics of the held-out fold from leaking into training.
estimators = [
    ('standardize', StandardScaler()),
    ('lda', LinearDiscriminantAnalysis()),
]
model = Pipeline(estimators)

# Evaluate the pipeline with 10-fold cross-validation.
# `random_state` is intentionally not passed: without `shuffle=True` it has
# no effect on the splits, and recent scikit-learn releases raise a
# ValueError when it is set in that case. The folds stay sequential.
kfold = KFold(n_splits=10)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.773462064252


In [3]:
# Feature Extraction and Modeling Pipeline
from pandas import read_csv

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.linear_model import LogisticRegression

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

# Data: reuses X and Y as loaded in the previous cell.

# Create the feature union: the outputs of both transformers are
# concatenated side by side (3 PCA components + 6 selected columns).
features = [
    ('pca', PCA(n_components=3)),
    ('select_best', SelectKBest(k=6)),
]
features_union = FeatureUnion(features)

# Create the pipeline: feature extraction lives inside the pipeline so it is
# refit on each training fold during cross-validation instead of seeing the
# whole dataset up front.
estimators_features = [
    ('feature_union', features_union),
    ('logistic', LogisticRegression()),
]
model_features = Pipeline(estimators_features)

# Evaluate the pipeline with 10-fold cross-validation.
# `random_state` is intentionally not passed: without `shuffle=True` it has
# no effect on the splits, and recent scikit-learn releases raise a
# ValueError when it is set in that case. The folds stay sequential.
kfold = KFold(n_splits=10)
results_features = cross_val_score(model_features, X, Y, cv=kfold)
print(results_features.mean())

0.776042378674
