###### Cross-validation gives a good performance estimate

Model 1: The naive approach — transform all the data first, then fit.

In [1]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

# Synthetic binary-classification dataset: 1000 samples, 20 features
# (5 of them redundant), generated with a fixed seed.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_redundant=5,
    random_state=7,
)

# Naive step: the scaler is fitted on the *entire* dataset before any
# splitting, so the rows later used as test folds have already influenced
# the scaling statistics. This is the leakage this cell demonstrates.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

# Classifier under evaluation, with library-default settings.
model = LogisticRegression()

# Evaluation procedure: stratified 10-fold CV repeated 3 times, fixed seed.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Score the model on the pre-scaled data; one accuracy value per
# fold per repeat.
scores = cross_val_score(
    model,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

# Report the mean and standard deviation of accuracy (as percentages)
# across all of the repeats and folds.
print('Accuracy: %.3f (%.3f)' % (mean(scores) * 100, std(scores) * 100))

Accuracy: 92.900 (2.675)


#####      The Recommended Approach: a Pipeline evaluated with CV


In [5]:
# Data preparation without leakage is done using a Pipeline.

from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Dataset: generated with the same arguments and seed as in the naive cell.
X1, y1 = make_classification(
    n_samples=1000,
    n_features=20,
    n_redundant=5,
    random_state=7,
)

# Chain the scaler and the classifier into a single estimator so that the
# scaler is refitted inside every cross-validation split rather than once
# on the full dataset.
steps = [
    ('Scaler', MinMaxScaler()),
    ('model', LogisticRegression()),
]
pipeline = Pipeline(steps=steps)

# Evaluation procedure: stratified 10-fold CV, fixed seed.
# NOTE(review): n_repeats is 7 here but 3 in the naive cell, so the two
# reported accuracies do not come from an identical evaluation procedure.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=7, random_state=1)

# Evaluate the whole pipeline (scaling + model) with cross-validation.
scores = cross_val_score(
    pipeline,
    X1,
    y1,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

# Report performance: mean and standard deviation of accuracy, as percentages.
print("Accuracy: %.3f (%.3f)" % (mean(scores) * 100, std(scores) * 100))


Accuracy: 92.943 (2.596)


In [None]:
# Although the improvement may be the result of parameter tuning,
# the recommended approach to avoiding leakage is to scale
# the data after splitting (fit the scaler on the training folds only).

#----- on to Data Cleaning ---Thanks Brownlee --->