In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier

In [None]:
# Load the iris dataset as a feature matrix X and a class-label vector y.
iris = load_iris()

X = iris.data
y = iris.target

# Hold out 20% of the samples for testing.
# random_state pins the shuffle so the split (and every score computed from it)
# is the same after Restart & Run All; stratify=y keeps the class proportions
# of y the same in both the train and the test part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Transformer: learn the scaling statistics from the training split only,
# then apply them to that same split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_transformed = scaler.transform(X_train)

In [None]:
# Estimator: linear classifier trained with stochastic gradient descent on the
# standardized training features.
# SGD shuffles the training data between epochs; random_state makes that
# shuffling (and therefore the fitted model and its score) repeatable.
model = SGDClassifier(random_state=42)
model.fit(X_train_transformed, y_train)

In [None]:
# Test: evaluate on the held-out split.
# The scaler is only applied here (transform, not fit_transform), so the test
# data is scaled with the statistics fitted on the training split.
X_test_transformed = scaler.transform(X_test)
y_pred = model.predict(X_test_transformed)
# score() on a classifier returns the mean accuracy; as the last expression of
# the cell it is the value that gets displayed.
model.score(X_test_transformed, y_test)

0.9

Combine operations using a pipeline

In [None]:
from sklearn.pipeline import make_pipeline

In [None]:
# Chain scaling and classification into a single estimator: fit() fits the
# scaler and the classifier together, and predict()/score() apply the scaler
# automatically before the classifier.
# random_state fixes SGD's data shuffling so the result is reproducible.
model = make_pipeline(StandardScaler(),
                      SGDClassifier(random_state=42))

In [None]:
# fit() hands back the fitted pipeline itself, so training and prediction can
# be chained in a single statement.
predictions = model.fit(X_train, y_train).predict(X_test)
# Accuracy on the held-out split (displayed as the cell's result).
model.score(X_test, y_test)

0.9666666666666667

Hyperparameter tuning with cross-validated grid search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Pipeline: polynomial feature expansion -> standardization -> SGD classifier.
# make_pipeline names each step after its lowercased class name
# ('polynomialfeatures', 'standardscaler', 'sgdclassifier'); those names are
# the prefixes used to address step parameters in a parameter grid.
# random_state fixes SGD's data shuffling so a search over this pipeline gives
# the same result on every run.
model = make_pipeline(PolynomialFeatures(),
                      StandardScaler(),
                      SGDClassifier(random_state=42))

In [None]:
# Hyper-parameter candidates, addressed as <pipeline step name>__<parameter>.
params = {
    "polynomialfeatures__degree": [2, 3, 4],
    "sgdclassifier__penalty": ["l1", "l2"],
}

# Exhaustive search over the 3 x 2 combinations, each one scored with 4-fold
# cross-validation on the training split.
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=4,
)

grid.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
grid.best_params_

{'polynomialfeatures__degree': 3, 'sgdclassifier__penalty': 'l1'}

In [None]:
# Mean cross-validated score of the best combination. It is measured on folds
# of the training split (grid was fitted on X_train), not on X_test.
grid.best_score_

0.9916666666666667

Comparison with a raw model (no preprocessing)

In [None]:
# Baseline: the same kind of classifier fitted directly on the raw, unscaled
# features, to show what the preprocessing steps contribute.
# random_state fixes SGD's data shuffling so the baseline score is repeatable
# instead of varying from run to run.
model_0 = SGDClassifier(random_state=42)

model_0.fit(X_train, y_train)
model_0.score(X_test, y_test)

0.7666666666666667