# sklearn

## Pipeline

A useful API for chaining operations on a dataset.

In [None]:
from sklearn.datasets import make_classification
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# Synthetic classification problem with 1000 samples; `data` is the
# (features, labels) pair returned by make_classification.
data = make_classification(n_samples=1000)

# Chain the steps in order: standardise, project with PCA, then classify.
pipe = make_pipeline(StandardScaler(), PCA(), LogisticRegression())

# Fit every step in sequence; as the cell's last expression, the fitted
# pipeline is also what the notebook displays.
pipe.fit(data[0], data[1])


## Feature aggregation

In [None]:
from sklearn.datasets import make_classification
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.neural_network import MLPClassifier

# Synthetic classification problem with 1000 samples; `data` is the
# (features, labels) pair returned by make_classification.
data = make_classification(n_samples=1000)

# Three feature extractors applied to the same scaled input; make_union
# concatenates their outputs into a single feature matrix.
features = make_union(
    PCA(n_components=2),
    TruncatedSVD(n_components=2),
    KBinsDiscretizer(),
)

# Scale first, build the aggregated features, then train the classifier on them.
pipe = make_pipeline(StandardScaler(), features, MLPClassifier())

# As the cell's last expression, the fitted pipeline is what the notebook displays.
pipe.fit(data[0], data[1])

## Column Transformer

In [None]:
from sklearn.datasets import make_classification
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier

# Synthetic classification problem with 1000 samples; `data` is the
# (features, labels) pair returned by make_classification.
data = make_classification(n_samples=1000)

# One (name, transformer, columns) entry per column range. The slices cover
# columns 0-19, i.e. they rely on make_classification's default feature count.
column_steps = [
    ("scaled", StandardScaler(), slice(0, 10)),
    ("disc", KBinsDiscretizer(n_bins=10), slice(10, 12)),
    ("rest", "passthrough", slice(12, 20)),
]
preprocess = ColumnTransformer(column_steps)

# Per-column preprocessing, then reduce to 2 components, then classify.
pipe = make_pipeline(preprocess, PCA(n_components=2), MLPClassifier())

# As the cell's last expression, the fitted pipeline is what the notebook displays.
pipe.fit(data[0], data[1])


In [1]:
from sklearn.datasets import make_classification
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier

# NOTE(review): the output recorded for this cell lists the steps
# 'standardscaler', 'pca' and 'logisticregression', which a ColumnTransformer
# pipeline cannot produce. The body is therefore the plain
# scale -> PCA -> logistic-regression pipeline that matches that output.
# LogisticRegression is imported here because this cell's import lines lack it.
from sklearn.linear_model import LogisticRegression

# Synthetic classification problem with 1000 samples, as an (X, y) pair.
data = make_classification(1000)

# Chain the steps in order: standardise, project with PCA, then classify.
pipe = make_pipeline(
    StandardScaler(),
    PCA(),
    LogisticRegression(),
)

# As the cell's last expression, the fitted pipeline is what the notebook displays.
pipe.fit(*data)




Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

## Feature aggregation

In [4]:
from sklearn.datasets import make_classification
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.neural_network import MLPClassifier

# Synthetic classification problem with 1000 samples; `data` is the
# (features, labels) pair returned by make_classification.
data = make_classification(n_samples=1000)

# Three feature extractors applied to the same scaled input; make_union
# concatenates their outputs into a single feature matrix.
features = make_union(
    PCA(n_components=2),
    TruncatedSVD(n_components=2),
    KBinsDiscretizer(),
)

# Scale first, build the aggregated features, then train the classifier on them.
pipe = make_pipeline(StandardScaler(), features, MLPClassifier())

# As the cell's last expression, the fitted pipeline is what the notebook displays.
pipe.fit(data[0], data[1])

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('featureunion',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('pca',
                                                 PCA(copy=True,
                                                     iterated_power='auto',
                                                     n_components=2,
                                                     random_state=None,
                                                     svd_solver='auto', tol=0.0,
                                                     whiten=False)),
                                                ('truncatedsvd',
                                                 TruncatedSVD(algorithm='randomized',
                                                              n_components=2,
                                                              n_ite...
    

## Column Transformer

In [3]:
from sklearn.datasets import make_classification
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier

# Synthetic classification problem with 1000 samples; `data` is the
# (features, labels) pair returned by make_classification.
data = make_classification(n_samples=1000)

# One (name, transformer, columns) entry per column range. The slices cover
# columns 0-19, i.e. they rely on make_classification's default feature count.
column_steps = [
    ("scaled", StandardScaler(), slice(0, 10)),
    ("disc", KBinsDiscretizer(n_bins=10), slice(10, 12)),
    ("rest", "passthrough", slice(12, 20)),
]
preprocess = ColumnTransformer(column_steps)

# Per-column preprocessing, then reduce to 2 components, then classify.
pipe = make_pipeline(preprocess, PCA(n_components=2), MLPClassifier())

# As the cell's last expression, the fitted pipeline is what the notebook displays.
pipe.fit(data[0], data[1])


Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('scaled',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  slice(0, 10, None)),
                                                 ('disc',
                                                  KBinsDiscretizer(encode='onehot',
                                                                   n_bins=10,
                                                                   strategy='quantile'),
                                                  slice(10, 12, None)),
      