In [43]:
import pandas as pd
import numpy as np
from scipy.stats import randint, uniform, geom, expon
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    MinMaxScaler,
    RobustScaler,
    OneHotEncoder
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

# Ask scikit-learn transformers to return pandas DataFrames (keeping column
# names) from transform/fit_transform instead of plain NumPy arrays.
from sklearn import set_config
set_config(transform_output = "pandas")

In [3]:
# Configuration for the synthetic dataset: tune these two values instead of
# editing every column definition below.
SEED = 42
N_ROWS = 10  # number of synthetic samples generated per column

# No `random_state` is passed to `.rvs()`, so the scipy draws and
# `np.random.choice` all use NumPy's global random state; seeding it once
# makes the whole dataset reproducible.
np.random.seed(SEED)

# Toy dataset: two integer features, two float features, two categorical
# features and a binary target. The columns are drawn in this exact order;
# reordering the entries changes the generated values.
data = {
    'numeric_col_0': randint(low=0, high=2).rvs(size=N_ROWS),
    'numeric_col_1': randint(low=1, high=10).rvs(size=N_ROWS),
    'float_col_0': uniform(0, 1).rvs(size=N_ROWS),
    'float_col_1': uniform(0, 10).rvs(size=N_ROWS),
    'categorical_col_0': np.random.choice(['a', 'b'], size=N_ROWS, p=[.5, .5]),
    'categorical_col_1': np.random.choice(['x', 'y', 'z'], size=N_ROWS, p=[.3, .3, .4]),
    'target': np.random.choice([0, 1], size=N_ROWS, p=[.7, .3])
}

In [4]:
# Assemble the raw frame: each key of `data` becomes one column.
df_raw = pd.DataFrame.from_dict(data)

In [5]:
# Separate the label column from the features.
y = df_raw['target']
X = df_raw.drop(columns=['target'])

In [32]:
# Select the columns of each kind so that every group gets its own
# preprocessing. The abstract NumPy dtypes match any integer / floating width
# (e.g. float32 as well as float64), and 'category' is selected together with
# 'object', so such columns are not left to the ColumnTransformer's
# untransformed `remainder`.
int_cols = X.select_dtypes(include=[np.integer]).columns
float_cols = X.select_dtypes(include=[np.floating]).columns
cat_cols = X.select_dtypes(include=['object', 'category']).columns

In [38]:
# One dedicated preprocessing pipeline per column type (make_pipeline could be
# used as well). verbose=True makes each pipeline log its steps while fitting.

# Integer columns: mean imputation, then scaling with MinMaxScaler.
pipe_num = Pipeline(
    [
        ('int_imp', SimpleImputer(strategy='mean')),
        ('num_prep', MinMaxScaler()),
    ],
    verbose=True,
)

# Float columns: median imputation, then scaling with RobustScaler.
pipe_float = Pipeline(
    [
        ('float_imp', SimpleImputer(strategy='median')),
        ('float_prep', RobustScaler()),
    ],
    verbose=True,
)

# Categorical columns: most-frequent imputation, then dense one-hot encoding
# that ignores categories unseen during fit.
pipe_cat = Pipeline(
    [
        ('cat_imp', SimpleImputer(strategy='most_frequent')),
        ('cat_prep', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ],
    verbose=True,
)

In [39]:
# Route each column group through its own pipeline; any column that belongs
# to none of the groups is passed through unchanged.
column_groups = [
    ('prep_num', pipe_num, int_cols),
    ('prep_float', pipe_float, float_cols),
    ('prep_cat', pipe_cat, cat_cols),
]
preprocess = ColumnTransformer(transformers=column_groups, remainder='passthrough')

In [40]:
# Display the ColumnTransformer as the cell output to inspect its structure.
preprocess

In [41]:
# Check which columns were picked up as categorical.
cat_cols

Index(['categorical_col_0', 'categorical_col_1'], dtype='object')

In [42]:
# Fit the preprocessing on X and display the transformed features as a sanity
# check; output columns are prefixed with the name of the transformer that
# produced them.
preprocess.fit_transform(X)

[Pipeline] ........... (step 1 of 2) Processing int_imp, total=   0.0s
[Pipeline] .......... (step 2 of 2) Processing num_prep, total=   0.0s
[Pipeline] ......... (step 1 of 2) Processing float_imp, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing float_prep, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing cat_imp, total=   0.0s
[Pipeline] .......... (step 2 of 2) Processing cat_prep, total=   0.0s


Unnamed: 0,prep_num__numeric_col_0,prep_num__numeric_col_1,prep_float__float_col_0,prep_float__float_col_1,prep_cat__categorical_col_0_a,prep_cat__categorical_col_0_b,prep_cat__categorical_col_1_x,prep_cat__categorical_col_1_y,prep_cat__categorical_col_1_z
0,0.0,0.0,1.68097,-0.776176,1.0,0.0,0.0,1.0,0.0
1,1.0,0.8,1.297002,-0.340128,1.0,0.0,1.0,0.0,0.0
2,0.0,1.0,-0.435048,-0.128126,0.0,1.0,0.0,0.0,1.0
3,0.0,0.4,-0.520279,0.128126,0.0,1.0,1.0,0.0,0.0
4,0.0,0.2,-0.515867,1.068219,0.0,1.0,0.0,0.0,1.0
5,1.0,1.0,-0.178347,-0.604271,1.0,0.0,0.0,1.0,0.0
6,0.0,1.0,0.437585,0.294273,1.0,0.0,0.0,1.0,0.0
7,0.0,0.0,0.178347,0.517595,0.0,1.0,0.0,1.0,0.0
8,0.0,0.6,-0.214695,-1.041955,1.0,0.0,1.0,0.0,0.0
9,1.0,0.4,0.680859,0.560815,1.0,0.0,0.0,0.0,1.0


In [44]:
# Finally, chain the preprocessing and an estimator in a single Pipeline so
# that both are fitted and applied together.
model = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', LogisticRegression()),
    ]
)

In [45]:
# Display the full pipeline (preprocessing + classifier) as the cell output.
model

In [47]:
# Fit the whole pipeline: the preprocessing is fitted on X first, then the
# classifier is trained on the transformed features.
model.fit(X, y)

[Pipeline] ........... (step 1 of 2) Processing int_imp, total=   0.0s
[Pipeline] .......... (step 2 of 2) Processing num_prep, total=   0.0s
[Pipeline] ......... (step 1 of 2) Processing float_imp, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing float_prep, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing cat_imp, total=   0.0s
[Pipeline] .......... (step 2 of 2) Processing cat_prep, total=   0.0s


In [48]:
# Predict on the same rows that were used for fitting: this only shows that
# the pipeline runs end to end; it is not an estimate of generalisation.
model.predict(X)

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0])

In [49]:
# True labels as a NumPy array, for comparison with the predictions.
y.to_numpy()

array([1, 1, 1, 0, 1, 0, 0, 0, 0, 0])