In [1]:
import pandas as pd
import numpy as np
from scipy.stats import randint, uniform, geom, expon
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    MinMaxScaler,
    RobustScaler,
    OneHotEncoder
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

from sklearn import set_config
# Have scikit-learn transformers return pandas DataFrames (with column names)
# instead of bare NumPy arrays.
set_config(transform_output="pandas")

In [2]:
# Seed NumPy's global RNG so the synthetic data below is reproducible.
np.random.seed(42)

# Toy dataset with 10 rows: two integer columns, two float columns,
# two categorical columns and a binary target.
# NOTE: every entry draws from the same global RNG in the order written,
# so reordering the entries would change the generated values.
data = dict(
    # integers in [0, 2) and [1, 10) -- `high` is exclusive
    numeric_col_0=randint(low=0, high=2).rvs(size=10),
    numeric_col_1=randint(low=1, high=10).rvs(size=10),
    # uniform floats on [loc, loc + scale]
    float_col_0=uniform(loc=0, scale=1).rvs(size=10),
    float_col_1=uniform(loc=0, scale=10).rvs(size=10),
    # string categories sampled with the given probabilities
    categorical_col_0=np.random.choice(['a', 'b'], size=10, p=[0.5, 0.5]),
    categorical_col_1=np.random.choice(['x', 'y', 'z'], size=10, p=[0.3, 0.3, 0.4]),
    # imbalanced binary label (about 70% zeros, 30% ones)
    target=np.random.choice([0, 1], size=10, p=[0.7, 0.3]),
)

In [3]:
# Assemble the raw frame: each dict key becomes a column.
df_raw = pd.DataFrame.from_dict(data)

In [4]:
# Separate the label from the predictors; `drop` returns a new frame,
# so df_raw itself is left untouched.
y = df_raw['target']
X = df_raw.drop(columns=['target'])

In [5]:
# Select the feature columns of each dtype group: integer, float and
# object (the string-valued categorical columns in this dataset).
int_cols, float_cols, cat_cols = (
    X.select_dtypes(include=[dtype_name]).columns
    for dtype_name in ('int', 'float', 'object')
)

In [6]:
# Build a dedicated preprocessing pipeline for each column type.
# make_pipeline is enough here: it names the steps automatically.
# verbose=True makes each pipeline log its steps while fitting.

# Integer columns: fill gaps with the mean, then rescale with MinMaxScaler.
pipe_num = make_pipeline(
    SimpleImputer(strategy='mean'),
    MinMaxScaler(),
    verbose=True,
)

# Float columns: fill gaps with the median, then apply RobustScaler.
pipe_float = make_pipeline(
    SimpleImputer(strategy='median'),
    RobustScaler(),
    verbose=True,
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense output; categories unseen during fit are ignored.
pipe_cat = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    verbose=True,
)

In [8]:
# Route each column group to its own pipeline. Any column not listed in one
# of the groups is passed through unchanged.
preprocess = make_column_transformer(
    (pipe_num, int_cols),      # integer features
    (pipe_float, float_cols),  # float features
    (pipe_cat, cat_cols),      # categorical features
    remainder='passthrough',
)

In [9]:
# Render the column transformer as the cell output to inspect its structure.
preprocess

In [10]:
# Fit the transformer on X and show the transformed features. The inner
# pipelines were created with verbose=True, so each step is logged as it runs.
preprocess.fit_transform(X)

[Pipeline] ..... (step 1 of 2) Processing simpleimputer, total=   0.0s
[Pipeline] ...... (step 2 of 2) Processing minmaxscaler, total=   0.0s
[Pipeline] ..... (step 1 of 2) Processing simpleimputer, total=   0.0s
[Pipeline] ...... (step 2 of 2) Processing robustscaler, total=   0.0s
[Pipeline] ..... (step 1 of 2) Processing simpleimputer, total=   0.0s
[Pipeline] ..... (step 2 of 2) Processing onehotencoder, total=   0.0s


Unnamed: 0,pipeline-1__numeric_col_0,pipeline-1__numeric_col_1,pipeline-2__float_col_0,pipeline-2__float_col_1,pipeline-3__categorical_col_0_a,pipeline-3__categorical_col_0_b,pipeline-3__categorical_col_1_x,pipeline-3__categorical_col_1_y,pipeline-3__categorical_col_1_z
0,0.0,0.0,1.68097,-0.776176,1.0,0.0,0.0,1.0,0.0
1,1.0,0.8,1.297002,-0.340128,1.0,0.0,1.0,0.0,0.0
2,0.0,1.0,-0.435048,-0.128126,0.0,1.0,0.0,0.0,1.0
3,0.0,0.4,-0.520279,0.128126,0.0,1.0,1.0,0.0,0.0
4,0.0,0.2,-0.515867,1.068219,0.0,1.0,0.0,0.0,1.0
5,1.0,1.0,-0.178347,-0.604271,1.0,0.0,0.0,1.0,0.0
6,0.0,1.0,0.437585,0.294273,1.0,0.0,0.0,1.0,0.0
7,0.0,0.0,0.178347,0.517595,0.0,1.0,0.0,1.0,0.0
8,0.0,0.6,-0.214695,-1.041955,1.0,0.0,1.0,0.0,0.0
9,1.0,0.4,0.680859,0.560815,1.0,0.0,0.0,0.0,1.0


In [14]:
# Finally, chain the preprocessing and an estimator into a single Pipeline,
# so raw features go in and predictions come out.
model = make_pipeline(preprocess, LogisticRegression())

In [15]:
# Render the full pipeline (preprocessing + classifier) as the cell output.
model

In [16]:
# Fit the preprocessing steps and the classifier together on (X, y). The
# inner pipelines log their steps because they were built with verbose=True.
model.fit(X, y)

[Pipeline] ..... (step 1 of 2) Processing simpleimputer, total=   0.0s
[Pipeline] ...... (step 2 of 2) Processing minmaxscaler, total=   0.0s
[Pipeline] ..... (step 1 of 2) Processing simpleimputer, total=   0.0s
[Pipeline] ...... (step 2 of 2) Processing robustscaler, total=   0.0s
[Pipeline] ..... (step 1 of 2) Processing simpleimputer, total=   0.0s
[Pipeline] ..... (step 2 of 2) Processing onehotencoder, total=   0.0s


In [17]:
# Predict on the same rows the model was fitted on: these are in-sample
# predictions, not an estimate of performance on unseen data.
model.predict(X)

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0])

In [18]:
# True labels as a NumPy array, for comparison with the predictions.
y.to_numpy()

array([1, 1, 1, 0, 1, 0, 0, 0, 0, 0])