In [20]:
import pandas as pd
import numpy as np
from scipy.stats import randint, uniform, geom, expon
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    MinMaxScaler,
    RobustScaler,
    OneHotEncoder,
    PowerTransformer
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

from sklearn import set_config
# Ask scikit-learn transformers to return pandas DataFrames instead of NumPy
# arrays, so column names survive each preprocessing step.
set_config(transform_output="pandas")

In [3]:
# Seed NumPy's global RNG. Both the scipy.stats `.rvs()` draws and
# `np.random.choice` below pull from it, which makes the toy data reproducible.
np.random.seed(42)

N_ROWS = 10  # number of synthetic rows to generate

# NOTE: the columns are drawn in this exact order; reordering the entries
# would change the generated values for the same seed.
data = {
    # integers in {0, 1} (`high` is exclusive)
    'numeric_col_0': randint(low=0, high=2).rvs(size=N_ROWS),
    # integers in 1..9
    'numeric_col_1': randint(low=1, high=10).rvs(size=N_ROWS),
    # floats in [0, 1]
    'float_col_0': uniform(loc=0, scale=1).rvs(size=N_ROWS),
    # floats in [0, 10]
    'float_col_1': uniform(loc=0, scale=10).rvs(size=N_ROWS),
    # two equally likely categories
    'categorical_col_0': np.random.choice(['a', 'b'], size=N_ROWS, p=[0.5, 0.5]),
    # three categories with unequal probabilities
    'categorical_col_1': np.random.choice(['x', 'y', 'z'], size=N_ROWS, p=[0.3, 0.3, 0.4]),
    # uniform(loc, scale) spans [loc, loc + scale], i.e. [1, 101] here
    'target': uniform(loc=1, scale=100).rvs(size=N_ROWS),
}

In [4]:
# Assemble the synthetic columns (features + target) into a single DataFrame.
df_raw = pd.DataFrame(data=data)

In [5]:
# Split the raw frame into the regression target and the feature matrix.
y = df_raw['target']
X = df_raw.drop(columns=['target'])

In [6]:
# Select the columns of each dtype so every group gets its own preprocessing.
int_cols = X.select_dtypes(include=['int']).columns
float_cols = X.select_dtypes(include=['float']).columns
# Text columns are stored as `object` on older pandas but are inferred as the
# dedicated string dtype on pandas >= 3. Selecting both dtypes keeps the
# categorical columns from silently falling through to the ColumnTransformer's
# `remainder='passthrough'` as raw strings.
cat_cols = X.select_dtypes(include=['object', 'string']).columns

In [7]:
# Build a dedicated pipeline for each column type
# (this could also be done with `make_pipeline`).
# `verbose=True` makes each pipeline print a timing line per step while fitting.

# Integer columns: fill missing values with the mean, then min-max scale.
pipe_num = Pipeline(
    steps=[
        ('int_imp', SimpleImputer(strategy='mean')),
        ('num_prep', MinMaxScaler()),
    ],
    verbose=True,
)

# Float columns: fill missing values with the median, then apply RobustScaler.
pipe_float = Pipeline(
    steps=[
        ('float_imp', SimpleImputer(strategy='median')),
        ('float_prep', RobustScaler()),
    ],
    verbose=True,
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time, and the output is dense (`sparse_output=False`).
pipe_cat = Pipeline(
    steps=[
        ('cat_imp', SimpleImputer(strategy='most_frequent')),
        ('cat_prep', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ],
    verbose=True,
)

In [8]:
# Route each group of columns to its own pipeline. Columns that belong to no
# group are passed through unchanged (`remainder='passthrough'`).
preprocess = ColumnTransformer(
    transformers=[
        ('prep_num', pipe_num, int_cols),
        ('prep_float', pipe_float, float_cols),
        ('prep_cat', pipe_cat, cat_cols),
    ],
    remainder='passthrough',
)

In [9]:
# Display the preprocessing transformer to inspect its structure.
preprocess

In [21]:
# Chain the column preprocessing with the estimator so both are fitted and
# applied together.
pipe_model = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', LinearRegression()),
    ]
)

In [22]:
# Wrap the pipeline so the target is Box-Cox transformed before fitting and
# predictions are mapped back to the original scale.
# Box-Cox is only defined for strictly positive target values.
model = TransformedTargetRegressor(
    regressor=pipe_model,
    transformer=PowerTransformer(method='box-cox'),
)

In [23]:
# Display the full estimator (target transformer + preprocessing + regressor).
model

In [25]:
# Fit the target transformer, the preprocessing, and the regressor in one call.
# The "[Pipeline] ..." lines printed below come from the sub-pipelines built
# with verbose=True.
model.fit(X, y)

[Pipeline] ........... (step 1 of 2) Processing int_imp, total=   0.0s
[Pipeline] .......... (step 2 of 2) Processing num_prep, total=   0.0s
[Pipeline] ......... (step 1 of 2) Processing float_imp, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing float_prep, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing cat_imp, total=   0.0s
[Pipeline] .......... (step 2 of 2) Processing cat_prep, total=   0.0s




In [26]:
# Predict on the same rows the model was fitted on. These are in-sample
# predictions and say nothing about performance on unseen data.
model.predict(X)

array([ 59.91274421,  87.58035036, 103.77994564,  51.41477682,
        57.53253837,   3.57702933,  21.7322767 ,  22.22797385,
        47.00203771,  59.97971031])

In [27]:
# Show the true target values for a side-by-side look at the predictions.
y.to_numpy()

array([78.51328234, 94.94989416, 90.48273504, 60.78999788, 93.1874235 ,
        9.84925021, 20.59828624,  5.52272889, 33.53303308, 39.86772897])