In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

mpg_df = sns.load_dataset('mpg')

In [6]:
pipe_horsepower = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

pipe_horsepower

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [9]:
pipe_horsepower.fit(mpg_df[['horsepower']])
pipe_horsepower


0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [12]:
pipe_horsepower['scaler'].mean_

array([104.3040201])

In [14]:
pipe_horsepower['scaler'].n_samples_seen_

np.int64(398)

In [17]:
transformer_numeric = ColumnTransformer(
transformers = [
 ('with_nan', pipe_horsepower, ["horsepower"]),
     ('without_nan', StandardScaler(), ["mpg", "displacement","weight", "acceleration"]),
], # el resto por default se va a drop
   # remainder="passthrough"
)
transformer_numeric

0,1,2
,transformers,"[('with_nan', ...), ('without_nan', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [18]:
transformer_numeric.fit_transform(mpg_df)

array([[ 0.67311762, -0.7064387 ,  1.0906037 ,  0.63086987, -1.29549834],
       [ 1.58995818, -1.09075062,  1.5035143 ,  0.85433297, -1.47703779],
       [ 1.19702651, -0.7064387 ,  1.19623199,  0.55047045, -1.65857724],
       ...,
       [-0.53187283,  1.08701694, -0.56103873, -0.79858454, -1.4407299 ],
       [-0.66285006,  0.57460104, -0.70507731, -0.40841088,  1.10082237],
       [-0.58426372,  0.95891297, -0.71467988, -0.29608816,  1.39128549]],
      shape=(398, 5))

In [21]:
# vamos a crear otro mas grande
preprocessor =ColumnTransformer(
transformers = [
     ('numeric_with_nan', pipe_horsepower, ["horsepower"]),
     ('numeric_without_nan', StandardScaler(), ["mpg", "displacement","weight", "acceleration"]),
     ('categorical_ordinal', OrdinalEncoder(categories="auto"), ["cylinders", "model_year"]),
     ('categorical_nominal', OneHotEncoder(categories="auto", handle_unknown='error'), ["origin", "name"]),
],
remainder='drop'
)
preprocessor

0,1,2
,transformers,"[('numeric_with_nan', ...), ('numeric_without_nan', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [23]:
preprocessor.fit_transform(mpg_df).toarray()

array([[ 0.67311762, -0.7064387 ,  1.0906037 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.58995818, -1.09075062,  1.5035143 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.19702651, -0.7064387 ,  1.19623199, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.53187283,  1.08701694, -0.56103873, ...,  0.        ,
         0.        ,  0.        ],
       [-0.66285006,  0.57460104, -0.70507731, ...,  0.        ,
         0.        ,  0.        ],
       [-0.58426372,  0.95891297, -0.71467988, ...,  0.        ,
         0.        ,  0.        ]], shape=(398, 315))

In [None]:
pipe = Pipeline(steps=[
    ("preprocessor",preprocessor)
])