In [168]:
import pandas as pd
import numpy as np

data = { 'size': ['S','M',np.nan,'XL','XL'], 
        'color': ['red', 'blue', 'blue', 'black', np.nan], 
        'price': [2100, np.nan, 4500, 7300, 3200], 
        'quantity': [np.nan, 350, np.nan, 200, 10] }
X = pd.DataFrame(data) 
X_orig = X.copy()
X_orig.style.highlight_null(color='yellow')

Unnamed: 0,size,color,price,quantity
0,S,red,2100.0,
1,M,blue,,350.0
2,,blue,4500.0,
3,XL,black,7300.0,200.0
4,XL,,3200.0,10.0


In [169]:
# Names of the numeric columns, and the sub-frame holding only those columns.
X_col_num = ['price', 'quantity']
X_num = X.loc[:, X_col_num]

# Display the numeric sub-frame with missing cells highlighted.
X_num.style.highlight_null(color='yellow')

Unnamed: 0,price,quantity
0,2100.0,
1,,350.0
2,4500.0,
3,7300.0,200.0
4,3200.0,10.0


In [170]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
# Numeric pipeline: fill missing values with the column median, then scale.
num_pl = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

# make_pipeline names each step after its lowercased class name, so the scaler
# step is addressed as 'standardscaler'. Swap it for a MinMaxScaler; the step
# keeps the key 'standardscaler' even though it now holds a MinMaxScaler.
num_pl.set_params(standardscaler=MinMaxScaler())

# Fit on the numeric columns and show the imputed, scaled array.
num_pl.fit_transform(X_num)

array([[0.        , 0.55882353],
       [0.33653846, 1.        ],
       [0.46153846, 0.55882353],
       [1.        , 0.55882353],
       [0.21153846, 0.        ]])

In [171]:
# Names of the categorical columns, and the sub-frame holding only those columns.
X_col_cat = ['size', 'color']
X_cat = X.loc[:, X_col_cat]

# Display the categorical sub-frame with missing cells highlighted.
X_cat.style.highlight_null(color='yellow')

Unnamed: 0,size,color
0,S,red
1,M,blue
2,,blue
3,XL,black
4,XL,


In [172]:
from sklearn.preprocessing import OneHotEncoder

# Categorical pipeline: fill missing values with the most frequent category,
# then one-hot encode. sparse_output=False requests a dense array.
cat_pl = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse_output=False),
)

# Fit on the categorical columns and show the encoded array.
cat_pl.fit_transform(X_cat)


array([[0., 1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 1., 0.],
       [0., 0., 1., 1., 0., 0.],
       [0., 0., 1., 0., 1., 0.]])

In [173]:
from sklearn.compose import ColumnTransformer
# Route each group of columns to its own preprocessing step and concatenate
# the results: numeric columns first, then the one-hot encoded columns.
# NOTE(review): the 'num_pl' entry is a plain mean SimpleImputer, not the
# num_pl pipeline built earlier (median imputation + scaling) — confirm that
# leaving the numeric columns unscaled here is intended.
data_pl = ColumnTransformer(
    transformers=[
        ('num_pl', SimpleImputer(strategy='mean'), X_col_num),
        ('cat_pl', cat_pl, X_col_cat),
    ]
)

# Fit on the full frame and show the combined output (columns are unnamed).
pd.DataFrame(data_pl.fit_transform(X))

Unnamed: 0,0,1,2,3,4,5,6,7
0,2100.0,186.666667,0.0,1.0,0.0,0.0,0.0,1.0
1,4275.0,350.0,1.0,0.0,0.0,0.0,1.0,0.0
2,4500.0,186.666667,0.0,0.0,1.0,0.0,1.0,0.0
3,7300.0,200.0,0.0,0.0,1.0,1.0,0.0,0.0
4,3200.0,10.0,0.0,0.0,1.0,0.0,1.0,0.0


In [174]:
# Step 1: retrieve the fitted 'cat_pl' pipeline from the ColumnTransformer.
data_pl.named_transformers_['cat_pl']

In [175]:
# Step 2: ask the one-hot encoder inside 'cat_pl' for its output column names.
# Called without arguments, so the names use the encoder's default prefixes.
(
    data_pl.named_transformers_['cat_pl']
    .named_steps['onehotencoder']
    .get_feature_names_out()
)

array(['x0_M', 'x0_S', 'x0_XL', 'x1_black', 'x1_blue', 'x1_red'],
      dtype=object)

In [176]:
# How do you know the names of the transformers inside a pipeline?
# make_pipeline automatically uses each transformer's lowercased class name as its key.
# If you are still unsure, call named_steps.keys() to list every key.
data_pl.named_transformers_['cat_pl'].named_steps.keys()

dict_keys(['simpleimputer', 'onehotencoder'])

In [177]:
# Step 3: gather every output column name and build a labelled DataFrame.
# Passing X_col_cat makes the encoder prefix each one-hot column with its
# source column name (e.g. 'size_M', 'color_red').
X_col_cat_oh = (
    data_pl.named_transformers_['cat_pl']
    .named_steps['onehotencoder']
    .get_feature_names_out(X_col_cat)
)

# The order mirrors the ColumnTransformer definition: numeric columns first,
# followed by the one-hot encoded categorical columns.
columns = X_col_num + X_col_cat_oh.tolist()
print('整合後的欄位資料：', columns)

# data_pl was already fitted in an earlier cell, so use transform() instead of
# fit_transform(): refitting is redundant and would replace the fitted encoder
# from which the column names above were just read.
pd.DataFrame(data_pl.transform(X), columns=columns)


整合後的欄位資料： ['price', 'quantity', 'size_M', 'size_S', 'size_XL', 'color_black', 'color_blue', 'color_red']


Unnamed: 0,price,quantity,size_M,size_S,size_XL,color_black,color_blue,color_red
0,2100.0,186.666667,0.0,1.0,0.0,0.0,0.0,1.0
1,4275.0,350.0,1.0,0.0,0.0,0.0,1.0,0.0
2,4500.0,186.666667,0.0,0.0,1.0,0.0,1.0,0.0
3,7300.0,200.0,0.0,0.0,1.0,1.0,0.0,0.0
4,3200.0,10.0,0.0,0.0,1.0,0.0,1.0,0.0
