In [2]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

data = {
    'size': ['S', 'M', np.nan, 'XL', 'XL'],
    'color': ['red', 'blue', 'blue', 'black', np.nan],
    'price': [2100, np.nan, 4500, 7300, 3200],
    'quantity': [np.nan, 350, np.nan, 200, 10]
}

df = pd.DataFrame(data)

In [5]:
# 數值型資料管道器
numeric_transformer = ColumnTransformer(
    transformers=[
        ('num',SimpleImputer(strategy='median'),  # 用中位數填補缺失值
         ['price', 'quantity'])  # 選擇要處理的數值型資料欄位
    ]
)

# 類別型資料管道器
categorical_transformer = ColumnTransformer(
    transformers=[
        ('cat', 
         SimpleImputer(strategy='most_frequent'),  # 用眾數填補缺失值
         ['size', 'color']),  # 選擇要處理的類別型資料欄位
        ('onehot',
         OneHotEncoder(),  # 獨熱編碼
         ['size', 'color'])  # 選擇要進行獨熱編碼的欄位
    ]
)

# 合併兩個管道器
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, ['price', 'quantity']),
        ('cat', categorical_transformer, ['size', 'color'])
    ]
)

# 擬合並轉換資料
transformed_data = preprocessor.fit_transform(df)
pd.DataFrame(transformed_data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,2100.0,200.0,S,red,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3850.0,350.0,M,blue,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,4500.0,200.0,XL,blue,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,7300.0,200.0,XL,black,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,3200.0,10.0,XL,blue,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
