In [8]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import numpy as np



# Toy training frame: two categorical columns (Sex, Housing) and two numeric
# ones (Age, Credit). Sex, Housing and Age each contain one missing entry.
#
# NOTE: for scikit-learn pipelines the missing-value marker should always be
# np.nan, so pd.NA entries are mapped to np.nan right after construction.
X_train = pd.DataFrame(
    data={
        "Sex": ["M", "F", "M", None],
        "Housing": ["own", "rent", None, "free"],
        "Age": [25, 40, None, 33],
        "Credit": [2000, 1500, 1200, 1800],
    }
).replace(to_replace={pd.NA: np.nan})

# Column groups routed to the two preprocessing branches below.
cat_cols = ["Sex", "Housing"]
num_cols = ["Age", "Credit"]

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array. handle_unknown="ignore" keeps transform from
# raising on categories that were not present during fit.
cat_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Numeric branch: fill gaps with the column mean, then standardize.
num_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Apply each branch to its own column group; the transformer names
# ("cat" / "num") become the prefixes of the generated feature names.
transf_obj = ColumnTransformer(
    transformers=[
        ("cat", cat_pipe, cat_cols),
        ("num", num_pipe, num_cols),
    ]
)

# --- fit_transform on the whole of X_train ---
# Fits every step of both branches on X_train and transforms it in one pass.
X_transformed = transf_obj.fit_transform(X_train)

# Names of the generated output columns (e.g. "cat__Sex_F", "num__Age").
feature_names = transf_obj.get_feature_names_out()

# Rebuild a DataFrame from the transformed array. The original index is
# reused so rows stay aligned with X_train (and with any target series that
# shares its index) even when that index is not the default 0..n-1 range.
X_transformed_df = pd.DataFrame(
    X_transformed,
    columns=feature_names,
    index=X_train.index,
)

print(X_transformed_df)


   cat__Sex_F  cat__Sex_M  cat__Housing_free  cat__Housing_own  \
0         0.0         1.0                0.0               1.0   
1         1.0         0.0                0.0               0.0   
2         0.0         1.0                1.0               0.0   
3         0.0         1.0                1.0               0.0   

   cat__Housing_rent  num__Age  num__Credit  
0                0.0 -1.444571     1.237179  
1                1.0  1.381763    -0.412393  
2                0.0  0.000000    -1.402136  
3                0.0  0.062807     0.577350  
