# Marking imputed values

# In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    AddMissingIndicator,
    CategoricalImputer,
    MeanMedianImputer,
)

# Load the credit-approval dataset; the path is relative to the working
# directory the script/notebook is run from.
data = pd.read_csv('../data/credit_approval_uci.csv')

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='target'),
    data['target'],
    test_size=0.3,
    random_state=37,
)

# Columns that receive a companion binary missing-value flag.
varnames = ['A1', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8']

# Each flag column is named after its source column plus the "_na" suffix.
indicators = [var + '_na' for var in varnames]

# 1 where the source value is missing, 0 otherwise.
X_train[indicators] = X_train[varnames].isna().astype(int)
X_test[indicators] = X_test[varnames].isna().astype(int)

# Preview five random rows of the augmented training set (notebook display).
X_train.sample(5)

# using feature-engine
# Start again from a fresh split so this section works on the raw columns.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('target', axis='columns'), data['target'],
    test_size=.3, random_state=37
)

# variables=None lets the transformer pick the variables itself;
# missing_only=True restricts the indicators to variables that show missing
# values in the data seen during fit (see the feature-engine docs).
imputer = AddMissingIndicator(
    variables=None,
    missing_only=True,
)
imputer.fit(X_train)
# Variables for which an indicator will be added (notebook display).
imputer.variables_

# Apply the indicators learned from the train set to both splits.
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

# The pipeline first appends, for every field that has missing values, an
# indicator column named <original field>_na; it then imputes categorical
# variables with the most frequent category and numerical variables with
# the mean.
pipe = Pipeline([
    ('ind', AddMissingIndicator(missing_only=True)),
    ('cat', CategoricalImputer(imputation_method='frequent')),
    ('num', MeanMedianImputer(imputation_method='mean'))
])

# NOTE(review): X_train / X_test already carry the `_na` columns added by the
# stand-alone imputer above; presumably the pipeline's indicator step simply
# recomputes them -- confirm, or re-split before this point.
X_train = pipe.fit_transform(X_train)
# Only transform the test set: the indicator selection, modes and means must
# come from the training data. Refitting on X_test would leak test statistics
# and could produce a different set of indicator columns than the train set.
X_test = pipe.transform(X_test)

# using scikit-learn
# Start again from a fresh split so this section works on the raw columns.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('target', axis='columns'), data['target'],
    test_size=.3, random_state=37
)

# Split the column names by dtype: object columns are treated as categorical,
# everything else as numerical.
num_vars = X_train.select_dtypes(exclude='O').columns.to_list()
cat_vars = X_train.select_dtypes(include='O').columns.to_list()

# Mean-impute the numerical columns and mode-impute the categorical ones;
# add_indicator=True also appends missing-value indicator columns to each
# imputer's output.
pipe = ColumnTransformer(
    [
        ('num_imputer',
         SimpleImputer(strategy='mean',
                       add_indicator=True),
         num_vars),
        ('cat_imputer',
         SimpleImputer(strategy='most_frequent',
                       add_indicator=True),
         cat_vars)]
)
X_train = pipe.fit_transform(X_train)
# Only transform the test set: imputation statistics and the set of indicator
# columns must be learned from the training data. Refitting on X_test would
# leak test statistics and could change the number of output columns.
X_test = pipe.transform(X_test)
# Show the fitted transformer (notebook display).
pipe