# Finding extreme values for imputation

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from feature_engine.imputation import EndTailImputer

# Load the credit approval data set.
data = pd.read_csv('../data/credit_approval_uci.csv')

# Candidate variables for imputation: every non-object column except the target.
numeric_vars = [
    var
    for var in data.select_dtypes(exclude='O').columns
    if var != 'target'
]
print(numeric_vars)

# Hold out 30% of the rows; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data[numeric_vars], data['target'], test_size=0.3, random_state=0
)

# Inter-quartile range per variable, estimated on the training rows only.
upper_quartile = X_train.quantile(0.75)
lower_quartile = X_train.quantile(0.25)
IQR = upper_quartile - lower_quartile
print(IQR)

# Replacement value per variable: 75th percentile + 1.5 * IQR (right tail).
imputation_dict = (upper_quartile + 1.5 * IQR).to_dict()
print(imputation_dict)

# Fill both splits with the values learned from the training set.
X_train, X_test = (
    split.fillna(value=imputation_dict) for split in (X_train, X_test)
)

# Confirm that no missing values remain (prints False for each split).
for split in (X_train, X_test):
    print(split.isnull().any().any())

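# Repeat the imputation with feature-engine's EndTailImputer, here at the right
# tail with fold=3. We can also replace missing data with values at the left
# tail of the distribution by setting tail='left'.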
# Start again from a fresh split so the data still contains missing values.
X_train, X_test, y_train, y_test = train_test_split(
    data[numeric_vars], data['target'], test_size=0.3, random_state=0
)

# IQR-based end-tail imputer at the right tail with fold=3.
# NOTE(review): variables=None presumably lets the imputer pick the variables
# itself -- confirm against the feature-engine documentation.
imputer = EndTailImputer(
    imputation_method='iqr', tail='right', fold=3, variables=None
)

# Learn the replacement values from the training set only.
imputer.fit(X_train)

# Replacement value stored for each variable.
print(imputer.imputer_dict_)

# Apply the learned values to both splits.
X_train, X_test = (imputer.transform(split) for split in (X_train, X_test))

# Confirm that no missing values remain (prints False for each split).
for split in (X_train, X_test):
    print(split.isnull().any().any())

['A2', 'A3', 'A8', 'A11', 'A14', 'A15']
A2      16.420
A3       7.470
A8       2.835
A11      3.000
A14    212.000
A15    450.000
dtype: float64
{'A2': 63.550000000000004, 'A3': 19.675000000000004, 'A8': 7.2524999999999995, 'A11': 7.5, 'A14': 590.0, 'A15': 1125.0}
False
False
{'A2': 88.18, 'A3': 30.880000000000003, 'A8': 11.504999999999999, 'A11': 12.0, 'A14': 908.0, 'A15': 1800.0}
False
False
