In [1]:
import pandas as pd
# Load `nyc_hs_sat.csv`; the first CSV column becomes the row index.
sat_df = pd.read_csv('./nyc_hs_sat.csv', index_col=0)

In [5]:
# Boolean mask with one entry per column of sat_df:
# True where the column holds at least one null value.
null_cols_idcs = sat_df.isna().any(axis=0).to_numpy()

In [13]:
# Keep only the columns flagged by the boolean mask, then show their names.
null_cols_df = sat_df.loc[:, null_cols_idcs]
null_cols_df.columns

Index(['num_test_takers', 'reading_avg', 'math_avg', 'writing_score',
       'graduation_rate', 'college_career_rate'],
      dtype='object')

In [64]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.compose import ColumnTransformer

In [20]:
# Subset of the null-containing columns that the imputation and
# missing-indicator cells operate on.
X_null = null_cols_df[
    [
        'num_test_takers',
        'graduation_rate',
        'college_career_rate',
    ]
]

In [37]:
# One-step pipeline that fills missing values using SimpleImputer's default settings.
pipe_impute = Pipeline(steps=[('imputer', SimpleImputer())])

# Fit and apply the pipeline defined just above. No object named `pipe` is
# defined in this notebook, so calling `pipe.fit_transform` raises NameError
# on a fresh kernel; `pipe_impute` is the pipeline this cell builds.
imputed_np = pipe_impute.fit_transform(X_null)

# Wrap the transformed values in a DataFrame, re-attaching the input column names.
imputed_df = pd.DataFrame(imputed_np, columns=X_null.columns)

In [39]:
# One-step pipeline wrapping MissingIndicator with its default settings;
# it flags missing entries instead of filling them.
pipe_is_missing = Pipeline(steps=[("missing", MissingIndicator())])

In [60]:
# Label the combined output: the three value columns first, then one
# `<column>_is_null` indicator column per column of X_null.
# NOTE(review): `combined_missing_df` is assigned further down, in the cell that
# calls `union.fit_transform(X_null)`; that cell must run before this one.
combined_df = pd.DataFrame(
    combined_missing_df,
    columns=[
        'num_test_takers', 'graduation_rate', 'college_career_rate',
        *(f'{col}_is_null' for col in X_null.columns),
    ],
)

### Naming Columns

In [113]:
# Imported in this cell because `np.nan` is used below; the notebook's other
# `import numpy as np` sits in the following cell, so a top-to-bottom run
# would otherwise raise NameError here.
import numpy as np

# Mean-impute only `num_test_takers`; all remaining columns are passed
# through untouched (remainder='passthrough').
impute = ColumnTransformer(
    [
        ("numerical", SimpleImputer(missing_values=np.nan, strategy="mean"), ['num_test_takers']),
    ],
    remainder='passthrough',
)

In [114]:
import numpy as np
# Wrap the column-wise imputer in a one-step pipeline.
# NOTE(review): this rebinds `pipe_impute`, which an earlier cell assigned to a
# plain SimpleImputer pipeline and which the FeatureUnion cell also reads;
# confirm which definition that union is meant to use.
pipe_impute = Pipeline(steps=[("impute", impute)])

In [115]:
# Fit the column-wise imputation pipeline on the full frame and display the result.
# The output recorded below is an object-dtype array; the imputed column appears
# first, followed by the passed-through columns.
pipe_impute.fit_transform(sat_df)

array([[29.0, '01M292', 'HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES',
        ..., 0.66, 0.87, 0.36],
       [91.0, '01M448', 'UNIVERSITY NEIGHBORHOOD HIGH SCHOOL', ..., 0.9,
        0.93, 0.7],
       [70.0, '01M450', 'EAST SIDE COMMUNITY SCHOOL', ..., 0.92, 0.94,
        0.77],
       ...,
       [67.0, '32K552', 'ACADEMY OF URBAN PLANNING', ..., 0.79, 0.89,
        0.34],
       [39.0, '32K554', 'ALL CITY LEADERSHIP SECONDARY SCHOOL', ...,
        0.97, 0.93, 0.73],
       [23.0, '32K556',
        'BUSHWICK LEADERS HIGH SCHOOL FOR ACADEMIC EXCELLENCE', ...,
        0.73, 0.81, 0.42]], dtype=object)

In [47]:
# Fit the missing-indicator pipeline on X_null and keep the transformed result.
is_missing_np = pipe_is_missing.fit_transform(X_null)

### Combining Features

In [53]:
# Combine two branches applied to the same input: imputed values first,
# then the missing-value indicators.
# NOTE(review): `pipe_impute` is assigned in two earlier cells (a plain
# SimpleImputer pipeline and a ColumnTransformer pipeline); run top-to-bottom,
# the later assignment is the one picked up here. Confirm which is intended.
union = FeatureUnion(
    transformer_list=[
        ("impute", pipe_impute),
        ("add_missing", pipe_is_missing),
    ]
)

In [58]:
# Run both branches of the union on X_null and collect their combined output.
# Despite the `_df` suffix, the preview below shows a NumPy array:
# three value columns followed by three indicator columns.
combined_missing_df = union.fit_transform(X_null)
combined_missing_df[:2]  # peek at the first two rows

array([[29.  ,  0.66,  0.36,  0.  ,  0.  ,  0.  ],
       [91.  ,  0.9 ,  0.7 ,  0.  ,  0.  ,  0.  ]])

In [86]:
# Per-column flag: does the column contain at least one null value?
sat_df.isna().any(axis=0)

dbn                    False
name                   False
num_test_takers         True
reading_avg             True
math_avg                True
writing_score           True
boro                   False
total_students         False
graduation_rate         True
attendance_rate        False
college_career_rate     True
dtype: bool

### Resources

[Column Transformer](https://machinelearningmastery.com/automate-machine-learning-workflows-pipelines-python-scikit-learn/)