This notebook runs the model on the preprocessed data. The goal is to predict whether a patient will survive their stay.

In [None]:
import sys
sys.path.append('../')
import pandas as pd

# Reload data

In [None]:
# Lab measurements: the first two columns form the index and the first two
# rows form the column header
labs = pd.read_csv(
    'data/labs_1_day.csv',
    header=[0, 1],
    index_col=[0, 1],
)
# Patient outcomes, indexed by the first column of the file
outcomes = pd.read_csv('data/outcomes_1_day.csv', index_col=0)

In [None]:
# Binarise the outcome: True when the recorded 'Death' value is strictly below 8
# NOTE(review): the unit of 'Death' is not visible here (presumably days) - confirm
outcomes['Death'] = outcomes['Death'].lt(8)

In [None]:
# Boolean group memberships per patient (passed to the imputation step)
# Explicit copy so the assignments below never write to a slice of `outcomes`
groups = outcomes[['ETHNICITY', 'GENDER', 'INSURANCE']].copy()
groups['ETHNICITY'] = outcomes['ETHNICITY'].str.contains('BLACK')
# The gender indicator must be derived from GENDER, not from ETHNICITY
groups['GENDER'] = (outcomes['GENDER'] == 'M')
groups['INSURANCE'] = (outcomes['INSURANCE'] == 'Private')

# Split 

In [None]:
# Prefix of the results location; the imputation name is appended to it
# directly (no separator) when experiments are created
results = "results/classification"

In [None]:
# Boolean mask aligned on `outcomes`: True for the patients drawn by the
# 80% sample (fixed seed 0), False for the remaining ones
training = pd.Series(
    outcomes.index.isin(outcomes.sample(frac=0.8, random_state=0).index),
    index=outcomes.index,
)

In [None]:
# Report the size of the cohort and of the training split
print('Total patients: {}'.format(training.size))
print('Training patients: {}'.format(training.sum()))

# Imputation

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
import numpy as np

def imputation(train_index, data, groups, strategy = 'Median', add_count = False, add_group = False, max_iter = 10):
    """
        Reduces data to one row per patient and prepares its imputation

        Only the MICE strategies fill the returned frame themselves; the
        other strategies return the value that the caller passes to `fillna`.

        Args:
            train_index: Patient identifiers used to learn the imputation
            data: Observations whose index has a 'Patient' level
            groups: Patient-indexed DataFrame of group memberships, only used
                when `add_group` is set or when `strategy` contains 'Group'
            strategy: 'LOCF', 'Individual', 'Median', 'Mean', 'Group Median',
                'Group Mean' or any name containing 'MICE'
            add_count: Append one '_count' column per input column, True
                where the value is missing after taking the last observation
            add_group: Append the group columns (suffix '_group')
            max_iter: Number of MICE passes over the incomplete columns

        Returns:
            (imputed, impute): frame with one row per patient and the value
            to give to `fillna`; -1 is used as sentinel for LOCF and MICE

        Raises:
            ValueError: if `strategy` is not one of the supported names
    """
    # Last non-missing observation of each column for each patient
    imputed = data.add_suffix('_data').groupby('Patient').last()

    if add_count:
        # Add indicator of missing test
        imputed = pd.concat([imputed, imputed.isna().add_suffix('_count')], axis = 1)

    if add_group:
        # Add group
        imputed = imputed.join(groups.add_suffix('_group'))

    # Columns present before the regression-only group columns are appended
    n_features = imputed.shape[1]
    if 'Group' in strategy:
        # Add group before splitting, only for imputation
        imputed = imputed.join(groups.add_suffix('_group_reg'))

    # Data to use to learn imputation
    train_data = imputed.loc[imputed.index.get_level_values('Patient').isin(train_index)]
    train_index = train_data.index

    # Compute fill value
    # NOTE(review): `imputed` already holds a single row per patient, so the
    # per-patient ffill (LOCF) and median (Individual) cannot fill anything
    # here - confirm this is intended
    if strategy == 'LOCF':
        imputed = imputed.groupby('Patient').ffill()
        impute = -1

    elif strategy == 'Individual':
        impute = imputed.groupby('Patient').median()

    elif strategy == 'Median':
        impute = train_data.median()

    elif strategy == 'Mean':
        impute = train_data.mean()

    # NOTE(review): the two group strategies pass a DataFrame as grouping key
    # and keep the '_group_reg' columns in the output; neither is exercised by
    # the configuration visible in this notebook - verify before enabling
    elif strategy == 'Group Median':
        impute = train_data.groupby(groups).transform(lambda x: x.fillna(x.median()))

    elif strategy == 'Group Mean':
        impute = train_data.groupby(groups).transform(lambda x: x.fillna(x.mean()))

    elif 'MICE' in strategy:
        # Sentinel: any value still missing afterwards is filled with -1
        impute = -1

        # MICE Algorithm
        ## 1. Init with median imputation learnt on the training patients
        missing = imputed.isna()
        imputed = pd.DataFrame(SimpleImputer(strategy = "median").fit(train_data.values).transform(imputed.values), index = imputed.index, columns = imputed.columns)

        ## 2. Iterate through columns
        ### Find columns with missing values (start with the one with least)
        to_impute = missing.sum().sort_values()
        to_impute = to_impute[to_impute > 0]

        ### Impute one by one with regression for max_iter passes
        for _ in range(max_iter):
            for c in to_impute.index:
                others = imputed.columns != c
                to_fill = missing[c]

                #### Take train points for which c is observed to train model
                observed_train = imputed.loc[train_index][~missing.loc[train_index][c]]

                #### Fit regression
                lr = LinearRegression().fit(observed_train.loc[:, others].values, observed_train[c].values)
                # NOTE(review): the noise scale is the std of the *absolute*
                # residuals, which is never larger than the std of the signed
                # residuals - confirm this is the intended scale
                residuals = np.abs(lr.predict(observed_train.loc[:, others].values) - observed_train[c])

                #### Draw with normal error
                # Single .loc write: chained indexing (imputed[c][mask] = ...)
                # may assign to a temporary copy and leave `imputed` unchanged
                imputed.loc[to_fill, c] = lr.predict(imputed.loc[to_fill, others].values) + np.random.normal(scale = np.std(residuals), size = to_fill.sum())

        # Remove every group column that was only added for the regression
        # (no-op when the strategy does not contain 'Group')
        imputed = imputed.iloc[:, :n_features]

    else:
        raise ValueError('Unknown imputation strategy: {}'.format(strategy))

    return imputed, impute


def process(train_index, data, groups, **args):
    """
        Preprocesses data

        Delegates to `imputation` (all extra keyword arguments are forwarded)
        and fills the remaining missing values of the returned frame with the
        fill value returned alongside it
    """
    frame, fill_value = imputation(train_index, data, groups, **args)
    return frame.fillna(fill_value)

In [None]:
from utils import Experiment

In [None]:
# Hyperparameter grid handed to `Experiment.create` as `hyper_grid`:
# each key maps to the list of values listed for it
hyperparams = dict(
    penalty=['l2'],
    C=[0.01, 0.1, 1.0, 10],
    solver=['sag'],
    max_iter=[1000],
    n_jobs=[-1],
)

In [None]:
# Imputation configurations to evaluate, keyed by display name.
# 'n_iter' is the number of repeated runs; the other entries are forwarded
# as keyword arguments to `process`.
imputations = {}
#imputations['Median'] = {'strategy': 'Median'}
imputations['Median Missing'] = dict(strategy='Median', add_count=True)
#imputations['MICE'] = {'strategy': 'MICE', 'n_iter': 10}
imputations['MICE Missing'] = dict(strategy='MICE', n_iter=10, add_count=True)
#imputations['Group MICE'] = {'strategy': 'Group MICE', 'n_iter': 10}
#imputations['Group MICE Missing'] = {'strategy': 'Group MICE', 'n_iter': 10, 'add_count': True}
#imputations['Individual'] = {'strategy': 'Individual'}
#imputations['LOCF'] = {'strategy': 'LOCF'}
#imputations['LOCF Count'] = {'strategy': 'LOCF', 'add_count': True}
#imputations['LOCF Group'] = {'strategy': 'LOCF', 'add_count': True, 'add_group': True}

In [None]:
# Train one model per imputation strategy; strategies with 'n_iter' > 1 are
# run several times and their predictions averaged
for name, params in imputations.items():
    print('Imputation strategy: ', name)
    # Work on a copy: popping from the shared dict would remove 'n_iter' from
    # `imputations`, so re-running this cell would silently do a single run
    params = dict(params)
    n_iter = params.pop('n_iter', 1)

    predictions = []
    for _ in range(n_iter):
        last = process(training[training].index, labs, groups, **params)
        # -1 is the fill value of the LOCF / MICE strategies: finding it
        # means a value was left missing by the imputation itself
        assert (last == -1).sum().sum() == 0, "Non imputed values"

        se = Experiment.create(model = 'log', hyper_grid = hyperparams, save = False, path = results + name)
        pred = se.train(last, outcomes.Death, training)
        # NOTE(review): presumably `train` returns None when previous results
        # are reloaded; the averaging below is then skipped - confirm
        if pred is None: break # Reload previous copy
        predictions.append(pred)
    else:
        # Average Multiple imputations models
        used = predictions[-1].Use
        predictions = pd.concat([p[1] for p in predictions], axis = 1)
        predictions = pd.concat({'Mean': predictions.mean(1), 'Std': predictions.std(1)}, axis = 1)
        se = Experiment.create(model = 'log', hyper_grid = hyperparams, path = results + name)
        se.save_results(predictions, used)