In [21]:
import tensorflow as tf
import pandas as pd
import pickle
import numpy as np
import time
import os
import scipy
import sklearn
import matplotlib.pyplot as plt
from matplotlib import cm

from tensorflow import keras 
from tensorflow.keras import layers, losses, backend, activations
from tensorflow.keras import backend as K
from tensorflow.python.framework.ops import disable_eager_execution

from sklearn.metrics import roc_auc_score, average_precision_score, f1_score

from custom_utils import validation_setup, MLP_model

# Session-wide configuration; these calls change global state for the rest of the notebook.

# Turn off TensorFlow eager execution.
disable_eager_execution()
# Set TensorFlow's native log-level environment variable.
# NOTE(review): this runs after `import tensorflow`; the variable may only be honoured
# when it is set before the import — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Let pandas display up to 5000 rows before truncating.
pd.set_option("display.max_rows", 5000)

---
### read data

In [22]:
# Input data folder and the folder that receives cached grids and predictions.
data_path, output_path = '../data/mimic', '../output/mimic'

In [23]:
# Feature table: missing values are replaced by 0.
with open(data_path + '/df_feature.pickle', 'rb') as feature_file:
    df_feature = pickle.load(feature_file)
df_feature = df_feature.fillna(0)

# Label table: missing values are replaced by 0 and the 'C_HUA' column is dropped.
# HUA is not predicted because most providers have no corresponding diagnosis records.
with open(data_path + '/df_label.pickle', 'rb') as label_file:
    df_label = pickle.load(label_file)
df_label = df_label.fillna(0).drop(columns='C_HUA')

---
### setups

In [24]:
# Provider identifiers: the unique values of the first index level of the feature table.
provider_ids = df_feature.index.get_level_values(0).unique()

# Cross-validation layout.
n_folds = 11
n_providers_per_fold = 5
n_providers = n_folds * n_providers_per_fold

# Model dimensions are taken from the column counts of the two tables.
n_inputs, n_labels = df_feature.shape[1], df_label.shape[1]
col_labels = df_label.columns

# Training settings (n_iters is passed to fit() as the number of epochs).
batch_size, n_iters = 100, 40
epsilon = 1e-7

---
### train test split, 2xk folds

In [25]:
# Build the per-fold index lists used for cross-validation.
# Later cells use valid_list[j] to index provider_ids for fold j's validation set, and
# train_list[j][0] / train_list[j][1] for the two training subsets of that fold.
# NOTE(review): validation_setup comes from custom_utils; its exact return structure is
# not visible here — confirm.
train_list, valid_list = validation_setup(n_folds, n_providers_per_fold)

## Grid Search - step 1

In [26]:
# Candidate hidden-layer widths.
layer_sel = [20, 30, 50, 100, 150, 200, 300, 400, 600]

# Every three-layer shape with strictly decreasing widths, largest shapes first.
layer_shape_list = [
    [layer_a, layer_b, layer_c]
    for layer_a in layer_sel
    for layer_b in layer_sel
    for layer_c in layer_sel
    if layer_a > layer_b > layer_c
]
layer_shape_list.reverse()
common_reg_list = [1e-3, 1e-4, 1e-5]

# One row per (regularisation, layer shape, fold) and one column per label.
# Cells stay NaN until the corresponding models have been evaluated.
_layer_keys = [str(layer_i) for layer_i in layer_shape_list]
_current_index = pd.MultiIndex.from_product(
    [common_reg_list, _layer_keys, range(n_folds)],
    names=['reg', 'layer', 'valid'],
)
roc_grid_first = pd.DataFrame(index=_current_index, columns=col_labels)

alg_name = 'MLP_grid_search'

total_iters = len(roc_grid_first)

# Merge in results saved by an earlier run so the training loop can skip finished cells.
_roc_first_file = output_path + '/%s/roc_grid_first.pickle' % (alg_name)
if os.path.exists(_roc_first_file):
    print('reading')
    with open(_roc_first_file, 'rb') as f:
        _saved_grid = pickle.load(f)
    roc_grid_first = pd.concat([roc_grid_first, _saved_grid]).groupby(roc_grid_first.index.names).max()

print('reading finish')

reading
reading finish


In [27]:
# Grid search, step 1.
# For every (layer shape, fold, regularisation) cell that has no saved result yet, train
# one model on each of the fold's two training subsets, score both on the fold's
# validation providers and store the mean ROC AUC per label. The predictions and the
# result grid are written to disk after every cell so an interrupted run can be resumed.
i_iter = 0

# Create the output folder up front; otherwise the first save would fail only after
# two models have already been trained.
os.makedirs(os.path.join(output_path, alg_name, 'preds_first'), exist_ok=True)

for layer_shape in layer_shape_list:

    for j_fold in range(n_folds):

        valid_pids = provider_ids[valid_list[j_fold]]
        valid_inputs = df_feature.loc[valid_pids]
        valid_labels = df_label.loc[valid_pids]

        for common_reg in common_reg_list:

            i_iter += 1
            grid_key = (common_reg, str(layer_shape), j_fold)

            # Skip cells already filled by a previous run ('C_ARR' serves as the marker column).
            if not pd.isna(roc_grid_first.loc[grid_key, 'C_ARR']):
                continue

            print('iter %d/%d' % (i_iter, total_iters), common_reg, str(layer_shape), j_fold)

            valid_pred_list = []

            for i_folds in [0, 1]:

                # Select one of the fold's two training subsets.
                train_pids = provider_ids[train_list[j_fold][i_folds]]
                train_inputs = df_feature.loc[train_pids]
                train_labels = df_label.loc[train_pids]

                t0 = time.time()

                model = MLP_model(
                    params={'n_inputs': n_inputs,
                            'n_labels': n_labels,
                            'layer_shape': layer_shape,
                            'common_reg': common_reg,},
                )

                # The labels are passed as a second model input; no separate target is given.
                model.fit(
                    [train_inputs, train_labels], [],
                    batch_size=batch_size,
                    epochs=n_iters,
                    shuffle=True,
                    workers=20,
                    use_multiprocessing=True,
                    verbose=0,
                )

                # At prediction time an all-zero array of the labels' shape takes the labels' place.
                valid_pred = pd.DataFrame(
                    columns=col_labels,
                    index=valid_inputs.index,
                    data=model.predict([valid_inputs, np.zeros(valid_labels.shape)]),
                )
                valid_pred_list.append(valid_pred)

                t1 = time.time()

                print(train_inputs.shape, 'duration: ', t1-t0, 's')

                # Reset Keras' global state before the next model is built.
                K.clear_session()
                del model

            pred_0, pred_1 = valid_pred_list

            for _label in col_labels:

                y_true = valid_labels[_label].values

                # ROC AUC is undefined unless both classes occur in y_true (roc_auc_score
                # raises ValueError otherwise). This covers all-negative AND all-positive
                # columns; in either case fall back to the chance level 0.5.
                if np.unique(y_true).size > 1:
                    _roc_auc_0 = sklearn.metrics.roc_auc_score(y_true, pred_0[_label].values)
                    _roc_auc_1 = sklearn.metrics.roc_auc_score(y_true, pred_1[_label].values)
                else:
                    _roc_auc_0 = 0.5
                    _roc_auc_1 = 0.5

                roc_grid_first.loc[grid_key, _label] = (_roc_auc_0 + _roc_auc_1) / 2

            # Checkpoint: raw predictions for this cell, then the whole grid.
            with open(output_path + '/%s/preds_first/pred_%s.pickle' % (alg_name, grid_key), 'wb') as f:
                pickle.dump(valid_pred_list, f)
            with open(output_path + '/%s/roc_grid_first.pickle' % (alg_name), 'wb') as f:
                pickle.dump(roc_grid_first, f)

In [28]:
# Keep the rows whose marker column 'C_ARR' is filled, i.e. grid cells already evaluated.
roc_finished = roc_grid_first[roc_grid_first['C_ARR'].notna()]

# Mean AUC over labels per row, then (mean, row count) per (reg, layer), best first.
_row_mean_auc = roc_finished.mean(axis=1)
_row_mean_auc.groupby(['reg', 'layer']).apply(lambda x: (x.mean(), x.shape[0])).sort_values(ascending=False)

reg      layer          
0.00010  [200, 50, 30]      (0.7639141372029464, 11)
         [150, 50, 30]      (0.7639116926436992, 11)
         [100, 30, 20]      (0.7632927791174944, 11)
         [100, 50, 30]      (0.7627022463395235, 11)
         [150, 30, 20]       (0.762617276093102, 11)
         [300, 50, 30]      (0.7625983832464464, 11)
         [300, 30, 20]      (0.7624146806499429, 11)
         [200, 150, 50]     (0.7623926007073973, 11)
         [200, 30, 20]      (0.7621326814708957, 11)
         [150, 100, 50]      (0.761838087425692, 11)
         [100, 50, 20]      (0.7617542894761183, 11)
         [150, 100, 30]      (0.761626692338827, 11)
         [150, 50, 20]      (0.7616025170109206, 11)
         [200, 100, 30]     (0.7615860164184184, 11)
         [200, 150, 100]    (0.7614068392728401, 11)
         [300, 50, 20]      (0.7612125041509873, 11)
         [300, 150, 100]    (0.7611629779105804, 11)
         [300, 100, 50]     (0.7610442951524944, 11)
         [200, 50, 20

## Grid Search - step 2

In [29]:
# Step 2 fixes a single layer shape and searches a finer list of regularisation strengths.
layer_shape_list = [[200, 50, 30]]
common_reg_list = [1e-4, 2e-4, 3e-4, 4e-4, 5e-4, 7e-4, 1e-3, 5e-5, 7e-5, 8e-5, 4e-5, 3e-5, 2e-5, 1e-5]

_current_index = pd.MultiIndex.from_product(
    [common_reg_list, [str(layer_i) for layer_i in layer_shape_list], range(n_folds)],
    names=['reg', 'layer', 'valid'],
)


def _empty_grid():
    """Return an all-NaN result grid: one row per grid cell, one column per label."""
    return pd.DataFrame(index=_current_index, columns=col_labels)


def _merge_saved(grid, file_template):
    """Combine `grid` with the pickled grid stored under `file_template`."""
    with open(output_path + file_template % (alg_name), 'rb') as saved_file:
        saved_grid = pickle.load(saved_file)
    return pd.concat([grid, saved_grid]).groupby(grid.index.names).max()


# ROC AUC plus the three agreement measures between the two models of each fold.
roc_grid = _empty_grid()
cor_grid = _empty_grid()
rho_grid = _empty_grid()
tau_grid = _empty_grid()

alg_name = 'MLP_grid_search'

total_iters = len(roc_grid)

# The presence of the ROC grid file decides whether all four saved grids are loaded.
if os.path.exists(output_path + '/%s/roc_grid.pickle' % (alg_name)):
    print('reading')
    roc_grid = _merge_saved(roc_grid, '/%s/roc_grid.pickle')
    cor_grid = _merge_saved(cor_grid, '/%s/cor_grid.pickle')
    rho_grid = _merge_saved(rho_grid, '/%s/rho_grid.pickle')
    tau_grid = _merge_saved(tau_grid, '/%s/tau_grid.pickle')

print('reading finish')

reading
reading finish


In [30]:
# Grid search, step 2.
# Same procedure as step 1, but in addition to the mean ROC AUC it records how strongly
# the predictions of the two models trained per fold agree with each other
# (Pearson, Spearman and Kendall statistics per label). All grids and the raw predictions
# are written to disk after every cell so an interrupted run can be resumed.
i_iter = 0

# Create the output folder up front; otherwise the first save would fail only after
# two models have already been trained.
os.makedirs(os.path.join(output_path, alg_name, 'preds'), exist_ok=True)

for layer_shape in layer_shape_list:

    for j_fold in range(n_folds):

        valid_pids = provider_ids[valid_list[j_fold]]
        valid_inputs = df_feature.loc[valid_pids]
        valid_labels = df_label.loc[valid_pids]

        for common_reg in common_reg_list:

            i_iter += 1
            grid_key = (common_reg, str(layer_shape), j_fold)

            # Skip cells already filled by a previous run ('C_ARR' serves as the marker column).
            if not pd.isna(roc_grid.loc[grid_key, 'C_ARR']):
                continue

            print('iter %d/%d' % (i_iter, total_iters), common_reg, str(layer_shape), j_fold)

            valid_pred_list = []

            for i_folds in [0, 1]:

                # Select one of the fold's two training subsets.
                train_pids = provider_ids[train_list[j_fold][i_folds]]
                train_inputs = df_feature.loc[train_pids]
                train_labels = df_label.loc[train_pids]

                t0 = time.time()

                model = MLP_model(
                    params={'n_inputs': n_inputs,
                            'n_labels': n_labels,
                            'layer_shape': layer_shape,
                            'common_reg': common_reg,},
                )

                # The labels are passed as a second model input; no separate target is given.
                model.fit(
                    [train_inputs, train_labels], [],
                    batch_size=batch_size,
                    epochs=n_iters,
                    shuffle=True,
                    workers=40,
                    use_multiprocessing=True,
                    verbose=0,
                )

                # At prediction time an all-zero array of the labels' shape takes the labels' place.
                valid_pred = pd.DataFrame(
                    columns=col_labels,
                    index=valid_inputs.index,
                    data=model.predict([valid_inputs, np.zeros(valid_labels.shape)]),
                )
                valid_pred_list.append(valid_pred)

                t1 = time.time()

                print(train_inputs.shape, 'duration: ', t1-t0, 's')

                # Reset Keras' global state before the next model is built.
                K.clear_session()
                del model

            pred_0, pred_1 = valid_pred_list

            for _label in col_labels:

                y_true = valid_labels[_label].values
                scores_0 = pred_0[_label].values
                scores_1 = pred_1[_label].values

                # Agreement between the two models' predictions for this label.
                _cor = scipy.stats.pearsonr(scores_0, scores_1).statistic
                _rho = scipy.stats.spearmanr(scores_0, scores_1).statistic
                _tau = scipy.stats.kendalltau(scores_0, scores_1).statistic

                # ROC AUC is undefined unless both classes occur in y_true (roc_auc_score
                # raises ValueError otherwise). This covers all-negative AND all-positive
                # columns; in either case fall back to the chance level 0.5.
                if np.unique(y_true).size > 1:
                    _roc_auc_0 = sklearn.metrics.roc_auc_score(y_true, scores_0)
                    _roc_auc_1 = sklearn.metrics.roc_auc_score(y_true, scores_1)
                else:
                    _roc_auc_0 = 0.5
                    _roc_auc_1 = 0.5

                roc_grid.loc[grid_key, _label] = (_roc_auc_0 + _roc_auc_1) / 2
                cor_grid.loc[grid_key, _label] = _cor
                rho_grid.loc[grid_key, _label] = _rho
                tau_grid.loc[grid_key, _label] = _tau

            # Checkpoint: raw predictions for this cell, then all four grids.
            with open(output_path + '/%s/preds/pred_%s.pickle' % (alg_name, grid_key), 'wb') as f:
                pickle.dump(valid_pred_list, f)
            with open(output_path + '/%s/roc_grid.pickle' % (alg_name), 'wb') as f:
                pickle.dump(roc_grid, f)
            with open(output_path + '/%s/cor_grid.pickle' % (alg_name), 'wb') as f:
                pickle.dump(cor_grid, f)
            with open(output_path + '/%s/rho_grid.pickle' % (alg_name), 'wb') as f:
                pickle.dump(rho_grid, f)
            with open(output_path + '/%s/tau_grid.pickle' % (alg_name), 'wb') as f:
                pickle.dump(tau_grid, f)

In [31]:
# Keep only the grid rows that have been computed; 'C_ARR' serves as the marker column.
# Each grid is filtered with its OWN mask. Previously cor_grid and tau_grid were filtered
# with rho_grid's mask, which only works while all grids share the same index and fill state.
roc_finished = roc_grid[roc_grid['C_ARR'].notna()]
cor_finished = cor_grid[cor_grid['C_ARR'].notna()]
rho_finished = rho_grid[rho_grid['C_ARR'].notna()]
tau_finished = tau_grid[tau_grid['C_ARR'].notna()]

In [32]:
def _mean_and_count(group_scores):
    """Return (mean score, number of rows) for one (reg, layer) group."""
    return (group_scores.mean(), group_scores.shape[0])


def _rank_settings(finished_grid):
    """Average each row over the labels, summarise per (reg, layer) and sort best first."""
    row_means = finished_grid.mean(axis=1)
    return row_means.groupby(['reg', 'layer']).apply(_mean_and_count).sort_values(ascending=False)


# Order of the tuple: Pearson correlation, Spearman rho, Kendall tau, ROC AUC.
metrics = tuple(
    _rank_settings(finished_grid)
    for finished_grid in (cor_finished, rho_finished, tau_finished, roc_finished)
)
metrics

(reg      layer        
 0.00100  [200, 50, 30]    (0.8874945701481466, 11)
 0.00070  [200, 50, 30]    (0.8839479006058717, 11)
 0.00030  [200, 50, 30]    (0.8755886990657996, 11)
 0.00020  [200, 50, 30]    (0.8730242549970053, 11)
 0.00040  [200, 50, 30]    (0.8723867285920626, 11)
 0.00010  [200, 50, 30]    (0.8631101766157062, 11)
 0.00008  [200, 50, 30]      (0.86197920867077, 11)
 0.00007  [200, 50, 30]    (0.8497091167593246, 11)
 0.00050  [200, 50, 30]    (0.8404469833945257, 11)
 0.00005  [200, 50, 30]    (0.8217925783541683, 11)
 0.00004  [200, 50, 30]    (0.7978493187950462, 11)
 0.00003  [200, 50, 30]    (0.7486739082199102, 11)
 0.00002  [200, 50, 30]    (0.6733764544383989, 11)
 0.00001  [200, 50, 30]    (0.5433027798243638, 11)
 dtype: object,
 reg      layer        
 0.00100  [200, 50, 30]    (0.9127109103757157, 11)
 0.00070  [200, 50, 30]     (0.897100373199974, 11)
 0.00030  [200, 50, 30]    (0.8904943224705275, 11)
 0.00020  [200, 50, 30]    (0.8878943414554951, 11)
