In [None]:
import os
import pickle
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score
from tensorflow import keras
from tensorflow.keras import layers, losses, backend, activations
from tensorflow.keras import backend as K
from tensorflow.python.framework.ops import disable_eager_execution

import custom_utils
from custom_utils import validation_setup, MLP_model, PHL_model

# Switch TensorFlow to graph (non-eager) execution for everything that follows.
disable_eager_execution()
# Ask the TensorFlow C++ runtime for minimal logging.
# NOTE(review): this is assigned after `import tensorflow` has already run;
# presumably it has to be set before the import to silence start-up messages -- confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Let pandas display up to 5000 rows before truncating.
pd.set_option("display.max_rows", 5000)

---
### read data

In [None]:
# Directory the pickled feature / label frames are read from.
data_path = '../data/mimic'
# Root directory under which each algorithm writes its own folder of pickled predictions.
output_path = '../output/mimic'

In [None]:
# NOTE: pickle.load can execute arbitrary code -- only point data_path at trusted files.

# Feature frame: unpickle, then replace missing values with 0.
with open(data_path+'/df_feature.pickle', 'rb') as feature_file:
    df_feature = pickle.load(feature_file)
df_feature = df_feature.fillna(0)

# Label frame: unpickle, zero-fill, and drop the 'C_HUA' column.
# Disease HUA is not predicted because most of the providers did not have
# corresponding diagnosis records.
with open(data_path+'/df_label.pickle', 'rb') as label_file:
    df_label = pickle.load(label_file)
df_label = df_label.fillna(0)
df_label = df_label.drop('C_HUA', axis=1)

---
### setups

In [None]:
# Unique values of the first index level of the feature frame; later cells use
# them as provider identifiers to slice df_feature / df_label with .loc.
provider_ids = df_feature.index.get_level_values(0).unique()

# Fold layout handed to validation_setup() in the next code cell.
n_folds, n_providers_per_fold = 11, 5
n_providers = n_folds * n_providers_per_fold 
# Number of feature columns / label columns, passed to the model factories.
n_inputs = df_feature.shape[1]
n_labels = df_label.shape[1]

col_labels = df_label.columns

# Passed to every .fit() call as batch_size and epochs respectively.
batch_size = 100
n_iters = 40
# Not referenced by any other cell visible in this notebook.
epsilon = 1e-7

# common hyper-parameters (tuned on MLP)
layer_shape = [200, 50, 30]
# Grid 0.0, 0.5, ..., 5.0 swept as `w_pa` in the fold-wise PHL step-2 cell.
wpa_list = np.arange(0,11)/2
# Passed to the model factories as 'common_reg'.
used_reg = 8e-5
# `w_pa` value used by the all-providers and combined PHL cells.
used_wpa = 2

# PHL hyper-parameters
# Passed to PHL_model as 'gk_l1l2' and 'gb_l1l2'.
gk_l1l2, gb_l1l2 = 1e-2, 3e-2

---
### train test split, $2\times k$ folds

In [None]:
# Build the fold assignment with the project helper.  Later cells index
# provider_ids with train_list[j_fold][i_fold] (i_fold in {0, 1}) for training
# and with valid_list[j_fold] for validation.
# NOTE(review): presumably both hold positional indices into provider_ids -- confirm in custom_utils.
train_list, valid_list = validation_setup(n_folds, n_providers_per_fold)

---
## 1. Baseline
### 1.1 train by folds (LORS)

In [None]:
alg_name = 'MLP'

# Folder that receives the baseline's validation predictions.
# os.makedirs(..., exist_ok=True) also creates a missing `output_path` parent,
# which an exists-check followed by os.mkdir cannot do.
out_dir = output_path + '/' + alg_name
os.makedirs(out_dir, exist_ok=True)

for j_fold in range(n_folds):

    for i_fold in range(2):

        # A fresh model is built for every split; release the Keras global
        # state left by the previous one so graphs do not pile up in memory.
        K.clear_session()

        # select training and validation sets
        train_pids = provider_ids[train_list[j_fold][i_fold]]
        train_inputs = df_feature.loc[train_pids]
        train_labels = df_label.loc[train_pids]

        valid_pids = provider_ids[valid_list[j_fold]]
        valid_inputs = df_feature.loc[valid_pids]
        valid_labels = df_label.loc[valid_pids]

        print(j_fold, i_fold, train_pids, valid_pids, train_inputs.shape,  valid_inputs.shape)

        model = MLP_model(
            params={'n_inputs': n_inputs,
                    'n_labels': n_labels,
                    'layer_shape': layer_shape,
                    'common_reg': used_reg,},
        )

        # Labels go in as a second model input and the target list is empty.
        # NOTE(review): presumably the loss is attached inside MLP_model -- confirm in custom_utils.
        model.fit(
            [train_inputs, train_labels], [],
            batch_size=batch_size,
            epochs=n_iters,
            shuffle=True,
            workers=40,
            use_multiprocessing=True,
            verbose=0,
        )

        # At prediction time an all-zero array of the label shape fills the label input.
        valid_pred = model.predict([valid_inputs, np.zeros(valid_labels.shape)])

        # One pickle per (fold, split) with the validation predictions.
        with open(out_dir + '/pred_valid_%d_split_%d.pickle' % (j_fold, i_fold), 'wb') as f:
            pickle.dump(valid_pred, f)

---
## 2. the propensity-harnessed learning (PHL) - step 1 & 2 separated

### 2.1 step 1 train individual models

In [None]:
alg_name = 'ind'

# Folder holding the cross-provider predictions of every individual model.
# os.makedirs(..., exist_ok=True) also creates a missing `output_path` parent,
# which an exists-check followed by os.mkdir cannot do.
out_dir = output_path + '/' + alg_name
os.makedirs(out_dir, exist_ok=True)

for i_train, train_pid in enumerate(provider_ids):

    print('train',i_train, train_pid)

    # One model per provider: release the Keras global state of the previous
    # model so graphs do not accumulate across the loop.
    K.clear_session()

    ind_model = MLP_model(
        params={'n_inputs': n_inputs,
                'n_labels': n_labels,
                'layer_shape': layer_shape,
                'common_reg': used_reg,},
    )

    # Train on this provider's rows only; labels are fed as a second input
    # and the target list is empty.
    ind_model.fit(
        [df_feature.loc[train_pid], df_label.loc[train_pid]], [],
        batch_size=batch_size,
        epochs=n_iters,
        shuffle=True,
        workers=40,
        use_multiprocessing=True,
        verbose=0,
    )

    # Score every provider's rows (including the training provider's own)
    # with this model and store one pickle per (train, valid) pair.
    for valid_pid in provider_ids:

        valid_features = df_feature.loc[valid_pid]
        # An all-zero array of the label shape fills the label input at prediction time.
        valid_pred = ind_model.predict([valid_features,
                                        np.zeros(df_label.loc[valid_pid].shape)])

        with open(out_dir + '/pred_train_on_%s_valid_on_%s.pickle' % (train_pid, valid_pid), 'wb') as f:
            pickle.dump(valid_pred, f)

### 2.2  step 2 by folds (LORS)

In [None]:
def _load_fk_inputs(model_pids, data_pids):
    """Assemble cached individual-model predictions into a single array.

    For each provider in `model_pids`, the pickles written by the 'ind' step
    (model trained on that provider, evaluated on each provider in
    `data_pids`) are stacked row-wise; the resulting per-model blocks are then
    placed side by side along a new axis 1, in `model_pids` order.
    """
    per_model_blocks = []
    for model_pid in model_pids:
        per_provider = []
        for data_pid in data_pids:
            with open(output_path + '/ind/pred_train_on_%s_valid_on_%s.pickle' % (model_pid, data_pid), 'rb') as f:
                per_provider.append(pickle.load(f))
        per_model_blocks.append(np.vstack(per_provider)[:, np.newaxis, :])
    return np.hstack(per_model_blocks)


for wpa in wpa_list:

    alg_name = 'PHL-w=%.1f' % (wpa)

    # One output folder per w_pa value; makedirs also creates a missing parent.
    out_dir = output_path + '/' + alg_name
    os.makedirs(out_dir, exist_ok=True)

    for j_fold in range(n_folds):

        for i_fold in range(2):

            split_tag = 'valid_%d_split_%d.pickle' % (j_fold, i_fold)

            # Resume support: pred_g_* is the last file written for a split,
            # so its presence means this split has already been completed.
            if os.path.exists(out_dir + '/pred_g_' + split_tag):
                continue

            # select training and validation sets
            train_pids = provider_ids[train_list[j_fold][i_fold]]
            n_train_providers = len(train_pids)
            train_inputs = df_feature.loc[train_pids]
            train_labels = df_label.loc[train_pids]
            # One indicator column per training provider: 1 on that provider's
            # rows, 0 elsewhere (concat leaves NaN off-block, fillna zeroes it).
            train_flags = pd.concat([pd.DataFrame(index=df_feature.loc[pid].index, columns=[pid], data=1) for pid in train_pids]).fillna(0)

            valid_pids = provider_ids[valid_list[j_fold]]
            valid_inputs = df_feature.loc[valid_pids]
            valid_labels = df_label.loc[valid_pids]

            print(j_fold, i_fold, train_pids, valid_pids, train_inputs.shape,  valid_inputs.shape)

            # step 1, get predictions from individual classifiers
            train_fk_inputs = _load_fk_inputs(train_pids, train_pids)
            valid_fk_inputs = _load_fk_inputs(train_pids, valid_pids)

            # step 2, train PHL model
            # Release the Keras global state of the previous split's model.
            K.clear_session()

            # The factory is reached through the module and the instance gets
            # its own name: rebinding `PHL_model` to the instance would make
            # the next iteration call a model object instead of the factory.
            phl_model = custom_utils.PHL_model(
                params={'n_inputs': n_inputs,
                        'n_labels': n_labels,
                        'layer_shape': layer_shape,
                        'common_reg': used_reg,
                        'n_train_providers': n_train_providers,
                        'gk_l1l2': gk_l1l2,
                        'gb_l1l2': gb_l1l2},
                w_pa=wpa,
            )

            print('learning PHL model')

            t0 = time.time()

            phl_model.fit(
                [train_inputs, train_fk_inputs, train_labels, train_flags], [],
                batch_size=batch_size,
                epochs=n_iters,
                shuffle=True,
                workers=40,
                use_multiprocessing=True,
                verbose=0,
            )

            t1 = time.time()

            print('duration: ', t1-t0, 's')

            # prediction
            # The label and provider-flag inputs are replaced by all-zero arrays.
            train_pred = phl_model.predict([train_inputs,
                                            train_fk_inputs,
                                            np.zeros(train_labels.shape),
                                            np.zeros([train_labels.shape[0], n_train_providers])])

            train_f_outputs, train_g_outputs = train_pred

            valid_pred = phl_model.predict([valid_inputs,
                                            valid_fk_inputs,
                                            np.zeros(valid_labels.shape),
                                            np.zeros([valid_labels.shape[0], n_train_providers])])

            valid_f_outputs, valid_g_outputs = valid_pred

            # storage
            # pred_g_* must stay last: the resume check above keys on it.
            with open(out_dir + '/train_pred_' + split_tag, 'wb') as f:
                pickle.dump(train_f_outputs, f)
            with open(out_dir + '/train_pred_g_' + split_tag, 'wb') as f:
                pickle.dump(train_g_outputs, f)
            with open(out_dir + '/pred_' + split_tag, 'wb') as f:
                pickle.dump(valid_f_outputs, f)
            with open(out_dir + '/pred_g_' + split_tag, 'wb') as f:
                pickle.dump(valid_g_outputs, f)

### 2.3 step 2 using all physicians 

In [None]:
alg_name = 'PHL-all-w=%.1f' % (used_wpa)

# Output folder for the all-providers run; makedirs also creates a missing parent.
out_dir = output_path + '/' + alg_name
os.makedirs(out_dir, exist_ok=True)

# select training set: every provider is used, there is no validation split
train_pids = provider_ids
n_train_providers = len(train_pids)
train_inputs = df_feature.loc[train_pids]
train_labels = df_label.loc[train_pids]
# One indicator column per provider: 1 on that provider's rows, 0 elsewhere
# (concat leaves NaN off-block, fillna zeroes it).
train_flags = pd.concat([pd.DataFrame(index=df_feature.loc[pid].index, columns=[pid], data=1) for pid in train_pids]).fillna(0)

print(train_pids, train_inputs.shape)

# step 1, get predictions from individual classifiers
# For each individual model, stack its cached predictions on every provider
# row-wise, then place the per-model blocks side by side along a new axis 1.
train_fk_inputs = []
for train_pid in train_pids:
    ind_fk_inputs = []
    for valid_pid in train_pids:
        with open(output_path + '/ind/pred_train_on_%s_valid_on_%s.pickle' % (train_pid, valid_pid), 'rb') as f:
            ind_fk_inputs.append(pickle.load(f))
    ind_fk_inputs = np.vstack(ind_fk_inputs)
    train_fk_inputs.append(ind_fk_inputs[:, np.newaxis, :])
train_fk_inputs = np.hstack(train_fk_inputs)

# step 2, train PHL model
# The factory is called through the module so this cell still works when an
# earlier cell has rebound the bare name `PHL_model` to a model instance.
# The trained model is deliberately stored under `PHL_model`: the next cell
# reads its 'g_multihead_layer' weights through that name.
PHL_model = custom_utils.PHL_model(
    params={'n_inputs': n_inputs,
            'n_labels': n_labels,
            'layer_shape': layer_shape,
            'common_reg': used_reg,
            'n_train_providers': n_train_providers,
            'gk_l1l2': gk_l1l2,
            'gb_l1l2': gb_l1l2},
    w_pa=used_wpa,
)

print('learning PHL model')

t0 = time.time()

PHL_model.fit(
    [train_inputs, train_fk_inputs, train_labels, train_flags], [],
    batch_size=batch_size,
    epochs=n_iters,
    shuffle=True,
    workers=40,
    use_multiprocessing=True,
    verbose=0,
)

t1 = time.time()

print('duration: ', t1-t0, 's')

# prediction
# The label and provider-flag inputs are replaced by all-zero arrays.
train_pred = PHL_model.predict([train_inputs,
                                train_fk_inputs,
                                np.zeros(train_labels.shape),
                                np.zeros([train_labels.shape[0], n_train_providers])])

train_f_outputs, train_g_outputs = train_pred

# storage
with open(out_dir + '/train_pred.pickle', 'wb') as f:
    pickle.dump(train_f_outputs, f)
with open(out_dir + '/train_pred_g.pickle', 'wb') as f:
    pickle.dump(train_g_outputs, f)

In [None]:
# Pull the layer named 'g_multihead_layer' out of the model bound to PHL_model
# and pickle the evaluated values of its kernel and bias into the current
# alg_name folder, kernel first.
g_multihead_layer = PHL_model.get_layer(name='g_multihead_layer')
weight_files = (
    ('/%s/train_kernel.pickle', 'kernel'),
    ('/%s/train_bias.pickle', 'bias'),
)
for path_template, weight_attr in weight_files:
    with open(output_path + path_template % (alg_name), 'wb') as f:
        # K.eval turns the backend variable into a concrete array before pickling.
        weight_value = K.eval(getattr(g_multihead_layer, weight_attr))
        pickle.dump(weight_value, f)

---
## 3. the propensity-harnessed learning (PHL) - step 1 & 2 combined

In [None]:
alg_name = 'PHL'

# Output folder for the combined pipeline; makedirs also creates a missing parent.
out_dir = output_path + '/' + alg_name
os.makedirs(out_dir, exist_ok=True)

for j_fold in range(n_folds):

    for i_fold in range(2):

        # select training and validation sets
        train_pids = provider_ids[train_list[j_fold][i_fold]]
        n_train_providers = len(train_pids)
        train_inputs = df_feature.loc[train_pids]
        train_labels = df_label.loc[train_pids]
        # One indicator column per training provider: 1 on that provider's
        # rows, 0 elsewhere (concat leaves NaN off-block, fillna zeroes it).
        train_flags = pd.concat([pd.DataFrame(index=df_feature.loc[pid].index, columns=[pid], data=1) for pid in train_pids]).fillna(0)

        valid_pids = provider_ids[valid_list[j_fold]]
        valid_inputs = df_feature.loc[valid_pids]
        valid_labels = df_label.loc[valid_pids]

        print(j_fold, i_fold, train_pids, valid_pids, train_inputs.shape,  valid_inputs.shape)

        # step 1, train individual MLP models
        train_fk_inputs = []
        valid_fk_inputs = []

        for pid in train_pids:

            print('learning ind models, ', pid, end='\r')

            # Each individual model is only needed until its predictions are
            # taken below, so the previous model's Keras state can be released.
            K.clear_session()

            ind_model = MLP_model(
                params={'n_inputs': n_inputs,
                        'n_labels': n_labels,
                        'layer_shape': layer_shape,
                        'common_reg': used_reg,}
            )

            # Train on this single provider's rows.
            ind_model.fit(
                [df_feature.loc[pid], df_label.loc[pid]], [],
                batch_size=batch_size,
                epochs=n_iters,
                shuffle=True,
                workers=40,
                use_multiprocessing=True,
                verbose=0,
            )

            # Predictions on the whole training / validation set, with a new
            # axis 1 so the per-model blocks can be joined side by side.
            train_fk_inputs.append(ind_model.predict([train_inputs, np.zeros(train_labels.shape)])[:, np.newaxis, :])
            valid_fk_inputs.append(ind_model.predict([valid_inputs, np.zeros(valid_labels.shape)])[:, np.newaxis, :])

        train_fk_inputs = np.hstack(train_fk_inputs)
        valid_fk_inputs = np.hstack(valid_fk_inputs)

        # step 2, train PHL model
        # All individual-model outputs are plain arrays by now.
        K.clear_session()

        # The factory is reached through the module and the instance gets its
        # own name: rebinding `PHL_model` to the instance would make the next
        # iteration call a model object instead of the factory.
        phl_model = custom_utils.PHL_model(
            params={'n_inputs': n_inputs,
                    'n_labels': n_labels,
                    'layer_shape': layer_shape,
                    'common_reg': used_reg,
                    'n_train_providers': n_train_providers,
                    'gk_l1l2': gk_l1l2,
                    'gb_l1l2': gb_l1l2},
            w_pa=used_wpa,
        )

        print('learning PHL model')

        phl_model.fit(
            [train_inputs, train_fk_inputs, train_labels, train_flags], [],
            batch_size=batch_size,
            epochs=n_iters,
            shuffle=True,
            workers=40,
            use_multiprocessing=True,
            verbose=2,
        )

        # prediction
        # The label and provider-flag inputs are replaced by all-zero arrays.
        train_pred = phl_model.predict([train_inputs,
                                        train_fk_inputs,
                                        np.zeros(train_labels.shape),
                                        np.zeros([train_labels.shape[0], n_train_providers])])

        train_f_outputs, train_g_outputs = train_pred

        valid_pred = phl_model.predict([valid_inputs,
                                        valid_fk_inputs,
                                        np.zeros(valid_labels.shape),
                                        np.zeros([valid_labels.shape[0], n_train_providers])])

        valid_f_outputs, valid_g_outputs = valid_pred

        # storage
        split_tag = 'valid_%d_split_%d.pickle' % (j_fold, i_fold)
        with open(out_dir + '/train_pred_' + split_tag, 'wb') as f:
            pickle.dump(train_f_outputs, f)
        with open(out_dir + '/train_pred_g_' + split_tag, 'wb') as f:
            pickle.dump(train_g_outputs, f)
        with open(out_dir + '/pred_' + split_tag, 'wb') as f:
            pickle.dump(valid_f_outputs, f)
        with open(out_dir + '/pred_g_' + split_tag, 'wb') as f:
            pickle.dump(valid_g_outputs, f)
            