# Example code to train VAE

In [29]:
from simulator import VariationalAutoencoder, save_dict_pickle, load_dict_pickle, SimulationDataScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr

### Load Simulated Data

In [2]:
simulated_data = load_dict_pickle('20200120_simulated_data.pickle')

In [4]:
list(simulated_data.keys())

['X_train_complete',
 'X_val_complete',
 'X_test_complete',
 'Y_train_complete',
 'Y_val_complete',
 'Y_test_complete',
 'X_train_um',
 'X_val_um',
 'X_test_um',
 'Y_train_um',
 'Y_val_um',
 'Y_test_um']

In [6]:
X_train_um = simulated_data['X_train_um']
Y_train_um = simulated_data['Y_train_um']
X_val_um = simulated_data['X_val_um']
Y_val_um = simulated_data['Y_val_um']

In [14]:
X_train_um.describe()

Unnamed: 0,resp,shock_indexes,systolics,temperature,urine,MAP,o2_24h_max,spo2_24hr_min,pulse,AGE_AT_START_OF_ENCOUNTER,...,height,weight,platelets,shock_index_age,anion_gap,pulse_pressure,window_num,albumin,protein_level,diastolics
count,49960.0,49961.0,49987.0,48611.0,37130.0,49988.0,49960.0,49948.0,49995.0,50000.0,...,32580.0,42449.0,43742.0,49953.0,43573.0,49979.0,50000.0,29704.0,29355.0,49986.0
mean,18.431267,0.714924,122.480316,36.788361,274.272186,86.15139,2.142308,92.862534,85.224884,57.526039,...,1.699483,83.152763,222.761078,40.215744,12.053883,0.44377,5.036349,3.703878,6.479697,67.993462
std,2.30773,0.183246,22.053869,0.265728,145.952515,14.096407,2.0542,2.749843,15.372195,14.193875,...,0.065402,14.961164,64.168648,14.070885,1.780867,0.050794,4.729579,0.474927,0.612516,11.067472
min,14.364316,0.2141,73.620064,36.071255,-55.800388,54.802345,-1.355333,61.645058,43.585056,8.474298,...,1.460719,34.275742,-80.985092,0.801388,6.791385,0.238849,-2.576122,1.702451,3.983111,39.285114
25%,17.07394,0.584319,104.082581,36.641098,175.959747,74.469139,0.751234,91.832855,73.670395,47.978536,...,1.652951,72.84314,183.248138,30.27424,10.804937,0.410636,2.847863,3.400325,6.089336,59.218407
50%,17.804399,0.695298,118.635956,36.734791,238.529449,84.281525,1.43941,93.233742,83.905266,59.766405,...,1.699363,81.219131,217.120224,38.329742,11.866473,0.443224,3.807496,3.738561,6.51803,66.474869
75%,18.948053,0.825283,137.7108,36.852093,333.332825,96.065742,2.853918,94.521477,95.448303,68.239954,...,1.746406,91.574539,255.831543,48.442451,13.086332,0.477468,5.330729,4.041468,6.915159,75.622864
max,45.023113,1.650467,211.602493,39.900089,1294.61084,146.211853,16.435431,99.100471,152.501236,93.911957,...,1.921419,174.343643,634.443787,110.93277,22.771687,0.640637,70.544189,5.074804,8.302896,121.379723


### Check Missingness

In [9]:
X_train_um.isna().mean()

resp                         0.00080
shock_indexes                0.00078
systolics                    0.00026
temperature                  0.02778
urine                        0.25740
MAP                          0.00024
o2_24h_max                   0.00080
spo2_24hr_min                0.00104
pulse                        0.00010
AGE_AT_START_OF_ENCOUNTER    0.00000
bun                          0.13058
calcium                      0.12912
chloride                     0.12788
co2                          0.12976
creatinine                   0.12732
glucose                      0.09140
hematocrit                   0.11732
hemoglobin                   0.12062
mean_corps_hgb               0.12254
mean_corps_hgb_conc          0.12288
mean_corps_hgb_vol           0.12364
mean_platelet_vol            0.13586
potassium                    0.11558
red_blood_cell_count         0.12384
red_cell_dist_width          0.12604
sodium                       0.11644
gcs                          0.22154
w

### Scale and Impute with mean
* Better imputation methods could be used here

In [12]:
data_scaler = SimulationDataScaler()

In [13]:
data_scaler.fit(X_train_um)

<simulator.SimulationDataScaler at 0x7fe4bf2effd0>

In [17]:
X_train_um_scaled, X_train_um_ind = data_scaler.transform(X_train_um)
X_val_um_scaled, X_val_um_ind = data_scaler.transform(X_val_um)

In [18]:
X_train_um_scaled.describe()

Unnamed: 0,resp,shock_indexes,systolics,temperature,urine,MAP,o2_24h_max,spo2_24hr_min,pulse,AGE_AT_START_OF_ENCOUNTER,...,height,weight,platelets,shock_index_age,anion_gap,pulse_pressure,window_num,albumin,protein_level,diastolics
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,7.324219e-09,2.441406e-09,1.052856e-08,7.324219e-09,1.220703e-09,1.00708e-08,0.0,2.441406e-09,1.586914e-08,-1.220703e-08,...,-1.098633e-08,7.629395e-09,1.098633e-08,7.324219e-09,-9.765625e-09,1.039505e-08,-3.662109e-09,-4.882812e-09,1.342773e-08,-8.544922e-09
std,0.9996099,0.9996199,0.99988,0.9860221,0.861751,0.99989,0.99961,0.9994898,0.99996,1.00001,...,0.8072255,0.9214103,0.9353381,0.9995399,0.9335296,0.9998,1.00001,0.7707736,0.7662322,0.99987
min,-1.762334,-2.733105,-2.215518,-2.69868,-2.261537,-2.223925,-1.702695,-11.35257,-2.708803,-3.455873,...,-3.650757,-3.266965,-4.733614,-2.801157,-2.955055,-4.034411,-1.609561,-4.214249,-4.076025,-2.593966
25%,-0.5879122,-0.7120336,-0.8339903,-0.5443461,-0.5286755,-0.8286207,-0.676989,-0.3735618,-0.7516282,-0.672656,...,-0.3148487,-0.5847651,-0.5317143,-0.7057414,-0.6054935,-0.6520727,-0.4627278,-0.1247874,-0.1297121,-0.7927659
50%,-0.2711526,-0.106009,-0.1733065,-0.1815041,0.0,-0.1321312,-0.341537,0.1339325,-0.0857383,0.1578422,...,0.0,0.0,0.0,-0.1327677,0.0,-0.01032614,-0.2598256,0.0,0.0,-0.1367543
75%,0.2236488,0.6012142,0.6903922,0.22143,0.1249265,0.7031777,0.345502,0.6026543,0.6649721,0.7548344,...,0.3231322,0.409827,0.4051115,0.5836623,0.4573106,0.6631775,0.06224296,0.2687464,0.2535732,0.6891508
max,11.52306,5.105456,4.041153,11.71029,6.990989,4.260736,6.95807,2.268493,4.37654,2.56352,...,3.393467,6.095245,6.415709,5.02582,6.018375,3.875848,13.85081,2.886654,2.976625,4.823757


### Fit Model

In [20]:
input_features_with_y = X_train_um_scaled.shape[1] + 1 # add 1 for target

In [21]:
vae = VariationalAutoencoder(input_features=input_features_with_y, codings_size=16, d1_size=256, d2_size=128, d3_size=64, d4_size=32, patience=7, optimizer='adam')

In [23]:
history = vae.fit(X_train_um_scaled,Y_train_um,X_val_um_scaled,Y_val_um
                  ,batch_size=512, epochs=100)

Train on 50000 samples, validate on 10000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100


### Evaluate Performance

In [25]:
### Add Y to X matrix for evaluation

X_val_um_scaled_with_y = X_val_um_scaled.copy()
X_val_um_scaled_with_y['Target'] = Y_val_um

X_val_um_with_y = X_val_um.copy()
X_val_um_with_y['Target'] = Y_val_um

In [24]:
_, _, codings_val = vae.predict_encoder(X_val_um_scaled,Y_val_um)
X_val_with_y_pred = vae.predict_decoder(codings_val)

In [26]:
corr_mat_pred = pd.DataFrame(X_val_with_y_pred).corr()
corr_mat_imp = pd.DataFrame(X_val_um_scaled_with_y).corr()
corr_mat_raw = pd.DataFrame(X_val_um_with_y).corr()

In [27]:
(np.sign(np.triu(corr_mat_pred)) == np.sign(np.triu(corr_mat_raw))).mean()


0.967391304347826

* The signs of the upper triangle of the correlation matricies match very well!

In [30]:
X_val_um_with_y_df = pd.DataFrame(X_val_um_with_y)
X_val_with_y_pred_df = pd.DataFrame(X_val_with_y_pred)

loss1_cors = []
for i in range(0,X_val_um_with_y_df.shape[1]):
    use = X_val_um_with_y_df.iloc[:,i].notna().values
    r,p = pearsonr( X_val_um_with_y_df.loc[use].iloc[:,i].values,
                   X_val_with_y_pred_df.loc[use].iloc[:,i].values )
    loss1_cors.append(r)


In [31]:
np.mean(loss1_cors)

0.8751452669550324

* The average Pearson correlation between the reconstructions and the raw variables is strong

### Save model

In [33]:
out_model = 'example_model.h5'
vae.save(out_model)

In [34]:
!ls -lh $out_model

-rw-r--r-- 1 cgillies 4294967294 476K Jan 20 16:51 example_model.h5
