Objective of this notebook is to recreate vicreg for pretraining purposes

In [1]:
import argparse
import numpy as np
import logging, os
logging.disable(logging.WARNING)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import tensorflow as tf
import tensorflow_probability as tfp
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score as auprc
from sklearn.metrics import roc_auc_score as auc_score
import keras

#from keras.utils import multi_gpu_model
from keras.layers import Input, Dense, GRU, Lambda, Permute, Concatenate, Dropout
from keras.models import Model
from interpolation_layer import single_channel_interp, cross_channel_interp
from mimic_preprocessing import load_data, trim_los, fix_input_format
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

np.random.seed(10)
from sklearn.feature_extraction.text import TfidfVectorizer
os.environ["CUDA_VISIBLE_DEVICES"] = "04"

In [2]:
'''
hid = 8192
mod = tf.keras.Sequential()
mod.add(Dense(hid, input_shape=(512,)))
mod.add(Dense(hid))
mod.add(Dense(hid))
mod.compile(optimizer="adam", loss="mean_squared_error")
mod.summary()

x = tf.random.uniform(shape=(2560, 512))
y = tf.random.uniform(shape=(2560, 8192), minval=0, maxval=1, dtype=tf.int32)

mod.fit(x, y, epochs=30)
'''

'\nhid = 8192\nmod = tf.keras.Sequential()\nmod.add(Dense(hid, input_shape=(512,)))\nmod.add(Dense(hid))\nmod.add(Dense(hid))\nmod.compile(optimizer="adam", loss="mean_squared_error")\nmod.summary()\n\nx = tf.random.uniform(shape=(2560, 512))\ny = tf.random.uniform(shape=(2560, 8192), minval=0, maxval=1, dtype=tf.int32)\n\nmod.fit(x, y, epochs=30)\n'

In [3]:
import sys
sys.path
if '/home/ugrads/n/nickcheng0921/TAMU-MedResearch/' not in sys.path:
    sys.path.append('/home/ugrads/n/nickcheng0921/TAMU-MedResearch/')
    
from helper import hold_out, mean_imputation

In [29]:
gpu_num = 1
epoch = 30
hid = 512 #can be 128-512
exp_hid = 512
ref_points = 128
hours_look_ahead = 48
if gpu_num > 0:
    batch = 512*gpu_num
else:
    batch = 512
    
#vicreg parameters
epsilon = .0001 #small scalar to prevent numerical instabilities
gamma = 1 #constant target value for SD

#nicks notes args
vocabulary = 6000

## Loading Data
adjust # of patients in mimic_preprocessing.py

In [30]:
# Loading dataset - explanation in multivariate notebook
vitals, label = load_data(look_ahead_time = hours_look_ahead)
vitals, timestamps = trim_los(vitals, hours_look_ahead)
x, m, T = fix_input_format(vitals, timestamps)
mean_imputation(x, m)
x = np.concatenate((x, m, T, hold_out(m)), axis=1)  # input format
y = np.array(label)
print(f"X shape: {x.shape}, Y shape: {y.shape}")
timestamp = x.shape[2]
num_features = x.shape[1] // 4
#     have an array representation.
# m : (N, D, tn) where m[i,j,k] = 0 means that x[i,j,k] is not observed.
# T : (N, D, tn) represents the actual time stamps of observation;

Loading files ...
Loading Done with 12000 patients! Nick
10852 10852
(10852, 12, 200) 10852
X shape: (10852, 48, 200), Y shape: (10852,)


In [31]:
import pickle
patient_notes = pickle.load(open('notes_12000_'+str(hours_look_ahead)+'hrs.p', 'rb'))

# Unsupervised Model

In [7]:
coef_var = 256
coef_cov = 1
coef_inv = 1

In [9]:
def customloss(ytrue, ypred):
    print("AUX loss")
    """ Autoencoder loss
    """
    # standard deviation of each feature mentioned in paper for MIMIC_III data
    wc = np.array([3.33, 23.27, 5.69, 22.45, 14.75, 2.32,
                   3.75, 1.0, 98.1, 23.41, 59.32, 1.41])
    wc.shape = (1, num_features)
    y = ytrue[:, :num_features, :]
    m2 = ytrue[:, 3*num_features:4*num_features, :]
    m2 = 1 - m2
    m1 = ytrue[:, num_features:2*num_features, :]
    m = m1*m2
    ypred = ypred[:, :num_features, :]
    x = (y - ypred)*(y - ypred)
    x = x*m
    count = tf.reduce_sum(m, axis=2)
    count = tf.where(count > 0, count, tf.ones_like(count))
    x = tf.reduce_sum(x, axis=2)/count
    x = x/(wc**2)  # dividing by standard deviation
    x = tf.reduce_sum(x, axis=1)/num_features
    return tf.reduce_mean(x)

seed = 0
results = {}
results['inv'] = []
results['var'] = []
results['cov'] = []

In [10]:
#LARS from https://github.com/keras-team/keras-contrib/blob/master/keras_contrib/optimizers/lars.py

In [32]:
#model https://github.com/eyalzk/sketch_rnn_keras/blob/002931b9abea957a77382688f37d95afbb2ae6cb/seq2seqVAE.py
#https://towardsdatascience.com/advanced-keras-constructing-complex-custom-losses-and-metrics-c07ca130a618
class FusionModel(object):
    def __init__(self):
        if gpu_num > 1:
            dev = "/cpu:0"
        else:
            dev = "/gpu:0"
        with tf.device(dev):
            self.main_input = Input(shape=(4*num_features, timestamp), name='input')
            self.notes_input = Input(shape=(vocabulary), name='notes_input')
            self.notes_output = Dense(hid, activation='sigmoid', name='text_dense_1')(self.notes_input)
            self.notes_output = Dropout(0.2)(self.notes_output)
            self.notes_output = Dense(hid, activation='sigmoid', name='text_dense_2')(self.notes_output)
            
            self.sci = single_channel_interp(ref_points, hours_look_ahead)
            self.cci = cross_channel_interp()
            self.interp = self.cci(self.sci(self.main_input))
            self.reconst = self.cci(self.sci(self.main_input, reconstruction=True),
                          reconstruction=True)
            self.aux_output = Lambda(lambda x: x, name='aux_output')(self.reconst)
            self.z = Permute((2, 1))(self.interp)
            self.z = GRU(hid, activation='tanh', recurrent_dropout=0.2, dropout=0.2, name='series_output')(self.z)
            
            self.exp_head = tf.keras.Sequential()
            self.exp_head.add(Dense(exp_hid, input_shape=(hid,)))
            #self.exp_head.add(Dense(exp_hid))
            #self.exp_head.add(Dense(exp_hid))
            
            self.notes_output = self.exp_head(self.notes_output)
            self.z = self.exp_head(self.z)
            
            #print(f"Z SHAPE {z.shape} NOTES SHAPE {notes_input.shape} MERGED SHAPE {merged_input.shape}")
            self.merged_output = Concatenate(name='merged_output')([self.notes_output, self.z])
            self.model = Model([self.main_input, self.notes_input], [self.merged_output, self.aux_output])
            
            trainable_count = np.sum([K.count_params(w) for w in self.model.trainable_weights])
            print(f"train params: {trainable_count}")
        
    def calculate_vic_loss(self, y_true, y_pred):
        """calculate mse across notes and series output for VICReg"""
        branch1, branch2 = tf.split(y_pred, num_or_size_splits=2, axis=1)
        batch_size = branch1.shape[0]
        
        #variance
        var1, var2 = tf.zeros(shape=[]), tf.zeros(shape=[])
        for i in range(hid):
            var1 += max(0, gamma - (tf.math.reduce_variance(branch1[:, i], axis=-1) + epsilon)**(1/2))
            var2 += max(0, gamma - (tf.math.reduce_variance(branch2[:, i], axis=-1) + epsilon)**(1/2))
        var1 /= hid
        var2 /= hid
        var1 *= coef_var
        var2 *= coef_var
        #invariance
        mse = coef_inv*tf.reduce_sum(K.square(branch1 - branch2))/batch_size
        
        #covariance
        cov1 = tfp.stats.covariance(branch1, sample_axis=1, event_axis=0)
        all_cov1 = tf.reduce_sum(K.square(cov1))
        diag_cov1 = tf.reduce_sum(K.square(tf.linalg.diag_part(cov1)))
        cov1 = coef_cov*(all_cov1 - diag_cov1)/hid
        
        cov2 = tfp.stats.covariance(branch2, sample_axis=1, event_axis=0)
        all_cov2 = tf.reduce_sum(K.square(cov2))
        diag_cov2 = tf.reduce_sum(K.square(tf.linalg.diag_part(cov2)))
        cov2 = coef_cov*(all_cov2 - diag_cov2)/hid
        
        results['inv'].append(mse)
        results['var'].append(var1+var2)
        results['cov'].append(cov1+cov2)
        return mse+var1+var2+cov1+cov2

## Notes vectorizer
Takes input of size n, and vocab of length l.
returns vector of (n, l) where l is vocab embeddings

In [33]:
#from tensorflow.python.framework.ops import disable_eager_execution, enable_eager_execution
#disable_eager_execution()
#enable_eager_execution()
#custom loss needs eager execution, custom optimizer needs disabled eager execution

In [27]:
from keras.losses import mse
from keras import backend as K

In [34]:
i = 1
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
notes_vectorizer = TfidfVectorizer(max_features=vocabulary)

model = None
for train, test in kfold.split(np.zeros(len(y)), y):
    print("Running Fold:", i)
    FM = FusionModel()
    model = FM.model  # re-initializing every time
    kfold_notes_train = [patient_notes[i] for i in train]
    kfold_notes_test = [patient_notes[i] for i in test]
    notes_tfidf = notes_vectorizer.fit(kfold_notes_train) #train vocab on train set, then use vectorizer on test set
    #https://stackoverflow.com/questions/52357542/attributeerror-tensor-object-has-no-attribute-numpy
    model.compile(
        optimizer='adam', run_eagerly = True, #disables graph execution, but allows us to grab tensor values instantly 
        loss={'merged_output': FM.calculate_vic_loss, 'aux_output': customloss}) #eager needed to view tensors
    model.fit(
        {'input': x[train], 'notes_input': notes_vectorizer.transform(kfold_notes_train).todense()}, {'merged_output': [], 'aux_output': x[train]},
        batch_size=batch,
        epochs=epoch,
        verbose=2)
    #model.metrics_names gives loss names for evaluate
    print(model.metrics_names)

    model.save('reg1_expander'+str(i)+'.weights')
    i += 1
    break

Running Fold: 1
train params: 4442624
Epoch 1/30
17/17 - 147s - loss: 545.2604 - merged_output_loss: 545.2604 - aux_output_loss: 0.0000e+00 - 147s/epoch - 9s/step
Epoch 2/30


KeyboardInterrupt: 

In [None]:
plt.plot(results['inv'], label='inv')
plt.plot(results['cov'], label='cov')
plt.plot(results['var'], label='var')
plt.legend(loc='lower right')

In [None]:
plt.title("Variance loss across iterations")
plt.plot([i/(2*coef_var) for i in results['var']])

# Save and load model

In [None]:
model.save('reg1_expander.weights')