Objective of this notebook is to recreate vicreg for pretraining purposes

In [1]:
import argparse
import numpy as np
import logging, os
logging.disable(logging.WARNING)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score as auprc
from sklearn.metrics import roc_auc_score as auc_score
import keras

#from keras.utils import multi_gpu_model
from keras.layers import Input, Dense, GRU, Lambda, Permute, Concatenate
from keras.models import Model
from interpolation_layer import single_channel_interp, cross_channel_interp
from mimic_preprocessing import load_data, trim_los, fix_input_format
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

np.random.seed(10)
from sklearn.feature_extraction.text import TfidfVectorizer
os.environ["CUDA_VISIBLE_DEVICES"] = "04"

In [2]:
import sys
sys.path
if '/home/ugrads/n/nickcheng0921/TAMU-MedResearch/' not in sys.path:
    sys.path.append('/home/ugrads/n/nickcheng0921/TAMU-MedResearch/')
    
from helper import hold_out, mean_imputation

In [10]:
gpu_num = 1
epoch = 3
hid = 64 #can be 128-512
ref_points = 128
hours_look_ahead = 48
if gpu_num > 0:
    batch = 512*gpu_num
else:
    batch = 512
    
#nicks notes args
vocabulary = 6000

## Loading Data
adjust # of patients in mimic_preprocessing.py

In [4]:
# Loading dataset - explanation in multivariate notebook
vitals, label = load_data(look_ahead_time = hours_look_ahead)
vitals, timestamps = trim_los(vitals, hours_look_ahead)
x, m, T = fix_input_format(vitals, timestamps)
mean_imputation(x, m)
x = np.concatenate((x, m, T, hold_out(m)), axis=1)  # input format
y = np.array(label)
print(f"X shape: {x.shape}, Y shape: {y.shape}")
timestamp = x.shape[2]
num_features = x.shape[1] // 4
#     have an array representation.
# m : (N, D, tn) where m[i,j,k] = 0 means that x[i,j,k] is not observed.
# T : (N, D, tn) represents the actual time stamps of observation;

Loading files ...
Loading Done with 5000 patients! Nick
4532 4532
(4532, 12, 200) 4532
X shape: (4532, 48, 200), Y shape: (4532,)


In [5]:
import pickle
patient_notes = pickle.load(open('notes_5000_'+str(hours_look_ahead)+'hrs.p', 'rb'))

# Unsupervised Model

In [6]:
def customloss(ytrue, ypred):
    """ Autoencoder loss
    """
    # standard deviation of each feature mentioned in paper for MIMIC_III data
    wc = np.array([3.33, 23.27, 5.69, 22.45, 14.75, 2.32,
                   3.75, 1.0, 98.1, 23.41, 59.32, 1.41])
    wc.shape = (1, num_features)
    y = ytrue[:, :num_features, :]
    m2 = ytrue[:, 3*num_features:4*num_features, :]
    m2 = 1 - m2
    m1 = ytrue[:, num_features:2*num_features, :]
    m = m1*m2
    ypred = ypred[:, :num_features, :]
    x = (y - ypred)*(y - ypred)
    x = x*m
    count = tf.reduce_sum(m, axis=2)
    count = tf.where(count > 0, count, tf.ones_like(count))
    x = tf.reduce_sum(x, axis=2)/count
    x = x/(wc**2)  # dividing by standard deviation
    x = tf.reduce_sum(x, axis=1)/num_features
    print("got loss", tf.reduce_mean(x))
    return tf.reduce_mean(x)

seed = 0
results = {}
results['aux loss'] = []
results['vic loss'] = []

In [7]:
#model https://github.com/eyalzk/sketch_rnn_keras/blob/002931b9abea957a77382688f37d95afbb2ae6cb/seq2seqVAE.py
#https://towardsdatascience.com/advanced-keras-constructing-complex-custom-losses-and-metrics-c07ca130a618
class FusionModel(object):
    def __init__(self):
        if gpu_num > 1:
            dev = "/cpu:0"
        else:
            dev = "/gpu:0"
        with tf.device(dev):
            self.main_input = Input(shape=(4*num_features, timestamp), name='input')
            self.notes_input = Input(shape=(vocabulary), name='notes_input')
            self.notes_output = Dense(hid, activation='sigmoid', name='notes_output')(self.notes_input)
            self.sci = single_channel_interp(ref_points, hours_look_ahead)
            self.cci = cross_channel_interp()
            self.interp = self.cci(self.sci(self.main_input))
            self.reconst = self.cci(self.sci(self.main_input, reconstruction=True),
                          reconstruction=True)
            self.aux_output = Lambda(lambda x: x, name='aux_output')(self.reconst)
            self.z = Permute((2, 1))(self.interp)
            self.z = GRU(hid, activation='tanh', recurrent_dropout=0.2, dropout=0.2, name='series_output')(self.z)
            #print(f"Z SHAPE {z.shape} NOTES SHAPE {notes_input.shape} MERGED SHAPE {merged_input.shape}")
            self.merged_output = Concatenate(name='merged_output')([self.notes_output, self.z])
            self.model = Model([self.main_input, self.notes_input], [self.merged_output, self.aux_output])
            print(self.model
        
    def calculate_mse_loss(self, y_true, y_pred):
        """calculate mse across notes and series output for VICReg"""
        #return K.mean(K.square(self.notes_output - self.z), axis=-1)
        return K.mean(y_pred)
    
class FusionModel2(object):
    def __init__(self):
        if gpu_num > 1:
            dev = "/cpu:0"
        else:
            dev = "/gpu:0"
        with tf.device(dev):
            self.main_input = Input(shape=(4*num_features, timestamp), name='input')
            self.notes_input = Input(shape=(vocabulary), name='notes_input')
            self.notes_output = Dense(hid, activation='sigmoid', name='notes_output')(self.notes_input)
            self.sci = single_channel_interp(ref_points, hours_look_ahead)
            self.cci = cross_channel_interp()
            self.interp = self.cci(self.sci(self.main_input))
            self.reconst = self.cci(self.sci(self.main_input, reconstruction=True),
                          reconstruction=True)
            self.aux_output = Lambda(lambda x: x, name='aux_output')(self.reconst)
            self.z = Permute((2, 1))(self.interp)
            self.z = GRU(hid, activation='tanh', recurrent_dropout=0.2, dropout=0.2, name='series_output')(self.z)
            #print(f"Z SHAPE {z.shape} NOTES SHAPE {notes_input.shape} MERGED SHAPE {merged_input.shape}")
            self.model = Model([self.main_input, self.notes_input], [self.aux_output])

## Notes vectorizer
Takes input of size n, and vocab of length l.
returns vector of (n, l) where l is vocab embeddings

In [8]:
i = 0
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
notes_vectorizer = TfidfVectorizer(max_features=vocabulary)
from keras.losses import mse
from keras import backend as K
x1, play = None, None
for train, test in kfold.split(np.zeros(len(y)), y):
    print("Running Fold:", i+1)
    FM = FusionModel()
    model = FM.model  # re-initializing every time
    kfold_notes_train = [patient_notes[i] for i in train]
    kfold_notes_test = [patient_notes[i] for i in test]
    notes_tfidf = notes_vectorizer.fit(kfold_notes_train) #train vocab on train set, then use vectorizer on test set
    model.compile(
        optimizer='adam',
        loss={'merged_output': FM.calculate_mse_loss, 'aux_output': customloss}) #eager needed to view tensors
    model.fit(
        {'input': x[train], 'notes_input': notes_vectorizer.transform(kfold_notes_train).todense()}, {'merged_output': [], 'aux_output': x[train]},
        batch_size=batch,
        epochs=epoch,
        verbose=2)
    #model.metrics_names gives loss names for evaluate
    print(model.metrics_names)

    i += 1
    break

Running Fold: 1
Epoch 1/10
8/8 - 7s - loss: 0.1864 - merged_output_loss: 0.1864 - aux_output_loss: 0.0000e+00 - 7s/epoch - 816ms/step
Epoch 2/10
8/8 - 3s - loss: 0.0924 - merged_output_loss: 0.0924 - aux_output_loss: 0.0000e+00 - 3s/epoch - 403ms/step
Epoch 3/10
8/8 - 3s - loss: 0.0051 - merged_output_loss: 0.0051 - aux_output_loss: 0.0000e+00 - 3s/epoch - 410ms/step
Epoch 4/10
8/8 - 3s - loss: -7.3775e-02 - merged_output_loss: -7.3775e-02 - aux_output_loss: 0.0000e+00 - 3s/epoch - 401ms/step
Epoch 5/10
8/8 - 3s - loss: -1.3551e-01 - merged_output_loss: -1.3551e-01 - aux_output_loss: 0.0000e+00 - 3s/epoch - 415ms/step
Epoch 6/10
8/8 - 4s - loss: -1.8518e-01 - merged_output_loss: -1.8518e-01 - aux_output_loss: 0.0000e+00 - 4s/epoch - 438ms/step
Epoch 7/10
8/8 - 4s - loss: -2.2222e-01 - merged_output_loss: -2.2222e-01 - aux_output_loss: 0.0000e+00 - 4s/epoch - 447ms/step
Epoch 8/10
8/8 - 4s - loss: -2.5543e-01 - merged_output_loss: -2.5543e-01 - aux_output_loss: 0.0000e+00 - 4s/epoch - 4

In [9]:
i = 0
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
notes_vectorizer = TfidfVectorizer(max_features=vocabulary)
from keras.losses import mse
from keras import backend as K
x1, play = None, None
for train, test in kfold.split(np.zeros(len(y)), y):
    print("Running Fold:", i+1)
    FM = FusionModel2()
    model = FM.model  # re-initializing every time
    kfold_notes_train = [patient_notes[i] for i in train]
    kfold_notes_test = [patient_notes[i] for i in test]
    notes_tfidf = notes_vectorizer.fit(kfold_notes_train) #train vocab on train set, then use vectorizer on test set
    model.compile(
        optimizer='adam',
        loss={'aux_output': customloss},
        loss_weights={'aux_output': 1.}) #eager needed to view tensors
    model.fit(
        {'input': x[train], 'notes_input': notes_vectorizer.transform(kfold_notes_train).todense()}, {'aux_output': x[train]},
        batch_size=batch,
        epochs=epoch,
        verbose=2)
    #model.metrics_names gives loss names for evaluate
    print(model.metrics_names)

    i += 1
    break

Running Fold: 1
Epoch 1/10
got loss Tensor("customloss/Mean:0", shape=(), dtype=float32)
got loss Tensor("customloss/Mean:0", shape=(), dtype=float32)
8/8 - 1s - loss: 0.3271 - 1s/epoch - 187ms/step
Epoch 2/10
8/8 - 1s - loss: 0.3271 - 828ms/epoch - 104ms/step
Epoch 3/10
8/8 - 1s - loss: 0.3271 - 836ms/epoch - 105ms/step
Epoch 4/10
8/8 - 1s - loss: 0.3271 - 850ms/epoch - 106ms/step
Epoch 5/10
8/8 - 1s - loss: 0.3271 - 845ms/epoch - 106ms/step
Epoch 6/10
8/8 - 1s - loss: 0.3271 - 821ms/epoch - 103ms/step
Epoch 7/10
8/8 - 1s - loss: 0.3271 - 830ms/epoch - 104ms/step
Epoch 8/10
8/8 - 1s - loss: 0.3271 - 813ms/epoch - 102ms/step
Epoch 9/10


KeyboardInterrupt: 

In [None]:
#default loss must have y_true w/ y_pred
#loss cannot natively see output layers
#Loss cannot be applied across layers
#no batch size in custom loss
#cannot look at loss individually, or else we lose batch info