# VAE

In [20]:
# imports
import sys
sys.path.append( '../FIA' )
sys.path.append( '../ML' )

from FIA import *
from ML4com import *

import keras
from keras.layers import Input, Dense, Lambda
from keras.layers import BatchNormalization, Dropout, LeakyReLU
from keras.models import Model
from keras.losses import mse
from keras.optimizers.legacy import Adam
from keras import backend as K

import tensorflow as tf
tf.compat.v1.disable_eager_execution()

import keras_tuner
from keras_tuner.tuners import Hyperband, BayesianOptimization

from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [2]:
info_dir = "../../data/comm8_self"
data_dir = "../../runs/FIA/comm8/oms"
run_dir = "../../runs/ML/try"

info_dir = os.path.normpath(os.path.join(os.getcwd(), info_dir))
data_dir = os.path.normpath(os.path.join(os.getcwd(), data_dir))
run_dir = os.path.normpath(os.path.join(os.getcwd(), run_dir))

strains = pd.read_csv(os.path.join(info_dir, "strains.tsv"), sep="\t")
comm8 = pd.read_csv(os.path.join(info_dir, "comm8.tsv"), sep="\t")

fia_df = load_fia_df(data_dir, file_ending=".mzML", separator="\t")

Loading names:


100%|██████████| 72/72 [00:00<?, ?it/s]


Loading experiments:


100%|██████████| 68/68 [00:00<00:00, 100.46it/s]


In [3]:
# binned_dfs = bin_df_stepwise_batch(fia_df, binning_var="mz", binned_var="inty", statistic="sum", start=50.0, stop=1700.0, step=0.002)
# binned_dfs.to_csv(os.path.join(run_dir, "data_matrix.tsv"), sep="\t")
binned_dfs = pd.read_csv(os.path.join(run_dir, "data_matrix_oms.tsv"), sep="\t", index_col="mz", engine="pyarrow")

In [4]:
scaler = MaxAbsScaler()
binned_dfs[:] =  scaler.fit_transform(binned_dfs)

In [5]:
print(binned_dfs.shape)
print(comm8.shape)
print(strains.shape)

(825000, 68)
(68, 8)
(8, 1)


In [6]:
# Data & model configuration
batch_size = 32
no_epochs = 1000
latent_dim = 18

original_dim = binned_dfs.shape[0]
input_shape = (original_dim,)

## Model

In [15]:
def model_builder(hp):
    # # =================
    # # Encoder
    # # =================

    # Definition
    i       = Input(shape=input_shape, name='encoder_input')
    
    x       = Dense(hp.Int('encoder_units', min_value=30, max_value=220, step=10))(i)
    x       = LeakyReLU()(x)
    
    mu      = Dense(latent_dim, name='latent_mu')(x)
    sigma   = Dense(latent_dim, name='latent_sigma')(x)

    # Define sampling with reparameterization trick
    def sample_z(args):
        mu, sigma = args
        batch     = K.shape(mu)[0]
        dim       = K.int_shape(mu)[1]
        eps       = K.random_normal(shape=(batch, dim))
        return mu + K.exp(sigma / 2) * eps

    # Use reparameterization trick to ....??
    z       = Lambda(sample_z, output_shape=(latent_dim, ), name='z')([mu, sigma])

    # Instantiate encoder
    encoder = Model(i, [mu, sigma, z], name='encoder')
    
    # =================
    # Decoder
    # =================

    # Definition
    d_i   = Input(shape=(latent_dim, ), name='decoder_input')
    
    x     = Dense(hp.Int('decoder_units', min_value=20, max_value=220, step=10))(d_i)
    x     = LeakyReLU()(x)
        
    o     = Dense(original_dim)(x)

    # Instantiate decoder
    decoder = Model(d_i, o, name='decoder')
    
    # =================
    # VAE as a whole
    # =================

    # Define loss
    def kl_reconstruction_loss(true, pred):
      # Reconstruction loss
        reconstruction_loss = mse(true, pred)
        reconstruction_loss *= original_dim

        # KL divergence loss
        kl_loss = 1 + sigma - K.square(mu) - K.exp(sigma)
        kl_loss = K.sum(kl_loss, axis=-1)
        kl_loss *= -0.5
        
        # weight KL divergence loss here
        kl_loss *= hp.Float('kl_beta', min_value=1e-3, max_value=1e1, sampling='LOG', default=1e-2)

        return K.mean(reconstruction_loss + kl_loss)

    # Instantiate VAE
    vae_outputs = decoder(encoder(i)[2])
    vae         = Model(i, vae_outputs, name='vae')

    # Define optimizer
    optimizer = Adam(hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG', default=1e-3))

    # Compile VAE
    vae.compile(optimizer=optimizer, loss=kl_reconstruction_loss, metrics = ['mse'], experimental_run_tf_function=False)
    
    return vae

In [16]:
# Set tuner parameters
tuner = Hyperband(
    model_builder,
    objective='mse',
    factor=2,
    max_epochs=200,
    directory='hyperband_optimization',
    project_name='mtvae')

In [17]:
tuner.search_space_summary()

Search space summary
Default search space size: 4
encoder_units (Int)
{'default': None, 'conditions': [], 'min_value': 30, 'max_value': 220, 'step': 10, 'sampling': 'linear'}
decoder_units (Int)
{'default': None, 'conditions': [], 'min_value': 20, 'max_value': 220, 'step': 10, 'sampling': 'linear'}
learning_rate (Float)
{'default': 0.001, 'conditions': [], 'min_value': 0.0001, 'max_value': 0.01, 'step': None, 'sampling': 'log'}
kl_beta (Float)
{'default': 0.01, 'conditions': [], 'min_value': 0.001, 'max_value': 10.0, 'step': None, 'sampling': 'log'}


In [18]:
X = binned_dfs.transpose()
ys = comm8
kf = KFold(n_splits = 5, shuffle=True)     # stratified: skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

training_data, test_data, training_labels, test_labels = train_test_split(X, ys, test_size=0.2, random_state=42)

In [21]:
tuner.search(training_data, training_data, validation_data = (test_data, test_data))

Trial 5 Complete [00h 00m 05s]
mse: 4.165573773207143e-05

Best mse So Far: 1.733126373437699e-05
Total elapsed time: 00h 02m 15s

Search: Running Trial #6

Value             |Best Value So Far |Hyperparameter
170               |100               |encoder_units
40                |210               |decoder_units
0.00028832        |0.00027583        |learning_rate
0.85359           |1.665             |kl_beta
2                 |2                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
7                 |7                 |tuner/bracket
0                 |0                 |tuner/round

Train on 54 samples, validate on 14 samples
Epoch 1/2

## Return best

In [None]:
tuner.results_summary(num_trials = 3)

In [None]:
tuner.get_best_models()[0].summary()

In [None]:
# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials = 1)[0]

In [None]:
# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials = 1)[0]

print(best_hps.get('encoder_units'))
print(best_hps.get('decoder_units'))
print(best_hps.get('learning_rate'))
print(best_hps.get('kl_beta'))

## Training

In [None]:
sys.path.append( '../../../mtVAE' )
from models import *
from metric_functions import *

In [None]:
# Data & model configuration
input_dim = original_dim
intermediate_dim = 200
latent_dim = 18

kl_beta = 1e-2
learning_rate = 1e-3

batch_size = 32
n_epochs = 1000

save_folder = '../../runs/VAE'


# instantiate model
mtmodel = mtVAE(input_dim, intermediate_dim, latent_dim, kl_beta, learning_rate)

In [None]:
# Train model
mtmodel.train(training_data, test_data, n_epochs, batch_size)

In [None]:
# Save model
mtmodel.save_model(save_folder)

In [None]:
mtmodel.reconstruct(training_data)