# Training a CNN-LSTM Model on TensorFlow Datasets
## Prerequisites

In [1]:
import json
import math
import os
from typing import Callable

import numpy as np
import pandas as pd
import tensorflow as tf
import official.nlp.modeling as tfnlp
import keras_tuner as kt
from mmproteo.utils import log, paths, utils, visualization
from mmproteo.utils.formats.tf_dataset import DatasetLoader
from mmproteo.utils.ml import callbacks, evaluation, layers, losses

In [2]:
# Show every column and up to 1000 rows when a DataFrame is rendered.
for option_name, option_value in {
    'display.max_columns': None,
    'display.max_rows': 1000,
}.items():
    pd.set_option(option_name, option_value)

In [3]:
# Logger instance that is passed to the mmproteo path-assignment and dataset-loading helpers below.
logger = log.DummyLogger(verbose=False)

INFO: Printing to Stdout


## Configuration

In [4]:
%pwd

'/tf/workspace/notebooks'

In [5]:
PROJECT = "PXD010000"

# Everything belonging to this project lives below ../dumps/<PROJECT>.
DUMP_PATH = os.path.join("..", "dumps", PROJECT)
TRAINING_COLUMNS_DUMP_PATH = os.path.join(DUMP_PATH, "training_columns")

# Entries located directly inside the training-columns dump.
FILES_PATH, STATISTICS_FILE_PATH, DATASET_DUMP_PATH = (
    os.path.join(TRAINING_COLUMNS_DUMP_PATH, entry)
    for entry in ("*_mzmlid.parquet", "statistics.parquet", "tf_datasets")
)

# JSON description of how the stored datasets were produced.
PROCESSING_FILE_PATH = os.path.join(DATASET_DUMP_PATH, "processing_info.json")

In [6]:
# Name of the target column; used below as key into the padding settings of PROCESSING_INFO.
SEQ = 'peptide_sequence'

In [7]:
# Parse the processing description that is stored next to the datasets.
with open(PROCESSING_FILE_PATH, 'r') as processing_info_file:
    PROCESSING_INFO = json.load(processing_info_file)
PROCESSING_INFO

{'padding_characters': {'peptide_sequence': '_',
  'mz_array': 0.0,
  'intensity_array': 0.0},
 'padding_lengths': {'mz_array': 2354,
  'intensity_array': 2354,
  'peptide_sequence': 50},
 'idx_to_char': {'0': 'A',
  '1': 'C',
  '2': 'D',
  '3': 'E',
  '4': 'F',
  '5': 'G',
  '6': 'H',
  '7': 'I',
  '8': 'K',
  '9': 'L',
  '10': 'M',
  '11': 'M(Oxidation)',
  '12': 'N',
  '13': 'P',
  '14': 'Q',
  '15': 'R',
  '16': 'S',
  '17': 'T',
  '18': 'V',
  '19': 'W',
  '20': 'Y',
  '21': '_'},
 'normalization': {'intensity_array': '<function base_peak_normalize at 0x7fa6046d5158>'},
 'split_value_columns': ['species', 'istrain'],
 'training_data_columns': ['mz_array', 'intensity_array'],
 'target_data_columns': ['peptide_sequence']}

In [8]:
# JSON object keys are strings, so the indices are converted back to integers.
idx_to_char = {}
for raw_idx, char in PROCESSING_INFO["idx_to_char"].items():
    idx_to_char[int(raw_idx)] = char

# Reverse lookup from character to index.
char_to_idx = dict(zip(idx_to_char.values(), idx_to_char.keys()))

## Loading TensorFlow Datasets

In [9]:
# Passed as `skip_existing` to the path assignment and as `keep_cache` to the DatasetLoader below.
KEEP_CACHE = True

In [10]:
# Keys under which the three dataset splits are stored and looked up.
TRAINING_TYPE, TEST_TYPE, EVAL_TYPE = 'Train', 'Test', 'Eval'

In [11]:
# Glob pattern with one wildcard per directory level below DATASET_DUMP_PATH.
dataset_wildcard_path = os.path.join(
    DATASET_DUMP_PATH,
    '*',  # filename
    '*',  # species
    '*'   # istrain
)
dataset_paths_dump_file = os.path.join(DATASET_DUMP_PATH, "dataset_file_paths.json")

# NOTE(review): these values do not sum to 1; how they are interpreted is decided by the
# mmproteo helper - confirm that the resulting split sizes are the intended ones.
split_values = {
    TRAINING_TYPE: 0.4,
    TEST_TYPE: 0.5,
    EVAL_TYPE: 0.6
}

dataset_file_paths = paths.assign_wildcard_paths_to_splits_grouped_by_path_position_value(
    wildcard_path=dataset_wildcard_path,
    path_position=-2,  # second-to-last path component, i.e. the species wildcard
    splits=split_values,
    paths_dump_file=dataset_paths_dump_file,
    skip_existing=KEEP_CACHE,
    logger=logger
)

print()
print("assigned dataset files:")
visualization.print_list_length_in_dict(dataset_file_paths)

INFO: found file paths dump '../dumps/PXD010000/training_columns/tf_datasets/dataset_file_paths.json'

assigned dataset files:
#Train = 89
e.g.: ../dumps/PXD010000/training_columns/tf_datasets/Biodiversity_C_indologenes_LIB_aerobic_02_03May16_Samwise_16-03-32_mzmlid.parquet/Chryseobacterium_indologenes/Train
#Test = 17
e.g.: ../dumps/PXD010000/training_columns/tf_datasets/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet/Acidiphilium_cryptum_JF-5/Train
#Eval = 29
e.g.: ../dumps/PXD010000/training_columns/tf_datasets/Biodiversity_B_fragilis_CMcarb_anaerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet/Bacteroides_fragilis_638R/Train


### Loading corresponding TF datasets

In [12]:
def _padded_tensor_specs(columns, dtype):
    """Return one 1-D TensorSpec per column, sized by the column's padding length."""
    return tuple(
        tf.TensorSpec(shape=(PROCESSING_INFO['padding_lengths'][col], ), dtype=dtype)
        for col in columns
    )


# (inputs, targets): float32 specs for the training columns, int8 specs for the target columns.
element_spec = (
    _padded_tensor_specs(PROCESSING_INFO['training_data_columns'], tf.float32),
    _padded_tensor_specs(PROCESSING_INFO['target_data_columns'], tf.int8),
)
element_spec

((TensorSpec(shape=(2354,), dtype=tf.float32, name=None),
  TensorSpec(shape=(2354,), dtype=tf.float32, name=None)),
 (TensorSpec(shape=(50,), dtype=tf.int8, name=None),))

In [13]:
# Batch size handed to the DatasetLoader and the evaluator; also used to derive the tuner's steps_per_epoch.
BATCH_SIZE=16

In [14]:
# Configure the loader once, then load one dataset per split type.
dataset_loader = DatasetLoader(
    element_spec=element_spec,
    batch_size=BATCH_SIZE,
    shuffle_buffer_size=100_000,
    keep_cache=KEEP_CACHE,
    logger=logger
)
datasets = dataset_loader.load_datasets_by_type(dataset_file_paths)
datasets



{'Train': <BatchDataset shapes: (((16, 2354), (16, 2354)), ((16, 50),)), types: ((tf.float32, tf.float32), (tf.int8,))>,
 'Test': <BatchDataset shapes: (((16, 2354), (16, 2354)), ((16, 50),)), types: ((tf.float32, tf.float32), (tf.int8,))>,
 'Eval': <BatchDataset shapes: (((16, 2354), (16, 2354)), ((16, 50),)), types: ((tf.float32, tf.float32), (tf.int8,))>}

## Defining the TensorFlow Model

In [15]:
def build_lstm_with_pooling_and_position_model(
    hp: kt.HyperParameters,
    model_name: str = "mmproteo_lstm_with_pooling_and_position"
) -> tf.keras.Model:
    """Build and compile one CNN-LSTM candidate for the hyperparameter search.

    The stacked input columns are turned into per-position features (a relative
    position embedding, a time-distributed dense layer and several parallel
    convolutions), passed through optional time-distributed dense layers and a
    bidirectional LSTM, max-pooled over time, and finally expanded by dense
    layers into one softmax distribution per target position.

    Reads the notebook-level ``PROCESSING_INFO``, ``SEQ`` and ``idx_to_char``.

    Args:
        hp: KerasTuner hyperparameter container from which every tunable value
            (layer sizes, layer counts, learning rate) is drawn.
        model_name: Prefix of the model name; the string returned by
            ``utils.get_current_time_str()`` is appended to it.

    Returns:
        A compiled ``tf.keras.Model`` whose output has the shape
        ``(batch, PROCESSING_INFO['padding_lengths'][SEQ], len(idx_to_char))``.
        It is compiled with Adam, a sparse categorical cross-entropy loss and
        the sparse categorical accuracy / cross-entropy metrics.
    """
    input_layers_list, masked_input_layers_list = layers.create_masked_input_layers(
        [
            layers.InputLayerConfiguration(
                name=col,
                shape=PROCESSING_INFO['padding_lengths'][col],
                mask_value=PROCESSING_INFO['padding_characters'][col]
            )
            for col in PROCESSING_INFO['training_data_columns']
        ]
    )

    # Combine the input columns along a new last axis
    # (presumably giving (batch, length, number_of_columns) - TODO confirm).
    x = tf.stack(
        values=masked_input_layers_list,
        axis=-1,
    )

    # Static sequence length; used below to bound the convolution kernel sizes.
    length = x.shape[1]
    position_embedding_size = hp.Int('position_embedding_size', min_value=4, max_value=20, step=1)

    position_embedding = tfnlp.layers.position_embedding.RelativePositionEmbedding(
        hidden_size=position_embedding_size,
        name='relative_position_embedding'
    )(x)
    # Add a leading axis and repeat the embedding for every sample of the batch so
    # that it can be concatenated with the other per-position features.
    position_embedding = tf.expand_dims(position_embedding, 0)
    position_embedding = tf.broadcast_to(
        input=position_embedding,
        shape=(tf.shape(x)[0], *tf.shape(position_embedding)[1:])
    )

    y_layers = [position_embedding]

    dense_y = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(
            units=hp.Int('y_time_distributed_dense_units', min_value=4, max_value=64, step=4),
            activation='relu',
            name='y_time_distributed_dense',
        )
    )(x)
    y_layers.append(dense_y)

    # The hyperparameters are looked up once, in the same order as before, instead of
    # on every loop iteration.
    number_of_convolutions = hp.Int('y_number_of_convolutions', min_value=1, max_value=8, step=1)
    base_filter_count = hp.Int('y_base_conv_filter_count', min_value=4, max_value=20, step=4)
    base_kernel_size = hp.Int('y_base_conv_kernel_size', min_value=4, max_value=16, step=2)

    # With padding='same' a kernel of 2 * length - 1 taps already spans the whole
    # sequence from every position; wider kernels only add weights that can never see
    # anything but padding. Because the size grows exponentially with the layer index
    # (up to 16**7), it is capped to avoid allocating such useless, huge kernels.
    max_kernel_size = None if length is None else 2 * length - 1

    # All convolutions read the same stacked input in parallel.
    for i in range(number_of_convolutions):
        filter_count = base_filter_count * (i + 1)
        kernel_size = base_kernel_size ** i
        if max_kernel_size is not None:
            kernel_size = min(kernel_size, max_kernel_size)
        cnn_y = tf.keras.layers.Conv1D(
            filters=filter_count,
            kernel_size=kernel_size,
            activation='relu',
            padding='same',
            # filter_count differs for every i, which keeps the layer names unique
            name=f"y_conv_{kernel_size}_{filter_count}",
        )(x)
        y_layers.append(cnn_y)

    x = tf.concat(
        values=y_layers,
        axis=-1
    )

    y = x
    for i in range(hp.Int('number_of_time_distributed_dense_layers', min_value=0, max_value=4, step=1)):
        y = tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(
                # NOTE(review): x.shape[1] is the sequence length, not the feature width
                # (x.shape[-1]) - confirm that this many units per position is intended.
                units=x.shape[1],
                activation='relu',
                name=f'pre_lstm_time_distributed_dense_{i}',
            )
        )(y)

    # Skip connection: the LSTM sees the concatenated features and the dense output.
    x = tf.concat(
        values=[x, y],
        axis=-1,
    )

    bidirectional_lstm_units_exponent = hp.Int('bidirectional_lstm_units_exponent', min_value=5, max_value=9, step=1)
    x = tf.keras.layers.Bidirectional(
        layer=tf.keras.layers.LSTM(
            units=2**bidirectional_lstm_units_exponent,
            return_sequences=True,
            name='lstm'
        )
    )(x)

    # Collapse the time axis to one feature vector per sample.
    x = tf.keras.layers.GlobalMaxPooling1D(
        name='global_max_pooling_over_time',
    )(x)

    upscaling_dense_max = hp.Int('upscaling_dense_layer_generator_max', min_value=1, max_value=4, step=1)
    # NOTE(review): the upper bound of this hyperparameter depends on another one;
    # verify that KerasTuner handles a range that changes between trials as intended.
    upscaling_dense_min = hp.Int('upscaling_dense_layer_generator_min', min_value=0, max_value=upscaling_dense_max-1, step=1)
    for i in range(upscaling_dense_min, upscaling_dense_max):
        x = tf.keras.layers.Dense(
            units=2**(7 + i),
            activation='relu',
            name=f"upscaling_dense_{i}",
        )(x)

    final_dense_feature_units = hp.Int('final_dense_layer_over_length_feature_units', min_value=3, max_value=7, step=1)

    # Produce target_length * feature_units values and fold them back into a sequence.
    x = tf.keras.layers.Dense(
        units=PROCESSING_INFO['padding_lengths'][SEQ] * final_dense_feature_units,
        activation='relu',
        name="final_dense_layer_to_redefine_lengths",
    )(x)

    x = tf.reshape(x, (-1, PROCESSING_INFO['padding_lengths'][SEQ], final_dense_feature_units))

    # One score per character of the alphabet for every target position.
    x = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(
            units=len(idx_to_char),
            activation=None,
            name='final_time_distributed_dense'
        )
    )(x)

    # The model outputs probabilities, matching the loss default of from_logits=False.
    x = tf.keras.activations.softmax(x)

    model = tf.keras.Model(
        inputs=input_layers_list,
        outputs=x,
        name=f"{model_name}_{utils.get_current_time_str()}")

    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=[
            tf.keras.metrics.SparseCategoricalAccuracy(),
            tf.keras.metrics.SparseCategoricalCrossentropy()
        ]
    )

    return model

In [16]:
# Working directory of the hyperparameter search; also the base path of its training callbacks.
TUNER_PATH = os.path.join(DUMP_PATH, 'models', 'tuner')
TUNER_PATH

'../dumps/PXD010000/models/tuner'

In [17]:
# Make sure the tuner directory is available before the tuner is created.
utils.ensure_dir_exists(TUNER_PATH)

In [18]:
# Hyperband search over the hyperparameters declared in
# `build_lstm_with_pooling_and_position_model`.
#
# The model is compiled with `tf.keras.metrics.SparseCategoricalAccuracy()`, which Keras
# reports as `sparse_categorical_accuracy`. The validation metric to maximise is therefore
# `val_sparse_categorical_accuracy`; a plain `val_accuracy` never appears in the logs.
tuner = kt.Hyperband(
    build_lstm_with_pooling_and_position_model,
    objective=kt.Objective('val_sparse_categorical_accuracy', direction='max'),
    max_epochs=10,
    factor=3,
    directory=TUNER_PATH,
    project_name='mmproteo-cnn-lstm',
)

INFO:tensorflow:Reloading Oracle from existing project ../dumps/PXD010000/models/tuner/mmproteo-cnn-lstm/oracle.json


In [19]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [20]:
# Directory that the TensorBoard magic below is pointed at
# (presumably where the callbacks created with base_path=TUNER_PATH write their logs - TODO confirm).
TENSORBOARD_LOG_DIR = os.path.join(TUNER_PATH, "tensorboard")
os.path.realpath(TENSORBOARD_LOG_DIR)

'/tf/workspace/dumps/PXD010000/models/tuner/tensorboard'

In [21]:
%tensorboard --logdir $TENSORBOARD_LOG_DIR --bind_all

In [22]:
# Both datasets are repeated endlessly; epoch and validation length are set via step counts.
search_training_data = datasets[TRAINING_TYPE].repeat()
search_validation_data = datasets[TEST_TYPE].repeat()
search_steps_per_epoch = 10_000 // BATCH_SIZE

search_callbacks = callbacks.create_callbacks(
    tensorboard=True,
    progressbar=False,
    reduce_lr=True,
    early_stopping=True,
    checkpoints=True,
    csv=True,
    base_path=TUNER_PATH,
)

tuner.search(
    x=search_training_data,
    validation_data=search_validation_data,
    validation_steps=500,
    epochs=10,
    steps_per_epoch=search_steps_per_epoch,
    callbacks=search_callbacks,
)


Search: Running Trial #1

Hyperparameter    |Value             |Best Value So Far 
position_embedd...|4                 |?                 
y_time_distribu...|52                |?                 
y_number_of_con...|4                 |?                 
y_base_conv_fil...|20                |?                 
y_base_conv_ker...|12                |?                 
number_of_time_...|3                 |?                 
bidirectional_l...|7                 |?                 
upscaling_dense...|4                 |?                 
upscaling_dense...|0                 |?                 
final_dense_lay...|7                 |?                 
learning_rate     |0.01              |?                 
tuner/epochs      |2                 |?                 
tuner/initial_e...|0                 |?                 
tuner/bracket     |2                 |?                 
tuner/round       |0                 |?                 

Epoch 1/2
 33/625 [>.............................] - ETA: 8:

KeyboardInterrupt: 

In [None]:
# Ask the tuner for its single best trial and keep that trial's hyperparameters.
best_hyperparameter_candidates = tuner.get_best_hyperparameters(num_trials=1)
best_hps = best_hyperparameter_candidates[0]
best_hps

In [None]:
# Build a fresh model from the best hyperparameters found by the search.
model = tuner.hypermodel.build(best_hps)

In [None]:
# Output directory named after the model, so it follows whatever name the build step assigned.
MODEL_PATH = os.path.join(DUMP_PATH, "models", model.name)
MODEL_PATH

In [None]:
# Make sure the model directory is available before files are written into it.
utils.ensure_dir_exists(MODEL_PATH)

In [None]:
# Store a diagram of the architecture (including tensor shapes) next to the model.
model_plot_path = os.path.join(MODEL_PATH, "model.png")
tf.keras.utils.plot_model(model, to_file=model_plot_path, show_shapes=True)

In [None]:
# Write the summary line by line into a text file, then show it in the notebook as well.
summary_path = os.path.join(MODEL_PATH, "summary.txt")
with open(summary_path, 'w') as summary_file:
    model.summary(print_fn=lambda line: summary_file.write(line + "\n"))
model.summary()

In [None]:
# Persist the architecture as JSON.
model_json_path = os.path.join(MODEL_PATH, "model.json")
with open(model_json_path, 'w') as model_json_file:
    model_json_file.write(model.to_json())

In [None]:
# `Model.to_yaml()` raises a RuntimeError on TensorFlow >= 2.6, where the YAML export was
# removed for security reasons. The model is serialized before the file is opened, so a
# failed export neither aborts the notebook nor leaves an empty `model.yaml` behind.
try:
    model_yaml = model.to_yaml()
except RuntimeError as error:
    print(f"Skipping YAML export: {error}")
else:
    with open(os.path.join(MODEL_PATH, "model.yaml"), 'w') as file:
        file.write(model_yaml)

## Training the TensorFlow Model

In [None]:
# Both datasets are repeated endlessly; epoch and validation length are set via step counts.
fit_training_data = datasets[TRAINING_TYPE].repeat()
fit_validation_data = datasets[TEST_TYPE].repeat()

fit_callbacks = callbacks.create_callbacks(
    tensorboard=True,
    progressbar=False,
    reduce_lr=True,
    early_stopping=True,
    checkpoints=True,
    csv=True,
    base_path=MODEL_PATH,
)

history = model.fit(
    x=fit_training_data,
    validation_data=fit_validation_data,
    validation_steps=500,
    epochs=1,
    steps_per_epoch=1_000,
    callbacks=fit_callbacks,
)
history

## Evaluating the TensorFlow Model

In [None]:
# Element-wise decoder: looks every index of an array up in `idx_to_char` via `dict.get`.
decode_idx: Callable[[np.ndarray], np.ndarray] = np.vectorize(idx_to_char.get)

In [None]:
# Evaluator over the held-out 'Eval' split; it receives the decoder and the padding
# character of the target sequence column.
evaluator = evaluation.SequenceEvaluator(
    dataset=datasets[EVAL_TYPE],
    decode_func=decode_idx,
    batch_size=BATCH_SIZE,
    separator=" ",
    padding_character=PROCESSING_INFO['padding_characters'][SEQ],
)

In [None]:
# Run the evaluator on the model and display whatever it returns.
evaluator.evaluate_model(model)

In [None]:
# Visual evaluation of 20 samples; besides the table, the call also returns the
# inputs, targets and predictions that are inspected further below.
eval_df, (x_eval, y_eval, y_pred) = evaluator.evaluate_model_visually(
    model=model,
    sample_size=20,
    keep_separator=True,
)
eval_df

In [None]:
# Print every predicted sequence on its own line.
for predicted_sequence in eval_df.predicted:
    print(predicted_sequence)

**TODO:** Is the loss function broken? The cells below inspect the raw predictions to investigate.

In [None]:
# Position of the largest value along axis 1 of the first prediction
# (presumably the predicted character index per sequence position - TODO confirm).
np.argmax(y_pred[0], axis=1)

In [None]:
# Shape of the model output for a single batch taken from the 'Eval' split.
model.predict(datasets[EVAL_TYPE].take(1)).shape

In [None]:
# Shape of the predictions returned by the visual evaluation above.
y_pred.shape

In [None]:
# Display the raw predictions returned by the visual evaluation above.
y_pred