# Training a CNN-LSTM Model on TensorFlow Datasets
## Prerequisites

In [1]:
import json
import os
from typing import Callable

import numpy as np
import pandas as pd
import tensorflow as tf
from mmproteo.utils import log, paths, visualization
from mmproteo.utils.formats.tf_dataset import DatasetLoader
from mmproteo.utils.ml import callbacks, evaluation, layers, losses

In [2]:
# Show every column and up to 1000 rows when a DataFrame is displayed.
pd.options.display.max_columns = None
pd.options.display.max_rows = 1000

In [3]:
# Logger handed to the mmproteo helpers below through their `logger` argument.
logger = log.DummyLogger(verbose=False)

INFO: Printing to Stdout


## Configuration

In [4]:
# Show the current working directory; the relative paths configured below are resolved against it.
%pwd

'/tf/workspace/notebooks'

In [5]:
# Project identifier; also the name of the project's dump directory.
PROJECT = "PXD010000"
DUMP_PATH = os.path.join("..", "dumps", PROJECT)

# Parquet dumps of the training columns and their statistics file.
TRAINING_COLUMNS_DUMP_PATH = os.path.join(DUMP_PATH, "training_columns")
STATISTICS_FILE_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "statistics.parquet")
FILES_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "*_mzmlid.parquet")

# Directory of the TF datasets and the JSON file holding the processing info.
DATASET_DUMP_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "tf_datasets")
PROCESSING_FILE_PATH = os.path.join(DATASET_DUMP_PATH, "processing_info.json")

In [6]:
# Name of the target column; used as a key into the PROCESSING_INFO sub-dicts.
SEQ = 'peptide_sequence'

In [7]:
# Load the preprocessing metadata (padding characters/lengths, index-to-symbol
# mapping, column roles) from its JSON file.
# The encoding is given explicitly so the read does not depend on the
# platform's locale default; json.load parses straight from the file object.
with open(PROCESSING_FILE_PATH, 'r', encoding='utf-8') as file:
    PROCESSING_INFO = json.load(file)
PROCESSING_INFO

{'padding_characters': {'peptide_sequence': '_',
  'mz_array': 0.0,
  'intensity_array': 0.0},
 'padding_lengths': {'mz_array': 2354,
  'intensity_array': 2354,
  'peptide_sequence': 50},
 'idx_to_char': {'0': 'A',
  '1': 'C',
  '2': 'D',
  '3': 'E',
  '4': 'F',
  '5': 'G',
  '6': 'H',
  '7': 'I',
  '8': 'K',
  '9': 'L',
  '10': 'M',
  '11': 'M(Oxidation)',
  '12': 'N',
  '13': 'P',
  '14': 'Q',
  '15': 'R',
  '16': 'S',
  '17': 'T',
  '18': 'V',
  '19': 'W',
  '20': 'Y',
  '21': '_'},
 'normalization': {'intensity_array': '<function base_peak_normalize at 0x7fa6046d5158>'},
 'split_value_columns': ['species', 'istrain'],
 'training_data_columns': ['mz_array', 'intensity_array'],
 'target_data_columns': ['peptide_sequence']}

In [8]:
# The JSON dump stores the class indices as strings (see the output above),
# so convert them back to integers.
idx_to_char = {
    int(raw_index): symbol
    for raw_index, symbol in PROCESSING_INFO["idx_to_char"].items()
}
# Inverse lookup: symbol -> class index.
char_to_idx = dict(zip(idx_to_char.values(), idx_to_char.keys()))

## Loading TensorFlow Datasets

In [9]:
# Forwarded below as `skip_existing`, `keep_cache` and `keep_logs` to the mmproteo helpers.
KEEP_CACHE = True

In [10]:
# Names of the data splits; used as keys of the `splits` mapping and of the `datasets` dict.
TRAINING_TYPE = 'Train'
TEST_TYPE = 'Test'
EVAL_TYPE = 'Eval'

In [11]:
# Assign the dataset directories to the splits, grouped by the path component
# at position -2 of '<filename>/<species>/<istrain>', i.e. the species.
# NOTE(review): the split values sum to 1.5 — presumably they are cumulative
# thresholds or relative weights; confirm against the mmproteo helper.
dataset_file_paths = paths.assign_wildcard_paths_to_splits_grouped_by_path_position_value(
    wildcard_path=os.path.join(DATASET_DUMP_PATH, '*', '*', '*'),  # filename / species / istrain
    path_position=-2,
    splits={TRAINING_TYPE: 0.4, TEST_TYPE: 0.5, EVAL_TYPE: 0.6},
    paths_dump_file=os.path.join(DATASET_DUMP_PATH, "dataset_file_paths.json"),
    skip_existing=KEEP_CACHE,
    logger=logger,
)

print("\nassigned dataset files:")
visualization.print_list_length_in_dict(dataset_file_paths)

INFO: found file paths dump '../dumps/PXD010000/training_columns/tf_datasets/dataset_file_paths.json'

assigned dataset files:
#Train = 89
e.g.: ../dumps/PXD010000/training_columns/tf_datasets/Biodiversity_C_indologenes_LIB_aerobic_02_03May16_Samwise_16-03-32_mzmlid.parquet/Chryseobacterium_indologenes/Train
#Test = 17
e.g.: ../dumps/PXD010000/training_columns/tf_datasets/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet/Acidiphilium_cryptum_JF-5/Train
#Eval = 29
e.g.: ../dumps/PXD010000/training_columns/tf_datasets/Biodiversity_B_fragilis_CMcarb_anaerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet/Bacteroides_fragilis_638R/Train


### Loading corresponding TF datasets

In [12]:
# One tuple of fixed-length TensorSpecs per column group: float32 for the
# training data columns, int8 for the target data columns. The length of each
# spec is the padding length of its column.
element_spec = tuple(
    tuple(
        tf.TensorSpec(shape=(PROCESSING_INFO['padding_lengths'][col],), dtype=dtype)
        for col in PROCESSING_INFO[columns_key]
    )
    for columns_key, dtype in (
        ('training_data_columns', tf.float32),
        ('target_data_columns', tf.int8),
    )
)
element_spec

((TensorSpec(shape=(2354,), dtype=tf.float32, name=None),
  TensorSpec(shape=(2354,), dtype=tf.float32, name=None)),
 (TensorSpec(shape=(50,), dtype=tf.int8, name=None),))

In [13]:
# Batch size passed to the DatasetLoader and to the SequenceEvaluator.
BATCH_SIZE=32

In [14]:
# Configure the loader once, then load one batched dataset per split type.
dataset_loader = DatasetLoader(
    element_spec=element_spec,
    batch_size=BATCH_SIZE,
    shuffle_buffer_size=100_000,
    keep_cache=KEEP_CACHE,
    logger=logger,
)
datasets = dataset_loader.load_datasets_by_type(dataset_file_paths)
datasets



{'Train': <BatchDataset shapes: (((32, 2354), (32, 2354)), ((32, 50),)), types: ((tf.float32, tf.float32), (tf.int8,))>,
 'Test': <BatchDataset shapes: (((32, 2354), (32, 2354)), ((32, 50),)), types: ((tf.float32, tf.float32), (tf.int8,))>,
 'Eval': <BatchDataset shapes: (((32, 2354), (32, 2354)), ((32, 50),)), types: ((tf.float32, tf.float32), (tf.int8,))>}

## Building the TensorFlow Model

In [15]:
# One input configuration per training data column, masking that column's
# padding value.
input_layer_configs = [
    layers.InputLayerConfiguration(
        name=col,
        shape=PROCESSING_INFO['padding_lengths'][col],
        mask_value=PROCESSING_INFO['padding_characters'][col],
    )
    for col in PROCESSING_INFO['training_data_columns']
]
input_layers_list, masked_input_layers_list = layers.create_masked_input_layers(
    input_layer_configs
)
print(input_layers_list)
print(masked_input_layers_list)

[<KerasTensor: shape=(None, 2354) dtype=float32 (created by layer 'mz_array')>, <KerasTensor: shape=(None, 2354) dtype=float32 (created by layer 'intensity_array')>]
[<KerasTensor: shape=(None, 2354) dtype=float32 (created by layer 'masked_mz_array')>, <KerasTensor: shape=(None, 2354) dtype=float32 (created by layer 'masked_intensity_array')>]


In [16]:
# Sparse categorical cross-entropy wrapped in mmproteo's MaskedLoss, with the
# class index of the sequence padding character as masking value.
# NOTE(review): `model.compile` further down does not use this loss.
masked_loss = losses.MaskedLoss(
    loss_function=tf.keras.losses.sparse_categorical_crossentropy,
    masking_value=tf.constant(
        char_to_idx[PROCESSING_INFO['padding_characters'][SEQ]],
        dtype=tf.int8,
    ),
)

In [49]:
# Stack the masked inputs along a new last axis, giving one channel per input column.
x = tf.stack(masked_input_layers_list, axis=-1)
x

<KerasTensor: shape=(None, 2354, 2) dtype=float32 (created by layer 'tf.stack_1')>

In [50]:
# Shape of the stacked input tensor, including the batch dimension.
x.shape

TensorShape([None, 2354, 2])

In [51]:
# Per-sample shape, i.e. without the leading batch dimension.
x.shape[1:]

TensorShape([2354, 2])

In [52]:
# Convolution with 'same' padding, so the output keeps the input's length
# and can be concatenated with it channel-wise.
y = tf.keras.layers.Conv1D(42, 100, padding='same', activation='relu')(x)
y

<KerasTensor: shape=(None, 2354, 42) dtype=float32 (created by layer 'conv1d_5')>

In [53]:
# Append the convolution features to the stacked inputs along the channel axis.
x = tf.concat([x, y], axis=-1)
x

<KerasTensor: shape=(None, 2354, 44) dtype=float32 (created by layer 'tf.concat_1')>

In [54]:
# Bidirectional LSTM that returns the full output sequence (one vector per step).
x = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(512, return_sequences=True)
)(x)
x

<KerasTensor: shape=(None, 2354, 1024) dtype=float32 (created by layer 'bidirectional_1')>

In [55]:
# Length of the second axis, which the down-sampling loop below compares against its threshold.
x.shape[1]

2354

In [56]:
pool_size = 2

# Repeat the Conv1D -> MaxPool1D -> Dropout stage while the second axis is at
# least (pool_size + 1) times the padded target sequence length.
# The threshold does not change inside the loop, so compute it once.
min_length = (pool_size + 1) * PROCESSING_INFO['padding_lengths'][SEQ]

while x.shape[1] >= min_length:
    # No `padding` argument, so this convolution shortens the sequence
    # (see the shapes printed below).
    x = tf.keras.layers.Conv1D(
        filters=22,
        kernel_size=42,
        activation='relu',
    )(x)
    print(x)
    # Use the same `pool_size` as the loop condition so the two cannot drift apart.
    x = tf.keras.layers.MaxPool1D(
        pool_size=pool_size,
        padding='same',
    )(x)
    print(x)
    x = tf.keras.layers.Dropout(0.1)(x)
    print(x)

KerasTensor(type_spec=TensorSpec(shape=(None, 2313, 22), dtype=tf.float32, name=None), name='conv1d_6/Relu:0', description="created by layer 'conv1d_6'")
KerasTensor(type_spec=TensorSpec(shape=(None, 1157, 22), dtype=tf.float32, name=None), name='max_pooling1d_4/Squeeze:0', description="created by layer 'max_pooling1d_4'")
KerasTensor(type_spec=TensorSpec(shape=(None, 1157, 22), dtype=tf.float32, name=None), name='dropout_4/Identity:0', description="created by layer 'dropout_4'")
KerasTensor(type_spec=TensorSpec(shape=(None, 1116, 22), dtype=tf.float32, name=None), name='conv1d_7/Relu:0', description="created by layer 'conv1d_7'")
KerasTensor(type_spec=TensorSpec(shape=(None, 558, 22), dtype=tf.float32, name=None), name='max_pooling1d_5/Squeeze:0', description="created by layer 'max_pooling1d_5'")
KerasTensor(type_spec=TensorSpec(shape=(None, 558, 22), dtype=tf.float32, name=None), name='dropout_5/Identity:0', description="created by layer 'dropout_5'")
KerasTensor(type_spec=TensorSpec

In [57]:
# Flatten everything except the batch dimension before the Dense layer.
x = tf.keras.layers.Flatten()(x)

In [58]:
# One linear output unit per (sequence position, symbol) pair.
x = tf.keras.layers.Dense(
    PROCESSING_INFO['padding_lengths'][SEQ] * len(idx_to_char),
    activation=None,
)(x)

In [59]:
# Reshape the flat Dense output to (batch, padded sequence length, number of
# symbols) and apply softmax (over the last axis by default).
x = tf.reshape(
    x,
    (-1, PROCESSING_INFO['padding_lengths'][SEQ], len(idx_to_char)),
)
x = tf.keras.activations.softmax(x)

model = tf.keras.Model(inputs=input_layers_list, outputs=x, name='mmproteo')

# NOTE(review): the plain sparse categorical cross-entropy is used here instead
# of `masked_loss`, so padded target positions presumably count towards the
# loss and the metrics — confirm whether that is intended.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=[
        tf.keras.metrics.SparseCategoricalAccuracy(),
        tf.keras.metrics.SparseCategoricalCrossentropy(),
    ],
)
model.summary()

Model: "mmproteo"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
mz_array (InputLayer)           [(None, 2354)]       0                                            
__________________________________________________________________________________________________
intensity_array (InputLayer)    [(None, 2354)]       0                                            
__________________________________________________________________________________________________
masked_mz_array (Masking)       (None, 2354)         0           mz_array[0][0]                   
__________________________________________________________________________________________________
masked_intensity_array (Masking (None, 2354)         0           intensity_array[0][0]            
___________________________________________________________________________________________

## Training the TensorFlow Model

In [60]:
# Load the TensorBoard notebook extension so that the `%tensorboard` magic is available.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [61]:
# Directory for the TensorBoard logs; passed to `%tensorboard --logdir` and to the TensorBoard callback.
TENSORBOARD_LOG_DIR = os.path.join(DUMP_PATH, "tensorboard", "logs")
# Display the resolved absolute path as a sanity check.
os.path.realpath(TENSORBOARD_LOG_DIR)

'/tf/workspace/dumps/PXD010000/tensorboard/logs'

In [62]:
# Start (or reuse) TensorBoard inside the notebook, reading from the log directory configured above.
%tensorboard --logdir $TENSORBOARD_LOG_DIR --bind_all

Reusing TensorBoard on port 6006 (pid 109), started 3 days, 8:10:09 ago. (Use '!kill 109' to kill it.)

In [63]:
# Both datasets are repeated indefinitely, so the length of an epoch and of a
# validation run is set explicitly through the step counts.
model.fit(
    x=datasets[TRAINING_TYPE].repeat(),
    epochs=10,
    steps_per_epoch=1_000,
    validation_data=datasets[TEST_TYPE].repeat(),
    validation_steps=500,
    callbacks=[
        callbacks.create_tensorboard_callback(
            tensorboard_log_dir=TENSORBOARD_LOG_DIR,
            keep_logs=KEEP_CACHE,
        ),
    ],
)

Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 

## Evaluating the TensorFlow Model

In [64]:
# Element-wise decoder from class indices to symbols, built by vectorizing the `idx_to_char` lookup.
decode_idx: Callable[[np.ndarray], np.ndarray] = np.vectorize(idx_to_char.get)

In [65]:
# Evaluator over the 'Eval' split; it receives the index decoder, a separator
# and the padding character of the target sequence column.
evaluator = evaluation.SequenceEvaluator(
    dataset=datasets[EVAL_TYPE],
    decode_func=decode_idx,
    batch_size=BATCH_SIZE,
    separator=" ",
    padding_character=PROCESSING_INFO['padding_characters'][SEQ],
)

In [66]:
# Evaluate the model on the evaluator's dataset; the stored output shows the loss and metrics progress bar.
evaluator.evaluate_model(model)

  89/1250 [=>............................] - ETA: 4:30 - loss: 0.9077 - sparse_categorical_accuracy: 0.7401 - sparse_categorical_crossentropy: 0.9077

KeyboardInterrupt: 

In [67]:
# Request a sample of 20 sequences: `eval_df` holds the decoded predicted and
# true sequences side by side; the second return value holds the inputs,
# targets and raw model predictions.
eval_df, (x_eval, y_eval, y_pred) = evaluator.evaluate_model_visually(
    model=model,
    sample_size=20,
    keep_separator=True,
)
eval_df

Unnamed: 0,predicted,true
0,A L E I L I L A K _ _ _ _ _ _ _ _ _ _ _,K M G A Q T A E A N I N A G I A A A R
1,A A A A A A G A A A G A A _ _ _,I A H S D D A V L L H F N V K
2,A A A A A A A A A A A A _ _ _,A Y L A A W D A Y T N S D K
3,A A A A A A G A A,T V V V N F N K
4,A A A I A A A A A A A _ _ _ _,Q I P A N Y Q N D E T I V K
5,A A A I A L L A L A _ _ _ _ _ _ _ _ _,A V E G A A T Q S V A D Q E A I Q K
6,A A E A A G A A L A G,V G T V A G A V V K
7,A A A A A G D A L A G A,K V V K G D N N T P R
8,A A A A A A A A A A A A A A _ _ _ _ _ _,V V F Y K E P Q Y Y S H V I H L V S R
9,A A A A A G D A V A G A A,L S A M N P L L D G G R


In [68]:
# Print every predicted sequence on its own line, untruncated.
# A plain loop replaces `Series.map(print)`, which built a throwaway Series of
# None values and needed a trailing `None` to suppress the cell output.
for predicted_sequence in eval_df.predicted:
    print(predicted_sequence)

A L E I L I L A K _ _ _ _ _ _ _ _ _ _ _
A A A A A A G A A A G A A _ _ _
A A A A A A A A A A A A _ _ _
A A A A A A G A A
A A A I A A A A A A A _ _ _ _
A A A I A L L A L A _ _ _ _ _ _ _ _ _
A A E A A G A A L A G
A A A A A G D A L A G A
A A A A A A A A A A A A A A _ _ _ _ _ _
A A A A A G D A V A G A A
A A E I A G L A V A G _ _ _
A A A A A G A A V A G A
A A A I A A G A _ _ _ _ _ _
A A A A A A D A L A
A A D A A V L A V K
A A A E A A G A _ _ _ _
A A A A A A A A A A A A _
A A A A A A D A V A G A A
A A A A A A G A A A A A A _ _
A A E A E G L G


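**TODO:** Nearly every predicted sequence above is a run of mostly `A` followed by padding (`_`), regardless of the true sequence. Is the loss function broken? Note that the model is currently compiled with the plain `SparseCategoricalCrossentropy` instead of `masked_loss`.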

In [41]:
# Most likely class index per sequence position for the first sampled prediction.
np.argmax(y_pred[0], axis=1)

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 21, 21, 21, 21, 21, 21,
       21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
       21, 21, 21, 21, 21, 21, 21, 21, 21,  0, 21, 21, 21, 21, 21, 21])

In [47]:
# Sanity check: shape of the model output for a single batch of the 'Eval' split.
model.predict(datasets[EVAL_TYPE].take(1)).shape

(32, 50, 22)

In [48]:
# Shape of the sampled predictions returned by `evaluate_model_visually`.
y_pred.shape

(20, 50, 22)