# Training a CNN-LSTM Model on TensorFlow Datasets
## Prerequisites

In [1]:
import json
import os
from typing import Callable

import numpy as np
import pandas as pd
import tensorflow as tf
from mmproteo.utils import log, paths, visualization
from mmproteo.utils.formats.tf_dataset import DatasetLoader
from mmproteo.utils.ml import callbacks, evaluation, layers, losses

In [2]:
# Render every column and up to 1000 rows when a DataFrame is displayed.
pd.options.display.max_columns = None
pd.options.display.max_rows = 1000

In [3]:
# Logger instance handed to the mmproteo helpers below
# (dataset path assignment and dataset loading).
logger = log.DummyLogger(verbose=False)

INFO: Printing to Stdout


## Configuration

In [4]:
# Show the kernel's working directory; the relative paths configured below
# (starting with "..") are resolved against it.
%pwd

'/tf/workspace/notebooks'

In [5]:
# Project identifier and the directory layout of its dumps.
# Everything is expressed relative to the notebook's working directory.
PROJECT = "PXD010000"
DUMP_PATH = os.path.join(os.pardir, "dumps", PROJECT)

# Per-file training columns and their accompanying statistics file.
TRAINING_COLUMNS_DUMP_PATH = os.path.join(DUMP_PATH, "training_columns")
FILES_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "*_mzmlid.parquet")
STATISTICS_FILE_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "statistics.parquet")

# Serialized TensorFlow datasets and the JSON file describing how they were processed.
DATASET_DUMP_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "tf_datasets")
PROCESSING_FILE_PATH = os.path.join(DATASET_DUMP_PATH, "processing_info.json")

In [6]:
# Name of the target column; used as the key into the padding settings
# of PROCESSING_INFO throughout the notebook.
SEQ = "peptide_sequence"

In [7]:
# Load the preprocessing metadata (padding characters and lengths, index-to-character
# table, column roles) that was written next to the serialized datasets.
# JSON is read as UTF-8 explicitly so the result does not depend on the locale
# default encoding, and json.load parses straight from the file handle.
with open(PROCESSING_FILE_PATH, 'r', encoding='utf-8') as file:
    PROCESSING_INFO = json.load(file)
PROCESSING_INFO

{'padding_characters': {'peptide_sequence': '_',
  'mz_array': 0.0,
  'intensity_array': 0.0},
 'padding_lengths': {'mz_array': 2354,
  'intensity_array': 2354,
  'peptide_sequence': 50},
 'idx_to_char': {'0': 'A',
  '1': 'C',
  '2': 'D',
  '3': 'E',
  '4': 'F',
  '5': 'G',
  '6': 'H',
  '7': 'I',
  '8': 'K',
  '9': 'L',
  '10': 'M',
  '11': 'M(Oxidation)',
  '12': 'N',
  '13': 'P',
  '14': 'Q',
  '15': 'R',
  '16': 'S',
  '17': 'T',
  '18': 'V',
  '19': 'W',
  '20': 'Y',
  '21': '_'},
 'normalization': {'intensity_array': '<function base_peak_normalize at 0x7fa6046d5158>'},
 'split_value_columns': ['species', 'istrain'],
 'training_data_columns': ['mz_array', 'intensity_array'],
 'target_data_columns': ['peptide_sequence']}

In [8]:
# JSON object keys are always strings, so convert them back to integer indices.
idx_to_char = {
    int(raw_index): residue
    for raw_index, residue in PROCESSING_INFO["idx_to_char"].items()
}
# Inverse lookup: character -> integer index.
char_to_idx = dict(zip(idx_to_char.values(), idx_to_char.keys()))

## Loading TensorFlow Datasets

In [9]:
# Single switch that is forwarded as `skip_existing` to the path assignment,
# as `keep_cache` to the DatasetLoader and as `keep_logs` to the TensorBoard callback.
KEEP_CACHE = True

In [10]:
# Keys of the dataset splits. In this notebook the 'Test' split is used as
# validation data during fitting and the 'Eval' split for the final evaluation.
TRAINING_TYPE = 'Train'
TEST_TYPE = 'Test'
EVAL_TYPE = 'Eval'

In [11]:
# Wildcard matching every serialized dataset directory:
# <DATASET_DUMP_PATH>/<filename>/<species>/<istrain>
dataset_wildcard_path = os.path.join(DATASET_DUMP_PATH, '*', '*', '*')

# Where the resulting split assignment is dumped as JSON.
dataset_paths_dump_file = os.path.join(DATASET_DUMP_PATH, "dataset_file_paths.json")

# Assign the matched paths to the three splits, grouped by the path component
# at position -2 (the species directory in the layout above).
# NOTE(review): the split values sum to 1.5 -- confirm how the helper interprets them.
# NOTE(review): with skip_existing=True an existing dump file is presumably reused
# instead of recomputing the assignment -- confirm against the helper.
dataset_file_paths = paths.assign_wildcard_paths_to_splits_grouped_by_path_position_value(
    wildcard_path=dataset_wildcard_path,
    path_position=-2,
    splits={
        TRAINING_TYPE: 0.4,
        TEST_TYPE: 0.5,
        EVAL_TYPE: 0.6,
    },
    paths_dump_file=dataset_paths_dump_file,
    skip_existing=KEEP_CACHE,
    logger=logger,
)

print("\nassigned dataset files:")
visualization.print_list_length_in_dict(dataset_file_paths)

INFO: found file paths dump '../dumps/PXD010000/training_columns/tf_datasets/dataset_file_paths.json'

assigned dataset files:
#Train = 89
e.g.: ../dumps/PXD010000/training_columns/tf_datasets/Biodiversity_C_indologenes_LIB_aerobic_02_03May16_Samwise_16-03-32_mzmlid.parquet/Chryseobacterium_indologenes/Train
#Test = 17
e.g.: ../dumps/PXD010000/training_columns/tf_datasets/Biodiversity_A_cryptum_FeTSB_anaerobic_1_01Jun16_Pippin_16-03-39_mzmlid.parquet/Acidiphilium_cryptum_JF-5/Train
#Eval = 29
e.g.: ../dumps/PXD010000/training_columns/tf_datasets/Biodiversity_B_fragilis_CMcarb_anaerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet/Bacteroides_fragilis_638R/Train


### Loading corresponding TF datasets

In [12]:
def _padded_specs(columns, dtype):
    """Return one fixed-length 1-D TensorSpec per column, sized by its padding length."""
    return tuple(
        tf.TensorSpec(shape=(PROCESSING_INFO['padding_lengths'][column],), dtype=dtype)
        for column in columns
    )


# Structure of one dataset element: (training inputs, targets).
# Inputs are float32 arrays, targets are int8 index sequences.
element_spec = (
    _padded_specs(PROCESSING_INFO['training_data_columns'], tf.float32),
    _padded_specs(PROCESSING_INFO['target_data_columns'], tf.int8),
)
element_spec

((TensorSpec(shape=(2354,), dtype=tf.float32, name=None),
  TensorSpec(shape=(2354,), dtype=tf.float32, name=None)),
 (TensorSpec(shape=(50,), dtype=tf.int8, name=None),))

In [13]:
# Batch size shared by the dataset loader and the sequence evaluator.
BATCH_SIZE = 32

In [14]:
# Configure the loader once, then load one batched dataset per split type.
dataset_loader = DatasetLoader(
    element_spec=element_spec,
    batch_size=BATCH_SIZE,
    shuffle_buffer_size=100_000,
    keep_cache=KEEP_CACHE,
    logger=logger,
)
datasets = dataset_loader.load_datasets_by_type(dataset_file_paths)
datasets



{'Train': <BatchDataset shapes: (((32, 2354), (32, 2354)), ((32, 50),)), types: ((tf.float32, tf.float32), (tf.int8,))>,
 'Test': <BatchDataset shapes: (((32, 2354), (32, 2354)), ((32, 50),)), types: ((tf.float32, tf.float32), (tf.int8,))>,
 'Eval': <BatchDataset shapes: (((32, 2354), (32, 2354)), ((32, 50),)), types: ((tf.float32, tf.float32), (tf.int8,))>}

## Building the TensorFlow Model

In [15]:
# One input configuration per training column: the input length is the column's
# padding length and the mask value is the column's padding character.
input_layer_configs = [
    layers.InputLayerConfiguration(
        name=col,
        shape=PROCESSING_INFO['padding_lengths'][col],
        mask_value=PROCESSING_INFO['padding_characters'][col],
    )
    for col in PROCESSING_INFO['training_data_columns']
]

# Returns the raw input tensors and their masked counterparts, in column order.
input_layers_list, masked_input_layers_list = layers.create_masked_input_layers(
    input_layer_configs
)
print(input_layers_list)
print(masked_input_layers_list)

[<KerasTensor: shape=(None, 2354) dtype=float32 (created by layer 'mz_array')>, <KerasTensor: shape=(None, 2354) dtype=float32 (created by layer 'intensity_array')>]
[<KerasTensor: shape=(None, 2354) dtype=float32 (created by layer 'masked_mz_array')>, <KerasTensor: shape=(None, 2354) dtype=float32 (created by layer 'masked_intensity_array')>]


In [16]:
# Integer index of the padding character of the target sequence.
padding_idx = char_to_idx[PROCESSING_INFO['padding_characters'][SEQ]]

# Sparse categorical cross-entropy wrapped in the project's MaskedLoss, configured
# with the padding index as masking value.
# NOTE: model.compile further down currently passes a plain
# SparseCategoricalCrossentropy; masked_loss only appears there in a comment.
masked_loss = losses.MaskedLoss(
    loss_function=tf.keras.losses.sparse_categorical_crossentropy,
    masking_value=tf.constant(value=padding_idx, dtype=tf.int8),
)

In [17]:
# Stack the masked input arrays along a new last axis so that each position
# carries one channel per training column.
x = tf.stack(masked_input_layers_list, axis=-1)
x

<KerasTensor: shape=(None, 2354, 2) dtype=float32 (created by layer 'tf.stack')>

In [18]:
# Inspect the full shape of the stacked input, including the batch dimension.
x.shape

TensorShape([None, 2354, 2])

In [19]:
# Same shape without the leading batch dimension.
x.shape[1:]

TensorShape([2354, 2])

In [20]:
# Wide convolution over the stacked inputs; 'same' padding keeps the sequence
# length unchanged so the result can be concatenated with x afterwards.
wide_conv = tf.keras.layers.Conv1D(
    filters=42,
    kernel_size=100,
    padding='same',
    activation='relu',
)
y = wide_conv(x)
y

<KerasTensor: shape=(None, 2354, 42) dtype=float32 (created by layer 'conv1d')>

In [21]:
# Append the convolution features to the raw input channels.
# NOTE: this cell overwrites x with a value derived from x, so re-running it
# without re-running the cells above concatenates a second time.
x = tf.concat([x, y], axis=-1)
x

<KerasTensor: shape=(None, 2354, 44) dtype=float32 (created by layer 'tf.concat')>

In [22]:
# Bidirectional LSTM that returns the full output sequence (one vector per position).
recurrent_layer = tf.keras.layers.LSTM(units=512, return_sequences=True)
x = tf.keras.layers.Bidirectional(layer=recurrent_layer)(x)
x

<KerasTensor: shape=(None, 2354, 1024) dtype=float32 (created by layer 'bidirectional')>

In [23]:
# Current sequence length; the down-sampling loop below tests this value.
x.shape[1]

2354

In [24]:
# Down-sampling stack: repeat Conv1D -> MaxPool1D -> Dropout while the sequence
# length is still at least (pool_size + 1) times the padded target length.
# The pooling layer uses the same `pool_size` as the loop condition, so the two
# cannot drift apart when the value is changed.
pool_size = 2

while x.shape[1] >= (pool_size + 1) * PROCESSING_INFO['padding_lengths'][SEQ]:
    # No padding is specified here, so each convolution shortens the sequence
    # (see the printed shapes).
    x = tf.keras.layers.Conv1D(
        filters=22,
        kernel_size=42,
        activation='relu',
    )(x)
    print(x)
    x = tf.keras.layers.MaxPool1D(
        pool_size=pool_size,
        padding='same',
    )(x)
    print(x)
    x = tf.keras.layers.Dropout(0.1)(x)
    print(x)

KerasTensor(type_spec=TensorSpec(shape=(None, 2313, 22), dtype=tf.float32, name=None), name='conv1d_1/Relu:0', description="created by layer 'conv1d_1'")
KerasTensor(type_spec=TensorSpec(shape=(None, 1157, 22), dtype=tf.float32, name=None), name='max_pooling1d/Squeeze:0', description="created by layer 'max_pooling1d'")
KerasTensor(type_spec=TensorSpec(shape=(None, 1157, 22), dtype=tf.float32, name=None), name='dropout/Identity:0', description="created by layer 'dropout'")
KerasTensor(type_spec=TensorSpec(shape=(None, 1116, 22), dtype=tf.float32, name=None), name='conv1d_2/Relu:0', description="created by layer 'conv1d_2'")
KerasTensor(type_spec=TensorSpec(shape=(None, 558, 22), dtype=tf.float32, name=None), name='max_pooling1d_1/Squeeze:0', description="created by layer 'max_pooling1d_1'")
KerasTensor(type_spec=TensorSpec(shape=(None, 558, 22), dtype=tf.float32, name=None), name='dropout_1/Identity:0', description="created by layer 'dropout_1'")
KerasTensor(type_spec=TensorSpec(shape=(

In [25]:
# Collapse the remaining (length, channels) axes into one feature vector per sample.
x = tf.keras.layers.Flatten()(x)

In [26]:
# One linear output per (sequence position, character) pair; the flat vector is
# reshaped to (positions, characters) and passed through softmax in the next cell.
x = tf.keras.layers.Dense(
    units=PROCESSING_INFO['padding_lengths'][SEQ]*len(idx_to_char),
    activation=None,
)(x)

In [27]:
# Reshape the flat Dense output to (batch, sequence position, character) and
# normalise it with softmax.
sequence_length = PROCESSING_INFO['padding_lengths'][SEQ]
alphabet_size = len(idx_to_char)
x = tf.reshape(x, (-1, sequence_length, alphabet_size))
x = tf.keras.activations.softmax(x)

model = tf.keras.Model(inputs=input_layers_list, outputs=x, name='mmproteo')
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    # NOTE: the plain (unmasked) loss is used here; `masked_loss` defined earlier
    # in the notebook was the intended alternative and is currently not passed.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=[
        tf.keras.metrics.SparseCategoricalAccuracy(),
        tf.keras.metrics.SparseCategoricalCrossentropy(),
    ],
)
model.summary()

Model: "mmproteo"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
mz_array (InputLayer)           [(None, 2354)]       0                                            
__________________________________________________________________________________________________
intensity_array (InputLayer)    [(None, 2354)]       0                                            
__________________________________________________________________________________________________
masked_mz_array (Masking)       (None, 2354)         0           mz_array[0][0]                   
__________________________________________________________________________________________________
masked_intensity_array (Masking (None, 2354)         0           intensity_array[0][0]            
___________________________________________________________________________________________

## Training the TensorFlow Model

In [28]:
# Load the TensorBoard notebook extension; it provides the `%tensorboard`
# magic used two cells further down.
%load_ext tensorboard

In [29]:
# Directory that both the TensorBoard UI and the training callback use for logs.
TENSORBOARD_LOG_DIR = os.path.join(DUMP_PATH, "tensorboard", "logs")
# Display the resolved absolute path for reference.
os.path.realpath(TENSORBOARD_LOG_DIR)

'/tf/workspace/dumps/PXD010000/tensorboard/logs'

In [30]:
# Start (or reuse) TensorBoard on the log directory.
# NOTE: --bind_all makes TensorBoard listen on all network interfaces,
# not just localhost.
%tensorboard --logdir $TENSORBOARD_LOG_DIR --bind_all

Reusing TensorBoard on port 6006 (pid 109), started 3 days, 20:17:01 ago. (Use '!kill 109' to kill it.)

In [40]:
# TensorBoard callback writing into the shared log directory.
tensorboard_callback = callbacks.create_tensorboard_callback(
    tensorboard_log_dir=TENSORBOARD_LOG_DIR,
    keep_logs=KEEP_CACHE,
)

# Both datasets are repeated indefinitely, so the length of an epoch and of each
# validation pass is fixed through steps_per_epoch / validation_steps.
model.fit(
    x=datasets[TRAINING_TYPE].repeat(),
    validation_data=datasets[TEST_TYPE].repeat(),
    validation_steps=500,
    epochs=3,
    steps_per_epoch=1_000,
    callbacks=[tensorboard_callback],
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fc09a9d4f98>

## Evaluating the TensorFlow Model

In [41]:
# Element-wise lookup that maps an array of integer indices to their characters.
# `dict.get` yields None for any index that is missing from idx_to_char.
decode_idx: Callable[[np.ndarray], np.ndarray] = np.vectorize(idx_to_char.get)

In [42]:
# Evaluator for the held-out 'Eval' split. It is given the index decoder, the
# separator placed between decoded characters and the padding character of the
# target sequence.
evaluator = evaluation.SequenceEvaluator(
    dataset=datasets[EVAL_TYPE],
    decode_func=decode_idx,
    batch_size=BATCH_SIZE,
    separator=" ",
    padding_character=PROCESSING_INFO['padding_characters'][SEQ],
)

In [43]:
# Numeric evaluation on the 'Eval' split.
# NOTE(review): the three returned values are presumably the loss followed by the
# two compiled metrics (accuracy, cross-entropy) -- confirm against SequenceEvaluator.
evaluator.evaluate_model(model)



[1.0335773229599, 0.7211865186691284, 1.0335770845413208]

In [44]:
# Decode a sample of 20 predictions for manual inspection.
# eval_df holds the decoded predicted and true sequences; the second return value
# is unpacked into x_eval, y_eval and y_pred, which are inspected further below.
eval_df, (x_eval, y_eval, y_pred) = evaluator.evaluate_model_visually(
    model=model,
    sample_size=20,
    keep_separator=True,
)
eval_df

Unnamed: 0,predicted,true
0,A A A A A A A A A _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _,E V G S L Q C T K G G P I V M(Oxidation) V Q C E N E F G S Y V A Q R K
1,A A A A A A A A A _ _ _ _ _ _ _ _,N K T Q E E T P K K R P Y N L R
2,A A A A A A A A A _ _ _ _ _ _ _ _,I W V D N H T Y Q V N E D A S K
3,A A A A A A A A A _ _,K A M V D H L Q E R
4,A A A A A A A A A _ _,N Y L P Q E L K E K
5,A A A A A A A A A _,N R Q D V L D I R
6,A A A A A A A A A _ _ _,H S G M I Q A S E L K
7,A A A A A A A A A _ _ _ _,Y I T D I M(Oxidation) P A A N T K
8,A A A A A A A A A _ _ _ _,K P L D T W V N P E Q R
9,A A A A A A A A A _ _ _ _ _ _ _ _ _ _ _ _,L G E D N I N V V E G N E Q F I S A S K


In [45]:
# Print each predicted sequence on its own line so they can be compared at a glance.
# A plain loop is used because printing is a side effect: there is no result Series
# to build and no cell output to suppress.
for predicted_sequence in eval_df.predicted:
    print(predicted_sequence)

A A A A A A A A A _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A A A A A A A A A _ _ _ _ _ _ _ _
A A A A A A A A A _ _ _ _ _ _ _ _
A A A A A A A A A _ _
A A A A A A A A A _ _
A A A A A A A A A _
A A A A A A A A A _ _ _
A A A A A A A A A _ _ _ _
A A A A A A A A A _ _ _ _
A A A A A A A A A _ _ _ _ _ _ _ _ _ _ _ _
A A A A A A A A A _ _ _
A A A A A A A A A _
A A A A A A A A A _ _ _ _ _ _
A A A A A A A A A _ _ _ _
A A A A A A A A A _ _
A A A A A A A A A _ _
A A A A A A A A A
A A A A A A A A A _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A A A A A A A A A _
A A A A A A A A A _


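**Open question:** the predictions printed above are almost identical for every sample (a run of `A` followed by padding), regardless of the true sequence. Broken loss function?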

In [37]:
# Most probable character index at each sequence position of the first sampled prediction.
np.argmax(y_pred[0], axis=1)

array([ 0,  0, 18,  0,  0,  0,  0,  0,  0, 21, 21, 21, 21, 21, 21, 21, 21,
       21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
       21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21])

In [38]:
# Sanity check: shape of the model output for a single batch of the 'Eval' split.
model.predict(datasets[EVAL_TYPE].take(1)).shape

(32, 50, 22)

In [39]:
# Shape of the sampled predictions returned by evaluate_model_visually.
y_pred.shape

(20, 50, 22)

In [46]:
# Raw per-position probabilities of the sampled predictions, to compare samples
# against each other.
y_pred

array([[[1.2700804e-01, 1.1757603e-03, 4.3611769e-02, ...,
         8.0585908e-03, 2.8152566e-02, 1.5297088e-06],
        [1.3558742e-01, 2.5996319e-03, 3.9417513e-02, ...,
         6.0903039e-03, 3.3834171e-02, 1.5502601e-06],
        [1.2859781e-01, 2.8369194e-03, 8.2352206e-02, ...,
         5.3908275e-03, 2.7533930e-02, 1.4438237e-06],
        ...,
        [7.6850076e-05, 1.4050021e-06, 1.8950088e-06, ...,
         1.4758485e-06, 7.7825352e-06, 9.9878353e-01],
        [6.3071297e-05, 5.2118128e-05, 1.3421746e-06, ...,
         1.2822607e-06, 1.3108994e-06, 9.9941170e-01],
        [3.6003006e-05, 1.0142294e-06, 9.5947678e-07, ...,
         9.8051567e-07, 9.4997858e-07, 9.9976987e-01]],

       [[1.2700804e-01, 1.1757603e-03, 4.3611769e-02, ...,
         8.0585908e-03, 2.8152566e-02, 1.5297088e-06],
        [1.3558742e-01, 2.5996319e-03, 3.9417513e-02, ...,
         6.0903039e-03, 3.3834171e-02, 1.5502601e-06],
        [1.2859781e-01, 2.8369194e-03, 8.2352206e-02, ...,
         5.390