# Prototyping an ML Model on Tensorflow Datasets
## Prerequisites

In [1]:
import gc
import glob
import os
import shutil
from typing import Iterable, Callable, Dict, Any, Tuple, Optional, List, Union

import numpy as np
import pandas as pd
import tensorflow as tf

from mmproteo.utils import log, utils
from mmproteo.utils.formats.mz import FilteringProcessor
from mmproteo.utils.formats.tf_dataset import Parquet2DatasetFileProcessor
from mmproteo.utils.processing import ItemProcessor

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

In [3]:
logger = log.DummyLogger(verbose=False)

INFO: Printing to Stdout


## Configuration

In [4]:
pwd

'/tf/workspace/notebooks'

In [5]:
PROJECT = "PXD010000"
DUMP_PATH = os.path.join("..", "dumps", PROJECT)
TRAINING_COLUMNS_DUMP_PATH = os.path.join(DUMP_PATH, "training_columns")
FILES_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "*_mzmlid.parquet")
STATISTICS_FILE_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "statistics.parquet")
DATASET_DUMP_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "tf_datasets")

In [6]:
MZMLID_FILE_PATHS = glob.glob(FILES_PATH)
len(MZMLID_FILE_PATHS)

40

In [7]:
MZMLID_FILE_PATHS[0]

'../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet'

In [8]:
df = pd.read_parquet(MZMLID_FILE_PATHS[1])
df.head(2)

Unnamed: 0,peptide_sequence,mz_array,intensity_array,species,istrain
6,"[L, D, N, V, V, Y, R]","[100.03951, 100.07604, 100.08698, 101.0598, 101.07139, 101.107635, 102.05545, 107.04927, 110.07147, 112.050835, 112.07621, 112.07955, 112.08708, 112.11265, 113.07123, 114.05504, 114.10244, 115.0504, 115.08685, 116.07068, 116.97243, 117.10215, 117.8191, 119.049065, 120.08085, 121.08424, 126.054794, 126.06571, 127.0868, 127.095535, 128.07236, 128.08185, 129.06583, 129.1024, 129.1124, 130.0507, 130.08633, 130.09756, 130.10551, 133.06154, 133.09709, 136.07576, 137.07903, 138.06609, 138.0916, 139.08571, 139.69499, 140.0812, 140.14333, 141.06573, 141.1022, 143.08153, 143.11768, 145.06099, 145.09743, 147.11273, 152.07062, 153.06499, 155.08113, 155.1178, 156.10173, 157.09727, 157.10855, 157.13348, 157.14546, 158.08061, 158.09245, 158.13683, 159.07657, 159.09282, 159.11234, 165.1023, 166.06062, 169.08437, 169.09724, 169.13377, 171.07635, 171.11241, 171.14874, 173.0913, 173.12836, 175.11905, 176.12291, 177.10197, 180.06554, 181.09618, 181.13329, 183.11295, 184.09566, 184.11569, 185.05496, 185.12808, 185.16528, 186.1237, 187.07101, 187.10844, 187.12733, 187.1442, 191.11745, 193.09688, ...]","[1472.0198, 1778.061, 982.26117, 849.2956, 7433.908, 1517.598, 10654.481, 1285.867, 22276.096, 1036.7197, 1008.01794, 1077.3555, 17357.316, 1765.9006, 1840.2622, 884.15027, 1030.9141, 1466.3262, 15643.766, 12685.864, 884.1921, 1245.8772, 909.2175, 1600.0315, 22742.06, 1201.5543, 1346.1736, 790.16095, 7063.337, 811.5802, 1109.8296, 6045.0728, 4442.3276, 33509.23, 2367.9402, 1291.7451, 11317.632, 6297.992, 2274.9944, 1097.1204, 1409.3613, 63035.684, 3738.175, 901.6442, 1888.8175, 784.02856, 840.27997, 875.71985, 1409.5555, 2824.3079, 8497.284, 4310.6895, 6666.118, 1519.9, 967.4869, 12989.814, 5915.0684, 942.62933, 4556.132, 1359.317, 5664.3735, 1361.2703, 5972.5083, 20817.252, 1598.2277, 1266.6726, 26993.193, 1077.943, 1951.6704, 796.94104, 1074.771, 1259.3566, 1369.4341, 1098.0599, 14032.469, 3526.5652, 1029.5724, 5769.34, 4063.4348, 1824.1956, 7088.784, 141361.95, 6137.2266, 1245.3956, 1576.3531, 2777.617, 1268.7177, 65213.92, 894.09875, 5262.379, 4305.7144, 1807.8231, 3397.9797, 43168.617, 1053.397, 3030.1729, 3255.7651, 3606.3943, 1305.8389, 4084.5789, ...]",Alcaligenes_faecalis,Train
7,"[A, G, L, D, N, N, Y, V, K]","[100.03982, 100.07586, 101.071236, 101.107574, 102.05517, 107.04923, 110.0715, 111.05547, 112.0509, 112.08704, 113.071, 115.086555, 116.07075, 116.9723, 119.04964, 120.08085, 127.08648, 128.07094, 128.08191, 129.06592, 129.10237, 130.04881, 130.07718, 130.08636, 130.09673, 130.10614, 136.04158, 136.0756, 137.07896, 139.04985, 141.06577, 141.10187, 142.12172, 143.08133, 143.11786, 144.12074, 147.11266, 148.11627, 152.07063, 155.08133, 155.11841, 157.06078, 157.09642, 157.13316, 158.09187, 159.07617, 159.11238, 166.06183, 169.09738, 169.13321, 171.11256, 171.14896, 173.09091, 173.12814, 173.97696, 174.13239, 175.1189, 181.06166, 181.09688, 183.11264, 185.09157, 186.0881, 186.12384, 187.10779, 187.14383, 195.07558, 195.11305, 197.12822, 201.0985, 201.12306, 202.0831, 204.13431, 211.14339, 212.06625, 212.10194, 215.1022, 215.13902, 216.10397, 223.15463, 226.11867, 227.10811, 228.13367, 228.17004, 229.09404, 229.11703, 230.07692, 230.11682, 231.07915, 233.09244, 235.14424, 242.11343, 242.14897, 244.16553, 246.15945, 246.1812, 247.10516, 247.18224, 250.11873, 254.14992, 255.1447, ...]","[732.6617, 1411.32, 16631.832, 914.66113, 1730.3816, 1458.0236, 11018.118, 814.45605, 824.7896, 4943.581, 1140.477, 3491.9822, 809.35693, 1004.9697, 907.5157, 4783.916, 1493.2881, 1060.6812, 2808.3154, 18674.494, 37496.016, 1481.8646, 1417.5668, 31441.13, 1304.8645, 798.3385, 861.07764, 15419.474, 735.78265, 858.0814, 3415.7173, 1404.7439, 869.0062, 1809.3257, 44420.22, 3551.6487, 40098.707, 1224.2285, 1093.3992, 3180.0164, 834.1555, 840.0687, 1675.8429, 3058.9111, 3239.9705, 3116.7256, 899.7371, 752.41925, 5850.0176, 2989.287, 12923.561, 752.36975, 1145.8423, 3916.3987, 751.41974, 894.45795, 15483.53, 937.4926, 1308.637, 4491.5586, 1013.0089, 781.3709, 1171.0354, 9497.646, 3152.1099, 1440.8368, 904.2847, 4495.3774, 860.7605, 7126.398, 1022.1986, 5772.762, 1022.4706, 3545.5469, 945.3842, 8011.257, 3472.5981, 1743.3401, 728.37146, 1287.6128, 799.1571, 969.7322, 1415.0103, 15196.673, 3107.087, 22553.723, 735.41016, 945.4697, 906.6033, 4455.847, 3310.5352, 4453.3364, 4270.777, 1118.9838, 19797.84, 954.415, 893.77594, 6114.4775, 1068.6388, 729.6997, ...]",Alcaligenes_faecalis,Train


In [9]:
SEQ = 'peptide_sequence'
MZ = 'mz_array'
INT = 'intensity_array'

In [10]:
TRAINING_DATA_COLUMNS = [MZ, INT]
TARGET_DATA_COLUMNS = [SEQ]
SPLIT_VALUE_COLUMNS = ['species', 'istrain']

## Calculating Statistics over all MZMLID Files

In [11]:
file_path_count = len(MZMLID_FILE_PATHS)

def get_mzmlid_file_stats(item: Tuple[int, str]) -> Dict[str, Any]:
    idx, path = item
    info_text = f"Processing item {idx + 1}/{file_path_count} '{path}'"
    if idx % 10 == 0:
        logger.info(info_text)
    else:
        logger.debug(info_text)
    df = pd.read_parquet(path)
    max_sequence_length = df[SEQ].str.len().max()
    max_array_length = df[INT].str.len().max()
    alphabet = set.union(*df[SEQ].apply(set))
    item_count = len(df)
    del df
    gc.collect()
    
    return {
        "file_path": path,
        "max_sequence_length": max_sequence_length,
        "max_array_length": max_array_length,
        "alphabet": alphabet,
        "item_count": item_count
    }

if os.path.exists(STATISTICS_FILE_PATH):
    file_stats = pd.read_parquet(STATISTICS_FILE_PATH)
    file_stats.alphabet = file_stats.alphabet.apply(set)
    print(f"loaded previous statistics file '{STATISTICS_FILE_PATH}'")
else:
    file_stats = pd.DataFrame(
        ItemProcessor(
            items=enumerate(MZMLID_FILE_PATHS),
            item_processor=get_mzmlid_file_stats,
            action_name="analyse",
            subject_name="mzmlid file",
            thread_count=0,
            logger=logger
        ).process()
    )
    
    file_stats_writable = file_stats.copy()
    file_stats_writable.alphabet = file_stats_writable.alphabet.apply(list) # cannot store sets
    file_stats_writable.to_parquet(STATISTICS_FILE_PATH)

loaded previous statistics file '../dumps/PXD010000/training_columns/statistics.parquet'


In [12]:
file_stats.head(2)

Unnamed: 0,file_path,max_sequence_length,max_array_length,alphabet,item_count
0,../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet,50,1845,"{F, S, K, L, N, C, G, T, V, R, Y, A, M(Oxidation), E, W, M, D, I, H, P, Q}",26943
1,../dumps/PXD010000/training_columns/Biodiversity_A_faecalis_LB_aerobic_03_26Feb16_Arwen_16-01-01_mzmlid.parquet,49,1082,"{F, S, K, L, N, C, G, T, V, R, Y, A, M(Oxidation), E, W, M, D, I, H, P, Q}",16723


In [13]:
PADDING_LENGTHS = {
    MZ: file_stats.max_array_length.max(),
    INT: file_stats.max_array_length.max(),
    SEQ: file_stats.max_sequence_length.max()
}

In [14]:
print("padding lengths =", PADDING_LENGTHS)

TOTAL_ITEM_COUNT = file_stats.item_count.sum()
print(f"TOTAL_ITEM_COUNT = {TOTAL_ITEM_COUNT}")

ALPHABET = set.union(*file_stats.alphabet)
print(f"ALPHABET = {', '.join(sorted(ALPHABET))}")

padding lengths = {'mz_array': 1845, 'intensity_array': 1845, 'peptide_sequence': 50}
TOTAL_ITEM_COUNT = 820586
ALPHABET = A, C, D, E, F, G, H, I, K, L, M, M(Oxidation), N, P, Q, R, S, T, V, W, Y


## Data Normalization, Padding, and Conversion to Tensorflow Datasets

In [15]:
def l2_normalize(values: np.ndarray) -> np.ndarray:
    return tf.keras.utils.normalize(x=values, order=2)

def base_peak_normalize(values: np.ndarray) -> np.ndarray:
    return values / values.max(initial=0)

# by Tom, probably
# don't know, what it's based on
def ion_current_normalize(intensities: np.ndarray) -> np.ndarray:
    total_sum = np.sum(intensities**2)
    normalized = intensities/total_sum
    return normalized

NORMALIZATION = {
    INT: base_peak_normalize
}

In [16]:
PADDING_CHARACTERS = {
    SEQ: '_',
    MZ: 0.0,
    INT: 0.0,
}

ALPHABET.add(PADDING_CHARACTERS[SEQ])

In [17]:
char_to_idx = {char: idx for idx, char in enumerate(sorted(ALPHABET))}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}
INDEX_ALPHABET = idx_to_char.keys()
char_to_idx

{'A': 0,
 'C': 1,
 'D': 2,
 'E': 3,
 'F': 4,
 'G': 5,
 'H': 6,
 'I': 7,
 'K': 8,
 'L': 9,
 'M': 10,
 'M(Oxidation)': 11,
 'N': 12,
 'P': 13,
 'Q': 14,
 'R': 15,
 'S': 16,
 'T': 17,
 'V': 18,
 'W': 19,
 'Y': 20,
 '_': 21}

In [18]:
Parquet2DatasetFileProcessor(
    training_data_columns=TRAINING_DATA_COLUMNS,
    target_data_columns=TARGET_DATA_COLUMNS,
    padding_lengths=PADDING_LENGTHS,
    padding_characters=PADDING_CHARACTERS,
    column_normalizations=NORMALIZATION,
    dataset_dump_path_prefix=DATASET_DUMP_PATH,
    char_to_idx_mapping_functions={
        SEQ: char_to_idx.get
    },
    item_count=len(MZMLID_FILE_PATHS),
    skip_existing=True,
    split_on_column_values_of=SPLIT_VALUE_COLUMNS,
    logger=logger
).process(parquet_file_paths=MZMLID_FILE_PATHS,
          thread_count=2)[:3]

INFO: Processing item 1/40: '../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet'
INFO: Processing item 11/40: '../dumps/PXD010000/training_columns/Biodiversity_A_tumefaciens_R2A_aerobic_1_23Nov16_Pippin_16-09-11_mzmlid.parquet'
INFO: Processing item 21/40: '../dumps/PXD010000/training_columns/Biodiversity_B_cereus_PN_L_CL_3_09Oct16_Pippin_16-05-06_mzmlid.parquet'
INFO: Processing item 31/40: '../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_CMcarb_anaerobic_03_01Feb16_Arwen_15-07-13_mzmlid.parquet'
INFO: No mzmlid parquet files were parquet2tf_dataset-processed
INFO: Encountered 0 exceptions during processing


[]

## Loading Tensorflow Datasets

In [19]:
TRAINING_DATA_TYPES = {path.split(os.path.sep)[-1] for path in glob.glob(
    os.path.join(
        DATASET_DUMP_PATH, 
        '*',  # filename
        '*',  # species
        '*'   # istrain
    ))}
TRAINING_DATA_TYPES

{'Train'}

In [20]:
dataset_file_paths = {training_data_type: glob.glob(os.path.join(DATASET_DUMP_PATH, '*', '*', training_data_type))
for training_data_type in TRAINING_DATA_TYPES}

for training_data_type, paths in dataset_file_paths.items():
    print(f"#{training_data_type} = {len(paths)}")
    print(f"e.g.: {paths[0]}")
    print()

#Train = 40
e.g.: ../dumps/PXD010000/training_columns/tf_datasets/Biodiversity_B_subtilis_NCIB3610_24h_plates_1_13Jun16_Pippin_16-03-39_mzmlid.parquet/Bacillus_subtilis_NCIB3610/Train



In [21]:
element_spec = ((tf.TensorSpec(shape=(PADDING_LENGTHS[MZ],), dtype=tf.float32), 
  tf.TensorSpec(shape=(PADDING_LENGTHS[INT],), dtype=tf.float32)),
(tf.TensorSpec(shape=(PADDING_LENGTHS[SEQ],), dtype=tf.int8)))
element_spec

((TensorSpec(shape=(1845,), dtype=tf.float32, name=None),
  TensorSpec(shape=(1845,), dtype=tf.float32, name=None)),
 TensorSpec(shape=(50,), dtype=tf.int8, name=None))

In [22]:
typed_datasets = {
    training_data_type: [
        tf.data.experimental.load(path=path, element_spec=element_spec, compression='GZIP') for path in paths
    ] for training_data_type, paths in dataset_file_paths.items()
}

typed_datasets

{'Train': [<_LoadDataset shapes: (((1845,), (1845,)), (50,)), types: ((tf.float32, tf.float32), tf.int8)>,
  <_LoadDataset shapes: (((1845,), (1845,)), (50,)), types: ((tf.float32, tf.float32), tf.int8)>,
  <_LoadDataset shapes: (((1845,), (1845,)), (50,)), types: ((tf.float32, tf.float32), tf.int8)>,
  <_LoadDataset shapes: (((1845,), (1845,)), (50,)), types: ((tf.float32, tf.float32), tf.int8)>,
  <_LoadDataset shapes: (((1845,), (1845,)), (50,)), types: ((tf.float32, tf.float32), tf.int8)>,
  <_LoadDataset shapes: (((1845,), (1845,)), (50,)), types: ((tf.float32, tf.float32), tf.int8)>,
  <_LoadDataset shapes: (((1845,), (1845,)), (50,)), types: ((tf.float32, tf.float32), tf.int8)>,
  <_LoadDataset shapes: (((1845,), (1845,)), (50,)), types: ((tf.float32, tf.float32), tf.int8)>,
  <_LoadDataset shapes: (((1845,), (1845,)), (50,)), types: ((tf.float32, tf.float32), tf.int8)>,
  <_LoadDataset shapes: (((1845,), (1845,)), (50,)), types: ((tf.float32, tf.float32), tf.int8)>,
  <_LoadDat

## Concatenating Tensorflow Datasets

In [23]:
# for manual splits
def split_dataset(dataset, fraction):
    split_value = int(len(dataset) * fraction)
    a = dataset.take(split_value)
    b = dataset.skip(split_value)
    return a, b

In [24]:
BATCH_SIZE = 256
SHUFFLE_BUFFER_SIZE = 10_000_000

In [25]:
CACHED_DATASET_DUMP_PATH = os.path.join(DATASET_DUMP_PATH, "cache")
shutil.rmtree(CACHED_DATASET_DUMP_PATH)
utils.ensure_dir_exists(CACHED_DATASET_DUMP_PATH)
CACHED_DATASET_DUMP_PATH

'../dumps/PXD010000/training_columns/tf_datasets/cache'

In [26]:
def concatenate_datasets(datasets: List[tf.data.Dataset]) -> tf.data.Dataset:
    dataset = datasets[0]
    for ds in datasets[1:]:
        dataset = dataset.concatenate(ds)
    return dataset

merged_datasets = {
    training_data_type: concatenate_datasets(datasets)
        .cache(os.path.join(CACHED_DATASET_DUMP_PATH, training_data_type))
    for training_data_type, datasets in typed_datasets.items()
}

merged_datasets

{'Train': <CacheDataset shapes: (((1845,), (1845,)), (50,)), types: ((tf.float32, tf.float32), tf.int8)>}

In [29]:
def get_minimal_model():
    input_layers = [tf.keras.layers.Input(shape=(PADDING_LENGTHS[col],)) for col in TRAINING_DATA_COLUMNS]
    
    x = input_layers[0]
    for input_layer in input_layers[1:]:
        x = x + input_layer
    
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(PADDING_LENGTHS[SEQ]*len(ALPHABET))(x)
    x = tf.reshape(x,(-1, PADDING_LENGTHS[SEQ], len(ALPHABET)))
    x = tf.keras.activations.softmax(x)
    model = tf.keras.Model(input_layers,x)
    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy())
    return model
        
def fill_cache(dataset):
    model = get_minimal_model()
    model.fit(dataset.batch(BATCH_SIZE), epochs=1)
    return dataset

In [30]:
merged_datasets = {
    training_data_type: fill_cache(dataset)
        .shuffle(SHUFFLE_BUFFER_SIZE, reshuffle_each_iteration=True)
        .batch(BATCH_SIZE, drop_remainder=True)
    for training_data_type, dataset in merged_datasets.items()
}



In [31]:
TRAINING_TYPE = 'Train'

## Building the Tensorflow Model

In [32]:
input_layers = [tf.keras.layers.Input(shape=(PADDING_LENGTHS[col],)) for col in TRAINING_DATA_COLUMNS]
input_layers

[<KerasTensor: shape=(None, 1845) dtype=float32 (created by layer 'input_5')>,
 <KerasTensor: shape=(None, 1845) dtype=float32 (created by layer 'input_6')>]

In [33]:
x = input_layers[0]
for input_layer in input_layers[1:]:
    x = x + input_layer

x = tf.keras.layers.Flatten()(x)

x = tf.keras.layers.Dense(PADDING_LENGTHS[SEQ]*len(ALPHABET))(x)

x = tf.reshape(x,(-1, PADDING_LENGTHS[SEQ], len(ALPHABET)))

x = tf.keras.activations.softmax(x)

model = tf.keras.Model(input_layers,x)
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.SparseCategoricalCrossentropy())
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 1845)]       0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 1845)]       0                                            
__________________________________________________________________________________________________
tf.__operators__.add_2 (TFOpLam (None, 1845)         0           input_5[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
flatten_2 (Flatten)             (None, 1845)         0           tf.__operators__.add_2[0][0

## Training the Tensorflow Model

In [34]:
model.fit(merged_datasets[TRAINING_TYPE], epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f3b2452e7b8>

## Evaluating the Tensorflow Model