# Training an ML Model on Tensorflow Datasets
## Prerequisites

In [1]:
import glob
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import numpy as np
import pandas as pd
import tensorflow as tf

from mmproteo.utils import log, utils, visualization
from mmproteo.utils.formats.mz import MzmlidFileStatsCreator
from mmproteo.utils.formats.tf_dataset import Parquet2DatasetFileProcessor

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

In [3]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

## Configuration

In [4]:
pwd

'/hpi/fs00/home/mirko.krause/masterthesis/pride-downloader/notebooks'

In [5]:
PROJECT = "PXD010000"

In [6]:
DUMP_PATH = "/scratch/mirko.krause/pdeep"
TRAINING_COLUMNS_DUMP_PATH = os.path.join(DUMP_PATH, "training_columns")
FILES_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "file_*.parquet")
THREAD_COUNT=32
SPLIT_VALUE_COLUMNS = None

In [7]:
STATISTICS_FILE_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "statistics.parquet")
DATASET_DUMP_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "tf_datasets")
PROCESSING_FILE_PATH = os.path.join(DATASET_DUMP_PATH, "processing_info.json")

SEQ = 'peptide_sequence'
MZ = 'mz_array'
INT = 'intensity_array'

TRAINING_DATA_COLUMNS = [MZ, INT]
TARGET_DATA_COLUMNS = [SEQ]

PADDING_CHARACTERS = {
    SEQ: '_',
    MZ: 0.0,
    INT: 0.0,
}

In [8]:
utils.ensure_dir_exists(DATASET_DUMP_PATH)

In [9]:
logger = log.create_logger(
    name='mmproteo_dataset_generation',
    verbose=True,
    log_dir=DATASET_DUMP_PATH,
)

2021-07-22 12:21:44,569 - mmproteo_dataset_generation: Logging to file '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/mmproteo_dataset_generation.log' and to stderr


In [10]:
MZMLID_FILE_PATHS = glob.glob(FILES_PATH)
print(len(MZMLID_FILE_PATHS))
MZMLID_FILE_PATHS[0]

26


'/scratch/mirko.krause/pdeep/training_columns/file_10.parquet'

## Calculating Statistics over all MZMLID Files

In [11]:
file_stats = MzmlidFileStatsCreator(
    mzmlid_file_paths=MZMLID_FILE_PATHS,
    statistics_file_path=STATISTICS_FILE_PATH,
    seq_col_name=SEQ,
    int_col_name=INT,
    logger=logger
).process(thread_count=THREAD_COUNT)
print(len(file_stats))
file_stats.head(2)

26


2021-07-22 12:21:44,615 - mmproteo_dataset_generation: loaded previous statistics file '/scratch/mirko.krause/pdeep/training_columns/statistics.parquet'


Unnamed: 0,file_path,max_sequence_length,max_array_length,alphabet,item_count
0,/scratch/mirko.krause/pdeep/training_columns/file_10.parquet,30,88,"{F, N, D, T, V, Q, L, Y, M, I, S, K, C, E, R, H, G, A, W, P}",999596
1,/scratch/mirko.krause/pdeep/training_columns/file_11.parquet,30,89,"{F, N, D, T, V, Q, L, Y, M, I, S, K, C, E, R, H, G, A, W, P}",999486


In [12]:
PADDING_LENGTHS = {
    MZ: file_stats.max_array_length.max(),
    INT: file_stats.max_array_length.max(),
    SEQ: file_stats.max_sequence_length.max()
}

print("padding lengths =", PADDING_LENGTHS)

TOTAL_ITEM_COUNT = file_stats.item_count.sum()
print(f"TOTAL_ITEM_COUNT = {TOTAL_ITEM_COUNT}")

ALPHABET = set.union(*file_stats.alphabet)
print(f"ALPHABET = {', '.join(sorted(ALPHABET))}")

padding lengths = {'mz_array': 89, 'intensity_array': 89, 'peptide_sequence': 30}
TOTAL_ITEM_COUNT = 25142457
ALPHABET = A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y


## Data Normalization, Padding, and Conversion to Tensorflow Datasets

In [13]:
def l2_normalize(values: np.ndarray) -> np.ndarray:
    return tf.keras.utils.normalize(x=values, order=2)

def base_peak_normalize(values: np.ndarray) -> np.ndarray:
    return values / values.max(initial=0)

# by Tom, probably
# don't know, what it's based on
def ion_current_normalize(intensities: np.ndarray) -> np.ndarray:
    total_sum = np.sum(intensities**2)
    normalized = intensities/total_sum
    return normalized

NORMALIZATION = {
    INT: base_peak_normalize
}

In [14]:
ALPHABET.add(PADDING_CHARACTERS[SEQ])

In [15]:
char_to_idx = {char: idx for idx, char in enumerate(sorted(ALPHABET))}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}
INDEX_ALPHABET = idx_to_char.keys()
char_to_idx

{'A': 0,
 'C': 1,
 'D': 2,
 'E': 3,
 'F': 4,
 'G': 5,
 'H': 6,
 'I': 7,
 'K': 8,
 'L': 9,
 'M': 10,
 'N': 11,
 'P': 12,
 'Q': 13,
 'R': 14,
 'S': 15,
 'T': 16,
 'V': 17,
 'W': 18,
 'Y': 19,
 '_': 20}

In [16]:
processing_results = Parquet2DatasetFileProcessor(
    training_data_columns=TRAINING_DATA_COLUMNS,
    target_data_columns=TARGET_DATA_COLUMNS,
    padding_lengths=PADDING_LENGTHS,
    padding_characters=PADDING_CHARACTERS,
    column_normalizations=NORMALIZATION,
    dataset_dump_path_prefix=DATASET_DUMP_PATH,
    char_to_idx_mapping_functions={
        SEQ: char_to_idx.get
    },
    item_count=len(MZMLID_FILE_PATHS),
    skip_existing=False,
    split_on_column_values_of=SPLIT_VALUE_COLUMNS,
    logger=logger
).process(parquet_file_paths=MZMLID_FILE_PATHS,
          thread_count=int(THREAD_COUNT/2),
          keep_exceptions_as=True,
         )
processing_results[:3]

2021-07-22 12:21:44,643 - mmproteo_dataset_generation: DEBUG: Processing items with 16 subprocesses
2021-07-22 12:21:44,729 - mmproteo_dataset_generation: Preprocessing item 1/26: '/scratch/mirko.krause/pdeep/training_columns/file_10.parquet'
2021-07-22 12:21:44,729 - mmproteo_dataset_generation: DEBUG: Preprocessing item 2/26: '/scratch/mirko.krause/pdeep/training_columns/file_11.parquet'
2021-07-22 12:21:44,729 - mmproteo_dataset_generation: DEBUG: Preprocessing item 3/26: '/scratch/mirko.krause/pdeep/training_columns/file_15.parquet'
2021-07-22 12:21:44,730 - mmproteo_dataset_generation: DEBUG: Preprocessing item 7/26: '/scratch/mirko.krause/pdeep/training_columns/file_6.parquet'
2021-07-22 12:21:44,731 - mmproteo_dataset_generation: DEBUG: Preprocessing item 8/26: '/scratch/mirko.krause/pdeep/training_columns/file_17.parquet'
2021-07-22 12:21:44,730 - mmproteo_dataset_generation: DEBUG: Preprocessing item 6/26: '/scratch/mirko.krause/pdeep/training_columns/file_9.parquet'
2021-07-2

2021-07-22 12:21:46,868 - mmproteo_dataset_generation: DEBUG: storing 1 df split from '/scratch/mirko.krause/pdeep/training_columns/file_5.parquet'
2021-07-22 12:21:46,867 - mmproteo_dataset_generation: DEBUG: finished reading '/scratch/mirko.krause/pdeep/training_columns/file_16.parquet' file
2021-07-22 12:21:46,869 - mmproteo_dataset_generation: DEBUG: skipped splitting df for '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_16.parquet' by column values
2021-07-22 12:21:46,870 - mmproteo_dataset_generation: DEBUG: storing 1 df split from '/scratch/mirko.krause/pdeep/training_columns/file_16.parquet'
2021-07-22 12:21:46,919 - mmproteo_dataset_generation: DEBUG: finished reading '/scratch/mirko.krause/pdeep/training_columns/file_17.parquet' file
2021-07-22 12:21:46,920 - mmproteo_dataset_generation: DEBUG: skipped splitting df for '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_17.parquet' by column values
2021-07-22 12:21:46,921 - mmproteo_dataset_generati

2021-07-22 12:21:59,904 - mmproteo_dataset_generation: DEBUG: padded df '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_16.parquet'
2021-07-22 12:21:59,941 - mmproteo_dataset_generation: DEBUG: padded df '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_20.parquet'
2021-07-22 12:22:06,161 - mmproteo_dataset_generation: DEBUG: mapped sequences to indices for '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_25.parquet'
2021-07-22 12:22:06,285 - mmproteo_dataset_generation: DEBUG: mapped sequences to indices for '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_3.parquet'
2021-07-22 12:22:06,312 - mmproteo_dataset_generation: DEBUG: mapped sequences to indices for '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_10.parquet'
2021-07-22 12:22:06,324 - mmproteo_dataset_generation: DEBUG: mapped sequences to indices for '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_7.parquet'
2021-07-22 12:22:06,421 - mmprot

2021-07-22 12:22:18,801 - mmproteo_dataset_generation: DEBUG: created TF dataset from stacked df '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_21.parquet'
2021-07-22 12:22:38,696 - mmproteo_dataset_generation: DEBUG: saved TF dataset to /scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_5.parquet
2021-07-22 12:22:38,697 - mmproteo_dataset_generation: DEBUG: TF dataset element spec: ((TensorSpec(shape=(89,), dtype=tf.float64, name=None), TensorSpec(shape=(89,), dtype=tf.float64, name=None)), (TensorSpec(shape=(30,), dtype=tf.int8, name=None),))
2021-07-22 12:22:38,890 - mmproteo_dataset_generation: DEBUG: saved TF dataset to /scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_11.parquet
2021-07-22 12:22:38,892 - mmproteo_dataset_generation: DEBUG: TF dataset element spec: ((TensorSpec(shape=(89,), dtype=tf.float64, name=None), TensorSpec(shape=(89,), dtype=tf.float64, name=None)), (TensorSpec(shape=(30,), dtype=tf.int8, name=None),))
2021-07-22 12:2

2021-07-22 12:22:40,523 - mmproteo_dataset_generation: DEBUG: saved TF dataset to /scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_17.parquet
2021-07-22 12:22:40,525 - mmproteo_dataset_generation: DEBUG: TF dataset element spec: ((TensorSpec(shape=(89,), dtype=tf.float64, name=None), TensorSpec(shape=(89,), dtype=tf.float64, name=None)), (TensorSpec(shape=(30,), dtype=tf.int8, name=None),))
2021-07-22 12:22:40,581 - mmproteo_dataset_generation: DEBUG: saved TF dataset to /scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_24.parquet
2021-07-22 12:22:40,585 - mmproteo_dataset_generation: DEBUG: TF dataset element spec: ((TensorSpec(shape=(89,), dtype=tf.float64, name=None), TensorSpec(shape=(89,), dtype=tf.float64, name=None)), (TensorSpec(shape=(30,), dtype=tf.int8, name=None),))
2021-07-22 12:22:40,627 - mmproteo_dataset_generation: DEBUG: Finished preprocessing item 16/26: '/scratch/mirko.krause/pdeep/training_columns/file_16.parquet'
2021-07-22 12:22:40,672 - 

2021-07-22 12:22:54,219 - mmproteo_dataset_generation: DEBUG: padded df '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_13.parquet'
2021-07-22 12:22:54,370 - mmproteo_dataset_generation: DEBUG: padded df '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_8.parquet'
2021-07-22 12:22:54,443 - mmproteo_dataset_generation: DEBUG: padded df '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_12.parquet'
2021-07-22 12:22:54,554 - mmproteo_dataset_generation: DEBUG: padded df '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_19.parquet'
2021-07-22 12:22:54,850 - mmproteo_dataset_generation: DEBUG: padded df '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_4.parquet'
2021-07-22 12:22:54,916 - mmproteo_dataset_generation: DEBUG: padded df '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_14.parquet'
2021-07-22 12:22:55,082 - mmproteo_dataset_generation: DEBUG: padded df '/scratch/mirko.krause/pdeep/training_columns/tf

2021-07-22 12:23:34,412 - mmproteo_dataset_generation: DEBUG: saved TF dataset to /scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_13.parquet
2021-07-22 12:23:34,414 - mmproteo_dataset_generation: DEBUG: TF dataset element spec: ((TensorSpec(shape=(89,), dtype=tf.float64, name=None), TensorSpec(shape=(89,), dtype=tf.float64, name=None)), (TensorSpec(shape=(30,), dtype=tf.int8, name=None),))
2021-07-22 12:23:34,472 - mmproteo_dataset_generation: DEBUG: Finished preprocessing item 25/26: '/scratch/mirko.krause/pdeep/training_columns/file_4.parquet'
2021-07-22 12:23:34,476 - mmproteo_dataset_generation: DEBUG: saved TF dataset to /scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_19.parquet
2021-07-22 12:23:34,489 - mmproteo_dataset_generation: DEBUG: TF dataset element spec: ((TensorSpec(shape=(89,), dtype=tf.float64, name=None), TensorSpec(shape=(89,), dtype=tf.float64, name=None)), (TensorSpec(shape=(30,), dtype=tf.int8, name=None),))
2021-07-22 12:23:34,945 - m

[{'dataset_path': '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_10.parquet',
  'element_spec': ((TensorSpec(shape=(89,), dtype=tf.float64, name=None),
    TensorSpec(shape=(89,), dtype=tf.float64, name=None)),
   (TensorSpec(shape=(30,), dtype=tf.int8, name=None),))},
 {'dataset_path': '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_11.parquet',
  'element_spec': ((TensorSpec(shape=(89,), dtype=tf.float64, name=None),
    TensorSpec(shape=(89,), dtype=tf.float64, name=None)),
   (TensorSpec(shape=(30,), dtype=tf.int8, name=None),))},
 {'dataset_path': '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_15.parquet',
  'element_spec': ((TensorSpec(shape=(89,), dtype=tf.float64, name=None),
    TensorSpec(shape=(89,), dtype=tf.float64, name=None)),
   (TensorSpec(shape=(30,), dtype=tf.int8, name=None),))}]

In [17]:
pd.DataFrame(processing_results)

Unnamed: 0,dataset_path,element_spec
0,/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_10.parquet,"((TensorSpec(shape=(89,), dtype=tf.float64, name=None), TensorSpec(shape=(89,), dtype=tf.float64, name=None)), (TensorSpec(shape=(30,), dtype=tf.int8, name=None),))"
1,/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_11.parquet,"((TensorSpec(shape=(89,), dtype=tf.float64, name=None), TensorSpec(shape=(89,), dtype=tf.float64, name=None)), (TensorSpec(shape=(30,), dtype=tf.int8, name=None),))"
2,/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_15.parquet,"((TensorSpec(shape=(89,), dtype=tf.float64, name=None), TensorSpec(shape=(89,), dtype=tf.float64, name=None)), (TensorSpec(shape=(30,), dtype=tf.int8, name=None),))"
3,/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_24.parquet,"((TensorSpec(shape=(89,), dtype=tf.float64, name=None), TensorSpec(shape=(89,), dtype=tf.float64, name=None)), (TensorSpec(shape=(30,), dtype=tf.int8, name=None),))"
4,/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_22.parquet,"((TensorSpec(shape=(89,), dtype=tf.float64, name=None), TensorSpec(shape=(89,), dtype=tf.float64, name=None)), (TensorSpec(shape=(30,), dtype=tf.int8, name=None),))"
5,/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_9.parquet,"((TensorSpec(shape=(89,), dtype=tf.float64, name=None), TensorSpec(shape=(89,), dtype=tf.float64, name=None)), (TensorSpec(shape=(30,), dtype=tf.int8, name=None),))"
6,/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_6.parquet,"((TensorSpec(shape=(89,), dtype=tf.float64, name=None), TensorSpec(shape=(89,), dtype=tf.float64, name=None)), (TensorSpec(shape=(30,), dtype=tf.int8, name=None),))"
7,/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_17.parquet,"((TensorSpec(shape=(89,), dtype=tf.float64, name=None), TensorSpec(shape=(89,), dtype=tf.float64, name=None)), (TensorSpec(shape=(30,), dtype=tf.int8, name=None),))"
8,/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_7.parquet,"((TensorSpec(shape=(89,), dtype=tf.float64, name=None), TensorSpec(shape=(89,), dtype=tf.float64, name=None)), (TensorSpec(shape=(30,), dtype=tf.int8, name=None),))"
9,/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_18.parquet,"((TensorSpec(shape=(89,), dtype=tf.float64, name=None), TensorSpec(shape=(89,), dtype=tf.float64, name=None)), (TensorSpec(shape=(30,), dtype=tf.int8, name=None),))"


In [19]:
processing_configuration = {
    'padding_characters': PADDING_CHARACTERS,
    'padding_lengths': PADDING_LENGTHS,
    'idx_to_char': idx_to_char,
    'normalization': NORMALIZATION,
    'split_value_columns': SPLIT_VALUE_COLUMNS,
    'training_data_columns': TRAINING_DATA_COLUMNS,
    'target_data_columns': TARGET_DATA_COLUMNS,
    'element_spec': None if len(processing_results) == 0 else repr(processing_results[0]['element_spec'])
}

processing_configuration = utils.denumpyfy(processing_configuration)

print(visualization.pretty_print_json(processing_configuration))

{
    "padding_characters": {
        "peptide_sequence": "_",
        "mz_array": 0.0,
        "intensity_array": 0.0
    },
    "padding_lengths": {
        "mz_array": 89,
        "intensity_array": 89,
        "peptide_sequence": 30
    },
    "idx_to_char": {
        "0": "A",
        "1": "C",
        "2": "D",
        "3": "E",
        "4": "F",
        "5": "G",
        "6": "H",
        "7": "I",
        "8": "K",
        "9": "L",
        "10": "M",
        "11": "N",
        "12": "P",
        "13": "Q",
        "14": "R",
        "15": "S",
        "16": "T",
        "17": "V",
        "18": "W",
        "19": "Y",
        "20": "_"
    },
    "normalization": {
        "intensity_array": "<function base_peak_normalize at 0x7f9e3ee96af0>"
    },
    "split_value_columns": null,
    "training_data_columns": [
        "mz_array",
        "intensity_array"
    ],
    "target_data_columns": [
        "peptide_sequence"
    ],
    "element_spec": "((TensorSpec(shape=(89,), dtype

In [20]:
if len(processing_results) > 0:
    # otherwise the previously stored known element_spec might get overwritten
    with open(PROCESSING_FILE_PATH, 'w') as file:
        file.write(visualization.pretty_print_json(processing_configuration))
        print(f"wrote processing info to '{PROCESSING_FILE_PATH}'")
else:
    print("DID NOT STORE THE NEW RESULTS")

wrote processing info to '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/processing_info.json'
