# Training an ML Model on Tensorflow Datasets
## Prerequisites

In [1]:
import glob
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import numpy as np
import pandas as pd
import tensorflow as tf

from mmproteo.utils import log, utils, visualization
from mmproteo.utils.formats.mz import MzmlidFileStatsCreator
from mmproteo.utils.formats.tf_dataset import Parquet2DatasetFileProcessor

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

In [3]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

## Configuration

In [4]:
pwd

'/hpi/fs00/home/mirko.krause/masterthesis/pride-downloader/notebooks'

In [5]:
PROJECT = "PXD010000"

In [6]:
DUMP_PATH = "/scratch/mirko.krause/pdeep"
TRAINING_COLUMNS_DUMP_PATH = os.path.join(DUMP_PATH, "training_columns")
FILES_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "file_*.parquet")
THREAD_COUNT=32
SPLIT_VALUE_COLUMNS = None

In [7]:
STATISTICS_FILE_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "statistics.parquet")
DATASET_DUMP_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "tf_datasets")
PROCESSING_FILE_PATH = os.path.join(DATASET_DUMP_PATH, "processing_info.json")

SEQ = 'peptide_sequence'
MZ = 'mz_array'
INT = 'intensity_array'

TRAINING_DATA_COLUMNS = [MZ, INT]
TARGET_DATA_COLUMNS = [SEQ]

PADDING_CHARACTERS = {
    SEQ: '_',
    MZ: 0.0,
    INT: 0.0,
}

In [8]:
utils.ensure_dir_exists(DATASET_DUMP_PATH)

In [9]:
logger = log.create_logger(
    name='mmproteo_dataset_generation',
    verbose=True,
    log_dir=DATASET_DUMP_PATH,
)

2021-07-20 16:53:32,761 - mmproteo_dataset_generation: Logging to file '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/mmproteo_dataset_generation.log' and to stderr


In [10]:
MZMLID_FILE_PATHS = glob.glob(FILES_PATH)
print(len(MZMLID_FILE_PATHS))
MZMLID_FILE_PATHS[0]

26


'/scratch/mirko.krause/pdeep/training_columns/file_10.parquet'

## Calculating Statistics over all MZMLID Files

In [11]:
file_stats = MzmlidFileStatsCreator(
    mzmlid_file_paths=MZMLID_FILE_PATHS,
    statistics_file_path=STATISTICS_FILE_PATH,
    seq_col_name=SEQ,
    int_col_name=INT,
    logger=logger
).process(thread_count=THREAD_COUNT)
print(len(file_stats))
file_stats.head(2)

26


2021-07-20 16:53:32,806 - mmproteo_dataset_generation: loaded previous statistics file '/scratch/mirko.krause/pdeep/training_columns/statistics.parquet'


Unnamed: 0,file_path,max_sequence_length,max_array_length,alphabet,item_count
0,/scratch/mirko.krause/pdeep/training_columns/file_10.parquet,30,88,"{Q, L, G, R, W, N, I, S, V, P, Y, H, A, T, E, F, M, C, K, D}",999596
1,/scratch/mirko.krause/pdeep/training_columns/file_11.parquet,30,89,"{Q, L, G, R, W, N, I, S, V, P, Y, H, A, T, E, F, M, C, K, D}",999486


In [12]:
PADDING_LENGTHS = {
    MZ: file_stats.max_array_length.max(),
    INT: file_stats.max_array_length.max(),
    SEQ: file_stats.max_sequence_length.max()
}

print("padding lengths =", PADDING_LENGTHS)

TOTAL_ITEM_COUNT = file_stats.item_count.sum()
print(f"TOTAL_ITEM_COUNT = {TOTAL_ITEM_COUNT}")

ALPHABET = set.union(*file_stats.alphabet)
print(f"ALPHABET = {', '.join(sorted(ALPHABET))}")

padding lengths = {'mz_array': 89, 'intensity_array': 89, 'peptide_sequence': 30}
TOTAL_ITEM_COUNT = 25142457
ALPHABET = A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y


## Data Normalization, Padding, and Conversion to Tensorflow Datasets

In [13]:
def l2_normalize(values: np.ndarray) -> np.ndarray:
    return tf.keras.utils.normalize(x=values, order=2)

def base_peak_normalize(values: np.ndarray) -> np.ndarray:
    return values / values.max(initial=0)

# by Tom, probably
# don't know, what it's based on
def ion_current_normalize(intensities: np.ndarray) -> np.ndarray:
    total_sum = np.sum(intensities**2)
    normalized = intensities/total_sum
    return normalized

NORMALIZATION = {
    INT: base_peak_normalize
}

In [14]:
ALPHABET.add(PADDING_CHARACTERS[SEQ])

In [15]:
char_to_idx = {char: idx for idx, char in enumerate(sorted(ALPHABET))}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}
INDEX_ALPHABET = idx_to_char.keys()
char_to_idx

{'A': 0,
 'C': 1,
 'D': 2,
 'E': 3,
 'F': 4,
 'G': 5,
 'H': 6,
 'I': 7,
 'K': 8,
 'L': 9,
 'M': 10,
 'N': 11,
 'P': 12,
 'Q': 13,
 'R': 14,
 'S': 15,
 'T': 16,
 'V': 17,
 'W': 18,
 'Y': 19,
 '_': 20}

In [16]:
Parquet2DatasetFileProcessor(
    training_data_columns=TRAINING_DATA_COLUMNS,
    target_data_columns=TARGET_DATA_COLUMNS,
    padding_lengths=PADDING_LENGTHS,
    padding_characters=PADDING_CHARACTERS,
    column_normalizations=NORMALIZATION,
    dataset_dump_path_prefix=DATASET_DUMP_PATH,
    char_to_idx_mapping_functions={
        SEQ: char_to_idx.get
    },
    item_count=len(MZMLID_FILE_PATHS),
    skip_existing=True,
    split_on_column_values_of=SPLIT_VALUE_COLUMNS,
    logger=logger
).process(parquet_file_paths=MZMLID_FILE_PATHS,
          thread_count=int(THREAD_COUNT/2),
          keep_exceptions_as=True,
         )[:3]

2021-07-20 16:53:32,836 - mmproteo_dataset_generation: DEBUG: Processing items with 16 subprocesses
2021-07-20 16:53:32,919 - mmproteo_dataset_generation: DEBUG: Preprocessing item 2/26: '/scratch/mirko.krause/pdeep/training_columns/file_11.parquet'
2021-07-20 16:53:32,919 - mmproteo_dataset_generation: Preprocessing item 1/26: '/scratch/mirko.krause/pdeep/training_columns/file_10.parquet'
2021-07-20 16:53:32,919 - mmproteo_dataset_generation: DEBUG: Preprocessing item 3/26: '/scratch/mirko.krause/pdeep/training_columns/file_15.parquet'
2021-07-20 16:53:32,920 - mmproteo_dataset_generation: DEBUG: Preprocessing item 6/26: '/scratch/mirko.krause/pdeep/training_columns/file_9.parquet'
2021-07-20 16:53:32,920 - mmproteo_dataset_generation: DEBUG: Preprocessing item 7/26: '/scratch/mirko.krause/pdeep/training_columns/file_6.parquet'
2021-07-20 16:53:32,920 - mmproteo_dataset_generation: DEBUG: Preprocessing item 8/26: '/scratch/mirko.krause/pdeep/training_columns/file_17.parquet'
2021-07-2

2021-07-20 16:53:33,092 - mmproteo_dataset_generation: DEBUG: Skipped '/scratch/mirko.krause/pdeep/training_columns/file_13.parquet' because '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_13.parquet' already exists
2021-07-20 16:53:33,092 - mmproteo_dataset_generation: DEBUG: Skipped '/scratch/mirko.krause/pdeep/training_columns/file_19.parquet' because '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_19.parquet' already exists
2021-07-20 16:53:33,092 - mmproteo_dataset_generation: DEBUG: Skipped '/scratch/mirko.krause/pdeep/training_columns/file_14.parquet' because '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/file_14.parquet' already exists
2021-07-20 16:53:33,092 - mmproteo_dataset_generation: DEBUG: Preprocessing item 24/26: '/scratch/mirko.krause/pdeep/training_columns/file_23.parquet'
2021-07-20 16:53:33,092 - mmproteo_dataset_generation: DEBUG: Preprocessing item 25/26: '/scratch/mirko.krause/pdeep/training_columns/file_4.parquet'
2021-0

[]

In [17]:
processing_configuration = {
    'padding_characters': PADDING_CHARACTERS,
    'padding_lengths': PADDING_LENGTHS,
    'idx_to_char': idx_to_char,
    'normalization': NORMALIZATION,
    'split_value_columns': SPLIT_VALUE_COLUMNS,
    'training_data_columns': TRAINING_DATA_COLUMNS,
    'target_data_columns': TARGET_DATA_COLUMNS
}

In [18]:
processing_configuration = utils.denumpyfy(processing_configuration)

In [19]:
print(visualization.pretty_print_json(processing_configuration))

{
    "padding_characters": {
        "peptide_sequence": "_",
        "mz_array": 0.0,
        "intensity_array": 0.0
    },
    "padding_lengths": {
        "mz_array": 89,
        "intensity_array": 89,
        "peptide_sequence": 30
    },
    "idx_to_char": {
        "0": "A",
        "1": "C",
        "2": "D",
        "3": "E",
        "4": "F",
        "5": "G",
        "6": "H",
        "7": "I",
        "8": "K",
        "9": "L",
        "10": "M",
        "11": "N",
        "12": "P",
        "13": "Q",
        "14": "R",
        "15": "S",
        "16": "T",
        "17": "V",
        "18": "W",
        "19": "Y",
        "20": "_"
    },
    "normalization": {
        "intensity_array": "<function base_peak_normalize at 0x7fc52ee681f0>"
    },
    "split_value_columns": null,
    "training_data_columns": [
        "mz_array",
        "intensity_array"
    ],
    "target_data_columns": [
        "peptide_sequence"
    ]
}


In [20]:
with open(PROCESSING_FILE_PATH, 'w') as file:
    file.write(visualization.pretty_print_json(processing_configuration))
    print(f"wrote processing info to '{PROCESSING_FILE_PATH}'")

wrote processing info to '/scratch/mirko.krause/pdeep/training_columns/tf_datasets/processing_info.json'
