# Training an ML Model on Tensorflow Datasets
## Prerequisites

In [1]:
import glob
import os

import numpy as np
import pandas as pd
import tensorflow as tf

from mmproteo.utils import log, utils, visualization
from mmproteo.utils.formats.mz import MzmlidFileStatsCreator
from mmproteo.utils.formats.tf_dataset import Parquet2DatasetFileProcessor

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

In [3]:
logger = log.DummyLogger(verbose=False)

INFO: Printing to Stdout


## Configuration

In [4]:
pwd

'/tf/workspace/notebooks'

In [5]:
PROJECT = "PXD010000"
DUMP_PATH = os.path.join("..", "dumps", PROJECT)
TRAINING_COLUMNS_DUMP_PATH = os.path.join(DUMP_PATH, "training_columns")
FILES_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "*_mzmlid.parquet")
STATISTICS_FILE_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "statistics.parquet")
DATASET_DUMP_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "tf_datasets")
PROCESSING_FILE_PATH = os.path.join(DATASET_DUMP_PATH, "processing_info.json")

SEQ = 'peptide_sequence'
MZ = 'mz_array'
INT = 'intensity_array'

TRAINING_DATA_COLUMNS = [MZ, INT]
TARGET_DATA_COLUMNS = [SEQ]
SPLIT_VALUE_COLUMNS = ['species', 'istrain']

PADDING_CHARACTERS = {
    SEQ: '_',
    MZ: 0.0,
    INT: 0.0,
}

In [6]:
MZMLID_FILE_PATHS = glob.glob(FILES_PATH)
print(len(MZMLID_FILE_PATHS))
MZMLID_FILE_PATHS[0]

235


'../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet'

## Calculating Statistics over all MZMLID Files

In [7]:
file_stats = MzmlidFileStatsCreator(
    mzmlid_file_paths=MZMLID_FILE_PATHS,
    statistics_file_path=STATISTICS_FILE_PATH,
    seq_col_name=SEQ,
    int_col_name=INT,
    logger=logger
).process(thread_count=0)
print(len(file_stats))
file_stats.head(2)

INFO: loaded previous statistics file '../dumps/PXD010000/training_columns/statistics.parquet'
235


Unnamed: 0,file_path,max_sequence_length,max_array_length,alphabet,item_count
0,../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet,50,1845,"{Q, W, A, F, C, V, K, I, E, P, G, D, M(Oxidation), Y, N, R, H, T, S, L, M}",26943
1,../dumps/PXD010000/training_columns/Biodiversity_Cibrobacter_freundii_LB_aerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet,50,1697,"{Q, W, A, F, C, V, K, I, E, P, G, D, M(Oxidation), Y, N, R, H, T, S, L, M}",27516


In [8]:
PADDING_LENGTHS = {
    MZ: file_stats.max_array_length.max(),
    INT: file_stats.max_array_length.max(),
    SEQ: file_stats.max_sequence_length.max()
}

print("padding lengths =", PADDING_LENGTHS)

TOTAL_ITEM_COUNT = file_stats.item_count.sum()
print(f"TOTAL_ITEM_COUNT = {TOTAL_ITEM_COUNT}")

ALPHABET = set.union(*file_stats.alphabet)
print(f"ALPHABET = {', '.join(sorted(ALPHABET))}")

padding lengths = {'mz_array': 2354, 'intensity_array': 2354, 'peptide_sequence': 50}
TOTAL_ITEM_COUNT = 5513185
ALPHABET = A, C, D, E, F, G, H, I, K, L, M, M(Oxidation), N, P, Q, R, S, T, V, W, Y


## Data Normalization, Padding, and Conversion to Tensorflow Datasets

In [9]:
def l2_normalize(values: np.ndarray) -> np.ndarray:
    return tf.keras.utils.normalize(x=values, order=2)

def base_peak_normalize(values: np.ndarray) -> np.ndarray:
    return values / values.max(initial=0)

# by Tom, probably
# don't know, what it's based on
def ion_current_normalize(intensities: np.ndarray) -> np.ndarray:
    total_sum = np.sum(intensities**2)
    normalized = intensities/total_sum
    return normalized

NORMALIZATION = {
    INT: base_peak_normalize
}

In [10]:
ALPHABET.add(PADDING_CHARACTERS[SEQ])

In [11]:
char_to_idx = {char: idx for idx, char in enumerate(sorted(ALPHABET))}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}
INDEX_ALPHABET = idx_to_char.keys()
char_to_idx

{'A': 0,
 'C': 1,
 'D': 2,
 'E': 3,
 'F': 4,
 'G': 5,
 'H': 6,
 'I': 7,
 'K': 8,
 'L': 9,
 'M': 10,
 'M(Oxidation)': 11,
 'N': 12,
 'P': 13,
 'Q': 14,
 'R': 15,
 'S': 16,
 'T': 17,
 'V': 18,
 'W': 19,
 'Y': 20,
 '_': 21}

In [12]:
Parquet2DatasetFileProcessor(
    training_data_columns=TRAINING_DATA_COLUMNS,
    target_data_columns=TARGET_DATA_COLUMNS,
    padding_lengths=PADDING_LENGTHS,
    padding_characters=PADDING_CHARACTERS,
    column_normalizations=NORMALIZATION,
    dataset_dump_path_prefix=DATASET_DUMP_PATH,
    char_to_idx_mapping_functions={
        SEQ: char_to_idx.get
    },
    item_count=len(MZMLID_FILE_PATHS),
    skip_existing=True,
    split_on_column_values_of=SPLIT_VALUE_COLUMNS,
    logger=logger
).process(parquet_file_paths=MZMLID_FILE_PATHS,
          thread_count=3)[:3]

INFO: Processing item 1/235: '../dumps/PXD010000/training_columns/Biodiversity_B_fragilis_01_28Jul15_Arwen_14-12-03_mzmlid.parquet'
INFO: Processing item 11/235: '../dumps/PXD010000/training_columns/Biodiversity_P_polymyxa_TBS_aerobic_3_17July16_Samwise_16-04-10_mzmlid.parquet'
INFO: Processing item 21/235: '../dumps/PXD010000/training_columns/M_alcali_copp_CH4_B2_T1_09_QE_23Mar18_Oak_18-01-07_mzmlid.parquet'
INFO: Processing item 31/235: '../dumps/PXD010000/training_columns/Cj_media_MH_R4_23Feb15_Arwen_14-12-03_mzmlid.parquet'
INFO: Processing item 41/235: '../dumps/PXD010000/training_columns/Biodiversity_C_Baltica_T240_R2_C_27Jan16_Arwen_15-07-13_mzmlid.parquet'
INFO: Processing item 51/235: '../dumps/PXD010000/training_columns/Biodiversity_M_xanthus_DZ2_plates_1_03May16_Samwise_16-03-32_mzmlid.parquet'
INFO: Processing item 61/235: '../dumps/PXD010000/training_columns/Biodiversity_B_thet_CMgluc_anaerobic_02_01Feb16_Arwen_15-07-13_mzmlid.parquet'
INFO: Processing item 71/235: '../dum

[]

In [13]:
processing_configuration = {
    'padding_characters': PADDING_CHARACTERS,
    'padding_lengths': PADDING_LENGTHS,
    'idx_to_char': idx_to_char,
    'normalization': NORMALIZATION,
    'split_value_columns': SPLIT_VALUE_COLUMNS,
    'training_data_columns': TRAINING_DATA_COLUMNS,
    'target_data_columns': TARGET_DATA_COLUMNS
}

In [14]:
processing_configuration = utils.denumpyfy(processing_configuration)

In [15]:
print(visualization.pretty_print_json(processing_configuration))

{
    "padding_characters": {
        "peptide_sequence": "_",
        "mz_array": 0.0,
        "intensity_array": 0.0
    },
    "padding_lengths": {
        "mz_array": 2354,
        "intensity_array": 2354,
        "peptide_sequence": 50
    },
    "idx_to_char": {
        "0": "A",
        "1": "C",
        "2": "D",
        "3": "E",
        "4": "F",
        "5": "G",
        "6": "H",
        "7": "I",
        "8": "K",
        "9": "L",
        "10": "M",
        "11": "M(Oxidation)",
        "12": "N",
        "13": "P",
        "14": "Q",
        "15": "R",
        "16": "S",
        "17": "T",
        "18": "V",
        "19": "W",
        "20": "Y",
        "21": "_"
    },
    "normalization": {
        "intensity_array": "<function base_peak_normalize at 0x7fa6046d5158>"
    },
    "split_value_columns": [
        "species",
        "istrain"
    ],
    "training_data_columns": [
        "mz_array",
        "intensity_array"
    ],
    "target_data_columns": [
        "pep

In [16]:
with open(PROCESSING_FILE_PATH, 'w') as file:
    file.write(visualization.pretty_print_json(processing_configuration))
    print(f"wrote processing info to '{PROCESSING_FILE_PATH}'")

wrote processing info to '../dumps/PXD010000/training_columns/tf_datasets/processing_info.json'
