# Training an ML Model on Tensorflow Datasets
## Prerequisites

In [1]:
import glob
import os

import numpy as np
import pandas as pd
import tensorflow as tf

from mmproteo.utils import log, utils, visualization
from mmproteo.utils.formats.mz import MzmlidFileStatsCreator
from mmproteo.utils.formats.tf_dataset import Parquet2DatasetFileProcessor

2021-07-12 10:23:52.838117: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

In [3]:
logger = log.DummyLogger(verbose=False)

INFO: Printing to Stdout


## Configuration

In [4]:
pwd

'/hpi/fs00/home/mirko.krause/masterthesis/pride-downloader/notebooks'

In [5]:
PROJECT = "PXD010000"

In [6]:
DUMP_PATH = "/scratch/mirko.krause/dumps/PXD010000"
THREAD_COUNT=32

In [7]:
TRAINING_COLUMNS_DUMP_PATH = os.path.join(DUMP_PATH, "training_columns")
FILES_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "*_mzmlid.parquet")
STATISTICS_FILE_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "statistics.parquet")
DATASET_DUMP_PATH = os.path.join(TRAINING_COLUMNS_DUMP_PATH, "tf_datasets")
PROCESSING_FILE_PATH = os.path.join(DATASET_DUMP_PATH, "processing_info.json")

SEQ = 'peptide_sequence'
MZ = 'mz_array'
INT = 'intensity_array'

TRAINING_DATA_COLUMNS = [MZ, INT]
TARGET_DATA_COLUMNS = [SEQ]
SPLIT_VALUE_COLUMNS = ['species', 'istrain']

PADDING_CHARACTERS = {
    SEQ: '_',
    MZ: 0.0,
    INT: 0.0,
}

In [8]:
MZMLID_FILE_PATHS = glob.glob(FILES_PATH)
print(len(MZMLID_FILE_PATHS))
MZMLID_FILE_PATHS[0]

235


'/scratch/mirko.krause/dumps/PXD010000/training_columns/M_alcali_copp_CH4_B3_T1_11_QE_23Mar18_Oak_18-01-07_mzmlid.parquet'

## Calculating Statistics over all MZMLID Files

In [9]:
file_stats = MzmlidFileStatsCreator(
    mzmlid_file_paths=MZMLID_FILE_PATHS,
    statistics_file_path=STATISTICS_FILE_PATH,
    seq_col_name=SEQ,
    int_col_name=INT,
    logger=logger
).process(thread_count=THREAD_COUNT)
print(len(file_stats))
file_stats.head(2)

INFO: Processing item 1/235 '/scratch/mirko.krause/dumps/PXD010000/training_columns/M_alcali_copp_CH4_B3_T1_11_QE_23Mar18_Oak_18-01-07_mzmlid.parquet'
INFO: Processing item 11/235 '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_R_jostii_R2A_aerobic_1_23Nov16_Pippin_16-09-11_mzmlid.parquet'
INFO: Processing item 21/235 '/scratch/mirko.krause/dumps/PXD010000/training_columns/QC_Shew_13_05_500ng_2_5hr_19Mar14_Samwise_13-07-17_mzmlid.parquet'
INFO: Processing item 31/235 '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_B_fragilis_CMcarb_anaerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet'
INFO: Processing item 41/235 '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_M_smegmatis_BHI_aerobic_2_05Oct16_Pippin_16-05-06_mzmlid.parquet'
INFO: Processing item 51/235 '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_C_comes_LIB_01_28Oct15_Arwen_15-07-13_mzmlid.parquet'
INFO: Processing item 61/235 '/scratch/mirko.krause/

Unnamed: 0,file_path,max_sequence_length,max_array_length,alphabet,item_count
0,/scratch/mirko.krause/dumps/PXD010000/training_columns/M_alcali_copp_CH4_B3_T1_11_QE_23Mar18_Oak_18-01-07_mzmlid.parquet,50,2058,"{C, W, V, F, P, Y, L, T, M(Oxidation), S, M, E, K, Q, N, D, A, H, R, G, I}",24856
1,/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_A_tumefaciens_R2A_aerobic_1_23Nov16_Pippin_16-09-11_mzmlid.parquet,50,1002,"{C, W, V, F, P, Y, L, T, M(Oxidation), S, M, E, K, Q, N, D, A, H, R, G, I}",16114


In [10]:
PADDING_LENGTHS = {
    MZ: file_stats.max_array_length.max(),
    INT: file_stats.max_array_length.max(),
    SEQ: file_stats.max_sequence_length.max()
}

print("padding lengths =", PADDING_LENGTHS)

TOTAL_ITEM_COUNT = file_stats.item_count.sum()
print(f"TOTAL_ITEM_COUNT = {TOTAL_ITEM_COUNT}")

ALPHABET = set.union(*file_stats.alphabet)
print(f"ALPHABET = {', '.join(sorted(ALPHABET))}")

padding lengths = {'mz_array': 2354, 'intensity_array': 2354, 'peptide_sequence': 50}
TOTAL_ITEM_COUNT = 5513185
ALPHABET = A, C, D, E, F, G, H, I, K, L, M, M(Oxidation), N, P, Q, R, S, T, V, W, Y


## Data Normalization, Padding, and Conversion to Tensorflow Datasets

In [11]:
def l2_normalize(values: np.ndarray) -> np.ndarray:
    return tf.keras.utils.normalize(x=values, order=2)

def base_peak_normalize(values: np.ndarray) -> np.ndarray:
    return values / values.max(initial=0)

# by Tom, probably
# don't know, what it's based on
def ion_current_normalize(intensities: np.ndarray) -> np.ndarray:
    total_sum = np.sum(intensities**2)
    normalized = intensities/total_sum
    return normalized

NORMALIZATION = {
    INT: base_peak_normalize
}

In [12]:
ALPHABET.add(PADDING_CHARACTERS[SEQ])

In [13]:
char_to_idx = {char: idx for idx, char in enumerate(sorted(ALPHABET))}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}
INDEX_ALPHABET = idx_to_char.keys()
char_to_idx

{'A': 0,
 'C': 1,
 'D': 2,
 'E': 3,
 'F': 4,
 'G': 5,
 'H': 6,
 'I': 7,
 'K': 8,
 'L': 9,
 'M': 10,
 'M(Oxidation)': 11,
 'N': 12,
 'P': 13,
 'Q': 14,
 'R': 15,
 'S': 16,
 'T': 17,
 'V': 18,
 'W': 19,
 'Y': 20,
 '_': 21}

In [15]:
Parquet2DatasetFileProcessor(
    training_data_columns=TRAINING_DATA_COLUMNS,
    target_data_columns=TARGET_DATA_COLUMNS,
    padding_lengths=PADDING_LENGTHS,
    padding_characters=PADDING_CHARACTERS,
    column_normalizations=NORMALIZATION,
    dataset_dump_path_prefix=DATASET_DUMP_PATH,
    char_to_idx_mapping_functions={
        SEQ: char_to_idx.get
    },
    item_count=len(MZMLID_FILE_PATHS),
    skip_existing=True,
    split_on_column_values_of=SPLIT_VALUE_COLUMNS,
    logger=logger
).process(parquet_file_paths=MZMLID_FILE_PATHS,
          thread_count=int(THREAD_COUNT/2))[:3]

INFO: Processing item 1/235: '/scratch/mirko.krause/dumps/PXD010000/training_columns/M_alcali_copp_CH4_B3_T1_11_QE_23Mar18_Oak_18-01-07_mzmlid.parquet'INFO: Processing item 11/235: '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_R_jostii_R2A_aerobic_1_23Nov16_Pippin_16-09-11_mzmlid.parquet'



2021-07-12 10:24:46.859553: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-07-12 10:24:47.031484: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-07-12 10:24:47.076415: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-07-12 10:24:47.076527: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-07-12 10:24:47.251178: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-07-12 10:24:47.406046: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-07-12 10:24:47.454504: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-07-12 10

2021-07-12 10:24:49.021682: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 3 with properties: 
pciBusID: 0000:4e:00.0 name: A100-SXM4-40GB computeCapability: 8.0
coreClock: 1.41GHz coreCount: 108 deviceMemorySize: 39.59GiB deviceMemoryBandwidth: 1.41TiB/s
2021-07-12 10:24:49.024733: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 7 with properties: 
pciBusID: 0000:bd:00.0 name: A100-SXM4-40GB computeCapability: 8.0
coreClock: 1.41GHz coreCount: 108 deviceMemorySize: 39.59GiB deviceMemoryBandwidth: 1.41TiB/s
2021-07-12 10:24:49.024836: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-07-12 10:24:49.025140: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64
2021-07-12 10:24:49.025279: W 

2021-07-12 10:24:49.267326: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:07:00.0 name: A100-SXM4-40GB computeCapability: 8.0
coreClock: 1.41GHz coreCount: 108 deviceMemorySize: 39.59GiB deviceMemoryBandwidth: 1.41TiB/s
2021-07-12 10:24:49.297379: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 1 with properties: 
pciBusID: 0000:0f:00.0 name: A100-SXM4-40GB computeCapability: 8.0
coreClock: 1.41GHz coreCount: 108 deviceMemorySize: 39.59GiB deviceMemoryBandwidth: 1.41TiB/s
2021-07-12 10:24:49.311219: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 2 with properties: 
pciBusID: 0000:47:00.0 name: A100-SXM4-40GB computeCapability: 8.0
coreClock: 1.41GHz coreCount: 108 deviceMemorySize: 39.59GiB deviceMemoryBandwidth: 1.41TiB/s
2021-07-12 10:24:49.324680: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 3 with properties: 
pciBusID: 0000:4e:00.0 name: A100-SXM4-40GB c

2021-07-12 10:24:49.738811: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 5 with properties: 
pciBusID: 0000:90:00.0 name: A100-SXM4-40GB computeCapability: 8.0
coreClock: 1.41GHz coreCount: 108 deviceMemorySize: 39.59GiB deviceMemoryBandwidth: 1.41TiB/s
2021-07-12 10:24:49.743253: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 6 with properties: 
pciBusID: 0000:b7:00.0 name: A100-SXM4-40GB computeCapability: 8.0
coreClock: 1.41GHz coreCount: 108 deviceMemorySize: 39.59GiB deviceMemoryBandwidth: 1.41TiB/s
2021-07-12 10:24:49.748122: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 6 with properties: 
pciBusID: 0000:b7:00.0 name: A100-SXM4-40GB computeCapability: 8.0
coreClock: 1.41GHz coreCount: 108 deviceMemorySize: 39.59GiB deviceMemoryBandwidth: 1.41TiB/s
2021-07-12 10:24:49.753018: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 7 with properties: 
pciBusID: 0000:bd:00.0 name: A100-SXM4-40GB c

2021-07-12 10:24:49.939269: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264]      
2021-07-12 10:24:50.208707: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-07-12 10:24:50.209910: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2245855000 Hz
2021-07-12 10:24:50.288107: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-07-12 10:24:50.289439: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2245855000 Hz
2021-07-12 10:24:50.297539: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-07-12 10:24:50.298049: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2245855000 Hz
2021-07-12 10:24:50.409635: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] N

INFO: Processing item 21/235: '/scratch/mirko.krause/dumps/PXD010000/training_columns/QC_Shew_13_05_500ng_2_5hr_19Mar14_Samwise_13-07-17_mzmlid.parquet'
INFO: Processing item 31/235: '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_B_fragilis_CMcarb_anaerobic_01_01Feb16_Arwen_15-07-13_mzmlid.parquet'
INFO: Processing item 41/235: '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_M_smegmatis_BHI_aerobic_2_05Oct16_Pippin_16-05-06_mzmlid.parquet'
INFO: Processing item 51/235: '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_C_comes_LIB_01_28Oct15_Arwen_15-07-13_mzmlid.parquet'
INFO: Processing item 61/235: '/scratch/mirko.krause/dumps/PXD010000/training_columns/Biodiversity_B_cereus_ATCC14579_LB_aerobic_2_17July16_Samwise_16-04-10_mzmlid.parquet'
INFO: Processing item 71/235: '/scratch/mirko.krause/dumps/PXD010000/training_columns/M_alcali_copp_MeOH_B1_T1_01_QE_23Mar18_Oak_18-01-07_mzmlid.parquet'
INFO: Processing item 81/235: '/s

['/scratch/mirko.krause/dumps/PXD010000/training_columns/tf_datasets/M_alcali_copp_CH4_B3_T1_11_QE_23Mar18_Oak_18-01-07_mzmlid.parquet',
 '/scratch/mirko.krause/dumps/PXD010000/training_columns/tf_datasets/Biodiversity_A_tumefaciens_R2A_aerobic_1_23Nov16_Pippin_16-09-11_mzmlid.parquet',
 '/scratch/mirko.krause/dumps/PXD010000/training_columns/tf_datasets/Biodiversity_A_cryptum_FeTSB_anaerobic_2_01Jun16_Pippin_16-03-39_mzmlid.parquet']

In [16]:
processing_configuration = {
    'padding_characters': PADDING_CHARACTERS,
    'padding_lengths': PADDING_LENGTHS,
    'idx_to_char': idx_to_char,
    'normalization': NORMALIZATION,
    'split_value_columns': SPLIT_VALUE_COLUMNS,
    'training_data_columns': TRAINING_DATA_COLUMNS,
    'target_data_columns': TARGET_DATA_COLUMNS
}

In [17]:
processing_configuration = utils.denumpyfy(processing_configuration)

In [18]:
print(visualization.pretty_print_json(processing_configuration))

{
    "padding_characters": {
        "peptide_sequence": "_",
        "mz_array": 0.0,
        "intensity_array": 0.0
    },
    "padding_lengths": {
        "mz_array": 2354,
        "intensity_array": 2354,
        "peptide_sequence": 50
    },
    "idx_to_char": {
        "0": "A",
        "1": "C",
        "2": "D",
        "3": "E",
        "4": "F",
        "5": "G",
        "6": "H",
        "7": "I",
        "8": "K",
        "9": "L",
        "10": "M",
        "11": "M(Oxidation)",
        "12": "N",
        "13": "P",
        "14": "Q",
        "15": "R",
        "16": "S",
        "17": "T",
        "18": "V",
        "19": "W",
        "20": "Y",
        "21": "_"
    },
    "normalization": {
        "intensity_array": "<function base_peak_normalize at 0x7f8e5c0cf550>"
    },
    "split_value_columns": [
        "species",
        "istrain"
    ],
    "training_data_columns": [
        "mz_array",
        "intensity_array"
    ],
    "target_data_columns": [
        "pep

In [19]:
with open(PROCESSING_FILE_PATH, 'w') as file:
    file.write(visualization.pretty_print_json(processing_configuration))
    print(f"wrote processing info to '{PROCESSING_FILE_PATH}'")

wrote processing info to '/scratch/mirko.krause/dumps/PXD010000/training_columns/tf_datasets/processing_info.json'
