# Prototyping an ML Model
## Prerequisites

In [1]:
import glob
import pandas as pd
from mmproteo.utils.utils import ensure_dir_exists
from mmproteo.utils import log
from mmproteo.utils.formats.mz import FilteringProcessor, filter_files
from mmproteo.utils.processing import ItemProcessor
import os
import tensorflow as tf
import numpy as np
from typing import Iterable, Callable

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

In [3]:
logger = log.DummyLogger(verbose=False)

INFO: Printing to Stdout


## Data Import

In [4]:
pwd

'/tf/workspace/notebooks'

In [5]:
PROJECT = "PXD010000"
DUMP_PATH = f"../dumps/{PROJECT}"
TRAINING_COLUMNS_DUMP_PATH = DUMP_PATH + "/training_columns"
FILES_PATH = f"{TRAINING_COLUMNS_DUMP_PATH}/*_mzmlid.parquet"

In [6]:
MZMLID_FILE_PATHS = glob.glob(FILES_PATH)
len(MZMLID_FILE_PATHS)

235

In [7]:
path = MZMLID_FILE_PATHS[0]
path

'../dumps/PXD010000/training_columns/Biodiversity_S_agalactiae_LIB_aerobic_02_26Feb16_Arwen_16-01-01_mzmlid.parquet'

In [8]:
df = pd.concat(pd.read_parquet(path) for path in MZMLID_FILE_PATHS[:10])

In [9]:
#df = pd.read_parquet(path)
print(f"length = {len(df)}")
df.dtypes

length = 212568


SpectrumIdentificationItem__1__PeptideEvidenceRef__PeptideSequence    object
mz_array                                                              object
intensity_array                                                       object
dtype: object

In [10]:
df.head(1)

Unnamed: 0,SpectrumIdentificationItem__1__PeptideEvidenceRef__PeptideSequence,mz_array,intensity_array
24,AEQHIHENGAK,"[101.07125, 102.05546, 110.07156, 116.97215, 118.96744, 129.1021, 130.0869, 136.06169, 147.11232, 147.86722, 155.08163, 173.0923, 212.34276, 218.1501, 223.15547, 237.12326, 249.09761, 301.14133, 316.8805, 322.89563, 327.0749, 337.8088, 339.80032, 361.81232, 369.12436, 389.2164, 395.1711, 406.8533, 411.8027, 411.8499, 412.8472, 413.18677, 413.26584, 491.61603, 518.25494, 591.45447, 629.3038, 655.31635]","[902.02026, 4357.0073, 6633.4424, 4117.3403, 1100.8181, 6611.2314, 4355.198, 1551.3958, 5044.971, 1391.3762, 1411.4926, 4370.8145, 599.59827, 581.25366, 1002.15424, 855.4764, 2903.5098, 1110.4567, 597.7157, 1097.4309, 649.19794, 722.08014, 790.918, 1252.6704, 1245.698, 1415.5712, 674.397, 926.17834, 786.1922, 10235.162, 2630.4385, 726.63934, 6187.4634, 616.34375, 821.95905, 904.2936, 645.9391, 1105.8728]"


In [11]:
SEQ = FilteringProcessor.default_peptide_sequence_column_name
MZ = FilteringProcessor.default_mz_array_column_name
INT = FilteringProcessor.default_intensity_array_column_name

## Data Preprocessing

### Data Filtering

In [12]:
# drop non-AA characters
df[SEQ] = df[SEQ].str.replace(r"[^A-Z]",'')

### Data Normalization

#### Normalizing Intensities

In [13]:
def l2_normalize(values: np.ndarray) -> np.ndarray:
    return tf.keras.utils.normalize(x=values, order=2)

In [14]:
def base_peak_normalize(values: np.ndarray) -> np.ndarray:
    return values / values.max()

In [15]:
# by Tom, probably
# don't know, what it's based on
def ion_current_normalize(intensities):
    total_sum = np.sum(intensities**2)
    normalized = intensities/total_sum
    return normalized

In [16]:
NORMALIZATION=base_peak_normalize

In [17]:
df[INT] = df[INT].apply(NORMALIZATION)

In [18]:
df[INT].head(1)

24    [0.08812955, 0.4256901, 0.6481033, 0.40227407, 0.10755258, 0.6459333, 0.42551336, 0.1515751, 0.49290586, 0.1359408, 0.13790622, 0.42703912, 0.058582194, 0.056789882, 0.097912885, 0.0835821, 0.2836799, 0.10849429, 0.058398265, 0.10722164, 0.0634282, 0.07054897, 0.0772746, 0.122388914, 0.12170769, 0.13830471, 0.06589021, 0.09048986, 0.07681287, 1.0, 0.25700018, 0.070994414, 0.6045301, 0.06021827, 0.08030738, 0.08835166, 0.06310981, 0.108046435]
Name: intensity_array, dtype: object

#### Normalizing MZ values

### Data Padding

In [19]:
padding_characters = {
    SEQ: '_',
    MZ: 0.0,
    INT: 0.0,
}

In [20]:
# pad sequence string

max_sequence_length = df[SEQ].str.len().max()
print(f"Maximum sequence length = {max_sequence_length}")

df[SEQ] = df[SEQ].str.pad(
    width=max_sequence_length, 
    fillchar=padding_characters[SEQ], 
    side='right')

Maximum sequence length = 50


In [21]:
ARRAY_COLS = [MZ, INT]

In [22]:
df.head(1)

Unnamed: 0,SpectrumIdentificationItem__1__PeptideEvidenceRef__PeptideSequence,mz_array,intensity_array
24,AEQHIHENGAK_______________________________________,"[101.07125, 102.05546, 110.07156, 116.97215, 118.96744, 129.1021, 130.0869, 136.06169, 147.11232, 147.86722, 155.08163, 173.0923, 212.34276, 218.1501, 223.15547, 237.12326, 249.09761, 301.14133, 316.8805, 322.89563, 327.0749, 337.8088, 339.80032, 361.81232, 369.12436, 389.2164, 395.1711, 406.8533, 411.8027, 411.8499, 412.8472, 413.18677, 413.26584, 491.61603, 518.25494, 591.45447, 629.3038, 655.31635]","[0.08812955, 0.4256901, 0.6481033, 0.40227407, 0.10755258, 0.6459333, 0.42551336, 0.1515751, 0.49290586, 0.1359408, 0.13790622, 0.42703912, 0.058582194, 0.056789882, 0.097912885, 0.0835821, 0.2836799, 0.10849429, 0.058398265, 0.10722164, 0.0634282, 0.07054897, 0.0772746, 0.122388914, 0.12170769, 0.13830471, 0.06589021, 0.09048986, 0.07681287, 1.0, 0.25700018, 0.070994414, 0.6045301, 0.06021827, 0.08030738, 0.08835166, 0.06310981, 0.108046435]"


In [23]:
# pad arrays
for col in ARRAY_COLS:
    if len(df[col]) == 0:
        continue
    item_dtype = df[col].iloc[0].dtype
    
    max_array_length = df[col].str.len().max()
    df[col] = list(tf.keras.preprocessing.sequence.pad_sequences(
        sequences=df[col], 
        maxlen=max_array_length, 
        padding='post', 
        value=padding_characters[col],
        dtype=item_dtype
    ))
    assert df[col].str.len().min() == df[col].str.len().max()

max_array_length

1953

In [24]:
df.head(1)

Unnamed: 0,SpectrumIdentificationItem__1__PeptideEvidenceRef__PeptideSequence,mz_array,intensity_array
24,AEQHIHENGAK_______________________________________,"[101.07125, 102.05546, 110.07156, 116.97215, 118.96744, 129.1021, 130.0869, 136.06169, 147.11232, 147.86722, 155.08163, 173.0923, 212.34276, 218.1501, 223.15547, 237.12326, 249.09761, 301.14133, 316.8805, 322.89563, 327.0749, 337.8088, 339.80032, 361.81232, 369.12436, 389.2164, 395.1711, 406.8533, 411.8027, 411.8499, 412.8472, 413.18677, 413.26584, 491.61603, 518.25494, 591.45447, 629.3038, 655.31635, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[0.08812955, 0.4256901, 0.6481033, 0.40227407, 0.10755258, 0.6459333, 0.42551336, 0.1515751, 0.49290586, 0.1359408, 0.13790622, 0.42703912, 0.058582194, 0.056789882, 0.097912885, 0.0835821, 0.2836799, 0.10849429, 0.058398265, 0.10722164, 0.0634282, 0.07054897, 0.0772746, 0.122388914, 0.12170769, 0.13830471, 0.06589021, 0.09048986, 0.07681287, 1.0, 0.25700018, 0.070994414, 0.6045301, 0.06021827, 0.08030738, 0.08835166, 0.06310981, 0.108046435, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]"


In [25]:
df.values.shape

(212568, 3)

In [26]:
df.apply(lambda col: col.str.len().max())

SpectrumIdentificationItem__1__PeptideEvidenceRef__PeptideSequence      50
mz_array                                                              1953
intensity_array                                                       1953
dtype: int64

## Data Transformation

### One-Hot-Encoding of Character Sequences

In [27]:
ALPHABET = set.union(*df[SEQ].head(500000).apply(set))
char_to_idx = {char: idx for idx, char in enumerate(ALPHABET)}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}
INDEX_ALPHABET = idx_to_char.keys()

print(f"alphabet: {', '.join(sorted(ALPHABET))}")
char_to_idx

alphabet: A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y, _


{'R': 0,
 'M': 1,
 'F': 2,
 'Q': 3,
 'S': 4,
 'I': 5,
 'N': 6,
 'T': 7,
 'W': 8,
 'G': 9,
 'A': 10,
 'Y': 11,
 'D': 12,
 'V': 13,
 'H': 14,
 '_': 15,
 'P': 16,
 'L': 17,
 'C': 18,
 'E': 19,
 'K': 20}

In [28]:
df[SEQ] = df[SEQ].apply(list)
df[SEQ].head(1)

24    [A, E, Q, H, I, H, E, N, G, A, K, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]
Name: SpectrumIdentificationItem__1__PeptideEvidenceRef__PeptideSequence, dtype: object

In [29]:
def sequence_to_indices(sequence: Iterable[str], 
                        char_to_idx_mapping_fun: Callable[[str], int] = char_to_idx.get) -> np.ndarray:
    return np.array([char_to_idx_mapping_fun(char) for char in sequence])
is_sequence_of_integers=False

In [30]:
df[SEQ] = df[SEQ].apply(sequence_to_indices)
is_sequence_of_integers=True
print(df[SEQ].iloc[0].dtype)
df[SEQ].head(1)

int64


24    [10, 19, 3, 14, 5, 14, 19, 6, 9, 10, 20, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15]
Name: SpectrumIdentificationItem__1__PeptideEvidenceRef__PeptideSequence, dtype: object

In [31]:
#categorical_column = tf.feature_column.categorical_column_with_vocabulary_list(key=SEQ, vocabulary_list=ALPHABET)
if not is_sequence_of_integers:
    sequences = tf.feature_column.sequence_categorical_column_with_vocabulary_list(key=SEQ, vocabulary_list=ALPHABET)
else:
    sequences = tf.feature_column.sequence_categorical_column_with_identity(key=SEQ, num_buckets=len(INDEX_ALPHABET))
sequences_embedding = tf.feature_column.embedding_column(sequences, dimension=10)
columns = [sequences_embedding]
sequence_feature_layer = tf.keras.experimental.SequenceFeatures(columns)

In [32]:
stacked_df = df.apply(lambda item: [np.stack(item)])

In [33]:
del df

In [34]:
training_data = tuple(stacked_df[ARRAY_COLS].iloc[0])
target_data = tuple(stacked_df[[SEQ]].iloc[0])

In [35]:
dataset = tf.data.Dataset.from_tensor_slices((training_data, target_data))
dataset

<TensorSliceDataset shapes: (((1953,), (1953,)), ((50,),)), types: ((tf.float32, tf.float32), (tf.int64,))>

In [36]:
BATCH_SIZE = 128

In [37]:
dataset = dataset.batch(BATCH_SIZE)

In [38]:
#dataset = dataset.repeat()

In [39]:
#sequence_feature_layer(dataset)

In [40]:
next(dataset.as_numpy_iterator())[0][0].shape

(128, 1953)

In [41]:
ARRAY_COLS

['mz_array', 'intensity_array']

In [42]:
input_layers = {col: tf.keras.layers.Input(shape=(max_array_length,)) for col in ARRAY_COLS}
input_layers

{'mz_array': <KerasTensor: shape=(None, 1953) dtype=float32 (created by layer 'input_1')>,
 'intensity_array': <KerasTensor: shape=(None, 1953) dtype=float32 (created by layer 'input_2')>}

In [43]:
x = input_layers[MZ] + input_layers[INT]

x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(max_sequence_length*len(ALPHABET))(x)
x = tf.reshape(x,(-1, max_sequence_length, len(ALPHABET)))

x = tf.keras.activations.softmax(x)

model = tf.keras.Model([input_layers[MZ],input_layers[INT]],x)
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.SparseCategoricalCrossentropy())
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1953)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1953)]       0                                            
__________________________________________________________________________________________________
tf.__operators__.add (TFOpLambd (None, 1953)         0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
flatten (Flatten)               (None, 1953)         0           tf.__operators__.add[0][0]   

In [44]:
def split_dataset(dataset, fraction):
    split_value = int(len(dataset) * fraction)
    a = dataset.take(split_value)
    b = dataset.skip(split_value)
    return a, b

In [45]:
training_dataset, validation_dataset = split_dataset(dataset, 0.7)
validation_dataset, test_dataset = split_dataset(validation_dataset, 0.5)

In [46]:
model.fit(dataset, epochs=100, validation_data=validation_dataset)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7f4320ad5710>

In [47]:

def trim_peaks_list(mz,intensities,MAX_N_PEAKS=MAX_N_PEAKS,pad=True):
    if mz.shape[0]<=MAX_N_PEAKS and pad:
        mz = np.pad(mz,((0,MAX_N_PEAKS-(mz.shape[0]))), 'constant', constant_values=0)
        intensities = np.pad(intensities,((0,MAX_N_PEAKS-(intensities.shape[0]))), 'constant', constant_values=0)    
        return mz,intensities
    else:
        indices = np.argsort(intensities)[-MAX_N_PEAKS:][::-1] # take only highest=MAX_N_PEAKS peaks
        return mz[indices],intensities[indices]

def create_iterator_from_mgf(mgf_file: str):     
    def iterator():        
        with mgf.read(mgf_file) as reader:              
            for entry in reader:
                sequence, mz, intensities = get_features(entry)
                indices = get_sequence_of_indices(sequence)
                indices = trim_sequence(indices)
                intensities = ion_current_normalize(intensities)
                mz,intensities = trim_peaks_list(mz,intensities,pad=True)
                mz = np.digitize(mz, bins=mz_bins)
                intensities = np.digitize(intensities, bins=intensity_bins) # TODO: this has to be replaced! Bin/Embed intensities? ...feels weird
                yield (mz,intensities),indices
                
    return iterator

NameError: name 'MAX_N_PEAKS' is not defined