In [1]:
from pyteomics import parser,mgf
import numpy as np # TODO: get rid of numpy! ... replace by tensorflow
import tensorflow as tf
import re

In [7]:
# Show the kernel's working directory; the relative MGF_FILE path is resolved against it.
pwd

'/tf/workspace/notebooks'

In [21]:
# Path to the MGF input file, relative to the notebook's working directory.
MGF_FILE = "../datasets/small.mgf"

In [5]:
# Fixed peptide length that trim_sequence pads or truncates to.
MAX_PEPTIDE_LENGTH=24
# Default number of peaks that trim_peaks_list keeps (or pads to) per spectrum.
MAX_N_PEAKS=200
# Lower edge, upper edge and bin width of the m/z grid (see mz_bins).
MZ_MIN = 100
MZ_MAX = 2100
MZ_ERROR = 0.1
# Number of intensity bins; also the input_dim of the intensity embedding.
MAX_DYNAMIC_RANGE=1000# TODO: this has to be replaced! Bin/Embed intensities? ...feels weird
# NOTE: 10e-7 is 1e-6 (not 1e-7); it is the upper edge of intensity_bins.
MAX_INTENSITY=10e-7# TODO: this has to be replaced! Bin/Embed intensities? ...feels weird
# Default batch size of create_dataset_from_iterator.
BATCH_SIZE=64

In [12]:
# Number of MZ_ERROR-wide bins between MZ_MIN and MZ_MAX; also used as the
# input_dim of the m/z embedding layer. int() truncates the float quotient.
MZ_RESOLUTION=int((MZ_MAX-MZ_MIN)/MZ_ERROR)
MZ_RESOLUTION

20000

In [16]:
# MZ_RESOLUTION + 1 evenly spaced bin edges from MZ_MIN to MZ_MAX (both included).
# np.digitize against these edges can return any index in 0..len(mz_bins).
mz_bins = np.linspace(MZ_MIN, MZ_MAX, MZ_RESOLUTION + 1)
mz_bins

array([ 100. ,  100.1,  100.2, ..., 2099.8, 2099.9, 2100. ])

In [17]:
# Sanity check: there is one more edge than there are bins.
len(mz_bins)

20001

In [19]:
# TODO: this has to be replaced! Bin/Embed intensities? ...feels weird
# MAX_DYNAMIC_RANGE + 1 evenly spaced edges from 0 to MAX_INTENSITY, used to
# discretise the normalised intensities with np.digitize.
intensity_bins = np.linspace(0, MAX_INTENSITY, MAX_DYNAMIC_RANGE + 1)
intensity_bins

array([0.00e+00, 1.00e-09, 2.00e-09, ..., 9.98e-07, 9.99e-07, 1.00e-06])

In [20]:
# Sanity check: there is one more edge than there are intensity bins.
len(intensity_bins)

1001

In [29]:
# NOTE(review): neither `r` nor `extract_features_from_mgf_entry` is defined in the
# cells shown here (the generator further down calls `get_features` instead) —
# presumably both come from a deleted or unsaved cell; confirm before relying on
# Restart & Run All.
extract_features_from_mgf_entry(next(r))

{'seq': 'AADFLFSC+57.021DASHPDTLR',
 'mz': array([ 115.08660126,  120.08078003,  129.10192871,  136.07553101,
         143.08140564,  147.11248779,  175.11817932,  182.08056641,
         186.12322998,  197.12815857,  231.09684753,  233.16452026,
         235.10758972,  258.10803223,  259.11129761,  261.15933228,
         263.10223389,  329.14877319,  334.1390686 ,  355.19689941,
         360.15396118,  371.19207764,  376.18588257,  377.18130493,
         378.18457031,  389.24981689,  405.17630005,  406.18054199,
         447.22357178,  448.22689819,  448.73175049,  469.21282959,
         472.25442505,  473.2394104 ,  490.26544189,  491.26852417,
         500.24920654,  506.24777222,  518.26062012,  519.26373291,
         555.92797852,  556.26000977,  557.26654053,  586.25958252,
         586.76196289,  587.26409912,  588.29321289,  588.79595947,
         597.29937744,  597.80010986,  598.29907227,  601.33007812,
         601.80670166,  602.3347168 ,  610.30560303,  612.33563232,
      

In [6]:
def get_sequence_of_indices(sequence: str, aa_list: list=list(aa_with_pad)):
    """Translate a peptide string into an array of alphabet positions.

    Args:
        sequence: Peptide sequence, iterated one character at a time.
        aa_list: Ordered list of symbols; the position of a symbol in this
            list is the integer it is encoded as. Defaults to a list built
            from ``aa_with_pad`` when the function is defined.

    Returns:
        A NumPy array holding one position per character of ``sequence``.

    Raises:
        ValueError: If a character of ``sequence`` is not in ``aa_list``.
    """
    positions = []
    for residue in sequence:
        positions.append(aa_list.index(residue))
    return np.array(positions)

def trim_sequence(indices):
    """Bring an index array to exactly MAX_PEPTIDE_LENGTH entries.

    Args:
        indices: 1-D NumPy array of encoded residues.

    Returns:
        The first MAX_PEPTIDE_LENGTH entries when the input is longer than
        that, otherwise the input right-padded with zeros up to
        MAX_PEPTIDE_LENGTH.
    """
    n_residues = len(indices)
    if n_residues > MAX_PEPTIDE_LENGTH:
        # TODO: this has to be replaced! Longer peptides should be discarded
        # or MAX_PEPTIDE_LENGTH increased.
        return indices[:MAX_PEPTIDE_LENGTH]

    missing = MAX_PEPTIDE_LENGTH - indices.shape[0]
    return np.pad(indices, (0, missing), 'constant', constant_values=0)

def trim_peaks_list(mz,intensities,MAX_N_PEAKS=MAX_N_PEAKS,pad=True):
    """Pad or cut a peak list to at most MAX_N_PEAKS entries.

    Args:
        mz: 1-D NumPy array of m/z values.
        intensities: 1-D NumPy array of intensities, parallel to ``mz``.
        MAX_N_PEAKS: Target number of peaks.
        pad: When True and the spectrum has at most MAX_N_PEAKS peaks, both
            arrays are right-padded with zeros and keep their input order.

    Returns:
        Tuple ``(mz, intensities)``. In the padding case both arrays have
        length MAX_N_PEAKS. Otherwise (spectrum too long, or ``pad`` is
        False) the MAX_N_PEAKS most intense peaks are returned, ordered by
        descending intensity.
    """
    if mz.shape[0] > MAX_N_PEAKS or not pad:
        # Keep only the strongest peaks, most intense first.
        ascending = np.argsort(intensities)
        strongest = ascending[-MAX_N_PEAKS:][::-1]
        return mz[strongest], intensities[strongest]

    padded_mz = np.pad(mz, (0, MAX_N_PEAKS - mz.shape[0]),
                       'constant', constant_values=0)
    padded_intensities = np.pad(intensities,
                                (0, MAX_N_PEAKS - intensities.shape[0]),
                                'constant', constant_values=0)
    return padded_mz, padded_intensities



def ion_current_normalize(intensities):
    """Scale intensities by the sum of their squares.

    The divisor is ``sum(intensities ** 2)`` (not the plain sum and not the
    L2 norm); the intensity bin edges used downstream are tuned to this
    scaling, so the formula is kept as is.

    Args:
        intensities: Array-like of peak intensities.

    Returns:
        ``intensities / sum(intensities ** 2)``. When the sum of squares is
        zero (an all-zero spectrum), an all-zero float array of the same shape
        is returned instead of the NaNs that 0/0 would produce.
    """
    intensities = np.asarray(intensities)
    total_sum = np.sum(intensities**2)
    if total_sum == 0:
        # 0/0 would yield NaN, which np.digitize maps to len(bins), an index
        # outside the range the intensity embedding accepts.
        return np.zeros(intensities.shape, dtype=float)
    return intensities/total_sum

def create_iterator_from_mgf(mgf_file: str):
    """Build a zero-argument generator function over the spectra of an MGF file.

    Each entry is turned into model-ready integer features: the peptide is
    encoded and padded/truncated, the intensities are normalised, the peak
    list is padded/trimmed, and m/z values and intensities are discretised
    against ``mz_bins`` and ``intensity_bins``.

    Args:
        mgf_file: Path of the MGF file to read.

    Returns:
        A callable that, when called, returns a generator yielding
        ``((mz_bin_indices, intensity_bin_indices), peptide_indices)``.
        The file is re-opened on every call.
    """
    def iterator():
        # np.digitize returns indices in 0..len(bins), i.e. up to
        # MZ_RESOLUTION + 1 / MAX_DYNAMIC_RANGE + 1, but the embedding layers
        # are built with input_dim=MZ_RESOLUTION / MAX_DYNAMIC_RANGE and only
        # accept indices below that. Values in the last bin or beyond the
        # upper edge are therefore folded into the highest valid index.
        max_mz_index = MZ_RESOLUTION - 1
        max_intensity_index = MAX_DYNAMIC_RANGE - 1

        with mgf.read(mgf_file) as reader:
            for entry in reader:
                sequence, mz, intensities = get_features(entry)
                indices = get_sequence_of_indices(sequence)
                indices = trim_sequence(indices)
                intensities = ion_current_normalize(intensities)
                mz,intensities = trim_peaks_list(mz,intensities,pad=True)
                mz = np.clip(np.digitize(mz, bins=mz_bins), 0, max_mz_index)
                # TODO: this has to be replaced! Bin/Embed intensities? ...feels weird
                intensities = np.clip(np.digitize(intensities, bins=intensity_bins),
                                      0, max_intensity_index)
                yield (mz,intensities),indices

    return iterator

def create_dataset_from_iterator(iterator, data_type=((tf.int32,tf.int32),tf.int32), batched=True, batch_size=BATCH_SIZE, repeat=True):
    """Wrap a generator function in a ``tf.data.Dataset``.

    Args:
        iterator: Zero-argument callable returning a generator of elements.
        data_type: Nested structure of TensorFlow dtypes matching the
            elements the generator yields.
        batched: Whether to group elements into batches.
        batch_size: Batch size used when ``batched`` is True.
        repeat: Whether the dataset restarts when the generator is exhausted.

    Returns:
        The resulting dataset; batching is applied before repeating.
    """
    dataset = tf.data.Dataset.from_generator(iterator, data_type)
    dataset = dataset.batch(batch_size) if batched else dataset
    dataset = dataset.repeat() if repeat else dataset
    return dataset

In [8]:
# Build the training dataset (batched and repeating by default) from the MGF file.
# Uses create_dataset_from_iterator: no cell in this notebook defines `create_ds`,
# so the earlier call only worked with leftover kernel state.
ds = create_dataset_from_iterator(create_iterator_from_mgf(MGF_FILE))

In [9]:
# Two inputs per spectrum, each of length MAX_N_PEAKS: the binned m/z values
# and the binned intensities.
inputs_mz = tf.keras.layers.Input(shape=(MAX_N_PEAKS,))
inputs_intensities = tf.keras.layers.Input(shape=(MAX_N_PEAKS,))
# Embed both index sequences into 16 dimensions and add them element-wise.
# NOTE(review): an Embedding only accepts indices below input_dim, while
# np.digitize can return up to len(bins) — confirm the generator never emits
# indices >= MZ_RESOLUTION / MAX_DYNAMIC_RANGE.
emb_1 = tf.keras.layers.Embedding(input_dim=MZ_RESOLUTION,output_dim=16)(inputs_mz)
emb_2 = tf.keras.layers.Embedding(input_dim=MAX_DYNAMIC_RANGE,output_dim=16)(inputs_intensities)
x = emb_1+emb_2

# Flatten all peaks, project to one score per (peptide position, symbol) and
# reshape to (batch, MAX_PEPTIDE_LENGTH, len(alphabet)).
# NOTE(review): `alphabet` is not defined in the visible cells; presumably it has
# the same symbols as `aa_with_pad` used for the labels — confirm.
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(MAX_PEPTIDE_LENGTH*len(alphabet))(x) #TODO: this has to be replaced!
x = tf.reshape(x,(-1, MAX_PEPTIDE_LENGTH, len(alphabet)))

# Softmax is applied here, so the loss is created with its default settings
# and receives the softmax output rather than raw scores.
x = tf.keras.activations.softmax(x)
model = tf.keras.Model([inputs_mz,inputs_intensities],x)
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.SparseCategoricalCrossentropy())
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 200, 16)      320000      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 16)      16000       input_2[0][0]                    
______________________________________________________________________________________________

In [10]:
# Train for 10 epochs of 10 batches each; steps_per_epoch bounds an epoch,
# which is required if ds repeats indefinitely.
model.fit(ds,steps_per_epoch=10,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa7fdfa4a58>

In [11]:
# Run the model on a single batch drawn from ds; y holds the softmax output.
y = model.predict(ds,steps=1)

In [12]:
def decode(indices,aa=aa_with_pad,predicted=True):
    """Turn encoded peptides back into strings.

    Args:
        indices: Array whose last axis runs over peptide positions, or, when
            ``predicted`` is True, an array of per-class scores whose last
            axis is reduced with argmax first.
        aa: Symbol table indexed with the integer codes; it must support
            indexing with an integer array.
        predicted: Whether ``indices`` holds per-class scores rather than
            integer codes.

    Returns:
        Array of strings with the last (position) axis joined away.
    """
    def lookup(codes):
        return aa[codes]

    def concatenate(symbols):
        return ''.join(symbols)

    class_ids = np.argmax(indices, axis=-1) if predicted else indices
    symbols = np.apply_along_axis(lookup, axis=-1, arr=class_ids)
    return np.apply_along_axis(concatenate, axis=-1, arr=symbols)

In [13]:
# Take the second component (x[1]) of the first element of ds and decode it
# to strings; in the generator defined above that component is the encoded peptide.
# The comprehension yields a single element, so the array has a leading axis
# of size 1, which is removed with [0] after decoding.
ground_truth = np.array([x[1] for x in ds.take(1)])
ground_truth = decode(ground_truth,predicted=False)
ground_truth = list(ground_truth[0])

In [14]:
# Decode the predicted scores into peptide strings.
# NOTE: this rebinds `y`, so the cell is not idempotent — re-run the predict
# cell before running it a second time.
y = list(decode(y))

In [15]:
# Show the first 10 predicted/true peptide pairs side by side.
print("predicted peptide / true peptide")
print(np.array(list(zip(y,ground_truth))[:10]))

predicted peptide / true peptide
[['AAAGEEETAAAGSPGRK_______' 'AAAGEEETAAAGSPGRK_______']
 ['AAALASGCTVEIK___________' 'AAALASGCTVEIK___________']
 ['AAAVLRDSTSVPVTAEAK______' 'AAAVLRDSTSVPVTAEAK______']
 ['AADFLFSCDASHPDTLR_______' 'AADFLFSCDASHPDTLR_______']
 ['AADSSAPEDSEKLVGDTVSYSK__' 'AADSSAPEDSEKLVGDTVSYSK__']
 ['AAGHQADEILVPLDSK________' 'AAGHQADEILVPLDSK________']
 ['AAGLAGSDLITALISPTTR_____' 'AAGLAGSDLITALISPTTR_____']
 ['AAKEPEAVAVK_____________' 'AAKEPEAVAVK_____________']
 ['AAKIVTDVLLR_____________' 'AAKIVTDVLLR_____________']
 ['AALEQLLK________________' 'AALEQLLK________________']]
