In [1]:
#default_exp model.featurize

In [2]:
#export
import numpy as np
import pandas as pd
from typing import List, Union

from alphadeep._settings import model_const
from alphabase.constants.modification import MOD_CHEM

In [3]:
#export
# Element symbols that get a dedicated column in a modification
# feature vector, in the order given by the model settings.
mod_elements = model_const['mod_elements']
mod_feature_size = len(mod_elements)

# Lookup table: element symbol -> column index in the feature vector.
mod_elem_to_idx = {
    elem: idx for idx, elem in enumerate(mod_elements)
}

def _parse_mod_formula(formula):
    '''
    Parse a modification formula to a feature vector.

    The formula is a concatenation of `Element(count)` terms, e.g.
    `H(1)O(3)P(1)`. The count of an element found in `mod_elem_to_idx`
    is added to that element's column; the counts of all other
    elements are added to the last column.
    Args:
        formula (str): the modification formula. An empty formula
            gives an all-zero vector.
    Returns:
        np.ndarray: 1-D float array of length `mod_feature_size`
    Raises:
        ValueError: if a term is not of the form `Element(count)`
            or the count is not an integer
    '''
    feature = np.zeros(mod_feature_size)
    for term in formula.strip(')').split(')'):
        if not term:
            # empty formula (or stray ')') -> nothing to count
            continue
        chem, num = term.split('(')
        # Always accumulate: a repeated element, or a known element
        # that shares the last column with the unknown elements, must
        # not overwrite a count that was added earlier.
        feature[mod_elem_to_idx.get(chem, -1)] += int(num)
    return feature

# Pre-computed feature vector of every modification in `MOD_CHEM`
# (modification name -> formula), keyed by modification name.
# Built with a comprehension so that no loop variables are left
# behind in the module namespace.
MOD_TO_FEATURE = {
    mod_name: _parse_mod_formula(mod_formula)
    for mod_name, mod_formula in MOD_CHEM.items()
}
        

In [4]:
# Display the feature vector of one modification as a sanity check;
# position i holds the count stored for `mod_elements[i]`.
MOD_TO_FEATURE['Phospho@S']

array([0., 1., 0., 3., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.])

In [5]:
#export
def parse_mod_feature(
    nAA:int, 
    mod_names:List[str], 
    mod_sites:List[int]
)->np.ndarray:
    '''
    Get modification feature of a given peptide (len=nAA). 
    Note that `site=0` is for peptide N-term modification, 
    `site=-1` is for peptide C-term modification, and 
    `1<=site<=nAA` is for residue modifications on the peptide.
    Args:
        nAA (int): the length of the peptide sequence
        mod_names (List[str]): the modification names, each of which
            must be a key of `MOD_TO_FEATURE`
        mod_sites (List[int]): the modification sites corresponding
            to `mod_names` on the peptide
    Returns:
        np.ndarray: 2-D feature array with shape `(nAA+2,mod_feature_size)`;
            rows of unmodified sites are all zeros
    Raises:
        KeyError: if a modification name is not in `MOD_TO_FEATURE`
    '''
    mod_x = np.zeros((nAA+2,mod_feature_size))
    # `len(...)` instead of truthiness so that numpy arrays are accepted too
    if mod_names is not None and len(mod_names) > 0:
        # `list(...)` forces row-wise fancy indexing even if the sites
        # arrive as a tuple, which numpy would otherwise interpret as a
        # multi-dimensional index.
        mod_x[list(mod_sites)] = [MOD_TO_FEATURE[mod] for mod in mod_names]
    return mod_x

In [6]:
#hide
x = parse_mod_feature(
    5,
    ['Acetyl@Protein N-term','Phospho@S','Oxidation@M'],
    [0,-1,1],
)
# 5 residues plus one row for each terminus
assert x.shape == (7, mod_feature_size)
# every modified row carries the feature vector of its modification
assert all(
    np.all(x[site,:]==MOD_TO_FEATURE[mod])
    for site, mod in [
        (1, 'Oxidation@M'),
        (0, 'Acetyl@Protein N-term'),
        (-1, 'Phospho@S'),
    ]
)
# the unmodified residue rows stay zero
assert np.all(x[2:6,:]==0)

In [7]:
#hide
# a peptide without modifications gives an all-zero feature array
x = parse_mod_feature(5, [], [])
assert x.shape == (7, mod_feature_size)
assert not x.any()

In [8]:
#export
def get_batch_mod_feature(
    df_batch: pd.DataFrame, nAA: int
)->List[np.ndarray]:
    '''
    Get the modification features of all peptides in a batch.
    Args:
        df_batch (pd.DataFrame): dataframe with same-length peptides ('sequence'), 
            which contains 'mods' and 'mod_sites' columns. Both columns hold
            ';'-separated strings; an empty string or a missing value
            (None/NaN) in 'mods' means the peptide is unmodified
        nAA (int): the length of the same-length peptides
    Returns:
        List[np.ndarray]: a list of 2-D array features, one per row 
            of `df_batch`
    '''
    mod_x_batch = []
    for mod_names, mod_sites in df_batch[
        ['mods', 'mod_sites']
    ].values:
        # Empty cells are typically loaded by pandas as NaN (a float),
        # which is truthy and has no `.split`; treat them as "no mods".
        if mod_names is None or (
            isinstance(mod_names, float) and np.isnan(mod_names)
        ):
            mod_names = ''
        if mod_names:
            mod_names = mod_names.split(';')
            mod_sites = [int(site) for site in mod_sites.split(';')]
        else:
            mod_names = []
            mod_sites = []
        mod_x_batch.append(parse_mod_feature(nAA, mod_names, mod_sites))
    return mod_x_batch

In [9]:
#export
def parse_aa_indices(
    seq_array: Union[List, np.ndarray]
)->np.ndarray:
    '''
    Convert peptide sequences into AA ID array. ID=0 is reserved for masking, 
    so ID of 'A' is 1, ID of 'B' is 2, ..., ID of 'Z' is 26. Zeros are padded 
    into the N- and C-term of each sequence after this conversion. 
    Args:
        seq_array (Union[List,np.ndarray]): 
            list or 1-D array (unicode or object dtype) of sequences 
            with the same length
    Returns:
        np.ndarray: 2-D `np.int32` array with the shape 
        `(len(seq_array), len(seq_array[0])+2)`. Zeros are padded into the 
        N- and C-term of each sequence, so the last dimension is 
        `len(seq_array[0])+2`. An empty `seq_array` gives an empty 
        array with the shape `(0, 2)`.
    '''
    if len(seq_array) == 0:
        # nothing to convert; keep the 2-D layout (only the two padding columns)
        return np.zeros((0, 2), dtype=np.int32)
    # Coerce to a fixed-width unicode array first: object-dtype input
    # (e.g. `df['sequence'].values`) cannot be re-viewed as int32.
    seqs = np.array(seq_array, dtype=np.str_)
    # Each unicode character occupies 4 bytes, so the int32 view yields
    # one code point per residue.
    x = seqs.view(np.int32).reshape(
            -1, len(seqs[0])
        )-ord('A')+1
    # padding zeros at the N- and C-term
    return np.pad(x, [(0,0),(1,1)])

In [10]:
#hide
# one peptide: IDs 1..5 framed by the two zero-padded termini
assert np.all(
    parse_aa_indices(['ABCDE']) == [[0, 1, 2, 3, 4, 5, 0]]
)

In [11]:
# a batch of two identical peptides gives two identical ID rows
assert np.all(
    parse_aa_indices(['ABCDE']*2) == [[0, 1, 2, 3, 4, 5, 0]]*2
)

In [12]:
#export
# Upper-cased instrument name -> index, following the order of the
# instrument list in the model settings.
instrument_dict = {
    inst.upper(): idx
    for idx, inst in enumerate(model_const['instruments'])
}
# Index given to instrument names that are not in `instrument_dict`:
# the last of the `max_instrument_num` slots.
unknown_inst_index = model_const['max_instrument_num'] - 1

In [13]:
#export
def parse_instrument_indices(instrument_list):
    '''
    Map instrument names to their integer indices (case-insensitive).
    Args:
        instrument_list: iterable of instrument name strings
    Returns:
        list: one index per name, taken from `instrument_dict`; names 
            that are not in `instrument_dict` get `unknown_inst_index`
    '''
    upper_names = (inst.upper() for inst in instrument_list)
    return [
        instrument_dict.get(name, unknown_inst_index)
        for name in upper_names
    ]

In [14]:
#hide
# show the instrument name -> index mapping built from the settings
instrument_dict

{'QE': 0, 'LUMOS': 1, 'TIMSTOF': 2, 'SCIEXTOF': 3}

In [15]:
#hide
# every known instrument keeps its own index; an unrecognised name
# falls back to `unknown_inst_index`
parse_instrument_indices(list(instrument_dict) + ['Unknown'])

[0, 1, 2, 3, 7]