In [None]:
#default_exp constants.modification

In [None]:
# hide
__file__ = '../../alphabase/constants/modification.py'

In [None]:
#export
import os
import numba
import numpy as np
import pandas as pd
from typing import Union, List
from copy import deepcopy

from alphabase.yaml_utils import load_yaml
from alphabase.constants.element import calc_formula_mass

# Directory of this module; the YAML files loaded below are resolved relative to it.
_base_dir = os.path.dirname(__file__)

def _update_all_by_MOD_INFO_DICT():
    '''
    Rebuild the module-level lookup tables from `MOD_INFO_DICT`.

    After the call:
        MOD_CHEM maps modification name -> `composition`
        MOD_MASS maps modification name -> `mass`
        MOD_LOSS_MASS maps modification name -> `modloss`
    '''
    global MOD_CHEM
    global MOD_MASS
    global MOD_LOSS_MASS

    MOD_CHEM = {
        name: info['composition'] for name, info in MOD_INFO_DICT.items()
    }
    MOD_MASS = {
        name: info['mass'] for name, info in MOD_INFO_DICT.items()
    }
    MOD_LOSS_MASS = {
        name: info['modloss'] for name, info in MOD_INFO_DICT.items()
    }

def load_mod_yaml(yaml_file):
    '''
    Load modification definitions from `yaml_file` and rebuild the
    module-level `MOD_INFO_DICT`, `MOD_DF` and the derived lookup dicts.

    For every loaded `name@site` entry:
        * `upper_case_AA` is set to True;
        * if `site` is a single character, or contains '^', a deep copy
          is stored under the key whose site starts with the lower-cased
          character, with `upper_case_AA` set to False.
    For every entry (including the copies):
        * `unimod_mass` / `unimod_modloss` keep the loaded `mono_mass` /
          `modloss` values;
        * `mass` / `modloss` are recomputed with `calc_formula_mass` from
          `composition` / `modloss_composition`;
        * `modloss_importance` is reset to 0.
    Args:
        yaml_file: path passed on to `load_yaml`
    '''
    global MOD_INFO_DICT
    global MOD_DF
    MOD_INFO_DICT = load_yaml(yaml_file)

    # Iterate over a snapshot: the loop inserts the lower-case keys.
    for mod_key, mod_info in list(MOD_INFO_DICT.items()):
        MOD_INFO_DICT[mod_key]['upper_case_AA'] = True
        modname, site = mod_key.split('@')
        if len(site) == 1:
            lower_site = site.lower()
        elif '^' in site:
            lower_site = site[0].lower() + site[1:]
        else:
            # Sites without an amino-acid letter get no lower-case copy.
            continue
        lower_key = modname + '@' + lower_site
        MOD_INFO_DICT[lower_key] = deepcopy(mod_info)
        MOD_INFO_DICT[lower_key]['upper_case_AA'] = False

    for info in MOD_INFO_DICT.values():
        # Keep the loaded values before overwriting `modloss` below.
        info['unimod_mass'] = info['mono_mass']
        info['unimod_modloss'] = info['modloss']
        info['mass'] = calc_formula_mass(info['composition'])
        info['modloss'] = calc_formula_mass(info['modloss_composition'])
        info['modloss_importance'] = 0

    _update_all_by_MOD_INFO_DICT()

    MOD_DF = pd.DataFrame.from_dict(MOD_INFO_DICT, orient='index')
    MOD_DF['name'] = MOD_DF.index

# Populate the module-level modification tables from `used_mod.yaml`,
# located in the same directory as this module.
load_mod_yaml(os.path.join(_base_dir, 'used_mod.yaml'))

def load_modloss_importance(yaml_file):
    '''
    Load modification-loss importance values from `yaml_file` and store
    them in `MOD_LOSS_IMPORTANCE`, `MOD_INFO_DICT` and `MOD_DF`.

    Every loaded value is converted to `float`, so that
    `MOD_DF['modloss_importance']` is a numeric (float64) column.
    NOTE(review): some YAML loaders return exponent literals written
    without a decimal point (e.g. `1e6`) as strings; confirm against
    `load_yaml` and the importance file.

    Rows of `MOD_DF` that are not listed in the file (or whose value is
    NaN) get importance 0.
    Args:
        yaml_file: path passed on to `load_yaml`; expected to contain a
            mapping `modification name -> importance`
    Raises:
        KeyError: if a listed modification is not in `MOD_INFO_DICT`
        ValueError: if a value cannot be converted to `float`
    '''
    global MOD_LOSS_IMPORTANCE
    # `or {}` lets an empty document (loaded as None) behave like an
    # empty mapping.
    loaded = load_yaml(yaml_file) or {}
    MOD_LOSS_IMPORTANCE = {
        mod: float(val) for mod, val in loaded.items()
    }
    for mod, val in MOD_LOSS_IMPORTANCE.items():
        MOD_INFO_DICT[mod]['modloss_importance'] = val
    # Explicit per-row lookup: works for an empty mapping too and always
    # yields a float column aligned with MOD_DF's index.
    importance = pd.Series(
        [MOD_LOSS_IMPORTANCE.get(mod, 0.0) for mod in MOD_DF.index],
        index=MOD_DF.index, dtype=float,
    )
    MOD_DF['modloss_importance'] = importance.fillna(0.0)


# Apply the importance values from `modloss_importance.yaml`,
# located in the same directory as this module.
load_modloss_importance(os.path.join(_base_dir, 'modloss_importance.yaml'))

def _update_all_by_MOD_DF():
    '''
    Rebuild `MOD_INFO_DICT` from the current rows of `MOD_DF`
    (one dict per row, keyed by the index), then refresh the
    derived lookup dicts via `_update_all_by_MOD_INFO_DICT`.
    '''
    global MOD_INFO_DICT
    rows_by_name = MOD_DF.to_dict(orient='index')
    MOD_INFO_DICT = rows_by_name
    _update_all_by_MOD_INFO_DICT()


In [None]:
# Display the modification table (upper-case entries and their lower-case copies).
MOD_DF

Unnamed: 0,avge_mass,classification,composition,modloss,modloss_composition,mono_mass,unimod_id,upper_case_AA,unimod_mass,unimod_modloss,mass,modloss_importance,name
GlyGly@K,114.042927,Post-translational,H(6)C(4)N(2)O(2),114.042922,H(6)C(4)N(2)O(2),114.042927,-1,True,114.042927,114.042927,114.042922,1e6,GlyGly@K
15N-oxobutanoic@C^Any N-term,-18.023900,Artefact,H(-3)15N(-1),0.000000,,-18.023584,1419,True,-18.023584,0.000000,-18.023583,0,15N-oxobutanoic@C^Any N-term
15N-oxobutanoic@S^Protein N-term,-18.023900,Post-translational,H(-3)15N(-1),0.000000,,-18.023584,1419,True,-18.023584,0.000000,-18.023583,0,15N-oxobutanoic@S^Protein N-term
15N-oxobutanoic@T^Protein N-term,-18.023900,Post-translational,H(-3)15N(-1),0.000000,,-18.023584,1419,True,-18.023584,0.000000,-18.023583,0,15N-oxobutanoic@T^Protein N-term
2-dimethylsuccinyl@C,144.125300,Chemical derivative,H(8)C(6)O(4),0.000000,,144.042259,1262,True,144.042259,0.000000,144.042253,0,2-dimethylsuccinyl@C
...,...,...,...,...,...,...,...,...,...,...,...,...,...
spermidine@q,128.215300,Chemical derivative,H(16)C(7)N(2),0.000000,,128.131349,1421,False,128.131349,0.000000,128.131340,0,spermidine@q
spermine@q,185.309700,Chemical derivative,H(23)C(10)N(3),0.000000,,185.189198,1420,False,185.189198,0.000000,185.189185,0,spermine@q
sulfo+amino@y,95.077800,Chemical derivative,H(1)N(1)O(3)S(1),0.000000,,94.967714,997,False,94.967714,0.000000,94.967710,0,sulfo+amino@y
thioacylPA@k,159.206200,Chemical derivative,H(9)C(6)N(1)O(2)S(1),0.000000,,159.035399,967,False,159.035399,0.000000,159.035393,0,thioacylPA@k


## Note: `unimod_mass` can differ from the formula-derived `mass`; for large differences we may have to check the `element.yaml` file

In [None]:
# Print every modification whose `unimod_mass` and formula-derived `mass`
# differ by more than 1e-4.
mass_columns = ['name','unimod_mass','mass']
for mod, unimod_mass, mass in MOD_DF[mass_columns].itertuples(index=False, name=None):
    if abs(unimod_mass-mass) > 1e-4:
        print(f"{mod}: unimod mod={unimod_mass}, formula mass={mass}")

Cation:Cu[I]@Any C-term: unimod mod=61.921774, formula mass=61.9317734
Cation:Cu[I]@D: unimod mod=61.921774, formula mass=61.9317734
Cation:Cu[I]@E: unimod mod=61.921774, formula mass=61.9317734
Cation:Cu[I]@H: unimod mod=61.921774, formula mass=61.9317734
CuSMo@C: unimod mod=922.834855, formula mass=922.8448255000001
SUMO3549@K: unimod mod=3549.536568, formula mass=3549.536408
dHex(1)Hex(5)HexNAc(4)NeuAc(2)@N: unimod mod=2350.83035, formula mass=2350.8302473000003
Cation:Cu[I]@d: unimod mod=61.921774, formula mass=61.9317734
Cation:Cu[I]@e: unimod mod=61.921774, formula mass=61.9317734
Cation:Cu[I]@h: unimod mod=61.921774, formula mass=61.9317734
CuSMo@c: unimod mod=922.834855, formula mass=922.8448255000001
SUMO3549@k: unimod mod=3549.536568, formula mass=3549.536408
dHex(1)Hex(5)HexNAc(4)NeuAc(2)@n: unimod mod=2350.83035, formula mass=2350.8302473000003


# Mod site representation
* `site=0` refers to an N-term modification
* `site=-1` refers to a C-term modification
* `1<=site<=peplen` refers to a normal modification

For example: \_0A1B2C3D4E5F6G7H8I9J10K11\_-1

In [None]:
#export
def get_modification_mass(
    peplen:int, 
    mod_names:List[str], 
    mod_sites:List[int]
)->np.array:
    '''
    Build the per-residue modification mass array of a peptide.
    Args:
        peplen (int): peptide length
        mod_names (List[str]): modification names, looked up in `MOD_MASS`
        mod_sites (List[int]): sites paired one-to-one with `mod_names`.
            * `site=0` (N-term) is added to the first position
            * `site=-1` (C-term) is added to the last position
            * any other `site` is 1-based and is added to position `site-1`
    Returns:
        np.array: 1-D array with length=`peplen`; each element is the
            summed mass of the modifications placed on that position,
            `0` where there is none
    '''
    masses = np.zeros(peplen)
    for mod, site in zip(mod_names, mod_sites):
        # Terminal sites are used as array indices directly;
        # all other sites are shifted from 1-based to 0-based.
        idx = site if site in (0, -1) else site - 1
        masses[idx] += MOD_MASS[mod]
    return masses

def get_modification_mass_sum(
    mod_names:List[str]
)->float:
    """
    Sum the masses of the given modifications, independent of their
    sites and of the peptide length (e.g. for computing a peptide mass).
    Args:
        mod_names (List[str]): modification names, looked up in `MOD_MASS`
    Returns:
        float: total mass of all listed modifications
    """
    mod_masses = [MOD_MASS[name] for name in mod_names]
    return np.sum(mod_masses)


In [None]:
# Example peptide: N-term acetylation (site 0), Carbamidomethyl on C (site 4)
# and Oxidation on M (site 8); sites other than 0/-1 are 1-based.
seq = 'AGHCEWQMK'
mod_names = ['Acetyl@Protein N-term', 'Carbamidomethyl@C', 'Oxidation@M']
mod_sites = [0, 4, 8]

get_modification_mass(len(seq), mod_names, mod_sites)

array([42.0105633,  0.       ,  0.       , 57.0214611,  0.       ,
        0.       ,  0.       , 15.9949141,  0.       ])

In [None]:
# Time a single call on the example peptide defined in the previous cell.
%timeit get_modification_mass(len(seq), mod_names, mod_sites)

2.01 µs ± 22 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [None]:
#export
@numba.jit(nopython=True, nogil=True)
def _get_modloss(
    mod_losses: np.array, 
    _loss_importance: np.array
)->np.array:
    '''
    Propagate modification loss masses along the positions, choosing by
    importance (e.g. for `AM(Oxidation@M)S(Phospho@S)...`, importance of
    Phospho@S > importance of Oxidation@M, so the positions from the
    Phospho@S site onwards carry the Phospho loss, not the Oxidation loss).

    Scanning from the first position, the position with the strictly
    highest importance seen so far is tracked; every position that does
    not exceed it is overwritten with the loss stored at that tracked
    position.
    Args:
        mod_losses (np.array): mod loss masses of each position;
            modified in place
        _loss_importance (np.array): mod loss importance of each position
    Returns:
        np.array: the same `mod_losses` array after the in-place update
    '''
    top_idx = 0
    top_importance = _loss_importance[0]
    for idx in range(1, len(_loss_importance)):
        if _loss_importance[idx] > top_importance:
            # New most important site: it keeps its own loss.
            top_idx = idx
            top_importance = _loss_importance[idx]
        else:
            mod_losses[idx] = mod_losses[top_idx]
    return mod_losses

def get_modloss_mass(
    peplen: int, 
    mod_names: List, 
    mod_sites: List,
    for_nterm_frag: bool,
)->np.array:
    '''
    Get the modification loss mass for each fragmentation position
    (e.g. the Phospho@S/T or Oxidation@M losses). When several
    modifications are covered by a fragment, the one with the higher
    `MOD_LOSS_IMPORTANCE` wins. For example, `AM(Oxidation@M)S(Phospho@S)...`,
    importance of Phospho@S > importance of Oxidation@M, so the modloss of
    b3 ion will be the Phospho loss, not the Oxidation loss.
    Modifications missing from `MOD_LOSS_IMPORTANCE`, or with importance 0,
    contribute no loss.
    Args:
        peplen (int): peptide length
        mod_names (List[str]): modification name list
        mod_sites (List[int]): modification site list corresponding 
            to `mod_names` (`0` for N-term, `-1` for C-term, 
            `1<=site<=peplen` otherwise)
        for_nterm_frag (bool): if `True`, the loss will be on the 
            N-term fragments (mainly `b` ions); if `False`, the loss 
            will be on the C-term fragments (mainly `y` ions)
    Returns:
        np.array: mod_loss masses, 1-D array with length=`peplen-1`
    '''
    if not mod_names: return np.zeros(peplen-1)

    # One slot per residue plus one slot for each terminus
    # (index 0 and index -1).
    n_slots = peplen+2
    mod_losses = np.zeros(n_slots)
    _loss_importance = np.zeros(n_slots)
    mod_losses[mod_sites] = [MOD_LOSS_MASS[mod] for mod in mod_names]
    _loss_importance[mod_sites] = [
        MOD_LOSS_IMPORTANCE.get(mod, 0) for mod in mod_names
    ]

    # Will not consider the modloss if the corresponding modloss_importance is 0
    mod_losses[_loss_importance==0] = 0

    if for_nterm_frag:
        return _get_modloss(mod_losses, _loss_importance)[1:-2]
    # C-term fragments: propagate from the C-term side on reversed views,
    # then slice back into N->C order.
    return _get_modloss(mod_losses[::-1], _loss_importance[::-1])[-3:0:-1]


In [None]:
# Demo: N-term fragment losses for a peptide of length 10.
# Note: this cell changes the module-level MOD_LOSS_IMPORTANCE entry for Oxidation@M.
mod_names = ['Oxidation@M', 'Phospho@S', 'Carbamidomethyl@C']
mod_sites = [0, 4, 8]
MOD_LOSS_IMPORTANCE['Oxidation@M'] = 10
get_modloss_mass(10, mod_names, mod_sites, True)

array([63.9982825, 63.9982825, 63.9982825, 97.9768922, 97.9768922,
       97.9768922, 97.9768922, 97.9768922, 97.9768922])

In [None]:
# With importance 0 the Oxidation@M loss is ignored (its loss mass is zeroed).
MOD_LOSS_IMPORTANCE['Oxidation@M'] = 0
get_modloss_mass(10, mod_names, mod_sites, True)

array([ 0.       ,  0.       ,  0.       , 97.9768922, 97.9768922,
       97.9768922, 97.9768922, 97.9768922, 97.9768922])

In [None]:
# Restore a non-zero importance and compute the losses for C-term fragments instead.
MOD_LOSS_IMPORTANCE['Oxidation@M'] = 10
get_modloss_mass(10, mod_names, mod_sites, False)

array([97.9768922, 97.9768922, 97.9768922,  0.       ,  0.       ,
        0.       ,  0.       ,  0.       ,  0.       ])

### Note that `get_modloss_mass` is somewhat time-consuming
`%timeit get_modloss_mass(10, mod_names, mod_sites, False)`

`Results (12 seconds in total): 12.6 µs ± 96.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)`

## We can update the modification list for different requirements, for example:

In [None]:
# Show one example row for each distinct `classification` value.
MOD_DF.drop_duplicates('classification')

Unnamed: 0,avge_mass,classification,composition,modloss,modloss_composition,mono_mass,unimod_id,upper_case_AA,unimod_mass,unimod_modloss,mass,modloss_importance,name
GlyGly@K,114.042927,Post-translational,H(6)C(4)N(2)O(2),114.042922,H(6)C(4)N(2)O(2),114.042927,-1,True,114.042927,114.042927,114.042922,1000000.0,GlyGly@K
15N-oxobutanoic@C^Any N-term,-18.0239,Artefact,H(-3)15N(-1),0.0,,-18.023584,1419,True,-18.023584,0.0,-18.023583,0.0,15N-oxobutanoic@C^Any N-term
2-dimethylsuccinyl@C,144.1253,Chemical derivative,H(8)C(6)O(4),0.0,,144.042259,1262,True,144.042259,0.0,144.042253,0.0,2-dimethylsuccinyl@C
3-deoxyglucosone@R,144.1253,Multiple,H(8)C(6)O(4),0.0,,144.042259,949,True,144.042259,0.0,144.042253,0.0,3-deoxyglucosone@R
ADP-Ribosyl@C,541.3005,Other glycosylation,H(21)C(15)N(5)O(13)P(2),0.0,,541.06111,213,True,541.06111,0.0,541.06109,0.0,ADP-Ribosyl@C
ADP-Ribosyl@N,541.3005,N-linked glycosylation,H(21)C(15)N(5)O(13)P(2),541.06109,H(21)C(15)N(5)O(13)P(2),541.06111,213,True,541.06111,541.06111,541.06109,0.0,ADP-Ribosyl@N
ADP-Ribosyl@S,541.3005,O-linked glycosylation,H(21)C(15)N(5)O(13)P(2),541.06109,H(21)C(15)N(5)O(13)P(2),541.06111,213,True,541.06111,541.06111,541.06109,0.0,ADP-Ribosyl@S
AEC-MAEC:2H(4)@S,63.158,Isotopic label,H(1)2H(4)C(2)N(1)O(-1)S(1),0.0,,63.044462,792,True,63.044462,0.0,63.044462,0.0,AEC-MAEC:2H(4)@S
Ahx2+Hsl@Any C-term,309.4039,Non-standard residue,H(27)C(16)N(3)O(3),0.0,,309.205242,1015,True,309.205242,0.0,309.205226,0.0,Ahx2+Hsl@Any C-term
Ala->Arg@A,85.1078,AA substitution,H(7)C(3)N(3)O(0)S(0),0.0,,85.063997,1052,True,85.063997,0.0,85.063992,0.0,Ala->Arg@A


In [None]:
# Keep only the upper-case entries, i.e. drop the lower-case copies
# (rows with `upper_case_AA` False), then rebuild the lookup dicts.
MOD_DF = MOD_DF[MOD_DF['upper_case_AA']]
_update_all_by_MOD_DF()
pd.DataFrame.from_dict(MOD_INFO_DICT, orient='index')

Unnamed: 0,avge_mass,classification,composition,modloss,modloss_composition,mono_mass,unimod_id,upper_case_AA,unimod_mass,unimod_modloss,mass,modloss_importance,name
GlyGly@K,114.042927,Post-translational,H(6)C(4)N(2)O(2),114.042922,H(6)C(4)N(2)O(2),114.042927,-1,True,114.042927,114.042927,114.042922,1e6,GlyGly@K
15N-oxobutanoic@C^Any N-term,-18.023900,Artefact,H(-3)15N(-1),0.000000,,-18.023584,1419,True,-18.023584,0.000000,-18.023583,0,15N-oxobutanoic@C^Any N-term
15N-oxobutanoic@S^Protein N-term,-18.023900,Post-translational,H(-3)15N(-1),0.000000,,-18.023584,1419,True,-18.023584,0.000000,-18.023583,0,15N-oxobutanoic@S^Protein N-term
15N-oxobutanoic@T^Protein N-term,-18.023900,Post-translational,H(-3)15N(-1),0.000000,,-18.023584,1419,True,-18.023584,0.000000,-18.023583,0,15N-oxobutanoic@T^Protein N-term
2-dimethylsuccinyl@C,144.125300,Chemical derivative,H(8)C(6)O(4),0.000000,,144.042259,1262,True,144.042259,0.000000,144.042253,0,2-dimethylsuccinyl@C
...,...,...,...,...,...,...,...,...,...,...,...,...,...
spermidine@Q,128.215300,Chemical derivative,H(16)C(7)N(2),0.000000,,128.131349,1421,True,128.131349,0.000000,128.131340,0,spermidine@Q
spermine@Q,185.309700,Chemical derivative,H(23)C(10)N(3),0.000000,,185.189198,1420,True,185.189198,0.000000,185.189185,0,spermine@Q
sulfo+amino@Y,95.077800,Chemical derivative,H(1)N(1)O(3)S(1),0.000000,,94.967714,997,True,94.967714,0.000000,94.967710,0,sulfo+amino@Y
thioacylPA@K,159.206200,Chemical derivative,H(9)C(6)N(1)O(2)S(1),0.000000,,159.035399,967,True,159.035399,0.000000,159.035393,0,thioacylPA@K


In [None]:
# we only need PTMs: keep the listed classifications (upper-case entries only),
# then rebuild the lookup dicts from the filtered table.
MOD_DF = MOD_DF[
    MOD_DF['classification'].isin([
        'Post-translational',
        'O-linked glycosylation',
        'AA substitution',
        'Multiple',
        'Non-standard residue',
        'Pre-translational',
    ])
    & MOD_DF['upper_case_AA']
]
_update_all_by_MOD_DF()
pd.DataFrame.from_dict(MOD_INFO_DICT, orient='index')

Unnamed: 0,avge_mass,classification,composition,modloss,modloss_composition,mono_mass,unimod_id,upper_case_AA,unimod_mass,unimod_modloss,mass,modloss_importance,name
GlyGly@K,114.042927,Post-translational,H(6)C(4)N(2)O(2),114.042922,H(6)C(4)N(2)O(2),114.042927,-1,True,114.042927,114.042927,114.042922,1e6,GlyGly@K
15N-oxobutanoic@S^Protein N-term,-18.023900,Post-translational,H(-3)15N(-1),0.000000,,-18.023584,1419,True,-18.023584,0.000000,-18.023583,0,15N-oxobutanoic@S^Protein N-term
15N-oxobutanoic@T^Protein N-term,-18.023900,Post-translational,H(-3)15N(-1),0.000000,,-18.023584,1419,True,-18.023584,0.000000,-18.023583,0,15N-oxobutanoic@T^Protein N-term
3-deoxyglucosone@R,144.125300,Multiple,H(8)C(6)O(4),0.000000,,144.042259,949,True,144.042259,0.000000,144.042253,0,3-deoxyglucosone@R
3-phosphoglyceryl@K,168.042000,Post-translational,H(5)C(3)O(6)P(1),0.000000,,167.982375,1387,True,167.982375,0.000000,167.982370,0,3-phosphoglyceryl@K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
pyrophospho@T,159.959800,Post-translational,H(2)O(6)P(2),176.935397,H(3)O(7)P(2),159.932662,898,True,159.932662,176.935402,159.932658,0,pyrophospho@T
s-GlcNAc@S,283.255700,O-linked glycosylation,H(13)C(8)N(1)O(8)S(1),283.036176,H(13)C(8)N(1)O(8)S(1),283.036187,1412,True,283.036187,283.036187,283.036176,0,s-GlcNAc@S
s-GlcNAc@T,283.255700,O-linked glycosylation,H(13)C(8)N(1)O(8)S(1),283.036176,H(13)C(8)N(1)O(8)S(1),283.036187,1412,True,283.036187,283.036187,283.036176,0,s-GlcNAc@T
serotonylation@Q,159.184600,Post-translational,H(9)C(10)N(1)O(1),0.000000,,159.068414,1992,True,159.068414,0.000000,159.068409,0,serotonylation@Q
