In [None]:
#---#| default_exp constants.modification

# Modification information

The default modification TSV is stored in `alphabase/constants/const_files/modification.tsv`. Please check it to add more modifications. 

First, we load `modification.tsv` into `MOD_DF`. 

Then, we extract information of `MOD_CHEM` (dict), `MOD_MASS` (dict), `MOD_LOSS_MASS` (dict), `MOD_INFO_DICT` (dict) ... from `MOD_DF`. This step is done in `update_all_by_MOD_DF`.

All these steps are done by `load_mod_df`.

In [None]:
from alphabase.constants.modification import *
import alphabase.constants.modification as modification

In [None]:
# Display the modification table currently held by the `modification` module.
modification.MOD_DF

Unnamed: 0_level_0,mod_name,avge_mass,classification,composition,modloss_composition,mono_mass,unimod_id,unimod_mass,unimod_modloss,modloss_importance,mass,modloss_original,modloss,lower_case_AA
mod_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
GlyGly@K,GlyGly@K,114.042927,Post-translational,H(6)C(4)N(2)O(2),H(6)C(4)N(2)O(2),114.042927,121,114.042927,114.042927,1000000.0,114.042927,114.042927,114.042927,False
15N-oxobutanoic@C^Any N-term,15N-oxobutanoic@C^Any N-term,-18.023900,Artefact,H(-3)15N(-1),,-18.023584,1419,-18.023584,0.000000,0.0,-18.023584,0.000000,0.000000,False
15N-oxobutanoic@S^Protein N-term,15N-oxobutanoic@S^Protein N-term,-18.023900,Post-translational,H(-3)15N(-1),,-18.023584,1419,-18.023584,0.000000,0.0,-18.023584,0.000000,0.000000,False
15N-oxobutanoic@T^Protein N-term,15N-oxobutanoic@T^Protein N-term,-18.023900,Post-translational,H(-3)15N(-1),,-18.023584,1419,-18.023584,0.000000,0.0,-18.023584,0.000000,0.000000,False
2-dimethylsuccinyl@C,2-dimethylsuccinyl@C,144.125300,Chemical derivative,H(8)C(6)O(4),,144.042259,1262,144.042259,0.000000,0.0,144.042259,0.000000,0.000000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
spermidine@q,spermidine@q,128.215300,Chemical derivative,H(16)C(7)N(2),,128.131349,1421,128.131349,0.000000,0.0,128.131349,0.000000,0.000000,True
spermine@q,spermine@q,185.309700,Chemical derivative,H(23)C(10)N(3),,185.189198,1420,185.189198,0.000000,0.0,185.189198,0.000000,0.000000,True
sulfo+amino@y,sulfo+amino@y,95.077800,Chemical derivative,H(1)N(1)O(3)S(1),,94.967714,997,94.967714,0.000000,0.0,94.967714,0.000000,0.000000,True
thioacylPA@k,thioacylPA@k,159.206200,Chemical derivative,H(9)C(6)N(1)O(2)S(1),,159.035399,967,159.035399,0.000000,0.0,159.035400,0.000000,0.000000,True


### With NIST elements, nearly all calculated masses now match `unimod_mass`; the check below prints any that differ by more than 1e-5.

In [None]:
#| hide
# Print every modification whose formula-derived `mass` is more than 1e-5
# away from the `unimod_mass` column.
mass_columns = MOD_DF[['mod_name','unimod_mass','mass']]
for mod_label, unimod_value, formula_value in mass_columns.values:
    deviates = abs(unimod_value-formula_value) > 1e-5
    if deviates:
        print(f"{mod_label}: unimod mod={unimod_value}, formula mass={formula_value}")

Delta:Hg(1)@C: unimod mod=201.970617, formula mass=201.9706434
Delta:Hg(1)@c: unimod mod=201.970617, formula mass=201.9706434
Delta:Hg(1)@c: unimod mod=201.970617, formula mass=201.9706434
Delta:Hg(1)@c: unimod mod=201.970617, formula mass=201.9706434


# Mod site representation
* `site=0` refers to an N-term modification
* `site=-1` refers to a C-term modification
* `1<=site<=peplen` refers to a normal modification

For example: \_0A1B2C3D4E5F6G7H8I9J10K11\_-1

`calc_modification_mass` and `calc_modification_mass_sum` are the base functions for calculating masses. For large sets of peptides, however, we recommend using `calc_mod_masses_for_same_len_seqs` instead of `calc_modification_mass`.

In [None]:
#| hide
# Example peptide: site 0 is the N-term, sites 4 and 8 point at the C and M residues.
seq = 'AGHCEWQMK'
mod_names = ['Acetyl@Protein N-term', 'Carbamidomethyl@C', 'Oxidation@M']
mod_sites = [0, 4, 8]

# One expected mass per residue; unmodified positions are zero.
expected_mod_masses = [42.01056468, 0, 0, 57.02146372, 0, 0, 0, 15.99491462, 0]
observed_mod_masses = calc_modification_mass(len(seq), mod_names, mod_sites)
assert np.allclose(observed_mod_masses, expected_mod_masses)

In [None]:
#| hide
mod_names = ['Oxidation@M', 'Phospho@S', 'Carbamidomethyl@C']
mod_sites = [0, 4, 8]

# The two non-zero loss masses that occur in the expected arrays below.
loss_dropped_at_level_1 = 63.99828592
loss_kept_at_level_1 = 97.97689557

# Importance level 0: both loss masses are expected in the result.
load_mod_df(modloss_importance_level=0)
modloss_level_0 = calc_modloss_mass(10, mod_names, mod_sites, True)
assert np.allclose(
    modloss_level_0,
    [loss_dropped_at_level_1]*3 + [loss_kept_at_level_1]*6
)

# Importance level 1, reloading `modification.tsv` from CONST_FILE_FOLDER explicitly:
# the first three entries are expected to be zero.
tsv_path = os.path.join(CONST_FILE_FOLDER, 'modification.tsv')
load_mod_df(tsv=tsv_path, modloss_importance_level=1)
modloss_level_1 = calc_modloss_mass(10, mod_names, mod_sites, True)
assert np.allclose(
    modloss_level_1,
    [0]*3 + [loss_kept_at_level_1]*6
)

# Same inputs with the last argument switched to False.
modloss_level_1_flag_off = calc_modloss_mass(10, mod_names, mod_sites, False)
assert np.allclose(
    modloss_level_1_flag_off,
    [loss_kept_at_level_1]*3 + [0]*6
)

### Note that `calc_modloss_mass` is a little bit time-consuming
`%timeit calc_modloss_mass(10, mod_names, mod_sites, False)`

`Results (12 seconds in total): 12.6 µs ± 96.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)`

In [None]:
# Show one example modification per classification.
# Read the table through the module attribute: `load_mod_df` was called in an
# earlier cell, and the name bound by `from ... import *` may still point at the
# table from import time (NOTE(review): assumes `load_mod_df` rebinds
# `modification.MOD_DF` -- confirm).
modification.MOD_DF.drop_duplicates('classification')

Unnamed: 0_level_0,mod_name,avge_mass,classification,composition,modloss_composition,mono_mass,unimod_id,unimod_mass,unimod_modloss,modloss_importance,mass,modloss_original,modloss,lower_case_AA
mod_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
GlyGly@K,GlyGly@K,114.042927,Post-translational,H(6)C(4)N(2)O(2),H(6)C(4)N(2)O(2),114.042927,121,114.042927,114.042927,1000000.0,114.042927,114.042927,114.042927,False
15N-oxobutanoic@C^Any N-term,15N-oxobutanoic@C^Any N-term,-18.0239,Artefact,H(-3)15N(-1),,-18.023584,1419,-18.023584,0.0,0.0,-18.023584,0.0,0.0,False
2-dimethylsuccinyl@C,2-dimethylsuccinyl@C,144.1253,Chemical derivative,H(8)C(6)O(4),,144.042259,1262,144.042259,0.0,0.0,144.042259,0.0,0.0,False
3-deoxyglucosone@R,3-deoxyglucosone@R,144.1253,Multiple,H(8)C(6)O(4),,144.042259,949,144.042259,0.0,0.0,144.042259,0.0,0.0,False
ADP-Ribosyl@C,ADP-Ribosyl@C,541.3005,Other glycosylation,H(21)C(15)N(5)O(13)P(2),,541.06111,213,541.06111,0.0,0.0,541.06111,0.0,0.0,False
ADP-Ribosyl@N,ADP-Ribosyl@N,541.3005,N-linked glycosylation,H(21)C(15)N(5)O(13)P(2),H(21)C(15)N(5)O(13)P(2),541.06111,213,541.06111,541.06111,0.0,541.06111,541.06111,0.0,False
ADP-Ribosyl@S,ADP-Ribosyl@S,541.3005,O-linked glycosylation,H(21)C(15)N(5)O(13)P(2),H(21)C(15)N(5)O(13)P(2),541.06111,213,541.06111,541.06111,0.0,541.06111,541.06111,0.0,False
AEC-MAEC:2H(4)@S,AEC-MAEC:2H(4)@S,63.158,Isotopic label,H(1)2H(4)C(2)N(1)O(-1)S(1),,63.044462,792,63.044462,0.0,0.0,63.044463,0.0,0.0,False
Ahx2+Hsl@Any C-term,Ahx2+Hsl@Any C-term,309.4039,Non-standard residue,H(27)C(16)N(3)O(3),,309.205242,1015,309.205242,0.0,0.0,309.205242,0.0,0.0,False
Ala->Arg@A,Ala->Arg@A,85.1078,AA substitution,H(7)C(3)N(3)O(0)S(0),,85.063997,1052,85.063997,0.0,0.0,85.063997,0.0,0.0,False


## We can update the modification list for different requirements, for example:

In [None]:
# Extend the modification table for lower-case amino acids
# (presumably adds entries flagged `lower_case_AA == True`, as the output suggests).
add_modifications_for_lower_case_AA()
# Display the updated table via the module attribute.
modification.MOD_DF

Unnamed: 0_level_0,mod_name,avge_mass,classification,composition,modloss_composition,mono_mass,unimod_id,unimod_mass,unimod_modloss,modloss_importance,mass,modloss_original,modloss,lower_case_AA
mod_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
GlyGly@K,GlyGly@K,114.042927,Post-translational,H(6)C(4)N(2)O(2),H(6)C(4)N(2)O(2),114.042927,121,114.042927,114.042927,1000000.0,114.042927,114.042927,114.042927,False
15N-oxobutanoic@C^Any N-term,15N-oxobutanoic@C^Any N-term,-18.023900,Artefact,H(-3)15N(-1),,-18.023584,1419,-18.023584,0.000000,0.0,-18.023584,0.000000,0.000000,False
15N-oxobutanoic@S^Protein N-term,15N-oxobutanoic@S^Protein N-term,-18.023900,Post-translational,H(-3)15N(-1),,-18.023584,1419,-18.023584,0.000000,0.0,-18.023584,0.000000,0.000000,False
15N-oxobutanoic@T^Protein N-term,15N-oxobutanoic@T^Protein N-term,-18.023900,Post-translational,H(-3)15N(-1),,-18.023584,1419,-18.023584,0.000000,0.0,-18.023584,0.000000,0.000000,False
2-dimethylsuccinyl@C,2-dimethylsuccinyl@C,144.125300,Chemical derivative,H(8)C(6)O(4),,144.042259,1262,144.042259,0.000000,0.0,144.042259,0.000000,0.000000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
spermidine@q,spermidine@q,128.215300,Chemical derivative,H(16)C(7)N(2),,128.131349,1421,128.131349,0.000000,0.0,128.131349,0.000000,0.000000,True
spermine@q,spermine@q,185.309700,Chemical derivative,H(23)C(10)N(3),,185.189198,1420,185.189198,0.000000,0.0,185.189198,0.000000,0.000000,True
sulfo+amino@y,sulfo+amino@y,95.077800,Chemical derivative,H(1)N(1)O(3)S(1),,94.967714,997,94.967714,0.000000,0.0,94.967714,0.000000,0.000000,True
thioacylPA@k,thioacylPA@k,159.206200,Chemical derivative,H(9)C(6)N(1)O(2)S(1),,159.035399,967,159.035399,0.000000,0.0,159.035400,0.000000,0.000000,True


In [None]:

# Classifications to keep in this example.
ptm_classifications = [
    'Post-translational','O-linked glycosylation','AA substitution',
    'Multiple','Non-standard residue','Pre-translational',
]
# Restrict the module-level table to lower-case-AA entries of those classifications.
modification.MOD_DF = modification.MOD_DF[
    modification.MOD_DF['classification'].isin(ptm_classifications)
    & modification.MOD_DF['lower_case_AA']
] # we only need PTMs
# Rebuild the derived lookups from the filtered table.
update_all_by_MOD_DF()
# MOD_INFO_DICT is also updated
# `from_dict` is a classmethod, so call it on the class instead of a throwaway empty frame.
pd.DataFrame.from_dict(MOD_INFO_DICT, orient='index')

Unnamed: 0,mod_name,avge_mass,classification,composition,modloss_composition,mono_mass,unimod_id,unimod_mass,unimod_modloss,modloss_importance,mass,modloss_original,modloss,lower_case_AA
GlyGly@k,GlyGly@k,114.042927,Post-translational,H(6)C(4)N(2)O(2),H(6)C(4)N(2)O(2),114.042927,121,114.042927,114.042927,1000000.0,114.042927,114.042927,114.042927,True
15N-oxobutanoic@s^Protein N-term,15N-oxobutanoic@s^Protein N-term,-18.023900,Post-translational,H(-3)15N(-1),,-18.023584,1419,-18.023584,0.000000,0.0,-18.023584,0.000000,0.000000,True
15N-oxobutanoic@t^Protein N-term,15N-oxobutanoic@t^Protein N-term,-18.023900,Post-translational,H(-3)15N(-1),,-18.023584,1419,-18.023584,0.000000,0.0,-18.023584,0.000000,0.000000,True
3-deoxyglucosone@r,3-deoxyglucosone@r,144.125300,Multiple,H(8)C(6)O(4),,144.042259,949,144.042259,0.000000,0.0,144.042259,0.000000,0.000000,True
3-phosphoglyceryl@k,3-phosphoglyceryl@k,168.042000,Post-translational,H(5)C(3)O(6)P(1),,167.982375,1387,167.982375,0.000000,0.0,167.982375,0.000000,0.000000,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pyrophospho@t,pyrophospho@t,159.959800,Post-translational,H(2)O(6)P(2),H(3)O(7)P(2),159.932662,898,159.932662,176.935402,0.0,159.932662,176.935401,0.000000,True
s-GlcNAc@s,s-GlcNAc@s,283.255700,O-linked glycosylation,H(13)C(8)N(1)O(8)S(1),H(13)C(8)N(1)O(8)S(1),283.036187,1412,283.036187,283.036187,0.0,283.036188,283.036188,0.000000,True
s-GlcNAc@t,s-GlcNAc@t,283.255700,O-linked glycosylation,H(13)C(8)N(1)O(8)S(1),H(13)C(8)N(1)O(8)S(1),283.036187,1412,283.036187,283.036187,0.0,283.036188,283.036188,0.000000,True
serotonylation@q,serotonylation@q,159.184600,Post-translational,H(9)C(10)N(1)O(1),,159.068414,1992,159.068414,0.000000,0.0,159.068414,0.000000,0.000000,True


In [None]:
#| hide
# Reload the table, then add the lower-case amino-acid modifications again.
load_mod_df()
add_modifications_for_lower_case_AA()
all_mods = modification.MOD_DF
# Keep only the rows flagged as lower-case amino-acid entries.
MOD_DF = all_mods[all_mods['lower_case_AA']]
# The character right after '@' in each kept name must be lower case.
site_is_lower = MOD_DF['mod_name'].apply(lambda name: name[name.find('@')+1].islower())
assert site_is_lower.all()

In [None]:
#| hide
# Add two modifications given as tuples; the second one carries a third element
# (a second composition string) in addition to name and composition.
add_new_modifications([
    ("Hello@S","H(2)"),
    ("World@S","O(10)","O(3)")
])
assert (modification.MOD_DF.classification=='User-added').sum()==2
# `x in Series` tests the index labels, not the column values, so check the
# `mod_name` values and the index explicitly.
assert 'Hello@S' in modification.MOD_DF.mod_name.values
assert 'Hello@S' in modification.MOD_DF.index
assert 'World@S' in modification.MOD_DF.mod_name.values
assert 'World@S' in modification.MOD_DF.index
assert modification.MOD_DF.loc['World@S','modloss'] > 0
assert modification.MOD_DF.loc['World@S','modloss_importance'] > 0
# The derived lookups must contain the new entries as well.
assert 'Hello@S' in MOD_formula
assert 'World@S' in MOD_MASS

In [None]:
# Add two more modifications, this time as a dict keyed by modification name.
add_new_modifications({
    "Hi@S":{'composition':"H(2)"},
    "AlphaX@S":{'composition':"O(10)",'modloss_composition':"O(3)"}
})
assert (modification.MOD_DF.classification=='User-added').sum()==4
# `x in Series` tests the index labels, so use `.values` to check the
# `mod_name` column itself; the index is checked separately.
assert 'Hi@S' in modification.MOD_DF.mod_name.values
assert 'Hi@S' in modification.MOD_DF.index
assert 'AlphaX@S' in modification.MOD_DF.mod_name.values
assert 'AlphaX@S' in modification.MOD_DF.index
assert modification.MOD_DF.loc['AlphaX@S','modloss'] > 0
assert modification.MOD_DF.loc['AlphaX@S','modloss_importance'] > 0
# The derived lookups must contain the new entries as well.
assert 'Hi@S' in MOD_formula
assert 'AlphaX@S' in MOD_MASS
assert 'AlphaX@S' in MOD_LOSS_IMPORTANCE