In [None]:
#---#| default_exp constants.modification

# Modification information

The default modification TSV is stored in `alphabase/constants/const_files/modification.tsv`. Please check it to add more modifications. 

First, we load `modification.tsv` into `MOD_DF`. 

Then, we extract information of `MOD_CHEM` (dict), `MOD_MASS` (dict), `MOD_LOSS_MASS` (dict), `MOD_INFO_DICT` (dict) ... from `MOD_DF`. This step is done in `update_all_by_MOD_DF`.

All these steps are done by `load_mod_df`.

In [None]:
from alphabase.constants.modification import * # TODO get rid of this
import alphabase.constants.modification as modification

In [None]:
# Show the modification table held in `modification.MOD_DF`.
modification.MOD_DF

Unnamed: 0_level_0,mod_name,unimod_mass,unimod_avge_mass,composition,unimod_modloss,modloss_composition,classification,unimod_id,modloss_importance,mass,modloss_original,modloss
mod_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Acetyl@T,Acetyl@T,42.010565,42.0367,H(2)C(2)O(1),0.0,,Post-translational,1,0.0,42.010565,0.0,0.0
Acetyl@Protein_N-term,Acetyl@Protein_N-term,42.010565,42.0367,H(2)C(2)O(1),0.0,,Post-translational,1,0.0,42.010565,0.0,0.0
Acetyl@S,Acetyl@S,42.010565,42.0367,H(2)C(2)O(1),0.0,,Post-translational,1,0.0,42.010565,0.0,0.0
Acetyl@C,Acetyl@C,42.010565,42.0367,H(2)C(2)O(1),0.0,,Post-translational,1,0.0,42.010565,0.0,0.0
Acetyl@Any_N-term,Acetyl@Any_N-term,42.010565,42.0367,H(2)C(2)O(1),0.0,,Multiple,1,0.0,42.010565,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
TMTpro_zero@K,TMTpro_zero@K,295.189592,295.3773,H(25)C(15)N(3)O(3),0.0,,Chemical derivative,2017,0.0,295.189592,0.0,0.0
TMTpro_zero@T,TMTpro_zero@T,295.189592,295.3773,H(25)C(15)N(3)O(3),0.0,,Chemical derivative,2017,0.0,295.189592,0.0,0.0
Andro-H2O@C,Andro-H2O@C,332.198760,332.4339,H(28)C(20)O(4),0.0,,Chemical derivative,2025,0.0,332.198759,0.0,0.0
His+O(2)@H,His+O(2)@H,169.048741,169.1381,H(7)C(6)N(3)O(3),0.0,,Post-translational,2027,0.0,169.048741,0.0,0.0


### With NIST elements, nearly all calculated masses now match `unimod_mass` (within 1e-5); the cell below prints any exceptions.

In [None]:
#| hide
# List each modification whose formula-derived `mass` deviates from
# `unimod_mass` by more than the tolerance.
mass_tolerance = 1e-5
mass_columns = ['mod_name', 'unimod_mass', 'mass']
for mod, unimod_mass, mass in MOD_DF[mass_columns].values:
    deviation = abs(unimod_mass - mass)
    if deviation > mass_tolerance:
        print(f"{mod}: unimod mod={unimod_mass}, formula mass={mass}")

Delta:Hg(1)@C: unimod mod=201.970617, formula mass=201.9706434


# Mod site representation
* `site=0` refers to an N-term modification
* `site=-1` refers to a C-term modification
* `1<=site<=peplen` refers to a normal modification

For example: \_0A1B2C3D4E5F6G7H8I9J10K11\_-1

`calc_modification_mass` and `calc_modification_mass_sum` are the base functions for calculating masses. For large sets of peptides, however, we recommend using `calc_mod_masses_for_same_len_seqs` instead of `calc_modification_mass`.

In [None]:
#| hide
# Check the per-position modification masses of one example peptide.
# The expected values place the site-0 (N-term) mass on the first position
# and the masses for sites 4 and 8 on the 4th and 8th positions.
seq = 'AGHCEWQMK'
mod_names = ['Acetyl@Protein_N-term', 'Carbamidomethyl@C', 'Oxidation@M']
mod_sites = [0, 4, 8]

expected_masses = [42.01056468, 0, 0, 57.02146372, 0, 0, 0, 15.99491462, 0]
observed_masses = calc_modification_mass(len(seq), mod_names, mod_sites)
assert np.allclose(observed_masses, expected_masses)

In [None]:
#| hide
# Check `calc_modloss_mass` under two `modloss_importance_level` settings and
# for both values of its fourth positional argument.
mod_names = ['Oxidation@M', 'Phospho@S', 'Carbamidomethyl@C']
mod_sites = [0, 4, 8]
peptide_length = 10

# Expected results: one value per position, peptide_length - 1 values in total.
expected_level0_true = [63.99828592] * 3 + [97.97689557] * 6
expected_level1_true = [0] * 3 + [97.97689557] * 6
expected_level1_false = [97.97689557] * 3 + [0] * 6

# Level 0: the 63.998... loss is reported for the first three positions.
load_mod_df(modloss_importance_level=0)
observed_level0_true = calc_modloss_mass(peptide_length, mod_names, mod_sites, True)
assert np.allclose(observed_level0_true, expected_level0_true)

# Level 1 (reloading the default TSV explicitly): that loss is no longer reported.
default_tsv = os.path.join(CONST_FILE_FOLDER, 'modification.tsv')
load_mod_df(tsv=default_tsv, modloss_importance_level=1)
observed_level1_true = calc_modloss_mass(peptide_length, mod_names, mod_sites, True)
assert np.allclose(observed_level1_true, expected_level1_true)

# Same settings with the fourth argument set to False.
observed_level1_false = calc_modloss_mass(peptide_length, mod_names, mod_sites, False)
assert np.allclose(observed_level1_false, expected_level1_false)

### Note that get_modloss_mass is a little bit time-consuming
`%timeit get_modloss_mass(10, mod_names, mod_sites, False)`

`Results (12 seconds in total): 12.6 µs ± 96.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)`

In [None]:
# Show one example row per distinct `classification` value.
# Read the table through the module attribute, as the other display cells do:
# the star-imported `MOD_DF` name is bound once at import time and does not
# follow later rebinding of `modification.MOD_DF`.
modification.MOD_DF.drop_duplicates('classification')

Unnamed: 0_level_0,mod_name,unimod_mass,unimod_avge_mass,composition,unimod_modloss,modloss_composition,classification,unimod_id,modloss_importance,mass,modloss_original,modloss
mod_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Acetyl@T,Acetyl@T,42.010565,42.0367,H(2)C(2)O(1),0.0,,Post-translational,1,0.0,42.010565,0.0,0.0
Acetyl@Any_N-term,Acetyl@Any_N-term,42.010565,42.0367,H(2)C(2)O(1),0.0,,Multiple,1,0.0,42.010565,0.0,0.0
Acetyl@Y,Acetyl@Y,42.010565,42.0367,H(2)C(2)O(1),0.0,,Chemical derivative,1,0.0,42.010565,0.0,0.0
Acetyl@R,Acetyl@R,42.010565,42.0367,H(2)C(2)O(1),0.0,,Artefact,1,0.0,42.010565,0.0,0.0
ICAT-G@C,ICAT-G@C,486.251206,486.6253,H(38)C(22)N(4)O(6)S(1),0.0,,Isotopic label,8,0.0,486.251206,0.0,0.0
Oxidation@G^Any_C-term,Oxidation@G^Any_C-term,15.994915,15.9994,O(1),0.0,,Pre-translational,35,0.0,15.994915,0.0,0.0
Hex@C,Hex@C,162.052824,162.1406,H(10)C(6)O(5),0.0,,Other glycosylation,41,0.0,162.052823,0.0,0.0
Hex@T,Hex@T,162.052824,162.1406,H(10)C(6)O(5),162.052824,H(10)C(6)O(5),O-linked glycosylation,41,0.0,162.052823,162.052823,0.0
Hex@N,Hex@N,162.052824,162.1406,H(10)C(6)O(5),162.052824,H(10)C(6)O(5),N-linked glycosylation,41,0.0,162.052823,162.052823,0.0
His->Asn@H,His->Asn@H,-23.015984,-23.0366,H(-1)C(-2)N(-1)O(1),0.0,,AA substitution,348,0.0,-23.015984,0.0,0.0


## We can update the modification list for different requirements, for example:

In [None]:
# Add lower-case amino-acid site variants to the modification table, then show
# it; the displayed table has an extra `lower_case_AA` flag column.
add_modifications_for_lower_case_AA()
modification.MOD_DF

Unnamed: 0_level_0,mod_name,unimod_mass,unimod_avge_mass,composition,unimod_modloss,modloss_composition,classification,unimod_id,modloss_importance,mass,modloss_original,modloss,lower_case_AA
mod_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Acetyl@T,Acetyl@T,42.010565,42.0367,H(2)C(2)O(1),0.0,,Post-translational,1,0.0,42.010565,0.0,0.0,False
Acetyl@Protein_N-term,Acetyl@Protein_N-term,42.010565,42.0367,H(2)C(2)O(1),0.0,,Post-translational,1,0.0,42.010565,0.0,0.0,False
Acetyl@S,Acetyl@S,42.010565,42.0367,H(2)C(2)O(1),0.0,,Post-translational,1,0.0,42.010565,0.0,0.0,False
Acetyl@C,Acetyl@C,42.010565,42.0367,H(2)C(2)O(1),0.0,,Post-translational,1,0.0,42.010565,0.0,0.0,False
Acetyl@Any_N-term,Acetyl@Any_N-term,42.010565,42.0367,H(2)C(2)O(1),0.0,,Multiple,1,0.0,42.010565,0.0,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TMTpro_zero@k,TMTpro_zero@k,295.189592,295.3773,H(25)C(15)N(3)O(3),0.0,,Chemical derivative,2017,0.0,295.189592,0.0,0.0,True
TMTpro_zero@t,TMTpro_zero@t,295.189592,295.3773,H(25)C(15)N(3)O(3),0.0,,Chemical derivative,2017,0.0,295.189592,0.0,0.0,True
Andro-H2O@c,Andro-H2O@c,332.198760,332.4339,H(28)C(20)O(4),0.0,,Chemical derivative,2025,0.0,332.198759,0.0,0.0,True
His+O(2)@h,His+O(2)@h,169.048741,169.1381,H(7)C(6)N(3)O(3),0.0,,Post-translational,2027,0.0,169.048741,0.0,0.0,True


In [None]:

# Keep only the PTM-like classifications, restricted to lower-case-AA entries.
ptm_classifications = [
    'Post-translational', 'O-linked glycosylation', 'AA substitution',
    'Multiple', 'Non-standard residue', 'Pre-translational',
]
is_ptm = modification.MOD_DF['classification'].isin(ptm_classifications)
is_lower_case = modification.MOD_DF['lower_case_AA']
modification.MOD_DF = modification.MOD_DF[is_ptm & is_lower_case] # we only need PTMs

# Re-derive the lookup structures from the filtered table.
update_all_by_MOD_DF()

# MOD_INFO_DICT is also updated.
# `from_dict` is a classmethod, so call it on the class rather than on a
# throwaway empty DataFrame instance.
pd.DataFrame.from_dict(MOD_INFO_DICT, orient='index')

Unnamed: 0,mod_name,unimod_mass,unimod_avge_mass,composition,unimod_modloss,modloss_composition,classification,unimod_id,modloss_importance,mass,modloss_original,modloss,lower_case_AA
Acetyl@t,Acetyl@t,42.010565,42.0367,H(2)C(2)O(1),0.000000,,Post-translational,1,0.0,42.010565,0.000000,0.0,True
Acetyl@s,Acetyl@s,42.010565,42.0367,H(2)C(2)O(1),0.000000,,Post-translational,1,0.0,42.010565,0.000000,0.0,True
Acetyl@c,Acetyl@c,42.010565,42.0367,H(2)C(2)O(1),0.000000,,Post-translational,1,0.0,42.010565,0.000000,0.0,True
Acetyl@k,Acetyl@k,42.010565,42.0367,H(2)C(2)O(1),0.000000,,Multiple,1,0.0,42.010565,0.000000,0.0,True
Biotin@k,Biotin@k,226.077598,226.2954,H(14)C(10)N(2)O(2)S(1),0.000000,,Post-translational,3,0.0,226.077599,0.000000,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Hex(3)HexNAc(3)NeuAc(3)@t,Hex(3)HexNAc(3)NeuAc(3)@t,1968.682838,1969.7631,H(120)C(75)N(6)O(54),1968.682838,H(120)C(75)N(6)O(54),O-linked glycosylation,1968,0.0,1968.682837,1968.682837,0.0,True
CIGG@k,CIGG@k,330.136176,330.4032,H(22)C(13)N(4)O(4)S(1),0.000000,,Post-translational,1990,0.0,330.136176,0.000000,0.0,True
GNLLFLACYCIGG@k,GNLLFLACYCIGG@k,1324.630800,1325.5980,H(92)C(61)N(14)O(15)S(2),0.000000,,Post-translational,1991,0.0,1324.630801,0.000000,0.0,True
His+O(2)@h,His+O(2)@h,169.048741,169.1381,H(7)C(6)N(3)O(3),0.000000,,Post-translational,2027,0.0,169.048741,0.000000,0.0,True


In [None]:
#| hide
# Reload the modification table with default arguments, then add the
# lower-case amino-acid site variants again.
load_mod_df()
add_modifications_for_lower_case_AA()
# NOTE: this rebinds the notebook-level `MOD_DF` name to a filtered frame.
MOD_DF = modification.MOD_DF
MOD_DF = MOD_DF[
    MOD_DF['lower_case_AA']
] # keep only the rows flagged as lower-case-AA modifications
# Every remaining mod_name must have a lower-case character right after '@'.
assert MOD_DF['mod_name'].apply(lambda x: x[x.find('@')+1].islower()).all()

In [None]:
#| hide
# Count the 'User-added' rows before adding anything; the next cell reuses
# `prev_value` as its baseline too.
prev_value = (modification.MOD_DF.classification=='User-added').sum()
# List form: (mod_name, composition[, modloss_composition]) tuples.
add_new_modifications([
    ("Hello@S","H(2)"),
    ("World@S","O(10)","O(3)")
])
assert (modification.MOD_DF.classification=='User-added').sum() - prev_value == 2
# `x in Series` tests the index labels, not the values, so check the
# `mod_name` column through `.values` and the index explicitly.
assert 'Hello@S' in modification.MOD_DF.mod_name.values
assert 'Hello@S' in modification.MOD_DF.index
assert 'World@S' in modification.MOD_DF.mod_name.values
assert 'World@S' in modification.MOD_DF.index
# A modification added with a modloss composition gets a positive modloss
# mass and a positive importance.
assert modification.MOD_DF.loc['World@S','modloss'] > 0
assert modification.MOD_DF.loc['World@S','modloss_importance'] > 0
# The derived lookup dicts must contain the new entries as well.
assert 'Hello@S' in MOD_Composition
assert 'World@S' in MOD_MASS

In [None]:
# Dict form: mod_name -> {'composition': ..., 'modloss_composition': ...}.
add_new_modifications({
    "Hi@S":{'composition':"H(2)"},
    "AlphaX@S":{'composition':"O(10)",'modloss_composition':"O(3)"}
})
# Two more user-added rows on top of the two added by the previous cell
# (`prev_value` is the baseline count taken there).
assert (modification.MOD_DF.classification=='User-added').sum() - prev_value == 4
# `x in Series` tests the index labels, not the values, so the column check
# goes through `.values`; the index is checked separately.
assert 'Hi@S' in modification.MOD_DF.mod_name.values
assert 'Hi@S' in modification.MOD_DF.index
assert 'AlphaX@S' in modification.MOD_DF.mod_name.values
assert 'AlphaX@S' in modification.MOD_DF.index
assert modification.MOD_DF.loc['AlphaX@S','modloss'] > 0
assert modification.MOD_DF.loc['AlphaX@S','modloss_importance'] > 0
# The derived lookup dicts must contain the new entries as well.
assert 'Hi@S' in MOD_Composition
assert 'AlphaX@S' in MOD_MASS
assert 'AlphaX@S' in MOD_LOSS_IMPORTANCE
# User-added modifications carry no UniMod reference mass.
assert modification.MOD_DF.loc['AlphaX@S','unimod_mass'] == 0

In [None]:
# Show the table again; the 'User-added' modifications appear at the end.
modification.MOD_DF

Unnamed: 0_level_0,mod_name,unimod_mass,unimod_avge_mass,composition,unimod_modloss,modloss_composition,classification,unimod_id,modloss_importance,mass,modloss_original,modloss,lower_case_AA
mod_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Acetyl@T,Acetyl@T,42.010565,42.0367,H(2)C(2)O(1),0.0,,Post-translational,1.0,0.000000e+00,42.010565,0.0,0.000000,False
Acetyl@Protein_N-term,Acetyl@Protein_N-term,42.010565,42.0367,H(2)C(2)O(1),0.0,,Post-translational,1.0,0.000000e+00,42.010565,0.0,0.000000,False
Acetyl@S,Acetyl@S,42.010565,42.0367,H(2)C(2)O(1),0.0,,Post-translational,1.0,0.000000e+00,42.010565,0.0,0.000000,False
Acetyl@C,Acetyl@C,42.010565,42.0367,H(2)C(2)O(1),0.0,,Post-translational,1.0,0.000000e+00,42.010565,0.0,0.000000,False
Acetyl@Any_N-term,Acetyl@Any_N-term,42.010565,42.0367,H(2)C(2)O(1),0.0,,Multiple,1.0,0.000000e+00,42.010565,0.0,0.000000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
GlyGly@k,GlyGly@k,114.042927,114.1026,H(6)C(4)N(2)O(2),0.0,,Post-translational,121.0,1.000000e+06,114.042927,0.0,0.000000,True
Hello@S,Hello@S,0.000000,0.0000,H(2),0.0,,User-added,0.0,0.000000e+00,2.015650,0.0,0.000000,0
World@S,World@S,0.000000,0.0000,O(10),0.0,O(3),User-added,0.0,1.000000e+100,159.949146,0.0,47.984744,0
Hi@S,Hi@S,0.000000,0.0000,H(2),0.0,,User-added,0.0,0.000000e+00,2.015650,0.0,0.000000,0
