In [None]:
import xml.etree.ElementTree as ET
import yaml
import pandas as pd

In [None]:
def save_yaml(filename, unimod):
    """Serialize ``unimod`` to YAML at ``filename``.

    The file is opened in text write mode, so existing content is replaced.
    """
    with open(filename, "w") as stream:
        yaml.dump(unimod, stream)


def get_composition(node, ns=None):
    """Concatenate the element counts under ``node`` into one formula string.

    Every direct child ``element`` tag contributes ``symbol(number)``, built
    from its ``symbol`` and ``number`` attributes, in document order
    (e.g. ``H(2)C(2)O(1)``).

    Parameters
    ----------
    node : xml.etree.ElementTree.Element
        Parent node whose ``element`` children are read.
    ns : str, optional
        Namespace prefix in ElementTree ``{uri}`` form. When omitted, the
        module-level ``xmlns`` is used, which keeps existing one-argument
        calls working unchanged.

    Returns
    -------
    str
        The concatenated composition, or an empty string when ``node`` has
        no matching ``element`` children.

    Raises
    ------
    KeyError
        If a child ``element`` lacks a ``symbol`` or ``number`` attribute.
    """
    if ns is None:
        # Looked up at call time, so the global only has to exist by then.
        ns = xmlns
    return "".join(
        f"{elem.attrib['symbol']}({elem.attrib['number']})"
        for elem in node.findall(f'{ns}element')
    )

In [1]:
# Parse the Unimod XML file and flatten it into `unimod`: a dict keyed by
# '<title>@<site>' holding one record per <specificity> of each <mod>.
xml = ET.parse('unimod.xml')
root = xml.getroot()

# Namespace prefix of every tag looked up below; get_composition() reads this
# module-level name as well.
xmlns = '{http://www.unimod.org/xmlns/schema/unimod_2}'
unimod = {}
for modifications in root.findall(f'{xmlns}modifications'):
    for mod in modifications.findall(f'{xmlns}mod'):
        modname = mod.attrib['title']
        # Named `unimod_id` so the builtin `id` is not shadowed.
        unimod_id = int(mod.attrib['record_id'])

        # Only the first <delta> is used. Fail loudly when it is missing
        # instead of reusing the values left over from the previous <mod>.
        delta = mod.find(f'{xmlns}delta')
        if delta is None:
            raise ValueError(f'Unimod entry {modname!r} has no <delta> element')
        unimod_mass = float(delta.attrib['mono_mass'])
        unimod_avge_mass = float(delta.attrib['avge_mass'])
        composition = get_composition(delta)

        for specificity in mod.findall(f'{xmlns}specificity'):
            pos = specificity.attrib['position']
            site = specificity.attrib['site']
            _class = specificity.attrib['classification']
            if site in ('N-term', 'C-term'):
                # Terminal site: the position string itself becomes the site.
                site = pos
            elif pos.startswith(('Any ', 'Protein ')):
                # Position starts with 'Any ' or 'Protein ': append it to the
                # site as '<site>^<position>'.
                site = f'{site}^{pos}'

            # Use the first <NeutralLoss> whose mono_mass is not the literal
            # string '0'; without one the record carries a 0.0 loss and an
            # empty loss composition.
            ptm_nl = 0.0
            ptm_nl_composition = ""
            for nl in specificity.findall(f'{xmlns}NeutralLoss'):
                if nl.attrib['mono_mass'] == '0':
                    continue
                ptm_nl = float(nl.attrib['mono_mass'])
                ptm_nl_composition = get_composition(nl)
                break

            # Key order matters: it becomes the DataFrame column order later.
            record = {
                'unimod_mass': unimod_mass,
                'unimod_avge_mass': unimod_avge_mass,
                'composition': composition,
                'unimod_modloss': ptm_nl,
                'modloss_composition': ptm_nl_composition,
                'classification': _class,
                'unimod_id': unimod_id,
            }
            mod_site = f'{modname}@{site}'
            # A repeated key overwrites the earlier record for that key.
            unimod[mod_site] = record

            if '~' in site:
                # Sites containing '~' are additionally registered under the
                # bare position, with an independent copy of the record.
                print(mod_site)
                unimod[f'{modname}@{pos}'] = dict(record)

In [16]:
# Build the modification table: one row per key of `unimod`, with spaces in
# the row labels replaced by underscores.
df = pd.DataFrame.from_dict(unimod, orient='index')
df.index = df.index.str.replace(" ", "_", regex=False)

# modloss_importance: 0.0 by default, 0.5 for any row with a non-empty
# modloss composition, then fixed overrides for three specific rows.
df['modloss_importance'] = 0.0
has_modloss = df['modloss_composition'] != ''
df.loc[has_modloss, 'modloss_importance'] = 0.5
for mod_key, importance in (
    ('Phospho@S', 1e8),
    ('Phospho@T', 1e7),
    ('GG@K', 1e6),
):
    df.loc[mod_key, 'modloss_importance'] = importance

# 'GlyGly@K' takes the values of the 'GG@K' row, reclassified as 'Multiple'.
df.loc['GlyGly@K', :] = df.loc['GG@K']
df.loc['GlyGly@K', 'classification'] = 'Multiple'

# Expose the row label as the first column.
df['mod_name'] = df.index.values
df = df[['mod_name', *(col for col in df.columns if col != 'mod_name')]]

# NOTE(review): presumably re-cast because the row assignment above can
# change the column dtype; confirm before removing.
df['unimod_id'] = df['unimod_id'].astype(int)
df

Unnamed: 0,mod_name,unimod_mass,unimod_avge_mass,composition,unimod_modloss,modloss_composition,classification,unimod_id,modloss_importance
Acetyl@T,Acetyl@T,42.010565,42.0367,H(2)C(2)O(1),0.0,,Post-translational,1,0.0
Acetyl@Protein_N-term,Acetyl@Protein_N-term,42.010565,42.0367,H(2)C(2)O(1),0.0,,Post-translational,1,0.0
Acetyl@S,Acetyl@S,42.010565,42.0367,H(2)C(2)O(1),0.0,,Post-translational,1,0.0
Acetyl@C,Acetyl@C,42.010565,42.0367,H(2)C(2)O(1),0.0,,Post-translational,1,0.0
Acetyl@Any_N-term,Acetyl@Any_N-term,42.010565,42.0367,H(2)C(2)O(1),0.0,,Multiple,1,0.0
...,...,...,...,...,...,...,...,...,...
TMTpro_zero@K,TMTpro_zero@K,295.189592,295.3773,H(25)C(15)N(3)O(3),0.0,,Chemical derivative,2017,0.0
TMTpro_zero@T,TMTpro_zero@T,295.189592,295.3773,H(25)C(15)N(3)O(3),0.0,,Chemical derivative,2017,0.0
Andro-H2O@C,Andro-H2O@C,332.198760,332.4339,H(28)C(20)O(4),0.0,,Chemical derivative,2025,0.0
His+O(2)@H,His+O(2)@H,169.048741,169.1381,H(7)C(6)N(3)O(3),0.0,,Post-translational,2027,0.0


In [15]:
# Write the table as a tab-separated file with a header row. The index is
# dropped; the 'mod_name' column already carries the same labels.
df.to_csv(
    '../alphabase/constants/const_files/modification.tsv',
    sep='\t',
    header=True,
    index=False,
)