# Retrieve modifications from Unimod and update modification.tsv

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import tempfile
import os
import urllib.request

## Retrieve unimod data

In [None]:
# Source of the Unimod XML dump, and the namespace prefix (in ElementTree's
# "{uri}" form) that the later cells prepend to tag names when searching it.
url = "https://www.unimod.org/xml/unimod.xml"
xmlns = '{http://www.unimod.org/xmlns/schema/unimod_2}'

# download unimod.xml to temp directory
# NOTE(review): the directory created by mkdtemp() is not removed anywhere in
# the cells shown here; clean it up manually if that matters.
temp_dir = tempfile.mkdtemp()
temp_file = os.path.join(temp_dir, 'unimod.xml')
# Blocking network download; `temp_file` is parsed by a later cell.
urllib.request.urlretrieve(url, temp_file)

In [None]:
def get_composition(node):
    """Return the composition string built from *node*'s ``element`` children.

    Every direct child whose tag is ``element`` (qualified with the
    module-level ``xmlns`` prefix) contributes ``symbol(number)``, taken from
    its ``symbol`` and ``number`` attributes. The pieces are concatenated in
    document order without a separator. A node without such children yields
    an empty string.
    """
    return "".join(
        f"{child.attrib['symbol']}({child.attrib['number']})"
        for child in node.findall(f'{xmlns}element')
    )

def replace_modseq_with_whitespace(modseq):
    """Return *modseq* with every space character replaced by an underscore.

    Note that, despite the function name, whitespace is what gets removed:
    only the plain space ``" "`` is substituted, each occurrence by one
    ``"_"``; tabs, newlines and all other characters are left untouched.
    """
    # Splitting on an explicit single space keeps empty fields, so runs of
    # spaces turn into runs of underscores of the same length.
    return "_".join(modseq.split(" "))

# Parse the downloaded Unimod XML and collect one record per
# "<modification title>@<site>" key.
xml = ET.parse(temp_file)
root = xml.getroot()

# Maps the (underscore-normalised) "<title>@<site>" key to a dict holding the
# values that later become the columns of the modification table.
unimod = {}
for modifications in root.findall(f'{xmlns}modifications'):
    for mod in modifications.findall(f'{xmlns}mod'):
        modname = mod.attrib['title']
        # Not called `id`, so the builtin of that name is not shadowed.
        record_id = int(mod.attrib['record_id'])

        # Only the first <delta> of a modification is used. Look it up per
        # modification so that a missing <delta> can never silently reuse the
        # masses/composition left over from the previous modification.
        delta = mod.find(f'{xmlns}delta')
        if delta is None:
            print(f'Skipping {modname}: no delta element found')
            continue
        unimod_mass = float(delta.attrib['mono_mass'])
        unimod_avge_mass = float(delta.attrib['avge_mass'])
        composition = get_composition(delta)

        for specificity in mod.findall(f'{xmlns}specificity'):
            pos = specificity.attrib['position']
            site = specificity.attrib['site']
            _class = specificity.attrib['classification']

            # Terminal sites are named by their position (e.g. "Any N-term");
            # a residue restricted to a terminus becomes "<residue>^<position>".
            if site in ('N-term', 'C-term'):
                site = pos
            elif pos.startswith(('Any ', 'Protein ')):
                site = site + '^' + pos

            # Keep the first neutral loss whose mass is not zero; default to
            # "no loss" (mass 0, empty composition). Comparing numerically
            # also skips zero masses written as e.g. "0.0".
            ptm_nl = 0.0
            ptm_nl_composition = ""
            for nl in specificity.findall(f'{xmlns}NeutralLoss'):
                nl_mass = float(nl.attrib['mono_mass'])
                if nl_mass == 0:
                    continue
                ptm_nl = nl_mass
                ptm_nl_composition = get_composition(nl)
                break

            # Key order here defines the column order of the resulting table.
            record = {
                'unimod_mass': unimod_mass,
                'unimod_avge_mass': unimod_avge_mass,
                'composition': composition,
                'unimod_modloss': ptm_nl,
                'modloss_composition': ptm_nl_composition,
                'classification': _class,
                'unimod_id': record_id,
                'smiles': '',
            }

            mod_site = replace_modseq_with_whitespace(f'{modname}@{site}')
            unimod[mod_site] = record

            # Sites containing '~' are additionally registered under the bare
            # position, with an independent copy of the same values.
            if '~' in site:
                print(mod_site)
                pos_key = replace_modseq_with_whitespace(f'{modname}@{pos}')
                unimod[pos_key] = dict(record)

## Construct modification dataframe

In [None]:
# Build the modification table: one row per key of `unimod`, one column per
# record field. `from_dict` is a classmethod, so no throwaway instance is
# needed to call it.
df = pd.DataFrame.from_dict(unimod, orient='index')

# Create the column as float from the start: the values assigned below
# (0.5, 1e8, ...) are floats, and writing them into an integer column relies
# on implicit upcasting, which newer pandas versions warn about.
df['modloss_importance'] = 0.0
# Every modification that has a neutral-loss composition gets 0.5 ...
df.loc[df['modloss_composition'] != '', 'modloss_importance'] = 0.5
# ... and a few are overridden with hard-coded, much larger values.
df.loc['Phospho@S', 'modloss_importance'] = 1e8
df.loc['Phospho@T', 'modloss_importance'] = 1e7
df.loc['GG@K', 'modloss_importance'] = 1e6

# 'GlyGly@K' is stored as a copy of the 'GG@K' row, except for its
# classification.
df.loc['GlyGly@K', :] = df.loc['GG@K']
df.loc['GlyGly@K', 'classification'] = 'Multiple'

# Expose the index as a regular column and move it to the front.
df['mod_name'] = df.index.values
df = df[['mod_name'] + [col for col in df.columns if col != 'mod_name']]
# Make sure the ids are stored as integers after the row assignment above.
df['unimod_id'] = df.unimod_id.astype(int)
df

## Inspect added modifications

In [None]:
from alphabase.constants.modification import MOD_DF

# Columns taken from MOD_DF to form the currently stored table.
stored_columns = [
    'mod_name',
    'unimod_mass',
    'unimod_avge_mass',
    'composition',
    'unimod_modloss',
    'modloss_composition',
    'classification',
    'unimod_id',
    'modloss_importance',
    'smiles',
]

# Add an empty 'smiles' column when MOD_DF lacks one, so that the column
# selection below does not fail. Note that this modifies MOD_DF in place.
if 'smiles' not in MOD_DF:
    MOD_DF['smiles'] = ''

mod_df = MOD_DF[stored_columns]
# Rows of the freshly built Unimod table whose index label is not present in
# the stored table's index.
new_modifications = df.loc[~df.index.isin(mod_df.index)]

In [None]:
# Append the Unimod entries that are missing from the stored table, then order
# the combined table by Unimod id and, within one id, by modification name.
new_mod_df = pd.concat([mod_df, new_modifications]).sort_values(
    by=['unimod_id', 'mod_name']
)

In [None]:
# Bare expression: shows the merged table as the notebook cell's output.
new_mod_df

## Save updated modification dataframe

In [None]:
# Write the merged table (stored rows plus the Unimod rows that were missing
# from it), not the raw Unimod table `df`: saving `df` would discard the merge
# computed above together with anything that exists only in the stored table.
# The index is not written; `mod_name` is already a regular column.
new_mod_df.to_csv(
    '../alphabase/constants/const_files/modification.tsv',
    index=False,
    sep='\t',
    header=True,
)