Skip to content

Commit

Permalink
adding code to pull RNA modifications from BRENDA
Browse files Browse the repository at this point in the history
  • Loading branch information
jonrkarr committed Apr 23, 2020
1 parent fe25870 commit 648dbbf
Showing 1 changed file with 267 additions and 0 deletions.
267 changes: 267 additions & 0 deletions datanator/data_source/modomics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
""" Use BpForms to gather rRNA and tRNA modification information from
`MODOMICS <https://iimcb.genesilico.pl/modomics/>`.
:Author: Jonathan Karr <karr@mssm.edu>
:Date: 2020-04-23
:Copyright: 2019, Karr Lab
:License: MIT
"""

import bpforms
import bs4
import csv
import os
import requests
import requests_cache


class Modomics(object):
""" Use BpForms to gather rRNA and tRNA modification information from
`MODOMICS <https://iimcb.genesilico.pl/modomics/>`.
"""
ENDPOINT = 'https://iimcb.genesilico.pl/modomics/sequences/'

def run(self):
# create dict of MODOMICS single character monomer codes
modomics_short_code_to_monomer = {}
for monomer in bpforms.rna_alphabet.monomers.values():
for identifier in monomer.identifiers:
if identifier.ns == 'modomics.short_name':
modomics_short_code_to_monomer[identifier.id] = monomer
modomics_short_code_to_monomer['a'] = bpforms.rna_alphabet.monomers.get('A')
modomics_short_code_to_monomer['c'] = bpforms.rna_alphabet.monomers.get('C')
modomics_short_code_to_monomer['g'] = bpforms.rna_alphabet.monomers.get('G')
modomics_short_code_to_monomer['u'] = bpforms.rna_alphabet.monomers.get('U')
modomics_short_code_to_monomer['b'] = bpforms.rna_alphabet.monomers.get('0522U')
modomics_short_code_to_monomer['B'] = bpforms.rna_alphabet.monomers.get('0C')
modomics_short_code_to_monomer['E'] = bpforms.rna_alphabet.monomers.get('662A')
modomics_short_code_to_monomer['h'] = bpforms.rna_alphabet.monomers.get('21511U')
# modomics_short_code_to_monomer['H'] = bpforms.rna_alphabet.monomers.get('0C')
modomics_short_code_to_monomer['J'] = bpforms.rna_alphabet.monomers.get('0U')
modomics_short_code_to_monomer['l'] = bpforms.rna_alphabet.monomers.get('253U')
modomics_short_code_to_monomer['L'] = bpforms.rna_alphabet.monomers.get('2G')
modomics_short_code_to_monomer['K'] = bpforms.rna_alphabet.monomers.get('1G')
modomics_short_code_to_monomer['M'] = bpforms.rna_alphabet.monomers.get('42C')
# modomics_short_code_to_monomer['N'] = bpforms.rna_alphabet.monomers.get('?U')
modomics_short_code_to_monomer['P'] = bpforms.rna_alphabet.monomers.get('9U')
modomics_short_code_to_monomer['R'] = bpforms.rna_alphabet.monomers.get('22G')
modomics_short_code_to_monomer['T'] = bpforms.rna_alphabet.monomers.get('5U')
modomics_short_code_to_monomer['Z'] = bpforms.rna_alphabet.monomers.get('09U')
modomics_short_code_to_monomer['7'] = bpforms.rna_alphabet.monomers.get('7G')
modomics_short_code_to_monomer['#'] = bpforms.rna_alphabet.monomers.get('0G')
modomics_short_code_to_monomer[':'] = bpforms.rna_alphabet.monomers.get('0A')
modomics_short_code_to_monomer['='] = bpforms.rna_alphabet.monomers.get('6A')
modomics_short_code_to_monomer['?'] = bpforms.rna_alphabet.monomers.get('5C')
modomics_short_code_to_monomer['λ'] = bpforms.rna_alphabet.monomers.get('04C')
modomics_short_code_to_monomer['"'] = bpforms.rna_alphabet.monomers.get('1A')
modomics_short_code_to_monomer["'"] = bpforms.rna_alphabet.monomers.get('3C')
modomics_short_code_to_monomer[','] = bpforms.rna_alphabet.monomers.get('522U')
modomics_short_code_to_monomer['\\'] = bpforms.rna_alphabet.monomers.get('05U')
modomics_short_code_to_monomer['ℑ'] = bpforms.rna_alphabet.monomers.get('00G')
modomics_short_code_to_monomer[']'] = bpforms.rna_alphabet.monomers.get('19U')
modomics_short_code_to_monomer['ˆ'] = bpforms.rna_alphabet.monomers.get('00A')
modomics_short_code_to_monomer['gluQtRNA'] = bpforms.rna_alphabet.monomers.get('105G')
modomics_short_code_to_monomer['m22G'] = bpforms.rna_alphabet.monomers.get('22G')
# modomics_short_code_to_monomer[';'] = bpforms.rna_alphabet.monomers.get('?G')
# modomics_short_code_to_monomer['<'] = bpforms.rna_alphabet.monomers.get('?C')

# output directory
out_dir = os.path.dirname(__file__)

# create cache for web queries
cache_name = os.path.join(out_dir, 'modomics')
session = requests_cache.core.CachedSession(cache_name, backend='sqlite', expire_after=None)
session.mount(self.ENDPOINT, requests.adapters.HTTPAdapter(max_retries=5))

# parse rRNA and tRNA data
monomer_codes = {}
rrna_forms = self._run_rrna(session, modomics_short_code_to_monomer, monomer_codes,
os.path.join(out_dir, 'modomics.rrna.tsv'))
trna_forms = self._run_trna(
session, modomics_short_code_to_monomer, monomer_codes, os.path.join(out_dir, 'modomics.trna.tsv'))

# return results
return rrna_forms, trna_forms

def _run_rrna(self, session, modomics_short_code_to_monomer, monomer_codes, out_filename):
response = session.get(self.ENDPOINT, params={
'RNA_type': 'rRNA',
'RNA_subtype': 'all',
'organism': 'all species',
'vis_type': 'Modomics symbols',
})

response.raise_for_status()

doc = bs4.BeautifulSoup(response.text, 'lxml')
table = doc.find('table', {'id': 'tseq'})
tbody = table.find('tbody')
rows = tbody.find_all('tr')
rna_forms = []
for row in rows:
if not isinstance(row, bs4.element.Tag):
continue

cells = row.find_all('td')

rna_form = bpforms.RnaForm()
unsupported_codes = set()
for child in cells[5].children:
if child.name is None or child.name == 'span':
if child.name is None:
text = str(child)
else:
text = child.text

for code in text.strip().replace('-', '').replace('_', ''):
monomer = modomics_short_code_to_monomer.get(code, None)
if monomer is None:
unsupported_codes.add(code)
monomer = bpforms.Monomer(id=code)
else:
monomer_codes[code] = monomer
rna_form.seq.append(monomer)
elif child.name == 'a':
code = child.get('href').replace('/modomics/modifications/', '')
monomer = modomics_short_code_to_monomer.get(code, None)
if monomer is None:
unsupported_codes.add(code)
monomer = bpforms.Monomer(id=code)
else:
monomer_codes[code] = monomer
rna_form.seq.append(monomer)
else:
raise Exception('Unsupported child {}'.format(child.name))

rna_forms.append({
'GenBank': cells[0].find('a').text.strip(),
'Organism': cells[3].text.strip(),
'Organellum': cells[4].text.strip(),
'Type': cells[2].text.strip(),
'Sequence (MODOMICS)': cells[5].text.strip().replace('-', '').replace('_', ''),
})
self._analyze_form(rna_form, unsupported_codes, rna_forms[-1])

# save results to tab-separated file
self._save_results(rna_forms, ['GenBank', 'Type'], out_filename)

return rna_forms

def _run_trna(self, session, modomics_short_code_to_monomer, monomer_codes, out_filename):
response = session.get(self.ENDPOINT, params={
'RNA_type': 'tRNA',
'RNA_subtype': 'all',
'organism': 'all species',
'vis_type': 'Modomics symbols',
})
response.raise_for_status()

doc = bs4.BeautifulSoup(response.text, 'lxml')
table = doc.find('table', {'id': 'tseq'})
tbody = table.find('tbody')
rows = tbody.find_all('tr')
rna_forms = []

for row in rows:
cells = row.find_all('td')

rna_form = bpforms.RnaForm()
unsupported_codes = set()
for child in cells[5].children:
if child.name is None or child.name == 'span':
if child.name is None:
text = str(child)
else:
text = child.text

for code in text.strip().replace('-', '').replace('_', ''):
monomer = modomics_short_code_to_monomer.get(code, None)
if monomer is None:
unsupported_codes.add(code)
monomer = bpforms.Monomer(id=code)
else:
monomer_codes[code] = monomer
rna_form.seq.append(monomer)
elif child.name == 'a':
code = child.get('href').replace('/modomics/modifications/', '')
monomer = modomics_short_code_to_monomer.get(code, None)
if monomer is None:
unsupported_codes.add(code)
monomer = bpforms.Monomer(id=code)
else:
monomer_codes[code] = monomer
rna_form.seq.append(monomer)
else:
raise Exception('Unsupported child {}'.format(child.name))

rna_forms.append({
'Amino acid type': cells[1].text.strip(),
'Anticodon': cells[2].text.strip(),
'Organism': cells[3].text.strip(),
'Organellum': cells[4].text.strip(),
'Sequence (MODOMICS)': cells[5].text.strip().replace('-', '').replace('_', ''),
})
self._analyze_form(rna_form, unsupported_codes, rna_forms[-1])

# save results to tab-separated file
self._save_results(rna_forms, ['Amino acid type', 'Anticodon'], out_filename)

return rna_forms

def _analyze_form(self, rna_form, unsupported_codes, results_dict):
results_dict['Sequence (BpForms)'] = str(rna_form)
results_dict['Sequence (IUPAC)'] = canonical_seq = rna_form.get_canonical_seq()
results_dict['Length'] = len(rna_form.seq)

results_dict['Number of modifications'] = len(rna_form.seq) \
- rna_form.seq.count(bpforms.rna_alphabet.monomers.A) \
- rna_form.seq.count(bpforms.rna_alphabet.monomers.C) \
- rna_form.seq.count(bpforms.rna_alphabet.monomers.G) \
- rna_form.seq.count(bpforms.rna_alphabet.monomers.U)
results_dict['Number of modified A'] = canonical_seq.count('A') - rna_form.seq.count(bpforms.rna_alphabet.monomers.A)
results_dict['Number of modified C'] = canonical_seq.count('C') - rna_form.seq.count(bpforms.rna_alphabet.monomers.C)
results_dict['Number of modified G'] = canonical_seq.count('G') - rna_form.seq.count(bpforms.rna_alphabet.monomers.G)
results_dict['Number of modified U'] = canonical_seq.count('U') - rna_form.seq.count(bpforms.rna_alphabet.monomers.U)

if unsupported_codes:
results_dict['BpForms errors'] = 'MODOMICS sequence uses monomeric forms {}'.format(
', '.join(unsupported_codes))
else:
results_dict['Formula'] = str(rna_form.get_formula())
results_dict['Molecular weight'] = rna_form.get_mol_wt()
results_dict['Charge'] = rna_form.get_charge()

canonical_form = bpforms.RnaForm().from_str(canonical_seq)
results_dict['Canonical formula'] = str(canonical_form.get_formula())
results_dict['Canonical molecular weight'] = canonical_form.get_mol_wt()
results_dict['Canonical charge'] = canonical_form.get_charge()

results_dict['Extra formula'] = str(rna_form.get_formula() - canonical_form.get_formula())
results_dict['Extra molecular weight'] = rna_form.get_mol_wt() - canonical_form.get_mol_wt()
results_dict['Extra charge'] = rna_form.get_charge() - canonical_form.get_charge()

results_dict['BpForms errors'] = ' '.join(rna_form.validate())

def _save_results(self, rna_forms, variable_field_names, out_filename):
with open(out_filename, 'w') as file:
writer = csv.DictWriter(file,
fieldnames=variable_field_names + [
'Organism', 'Organellum',
'Sequence (MODOMICS)',
'Sequence (BpForms)',
'Sequence (IUPAC)',
'Length',
'Number of modifications',
'Number of modified A',
'Number of modified C',
'Number of modified G',
'Number of modified U',
'Formula', 'Molecular weight', 'Charge',
'Canonical formula', 'Canonical molecular weight', 'Canonical charge',
'Extra formula', 'Extra molecular weight', 'Extra charge',
'BpForms errors'],
dialect='excel-tab')
writer.writeheader()

for rna_form in rna_forms:
writer.writerow(rna_form)

0 comments on commit 648dbbf

Please sign in to comment.