
Commit

Merge branch 'master' of ssh://github.com/KarrLab/datanator
jonrkarr committed Apr 27, 2020
2 parents 1f525da + 7651f42 commit 47f835d
Showing 3 changed files with 158 additions and 68 deletions.
167 changes: 99 additions & 68 deletions datanator/data_source/brenda/core.py
@@ -15,14 +15,18 @@

import ete3
import pickle
import os.path
import json
from pathlib import Path
from datanator_query_python.util import mongo_util
import datanator.config.core
import re
import warnings


class Brenda(object):
RAW_FILENAME = os.path.join(os.path.dirname(__file__), 'brenda_download.txt')
PROCESSED_FILENAME = os.path.join(os.path.dirname(__file__), 'brenda.pkl')
class Brenda(mongo_util.MongoUtil):
RAW_FILENAME = str(Path('~/karr_lab/datanator/docs/brenda/brenda_download.txt').expanduser())
PROCESSED_FILENAME = str(Path('~/karr_lab/datanator/docs/brenda/brenda.pkl').expanduser())
MAX_ENTRIES = float('inf')
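
The file-location constants changed from module-relative paths to a hard-coded, home-relative layout. A minimal sketch of what Path.expanduser() contributes here, using the same path as the constant above:

from pathlib import Path

# expanduser() replaces a leading '~' with the user's home directory, so the
# constant resolves to an absolute path regardless of the working directory.
raw = Path('~/karr_lab/datanator/docs/brenda/brenda_download.txt').expanduser()
print(str(raw))  # e.g. /home/alice/karr_lab/datanator/docs/brenda/brenda_download.txt

Note the trade-off: the old os.path.dirname(__file__) form located the data next to the module, while the new form assumes a specific checkout layout under the user's home directory.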

LINE_CODES = {
'AC': 'ACTIVATING_COMPOUND',
@@ -71,12 +75,17 @@ class Brenda(object):
'TS': 'TEMPERATURE_STABILITY'
}

def __init__(self):
def __init__(self, MongoDB=None, db=None, username=None, password=None,
collection_str=None, authSource='admin', readPreference='nearest'):
super().__init__(MongoDB=MongoDB, db=db, username=username, password=password,
authSource=authSource, readPreference=readPreference)
self._ncbi_taxa = ete3.NCBITaxa()
self.collection = self.db_obj[collection_str]

def run(self, raw_filename=None, processed_filename=None):
def run(self, raw_filename=None, processed_filename=None, max_entries=None):
raw_filename = raw_filename or self.RAW_FILENAME
processed_filename = processed_filename or self.PROCESSED_FILENAME
processed_filename = str(Path(processed_filename).expanduser()) or self.PROCESSED_FILENAME
max_entries = max_entries or self.MAX_ENTRIES

ec_data = None
ec_code = None
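
Brenda is now a MongoUtil subclass, so the constructor wires up a database connection and binds self.collection to the collection named by collection_str. A rough sketch of the wrapper pattern this assumes; the real MongoUtil lives in datanator_query_python and may differ in detail:

from pymongo import MongoClient

class MongoUtilSketch:
    """Hypothetical stand-in for datanator_query_python's MongoUtil."""
    def __init__(self, MongoDB=None, db=None, username=None, password=None,
                 authSource='admin', readPreference='nearest'):
        client = MongoClient(host=MongoDB, username=username, password=password,
                             authSource=authSource, readPreference=readPreference)
        # db_obj is a database handle; subscripting it by name yields a
        # pymongo collection, as in self.db_obj[collection_str] above.
        self.db_obj = client[db]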
@@ -142,20 +151,20 @@ def run(self, raw_filename=None, processed_filename=None):

# link refs
for ec_data in data.values():
for enz in ec_data['enzymes'].values():
enz['refs'] = [ec_data['refs'][ref_id] for ref_id in enz['ref_ids']]
# for enz in ec_data['enzymes'].values():
# enz['refs'] = [ec_data['refs'][ref_id] for ref_id in enz['ref_ids']]

for tissue in enz['tissues']:
tissue['refs'] = [ec_data['refs'][ref_id] for ref_id in tissue['ref_ids']]
# for tissue in enz['tissues']:
# tissue['refs'] = [ec_data['refs'][ref_id] for ref_id in tissue['ref_ids']]

for loc in enz['subcellular_localizations']:
loc['refs'] = [ec_data['refs'][ref_id] for ref_id in loc['ref_ids']]
# for loc in enz['subcellular_localizations']:
# loc['refs'] = [ec_data['refs'][ref_id] for ref_id in loc['ref_ids']]

for rxn in ec_data['natural_reactions']:
rxn['refs'] = [ec_data['refs'][ref_id] for ref_id in rxn['ref_ids']]
# for rxn in ec_data['natural_reactions']:
# rxn['refs'] = [ec_data['refs'][ref_id] for ref_id in rxn['ref_ids']]

for rxn in ec_data['reactions']:
rxn['refs'] = [ec_data['refs'][ref_id] for ref_id in rxn['ref_ids']]
# for rxn in ec_data['reactions']:
# rxn['refs'] = [ec_data['refs'][ref_id] for ref_id in rxn['ref_ids']]

for k_cat in ec_data['k_cats']:
k_cat['refs'] = [ec_data['refs'][ref_id] for ref_id in k_cat['ref_ids']]
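
Reference linking is now commented out for enzymes, tissues, subcellular localizations, and reactions, but still runs for k_cats and k_ms: each measurement's ref_ids are replaced by the full reference records. A toy illustration with invented values:

ec_data = {
    'refs': {'1': {'pubmed': '111'}, '2': {'pubmed': '222'}},  # invented refs
    'k_cats': [{'value': '5.0', 'ref_ids': ['1', '2']}],
}
for k_cat in ec_data['k_cats']:
    k_cat['refs'] = [ec_data['refs'][ref_id] for ref_id in k_cat['ref_ids']]
# k_cat['refs'] is now [{'pubmed': '111'}, {'pubmed': '222'}]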
@@ -164,22 +173,26 @@ def run(self, raw_filename=None, processed_filename=None):
k_m['refs'] = [ec_data['refs'][ref_id] for ref_id in k_m['ref_ids']]

# remove information no longer needed because refs have been deserialized
for ec_data in data.values():
for i, (_key, ec_data) in enumerate(data.items()):
if i == max_entries:
break
if i % 10 == 0:
print('Processing EC {}, the {}th of all ECs'.format(_key, i))
for enz in ec_data['enzymes'].values():
enz.pop('id')
enz.pop('ref_ids')
# enz.pop('ref_ids')

for tissue in enz['tissues']:
tissue.pop('ref_ids')
# for tissue in enz['tissues']:
# tissue.pop('ref_ids')

for loc in enz['subcellular_localizations']:
loc.pop('ref_ids')
# for loc in enz['subcellular_localizations']:
# loc.pop('ref_ids')

for rxn in ec_data['natural_reactions']:
rxn.pop('ref_ids')
# for rxn in ec_data['natural_reactions']:
# rxn.pop('ref_ids')

for rxn in ec_data['reactions']:
rxn.pop('ref_ids')
# for rxn in ec_data['reactions']:
# rxn.pop('ref_ids')

for k_cat in ec_data['k_cats']:
k_cat.pop('ref_ids')
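
The cleanup pass is now bounded and instrumented: it enumerates the parsed ECs, stops once max_entries records have been handled, and prints progress every ten records. The same pattern in isolation, with stand-in data:

data = {'1.1.1.{}'.format(n): {} for n in range(25)}  # stand-in for parsed ECs
max_entries = 20
for i, (_key, ec_data) in enumerate(data.items()):
    if i == max_entries:
        break  # honor the cap before touching further records
    if i % 10 == 0:
        print('Processing EC {}, the {}th of all ECs'.format(_key, i))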
@@ -193,12 +206,14 @@ def run(self, raw_filename=None, processed_filename=None):
ec_data.pop('enzymes')
ec_data.pop('refs')

# save to pickle and JSON files
with open(processed_filename, 'wb') as file:
pickle.dump(data, file)

# save to MongoDB
self.collection.update_one({'ec_number': _key},
{'$addToSet': {'ec_synonyms': {'$each': [ec_data['name'], ec_data['systematic_name']]}},
'$set': {'k_ms': ec_data['k_ms'],
'k_cats': ec_data['k_cats'],
'comments': ec_data['comments']}}, upsert=True)
# return extracted data
return data
return 'done!!'
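
run() no longer returns the parsed dictionary; after pickling, it upserts each EC record into MongoDB (the call uses the loop variables _key and ec_data, so it appears to execute once per EC inside the loop above). A sketch of what one such update does, with invented field values rather than real BRENDA data:

filter_ = {'ec_number': '1.1.1.1'}
update = {
    '$addToSet': {'ec_synonyms': {'$each': ['alcohol dehydrogenase',
                                            'aldehyde reductase']}},
    '$set': {'k_ms': [], 'k_cats': [], 'comments': []},
}
# collection.update_one(filter_, update, upsert=True)
# $addToSet with $each appends each synonym only if it is not already in the
# array; upsert=True creates the document when no record matches the filter.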

def parse_ec_code(self, data, val):
match = re.match(r'^([0-9\.]+)([ \n]\((.*?)\))?$', val, re.DOTALL)
@@ -209,8 +224,8 @@ def parse_ec_code(self, data, val):
ec_data = data[ec_code] = {
'name': None,
'systematic_name': None,
'natural_reactions': [],
'reactions': [],
# 'natural_reactions': [],
# 'reactions': [],
'enzymes': {},
'k_cats': [],
'k_ms': [],
@@ -249,9 +264,9 @@ def parse_content(self, ec_code, ec_data, type, val):
'taxon': {'name': taxon_name, 'id': taxon_id} if taxon_name else None,
'tissues': [],
'subcellular_localizations': [],
'comments': comments,
'ref_ids': ref_ids,
'refs': None,
# 'comments': comments,
# 'ref_ids': ref_ids,
# 'refs': None,
}
else:
if xid:
@@ -264,8 +279,8 @@ def parse_content(self, ec_code, ec_data, type, val):
if not ec_data['enzymes'][id]['taxon'] and taxon_name:
ec_data['enzymes'][id]['taxon'] = {'name': taxon_name, 'id': taxon_id}

ec_data['enzymes'][id]['comments'] += comments
ec_data['enzymes'][id]['ref_ids'] = sorted(set(ec_data['enzymes'][id]['ref_ids'] + ref_ids))
# ec_data['enzymes'][id]['comments'] += comments
# ec_data['enzymes'][id]['ref_ids'] = sorted(set(ec_data['enzymes'][id]['ref_ids'] + ref_ids))

elif type == 'RN':
ec_data['name'] = val.replace('\n', ' ').strip()
@@ -286,8 +301,8 @@ def parse_content(self, ec_code, ec_data, type, val):
enzyme['tissues'].append({
'name': tissue,
'comments': self.filter_comments(comments, enz_id),
'ref_ids': ref_ids,
'refs': None,
# 'ref_ids': ref_ids,
# 'refs': None,
})
else:
warnings.warn('{} does not have enzyme with id {}. Error due to {}'.format(ec_code, enz_id, val), UserWarning)
@@ -305,37 +320,37 @@ def parse_content(self, ec_code, ec_data, type, val):
ec_data['enzymes'][enz_id]['subcellular_localizations'].append({
'name': localization,
'comments': self.filter_comments(comments, enz_id),
'ref_ids': ref_ids,
'refs': None,
# 'ref_ids': ref_ids,
# 'refs': None,
})
else:
warnings.warn('{} does not have enzyme with id {}. Error due to {}'.format(ec_code, enz_id, val), UserWarning)

elif type == 'NSP':
match = re.match(r'^#(.*?)#[ \n](.*?)([ \n]\((.*?)\))?([ \n]\|.*?\|)?([ \n]\{(r|)\})?[ \n]<([0-9,\n]+)>$', val, re.DOTALL)
comments = self.parse_comments(match.group(4))
ref_ids = match.group(8).replace('\n', ',').strip().split(',')
ec_data['natural_reactions'].append({
'equation': match.group(2).replace('\n', ' ').strip(),
'reversible': match.group(7) == 'r',
'enz_ids': match.group(1).replace('\n', ',').strip().split(','),
'comments': comments,
'ref_ids': ref_ids,
'refs': None,
})

elif type == 'SP':
match = re.match(r'^#(.*?)#[ \n](.*?)([ \n]\((.*?)\))?([ \n]\|.*?\|)?([ \n]\{(r|)\})?[ \n]<([0-9,\n]+)>$', val, re.DOTALL)
comments = self.parse_comments(match.group(4))
ref_ids = match.group(8).replace('\n', ',').strip().split(',')
ec_data['reactions'].append({
'equation': match.group(2).replace('\n', ' ').strip(),
'reversible': match.group(7) == 'r',
'enz_ids': match.group(1).replace('\n', ',').strip().split(','),
'comments': comments,
'ref_ids': ref_ids,
'refs': None,
})
# elif type == 'NSP':
# match = re.match(r'^#(.*?)#[ \n](.*?)([ \n]\((.*?)\))?([ \n]\|.*?\|)?([ \n]\{(r|)\})?[ \n]<([0-9,\n]+)>$', val, re.DOTALL)
# comments = self.parse_comments(match.group(4))
# ref_ids = match.group(8).replace('\n', ',').strip().split(',')
# ec_data['natural_reactions'].append({
# 'equation': match.group(2).replace('\n', ' ').strip(),
# 'reversible': match.group(7) == 'r',
# 'enz_ids': match.group(1).replace('\n', ',').strip().split(','),
# 'comments': comments,
# # 'ref_ids': ref_ids,
# # 'refs': None,
# })

# elif type == 'SP':
# match = re.match(r'^#(.*?)#[ \n](.*?)([ \n]\((.*?)\))?([ \n]\|.*?\|)?([ \n]\{(r|)\})?[ \n]<([0-9,\n]+)>$', val, re.DOTALL)
# comments = self.parse_comments(match.group(4))
# ref_ids = match.group(8).replace('\n', ',').strip().split(',')
# ec_data['reactions'].append({
# 'equation': match.group(2).replace('\n', ' ').strip(),
# 'reversible': match.group(7) == 'r',
# 'enz_ids': match.group(1).replace('\n', ',').strip().split(','),
# 'comments': comments,
# # 'ref_ids': ref_ids,
# # 'refs': None,
# })
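
The now-disabled NSP/SP branches split a BRENDA substrate-product line into enzyme ids, equation, optional comment, reversibility flag, and reference ids with a single regex. A worked example on an invented, BRENDA-style line:

import re

val = '#1,2# ethanol + NAD+ = acetaldehyde + NADH (#1# in liver <1>) {r} <1,2>'
match = re.match(r'^#(.*?)#[ \n](.*?)([ \n]\((.*?)\))?([ \n]\|.*?\|)?'
                 r'([ \n]\{(r|)\})?[ \n]<([0-9,\n]+)>$', val, re.DOTALL)
print(match.group(1))  # '1,2' -> enzyme ids
print(match.group(2))  # 'ethanol + NAD+ = acetaldehyde + NADH'
print(match.group(4))  # '#1# in liver <1>' -> raw comment text
print(match.group(7))  # 'r' -> reaction is reversible
print(match.group(8))  # '1,2' -> reference ids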

elif type in ['TN', 'KM']:
match = re.match(
@@ -477,3 +492,19 @@ def filter_comments(self, comments, enz_id):
k_m_vals.update(set(k_m['value'] for k_m in ec_data['k_ms']))
comments.add(ec_data['comments'])
"""

def main():
db = 'datanator'
collection_str = 'ec'
username = datanator.config.core.get_config()[
'datanator']['mongodb']['user']
password = datanator.config.core.get_config(
)['datanator']['mongodb']['password']
server = datanator.config.core.get_config(
)['datanator']['mongodb']['server']
status = Brenda(MongoDB=server, username=username, password=password,
db=db, collection_str=collection_str).run(processed_filename='~/karr_lab/datanator/docs/brenda/brenda-1.json')
print(status)

if __name__ == '__main__':
main()
23 changes: 23 additions & 0 deletions datanator/data_source/brenda/kinetic_constants.py
@@ -0,0 +1,23 @@
import pymongo
from bson.binary import Binary
import pickle
from datanator_query_python.util import mongo_util
import datanator.config.core
from pathlib import Path


def main():
db = 'test'
collection_str = 'brenda_constants'
username = datanator.config.core.get_config()[
'datanator']['mongodb']['user']
password = datanator.config.core.get_config(
)['datanator']['mongodb']['password']
MongoDB = datanator.config.core.get_config(
)['datanator']['mongodb']['server']
manager = mongo_util.MongoUtil(MongoDB=MongoDB, db=db, username=username,
password=password, collection_str=collection_str)

with open(str(Path('~/karr_lab/datanator/docs/brenda/brenda.pkl').expanduser()), 'rb') as f:
data = pickle.load(f)
    # Assumes the intent was to store the pickled payload as a single binary
    # document; the committed line referenced undefined names (coll, thebytes).
    collection = manager.db_obj[collection_str]
    collection.insert_one({'bin-data': Binary(pickle.dumps(data))})
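
The script stores the entire parsed pickle as one binary field. Two caveats worth noting: the committed file defines main() but never invokes it, and a single MongoDB document is capped at 16 MB, so this approach only works while the payload stays below that limit. Reading the blob back might look like this sketch, reusing the names defined above:

doc = manager.db_obj[collection_str].find_one({'bin-data': {'$exists': True}})
restored = pickle.loads(doc['bin-data'])  # bson.Binary is a bytes subclass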
36 changes: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
import pandas as pd
from datanator.util import rna_halflife_util, file_util
import datetime
import datanator.config.core
import datetime
from pymongo.collation import Collation, CollationStrength
import tempfile
import shutil


class Concentration(rna_halflife_util.RnaHLUtil):

def __init__(self, cache_dir=None, server=None, src_db=None, protein_col=None,
authDB=None, readPreference=None, username=None, password=None,
verbose=None, max_entries=None, des_db=None, collection_str=None):
"""Init
Args:
cache_dir (:obj:`str`, optional): Cache directory for logs. Defaults to None.
server (:obj:`str`, optional): MongoDB server address. Defaults to None.
db (:obj:`str`, optional): Database where initial uniprot collection resides. Defaults to None.
collection_str (:obj:`str`, optional): name of collection. Defaults to None.
authDB (:obj:`str`, optional): MongoDB authentication database. Defaults to None.
readPreference (:obj:`str`, optional): MongoDB read preference. Defaults to None.
username (:obj:`str`, optional): MongoDB username. Defaults to None.
password (:obj:`str`, optional): MongoDB password. Defaults to None.
verbose (:obj:`bool`, optional): Wheter to display verbose messages. Defaults to None.
max_entries (:obj:`int`, optional): Number of records to be processed. Defaults to None.
uniprot_col_db (:obj:`int`, optional): Database to which new uniprot records will be inserted. Defaults to None.
"""
super().__init__(server=server, username=username, password=password, src_db=src_db,
des_db=des_db, protein_col=protein_col, rna_col=collection_str, authDB=authDB, readPreference=readPreference,
max_entries=max_entries, verbose=verbose, cache_dir=cache_dir)
self.collation = Collation('en', strength=CollationStrength.SECONDARY)
self.max_entries = max_entries
self.verbose = verbose
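
The Collation with CollationStrength.SECONDARY makes string comparisons case-insensitive while still distinguishing diacritics. A sketch of how it would typically be applied in a query; the collection and field names are hypothetical:

from pymongo.collation import Collation, CollationStrength

collation = Collation('en', strength=CollationStrength.SECONDARY)
# With this collation, 'ATP synthase' and 'atp synthase' match the same records:
# docs = protein_collection.find({'protein_name': 'atp synthase'},
#                                collation=collation)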
