Skip to content

Commit

Permalink
ncbi id
Browse files Browse the repository at this point in the history
  • Loading branch information
lzy7071 committed Jan 29, 2020
1 parent 7061aaa commit 6b70e56
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 3 deletions.
27 changes: 26 additions & 1 deletion datanator/data_source/kegg_org_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import requests
from bs4 import BeautifulSoup
from datanator_query_python.util import mongo_util
from datanator_query_python.query import query_taxon_tree
from pymongo.collation import Collation, CollationStrength
import datanator.config.core

Expand All @@ -23,11 +24,14 @@ def __init__(self, MongoDB, db, cache_dirname=None, replicaSet=None, verbose=Fal
self.db = db
self.verbose = verbose
self.max_entries = max_entries
self.collection_str = 'kegg_organism_code'
self.collection_str = 'kegg_organisms_code'
r = requests.get(self.ENDPOINT_DOMAINS['root'])
self.soups = BeautifulSoup(r.content, 'html.parser')
self.client, self.db, self.collection = self.con_db(self.collection_str)
self.collation = Collation(locale='en', strength=CollationStrength.SECONDARY)
self.taxon_manager = query_taxon_tree.QueryTaxonTree(collection_str='taxon_tree',
verbose=verbose, max_entries=max_entries, username=username, MongoDB=MongoDB,
password=password, db='datanator', authSource=authSource, readPreference=readPreference)

def has_href_and_id(self, tag):
    """Tell whether *tag* carries both an ``href`` and an ``id`` attribute.

    Args:
        tag: object exposing ``has_attr(name)``; presumably a
            BeautifulSoup tag given the file's bs4 import -- confirm
            against callers.

    Return:
        The result of ``tag.has_attr('href')`` when that is falsy,
        otherwise the result of ``tag.has_attr('id')``.
    """
    href_present = tag.has_attr('href')
    if not href_present:
        # 'id' is deliberately not queried when 'href' is missing.
        return href_present
    return tag.has_attr('id')
Expand Down Expand Up @@ -100,6 +104,27 @@ def bulk_load(self, bulk_size=100):
self.collection.insert_many(docs)
count += 1

def fill_ncbi_id(self):
    """Fill collection with ncbi_taxonomy_id.

    Iterates over every document returned by ``self.collection.find({})``,
    stopping once ``self.max_entries`` documents have been processed.
    For each one, ``doc['org_name']`` is looked up through
    ``self.taxon_manager.get_ids_by_name`` and the result is stored in
    the document's ``ncbi_taxonomy_id`` field. ``ambiguous`` is set to
    True when more than one ID came back and to False otherwise.

    Each update is addressed by the document's own ``_id`` so that
    documents sharing the same ``org_name`` are all annotated, instead
    of the first match being rewritten repeatedly.
    """
    query = {}
    docs = self.collection.find(query)
    # The total is only used in the progress message, so skip the extra
    # database round trip when not running verbosely.
    count = self.collection.count_documents(query) if self.verbose else None
    for i, doc in enumerate(docs):
        if i == self.max_entries:
            break
        if self.verbose and i % 50 == 0:
            print('Processing doc {} out of {}.'.format(i, count))
        ids = self.taxon_manager.get_ids_by_name(doc['org_name'])
        self.collection.update_one(
            {'_id': doc['_id']},
            {'$set': {'ncbi_taxonomy_id': ids,
                      'ambiguous': len(ids) > 1}},
            upsert=False)

def main():
db = 'datanator'
Expand Down
8 changes: 6 additions & 2 deletions tests/data_source/test_kegg_org_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def setUpClass(cls):
@classmethod
def tearDownClass(cls):
shutil.rmtree(cls.cache_dirname)
cls.src.db.drop_collection('kegg_organism_code')
# cls.src.db.drop_collection('kegg_organism_code')

@unittest.skip('passed')
def test_parse_ids(self):
Expand Down Expand Up @@ -53,5 +53,9 @@ def test_make_bulk(self):
result = self.src.make_bulk(offset=6000)
self.assertEqual(len(result), 100)

@unittest.skip('passed')
def test_bulk_load(self):
self.src.bulk_load()
self.src.bulk_load()

def test_fill_ncbi_id(self):
    """Smoke test: run ``fill_ncbi_id`` on the shared ``self.src`` source.

    No assertions are made on the outcome; the test passes as long as
    the call does not raise.
    """
    self.src.fill_ncbi_id()

0 comments on commit 6b70e56

Please sign in to comment.