ncbi id

KarrLab · Jan 29, 2020 · 6b70e56 · 6b70e56
1 parent 7061aaa
commit 6b70e56
Show file tree

Hide file tree

Showing 2 changed files with 32 additions and 3 deletions.
diff --git a/datanator/data_source/kegg_org_code.py b/datanator/data_source/kegg_org_code.py
@@ -5,6 +5,7 @@
 import requests
 from bs4 import BeautifulSoup
 from datanator_query_python.util import mongo_util
+from datanator_query_python.query import query_taxon_tree
 from pymongo.collation import Collation, CollationStrength
 import datanator.config.core
 
@@ -23,11 +24,14 @@ def __init__(self, MongoDB, db, cache_dirname=None, replicaSet=None, verbose=Fal
         self.db = db
         self.verbose = verbose
         self.max_entries = max_entries
-        self.collection_str = 'kegg_organism_code'
+        self.collection_str = 'kegg_organisms_code'
         r = requests.get(self.ENDPOINT_DOMAINS['root'])
         self.soups = BeautifulSoup(r.content, 'html.parser')
         self.client, self.db, self.collection = self.con_db(self.collection_str)
         self.collation = Collation(locale='en', strength=CollationStrength.SECONDARY)
+        self.taxon_manager = query_taxon_tree.QueryTaxonTree(collection_str='taxon_tree', 
+                verbose=verbose, max_entries=max_entries, username=username, MongoDB=MongoDB, 
+                password=password, db='datanator', authSource=authSource, readPreference=readPreference)
 
     def has_href_and_id(self, tag):
         return tag.has_attr('href') and tag.has_attr('id')
@@ -100,6 +104,27 @@ def bulk_load(self, bulk_size=100):
                 self.collection.insert_many(docs)            
             count += 1
 
+    def fill_ncbi_id(self):
+        """Fill collection with ncbi_taxonomy_id.
+        """
+        query = {}
+        docs = self.collection.find(query)
+        count = self.collection.count_documents(query)
+        for i, doc in enumerate(docs):
+            if i == self.max_entries:
+                break
+            if i % 50 == 0 and self.verbose:
+                print('Processing doc {} out of {}.'.format(i, count))
+            name = doc['org_name']
+            ids = self.taxon_manager.get_ids_by_name(name)
+            if len(ids) > 1:
+                self.collection.update_one({'org_name': name},
+                                            {'$set': {'ncbi_taxonomy_id': ids,
+                                                      'ambiguous': True}}, upsert=False)
+            else:
+                self.collection.update_one({'org_name': name},
+                                            {'$set': {'ncbi_taxonomy_id': ids,
+                                                      'ambiguous': False}}, upsert=False)
 
 def main():
     db = 'datanator'

diff --git a/tests/data_source/test_kegg_org_code.py b/tests/data_source/test_kegg_org_code.py
@@ -20,7 +20,7 @@ def setUpClass(cls):
     @classmethod
     def tearDownClass(cls):
         shutil.rmtree(cls.cache_dirname)
-        cls.src.db.drop_collection('kegg_organism_code')
+        # cls.src.db.drop_collection('kegg_organism_code')
 
     @unittest.skip('passed')
     def test_parse_ids(self):
@@ -53,5 +53,9 @@ def test_make_bulk(self):
         result = self.src.make_bulk(offset=6000)
         self.assertEqual(len(result), 100)
 
+    @unittest.skip('passed')
     def test_bulk_load(self):
-        self.src.bulk_load()
+        self.src.bulk_load()
+
+    def test_fill_ncbi_id(self):
+        self.src.fill_ncbi_id()