get ncbi id using api.datanator.info

KarrLab · Jan 30, 2020 · bd79319 · bd79319
1 parent 6cf9395
commit bd79319
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 5 deletions.
diff --git a/datanator/data_source/kegg_org_code.py b/datanator/data_source/kegg_org_code.py
@@ -61,6 +61,24 @@ def get_ncbi_id(self, name):
                 return int(id_list.group(1))
         else:
             return int(str(result).split(': ')[1])
+
+    def get_ncbi_id_rest(self, name, timeout=30):
+        """Get ncbi taxonomy id of an organism using
+        api.datanator.info
+
+        The organism name is sent as a URL-encoded query parameter, so
+        names containing spaces, parentheses, ``&`` or ``#`` reach the
+        text-search endpoint unchanged.
+
+        Args:
+            name (:obj:`str`): Name of the organism.
+            timeout (:obj:`float`, optional): Seconds to wait for the
+                server before giving up. Defaults to 30.
+
+        Return:
+            (:obj:`int`): NCBI Taxonomy ID of the first hit, or ``None``
+            when the response has no usable ``taxon_tree`` hit.
+
+        Raises:
+            requests.exceptions.RequestException: if the request fails
+                or times out.
+            ValueError: if the response body is not valid JSON.
+        """
+        endpoint = "https://api.datanator.info/ftx/text_search/num_of_index/"
+        # A list of pairs (rather than a dict) keeps both `fields` entries.
+        params = [
+            ('query_message', name),
+            ('index', 'taxon_tree'),
+            ('from_', 0),
+            ('size', 5),
+            ('fields', 'tax_name'),
+            ('fields', 'name_txt'),
+        ]
+        # Without a timeout a stalled connection would block the caller forever.
+        r = requests.get(endpoint, params=params, timeout=timeout)
+        data = json.loads(r.text)
+        # Treat a missing, null or empty `taxon_tree` the same way: no match.
+        hits = data.get('taxon_tree') if isinstance(data, dict) else None
+        if not hits:
+            return None
+        first_hit = hits[0]
+        if not isinstance(first_hit, dict):
+            return None
+        return first_hit.get('tax_id')
 
     def has_href_and_id(self, tag):
         return tag.has_attr('href') and tag.has_attr('id')
@@ -110,7 +128,7 @@ def make_bulk(self, offset=0, bulk_size=100):
             if i < offset:
                 continue
             if count < bulk_size:
-                ncbi_id = self.get_ncbi_id(name.split(' (')[0])   
+                ncbi_id = self.get_ncbi_id_rest(name)   
                 result.append({"kegg_organism_id": _id, "org_name": name,
                                 'ncbi_taxonomy_id': ncbi_id})
                 count += 1
@@ -173,7 +191,7 @@ def bulk_load(self, bulk_size=100):
             bulk_size(:obj:`int`): number of entries per insertion. Defaults to 100.
         """
         length = bulk_size
-        count = 0
+        count = 16
         while length != 0:
             if count == self.max_entries:
                 break

diff --git a/tests/data_source/test_kegg_org_code.py b/tests/data_source/test_kegg_org_code.py
@@ -20,7 +20,7 @@ def setUpClass(cls):
     @classmethod
     def tearDownClass(cls):
         shutil.rmtree(cls.cache_dirname)
-        cls.src.db.drop_collection(self.src.collection_str)
+        cls.src.db.drop_collection(cls.src.collection_str)
 
     @unittest.skip('passed')
     def test_parse_ids(self):
@@ -94,11 +94,16 @@ def test_matching_species_name_id(self):
             result = {name: _id}
         self.assertEqual(result, {'Candidatus Bathyarchaeota archaeon BA2': 1700836})
 
-    # @unittest.skip('passed')
+    @unittest.skip('passed')
     def test_get_ncbi_id(self):
         name = 'Ornithobacterium rhinotracheale ORT-UMN 88'
         self.assertEqual(self.src.get_ncbi_id(name), 1401325)
         name = 'latoieruwerwe'
         self.assertEqual(self.src.get_ncbi_id(name), None)
         name = 'Pan troglodytes'
-        self.assertEqual(self.src.get_ncbi_id(name), 9598)
+        self.assertEqual(self.src.get_ncbi_id(name), 9598)
+
+    @unittest.skip('passed')
+    def test_get_ncbi_id_rest(self):
+        """Check the REST lookup for a name with a parenthesised suffix.
+
+        Expects ``get_ncbi_id_rest`` to return 9606 for the label below.
+        The decorator above skips this test unconditionally.
+        """
+        organism_label = "homo sapiens (human)"
+        taxon_id = self.src.get_ncbi_id_rest(organism_label)
+        self.assertEqual(taxon_id, 9606)