KarrLab · Oct 9, 2020
diff --git a/‎.circleci/requirements.txt
-1 b/‎.circleci/requirements.txt
-1
diff --git a/‎datanator_query_python/__main__.py
+39-1 b/‎datanator_query_python/__main__.py
+39-1
diff --git a/‎datanator_query_python/config/query_manager.py
+4-4 b/‎datanator_query_python/config/query_manager.py
+4-4
diff --git a/‎datanator_query_python/config/query_schema_2_manager.py
+2 b/‎datanator_query_python/config/query_schema_2_manager.py
+2
diff --git a/‎datanator_query_python/query/full_text_search.py
+27 b/‎datanator_query_python/query/full_text_search.py
+27
diff --git a/‎datanator_query_python/query/query_kegg_orthology.py
+24 b/‎datanator_query_python/query/query_kegg_orthology.py
+24
diff --git a/‎datanator_query_python/query/query_protein.py
+115 b/‎datanator_query_python/query/query_protein.py
+115
diff --git a/‎datanator_query_python/query/query_rna_halflife.py
+21 b/‎datanator_query_python/query/query_rna_halflife.py
+21
diff --git a/‎datanator_query_python/query/query_sabiork_old.py
+74 b/‎datanator_query_python/query/query_sabiork_old.py
+74
diff --git a/‎datanator_query_python/query/query_taxon_tree.py
+38-4 b/‎datanator_query_python/query/query_taxon_tree.py
+38-4
diff --git a/‎datanator_query_python/query_schema_2/ftx_search.py
+62 b/‎datanator_query_python/query_schema_2/ftx_search.py
+62
diff --git a/‎datanator_query_python/query_schema_2/query_entity.py
+41 b/‎datanator_query_python/query_schema_2/query_entity.py
+41
diff --git a/‎datanator_query_python/query_schema_2/query_observation.py
+52 b/‎datanator_query_python/query_schema_2/query_observation.py
+52
diff --git a/‎setup.py
+4 b/‎setup.py
+4
diff --git a/‎tests/config/test_query_schema_2_manager.py
+18 b/‎tests/config/test_query_schema_2_manager.py
+18
diff --git a/‎tests/query/test_full_text_query.py
+9 b/‎tests/query/test_full_text_query.py
+9
diff --git a/‎tests/query/test_query_kegg_orthology.py
+7-1 b/‎tests/query/test_query_kegg_orthology.py
+7-1
diff --git a/‎tests/query/test_query_protein.py
+13-4 b/‎tests/query/test_query_protein.py
+13-4
diff --git a/‎tests/query/test_query_sabiork_old.py
+9-1 b/‎tests/query/test_query_sabiork_old.py
+9-1
diff --git a/‎tests/query/test_query_taxon_tree.py
+20-4 b/‎tests/query/test_query_taxon_tree.py
+20-4
diff --git a/‎tests/query_schema_2/test_ftx_search.py
+18 b/‎tests/query_schema_2/test_ftx_search.py
+18
diff --git a/‎tests/query_schema_2/test_query_entity.py
+22 b/‎tests/query_schema_2/test_query_entity.py
+22
diff --git a/‎tests/query_schema_2/test_query_observation.py
+27 b/‎tests/query_schema_2/test_query_observation.py
+27
diff --git a/‎tests/query_schema_2/test_query_taxon_tree_v2.py
+50 b/‎tests/query_schema_2/test_query_taxon_tree_v2.py
+50
diff --git a/‎tests/query_schema_2/test_query_uniprot_v2.py b/‎tests/query_schema_2/test_query_uniprot_v2.py
diff --git a/‎tests/test_main.py
+17 b/‎tests/test_main.py
+17
diff --git a/‎tests/util/test_motor_util.py
+25 b/‎tests/util/test_motor_util.py
+25
@@ -1,5 +1,4 @@
 # Karr Lab packages
 git+https://github.com/KarrLab/pkg_utils.git#egg=pkg_utils
 git+https://github.com/KarrLab/wc_utils.git#egg=wc_utils
-
 git+https://github.com/KarrLab/karr_lab_aws_manager.git#egg=karr_lab_aws_manager
@@ -9,7 +9,7 @@
 import cement
 from datanator_query_python.util import mongo_util
 from datanator_query_python.config import config
-import datanator_query_python
+import datanator_query_python.core
 
 
 class BaseController(cement.Controller):
@@ -65,13 +65,51 @@ def _default(self):
         print("done")
 
 
+class DefineSchema(cement.Controller):
+    """Command that hands a collection name and a jsonschema location to
+    ``MongoUtil.define_schema``. """
+
+    class Meta:
+        label = 'mongo-def-schema'
+        description = 'Define jsonschema of a collection'
+        stacked_on = 'base'
+        stacked_type = 'nested'
+        arguments = [
+            (['db'], dict(
+                type=str, help='Name of the database in which the collection resides.')),
+            (['collection'], dict(
+                type=str, help='Name of the collection to be defined.')),
+            (['jsonschema'], dict(
+                type=str, help='Location of jsonschema')),
+            (['--config_name', '-cn'], dict(
+                type=str, default='TestConfig',
+                help='Config class to be used.'))
+        ]
+
+    @cement.ex(hide=True)
+    def _default(self):
+        ''' Define the jsonschema of a MongoDB collection
+
+            Args (read from the parsed command line):
+                db (:obj:`str`): name of the database in which the collection resides
+                collection (:obj:`str`): name of the collection to be defined
+                jsonschema (:obj:`str`): location of the jsonschema
+                config_name (:obj:`str`): name of the class in ``config`` that supplies
+                    SERVER, USERNAME and PASSWORD (optional, defaults to 'TestConfig')
+        '''
+        args = self.app.pargs
+        # Connection settings come from the config class selected on the command line.
+        conf = getattr(config, args.config_name)
+        manager = mongo_util.MongoUtil(MongoDB=conf.SERVER,
+                                       db=args.db,
+                                       username=conf.USERNAME,
+                                       password=conf.PASSWORD)
+        manager.define_schema(args.collection, args.jsonschema)
+        print("done")
+
+
 class App(cement.App):
     """ Command line application """
     class Meta:
         label = 'datanator_query_python'
         base_controller = 'base'
+        # NOTE(review): Command3WithArgumentsController is neither defined nor imported
+        # in any hunk visible here -- confirm it exists in this module, otherwise
+        # evaluating this list raises NameError when the module is imported.
         handlers = [
             BaseController,
+            Command3WithArgumentsController,
             DefineSchema
         ]
 
 
@@ -14,9 +14,9 @@ def __init__(self):
         self.read_preference = config.AtlasConfig.READ_PREFERENCE
         self.repl = config.AtlasConfig.REPLSET
 
-    def protein_manager(self):
+    def protein_manager(self, database="datanator"):
+        """Build a QueryProtein object from the connection settings stored on this manager.
+
+        Args:
+            database (:obj:`str`, optional): value forwarded to QueryProtein as ``database``.
+
+        Return:
+            (:obj:`query_protein.QueryProtein`): the newly constructed query object.
+        """
         return query_protein.QueryProtein(username=self.username, password=self.password, server=self.server,
-        authSource=self.authDB, readPreference=self.read_preference, replicaSet=self.repl)
+        authSource=self.authDB, readPreference=self.read_preference, replicaSet=self.repl, database=database)
 
     def metabolite_concentration_manager(self):
         return query_metabolite_concentrations.QueryMetaboliteConcentrations(MongoDB=self.server, db='datanator',
@@ -71,10 +71,10 @@ def metabolites_meta_manager():
 
 class RnaManager:
 
-    def rna_manager(self):
+    def rna_manager(self, db="datanator"):
+        """Build a QueryRNA object for the 'rna_halflife_new' collection using AtlasConfig settings.
+
+        Args:
+            db (:obj:`str`, optional): value forwarded to QueryRNA as ``db``.
+
+        Return:
+            (:obj:`query_rna_halflife.QueryRNA`): the newly constructed query object.
+        """
         return query_rna_halflife.QueryRNA(username=config.AtlasConfig.USERNAME, password=config.AtlasConfig.PASSWORD,
         server=config.AtlasConfig.SERVER, authDB=config.AtlasConfig.AUTHDB, readPreference=config.AtlasConfig.READ_PREFERENCE,
-        db='datanator', collection_str='rna_halflife_new', replicaSet=config.AtlasConfig.REPLSET)
+        db=db, collection_str='rna_halflife_new', replicaSet=config.AtlasConfig.REPLSET)
 
 
 class KeggManager:
 
@@ -1,6 +1,7 @@
 from datanator_query_python.config import config
 from datanator_query_python.util import mongo_util
 from pymongo import ReadPreference
+from pymongo.collation import Collation, CollationStrength
 
 
 class QM(mongo_util.MongoUtil):
@@ -17,6 +18,7 @@ def __init__(self,
                         username=username, password=password,
                         authSource=authSource, readPreference=readPreference)
         self.read_preference = self._convert_read_p(readPreference)
+        self.collation = Collation(locale='en', strength=CollationStrength.SECONDARY)
 
     def _convert_read_p(self, read_preference):
         """Convert string read preference to pymongo
 
@@ -196,7 +196,11 @@ def get_index_ko_count(self, q, num, agg_field="frontend_gene_aggregate", index=
                             "aggs": {
                                 "top_ko": {
                                     "top_hits": {'_source': {'includes': ['ko_number', 'ko_name', 'protein_name', 'definition', agg_field,
+<<<<<<< HEAD
                                                                           'species_name', "orthodb_name", "orthodb_id"]}, 'size': 1}
+=======
+                                                                          'species_name', "orthodb_name", "orthodb_id", "uniprot_id"]}, 'size': 1}
+>>>>>>> testapi
                                 },
                                 "top_hit" : {
                                     "max": {
@@ -341,7 +345,11 @@ def get_rxn_oi(self, query_message, minimum_should_match=0, from_=0,
                 result['sabio_rk'].append(hit['_source'])
             return result
 
+<<<<<<< HEAD
     def get_genes_orthodb_count(self, q, num, agg_field="orthodb_id", **kwargs):
+=======
+    def get_genes_orthodb_count(self, q, num, agg_field="orthodb_id.keyword", **kwargs):
+>>>>>>> testapi
         """Get protein index with different ko_number field for up to num hits,
         provided at least one of the proteins under orthodb_id has abundance info.
         
@@ -376,7 +384,11 @@ def get_genes_orthodb_count(self, q, num, agg_field="orthodb_id", **kwargs):
                             "aggs": {
                                 "top_ko": {
                                     "top_hits": {'_source': {'includes': ['orthodb_id', 'orthodb_name', 'protein_name', 'definition', agg_field,
+<<<<<<< HEAD
                                                                           'species_name']}, "size": 1}
+=======
+                                                                          'species_name', "uniprot_id"]}, "size": 1}
+>>>>>>> testapi
                                 },
                                 "top_hit" : {
                                     "max": {
@@ -412,6 +424,7 @@ def get_genes_orthodb_count(self, q, num, agg_field="orthodb_id", **kwargs):
         # for i, s in enumerate(r['aggregations']['top_kos']['buckets']):
         #     r['aggregations']['top_kos']['buckets'][i]['key'] = [s['key'][i:i+6] for i in range(0, len(s['key']), 6)]    
         for bucket_abundance in r['aggregations']['top_kos']['buckets']:
+<<<<<<< HEAD
             ko_abundance.add(bucket_abundance['top_ko']['hits']['hits'][0]['_source'][agg_field])
 
         for bucket_all in r_all['top_kos']['buckets']:
@@ -424,6 +437,20 @@ def get_genes_orthodb_count(self, q, num, agg_field="orthodb_id", **kwargs):
                 # s['top_ko']['hits']['hits'][0]['_source']['abundances'] = True
                 s['top_ko']['hits']['hits'][0]['_source'][agg_field] = [ko_str[i:i+6] for i in range(0, len(ko_str), 6)]
             else:
+=======
+            ko_abundance.add(bucket_abundance['top_ko']['hits']['hits'][0]['_source'].get(agg_field))
+            
+        for bucket_all in r_all['top_kos']['buckets']:
+            ko_all.add(bucket_all['top_ko']['hits']['hits'][0]['_source'].get(agg_field))
+        intersects = ko_abundance.intersection(ko_all)
+        for s in r['aggregations']['top_kos']['buckets']:
+            ko_str = s['top_ko']['hits']['hits'][0]['_source'].get(agg_field)   # ko_str can be "K01234,K12345"
+            # if ko_str in intersects and ko_str != 'nan':
+            if ko_str is None:
+            #     # s['top_ko']['hits']['hits'][0]['_source']['abundances'] = True
+            #     s['top_ko']['hits']['hits'][0]['_source'][agg_field] = [ko_str[i:i+6] for i in range(0, len(ko_str), 6)]
+            # else:
+>>>>>>> testapi
                 # s['top_ko']['hits']['hits'][0]['_source']['abundances'] = False
                 s['top_ko']['hits']['hits'][0]['_source'][agg_field] = ["N/A"]
         return r['aggregations']
@@ -14,6 +14,7 @@ def __init__(self, username=None, password=None, server=None, authSource='admin'
         self.max_entries = max_entries
         self.verbose = verbose
         self.client, self.db, self.collection = self.con_db('kegg_orthology')
+        self.ortho = self.db_obj["orthodb"]
         self.collation = Collation(locale='en', strength=CollationStrength.SECONDARY)
 
     def get_ko_by_name(self, name):
@@ -97,6 +98,29 @@ def get_meta_by_kegg_ids(self, kegg_ids, projection={'_id': 0, 'gene_ortholog':
         count = self.collection.count_documents(query, collation=self.collation)
         return docs, count
 
+    def get_meta_by_ortho_ids(self, orthodb_ids, projection={'_id': 0, 'gene_ortholog': 0},
+                              limit=0):
+        """Get meta given orthodb ids, sorted in the order the ids were supplied.
+        
+        Args:
+            orthodb_ids (:obj:`list` of :obj:`str`): List of orthodb ids.
+            projection (:obj:`dict`): MongoDB result projection. It is copied before
+                use, so the caller's dict is left untouched.
+            limit (:obj:`int`): currently not used by this method.
+
+        Return:
+            (:obj:`tuple` of aggregation cursor and :obj:`int`): result of
+            ``aggregate`` on the orthodb collection and number of documents matched.
+        """
+        # Copy before adding the helper key: assigning into the argument would leak
+        # '__order' into the caller's dict and into the shared default dict.
+        projection = dict(projection, __order=0)
+        query = {'orthodb_id': {'$in': orthodb_ids}}
+        pipeline = [
+             {'$match': query},
+             # position of each document's id in the input list, used as the sort key
+             {'$addFields': {"__order": {'$indexOfArray': [orthodb_ids, "$orthodb_id" ]}}},
+             {'$sort': {"__order": 1}},
+             {"$project": projection}
+            ]
+        docs = self.ortho.aggregate(pipeline)
+        count = self.ortho.count_documents(query)
+        return docs, count
+
     def get_meta_by_kegg_id(self, kegg_id):
         """Get meta information by kegg_id
         
 
@@ -64,6 +64,35 @@ def get_meta_by_id(self, _id):
             result.append(doc)
         return result
 
+    def get_ortho_by_id(self, _id):
+        '''
+            Look up one document in the collection by its uniprot id.
+
+            Args:
+                _id (:obj:`str`): uniprot id.
+
+            Returns:
+                (:obj:`list` of :obj:`dict`): one-element list holding the document when
+                it is found. When nothing matches, a single placeholder :obj:`dict`
+                (not wrapped in a list) is returned instead.
+        '''
+        record = self.collection.find_one(filter={'uniprot_id': _id}, projection={"_id": 0})
+        if record is not None:
+            # NOTE(review): ignore_nan is not an option of the stdlib json module;
+            # presumably `json` is simplejson in this file -- confirm.
+            cleaned = json.loads(json.dumps(record, ignore_nan=True))
+            return [cleaned]
+
+        # No match: hand back sentinel values instead of an empty result.
+        placeholder = {
+            'uniprot_id': 'None',
+            'entry_name': 'None',
+            'gene_name': 'None',
+            'protein_name': 'None',
+            'canonical_sequence': 'None',
+            'length': 99999999,
+            'mass': '99999999',
+            'abundances': [],
+            'ncbi_taxonomy_id': 99999999,
+            'species_name': '99999999',
+        }
+        return placeholder
+
     def get_meta_by_name_taxon(self, name, taxon_id):
         '''
             Get protein's metadata given protein name
@@ -869,4 +898,90 @@ def get_all_kegg(self, ko, anchor, max_distance):
                 species_canon_ancestor = obj[species+'_canon_ancestors']
                 doc['canon_ancestors'] = species_canon_ancestor
                 result[distance-1]['documents'].append(doc)
+        return result
+
+    def get_all_ortho(self, ko, anchor, max_distance):
+        '''Get replacement abundance value by taxonomic distance
+            with the same OrthoDB group number.
+
+        Args:
+            ko (:obj:`str`): OrthoDB group id to query for.
+            anchor (:obj:`str`): anchor species' name.
+            max_distance (:obj:`int`): max taxonomic distance from origin protein allowed for
+                                        proteins in results.
+
+        Returns:
+            (:obj:`list` of :obj:`dict`): list of result proteins and their info 
+            [
+            {'distance': 1, 'documents': [{}, {}, {} ...]}, 
+            {'distance': 2, 'documents': [{}, {}, {} ...]}, ...].
+            When ``max_distance`` is not positive, a :obj:`str` message is
+            returned instead of a list.
+        '''
+        if max_distance <= 0:
+            return 'Please use get_abundance_by_id to check self abundance values'
+
+        # One bucket per distance 1..max_distance; bucket i holds distance i + 1.
+        result = [{'distance': i + 1, 'documents': []} for i in range(max_distance)]
+
+        projection = {
+            'orthodb_id': 1,
+            'orthodb_name': 1,
+            'ancestor_name': 1,
+            'ncbi_taxonomy_id': 1,
+            'abundances': 1,
+            'species_name': 1,
+            'uniprot_id': 1,
+            '_id': 0,
+            'ancestor_taxon_id': 1,
+            'protein_name': 1,
+            'gene_name': 1,
+            'modifications': 1
+        }
+        con_0 = {'orthodb_id': ko}
+        con_1 = {'abundances': {'$exists': True}}
+        query = {'$and': [con_0, con_1]}
+        docs = self.collection.find(filter=query, projection=projection)
+        # ncbi_taxonomy_id -> tax_name. Keyed by taxon id so that documents lacking
+        # 'species_name' trigger at most one taxon_tree lookup per taxon.
+        names = {}
+        for doc in docs:
+            doc = json.loads(json.dumps(doc, ignore_nan=True))
+            species = doc.get('species_name')
+            if species is None:
+                taxon_id = doc['ncbi_taxonomy_id']
+                if taxon_id not in names:
+                    names[taxon_id] = self.db_obj['taxon_tree'].find_one({"tax_id": taxon_id})['tax_name']
+                species = names[taxon_id]
+            obj = self.taxon_manager.get_canon_common_ancestor_fast(anchor, species, org_format='tax_name')
+            distance = obj[anchor]
+            # NOTE(review): a distance of 0 passes this check and lands in result[-1]
+            # (the max_distance bucket) -- confirm whether 0 can be returned here
+            # and whether that placement is intended.
+            if distance != -1 and distance <= max_distance:
+                species_canon_ancestor = obj[species+'_canon_ancestors']
+                doc['canon_ancestors'] = species_canon_ancestor
+                result[distance-1]['documents'].append(doc)
+        return result
+
+    def get_info_by_orthodb(self, orthodb):
+        '''
+            Collect the uniprot ids of all documents sharing one OrthoDB group id.
+
+            Args:
+                orthodb(:obj:`str`): OrthoDB group id; it is lower-cased before querying.
+
+            Returns:
+                (:obj:`list` of :obj:`dict`): one-element list of the form
+                [{'orthodb_id': ... 'uniprot_ids': [], 'orthodb_name': ...}].
+                'orthodb_name' is only present when at least one document matched
+                and carries the value seen on the last matching document.
+        '''
+        group_id = orthodb.lower()
+        entry = {'orthodb_id': group_id, 'uniprot_ids': []}
+        wanted = {'uniprot_id': 1, '_id': 0, 'orthodb_name': 1, 'orthodb_id': 1}
+        matches = self.collection.find(filter={'orthodb_id': group_id}, projection=wanted)
+
+        for record in matches:
+            entry['orthodb_name'] = record.get('orthodb_name', ['no name'])
+            entry['uniprot_ids'].append(record.get('uniprot_id'))
+        result = [entry]
         return result
@@ -70,3 +70,24 @@ def get_doc_by_ko(self, ko_number, projection={'_id': 0},
                                     skip=_from, limit=size)
         count = self.collection.count_documents(query)
         return docs, count
+
+    def get_doc_by_orthodb(self, orthodb, projection={'_id': 0},
+                      _from=0, size=0):
+        """Fetch the documents whose 'orthodb_id' equals the given group ID.
+        
+        Args:
+            orthodb (:obj:`str`): Orthodb group ID.
+            projection (:obj:`dict`, optional): mongodb query result
+            projection. Defaults to {'_id': 0}.
+            _from (:obj:`int`): passed to ``find`` as ``skip``.
+            size (:obj:`int`): passed to ``find`` as ``limit``.
+
+        Return:
+            (:obj:`tuple` of :obj:`Pymongo.Cursor` and :obj:`int`):
+            result of ``find`` and the count of documents matching the
+            query (the count ignores ``_from`` and ``size``).
+        """
+        criteria = {'orthodb_id': orthodb}
+        cursor = self.collection.find(
+            filter=criteria,
+            projection=projection,
+            skip=_from,
+            limit=size,
+        )
+        total = self.collection.count_documents(criteria)
+        return cursor, total
@@ -20,6 +20,7 @@ def __init__(self, cache_dirname=None, MongoDB=None, replicaSet=None, db='datana
                         replicaSet=replicaSet, db=db,
                         verbose=verbose, max_entries=max_entries, username=username,
                         password=password, authSource=authSource, readPreference=readPreference)
+        self.u = self.client["datanator-test"]["uniprot"]
         self.chem_manager = chem_util.ChemUtil()
         self.file_manager = file_util.FileUtil()
         self.collection = self.db_obj[collection_str]
@@ -172,6 +173,79 @@ def get_kinlaw_by_rxn(self, substrates, products, dof=0,
         count = self.collection.count_documents(query)
         return count, docs
 
+    def get_kinlaw_by_rxn_ortho(self, substrates, products, dof=0,
+                          projection={'kinlaw_id': 1, '_id': 0, "enzymes": 1},
+                          bound='loose', skip=0, limit=0):
+        ''' Find the kinlaw_id defined in sabio_rk using 
+            rxn participants' inchikey, and attach the OrthoDB group of
+            the uniprot id found at ``enzymes[2].subunit[0]``.
+
+            Args:
+                substrates (:obj:`list`): list of substrates' inchikey
+                products (:obj:`list`): list of products' inchikey
+                dof (:obj:`int`, optional): degree of freedom allowed (number of parts of
+                                  inchikey to truncate); the default is 0
+                projection (:obj:`dict`): pymongo query projection 
+                bound (:obj:`str`): limit substrates/products to include only input values
+                skip (:obj:`int`, optional): passed to ``find`` as ``skip``
+                limit (:obj:`int`, optional): passed to ``find`` as ``limit``
+
+            Return:
+                (:obj:`tuple` of :obj:`int` and :obj:`list` of :obj:`dict`): number of
+                documents matching the query, and the kinlaws found. Each kinlaw has
+                'orthodb_id' and 'orthodb_name' added (``None`` when they cannot be
+                resolved) and 'enzymes' removed.
+        '''
+        substrate = 'reaction_participant.substrate_aggregate'
+        product = 'reaction_participant.product_aggregate'
+        # dof == 0 uses the inchikeys as given; otherwise match on a prefix.
+        if dof == 1:
+            substrates = [re.compile('^' + x[:-2]) for x in substrates]
+            products = [re.compile('^' + x[:-2]) for x in products]
+        elif dof != 0:
+            substrates = [re.compile('^' + x[:14]) for x in substrates]
+            products = [re.compile('^' + x[:14]) for x in products]
+
+        if bound == 'loose':
+            constraint_0 = {substrate: {'$all': substrates}}
+            constraint_1 = {product: {'$all': products}}
+        else:
+            constraint_0 = {substrate: substrates}
+            constraint_1 = {product: products}
+        constraint_2 = {"taxon_id": {"$ne": None}}
+        query = {'$and': [constraint_0, constraint_1, constraint_2]}
+        docs = self.collection.find(filter=query,
+                                    limit=limit,
+                                    skip=skip,
+                                    projection=projection)
+        # uniprot id -> uniprot document (or None). Misses are stored as well so an
+        # unknown id is looked up once rather than once per kinlaw.
+        cache = {}
+        result = []
+        for doc in docs:
+            ortho_id = None
+            ortho_name = None
+            try:
+                u_id = doc["enzymes"][2]["subunit"][0]["uniprot_id"]
+                if u_id is not None:
+                    if u_id not in cache:
+                        cache[u_id] = self.u.find_one(filter={"uniprot_id": u_id},
+                                                      projection={"orthodb_id": 1,
+                                                                  "orthodb_name": 1})
+                    hit = cache[u_id]
+                    ortho_id, ortho_name = hit["orthodb_id"], hit["orthodb_name"]
+            except (KeyError, IndexError, TypeError):
+                # Missing/short 'enzymes' structure, unknown uniprot id, or a uniprot
+                # document without OrthoDB fields: report both values as None.
+                ortho_id = None
+                ortho_name = None
+            doc["orthodb_id"] = ortho_id
+            doc["orthodb_name"] = ortho_name
+            doc.pop("enzymes", None)
+            result.append(doc)
+        count = self.collection.count_documents(query)
+        return count, result
+
     def get_kinlaw_by_entryid(self, entry_id):
         """Find reactions by sabio entry id
         
 
@@ -12,7 +12,7 @@ class QueryTaxonTree(mongo_util.MongoUtil):
 
     def __init__(self, cache_dirname=None, collection_str='taxon_tree', 
                 verbose=False, max_entries=float('inf'), username=None, MongoDB=None, 
-                password=None, db='datanator', authSource='admin', readPreference='nearest',
+                password=None, db='datanator-test', authSource='admin', readPreference='nearest',
                 replicaSet=None):
         self.collection_str = collection_str
         super().__init__(cache_dirname=cache_dirname, MongoDB=MongoDB,
@@ -415,6 +415,40 @@ def get_canon_common_ancestor_fast(self, org1, org2, org_format='tax_id'):
             Return:
                 (:obj:`Obj`)
         '''
-        anchor_org = self.collection.find_one({org_format: org1}, projection={'canon_anc_ids': 1, 'canon_anc_names': 1})
-        pipeline = self.pipeline_manager.aggregate_common_canon_ancestors(anchor_org, org1, intersect_name="ancMatch")
-        return self.collection.aggregate(pipeline)
+        if org1 is None or org2 is None:
+            return {'reason': 'Needs two organisms.'}
+        collection = self.client["datanator-test"]["taxon_tree"]
+        doc_1 = collection.find_one({org_format: org1}, projection={'canon_anc_ids': 1, 'canon_anc_names': 1})
+        doc_2 = collection.find_one({org_format: org2}, projection={'canon_anc_ids': 1, 'canon_anc_names': 1})
+        if doc_1 is None or doc_2 is None:
+            return {str(org1): -1, str(org2): -1, 'reason': 'No organism found.'}
+
+        canon_anc_1 = doc_1["canon_anc_names"]
+        canon_anc_2 = doc_2["canon_anc_names"]
+        
+        if canon_anc_1 == canon_anc_2:
+            return {str(org1): 0, str(org1)+'_canon_ancestors':canon_anc_1}
+
+        if canon_anc_1[-1] == org2:
+            distance1 = 1
+            distance2 = 0
+        elif canon_anc_2[-1] == org1:
+            distance1 = 0
+            distance2 = 1
+        else:
+            distance1 = -1
+            distance2 = -1            
+        ancestor = self.file_manager.get_common(canon_anc_1, canon_anc_2)
+        if ancestor == '':                
+            return {str(org1): -1, str(org2): -1, 'reason': 'No common ancestor.'}
+
+        idx_org1 = canon_anc_1.index(ancestor)
+        idx_org2 = canon_anc_2.index(ancestor)
+
+        if distance1 == -1:
+            distance1 = len(canon_anc_1) - (idx_org1) 
+        if distance2 == -1:
+            distance2 = len(canon_anc_2) - (idx_org2)
+
+        return {str(org1): distance1, str(org2): distance2, str(org1)+'_canon_ancestors':canon_anc_1,
+        str(org2)+'_canon_ancestors':canon_anc_2}
@@ -0,0 +1,62 @@
+from datanator_query_python.config import query_schema_2_manager
+from pymongo import MongoClient
+from pymongo import TEXT
+
+
+class FTX(query_schema_2_manager.QM):
+    """Search queries run through the ``$search`` aggregation stage."""
+
+    def __init__(self):
+        super().__init__()
+
+    def search_taxon(self,
+                     msg,
+                     skip=0,
+                     limit=10,
+                     token_order="any",
+                     db="datanator-test"):
+        """Search for taxon names.
+        (https://docs.atlas.mongodb.com/reference/atlas-search)
+
+        Args:
+            msg(:obj:`str`): query message.
+            skip(:obj:`int`, optional): number of records to skip.
+            limit(:obj:`int`, optional): max number of documents to return.
+            token_order(:obj:`str`, optional): token order, i.e. sequential or any.
+            db(:obj:`str`, optional): name of database in which the result resides.
+
+        Return:
+            (:obj:`list` of :obj:`dict`): matching documents, each reduced to
+            its ``tax_name`` field.
+        """
+        collection = self.client[db]["taxon_tree"]
+        pipeline = [
+            {
+                "$search": {
+                    "autocomplete": {
+                        "path": "tax_name",
+                        "query": msg,
+                        "fuzzy": {
+                            "maxEdits": 2,
+                            "prefixLength": 1,
+                            "maxExpansions": 100
+                        },
+                        "tokenOrder": token_order
+                    }
+                }
+            },
+            # Skip first, then limit: with the stages the other way round only
+            # `limit` documents reach $skip, so any skip >= limit returns nothing.
+            {
+                "$skip": skip
+            },
+            {
+                "$limit": limit
+            },
+            {
+                "$project": {
+                    "_id": 0,
+                    "tax_name": 1
+                }
+            }
+        ]
+        return list(collection.aggregate(pipeline))
@@ -0,0 +1,41 @@
+from datanator_query_python.config import query_schema_2_manager
+
+
+class QueryEn(query_schema_2_manager.QM):
+    """Lookups against an entity collection of the database named at construction."""
+
+    def __init__(self,
+                 db="datanator-demo"):
+        super().__init__()
+        # name of the database used by query_entity
+        self.db = db
+
+    def query_entity(self, 
+                    identifier,
+                    datatype="metabolite",
+                    collection="entity",
+                    limit=10,
+                    skip=0,
+                    projection={"_id": 0}):
+        """Find entities of one type that carry the given identifier.
+
+        Args:
+            identifier(:obj:`Obj`): identifier used for the entity; used as the
+                ``$elemMatch`` condition on the ``identifiers`` field.
+            datatype(:obj:`Obj`, optional): value the ``type`` field must equal.
+            collection(:obj:`str`): name of the collection in which data resides.
+            limit(:obj:`int`, optional): number of results to return.
+            skip(:obj:`int`, optional): number of documents to skip.
+            projection(:obj:`Obj`, optional): MongoDB projection.
+
+        Return:
+            (:obj:`list`): the matching documents.
+        """
+        has_identifier = {"identifiers": {"$elemMatch": identifier}}
+        has_type = {"type": datatype}
+        cursor = self.client[self.db][collection].find(
+            filter={"$and": [has_identifier, has_type]},
+            limit=limit,
+            skip=skip,
+            projection=projection,
+        )
+        return list(cursor)
@@ -0,0 +1,52 @@
+from datanator_query_python.config import query_schema_2_manager
+from pymongo import ASCENDING
+
+
+class QueryObs(query_schema_2_manager.QM):
+    """Lookups against an observation collection of the database named at construction."""
+
+    def __init__(self,
+                 db="datanator-demo"):
+        super().__init__()
+        # name of the database used by get_entity_datatype
+        self.db = db
+
+    def get_entity_datatype(self, 
+                              identifier,
+                              entity="protein",
+                              datatype="half-life",
+                              collection="observation",
+                              limit=10,
+                              skip=0,
+                              projection={"_id": 0}):
+        """Get observations of one datatype for an entity.
+
+        Args:
+            identifier(:obj:`Obj`): identifier used for the entity.
+            entity(:obj:`Obj`, optional): entity type. i.e. "protein", "RNA", etc. 
+            datatype(:obj:`Obj`, optional): Datatype to be retrieved.
+            collection(:obj:`str`, optional): name of collection in which values reside.
+            limit(:obj:`int`, optional): number of results to return.
+            skip(:obj:`int`, optional): number of documents to skip.
+            projection(:obj:`Obj`, optional): MongoDB projection.
+
+        Return:
+            (:obj:`list`): the matching documents.
+        """
+        # Condition on values.type. It stays empty (no datatype filtering) for any
+        # entity/datatype combination not listed below.
+        type_filter = {}
+        if datatype == "localization":
+            if entity == "protein":
+                type_filter["values.type"] = {
+                    "$in": ["intramembrane_localization", "secretome location"]
+                }
+            elif entity == "RNA":
+                type_filter["values.type"] = "subcellular_localization"
+        elif entity == "protein":
+            type_filter["values.type"] = datatype
+
+        conditions = [{"identifier": identifier}, {"entity.type": entity}, type_filter]
+        index_hint = [("identifier", ASCENDING),
+                      ("entity.type", ASCENDING),
+                      ("values.type", ASCENDING)]
+        cursor = self.client[self.db][collection].find(
+            filter={"$and": conditions},
+            limit=limit,
+            skip=skip,
+            collation=self.collation,
+            projection=projection,
+            hint=index_hint,
+        )
+        return list(cursor)
@@ -46,6 +46,10 @@
     ],
     entry_points={
         'console_scripts': [
+            '{} = {}.__main__:main'.format(name, name),
+            '{} = {}.__main__:main'.format(name.replace('_', '-'), name),
+            '{}{:d} = {}.__main__:main'.format(name, sys.version_info[0], name),
+            '{}{:d} = {}.__main__:main'.format(name.replace('_', '-'), sys.version_info[0], name),
         ],
     },
 )
@@ -0,0 +1,18 @@
+import unittest
+from datanator_query_python.config import query_schema_2_manager
+from pymongo import ReadPreference
+
+
+class TestQ(unittest.TestCase):
+    """Connection checks for query_schema_2_manager.QM."""
+
+    @classmethod
+    def setUpClass(cls):
+        # One manager instance is shared by every test in this class.
+        cls.src = query_schema_2_manager.QM()
+
+    @classmethod
+    def tearDownClass(cls):
+        # Close the shared client once all tests have run.
+        cls.src.client.close()
+
+    def test_conn_protein(self):
+        # The object returned by conn_protein is expected to read from NEAREST.
+        protein_conn = self.src.conn_protein('datanator-test')
+        self.assertEqual(protein_conn.read_preference, ReadPreference.NEAREST)
@@ -154,6 +154,7 @@ def test_bool_query(self):
         result = self.src.bool_query(query_message, index, must_not=must_not)
         self.assertEqual(result['hits']['hits'][0]['_source']['ko_number'], 'K00001')
 
+    @unittest.skip("ko obsolete")
     def test_get_index_ko_count(self):
         agg_field = "frontend_gene_aggregate"
         query_message = 'alcohol dehydrogenase'
@@ -170,11 +171,19 @@ def test_get_rxn_oi(self):
         result = self.src.get_rxn_oi(query_message)
         self.assertEqual(result['sabio_rk_total'], {'value': 10000, 'relation': 'gte'})
 
+    # Merge conflict resolved: the HEAD implementation is kept, and the test is
+    # skipped because the `testapi` side had disabled it (it was commented out).
+    @unittest.skip("ko obsolete")
     def test_get_genes_ko_count(self):
         agg_field = "frontend_gene_aggregate"
         query_message = 'K14236'
         result_0 = self.src.get_genes_ko_count(query_message, 15, agg_field=agg_field, size=10, fields=['*'])
         print(result_0)
 
     def test_get_genes_orthodb_count(self):
         agg_field = "orthodb_id"
 
@@ -61,4 +61,10 @@ def test_get_meta_by_kegg_ids(self):
     def test_get_meta_by_kegg_id(self):
+        # Fetch the metadata document for a single KEGG id and check its gene_name list.
         kegg_id = 'k00001'
         doc = self.src.get_meta_by_kegg_id(kegg_id)
-        self.assertEqual(doc['gene_name'], ['E1.1.1.1', 'adh'])
+        self.assertEqual(doc['gene_name'], ['E1.1.1.1', 'adh'])
+
+    def test_get_meta_by_ortho_ids(self):
+        # Smoke test: look up two ortho ids and print each returned document.
+        # The second returned value is not checked.
+        ortho_ids = ['643917at2', '567019at2']
+        documents, _count = self.src.get_meta_by_ortho_ids(ortho_ids)
+        for document in documents:
+            print(document)
@@ -280,7 +280,16 @@ def test_get_all_kegg(self):
         result_0 = self.src_1.get_all_kegg('K00850','Escherichia coli', 10)
         print(result_0)        
 
-    # @unittest.skip("skipping")
-    # def test_get_all_ortho(self):
-    #     result_0 = self.src_2.get_all_ortho('494933at2759','Escherichia coli', 10)
-    #     print(result_0)  
+    @unittest.skip("skipping")
+    def test_get_all_ortho(self):
+        # Currently skipped; when enabled it only prints what get_all_ortho returns.
+        ortho_result = self.src_2.get_all_ortho('494933at2759', 'Escherichia coli', 10)
+        print(ortho_result)
+
+    @unittest.skip("passed")
+    def test_get_info_by_orthodb(self):
+        # Currently skipped; when enabled it only prints the lookup result.
+        orthodb_info = self.src_2.get_info_by_orthodb("643917at2")
+        print(orthodb_info)
+
+    def test_get_ortho_by_id(self):
+        # Smoke test: print whatever get_ortho_by_id returns for one id.
+        ortho_entry = self.src_2.get_ortho_by_id("P53984")
+        print(ortho_entry)
@@ -149,4 +149,12 @@ def test_get_rxn_with_prm(self):
     def test_get_reaction_by_subunit(self):
+        # Query reactions by two subunit ids; the last entry's kinlaw_id must be one of two accepted values.
         _ids = ['P20932', 'P00803']
         result = self.src.get_reaction_by_subunit(_ids)
-        self.assertTrue(result[-1]['kinlaw_id'] in [31611, 31609])
+        self.assertTrue(result[-1]['kinlaw_id'] in [31611, 31609])
+
+    def test_get_kinlaw_by_rxn_ortho(self):
+        # Smoke test: pass two substrate and two product identifiers and print
+        # the second element of the returned pair.
+        substrates = ['XJLXINKUBYWONI-NNYOXOHSSA-N',
+                      'ODBLHEXUDAPZAU-UHFFFAOYSA-N']
+        products = ['GPRLSGONYQIRFK-UHFFFAOYSA-N',
+                    'KPGXRSRHYNQIFN-UHFFFAOYSA-N']
+        _, kinlaws = self.src.get_kinlaw_by_rxn_ortho(substrates, products)
+        print(kinlaws)
@@ -138,10 +138,26 @@ def test_get_canon_common_ancestor(self):
         # print(result_0)
 
     def test_get_canon_common_ancestor_fast(self):
-        org_0 = 2160
-        org_1 = 2161
-        for doc in self.src.get_canon_common_ancestor_fast(org_0, org_1):
-            print(doc)
+        query = self.src.get_canon_common_ancestor_fast
+
+        # Names that match no organism are expected to produce a "reason" entry.
+        not_found = query("Escherichia coli E1002", "Homo 1002", org_format="tax_name")
+        self.assertEqual(not_found["reason"], "No organism found.")
+
+        # Two taxon ids, compared against the complete expected result.
+        by_id = query(743725, 2107591)
+        self.assertEqual(by_id, {
+            '2107591': 4,
+            '2107591_canon_ancestors': [
+                'cellular organisms',
+                'Archaea',
+                'Candidatus Diapherotrites',
+                'Candidatus Forterrea',
+                'Candidatus Forterrea multitransposorum',
+            ],
+            '743725': 1,
+            '743725_canon_ancestors': ['cellular organisms', 'Archaea'],
+        })
+
+        # Lookups by name: the entry for 'Escherichia coli' is expected to be 0 in both cases.
+        mixed_names = query("Escherichia coli E1002", "Escherichia coli", org_format="tax_name")
+        self.assertEqual(mixed_names["Escherichia coli"], 0)
+        same_name = query('Escherichia coli', 'Escherichia coli', org_format='tax_name')
+        self.assertEqual(same_name['Escherichia coli'], 0)
+
+        # A second pair of taxon ids, again compared against the complete result.
+        by_id = query(9606, 4932)
+        self.assertEqual(by_id, {
+            '9606': 7,
+            '4932': 7,
+            '9606_canon_ancestors': [
+                'cellular organisms', 'Eukaryota', 'Metazoa', 'Chordata',
+                'Mammalia', 'Primates', 'Hominidae', 'Homo',
+            ],
+            '4932_canon_ancestors': [
+                'cellular organisms', 'Eukaryota', 'Fungi', 'Ascomycota',
+                'Saccharomycetes', 'Saccharomycetales', 'Saccharomycetaceae',
+                'Saccharomyces',
+            ],
+        })
+   
+
 
 class TestQueryTaxonTreeMock(unittest.TestCase):
 
 
@@ -0,0 +1,18 @@
+from datanator_query_python.query_schema_2 import ftx_search
+import unittest
+
+
+class TestQEn(unittest.TestCase):
+    """Tests for ftx_search.FTX."""
+
+    @classmethod
+    def setUpClass(cls):
+        # One FTX instance is shared by every test in this class.
+        cls.src = ftx_search.FTX()
+
+    @classmethod
+    def tearDownClass(cls):
+        # Close the shared client once all tests have run.
+        cls.src.client.close()
+
+    def test_search_taxon(self):
+        # Smoke test: run a taxon search and print whatever comes back.
+        hits = self.src.search_taxon("off", token_order="any")
+        print(hits)
@@ -0,0 +1,22 @@
+from datanator_query_python.query_schema_2 import query_entity
+import unittest
+
+
+class TestQEn(unittest.TestCase):
+    """Tests for query_entity.QueryEn."""
+
+    @classmethod
+    def setUpClass(cls):
+        # One QueryEn instance is shared by every test in this class.
+        cls.src = query_entity.QueryEn()
+
+    @classmethod
+    def tearDownClass(cls):
+        # Close the shared client once all tests have run.
+        cls.src.client.close()
+
+    def test_query_entity(self):
+        # This identifier is expected to match exactly one entity.
+        known = {"namespace": "inchikey", "value": "TYEYBOSBBBHJIV-UHFFFAOYSA-N"}
+        matches = self.src.query_entity(known)
+        self.assertEqual(len(matches), 1)
+        self.assertEqual(matches[0]["name"], "2-Ketobutyric acid")
+
+        # This identifier is expected to match nothing and give an empty list.
+        unknown = {"namespace": "inchikey", "value": "234wgadgas"}
+        no_matches = self.src.query_entity(unknown)
+        self.assertEqual(no_matches, [])
@@ -0,0 +1,27 @@
+from datanator_query_python.query_schema_2 import query_observation
+import unittest
+
+
+class TestQOb(unittest.TestCase):
+    """Tests for query_observation.QueryObs."""
+
+    @classmethod
+    def setUpClass(cls):
+        # One QueryObs instance is shared by every test in this class.
+        cls.src = query_observation.QueryObs()
+
+    @classmethod
+    def tearDownClass(cls):
+        # Close the shared client once all tests have run.
+        cls.src.client.close()
+
+    def test_get_entity_datatype(self):
+        fetch = self.src.get_entity_datatype
+
+        # Default entity and datatype, identifier expected to match.
+        found = fetch({"namespace": "gene_symbol", "value": "BAG1"})
+        self.assertEqual(found[0]["entity"]["name"], 'BCL2-associated athanogene.')
+
+        # Default entity and datatype, identifier expected to match nothing.
+        found = fetch({"namespace": "gene_symbol", "value": "something"})
+        self.assertEqual(found, [])
+
+        # Localization with the default entity.
+        found = fetch({"namespace": "gene_name", "value": "EMC3"},
+                      datatype="localization")
+        self.assertEqual(found[0]["entity"]["name"], 'ER membrane protein complex subunit 3')
+
+        # Localization for an RNA entity.
+        found = fetch({"namespace": "gene_id", "value": "100003563"},
+                      entity="RNA", datatype="localization")
+        self.assertEqual(found[0]["genotype"]["taxon"]["name"], 'Danio rerio')
@@ -0,0 +1,50 @@
+import os
+os.environ["WHERE"] = "API_TEST"
+import unittest
+from datanator_query_python.config import config as query_config
+from datanator_query_python.query_schema_2 import query_taxon_tree_v2
+import asyncio
+from pprint import pprint
+
+
+
+class TestQTaxon(unittest.TestCase):
+    """Tests for query_taxon_tree_v2.QTaxon, run through the asyncio event loop."""
+
+    @classmethod
+    def setUpClass(cls):
+        # Print the WHERE setting in effect, then build the shared query object.
+        print(os.getenv("WHERE"))
+        cls.src = query_taxon_tree_v2.QTaxon()
+
+    @classmethod
+    def tearDownClass(cls):
+        # Blank out the WHERE setting once this class is finished.
+        os.environ["WHERE"] = ""
+
+    def test_get_canon_ancestor(self):
+        run = asyncio.get_event_loop().run_until_complete
+
+        # Lookup by taxon id: one id must be absent from the result, one present.
+        ancestor_ids, _ = run(self.src.get_canon_ancestor(1915648, _format='tax_id'))
+        self.assertNotIn(2283796, ancestor_ids)
+        self.assertIn(2157, ancestor_ids)
+
+        # Lookup by a name that is expected to match nothing.
+        ancestor_ids, _ = run(self.src.get_canon_ancestor("nonsense", _format='tax_name'))
+        self.assertEqual([], ancestor_ids)
+
+    # @unittest.skip("for now")
+    def test_aggregate_distance(self):
+        run = asyncio.get_event_loop().run_until_complete
+        aggregate = self.src.aggregate_distance
+
+        # tax_id 1299189
+        salmonella = [{
+            "canon_anc_ids": [131567, 2, 1224, 1236, 91347, 543, 590, 28901],
+            "tax_name": "Salmonella enterica subsp. enterica serovar Newport str. CFSAN000907",
+        }]
+
+        # With target 0 the input is expected to come back unchanged.
+        unchanged = run(aggregate(salmonella, 0, name_field='tax_name'))
+        self.assertEqual(unchanged, salmonella)
+
+        with_distance = run(aggregate(salmonella, 1227178, name_field='tax_name'))
+        self.assertEqual(
+            with_distance[0]['taxon_distance']['Salmonella enterica subsp. enterica serovar Newport str. CFSAN001557'],
+            0)
+
+        # The target is an ancestor of the measured organism.
+        yeast = [{
+            "canon_anc_ids": [131567, 2759, 4751, 4890, 4891, 4892, 4893, 4930, 4932],
+            "tax_name": "Saccharomyces cerevisiae CAT-1",
+        }]
+        # 4932 is Saccharomyces cerevisiae.
+        distances = run(aggregate(yeast, 4932, name_field='tax_name'))[0]['taxon_distance']
+        self.assertEqual(distances['Saccharomyces cerevisiae'], 0)
+        self.assertEqual(distances['Saccharomyces cerevisiae CAT-1'], 1)
@@ -46,6 +46,23 @@ def test_version(self):
                                  datanator_query_python.__version__)
                 self.assertEqual(captured.stderr.get_text(), '')
 
+    def test_define_schema(self):
+        # Run the `mongo-def-schema` sub-command and check both the parsed
+        # arguments and the captured output.
+        # NOTE(review): another `test_define_schema` follows in this class; the
+        # later definition replaces this one, so only one of them runs -- confirm
+        # which should be kept.
+        with capturer.CaptureOutput(merged=False, relay=False) as captured:
+            with __main__.App(argv=['mongo-def-schema',
+                                    'test',
+                                    'cli_test',
+                                    '../datanator_pattern_design/compiled/taxon_compiled.json']) as app:
+                # run app
+                app.run()
+
+                # test that the arguments to the CLI were correctly parsed
+                self.assertEqual(app.pargs.db, 'test')
+                # assertEqual, not assertTrue: assertTrue would treat 'cli_test'
+                # as the failure message and accept any truthy value.
+                self.assertEqual(app.pargs.collection, 'cli_test')
+
+                # test that the CLI produced the correct output
+                self.assertEqual(captured.stdout.get_text(), 'done')
+                self.assertEqual(captured.stderr.get_text(), '')
+
     def test_define_schema(self):
         with capturer.CaptureOutput(merged=False, relay=False) as captured:
             with __main__.App(argv=['mongo-def-schema',
 
@@ -0,0 +1,25 @@
+from datanator_query_python.util import motor_util
+from datanator_query_python.config import config
+import unittest
+
+
+class TestMUtil(unittest.TestCase):
+    """Tests for motor_util.MotorUtil, using a scratch collection."""
+
+    @classmethod
+    def setUpClass(cls):
+        # Build one shared MotorUtil from the project configuration.
+        cls.src = motor_util.MotorUtil(MongoDB=config.AtlasConfig.SERVER,
+                username=config.DatanatorTest.USERNAME,
+                password=config.DatanatorTest.PASSWORD,
+                authSource=config.AtlasConfig.AUTHDB,
+                replicaSet=config.AtlasConfig.REPLSET,
+                readPreference=config.AtlasConfig.READ_PREFERENCE)
+        cls.test_collection = 'test_motor'
+        cls.test_database = "test"
+
+    @classmethod
+    def tearDownClass(cls):
+        # Drop the scratch collection while the client is still open, then
+        # close it. Previously the client was closed first and the drop was
+        # issued on the already-closed client.
+        # NOTE(review): if `client` is a Motor client, drop_collection (and
+        # insert_one in test_client) return awaitables that are never awaited
+        # here -- confirm whether they should be run on an event loop.
+        cls.src.client.get_database(cls.test_database).drop_collection(cls.test_collection)
+        cls.src.client.close()
+
+    def test_client(self):
+        # Insert one document into the scratch collection through the client.
+        self.src.client.get_database(self.test_database)[self.test_collection].insert_one({"test": 1})