Skip to content

Commit 5f3f35c

Browse files
author
Zhouyang Lian
committedOct 9, 2020
merge conflict
2 parents 11cc815 + 79d5616 commit 5f3f35c

27 files changed

+738
-20
lines changed
 

‎.circleci/requirements.txt

-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
# Karr Lab packages
22
git+https://github.com/KarrLab/pkg_utils.git#egg=pkg_utils
33
git+https://github.com/KarrLab/wc_utils.git#egg=wc_utils
4-
54
git+https://github.com/KarrLab/karr_lab_aws_manager.git#egg=karr_lab_aws_manager

‎datanator_query_python/__main__.py

+39-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import cement
1010
from datanator_query_python.util import mongo_util
1111
from datanator_query_python.config import config
12-
import datanator_query_python
12+
import datanator_query_python.core
1313

1414

1515
class BaseController(cement.Controller):
@@ -65,13 +65,51 @@ def _default(self):
6565
print("done")
6666

6767

68+
class DefineSchema(cement.Controller):
    """Controller for the ``mongo-def-schema`` sub-command.

    Passes a collection name and a jsonschema location to
    ``MongoUtil.define_schema`` for the selected database.
    """

    class Meta:
        # Name of the sub-command; it is nested under the base controller.
        label = 'mongo-def-schema'
        description = 'Define jsonschema of a collection'
        stacked_on = 'base'
        stacked_type = 'nested'
        arguments = [
            (['db'], dict(
                type=str, help='Name of the database in which the collection resides.')),
            (['collection'], dict(
                type=str, help='Name of the collection to be defined.')),
            (['jsonschema'], dict(
                type=str, help='Location of jsonschema')),
            (['--config_name', '-cn'], dict(
                type=str, default='TestConfig',
                help='Config class to be used.'))
        ]

    @cement.ex(hide=True)
    def _default(self):
        ''' Define the jsonschema of a collection.

        The values below are read from the parsed command line
        (``self.app.pargs``), not passed as Python arguments.

        Args:
            db (:obj:`str`): name of the database in which the collection resides.
            collection (:obj:`str`): name of the collection to be defined.
            jsonschema (:obj:`str`): location of the jsonschema.
            config_name (:obj:`str`): name of the class in ``config`` that
                provides SERVER, USERNAME and PASSWORD (default ``TestConfig``).
        '''
        args = self.app.pargs
        # Look up the configuration class by name; an unknown name raises AttributeError.
        conf = getattr(config, args.config_name)
        mongo_util.MongoUtil(MongoDB=conf.SERVER,
                             db=args.db,
                             username=conf.USERNAME,
                             password=conf.PASSWORD).define_schema(args.collection, args.jsonschema)
        print("done")
103+
104+
68105
class App(cement.App):
69106
""" Command line application """
70107
class Meta:
71108
label = 'datanator_query_python'
72109
base_controller = 'base'
73110
handlers = [
74111
BaseController,
112+
Command3WithArgumentsController,
75113
DefineSchema
76114
]
77115

‎datanator_query_python/config/query_manager.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ def __init__(self):
1414
self.read_preference = config.AtlasConfig.READ_PREFERENCE
1515
self.repl = config.AtlasConfig.REPLSET
1616

17-
def protein_manager(self):
17+
def protein_manager(self, database="datanator"):
1818
return query_protein.QueryProtein(username=self.username, password=self.password, server=self.server,
19-
authSource=self.authDB, readPreference=self.read_preference, replicaSet=self.repl)
19+
authSource=self.authDB, readPreference=self.read_preference, replicaSet=self.repl, database=database)
2020

2121
def metabolite_concentration_manager(self):
2222
return query_metabolite_concentrations.QueryMetaboliteConcentrations(MongoDB=self.server, db='datanator',
@@ -71,10 +71,10 @@ def metabolites_meta_manager():
7171

7272
class RnaManager:
7373

74-
def rna_manager(self):
74+
def rna_manager(self, db="datanator"):
7575
return query_rna_halflife.QueryRNA(username=config.AtlasConfig.USERNAME, password=config.AtlasConfig.PASSWORD,
7676
server=config.AtlasConfig.SERVER, authDB=config.AtlasConfig.AUTHDB, readPreference=config.AtlasConfig.READ_PREFERENCE,
77-
db='datanator', collection_str='rna_halflife_new', replicaSet=config.AtlasConfig.REPLSET)
77+
db=db, collection_str='rna_halflife_new', replicaSet=config.AtlasConfig.REPLSET)
7878

7979

8080
class KeggManager:

‎datanator_query_python/config/query_schema_2_manager.py

+2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from datanator_query_python.config import config
22
from datanator_query_python.util import mongo_util
33
from pymongo import ReadPreference
4+
from pymongo.collation import Collation, CollationStrength
45

56

67
class QM(mongo_util.MongoUtil):
@@ -17,6 +18,7 @@ def __init__(self,
1718
username=username, password=password,
1819
authSource=authSource, readPreference=readPreference)
1920
self.read_preference = self._convert_read_p(readPreference)
21+
self.collation = Collation(locale='en', strength=CollationStrength.SECONDARY)
2022

2123
def _convert_read_p(self, read_preference):
2224
"""Convert string read preference to pymongo

‎datanator_query_python/query/full_text_search.py

+27
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,11 @@ def get_index_ko_count(self, q, num, agg_field="frontend_gene_aggregate", index=
196196
"aggs": {
197197
"top_ko": {
198198
"top_hits": {'_source': {'includes': ['ko_number', 'ko_name', 'protein_name', 'definition', agg_field,
199+
<<<<<<< HEAD
199200
'species_name', "orthodb_name", "orthodb_id"]}, 'size': 1}
201+
=======
202+
'species_name', "orthodb_name", "orthodb_id", "uniprot_id"]}, 'size': 1}
203+
>>>>>>> testapi
200204
},
201205
"top_hit" : {
202206
"max": {
@@ -341,7 +345,11 @@ def get_rxn_oi(self, query_message, minimum_should_match=0, from_=0,
341345
result['sabio_rk'].append(hit['_source'])
342346
return result
343347

348+
<<<<<<< HEAD
344349
def get_genes_orthodb_count(self, q, num, agg_field="orthodb_id", **kwargs):
350+
=======
351+
def get_genes_orthodb_count(self, q, num, agg_field="orthodb_id.keyword", **kwargs):
352+
>>>>>>> testapi
345353
"""Get protein index with different ko_number field for up to num hits,
346354
provided at least one of the proteins under orthodb_id has abundance info.
347355
@@ -376,7 +384,11 @@ def get_genes_orthodb_count(self, q, num, agg_field="orthodb_id", **kwargs):
376384
"aggs": {
377385
"top_ko": {
378386
"top_hits": {'_source': {'includes': ['orthodb_id', 'orthodb_name', 'protein_name', 'definition', agg_field,
387+
<<<<<<< HEAD
379388
'species_name']}, "size": 1}
389+
=======
390+
'species_name', "uniprot_id"]}, "size": 1}
391+
>>>>>>> testapi
380392
},
381393
"top_hit" : {
382394
"max": {
@@ -412,6 +424,7 @@ def get_genes_orthodb_count(self, q, num, agg_field="orthodb_id", **kwargs):
412424
# for i, s in enumerate(r['aggregations']['top_kos']['buckets']):
413425
# r['aggregations']['top_kos']['buckets'][i]['key'] = [s['key'][i:i+6] for i in range(0, len(s['key']), 6)]
414426
for bucket_abundance in r['aggregations']['top_kos']['buckets']:
427+
<<<<<<< HEAD
415428
ko_abundance.add(bucket_abundance['top_ko']['hits']['hits'][0]['_source'][agg_field])
416429

417430
for bucket_all in r_all['top_kos']['buckets']:
@@ -424,6 +437,20 @@ def get_genes_orthodb_count(self, q, num, agg_field="orthodb_id", **kwargs):
424437
# s['top_ko']['hits']['hits'][0]['_source']['abundances'] = True
425438
s['top_ko']['hits']['hits'][0]['_source'][agg_field] = [ko_str[i:i+6] for i in range(0, len(ko_str), 6)]
426439
else:
440+
=======
441+
ko_abundance.add(bucket_abundance['top_ko']['hits']['hits'][0]['_source'].get(agg_field))
442+
443+
for bucket_all in r_all['top_kos']['buckets']:
444+
ko_all.add(bucket_all['top_ko']['hits']['hits'][0]['_source'].get(agg_field))
445+
intersects = ko_abundance.intersection(ko_all)
446+
for s in r['aggregations']['top_kos']['buckets']:
447+
ko_str = s['top_ko']['hits']['hits'][0]['_source'].get(agg_field) # ko_str can be "K01234,K12345"
448+
# if ko_str in intersects and ko_str != 'nan':
449+
if ko_str is None:
450+
# # s['top_ko']['hits']['hits'][0]['_source']['abundances'] = True
451+
# s['top_ko']['hits']['hits'][0]['_source'][agg_field] = [ko_str[i:i+6] for i in range(0, len(ko_str), 6)]
452+
# else:
453+
>>>>>>> testapi
427454
# s['top_ko']['hits']['hits'][0]['_source']['abundances'] = False
428455
s['top_ko']['hits']['hits'][0]['_source'][agg_field] = ["N/A"]
429456
return r['aggregations']

‎datanator_query_python/query/query_kegg_orthology.py

+24
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ def __init__(self, username=None, password=None, server=None, authSource='admin'
1414
self.max_entries = max_entries
1515
self.verbose = verbose
1616
self.client, self.db, self.collection = self.con_db('kegg_orthology')
17+
self.ortho = self.db_obj["orthodb"]
1718
self.collation = Collation(locale='en', strength=CollationStrength.SECONDARY)
1819

1920
def get_ko_by_name(self, name):
@@ -97,6 +98,29 @@ def get_meta_by_kegg_ids(self, kegg_ids, projection={'_id': 0, 'gene_ortholog':
9798
count = self.collection.count_documents(query, collation=self.collation)
9899
return docs, count
99100

101+
def get_meta_by_ortho_ids(self, orthodb_ids, projection={'_id': 0, 'gene_ortholog': 0},
                          limit=0):
    """Get OrthoDB metadata given a list of OrthoDB ids.

    Matching documents are sorted by the position of their ``orthodb_id``
    within ``orthodb_ids``.

    Args:
        orthodb_ids (:obj:`list` of :obj:`str`): List of orthodb ids.
        projection (:obj:`dict`): MongoDB result projection. The dictionary
            passed in is not modified.
        limit (:obj:`int`): Currently unused.

    Return:
        (:obj:`tuple` of :obj:`pymongo.Cursor` and :obj:`int`): pymongo Cursor obj and number of documents found.
    """
    # Copy before adding the helper field: writing into the argument would
    # mutate the shared default dict and any dict owned by the caller.
    projection = dict(projection)
    projection['__order'] = 0  # hide the temporary sort key from the output
    query = {'orthodb_id': {'$in': orthodb_ids}}
    pipeline = [
        {'$match': query},
        # Tag each document with the index of its id in the request ...
        {'$addFields': {"__order": {'$indexOfArray': [orthodb_ids, "$orthodb_id"]}}},
        # ... and sort on it so the output follows the request order.
        {'$sort': {"__order": 1}},
        {"$project": projection}
    ]
    docs = self.ortho.aggregate(pipeline)
    count = self.ortho.count_documents(query)
    return docs, count
123+
100124
def get_meta_by_kegg_id(self, kegg_id):
101125
"""Get meta information by kegg_id
102126

‎datanator_query_python/query/query_protein.py

+115
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,35 @@ def get_meta_by_id(self, _id):
6464
result.append(doc)
6565
return result
6666

67+
def get_ortho_by_id(self, _id):
    '''
    Look up a single protein document by its uniprot id.

    Args:
        _id (:obj:`str`): uniprot id.

    Returns:
        (:obj:`list` of :obj:`dict`): one-element list holding the matching
        document (without ``_id``). When nothing matches, a single
        placeholder :obj:`dict` (not a list) filled with sentinel values
        is returned instead.
    '''
    record = self.collection.find_one(filter={'uniprot_id': _id}, projection={"_id": 0})
    if record is not None:
        # Round-trip through JSON with ignore_nan, presumably to turn NaN
        # into null (assumes ``json`` is simplejson -- TODO confirm).
        return [json.loads(json.dumps(record, ignore_nan=True))]

    # No match: hand back the placeholder record.
    return {'uniprot_id': 'None',
            'entry_name': 'None',
            'gene_name': 'None',
            'protein_name': 'None',
            'canonical_sequence': 'None',
            'length': 99999999,
            'mass': '99999999',
            'abundances': [],
            'ncbi_taxonomy_id': 99999999,
            'species_name': '99999999'}
95+
6796
def get_meta_by_name_taxon(self, name, taxon_id):
6897
'''
6998
Get protein's metadata given protein name
@@ -869,4 +898,90 @@ def get_all_kegg(self, ko, anchor, max_distance):
869898
species_canon_ancestor = obj[species+'_canon_ancestors']
870899
doc['canon_ancestors'] = species_canon_ancestor
871900
result[distance-1]['documents'].append(doc)
901+
return result
902+
903+
def get_all_ortho(self, ko, anchor, max_distance):
    '''Get replacement abundance value by taxonomic distance
    with the same OrthoDB group number.

    Proteins of the group that carry an ``abundances`` field are bucketed
    by the anchor's distance to the canonical common ancestor shared with
    the protein's species. Documents whose species cannot be resolved,
    whose distance is unknown (-1), zero (same lineage as the anchor) or
    larger than ``max_distance`` are left out.

    Args:
        ko (:obj:`str`): OrthoDB group id to query for.
        anchor (:obj:`str`): anchor species' name.
        max_distance (:obj:`int`): max taxonomic distance from origin protein allowed for
                                proteins in results.

    Returns:
        (:obj:`list` of :obj:`dict`): list of result proteins and their info
        [
        {'distance': 1, 'documents': [{}, {}, {} ...]},
        {'distance': 2, 'documents': [{}, {}, {} ...]}, ...].
        A message string is returned instead when ``max_distance <= 0``.
    '''
    if max_distance <= 0:
        return 'Please use get_abundance_by_id to check self abundance values'

    result = [{'distance': i + 1, 'documents': []} for i in range(max_distance)]

    projection = {
        'orthodb_id': 1,
        'orthodb_name': 1,
        'ancestor_name': 1,
        'ncbi_taxonomy_id': 1,
        'abundances': 1,
        'species_name': 1,
        'uniprot_id': 1,
        '_id': 0,
        'ancestor_taxon_id': 1,
        'protein_name': 1,
        'gene_name': 1,
        'modifications': 1
    }
    query = {'$and': [{'orthodb_id': ko}, {'abundances': {'$exists': True}}]}
    docs = self.collection.find(filter=query, projection=projection)
    # taxon id -> species name (None when the taxon is unknown). The
    # original looked the *name* up in a container of taxon ids, so the
    # cache never hit and every document triggered a new query.
    names = {}
    for doc in docs:
        doc = json.loads(json.dumps(doc, ignore_nan=True))
        species = doc.get('species_name')
        if species is None:
            taxon_id = doc.get('ncbi_taxonomy_id')
            if taxon_id is None:
                continue
            if taxon_id not in names:
                taxon = self.db_obj['taxon_tree'].find_one({"tax_id": taxon_id})
                names[taxon_id] = None if taxon is None else taxon.get('tax_name')
            species = names[taxon_id]
            if species is None:
                # Species cannot be resolved, so no distance can be computed.
                continue
        obj = self.taxon_manager.get_canon_common_ancestor_fast(anchor, species, org_format='tax_name')
        # The lookup may return only a 'reason' entry, or omit the
        # species' ancestor list, so read both defensively.
        distance = obj.get(anchor, -1)
        species_canon_ancestor = obj.get(species + '_canon_ancestors')
        # Require distance >= 1: a distance of 0 would index result[-1]
        # and silently land in the farthest bucket.
        if 1 <= distance <= max_distance and species_canon_ancestor is not None:
            doc['canon_ancestors'] = species_canon_ancestor
            result[distance - 1]['documents'].append(doc)
    return result
964+
965+
def get_info_by_orthodb(self, orthodb):
    '''
    Collect the uniprot ids of all proteins in one OrthoDB group.

    Args:
        orthodb(:obj:`str`): OrthoDB group ID; it is lower-cased before querying.

    Returns:
        (:obj:`list` of :obj:`dict`): single-element list of the form
        [{'orthodb_id': ... 'orthodb_name': ... 'uniprot_ids': []}].
        ``orthodb_name`` is taken from the last matching document
        (``['no name']`` if that document has none) and is absent when
        no document matches.
    '''
    group_id = orthodb.lower()
    summary = {'orthodb_id': group_id, 'uniprot_ids': []}
    wanted = {'uniprot_id': 1, '_id': 0, 'orthodb_name': 1, 'orthodb_id': 1}
    cursor = self.collection.find(filter={'orthodb_id': group_id}, projection=wanted)

    for record in cursor:
        # Each document overwrites the name, so the last one wins.
        summary['orthodb_name'] = record.get('orthodb_name', ['no name'])
        summary['uniprot_ids'].append(record.get('uniprot_id'))
    return [summary]

‎datanator_query_python/query/query_rna_halflife.py

+21
Original file line numberDiff line numberDiff line change
@@ -70,3 +70,24 @@ def get_doc_by_ko(self, ko_number, projection={'_id': 0},
7070
skip=_from, limit=size)
7171
count = self.collection.count_documents(query)
7272
return docs, count
73+
74+
def get_doc_by_orthodb(self, orthodb, projection={'_id': 0},
                       _from=0, size=0):
    """Fetch the documents that belong to an OrthoDB group.

    Args:
        orthodb (:obj:`str`): Orthodb group ID.
        projection (:obj:`dict`, optional): mongodb query result
            projection. Defaults to {'_id': 0}.
        _from (:obj:`int`): number of documents to skip (passed as ``skip``).
        size (:obj:`int`): maximum number of documents (passed as ``limit``).

    Return:
        (:obj:`tuple` of :obj:`Pymongo.Cursor` and :obj:`int`):
            pymongo iterable and the number of documents matching the
            query (counted without ``skip``/``limit``).
    """
    criteria = {'orthodb_id': orthodb}
    cursor = self.collection.find(filter=criteria, projection=projection,
                                  skip=_from, limit=size)
    total = self.collection.count_documents(criteria)
    return cursor, total

‎datanator_query_python/query/query_sabiork_old.py

+74
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ def __init__(self, cache_dirname=None, MongoDB=None, replicaSet=None, db='datana
2020
replicaSet=replicaSet, db=db,
2121
verbose=verbose, max_entries=max_entries, username=username,
2222
password=password, authSource=authSource, readPreference=readPreference)
23+
self.u = self.client["datanator-test"]["uniprot"]
2324
self.chem_manager = chem_util.ChemUtil()
2425
self.file_manager = file_util.FileUtil()
2526
self.collection = self.db_obj[collection_str]
@@ -172,6 +173,79 @@ def get_kinlaw_by_rxn(self, substrates, products, dof=0,
172173
count = self.collection.count_documents(query)
173174
return count, docs
174175

176+
def get_kinlaw_by_rxn_ortho(self, substrates, products, dof=0,
                            projection={'kinlaw_id': 1, '_id': 0, "enzymes": 1},
                            bound='loose', skip=0, limit=0):
    ''' Find the kinlaw_id defined in sabio_rk using
        rxn participants' inchikey, and annotate every hit with the
        OrthoDB group of its enzyme.

        Args:
            substrates (:obj:`list`): list of substrates' inchikey
            products (:obj:`list`): list of products' inchikey
            dof (:obj:`int`, optional): degree of freedom allowed (number of parts of
                                      inchikey to truncate); the default is 0
            projection (:obj:`dict`): pymongo query projection
            bound (:obj:`str`): limit substrates/products to include only input values
            skip (:obj:`int`): number of documents to skip
            limit (:obj:`int`): max number of documents to return (0 for no limit)

        Return:
            (:obj:`tuple` of :obj:`int` and :obj:`list` of :obj:`dict`): number of
            kinlaws matching the query (ignoring skip/limit) and the kinlaws
            themselves. Each kinlaw gains ``orthodb_id`` and ``orthodb_name``
            (``None`` when they cannot be determined) and loses ``enzymes``.
    '''
    substrate = 'reaction_participant.substrate_aggregate'
    product = 'reaction_participant.product_aggregate'
    if dof == 1:
        # Drop the last two characters of each key and match by prefix.
        substrates = [re.compile('^' + x[:-2]) for x in substrates]
        products = [re.compile('^' + x[:-2]) for x in products]
    elif dof != 0:
        # Keep only the first 14 characters of each key.
        substrates = [re.compile('^' + x[:14]) for x in substrates]
        products = [re.compile('^' + x[:14]) for x in products]

    if bound == 'loose':
        constraint_0 = {substrate: {'$all': substrates}}
        constraint_1 = {product: {'$all': products}}
    else:
        constraint_0 = {substrate: substrates}
        constraint_1 = {product: products}
    constraint_2 = {"taxon_id": {"$ne": None}}
    query = {'$and': [constraint_0, constraint_1, constraint_2]}
    docs = self.collection.find(filter=query,
                                limit=limit,
                                skip=skip,
                                projection=projection)
    # uniprot id -> uniprot document (or None). Keyed by membership so
    # that ids without a uniprot document are not queried again.
    cache = {}
    result = []
    for doc in docs:
        orthodb_id = None
        orthodb_name = None
        try:
            u_id = doc["enzymes"][2]["subunit"][0]["uniprot_id"]
        except (KeyError, IndexError, TypeError):
            # Document does not have the expected enzyme/subunit layout.
            u_id = None
        if u_id is not None:
            if u_id not in cache:
                cache[u_id] = self.u.find_one(filter={"uniprot_id": u_id},
                                              projection={"orthodb_id": 1,
                                                          "orthodb_name": 1})
            x = cache[u_id]
            # Both fields must be present, otherwise both stay None.
            if x is not None and "orthodb_id" in x and "orthodb_name" in x:
                orthodb_id = x["orthodb_id"]
                orthodb_name = x["orthodb_name"]
        doc["orthodb_id"] = orthodb_id
        doc["orthodb_name"] = orthodb_name
        doc.pop("enzymes", None)
        result.append(doc)
    count = self.collection.count_documents(query)
    return count, result
248+
175249
def get_kinlaw_by_entryid(self, entry_id):
176250
"""Find reactions by sabio entry id
177251

‎datanator_query_python/query/query_taxon_tree.py

+38-4
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class QueryTaxonTree(mongo_util.MongoUtil):
1212

1313
def __init__(self, cache_dirname=None, collection_str='taxon_tree',
1414
verbose=False, max_entries=float('inf'), username=None, MongoDB=None,
15-
password=None, db='datanator', authSource='admin', readPreference='nearest',
15+
password=None, db='datanator-test', authSource='admin', readPreference='nearest',
1616
replicaSet=None):
1717
self.collection_str = collection_str
1818
super().__init__(cache_dirname=cache_dirname, MongoDB=MongoDB,
@@ -415,6 +415,40 @@ def get_canon_common_ancestor_fast(self, org1, org2, org_format='tax_id'):
415415
Return:
416416
(:obj:`Obj`)
417417
'''
418-
anchor_org = self.collection.find_one({org_format: org1}, projection={'canon_anc_ids': 1, 'canon_anc_names': 1})
419-
pipeline = self.pipeline_manager.aggregate_common_canon_ancestors(anchor_org, org1, intersect_name="ancMatch")
420-
return self.collection.aggregate(pipeline)
418+
if org1 is None or org2 is None:
419+
return {'reason': 'Needs two organisms.'}
420+
collection = self.client["datanator-test"]["taxon_tree"]
421+
doc_1 = collection.find_one({org_format: org1}, projection={'canon_anc_ids': 1, 'canon_anc_names': 1})
422+
doc_2 = collection.find_one({org_format: org2}, projection={'canon_anc_ids': 1, 'canon_anc_names': 1})
423+
if doc_1 is None or doc_2 is None:
424+
return {str(org1): -1, str(org2): -1, 'reason': 'No organism found.'}
425+
426+
canon_anc_1 = doc_1["canon_anc_names"]
427+
canon_anc_2 = doc_2["canon_anc_names"]
428+
429+
if canon_anc_1 == canon_anc_2:
430+
return {str(org1): 0, str(org1)+'_canon_ancestors':canon_anc_1}
431+
432+
if canon_anc_1[-1] == org2:
433+
distance1 = 1
434+
distance2 = 0
435+
elif canon_anc_2[-1] == org1:
436+
distance1 = 0
437+
distance2 = 1
438+
else:
439+
distance1 = -1
440+
distance2 = -1
441+
ancestor = self.file_manager.get_common(canon_anc_1, canon_anc_2)
442+
if ancestor == '':
443+
return {str(org1): -1, str(org2): -1, 'reason': 'No common ancestor.'}
444+
445+
idx_org1 = canon_anc_1.index(ancestor)
446+
idx_org2 = canon_anc_2.index(ancestor)
447+
448+
if distance1 == -1:
449+
distance1 = len(canon_anc_1) - (idx_org1)
450+
if distance2 == -1:
451+
distance2 = len(canon_anc_2) - (idx_org2)
452+
453+
return {str(org1): distance1, str(org2): distance2, str(org1)+'_canon_ancestors':canon_anc_1,
454+
str(org2)+'_canon_ancestors':canon_anc_2}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
from datanator_query_python.config import query_schema_2_manager
2+
from pymongo import MongoClient
3+
from pymongo import TEXT
4+
5+
6+
class FTX(query_schema_2_manager.QM):
    """Full-text search queries built on the ``$search`` aggregation stage."""

    def __init__(self):
        super().__init__()

    def search_taxon(self,
                     msg,
                     skip=0,
                     limit=10,
                     token_order="any",
                     db="datanator-test"):
        """Search for taxon names.
        (https://docs.atlas.mongodb.com/reference/atlas-search)

        Runs a fuzzy ``autocomplete`` search on the ``tax_name`` field of
        the ``taxon_tree`` collection.

        Args:
            msg(:obj:`str`): query message.
            skip(:obj:`int`, optional): number of records to skip.
            limit(:obj:`int`, optional): max number of documents to return;
                values <= 0 mean no limit.
            token_order(:obj:`str`, optional): token order, i.e. sequential or any.
            db(:obj:`str`, optional): name of database in which the result resides.

        Return:
            (:obj:`list` of :obj:`dict`): matching documents, each holding
            only ``tax_name``.
        """
        collection = self.client[db]["taxon_tree"]
        pipeline = [
            {
                "$search": {
                    "autocomplete": {
                        "path": "tax_name",
                        "query": msg,
                        "fuzzy": {
                            "maxEdits": 2,
                            "prefixLength": 1,
                            "maxExpansions": 100
                        },
                        "tokenOrder": token_order
                    }
                }
            },
            # Skip first, then limit. With the stages the other way round
            # a non-zero skip eats into the page (and skip >= limit
            # returns nothing at all).
            {
                "$skip": skip
            },
        ]
        if limit > 0:
            # $limit only accepts positive integers.
            pipeline.append({"$limit": limit})
        pipeline.append({
            "$project": {
                "_id": 0,
                "tax_name": 1
            }
        })
        return list(collection.aggregate(pipeline))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
from datanator_query_python.config import query_schema_2_manager
2+
3+
4+
class QueryEn(query_schema_2_manager.QM):
    """Queries against the entity collection of a schema-2 database."""

    def __init__(self,
                 db="datanator-demo"):
        super().__init__()
        # Name of the database that query_entity reads from.
        self.db = db

    def query_entity(self,
                     identifier,
                     datatype="metabolite",
                     collection="entity",
                     limit=10,
                     skip=0,
                     projection={"_id": 0}):
        """Find entities of a given type that carry a given identifier.

        Args:
            identifier(:obj:`Obj`): identifier used for the entity; matched
                against the elements of the ``identifiers`` array.
            datatype(:obj:`Obj`, optional): value required in the ``type`` field.
            collection(:obj:`str`): name of the collection in which data resides.
            limit(:obj:`int`, optional): number of results to return.
            skip(:obj:`int`, optional): number of documents to skip.
            projection(:obj:`Obj`, optional): MongoDB projection.

        Return:
            (:obj:`list`): matching documents.
        """
        target = self.client[self.db][collection]
        criteria = {"$and": [{"identifiers": {"$elemMatch": identifier}},
                             {"type": datatype}]}
        cursor = target.find(filter=criteria,
                             limit=limit,
                             skip=skip,
                             projection=projection)
        return list(cursor)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
from datanator_query_python.config import query_schema_2_manager
2+
from pymongo import ASCENDING
3+
4+
5+
class QueryObs(query_schema_2_manager.QM):
    """Queries against the observation collection of a schema-2 database."""

    def __init__(self,
                 db="datanator-demo"):
        super().__init__()
        # Name of the database that get_entity_datatype reads from.
        self.db = db

    def get_entity_datatype(self,
                            identifier,
                            entity="protein",
                            datatype="half-life",
                            collection="observation",
                            limit=10,
                            skip=0,
                            projection={"_id": 0}):
        """Get observations of one datatype for an entity.

        Args:
            identifier(:obj:`Obj`): identifier used for the entity.
            entity(:obj:`Obj`, optional): entity type. i.e. "protein", "RNA", etc.
            datatype(:obj:`Obj`, optional): Datatype to be retrieved.
            collection(:obj:`str`, optional): name of collection in which values reside.
            limit(:obj:`int`, optional): number of results to return.
            skip(:obj:`int`, optional): number of documents to skip.
            projection(:obj:`Obj`, optional): MongoDB projection.

        Return:
            (:obj:`list`): matching documents.
        """
        target = self.client[self.db][collection]
        # Filter on values.type. Combinations not listed here (e.g. a
        # non-localization datatype for a non-protein entity) add no
        # values.type restriction at all.
        type_filter = {}
        if datatype == "localization":
            if entity == "protein":
                type_filter["values.type"] = {"$in": ["intramembrane_localization",
                                                      "secretome location"]}
            elif entity == "RNA":
                type_filter["values.type"] = "subcellular_localization"
        elif entity == "protein":
            type_filter["values.type"] = datatype
        criteria = {"$and": [{"identifier": identifier},
                             {"entity.type": entity},
                             type_filter]}
        cursor = target.find(filter=criteria, limit=limit, skip=skip,
                             collation=self.collation,
                             projection=projection,
                             hint=[("identifier", ASCENDING),
                                   ("entity.type", ASCENDING),
                                   ("values.type", ASCENDING)])
        return list(cursor)

‎setup.py

+4
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@
4646
],
4747
entry_points={
4848
'console_scripts': [
49+
'{} = {}.__main__:main'.format(name, name),
50+
'{} = {}.__main__:main'.format(name.replace('_', '-'), name),
51+
'{}{:d} = {}.__main__:main'.format(name, sys.version_info[0], name),
52+
'{}{:d} = {}.__main__:main'.format(name.replace('_', '-'), sys.version_info[0], name),
4953
],
5054
},
5155
)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import unittest
2+
from datanator_query_python.config import query_schema_2_manager
3+
from pymongo import ReadPreference
4+
5+
6+
class TestQ(unittest.TestCase):
    """Tests for query_schema_2_manager.QM; setUpClass builds a QM with its default arguments."""

    @classmethod
    def setUpClass(cls):
        # One shared manager for the whole class.
        cls.src = query_schema_2_manager.QM()

    @classmethod
    def tearDownClass(cls):
        cls.src.client.close()

    def test_conn_protein(self):
        """conn_protein returns an object whose read preference is NEAREST."""
        connection = self.src.conn_protein('datanator-test')
        self.assertEqual(connection.read_preference, ReadPreference.NEAREST)

‎tests/query/test_full_text_query.py

+9
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ def test_bool_query(self):
154154
result = self.src.bool_query(query_message, index, must_not=must_not)
155155
self.assertEqual(result['hits']['hits'][0]['_source']['ko_number'], 'K00001')
156156

157+
@unittest.skip("ko obsolete")
157158
def test_get_index_ko_count(self):
158159
agg_field = "frontend_gene_aggregate"
159160
query_message = 'alcohol dehydrogenase'
@@ -170,11 +171,19 @@ def test_get_rxn_oi(self):
170171
result = self.src.get_rxn_oi(query_message)
171172
self.assertEqual(result['sabio_rk_total'], {'value': 10000, 'relation': 'gte'})
172173

174+
<<<<<<< HEAD
173175
def test_get_genes_ko_count(self):
174176
agg_field = "frontend_gene_aggregate"
175177
query_message = 'K14236'
176178
result_0 = self.src.get_genes_ko_count(query_message, 15, agg_field=agg_field, size=10, fields=['*'])
177179
print(result_0)
180+
=======
181+
# def test_get_genes_ko_count(self):
182+
# agg_field = "ko_number"
183+
# query_message = 'K14236'
184+
# result_0 = self.src.get_genes_ko_count(query_message, 15, agg_field=agg_field, size=10, fields=['*'])
185+
# print(result_0)
186+
>>>>>>> testapi
178187

179188
def test_get_genes_orthodb_count(self):
180189
agg_field = "orthodb_id"

‎tests/query/test_query_kegg_orthology.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -61,4 +61,10 @@ def test_get_meta_by_kegg_ids(self):
6161
def test_get_meta_by_kegg_id(self):
6262
kegg_id = 'k00001'
6363
doc = self.src.get_meta_by_kegg_id(kegg_id)
64-
self.assertEqual(doc['gene_name'], ['E1.1.1.1', 'adh'])
64+
self.assertEqual(doc['gene_name'], ['E1.1.1.1', 'adh'])
65+
66+
def test_get_meta_by_ortho_ids(self):
    """Smoke test: print every document returned for two OrthoDB ids."""
    orthodb_ids = ['643917at2', '567019at2']
    cursor, _ = self.src.get_meta_by_ortho_ids(orthodb_ids)
    for record in cursor:
        print(record)

‎tests/query/test_query_protein.py

+13-4
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,16 @@ def test_get_all_kegg(self):
280280
result_0 = self.src_1.get_all_kegg('K00850','Escherichia coli', 10)
281281
print(result_0)
282282

283-
# @unittest.skip("skipping")
284-
# def test_get_all_ortho(self):
285-
# result_0 = self.src_2.get_all_ortho('494933at2759','Escherichia coli', 10)
286-
# print(result_0)
283+
@unittest.skip("skipping")
def test_get_all_ortho(self):
    """Smoke test (skipped): print the distance buckets for one OrthoDB group."""
    buckets = self.src_2.get_all_ortho('494933at2759', 'Escherichia coli', 10)
    print(buckets)
287+
288+
@unittest.skip("passed")
def test_get_info_by_orthodb(self):
    """Smoke test (skipped): print the summary of one OrthoDB group."""
    summary = self.src_2.get_info_by_orthodb("643917at2")
    print(summary)
292+
293+
def test_get_ortho_by_id(self):
    """Smoke test: print what get_ortho_by_id returns for uniprot id P53984."""
    found = self.src_2.get_ortho_by_id("P53984")
    print(found)

‎tests/query/test_query_sabiork_old.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -149,4 +149,12 @@ def test_get_rxn_with_prm(self):
149149
def test_get_reaction_by_subunit(self):
150150
_ids = ['P20932', 'P00803']
151151
result = self.src.get_reaction_by_subunit(_ids)
152-
self.assertTrue(result[-1]['kinlaw_id'] in [31611, 31609])
152+
self.assertTrue(result[-1]['kinlaw_id'] in [31611, 31609])
153+
154+
def test_get_kinlaw_by_rxn_ortho(self):
    """Smoke test: print the kinlaws found for two substrates and two products."""
    substrates = ['XJLXINKUBYWONI-NNYOXOHSSA-N',
                  'ODBLHEXUDAPZAU-UHFFFAOYSA-N']
    products = ['GPRLSGONYQIRFK-UHFFFAOYSA-N',
                'KPGXRSRHYNQIFN-UHFFFAOYSA-N']
    # The method returns (count, documents); only the documents are printed.
    _, kinlaws = self.src.get_kinlaw_by_rxn_ortho(substrates, products)
    print(kinlaws)

‎tests/query/test_query_taxon_tree.py

+20-4
Original file line numberDiff line numberDiff line change
@@ -138,10 +138,26 @@ def test_get_canon_common_ancestor(self):
138138
# print(result_0)
139139

140140
def test_get_canon_common_ancestor_fast(self):
141-
org_0 = 2160
142-
org_1 = 2161
143-
for doc in self.src.get_canon_common_ancestor_fast(org_0, org_1):
144-
print(doc)
141+
self.assertEqual(self.src.get_canon_common_ancestor_fast("Escherichia coli E1002", "Homo 1002", org_format="tax_name")["reason"], "No organism found.")
142+
org_1 = 743725
143+
org_2 = 2107591
144+
result = self.src.get_canon_common_ancestor_fast(org_1, org_2)
145+
self.assertEqual(result, {'2107591': 4,
146+
'2107591_canon_ancestors': ['cellular organisms', 'Archaea',
147+
'Candidatus Diapherotrites',
148+
'Candidatus Forterrea',
149+
'Candidatus Forterrea multitransposorum'],
150+
'743725': 1,
151+
'743725_canon_ancestors': ['cellular organisms', 'Archaea']})
152+
self.assertEqual(self.src.get_canon_common_ancestor_fast("Escherichia coli E1002", "Escherichia coli", org_format="tax_name")["Escherichia coli"], 0)
153+
result = self.src.get_canon_common_ancestor_fast('Escherichia coli', 'Escherichia coli', org_format='tax_name')
154+
self.assertEqual(result['Escherichia coli'], 0)
155+
org_3 = 9606
156+
org_4 = 4932
157+
result = self.src.get_canon_common_ancestor_fast(org_3, org_4)
158+
self.assertEqual(result, {'9606': 7, '4932': 7, '9606_canon_ancestors': ['cellular organisms', 'Eukaryota', 'Metazoa', 'Chordata', 'Mammalia', 'Primates', 'Hominidae', 'Homo'], '4932_canon_ancestors': ['cellular organisms', 'Eukaryota', 'Fungi', 'Ascomycota', 'Saccharomycetes', 'Saccharomycetales', 'Saccharomycetaceae', 'Saccharomyces']})
159+
160+
145161

146162
class TestQueryTaxonTreeMock(unittest.TestCase):
147163

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from datanator_query_python.query_schema_2 import ftx_search
2+
import unittest
3+
4+
5+
class TestQEn(unittest.TestCase):
    """Smoke test for taxon searching through ``ftx_search.FTX``."""

    @classmethod
    def setUpClass(cls):
        """Create the ``FTX`` instance shared by all tests in this class."""
        cls.src = ftx_search.FTX()

    @classmethod
    def tearDownClass(cls):
        """Close the client held by the shared ``FTX`` instance."""
        cls.src.client.close()

    def test_search_taxon(self):
        """Search taxa for "off" with ``token_order="any"`` and print the result."""
        search_options = {"token_order": "any"}
        print(self.src.search_taxon("off", **search_options))
+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from datanator_query_python.query_schema_2 import query_entity
2+
import unittest
3+
4+
5+
class TestQEn(unittest.TestCase):
    """Tests for ``QueryEn.query_entity`` using InChIKey identifiers."""

    @classmethod
    def setUpClass(cls):
        """Create the ``QueryEn`` instance shared by all tests in this class."""
        cls.src = query_entity.QueryEn()

    @classmethod
    def tearDownClass(cls):
        """Close the client held by the shared ``QueryEn`` instance."""
        cls.src.client.close()

    def test_query_entity(self):
        """A known InChIKey yields one named entity; an unknown one yields ``[]``."""
        known_key = {"namespace": "inchikey",
                     "value": "TYEYBOSBBBHJIV-UHFFFAOYSA-N"}
        hits = self.src.query_entity(known_key)
        self.assertEqual(len(hits), 1)
        self.assertEqual(hits[0]["name"], "2-Ketobutyric acid")

        unknown_key = {"namespace": "inchikey", "value": "234wgadgas"}
        misses = self.src.query_entity(unknown_key)
        self.assertEqual(misses, [])
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
from datanator_query_python.query_schema_2 import query_observation
2+
import unittest
3+
4+
5+
class TestQOb(unittest.TestCase):
    """Tests for ``QueryObs.get_entity_datatype`` with several identifier namespaces."""

    @classmethod
    def setUpClass(cls):
        """Create the ``QueryObs`` instance shared by all tests in this class."""
        cls.src = query_observation.QueryObs()

    @classmethod
    def tearDownClass(cls):
        """Close the client held by the shared ``QueryObs`` instance."""
        cls.src.client.close()

    def test_get_entity_datatype(self):
        """Look up observations by gene symbol, gene name and gene id."""
        # Known gene symbol, default entity/datatype arguments.
        by_symbol = self.src.get_entity_datatype(
            {"namespace": "gene_symbol", "value": "BAG1"})
        self.assertEqual(by_symbol[0]["entity"]["name"],
                         'BCL2-associated athanogene.')

        # Unknown gene symbol: the query is expected to return an empty list.
        no_match = self.src.get_entity_datatype(
            {"namespace": "gene_symbol", "value": "something"})
        self.assertEqual(no_match, [])

        # Gene name restricted to the "localization" datatype.
        by_name = self.src.get_entity_datatype(
            {"namespace": "gene_name", "value": "EMC3"},
            datatype="localization")
        self.assertEqual(by_name[0]["entity"]["name"],
                         'ER membrane protein complex subunit 3')

        # Gene id for an RNA entity, again restricted to "localization".
        by_gene_id = self.src.get_entity_datatype(
            {"namespace": "gene_id", "value": "100003563"},
            entity="RNA",
            datatype="localization")
        self.assertEqual(by_gene_id[0]["genotype"]["taxon"]["name"],
                         'Danio rerio')
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import os
2+
os.environ["WHERE"] = "API_TEST"
3+
import unittest
4+
from datanator_query_python.config import config as query_config
5+
from datanator_query_python.query_schema_2 import query_taxon_tree_v2
6+
import asyncio
7+
from pprint import pprint
8+
9+
10+
11+
class TestQTaxon(unittest.TestCase):
    """Tests for the coroutine-based ``QTaxon`` taxonomy queries."""

    @classmethod
    def setUpClass(cls):
        """Print the ``WHERE`` environment variable and build the shared ``QTaxon``."""
        print(os.getenv("WHERE"))
        cls.src = query_taxon_tree_v2.QTaxon()

    @classmethod
    def tearDownClass(cls):
        """Blank out the ``WHERE`` environment variable once the class is done."""
        os.environ["WHERE"] = ""

    @staticmethod
    def _run(coro):
        """Drive ``coro`` to completion on the current asyncio event loop."""
        return asyncio.get_event_loop().run_until_complete(coro)

    def test_get_canon_ancestor(self):
        """Canonical ancestors for a tax id, and an empty result for a bogus name."""
        ancestor_ids, _ = self._run(
            self.src.get_canon_ancestor(1915648, _format='tax_id'))
        self.assertTrue(2283796 not in ancestor_ids)
        self.assertTrue(2157 in ancestor_ids)

        ancestor_ids, _ = self._run(
            self.src.get_canon_ancestor("nonsense", _format='tax_name'))
        self.assertEqual([], ancestor_ids)

    # @unittest.skip("for now")
    def test_aggregate_distance(self):
        """Taxon distances between measured organisms and several targets."""
        salmonella = [{"canon_anc_ids": [131567, 2, 1224, 1236, 91347, 543, 590, 28901],
                       "tax_name": "Salmonella enterica subsp. enterica serovar Newport str. CFSAN000907"}]  # tax_id 1299189

        # Target 0: the measured documents are expected back unchanged.
        unchanged = self._run(
            self.src.aggregate_distance(salmonella, 0, name_field='tax_name'))
        self.assertEqual(unchanged, salmonella)

        with_distance = self._run(
            self.src.aggregate_distance(salmonella, 1227178, name_field='tax_name'))
        self.assertEqual(
            with_distance[0]['taxon_distance']['Salmonella enterica subsp. enterica serovar Newport str. CFSAN001557'],
            0)

        # target is measured's ancestor
        yeast_strain = [{"canon_anc_ids": [131567, 2759, 4751, 4890, 4891, 4892, 4893, 4930, 4932],
                         "tax_name": "Saccharomyces cerevisiae CAT-1"}]
        yeast_species = 4932  # Saccharomyces cerevisiae
        ancestor_case = self._run(
            self.src.aggregate_distance(yeast_strain, yeast_species, name_field='tax_name'))
        distances = ancestor_case[0]['taxon_distance']
        self.assertEqual(distances['Saccharomyces cerevisiae'], 0)
        self.assertEqual(distances['Saccharomyces cerevisiae CAT-1'], 1)

‎tests/query_schema_2/test_query_uniprot_v2.py

Whitespace-only changes.

‎tests/test_main.py

+17
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,23 @@ def test_version(self):
4646
datanator_query_python.__version__)
4747
self.assertEqual(captured.stderr.get_text(), '')
4848

49+
def test_define_schema(self):
    """Run the ``mongo-def-schema`` CLI command and check parsing and output.

    NOTE(review): a second ``test_define_schema`` is defined directly after
    this method. If both live in the same class, the later definition
    replaces this one and this copy never runs -- one of them should be
    removed or renamed.
    """
    with capturer.CaptureOutput(merged=False, relay=False) as captured:
        with __main__.App(argv=['mongo-def-schema',
                                'test',
                                'cli_test',
                                '../datanator_pattern_design/compiled/taxon_compiled.json']) as app:
            # run app
            app.run()

            # test that the arguments to the CLI were correctly parsed
            self.assertEqual(app.pargs.db, 'test')
            # assertEqual rather than assertTrue: assertTrue(value, msg) only
            # checks truthiness and would treat 'cli_test' as the failure
            # message, so any non-empty collection name would have passed.
            self.assertEqual(app.pargs.collection, 'cli_test')

            # test that the CLI produced the correct output
            self.assertEqual(captured.stdout.get_text(), 'done')
            self.assertEqual(captured.stderr.get_text(), '')
65+
4966
def test_define_schema(self):
5067
with capturer.CaptureOutput(merged=False, relay=False) as captured:
5168
with __main__.App(argv=['mongo-def-schema',

‎tests/util/test_motor_util.py

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
from datanator_query_python.util import motor_util
2+
from datanator_query_python.config import config
3+
import unittest
4+
5+
6+
class TestMUtil(unittest.TestCase):
    """Connectivity test for ``motor_util.MotorUtil`` against the test database."""

    @classmethod
    def setUpClass(cls):
        """Build one ``MotorUtil`` from the Atlas/test config and name the scratch collection."""
        cls.src = motor_util.MotorUtil(MongoDB=config.AtlasConfig.SERVER,
                                       username=config.DatanatorTest.USERNAME,
                                       password=config.DatanatorTest.PASSWORD,
                                       authSource=config.AtlasConfig.AUTHDB,
                                       replicaSet=config.AtlasConfig.REPLSET,
                                       readPreference=config.AtlasConfig.READ_PREFERENCE)
        cls.test_collection = 'test_motor'
        cls.test_database = "test"

    @classmethod
    def tearDownClass(cls):
        """Drop the scratch collection, then close the client."""
        # The drop must be issued while the client is still open; the
        # original closed the client first and then used it.
        # NOTE(review): if ``client`` is an asyncio Motor client, this call
        # (and ``insert_one`` in ``test_client``) presumably returns an
        # awaitable that has to be run on an event loop to take effect --
        # confirm against ``MotorUtil``.
        cls.src.client.get_database(cls.test_database).drop_collection(cls.test_collection)
        cls.src.client.close()

    def test_client(self):
        """Insert a single document into the scratch collection."""
        self.src.client.get_database(self.test_database)[self.test_collection].insert_one({"test": 1})

0 commit comments

Comments
 (0)
Please sign in to comment.