Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion graphgen/configs/search_config.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
pipeline:
- name: read
params:
input_file: resources/input_examples/search_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
input_file: resources/input_examples/search_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples

- name: search
params:
Expand Down
23 changes: 13 additions & 10 deletions graphgen/models/searcher/db/uniprot_searcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,16 @@ def _get_pool():
return ThreadPoolExecutor(max_workers=10)


# ensure only one BLAST search runs at a time
_blast_lock = asyncio.Lock()


class UniProtSearch(BaseSearcher):
"""
UniProt Search client for searching UniProt.
1) Get the protein by accession number.
2) Search with keywords or protein names (fuzzy searcher).
3) Search with FASTA sequence (BLAST searcher).
3) Search with FASTA sequence (BLAST searcher). Note that NCBIWWW does not support async.
"""

def __init__(self, use_local_blast: bool = False, local_blast_db: str = "sp_db"):
Expand Down Expand Up @@ -230,22 +234,21 @@ async def search(
if query.startswith(">") or re.fullmatch(
r"[ACDEFGHIKLMNPQRSTVWY\s]+", query, re.I
):
coro = loop.run_in_executor(
_get_pool(), self.get_by_fasta, query, threshold
)
async with _blast_lock:
result = await loop.run_in_executor(
_get_pool(), self.get_by_fasta, query, threshold
)

# check if accession number
elif re.fullmatch(r"[A-NR-Z0-9]{6,10}", query, re.I):
coro = loop.run_in_executor(_get_pool(), self.get_by_accession, query)
result = await loop.run_in_executor(
_get_pool(), self.get_by_accession, query
)

else:
# otherwise treat as keyword
coro = loop.run_in_executor(_get_pool(), self.get_best_hit, query)
result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query)

result = await coro
if result:
result["_search_query"] = query
return result


# TODO: use local UniProt database for large-scale searches
3 changes: 3 additions & 0 deletions scripts/search/search_uniprot.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
python3 -m graphgen.run \
--config_file graphgen/configs/search_config.yaml \
--output_dir cache/