From 0da7ce1036fa8c277563b2c9407ecbf92e5ccafa Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Wed, 26 Nov 2025 20:21:03 +0800 Subject: [PATCH 01/22] feat: Add DNA and RNA search functionality - Add NCBISearch searcher for DNA/GenBank/Entrez database search - Add RNACentralSearch searcher for RNA database search - Update search_all.py to support ncbi and rnacentral data sources - Add search configs for DNA, RNA, and protein (renamed from search_config) - Add search scripts for DNA and RNA - Add demo input files for DNA and RNA search - Update search_uniprot.sh to use search_protein_config.yaml This PR extends the search functionality to support biological data types: - Protein search (existing, now explicitly named) - DNA search via NCBI - RNA search via RNAcentral --- graphgen/configs/search_dna_config.yaml | 15 + ...config.yaml => search_protein_config.yaml} | 2 +- graphgen/configs/search_rna_config.yaml | 14 + graphgen/models/__init__.py | 2 + graphgen/models/searcher/db/ncbi_searcher.py | 296 ++++++++++++++++++ .../models/searcher/db/rnacentral_searcher.py | 191 +++++++++++ graphgen/operators/search/search_all.py | 41 ++- .../input_examples/search_dna_demo.jsonl | 4 + ...h_demo.jsonl => search_protein_demo.jsonl} | 0 scripts/search/search_dna.sh | 4 + scripts/search/search_rna.sh | 4 + scripts/search/search_uniprot.sh | 2 +- 12 files changed, 568 insertions(+), 7 deletions(-) create mode 100644 graphgen/configs/search_dna_config.yaml rename graphgen/configs/{search_config.yaml => search_protein_config.yaml} (71%) create mode 100644 graphgen/configs/search_rna_config.yaml create mode 100644 graphgen/models/searcher/db/ncbi_searcher.py create mode 100644 graphgen/models/searcher/db/rnacentral_searcher.py create mode 100644 resources/input_examples/search_dna_demo.jsonl rename resources/input_examples/{search_demo.jsonl => search_protein_demo.jsonl} (100%) create mode 100644 scripts/search/search_dna.sh create mode 100644 scripts/search/search_rna.sh diff --git a/graphgen/configs/search_dna_config.yaml b/graphgen/configs/search_dna_config.yaml new file mode 100644 index 00000000..95f8fc39 --- /dev/null +++ b/graphgen/configs/search_dna_config.yaml @@ -0,0 +1,15 @@ +pipeline: + - name: read_step + op_key: read + params: + input_file: resources/input_examples/search_dna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples + + - name: search_step + op_key: search + deps: [read_step] # search_step depends on read_step + params: + data_sources: [ncbi] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral + ncbi_params: + email: test@example.com # NCBI requires an email address + tool: GraphGen # tool name for NCBI API + diff --git a/graphgen/configs/search_config.yaml b/graphgen/configs/search_protein_config.yaml similarity index 71% rename from graphgen/configs/search_config.yaml rename to graphgen/configs/search_protein_config.yaml index 63ebd241..bb46d34c 100644 --- a/graphgen/configs/search_config.yaml +++ b/graphgen/configs/search_protein_config.yaml @@ -2,7 +2,7 @@ pipeline: - name: read_step op_key: read params: - input_file: resources/input_examples/search_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples + input_file: resources/input_examples/search_protein_demo.jsonl # input file path, support json, jsonl, txt, pdf. 
See resources/input_examples for examples - name: search_step op_key: search diff --git a/graphgen/configs/search_rna_config.yaml b/graphgen/configs/search_rna_config.yaml new file mode 100644 index 00000000..3d051417 --- /dev/null +++ b/graphgen/configs/search_rna_config.yaml @@ -0,0 +1,14 @@ +pipeline: + - name: read_step + op_key: read + params: + input_file: resources/input_examples/search_rna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples + + - name: search_step + op_key: search + deps: [read_step] # search_step depends on read_step + params: + data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral + rnacentral_params: + {} # RNAcentral doesn't require additional parameters currently + diff --git a/graphgen/models/__init__.py b/graphgen/models/__init__.py index 68fd2a5d..bb73548d 100644 --- a/graphgen/models/__init__.py +++ b/graphgen/models/__init__.py @@ -26,6 +26,8 @@ RDFReader, TXTReader, ) +from .searcher.db.ncbi_searcher import NCBISearch +from .searcher.db.rnacentral_searcher import RNACentralSearch from .searcher.db.uniprot_searcher import UniProtSearch from .searcher.kg.wiki_search import WikiSearch from .searcher.web.bing_search import BingSearch diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py new file mode 100644 index 00000000..73a9ad87 --- /dev/null +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -0,0 +1,296 @@ +import asyncio +import re +import time +from concurrent.futures import ThreadPoolExecutor +from functools import lru_cache +from http.client import IncompleteRead +from typing import Dict, Optional + +from Bio import Entrez +from requests.exceptions import RequestException +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from graphgen.bases import BaseSearcher +from graphgen.utils import logger + + +@lru_cache(maxsize=None) +def _get_pool(): + return ThreadPoolExecutor(max_workers=10) + + +class NCBISearch(BaseSearcher): + """ + NCBI Search client to search DNA/GenBank/Entrez databases. + 1) Get the gene/DNA by accession number or gene ID. + 2) Search with keywords or gene names (fuzzy search). + 3) Search with FASTA sequence (BLAST search for DNA sequences). + + API Documentation: https://www.ncbi.nlm.nih.gov/home/develop/api/ + Note: NCBI has rate limits (max 3 requests per second), delays are required between requests. + """ + + def __init__(self, email: str = "test@example.com", tool: str = "GraphGen"): + super().__init__() + Entrez.email = email + Entrez.tool = tool + Entrez.timeout = 60 # 60 seconds timeout + + def get_by_gene_id(self, gene_id: str) -> Optional[dict]: + """ + Get gene information by Gene ID. + :param gene_id: NCBI Gene ID. + :return: A dictionary containing gene information or None if not found. 
+ """ + try: + time.sleep(0.35) # Comply with rate limit (max 3 requests per second) + handle = Entrez.efetch(db="gene", id=gene_id, retmode="xml") + try: + gene_record = Entrez.read(handle) + if not gene_record: + return None + + gene_data = gene_record[0] + gene_ref = gene_data.get("Entrezgene_gene", {}).get("Gene-ref", {}) + + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": gene_id, + "gene_name": gene_ref.get("Gene-ref_locus", "N/A"), + "gene_description": gene_ref.get("Gene-ref_desc", "N/A"), + "organism": gene_data.get("Entrezgene_source", {}).get("BioSource", {}).get("BioSource_org", {}).get("Org-ref", {}).get("Org-ref_taxname", "N/A"), + "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}", + } + finally: + handle.close() + except RequestException: + raise + except Exception as exc: # pylint: disable=broad-except + logger.error("Gene ID %s not found: %s", gene_id, exc) + return None + + def get_by_accession(self, accession: str) -> Optional[dict]: + """ + Get sequence information by accession number. + :param accession: NCBI accession number (e.g., NM_000546). + :return: A dictionary containing sequence information or None if not found. + """ + try: + time.sleep(0.35) # 遵守速率限制 + handle = Entrez.efetch( + db="nuccore", + id=accession, + rettype="fasta", + retmode="text", + ) + try: + sequence_data = handle.read() + if not sequence_data: + return None + + seq_lines = sequence_data.strip().split("\n") + header = seq_lines[0] if seq_lines else "" + sequence = "".join(seq_lines[1:]) + + # Try to get more information + time.sleep(0.35) + summary_handle = Entrez.esummary(db="nuccore", id=accession) + try: + summary = Entrez.read(summary_handle) + if summary: + summary_data = summary[0] + title = summary_data.get("Title", header) + organism = summary_data.get("Organism", "N/A") + else: + title = header + organism = "N/A" + finally: + summary_handle.close() + + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": accession, + "title": title, + "organism": organism, + "sequence": sequence, + "sequence_length": len(sequence), + "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}", + } + finally: + handle.close() + except RequestException: + raise + except Exception as exc: # pylint: disable=broad-except + logger.error("Accession %s not found: %s", accession, exc) + return None + + def search_by_keyword(self, keyword: str) -> Optional[dict]: + """ + Search NCBI Gene database with a keyword and return the best hit. + :param keyword: The search keyword (e.g., gene name). + :return: A dictionary containing the best hit information or None if not found. 
+ """ + if not keyword.strip(): + return None + + try: + time.sleep(0.35) # 遵守速率限制 + # Search gene database + search_handle = Entrez.esearch( + db="gene", + term=f"{keyword}[Gene Name] OR {keyword}[All Fields]", + retmax=1, + ) + try: + search_results = Entrez.read(search_handle) + if not search_results.get("IdList"): + # If not found, try a broader search + time.sleep(0.35) + search_handle2 = Entrez.esearch( + db="gene", + term=keyword, + retmax=1, + ) + try: + search_results = Entrez.read(search_handle2) + finally: + search_handle2.close() + + if search_results.get("IdList"): + gene_id = search_results["IdList"][0] + return self.get_by_gene_id(gene_id) + finally: + search_handle.close() + except RequestException: + raise + except Exception as e: # pylint: disable=broad-except + logger.error("Keyword %s not found: %s", keyword, e) + return None + + def search_by_sequence(self, sequence: str) -> Optional[dict]: + """ + Search NCBI with a DNA sequence using BLAST. + Note: This is a simplified version. For production, consider using local BLAST. + :param sequence: DNA sequence (FASTA format or raw sequence). + :return: A dictionary containing the best hit information or None if not found. + """ + try: + # Extract sequence (if in FASTA format) + if sequence.startswith(">"): + seq_lines = sequence.strip().split("\n") + seq = "".join(seq_lines[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + + # Validate if it's a DNA sequence + if not re.fullmatch(r"[ATCGN\s]+", seq, re.I): + logger.error("Invalid DNA sequence provided.") + return None + + if not seq: + logger.error("Empty DNA sequence provided.") + return None + + # Use BLAST search (Note: requires network connection, may be slow) + logger.debug("Performing BLAST search for DNA sequence...") + time.sleep(0.35) + from Bio.Blast import NCBIWWW, NCBIXML + + result_handle = NCBIWWW.qblast( + program="blastn", + database="nr", + sequence=seq, + hitlist_size=1, + expect=0.001, + ) + blast_record = NCBIXML.read(result_handle) + + if not blast_record.alignments: + logger.info("No BLAST hits found for the given sequence.") + return None + + best_alignment = blast_record.alignments[0] + best_hsp = best_alignment.hsps[0] + hit_id = best_alignment.hit_id + + # Extract accession number + # Format may be: gi|123456|ref|NM_000546.5| + accession_match = re.search(r"ref\|([^|]+)", hit_id) + if accession_match: + accession = accession_match.group(1).split(".")[0] + return self.get_by_accession(accession) + else: + # If unable to extract accession, return basic information + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": hit_id, + "title": best_alignment.title, + "sequence_length": len(seq), + "e_value": best_hsp.expect, + "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0, + "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}", + } + except RequestException: + raise + except Exception as e: # pylint: disable=broad-except + logger.error("BLAST search failed: %s", e) + return None + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) + async def search( + self, query: str, **kwargs + ) -> Optional[Dict]: + """ + Search NCBI with either a gene ID, accession number, keyword, or DNA sequence. + :param query: The search query (gene ID, accession, keyword, or DNA sequence). + :param kwargs: Additional keyword arguments (not used currently). 
+ :return: A dictionary containing the search results or None if not found. + """ + # auto detect query type + if not query or not isinstance(query, str): + logger.error("Empty or non-string input.") + return None + query = query.strip() + + logger.debug("NCBI search query: %s", query) + + loop = asyncio.get_running_loop() + + # check if DNA sequence (ATCG characters) + if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I): + result = await loop.run_in_executor( + _get_pool(), self.search_by_sequence, query + ) + # check if gene ID (numeric) + elif re.fullmatch(r"^\d+$", query): + result = await loop.run_in_executor( + _get_pool(), self.get_by_gene_id, query + ) + # check if accession number (e.g., NM_000546, NC_000001) + elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I): + result = await loop.run_in_executor( + _get_pool(), self.get_by_accession, query + ) + else: + # otherwise treat as keyword + result = await loop.run_in_executor( + _get_pool(), self.search_by_keyword, query + ) + + if result: + result["_search_query"] = query + return result + diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py new file mode 100644 index 00000000..d1decd6d --- /dev/null +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -0,0 +1,191 @@ +import asyncio +import re +from concurrent.futures import ThreadPoolExecutor +from functools import lru_cache +from typing import Dict, Optional + +import aiohttp +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from graphgen.bases import BaseSearcher +from graphgen.utils import logger + + +@lru_cache(maxsize=None) +def _get_pool(): + return ThreadPoolExecutor(max_workers=10) + + +class RNACentralSearch(BaseSearcher): + """ + RNAcentral Search client to search RNA databases. + 1) Get RNA by RNAcentral ID. + 2) Search with keywords or RNA names (fuzzy search). + 3) Search with RNA sequence. + + API Documentation: https://rnacentral.org/api/v1 + """ + + def __init__(self): + super().__init__() + self.base_url = "https://rnacentral.org/api/v1" + self.headers = {"Accept": "application/json"} + + async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: + """ + Get RNA information by RNAcentral ID. + :param rna_id: RNAcentral ID (e.g., URS0000000001). + :return: A dictionary containing RNA information or None if not found. + """ + try: + async with aiohttp.ClientSession() as session: + url = f"{self.base_url}/rna/{rna_id}" + async with session.get( + url, headers=self.headers, timeout=aiohttp.ClientTimeout(total=30) + ) as resp: + if resp.status == 200: + rna_data = await resp.json() + return { + "molecule_type": "RNA", + "database": "RNAcentral", + "id": rna_id, + "rnacentral_id": rna_data.get("rnacentral_id", "N/A"), + "sequence": rna_data.get("sequence", ""), + "sequence_length": len(rna_data.get("sequence", "")), + "rna_type": rna_data.get("rna_type", "N/A"), + "description": rna_data.get("description", "N/A"), + "url": f"https://rnacentral.org/rna/{rna_id}", + } + elif resp.status == 404: + logger.error("RNA ID %s not found", rna_id) + return None + else: + raise Exception(f"HTTP {resp.status}: {await resp.text()}") + except Exception as exc: # pylint: disable=broad-except + logger.error("RNA ID %s not found: %s", rna_id, exc) + return None + + async def search_by_keyword(self, keyword: str) -> Optional[dict]: + """ + Search RNAcentral with a keyword and return the best hit. 
+ :param keyword: The search keyword (e.g., miRNA name, RNA name). + :return: A dictionary containing the best hit information or None if not found. + """ + if not keyword.strip(): + return None + + try: + async with aiohttp.ClientSession() as session: + search_url = f"{self.base_url}/rna" + params = {"search": keyword, "format": "json"} + async with session.get( + search_url, + params=params, + headers=self.headers, + timeout=aiohttp.ClientTimeout(total=30), + ) as resp: + if resp.status == 200: + search_results = await resp.json() + if search_results.get("results"): + rna_id = search_results["results"][0].get("rnacentral_id") + if rna_id: + return await self.get_by_rna_id(rna_id) + logger.info("No results found for keyword: %s", keyword) + return None + else: + raise Exception(f"HTTP {resp.status}: {await resp.text()}") + except Exception as e: # pylint: disable=broad-except + logger.error("Keyword %s not found: %s", keyword, e) + return None + + async def search_by_sequence(self, sequence: str) -> Optional[dict]: + """ + Search RNAcentral with an RNA sequence. + :param sequence: RNA sequence (FASTA format or raw sequence). + :return: A dictionary containing the best hit information or None if not found. + """ + try: + # Extract sequence (if in FASTA format) + if sequence.startswith(">"): + seq_lines = sequence.strip().split("\n") + seq = "".join(seq_lines[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + + # Validate if it's an RNA sequence (contains U instead of T) + if not re.fullmatch(r"[AUCGN\s]+", seq, re.I): + logger.error("Invalid RNA sequence provided.") + return None + + if not seq: + logger.error("Empty RNA sequence provided.") + return None + + # RNAcentral API supports sequence search + async with aiohttp.ClientSession() as session: + search_url = f"{self.base_url}/rna" + params = {"sequence": seq, "format": "json"} + async with session.get( + search_url, + params=params, + headers=self.headers, + timeout=aiohttp.ClientTimeout(total=60), # Sequence search may take longer + ) as resp: + if resp.status == 200: + search_results = await resp.json() + if search_results.get("results"): + rna_id = search_results["results"][0].get("rnacentral_id") + if rna_id: + return await self.get_by_rna_id(rna_id) + logger.info("No results found for sequence.") + return None + else: + raise Exception(f"HTTP {resp.status}: {await resp.text()}") + except Exception as e: # pylint: disable=broad-except + logger.error("Sequence search failed: %s", e) + return None + + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=10), + retry=retry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError)), + reraise=True, + ) + async def search( + self, query: str, **kwargs + ) -> Optional[Dict]: + """ + Search RNAcentral with either an RNAcentral ID, keyword, or RNA sequence. + :param query: The search query (RNAcentral ID, keyword, or RNA sequence). + :param kwargs: Additional keyword arguments (not used currently). + :return: A dictionary containing the search results or None if not found. 
+ """ + # auto detect query type + if not query or not isinstance(query, str): + logger.error("Empty or non-string input.") + return None + query = query.strip() + + logger.debug("RNAcentral search query: %s", query) + + # check if RNA sequence (AUCG characters, contains U) + if query.startswith(">") or ( + re.fullmatch(r"[AUCGN\s]+", query, re.I) and "U" in query.upper() + ): + result = await self.search_by_sequence(query) + # check if RNAcentral ID (typically starts with URS) + elif re.fullmatch(r"URS\d+", query, re.I): + result = await self.get_by_rna_id(query) + else: + # otherwise treat as keyword + result = await self.search_by_keyword(query) + + if result: + result["_search_query"] = query + return result + diff --git a/graphgen/operators/search/search_all.py b/graphgen/operators/search/search_all.py index 6c543dbf..6017cfee 100644 --- a/graphgen/operators/search/search_all.py +++ b/graphgen/operators/search/search_all.py @@ -27,6 +27,10 @@ async def search_all( data_sources = search_config.get("data_sources", []) for data_source in data_sources: + data = list(seed_data.values()) + data = [d["content"] for d in data if "content" in d] + data = list(set(data)) # Remove duplicates + if data_source == "uniprot": from graphgen.models import UniProtSearch @@ -34,19 +38,46 @@ async def search_all( **search_config.get("uniprot_params", {}) ) - data = list(seed_data.values()) - data = [d["content"] for d in data if "content" in d] - data = list(set(data)) # Remove duplicates uniprot_results = await run_concurrent( uniprot_search_client.search, data, desc="Searching UniProt database", unit="keyword", ) + results[data_source] = uniprot_results + + elif data_source == "ncbi": + from graphgen.models import NCBISearch + + ncbi_search_client = NCBISearch( + **search_config.get("ncbi_params", {}) + ) + + ncbi_results = await run_concurrent( + ncbi_search_client.search, + data, + desc="Searching NCBI database", + unit="keyword", + ) + results[data_source] = ncbi_results + + elif data_source == "rnacentral": + from graphgen.models import RNACentralSearch + + rnacentral_search_client = RNACentralSearch( + **search_config.get("rnacentral_params", {}) + ) + + rnacentral_results = await run_concurrent( + rnacentral_search_client.search, + data, + desc="Searching RNAcentral database", + unit="keyword", + ) + results[data_source] = rnacentral_results + else: logger.error("Data source %s not supported.", data_source) continue - results[data_source] = uniprot_results - return results diff --git a/resources/input_examples/search_dna_demo.jsonl b/resources/input_examples/search_dna_demo.jsonl new file mode 100644 index 00000000..83086df9 --- /dev/null +++ b/resources/input_examples/search_dna_demo.jsonl @@ -0,0 +1,4 @@ +{"type": "text", "content": "TP53"} +{"type": "text", "content": "BRCA1"} +{"type": "text", "content": "NM_000546"} +{"type": "text", "content": 
"CTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"} diff --git a/resources/input_examples/search_demo.jsonl b/resources/input_examples/search_protein_demo.jsonl similarity index 100% rename from resources/input_examples/search_demo.jsonl rename to resources/input_examples/search_protein_demo.jsonl diff --git a/scripts/search/search_dna.sh b/scripts/search/search_dna.sh new file mode 100644 index 00000000..5b82fdd6 --- /dev/null +++ b/scripts/search/search_dna.sh @@ -0,0 +1,4 @@ +python3 -m graphgen.run \ +--config_file graphgen/configs/search_dna_config.yaml \ +--output_dir cache/ + diff --git a/scripts/search/search_rna.sh b/scripts/search/search_rna.sh new file mode 100644 index 00000000..260499b3 --- /dev/null +++ b/scripts/search/search_rna.sh @@ -0,0 +1,4 @@ +python3 -m graphgen.run \ +--config_file graphgen/configs/search_rna_config.yaml \ +--output_dir cache/ + diff --git a/scripts/search/search_uniprot.sh b/scripts/search/search_uniprot.sh index 642040af..7b295f8d 100644 --- a/scripts/search/search_uniprot.sh +++ b/scripts/search/search_uniprot.sh @@ -1,3 +1,3 @@ 
python3 -m graphgen.run \ ---config_file graphgen/configs/search_config.yaml \ +--config_file graphgen/configs/search_protein_config.yaml \ --output_dir cache/ From 9a26138580ec0294a994eaefd9d01ff1a5f41356 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Wed, 26 Nov 2025 20:48:38 +0800 Subject: [PATCH 02/22] fix: fix pylint style issues - Remove all trailing whitespace - Fix line-too-long issues (split long lines) - Remove trailing newlines at end of files - Remove unnecessary else/elif after return statements --- graphgen/models/searcher/db/ncbi_searcher.py | 59 ++++++++++--------- .../models/searcher/db/rnacentral_searcher.py | 18 +++--- 2 files changed, 39 insertions(+), 38 deletions(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 73a9ad87..aa23a9d4 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -54,17 +54,24 @@ def get_by_gene_id(self, gene_id: str) -> Optional[dict]: gene_record = Entrez.read(handle) if not gene_record: return None - + gene_data = gene_record[0] gene_ref = gene_data.get("Entrezgene_gene", {}).get("Gene-ref", {}) - + + organism = ( + gene_data.get("Entrezgene_source", {}) + .get("BioSource", {}) + .get("BioSource_org", {}) + .get("Org-ref", {}) + .get("Org-ref_taxname", "N/A") + ) return { "molecule_type": "DNA", "database": "NCBI", "id": gene_id, "gene_name": gene_ref.get("Gene-ref_locus", "N/A"), "gene_description": gene_ref.get("Gene-ref_desc", "N/A"), - "organism": gene_data.get("Entrezgene_source", {}).get("BioSource", {}).get("BioSource_org", {}).get("Org-ref", {}).get("Org-ref_taxname", "N/A"), + "organism": organism, "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}", } finally: @@ -93,11 +100,11 @@ def get_by_accession(self, accession: str) -> Optional[dict]: sequence_data = handle.read() if not sequence_data: return None - + seq_lines = sequence_data.strip().split("\n") header = seq_lines[0] if seq_lines else "" sequence = "".join(seq_lines[1:]) - + # Try to get more information time.sleep(0.35) summary_handle = Entrez.esummary(db="nuccore", id=accession) @@ -112,7 +119,7 @@ def get_by_accession(self, accession: str) -> Optional[dict]: organism = "N/A" finally: summary_handle.close() - + return { "molecule_type": "DNA", "database": "NCBI", @@ -162,7 +169,7 @@ def search_by_keyword(self, keyword: str) -> Optional[dict]: search_results = Entrez.read(search_handle2) finally: search_handle2.close() - + if search_results.get("IdList"): gene_id = search_results["IdList"][0] return self.get_by_gene_id(gene_id) @@ -188,21 +195,21 @@ def search_by_sequence(self, sequence: str) -> Optional[dict]: seq = "".join(seq_lines[1:]) else: seq = sequence.strip().replace(" ", "").replace("\n", "") - + # Validate if it's a DNA sequence if not re.fullmatch(r"[ATCGN\s]+", seq, re.I): logger.error("Invalid DNA sequence provided.") return None - + if not seq: logger.error("Empty DNA sequence provided.") return None - + # Use BLAST search (Note: requires network connection, may be slow) logger.debug("Performing BLAST search for DNA sequence...") time.sleep(0.35) from Bio.Blast import NCBIWWW, NCBIXML - + result_handle = NCBIWWW.qblast( program="blastn", database="nr", @@ -211,33 +218,32 @@ def search_by_sequence(self, sequence: str) -> Optional[dict]: expect=0.001, ) blast_record = NCBIXML.read(result_handle) - + if not blast_record.alignments: logger.info("No BLAST hits found for the given sequence.") return None - + best_alignment = 
blast_record.alignments[0] best_hsp = best_alignment.hsps[0] hit_id = best_alignment.hit_id - + # Extract accession number # Format may be: gi|123456|ref|NM_000546.5| accession_match = re.search(r"ref\|([^|]+)", hit_id) if accession_match: accession = accession_match.group(1).split(".")[0] return self.get_by_accession(accession) - else: - # If unable to extract accession, return basic information - return { - "molecule_type": "DNA", - "database": "NCBI", - "id": hit_id, - "title": best_alignment.title, - "sequence_length": len(seq), - "e_value": best_hsp.expect, - "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0, - "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}", - } + # If unable to extract accession, return basic information + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": hit_id, + "title": best_alignment.title, + "sequence_length": len(seq), + "e_value": best_hsp.expect, + "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0, + "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}", + } except RequestException: raise except Exception as e: # pylint: disable=broad-except @@ -293,4 +299,3 @@ async def search( if result: result["_search_query"] = query return result - diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index d1decd6d..63c88395 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -61,11 +61,10 @@ async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: "description": rna_data.get("description", "N/A"), "url": f"https://rnacentral.org/rna/{rna_id}", } - elif resp.status == 404: + if resp.status == 404: logger.error("RNA ID %s not found", rna_id) return None - else: - raise Exception(f"HTTP {resp.status}: {await resp.text()}") + raise Exception(f"HTTP {resp.status}: {await resp.text()}") except Exception as exc: # pylint: disable=broad-except logger.error("RNA ID %s not found: %s", rna_id, exc) return None @@ -97,8 +96,7 @@ async def search_by_keyword(self, keyword: str) -> Optional[dict]: return await self.get_by_rna_id(rna_id) logger.info("No results found for keyword: %s", keyword) return None - else: - raise Exception(f"HTTP {resp.status}: {await resp.text()}") + raise Exception(f"HTTP {resp.status}: {await resp.text()}") except Exception as e: # pylint: disable=broad-except logger.error("Keyword %s not found: %s", keyword, e) return None @@ -116,16 +114,16 @@ async def search_by_sequence(self, sequence: str) -> Optional[dict]: seq = "".join(seq_lines[1:]) else: seq = sequence.strip().replace(" ", "").replace("\n", "") - + # Validate if it's an RNA sequence (contains U instead of T) if not re.fullmatch(r"[AUCGN\s]+", seq, re.I): logger.error("Invalid RNA sequence provided.") return None - + if not seq: logger.error("Empty RNA sequence provided.") return None - + # RNAcentral API supports sequence search async with aiohttp.ClientSession() as session: search_url = f"{self.base_url}/rna" @@ -144,8 +142,7 @@ async def search_by_sequence(self, sequence: str) -> Optional[dict]: return await self.get_by_rna_id(rna_id) logger.info("No results found for sequence.") return None - else: - raise Exception(f"HTTP {resp.status}: {await resp.text()}") + raise Exception(f"HTTP {resp.status}: {await resp.text()}") except Exception as e: # pylint: disable=broad-except logger.error("Sequence search failed: %s", e) return None @@ -188,4 +185,3 
@@ async def search( if result: result["_search_query"] = query return result - From ef270b84d7764d430b8b6933ad316796ad33a25e Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 27 Nov 2025 00:04:45 +0800 Subject: [PATCH 03/22] refactor: unify searcher interfaces and improve error handling - Extract utility functions (_gene_record_to_dict, _accession_to_dict, _rna_data_to_dict) - Unify method naming: search_by_keyword -> get_best_hit - Add threshold parameter to NCBI and RNAcentral searchers for interface consistency - Improve error handling with network error detection and fallback strategies - Fix RNAcentral sequence search to prioritize exact matches - Add search_rna_demo.jsonl example file --- graphgen/models/searcher/db/ncbi_searcher.py | 107 +++++++++++------- .../models/searcher/db/rnacentral_searcher.py | 99 ++++++++++++---- .../input_examples/search_rna_demo.jsonl | 4 + 3 files changed, 150 insertions(+), 60 deletions(-) create mode 100644 resources/input_examples/search_rna_demo.jsonl diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index aa23a9d4..9c637ffd 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -41,6 +41,38 @@ def __init__(self, email: str = "test@example.com", tool: str = "GraphGen"): Entrez.tool = tool Entrez.timeout = 60 # 60 seconds timeout + @staticmethod + def _gene_record_to_dict(gene_record, gene_id: str) -> dict: + """ + Convert an Entrez gene record to a dictionary. + :param gene_record: The Entrez gene record (list from Entrez.read). + :param gene_id: The gene ID. + :return: A dictionary containing gene information. + """ + if not gene_record: + raise ValueError("Empty gene record") + + gene_data = gene_record[0] + gene_ref = gene_data.get("Entrezgene_gene", {}).get("Gene-ref", {}) + + organism = ( + gene_data.get("Entrezgene_source", {}) + .get("BioSource", {}) + .get("BioSource_org", {}) + .get("Org-ref", {}) + .get("Org-ref_taxname", "N/A") + ) + + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": gene_id, + "gene_name": gene_ref.get("Gene-ref_locus", "N/A"), + "gene_description": gene_ref.get("Gene-ref_desc", "N/A"), + "organism": organism, + "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}", + } + def get_by_gene_id(self, gene_id: str) -> Optional[dict]: """ Get gene information by Gene ID. @@ -54,26 +86,7 @@ def get_by_gene_id(self, gene_id: str) -> Optional[dict]: gene_record = Entrez.read(handle) if not gene_record: return None - - gene_data = gene_record[0] - gene_ref = gene_data.get("Entrezgene_gene", {}).get("Gene-ref", {}) - - organism = ( - gene_data.get("Entrezgene_source", {}) - .get("BioSource", {}) - .get("BioSource_org", {}) - .get("Org-ref", {}) - .get("Org-ref_taxname", "N/A") - ) - return { - "molecule_type": "DNA", - "database": "NCBI", - "id": gene_id, - "gene_name": gene_ref.get("Gene-ref_locus", "N/A"), - "gene_description": gene_ref.get("Gene-ref_desc", "N/A"), - "organism": organism, - "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}", - } + return self._gene_record_to_dict(gene_record, gene_id) finally: handle.close() except RequestException: @@ -82,6 +95,28 @@ def get_by_gene_id(self, gene_id: str) -> Optional[dict]: logger.error("Gene ID %s not found: %s", gene_id, exc) return None + @staticmethod + def _accession_to_dict(accession: str, sequence: str, header: str, title: str, organism: str) -> dict: + """ + Convert accession information to a dictionary. 
+ :param accession: NCBI accession number. + :param sequence: DNA sequence. + :param header: FASTA header. + :param title: Sequence title. + :param organism: Organism name. + :return: A dictionary containing sequence information. + """ + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": accession, + "title": title, + "organism": organism, + "sequence": sequence, + "sequence_length": len(sequence), + "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}", + } + def get_by_accession(self, accession: str) -> Optional[dict]: """ Get sequence information by accession number. @@ -89,7 +124,7 @@ def get_by_accession(self, accession: str) -> Optional[dict]: :return: A dictionary containing sequence information or None if not found. """ try: - time.sleep(0.35) # 遵守速率限制 + time.sleep(0.35) # Comply with rate limit handle = Entrez.efetch( db="nuccore", id=accession, @@ -120,16 +155,7 @@ def get_by_accession(self, accession: str) -> Optional[dict]: finally: summary_handle.close() - return { - "molecule_type": "DNA", - "database": "NCBI", - "id": accession, - "title": title, - "organism": organism, - "sequence": sequence, - "sequence_length": len(sequence), - "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}", - } + return self._accession_to_dict(accession, sequence, header, title, organism) finally: handle.close() except RequestException: @@ -138,7 +164,7 @@ def get_by_accession(self, accession: str) -> Optional[dict]: logger.error("Accession %s not found: %s", accession, exc) return None - def search_by_keyword(self, keyword: str) -> Optional[dict]: + def get_best_hit(self, keyword: str) -> Optional[dict]: """ Search NCBI Gene database with a keyword and return the best hit. :param keyword: The search keyword (e.g., gene name). @@ -148,7 +174,7 @@ def search_by_keyword(self, keyword: str) -> Optional[dict]: return None try: - time.sleep(0.35) # 遵守速率限制 + time.sleep(0.35) # Comply with rate limit # Search gene database search_handle = Entrez.esearch( db="gene", @@ -181,11 +207,12 @@ def search_by_keyword(self, keyword: str) -> Optional[dict]: logger.error("Keyword %s not found: %s", keyword, e) return None - def search_by_sequence(self, sequence: str) -> Optional[dict]: + def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: """ Search NCBI with a DNA sequence using BLAST. Note: This is a simplified version. For production, consider using local BLAST. :param sequence: DNA sequence (FASTA format or raw sequence). + :param threshold: E-value threshold for BLAST search. :return: A dictionary containing the best hit information or None if not found. """ try: @@ -215,7 +242,7 @@ def search_by_sequence(self, sequence: str) -> Optional[dict]: database="nr", sequence=seq, hitlist_size=1, - expect=0.001, + expect=threshold, ) blast_record = NCBIXML.read(result_handle) @@ -225,6 +252,9 @@ def search_by_sequence(self, sequence: str) -> Optional[dict]: best_alignment = blast_record.alignments[0] best_hsp = best_alignment.hsps[0] + if best_hsp.expect > threshold: + logger.info("No BLAST hits below the threshold E-value.") + return None hit_id = best_alignment.hit_id # Extract accession number @@ -257,11 +287,12 @@ def search_by_sequence(self, sequence: str) -> Optional[dict]: reraise=True, ) async def search( - self, query: str, **kwargs + self, query: str, threshold: float = 0.01, **kwargs ) -> Optional[Dict]: """ Search NCBI with either a gene ID, accession number, keyword, or DNA sequence. 
:param query: The search query (gene ID, accession, keyword, or DNA sequence). + :param threshold: E-value threshold for BLAST search. :param kwargs: Additional keyword arguments (not used currently). :return: A dictionary containing the search results or None if not found. """ @@ -278,7 +309,7 @@ async def search( # check if DNA sequence (ATCG characters) if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I): result = await loop.run_in_executor( - _get_pool(), self.search_by_sequence, query + _get_pool(), self.search_by_sequence, query, threshold ) # check if gene ID (numeric) elif re.fullmatch(r"^\d+$", query): @@ -293,7 +324,7 @@ async def search( else: # otherwise treat as keyword result = await loop.run_in_executor( - _get_pool(), self.search_by_keyword, query + _get_pool(), self.get_best_hit, query ) if result: diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index 63c88395..0eeb4a43 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -36,6 +36,27 @@ def __init__(self): self.base_url = "https://rnacentral.org/api/v1" self.headers = {"Accept": "application/json"} + @staticmethod + def _rna_data_to_dict(rna_id: str, rna_data: dict) -> dict: + """ + Convert RNAcentral API response to a dictionary. + :param rna_id: RNAcentral ID. + :param rna_data: API response data (dict or dict-like from search results). + :return: A dictionary containing RNA information. + """ + sequence = rna_data.get("sequence", "") + return { + "molecule_type": "RNA", + "database": "RNAcentral", + "id": rna_id, + "rnacentral_id": rna_data.get("rnacentral_id", rna_id), + "sequence": sequence, + "sequence_length": rna_data.get("length", len(sequence)), + "rna_type": rna_data.get("rna_type", "N/A"), + "description": rna_data.get("description", "N/A"), + "url": f"https://rnacentral.org/rna/{rna_id}", + } + async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: """ Get RNA information by RNAcentral ID. @@ -50,26 +71,19 @@ async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: ) as resp: if resp.status == 200: rna_data = await resp.json() - return { - "molecule_type": "RNA", - "database": "RNAcentral", - "id": rna_id, - "rnacentral_id": rna_data.get("rnacentral_id", "N/A"), - "sequence": rna_data.get("sequence", ""), - "sequence_length": len(rna_data.get("sequence", "")), - "rna_type": rna_data.get("rna_type", "N/A"), - "description": rna_data.get("description", "N/A"), - "url": f"https://rnacentral.org/rna/{rna_id}", - } + return self._rna_data_to_dict(rna_id, rna_data) if resp.status == 404: logger.error("RNA ID %s not found", rna_id) return None raise Exception(f"HTTP {resp.status}: {await resp.text()}") + except aiohttp.ClientError as e: + logger.error("Network error getting RNA ID %s: %s", rna_id, e) + return None except Exception as exc: # pylint: disable=broad-except logger.error("RNA ID %s not found: %s", rna_id, exc) return None - async def search_by_keyword(self, keyword: str) -> Optional[dict]: + async def get_best_hit(self, keyword: str) -> Optional[dict]: """ Search RNAcentral with a keyword and return the best hit. :param keyword: The search keyword (e.g., miRNA name, RNA name). 
@@ -90,13 +104,26 @@ async def search_by_keyword(self, keyword: str) -> Optional[dict]: ) as resp: if resp.status == 200: search_results = await resp.json() - if search_results.get("results"): - rna_id = search_results["results"][0].get("rnacentral_id") + results = search_results.get("results", []) + if results: + # Use the first result directly (search API already returns enough info) + first_result = results[0] + rna_id = first_result.get("rnacentral_id") if rna_id: - return await self.get_by_rna_id(rna_id) + # Try to get detailed info, but fall back to search result if it fails + detailed_info = await self.get_by_rna_id(rna_id) + if detailed_info: + return detailed_info + # Fall back to using search result data + return self._rna_data_to_dict(rna_id, first_result) logger.info("No results found for keyword: %s", keyword) return None - raise Exception(f"HTTP {resp.status}: {await resp.text()}") + error_text = await resp.text() + logger.error("HTTP %d error for keyword %s: %s", resp.status, keyword, error_text[:200]) + raise Exception(f"HTTP {resp.status}: {error_text}") + except aiohttp.ClientError as e: + logger.error("Network error searching for keyword %s: %s", keyword, e) + return None except Exception as e: # pylint: disable=broad-except logger.error("Keyword %s not found: %s", keyword, e) return None @@ -136,13 +163,39 @@ async def search_by_sequence(self, sequence: str) -> Optional[dict]: ) as resp: if resp.status == 200: search_results = await resp.json() - if search_results.get("results"): - rna_id = search_results["results"][0].get("rnacentral_id") + results = search_results.get("results", []) + if results: + # First, try to find an exact sequence match + exact_match = None + for result in results: + result_seq = result.get("sequence", "") + if result_seq == seq: + exact_match = result + break + + # Use exact match if found, otherwise use first result + target_result = exact_match if exact_match else results[0] + rna_id = target_result.get("rnacentral_id") + if rna_id: - return await self.get_by_rna_id(rna_id) + # Try to get detailed info, but fall back to search result if it fails + try: + detailed_info = await self.get_by_rna_id(rna_id) + if detailed_info: + return detailed_info + except Exception as e: + logger.debug("Failed to get detailed info for %s: %s, using search result", rna_id, e) + + # Fall back to using search result data + return self._rna_data_to_dict(rna_id, target_result) logger.info("No results found for sequence.") return None - raise Exception(f"HTTP {resp.status}: {await resp.text()}") + error_text = await resp.text() + logger.error("HTTP %d error for sequence search: %s", resp.status, error_text[:200]) + raise Exception(f"HTTP {resp.status}: {error_text}") + except aiohttp.ClientError as e: + logger.error("Network error searching for sequence: %s", e) + return None except Exception as e: # pylint: disable=broad-except logger.error("Sequence search failed: %s", e) return None @@ -154,11 +207,13 @@ async def search_by_sequence(self, sequence: str) -> Optional[dict]: reraise=True, ) async def search( - self, query: str, **kwargs + self, query: str, threshold: float = 0.7, **kwargs ) -> Optional[Dict]: """ Search RNAcentral with either an RNAcentral ID, keyword, or RNA sequence. :param query: The search query (RNAcentral ID, keyword, or RNA sequence). + :param threshold: E-value threshold for sequence search. + Note: RNAcentral API uses its own similarity matching, this parameter is for interface consistency. 
:param kwargs: Additional keyword arguments (not used currently). :return: A dictionary containing the search results or None if not found. """ @@ -180,7 +235,7 @@ async def search( result = await self.get_by_rna_id(query) else: # otherwise treat as keyword - result = await self.search_by_keyword(query) + result = await self.get_best_hit(query) if result: result["_search_query"] = query diff --git a/resources/input_examples/search_rna_demo.jsonl b/resources/input_examples/search_rna_demo.jsonl new file mode 100644 index 00000000..caa28612 --- /dev/null +++ b/resources/input_examples/search_rna_demo.jsonl @@ -0,0 +1,4 @@ +{"type": "text", "content": "hsa-let-7a-1"} +{"type": "text", "content": "URS0000123456"} +{"type": "text", "content": "URS0000000001"} +{"type": "text", "content": "CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} From 71fba90cbad7baae068228f7e8d28eb88c9c7381 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 27 Nov 2025 00:25:11 +0800 Subject: [PATCH 04/22] Add UniProt IDs to search_protein_demo.jsonl --- resources/input_examples/search_protein_demo.jsonl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/resources/input_examples/search_protein_demo.jsonl b/resources/input_examples/search_protein_demo.jsonl index 6409a805..82b77836 100644 --- a/resources/input_examples/search_protein_demo.jsonl +++ b/resources/input_examples/search_protein_demo.jsonl @@ -1,3 +1,7 @@ +{"type": "text", "content": "P01308"} +{"type": "text", "content": "P68871"} +{"type": "text", "content": "P02768"} +{"type": "text", "content": "P04637"} {"type": "text", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} {"type": "text", "content": "MGSSHHHHHHSQDLENLYFQGSMNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRTKRVITTFRTGTWDAYKNLRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV"} {"type": "text", "content": "MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDAAAVEATWIAKIKAAKSKYEAEAIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKERGVKVLHLQA"} From 9f8c837ab2b617ff48e40f2e82cee80cbdb8acb3 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 27 Nov 2025 00:25:11 +0800 Subject: [PATCH 05/22] add: add UniProt IDs to search_protein_demo.jsonl --- resources/input_examples/search_protein_demo.jsonl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/resources/input_examples/search_protein_demo.jsonl b/resources/input_examples/search_protein_demo.jsonl index 6409a805..82b77836 100644 --- a/resources/input_examples/search_protein_demo.jsonl +++ b/resources/input_examples/search_protein_demo.jsonl @@ -1,3 +1,7 @@ +{"type": "text", "content": "P01308"} +{"type": "text", "content": "P68871"} +{"type": "text", "content": "P02768"} +{"type": "text", "content": "P04637"} 
{"type": "text", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} {"type": "text", "content": "MGSSHHHHHHSQDLENLYFQGSMNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRTKRVITTFRTGTWDAYKNLRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV"} {"type": "text", "content": "MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDAAAVEATWIAKIKAAKSKYEAEAIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKERGVKVLHLQA"} From c60784e7035e743626315ede028017b0dcc5acaa Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 27 Nov 2025 17:42:05 +0800 Subject: [PATCH 06/22] feat: unify search interfaces to use gene ID as unified data source --- graphgen/models/searcher/db/ncbi_searcher.py | 408 +++++++++++++++---- 1 file changed, 338 insertions(+), 70 deletions(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 9c637ffd..cc78fa1c 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -1,4 +1,5 @@ import asyncio +import logging import re import time from concurrent.futures import ThreadPoolExecutor @@ -7,12 +8,14 @@ from typing import Dict, Optional from Bio import Entrez +from Bio.Blast import NCBIWWW, NCBIXML from requests.exceptions import RequestException from tenacity import ( retry, retry_if_exception_type, stop_after_attempt, wait_exponential, + before_sleep_log, ) from graphgen.bases import BaseSearcher @@ -41,6 +44,18 @@ def __init__(self, email: str = "test@example.com", tool: str = "GraphGen"): Entrez.tool = tool Entrez.timeout = 60 # 60 seconds timeout + @staticmethod + def _safe_get(obj, key, default=None): + """Safely get value from dict or StringElement-like object.""" + if isinstance(obj, dict): + return obj.get(key, default) + elif hasattr(obj, "get"): + return obj.get(key, default) + elif hasattr(obj, key): + return getattr(obj, key, default) + else: + return default + @staticmethod def _gene_record_to_dict(gene_record, gene_id: str) -> dict: """ @@ -53,30 +68,200 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict: raise ValueError("Empty gene record") gene_data = gene_record[0] - gene_ref = gene_data.get("Entrezgene_gene", {}).get("Gene-ref", {}) - - organism = ( - gene_data.get("Entrezgene_source", {}) - .get("BioSource", {}) - .get("BioSource_org", {}) - .get("Org-ref", {}) - .get("Org-ref_taxname", "N/A") - ) + + # Safely extract gene_ref, handling both dict and StringElement types + gene_ref = {} + entrezgene_gene = gene_data.get("Entrezgene_gene") + if isinstance(entrezgene_gene, dict): + gene_ref = entrezgene_gene.get("Gene-ref", {}) + elif hasattr(entrezgene_gene, "get"): + gene_ref = entrezgene_gene.get("Gene-ref", {}) + else: + # If it's a StringElement or other type, try to access as dict + try: + if hasattr(entrezgene_gene, "Gene-ref"): + gene_ref = getattr(entrezgene_gene, "Gene-ref", {}) + except Exception: + pass + # Safely extract organism + organism = "N/A" + try: + 
entrezgene_source = gene_data.get("Entrezgene_source") + if isinstance(entrezgene_source, dict): + biosource = entrezgene_source.get("BioSource", {}) + if isinstance(biosource, dict): + biosource_org = biosource.get("BioSource_org", {}) + if isinstance(biosource_org, dict): + org_ref = biosource_org.get("Org-ref", {}) + if isinstance(org_ref, dict): + organism = org_ref.get("Org-ref_taxname", "N/A") + elif hasattr(org_ref, "Org-ref_taxname"): + organism = getattr(org_ref, "Org-ref_taxname", "N/A") + except Exception as e: + logger.debug("Error extracting organism: %s", e) + + # Extract gene synonyms - safely handle StringElement types + gene_synonyms = [] + try: + gene_syn = gene_ref.get("Gene-ref_syn", []) if isinstance(gene_ref, dict) else [] + if isinstance(gene_syn, list): + for syn in gene_syn: + if isinstance(syn, dict): + gene_synonyms.append(syn.get("Gene-ref_syn_E", "N/A")) + elif isinstance(syn, str): + gene_synonyms.append(syn) + else: + # Handle StringElement or other types + gene_synonyms.append(str(syn)) + elif isinstance(gene_syn, str): + gene_synonyms.append(gene_syn) + elif gene_syn: # Handle StringElement + gene_synonyms.append(str(gene_syn)) + except Exception as e: + logger.debug("Error extracting gene synonyms: %s", e) + + # Extract gene type - safely handle StringElement types + # Note: Entrezgene_type is a StringElement with numeric value (e.g., "6" for ncRNA) + gene_type = None + try: + gene_type_data = gene_data.get("Entrezgene_type") + if gene_type_data: + type_value = str(gene_type_data) + # Map numeric values to type names (NCBI gene type codes) + type_mapping = { + "1": "protein-coding", + "2": "pseudo", + "3": "rRNA", + "4": "tRNA", + "5": "snRNA", + "6": "ncRNA", + "7": "other", + } + gene_type = type_mapping.get(type_value, f"type_{type_value}") + except Exception as e: + logger.debug("Error extracting gene type: %s", e) + + # Extract chromosome and genomic location from Entrezgene_locus + # Note: Entrezgene_location doesn't exist, but Entrezgene_locus contains location info + chromosome = None + genomic_location = None + + try: + locus_data = gene_data.get("Entrezgene_locus") + if locus_data and isinstance(locus_data, list) and locus_data: + first_locus = locus_data[0] + if isinstance(first_locus, dict): + # Extract chromosome from Gene-commentary_label + # Example: "Chromosome 13 Reference RoL_Sarg_1.0" -> "13" + label = NCBISearch._safe_get(first_locus, "Gene-commentary_label", "") + if label and "Chromosome" in str(label): + match = re.search(r'Chromosome\s+(\S+)', str(label)) + if match: + chromosome = match.group(1) + + # Extract genomic location from Gene-commentary_seqs + seqs = NCBISearch._safe_get(first_locus, "Gene-commentary_seqs", []) + if seqs and isinstance(seqs, list) and seqs: + first_seq = seqs[0] + if isinstance(first_seq, dict): + seq_loc_int = NCBISearch._safe_get(first_seq, "Seq-loc_int", {}) + if seq_loc_int: + seq_interval = NCBISearch._safe_get(seq_loc_int, "Seq-interval", {}) + if seq_interval: + seq_from = NCBISearch._safe_get(seq_interval, "Seq-interval_from", "") + seq_to = NCBISearch._safe_get(seq_interval, "Seq-interval_to", "") + if seq_from and seq_to: + genomic_location = f"{seq_from}-{seq_to}" + except Exception as e: + logger.debug("Error extracting chromosome/location from gene record: %s", e) + + # Extract gene functional description + # Note: Entrezgene_summary doesn't exist for most genes + # Try to extract from Entrezgene_comments if available + function = None + try: + # First try Entrezgene_summary (if exists) + 
summary = gene_data.get("Entrezgene_summary") + if summary: + function = str(summary) + else: + # Try to extract from Entrezgene_comments + comments_data = gene_data.get("Entrezgene_comments") + if comments_data and isinstance(comments_data, list): + for comment in comments_data: + if isinstance(comment, dict): + heading = NCBISearch._safe_get(comment, "Gene-commentary_heading", "") + # Look for function-related comments + if "function" in str(heading).lower() or "summary" in str(heading).lower(): + comment_text = NCBISearch._safe_get(comment, "Gene-commentary_comment", "") + if comment_text: + function = str(comment_text) + break + except Exception as e: + logger.debug("Error extracting function: %s", e) + + # Try to extract representative mRNA accession from Entrezgene_locus for sequence retrieval + representative_accession = None + try: + if locus_data and isinstance(locus_data, list) and locus_data: + first_locus = locus_data[0] + if isinstance(first_locus, dict): + products = NCBISearch._safe_get(first_locus, "Gene-commentary_products", []) + if products and isinstance(products, list): + # Look for mRNA (type 3) or the first product + for product in products: + if isinstance(product, dict): + product_type = NCBISearch._safe_get(product, "Gene-commentary_type", "") + product_type_str = str(product_type) + # Type 3 is mRNA, prefer mRNA over other types + if product_type_str == "3" or (not representative_accession and product_type_str): + accession = NCBISearch._safe_get(product, "Gene-commentary_accession", "") + if accession: + representative_accession = str(accession) + if product_type_str == "3": # Found mRNA, use it + break + except Exception as e: + logger.debug("Error extracting representative accession: %s", e) + + # Build result dictionary with all fields + # Include all fields that might be present in accession-based queries return { "molecule_type": "DNA", "database": "NCBI", "id": gene_id, - "gene_name": gene_ref.get("Gene-ref_locus", "N/A"), - "gene_description": gene_ref.get("Gene-ref_desc", "N/A"), + "gene_name": NCBISearch._safe_get(gene_ref, "Gene-ref_locus", "N/A"), + "gene_description": NCBISearch._safe_get(gene_ref, "Gene-ref_desc", "N/A"), "organism": organism, "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}", + "gene_synonyms": gene_synonyms if gene_synonyms else None, + "gene_type": gene_type, + "chromosome": chromosome, + "genomic_location": genomic_location, + "function": function, + # Fields from accession-based queries (set to None initially, may be filled later) + "title": None, + "sequence": None, + "sequence_length": None, + "gene_id": gene_id, # For consistency with accession queries + "molecule_type_detail": None, + "_representative_accession": representative_accession, } - def get_by_gene_id(self, gene_id: str) -> Optional[dict]: + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + before_sleep=before_sleep_log(logger, logging.WARNING), + ) + + def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]: """ Get gene information by Gene ID. + This is the unified data source - all search methods eventually call this. :param gene_id: NCBI Gene ID. + :param preferred_accession: Optional accession to use for sequence retrieval if representative mRNA is not available. :return: A dictionary containing gene information or None if not found. 
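For reference, a minimal standalone sketch of the retry pattern applied to get_by_gene_id above: exponential backoff on transient network errors, with a warning logged before each sleep. flaky_fetch and its failure are illustrative placeholders, not part of this patch.

    import logging
    from http.client import IncompleteRead

    from requests.exceptions import RequestException
    from tenacity import (
        before_sleep_log,
        retry,
        retry_if_exception_type,
        stop_after_attempt,
        wait_exponential,
    )

    logger = logging.getLogger(__name__)

    @retry(
        stop=stop_after_attempt(5),                          # give up after 5 attempts
        wait=wait_exponential(multiplier=1, min=4, max=10),  # exponential back-off, clamped to 4-10s
        retry=retry_if_exception_type((RequestException, IncompleteRead)),
        reraise=True,                                        # surface the last error to the caller
        before_sleep=before_sleep_log(logger, logging.WARNING),
    )
    def flaky_fetch(gene_id: str) -> str:
        # Stand-in for an Entrez call that can fail transiently.
        raise RequestException(f"simulated transient failure for gene {gene_id}")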
""" try: @@ -86,84 +271,166 @@ def get_by_gene_id(self, gene_id: str) -> Optional[dict]: gene_record = Entrez.read(handle) if not gene_record: return None - return self._gene_record_to_dict(gene_record, gene_id) + result = self._gene_record_to_dict(gene_record, gene_id) + + # Try to get sequence from accession + # Priority: 1) preferred_accession (if provided), 2) representative mRNA accession + accession_to_use = preferred_accession or result.get("_representative_accession") + if accession_to_use: + try: + # Get sequence info directly from nuccore database + time.sleep(0.35) + seq_handle = Entrez.efetch( + db="nuccore", + id=accession_to_use, + rettype="fasta", + retmode="text", + ) + try: + sequence_data = seq_handle.read() + if sequence_data: + seq_lines = sequence_data.strip().split("\n") + header = seq_lines[0] if seq_lines else "" + sequence = "".join(seq_lines[1:]) + + # Get summary for additional info + time.sleep(0.35) + summary_handle = Entrez.esummary(db="nuccore", id=accession_to_use) + try: + summary = Entrez.read(summary_handle) + if summary: + summary_data = summary[0] + title = summary_data.get("Title", header) + + # Determine molecule type detail + molecule_type_detail = "N/A" + if accession_to_use.startswith("NM_") or accession_to_use.startswith("XM_"): + molecule_type_detail = "mRNA" + elif accession_to_use.startswith("NC_") or accession_to_use.startswith("NT_"): + molecule_type_detail = "genomic DNA" + elif accession_to_use.startswith("NR_") or accession_to_use.startswith("XR_"): + molecule_type_detail = "RNA" + elif accession_to_use.startswith("NG_"): + molecule_type_detail = "genomic region" + + # Merge sequence information into result + result["sequence"] = sequence + result["sequence_length"] = len(sequence) + result["title"] = title + result["molecule_type_detail"] = molecule_type_detail + + # Update chromosome and genomic_location if not already set + if not result.get("chromosome"): + chromosome = summary_data.get("ChrLoc") or summary_data.get("ChrAccVer") + if chromosome: + result["chromosome"] = chromosome + if not result.get("genomic_location"): + chr_start = summary_data.get("ChrStart") + chr_stop = summary_data.get("ChrStop") + if chr_start and chr_stop: + result["genomic_location"] = f"{chr_start}-{chr_stop}" + finally: + summary_handle.close() + finally: + seq_handle.close() + except (RequestException, IncompleteRead): + # Re-raise to allow retry mechanism + raise + except Exception as e: + logger.debug("Failed to get sequence for accession %s: %s", + accession_to_use, e) + + # Remove internal field + result.pop("_representative_accession", None) + return result finally: handle.close() except RequestException: raise + except IncompleteRead: + raise except Exception as exc: # pylint: disable=broad-except logger.error("Gene ID %s not found: %s", gene_id, exc) return None - @staticmethod - def _accession_to_dict(accession: str, sequence: str, header: str, title: str, organism: str) -> dict: - """ - Convert accession information to a dictionary. - :param accession: NCBI accession number. - :param sequence: DNA sequence. - :param header: FASTA header. - :param title: Sequence title. - :param organism: Organism name. - :return: A dictionary containing sequence information. 
- """ - return { - "molecule_type": "DNA", - "database": "NCBI", - "id": accession, - "title": title, - "organism": organism, - "sequence": sequence, - "sequence_length": len(sequence), - "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}", - } - def get_by_accession(self, accession: str) -> Optional[dict]: """ Get sequence information by accession number. + Unified approach: Get GeneID from accession, then call get_by_gene_id() for complete information. :param accession: NCBI accession number (e.g., NM_000546). - :return: A dictionary containing sequence information or None if not found. + :return: A dictionary containing complete gene information or None if not found. """ try: - time.sleep(0.35) # Comply with rate limit - handle = Entrez.efetch( - db="nuccore", - id=accession, - rettype="fasta", - retmode="text", - ) + # Step 1: Get GeneID from elink (nuccore -> gene) + # Note: esummary for nuccore doesn't include GeneID, so we use elink instead + time.sleep(0.35) + link_handle = Entrez.elink(dbfrom="nuccore", db="gene", id=accession) + gene_id = None try: - sequence_data = handle.read() - if not sequence_data: - return None - - seq_lines = sequence_data.strip().split("\n") - header = seq_lines[0] if seq_lines else "" - sequence = "".join(seq_lines[1:]) - - # Try to get more information - time.sleep(0.35) - summary_handle = Entrez.esummary(db="nuccore", id=accession) - try: - summary = Entrez.read(summary_handle) - if summary: - summary_data = summary[0] - title = summary_data.get("Title", header) - organism = summary_data.get("Organism", "N/A") - else: - title = header - organism = "N/A" - finally: - summary_handle.close() - - return self._accession_to_dict(accession, sequence, header, title, organism) + links = Entrez.read(link_handle) + + # Extract GeneID from elink results + # Structure: links[0]["LinkSetDb"][0]["Link"][0]["Id"] + if links and len(links) > 0: + first_link = links[0] + if "LinkSetDb" in first_link: + for link_set in first_link["LinkSetDb"]: + if link_set.get("DbTo") == "gene": + # Try Link structure first (most common) + links_in_set = link_set.get("Link", []) + if links_in_set and len(links_in_set) > 0: + first_link_item = links_in_set[0] + if isinstance(first_link_item, dict): + gene_id = str(first_link_item.get("Id", "")) + elif hasattr(first_link_item, "Id"): + gene_id = str(getattr(first_link_item, "Id", "")) + else: + # Handle StringElement or other types + gene_id = str(first_link_item) + if gene_id: + break + # Fallback: Try IdList (if Link is not available) + id_list = link_set.get("IdList", []) + if id_list and not gene_id: + gene_id = str(id_list[0]) + break + except Exception as e: + logger.error("Error parsing elink result for accession %s: %s", accession, e) + import traceback + logger.debug(traceback.format_exc()) + # Continue to check if we got gene_id before the error finally: - handle.close() - except RequestException: + link_handle.close() + + # Step 2: If we have a GeneID, get complete information from Gene database + # Pass the accession as preferred_accession so get_by_gene_id can use it for sequence + if gene_id: + result = self.get_by_gene_id(gene_id, preferred_accession=accession) + + # Update id to accession for consistency (user searched by accession) + if result: + result["id"] = accession + result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}" + + return result + + # Step 3: If no GeneID, this is a rare case (accession without associated gene) + # Return None - we can't provide complete information without Gene ID + 
logger.warning("Accession %s has no associated GeneID, cannot provide complete information", accession) + return None + except (RequestException, IncompleteRead): raise except Exception as exc: # pylint: disable=broad-except logger.error("Accession %s not found: %s", accession, exc) return None + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + before_sleep=before_sleep_log(logger, logging.WARNING), + ) def get_best_hit(self, keyword: str) -> Optional[dict]: """ Search NCBI Gene database with a keyword and return the best hit. @@ -203,6 +470,8 @@ def get_best_hit(self, keyword: str) -> Optional[dict]: search_handle.close() except RequestException: raise + except IncompleteRead: + raise except Exception as e: # pylint: disable=broad-except logger.error("Keyword %s not found: %s", keyword, e) return None @@ -235,7 +504,6 @@ def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional # Use BLAST search (Note: requires network connection, may be slow) logger.debug("Performing BLAST search for DNA sequence...") time.sleep(0.35) - from Bio.Blast import NCBIWWW, NCBIXML result_handle = NCBIWWW.qblast( program="blastn", From 0dac99d0f2b5176bff545a08fc19fcf90dbf91c3 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 27 Nov 2025 18:29:16 +0800 Subject: [PATCH 07/22] add: an gene id example in DNA demo --- resources/input_examples/search_dna_demo.jsonl | 1 + 1 file changed, 1 insertion(+) diff --git a/resources/input_examples/search_dna_demo.jsonl b/resources/input_examples/search_dna_demo.jsonl index 83086df9..387c87b8 100644 --- a/resources/input_examples/search_dna_demo.jsonl +++ b/resources/input_examples/search_dna_demo.jsonl @@ -1,4 +1,5 @@ {"type": "text", "content": "TP53"} {"type": "text", "content": "BRCA1"} +{"type": "text", "content": "672"} {"type": "text", "content": "NM_000546"} {"type": "text", "content": 
"CTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"} From 1865120d756aa7345cd0abba6dd073ba86aa70ae Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 27 Nov 2025 20:23:15 +0800 Subject: [PATCH 08/22] feat: unify search interfaces to use RNA id as unified data source --- .../models/searcher/db/rnacentral_searcher.py | 226 +++++++++++++++--- 1 file changed, 196 insertions(+), 30 deletions(-) diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index 0eeb4a43..80cb4428 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -1,8 +1,6 @@ import asyncio import re -from concurrent.futures import ThreadPoolExecutor -from functools import lru_cache -from typing import Dict, Optional +from typing import Dict, Optional, List, Any import aiohttp from tenacity import ( @@ -15,12 +13,6 @@ from graphgen.bases import BaseSearcher from graphgen.utils import logger - -@lru_cache(maxsize=None) -def _get_pool(): - return 
ThreadPoolExecutor(max_workers=10) - - class RNACentralSearch(BaseSearcher): """ RNAcentral Search client to search RNA databases. @@ -36,15 +28,167 @@ def __init__(self): self.base_url = "https://rnacentral.org/api/v1" self.headers = {"Accept": "application/json"} + async def _fetch_all_xrefs(self, xrefs_url: str, session: aiohttp.ClientSession) -> List[Dict]: + """ + Fetch all xrefs from the xrefs endpoint, handling pagination. + :param xrefs_url: URL to the xrefs endpoint. + :param session: aiohttp ClientSession to use for requests. + :return: List of all xref entries. + """ + all_xrefs = [] + current_url = xrefs_url + + while current_url: + try: + async with session.get( + current_url, headers=self.headers, timeout=aiohttp.ClientTimeout(total=30) + ) as resp: + if resp.status == 200: + data = await resp.json() + results = data.get("results", []) + all_xrefs.extend(results) + + # Check if there's a next page + current_url = data.get("next") + if not current_url: + break + + # Small delay to avoid rate limiting + await asyncio.sleep(0.2) + else: + logger.warning("Failed to fetch xrefs from %s: HTTP %d", current_url, resp.status) + break + except Exception as e: + logger.warning("Error fetching xrefs from %s: %s", current_url, e) + break + + return all_xrefs + @staticmethod - def _rna_data_to_dict(rna_id: str, rna_data: dict) -> dict: + def _extract_info_from_xrefs(xrefs: List[Dict]) -> Dict[str, Any]: + """ + Extract information from xrefs data. + :param xrefs: List of xref entries. + :return: Dictionary with extracted information. + """ + extracted = { + "organisms": set(), + "gene_names": set(), + "modifications": [], + "so_terms": set(), + "xrefs_list": [], + } + + for xref in xrefs: + # Extract accession information + accession = xref.get("accession", {}) + + # Extract species information + species = accession.get("species") + if species: + extracted["organisms"].add(species) + + # Extract gene name + gene = accession.get("gene") + if gene and gene.strip(): # Only add non-empty genes + extracted["gene_names"].add(gene.strip()) + + # Extract modifications + modifications = xref.get("modifications", []) + if modifications: + extracted["modifications"].extend(modifications) + + # Extract SO term (biotype) + biotype = accession.get("biotype") + if biotype: + extracted["so_terms"].add(biotype) + + # Build xrefs list + xref_info = { + "database": xref.get("database"), + "accession_id": accession.get("id"), + "external_id": accession.get("external_id"), + "description": accession.get("description"), + "species": species, + "gene": gene, + } + extracted["xrefs_list"].append(xref_info) + + # Convert sets to appropriate formats + return { + "organism": ( + list(extracted["organisms"])[0] + if len(extracted["organisms"]) == 1 + else (", ".join(extracted["organisms"]) if extracted["organisms"] else None) + ), + "gene_name": ( + list(extracted["gene_names"])[0] + if len(extracted["gene_names"]) == 1 + else (", ".join(extracted["gene_names"]) if extracted["gene_names"] else None) + ), + "related_genes": list(extracted["gene_names"]) if extracted["gene_names"] else None, + "modifications": extracted["modifications"] if extracted["modifications"] else None, + "so_term": ( + list(extracted["so_terms"])[0] + if len(extracted["so_terms"]) == 1 + else (", ".join(extracted["so_terms"]) if extracted["so_terms"] else None) + ), + "xrefs": extracted["xrefs_list"] if extracted["xrefs_list"] else None, + } + + @staticmethod + def _rna_data_to_dict(rna_id: str, rna_data: dict, xrefs_data: Optional[List[Dict]] 
= None) -> dict: """ Convert RNAcentral API response to a dictionary. :param rna_id: RNAcentral ID. :param rna_data: API response data (dict or dict-like from search results). + :param xrefs_data: Optional list of xref entries fetched from xrefs endpoint. :return: A dictionary containing RNA information. """ sequence = rna_data.get("sequence", "") + + # Initialize extracted info from xrefs if available + extracted_info = {} + if xrefs_data: + extracted_info = RNACentralSearch._extract_info_from_xrefs(xrefs_data) + + # Extract organism information (prefer from xrefs, fallback to main data) + organism = extracted_info.get("organism") + if not organism: + organism = rna_data.get("organism", None) + if not organism: + organism = rna_data.get("species", None) + + # Extract related genes (prefer from xrefs, fallback to main data) + related_genes = extracted_info.get("related_genes") + if not related_genes: + related_genes = rna_data.get("related_genes", []) + if not related_genes: + related_genes = rna_data.get("genes", []) + if not related_genes: + gene_name_temp = rna_data.get("gene_name", None) + if gene_name_temp: + related_genes = [gene_name_temp] + + # Extract gene name (prefer from xrefs, fallback to main data) + gene_name = extracted_info.get("gene_name") + if not gene_name: + gene_name = rna_data.get("gene_name", None) + if not gene_name: + gene_name = rna_data.get("gene", None) + + # Extract so_term (prefer from xrefs, fallback to main data) + so_term = extracted_info.get("so_term") + if not so_term: + so_term = rna_data.get("so_term", None) + + # Extract modifications (prefer from xrefs, fallback to main data) + modifications = extracted_info.get("modifications") + if not modifications: + modifications = rna_data.get("modifications", None) + + # Build result dictionary (xrefs information is already extracted into other fields) + # information is extracted into organism, gene_name, so_term, modifications, etc. return { "molecule_type": "RNA", "database": "RNAcentral", @@ -55,6 +199,11 @@ def _rna_data_to_dict(rna_id: str, rna_data: dict) -> dict: "rna_type": rna_data.get("rna_type", "N/A"), "description": rna_data.get("description", "N/A"), "url": f"https://rnacentral.org/rna/{rna_id}", + "organism": organism, + "related_genes": related_genes if related_genes else None, + "gene_name": gene_name, + "so_term": so_term, + "modifications": modifications, } async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: @@ -71,7 +220,19 @@ async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: ) as resp: if resp.status == 200: rna_data = await resp.json() - return self._rna_data_to_dict(rna_id, rna_data) + + # Check if xrefs is a URL and fetch the actual xrefs data + xrefs_data = None + xrefs_url = rna_data.get("xrefs") + if xrefs_url and isinstance(xrefs_url, str) and xrefs_url.startswith("http"): + try: + xrefs_data = await self._fetch_all_xrefs(xrefs_url, session) + logger.debug("Fetched %d xrefs for RNA ID %s", len(xrefs_data), rna_id) + except Exception as e: + logger.warning("Failed to fetch xrefs for RNA ID %s: %s", rna_id, e) + # Continue without xrefs data + + return self._rna_data_to_dict(rna_id, rna_data, xrefs_data) if resp.status == 404: logger.error("RNA ID %s not found", rna_id) return None @@ -86,8 +247,9 @@ async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: async def get_best_hit(self, keyword: str) -> Optional[dict]: """ Search RNAcentral with a keyword and return the best hit. 
+ Unified approach: Find RNA ID from search, then call get_by_rna_id() for complete information. :param keyword: The search keyword (e.g., miRNA name, RNA name). - :return: A dictionary containing the best hit information or None if not found. + :return: A dictionary containing complete RNA information or None if not found. """ if not keyword.strip(): return None @@ -106,16 +268,20 @@ async def get_best_hit(self, keyword: str) -> Optional[dict]: search_results = await resp.json() results = search_results.get("results", []) if results: - # Use the first result directly (search API already returns enough info) + # Step 1: Get RNA ID from search results first_result = results[0] rna_id = first_result.get("rnacentral_id") + if rna_id: - # Try to get detailed info, but fall back to search result if it fails - detailed_info = await self.get_by_rna_id(rna_id) - if detailed_info: - return detailed_info - # Fall back to using search result data - return self._rna_data_to_dict(rna_id, first_result) + # Step 2: Unified call to get_by_rna_id() for complete information + result = await self.get_by_rna_id(rna_id) + + # Step 3: If get_by_rna_id() failed, use search result data as fallback + if not result: + logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) + result = self._rna_data_to_dict(rna_id, first_result) + + return result logger.info("No results found for keyword: %s", keyword) return None error_text = await resp.text() @@ -131,8 +297,9 @@ async def get_best_hit(self, keyword: str) -> Optional[dict]: async def search_by_sequence(self, sequence: str) -> Optional[dict]: """ Search RNAcentral with an RNA sequence. + Unified approach: Find RNA ID from sequence search, then call get_by_rna_id() for complete information. :param sequence: RNA sequence (FASTA format or raw sequence). - :return: A dictionary containing the best hit information or None if not found. + :return: A dictionary containing complete RNA information or None if not found. 
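A compact sketch of the pagination pattern _fetch_all_xrefs relies on: keep following the "next" URL until the API stops returning one. The endpoint shape mirrors the RNAcentral v1 API used above; the example ID is a placeholder:

    import asyncio

    import aiohttp

    async def fetch_all_pages(url: str) -> list:
        results = []
        async with aiohttp.ClientSession() as session:
            while url:
                async with session.get(
                    url,
                    headers={"Accept": "application/json"},
                    timeout=aiohttp.ClientTimeout(total=30),
                ) as resp:
                    resp.raise_for_status()
                    data = await resp.json()
                    results.extend(data.get("results", []))
                    url = data.get("next")  # None on the last page
                await asyncio.sleep(0.2)  # stay polite to the API
        return results

    # asyncio.run(fetch_all_pages("https://rnacentral.org/api/v1/rna/<URS_id>/xrefs"))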
""" try: # Extract sequence (if in FASTA format) @@ -165,7 +332,7 @@ async def search_by_sequence(self, sequence: str) -> Optional[dict]: search_results = await resp.json() results = search_results.get("results", []) if results: - # First, try to find an exact sequence match + # Step 1: Find best match (prefer exact match) exact_match = None for result in results: result_seq = result.get("sequence", "") @@ -178,16 +345,15 @@ async def search_by_sequence(self, sequence: str) -> Optional[dict]: rna_id = target_result.get("rnacentral_id") if rna_id: - # Try to get detailed info, but fall back to search result if it fails - try: - detailed_info = await self.get_by_rna_id(rna_id) - if detailed_info: - return detailed_info - except Exception as e: - logger.debug("Failed to get detailed info for %s: %s, using search result", rna_id, e) + # Step 2: Unified call to get_by_rna_id() for complete information + result = await self.get_by_rna_id(rna_id) + + # Step 3: If get_by_rna_id() failed, use search result data as fallback + if not result: + logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) + result = self._rna_data_to_dict(rna_id, target_result) - # Fall back to using search result data - return self._rna_data_to_dict(rna_id, target_result) + return result logger.info("No results found for sequence.") return None error_text = await resp.text() From 8678e33161f969f948b72d61108440409f42fdf1 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 27 Nov 2025 20:34:33 +0800 Subject: [PATCH 09/22] fix: fix pylint style issues --- graphgen/models/searcher/db/ncbi_searcher.py | 41 +++++++------ .../models/searcher/db/rnacentral_searcher.py | 58 +++++++++---------- 2 files changed, 49 insertions(+), 50 deletions(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index cc78fa1c..49d0f901 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -33,7 +33,7 @@ class NCBISearch(BaseSearcher): 1) Get the gene/DNA by accession number or gene ID. 2) Search with keywords or gene names (fuzzy search). 3) Search with FASTA sequence (BLAST search for DNA sequences). - + API Documentation: https://www.ncbi.nlm.nih.gov/home/develop/api/ Note: NCBI has rate limits (max 3 requests per second), delays are required between requests. 
""" @@ -49,12 +49,11 @@ def _safe_get(obj, key, default=None): """Safely get value from dict or StringElement-like object.""" if isinstance(obj, dict): return obj.get(key, default) - elif hasattr(obj, "get"): + if hasattr(obj, "get"): return obj.get(key, default) - elif hasattr(obj, key): + if hasattr(obj, key): return getattr(obj, key, default) - else: - return default + return default @staticmethod def _gene_record_to_dict(gene_record, gene_id: str) -> dict: @@ -68,7 +67,7 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict: raise ValueError("Empty gene record") gene_data = gene_record[0] - + # Safely extract gene_ref, handling both dict and StringElement types gene_ref = {} entrezgene_gene = gene_data.get("Entrezgene_gene") @@ -146,7 +145,7 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict: # Note: Entrezgene_location doesn't exist, but Entrezgene_locus contains location info chromosome = None genomic_location = None - + try: locus_data = gene_data.get("Entrezgene_locus") if locus_data and isinstance(locus_data, list) and locus_data: @@ -159,7 +158,7 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict: match = re.search(r'Chromosome\s+(\S+)', str(label)) if match: chromosome = match.group(1) - + # Extract genomic location from Gene-commentary_seqs seqs = NCBISearch._safe_get(first_locus, "Gene-commentary_seqs", []) if seqs and isinstance(seqs, list) and seqs: @@ -255,7 +254,7 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict: reraise=True, before_sleep=before_sleep_log(logger, logging.WARNING), ) - + def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]: """ Get gene information by Gene ID. @@ -272,7 +271,7 @@ def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None if not gene_record: return None result = self._gene_record_to_dict(gene_record, gene_id) - + # Try to get sequence from accession # Priority: 1) preferred_accession (if provided), 2) representative mRNA accession accession_to_use = preferred_accession or result.get("_representative_accession") @@ -292,7 +291,7 @@ def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None seq_lines = sequence_data.strip().split("\n") header = seq_lines[0] if seq_lines else "" sequence = "".join(seq_lines[1:]) - + # Get summary for additional info time.sleep(0.35) summary_handle = Entrez.esummary(db="nuccore", id=accession_to_use) @@ -301,7 +300,7 @@ def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None if summary: summary_data = summary[0] title = summary_data.get("Title", header) - + # Determine molecule type detail molecule_type_detail = "N/A" if accession_to_use.startswith("NM_") or accession_to_use.startswith("XM_"): @@ -312,13 +311,13 @@ def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None molecule_type_detail = "RNA" elif accession_to_use.startswith("NG_"): molecule_type_detail = "genomic region" - + # Merge sequence information into result result["sequence"] = sequence result["sequence_length"] = len(sequence) result["title"] = title result["molecule_type_detail"] = molecule_type_detail - + # Update chromosome and genomic_location if not already set if not result.get("chromosome"): chromosome = summary_data.get("ChrLoc") or summary_data.get("ChrAccVer") @@ -337,9 +336,9 @@ def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None # Re-raise to allow retry mechanism raise except Exception as e: - logger.debug("Failed to 
get sequence for accession %s: %s", + logger.debug("Failed to get sequence for accession %s: %s", accession_to_use, e) - + # Remove internal field result.pop("_representative_accession", None) return result @@ -368,7 +367,7 @@ def get_by_accession(self, accession: str) -> Optional[dict]: gene_id = None try: links = Entrez.read(link_handle) - + # Extract GeneID from elink results # Structure: links[0]["LinkSetDb"][0]["Link"][0]["Id"] if links and len(links) > 0: @@ -401,19 +400,19 @@ def get_by_accession(self, accession: str) -> Optional[dict]: # Continue to check if we got gene_id before the error finally: link_handle.close() - + # Step 2: If we have a GeneID, get complete information from Gene database # Pass the accession as preferred_accession so get_by_gene_id can use it for sequence if gene_id: result = self.get_by_gene_id(gene_id, preferred_accession=accession) - + # Update id to accession for consistency (user searched by accession) if result: result["id"] = accession result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}" - + return result - + # Step 3: If no GeneID, this is a rare case (accession without associated gene) # Return None - we can't provide complete information without Gene ID logger.warning("Accession %s has no associated GeneID, cannot provide complete information", accession) diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index 80cb4428..89b430ac 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -19,7 +19,7 @@ class RNACentralSearch(BaseSearcher): 1) Get RNA by RNAcentral ID. 2) Search with keywords or RNA names (fuzzy search). 3) Search with RNA sequence. - + API Documentation: https://rnacentral.org/api/v1 """ @@ -37,7 +37,7 @@ async def _fetch_all_xrefs(self, xrefs_url: str, session: aiohttp.ClientSession) """ all_xrefs = [] current_url = xrefs_url - + while current_url: try: async with session.get( @@ -47,12 +47,12 @@ async def _fetch_all_xrefs(self, xrefs_url: str, session: aiohttp.ClientSession) data = await resp.json() results = data.get("results", []) all_xrefs.extend(results) - + # Check if there's a next page current_url = data.get("next") if not current_url: break - + # Small delay to avoid rate limiting await asyncio.sleep(0.2) else: @@ -61,7 +61,7 @@ async def _fetch_all_xrefs(self, xrefs_url: str, session: aiohttp.ClientSession) except Exception as e: logger.warning("Error fetching xrefs from %s: %s", current_url, e) break - + return all_xrefs @staticmethod @@ -78,31 +78,31 @@ def _extract_info_from_xrefs(xrefs: List[Dict]) -> Dict[str, Any]: "so_terms": set(), "xrefs_list": [], } - + for xref in xrefs: # Extract accession information accession = xref.get("accession", {}) - + # Extract species information species = accession.get("species") if species: extracted["organisms"].add(species) - + # Extract gene name gene = accession.get("gene") if gene and gene.strip(): # Only add non-empty genes extracted["gene_names"].add(gene.strip()) - + # Extract modifications modifications = xref.get("modifications", []) if modifications: extracted["modifications"].extend(modifications) - + # Extract SO term (biotype) biotype = accession.get("biotype") if biotype: extracted["so_terms"].add(biotype) - + # Build xrefs list xref_info = { "database": xref.get("database"), @@ -113,24 +113,24 @@ def _extract_info_from_xrefs(xrefs: List[Dict]) -> Dict[str, Any]: "gene": gene, } extracted["xrefs_list"].append(xref_info) - 
+ # Convert sets to appropriate formats return { "organism": ( - list(extracted["organisms"])[0] - if len(extracted["organisms"]) == 1 + list(extracted["organisms"])[0] + if len(extracted["organisms"]) == 1 else (", ".join(extracted["organisms"]) if extracted["organisms"] else None) ), "gene_name": ( - list(extracted["gene_names"])[0] - if len(extracted["gene_names"]) == 1 + list(extracted["gene_names"])[0] + if len(extracted["gene_names"]) == 1 else (", ".join(extracted["gene_names"]) if extracted["gene_names"] else None) ), "related_genes": list(extracted["gene_names"]) if extracted["gene_names"] else None, "modifications": extracted["modifications"] if extracted["modifications"] else None, "so_term": ( - list(extracted["so_terms"])[0] - if len(extracted["so_terms"]) == 1 + list(extracted["so_terms"])[0] + if len(extracted["so_terms"]) == 1 else (", ".join(extracted["so_terms"]) if extracted["so_terms"] else None) ), "xrefs": extracted["xrefs_list"] if extracted["xrefs_list"] else None, @@ -146,12 +146,12 @@ def _rna_data_to_dict(rna_id: str, rna_data: dict, xrefs_data: Optional[List[Dic :return: A dictionary containing RNA information. """ sequence = rna_data.get("sequence", "") - + # Initialize extracted info from xrefs if available extracted_info = {} if xrefs_data: extracted_info = RNACentralSearch._extract_info_from_xrefs(xrefs_data) - + # Extract organism information (prefer from xrefs, fallback to main data) organism = extracted_info.get("organism") if not organism: @@ -220,7 +220,7 @@ async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: ) as resp: if resp.status == 200: rna_data = await resp.json() - + # Check if xrefs is a URL and fetch the actual xrefs data xrefs_data = None xrefs_url = rna_data.get("xrefs") @@ -231,7 +231,7 @@ async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: except Exception as e: logger.warning("Failed to fetch xrefs for RNA ID %s: %s", rna_id, e) # Continue without xrefs data - + return self._rna_data_to_dict(rna_id, rna_data, xrefs_data) if resp.status == 404: logger.error("RNA ID %s not found", rna_id) @@ -271,16 +271,16 @@ async def get_best_hit(self, keyword: str) -> Optional[dict]: # Step 1: Get RNA ID from search results first_result = results[0] rna_id = first_result.get("rnacentral_id") - + if rna_id: # Step 2: Unified call to get_by_rna_id() for complete information result = await self.get_by_rna_id(rna_id) - + # Step 3: If get_by_rna_id() failed, use search result data as fallback if not result: logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) result = self._rna_data_to_dict(rna_id, first_result) - + return result logger.info("No results found for keyword: %s", keyword) return None @@ -339,20 +339,20 @@ async def search_by_sequence(self, sequence: str) -> Optional[dict]: if result_seq == seq: exact_match = result break - + # Use exact match if found, otherwise use first result target_result = exact_match if exact_match else results[0] rna_id = target_result.get("rnacentral_id") - + if rna_id: # Step 2: Unified call to get_by_rna_id() for complete information result = await self.get_by_rna_id(rna_id) - + # Step 3: If get_by_rna_id() failed, use search result data as fallback if not result: logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) result = self._rna_data_to_dict(rna_id, target_result) - + return result logger.info("No results found for sequence.") return None From 40ef49e9c3f55fa78256a938f628e817c042c7ff Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 
<2693275288@qq.com> Date: Fri, 28 Nov 2025 01:07:26 +0800 Subject: [PATCH 10/22] fix: reduce nested blocks and fix all pylint issues --- graphgen/models/searcher/db/ncbi_searcher.py | 522 ++++++++++--------- 1 file changed, 282 insertions(+), 240 deletions(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 49d0f901..cca38bca 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -56,51 +56,34 @@ def _safe_get(obj, key, default=None): return default @staticmethod - def _gene_record_to_dict(gene_record, gene_id: str) -> dict: - """ - Convert an Entrez gene record to a dictionary. - :param gene_record: The Entrez gene record (list from Entrez.read). - :param gene_id: The gene ID. - :return: A dictionary containing gene information. - """ - if not gene_record: - raise ValueError("Empty gene record") - - gene_data = gene_record[0] - - # Safely extract gene_ref, handling both dict and StringElement types - gene_ref = {} - entrezgene_gene = gene_data.get("Entrezgene_gene") + def _extract_gene_ref(entrezgene_gene): + """Extract gene_ref from entrezgene_gene.""" if isinstance(entrezgene_gene, dict): - gene_ref = entrezgene_gene.get("Gene-ref", {}) - elif hasattr(entrezgene_gene, "get"): - gene_ref = entrezgene_gene.get("Gene-ref", {}) - else: - # If it's a StringElement or other type, try to access as dict - try: - if hasattr(entrezgene_gene, "Gene-ref"): - gene_ref = getattr(entrezgene_gene, "Gene-ref", {}) - except Exception: - pass + return entrezgene_gene.get("Gene-ref", {}) + if hasattr(entrezgene_gene, "get"): + return entrezgene_gene.get("Gene-ref", {}) + try: + if hasattr(entrezgene_gene, "Gene-ref"): + return getattr(entrezgene_gene, "Gene-ref", {}) + except Exception: + pass + return {} - # Safely extract organism - organism = "N/A" + @staticmethod + def _extract_organism(entrezgene_source): + """Extract organism from entrezgene_source.""" try: - entrezgene_source = gene_data.get("Entrezgene_source") - if isinstance(entrezgene_source, dict): - biosource = entrezgene_source.get("BioSource", {}) - if isinstance(biosource, dict): - biosource_org = biosource.get("BioSource_org", {}) - if isinstance(biosource_org, dict): - org_ref = biosource_org.get("Org-ref", {}) - if isinstance(org_ref, dict): - organism = org_ref.get("Org-ref_taxname", "N/A") - elif hasattr(org_ref, "Org-ref_taxname"): - organism = getattr(org_ref, "Org-ref_taxname", "N/A") + biosource = NCBISearch._safe_get(entrezgene_source, "BioSource", {}) + biosource_org = NCBISearch._safe_get(biosource, "BioSource_org", {}) + org_ref = NCBISearch._safe_get(biosource_org, "Org-ref", {}) + return NCBISearch._safe_get(org_ref, "Org-ref_taxname", "N/A") except Exception as e: logger.debug("Error extracting organism: %s", e) + return "N/A" - # Extract gene synonyms - safely handle StringElement types + @staticmethod + def _extract_synonyms(gene_ref): + """Extract gene synonyms from gene_ref.""" gene_synonyms = [] try: gene_syn = gene_ref.get("Gene-ref_syn", []) if isinstance(gene_ref, dict) else [] @@ -111,120 +94,154 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict: elif isinstance(syn, str): gene_synonyms.append(syn) else: - # Handle StringElement or other types gene_synonyms.append(str(syn)) elif isinstance(gene_syn, str): gene_synonyms.append(gene_syn) - elif gene_syn: # Handle StringElement + elif gene_syn: gene_synonyms.append(str(gene_syn)) except Exception as e: logger.debug("Error extracting gene 
synonyms: %s", e) + return gene_synonyms - # Extract gene type - safely handle StringElement types - # Note: Entrezgene_type is a StringElement with numeric value (e.g., "6" for ncRNA) - gene_type = None + @staticmethod + def _extract_gene_type(gene_data): + """Extract gene type from gene_data.""" try: gene_type_data = gene_data.get("Entrezgene_type") - if gene_type_data: - type_value = str(gene_type_data) - # Map numeric values to type names (NCBI gene type codes) - type_mapping = { - "1": "protein-coding", - "2": "pseudo", - "3": "rRNA", - "4": "tRNA", - "5": "snRNA", - "6": "ncRNA", - "7": "other", - } - gene_type = type_mapping.get(type_value, f"type_{type_value}") + if not gene_type_data: + return None + type_value = str(gene_type_data) + type_mapping = { + "1": "protein-coding", + "2": "pseudo", + "3": "rRNA", + "4": "tRNA", + "5": "snRNA", + "6": "ncRNA", + "7": "other", + } + return type_mapping.get(type_value, f"type_{type_value}") except Exception as e: logger.debug("Error extracting gene type: %s", e) + return None - # Extract chromosome and genomic location from Entrezgene_locus - # Note: Entrezgene_location doesn't exist, but Entrezgene_locus contains location info - chromosome = None - genomic_location = None + @staticmethod + def _extract_chromosome(first_locus): + """Extract chromosome from first_locus.""" + label = NCBISearch._safe_get(first_locus, "Gene-commentary_label", "") + if not label or "Chromosome" not in str(label): + return None + match = re.search(r'Chromosome\s+(\S+)', str(label)) + return match.group(1) if match else None - try: - locus_data = gene_data.get("Entrezgene_locus") - if locus_data and isinstance(locus_data, list) and locus_data: - first_locus = locus_data[0] - if isinstance(first_locus, dict): - # Extract chromosome from Gene-commentary_label - # Example: "Chromosome 13 Reference RoL_Sarg_1.0" -> "13" - label = NCBISearch._safe_get(first_locus, "Gene-commentary_label", "") - if label and "Chromosome" in str(label): - match = re.search(r'Chromosome\s+(\S+)', str(label)) - if match: - chromosome = match.group(1) - - # Extract genomic location from Gene-commentary_seqs - seqs = NCBISearch._safe_get(first_locus, "Gene-commentary_seqs", []) - if seqs and isinstance(seqs, list) and seqs: - first_seq = seqs[0] - if isinstance(first_seq, dict): - seq_loc_int = NCBISearch._safe_get(first_seq, "Seq-loc_int", {}) - if seq_loc_int: - seq_interval = NCBISearch._safe_get(seq_loc_int, "Seq-interval", {}) - if seq_interval: - seq_from = NCBISearch._safe_get(seq_interval, "Seq-interval_from", "") - seq_to = NCBISearch._safe_get(seq_interval, "Seq-interval_to", "") - if seq_from and seq_to: - genomic_location = f"{seq_from}-{seq_to}" - except Exception as e: - logger.debug("Error extracting chromosome/location from gene record: %s", e) + @staticmethod + def _extract_genomic_location(first_locus): + """Extract genomic location from first_locus.""" + seqs = NCBISearch._safe_get(first_locus, "Gene-commentary_seqs", []) + if not seqs or not isinstance(seqs, list) or not seqs: + return None + first_seq = seqs[0] + if not isinstance(first_seq, dict): + return None + seq_loc_int = NCBISearch._safe_get(first_seq, "Seq-loc_int", {}) + if not seq_loc_int: + return None + seq_interval = NCBISearch._safe_get(seq_loc_int, "Seq-interval", {}) + if not seq_interval: + return None + seq_from = NCBISearch._safe_get(seq_interval, "Seq-interval_from", "") + seq_to = NCBISearch._safe_get(seq_interval, "Seq-interval_to", "") + if seq_from and seq_to: + return f"{seq_from}-{seq_to}" + 
return None + + @staticmethod + def _extract_location_info(locus_data): + """Extract chromosome and genomic location from locus data.""" + if not locus_data or not isinstance(locus_data, list) or not locus_data: + return None, None + first_locus = locus_data[0] + if not isinstance(first_locus, dict): + return None, None + chromosome = NCBISearch._extract_chromosome(first_locus) + genomic_location = NCBISearch._extract_genomic_location(first_locus) + return chromosome, genomic_location - # Extract gene functional description - # Note: Entrezgene_summary doesn't exist for most genes - # Try to extract from Entrezgene_comments if available - function = None + @staticmethod + def _extract_function_info(gene_data): + """Extract gene functional description.""" try: - # First try Entrezgene_summary (if exists) summary = gene_data.get("Entrezgene_summary") if summary: - function = str(summary) - else: - # Try to extract from Entrezgene_comments - comments_data = gene_data.get("Entrezgene_comments") - if comments_data and isinstance(comments_data, list): - for comment in comments_data: - if isinstance(comment, dict): - heading = NCBISearch._safe_get(comment, "Gene-commentary_heading", "") - # Look for function-related comments - if "function" in str(heading).lower() or "summary" in str(heading).lower(): - comment_text = NCBISearch._safe_get(comment, "Gene-commentary_comment", "") - if comment_text: - function = str(comment_text) - break + return str(summary) + comments_data = gene_data.get("Entrezgene_comments") + if not comments_data or not isinstance(comments_data, list): + return None + for comment in comments_data: + if not isinstance(comment, dict): + continue + heading = NCBISearch._safe_get(comment, "Gene-commentary_heading", "") + heading_lower = str(heading).lower() + if "function" not in heading_lower and "summary" not in heading_lower: + continue + comment_text = NCBISearch._safe_get(comment, "Gene-commentary_comment", "") + if comment_text: + return str(comment_text) + return None except Exception as e: logger.debug("Error extracting function: %s", e) + return None - # Try to extract representative mRNA accession from Entrezgene_locus for sequence retrieval + @staticmethod + def _extract_accession(locus_data): + """Extract representative mRNA accession from locus data.""" + if not locus_data or not isinstance(locus_data, list) or not locus_data: + return None + first_locus = locus_data[0] + if not isinstance(first_locus, dict): + return None + products = NCBISearch._safe_get(first_locus, "Gene-commentary_products", []) + if not products or not isinstance(products, list): + return None representative_accession = None - try: - if locus_data and isinstance(locus_data, list) and locus_data: - first_locus = locus_data[0] - if isinstance(first_locus, dict): - products = NCBISearch._safe_get(first_locus, "Gene-commentary_products", []) - if products and isinstance(products, list): - # Look for mRNA (type 3) or the first product - for product in products: - if isinstance(product, dict): - product_type = NCBISearch._safe_get(product, "Gene-commentary_type", "") - product_type_str = str(product_type) - # Type 3 is mRNA, prefer mRNA over other types - if product_type_str == "3" or (not representative_accession and product_type_str): - accession = NCBISearch._safe_get(product, "Gene-commentary_accession", "") - if accession: - representative_accession = str(accession) - if product_type_str == "3": # Found mRNA, use it - break - except Exception as e: - logger.debug("Error extracting representative 
accession: %s", e) + for product in products: + if not isinstance(product, dict): + continue + product_type = NCBISearch._safe_get(product, "Gene-commentary_type", "") + product_type_str = str(product_type) + if product_type_str == "3" or (not representative_accession and product_type_str): + accession = NCBISearch._safe_get(product, "Gene-commentary_accession", "") + if accession: + representative_accession = str(accession) + if product_type_str == "3": + break + return representative_accession + + @staticmethod + def _gene_record_to_dict(gene_record, gene_id: str) -> dict: + """ + Convert an Entrez gene record to a dictionary. + :param gene_record: The Entrez gene record (list from Entrez.read). + :param gene_id: The gene ID. + :return: A dictionary containing gene information. + """ + if not gene_record: + raise ValueError("Empty gene record") + + gene_data = gene_record[0] + locus_data = gene_data.get("Entrezgene_locus") + + # Extract information using helper methods + entrezgene_gene = gene_data.get("Entrezgene_gene") + gene_ref = NCBISearch._extract_gene_ref(entrezgene_gene) + organism = NCBISearch._extract_organism(gene_data.get("Entrezgene_source")) + gene_synonyms = NCBISearch._extract_synonyms(gene_ref) + gene_type = NCBISearch._extract_gene_type(gene_data) + chromosome, genomic_location = NCBISearch._extract_location_info(locus_data) + function = NCBISearch._extract_function_info(gene_data) + representative_accession = NCBISearch._extract_accession(locus_data) # Build result dictionary with all fields - # Include all fields that might be present in accession-based queries return { "molecule_type": "DNA", "database": "NCBI", @@ -247,6 +264,128 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict: "_representative_accession": representative_accession, } + def _fetch_sequence(self, accession: str): + """Fetch sequence from nuccore database using efetch.""" + time.sleep(0.35) # Comply with rate limit + seq_handle = Entrez.efetch( + db="nuccore", + id=accession, + rettype="fasta", + retmode="text", + ) + try: + sequence_data = seq_handle.read() + if not sequence_data: + return None, None + seq_lines = sequence_data.strip().split("\n") + header = seq_lines[0] if seq_lines else "" + sequence = "".join(seq_lines[1:]) + return sequence, header + finally: + seq_handle.close() + + def _fetch_summary(self, accession: str, default_header: str = ""): + """Fetch summary from nuccore database using esummary.""" + time.sleep(0.35) # Comply with rate limit + summary_handle = Entrez.esummary(db="nuccore", id=accession) + try: + summary = Entrez.read(summary_handle) + if not summary: + return None + summary_data = summary[0] + + # Determine molecule type detail + molecule_type_detail = "N/A" + if accession.startswith("NM_") or accession.startswith("XM_"): + molecule_type_detail = "mRNA" + elif accession.startswith("NC_") or accession.startswith("NT_"): + molecule_type_detail = "genomic DNA" + elif accession.startswith("NR_") or accession.startswith("XR_"): + molecule_type_detail = "RNA" + elif accession.startswith("NG_"): + molecule_type_detail = "genomic region" + + title = summary_data.get("Title", default_header) + chromosome = summary_data.get("ChrLoc") or summary_data.get("ChrAccVer") + chr_start = summary_data.get("ChrStart") + chr_stop = summary_data.get("ChrStop") + genomic_location = None + if chr_start and chr_stop: + genomic_location = f"{chr_start}-{chr_stop}" + + return { + "title": title, + "molecule_type_detail": molecule_type_detail, + "chromosome": chromosome, + 
"genomic_location": genomic_location, + } + finally: + summary_handle.close() + + def _extract_gene_id(self, link_handle): + """Extract GeneID from elink results.""" + try: + links = Entrez.read(link_handle) + if not links or len(links) == 0: + return None + + first_link = links[0] + if "LinkSetDb" not in first_link: + return None + + for link_set in first_link["LinkSetDb"]: + if link_set.get("DbTo") != "gene": + continue + + # Try Link structure first (most common) + links_in_set = link_set.get("Link", []) + if links_in_set and len(links_in_set) > 0: + first_link_item = links_in_set[0] + if isinstance(first_link_item, dict): + gene_id = str(first_link_item.get("Id", "")) + elif hasattr(first_link_item, "Id"): + gene_id = str(getattr(first_link_item, "Id", "")) + else: + gene_id = str(first_link_item) + if gene_id: + return gene_id + + # Fallback: Try IdList (if Link is not available) + id_list = link_set.get("IdList", []) + if id_list: + return str(id_list[0]) + + return None + except Exception as e: + logger.error("Error parsing elink result: %s", e) + import traceback + logger.debug(traceback.format_exc()) + return None + + def _extract_sequence(self, result: dict, accession: str): + """Enrich result dictionary with sequence and summary information from accession.""" + try: + sequence, header = self._fetch_sequence(accession) + if sequence: + result["sequence"] = sequence + result["sequence_length"] = len(sequence) + + summary_info = self._fetch_summary(accession, header or "") + if not summary_info: + return + + result["title"] = summary_info.get("title") + result["molecule_type_detail"] = summary_info.get("molecule_type_detail") + # Update chromosome and genomic_location if not already set + if not result.get("chromosome") and summary_info.get("chromosome"): + result["chromosome"] = summary_info["chromosome"] + if not result.get("genomic_location") and summary_info.get("genomic_location"): + result["genomic_location"] = summary_info["genomic_location"] + except (RequestException, IncompleteRead): + raise + except Exception as e: + logger.debug("Failed to get sequence for accession %s: %s", accession, e) + @retry( stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10), @@ -260,7 +399,7 @@ def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None Get gene information by Gene ID. This is the unified data source - all search methods eventually call this. :param gene_id: NCBI Gene ID. - :param preferred_accession: Optional accession to use for sequence retrieval if representative mRNA is not available. + :param preferred_accession: Optional accession to use for sequence retrieval. :return: A dictionary containing gene information or None if not found. 
""" try: @@ -273,71 +412,9 @@ def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None result = self._gene_record_to_dict(gene_record, gene_id) # Try to get sequence from accession - # Priority: 1) preferred_accession (if provided), 2) representative mRNA accession accession_to_use = preferred_accession or result.get("_representative_accession") if accession_to_use: - try: - # Get sequence info directly from nuccore database - time.sleep(0.35) - seq_handle = Entrez.efetch( - db="nuccore", - id=accession_to_use, - rettype="fasta", - retmode="text", - ) - try: - sequence_data = seq_handle.read() - if sequence_data: - seq_lines = sequence_data.strip().split("\n") - header = seq_lines[0] if seq_lines else "" - sequence = "".join(seq_lines[1:]) - - # Get summary for additional info - time.sleep(0.35) - summary_handle = Entrez.esummary(db="nuccore", id=accession_to_use) - try: - summary = Entrez.read(summary_handle) - if summary: - summary_data = summary[0] - title = summary_data.get("Title", header) - - # Determine molecule type detail - molecule_type_detail = "N/A" - if accession_to_use.startswith("NM_") or accession_to_use.startswith("XM_"): - molecule_type_detail = "mRNA" - elif accession_to_use.startswith("NC_") or accession_to_use.startswith("NT_"): - molecule_type_detail = "genomic DNA" - elif accession_to_use.startswith("NR_") or accession_to_use.startswith("XR_"): - molecule_type_detail = "RNA" - elif accession_to_use.startswith("NG_"): - molecule_type_detail = "genomic region" - - # Merge sequence information into result - result["sequence"] = sequence - result["sequence_length"] = len(sequence) - result["title"] = title - result["molecule_type_detail"] = molecule_type_detail - - # Update chromosome and genomic_location if not already set - if not result.get("chromosome"): - chromosome = summary_data.get("ChrLoc") or summary_data.get("ChrAccVer") - if chromosome: - result["chromosome"] = chromosome - if not result.get("genomic_location"): - chr_start = summary_data.get("ChrStart") - chr_stop = summary_data.get("ChrStop") - if chr_start and chr_stop: - result["genomic_location"] = f"{chr_start}-{chr_stop}" - finally: - summary_handle.close() - finally: - seq_handle.close() - except (RequestException, IncompleteRead): - # Re-raise to allow retry mechanism - raise - except Exception as e: - logger.debug("Failed to get sequence for accession %s: %s", - accession_to_use, e) + self._extract_sequence(result, accession_to_use) # Remove internal field result.pop("_representative_accession", None) @@ -364,58 +441,24 @@ def get_by_accession(self, accession: str) -> Optional[dict]: # Note: esummary for nuccore doesn't include GeneID, so we use elink instead time.sleep(0.35) link_handle = Entrez.elink(dbfrom="nuccore", db="gene", id=accession) - gene_id = None try: - links = Entrez.read(link_handle) - - # Extract GeneID from elink results - # Structure: links[0]["LinkSetDb"][0]["Link"][0]["Id"] - if links and len(links) > 0: - first_link = links[0] - if "LinkSetDb" in first_link: - for link_set in first_link["LinkSetDb"]: - if link_set.get("DbTo") == "gene": - # Try Link structure first (most common) - links_in_set = link_set.get("Link", []) - if links_in_set and len(links_in_set) > 0: - first_link_item = links_in_set[0] - if isinstance(first_link_item, dict): - gene_id = str(first_link_item.get("Id", "")) - elif hasattr(first_link_item, "Id"): - gene_id = str(getattr(first_link_item, "Id", "")) - else: - # Handle StringElement or other types - gene_id = str(first_link_item) 
- if gene_id: - break - # Fallback: Try IdList (if Link is not available) - id_list = link_set.get("IdList", []) - if id_list and not gene_id: - gene_id = str(id_list[0]) - break - except Exception as e: - logger.error("Error parsing elink result for accession %s: %s", accession, e) - import traceback - logger.debug(traceback.format_exc()) - # Continue to check if we got gene_id before the error + gene_id = self._extract_gene_id(link_handle) finally: link_handle.close() # Step 2: If we have a GeneID, get complete information from Gene database - # Pass the accession as preferred_accession so get_by_gene_id can use it for sequence if gene_id: result = self.get_by_gene_id(gene_id, preferred_accession=accession) - - # Update id to accession for consistency (user searched by accession) if result: result["id"] = accession result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}" - return result # Step 3: If no GeneID, this is a rare case (accession without associated gene) - # Return None - we can't provide complete information without Gene ID - logger.warning("Accession %s has no associated GeneID, cannot provide complete information", accession) + logger.warning( + "Accession %s has no associated GeneID, cannot provide complete information", + accession + ) return None except (RequestException, IncompleteRead): raise @@ -491,13 +534,12 @@ def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional else: seq = sequence.strip().replace(" ", "").replace("\n", "") - # Validate if it's a DNA sequence - if not re.fullmatch(r"[ATCGN\s]+", seq, re.I): - logger.error("Invalid DNA sequence provided.") - return None - - if not seq: - logger.error("Empty DNA sequence provided.") + # Validate sequence + if not seq or not re.fullmatch(r"[ATCGN\s]+", seq, re.I): + if not seq: + logger.error("Empty DNA sequence provided.") + else: + logger.error("Invalid DNA sequence provided.") return None # Use BLAST search (Note: requires network connection, may be slow) From 93826604581c0bcd5d28738add03cdac3babf704 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Sat, 29 Nov 2025 22:25:42 +0800 Subject: [PATCH 11/22] feat: add DNA RNA local blast --- graphgen/configs/search_dna_config.yaml | 2 + graphgen/configs/search_protein_config.yaml | 3 +- graphgen/configs/search_rna_config.yaml | 4 +- graphgen/models/searcher/db/ncbi_searcher.py | 66 ++++++- .../models/searcher/db/rnacentral_searcher.py | 73 ++++++- scripts/search/build_db/build_dna_blast_db.sh | 178 ++++++++++++++++++ .../search/build_db/build_protein_blast_db.sh | 56 ++++++ scripts/search/build_db/build_rna_blast_db.sh | 157 +++++++++++++++ 8 files changed, 529 insertions(+), 10 deletions(-) create mode 100755 scripts/search/build_db/build_dna_blast_db.sh create mode 100755 scripts/search/build_db/build_protein_blast_db.sh create mode 100755 scripts/search/build_db/build_rna_blast_db.sh diff --git a/graphgen/configs/search_dna_config.yaml b/graphgen/configs/search_dna_config.yaml index 95f8fc39..5245ea0c 100644 --- a/graphgen/configs/search_dna_config.yaml +++ b/graphgen/configs/search_dna_config.yaml @@ -12,4 +12,6 @@ pipeline: ncbi_params: email: test@example.com # NCBI requires an email address tool: GraphGen # tool name for NCBI API + use_local_blast: true # whether to use local blast for DNA search + local_blast_db: /your_path/refseq_241 # path to local BLAST database (without .nhr extension) diff --git a/graphgen/configs/search_protein_config.yaml b/graphgen/configs/search_protein_config.yaml index 
bb46d34c..bfbf84eb 100644 --- a/graphgen/configs/search_protein_config.yaml +++ b/graphgen/configs/search_protein_config.yaml @@ -11,4 +11,5 @@ pipeline: data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot uniprot_params: use_local_blast: true # whether to use local blast for uniprot search - local_blast_db: /your_path/uniprot_sprot + local_blast_db: /your_path/2024_01/uniprot_sprot # format: /path/to/${RELEASE}/uniprot_sprot + # options: uniprot_sprot (recommended, high quality), uniprot_trembl, or uniprot_${RELEASE} (merged database) diff --git a/graphgen/configs/search_rna_config.yaml b/graphgen/configs/search_rna_config.yaml index 3d051417..dae62ec2 100644 --- a/graphgen/configs/search_rna_config.yaml +++ b/graphgen/configs/search_rna_config.yaml @@ -10,5 +10,7 @@ pipeline: params: data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral rnacentral_params: - {} # RNAcentral doesn't require additional parameters currently + use_local_blast: true # whether to use local blast for RNA search + local_blast_db: /your_path/refseq_rna_241 # format: /path/to/refseq_rna_${RELEASE} + # can also use DNA database with RNA sequences (if already built) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index cca38bca..24a37e2b 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -1,6 +1,9 @@ import asyncio import logging +import os import re +import subprocess +import tempfile import time from concurrent.futures import ThreadPoolExecutor from functools import lru_cache @@ -38,11 +41,22 @@ class NCBISearch(BaseSearcher): Note: NCBI has rate limits (max 3 requests per second), delays are required between requests. """ - def __init__(self, email: str = "test@example.com", tool: str = "GraphGen"): + def __init__( + self, + email: str = "test@example.com", + tool: str = "GraphGen", + use_local_blast: bool = False, + local_blast_db: str = "nt_db", + ): super().__init__() Entrez.email = email Entrez.tool = tool Entrez.timeout = 60 # 60 seconds timeout + self.use_local_blast = use_local_blast + self.local_blast_db = local_blast_db + if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"): + logger.error("Local BLAST database files not found. Please check the path.") + self.use_local_blast = False @staticmethod def _safe_get(obj, key, default=None): @@ -518,10 +532,47 @@ def get_best_hit(self, keyword: str) -> Optional[dict]: logger.error("Keyword %s not found: %s", keyword, e) return None + def _local_blast(self, seq: str, threshold: float) -> Optional[str]: + """ + Perform local BLAST search using local BLAST database. + :param seq: The DNA sequence. + :param threshold: E-value threshold for BLAST search. + :return: The accession number of the best hit or None if not found. 
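+
+        Note (assumption): this relies on the `blastn` binary from NCBI BLAST+
+        being available on PATH and on `local_blast_db` pointing at a nucleotide
+        database built with `makeblastdb -dbtype nucl`
+        (see scripts/search/build_db/build_dna_blast_db.sh).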
+ """ + try: + with tempfile.NamedTemporaryFile( + mode="w+", suffix=".fa", delete=False + ) as tmp: + tmp.write(f">query\n{seq}\n") + tmp_name = tmp.name + + cmd = [ + "blastn", + "-db", + self.local_blast_db, + "-query", + tmp_name, + "-evalue", + str(threshold), + "-max_target_seqs", + "1", + "-outfmt", + "6 sacc", # only return accession + ] + logger.debug("Running local blastn: %s", " ".join(cmd)) + out = subprocess.check_output(cmd, text=True).strip() + os.remove(tmp_name) + if out: + return out.split("\n", maxsplit=1)[0] + return None + except Exception as exc: # pylint: disable=broad-except + logger.error("Local blastn failed: %s", exc) + return None + def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: """ Search NCBI with a DNA sequence using BLAST. - Note: This is a simplified version. For production, consider using local BLAST. + Tries local BLAST first if enabled, falls back to network BLAST. :param sequence: DNA sequence (FASTA format or raw sequence). :param threshold: E-value threshold for BLAST search. :return: A dictionary containing the best hit information or None if not found. @@ -542,7 +593,16 @@ def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional logger.error("Invalid DNA sequence provided.") return None - # Use BLAST search (Note: requires network connection, may be slow) + # Try local BLAST first if enabled + accession = None + if self.use_local_blast: + accession = self._local_blast(seq, threshold) + if accession: + logger.debug("Local BLAST found accession: %s", accession) + return self.get_by_accession(accession) + + # Fall back to network BLAST + logger.debug("Falling back to NCBIWWW.qblast.") logger.debug("Performing BLAST search for DNA sequence...") time.sleep(0.35) diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index 89b430ac..c31bd978 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -1,5 +1,8 @@ import asyncio +import os import re +import subprocess +import tempfile from typing import Dict, Optional, List, Any import aiohttp @@ -23,10 +26,15 @@ class RNACentralSearch(BaseSearcher): API Documentation: https://rnacentral.org/api/v1 """ - def __init__(self): + def __init__(self, use_local_blast: bool = False, local_blast_db: str = "rna_db"): super().__init__() self.base_url = "https://rnacentral.org/api/v1" self.headers = {"Accept": "application/json"} + self.use_local_blast = use_local_blast + self.local_blast_db = local_blast_db + if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"): + logger.error("Local BLAST database files not found. Please check the path.") + self.use_local_blast = False async def _fetch_all_xrefs(self, xrefs_url: str, session: aiohttp.ClientSession) -> List[Dict]: """ @@ -294,11 +302,50 @@ async def get_best_hit(self, keyword: str) -> Optional[dict]: logger.error("Keyword %s not found: %s", keyword, e) return None - async def search_by_sequence(self, sequence: str) -> Optional[dict]: + def _local_blast(self, seq: str, threshold: float) -> Optional[str]: + """ + Perform local BLAST search using local BLAST database. + :param seq: The RNA sequence. + :param threshold: E-value threshold for BLAST search. + :return: The accession/ID of the best hit or None if not found. 
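+
+        Note: hits from a local nucleotide database are typically RefSeq-style
+        accessions rather than RNAcentral URS IDs, so the caller may still need
+        to map the returned accession (see search_by_sequence below).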
+ """ + try: + with tempfile.NamedTemporaryFile( + mode="w+", suffix=".fa", delete=False + ) as tmp: + tmp.write(f">query\n{seq}\n") + tmp_name = tmp.name + + cmd = [ + "blastn", + "-db", + self.local_blast_db, + "-query", + tmp_name, + "-evalue", + str(threshold), + "-max_target_seqs", + "1", + "-outfmt", + "6 sacc", # only return accession + ] + logger.debug("Running local blastn for RNA: %s", " ".join(cmd)) + out = subprocess.check_output(cmd, text=True).strip() + os.remove(tmp_name) + if out: + return out.split("\n", maxsplit=1)[0] + return None + except Exception as exc: # pylint: disable=broad-except + logger.error("Local blastn failed: %s", exc) + return None + + async def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: """ Search RNAcentral with an RNA sequence. + Tries local BLAST first if enabled, falls back to RNAcentral API. Unified approach: Find RNA ID from sequence search, then call get_by_rna_id() for complete information. :param sequence: RNA sequence (FASTA format or raw sequence). + :param threshold: E-value threshold for BLAST search. :return: A dictionary containing complete RNA information or None if not found. """ try: @@ -318,7 +365,23 @@ async def search_by_sequence(self, sequence: str) -> Optional[dict]: logger.error("Empty RNA sequence provided.") return None - # RNAcentral API supports sequence search + # Try local BLAST first if enabled + if self.use_local_blast: + accession = self._local_blast(seq, threshold) + if accession: + logger.debug("Local BLAST found accession: %s", accession) + # Try to get RNA ID from accession (may need conversion) + # For now, try using accession as RNA ID or search by it + result = await self.get_by_rna_id(accession) + if result: + return result + # If not found by ID, try keyword search + result = await self.get_best_hit(accession) + if result: + return result + + # Fall back to RNAcentral API + logger.debug("Falling back to RNAcentral API.") async with aiohttp.ClientSession() as session: search_url = f"{self.base_url}/rna" params = {"sequence": seq, "format": "json"} @@ -373,7 +436,7 @@ async def search_by_sequence(self, sequence: str) -> Optional[dict]: reraise=True, ) async def search( - self, query: str, threshold: float = 0.7, **kwargs + self, query: str, threshold: float = 0.1, **kwargs ) -> Optional[Dict]: """ Search RNAcentral with either an RNAcentral ID, keyword, or RNA sequence. @@ -395,7 +458,7 @@ async def search( if query.startswith(">") or ( re.fullmatch(r"[AUCGN\s]+", query, re.I) and "U" in query.upper() ): - result = await self.search_by_sequence(query) + result = await self.search_by_sequence(query, threshold) # check if RNAcentral ID (typically starts with URS) elif re.fullmatch(r"URS\d+", query, re.I): result = await self.get_by_rna_id(query) diff --git a/scripts/search/build_db/build_dna_blast_db.sh b/scripts/search/build_db/build_dna_blast_db.sh new file mode 100755 index 00000000..b53b4249 --- /dev/null +++ b/scripts/search/build_db/build_dna_blast_db.sh @@ -0,0 +1,178 @@ +#!/bin/bash + +set -e + +# Downloads NCBI RefSeq nucleotide sequences and creates BLAST databases. 
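+#
+# Typical workflow (paths below are examples only; the script prints the exact
+# value to use when it finishes):
+#   ./build_dna_blast_db.sh representative
+#   # then, in graphgen/configs/search_dna_config.yaml:
+#   #   local_blast_db: /data/blast/refseq_<RELEASE>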
+
+#
+# RefSeq directory layout (organised by taxonomic category):
+# - vertebrate_mammalian (mammals)
+# - vertebrate_other (other vertebrates)
+# - bacteria
+# - archaea
+# - fungi
+# - invertebrate (invertebrates)
+# - plant (plants)
+# - viral (viruses)
+# - protozoa
+# - mitochondrion (mitochondria)
+# - plastid (plastids)
+# - plasmid (plasmids)
+# - other
+# - complete/ (complete genomes, covering all categories)
+#
+# Each category directory contains:
+# - {category}.{number}.genomic.fna.gz (genomic sequences)
+# - {category}.{number}.rna.fna.gz (RNA sequences)
+#
+# Usage: ./build_dna_blast_db.sh [representative|complete|all]
+#   representative: Download genomic sequences from major categories (recommended, smaller)
+#                   Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi
+#   complete: Download all complete genomic sequences from complete/ directory (very large)
+#   all: Download all genomic sequences from all categories (very large)
+#
+# We need makeblastdb on our PATH
+# For Ubuntu/Debian: sudo apt install ncbi-blast+
+# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+
+# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/
+
+DOWNLOAD_TYPE=${1:-representative}
+
+# Better to use a stable DOWNLOAD_TMP name to support resuming downloads
+DOWNLOAD_TMP=_downloading_dna
+mkdir -p ${DOWNLOAD_TMP}
+cd ${DOWNLOAD_TMP}
+
+# Download RefSeq release information
+echo "Downloading RefSeq release information..."
+wget -c "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER" || {
+    echo "Warning: Could not download RELEASE_NUMBER, using current date as release identifier"
+    RELEASE=$(date +%Y%m%d)
+}
+
+if [ -f "RELEASE_NUMBER" ]; then
+    RELEASE=$(cat RELEASE_NUMBER | tr -d '\n')
+    echo "RefSeq release: ${RELEASE}"
+else
+    RELEASE=$(date +%Y%m%d)
+    echo "Using date as release identifier: ${RELEASE}"
+fi
+
+# Download based on type
+case ${DOWNLOAD_TYPE} in
+    representative)
+        echo "Downloading RefSeq representative sequences (recommended, smaller size)..."
+        # Download major categories for representative coverage
+        # Note: You can modify this list based on your specific requirements
+        for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi; do
+            echo "Downloading ${category} sequences..."
+            curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \
+                grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \
+                sed 's/href="\(.*\)"/\1/' | \
+                while read filename; do
+                    echo "  Downloading ${filename}..."
+                    wget -c -q --show-progress \
+                        "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || {
+                        echo "Warning: Failed to download ${filename}"
+                    }
+                done
+        done
+        ;;
+    complete)
+        echo "Downloading RefSeq complete genomic sequences (WARNING: very large, may take hours)..."
+        curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/" | \
+            grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \
+            sed 's/href="\(.*\)"/\1/' | \
+            while read filename; do
+                echo "  Downloading ${filename}..."
+                wget -c -q --show-progress \
+                    "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/${filename}" || {
+                    echo "Warning: Failed to download ${filename}"
+                }
+            done
+        ;;
+    all)
+        echo "Downloading all RefSeq genomic sequences from all categories (WARNING: extremely large, may take many hours)..."
+        # Download genomic sequences from all categories
+        for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral protozoa mitochondrion plastid plasmid other; do
+            echo "Downloading ${category} genomic sequences..."
+ curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ + grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' | \ + while read filename; do + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + done + done + ;; + *) + echo "Error: Unknown download type '${DOWNLOAD_TYPE}'" + echo "Usage: $0 [representative|complete|all]" + echo "Note: For RNA sequences, use build_rna_blast_db.sh instead" + exit 1 + ;; +esac + +cd .. + +# Create release directory +mkdir -p refseq_${RELEASE} +mv ${DOWNLOAD_TMP}/* refseq_${RELEASE}/ 2>/dev/null || true +rmdir ${DOWNLOAD_TMP} 2>/dev/null || true + +cd refseq_${RELEASE} + +# Extract and combine sequences +echo "Extracting and combining sequences..." + +# Extract all downloaded genomic sequences +if [ $(find . -name "*.genomic.fna.gz" -type f | wc -l) -gt 0 ]; then + echo "Extracting genomic sequences..." + find . -name "*.genomic.fna.gz" -type f -exec gunzip {} \; +fi + +# Combine all FASTA files into one +echo "Combining all FASTA files..." +FASTA_FILES=$(find . -name "*.fna" -type f) +if [ -z "$FASTA_FILES" ]; then + FASTA_FILES=$(find . -name "*.fa" -type f) +fi + +if [ -z "$FASTA_FILES" ]; then + echo "Error: No FASTA files found to combine" + exit 1 +fi + +echo "$FASTA_FILES" | while read -r file; do + if [ -f "$file" ]; then + cat "$file" >> refseq_${RELEASE}.fasta + fi +done + +# Check if we have sequences +if [ ! -s "refseq_${RELEASE}.fasta" ]; then + echo "Error: Combined FASTA file is empty" + exit 1 +fi + +echo "Creating BLAST database..." +# Create BLAST database for DNA sequences (use -dbtype nucl for nucleotide) +makeblastdb -in refseq_${RELEASE}.fasta \ + -out refseq_${RELEASE} \ + -dbtype nucl \ + -parse_seqids \ + -title "RefSeq_${RELEASE}" + +echo "BLAST database created successfully!" +echo "Database location: $(pwd)/refseq_${RELEASE}" +echo "" +echo "To use this database, set in your config:" +echo " local_blast_db: $(pwd)/refseq_${RELEASE}" +echo "" +echo "Note: The database files are:" +ls -lh refseq_${RELEASE}.* + +cd .. + diff --git a/scripts/search/build_db/build_protein_blast_db.sh b/scripts/search/build_db/build_protein_blast_db.sh new file mode 100755 index 00000000..9292875a --- /dev/null +++ b/scripts/search/build_db/build_protein_blast_db.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +set -e + +# Downloads the latest release of UniProt, putting it in a release-specific directory. +# Creates associated BLAST databases. 
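+#
+# Typical workflow (path below is an example; the script prints the exact values
+# when it finishes):
+#   ./build_protein_blast_db.sh
+#   # then, in graphgen/configs/search_protein_config.yaml:
+#   #   local_blast_db: /data/blast/<RELEASE>/uniprot_sprot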
+# We need makeblastdb on our PATH +# For Ubuntu/Debian: sudo apt install ncbi-blast+ +# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ +# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ + +# Better to use a stable DOWNLOAD_TMP name to support resuming downloads +DOWNLOAD_TMP=_downloading +mkdir -p ${DOWNLOAD_TMP} +cd ${DOWNLOAD_TMP} + +wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/RELEASE.metalink" + +# Extract the release name (like 2017_10 or 2017_1) +# Use sed for cross-platform compatibility (works on both macOS and Linux) +RELEASE=$(sed -n 's/.*\([0-9]\{4\}_[0-9]\{1,2\}\)<\/version>.*/\1/p' RELEASE.metalink | head -1) + +wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" +wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" +wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt" +wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/README" +wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/LICENSE" + +cd .. + +mkdir ${RELEASE} +mv ${DOWNLOAD_TMP}/* ${RELEASE} +rmdir ${DOWNLOAD_TMP} + +cd ${RELEASE} + +gunzip uniprot_sprot.fasta.gz +gunzip uniprot_trembl.fasta.gz + +cat uniprot_sprot.fasta uniprot_trembl.fasta >uniprot_${RELEASE}.fasta + +makeblastdb -in uniprot_${RELEASE}.fasta -out uniprot_${RELEASE} -dbtype prot -parse_seqids -title uniprot_${RELEASE} +makeblastdb -in uniprot_sprot.fasta -out uniprot_sprot -dbtype prot -parse_seqids -title uniprot_sprot +makeblastdb -in uniprot_trembl.fasta -out uniprot_trembl -dbtype prot -parse_seqids -title uniprot_trembl + +cd .. + +echo "BLAST databases created successfully!" +echo "Database locations:" +echo " - Combined: $(pwd)/${RELEASE}/uniprot_${RELEASE}" +echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot" +echo " - TrEMBL: $(pwd)/${RELEASE}/uniprot_trembl" +echo "" +echo "To use these databases, set in your config:" +echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot # or uniprot_${RELEASE} or uniprot_trembl" + diff --git a/scripts/search/build_db/build_rna_blast_db.sh b/scripts/search/build_db/build_rna_blast_db.sh new file mode 100755 index 00000000..89b9dc0e --- /dev/null +++ b/scripts/search/build_db/build_rna_blast_db.sh @@ -0,0 +1,157 @@ +#!/bin/bash + +set -e + +# Downloads NCBI RefSeq RNA sequences and creates BLAST databases. +# This script specifically downloads RNA sequences (mRNA, rRNA, tRNA, etc.) +# from RefSeq, which is suitable for RNA sequence searches. 
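+# The resulting database is what `local_blast_db` in search_rna_config.yaml expects;
+# a database built by build_dna_blast_db.sh can also be reused, since both are
+# nucleotide (-dbtype nucl) databases searched with blastn.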
+# +# Usage: ./build_rna_blast_db.sh [representative|complete|all] +# representative: Download RNA sequences from major categories (recommended, smaller) +# Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi, invertebrate, plant, viral +# complete: Download all RNA sequences from complete/ directory (very large) +# all: Download all RNA sequences from all categories (very large) +# +# We need makeblastdb on our PATH +# For Ubuntu/Debian: sudo apt install ncbi-blast+ +# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ +# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ + +DOWNLOAD_TYPE=${1:-representative} + +# Better to use a stable DOWNLOAD_TMP name to support resuming downloads +DOWNLOAD_TMP=_downloading_rna +mkdir -p ${DOWNLOAD_TMP} +cd ${DOWNLOAD_TMP} + +# Download RefSeq release information +echo "Downloading RefSeq release information..." +wget -c "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER" || { + echo "Warning: Could not download RELEASE_NUMBER, using current date as release identifier" + RELEASE=$(date +%Y%m%d) +} + +if [ -f "RELEASE_NUMBER" ]; then + RELEASE=$(cat RELEASE_NUMBER | tr -d '\n') + echo "RefSeq release: ${RELEASE}" +else + RELEASE=$(date +%Y%m%d) + echo "Using date as release identifier: ${RELEASE}" +fi + +# Download based on type +case ${DOWNLOAD_TYPE} in + representative) + echo "Downloading RefSeq representative RNA sequences (recommended, smaller size)..." + echo "Downloading RNA sequences from major categories..." + for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral; do + echo "Downloading ${category} RNA sequences..." + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ + grep -oE 'href="[^"]*\.rna\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' | \ + while read filename; do + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + done + done + ;; + complete) + echo "Downloading RefSeq complete RNA sequences (WARNING: very large, may take hours)..." + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/" | \ + grep -oE 'href="[^"]*\.rna\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' | \ + while read filename; do + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + done + ;; + all) + echo "Downloading all RefSeq RNA sequences from all categories (WARNING: extremely large, may take many hours)..." + for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral protozoa mitochondrion plastid plasmid other; do + echo "Downloading ${category} RNA sequences..." + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ + grep -oE 'href="[^"]*\.rna\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' | \ + while read filename; do + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + done + done + ;; + *) + echo "Error: Unknown download type '${DOWNLOAD_TYPE}'" + echo "Usage: $0 [representative|complete|all]" + exit 1 + ;; +esac + +cd .. 
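+
+# At this point ${DOWNLOAD_TMP} holds the downloaded *.rna.fna.gz archives; the
+# steps below move them into a release directory, merge them into one FASTA file
+# and build the nucleotide BLAST database with makeblastdb.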
+ +# Create release directory +mkdir -p refseq_rna_${RELEASE} +mv ${DOWNLOAD_TMP}/* refseq_rna_${RELEASE}/ 2>/dev/null || true +rmdir ${DOWNLOAD_TMP} 2>/dev/null || true + +cd refseq_rna_${RELEASE} + +# Extract and combine sequences +echo "Extracting and combining RNA sequences..." + +# Extract all downloaded RNA sequences +if [ $(find . -name "*.rna.fna.gz" -type f | wc -l) -gt 0 ]; then + echo "Extracting RNA sequences..." + find . -name "*.rna.fna.gz" -type f -exec gunzip {} \; +fi + +# Combine all FASTA files into one +echo "Combining all FASTA files..." +FASTA_FILES=$(find . -name "*.fna" -type f) +if [ -z "$FASTA_FILES" ]; then + FASTA_FILES=$(find . -name "*.fa" -type f) +fi + +if [ -z "$FASTA_FILES" ]; then + echo "Error: No FASTA files found to combine" + exit 1 +fi + +echo "$FASTA_FILES" | while read -r file; do + if [ -f "$file" ]; then + cat "$file" >> refseq_rna_${RELEASE}.fasta + fi +done + +# Check if we have sequences +if [ ! -s "refseq_rna_${RELEASE}.fasta" ]; then + echo "Error: Combined FASTA file is empty" + exit 1 +fi + +echo "Creating BLAST database..." +# Create BLAST database for RNA sequences (use -dbtype nucl for nucleotide) +makeblastdb -in refseq_rna_${RELEASE}.fasta \ + -out refseq_rna_${RELEASE} \ + -dbtype nucl \ + -parse_seqids \ + -title "RefSeq_RNA_${RELEASE}" + +echo "BLAST database created successfully!" +echo "Database location: $(pwd)/refseq_rna_${RELEASE}" +echo "" +echo "To use this database, set in your config:" +echo " local_blast_db: $(pwd)/refseq_rna_${RELEASE}" +echo "" +echo "Note: The database files are:" +ls -lh refseq_rna_${RELEASE}.* + +cd .. + From 2a715de9a3578e366ce7ea62fcac733bb24ae6f8 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Sat, 29 Nov 2025 22:40:21 +0800 Subject: [PATCH 12/22] style: reduce return statements and branches in searcher methods - Refactor search_by_sequence in ncbi_searcher.py to reduce return statements from 7 to 1 - Refactor search_by_sequence in rnacentral_searcher.py to reduce return statements from 8 to 1 and branches from 16 to 12 - Extract helper methods to improve code readability and maintainability - Fix pylint errors R0911 (too-many-return-statements) and R0912 (too-many-branches) --- graphgen/models/searcher/db/ncbi_searcher.py | 120 ++++++++-------- .../models/searcher/db/rnacentral_searcher.py | 136 +++++++++--------- 2 files changed, 131 insertions(+), 125 deletions(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 24a37e2b..4558f75a 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -569,6 +569,45 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]: logger.error("Local blastn failed: %s", exc) return None + def _extract_and_normalize_sequence(self, sequence: str) -> Optional[str]: + """Extract and normalize DNA sequence from input.""" + if sequence.startswith(">"): + seq_lines = sequence.strip().split("\n") + seq = "".join(seq_lines[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + return seq if seq and re.fullmatch(r"[ATCGN\s]+", seq, re.I) else None + + def _process_network_blast_result(self, blast_record, seq: str, threshold: float) -> Optional[dict]: + """Process network BLAST result and return dictionary or None.""" + if not blast_record.alignments: + logger.info("No BLAST hits found for the given sequence.") + return None + + best_alignment = blast_record.alignments[0] + best_hsp = best_alignment.hsps[0] + 
if best_hsp.expect > threshold: + logger.info("No BLAST hits below the threshold E-value.") + return None + + hit_id = best_alignment.hit_id + accession_match = re.search(r"ref\|([^|]+)", hit_id) + if accession_match: + accession = accession_match.group(1).split(".")[0] + return self.get_by_accession(accession) + + # If unable to extract accession, return basic information + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": hit_id, + "title": best_alignment.title, + "sequence_length": len(seq), + "e_value": best_hsp.expect, + "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0, + "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}", + } + def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: """ Search NCBI with a DNA sequence using BLAST. @@ -577,77 +616,40 @@ def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional :param threshold: E-value threshold for BLAST search. :return: A dictionary containing the best hit information or None if not found. """ + result = None try: - # Extract sequence (if in FASTA format) - if sequence.startswith(">"): - seq_lines = sequence.strip().split("\n") - seq = "".join(seq_lines[1:]) - else: - seq = sequence.strip().replace(" ", "").replace("\n", "") - - # Validate sequence - if not seq or not re.fullmatch(r"[ATCGN\s]+", seq, re.I): - if not seq: - logger.error("Empty DNA sequence provided.") - else: - logger.error("Invalid DNA sequence provided.") + seq = self._extract_and_normalize_sequence(sequence) + if not seq: + logger.error("Empty or invalid DNA sequence provided.") return None # Try local BLAST first if enabled - accession = None if self.use_local_blast: accession = self._local_blast(seq, threshold) if accession: logger.debug("Local BLAST found accession: %s", accession) - return self.get_by_accession(accession) - - # Fall back to network BLAST - logger.debug("Falling back to NCBIWWW.qblast.") - logger.debug("Performing BLAST search for DNA sequence...") - time.sleep(0.35) - - result_handle = NCBIWWW.qblast( - program="blastn", - database="nr", - sequence=seq, - hitlist_size=1, - expect=threshold, - ) - blast_record = NCBIXML.read(result_handle) - - if not blast_record.alignments: - logger.info("No BLAST hits found for the given sequence.") - return None - - best_alignment = blast_record.alignments[0] - best_hsp = best_alignment.hsps[0] - if best_hsp.expect > threshold: - logger.info("No BLAST hits below the threshold E-value.") - return None - hit_id = best_alignment.hit_id - - # Extract accession number - # Format may be: gi|123456|ref|NM_000546.5| - accession_match = re.search(r"ref\|([^|]+)", hit_id) - if accession_match: - accession = accession_match.group(1).split(".")[0] - return self.get_by_accession(accession) - # If unable to extract accession, return basic information - return { - "molecule_type": "DNA", - "database": "NCBI", - "id": hit_id, - "title": best_alignment.title, - "sequence_length": len(seq), - "e_value": best_hsp.expect, - "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0, - "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}", - } + result = self.get_by_accession(accession) + + # Fall back to network BLAST if local BLAST didn't find result + if not result: + logger.debug("Falling back to NCBIWWW.qblast.") + logger.debug("Performing BLAST search for DNA sequence...") + time.sleep(0.35) + + result_handle = NCBIWWW.qblast( + program="blastn", + database="nr", + 
sequence=seq, + hitlist_size=1, + expect=threshold, + ) + blast_record = NCBIXML.read(result_handle) + result = self._process_network_blast_result(blast_record, seq, threshold) except RequestException: raise except Exception as e: # pylint: disable=broad-except logger.error("BLAST search failed: %s", e) - return None + return result @retry( stop=stop_after_attempt(5), diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index c31bd978..5950a3e7 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -339,6 +339,49 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]: logger.error("Local blastn failed: %s", exc) return None + @staticmethod + def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: + """Extract and normalize RNA sequence from input.""" + if sequence.startswith(">"): + seq_lines = sequence.strip().split("\n") + seq = "".join(seq_lines[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + return seq if seq and re.fullmatch(r"[AUCGN\s]+", seq, re.I) else None + + def _find_best_match_from_results(self, results: List[Dict], seq: str) -> Optional[Dict]: + """Find best match from search results, preferring exact match.""" + exact_match = None + for result_item in results: + result_seq = result_item.get("sequence", "") + if result_seq == seq: + exact_match = result_item + break + return exact_match if exact_match else (results[0] if results else None) + + async def _process_api_search_results( + self, results: List[Dict], seq: str + ) -> Optional[dict]: + """Process API search results and return dictionary or None.""" + if not results: + logger.info("No results found for sequence.") + return None + + target_result = self._find_best_match_from_results(results, seq) + if not target_result: + return None + + rna_id = target_result.get("rnacentral_id") + if not rna_id: + return None + + # Try to get complete information + result = await self.get_by_rna_id(rna_id) + if not result: + logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) + result = self._rna_data_to_dict(rna_id, target_result) + return result + async def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: """ Search RNAcentral with an RNA sequence. @@ -348,21 +391,11 @@ async def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Op :param threshold: E-value threshold for BLAST search. :return: A dictionary containing complete RNA information or None if not found. 
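+        Note: `threshold` is only applied to the local BLAST step; the RNAcentral
+        API fallback searches by sequence without an E-value cutoff.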
""" + result = None try: - # Extract sequence (if in FASTA format) - if sequence.startswith(">"): - seq_lines = sequence.strip().split("\n") - seq = "".join(seq_lines[1:]) - else: - seq = sequence.strip().replace(" ", "").replace("\n", "") - - # Validate if it's an RNA sequence (contains U instead of T) - if not re.fullmatch(r"[AUCGN\s]+", seq, re.I): - logger.error("Invalid RNA sequence provided.") - return None - + seq = self._extract_and_normalize_sequence(sequence) if not seq: - logger.error("Empty RNA sequence provided.") + logger.error("Empty or invalid RNA sequence provided.") return None # Try local BLAST first if enabled @@ -370,64 +403,35 @@ async def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Op accession = self._local_blast(seq, threshold) if accession: logger.debug("Local BLAST found accession: %s", accession) - # Try to get RNA ID from accession (may need conversion) - # For now, try using accession as RNA ID or search by it result = await self.get_by_rna_id(accession) - if result: - return result - # If not found by ID, try keyword search - result = await self.get_best_hit(accession) - if result: - return result - - # Fall back to RNAcentral API - logger.debug("Falling back to RNAcentral API.") - async with aiohttp.ClientSession() as session: - search_url = f"{self.base_url}/rna" - params = {"sequence": seq, "format": "json"} - async with session.get( - search_url, - params=params, - headers=self.headers, - timeout=aiohttp.ClientTimeout(total=60), # Sequence search may take longer - ) as resp: - if resp.status == 200: - search_results = await resp.json() - results = search_results.get("results", []) - if results: - # Step 1: Find best match (prefer exact match) - exact_match = None - for result in results: - result_seq = result.get("sequence", "") - if result_seq == seq: - exact_match = result - break - - # Use exact match if found, otherwise use first result - target_result = exact_match if exact_match else results[0] - rna_id = target_result.get("rnacentral_id") - - if rna_id: - # Step 2: Unified call to get_by_rna_id() for complete information - result = await self.get_by_rna_id(rna_id) - - # Step 3: If get_by_rna_id() failed, use search result data as fallback - if not result: - logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) - result = self._rna_data_to_dict(rna_id, target_result) - - return result - logger.info("No results found for sequence.") - return None - error_text = await resp.text() - logger.error("HTTP %d error for sequence search: %s", resp.status, error_text[:200]) - raise Exception(f"HTTP {resp.status}: {error_text}") + if not result: + result = await self.get_best_hit(accession) + + # Fall back to RNAcentral API if local BLAST didn't find result + if not result: + logger.debug("Falling back to RNAcentral API.") + async with aiohttp.ClientSession() as session: + search_url = f"{self.base_url}/rna" + params = {"sequence": seq, "format": "json"} + async with session.get( + search_url, + params=params, + headers=self.headers, + timeout=aiohttp.ClientTimeout(total=60), # Sequence search may take longer + ) as resp: + if resp.status == 200: + search_results = await resp.json() + results = search_results.get("results", []) + result = await self._process_api_search_results(results, seq) + else: + error_text = await resp.text() + logger.error("HTTP %d error for sequence search: %s", resp.status, error_text[:200]) + raise Exception(f"HTTP {resp.status}: {error_text}") except aiohttp.ClientError as e: 
logger.error("Network error searching for sequence: %s", e) - return None except Exception as e: # pylint: disable=broad-except logger.error("Sequence search failed: %s", e) - return None + return result @retry( stop=stop_after_attempt(3), From b48930af059bbfbd3f307a892800de05f74aa1d1 Mon Sep 17 00:00:00 2001 From: chenzihong <522023320011@smail.nju.edu.cn> Date: Sun, 30 Nov 2025 17:44:50 +0800 Subject: [PATCH 13/22] perf: optimize code style and search efficiency --- graphgen/models/searcher/db/ncbi_searcher.py | 764 ++++++------------- 1 file changed, 223 insertions(+), 541 deletions(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 4558f75a..946e3c1f 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -1,16 +1,15 @@ import asyncio -import logging import os import re import subprocess import tempfile -import time from concurrent.futures import ThreadPoolExecutor from functools import lru_cache from http.client import IncompleteRead from typing import Dict, Optional +from graphgen.models.searcher.limitter import RateLimiter -from Bio import Entrez +from Bio import Entrez, SeqIO from Bio.Blast import NCBIWWW, NCBIXML from requests.exceptions import RequestException from tenacity import ( @@ -18,7 +17,6 @@ retry_if_exception_type, stop_after_attempt, wait_exponential, - before_sleep_log, ) from graphgen.bases import BaseSearcher @@ -43,613 +41,317 @@ class NCBISearch(BaseSearcher): def __init__( self, - email: str = "test@example.com", - tool: str = "GraphGen", use_local_blast: bool = False, local_blast_db: str = "nt_db", + email: str = "email@example.com", + api_key: str = "", ): + """ + Initialize the NCBI Search client. + + Args: + use_local_blast (bool): Whether to use local BLAST database. + local_blast_db (str): Path to the local BLAST database. + email (str): Email address for NCBI API requests. + api_key (str): API key for NCBI API requests, see https://account.ncbi.nlm.nih.gov/settings/. + """ super().__init__() - Entrez.email = email - Entrez.tool = tool Entrez.timeout = 60 # 60 seconds timeout + Entrez.email = email + if api_key: + Entrez.api_key = api_key + Entrez.max_tries = 10 if api_key else 3 + Entrez.sleep_between_tries = 5 self.use_local_blast = use_local_blast self.local_blast_db = local_blast_db if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"): logger.error("Local BLAST database files not found. 
Please check the path.") self.use_local_blast = False + self.rate_limiter = RateLimiter() @staticmethod - def _safe_get(obj, key, default=None): - """Safely get value from dict or StringElement-like object.""" - if isinstance(obj, dict): - return obj.get(key, default) - if hasattr(obj, "get"): - return obj.get(key, default) - if hasattr(obj, key): - return getattr(obj, key, default) - return default - - @staticmethod - def _extract_gene_ref(entrezgene_gene): - """Extract gene_ref from entrezgene_gene.""" - if isinstance(entrezgene_gene, dict): - return entrezgene_gene.get("Gene-ref", {}) - if hasattr(entrezgene_gene, "get"): - return entrezgene_gene.get("Gene-ref", {}) - try: - if hasattr(entrezgene_gene, "Gene-ref"): - return getattr(entrezgene_gene, "Gene-ref", {}) - except Exception: - pass - return {} - - @staticmethod - def _extract_organism(entrezgene_source): - """Extract organism from entrezgene_source.""" - try: - biosource = NCBISearch._safe_get(entrezgene_source, "BioSource", {}) - biosource_org = NCBISearch._safe_get(biosource, "BioSource_org", {}) - org_ref = NCBISearch._safe_get(biosource_org, "Org-ref", {}) - return NCBISearch._safe_get(org_ref, "Org-ref_taxname", "N/A") - except Exception as e: - logger.debug("Error extracting organism: %s", e) - return "N/A" - - @staticmethod - def _extract_synonyms(gene_ref): - """Extract gene synonyms from gene_ref.""" - gene_synonyms = [] - try: - gene_syn = gene_ref.get("Gene-ref_syn", []) if isinstance(gene_ref, dict) else [] - if isinstance(gene_syn, list): - for syn in gene_syn: - if isinstance(syn, dict): - gene_synonyms.append(syn.get("Gene-ref_syn_E", "N/A")) - elif isinstance(syn, str): - gene_synonyms.append(syn) - else: - gene_synonyms.append(str(syn)) - elif isinstance(gene_syn, str): - gene_synonyms.append(gene_syn) - elif gene_syn: - gene_synonyms.append(str(gene_syn)) - except Exception as e: - logger.debug("Error extracting gene synonyms: %s", e) - return gene_synonyms - - @staticmethod - def _extract_gene_type(gene_data): - """Extract gene type from gene_data.""" - try: - gene_type_data = gene_data.get("Entrezgene_type") - if not gene_type_data: - return None - type_value = str(gene_type_data) - type_mapping = { - "1": "protein-coding", - "2": "pseudo", - "3": "rRNA", - "4": "tRNA", - "5": "snRNA", - "6": "ncRNA", - "7": "other", - } - return type_mapping.get(type_value, f"type_{type_value}") - except Exception as e: - logger.debug("Error extracting gene type: %s", e) - return None - - @staticmethod - def _extract_chromosome(first_locus): - """Extract chromosome from first_locus.""" - label = NCBISearch._safe_get(first_locus, "Gene-commentary_label", "") - if not label or "Chromosome" not in str(label): - return None - match = re.search(r'Chromosome\s+(\S+)', str(label)) - return match.group(1) if match else None - - @staticmethod - def _extract_genomic_location(first_locus): - """Extract genomic location from first_locus.""" - seqs = NCBISearch._safe_get(first_locus, "Gene-commentary_seqs", []) - if not seqs or not isinstance(seqs, list) or not seqs: - return None - first_seq = seqs[0] - if not isinstance(first_seq, dict): - return None - seq_loc_int = NCBISearch._safe_get(first_seq, "Seq-loc_int", {}) - if not seq_loc_int: - return None - seq_interval = NCBISearch._safe_get(seq_loc_int, "Seq-interval", {}) - if not seq_interval: - return None - seq_from = NCBISearch._safe_get(seq_interval, "Seq-interval_from", "") - seq_to = NCBISearch._safe_get(seq_interval, "Seq-interval_to", "") - if seq_from and seq_to: - return 
f"{seq_from}-{seq_to}" - return None - - @staticmethod - def _extract_location_info(locus_data): - """Extract chromosome and genomic location from locus data.""" - if not locus_data or not isinstance(locus_data, list) or not locus_data: - return None, None - first_locus = locus_data[0] - if not isinstance(first_locus, dict): - return None, None - chromosome = NCBISearch._extract_chromosome(first_locus) - genomic_location = NCBISearch._extract_genomic_location(first_locus) - return chromosome, genomic_location - - @staticmethod - def _extract_function_info(gene_data): - """Extract gene functional description.""" - try: - summary = gene_data.get("Entrezgene_summary") - if summary: - return str(summary) - comments_data = gene_data.get("Entrezgene_comments") - if not comments_data or not isinstance(comments_data, list): - return None - for comment in comments_data: - if not isinstance(comment, dict): - continue - heading = NCBISearch._safe_get(comment, "Gene-commentary_heading", "") - heading_lower = str(heading).lower() - if "function" not in heading_lower and "summary" not in heading_lower: - continue - comment_text = NCBISearch._safe_get(comment, "Gene-commentary_comment", "") - if comment_text: - return str(comment_text) - return None - except Exception as e: - logger.debug("Error extracting function: %s", e) - return None - - @staticmethod - def _extract_accession(locus_data): - """Extract representative mRNA accession from locus data.""" - if not locus_data or not isinstance(locus_data, list) or not locus_data: - return None - first_locus = locus_data[0] - if not isinstance(first_locus, dict): - return None - products = NCBISearch._safe_get(first_locus, "Gene-commentary_products", []) - if not products or not isinstance(products, list): - return None - representative_accession = None - for product in products: - if not isinstance(product, dict): - continue - product_type = NCBISearch._safe_get(product, "Gene-commentary_type", "") - product_type_str = str(product_type) - if product_type_str == "3" or (not representative_accession and product_type_str): - accession = NCBISearch._safe_get(product, "Gene-commentary_accession", "") - if accession: - representative_accession = str(accession) - if product_type_str == "3": - break - return representative_accession + def _nested_get(data: dict, *keys, default=None): + """Safely traverse nested dictionaries.""" + for key in keys: + if not isinstance(data, dict): + return default + data = data.get(key, default) + return data @staticmethod def _gene_record_to_dict(gene_record, gene_id: str) -> dict: """ Convert an Entrez gene record to a dictionary. - :param gene_record: The Entrez gene record (list from Entrez.read). - :param gene_id: The gene ID. - :return: A dictionary containing gene information. + All extraction logic is inlined for maximum clarity and performance. 
""" if not gene_record: raise ValueError("Empty gene record") - gene_data = gene_record[0] - locus_data = gene_data.get("Entrezgene_locus") + data = gene_record[0] + locus = (data.get("Entrezgene_locus") or [{}])[0] - # Extract information using helper methods - entrezgene_gene = gene_data.get("Entrezgene_gene") - gene_ref = NCBISearch._extract_gene_ref(entrezgene_gene) - organism = NCBISearch._extract_organism(gene_data.get("Entrezgene_source")) - gene_synonyms = NCBISearch._extract_synonyms(gene_ref) - gene_type = NCBISearch._extract_gene_type(gene_data) - chromosome, genomic_location = NCBISearch._extract_location_info(locus_data) - function = NCBISearch._extract_function_info(gene_data) - representative_accession = NCBISearch._extract_accession(locus_data) + # Extract common nested paths once + gene_ref = NCBISearch._nested_get(data, "Entrezgene_gene", "Gene-ref", default={}) + biosource = NCBISearch._nested_get(data, "Entrezgene_source", "BioSource", default={}) + + # Process synonyms + synonyms_raw = gene_ref.get("Gene-ref_syn", []) + gene_synonyms = [] + if isinstance(synonyms_raw, list): + for syn in synonyms_raw: + gene_synonyms.append(syn.get("Gene-ref_syn_E") if isinstance(syn, dict) else str(syn)) + elif synonyms_raw: + gene_synonyms.append(str(synonyms_raw)) + + # Extract location info + label = locus.get("Gene-commentary_label", "") + chromosome_match = re.search(r"Chromosome\s+(\S+)", str(label)) if label else None + + seq_interval = NCBISearch._nested_get( + locus, "Gene-commentary_seqs", 0, "Seq-loc_int", "Seq-interval", default={} + ) + genomic_location = ( + f"{seq_interval.get('Seq-interval_from')}-{seq_interval.get('Seq-interval_to')}" + if seq_interval.get('Seq-interval_from') and seq_interval.get('Seq-interval_to') + else None + ) + + # Extract representative accession + representative_accession = next( + ( + product.get("Gene-commentary_accession") + for product in locus.get("Gene-commentary_products", []) + if product.get("Gene-commentary_type") == "3" + ), + None, + ) + + # Extract function + function = data.get("Entrezgene_summary") or next( + ( + comment.get("Gene-commentary_comment") + for comment in data.get("Entrezgene_comments", []) + if isinstance(comment, dict) + and "function" in str(comment.get("Gene-commentary_heading", "")).lower() + ), + None, + ) - # Build result dictionary with all fields return { "molecule_type": "DNA", "database": "NCBI", "id": gene_id, - "gene_name": NCBISearch._safe_get(gene_ref, "Gene-ref_locus", "N/A"), - "gene_description": NCBISearch._safe_get(gene_ref, "Gene-ref_desc", "N/A"), - "organism": organism, + "gene_name": gene_ref.get("Gene-ref_locus", "N/A"), + "gene_description": gene_ref.get("Gene-ref_desc", "N/A"), + "organism": NCBISearch._nested_get( + biosource, "BioSource_org", "Org-ref", "Org-ref_taxname", default="N/A" + ), "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}", - "gene_synonyms": gene_synonyms if gene_synonyms else None, - "gene_type": gene_type, - "chromosome": chromosome, + "gene_synonyms": gene_synonyms or None, + "gene_type": { + "1": "protein-coding", + "2": "pseudo", + "3": "rRNA", + "4": "tRNA", + "5": "snRNA", + "6": "ncRNA", + "7": "other", + }.get(str(data.get("Entrezgene_type")), f"type_{data.get('Entrezgene_type')}"), + "chromosome": chromosome_match.group(1) if chromosome_match else None, "genomic_location": genomic_location, "function": function, - # Fields from accession-based queries (set to None initially, may be filled later) + # Fields from accession-based queries "title": None, 
"sequence": None, "sequence_length": None, - "gene_id": gene_id, # For consistency with accession queries + "gene_id": gene_id, "molecule_type_detail": None, "_representative_accession": representative_accession, } - def _fetch_sequence(self, accession: str): - """Fetch sequence from nuccore database using efetch.""" - time.sleep(0.35) # Comply with rate limit - seq_handle = Entrez.efetch( - db="nuccore", - id=accession, - rettype="fasta", - retmode="text", - ) - try: - sequence_data = seq_handle.read() - if not sequence_data: - return None, None - seq_lines = sequence_data.strip().split("\n") - header = seq_lines[0] if seq_lines else "" - sequence = "".join(seq_lines[1:]) - return sequence, header - finally: - seq_handle.close() - - def _fetch_summary(self, accession: str, default_header: str = ""): - """Fetch summary from nuccore database using esummary.""" - time.sleep(0.35) # Comply with rate limit - summary_handle = Entrez.esummary(db="nuccore", id=accession) - try: - summary = Entrez.read(summary_handle) - if not summary: - return None - summary_data = summary[0] - - # Determine molecule type detail - molecule_type_detail = "N/A" - if accession.startswith("NM_") or accession.startswith("XM_"): - molecule_type_detail = "mRNA" - elif accession.startswith("NC_") or accession.startswith("NT_"): - molecule_type_detail = "genomic DNA" - elif accession.startswith("NR_") or accession.startswith("XR_"): - molecule_type_detail = "RNA" - elif accession.startswith("NG_"): - molecule_type_detail = "genomic region" - - title = summary_data.get("Title", default_header) - chromosome = summary_data.get("ChrLoc") or summary_data.get("ChrAccVer") - chr_start = summary_data.get("ChrStart") - chr_stop = summary_data.get("ChrStop") - genomic_location = None - if chr_start and chr_stop: - genomic_location = f"{chr_start}-{chr_stop}" - - return { - "title": title, - "molecule_type_detail": molecule_type_detail, - "chromosome": chromosome, - "genomic_location": genomic_location, - } - finally: - summary_handle.close() - - def _extract_gene_id(self, link_handle): - """Extract GeneID from elink results.""" - try: - links = Entrez.read(link_handle) - if not links or len(links) == 0: - return None + def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]: + """Get gene information by Gene ID.""" + def _extract_from_genbank(result: dict, accession: str): + """Enrich result dictionary with sequence and summary information from accession.""" + with Entrez.efetch(db="nuccore", id=accession, rettype="gb", retmode="text") as handle: + record = SeqIO.read(handle, "genbank") + result["sequence"] = str(record.seq) + result["sequence_length"] = len(record.seq) + result["title"] = record.description + result["molecule_type_detail"] = ( + "mRNA" if accession.startswith(("NM_", "XM_")) else + "genomic DNA" if accession.startswith(("NC_", "NT_")) else + "RNA" if accession.startswith(("NR_", "XR_")) else + "genomic region" if accession.startswith("NG_") else "N/A" + ) - first_link = links[0] - if "LinkSetDb" not in first_link: - return None + for feature in record.features: + if feature.type == "source": + if 'chromosome' in feature.qualifiers: + result["chromosome"] = feature.qualifiers['chromosome'][0] - for link_set in first_link["LinkSetDb"]: - if link_set.get("DbTo") != "gene": - continue + if feature.location: + start = int(feature.location.start) + 1 + end = int(feature.location.end) + result["genomic_location"] = f"{start}-{end}" - # Try Link structure first (most common) - 
links_in_set = link_set.get("Link", []) - if links_in_set and len(links_in_set) > 0: - first_link_item = links_in_set[0] - if isinstance(first_link_item, dict): - gene_id = str(first_link_item.get("Id", "")) - elif hasattr(first_link_item, "Id"): - gene_id = str(getattr(first_link_item, "Id", "")) - else: - gene_id = str(first_link_item) - if gene_id: - return gene_id - - # Fallback: Try IdList (if Link is not available) - id_list = link_set.get("IdList", []) - if id_list: - return str(id_list[0]) + break - return None - except Exception as e: - logger.error("Error parsing elink result: %s", e) - import traceback - logger.debug(traceback.format_exc()) - return None + if not result.get("organism") and 'organism' in record.annotations: + result["organism"] = record.annotations['organism'] - def _extract_sequence(self, result: dict, accession: str): - """Enrich result dictionary with sequence and summary information from accession.""" - try: - sequence, header = self._fetch_sequence(accession) - if sequence: - result["sequence"] = sequence - result["sequence_length"] = len(sequence) - - summary_info = self._fetch_summary(accession, header or "") - if not summary_info: - return - - result["title"] = summary_info.get("title") - result["molecule_type_detail"] = summary_info.get("molecule_type_detail") - # Update chromosome and genomic_location if not already set - if not result.get("chromosome") and summary_info.get("chromosome"): - result["chromosome"] = summary_info["chromosome"] - if not result.get("genomic_location") and summary_info.get("genomic_location"): - result["genomic_location"] = summary_info["genomic_location"] - except (RequestException, IncompleteRead): - raise - except Exception as e: - logger.debug("Failed to get sequence for accession %s: %s", accession, e) + return result - @retry( - stop=stop_after_attempt(5), - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_if_exception_type((RequestException, IncompleteRead)), - reraise=True, - before_sleep=before_sleep_log(logger, logging.WARNING), - ) - def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]: - """ - Get gene information by Gene ID. - This is the unified data source - all search methods eventually call this. - :param gene_id: NCBI Gene ID. - :param preferred_accession: Optional accession to use for sequence retrieval. - :return: A dictionary containing gene information or None if not found. 
- """ try: - time.sleep(0.35) # Comply with rate limit (max 3 requests per second) - handle = Entrez.efetch(db="gene", id=gene_id, retmode="xml") - try: + with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle: gene_record = Entrez.read(handle) if not gene_record: return None - result = self._gene_record_to_dict(gene_record, gene_id) - # Try to get sequence from accession - accession_to_use = preferred_accession or result.get("_representative_accession") - if accession_to_use: - self._extract_sequence(result, accession_to_use) + result = self._gene_record_to_dict(gene_record, gene_id) + if accession := (preferred_accession or result.get("_representative_accession")): + result = _extract_from_genbank(result, accession) - # Remove internal field result.pop("_representative_accession", None) return result - finally: - handle.close() - except RequestException: - raise - except IncompleteRead: + except (RequestException, IncompleteRead): raise - except Exception as exc: # pylint: disable=broad-except + except Exception as exc: logger.error("Gene ID %s not found: %s", gene_id, exc) return None def get_by_accession(self, accession: str) -> Optional[dict]: - """ - Get sequence information by accession number. - Unified approach: Get GeneID from accession, then call get_by_gene_id() for complete information. - :param accession: NCBI accession number (e.g., NM_000546). - :return: A dictionary containing complete gene information or None if not found. - """ + """Get sequence information by accession number.""" + def _extract_gene_id(link_handle): + """Extract GeneID from elink results.""" + links = Entrez.read(link_handle) + if not links or "LinkSetDb" not in links[0]: + return None + + for link_set in links[0]["LinkSetDb"]: + if link_set.get("DbTo") != "gene": + continue + + link = (link_set.get("Link") or link_set.get("IdList", [{}]))[0] + return str(link.get("Id") if isinstance(link, dict) else link) + try: - # Step 1: Get GeneID from elink (nuccore -> gene) - # Note: esummary for nuccore doesn't include GeneID, so we use elink instead - time.sleep(0.35) - link_handle = Entrez.elink(dbfrom="nuccore", db="gene", id=accession) - try: - gene_id = self._extract_gene_id(link_handle) - finally: - link_handle.close() - - # Step 2: If we have a GeneID, get complete information from Gene database - if gene_id: - result = self.get_by_gene_id(gene_id, preferred_accession=accession) - if result: - result["id"] = accession - result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}" - return result + with Entrez.elink(dbfrom="nuccore", db="gene", id=accession) as link_handle: + gene_id = _extract_gene_id(link_handle) - # Step 3: If no GeneID, this is a rare case (accession without associated gene) - logger.warning( - "Accession %s has no associated GeneID, cannot provide complete information", - accession - ) - return None + if not gene_id: + logger.warning("Accession %s has no associated GeneID", accession) + return None + + result = self.get_by_gene_id(gene_id, preferred_accession=accession) + if result: + result["id"] = accession + result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}" + return result except (RequestException, IncompleteRead): raise - except Exception as exc: # pylint: disable=broad-except + except Exception as exc: logger.error("Accession %s not found: %s", accession, exc) return None - @retry( - stop=stop_after_attempt(5), - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_if_exception_type((RequestException, IncompleteRead)), - 
reraise=True, - before_sleep=before_sleep_log(logger, logging.WARNING), - ) def get_best_hit(self, keyword: str) -> Optional[dict]: - """ - Search NCBI Gene database with a keyword and return the best hit. - :param keyword: The search keyword (e.g., gene name). - :return: A dictionary containing the best hit information or None if not found. - """ + """Search NCBI Gene database with a keyword and return the best hit.""" if not keyword.strip(): return None try: - time.sleep(0.35) # Comply with rate limit - # Search gene database - search_handle = Entrez.esearch( - db="gene", - term=f"{keyword}[Gene Name] OR {keyword}[All Fields]", - retmax=1, - ) - try: - search_results = Entrez.read(search_handle) - if not search_results.get("IdList"): - # If not found, try a broader search - time.sleep(0.35) - search_handle2 = Entrez.esearch( - db="gene", - term=keyword, - retmax=1, - ) - try: - search_results = Entrez.read(search_handle2) - finally: - search_handle2.close() - - if search_results.get("IdList"): - gene_id = search_results["IdList"][0] - return self.get_by_gene_id(gene_id) - finally: - search_handle.close() - except RequestException: - raise - except IncompleteRead: + for search_term in [f"{keyword}[Gene Name] OR {keyword}[All Fields]", keyword]: + with Entrez.esearch(db="gene", term=search_term, retmax=1) as search_handle: + if search_results := Entrez.read(search_handle): + if gene_id := search_results["IdList"][0]: + return self.get_by_gene_id(gene_id) + self.rate_limiter.wait() + except (RequestException, IncompleteRead): raise - except Exception as e: # pylint: disable=broad-except + except Exception as e: logger.error("Keyword %s not found: %s", keyword, e) return None def _local_blast(self, seq: str, threshold: float) -> Optional[str]: - """ - Perform local BLAST search using local BLAST database. - :param seq: The DNA sequence. - :param threshold: E-value threshold for BLAST search. - :return: The accession number of the best hit or None if not found. 
- """ + """Perform local BLAST search using local BLAST database.""" try: - with tempfile.NamedTemporaryFile( - mode="w+", suffix=".fa", delete=False - ) as tmp: + with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp: tmp.write(f">query\n{seq}\n") tmp_name = tmp.name cmd = [ - "blastn", - "-db", - self.local_blast_db, - "-query", - tmp_name, - "-evalue", - str(threshold), - "-max_target_seqs", - "1", - "-outfmt", - "6 sacc", # only return accession + "blastn", "-db", self.local_blast_db, "-query", tmp_name, + "-evalue", str(threshold), "-max_target_seqs", "1", "-outfmt", "6 sacc" ] logger.debug("Running local blastn: %s", " ".join(cmd)) out = subprocess.check_output(cmd, text=True).strip() os.remove(tmp_name) - if out: - return out.split("\n", maxsplit=1)[0] - return None - except Exception as exc: # pylint: disable=broad-except + return out.split("\n", maxsplit=1)[0] if out else None + except Exception as exc: logger.error("Local blastn failed: %s", exc) return None - def _extract_and_normalize_sequence(self, sequence: str) -> Optional[str]: - """Extract and normalize DNA sequence from input.""" - if sequence.startswith(">"): - seq_lines = sequence.strip().split("\n") - seq = "".join(seq_lines[1:]) - else: - seq = sequence.strip().replace(" ", "").replace("\n", "") - return seq if seq and re.fullmatch(r"[ATCGN\s]+", seq, re.I) else None + def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: + """Search NCBI with a DNA sequence using BLAST.""" - def _process_network_blast_result(self, blast_record, seq: str, threshold: float) -> Optional[dict]: - """Process network BLAST result and return dictionary or None.""" - if not blast_record.alignments: - logger.info("No BLAST hits found for the given sequence.") - return None + def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: + """Extract and normalize DNA sequence from input.""" + if sequence.startswith(">"): + seq = "".join(sequence.strip().split("\n")[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + return seq if re.fullmatch(r"[ATCGN]+", seq, re.I) else None - best_alignment = blast_record.alignments[0] - best_hsp = best_alignment.hsps[0] - if best_hsp.expect > threshold: - logger.info("No BLAST hits below the threshold E-value.") - return None - hit_id = best_alignment.hit_id - accession_match = re.search(r"ref\|([^|]+)", hit_id) - if accession_match: - accession = accession_match.group(1).split(".")[0] - return self.get_by_accession(accession) + def _process_network_blast_result(blast_record, seq: str, threshold: float) -> Optional[dict]: + """Process network BLAST result and return dictionary or None.""" + if not blast_record.alignments: + logger.info("No BLAST hits found for the given sequence.") + return None - # If unable to extract accession, return basic information - return { - "molecule_type": "DNA", - "database": "NCBI", - "id": hit_id, - "title": best_alignment.title, - "sequence_length": len(seq), - "e_value": best_hsp.expect, - "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0, - "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}", - } + best_alignment = blast_record.alignments[0] + best_hsp = best_alignment.hsps[0] + if best_hsp.expect > threshold: + logger.info("No BLAST hits below the threshold E-value.") + return None + + hit_id = best_alignment.hit_id + if accession_match := re.search(r"ref\|([^|]+)", hit_id): + return 
self.get_by_accession(accession_match.group(1).split(".")[0]) + + # If unable to extract accession, return basic information + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": hit_id, + "title": best_alignment.title, + "sequence_length": len(seq), + "e_value": best_hsp.expect, + "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0, + "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}", + } - def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: - """ - Search NCBI with a DNA sequence using BLAST. - Tries local BLAST first if enabled, falls back to network BLAST. - :param sequence: DNA sequence (FASTA format or raw sequence). - :param threshold: E-value threshold for BLAST search. - :return: A dictionary containing the best hit information or None if not found. - """ - result = None try: - seq = self._extract_and_normalize_sequence(sequence) - if not seq: + if not (seq := _extract_and_normalize_sequence(sequence)): logger.error("Empty or invalid DNA sequence provided.") return None # Try local BLAST first if enabled - if self.use_local_blast: - accession = self._local_blast(seq, threshold) - if accession: - logger.debug("Local BLAST found accession: %s", accession) - result = self.get_by_accession(accession) - - # Fall back to network BLAST if local BLAST didn't find result - if not result: - logger.debug("Falling back to NCBIWWW.qblast.") - logger.debug("Performing BLAST search for DNA sequence...") - time.sleep(0.35) - - result_handle = NCBIWWW.qblast( - program="blastn", - database="nr", - sequence=seq, - hitlist_size=1, - expect=threshold, - ) - blast_record = NCBIXML.read(result_handle) - result = self._process_network_blast_result(blast_record, seq, threshold) - except RequestException: + if self.use_local_blast and (accession := self._local_blast(seq, threshold)): + logger.debug("Local BLAST found accession: %s", accession) + return self.get_by_accession(accession) + + # Fall back to network BLAST + logger.debug("Falling back to NCBIWWW.qblast") + + with NCBIWWW.qblast("blastn", "nr", seq, hitlist_size=1, expect=threshold) as result_handle: + return _process_network_blast_result(NCBIXML.read(result_handle), seq, threshold) + except (RequestException, IncompleteRead): raise - except Exception as e: # pylint: disable=broad-except + except Exception as e: logger.error("BLAST search failed: %s", e) - return result + return None @retry( stop=stop_after_attempt(5), @@ -657,46 +359,26 @@ def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional retry=retry_if_exception_type((RequestException, IncompleteRead)), reraise=True, ) - async def search( - self, query: str, threshold: float = 0.01, **kwargs - ) -> Optional[Dict]: - """ - Search NCBI with either a gene ID, accession number, keyword, or DNA sequence. - :param query: The search query (gene ID, accession, keyword, or DNA sequence). - :param threshold: E-value threshold for BLAST search. - :param kwargs: Additional keyword arguments (not used currently). - :return: A dictionary containing the search results or None if not found. 
- """ - # auto detect query type + async def search(self, query: str, threshold: float = 0.01, **kwargs) -> Optional[Dict]: + """Search NCBI with either a gene ID, accession number, keyword, or DNA sequence.""" if not query or not isinstance(query, str): logger.error("Empty or non-string input.") return None - query = query.strip() + query = query.strip() logger.debug("NCBI search query: %s", query) loop = asyncio.get_running_loop() - # check if DNA sequence (ATCG characters) + # Auto-detect query type and execute in thread pool if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I): - result = await loop.run_in_executor( - _get_pool(), self.search_by_sequence, query, threshold - ) - # check if gene ID (numeric) + result = await loop.run_in_executor(_get_pool(), self.search_by_sequence, query, threshold) elif re.fullmatch(r"^\d+$", query): - result = await loop.run_in_executor( - _get_pool(), self.get_by_gene_id, query - ) - # check if accession number (e.g., NM_000546, NC_000001) + result = await loop.run_in_executor(_get_pool(), self.get_by_gene_id, query) elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I): - result = await loop.run_in_executor( - _get_pool(), self.get_by_accession, query - ) + result = await loop.run_in_executor(_get_pool(), self.get_by_accession, query) else: - # otherwise treat as keyword - result = await loop.run_in_executor( - _get_pool(), self.get_best_hit, query - ) + result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) if result: result["_search_query"] = query From bb84c0b571dc979e9c93535b030e5fa2b9f22583 Mon Sep 17 00:00:00 2001 From: chenzihong <522023320011@smail.nju.edu.cn> Date: Sun, 30 Nov 2025 17:52:28 +0800 Subject: [PATCH 14/22] fix: fix import error --- graphgen/models/searcher/db/ncbi_searcher.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 946e3c1f..1a2dd7b5 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -7,7 +7,6 @@ from functools import lru_cache from http.client import IncompleteRead from typing import Dict, Optional -from graphgen.models.searcher.limitter import RateLimiter from Bio import Entrez, SeqIO from Bio.Blast import NCBIWWW, NCBIXML @@ -67,7 +66,6 @@ def __init__( if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"): logger.error("Local BLAST database files not found. 
Please check the path.") self.use_local_blast = False - self.rate_limiter = RateLimiter() @staticmethod def _nested_get(data: dict, *keys, default=None): From 58ef1ec35797a72367e013cc5a9c8daac302632b Mon Sep 17 00:00:00 2001 From: chenzihong <522023320011@smail.nju.edu.cn> Date: Sun, 30 Nov 2025 18:07:27 +0800 Subject: [PATCH 15/22] fix: delete rate_limiter --- graphgen/models/searcher/db/ncbi_searcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 1a2dd7b5..12da3098 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -264,7 +264,6 @@ def get_best_hit(self, keyword: str) -> Optional[dict]: if search_results := Entrez.read(search_handle): if gene_id := search_results["IdList"][0]: return self.get_by_gene_id(gene_id) - self.rate_limiter.wait() except (RequestException, IncompleteRead): raise except Exception as e: From d767096741ab09d095ca39c2c7534c62f03b292f Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Mon, 1 Dec 2025 01:52:52 +0800 Subject: [PATCH 16/22] perf: simplify RNA searcher and align with DNA searcher logic --- .../models/searcher/db/rnacentral_searcher.py | 370 ++++++------------ 1 file changed, 118 insertions(+), 252 deletions(-) diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index 5950a3e7..6a3e2a28 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -74,129 +74,76 @@ async def _fetch_all_xrefs(self, xrefs_url: str, session: aiohttp.ClientSession) @staticmethod def _extract_info_from_xrefs(xrefs: List[Dict]) -> Dict[str, Any]: - """ - Extract information from xrefs data. - :param xrefs: List of xref entries. - :return: Dictionary with extracted information. 
- """ - extracted = { - "organisms": set(), - "gene_names": set(), - "modifications": [], - "so_terms": set(), - "xrefs_list": [], - } + """Extract information from xrefs data.""" + organisms = set() + gene_names = set() + modifications = [] + so_terms = set() + xrefs_list = [] for xref in xrefs: - # Extract accession information accession = xref.get("accession", {}) - - # Extract species information species = accession.get("species") if species: - extracted["organisms"].add(species) + organisms.add(species) - # Extract gene name gene = accession.get("gene") - if gene and gene.strip(): # Only add non-empty genes - extracted["gene_names"].add(gene.strip()) + if gene and gene.strip(): + gene_names.add(gene.strip()) - # Extract modifications - modifications = xref.get("modifications", []) - if modifications: - extracted["modifications"].extend(modifications) + if mods := xref.get("modifications", []): + modifications.extend(mods) - # Extract SO term (biotype) - biotype = accession.get("biotype") - if biotype: - extracted["so_terms"].add(biotype) + if biotype := accession.get("biotype"): + so_terms.add(biotype) - # Build xrefs list - xref_info = { + xrefs_list.append({ "database": xref.get("database"), "accession_id": accession.get("id"), "external_id": accession.get("external_id"), "description": accession.get("description"), "species": species, "gene": gene, - } - extracted["xrefs_list"].append(xref_info) + }) + + def _format_set(s): + """Format set to single value or comma-separated string.""" + if not s: + return None + return list(s)[0] if len(s) == 1 else ", ".join(s) - # Convert sets to appropriate formats return { - "organism": ( - list(extracted["organisms"])[0] - if len(extracted["organisms"]) == 1 - else (", ".join(extracted["organisms"]) if extracted["organisms"] else None) - ), - "gene_name": ( - list(extracted["gene_names"])[0] - if len(extracted["gene_names"]) == 1 - else (", ".join(extracted["gene_names"]) if extracted["gene_names"] else None) - ), - "related_genes": list(extracted["gene_names"]) if extracted["gene_names"] else None, - "modifications": extracted["modifications"] if extracted["modifications"] else None, - "so_term": ( - list(extracted["so_terms"])[0] - if len(extracted["so_terms"]) == 1 - else (", ".join(extracted["so_terms"]) if extracted["so_terms"] else None) - ), - "xrefs": extracted["xrefs_list"] if extracted["xrefs_list"] else None, + "organism": _format_set(organisms), + "gene_name": _format_set(gene_names), + "related_genes": list(gene_names) if gene_names else None, + "modifications": modifications if modifications else None, + "so_term": _format_set(so_terms), + "xrefs": xrefs_list if xrefs_list else None, } @staticmethod def _rna_data_to_dict(rna_id: str, rna_data: dict, xrefs_data: Optional[List[Dict]] = None) -> dict: - """ - Convert RNAcentral API response to a dictionary. - :param rna_id: RNAcentral ID. - :param rna_data: API response data (dict or dict-like from search results). - :param xrefs_data: Optional list of xref entries fetched from xrefs endpoint. - :return: A dictionary containing RNA information. 
- """ + """Convert RNAcentral API response to a dictionary.""" sequence = rna_data.get("sequence", "") + extracted_info = RNACentralSearch._extract_info_from_xrefs(xrefs_data) if xrefs_data else {} + + # Helper to get value with fallbacks + def _get_with_fallbacks(key, *fallback_keys): + if key in extracted_info and extracted_info[key]: + return extracted_info[key] + for fk in fallback_keys: + if value := rna_data.get(fk): + return value + return None - # Initialize extracted info from xrefs if available - extracted_info = {} - if xrefs_data: - extracted_info = RNACentralSearch._extract_info_from_xrefs(xrefs_data) - - # Extract organism information (prefer from xrefs, fallback to main data) - organism = extracted_info.get("organism") - if not organism: - organism = rna_data.get("organism", None) - if not organism: - organism = rna_data.get("species", None) - - # Extract related genes (prefer from xrefs, fallback to main data) + # Extract related genes with special handling related_genes = extracted_info.get("related_genes") if not related_genes: - related_genes = rna_data.get("related_genes", []) - if not related_genes: - related_genes = rna_data.get("genes", []) - if not related_genes: - gene_name_temp = rna_data.get("gene_name", None) - if gene_name_temp: - related_genes = [gene_name_temp] - - # Extract gene name (prefer from xrefs, fallback to main data) - gene_name = extracted_info.get("gene_name") - if not gene_name: - gene_name = rna_data.get("gene_name", None) - if not gene_name: - gene_name = rna_data.get("gene", None) - - # Extract so_term (prefer from xrefs, fallback to main data) - so_term = extracted_info.get("so_term") - if not so_term: - so_term = rna_data.get("so_term", None) - - # Extract modifications (prefer from xrefs, fallback to main data) - modifications = extracted_info.get("modifications") - if not modifications: - modifications = rna_data.get("modifications", None) - - # Build result dictionary (xrefs information is already extracted into other fields) - # information is extracted into organism, gene_name, so_term, modifications, etc. + related_genes = rna_data.get("related_genes") or rna_data.get("genes", []) + if not related_genes: + if gene_name_temp := rna_data.get("gene_name"): + related_genes = [gene_name_temp] + return { "molecule_type": "RNA", "database": "RNAcentral", @@ -207,11 +154,11 @@ def _rna_data_to_dict(rna_id: str, rna_data: dict, xrefs_data: Optional[List[Dic "rna_type": rna_data.get("rna_type", "N/A"), "description": rna_data.get("description", "N/A"), "url": f"https://rnacentral.org/rna/{rna_id}", - "organism": organism, + "organism": _get_with_fallbacks("organism", "organism", "species"), "related_genes": related_genes if related_genes else None, - "gene_name": gene_name, - "so_term": so_term, - "modifications": modifications, + "gene_name": _get_with_fallbacks("gene_name", "gene_name", "gene"), + "so_term": _get_with_fallbacks("so_term", "so_term"), + "modifications": _get_with_fallbacks("modifications", "modifications"), } async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: @@ -253,48 +200,37 @@ async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: return None async def get_best_hit(self, keyword: str) -> Optional[dict]: - """ - Search RNAcentral with a keyword and return the best hit. - Unified approach: Find RNA ID from search, then call get_by_rna_id() for complete information. - :param keyword: The search keyword (e.g., miRNA name, RNA name). 
- :return: A dictionary containing complete RNA information or None if not found. - """ + """Search RNAcentral with a keyword and return the best hit.""" if not keyword.strip(): return None try: async with aiohttp.ClientSession() as session: - search_url = f"{self.base_url}/rna" - params = {"search": keyword, "format": "json"} async with session.get( - search_url, - params=params, + f"{self.base_url}/rna", + params={"search": keyword, "format": "json"}, headers=self.headers, timeout=aiohttp.ClientTimeout(total=30), ) as resp: - if resp.status == 200: - search_results = await resp.json() - results = search_results.get("results", []) - if results: - # Step 1: Get RNA ID from search results - first_result = results[0] - rna_id = first_result.get("rnacentral_id") - - if rna_id: - # Step 2: Unified call to get_by_rna_id() for complete information - result = await self.get_by_rna_id(rna_id) + if resp.status != 200: + error_text = await resp.text() + logger.error("HTTP %d error for keyword %s: %s", resp.status, keyword, error_text[:200]) + raise Exception(f"HTTP {resp.status}: {error_text}") - # Step 3: If get_by_rna_id() failed, use search result data as fallback - if not result: - logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) - result = self._rna_data_to_dict(rna_id, first_result) - - return result + search_results = await resp.json() + if not (results := search_results.get("results", [])): logger.info("No results found for keyword: %s", keyword) return None - error_text = await resp.text() - logger.error("HTTP %d error for keyword %s: %s", resp.status, keyword, error_text[:200]) - raise Exception(f"HTTP {resp.status}: {error_text}") + + first_result = results[0] + if not (rna_id := first_result.get("rnacentral_id")): + return None + + result = await self.get_by_rna_id(rna_id) + if not result: + logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) + result = self._rna_data_to_dict(rna_id, first_result) + return result except aiohttp.ClientError as e: logger.error("Network error searching for keyword %s: %s", keyword, e) return None @@ -303,133 +239,77 @@ async def get_best_hit(self, keyword: str) -> Optional[dict]: return None def _local_blast(self, seq: str, threshold: float) -> Optional[str]: - """ - Perform local BLAST search using local BLAST database. - :param seq: The RNA sequence. - :param threshold: E-value threshold for BLAST search. - :return: The accession/ID of the best hit or None if not found. 
- """ + """Perform local BLAST search using local BLAST database.""" try: - with tempfile.NamedTemporaryFile( - mode="w+", suffix=".fa", delete=False - ) as tmp: + with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp: tmp.write(f">query\n{seq}\n") tmp_name = tmp.name cmd = [ - "blastn", - "-db", - self.local_blast_db, - "-query", - tmp_name, - "-evalue", - str(threshold), - "-max_target_seqs", - "1", - "-outfmt", - "6 sacc", # only return accession + "blastn", "-db", self.local_blast_db, "-query", tmp_name, + "-evalue", str(threshold), "-max_target_seqs", "1", "-outfmt", "6 sacc" ] logger.debug("Running local blastn for RNA: %s", " ".join(cmd)) out = subprocess.check_output(cmd, text=True).strip() os.remove(tmp_name) - if out: - return out.split("\n", maxsplit=1)[0] - return None - except Exception as exc: # pylint: disable=broad-except + return out.split("\n", maxsplit=1)[0] if out else None + except Exception as exc: logger.error("Local blastn failed: %s", exc) return None - @staticmethod - def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: - """Extract and normalize RNA sequence from input.""" - if sequence.startswith(">"): - seq_lines = sequence.strip().split("\n") - seq = "".join(seq_lines[1:]) - else: - seq = sequence.strip().replace(" ", "").replace("\n", "") - return seq if seq and re.fullmatch(r"[AUCGN\s]+", seq, re.I) else None - - def _find_best_match_from_results(self, results: List[Dict], seq: str) -> Optional[Dict]: - """Find best match from search results, preferring exact match.""" - exact_match = None - for result_item in results: - result_seq = result_item.get("sequence", "") - if result_seq == seq: - exact_match = result_item - break - return exact_match if exact_match else (results[0] if results else None) - - async def _process_api_search_results( - self, results: List[Dict], seq: str - ) -> Optional[dict]: - """Process API search results and return dictionary or None.""" - if not results: - logger.info("No results found for sequence.") - return None - - target_result = self._find_best_match_from_results(results, seq) - if not target_result: - return None - - rna_id = target_result.get("rnacentral_id") - if not rna_id: - return None - - # Try to get complete information - result = await self.get_by_rna_id(rna_id) - if not result: - logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) - result = self._rna_data_to_dict(rna_id, target_result) - return result - async def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: - """ - Search RNAcentral with an RNA sequence. - Tries local BLAST first if enabled, falls back to RNAcentral API. - Unified approach: Find RNA ID from sequence search, then call get_by_rna_id() for complete information. - :param sequence: RNA sequence (FASTA format or raw sequence). - :param threshold: E-value threshold for BLAST search. - :return: A dictionary containing complete RNA information or None if not found. 
- """ + """Search RNAcentral with an RNA sequence using BLAST or API.""" + + def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: + """Extract and normalize RNA sequence from input.""" + if sequence.startswith(">"): + seq = "".join(sequence.strip().split("\n")[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + return seq if seq and re.fullmatch(r"[AUCGN\s]+", seq, re.I) else None + + def _find_best_match(results: List[Dict], seq: str) -> Optional[Dict]: + """Find best match from search results, preferring exact match.""" + for result_item in results: + if result_item.get("sequence", "") == seq: + return result_item + return results[0] if results else None + result = None try: - seq = self._extract_and_normalize_sequence(sequence) - if not seq: + if not (seq := _extract_and_normalize_sequence(sequence)): logger.error("Empty or invalid RNA sequence provided.") - return None - - # Try local BLAST first if enabled - if self.use_local_blast: - accession = self._local_blast(seq, threshold) - if accession: - logger.debug("Local BLAST found accession: %s", accession) - result = await self.get_by_rna_id(accession) - if not result: - result = await self.get_best_hit(accession) - - # Fall back to RNAcentral API if local BLAST didn't find result - if not result: - logger.debug("Falling back to RNAcentral API.") + elif self.use_local_blast and (accession := self._local_blast(seq, threshold)): + logger.debug("Local BLAST found accession: %s", accession) + result = await self.get_by_rna_id(accession) or await self.get_best_hit(accession) + else: + # Fall back to RNAcentral API + logger.debug("Falling back to RNAcentral API") async with aiohttp.ClientSession() as session: - search_url = f"{self.base_url}/rna" - params = {"sequence": seq, "format": "json"} async with session.get( - search_url, - params=params, + f"{self.base_url}/rna", + params={"sequence": seq, "format": "json"}, headers=self.headers, - timeout=aiohttp.ClientTimeout(total=60), # Sequence search may take longer + timeout=aiohttp.ClientTimeout(total=60), ) as resp: - if resp.status == 200: - search_results = await resp.json() - results = search_results.get("results", []) - result = await self._process_api_search_results(results, seq) - else: + if resp.status != 200: error_text = await resp.text() logger.error("HTTP %d error for sequence search: %s", resp.status, error_text[:200]) raise Exception(f"HTTP {resp.status}: {error_text}") + + search_results = await resp.json() + if results := search_results.get("results", []): + target_result = _find_best_match(results, seq) + if rna_id := target_result.get("rnacentral_id"): + result = await self.get_by_rna_id(rna_id) + if not result: + logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) + result = self._rna_data_to_dict(rna_id, target_result) + else: + logger.info("No results found for sequence.") except aiohttp.ClientError as e: logger.error("Network error searching for sequence: %s", e) - except Exception as e: # pylint: disable=broad-except + except Exception as e: logger.error("Sequence search failed: %s", e) return result @@ -439,35 +319,21 @@ async def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Op retry=retry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError)), reraise=True, ) - async def search( - self, query: str, threshold: float = 0.1, **kwargs - ) -> Optional[Dict]: - """ - Search RNAcentral with either an RNAcentral ID, keyword, or RNA sequence. 
- :param query: The search query (RNAcentral ID, keyword, or RNA sequence). - :param threshold: E-value threshold for sequence search. - Note: RNAcentral API uses its own similarity matching, this parameter is for interface consistency. - :param kwargs: Additional keyword arguments (not used currently). - :return: A dictionary containing the search results or None if not found. - """ - # auto detect query type + async def search(self, query: str, threshold: float = 0.1, **kwargs) -> Optional[Dict]: + """Search RNAcentral with either an RNAcentral ID, keyword, or RNA sequence.""" if not query or not isinstance(query, str): logger.error("Empty or non-string input.") return None - query = query.strip() + query = query.strip() logger.debug("RNAcentral search query: %s", query) - # check if RNA sequence (AUCG characters, contains U) - if query.startswith(">") or ( - re.fullmatch(r"[AUCGN\s]+", query, re.I) and "U" in query.upper() - ): + # Auto-detect query type + if query.startswith(">") or (re.fullmatch(r"[AUCGN\s]+", query, re.I) and "U" in query.upper()): result = await self.search_by_sequence(query, threshold) - # check if RNAcentral ID (typically starts with URS) elif re.fullmatch(r"URS\d+", query, re.I): result = await self.get_by_rna_id(query) else: - # otherwise treat as keyword result = await self.get_best_hit(query) if result: From ea30cef89b903f2f6fbc17ca3fc6a1d9724e8bf5 Mon Sep 17 00:00:00 2001 From: chenzihong <522023320011@smail.nju.edu.cn> Date: Mon, 1 Dec 2025 13:40:00 +0800 Subject: [PATCH 17/22] fix: fix search params in get_best_hit --- graphgen/models/searcher/db/ncbi_searcher.py | 26 +++++++++----------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 12da3098..655ea4fd 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -76,8 +76,7 @@ def _nested_get(data: dict, *keys, default=None): data = data.get(key, default) return data - @staticmethod - def _gene_record_to_dict(gene_record, gene_id: str) -> dict: + def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict: """ Convert an Entrez gene record to a dictionary. All extraction logic is inlined for maximum clarity and performance. 
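For context on the ESearch/EFetch calls touched in this patch: an esearch against the gene database only returns candidate IDs in an "IdList"; the record itself must then be fetched by a single ID. A minimal two-step sketch with Biopython (query term and values are illustrative only, and Entrez.email must already be set, as in the demo configs of this series):

    from Bio import Entrez

    Entrez.email = "test@example.com"  # placeholder address, as used in the demo configs
    with Entrez.esearch(db="gene", term="TP53[Gene] OR TP53[All Fields]", retmax=1) as handle:
        id_list = Entrez.read(handle).get("IdList", [])  # e.g. ["7157"] for human TP53
    if id_list:
        with Entrez.efetch(db="gene", id=id_list[0], retmode="xml") as handle:
            gene_record = Entrez.read(handle)  # parsed XML record for the top hit
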
@@ -89,8 +88,8 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict:
         locus = (data.get("Entrezgene_locus") or [{}])[0]
 
         # Extract common nested paths once
-        gene_ref = NCBISearch._nested_get(data, "Entrezgene_gene", "Gene-ref", default={})
-        biosource = NCBISearch._nested_get(data, "Entrezgene_source", "BioSource", default={})
+        gene_ref = self._nested_get(data, "Entrezgene_gene", "Gene-ref", default={})
+        biosource = self._nested_get(data, "Entrezgene_source", "BioSource", default={})
 
         # Process synonyms
         synonyms_raw = gene_ref.get("Gene-ref_syn", [])
@@ -105,7 +104,7 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict:
         label = locus.get("Gene-commentary_label", "")
         chromosome_match = re.search(r"Chromosome\s+(\S+)", str(label)) if label else None
 
-        seq_interval = NCBISearch._nested_get(
+        seq_interval = self._nested_get(
             locus, "Gene-commentary_seqs", 0, "Seq-loc_int", "Seq-interval", default={}
         )
         genomic_location = (
@@ -141,7 +140,7 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict:
             "id": gene_id,
             "gene_name": gene_ref.get("Gene-ref_locus", "N/A"),
             "gene_description": gene_ref.get("Gene-ref_desc", "N/A"),
-            "organism": NCBISearch._nested_get(
+            "organism": self._nested_get(
                 biosource, "BioSource_org", "Org-ref", "Org-ref_taxname", default="N/A"
             ),
             "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}",
@@ -200,7 +199,6 @@ def _extract_from_genbank(result: dict, accession: str):
 
             return result
 
-
         try:
             with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle:
                 gene_record = Entrez.read(handle)
@@ -259,11 +257,11 @@ def get_best_hit(self, keyword: str) -> Optional[dict]:
             return None
 
         try:
-            for search_term in [f"{keyword}[Gene Name] OR {keyword}[All Fields]", keyword]:
-                with Entrez.esearch(db="gene", term=search_term, retmax=1) as search_handle:
-                    if search_results := Entrez.read(search_handle):
-                        if gene_id := search_results["IdList"][0]:
-                            return self.get_by_gene_id(gene_id)
+            for search_term in [f"{keyword}[Gene] OR {keyword}[All Fields]", keyword]:
+                with Entrez.esearch(db="gene", term=search_term, retmax=1, sort="relevance") as search_handle:
+                    search_results = Entrez.read(search_handle)
+                    if id_list := search_results.get("IdList", []):
+                        return self.get_by_gene_id(id_list[0])
         except (RequestException, IncompleteRead):
             raise
         except Exception as e:
@@ -289,7 +287,7 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
             logger.error("Local blastn failed: %s", exc)
             return None
 
-    def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]:
+    def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]:
         """Search NCBI with a DNA sequence using BLAST."""
 
         def _extract_and_normalize_sequence(sequence: str) -> Optional[str]:
@@ -369,7 +367,7 @@ async def search(self, query: str, threshold: float = 0.01, **kwargs) -> Optiona
 
         # Auto-detect query type and execute in thread pool
         if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I):
-            result = await loop.run_in_executor(_get_pool(), self.search_by_sequence, query, threshold)
+            result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold)
        elif re.fullmatch(r"^\d+$", query):
            result = await loop.run_in_executor(_get_pool(), self.get_by_gene_id, query)
        elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I):

From 3adb9566794550b143cbff2f711d65c63f8ccb5a Mon Sep 17 00:00:00 2001
From: chenzihong <522023320011@smail.nju.edu.cn>
Date: Mon, 1 Dec 2025 13:42:16 +0800
Subject: [PATCH 18/22] perf: optimize search
logic in rnacentral_searcher --- .../models/searcher/db/rnacentral_searcher.py | 414 +++++++----------- 1 file changed, 166 insertions(+), 248 deletions(-) diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index 5950a3e7..99c163f2 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -2,9 +2,13 @@ import os import re import subprocess +from concurrent.futures import ThreadPoolExecutor +from functools import lru_cache import tempfile -from typing import Dict, Optional, List, Any +from typing import Dict, Optional, List, Any, Set +import hashlib +import requests import aiohttp from tenacity import ( retry, @@ -16,6 +20,11 @@ from graphgen.bases import BaseSearcher from graphgen.utils import logger + +@lru_cache(maxsize=None) +def _get_pool(): + return ThreadPoolExecutor(max_workers=10) + class RNACentralSearch(BaseSearcher): """ RNAcentral Search client to search RNA databases. @@ -36,167 +45,90 @@ def __init__(self, use_local_blast: bool = False, local_blast_db: str = "rna_db" logger.error("Local BLAST database files not found. Please check the path.") self.use_local_blast = False - async def _fetch_all_xrefs(self, xrefs_url: str, session: aiohttp.ClientSession) -> List[Dict]: - """ - Fetch all xrefs from the xrefs endpoint, handling pagination. - :param xrefs_url: URL to the xrefs endpoint. - :param session: aiohttp ClientSession to use for requests. - :return: List of all xref entries. - """ - all_xrefs = [] - current_url = xrefs_url - - while current_url: - try: - async with session.get( - current_url, headers=self.headers, timeout=aiohttp.ClientTimeout(total=30) - ) as resp: - if resp.status == 200: - data = await resp.json() - results = data.get("results", []) - all_xrefs.extend(results) - - # Check if there's a next page - current_url = data.get("next") - if not current_url: - break - - # Small delay to avoid rate limiting - await asyncio.sleep(0.2) - else: - logger.warning("Failed to fetch xrefs from %s: HTTP %d", current_url, resp.status) - break - except Exception as e: - logger.warning("Error fetching xrefs from %s: %s", current_url, e) - break - - return all_xrefs - @staticmethod - def _extract_info_from_xrefs(xrefs: List[Dict]) -> Dict[str, Any]: - """ - Extract information from xrefs data. - :param xrefs: List of xref entries. - :return: Dictionary with extracted information. 
- """ - extracted = { - "organisms": set(), - "gene_names": set(), - "modifications": [], - "so_terms": set(), - "xrefs_list": [], - } + def _extract_info_from_xrefs(xrefs: List[Dict[str, Any]]) -> Dict[str, Any]: + organisms: Set[str] = set() + gene_names: Set[str] = set() + modifications: List[Any] = [] + so_terms: Set[str] = set() + xrefs_list: List[Dict[str, Any]] = [] + + def format_unique_values(values: Set[str]) -> Optional[str]: + if not values: + return None + if len(values) == 1: + return next(iter(values)) + return ", ".join(sorted(values)) for xref in xrefs: - # Extract accession information accession = xref.get("accession", {}) - - # Extract species information species = accession.get("species") - if species: - extracted["organisms"].add(species) - - # Extract gene name gene = accession.get("gene") - if gene and gene.strip(): # Only add non-empty genes - extracted["gene_names"].add(gene.strip()) - - # Extract modifications - modifications = xref.get("modifications", []) - if modifications: - extracted["modifications"].extend(modifications) - - # Extract SO term (biotype) - biotype = accession.get("biotype") - if biotype: - extracted["so_terms"].add(biotype) - - # Build xrefs list - xref_info = { + stripped_gene = gene.strip() if gene else None + if species: + organisms.add(species) + if stripped_gene: + gene_names.add(stripped_gene) + if mods := xref.get("modifications"): + modifications.extend(mods) + if biotype := accession.get("biotype"): + so_terms.add(biotype) + + xrefs_list.append({ "database": xref.get("database"), "accession_id": accession.get("id"), "external_id": accession.get("external_id"), "description": accession.get("description"), "species": species, - "gene": gene, - } - extracted["xrefs_list"].append(xref_info) + "gene": stripped_gene, + }) - # Convert sets to appropriate formats return { - "organism": ( - list(extracted["organisms"])[0] - if len(extracted["organisms"]) == 1 - else (", ".join(extracted["organisms"]) if extracted["organisms"] else None) - ), - "gene_name": ( - list(extracted["gene_names"])[0] - if len(extracted["gene_names"]) == 1 - else (", ".join(extracted["gene_names"]) if extracted["gene_names"] else None) - ), - "related_genes": list(extracted["gene_names"]) if extracted["gene_names"] else None, - "modifications": extracted["modifications"] if extracted["modifications"] else None, - "so_term": ( - list(extracted["so_terms"])[0] - if len(extracted["so_terms"]) == 1 - else (", ".join(extracted["so_terms"]) if extracted["so_terms"] else None) - ), - "xrefs": extracted["xrefs_list"] if extracted["xrefs_list"] else None, + "organism": format_unique_values(organisms), + "gene_name": format_unique_values(gene_names), + "related_genes": list(gene_names) if gene_names else None, + "modifications": modifications or None, + "so_term": format_unique_values(so_terms), + "xrefs": xrefs_list or None, } @staticmethod - def _rna_data_to_dict(rna_id: str, rna_data: dict, xrefs_data: Optional[List[Dict]] = None) -> dict: - """ - Convert RNAcentral API response to a dictionary. - :param rna_id: RNAcentral ID. - :param rna_data: API response data (dict or dict-like from search results). - :param xrefs_data: Optional list of xref entries fetched from xrefs endpoint. - :return: A dictionary containing RNA information. 
- """ + def _rna_data_to_dict( + rna_id: str, + rna_data: Dict[str, Any], + xrefs_data: Optional[List[Dict[str, Any]]] = None + ) -> Dict[str, Any]: + fallback_rules = { + "organism": ["organism", "species"], + "related_genes": ["related_genes", "genes"], + "gene_name": ["gene_name", "gene"], + "so_term": ["so_term"], + "modifications": ["modifications"], + } + + xrefs_info = RNACentralSearch._extract_info_from_xrefs(xrefs_data) if xrefs_data else {} + + def resolve_field(field_name: str) -> Any: + if value := xrefs_info.get(field_name): + return value + + for key in fallback_rules[field_name]: + if (value := rna_data.get(key)) is not None: + return value + + return None + + organism = resolve_field("organism") + gene_name = resolve_field("gene_name") + so_term = resolve_field("so_term") + modifications = resolve_field("modifications") + + related_genes = resolve_field("related_genes") + if not related_genes and (single_gene := rna_data.get("gene_name")): + related_genes = [single_gene] + sequence = rna_data.get("sequence", "") - # Initialize extracted info from xrefs if available - extracted_info = {} - if xrefs_data: - extracted_info = RNACentralSearch._extract_info_from_xrefs(xrefs_data) - - # Extract organism information (prefer from xrefs, fallback to main data) - organism = extracted_info.get("organism") - if not organism: - organism = rna_data.get("organism", None) - if not organism: - organism = rna_data.get("species", None) - - # Extract related genes (prefer from xrefs, fallback to main data) - related_genes = extracted_info.get("related_genes") - if not related_genes: - related_genes = rna_data.get("related_genes", []) - if not related_genes: - related_genes = rna_data.get("genes", []) - if not related_genes: - gene_name_temp = rna_data.get("gene_name", None) - if gene_name_temp: - related_genes = [gene_name_temp] - - # Extract gene name (prefer from xrefs, fallback to main data) - gene_name = extracted_info.get("gene_name") - if not gene_name: - gene_name = rna_data.get("gene_name", None) - if not gene_name: - gene_name = rna_data.get("gene", None) - - # Extract so_term (prefer from xrefs, fallback to main data) - so_term = extracted_info.get("so_term") - if not so_term: - so_term = rna_data.get("so_term", None) - - # Extract modifications (prefer from xrefs, fallback to main data) - modifications = extracted_info.get("modifications") - if not modifications: - modifications = rna_data.get("modifications", None) - - # Build result dictionary (xrefs information is already extracted into other fields) - # information is extracted into organism, gene_name, so_term, modifications, etc. return { "molecule_type": "RNA", "database": "RNAcentral", @@ -208,51 +140,52 @@ def _rna_data_to_dict(rna_id: str, rna_data: dict, xrefs_data: Optional[List[Dic "description": rna_data.get("description", "N/A"), "url": f"https://rnacentral.org/rna/{rna_id}", "organism": organism, - "related_genes": related_genes if related_genes else None, + "related_genes": related_genes or None, "gene_name": gene_name, "so_term": so_term, "modifications": modifications, } - async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: + @staticmethod + def _calculate_md5(sequence: str) -> str: + """ + Calculate MD5 hash for RNA sequence as per RNAcentral spec. 
+ - Replace U with T + - Convert to uppercase + - Encode as ASCII + """ + # Normalize sequence + normalized_seq = sequence.replace("U", "T").replace("u", "t").upper() + if not re.fullmatch(r"[ATCGN]+", normalized_seq): + raise ValueError(f"Invalid sequence characters after normalization: {normalized_seq[:50]}...") + + return hashlib.md5(normalized_seq.encode("ascii")).hexdigest() + + def get_by_rna_id(self, rna_id: str) -> Optional[dict]: """ Get RNA information by RNAcentral ID. :param rna_id: RNAcentral ID (e.g., URS0000000001). :return: A dictionary containing RNA information or None if not found. """ try: - async with aiohttp.ClientSession() as session: - url = f"{self.base_url}/rna/{rna_id}" - async with session.get( - url, headers=self.headers, timeout=aiohttp.ClientTimeout(total=30) - ) as resp: - if resp.status == 200: - rna_data = await resp.json() - - # Check if xrefs is a URL and fetch the actual xrefs data - xrefs_data = None - xrefs_url = rna_data.get("xrefs") - if xrefs_url and isinstance(xrefs_url, str) and xrefs_url.startswith("http"): - try: - xrefs_data = await self._fetch_all_xrefs(xrefs_url, session) - logger.debug("Fetched %d xrefs for RNA ID %s", len(xrefs_data), rna_id) - except Exception as e: - logger.warning("Failed to fetch xrefs for RNA ID %s: %s", rna_id, e) - # Continue without xrefs data - - return self._rna_data_to_dict(rna_id, rna_data, xrefs_data) - if resp.status == 404: - logger.error("RNA ID %s not found", rna_id) - return None - raise Exception(f"HTTP {resp.status}: {await resp.text()}") - except aiohttp.ClientError as e: + url = f"{self.base_url}/rna/{rna_id}" + url += "?flat=true" + + resp = requests.get(url, headers=self.headers, timeout=30) + if resp.status_code == 200: + rna_data = resp.json() + xrefs_data = rna_data.get("xrefs", []) + return self._rna_data_to_dict(rna_id, rna_data, xrefs_data) + logger.error("Failed to fetch RNA ID %s: HTTP %s", rna_id, resp.status_code) + return None + except requests.RequestException as e: logger.error("Network error getting RNA ID %s: %s", rna_id, e) return None - except Exception as exc: # pylint: disable=broad-except - logger.error("RNA ID %s not found: %s", rna_id, exc) + except Exception as e: # pylint: disable=broad-except + logger.error("Unexpected error getting RNA ID %s: %s", rna_id, e) return None - async def get_best_hit(self, keyword: str) -> Optional[dict]: + def get_best_hit(self, keyword: str) -> Optional[dict]: """ Search RNAcentral with a keyword and return the best hit. Unified approach: Find RNA ID from search, then call get_by_rna_id() for complete information. 
@@ -263,42 +196,35 @@ async def get_best_hit(self, keyword: str) -> Optional[dict]: return None try: - async with aiohttp.ClientSession() as session: - search_url = f"{self.base_url}/rna" - params = {"search": keyword, "format": "json"} - async with session.get( - search_url, - params=params, - headers=self.headers, - timeout=aiohttp.ClientTimeout(total=30), - ) as resp: - if resp.status == 200: - search_results = await resp.json() - results = search_results.get("results", []) - if results: - # Step 1: Get RNA ID from search results - first_result = results[0] - rna_id = first_result.get("rnacentral_id") - - if rna_id: - # Step 2: Unified call to get_by_rna_id() for complete information - result = await self.get_by_rna_id(rna_id) - - # Step 3: If get_by_rna_id() failed, use search result data as fallback - if not result: - logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) - result = self._rna_data_to_dict(rna_id, first_result) - - return result - logger.info("No results found for keyword: %s", keyword) - return None - error_text = await resp.text() - logger.error("HTTP %d error for keyword %s: %s", resp.status, keyword, error_text[:200]) - raise Exception(f"HTTP {resp.status}: {error_text}") + search_url = f"{self.base_url}/rna" + params = {"search": keyword, "format": "json"} + + resp = requests.get( + search_url, + params=params, + headers=self.headers, + timeout=30, + ) + if resp.status_code == 200: + search_results = resp.json() + results = search_results.get("results", []) + if results: + # Step 1: Get RNA ID from search results + first_result = results[0] + rna_id = first_result.get("rnacentral_id") + + if rna_id: + # Step 2: Unified call to get_by_rna_id() for complete information + return self.get_by_rna_id(rna_id) + # Step 3: If get_by_rna_id() failed, use search result data as fallback + logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) + return self._rna_data_to_dict(rna_id, first_result) + logger.error("No RNA ID found for keyword %s", keyword) + return None except aiohttp.ClientError as e: logger.error("Network error searching for keyword %s: %s", keyword, e) return None - except Exception as e: # pylint: disable=broad-except + except Exception as e: logger.error("Keyword %s not found: %s", keyword, e) return None @@ -339,16 +265,6 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]: logger.error("Local blastn failed: %s", exc) return None - @staticmethod - def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: - """Extract and normalize RNA sequence from input.""" - if sequence.startswith(">"): - seq_lines = sequence.strip().split("\n") - seq = "".join(seq_lines[1:]) - else: - seq = sequence.strip().replace(" ", "").replace("\n", "") - return seq if seq and re.fullmatch(r"[AUCGN\s]+", seq, re.I) else None - def _find_best_match_from_results(self, results: List[Dict], seq: str) -> Optional[Dict]: """Find best match from search results, preferring exact match.""" exact_match = None @@ -382,7 +298,7 @@ async def _process_api_search_results( result = self._rna_data_to_dict(rna_id, target_result) return result - async def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: + def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: """ Search RNAcentral with an RNA sequence. Tries local BLAST first if enabled, falls back to RNAcentral API. 
@@ -391,9 +307,17 @@ async def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Op :param threshold: E-value threshold for BLAST search. :return: A dictionary containing complete RNA information or None if not found. """ - result = None + def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: + """Extract and normalize RNA sequence from input.""" + if sequence.startswith(">"): + seq_lines = sequence.strip().split("\n") + seq = "".join(seq_lines[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + return seq if seq and re.fullmatch(r"[AUCGN\s]+", seq, re.I) else None + try: - seq = self._extract_and_normalize_sequence(sequence) + seq = _extract_and_normalize_sequence(sequence) if not seq: logger.error("Empty or invalid RNA sequence provided.") return None @@ -403,35 +327,27 @@ async def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Op accession = self._local_blast(seq, threshold) if accession: logger.debug("Local BLAST found accession: %s", accession) - result = await self.get_by_rna_id(accession) - if not result: - result = await self.get_best_hit(accession) + return self.get_by_rna_id(accession) # Fall back to RNAcentral API if local BLAST didn't find result - if not result: - logger.debug("Falling back to RNAcentral API.") - async with aiohttp.ClientSession() as session: - search_url = f"{self.base_url}/rna" - params = {"sequence": seq, "format": "json"} - async with session.get( - search_url, - params=params, - headers=self.headers, - timeout=aiohttp.ClientTimeout(total=60), # Sequence search may take longer - ) as resp: - if resp.status == 200: - search_results = await resp.json() - results = search_results.get("results", []) - result = await self._process_api_search_results(results, seq) - else: - error_text = await resp.text() - logger.error("HTTP %d error for sequence search: %s", resp.status, error_text[:200]) - raise Exception(f"HTTP {resp.status}: {error_text}") - except aiohttp.ClientError as e: - logger.error("Network error searching for sequence: %s", e) + logger.debug("Falling back to RNAcentral API.") + + md5_hash = self._calculate_md5(seq) + search_url = f"{self.base_url}/rna" + params = {"md5": md5_hash, "format": "json"} + + resp = requests.get(search_url, params=params, headers=self.headers, timeout=60) # Sequence search may take longer + if resp.status_code == 200: + search_results = resp.json() + results = search_results.get("results", []) + return self._process_api_search_results(results, seq) + error_text = resp.text() + logger.error("HTTP %d error for sequence search: %s", resp.status, error_text[:200]) + raise Exception(f"HTTP {resp.status}: {error_text}") except Exception as e: # pylint: disable=broad-except logger.error("Sequence search failed: %s", e) - return result + return None + @retry( stop=stop_after_attempt(3), @@ -458,17 +374,19 @@ async def search( logger.debug("RNAcentral search query: %s", query) + loop = asyncio.get_running_loop() + # check if RNA sequence (AUCG characters, contains U) if query.startswith(">") or ( re.fullmatch(r"[AUCGN\s]+", query, re.I) and "U" in query.upper() ): - result = await self.search_by_sequence(query, threshold) + result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold) # check if RNAcentral ID (typically starts with URS) elif re.fullmatch(r"URS\d+", query, re.I): - result = await self.get_by_rna_id(query) + result = await loop.run_in_executor(_get_pool(), self.get_by_rna_id, query) else: # otherwise treat as keyword 
- result = await self.get_best_hit(query) + result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) if result: result["_search_query"] = query From e1530f967e0e3bc5dbcf575872bc525d4d6b6353 Mon Sep 17 00:00:00 2001 From: chenzihong <522023320011@smail.nju.edu.cn> Date: Mon, 1 Dec 2025 14:18:29 +0800 Subject: [PATCH 19/22] perf: optimize code style --- .../models/searcher/db/rnacentral_searcher.py | 190 +++++++----------- 1 file changed, 69 insertions(+), 121 deletions(-) diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index afbf0ca7..4b288d9b 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -46,12 +46,24 @@ def __init__(self, use_local_blast: bool = False, local_blast_db: str = "rna_db" self.use_local_blast = False @staticmethod - def _extract_info_from_xrefs(xrefs: List[Dict[str, Any]]) -> Dict[str, Any]: - organisms: Set[str] = set() - gene_names: Set[str] = set() + def _rna_data_to_dict( + rna_id: str, + rna_data: Dict[str, Any], + xrefs_data: Optional[List[Dict[str, Any]]] = None + ) -> Dict[str, Any]: + organisms, gene_names, so_terms = set(), set(), set() modifications: List[Any] = [] - so_terms: Set[str] = set() - xrefs_list: List[Dict[str, Any]] = [] + + for xref in xrefs_data or []: + acc = xref.get("accession", {}) + if s := acc.get("species"): + organisms.add(s) + if g := acc.get("gene", "").strip(): + gene_names.add(g) + if m := xref.get("modifications"): + modifications.extend(m) + if b := acc.get("biotype"): + so_terms.add(b) def format_unique_values(values: Set[str]) -> Optional[str]: if not values: @@ -60,44 +72,14 @@ def format_unique_values(values: Set[str]) -> Optional[str]: return next(iter(values)) return ", ".join(sorted(values)) - for xref in xrefs: - accession = xref.get("accession", {}) - species = accession.get("species") - gene = accession.get("gene") - stripped_gene = gene.strip() if gene else None - if species: - organisms.add(species) - if stripped_gene: - gene_names.add(stripped_gene) - if mods := xref.get("modifications"): - modifications.extend(mods) - if biotype := accession.get("biotype"): - so_terms.add(biotype) - - xrefs_list.append({ - "database": xref.get("database"), - "accession_id": accession.get("id"), - "external_id": accession.get("external_id"), - "description": accession.get("description"), - "species": species, - "gene": stripped_gene, - }) - - return { + xrefs_info = { "organism": format_unique_values(organisms), "gene_name": format_unique_values(gene_names), "related_genes": list(gene_names) if gene_names else None, "modifications": modifications or None, "so_term": format_unique_values(so_terms), - "xrefs": xrefs_list or None, } - @staticmethod - def _rna_data_to_dict( - rna_id: str, - rna_data: Dict[str, Any], - xrefs_data: Optional[List[Dict[str, Any]]] = None - ) -> Dict[str, Any]: fallback_rules = { "organism": ["organism", "species"], "related_genes": ["related_genes", "genes"], @@ -106,10 +88,8 @@ def _rna_data_to_dict( "modifications": ["modifications"], } - xrefs_info = RNACentralSearch._extract_info_from_xrefs(xrefs_data) if xrefs_data else {} - def resolve_field(field_name: str) -> Any: - if value := xrefs_info.get(field_name): + if (value := xrefs_info.get(field_name)) is not None: return value for key in fallback_rules[field_name]: @@ -128,7 +108,6 @@ def resolve_field(field_name: str) -> Any: related_genes = [single_gene] sequence = rna_data.get("sequence", "") - 
extracted_info = RNACentralSearch._extract_info_from_xrefs(xrefs_data) if xrefs_data else {} return { "molecule_type": "RNA", @@ -172,13 +151,12 @@ def get_by_rna_id(self, rna_id: str) -> Optional[dict]: url = f"{self.base_url}/rna/{rna_id}" url += "?flat=true" - resp = requests.get(url, headers=self.headers, timeout=30) - if resp.status_code == 200: - rna_data = resp.json() - xrefs_data = rna_data.get("xrefs", []) - return self._rna_data_to_dict(rna_id, rna_data, xrefs_data) - logger.error("Failed to fetch RNA ID %s: HTTP %s", rna_id, resp.status_code) - return None + resp = requests.get(url, headers=self.headers) + resp.raise_for_status() + + rna_data = resp.json() + xrefs_data = rna_data.get("xrefs", []) + return self._rna_data_to_dict(rna_id, rna_data, xrefs_data) except requests.RequestException as e: logger.error("Network error getting RNA ID %s: %s", rna_id, e) return None @@ -189,44 +167,42 @@ def get_by_rna_id(self, rna_id: str) -> Optional[dict]: def get_best_hit(self, keyword: str) -> Optional[dict]: """ Search RNAcentral with a keyword and return the best hit. - Unified approach: Find RNA ID from search, then call get_by_rna_id() for complete information. :param keyword: The search keyword (e.g., miRNA name, RNA name). - :return: A dictionary containing complete RNA information or None if not found. + :return: Dictionary with RNA information or None. """ - if not keyword.strip(): + keyword = keyword.strip() + if not keyword: + logger.warning("Empty keyword provided to get_best_hit") return None try: - search_url = f"{self.base_url}/rna" + url = f"{self.base_url}/rna" params = {"search": keyword, "format": "json"} + resp = requests.get(url, params=params, headers=self.headers) + resp.raise_for_status() - resp = requests.get( - search_url, - params=params, - headers=self.headers, - timeout=30, - ) - if resp.status_code == 200: - search_results = resp.json() - results = search_results.get("results", []) - if results: - # Step 1: Get RNA ID from search results - first_result = results[0] - rna_id = first_result.get("rnacentral_id") - - if rna_id: - # Step 2: Unified call to get_by_rna_id() for complete information - return self.get_by_rna_id(rna_id) - # Step 3: If get_by_rna_id() failed, use search result data as fallback - logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) - return self._rna_data_to_dict(rna_id, first_result) - logger.error("No RNA ID found for keyword %s", keyword) - return None - except aiohttp.ClientError as e: - logger.error("Network error searching for keyword %s: %s", keyword, e) + data = resp.json() + results = data.get("results", []) + + if not results: + logger.info("No search results for keyword: %s", keyword) + return None + + first_result = results[0] + rna_id = first_result.get("rnacentral_id") + + if rna_id: + detailed = self.get_by_rna_id(rna_id) + if detailed: + return detailed + logger.debug("Using search result data for %s", rna_id or "unknown") + return self._rna_data_to_dict(rna_id or "", first_result) + + except requests.RequestException as e: + logger.error("Network error searching keyword '%s': %s", keyword, e) return None except Exception as e: - logger.error("Keyword %s not found: %s", keyword, e) + logger.error("Unexpected error searching keyword '%s': %s", keyword, e) return None def _local_blast(self, seq: str, threshold: float) -> Optional[str]: @@ -248,39 +224,6 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]: logger.error("Local blastn failed: %s", exc) return None - def 
_find_best_match_from_results(self, results: List[Dict], seq: str) -> Optional[Dict]: - """Find best match from search results, preferring exact match.""" - exact_match = None - for result_item in results: - result_seq = result_item.get("sequence", "") - if result_seq == seq: - exact_match = result_item - break - return exact_match if exact_match else (results[0] if results else None) - - async def _process_api_search_results( - self, results: List[Dict], seq: str - ) -> Optional[dict]: - """Process API search results and return dictionary or None.""" - if not results: - logger.info("No results found for sequence.") - return None - - target_result = self._find_best_match_from_results(results, seq) - if not target_result: - return None - - rna_id = target_result.get("rnacentral_id") - if not rna_id: - return None - - # Try to get complete information - result = await self.get_by_rna_id(rna_id) - if not result: - logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) - result = self._rna_data_to_dict(rna_id, target_result) - return result - def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: """ Search RNAcentral with an RNA sequence. @@ -290,7 +233,7 @@ def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict] :param threshold: E-value threshold for BLAST search. :return: A dictionary containing complete RNA information or None if not found. """ - def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: + def _extract_sequence(sequence: str) -> Optional[str]: """Extract and normalize RNA sequence from input.""" if sequence.startswith(">"): seq_lines = sequence.strip().split("\n") @@ -300,7 +243,7 @@ def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: return seq if seq and re.fullmatch(r"[AUCGN\s]+", seq, re.I) else None try: - seq = _extract_and_normalize_sequence(sequence) + seq = _extract_sequence(sequence) if not seq: logger.error("Empty or invalid RNA sequence provided.") return None @@ -319,19 +262,24 @@ def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: search_url = f"{self.base_url}/rna" params = {"md5": md5_hash, "format": "json"} - resp = requests.get(search_url, params=params, headers=self.headers, timeout=60) # Sequence search may take longer - if resp.status_code == 200: - search_results = resp.json() - results = search_results.get("results", []) - return self._process_api_search_results(results, seq) - error_text = resp.text() - logger.error("HTTP %d error for sequence search: %s", resp.status, error_text[:200]) - raise Exception(f"HTTP {resp.status}: {error_text}") - except Exception as e: # pylint: disable=broad-except + resp = requests.get(search_url, params=params, headers=self.headers) + resp.raise_for_status() + + search_results = resp.json() + results = search_results.get("results", []) + + if not results: + logger.info("No exact match found in RNAcentral for sequence") + return None + rna_id = results[0].get("rnacentral_id") + if not rna_id: + logger.error("No RNAcentral ID found in search results.") + return None + return self.get_by_rna_id(rna_id) + except Exception as e: logger.error("Sequence search failed: %s", e) return None - @retry( stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10), From 61f7f4446f08639d8243c3ff103a6920fff44c37 Mon Sep 17 00:00:00 2001 From: chenzihong <522023320011@smail.nju.edu.cn> Date: Mon, 1 Dec 2025 14:23:24 +0800 Subject: [PATCH 20/22] fix: fix lint problems --- 
graphgen/models/searcher/db/rnacentral_searcher.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index 4b288d9b..58c5e86e 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -48,7 +48,7 @@ def __init__(self, use_local_blast: bool = False, local_blast_db: str = "rna_db" @staticmethod def _rna_data_to_dict( rna_id: str, - rna_data: Dict[str, Any], + rna_data: Dict[str, Any], xrefs_data: Optional[List[Dict[str, Any]]] = None ) -> Dict[str, Any]: organisms, gene_names, so_terms = set(), set(), set() @@ -151,7 +151,7 @@ def get_by_rna_id(self, rna_id: str) -> Optional[dict]: url = f"{self.base_url}/rna/{rna_id}" url += "?flat=true" - resp = requests.get(url, headers=self.headers) + resp = requests.get(url, headers=self.headers, timeout=30) resp.raise_for_status() rna_data = resp.json() @@ -178,7 +178,7 @@ def get_best_hit(self, keyword: str) -> Optional[dict]: try: url = f"{self.base_url}/rna" params = {"search": keyword, "format": "json"} - resp = requests.get(url, params=params, headers=self.headers) + resp = requests.get(url, params=params, headers=self.headers, timeout=30) resp.raise_for_status() data = resp.json() @@ -262,7 +262,7 @@ def _extract_sequence(sequence: str) -> Optional[str]: search_url = f"{self.base_url}/rna" params = {"md5": md5_hash, "format": "json"} - resp = requests.get(search_url, params=params, headers=self.headers) + resp = requests.get(search_url, params=params, headers=self.headers, timeout=60) resp.raise_for_status() search_results = resp.json() From 2c00b9e750450aa13875f81bba1fb5328f62abb9 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Mon, 1 Dec 2025 16:25:02 +0800 Subject: [PATCH 21/22] fix: search setup problems --- graphgen/graphgen.py | 2 +- graphgen/models/searcher/db/ncbi_searcher.py | 3 +++ requirements.txt | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py index 167981e9..bc7e7742 100644 --- a/graphgen/graphgen.py +++ b/graphgen/graphgen.py @@ -45,7 +45,7 @@ def __init__( # llm self.tokenizer_instance: Tokenizer = tokenizer_instance or Tokenizer( - model_name=os.getenv("TOKENIZER_MODEL") + model_name=os.getenv("TOKENIZER_MODEL", "cl100k_base") ) self.synthesizer_llm_client: BaseLLMWrapper = ( diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 655ea4fd..ae06db3d 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -44,6 +44,7 @@ def __init__( local_blast_db: str = "nt_db", email: str = "email@example.com", api_key: str = "", + tool: str = "GraphGen", ): """ Initialize the NCBI Search client. @@ -53,10 +54,12 @@ def __init__( local_blast_db (str): Path to the local BLAST database. email (str): Email address for NCBI API requests. api_key (str): API key for NCBI API requests, see https://account.ncbi.nlm.nih.gov/settings/. + tool (str): Tool name for NCBI API requests. 
""" super().__init__() Entrez.timeout = 60 # 60 seconds timeout Entrez.email = email + Entrez.tool = tool if api_key: Entrez.api_key = api_key Entrez.max_tries = 10 if api_key else 3 diff --git a/requirements.txt b/requirements.txt index 47965013..fa2b1efc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,6 +21,7 @@ fastapi trafilatura aiohttp diskcache +socksio leidenalg igraph From 6d0be7aa2713e5d18dc7a50f0c5be4c8db8049dc Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Mon, 1 Dec 2025 18:21:00 +0800 Subject: [PATCH 22/22] feat: more examples in search demo --- graphgen/models/searcher/db/ncbi_searcher.py | 25 ++++++++++++------- .../input_examples/search_dna_demo.jsonl | 4 +++ .../input_examples/search_protein_demo.jsonl | 5 ++++ .../input_examples/search_rna_demo.jsonl | 1 + 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index ae06db3d..0de8ecc0 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -27,6 +27,10 @@ def _get_pool(): return ThreadPoolExecutor(max_workers=10) +# ensure only one NCBI request at a time +_ncbi_lock = asyncio.Lock() + + class NCBISearch(BaseSearcher): """ NCBI Search client to search DNA/GenBank/Entrez databases. @@ -236,6 +240,7 @@ def _extract_gene_id(link_handle): return str(link.get("Id") if isinstance(link, dict) else link) try: + # TODO: support accession number with version number (e.g., NM_000546.3) with Entrez.elink(dbfrom="nuccore", db="gene", id=accession) as link_handle: gene_id = _extract_gene_id(link_handle) @@ -368,15 +373,17 @@ async def search(self, query: str, threshold: float = 0.01, **kwargs) -> Optiona loop = asyncio.get_running_loop() - # Auto-detect query type and execute in thread pool - if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I): - result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold) - elif re.fullmatch(r"^\d+$", query): - result = await loop.run_in_executor(_get_pool(), self.get_by_gene_id, query) - elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I): - result = await loop.run_in_executor(_get_pool(), self.get_by_accession, query) - else: - result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) + # limit concurrent requests (NCBI rate limit: max 3 requests per second) + async with _ncbi_lock: + # Auto-detect query type and execute in thread pool + if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I): + result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold) + elif re.fullmatch(r"^\d+$", query): + result = await loop.run_in_executor(_get_pool(), self.get_by_gene_id, query) + elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I): + result = await loop.run_in_executor(_get_pool(), self.get_by_accession, query) + else: + result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) if result: result["_search_query"] = query diff --git a/resources/input_examples/search_dna_demo.jsonl b/resources/input_examples/search_dna_demo.jsonl index 387c87b8..346b65f0 100644 --- a/resources/input_examples/search_dna_demo.jsonl +++ b/resources/input_examples/search_dna_demo.jsonl @@ -1,5 +1,9 @@ {"type": "text", "content": "TP53"} {"type": "text", "content": "BRCA1"} {"type": "text", "content": "672"} +{"type": "text", "content": "11998"} {"type": "text", "content": "NM_000546"} +{"type": "text", "content": "NM_024140"} 
+{"type": "text", "content": ">query\nCTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"} {"type": "text", "content": 
"CTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"} + diff --git a/resources/input_examples/search_protein_demo.jsonl b/resources/input_examples/search_protein_demo.jsonl index 82b77836..e119cec8 100644 --- a/resources/input_examples/search_protein_demo.jsonl +++ b/resources/input_examples/search_protein_demo.jsonl @@ -2,6 +2,11 @@ {"type": "text", "content": "P68871"} {"type": "text", "content": "P02768"} {"type": "text", "content": "P04637"} +{"type": "text", "content": "insulin"} +{"type": "text", "content": "hemoglobin"} +{"type": "text", "content": "p53"} +{"type": "text", "content": "BRCA1"} +{"type": "text", "content": "albumin"} {"type": "text", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} {"type": "text", "content": 
"MGSSHHHHHHSQDLENLYFQGSMNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRTKRVITTFRTGTWDAYKNLRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV"} {"type": "text", "content": "MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDAAAVEATWIAKIKAAKSKYEAEAIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKERGVKVLHLQA"} diff --git a/resources/input_examples/search_rna_demo.jsonl b/resources/input_examples/search_rna_demo.jsonl index caa28612..16e99479 100644 --- a/resources/input_examples/search_rna_demo.jsonl +++ b/resources/input_examples/search_rna_demo.jsonl @@ -1,4 +1,5 @@ {"type": "text", "content": "hsa-let-7a-1"} {"type": "text", "content": "URS0000123456"} {"type": "text", "content": "URS0000000001"} +{"type": "text", "content": ">query\nCUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} {"type": "text", "content": "CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"}