From 0da7ce1036fa8c277563b2c9407ecbf92e5ccafa Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Wed, 26 Nov 2025 20:21:03 +0800 Subject: [PATCH 01/22] feat: Add DNA and RNA search functionality - Add NCBISearch searcher for DNA/GenBank/Entrez database search - Add RNACentralSearch searcher for RNA database search - Update search_all.py to support ncbi and rnacentral data sources - Add search configs for DNA, RNA, and protein (renamed from search_config) - Add search scripts for DNA and RNA - Add demo input files for DNA and RNA search - Update search_uniprot.sh to use search_protein_config.yaml This PR extends the search functionality to support biological data types: - Protein search (existing, now explicitly named) - DNA search via NCBI - RNA search via RNAcentral --- graphgen/configs/search_dna_config.yaml | 15 + ...config.yaml => search_protein_config.yaml} | 2 +- graphgen/configs/search_rna_config.yaml | 14 + graphgen/models/__init__.py | 2 + graphgen/models/searcher/db/ncbi_searcher.py | 296 ++++++++++++++++++ .../models/searcher/db/rnacentral_searcher.py | 191 +++++++++++ graphgen/operators/search/search_all.py | 41 ++- .../input_examples/search_dna_demo.jsonl | 4 + ...h_demo.jsonl => search_protein_demo.jsonl} | 0 scripts/search/search_dna.sh | 4 + scripts/search/search_rna.sh | 4 + scripts/search/search_uniprot.sh | 2 +- 12 files changed, 568 insertions(+), 7 deletions(-) create mode 100644 graphgen/configs/search_dna_config.yaml rename graphgen/configs/{search_config.yaml => search_protein_config.yaml} (71%) create mode 100644 graphgen/configs/search_rna_config.yaml create mode 100644 graphgen/models/searcher/db/ncbi_searcher.py create mode 100644 graphgen/models/searcher/db/rnacentral_searcher.py create mode 100644 resources/input_examples/search_dna_demo.jsonl rename resources/input_examples/{search_demo.jsonl => search_protein_demo.jsonl} (100%) create mode 100644 scripts/search/search_dna.sh create mode 100644 scripts/search/search_rna.sh diff --git a/graphgen/configs/search_dna_config.yaml b/graphgen/configs/search_dna_config.yaml new file mode 100644 index 00000000..95f8fc39 --- /dev/null +++ b/graphgen/configs/search_dna_config.yaml @@ -0,0 +1,15 @@ +pipeline: + - name: read_step + op_key: read + params: + input_file: resources/input_examples/search_dna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples + + - name: search_step + op_key: search + deps: [read_step] # search_step depends on read_step + params: + data_sources: [ncbi] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral + ncbi_params: + email: test@example.com # NCBI requires an email address + tool: GraphGen # tool name for NCBI API + diff --git a/graphgen/configs/search_config.yaml b/graphgen/configs/search_protein_config.yaml similarity index 71% rename from graphgen/configs/search_config.yaml rename to graphgen/configs/search_protein_config.yaml index 63ebd241..bb46d34c 100644 --- a/graphgen/configs/search_config.yaml +++ b/graphgen/configs/search_protein_config.yaml @@ -2,7 +2,7 @@ pipeline: - name: read_step op_key: read params: - input_file: resources/input_examples/search_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples + input_file: resources/input_examples/search_protein_demo.jsonl # input file path, support json, jsonl, txt, pdf. 
See resources/input_examples for examples - name: search_step op_key: search diff --git a/graphgen/configs/search_rna_config.yaml b/graphgen/configs/search_rna_config.yaml new file mode 100644 index 00000000..3d051417 --- /dev/null +++ b/graphgen/configs/search_rna_config.yaml @@ -0,0 +1,14 @@ +pipeline: + - name: read_step + op_key: read + params: + input_file: resources/input_examples/search_rna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples + + - name: search_step + op_key: search + deps: [read_step] # search_step depends on read_step + params: + data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral + rnacentral_params: + {} # RNAcentral doesn't require additional parameters currently + diff --git a/graphgen/models/__init__.py b/graphgen/models/__init__.py index 68fd2a5d..bb73548d 100644 --- a/graphgen/models/__init__.py +++ b/graphgen/models/__init__.py @@ -26,6 +26,8 @@ RDFReader, TXTReader, ) +from .searcher.db.ncbi_searcher import NCBISearch +from .searcher.db.rnacentral_searcher import RNACentralSearch from .searcher.db.uniprot_searcher import UniProtSearch from .searcher.kg.wiki_search import WikiSearch from .searcher.web.bing_search import BingSearch diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py new file mode 100644 index 00000000..73a9ad87 --- /dev/null +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -0,0 +1,296 @@ +import asyncio +import re +import time +from concurrent.futures import ThreadPoolExecutor +from functools import lru_cache +from http.client import IncompleteRead +from typing import Dict, Optional + +from Bio import Entrez +from requests.exceptions import RequestException +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from graphgen.bases import BaseSearcher +from graphgen.utils import logger + + +@lru_cache(maxsize=None) +def _get_pool(): + return ThreadPoolExecutor(max_workers=10) + + +class NCBISearch(BaseSearcher): + """ + NCBI Search client to search DNA/GenBank/Entrez databases. + 1) Get the gene/DNA by accession number or gene ID. + 2) Search with keywords or gene names (fuzzy search). + 3) Search with FASTA sequence (BLAST search for DNA sequences). + + API Documentation: https://www.ncbi.nlm.nih.gov/home/develop/api/ + Note: NCBI has rate limits (max 3 requests per second), delays are required between requests. + """ + + def __init__(self, email: str = "test@example.com", tool: str = "GraphGen"): + super().__init__() + Entrez.email = email + Entrez.tool = tool + Entrez.timeout = 60 # 60 seconds timeout + + def get_by_gene_id(self, gene_id: str) -> Optional[dict]: + """ + Get gene information by Gene ID. + :param gene_id: NCBI Gene ID. + :return: A dictionary containing gene information or None if not found. 
+ """ + try: + time.sleep(0.35) # Comply with rate limit (max 3 requests per second) + handle = Entrez.efetch(db="gene", id=gene_id, retmode="xml") + try: + gene_record = Entrez.read(handle) + if not gene_record: + return None + + gene_data = gene_record[0] + gene_ref = gene_data.get("Entrezgene_gene", {}).get("Gene-ref", {}) + + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": gene_id, + "gene_name": gene_ref.get("Gene-ref_locus", "N/A"), + "gene_description": gene_ref.get("Gene-ref_desc", "N/A"), + "organism": gene_data.get("Entrezgene_source", {}).get("BioSource", {}).get("BioSource_org", {}).get("Org-ref", {}).get("Org-ref_taxname", "N/A"), + "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}", + } + finally: + handle.close() + except RequestException: + raise + except Exception as exc: # pylint: disable=broad-except + logger.error("Gene ID %s not found: %s", gene_id, exc) + return None + + def get_by_accession(self, accession: str) -> Optional[dict]: + """ + Get sequence information by accession number. + :param accession: NCBI accession number (e.g., NM_000546). + :return: A dictionary containing sequence information or None if not found. + """ + try: + time.sleep(0.35) # 遵守速率限制 + handle = Entrez.efetch( + db="nuccore", + id=accession, + rettype="fasta", + retmode="text", + ) + try: + sequence_data = handle.read() + if not sequence_data: + return None + + seq_lines = sequence_data.strip().split("\n") + header = seq_lines[0] if seq_lines else "" + sequence = "".join(seq_lines[1:]) + + # Try to get more information + time.sleep(0.35) + summary_handle = Entrez.esummary(db="nuccore", id=accession) + try: + summary = Entrez.read(summary_handle) + if summary: + summary_data = summary[0] + title = summary_data.get("Title", header) + organism = summary_data.get("Organism", "N/A") + else: + title = header + organism = "N/A" + finally: + summary_handle.close() + + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": accession, + "title": title, + "organism": organism, + "sequence": sequence, + "sequence_length": len(sequence), + "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}", + } + finally: + handle.close() + except RequestException: + raise + except Exception as exc: # pylint: disable=broad-except + logger.error("Accession %s not found: %s", accession, exc) + return None + + def search_by_keyword(self, keyword: str) -> Optional[dict]: + """ + Search NCBI Gene database with a keyword and return the best hit. + :param keyword: The search keyword (e.g., gene name). + :return: A dictionary containing the best hit information or None if not found. 
+ """ + if not keyword.strip(): + return None + + try: + time.sleep(0.35) # 遵守速率限制 + # Search gene database + search_handle = Entrez.esearch( + db="gene", + term=f"{keyword}[Gene Name] OR {keyword}[All Fields]", + retmax=1, + ) + try: + search_results = Entrez.read(search_handle) + if not search_results.get("IdList"): + # If not found, try a broader search + time.sleep(0.35) + search_handle2 = Entrez.esearch( + db="gene", + term=keyword, + retmax=1, + ) + try: + search_results = Entrez.read(search_handle2) + finally: + search_handle2.close() + + if search_results.get("IdList"): + gene_id = search_results["IdList"][0] + return self.get_by_gene_id(gene_id) + finally: + search_handle.close() + except RequestException: + raise + except Exception as e: # pylint: disable=broad-except + logger.error("Keyword %s not found: %s", keyword, e) + return None + + def search_by_sequence(self, sequence: str) -> Optional[dict]: + """ + Search NCBI with a DNA sequence using BLAST. + Note: This is a simplified version. For production, consider using local BLAST. + :param sequence: DNA sequence (FASTA format or raw sequence). + :return: A dictionary containing the best hit information or None if not found. + """ + try: + # Extract sequence (if in FASTA format) + if sequence.startswith(">"): + seq_lines = sequence.strip().split("\n") + seq = "".join(seq_lines[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + + # Validate if it's a DNA sequence + if not re.fullmatch(r"[ATCGN\s]+", seq, re.I): + logger.error("Invalid DNA sequence provided.") + return None + + if not seq: + logger.error("Empty DNA sequence provided.") + return None + + # Use BLAST search (Note: requires network connection, may be slow) + logger.debug("Performing BLAST search for DNA sequence...") + time.sleep(0.35) + from Bio.Blast import NCBIWWW, NCBIXML + + result_handle = NCBIWWW.qblast( + program="blastn", + database="nr", + sequence=seq, + hitlist_size=1, + expect=0.001, + ) + blast_record = NCBIXML.read(result_handle) + + if not blast_record.alignments: + logger.info("No BLAST hits found for the given sequence.") + return None + + best_alignment = blast_record.alignments[0] + best_hsp = best_alignment.hsps[0] + hit_id = best_alignment.hit_id + + # Extract accession number + # Format may be: gi|123456|ref|NM_000546.5| + accession_match = re.search(r"ref\|([^|]+)", hit_id) + if accession_match: + accession = accession_match.group(1).split(".")[0] + return self.get_by_accession(accession) + else: + # If unable to extract accession, return basic information + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": hit_id, + "title": best_alignment.title, + "sequence_length": len(seq), + "e_value": best_hsp.expect, + "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0, + "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}", + } + except RequestException: + raise + except Exception as e: # pylint: disable=broad-except + logger.error("BLAST search failed: %s", e) + return None + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) + async def search( + self, query: str, **kwargs + ) -> Optional[Dict]: + """ + Search NCBI with either a gene ID, accession number, keyword, or DNA sequence. + :param query: The search query (gene ID, accession, keyword, or DNA sequence). + :param kwargs: Additional keyword arguments (not used currently). 
+ :return: A dictionary containing the search results or None if not found. + """ + # auto detect query type + if not query or not isinstance(query, str): + logger.error("Empty or non-string input.") + return None + query = query.strip() + + logger.debug("NCBI search query: %s", query) + + loop = asyncio.get_running_loop() + + # check if DNA sequence (ATCG characters) + if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I): + result = await loop.run_in_executor( + _get_pool(), self.search_by_sequence, query + ) + # check if gene ID (numeric) + elif re.fullmatch(r"^\d+$", query): + result = await loop.run_in_executor( + _get_pool(), self.get_by_gene_id, query + ) + # check if accession number (e.g., NM_000546, NC_000001) + elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I): + result = await loop.run_in_executor( + _get_pool(), self.get_by_accession, query + ) + else: + # otherwise treat as keyword + result = await loop.run_in_executor( + _get_pool(), self.search_by_keyword, query + ) + + if result: + result["_search_query"] = query + return result + diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py new file mode 100644 index 00000000..d1decd6d --- /dev/null +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -0,0 +1,191 @@ +import asyncio +import re +from concurrent.futures import ThreadPoolExecutor +from functools import lru_cache +from typing import Dict, Optional + +import aiohttp +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from graphgen.bases import BaseSearcher +from graphgen.utils import logger + + +@lru_cache(maxsize=None) +def _get_pool(): + return ThreadPoolExecutor(max_workers=10) + + +class RNACentralSearch(BaseSearcher): + """ + RNAcentral Search client to search RNA databases. + 1) Get RNA by RNAcentral ID. + 2) Search with keywords or RNA names (fuzzy search). + 3) Search with RNA sequence. + + API Documentation: https://rnacentral.org/api/v1 + """ + + def __init__(self): + super().__init__() + self.base_url = "https://rnacentral.org/api/v1" + self.headers = {"Accept": "application/json"} + + async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: + """ + Get RNA information by RNAcentral ID. + :param rna_id: RNAcentral ID (e.g., URS0000000001). + :return: A dictionary containing RNA information or None if not found. + """ + try: + async with aiohttp.ClientSession() as session: + url = f"{self.base_url}/rna/{rna_id}" + async with session.get( + url, headers=self.headers, timeout=aiohttp.ClientTimeout(total=30) + ) as resp: + if resp.status == 200: + rna_data = await resp.json() + return { + "molecule_type": "RNA", + "database": "RNAcentral", + "id": rna_id, + "rnacentral_id": rna_data.get("rnacentral_id", "N/A"), + "sequence": rna_data.get("sequence", ""), + "sequence_length": len(rna_data.get("sequence", "")), + "rna_type": rna_data.get("rna_type", "N/A"), + "description": rna_data.get("description", "N/A"), + "url": f"https://rnacentral.org/rna/{rna_id}", + } + elif resp.status == 404: + logger.error("RNA ID %s not found", rna_id) + return None + else: + raise Exception(f"HTTP {resp.status}: {await resp.text()}") + except Exception as exc: # pylint: disable=broad-except + logger.error("RNA ID %s not found: %s", rna_id, exc) + return None + + async def search_by_keyword(self, keyword: str) -> Optional[dict]: + """ + Search RNAcentral with a keyword and return the best hit. 
+ :param keyword: The search keyword (e.g., miRNA name, RNA name). + :return: A dictionary containing the best hit information or None if not found. + """ + if not keyword.strip(): + return None + + try: + async with aiohttp.ClientSession() as session: + search_url = f"{self.base_url}/rna" + params = {"search": keyword, "format": "json"} + async with session.get( + search_url, + params=params, + headers=self.headers, + timeout=aiohttp.ClientTimeout(total=30), + ) as resp: + if resp.status == 200: + search_results = await resp.json() + if search_results.get("results"): + rna_id = search_results["results"][0].get("rnacentral_id") + if rna_id: + return await self.get_by_rna_id(rna_id) + logger.info("No results found for keyword: %s", keyword) + return None + else: + raise Exception(f"HTTP {resp.status}: {await resp.text()}") + except Exception as e: # pylint: disable=broad-except + logger.error("Keyword %s not found: %s", keyword, e) + return None + + async def search_by_sequence(self, sequence: str) -> Optional[dict]: + """ + Search RNAcentral with an RNA sequence. + :param sequence: RNA sequence (FASTA format or raw sequence). + :return: A dictionary containing the best hit information or None if not found. + """ + try: + # Extract sequence (if in FASTA format) + if sequence.startswith(">"): + seq_lines = sequence.strip().split("\n") + seq = "".join(seq_lines[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + + # Validate if it's an RNA sequence (contains U instead of T) + if not re.fullmatch(r"[AUCGN\s]+", seq, re.I): + logger.error("Invalid RNA sequence provided.") + return None + + if not seq: + logger.error("Empty RNA sequence provided.") + return None + + # RNAcentral API supports sequence search + async with aiohttp.ClientSession() as session: + search_url = f"{self.base_url}/rna" + params = {"sequence": seq, "format": "json"} + async with session.get( + search_url, + params=params, + headers=self.headers, + timeout=aiohttp.ClientTimeout(total=60), # Sequence search may take longer + ) as resp: + if resp.status == 200: + search_results = await resp.json() + if search_results.get("results"): + rna_id = search_results["results"][0].get("rnacentral_id") + if rna_id: + return await self.get_by_rna_id(rna_id) + logger.info("No results found for sequence.") + return None + else: + raise Exception(f"HTTP {resp.status}: {await resp.text()}") + except Exception as e: # pylint: disable=broad-except + logger.error("Sequence search failed: %s", e) + return None + + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=10), + retry=retry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError)), + reraise=True, + ) + async def search( + self, query: str, **kwargs + ) -> Optional[Dict]: + """ + Search RNAcentral with either an RNAcentral ID, keyword, or RNA sequence. + :param query: The search query (RNAcentral ID, keyword, or RNA sequence). + :param kwargs: Additional keyword arguments (not used currently). + :return: A dictionary containing the search results or None if not found. 
+ """ + # auto detect query type + if not query or not isinstance(query, str): + logger.error("Empty or non-string input.") + return None + query = query.strip() + + logger.debug("RNAcentral search query: %s", query) + + # check if RNA sequence (AUCG characters, contains U) + if query.startswith(">") or ( + re.fullmatch(r"[AUCGN\s]+", query, re.I) and "U" in query.upper() + ): + result = await self.search_by_sequence(query) + # check if RNAcentral ID (typically starts with URS) + elif re.fullmatch(r"URS\d+", query, re.I): + result = await self.get_by_rna_id(query) + else: + # otherwise treat as keyword + result = await self.search_by_keyword(query) + + if result: + result["_search_query"] = query + return result + diff --git a/graphgen/operators/search/search_all.py b/graphgen/operators/search/search_all.py index 6c543dbf..6017cfee 100644 --- a/graphgen/operators/search/search_all.py +++ b/graphgen/operators/search/search_all.py @@ -27,6 +27,10 @@ async def search_all( data_sources = search_config.get("data_sources", []) for data_source in data_sources: + data = list(seed_data.values()) + data = [d["content"] for d in data if "content" in d] + data = list(set(data)) # Remove duplicates + if data_source == "uniprot": from graphgen.models import UniProtSearch @@ -34,19 +38,46 @@ async def search_all( **search_config.get("uniprot_params", {}) ) - data = list(seed_data.values()) - data = [d["content"] for d in data if "content" in d] - data = list(set(data)) # Remove duplicates uniprot_results = await run_concurrent( uniprot_search_client.search, data, desc="Searching UniProt database", unit="keyword", ) + results[data_source] = uniprot_results + + elif data_source == "ncbi": + from graphgen.models import NCBISearch + + ncbi_search_client = NCBISearch( + **search_config.get("ncbi_params", {}) + ) + + ncbi_results = await run_concurrent( + ncbi_search_client.search, + data, + desc="Searching NCBI database", + unit="keyword", + ) + results[data_source] = ncbi_results + + elif data_source == "rnacentral": + from graphgen.models import RNACentralSearch + + rnacentral_search_client = RNACentralSearch( + **search_config.get("rnacentral_params", {}) + ) + + rnacentral_results = await run_concurrent( + rnacentral_search_client.search, + data, + desc="Searching RNAcentral database", + unit="keyword", + ) + results[data_source] = rnacentral_results + else: logger.error("Data source %s not supported.", data_source) continue - results[data_source] = uniprot_results - return results diff --git a/resources/input_examples/search_dna_demo.jsonl b/resources/input_examples/search_dna_demo.jsonl new file mode 100644 index 00000000..83086df9 --- /dev/null +++ b/resources/input_examples/search_dna_demo.jsonl @@ -0,0 +1,4 @@ +{"type": "text", "content": "TP53"} +{"type": "text", "content": "BRCA1"} +{"type": "text", "content": "NM_000546"} +{"type": "text", "content": 
"CTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"} diff --git a/resources/input_examples/search_demo.jsonl b/resources/input_examples/search_protein_demo.jsonl similarity index 100% rename from resources/input_examples/search_demo.jsonl rename to resources/input_examples/search_protein_demo.jsonl diff --git a/scripts/search/search_dna.sh b/scripts/search/search_dna.sh new file mode 100644 index 00000000..5b82fdd6 --- /dev/null +++ b/scripts/search/search_dna.sh @@ -0,0 +1,4 @@ +python3 -m graphgen.run \ +--config_file graphgen/configs/search_dna_config.yaml \ +--output_dir cache/ + diff --git a/scripts/search/search_rna.sh b/scripts/search/search_rna.sh new file mode 100644 index 00000000..260499b3 --- /dev/null +++ b/scripts/search/search_rna.sh @@ -0,0 +1,4 @@ +python3 -m graphgen.run \ +--config_file graphgen/configs/search_rna_config.yaml \ +--output_dir cache/ + diff --git a/scripts/search/search_uniprot.sh b/scripts/search/search_uniprot.sh index 642040af..7b295f8d 100644 --- a/scripts/search/search_uniprot.sh +++ b/scripts/search/search_uniprot.sh @@ -1,3 +1,3 @@ 
python3 -m graphgen.run \ ---config_file graphgen/configs/search_config.yaml \ +--config_file graphgen/configs/search_protein_config.yaml \ --output_dir cache/ From 9a26138580ec0294a994eaefd9d01ff1a5f41356 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Wed, 26 Nov 2025 20:48:38 +0800 Subject: [PATCH 02/22] fix: fix pylint style issues - Remove all trailing whitespace - Fix line-too-long issues (split long lines) - Remove trailing newlines at end of files - Remove unnecessary else/elif after return statements --- graphgen/models/searcher/db/ncbi_searcher.py | 59 ++++++++++--------- .../models/searcher/db/rnacentral_searcher.py | 18 +++--- 2 files changed, 39 insertions(+), 38 deletions(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 73a9ad87..aa23a9d4 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -54,17 +54,24 @@ def get_by_gene_id(self, gene_id: str) -> Optional[dict]: gene_record = Entrez.read(handle) if not gene_record: return None - + gene_data = gene_record[0] gene_ref = gene_data.get("Entrezgene_gene", {}).get("Gene-ref", {}) - + + organism = ( + gene_data.get("Entrezgene_source", {}) + .get("BioSource", {}) + .get("BioSource_org", {}) + .get("Org-ref", {}) + .get("Org-ref_taxname", "N/A") + ) return { "molecule_type": "DNA", "database": "NCBI", "id": gene_id, "gene_name": gene_ref.get("Gene-ref_locus", "N/A"), "gene_description": gene_ref.get("Gene-ref_desc", "N/A"), - "organism": gene_data.get("Entrezgene_source", {}).get("BioSource", {}).get("BioSource_org", {}).get("Org-ref", {}).get("Org-ref_taxname", "N/A"), + "organism": organism, "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}", } finally: @@ -93,11 +100,11 @@ def get_by_accession(self, accession: str) -> Optional[dict]: sequence_data = handle.read() if not sequence_data: return None - + seq_lines = sequence_data.strip().split("\n") header = seq_lines[0] if seq_lines else "" sequence = "".join(seq_lines[1:]) - + # Try to get more information time.sleep(0.35) summary_handle = Entrez.esummary(db="nuccore", id=accession) @@ -112,7 +119,7 @@ def get_by_accession(self, accession: str) -> Optional[dict]: organism = "N/A" finally: summary_handle.close() - + return { "molecule_type": "DNA", "database": "NCBI", @@ -162,7 +169,7 @@ def search_by_keyword(self, keyword: str) -> Optional[dict]: search_results = Entrez.read(search_handle2) finally: search_handle2.close() - + if search_results.get("IdList"): gene_id = search_results["IdList"][0] return self.get_by_gene_id(gene_id) @@ -188,21 +195,21 @@ def search_by_sequence(self, sequence: str) -> Optional[dict]: seq = "".join(seq_lines[1:]) else: seq = sequence.strip().replace(" ", "").replace("\n", "") - + # Validate if it's a DNA sequence if not re.fullmatch(r"[ATCGN\s]+", seq, re.I): logger.error("Invalid DNA sequence provided.") return None - + if not seq: logger.error("Empty DNA sequence provided.") return None - + # Use BLAST search (Note: requires network connection, may be slow) logger.debug("Performing BLAST search for DNA sequence...") time.sleep(0.35) from Bio.Blast import NCBIWWW, NCBIXML - + result_handle = NCBIWWW.qblast( program="blastn", database="nr", @@ -211,33 +218,32 @@ def search_by_sequence(self, sequence: str) -> Optional[dict]: expect=0.001, ) blast_record = NCBIXML.read(result_handle) - + if not blast_record.alignments: logger.info("No BLAST hits found for the given sequence.") return None - + best_alignment = 
blast_record.alignments[0] best_hsp = best_alignment.hsps[0] hit_id = best_alignment.hit_id - + # Extract accession number # Format may be: gi|123456|ref|NM_000546.5| accession_match = re.search(r"ref\|([^|]+)", hit_id) if accession_match: accession = accession_match.group(1).split(".")[0] return self.get_by_accession(accession) - else: - # If unable to extract accession, return basic information - return { - "molecule_type": "DNA", - "database": "NCBI", - "id": hit_id, - "title": best_alignment.title, - "sequence_length": len(seq), - "e_value": best_hsp.expect, - "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0, - "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}", - } + # If unable to extract accession, return basic information + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": hit_id, + "title": best_alignment.title, + "sequence_length": len(seq), + "e_value": best_hsp.expect, + "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0, + "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}", + } except RequestException: raise except Exception as e: # pylint: disable=broad-except @@ -293,4 +299,3 @@ async def search( if result: result["_search_query"] = query return result - diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index d1decd6d..63c88395 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -61,11 +61,10 @@ async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: "description": rna_data.get("description", "N/A"), "url": f"https://rnacentral.org/rna/{rna_id}", } - elif resp.status == 404: + if resp.status == 404: logger.error("RNA ID %s not found", rna_id) return None - else: - raise Exception(f"HTTP {resp.status}: {await resp.text()}") + raise Exception(f"HTTP {resp.status}: {await resp.text()}") except Exception as exc: # pylint: disable=broad-except logger.error("RNA ID %s not found: %s", rna_id, exc) return None @@ -97,8 +96,7 @@ async def search_by_keyword(self, keyword: str) -> Optional[dict]: return await self.get_by_rna_id(rna_id) logger.info("No results found for keyword: %s", keyword) return None - else: - raise Exception(f"HTTP {resp.status}: {await resp.text()}") + raise Exception(f"HTTP {resp.status}: {await resp.text()}") except Exception as e: # pylint: disable=broad-except logger.error("Keyword %s not found: %s", keyword, e) return None @@ -116,16 +114,16 @@ async def search_by_sequence(self, sequence: str) -> Optional[dict]: seq = "".join(seq_lines[1:]) else: seq = sequence.strip().replace(" ", "").replace("\n", "") - + # Validate if it's an RNA sequence (contains U instead of T) if not re.fullmatch(r"[AUCGN\s]+", seq, re.I): logger.error("Invalid RNA sequence provided.") return None - + if not seq: logger.error("Empty RNA sequence provided.") return None - + # RNAcentral API supports sequence search async with aiohttp.ClientSession() as session: search_url = f"{self.base_url}/rna" @@ -144,8 +142,7 @@ async def search_by_sequence(self, sequence: str) -> Optional[dict]: return await self.get_by_rna_id(rna_id) logger.info("No results found for sequence.") return None - else: - raise Exception(f"HTTP {resp.status}: {await resp.text()}") + raise Exception(f"HTTP {resp.status}: {await resp.text()}") except Exception as e: # pylint: disable=broad-except logger.error("Sequence search failed: %s", e) return None @@ -188,4 +185,3 
@@ async def search( if result: result["_search_query"] = query return result - From ef270b84d7764d430b8b6933ad316796ad33a25e Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 27 Nov 2025 00:04:45 +0800 Subject: [PATCH 03/22] refactor: unify searcher interfaces and improve error handling - Extract utility functions (_gene_record_to_dict, _accession_to_dict, _rna_data_to_dict) - Unify method naming: search_by_keyword -> get_best_hit - Add threshold parameter to NCBI and RNAcentral searchers for interface consistency - Improve error handling with network error detection and fallback strategies - Fix RNAcentral sequence search to prioritize exact matches - Add search_rna_demo.jsonl example file --- graphgen/models/searcher/db/ncbi_searcher.py | 107 +++++++++++------- .../models/searcher/db/rnacentral_searcher.py | 99 ++++++++++++---- .../input_examples/search_rna_demo.jsonl | 4 + 3 files changed, 150 insertions(+), 60 deletions(-) create mode 100644 resources/input_examples/search_rna_demo.jsonl diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index aa23a9d4..9c637ffd 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -41,6 +41,38 @@ def __init__(self, email: str = "test@example.com", tool: str = "GraphGen"): Entrez.tool = tool Entrez.timeout = 60 # 60 seconds timeout + @staticmethod + def _gene_record_to_dict(gene_record, gene_id: str) -> dict: + """ + Convert an Entrez gene record to a dictionary. + :param gene_record: The Entrez gene record (list from Entrez.read). + :param gene_id: The gene ID. + :return: A dictionary containing gene information. + """ + if not gene_record: + raise ValueError("Empty gene record") + + gene_data = gene_record[0] + gene_ref = gene_data.get("Entrezgene_gene", {}).get("Gene-ref", {}) + + organism = ( + gene_data.get("Entrezgene_source", {}) + .get("BioSource", {}) + .get("BioSource_org", {}) + .get("Org-ref", {}) + .get("Org-ref_taxname", "N/A") + ) + + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": gene_id, + "gene_name": gene_ref.get("Gene-ref_locus", "N/A"), + "gene_description": gene_ref.get("Gene-ref_desc", "N/A"), + "organism": organism, + "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}", + } + def get_by_gene_id(self, gene_id: str) -> Optional[dict]: """ Get gene information by Gene ID. @@ -54,26 +86,7 @@ def get_by_gene_id(self, gene_id: str) -> Optional[dict]: gene_record = Entrez.read(handle) if not gene_record: return None - - gene_data = gene_record[0] - gene_ref = gene_data.get("Entrezgene_gene", {}).get("Gene-ref", {}) - - organism = ( - gene_data.get("Entrezgene_source", {}) - .get("BioSource", {}) - .get("BioSource_org", {}) - .get("Org-ref", {}) - .get("Org-ref_taxname", "N/A") - ) - return { - "molecule_type": "DNA", - "database": "NCBI", - "id": gene_id, - "gene_name": gene_ref.get("Gene-ref_locus", "N/A"), - "gene_description": gene_ref.get("Gene-ref_desc", "N/A"), - "organism": organism, - "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}", - } + return self._gene_record_to_dict(gene_record, gene_id) finally: handle.close() except RequestException: @@ -82,6 +95,28 @@ def get_by_gene_id(self, gene_id: str) -> Optional[dict]: logger.error("Gene ID %s not found: %s", gene_id, exc) return None + @staticmethod + def _accession_to_dict(accession: str, sequence: str, header: str, title: str, organism: str) -> dict: + """ + Convert accession information to a dictionary. 
+ :param accession: NCBI accession number. + :param sequence: DNA sequence. + :param header: FASTA header. + :param title: Sequence title. + :param organism: Organism name. + :return: A dictionary containing sequence information. + """ + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": accession, + "title": title, + "organism": organism, + "sequence": sequence, + "sequence_length": len(sequence), + "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}", + } + def get_by_accession(self, accession: str) -> Optional[dict]: """ Get sequence information by accession number. @@ -89,7 +124,7 @@ def get_by_accession(self, accession: str) -> Optional[dict]: :return: A dictionary containing sequence information or None if not found. """ try: - time.sleep(0.35) # 遵守速率限制 + time.sleep(0.35) # Comply with rate limit handle = Entrez.efetch( db="nuccore", id=accession, @@ -120,16 +155,7 @@ def get_by_accession(self, accession: str) -> Optional[dict]: finally: summary_handle.close() - return { - "molecule_type": "DNA", - "database": "NCBI", - "id": accession, - "title": title, - "organism": organism, - "sequence": sequence, - "sequence_length": len(sequence), - "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}", - } + return self._accession_to_dict(accession, sequence, header, title, organism) finally: handle.close() except RequestException: @@ -138,7 +164,7 @@ def get_by_accession(self, accession: str) -> Optional[dict]: logger.error("Accession %s not found: %s", accession, exc) return None - def search_by_keyword(self, keyword: str) -> Optional[dict]: + def get_best_hit(self, keyword: str) -> Optional[dict]: """ Search NCBI Gene database with a keyword and return the best hit. :param keyword: The search keyword (e.g., gene name). @@ -148,7 +174,7 @@ def search_by_keyword(self, keyword: str) -> Optional[dict]: return None try: - time.sleep(0.35) # 遵守速率限制 + time.sleep(0.35) # Comply with rate limit # Search gene database search_handle = Entrez.esearch( db="gene", @@ -181,11 +207,12 @@ def search_by_keyword(self, keyword: str) -> Optional[dict]: logger.error("Keyword %s not found: %s", keyword, e) return None - def search_by_sequence(self, sequence: str) -> Optional[dict]: + def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: """ Search NCBI with a DNA sequence using BLAST. Note: This is a simplified version. For production, consider using local BLAST. :param sequence: DNA sequence (FASTA format or raw sequence). + :param threshold: E-value threshold for BLAST search. :return: A dictionary containing the best hit information or None if not found. """ try: @@ -215,7 +242,7 @@ def search_by_sequence(self, sequence: str) -> Optional[dict]: database="nr", sequence=seq, hitlist_size=1, - expect=0.001, + expect=threshold, ) blast_record = NCBIXML.read(result_handle) @@ -225,6 +252,9 @@ def search_by_sequence(self, sequence: str) -> Optional[dict]: best_alignment = blast_record.alignments[0] best_hsp = best_alignment.hsps[0] + if best_hsp.expect > threshold: + logger.info("No BLAST hits below the threshold E-value.") + return None hit_id = best_alignment.hit_id # Extract accession number @@ -257,11 +287,12 @@ def search_by_sequence(self, sequence: str) -> Optional[dict]: reraise=True, ) async def search( - self, query: str, **kwargs + self, query: str, threshold: float = 0.01, **kwargs ) -> Optional[Dict]: """ Search NCBI with either a gene ID, accession number, keyword, or DNA sequence. 
:param query: The search query (gene ID, accession, keyword, or DNA sequence). + :param threshold: E-value threshold for BLAST search. :param kwargs: Additional keyword arguments (not used currently). :return: A dictionary containing the search results or None if not found. """ @@ -278,7 +309,7 @@ async def search( # check if DNA sequence (ATCG characters) if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I): result = await loop.run_in_executor( - _get_pool(), self.search_by_sequence, query + _get_pool(), self.search_by_sequence, query, threshold ) # check if gene ID (numeric) elif re.fullmatch(r"^\d+$", query): @@ -293,7 +324,7 @@ async def search( else: # otherwise treat as keyword result = await loop.run_in_executor( - _get_pool(), self.search_by_keyword, query + _get_pool(), self.get_best_hit, query ) if result: diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index 63c88395..0eeb4a43 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -36,6 +36,27 @@ def __init__(self): self.base_url = "https://rnacentral.org/api/v1" self.headers = {"Accept": "application/json"} + @staticmethod + def _rna_data_to_dict(rna_id: str, rna_data: dict) -> dict: + """ + Convert RNAcentral API response to a dictionary. + :param rna_id: RNAcentral ID. + :param rna_data: API response data (dict or dict-like from search results). + :return: A dictionary containing RNA information. + """ + sequence = rna_data.get("sequence", "") + return { + "molecule_type": "RNA", + "database": "RNAcentral", + "id": rna_id, + "rnacentral_id": rna_data.get("rnacentral_id", rna_id), + "sequence": sequence, + "sequence_length": rna_data.get("length", len(sequence)), + "rna_type": rna_data.get("rna_type", "N/A"), + "description": rna_data.get("description", "N/A"), + "url": f"https://rnacentral.org/rna/{rna_id}", + } + async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: """ Get RNA information by RNAcentral ID. @@ -50,26 +71,19 @@ async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: ) as resp: if resp.status == 200: rna_data = await resp.json() - return { - "molecule_type": "RNA", - "database": "RNAcentral", - "id": rna_id, - "rnacentral_id": rna_data.get("rnacentral_id", "N/A"), - "sequence": rna_data.get("sequence", ""), - "sequence_length": len(rna_data.get("sequence", "")), - "rna_type": rna_data.get("rna_type", "N/A"), - "description": rna_data.get("description", "N/A"), - "url": f"https://rnacentral.org/rna/{rna_id}", - } + return self._rna_data_to_dict(rna_id, rna_data) if resp.status == 404: logger.error("RNA ID %s not found", rna_id) return None raise Exception(f"HTTP {resp.status}: {await resp.text()}") + except aiohttp.ClientError as e: + logger.error("Network error getting RNA ID %s: %s", rna_id, e) + return None except Exception as exc: # pylint: disable=broad-except logger.error("RNA ID %s not found: %s", rna_id, exc) return None - async def search_by_keyword(self, keyword: str) -> Optional[dict]: + async def get_best_hit(self, keyword: str) -> Optional[dict]: """ Search RNAcentral with a keyword and return the best hit. :param keyword: The search keyword (e.g., miRNA name, RNA name). 
@@ -90,13 +104,26 @@ async def search_by_keyword(self, keyword: str) -> Optional[dict]: ) as resp: if resp.status == 200: search_results = await resp.json() - if search_results.get("results"): - rna_id = search_results["results"][0].get("rnacentral_id") + results = search_results.get("results", []) + if results: + # Use the first result directly (search API already returns enough info) + first_result = results[0] + rna_id = first_result.get("rnacentral_id") if rna_id: - return await self.get_by_rna_id(rna_id) + # Try to get detailed info, but fall back to search result if it fails + detailed_info = await self.get_by_rna_id(rna_id) + if detailed_info: + return detailed_info + # Fall back to using search result data + return self._rna_data_to_dict(rna_id, first_result) logger.info("No results found for keyword: %s", keyword) return None - raise Exception(f"HTTP {resp.status}: {await resp.text()}") + error_text = await resp.text() + logger.error("HTTP %d error for keyword %s: %s", resp.status, keyword, error_text[:200]) + raise Exception(f"HTTP {resp.status}: {error_text}") + except aiohttp.ClientError as e: + logger.error("Network error searching for keyword %s: %s", keyword, e) + return None except Exception as e: # pylint: disable=broad-except logger.error("Keyword %s not found: %s", keyword, e) return None @@ -136,13 +163,39 @@ async def search_by_sequence(self, sequence: str) -> Optional[dict]: ) as resp: if resp.status == 200: search_results = await resp.json() - if search_results.get("results"): - rna_id = search_results["results"][0].get("rnacentral_id") + results = search_results.get("results", []) + if results: + # First, try to find an exact sequence match + exact_match = None + for result in results: + result_seq = result.get("sequence", "") + if result_seq == seq: + exact_match = result + break + + # Use exact match if found, otherwise use first result + target_result = exact_match if exact_match else results[0] + rna_id = target_result.get("rnacentral_id") + if rna_id: - return await self.get_by_rna_id(rna_id) + # Try to get detailed info, but fall back to search result if it fails + try: + detailed_info = await self.get_by_rna_id(rna_id) + if detailed_info: + return detailed_info + except Exception as e: + logger.debug("Failed to get detailed info for %s: %s, using search result", rna_id, e) + + # Fall back to using search result data + return self._rna_data_to_dict(rna_id, target_result) logger.info("No results found for sequence.") return None - raise Exception(f"HTTP {resp.status}: {await resp.text()}") + error_text = await resp.text() + logger.error("HTTP %d error for sequence search: %s", resp.status, error_text[:200]) + raise Exception(f"HTTP {resp.status}: {error_text}") + except aiohttp.ClientError as e: + logger.error("Network error searching for sequence: %s", e) + return None except Exception as e: # pylint: disable=broad-except logger.error("Sequence search failed: %s", e) return None @@ -154,11 +207,13 @@ async def search_by_sequence(self, sequence: str) -> Optional[dict]: reraise=True, ) async def search( - self, query: str, **kwargs + self, query: str, threshold: float = 0.7, **kwargs ) -> Optional[Dict]: """ Search RNAcentral with either an RNAcentral ID, keyword, or RNA sequence. :param query: The search query (RNAcentral ID, keyword, or RNA sequence). + :param threshold: E-value threshold for sequence search. + Note: RNAcentral API uses its own similarity matching, this parameter is for interface consistency. 
:param kwargs: Additional keyword arguments (not used currently). :return: A dictionary containing the search results or None if not found. """ @@ -180,7 +235,7 @@ async def search( result = await self.get_by_rna_id(query) else: # otherwise treat as keyword - result = await self.search_by_keyword(query) + result = await self.get_best_hit(query) if result: result["_search_query"] = query diff --git a/resources/input_examples/search_rna_demo.jsonl b/resources/input_examples/search_rna_demo.jsonl new file mode 100644 index 00000000..caa28612 --- /dev/null +++ b/resources/input_examples/search_rna_demo.jsonl @@ -0,0 +1,4 @@ +{"type": "text", "content": "hsa-let-7a-1"} +{"type": "text", "content": "URS0000123456"} +{"type": "text", "content": "URS0000000001"} +{"type": "text", "content": "CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} From 71fba90cbad7baae068228f7e8d28eb88c9c7381 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 27 Nov 2025 00:25:11 +0800 Subject: [PATCH 04/22] Add UniProt IDs to search_protein_demo.jsonl --- resources/input_examples/search_protein_demo.jsonl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/resources/input_examples/search_protein_demo.jsonl b/resources/input_examples/search_protein_demo.jsonl index 6409a805..82b77836 100644 --- a/resources/input_examples/search_protein_demo.jsonl +++ b/resources/input_examples/search_protein_demo.jsonl @@ -1,3 +1,7 @@ +{"type": "text", "content": "P01308"} +{"type": "text", "content": "P68871"} +{"type": "text", "content": "P02768"} +{"type": "text", "content": "P04637"} {"type": "text", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} {"type": "text", "content": "MGSSHHHHHHSQDLENLYFQGSMNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRTKRVITTFRTGTWDAYKNLRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV"} {"type": "text", "content": "MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDAAAVEATWIAKIKAAKSKYEAEAIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKERGVKVLHLQA"} From 9f8c837ab2b617ff48e40f2e82cee80cbdb8acb3 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 27 Nov 2025 00:25:11 +0800 Subject: [PATCH 05/22] add: add UniProt IDs to search_protein_demo.jsonl --- resources/input_examples/search_protein_demo.jsonl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/resources/input_examples/search_protein_demo.jsonl b/resources/input_examples/search_protein_demo.jsonl index 6409a805..82b77836 100644 --- a/resources/input_examples/search_protein_demo.jsonl +++ b/resources/input_examples/search_protein_demo.jsonl @@ -1,3 +1,7 @@ +{"type": "text", "content": "P01308"} +{"type": "text", "content": "P68871"} +{"type": "text", "content": "P02768"} +{"type": "text", "content": "P04637"} 
{"type": "text", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} {"type": "text", "content": "MGSSHHHHHHSQDLENLYFQGSMNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRTKRVITTFRTGTWDAYKNLRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV"} {"type": "text", "content": "MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDAAAVEATWIAKIKAAKSKYEAEAIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKERGVKVLHLQA"} From c60784e7035e743626315ede028017b0dcc5acaa Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 27 Nov 2025 17:42:05 +0800 Subject: [PATCH 06/22] feat: unify search interfaces to use gene ID as unified data source --- graphgen/models/searcher/db/ncbi_searcher.py | 408 +++++++++++++++---- 1 file changed, 338 insertions(+), 70 deletions(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 9c637ffd..cc78fa1c 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -1,4 +1,5 @@ import asyncio +import logging import re import time from concurrent.futures import ThreadPoolExecutor @@ -7,12 +8,14 @@ from typing import Dict, Optional from Bio import Entrez +from Bio.Blast import NCBIWWW, NCBIXML from requests.exceptions import RequestException from tenacity import ( retry, retry_if_exception_type, stop_after_attempt, wait_exponential, + before_sleep_log, ) from graphgen.bases import BaseSearcher @@ -41,6 +44,18 @@ def __init__(self, email: str = "test@example.com", tool: str = "GraphGen"): Entrez.tool = tool Entrez.timeout = 60 # 60 seconds timeout + @staticmethod + def _safe_get(obj, key, default=None): + """Safely get value from dict or StringElement-like object.""" + if isinstance(obj, dict): + return obj.get(key, default) + elif hasattr(obj, "get"): + return obj.get(key, default) + elif hasattr(obj, key): + return getattr(obj, key, default) + else: + return default + @staticmethod def _gene_record_to_dict(gene_record, gene_id: str) -> dict: """ @@ -53,30 +68,200 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict: raise ValueError("Empty gene record") gene_data = gene_record[0] - gene_ref = gene_data.get("Entrezgene_gene", {}).get("Gene-ref", {}) - - organism = ( - gene_data.get("Entrezgene_source", {}) - .get("BioSource", {}) - .get("BioSource_org", {}) - .get("Org-ref", {}) - .get("Org-ref_taxname", "N/A") - ) + + # Safely extract gene_ref, handling both dict and StringElement types + gene_ref = {} + entrezgene_gene = gene_data.get("Entrezgene_gene") + if isinstance(entrezgene_gene, dict): + gene_ref = entrezgene_gene.get("Gene-ref", {}) + elif hasattr(entrezgene_gene, "get"): + gene_ref = entrezgene_gene.get("Gene-ref", {}) + else: + # If it's a StringElement or other type, try to access as dict + try: + if hasattr(entrezgene_gene, "Gene-ref"): + gene_ref = getattr(entrezgene_gene, "Gene-ref", {}) + except Exception: + pass + # Safely extract organism + organism = "N/A" + try: + 
entrezgene_source = gene_data.get("Entrezgene_source") + if isinstance(entrezgene_source, dict): + biosource = entrezgene_source.get("BioSource", {}) + if isinstance(biosource, dict): + biosource_org = biosource.get("BioSource_org", {}) + if isinstance(biosource_org, dict): + org_ref = biosource_org.get("Org-ref", {}) + if isinstance(org_ref, dict): + organism = org_ref.get("Org-ref_taxname", "N/A") + elif hasattr(org_ref, "Org-ref_taxname"): + organism = getattr(org_ref, "Org-ref_taxname", "N/A") + except Exception as e: + logger.debug("Error extracting organism: %s", e) + + # Extract gene synonyms - safely handle StringElement types + gene_synonyms = [] + try: + gene_syn = gene_ref.get("Gene-ref_syn", []) if isinstance(gene_ref, dict) else [] + if isinstance(gene_syn, list): + for syn in gene_syn: + if isinstance(syn, dict): + gene_synonyms.append(syn.get("Gene-ref_syn_E", "N/A")) + elif isinstance(syn, str): + gene_synonyms.append(syn) + else: + # Handle StringElement or other types + gene_synonyms.append(str(syn)) + elif isinstance(gene_syn, str): + gene_synonyms.append(gene_syn) + elif gene_syn: # Handle StringElement + gene_synonyms.append(str(gene_syn)) + except Exception as e: + logger.debug("Error extracting gene synonyms: %s", e) + + # Extract gene type - safely handle StringElement types + # Note: Entrezgene_type is a StringElement with numeric value (e.g., "6" for ncRNA) + gene_type = None + try: + gene_type_data = gene_data.get("Entrezgene_type") + if gene_type_data: + type_value = str(gene_type_data) + # Map numeric values to type names (NCBI gene type codes) + type_mapping = { + "1": "protein-coding", + "2": "pseudo", + "3": "rRNA", + "4": "tRNA", + "5": "snRNA", + "6": "ncRNA", + "7": "other", + } + gene_type = type_mapping.get(type_value, f"type_{type_value}") + except Exception as e: + logger.debug("Error extracting gene type: %s", e) + + # Extract chromosome and genomic location from Entrezgene_locus + # Note: Entrezgene_location doesn't exist, but Entrezgene_locus contains location info + chromosome = None + genomic_location = None + + try: + locus_data = gene_data.get("Entrezgene_locus") + if locus_data and isinstance(locus_data, list) and locus_data: + first_locus = locus_data[0] + if isinstance(first_locus, dict): + # Extract chromosome from Gene-commentary_label + # Example: "Chromosome 13 Reference RoL_Sarg_1.0" -> "13" + label = NCBISearch._safe_get(first_locus, "Gene-commentary_label", "") + if label and "Chromosome" in str(label): + match = re.search(r'Chromosome\s+(\S+)', str(label)) + if match: + chromosome = match.group(1) + + # Extract genomic location from Gene-commentary_seqs + seqs = NCBISearch._safe_get(first_locus, "Gene-commentary_seqs", []) + if seqs and isinstance(seqs, list) and seqs: + first_seq = seqs[0] + if isinstance(first_seq, dict): + seq_loc_int = NCBISearch._safe_get(first_seq, "Seq-loc_int", {}) + if seq_loc_int: + seq_interval = NCBISearch._safe_get(seq_loc_int, "Seq-interval", {}) + if seq_interval: + seq_from = NCBISearch._safe_get(seq_interval, "Seq-interval_from", "") + seq_to = NCBISearch._safe_get(seq_interval, "Seq-interval_to", "") + if seq_from and seq_to: + genomic_location = f"{seq_from}-{seq_to}" + except Exception as e: + logger.debug("Error extracting chromosome/location from gene record: %s", e) + + # Extract gene functional description + # Note: Entrezgene_summary doesn't exist for most genes + # Try to extract from Entrezgene_comments if available + function = None + try: + # First try Entrezgene_summary (if exists) + 
summary = gene_data.get("Entrezgene_summary") + if summary: + function = str(summary) + else: + # Try to extract from Entrezgene_comments + comments_data = gene_data.get("Entrezgene_comments") + if comments_data and isinstance(comments_data, list): + for comment in comments_data: + if isinstance(comment, dict): + heading = NCBISearch._safe_get(comment, "Gene-commentary_heading", "") + # Look for function-related comments + if "function" in str(heading).lower() or "summary" in str(heading).lower(): + comment_text = NCBISearch._safe_get(comment, "Gene-commentary_comment", "") + if comment_text: + function = str(comment_text) + break + except Exception as e: + logger.debug("Error extracting function: %s", e) + + # Try to extract representative mRNA accession from Entrezgene_locus for sequence retrieval + representative_accession = None + try: + if locus_data and isinstance(locus_data, list) and locus_data: + first_locus = locus_data[0] + if isinstance(first_locus, dict): + products = NCBISearch._safe_get(first_locus, "Gene-commentary_products", []) + if products and isinstance(products, list): + # Look for mRNA (type 3) or the first product + for product in products: + if isinstance(product, dict): + product_type = NCBISearch._safe_get(product, "Gene-commentary_type", "") + product_type_str = str(product_type) + # Type 3 is mRNA, prefer mRNA over other types + if product_type_str == "3" or (not representative_accession and product_type_str): + accession = NCBISearch._safe_get(product, "Gene-commentary_accession", "") + if accession: + representative_accession = str(accession) + if product_type_str == "3": # Found mRNA, use it + break + except Exception as e: + logger.debug("Error extracting representative accession: %s", e) + + # Build result dictionary with all fields + # Include all fields that might be present in accession-based queries return { "molecule_type": "DNA", "database": "NCBI", "id": gene_id, - "gene_name": gene_ref.get("Gene-ref_locus", "N/A"), - "gene_description": gene_ref.get("Gene-ref_desc", "N/A"), + "gene_name": NCBISearch._safe_get(gene_ref, "Gene-ref_locus", "N/A"), + "gene_description": NCBISearch._safe_get(gene_ref, "Gene-ref_desc", "N/A"), "organism": organism, "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}", + "gene_synonyms": gene_synonyms if gene_synonyms else None, + "gene_type": gene_type, + "chromosome": chromosome, + "genomic_location": genomic_location, + "function": function, + # Fields from accession-based queries (set to None initially, may be filled later) + "title": None, + "sequence": None, + "sequence_length": None, + "gene_id": gene_id, # For consistency with accession queries + "molecule_type_detail": None, + "_representative_accession": representative_accession, } - def get_by_gene_id(self, gene_id: str) -> Optional[dict]: + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + before_sleep=before_sleep_log(logger, logging.WARNING), + ) + + def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]: """ Get gene information by Gene ID. + This is the unified data source - all search methods eventually call this. :param gene_id: NCBI Gene ID. + :param preferred_accession: Optional accession to use for sequence retrieval if representative mRNA is not available. :return: A dictionary containing gene information or None if not found. 
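For reference, a minimal standalone sketch of the retry pattern applied to get_by_gene_id above: exponential backoff on transient network errors, with a warning logged before each sleep. flaky_fetch and its failure are illustrative placeholders, not part of this patch.

    import logging
    from http.client import IncompleteRead

    from requests.exceptions import RequestException
    from tenacity import (
        before_sleep_log,
        retry,
        retry_if_exception_type,
        stop_after_attempt,
        wait_exponential,
    )

    logger = logging.getLogger(__name__)

    @retry(
        stop=stop_after_attempt(5),                          # give up after 5 attempts
        wait=wait_exponential(multiplier=1, min=4, max=10),  # exponential back-off, clamped to 4-10s
        retry=retry_if_exception_type((RequestException, IncompleteRead)),
        reraise=True,                                        # surface the last error to the caller
        before_sleep=before_sleep_log(logger, logging.WARNING),
    )
    def flaky_fetch(gene_id: str) -> str:
        # Stand-in for an Entrez call that can fail transiently.
        raise RequestException(f"simulated transient failure for gene {gene_id}")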
""" try: @@ -86,84 +271,166 @@ def get_by_gene_id(self, gene_id: str) -> Optional[dict]: gene_record = Entrez.read(handle) if not gene_record: return None - return self._gene_record_to_dict(gene_record, gene_id) + result = self._gene_record_to_dict(gene_record, gene_id) + + # Try to get sequence from accession + # Priority: 1) preferred_accession (if provided), 2) representative mRNA accession + accession_to_use = preferred_accession or result.get("_representative_accession") + if accession_to_use: + try: + # Get sequence info directly from nuccore database + time.sleep(0.35) + seq_handle = Entrez.efetch( + db="nuccore", + id=accession_to_use, + rettype="fasta", + retmode="text", + ) + try: + sequence_data = seq_handle.read() + if sequence_data: + seq_lines = sequence_data.strip().split("\n") + header = seq_lines[0] if seq_lines else "" + sequence = "".join(seq_lines[1:]) + + # Get summary for additional info + time.sleep(0.35) + summary_handle = Entrez.esummary(db="nuccore", id=accession_to_use) + try: + summary = Entrez.read(summary_handle) + if summary: + summary_data = summary[0] + title = summary_data.get("Title", header) + + # Determine molecule type detail + molecule_type_detail = "N/A" + if accession_to_use.startswith("NM_") or accession_to_use.startswith("XM_"): + molecule_type_detail = "mRNA" + elif accession_to_use.startswith("NC_") or accession_to_use.startswith("NT_"): + molecule_type_detail = "genomic DNA" + elif accession_to_use.startswith("NR_") or accession_to_use.startswith("XR_"): + molecule_type_detail = "RNA" + elif accession_to_use.startswith("NG_"): + molecule_type_detail = "genomic region" + + # Merge sequence information into result + result["sequence"] = sequence + result["sequence_length"] = len(sequence) + result["title"] = title + result["molecule_type_detail"] = molecule_type_detail + + # Update chromosome and genomic_location if not already set + if not result.get("chromosome"): + chromosome = summary_data.get("ChrLoc") or summary_data.get("ChrAccVer") + if chromosome: + result["chromosome"] = chromosome + if not result.get("genomic_location"): + chr_start = summary_data.get("ChrStart") + chr_stop = summary_data.get("ChrStop") + if chr_start and chr_stop: + result["genomic_location"] = f"{chr_start}-{chr_stop}" + finally: + summary_handle.close() + finally: + seq_handle.close() + except (RequestException, IncompleteRead): + # Re-raise to allow retry mechanism + raise + except Exception as e: + logger.debug("Failed to get sequence for accession %s: %s", + accession_to_use, e) + + # Remove internal field + result.pop("_representative_accession", None) + return result finally: handle.close() except RequestException: raise + except IncompleteRead: + raise except Exception as exc: # pylint: disable=broad-except logger.error("Gene ID %s not found: %s", gene_id, exc) return None - @staticmethod - def _accession_to_dict(accession: str, sequence: str, header: str, title: str, organism: str) -> dict: - """ - Convert accession information to a dictionary. - :param accession: NCBI accession number. - :param sequence: DNA sequence. - :param header: FASTA header. - :param title: Sequence title. - :param organism: Organism name. - :return: A dictionary containing sequence information. 
- """ - return { - "molecule_type": "DNA", - "database": "NCBI", - "id": accession, - "title": title, - "organism": organism, - "sequence": sequence, - "sequence_length": len(sequence), - "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}", - } - def get_by_accession(self, accession: str) -> Optional[dict]: """ Get sequence information by accession number. + Unified approach: Get GeneID from accession, then call get_by_gene_id() for complete information. :param accession: NCBI accession number (e.g., NM_000546). - :return: A dictionary containing sequence information or None if not found. + :return: A dictionary containing complete gene information or None if not found. """ try: - time.sleep(0.35) # Comply with rate limit - handle = Entrez.efetch( - db="nuccore", - id=accession, - rettype="fasta", - retmode="text", - ) + # Step 1: Get GeneID from elink (nuccore -> gene) + # Note: esummary for nuccore doesn't include GeneID, so we use elink instead + time.sleep(0.35) + link_handle = Entrez.elink(dbfrom="nuccore", db="gene", id=accession) + gene_id = None try: - sequence_data = handle.read() - if not sequence_data: - return None - - seq_lines = sequence_data.strip().split("\n") - header = seq_lines[0] if seq_lines else "" - sequence = "".join(seq_lines[1:]) - - # Try to get more information - time.sleep(0.35) - summary_handle = Entrez.esummary(db="nuccore", id=accession) - try: - summary = Entrez.read(summary_handle) - if summary: - summary_data = summary[0] - title = summary_data.get("Title", header) - organism = summary_data.get("Organism", "N/A") - else: - title = header - organism = "N/A" - finally: - summary_handle.close() - - return self._accession_to_dict(accession, sequence, header, title, organism) + links = Entrez.read(link_handle) + + # Extract GeneID from elink results + # Structure: links[0]["LinkSetDb"][0]["Link"][0]["Id"] + if links and len(links) > 0: + first_link = links[0] + if "LinkSetDb" in first_link: + for link_set in first_link["LinkSetDb"]: + if link_set.get("DbTo") == "gene": + # Try Link structure first (most common) + links_in_set = link_set.get("Link", []) + if links_in_set and len(links_in_set) > 0: + first_link_item = links_in_set[0] + if isinstance(first_link_item, dict): + gene_id = str(first_link_item.get("Id", "")) + elif hasattr(first_link_item, "Id"): + gene_id = str(getattr(first_link_item, "Id", "")) + else: + # Handle StringElement or other types + gene_id = str(first_link_item) + if gene_id: + break + # Fallback: Try IdList (if Link is not available) + id_list = link_set.get("IdList", []) + if id_list and not gene_id: + gene_id = str(id_list[0]) + break + except Exception as e: + logger.error("Error parsing elink result for accession %s: %s", accession, e) + import traceback + logger.debug(traceback.format_exc()) + # Continue to check if we got gene_id before the error finally: - handle.close() - except RequestException: + link_handle.close() + + # Step 2: If we have a GeneID, get complete information from Gene database + # Pass the accession as preferred_accession so get_by_gene_id can use it for sequence + if gene_id: + result = self.get_by_gene_id(gene_id, preferred_accession=accession) + + # Update id to accession for consistency (user searched by accession) + if result: + result["id"] = accession + result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}" + + return result + + # Step 3: If no GeneID, this is a rare case (accession without associated gene) + # Return None - we can't provide complete information without Gene ID + 
logger.warning("Accession %s has no associated GeneID, cannot provide complete information", accession) + return None + except (RequestException, IncompleteRead): raise except Exception as exc: # pylint: disable=broad-except logger.error("Accession %s not found: %s", accession, exc) return None + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + before_sleep=before_sleep_log(logger, logging.WARNING), + ) def get_best_hit(self, keyword: str) -> Optional[dict]: """ Search NCBI Gene database with a keyword and return the best hit. @@ -203,6 +470,8 @@ def get_best_hit(self, keyword: str) -> Optional[dict]: search_handle.close() except RequestException: raise + except IncompleteRead: + raise except Exception as e: # pylint: disable=broad-except logger.error("Keyword %s not found: %s", keyword, e) return None @@ -235,7 +504,6 @@ def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional # Use BLAST search (Note: requires network connection, may be slow) logger.debug("Performing BLAST search for DNA sequence...") time.sleep(0.35) - from Bio.Blast import NCBIWWW, NCBIXML result_handle = NCBIWWW.qblast( program="blastn", From 0dac99d0f2b5176bff545a08fc19fcf90dbf91c3 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 27 Nov 2025 18:29:16 +0800 Subject: [PATCH 07/22] add: an gene id example in DNA demo --- resources/input_examples/search_dna_demo.jsonl | 1 + 1 file changed, 1 insertion(+) diff --git a/resources/input_examples/search_dna_demo.jsonl b/resources/input_examples/search_dna_demo.jsonl index 83086df9..387c87b8 100644 --- a/resources/input_examples/search_dna_demo.jsonl +++ b/resources/input_examples/search_dna_demo.jsonl @@ -1,4 +1,5 @@ {"type": "text", "content": "TP53"} {"type": "text", "content": "BRCA1"} +{"type": "text", "content": "672"} {"type": "text", "content": "NM_000546"} {"type": "text", "content": 
"CTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"} From 1865120d756aa7345cd0abba6dd073ba86aa70ae Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 27 Nov 2025 20:23:15 +0800 Subject: [PATCH 08/22] feat: unify search interfaces to use RNA id as unified data source --- .../models/searcher/db/rnacentral_searcher.py | 226 +++++++++++++++--- 1 file changed, 196 insertions(+), 30 deletions(-) diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index 0eeb4a43..80cb4428 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -1,8 +1,6 @@ import asyncio import re -from concurrent.futures import ThreadPoolExecutor -from functools import lru_cache -from typing import Dict, Optional +from typing import Dict, Optional, List, Any import aiohttp from tenacity import ( @@ -15,12 +13,6 @@ from graphgen.bases import BaseSearcher from graphgen.utils import logger - -@lru_cache(maxsize=None) -def _get_pool(): - return 
ThreadPoolExecutor(max_workers=10) - - class RNACentralSearch(BaseSearcher): """ RNAcentral Search client to search RNA databases. @@ -36,15 +28,167 @@ def __init__(self): self.base_url = "https://rnacentral.org/api/v1" self.headers = {"Accept": "application/json"} + async def _fetch_all_xrefs(self, xrefs_url: str, session: aiohttp.ClientSession) -> List[Dict]: + """ + Fetch all xrefs from the xrefs endpoint, handling pagination. + :param xrefs_url: URL to the xrefs endpoint. + :param session: aiohttp ClientSession to use for requests. + :return: List of all xref entries. + """ + all_xrefs = [] + current_url = xrefs_url + + while current_url: + try: + async with session.get( + current_url, headers=self.headers, timeout=aiohttp.ClientTimeout(total=30) + ) as resp: + if resp.status == 200: + data = await resp.json() + results = data.get("results", []) + all_xrefs.extend(results) + + # Check if there's a next page + current_url = data.get("next") + if not current_url: + break + + # Small delay to avoid rate limiting + await asyncio.sleep(0.2) + else: + logger.warning("Failed to fetch xrefs from %s: HTTP %d", current_url, resp.status) + break + except Exception as e: + logger.warning("Error fetching xrefs from %s: %s", current_url, e) + break + + return all_xrefs + @staticmethod - def _rna_data_to_dict(rna_id: str, rna_data: dict) -> dict: + def _extract_info_from_xrefs(xrefs: List[Dict]) -> Dict[str, Any]: + """ + Extract information from xrefs data. + :param xrefs: List of xref entries. + :return: Dictionary with extracted information. + """ + extracted = { + "organisms": set(), + "gene_names": set(), + "modifications": [], + "so_terms": set(), + "xrefs_list": [], + } + + for xref in xrefs: + # Extract accession information + accession = xref.get("accession", {}) + + # Extract species information + species = accession.get("species") + if species: + extracted["organisms"].add(species) + + # Extract gene name + gene = accession.get("gene") + if gene and gene.strip(): # Only add non-empty genes + extracted["gene_names"].add(gene.strip()) + + # Extract modifications + modifications = xref.get("modifications", []) + if modifications: + extracted["modifications"].extend(modifications) + + # Extract SO term (biotype) + biotype = accession.get("biotype") + if biotype: + extracted["so_terms"].add(biotype) + + # Build xrefs list + xref_info = { + "database": xref.get("database"), + "accession_id": accession.get("id"), + "external_id": accession.get("external_id"), + "description": accession.get("description"), + "species": species, + "gene": gene, + } + extracted["xrefs_list"].append(xref_info) + + # Convert sets to appropriate formats + return { + "organism": ( + list(extracted["organisms"])[0] + if len(extracted["organisms"]) == 1 + else (", ".join(extracted["organisms"]) if extracted["organisms"] else None) + ), + "gene_name": ( + list(extracted["gene_names"])[0] + if len(extracted["gene_names"]) == 1 + else (", ".join(extracted["gene_names"]) if extracted["gene_names"] else None) + ), + "related_genes": list(extracted["gene_names"]) if extracted["gene_names"] else None, + "modifications": extracted["modifications"] if extracted["modifications"] else None, + "so_term": ( + list(extracted["so_terms"])[0] + if len(extracted["so_terms"]) == 1 + else (", ".join(extracted["so_terms"]) if extracted["so_terms"] else None) + ), + "xrefs": extracted["xrefs_list"] if extracted["xrefs_list"] else None, + } + + @staticmethod + def _rna_data_to_dict(rna_id: str, rna_data: dict, xrefs_data: Optional[List[Dict]] 
= None) -> dict: """ Convert RNAcentral API response to a dictionary. :param rna_id: RNAcentral ID. :param rna_data: API response data (dict or dict-like from search results). + :param xrefs_data: Optional list of xref entries fetched from xrefs endpoint. :return: A dictionary containing RNA information. """ sequence = rna_data.get("sequence", "") + + # Initialize extracted info from xrefs if available + extracted_info = {} + if xrefs_data: + extracted_info = RNACentralSearch._extract_info_from_xrefs(xrefs_data) + + # Extract organism information (prefer from xrefs, fallback to main data) + organism = extracted_info.get("organism") + if not organism: + organism = rna_data.get("organism", None) + if not organism: + organism = rna_data.get("species", None) + + # Extract related genes (prefer from xrefs, fallback to main data) + related_genes = extracted_info.get("related_genes") + if not related_genes: + related_genes = rna_data.get("related_genes", []) + if not related_genes: + related_genes = rna_data.get("genes", []) + if not related_genes: + gene_name_temp = rna_data.get("gene_name", None) + if gene_name_temp: + related_genes = [gene_name_temp] + + # Extract gene name (prefer from xrefs, fallback to main data) + gene_name = extracted_info.get("gene_name") + if not gene_name: + gene_name = rna_data.get("gene_name", None) + if not gene_name: + gene_name = rna_data.get("gene", None) + + # Extract so_term (prefer from xrefs, fallback to main data) + so_term = extracted_info.get("so_term") + if not so_term: + so_term = rna_data.get("so_term", None) + + # Extract modifications (prefer from xrefs, fallback to main data) + modifications = extracted_info.get("modifications") + if not modifications: + modifications = rna_data.get("modifications", None) + + # Build result dictionary (xrefs information is already extracted into other fields) + # information is extracted into organism, gene_name, so_term, modifications, etc. return { "molecule_type": "RNA", "database": "RNAcentral", @@ -55,6 +199,11 @@ def _rna_data_to_dict(rna_id: str, rna_data: dict) -> dict: "rna_type": rna_data.get("rna_type", "N/A"), "description": rna_data.get("description", "N/A"), "url": f"https://rnacentral.org/rna/{rna_id}", + "organism": organism, + "related_genes": related_genes if related_genes else None, + "gene_name": gene_name, + "so_term": so_term, + "modifications": modifications, } async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: @@ -71,7 +220,19 @@ async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: ) as resp: if resp.status == 200: rna_data = await resp.json() - return self._rna_data_to_dict(rna_id, rna_data) + + # Check if xrefs is a URL and fetch the actual xrefs data + xrefs_data = None + xrefs_url = rna_data.get("xrefs") + if xrefs_url and isinstance(xrefs_url, str) and xrefs_url.startswith("http"): + try: + xrefs_data = await self._fetch_all_xrefs(xrefs_url, session) + logger.debug("Fetched %d xrefs for RNA ID %s", len(xrefs_data), rna_id) + except Exception as e: + logger.warning("Failed to fetch xrefs for RNA ID %s: %s", rna_id, e) + # Continue without xrefs data + + return self._rna_data_to_dict(rna_id, rna_data, xrefs_data) if resp.status == 404: logger.error("RNA ID %s not found", rna_id) return None @@ -86,8 +247,9 @@ async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: async def get_best_hit(self, keyword: str) -> Optional[dict]: """ Search RNAcentral with a keyword and return the best hit. 
+ Unified approach: Find RNA ID from search, then call get_by_rna_id() for complete information. :param keyword: The search keyword (e.g., miRNA name, RNA name). - :return: A dictionary containing the best hit information or None if not found. + :return: A dictionary containing complete RNA information or None if not found. """ if not keyword.strip(): return None @@ -106,16 +268,20 @@ async def get_best_hit(self, keyword: str) -> Optional[dict]: search_results = await resp.json() results = search_results.get("results", []) if results: - # Use the first result directly (search API already returns enough info) + # Step 1: Get RNA ID from search results first_result = results[0] rna_id = first_result.get("rnacentral_id") + if rna_id: - # Try to get detailed info, but fall back to search result if it fails - detailed_info = await self.get_by_rna_id(rna_id) - if detailed_info: - return detailed_info - # Fall back to using search result data - return self._rna_data_to_dict(rna_id, first_result) + # Step 2: Unified call to get_by_rna_id() for complete information + result = await self.get_by_rna_id(rna_id) + + # Step 3: If get_by_rna_id() failed, use search result data as fallback + if not result: + logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) + result = self._rna_data_to_dict(rna_id, first_result) + + return result logger.info("No results found for keyword: %s", keyword) return None error_text = await resp.text() @@ -131,8 +297,9 @@ async def get_best_hit(self, keyword: str) -> Optional[dict]: async def search_by_sequence(self, sequence: str) -> Optional[dict]: """ Search RNAcentral with an RNA sequence. + Unified approach: Find RNA ID from sequence search, then call get_by_rna_id() for complete information. :param sequence: RNA sequence (FASTA format or raw sequence). - :return: A dictionary containing the best hit information or None if not found. + :return: A dictionary containing complete RNA information or None if not found. 
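A compact sketch of the pagination pattern _fetch_all_xrefs relies on: keep following the "next" URL until the API stops returning one. The endpoint shape mirrors the RNAcentral v1 API used above; the example ID is a placeholder:

    import asyncio

    import aiohttp

    async def fetch_all_pages(url: str) -> list:
        results = []
        async with aiohttp.ClientSession() as session:
            while url:
                async with session.get(
                    url,
                    headers={"Accept": "application/json"},
                    timeout=aiohttp.ClientTimeout(total=30),
                ) as resp:
                    resp.raise_for_status()
                    data = await resp.json()
                    results.extend(data.get("results", []))
                    url = data.get("next")  # None on the last page
                await asyncio.sleep(0.2)  # stay polite to the API
        return results

    # asyncio.run(fetch_all_pages("https://rnacentral.org/api/v1/rna/<URS_id>/xrefs"))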
""" try: # Extract sequence (if in FASTA format) @@ -165,7 +332,7 @@ async def search_by_sequence(self, sequence: str) -> Optional[dict]: search_results = await resp.json() results = search_results.get("results", []) if results: - # First, try to find an exact sequence match + # Step 1: Find best match (prefer exact match) exact_match = None for result in results: result_seq = result.get("sequence", "") @@ -178,16 +345,15 @@ async def search_by_sequence(self, sequence: str) -> Optional[dict]: rna_id = target_result.get("rnacentral_id") if rna_id: - # Try to get detailed info, but fall back to search result if it fails - try: - detailed_info = await self.get_by_rna_id(rna_id) - if detailed_info: - return detailed_info - except Exception as e: - logger.debug("Failed to get detailed info for %s: %s, using search result", rna_id, e) + # Step 2: Unified call to get_by_rna_id() for complete information + result = await self.get_by_rna_id(rna_id) + + # Step 3: If get_by_rna_id() failed, use search result data as fallback + if not result: + logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) + result = self._rna_data_to_dict(rna_id, target_result) - # Fall back to using search result data - return self._rna_data_to_dict(rna_id, target_result) + return result logger.info("No results found for sequence.") return None error_text = await resp.text() From 8678e33161f969f948b72d61108440409f42fdf1 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 27 Nov 2025 20:34:33 +0800 Subject: [PATCH 09/22] fix: fix pylint style issues --- graphgen/models/searcher/db/ncbi_searcher.py | 41 +++++++------ .../models/searcher/db/rnacentral_searcher.py | 58 +++++++++---------- 2 files changed, 49 insertions(+), 50 deletions(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index cc78fa1c..49d0f901 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -33,7 +33,7 @@ class NCBISearch(BaseSearcher): 1) Get the gene/DNA by accession number or gene ID. 2) Search with keywords or gene names (fuzzy search). 3) Search with FASTA sequence (BLAST search for DNA sequences). - + API Documentation: https://www.ncbi.nlm.nih.gov/home/develop/api/ Note: NCBI has rate limits (max 3 requests per second), delays are required between requests. 
""" @@ -49,12 +49,11 @@ def _safe_get(obj, key, default=None): """Safely get value from dict or StringElement-like object.""" if isinstance(obj, dict): return obj.get(key, default) - elif hasattr(obj, "get"): + if hasattr(obj, "get"): return obj.get(key, default) - elif hasattr(obj, key): + if hasattr(obj, key): return getattr(obj, key, default) - else: - return default + return default @staticmethod def _gene_record_to_dict(gene_record, gene_id: str) -> dict: @@ -68,7 +67,7 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict: raise ValueError("Empty gene record") gene_data = gene_record[0] - + # Safely extract gene_ref, handling both dict and StringElement types gene_ref = {} entrezgene_gene = gene_data.get("Entrezgene_gene") @@ -146,7 +145,7 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict: # Note: Entrezgene_location doesn't exist, but Entrezgene_locus contains location info chromosome = None genomic_location = None - + try: locus_data = gene_data.get("Entrezgene_locus") if locus_data and isinstance(locus_data, list) and locus_data: @@ -159,7 +158,7 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict: match = re.search(r'Chromosome\s+(\S+)', str(label)) if match: chromosome = match.group(1) - + # Extract genomic location from Gene-commentary_seqs seqs = NCBISearch._safe_get(first_locus, "Gene-commentary_seqs", []) if seqs and isinstance(seqs, list) and seqs: @@ -255,7 +254,7 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict: reraise=True, before_sleep=before_sleep_log(logger, logging.WARNING), ) - + def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]: """ Get gene information by Gene ID. @@ -272,7 +271,7 @@ def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None if not gene_record: return None result = self._gene_record_to_dict(gene_record, gene_id) - + # Try to get sequence from accession # Priority: 1) preferred_accession (if provided), 2) representative mRNA accession accession_to_use = preferred_accession or result.get("_representative_accession") @@ -292,7 +291,7 @@ def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None seq_lines = sequence_data.strip().split("\n") header = seq_lines[0] if seq_lines else "" sequence = "".join(seq_lines[1:]) - + # Get summary for additional info time.sleep(0.35) summary_handle = Entrez.esummary(db="nuccore", id=accession_to_use) @@ -301,7 +300,7 @@ def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None if summary: summary_data = summary[0] title = summary_data.get("Title", header) - + # Determine molecule type detail molecule_type_detail = "N/A" if accession_to_use.startswith("NM_") or accession_to_use.startswith("XM_"): @@ -312,13 +311,13 @@ def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None molecule_type_detail = "RNA" elif accession_to_use.startswith("NG_"): molecule_type_detail = "genomic region" - + # Merge sequence information into result result["sequence"] = sequence result["sequence_length"] = len(sequence) result["title"] = title result["molecule_type_detail"] = molecule_type_detail - + # Update chromosome and genomic_location if not already set if not result.get("chromosome"): chromosome = summary_data.get("ChrLoc") or summary_data.get("ChrAccVer") @@ -337,9 +336,9 @@ def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None # Re-raise to allow retry mechanism raise except Exception as e: - logger.debug("Failed to 
get sequence for accession %s: %s", + logger.debug("Failed to get sequence for accession %s: %s", accession_to_use, e) - + # Remove internal field result.pop("_representative_accession", None) return result @@ -368,7 +367,7 @@ def get_by_accession(self, accession: str) -> Optional[dict]: gene_id = None try: links = Entrez.read(link_handle) - + # Extract GeneID from elink results # Structure: links[0]["LinkSetDb"][0]["Link"][0]["Id"] if links and len(links) > 0: @@ -401,19 +400,19 @@ def get_by_accession(self, accession: str) -> Optional[dict]: # Continue to check if we got gene_id before the error finally: link_handle.close() - + # Step 2: If we have a GeneID, get complete information from Gene database # Pass the accession as preferred_accession so get_by_gene_id can use it for sequence if gene_id: result = self.get_by_gene_id(gene_id, preferred_accession=accession) - + # Update id to accession for consistency (user searched by accession) if result: result["id"] = accession result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}" - + return result - + # Step 3: If no GeneID, this is a rare case (accession without associated gene) # Return None - we can't provide complete information without Gene ID logger.warning("Accession %s has no associated GeneID, cannot provide complete information", accession) diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index 80cb4428..89b430ac 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -19,7 +19,7 @@ class RNACentralSearch(BaseSearcher): 1) Get RNA by RNAcentral ID. 2) Search with keywords or RNA names (fuzzy search). 3) Search with RNA sequence. - + API Documentation: https://rnacentral.org/api/v1 """ @@ -37,7 +37,7 @@ async def _fetch_all_xrefs(self, xrefs_url: str, session: aiohttp.ClientSession) """ all_xrefs = [] current_url = xrefs_url - + while current_url: try: async with session.get( @@ -47,12 +47,12 @@ async def _fetch_all_xrefs(self, xrefs_url: str, session: aiohttp.ClientSession) data = await resp.json() results = data.get("results", []) all_xrefs.extend(results) - + # Check if there's a next page current_url = data.get("next") if not current_url: break - + # Small delay to avoid rate limiting await asyncio.sleep(0.2) else: @@ -61,7 +61,7 @@ async def _fetch_all_xrefs(self, xrefs_url: str, session: aiohttp.ClientSession) except Exception as e: logger.warning("Error fetching xrefs from %s: %s", current_url, e) break - + return all_xrefs @staticmethod @@ -78,31 +78,31 @@ def _extract_info_from_xrefs(xrefs: List[Dict]) -> Dict[str, Any]: "so_terms": set(), "xrefs_list": [], } - + for xref in xrefs: # Extract accession information accession = xref.get("accession", {}) - + # Extract species information species = accession.get("species") if species: extracted["organisms"].add(species) - + # Extract gene name gene = accession.get("gene") if gene and gene.strip(): # Only add non-empty genes extracted["gene_names"].add(gene.strip()) - + # Extract modifications modifications = xref.get("modifications", []) if modifications: extracted["modifications"].extend(modifications) - + # Extract SO term (biotype) biotype = accession.get("biotype") if biotype: extracted["so_terms"].add(biotype) - + # Build xrefs list xref_info = { "database": xref.get("database"), @@ -113,24 +113,24 @@ def _extract_info_from_xrefs(xrefs: List[Dict]) -> Dict[str, Any]: "gene": gene, } extracted["xrefs_list"].append(xref_info) - 
+ # Convert sets to appropriate formats return { "organism": ( - list(extracted["organisms"])[0] - if len(extracted["organisms"]) == 1 + list(extracted["organisms"])[0] + if len(extracted["organisms"]) == 1 else (", ".join(extracted["organisms"]) if extracted["organisms"] else None) ), "gene_name": ( - list(extracted["gene_names"])[0] - if len(extracted["gene_names"]) == 1 + list(extracted["gene_names"])[0] + if len(extracted["gene_names"]) == 1 else (", ".join(extracted["gene_names"]) if extracted["gene_names"] else None) ), "related_genes": list(extracted["gene_names"]) if extracted["gene_names"] else None, "modifications": extracted["modifications"] if extracted["modifications"] else None, "so_term": ( - list(extracted["so_terms"])[0] - if len(extracted["so_terms"]) == 1 + list(extracted["so_terms"])[0] + if len(extracted["so_terms"]) == 1 else (", ".join(extracted["so_terms"]) if extracted["so_terms"] else None) ), "xrefs": extracted["xrefs_list"] if extracted["xrefs_list"] else None, @@ -146,12 +146,12 @@ def _rna_data_to_dict(rna_id: str, rna_data: dict, xrefs_data: Optional[List[Dic :return: A dictionary containing RNA information. """ sequence = rna_data.get("sequence", "") - + # Initialize extracted info from xrefs if available extracted_info = {} if xrefs_data: extracted_info = RNACentralSearch._extract_info_from_xrefs(xrefs_data) - + # Extract organism information (prefer from xrefs, fallback to main data) organism = extracted_info.get("organism") if not organism: @@ -220,7 +220,7 @@ async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: ) as resp: if resp.status == 200: rna_data = await resp.json() - + # Check if xrefs is a URL and fetch the actual xrefs data xrefs_data = None xrefs_url = rna_data.get("xrefs") @@ -231,7 +231,7 @@ async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: except Exception as e: logger.warning("Failed to fetch xrefs for RNA ID %s: %s", rna_id, e) # Continue without xrefs data - + return self._rna_data_to_dict(rna_id, rna_data, xrefs_data) if resp.status == 404: logger.error("RNA ID %s not found", rna_id) @@ -271,16 +271,16 @@ async def get_best_hit(self, keyword: str) -> Optional[dict]: # Step 1: Get RNA ID from search results first_result = results[0] rna_id = first_result.get("rnacentral_id") - + if rna_id: # Step 2: Unified call to get_by_rna_id() for complete information result = await self.get_by_rna_id(rna_id) - + # Step 3: If get_by_rna_id() failed, use search result data as fallback if not result: logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) result = self._rna_data_to_dict(rna_id, first_result) - + return result logger.info("No results found for keyword: %s", keyword) return None @@ -339,20 +339,20 @@ async def search_by_sequence(self, sequence: str) -> Optional[dict]: if result_seq == seq: exact_match = result break - + # Use exact match if found, otherwise use first result target_result = exact_match if exact_match else results[0] rna_id = target_result.get("rnacentral_id") - + if rna_id: # Step 2: Unified call to get_by_rna_id() for complete information result = await self.get_by_rna_id(rna_id) - + # Step 3: If get_by_rna_id() failed, use search result data as fallback if not result: logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) result = self._rna_data_to_dict(rna_id, target_result) - + return result logger.info("No results found for sequence.") return None From 40ef49e9c3f55fa78256a938f628e817c042c7ff Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 
<2693275288@qq.com> Date: Fri, 28 Nov 2025 01:07:26 +0800 Subject: [PATCH 10/22] fix: reduce nested blocks and fix all pylint issues --- graphgen/models/searcher/db/ncbi_searcher.py | 522 ++++++++++--------- 1 file changed, 282 insertions(+), 240 deletions(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 49d0f901..cca38bca 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -56,51 +56,34 @@ def _safe_get(obj, key, default=None): return default @staticmethod - def _gene_record_to_dict(gene_record, gene_id: str) -> dict: - """ - Convert an Entrez gene record to a dictionary. - :param gene_record: The Entrez gene record (list from Entrez.read). - :param gene_id: The gene ID. - :return: A dictionary containing gene information. - """ - if not gene_record: - raise ValueError("Empty gene record") - - gene_data = gene_record[0] - - # Safely extract gene_ref, handling both dict and StringElement types - gene_ref = {} - entrezgene_gene = gene_data.get("Entrezgene_gene") + def _extract_gene_ref(entrezgene_gene): + """Extract gene_ref from entrezgene_gene.""" if isinstance(entrezgene_gene, dict): - gene_ref = entrezgene_gene.get("Gene-ref", {}) - elif hasattr(entrezgene_gene, "get"): - gene_ref = entrezgene_gene.get("Gene-ref", {}) - else: - # If it's a StringElement or other type, try to access as dict - try: - if hasattr(entrezgene_gene, "Gene-ref"): - gene_ref = getattr(entrezgene_gene, "Gene-ref", {}) - except Exception: - pass + return entrezgene_gene.get("Gene-ref", {}) + if hasattr(entrezgene_gene, "get"): + return entrezgene_gene.get("Gene-ref", {}) + try: + if hasattr(entrezgene_gene, "Gene-ref"): + return getattr(entrezgene_gene, "Gene-ref", {}) + except Exception: + pass + return {} - # Safely extract organism - organism = "N/A" + @staticmethod + def _extract_organism(entrezgene_source): + """Extract organism from entrezgene_source.""" try: - entrezgene_source = gene_data.get("Entrezgene_source") - if isinstance(entrezgene_source, dict): - biosource = entrezgene_source.get("BioSource", {}) - if isinstance(biosource, dict): - biosource_org = biosource.get("BioSource_org", {}) - if isinstance(biosource_org, dict): - org_ref = biosource_org.get("Org-ref", {}) - if isinstance(org_ref, dict): - organism = org_ref.get("Org-ref_taxname", "N/A") - elif hasattr(org_ref, "Org-ref_taxname"): - organism = getattr(org_ref, "Org-ref_taxname", "N/A") + biosource = NCBISearch._safe_get(entrezgene_source, "BioSource", {}) + biosource_org = NCBISearch._safe_get(biosource, "BioSource_org", {}) + org_ref = NCBISearch._safe_get(biosource_org, "Org-ref", {}) + return NCBISearch._safe_get(org_ref, "Org-ref_taxname", "N/A") except Exception as e: logger.debug("Error extracting organism: %s", e) + return "N/A" - # Extract gene synonyms - safely handle StringElement types + @staticmethod + def _extract_synonyms(gene_ref): + """Extract gene synonyms from gene_ref.""" gene_synonyms = [] try: gene_syn = gene_ref.get("Gene-ref_syn", []) if isinstance(gene_ref, dict) else [] @@ -111,120 +94,154 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict: elif isinstance(syn, str): gene_synonyms.append(syn) else: - # Handle StringElement or other types gene_synonyms.append(str(syn)) elif isinstance(gene_syn, str): gene_synonyms.append(gene_syn) - elif gene_syn: # Handle StringElement + elif gene_syn: gene_synonyms.append(str(gene_syn)) except Exception as e: logger.debug("Error extracting gene 
synonyms: %s", e) + return gene_synonyms - # Extract gene type - safely handle StringElement types - # Note: Entrezgene_type is a StringElement with numeric value (e.g., "6" for ncRNA) - gene_type = None + @staticmethod + def _extract_gene_type(gene_data): + """Extract gene type from gene_data.""" try: gene_type_data = gene_data.get("Entrezgene_type") - if gene_type_data: - type_value = str(gene_type_data) - # Map numeric values to type names (NCBI gene type codes) - type_mapping = { - "1": "protein-coding", - "2": "pseudo", - "3": "rRNA", - "4": "tRNA", - "5": "snRNA", - "6": "ncRNA", - "7": "other", - } - gene_type = type_mapping.get(type_value, f"type_{type_value}") + if not gene_type_data: + return None + type_value = str(gene_type_data) + type_mapping = { + "1": "protein-coding", + "2": "pseudo", + "3": "rRNA", + "4": "tRNA", + "5": "snRNA", + "6": "ncRNA", + "7": "other", + } + return type_mapping.get(type_value, f"type_{type_value}") except Exception as e: logger.debug("Error extracting gene type: %s", e) + return None - # Extract chromosome and genomic location from Entrezgene_locus - # Note: Entrezgene_location doesn't exist, but Entrezgene_locus contains location info - chromosome = None - genomic_location = None + @staticmethod + def _extract_chromosome(first_locus): + """Extract chromosome from first_locus.""" + label = NCBISearch._safe_get(first_locus, "Gene-commentary_label", "") + if not label or "Chromosome" not in str(label): + return None + match = re.search(r'Chromosome\s+(\S+)', str(label)) + return match.group(1) if match else None - try: - locus_data = gene_data.get("Entrezgene_locus") - if locus_data and isinstance(locus_data, list) and locus_data: - first_locus = locus_data[0] - if isinstance(first_locus, dict): - # Extract chromosome from Gene-commentary_label - # Example: "Chromosome 13 Reference RoL_Sarg_1.0" -> "13" - label = NCBISearch._safe_get(first_locus, "Gene-commentary_label", "") - if label and "Chromosome" in str(label): - match = re.search(r'Chromosome\s+(\S+)', str(label)) - if match: - chromosome = match.group(1) - - # Extract genomic location from Gene-commentary_seqs - seqs = NCBISearch._safe_get(first_locus, "Gene-commentary_seqs", []) - if seqs and isinstance(seqs, list) and seqs: - first_seq = seqs[0] - if isinstance(first_seq, dict): - seq_loc_int = NCBISearch._safe_get(first_seq, "Seq-loc_int", {}) - if seq_loc_int: - seq_interval = NCBISearch._safe_get(seq_loc_int, "Seq-interval", {}) - if seq_interval: - seq_from = NCBISearch._safe_get(seq_interval, "Seq-interval_from", "") - seq_to = NCBISearch._safe_get(seq_interval, "Seq-interval_to", "") - if seq_from and seq_to: - genomic_location = f"{seq_from}-{seq_to}" - except Exception as e: - logger.debug("Error extracting chromosome/location from gene record: %s", e) + @staticmethod + def _extract_genomic_location(first_locus): + """Extract genomic location from first_locus.""" + seqs = NCBISearch._safe_get(first_locus, "Gene-commentary_seqs", []) + if not seqs or not isinstance(seqs, list) or not seqs: + return None + first_seq = seqs[0] + if not isinstance(first_seq, dict): + return None + seq_loc_int = NCBISearch._safe_get(first_seq, "Seq-loc_int", {}) + if not seq_loc_int: + return None + seq_interval = NCBISearch._safe_get(seq_loc_int, "Seq-interval", {}) + if not seq_interval: + return None + seq_from = NCBISearch._safe_get(seq_interval, "Seq-interval_from", "") + seq_to = NCBISearch._safe_get(seq_interval, "Seq-interval_to", "") + if seq_from and seq_to: + return f"{seq_from}-{seq_to}" + 
return None + + @staticmethod + def _extract_location_info(locus_data): + """Extract chromosome and genomic location from locus data.""" + if not locus_data or not isinstance(locus_data, list) or not locus_data: + return None, None + first_locus = locus_data[0] + if not isinstance(first_locus, dict): + return None, None + chromosome = NCBISearch._extract_chromosome(first_locus) + genomic_location = NCBISearch._extract_genomic_location(first_locus) + return chromosome, genomic_location - # Extract gene functional description - # Note: Entrezgene_summary doesn't exist for most genes - # Try to extract from Entrezgene_comments if available - function = None + @staticmethod + def _extract_function_info(gene_data): + """Extract gene functional description.""" try: - # First try Entrezgene_summary (if exists) summary = gene_data.get("Entrezgene_summary") if summary: - function = str(summary) - else: - # Try to extract from Entrezgene_comments - comments_data = gene_data.get("Entrezgene_comments") - if comments_data and isinstance(comments_data, list): - for comment in comments_data: - if isinstance(comment, dict): - heading = NCBISearch._safe_get(comment, "Gene-commentary_heading", "") - # Look for function-related comments - if "function" in str(heading).lower() or "summary" in str(heading).lower(): - comment_text = NCBISearch._safe_get(comment, "Gene-commentary_comment", "") - if comment_text: - function = str(comment_text) - break + return str(summary) + comments_data = gene_data.get("Entrezgene_comments") + if not comments_data or not isinstance(comments_data, list): + return None + for comment in comments_data: + if not isinstance(comment, dict): + continue + heading = NCBISearch._safe_get(comment, "Gene-commentary_heading", "") + heading_lower = str(heading).lower() + if "function" not in heading_lower and "summary" not in heading_lower: + continue + comment_text = NCBISearch._safe_get(comment, "Gene-commentary_comment", "") + if comment_text: + return str(comment_text) + return None except Exception as e: logger.debug("Error extracting function: %s", e) + return None - # Try to extract representative mRNA accession from Entrezgene_locus for sequence retrieval + @staticmethod + def _extract_accession(locus_data): + """Extract representative mRNA accession from locus data.""" + if not locus_data or not isinstance(locus_data, list) or not locus_data: + return None + first_locus = locus_data[0] + if not isinstance(first_locus, dict): + return None + products = NCBISearch._safe_get(first_locus, "Gene-commentary_products", []) + if not products or not isinstance(products, list): + return None representative_accession = None - try: - if locus_data and isinstance(locus_data, list) and locus_data: - first_locus = locus_data[0] - if isinstance(first_locus, dict): - products = NCBISearch._safe_get(first_locus, "Gene-commentary_products", []) - if products and isinstance(products, list): - # Look for mRNA (type 3) or the first product - for product in products: - if isinstance(product, dict): - product_type = NCBISearch._safe_get(product, "Gene-commentary_type", "") - product_type_str = str(product_type) - # Type 3 is mRNA, prefer mRNA over other types - if product_type_str == "3" or (not representative_accession and product_type_str): - accession = NCBISearch._safe_get(product, "Gene-commentary_accession", "") - if accession: - representative_accession = str(accession) - if product_type_str == "3": # Found mRNA, use it - break - except Exception as e: - logger.debug("Error extracting representative 
accession: %s", e) + for product in products: + if not isinstance(product, dict): + continue + product_type = NCBISearch._safe_get(product, "Gene-commentary_type", "") + product_type_str = str(product_type) + if product_type_str == "3" or (not representative_accession and product_type_str): + accession = NCBISearch._safe_get(product, "Gene-commentary_accession", "") + if accession: + representative_accession = str(accession) + if product_type_str == "3": + break + return representative_accession + + @staticmethod + def _gene_record_to_dict(gene_record, gene_id: str) -> dict: + """ + Convert an Entrez gene record to a dictionary. + :param gene_record: The Entrez gene record (list from Entrez.read). + :param gene_id: The gene ID. + :return: A dictionary containing gene information. + """ + if not gene_record: + raise ValueError("Empty gene record") + + gene_data = gene_record[0] + locus_data = gene_data.get("Entrezgene_locus") + + # Extract information using helper methods + entrezgene_gene = gene_data.get("Entrezgene_gene") + gene_ref = NCBISearch._extract_gene_ref(entrezgene_gene) + organism = NCBISearch._extract_organism(gene_data.get("Entrezgene_source")) + gene_synonyms = NCBISearch._extract_synonyms(gene_ref) + gene_type = NCBISearch._extract_gene_type(gene_data) + chromosome, genomic_location = NCBISearch._extract_location_info(locus_data) + function = NCBISearch._extract_function_info(gene_data) + representative_accession = NCBISearch._extract_accession(locus_data) # Build result dictionary with all fields - # Include all fields that might be present in accession-based queries return { "molecule_type": "DNA", "database": "NCBI", @@ -247,6 +264,128 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict: "_representative_accession": representative_accession, } + def _fetch_sequence(self, accession: str): + """Fetch sequence from nuccore database using efetch.""" + time.sleep(0.35) # Comply with rate limit + seq_handle = Entrez.efetch( + db="nuccore", + id=accession, + rettype="fasta", + retmode="text", + ) + try: + sequence_data = seq_handle.read() + if not sequence_data: + return None, None + seq_lines = sequence_data.strip().split("\n") + header = seq_lines[0] if seq_lines else "" + sequence = "".join(seq_lines[1:]) + return sequence, header + finally: + seq_handle.close() + + def _fetch_summary(self, accession: str, default_header: str = ""): + """Fetch summary from nuccore database using esummary.""" + time.sleep(0.35) # Comply with rate limit + summary_handle = Entrez.esummary(db="nuccore", id=accession) + try: + summary = Entrez.read(summary_handle) + if not summary: + return None + summary_data = summary[0] + + # Determine molecule type detail + molecule_type_detail = "N/A" + if accession.startswith("NM_") or accession.startswith("XM_"): + molecule_type_detail = "mRNA" + elif accession.startswith("NC_") or accession.startswith("NT_"): + molecule_type_detail = "genomic DNA" + elif accession.startswith("NR_") or accession.startswith("XR_"): + molecule_type_detail = "RNA" + elif accession.startswith("NG_"): + molecule_type_detail = "genomic region" + + title = summary_data.get("Title", default_header) + chromosome = summary_data.get("ChrLoc") or summary_data.get("ChrAccVer") + chr_start = summary_data.get("ChrStart") + chr_stop = summary_data.get("ChrStop") + genomic_location = None + if chr_start and chr_stop: + genomic_location = f"{chr_start}-{chr_stop}" + + return { + "title": title, + "molecule_type_detail": molecule_type_detail, + "chromosome": chromosome, + 
"genomic_location": genomic_location, + } + finally: + summary_handle.close() + + def _extract_gene_id(self, link_handle): + """Extract GeneID from elink results.""" + try: + links = Entrez.read(link_handle) + if not links or len(links) == 0: + return None + + first_link = links[0] + if "LinkSetDb" not in first_link: + return None + + for link_set in first_link["LinkSetDb"]: + if link_set.get("DbTo") != "gene": + continue + + # Try Link structure first (most common) + links_in_set = link_set.get("Link", []) + if links_in_set and len(links_in_set) > 0: + first_link_item = links_in_set[0] + if isinstance(first_link_item, dict): + gene_id = str(first_link_item.get("Id", "")) + elif hasattr(first_link_item, "Id"): + gene_id = str(getattr(first_link_item, "Id", "")) + else: + gene_id = str(first_link_item) + if gene_id: + return gene_id + + # Fallback: Try IdList (if Link is not available) + id_list = link_set.get("IdList", []) + if id_list: + return str(id_list[0]) + + return None + except Exception as e: + logger.error("Error parsing elink result: %s", e) + import traceback + logger.debug(traceback.format_exc()) + return None + + def _extract_sequence(self, result: dict, accession: str): + """Enrich result dictionary with sequence and summary information from accession.""" + try: + sequence, header = self._fetch_sequence(accession) + if sequence: + result["sequence"] = sequence + result["sequence_length"] = len(sequence) + + summary_info = self._fetch_summary(accession, header or "") + if not summary_info: + return + + result["title"] = summary_info.get("title") + result["molecule_type_detail"] = summary_info.get("molecule_type_detail") + # Update chromosome and genomic_location if not already set + if not result.get("chromosome") and summary_info.get("chromosome"): + result["chromosome"] = summary_info["chromosome"] + if not result.get("genomic_location") and summary_info.get("genomic_location"): + result["genomic_location"] = summary_info["genomic_location"] + except (RequestException, IncompleteRead): + raise + except Exception as e: + logger.debug("Failed to get sequence for accession %s: %s", accession, e) + @retry( stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10), @@ -260,7 +399,7 @@ def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None Get gene information by Gene ID. This is the unified data source - all search methods eventually call this. :param gene_id: NCBI Gene ID. - :param preferred_accession: Optional accession to use for sequence retrieval if representative mRNA is not available. + :param preferred_accession: Optional accession to use for sequence retrieval. :return: A dictionary containing gene information or None if not found. 
""" try: @@ -273,71 +412,9 @@ def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None result = self._gene_record_to_dict(gene_record, gene_id) # Try to get sequence from accession - # Priority: 1) preferred_accession (if provided), 2) representative mRNA accession accession_to_use = preferred_accession or result.get("_representative_accession") if accession_to_use: - try: - # Get sequence info directly from nuccore database - time.sleep(0.35) - seq_handle = Entrez.efetch( - db="nuccore", - id=accession_to_use, - rettype="fasta", - retmode="text", - ) - try: - sequence_data = seq_handle.read() - if sequence_data: - seq_lines = sequence_data.strip().split("\n") - header = seq_lines[0] if seq_lines else "" - sequence = "".join(seq_lines[1:]) - - # Get summary for additional info - time.sleep(0.35) - summary_handle = Entrez.esummary(db="nuccore", id=accession_to_use) - try: - summary = Entrez.read(summary_handle) - if summary: - summary_data = summary[0] - title = summary_data.get("Title", header) - - # Determine molecule type detail - molecule_type_detail = "N/A" - if accession_to_use.startswith("NM_") or accession_to_use.startswith("XM_"): - molecule_type_detail = "mRNA" - elif accession_to_use.startswith("NC_") or accession_to_use.startswith("NT_"): - molecule_type_detail = "genomic DNA" - elif accession_to_use.startswith("NR_") or accession_to_use.startswith("XR_"): - molecule_type_detail = "RNA" - elif accession_to_use.startswith("NG_"): - molecule_type_detail = "genomic region" - - # Merge sequence information into result - result["sequence"] = sequence - result["sequence_length"] = len(sequence) - result["title"] = title - result["molecule_type_detail"] = molecule_type_detail - - # Update chromosome and genomic_location if not already set - if not result.get("chromosome"): - chromosome = summary_data.get("ChrLoc") or summary_data.get("ChrAccVer") - if chromosome: - result["chromosome"] = chromosome - if not result.get("genomic_location"): - chr_start = summary_data.get("ChrStart") - chr_stop = summary_data.get("ChrStop") - if chr_start and chr_stop: - result["genomic_location"] = f"{chr_start}-{chr_stop}" - finally: - summary_handle.close() - finally: - seq_handle.close() - except (RequestException, IncompleteRead): - # Re-raise to allow retry mechanism - raise - except Exception as e: - logger.debug("Failed to get sequence for accession %s: %s", - accession_to_use, e) + self._extract_sequence(result, accession_to_use) # Remove internal field result.pop("_representative_accession", None) @@ -364,58 +441,24 @@ def get_by_accession(self, accession: str) -> Optional[dict]: # Note: esummary for nuccore doesn't include GeneID, so we use elink instead time.sleep(0.35) link_handle = Entrez.elink(dbfrom="nuccore", db="gene", id=accession) - gene_id = None try: - links = Entrez.read(link_handle) - - # Extract GeneID from elink results - # Structure: links[0]["LinkSetDb"][0]["Link"][0]["Id"] - if links and len(links) > 0: - first_link = links[0] - if "LinkSetDb" in first_link: - for link_set in first_link["LinkSetDb"]: - if link_set.get("DbTo") == "gene": - # Try Link structure first (most common) - links_in_set = link_set.get("Link", []) - if links_in_set and len(links_in_set) > 0: - first_link_item = links_in_set[0] - if isinstance(first_link_item, dict): - gene_id = str(first_link_item.get("Id", "")) - elif hasattr(first_link_item, "Id"): - gene_id = str(getattr(first_link_item, "Id", "")) - else: - # Handle StringElement or other types - gene_id = str(first_link_item) 
- if gene_id: - break - # Fallback: Try IdList (if Link is not available) - id_list = link_set.get("IdList", []) - if id_list and not gene_id: - gene_id = str(id_list[0]) - break - except Exception as e: - logger.error("Error parsing elink result for accession %s: %s", accession, e) - import traceback - logger.debug(traceback.format_exc()) - # Continue to check if we got gene_id before the error + gene_id = self._extract_gene_id(link_handle) finally: link_handle.close() # Step 2: If we have a GeneID, get complete information from Gene database - # Pass the accession as preferred_accession so get_by_gene_id can use it for sequence if gene_id: result = self.get_by_gene_id(gene_id, preferred_accession=accession) - - # Update id to accession for consistency (user searched by accession) if result: result["id"] = accession result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}" - return result # Step 3: If no GeneID, this is a rare case (accession without associated gene) - # Return None - we can't provide complete information without Gene ID - logger.warning("Accession %s has no associated GeneID, cannot provide complete information", accession) + logger.warning( + "Accession %s has no associated GeneID, cannot provide complete information", + accession + ) return None except (RequestException, IncompleteRead): raise @@ -491,13 +534,12 @@ def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional else: seq = sequence.strip().replace(" ", "").replace("\n", "") - # Validate if it's a DNA sequence - if not re.fullmatch(r"[ATCGN\s]+", seq, re.I): - logger.error("Invalid DNA sequence provided.") - return None - - if not seq: - logger.error("Empty DNA sequence provided.") + # Validate sequence + if not seq or not re.fullmatch(r"[ATCGN\s]+", seq, re.I): + if not seq: + logger.error("Empty DNA sequence provided.") + else: + logger.error("Invalid DNA sequence provided.") return None # Use BLAST search (Note: requires network connection, may be slow) From 93826604581c0bcd5d28738add03cdac3babf704 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Sat, 29 Nov 2025 22:25:42 +0800 Subject: [PATCH 11/22] feat: add DNA RNA local blast --- graphgen/configs/search_dna_config.yaml | 2 + graphgen/configs/search_protein_config.yaml | 3 +- graphgen/configs/search_rna_config.yaml | 4 +- graphgen/models/searcher/db/ncbi_searcher.py | 66 ++++++- .../models/searcher/db/rnacentral_searcher.py | 73 ++++++- scripts/search/build_db/build_dna_blast_db.sh | 178 ++++++++++++++++++ .../search/build_db/build_protein_blast_db.sh | 56 ++++++ scripts/search/build_db/build_rna_blast_db.sh | 157 +++++++++++++++ 8 files changed, 529 insertions(+), 10 deletions(-) create mode 100755 scripts/search/build_db/build_dna_blast_db.sh create mode 100755 scripts/search/build_db/build_protein_blast_db.sh create mode 100755 scripts/search/build_db/build_rna_blast_db.sh diff --git a/graphgen/configs/search_dna_config.yaml b/graphgen/configs/search_dna_config.yaml index 95f8fc39..5245ea0c 100644 --- a/graphgen/configs/search_dna_config.yaml +++ b/graphgen/configs/search_dna_config.yaml @@ -12,4 +12,6 @@ pipeline: ncbi_params: email: test@example.com # NCBI requires an email address tool: GraphGen # tool name for NCBI API + use_local_blast: true # whether to use local blast for DNA search + local_blast_db: /your_path/refseq_241 # path to local BLAST database (without .nhr extension) diff --git a/graphgen/configs/search_protein_config.yaml b/graphgen/configs/search_protein_config.yaml index 
bb46d34c..bfbf84eb 100644 --- a/graphgen/configs/search_protein_config.yaml +++ b/graphgen/configs/search_protein_config.yaml @@ -11,4 +11,5 @@ pipeline: data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot uniprot_params: use_local_blast: true # whether to use local blast for uniprot search - local_blast_db: /your_path/uniprot_sprot + local_blast_db: /your_path/2024_01/uniprot_sprot # format: /path/to/${RELEASE}/uniprot_sprot + # options: uniprot_sprot (recommended, high quality), uniprot_trembl, or uniprot_${RELEASE} (merged database) diff --git a/graphgen/configs/search_rna_config.yaml b/graphgen/configs/search_rna_config.yaml index 3d051417..dae62ec2 100644 --- a/graphgen/configs/search_rna_config.yaml +++ b/graphgen/configs/search_rna_config.yaml @@ -10,5 +10,7 @@ pipeline: params: data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral rnacentral_params: - {} # RNAcentral doesn't require additional parameters currently + use_local_blast: true # whether to use local blast for RNA search + local_blast_db: /your_path/refseq_rna_241 # format: /path/to/refseq_rna_${RELEASE} + # can also use DNA database with RNA sequences (if already built) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index cca38bca..24a37e2b 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -1,6 +1,9 @@ import asyncio import logging +import os import re +import subprocess +import tempfile import time from concurrent.futures import ThreadPoolExecutor from functools import lru_cache @@ -38,11 +41,22 @@ class NCBISearch(BaseSearcher): Note: NCBI has rate limits (max 3 requests per second), delays are required between requests. """ - def __init__(self, email: str = "test@example.com", tool: str = "GraphGen"): + def __init__( + self, + email: str = "test@example.com", + tool: str = "GraphGen", + use_local_blast: bool = False, + local_blast_db: str = "nt_db", + ): super().__init__() Entrez.email = email Entrez.tool = tool Entrez.timeout = 60 # 60 seconds timeout + self.use_local_blast = use_local_blast + self.local_blast_db = local_blast_db + if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"): + logger.error("Local BLAST database files not found. Please check the path.") + self.use_local_blast = False @staticmethod def _safe_get(obj, key, default=None): @@ -518,10 +532,47 @@ def get_best_hit(self, keyword: str) -> Optional[dict]: logger.error("Keyword %s not found: %s", keyword, e) return None + def _local_blast(self, seq: str, threshold: float) -> Optional[str]: + """ + Perform local BLAST search using local BLAST database. + :param seq: The DNA sequence. + :param threshold: E-value threshold for BLAST search. + :return: The accession number of the best hit or None if not found. 
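+
+        Note (assumption): this relies on the `blastn` binary from NCBI BLAST+
+        being available on PATH and on `local_blast_db` pointing at a nucleotide
+        database built with `makeblastdb -dbtype nucl`
+        (see scripts/search/build_db/build_dna_blast_db.sh).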
+ """ + try: + with tempfile.NamedTemporaryFile( + mode="w+", suffix=".fa", delete=False + ) as tmp: + tmp.write(f">query\n{seq}\n") + tmp_name = tmp.name + + cmd = [ + "blastn", + "-db", + self.local_blast_db, + "-query", + tmp_name, + "-evalue", + str(threshold), + "-max_target_seqs", + "1", + "-outfmt", + "6 sacc", # only return accession + ] + logger.debug("Running local blastn: %s", " ".join(cmd)) + out = subprocess.check_output(cmd, text=True).strip() + os.remove(tmp_name) + if out: + return out.split("\n", maxsplit=1)[0] + return None + except Exception as exc: # pylint: disable=broad-except + logger.error("Local blastn failed: %s", exc) + return None + def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: """ Search NCBI with a DNA sequence using BLAST. - Note: This is a simplified version. For production, consider using local BLAST. + Tries local BLAST first if enabled, falls back to network BLAST. :param sequence: DNA sequence (FASTA format or raw sequence). :param threshold: E-value threshold for BLAST search. :return: A dictionary containing the best hit information or None if not found. @@ -542,7 +593,16 @@ def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional logger.error("Invalid DNA sequence provided.") return None - # Use BLAST search (Note: requires network connection, may be slow) + # Try local BLAST first if enabled + accession = None + if self.use_local_blast: + accession = self._local_blast(seq, threshold) + if accession: + logger.debug("Local BLAST found accession: %s", accession) + return self.get_by_accession(accession) + + # Fall back to network BLAST + logger.debug("Falling back to NCBIWWW.qblast.") logger.debug("Performing BLAST search for DNA sequence...") time.sleep(0.35) diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index 89b430ac..c31bd978 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -1,5 +1,8 @@ import asyncio +import os import re +import subprocess +import tempfile from typing import Dict, Optional, List, Any import aiohttp @@ -23,10 +26,15 @@ class RNACentralSearch(BaseSearcher): API Documentation: https://rnacentral.org/api/v1 """ - def __init__(self): + def __init__(self, use_local_blast: bool = False, local_blast_db: str = "rna_db"): super().__init__() self.base_url = "https://rnacentral.org/api/v1" self.headers = {"Accept": "application/json"} + self.use_local_blast = use_local_blast + self.local_blast_db = local_blast_db + if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"): + logger.error("Local BLAST database files not found. Please check the path.") + self.use_local_blast = False async def _fetch_all_xrefs(self, xrefs_url: str, session: aiohttp.ClientSession) -> List[Dict]: """ @@ -294,11 +302,50 @@ async def get_best_hit(self, keyword: str) -> Optional[dict]: logger.error("Keyword %s not found: %s", keyword, e) return None - async def search_by_sequence(self, sequence: str) -> Optional[dict]: + def _local_blast(self, seq: str, threshold: float) -> Optional[str]: + """ + Perform local BLAST search using local BLAST database. + :param seq: The RNA sequence. + :param threshold: E-value threshold for BLAST search. + :return: The accession/ID of the best hit or None if not found. 
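+
+        Note: hits from a local nucleotide database are typically RefSeq-style
+        accessions rather than RNAcentral URS IDs, so the caller may still need
+        to map the returned accession (see search_by_sequence below).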
+ """ + try: + with tempfile.NamedTemporaryFile( + mode="w+", suffix=".fa", delete=False + ) as tmp: + tmp.write(f">query\n{seq}\n") + tmp_name = tmp.name + + cmd = [ + "blastn", + "-db", + self.local_blast_db, + "-query", + tmp_name, + "-evalue", + str(threshold), + "-max_target_seqs", + "1", + "-outfmt", + "6 sacc", # only return accession + ] + logger.debug("Running local blastn for RNA: %s", " ".join(cmd)) + out = subprocess.check_output(cmd, text=True).strip() + os.remove(tmp_name) + if out: + return out.split("\n", maxsplit=1)[0] + return None + except Exception as exc: # pylint: disable=broad-except + logger.error("Local blastn failed: %s", exc) + return None + + async def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: """ Search RNAcentral with an RNA sequence. + Tries local BLAST first if enabled, falls back to RNAcentral API. Unified approach: Find RNA ID from sequence search, then call get_by_rna_id() for complete information. :param sequence: RNA sequence (FASTA format or raw sequence). + :param threshold: E-value threshold for BLAST search. :return: A dictionary containing complete RNA information or None if not found. """ try: @@ -318,7 +365,23 @@ async def search_by_sequence(self, sequence: str) -> Optional[dict]: logger.error("Empty RNA sequence provided.") return None - # RNAcentral API supports sequence search + # Try local BLAST first if enabled + if self.use_local_blast: + accession = self._local_blast(seq, threshold) + if accession: + logger.debug("Local BLAST found accession: %s", accession) + # Try to get RNA ID from accession (may need conversion) + # For now, try using accession as RNA ID or search by it + result = await self.get_by_rna_id(accession) + if result: + return result + # If not found by ID, try keyword search + result = await self.get_best_hit(accession) + if result: + return result + + # Fall back to RNAcentral API + logger.debug("Falling back to RNAcentral API.") async with aiohttp.ClientSession() as session: search_url = f"{self.base_url}/rna" params = {"sequence": seq, "format": "json"} @@ -373,7 +436,7 @@ async def search_by_sequence(self, sequence: str) -> Optional[dict]: reraise=True, ) async def search( - self, query: str, threshold: float = 0.7, **kwargs + self, query: str, threshold: float = 0.1, **kwargs ) -> Optional[Dict]: """ Search RNAcentral with either an RNAcentral ID, keyword, or RNA sequence. @@ -395,7 +458,7 @@ async def search( if query.startswith(">") or ( re.fullmatch(r"[AUCGN\s]+", query, re.I) and "U" in query.upper() ): - result = await self.search_by_sequence(query) + result = await self.search_by_sequence(query, threshold) # check if RNAcentral ID (typically starts with URS) elif re.fullmatch(r"URS\d+", query, re.I): result = await self.get_by_rna_id(query) diff --git a/scripts/search/build_db/build_dna_blast_db.sh b/scripts/search/build_db/build_dna_blast_db.sh new file mode 100755 index 00000000..b53b4249 --- /dev/null +++ b/scripts/search/build_db/build_dna_blast_db.sh @@ -0,0 +1,178 @@ +#!/bin/bash + +set -e + +# Downloads NCBI RefSeq nucleotide sequences and creates BLAST databases. 
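+#
+# Typical workflow (paths below are examples only; the script prints the exact
+# value to use when it finishes):
+#   ./build_dna_blast_db.sh representative
+#   # then, in graphgen/configs/search_dna_config.yaml:
+#   #   local_blast_db: /data/blast/refseq_<RELEASE>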
+
+#
+# RefSeq directory layout (organised by taxonomic category):
+# - vertebrate_mammalian (mammals)
+# - vertebrate_other (other vertebrates)
+# - bacteria
+# - archaea
+# - fungi
+# - invertebrate (invertebrates)
+# - plant (plants)
+# - viral (viruses)
+# - protozoa
+# - mitochondrion (mitochondria)
+# - plastid (plastids)
+# - plasmid (plasmids)
+# - other
+# - complete/ (complete genomes, covering all categories)
+#
+# Each category directory contains:
+# - {category}.{number}.genomic.fna.gz (genomic sequences)
+# - {category}.{number}.rna.fna.gz (RNA sequences)
+#
+# Usage: ./build_dna_blast_db.sh [representative|complete|all]
+#   representative: Download genomic sequences from major categories (recommended, smaller)
+#                   Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi
+#   complete: Download all complete genomic sequences from complete/ directory (very large)
+#   all: Download all genomic sequences from all categories (very large)
+#
+# We need makeblastdb on our PATH
+# For Ubuntu/Debian: sudo apt install ncbi-blast+
+# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+
+# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/
+
+DOWNLOAD_TYPE=${1:-representative}
+
+# Better to use a stable DOWNLOAD_TMP name to support resuming downloads
+DOWNLOAD_TMP=_downloading_dna
+mkdir -p ${DOWNLOAD_TMP}
+cd ${DOWNLOAD_TMP}
+
+# Download RefSeq release information
+echo "Downloading RefSeq release information..."
+wget -c "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER" || {
+    echo "Warning: Could not download RELEASE_NUMBER, using current date as release identifier"
+    RELEASE=$(date +%Y%m%d)
+}
+
+if [ -f "RELEASE_NUMBER" ]; then
+    RELEASE=$(cat RELEASE_NUMBER | tr -d '\n')
+    echo "RefSeq release: ${RELEASE}"
+else
+    RELEASE=$(date +%Y%m%d)
+    echo "Using date as release identifier: ${RELEASE}"
+fi
+
+# Download based on type
+case ${DOWNLOAD_TYPE} in
+    representative)
+        echo "Downloading RefSeq representative sequences (recommended, smaller size)..."
+        # Download major categories for representative coverage
+        # Note: You can modify this list based on your specific requirements
+        for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi; do
+            echo "Downloading ${category} sequences..."
+            curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \
+                grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \
+                sed 's/href="\(.*\)"/\1/' | \
+                while read filename; do
+                    echo "  Downloading ${filename}..."
+                    wget -c -q --show-progress \
+                        "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || {
+                        echo "Warning: Failed to download ${filename}"
+                    }
+                done
+        done
+        ;;
+    complete)
+        echo "Downloading RefSeq complete genomic sequences (WARNING: very large, may take hours)..."
+        curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/" | \
+            grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \
+            sed 's/href="\(.*\)"/\1/' | \
+            while read filename; do
+                echo "  Downloading ${filename}..."
+                wget -c -q --show-progress \
+                    "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/${filename}" || {
+                    echo "Warning: Failed to download ${filename}"
+                }
+            done
+        ;;
+    all)
+        echo "Downloading all RefSeq genomic sequences from all categories (WARNING: extremely large, may take many hours)..."
+        # Download genomic sequences from all categories
+        for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral protozoa mitochondrion plastid plasmid other; do
+            echo "Downloading ${category} genomic sequences..."
+ curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ + grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' | \ + while read filename; do + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + done + done + ;; + *) + echo "Error: Unknown download type '${DOWNLOAD_TYPE}'" + echo "Usage: $0 [representative|complete|all]" + echo "Note: For RNA sequences, use build_rna_blast_db.sh instead" + exit 1 + ;; +esac + +cd .. + +# Create release directory +mkdir -p refseq_${RELEASE} +mv ${DOWNLOAD_TMP}/* refseq_${RELEASE}/ 2>/dev/null || true +rmdir ${DOWNLOAD_TMP} 2>/dev/null || true + +cd refseq_${RELEASE} + +# Extract and combine sequences +echo "Extracting and combining sequences..." + +# Extract all downloaded genomic sequences +if [ $(find . -name "*.genomic.fna.gz" -type f | wc -l) -gt 0 ]; then + echo "Extracting genomic sequences..." + find . -name "*.genomic.fna.gz" -type f -exec gunzip {} \; +fi + +# Combine all FASTA files into one +echo "Combining all FASTA files..." +FASTA_FILES=$(find . -name "*.fna" -type f) +if [ -z "$FASTA_FILES" ]; then + FASTA_FILES=$(find . -name "*.fa" -type f) +fi + +if [ -z "$FASTA_FILES" ]; then + echo "Error: No FASTA files found to combine" + exit 1 +fi + +echo "$FASTA_FILES" | while read -r file; do + if [ -f "$file" ]; then + cat "$file" >> refseq_${RELEASE}.fasta + fi +done + +# Check if we have sequences +if [ ! -s "refseq_${RELEASE}.fasta" ]; then + echo "Error: Combined FASTA file is empty" + exit 1 +fi + +echo "Creating BLAST database..." +# Create BLAST database for DNA sequences (use -dbtype nucl for nucleotide) +makeblastdb -in refseq_${RELEASE}.fasta \ + -out refseq_${RELEASE} \ + -dbtype nucl \ + -parse_seqids \ + -title "RefSeq_${RELEASE}" + +echo "BLAST database created successfully!" +echo "Database location: $(pwd)/refseq_${RELEASE}" +echo "" +echo "To use this database, set in your config:" +echo " local_blast_db: $(pwd)/refseq_${RELEASE}" +echo "" +echo "Note: The database files are:" +ls -lh refseq_${RELEASE}.* + +cd .. + diff --git a/scripts/search/build_db/build_protein_blast_db.sh b/scripts/search/build_db/build_protein_blast_db.sh new file mode 100755 index 00000000..9292875a --- /dev/null +++ b/scripts/search/build_db/build_protein_blast_db.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +set -e + +# Downloads the latest release of UniProt, putting it in a release-specific directory. +# Creates associated BLAST databases. 
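+#
+# Typical workflow (path below is an example; the script prints the exact values
+# when it finishes):
+#   ./build_protein_blast_db.sh
+#   # then, in graphgen/configs/search_protein_config.yaml:
+#   #   local_blast_db: /data/blast/<RELEASE>/uniprot_sprot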
+# We need makeblastdb on our PATH +# For Ubuntu/Debian: sudo apt install ncbi-blast+ +# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ +# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ + +# Better to use a stable DOWNLOAD_TMP name to support resuming downloads +DOWNLOAD_TMP=_downloading +mkdir -p ${DOWNLOAD_TMP} +cd ${DOWNLOAD_TMP} + +wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/RELEASE.metalink" + +# Extract the release name (like 2017_10 or 2017_1) +# Use sed for cross-platform compatibility (works on both macOS and Linux) +RELEASE=$(sed -n 's/.*\([0-9]\{4\}_[0-9]\{1,2\}\)<\/version>.*/\1/p' RELEASE.metalink | head -1) + +wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" +wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" +wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt" +wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/README" +wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/LICENSE" + +cd .. + +mkdir ${RELEASE} +mv ${DOWNLOAD_TMP}/* ${RELEASE} +rmdir ${DOWNLOAD_TMP} + +cd ${RELEASE} + +gunzip uniprot_sprot.fasta.gz +gunzip uniprot_trembl.fasta.gz + +cat uniprot_sprot.fasta uniprot_trembl.fasta >uniprot_${RELEASE}.fasta + +makeblastdb -in uniprot_${RELEASE}.fasta -out uniprot_${RELEASE} -dbtype prot -parse_seqids -title uniprot_${RELEASE} +makeblastdb -in uniprot_sprot.fasta -out uniprot_sprot -dbtype prot -parse_seqids -title uniprot_sprot +makeblastdb -in uniprot_trembl.fasta -out uniprot_trembl -dbtype prot -parse_seqids -title uniprot_trembl + +cd .. + +echo "BLAST databases created successfully!" +echo "Database locations:" +echo " - Combined: $(pwd)/${RELEASE}/uniprot_${RELEASE}" +echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot" +echo " - TrEMBL: $(pwd)/${RELEASE}/uniprot_trembl" +echo "" +echo "To use these databases, set in your config:" +echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot # or uniprot_${RELEASE} or uniprot_trembl" + diff --git a/scripts/search/build_db/build_rna_blast_db.sh b/scripts/search/build_db/build_rna_blast_db.sh new file mode 100755 index 00000000..89b9dc0e --- /dev/null +++ b/scripts/search/build_db/build_rna_blast_db.sh @@ -0,0 +1,157 @@ +#!/bin/bash + +set -e + +# Downloads NCBI RefSeq RNA sequences and creates BLAST databases. +# This script specifically downloads RNA sequences (mRNA, rRNA, tRNA, etc.) +# from RefSeq, which is suitable for RNA sequence searches. 
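+# The resulting database is what `local_blast_db` in search_rna_config.yaml expects;
+# a database built by build_dna_blast_db.sh can also be reused, since both are
+# nucleotide (-dbtype nucl) databases searched with blastn.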
+# +# Usage: ./build_rna_blast_db.sh [representative|complete|all] +# representative: Download RNA sequences from major categories (recommended, smaller) +# Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi, invertebrate, plant, viral +# complete: Download all RNA sequences from complete/ directory (very large) +# all: Download all RNA sequences from all categories (very large) +# +# We need makeblastdb on our PATH +# For Ubuntu/Debian: sudo apt install ncbi-blast+ +# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ +# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ + +DOWNLOAD_TYPE=${1:-representative} + +# Better to use a stable DOWNLOAD_TMP name to support resuming downloads +DOWNLOAD_TMP=_downloading_rna +mkdir -p ${DOWNLOAD_TMP} +cd ${DOWNLOAD_TMP} + +# Download RefSeq release information +echo "Downloading RefSeq release information..." +wget -c "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER" || { + echo "Warning: Could not download RELEASE_NUMBER, using current date as release identifier" + RELEASE=$(date +%Y%m%d) +} + +if [ -f "RELEASE_NUMBER" ]; then + RELEASE=$(cat RELEASE_NUMBER | tr -d '\n') + echo "RefSeq release: ${RELEASE}" +else + RELEASE=$(date +%Y%m%d) + echo "Using date as release identifier: ${RELEASE}" +fi + +# Download based on type +case ${DOWNLOAD_TYPE} in + representative) + echo "Downloading RefSeq representative RNA sequences (recommended, smaller size)..." + echo "Downloading RNA sequences from major categories..." + for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral; do + echo "Downloading ${category} RNA sequences..." + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ + grep -oE 'href="[^"]*\.rna\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' | \ + while read filename; do + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + done + done + ;; + complete) + echo "Downloading RefSeq complete RNA sequences (WARNING: very large, may take hours)..." + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/" | \ + grep -oE 'href="[^"]*\.rna\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' | \ + while read filename; do + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + done + ;; + all) + echo "Downloading all RefSeq RNA sequences from all categories (WARNING: extremely large, may take many hours)..." + for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral protozoa mitochondrion plastid plasmid other; do + echo "Downloading ${category} RNA sequences..." + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ + grep -oE 'href="[^"]*\.rna\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' | \ + while read filename; do + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + done + done + ;; + *) + echo "Error: Unknown download type '${DOWNLOAD_TYPE}'" + echo "Usage: $0 [representative|complete|all]" + exit 1 + ;; +esac + +cd .. 
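+
+# At this point ${DOWNLOAD_TMP} holds the downloaded *.rna.fna.gz archives; the
+# steps below move them into a release directory, merge them into one FASTA file
+# and build the nucleotide BLAST database with makeblastdb.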
+ +# Create release directory +mkdir -p refseq_rna_${RELEASE} +mv ${DOWNLOAD_TMP}/* refseq_rna_${RELEASE}/ 2>/dev/null || true +rmdir ${DOWNLOAD_TMP} 2>/dev/null || true + +cd refseq_rna_${RELEASE} + +# Extract and combine sequences +echo "Extracting and combining RNA sequences..." + +# Extract all downloaded RNA sequences +if [ $(find . -name "*.rna.fna.gz" -type f | wc -l) -gt 0 ]; then + echo "Extracting RNA sequences..." + find . -name "*.rna.fna.gz" -type f -exec gunzip {} \; +fi + +# Combine all FASTA files into one +echo "Combining all FASTA files..." +FASTA_FILES=$(find . -name "*.fna" -type f) +if [ -z "$FASTA_FILES" ]; then + FASTA_FILES=$(find . -name "*.fa" -type f) +fi + +if [ -z "$FASTA_FILES" ]; then + echo "Error: No FASTA files found to combine" + exit 1 +fi + +echo "$FASTA_FILES" | while read -r file; do + if [ -f "$file" ]; then + cat "$file" >> refseq_rna_${RELEASE}.fasta + fi +done + +# Check if we have sequences +if [ ! -s "refseq_rna_${RELEASE}.fasta" ]; then + echo "Error: Combined FASTA file is empty" + exit 1 +fi + +echo "Creating BLAST database..." +# Create BLAST database for RNA sequences (use -dbtype nucl for nucleotide) +makeblastdb -in refseq_rna_${RELEASE}.fasta \ + -out refseq_rna_${RELEASE} \ + -dbtype nucl \ + -parse_seqids \ + -title "RefSeq_RNA_${RELEASE}" + +echo "BLAST database created successfully!" +echo "Database location: $(pwd)/refseq_rna_${RELEASE}" +echo "" +echo "To use this database, set in your config:" +echo " local_blast_db: $(pwd)/refseq_rna_${RELEASE}" +echo "" +echo "Note: The database files are:" +ls -lh refseq_rna_${RELEASE}.* + +cd .. + From 2a715de9a3578e366ce7ea62fcac733bb24ae6f8 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Sat, 29 Nov 2025 22:40:21 +0800 Subject: [PATCH 12/22] style: reduce return statements and branches in searcher methods - Refactor search_by_sequence in ncbi_searcher.py to reduce return statements from 7 to 1 - Refactor search_by_sequence in rnacentral_searcher.py to reduce return statements from 8 to 1 and branches from 16 to 12 - Extract helper methods to improve code readability and maintainability - Fix pylint errors R0911 (too-many-return-statements) and R0912 (too-many-branches) --- graphgen/models/searcher/db/ncbi_searcher.py | 120 ++++++++-------- .../models/searcher/db/rnacentral_searcher.py | 136 +++++++++--------- 2 files changed, 131 insertions(+), 125 deletions(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 24a37e2b..4558f75a 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -569,6 +569,45 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]: logger.error("Local blastn failed: %s", exc) return None + def _extract_and_normalize_sequence(self, sequence: str) -> Optional[str]: + """Extract and normalize DNA sequence from input.""" + if sequence.startswith(">"): + seq_lines = sequence.strip().split("\n") + seq = "".join(seq_lines[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + return seq if seq and re.fullmatch(r"[ATCGN\s]+", seq, re.I) else None + + def _process_network_blast_result(self, blast_record, seq: str, threshold: float) -> Optional[dict]: + """Process network BLAST result and return dictionary or None.""" + if not blast_record.alignments: + logger.info("No BLAST hits found for the given sequence.") + return None + + best_alignment = blast_record.alignments[0] + best_hsp = best_alignment.hsps[0] + 
if best_hsp.expect > threshold: + logger.info("No BLAST hits below the threshold E-value.") + return None + + hit_id = best_alignment.hit_id + accession_match = re.search(r"ref\|([^|]+)", hit_id) + if accession_match: + accession = accession_match.group(1).split(".")[0] + return self.get_by_accession(accession) + + # If unable to extract accession, return basic information + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": hit_id, + "title": best_alignment.title, + "sequence_length": len(seq), + "e_value": best_hsp.expect, + "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0, + "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}", + } + def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: """ Search NCBI with a DNA sequence using BLAST. @@ -577,77 +616,40 @@ def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional :param threshold: E-value threshold for BLAST search. :return: A dictionary containing the best hit information or None if not found. """ + result = None try: - # Extract sequence (if in FASTA format) - if sequence.startswith(">"): - seq_lines = sequence.strip().split("\n") - seq = "".join(seq_lines[1:]) - else: - seq = sequence.strip().replace(" ", "").replace("\n", "") - - # Validate sequence - if not seq or not re.fullmatch(r"[ATCGN\s]+", seq, re.I): - if not seq: - logger.error("Empty DNA sequence provided.") - else: - logger.error("Invalid DNA sequence provided.") + seq = self._extract_and_normalize_sequence(sequence) + if not seq: + logger.error("Empty or invalid DNA sequence provided.") return None # Try local BLAST first if enabled - accession = None if self.use_local_blast: accession = self._local_blast(seq, threshold) if accession: logger.debug("Local BLAST found accession: %s", accession) - return self.get_by_accession(accession) - - # Fall back to network BLAST - logger.debug("Falling back to NCBIWWW.qblast.") - logger.debug("Performing BLAST search for DNA sequence...") - time.sleep(0.35) - - result_handle = NCBIWWW.qblast( - program="blastn", - database="nr", - sequence=seq, - hitlist_size=1, - expect=threshold, - ) - blast_record = NCBIXML.read(result_handle) - - if not blast_record.alignments: - logger.info("No BLAST hits found for the given sequence.") - return None - - best_alignment = blast_record.alignments[0] - best_hsp = best_alignment.hsps[0] - if best_hsp.expect > threshold: - logger.info("No BLAST hits below the threshold E-value.") - return None - hit_id = best_alignment.hit_id - - # Extract accession number - # Format may be: gi|123456|ref|NM_000546.5| - accession_match = re.search(r"ref\|([^|]+)", hit_id) - if accession_match: - accession = accession_match.group(1).split(".")[0] - return self.get_by_accession(accession) - # If unable to extract accession, return basic information - return { - "molecule_type": "DNA", - "database": "NCBI", - "id": hit_id, - "title": best_alignment.title, - "sequence_length": len(seq), - "e_value": best_hsp.expect, - "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0, - "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}", - } + result = self.get_by_accession(accession) + + # Fall back to network BLAST if local BLAST didn't find result + if not result: + logger.debug("Falling back to NCBIWWW.qblast.") + logger.debug("Performing BLAST search for DNA sequence...") + time.sleep(0.35) + + result_handle = NCBIWWW.qblast( + program="blastn", + database="nr", + 
sequence=seq, + hitlist_size=1, + expect=threshold, + ) + blast_record = NCBIXML.read(result_handle) + result = self._process_network_blast_result(blast_record, seq, threshold) except RequestException: raise except Exception as e: # pylint: disable=broad-except logger.error("BLAST search failed: %s", e) - return None + return result @retry( stop=stop_after_attempt(5), diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index c31bd978..5950a3e7 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -339,6 +339,49 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]: logger.error("Local blastn failed: %s", exc) return None + @staticmethod + def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: + """Extract and normalize RNA sequence from input.""" + if sequence.startswith(">"): + seq_lines = sequence.strip().split("\n") + seq = "".join(seq_lines[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + return seq if seq and re.fullmatch(r"[AUCGN\s]+", seq, re.I) else None + + def _find_best_match_from_results(self, results: List[Dict], seq: str) -> Optional[Dict]: + """Find best match from search results, preferring exact match.""" + exact_match = None + for result_item in results: + result_seq = result_item.get("sequence", "") + if result_seq == seq: + exact_match = result_item + break + return exact_match if exact_match else (results[0] if results else None) + + async def _process_api_search_results( + self, results: List[Dict], seq: str + ) -> Optional[dict]: + """Process API search results and return dictionary or None.""" + if not results: + logger.info("No results found for sequence.") + return None + + target_result = self._find_best_match_from_results(results, seq) + if not target_result: + return None + + rna_id = target_result.get("rnacentral_id") + if not rna_id: + return None + + # Try to get complete information + result = await self.get_by_rna_id(rna_id) + if not result: + logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) + result = self._rna_data_to_dict(rna_id, target_result) + return result + async def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: """ Search RNAcentral with an RNA sequence. @@ -348,21 +391,11 @@ async def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Op :param threshold: E-value threshold for BLAST search. :return: A dictionary containing complete RNA information or None if not found. 
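+        Note: `threshold` is only applied to the local BLAST step; the RNAcentral
+        API fallback searches by sequence without an E-value cutoff.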
""" + result = None try: - # Extract sequence (if in FASTA format) - if sequence.startswith(">"): - seq_lines = sequence.strip().split("\n") - seq = "".join(seq_lines[1:]) - else: - seq = sequence.strip().replace(" ", "").replace("\n", "") - - # Validate if it's an RNA sequence (contains U instead of T) - if not re.fullmatch(r"[AUCGN\s]+", seq, re.I): - logger.error("Invalid RNA sequence provided.") - return None - + seq = self._extract_and_normalize_sequence(sequence) if not seq: - logger.error("Empty RNA sequence provided.") + logger.error("Empty or invalid RNA sequence provided.") return None # Try local BLAST first if enabled @@ -370,64 +403,35 @@ async def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Op accession = self._local_blast(seq, threshold) if accession: logger.debug("Local BLAST found accession: %s", accession) - # Try to get RNA ID from accession (may need conversion) - # For now, try using accession as RNA ID or search by it result = await self.get_by_rna_id(accession) - if result: - return result - # If not found by ID, try keyword search - result = await self.get_best_hit(accession) - if result: - return result - - # Fall back to RNAcentral API - logger.debug("Falling back to RNAcentral API.") - async with aiohttp.ClientSession() as session: - search_url = f"{self.base_url}/rna" - params = {"sequence": seq, "format": "json"} - async with session.get( - search_url, - params=params, - headers=self.headers, - timeout=aiohttp.ClientTimeout(total=60), # Sequence search may take longer - ) as resp: - if resp.status == 200: - search_results = await resp.json() - results = search_results.get("results", []) - if results: - # Step 1: Find best match (prefer exact match) - exact_match = None - for result in results: - result_seq = result.get("sequence", "") - if result_seq == seq: - exact_match = result - break - - # Use exact match if found, otherwise use first result - target_result = exact_match if exact_match else results[0] - rna_id = target_result.get("rnacentral_id") - - if rna_id: - # Step 2: Unified call to get_by_rna_id() for complete information - result = await self.get_by_rna_id(rna_id) - - # Step 3: If get_by_rna_id() failed, use search result data as fallback - if not result: - logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) - result = self._rna_data_to_dict(rna_id, target_result) - - return result - logger.info("No results found for sequence.") - return None - error_text = await resp.text() - logger.error("HTTP %d error for sequence search: %s", resp.status, error_text[:200]) - raise Exception(f"HTTP {resp.status}: {error_text}") + if not result: + result = await self.get_best_hit(accession) + + # Fall back to RNAcentral API if local BLAST didn't find result + if not result: + logger.debug("Falling back to RNAcentral API.") + async with aiohttp.ClientSession() as session: + search_url = f"{self.base_url}/rna" + params = {"sequence": seq, "format": "json"} + async with session.get( + search_url, + params=params, + headers=self.headers, + timeout=aiohttp.ClientTimeout(total=60), # Sequence search may take longer + ) as resp: + if resp.status == 200: + search_results = await resp.json() + results = search_results.get("results", []) + result = await self._process_api_search_results(results, seq) + else: + error_text = await resp.text() + logger.error("HTTP %d error for sequence search: %s", resp.status, error_text[:200]) + raise Exception(f"HTTP {resp.status}: {error_text}") except aiohttp.ClientError as e: 
logger.error("Network error searching for sequence: %s", e) - return None except Exception as e: # pylint: disable=broad-except logger.error("Sequence search failed: %s", e) - return None + return result @retry( stop=stop_after_attempt(3), From b48930af059bbfbd3f307a892800de05f74aa1d1 Mon Sep 17 00:00:00 2001 From: chenzihong <522023320011@smail.nju.edu.cn> Date: Sun, 30 Nov 2025 17:44:50 +0800 Subject: [PATCH 13/22] perf: optimize code style and search efficiency --- graphgen/models/searcher/db/ncbi_searcher.py | 764 ++++++------------- 1 file changed, 223 insertions(+), 541 deletions(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 4558f75a..946e3c1f 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -1,16 +1,15 @@ import asyncio -import logging import os import re import subprocess import tempfile -import time from concurrent.futures import ThreadPoolExecutor from functools import lru_cache from http.client import IncompleteRead from typing import Dict, Optional +from graphgen.models.searcher.limitter import RateLimiter -from Bio import Entrez +from Bio import Entrez, SeqIO from Bio.Blast import NCBIWWW, NCBIXML from requests.exceptions import RequestException from tenacity import ( @@ -18,7 +17,6 @@ retry_if_exception_type, stop_after_attempt, wait_exponential, - before_sleep_log, ) from graphgen.bases import BaseSearcher @@ -43,613 +41,317 @@ class NCBISearch(BaseSearcher): def __init__( self, - email: str = "test@example.com", - tool: str = "GraphGen", use_local_blast: bool = False, local_blast_db: str = "nt_db", + email: str = "email@example.com", + api_key: str = "", ): + """ + Initialize the NCBI Search client. + + Args: + use_local_blast (bool): Whether to use local BLAST database. + local_blast_db (str): Path to the local BLAST database. + email (str): Email address for NCBI API requests. + api_key (str): API key for NCBI API requests, see https://account.ncbi.nlm.nih.gov/settings/. + """ super().__init__() - Entrez.email = email - Entrez.tool = tool Entrez.timeout = 60 # 60 seconds timeout + Entrez.email = email + if api_key: + Entrez.api_key = api_key + Entrez.max_tries = 10 if api_key else 3 + Entrez.sleep_between_tries = 5 self.use_local_blast = use_local_blast self.local_blast_db = local_blast_db if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"): logger.error("Local BLAST database files not found. 
Please check the path.") self.use_local_blast = False + self.rate_limiter = RateLimiter() @staticmethod - def _safe_get(obj, key, default=None): - """Safely get value from dict or StringElement-like object.""" - if isinstance(obj, dict): - return obj.get(key, default) - if hasattr(obj, "get"): - return obj.get(key, default) - if hasattr(obj, key): - return getattr(obj, key, default) - return default - - @staticmethod - def _extract_gene_ref(entrezgene_gene): - """Extract gene_ref from entrezgene_gene.""" - if isinstance(entrezgene_gene, dict): - return entrezgene_gene.get("Gene-ref", {}) - if hasattr(entrezgene_gene, "get"): - return entrezgene_gene.get("Gene-ref", {}) - try: - if hasattr(entrezgene_gene, "Gene-ref"): - return getattr(entrezgene_gene, "Gene-ref", {}) - except Exception: - pass - return {} - - @staticmethod - def _extract_organism(entrezgene_source): - """Extract organism from entrezgene_source.""" - try: - biosource = NCBISearch._safe_get(entrezgene_source, "BioSource", {}) - biosource_org = NCBISearch._safe_get(biosource, "BioSource_org", {}) - org_ref = NCBISearch._safe_get(biosource_org, "Org-ref", {}) - return NCBISearch._safe_get(org_ref, "Org-ref_taxname", "N/A") - except Exception as e: - logger.debug("Error extracting organism: %s", e) - return "N/A" - - @staticmethod - def _extract_synonyms(gene_ref): - """Extract gene synonyms from gene_ref.""" - gene_synonyms = [] - try: - gene_syn = gene_ref.get("Gene-ref_syn", []) if isinstance(gene_ref, dict) else [] - if isinstance(gene_syn, list): - for syn in gene_syn: - if isinstance(syn, dict): - gene_synonyms.append(syn.get("Gene-ref_syn_E", "N/A")) - elif isinstance(syn, str): - gene_synonyms.append(syn) - else: - gene_synonyms.append(str(syn)) - elif isinstance(gene_syn, str): - gene_synonyms.append(gene_syn) - elif gene_syn: - gene_synonyms.append(str(gene_syn)) - except Exception as e: - logger.debug("Error extracting gene synonyms: %s", e) - return gene_synonyms - - @staticmethod - def _extract_gene_type(gene_data): - """Extract gene type from gene_data.""" - try: - gene_type_data = gene_data.get("Entrezgene_type") - if not gene_type_data: - return None - type_value = str(gene_type_data) - type_mapping = { - "1": "protein-coding", - "2": "pseudo", - "3": "rRNA", - "4": "tRNA", - "5": "snRNA", - "6": "ncRNA", - "7": "other", - } - return type_mapping.get(type_value, f"type_{type_value}") - except Exception as e: - logger.debug("Error extracting gene type: %s", e) - return None - - @staticmethod - def _extract_chromosome(first_locus): - """Extract chromosome from first_locus.""" - label = NCBISearch._safe_get(first_locus, "Gene-commentary_label", "") - if not label or "Chromosome" not in str(label): - return None - match = re.search(r'Chromosome\s+(\S+)', str(label)) - return match.group(1) if match else None - - @staticmethod - def _extract_genomic_location(first_locus): - """Extract genomic location from first_locus.""" - seqs = NCBISearch._safe_get(first_locus, "Gene-commentary_seqs", []) - if not seqs or not isinstance(seqs, list) or not seqs: - return None - first_seq = seqs[0] - if not isinstance(first_seq, dict): - return None - seq_loc_int = NCBISearch._safe_get(first_seq, "Seq-loc_int", {}) - if not seq_loc_int: - return None - seq_interval = NCBISearch._safe_get(seq_loc_int, "Seq-interval", {}) - if not seq_interval: - return None - seq_from = NCBISearch._safe_get(seq_interval, "Seq-interval_from", "") - seq_to = NCBISearch._safe_get(seq_interval, "Seq-interval_to", "") - if seq_from and seq_to: - return 
f"{seq_from}-{seq_to}" - return None - - @staticmethod - def _extract_location_info(locus_data): - """Extract chromosome and genomic location from locus data.""" - if not locus_data or not isinstance(locus_data, list) or not locus_data: - return None, None - first_locus = locus_data[0] - if not isinstance(first_locus, dict): - return None, None - chromosome = NCBISearch._extract_chromosome(first_locus) - genomic_location = NCBISearch._extract_genomic_location(first_locus) - return chromosome, genomic_location - - @staticmethod - def _extract_function_info(gene_data): - """Extract gene functional description.""" - try: - summary = gene_data.get("Entrezgene_summary") - if summary: - return str(summary) - comments_data = gene_data.get("Entrezgene_comments") - if not comments_data or not isinstance(comments_data, list): - return None - for comment in comments_data: - if not isinstance(comment, dict): - continue - heading = NCBISearch._safe_get(comment, "Gene-commentary_heading", "") - heading_lower = str(heading).lower() - if "function" not in heading_lower and "summary" not in heading_lower: - continue - comment_text = NCBISearch._safe_get(comment, "Gene-commentary_comment", "") - if comment_text: - return str(comment_text) - return None - except Exception as e: - logger.debug("Error extracting function: %s", e) - return None - - @staticmethod - def _extract_accession(locus_data): - """Extract representative mRNA accession from locus data.""" - if not locus_data or not isinstance(locus_data, list) or not locus_data: - return None - first_locus = locus_data[0] - if not isinstance(first_locus, dict): - return None - products = NCBISearch._safe_get(first_locus, "Gene-commentary_products", []) - if not products or not isinstance(products, list): - return None - representative_accession = None - for product in products: - if not isinstance(product, dict): - continue - product_type = NCBISearch._safe_get(product, "Gene-commentary_type", "") - product_type_str = str(product_type) - if product_type_str == "3" or (not representative_accession and product_type_str): - accession = NCBISearch._safe_get(product, "Gene-commentary_accession", "") - if accession: - representative_accession = str(accession) - if product_type_str == "3": - break - return representative_accession + def _nested_get(data: dict, *keys, default=None): + """Safely traverse nested dictionaries.""" + for key in keys: + if not isinstance(data, dict): + return default + data = data.get(key, default) + return data @staticmethod def _gene_record_to_dict(gene_record, gene_id: str) -> dict: """ Convert an Entrez gene record to a dictionary. - :param gene_record: The Entrez gene record (list from Entrez.read). - :param gene_id: The gene ID. - :return: A dictionary containing gene information. + All extraction logic is inlined for maximum clarity and performance. 
""" if not gene_record: raise ValueError("Empty gene record") - gene_data = gene_record[0] - locus_data = gene_data.get("Entrezgene_locus") + data = gene_record[0] + locus = (data.get("Entrezgene_locus") or [{}])[0] - # Extract information using helper methods - entrezgene_gene = gene_data.get("Entrezgene_gene") - gene_ref = NCBISearch._extract_gene_ref(entrezgene_gene) - organism = NCBISearch._extract_organism(gene_data.get("Entrezgene_source")) - gene_synonyms = NCBISearch._extract_synonyms(gene_ref) - gene_type = NCBISearch._extract_gene_type(gene_data) - chromosome, genomic_location = NCBISearch._extract_location_info(locus_data) - function = NCBISearch._extract_function_info(gene_data) - representative_accession = NCBISearch._extract_accession(locus_data) + # Extract common nested paths once + gene_ref = NCBISearch._nested_get(data, "Entrezgene_gene", "Gene-ref", default={}) + biosource = NCBISearch._nested_get(data, "Entrezgene_source", "BioSource", default={}) + + # Process synonyms + synonyms_raw = gene_ref.get("Gene-ref_syn", []) + gene_synonyms = [] + if isinstance(synonyms_raw, list): + for syn in synonyms_raw: + gene_synonyms.append(syn.get("Gene-ref_syn_E") if isinstance(syn, dict) else str(syn)) + elif synonyms_raw: + gene_synonyms.append(str(synonyms_raw)) + + # Extract location info + label = locus.get("Gene-commentary_label", "") + chromosome_match = re.search(r"Chromosome\s+(\S+)", str(label)) if label else None + + seq_interval = NCBISearch._nested_get( + locus, "Gene-commentary_seqs", 0, "Seq-loc_int", "Seq-interval", default={} + ) + genomic_location = ( + f"{seq_interval.get('Seq-interval_from')}-{seq_interval.get('Seq-interval_to')}" + if seq_interval.get('Seq-interval_from') and seq_interval.get('Seq-interval_to') + else None + ) + + # Extract representative accession + representative_accession = next( + ( + product.get("Gene-commentary_accession") + for product in locus.get("Gene-commentary_products", []) + if product.get("Gene-commentary_type") == "3" + ), + None, + ) + + # Extract function + function = data.get("Entrezgene_summary") or next( + ( + comment.get("Gene-commentary_comment") + for comment in data.get("Entrezgene_comments", []) + if isinstance(comment, dict) + and "function" in str(comment.get("Gene-commentary_heading", "")).lower() + ), + None, + ) - # Build result dictionary with all fields return { "molecule_type": "DNA", "database": "NCBI", "id": gene_id, - "gene_name": NCBISearch._safe_get(gene_ref, "Gene-ref_locus", "N/A"), - "gene_description": NCBISearch._safe_get(gene_ref, "Gene-ref_desc", "N/A"), - "organism": organism, + "gene_name": gene_ref.get("Gene-ref_locus", "N/A"), + "gene_description": gene_ref.get("Gene-ref_desc", "N/A"), + "organism": NCBISearch._nested_get( + biosource, "BioSource_org", "Org-ref", "Org-ref_taxname", default="N/A" + ), "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}", - "gene_synonyms": gene_synonyms if gene_synonyms else None, - "gene_type": gene_type, - "chromosome": chromosome, + "gene_synonyms": gene_synonyms or None, + "gene_type": { + "1": "protein-coding", + "2": "pseudo", + "3": "rRNA", + "4": "tRNA", + "5": "snRNA", + "6": "ncRNA", + "7": "other", + }.get(str(data.get("Entrezgene_type")), f"type_{data.get('Entrezgene_type')}"), + "chromosome": chromosome_match.group(1) if chromosome_match else None, "genomic_location": genomic_location, "function": function, - # Fields from accession-based queries (set to None initially, may be filled later) + # Fields from accession-based queries "title": None, 
"sequence": None, "sequence_length": None, - "gene_id": gene_id, # For consistency with accession queries + "gene_id": gene_id, "molecule_type_detail": None, "_representative_accession": representative_accession, } - def _fetch_sequence(self, accession: str): - """Fetch sequence from nuccore database using efetch.""" - time.sleep(0.35) # Comply with rate limit - seq_handle = Entrez.efetch( - db="nuccore", - id=accession, - rettype="fasta", - retmode="text", - ) - try: - sequence_data = seq_handle.read() - if not sequence_data: - return None, None - seq_lines = sequence_data.strip().split("\n") - header = seq_lines[0] if seq_lines else "" - sequence = "".join(seq_lines[1:]) - return sequence, header - finally: - seq_handle.close() - - def _fetch_summary(self, accession: str, default_header: str = ""): - """Fetch summary from nuccore database using esummary.""" - time.sleep(0.35) # Comply with rate limit - summary_handle = Entrez.esummary(db="nuccore", id=accession) - try: - summary = Entrez.read(summary_handle) - if not summary: - return None - summary_data = summary[0] - - # Determine molecule type detail - molecule_type_detail = "N/A" - if accession.startswith("NM_") or accession.startswith("XM_"): - molecule_type_detail = "mRNA" - elif accession.startswith("NC_") or accession.startswith("NT_"): - molecule_type_detail = "genomic DNA" - elif accession.startswith("NR_") or accession.startswith("XR_"): - molecule_type_detail = "RNA" - elif accession.startswith("NG_"): - molecule_type_detail = "genomic region" - - title = summary_data.get("Title", default_header) - chromosome = summary_data.get("ChrLoc") or summary_data.get("ChrAccVer") - chr_start = summary_data.get("ChrStart") - chr_stop = summary_data.get("ChrStop") - genomic_location = None - if chr_start and chr_stop: - genomic_location = f"{chr_start}-{chr_stop}" - - return { - "title": title, - "molecule_type_detail": molecule_type_detail, - "chromosome": chromosome, - "genomic_location": genomic_location, - } - finally: - summary_handle.close() - - def _extract_gene_id(self, link_handle): - """Extract GeneID from elink results.""" - try: - links = Entrez.read(link_handle) - if not links or len(links) == 0: - return None + def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]: + """Get gene information by Gene ID.""" + def _extract_from_genbank(result: dict, accession: str): + """Enrich result dictionary with sequence and summary information from accession.""" + with Entrez.efetch(db="nuccore", id=accession, rettype="gb", retmode="text") as handle: + record = SeqIO.read(handle, "genbank") + result["sequence"] = str(record.seq) + result["sequence_length"] = len(record.seq) + result["title"] = record.description + result["molecule_type_detail"] = ( + "mRNA" if accession.startswith(("NM_", "XM_")) else + "genomic DNA" if accession.startswith(("NC_", "NT_")) else + "RNA" if accession.startswith(("NR_", "XR_")) else + "genomic region" if accession.startswith("NG_") else "N/A" + ) - first_link = links[0] - if "LinkSetDb" not in first_link: - return None + for feature in record.features: + if feature.type == "source": + if 'chromosome' in feature.qualifiers: + result["chromosome"] = feature.qualifiers['chromosome'][0] - for link_set in first_link["LinkSetDb"]: - if link_set.get("DbTo") != "gene": - continue + if feature.location: + start = int(feature.location.start) + 1 + end = int(feature.location.end) + result["genomic_location"] = f"{start}-{end}" - # Try Link structure first (most common) - 
links_in_set = link_set.get("Link", []) - if links_in_set and len(links_in_set) > 0: - first_link_item = links_in_set[0] - if isinstance(first_link_item, dict): - gene_id = str(first_link_item.get("Id", "")) - elif hasattr(first_link_item, "Id"): - gene_id = str(getattr(first_link_item, "Id", "")) - else: - gene_id = str(first_link_item) - if gene_id: - return gene_id - - # Fallback: Try IdList (if Link is not available) - id_list = link_set.get("IdList", []) - if id_list: - return str(id_list[0]) + break - return None - except Exception as e: - logger.error("Error parsing elink result: %s", e) - import traceback - logger.debug(traceback.format_exc()) - return None + if not result.get("organism") and 'organism' in record.annotations: + result["organism"] = record.annotations['organism'] - def _extract_sequence(self, result: dict, accession: str): - """Enrich result dictionary with sequence and summary information from accession.""" - try: - sequence, header = self._fetch_sequence(accession) - if sequence: - result["sequence"] = sequence - result["sequence_length"] = len(sequence) - - summary_info = self._fetch_summary(accession, header or "") - if not summary_info: - return - - result["title"] = summary_info.get("title") - result["molecule_type_detail"] = summary_info.get("molecule_type_detail") - # Update chromosome and genomic_location if not already set - if not result.get("chromosome") and summary_info.get("chromosome"): - result["chromosome"] = summary_info["chromosome"] - if not result.get("genomic_location") and summary_info.get("genomic_location"): - result["genomic_location"] = summary_info["genomic_location"] - except (RequestException, IncompleteRead): - raise - except Exception as e: - logger.debug("Failed to get sequence for accession %s: %s", accession, e) + return result - @retry( - stop=stop_after_attempt(5), - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_if_exception_type((RequestException, IncompleteRead)), - reraise=True, - before_sleep=before_sleep_log(logger, logging.WARNING), - ) - def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]: - """ - Get gene information by Gene ID. - This is the unified data source - all search methods eventually call this. - :param gene_id: NCBI Gene ID. - :param preferred_accession: Optional accession to use for sequence retrieval. - :return: A dictionary containing gene information or None if not found. 
- """ try: - time.sleep(0.35) # Comply with rate limit (max 3 requests per second) - handle = Entrez.efetch(db="gene", id=gene_id, retmode="xml") - try: + with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle: gene_record = Entrez.read(handle) if not gene_record: return None - result = self._gene_record_to_dict(gene_record, gene_id) - # Try to get sequence from accession - accession_to_use = preferred_accession or result.get("_representative_accession") - if accession_to_use: - self._extract_sequence(result, accession_to_use) + result = self._gene_record_to_dict(gene_record, gene_id) + if accession := (preferred_accession or result.get("_representative_accession")): + result = _extract_from_genbank(result, accession) - # Remove internal field result.pop("_representative_accession", None) return result - finally: - handle.close() - except RequestException: - raise - except IncompleteRead: + except (RequestException, IncompleteRead): raise - except Exception as exc: # pylint: disable=broad-except + except Exception as exc: logger.error("Gene ID %s not found: %s", gene_id, exc) return None def get_by_accession(self, accession: str) -> Optional[dict]: - """ - Get sequence information by accession number. - Unified approach: Get GeneID from accession, then call get_by_gene_id() for complete information. - :param accession: NCBI accession number (e.g., NM_000546). - :return: A dictionary containing complete gene information or None if not found. - """ + """Get sequence information by accession number.""" + def _extract_gene_id(link_handle): + """Extract GeneID from elink results.""" + links = Entrez.read(link_handle) + if not links or "LinkSetDb" not in links[0]: + return None + + for link_set in links[0]["LinkSetDb"]: + if link_set.get("DbTo") != "gene": + continue + + link = (link_set.get("Link") or link_set.get("IdList", [{}]))[0] + return str(link.get("Id") if isinstance(link, dict) else link) + try: - # Step 1: Get GeneID from elink (nuccore -> gene) - # Note: esummary for nuccore doesn't include GeneID, so we use elink instead - time.sleep(0.35) - link_handle = Entrez.elink(dbfrom="nuccore", db="gene", id=accession) - try: - gene_id = self._extract_gene_id(link_handle) - finally: - link_handle.close() - - # Step 2: If we have a GeneID, get complete information from Gene database - if gene_id: - result = self.get_by_gene_id(gene_id, preferred_accession=accession) - if result: - result["id"] = accession - result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}" - return result + with Entrez.elink(dbfrom="nuccore", db="gene", id=accession) as link_handle: + gene_id = _extract_gene_id(link_handle) - # Step 3: If no GeneID, this is a rare case (accession without associated gene) - logger.warning( - "Accession %s has no associated GeneID, cannot provide complete information", - accession - ) - return None + if not gene_id: + logger.warning("Accession %s has no associated GeneID", accession) + return None + + result = self.get_by_gene_id(gene_id, preferred_accession=accession) + if result: + result["id"] = accession + result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}" + return result except (RequestException, IncompleteRead): raise - except Exception as exc: # pylint: disable=broad-except + except Exception as exc: logger.error("Accession %s not found: %s", accession, exc) return None - @retry( - stop=stop_after_attempt(5), - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_if_exception_type((RequestException, IncompleteRead)), - 
reraise=True, - before_sleep=before_sleep_log(logger, logging.WARNING), - ) def get_best_hit(self, keyword: str) -> Optional[dict]: - """ - Search NCBI Gene database with a keyword and return the best hit. - :param keyword: The search keyword (e.g., gene name). - :return: A dictionary containing the best hit information or None if not found. - """ + """Search NCBI Gene database with a keyword and return the best hit.""" if not keyword.strip(): return None try: - time.sleep(0.35) # Comply with rate limit - # Search gene database - search_handle = Entrez.esearch( - db="gene", - term=f"{keyword}[Gene Name] OR {keyword}[All Fields]", - retmax=1, - ) - try: - search_results = Entrez.read(search_handle) - if not search_results.get("IdList"): - # If not found, try a broader search - time.sleep(0.35) - search_handle2 = Entrez.esearch( - db="gene", - term=keyword, - retmax=1, - ) - try: - search_results = Entrez.read(search_handle2) - finally: - search_handle2.close() - - if search_results.get("IdList"): - gene_id = search_results["IdList"][0] - return self.get_by_gene_id(gene_id) - finally: - search_handle.close() - except RequestException: - raise - except IncompleteRead: + for search_term in [f"{keyword}[Gene Name] OR {keyword}[All Fields]", keyword]: + with Entrez.esearch(db="gene", term=search_term, retmax=1) as search_handle: + if search_results := Entrez.read(search_handle): + if gene_id := search_results["IdList"][0]: + return self.get_by_gene_id(gene_id) + self.rate_limiter.wait() + except (RequestException, IncompleteRead): raise - except Exception as e: # pylint: disable=broad-except + except Exception as e: logger.error("Keyword %s not found: %s", keyword, e) return None def _local_blast(self, seq: str, threshold: float) -> Optional[str]: - """ - Perform local BLAST search using local BLAST database. - :param seq: The DNA sequence. - :param threshold: E-value threshold for BLAST search. - :return: The accession number of the best hit or None if not found. 
- """ + """Perform local BLAST search using local BLAST database.""" try: - with tempfile.NamedTemporaryFile( - mode="w+", suffix=".fa", delete=False - ) as tmp: + with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp: tmp.write(f">query\n{seq}\n") tmp_name = tmp.name cmd = [ - "blastn", - "-db", - self.local_blast_db, - "-query", - tmp_name, - "-evalue", - str(threshold), - "-max_target_seqs", - "1", - "-outfmt", - "6 sacc", # only return accession + "blastn", "-db", self.local_blast_db, "-query", tmp_name, + "-evalue", str(threshold), "-max_target_seqs", "1", "-outfmt", "6 sacc" ] logger.debug("Running local blastn: %s", " ".join(cmd)) out = subprocess.check_output(cmd, text=True).strip() os.remove(tmp_name) - if out: - return out.split("\n", maxsplit=1)[0] - return None - except Exception as exc: # pylint: disable=broad-except + return out.split("\n", maxsplit=1)[0] if out else None + except Exception as exc: logger.error("Local blastn failed: %s", exc) return None - def _extract_and_normalize_sequence(self, sequence: str) -> Optional[str]: - """Extract and normalize DNA sequence from input.""" - if sequence.startswith(">"): - seq_lines = sequence.strip().split("\n") - seq = "".join(seq_lines[1:]) - else: - seq = sequence.strip().replace(" ", "").replace("\n", "") - return seq if seq and re.fullmatch(r"[ATCGN\s]+", seq, re.I) else None + def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: + """Search NCBI with a DNA sequence using BLAST.""" - def _process_network_blast_result(self, blast_record, seq: str, threshold: float) -> Optional[dict]: - """Process network BLAST result and return dictionary or None.""" - if not blast_record.alignments: - logger.info("No BLAST hits found for the given sequence.") - return None + def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: + """Extract and normalize DNA sequence from input.""" + if sequence.startswith(">"): + seq = "".join(sequence.strip().split("\n")[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + return seq if re.fullmatch(r"[ATCGN]+", seq, re.I) else None - best_alignment = blast_record.alignments[0] - best_hsp = best_alignment.hsps[0] - if best_hsp.expect > threshold: - logger.info("No BLAST hits below the threshold E-value.") - return None - hit_id = best_alignment.hit_id - accession_match = re.search(r"ref\|([^|]+)", hit_id) - if accession_match: - accession = accession_match.group(1).split(".")[0] - return self.get_by_accession(accession) + def _process_network_blast_result(blast_record, seq: str, threshold: float) -> Optional[dict]: + """Process network BLAST result and return dictionary or None.""" + if not blast_record.alignments: + logger.info("No BLAST hits found for the given sequence.") + return None - # If unable to extract accession, return basic information - return { - "molecule_type": "DNA", - "database": "NCBI", - "id": hit_id, - "title": best_alignment.title, - "sequence_length": len(seq), - "e_value": best_hsp.expect, - "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0, - "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}", - } + best_alignment = blast_record.alignments[0] + best_hsp = best_alignment.hsps[0] + if best_hsp.expect > threshold: + logger.info("No BLAST hits below the threshold E-value.") + return None + + hit_id = best_alignment.hit_id + if accession_match := re.search(r"ref\|([^|]+)", hit_id): + return 
self.get_by_accession(accession_match.group(1).split(".")[0]) + + # If unable to extract accession, return basic information + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": hit_id, + "title": best_alignment.title, + "sequence_length": len(seq), + "e_value": best_hsp.expect, + "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0, + "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}", + } - def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: - """ - Search NCBI with a DNA sequence using BLAST. - Tries local BLAST first if enabled, falls back to network BLAST. - :param sequence: DNA sequence (FASTA format or raw sequence). - :param threshold: E-value threshold for BLAST search. - :return: A dictionary containing the best hit information or None if not found. - """ - result = None try: - seq = self._extract_and_normalize_sequence(sequence) - if not seq: + if not (seq := _extract_and_normalize_sequence(sequence)): logger.error("Empty or invalid DNA sequence provided.") return None # Try local BLAST first if enabled - if self.use_local_blast: - accession = self._local_blast(seq, threshold) - if accession: - logger.debug("Local BLAST found accession: %s", accession) - result = self.get_by_accession(accession) - - # Fall back to network BLAST if local BLAST didn't find result - if not result: - logger.debug("Falling back to NCBIWWW.qblast.") - logger.debug("Performing BLAST search for DNA sequence...") - time.sleep(0.35) - - result_handle = NCBIWWW.qblast( - program="blastn", - database="nr", - sequence=seq, - hitlist_size=1, - expect=threshold, - ) - blast_record = NCBIXML.read(result_handle) - result = self._process_network_blast_result(blast_record, seq, threshold) - except RequestException: + if self.use_local_blast and (accession := self._local_blast(seq, threshold)): + logger.debug("Local BLAST found accession: %s", accession) + return self.get_by_accession(accession) + + # Fall back to network BLAST + logger.debug("Falling back to NCBIWWW.qblast") + + with NCBIWWW.qblast("blastn", "nr", seq, hitlist_size=1, expect=threshold) as result_handle: + return _process_network_blast_result(NCBIXML.read(result_handle), seq, threshold) + except (RequestException, IncompleteRead): raise - except Exception as e: # pylint: disable=broad-except + except Exception as e: logger.error("BLAST search failed: %s", e) - return result + return None @retry( stop=stop_after_attempt(5), @@ -657,46 +359,26 @@ def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional retry=retry_if_exception_type((RequestException, IncompleteRead)), reraise=True, ) - async def search( - self, query: str, threshold: float = 0.01, **kwargs - ) -> Optional[Dict]: - """ - Search NCBI with either a gene ID, accession number, keyword, or DNA sequence. - :param query: The search query (gene ID, accession, keyword, or DNA sequence). - :param threshold: E-value threshold for BLAST search. - :param kwargs: Additional keyword arguments (not used currently). - :return: A dictionary containing the search results or None if not found. 
- """ - # auto detect query type + async def search(self, query: str, threshold: float = 0.01, **kwargs) -> Optional[Dict]: + """Search NCBI with either a gene ID, accession number, keyword, or DNA sequence.""" if not query or not isinstance(query, str): logger.error("Empty or non-string input.") return None - query = query.strip() + query = query.strip() logger.debug("NCBI search query: %s", query) loop = asyncio.get_running_loop() - # check if DNA sequence (ATCG characters) + # Auto-detect query type and execute in thread pool if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I): - result = await loop.run_in_executor( - _get_pool(), self.search_by_sequence, query, threshold - ) - # check if gene ID (numeric) + result = await loop.run_in_executor(_get_pool(), self.search_by_sequence, query, threshold) elif re.fullmatch(r"^\d+$", query): - result = await loop.run_in_executor( - _get_pool(), self.get_by_gene_id, query - ) - # check if accession number (e.g., NM_000546, NC_000001) + result = await loop.run_in_executor(_get_pool(), self.get_by_gene_id, query) elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I): - result = await loop.run_in_executor( - _get_pool(), self.get_by_accession, query - ) + result = await loop.run_in_executor(_get_pool(), self.get_by_accession, query) else: - # otherwise treat as keyword - result = await loop.run_in_executor( - _get_pool(), self.get_best_hit, query - ) + result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) if result: result["_search_query"] = query From bb84c0b571dc979e9c93535b030e5fa2b9f22583 Mon Sep 17 00:00:00 2001 From: chenzihong <522023320011@smail.nju.edu.cn> Date: Sun, 30 Nov 2025 17:52:28 +0800 Subject: [PATCH 14/22] fix: fix import error --- graphgen/models/searcher/db/ncbi_searcher.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 946e3c1f..1a2dd7b5 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -7,7 +7,6 @@ from functools import lru_cache from http.client import IncompleteRead from typing import Dict, Optional -from graphgen.models.searcher.limitter import RateLimiter from Bio import Entrez, SeqIO from Bio.Blast import NCBIWWW, NCBIXML @@ -67,7 +66,6 @@ def __init__( if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"): logger.error("Local BLAST database files not found. 
Please check the path.") self.use_local_blast = False - self.rate_limiter = RateLimiter() @staticmethod def _nested_get(data: dict, *keys, default=None): From 58ef1ec35797a72367e013cc5a9c8daac302632b Mon Sep 17 00:00:00 2001 From: chenzihong <522023320011@smail.nju.edu.cn> Date: Sun, 30 Nov 2025 18:07:27 +0800 Subject: [PATCH 15/22] fix: delete rate_limiter --- graphgen/models/searcher/db/ncbi_searcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 1a2dd7b5..12da3098 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -264,7 +264,6 @@ def get_best_hit(self, keyword: str) -> Optional[dict]: if search_results := Entrez.read(search_handle): if gene_id := search_results["IdList"][0]: return self.get_by_gene_id(gene_id) - self.rate_limiter.wait() except (RequestException, IncompleteRead): raise except Exception as e: From d767096741ab09d095ca39c2c7534c62f03b292f Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Mon, 1 Dec 2025 01:52:52 +0800 Subject: [PATCH 16/22] perf: simplify RNA searcher and align with DNA searcher logic --- .../models/searcher/db/rnacentral_searcher.py | 370 ++++++------------ 1 file changed, 118 insertions(+), 252 deletions(-) diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index 5950a3e7..6a3e2a28 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -74,129 +74,76 @@ async def _fetch_all_xrefs(self, xrefs_url: str, session: aiohttp.ClientSession) @staticmethod def _extract_info_from_xrefs(xrefs: List[Dict]) -> Dict[str, Any]: - """ - Extract information from xrefs data. - :param xrefs: List of xref entries. - :return: Dictionary with extracted information. 
- """ - extracted = { - "organisms": set(), - "gene_names": set(), - "modifications": [], - "so_terms": set(), - "xrefs_list": [], - } + """Extract information from xrefs data.""" + organisms = set() + gene_names = set() + modifications = [] + so_terms = set() + xrefs_list = [] for xref in xrefs: - # Extract accession information accession = xref.get("accession", {}) - - # Extract species information species = accession.get("species") if species: - extracted["organisms"].add(species) + organisms.add(species) - # Extract gene name gene = accession.get("gene") - if gene and gene.strip(): # Only add non-empty genes - extracted["gene_names"].add(gene.strip()) + if gene and gene.strip(): + gene_names.add(gene.strip()) - # Extract modifications - modifications = xref.get("modifications", []) - if modifications: - extracted["modifications"].extend(modifications) + if mods := xref.get("modifications", []): + modifications.extend(mods) - # Extract SO term (biotype) - biotype = accession.get("biotype") - if biotype: - extracted["so_terms"].add(biotype) + if biotype := accession.get("biotype"): + so_terms.add(biotype) - # Build xrefs list - xref_info = { + xrefs_list.append({ "database": xref.get("database"), "accession_id": accession.get("id"), "external_id": accession.get("external_id"), "description": accession.get("description"), "species": species, "gene": gene, - } - extracted["xrefs_list"].append(xref_info) + }) + + def _format_set(s): + """Format set to single value or comma-separated string.""" + if not s: + return None + return list(s)[0] if len(s) == 1 else ", ".join(s) - # Convert sets to appropriate formats return { - "organism": ( - list(extracted["organisms"])[0] - if len(extracted["organisms"]) == 1 - else (", ".join(extracted["organisms"]) if extracted["organisms"] else None) - ), - "gene_name": ( - list(extracted["gene_names"])[0] - if len(extracted["gene_names"]) == 1 - else (", ".join(extracted["gene_names"]) if extracted["gene_names"] else None) - ), - "related_genes": list(extracted["gene_names"]) if extracted["gene_names"] else None, - "modifications": extracted["modifications"] if extracted["modifications"] else None, - "so_term": ( - list(extracted["so_terms"])[0] - if len(extracted["so_terms"]) == 1 - else (", ".join(extracted["so_terms"]) if extracted["so_terms"] else None) - ), - "xrefs": extracted["xrefs_list"] if extracted["xrefs_list"] else None, + "organism": _format_set(organisms), + "gene_name": _format_set(gene_names), + "related_genes": list(gene_names) if gene_names else None, + "modifications": modifications if modifications else None, + "so_term": _format_set(so_terms), + "xrefs": xrefs_list if xrefs_list else None, } @staticmethod def _rna_data_to_dict(rna_id: str, rna_data: dict, xrefs_data: Optional[List[Dict]] = None) -> dict: - """ - Convert RNAcentral API response to a dictionary. - :param rna_id: RNAcentral ID. - :param rna_data: API response data (dict or dict-like from search results). - :param xrefs_data: Optional list of xref entries fetched from xrefs endpoint. - :return: A dictionary containing RNA information. 
- """ + """Convert RNAcentral API response to a dictionary.""" sequence = rna_data.get("sequence", "") + extracted_info = RNACentralSearch._extract_info_from_xrefs(xrefs_data) if xrefs_data else {} + + # Helper to get value with fallbacks + def _get_with_fallbacks(key, *fallback_keys): + if key in extracted_info and extracted_info[key]: + return extracted_info[key] + for fk in fallback_keys: + if value := rna_data.get(fk): + return value + return None - # Initialize extracted info from xrefs if available - extracted_info = {} - if xrefs_data: - extracted_info = RNACentralSearch._extract_info_from_xrefs(xrefs_data) - - # Extract organism information (prefer from xrefs, fallback to main data) - organism = extracted_info.get("organism") - if not organism: - organism = rna_data.get("organism", None) - if not organism: - organism = rna_data.get("species", None) - - # Extract related genes (prefer from xrefs, fallback to main data) + # Extract related genes with special handling related_genes = extracted_info.get("related_genes") if not related_genes: - related_genes = rna_data.get("related_genes", []) - if not related_genes: - related_genes = rna_data.get("genes", []) - if not related_genes: - gene_name_temp = rna_data.get("gene_name", None) - if gene_name_temp: - related_genes = [gene_name_temp] - - # Extract gene name (prefer from xrefs, fallback to main data) - gene_name = extracted_info.get("gene_name") - if not gene_name: - gene_name = rna_data.get("gene_name", None) - if not gene_name: - gene_name = rna_data.get("gene", None) - - # Extract so_term (prefer from xrefs, fallback to main data) - so_term = extracted_info.get("so_term") - if not so_term: - so_term = rna_data.get("so_term", None) - - # Extract modifications (prefer from xrefs, fallback to main data) - modifications = extracted_info.get("modifications") - if not modifications: - modifications = rna_data.get("modifications", None) - - # Build result dictionary (xrefs information is already extracted into other fields) - # information is extracted into organism, gene_name, so_term, modifications, etc. + related_genes = rna_data.get("related_genes") or rna_data.get("genes", []) + if not related_genes: + if gene_name_temp := rna_data.get("gene_name"): + related_genes = [gene_name_temp] + return { "molecule_type": "RNA", "database": "RNAcentral", @@ -207,11 +154,11 @@ def _rna_data_to_dict(rna_id: str, rna_data: dict, xrefs_data: Optional[List[Dic "rna_type": rna_data.get("rna_type", "N/A"), "description": rna_data.get("description", "N/A"), "url": f"https://rnacentral.org/rna/{rna_id}", - "organism": organism, + "organism": _get_with_fallbacks("organism", "organism", "species"), "related_genes": related_genes if related_genes else None, - "gene_name": gene_name, - "so_term": so_term, - "modifications": modifications, + "gene_name": _get_with_fallbacks("gene_name", "gene_name", "gene"), + "so_term": _get_with_fallbacks("so_term", "so_term"), + "modifications": _get_with_fallbacks("modifications", "modifications"), } async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: @@ -253,48 +200,37 @@ async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: return None async def get_best_hit(self, keyword: str) -> Optional[dict]: - """ - Search RNAcentral with a keyword and return the best hit. - Unified approach: Find RNA ID from search, then call get_by_rna_id() for complete information. - :param keyword: The search keyword (e.g., miRNA name, RNA name). 
- :return: A dictionary containing complete RNA information or None if not found. - """ + """Search RNAcentral with a keyword and return the best hit.""" if not keyword.strip(): return None try: async with aiohttp.ClientSession() as session: - search_url = f"{self.base_url}/rna" - params = {"search": keyword, "format": "json"} async with session.get( - search_url, - params=params, + f"{self.base_url}/rna", + params={"search": keyword, "format": "json"}, headers=self.headers, timeout=aiohttp.ClientTimeout(total=30), ) as resp: - if resp.status == 200: - search_results = await resp.json() - results = search_results.get("results", []) - if results: - # Step 1: Get RNA ID from search results - first_result = results[0] - rna_id = first_result.get("rnacentral_id") - - if rna_id: - # Step 2: Unified call to get_by_rna_id() for complete information - result = await self.get_by_rna_id(rna_id) + if resp.status != 200: + error_text = await resp.text() + logger.error("HTTP %d error for keyword %s: %s", resp.status, keyword, error_text[:200]) + raise Exception(f"HTTP {resp.status}: {error_text}") - # Step 3: If get_by_rna_id() failed, use search result data as fallback - if not result: - logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) - result = self._rna_data_to_dict(rna_id, first_result) - - return result + search_results = await resp.json() + if not (results := search_results.get("results", [])): logger.info("No results found for keyword: %s", keyword) return None - error_text = await resp.text() - logger.error("HTTP %d error for keyword %s: %s", resp.status, keyword, error_text[:200]) - raise Exception(f"HTTP {resp.status}: {error_text}") + + first_result = results[0] + if not (rna_id := first_result.get("rnacentral_id")): + return None + + result = await self.get_by_rna_id(rna_id) + if not result: + logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) + result = self._rna_data_to_dict(rna_id, first_result) + return result except aiohttp.ClientError as e: logger.error("Network error searching for keyword %s: %s", keyword, e) return None @@ -303,133 +239,77 @@ async def get_best_hit(self, keyword: str) -> Optional[dict]: return None def _local_blast(self, seq: str, threshold: float) -> Optional[str]: - """ - Perform local BLAST search using local BLAST database. - :param seq: The RNA sequence. - :param threshold: E-value threshold for BLAST search. - :return: The accession/ID of the best hit or None if not found. 
- """ + """Perform local BLAST search using local BLAST database.""" try: - with tempfile.NamedTemporaryFile( - mode="w+", suffix=".fa", delete=False - ) as tmp: + with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp: tmp.write(f">query\n{seq}\n") tmp_name = tmp.name cmd = [ - "blastn", - "-db", - self.local_blast_db, - "-query", - tmp_name, - "-evalue", - str(threshold), - "-max_target_seqs", - "1", - "-outfmt", - "6 sacc", # only return accession + "blastn", "-db", self.local_blast_db, "-query", tmp_name, + "-evalue", str(threshold), "-max_target_seqs", "1", "-outfmt", "6 sacc" ] logger.debug("Running local blastn for RNA: %s", " ".join(cmd)) out = subprocess.check_output(cmd, text=True).strip() os.remove(tmp_name) - if out: - return out.split("\n", maxsplit=1)[0] - return None - except Exception as exc: # pylint: disable=broad-except + return out.split("\n", maxsplit=1)[0] if out else None + except Exception as exc: logger.error("Local blastn failed: %s", exc) return None - @staticmethod - def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: - """Extract and normalize RNA sequence from input.""" - if sequence.startswith(">"): - seq_lines = sequence.strip().split("\n") - seq = "".join(seq_lines[1:]) - else: - seq = sequence.strip().replace(" ", "").replace("\n", "") - return seq if seq and re.fullmatch(r"[AUCGN\s]+", seq, re.I) else None - - def _find_best_match_from_results(self, results: List[Dict], seq: str) -> Optional[Dict]: - """Find best match from search results, preferring exact match.""" - exact_match = None - for result_item in results: - result_seq = result_item.get("sequence", "") - if result_seq == seq: - exact_match = result_item - break - return exact_match if exact_match else (results[0] if results else None) - - async def _process_api_search_results( - self, results: List[Dict], seq: str - ) -> Optional[dict]: - """Process API search results and return dictionary or None.""" - if not results: - logger.info("No results found for sequence.") - return None - - target_result = self._find_best_match_from_results(results, seq) - if not target_result: - return None - - rna_id = target_result.get("rnacentral_id") - if not rna_id: - return None - - # Try to get complete information - result = await self.get_by_rna_id(rna_id) - if not result: - logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) - result = self._rna_data_to_dict(rna_id, target_result) - return result - async def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: - """ - Search RNAcentral with an RNA sequence. - Tries local BLAST first if enabled, falls back to RNAcentral API. - Unified approach: Find RNA ID from sequence search, then call get_by_rna_id() for complete information. - :param sequence: RNA sequence (FASTA format or raw sequence). - :param threshold: E-value threshold for BLAST search. - :return: A dictionary containing complete RNA information or None if not found. 
- """ + """Search RNAcentral with an RNA sequence using BLAST or API.""" + + def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: + """Extract and normalize RNA sequence from input.""" + if sequence.startswith(">"): + seq = "".join(sequence.strip().split("\n")[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + return seq if seq and re.fullmatch(r"[AUCGN\s]+", seq, re.I) else None + + def _find_best_match(results: List[Dict], seq: str) -> Optional[Dict]: + """Find best match from search results, preferring exact match.""" + for result_item in results: + if result_item.get("sequence", "") == seq: + return result_item + return results[0] if results else None + result = None try: - seq = self._extract_and_normalize_sequence(sequence) - if not seq: + if not (seq := _extract_and_normalize_sequence(sequence)): logger.error("Empty or invalid RNA sequence provided.") - return None - - # Try local BLAST first if enabled - if self.use_local_blast: - accession = self._local_blast(seq, threshold) - if accession: - logger.debug("Local BLAST found accession: %s", accession) - result = await self.get_by_rna_id(accession) - if not result: - result = await self.get_best_hit(accession) - - # Fall back to RNAcentral API if local BLAST didn't find result - if not result: - logger.debug("Falling back to RNAcentral API.") + elif self.use_local_blast and (accession := self._local_blast(seq, threshold)): + logger.debug("Local BLAST found accession: %s", accession) + result = await self.get_by_rna_id(accession) or await self.get_best_hit(accession) + else: + # Fall back to RNAcentral API + logger.debug("Falling back to RNAcentral API") async with aiohttp.ClientSession() as session: - search_url = f"{self.base_url}/rna" - params = {"sequence": seq, "format": "json"} async with session.get( - search_url, - params=params, + f"{self.base_url}/rna", + params={"sequence": seq, "format": "json"}, headers=self.headers, - timeout=aiohttp.ClientTimeout(total=60), # Sequence search may take longer + timeout=aiohttp.ClientTimeout(total=60), ) as resp: - if resp.status == 200: - search_results = await resp.json() - results = search_results.get("results", []) - result = await self._process_api_search_results(results, seq) - else: + if resp.status != 200: error_text = await resp.text() logger.error("HTTP %d error for sequence search: %s", resp.status, error_text[:200]) raise Exception(f"HTTP {resp.status}: {error_text}") + + search_results = await resp.json() + if results := search_results.get("results", []): + target_result = _find_best_match(results, seq) + if rna_id := target_result.get("rnacentral_id"): + result = await self.get_by_rna_id(rna_id) + if not result: + logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) + result = self._rna_data_to_dict(rna_id, target_result) + else: + logger.info("No results found for sequence.") except aiohttp.ClientError as e: logger.error("Network error searching for sequence: %s", e) - except Exception as e: # pylint: disable=broad-except + except Exception as e: logger.error("Sequence search failed: %s", e) return result @@ -439,35 +319,21 @@ async def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Op retry=retry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError)), reraise=True, ) - async def search( - self, query: str, threshold: float = 0.1, **kwargs - ) -> Optional[Dict]: - """ - Search RNAcentral with either an RNAcentral ID, keyword, or RNA sequence. 
- :param query: The search query (RNAcentral ID, keyword, or RNA sequence). - :param threshold: E-value threshold for sequence search. - Note: RNAcentral API uses its own similarity matching, this parameter is for interface consistency. - :param kwargs: Additional keyword arguments (not used currently). - :return: A dictionary containing the search results or None if not found. - """ - # auto detect query type + async def search(self, query: str, threshold: float = 0.1, **kwargs) -> Optional[Dict]: + """Search RNAcentral with either an RNAcentral ID, keyword, or RNA sequence.""" if not query or not isinstance(query, str): logger.error("Empty or non-string input.") return None - query = query.strip() + query = query.strip() logger.debug("RNAcentral search query: %s", query) - # check if RNA sequence (AUCG characters, contains U) - if query.startswith(">") or ( - re.fullmatch(r"[AUCGN\s]+", query, re.I) and "U" in query.upper() - ): + # Auto-detect query type + if query.startswith(">") or (re.fullmatch(r"[AUCGN\s]+", query, re.I) and "U" in query.upper()): result = await self.search_by_sequence(query, threshold) - # check if RNAcentral ID (typically starts with URS) elif re.fullmatch(r"URS\d+", query, re.I): result = await self.get_by_rna_id(query) else: - # otherwise treat as keyword result = await self.get_best_hit(query) if result: From ea30cef89b903f2f6fbc17ca3fc6a1d9724e8bf5 Mon Sep 17 00:00:00 2001 From: chenzihong <522023320011@smail.nju.edu.cn> Date: Mon, 1 Dec 2025 13:40:00 +0800 Subject: [PATCH 17/22] fix: fix search params in get_best_hit --- graphgen/models/searcher/db/ncbi_searcher.py | 26 +++++++++----------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 12da3098..655ea4fd 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -76,8 +76,7 @@ def _nested_get(data: dict, *keys, default=None): data = data.get(key, default) return data - @staticmethod - def _gene_record_to_dict(gene_record, gene_id: str) -> dict: + def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict: """ Convert an Entrez gene record to a dictionary. All extraction logic is inlined for maximum clarity and performance. 
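For context on the ESearch/EFetch calls touched in this patch: an esearch against the gene database only returns candidate IDs in an "IdList"; the record itself must then be fetched by a single ID. A minimal two-step sketch with Biopython (query term and values are illustrative only, and Entrez.email must already be set, as in the demo configs of this series):

    from Bio import Entrez

    Entrez.email = "test@example.com"  # placeholder address, as used in the demo configs
    with Entrez.esearch(db="gene", term="TP53[Gene] OR TP53[All Fields]", retmax=1) as handle:
        id_list = Entrez.read(handle).get("IdList", [])  # e.g. ["7157"] for human TP53
    if id_list:
        with Entrez.efetch(db="gene", id=id_list[0], retmode="xml") as handle:
            gene_record = Entrez.read(handle)  # parsed XML record for the top hit
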
@@ -89,8 +88,8 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict:
         locus = (data.get("Entrezgene_locus") or [{}])[0]
 
         # Extract common nested paths once
-        gene_ref = NCBISearch._nested_get(data, "Entrezgene_gene", "Gene-ref", default={})
-        biosource = NCBISearch._nested_get(data, "Entrezgene_source", "BioSource", default={})
+        gene_ref = self._nested_get(data, "Entrezgene_gene", "Gene-ref", default={})
+        biosource = self._nested_get(data, "Entrezgene_source", "BioSource", default={})
 
         # Process synonyms
         synonyms_raw = gene_ref.get("Gene-ref_syn", [])
@@ -105,7 +104,7 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict:
         label = locus.get("Gene-commentary_label", "")
         chromosome_match = re.search(r"Chromosome\s+(\S+)", str(label)) if label else None
 
-        seq_interval = NCBISearch._nested_get(
+        seq_interval = self._nested_get(
             locus, "Gene-commentary_seqs", 0, "Seq-loc_int", "Seq-interval", default={}
         )
         genomic_location = (
@@ -141,7 +140,7 @@ def _gene_record_to_dict(gene_record, gene_id: str) -> dict:
             "id": gene_id,
             "gene_name": gene_ref.get("Gene-ref_locus", "N/A"),
             "gene_description": gene_ref.get("Gene-ref_desc", "N/A"),
-            "organism": NCBISearch._nested_get(
+            "organism": self._nested_get(
                 biosource, "BioSource_org", "Org-ref", "Org-ref_taxname", default="N/A"
             ),
             "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}",
@@ -200,7 +199,6 @@ def _extract_from_genbank(result: dict, accession: str):
 
             return result
 
-
         try:
             with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle:
                 gene_record = Entrez.read(handle)
@@ -259,11 +257,11 @@ def get_best_hit(self, keyword: str) -> Optional[dict]:
             return None
 
         try:
-            for search_term in [f"{keyword}[Gene Name] OR {keyword}[All Fields]", keyword]:
-                with Entrez.esearch(db="gene", term=search_term, retmax=1) as search_handle:
-                    if search_results := Entrez.read(search_handle):
-                        if gene_id := search_results["IdList"][0]:
-                            return self.get_by_gene_id(gene_id)
+            for search_term in [f"{keyword}[Gene] OR {keyword}[All Fields]", keyword]:
+                with Entrez.esearch(db="gene", term=search_term, retmax=1, sort="relevance") as search_handle:
+                    search_results = Entrez.read(search_handle)
+                    if id_list := search_results.get("IdList", []):
+                        return self.get_by_gene_id(id_list[0])
         except (RequestException, IncompleteRead):
             raise
         except Exception as e:
@@ -289,7 +287,7 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
             logger.error("Local blastn failed: %s", exc)
             return None
 
-    def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]:
+    def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]:
         """Search NCBI with a DNA sequence using BLAST."""
 
         def _extract_and_normalize_sequence(sequence: str) -> Optional[str]:
@@ -369,7 +367,7 @@ async def search(self, query: str, threshold: float = 0.01, **kwargs) -> Optiona
 
         # Auto-detect query type and execute in thread pool
         if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I):
-            result = await loop.run_in_executor(_get_pool(), self.search_by_sequence, query, threshold)
+            result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold)
        elif re.fullmatch(r"^\d+$", query):
            result = await loop.run_in_executor(_get_pool(), self.get_by_gene_id, query)
        elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I):

From 3adb9566794550b143cbff2f711d65c63f8ccb5a Mon Sep 17 00:00:00 2001
From: chenzihong <522023320011@smail.nju.edu.cn>
Date: Mon, 1 Dec 2025 13:42:16 +0800
Subject: [PATCH 18/22] perf: optimize search
logic in rnacentral_searcher --- .../models/searcher/db/rnacentral_searcher.py | 414 +++++++----------- 1 file changed, 166 insertions(+), 248 deletions(-) diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index 5950a3e7..99c163f2 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -2,9 +2,13 @@ import os import re import subprocess +from concurrent.futures import ThreadPoolExecutor +from functools import lru_cache import tempfile -from typing import Dict, Optional, List, Any +from typing import Dict, Optional, List, Any, Set +import hashlib +import requests import aiohttp from tenacity import ( retry, @@ -16,6 +20,11 @@ from graphgen.bases import BaseSearcher from graphgen.utils import logger + +@lru_cache(maxsize=None) +def _get_pool(): + return ThreadPoolExecutor(max_workers=10) + class RNACentralSearch(BaseSearcher): """ RNAcentral Search client to search RNA databases. @@ -36,167 +45,90 @@ def __init__(self, use_local_blast: bool = False, local_blast_db: str = "rna_db" logger.error("Local BLAST database files not found. Please check the path.") self.use_local_blast = False - async def _fetch_all_xrefs(self, xrefs_url: str, session: aiohttp.ClientSession) -> List[Dict]: - """ - Fetch all xrefs from the xrefs endpoint, handling pagination. - :param xrefs_url: URL to the xrefs endpoint. - :param session: aiohttp ClientSession to use for requests. - :return: List of all xref entries. - """ - all_xrefs = [] - current_url = xrefs_url - - while current_url: - try: - async with session.get( - current_url, headers=self.headers, timeout=aiohttp.ClientTimeout(total=30) - ) as resp: - if resp.status == 200: - data = await resp.json() - results = data.get("results", []) - all_xrefs.extend(results) - - # Check if there's a next page - current_url = data.get("next") - if not current_url: - break - - # Small delay to avoid rate limiting - await asyncio.sleep(0.2) - else: - logger.warning("Failed to fetch xrefs from %s: HTTP %d", current_url, resp.status) - break - except Exception as e: - logger.warning("Error fetching xrefs from %s: %s", current_url, e) - break - - return all_xrefs - @staticmethod - def _extract_info_from_xrefs(xrefs: List[Dict]) -> Dict[str, Any]: - """ - Extract information from xrefs data. - :param xrefs: List of xref entries. - :return: Dictionary with extracted information. 
- """ - extracted = { - "organisms": set(), - "gene_names": set(), - "modifications": [], - "so_terms": set(), - "xrefs_list": [], - } + def _extract_info_from_xrefs(xrefs: List[Dict[str, Any]]) -> Dict[str, Any]: + organisms: Set[str] = set() + gene_names: Set[str] = set() + modifications: List[Any] = [] + so_terms: Set[str] = set() + xrefs_list: List[Dict[str, Any]] = [] + + def format_unique_values(values: Set[str]) -> Optional[str]: + if not values: + return None + if len(values) == 1: + return next(iter(values)) + return ", ".join(sorted(values)) for xref in xrefs: - # Extract accession information accession = xref.get("accession", {}) - - # Extract species information species = accession.get("species") - if species: - extracted["organisms"].add(species) - - # Extract gene name gene = accession.get("gene") - if gene and gene.strip(): # Only add non-empty genes - extracted["gene_names"].add(gene.strip()) - - # Extract modifications - modifications = xref.get("modifications", []) - if modifications: - extracted["modifications"].extend(modifications) - - # Extract SO term (biotype) - biotype = accession.get("biotype") - if biotype: - extracted["so_terms"].add(biotype) - - # Build xrefs list - xref_info = { + stripped_gene = gene.strip() if gene else None + if species: + organisms.add(species) + if stripped_gene: + gene_names.add(stripped_gene) + if mods := xref.get("modifications"): + modifications.extend(mods) + if biotype := accession.get("biotype"): + so_terms.add(biotype) + + xrefs_list.append({ "database": xref.get("database"), "accession_id": accession.get("id"), "external_id": accession.get("external_id"), "description": accession.get("description"), "species": species, - "gene": gene, - } - extracted["xrefs_list"].append(xref_info) + "gene": stripped_gene, + }) - # Convert sets to appropriate formats return { - "organism": ( - list(extracted["organisms"])[0] - if len(extracted["organisms"]) == 1 - else (", ".join(extracted["organisms"]) if extracted["organisms"] else None) - ), - "gene_name": ( - list(extracted["gene_names"])[0] - if len(extracted["gene_names"]) == 1 - else (", ".join(extracted["gene_names"]) if extracted["gene_names"] else None) - ), - "related_genes": list(extracted["gene_names"]) if extracted["gene_names"] else None, - "modifications": extracted["modifications"] if extracted["modifications"] else None, - "so_term": ( - list(extracted["so_terms"])[0] - if len(extracted["so_terms"]) == 1 - else (", ".join(extracted["so_terms"]) if extracted["so_terms"] else None) - ), - "xrefs": extracted["xrefs_list"] if extracted["xrefs_list"] else None, + "organism": format_unique_values(organisms), + "gene_name": format_unique_values(gene_names), + "related_genes": list(gene_names) if gene_names else None, + "modifications": modifications or None, + "so_term": format_unique_values(so_terms), + "xrefs": xrefs_list or None, } @staticmethod - def _rna_data_to_dict(rna_id: str, rna_data: dict, xrefs_data: Optional[List[Dict]] = None) -> dict: - """ - Convert RNAcentral API response to a dictionary. - :param rna_id: RNAcentral ID. - :param rna_data: API response data (dict or dict-like from search results). - :param xrefs_data: Optional list of xref entries fetched from xrefs endpoint. - :return: A dictionary containing RNA information. 
- """ + def _rna_data_to_dict( + rna_id: str, + rna_data: Dict[str, Any], + xrefs_data: Optional[List[Dict[str, Any]]] = None + ) -> Dict[str, Any]: + fallback_rules = { + "organism": ["organism", "species"], + "related_genes": ["related_genes", "genes"], + "gene_name": ["gene_name", "gene"], + "so_term": ["so_term"], + "modifications": ["modifications"], + } + + xrefs_info = RNACentralSearch._extract_info_from_xrefs(xrefs_data) if xrefs_data else {} + + def resolve_field(field_name: str) -> Any: + if value := xrefs_info.get(field_name): + return value + + for key in fallback_rules[field_name]: + if (value := rna_data.get(key)) is not None: + return value + + return None + + organism = resolve_field("organism") + gene_name = resolve_field("gene_name") + so_term = resolve_field("so_term") + modifications = resolve_field("modifications") + + related_genes = resolve_field("related_genes") + if not related_genes and (single_gene := rna_data.get("gene_name")): + related_genes = [single_gene] + sequence = rna_data.get("sequence", "") - # Initialize extracted info from xrefs if available - extracted_info = {} - if xrefs_data: - extracted_info = RNACentralSearch._extract_info_from_xrefs(xrefs_data) - - # Extract organism information (prefer from xrefs, fallback to main data) - organism = extracted_info.get("organism") - if not organism: - organism = rna_data.get("organism", None) - if not organism: - organism = rna_data.get("species", None) - - # Extract related genes (prefer from xrefs, fallback to main data) - related_genes = extracted_info.get("related_genes") - if not related_genes: - related_genes = rna_data.get("related_genes", []) - if not related_genes: - related_genes = rna_data.get("genes", []) - if not related_genes: - gene_name_temp = rna_data.get("gene_name", None) - if gene_name_temp: - related_genes = [gene_name_temp] - - # Extract gene name (prefer from xrefs, fallback to main data) - gene_name = extracted_info.get("gene_name") - if not gene_name: - gene_name = rna_data.get("gene_name", None) - if not gene_name: - gene_name = rna_data.get("gene", None) - - # Extract so_term (prefer from xrefs, fallback to main data) - so_term = extracted_info.get("so_term") - if not so_term: - so_term = rna_data.get("so_term", None) - - # Extract modifications (prefer from xrefs, fallback to main data) - modifications = extracted_info.get("modifications") - if not modifications: - modifications = rna_data.get("modifications", None) - - # Build result dictionary (xrefs information is already extracted into other fields) - # information is extracted into organism, gene_name, so_term, modifications, etc. return { "molecule_type": "RNA", "database": "RNAcentral", @@ -208,51 +140,52 @@ def _rna_data_to_dict(rna_id: str, rna_data: dict, xrefs_data: Optional[List[Dic "description": rna_data.get("description", "N/A"), "url": f"https://rnacentral.org/rna/{rna_id}", "organism": organism, - "related_genes": related_genes if related_genes else None, + "related_genes": related_genes or None, "gene_name": gene_name, "so_term": so_term, "modifications": modifications, } - async def get_by_rna_id(self, rna_id: str) -> Optional[dict]: + @staticmethod + def _calculate_md5(sequence: str) -> str: + """ + Calculate MD5 hash for RNA sequence as per RNAcentral spec. 
+ - Replace U with T + - Convert to uppercase + - Encode as ASCII + """ + # Normalize sequence + normalized_seq = sequence.replace("U", "T").replace("u", "t").upper() + if not re.fullmatch(r"[ATCGN]+", normalized_seq): + raise ValueError(f"Invalid sequence characters after normalization: {normalized_seq[:50]}...") + + return hashlib.md5(normalized_seq.encode("ascii")).hexdigest() + + def get_by_rna_id(self, rna_id: str) -> Optional[dict]: """ Get RNA information by RNAcentral ID. :param rna_id: RNAcentral ID (e.g., URS0000000001). :return: A dictionary containing RNA information or None if not found. """ try: - async with aiohttp.ClientSession() as session: - url = f"{self.base_url}/rna/{rna_id}" - async with session.get( - url, headers=self.headers, timeout=aiohttp.ClientTimeout(total=30) - ) as resp: - if resp.status == 200: - rna_data = await resp.json() - - # Check if xrefs is a URL and fetch the actual xrefs data - xrefs_data = None - xrefs_url = rna_data.get("xrefs") - if xrefs_url and isinstance(xrefs_url, str) and xrefs_url.startswith("http"): - try: - xrefs_data = await self._fetch_all_xrefs(xrefs_url, session) - logger.debug("Fetched %d xrefs for RNA ID %s", len(xrefs_data), rna_id) - except Exception as e: - logger.warning("Failed to fetch xrefs for RNA ID %s: %s", rna_id, e) - # Continue without xrefs data - - return self._rna_data_to_dict(rna_id, rna_data, xrefs_data) - if resp.status == 404: - logger.error("RNA ID %s not found", rna_id) - return None - raise Exception(f"HTTP {resp.status}: {await resp.text()}") - except aiohttp.ClientError as e: + url = f"{self.base_url}/rna/{rna_id}" + url += "?flat=true" + + resp = requests.get(url, headers=self.headers, timeout=30) + if resp.status_code == 200: + rna_data = resp.json() + xrefs_data = rna_data.get("xrefs", []) + return self._rna_data_to_dict(rna_id, rna_data, xrefs_data) + logger.error("Failed to fetch RNA ID %s: HTTP %s", rna_id, resp.status_code) + return None + except requests.RequestException as e: logger.error("Network error getting RNA ID %s: %s", rna_id, e) return None - except Exception as exc: # pylint: disable=broad-except - logger.error("RNA ID %s not found: %s", rna_id, exc) + except Exception as e: # pylint: disable=broad-except + logger.error("Unexpected error getting RNA ID %s: %s", rna_id, e) return None - async def get_best_hit(self, keyword: str) -> Optional[dict]: + def get_best_hit(self, keyword: str) -> Optional[dict]: """ Search RNAcentral with a keyword and return the best hit. Unified approach: Find RNA ID from search, then call get_by_rna_id() for complete information. 
@@ -263,42 +196,35 @@ async def get_best_hit(self, keyword: str) -> Optional[dict]: return None try: - async with aiohttp.ClientSession() as session: - search_url = f"{self.base_url}/rna" - params = {"search": keyword, "format": "json"} - async with session.get( - search_url, - params=params, - headers=self.headers, - timeout=aiohttp.ClientTimeout(total=30), - ) as resp: - if resp.status == 200: - search_results = await resp.json() - results = search_results.get("results", []) - if results: - # Step 1: Get RNA ID from search results - first_result = results[0] - rna_id = first_result.get("rnacentral_id") - - if rna_id: - # Step 2: Unified call to get_by_rna_id() for complete information - result = await self.get_by_rna_id(rna_id) - - # Step 3: If get_by_rna_id() failed, use search result data as fallback - if not result: - logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) - result = self._rna_data_to_dict(rna_id, first_result) - - return result - logger.info("No results found for keyword: %s", keyword) - return None - error_text = await resp.text() - logger.error("HTTP %d error for keyword %s: %s", resp.status, keyword, error_text[:200]) - raise Exception(f"HTTP {resp.status}: {error_text}") + search_url = f"{self.base_url}/rna" + params = {"search": keyword, "format": "json"} + + resp = requests.get( + search_url, + params=params, + headers=self.headers, + timeout=30, + ) + if resp.status_code == 200: + search_results = resp.json() + results = search_results.get("results", []) + if results: + # Step 1: Get RNA ID from search results + first_result = results[0] + rna_id = first_result.get("rnacentral_id") + + if rna_id: + # Step 2: Unified call to get_by_rna_id() for complete information + return self.get_by_rna_id(rna_id) + # Step 3: If get_by_rna_id() failed, use search result data as fallback + logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) + return self._rna_data_to_dict(rna_id, first_result) + logger.error("No RNA ID found for keyword %s", keyword) + return None except aiohttp.ClientError as e: logger.error("Network error searching for keyword %s: %s", keyword, e) return None - except Exception as e: # pylint: disable=broad-except + except Exception as e: logger.error("Keyword %s not found: %s", keyword, e) return None @@ -339,16 +265,6 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]: logger.error("Local blastn failed: %s", exc) return None - @staticmethod - def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: - """Extract and normalize RNA sequence from input.""" - if sequence.startswith(">"): - seq_lines = sequence.strip().split("\n") - seq = "".join(seq_lines[1:]) - else: - seq = sequence.strip().replace(" ", "").replace("\n", "") - return seq if seq and re.fullmatch(r"[AUCGN\s]+", seq, re.I) else None - def _find_best_match_from_results(self, results: List[Dict], seq: str) -> Optional[Dict]: """Find best match from search results, preferring exact match.""" exact_match = None @@ -382,7 +298,7 @@ async def _process_api_search_results( result = self._rna_data_to_dict(rna_id, target_result) return result - async def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: + def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: """ Search RNAcentral with an RNA sequence. Tries local BLAST first if enabled, falls back to RNAcentral API. 
@@ -391,9 +307,17 @@ async def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Op :param threshold: E-value threshold for BLAST search. :return: A dictionary containing complete RNA information or None if not found. """ - result = None + def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: + """Extract and normalize RNA sequence from input.""" + if sequence.startswith(">"): + seq_lines = sequence.strip().split("\n") + seq = "".join(seq_lines[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + return seq if seq and re.fullmatch(r"[AUCGN\s]+", seq, re.I) else None + try: - seq = self._extract_and_normalize_sequence(sequence) + seq = _extract_and_normalize_sequence(sequence) if not seq: logger.error("Empty or invalid RNA sequence provided.") return None @@ -403,35 +327,27 @@ async def search_by_sequence(self, sequence: str, threshold: float = 0.01) -> Op accession = self._local_blast(seq, threshold) if accession: logger.debug("Local BLAST found accession: %s", accession) - result = await self.get_by_rna_id(accession) - if not result: - result = await self.get_best_hit(accession) + return self.get_by_rna_id(accession) # Fall back to RNAcentral API if local BLAST didn't find result - if not result: - logger.debug("Falling back to RNAcentral API.") - async with aiohttp.ClientSession() as session: - search_url = f"{self.base_url}/rna" - params = {"sequence": seq, "format": "json"} - async with session.get( - search_url, - params=params, - headers=self.headers, - timeout=aiohttp.ClientTimeout(total=60), # Sequence search may take longer - ) as resp: - if resp.status == 200: - search_results = await resp.json() - results = search_results.get("results", []) - result = await self._process_api_search_results(results, seq) - else: - error_text = await resp.text() - logger.error("HTTP %d error for sequence search: %s", resp.status, error_text[:200]) - raise Exception(f"HTTP {resp.status}: {error_text}") - except aiohttp.ClientError as e: - logger.error("Network error searching for sequence: %s", e) + logger.debug("Falling back to RNAcentral API.") + + md5_hash = self._calculate_md5(seq) + search_url = f"{self.base_url}/rna" + params = {"md5": md5_hash, "format": "json"} + + resp = requests.get(search_url, params=params, headers=self.headers, timeout=60) # Sequence search may take longer + if resp.status_code == 200: + search_results = resp.json() + results = search_results.get("results", []) + return self._process_api_search_results(results, seq) + error_text = resp.text() + logger.error("HTTP %d error for sequence search: %s", resp.status, error_text[:200]) + raise Exception(f"HTTP {resp.status}: {error_text}") except Exception as e: # pylint: disable=broad-except logger.error("Sequence search failed: %s", e) - return result + return None + @retry( stop=stop_after_attempt(3), @@ -458,17 +374,19 @@ async def search( logger.debug("RNAcentral search query: %s", query) + loop = asyncio.get_running_loop() + # check if RNA sequence (AUCG characters, contains U) if query.startswith(">") or ( re.fullmatch(r"[AUCGN\s]+", query, re.I) and "U" in query.upper() ): - result = await self.search_by_sequence(query, threshold) + result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold) # check if RNAcentral ID (typically starts with URS) elif re.fullmatch(r"URS\d+", query, re.I): - result = await self.get_by_rna_id(query) + result = await loop.run_in_executor(_get_pool(), self.get_by_rna_id, query) else: # otherwise treat as keyword 
- result = await self.get_best_hit(query) + result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) if result: result["_search_query"] = query From e1530f967e0e3bc5dbcf575872bc525d4d6b6353 Mon Sep 17 00:00:00 2001 From: chenzihong <522023320011@smail.nju.edu.cn> Date: Mon, 1 Dec 2025 14:18:29 +0800 Subject: [PATCH 19/22] perf: optimize code style --- .../models/searcher/db/rnacentral_searcher.py | 190 +++++++----------- 1 file changed, 69 insertions(+), 121 deletions(-) diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index afbf0ca7..4b288d9b 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -46,12 +46,24 @@ def __init__(self, use_local_blast: bool = False, local_blast_db: str = "rna_db" self.use_local_blast = False @staticmethod - def _extract_info_from_xrefs(xrefs: List[Dict[str, Any]]) -> Dict[str, Any]: - organisms: Set[str] = set() - gene_names: Set[str] = set() + def _rna_data_to_dict( + rna_id: str, + rna_data: Dict[str, Any], + xrefs_data: Optional[List[Dict[str, Any]]] = None + ) -> Dict[str, Any]: + organisms, gene_names, so_terms = set(), set(), set() modifications: List[Any] = [] - so_terms: Set[str] = set() - xrefs_list: List[Dict[str, Any]] = [] + + for xref in xrefs_data or []: + acc = xref.get("accession", {}) + if s := acc.get("species"): + organisms.add(s) + if g := acc.get("gene", "").strip(): + gene_names.add(g) + if m := xref.get("modifications"): + modifications.extend(m) + if b := acc.get("biotype"): + so_terms.add(b) def format_unique_values(values: Set[str]) -> Optional[str]: if not values: @@ -60,44 +72,14 @@ def format_unique_values(values: Set[str]) -> Optional[str]: return next(iter(values)) return ", ".join(sorted(values)) - for xref in xrefs: - accession = xref.get("accession", {}) - species = accession.get("species") - gene = accession.get("gene") - stripped_gene = gene.strip() if gene else None - if species: - organisms.add(species) - if stripped_gene: - gene_names.add(stripped_gene) - if mods := xref.get("modifications"): - modifications.extend(mods) - if biotype := accession.get("biotype"): - so_terms.add(biotype) - - xrefs_list.append({ - "database": xref.get("database"), - "accession_id": accession.get("id"), - "external_id": accession.get("external_id"), - "description": accession.get("description"), - "species": species, - "gene": stripped_gene, - }) - - return { + xrefs_info = { "organism": format_unique_values(organisms), "gene_name": format_unique_values(gene_names), "related_genes": list(gene_names) if gene_names else None, "modifications": modifications or None, "so_term": format_unique_values(so_terms), - "xrefs": xrefs_list or None, } - @staticmethod - def _rna_data_to_dict( - rna_id: str, - rna_data: Dict[str, Any], - xrefs_data: Optional[List[Dict[str, Any]]] = None - ) -> Dict[str, Any]: fallback_rules = { "organism": ["organism", "species"], "related_genes": ["related_genes", "genes"], @@ -106,10 +88,8 @@ def _rna_data_to_dict( "modifications": ["modifications"], } - xrefs_info = RNACentralSearch._extract_info_from_xrefs(xrefs_data) if xrefs_data else {} - def resolve_field(field_name: str) -> Any: - if value := xrefs_info.get(field_name): + if (value := xrefs_info.get(field_name)) is not None: return value for key in fallback_rules[field_name]: @@ -128,7 +108,6 @@ def resolve_field(field_name: str) -> Any: related_genes = [single_gene] sequence = rna_data.get("sequence", "") - 
extracted_info = RNACentralSearch._extract_info_from_xrefs(xrefs_data) if xrefs_data else {} return { "molecule_type": "RNA", @@ -172,13 +151,12 @@ def get_by_rna_id(self, rna_id: str) -> Optional[dict]: url = f"{self.base_url}/rna/{rna_id}" url += "?flat=true" - resp = requests.get(url, headers=self.headers, timeout=30) - if resp.status_code == 200: - rna_data = resp.json() - xrefs_data = rna_data.get("xrefs", []) - return self._rna_data_to_dict(rna_id, rna_data, xrefs_data) - logger.error("Failed to fetch RNA ID %s: HTTP %s", rna_id, resp.status_code) - return None + resp = requests.get(url, headers=self.headers) + resp.raise_for_status() + + rna_data = resp.json() + xrefs_data = rna_data.get("xrefs", []) + return self._rna_data_to_dict(rna_id, rna_data, xrefs_data) except requests.RequestException as e: logger.error("Network error getting RNA ID %s: %s", rna_id, e) return None @@ -189,44 +167,42 @@ def get_by_rna_id(self, rna_id: str) -> Optional[dict]: def get_best_hit(self, keyword: str) -> Optional[dict]: """ Search RNAcentral with a keyword and return the best hit. - Unified approach: Find RNA ID from search, then call get_by_rna_id() for complete information. :param keyword: The search keyword (e.g., miRNA name, RNA name). - :return: A dictionary containing complete RNA information or None if not found. + :return: Dictionary with RNA information or None. """ - if not keyword.strip(): + keyword = keyword.strip() + if not keyword: + logger.warning("Empty keyword provided to get_best_hit") return None try: - search_url = f"{self.base_url}/rna" + url = f"{self.base_url}/rna" params = {"search": keyword, "format": "json"} + resp = requests.get(url, params=params, headers=self.headers) + resp.raise_for_status() - resp = requests.get( - search_url, - params=params, - headers=self.headers, - timeout=30, - ) - if resp.status_code == 200: - search_results = resp.json() - results = search_results.get("results", []) - if results: - # Step 1: Get RNA ID from search results - first_result = results[0] - rna_id = first_result.get("rnacentral_id") - - if rna_id: - # Step 2: Unified call to get_by_rna_id() for complete information - return self.get_by_rna_id(rna_id) - # Step 3: If get_by_rna_id() failed, use search result data as fallback - logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) - return self._rna_data_to_dict(rna_id, first_result) - logger.error("No RNA ID found for keyword %s", keyword) - return None - except aiohttp.ClientError as e: - logger.error("Network error searching for keyword %s: %s", keyword, e) + data = resp.json() + results = data.get("results", []) + + if not results: + logger.info("No search results for keyword: %s", keyword) + return None + + first_result = results[0] + rna_id = first_result.get("rnacentral_id") + + if rna_id: + detailed = self.get_by_rna_id(rna_id) + if detailed: + return detailed + logger.debug("Using search result data for %s", rna_id or "unknown") + return self._rna_data_to_dict(rna_id or "", first_result) + + except requests.RequestException as e: + logger.error("Network error searching keyword '%s': %s", keyword, e) return None except Exception as e: - logger.error("Keyword %s not found: %s", keyword, e) + logger.error("Unexpected error searching keyword '%s': %s", keyword, e) return None def _local_blast(self, seq: str, threshold: float) -> Optional[str]: @@ -248,39 +224,6 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]: logger.error("Local blastn failed: %s", exc) return None - def 
_find_best_match_from_results(self, results: List[Dict], seq: str) -> Optional[Dict]: - """Find best match from search results, preferring exact match.""" - exact_match = None - for result_item in results: - result_seq = result_item.get("sequence", "") - if result_seq == seq: - exact_match = result_item - break - return exact_match if exact_match else (results[0] if results else None) - - async def _process_api_search_results( - self, results: List[Dict], seq: str - ) -> Optional[dict]: - """Process API search results and return dictionary or None.""" - if not results: - logger.info("No results found for sequence.") - return None - - target_result = self._find_best_match_from_results(results, seq) - if not target_result: - return None - - rna_id = target_result.get("rnacentral_id") - if not rna_id: - return None - - # Try to get complete information - result = await self.get_by_rna_id(rna_id) - if not result: - logger.debug("get_by_rna_id() failed for %s, using search result data", rna_id) - result = self._rna_data_to_dict(rna_id, target_result) - return result - def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: """ Search RNAcentral with an RNA sequence. @@ -290,7 +233,7 @@ def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict] :param threshold: E-value threshold for BLAST search. :return: A dictionary containing complete RNA information or None if not found. """ - def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: + def _extract_sequence(sequence: str) -> Optional[str]: """Extract and normalize RNA sequence from input.""" if sequence.startswith(">"): seq_lines = sequence.strip().split("\n") @@ -300,7 +243,7 @@ def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: return seq if seq and re.fullmatch(r"[AUCGN\s]+", seq, re.I) else None try: - seq = _extract_and_normalize_sequence(sequence) + seq = _extract_sequence(sequence) if not seq: logger.error("Empty or invalid RNA sequence provided.") return None @@ -319,19 +262,24 @@ def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: search_url = f"{self.base_url}/rna" params = {"md5": md5_hash, "format": "json"} - resp = requests.get(search_url, params=params, headers=self.headers, timeout=60) # Sequence search may take longer - if resp.status_code == 200: - search_results = resp.json() - results = search_results.get("results", []) - return self._process_api_search_results(results, seq) - error_text = resp.text() - logger.error("HTTP %d error for sequence search: %s", resp.status, error_text[:200]) - raise Exception(f"HTTP {resp.status}: {error_text}") - except Exception as e: # pylint: disable=broad-except + resp = requests.get(search_url, params=params, headers=self.headers) + resp.raise_for_status() + + search_results = resp.json() + results = search_results.get("results", []) + + if not results: + logger.info("No exact match found in RNAcentral for sequence") + return None + rna_id = results[0].get("rnacentral_id") + if not rna_id: + logger.error("No RNAcentral ID found in search results.") + return None + return self.get_by_rna_id(rna_id) + except Exception as e: logger.error("Sequence search failed: %s", e) return None - @retry( stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10), From 61f7f4446f08639d8243c3ff103a6920fff44c37 Mon Sep 17 00:00:00 2001 From: chenzihong <522023320011@smail.nju.edu.cn> Date: Mon, 1 Dec 2025 14:23:24 +0800 Subject: [PATCH 20/22] fix: fix lint problems --- 
graphgen/models/searcher/db/rnacentral_searcher.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index 4b288d9b..58c5e86e 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -48,7 +48,7 @@ def __init__(self, use_local_blast: bool = False, local_blast_db: str = "rna_db" @staticmethod def _rna_data_to_dict( rna_id: str, - rna_data: Dict[str, Any], + rna_data: Dict[str, Any], xrefs_data: Optional[List[Dict[str, Any]]] = None ) -> Dict[str, Any]: organisms, gene_names, so_terms = set(), set(), set() @@ -151,7 +151,7 @@ def get_by_rna_id(self, rna_id: str) -> Optional[dict]: url = f"{self.base_url}/rna/{rna_id}" url += "?flat=true" - resp = requests.get(url, headers=self.headers) + resp = requests.get(url, headers=self.headers, timeout=30) resp.raise_for_status() rna_data = resp.json() @@ -178,7 +178,7 @@ def get_best_hit(self, keyword: str) -> Optional[dict]: try: url = f"{self.base_url}/rna" params = {"search": keyword, "format": "json"} - resp = requests.get(url, params=params, headers=self.headers) + resp = requests.get(url, params=params, headers=self.headers, timeout=30) resp.raise_for_status() data = resp.json() @@ -262,7 +262,7 @@ def _extract_sequence(sequence: str) -> Optional[str]: search_url = f"{self.base_url}/rna" params = {"md5": md5_hash, "format": "json"} - resp = requests.get(search_url, params=params, headers=self.headers) + resp = requests.get(search_url, params=params, headers=self.headers, timeout=60) resp.raise_for_status() search_results = resp.json() From 2c00b9e750450aa13875f81bba1fb5328f62abb9 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Mon, 1 Dec 2025 16:25:02 +0800 Subject: [PATCH 21/22] fix: search setup problems --- graphgen/graphgen.py | 2 +- graphgen/models/searcher/db/ncbi_searcher.py | 3 +++ requirements.txt | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py index 167981e9..bc7e7742 100644 --- a/graphgen/graphgen.py +++ b/graphgen/graphgen.py @@ -45,7 +45,7 @@ def __init__( # llm self.tokenizer_instance: Tokenizer = tokenizer_instance or Tokenizer( - model_name=os.getenv("TOKENIZER_MODEL") + model_name=os.getenv("TOKENIZER_MODEL", "cl100k_base") ) self.synthesizer_llm_client: BaseLLMWrapper = ( diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 655ea4fd..ae06db3d 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -44,6 +44,7 @@ def __init__( local_blast_db: str = "nt_db", email: str = "email@example.com", api_key: str = "", + tool: str = "GraphGen", ): """ Initialize the NCBI Search client. @@ -53,10 +54,12 @@ def __init__( local_blast_db (str): Path to the local BLAST database. email (str): Email address for NCBI API requests. api_key (str): API key for NCBI API requests, see https://account.ncbi.nlm.nih.gov/settings/. + tool (str): Tool name for NCBI API requests. 
""" super().__init__() Entrez.timeout = 60 # 60 seconds timeout Entrez.email = email + Entrez.tool = tool if api_key: Entrez.api_key = api_key Entrez.max_tries = 10 if api_key else 3 diff --git a/requirements.txt b/requirements.txt index 47965013..fa2b1efc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,6 +21,7 @@ fastapi trafilatura aiohttp diskcache +socksio leidenalg igraph From 6d0be7aa2713e5d18dc7a50f0c5be4c8db8049dc Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Mon, 1 Dec 2025 18:21:00 +0800 Subject: [PATCH 22/22] feat: more examples in search demo --- graphgen/models/searcher/db/ncbi_searcher.py | 25 ++++++++++++------- .../input_examples/search_dna_demo.jsonl | 4 +++ .../input_examples/search_protein_demo.jsonl | 5 ++++ .../input_examples/search_rna_demo.jsonl | 1 + 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index ae06db3d..0de8ecc0 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -27,6 +27,10 @@ def _get_pool(): return ThreadPoolExecutor(max_workers=10) +# ensure only one NCBI request at a time +_ncbi_lock = asyncio.Lock() + + class NCBISearch(BaseSearcher): """ NCBI Search client to search DNA/GenBank/Entrez databases. @@ -236,6 +240,7 @@ def _extract_gene_id(link_handle): return str(link.get("Id") if isinstance(link, dict) else link) try: + # TODO: support accession number with version number (e.g., NM_000546.3) with Entrez.elink(dbfrom="nuccore", db="gene", id=accession) as link_handle: gene_id = _extract_gene_id(link_handle) @@ -368,15 +373,17 @@ async def search(self, query: str, threshold: float = 0.01, **kwargs) -> Optiona loop = asyncio.get_running_loop() - # Auto-detect query type and execute in thread pool - if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I): - result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold) - elif re.fullmatch(r"^\d+$", query): - result = await loop.run_in_executor(_get_pool(), self.get_by_gene_id, query) - elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I): - result = await loop.run_in_executor(_get_pool(), self.get_by_accession, query) - else: - result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) + # limit concurrent requests (NCBI rate limit: max 3 requests per second) + async with _ncbi_lock: + # Auto-detect query type and execute in thread pool + if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I): + result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold) + elif re.fullmatch(r"^\d+$", query): + result = await loop.run_in_executor(_get_pool(), self.get_by_gene_id, query) + elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I): + result = await loop.run_in_executor(_get_pool(), self.get_by_accession, query) + else: + result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) if result: result["_search_query"] = query diff --git a/resources/input_examples/search_dna_demo.jsonl b/resources/input_examples/search_dna_demo.jsonl index 387c87b8..346b65f0 100644 --- a/resources/input_examples/search_dna_demo.jsonl +++ b/resources/input_examples/search_dna_demo.jsonl @@ -1,5 +1,9 @@ {"type": "text", "content": "TP53"} {"type": "text", "content": "BRCA1"} {"type": "text", "content": "672"} +{"type": "text", "content": "11998"} {"type": "text", "content": "NM_000546"} +{"type": "text", "content": "NM_024140"} 
+{"type": "text", "content": ">query\nCTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"} {"type": "text", "content": 
"CTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"} + diff --git a/resources/input_examples/search_protein_demo.jsonl b/resources/input_examples/search_protein_demo.jsonl index 82b77836..e119cec8 100644 --- a/resources/input_examples/search_protein_demo.jsonl +++ b/resources/input_examples/search_protein_demo.jsonl @@ -2,6 +2,11 @@ {"type": "text", "content": "P68871"} {"type": "text", "content": "P02768"} {"type": "text", "content": "P04637"} +{"type": "text", "content": "insulin"} +{"type": "text", "content": "hemoglobin"} +{"type": "text", "content": "p53"} +{"type": "text", "content": "BRCA1"} +{"type": "text", "content": "albumin"} {"type": "text", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} {"type": "text", "content": 
"MGSSHHHHHHSQDLENLYFQGSMNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRTKRVITTFRTGTWDAYKNLRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV"} {"type": "text", "content": "MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDAAAVEATWIAKIKAAKSKYEAEAIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKERGVKVLHLQA"} diff --git a/resources/input_examples/search_rna_demo.jsonl b/resources/input_examples/search_rna_demo.jsonl index caa28612..16e99479 100644 --- a/resources/input_examples/search_rna_demo.jsonl +++ b/resources/input_examples/search_rna_demo.jsonl @@ -1,4 +1,5 @@ {"type": "text", "content": "hsa-let-7a-1"} {"type": "text", "content": "URS0000123456"} {"type": "text", "content": "URS0000000001"} +{"type": "text", "content": ">query\nCUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} {"type": "text", "content": "CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"}