diff --git a/graphgen/configs/search_config.yaml b/graphgen/configs/search_config.yaml deleted file mode 100644 index 63ebd241..00000000 --- a/graphgen/configs/search_config.yaml +++ /dev/null @@ -1,14 +0,0 @@ -pipeline: - - name: read_step - op_key: read - params: - input_file: resources/input_examples/search_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples - - - name: search_step - op_key: search - deps: [read_step] # search_step depends on read_step - params: - data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot - uniprot_params: - use_local_blast: true # whether to use local blast for uniprot search - local_blast_db: /your_path/uniprot_sprot diff --git a/graphgen/configs/search_dna_config.yaml b/graphgen/configs/search_dna_config.yaml new file mode 100644 index 00000000..5245ea0c --- /dev/null +++ b/graphgen/configs/search_dna_config.yaml @@ -0,0 +1,17 @@ +pipeline: + - name: read_step + op_key: read + params: + input_file: resources/input_examples/search_dna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples + + - name: search_step + op_key: search + deps: [read_step] # search_step depends on read_step + params: + data_sources: [ncbi] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral + ncbi_params: + email: test@example.com # NCBI requires an email address + tool: GraphGen # tool name for NCBI API + use_local_blast: true # whether to use local blast for DNA search + local_blast_db: /your_path/refseq_241 # path to local BLAST database (without .nhr extension) + diff --git a/graphgen/configs/search_protein_config.yaml b/graphgen/configs/search_protein_config.yaml new file mode 100644 index 00000000..bfbf84eb --- /dev/null +++ b/graphgen/configs/search_protein_config.yaml @@ -0,0 +1,15 @@ +pipeline: + - name: read_step + op_key: read + params: + input_file: resources/input_examples/search_protein_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples + + - name: search_step + op_key: search + deps: [read_step] # search_step depends on read_step + params: + data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot + uniprot_params: + use_local_blast: true # whether to use local blast for uniprot search + local_blast_db: /your_path/2024_01/uniprot_sprot # format: /path/to/${RELEASE}/uniprot_sprot + # options: uniprot_sprot (recommended, high quality), uniprot_trembl, or uniprot_${RELEASE} (merged database) diff --git a/graphgen/configs/search_rna_config.yaml b/graphgen/configs/search_rna_config.yaml new file mode 100644 index 00000000..dae62ec2 --- /dev/null +++ b/graphgen/configs/search_rna_config.yaml @@ -0,0 +1,16 @@ +pipeline: + - name: read_step + op_key: read + params: + input_file: resources/input_examples/search_rna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples + + - name: search_step + op_key: search + deps: [read_step] # search_step depends on read_step + params: + data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral + rnacentral_params: + use_local_blast: true # whether to use local blast for RNA search + local_blast_db: /your_path/refseq_rna_241 # format: /path/to/refseq_rna_${RELEASE} + # can also use DNA database with RNA sequences (if already built) + diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py index 167981e9..bc7e7742 100644 --- a/graphgen/graphgen.py +++ b/graphgen/graphgen.py @@ -45,7 +45,7 @@ def __init__( # llm self.tokenizer_instance: Tokenizer = tokenizer_instance or Tokenizer( - model_name=os.getenv("TOKENIZER_MODEL") + model_name=os.getenv("TOKENIZER_MODEL", "cl100k_base") ) self.synthesizer_llm_client: BaseLLMWrapper = ( diff --git a/graphgen/models/__init__.py b/graphgen/models/__init__.py index cc8dfd90..00a5cc56 100644 --- a/graphgen/models/__init__.py +++ b/graphgen/models/__init__.py @@ -26,6 +26,8 @@ RDFReader, TXTReader, ) +from .searcher.db.ncbi_searcher import NCBISearch +from .searcher.db.rnacentral_searcher import RNACentralSearch from .searcher.db.uniprot_searcher import UniProtSearch from .searcher.kg.wiki_search import WikiSearch from .searcher.web.bing_search import BingSearch diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py new file mode 100644 index 00000000..0de8ecc0 --- /dev/null +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -0,0 +1,390 @@ +import asyncio +import os +import re +import subprocess +import tempfile +from concurrent.futures import ThreadPoolExecutor +from functools import lru_cache +from http.client import IncompleteRead +from typing import Dict, Optional + +from Bio import Entrez, SeqIO +from Bio.Blast import NCBIWWW, NCBIXML +from requests.exceptions import RequestException +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from graphgen.bases import BaseSearcher +from graphgen.utils import logger + + +@lru_cache(maxsize=None) +def _get_pool(): + return ThreadPoolExecutor(max_workers=10) + + +# ensure only one NCBI request at a time +_ncbi_lock = asyncio.Lock() + + +class NCBISearch(BaseSearcher): + """ + NCBI Search client to search DNA/GenBank/Entrez databases. + 1) Get the gene/DNA by accession number or gene ID. + 2) Search with keywords or gene names (fuzzy search). + 3) Search with FASTA sequence (BLAST search for DNA sequences). + + API Documentation: https://www.ncbi.nlm.nih.gov/home/develop/api/ + Note: NCBI has rate limits (max 3 requests per second), delays are required between requests. + """ + + def __init__( + self, + use_local_blast: bool = False, + local_blast_db: str = "nt_db", + email: str = "email@example.com", + api_key: str = "", + tool: str = "GraphGen", + ): + """ + Initialize the NCBI Search client. + + Args: + use_local_blast (bool): Whether to use local BLAST database. + local_blast_db (str): Path to the local BLAST database. + email (str): Email address for NCBI API requests. + api_key (str): API key for NCBI API requests, see https://account.ncbi.nlm.nih.gov/settings/. + tool (str): Tool name for NCBI API requests. + """ + super().__init__() + Entrez.timeout = 60 # 60 seconds timeout + Entrez.email = email + Entrez.tool = tool + if api_key: + Entrez.api_key = api_key + Entrez.max_tries = 10 if api_key else 3 + Entrez.sleep_between_tries = 5 + self.use_local_blast = use_local_blast + self.local_blast_db = local_blast_db + if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"): + logger.error("Local BLAST database files not found. Please check the path.") + self.use_local_blast = False + + @staticmethod + def _nested_get(data: dict, *keys, default=None): + """Safely traverse nested dictionaries.""" + for key in keys: + if not isinstance(data, dict): + return default + data = data.get(key, default) + return data + + def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict: + """ + Convert an Entrez gene record to a dictionary. + All extraction logic is inlined for maximum clarity and performance. + """ + if not gene_record: + raise ValueError("Empty gene record") + + data = gene_record[0] + locus = (data.get("Entrezgene_locus") or [{}])[0] + + # Extract common nested paths once + gene_ref = self._nested_get(data, "Entrezgene_gene", "Gene-ref", default={}) + biosource = self._nested_get(data, "Entrezgene_source", "BioSource", default={}) + + # Process synonyms + synonyms_raw = gene_ref.get("Gene-ref_syn", []) + gene_synonyms = [] + if isinstance(synonyms_raw, list): + for syn in synonyms_raw: + gene_synonyms.append(syn.get("Gene-ref_syn_E") if isinstance(syn, dict) else str(syn)) + elif synonyms_raw: + gene_synonyms.append(str(synonyms_raw)) + + # Extract location info + label = locus.get("Gene-commentary_label", "") + chromosome_match = re.search(r"Chromosome\s+(\S+)", str(label)) if label else None + + seq_interval = self._nested_get( + locus, "Gene-commentary_seqs", 0, "Seq-loc_int", "Seq-interval", default={} + ) + genomic_location = ( + f"{seq_interval.get('Seq-interval_from')}-{seq_interval.get('Seq-interval_to')}" + if seq_interval.get('Seq-interval_from') and seq_interval.get('Seq-interval_to') + else None + ) + + # Extract representative accession + representative_accession = next( + ( + product.get("Gene-commentary_accession") + for product in locus.get("Gene-commentary_products", []) + if product.get("Gene-commentary_type") == "3" + ), + None, + ) + + # Extract function + function = data.get("Entrezgene_summary") or next( + ( + comment.get("Gene-commentary_comment") + for comment in data.get("Entrezgene_comments", []) + if isinstance(comment, dict) + and "function" in str(comment.get("Gene-commentary_heading", "")).lower() + ), + None, + ) + + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": gene_id, + "gene_name": gene_ref.get("Gene-ref_locus", "N/A"), + "gene_description": gene_ref.get("Gene-ref_desc", "N/A"), + "organism": self._nested_get( + biosource, "BioSource_org", "Org-ref", "Org-ref_taxname", default="N/A" + ), + "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}", + "gene_synonyms": gene_synonyms or None, + "gene_type": { + "1": "protein-coding", + "2": "pseudo", + "3": "rRNA", + "4": "tRNA", + "5": "snRNA", + "6": "ncRNA", + "7": "other", + }.get(str(data.get("Entrezgene_type")), f"type_{data.get('Entrezgene_type')}"), + "chromosome": chromosome_match.group(1) if chromosome_match else None, + "genomic_location": genomic_location, + "function": function, + # Fields from accession-based queries + "title": None, + "sequence": None, + "sequence_length": None, + "gene_id": gene_id, + "molecule_type_detail": None, + "_representative_accession": representative_accession, + } + + def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]: + """Get gene information by Gene ID.""" + def _extract_from_genbank(result: dict, accession: str): + """Enrich result dictionary with sequence and summary information from accession.""" + with Entrez.efetch(db="nuccore", id=accession, rettype="gb", retmode="text") as handle: + record = SeqIO.read(handle, "genbank") + result["sequence"] = str(record.seq) + result["sequence_length"] = len(record.seq) + result["title"] = record.description + result["molecule_type_detail"] = ( + "mRNA" if accession.startswith(("NM_", "XM_")) else + "genomic DNA" if accession.startswith(("NC_", "NT_")) else + "RNA" if accession.startswith(("NR_", "XR_")) else + "genomic region" if accession.startswith("NG_") else "N/A" + ) + + for feature in record.features: + if feature.type == "source": + if 'chromosome' in feature.qualifiers: + result["chromosome"] = feature.qualifiers['chromosome'][0] + + if feature.location: + start = int(feature.location.start) + 1 + end = int(feature.location.end) + result["genomic_location"] = f"{start}-{end}" + + break + + if not result.get("organism") and 'organism' in record.annotations: + result["organism"] = record.annotations['organism'] + + return result + + try: + with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle: + gene_record = Entrez.read(handle) + if not gene_record: + return None + + result = self._gene_record_to_dict(gene_record, gene_id) + if accession := (preferred_accession or result.get("_representative_accession")): + result = _extract_from_genbank(result, accession) + + result.pop("_representative_accession", None) + return result + except (RequestException, IncompleteRead): + raise + except Exception as exc: + logger.error("Gene ID %s not found: %s", gene_id, exc) + return None + + def get_by_accession(self, accession: str) -> Optional[dict]: + """Get sequence information by accession number.""" + def _extract_gene_id(link_handle): + """Extract GeneID from elink results.""" + links = Entrez.read(link_handle) + if not links or "LinkSetDb" not in links[0]: + return None + + for link_set in links[0]["LinkSetDb"]: + if link_set.get("DbTo") != "gene": + continue + + link = (link_set.get("Link") or link_set.get("IdList", [{}]))[0] + return str(link.get("Id") if isinstance(link, dict) else link) + + try: + # TODO: support accession number with version number (e.g., NM_000546.3) + with Entrez.elink(dbfrom="nuccore", db="gene", id=accession) as link_handle: + gene_id = _extract_gene_id(link_handle) + + if not gene_id: + logger.warning("Accession %s has no associated GeneID", accession) + return None + + result = self.get_by_gene_id(gene_id, preferred_accession=accession) + if result: + result["id"] = accession + result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}" + return result + except (RequestException, IncompleteRead): + raise + except Exception as exc: + logger.error("Accession %s not found: %s", accession, exc) + return None + + def get_best_hit(self, keyword: str) -> Optional[dict]: + """Search NCBI Gene database with a keyword and return the best hit.""" + if not keyword.strip(): + return None + + try: + for search_term in [f"{keyword}[Gene] OR {keyword}[All Fields]", keyword]: + with Entrez.esearch(db="gene", term=search_term, retmax=1, sort="relevance") as search_handle: + search_results = Entrez.read(search_handle) + if len(gene_id := search_results.get("IdList", [])) > 0: + return self.get_by_gene_id(gene_id) + except (RequestException, IncompleteRead): + raise + except Exception as e: + logger.error("Keyword %s not found: %s", keyword, e) + return None + + def _local_blast(self, seq: str, threshold: float) -> Optional[str]: + """Perform local BLAST search using local BLAST database.""" + try: + with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp: + tmp.write(f">query\n{seq}\n") + tmp_name = tmp.name + + cmd = [ + "blastn", "-db", self.local_blast_db, "-query", tmp_name, + "-evalue", str(threshold), "-max_target_seqs", "1", "-outfmt", "6 sacc" + ] + logger.debug("Running local blastn: %s", " ".join(cmd)) + out = subprocess.check_output(cmd, text=True).strip() + os.remove(tmp_name) + return out.split("\n", maxsplit=1)[0] if out else None + except Exception as exc: + logger.error("Local blastn failed: %s", exc) + return None + + def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: + """Search NCBI with a DNA sequence using BLAST.""" + + def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: + """Extract and normalize DNA sequence from input.""" + if sequence.startswith(">"): + seq = "".join(sequence.strip().split("\n")[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + return seq if re.fullmatch(r"[ATCGN]+", seq, re.I) else None + + + def _process_network_blast_result(blast_record, seq: str, threshold: float) -> Optional[dict]: + """Process network BLAST result and return dictionary or None.""" + if not blast_record.alignments: + logger.info("No BLAST hits found for the given sequence.") + return None + + best_alignment = blast_record.alignments[0] + best_hsp = best_alignment.hsps[0] + if best_hsp.expect > threshold: + logger.info("No BLAST hits below the threshold E-value.") + return None + + hit_id = best_alignment.hit_id + if accession_match := re.search(r"ref\|([^|]+)", hit_id): + return self.get_by_accession(accession_match.group(1).split(".")[0]) + + # If unable to extract accession, return basic information + return { + "molecule_type": "DNA", + "database": "NCBI", + "id": hit_id, + "title": best_alignment.title, + "sequence_length": len(seq), + "e_value": best_hsp.expect, + "identity": best_hsp.identities / best_hsp.align_length if best_hsp.align_length > 0 else 0, + "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{hit_id}", + } + + try: + if not (seq := _extract_and_normalize_sequence(sequence)): + logger.error("Empty or invalid DNA sequence provided.") + return None + + # Try local BLAST first if enabled + if self.use_local_blast and (accession := self._local_blast(seq, threshold)): + logger.debug("Local BLAST found accession: %s", accession) + return self.get_by_accession(accession) + + # Fall back to network BLAST + logger.debug("Falling back to NCBIWWW.qblast") + + with NCBIWWW.qblast("blastn", "nr", seq, hitlist_size=1, expect=threshold) as result_handle: + return _process_network_blast_result(NCBIXML.read(result_handle), seq, threshold) + except (RequestException, IncompleteRead): + raise + except Exception as e: + logger.error("BLAST search failed: %s", e) + return None + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) + async def search(self, query: str, threshold: float = 0.01, **kwargs) -> Optional[Dict]: + """Search NCBI with either a gene ID, accession number, keyword, or DNA sequence.""" + if not query or not isinstance(query, str): + logger.error("Empty or non-string input.") + return None + + query = query.strip() + logger.debug("NCBI search query: %s", query) + + loop = asyncio.get_running_loop() + + # limit concurrent requests (NCBI rate limit: max 3 requests per second) + async with _ncbi_lock: + # Auto-detect query type and execute in thread pool + if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I): + result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold) + elif re.fullmatch(r"^\d+$", query): + result = await loop.run_in_executor(_get_pool(), self.get_by_gene_id, query) + elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I): + result = await loop.run_in_executor(_get_pool(), self.get_by_accession, query) + else: + result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) + + if result: + result["_search_query"] = query + return result diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py new file mode 100644 index 00000000..58c5e86e --- /dev/null +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -0,0 +1,314 @@ +import asyncio +import os +import re +import subprocess +from concurrent.futures import ThreadPoolExecutor +from functools import lru_cache +import tempfile +from typing import Dict, Optional, List, Any, Set + +import hashlib +import requests +import aiohttp +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from graphgen.bases import BaseSearcher +from graphgen.utils import logger + + +@lru_cache(maxsize=None) +def _get_pool(): + return ThreadPoolExecutor(max_workers=10) + +class RNACentralSearch(BaseSearcher): + """ + RNAcentral Search client to search RNA databases. + 1) Get RNA by RNAcentral ID. + 2) Search with keywords or RNA names (fuzzy search). + 3) Search with RNA sequence. + + API Documentation: https://rnacentral.org/api/v1 + """ + + def __init__(self, use_local_blast: bool = False, local_blast_db: str = "rna_db"): + super().__init__() + self.base_url = "https://rnacentral.org/api/v1" + self.headers = {"Accept": "application/json"} + self.use_local_blast = use_local_blast + self.local_blast_db = local_blast_db + if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"): + logger.error("Local BLAST database files not found. Please check the path.") + self.use_local_blast = False + + @staticmethod + def _rna_data_to_dict( + rna_id: str, + rna_data: Dict[str, Any], + xrefs_data: Optional[List[Dict[str, Any]]] = None + ) -> Dict[str, Any]: + organisms, gene_names, so_terms = set(), set(), set() + modifications: List[Any] = [] + + for xref in xrefs_data or []: + acc = xref.get("accession", {}) + if s := acc.get("species"): + organisms.add(s) + if g := acc.get("gene", "").strip(): + gene_names.add(g) + if m := xref.get("modifications"): + modifications.extend(m) + if b := acc.get("biotype"): + so_terms.add(b) + + def format_unique_values(values: Set[str]) -> Optional[str]: + if not values: + return None + if len(values) == 1: + return next(iter(values)) + return ", ".join(sorted(values)) + + xrefs_info = { + "organism": format_unique_values(organisms), + "gene_name": format_unique_values(gene_names), + "related_genes": list(gene_names) if gene_names else None, + "modifications": modifications or None, + "so_term": format_unique_values(so_terms), + } + + fallback_rules = { + "organism": ["organism", "species"], + "related_genes": ["related_genes", "genes"], + "gene_name": ["gene_name", "gene"], + "so_term": ["so_term"], + "modifications": ["modifications"], + } + + def resolve_field(field_name: str) -> Any: + if (value := xrefs_info.get(field_name)) is not None: + return value + + for key in fallback_rules[field_name]: + if (value := rna_data.get(key)) is not None: + return value + + return None + + organism = resolve_field("organism") + gene_name = resolve_field("gene_name") + so_term = resolve_field("so_term") + modifications = resolve_field("modifications") + + related_genes = resolve_field("related_genes") + if not related_genes and (single_gene := rna_data.get("gene_name")): + related_genes = [single_gene] + + sequence = rna_data.get("sequence", "") + + return { + "molecule_type": "RNA", + "database": "RNAcentral", + "id": rna_id, + "rnacentral_id": rna_data.get("rnacentral_id", rna_id), + "sequence": sequence, + "sequence_length": rna_data.get("length", len(sequence)), + "rna_type": rna_data.get("rna_type", "N/A"), + "description": rna_data.get("description", "N/A"), + "url": f"https://rnacentral.org/rna/{rna_id}", + "organism": organism, + "related_genes": related_genes or None, + "gene_name": gene_name, + "so_term": so_term, + "modifications": modifications, + } + + @staticmethod + def _calculate_md5(sequence: str) -> str: + """ + Calculate MD5 hash for RNA sequence as per RNAcentral spec. + - Replace U with T + - Convert to uppercase + - Encode as ASCII + """ + # Normalize sequence + normalized_seq = sequence.replace("U", "T").replace("u", "t").upper() + if not re.fullmatch(r"[ATCGN]+", normalized_seq): + raise ValueError(f"Invalid sequence characters after normalization: {normalized_seq[:50]}...") + + return hashlib.md5(normalized_seq.encode("ascii")).hexdigest() + + def get_by_rna_id(self, rna_id: str) -> Optional[dict]: + """ + Get RNA information by RNAcentral ID. + :param rna_id: RNAcentral ID (e.g., URS0000000001). + :return: A dictionary containing RNA information or None if not found. + """ + try: + url = f"{self.base_url}/rna/{rna_id}" + url += "?flat=true" + + resp = requests.get(url, headers=self.headers, timeout=30) + resp.raise_for_status() + + rna_data = resp.json() + xrefs_data = rna_data.get("xrefs", []) + return self._rna_data_to_dict(rna_id, rna_data, xrefs_data) + except requests.RequestException as e: + logger.error("Network error getting RNA ID %s: %s", rna_id, e) + return None + except Exception as e: # pylint: disable=broad-except + logger.error("Unexpected error getting RNA ID %s: %s", rna_id, e) + return None + + def get_best_hit(self, keyword: str) -> Optional[dict]: + """ + Search RNAcentral with a keyword and return the best hit. + :param keyword: The search keyword (e.g., miRNA name, RNA name). + :return: Dictionary with RNA information or None. + """ + keyword = keyword.strip() + if not keyword: + logger.warning("Empty keyword provided to get_best_hit") + return None + + try: + url = f"{self.base_url}/rna" + params = {"search": keyword, "format": "json"} + resp = requests.get(url, params=params, headers=self.headers, timeout=30) + resp.raise_for_status() + + data = resp.json() + results = data.get("results", []) + + if not results: + logger.info("No search results for keyword: %s", keyword) + return None + + first_result = results[0] + rna_id = first_result.get("rnacentral_id") + + if rna_id: + detailed = self.get_by_rna_id(rna_id) + if detailed: + return detailed + logger.debug("Using search result data for %s", rna_id or "unknown") + return self._rna_data_to_dict(rna_id or "", first_result) + + except requests.RequestException as e: + logger.error("Network error searching keyword '%s': %s", keyword, e) + return None + except Exception as e: + logger.error("Unexpected error searching keyword '%s': %s", keyword, e) + return None + + def _local_blast(self, seq: str, threshold: float) -> Optional[str]: + """Perform local BLAST search using local BLAST database.""" + try: + with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp: + tmp.write(f">query\n{seq}\n") + tmp_name = tmp.name + + cmd = [ + "blastn", "-db", self.local_blast_db, "-query", tmp_name, + "-evalue", str(threshold), "-max_target_seqs", "1", "-outfmt", "6 sacc" + ] + logger.debug("Running local blastn for RNA: %s", " ".join(cmd)) + out = subprocess.check_output(cmd, text=True).strip() + os.remove(tmp_name) + return out.split("\n", maxsplit=1)[0] if out else None + except Exception as exc: + logger.error("Local blastn failed: %s", exc) + return None + + def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: + """ + Search RNAcentral with an RNA sequence. + Tries local BLAST first if enabled, falls back to RNAcentral API. + Unified approach: Find RNA ID from sequence search, then call get_by_rna_id() for complete information. + :param sequence: RNA sequence (FASTA format or raw sequence). + :param threshold: E-value threshold for BLAST search. + :return: A dictionary containing complete RNA information or None if not found. + """ + def _extract_sequence(sequence: str) -> Optional[str]: + """Extract and normalize RNA sequence from input.""" + if sequence.startswith(">"): + seq_lines = sequence.strip().split("\n") + seq = "".join(seq_lines[1:]) + else: + seq = sequence.strip().replace(" ", "").replace("\n", "") + return seq if seq and re.fullmatch(r"[AUCGN\s]+", seq, re.I) else None + + try: + seq = _extract_sequence(sequence) + if not seq: + logger.error("Empty or invalid RNA sequence provided.") + return None + + # Try local BLAST first if enabled + if self.use_local_blast: + accession = self._local_blast(seq, threshold) + if accession: + logger.debug("Local BLAST found accession: %s", accession) + return self.get_by_rna_id(accession) + + # Fall back to RNAcentral API if local BLAST didn't find result + logger.debug("Falling back to RNAcentral API.") + + md5_hash = self._calculate_md5(seq) + search_url = f"{self.base_url}/rna" + params = {"md5": md5_hash, "format": "json"} + + resp = requests.get(search_url, params=params, headers=self.headers, timeout=60) + resp.raise_for_status() + + search_results = resp.json() + results = search_results.get("results", []) + + if not results: + logger.info("No exact match found in RNAcentral for sequence") + return None + rna_id = results[0].get("rnacentral_id") + if not rna_id: + logger.error("No RNAcentral ID found in search results.") + return None + return self.get_by_rna_id(rna_id) + except Exception as e: + logger.error("Sequence search failed: %s", e) + return None + + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=10), + retry=retry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError)), + reraise=True, + ) + async def search(self, query: str, threshold: float = 0.1, **kwargs) -> Optional[Dict]: + """Search RNAcentral with either an RNAcentral ID, keyword, or RNA sequence.""" + if not query or not isinstance(query, str): + logger.error("Empty or non-string input.") + return None + + query = query.strip() + logger.debug("RNAcentral search query: %s", query) + + loop = asyncio.get_running_loop() + + # check if RNA sequence (AUCG characters, contains U) + if query.startswith(">") or ( + re.fullmatch(r"[AUCGN\s]+", query, re.I) and "U" in query.upper() + ): + result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold) + # check if RNAcentral ID (typically starts with URS) + elif re.fullmatch(r"URS\d+", query, re.I): + result = await loop.run_in_executor(_get_pool(), self.get_by_rna_id, query) + else: + # otherwise treat as keyword + result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) + + if result: + result["_search_query"] = query + return result diff --git a/graphgen/operators/search/search_all.py b/graphgen/operators/search/search_all.py index 6c543dbf..6017cfee 100644 --- a/graphgen/operators/search/search_all.py +++ b/graphgen/operators/search/search_all.py @@ -27,6 +27,10 @@ async def search_all( data_sources = search_config.get("data_sources", []) for data_source in data_sources: + data = list(seed_data.values()) + data = [d["content"] for d in data if "content" in d] + data = list(set(data)) # Remove duplicates + if data_source == "uniprot": from graphgen.models import UniProtSearch @@ -34,19 +38,46 @@ async def search_all( **search_config.get("uniprot_params", {}) ) - data = list(seed_data.values()) - data = [d["content"] for d in data if "content" in d] - data = list(set(data)) # Remove duplicates uniprot_results = await run_concurrent( uniprot_search_client.search, data, desc="Searching UniProt database", unit="keyword", ) + results[data_source] = uniprot_results + + elif data_source == "ncbi": + from graphgen.models import NCBISearch + + ncbi_search_client = NCBISearch( + **search_config.get("ncbi_params", {}) + ) + + ncbi_results = await run_concurrent( + ncbi_search_client.search, + data, + desc="Searching NCBI database", + unit="keyword", + ) + results[data_source] = ncbi_results + + elif data_source == "rnacentral": + from graphgen.models import RNACentralSearch + + rnacentral_search_client = RNACentralSearch( + **search_config.get("rnacentral_params", {}) + ) + + rnacentral_results = await run_concurrent( + rnacentral_search_client.search, + data, + desc="Searching RNAcentral database", + unit="keyword", + ) + results[data_source] = rnacentral_results + else: logger.error("Data source %s not supported.", data_source) continue - results[data_source] = uniprot_results - return results diff --git a/requirements.txt b/requirements.txt index 47965013..fa2b1efc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,6 +21,7 @@ fastapi trafilatura aiohttp diskcache +socksio leidenalg igraph diff --git a/resources/input_examples/search_dna_demo.jsonl b/resources/input_examples/search_dna_demo.jsonl new file mode 100644 index 00000000..346b65f0 --- /dev/null +++ b/resources/input_examples/search_dna_demo.jsonl @@ -0,0 +1,9 @@ +{"type": "text", "content": "TP53"} +{"type": "text", "content": "BRCA1"} +{"type": "text", "content": "672"} +{"type": "text", "content": "11998"} +{"type": "text", "content": "NM_000546"} +{"type": "text", "content": "NM_024140"} +{"type": "text", "content": ">query\nCTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"} +{"type": "text", "content": "CTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACTCCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCGCACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGAGCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCCTTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGCCCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCTACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTGTTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCCTTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAACAAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTAAGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATCAGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTCTCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGTTAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTGAGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTAGAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACCTAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCACCAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGACTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCCCCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCCAACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCAGGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAGGGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTTTATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCA"} + diff --git a/resources/input_examples/search_demo.jsonl b/resources/input_examples/search_protein_demo.jsonl similarity index 77% rename from resources/input_examples/search_demo.jsonl rename to resources/input_examples/search_protein_demo.jsonl index 6409a805..e119cec8 100644 --- a/resources/input_examples/search_demo.jsonl +++ b/resources/input_examples/search_protein_demo.jsonl @@ -1,3 +1,12 @@ +{"type": "text", "content": "P01308"} +{"type": "text", "content": "P68871"} +{"type": "text", "content": "P02768"} +{"type": "text", "content": "P04637"} +{"type": "text", "content": "insulin"} +{"type": "text", "content": "hemoglobin"} +{"type": "text", "content": "p53"} +{"type": "text", "content": "BRCA1"} +{"type": "text", "content": "albumin"} {"type": "text", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} {"type": "text", "content": "MGSSHHHHHHSQDLENLYFQGSMNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRTKRVITTFRTGTWDAYKNLRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV"} {"type": "text", "content": "MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDAAAVEATWIAKIKAAKSKYEAEAIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKERGVKVLHLQA"} diff --git a/resources/input_examples/search_rna_demo.jsonl b/resources/input_examples/search_rna_demo.jsonl new file mode 100644 index 00000000..16e99479 --- /dev/null +++ b/resources/input_examples/search_rna_demo.jsonl @@ -0,0 +1,5 @@ +{"type": "text", "content": "hsa-let-7a-1"} +{"type": "text", "content": "URS0000123456"} +{"type": "text", "content": "URS0000000001"} +{"type": "text", "content": ">query\nCUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} +{"type": "text", "content": "CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} diff --git a/scripts/search/build_db/build_dna_blast_db.sh b/scripts/search/build_db/build_dna_blast_db.sh new file mode 100755 index 00000000..b53b4249 --- /dev/null +++ b/scripts/search/build_db/build_dna_blast_db.sh @@ -0,0 +1,178 @@ +#!/bin/bash + +set -e + +# Downloads NCBI RefSeq nucleotide sequences and creates BLAST databases. +# +# RefSeq 目录结构说明(按生物分类组织): +# - vertebrate_mammalian (哺乳动物) +# - vertebrate_other (其他脊椎动物) +# - bacteria (细菌) +# - archaea (古菌) +# - fungi (真菌) +# - invertebrate (无脊椎动物) +# - plant (植物) +# - viral (病毒) +# - protozoa (原生动物) +# - mitochondrion (线粒体) +# - plastid (质体) +# - plasmid (质粒) +# - other (其他) +# - complete/ (完整基因组,包含所有分类) +# +# 每个分类目录下包含: +# - {category}.{number}.genomic.fna.gz (基因组序列) +# - {category}.{number}.rna.fna.gz (RNA序列) +# +# Usage: ./build_dna_blast_db.sh [representative|complete|all] +# representative: Download genomic sequences from major categories (recommended, smaller) +# Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi +# complete: Download all complete genomic sequences from complete/ directory (very large) +# all: Download all genomic sequences from all categories (very large) +# +# We need makeblastdb on our PATH +# For Ubuntu/Debian: sudo apt install ncbi-blast+ +# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ +# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ + +DOWNLOAD_TYPE=${1:-representative} + +# Better to use a stable DOWNLOAD_TMP name to support resuming downloads +DOWNLOAD_TMP=_downloading_dna +mkdir -p ${DOWNLOAD_TMP} +cd ${DOWNLOAD_TMP} + +# Download RefSeq release information +echo "Downloading RefSeq release information..." +wget -c "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER" || { + echo "Warning: Could not download RELEASE_NUMBER, using current date as release identifier" + RELEASE=$(date +%Y%m%d) +} + +if [ -f "RELEASE_NUMBER" ]; then + RELEASE=$(cat RELEASE_NUMBER | tr -d '\n') + echo "RefSeq release: ${RELEASE}" +else + RELEASE=$(date +%Y%m%d) + echo "Using date as release identifier: ${RELEASE}" +fi + +# Download based on type +case ${DOWNLOAD_TYPE} in + representative) + echo "Downloading RefSeq representative sequences (recommended, smaller size)..." + # Download major categories for representative coverage + # Note: You can modify this list based on your specific requirements + for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi; do + echo "Downloading ${category} sequences..." + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ + grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' | \ + while read filename; do + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + done + done + ;; + complete) + echo "Downloading RefSeq complete genomic sequences (WARNING: very large, may take hours)..." + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/" | \ + grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' | \ + while read filename; do + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + done + ;; + all) + echo "Downloading all RefSeq genomic sequences from all categories (WARNING: extremely large, may take many hours)..." + # Download genomic sequences from all categories + for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral protozoa mitochondrion plastid plasmid other; do + echo "Downloading ${category} genomic sequences..." + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ + grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' | \ + while read filename; do + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + done + done + ;; + *) + echo "Error: Unknown download type '${DOWNLOAD_TYPE}'" + echo "Usage: $0 [representative|complete|all]" + echo "Note: For RNA sequences, use build_rna_blast_db.sh instead" + exit 1 + ;; +esac + +cd .. + +# Create release directory +mkdir -p refseq_${RELEASE} +mv ${DOWNLOAD_TMP}/* refseq_${RELEASE}/ 2>/dev/null || true +rmdir ${DOWNLOAD_TMP} 2>/dev/null || true + +cd refseq_${RELEASE} + +# Extract and combine sequences +echo "Extracting and combining sequences..." + +# Extract all downloaded genomic sequences +if [ $(find . -name "*.genomic.fna.gz" -type f | wc -l) -gt 0 ]; then + echo "Extracting genomic sequences..." + find . -name "*.genomic.fna.gz" -type f -exec gunzip {} \; +fi + +# Combine all FASTA files into one +echo "Combining all FASTA files..." +FASTA_FILES=$(find . -name "*.fna" -type f) +if [ -z "$FASTA_FILES" ]; then + FASTA_FILES=$(find . -name "*.fa" -type f) +fi + +if [ -z "$FASTA_FILES" ]; then + echo "Error: No FASTA files found to combine" + exit 1 +fi + +echo "$FASTA_FILES" | while read -r file; do + if [ -f "$file" ]; then + cat "$file" >> refseq_${RELEASE}.fasta + fi +done + +# Check if we have sequences +if [ ! -s "refseq_${RELEASE}.fasta" ]; then + echo "Error: Combined FASTA file is empty" + exit 1 +fi + +echo "Creating BLAST database..." +# Create BLAST database for DNA sequences (use -dbtype nucl for nucleotide) +makeblastdb -in refseq_${RELEASE}.fasta \ + -out refseq_${RELEASE} \ + -dbtype nucl \ + -parse_seqids \ + -title "RefSeq_${RELEASE}" + +echo "BLAST database created successfully!" +echo "Database location: $(pwd)/refseq_${RELEASE}" +echo "" +echo "To use this database, set in your config:" +echo " local_blast_db: $(pwd)/refseq_${RELEASE}" +echo "" +echo "Note: The database files are:" +ls -lh refseq_${RELEASE}.* + +cd .. + diff --git a/scripts/search/build_db/build_protein_blast_db.sh b/scripts/search/build_db/build_protein_blast_db.sh new file mode 100755 index 00000000..9292875a --- /dev/null +++ b/scripts/search/build_db/build_protein_blast_db.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +set -e + +# Downloads the latest release of UniProt, putting it in a release-specific directory. +# Creates associated BLAST databases. +# We need makeblastdb on our PATH +# For Ubuntu/Debian: sudo apt install ncbi-blast+ +# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ +# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ + +# Better to use a stable DOWNLOAD_TMP name to support resuming downloads +DOWNLOAD_TMP=_downloading +mkdir -p ${DOWNLOAD_TMP} +cd ${DOWNLOAD_TMP} + +wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/RELEASE.metalink" + +# Extract the release name (like 2017_10 or 2017_1) +# Use sed for cross-platform compatibility (works on both macOS and Linux) +RELEASE=$(sed -n 's/.*\([0-9]\{4\}_[0-9]\{1,2\}\)<\/version>.*/\1/p' RELEASE.metalink | head -1) + +wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" +wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" +wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt" +wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/README" +wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/LICENSE" + +cd .. + +mkdir ${RELEASE} +mv ${DOWNLOAD_TMP}/* ${RELEASE} +rmdir ${DOWNLOAD_TMP} + +cd ${RELEASE} + +gunzip uniprot_sprot.fasta.gz +gunzip uniprot_trembl.fasta.gz + +cat uniprot_sprot.fasta uniprot_trembl.fasta >uniprot_${RELEASE}.fasta + +makeblastdb -in uniprot_${RELEASE}.fasta -out uniprot_${RELEASE} -dbtype prot -parse_seqids -title uniprot_${RELEASE} +makeblastdb -in uniprot_sprot.fasta -out uniprot_sprot -dbtype prot -parse_seqids -title uniprot_sprot +makeblastdb -in uniprot_trembl.fasta -out uniprot_trembl -dbtype prot -parse_seqids -title uniprot_trembl + +cd .. + +echo "BLAST databases created successfully!" +echo "Database locations:" +echo " - Combined: $(pwd)/${RELEASE}/uniprot_${RELEASE}" +echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot" +echo " - TrEMBL: $(pwd)/${RELEASE}/uniprot_trembl" +echo "" +echo "To use these databases, set in your config:" +echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot # or uniprot_${RELEASE} or uniprot_trembl" + diff --git a/scripts/search/build_db/build_rna_blast_db.sh b/scripts/search/build_db/build_rna_blast_db.sh new file mode 100755 index 00000000..89b9dc0e --- /dev/null +++ b/scripts/search/build_db/build_rna_blast_db.sh @@ -0,0 +1,157 @@ +#!/bin/bash + +set -e + +# Downloads NCBI RefSeq RNA sequences and creates BLAST databases. +# This script specifically downloads RNA sequences (mRNA, rRNA, tRNA, etc.) +# from RefSeq, which is suitable for RNA sequence searches. +# +# Usage: ./build_rna_blast_db.sh [representative|complete|all] +# representative: Download RNA sequences from major categories (recommended, smaller) +# Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi, invertebrate, plant, viral +# complete: Download all RNA sequences from complete/ directory (very large) +# all: Download all RNA sequences from all categories (very large) +# +# We need makeblastdb on our PATH +# For Ubuntu/Debian: sudo apt install ncbi-blast+ +# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ +# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ + +DOWNLOAD_TYPE=${1:-representative} + +# Better to use a stable DOWNLOAD_TMP name to support resuming downloads +DOWNLOAD_TMP=_downloading_rna +mkdir -p ${DOWNLOAD_TMP} +cd ${DOWNLOAD_TMP} + +# Download RefSeq release information +echo "Downloading RefSeq release information..." +wget -c "https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER" || { + echo "Warning: Could not download RELEASE_NUMBER, using current date as release identifier" + RELEASE=$(date +%Y%m%d) +} + +if [ -f "RELEASE_NUMBER" ]; then + RELEASE=$(cat RELEASE_NUMBER | tr -d '\n') + echo "RefSeq release: ${RELEASE}" +else + RELEASE=$(date +%Y%m%d) + echo "Using date as release identifier: ${RELEASE}" +fi + +# Download based on type +case ${DOWNLOAD_TYPE} in + representative) + echo "Downloading RefSeq representative RNA sequences (recommended, smaller size)..." + echo "Downloading RNA sequences from major categories..." + for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral; do + echo "Downloading ${category} RNA sequences..." + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ + grep -oE 'href="[^"]*\.rna\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' | \ + while read filename; do + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + done + done + ;; + complete) + echo "Downloading RefSeq complete RNA sequences (WARNING: very large, may take hours)..." + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/" | \ + grep -oE 'href="[^"]*\.rna\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' | \ + while read filename; do + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + done + ;; + all) + echo "Downloading all RefSeq RNA sequences from all categories (WARNING: extremely large, may take many hours)..." + for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral protozoa mitochondrion plastid plasmid other; do + echo "Downloading ${category} RNA sequences..." + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ + grep -oE 'href="[^"]*\.rna\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' | \ + while read filename; do + echo " Downloading ${filename}..." + wget -c -q --show-progress \ + "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { + echo "Warning: Failed to download ${filename}" + } + done + done + ;; + *) + echo "Error: Unknown download type '${DOWNLOAD_TYPE}'" + echo "Usage: $0 [representative|complete|all]" + exit 1 + ;; +esac + +cd .. + +# Create release directory +mkdir -p refseq_rna_${RELEASE} +mv ${DOWNLOAD_TMP}/* refseq_rna_${RELEASE}/ 2>/dev/null || true +rmdir ${DOWNLOAD_TMP} 2>/dev/null || true + +cd refseq_rna_${RELEASE} + +# Extract and combine sequences +echo "Extracting and combining RNA sequences..." + +# Extract all downloaded RNA sequences +if [ $(find . -name "*.rna.fna.gz" -type f | wc -l) -gt 0 ]; then + echo "Extracting RNA sequences..." + find . -name "*.rna.fna.gz" -type f -exec gunzip {} \; +fi + +# Combine all FASTA files into one +echo "Combining all FASTA files..." +FASTA_FILES=$(find . -name "*.fna" -type f) +if [ -z "$FASTA_FILES" ]; then + FASTA_FILES=$(find . -name "*.fa" -type f) +fi + +if [ -z "$FASTA_FILES" ]; then + echo "Error: No FASTA files found to combine" + exit 1 +fi + +echo "$FASTA_FILES" | while read -r file; do + if [ -f "$file" ]; then + cat "$file" >> refseq_rna_${RELEASE}.fasta + fi +done + +# Check if we have sequences +if [ ! -s "refseq_rna_${RELEASE}.fasta" ]; then + echo "Error: Combined FASTA file is empty" + exit 1 +fi + +echo "Creating BLAST database..." +# Create BLAST database for RNA sequences (use -dbtype nucl for nucleotide) +makeblastdb -in refseq_rna_${RELEASE}.fasta \ + -out refseq_rna_${RELEASE} \ + -dbtype nucl \ + -parse_seqids \ + -title "RefSeq_RNA_${RELEASE}" + +echo "BLAST database created successfully!" +echo "Database location: $(pwd)/refseq_rna_${RELEASE}" +echo "" +echo "To use this database, set in your config:" +echo " local_blast_db: $(pwd)/refseq_rna_${RELEASE}" +echo "" +echo "Note: The database files are:" +ls -lh refseq_rna_${RELEASE}.* + +cd .. + diff --git a/scripts/search/search_dna.sh b/scripts/search/search_dna.sh new file mode 100644 index 00000000..5b82fdd6 --- /dev/null +++ b/scripts/search/search_dna.sh @@ -0,0 +1,4 @@ +python3 -m graphgen.run \ +--config_file graphgen/configs/search_dna_config.yaml \ +--output_dir cache/ + diff --git a/scripts/search/search_rna.sh b/scripts/search/search_rna.sh new file mode 100644 index 00000000..260499b3 --- /dev/null +++ b/scripts/search/search_rna.sh @@ -0,0 +1,4 @@ +python3 -m graphgen.run \ +--config_file graphgen/configs/search_rna_config.yaml \ +--output_dir cache/ + diff --git a/scripts/search/search_uniprot.sh b/scripts/search/search_uniprot.sh index 642040af..7b295f8d 100644 --- a/scripts/search/search_uniprot.sh +++ b/scripts/search/search_uniprot.sh @@ -1,3 +1,3 @@ python3 -m graphgen.run \ ---config_file graphgen/configs/search_config.yaml \ +--config_file graphgen/configs/search_protein_config.yaml \ --output_dir cache/