In [1]:
import os
import json
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

# Configuration
SPARQL_ENDPOINT = "http://localhost:9999/blazegraph/sparql"
OUTPUT_DIR = "disease_exports"
# Corrected ORDO identifiers based on Orpha numbers
DISEASES = {
    "Cystic_Fibrosis": "Orphanet_586",      # ORPHA:586 for cystic fibrosis
    "Huntingtons_Disease": "Orphanet_399",   # ORPHA:399 for Huntington disease
    # Add your other 2–20 diseases here, using Orphanet_<ORPHA_ID>
}

# OMIM->Gene mapping file (placed at notebook/project root)
MIM2GENE_FILE = "mim2gene.txt"

# Read OMIM to gene table
def load_mim2gene(path: str) -> pd.DataFrame:
    cols = ["mim", "entrez_id", "symbol", "status"]
    df = (
        pd.read_csv(path, sep="\t", names=cols, comment="#", dtype=str)
          .query("status == '*' or status == ''")
          .drop(columns=["status"])
    )
    return df

# Define the queries you want to run per disease
QUERIES = {
    "genes": {
        "description": "Associated genes",
        "sparql": lambda ido: f"""
PREFIX ORDO: <http://www.orpha.net/ORDO/>
PREFIX hoom: <http://www.humanontologies.org/HOOM/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?geneLabel WHERE {{
  ORDO:{ido} hoom:has_gene ?gene .
  ?gene rdfs:label ?geneLabel .
}}"""
    },
    "phenotypes": {
        "description": "Phenotypic features",
        "sparql": lambda ido: f"""
PREFIX ORDO: <http://www.orpha.net/ORDO/>
PREFIX hoom: <http://www.humanontologies.org/HOOM/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?phenotypeLabel WHERE {{
  ORDO:{ido} hoom:has_phenotypic_feature ?phen .
  ?phen rdfs:label ?phenotypeLabel .
}}"""
    },
    "therapeutics": {
        "description": "Therapeutic procedures",
        "sparql": lambda ido: f"""
PREFIX ORDO: <http://www.orpha.net/ORDO/>
PREFIX hoom: <http://www.humanontologies.org/HOOM/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?treatmentLabel WHERE {{
  ORDO:{ido} hoom:has_therapeutic_procedure ?treatment .
  ?treatment rdfs:label ?treatmentLabel .
}}"""
    }
}


def run_query(endpoint: str, sparql: str) -> list:
    wrapper = SPARQLWrapper(endpoint)
    wrapper.setQuery(sparql)
    wrapper.setReturnFormat(JSON)
    return wrapper.query().convert().get("results", {}).get("bindings", [])


def enrich_with_omim(summary: dict, m2g: pd.DataFrame) -> None:
    """
    Add OMIM->gene mappings to summary dict in-place, under 'omim_genes'.
    Expects summary['xrefs'] like ['OMIM:219700', ...].
    """
    omim_ids = []
    for x in summary.get('xrefs', []):
        if x.startswith('OMIM:'):
            omim_ids.append(x.split(':', 1)[1])

    entries = []
    for mim in omim_ids:
        matches = m2g[m2g['mim'] == mim]
        for _, row in matches.iterrows():
            entries.append({
                'mim': mim,
                'entrez_id': row['entrez_id'],
                'symbol': row['symbol']
            })
    summary['omim_genes'] = entries


def export_for_disease(name: str, ordo_id: str, m2g: pd.DataFrame):
    disease_dir = os.path.join(OUTPUT_DIR, name)
    os.makedirs(disease_dir, exist_ok=True)

    print(f"\n=== Exporting: {name} ({ordo_id}) ===")
    # Load existing summary or initialize
    summary_path = os.path.join(disease_dir, "summary.json")
    if os.path.exists(summary_path):
        with open(summary_path) as fh:
            summary = json.load(fh)
    else:
        summary = {"disease": name, "ordo_id": ordo_id, "data": {}, "counts": {}}

    # Enrich with OMIM genes
    enrich_with_omim(summary, m2g)
    print(f"  -> Added {len(summary.get('omim_genes', []))} OMIM gene mappings")

    # Write enriched summary
    enriched_path = os.path.join(disease_dir, "summary_enriched.json")
    with open(enriched_path, "w", encoding="utf-8") as fh:
        json.dump(summary, fh, indent=2)
    print(f"  -> Enriched summary written to {enriched_path}")


def main():
    # Load OMIM->gene table
    if not os.path.exists(MIM2GENE_FILE):
        raise FileNotFoundError(f"Cannot find {MIM2GENE_FILE} in current directory")
    m2g = load_mim2gene(MIM2GENE_FILE)
    print(f"Loaded {len(m2g)} mappings from {MIM2GENE_FILE}")

    # Enrich each disease
    for disease_name, ordo_id in DISEASES.items():
        export_for_disease(disease_name, ordo_id, m2g)
    print("\nAll diseases enriched with OMIM genes.")

if __name__ == "__main__":
    main()


Loaded 0 mappings from mim2gene.txt

=== Exporting: Cystic_Fibrosis (Orphanet_586) ===
  -> Added 0 OMIM gene mappings
  -> Enriched summary written to disease_exports\Cystic_Fibrosis\summary_enriched.json

=== Exporting: Huntingtons_Disease (Orphanet_399) ===
  -> Added 0 OMIM gene mappings
  -> Enriched summary written to disease_exports\Huntingtons_Disease\summary_enriched.json

All diseases enriched with OMIM genes.
