# MeSH(Medical Subject Headings) vocabulary thesaurus.
This is a scope map which performs various tasks for our model:
1. Helps for **Keyword harvesting** by making use of `harvester_modifiers` _(Qualifiers)_
2. Helps for **detection** by making use of `semantic_ground_truth` _(ScopeNote)_
3. Helps for **tracing** by making use of `historical_trace` _(HistoryNote)_
4. Helps for **tracing** by making use of `date_introduced`

## Dependencies

In [1]:
import xml.etree.ElementTree as ET
from collections import defaultdict
import json
from datetime import datetime
from pathlib import Path
from typing import List, Union, Dict

## Storage directory configurations

In [2]:
STORAGE_DIR = Path.cwd().parent.parent / "storage"
STORAGE_DIR.mkdir(exist_ok=True)

SEED_DIR=STORAGE_DIR / "seeds"
SEED_DIR

WindowsPath('C:/Users/Aman Sheikh/Desktop/Projects/VeriFact/Model/harvester/storage/seeds')

## Helper

In [3]:
def get_mesh_date(element):
    if element is None: return None
    y = element.findtext('Year')
    m = element.findtext('Month')
    d = element.findtext('Day')
    return f"{y}-{m}-{d}"

## Allowed Branches

In [4]:
# Define your scope based on your requirements
# ALLOWED_BRANCHES = (
#     'C01', 'C02', 'C03', 'C04', 'C23', # Diseases & Toxicology
#     'D',                               # Chemicals & Drugs
#     'E01', 'E02', 'E05',               # Therapeutics & Diagnostics
#     'N01', 'N02',                      # Public Health
#     'M01', 'M02',                      # Population
#     'G'                                # Life Sciences
# )
ALLOWED_BRANCHES = (

    # =========================
    # DISEASES (Core)
    # =========================
    'C01',  # Bacterial Infections and Mycoses
    'C02',  # Viral Diseases
    'C03',  # Parasitic Diseases
    'C04',  # Neoplasms (Cancer)
    'C05',  # Musculoskeletal Diseases
    'C06',  # Digestive System Diseases
    'C07',  # Stomatognathic Diseases
    'C08',  # Nervous System Diseases
    'C09',  # Eye Diseases
    'C10',  # Ear, Nose, and Throat Diseases
    'C11',  # Respiratory Tract Diseases
    'C12',  # Urogenital Diseases
    'C13',  # Cardiovascular Diseases
    'C14',  # Hemic and Lymphatic Diseases
    'C15',  # Musculoskeletal Diseases
    'C16',  # Congenital, Hereditary, Neonatal Diseases
    'C17',  # Skin and Connective Tissue Diseases
    'C18',  # Nutritional and Metabolic Diseases
    'C19',  # Endocrine System Diseases
    'C20',  # Immune System Diseases
    'C21',  # Disorders of Environmental Origin
    'C22',  # Animal Diseases (zoonotic narratives)
    'C23',  # Pathological Conditions, Signs and Symptoms
    'C24',  # Occupational Diseases
    'C25',  # Chemically-Induced Disorders
    'C26',  # Wounds and Injuries

    # =========================
    # CHEMICALS, DRUGS, SUPPLEMENTS
    # =========================
    'D',     # Chemicals and Drugs (explode)
    'D03',  # Anti-Infective Agents
    'D04',  # Central Nervous System Agents
    'D06',  # Hormones and Hormone Substitutes
    'D10',  # Dietary Supplements
    'D12',  # Amino Acids, Peptides, Proteins
    'D20',  # Complex Mixtures (herbal products)
    'D23',  # Biological Factors

    # =========================
    # DIAGNOSTICS & THERAPEUTICS
    # =========================
    'E01',  # Diagnosis
    'E02',  # Diagnostic Techniques and Procedures
    'E05',  # Investigative Techniques
    'E06',  # Therapeutics (critical for cure claims)

    # =========================
    # PSYCHOLOGY & BEHAVIOR
    # =========================
    'F01',  # Behavior and Behavior Mechanisms
    'F04',  # Behavioral Disciplines and Activities
    'F05',  # Mental Disorders

    # =========================
    # PUBLIC HEALTH & HEALTH SYSTEMS
    # =========================
    'N01',  # Population Health
    'N02',  # Health Care Administration
    'N04',  # Health Services (hospitals, doctors, systems)

    # =========================
    # POPULATION GROUPS
    # =========================
    'M01',  # Persons
    'M02',  # Population Groups

    # =========================
    # BIOLOGICAL PHENOMENA
    # =========================
    'G',     # Phenomena and Processes
    'G04',  # Biological Phenomena
    'G12',  # Immunologic Phenomena
)


## Parse MeSH

In [5]:
def parse_mesh_xml(xml_path):
    seeds = []

    context = ET.iterparse(xml_path, events=('end',))

    for event, elem in context:
        if elem.tag != 'DescriptorRecord':
            continue

        tree_numbers = [tn.text for tn in elem.findall('./TreeNumberList/TreeNumber')]

        # 1. Branch Filtering
        if not any(tn.startswith(ALLOWED_BRANCHES) for tn in tree_numbers):
            elem.clear()
            continue

        ui = elem.findtext('DescriptorUI')
        name = elem.findtext('./DescriptorName/String')

        # ---------- Temporal metadata ----------
        date_intro = get_mesh_date(elem.find('DateIntroduced'))
        last_update = get_mesh_date(elem.find('LastUpdated'))

        # ---------- Historical trace ----------
        history_note = elem.findtext('HistoryNote')
        prev_indexing = [
            pi.text for pi in elem.findall('./PreviousIndexingList/PreviousIndexing')
        ]

        # ---------- Semantic ground truth ----------
        scope_note = elem.findtext(
            './/Concept[@PreferredConceptYN="Y"]/ScopeNote'
        )

        # ---------- Synonyms ----------
        synonyms = list(
            set(t.text for t in elem.findall('.//TermList/Term/String') if t.text)
        )

        # ---------- Qualifiers / modifiers ----------
        qualifiers = [
            q.findtext('./QualifierReferredTo/QualifierName/String')
            for q in elem.findall('.//AllowableQualifier')
            if q.findtext('./QualifierReferredTo/QualifierName/String')
        ]

        # ---------- Pharmacological actions ----------
        actions = [
            pa.findtext('.//DescriptorName/String')
            for pa in elem.findall('.//PharmacologicalAction')
            if pa.findtext('.//DescriptorName/String')
        ]

        # ---------- Seed schema ----------
        seed = {
            # ---------- Core identity ----------
            "seed_id": f"mesh:{ui}",
            "label": name.lower(),
            "canonical_label": name,
            "aliases": [],  # populated later (e.g., from consumer vocabularies)

            # ---------- Semantics ----------
            "semantic_ground_truth": scope_note,

            # ---------- Temporal ----------
            "temporal_tracking": {
                "date_introduced": date_intro,
                "last_medical_update": last_update,
                "status": (
                    "established"
                    if date_intro and int(date_intro[:4]) < 2020
                    else "emerging"
                )
            },

            # ---------- Historical ----------
            "historical_trace": {
                "history_note": history_note,
                "previous_indexing": prev_indexing
            },

            # ---------- Biomedical context ----------
            "pharmacological_actions": actions,
            "synonyms": synonyms,

            # ---------- Concept grounding & SapBERT ----------
            "keyword_candidates": [],  # populated during grounding
            "preferred_search_terms": [],
            "excluded_terms": [],

            "sapbert_metadata": {
                "model_name": "",
                "label_embedding_ref": "",
                "candidate_embeddings_ref": "",
                "embedding_dim": None
            },

            # ---------- Retrieval ----------
            "harvester_modifiers": qualifiers,

            "modifier_groups": {},

            "query_templates": [],

            "retrieval_params": {
                "top_k_keywords": None,
                "retmax_stageA": None,
                "retmax_stageB": None,
                "min_keyword_score": None,
                "title_threshold": None,
                "combined_score_threshold_for_biobert": None
            },

            # ---------- Evidence index ----------
            "evidence_index": [],

            # ---------- Downstream analysis ----------
            "claim_templates": [],

            "analysis": {
                "biobert_runs": [],
                "claim_verdicts": []
            },

            # ---------- Administrative ----------
            "mesh_tree_numbers": tree_numbers,
            "tags": list(set(tn[:3] for tn in tree_numbers)),
            "top_level_branches": list(set(tn[0] for tn in tree_numbers)),
            "seed_level_notes": "",
            "created_at": datetime.now().isoformat() + "Z",
            "last_updated_in_db": datetime.now().isoformat() + "Z",
            "updated_at": ""
        }

        seeds.append(seed)
        elem.clear()

    return seeds


## Example of an element of the seed:

```json
 {
  "seed_id": "mesh:D000721",
  "label": "ancrod",
  "temporal_tracking": {"date_introduced": "1978-01-01",
   "last_medical_update": "2016-07-05",
   "status": "established"},
  "historical_trace": {"history_note": "78; was ARVIN see under ENDOPEPTIDASES 1975-77; ARVIN was see ANCROD 1978-95\n  ",
   "previous_indexing": ["Peptide Hydrolases (1972-1974)",
    "Peptide Peptidohydrolases (1966-1974)",
    "Venoms (1972-1974)"]},
  "semantic_ground_truth": "An enzyme fraction from the venom of the Malayan pit viper, Agkistrodon rhodostoma. It catalyzes the hydrolysis of a number of amino acid esters and a limited proteolysis of fibrinogen. It is used clinically to produce controlled defibrination in patients requiring anticoagulant therapy. EC 3.4.21.-.\n    ",
  "pharmacological_actions": ["Anticoagulants", "Fibrinolytic Agents"],
  "synonyms": ["Venacil",
   "Agkistrodon rhodostoma Venom Protease",
   "Arwin",
   "Ancrod",
   "Arvin",
   "Arvin IRC50",
   "Arvin IRC 50",
   "Arvin IRC-50",
   "Agkistrodon Serine Proteinase"],
  "harvester_modifiers": ["administration & dosage",
   "adverse effects",
   "analysis",
   "antagonists & inhibitors",
   "biosynthesis",
   "blood",
   "cerebrospinal fluid",
   "chemical synthesis",
   "classification",
   "deficiency",
   "drug effects",
   "economics",
   "genetics",
   "history",
   "immunology",
   "isolation & purification",
   "metabolism",
   "pharmacokinetics",
   "pharmacology",
   "physiology",
   "poisoning",
   "radiation effects",
   "standards",
   "supply & distribution",
   "therapeutic use",
   "toxicity",
   "ultrastructure",
   "urine",
   "chemistry"],
  "mesh_tree_numbers": ["D08.811.277.656.300.760.955.060",
   "D08.811.277.656.959.350.955.060",
   "D20.888.850.960.200.050",
   "D23.946.833.850.960.200.050"],
  "tags": ["D08", "D08", "D20", "D23"],
  "last_updated_in_db": "2026-01-13T15:04:53.036487Z"
}
```

## Save the seed generated

In [6]:
def save_seeds_by_mesh_branch(
    seeds: List[Dict],
    storage_path: Union[str, Path],
    filename_prefix: str = "mesh_seed"
) -> Dict[str, Path]:
    """
    Saves seeds into separate JSON files per top-level MeSH branch.
    Example:
        mesh_seed_C.json
        mesh_seed_D.json
        mesh_seed_G.json

    Returns:
        Dict mapping branch letter -> file path
    """

    base_dir = Path(storage_path)
    base_dir.mkdir(parents=True, exist_ok=True)

    # Group seeds by branch
    branch_buckets = defaultdict(list)

    for seed in seeds:
        for branch in seed.get("top_level_branches", []):
            branch_buckets[branch].append(seed)

    saved_files = {}

    for branch, branch_seeds in branch_buckets.items():
        file_path = base_dir / f"{filename_prefix}_{branch}.json"

        try:
            with open(file_path, "w", encoding="utf-8") as f:
                json.dump(branch_seeds, f, indent=4, ensure_ascii=False)

            print(f"✅ Saved {len(branch_seeds)} seeds → {file_path}")
            saved_files[branch] = file_path

        except Exception as e:
            print(f"❌ Error saving branch {branch}: {e}")

    return saved_files

## Usage

In [7]:
seeds = parse_mesh_xml('../../desc2026.xml')
print(f"Generated {len(seeds)} seeds within your scope.")

filesSaved = save_seeds_by_mesh_branch(seeds, SEED_DIR)

Generated 21887 seeds within your scope.
✅ Saved 10688 seeds → C:\Users\Aman Sheikh\Desktop\Projects\VeriFact\Model\harvester\storage\seeds\mesh_seed_D.json
✅ Saved 5069 seeds → C:\Users\Aman Sheikh\Desktop\Projects\VeriFact\Model\harvester\storage\seeds\mesh_seed_C.json
✅ Saved 1400 seeds → C:\Users\Aman Sheikh\Desktop\Projects\VeriFact\Model\harvester\storage\seeds\mesh_seed_N.json
✅ Saved 2513 seeds → C:\Users\Aman Sheikh\Desktop\Projects\VeriFact\Model\harvester\storage\seeds\mesh_seed_G.json
✅ Saved 361 seeds → C:\Users\Aman Sheikh\Desktop\Projects\VeriFact\Model\harvester\storage\seeds\mesh_seed_M.json
✅ Saved 974 seeds → C:\Users\Aman Sheikh\Desktop\Projects\VeriFact\Model\harvester\storage\seeds\mesh_seed_F.json
✅ Saved 2467 seeds → C:\Users\Aman Sheikh\Desktop\Projects\VeriFact\Model\harvester\storage\seeds\mesh_seed_E.json
✅ Saved 363 seeds → C:\Users\Aman Sheikh\Desktop\Projects\VeriFact\Model\harvester\storage\seeds\mesh_seed_J.json
✅ Saved 114 seeds → C:\Users\Aman Sheikh\