# MeSH (Medical Subject Headings) vocabulary thesaurus
This is a scope map that supports several tasks for our model:
1. **Keyword harvesting**, by making use of `harvester_modifiers` _(Qualifiers)_
2. **Detection**, by making use of `semantic_ground_truth` _(ScopeNote)_
3. **Tracing**, by making use of `historical_trace` _(HistoryNote)_
4. **Tracking**, by making use of `date_introduced`

## Dependencies

In [None]:
import xml.etree.ElementTree as ET
import json
from datetime import datetime
from pathlib import Path
from typing import List, Union

## Storage directory configurations

In [None]:
# Generated artefacts live in a "storage" folder two levels above the current
# working directory; the folder is created if it does not exist yet.
STORAGE_DIR = Path.cwd().parent.parent.joinpath("storage")
STORAGE_DIR.mkdir(exist_ok=True)

# Seed files go into a "seeds" sub-folder (not created here).
SEED_DIR = STORAGE_DIR.joinpath("seeds")
SEED_DIR

## Helper

In [None]:
def get_mesh_date(element):
    """Format a MeSH date element as a ``YYYY-MM-DD`` string.

    Args:
        element: An XML element with ``Year``, ``Month`` and ``Day`` children,
            or ``None``.

    Returns:
        The date as ``"YYYY-MM-DD"`` with month and day zero-padded to two
        digits, or ``None`` when *element* is ``None`` or any of the three
        components is missing or blank.
    """
    if element is None:
        return None

    # findtext returns None for an absent child; normalise to "" so that
    # missing and whitespace-only components are handled the same way.
    year, month, day = (
        (element.findtext(tag) or "").strip() for tag in ("Year", "Month", "Day")
    )

    if not (year and month and day):
        # Formatting anyway would yield strings such as "None-None-None",
        # which look like dates to callers but cannot be parsed.
        return None

    return f"{year}-{month.zfill(2)}-{day.zfill(2)}"

## Allowed Branches

In [None]:
# Scope definition: a descriptor is kept when at least one of its MeSH tree
# numbers starts with one of these prefixes. Adjust to your requirements.
ALLOWED_BRANCHES = (
    # Diseases & Toxicology
    'C01', 'C02', 'C03', 'C04', 'C23',
    # Chemicals & Drugs
    'D',
    # Therapeutics & Diagnostics
    'E01', 'E02', 'E05',
    # Public Health
    'N01', 'N02',
    # Population
    'M01', 'M02',
    # Life Sciences
    'G'
)

## Parse mesh

In [None]:
def parse_mesh_xml(xml_path, allowed_branches=None):
    """Stream a MeSH descriptor XML file and build one seed dict per in-scope descriptor.

    Args:
        xml_path: Path to the descriptor XML file, or a binary file object.
        allowed_branches: Optional tree-number prefix, or iterable of prefixes.
            A descriptor is kept when any of its tree numbers starts with one
            of them. Defaults to the module-level ``ALLOWED_BRANCHES``.

    Returns:
        A list of seed dictionaries in document order.
    """
    if allowed_branches is None:
        allowed_branches = ALLOWED_BRANCHES
    if isinstance(allowed_branches, str):
        # A bare string would otherwise be split into single characters below.
        allowed_branches = (allowed_branches,)
    # str.startswith only accepts a str or a tuple of str.
    branches = tuple(allowed_branches)

    seeds = []
    # Iterative parsing to save RAM: each element is delivered once its
    # closing tag has been read, so the file is never loaded as a whole.
    for _event, elem in ET.iterparse(xml_path, events=('end',)):
        if elem.tag != 'DescriptorRecord':
            continue

        tree_numbers = [
            tn.text
            for tn in elem.findall('./TreeNumberList/TreeNumber')
            if tn.text
        ]

        # Branch filtering: keep the record only if it sits in an allowed branch.
        if any(tn.startswith(branches) for tn in tree_numbers):
            seeds.append(_build_seed(elem, tree_numbers))

        elem.clear()  # Free the memory held by the processed record
    return seeds


def _build_seed(record, tree_numbers):
    """Map one in-scope ``DescriptorRecord`` element onto the seed schema."""
    from datetime import timezone

    ui = record.findtext('DescriptorUI')
    name = record.findtext('./DescriptorName/String')

    # Tracking: temporal metadata.
    # NOTE(review): NLM's descriptor XML names these elements DateEstablished
    # and DateRevised; the tag names used originally are still tried first.
    date_intro = _first_valid_date(record, 'DateIntroduced', 'DateEstablished')
    last_update = _first_valid_date(record, 'LastUpdated', 'DateRevised')
    intro_year = _year_of(date_intro)

    # Tracing: historical trail.
    # Captures "was [Old Name] 1975-90" to find legacy misinformation.
    history_note = record.findtext('HistoryNote')
    prev_indexing = [
        pi.text
        for pi in record.findall('./PreviousIndexingList/PreviousIndexing')
    ]

    # Detection: semantic ground truth, taken from the preferred concept.
    scope_note = record.findtext('.//Concept[@PreferredConceptYN="Y"]/ScopeNote')

    # Keyword harvesting: every term string of every concept. dict.fromkeys
    # removes duplicates while keeping document order, so repeated runs
    # produce identical output.
    synonyms = list(dict.fromkeys(
        term.text
        for term in record.findall('.//TermList/Term/String')
        if term.text
    ))

    # Harvest modifiers are the "angles" attached to a seed term to narrow a
    # search, e.g. "adverse effects" or "toxicity".
    qualifiers = [
        q.findtext('./QualifierReferredTo/QualifierName/String')
        for q in record.findall('.//AllowableQualifier')
    ]

    # Functional context (pharmacological actions).
    actions = [
        pa.findtext('.//DescriptorName/String')
        for pa in record.findall('.//PharmacologicalAction')
    ]

    # The "Z" suffix denotes UTC, so the timestamp is taken in UTC rather
    # than in naive local time.
    stamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    return {
        "seed_id": f"mesh:{ui}",
        "label": name.lower() if name else "",
        "temporal_tracking": {
            "date_introduced": date_intro,
            "last_medical_update": last_update,
            # Introduced before 2020 -> established; otherwise (or when the
            # date is unknown) -> emerging.
            "status": "established" if intro_year is not None and intro_year < 2020 else "emerging"
        },
        "historical_trace": {
            "history_note": history_note,
            "previous_indexing": prev_indexing
        },
        "semantic_ground_truth": scope_note,
        "pharmacological_actions": actions,
        "synonyms": synonyms,
        "harvester_modifiers": qualifiers,  # Use these to generate search queries
        "mesh_tree_numbers": tree_numbers,
        "tags": [tn[:3] for tn in tree_numbers],
        "last_updated_in_db": stamp
    }


def _first_valid_date(record, *tags):
    """Return the formatted date of the first child among *tags* that has a numeric year, else None."""
    for tag in tags:
        node = record.find(tag)
        if node is None:
            continue
        date_str = get_mesh_date(node)
        if _year_of(date_str) is not None:
            return date_str
    return None


def _year_of(date_str):
    """Return the leading year of a ``YYYY-MM-DD`` string as an int, or None if absent or non-numeric."""
    if not date_str:
        return None
    try:
        return int(date_str.split("-", 1)[0])
    except ValueError:
        return None

## Example of a single element of the seed list:

```json
 {
  "seed_id": "mesh:D000721",
  "label": "ancrod",
  "temporal_tracking": {"date_introduced": "1978-01-01",
   "last_medical_update": "2016-07-05",
   "status": "established"},
  "historical_trace": {"history_note": "78; was ARVIN see under ENDOPEPTIDASES 1975-77; ARVIN was see ANCROD 1978-95\n  ",
   "previous_indexing": ["Peptide Hydrolases (1972-1974)",
    "Peptide Peptidohydrolases (1966-1974)",
    "Venoms (1972-1974)"]},
  "semantic_ground_truth": "An enzyme fraction from the venom of the Malayan pit viper, Agkistrodon rhodostoma. It catalyzes the hydrolysis of a number of amino acid esters and a limited proteolysis of fibrinogen. It is used clinically to produce controlled defibrination in patients requiring anticoagulant therapy. EC 3.4.21.-.\n    ",
  "pharmacological_actions": ["Anticoagulants", "Fibrinolytic Agents"],
  "synonyms": ["Venacil",
   "Agkistrodon rhodostoma Venom Protease",
   "Arwin",
   "Ancrod",
   "Arvin",
   "Arvin IRC50",
   "Arvin IRC 50",
   "Arvin IRC-50",
   "Agkistrodon Serine Proteinase"],
  "harvester_modifiers": ["administration & dosage",
   "adverse effects",
   "analysis",
   "antagonists & inhibitors",
   "biosynthesis",
   "blood",
   "cerebrospinal fluid",
   "chemical synthesis",
   "classification",
   "deficiency",
   "drug effects",
   "economics",
   "genetics",
   "history",
   "immunology",
   "isolation & purification",
   "metabolism",
   "pharmacokinetics",
   "pharmacology",
   "physiology",
   "poisoning",
   "radiation effects",
   "standards",
   "supply & distribution",
   "therapeutic use",
   "toxicity",
   "ultrastructure",
   "urine",
   "chemistry"],
  "mesh_tree_numbers": ["D08.811.277.656.300.760.955.060",
   "D08.811.277.656.959.350.955.060",
   "D20.888.850.960.200.050",
   "D23.946.833.850.960.200.050"],
  "tags": ["D08", "D08", "D20", "D23"],
  "last_updated_in_db": "2026-01-13T15:04:53.036487Z"
}
```

## Save the generated seeds

In [None]:
def save_seeds_to_json(seeds: List[dict], storage_path: Union[str, Path], filename: str = "seeds_mesh.json") -> Path:
    """
    Saves a list of seed records to a JSON file.

    Args:
        seeds: The list of JSON-serialisable seed dictionaries to save.
        storage_path: The directory path (str or Path object); created,
            including missing parents, if it does not exist.
        filename: The name of the file (defaults to seeds_mesh.json).

    Returns:
        The Path object of the target file. The file is not written when
        *seeds* is empty or when saving fails; in both cases a message is
        printed and the path is still returned.
    """
    # Convert string path to Path object if necessary
    base_dir = Path(storage_path)
    base_dir.mkdir(parents=True, exist_ok=True)

    file_path = base_dir / filename

    if not seeds:
        print("⚠️ Warning: The seed list is empty. No file created.")
        return file_path

    try:
        print(f"Writing {len(seeds)} seeds to disk...")
        # Serialise completely before opening the file: opening in "w" mode
        # truncates it, so a serialisation error raised afterwards would
        # destroy a previously saved file and leave partial JSON behind.
        # ensure_ascii=False handles special characters in medical terms
        payload = json.dumps(seeds, indent=4, ensure_ascii=False)
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(payload)
        print(f"✅ Saved successfully: {file_path}")
    except (OSError, TypeError, ValueError) as e:
        # OSError: file system problems; TypeError / ValueError: data that
        # json cannot serialise. Reported rather than raised, best-effort.
        print(f"❌ Critical Error saving to {file_path}: {e}")

    return file_path

## Example Usage

In [None]:
# Build the in-scope seeds from the descriptor file located two directories up.
seeds = parse_mesh_xml('../../desc2026.xml')
print(f"Generated {len(seeds)} seeds within your scope.")

# Persist them under the seed directory using the default file name.
saved_path = save_seeds_to_json(seeds, storage_path=SEED_DIR)