## Dependencies

In [33]:
import json
import re
import subprocess
import sys
import unicodedata
from pathlib import Path
from typing import Dict, Iterable, List, Union

## Storage directory configurations



In [34]:
# Storage root: a "storage" folder two levels above the kernel's current
# working directory, so the resolved path depends on where the notebook runs.
STORAGE_DIR = Path.cwd().parent.parent / "storage"
# exist_ok=True makes re-running this cell harmless; parents is left at its
# default, so only the final "storage" component is created here.
STORAGE_DIR.mkdir(exist_ok=True)

# Sub-directory used for seed files; this cell does not create it.
SEED_DIR=STORAGE_DIR / "seeds"
# Bare expression: the notebook shows the resolved path as the cell output.
SEED_DIR

WindowsPath('C:/Users/Aman Sheikh/Desktop/Projects/VeriFact/Model/harvester/storage/seeds')

## Importing generated seeds

In [35]:
def import_seeds_for_harvest(file_path: Union[str, Path]):
    """
    Load the harvester's seed definitions from a JSON file.

    Failures are reported with a printed message instead of an exception.

    Args:
        file_path: Location of the JSON seed file, as a string or Path.

    Returns:
        The decoded JSON content, or None when the file is missing, is not
        valid JSON, or any other error occurs while reading it.
    """
    path = Path(file_path)
    seeds_data = None

    if path.exists():
        try:
            with path.open("r", encoding="utf-8") as handle:
                seeds_data = json.load(handle)
            # Reported inside the try so a failure here is handled below too.
            print(f"üìñ Successfully imported {len(seeds_data)} seeds from {path.name}")
        except json.JSONDecodeError:
            print(f"‚ùå Error: The file {path} is not a valid JSON.")
            seeds_data = None
        except Exception as e:
            print(f"‚ùå An unexpected error occurred: {e}")
            seeds_data = None
    else:
        print(f"‚ùå Error: The file {path} does not exist.")

    return seeds_data

## Extract seed keyword

```python
seeds = [{
    "seed_id": "seed_001",
    "label": "Cardiac Arrest",
    "synonyms": ["heart arrest", "cardiac arrest, heart stoppage", "SCA|sudden cardiac arrest"]
}]

keywords = extract_seed_keywords_from_list(seeds)
# -> {"seed_001": ["cardiac arrest", "heart arrest", "heart stoppage", "sca", "sudden cardiac arrest"]}
```

### Normalize keywords

In [36]:
def _normalize_keyword(s: str) -> str:
    """
    Normalize keyword for SapBERT:
    - lowercase
    - normalize separators
    - keep alphanumerics and spaces
    - collapse whitespace
    """
    if not s or not isinstance(s, str):
        return ""

    s = s.lower().strip()
    s = re.sub(r"[-_/]", " ", s)
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

### Split synonyms

In [37]:
def _split_synonym(s: str) -> Iterable[str]:
    """
    Split compound synonym strings safely.
    Handles:
      - commas
      - semicolons
      - pipes
    """
    if not s:
        return []
    return [p.strip() for p in re.split(r"[;,|]+", s) if p.strip()]

### Extract Keywords

In [38]:
def _collect_seed_keywords(
    seed: Dict,
    *,
    include_label: bool,
    include_synonyms: bool,
    max_phrase_tokens: int,
    drop_label_duplicates: bool
) -> List[str]:
    """Build the ordered, de-duplicated keyword list for a single seed dict."""
    label = seed.get("label", "")

    # A JSON null arrives as None, and a bare string would otherwise be
    # iterated character by character.
    synonyms = seed.get("synonyms") or []
    if isinstance(synonyms, str):
        synonyms = [synonyms]

    label_norm = _normalize_keyword(label)
    seen = set()
    keywords = []

    def add_normalized(norm: str) -> None:
        if not norm:
            return
        # length guard (prevents huge IUPAC strings dominating embeddings)
        if len(norm.split()) > max_phrase_tokens:
            return
        if norm not in seen:
            seen.add(norm)
            keywords.append(norm)

    # --- label ---
    if include_label and label:
        add_normalized(label_norm)

    # --- synonyms ---
    if include_synonyms:
        for syn in synonyms:
            # Skip entries that cannot be split as text (numbers, nested
            # lists, nulls) instead of aborting the whole extraction.
            if not isinstance(syn, str):
                continue
            for part in _split_synonym(syn):
                # Normalize once and reuse the result for both checks.
                norm = _normalize_keyword(part)
                if drop_label_duplicates and norm == label_norm:
                    continue
                add_normalized(norm)

    return keywords


def extract_seed_keywords_from_list(
    seeds: List[Dict],
    *,
    include_label: bool = True,
    include_synonyms: bool = True,
    max_phrase_tokens: int = 12,
    drop_label_duplicates: bool = True
) -> Dict[str, List[str]]:
    """
    Extract SapBERT keyword seeds from a list of MeSH-like seed dicts.

    For every seed the label (optional) and each part of each synonym
    (optional) are normalized with _normalize_keyword, de-duplicated, and
    kept in first-seen order.

    Args:
      seeds: seed dicts; the keys read are "seed_id", "label" and "synonyms".
      include_label: add the normalized label as the first keyword.
      include_synonyms: add the parts of each synonym string.
      max_phrase_tokens: phrases with more whitespace-separated tokens than
        this are dropped.
      drop_label_duplicates: skip synonym parts that normalize to the same
        text as the label, even when the label itself is not included.

    Returns:
      {
        seed_id: [keyword1, keyword2, ...]
      }

    Notes:
      - A missing or null "synonyms" value is treated as no synonyms, a bare
        string as a single synonym, and non-string entries are skipped.
      - Seeds without "seed_id" are stored under the key "unknown"; when an
        id occurs more than once, the later seed replaces the earlier entry.
    """
    results = {}

    for seed in seeds:
        seed_id = seed.get("seed_id", "unknown")
        results[seed_id] = _collect_seed_keywords(
            seed,
            include_label=include_label,
            include_synonyms=include_synonyms,
            max_phrase_tokens=max_phrase_tokens,
            drop_label_duplicates=drop_label_duplicates,
        )

    print(f"‚úÖ Extracted a total of {len(results)} seeds.")
    return results

## Export the generated keyword dictionary

In [42]:
def export_keywords_to_json(
    keyword_map: Dict[str, List[str]],
    output_path: Union[str, Path],
    *,
    indent: int = 2,
    sort_keys: bool = True
) -> None:
    """
    Export extracted SapBERT seed keywords to a JSON file.

    The mapping is serialized in memory before the target file is opened, so
    a serialization error leaves any existing file untouched and never
    produces a half-written one.

    Args:
      keyword_map: {seed_id: [keyword1, keyword2, ...]}
      output_path: path to output JSON file (string or Path); missing parent
        directories are created
      indent: JSON indentation level (default=2)
      sort_keys: sort dictionary keys for reproducibility

    Raises:
      TypeError: if keyword_map contains values that JSON cannot represent.
    """

    path = Path(output_path)

    # Serialize first: opening the file in "w" mode truncates it, which must
    # not happen until a complete document is known to be available.
    payload = json.dumps(
        keyword_map,
        ensure_ascii=False,
        indent=indent,
        sort_keys=sort_keys
    )

    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        f.write(payload)
    print(f"‚úÖ File saved to {path} successfully.")

## Example usage

In [45]:
# Seed file to read.
SEED_FILE = SEED_DIR / "seeds_mesh.json"

# Output directory, resolved relative to the notebook's working directory.
OUTPUT_DIR = Path("../../storage/outputs")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

seed_data = import_seeds_for_harvest(SEED_FILE)
# import_seeds_for_harvest reports failures by printing and returning None;
# stop here with a clear error instead of failing later on a None value.
if seed_data is None:
    raise RuntimeError(f"Could not load seeds from {SEED_FILE}")

keywords = extract_seed_keywords_from_list(seed_data)

export_keywords_to_json(
    keywords,
    output_path=OUTPUT_DIR / "seed_keywords.json",
)

üìñ Successfully imported 18168 seeds from seeds_mesh.json
‚úÖ Extracted a total of 18168 seeds.
‚úÖ File saved to ..\..\storage\outputs\seed_keywords.json successfully.


## Query Generator

In [None]:
# def generate_harvester_matrix(seed_data):
#     # Safely get the label
#     label = seed_data.get('label', 'Unknown')
#
#     # Safely get the description from 'semantic_ground_truth' (empty string if missing)
#     scope_note =seed_data.get('semantic_ground_truth', "")
#
#     # Get qualifiers (default to empty list if missing)
#     qualifiers = seed_data.get('harvester_modifiers', [])
#
#     # Tier 1 & 2 logic
#     identity_queries = [f'"{label}"[Mesh]']
#     action_queries = [f'"{label} {q}"' for q in qualifiers[:5]]
#
#     # Tier 3: Biological Context
#     # This now calls our "safe" function
#     bio_keywords = extract_bio_phrases(label, scope_note)
#     context_queries = [f'"{label}" AND "{bk}"' for bk in bio_keywords]
#
#     return {
#         "seed": label,
#         "harvest_list": list(set(identity_queries + action_queries + context_queries))
#     }

## Example usage

### Run the python script

In [None]:
# SEED_FILE = SEED_DIR / "seeds_mesh.json"
#
# process = subprocess.Popen(
#     [sys.executable, "harvest_parallel.py", str(SEED_FILE)],
#     stdout=subprocess.PIPE,
#     stderr=subprocess.STDOUT,
#     bufsize=1
# )
#
# # Stream output live
# for bline in process.stdout:
#     line = bline.decode("utf-8", errors="replace")
#     print(line, end="")
#
# process.wait()

### Display output

In [None]:
# OUTPUT_DIR = Path("../../storage/outputs/harvest_outputs")
# OUTPUT_DIR.mkdir(exist_ok=True)
#
# results = []
# for file in OUTPUT_DIR.glob("*.json"):
#     with open(file, "r", encoding="utf-8") as f:
#         results.append(json.load(f))
#
# # Optional: stable ordering
# results.sort(key=lambda r: r["label"])
#
# # Show results
# for r in results:
#     if r["error"]:
#         print(f"‚ùå {r['label']}")
#     else:
#         print(f"üöÄ {r['label']} ‚Üí {len(r['result']['harvest_list'])} queries")