# In [None]:  (Jupyter/Colab cell marker left over from the notebook export)
"""
================================================================================
FDA ENRICHMENT PIPELINE
================================================================================
Goal:
  1) Read the cleaned CSV produced by DataDoseCleanActiveingredIent.ipynb
  2) Extract all unique active ingredients
  3) Groq API: validate/canonicalize each ingredient before querying OpenFDA
  4) OpenFDA API: fetch medical label data per ingredient
  5) Produce:
       - ingredients_fda_results.csv   (analysis/inspection)
       - ingredients_fda_results.json  (knowledge graph build)
Run in Google Colab:
  from fda_enrichment_pipeline import run_full_pipeline
  run_full_pipeline()
================================================================================
"""

import json
import time
import os
import re
import csv
import requests
import pandas as pd
from datetime import datetime

# ==============================================================================
# Settings (edit only these values)
# ==============================================================================

BASE_DIR = "/content/drive/MyDrive/DataDoseDepi"

# Input: CSV output from drug_cleaning_pipeline_v2.py
CLEANED_CSV = os.path.join(BASE_DIR, "DataDoseDataset_FinalV.csv")

# Outputs
OUTPUT_CSV = os.path.join(BASE_DIR, "ingredients_fda_results.csv")
OUTPUT_JSON = os.path.join(BASE_DIR, "ingredients_fda_results.json")
PROGRESS_FILE = os.path.join(BASE_DIR, "fda_pipeline_progress.json")  # Resume support
LOG_FILE = os.path.join(BASE_DIR, "fda_pipeline_log.txt")

# Groq API keys. Either paste them into the list below, or leave the list
# empty and export GROQ_API_KEYS="key1,key2,..." in the environment.
GROQ_API_KEYS = [

]
if not GROQ_API_KEYS:
    GROQ_API_KEYS = [
        key.strip()
        for key in os.environ.get("GROQ_API_KEYS", "").split(",")
        if key.strip()
    ]
# Backward compat: single-key name. Empty string (instead of an IndexError at
# import time) when no key has been configured yet.
GROQ_API_KEY = GROQ_API_KEYS[0] if GROQ_API_KEYS else ""
_current_key_idx = 0    # index into GROQ_API_KEYS of the key currently in use
_ingredients_count = 0  # ingredients processed so far (drives key rotation)
ROTATE_EVERY = 10  # rotate key every N ingredients
GROQ_MODEL = "llama-3.1-8b-instant"
GROQ_URL = "https://api.groq.com/openai/v1/chat/completions"

OPENFDA_BASE = "https://api.fda.gov/drug/label.json"
# Optional. The OPENFDA_API_KEY environment variable takes precedence.
# SECURITY NOTE(review): the fallback below is a credential committed to source
# control. It is kept only so existing runs behave the same; rotate it and
# remove the literal once the environment variable is in place.
OPENFDA_API_KEY = os.environ.get(
    "OPENFDA_API_KEY", "HXe0xJOpAVR3Py5EtYvvzGjog1wyo6omIuotTqio"
)


# ==============================================================================
# Logging
# ==============================================================================

def log(msg: str, level: str = "INFO"):
    """
    Print a timestamped message and append it to LOG_FILE.

    The line has the form "[YYYY-MM-DD HH:MM:SS] [LEVEL] message".
    Writing to the log file is best-effort: any failure there is ignored
    so that logging never interrupts the pipeline.
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = "[{}] [{}] {}".format(stamp, level, msg)
    print(entry)
    try:
        with open(LOG_FILE, "a", encoding="utf-8") as handle:
            handle.write(entry + "\n")
    except Exception:
        # Deliberately swallowed: the console output above is enough.
        pass


# ==============================================================================
# Step 1 — Extract unique ingredients from the cleaned CSV
# ==============================================================================

def extract_unique_ingredients(cleaned_csv: str) -> list[str]:
    """
    Load the cleaned CSV and return its unique active ingredients, sorted.

    The ingredient column is the first of "activeingredient_clean",
    "Graph_Node_Ingredient", "activeingredient", "ActiveIngredient" that
    exists in the file. Each cell is split on "+" so that combinations
    (A + B + C) contribute their individual ingredients; tokens are
    stripped, lower-cased, and kept only if longer than two characters.

    Raises:
        ValueError: if none of the candidate columns is present.
    """
    log(f"Reading data from: {cleaned_csv}")
    frame = pd.read_csv(cleaned_csv)

    # First matching candidate wins.
    known_columns = ("activeingredient_clean", "Graph_Node_Ingredient",
                     "activeingredient", "ActiveIngredient")
    col = next((name for name in known_columns if name in frame.columns), None)
    if col is None:
        raise ValueError(f"Ingredient column not found. Available columns: {list(frame.columns)}")

    log(f"Ingredient column: '{col}' | Total rows: {len(frame):,}")

    # Split every non-null cell on "+" and normalise each piece.
    tokens = {
        piece.strip().lower()
        for cell in frame[col].dropna()
        for piece in str(cell).split("+")
    }
    unique = sorted(tok for tok in tokens if len(tok) > 2)

    log(f"Unique ingredients after split: {len(unique):,}")
    return unique


# ==============================================================================
# Step 2 — Groq API: validate and canonicalize ingredient name
# ==============================================================================

# System prompt sent with every Groq validation request (see validate_with_groq).
# The dashes and arrows below were previously mis-encoded (mojibake); they are
# restored to the intended "—" and "→" so the model receives clean text.
GROQ_SYSTEM_PROMPT = """
You are a senior pharmaceutical scientist and medical terminologist.
Your job is to validate and canonicalize active pharmaceutical ingredient names.

INPUT: A drug ingredient name (possibly misspelled, abbreviated, or non-pharmaceutical).

YOUR TASK — answer these 3 questions:
1. Is this a real, recognized ACTIVE PHARMACEUTICAL INGREDIENT (API)?
   - Yes: prescription drugs, OTC drugs, biologics, vaccines, vitamins (if pharmaceutical grade)
   - No: cosmetics, food ingredients, vague marketing terms, herb extracts without a defined API

2. If YES — what is the CANONICAL INN (International Nonproprietary Name) spelling?
   - Use the WHO INN standard
   - Fix typos, expand abbreviations, remove salts if they don't change the search
   - Example: "paracetamol" → "paracetamol" (INN), NOT "acetaminophen"
   - Example: "digoxine" → "digoxin"
   - Example: "cobalamin" → "cyanocobalamin" (more specific for FDA search)

3. REJECTION CRITERIA — return is_drug: false if:
   - It's a cosmetic ingredient (glycerin, petrolatum, fragrance, etc.)
   - It's a vague term (minerals, vitamins, supplements, herbal extract)
   - It's a food/spice (turmeric, ginger, garlic, etc.)
   - It's a non-drug supplement (MSM, royal jelly, propolis, etc.)
   - It cannot be found in any drug regulatory database

RETURN FORMAT — ONLY valid JSON, no explanations:
{
  "input": "the original name you received",
  "is_drug": true or false,
  "canonical_name": "corrected INN name" or null if is_drug is false,
  "fda_search_term": "best term to search OpenFDA API" or null,
  "rejection_reason": null or "brief reason if is_drug is false",
  "confidence": 0.0 to 1.0
}

EXAMPLES:
Input: "digoxine"
Output: {"input":"digoxine","is_drug":true,"canonical_name":"digoxin","fda_search_term":"digoxin","rejection_reason":null,"confidence":0.99}

Input: "turmeric"
Output: {"input":"turmeric","is_drug":false,"canonical_name":null,"fda_search_term":null,"rejection_reason":"herbal spice, not a pharmaceutical API","confidence":0.97}

Input: "cobalamin"
Output: {"input":"cobalamin","is_drug":true,"canonical_name":"cyanocobalamin","fda_search_term":"cyanocobalamin","rejection_reason":null,"confidence":0.92}

Input: "omega 3"
Output: {"input":"omega 3","is_drug":false,"canonical_name":null,"fda_search_term":null,"rejection_reason":"vague supplement term, not a specific API","confidence":0.88}
"""

def _next_groq_key() -> str:
    """
    Advance the shared key index by one (round-robin) and return that key.

    Raises:
        RuntimeError: if GROQ_API_KEYS is empty (previously this surfaced as
            an opaque ZeroDivisionError from the modulo).
    """
    global _current_key_idx
    if not GROQ_API_KEYS:
        raise RuntimeError("No Groq API keys configured (GROQ_API_KEYS is empty)")
    _current_key_idx = (_current_key_idx + 1) % len(GROQ_API_KEYS)
    return GROQ_API_KEYS[_current_key_idx]


# ==============================================================================
# Local validator (runs before Groq to reduce API calls)
# ==============================================================================

# Lower-case ingredient names that validate_local() accepts as drugs without
# calling Groq. Membership is an exact match on the stripped, lower-cased name.
# Several names are listed more than once (e.g. "aspirin", "dexamethasone");
# that is harmless in a set literal.
# NOTE(review): "glycerin" appears both here and in _KNOWN_NON_DRUGS. Because
# validate_local() checks this set first, it is currently accepted as a drug —
# confirm which classification is intended.
_KNOWN_DRUGS = {
    "paracetamol", "acetaminophen", "ibuprofen", "aspirin", "diclofenac",
    "naproxen", "indomethacin", "ketoprofen", "meloxicam", "celecoxib",
    "tramadol", "codeine", "morphine", "oxycodone", "fentanyl", "methadone",
    "buprenorphine", "naloxone", "naltrexone",
    "amoxicillin", "ampicillin", "penicillin", "cloxacillin", "flucloxacillin",
    "cephalexin", "cefuroxime", "ceftriaxone", "cefixime", "cefadroxil",
    "azithromycin", "clarithromycin", "erythromycin", "roxithromycin",
    "ciprofloxacin", "levofloxacin", "ofloxacin", "norfloxacin",
    "doxycycline", "tetracycline", "minocycline",
    "metronidazole", "tinidazole", "ornidazole",
    "trimethoprim", "sulfamethoxazole",
    "nitrofurantoin", "fosfomycin", "linezolid", "vancomycin",
    "meropenem", "imipenem", "ertapenem",
    "gentamicin", "amikacin", "tobramycin", "streptomycin",
    "clindamycin", "chloramphenicol", "rifampicin", "isoniazid",
    "ethambutol", "pyrazinamide", "dapsone",
    "fluconazole", "itraconazole", "ketoconazole", "voriconazole",
    "clotrimazole", "miconazole", "terbinafine", "nystatin", "amphotericin",
    "acyclovir", "valacyclovir", "ganciclovir", "oseltamivir", "zanamivir",
    "ribavirin", "interferon", "lamivudine", "tenofovir", "efavirenz",
    "lopinavir", "ritonavir", "atazanavir", "darunavir",
    "amlodipine", "nifedipine", "verapamil", "diltiazem", "felodipine",
    "enalapril", "lisinopril", "ramipril", "captopril", "perindopril",
    "losartan", "valsartan", "irbesartan", "telmisartan", "candesartan",
    "atenolol", "metoprolol", "bisoprolol", "carvedilol", "propranolol",
    "furosemide", "hydrochlorothiazide", "spironolactone", "indapamide",
    "digoxin", "amiodarone", "warfarin", "heparin", "enoxaparin",
    "clopidogrel", "aspirin", "atorvastatin", "simvastatin", "rosuvastatin",
    "isosorbide", "nitroglycerin", "dobutamine", "dopamine", "adrenaline",
    "noradrenaline", "epinephrine", "norepinephrine",
    "metformin", "glibenclamide", "gliclazide", "glimepiride", "glipizide",
    "pioglitazone", "sitagliptin", "vildagliptin", "saxagliptin", "empagliflozin",
    "dapagliflozin", "liraglutide", "insulin",
    "omeprazole", "esomeprazole", "pantoprazole", "lansoprazole", "rabeprazole",
    "ranitidine", "famotidine", "cimetidine", "sucralfate", "bismuth",
    "metoclopramide", "domperidone", "ondansetron", "granisetron", "tropisetron",
    "loperamide", "lactulose", "bisacodyl", "senna", "psyllium",
    "salbutamol", "albuterol", "terbutaline", "salmeterol", "formoterol",
    "ipratropium", "tiotropium", "theophylline", "aminophylline",
    "beclomethasone", "budesonide", "fluticasone", "mometasone", "ciclesonide",
    "montelukast", "zafirlukast", "cromoglicate",
    "diazepam", "lorazepam", "alprazolam", "clonazepam", "midazolam",
    "phenobarbital", "phenytoin", "carbamazepine", "valproate", "lamotrigine",
    "levetiracetam", "topiramate", "gabapentin", "pregabalin",
    "haloperidol", "risperidone", "olanzapine", "quetiapine", "aripiprazole",
    "clozapine", "ziprasidone", "amisulpride",
    "fluoxetine", "sertraline", "paroxetine", "escitalopram", "citalopram",
    "venlafaxine", "duloxetine", "mirtazapine", "amitriptyline", "imipramine",
    "levodopa", "carbidopa", "pramipexole", "ropinirole", "bromocriptine",
    "donepezil", "rivastigmine", "memantine",
    "methylphenidate", "atomoxetine", "lithium",
    "levothyroxine", "liothyronine", "propylthiouracil", "methimazole",
    "hydrocortisone", "prednisolone", "prednisone", "dexamethasone",
    "betamethasone", "fludrocortisone", "methylprednisolone", "triamcinolone",
    "testosterone", "estradiol", "progesterone", "estrogen",
    "ethinylestradiol", "desogestrel", "levonorgestrel", "norethisterone",
    "medroxyprogesterone", "tamoxifen", "anastrozole", "letrozole",
    "growth hormone", "somatropin", "octreotide", "lanreotide",
    "cyclosporine", "tacrolimus", "mycophenolate", "azathioprine", "sirolimus",
    "methotrexate", "cyclophosphamide", "chlorambucil", "busulfan",
    "doxorubicin", "epirubicin", "vincristine", "paclitaxel", "docetaxel",
    "cisplatin", "carboplatin", "oxaliplatin", "fluorouracil", "capecitabine",
    "imatinib", "erlotinib", "gefitinib", "sorafenib", "sunitinib",
    "tretinoin", "isotretinoin", "adapalene", "tazarotene", "benzoyl peroxide",
    "azelaic acid", "salicylic acid", "chlorhexidine", "povidone iodine",
    "mupirocin", "fusidic acid", "silver sulfadiazine",
    "timolol", "betaxolol", "latanoprost", "bimatoprost", "travoprost",
    "pilocarpine", "acetazolamide", "dorzolamide", "brimonidine",
    "dexamethasone", "prednisolone", "tobramycin", "ciprofloxacin",
    "allopurinol", "colchicine", "probenecid", "febuxostat",
    "calcium", "vitamin d", "vitamin d3", "cholecalciferol", "alendronate",
    "risedronate", "ibandronate", "zoledronic acid",
    "glucosamine", "chondroitin", "hyaluronic acid", "collagen",
    "thiamine", "riboflavin", "niacin", "pyridoxine", "folic acid",
    "cyanocobalamin", "cobalamin", "ascorbic acid", "vitamin c",
    "retinol", "vitamin a", "tocopherol", "vitamin e", "phytomenadione",
    "cholecalciferol", "ergocalciferol", "biotin", "pantothenic acid",
    "iron", "ferrous sulfate", "ferrous gluconate", "zinc", "selenium",
    "magnesium", "potassium", "sodium", "iodine", "fluoride", "copper",
    "manganese", "chromium", "molybdenum",
    "vaccine", "immunoglobulin", "albumin", "erythropoietin", "filgrastim",
    "interferon", "infliximab", "adalimumab", "rituximab", "trastuzumab",
    "lidocaine", "lignocaine", "bupivacaine", "ropivacaine", "articaine",
    "procaine", "benzocaine", "tetracaine", "prilocaine",
    "camphor", "menthol", "eucalyptus oil", "turpentine oil",
    "zinc oxide", "calamine", "ichthammol", "coal tar",
    "potassium permanganate", "hydrogen peroxide", "alcohol", "ethanol",
    "glycerin", "sorbitol", "mannitol", "lactose",
    "caffeine", "theophylline", "dextromethorphan", "guaifenesin",
    "acetylcysteine", "bromhexine", "ambroxol", "carbocisteine",
    "antazoline", "chlorphenamine", "diphenhydramine", "promethazine",
    "cetirizine", "loratadine", "fexofenadine", "desloratadine",
    "dexchlorpheniramine", "hydroxyzine", "cyproheptadine",
    "sildenafil", "tadalafil", "vardenafil",
    "panthenol", "dexpanthenol", "pantothenic acid",
    "spironolactone", "eplerenone", "amiloride", "triamterene",
    "danazol", "mifepristone", "misoprostol", "oxytocin", "ergometrine",
    "atropine", "hyoscine", "scopolamine", "glycopyrrolate",
    "succinylcholine", "vecuronium", "rocuronium", "pancuronium",
    "propofol", "thiopental", "ketamine", "halothane", "isoflurane", "sevoflurane",
}

# Lower-case names that validate_local() rejects as non-pharmaceutical without
# calling Groq (exact match on the stripped, lower-cased name). This set is
# consulted only after _KNOWN_DRUGS, so a name present in both sets is treated
# as a drug.
# NOTE(review): "glycerin" is also listed in _KNOWN_DRUGS, so its entry here
# never takes effect — confirm which classification is intended.
_KNOWN_NON_DRUGS = {
    "turmeric", "curcumin", "ginger", "garlic", "cinnamon", "pepper",
    "clove", "cardamom", "saffron", "oregano", "thyme", "rosemary",
    "chamomile", "lavender", "peppermint", "spearmint", "fennel",
    "licorice root", "valerian", "echinacea", "elderberry", "ginkgo",
    "ginseng", "ashwagandha", "moringa", "neem", "aloe", "aloe vera",
    "black seed", "nigella sativa", "fenugreek", "milk thistle",
    "dandelion", "nettle", "st john wort", "evening primrose",
    "flaxseed", "chia seed", "spirulina", "chlorella",
    "omega 3", "omega 6", "omega 9", "fish oil", "flaxseed oil",
    "evening primrose oil", "borage oil", "coconut oil", "olive oil",
    "royal jelly", "propolis", "bee pollen", "bee wax", "honey",
    "msm", "methylsulfonylmethane", "glucomannan",
    "resveratrol", "quercetin", "lycopene", "lutein", "zeaxanthin",
    "astaxanthin", "coq10", "q10", "ubiquinol", "alpha lipoic acid",
    "antioxidant", "antioxidants", "herbal extract", "plant extract",
    "natural extract", "essential oil", "botanical extract",
    "amino acids blend", "protein blend", "mineral blend", "vitamin blend",
    "multivitamin", "multimineral", "greens powder",
    "collagen peptides", "marine collagen", "bone broth",
    "probiotic blend", "prebiotic fiber", "digestive enzymes",
    "glycerin", "petrolatum", "paraffin", "beeswax", "lanolin",
    "dimethicone", "cyclomethicone", "squalene", "shea butter",
    "jojoba oil", "argan oil", "rosehip oil", "retinol serum",
    "hyaluronic serum", "niacinamide serum", "kojic acid",
    "fragrance", "parfum", "colorant", "dye",
    "sucrose", "glucose", "fructose", "maltose", "starch", "cellulose",
    "gelatin", "pectin", "guar gum", "xanthan gum", "carrageenan",
    "lecithin", "soy lecithin", "sunflower lecithin",
    "whey protein", "casein", "soy protein", "pea protein",
    "creatine", "beta alanine", "citrulline", "arginine blend",
}

# Regexes applied (in this order) by validate_local() to names found in
# neither lookup set; the first one that matches is reported as the
# rejection reason, so the order is significant.
_NON_DRUG_PATTERNS = [
    # Substring / suffix heuristics
    r"extract",
    r" oil$",
    r" bark",
    # Plant parts
    r"\bseed\b",
    r"\broot\b",
    r"\bherb\b",
    r"\bleaf\b",
    r"\bflower\b",
    r"\bberry\b",
    # Product / marketing terms
    r"\bblend\b",
    r"\bformula\b",
    r"\bsupplement\b",
    # Foods and drinks
    r"\btea\b",
    r"\bjuice\b",
    r"\bbark\b",
]

import re as _re

def validate_local(ingredient: str) -> dict | None:
    """Local validation without API calls; return Groq-like dict or None if uncertain."""
    t = ingredient.strip().lower()

    if t in _KNOWN_DRUGS:
        return {
            "input": ingredient,
            "is_drug": True,
            "canonical_name": t,
            "fda_search_term": t,
            "rejection_reason": None,
            "confidence": 0.95,
            "_source": "local",
        }

    if t in _KNOWN_NON_DRUGS:
        return {
            "input": ingredient,
            "is_drug": False,
            "canonical_name": None,
            "fda_search_term": None,
            "rejection_reason": "known non-pharmaceutical ingredient",
            "confidence": 0.95,
            "_source": "local",
        }

    for pat in _NON_DRUG_PATTERNS:
        if _re.search(pat, t):
            return {
                "input": ingredient,
                "is_drug": False,
                "canonical_name": None,
                "fda_search_term": None,
                "rejection_reason": f"matches non-drug pattern: {pat}",
                "confidence": 0.85,
                "_source": "local_pattern",
            }

    return None


# ==============================================================================
# Groq key usage tracking for rate-limit handling
# ==============================================================================

import threading
# Not referenced in the visible part of this file — presumably meant to hold a
# per-key "last used" timestamp; TODO confirm or remove.
_key_last_used: dict = dict()
# Key index -> Unix time before which validate_with_groq() skips that key
# (set after a 429 rate-limit or 401 response).
_key_cooldown_until: dict[int, float] = dict()
# Not acquired anywhere in the visible part of this file — presumably intended
# to guard the key state above if calls are ever made from threads; TODO confirm.
_groq_lock = threading.Lock()

def _get_retry_after(resp) -> int:
    """Extract wait time from response headers; default 60s, capped at 120s."""
    try:
        val = resp.headers.get("retry-after") or resp.headers.get("x-ratelimit-reset-requests", "60")
        return min(int(float(val)), 120)
    except Exception:
        return 60

def rotate_key_if_needed():
    """
    Count one processed ingredient and rotate the Groq key when due.

    Every call increments the shared ingredient counter. When the counter is
    a multiple of ROTATE_EVERY, the shared key index advances to the next
    entry of GROQ_API_KEYS (wrapping around) and the rotation is logged.
    Rotation is skipped when there are fewer than two keys or ROTATE_EVERY
    is not positive, which avoids a modulo-by-zero and a meaningless
    "key[0] → key[0]" log line.
    """
    global _current_key_idx, _ingredients_count
    _ingredients_count += 1

    n_keys = len(GROQ_API_KEYS)
    if n_keys < 2 or ROTATE_EVERY <= 0:
        return

    if _ingredients_count % ROTATE_EVERY == 0:
        old_idx = _current_key_idx
        _current_key_idx = (_current_key_idx + 1) % n_keys
        log(
            f"🔄 Key rotation: key[{old_idx}] → key[{_current_key_idx}] "
            f"(after {_ingredients_count} ingredients)"
        )

def validate_with_groq(ingredient: str) -> dict | None:
    """
    Validate and canonicalize an ingredient name via the Groq chat API.

    Key handling:
      - Starts from the current key index (rotated by the pipeline) and
        always picks the next key that is not on cooldown.
      - On 429 the key is put on cooldown for the server-suggested time; on
        401 it is disabled for two hours. Either way the next key is tried.
      - If every key is on cooldown, waits for the soonest one to expire.
        Cooldowns are never cleared early, so a key disabled by 401 stays
        disabled. If the soonest expiry is more than 120s away (i.e. only
        disabled keys remain), gives up instead of sleeping for hours.
      - At most 2 x len(GROQ_API_KEYS) requests are sent per ingredient.

    Returns:
        The parsed JSON object from the model when it contains "is_drug" and
        "canonical_name"; None when the response lacks those fields, no keys
        are configured, or all attempts fail.
    """
    n_keys = len(GROQ_API_KEYS)
    if n_keys == 0:
        log("No Groq API keys configured — cannot validate ingredients", "ERROR")
        return None

    payload = {
        "model": GROQ_MODEL,
        "messages": [
            {"role": "system", "content": GROQ_SYSTEM_PROMPT},
            {"role": "user", "content": f'Validate this ingredient: "{ingredient}"'},
        ],
        "response_format": {"type": "json_object"},
        "temperature": 0.05,
        "max_tokens": 256,
    }

    max_attempts = n_keys * 2
    max_cooldown_wait = 120  # longest rate-limit cooldown _get_retry_after returns
    attempts = 0
    cursor = _current_key_idx % n_keys  # where the search for a usable key starts

    while attempts < max_attempts:
        now = time.time()
        # First key, starting at the cursor, whose cooldown has expired.
        key_idx = next(
            (
                idx
                for idx in ((cursor + step) % n_keys for step in range(n_keys))
                if _key_cooldown_until.get(idx, 0) <= now
            ),
            None,
        )

        if key_idx is None:
            min_wait = min(_key_cooldown_until.get(i, 0) - now for i in range(n_keys))
            if min_wait > max_cooldown_wait:
                log(
                    f"  All Groq keys unavailable for at least {min_wait:.0f}s — "
                    f"giving up on '{ingredient}'",
                    "ERROR",
                )
                return None
            log(f"  ⏳ All keys on cooldown — waiting {min_wait:.0f}s", "WARN")
            time.sleep(min_wait + 1)
            continue  # waiting does not consume an attempt

        attempts += 1
        cursor = (key_idx + 1) % n_keys  # the next attempt prefers the following key

        headers = {
            "Authorization": f"Bearer {GROQ_API_KEYS[key_idx]}",
            "Content-Type": "application/json",
        }

        try:
            resp = requests.post(GROQ_URL, headers=headers, json=payload, timeout=20)

            if resp.status_code == 200:
                raw = resp.json()["choices"][0]["message"]["content"]
                result = json.loads(raw)
                if isinstance(result, dict) and "is_drug" in result and "canonical_name" in result:
                    return result
                log(f"  Groq JSON missing fields for '{ingredient}'", "WARN")
                return None

            elif resp.status_code == 429:
                wait = _get_retry_after(resp)
                _key_cooldown_until[key_idx] = time.time() + wait
                log(f"  429 key[{key_idx}] → cooldown {wait}s → key[{cursor}]", "WARN")

            elif resp.status_code == 401:
                # Invalid/revoked key: keep it out of rotation for two hours.
                _key_cooldown_until[key_idx] = time.time() + 7200
                log(f"  401 key[{key_idx}] → disabled", "ERROR")

            else:
                log(f"  Groq {resp.status_code} key[{key_idx}]", "WARN")
                time.sleep(2)

        except json.JSONDecodeError:
            log(f"  JSON parse error key[{key_idx}]", "WARN")
        except requests.exceptions.Timeout:
            log(f"  Timeout key[{key_idx}]", "WARN")
        except Exception as e:
            log(f"  Error key[{key_idx}]: {e}", "WARN")

    log(f"❌ Groq failed for '{ingredient}' — skip", "ERROR")
    return None


# ==============================================================================
# Step 3 — OpenFDA API: fetch label data
# ==============================================================================

def search_openfda(search_term: str, retries: int = 3) -> dict | None:
    """
    Look an ingredient up in the OpenFDA drug-label endpoint.

    Three queries are tried in order (exact substance name, exact generic
    name, substance-name prefix), each with the upper-cased term and up to
    `retries` requests. A 404 or an empty result list moves on to the next
    query; 429, other statuses, timeouts and errors are logged, followed by
    a short sleep and another try.

    Returns:
        The structured record built by _parse_openfda_results for the first
        query that yields results, or None if none does.
    """
    term = search_term.upper()
    queries = (
        f'openfda.substance_name:"{term}"',
        f'openfda.generic_name:"{term}"',
        f'openfda.substance_name:{term}*',
    )

    common = {"limit": 3}
    if OPENFDA_API_KEY:
        common["api_key"] = OPENFDA_API_KEY

    for query in queries:
        request_params = dict(common, search=query)

        for attempt_no in range(1, retries + 1):
            try:
                resp = requests.get(OPENFDA_BASE, params=request_params, timeout=15)
                status = resp.status_code

                if status == 404:
                    # Nothing matches this query; try the next one.
                    break

                if status == 429:
                    log("OpenFDA rate limit ‚Üí wait 10s", "WARN")
                    time.sleep(10)
                    continue

                if status != 200:
                    log(f"OpenFDA {status} for '{search_term}'", "WARN")
                    time.sleep(2)
                    continue

                hits = resp.json().get("results", [])
                if not hits:
                    break
                return _parse_openfda_results(search_term, hits)

            except requests.exceptions.Timeout:
                log(f"OpenFDA timeout for '{search_term}' attempt {attempt_no}", "WARN")
                time.sleep(3)
            except Exception as exc:
                log(f"OpenFDA error: {exc}", "WARN")
                time.sleep(2)

    return None


def _parse_openfda_results(search_term: str, results: list) -> dict:
    """Convert OpenFDA results into a structured record."""
    brand_names = set()
    generic_names = set()
    manufacturer = set()
    warnings = []
    interactions = []
    adverse_reacts = []
    indications = []
    dosage_forms = set()

    for result in results:
        openfda = result.get("openfda", {})

        brand_names.update(openfda.get("brand_name", []))
        generic_names.update(openfda.get("generic_name", []))
        manufacturer.update(openfda.get("manufacturer_name", []))
        dosage_forms.update(openfda.get("dosage_form", []))

        def _grab(field):
            val = result.get(field, [])
            return val[0][:2000] if val else ""

        if not warnings and result.get("warnings"):
            warnings = _extract_bullets(_grab("warnings"))
        if not interactions and result.get("drug_interactions"):
            interactions = _extract_bullets(_grab("drug_interactions"))
        if not adverse_reacts and result.get("adverse_reactions"):
            adverse_reacts = _extract_bullets(_grab("adverse_reactions"))
        if not indications and result.get("indications_and_usage"):
            indications = _extract_bullets(_grab("indications_and_usage"))

    return {
        "found": True,
        "search_term": search_term,
        "brand_names": sorted(brand_names)[:10],
        "generic_names": sorted(generic_names)[:5],
        "manufacturers": sorted(manufacturer)[:5],
        "dosage_forms": sorted(dosage_forms)[:5],
        "warnings": warnings[:10],
        "drug_interactions": interactions[:10],
        "adverse_reactions": adverse_reacts[:15],
        "indications": indications[:5],
    }


def _extract_bullets(text: str, max_items: int = 15) -> list[str]:
    """Split long text into short bullet-like items."""
    if not text:
        return []
    text = re.sub(r"\s+", " ", text).strip()
    sentences = re.split(r"(?<=[.!?])\s+|\n+|‚Ä¢|‚óè|-\s", text)
    bullets = []
    for s in sentences:
        s = s.strip().strip(".-‚Ä¢‚óè")
        if len(s) > 15:
            bullets.append(s[:300])
        if len(bullets) >= max_items:
            break
    return bullets


# ==============================================================================
# Step 4 — Progress save/load (resume support)
# ==============================================================================

def load_progress(filepath: str | None = None) -> dict:
    path = filepath or PROGRESS_FILE
    if os.path.exists(path):
        try:
            with open(path, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception:
            pass
    return {}


def save_progress(progress: dict, filepath: str | None = None):
    path = filepath or PROGRESS_FILE
    with open(path, "w", encoding="utf-8") as f:
        json.dump(progress, f, ensure_ascii=False, indent=2)


# ==============================================================================
# Step 5 — Save final outputs (CSV + JSON)
# ==============================================================================

def _build_clean_json(results: dict) -> dict:
    """Convert raw progress dict into clean knowledge-graph JSON."""
    clean = {}
    for ingredient, data in results.items():
        if data.get("status") != "done":
            continue

        groq = data.get("groq_validation") or {}
        fda = data.get("fda_data") or {}

        clean[ingredient] = {
            "ingredient": ingredient,
            "is_drug": groq.get("is_drug", False),
            "canonical_name": groq.get("canonical_name"),
            "fda_search_term": groq.get("fda_search_term"),
            "groq_confidence": groq.get("confidence", 0.0),
            "rejection_reason": groq.get("rejection_reason"),
            "fda_found": fda.get("found", False),
            "brand_names": fda.get("brand_names", []),
            "generic_names": fda.get("generic_names", []),
            "manufacturers": fda.get("manufacturers", []),
            "dosage_forms": fda.get("dosage_forms", []),
            "warnings": fda.get("warnings", []),
            "drug_interactions": fda.get("drug_interactions", []),
            "adverse_reactions": fda.get("adverse_reactions", []),
            "indications": fda.get("indications", []),
        }
    return clean


def save_outputs(results: dict):
    """
    Write the final JSON and CSV outputs and log a summary.

    The JSON file (OUTPUT_JSON) holds the full cleaned records from
    _build_clean_json. The CSV file (OUTPUT_CSV) holds one row per ingredient
    with list fields joined by " | ", per-section item counts, and the first
    item of each text section. Totals are logged at the end.
    """
    clean = _build_clean_json(results)

    with open(OUTPUT_JSON, "w", encoding="utf-8") as fh:
        json.dump(clean, fh, ensure_ascii=False, indent=2)
    log(f"JSON saved: {OUTPUT_JSON} ({len(clean):,} ingredients)")

    def _joined(rec, key):
        # List field rendered as a single " | "-separated cell.
        return " | ".join(rec.get(key, []))

    def _count(rec, key):
        return len(rec.get(key, []))

    def _head(rec, key):
        # First item of a list field, or "" when the list is empty/missing.
        return rec[key][0] if rec.get(key) else ""

    rows = [
        {
            "ingredient": name,
            "canonical_name": rec.get("canonical_name", ""),
            "is_drug": rec.get("is_drug", ""),
            "groq_confidence": rec.get("groq_confidence", ""),
            "rejection_reason": rec.get("rejection_reason", ""),
            "fda_found": rec.get("fda_found", False),
            "fda_search_term": rec.get("fda_search_term", ""),
            "brand_names": _joined(rec, "brand_names"),
            "generic_names": _joined(rec, "generic_names"),
            "manufacturers": _joined(rec, "manufacturers"),
            "dosage_forms": _joined(rec, "dosage_forms"),
            "warnings_count": _count(rec, "warnings"),
            "interactions_count": _count(rec, "drug_interactions"),
            "adverse_count": _count(rec, "adverse_reactions"),
            "indications_count": _count(rec, "indications"),
            "first_warning": _head(rec, "warnings"),
            "first_interaction": _head(rec, "drug_interactions"),
            "first_adverse": _head(rec, "adverse_reactions"),
            "first_indication": _head(rec, "indications"),
        }
        for name, rec in clean.items()
    ]

    pd.DataFrame(rows).to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
    log(f"CSV saved: {OUTPUT_CSV} ({len(rows):,} rows)")

    # Summary counts
    total = len(clean)
    drug_total = sum(bool(rec.get("is_drug")) for rec in clean.values())
    found_total = sum(bool(rec.get("fda_found")) for rec in clean.values())
    rejected_total = total - drug_total
    missing_total = drug_total - found_total

    rule = "=" * 60
    log(rule)
    log("Summary:")
    log(f"   Total processed         : {total:,}")
    log(f"   Drugs (accepted)        : {drug_total:,}")
    log(f"   Non-drugs (rejected)    : {rejected_total:,}")
    log(f"   Found in OpenFDA        : {found_total:,}")
    log(f"   Drugs not found in FDA  : {missing_total:,}")
    log(rule)


# ==============================================================================
# Main pipeline
# ==============================================================================

def run_full_pipeline(
    cleaned_csv: str | None = None,
    groq_delay: float = 0.3,
    fda_delay: float = 0.5,
    groq_batch: int = 10,
    max_ingredients: int | None = None,
    start_from: int | None = None,
    end_at: int | None = None,
    worker_name: str | None = None,
):
    """
    Full pipeline with support for splitting work across multiple workers.

    For every ingredient in the selected range the pipeline:
      1) tries validate_local(); only if that returns None does it call
         validate_with_groq()
      2) if Groq also fails, falls back to "treat as drug" (confidence 0.0)
         so that the OpenFDA lookup makes the final call
      3) queries OpenFDA for entries flagged as drugs
      4) stores the entry with status "done" and periodically checkpoints

    Args:
      cleaned_csv:     Input CSV path; defaults to CLEANED_CSV.
      groq_delay:      Seconds to sleep after each Groq call.
      fda_delay:       Seconds to sleep after each OpenFDA search.
      groq_batch:      Save progress every N newly processed ingredients
                       (values <= 0 are treated as 1).
      max_ingredients: Testing mode: process only the first N ingredients.
                       When set, start_from/end_at are ignored.
      start_from:      1-based index of the first ingredient to process.
      end_at:          1-based index of the last ingredient (inclusive).
      worker_name:     Label shown in the start banner only.

    Returns:
      dict mapping ingredient -> entry (status, ingredient, groq_validation,
      fda_data), including entries restored from the progress file.

    Examples:
      run_full_pipeline(start_from=1, end_at=1500, worker_name="Ahmed")
      run_full_pipeline(start_from=1501, end_at=3000, worker_name="Mohamed")
      run_full_pipeline(start_from=3001, end_at=4500, worker_name="Sara")
      run_full_pipeline(start_from=4501, end_at=5022, worker_name="Ali")

    Notes:
      - A per-range progress file (..._part{start}_{end}.json) is used only
        when start_from is given and max_ingredients is not; otherwise the
        shared PROGRESS_FILE is used
      - Merge outputs later using merge_progress_files()
      - start_from/end_at are 1-based indices
    """
    if cleaned_csv is None:
        cleaned_csv = CLEANED_CSV

    # Protect the checkpoint modulo below from a zero/negative batch size.
    save_every = groq_batch if groq_batch and groq_batch > 0 else 1

    log("=" * 60)
    worker_tag = f" [{worker_name}]" if worker_name else ""
    log(f"üöÄ FDA ENRICHMENT PIPELINE{worker_tag} ‚Äî START")
    log("=" * 60)

    ingredients = extract_unique_ingredients(cleaned_csv)
    total_all = len(ingredients)
    log(f"üìã Total unique ingredients: {total_all:,}")

    if max_ingredients:
        ingredients = ingredients[:max_ingredients]
        log(f"‚öôÔ∏è Testing mode: first {max_ingredients} ingredients")
    else:
        s = (start_from - 1) if start_from else 0
        # Clamp only for reporting: slicing already stops at the list end,
        # but the log should not claim a range past the last ingredient.
        e = min(end_at, total_all) if end_at else total_all
        ingredients = ingredients[s:e]
        log(f"üéØ Range: {s + 1:,} ‚Üí {e:,} ({len(ingredients):,} ingredients)")

    # The file name keeps the caller-supplied end_at (not the clamped value)
    # so that a resumed run finds the same progress file as before.
    if start_from and not max_ingredients:
        worker_progress = PROGRESS_FILE.replace(".json", f"_part{start_from}_{end_at or total_all}.json")
    else:
        worker_progress = PROGRESS_FILE
    log(f"üíæ Progress file: {worker_progress}")

    progress = load_progress(worker_progress)
    if progress:
        done = sum(1 for v in progress.values() if v.get("status") == "done")
        log(f"üîÑ Resume: {done:,} already done; continuing")

    # Only finished entries are carried over; anything else is re-processed.
    results = {k: v for k, v in progress.items() if v.get("status") == "done"}
    total = len(ingredients)
    processed = 0

    for idx, ingredient in enumerate(ingredients):
        if ingredient in results:
            continue

        log(f"[{idx + 1}/{total}] Processing: '{ingredient}'")

        entry = {
            "status": "pending",
            "ingredient": ingredient,
            "groq_validation": None,
            "fda_data": None,
        }

        groq_result = validate_local(ingredient)

        if groq_result is not None:
            # "_source" is bookkeeping for the log only; strip it before storing.
            source = groq_result.pop("_source", "local")
            log(f"  ‚úÖ Local [{source}]: is_drug={groq_result['is_drug']}")
        else:
            log(f"  ü§ñ Groq needed for: '{ingredient}'")
            groq_result = validate_with_groq(ingredient)
            time.sleep(groq_delay)

            if groq_result is None:
                log("  ‚ö†Ô∏è Groq failed ‚Äî fallback: treat as drug, let FDA decide", "WARN")
                groq_result = {
                    "input": ingredient,
                    "is_drug": True,
                    "canonical_name": ingredient,
                    "fda_search_term": ingredient,
                    "rejection_reason": None,
                    "confidence": 0.0,
                }

        entry["groq_validation"] = groq_result

        if groq_result.get("is_drug"):
            search_term = (
                groq_result.get("fda_search_term") or
                groq_result.get("canonical_name") or
                ingredient
            )

            log(f"  üîç FDA search: '{search_term}'")
            fda_result = search_openfda(search_term)
            time.sleep(fda_delay)

            if fda_result:
                log(f"  ‚úÖ FDA: found {len(fda_result.get('brand_names', []))} brand names")
                entry["fda_data"] = fda_result
            else:
                log(f"  ‚ö†Ô∏è FDA: not found for '{search_term}'")
                entry["fda_data"] = {"found": False, "search_term": search_term}
        else:
            # `or` (not a .get default) so an explicit None/"" reason also
            # falls back to the generic text instead of logging "None".
            reason = groq_result.get("rejection_reason") or "not a drug"
            log(f"  ‚ùå Groq: not a drug ({reason}) ‚Äî skip FDA")
            entry["fda_data"] = None

        entry["status"] = "done"
        results[ingredient] = entry
        processed += 1

        rotate_key_if_needed()

        if processed % save_every == 0:
            save_progress(results, worker_progress)
            log(f"  üíæ Progress saved ({processed}/{total} processed)")

    # Final checkpoint covers the tail that did not hit a batch boundary.
    save_progress(results, worker_progress)

    log("\nüíæ Saving final outputs...")
    save_outputs(results)

    log("\nüéâ Pipeline finished successfully!")
    return results


# ==============================================================================
# Test helpers (Colab)
# ==============================================================================

def test_groq(ingredient: str = "digoxin"):
    """Send one ingredient through Groq validation, print the reply as JSON, and return it."""
    print(f"\nüß™ Groq test for '{ingredient}':")
    validation = validate_with_groq(ingredient)
    pretty = json.dumps(validation, indent=2, ensure_ascii=False)
    print(pretty)
    return validation


def test_fda(search_term: str = "digoxin"):
    """Look up one search term in OpenFDA, print a short digest, and return the raw result."""
    print(f"\nüî¨ OpenFDA test for '{search_term}':")
    label = search_openfda(search_term)

    # Guard clause: nothing to summarise when the lookup came back empty.
    if not label:
        print("  ‚ùå Not found in OpenFDA")
        return label

    top_brands = label.get('brand_names', [])[:3]
    n_warnings = len(label.get('warnings', []))
    n_interactions = len(label.get('drug_interactions', []))
    n_adverse = len(label.get('adverse_reactions', []))

    print(f"  Brand names: {top_brands}")
    print(f"  Warnings:    {n_warnings} found")
    print(f"  Interactions:{n_interactions} found")
    print(f"  Adverse:     {n_adverse} found")
    return label


def test_pipeline_sample(n: int = 5):
    """Smoke-test the whole pipeline on just the first ``n`` ingredients and return its results."""
    print(f"\nüß™ Pipeline test on first {n} ingredients:")
    sample_results = run_full_pipeline(
        max_ingredients=n,
        groq_delay=1.0,
        fda_delay=0.3,
    )
    return sample_results


def show_progress_summary():
    """Print counts (processed, drugs, FDA hits, non-drugs) for whatever load_progress() returns."""
    saved = load_progress()
    if not saved:
        print("No saved progress.")
        return

    n_total = len(saved)
    n_done = 0
    n_drug = 0
    n_found = 0

    # Single pass over the entries; each counter uses the same predicate as before.
    for entry in saved.values():
        if entry.get("status") == "done":
            n_done += 1

        validation = entry.get("groq_validation", {})
        if validation and validation.get("is_drug"):
            n_drug += 1

        fda = entry.get("fda_data")
        if fda and isinstance(fda, dict) and fda.get("found"):
            n_found += 1

    # "Not drug" is everything not flagged as a drug, finished or not.
    n_not_drug = n_total - n_drug

    print("\nüìä Progress Summary:")
    print(f"   Processed : {n_done}/{n_total}")
    print(f"   Is drug   : {n_drug}")
    print(f"   FDA found : {n_found}")
    print(f"   Not drug  : {n_not_drug}")


# ==============================================================================
# Merge helpers (combine worker progress files)
# ==============================================================================

def merge_progress_files(output_json: str | None = None, output_csv: str | None = None):
    """
    Merge all partial progress files into one combined output.
    Looks for files named: fda_pipeline_progress_part*.json

    The part files are merged in sorted-name order, followed by the main
    PROGRESS_FILE when it exists. For an ingredient present in several
    files, the file merged last wins. Files that cannot be read, or whose
    JSON payload is not an object, are logged and skipped.

    Args:
      output_json: Destination for the merged JSON; defaults to OUTPUT_JSON.
      output_csv:  Destination for the merged CSV; defaults to OUTPUT_CSV.

    Returns:
      The merged dict (ingredient -> entry), or None when no progress
      files were found.
    """
    import glob

    global OUTPUT_JSON, OUTPUT_CSV

    pattern = PROGRESS_FILE.replace(".json", "_part*.json")
    part_files = sorted(glob.glob(pattern))
    main_file = PROGRESS_FILE

    all_files = part_files + ([main_file] if os.path.exists(main_file) else [])

    if not all_files:
        log("‚ùå No progress files found to merge", "ERROR")
        return

    log(f"üîÄ Merge: found {len(all_files)} file(s)")
    merged = {}

    for path in all_files:
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            # A list of pairs would be accepted by dict.update() and merged
            # as garbage; reject anything that is not a JSON object.
            if not isinstance(data, dict):
                raise ValueError(f"expected a JSON object, got {type(data).__name__}")
            before = len(merged)
            merged.update(data)
            log(
                f"  ‚úÖ {os.path.basename(path)}: {len(data):,} entries "
                f"(+{len(merged) - before:,} new)"
            )
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole merge.
            log(f"  ‚ùå Failed to read {path}: {e}", "ERROR")

    log(f"üì¶ Total after merge: {len(merged):,} ingredients")

    out_json = output_json or OUTPUT_JSON
    out_csv = output_csv or OUTPUT_CSV

    # save_outputs() takes only the data and writes to the module-level
    # OUTPUT_CSV (and presumably OUTPUT_JSON - TODO confirm in save_outputs).
    # Point those globals at the requested destinations for the duration of
    # the call so custom paths are actually honoured, then restore them.
    previous_json, previous_csv = OUTPUT_JSON, OUTPUT_CSV
    OUTPUT_JSON, OUTPUT_CSV = out_json, out_csv
    try:
        save_outputs(merged)
    finally:
        OUTPUT_JSON, OUTPUT_CSV = previous_json, previous_csv

    log(f"üéâ Merge done! JSON: {out_json} | CSV: {out_csv}")
    return merged


def show_all_workers_summary():
    """Report done / drug / FDA-found counts per worker part file, followed by grand totals."""
    import glob

    part_glob = PROGRESS_FILE.replace(".json", "_part*.json")
    worker_files = sorted(glob.glob(part_glob))

    if not worker_files:
        print("No part files. Try show_progress_summary().")
        return

    heavy_rule = "=" * 60
    print("\n" + heavy_rule)
    print("üìä Workers Summary")
    print(heavy_rule)

    sum_done = 0
    sum_drug = 0
    sum_found = 0

    for worker_file in worker_files:
        try:
            with open(worker_file, "r", encoding="utf-8") as fh:
                entries = json.load(fh)

            n_done = 0
            n_drug = 0
            n_found = 0
            for entry in entries.values():
                if entry.get("status") == "done":
                    n_done += 1
                if (entry.get("groq_validation") or {}).get("is_drug"):
                    n_drug += 1
                fda = entry.get("fda_data")
                if isinstance(fda, dict) and fda.get("found"):
                    n_found += 1

            print(f"  üìÑ {os.path.basename(worker_file)}")
            print(f"     Done: {n_done:,} | Drug: {n_drug:,} | FDA found: {n_found:,}")

            # Totals are updated only after a file was read and counted in full.
            sum_done += n_done
            sum_drug += n_drug
            sum_found += n_found
        except Exception as e:
            print(f"  ‚ùå {worker_file}: {e}")

    print("-" * 60)
    print(f"  TOTAL ‚Üí Done: {sum_done:,} | Drug: {sum_drug:,} | FDA: {sum_found:,}")
    print(heavy_rule)


# ==============================================================================
# Entry point
# ==============================================================================

if __name__ == "__main__":
    # start_from / end_at are 1-based and inclusive. None means "no bound",
    # so this call processes every ingredient. To claim a worker slice, set
    # both, e.g. start_from=1, end_at=1500.
    run_full_pipeline(start_from=None, end_at=None, worker_name="The Best Team")