# In [None]:
"""
Pipeline for Global Infectious Disease Surveillance project.
"""

from pathlib import Path
import requests
import pandas as pd

from config import RAW_DATA_DIR, PROCESSED_DATA_DIR, WHO_API_KEY


# Root URL of the WHO API; fetch_who_data appends an indicator code to it.
WHO_BASE = "https://ghoapi.azureedge.net/api"


def fetch_who_data(indicator: str = "WHS9_86") -> Path:
    """
    Download one WHO indicator and store its records as a raw CSV file.

    The indicator code is appended to ``WHO_BASE`` to form the request URL.
    The records found under the ``"value"`` key of the JSON response are
    written, without the DataFrame index, to ``who_<indicator>.csv`` inside
    ``RAW_DATA_DIR``.

    indicator: WHO GHO code (the default is a placeholder).

    Returns the path of the CSV file that was written.

    Raises ``requests.HTTPError`` (via ``raise_for_status``) on an HTTP
    error status, and ``KeyError`` if the response has no ``"value"`` key.
    """
    # The output directory is created up front, before any network access.
    RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

    response = requests.get(f"{WHO_BASE}/{indicator}", timeout=60)
    response.raise_for_status()

    records = response.json()["value"]
    csv_path = RAW_DATA_DIR / f"who_{indicator}.csv"
    pd.DataFrame(records).to_csv(csv_path, index=False)
    return csv_path


def fetch_ecdc_data() -> Path:
    """
    Return the expected location of the ECDC outbreak table.

    TODO: Implement the ECDC fetch.
    Nothing is downloaded yet: the latest ECDC outbreak table has to be
    saved manually as ``ecdc_outbreaks.csv`` in ``RAW_DATA_DIR``. The path
    is returned without checking that the file exists.
    """
    # Placeholder: the file is put in place by hand, not by this function.
    manual_csv = RAW_DATA_DIR / "ecdc_outbreaks.csv"
    return manual_csv


def run_pipeline():
    """
    Run the surveillance pipeline end to end (currently a scaffold).

    Planned stages:

    1. Fetch WHO & ECDC data.
    2. Clean/normalize country codes and dates.
    3. Engineer outbreak metrics (incidence, growth rates).
    4. Fit forecasting model (Prophet) for key diseases/regions.
    5. Run anomaly detection on forecast residuals.
    6. Apply NLP keyword extraction on outbreak descriptions.
    7. Export processed datasets for Tableau (wide and long).

    Only stage 1 and the creation of the processed-data directory are
    live today; every other stage is kept below as a commented-out TODO.
    """
    # Stage 1: acquire the raw inputs.
    who_csv = fetch_who_data()
    ecdc_csv = fetch_ecdc_data()

    # Stages 2-3: cleaning + feature engineering (TODO: implement)
    #   df_who = clean_who(who_csv)
    #   df_ecdc = clean_ecdc(ecdc_csv)
    #   combined = combine_sources(df_who, df_ecdc)
    #   features = build_outbreak_features(combined)

    # Stage 4: forecasting with Prophet (TODO: implement)
    #   models, forecast_df = fit_prophet_models(features)

    # Stage 5: anomaly detection (TODO)
    #   anomalies = detect_anomalies(forecast_df)

    # Stage 6: NLP (KeyBERT) on narrative fields (TODO)
    #   keywords = extract_keywords(combined["narrative"])

    # Stage 7: export for Tableau. The output directory is created now so
    # the exports below can be enabled without further setup.
    PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)
    #   features.to_csv(PROCESSED_DATA_DIR / "outbreak_features.csv", index=False)
    #   anomalies.to_csv(PROCESSED_DATA_DIR / "outbreak_anomalies.csv", index=False)
    #   keywords.to_csv(PROCESSED_DATA_DIR / "outbreak_keywords.csv", index=False)

    print("Pipeline outline executed (fill in TODOs to make it live).")


# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_pipeline()
