# In [None]:
"""
Pipeline for Restaurant Inspection Violation Prediction.
"""

import io
from pathlib import Path

import pandas as pd
import requests

from config import RAW_DATA_DIR, PROCESSED_DATA_DIR


# URL of the NYC open-data (Socrata) resource requested by
# fetch_nyc_inspections(). Note the ".csv" suffix on the resource path.
NYC_ENDPOINT = "https://data.cityofnewyork.us/resource/43nn-pn8j.csv"


def fetch_nyc_inspections(limit: int = 50000) -> Path:
    """
    Fetch a sample of NYC restaurant inspections via Socrata endpoint.

    The response body is parsed according to its ``Content-Type`` header:
    a JSON payload is loaded as a list of records, anything else is read as
    CSV (which is what a ``.csv`` resource URL serves). The result is written
    to ``nyc_inspections_sample.csv`` inside ``RAW_DATA_DIR``, which is
    created if it does not exist.

    Args:
        limit: Maximum number of rows to request; sent as the ``$limit``
            query parameter.

    Returns:
        Path of the CSV file that was written.

    Raises:
        requests.HTTPError: If the endpoint responds with an error status.
        requests.RequestException: On connection problems or when the
            60-second timeout is exceeded.
    """
    RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

    resp = requests.get(NYC_ENDPOINT, params={"$limit": limit}, timeout=60)
    resp.raise_for_status()

    content_type = resp.headers.get("Content-Type", "").lower()
    if "json" in content_type:
        df = pd.DataFrame.from_records(resp.json())
    else:
        # Calling resp.json() on a CSV body fails, so parse the raw bytes as
        # CSV instead. dtype=str keeps every column as text, so pandas does
        # not coerce identifier-like fields into numbers.
        df = pd.read_csv(io.BytesIO(resp.content), dtype=str)

    out_path = RAW_DATA_DIR / "nyc_inspections_sample.csv"
    df.to_csv(out_path, index=False)
    return out_path


def run_pipeline():
    """
    Run the restaurant inspection violation-prediction pipeline.

    Only the download stage is wired in at the moment; the remaining stages
    are the planned outline:

    1. Pull inspection sample from NYC open data.
    2. Clean:
        - filter valid inspections
        - consolidate grades / violation codes.
    3. Engineer features:
        - cuisine type
        - prior violations
        - borough/zip
        - optional: menu risk score via external scraping.
    4. Train classification model (RF / gradient boosting).
    5. Evaluate (AUC, calibration, fairness across boroughs).
    6. Export scored dataset and aggregated risk metrics for Tableau.
    """
    # Stage 1: download the raw sample and remember where it was saved.
    raw_csv_path = fetch_nyc_inspections()

    # TODO: stages 2-6, sketched below and not yet wired in:
    #   df = pd.read_csv(raw_csv_path)
    #   clean_df = clean_inspections(df)
    #   features, target = build_features(clean_df)
    #   model, metrics = train_classifier(features, target)
    #   scored = score_restaurants(model, features)
    #
    #   PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)
    #   scored.to_csv(PROCESSED_DATA_DIR / "restaurant_risk_scores.csv", index=False)

    print("Restaurant inspection pipeline outline executed.")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_pipeline()