# In [None]:
"""
Pipeline for US Flight Delay Prediction.
"""

from pathlib import Path
import pandas as pd

from config import RAW_DATA_DIR, PROCESSED_DATA_DIR, DOT_API_KEY, NOAA_API_KEY


def fetch_dot_data() -> Path:
    """
    Return the expected location of the DOT on-time flight CSV.

    Nothing is downloaded here; only the path is built.

    TODO: Implement the DOT download (the data can be large and is often
    fetched manually). Until then, export the monthly CSV from BTS Transtats
    and drop it into the raw data directory as ``On_Time_Reporting.csv``.
    """
    dot_csv_name = "On_Time_Reporting.csv"
    return RAW_DATA_DIR / dot_csv_name


def fetch_noaa_weather() -> Path:
    """
    Return the expected location of the airport weather CSV.

    No request is made here; only the path is built.

    TODO: Implement NOAA API calls for the relevant airports and dates.
    """
    weather_csv_name = "airport_weather.csv"
    return RAW_DATA_DIR / weather_csv_name


def run_pipeline():
    """
    Run the flight delay pipeline outline.

    What currently executes: the raw and processed data directories are
    created if missing, the two input CSV paths are resolved, and a status
    message is printed. Everything else is still a TODO.

    Planned steps:

    1. Load flight data from BTS (CSV).
    2. Load weather data from NOAA and join by airport/date/hour.
    3. Engineer features:
        - carrier, route, distance
        - holiday/season indicators
        - congestion metrics (flights per airport/hour)
    4. Split train/test, train XGBoost classifier.
    5. Evaluate performance (AUC, confusion matrix).
    6. Save:
        - modeling-ready dataset
        - feature importances
        - scored sample for Tableau.
    """
    # Make sure both data directories exist before anything reads or writes.
    for data_dir in (RAW_DATA_DIR, PROCESSED_DATA_DIR):
        data_dir.mkdir(parents=True, exist_ok=True)

    # Resolved now so the TODO steps below can consume them once implemented.
    flights_csv = fetch_dot_data()
    weather_csv = fetch_noaa_weather()

    # TODO: implement the remaining steps:
    #   flights = pd.read_csv(flights_csv)
    #   weather = pd.read_csv(weather_csv)
    #   combined = join_flights_weather(flights, weather)
    #   features, target = build_features(combined)
    #   model, metrics = train_xgboost(features, target)
    #   scored = score_sample(model, features)
    #   scored.to_csv(PROCESSED_DATA_DIR / "flights_scored.csv", index=False)

    print("Flight delay pipeline outline executed (implement TODO blocks).")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_pipeline()