In [1]:
import os
import time
import zlib
from datetime import datetime
from datetime import timedelta

import numpy as np
import pandas as pd
import requests
from dateutil.relativedelta import relativedelta

In [2]:
# Ordered column schema for flattened earthquake records.
EARTHQUAKE_COLUMNS = [
    # Feature-level identifier
    "id",

    # Keys read from each feature's "properties" object
    "mag", "place", "time", "updated",
    "tz", "url", "detail",
    "felt", "cdi", "mmi", "alert",
    "status", "tsunami", "sig",
    "net", "code", "ids", "sources", "types",
    "nst", "dmin", "rms", "gap",
    "magType", "type", "title",

    # Geometry coordinates, in the order they are unpacked
    "longitude", "latitude", "depth",
]
def earthquakes_to_dataframe(data: dict) -> pd.DataFrame:
    """Flatten a GeoJSON-style earthquake feature collection into a DataFrame.

    Each entry of ``data["features"]`` becomes one row holding the feature
    ``id``, a fixed set of keys read from its ``properties`` object, and the
    first three ``geometry.coordinates`` values as ``longitude``,
    ``latitude`` and ``depth``.

    Parameters
    ----------
    data : dict
        Decoded JSON payload. A missing or null ``features`` entry is treated
        as an empty list.

    Returns
    -------
    pd.DataFrame
        One row per feature. Keys that are absent or null come through as
        missing values. When there are no features the frame is empty and
        has no columns.
    """
    property_keys = (
        "mag", "place", "time", "updated", "tz", "url", "detail",
        "felt", "cdi", "mmi", "alert", "status", "tsunami", "sig",
        "net", "code", "ids", "sources", "types", "nst", "dmin",
        "rms", "gap", "magType", "type", "title",
    )

    rows = []
    for feature in data.get("features") or []:
        # dict.get's default only applies when the key is absent; a key that
        # is present with a JSON null still yields None, hence the `or`.
        props = feature.get("properties") or {}
        geom = feature.get("geometry") or {}

        # Pad to three entries so missing coordinates (or a missing depth)
        # become None instead of raising IndexError.
        coords = list(geom.get("coordinates") or [])
        coords += [None] * (3 - len(coords))

        row = {"id": feature.get("id")}
        row.update({key: props.get(key) for key in property_keys})
        row["longitude"], row["latitude"], row["depth"] = coords[:3]

        rows.append(row)

    return pd.DataFrame(rows)

In [3]:
def fetch_earthquakes(
        starttime: str,
        endtime: str,
        latitude: float = 34.0522,
        longitude: float = -118.2437,
        maxradiuskm: int = 100,
        timeout: float = 30.0,
):
    """
    Fetch earthquake data from the USGS API.

    starttime, endtime: ISO strings like '2024-09-01T00:00:00'
    latitude, longitude, maxradiuskm: centre and radius of the circular
        search area sent to the API.
    timeout: seconds to wait for the HTTP request before giving up.

    Returns a DataFrame of events with 'time' and 'updated' formatted as
    '%Y-%m-%d %H:%M:%S' strings and the url/tz/tsunami/detail/ids/sources/
    types columns removed. Events are sorted by magnitude (descending) and
    grouped by the minute of their 'time'; one row per minute is kept via
    GroupBy.first(). Columns whose non-null count differs from the number of
    non-null ids are dropped.

    When the query returns no events, a single all-None placeholder row with
    the (reduced) EARTHQUAKE_COLUMNS schema is returned instead.

    Raises requests.HTTPError on a 4xx/5xx response and requests.Timeout if
    the server does not answer within `timeout`.
    """
    url = "https://earthquake.usgs.gov/fdsnws/event/1/query"
    unused_columns = ['url', 'tz', 'tsunami', 'detail', 'ids', 'sources', 'types']

    params = {
        "format": "geojson",
        "starttime": starttime,
        "endtime": endtime,
        "latitude": latitude,
        "longitude": longitude,
        "maxradiuskm": maxradiuskm,
        "eventtype": "earthquake",
        "orderby": "time",
    }

    # Without a timeout requests waits indefinitely; raise_for_status surfaces
    # HTTP errors directly instead of letting an error body reach .json().
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    df = earthquakes_to_dataframe(data)
    if df.empty:
        # Placeholder row so callers can still address the expected columns
        placeholder = pd.DataFrame(
            [{col: None for col in EARTHQUAKE_COLUMNS}]
        )
        return placeholder.drop(columns=unused_columns)

    df['time'] = pd.to_datetime(df['time'], unit='ms')
    df['updated'] = pd.to_datetime(df['updated'], unit='ms')

    # The grouping key is an external Series (minute-floored time), so the
    # original 'time' column is aggregated like every other column.
    df = (
        df
        .sort_values('mag', ascending=False)
        .groupby(df['time'].dt.floor('min'), as_index=False)
        .first()
    )

    # Both columns are already datetimes here; only formatting is needed
    df['time'] = df['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
    df['updated'] = df['updated'].dt.strftime('%Y-%m-%d %H:%M:%S')

    # dropping not needed columns
    df = df.drop(columns=unused_columns)

    # Keep only columns that are populated for every row that has an id
    n = df['id'].notna().sum()
    df = df.loc[:, df.notna().sum() == n]

    return df

In [4]:
def simulate_estimated_delay_backward(
    minutes_to_departure,
    final_delay,
    reveal_bias,
    noise_rng
):
    """Simulate the delay estimate shown at a given time before departure.

    Parameters
    ----------
    minutes_to_departure : number
        Minutes until planned departure; negative once that time has passed.
    final_delay : number
        The delay the flight ends up with.
    reveal_bias : number
        Offset added to the reveal exponent before it is clipped to
        [1.2, 5.0].
    noise_rng : object
        Provides ``normal(loc, scale)``; called exactly once, and only when
        neither early exit applies.

    Returns
    -------
    float
        0.0 when ``final_delay <= 0`` or departure is more than 360 minutes
        away; otherwise the noisy estimate clipped to ``[0, final_delay]``.
    """
    horizon_minutes = 360

    # Nothing is shown without a final delay or beyond the horizon
    if final_delay <= 0 or minutes_to_departure > horizon_minutes:
        return 0.0

    # 0 at the horizon, 1 at (or after) planned departure
    progress = np.clip(1 - (minutes_to_departure / horizon_minutes), 0, 1)

    # Larger exponent => the delay is revealed later
    for ceiling, exponent in ((15, 4.0), (30, 2.5)):
        if final_delay <= ceiling:
            base_power = exponent
            break
    else:
        base_power = 1.5

    reveal_power = np.clip(base_power + reveal_bias, 1.2, 5.0)
    expected = final_delay * progress ** reveal_power

    # Noise spread shrinks to zero as progress reaches 1
    noise_scale = (1 - progress) * min(3, final_delay * 0.1)
    estimate = expected + noise_rng.normal(0, noise_scale)

    # After planned departure the estimate is at least the elapsed minutes
    if minutes_to_departure < 0:
        estimate = max(estimate, abs(minutes_to_departure))

    # Hard bounds
    return float(np.clip(estimate, 0, final_delay))


In [None]:
def save_daily_snapshots(df_flights, start_date=None):
    """Write flight, weather and seismic snapshot CSVs in an endless 30 s loop.

    Each iteration picks a simulation time ``now`` and writes up to three
    files named ``<now as %Y-%m-%d_%H-%M-%S>.csv`` into the ``flights``,
    ``weather`` and ``seismic`` directories (created if missing):

    * Flights: rows planned to depart within 6 hours of ``now`` whose actual
      departure is after ``now`` or missing. Actual_Dep,
      Actual_Dep_Timestamp, Delay and Cancelled are blanked, and an
      Estimated_Delay is simulated from the row's original Delay.
    * Weather: only for dates from 2024-09-01 to 2024-10-26; the rows of
      ``../data_historic/weather/weather_<date>.csv`` whose ``weather_time``
      equals ``now``.
    * Seismic: the result of ``fetch_earthquakes`` for ``[now, now + 29 s]``
      laid out on a 30-second grid, with empty slots kept as rows.

    Parameters
    ----------
    df_flights : pd.DataFrame
        Must provide Planned_Dep_Timestamp and Actual_Dep_Timestamp (parsed
        with ``unit="s"``), Flight_Num and Delay. It is copied, not modified.
    start_date : datetime-like or str, optional
        First simulation time; advanced by 30 seconds per iteration. When
        None, every iteration uses the wall clock minus one year and twelve
        hours.

    The loop has no exit condition, so the function never returns on its own.
    """
    flights_output_dir = "flights"
    weather_output_dir = "weather"
    seismic_output_dir = "seismic"

    os.makedirs(flights_output_dir, exist_ok=True)
    os.makedirs(weather_output_dir, exist_ok=True)
    os.makedirs(seismic_output_dir, exist_ok=True)

    df = df_flights.copy()
    df["Planned_DT"] = pd.to_datetime(df["Planned_Dep_Timestamp"], unit="s")
    df["Actual_DT"] = pd.to_datetime(df["Actual_Dep_Timestamp"], unit="s", errors="coerce")

    # Normalise once so string input works and the clock is a plain local
    sim_clock = None if start_date is None else pd.to_datetime(start_date)

    while True:
        # Date setup
        if sim_clock is None:
            now = datetime.now() - relativedelta(years=1, hours=12)
        else:
            now = sim_clock
            sim_clock = sim_clock + timedelta(seconds=30)
        today = now.date()
        filename = now.strftime("%Y-%m-%d_%H-%M-%S") + ".csv"

        # Flights
        mask = (
            (df["Planned_DT"] < now + timedelta(hours=6)) &
            (
                (df["Actual_DT"] > now) |
                (df["Actual_DT"].isna())
            )
        )
        snapshot = df[mask].copy()
        snapshot["Estimated_Delay"] = None
        snapshot["Simulation_Timestamp"] = now.strftime("%Y-%m-%d %H:%M:%S")

        for idx, row in snapshot.iterrows():
            planned = row["Planned_DT"]
            actual = row["Actual_DT"]

            if pd.isna(actual) or actual > now:
                # Hide everything that is not yet known at `now`
                snapshot.at[idx, "Actual_Dep"] = None
                snapshot.at[idx, "Actual_Dep_Timestamp"] = None
                snapshot.at[idx, "Delay"] = None
                snapshot.at[idx, "Cancelled"] = None

                minutes_to_departure = (planned - now).total_seconds() / 60

                # Built-in hash() of str/date values is salted per process,
                # so it would give different seeds after a kernel restart.
                # CRC32 of a text key is the same in every run.
                seed_key = f"{row['Flight_Num']}|{planned.date()}".encode("utf-8")
                seed = zlib.crc32(seed_key)
                rng = np.random.default_rng(seed)

                # `row` is the copy handed out by iterrows, so it still holds
                # the original Delay even though the snapshot cell was blanked.
                final_delay = row["Delay"]  # ground truth

                # Some flights reveal earlier / later
                reveal_bias = rng.normal(0, 0.4)

                # Stable noise across snapshots
                noise_rng = np.random.default_rng(seed + 1)

                est_delay = simulate_estimated_delay_backward(
                    minutes_to_departure=minutes_to_departure,
                    final_delay=final_delay,
                    reveal_bias=reveal_bias,
                    noise_rng=noise_rng
                )

                snapshot.at[idx, "Estimated_Delay"] = round(est_delay, 0)

            else:
                # Not reachable with the current mask, which already requires
                # Actual_DT to be missing or later than `now`.
                snapshot.at[idx, "Estimated_Delay"] = row["Delay"]

        filepath = os.path.join(flights_output_dir, filename)

        snapshot = snapshot.drop(columns=["Planned_DT", "Actual_DT"])
        snapshot.to_csv(filepath, index=False)

        print(f"Saved flights: {filepath}")

        # Weather
        if pd.to_datetime("2024-09-01").date() <= today <= pd.to_datetime("2024-10-26").date():
            weather_daily = pd.read_csv(f"../data_historic/weather/weather_{today}.csv")
            weather_record = weather_daily[weather_daily["weather_time"] == now.strftime("%Y-%m-%d %H:%M:%S")]

            filepath = os.path.join(weather_output_dir, filename)
            weather_record.to_csv(filepath, index=False)

            print(f"Saved weather: {filepath}")

        # Seismic
        starttime = now.strftime("%Y-%m-%dT%H:%M:%S")
        endtime = (now + timedelta(seconds=29)).strftime("%Y-%m-%dT%H:%M:%S")
        df_seismic = fetch_earthquakes(starttime, endtime)

        # Floor to the same 30-second grid used for the reindex below. With a
        # minute floor, an event in a window starting at :30 would land on
        # :00, never match the index, and be dropped.
        df_seismic['time'] = pd.to_datetime(df_seismic['time']).dt.floor('30s')
        df_seismic['updated'] = pd.to_datetime(df_seismic['updated'])

        start_dt = pd.to_datetime(starttime).floor('30s')
        end_dt = pd.to_datetime(endtime).floor('30s')

        full_intervals = pd.date_range(start=start_dt, end=end_dt, freq='30s')

        df_seismic = df_seismic.set_index('time').reindex(full_intervals)

        # Slots without an event get the slot time as their 'updated' value
        missing_mask = df_seismic['id'].isna()
        df_seismic.loc[missing_mask, 'updated'] = df_seismic.index[missing_mask]

        df_seismic = df_seismic.reset_index().rename(columns={'index': 'time'})

        df_seismic['time'] = df_seismic['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
        df_seismic['updated'] = pd.to_datetime(df_seismic['updated']).dt.strftime('%Y-%m-%d %H:%M:%S')

        df_seismic['datetime'] = pd.to_datetime(df_seismic['time'])
        filepath = os.path.join(seismic_output_dir, filename)
        df_seismic.to_csv(filepath, index=False)
        print(f"Saved seismic: {filepath}")

        time.sleep(30)

In [6]:
# Flight records handed to save_daily_snapshots in the next cell
df = pd.read_csv("flights_test.csv")

In [None]:
# Fixed starting point for the simulated clock. save_daily_snapshots loops
# without an exit condition, so this cell runs until the kernel is interrupted.
start_date = pd.to_datetime("2024-09-04 00:41:00")
save_daily_snapshots(df, start_date=start_date)

Saved flights: flights\2024-09-04_00-41-00.csv
Saved weather: weather\2024-09-04_00-41-00.csv
Saved seismic: seismic\2024-09-04_00-41-00.csv
Saved flights: flights\2024-09-04_00-41-30.csv
Saved weather: weather\2024-09-04_00-41-30.csv
Saved seismic: seismic\2024-09-04_00-41-30.csv
Saved flights: flights\2024-09-04_00-42-00.csv
Saved weather: weather\2024-09-04_00-42-00.csv
Saved seismic: seismic\2024-09-04_00-42-00.csv
Saved flights: flights\2024-09-04_00-42-30.csv
Saved weather: weather\2024-09-04_00-42-30.csv
Saved seismic: seismic\2024-09-04_00-42-30.csv
Saved flights: flights\2024-09-04_00-43-00.csv
Saved weather: weather\2024-09-04_00-43-00.csv
Saved seismic: seismic\2024-09-04_00-43-00.csv
