In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime

import sys
sys.path.append("../")
from building_era5 import post_process_final

# Data directories, resolved relative to the notebook's working directory.
DATA_ROOT = Path("../../data")
DATA_PROCESSED = DATA_ROOT / "processed"  # post-processed IBTrACS + ERA5 tables
DATA_RAW = DATA_ROOT / "raw"              # per-year IBTrACS + ERA5 extracts

In [None]:
# Load one processed IBTrACS + ERA5 export, parsing `time_stamp` as datetimes.
processed_csv = DATA_PROCESSED / "ibtracs_era5_20251217_1436.csv"
df = pd.read_csv(processed_csv, parse_dates=["time_stamp"])

# Summary statistics of the stored spatial error column.
df["era5_spatial_error_km"].describe()

In [None]:
# Coordinate ranges of the two position sources, to compare their
# conventions (e.g. longitudes in [-180, 180] versus [0, 360]).
print("=== IBTrACS ===")
print("lat:", df["lat"].min(), df["lat"].max())
print("lon:", df["lon"].min(), df["lon"].max())

# The ERA5 grid point lives in the `latitude` / `longitude` columns; reading
# `lat` / `lon` here would only repeat the IBTrACS ranges printed above.
print("\n=== ERA5 (sampled grid point) ===")
print("latitude:", df["latitude"].min(), df["latitude"].max())
print("longitude:", df["longitude"].min(), df["longitude"].max())

In [None]:
import numpy as np

def haversine_raw(lat1, lon1, lat2, lon2):
    """
    Great-circle distance in km between (lat1, lon1) and (lat2, lon2).

    Inputs are in degrees (scalars or anything NumPy can convert) and are
    used exactly as given: no longitude wrapping is applied. The haversine
    formula is evaluated on a sphere of radius 6371 km.
    """
    earth_radius_km = 6371.0
    phi1, lam1, phi2, lam2 = (
        np.radians(angle) for angle in (lat1, lon1, lat2, lon2)
    )

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    hav = (
        np.sin(half_dphi) ** 2
        + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    )
    return 2 * earth_radius_km * np.arcsin(np.sqrt(hav))


# Raw (unwrapped) distance between the IBTrACS position (`lat`, `lon`) and
# the ERA5 grid point (`latitude`, `longitude`). The second point has to come
# from the ERA5 columns: comparing the IBTrACS position with itself is
# always 0 km and hides any mismatch.
df["debug_dist_km"] = haversine_raw(
    df["lat"].values,
    df["lon"].values,
    df["latitude"].values,
    df["longitude"].values,
)

print(df["debug_dist_km"].describe())


In [None]:
# Ten rows with the largest raw distance, shown next to both coordinate pairs.
worst = df.sort_values(by="debug_dist_km", ascending=False).iloc[:10]

print(
    worst.loc[
        :,
        ["time_stamp", "basin", "lat", "lon", "latitude", "longitude", "debug_dist_km"],
    ].to_string(index=False)
)


In [None]:
def haversine_km(lat1, lon1, lat2, lon2):
    """
    Compute great-circle distance (km) between two lat/lon points.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        First point, in degrees.
    lat2, lon2 : float or array-like
        Second point, in degrees.

    Returns
    -------
    float or numpy.ndarray
        Haversine distance in km on a sphere of radius 6371 km. Array inputs
        are combined element-wise by the NumPy operations used below.
    """
    R = 6371.0  # Earth radius (km)

    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = (
        np.sin(dlat / 2.0) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    )
    # Mathematically `a` lies in [0, 1], but floating-point rounding can push
    # it marginally above 1 for (near-)antipodal points, which would make
    # sqrt/arcsin return NaN. Clamp it back into the valid domain.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    return R * c

def wrap_lon180(lon):
    """
    Wrap longitudes (degrees) into the half-open interval [-180, 180).

    The input is converted to a float NumPy array first. Because the
    interval is half-open, +180 is returned as -180.
    """
    lon_deg = np.asarray(lon, dtype=float)
    shifted = np.mod(lon_deg + 180.0, 360.0)
    return shifted - 180.0


def post_process_final(df):
    """
    Final post-processing for the merged IBTrACS + ERA5 dataset.

    Steps, applied to a copy of ``df`` (the input frame is not modified):

    1. Wrap both longitudes with ``wrap_lon180`` and store the haversine
       distance (km) between the IBTrACS position (``lat``, ``lon``) and the
       ERA5 point (``latitude``, ``longitude``) in ``era5_spatial_error_km``.
    2. If ``mean_sea_level_pressure`` is present, add
       ``mean_sea_level_pressure_hpa`` as that column divided by 100
       (Pa -> hPa).
    3. Keep only the known output columns that exist in the frame.
    4. Sort by ``time_stamp`` (when present) and reset the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``lat``, ``lon``, ``latitude`` and ``longitude``.

    Returns
    -------
    pandas.DataFrame
        New frame with the selected columns and a fresh integer index.

    Raises
    ------
    ValueError
        If any of the four coordinate columns is missing.

    Notes
    -----
    This definition shadows the ``post_process_final`` imported from
    ``building_era5`` at the top of the notebook.
    """

    df = df.copy()

    # -------------------------------------------------
    # 1. Spatial error ERA5 vs IBTrACS
    # -------------------------------------------------
    required_cols = {"lat", "lon", "latitude", "longitude"}
    missing_cols = required_cols.difference(df.columns)
    if missing_cols:
        # Name the absent columns: either coordinate pair can be the one
        # that is missing, not only the ERA5 one.
        raise ValueError(
            "Missing column(s) required for spatial error: "
            + ", ".join(sorted(missing_cols))
        )

    # Bring both sources to the same longitude convention before measuring.
    lon_ib = wrap_lon180(df["lon"].values)
    lon_era = wrap_lon180(df["longitude"].values)

    df["era5_spatial_error_km"] = haversine_km(
        df["lat"].values,
        lon_ib,
        df["latitude"].values,
        lon_era,
    )

    # -------------------------------------------------
    # 2. Pressure conversion (ERA5 Pa -> hPa)
    # -------------------------------------------------
    if "mean_sea_level_pressure" in df.columns:
        df["mean_sea_level_pressure_hpa"] = (
            df["mean_sea_level_pressure"] / 100.0
        )

    # -------------------------------------------------
    # 3. Final clean column selection
    # -------------------------------------------------
    final_columns = [
        "sid",
        "name",
        "basin",
        "season",
        "time_stamp",
        "lat",
        "lon",
        "wind",
        "pressure",                     # IBTrACS (hPa)
        "storm_speed",
        "storm_dir",
        "2m_temperature",               # ERA5 (K)
        "mean_sea_level_pressure_hpa",  # ERA5 (hPa)
        "10m_u_component_of_wind",
        "10m_v_component_of_wind",
        "era5_spatial_error_km",
    ]

    # Absent columns are skipped rather than treated as an error.
    final_columns = [c for c in final_columns if c in df.columns]
    df_final = df[final_columns].copy()

    # -------------------------------------------------
    # 4. Sort & reset index
    # -------------------------------------------------
    # `time_stamp` is optional in the selection above, so only sort when it
    # is there. Mergesort is stable: rows sharing a time stamp keep their
    # input order, which makes the output reproducible.
    if "time_stamp" in df_final.columns:
        df_final = df_final.sort_values("time_stamp", kind="mergesort")

    return df_final.reset_index(drop=True)

def debug_lon_lat_ranges(df, n=5):
    """
    Print the min/max of the IBTrACS (``lat``, ``lon``) and ERA5
    (``latitude``, ``longitude``) coordinate columns, followed by a random
    sample of at most ``n`` rows drawn with a fixed seed (0).
    """
    labelled_columns = (
        ("IBTrACS lat range:", "lat"),
        ("IBTrACS lon range:", "lon"),
        ("ERA5 lat range (latitude):", "latitude"),
        ("ERA5 lon range (longitude):", "longitude"),
    )
    for label, column in labelled_columns:
        values = df[column]
        print(label, values.min(), values.max())

    # A few example rows, to eyeball both coordinate pairs side by side
    print("\nSample rows:")
    sample_size = min(n, len(df))
    preview = df[["time_stamp", "lat", "lon", "latitude", "longitude"]].sample(
        sample_size, random_state=0
    )
    print(preview.to_string(index=False))

def debug_outliers(df_final, k=10):
    """
    Print the ``k`` rows with the largest ``era5_spatial_error_km``.

    The ERA5 ``latitude`` / ``longitude`` columns are added to the printed
    columns only when they exist in ``df_final``.
    """
    shown = ["time_stamp", "sid", "basin", "lat", "lon", "era5_spatial_error_km"]
    shown.extend(
        optional
        for optional in ("latitude", "longitude")
        if optional in df_final.columns
    )

    ranked = df_final.sort_values(by="era5_spatial_error_km", ascending=False)
    print(ranked.head(k)[shown].to_string(index=False))


In [None]:
# Build the combined table from the per-year extracts.
# DATA_RAW is defined in the configuration cell at the top of the notebook.

# Timestamped output name. It does not depend on the year, so it is built
# once here instead of on every loop iteration.
# NOTE: `output_file_name` is not used in this cell; `df_all` is not written
# to disk here.
now = datetime.now()
timestamp_str = now.strftime("%Y%m%d_%H%M")
output_file_name = f"ibtracs_era5_{timestamp_str}.csv"

dfs = []
for year in [2022, 2023, 2024]:
    file_name = f"ibtracs_era5_{year}"

    # NOTE: this rebinds `df`, the name used for the processed table in the
    # earlier cells of this notebook.
    df = pd.read_csv(
        DATA_RAW / f"{file_name}.csv",
        parse_dates=["time_stamp"],
    )

    df_final = post_process_final(df)

    dfs.append(df_final)

df_all = pd.concat(dfs, ignore_index=True)
# Stable sort: rows sharing a time stamp keep their concatenation order, so
# the combined table is reproducible from run to run.
df_all = df_all.sort_values("time_stamp", kind="mergesort").reset_index(drop=True)

df_all.describe()

Meteorological variables are extracted from ERA5 at the nearest grid point to the IBTrACS cyclone center. The resulting spatial approximation error, quantified using the haversine distance, is on the order of a few tens of kilometers, consistent with the native spatial resolution of ERA5.