In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime

def haversine_km(lat1, lon1, lat2, lon2):
    """
    Compute great-circle distance (km) between two lat/lon points.

    All inputs are in decimal degrees and may be scalars or array-likes;
    they are combined element-wise following NumPy broadcasting rules.

    Longitudes may use either the -180..180 or the 0..360 convention (or a
    mix of both): only sin^2(dlon / 2) enters the formula, and that term is
    unchanged when dlon shifts by 360 degrees.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude / longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude / longitude of the second point(s), in degrees.

    Returns
    -------
    float or numpy.ndarray
        Distance(s) in kilometres on a sphere of radius 6371 km.
        NaN inputs yield NaN outputs.
    """
    R = 6371.0  # Earth radius (km)

    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = (
        np.sin(dlat / 2.0) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    )
    # Mathematically 0 <= a <= 1, but floating-point rounding can push it a
    # few ulps past 1 for (near-)antipodal points, which would make arcsin
    # return NaN. Clamp before taking the inverse sine.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    return R * c


def post_process_final(df: pd.DataFrame) -> pd.DataFrame:
    """
    Final post-processing for IBTrACS + ERA5 dataset.

    - Compute ERA5 spatial approximation error (km)
    - Convert ERA5 MSLP from Pa → hPa
    - Keep clean and consistent columns
    - Sort chronologically (ties broken by storm id) and reset the index
    - Safe against pandas chained assignment warnings (works on copies)

    Parameters
    ----------
    df : pandas.DataFrame
        Merged track table. Must contain the columns ``lat``, ``lon``,
        ``latitude`` and ``longitude``.
        NOTE(review): presumably ``lat``/``lon`` is the IBTrACS position and
        ``latitude``/``longitude`` the matched ERA5 grid point — confirm
        against the merge step that builds this table.

    Returns
    -------
    pandas.DataFrame
        New frame (the input is not modified) restricted to the known
        output columns that are present, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If any of the four coordinate columns is missing; the message lists
        the missing names.
    """

    df = df.copy()  # never mutate the caller's frame

    # -------------------------------------------------
    # 1. Spatial error ERA5 vs IBTrACS
    # -------------------------------------------------
    required_cols = {"lat", "lon", "latitude", "longitude"}
    missing_cols = sorted(required_cols.difference(df.columns))
    if missing_cols:
        # Keep the historical message as a prefix, but say exactly which
        # columns are absent so the caller does not have to guess.
        raise ValueError(
            "Missing ERA5 latitude/longitude for spatial error"
            f" (missing columns: {missing_cols})"
        )

    df["era5_spatial_error_km"] = haversine_km(
        df["lat"].to_numpy(),
        df["lon"].to_numpy(),
        df["latitude"].to_numpy(),
        df["longitude"].to_numpy(),
    )

    # -------------------------------------------------
    # 2. Pressure conversion (ERA5 Pa → hPa)
    # -------------------------------------------------
    if "mean_sea_level_pressure" in df.columns:
        df["mean_sea_level_pressure_hpa"] = (
            df["mean_sea_level_pressure"] / 100.0
        )

    # -------------------------------------------------
    # 3. Final clean column selection
    # -------------------------------------------------
    final_columns = [
        "sid",
        "name",
        "basin",
        "season",
        "time_stamp",
        "lat",
        "lon",
        "wind",
        "pressure",                     # IBTrACS (hPa)
        "storm_speed",
        "storm_dir",
        "2m_temperature",               # ERA5 (K)
        "mean_sea_level_pressure_hpa",  # ERA5 (hPa)
        "10m_u_component_of_wind",
        "10m_v_component_of_wind",
        "era5_spatial_error_km",
    ]

    # Optional columns are silently skipped when absent from the input.
    final_columns = [c for c in final_columns if c in df.columns]
    df_final = df[final_columns].copy()

    # -------------------------------------------------
    # 4. Sort & reset index
    # -------------------------------------------------
    # Sort by time, then storm id, so rows sharing a timestamp (concurrent
    # storms) always come out in the same order. Only keys that survived the
    # column selection are used; a stable algorithm is requested explicitly
    # because pandas honours `kind` only for single-key sorts.
    sort_keys = [c for c in ("time_stamp", "sid") if c in df_final.columns]
    if sort_keys:
        df_final = df_final.sort_values(sort_keys, kind="mergesort")

    return df_final.reset_index(drop=True)


Meteorological variables are extracted from ERA5 at the grid point nearest to the IBTrACS cyclone center. The resulting spatial approximation error, measured as the haversine (great-circle) distance between the cyclone center and that grid point, is on the order of a few tens of kilometers, consistent with the native spatial resolution of ERA5.

In [None]:
# Input / output folders, relative to the notebook's working directory.
DATA_RAW = Path("../../data/raw/")
DATA_PROCESSED = Path("../../data/processed/")

# Base name (without extension) of the merged IBTrACS + ERA5 extract to process.
file_name = "ibtracs_era5_2022"

# Tag the output with the current date and minute so runs started in
# different minutes write to different files.
now = datetime.now()
timestamp_str = now.strftime("%Y%m%d_%H%M")
output_file_name = f"ibtracs_era5_{timestamp_str}.csv"

df = pd.read_csv(
    DATA_RAW / f"{file_name}.csv",
    parse_dates=["time_stamp"],
)

df_final = post_process_final(df)

# to_csv does not create missing folders, so make sure the target exists
# before writing (no-op when it is already there).
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
df_final.to_csv(
    DATA_PROCESSED / output_file_name,
    index=False,
)

In [None]:
# Show the post-processed frame as the cell output for a quick visual check.
df_final

Récupérer les températures océaniques

In [None]:
# Folder holding the timestamped post-processed CSV exports.
PROCESSED_DIR = Path("../../data/processed")

# Exports to merge, listed in load order.
files = [
    PROCESSED_DIR / csv_name
    for csv_name in (
        "ibtracs_era5_20251217_0007.csv",
        "ibtracs_era5_20251217_0058.csv",
        "ibtracs_era5_20251217_0145.csv",
    )
]

# Read every export, announcing each file before it is parsed.
dfs = []
for f in files:
    print(f"Loading {f.name}")
    df = pd.read_csv(f, parse_dates=["time_stamp"])
    dfs.append(df)

# Concatenate, then order rows by time stamp and storm id with a clean index.
df_final = (
    pd.concat(dfs, ignore_index=True)
    .sort_values(by=["time_stamp", "sid"])
    .reset_index(drop=True)
)

# Save the merged dataset.
out_path = PROCESSED_DIR / "ibtracs_era5_final_2022_2024.csv"
df_final.to_csv(out_path, index=False)

# Short summary of what was written.
first_ts = df_final["time_stamp"].min()
last_ts = df_final["time_stamp"].max()
print("\n[OK] Dataset final sauvegardé :", out_path)
print("Shape:", df_final.shape)
print("Cyclones:", df_final["sid"].nunique())
print("Period:", first_ts, "→", last_ts)






