“Je télécharge ERA5 sur une zone et une période définies par quelques cyclones représentatifs, puis j’extrais tous les cyclones qui passent dans cette zone.”

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from pathlib import Path

import plotly.io as pio
pio.renderers.default = "notebook"

import sys
sys.path.append("../")
from building_era5 import load_IBTrACS

import matplotlib.pyplot as plt

# IBTrACS

In [None]:
# Root folder for processed datasets; created if it does not exist yet.
PROCESSED_DIR = Path("../../data/processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

path_ibtracs = PROCESSED_DIR / "other/ibtracs_usa_20251216.csv"

# NOTE(review): whether load_IBTrACS reads `years` as a range or as an explicit
# list is not visible here; 2023 is plotted later in this notebook — confirm.
df_ibtracs = load_IBTrACS(path_ibtracs, years=[2022, 2024])

# Unparseable timestamps are coerced to NaT, then those rows are dropped.
df_ibtracs["time_stamp"] = pd.to_datetime(df_ibtracs["time_stamp"], errors="coerce")
df_ibtracs = df_ibtracs.dropna(subset=["time_stamp"])

obs_year = df_ibtracs["time_stamp"].dt.year
years = sorted(obs_year.unique())

# One bar chart per year: number of observations per basin, largest first.
for year in years:
    df_y = df_ibtracs[obs_year == year]
    counts = df_y.groupby("basin").size().sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(8, 4))
    counts.plot(kind="bar", ax=ax)
    ax.set_title(f"Number of IBTrACS observations per basin – {year}")
    ax.set_ylabel("Number of observations")
    ax.set_xlabel("Basin")
    ax.grid(axis="y")
    fig.tight_layout()
    plt.show()


In [None]:
# Number of observations recorded at each hour of the day, ordered by hour.
observation_hours = df_ibtracs["time_stamp"].dt.hour
hour_counts = observation_hours.value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8, 4))
hour_counts.plot(kind="bar", ax=ax)
ax.set_title("Distribution of observation hours (IBTrACS)")
ax.set_xlabel("Hour (UTC)")
ax.set_ylabel("Number of observations")
ax.grid(axis="y")
fig.tight_layout()
plt.show()

In [None]:
basins = sorted(df_ibtracs["basin"].unique())

# One bar chart per basin: number of observations in each calendar month.
for basin in basins:
    in_basin = df_ibtracs["basin"] == basin
    obs_month = df_ibtracs.loc[in_basin, "time_stamp"].dt.month
    month_counts = obs_month.value_counts().sort_index()

    fig, ax = plt.subplots(figsize=(8, 4))
    month_counts.plot(kind="bar", ax=ax)
    ax.set_title(f"Monthly distribution – Basin {basin}")
    ax.set_xlabel("Month")
    ax.set_ylabel("Number of observations")
    ax.grid(axis="y")
    fig.tight_layout()
    plt.show()


# Bounding Box

In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime # Importation conservée si nécessaire ailleurs

# Bounding box per basin code, stored as [north, west, south, east] in degrees
# (the order in which plot_basin_bounding_boxes_with_template unpacks it).
BASIN_BBOX = {
    "NA": [60, -100, 0, -10],
    "EP": [40, -160, 0, -80],
    "WP": [50, 100, 0, 180],
    "NI": [30, 40, -5, 100],
    "SI": [0, 20, -40, 100],
    # FIX HERE: the longitudes for SP were changed.
    # Previous: "SP": [0, 160, -40, -120],  <- crosses Greenwich, ends up in Africa.
    # New: Pacific, west to east within the Western Hemisphere (between -160 and -120).
    # NOTE(review): the new box no longer covers 160°E–160°W — confirm that the
    # narrower extent is intended.
    "SP": [0, -160, -40, -120], # South Pacific, from 160 West to 120 West
    "SA": [0, -60, -40, 20],
}

# Line and label colour per basin code. Basins missing from this mapping fall
# back to "black" in plot_basin_bounding_boxes_with_template.
BASIN_COLORS = {
    "NA": "#1f77b4",  # blue
    "EP": "#ff7f0e",  # orange
    "WP": "#2ca02c",  # green
    "NI": "#d62728",  # red
    "SI": "#9467bd",  # purple
    "SP": "#8c564b",  # brown
    "SA": "#e377c2",  # pink
}



def _bbox_label_lon(west, east):
    """Return the longitude at the horizontal centre of a bounding box.

    When ``west <= east`` this is the plain midpoint. When ``west > east`` the
    box is taken to cross the antimeridian, so the east edge is shifted by 360°
    before averaging and the result is wrapped back to at most 180°. Without
    this, a box such as west=160 / east=-120 would be labelled at 20°E instead
    of 160°W.
    """
    if west <= east:
        return (west + east) / 2

    center = (west + east + 360) / 2
    return center - 360 if center > 180 else center


def plot_basin_bounding_boxes_with_template(basin_bbox_dict):
    """Draw every basin bounding box on a world map and show the figure.

    Args:
        basin_bbox_dict: Mapping of basin code to ``[north, west, south, east]``
            in degrees. Colours are looked up in the module-level
            ``BASIN_COLORS``; unknown basins are drawn in black.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Minimal DataFrame, only used to let Plotly Express initialise the map.
    df_dummy = pd.DataFrame({
        "lon": [0],
        "lat": [0],
        "sid": ["_dummy_"],
        "name": ["dummy"]
    })

    # px.line_geo builds the figure with the desired projection and title.
    fig = px.line_geo(
        df_dummy,
        lon="lon",
        lat="lat",
        projection="natural earth",
        title="Cyclone basin bounding boxes",
    )

    # Drop the dummy trace generated by px.line_geo.
    fig.data = []

    # Add one outline trace and one label trace per basin.
    for basin, bbox in basin_bbox_dict.items():
        north, west, south, east = bbox
        color = BASIN_COLORS.get(basin, "black")

        # Rectangle corners; the first point is repeated to close the outline.
        lats = [south, north, north, south, south]
        lons = [west, west, east, east, west]

        # 1. Bounding box (lines)
        fig.add_trace(
            go.Scattergeo(
                lon=lons,
                lat=lats,
                mode="lines",
                line=dict(width=2, color=color),
                name=basin,
                hoverinfo="text",
                text=(
                    f"Basin {basin}<br>"
                    f"N: {north}°<br>S: {south}°<br>"
                    f"W: {west}°<br>E: {east}°"
                ),
            )
        )

        # 2. Label (text at the centre of the box); the longitude is computed
        #    by the helper so that antimeridian-crossing boxes are labelled
        #    inside the box rather than on the opposite side of the globe.
        fig.add_trace(
            go.Scattergeo(
                lon=[_bbox_label_lon(west, east)],
                lat=[(north + south) / 2],
                mode="text",
                text=[basin],
                textfont=dict(color=color, size=12),
                showlegend=False,
            )
        )

    # Layout tweaks: legend title, tight margins, nearest-point hover.
    fig.update_layout(
        legend_title_text="Basin",
        margin=dict(l=0, r=0, t=40, b=0),
        hovermode="closest",
    )

    fig.show()

# Run with the corrected coordinates
plot_basin_bounding_boxes_with_template(BASIN_BBOX)

# ERA5

In [None]:
# Location of the merged IBTrACS / ERA5 table
PATH = "../../data/processed/ibtracs_era5_20251217_0148.csv"

# Read the table, parsing the timestamp column as datetimes.
# NOTE(review): with read_csv defaults the string "NA" is read as NaN; if this
# file carries basin codes, confirm that "NA" entries survive the load.
df = pd.read_csv(PATH, parse_dates=["time_stamp"])

years_in_file = sorted(df["time_stamp"].dt.year.unique())
print("Dataset shape:", df.shape)
print("Years:", years_in_file)

In [None]:
# Bare expression: as the last statement of a notebook cell it renders `df`.
df

In [None]:
# Columns of `df` that summarize_year describes and check_nans_year scans for
# missing values.
METEO_VARS = [
    "10m_u_component_of_wind",
    "10m_v_component_of_wind",
    "2m_temperature",
    "mean_sea_level_pressure_hpa",
]

def summarize_year(df, year):
    """Print an overview of the rows of ``df`` whose ``time_stamp`` is in ``year``.

    Reports the number of observations, the number of distinct ``sid`` values,
    the covered period, the distribution of observations per ``sid`` and a
    ``describe()`` table of the ``METEO_VARS`` columns. Prints "No data" and
    returns early when the year has no rows.

    Args:
        df: DataFrame with a datetime ``time_stamp`` column, a ``sid`` column
            and the ``METEO_VARS`` columns.
        year: Calendar year to summarise.
    """
    in_year = df["time_stamp"].dt.year == year
    df_y = df[in_year]

    # The header is printed for both the empty and the non-empty case.
    print(f"\n===== SUMMARY {year} =====")
    if df_y.empty:
        print("No data")
        return

    stamps = df_y["time_stamp"]
    print("Observations:", len(df_y))
    print("Cyclones:", df_y["sid"].nunique())
    print("Period:", stamps.min(), "→", stamps.max())

    print("\nObservations per cyclone:")
    obs_per_cyclone = df_y.groupby("sid").size()
    print(obs_per_cyclone.describe())

    print("\nMeteorological variables:")
    # `display` is the rich-output helper available in the notebook session.
    display(df_y[METEO_VARS].describe())


In [None]:
# Print the summary for every year present in the ERA5-merged table.
era5_years = sorted(df["time_stamp"].dt.year.unique())
for year in era5_years:
    summarize_year(df, year)

In [None]:
def check_nans_year(df, year):
    """Print the fraction of missing values per ``METEO_VARS`` column for ``year``.

    Args:
        df: DataFrame with a datetime ``time_stamp`` column and the
            ``METEO_VARS`` columns.
        year: Calendar year to inspect. "No data" is printed when the year
            has no rows.
    """
    subset = df.loc[df["time_stamp"].dt.year == year]

    print(f"\nNaN check {year}")
    if not subset.empty:
        # Mean of the boolean NaN mask = share of missing values per column.
        nan_share = subset[METEO_VARS].isna().mean()
        print(nan_share)
    else:
        print("No data")


# Run the NaN check for every year present in the ERA5-merged table.
years_to_check = sorted(df["time_stamp"].dt.year.unique())
for year in years_to_check:
    check_nans_year(df, year)


In [None]:
def plot_cyclone_tracks_year(df, year):
    """Plot the cyclone tracks of one year on a world map, one line per ``sid``.

    Args:
        df: DataFrame with ``time_stamp`` (datetime), ``lon``, ``lat``, ``sid``,
            ``name``, ``wind``, ``pressure`` and the four ERA5 columns shown in
            the hover box.
        year: Calendar year to plot. When the year has no rows a message is
            printed and nothing is drawn.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    df_y = (
        df[df["time_stamp"].dt.year == year]
        .sort_values("time_stamp")
        .copy()
    )

    if df_y.empty:
        print(f"No data for {year}")
        return

    # Columns exposed to the hover template, in customdata[i] order.
    hover_columns = [
        "name",
        "time_stamp",
        "wind",
        "pressure",
        "2m_temperature",
        "mean_sea_level_pressure_hpa",
        "10m_u_component_of_wind",
        "10m_v_component_of_wind",
    ]

    # custom_data is given to Plotly Express so that each per-sid trace gets
    # only its own rows. Assigning one whole-frame array through
    # update_traces(customdata=...) would hand the same array to every trace
    # and misalign the hover values with the plotted points.
    fig = px.line_geo(
        df_y,
        lon="lon",
        lat="lat",
        color="sid",
        hover_name="name",
        projection="natural earth",
        title=f"Cyclone trajectories – {year}",
        custom_data=hover_columns,
    )

    fig.update_traces(
        line=dict(width=2),
        # MSLP is labelled hPa to match the `mean_sea_level_pressure_hpa`
        # column name — NOTE(review): confirm the stored unit.
        hovertemplate=
        "<b>Cyclone:</b> %{customdata[0]}<br>"
        "<b>Date:</b> %{customdata[1]}<br>"
        "<b>Lat:</b> %{lat:.2f}°<br>"
        "<b>Lon:</b> %{lon:.2f}°<br><br>"
        "<b>IBTrACS</b><br>"
        "Wind: %{customdata[2]} kt<br>"
        "Pressure: %{customdata[3]} hPa<br><br>"
        "<b>ERA5</b><br>"
        "2m Temp: %{customdata[4]:.1f} K<br>"
        "MSLP: %{customdata[5]:.0f} hPa<br>"
        "U10: %{customdata[6]:.1f} m/s<br>"
        "V10: %{customdata[7]:.1f} m/s"
        "<extra></extra>",
    )

    fig.update_layout(
        legend_title_text="Cyclone ID",
        margin=dict(l=0, r=0, t=40, b=0),
        hovermode="closest",
    )

    fig.show()


In [None]:
# Draw the track map for each of the three years, in chronological order.
for year in (2022, 2023, 2024):
    plot_cyclone_tracks_year(df, year)

Apparent name changes along some cyclone trajectories are not inconsistencies but result from IBTrACS aggregating multiple operational agencies and basin-specific naming conventions. Cyclone names are reused and may change when a system crosses basin boundaries, while the SID remains the unique identifier.

In [None]:
def count_ibtracs_cyclones_year(df_ibtracs, year):
    """Return the number of distinct ``sid`` values with a ``time_stamp`` in ``year``."""
    in_year = df_ibtracs["time_stamp"].dt.year == year
    return df_ibtracs.loc[in_year, "sid"].nunique()

def count_era5_cyclones_year(df_era5, year):
    """Return how many distinct ``sid`` values ``df_era5`` holds for ``year``."""
    sids_in_year = df_era5.loc[df_era5["time_stamp"].dt.year.eq(year), "sid"]
    return sids_in_year.nunique()

def count_ibtracs_obs_year(df_ibtracs, year):
    """Return the number of rows of ``df_ibtracs`` whose ``time_stamp`` is in ``year``."""
    in_year = df_ibtracs["time_stamp"].dt.year == year
    # Summing the boolean mask counts the matching rows.
    return int(in_year.sum())

def count_era5_obs_year(df_era5, year):
    """Return how many rows of ``df_era5`` have a ``time_stamp`` in ``year``."""
    rows_in_year = df_era5.loc[df_era5["time_stamp"].dt.year.eq(year)]
    return rows_in_year.shape[0]


In [None]:
def _coverage_ratio(covered, total):
    """Return ``covered / total``, or NaN when ``total`` is not positive."""
    return covered / total if total > 0 else np.nan


# One row per IBTrACS year, comparing cyclone and observation counts between
# the IBTrACS table and the ERA5-merged table.
rows = []
for year in years:
    ibtracs_cyclones = count_ibtracs_cyclones_year(df_ibtracs, year)
    era5_cyclones = count_era5_cyclones_year(df, year)
    ibtracs_obs = count_ibtracs_obs_year(df_ibtracs, year)
    era5_obs = count_era5_obs_year(df, year)

    rows.append({
        "year": year,
        "cyclones_ibtracs": ibtracs_cyclones,
        "cyclones_with_era5": era5_cyclones,
        "cyclone_coverage_ratio": _coverage_ratio(era5_cyclones, ibtracs_cyclones),
        "observations_ibtracs": ibtracs_obs,
        "observations_with_era5": era5_obs,
        "observation_coverage_ratio": _coverage_ratio(era5_obs, ibtracs_obs),
    })

df_coverage_full = pd.DataFrame(rows)
df_coverage_full


In [None]:
# Bare expression: as the last statement of a notebook cell it renders `df_ibtracs`.
df_ibtracs

In [None]:
# Bare expression: as the last statement of a notebook cell it renders `df`.
df