In [None]:
import pandas as pd
from pathlib import Path

# Dossier des fichiers traités
PROCESSED_DIR = Path("../../data/processed")

# Fichiers à merger
paths = [
    PROCESSED_DIR / "ibtracs_era5_2024.csv",
    PROCESSED_DIR / "ibtracs_era5_2024_SP.csv",
]

# Chargement
dfs = []
for p in paths:
    if not p.exists():
        raise FileNotFoundError(f"Missing file: {p}")
    dfs.append(pd.read_csv(p, parse_dates=["time_stamp"]))

# Merge vertical
df_2024 = pd.concat(dfs, ignore_index=True)

# Optionnel : tri temporel
df_2024 = df_2024.sort_values("time_stamp").reset_index(drop=True)

# Sauvegarde finale
out_path = PROCESSED_DIR / "ibtracs_era5_2024.csv"
df_2024.to_csv(out_path, index=False)

# Sanity checks
print("[OK] Merged dataset saved")
print("Path:", out_path)
print("Shape:", df_2024.shape)
print("Cyclones:", df_2024["sid"].nunique())
print("Basins:", sorted(df_2024["basin"].unique()))
print(
    "Period:",
    df_2024["time_stamp"].min(),
    "→",
    df_2024["time_stamp"].max(),
)


[OK] Merged dataset saved
Path: ../data/processed/ibtracs_era5_2024.csv
Shape: (1308, 26)
Cyclones: 32
Basins: ['NI', 'SI', 'SP']
Period: 2024-01-01 00:00:00.000039936 → 2024-12-31 00:00:00.000039936


In [None]:
import pandas as pd
from pathlib import Path
from building_era5 import load_IBTrACS, clean_basin, sample_era5_year_fast

# -----------------------
# Paths
# -----------------------
ERA5_DIR = Path("../../data/era5_yearly_tests")
PROCESSED_DIR = Path("../data/processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

IBTRACS_PATH = PROCESSED_DIR / "ibtracs_usa_processed_20251216_164245.csv"
OUT_PATH = PROCESSED_DIR / "ibtracs_era5_2024.csv"

# -----------------------
# ERA5 files per basin
# -----------------------
era5_files = {
    "EP": ERA5_DIR / "era5_year2024_EP_n3.nc",
    "NI": ERA5_DIR / "era5_year2024_NI_n3.nc",
    "SI": ERA5_DIR / "era5_year2024_SI_n3.nc",
    "SP": ERA5_DIR / "era5_year2024_SP_n3.nc",
}

# -----------------------
# Load IBTrACS
# -----------------------
df_ib = load_IBTrACS(IBTRACS_PATH, years=[2024, 2024])
df_ib["basin"] = df_ib["basin"].apply(clean_basin)

print("IBTrACS 2024:")
print("obs:", len(df_ib))
print("cyclones:", df_ib["sid"].nunique())
print("basins:", sorted(df_ib["basin"].unique()))

# -----------------------
# Sampling per basin
# -----------------------
merged_all = []

for basin, nc_path in era5_files.items():
    print(f"\n=== Basin {basin} ===")

    if not nc_path.exists():
        print("ERA5 file missing → skipped")
        continue

    df_basin = df_ib[df_ib["basin"] == basin].copy()
    if df_basin.empty:
        print("No IBTrACS data → skipped")
        continue

    print("obs:", len(df_basin))
    print("cyclones:", df_basin["sid"].nunique())

    df_merged = sample_era5_year_fast(nc_path, df_basin)
    merged_all.append(df_merged)

# -----------------------
# Final merge + save
# -----------------------
if not merged_all:
    raise RuntimeError("No basin produced data")

df_2024 = (
    pd.concat(merged_all, ignore_index=True)
      .sort_values("time_stamp")
      .reset_index(drop=True)
)

df_2024.to_csv(OUT_PATH, index=False)

# -----------------------
# Sanity checks
# -----------------------
print("\n[OK] Final dataset saved")
print("Path:", OUT_PATH)
print("Shape:", df_2024.shape)
print("Cyclones:", df_2024["sid"].nunique())
print("Basins:", sorted(df_2024["basin"].unique()))
print(
    "Period:",
    df_2024["time_stamp"].min(),
    "→",
    df_2024["time_stamp"].max(),
)


Période retenue: 2024 → 2024 | obs: 4465 | cyclones: 94 | basins: ['EP', 'NA', 'NI', 'SI', 'SP', 'WP']
IBTrACS 2024:
obs: 4465
cyclones: 94
basins: ['EP', 'NA', 'NI', 'SI', 'SP', 'WP']

=== Basin EP ===
obs: 703
cyclones: 15


Sampling ERA5: 100%|██████████| 481/481 [00:01<00:00, 433.77it/s]



=== Basin NI ===
obs: 219
cyclones: 4


Sampling ERA5: 100%|██████████| 219/219 [00:00<00:00, 477.85it/s]



=== Basin SI ===
obs: 876
cyclones: 20


Sampling ERA5: 100%|██████████| 735/735 [00:01<00:00, 447.46it/s]



=== Basin SP ===
obs: 213
cyclones: 9


Sampling ERA5: 100%|██████████| 186/186 [00:00<00:00, 466.62it/s]



[OK] Final dataset saved
Path: ../data/processed/ibtracs_era5_2024.csv
Shape: (2011, 26)
Cyclones: 47
Basins: ['EP', 'NI', 'SI', 'SP']
Period: 2024-01-01 00:00:00.000039936 → 2024-12-31 00:00:00.000039936


In [2]:
import pandas as pd
from pathlib import Path
from datetime import datetime

# -------------------------------------------------
# Paths
# -------------------------------------------------
DATA_DIR = Path("../../data/processed")

files = [
    DATA_DIR / "ibtracs_era5_2022.csv",
    DATA_DIR / "ibtracs_era5_2023.csv",
    DATA_DIR / "ibtracs_era5_2024.csv",
]

today = datetime.today().strftime("%Y%m%d")
out_path = DATA_DIR / f"ibtracs_era5_{today}.csv"

# -------------------------------------------------
# Load & concatenate
# -------------------------------------------------
dfs = []

for f in files:
    print(f"Loading {f.name}")
    df = pd.read_csv(f, parse_dates=["time_stamp"])
    dfs.append(df)

df_all = pd.concat(dfs, ignore_index=True)

# -------------------------------------------------
# Basic sanity checks
# -------------------------------------------------
print("\n=== Dataset summary ===")
print("Rows:", len(df_all))
print("Cyclones:", df_all["sid"].nunique())
print("Years:", sorted(df_all["time_stamp"].dt.year.unique()))

# -------------------------------------------------
# Sort & save
# -------------------------------------------------
df_all = df_all.sort_values("time_stamp")

df_all.to_csv(out_path, index=False)

print(f"\n[OK] Merged dataset saved to: {out_path}")

Loading ibtracs_era5_2022.csv
Loading ibtracs_era5_2023.csv
Loading ibtracs_era5_2024.csv

=== Dataset summary ===
Rows: 12911
Cyclones: 230
Years: [np.int32(2022), np.int32(2023), np.int32(2024)]

[OK] Merged dataset saved to: ../../data/processed/ibtracs_era5_20251216.csv
