# Extreme Rainfalls Nowcasting (UK)

<h2 style="color:Black;">Module Code: CSMPR - MSc Project</h2>

<h2 style="color:Black;">Project Title: Predictive Modelling of Extreme Rainfalls</h2>

<h3 style="color:Black;">Student Number: <span style="color:green;">32822955</span></h3>

<h3 style="color:Black;">Acknowledgments: Supervisor - Professor. Atta Badii, Researcher - Kieran Hunt </h3>

## Setup & helpers

In [1]:

# --- Core standard libs ---
import sys, os, pathlib, traceback, glob, warnings
from pathlib import Path
from datetime import datetime, timedelta
from urllib.parse import urljoin
from typing import Dict, List, Tuple
import subprocess, shutil, site, re, time, json, math, errno, warnings, glob

# --- Third-party scientific stack ---
import numpy as np
import pandas as pd
import xarray as xr
from tqdm import tqdm

# --- Environment check ---
expected_env = "rain311"
print("🔎 Python executable:", sys.executable)

env_path = sys.executable.lower()
if expected_env not in env_path:
    print(f"⚠️ WARNING: You are NOT running inside '{expected_env}'!")
    print("   Please change kernel in the Jupyter menu: "
          "Kernel → Change Kernel → Python (rain311)")
else:
    print(f"✅ Correct kernel detected: {expected_env}")

# --- Control NumExpr threads (performance tuning) ---
try:
    import numexpr as ne
    os.environ["NUMEXPR_MAX_THREADS"] = "16"
    ne.set_num_threads(16)
    print(f"⚡ NumExpr threads set to: {ne.nthreads}")
except ImportError:
    print("⚠️ NumExpr is not installed. Please ensure it's in environment.yml")

print("✅ Startup cell finished. Ready to go!")


🔎 Python executable: C:\Users\agree\conda_envs\rain311\python.exe
✅ Correct kernel detected: rain311
⚡ NumExpr threads set to: 16
✅ Startup cell finished. Ready to go!


## Setup & helpers (updated for separate inputs)

In [2]:
# ==============================================
# Setup & Helpers (clean + consistent)
# ==============================================

import os, glob, warnings
from pathlib import Path
import xarray as xr
import numpy as np

# ----------------------------
# Project paths
# ----------------------------
BASE = Path("D:/extreme_rainfalls")
RAW  = BASE / "data" / "raw"
PROC = BASE / "data" / "processed"

# Input directories
ERA5_SINGLE_RAW = RAW / "era5_single"
ERA5_PL_RAW     = RAW / "era5_pressure"
IMERG_RAW       = RAW / "imerg"

# Processed directories
ERA5_SINGLE_30 = PROC / "era5_single_30min"
ERA5_PL_30     = PROC / "era5_pressure_30min"

FE_OUT = PROC / "feature_engineered"
FE_OUT.mkdir(parents=True, exist_ok=True)


# Standardised outputs
ERA5_STD_SINGLE = PROC / "era5_standardised" / "single"
ERA5_STD_PL     = PROC / "era5_standardised" / "pressure"
IMERG_STD       = PROC / "imerg_standardised"

for d in [ERA5_STD_SINGLE, ERA5_STD_PL, IMERG_STD]:
    d.mkdir(parents=True, exist_ok=True)
    
# --- Finalized variables ---
ERA5_SINGLE_VARS = [
    "total_precipitation","convective_precipitation","2m_temperature",
    "10m_u_component_of_wind","10m_v_component_of_wind",
    "surface_pressure","total_column_water_vapour",
    "total_column_cloud_liquid_water","boundary_layer_height","mean_sea_level_pressure"
]
ERA5_PL_LEVELS = ["850","700","500","300"]
ERA5_PL_VARS = ["specific_humidity","u_component_of_wind","v_component_of_wind",
                "geopotential","relative_humidity","temperature"]

IMERG_VARS = [
    "precipitationCal","precipitationUncal","precipitationQualityIndex",
    "randomError","probabilityLiquidPrecipitation"
]

ACCUM_VARS = {"total_precipitation", "convective_precipitation"} # hourly totals (m)

# Years in scope
YEARS       = list(range(2015, 2020))      # 2015–2025 inclusive
TRAIN_YEARS = list(range(2015, 2022))      # 2015–2021
VAL_YEARS   = [2022, 2023]
TEST_YEARS  = [2024, 2025]


In [3]:

# ----------------------------
# Normalisation helper
# ----------------------------
def normalize_dataset(ds: xr.Dataset, path: Path = None, do_transpose: bool = True) -> xr.Dataset:
    """
    Clean and standardize a dataset:
    - Ensures 'time' coordinate exists (renames valid_time / forecast_time if needed)
    - Drops ERA5 expver dimension if present
    - Standardizes lat/lon naming and ascending order
    - Normalizes ERA5 short variable names (tp → total_precipitation, etc.)
    - Optionally transposes to a consistent dimension order
    """

    # Fix time coordinate
    if "time" not in ds.coords:
        for candidate in ["valid_time", "forecast_time", "forecast_reference_time"]:
            if candidate in ds.coords:
                ds = ds.rename({candidate: "time"})
                if path: print(f"⚠️ Renamed '{candidate}' → 'time' in {path}")
                break
        else:
            raise ValueError(f"No 'time' coordinate found. Got coords: {list(ds.coords)}")

    # Drop expver if present
    if "expver" in ds.dims:
        ds = ds.isel(expver=-1, drop=True)
        if path: print(f"ℹ️ Dropped 'expver' in {path}")

    # Standardize lat/lon coords
    rename_map = {}
    if "latitude" in ds.coords:  rename_map["latitude"] = "lat"
    if "y" in ds.coords:         rename_map["y"] = "lat"
    if "longitude" in ds.coords: rename_map["longitude"] = "lon"
    if "x" in ds.coords:         rename_map["x"] = "lon"
    if rename_map:
        ds = ds.rename(rename_map)

    # Ensure ascending
    if "lat" in ds.coords and ds.lat[0] > ds.lat[-1]:
        ds = ds.reindex(lat=ds.lat[::-1])
    if "lon" in ds.coords and ds.lon[0] > ds.lon[-1]:
        ds = ds.reindex(lon=ds.lon[::-1])

    # Normalize ERA5 variable names
    var_map = {
        # ERA5 single-level
        "tp": "total_precipitation",
        "cp": "convective_precipitation",
        "t2m": "2m_temperature",
        "u10": "10m_u_component_of_wind",
        "v10": "10m_v_component_of_wind",
        "sp": "surface_pressure",
        "tcwv": "total_column_water_vapour",
        "tclw": "total_column_cloud_liquid_water",
        "blh": "boundary_layer_height",
        "msl": "mean_sea_level_pressure",
        # ERA5 pressure-level
        "q": "specific_humidity",
        "u": "u_component_of_wind",
        "v": "v_component_of_wind",
        "z": "geopotential",
        "r": "relative_humidity",
        "t": "temperature",
    }
    rename_dict = {k: v for k, v in var_map.items() if k in ds.data_vars}
    if rename_dict:
        ds = ds.rename(rename_dict)
        if path: print(f"ℹ️ Renamed variables in {path}: {rename_dict}")

    # Transpose for consistency
    if do_transpose:
        dims = list(ds.dims)
        if set(["time", "pressure_level", "lat", "lon"]).issubset(dims):
            ds = ds.transpose("time", "pressure_level", "lat", "lon")
        elif set(["time", "lat", "lon"]).issubset(dims):
            ds = ds.transpose("time", "lat", "lon")

    return ds

# ----------------------------
# Utility helpers
# ----------------------------
def find_any(path: Path, pattern: str):
    """Return first matching file, or None if none exist."""
    hits = sorted(glob.glob(str(path / pattern)))
    return hits[0] if hits else None

def print_ds_summary(ds: xr.Dataset, name: str = "Dataset"):
    """Print compact summary of variables, dimensions, and units."""
    print(f"\n📊 Summary for {name}")
    print("-" * 60)
    print(f"Dimensions: {dict(ds.dims)}")
    print(f"Coordinates: {list(ds.coords)}")
    print("-" * 60)
    for var in ds.data_vars:
        da = ds[var]
        shape = " × ".join([f"{d}={da.sizes[d]}" for d in da.dims])
        units = da.attrs.get("units", "—")
        print(f"{var:30s} | {shape:40s} | units: {units}")
    print("-" * 60)


# -----------------------------
# Helper to save engineered feature
# -----------------------------
def save_feature(ds: xr.Dataset, name: str):
    out_fp = FE_OUT / f"{name}_2015_2025.nc"
    ds.to_netcdf(out_fp, mode="w")
    print(f"✅ Saved {name} → {out_fp}")
    return out_fp

## Pre-processing

### 1. ERA5 → 30-minute timestamps (to match IMERG)

In [4]:
def _ensure_sorted(ds: xr.Dataset) -> xr.Dataset:
    """Consistent coordinate order (time asc, latitude asc, longitude asc if needed)."""
    
    # Sort by time if present
    if "time" in ds.coords:
        ds = ds.sortby("time")

    # Handle latitude naming variations
    lat_name = None
    for candidate in ["latitude", "lat", "y"]:
        if candidate in ds.coords:
            lat_name = candidate
            break

    if lat_name:
        if ds[lat_name][0] > ds[lat_name][-1]:
            ds = ds.reindex({lat_name: ds[lat_name][::-1]})
    else:
        print("⚠️ No latitude/lat/y coordinate found; skipping lat reindex.")

    # Handle longitude naming variations
    lon_name = None
    for candidate in ["longitude", "lon", "x"]:
        if candidate in ds.coords:
            lon_name = candidate
            break

    if lon_name:
        # Optional: ensure monotonic increasing (common for regridding/merging)
        if ds[lon_name][0] > ds[lon_name][-1]:
            ds = ds.reindex({lon_name: ds[lon_name][::-1]})
    else:
        print("⚠️ No longitude/lon/x coordinate found; skipping lon reindex.")

    return ds

# Pick one example ERA5 single-level file from your raw folder
sample_files = glob.glob(str(ERA5_SINGLE_RAW / "2015" / "*.nc"))
if sample_files:
    fp = sample_files[0]
    print("Inspecting file:", fp)
    ds = xr.open_dataset(fp)
    print("\nCoordinates:", list(ds.coords))
    print("\nDimensions:", ds.dims)
    print("\nVariables:", list(ds.data_vars))
    print("\nTime preview:\n", ds.coords.get("time", ds.coords.get("valid_time", "No time coord found")))
else:
    print("⚠️ No sample files found in", ERA5_SINGLE_RAW)

Inspecting file: D:\extreme_rainfalls\data\raw\era5_single\2015\era5_single_2m_temperature_201501.nc

Coordinates: ['number', 'valid_time', 'latitude', 'longitude', 'expver']


Variables: ['t2m']

Time preview:
 <xarray.DataArray 'valid_time' (valid_time: 744)> Size: 6kB
array(['2015-01-01T00:00:00.000000000', '2015-01-01T01:00:00.000000000',
       '2015-01-01T02:00:00.000000000', ..., '2015-01-31T21:00:00.000000000',
       '2015-01-31T22:00:00.000000000', '2015-01-31T23:00:00.000000000'],
      dtype='datetime64[ns]')
Coordinates:
    number      int64 8B ...
  * valid_time  (valid_time) datetime64[ns] 6kB 2015-01-01 ... 2015-01-31T23:...
    expver      (valid_time) <U4 12kB ...
Attributes:
    long_name:      time
    standard_name:  time


In [5]:
def _ensure_sorted(ds: xr.Dataset) -> xr.Dataset:
    """Consistent coordinate order (time asc, latitude asc)."""
    if "time" in ds.coords:
        ds = ds.sortby("time")

    # Handle latitude naming variations
    lat_name = None
    for candidate in ["latitude", "lat", "y"]:
        if candidate in ds.coords:
            lat_name = candidate
            break

    if lat_name:
        if ds[lat_name][0] > ds[lat_name][-1]:
            ds = ds.reindex({lat_name: ds[lat_name][::-1]})
    else:
        print("⚠️ No latitude/lat/y coordinate found; skipping lat reindex.")

    return ds


def _drop_expver(ds: xr.Dataset) -> xr.Dataset:
    if "expver" in ds.dims:
        ds = ds.isel(expver=-1, drop=True)
    return ds


In [6]:
def _inst_to_30min(da: xr.DataArray) -> xr.DataArray:
    if "time" not in da.coords:
        raise ValueError(f"{da.name} has no 'time' coordinate")
    out = da.resample(time="30min").interpolate("linear")
    out.attrs.update(da.attrs)
    return out

def _accum_to_mm_per_30min(da: xr.DataArray) -> xr.DataArray:
    if "time" not in da.coords:
        raise ValueError(f"{da.name} has no 'time' coordinate")
    mm_per_hour = da * 1000.0
    mm_per_30   = mm_per_hour.resample(time="30min").pad() / 2.0
    mm_per_30.attrs["units"] = "mm/30min"
    return mm_per_30

# ======================================================
# Step-1: Open ERA5 single-level (raw) → Resample to 30min
# ======================================================

def open_era5_single_month(year: int, month: int) -> xr.Dataset:
    """
    Open ERA5 single-level data for a given year and month.
    Applies normalization (time, lat/lon, variable names).
    """
    folder = ERA5_SINGLE_RAW / str(year) / f"{month:02d}"
    files = sorted(glob.glob(str(folder / "*.nc")))
    if not files:
        raise FileNotFoundError(f"No ERA5 single-level files found for {year}-{month:02d} in {folder}")

    ds = xr.open_mfdataset(files, combine="by_coords", decode_times=True)
    return normalize_dataset(ds, path=folder)



def resample_era5_to_30min(ds: xr.Dataset) -> xr.Dataset:
    """Resample ERA5 (hourly) data to 30-minutes."""
    # Detect accumulated vs instantaneous
    accum_vars = {"total_precipitation", "convective_precipitation"}
    vars_in = list(ds.data_vars)
    inst_vars = [v for v in vars_in if v not in accum_vars]
    acc_vars  = [v for v in vars_in if v in accum_vars]

    # Instantaneous → interpolate
    out = {}
    if inst_vars:
        out_inst = ds[inst_vars].resample(time="30min").interpolate("linear")
        out.update(out_inst)

    # Accumulated → difference
    if acc_vars:
        out_acc = ds[acc_vars].resample(time="30min").sum()
        out.update(out_acc)

    return xr.Dataset(out, coords={"time": ds.time.resample(time="30min").asfreq()})



In [7]:
# ======================================================
# Step-2: Open ERA5 pressure-level (raw) → Resample to 30min
# ======================================================

def open_era5_pl_month(year: int, month: int) -> xr.Dataset:
    """
    Open ERA5 pressure-level data for a given year and month.
    Applies normalization (time, lat/lon, variable names).
    """
    folder = ERA5_PL_RAW / str(year) / f"{month:02d}"
    files = sorted(glob.glob(str(folder / "*.nc")))
    if not files:
        raise FileNotFoundError(f"No ERA5 pressure-level files found for {year}-{month:02d} in {folder}")

    ds = xr.open_mfdataset(files, combine="by_coords", decode_times=True)
    return normalize_dataset(ds, path=folder)


def resample_era5pl_to_30min(ds: xr.Dataset) -> xr.Dataset:
    """Resample ERA5 pressure-level (hourly) data to 30-minutes."""
    out = ds.resample(time="30min").interpolate("linear")
    return out



In [8]:
# Build list of all (year, month) pairs


RUN_STEP_1 = False   # change to True only if you want to rerun

if RUN_STEP_1:
    all_months = [(y, m) for y in YEARS for m in range(1, 13)]
    for year, month in tqdm(all_months, desc="Processing ERA5 → 30min", unit="month", ncols=100):
        try:
            ds_s = open_era5_single_month(year, month)
            ds_s_30 = resample_era5_to_30min(ds_s)
            tmp_fp = ERA5_SINGLE_30 / f"era5_single_30min_{year}{month:02d}.tmp.nc"
            ds_s_30.to_netcdf(tmp_fp, mode="w")
            os.replace(tmp_fp, ERA5_SINGLE_30 / f"era5_single_30min_{year}{month:02d}.nc")
        except FileNotFoundError:
            continue
    
        try:
            ds_pl = open_era5_pl_month(year, month)
            ds_pl_30 = resample_era5_to_30min(ds_pl)
            tmp_fp = ERA5_PL_30 / f"era5_pl_30min_{year}{month:02d}.tmp.nc"
            ds_pl_30.to_netcdf(tmp_fp, mode="w")
            os.replace(tmp_fp, ERA5_PL_30 / f"era5_pl_30min_{year}{month:02d}.nc")
        except FileNotFoundError:
            continue
    print("✅ Step 1 done: ERA5 → 30min resampling complete")
else:
    print("⏭️ Step 1 skipped (already processed)")


⏭️ Step 1 skipped (already processed)


In [9]:
# =======================================================
# Step Verify h5py & HDF5 versions before IMERG usage
# =======================================================
import h5py, warnings

print("🔍 Verifying h5py + HDF5 library versions...")

try:
    print("h5py version:", h5py.__version__)
    print("HDF5 library version:", h5py.version.hdf5_version)
    
    if h5py.version.hdf5_version.startswith("1.14"):
        print("✅ HDF5 library is consistent with h5py build.")
    else:
        warnings.warn("⚠️ Potential mismatch between h5py and HDF5 runtime libraries!")
except Exception as e:
    print("❌ Failed to check h5py/HDF5 versions:", e)


🔍 Verifying h5py + HDF5 library versions...
h5py version: 3.14.0
HDF5 library version: 1.14.6
✅ HDF5 library is consistent with h5py build.


## 2. Standardisation

In [10]:
# ==========================================
# Step-2: Standardisation of datasets
# ==========================================
def normalize_dataset(ds: xr.Dataset, path: Path = None, do_transpose: bool = True) -> xr.Dataset:
    """
    Clean and standardize a dataset:
    - Ensures 'time' coordinate exists (handles IMERG special case with startTime/endTime or attrs)
    - Drops ERA5 expver dimension if present
    - Standardizes lat/lon naming and ascending order
    - Normalizes ERA5 short variable names
    """
    # ----------------------------
    # Fix time
    # ----------------------------
    if "time" not in ds.coords:
        if "valid_time" in ds.coords:
            ds = ds.rename({"valid_time": "time"})
        elif "forecast_time" in ds.coords:
            ds = ds.rename({"forecast_time": "time"})
        elif "forecast_reference_time" in ds.coords:
            ds = ds.rename({"forecast_reference_time": "time"})
        elif "time" in ds.variables:
            # Promote variable to coordinate
            ds = ds.assign_coords(time=ds["time"].values)
        elif "startTime" in ds.variables and "endTime" in ds.variables:
            # IMERG: midpoint of startTime and endTime
            start = np.array(ds["startTime"].values, dtype="datetime64[ns]")
            end   = np.array(ds["endTime"].values, dtype="datetime64[ns]")
            midpoint = start + (end - start) // 2
            ds = ds.expand_dims({"time": [midpoint]})
            if path: print(f"⏱️ Created midpoint time for IMERG {path}")
        elif "time_coverage_start" in ds.attrs and "time_coverage_end" in ds.attrs:
            # Fallback to global attributes
            t0 = np.datetime64(ds.attrs["time_coverage_start"])
            t1 = np.datetime64(ds.attrs["time_coverage_end"])
            midpoint = t0 + (t1 - t0) // 2
            ds = ds.expand_dims({"time": [midpoint]})
            if path: print(f"⏱️ Created midpoint time from attrs for IMERG {path}")
        else:
            raise ValueError(
                f"No usable 'time' found in {path}. "
                f"Coords: {list(ds.coords)}, Vars: {list(ds.variables)}, Attrs: {list(ds.attrs.keys())}"
            )

    # Drop expver if present
    if "expver" in ds.dims:
        ds = ds.isel(expver=-1, drop=True)

    # ----------------------------
    # Standardize lat/lon coords
    # ----------------------------
    rename_map = {}
    if "latitude" in ds.coords:  rename_map["latitude"] = "lat"
    if "y" in ds.coords:         rename_map["y"] = "lat"
    if "longitude" in ds.coords: rename_map["longitude"] = "lon"
    if "x" in ds.coords:         rename_map["x"] = "lon"
    if rename_map:
        ds = ds.rename(rename_map)

    # Ensure ascending
    if "lat" in ds.coords and ds.lat[0] > ds.lat[-1]:
        ds = ds.reindex(lat=ds.lat[::-1])
    if "lon" in ds.coords and ds.lon[0] > ds.lon[-1]:
        ds = ds.reindex(lon=ds.lon[::-1])

    # ----------------------------
    # Normalize ERA5 variable names (skip IMERG vars, they’re already descriptive)
    # ----------------------------
    var_map = {
        "tp": "total_precipitation",
        "cp": "convective_precipitation",
        "t2m": "2m_temperature",
        "u10": "10m_u_component_of_wind",
        "v10": "10m_v_component_of_wind",
        "sp": "surface_pressure",
        "tcwv": "total_column_water_vapour",
        "tclw": "total_column_cloud_liquid_water",
        "blh": "boundary_layer_height",
        "msl": "mean_sea_level_pressure",
        "q": "specific_humidity",
        "u": "u_component_of_wind",
        "v": "v_component_of_wind",
        "z": "geopotential",
        "r": "relative_humidity",
        "t": "temperature",
    }
    rename_dict = {k: v for k, v in var_map.items() if k in ds.data_vars}
    if rename_dict:
        ds = ds.rename(rename_dict)

    # ----------------------------
    # Safe transpose
    # ----------------------------
    if do_transpose:
        dims = list(ds.dims)
        if set(["time", "pressure_level", "lat", "lon"]).issubset(dims):
            ds = ds.transpose("time", "pressure_level", "lat", "lon")
        elif set(["time", "lat", "lon"]).issubset(dims):
            ds = ds.transpose("time", "lat", "lon")

    return ds

# --- Apply standardisation to all datasets ---

RUN_STEP_1 = False   # change to True only if you want to rerun

if RUN_STEP_1:
    all_months = [(y, m) for y in YEARS for m in range(1, 13)]
    for year in tqdm(YEARS, desc="Step-2 Standardising"):
        # ERA5 Single
        for f in sorted((ERA5_SINGLE_30).glob(f"*{year}*.nc")):
            out = ERA5_STD_SINGLE / f.name.replace("30min", "std")
            if out.exists(): continue
            ds = xr.open_dataset(f, chunks={})
            ds = normalize_dataset(ds, f)
            ds.to_netcdf(out, engine="netcdf4")
            ds.close()
    
        # ERA5 Pressure
        for f in sorted((ERA5_PL_30).glob(f"*{year}*.nc")):
            out = ERA5_STD_PL / f.name.replace("30min", "std")
            if out.exists(): continue
            ds = xr.open_dataset(f, chunks={})
            ds = normalize_dataset(ds, f)
            ds.to_netcdf(out, engine="netcdf4")
            ds.close()
    
        ## IMERG        
        files = sorted((IMERG_RAW / str(year)).rglob("*.HDF5")) + \
                sorted((IMERG_RAW / str(year)).rglob("*.nc4"))
        
        for f in tqdm(files, desc=f"IMERG Standardising {year}"):
            out = IMERG_STD / f.name.replace(".HDF5", "_std.nc").replace(".nc4", "_std.nc")
            if out.exists():
                print(f"⚠️ Skipping existing {out}")
                continue
        
            ds = None
            try:
                # --- Try opening with xarray first ---
                for engine in ["netcdf4", "h5netcdf"]:
                    try:
                        with xr.open_dataset(f, engine=engine, chunks={"time": 100}) as ds_tmp:
                            ds = ds_tmp.load()  # load into memory
                        break
                    except Exception as e:
                        print(f"⚠️ Engine {engine} failed for {f.name}: {e}")
                        ds = None
        
                # --- If still no dataset, fallback to h5py ---
                if ds is None or len(ds.data_vars) == 0:
                    import h5py
                    with h5py.File(f, "r") as h5f:
                        # IMERG main variable usually 'precipitationCal'
                        var_name = "precipitationCal" if "precipitationCal" in h5f.keys() else list(h5f.keys())[0]
                        arr = np.array(h5f[var_name])
                        lat = np.array(h5f["lat"])
                        lon = np.array(h5f["lon"])
                        ds = xr.Dataset(
                            {"precipitation": (("lat", "lon"), arr)},
                            coords={"lat": lat, "lon": lon}
                        )
        
                # --- Ensure time coordinate exists ---
                if "time" not in ds.coords:
                    m = re.search(r"(\d{8})-S(\d{6})-E(\d{6})", f.name)
                    if m:
                        date_str, s_time, e_time = m.groups()
                        start = pd.to_datetime(date_str + s_time, format="%Y%m%d%H%M%S")
                        end   = pd.to_datetime(date_str + e_time, format="%Y%m%d%H%M%S")
                        midpoint = np.datetime64(start + (end - start) / 2)
                        ds = ds.expand_dims({"time": [midpoint]})
                        print(f"⏱️ Parsed time from filename for {f.name}")
                    else:
                        print(f"⚠️ Could not parse time from {f.name}, skipping")
                        continue
        
                # --- Normalise coords/lat/lon ---
                ds = normalize_dataset(ds, f, do_transpose=True)
        
                # --- Save compressed & float32 ---
                tmp_out = out.with_suffix(".tmp.nc")
                ds.astype("float32").to_netcdf(
                    tmp_out,
                    engine="netcdf4",
                    encoding={v: {"zlib": True, "complevel": 4} for v in ds.data_vars}
                )
                os.replace(tmp_out, out)  # safe overwrite
                ds.close()
        
            except Exception as e:
                print(f"❌ Could not standardise {f.name}: {e}")
                continue
        
        
    print(" Standardisation of ERA5 & IMERG datasets completed")

else:
    print("Skipped (already standardised)")

Skipped (already standardised)


## 3: Data Quality Checks

In [11]:
# =====================================
# Step-3: Data Quality Checks (lazy/dask safe)
# =====================================
import dask
import dask.array as da
from tqdm import tqdm

def check_duplicates_and_monotonic(ds, name="Dataset"):
    """Check duplicates + monotonicity of time/lat/lon without loading full data."""
    print(f"\n🔍 Checking duplicates & monotonicity for {name} ...")
    for coord in ["time", "lat", "lon"]:
        if coord in ds.coords:
            idx = ds.indexes[coord]
            if not idx.is_monotonic_increasing:
                print(f"⚠️ {coord} not monotonic → sorting")
                ds = ds.sortby(coord)
            if idx.has_duplicates:
                print(f"⚠️ {coord} has duplicates → dropping")
                ds = ds.sel({coord: ~idx.duplicated()})
    return ds


def check_physical_ranges(ds, var_ranges: dict, name="Dataset"):
    """Clamp variable values to known physical ranges (lazy, chunked)."""
    print(f"\n🔍 Checking/clamping ranges for {name} ...")
    for var, (vmin, vmax) in var_ranges.items():
        if var in ds.data_vars:
            da_var = ds[var]
            # Clamp lazily
            ds[var] = da.clip(da_var, vmin, vmax)
            print(f"  {var}: clamped to [{vmin}, {vmax}]")
    return ds


def check_missing_data(ds, name="Dataset"):
    """Check NaNs without loading all data."""
    print(f"\n🔍 Checking missing data for {name} ...")
    for var in ds.data_vars:
        try:
            has_nan = ds[var].isnull().any().compute()
            if has_nan:
                print(f"⚠️ {var}: contains NaNs")
            else:
                print(f"  {var}: no NaNs")
        except Exception as e:
            print(f"  {var}: check failed ({e})")
    return ds


# ----------------------------
# Example variable ranges
# ----------------------------
PHYSICAL_RANGES = {
    "total_precipitation": (0, 0.5),      # m over 30 min (~1000 mm/h upper cap)
    "convective_precipitation": (0, 0.5),
    "2m_temperature": (180, 330),         # K
    "10m_u_component_of_wind": (-100, 100),
    "10m_v_component_of_wind": (-100, 100),
    "surface_pressure": (5e4, 1.1e5),     # Pa
    "specific_humidity": (0, 0.1),
    "relative_humidity": (0, 100),
    "temperature": (180, 330),            # K
}


# ----------------------------
# Run checks for each dataset
# ----------------------------
def run_quality_checks(input_dir, output_dir, label):
    files = sorted(glob.glob(str(input_dir / "*.nc")))
    for f in tqdm(files, desc=f"Step-3 Quality Checks: {label}"):
        ds = xr.open_dataset(f, chunks={"time": 100})   # lazy open
        ds = normalize_dataset(ds, f)

        ds = check_duplicates_and_monotonic(ds, name=label)
        ds = check_physical_ranges(ds, PHYSICAL_RANGES, name=label)
        ds = check_missing_data(ds, name=label)

        # Save cleaned copy (NetCDF, compressed, float32)
        out = output_dir / Path(f).name
        if not out.exists():  # safeguard: skip if already processed
            ds.astype("float32").to_netcdf(out, engine="h5netcdf", 
                                           encoding={v: {"zlib": True, "complevel": 4} for v in ds.data_vars})
        ds.close()


# ----------------------------
# Apply to ERA5 & IMERG
# ----------------------------
# ERA5 single
# run_quality_checks(ERA5_SINGLE_30, ERA5_STD_SINGLE, "ERA5-Single") - commented as the process is completed (uncommnet to rerun)

# ERA5 pressure-level
#run_quality_checks(ERA5_PL_30, ERA5_STD_PL, "ERA5-Pressure") - commented as the process is completed (uncommnet to rerun)

# IMERG
# run_quality_checks(IMERG_RAW, IMERG_STD, "IMERG") - commented as the process is completed (uncommnet to rerun)


## 4. Splits for Training/Validation/Testing

In [12]:
# ==========================================
# Step-5: Train/Val/Test Splits (ERA5 only)
# ==========================================
import pandas as pd

split_csv = PROC / "splits.csv"
split_records = []

def label_split(ts):
    """Assign split label based on year."""
    year = pd.to_datetime(ts).year
    if year in TRAIN_YEARS: return "train"
    if year in VAL_YEARS:   return "val"
    if year in TEST_YEARS:  return "test"
    return "ignore"

# --- Apply to ERA5 Single & Pressure ---
for folder in [ERA5_STD_SINGLE, ERA5_STD_PL]:
    for f in tqdm(folder.glob("*.nc"), desc=f"Step-5 Splits {folder.name}"):
        try:
            ds = xr.open_dataset(f)
            split_label = label_split(ds.time)
            split_records.append({
                "file": f.name,
                "dataset": folder.name,
                "time_start": str(ds.time.values[0]),
                "time_end": str(ds.time.values[-1]),
                "n_steps": ds.dims["time"],
                "split": split_label,
            })
            ds.close()
        except Exception as e:
            print(f"⚠️ Skipping {f}: {e}")

# Save splits summary
pd.DataFrame(split_records).to_csv(split_csv, index=False)
print(f"✅ Split ranges saved to {split_csv}")

# --- IMERG Handling ---
print("ℹ️ IMERG not split at file level (too granular).")
print("   → Use aligned years in modeling: train=2015–2016, val=2019, test=2020.")


Step-5 Splits single: 0it [00:00, ?it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_on_imerg_30min_201501.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201501.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201502.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()


Step-5 Splits single: 4it [00:00, 35.18it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201503.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201504.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201505.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()


Step-5 Splits single: 8it [00:00, 32.66it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201506.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201507.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201508.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201509.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201510.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()


Step-5 Splits single: 12it [00:00, 33.30it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201511.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201512.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()


Step-5 Splits single: 16it [00:00, 32.44it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201601.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201602.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201603.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201604.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201605.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()


Step-5 Splits single: 20it [00:00, 31.74it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201606.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201607.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()


Step-5 Splits single: 24it [00:00, 31.39it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201608.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201609.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201610.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201611.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201612.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\pr

Step-5 Splits single: 31it [00:01, 28.69it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201702.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201703.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201704.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201705.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201706.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\pr

Step-5 Splits single: 39it [00:01, 29.81it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201709.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201710.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201711.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201712.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201801.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\pr

Step-5 Splits single: 45it [00:01, 29.25it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201804.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201805.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201806.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201807.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201808.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\pr

Step-5 Splits single: 51it [00:01, 28.88it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201810.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201811.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201812.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201901.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201902.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\pr

Step-5 Splits single: 58it [00:01, 29.78it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201904.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201905.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201906.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201907.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201908.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\pr

Step-5 Splits single: 64it [00:02, 29.14it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201911.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_201912.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202001.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202002.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202003.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\pr

Step-5 Splits single: 71it [00:02, 29.73it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202006.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202007.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202008.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202009.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202010.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\pr

Step-5 Splits single: 79it [00:02, 29.99it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202101.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202102.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202103.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202104.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202105.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\pr

Step-5 Splits single: 85it [00:02, 29.40it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202108.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202109.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202110.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202111.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202112.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\pr

Step-5 Splits single: 91it [00:03, 29.33it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202202.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202203.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202204.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202205.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202206.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\pr

Step-5 Splits single: 97it [00:03, 28.25it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202208.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202209.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202210.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202211.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202212.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\pr

Step-5 Splits single: 103it [00:03, 25.97it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202302.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202303.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202304.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202305.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202306.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()


Step-5 Splits single: 109it [00:03, 27.22it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202307.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202308.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202309.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202310.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202311.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\pr

Step-5 Splits single: 115it [00:03, 27.47it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202401.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202402.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202403.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202404.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202405.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\pr

Step-5 Splits single: 118it [00:04, 26.41it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202407.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202408.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202409.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202410.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202411.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\pr

Step-5 Splits single: 125it [00:04, 27.61it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202501.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202502.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202503.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202504.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202505.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\pr

Step-5 Splits single: 131it [00:04, 26.51it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202507.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_std_202508.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201501.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201502.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201503.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()


Step-5 Splits single: 137it [00:04, 27.16it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201504.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201505.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201506.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201507.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201508.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfal

Step-5 Splits single: 143it [00:04, 27.76it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201510.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201511.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201512.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201601.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201602.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfal

Step-5 Splits single: 150it [00:05, 28.04it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201604.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201605.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201606.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201607.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201608.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfal

Step-5 Splits single: 157it [00:05, 28.50it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201611.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201612.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201701.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201702.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201703.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfal

Step-5 Splits single: 163it [00:05, 27.83it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201706.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201707.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201708.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201709.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201710.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfal

Step-5 Splits single: 170it [00:05, 27.21it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201712.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201801.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201802.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201803.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201804.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfal

Step-5 Splits single: 176it [00:06, 27.84it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201806.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201807.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201808.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201809.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201810.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfal

Step-5 Splits single: 183it [00:06, 28.99it/s]

⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201812.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201901.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201902.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201903.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201904.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfal

Step-5 Splits single: 186it [00:06, 28.55it/s]


⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201907.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201908.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
⚠️ Skipping D:\extreme_rainfalls\data\processed\era5_standardised\single\era5_single_30min_201909.nc: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()


KeyboardInterrupt: 

In [None]:
### Feture Engineering

In [13]:
# ===========================================================
# Step 1. Vertical Shear & Lapse Rates
# ===========================================================
era5_pl_files = sorted(ERA5_STD_PL.glob("*.nc"))
ds_pl = xr.open_mfdataset(era5_pl_files, combine="by_coords", parallel=True)

# Standardize dimension name
if "pressure_level" in ds_pl.dims:
    ds_pl = ds_pl.rename({"pressure_level": "level"})

# Shear: U/V difference between 850 and 500 hPa
ds_pl["shear_u_850_500"] = ds_pl["u_component_of_wind"].sel(level=850) - ds_pl["u_component_of_wind"].sel(level=500)
ds_pl["shear_v_850_500"] = ds_pl["v_component_of_wind"].sel(level=850) - ds_pl["v_component_of_wind"].sel(level=500)

# Lapse rate (temperature drop per ~3.5 km between 850–500 hPa)
ds_pl["lapse_rate_850_500"] = (ds_pl["temperature"].sel(level=850) - ds_pl["temperature"].sel(level=500)) / 3500.0


In [16]:
# ===========================================================
# Step 2. Integrated Vapour Transport (IVT)
# ===========================================================
q = ds_pl["specific_humidity"]
u = ds_pl["u_component_of_wind"]
v = ds_pl["v_component_of_wind"]

# Pressure levels in Pa
plevs = ds_pl["level"] * 100.0   # <-- use 'level' instead of 'pressure_level'
dp = np.gradient(plevs)          # (level,)

# Expand dp to broadcast with (time, level, lat, lon)
dp_xr = xr.DataArray(
    dp,
    dims=["level"],
    coords={"level": ds_pl["level"]},
)

# Compute IVT components
dx = (q * u * dp_xr).sum(dim="level") / 9.81
dy = (q * v * dp_xr).sum(dim="level") / 9.81

ivt = np.sqrt(dx**2 + dy**2)
ds_pl["IVT"] = ivt
ds_pl["IVT"].attrs["units"] = "kg m-1 s-1"

# -----------------------------
# Check the result
# -----------------------------
print_ds_summary(ds_pl, "ERA5 Pressure-Level with IVT")



📊 Summary for ERA5 Pressure-Level with IVT
------------------------------------------------------------
Dimensions: {'time': 186588, 'level': 4, 'lat': 53, 'lon': 67}
Coordinates: ['level', 'lat', 'lon', 'number', 'time']
------------------------------------------------------------
specific_humidity              | time=186588 × level=4 × lat=53 × lon=67  | units: kg kg**-1
u_component_of_wind            | time=186588 × level=4 × lat=53 × lon=67  | units: m s**-1
v_component_of_wind            | time=186588 × level=4 × lat=53 × lon=67  | units: m s**-1
geopotential                   | time=186588 × level=4 × lat=53 × lon=67  | units: m**2 s**-2
relative_humidity              | time=186588 × level=4 × lat=53 × lon=67  | units: %
temperature                    | time=186588 × level=4 × lat=53 × lon=67  | units: K
shear_u_850_500                | time=186588 × lat=53 × lon=67            | units: —
shear_v_850_500                | time=186588 × lat=53 × lon=67            | units: —
lapse_r

  print(f"Dimensions: {dict(ds.dims)}")


In [19]:
# Step 3. Anomalies (MSLP, BLH) with safe chunking
era5_single_files = sorted(ERA5_STD_SINGLE.glob("*.nc"))
ds_single = xr.open_mfdataset(
    era5_single_files,
    combine="by_coords",
    parallel=True,
    chunks={"time": 5000, "latitude": 53, "longitude": 67}  # coarser chunks
)

for v in ["mean_sea_level_pressure", "boundary_layer_height"]:
    if v in ds_single:
        # Pre-chunk to reduce warnings
        ds_var = ds_single[v].chunk({"time": -1})
        
        # Monthly climatology
        clim = ds_var.groupby("time.month").mean("time")
        
        # Subtract climatology
        ds_single[f"{v}_anom"] = ds_var.groupby("time.month") - clim

print("✅ Anomalies computed with controlled chunking")


  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(


KeyboardInterrupt: 

In [None]:
# ===========================================================
# Step 4. IMERG Persistence (rolling sums)
# ===========================================================
imerg_files = sorted(IMERG_STD.glob("*.nc"))
ds_imerg = xr.open_mfdataset(imerg_files, combine="by_coords", parallel=True)

if "precipitation_mm30" in ds_imerg:
    ds_imerg["imerg_persist_1h"] = ds_imerg["precipitation_mm30"].rolling(time=2, min_periods=1).sum()
    ds_imerg["imerg_persist_3h"] = ds_imerg["precipitation_mm30"].rolling(time=6, min_periods=1).sum()


In [None]:
# ===========================================================
# Step 5. Moisture Flux Convergence
# ===========================================================
R_earth = 6.371e6
lat = np.deg2rad(ds_pl["latitude"])
lon = np.deg2rad(ds_pl["longitude"])

uq = ds_pl["specific_humidity"].sel(level=850) * ds_pl["u_component_of_wind"].sel(level=850)
vq = ds_pl["specific_humidity"].sel(level=850) * ds_pl["v_component_of_wind"].sel(level=850)

dlon = np.gradient(lon)
dlat = np.gradient(lat)

uq_dx = (uq.diff("longitude") / dlon.mean()) / (R_earth * np.cos(lat.mean()))
vq_dy = (vq.diff("latitude") / dlat.mean()) / R_earth

div_q = uq_dx.pad({"longitude": (0,1)}) + vq_dy.pad({"latitude": (0,1)})
ds_pl["moisture_flux_conv"] = -div_q


In [None]:
# ===========================================================
# Step 6. CAPE Proxy
# ===========================================================
T = ds_pl["temperature"]
q = ds_pl["specific_humidity"]

# Potential temperature θ = T * (1000/p)**0.286
theta = T * (1000.0 / ds_pl["level"]) ** 0.286
theta_sfc = theta.sel(level=850)
theta_mid = theta.sel(level=500)

delta_theta = theta_sfc - theta_mid
cape_proxy = delta_theta * q.sel(level=850)

ds_pl["CAPE_proxy"] = cape_proxy
