
> **This notebook applies the HyMoLAP rainfall–runoff model to CAMELS-FR catchments with at most 10% missing discharge data over 2000–2021 and stores the resulting calibrated parameters and performance metrics in `HyMoLAP_Simulation_Data_CAMELS_FR.csv`.**

**Author:** Lionel Cedric Gohouede

# IMPORT LIBRARIES

In [1]:
# Import libraries
import numpy as np
import pandas as pd
from numba import njit
import math
import xarray as xr
import matplotlib.pyplot as plt
from google.colab import drive
from matplotlib.dates import DateFormatter
from scipy.optimize import minimize

## USE DATA FROM PYTHON

In [3]:
pip install aqua-fetch

Collecting aqua-fetch
  Downloading aqua_fetch-1.0.1-py3-none-any.whl.metadata (1.7 kB)
Downloading aqua_fetch-1.0.1-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.1/289.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: aqua-fetch
Successfully installed aqua-fetch-1.0.1


In [4]:
# Create the CAMELS-FR dataset handle. In a fresh environment this downloads and
# unzips the archive (roughly 370 MB) into the aqua_fetch package data directory.
from aqua_fetch import RainfallRunoff

rr = RainfallRunoff("CAMELS_FR")



downloading 6 files to /usr/local/lib/python3.12/dist-packages/aqua_fetch/data/CAMELS/CAMELS_FR
downloading ADDITIONAL_LICENSES.zip
0% of 0.07 MB downloaded
100% of 0.07 MB downloaded
downloading CAMELS_FR_attributes.zip
0% of 9.88 MB downloaded
100% of 9.88 MB downloaded
downloading CAMELS_FR_geography.zip
0% of 1.45 MB downloaded
100% of 1.45 MB downloaded
downloading CAMELS_FR_time_series.zip
0% of 361.39 MB downloaded
20% of 361.39 MB downloaded
40% of 361.39 MB downloaded
60% of 361.39 MB downloaded
80% of 361.39 MB downloaded
100% of 361.39 MB downloaded
downloading README.md
0% of 0.01 MB downloaded
100% of 0.01 MB downloaded
downloading CAMELS-FR_description.ods
0% of 0.05 MB downloaded
100% of 0.05 MB downloaded
unzipping files in /usr/local/lib/python3.12/dist-packages/aqua_fetch/data/CAMELS/CAMELS_FR
unzipping CAMELS_FR_time_series.zip to CAMELS_FR_time_series
unzipping CAMELS_FR_geography.zip to CAMELS_FR_geography
unzipping CAMELS_FR_attributes.zip to CAMELS_FR_attributes


In [5]:
# Read all stations. `ds` is an xarray.Dataset with one data variable per station,
# each indexed by (time, dynamic_features). `meta` is not used in the cells shown here.
meta, ds = rr.fetch()

Read 654 stations for 22 dyn features in 48.47 seconds with 2 cpus.


In [6]:
# Inspect the dataset layout: stations are data variables, features are a coordinate.
print(ds)

<xarray.Dataset> Size: 2GB
Dimensions:           (time: 18993, dynamic_features: 22)
Coordinates:
  * time              (time) datetime64[ns] 152kB 1970-01-01 ... 2021-12-31
  * dynamic_features  (dynamic_features) object 176B 'q_cms_obs' ... 'airtemp...
Data variables: (12/654)
    A105003001        (time, dynamic_features) float64 3MB 1.65e+03 ... 13.4
    A107020001        (time, dynamic_features) float64 3MB nan nan ... 4.1 13.3
    A112020001        (time, dynamic_features) float64 3MB 1.04e+03 ... 13.4
    A116003002        (time, dynamic_features) float64 3MB nan nan ... 4.3 13.3
    A140202001        (time, dynamic_features) float64 3MB nan nan ... 6.8 12.4
    A202030001        (time, dynamic_features) float64 3MB nan nan ... 7.1 13.2
    ...                ...
    Y661401001        (time, dynamic_features) float64 3MB 860.0 0.441 ... 14.3
    Y781000101        (time, dynamic_features) float64 3MB nan nan ... 6.4 15.8
    Y862000101        (time, dynamic_features) float64 3MB 

In [7]:
# List the available dynamic feature names; discharge, precipitation and PET
# are selected from these by name in the calibration loop.
print(ds.dynamic_features.values)

['q_cms_obs' 'q_mm_obs' 'tsd_val_s' 'tsd_val_q' 'tsd_val_m' 'tsd_val_c'
 'tsd_val_i' 'pcp_mm' 'pcp_mm_solfrac' 'airtemp_C_mean' 'pet_mm_ou'
 'pet_mm_pe' 'pet_mm_pm' 'windspeed_mps' 'spechum_gkg' 'lwdownrad_wm2'
 'solrad_wm2' 'tsd_swi_gr' 'tsd_swi_isba' 'tsd_swe_isba' 'airtemp_C_min'
 'airtemp_C_max']


In [8]:
# First and last timestamps of the full record (the calibration cell later
# restricts the analysis to 2000-2021).
print(ds["time"].values[0], ds["time"].values[-1])

1970-01-01T00:00:00.000000000 2021-12-31T00:00:00.000000000


### General

**Functions**

In [10]:
# ============================================
# 1. COMPILED HYMOLAP MODEL & FAST METRICS
# Defined once, outside the station loop, so Numba compiles each function
# only on its first call instead of on every iteration.
# ============================================
@njit
def HyMoLAP_Model(params, Q0, Pn):
    """Simulate a discharge series from net rainfall with the HyMoLAP recursion.

    Parameters
    ----------
    params : sequence of two floats
        Unpacked as ``(MU, LAMBDA)``.
    Q0 : float
        Value assigned to the first simulated step.
    Pn : 1-D array of floats
        Net rainfall series; its length sets the length of the output.

    Returns
    -------
    numpy.ndarray
        Simulated discharge with ``len(Pn)`` entries. Every step after the
        first is clamped at zero. The array is all-NaN when ``MU`` or
        ``LAMBDA`` is not strictly positive, and empty when ``Pn`` is empty.
    """
    MU, LAMBDA = params
    N = len(Pn)

    # Non-positive parameters are rejected up front (LAMBDA is a divisor below).
    if LAMBDA <= 0.0 or MU <= 0.0:
        return np.full(N, np.nan)

    Q = np.zeros(N)

    # Nothing to simulate for an empty forcing series. Without this guard the
    # assignment to Q[0] would index past the end of a zero-length array.
    if N == 0:
        return Q

    Q[0] = Q0
    X_curr = 0.0  # internal state driven by net rainfall, starts empty

    # Constants of the recursion, computed once outside the time loop.
    mu_lam = MU / LAMBDA
    inv_lam = 1.0 / LAMBDA
    one_m_mu_lam = 1.0 - mu_lam
    pow_term = 2.0 * MU - 1.0

    for t in range(N - 1):
        # The power term needs a non-negative base (Q0 itself is not clamped).
        q_base = max(0.0, Q[t])
        Q[t+1] = max(0.0, Q[t] - mu_lam * (q_base ** pow_term) + inv_lam * X_curr * Pn[t])

        # State update: decay on a dry step, accumulate on a wet step.
        if Pn[t+1] == 0.0:
            X_curr = X_curr * one_m_mu_lam
        else:
            X_curr = X_curr + mu_lam * Pn[t+1]

    return Q

# --------------------------------------------
# ZERO-ALLOCATION COMPILED METRICS
# --------------------------------------------
@njit
def NSE(obs, sim):
    """Nash-Sutcliffe efficiency over the steps where both series are non-NaN.

    Returns NaN when no step has both values present, or when the retained
    observations have zero spread around their mean.
    """
    length = len(obs)

    # Sweep 1: mean of the observations on jointly valid steps.
    obs_total = 0.0
    n_valid = 0
    for k in range(length):
        if np.isnan(obs[k]) or np.isnan(sim[k]):
            continue
        obs_total += obs[k]
        n_valid += 1

    if n_valid == 0:
        return np.nan

    obs_mean = obs_total / n_valid

    # Sweep 2: squared model error against squared deviation from the mean.
    err_ss = 0.0
    spread_ss = 0.0
    for k in range(length):
        if np.isnan(obs[k]) or np.isnan(sim[k]):
            continue
        err_ss += (sim[k] - obs[k])**2
        spread_ss += (obs[k] - obs_mean)**2

    if spread_ss == 0.0:
        return np.nan

    return 1.0 - (err_ss / spread_ss)

@njit
def RMSE(obs, sim):
    """Root-mean-square error over the steps where both series are non-NaN.

    Returns NaN when no step has both values present.
    """
    err_ss = 0.0
    n_valid = 0

    for k in range(len(obs)):
        # Skip any step where either value is missing.
        if np.isnan(obs[k]) or np.isnan(sim[k]):
            continue
        err_ss += (sim[k] - obs[k])**2
        n_valid += 1

    if n_valid == 0:
        return np.nan

    return np.sqrt(err_ss / n_valid)

def objective(params, Q0, Pn_train, Q_obs_train):
    """Cost minimised during calibration: 1 - NSE of the simulated discharge.

    Returns the fixed penalty 1e9 when the NSE is not finite, so the optimiser
    always receives a finite number.
    """
    score = NSE(Q_obs_train, HyMoLAP_Model(params, Q0, Pn_train))
    if not np.isfinite(score):
        return 1e9
    return 1.0 - score

## **Main Code**

In [11]:
# Work on the 2000-2021 window only and materialise it in memory once, so the
# per-station selections inside the loop operate on in-memory arrays.
ds_recent = ds.sel(time=slice("2000-01-01", "2021-12-31")).load()

all_stations = list(ds_recent.keys())

b1_ratio = 0.7           # leading fraction of the window used for calibration
max_missing_ratio = 0.1  # stations with a larger share of NaN discharge are skipped
results = {}

# Optimiser settings do not depend on the station, so build them once.
# Bounds are (MU, LAMBDA); both lower bounds are > 0, which keeps the search
# away from the region where HyMoLAP_Model returns NaN.
param_bounds = [(0.5, 10.0), (1e-3, 1000.0)]

# Starting points for Nelder-Mead. Add more entries for a multi-start search;
# the best result across all starts is kept.
initial_guesses = [
    [1.1, 20.0]
]

for i, station_id in enumerate(all_stations, 1):
    print(f"\n=== Station {station_id} ===, Number = {i}")

    station = ds_recent[station_id]

    # Extract arrays directly
    Q_obs_raw = station.sel(dynamic_features="q_cms_obs").to_numpy()

    # NOTE(review): despite the "cms" in the feature name, the raw values look
    # like L/s (the dataset preview shows 860.0 next to 0.441 mm for one
    # station). Under that assumption /1000 converts to m3/s - confirm against
    # the installed aqua-fetch version before changing this.
    Q_obs = Q_obs_raw / 1000.0

    P   = station.sel(dynamic_features="pcp_mm").to_numpy()
    PET = station.sel(dynamic_features="pet_mm_pm").to_numpy()

    # Net rainfall: precipitation minus PET, floored at zero
    Pn = np.maximum(0.0, P - PET)
    N = len(Q_obs)

    if N == 0 or np.all(np.isnan(Q_obs)):
        print("⚠️ Station ignored (no valid data).")
        continue

    missing_count = np.sum(np.isnan(Q_obs))
    missing_ratio = missing_count / N

    if missing_ratio > max_missing_ratio:
        print(f"⚠️ Too many missing values ({missing_ratio*100:.1f}%)")
        continue

    # A NaN in the forcing turns the model's internal state into NaN for every
    # later step, so the simulation after that point is meaningless. Skip the
    # station explicitly instead of calibrating on a corrupted series.
    if np.isnan(Pn).any():
        print("⚠️ Station ignored (missing precipitation/PET values).")
        continue

    b1 = int(N * b1_ratio)

    # Initial discharge: first non-NaN observation. At least one exists here,
    # because all-NaN stations were skipped above.
    Q0 = Q_obs[np.flatnonzero(~np.isnan(Q_obs))[0]]

    # Slice the calibration window once, outside the optimiser's objective calls
    Pn_train = Pn[:b1]
    Q_obs_train = Q_obs[:b1]

    # ============================================
    # Optimization (HyMoLAP uses 2 parameters)
    # ============================================
    best_res = None
    best_val = float("inf")

    for guess in initial_guesses:
        res = minimize(
            objective,
            np.array(guess), # Pass as numpy array for Numba compatibility
            args=(Q0, Pn_train, Q_obs_train),
            method="Nelder-Mead",
            bounds=param_bounds,
            options={'maxiter': 2500, 'disp': False}
        )
        if res.fun < best_val:
            best_val = res.fun
            best_res = res

    if best_res is None:
        print("⚠️ Station ignored (calibration returned no usable result).")
        continue

    # Unpack only 2 parameters for HyMoLAP
    MU, LAMBDA = best_res.x

    # ============================================
    # Validation
    # ============================================
    # One simulation over the whole window; the first b1 steps are the
    # calibration period and the remainder is the validation period.
    Qsim = HyMoLAP_Model(best_res.x, Q0, Pn)

    # Compute the training NSE from the simulation itself rather than as
    # 1 - best_val: that avoids an extra rounding step and keeps the 1e9
    # penalty used by the objective out of the stored metric.
    NSE_train  = NSE(Q_obs_train, Qsim[:b1])
    NSE_val    = NSE(Q_obs[b1:], Qsim[b1:])
    RMSE_train = RMSE(Q_obs_train, Qsim[:b1])
    RMSE_val   = RMSE(Q_obs[b1:], Qsim[b1:])

    print(f"✅ Train NSE: {NSE_train:.3f}, Val NSE: {NSE_val:.3f}")
    print(f"   Train RMSE: {RMSE_train:.3f}, Val RMSE: {RMSE_val:.3f}")
    print(f"   Params: mu={MU:.3f}, lambda={LAMBDA:.3f}")

    # ============================================
    # Storage
    # ============================================
    results[station_id] = {
        "params": [MU, LAMBDA],
        "NSE_train": NSE_train,
        "NSE_val": NSE_val,
        "RMSE_train": RMSE_train,
        "RMSE_val": RMSE_val,
        "Qsim": Qsim,
        "missing_ratio": missing_ratio,
        "missing_count": missing_count,
    }

print(f"\n✅ Simulation complete for {len(results)} valid basins.")


=== Station A105003001 ===, Number = 1
✅ Train NSE: 0.633, Val NSE: 0.611
   Train RMSE: 2.152, Val RMSE: 1.703
   Params: mu=1.381, lambda=19.748

=== Station A107020001 ===, Number = 2
✅ Train NSE: 0.061, Val NSE: -0.094
   Train RMSE: 0.888, Val RMSE: 0.796
   Params: mu=7.067, lambda=77.570

=== Station A112020001 ===, Number = 3
⚠️ Too many missing values (63.5%)

=== Station A116003002 ===, Number = 4
✅ Train NSE: 0.690, Val NSE: 0.755
   Train RMSE: 5.489, Val RMSE: 3.951
   Params: mu=1.170, lambda=8.772

=== Station A140202001 ===, Number = 5
✅ Train NSE: -0.910, Val NSE: -0.497
   Train RMSE: 0.648, Val RMSE: 0.709
   Params: mu=10.000, lambda=261.907

=== Station A202030001 ===, Number = 6
✅ Train NSE: -0.244, Val NSE: -0.076
   Train RMSE: 1.473, Val RMSE: 1.601
   Params: mu=10.000, lambda=113.494

=== Station A204010101 ===, Number = 7
⚠️ Too many missing values (68.1%)

=== Station A211030001 ===, Number = 8
✅ Train NSE: -0.318, Val NSE: -0.131
   Train RMSE: 1.063, Val

In [12]:
# Summary of validation NSE across the calibrated stations.
# NSE() returns NaN when a validation window has no valid pairs or the
# observations have zero variance. Keep only finite scores so that a single
# such station does not turn every statistic below into NaN.
nse_values = [
    res['NSE_val'] for res in results.values() if np.isfinite(res['NSE_val'])
]

if nse_values:
    nse_mean = np.mean(nse_values)
    nse_min = np.min(nse_values)
    nse_max = np.max(nse_values)
    nse_5th = np.percentile(nse_values, 5)
    nse_95th = np.percentile(nse_values, 95)

    print(f"NSE Validation -> Mean: {nse_mean:.3f}, Min: {nse_min:.3f}, Max: {nse_max:.3f}, 5th percentile: {nse_5th:.3f}, 95th percentile: {nse_95th:.3f}")
else:
    # Stations are dropped for exceeding the missing-data threshold, not for
    # having any gap at all, so the message states what is actually known.
    print("No stations with a finite validation NSE.")


NSE Validation -> Mean: 0.165, Min: -34.142, Max: 0.870, 5th percentile: -1.021, 95th percentile: 0.791


In [13]:
# Summary of validation RMSE across the calibrated stations.
# RMSE() returns NaN when a validation window has no valid pairs. Keep only
# finite values so the percentiles below stay defined.
rmse_values = [
    res['RMSE_val'] for res in results.values() if np.isfinite(res['RMSE_val'])
]

if rmse_values:
    rmse_50th = np.percentile(rmse_values, 50)
    rmse_5th = np.percentile(rmse_values, 5)
    rmse_95th = np.percentile(rmse_values, 95)

    print(f"RMSE Validation -> Median: {rmse_50th:.3f}, 5th percentile: {rmse_5th:.3f}, 95th percentile: {rmse_95th:.3f}")
else:
    # Stations are dropped for exceeding the missing-data threshold, not for
    # having any gap at all, so the message states what is actually known.
    print("No stations with a finite validation RMSE.")


RMSE Validation -> Median: 1.902, 5th percentile: 0.619, 95th percentile: 21.034


In [14]:
# Flatten the per-station results into one table row each. The simulated
# series ("Qsim") stays in `results` and is not written to the CSV.
csv_columns = [
    "station_id",
    "MU",
    "LAMBDA",
    "NSE_train",
    "NSE_val",
    "RMSE_train",
    "RMSE_val",
    "missing_ratio",
    "missing_count",
]

rows = []

for station_id, res in results.items():
    MU, LAMBDA = res["params"]

    rows.append({
        "station_id": station_id,
        "MU": MU,
        "LAMBDA": LAMBDA,
        "NSE_train": res["NSE_train"],
        "NSE_val": res["NSE_val"],
        "RMSE_train": res["RMSE_train"],
        "RMSE_val": res["RMSE_val"],
        "missing_ratio": res["missing_ratio"],
        "missing_count": res["missing_count"],
    })

# Passing the column list keeps the header and column order even when
# `results` is empty.
df = pd.DataFrame(rows, columns=csv_columns)

# Local save
df.to_csv("HyMoLAP_Simulation_Data_CAMELS_FR.csv", index=False)

# Save to Google Drive (`drive` is imported with the other libraries in the
# first cell of the notebook)
drive.mount("/content/drive")

df.to_csv(
    "/content/drive/MyDrive/HyMoLAP_Simulation_Data_CAMELS_FR.csv",
    index=False
)

# Preview goes last: a notebook cell only displays its final expression.
df.head()

Mounted at /content/drive
