**This notebook implements d-HyMoLAP (a dimensionally consistent reformulation of the HyMoLAP rainfall-runoff model), runs it over the CAMELS-GB catchments with no more than 10% of missing discharge data, and stores the results (parameters and performance metrics) in dHyMoLAP_Simulation_Data_CAMELS_GB.csv.**

**Author:** Lionel Cedric Gohouede

## 1. MOUNT GOOGLE DRIVE

In [None]:
# Colab-only step: mount Google Drive at /content/drive. The input CSVs read later
# in this notebook live under /content/drive/MyDrive, so this must run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 2. IMPORT LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  # NOTE(review): not used in the cells visible here — confirm it is needed
from scipy.optimize import minimize
import warnings
from numba import njit
# NOTE(review): this suppresses every warning category for the rest of the session,
# including NumPy RuntimeWarnings (e.g. a nan-aware mean over an all-NaN slice) and
# SciPy optimizer warnings. Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully")

✅ Libraries imported successfully


## 3. FILTER DATA BY TIME PERIOD (1990-2014)

In [None]:
data_dir = '/content/drive/MyDrive/Colab Notebooks/CAMELS_GB/'
start_date, end_date = '1990-01-01', '2014-12-31'

print("Loading, filtering, and aligning CSV files...\n")


def _load_period(file_name):
    """Read one CSV from `data_dir` and keep only the rows labelled within the study period.

    The first CSV column becomes the index; rows are selected with a label slice
    from `start_date` to `end_date` (both inclusive).
    """
    return pd.read_csv(f"{data_dir}{file_name}", index_col=0).loc[start_date:end_date]


# 1. Load & filter (one frame per variable, one column per station)
df_pcp = _load_period("pcp_mm.csv")
df_pet = _load_period("pet_mm.csv")
df_q   = _load_period("q_cms_obs.csv")

# 2. Alignment
# Columns: keep only stations present in all three files, in sorted order.
common_stations = sorted(set(df_pcp.columns) & set(df_pet.columns) & set(df_q.columns))

# Rows: the frames are later converted to plain NumPy arrays and combined by row
# position, so they must share exactly the same index. Restrict all three to the
# labels they have in common instead of assuming the files cover identical dates.
common_dates = df_pcp.index.intersection(df_pet.index).intersection(df_q.index)

df_pcp = df_pcp.loc[common_dates, common_stations]
df_pet = df_pet.loc[common_dates, common_stations]
df_q   = df_q.loc[common_dates, common_stations]

# Duplicate index labels would still leave the frames with different row counts.
if not (df_pcp.index.equals(df_pet.index) and df_pcp.index.equals(df_q.index)):
    raise ValueError(
        "pcp, pet and q frames could not be aligned on a common index "
        "(check the input files for duplicated or inconsistent row labels)."
    )

print(f"✅ Data aligned. Total common stations: {len(common_stations)}")

# ============================================
# Memory-Optimized Wrapper Classes
# ============================================
class SimpleArray:
    """Minimal holder around one array-like object.

    Exposes `to_numpy()` so calling code can chain `.sel(...).to_numpy()`.
    `__slots__` is declared so instances carry no per-instance `__dict__`.
    """

    __slots__ = ('array',)

    def __init__(self, array):
        # Stored by reference; no copy or conversion is performed.
        self.array = array

    def to_numpy(self):
        """Return the wrapped object unchanged."""
        return self.array

class StationData:
    """Container for the series of a single station, keyed by feature name.

    `data` is the mapping passed to the constructor, kept by reference.
    `__slots__` is declared so instances carry no per-instance `__dict__`.
    """

    __slots__ = ('data',)

    def __init__(self, data_dict):
        self.data = data_dict

    def sel(self, dynamic_features=None):
        """Look up one feature by key and return it wrapped in a `SimpleArray`.

        The key is used as-is for the mapping lookup, so a missing key — including
        the default `None` when no such key exists — raises whatever the mapping
        raises (`KeyError` for a dict).
        """
        selected = self.data[dynamic_features]
        return SimpleArray(selected)

# ============================================
# Ultra-Fast Dictionary Construction
# ============================================
print("Building ds_recent dictionary...\n")

# Convert each DataFrame to a 2D NumPy matrix exactly once; per-station series are
# then taken as column views rather than by iterating DataFrame columns.
pcp_arr = df_pcp.to_numpy()
pet_arr = df_pet.to_numpy()
q_arr   = df_q.to_numpy()
date_arr = pd.to_datetime(df_pcp.index).to_numpy()

# Walk the columns of the three matrices in lockstep with the station ids.
# Iterating over the transpose yields the same column views as arr[:, i].
# Every station shares the same `date_arr` object.
ds_recent = {
    st: StationData({
        'pcp_mm': pcp_col,
        'pet_mm': pet_col,
        'q_cms_obs': q_col,
        'date': date_arr
    })
    for st, pcp_col, pet_col, q_col in zip(common_stations, pcp_arr.T, pet_arr.T, q_arr.T)
}

print("✅ Dictionary built successfully!")

# ============================================
# Validation Test
# ============================================
# Smoke test: read the three series back for the first station through the
# same .sel(...).to_numpy() access path used by the modelling cell.
test_station = common_stations[0]
print(f"\nTesting data access for station: {test_station}")

Q_obs, P, PET = (
    ds_recent[test_station].sel(dynamic_features=feature).to_numpy()
    for feature in ("q_cms_obs", "pcp_mm", "pet_mm")
)

print("✅ Extraction works correctly!")
print(f"   Q_obs shape: {Q_obs.shape}")
print(f"   Q_obs - min: {np.nanmin(Q_obs):.2f}, max: {np.nanmax(Q_obs):.2f}, mean: {np.nanmean(Q_obs):.2f}")
print(f"   Q_obs Missing: {np.sum(np.isnan(Q_obs))} ({np.sum(np.isnan(Q_obs))/len(Q_obs)*100:.1f}%)")

Loading, filtering, and aligning CSV files...

✅ Data aligned. Total common stations: 671
Building ds_recent dictionary...

✅ Dictionary built successfully!

Testing data access for station: 10002
✅ Extraction works correctly!
   Q_obs shape: (9131,)
   Q_obs - min: 0.80, max: 94.41, mean: 5.18
   Q_obs Missing: 0 (0.0%)


## 4. MAIN CODE

In [None]:
# ============================================
# 1. COMPILED MODEL & FAST METRICS (OUTSIDE THE LOOP)
# ============================================
@njit
def dHyMoLAP_Model(params, Q0, q):
    """Run the d-HyMoLAP recursion over a forcing series and return simulated discharge.

    Parameters
    ----------
    params : sequence of 4 floats
        Unpacked as (mu, lambda_, Qs, qs). Qs divides the initial discharge and
        multiplies the returned state; qs divides the forcing `q`.
    Q0 : float
        Initial discharge; the first state value is Q0 / Qs.
    q : 1-D float array
        Forcing series. q[0] is never read: each step t uses q[t+1].

    Returns
    -------
    1-D float array of length len(q)
        The state k scaled by Qs. If any of the four parameters is <= 0, an
        all-NaN array of the same length is returned instead.

    Notes
    -----
    A NaN forcing value at step t+1 carries both states forward unchanged.
    The guard below only rejects mu <= 0; for 0 < mu < 0.5 the exponent
    (2*mu - 1) is negative. The calibration cell in this notebook bounds mu at
    >= 0.5, which keeps the exponent non-negative.
    """
    mu, lambda_, Qs, qs = params
    N = len(q)
    k = np.zeros(N)  # dimensionless state; multiplied by Qs on return
    x = np.zeros(N)  # auxiliary state driven by the scaled forcing; starts at 0

    # Strict mathematical guards
    if Qs <= 0.0 or qs <= 0.0 or lambda_ <= 0.0 or mu <= 0.0:
        return np.full(N, np.nan)

    k[0] = Q0 / Qs

    # Precompute constants to save CPU cycles inside the loop
    mu_lam = mu / lambda_
    one_m_mu_lam = 1.0 - mu_lam
    inv_lam = 1.0 / lambda_
    pow_term = 2.0 * mu - 1.0

    for t in range(N - 1):
        r_next = q[t+1] / qs  # scaled forcing for the step being computed

        # Missing forcing: hold both states at their previous values.
        if np.isnan(r_next):
            k[t+1] = k[t]
            x[t+1] = x[t]
            continue

        # Positive forcing adds to x; otherwise x is scaled by (1 - mu/lambda).
        if r_next > 0.0:
            x[t+1] = x[t] + mu_lam * r_next
        else:
            x[t+1] = one_m_mu_lam * x[t]

        # The base of the power is floored at 0, and the updated state is floored
        # at 0 as well, so k[1:] is never negative.
        k_base = max(0.0, k[t])
        k[t+1] = max(0.0, k[t] - mu_lam * (k_base ** pow_term) + inv_lam * x[t+1] * r_next)

    return k * Qs

def NSE(obs, sim):
    """Nash-Sutcliffe efficiency between two equally shaped NumPy arrays.

    Positions where either array is NaN are excluded. Returns NaN when no
    position remains or when the remaining observations have zero variance.
    """
    usable = ~(np.isnan(obs) | np.isnan(sim))
    o = obs[usable]
    s = sim[usable]

    if o.size == 0 or np.var(o) == 0.0:
        return np.nan

    squared_error = np.sum((s - o)**2)
    squared_spread = np.sum((o - np.mean(o))**2)
    return 1.0 - (squared_error / squared_spread)

def RMSE(obs, sim):
    """Root-mean-square error between two equally shaped NumPy arrays.

    Positions where either array is NaN are excluded. Returns NaN when no
    position remains.
    """
    usable = ~(np.isnan(obs) | np.isnan(sim))
    o = obs[usable]
    s = sim[usable]

    if o.size == 0:
        return np.nan

    mean_squared_error = np.mean((s - o)**2)
    return np.sqrt(mean_squared_error)

def objective(params, Q0, q_train, Q_obs_train):
    """Cost minimised during calibration: 1 - NSE of the simulated training discharge.

    The model is run with `params` on `q_train` and scored against `Q_obs_train`.
    A non-finite NSE (NaN or infinite) is mapped to the large penalty 1e9.
    """
    score = NSE(Q_obs_train, dHyMoLAP_Model(params, Q0, q_train))
    if np.isfinite(score):
        return 1.0 - score
    return 1e9

# ============================================
# 2. OPTIMIZED EXECUTION LOOP
# ============================================
all_stations = list(ds_recent.keys())
b1_ratio = 0.7            # leading fraction of each record used for calibration
max_missing_ratio = 0.1   # stations with a larger share of NaN discharge are skipped
results = {}

# Strict bounds: mu >= 0.5 is required so (2*mu - 1) is not negative.
# Order follows the model's parameter vector: (mu, lambda, Qs, qs).
param_bounds = [(0.5, 5.0), (1e-3, 100.0), (1e-3, 1000.0), (1e-3, 1000.0)]

for i, station_id in enumerate(all_stations, 1):
    print(f"\n=== Station {station_id} ===, Number={i}")

    # Extract the three series for this station through the wrapper objects
    Q_obs = ds_recent[station_id].sel(dynamic_features="q_cms_obs").to_numpy()
    P     = ds_recent[station_id].sel(dynamic_features="pcp_mm").to_numpy()
    PET   = ds_recent[station_id].sel(dynamic_features="pet_mm").to_numpy()

    # Model forcing: precipitation minus PET, floored at zero.
    q = np.maximum(0.0, P - PET)
    N = len(Q_obs)

    if N == 0 or np.all(np.isnan(Q_obs)):
        print("⚠️ Station skipped (no valid data).")
        continue

    missing_count = np.sum(np.isnan(Q_obs))
    missing_ratio = missing_count / N
    if missing_ratio > max_missing_ratio:
        print(f"⚠️ Too many missing values ({missing_ratio*100:.1f}%)")
        continue

    # Chronological split: the first b1 samples calibrate, the rest validate.
    b1 = int(N * b1_ratio)

    # Initial discharge: first observation, else the mean of the first 10 values.
    # A record that passes the missing-data filter can still start with more than
    # 10 consecutive NaNs; in that case fall back to the first finite observation
    # (one exists because the all-NaN case was skipped above). Without this, Q0
    # would be NaN and the whole simulation would be NaN.
    if not np.isnan(Q_obs[0]):
        Q0 = Q_obs[0]
    else:
        Q0 = np.nanmean(Q_obs[:10])
        if np.isnan(Q0):
            Q0 = Q_obs[~np.isnan(Q_obs)][0]

    q_train = q[:b1]
    Q_obs_train = Q_obs[:b1]

    # Optimization: start Qs and qs at the training means of discharge and forcing.
    initial_guess = np.array([1.1, 20.0, np.nanmean(Q_obs_train), np.nanmean(q_train)])

    res = minimize(
        objective,
        initial_guess,
        args=(Q0, q_train, Q_obs_train),
        method="Nelder-Mead",
        bounds=param_bounds,
        options={'maxiter': 2500, 'disp': False}
    )

    MU, LAMBDA, Qs_best, qs_best = res.x
    # objective() returns the penalty 1e9 when the NSE is not finite; report that
    # case as NaN rather than storing a meaningless NSE of about -1e9.
    NSE_train = 1.0 - res.fun if res.fun < 1e9 else np.nan

    # Validation: simulate the full record once, then score each period on its slice.
    Qsim = dHyMoLAP_Model(res.x, Q0, q)
    NSE_val = NSE(Q_obs[b1:], Qsim[b1:])
    RMSE_train = RMSE(Q_obs_train, Qsim[:b1])
    RMSE_val   = RMSE(Q_obs[b1:], Qsim[b1:])

    print(f"✅ Training NSE: {NSE_train:.3f}, Validation NSE: {NSE_val:.3f}")
    print(f"   Training RMSE: {RMSE_train:.3f}, Validation RMSE: {RMSE_val:.3f}")
    print(f"   Params: mu={MU:.3f}, lambda={LAMBDA:.3f}, Qs={Qs_best:.3f}, qs={qs_best:.3f}")

    # Storage (Qsim is kept in memory per station; the summary cell does not export it)
    results[station_id] = {
        "params": res.x.tolist(),
        "NSE_train": NSE_train,
        "NSE_val": NSE_val,
        "RMSE_train": RMSE_train,
        "RMSE_val": RMSE_val,
        "Qsim": Qsim,
        "missing_ratio": missing_ratio,
        "missing_count": missing_count,
    }

print(f"\n✅ Simulation terminée pour {len(results)} bassins valides (≤10% de NaN).")


=== Station 10002 ===, Number=1
✅ Training NSE: 0.675, Validation NSE: 0.532
   Training RMSE: 2.974, Validation RMSE: 4.211
   Params: mu=1.168, lambda=54.839, Qs=0.023, qs=0.064

=== Station 10003 ===, Number=2
✅ Training NSE: 0.787, Validation NSE: 0.684
   Training RMSE: 3.276, Validation RMSE: 4.326
   Params: mu=1.202, lambda=49.149, Qs=0.550, qs=0.348

=== Station 1001 ===, Number=3
⚠️ Too many missing values (23.4%)

=== Station 101002 ===, Number=4
✅ Training NSE: 0.726, Validation NSE: 0.699
   Training RMSE: 0.199, Validation RMSE: 0.219
   Params: mu=1.286, lambda=41.250, Qs=0.024, qs=0.293

=== Station 101005 ===, Number=5
✅ Training NSE: 0.736, Validation NSE: 0.743
   Training RMSE: 0.153, Validation RMSE: 0.181
   Params: mu=1.218, lambda=34.660, Qs=0.004, qs=0.144

=== Station 102001 ===, Number=6
✅ Training NSE: 0.633, Validation NSE: 0.666
   Training RMSE: 0.401, Validation RMSE: 0.405
   Params: mu=1.146, lambda=25.332, Qs=0.002, qs=0.115

=== Station 106001 ===, 

## 5. SAVE SUMMARY

In [None]:
# Flatten the per-station results into one record per catchment. The simulated
# series ("Qsim") is intentionally left out of the summary table.
rows = []

for station_id, res in results.items():
    MU, LAMBDA, Qs_best, qs_best = res["params"]

    rows.append({
        "station_id": station_id,
        "MU": MU,
        "LAMBDA": LAMBDA,
        "Qs": Qs_best,
        "qs": qs_best,
        "NSE_train": res["NSE_train"],
        "NSE_val": res["NSE_val"],
        "RMSE_train": res["RMSE_train"],
        "RMSE_val": res["RMSE_val"],
        "missing_ratio": res["missing_ratio"],
        "missing_count": res["missing_count"],
    })

df = pd.DataFrame(rows)

# Local save
df.to_csv("dHyMoLAP_Simulation_Data_CAMELS_GB.csv", index=False)

# Save to Google Drive
from google.colab import drive
drive.mount("/content/drive")

df.to_csv(
    "/content/drive/MyDrive/dHyMoLAP_Simulation_Data_CAMELS_GB.csv",
    index=False
)

# Preview last: a notebook cell only displays the value of its final expression,
# so a bare `df.head()` placed earlier in the cell is evaluated and discarded.
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
nse_values = [res['NSE_val'] for res in results.values()]

# NSE() returns NaN when a period has no usable pairs or constant observations.
# A single NaN would turn every statistic below into NaN, so summarise only the
# defined scores. With no NaN present the printed numbers are unchanged.
nse_valid = np.asarray(nse_values, dtype=float)
nse_valid = nse_valid[~np.isnan(nse_valid)]

if nse_valid.size:
    nse_median = np.median(nse_valid)
    nse_mean = np.mean(nse_valid)
    nse_min = np.min(nse_valid)
    nse_max = np.max(nse_valid)
    nse_5th = np.percentile(nse_valid, 5)
    nse_95th = np.percentile(nse_valid, 95)

    print(f"NSE Validation -> Mean: {nse_mean:.3f}, Median {nse_median:.3f}, Min: {nse_min:.3f}, Max: {nse_max:.3f}, 5th percentile: {nse_5th:.3f}, 95th percentile: {nse_95th:.3f}")
else:
    print("No stations processed (all contain missing values).")


NSE Validation -> Mean: 0.650, Median 0.680, Min: -1.764, Max: 0.893, 5th percentile: 0.432, 95th percentile: 0.806


In [None]:
rmse_values = [res['RMSE_val'] for res in results.values()]

# RMSE() returns NaN when a period has no usable observation/simulation pairs.
# A single NaN would turn every statistic below into NaN, so summarise only the
# defined scores. With no NaN present the printed numbers are unchanged.
rmse_valid = np.asarray(rmse_values, dtype=float)
rmse_valid = rmse_valid[~np.isnan(rmse_valid)]

if rmse_valid.size:
    rmse_median = np.median(rmse_valid)
    rmse_mean = np.mean(rmse_valid)
    rmse_min = np.min(rmse_valid)
    rmse_max = np.max(rmse_valid)
    rmse_5th = np.percentile(rmse_valid, 5)
    rmse_95th = np.percentile(rmse_valid, 95)

    print(
        f"RMSE Validation -> Mean: {rmse_mean:.3f}, Median {rmse_median:.3f}, "
        f"Min: {rmse_min:.3f}, Max: {rmse_max:.3f}, "
        f"5th percentile: {rmse_5th:.3f}, 95th percentile: {rmse_95th:.3f}"
    )
else:
    print("No stations processed (all contain missing values).")

RMSE Validation -> Mean: 5.234, Median 2.017, Min: 0.022, Max: 82.058, 5th percentile: 0.155, 95th percentile: 22.770
