# HyMoLAP Performance: Sensitivity Analysis to Five Catchment Characteristics

**Author:** Lionel Cedric Gohouede


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# XGBoost and SHAP
import xgboost as xgb
import shap

# Machine learning
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, make_scorer, balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE, BorderlineSMOTE

# Figure appearance: whitegrid theme, 100 dpi on screen, 300 dpi when saved, 9 pt text.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.dpi': 100,
    'savefig.dpi': 300,
    'font.size': 9,
})

# Suppress every warning category for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global random state.
np.random.seed(42)

# Report the versions of the two modelling libraries.
print("✓ Libraries loaded")
print(f"  XGBoost: {xgb.__version__}\n  SHAP: {shap.__version__}")

✓ Libraries loaded
  XGBoost: 3.2.0
  SHAP: 0.50.0


## 1. Data Loading

In [None]:
from google.colab import drive
from pathlib import Path
import pandas as pd

# Mount Google Drive and point DATA_DIR at the folder holding the input CSV files.
drive.mount("/content/drive")
DATA_DIR = Path("/content/drive/MyDrive/Colab Notebooks/Data")


def _read_camels(filename, **read_kwargs):
    """Read one semicolon-separated CAMELS-FR table located in DATA_DIR."""
    return pd.read_csv(DATA_DIR / filename, sep=';', **read_kwargs)


# Simulation results are read with pandas' default separator, unlike the CAMELS-FR tables.
params = pd.read_csv(DATA_DIR / 'dHyMoLAP_Simulation_Data_CAMELS_FR.csv')

climate = _read_camels('CAMELS_FR_climatic_statistics.csv')
hydro = _read_camels('CAMELS_FR_hydrological_signatures.csv')
hydro_yearly = _read_camels('CAMELS_FR_hydroclimatic_statistics_joint_availability_yearly.csv')
topo = _read_camels('CAMELS_FR_topography_general_attributes.csv')
geology = _read_camels('CAMELS_FR_geology_attributes.csv')
hydrogeology = _read_camels('CAMELS_FR_hydrogeology_attributes.csv')
landcover = _read_camels('CAMELS_FR_land_cover_attributes.csv')
# NOTE(review): on_bad_lines='skip' drops malformed rows without reporting them.
station = _read_camels('CAMELS_FR_station_general_attributes.csv', on_bad_lines='skip')
site = _read_camels('CAMELS_FR_site_general_attributes.csv', on_bad_lines='skip')
nestedness = _read_camels('CAMELS_FR_catchment_nestedness_information.csv')
dams = _read_camels('CAMELS_FR_human_influences_dams.csv')
soil_raw = _read_camels('CAMELS_FR_soil_general_attributes.csv')

In [None]:
# NSE class edges; with right=True each upper edge belongs to the lower class
# (e.g. NSE == 0.80 is 'Good', NSE > 0.80 is 'Very Good').
bins = [-np.inf, 0.50, 0.70, 0.80, 1.00]
labels = ['Unsatisfactory', 'Satisfactory', 'Good', 'Very Good']
params['Performance'] = pd.cut(params['NSE_val'], bins=bins, labels=labels, right=True)

# Tally each class once and reuse the counts for both the table and the ratio.
n_catchments = len(params)
class_counts = params['Performance'].value_counts()

print(f"Dataset: {n_catchments} catchments")
print("\nClass Distribution:")
for label in labels:
    count = class_counts[label]
    pct = 100 * count / n_catchments
    print(f"  {label:15s}: {pct:5.1f}% ({count:3d})")

# Largest class size divided by smallest class size.
imbalance_ratio = class_counts.max() / class_counts.min()
print(f"\nImbalance ratio: {imbalance_ratio:.2f}:1")

Dataset: 549 catchments

Class Distribution:
  Unsatisfactory :  12.4% ( 68)
  Satisfactory   :  44.8% (246)
  Good           :  31.0% (170)
  Very Good      :  11.8% ( 65)

Imbalance ratio: 3.78:1


In [None]:
# ---------------------------------------------------------------------------
# Soil attributes: keep only the 'mean' statistic, then place the three
# aggregation levels side by side as separately prefixed column sets.
# ---------------------------------------------------------------------------
soil_mean = soil_raw[soil_raw['sol_stat'] == 'mean'].copy()

soil_pivot_list = []
for agg_level in ['no', 'topsoil', 'top_subsoil']:
    subset = soil_mean[soil_mean['sol_agg_level'] == agg_level].copy()
    subset = subset.drop(['sol_stat', 'sol_agg_level'], axis=1)
    # Prefix every attribute with its aggregation level; the join key keeps its name.
    rename_dict = {col: f'sol_{agg_level}_{col}' if col != 'sta_code_h3' else col for col in subset.columns}
    subset = subset.rename(columns=rename_dict)
    soil_pivot_list.append(subset)

soil = soil_pivot_list[0]
for df in soil_pivot_list[1:]:
    soil = soil.merge(df, on='sta_code_h3', how='outer')

print("\n" + "="*80)
print("STANDARDIZING STATION IDs")
print("="*80)

# Column names that may carry the station identifier, in order of preference.
# The site table is handled in the same loop as every other table: in the data
# used here it is keyed by 'sta_code_h3', so looking only for 'sit_code_h3'
# left it un-renamed and silently excluded from the merge below.
ID_ALIASES = ('sta_code_h3', 'sit_code_h3')

datasets_to_rename = {
    'climate': climate,
    'hydro': hydro,
    'hydro_yearly': hydro_yearly,
    'topo': topo,
    'soil': soil,
    'geology': geology,
    'hydrogeology': hydrogeology,
    'landcover': landcover,
    'nestedness': nestedness,
    'dams': dams,
    'station': station,
    'site': site
}

for name, df in datasets_to_rename.items():
    # Check for an existing station_id first so that a re-run of this cell (or a
    # table carrying both names) never produces two 'station_id' columns.
    if 'station_id' in df.columns:
        print(f"  ✓ {name:15s}: Already has station_id")
        continue

    id_col = next((alias for alias in ID_ALIASES if alias in df.columns), None)
    if id_col is None:
        print(f"  ⚠️  {name:15s}: WARNING - No sta_code_h3, sit_code_h3 or station_id column!")
        print(f"      Columns: {list(df.columns)[:5]}")
    else:
        # In-place so the notebook-level variables (climate, hydro, ...) see the new name.
        df.rename(columns={id_col: 'station_id'}, inplace=True)
        print(f"  ✓ {name:15s}: Renamed {id_col} → station_id")

print("\n" + "="*80)
print("MERGING DATASETS")
print("="*80)

# Merge (EXCLUDE training metrics and parameters)
data = params[['station_id', 'NSE_val', 'Performance']].copy()
print(f"\nBase data: {len(data)} stations")

# Merge with verification
for df, name in [(climate, 'Climate'), (hydro, 'Hydro'), (hydro_yearly, 'Hydro_Yearly'),
                 (topo, 'Topo'), (soil, 'Soil'), (geology, 'Geology'),
                 (hydrogeology, 'Hydrogeology'), (landcover, 'Landcover'),
                 (station, 'Station'), (site, 'Site'), (nestedness, 'Nestedness'), (dams, 'Dams')]:

    # Check if station_id exists before merging
    if 'station_id' not in df.columns:
        print(f"  ⚠️  SKIPPING {name:15s}: No station_id column")
        continue

    # A left merge repeats a catchment once per matching right-hand row, which
    # would inflate n in every later statistic. Keep the first row per station
    # and report it instead of duplicating catchments silently.
    repeated_ids = df['station_id'].duplicated()
    if repeated_ids.any():
        print(f"  ⚠️  {name:15s}: {int(repeated_ids.sum())} repeated station_id rows ignored (first kept)")
        df = df.loc[~repeated_ids]

    before_cols = data.shape[1]
    data = data.merge(df, on='station_id', how='left', suffixes=('', '_dup'))

    # Columns already present in `data` win; the right-hand copies are discarded.
    data = data.loc[:, ~data.columns.str.endswith('_dup')]

    after_cols = data.shape[1]
    added_cols = after_cols - before_cols
    print(f"  + {name:15s}: {added_cols:3d} features added → Total: {after_cols:3d} columns")

print(f"\n{'='*80}")
print(f"FINAL MERGED DATASET")
print("="*80)
print(f"Catchments: {len(data)}")
print(f"Total columns: {len(data.columns)}")
print(f"Features: {len(data.columns) - 3} (excluding station_id, NSE_val, Performance)")
print(f"\n✅ Data merge complete")


STANDARDIZING STATION IDs
  ✓ climate        : Renamed sta_code_h3 → station_id
  ✓ hydro          : Renamed sta_code_h3 → station_id
  ✓ hydro_yearly   : Renamed sta_code_h3 → station_id
  ✓ topo           : Renamed sta_code_h3 → station_id
  ✓ soil           : Renamed sta_code_h3 → station_id
  ✓ geology        : Renamed sta_code_h3 → station_id
  ✓ hydrogeology   : Renamed sta_code_h3 → station_id
  ✓ landcover      : Renamed sta_code_h3 → station_id
  ✓ nestedness     : Renamed sta_code_h3 → station_id
  ✓ dams           : Renamed sta_code_h3 → station_id
  ✓ station        : Renamed sta_code_h3 → station_id
      Columns: ['sta_code_h3', 'sit_label', 'sit_mnemonic', 'sit_label_usual', 'sit_label_add']

MERGING DATASETS

Base data: 549 stations
  + Climate        :  30 features added → Total:  33 columns
  + Hydro          :  17 features added → Total:  50 columns
  + Hydro_Yearly   :   8 features added → Total:  58 columns
  + Topo           :  27 features added → Total:  85 colu

## 2. Comprehensive Feature Engineering

In [None]:
# Derive the engineered predictors from the merged table. Each step is guarded
# by a column-existence check, and every step overwrites its outputs from the
# source columns so the cell can be re-run safely.
print("="*80)
print("FEATURE ENGINEERING")
print("="*80)

# 1. CATCHMENT AREA
if 'sta_area_snap' in data.columns:
    data['catchment_area'] = data['sta_area_snap']
    # Areas below 1 are clipped to 1 before the log, so log_area is never negative.
    data['log_area'] = np.log10(data['catchment_area'].clip(lower=1))
    print("✓ Catchment area")

# 2. SNOW FRACTION
if 'cli_psol_frac_safran' in data.columns:
    data['snow_fraction'] = data['cli_psol_frac_safran']
    print("✓ Snow fraction")

# 3. RAINFALL (prefer the yearly column; otherwise scale the daily mean)
if 'cli_prec_mean_yr' in data.columns:
    data['rainfall_annual'] = data['cli_prec_mean_yr']
    print("✓ Rainfall annual")
elif 'cli_prec_mean' in data.columns:
    # cli_prec_mean is assumed to be in mm/day — TODO confirm against the CAMELS-FR docs
    data['rainfall_annual'] = data['cli_prec_mean'] * 365.25
    print("✓ Rainfall annual")
else:
    # Report the gap instead of announcing a feature that was never created.
    print("⚠️ Rainfall annual: no precipitation column found")

# 4. Q/P RATIO (RUNOFF COEFFICIENT)
if 'hyd_q_mean' in data.columns and 'cli_prec_mean' in data.columns:
    # Assumes both columns share the same units (mm/day) — TODO confirm.
    # The +0.01 in the denominator guards against division by zero.
    data['qp_ratio'] = data['hyd_q_mean'] / (data['cli_prec_mean'] + 0.01)
    data['qp_ratio'] = data['qp_ratio'].clip(0, 1)  # Physical bounds [0, 1]

    # Wet vs Dry classification
    # NOTE: a missing qp_ratio yields 0 in both indicators.
    data['catchment_type_wet'] = (data['qp_ratio'] > 0.5).astype(int)  # Wet
    data['catchment_type_dry'] = (data['qp_ratio'] < 0.3).astype(int)  # Dry
    # Balanced is reference (both zeros)

    print("✓ Q/P ratio and wet/dry classification")
    print(f"  Wet catchments (Q/P > 0.5): {data['catchment_type_wet'].sum()}")
    print(f"  Dry catchments (Q/P < 0.3): {data['catchment_type_dry'].sum()}")
    print(f"  Balanced (0.3 ≤ Q/P ≤ 0.5): {((data['qp_ratio'] >= 0.3) & (data['qp_ratio'] <= 0.5)).sum()}")

# 5. ARIDITY INDEX
if 'cli_pet_ou_mean' in data.columns and 'cli_prec_mean' in data.columns:
    data['aridity_index'] = data['cli_pet_ou_mean'] / (data['cli_prec_mean'] + 0.01)
    data['moisture_index'] = (data['cli_prec_mean'] - data['cli_pet_ou_mean']) / (data['cli_pet_ou_mean'] + 0.01)
    print("✓ Aridity indices")

# 6. BASEFLOW INDEX (AVERAGE)
# 'bfi_mean' is excluded explicitly: on a re-run it already exists in `data`
# and would otherwise be averaged together with the columns it was built from.
bfi_cols = [col for col in data.columns
            if 'bfi' in col.lower()
            and col != 'bfi_mean'
            and data[col].dtype in [np.float64, np.int64]]
if len(bfi_cols) > 0:
    data['bfi_mean'] = data[bfi_cols].mean(axis=1)
    print(f"✓ BFI mean ({len(bfi_cols)} methods)")

# 7. FLASHINESS INDEX
if 'top_slo_mean' in data.columns and 'catchment_area' in data.columns:
    # Mean slope scaled by sqrt(area); the +1 keeps the denominator away from zero.
    data['flashiness_index'] = data['top_slo_mean'] / (np.sqrt(data['catchment_area']) + 1)
    print("✓ Flashiness index")

# 8. ALTITUDE
if 'top_altitude_mean' in data.columns:
    data['altitude_km'] = data['top_altitude_mean'] / 1000
    print("✓ Altitude (km)")

# 9. GEOGRAPHIC ZONES
if all(col in data.columns for col in ['sta_x_l93', 'sta_y_l93', 'top_altitude_mean']):
    x, y, alt = data['sta_x_l93'], data['sta_y_l93'], data['top_altitude_mean']

    # NOTE(review): the flags are not mutually exclusive by construction — a
    # catchment with x < 600000, y < 6200000 and alt < 500 is both
    # Mediterranean and Atlantic. Confirm the thresholds against the intended regions.
    data['zone_Alpine'] = ((alt > 800) & (x > 800000)).astype(int)
    data['zone_Mediterranean'] = ((y < 6200000) & (alt < 500)).astype(int)
    data['zone_Atlantic'] = ((x < 600000) & (data['zone_Alpine'] == 0)).astype(int)

    print("✓ Geographic zones")

# 10. TEMPERATURE-PRECIPITATION INTERACTION
if 'cli_temp_mean' in data.columns and 'cli_prec_mean' in data.columns:
    data['temp_prec_ratio'] = data['cli_temp_mean'] / (data['cli_prec_mean'] + 0.01)
    print("✓ Temperature-precipitation ratio")

# 11. SEASONALITY INDEX
if 'cli_prec_season_temp' in data.columns:
    data['seasonality_index'] = data['cli_prec_season_temp']
    print("✓ Seasonality index")

# 12. HUMAN INFLUENCE
if 'dam_n' in data.columns:
    # A missing dam count compares False and is therefore encoded as 0.
    data['has_dams'] = (data['dam_n'] > 0).astype(int)
    print("✓ Dam indicator")

print(f"\n✓ Feature engineering complete")

FEATURE ENGINEERING
✓ Catchment area
✓ Snow fraction
✓ Rainfall annual
✓ Q/P ratio and wet/dry classification
  Wet catchments (Q/P > 0.5): 129
  Dry catchments (Q/P < 0.3): 142
  Balanced (0.3 ≤ Q/P ≤ 0.5): 278
✓ Aridity indices
✓ BFI mean (3 methods)
✓ Flashiness index
✓ Altitude (km)
✓ Geographic zones
✓ Temperature-precipitation ratio
✓ Seasonality index
✓ Dam indicator

✓ Feature engineering complete


## 3. Comprehensive Descriptive Analyses

In [None]:
# Exclude non-predictive columns
# Substrings that mark a column as non-predictive: identifiers, the target and
# other performance metrics, free-text/metadata fields.
exclude_patterns = [
    'station_id', 'NSE_val', 'Performance', 'NSE_train', 'RMSE',
    'label', 'code', 'comment', 'name', 'date', 'mnemonic', 'usual',
    'test', 'child', 'parent', 'timing', 'monitor', '_add', 'qual_'
]

# Column names are compared in lower case, so the patterns must be lower-cased
# too. Otherwise 'NSE_val', 'Performance', 'NSE_train' and 'RMSE' can never
# match and the target NSE_val is kept as a predictor.
exclude_patterns_lower = [pattern.lower() for pattern in exclude_patterns]

quant_vars = []
for col in data.columns:
    col_lower = col.lower()
    if any(pattern in col_lower for pattern in exclude_patterns_lower):
        continue
    # Keep float64/int64 columns that are at least 30% populated and not constant.
    if data[col].dtype in [np.float64, np.int64]:
        if data[col].notna().sum() > len(data) * 0.3:
            if data[col].std() > 1e-10:
                quant_vars.append(col)

print(f"Quantitative variables: {len(quant_vars)}")

Quantitative variables: 175


In [None]:
# Relationship between annual rainfall and validation NSE: rank correlation,
# per-tertile summary, and a one-way ANOVA across the tertiles.
print("="*80)
print("1. RAINFALL EFFECT ANALYSIS")
print("="*80)

tertile_names = ['Low', 'Medium', 'High']

if 'rainfall_annual' not in data.columns:
    print("⚠️ Rainfall data not available")
else:
    valid = data[['NSE_val', 'rainfall_annual']].dropna()
    if len(valid) > 30:
        r, p = stats.spearmanr(valid['NSE_val'], valid['rainfall_annual'])
        rain_min = valid['rainfall_annual'].min()
        rain_max = valid['rainfall_annual'].max()
        print("\nRainfall vs NSE:")
        print(f"  Spearman ρ = {r:+.3f} (p = {p:.3e})")
        print(f"  Range: [{rain_min:.0f}, {rain_max:.0f}] mm/year")

        if p < 0.05:
            direction = "BETTER" if r > 0 else "WORSE"
            print(f"  ✓ SIGNIFICANT: Higher rainfall → {direction} performance")
        else:
            print("  ✗ Not significant")

        # Split catchments into three equal-count rainfall classes.
        data['rainfall_tertile'] = pd.qcut(data['rainfall_annual'], q=3, labels=tertile_names, duplicates='drop')
        tertile_subsets = {name: data[data['rainfall_tertile'] == name] for name in tertile_names}

        print("\nPerformance by Rainfall Tertile:")
        for tertile, subset in tertile_subsets.items():
            if len(subset) == 0:
                continue
            nse = subset['NSE_val']
            rain = subset['rainfall_annual']
            rain_range = f"[{rain.min():.0f}, {rain.max():.0f}]"

            # Share of catchments with NSE > 0.80 (the 'Very Good' class).
            vg_pct = 100 * (nse > 0.80).sum() / len(subset)

            print(f"  {tertile:6s} {rain_range:>16s} mm/yr: NSE = {nse.mean():.3f} ± {nse.std():.3f} (n={len(subset):3d}, VG={vg_pct:.1f}%)")

        # One-way ANOVA over the non-empty tertiles.
        groups = [rows['NSE_val'].dropna() for rows in tertile_subsets.values() if len(rows) > 0]

        if len(groups) >= 2:
            f_stat, p_anova = stats.f_oneway(*groups)
            print(f"  ANOVA: F = {f_stat:.2f}, p = {p_anova:.3e}")
            verdict = "✓ SIGNIFICANT: Rainfall regime affects performance" if p_anova < 0.05 else "✗ Not significant"
            print(f"  {verdict}")

1. RAINFALL EFFECT ANALYSIS

Rainfall vs NSE:
  Spearman ρ = +0.344 (p = 9.977e-17)
  Range: [655, 2078] mm/year
  ✓ SIGNIFICANT: Higher rainfall → BETTER performance

Performance by Rainfall Tertile:
  Low          [655, 905] mm/yr: NSE = 0.606 ± 0.132 (n=183, VG=2.7%)
  Medium      [906, 1064] mm/yr: NSE = 0.648 ± 0.174 (n=183, VG=12.6%)
  High       [1065, 2078] mm/yr: NSE = 0.701 ± 0.135 (n=183, VG=20.2%)
  ANOVA: F = 18.99, p = 1.061e-08
  ✓ SIGNIFICANT: Rainfall regime affects performance


In [None]:
# Relationship between the runoff coefficient (Q/P) and validation NSE: rank
# correlation, per-class summary (wet / balanced / dry), and a one-way ANOVA.
print("\n" + "="*80)
print("2. WET vs DRY CATCHMENT ANALYSIS (Q/P RATIO)")
print("="*80)

if 'qp_ratio' in data.columns:
    # Correlation
    valid = data[['NSE_val', 'qp_ratio']].dropna()
    if len(valid) > 30:
        r, p = stats.spearmanr(valid['NSE_val'], valid['qp_ratio'])
        print(f"\nQ/P Ratio vs NSE:")
        print(f"  Spearman ρ = {r:+.3f} (p = {p:.3e})")
        print(f"  Range: [{valid['qp_ratio'].min():.3f}, {valid['qp_ratio'].max():.3f}]")

        if p < 0.05:
            if r > 0:
                print(f"  ✓ SIGNIFICANT: Wetter catchments (high Q/P) → BETTER performance")
            else:
                print(f"  ✓ SIGNIFICANT: Drier catchments (low Q/P) → BETTER performance (unexpected!)")
        else:
            print(f"  ✗ Not significant")

    # Compare wet vs dry vs balanced
    print(f"\nPerformance by Catchment Type:")

    wet = data[data['catchment_type_wet'] == 1]
    dry = data[data['catchment_type_dry'] == 1]
    # Both indicators are 0 when qp_ratio is missing, so require a known ratio;
    # otherwise catchments without Q/P data would be reported as 'Balanced'.
    balanced = data[data['qp_ratio'].notna()
                    & (data['catchment_type_wet'] == 0)
                    & (data['catchment_type_dry'] == 0)]

    for ctype, subset in [('Wet (Q/P > 0.5)', wet), ('Balanced (0.3-0.5)', balanced), ('Dry (Q/P < 0.3)', dry)]:
        if len(subset) > 0:
            mean_nse = subset['NSE_val'].mean()
            std_nse = subset['NSE_val'].std()
            pct_vg = 100 * (subset['Performance'] == 'Very Good').sum() / len(subset)
            print(f"  {ctype:20s}: NSE = {mean_nse:.3f} ± {std_nse:.3f} (n={len(subset):3d}, VG={pct_vg:.1f}%)")

    # ANOVA over the non-empty classes (same rule as the other analyses:
    # at least two groups are needed, an empty class is simply left out).
    groups = [nse for nse in (wet['NSE_val'].dropna(),
                              balanced['NSE_val'].dropna(),
                              dry['NSE_val'].dropna())
              if len(nse) > 0]
    if len(groups) >= 2:
        f_stat, p_val = stats.f_oneway(*groups)
        print(f"\n  ANOVA: F = {f_stat:.2f}, p = {p_val:.3e}")
        if p_val < 0.05:
            print(f"  ✓ SIGNIFICANT: Catchment wetness affects performance")
        else:
            print(f"  ✗ Not significant")
else:
    print("⚠️ Q/P ratio not available")


2. WET vs DRY CATCHMENT ANALYSIS (Q/P RATIO)

Q/P Ratio vs NSE:
  Spearman ρ = +0.355 (p = 8.527e-18)
  Range: [0.054, 1.000]
  ✓ SIGNIFICANT: Wetter catchments (high Q/P) → BETTER performance

Performance by Catchment Type:
  Wet (Q/P > 0.5)     : NSE = 0.680 ± 0.172 (n=129, VG=24.0%)
  Balanced (0.3-0.5)  : NSE = 0.681 ± 0.126 (n=278, VG=12.2%)
  Dry (Q/P < 0.3)     : NSE = 0.567 ± 0.153 (n=142, VG=0.0%)

  ANOVA: F = 32.57, p = 4.356e-14
  ✓ SIGNIFICANT: Catchment wetness affects performance


In [None]:
# Relationship between snow fraction and validation NSE: rank correlation,
# per-regime summary, and a one-way ANOVA across the snow regimes.
print("\n" + "="*80)
print("3. SNOW EFFECT ANALYSIS")
print("="*80)

# A regime needs at least this many NSE values to enter the ANOVA: a single
# observation has no within-group spread (its std prints as nan).
MIN_ANOVA_GROUP_SIZE = 2

if 'snow_fraction' in data.columns:
    valid = data[['NSE_val', 'snow_fraction']].dropna()
    if len(valid) > 30:
        r, p = stats.spearmanr(valid['NSE_val'], valid['snow_fraction'])
        print(f"\nSnow Fraction vs NSE:")
        print(f"  Spearman ρ = {r:+.3f} (p = {p:.3e})")
        print(f"  Range: [{valid['snow_fraction'].min():.3f}, {valid['snow_fraction'].max():.3f}]")

        if p < 0.05:
            if r < 0:
                print(f"  ✓ SIGNIFICANT: Higher snow → WORSE performance")
            else:
                print(f"  ✓ SIGNIFICANT: Higher snow → BETTER performance")
        else:
            print(f"  ✗ Not significant")

    # Categorize by snow fraction (hydrological thresholds)
    snow_categories = ['Negligible', 'Low', 'Moderate', 'High']
    data['snow_category'] = pd.cut(
        data['snow_fraction'],
        bins=[0, 0.02, 0.1, 0.5, 1.0],
        labels=snow_categories,
        include_lowest=True
    )

    print(f"\nPerformance by Snow Regime:")

    for category in snow_categories:
        subset = data[data['snow_category'] == category]
        if len(subset) > 0:
            mean_nse = subset['NSE_val'].mean()
            std_nse = subset['NSE_val'].std()
            snow_range = f"[{subset['snow_fraction'].min():.3f}, {subset['snow_fraction'].max():.3f}]"

            # Calculate VG% (NSE > 0.80)
            vg_count = (subset['NSE_val'] > 0.80).sum()
            vg_pct = 100 * vg_count / len(subset)

            # Format category with threshold
            if category == 'Negligible':
                cat_label = f"{category} (0-0.02)"
            elif category == 'Low':
                cat_label = f"{category} (0.02-0.1)"
            elif category == 'Moderate':
                cat_label = f"{category} (0.1-0.5)"
            else:
                cat_label = f"{category} (> 0.5)"

            print(f"  {cat_label:20s}: NSE = {mean_nse:.3f} ± {std_nse:.3f} (n={len(subset)}, VG={vg_pct:.1f}%)")

    # ANOVA test — regimes with fewer than MIN_ANOVA_GROUP_SIZE values are left
    # out and reported, so one isolated catchment cannot act as a whole group.
    groups = []
    skipped_categories = []
    for cat in snow_categories:
        nse_values = data.loc[data['snow_category'] == cat, 'NSE_val'].dropna()
        if len(nse_values) >= MIN_ANOVA_GROUP_SIZE:
            groups.append(nse_values)
        elif len(nse_values) > 0:
            skipped_categories.append(cat)

    if skipped_categories:
        print(f"  ANOVA excludes regimes with n < {MIN_ANOVA_GROUP_SIZE}: {', '.join(skipped_categories)}")

    if len(groups) >= 2:
        f_stat, p_anova = stats.f_oneway(*groups)
        print(f"  ANOVA: F = {f_stat:.2f}, p = {p_anova:.3e}")
        if p_anova < 0.05:
            print(f"  ✓ SIGNIFICANT: Snow regime affects performance")
        else:
            print(f"  ✗ Not significant")
else:
    print("⚠️ Snow fraction not available")


3. SNOW EFFECT ANALYSIS

Snow Fraction vs NSE:
  Spearman ρ = -0.192 (p = 5.674e-06)
  Range: [0.004, 0.524]
  ✓ SIGNIFICANT: Higher snow → WORSE performance

Performance by Snow Regime:
  Negligible (0-0.02) : NSE = 0.699 ± 0.140 (n=157, VG=23.6%)
  Low (0.02-0.1)      : NSE = 0.639 ± 0.149 (n=278, VG=6.8%)
  Moderate (0.1-0.5)  : NSE = 0.621 ± 0.157 (n=113, VG=8.0%)
  High (> 0.5)        : NSE = 0.008 ± nan (n=1, VG=0.0%)
  ANOVA: F = 13.98, p = 8.623e-09
  ✓ SIGNIFICANT: Snow regime affects performance


In [None]:
# Validation NSE by geographic zone: per-zone summary, one-way ANOVA, and a
# CSV export of the summary table.
print("\n" + "="*80)
print("4. GEOGRAPHIC ZONE ANALYSIS")
print("="*80)

if 'zone_Alpine' in data.columns:
    # 'Continental' is the remainder: catchments with none of the three flags set.
    zones = [
        ('Alpine', data['zone_Alpine'] == 1),
        ('Mediterranean', data['zone_Mediterranean'] == 1),
        ('Atlantic', data['zone_Atlantic'] == 1),
        ('Continental', ~((data['zone_Alpine'] == 1) | (data['zone_Mediterranean'] == 1) | (data['zone_Atlantic'] == 1)))
    ]

    # The zone flags are separate indicator columns, so a catchment can carry
    # more than one. ANOVA assumes independent groups; report any overlap
    # rather than letting a catchment count twice unnoticed.
    zone_flag_cols = ['zone_Alpine', 'zone_Mediterranean', 'zone_Atlantic']
    n_multi_zone = int((data[zone_flag_cols].sum(axis=1) > 1).sum())
    if n_multi_zone > 0:
        print(f"  ⚠️  {n_multi_zone} catchments belong to more than one zone and are counted in each")

    zone_stats = []
    print(f"\nPerformance by Zone:")
    for zone_name, mask in zones:
        subset = data[mask]
        if len(subset) > 0:
            mean_nse = subset['NSE_val'].mean()
            std_nse = subset['NSE_val'].std()
            pct_vg = 100 * (subset['Performance'] == 'Very Good').sum() / len(subset)

            print(f"  {zone_name:15s}: NSE = {mean_nse:.3f} ± {std_nse:.3f} (n={len(subset):3d}, VG={pct_vg:.1f}%)")

            zone_stats.append({'Zone': zone_name, 'n': len(subset), 'Mean_NSE': mean_nse,
                              'Std_NSE': std_nse, 'Pct_VeryGood': pct_vg})

    # ANOVA over the non-empty zones; two groups are enough, matching the
    # threshold used by the other analyses in this notebook.
    zone_groups = [data[mask]['NSE_val'].dropna() for _, mask in zones if mask.sum() > 0]
    if len(zone_groups) >= 2:
        f_stat, p_val = stats.f_oneway(*zone_groups)
        print(f"\n  ANOVA: F = {f_stat:.2f}, p = {p_val:.3e}")
        if p_val < 0.05:
            print(f"  ✓ SIGNIFICANT: Geographic zone affects performance")
        else:
            print(f"  ✗ Not significant")

    # Written to the current working directory.
    pd.DataFrame(zone_stats).to_csv('XGBoost_Expert_01_Geographic_Zones.csv', index=False)
    print("\n✓ Saved: XGBoost_Expert_01_Geographic_Zones.csv")


4. GEOGRAPHIC ZONE ANALYSIS

Performance by Zone:
  Alpine         : NSE = 0.553 ± 0.218 (n= 38, VG=2.6%)
  Atlantic       : NSE = 0.689 ± 0.136 (n=209, VG=21.1%)
  Continental    : NSE = 0.638 ± 0.148 (n=302, VG=6.6%)

  ANOVA: F = 16.04, p = 1.701e-07
  ✓ SIGNIFICANT: Geographic zone affects performance

✓ Saved: XGBoost_Expert_01_Geographic_Zones.csv


In [None]:
# Relationship between catchment area and validation NSE: rank correlation,
# per-size-class summary, and a one-way ANOVA across the size classes.
print("\n" + "="*80)
print("5. CATCHMENT SIZE ANALYSIS")
print("="*80)

# Display text for each size class, including its area thresholds.
size_labels = {
    'Small': "Small (< 100 km²)",
    'Medium': "Medium (100-1000 km²)",
    'Large': "Large (> 1000 km²)",
}

if 'catchment_area' in data.columns:
    valid = data[['NSE_val', 'catchment_area']].dropna()
    if len(valid) > 30:
        r, p = stats.spearmanr(valid['NSE_val'], valid['catchment_area'])
        area_min = valid['catchment_area'].min()
        area_max = valid['catchment_area'].max()
        print("\nCatchment Area vs NSE:")
        print(f"  Spearman ρ = {r:+.3f} (p = {p:.3e})")
        print(f"  Range: [{area_min:.1f}, {area_max:.1f}] km²")

        if p < 0.05:
            favoured = "Larger" if r > 0 else "Smaller"
            print(f"  ✓ SIGNIFICANT: {favoured} catchments → BETTER performance")
        else:
            print("  ✗ Not significant")

        # Bin by area; each upper edge belongs to the lower class.
        data['size_category'] = pd.cut(
            data['catchment_area'],
            bins=[0, 100, 1000, float('inf')],
            labels=list(size_labels)
        )
        size_subsets = {name: data[data['size_category'] == name] for name in size_labels}

        print("\nPerformance by Catchment Size:")

        for category, subset in size_subsets.items():
            if len(subset) == 0:
                continue
            nse = subset['NSE_val']
            area = subset['catchment_area']
            area_range = f"[{area.min():.1f}, {area.max():.1f}]"

            # Share of catchments with NSE > 0.80 (the 'Very Good' class).
            vg_pct = 100 * (nse > 0.80).sum() / len(subset)

            cat_label = size_labels[category]
            print(f"  {cat_label:25s}: NSE = {nse.mean():.3f} ± {nse.std():.3f} (n={len(subset)}, VG={vg_pct:.1f}%)")

        # One-way ANOVA over the non-empty size classes.
        groups = [rows['NSE_val'].dropna() for rows in size_subsets.values() if len(rows) > 0]

        if len(groups) >= 2:
            f_stat, p_anova = stats.f_oneway(*groups)
            print(f"  ANOVA: F = {f_stat:.2f}, p = {p_anova:.3e}")
            verdict = "✓ SIGNIFICANT: Catchment size affects performance" if p_anova < 0.05 else "✗ Not significant"
            print(f"  {verdict}")


5. CATCHMENT SIZE ANALYSIS

Catchment Area vs NSE:
  Spearman ρ = +0.098 (p = 2.140e-02)
  Range: [7.0, 110188.2] km²
  ✓ SIGNIFICANT: Larger catchments → BETTER performance

Performance by Catchment Size:
  Small (< 100 km²)        : NSE = 0.636 ± 0.176 (n=143, VG=19.6%)
  Medium (100-1000 km²)    : NSE = 0.647 ± 0.147 (n=338, VG=8.6%)
  Large (> 1000 km²)       : NSE = 0.706 ± 0.117 (n=68, VG=11.8%)
  ANOVA: F = 5.34, p = 5.055e-03
  ✓ SIGNIFICANT: Catchment size affects performance
