This notebook calibrates a hydrological model with the Shuffled Complex Evolution (SCE-UA) method, using the spotpy library.

In [None]:
#Import necessary packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import spotpy

In [None]:
# code for the hydrological model
def HydrologicalModel(X_SHM, initial_states, param):
    """Run the bucket-type hydrological model over a forcing series.

    Each time step routes water through a snow store, a threshold split,
    a fast-flow store, an unsaturated-zone store (with evapotranspiration),
    an interflow store and a baseflow store. The simulated discharge is the
    sum of the fast-flow, interflow and baseflow outflows.

    Parameters
    ----------
    X_SHM : array of shape (n_timesteps, 3)
        One row per time step holding (precipitation, potential
        evapotranspiration, temperature), in that order.
    initial_states : sequence of 5 values
        Initial storages (ss, sf, su, si, sb): snow, fast flow, unsaturated
        zone, interflow and baseflow.
    param : sequence of 8 values
        (dd, f_thr, sumax, beta, perc, kf, ki, kb):
        dd    - melt factor; available snowmelt per step is dd * temperature
        f_thr - threshold above which snowmelt+rainfall goes to fast flow
        sumax - capacity of the unsaturated-zone store
        beta  - exponent of the relative-filling curve of that store
        perc  - share of the unsaturated-zone outflow sent to interflow
                (the remainder goes to baseflow)
        kf, ki, kb - divisors turning the fast-flow, interflow and baseflow
                     storages into their outflows

    Returns
    -------
    numpy.ndarray of shape (n_timesteps, 1)
        Simulated discharge [mm] for every time step.
    """
    # Unpack the starting storages and the parameter set
    snow, fast, unsat, inter, base = initial_states
    melt_factor, split_thr, unsat_cap, shape_exp, inter_share, k_fast, k_inter, k_base = param

    land_use_factor = 0.9  # land use correction factor [-]

    # One discharge value per forcing row
    q_out = np.zeros((X_SHM.shape[0], 1))

    for step, forcing_row in enumerate(X_SHM):
        precip, pot_evap, air_temp = forcing_row

        # --- Snow store ---------------------------------------------------
        if air_temp > 0:
            # Melt is limited by the snow that is actually available
            melt = min(snow, melt_factor * air_temp)
            snow = snow - melt
            liquid_water = melt + precip  # snowmelt plus rainfall
        else:
            # No melt: precipitation accumulates in the snow store
            snow = snow + precip
            liquid_water = 0.0

        # --- Threshold split ----------------------------------------------
        # Water above the threshold feeds fast flow, the rest infiltrates
        to_fast = max(0, liquid_water - split_thr)
        to_unsat = min(liquid_water, split_thr)

        # --- Fast-flow store ----------------------------------------------
        fast = fast + to_fast
        q_fast = fast / k_fast
        fast = fast - q_fast

        # --- Unsaturated zone ---------------------------------------------
        rel_filling = (unsat / unsat_cap) ** shape_exp  # [-]
        unsat_uncapped = unsat + to_unsat * (1 - rel_filling)
        unsat = min(unsat_uncapped, unsat_cap)
        # Outflow = share not retained + anything exceeding the capacity [mm]
        q_unsat = to_unsat * rel_filling + max(0.0, unsat_uncapped - unsat_cap)

        # --- Evapotranspiration -------------------------------------------
        # Moisture factor: 0 when empty, 1 at or above 80 % of the capacity,
        # relative filling in between
        if unsat <= 0.0:
            moisture_factor = 0.0
        elif unsat >= 0.8 * unsat_cap:
            moisture_factor = 1.0
        else:
            moisture_factor = unsat / unsat_cap
        actual_evap = pot_evap * land_use_factor * moisture_factor  # [mm]
        unsat = max(0.0, unsat - actual_evap)  # [mm]

        # --- Interflow store ----------------------------------------------
        inter = inter + q_unsat * inter_share  # [mm]
        q_inter = inter / k_inter  # [mm]
        inter = inter - q_inter  # [mm]

        # --- Baseflow store -----------------------------------------------
        base = base + q_unsat * (1 - inter_share)  # [mm]
        q_base = base / k_base  # [mm]
        base = base - q_base  # [mm]

        # --- Total discharge of this step [mm] ----------------------------
        q_out[step, 0] = q_fast + q_inter + q_base

    return q_out

In [None]:
# Class where I define the optimization object (following spotpy library examples)
class spot_setup(object):
    """spotpy setup object used to calibrate HydrologicalModel for one basin.

    The class-level attributes declare the calibration parameters and their
    uniform sampling ranges. The instance loads the forcing and target time
    series of the basin and exposes the `simulation`, `evaluation` and
    `objectivefunction` methods that the spotpy sampler calls.
    """

    # Calibration parameters and their uniform sampling ranges.
    # NOTE(review): HydrologicalModel unpacks the parameter vector
    # positionally as (dd, f_thr, sumax, beta, perc, kf, ki, kb); keep the
    # declarations below in that order.
    dd = spotpy.parameter.Uniform(low=0.0, high=10)
    f_thr = spotpy.parameter.Uniform(low=10.0, high=60.0)
    su_max = spotpy.parameter.Uniform(low=20.0, high=700.0)
    beta = spotpy.parameter.Uniform(low=1.0, high=6.0)
    perc = spotpy.parameter.Uniform(low=0.0, high=1.0)
    kf = spotpy.parameter.Uniform(low=1.0, high=20.0)
    ki = spotpy.parameter.Uniform(low=1.0, high=100.0)
    kb = spotpy.parameter.Uniform(low=10.0, high=1000.0)

    def __init__(self, path_ts, basin_id, forcing, target, time_period, initial_conditions, buffer=0, obj_func=None):
        """Load the time series of one basin and keep the calibration settings.

        Parameters
        ----------
        path_ts : str
            Prefix (folder, including the trailing separator) of the csv files.
        basin_id : identifier used to build the csv file name.
        forcing : list of column names, 'date' plus the model inputs.
        target : list of column names, 'date' plus the observed variable.
        time_period : pair (start, end) of index labels delimiting the subset.
        initial_conditions : initial storages handed to HydrologicalModel.
        buffer : number of leading time steps left out of the objective function.
        obj_func : optional callable `f(evaluation, simulation)`; when it is
            not given the RMSE from spotpy is used.
        """
        # Calibration settings
        self.basin_id = basin_id
        self.buffer = buffer
        self.initial_conditions = initial_conditions
        self.obj_func = obj_func

        # Read the csv file of the basin
        csv_name = 'CAMELS_GB_hydromet_timeseries_' + str(self.basin_id) + '.csv'
        records = pd.read_csv(path_ts + csv_name)

        # Select the columns, index them by date and cut the requested period
        first_day, last_day = time_period[0], time_period[1]
        forcing_subset = records.loc[:, forcing].set_index('date').loc[first_day:last_day]
        target_subset = records.loc[:, target].set_index('date').loc[first_day:last_day]

        # Keep plain numpy arrays: model inputs and a column vector of observations
        self.X_SHM = forcing_subset.to_numpy()
        self.target = target_subset.to_numpy().reshape((-1, 1))

    def simulation(self, x):
        """Run the model with parameter vector `x`; return the discharge as a 1D array."""
        return HydrologicalModel(self.X_SHM, self.initial_conditions, x)[:, 0]

    def evaluation(self):
        """Return the observed target series as a 1D array."""
        return self.target[:, 0]

    def objectivefunction(self, simulation, evaluation, params=None):
        """Score a simulation against the observations.

        The first `self.buffer` entries of both series are dropped so the
        warmup period does not count towards the loss.
        """
        observed = evaluation[self.buffer:]
        simulated = simulation[self.buffer:]
        if self.obj_func:
            # Loss function supplied by the user
            return self.obj_func(observed, simulated)
        # Default loss: root mean squared error
        return spotpy.objectivefunctions.rmse(observed, simulated)

    def calibrated_values(self, q_sim, parameters):
        """Store the simulated discharge and parameter set of the calibrated run."""
        self.q_sim = q_sim
        self.calibrated_parameters = parameters


In [None]:
# Nash-Sutcliffe efficiency (NSE), used to score the calibrated and the testing runs
def nse_loss(sim, obs):
    """Return the Nash-Sutcliffe efficiency of `sim` against `obs`.

    NSE = 1 - sum((sim - obs)^2) / sum((obs - mean(obs))^2), rounded to two
    decimals. A value of 1 is a perfect fit; 0 means the simulation is as
    good as the mean of the observations.

    Time steps whose observation is NaN (missing record) are left out of
    both sums, so gaps in the observed series no longer turn the score into
    NaN. A NaN in the simulation at a time step with a valid observation
    still propagates to the result.

    Parameters
    ----------
    sim : array-like
        Simulated values.
    obs : array-like
        Observed values, same length as `sim`.

    Returns
    -------
    numpy.float64
        NSE rounded to two decimals. Higher is better, so this value is a
        score, not a quantity to minimise.
    """
    sim = np.asarray(sim, dtype=float)
    obs = np.asarray(obs, dtype=float)

    # Squared errors; missing observations contribute nothing
    squared_error = np.where(np.isnan(obs), 0.0, (sim - obs)**2)
    # Spread of the observations around their mean, ignoring missing values
    obs_spread = np.nansum((obs - np.nanmean(obs))**2)

    error_ratio = np.sum(squared_error) / obs_spread
    return np.round(1.0 - error_ratio, 2)

In [None]:
# Initialize information
# Paths: list of basins to calibrate, folder with the time series, output folder
path_basins= '../data/CAMELS-GB/timeseries_v2/Selected_Basins_hybrid.csv'
path_ts = '../data/CAMELS-GB/timeseries_v2/'
path_output = '../results/models/SHM/'
# Number of leading time steps (warmup) excluded when the loss / NSE is computed
buffer = 365
# Initial storages, in the order HydrologicalModel unpacks them: ss, sf, su, si, sb
initial_conditions = [0.0, 1.0, 5.0, 10.0, 15.0]
# Columns read from the csv files. After 'date' becomes the index, the forcing
# columns must follow the (p, pet, temp) order used by HydrologicalModel
forcing=['date','precipitation', 'peti', 'temperature']
target=['date', 'discharge_spec']
# Periods given as [first date, last date]
training_period = ['1987-10-01','1999-09-30']
testing_period = ['2005-10-01','2012-09-30']
# Read information
# ndmin=1 keeps the result one-dimensional even when the file lists a single
# basin; without it loadtxt returns a 0-d array and list() raises a TypeError
selected_basins_id= list(np.loadtxt(path_basins, skiprows=1, ndmin=1).astype(int))

In [None]:
# Make sure the folder that will hold the results is available: report it when
# it is already there, otherwise create it (including missing parent folders).
if os.path.exists(path_output):
    print(f"Folder '{path_output}' already exists.")
else:
    os.makedirs(path_output)
    print(f"Folder '{path_output}' created successfully.")

Training

In [None]:
# Table with one row per basin: id, training NSE and the calibrated parameters
columns_name = ['basin_id', 'NSE_training', 'dd', 'f_thr', 'su_max', 'beta', 'perc', 'kf', 'ki', 'kb']
df_calibration = pd.DataFrame(index=range(len(selected_basins_id)), columns=columns_name)
# Setup objects of all basins, kept so the calibrated runs stay accessible
list_calibration = []

# Calibrate the basins one after the other
for i, basin in enumerate(selected_basins_id):
    # Setup object with the training data of this basin (default RMSE loss)
    setup = spot_setup(path_ts=path_ts,
                       basin_id=basin,
                       forcing=forcing,
                       target=target,
                       time_period=training_period,
                       initial_conditions=initial_conditions,
                       buffer=buffer,
                       obj_func=None)
    list_calibration.append(setup)

    # SCE-UA calibration; the sampler stores its runs in a csv database
    # named after file_name, which is read back right below
    file_name = path_output+'SCEUA_'+str(basin)
    sampler = spotpy.algorithms.sceua(setup, dbname=file_name, dbformat='csv')
    sampler.sample(10000, ngs=7, kstop=3, peps=0.1, pcento=0.1)

    # Load all runs and pick the one with the lowest objective function
    results = spotpy.analyser.load_csv_results(file_name)
    bestindex, bestobjf = spotpy.analyser.get_minlikeindex(results)
    best_model_run = results[bestindex]
    field_names = best_model_run.dtype.names

    # Calibrated parameters: fields whose name starts with 'par'
    par_fields = [word for word in field_names if word.startswith('par')]
    parameters = list(best_model_run[par_fields])

    # Simulated discharge of the best run (fields starting with 'sim'),
    # without the warmup period, and its NSE against the observations
    q_fields = [word for word in field_names if word.startswith('sim')]
    q_sim = np.asarray(list(best_model_run[q_fields]))[buffer:]
    NSE = nse_loss(sim=q_sim, obs=setup.target[buffer:].flatten())

    # Keep the calibrated run in the setup object and in the summary table
    setup.calibrated_values(q_sim, parameters)
    row_data = [basin, NSE] + parameters
    df_calibration.loc[i] = row_data
    print('Calibration of basin:'+str(i+1)+'/'+str(len(selected_basins_id))+' with ID:'+str(basin)+' is completed-------------------------------')

# Write the summary of all basins to disk
df_calibration.to_csv(path_output+'SHM_SCE_calibration.csv', index=False)

Testing

In [None]:
# Evaluate every calibrated basin on the testing period and add the resulting
# NSE to the calibration summary.
df_calibration = pd.read_csv(path_output+'SHM_SCE_calibration.csv')
# Parameter columns, in the positional order HydrologicalModel unpacks them.
# They are selected by name because this cell appends 'NSE_testing' to the same
# csv file: slicing "every column from the third on" would pick up that extra
# column when the cell is run a second time and break the parameter unpacking.
param_columns = ['dd', 'f_thr', 'su_max', 'beta', 'perc', 'kf', 'ki', 'kb']
NSE_testing = []

# Loop to go through each basin
for i, basin in enumerate(selected_basins_id):
    # read dataset for the basin of interest
    path_timeseries = path_ts + 'CAMELS_GB_hydromet_timeseries_' + str(basin) + '.csv'
    df_ts = pd.read_csv(path_timeseries)
    df_forcing = df_ts.loc[:, forcing]
    df_forcing = df_forcing.set_index('date')
    df_target = df_ts.loc[:, target]
    df_target = df_target.set_index('date')

    # Run SHM for testing period with the calibrated parameters of this basin
    df_forcing = df_forcing.loc[testing_period[0]:testing_period[1]]
    df = df_calibration.loc[df_calibration['basin_id'] == basin]
    param = df[param_columns].to_numpy().flatten().tolist()
    q_sim = HydrologicalModel(df_forcing.to_numpy(), initial_conditions, param)

    # Observations for testing subset
    df_target = df_target.loc[testing_period[0]:testing_period[1]]
    q_obs = df_target.to_numpy().reshape((-1,1))

    # Calculate NSE in testing, leaving out the warmup period
    NSE_testing.append(nse_loss(sim=q_sim[buffer:].flatten(), obs=q_obs[buffer:].flatten()))
    print('Testing of basin:'+str(i+1)+'/'+str(len(selected_basins_id))+' with ID:'+str(basin)+' is completed-------------------------------')

# Store the testing score next to the calibration results
df_calibration['NSE_testing'] = NSE_testing
df_calibration.to_csv(path_output+'SHM_SCE_calibration.csv', index=False)