Notebook to calibrate a hydrological model using the gradient descent method

In [None]:
# Import packages
import os
import pandas as pd
import numpy as np
import random
import datetime
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data.sampler import BatchSampler

In [None]:
# Class where I define my conceptual model
class SHM(torch.nn.Module):
    """Conceptual bucket-type hydrological model calibrated by gradient descent.

    The eight model parameters (dd, f_thr, sumax, beta, perc, kf, ki, kb) are
    not optimized directly. Instead, one unconstrained value per parameter is
    stored in ``calibration_parameters``; a sigmoid maps it to (0, 1) and the
    result is rescaled to the [lower, upper] range given for that parameter.
    """

    def __init__(self, parameter_ranges):
        """Create the model.

        Args:
            parameter_ranges: dict mapping each parameter name to its
                ``[lower, upper]`` calibration range. The order of the entries
                must be dd, f_thr, sumax, beta, perc, kf, ki, kb, because the
                parameters are read by position.
        """
        super().__init__()
        # Unconstrained values that the optimizer updates (zeros -> middle of each range).
        self.calibration_parameters = torch.nn.Parameter(
            torch.zeros(len(parameter_ranges), dtype=torch.float32))
        # One [lower, upper] row per parameter.
        self.parameter_ranges = torch.tensor(list(parameter_ranges.values()), dtype=torch.float32)

    def forward(self, X_SHM, initial_states, warmup_period=0):
        """Simulate discharge, optionally discarding an initial warm-up period.

        Args:
            X_SHM: tensor with one row per timestep and the columns
                (precipitation, potential evapotranspiration, temperature).
            initial_states: the five initial storages (ss, sf, su, si, sb).
            warmup_period: number of leading timesteps that are only used to
                stabilize the internal states (run without gradient tracking).

        Returns:
            Tensor of shape (timesteps - warmup_period, 1) with the simulated
            discharge of the period after the warm-up.
        """
        # Without warm-up the simulation starts directly from the initial states.
        # (Previously `states` was undefined in that case.)
        states = initial_states
        if warmup_period > 0:
            # Warm-up only stabilizes the buckets, so no gradients are needed.
            with torch.no_grad():
                _, states = self.run_SHM(X_SHM=X_SHM[0:warmup_period, :],
                                         initial_states=initial_states)
        # Calibration period
        q_sim, _ = self.run_SHM(X_SHM=X_SHM[warmup_period:, :],
                                initial_states=states)
        return q_sim

    def _scaled_parameters(self):
        """Map the unconstrained calibration parameters into their calibration ranges."""
        lower = self.parameter_ranges[:, 0]
        upper = self.parameter_ranges[:, 1]
        # sigmoid -> (0, 1), then linear rescaling to [lower, upper]
        return lower + torch.sigmoid(self.calibration_parameters) * (upper - lower)

    def run_SHM(self, X_SHM, initial_states):
        """Run the hydrological model over every timestep of ``X_SHM``.

        Args:
            X_SHM: tensor with one row per timestep and the columns
                (precipitation, potential evapotranspiration, temperature).
            initial_states: the five initial storages (ss, sf, su, si, sb) as
                a tensor (or any sequence of scalar tensors).

        Returns:
            Tuple ``(q_out, states)``: ``q_out`` has shape (timesteps, 1) and
            holds the simulated discharge; ``states`` has shape (5,) and holds
            the storages after the last timestep (the initial storages if
            ``X_SHM`` has no timesteps).
        """
        dd, f_thr, sumax, beta, perc, kf, ki, kb = self._scaled_parameters()

        # Read initial states
        ss, sf, su, si, sb = initial_states
        # Vector to store the discharges
        q_out = torch.zeros((X_SHM.shape[0], 1))

        # Constants shared by all timesteps (never modified in place).
        zero = torch.tensor(0.0, dtype=torch.float32)
        one = torch.tensor(1.0, dtype=torch.float32)
        klu = torch.tensor(0.90, dtype=torch.float32)  # land use correction factor [-]

        # Run model for each timestep
        for j, (p, pet, temp) in enumerate(X_SHM):

            # Snow module --------------------------
            if temp > 0:  # there is snowmelt
                qs_out = torch.min(ss, dd * temp)  # snowmelt from snow reservoir
                ss = ss - qs_out  # subtract snowmelt from snow reservoir
                qsp_out = qs_out + p  # flow from snowmelt and rainfall
            else:  # there is no snowmelt
                ss = ss + p  # precipitation accumulates as snow in snow reservoir
                qsp_out = zero

            # Split snowmelt+rainfall into inflow to fastflow reservoir and unsaturated reservoir ------
            qf_in = torch.maximum(zero, qsp_out - f_thr)
            qu_in = torch.minimum(qsp_out, f_thr)

            # Fastflow module ----------------------
            sf = sf + qf_in
            qf_out = sf / kf
            sf = sf - qf_out

            # Unsaturated zone ---------------------
            psi = (su / sumax) ** beta  # [-]
            su_temp = su + qu_in * (1 - psi)
            su = torch.minimum(su_temp, sumax)
            # outflow = part that does not infiltrate + overflow above sumax
            qu_out = qu_in * psi + torch.maximum(zero, su_temp - sumax)  # [mm]

            # Evapotranspiration
            if su == 0.0:
                ktetha = zero
            elif su >= 0.8 * sumax:
                ktetha = one
            else:
                ktetha = su / sumax

            ret = pet * klu * ktetha  # [mm]
            su = torch.maximum(zero, su - ret)  # [mm]

            # Interflow reservoir ------------------
            qi_in = qu_out * perc  # [mm]
            si = si + qi_in  # [mm]
            qi_out = si / ki  # [mm]
            si = si - qi_out  # [mm]

            # Baseflow reservoir -------------------
            qb_in = qu_out * (1 - perc)  # [mm]
            sb = sb + qb_in  # [mm]
            qb_out = sb / kb  # [mm]
            sb = sb - qb_out  # [mm]

            # Output
            q_out[j, 0] = qf_out + qi_out + qb_out  # [mm]

        # Only the storages after the last timestep are returned, so the state
        # vector is assembled once instead of at every timestep.
        states = torch.stack((ss, sf, su, si, sb))

        return q_out, states

    def calibrated_parameters(self):
        """Return the eight parameters (dd, f_thr, sumax, beta, perc, kf, ki, kb) in their calibration ranges."""
        return self._scaled_parameters()

In [None]:
# Class where I define the dataset to calibrate the model (basically a big table where I organize
# the inputs and targets used to calibrate the model)

class Forcing_DataSet(Dataset):
    """Table-like dataset with the forcings (inputs) and target of a single basin.

    The time series are read from
    ``<path_ts>CAMELS_GB_hydromet_timeseries_<basin_id>.csv``, indexed by the
    'date' column and restricted to the requested time period.
    """

    def __init__(self,
                 basin_id,  # ID of basin
                 forcing,  # name of the dynamic forcings [list]
                 target,  # name of the target [list]
                 time_period,  # start and end day of time period of interest [list]
                 path_ts  # path to time series [string]
                 ):
        # keep the configuration on the instance
        self.time_period = time_period
        self.basin_id = basin_id
        self.forcing = forcing
        self.target = target

        # file that holds the time series of this basin
        csv_file = ''.join((path_ts, 'CAMELS_GB_hydromet_timeseries_', str(self.basin_id), '.csv'))
        raw = pd.read_csv(csv_file)

        # first and last label of the period of interest
        first_day = self.time_period[0]
        last_day = self.time_period[1]

        # pick the requested columns, index them by date and keep only the period of interest
        forcing_table = raw.loc[:, self.forcing].set_index('date').loc[first_day:last_day]
        target_table = raw.loc[:, self.target].set_index('date').loc[first_day:last_day]

        # everything as float
        self.df_forcing = forcing_table.astype(np.float64)
        self.df_target = target_table.astype(np.float64)

        # tensors with the inputs (X) and the output (y)
        self.X = torch.tensor(self.df_forcing.to_numpy(), dtype=torch.float32)
        self.y = torch.tensor(self.df_target.to_numpy(), dtype=torch.float32)

    def __len__(self):
        # number of timesteps in the selected period
        return self.X.shape[0]

    def __getitem__(self, id):
        # inputs and target for the requested timestep(s)
        return self.X[id, :], self.y[id]

    def year_batches(self, cutoff):
        """Split the timestep positions into consecutive groups that end at a yearly cutoff.

        Args:
            cutoff: [month, day, hour] that closes each group.

        Returns:
            List of lists of positions; a new list starts after every timestep
            whose date equals the cutoff of its year. Positions after the last
            cutoff are not returned.
        """
        groups = []
        current = []
        for position, timestamp in enumerate(pd.DatetimeIndex(self.df_forcing.index)):
            current.append(position)
            closing_date = datetime.datetime(timestamp.year, cutoff[0], cutoff[1], cutoff[2])
            if timestamp == closing_date:
                groups.append(current)
                current = []

        return groups

In [None]:
# During the optimization I will use batches of 2 years of data, one for warmup and the other for
# calibration. This class is used so the DataLoader (pytorch class) can extract the information in
# that way. 

class CustomSampler(BatchSampler):
    """Sampler that delivers two consecutive year batches joined into one index list."""

    def __init__(self, year_batches, suffle=True):
        self.suffle = suffle
        self.year_batches = year_batches

    def __iter__(self):
        # Pair every year with the following one: the first year of each pair is
        # meant for warmup and the second one for training.
        paired_years = [
            first_year + second_year
            for first_year, second_year in zip(self.year_batches, self.year_batches[1:])
        ]
        if self.suffle:
            random.shuffle(paired_years)

        # Hand the combinations to the DataLoader one at a time.
        yield from paired_years

    def __len__(self) -> int:
        # one combination less than the number of years
        return len(self.year_batches) - 1

In [None]:
# Auxiliary function used by the PyTorch DataLoader to return all the information that is used during the
# optimization process

def collate_fn(batch):
    """Return the elements of the first (and only expected) entry of ``batch`` as a list."""
    return list(batch[0])

In [None]:
# Loss used in the optimization. It is 1 - NSE (i.e. the NSE without its leading "1 -" term), so that
# maximizing the NSE is stated as a minimization problem.
def nse_loss(pred, obs):
    """Return the sum of squared errors divided by the sum of squared deviations of ``obs`` from its mean."""
    squared_error = (pred - obs).pow(2).sum()
    observed_spread = (obs - obs.mean()).pow(2).sum()
    return squared_error / observed_spread

# function used to train the model
def train_model(data_loader, model, loss_function, optimizer, initial_states, warmup_period=0):
    """Run one epoch of optimization over all batches of ``data_loader``.

    For every batch the model is run, the loss is evaluated on the timesteps
    after ``warmup_period`` and the optimizer updates the model parameters.

    Returns:
        Tuple ``(q_sim, avg_loss)``: the simulation of the last batch and the
        mean loss over all batches of the epoch.
    """
    batch_count = len(data_loader)
    batch_losses = []

    model.train()
    # One parameter update per batch.
    for X, y in data_loader:
        optimizer.zero_grad()
        # simulate, then compare with the observations after the warmup
        q_sim = model(X, initial_states, warmup_period)
        loss = loss_function(q_sim, y[warmup_period:])
        # backpropagate and update the parameters
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    # mean loss of the epoch
    avg_loss = sum(batch_losses) / batch_count

    return q_sim, avg_loss

In [None]:
# Initialize information: paths of the input data and of the folder where the results are written
path_basins= '../data/CAMELS-GB/timeseries_v2/Selected_Basins_hybrid.csv'
path_ts = '../data/CAMELS-GB/timeseries_v2/'
path_output = '../results/models/SHM/'

# dynamic forcings and target (ALWAYS INCLUDE THE DATE AS FIRST ARGUMENT)
forcing=['date','precipitation', 'peti', 'temperature']
target=['date', 'discharge_spec']

# training and testing periods [first day, last day]; the first `warmup_period` timesteps of a
# simulation are only used to stabilize the model states and are excluded from the loss
training_period = ['1987-10-01','1999-09-30']
testing_period = ['2005-10-01','2012-09-30']
warmup_period = 365

# optimization hyperparameters
optimization_hyperparameters = {
    "no_of_epochs": 40,
    "learning_rate": 0.05
    }

# optimization ranges [lower, upper] of each model parameter (the order matters: the model reads
# the parameters by position)
parameter_ranges = {
    'dd' : [0.0,10],
    'f_thr'  : [10,60],
    'sumax' : [20,700],
    'beta'  : [1.0, 6.0],
    'perc'  : [0.0, 1.0],
    'kf'    : [1.0, 20.0],
    'ki'    : [1.0, 100.0],
    'kb'    : [10.0, 1000.0]
}

# initial states of the reservoirs (not too important because there is 1 year of warmup)
initial_states = {
    'ss_0'  : 0.0,
    'sf_0'  : 1.0,
    'su_0'  : 5.0,
    'si_0'  : 10.0,
    'sb_0'  : 15.0
}

# Read the IDs of the basins that will be optimized (the first row of the file is skipped).
# ndmin=1 keeps the result one-dimensional, so a file with a single basin ID also yields a list
# (without it, loadtxt returns a 0-d array and list() fails).
selected_basins_id= list(np.loadtxt(path_basins, skiprows=1, ndmin=1).astype(int))

In [None]:
# Make sure the folder where the results are stored is available: report it when it is already
# there, otherwise create it.
if os.path.exists(path_output):
    print(f"Folder '{path_output}' already exists.")
else:
    # Create the folder
    os.makedirs(path_output)
    print(f"Folder '{path_output}' created successfully.")

In [None]:
# dataframe to store the calibration results (one row per basin)
columns_name = ['basin_id', 'NSE_training', 'dd', 'f_thr', 'su_max', 'beta', 'perc', 'kf', 'ki', 'kb' , 'NSE_testing'] 
df_calibration = pd.DataFrame(index=range(len(selected_basins_id)), columns=columns_name)

# Lists to store information during optimization
SHM_models = []
train_datasets = []
test_datasets = []

# The initial reservoir states are the same for every basin, so the tensor is built only once.
states = torch.tensor(list(initial_states.values()), dtype=torch.float32)

# Loop to go through each basin that will be calibrated
for i, basin in enumerate(selected_basins_id):
    # Dataset for training
    train_datasets.append(Forcing_DataSet(basin_id=basin,
                                          forcing=forcing,
                                          target=target,
                                          time_period=training_period,
                                          path_ts=path_ts))

    # DataLoader for training: every batch joins two consecutive years (cutoff on 30 September)
    year_batches = train_datasets[i].year_batches([9, 30, 0])
    train_loader = DataLoader(train_datasets[i], 
                              sampler=CustomSampler(year_batches = year_batches),
                              collate_fn=collate_fn)

    # Define model
    SHM_models.append(SHM(parameter_ranges))
    # Define optimizer
    optimizer = torch.optim.Adam(SHM_models[i].parameters(), lr=optimization_hyperparameters['learning_rate'])
    
    # Training --------------------------------------------------
    for epoch in range(optimization_hyperparameters['no_of_epochs']): 
        q_sim, avg_loss= train_model(data_loader=train_loader, 
                                     model=SHM_models[i],
                                     loss_function=nse_loss,
                                     optimizer=optimizer, 
                                     initial_states=states,
                                     warmup_period=warmup_period)
        
        print(f'Epoch: {epoch + 1:<2} | Loss: {"%.4f "% avg_loss}')

    # Calculate NSE for whole training set (the loss during the previous loop was calculated by
    # batches). The first year and all the remaining years are given as two "year batches", so the
    # sampler joins them into one batch that covers the whole period.
    temp = [year_batches[0] , sum(year_batches[1:], [])]
    loader_2 = DataLoader(train_datasets[i], sampler=CustomSampler(year_batches = temp),collate_fn=collate_fn)
    X_training, y_training = next(iter(loader_2))
    # Evaluation only: no gradients are needed, so the autograd graph over the whole series is
    # not built.
    with torch.no_grad():
        # run model
        q_sim, _ = SHM_models[i].run_SHM(X_training, states)
        # calculate training NSE (the warmup timesteps are excluded)
        nse_training = np.round(1.0 - nse_loss(q_sim[warmup_period:], y_training[warmup_period:]).numpy(),3)
    # get the optimized parameters
    parameters = np.round(SHM_models[i].calibrated_parameters().detach().numpy(),3)

    
    # Testing --------------------------------------------------
    test_datasets.append(Forcing_DataSet(basin_id=basin,
                                         forcing=forcing,
                                         target=target,
                                         time_period=testing_period,
                                         path_ts=path_ts))
    
    year_batches = test_datasets[i].year_batches([9, 30, 0])
    temp = [year_batches[0] , sum(year_batches[1:], [])]
    loader_2 = DataLoader(test_datasets[i], sampler=CustomSampler(year_batches = temp), collate_fn=collate_fn)
    X_testing, y_testing = next(iter(loader_2))
    with torch.no_grad():
        # run model
        q_sim, _ = SHM_models[i].run_SHM(X_testing, states)
        # calculate testing NSE (the warmup timesteps are excluded)
        nse_testing = np.round(1.0 - nse_loss(q_sim[warmup_period:], y_testing[warmup_period:]).numpy(),3)
    
    # Save the results
    row_data = [basin, nse_training] + list(parameters) + [nse_testing]
    df_calibration.loc[i] = row_data
    print('Calibration of basin:'+str(i+1)+'/'+str(len(selected_basins_id))+' with ID:'+str(basin)+' is completed-------------------------------')

# Save all the results in a csv file
df_calibration.to_csv(path_output+'SHM_SGD_calibration.csv', index=False)