# Can we reduce overfitting?
An experiment using one basin

In [1]:
import sys, os, glob, pickle, toml, json, pickle, random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import multiprocessing


sys.path.append('../../MOASMO_support')
from MOASMO_parameter_allbasin_emulator import *

In [2]:
# load data function, taken from ~/CTSM_repos/CTSM_calibration/src/MOASMO_support/MOASMO_parameter_allbasin_emulator.py
def load_basin_data():
    """Load the basin tables and the MOASMO samples used to train the emulators.

    All input locations are hard-coded paths on the GLADE file system. The helper
    functions called here (read_allbasin_defa_params, load_basin_param_bounds,
    read_camels_attributes, load_all_basin_params_metrics) are not defined in this
    notebook; presumably they come from the star import of
    MOASMO_parameter_allbasin_emulator -- verify against that module.

    Side effects: creates the ``allbasin_emulator`` folder under the MOASMO path if
    it does not exist, and prints the selected attributes and sample counts.

    Returns
    -------
    tuple
        (df_basin_info, df_param_info, df_param_defa, df_param_lb, df_param_ub,
        df_att, df_att_foruse, df_param, df_metric, df_basinid).
        df_param, df_metric and df_basinid are row-aligned, have a fresh 0..n-1
        index, and contain no row whose metric sum is NaN.
    """
    infile_basin_info = "/glade/work/guoqiang/CTSM_CAMELS/data_mesh_surf/HillslopeHydrology/CAMELS_level1_basin_info.csv"
    infile_param_info = '/glade/u/home/guoqiang/CTSM_repos/CTSM_calibration/src/parameter/CTSM_CAMELS_SA_param_240202.csv'
    infile_attr_foruse = '/glade/u/home/guoqiang/CTSM_repos/CTSM_calibration/data/camels_attributes_table_TrainModel.csv'
    inpath_moasmo = "/glade/campaign/cgd/tss/people/guoqiang/CTSM_CAMELS_proj/Calib_HH_MOASMO_bigrange"
    path_CTSM_case = '/glade/work/guoqiang/CTSM_CAMELS/Calib_HH_MOASMO_bigrange'
    iterend = 1  # only read data from iter-0

    outpath = f"{inpath_moasmo}/allbasin_emulator"
    os.makedirs(outpath, exist_ok=True)

    # Load data: same for all iterations
    df_basin_info = pd.read_csv(infile_basin_info)
    df_param_info = pd.read_csv(infile_param_info)

    file_defa_param = f'{outpath}/camels_627basin_ctsm_defa_param.csv'
    df_param_defa = read_allbasin_defa_params(path_CTSM_case, infile_param_info, file_defa_param, len(df_basin_info))

    file_param_lb = f'{outpath}/camels_627basin_ctsm_all_param_lb.gz'
    file_param_ub = f'{outpath}/camels_627basin_ctsm_all_param_ub.gz'
    df_param_lb, df_param_ub = load_basin_param_bounds(inpath_moasmo, df_param_defa, file_param_lb, file_param_ub)

    file_camels_attribute = f'{outpath}/camels_627basin_attribute.pkl'
    df_att = read_camels_attributes(infile_basin_info, file_camels_attribute)

    # The 'att_Xie2021' column is used as a boolean mask over the attribute table.
    df_att_foruse = pd.read_csv(infile_attr_foruse)
    useattrs = list(df_att_foruse[df_att_foruse['att_Xie2021'].values]['Attribute_text'].values)
    print("The number of attributes used:", len(useattrs))
    print(useattrs)

    # Load data: outputs from each iteration. Frames are collected in lists and
    # concatenated once, instead of growing a frame inside the loop.
    param_parts, metric_parts, basinid_parts = [], [], []
    for iter_idx in range(iterend):  # 'iter_idx' avoids shadowing the builtin iter()
        file_all_param = f'{outpath}/camels_627basin_ctsm_all_param_iter{iter_idx}.gz'
        file_all_metric = f'{outpath}/camels_627basin_ctsm_all_metric_iter{iter_idx}.gz'
        file_all_basinid = f'{outpath}/camels_627basin_ctsm_all_basinid_iter{iter_idx}.gz'

        df_param_i, df_metric_i, df_basinid_i = load_all_basin_params_metrics(
            inpath_moasmo, df_param_defa, df_basin_info, iter_idx,
            file_all_param, file_all_metric, file_all_basinid,
        )

        # Tag every sample with the iteration it came from.
        df_basinid_i['iter'] = iter_idx

        param_parts.append(df_param_i)
        metric_parts.append(df_metric_i)
        basinid_parts.append(df_basinid_i)

    df_param = pd.concat(param_parts)
    df_metric = pd.concat(metric_parts)
    df_basinid = pd.concat(basinid_parts)

    # Renumber rows so the three tables share one positional index.
    for frame in (df_param, df_metric, df_basinid):
        frame.index = np.arange(len(frame))

    # A row is dropped when the sum of its metrics is NaN.
    has_nan = np.isnan(np.sum(df_metric.values, axis=1))
    df_param = df_param[~has_nan]
    df_metric = df_metric[~has_nan]
    df_basinid = df_basinid[~has_nan]

    for frame in (df_param, df_metric, df_basinid):
        frame.index = np.arange(len(frame))

    print('Number of nan samples:', np.sum(has_nan))
    print("Number of original parameter sets:", len(has_nan))
    print("Number of final parameter sets:", len(df_param))

    return df_basin_info, df_param_info, df_param_defa, df_param_lb, df_param_ub, df_att, df_att_foruse, df_param, df_metric, df_basinid

# Load data and save data for model training and comparison
Here I just load the outputs from the large-sample emulator (LSE) workflow, which has already summarized the outputs from the individual basins.

In [3]:
# Alternative (not used): read the cached iteration-0 tables directly, e.g.
#   inpath = '/glade/campaign/cgd/tss/people/guoqiang/CTSM_CAMELS_proj/Calib_HH_MOASMO_bigrange/allbasin_emulator'
#   df_param   = pd.read_csv(f'{inpath}/camels_627basin_ctsm_all_param_iter0.gz', compression='gzip')
#   df_metric  = pd.read_csv(f'{inpath}/camels_627basin_ctsm_all_metric_iter0.gz', compression='gzip')
#   df_basinid = pd.read_csv(f'{inpath}/camels_627basin_ctsm_all_basinid_iter0.gz', compression='gzip')
# load_basin_data() is used instead: it also returns the basin/parameter/attribute
# tables and drops samples whose metrics are NaN.

(
    df_basin_info,
    df_param_info,
    df_param_defa,
    df_param_lb,
    df_param_ub,
    df_att,
    df_att_foruse,
    df_param,
    df_metric,
    df_basinid,
) = load_basin_data()

print('Number of basins:', df_basin_info.shape[0])
print('Number of all parameters:', df_param_info.shape[0])
print('Number of all attributes:', df_att.shape[1])

File exists: /glade/campaign/cgd/tss/people/guoqiang/CTSM_CAMELS_proj/Calib_HH_MOASMO_bigrange/allbasin_emulator/camels_627basin_attribute.pkl
The number of attributes used: 27
['mean_elev', 'mean_slope', 'area_gauges2', 'p_mean', 'pet_mean', 'aridity', 'p_seasonality', 'frac_snow', 'high_prec_freq', 'high_prec_dur', 'low_prec_freq', 'low_prec_dur', 'frac_forest', 'lai_max', 'lai_diff', 'dom_land_cover', 'dom_land_cover_frac', 'soil_depth_pelletier', 'soil_depth_statsgo', 'soil_porosity', 'soil_conductivity', 'max_water_content', 'sand_frac', 'silt_frac', 'clay_frac', 'carbonate_rocks_frac', 'geol_permeability']
Number of nan samples: 3309
Number of original parameter sets: 250800
Number of final parameter sets: 247491
Number of basins: 627
Number of all parameters: 27
Number of all attributes: 62


In [4]:
# Root of the MOASMO calibration outputs, and the folder that receives the
# outputs of the emulator experiments (created if it does not exist yet).
inpath_moasmo = '/glade/campaign/cgd/tss/people/guoqiang/CTSM_CAMELS_proj/Calib_HH_MOASMO_bigrange'
outpath_all = inpath_moasmo + '/LargeSampleEmulator_exps_out'
os.makedirs(outpath_all, exist_ok=True)

# One row per basin in the basin info table.
numbasin = df_basin_info.shape[0]

In [5]:
num_cpus = 1  # number of CPUs to use; raise it (e.g. to 4) for parallel runs -- not referenced by the cells shown here

# SSE train and CV

## Train/Evaluate RF model

Key Parameters to Adjust to reduce overfitting

n_estimators  
Definition: The number of trees in the forest.  
Adjustment: Increase the number of trees until the model's performance on the validation set stabilizes. More trees generally reduce variance.  

max_depth  
Definition: The maximum depth of each tree.  
Adjustment: Limit the maximum depth of the trees. Shallower trees (smaller max_depth) can prevent the model from becoming too complex, thus reducing overfitting. For example, try setting max_depth=10.  

min_samples_split  
Definition: The minimum number of samples required to split an internal node.  
Adjustment: Increase this value to require more samples at a node before it splits. This can prevent the model from learning overly specific patterns. For example, try setting min_samples_split=10.  

min_samples_leaf  
Definition: The minimum number of samples required to be at a leaf node.  
Adjustment: Increase this value to ensure that each leaf has a sufficient number of samples, which can help prevent overfitting. For example, try setting min_samples_leaf=5.  

max_features  
Definition: The number of features to consider when looking for the best split.  
Adjustment: Reduce this value to limit the number of features considered for each split. This increases the diversity among the trees, helping to reduce overfitting. For example, try setting max_features='sqrt' or max_features=0.3.  

bootstrap  
Definition: Whether bootstrap samples are used when building trees.  
Adjustment: Ensure this is set to True to increase the diversity of the trees by sampling with replacement.  

max_samples  
Definition: The number of samples to draw from X to train each base estimator.  
Adjustment: Set this to a fraction of the total number of samples to increase the diversity of the trees. For example, try setting max_samples=0.8.  

Findings: some parameters, such as min_samples_split and max_samples, can reduce overfitting, but they fail to improve performance in the test period. Therefore, they are not used.  

In [72]:
# Parallel version

from sklearn.model_selection import KFold


def rf_emulator_cv(x, y, xlb_mean, xub_mean, n_splits=5, seed=1234567890, rf_params=None):
    """Run a shuffled K-fold cross-validation of a random-forest emulator.

    Parameters
    ----------
    x : ndarray, shape (n_samples, n_params)
        Parameter samples; scaled here to (x - lower) / (upper - lower).
    y : ndarray
        Target values, indexed by sample along the first axis. The caller in this
        notebook passes a single column of shape (n_samples, 1).
    xlb_mean, xub_mean : ndarray, shape (n_params,)
        Lower and upper parameter bounds used for the scaling. They must differ
        element-wise, otherwise the scaling divides by zero.
    n_splits : int, default 5
        Number of folds.
    seed : int, default 1234567890
        Seed applied to the *global* ``random`` and ``numpy.random`` generators.
        KFold and the forest are created without a ``random_state``, so they draw
        from the global NumPy generator; seeding it here keeps the folds and the
        fitted forests reproducible. Note that this resets global RNG state.
    rf_params : dict, optional
        Keyword arguments forwarded to ``RandomForestRegressor`` (for example
        ``{'max_depth': 10, 'min_samples_leaf': 5}``). Defaults to the
        scikit-learn defaults.

    Returns
    -------
    dict
        Keyed by fold number starting at 1. Each value holds 'train_index',
        'test_index', 'y_train', 'y_test', 'y_test_pred' and 'y_train_pred'
        (the y arrays are squeezed).
    """
    # Imported locally because no import cell of this notebook names this class
    # explicitly; it was previously only reachable through a star import.
    from sklearn.ensemble import RandomForestRegressor

    random.seed(seed)
    np.random.seed(seed)

    rf_kwargs = dict(rf_params) if rf_params else {}
    kf = KFold(n_splits=n_splits, shuffle=True)

    # normalize parameters to the [0, 1] range spanned by their bounds
    x = (x - xlb_mean) / (xub_mean - xlb_mean)

    cv_results = {}
    for fold_idx, (train_index, test_index) in enumerate(kf.split(x), 1):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit a random forest on the training folds
        sm = RandomForestRegressor(**rf_kwargs)
        sm.fit(x_train, y_train)

        # Store observations and predictions for both splits so that train and
        # test skill can be compared (the overfitting check).
        cv_results[fold_idx] = {
            'train_index': train_index,
            'test_index': test_index,
            'y_train': np.squeeze(y_train),
            'y_test': np.squeeze(y_test),
            'y_test_pred': np.squeeze(sm.predict(x_test)),
            'y_train_pred': np.squeeze(sm.predict(x_train)),
        }

    return cv_results


def process_basin(i, emulator_cv=None):
    """Cross-validate an emulator on the samples of one basin.

    Reads the notebook-level tables df_basinid, df_metric, df_param, df_param_lb
    and df_param_ub.

    Parameters
    ----------
    i : int
        Basin identifier. It is matched against df_basinid['basin_id'] and also
        used as the row position in the bound tables -- this assumes basin ids
        equal row positions there; TODO confirm.
    emulator_cv : callable, optional
        Function called as ``emulator_cv(x, y, lower, upper)`` that returns the
        cross-validation results. Defaults to ``rf_emulator_cv``.

    Returns
    -------
    dict
        Whatever ``emulator_cv`` returns.
    """
    if emulator_cv is None:
        emulator_cv = rf_emulator_cv

    indi = df_basinid['basin_id'].values == i

    # Rescale KGE as kge / (2 - kge), which maps values in (-inf, 1] to (-1, 1].
    kgei = df_metric.loc[indi, 'kge'].values
    kgei = kgei / (2 - kgei)
    parami = df_param[indi].values

    # only select useful params: those whose lower and upper bounds differ
    lbi = df_param_lb.iloc[i].values
    ubi = df_param_ub.iloc[i].values
    induse = lbi != ubi
    parami = parami[:, induse]
    lbi = lbi[induse]
    ubi = ubi[induse]

    # The emulators expect a 2-D target, so pass KGE as a single column.
    metrics_use = kgei[:, np.newaxis]

    cv_results = emulator_cv(parami, metrics_use, lbi, ubi)
    return cv_results


def evaluate_cv(cv_results):
    """Score every cross-validation fold on its train and test samples.

    Parameters
    ----------
    cv_results : dict
        Keyed by fold number 1..n_folds; each value provides 'y_train',
        'y_test', 'y_train_pred' and 'y_test_pred'.

    Returns
    -------
    tuple of ndarray
        (rmse_test, rmse_train, cc_test, cc_train), each of length n_folds and
        ordered by fold number.
    """
    n_folds = len(cv_results)
    rmse_test, rmse_train, cc_test, cc_train = (np.full(n_folds, np.nan) for _ in range(4))

    for slot, fold in enumerate(range(1, n_folds + 1)):
        res = cv_results[fold]

        # RMSE and correlation between observed and predicted values
        rmse_test[slot] = get_rmse(res['y_test'], res['y_test_pred'])
        rmse_train[slot] = get_rmse(res['y_train'], res['y_train_pred'])
        cc_test[slot] = get_cc(res['y_test'], res['y_test_pred'])
        cc_train[slot] = get_cc(res['y_train'], res['y_train_pred'])

    return rmse_test, rmse_train, cc_test, cc_train


def get_rmse(d1, d2):
    """Return the root-mean-square difference of d1 and d2, ignoring NaN entries."""
    squared_error = np.square(d1 - d2)
    return np.nanmean(squared_error) ** 0.5

def get_cc(d1, d2):
    """Return the Pearson correlation of d1 and d2 over the pairs where neither is NaN."""
    valid = np.logical_not(np.isnan(d1 + d2))
    corr_matrix = np.corrcoef(d1[valid], d2[valid])
    return corr_matrix[0, 1]

# Single-basin experiment (basin 0) with the random-forest emulator:
# print the fold-averaged test RMSE followed by the train RMSE.
i = 0
cv_results = process_basin(i)
rmse_test, rmse_train, cc_test, cc_train = evaluate_cv(cv_results)
print(rmse_test.mean(), rmse_train.mean())

0.05831038255420521 0.02150045959230613


## Train/Evaluate MLP model

Key Parameters to Adjust  

hidden_layer_sizes  
Definition: The number of neurons in each hidden layer.  
Adjustment: Experiment with different sizes and numbers of hidden layers. More neurons and layers can capture more complex patterns but also increase the risk of overfitting. Try smaller sizes first, like (50,), (100,), or (100, 50).  

activation  
Definition: The activation function for the hidden layers.  
Adjustment: Common choices are 'relu', 'tanh', and 'logistic'. 'relu' is often a good starting point. Experiment to see which activation function performs best for your data.  

solver  
Definition: The optimization algorithm.  
Adjustment: Common solvers are 'adam', 'sgd', and 'lbfgs'. 'adam' is a popular choice due to its efficiency and good performance. However, 'sgd' with appropriate learning rate schedules and momentum can also be effective.  

alpha  
Definition: L2 regularization parameter (also known as weight decay).  
Adjustment: Increase alpha to add more regularization, which can help prevent overfitting. Common values to try are 0.0001, 0.001, 0.01, and 0.1.  

learning_rate  
Definition: The learning rate for weight updates.  
Adjustment: Smaller learning rates can improve convergence but may require more epochs. 'constant', 'invscaling', and 'adaptive' are common options. 'adaptive' can adjust the learning rate dynamically based on performance.  

learning_rate_init  
Definition: Initial learning rate.  
Adjustment: Typical values range from 0.001 to 0.1. Start with 0.001 and adjust based on performance.  

max_iter  
Definition: Maximum number of iterations.  
Adjustment: Ensure this is large enough for the model to converge. Values like 200, 500, or 1000 are common starting points.  

early_stopping  
Definition: Whether to stop training when the validation score is not improving.  
Adjustment: Set this to True to enable early stopping, which can prevent overfitting by stopping training once the validation score stops improving.  

validation_fraction  
Definition: The proportion of training data to set aside as validation set for early stopping.  
Adjustment: Typically set to 0.1 (i.e., 10% of the training data).   

Additional Techniques  
Dropout  
Description: A regularization technique where randomly selected neurons are ignored during training.  
Implementation: Unfortunately, sklearn's MLP does not support dropout. To use dropout, consider using deep learning libraries like Keras or TensorFlow.  

Data Augmentation  
Description: Increase the diversity of the training data by applying transformations.  
Implementation: Common in image processing, less so in tabular data, but you can still consider generating synthetic samples if appropriate.  


Finding: 

Increasing alpha to between 0.2 and 0.8 reduces overfitting and improves performance, but overfitting is still large.  
Using early_stopping with validation_fraction reduces overfitting, but overfitting is still large and the test performance is worse.  

In [75]:

# Parallel version
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold


def mlp_emulator_cv(x, y, xlb_mean, xub_mean, n_splits=5, seed=1234567890, mlp_params=None):
    """Run a shuffled K-fold cross-validation of an MLP emulator.

    The model is a pipeline of ``StandardScaler`` followed by ``MLPRegressor``.

    Parameters
    ----------
    x : ndarray, shape (n_samples, n_params)
        Parameter samples; scaled here to (x - lower) / (upper - lower) before
        they enter the pipeline.
    y : ndarray
        Target values, indexed by sample along the first axis. The caller in this
        notebook passes a single column of shape (n_samples, 1).
    xlb_mean, xub_mean : ndarray, shape (n_params,)
        Lower and upper parameter bounds used for the scaling. They must differ
        element-wise, otherwise the scaling divides by zero.
    n_splits : int, default 5
        Number of folds.
    seed : int, default 1234567890
        Seed applied to the *global* ``random`` and ``numpy.random`` generators.
        KFold and the MLP are created without a ``random_state``, so they draw
        from the global NumPy generator; seeding it here keeps the folds and the
        fitted networks reproducible. Note that this resets global RNG state.
    mlp_params : dict, optional
        Keyword arguments for ``MLPRegressor``. They are merged over the defaults
        used in this experiment, ``hidden_layer_sizes=(2000,)`` and ``alpha=0.9``.

    Returns
    -------
    dict
        Keyed by fold number starting at 1. Each value holds 'train_index',
        'test_index', 'y_train', 'y_test', 'y_test_pred' and 'y_train_pred'
        (the y arrays are squeezed).
    """
    random.seed(seed)
    np.random.seed(seed)

    # One wide hidden layer with strong L2 regularisation unless overridden.
    mlp_kwargs = {'hidden_layer_sizes': (2000,), 'alpha': 0.9}
    if mlp_params:
        mlp_kwargs.update(mlp_params)

    kf = KFold(n_splits=n_splits, shuffle=True)

    # normalize parameters to the [0, 1] range spanned by their bounds
    x = (x - xlb_mean) / (xub_mean - xlb_mean)

    cv_results = {}
    for fold_idx, (train_index, test_index) in enumerate(kf.split(x), 1):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit the scaler + MLP pipeline on the training folds
        sm = make_pipeline(StandardScaler(), MLPRegressor(**mlp_kwargs))
        sm.fit(x_train, y_train)

        # Store observations and predictions for both splits so that train and
        # test skill can be compared (the overfitting check).
        cv_results[fold_idx] = {
            'train_index': train_index,
            'test_index': test_index,
            'y_train': np.squeeze(y_train),
            'y_test': np.squeeze(y_test),
            'y_test_pred': np.squeeze(sm.predict(x_test)),
            'y_train_pred': np.squeeze(sm.predict(x_train)),
        }

    return cv_results


def process_basin(i, emulator_cv=None):
    """Cross-validate an emulator on the samples of one basin.

    This definition rebinds the name ``process_basin`` that an earlier cell of
    this notebook defined for the random-forest emulator; from here on the
    default emulator is the MLP.

    Reads the notebook-level tables df_basinid, df_metric, df_param, df_param_lb
    and df_param_ub.

    Parameters
    ----------
    i : int
        Basin identifier. It is matched against df_basinid['basin_id'] and also
        used as the row position in the bound tables -- this assumes basin ids
        equal row positions there; TODO confirm.
    emulator_cv : callable, optional
        Function called as ``emulator_cv(x, y, lower, upper)`` that returns the
        cross-validation results. Defaults to ``mlp_emulator_cv``.

    Returns
    -------
    dict
        Whatever ``emulator_cv`` returns.
    """
    if emulator_cv is None:
        emulator_cv = mlp_emulator_cv

    indi = df_basinid['basin_id'].values == i

    # Rescale KGE as kge / (2 - kge), which maps values in (-inf, 1] to (-1, 1].
    kgei = df_metric.loc[indi, 'kge'].values
    kgei = kgei / (2 - kgei)
    parami = df_param[indi].values

    # only select useful params: those whose lower and upper bounds differ
    lbi = df_param_lb.iloc[i].values
    ubi = df_param_ub.iloc[i].values
    induse = lbi != ubi
    parami = parami[:, induse]
    lbi = lbi[induse]
    ubi = ubi[induse]

    # The emulators expect a 2-D target, so pass KGE as a single column.
    metrics_use = kgei[:, np.newaxis]

    cv_results = emulator_cv(parami, metrics_use, lbi, ubi)
    return cv_results

# Single-basin experiment (basin 0) with the MLP emulator:
# print the fold-averaged test RMSE followed by the train RMSE.
i = 0
cv_results = process_basin(i)
rmse_test, rmse_train, cc_test, cc_train = evaluate_cv(cv_results)
print(rmse_test.mean(), rmse_train.mean())

0.056014210596499955 0.04126194164261151
