In [1]:

# use observed streamflow data to evaluate model outputs
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob, os, sys, toml

def get_modified_KGE(obs, sim):
    ind = (~np.isnan(obs)) & (~np.isnan(sim))
    obs = obs[ind]
    sim = sim[ind]

    try:
        sd_sim = np.std(sim, ddof=1)
        sd_obs = np.std(obs, ddof=1)
        m_sim = np.mean(sim)
        m_obs = np.mean(obs)
        r = (np.corrcoef(sim, obs))[0, 1]
        relvar = (float(sd_sim)/float(m_sim))/(float(sd_obs)/float(m_obs))
        bias = float(m_sim)/float(m_obs)
        kge = 1.0 - np.sqrt((r-1)**2 + (relvar-1)**2 + (bias-1)**2)
    except:
        kge = np.nan

    return kge

metric_out = np.nan * np.zeros([627, 4])

basin = 0
print('basin', basin)

if os.path.isfile(f'CAMELS_4model_emulator_metric_{basin}.npz'):
    sys.exit('file exists')


iterflag = 0

basin 0


In [2]:
####### load data

nmet = 22  # Or nmet = 19 based on your comment
metrics = np.nan * np.zeros([400, nmet])  # Removed basinnum dimension, assuming single basin focus
params = np.nan * np.zeros([400, 30])  # Removed basinnum dimension, assuming single basin focus


configfile = f'/glade/work/guoqiang/CTSM_CAMELS/Calib_HH_MOASMO/configuration/_level1-{basin}_config_MOASMO.toml'
config = toml.load(configfile)

for trialflag in range(400):

	if config['path_calib'] == 'NA':
		path_MOASMOcalib = f'{path_CTSM_base}_MOASMOcalib'  # Ensure path_CTSM_base is defined somewhere
	else:
		path_MOASMOcalib = config['path_calib']
	path_archive = f'{path_MOASMOcalib}/ctsm_outputs'
	caseflag = f'iter{iterflag}_trial{trialflag}'  # Ensure iterflag is defined or handled as needed
	outfile_metric = f'{path_archive}/{caseflag}/evaluation_many_metrics.csv'

	# load metric
	if os.path.isfile(outfile_metric):
		try:
			df = pd.read_csv(outfile_metric)
			metrics[trialflag, :] = df.values[0]  # Adjusted for the removed basin dimension
		except:
			print('failed reading')

	# load parameter
	dfparam = pd.read_pickle(f'{path_archive}/{caseflag}/paramset_{caseflag}.pkl')
	param = dfparam['Value'].values
	param = np.array([np.mean(i) for i in param])
	
	lower = np.array([i != 'None' for i in dfparam['Lower'].values])
	param = param[lower]
	
	params[trialflag, 0:len(param)] = param

params = params[:, ~np.isnan(params[0,:])]

metnames = df.columns.values
# Adjusting metrics based on new definitions
for m in ['kge', 'cc', 'nse', 'kge_log_q', 'kge_summer', 'kge_winter', 'kge_spring', 'kge_autumn']:
	ind1 = np.where(metnames == m)[0][0]
	metrics[:, ind1] = 1 - metrics[:, ind1]  # Minimize values, adjusted for the removed basin dimension
	metnames[ind1] = '1-' + m


met1 = 'max_mon_abs_err'
met2 = 'mae'
ind1 = np.where(metnames == met1)[0][0]
ind2 = np.where(metnames == met2)[0][0]
metrics_use = metrics[:, [ind1, ind2]]

ind=~np.isnan(metrics_use[:,0]+metrics_use[:,1])
params = params[ind, :]
metrics_use = metrics_use[ind, :]


In [28]:
##### RF
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
import numpy as np

def train_and_evaluate_with_predictions(params, metrics_use):
	print('Processing...')

	cv = KFold(n_splits=5, shuffle=True, random_state=42)
	kge_scores = []  # To store the KGE score for each fold

	# Initialize the Random Forest regressor
	model = RandomForestRegressor(random_state=42)

	for train_idx, test_idx in cv.split(params):
		X_train, X_test = params[train_idx], params[test_idx]
		y_train, y_test = metrics_use[train_idx], metrics_use[test_idx]

		model.fit(X_train, y_train)  # Fit the model on the training data
		y_pred = model.predict(X_test)  # Predict on the test set

		# Calculate KGE for each column (target) and take the mean
		fold_kge_scores = [get_modified_KGE(y_test[:, i], y_pred[:, i]) for i in range(y_test.shape[1])]
		
		print(fold_kge_scores)
        
		fold_mean_kge = np.nanmean(fold_kge_scores)  # Compute the mean KGE score for this fold, ignoring any NaN values
		kge_scores.append(fold_mean_kge)

	mean_kge_score = np.mean(kge_scores)  # Calculate the mean KGE score across all folds

	return mean_kge_score

mean_kge_score = train_and_evaluate_with_predictions(params, metrics_use)
print('Mean KGE score:', mean_kge_score)

Processing...
[0.6380657677456222, 0.6615321741004012]
[0.6240064296306226, 0.7414032560394879]
[0.6758507401391087, 0.7096007360261101]
[0.7369110459972331, 0.7288903836220277]
[0.6879154828784957, 0.696104713738253]
Mean KGE score: 0.6900280729917363


In [26]:

######## GPR
from sklearn.preprocessing import MinMaxScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel as C, Matern
from sklearn.model_selection import KFold
import numpy as np

def train_and_evaluate_with_predictions_gpr(params, metrics_use):
	print('Processing...')

	cv = KFold(n_splits=5, shuffle=True, random_state=42)
	kge_scores = []  # To store the KGE score for each fold

	# Define the kernel with Matern function
	kernel = C(1.0, (1e-3, 1e3)) * Matern(length_scale=1.0, length_scale_bounds=(1e-1, 10.0), nu=1.5)
	
	# Initialize the GPR model
	model = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, normalize_y=True, random_state=42)

	for train_idx, test_idx in cv.split(params):
		X_train, X_test = params[train_idx], params[test_idx]
		y_train, y_test = metrics_use[train_idx], metrics_use[test_idx]

		# Normalize features
		scaler = MinMaxScaler().fit(X_train)
		X_train_scaled = scaler.transform(X_train)
		X_test_scaled = scaler.transform(X_test)

		model.fit(X_train_scaled, y_train)  # Fit the model on the training data
		y_pred = model.predict(X_test_scaled)  # Predict on the test set

		# Calculate KGE for each column (target) and take the mean
		fold_kge_scores = [get_modified_KGE(y_test[:, i], y_pred[:, i]) for i in range(y_test.shape[1])]
		
		print(fold_kge_scores)
        
		fold_mean_kge = np.nanmean(fold_kge_scores)  # Compute the mean KGE score for this fold, ignoring any NaN values
		kge_scores.append(fold_mean_kge)

	mean_kge_score = np.mean(kge_scores)  # Calculate the mean KGE score across all folds

	return mean_kge_score

# Execute the function with your data
mean_kge_score_gpr = train_and_evaluate_with_predictions_gpr(params, metrics_use)
print('Mean KGE score for GPR:', mean_kge_score_gpr)


Processing...
[0.6371493649566508, 0.7363594071615217]
[0.7055858687947985, 0.8638452135502771]
[0.6179766614778539, 0.7898478106865003]
[0.7343635804558604, 0.8468106869178276]
[0.7480605042104231, 0.8585574545860488]
Mean KGE score for GPR: 0.7538556552797762


In [37]:
# MO-ASMO GPR

# import MO-ASMO functions
# path_MOASMO = '/glade/u/home/guoqiang/model_sources/MO-ASMO/src'
path_MOASMO = '/glade/u/home/guoqiang/CTSM_repos/ctsm_optz/MO-ASMO/src/'
sys.path.append(path_MOASMO)
import sampling
import gp
import NSGA2

def gpr_emulator_cv(x, y, alpha, leng_lb, leng_ub, nu, xlb_mean, xub_mean, outpath, iterflag):

    random.seed(1234567890)
    np.random.seed(1234567890)

    n_splits = 5
    
    kf = KFold(n_splits=n_splits, shuffle=True) 
    kge_scores = np.nan * np.zeros([n_splits, y.shape[1]])
    
    for fold_idx, (train_index, test_index) in enumerate(kf.split(x), 1):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # Initialize and train your GPR model here; adjust parameters as needed
        sm = gp.GPR_Matern(x_train, y_train, x_train.shape[1], y_train.shape[1], x_train.shape[0], xlb_mean, xub_mean, alpha=alpha, leng_sb=[leng_lb, leng_ub], nu=nu)
        
        # Predict using the trained model
        y_pred = sm.predict(x_test)  # Adjust this method call based on your model's API
        
        # Evaluate the model using KGE
        for i in range(y.shape[1]):
            kge_scores[fold_idx-1, i] = get_modified_KGE(y_test[:,i], y_pred[:,i])
    
    # Calculate the mean KGE score across all folds
    mean_kge_score = np.nanmean(kge_scores, axis=0)[np.newaxis, :]
    kge_scores = np.concatenate([kge_scores, mean_kge_score])

    # Convert the list of KGE scores into a pandas DataFrame
    kge_scores_df = pd.DataFrame()
    kge_scores_df['Fold'] = list(np.arange(n_splits)+1) + ['mean']
    kge_scores_df['kge1'] = kge_scores[:, 0]
    kge_scores_df['kge2'] = kge_scores[:, 1]
    kge_scores_df['kge_mean'] = (kge_scores[:, 0] + kge_scores[:, 1])/2
    
    print("GPR CV KGE Score for metric1/metric2:")
    display(kge_scores_df)
    
    csv_file_path =  f'{outpath}/GPR_for_iter{iterflag}_CV_kge.csv'
    kge_scores_df.to_csv(csv_file_path, index=False)

    return kge_scores_df


def read_parameter_csv(file_parameter_list):
    df_calibparam = pd.read_csv(file_parameter_list)
    for c in ['Upper', 'Lower', 'Factor', 'Value']:
        if c in df_calibparam.columns:
            if isinstance(df_calibparam.iloc[0][c], str):
                arr = []
                for i in range(len(df_calibparam)):
                    vi = df_calibparam.iloc[i][c]
                    if ',' in vi:
                        arr.append(np.array(vi.split(',')).astype(np.float64))
                    elif '[' in vi:
                        arr.append(np.array(vi.strip('[]').replace('\n', '').split(), dtype=np.float64))
                    else:
                        try:
                            arr.append(np.array([np.float64(vi)]))
                        except:
                            arr.append(np.array([-99999]))
                df_calibparam[c] = arr
    return df_calibparam


param_infofile = '/glade/work/guoqiang/CTSM_CAMELS/data_paramcailb/ParamCalib_1013500.csv'
param_filelist = ['/glade/campaign/cgd/tss/people/guoqiang/CTSM_CAMELS_proj/Calib_HH_MOASMO/level1_0_MOASMOcalib/ctsm_outputs/iter0_all_meanparam.csv']
metric_filelist = ['/glade/campaign/cgd/tss/people/guoqiang/CTSM_CAMELS_proj/Calib_HH_MOASMO/level1_0_MOASMOcalib/ctsm_outputs/iter0_all_metric.csv']
outpath = '/glade/campaign/cgd/tss/people/guoqiang/CTSM_CAMELS_proj/Calib_HH_MOASMO/level1_0_MOASMOcalib/param_sets'
iterflag = 0
num_per_iter = 20
path_CTSM_base = '/glade/work/guoqiang/CTSM_CAMELS/Calib_HH_MOASMO/level1_0'

# define hyper parameters
pop = 100
gen = 100
crossover_rate = 0.9
mu = 20
mum = 20

# define hyperparameter
alpha = 1e-3
leng_lb = 1e-3
leng_ub = 1e3
nu = 2.5

n_sample = num_per_iter # number of selected optimal points

# input data x (parameter sets) and output data y (objective function values)
df_param = pd.concat(map(pd.read_csv, param_filelist))
df_metric = pd.concat(map(pd.read_csv, metric_filelist))
df_info = read_parameter_csv(param_infofile)

param_names = df_info['Parameter'].values # exclude binded parameters
df_param = df_param[param_names]

xlb_mean = np.array([np.nanmean(v) for v in df_info['Lower']])
xub_mean = np.array([np.nanmean(v) for v in df_info['Upper']])

x = df_param.to_numpy()
y = df_metric.to_numpy()

ind = ~np.isnan( np.sum(x,axis=1) + np.sum(y,axis=1))
x, y = x[ind, :], y[ind, :]

nInput = x.shape[1]
nOutput = y.shape[1]


n_splits = 5

kf = KFold(n_splits=n_splits, shuffle=True) 
kge_scores = np.nan * np.zeros([n_splits, y.shape[1]])

for fold_idx, (train_index, test_index) in enumerate(kf.split(x), 1):
	x_train, x_test = x[train_index], x[test_index]
	y_train, y_test = y[train_index], y[test_index]
	
	# Initialize and train your GPR model here; adjust parameters as needed
	sm = gp.GPR_Matern(x_train, y_train, x_train.shape[1], y_train.shape[1], x_train.shape[0], xlb_mean, xub_mean, alpha=alpha, leng_sb=[leng_lb, leng_ub], nu=nu)
	
	# Predict using the trained model
	y_pred = sm.predict(x_test)  # Adjust this method call based on your model's API
	
	# Evaluate the model using KGE
	for i in range(y.shape[1]):
		kge_scores[fold_idx-1, i] = get_modified_KGE(y_test[:,i], y_pred[:,i])

# Calculate the mean KGE score across all folds
mean_kge_score = np.nanmean(kge_scores, axis=0)[np.newaxis, :]
kge_scores = np.concatenate([kge_scores, mean_kge_score])

# Convert the list of KGE scores into a pandas DataFrame
kge_scores_df = pd.DataFrame()
kge_scores_df['Fold'] = list(np.arange(n_splits)+1) + ['mean']
kge_scores_df['kge1'] = kge_scores[:, 0]
kge_scores_df['kge2'] = kge_scores[:, 1]
kge_scores_df['kge_mean'] = (kge_scores[:, 0] + kge_scores[:, 1])/2

print("GPR CV KGE Score for metric1/metric2:")
display(kge_scores_df)

csv_file_path =  f'{outpath}/GPR_for_iter{iterflag}_CV_kge.csv'
kge_scores_df.to_csv(csv_file_path, index=False)

GPR CV KGE Score for metric1/metric2:


Unnamed: 0,Fold,kge1,kge2,kge_mean
0,1,0.804682,0.677885,0.741284
1,2,0.84573,0.619688,0.732709
2,3,0.756634,0.640062,0.698348
3,4,0.884484,0.816419,0.850452
4,5,0.822459,0.732531,0.777495
5,mean,0.822798,0.697317,0.760058


In [39]:
import random
gpr_emulator_cv(x, y, alpha, leng_lb, leng_ub, nu, xlb_mean, xub_mean, outpath, iterflag)

GPR CV KGE Score for metric1/metric2:


Unnamed: 0,Fold,kge1,kge2,kge_mean
0,1,0.877216,-20.19224,-9.657512
1,2,0.835489,0.708413,0.771951
2,3,0.852215,0.75589,0.804052
3,4,0.832827,0.709957,0.771392
4,5,0.836591,0.633923,0.735257
5,mean,0.846868,-3.476811,-1.314972


Unnamed: 0,Fold,kge1,kge2,kge_mean
0,1,0.877216,-20.19224,-9.657512
1,2,0.835489,0.708413,0.771951
2,3,0.852215,0.75589,0.804052
3,4,0.832827,0.709957,0.771392
4,5,0.836591,0.633923,0.735257
5,mean,0.846868,-3.476811,-1.314972


In [45]:
def gpr_emulator_cv(x, y, alpha, leng_lb, leng_ub, nu, xlb_mean, xub_mean, outpath, iterflag):

    np.random.seed(123456789)

    n_splits = 5
    
    kf = KFold(n_splits=n_splits, shuffle=True) 
    kge_scores = np.nan * np.zeros([n_splits, y.shape[1]])
    
    for fold_idx, (train_index, test_index) in enumerate(kf.split(x), 1):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # Initialize and train your GPR model here; adjust parameters as needed
        sm = gp.GPR_Matern(x_train, y_train, x_train.shape[1], y_train.shape[1], x_train.shape[0], xlb_mean, xub_mean, alpha=alpha, leng_sb=[leng_lb, leng_ub], nu=nu)
        
        # Predict using the trained model
        y_pred = sm.predict(x_test)  # Adjust this method call based on your model's API
        
        # Evaluate the model using KGE
        for i in range(y.shape[1]):
            kge_scores[fold_idx-1, i] = get_modified_KGE(y_test[:,i], y_pred[:,i])
    
    # Calculate the mean KGE score across all folds
    mean_kge_score = np.nanmean(kge_scores, axis=0)[np.newaxis, :]
    kge_scores = np.concatenate([kge_scores, mean_kge_score])

    # Convert the list of KGE scores into a pandas DataFrame
    kge_scores_df = pd.DataFrame()
    kge_scores_df['Fold'] = list(np.arange(n_splits)+1) + ['mean']
    kge_scores_df['kge1'] = kge_scores[:, 0]
    kge_scores_df['kge2'] = kge_scores[:, 1]
    kge_scores_df['kge_mean'] = (kge_scores[:, 0] + kge_scores[:, 1])/2
    
    print("GPR CV KGE Score for metric1/metric2:")
    display(kge_scores_df)
    
    csv_file_path =  f'{outpath}/GPR_for_iter{iterflag}_CV_kge.csv'
    kge_scores_df.to_csv(csv_file_path, index=False)

    return kge_scores_df

kge_scores_df = gpr_emulator_cv(x, y, alpha, leng_lb, leng_ub, nu, xlb_mean, xub_mean, outpath, iterflag)

GPR CV KGE Score for metric1/metric2:


Unnamed: 0,Fold,kge1,kge2,kge_mean
0,1,0.873527,0.758944,0.816235
1,2,0.863188,0.737863,0.800526
2,3,0.845536,0.645828,0.745682
3,4,0.774624,0.722338,0.748481
4,5,0.809211,0.605578,0.707394
5,mean,0.833217,0.69411,0.763664


In [46]:
	# Initialize and train your GPR model here; adjust parameters as needed
	sm = gp.GPR_Matern(x_train, y_train, x_train.shape[1], y_train.shape[1], x_train.shape[0], xlb_mean, xub_mean, alpha=alpha, leng_sb=[leng_lb, leng_ub], nu=nu)
	
	# Predict using the trained model
	y_pred = sm.predict(x_test)  # Adjust this method call based on your model's API
	

In [49]:
y_pred, tmp = sm.predict(x_test)

ValueError: too many values to unpack (expected 2)