In [None]:
# Uncomment the three lines below if running in VS Code (each one moves the working directory up one level)
# %cd ..
# %cd ..
# %cd ..

### Kernel: run_compound

In [2]:
import scipy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
from scipy.interpolate import griddata

from Notebooks.Scripts.post_process import ecdf
import Notebooks.Scripts.sampling_utils as sam_util
import Notebooks.Scripts.normalization as norm
from Notebooks.Scripts import selector_mda
import Notebooks.Scripts.Useful as use
import Notebooks.Scripts.post_process as post

In [3]:
def RMSE(sampled, target):
    """Root-mean-square error between predicted and reference values.

    Parameters
    ----------
    sampled : array-like
        Predicted values.
    target : array-like
        Reference values.

    Returns
    -------
    float
        ``sqrt(mean((sampled - target)**2))``.

    Notes
    -----
    If the two inputs differ in shape only by singleton axes (for example
    ``(n,)`` against ``(n, 1)``), both are squeezed first. Without this the
    subtraction broadcasts to an ``(n, n)`` matrix and the result is the error
    over every pair of elements instead of the element-wise error. Any other
    shape combination follows NumPy's normal broadcasting rules, so a scalar
    target still works and incompatible shapes still raise.
    """
    sampled = np.asarray(sampled)
    target = np.asarray(target)
    if sampled.shape != target.shape:
        sampled_sq = np.squeeze(sampled)
        target_sq = np.squeeze(target)
        if sampled_sq.shape == target_sq.shape:
            sampled, target = sampled_sq, target_sq
    rmse = np.sqrt(np.mean((sampled - target)**2))
    return rmse

In [4]:
# Input tables for the 2-D case: the simulated events, the benchmark damages
# (compared against every surrogate below) and the points at which the
# equidistant-sampling interpolation is evaluated.
df_two = pd.read_csv('fitted_stats/2d_sims.csv')
twod_training_target = pd.read_csv('Models/2d_all/damages.csv', index_col = 0)
test_set = pd.read_csv('fitted_stats/training_2d.csv')
# Scaler built from the simulated events; used below to normalize/denormalize
scaler = norm.scaler(df_two)

# The text file must contain a single number (.item() fails otherwise);
# presumably the rate of extreme events — confirm against sam_util.obtain_cons
ex_rate = np.loadtxt('fitted_stats/extreme_rate.txt').item()

In [5]:
# Bounds used below to map the stored surrogate predictions back to their original scale
min_max = pd.read_csv('Notebooks/4.Active_learning/2d_single/max_min.csv', index_col=0)
# .item() requires each of these columns to hold exactly one value
max_cons = min_max['Max Cons'].item()
min_cons = min_max['Min Cons'].item()

In [6]:
# Events selected by the active-learning run (input variables plus a damages column)
sampled_events = pd.read_csv('Notebooks/4.Active_learning/2d_single/Total/sampled_events.csv', index_col = 0)
# 2**(number of input variables): one corner per combination of variable extremes
mda_corners = 2**(sampled_events.shape[1] - 1) # the damages column is included in shape[1], hence the -1
events_resimulate = sampled_events.shape[0] - mda_corners + 1 # + 1 needed as we are including the last sample (stop criterion occurred)

In [7]:
folder = 'Notebooks/5.Experiment/1b.TGP_accuracy/'

# Per-iteration metrics, one entry for each sub-folder 0 .. events_resimulate - 1
rmse_arr = np.zeros(events_resimulate)
alm = np.zeros(events_resimulate)
ks = np.zeros(events_resimulate)
ead = np.zeros(events_resimulate)
ead_ground, risk_ground = sam_util.obtain_cons(twod_training_target.values, ex_rate = ex_rate)
risk_list = []

# Loop invariants
cons_range = max_cons - min_cons
benchmark_risk = risk_ground.iloc[:, 0].values

for i in range(events_resimulate):
    run_dir = folder + str(i)

    # Stored predictions are scaled; map them back using the min/max bounds
    interp_mean = pd.read_csv(run_dir + '/XX_mean.csv').rename(columns = {'x': 'Total'}) * cons_range + min_cons
    interp_95 = pd.read_csv(run_dir + '/XX_95.csv').rename(columns = {'x': 'Total'}) * cons_range + min_cons
    interp_5 = pd.read_csv(run_dir + '/XX_5.csv').rename(columns = {'x': 'Total'}) * cons_range + min_cons

    alm[i] = pd.read_csv(run_dir + '/acqui.csv').mean().item()
    rmse_arr[i] = RMSE(interp_mean.values, twod_training_target.values)

    # ead_tgp, ead_tgp_95 and ead_tgp_5 keep their last-iteration values after
    # the loop and are read again by later cells
    ead_tgp, risk_tgp = sam_util.obtain_cons(interp_mean.values, ex_rate = ex_rate)
    ead_tgp_95, _ = sam_util.obtain_cons(interp_95.values, ex_rate = ex_rate)
    ead_tgp_5, _ = sam_util.obtain_cons(interp_5.values, ex_rate = ex_rate)

    tgp_risk = risk_tgp.iloc[:, 0].values
    ks_test = scipy.stats.ks_2samp(tgp_risk, benchmark_risk, method = 'asymp')
    ead[i] = ead_tgp
    ks[i] = ks_test.pvalue
    risk_list.append(tgp_risk)

In [8]:
def ks_plot(model, ground, model_lab, c = 'b', ax = None):
    """Plot the empirical CDFs of a model and the benchmark with their KS test result.

    Parameters
    ----------
    model : np.ndarray
        1-D array of model values.
    ground : pd.DataFrame
        Benchmark values; only the first column is used.
    model_lab : str
        Legend label for the model curve.
    c : str, optional
        Colour of the model curve.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. A new figure is created when omitted.

    Returns
    -------
    matplotlib.axes.Axes
        The axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots()

    # Evaluate each ECDF once and reuse it for plotting and for the x-limit
    ground_vals = ground.iloc[:, 0].values
    model_ecdf = ecdf(model)
    ground_ecdf = ecdf(ground_vals)

    ax.plot(model_ecdf[0], model_ecdf[1], c = c, label = model_lab)
    ax.plot(ground_ecdf[0], ground_ecdf[1], c = '#ff7f0e', label = 'Benchmark')

    start = np.array([model.min(), ground_vals.min()]).min()
    # First ECDF position above 0.99 for each sample
    ind_stop_2 = np.where(model_ecdf[1] > 0.99)[0][0]
    ind_stop_3 = np.where(ground_ecdf[1] > 0.99)[0][0]
    # NOTE(review): the upper limit indexes the raw inputs, not the ECDF
    # x-values, so it depends on how `model` and `ground` are ordered —
    # confirm this is intended.
    end = np.array([model[len(model) - ind_stop_2], ground_vals[len(ground) - ind_stop_3]]).min()

    ax.set_xlim([start, end])
    ax.set_xlabel('Total damages [USD]', fontsize = 12)
    ax.set_ylabel('Cumulative probability [-]', fontsize = 12)
    ks_test = scipy.stats.ks_2samp(model, ground_vals, method = 'asymp')
    ax.axvline(ks_test.statistic_location, ls = ':', c = 'k', label = f'Statistic location (result: {ks_test.statistic:.2f})')
    ax.legend(fontsize = 12)
    # Text position is given in data coordinates
    ax.text(0.2e8, 0.98, f"KS p-value = {ks_test.pvalue:.2f}", ha='left', va='bottom', fontsize = 12)
    return ax

In [None]:
# Prepare the 'Figures/Comparison' folder (the helper name suggests it is left empty —
# confirm whether existing contents are removed).
# NOTE(review): the figures saved below go to 'Figures/PDF' and 'Figures/PNG', not to this folder.
use.create_empty_folder('Figures/Comparison')

In [10]:
# Build the equidistant-sampling design: the corners of the scaled unit
# hypercube followed by a max-min selection from the scaled events.
df_minmax_scaled = norm.normalize_dataset(df_two, scaler)
n_dims = df_two.shape[1]

# Every combination of 0/1 across the input variables
corner_combinations = list(itertools.product([0, 1], repeat=n_dims))
df_grid = pd.DataFrame(corner_combinations, columns=df_minmax_scaled.columns)

# Start the selection from the event with the largest scaled 'S Mag'
seed = df_minmax_scaled['S Mag'].argmax()

# Number of events to select in addition to the corners
mda_runs = 8**n_dims - 2**n_dims

maxmin_class = selector_mda.MaxMin()
lst_ind = maxmin_class.select_from_cluster(df_minmax_scaled.values, mda_runs, seed)
subset = df_minmax_scaled.iloc[lst_ind].copy(deep = True)

# Corners first, selected events after
scaled_sims = pd.concat((df_grid, subset)).reset_index(drop = True)

# Back to the original scale, keeping the column names of the raw events
denorm_sims = pd.DataFrame(
    norm.denormalize_dataset(scaled_sims, scaler).values,
    columns = df_two.columns,
)

In [11]:
# Damages read from the MDA_2d model folder, attached to the equidistant design as column 'Total'
target = pd.read_csv('Models/MDA_2d/damages.csv', index_col = 0)
current_sampled = denorm_sims.copy(deep = True)
# NOTE(review): assigning a DataFrame slice aligns on index labels, so this presumably relies on
# damages.csv being indexed 0..n-1 in the same order as denorm_sims — confirm
current_sampled['Total'] = target[:len(denorm_sims)]

In [12]:
# RMSE of the equidistant-sampling surrogate as simulations are added one at a
# time: entry i interpolates from the first mda_corners + i simulations.
n_mda_steps = len(denorm_sims) - mda_corners
rmse_current = np.zeros(n_mda_steps)
risk_list_2 = []
num_var = sampled_events.shape[1] - 1  # sampled_events also holds the damages column

# Loop invariants. The benchmark is flattened because griddata returns a 1-D
# array here (the values passed in are a single column); subtracting that from
# an (n, 1) array would broadcast to (n, n) and the RMSE would be taken over
# every pair of points instead of element-wise.
target_flat = twod_training_target.values.ravel()
query_points = test_set.values
benchmark_risk = risk_ground.iloc[:, 0].values

for i in range(n_mda_steps):
    n_used = mda_corners + i
    interp_target = griddata(current_sampled.iloc[:n_used, :num_var], current_sampled.iloc[:n_used, num_var], query_points, method='linear')
    rmse_current[i] = RMSE(np.ravel(interp_target), target_flat)
    # ead_mda keeps its last-iteration value after the loop and is read by later cells
    ead_mda, risk_mda = sam_util.obtain_cons(interp_target, ex_rate = ex_rate)
    ks_test = scipy.stats.ks_2samp(risk_mda.iloc[:, 0].values, benchmark_risk, method = 'asymp')
    risk_list_2.append(risk_mda.iloc[:, 0].values)

In [None]:
# Three-panel summary figure: ECDF comparison for active learning (a) and
# equidistant sampling (b), and the EAD estimates (c). Uses the values left
# over from the last iteration of the loops above (ead_tgp, ead_tgp_5,
# ead_tgp_95, ead_mda).
gs_kw = dict(width_ratios=[1, 1], height_ratios=[1, 1])
# Spacing is handled by plt.tight_layout() below. Requesting
# layout="constrained" as well is contradictory: the tight_layout call
# replaces the constrained engine anyway.
fig, axd = plt.subplot_mosaic([['upper left', 'right'],
                               ['lower left', 'right']],
                              gridspec_kw=gs_kw, figsize=(10, 8))

_ = ks_plot(risk_list[-1], risk_ground, 'Active learning', ax = axd['upper left'])
axd['upper left'].set_xlim([0, 1.8e8])
axd['upper left'].set_xlabel('')
_ = ks_plot(risk_list_2[-1], risk_ground, 'Equidistant sampling', c='r', ax = axd['lower left'])
axd['lower left'].set_xlim([0, 1.8e8])

axd['right'].bar(0, ead_ground, color = '#ff7f0e', label = 'Benchmark')
axd['right'].bar(1, ead_tgp, color= 'b', label = 'Active learning')
axd['right'].bar(2, ead_mda, color = 'r', label = 'Equidistant\nsampling')
axd['right'].grid()
# The bars are identified by the legend, so the x ticks are hidden
axd['right'].tick_params(
    axis='x',
    which='both',
    bottom=False,
    top=False,
    labelbottom=False)
axd['right'].set_ylabel('EAD [USD]', fontsize = 12)
# Asymmetric error bar on the active-learning EAD from the XX_5 / XX_95 predictions
axd['right'].errorbar(1, ead_tgp, yerr=[[ead_tgp - ead_tgp_5], [ead_tgp_95 - ead_tgp]], fmt='none', ecolor='black', label = 'Active learning\nuncertainty', capsize = 5)

axd['right'].legend(fontsize = 12)

labels = ['(a)', '(b)', '(c)']

# Tag each panel with its letter in the top-left corner (axes coordinates)
for label, ax in zip(labels, [axd['upper left'], axd['lower left'], axd['right']]):
    ax.text(0.025, 0.95, label, transform=ax.transAxes, fontsize=14, verticalalignment='top', zorder=2, fontweight='bold')

plt.tight_layout()
plt.savefig('Figures/PDF/f08.pdf', dpi=300, format='pdf', bbox_inches="tight")
plt.savefig('Figures/PNG/f08.png', format='png', bbox_inches="tight")

# # Show the plot
# plt.show()

In [None]:
# RMSE against the number of simulations used, for both sampling strategies
sims_equidistant = np.arange(mda_corners, current_sampled.shape[0])
sims_active = np.arange(mda_corners, sampled_events.shape[0] + 1)

plt.plot(sims_equidistant, rmse_current, c = 'r', label = 'Equidistant sampling')
plt.plot(sims_active, rmse_arr, c = sam_util.color_dic(2), label = 'Active learning')
plt.xlabel('Simulations used by approach [-]', fontsize = 12)
plt.ylabel('RMSE [USD]', fontsize = 12)
plt.grid()
plt.legend(fontsize = 12)

plt.savefig('Figures/PDF/f02.pdf', dpi=300, format='pdf', bbox_inches="tight")
plt.savefig('Figures/PNG/f02.png', format='png', bbox_inches="tight")

In [None]:
# Ratio of absolute EAD errors w.r.t. the benchmark: equidistant sampling over active learning.
# ead_mda and ead_tgp are the values left over from the last iteration of their loops.
factor_improv = np.abs(ead_mda - ead_ground)/np.abs(ead_ground - ead_tgp)
print(f'EAD estimation improved by a factor: {factor_improv:.2f}')

In [None]:
# Absolute EAD error of equidistant sampling as a percentage of the benchmark EAD
mda_error = np.abs((ead_mda - ead_ground)) / ead_ground * 100
print(f'EAD error for equidistant sampling: {mda_error:.1f}%')

In [None]:
# Absolute EAD error of active learning as a percentage of the benchmark EAD
tgp_error = np.abs((ead_tgp - ead_ground)) / ead_ground * 100
print(f'EAD error for active learning: {tgp_error:.2f}%')

In [None]:
# Display the active-learning EAD from the final iteration
ead_tgp

In [None]:
# Ratio of the final RMSE values: equidistant sampling over active learning
factor_loss = np.abs(rmse_current[-1])/np.abs(rmse_arr[-1])
print(f'RMSE loss improved by a factor: {factor_loss:.2f}')

In [None]:
# Final RMSE of each approach in millions of USD:
# "Post" is active learning (rmse_arr), "Prior" is equidistant sampling (rmse_current)
print(f'Post RMSE: {rmse_arr[-1]/1e6} Million USD')
print(f'Prior RMSE: {rmse_current[-1]/1e6} Million USD')

In [None]:
# Number of simulations at which equidistant sampling reaches its lowest RMSE:
# entry i of rmse_current uses mda_corners + i simulations, so the offset is
# mda_corners rather than a hard-coded 4.
rmse_current.argmin() + mda_corners

In [None]:
# Number of simulations at which active learning reaches its lowest RMSE:
# rmse_arr is plotted against counts starting at mda_corners, so that is the
# offset to add rather than a hard-coded 4.
rmse_arr.argmin() + mda_corners

## Timing

In [None]:
# Timing table of the equidistant (MDA_2d) runs
times = pd.read_csv('Models/MDA_2d/times.csv', index_col = 0)
# Count the rows whose first column is non-zero, then keep that many leading rows
first_col_nonzero = times[times.columns[0]] != 0
clock = len(times[first_col_nonzero])
times = times.iloc[:clock]

In [None]:
# Sum of every entry in the timing table divided by 60
# (presumably seconds -> minutes; confirm the units of times.csv)
times.sum().sum()/60

In [None]:
# Folder handed to post.collect_tgp_times in the next cell
tgp_folder = 'Notebooks/4.Active_learning/'

In [None]:
# Collect the timing results of the active-learning runs found under tgp_folder;
# the helper returns two objects, the second of which is displayed here
diff_comp_single, samples_req_single = post.collect_tgp_times(tgp_folder)
samples_req_single

In [None]:
# Display the timing table returned by post.collect_tgp_times
diff_comp_single

In [None]:
# Column-wise totals of the timing table
diff_comp_single.sum()

In [None]:
# Timing comparison: the 2-D active-learning column next to equidistant sampling
single_comp = diff_comp_single[['A Posteriori (2 dims)']].rename(
    columns = {'A Posteriori (2 dims)': 'Active learning'}
)
# Column sums of the timing table divided by 60; the assignment aligns on single_comp's index
single_comp.loc[:, 'Equidistant sampling'] = times.sum()/60

# Plot with equidistant sampling first, active learning second
col_ind = [1, 0]
post.plot_times(single_comp.iloc[:, col_ind])
plt.savefig('Figures/PDF/f03.pdf', dpi=300, format='pdf', bbox_inches="tight")
plt.savefig('Figures/PNG/f03.png', format='png', bbox_inches="tight")