In [None]:
# Uncomment if running in VS Code: each `%cd ..` moves the working directory up one
# level (two levels in total), presumably so the relative `Data/` and `fitted_stats/`
# paths used below resolve.
# %cd ..
# %cd ..

## Kernel: stat_compound
Create the historical event set, fit vine copulas, define the non-magnitude marginals, generate stochastic event sets, and compare KDEs in six dimensions with a split in lag.

In [2]:
import xarray as xr
import hydromt
import pandas as pd
import numpy as np
import scipy.stats as sc
import matplotlib.pyplot as plt
from fitter import Fitter
import seaborn as sns
%matplotlib inline

import Notebooks.Scripts.statistics_helper as stats_help

In [None]:
# Load the driver time series and index it by the timestamps parsed from the first column;
# the original 'DateTime(UTC)' text column is then dropped.
drivers_raw = pd.read_csv('Data/all_drivers.csv')
timestamps = pd.to_datetime(drivers_raw.iloc[:, 0])
df_drivers = drivers_raw.set_index(timestamps).drop(columns='DateTime(UTC)')

df_drivers

In [None]:
# Surge-conditioned event table; both peak-time columns are parsed as datetimes.
peak_time_cols = ['time_Skew_surge (m)', 'time_Precipitation (mm/hr)']
window_s = pd.read_csv('fitted_stats/compound_given_ss.csv', parse_dates=peak_time_cols)
window_s.head()

## Quantify T Mag, P Dur, S Dur, and P Lag

In [5]:
# Peak magnitude of each driver, indexed by the time at which that peak occurred.
surge_marginal = window_s.set_index('time_Skew_surge (m)').loc[:, 'value_Skew_surge (m)']
precip_marginal = window_s.set_index('time_Precipitation (mm/hr)').loc[:, 'value_Precipitation (mm/hr)']

In [None]:
# Pair every skew-surge peak with a value taken from the tidal series df_drivers['Tidal (m)'].
# NOTE(review): the two `10` arguments are presumably the search-window sizes before/after
# the peak and `one_side=True` presumably restricts the search to one side — confirm
# against statistics_helper.find_largest_within_window.
window_t = stats_help.find_largest_within_window('Skew_surge (m)', surge_marginal, df_drivers['Tidal (m)'], 10, 10, one_side=True)
window_t.head()

If the value is still above the threshold before/after the predefined window, the duration is dictated by the window size so that the duration does not explode.

In [7]:
# ! These quantiles are not the POT thresholds. Because a Gaussian shape is assumed, the
# thresholds need to be close to zero. If they are too small, however, durations may
# explode, especially for surge; this is not the case for precipitation.
threshold_quantile_s = 0.925  # approximately a 20 cm threshold, similar to the largest observed mmsl; a cutoff keeps durations close to the historical events
threshold_quantile_p = 0.9  # approximately 0.3 mm/hr

# Gaps in the skew-surge series are forward-filled before durations are measured.
surge_series = df_drivers['Skew_surge (m)'].ffill()
precip_series = df_drivers['Precipitation (mm/hr)']

surge_dur, _, _ = stats_help.find_durations(surge_series, surge_marginal, threshold_quantile_s)
precip_dur, _, _ = stats_help.find_durations(precip_series, precip_marginal, threshold_quantile_p)

In [8]:
# Skew surge is measured at the trough of the previous tide, but the total water level is
# maximised at high tide, which occurs six hours later; hence the 6 h shift of the lag.
lag_delta = window_s['time_Precipitation (mm/hr)'] - window_s['time_Skew_surge (m)']
window_s['Precipitation Lag (h)'] = lag_delta.dt.total_seconds() / 3600 - 6

# Event table indexed by the surge peak time; the precipitation peak time is no longer needed.
surge_cond = window_s.drop(columns='time_Precipitation (mm/hr)').set_index('time_Skew_surge (m)')

In [None]:
# Preview the surge-indexed event table before durations and tide are attached.
surge_cond.head()

In [10]:
# Attach the two durations and the tidal magnitude to the event table.
# The tidal values are attached positionally (raw array), not aligned on the index.
surge_cond = surge_cond.assign(**{
    'Surge Dur (cycles)': surge_dur,
    'Precip Dur (h)': precip_dur,
    'Tidal Mag (m)': window_t['value_Tidal (m)'].values,
})

## Vine copula(s), other marginals, and event set generation

In [11]:
# Translate the 'Compound?' flag into a legend label: 'r' -> extreme precipitation,
# 'k' -> non-extreme precipitation. Rows with any other flag keep the value False.
is_extreme = (surge_cond['Compound?'] == 'r')
bool_false = (surge_cond['Compound?'] == 'k')

# Cast to object dtype first: writing strings into a bool-dtype Series relies on silent
# upcasting, which pandas has deprecated (FutureWarning since 2.1, an error later).
bool_true = is_extreme.astype(object)
bool_true[is_extreme] = 'Extreme Precipitation'
bool_true[bool_false] = 'Non Extreme Precipitation'

In [None]:
# Preview the event table with durations and tidal magnitude attached.
surge_cond.head()

In [None]:
# Short, unit-annotated labels used in the plots and in the saved event sets.
pretty_names = {
    'value_Skew_surge (m)': 'S Mag [m]',
    'value_Precipitation (mm/hr)': 'P Mag [mm/hr]',
    'Tidal Mag (m)': 'T Mag [m]',
    'Surge Dur (cycles)': 'S Dur [tidal cycles]',
    'Precip Dur (h)': 'P Dur [hr]',
    'Precipitation Lag (h)': 'P Lag [hr]',
}

# Positional reorder of the columns.
# NOTE(review): presumably yields S Mag, P Mag, T Mag, S Dur, P Dur, P Lag — this depends
# on the column order of compound_given_ss.csv; confirm.
col_ind = [0, 1, -1, 4, 5, 3]
s_cond = surge_cond.rename(columns=pretty_names).iloc[:, col_ind]

# The legend label goes last so that `iloc[:, :-1]` selects the numeric variables only.
s_cond['Legend'] = bool_true
s_cond.head()

Adding noise to surge duration to make it continuous, easier to fit vine

In [14]:
# Jitter the surge duration with small Gaussian noise (sd 0.05) so it becomes continuous.
np.random.seed(5)
quantile = np.random.choice(np.arange(0, 1, 0.001), len(s_cond))

# The grid contains 0, and norm.ppf(0) is -inf, which would turn a duration into -inf.
# Clip to half a grid step; every non-zero draw is left exactly as before.
quantile = np.clip(quantile, 0.0005, None)
noise_add = sc.norm.ppf(quantile, loc = 0, scale = 0.05)

# Rebuild from the un-jittered durations in surge_cond (same row order as s_cond) instead
# of `+=`, so re-running this cell does not stack noise on top of earlier noise.
s_cond['S Dur [tidal cycles]'] = surge_cond['Surge Dur (cycles)'].to_numpy() + noise_add

In [15]:
# Persist the numeric variables only (the trailing 'Legend' column is excluded);
# the time index is not written.
s_cond.iloc[:, :-1].to_csv('fitted_stats/historical_eventset.csv', index=False)

In [None]:
# Pair plot of the historical event set. The helper returns two objects used below:
# `unity_df` (NOTE(review): presumably the variables transformed to the unit interval,
# with the legend as last column — confirm) and a matrix of pairwise p-values.
unity_df, p_values = stats_help.plot_pair(s_cond)

In [None]:
# Label the p-value matrix with the variable names (every column except the last).
variable_names = unity_df.columns[:-1]
pd.DataFrame(p_values, index=variable_names, columns=variable_names)

In [None]:
# Count the entries below 0.05, subtract one per variable for the diagonal, and halve
# because every pair appears twice in the matrix.
n_variables = unity_df.shape[1] - 1
n_below = np.sum(p_values < 0.05)
print(f'Number of significant correlations: {(n_below - n_variables)/2}')

In [None]:
# Preview the frame returned by plot_pair.
unity_df.head()

In [None]:
# First two columns of the historical set next to the first two columns of unity_df.
# The time index is discarded so that both parts align row by row.
hist_mags = s_cond.iloc[:, [0, 1]].reset_index(drop=True)
unit_mags = unity_df.iloc[:, [0, 1]]
mag_copula = pd.concat([hist_mags, unit_mags], axis=1)
mag_copula.head()

## Tide marginal
Condition on HH tides because of the definition of extremes.

In [21]:
# Tidal peaks indexed by timestamp; only the rows typed 'HH' are kept.
tidal_peaks = (
    pd.read_csv('Data/skew_surge_tides.csv', parse_dates=['DateTime(UTC)'])
    .set_index('DateTime(UTC)')
)
cond_hh = tidal_peaks.loc[tidal_peaks['Type'] == 'HH']

# Second column, selected by position, flattened into a 1-D empirical sample.
# NOTE(review): presumably the tidal level column — confirm against the CSV header.
ss_hh = cond_hh.iloc[:, [1]]
tide_dist = ss_hh.to_numpy().flatten()

In [22]:
# Write the empirical tide sample to disk, one value per line; it is read back later
# for the vine simulations.
np.savetxt('fitted_stats/emperical_tide.csv', tide_dist, delimiter = ',') # tide

## Durations and precipitation Lag marginal
Use truncated distributions to prevent extrapolation

In [23]:
# Candidate scipy.stats distribution names handed to Fitter for the duration and lag marginals.
trunc = [
    'truncexpon',
    'gompertz',
    'truncnorm',
    'truncpareto',
    'truncweibull_min',
]

In [None]:
# Fit the candidate distributions to column 3 of s_cond (surge duration, per the axis label).
s_dur_bins = np.arange(1, 13)
s_dur = Fitter(s_cond.iloc[:, 3], distributions=trunc, bins=s_dur_bins, timeout=60)
s_dur.fit()

# summary() draws the fitted densities on the current axes; label them afterwards.
df_sdur = s_dur.summary(method='sumsquare_error')
plt.gca().set(xlabel='Skew Surge Duration [Tidal Cycles]', ylabel='Density [-]')
df_sdur

In [None]:
# Keep the candidate ranked best by sum-of-squares error, show it, and save it to JSON.
s_dur_marg = s_dur.get_best('sumsquare_error')
print(s_dur_marg)
stats_help.save_marginal('fitted_stats/surge_dur.json', s_dur_marg)

In [None]:
# Fit the candidate distributions to column 4 of s_cond (precipitation duration, per the axis label).
p_dur_bins = np.arange(0, 65, 5)
p_dur = Fitter(s_cond.iloc[:, 4], distributions=trunc, bins=p_dur_bins, timeout=60)
p_dur.fit()

# summary() draws the fitted densities on the current axes; label them afterwards.
df_pdur = p_dur.summary(method='sumsquare_error')
plt.gca().set(xlabel='Precipitation Duration [hr]', ylabel='Density [-]')
df_pdur

In [27]:
# Keep the candidate ranked best by sum-of-squares error and save it to JSON.
p_dur_marg = p_dur.get_best('sumsquare_error')
stats_help.save_marginal('fitted_stats/precipitation_dur.json', p_dur_marg)

In [None]:
# Fit the candidate distributions to column 5 of s_cond (precipitation lag, per the axis label).
p_lag_bins = np.arange(-72, 78, 6)
p_lag = Fitter(s_cond.iloc[:, 5], distributions=trunc, bins=p_lag_bins, timeout=60)
p_lag.fit()

# summary() draws the fitted densities on the current axes; label and rescale them afterwards.
df_plag = p_lag.summary(method='sumsquare_error')
plt.gca().set(xlabel='Precipitation Lag [hr]', ylabel='Density [-]')
plt.ylim([0, 0.03])
df_plag

Impose truncation limits to ensure the compound definition is satisfied. If it is already satisfied, the limits still bound any extrapolation of P Lag to the full window width of +/- 3 days.

In [None]:
# Keep the candidate ranked best by sum-of-squares error and display it.
p_lag_marg = p_lag.get_best('sumsquare_error')
p_lag_marg

In [30]:
# Save the selected precipitation-lag marginal to JSON.
stats_help.save_marginal('fitted_stats/precipitation_lag.json', p_lag_marg)

## 2D fit copula

In [None]:
# Fit the bivariate copula on the magnitude table; the returned object is simulated from below.
# NOTE(review): presumably the helper compares several copula families and returns the
# best one — confirm against statistics_helper.fit_copulas.
best_copula = stats_help.fit_copulas(mag_copula)

### Create benchmark for 2d

In [32]:
# -- benchmark: simulate from the best-fitting 2-D copula
samples = 500
# Fixed seed: a vine simulated with the same seed gives the same draws, which is intended.
best_cop = best_copula.simulate(n=samples, seeds=[30])

# Previously fitted magnitude marginals.
ss_tmarginal = stats_help.load_marginal('fitted_stats/skew_surge.json')
precip_tmarginal = stats_help.load_marginal('fitted_stats/precipitation.json')

# Skew-surge level at the duration quantile; used as the first entry of the surge `threshold` pair.
observed_surge = df_drivers['Skew_surge (m)'].dropna()
dur_ss_threshold = np.quantile(observed_surge, threshold_quantile_s)

# Map the simulated copula columns to physical values through the marginals.
surge_u, precip_u = best_cop[:, 0], best_cop[:, 1]
sim_act_u = stats_help.obtain_sim_values(surge_u, ss_tmarginal, threshold=[dur_ss_threshold, 5])
sim_act_v = stats_help.obtain_sim_values(precip_u, precip_tmarginal)

In [None]:
# Simulated magnitudes, labelled with the first two column names of mag_copula.
magnitude_names = mag_copula.columns[:2]
df_simulated = pd.DataFrame(dict(zip(magnitude_names, [sim_act_u, sim_act_v])))
df_simulated.head()

In [None]:
# Plot the 500-sample benchmark simulation together with the historical magnitude table.
stats_help.plot_copula_2d(best_cop, df_simulated, mag_copula)

In [35]:
# Save the 2-D benchmark (training) set without the row index.
df_simulated.to_csv('fitted_stats/training_2d.csv', index = False)

### Create testing for 2d

In [36]:
# -- testing set: a larger simulation from the same 2-D copula, with a different seed
samples = 10_000
# Fixed seed: a vine simulated with the same seed gives the same draws, which is intended.
best_cop = best_copula.simulate(n=samples, seeds=[5])

# Same marginals and surge threshold as for the benchmark set (defined in that cell).
surge_u, precip_u = best_cop[:, 0], best_cop[:, 1]
sim_act_u = stats_help.obtain_sim_values(surge_u, ss_tmarginal, threshold=[dur_ss_threshold, 5])
sim_act_v = stats_help.obtain_sim_values(precip_u, precip_tmarginal)


In [None]:
# Simulated magnitudes, labelled with the first two column names of mag_copula.
# This rebinds `df_simulated`, replacing the benchmark frame.
magnitude_names = mag_copula.columns[:2]
df_simulated = pd.DataFrame(dict(zip(magnitude_names, [sim_act_u, sim_act_v])))
df_simulated.head()

In [None]:
# Plot the 10,000-sample testing simulation together with the historical magnitude table.
stats_help.plot_copula_2d(best_cop, df_simulated, mag_copula)

In [39]:
# Save the 2-D testing set without the row index.
df_simulated.to_csv('fitted_stats/2d_sims.csv', index = False)

## Fit vine copula in 3d

In [None]:
# First three columns of unity_df: the input to the 3-D vine.
three_var = unity_df.iloc[:, :3]
three_var.head()

In [None]:
# Fit the 3-D vine. `cop` is passed to sim_vines below and `df_vine_three` to plot_sim_vine.
# NOTE: `cop` is rebound by every later understand_vine call (6-D, 4-D, 5-D).
df_vine_three, cop = stats_help.understand_vine(three_var)

### testing event set generation in 3d

In [42]:
# Number of events in every testing set generated from here on.
n_sim = 10000

# Read the empirical tide sample back from disk (written earlier in this notebook).
tide_dist = pd.read_csv('fitted_stats/emperical_tide.csv', header = None).values.flatten()

# Pass an explicit seed so the saved 3-D testing set is reproducible, matching the
# `seeds=[5]` used for the 2-D and 4-D testing sets.
df_usim, df_sim = stats_help.sim_vines(three_var, n_sim,
                                       [ss_tmarginal, precip_tmarginal, tide_dist],
                                       cop, s_cond.columns[:3], seeds = [5])

In [None]:
# Plot the 3-D simulation together with the first three historical variables.
stats_help.plot_sim_vine(df_sim, s_cond.iloc[:, :3], df_vine_three)

In [44]:
# Save the 3-D testing set without the row index.
df_sim.to_csv('fitted_stats/3d_sims.csv', index = False)

## Fit vine copula in 6d

In [None]:
# Every column of unity_df except the last: the input to the 6-D vine.
six_var = unity_df.iloc[:, :-1]
six_var.head()

In [None]:
# Fit the 6-D vine. This rebinds `cop`, replacing the 3-D vine object.
df_vine_six, cop = stats_help.understand_vine(six_var)

### Creating benchmark in six dimensions

In [None]:
# Benchmark (training) set: 500 six-dimensional events.
np.random.seed(30)  # kept for any NumPy-based sampling inside the helper

# One marginal and one [first, second] threshold pair per variable, in the order of
# s_cond.columns[:-1].
# NOTE(review): the pairs are presumably [lower, upper] truncation bounds — confirm
# against statistics_helper.sim_vines.
six_marginals = [ss_tmarginal, precip_tmarginal, tide_dist, s_dur_marg, p_dur_marg, p_lag_marg]
six_bounds = [[0, 1_000], [0, 1_000],
              [0, 1_000], [1, 12.5],
              [0, 72], [-72, 72]]

# Seed the vine simulation explicitly, as the 4-D cell does, instead of relying only on
# NumPy's global state; 30 matches the seed of the 2-D benchmark set.
df_usim_sixt, df_sim_sixt = stats_help.sim_vines(six_var, 500, six_marginals,
                                                 cop, s_cond.columns[:-1], seeds = [30],
                                                 threshold = six_bounds)

# Surge duration is counted in whole tidal cycles.
df_sim_sixt['S Dur [tidal cycles]'] = df_sim_sixt['S Dur [tidal cycles]'].round(0)
df_sim_sixt.head()

In [None]:
# Plot the 6-D benchmark simulation together with the six historical variables.
stats_help.plot_sim_vine(df_sim_sixt, s_cond.iloc[:, :-1], df_vine_six, new_ax = [2, 3], old_ax = [3, 3])

In [49]:
# Save the 6-D benchmark (training) set.
# NOTE(review): unlike the 2-D/3-D/4-D/5-D exports, this call does not pass `index=False`,
# so the row index is written as an extra first column — confirm whether downstream
# readers expect it before making the exports consistent.
df_sim_sixt.to_csv('fitted_stats/training_6d.csv')

### Create testing dataset in 6d

In [None]:
# Testing set: n_sim six-dimensional events.
np.random.seed(5)  # kept for any NumPy-based sampling inside the helper

# One marginal and one [first, second] threshold pair per variable, in the order of
# s_cond.columns[:-1].
# NOTE(review): the pairs are presumably [lower, upper] truncation bounds — confirm
# against statistics_helper.sim_vines.
six_marginals = [ss_tmarginal, precip_tmarginal, tide_dist, s_dur_marg, p_dur_marg, p_lag_marg]
six_bounds = [[0, 1_000], [0, 1_000],
              [0, 1_000], [1, 12.5],
              [0, 72], [-72, 72]]

# Seed the vine simulation explicitly, as the 4-D cell does, instead of relying only on
# NumPy's global state; 5 matches the seed of the 2-D and 4-D testing sets.
df_usim, df_sim_6 = stats_help.sim_vines(six_var, n_sim, six_marginals,
                                         cop, s_cond.columns[:-1], seeds = [5],
                                         threshold = six_bounds)

# Surge duration is counted in whole tidal cycles.
df_sim_6['S Dur [tidal cycles]'] = df_sim_6['S Dur [tidal cycles]'].round(0)
df_sim_6.head()

In [None]:
# Plot the 6-D testing simulation together with the six historical variables.
stats_help.plot_sim_vine(df_sim_6, s_cond.iloc[:, :-1], df_vine_six, new_ax = [2, 3], old_ax = [3, 3])

In [52]:
# Save the 6-D testing set.
# NOTE(review): unlike the 2-D/3-D/4-D/5-D exports, this call does not pass `index=False`,
# so the row index is written as an extra first column — confirm whether downstream
# readers expect it before making the exports consistent.
df_sim_6.to_csv('fitted_stats/6d_sims.csv')

## Fit vine copula in 4d

In [None]:
# Columns 0, 1, 2 and 4 of unity_df: the input to the 4-D vine (column 3 is left out).
four_var = unity_df.iloc[:, [0, 1, 2, 4]]
four_var.head()

In [None]:
# Fit the 4-D vine. This rebinds `cop`, replacing the 6-D vine object.
df_vine_four, cop = stats_help.understand_vine(four_var)

### Create a testing dataset in 4d

In [55]:
# Testing set: n_sim four-dimensional events, simulated with a fixed seed.
n_sim = 10000

# One marginal and one threshold pair per variable, in the order of the selected labels.
four_marginals = [ss_tmarginal, precip_tmarginal, tide_dist, p_dur_marg]
four_bounds = [[0, 1_000], [0, 1_000], [0, 1_000], [0, 72]]
four_labels = s_cond.columns[[0, 1, 2, 4]]

df_usim, df_sim = stats_help.sim_vines(
    four_var, n_sim, four_marginals, cop, four_labels,
    seeds=[5], threshold=four_bounds,
)

In [None]:
# Plot the 4-D simulation together with the matching four historical variables.
stats_help.plot_sim_vine(df_sim, s_cond.iloc[:, [0, 1, 2, 4]], df_vine_four, new_ax = [1, 2], old_ax = [2, 2])

In [57]:
# Save the 4-D testing set without the row index.
# NOTE: `df_sim` gains a fifth column in the 5-D section below, so this cell must run
# before that section to export four columns.
df_sim.to_csv('fitted_stats/4d_sims.csv', index = False)

## Fit vine copula in 5d

In [None]:
# First five columns of unity_df: the input to the 5-D vine.
five_var = unity_df.iloc[:, [0, 1, 2, 3, 4]]
five_var.head()

In [None]:
# Fit the 5-D vine. This rebinds `cop`; only `df_vine_five` is used further down.
df_vine_five, cop = stats_help.understand_vine(five_var)

s_dur is independent, so it can be removed from the vine and sampled randomly.

### Create testing dataset in 5d

In [60]:
# Build the 5-D testing set by adding an independently sampled surge duration to the
# 4-D simulation held in `df_sim`.
np.random.seed(5)

# Draw exactly one uniform per simulated event. Taking the count from the frame (instead
# of a hard-coded 10000) keeps the column assignment valid if n_sim is ever changed.
n_events = len(df_sim)
s_dur_sims = stats_help.obtain_sim_values(np.random.rand(n_events), s_dur_marg, threshold = [1, 12.5]).round(0)

# NOTE(review): this mutates the 4-D frame in place and appends S Dur as the LAST column,
# so the column order differs from the historical table (where S Dur precedes P Dur) —
# confirm that plot_sim_vine and downstream readers match columns by name.
df_sim['S Dur [tidal cycles]'] = s_dur_sims

In [None]:
# Plot the 5-D simulation together with the first five historical variables.
stats_help.plot_sim_vine(df_sim, s_cond.iloc[:, [0, 1, 2, 3, 4]], df_vine_five, new_ax = [2, 3], old_ax = [3, 3])

In [62]:
# Save the 5-D testing set (4-D simulation plus sampled surge duration) without the row index.
df_sim.to_csv('fitted_stats/5d_sims.csv', index = False)