# **Statistical Analysis of Extreme Events**
# Annual Maxima - Generalized Extreme Values distribution

In [None]:
import os
import os.path as op
import sys

# basic import
import pandas as pd
import xarray as xr
import numpy as np
import datetime

# plotting library
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from IPython.display import HTML

# distributions
import scipy
from scipy.stats import gumbel_l, genextreme, gumbel_r
from numpy.random import multivariate_normal

# append ddviewer to path
sys.path.insert(0, op.join(os.getcwd(), '..','..'))

from lib.extremes import *
from lib.config import *

In [None]:
# Display the GEV.png image stored under lib/resources (700 px wide)
from IPython.display import Image
Image(url='../../lib/resources/GEV.png', width=700)

## **Load input database** <a class="anchor" id="1"></a>

In [None]:
# Input database (.csv): file name and the folder that holds it.
# The folder is resolved relative to the current working directory.
file = 'TWL_1139.csv'
path = op.join(os.getcwd(), '..', 'storage')

In [None]:
# Read the CSV, parsing the 'time' column as datetimes
csv_path = op.join(path, file)
data = pd.read_csv(csv_path, parse_dates=['time'])

# Index the records by time and discard rows containing missing values
data = data.set_index('time')
data = data.dropna()

## **Data visualization** <a class="anchor" id="2"></a>

In [None]:
# Variable to analyse; only this column is kept
var = 'TWL'
timestamps = data.index

# Attach the calendar components of the time index as helper columns
data = data[[var]].assign(
    year=timestamps.year,
    month=timestamps.month,
    day=timestamps.day,
)

### Box-plot of monthly data <a class="anchor" id="21"></a>

In [None]:
# Notched box-plot of the variable grouped by calendar month (plotly express)
box_options = dict(x="month", y=var, notched=True)
fig = px.box(data, **box_options)
fig.show()

### Monthly maxima <a class="anchor" id="22"></a>

In [None]:
# Largest value of the variable in every (year, month) group
monthly_peak = data.groupby(by=['year', 'month'])[var].max()
df_mm = monthly_peak.dropna().reset_index()

# Join back on (year, month, value) to recover the day on which each maximum occurred
df_mm = df_mm.merge(data, how='inner', on=['year', 'month', var])

# Rebuild a date from year/month/day (invalid combinations become NaT) and index by it
peak_dates = pd.to_datetime(df_mm[['year', 'month', 'day']], errors='coerce')
df_mm = df_mm.assign(date=peak_dates).set_index('date')

In [None]:
# Line plot of the monthly-maxima series
monthly_series = df_mm[var]
fig = px.line(monthly_series, width=1200, height=400)
fig.show()

## **Fit Historical Annual Maxima to GEV distribution** <a class="anchor" id="3"></a>

### Eliminate years with incomplete months

In [None]:
# Flag every record with 1 so that the yearly sum of 'day_id' is the number of records
data['day_id'] = 1
records_per_year = data.groupby('year').sum().reset_index()

# Keep the years holding more than 300 records
# NOTE(review): this counts rows, which equals days only if the series is daily - confirm
data_few = records_per_year[records_per_year['day_id'] > 300]

In [None]:
# Retain only the records whose year passed the completeness filter
kept_years = data_few['year'].values
data = data[data['year'].isin(kept_years)]

### Calculate Annual Maxima 

In [None]:
# Annual maxima via groupby + named aggregation:
#   Fmax -> index label (time) at which the yearly maximum occurs
#   Amax -> value of the yearly maximum
yearly_groups = data.groupby(by=[data.index.year])
pmax = yearly_groups.agg(
    Fmax=(var, 'idxmax'),
    Amax=(var, 'max'),
)

# Index the annual maxima by their time of occurrence
pmax = pmax.set_index('Fmax')

In [None]:
# Full record (line) with the annual maxima overlaid (markers)
historical_trace = go.Scatter(
    x=data.index, y=data[var],
    mode='lines', name='Historical',
)
maxima_trace = go.Scatter(
    x=pmax.index, y=pmax['Amax'],
    mode='markers', name='Annual Maxima',
)

fig = go.Figure(data=[historical_trace, maxima_trace])
fig.update_layout(
    xaxis_title="time",
    yaxis_title=var,
    yaxis=dict(rangemode='nonnegative'),
)
fig.show()

### Fit Annual Maxima to Generalized Extreme Value <a class="anchor" id="32"></a>

<a href="https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.genextreme.html">scipy.stats.genextreme</a>

In [None]:
# Annual maxima as a 1-D sample. pmax.values is a 2-D (n, 1) array, and handing
# the DataFrame itself to nnlf yields a length-1 array instead of a scalar.
amax = pmax['Amax'].values

# Fit the GEV to the annual maxima (scipy's genextreme shape c follows the
# sign convention documented in scipy.stats.genextreme)
shape, loc, scale = genextreme.fit(amax)
print('shape: {0:.3f} \nloc: {1:.3f} \nscale: {2:.3f}'.format(shape, loc, scale))

# GEV parameters
theta = (shape, loc, scale)

# negative loglikelihood evaluated at the fitted parameters
nLogL = genextreme.nnlf(theta, amax)

# freeze GEV with parameters, get GEV PDF between the 0.1% and 99.9% quantiles
rv = genextreme(shape, loc, scale)
x = np.linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
y = rv.pdf(x)

In [None]:
# GEV probability density against the histogram of the historical annual maxima
pdf_trace = go.Scatter(x=x, y=y, mode='lines', name='PDF', marker_color='black')
hist_trace = go.Histogram(
    x=pmax['Amax'].values,
    nbinsx=40,
    histnorm='probability density',
    name='Historical',
)

fig = go.Figure(data=[pdf_trace, hist_trace])
fig.update_layout(
    title='GEV Probability Density Function',
    xaxis_title="x",
    yaxis_title="P(x)",
    yaxis=dict(rangemode='nonnegative'),
)
fig.show()

### GEV-fit quality for historical values  <a class="anchor" id="33"></a>

<span style="font-family: times, Optima; font-size:11pt; color:black;">
A probability plot of the sample data against the quantiles of a specified theoretical distribution indicates the quality of the fit. <a href="https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.probplot.html">scipy.stats.probplot</a></span>

In [None]:
# Probability plot of the annual maxima against the GEV with the fitted parameters.
# With fit=True scipy also returns the least-squares line through the points.
res = scipy.stats.probplot(
    pmax['Amax'].values,
    sparams=theta,
    dist=scipy.stats.genextreme,
    fit=True,
)

In [None]:
# Probability plot: ordered annual maxima against the GEV theoretical quantiles
(theo_quantiles, ordered_values), (slope, intercept, _) = res

# Least-squares line spanning the theoretical-quantile range, so that it covers
# the plotted points wherever they lie (it is not anchored at x = 0)
line_x = np.array([np.min(theo_quantiles), np.max(theo_quantiles)])
line_y = intercept + slope * line_x

fig = go.Figure()
fig.add_trace(go.Scatter(x=theo_quantiles, y=ordered_values, mode='markers', marker_color='black'))
fig.add_trace(go.Scatter(x=line_x, y=line_y, mode='lines', marker_color='red'))

# small margin around the data on both axes
fig.update_xaxes(range=[np.min(theo_quantiles)-0.1, np.max(theo_quantiles)+0.1])
fig.update_yaxes(range=[np.min(ordered_values)-0.1, np.max(ordered_values)+0.1])

fig.update_layout(
    width=600, height=600,
    showlegend=False,
    xaxis_title = "Theoretical quantiles",
    yaxis_title = "Ordered values",
    title = 'Probplot for GEV distribution',
    yaxis=dict(rangemode='nonnegative')
)
fig.show()

### Simulate extreme values from GEV distribution <a class="anchor" id="34"></a>

<span style="font-family: times, Optima; font-size:11pt; color:black;">
If the sample size of annual maxima is not enough to characterize the extreme regime, a simulated sample of extreme events can be generated by freezing the shape, loc, and scale parameters.
    </span>


In [None]:
# Draw a random sample from the GEV defined by the fitted shape, loc and scale
size_sim = 1000
var_sim = genextreme.rvs(shape, loc, scale, size=size_sim)

# GEV probability density against the histogram of the simulated sample
pdf_trace = go.Scatter(x=x, y=y, mode='lines', name='PDF', marker_color='black')
sim_hist_trace = go.Histogram(
    x=var_sim,
    nbinsx=60,
    histnorm='probability density',
    name='Simulation',
)

fig = go.Figure(data=[pdf_trace, sim_hist_trace])
fig.update_layout(
    title='GEV Probability Density Function',
    xaxis_title="x",
    yaxis_title="P(x)",
    yaxis=dict(rangemode='nonnegative'),
)
fig.show()

### Historical-Simulated quantiles <a class="anchor" id="35"></a>

In [None]:
# Paired quantiles of the historical annual maxima and of the simulated sample.
# NOTE(review): qqplot is not defined in this notebook - presumably it comes from the
# star import of lib.extremes; confirm its signature and the order of the returned arrays
x_quantiles, y_quantiles = qqplot(pmax['Amax'].values, var_sim, quantiles=100, interpolation='nearest',)

In [None]:
# QQ plot: historical quantiles against simulated quantiles, with the 1:1 line
lo = np.min([x_quantiles.min(), y_quantiles.min()])
hi = np.max([x_quantiles.max(), y_quantiles.max()])

points_trace = go.Scatter(x=x_quantiles, y=y_quantiles, mode='markers', marker_color='black')
identity_trace = go.Scatter(x=[lo, hi], y=[lo, hi], mode='lines', marker_color='red')

fig = go.Figure(data=[points_trace, identity_trace])
fig.update_layout(
    title='QQ plot historical-simulated values',
    xaxis_title="Historical",
    yaxis_title="Simulated GEV values",
    showlegend=False,
    width=600, height=600,
    yaxis=dict(rangemode='nonnegative'),
)
fig.show()

## **Simulate GEV parameters** <a class="anchor" id="4"></a>

<span style="font-family: times, Optima; font-size:11pt; color:black;">
    Including uncertainty of shape, loc and scale parameters<br>
The Fisher information matrix is used to calculate the covariance matrix associated with the maximum-likelihood estimates. The square roots of the diagonal elements of the covariance matrix are the standard deviations of the point estimators.</span>

In [None]:
# How many GEV parameter sets to draw
n_sims = 1000

# Covariance of the parameter estimates, obtained from the GEV negative log-likelihood
# NOTE(review): ACOV is not defined in this notebook - presumably from lib.extremes; confirm
acov = ACOV(genextreme.nnlf, theta, pmax)

# Sample (shape, loc, scale) triplets from a multivariate normal centred on the fit
theta_gen = np.array(theta)
theta_sim = multivariate_normal(mean=theta_gen, cov=acov, size=n_sims)


In [None]:
# Display the fitted (shape, loc, scale) used as the mean of the parameter sampling
theta_gen

In [None]:
# Display the sampled parameter sets (one row per draw; columns shape, loc, scale)
theta_sim

In [None]:
# Table of the sampled parameter sets with the fitted set appended as the last row.
# 'gen', 'color' and 'alpha' tell the sampled rows ('sim') from the fitted one ('gen').
n_draws = len(theta_sim)
df_theta = pd.DataFrame({
    'shape': np.append(theta_sim[:, 0], theta[0]),
    'loc': np.append(theta_sim[:, 1], theta[1]),
    'scale': np.append(theta_sim[:, 2], theta[2]),
    'gen': ['sim'] * n_draws + ['gen'],
    'color': np.append(np.zeros(n_draws), 1),
    'alpha': np.append(np.full(n_draws, 0.7), 1),
})

# 3D scatter of the parameter sets in (shape, loc, scale) space
fig = px.scatter_3d(
    df_theta,
    x='shape', y='loc', z='scale',
    color='gen',
    opacity=0.7,
)
fig.update_layout(title='Simulated GEV parameters', width=1000, height=600)
fig.update_traces(marker=dict(size=4))
fig.show()

In [None]:
# Pairwise 2D scatter plots (lower triangle only) of the GEV parameter sets
splom_dims = [
    dict(label=label, values=df_theta[column])
    for label, column in (('Shape', 'shape'), ('Location', 'loc'), ('Scale', 'scale'))
]
splom_marker = dict(
    color=df_theta['color'],
    colorscale=['#636EFA', '#ff7f0e'],
    opacity=df_theta['alpha'],
)

fig = go.Figure(
    data=go.Splom(
        dimensions=splom_dims,
        marker=splom_marker,
        diagonal_visible=False,
        showupperhalf=False,
    )
)
fig.update_layout(title='Simulated GEV Parameters', width=700, height=700)
fig.show()

In [None]:
# Scatter-plot matrix of the parameter sets with histograms on the diagonal;
# the 'color' column is used as the grouping index
matrix_columns = ['shape', 'loc', 'scale', 'color']
fig = ff.create_scatterplotmatrix(
    df_theta[matrix_columns],
    index='color',
    diag='histogram',
    colormap='Blues',
    colormap_type='seq',
    height=800,
    width=800,
)

# hide the marker colour bars
fig.update_traces(marker_showscale=False)
fig.show()

### **Uncertainty of shape parameter**

<span style="font-family: times, Optima; font-size:11pt; color:black;">
<div class="alert alert-block alert-info"><b>Gumbel Distribution: </b> If the 95% confidence interval of the shape parameter includes 0, the shape parameter is not significant and the distribution can be defined as a Gumbel distribution.</div></span>

In [None]:
# Monte Carlo 95% confidence interval of the shape parameter:
# 2.5th and 97.5th percentiles of the sampled shapes
shape_draws = theta_sim[:, 0]
list(np.percentile(shape_draws, [2.5, 97.5]))

<span style="font-family: times, Optima; font-size:11pt; color:black;">
To obtain the confidence interval linked to each estimator, based on the normality property of the maximum-likelihood estimators, the confidence interval at the level (1-α) is calculated as follows:</span>

In [None]:
# Analytical 95% confidence interval of the shape parameter:
# 1.96 is the (1-α/2) quantile of the standard normal distribution for α = 0.05
z_value = 1.96
shape_std = np.sqrt(acov[0,0])
[theta[0] - z_value * shape_std, theta[0] + z_value * shape_std]

## **Simulate extreme values by considering the simulated GEV parameters**

### Use simulated GEVs to generate Annual Maxima 

In [None]:
years_sim = 100  # years to simulate

# One row of simulated annual maxima per sampled GEV parameter set
var_sim = np.full((theta_sim.shape[0], years_sim), np.nan)  # NaN-initialised output array
for sim_row, gev_params in zip(var_sim, theta_sim):
    sim_row[:] = genextreme.rvs(*gev_params, size=years_sim)

# Yearly time axis for the simulated data, starting in 1970
end_date = str(1970 + years_sim) + '-10-01'
time_sim = np.arange('1970-10-01', end_date, dtype='datetime64[Y]')


### Plot Return Period 

In [None]:
# Historical return periods and annual maxima in ascending order
# NOTE(review): t_rp is not defined in this notebook - presumably from lib.extremes; confirm
trp_hist = t_rp(pmax.index)
trp_hist_val = np.sort(pmax['Amax'])

# Simulated return periods; each simulation (row) is sorted in ascending order
trp_sim = t_rp(time_sim)
trp_sim_val = np.sort(var_sim)

# Percentiles across simulations at every rank.
# Note that p05/p95 hold the 2.5th/97.5th percentiles, i.e. a 95% band.
p05, p50, p95 = np.percentile(trp_sim_val, [5/2.0, 50, 100-5/2.0], axis=0)


In [None]:
# Return-period plot: simulated band and median plus the historical annual maxima.
# Trace order matters: the P05 trace fills up to the trace added just before it (P95).
band_color = 'mediumturquoise'
rp_traces = [
    go.Scatter(x=trp_sim, y=p95, mode='lines', name='P95', marker_color=band_color),
    go.Scatter(x=trp_sim, y=p05, mode='lines', name='P05', marker_color=band_color,
               fill='tonexty', fillcolor='rgba(0, 181, 204, 0.10)'),
    go.Scatter(x=trp_sim, y=p50, mode='lines', name='P50', marker_color='black'),
    go.Scatter(x=trp_hist, y=trp_hist_val, mode='markers', name='Hist', marker_color='red'),
]

fig = go.Figure()
for rp_trace in rp_traces:
    fig.add_trace(rp_trace)

fig.update_xaxes(type="log")
fig.update_layout(
    title='Annual Maxima',
    xaxis_title="Return Period (years)",
    yaxis_title=var,
    width=400*2.5, height=300*2.5,
)
fig.show()

## **If the shape parameter is not significant, consider Gumbel distribution** <a class="anchor" id="5"></a>

<a href="https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gumbel_r.html">scipy.stats.gumbel_r</a> 

In [None]:
# fit data to GUMBEL
c = 0 # shape parameter = 0 (Gumbel distribution)

# pass the annual maxima as a 1-D sample (pmax.values is a 2-D (n, 1) array)
amax = pmax['Amax'].values

# f0 keeps the shape fixed at c while loc and scale are estimated
shape, loc, scale = genextreme.fit(amax, f0=c)
print('shape: {0:.3f} \nloc: {1:.3f} \nscale: {2:.3f}'.format(c, loc, scale))

In [None]:
# GEV parameters (shape fixed at 0 -> Gumbel)
theta = (shape, loc, scale)

# negative loglikelihood, evaluated on the 1-D sample of annual maxima so that
# the result is a scalar (passing the DataFrame gives a length-1 array)
nLogL = genextreme.nnlf(theta, pmax['Amax'].values)

# freeze GEV with parameters, get GEV PDF between the 0.1% and 99.9% quantiles
rvg = genextreme(shape, loc, scale)
x = np.linspace(rvg.ppf(0.001), rvg.ppf(0.999), 1000)
y = rvg.pdf(x)


In [None]:
# How many parameter sets to draw
n_sims = 1000

# Covariance of the parameter estimates from the Gumbel negative log-likelihood
# NOTE(review): theta holds three values (shape, loc, scale) while gumbel_r is a
# two-parameter (loc, scale) distribution - confirm that ACOV / gumbel_r.nnlf
# handle the extra shape entry as intended
acov = ACOV(gumbel_r.nnlf, theta, pmax)

# Sample parameter sets from a multivariate normal centred on the fit
theta_gen = np.array(theta)
theta_sim = multivariate_normal(mean=theta_gen, cov=acov, size=n_sims)


In [None]:
years_sim = 100  # years to simulate

# One row of simulated annual maxima per sampled parameter set
var_sim = np.full((theta_sim.shape[0], years_sim), np.nan)  # NaN-initialised output array
for sim_row, gev_params in zip(var_sim, theta_sim):
    sim_row[:] = genextreme.rvs(*gev_params, size=years_sim)

# Yearly time axis for the simulated data, starting in 1970
end_date = str(1970 + years_sim) + '-10-01'
time_sim = np.arange('1970-10-01', end_date, dtype='datetime64[Y]')


In [None]:
# Historical return periods and annual maxima in ascending order
# NOTE(review): t_rp is not defined in this notebook - presumably from lib.extremes; confirm
trp_hist = t_rp(pmax.index)
trp_hist_val = np.sort(pmax['Amax'])

# Simulated return periods; each simulation (row) is sorted in ascending order
trp_sim = t_rp(time_sim)
trp_sim_val = np.sort(var_sim)

# Percentiles across simulations at every rank.
# Note that p05/p95 hold the 2.5th/97.5th percentiles, i.e. a 95% band.
p05, p50, p95 = np.percentile(trp_sim_val, [5/2.0, 50, 100-5/2.0], axis=0)


In [None]:
# Return-period plot for the Gumbel case: simulated band and median plus historical maxima.
# Trace order matters: the P05 trace fills up to the trace added just before it (P95).
band_color = 'mediumturquoise'
rp_traces = [
    go.Scatter(x=trp_sim, y=p95, mode='lines', name='P95', marker_color=band_color),
    go.Scatter(x=trp_sim, y=p05, mode='lines', name='P05', marker_color=band_color,
               fill='tonexty', fillcolor='rgba(0, 181, 204, 0.10)'),
    go.Scatter(x=trp_sim, y=p50, mode='lines', name='P50', marker_color='black'),
    go.Scatter(x=trp_hist, y=trp_hist_val, mode='markers', name='Hist', marker_color='red'),
]

fig = go.Figure()
for rp_trace in rp_traces:
    fig.add_trace(rp_trace)

fig.update_xaxes(type="log")
fig.update_layout(
    title='Annual Maxima Gumbel Distribution',
    xaxis_title="Return Period (years)",
    yaxis_title=var,
    width=400*2.5, height=300*2.5,
)
fig.show()

In [None]:
# Draw a random sample from the distribution defined by the current shape, loc and scale
size_sim = 1000
var_sim = genextreme.rvs(shape, loc, scale, size=size_sim)

# Probability density against the histogram of the simulated sample
pdf_trace = go.Scatter(x=x, y=y, mode='lines', name='PDF', marker_color='black')
sim_hist_trace = go.Histogram(
    x=var_sim,
    nbinsx=60,
    histnorm='probability density',
    name='Simulation',
)

fig = go.Figure(data=[pdf_trace, sim_hist_trace])
fig.update_layout(
    title='GEV Probability Density Function',
    xaxis_title="x",
    yaxis_title="P(x)",
    yaxis=dict(rangemode='nonnegative'),
)
fig.show()