# Estimation of recurrence probabilities

### Preparation

In [None]:
import numpy as np
import pandas as pd
import datetime
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import statsmodels.tsa.stattools as stattools
from statsmodels.tsa.arima_process import arma_generate_sample
from statsmodels.tsa.arima.model import ARIMA
from datetime import timedelta

import os
# Switch the working directory to the project's module folder under the user's
# home directory. The relative config path below is resolved against it.
# NOTE(review): presumably the project-local imports in the next cell also
# depend on this cwd -- confirm.
home_directory = os.path.expanduser( '~' )
os.chdir(home_directory + '/DS_Project/modules')
import warnings
# NOTE(review): this silences every warning for the rest of the session;
# consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings("ignore")
import pickle
import yaml
# Read the project configuration from 'config.yml' in the current directory.
config_path = 'config.yml'
with open(config_path, 'r') as f:
    # NOTE(review): yaml.safe_load is the safer loader if the config holds only
    # plain mappings and scalars -- confirm before switching.
    config = yaml.load(f, Loader=yaml.FullLoader)

In [None]:
from data_retrieval.DWD.DWDScraper import *
from models.heatwaves.HeatwaveM import *

### Data import

In [None]:
# Load the previously identified heatwave dates from the pickle file in the
# DWD data directory and show them.
# NOTE(review): unpickling runs arbitrary code -- only load files that this
# project produced itself.
heatwave_file = config['data']['dwd'] + '/heatwaves.pkl'
with open(heatwave_file, "rb") as input_file:
    heat = pickle.load(input_file)
print(heat)

In [None]:
# Split the heatwave dates into sublists; the plots below shade one span per
# sublist, running from its first to its last entry.
# NOTE(review): divide_dates_into_sublists presumably comes from the HeatwaveM
# star import and groups runs of consecutive days -- confirm.
sublists = divide_dates_into_sublists(heat)
print(sublists)

In [None]:
# scrape data for Munich city station from 2014 to 2022
# NOTE(review): the arguments are presumably (output file name, start date,
# end date, list of station ids), with 3379 being the Munich city station and
# the CSV written to config['data']['dwd'], where the next cell reads
# 'munich-city.csv' -- confirm against DWDScraper.scrape.
S = DWDScraper()
S.scrape("munich-city.csv", "2014-01-01","2022-12-31", [3379])

In [None]:
# Read the scraped station data and derive the time columns used below:
#   MESS_DATUM -> parsed timestamps (input format 'YYYY-MM-DD HH')
#   DATE       -> calendar date of each timestamp, used to group by day
csv_path = config['data']['dwd']+'/munich-city.csv'
munich_city = pd.read_csv(csv_path)
timestamps = pd.to_datetime(munich_city['MESS_DATUM'], format='%Y-%m-%d %H')
munich_city['MESS_DATUM'] = timestamps
munich_city['DATE'] = timestamps.dt.date

In [None]:
# get daily maximum temperatures (for June, July, August)
years = list(range(2014, 2023))
months = [6,7,8]
# Boolean row mask: measurements taken in the selected years and months.
sub_index = (munich_city['MESS_DATUM'].dt.year.isin(years)) & (munich_city['MESS_DATUM'].dt.month.isin(months))
# Select the TT_TU column *before* aggregating: taking the maximum of every
# column only to keep one is wasteful and fails if another column holds values
# that cannot be ordered. The result is unchanged: one value per calendar day,
# sorted by date, named 'TT_TU' with index name 'DATE'.
tseries = munich_city.loc[sub_index].groupby('DATE')['TT_TU'].max()
# DATE holds plain date objects; convert to a DatetimeIndex so that the
# .year accessor and date-based lookups used later work.
tseries.index = pd.to_datetime(tseries.index)

### Descriptives

In [None]:
# plot Munich 2022 data including heatwaves marker
y2022_index = tseries.index.year.isin([2022])
time_series = tseries[y2022_index]

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# newer releases no longer accept the old names. Use whichever name this
# installation provides so the cell runs on old and new matplotlib alike.
style_name = 'seaborn-v0_8-whitegrid'
if style_name not in plt.style.available:
    style_name = 'seaborn-whitegrid'
plt.style.use(style_name)
fig, ax = plt.subplots()

# Dashed reference lines at 30 and 25 degrees Celsius.
ax.axhline(30, color='darkred', linestyle='--')
ax.axhline(25, color='lightcoral', linestyle='--')

ax.plot(time_series.index, time_series.values, color='blue')
# Major ticks on the 1st and 15th of each month.
ax.xaxis.set_major_locator(mdates.DayLocator(bymonthday=[1,15]))

# Shade every heatwave span from 0 up to the highest observed value.
# Both settings are loop-invariant, so they are computed once.
alpha = 0.3
shade_top = time_series.values.max()
for sublist in sublists:
    start_date = sublist[0]
    end_date = sublist[-1]

    # Integer positions of the span's first and last day in the 2022 series.
    # get_loc raises KeyError if a date is not part of that series.
    start_idx = time_series.index.get_loc(start_date.strftime("%Y-%m-%d"))
    end_idx = time_series.index.get_loc(end_date.strftime("%Y-%m-%d"))

    # end_idx + 1 because the slice end is exclusive and the last day belongs
    # to the span.
    ax.fill_between(time_series.index[start_idx:end_idx+1], 0, shade_top, facecolor='red', alpha=alpha)

ax.set_xlabel('Day')
plt.xticks(rotation=45)
ax.set_ylabel('Maximum temperature in Celsius')
ax.set_title('Munich in Summer 2022')

# Save before plt.show(), which may release the figure.
plt.savefig(config['data']['dwd'] + '/summer_munich_2022.png', bbox_inches="tight")

plt.show()

In [None]:
# Plot all summer observations back to back. The date index is replaced by a
# running integer so the gaps between the yearly June-August windows do not
# appear on the x-axis; the tick marks are therefore hidden as well.
summer_values = tseries.reset_index(drop=True)
summer_values.plot(color='orangered')
plt.title('Munich from June to August for 2014-2022')
plt.ylabel('Maximum temperature in Celsius')
plt.xticks([], [])
plt.show()

### Modeling ARMA(1,1)

In [None]:
# Augmented Dickey-Fuller test.
# H0: the series has a unit root (it is non-stationary).
# H0 is rejected when the test statistic lies *below* the critical value,
# i.e. is more negative than it.
dickeyfuller = stattools.adfuller(
    tseries,
    autolag="AIC"
)

# adfuller returns (statistic, p-value, used lags, n obs, critical values, ...).
print('ADF Statistic: %f' % dickeyfuller[0])

print('p-value: %f' % dickeyfuller[1])

print('Critical Values:')

for key, value in dickeyfuller[4].items():
    print('\t%s: %.3f' % (key, value))
# Decision at the 5% level. The comparison must be "<": a statistic above the
# critical value means H0 cannot be rejected, so the series may be
# non-stationary.
if dickeyfuller[0] < dickeyfuller[4]["5%"]:
    print ("Reject Ho - Time Series is Stationary")
else:
    print ("Failed to Reject Ho - Time Series is Non-Stationary")

In [None]:
# Fit an ARMA(1,1) model (ARIMA order (1, 0, 1), no differencing) to the daily
# maxima. The date index is dropped so the model sees an evenly spaced integer
# index instead of dates with gaps between the summers.
arma_spec = ARIMA(tseries.reset_index(drop=True), order=(1, 0, 1))
arma_mod = arma_spec.fit()
print(arma_mod.summary())

In [None]:
# Build the lag polynomials passed to arma_generate_sample below: a leading 1
# for lag zero, followed by the negated AR coefficients and the MA
# coefficients as estimated.
ar = np.concatenate(([1.0], -arma_mod.arparams))
ma = np.concatenate(([1.0], arma_mod.maparams))

### Simulating data

In [None]:
# Centre and scale the observed series with its own sample mean and sample
# standard deviation.
m, sd = tseries.mean(), tseries.std()
standard = tseries.sub(m).div(sd)

In [None]:
# simulate data for one year (specified number of months only)
r = 10000                      # number of simulated paths
n = int(y2022_index.sum())     # number of summer days in 2022
# The model was fitted to the raw temperatures, but the simulated paths are
# compared against the standardized series. Dividing a series by `sd` divides
# its innovations by `sd` as well, so the noise must be drawn with standard
# deviation sqrt(sigma2) / sd. The default scale=1 would put the simulation on
# a different scale than the data it is compared with.
# NOTE(review): assumes the fitted parameters are a labelled Series with a
# 'sigma2' entry, as statsmodels returns for pandas input -- confirm.
noise_sd = np.sqrt(arma_mod.params['sigma2']) / sd
mat = np.empty([r,n])
for i in range(r):
    # One independent zero-mean ARMA path per row.
    mat[i,:] = arma_generate_sample(ar, ma, nsample=n, scale=noise_sd)

In [None]:
# Compare every simulated path with the standardized 2022 observations.
# Broadcasting the n observed values against the (r, n) simulation matrix marks
# with 1.0 each cell where the observation exceeds the simulated value.
observed_2022 = np.asarray(standard[y2022_index])
result = (observed_2022 > mat).astype(float)
# Per day: share of simulated paths that the observation exceeds.
means = result.mean(axis=0)

### Compare to identified heatwaves

In [None]:
# Bar chart of the per-day exceedance shares for summer 2022, with the
# identified heatwave spans shaded in red behind the bars.
fig, ax = plt.subplots()

alpha = 0.6
date_fmt = "%Y-%m-%d"
for sublist in sublists:
    start_date, end_date = sublist[0], sublist[-1]
    # Positions of the span's first and last day in the 2022 series.
    start_idx = time_series.index.get_loc(start_date.strftime(date_fmt))
    end_idx = time_series.index.get_loc(end_date.strftime(date_fmt))
    # Inclusive span, shaded over the full 0..1 range of the y-axis.
    heatwave_days = time_series.index[start_idx:end_idx + 1]
    ax.fill_between(heatwave_days, 0, 1, facecolor='red', alpha=alpha)

ax.bar(time_series.index, np.asarray(means), color='royalblue', width=1)
# Major ticks on the 1st and 15th of each month.
ax.xaxis.set_major_locator(mdates.DayLocator(bymonthday=[1, 15]))
plt.xticks(rotation=45)
ax.set_xlabel('Date')
ax.set_ylabel('Inverse recurrence probability')
ax.set_title('Estimated recurrence probabilities in Munich for Summer 2022')
plt.show()

In [None]:
# Extract the estimated values for the identified heatwave days.
# Calendar dates of the 2022 summer series, in the same order as `means`.
summer_dates = tseries[y2022_index].index.date
# True for each 2022 summer day that is listed as a heatwave day.
heat_index = [day in heat for day in summer_dates]
# Label the values with the dates that actually matched rather than with
# `heat` itself. Using `heat` as the index only works when it is sorted
# chronologically and lies entirely inside the 2022 summer series; otherwise
# it raises on a length mismatch or attaches values to the wrong dates.
recurrence_prob = pd.Series(means, index=summer_dates)[heat_index]
print(recurrence_prob)