# Estimation of recurrence probabilities

### Preparation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import os
home_directory = os.path.expanduser( '~' )
os.chdir(home_directory + '/DS_Project/modules')
import warnings
warnings.filterwarnings("ignore")
import pickle
import yaml
config_path = 'config.yml'
with open(config_path, 'r') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)
path = config['data']['data'] + '/DWD/'
path_app = path + 'app/'

In [2]:
from data_retrieval.DWD.DWDScraper import *
from models.heatwaves.HeatwaveM import *

In [3]:
from datetime import datetime
str(datetime.now().year) + '-01-01'

'2023-01-01'

### Data import

In [4]:
# get scraper
S = DWDScraper()
# modify bounding box to grid
S.bounding_boxes = config['bboxes']['munich']
# get all stations
all_stations_csv = "all-stations.csv"
S.get_all_stations(all_stations_csv, "2014-01-01","2023-07-15")
s = pd.read_csv(config['data']['dwd'] + '/meta/' + all_stations_csv)
munich_ids = S.get_relevant_station_ids(s)
print(munich_ids)

[1262, 3379, 7431]


In [5]:
station_meta = s[s.STATIONS_ID.isin(munich_ids)]
station_meta

Unnamed: 0,STATIONS_ID,VON_DATUM,BIS_DATUM,STATIONSHOEHE,GEOBREITE,GEOLAENGE,STATIONSNAME,BUNDESLAND
86,1262,1992-05-17,2023-07-17,446,48.3477,11.8134,München-Flughafen,Bayern
239,3379,1997-07-01,2023-07-17,515,48.1632,11.5429,München-Stadt,Bayern
467,7431,2007-11-01,2023-07-17,604,48.013,11.5524,Oberhaching-Laufzorn,Bayern


In [6]:
# save meta to pickle
with open(path_app + 'station_meta.pkl', 'wb') as file:
    pickle.dump(station_meta, file)

In [7]:
# scrape historic data
munich_hist_csv = "munich-historic.csv"
S.scrape_hist(munich_hist_csv, "2014-01-01","2022-12-31", munich_ids)
munich_dwd_hist = pd.read_csv(path + munich_hist_csv)
munich_dwd_hist['MESS_DATUM'] = pd.to_datetime(munich_dwd_hist['MESS_DATUM'], format='%Y-%m-%d %H')

In [8]:
# scrape recent data
munich_recent_csv = "munich-recent.csv"
S.scrape_recent(munich_recent_csv, "07-16", munich_ids)
munich_dwd_recent = pd.read_csv(path + munich_recent_csv)
munich_dwd_recent['MESS_DATUM'] = pd.to_datetime(munich_dwd_recent['MESS_DATUM'], format='%Y-%m-%d %H')

In [9]:
munich_dwd = pd.concat([munich_dwd_hist, munich_dwd_recent])
munich_dwd.sort_values(by=['STATIONS_ID', 'MESS_DATUM'], ascending=[True, False])
munich_dwd.drop_duplicates(subset=['STATIONS_ID', 'MESS_DATUM'], inplace=True)
munich_dwd.head()

Unnamed: 0,STATIONS_ID,MESS_DATUM,TT_TU,RF_TU
0,1262.0,2014-01-01 00:00:00,-3.2,100.0
1,1262.0,2014-01-01 01:00:00,-3.1,100.0
2,1262.0,2014-01-01 02:00:00,-3.4,100.0
3,1262.0,2014-01-01 03:00:00,-3.6,100.0
4,1262.0,2014-01-01 04:00:00,-3.9,100.0


In [10]:
munich_dwd.describe()

Unnamed: 0,STATIONS_ID,TT_TU,RF_TU
count,250471.0,250471.0,250471.0
mean,4020.76521,9.58476,76.342523
std,2558.516706,18.322308,29.845191
min,1262.0,-999.0,-999.0
25%,1262.0,3.3,64.0
50%,3379.0,9.5,82.0
75%,7431.0,15.7,93.0
max,7431.0,36.7,100.0


In [11]:
munich_dwd['TT_TU'] = munich_dwd['TT_TU'].replace(-999, np.nan)
munich_dwd['RF_TU'] = munich_dwd['RF_TU'].replace(-999, np.nan)

# Fill NaNs with the last value before them
munich_dwd['TT_TU'].fillna(method='ffill', inplace=True)
munich_dwd['RF_TU'].fillna(method='ffill', inplace=True)

munich_dwd.describe()

Unnamed: 0,STATIONS_ID,TT_TU,RF_TU
count,250471.0,250710.0,250710.0
mean,4020.76521,9.851791,76.837294
std,2558.516706,8.226295,18.718551
min,1262.0,-21.7,13.0
25%,1262.0,3.3,64.0
50%,3379.0,9.5,82.0
75%,7431.0,15.7,93.0
max,7431.0,36.7,100.0


In [12]:
munich_dwd['MESS_DATUM'].dt.year.value_counts(dropna=False)

2016    26352
2020    26352
2014    26280
2015    26280
2017    26280
2018    26280
2019    26280
2021    26280
2022    26211
2023    14115
Name: MESS_DATUM, dtype: int64

In [13]:
print(munich_dwd.MESS_DATUM.min())
print(munich_dwd.MESS_DATUM.max())

2014-01-01 00:00:00
2023-07-16 00:00:00


In [14]:
year_range = np.arange(2014,2024,1).tolist()

hourly = pd.DataFrame()
daily = pd.DataFrame()

for idx, loc in enumerate(munich_ids):

    w = HeatwaveM(munich_dwd[munich_dwd.STATIONS_ID == loc])
    w.get_heatwaves_ky(station_id=loc,year=year_range,t_max=30,t_min=25)

    sub = w.groupby(['STATION_ID', 'DATE'], as_index=False).agg({
        'TEMP': ['max', 'min'],
        'HEATWAVE': 'max',
        'IND': 'max'
    }).reset_index(drop=True)
    sub.columns = ['STATION_ID', 'DATE', 'MAX_TEMP', 'MIN_TEMP', 'HEATWAVE', 'IND']

    hourly = pd.concat([hourly, w[['STATION_ID','TIME','DATE','TEMP','HUMID','HEATWAVE','IND']]])
    daily = pd.concat([daily, sub])

In [17]:
hourly.describe()

Unnamed: 0,STATION_ID,TEMP,HUMID,HEATWAVE,IND
count,250471.0,250471.0,250471.0,250471.0,5376.0
mean,4020.76521,9.851598,76.84177,0.021464,5.857143
std,2558.516706,8.223336,18.726219,0.144924,4.504102
min,1262.0,-21.7,13.0,0.0,0.0
25%,1262.0,3.3,64.0,0.0,2.0
50%,3379.0,9.5,82.0,0.0,5.0
75%,7431.0,15.7,93.0,0.0,9.0
max,7431.0,36.7,100.0,1.0,18.0


In [18]:
hourly.head()

Unnamed: 0,STATION_ID,TIME,DATE,TEMP,HUMID,HEATWAVE,IND
0,1262.0,2014-01-01 00:00:00,2014-01-01,-3.2,100.0,0.0,
1,1262.0,2014-01-01 01:00:00,2014-01-01,-3.1,100.0,0.0,
2,1262.0,2014-01-01 02:00:00,2014-01-01,-3.4,100.0,0.0,
3,1262.0,2014-01-01 03:00:00,2014-01-01,-3.6,100.0,0.0,
4,1262.0,2014-01-01 04:00:00,2014-01-01,-3.9,100.0,0.0,


In [19]:
daily.describe()

Unnamed: 0,STATION_ID,MAX_TEMP,MIN_TEMP,HEATWAVE,IND
count,10444.0,10444.0,10444.0,10444.0,224.0
mean,4021.390272,14.532717,5.268633,0.021448,5.857143
std,2558.812079,8.75703,6.636951,0.144878,4.51377
min,1262.0,-10.5,-21.7,0.0,0.0
25%,1262.0,7.6,0.1,0.0,2.0
50%,3379.0,14.5,5.1,0.0,5.0
75%,7431.0,21.5,10.8,0.0,9.0
max,7431.0,36.7,23.4,1.0,18.0


In [20]:
daily.head()

Unnamed: 0,STATION_ID,DATE,MAX_TEMP,MIN_TEMP,HEATWAVE,IND
0,1262.0,2014-01-01,1.8,-5.6,0.0,
1,1262.0,2014-01-02,5.2,-3.6,0.0,
2,1262.0,2014-01-03,7.2,-3.5,0.0,
3,1262.0,2014-01-04,5.6,1.1,0.0,
4,1262.0,2014-01-05,5.6,0.3,0.0,


In [21]:
with open(path_app + 'daily.pkl', 'wb') as file:
    pickle.dump(daily, file)
with open(path_app + 'hourly.pkl', 'wb') as file:
    pickle.dump(hourly, file)