### Set Up

In [1]:
import os
import re
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import pyarrow as pa
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer

# Project directory layout.
# NOTE(review): BASE_PATH is an absolute, machine-specific location; it has to be
# adapted when the notebook is run on another machine.
BASE_PATH = 'D:/KIMoDIs/global-groundwater-models-main'
DATA_PATH, MODEL_PATH, RESULT_PATH = (
    os.path.join(BASE_PATH, sub_dir) for sub_dir in ('data', 'models', 'results')
)

# Window sizes, expressed in weekly time steps.
LAG = 52  # weeks
LEAD = 12  # weeks

# Chronological split boundaries as (start, end) pairs -- roughly 80/10/10.
TRAIN_PERIOD = (pd.Timestamp(year=1990, month=1, day=1), pd.Timestamp(year=2010, month=1, day=1))
VAL_PERIOD = (pd.Timestamp(year=2010, month=1, day=1), pd.Timestamp(year=2013, month=1, day=1))
TEST_PERIOD = (pd.Timestamp(year=2013, month=1, day=1), pd.Timestamp(year=2016, month=1, day=1))

# Lookup table mapping a consecutive integer 'time_idx' to every Sunday strictly
# between the start of the training period and the end of the test period.
TIME_IDX = (
    pd.date_range(
        start=TRAIN_PERIOD[0],
        end=TEST_PERIOD[1],
        freq='W-SUN',
        inclusive='neither',
        name='time',
    )
    .to_frame(index=False)     # plain RangeIndex, single 'time' column
    .rename_axis('time_idx')   # name the positional index ...
    .reset_index()             # ... and turn it into a regular column
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pre-split train / validation / test tables from parquet via pyarrow
# and convert each one directly into a pandas DataFrame.
train_df = pq.read_table(os.path.join(DATA_PATH, 'train_df.parquet')).to_pandas()
val_df = pq.read_table(os.path.join(DATA_PATH, 'val_df.parquet')).to_pandas()
test_df = pq.read_table(os.path.join(DATA_PATH, 'test_df.parquet')).to_pandas()

In [3]:
# Keep only those training / validation rows whose 'proj_id' also occurs in the
# test data. The test ids are the reference set; test_df itself is not filtered.
id_test = test_df['proj_id'].unique()

train_mask = train_df['proj_id'].isin(id_test)
val_mask = val_df['proj_id'].isin(id_test)

train_df, val_df = train_df[train_mask], val_df[val_mask]

## Time Series Data Set

In [14]:
# Show the column names of the training frame as a plain list.
train_df.columns.tolist()

['proj_id',
 'time',
 'gwl',
 'precip',
 'humid',
 'temp',
 'lai',
 'day_sin',
 'day_cos',
 'twi',
 'gw_recharge',
 'hyraum_gr',
 'aquifer_type',
 'permeability_coef',
 'soil_texture',
 'elevation',
 'land_cover',
 'landform_entr10km',
 'landform_sha10km',
 'landform_uni10km',
 'eumohp_dsd1',
 'eumohp_dsd2',
 'eumohp_dsd3',
 'eumohp_dsd4',
 'eumohp_dsd5',
 'eumohp_lp1',
 'eumohp_lp2',
 'eumohp_lp3',
 'eumohp_lp4',
 'eumohp_lp5',
 'eumohp_sd1',
 'eumohp_sd2',
 'eumohp_sd3',
 'eumohp_sd4',
 'eumohp_sd5',
 'lon',
 'lat',
 'sin_temp',
 'time_idx']

In [15]:
# Inspect the dtype of every column of the training frame.
train_df.dtypes

proj_id                      object
time                 datetime64[ns]
gwl                         float32
precip                      float64
humid                       float64
temp                        float64
lai                         float64
day_sin                     float32
day_cos                     float32
twi                         float64
gw_recharge                 float64
hyraum_gr                  category
aquifer_type               category
permeability_coef          category
soil_texture               category
elevation                   float32
land_cover                 category
landform_entr10km           float64
landform_sha10km            float64
landform_uni10km            float64
eumohp_dsd1                 float64
eumohp_dsd2                 float64
eumohp_dsd3                 float64
eumohp_dsd4                 float64
eumohp_dsd5                 float64
eumohp_lp1                  float64
eumohp_lp2                  float64
eumohp_lp3                  

### TFT

In [17]:
# Full dataset mqf2
# TFT dataset using static site attributes, the known dynamic covariates and the
# groundwater level itself ('gwl') as time-varying unknown input. Encoder and
# prediction windows are fixed to LAG and LEAD steps respectively.
train_ds_tft_mqf2 = TimeSeriesDataSet(
    train_df,
    group_ids=["proj_id"],
    target="gwl",
    time_idx="time_idx",
    min_encoder_length=LAG,
    max_encoder_length=LAG,
    min_prediction_length=LEAD,
    max_prediction_length=LEAD,
    static_reals=['twi',
                  'gw_recharge',
                  'landform_sha10km',
                  'eumohp_dsd1',
                  'eumohp_lp1',
                  'eumohp_sd1',
                  'elevation'],
    static_categoricals=['hyraum_gr',
                         'land_cover', 
                         'aquifer_type', 
                         'permeability_coef',
                         'soil_texture'],
    time_varying_unknown_reals=['gwl'],
    time_varying_known_reals=['humid',
                              'precip',
                              'temp',
                              'sin_temp',
                              'lai',
                              'day_sin',
                              'day_cos'],
    add_target_scales=True, # Adds center and scale of the unnormalized timeseries as feature to static reals
    allow_missing_timesteps=False
)

# The validation set is derived from the training set's definition, applied to
# the validation frame.
val_ds_tft_mqf2 = TimeSeriesDataSet.from_dataset(train_ds_tft_mqf2, val_df)

# Make sure the output folder exists so saving does not fail on a fresh checkout.
preprocessing_dir = os.path.join(RESULT_PATH, 'preprocessing')
os.makedirs(preprocessing_dir, exist_ok=True)

train_ds_tft_mqf2.save(os.path.join(preprocessing_dir, 'train_ds_full_tft_mqf2.pt'))
# Fixed: the file name previously ended in a doubled extension ('.pt.pt'), unlike
# every other dataset saved by this notebook.
# NOTE(review): update any loader that still expects the old '.pt.pt' file name.
val_ds_tft_mqf2.save(os.path.join(preprocessing_dir, 'val_ds_full_tft_mqf2.pt'))

# # Hyraum 1 mqf2
# train_ds_tft_hyraum1_mqf2 = TimeSeriesDataSet(
#     train_hyraum1,
#     group_ids=["proj_id"],
#     target="gwl",
#     time_idx="time_idx",
#     min_encoder_length=LAG,
#     max_encoder_length=LAG,
#     min_prediction_length=LEAD,
#     max_prediction_length=LEAD,
#     static_reals=['twi',
#                   'gw_recharge',
#                   'landform_sha10km',
#                   'eumohp_dsd1',
#                   'eumohp_lp1',
#                   'eumohp_sd1',
#                   'elevation'],
#     static_categoricals=['hyraum_gr',
#                          'land_cover', 
#                          'aquifer_type', 
#                          'permeability_coef',
#                          'soil_texture'],
#     time_varying_unknown_reals=['gwl'],
#     time_varying_known_reals=['humid',
#                               'precip',
#                               'temp',
#                               'sin_temp',
#                               'lai',
#                               'day_sin',
#                               'day_cos'],
#     add_target_scales=True, # Adds center and scale of the unnormalized timeseries as feature to static reals
#     allow_missing_timesteps=False
# )

# val_ds_tft_hyraum1_mqf2 = TimeSeriesDataSet.from_dataset(train_ds_tft_hyraum1_mqf2, val_hyraum1)

# train_ds_tft_hyraum1_mqf2.save(os.path.join(RESULT_PATH, 'preprocessing', 'train_ds_full_tft_hyraum1_mqf2.pt'))
# val_ds_tft_hyraum1_mqf2.save(os.path.join(RESULT_PATH, 'preprocessing', 'val_ds_full_tft_hyraum1_mqf2.pt'))
# train_ds_tft_hyraum1_mqf2.save('J:/B22-FISHy/NUTZER/Kunz.S/kimodis_preprocessed/train_ds_full_tft_hyraum1_mqf2.pt')
# val_ds_tft_hyraum1_mqf2.save('J:/B22-FISHy/NUTZER/Kunz.S/kimodis_preprocessed/val_ds_full_tft_hyraum1_mqf2.pt')

# len(train_ds_tft.decoded_index['proj_id'].unique())

In [13]:
# TFT full "interpol" dataset (mqf2).
# Same static and known dynamic covariates as the full dataset, but the target is
# not passed as an input: time_varying_unknown_reals is empty and no target-scale
# features are added (add_target_scales=False).
interpol_static_reals = [
    'twi', 'gw_recharge', 'landform_sha10km',
    'eumohp_dsd1', 'eumohp_lp1', 'eumohp_sd1', 'elevation',
]
interpol_static_categoricals = [
    'hyraum_gr', 'land_cover', 'aquifer_type', 'permeability_coef', 'soil_texture',
]
interpol_known_reals = [
    'humid', 'precip', 'temp', 'sin_temp', 'lai', 'day_sin', 'day_cos',
]

train_ds_tft_interpol_mqf2 = TimeSeriesDataSet(
    train_df,
    time_idx='time_idx',
    target='gwl',
    group_ids=['proj_id'],
    # fixed-length encoder / prediction windows
    min_encoder_length=LAG,
    max_encoder_length=LAG,
    min_prediction_length=LEAD,
    max_prediction_length=LEAD,
    static_categoricals=interpol_static_categoricals,
    static_reals=interpol_static_reals,
    time_varying_known_reals=interpol_known_reals,
    time_varying_unknown_reals=[],
    add_target_scales=False,
    allow_missing_timesteps=False,
)

# Validation set: same dataset definition, applied to the validation frame.
val_ds_tft_interpol_mqf2 = TimeSeriesDataSet.from_dataset(
    train_ds_tft_interpol_mqf2, val_df
)

# Persist both datasets below <RESULT_PATH>/preprocessing.
interpol_out_dir = os.path.join(RESULT_PATH, 'preprocessing')
train_ds_tft_interpol_mqf2.save(os.path.join(interpol_out_dir, 'train_ds_full_interpol_tft_mqf2.pt'))
val_ds_tft_interpol_mqf2.save(os.path.join(interpol_out_dir, 'val_ds_full_interpol_tft_mqf2.pt'))

# Hyraum 1 mqf2
# train_ds_tft_interpol_hyraum1_mqf2 = TimeSeriesDataSet(
#     train_hyraum1,
#     group_ids=["proj_id"],
#     target="gwl",
#     time_idx="time_idx",
#     min_encoder_length=LAG,
#     max_encoder_length=LAG,
#     min_prediction_length=LEAD,
#     max_prediction_length=LEAD,
#     static_reals=['twi',
#                   'gw_recharge',
#                   'landform_sha10km',
#                   'eumohp_dsd1',
#                   'eumohp_lp1',
#                   'eumohp_sd1',
#                   'elevation'],
#     static_categoricals=['hyraum_gr',
#                          'land_cover', 
#                          'aquifer_type', 
#                          'permeability_coef',
#                          'soil_texture'],
#     time_varying_unknown_reals=[],
#     time_varying_known_reals=['humid',
#                               'precip',
#                               'temp',
#                               'sin_temp',
#                               'lai',
#                               'day_sin',
#                               'day_cos'],
#     add_target_scales=False, # Adds center and scale of the unnormalized timeseries as feature to static reals
#     allow_missing_timesteps=False
# )

# val_ds_tft_interpol_hyraum1_mqf2 = TimeSeriesDataSet.from_dataset(train_ds_tft_interpol_hyraum1_mqf2, val_hyraum1)

# train_ds_tft_interpol_hyraum1_mqf2.save(os.path.join(RESULT_PATH, 'preprocessing', 'train_ds_full_interpol_tft_hyraum1_mqf2.pt'))
# val_ds_tft_interpol_hyraum1_mqf2.save(os.path.join(RESULT_PATH, 'preprocessing', 'val_ds_full_interpol_tft_hyraum1_mqf2.pt'))
# train_ds_tft_interpol_hyraum1_mqf2.save('J:/B22-FISHy/NUTZER/Kunz.S/kimodis_preprocessed/train_ds_full_interpol_tft_hyraum1_mqf2.pt')
# val_ds_tft_interpol_hyraum1_mqf2.save('J:/B22-FISHy/NUTZER/Kunz.S/kimodis_preprocessed/val_ds_full_interpol_tft_hyraum1_mqf2.pt')

In [None]:
# Checking the TimeSeriesDataSets

# train_ds_tft.get_parameters()
# dataloader = train_ds_tft.to_dataloader(batch_size=4)

# and load the first batch
# x, y = next(iter(dataloader))
# print("x =", x)
# print("\ny =", y)
# print("\nsizes of x =")
# for key, value in x.items():
#     print(f"\t{key} = {value.size()}")
    
# val_ds_tft.get_parameters()
# # convert the dataset to a dataloader
# dataloader = val_ds_tft.to_dataloader(batch_size=4)

# # and load the first batch
# x, y = next(iter(dataloader))
# print("x =", x)
# print("\ny =", y)
# print("\nsizes of x =")
# for key, value in x.items():
#     print(f"\t{key} = {value.size()}")

In [14]:
# TFT (purely dynamic) dataset mqf2.
# No static features at all; inputs are the known dynamic covariates plus the
# groundwater level ('gwl') as time-varying unknown real.
tft_dyn_known_reals = ['humid', 'precip', 'temp', 'sin_temp', 'lai', 'day_sin', 'day_cos']

train_ds_tft_dyn_mqf2 = TimeSeriesDataSet(
    train_df,
    time_idx='time_idx',
    target='gwl',
    group_ids=['proj_id'],
    # fixed-length encoder / prediction windows
    min_encoder_length=LAG,
    max_encoder_length=LAG,
    min_prediction_length=LEAD,
    max_prediction_length=LEAD,
    static_categoricals=[],
    static_reals=[],
    time_varying_known_reals=tft_dyn_known_reals,
    time_varying_unknown_reals=['gwl'],
    add_target_scales=False,
    allow_missing_timesteps=False,
)

# Validation set: same dataset definition, applied to the validation frame.
val_ds_tft_dyn_mqf2 = TimeSeriesDataSet.from_dataset(train_ds_tft_dyn_mqf2, val_df)

# Persist both datasets below <RESULT_PATH>/preprocessing.
tft_dyn_out_dir = os.path.join(RESULT_PATH, 'preprocessing')
train_ds_tft_dyn_mqf2.save(os.path.join(tft_dyn_out_dir, 'train_ds_dyn_tft_mqf2.pt'))
val_ds_tft_dyn_mqf2.save(os.path.join(tft_dyn_out_dir, 'val_ds_dyn_tft_mqf2.pt'))

# TFT dyn Hyraum 1 mqf2
# train_ds_tft_dyn_hyraum1_mqf2 = TimeSeriesDataSet(
#     train_hyraum1,
#     group_ids=["proj_id"],
#     target="gwl",
#     time_idx="time_idx",
#     min_encoder_length=LAG,
#     max_encoder_length=LAG,
#     min_prediction_length=LEAD,
#     max_prediction_length=LEAD,
#     static_reals=[],
#     static_categoricals=[],
#     time_varying_unknown_reals=['gwl'],
#     time_varying_known_reals=['humid',
#                               'precip',
#                               'temp',
#                               'sin_temp',
#                               'lai',
#                               'day_sin',
#                               'day_cos'],
#     add_target_scales=False, 
#     allow_missing_timesteps=False
# )

# val_ds_tft_dyn_hyraum1_mqf2 = TimeSeriesDataSet.from_dataset(train_ds_tft_dyn_hyraum1_mqf2, val_hyraum1)

# train_ds_tft_dyn_hyraum1_mqf2.save(os.path.join(RESULT_PATH, 'preprocessing', 'train_ds_dyn_tft_hyraum1_mqf2.pt'))
# val_ds_tft_dyn_hyraum1_mqf2.save(os.path.join(RESULT_PATH, 'preprocessing', 'val_ds_dyn_tft_hyraum1_mqf2.pt'))

In [4]:
# TFT without groundwater level information and without static information
# train_ds_tft_interpol_dyn_mqf2 = TimeSeriesDataSet(
#     train_df,
#     group_ids=["proj_id"],
#     target="gwl",
#     time_idx="time_idx",
#     min_encoder_length=LAG,
#     max_encoder_length=LAG,
#     min_prediction_length=LEAD,
#     max_prediction_length=LEAD,
#     time_varying_unknown_reals=[],
#     time_varying_known_reals=['humid',
#                               'precip',
#                               'temp',
#                               'sin_temp',
#                               'lai',
#                               'day_sin',
#                               'day_cos'],
#     add_target_scales=False, # Adds center and scale of the unnormalized timeseries as feature to static reals
#     allow_missing_timesteps=False
# )

# val_ds_tft_interpol_dyn_mqf2 = TimeSeriesDataSet.from_dataset(train_ds_tft_interpol_dyn_mqf2, val_df)

# train_ds_tft_interpol_dyn_mqf2.save(os.path.join(RESULT_PATH, 'preprocessing', 'train_ds_dyn_interpol_tft_mqf2.pt'))
# val_ds_tft_interpol_dyn_mqf2.save(os.path.join(RESULT_PATH, 'preprocessing', 'val_ds_dyn_interpol_tft_mqf2.pt'))
# train_ds_tft_interpol_dyn_mqf2.save('J:/B22-FISHy/NUTZER/Kunz.S/kimodis_preprocessed/train_ds_dyn_interpol_tft_mqf2.pt')
# val_ds_tft_interpol_dyn_mqf2.save('J:/B22-FISHy/NUTZER/Kunz.S/kimodis_preprocessed/val_ds_dyn_interpol_tft_mqf2.pt')

### N-HiTS

In [15]:
# N-HiTS_full
# Static attributes, known dynamic covariates and the groundwater level as input.
# min_prediction_length is set to max_prediction_length (both LEAD), and the
# encoder length is likewise fixed to LAG.
nhits_static_reals = [
    'twi', 'gw_recharge', 'landform_sha10km',
    'eumohp_dsd1', 'eumohp_lp1', 'eumohp_sd1', 'elevation',
]
nhits_static_categoricals = [
    'hyraum_gr', 'land_cover', 'aquifer_type', 'permeability_coef', 'soil_texture',
]
nhits_known_reals = [
    'humid', 'precip', 'temp', 'sin_temp', 'lai', 'day_sin', 'day_cos',
]

train_ds_nhits = TimeSeriesDataSet(
    train_df,
    time_idx='time_idx',
    target='gwl',
    group_ids=['proj_id'],
    min_encoder_length=LAG,
    max_encoder_length=LAG,
    min_prediction_length=LEAD,
    max_prediction_length=LEAD,
    static_categoricals=nhits_static_categoricals,
    static_reals=nhits_static_reals,
    time_varying_known_reals=nhits_known_reals,
    time_varying_unknown_reals=['gwl'],
    # Adds center and scale of the unnormalized timeseries as feature to static reals
    add_target_scales=True,
    allow_missing_timesteps=False,
)

# Validation set: same dataset definition, applied to the validation frame.
val_ds_nhits = TimeSeriesDataSet.from_dataset(train_ds_nhits, val_df)

# Persist both datasets below <RESULT_PATH>/preprocessing.
nhits_out_dir = os.path.join(RESULT_PATH, 'preprocessing')
train_ds_nhits.save(os.path.join(nhits_out_dir, 'train_ds_full_nhits.pt'))
val_ds_nhits.save(os.path.join(nhits_out_dir, 'val_ds_full_nhits.pt'))

# Hyraum 1
# train_ds_nhits_hyraum1 = TimeSeriesDataSet(
#     train_hyraum1,
#     group_ids=['proj_id'],
#     target='gwl',
#     time_idx='time_idx',
#     min_encoder_length=LAG,
#     max_encoder_length=LAG,
#     min_prediction_length=LEAD,
#     max_prediction_length=LEAD,
#     static_reals=['twi',
#                   'gw_recharge',
#                   'landform_sha10km',
#                   'eumohp_dsd1',
#                   'eumohp_lp1',
#                   'eumohp_sd1',
#                   'elevation'],
#     static_categoricals=['hyraum_gr',
#                          'land_cover', 
#                          'aquifer_type', 
#                          'permeability_coef',
#                          'soil_texture'],
#     time_varying_unknown_reals=['gwl'],
#     time_varying_known_reals=['humid',
#                               'precip',
#                               'temp',
#                               'sin_temp',
#                               'lai',
#                               'day_sin',
#                               'day_cos'],
#     add_target_scales=True, # Adds center and scale of the unnormalized timeseries as feature to static reals
#     allow_missing_timesteps=False
# )

# val_ds_nhits_hyraum1 = TimeSeriesDataSet.from_dataset(train_ds_nhits_hyraum1, val_hyraum1)

# train_ds_nhits_hyraum1.save(os.path.join(RESULT_PATH, 'preprocessing', 'train_ds_full_nhits_hyraum1.pt'))
# val_ds_nhits_hyraum1.save(os.path.join(RESULT_PATH, 'preprocessing', 'val_ds_full_nhits_hyraum1.pt'))
# train_ds_nhits_hyraum1.save('J:/B22-FISHy/NUTZER/Kunz.S/kimodis_preprocessed/train_ds_full_nhits_hyraum1.pt')
# val_ds_nhits_hyraum1.save('J:/B22-FISHy/NUTZER/Kunz.S/kimodis_preprocessed/val_ds_full_nhits_hyraum1.pt')

In [17]:
# N-HiTS_dyn only with dynamic input (with GWL)
# No static features and no target-scale features (add_target_scales=False);
# inputs are the known dynamic covariates plus the groundwater level.
nhits_dyn_known_reals = ['humid', 'precip', 'temp', 'sin_temp', 'lai', 'day_sin', 'day_cos']

train_ds_nhits_dyn = TimeSeriesDataSet(
    train_df,
    time_idx='time_idx',
    target='gwl',
    group_ids=['proj_id'],
    # fixed-length encoder / prediction windows
    min_encoder_length=LAG,
    max_encoder_length=LAG,
    min_prediction_length=LEAD,
    max_prediction_length=LEAD,
    static_categoricals=[],
    static_reals=[],
    time_varying_known_reals=nhits_dyn_known_reals,
    time_varying_unknown_reals=['gwl'],
    add_target_scales=False,
    allow_missing_timesteps=False,
)

# Validation set: same dataset definition, applied to the validation frame.
val_ds_nhits_dyn = TimeSeriesDataSet.from_dataset(train_ds_nhits_dyn, val_df)

# Persist both datasets below <RESULT_PATH>/preprocessing.
nhits_dyn_out_dir = os.path.join(RESULT_PATH, 'preprocessing')
train_ds_nhits_dyn.save(os.path.join(nhits_dyn_out_dir, 'train_ds_dyn_nhits.pt'))
val_ds_nhits_dyn.save(os.path.join(nhits_dyn_out_dir, 'val_ds_dyn_nhits.pt'))


# Hyraum 1
# train_ds_nhits_dyn_hyraum1 = TimeSeriesDataSet(
#     train_hyraum1,
#     group_ids=["proj_id"],
#     target="gwl",
#     time_idx="time_idx",
#     min_encoder_length=LAG,
#     max_encoder_length=LAG,
#     min_prediction_length=LEAD,
#     max_prediction_length=LEAD,
#     static_reals=[],
#     static_categoricals=[],
#     time_varying_unknown_reals=['gwl'],
#     time_varying_known_reals=['humid',
#                               'precip',
#                               'temp',
#                               'sin_temp',
#                               'lai',
#                               'day_sin',
#                               'day_cos'],
#     add_target_scales=False, # Adds center and scale of the unnormalized timeseries as feature to static reals
#     allow_missing_timesteps=False
# )

# val_ds_nhits_dyn_hyraum1 = TimeSeriesDataSet.from_dataset(train_ds_nhits_dyn_hyraum1, val_hyraum1)

# train_ds_nhits_dyn_hyraum1.save(os.path.join(RESULT_PATH, 'preprocessing', 'train_ds_dyn_nhits_hyraum1.pt'))
# val_ds_nhits_dyn_hyraum1.save(os.path.join(RESULT_PATH, 'preprocessing', 'val_ds_dyn_nhits_hyraum1.pt'))
# train_ds_nhits_dyn_hyraum1.save('J:/B22-FISHy/NUTZER/Kunz.S/kimodis_preprocessed/train_ds_dyn_nhits_hyraum1.pt')
# val_ds_nhits_dyn_hyraum1.save('J:/B22-FISHy/NUTZER/Kunz.S/kimodis_preprocessed/val_ds_dyn_nhits_hyraum1.pt')