### Develop preprocessing pipeline

The complete preprocessing pipeline is then put into separate functions.

In [1]:
# Tech preamble:
import pandas as pd
import numpy as np

from predict_sahel_rainfall.preprocessing import (
    load_data,
    split_sequence,
    train_val_test_split,
    scale_norm_inputs,
    prepare_inputs_and_target,
)

In [22]:
# Location of the csv file with the CICMoD indices (pinned to one release):
data_url = (
    "https://github.com/MarcoLandtHayen/climate_index_collection/releases/"
    "download/v2023.03.29.1/climate_indices.csv"
)

# Earth system model to take the indices from, either "CESM" or "FOCI":
ESM = "CESM"

# Index used as prediction target:
target_index = "PREC_SAHEL"

# Indices used as input features (the order given here is kept as is):
input_features = [
    "AMO",
    "ENSO_12",
    "ENSO_3",
    "ENSO_34",
    "ENSO_4",
    "NAO_PC",
    "NAO_ST",
    "NP",
    "PDO_PC",
    "PREC_SAHEL",
    "SAM_PC",
    "SAM_ZM",
    "SAT_N_ALL",
    "SAT_N_LAND",
    "SAT_N_OCEAN",
    "SAT_S_ALL",
    "SAT_S_LAND",
    "SAT_S_OCEAN",
    "SOI",
    "SSS_ENA",
    "SSS_NA",
    "SSS_SA",
    "SSS_WNA",
    "SST_ESIO",
    "SST_HMDR",
    "SST_MED",
    "SST_TNA",
    "SST_TSA",
    "SST_WSIO",
]

# Whether months are added as one-hot encoded input features:
add_months = True

# Whether the target index is normalized:
norm_target = True

# Lead time for the target index:
lead_time = 1

# Length of each input sequence:
input_length = 24

# Amount of combined training and validation data relative to test data:
train_test_split = 0.9

# Relative amount of the combined training and validation data used for training:
train_val_split = 0.8

# Optional scaling / normalization of the input features, based on statistics
# from the training data. Available choices:
#   'no':       keep raw input features.
#   'scale_01': min/max scaling of input features to [0,1].
#   'scale_11': min/max scaling of input features to [-1,1].
#   'norm':     normalize input features (subtract mean, divide by std dev).
scale_norm = "scale_01"

In [23]:
# Run the preprocessing pipeline with the settings chosen above and unpack its
# ten return values: input and target for the train, val and test splits,
# followed by the values bound to train_mean, train_std, train_min, train_max.
(
    train_input, train_target,
    val_input, val_target,
    test_input, test_target,
    train_mean, train_std, train_min, train_max,
) = prepare_inputs_and_target(
    data_url=data_url, ESM=ESM,
    target_index=target_index, input_features=input_features,
    add_months=add_months, norm_target=norm_target,
    lead_time=lead_time, input_length=input_length,
    train_test_split=train_test_split, train_val_split=train_val_split,
    scale_norm=scale_norm,
)

In [24]:
# Report the array shape of every split: first all inputs, then all targets.
input_splits = {"train": train_input, "val": val_input, "test": test_input}
target_splits = {"train": train_target, "val": val_target, "test": test_target}

for split, inputs in input_splits.items():
    print(f"{split}_input shape (samples, time steps, features): ", inputs.shape)

# Empty line separating the input shapes from the target shapes.
print()
for split, targets in target_splits.items():
    print(f"{split}_target shape (samples, 1): ", targets.shape)

train_input shape (samples, time steps, features):  (8630, 1, 41)
val_input shape (samples, time steps, features):  (2158, 1, 41)
test_input shape (samples, time steps, features):  (1199, 1, 41)

train_target shape (samples, 1):  (8630, 1)
val_target shape (samples, 1):  (2158, 1)
test_target shape (samples, 1):  (1199, 1)


In [10]:
# Print min, max, mean and std dev of the inputs for each split. The reduction
# runs over the first two axes, and results are rounded to two decimals.
reducers = (("MIN", np.min), ("MAX", np.max), ("MEAN", np.mean), ("STD", np.std))
splits = (("train", train_input), ("val", val_input), ("test", test_input))

for split, inputs in splits:
    for label, reduce_fn in reducers:
        print(f"{split}_input {label}: ", np.round(reduce_fn(inputs, axis=(0, 1)), 2))

train_input MIN:  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
train_input MAX:  [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
train_input MEAN:  [0.46 0.4  0.43 0.49 0.58 0.41 0.39 0.48 0.56 0.37 0.59 0.61 0.5  0.49
 0.57 0.52 0.42 0.58 0.66 0.49 0.52 0.48 0.52 0.42 0.48 0.51 0.51 0.46
 0.45 0.08 0.08 0.08 0.08 0.08 0.08 0.08 0.08 0.08 0.08 0.08 0.08]
train_input STD:  [0.13 0.14 0.16 0.17 0.18 0.12 0.12 0.12 0.15 0.08 0.12 0.13 0.12 0.11
 0.11 0.11 0.12 0.11 0.11 0.15 0.15 0.16 0.16 0.14 0.11 0.11 0.11 0.12
 0.14 0.28 0.28 0.28 0.28 0.28 0.28 0.28 0.28 0.28 0.28 0.28 0.28]
val_input MIN:  [-0.03  0.    0.02 -0.01 -0.01  0.08  0.04 -0.06  0.11  0.03  0.04  0.05
  0.05  0.12  0.14  0.05  0.03  0.08  0.06 -0.08 -0.14 -0.03 -0.15 -0.09
  0.13  0.09  0.15  0.09 -0.01  0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.  