In [7]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

from _config import PKL_PROCESSED_STEP2_DTU_SOLAR_STATION

TARGETS = ['DNI', 'DHI']
INPUT_SEQ_LEN = 60  # 1 hour

In [8]:
df = pd.read_pickle(PKL_PROCESSED_STEP2_DTU_SOLAR_STATION)
df.dropna(inplace=True)
minmax_scaler = MinMaxScaler()
features = [x for x in df.columns if x not in TARGETS]
df[features] = minmax_scaler.fit_transform(df[features])
df = df[sorted(df.columns)]

print(df.shape)
print(df.info())
df.head()

(2460682, 10)
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2460682 entries, 2015-03-12 06:36:00 to 2025-01-01 08:04:00
Data columns (total 10 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   DHI                       float64
 1   DNI                       float64
 2   air_pressure_lag_60       float64
 3   air_temperature_lag_60    float64
 4   rain_duration_lag_60      float64
 5   rain_intensity_lag_60     float64
 6   relative_humidity_lag_60  float64
 7   solar_altitude_lag_60     float64
 8   wind_dir_avg_lag_60       float64
 9   wind_speed_avg_lag_60     float64
dtypes: float64(10)
memory usage: 206.5 MB
None


Unnamed: 0,DHI,DNI,air_pressure_lag_60,air_temperature_lag_60,rain_duration_lag_60,rain_intensity_lag_60,relative_humidity_lag_60,solar_altitude_lag_60,wind_dir_avg_lag_60,wind_speed_avg_lag_60
2015-03-12 06:36:00,27.93,550.361984,0.802469,0.242206,0.0,0.0,0.845311,8.5e-05,0.21193,0.195888
2015-03-12 06:37:00,28.29,556.151177,0.802469,0.242206,0.0,0.0,0.848965,0.00209,0.301688,0.164265
2015-03-12 06:38:00,28.7,563.337782,0.802469,0.242206,0.0,0.0,0.846529,0.004117,0.852706,0.182335
2015-03-12 06:39:00,29.17,569.626158,0.802469,0.242206,0.0,0.0,0.845311,0.006165,0.852706,0.182335
2015-03-12 06:40:00,29.48,575.615184,0.802469,0.242206,0.0,0.0,0.844093,0.008233,0.752975,0.204923


In [9]:
# train_df = df.loc[:'2020']
# val_df = df.loc['2021':'2022']
# test_df = df.loc['2023':]
# print(train_df.shape, val_df.shape, test_df.shape)
# df = test_df.copy()

In [10]:
# limit the data to years 2023-2024
df = df.loc['2023':]
SPLIT = (0.7, 0.85)  # 65-85% for training; next 20% for validation; remaining 15% for test
total_samples = df.shape[0]
train_end = int(SPLIT[0] * total_samples)  # e.g., 65% for training
val_end = int(SPLIT[1] * total_samples)  # e.g., next 20% for validation

train_indices = list(range(0, train_end))
val_indices = list(range(train_end, val_end))
test_indices = list(range(val_end, total_samples))
print(f"Train indices: {train_indices[:5]} ... {train_indices[-5:]}")
print(f"Validation indices: {val_indices[:5]} ... {val_indices[-5:]}")
print(f"Test indices: {test_indices[:5]} ... {test_indices[-5:]}")
print(f'Train size: {len(train_indices)}')
print(f'Validation size: {len(val_indices)}')
print(f'Test size: {len(test_indices)}')
train_df = df.iloc[train_indices]
val_df = df.iloc[val_indices]
test_df = df.iloc[test_indices]

Train indices: [0, 1, 2, 3, 4] ... [365390, 365391, 365392, 365393, 365394]
Validation indices: [365395, 365396, 365397, 365398, 365399] ... [443689, 443690, 443691, 443692, 443693]
Test indices: [443694, 443695, 443696, 443697, 443698] ... [521989, 521990, 521991, 521992, 521993]
Train size: 365395
Validation size: 78299
Test size: 78300


In [11]:
import numpy as np


def create_sequences(X, y, timestamps, input_seq_len, rolling=True):
    X_tmp, y_tmp = [], []
    total_length = len(y)
    # Define the threshold as a timedelta (here, in minutes)
    gap_threshold_timedelta = np.timedelta64(1, 'm')

    # Slide over the data
    i = 0
    while i < total_length - input_seq_len:
        # Get timestamps for the entire sequence (input and forecast)
        seq_timestamps = timestamps[i: i + input_seq_len]
        # Calculate differences between consecutive timestamps
        time_diffs = np.diff(seq_timestamps)
        # If any gap is larger than the allowed threshold, skip this sequence
        if np.any(time_diffs > gap_threshold_timedelta):
            if rolling:
                i += 1
            else:
                i += input_seq_len  # Skip to the end of the current sequence
            continue
        # Otherwise, create the sequence as before
        X_tmp.append(X[i: i + input_seq_len])
        y_tmp.append(y[i: i + input_seq_len])
        if rolling:
            i += 1
        else:
            i += input_seq_len  # Skip to the end of the current sequence
    return np.array(X_tmp), np.array(y_tmp)

In [12]:
from _config import DATA_ROOT

# Training data
X = train_df.drop(columns=TARGETS)
y = train_df[TARGETS]
print('Before sequencing:', X.shape, y.shape)
X, y = create_sequences(X.to_numpy(), y.to_numpy(), df.index.to_numpy(), INPUT_SEQ_LEN)
print('After sequencing:', X.shape, y.shape)
# save the data
np.savez_compressed(DATA_ROOT / 'train.npz', X=X, y=y)

# Validation data
X = val_df.drop(columns=TARGETS)
y = val_df[TARGETS]
print('Before sequencing:', X.shape, y.shape)
X, y = create_sequences(X.to_numpy(), y.to_numpy(), df.index.to_numpy(), INPUT_SEQ_LEN, rolling=False)
print('After sequencing:', X.shape, y.shape)
# save the data
np.savez_compressed(DATA_ROOT / 'val.npz', X=X, y=y)

# Test data
X = test_df.drop(columns=TARGETS)
y = test_df[TARGETS]
print('Before sequencing:', X.shape, y.shape)
X, y = create_sequences(X.to_numpy(), y.to_numpy(), df.index.to_numpy(), INPUT_SEQ_LEN, rolling=False)
print('After sequencing:', X.shape, y.shape)
# save the data
np.savez_compressed(DATA_ROOT / 'test.npz', X=X, y=y)

Before sequencing: (365395, 8) (365395, 2)
After sequencing: (335245, 60, 8) (335245, 60, 2)
Before sequencing: (78299, 8) (78299, 2)
After sequencing: (1185, 60, 8) (1185, 60, 2)
Before sequencing: (78300, 8) (78300, 2)
After sequencing: (1185, 60, 8) (1185, 60, 2)


In [13]:
# load the data
train = np.load(DATA_ROOT / 'train.npz')
print(train['X'].shape, train['y'].shape)

(335245, 60, 8) (335245, 60, 2)
