In [11]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

from _config import PKL_PROCESSED_STEP2_DTU_SOLAR_STATION

TARGETS = ['DNI', 'DHI']
INPUT_SEQ_LEN = 60  # 1 hour

In [12]:
df = pd.read_pickle(PKL_PROCESSED_STEP2_DTU_SOLAR_STATION)
df.dropna(inplace=True)
minmax_scaler = MinMaxScaler()
features = [x for x in df.columns if x not in TARGETS]
df[features] = minmax_scaler.fit_transform(df[features])
df = df[sorted(df.columns)]

print(df.shape)
print(df.info())
df.head()

(4787572, 10)
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4787572 entries, 2015-03-12 01:00:00 to 2025-01-01 08:04:00
Data columns (total 10 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   DHI                       float64
 1   DNI                       float64
 2   air_pressure_lag_60       float64
 3   air_temperature_lag_60    float64
 4   rain_duration_lag_60      float64
 5   rain_intensity_lag_60     float64
 6   relative_humidity_lag_60  float64
 7   solar_altitude_lag_60     float64
 8   wind_dir_avg_lag_60       float64
 9   wind_speed_avg_lag_60     float64
dtypes: float64(10)
memory usage: 401.8 MB
None


Unnamed: 0,DHI,DNI,air_pressure_lag_60,air_temperature_lag_60,rain_duration_lag_60,rain_intensity_lag_60,relative_humidity_lag_60,solar_altitude_lag_60,wind_dir_avg_lag_60,wind_speed_avg_lag_60
2015-03-12 01:00:00,0.0,0.0,0.801865,0.321343,0.0,0.0,0.83922,0.178065,0.129651,0.14104
2015-03-12 01:01:00,0.0,0.0,0.801865,0.321343,0.0,0.0,0.835566,0.178336,0.206943,0.134382
2015-03-12 01:02:00,0.0,0.0,0.801865,0.321343,0.0,0.0,0.835566,0.178614,0.132145,0.134382
2015-03-12 01:03:00,0.0,0.0,0.801865,0.321343,0.0,0.0,0.836784,0.178899,0.127158,0.137711
2015-03-12 01:04:00,0.0,0.0,0.801865,0.321343,0.0,0.0,0.834348,0.17919,0.109705,0.134382


In [13]:
# limit the data to years 2020-2024
# df = df.loc['2023':]

In [14]:
SPLIT = (0.7, 0.85)  # 65-85% for training; next 20% for validation; remaining 15% for test
total_samples = df.shape[0]
train_end = int(SPLIT[0] * total_samples)  # e.g., 65% for training
val_end = int(SPLIT[1] * total_samples)  # e.g., next 20% for validation

train_indices = list(range(0, train_end))
val_indices = list(range(train_end, val_end))
test_indices = list(range(val_end, total_samples))
print(f"Train indices: {train_indices[:5]} ... {train_indices[-5:]}")
print(f"Validation indices: {val_indices[:5]} ... {val_indices[-5:]}")
print(f"Test indices: {test_indices[:5]} ... {test_indices[-5:]}")
print(f'Train size: {len(train_indices)}')
print(f'Validation size: {len(val_indices)}')
print(f'Test size: {len(test_indices)}')

Train indices: [0, 1, 2, 3, 4] ... [3351295, 3351296, 3351297, 3351298, 3351299]
Validation indices: [3351300, 3351301, 3351302, 3351303, 3351304] ... [4069431, 4069432, 4069433, 4069434, 4069435]
Test indices: [4069436, 4069437, 4069438, 4069439, 4069440] ... [4787567, 4787568, 4787569, 4787570, 4787571]
Train size: 3351300
Validation size: 718136
Test size: 718136


In [15]:
import numpy as np


def create_sequences(X, y, timestamps, input_seq_len, rolling=True):
    X_tmp, y_tmp = [], []
    total_length = len(y)
    # Define the threshold as a timedelta (here, in minutes)
    gap_threshold_timedelta = np.timedelta64(1, 'm')

    # Slide over the data
    i = 0
    while i < total_length - input_seq_len:
        # Get timestamps for the entire sequence (input and forecast)
        seq_timestamps = timestamps[i: i + input_seq_len]
        # Calculate differences between consecutive timestamps
        time_diffs = np.diff(seq_timestamps)
        # If any gap is larger than the allowed threshold, skip this sequence
        if np.any(time_diffs > gap_threshold_timedelta):
            if rolling:
                i += 1
            else:
                i += input_seq_len  # Skip to the end of the current sequence
            continue
        # Otherwise, create the sequence as before
        X_tmp.append(X[i: i + input_seq_len])
        y_tmp.append(y[i: i + input_seq_len])
        if rolling:
            i += 1
        else:
            i += input_seq_len  # Skip to the end of the current sequence
    return np.array(X_tmp), np.array(y_tmp)

In [16]:
# Training data
X = df.iloc[train_indices].drop(columns=TARGETS)
y = df.iloc[train_indices][TARGETS]
print('Before sequencing:', X.shape, y.shape)
X, y = create_sequences(X.to_numpy(), y.to_numpy(), df.index.to_numpy(), INPUT_SEQ_LEN)
print('After sequencing:', X.shape, y.shape)
# save the data
np.savez_compressed('data/train.npz', X=X, y=y)

# Validation data
X = df.iloc[val_indices].drop(columns=TARGETS)
y = df.iloc[val_indices][TARGETS]
print('Before sequencing:', X.shape, y.shape)
X, y = create_sequences(X.to_numpy(), y.to_numpy(), df.index.to_numpy(), INPUT_SEQ_LEN, rolling=False)
print('After sequencing:', X.shape, y.shape)
# save the data
np.savez_compressed('data/val.npz', X=X, y=y)

# Test data
X = df.iloc[test_indices].drop(columns=TARGETS)
y = df.iloc[test_indices][TARGETS]
print('Before sequencing:', X.shape, y.shape)
X, y = create_sequences(X.to_numpy(), y.to_numpy(), df.index.to_numpy(), INPUT_SEQ_LEN, rolling=False)
print('After sequencing:', X.shape, y.shape)
# save the data
np.savez_compressed('data/test.npz', X=X, y=y)

Before sequencing: (3351300, 8) (3351300, 2)
After sequencing: (3349044, 60, 8) (3349044, 60, 2)
Before sequencing: (718136, 8) (718136, 2)
After sequencing: (11945, 60, 8) (11945, 60, 2)
Before sequencing: (718136, 8) (718136, 2)
After sequencing: (11945, 60, 8) (11945, 60, 2)


In [17]:
# # load the data
# train = np.load('data/train.npz')
# print(train['X'].shape, train['y'].shape)