In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

from _config import PKL_PROCESSED_STEP2_DTU_SOLAR_STATION

TARGETS = ['DNI', 'DHI']
INPUT_SEQ_LEN = 60  # 1 hour

In [2]:
df = pd.read_pickle(PKL_PROCESSED_STEP2_DTU_SOLAR_STATION)
df.dropna(inplace=True)
minmax_scaler = MinMaxScaler()
features = [x for x in df.columns if x not in TARGETS]
df[features] = minmax_scaler.fit_transform(df[features])
df = df[sorted(df.columns)]

print(df.shape)
print(df.info())
df.head()

(2460682, 18)
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2460682 entries, 2015-03-12 06:36:00 to 2025-01-01 08:04:00
Data columns (total 18 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   DHI                       float64
 1   DNI                       float64
 2   air_pressure              float64
 3   air_pressure_lag_60       float64
 4   air_temperature           float64
 5   air_temperature_lag_60    float64
 6   rain_duration             float64
 7   rain_duration_lag_60      float64
 8   rain_intensity            float64
 9   rain_intensity_lag_60     float64
 10  relative_humidity         float64
 11  relative_humidity_lag_60  float64
 12  solar_altitude            float64
 13  solar_altitude_lag_60     float64
 14  wind_dir_avg              float64
 15  wind_dir_avg_lag_60       float64
 16  wind_speed_avg            float64
 17  wind_speed_avg_lag_60     float64
dtypes: float64(18)
memory usage: 356.7 MB
None


Unnamed: 0,DHI,DNI,air_pressure,air_pressure_lag_60,air_temperature,air_temperature_lag_60,rain_duration,rain_duration_lag_60,rain_intensity,rain_intensity_lag_60,relative_humidity,relative_humidity_lag_60,solar_altitude,solar_altitude_lag_60,wind_dir_avg,wind_dir_avg_lag_60,wind_speed_avg,wind_speed_avg_lag_60
2015-03-12 06:36:00,27.93,550.361984,0.814815,0.984674,0.275779,0.242206,0.0,0.0,0.0,0.0,0.775883,0.865323,0.135375,8.5e-05,0.787881,0.21193,0.182335,0.197674
2015-03-12 06:37:00,28.29,556.151177,0.814815,0.984674,0.278177,0.242206,0.0,0.0,0.0,0.0,0.774665,0.868505,0.137669,0.00209,0.720562,0.301688,0.19137,0.165763
2015-03-12 06:38:00,28.7,563.337782,0.814815,0.984674,0.278177,0.242206,0.0,0.0,0.0,0.0,0.773447,0.866384,0.139961,0.004117,0.705602,0.852706,0.186853,0.183998
2015-03-12 06:39:00,29.17,569.626158,0.814815,0.984674,0.278177,0.242206,0.0,0.0,0.0,0.0,0.774665,0.865323,0.142252,0.006165,0.767934,0.852706,0.168783,0.183998
2015-03-12 06:40:00,29.48,575.615184,0.814815,0.984674,0.278177,0.242206,0.0,0.0,0.0,0.0,0.771011,0.864263,0.14454,0.008233,0.044879,0.752975,0.1733,0.206791


In [9]:
train_df = df.loc['2022']
val_df = df.loc['2023']
test_df = df.loc['2024']
print(train_df.shape, val_df.shape, test_df.shape)

(268187, 18) (268219, 18) (253753, 18)


In [12]:
# # limit the data to years 2023-2024
# df = df.loc['2023':]
# SPLIT = (0.7, 0.85)  # 65-85% for training; next 20% for validation; remaining 15% for test
# total_samples = df.shape[0]
# train_end = int(SPLIT[0] * total_samples)  # e.g., 65% for training
# val_end = int(SPLIT[1] * total_samples)  # e.g., next 20% for validation
#
# train_indices = list(range(0, train_end))
# val_indices = list(range(train_end, val_end))
# test_indices = list(range(val_end, total_samples))
# print(f"Train indices: {train_indices[:5]} ... {train_indices[-5:]}")
# print(f"Validation indices: {val_indices[:5]} ... {val_indices[-5:]}")
# print(f"Test indices: {test_indices[:5]} ... {test_indices[-5:]}")
# print(f'Train size: {len(train_indices)}')
# print(f'Validation size: {len(val_indices)}')
# print(f'Test size: {len(test_indices)}')
# train_df = df.iloc[train_indices]
# val_df = df.iloc[val_indices]
# test_df = df.iloc[test_indices]

In [10]:
import numpy as np


def create_sequences(X, y, timestamps, input_seq_len, rolling=True):
    X_tmp, y_tmp = [], []
    total_length = len(y)
    # Define the threshold as a timedelta (here, in minutes)
    gap_threshold_timedelta = np.timedelta64(1, 'm')

    # Slide over the data
    i = 0
    while i < total_length - input_seq_len:
        # Get timestamps for the entire sequence (input and forecast)
        seq_timestamps = timestamps[i: i + input_seq_len]
        # Calculate differences between consecutive timestamps
        time_diffs = np.diff(seq_timestamps)
        # If any gap is larger than the allowed threshold, skip this sequence
        if np.any(time_diffs > gap_threshold_timedelta):
            if rolling:
                i += 1
            else:
                i += input_seq_len  # Skip to the end of the current sequence
            continue
        # Otherwise, create the sequence as before
        X_tmp.append(X[i: i + input_seq_len])
        y_tmp.append(y[i: i + input_seq_len])
        if rolling:
            i += 1
        else:
            i += input_seq_len  # Skip to the end of the current sequence
    return np.array(X_tmp), np.array(y_tmp)

In [13]:
from _config import DATA_ROOT

# Training data
X = train_df.drop(columns=TARGETS)
y = train_df[TARGETS]
print('Before sequencing:', X.shape, y.shape)
X, y = create_sequences(X.to_numpy(), y.to_numpy(), df.index.to_numpy(), INPUT_SEQ_LEN)
print('After sequencing:', X.shape, y.shape)
# save the data
np.savez_compressed(DATA_ROOT / 'train.npz', X=X, y=y)

# Validation data
X = val_df.drop(columns=TARGETS)
y = val_df[TARGETS]
print('Before sequencing:', X.shape, y.shape)
X, y = create_sequences(X.to_numpy(), y.to_numpy(), df.index.to_numpy(), INPUT_SEQ_LEN, rolling=True)
print('After sequencing:', X.shape, y.shape)
# save the data
np.savez_compressed(DATA_ROOT / 'val.npz', X=X, y=y)

# Test data
X = test_df.drop(columns=TARGETS)
y = test_df[TARGETS]
print('Before sequencing:', X.shape, y.shape)
X, y = create_sequences(X.to_numpy(), y.to_numpy(), df.index.to_numpy(), INPUT_SEQ_LEN, rolling=True)
print('After sequencing:', X.shape, y.shape)
# save the data
np.savez_compressed(DATA_ROOT / 'test.npz', X=X, y=y)

Before sequencing: (268187, 16) (268187, 2)
After sequencing: (245728, 60, 16) (245728, 60, 2)
Before sequencing: (268219, 16) (268219, 2)
After sequencing: (245760, 60, 16) (245760, 60, 2)
Before sequencing: (253753, 16) (253753, 2)
After sequencing: (232415, 60, 16) (232415, 60, 2)


In [12]:
# load the data
train = np.load(DATA_ROOT / 'train.npz')
print(train['X'].shape, train['y'].shape)

(245728, 60, 16) (245728, 60, 2)
