# Imports

In [6]:
import pandas as pd
from feature_engine.timeseries.forecasting import LagFeatures

from _config import PKL_PROCESSED_STEP1_DTU_SOLAR_STATION, PKL_PROCESSED_STEP2_DTU_SOLAR_STATION

# Load (Preprocessed Step 1) DTU Solar Station data

In [7]:
# Load the step-1 preprocessed station data and order the columns alphabetically
# so that later cells see a stable column layout.
# NOTE(review): read_pickle can execute arbitrary code; only load pickles that
# were produced by this project's own preprocessing step.
df = pd.read_pickle(PKL_PROCESSED_STEP1_DTU_SOLAR_STATION)
df = df[sorted(df.columns)]

print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that emitted a stray "None" line).
df.info()
df.head()

(5260805, 10)
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5260805 entries, 2015-01-01 00:00:00 to 2025-01-01 08:04:00
Freq: min
Data columns (total 10 columns):
 #   Column             Dtype  
---  ------             -----  
 0   DHI                float64
 1   DNI                float64
 2   air_pressure       float64
 3   air_temperature    float64
 4   rain_duration      float64
 5   rain_intensity     float64
 6   relative_humidity  float64
 7   solar_altitude     float64
 8   wind_dir_avg       float64
 9   wind_speed_avg     float64
dtypes: float64(10)
memory usage: 441.5 MB
None


Unnamed: 0,DHI,DNI,air_pressure,air_temperature,rain_duration,rain_intensity,relative_humidity,solar_altitude,wind_dir_avg,wind_speed_avg
2015-01-01 00:00:00,,,,,,,,,,
2015-01-01 00:01:00,,,,,,,,,,
2015-01-01 00:02:00,,,,,,,,,,
2015-01-01 00:03:00,,,,,,,,,,
2015-01-01 00:04:00,,,,,,,,,,


# Lag Features
To predict future values of the target variables, we create lagged features by shifting the feature values forward in time by a fixed number of time steps. The data is sampled once per minute, so a shift of 60 time steps corresponds to 1 hour: each row then pairs the target values at time t with the feature values observed at time t − 60 minutes.

In [8]:
TARGETS = ['DNI', 'DHI']
LAG_MINUTES = 60  # Forecast horizon: features observed at t - 60 min are paired with targets at t.

# Build the lags on the complete, regular 1-minute grid.
# LagFeatures with `periods` shifts by ROW POSITION, so rows must not be dropped
# before lagging: once rows are removed the index has gaps and "60 rows back" no
# longer means "60 minutes back", which silently misaligns features and targets
# after every gap. `asfreq` guarantees exactly one row per minute between the
# first and last timestamp.
X = df.drop(columns=TARGETS).asfreq("1min")

# missing_values="ignore" lets LagFeatures accept the NaN rows instead of raising;
# they are shifted along with everything else and removed below.
lf = LagFeatures(periods=[LAG_MINUTES], missing_values="ignore", drop_original=True)
df_lagged = lf.fit_transform(X)

# Keep only rows whose lagged features are complete (this also removes the first
# LAG_MINUTES rows, which have nothing to look back to).
df_lagged = df_lagged.dropna()
if df_lagged.empty:
    raise ValueError("No rows with complete lagged features; check the input data for missing values.")

# Reintroduce the dropped timestamps as all-NaN feature rows by reindexing onto
# the full 1-minute range, so the output stays on a regular grid.
full_range = pd.date_range(start=df_lagged.index.min(), end=df_lagged.index.max(), freq="1min")
df_lagged = df_lagged.reindex(full_range)

# Reintroduce the (unshifted) target columns; assignment aligns on the timestamp index.
df_lagged[TARGETS] = df[TARGETS]
# Sort the columns alphabetically.
df_lagged = df_lagged[sorted(df_lagged.columns)]

In [9]:
# List every column of the lagged frame together with the shape of its Series,
# one blank line between entries.
for column_name, column_values in df_lagged.items():
    print(column_name, column_values.shape, "", sep="\n")

DHI
(5159945,)

DNI
(5159945,)

air_pressure_lag_60
(5159945,)

air_temperature_lag_60
(5159945,)

rain_duration_lag_60
(5159945,)

rain_intensity_lag_60
(5159945,)

relative_humidity_lag_60
(5159945,)

solar_altitude_lag_60
(5159945,)

wind_dir_avg_lag_60
(5159945,)

wind_speed_avg_lag_60
(5159945,)



Visual check that the lagged features were created correctly. \
In the original `df`, the feature values sit at their own time t. \
In the lagged dataframe, those same feature values appear at time t+60 (minutes), while the target values remain at time t.

In [4]:
# First five rows of 2022 in the source frame, next to the rows 60 positions
# later in the lagged frame: the feature values should match row for row.
source_rows = df.loc['2022'].iloc[:5]
lagged_rows = df_lagged.loc['2022'].iloc[60:65]
display(source_rows, lagged_rows)

Unnamed: 0,DHI,DNI,air_pressure,air_temperature,rain_duration,rain_intensity,relative_humidity,solar_altitude,wind_dir_avg,wind_speed_avg
2022-01-01 00:00:00,0.0,0.0,1006.0,6.6,0.0,0.0,91.2,-56.105395,260.0,2.5
2022-01-01 00:01:00,0.0,0.004469,1006.0,6.6,0.0,0.0,91.2,-56.057864,273.0,2.8
2022-01-01 00:02:00,0.0,0.006635,1006.0,6.6,0.0,0.0,91.2,-56.009404,283.0,2.2
2022-01-01 00:03:00,0.0,0.015122,1006.0,6.6,0.0,0.0,91.2,-55.960019,291.0,2.2
2022-01-01 00:04:00,0.0,0.008761,1006.0,6.7,0.0,0.0,91.2,-55.909715,277.0,2.9


Unnamed: 0,DHI,DNI,air_pressure_lag_60,air_temperature_lag_60,rain_duration_lag_60,rain_intensity_lag_60,relative_humidity_lag_60,solar_altitude_lag_60,wind_dir_avg_lag_60,wind_speed_avg_lag_60
2022-01-01 01:00:00,0.0,0.002264,1006.0,6.6,0.0,0.0,91.2,-56.105395,260.0,2.5
2022-01-01 01:01:00,0.0,0.0,1006.0,6.6,0.0,0.0,91.2,-56.057864,273.0,2.8
2022-01-01 01:02:00,0.0,0.0,1006.0,6.6,0.0,0.0,91.2,-56.009404,283.0,2.2
2022-01-01 01:03:00,0.0,0.00376,1006.0,6.6,0.0,0.0,91.2,-55.960019,291.0,2.2
2022-01-01 01:04:00,0.0,0.0,1006.0,6.7,0.0,0.0,91.2,-55.909715,277.0,2.9


In [10]:
# Persist the lagged frame as the step-2 artifact for downstream notebooks.
pd.to_pickle(df_lagged, PKL_PROCESSED_STEP2_DTU_SOLAR_STATION)