# Setup


In [19]:
import importlib

import pandas as pd
import matplotlib.pyplot as plt

import helperfunctions.preprocessing as hfprep
import helperfunctions.stationarity as hfstat
import helperfunctions.features as hffe

# Load Dataset from Hugging Face



In [20]:
# Dataset location and the CSV file that backs each split
REPO_PATH = "hf://datasets/Creatorin/solarpower/"
splits = {'train': 'train_ts.csv', 'validation': 'val_ts.csv', 'test': 'test_ts.csv'}


def _read_split(split_name):
    """Read the CSV of one split from REPO_PATH and return it with a datetime index.

    The first CSV column becomes the index, which is then converted with
    pd.to_datetime.
    """
    frame = pd.read_csv(
        REPO_PATH + splits[split_name],
        index_col=0,
        date_format="%Y-%m-%d %H:%M:%S",
    )
    frame.index = pd.to_datetime(frame.index)
    return frame


train_ts, val_ts, test_ts = map(_read_split, ("train", "validation", "test"))

# Untouched copy of the training split, kept so normalisation can be undone later
train_ts_copy = train_ts.copy()

# Report the (rows, columns) shape of every split
print(f"Train Shape: {train_ts.shape}, Validation Shape: {val_ts.shape}, Test Shape: {test_ts.shape}")

Train Shape: (70129, 31), Validation Shape: (8760, 31), Test Shape: (2926, 31)


# Preprocess Data
## Make Stationary

In [21]:
# Re-import the stationarity helpers so that edits to
# helperfunctions/stationarity.py are picked up without restarting the kernel.
# This is the only statement in this cell that actually runs.
importlib.reload(hfstat)

# Everything below is disabled and kept for reference only.

# Make Unit Root Stationary
# train_ts = hfstat.make_stationary_unitroot(train_ts, val_ts, test_ts)

# Check variance stationarity
# hfstat.check_stationarity_variance(train_ts, 24)
# hfstat.check_stationarity_variance(train_ts, 365)
#
# # Check target only
# hfstat.check_stationarity_variance(train_ts["Leistung"], 24)
# NOTE(review): the call below goes through hfprep while the three above use hfstat — confirm which module defines check_stationarity_variance before re-enabling.
# hfprep.check_stationarity_variance(train_ts["Leistung"], 365)


<module 'helperfunctions.stationarity' from '/home/moonchild/PycharmProjects/solar-prediction/helperfunctions/stationarity.py'>

In [22]:
# Stage 1: remove the trend from each split
train_detrend, val_detrend, test_detrend = (
    hfstat.detrend_ts(split) for split in (train_ts, val_ts, test_ts)
)

# Stage 2: remove seasonality from the detrended splits.
# NOTE(review): the index printed further down is hourly, so a period of 365
# may mean 365 rows rather than 365 days — confirm against deseasonalise_ts.
train_deseasonal, val_deseasonal, test_deseasonal = (
    hfstat.deseasonalise_ts(split, 365)
    for split in (train_detrend, val_detrend, test_detrend)
)

In [18]:
# The "Leistung" column at each preprocessing stage, in plotting order
leistung_stages = [
    ("Original", train_ts),
    ("Detrended", train_detrend),
    ("Deseasonalised", train_deseasonal),
]

# First figure: all three stages. Second figure: the same without the
# deseasonalised series.
for shown_stages in (leistung_stages, leistung_stages[:2]):
    plt.figure(figsize=(15, 5))
    for stage_label, stage_frame in shown_stages:
        plt.plot(stage_frame["Leistung"], label=stage_label)
    plt.legend()
    plt.show()

KeyboardInterrupt: 

# Feature Engineering


In [23]:
# Pick up the latest version of the feature helpers without restarting the kernel
importlib.reload(hffe)

# Lag offsets handed to the helper (presumably counted in rows of the series — TODO confirm)
lag_steps = [1, 2, 3, 4, 5, 6, 12, 24, 48, 168, 365]

# Build lagged features for all three splits in one call
train_processed, val_processed, test_processed = hffe.create_lagged_features(
    train_deseasonal,
    val_deseasonal,
    test_deseasonal,
    lags=lag_steps,
)

In [None]:
# Window sizes handed to the rolling-feature helper
rolling_windows = [3, 6, 12, 24, 48, 168, 365]

# Add rolling features to the splits produced by the lagged-feature step
rolled_splits = hffe.create_rolling_features(
    train_processed,
    val_processed,
    test_processed,
    windows=rolling_windows,
)
train_processed, val_processed, test_processed = rolled_splits

In [11]:
# Pick up the latest version of the feature helpers without restarting the kernel
importlib.reload(hffe)

# Add datetime features to all three splits
dated_splits = hffe.create_datetime_features(train_processed, val_processed, test_processed)
train_processed, val_processed, test_processed = dated_splits

# Report the (rows, columns) shape of every split, then preview the training frame
shape_summary = ", ".join(
    f"{split_label} Shape: {split_frame.shape}"
    for split_label, split_frame in (
        ("Train", train_processed),
        ("Validation", val_processed),
        ("Test", test_processed),
    )
)
print(shape_summary)
train_processed.head()

Train Shape: (69763, 378), Validation Shape: (8394, 378), Test Shape: (2560, 378)


Unnamed: 0,Leistung,temperature_2m_templin,cloud_cover_templin,shortwave_radiation_templin,diffuse_radiation_templin,direct_normal_irradiance_templin,temperature_2m_kastellaun,cloud_cover_kastellaun,shortwave_radiation_kastellaun,diffuse_radiation_kastellaun,...,cloud_cover_neumunster_lag_365,shortwave_radiation_neumunster_lag_365,diffuse_radiation_neumunster_lag_365,direct_normal_irradiance_neumunster_lag_365,hour_sin,hour_cos,dayofweek_sin,dayofweek_cos,dayofyear_sin,dayofyear_cos
2015-01-16 05:00:00+00:00,0.0,-0.1,41.499999,0.0,0.0,0.0,-0.05,-15.4,0.0,0.0,...,,,,,0.965926,0.258819,-0.433884,-0.900969,0.271234,0.962513
2015-01-16 06:00:00+00:00,0.0,0.15,27.6,0.0,0.0,0.0,-4e-07,-0.299995,0.0,0.0,...,,,,,1.0,6.123234000000001e-17,-0.433884,-0.900969,0.271234,0.962513
2015-01-16 07:00:00+00:00,1251.4,0.05,25.5,0.0,0.0,0.0,-0.5499996,1.200005,0.0,0.0,...,,,,,0.965926,-0.258819,-0.433884,-0.900969,0.271234,0.962513
2015-01-16 08:00:00+00:00,6186.5,0.55,17.1,13.0,12.0,12.928265,0.15,3.6,2.0,2.0,...,,,,,0.866025,-0.5,-0.433884,-0.900969,0.271234,0.962513
2015-01-16 09:00:00+00:00,7496.8,1.1,0.300003,72.0,36.0,265.371965,0.85,8.099995,18.0,18.0,...,,,,,0.707107,-0.7071068,-0.433884,-0.900969,0.271234,0.962513


In [14]:
# Print a concise summary of the training frame (index range, column count, dtypes, memory usage)
train_processed.info()

# Summary statistics, transposed so that each column of the frame becomes a row
train_summary = train_processed.describe()
train_summary.T

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 69763 entries, 2015-01-16 05:00:00+00:00 to 2022-12-31 23:00:00+00:00
Columns: 378 entries, Leistung to dayofyear_cos
dtypes: float64(378)
memory usage: 201.7 MB


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Leistung,69763.0,-7.300676e-15,14129.504960,-63943.800000,-4303.450000,7.840000e+01,8491.000000,42174.400000
temperature_2m_templin,69763.0,1.963792e-04,1.185157,-11.500000,-0.700000,5.000000e-02,0.700000,9.700000
cloud_cover_templin,69763.0,1.820449e-04,25.788628,-174.700000,-11.199999,0.000000e+00,11.200000,146.700010
shortwave_radiation_templin,69763.0,0.000000e+00,97.693606,-770.000000,-21.000000,0.000000e+00,53.000000,521.000000
diffuse_radiation_templin,69763.0,0.000000e+00,40.718875,-337.000000,-12.000000,0.000000e+00,24.000000,236.000000
...,...,...,...,...,...,...,...,...
hour_cos,69763.0,-5.789685e-05,0.707102,-1.000000,-0.707107,-1.836970e-16,0.707107,1.000000
dayofweek_sin,69763.0,-4.535651e-04,0.707161,-0.974928,-0.781831,0.000000e+00,0.781831,0.974928
dayofweek_cos,69763.0,-3.219316e-04,0.707062,-0.900969,-0.900969,-2.225209e-01,0.623490,1.000000
dayofyear_sin,69763.0,-7.239782e-04,0.709595,-0.999963,-0.710135,-1.716633e-02,0.710135,0.999963


# Standardise Data

In [None]:
# Normalise the feature-engineered splits.
# The inputs are the *_processed frames so that the lagged, rolling and datetime
# features built in the Feature Engineering section are kept. Passing the
# *_deseasonal frames here would overwrite *_processed with frames that lack
# every engineered feature, and those are what the save step writes to disk.
# NOTE(review): the lag columns contain NaN in the head() preview — confirm
# that hfprep.normalise_ts handles missing values as intended.
train_processed, val_processed, test_processed = hfprep.normalise_ts(train_processed, val_processed, test_processed)

# Check the mean and std of the normalised training data (not of the raw train_ts)
print(f"Train Mean: {train_processed.mean()}, Train Std: {train_processed.std()}")

# Save Data


In [None]:
# Write each processed split to its CSV file, with the index as the first column.
# index_label=False leaves the index column without a header field.
csv_targets = {
    "data/train_processed.csv": train_processed,
    "data/val_processed.csv": val_processed,
    "data/test_processed.csv": test_processed,
}
for csv_path, split_frame in csv_targets.items():
    split_frame.to_csv(csv_path, header=True, index=True, index_label=False)