## If using Colab, run:

In [None]:
%%capture
!pip install numpy matplotlib timeseriesfcst tensorflow pandas datasets scikit-learn

In [None]:
%%capture
!pip install --upgrade timeseriesfcst

# Setup


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import timeseriesfcst.preprocessing as tsprep
import timeseriesfcst.decomposition as tsdecomp
import timeseriesfcst.feature_engineering as tsfeat
import timeseriesfcst.feature_selection as tsfs

from sklearn.decomposition import PCA
from datasets import Dataset

# Load Dataset from Hugging Face



In [None]:
# Base path of the dataset repository and the CSV file behind each split
REPO_PATH = "hf://datasets/Creatorin/solarpower/"
splits = {'train': 'train_ts.csv', 'validation': 'val_ts.csv', 'test': 'test_ts.csv'}


def _read_split(split_name):
    """Read the CSV of one split and return it as a DataFrame with a datetime index.

    The first CSV column becomes the index and is then converted with
    ``pd.to_datetime``.
    """
    # NOTE(review): date_format is passed without parse_dates; the explicit
    # to_datetime conversion below is what guarantees a datetime index.
    frame = pd.read_csv(REPO_PATH + splits[split_name], index_col=0, date_format="%Y-%m-%d %H:%M:%S")
    frame.index = pd.to_datetime(frame.index)
    return frame


train_ts = _read_split("train")
val_ts = _read_split("validation")
test_ts = _read_split("test")

# Keep an untouched copy of the training data so normalisation can be undone later
train_ts_copy = train_ts.copy()

# Report the size of every split
print(f"Train Shape: {train_ts.shape}, Validation Shape: {val_ts.shape}, Test Shape: {test_ts.shape}")

# Preprocess Data
## Make Stationary (Remove Trend and Seasonality)

In [None]:
# Unit-root differencing is currently disabled:
# train_ts = tsdecomp.make_stationary_unitroot(train_ts, val_ts, test_ts)

# Variance-stationarity check on the whole training frame.
# 24 and 8760 presumably stand for one day and one year of hourly samples — TODO confirm.
for length in (24, 8760):
    tsdecomp.check_stationarity_variance(train_ts, length)

# Same check restricted to the target series, this time with plots
target_series = train_ts["Leistung"]
tsdecomp.check_stationarity_variance_single(target_series, 24, plot=True)
tsdecomp.check_stationarity_variance_single(target_series, 8760, plot=True)

In [None]:
# Remove the trend; every split is passed to detrend_ts on its own
train_detrend, val_detrend, test_detrend = (
    tsdecomp.detrend_ts(split) for split in (train_ts, val_ts, test_ts)
)

# Remove seasonality from the detrended splits (8760 is handed to deseasonalise_ts
# as its second argument — presumably the season length in hours, TODO confirm)
train_deseasonal, val_deseasonal, test_deseasonal = (
    tsdecomp.deseasonalise_ts(split, 8760)
    for split in (train_detrend, val_detrend, test_detrend)
)

In [None]:
# Overlay the target series at each preprocessing stage of the training split
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(train_ts["Leistung"], label="Original")
ax.plot(train_detrend["Leistung"], label="Detrended")
ax.plot(train_deseasonal["Leistung"], label="Deseasonalised")
ax.legend()
plt.show()

# Feature Engineering


In [None]:
# Build the feature set in three passes: lagged values, rolling-window
# statistics, and datetime-derived features. Each helper takes and returns
# the (train, validation, test) triple.
train_processed, val_processed, test_processed = tsfeat.create_lagged_features(
    train_deseasonal, val_deseasonal, test_deseasonal, lags=[1, 2, 3, 4, 5, 6, 12, 24, 48, 168, 8760]
)
train_processed, val_processed, test_processed = tsfeat.create_rolling_features(
    train_processed, val_processed, test_processed, windows=[3, 6, 12, 24, 48, 168, 8760]
)
train_processed, val_processed, test_processed = tsfeat.create_datetime_features(
    train_processed, val_processed, test_processed
)

# Check the shape of the data
print(f"Train Shape: {train_processed.shape}, Validation Shape: {val_processed.shape}, Test Shape: {test_processed.shape}")
# A bare `train_processed.head()` in the middle of a cell shows nothing;
# display() (IPython built-in) actually renders the preview.
display(train_processed.head())

# Drop first year of train (everything before 2016-01-01).
# Presumably these rows are incomplete because of the 8760-step lag/window — TODO confirm.
train_processed = train_processed.loc["2016-01-01":]

# Handle any remaining missing values: forward-fill, then back-fill in case
# NaNs are left at the very beginning. ffill()/bfill() replace the deprecated
# fillna(method=...) form, and rebinding the names avoids filling a sliced
# frame in place.
train_processed, val_processed, test_processed = (
    frame.ffill().bfill() for frame in (train_processed, val_processed, test_processed)
)

# Check the shape of the data
print(f"Train Shape: {train_processed.shape}, Validation Shape: {val_processed.shape}, Test Shape: {test_processed.shape}")

# Check for any remaining missing values, listing the affected columns per split
for name, dataset in zip(['Train', 'Validation', 'Test'], [train_processed, val_processed, test_processed]):
    missing = dataset.isnull().sum()
    if missing.sum() > 0:
        print(f"\nMissing values in {name} set:")
        print(missing[missing > 0])
    else:
        print(f"\nNo missing values in {name} set")

# Standardise Data

In [None]:
# Normalise all three splits in one call; the names are rebound to the normalised frames
train_processed, val_processed, test_processed = tsprep.normalise_ts(
    train_processed, val_processed, test_processed
)

# Sanity check: per-column mean and standard deviation of the normalised training data
train_mean = train_processed.mean()
train_std = train_processed.std()
print(f"Train Mean: {train_mean}, Train Std: {train_std}")

# Feature Selection (Random Forest)



In [None]:
# Random Forest feature selection on the training split.
# NOTE(review): the frame passed as features still contains the target column
# "Leistung" — confirm that the selector handles this, otherwise the target
# may dominate the importance ranking.
train_selected, selected_features = tsfs.time_series_feature_selection_gpu(
    train_processed,
    train_processed["Leistung"],
    n_features=50,
    n_estimators=100,
    max_depth=10,
    epochs=10,
)

# Column layout shared by all splits: target "Leistung" first, followed by the
# selected features in their original order. Building the list this way keeps
# the target even if the selector did not return it (indexing with
# selected_features alone would then fail on the reorder step).
rf_columns = ["Leistung"] + [col for col in selected_features if col != "Leistung"]

# Apply feature selection to all splits
train_rf = train_processed[rf_columns]
val_rf = val_processed[rf_columns]
test_rf = test_processed[rf_columns]

# Check the shape of the data
print(f"Train Shape: {train_rf.shape}, Validation Shape: {val_rf.shape}, Test Shape: {test_rf.shape}")

# Print Variables
print(train_rf.columns)

# PCA

In [None]:
# Shapes of the processed splits that go into PCA
shape_report = (
    f"Train Shape: {train_processed.shape}, "
    f"Validation Shape: {val_processed.shape}, "
    f"Test Shape: {test_processed.shape}"
)
print(shape_report)

In [None]:
# Column to predict; it is excluded from the PCA input and re-attached afterwards
target_column = "Leistung"

# Feature matrices: every column except the target
X_train, X_val, X_test = (
    frame.drop(columns=[target_column])
    for frame in (train_processed, val_processed, test_processed)
)

# Target series of each split
y_train, y_val, y_test = (
    frame[target_column] for frame in (train_processed, val_processed, test_processed)
)

# Fit PCA on the training features only; n_components is given as the fraction 0.95
pca = PCA(n_components=0.95).fit(X_train)

# Project all three splits with the PCA fitted on train
train_pca, val_pca, test_pca = (
    pca.transform(features) for features in (X_train, X_val, X_test)
)

# Check the shape of the data
print(f"Train Shape: {train_pca.shape}, Validation Shape: {val_pca.shape}, Test Shape: {test_pca.shape}")

# Cumulative explained variance over the retained components
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(np.cumsum(pca.explained_variance_ratio_))
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('Explained Variance vs. Number of PCA Components')
ax.grid(True)
plt.show()

# Print the number of components selected
print(f"Number of components selected: {pca.n_components_}")


def _pca_frame(projected, features, target):
    """Wrap a projected array in a DataFrame indexed like `features`, with the target appended.

    Component columns are named PC_1 ... PC_k; the target is added under
    `target_column` as the last column.
    """
    names = [f'PC_{k + 1}' for k in range(projected.shape[1])]
    frame = pd.DataFrame(projected, index=features.index, columns=names)
    frame[target_column] = target
    return frame


# DataFrames with the PCA components plus the target variable
train_pca_df = _pca_frame(train_pca, X_train, y_train)
val_pca_df = _pca_frame(val_pca, X_val, y_val)
test_pca_df = _pca_frame(test_pca, X_test, y_test)

# Check the shape of the data
print(f"Train Shape: {train_pca_df.shape}, Validation Shape: {val_pca_df.shape}, Test Shape: {test_pca_df.shape}")

# Push Datasets to Hugging Face

In [None]:
# Wrap the feature-selected splits as Hugging Face `Dataset` objects
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_rf, val_rf, test_rf)
)

# Publishing is disabled; uncomment to push each split to the Hub.
# train_dataset.push_to_hub("Creatorin/solar_selected", split="train")
# val_dataset.push_to_hub("Creatorin/solar_selected", split="validation")
# test_dataset.push_to_hub("Creatorin/solar_selected", split="test")

In [None]:
# Wrap the PCA splits as Hugging Face `Dataset` objects
# (this rebinds train_dataset / val_dataset / test_dataset)
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_pca_df, val_pca_df, test_pca_df)
)

# Publishing is disabled; uncomment to push each split to the Hub.
# train_dataset.push_to_hub("Creatorin/solar_pca", split="train")
# val_dataset.push_to_hub("Creatorin/solar_pca", split="validation")
# test_dataset.push_to_hub("Creatorin/solar_pca", split="test")