# 1. Imports

In [1]:
import sys
from pathlib import Path

# Add project root (parent of "electricity") to sys.path

sys.path.append("..")   # go up one level to project root


from electricity.load import Loading
from electricity.preprocessing import Preprocessor



In [2]:
loader = Loading(filepath="complete_dataset.csv", return_X_y=False)

In [3]:
# 3. Load the dataframe
df = loader.load_data()

In [4]:
df

Unnamed: 0_level_0,RRP,school_day,holiday,RRP_t_minus_1,demand_t_minus_1,min_temperature_t_minus_1,max_temperature_t_minus_1,solar_exposure_t_minus_1,rainfall_t_minus_1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-01-02,33.138988,0,0,25.633696,99635.030,13.3,26.9,23.6,0.0
2015-01-03,34.564855,0,0,33.138988,129606.010,15.4,38.8,26.8,0.0
2015-01-04,25.005560,0,0,34.564855,142300.540,20.0,38.2,26.5,0.0
2015-01-05,26.724176,0,0,25.005560,104330.715,16.3,21.4,25.2,4.2
2015-01-06,31.282311,0,0,26.724176,118132.200,15.0,22.0,30.7,0.0
...,...,...,...,...,...,...,...,...,...
2020-10-02,-6.076028,0,0,34.654671,106641.790,9.4,19.5,21.2,1.8
2020-10-03,-1.983471,0,0,-6.076028,99585.835,12.8,26.0,22.0,0.0
2020-10-04,25.008614,0,0,-1.983471,92277.025,17.4,29.4,19.8,0.0
2020-10-05,36.764701,0,0,25.008614,94081.565,13.5,29.5,8.4,0.0


In [5]:
import numpy as np
import pandas as pd

In [6]:
X = df.drop(columns="RRP")
y = df["RRP"]

In [7]:
X.shape

(2105, 8)

In [8]:
# X_train, X_test: (N, T, F) windows for your RNN
# date_array: shape (N, T) with np.datetime64 for each timestep (preferred)
pre = Preprocessor(filepath=".", date_col="date", target_col="RRP")
   # OR: pre.set_rnn_dates(date_feature_index=idx)
pipe = pre.build_pipeline()


In [9]:
pipe

0,1,2
,steps,"[('date_features', ...), ('pre', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,date_col,'date'

0,1,2
,transformers,"[('pipeline-1', ...), ('pipeline-2', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,threshold,0.95
,keep_always,"['month_sin', 'month_cos', ...]"
,verbose,True


In [10]:

pipe.fit(X)      # -> (N, T, F_out)
X_preproc = pipe.transform(X)

:mag_right: CorrelationSelector dropped 6 features: ['pipeline-1__week', 'pipeline-1__dayofyear', 'pipeline-1__week_sin', 'pipeline-1__week_cos', 'pipeline-1__doy_sin', 'pipeline-1__doy_cos']


In [11]:
X_preproc.shape

(2105, 13)

In [12]:
df = X.join(y)          

In [13]:
train_size = 0.6
index = round(train_size*df.shape[0])

df_train = df.iloc[:index]
df_test = df.iloc[index:]

In [14]:
df_train.shape

(1263, 9)

In [15]:
from typing import Dict, List, Tuple, Sequence


In [16]:
TARGET = 'RRP'

def get_Xi_yi(
    dataset: pd.DataFrame,
    input_length: int,
    output_length: int,
    random: bool = True,
    start_index: int = None
) -> Tuple[pd.DataFrame]:
    """
    Returns one sequence (X_i, y_i) from the dataset, either randomly or at a specified index.

    Args:
        dataset (pd.DataFrame): The dataset to sample from.
        input_length (int): Length of input sequence.
        output_length (int): Length of output sequence.
        random (bool): If True, select randomly. If False, use start_index.
        start_index (int, optional): If random is False, use this as the starting index.

    Returns:
        Tuple[pd.DataFrame]: A tuple of two dataframes (X_i, y_i)
    """
    first_possible_start = 0
    last_possible_start = len(dataset) - (input_length + output_length) + 1

    if last_possible_start <= 0:
        raise ValueError("Not enough data to create a sequence with the given input and output lengths.")

    if random:
        random_start = np.random.randint(first_possible_start, last_possible_start)
    else:
        if start_index is None:
            raise ValueError("start_index must be provided when random is False.")
        if not (first_possible_start <= start_index < last_possible_start):
            raise ValueError(f"start_index must be in [{first_possible_start}, {last_possible_start-1}]")
        random_start = start_index

    X_i = dataset.iloc[random_start:random_start + input_length]
    y_i = dataset.iloc[random_start + input_length:
                      random_start + input_length + output_length][[TARGET]]

    return (X_i, y_i)


def get_X_y(
    dataset: pd.DataFrame,
    input_length: int,
    output_length: int,
    number_of_sequences: int = None,
    random: bool = False
) -> Tuple[np.array]:
    """
    Generate X and y based on the number of desired sequences of the given input_length and output_length.
    If random is False, sequences are taken sequentially to cover the full dataframe (up to number_of_sequences or max possible).
    If number_of_sequences is None, use the maximum possible number of sequences.

    Args:
        dataset (pd.DataFrame): Fold dataframe
        input_length (int): Length of each X_i
        output_length (int): Length of each y_i
        number_of_sequences (int, optional): The number of X_i and y_i pairs to include. If None, use max possible.
        random (bool): If True, sample randomly. If False, sample sequentially.

    Returns:
        Tuple[np.array]: A tuple of numpy arrays (X, y)
    """
    X, y = [], []

    first_possible_start = 0
    last_possible_start = len(dataset) - (input_length + output_length) + 1

    if last_possible_start <= 0:
        raise ValueError("Not enough data to create sequences with the given input and output lengths.")

    max_possible_sequences = last_possible_start

    if number_of_sequences is None:
        n_sequences = max_possible_sequences
    else:
        n_sequences = min(number_of_sequences, max_possible_sequences)

    if random:
        for i in range(n_sequences):
            Xi, yi = get_Xi_yi(dataset, input_length, output_length, random=True)
            X.append(Xi)
            y.append(yi)
    else:
        # Sequential, non-overlapping (or overlapping) windows
        for idx in range(first_possible_start, first_possible_start + n_sequences):
            Xi, yi = get_Xi_yi(dataset, input_length, output_length, random=False, start_index=idx)
            X.append(Xi)
            y.append(yi)

    return np.array(X), np.array(y)

In [17]:
data_train = get_X_y(df_train, 60, 1)
data_test = get_X_y(df_test, 60, 1)

In [18]:
X_train = data_train[0]
y_train = data_train[1]
X_test = data_test[0]
y_test = data_test[1]

In [19]:
# --- Notebook Cell 3: Simple model with unified preprocessing (LinearRegression + TS CV) ---
from electricity.preprocessing import Preprocessor
# Build preprocessing + model pipeline.
# Preprocessor will add date/cyclical features, impute/scale/one-hot, and prune highly correlated features.

In [20]:
import pandas as pd

In [21]:
from electricity.models import _tscv_scores
from sklearn.ensemble import RandomForestRegressor


2025-08-27 14:36:55.101561: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-08-27 14:36:55.102055: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-27 14:36:55.172284: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-08-27 14:36:57.822353: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off,

In [22]:
from sklearn.model_selection import TimeSeriesSplit

In [23]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras import layers


In [24]:
from tensorflow.keras.callbacks import EarlyStopping


In [25]:
X_train[0]

array([[0, 0, 25.63369643387471, 99635.03, 13.3, 26.9, 23.6, 0.0,
        33.13898756122499],
       [0, 0, 33.13898756122499, 129606.00999999994, 15.4, 38.8, 26.8,
        0.0, 34.56485482908218],
       [0, 0, 34.56485482908218, 142300.53999999998, 20.0, 38.2, 26.5,
        0.0, 25.00556023842067],
       [0, 0, 25.00556023842067, 104330.715, 16.3, 21.4, 25.2, 4.2,
        26.72417627793271],
       [0, 0, 26.72417627793271, 118132.19999999994, 15.0, 22.0, 30.7,
        0.0, 31.282310728612853],
       [0, 0, 31.282310728612853, 130672.48499999994, 17.7, 26.0, 31.6,
        0.0, 48.31230937899024],
       [0, 0, 48.31230937899024, 153514.82, 18.9, 37.4, 20.7, 0.0,
        49.11728029878114],
       [0, 0, 49.11728029878114, 142015.65500000006, 23.1, 28.2, 13.5,
        19.4, 34.490675454596484],
       [0, 0, 34.490675454596484, 121801.15499999998, 16.5, 18.0, 3.1,
        1.2, 20.229824895097856],
       [0, 0, 20.229824895097856, 103043.66000000005, 13.6, 21.7, 5.6,
        5.2, 18

In [26]:
X_train = X_train.astype(np.float32)
y_train  = np.asarray(y_train).astype(np.float32)

X_test = X_test.astype(np.float32)
y_test  = np.asarray(y_test).astype(np.float32)



In [27]:
es = EarlyStopping(patience=10, restore_best_weights=True)
model = Sequential()
model.add(layers.LSTM(units=100, return_sequences=True))
model.add(layers.LSTM(units=100))
model.add(layers.Dense(50, activation="relu")) 
model.add(layers.Dense(1, activation="linear"))
model.compile(loss="mse", optimizer="adam", metrics=["mae"])
model.fit(X_train, y_train, epochs=1000, callbacks=[es], validation_split=0.2, batch_size=32, verbose=1)

Epoch 1/1000


2025-08-27 14:36:59.019818: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 81ms/step - loss: 5069.3252 - mae: 59.6456 - val_loss: 17251.7734 - val_mae: 95.1470
Epoch 2/1000
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 55ms/step - loss: 5064.4229 - mae: 59.6049 - val_loss: 17245.9258 - val_mae: 95.1163
Epoch 3/1000
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - loss: 5060.7954 - mae: 59.5742 - val_loss: 17240.1133 - val_mae: 95.0857
Epoch 4/1000
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 66ms/step - loss: 5057.1328 - mae: 59.5435 - val_loss: 17234.2637 - val_mae: 95.0550
Epoch 5/1000
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 59ms/step - loss: 5053.5352 - mae: 59.5134 - val_loss: 17228.5527 - val_mae: 95.0249
Epoch 6/1000
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 67ms/step - loss: 5049.8940 - mae: 59.4826 - val_loss: 17222.6777 - val_mae: 94.9940
Epoch 7/1000
[1m31/31[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x7bf80aaa5240>

In [36]:
# es = EarlyStopping(patience=10, restore_best_weights=True)
# model = Sequential()
# model.add(layers.Dense(28, input_shape=(X_train.shape[-1],), activation="relu"))
# model.add(layers.Dense(21, activation="relu"))
# model.add(layers.Dense(14, activation="relu"))
# model.add(layers.Dense(7, activation="relu"))
# model.add(layers.Dense(1, activation="linear"))

# model.compile(optimizer="adam", loss="mse", metrics=["mae"])
# model.fit(X_train, y_train, epochs=1000, callbacks=[es], validation_split=0.2, batch_size=32, verbose=1)


In [37]:
model.evaluate(X_test, y_test)

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 44409.7695 - mae: 65.2660


[44409.76953125, 65.26595306396484]

In [38]:
import matplotlib.pyplot as plt

In [39]:
y_test

array([[[ 7.76177673e+01]],

       [[ 3.83011475e+01]],

       [[ 5.04469490e+01]],

       [[ 1.02338814e+02]],

       [[ 1.19228432e+02]],

       [[ 1.04816162e+02]],

       [[ 8.95793686e+01]],

       [[ 9.74678421e+01]],

       [[ 1.18487839e+02]],

       [[ 1.05462212e+02]],

       [[ 1.05011803e+02]],

       [[ 1.19823074e+02]],

       [[ 1.18379013e+02]],

       [[ 9.71664200e+01]],

       [[ 1.22425995e+02]],

       [[ 1.04026062e+02]],

       [[ 1.37131516e+02]],

       [[ 1.39516388e+02]],

       [[ 1.19574341e+02]],

       [[ 1.11540756e+02]],

       [[ 1.38907532e+02]],

       [[ 1.61506821e+02]],

       [[ 1.09293770e+02]],

       [[ 6.76155930e+01]],

       [[ 9.26159821e+01]],

       [[ 6.73332520e+01]],

       [[ 8.36948929e+01]],

       [[ 8.49782791e+01]],

       [[ 3.52242622e+01]],

       [[ 1.46735878e+01]],

       [[ 8.16376190e+01]],

       [[ 7.06853027e+01]],

       [[ 8.54524689e+01]],

       [[ 1.01004051e+02]],

       [[ 1.15

In [40]:
X_test

array([[[  1.       ,   0.       ,  62.27267  , ...,   5.1      ,
          15.6      , 112.501495 ],
        [  1.       ,   0.       , 112.501495 , ...,   5.9      ,
           7.2      , 110.05619  ],
        [  1.       ,   0.       , 110.05619  , ...,   9.       ,
           0.4      ,  91.289345 ],
        ...,
        [  1.       ,   0.       ,  79.75906  , ...,   8.6      ,
           0.       ,  69.127754 ],
        [  1.       ,   0.       ,  69.127754 , ...,  13.2      ,
           0.2      ,  67.79661  ],
        [  1.       ,   0.       ,  67.79661  , ...,  10.1      ,
           0.       ,  95.267105 ]],

       [[  1.       ,   0.       , 112.501495 , ...,   5.9      ,
           7.2      , 110.05619  ],
        [  1.       ,   0.       , 110.05619  , ...,   9.       ,
           0.4      ,  91.289345 ],
        [  1.       ,   0.       ,  91.289345 , ...,   8.       ,
           0.2      , 141.55698  ],
        ...,
        [  1.       ,   0.       ,  69.127754 , ...,  

In [41]:
y_pred = model.predict(X_test)

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step


In [42]:
y_pred

array([[29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],
       [29.725874],


In [43]:
import pandas as pd

# Assume y_test is a pandas Series with date index, X_test is a DataFrame with date index
# If y_test is not a Series with index, adjust accordingly

# Convert y_pred to Series with same index as y_test
y_pred_series = pd.Series(y_pred.flatten(), index=y_test.index, name="Predicted")
y_test_series = pd.Series(y_test, index=y_test.index, name="Actual")

# Combine into a DataFrame
df_pred = pd.DataFrame({"Actual": y_test_series, "Predicted": y_pred_series})

# Plot both against date
df_pred.plot(figsize=(12, 6))
plt.xlabel("Date")
plt.ylabel("RRP")
plt.title("Actual vs Predicted RRP")
plt.show()


AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [None]:
from sklearn.metrics import r2_score

print(f"R2: {r2_score(y_test, y_pred)}")


R2: 0.033061867627804054
