# 1. Imports

In [1]:
import sys
from pathlib import Path

# Add the project root (the parent directory, which holds the "electricity"
# package) to sys.path as an absolute path. Resolving it up front keeps the
# imports below working even if the working directory changes later, and the
# membership check stops the entry from piling up when this cell is re-run.
PROJECT_ROOT = Path("..").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))


from electricity.load import Loading
from electricity.preprocessing import Preprocessor



In [2]:
# Dataset file, given relative to the notebook's working directory.
DATASET_PATH = "complete_dataset.csv"

# return_X_y=True presumably makes load_data() hand back features and target
# separately rather than one combined frame -- TODO confirm against Loading.
loader = Loading(
    filepath=DATASET_PATH,
    return_X_y=True,
)

In [3]:
# Read the dataset through the loader and unpack it into the feature table X
# and the target y.
X, y = loader.load_data()

In [4]:
# --- Configure the shared preprocessor (no model is attached in this cell) ---
#
# Per the original author's notes, the Preprocessor adds date/cyclical features,
# imputes / scales / one-hot encodes, and prunes highly correlated features.
preprocessor_options = {
    # Left empty: not used when the data is passed in directly.
    "filepath": "",
    # Safe to enable: the loader did not create date features itself.
    "add_date_features": True,
    # Correlation cut-off handed to the preprocessor's feature pruning.
    "corr_threshold": 0.95,
}
pre = Preprocessor(**preprocessor_options)


In [5]:
import pandas as pd

In [6]:
# Re-attach the target to the features under the preprocessor's target column
# name, then hand the combined frame to the preprocessor.
target_named = y.rename(pre.target_col)
features_with_target = pd.concat([X, target_named], axis=1)
pre.set_data(features_with_target)

Preprocessor(filepath='', date_col='date', target_col='RRP', leaky_cols=('RRP_positive', 'RRP_negative', 'demand_pos_RRP', 'demand_neg_RRP', 'frac_at_neg_RRP'), bool_maps={'holiday': {'Y': 1, 'N': 0}, 'school_day': {'Y': 1, 'N': 0}}, corr_threshold=0.95, add_date_features=True, random_state=42, df=                demand  min_temperature  max_temperature  solar_exposure  \
date                                                                       
2015-01-01   99635.030             13.3             26.9            23.6   
2015-01-02  129606.010             15.4             38.8            26.8   
2015-01-03  142300.540             20.0             38.2            26.5   
2015-01-04  104330.715             16.3             21.4            25.2   
2015-01-05  118132.200             15.0             22.0            30.7   
...                ...              ...              ...             ...   
2020-10-02   99585.835             12.8             26.0            22.0   
2020-10-03   9227

In [7]:
# Assemble the transformation pipeline from the configured preprocessor.
pipeline = pre.build_pipeline()

In [8]:
# Display the pipeline so its steps and their parameters can be inspected.
pipeline

0,1,2
,steps,"[('date_features', ...), ('pre', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,date_col,'date'

0,1,2
,transformers,"[('pipeline-1', ...), ('pipeline-2', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,threshold,0.95
,verbose,True


In [9]:
from electricity.models import _tscv_scores
from sklearn.ensemble import RandomForestRegressor


In [10]:
# NOTE(review): this call is left disabled -- presumably it scores the pipeline
# with a random forest under time-series cross-validation (TODO confirm).
# Either re-enable it or remove the cell before sharing the notebook.
#_tscv_scores(pipeline, X, y, model=RandomForestRegressor())

In [11]:
from electricity.models import run_lstm 

In [12]:
# Dimensions of the feature table: (rows, columns).
X.shape

(2106, 7)

In [13]:
# Dimensions of the target; its length should match the number of rows in X.
y.shape

(2106,)

In [14]:
# Fit the preprocessing pipeline on the full feature table and show the
# transformed result. fit() returns the fitted pipeline, so the transform is
# chained directly onto it.
pipeline.fit(X).transform(X)

🔎 CorrelationSelector dropped 6 features: [9, 11, 14, 15, 16, 17]


Unnamed: 0,0,1,2,3,4,5,6,7,8,10,12,13
0,-0.974528,0.327869,0.931343,0.872,0.00,-1.0,1.0,-0.666667,-0.833333,0.00,0.304777,0.633975
1,0.489445,0.672131,2.352239,1.128,0.00,-1.0,0.0,-0.666667,-0.833333,0.25,0.304777,0.633975
2,1.109527,1.426230,2.280597,1.104,0.00,-1.0,0.0,-0.666667,-0.833333,0.50,0.304777,0.633975
3,-0.745161,0.819672,0.274627,1.000,5.25,-1.0,0.0,-0.666667,-0.833333,0.75,0.304777,0.633975
4,-0.071009,0.606557,0.346269,1.440,0.00,-1.0,0.0,-0.666667,-0.833333,-0.75,0.304777,0.633975
...,...,...,...,...,...,...,...,...,...,...,...,...
2101,-0.976931,0.245902,0.823881,0.744,0.00,-1.0,0.0,1.000000,0.666667,0.25,-0.527889,0.366025
2102,-1.333940,1.000000,1.229851,0.568,0.00,-1.0,0.0,1.000000,0.666667,0.50,-0.527889,0.366025
2103,-1.245794,0.360656,1.241791,-0.344,0.00,-1.0,0.0,1.000000,0.666667,0.75,-0.527889,0.366025
2104,-0.291900,-0.360656,-0.764179,-0.432,16.00,-1.0,0.0,1.000000,0.666667,-0.75,-0.527889,0.366025


In [16]:
from sklearn.model_selection import TimeSeriesSplit

In [17]:
# Time-ordered splitter with five folds; every fold is materialised in
# `splits`, but only the first one is used below.
tscv = TimeSeriesSplit(n_splits=5)
splits = list(tscv.split(X, y))

# Positional row indices of the first fold's train and test portions.
first_train_idx, first_test_idx = splits[0]

X_train = X.iloc[first_train_idx]
X_test = X.iloc[first_test_idx]
y_train = y.iloc[first_train_idx]
y_test = y.iloc[first_test_idx]

# Sanity check: shapes of the four pieces.
X_train.shape, X_test.shape, y_train.shape, y_test.shape




((351, 7), (351, 7), (351,), (351,))

In [18]:
# Size of the last axis of the training features, i.e. the number of feature
# columns in the first fold.
X_train.shape[-1]

7

In [None]:
# (Scratch cell: a stray bare `f` was removed here. The name is never defined
# earlier in the notebook, so it raised NameError on Restart & Run All.)

In [None]:
model = Sequential()