# 1. Imports

In [None]:
# --- Notebook Cell 1: Imports & setup (run from repo root) ---
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path

# Local modules (load.py, preprocessing.py, models.py) are expected in the repo root,
# i.e. the current working directory; register that directory on sys.path only if missing.
repo_root = Path(".").resolve()
repo_root_entry = str(repo_root)
already_registered = repo_root_entry in sys.path
if not already_registered:
    sys.path.append(repo_root_entry)

import numpy as np
import pandas as pd

# Optional: raise the column-count and width limits pandas uses when rendering DataFrames.
for display_option, display_value in (
    ("display.max_columns", 200),
    ("display.width", 140),
):
    pd.set_option(display_option, display_value)

from load import Loading
from preprocessing import Preprocessor  # unified pipeline builder


# --- Notebook Cell 2: Load data via load.Loading (no duplicate date-features) ---

# The CSV is expected in the repo root, next to load.py.
DATA_FILE = "complete_dataset.csv"

# create_time_features=False: date features are left to preprocessing.DateCyclicalFeatures,
# so they are not generated twice.
# return_X_y=True: load_data() hands back the (X, y) pair directly.
loader = Loading(filepath=DATA_FILE, create_time_features=False, return_X_y=True)
X, y = loader.load_data()

# Preview the first three rows of the features and of the target, then report both shapes.
display(X.head(3), y.head(3))
print(f"Shapes -> X: {X.shape}, y: {y.shape}")

# --- Notebook Cell 3: Simple model with unified preprocessing (LinearRegression + TS CV) ---

# Preprocessor builds the combined preprocessing + model pipeline.
#   filepath=""            -> unused here, because the data is handed over directly below
#   add_date_features=True -> safe, since the loader did not create date features
#   corr_threshold=0.95    -> threshold for pruning highly correlated features
# NOTE(review): imputation / scaling / one-hot encoding are said to happen inside the
# Preprocessor as well; that lives in preprocessing.py and is not visible from this cell.
pre = Preprocessor(filepath="", add_date_features=True, corr_threshold=0.95)

# Hand over features plus target as one frame, with the target series renamed to the
# column name the preprocessor expects. Per the original note, set_data also ensures a
# DateTimeIndex and drops leaky columns (implemented in preprocessing.py — confirm there).
data_with_target = pd.concat([X, y.rename(pre.target_col)], axis=1)
pre.set_data(data_with_target)

# TimeSeriesSplit cross-validation with 5 splits; the estimator defaults to LinearRegression.
metrics = pre.evaluate(n_splits=5)

# Report each metric rounded to 4 decimals: numpy rounding for the per-fold arrays,
# builtin rounding for the scalar means.
cv_report_rows = (
    ("R² (per fold):", "r2_scores", np.round),
    ("R² (mean):     ", "r2_mean", round),
    ("RMSE (per fold):", "rmse_scores", np.round),
    ("RMSE (mean):    ", "rmse_mean", round),
)
for row_label, metric_key, round_fn in cv_report_rows:
    print(row_label, round_fn(metrics[metric_key], 4))

# Keep the fitted end-to-end pipeline around for later .predict() calls.
fitted_pipeline = metrics["pipeline"]

# 2. EDA

Unnamed: 0_level_0,demand,min_temperature,max_temperature,solar_exposure,rainfall,school_day,holiday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-01-01,99635.03,13.3,26.9,23.6,0.0,0,1
2015-01-02,129606.01,15.4,38.8,26.8,0.0,0,0
2015-01-03,142300.54,20.0,38.2,26.5,0.0,0,0


date
2015-01-01    25.633696
2015-01-02    33.138988
2015-01-03    34.564855
Name: RRP, dtype: float64

Shapes -> X: (2106, 7), y: (2106,)


🔎 CorrelationSelector dropped 6 features: [9, 11, 14, 15, 16, 17]
🔎 CorrelationSelector dropped 6 features: [9, 11, 14, 15, 16, 17]
🔎 CorrelationSelector dropped 6 features: [9, 11, 14, 15, 16, 17]
🔎 CorrelationSelector dropped 6 features: [9, 11, 14, 15, 16, 17]
🔎 CorrelationSelector dropped 6 features: [9, 11, 14, 15, 16, 17]
🔎 CorrelationSelector dropped 6 features: [9, 11, 14, 15, 16, 17]
R² (per fold): [-0.0982 -0.1169 -0.0401  0.0546 -0.1226]
R² (mean):      -0.0646
RMSE (per fold): [ 42.2434  33.559   78.2577 243.5744 167.7561]
RMSE (mean):     113.0781


In [22]:
# --- Notebook Cell 4 (optional): Predict on the latest horizon for a quick sanity check ---

# Simple holdout: the last h observations are the test window, everything before is training.
h = 30
train_rows, test_rows = slice(None, -h), slice(-h, None)
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Get a pipeline from the preprocessor (original note: LinearRegression by default),
# fit it on the training rows only, then predict the holdout rows.
pipe = pre.build_pipeline()
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# R² computed as (SS_tot - SS_res) / SS_tot on the holdout window.
ss_total = ((y_test - y_test.mean()) ** 2).sum()
ss_residual = ((y_test - y_pred) ** 2).sum()
holdout_r2 = float(ss_total - ss_residual) / float(ss_total)

# RMSE: square root of the mean squared holdout error.
holdout_rmse = float(np.sqrt(np.mean((y_test - y_pred) ** 2)))

print(f"Holdout R² (last {h}):  {holdout_r2:.4f}")
print(f"Holdout RMSE (last {h}): {holdout_rmse:.4f}")


🔎 CorrelationSelector dropped 6 features: [9, 11, 14, 15, 16, 17]
Holdout R² (last 30):  -7.3575
Holdout RMSE (last 30): 46.1146
