
# Energy Consumption Forecasting — Method Comparison

**Goal:** Compare regression approaches on an hourly energy consumption problem:

1. Ordinary Linear Regression (OLS)  
2. Regularization (Ridge / Lasso / Elastic Net)  
3. Feature Expansion (polynomial features)  
4. Random Forest Regression  
5. GLM (Gamma) Regression  
6. Quantile Regression (P10, P50, P90)

We generate a realistic **synthetic hourly dataset** with seasonal, temperature, and calendar effects, then fit, evaluate, and visualize each method.


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_pinball_loss

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, QuantileRegressor, GammaRegressor
from sklearn.ensemble import RandomForestRegressor

# Seed NumPy's global RNG so the random draws made later in the notebook are reproducible.
np.random.seed(42)


def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Both arguments are forwarded unchanged to ``mean_squared_error``; the
    square root of that value is returned as a plain Python ``float``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))


## 1) Create a synthetic hourly dataset (1 year)

- **Seasonality**: daily (hour-of-day) and annual (day-of-year) cycles  
- **Weather**: temperature affects usage in a U-shaped way (heating/cooling)  
- **Calendar**: weekend/holiday effects  
- **Noise**: larger variance at extreme temperatures (heteroscedastic)


In [None]:
import datetime as dt

# One year of hourly data (2024-01-01 00:00 through 2024-12-31 23:00, inclusive).
start = pd.Timestamp("2024-01-01 00:00:00")
end   = pd.Timestamp("2024-12-31 23:00:00")
# Lowercase "h" is the hourly alias; the uppercase "H" spelling is deprecated in recent pandas.
date_range = pd.date_range(start, end, freq="h")
n = len(date_range)

# Calendar features derived from the timestamp.
df = pd.DataFrame({"timestamp": date_range})
df["hour"] = df["timestamp"].dt.hour
df["dow"]  = df["timestamp"].dt.dayofweek  # Mon=0
df["doy"]  = df["timestamp"].dt.dayofyear
df["is_weekend"] = (df["dow"] >= 5).astype(int)

# Simple holiday approximation: a fixed list of dates, matched on the calendar day.
holidays = [
    "2024-01-01","2024-01-15","2024-02-19","2024-05-27",
    "2024-07-04","2024-09-02","2024-11-28","2024-12-25"
]
holidays = pd.to_datetime(holidays)
df["is_holiday"] = df["timestamp"].dt.normalize().isin(holidays).astype(int)

# Temperature: annual sinusoid + daily sinusoid + Gaussian noise drawn per hour.
doy_rad = 2*np.pi*df["doy"]/365.25
hour_rad = 2*np.pi*df["hour"]/24.0
# NOTE: sin(doy_rad) peaks near day ~91 and bottoms out near day ~274, so with
# this phase the annual maximum falls in spring rather than mid-summer.
base_temp = 60 + 20*np.sin(doy_rad)
temp = base_temp + 3*np.sin(hour_rad - np.pi/3) + np.random.normal(0, 2.5, size=n)
df["temp_F"] = temp

# Hourly load base pattern: a 24-hour cycle plus a 12-hour harmonic. With these
# phases the curve peaks in the early afternoon and is lowest late in the evening.
load_daily = 1.0 + 0.6*np.sin(hour_rad - np.pi/2) + 0.3*np.sin(2*hour_rad)
# Weekend and holiday effects (multiplicative reductions of 8% and 12%).
weekend_effect = np.where(df["is_weekend"]==1, -0.08, 0.0)
holiday_effect = np.where(df["is_holiday"]==1, -0.12, 0.0)

# U-shaped temperature effect around 68F: quadratic in the deviation from 68.
temp_dev = (df["temp_F"] - 68.0)
temp_u   = 0.004 * (temp_dev**2)

# Compose expected mean load (kWh) and add heteroscedastic noise whose standard
# deviation grows linearly with the absolute deviation from 68F.
mu = 30 * load_daily * (1 + weekend_effect + holiday_effect) * (1 + temp_u)
sigma = 1.0 + 0.12 * (np.abs(temp_dev)/10.0)
noise = np.random.normal(0, sigma, size=n)

# Floor at 0.5 so the target stays strictly positive.
df["consumption_kwh"] = np.maximum(0.5, mu + noise)

# Lags and rolling mean. Every feature must be computable from *past* values
# only: the rolling window is applied to the series shifted by one hour, so it
# covers t-24..t-1 and never includes the target at time t (which would leak
# the label into the feature matrix).
df["lag1"]  = df["consumption_kwh"].shift(1)
df["lag24"] = df["consumption_kwh"].shift(24)
df["roll24_mean"] = df["consumption_kwh"].shift(1).rolling(24, min_periods=1).mean()

# Fourier time features
df["sin_hour"] = np.sin(hour_rad)
df["cos_hour"] = np.cos(hour_rad)
df["sin_doy"]  = np.sin(doy_rad)
df["cos_doy"]  = np.cos(doy_rad)

# Drop the leading rows whose lag features are undefined (the first 24 hours).
df = df.dropna().reset_index(drop=True)
df.head()


## 2) Time-based split (train vs. test)

We hold out the **last 6 weeks** as the test set to mimic forward-looking evaluation.


In [None]:
# Target column and the feature groups; the numeric/categorical grouping
# decides which transformer each feature is routed to below.
y_col = "consumption_kwh"
num_cols = ["temp_F","lag1","lag24","roll24_mean","sin_hour","cos_hour","sin_doy","cos_doy"]
cat_cols = ["is_weekend","is_holiday"]
X_cols = ["temp_F","is_weekend","is_holiday","lag1","lag24","roll24_mean",
          "sin_hour","cos_hour","sin_doy","cos_doy"]

# Chronological split: everything except the final block of rows is used for
# training, and the final block is the holdout.
holdout_hours = 24 * 42  # ~6 weeks
train = df.head(-holdout_hours).copy()
test  = df.tail(holdout_hours).copy()

X_train, y_train = train[X_cols], train[y_col]
X_test,  y_test  = test[X_cols],  test[y_col]

# Shared preprocessing recipe: standardize numeric features and one-hot encode
# the binary flags (a single column each, via drop="if_binary").
pre = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(drop="if_binary"), cat_cols),
    ]
)


## 3) Fit models

Fit OLS, regularized linear models, feature-expanded linear model, Random Forest, GLM (Gamma), and Quantile Regressors (P10/P50/P90).


In [None]:
# Imported here (cell-local) so every pipeline can own an independent copy of
# the preprocessing recipe instead of sharing one mutable transformer object.
from sklearn.base import clone


def _build_pipe(estimator, preprocessor=None):
    """Return a Pipeline of (fresh preprocessor clone -> estimator).

    ``preprocessor`` defaults to the notebook-level ``pre``. Cloning gives each
    pipeline its own unfitted transformer, so fitting one model can never
    overwrite the fitted preprocessing state used by another.
    """
    base = pre if preprocessor is None else preprocessor
    return Pipeline([("pre", clone(base)), ("est", estimator)])


models = {}

# 1) OLS
models["ols"] = _build_pipe(LinearRegression())

# 2) Regularization
models["ridge"] = _build_pipe(Ridge(alpha=1.0, random_state=42))
models["lasso"] = _build_pipe(Lasso(alpha=0.01, random_state=42, max_iter=10000))
models["enet"]  = _build_pipe(ElasticNet(alpha=0.01, l1_ratio=0.5, random_state=42, max_iter=10000))

# 3) Feature Expansion (polynomial on numeric features only)
poly_pre = ColumnTransformer([
    ("num", Pipeline([("sc", StandardScaler()), ("poly", PolynomialFeatures(degree=2, include_bias=False))]), num_cols),
    ("cat", OneHotEncoder(drop="if_binary"), cat_cols)
])
models["poly2_ols"] = _build_pipe(LinearRegression(), poly_pre)

# 4) Random Forest
models["rf"] = _build_pipe(RandomForestRegressor(
    n_estimators=300, max_depth=12, min_samples_leaf=2, random_state=42, n_jobs=-1))

# 5) GLM (Gamma)
models["gamma"] = _build_pipe(GammaRegressor(alpha=0.01, max_iter=5000))

# 6) Quantile (P10/P50/P90)
for tau in [0.10, 0.50, 0.90]:
    # round(), not int(): tau*100 can land just below the intended integer
    # (e.g. 0.29*100 == 28.999...), and int() would truncate it to the wrong key.
    models[f"qr_{round(tau * 100)}"] = _build_pipe(
        QuantileRegressor(quantile=tau, alpha=0.0005, solver="highs"))

# Fit & predict
for name, pipe in models.items():
    pipe.fit(X_train, y_train)

preds = {name: pipe.predict(X_test) for name, pipe in models.items()}
list(preds.keys())


## 4) Evaluation metrics

- For mean-focused models: **RMSE**, **MAE**, **R²**  
- For quantile models: **pinball loss** at respective τ


In [None]:
def summarize_metrics(preds_dict, y_true=None):
    """Build a per-model metrics table from a dict of predictions.

    Parameters
    ----------
    preds_dict : dict
        Maps model name -> predictions. Names starting with ``"qr_"`` are
        treated as quantile models; the integer after the underscore is the
        quantile in percent (``"qr_90"`` -> tau = 0.90).
    y_true : array-like, optional
        Ground-truth targets. Defaults to the notebook-level ``y_test`` so
        existing ``summarize_metrics(preds)`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per model, sorted by model name, with columns ``model``,
        ``pinball_loss``, ``RMSE``, ``MAE`` and ``R2``. Quantile models get
        only ``pinball_loss``; all other models get only RMSE/MAE/R2. Metrics
        that do not apply are NaN. An empty ``preds_dict`` yields an empty
        frame with the same columns.
    """
    if y_true is None:
        y_true = y_test  # resolved at call time, not at definition time

    columns = ["model", "pinball_loss", "RMSE", "MAE", "R2"]
    rows = []
    for name, yhat in preds_dict.items():
        # Start every metric as NaN and fill in only the ones that apply.
        row = dict.fromkeys(columns, np.nan)
        row["model"] = name
        if name.startswith("qr_"):
            tau = int(name.split("_")[1]) / 100.0
            row["pinball_loss"] = mean_pinball_loss(y_true, yhat, alpha=tau)
        else:
            row["RMSE"] = rmse(y_true, yhat)  # reuse the notebook's shared helper
            row["MAE"]  = float(mean_absolute_error(y_true, yhat))
            row["R2"]   = float(r2_score(y_true, yhat))
        rows.append(row)
    # Passing `columns` keeps the schema stable even when `rows` is empty,
    # so the sort below cannot fail on a missing "model" column.
    table = pd.DataFrame(rows, columns=columns)
    return table.sort_values("model").reset_index(drop=True)

metrics_df = summarize_metrics(preds)
metrics_df


## 5) Visual — Actual vs Predicted (OLS)


In [None]:
# Overlay the OLS forecast on the holdout actuals.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(test["timestamp"], y_test, label="Actual")
ax.plot(test["timestamp"], preds["ols"], label="OLS")
ax.set_xlabel("Time")
ax.set_ylabel("kWh")
ax.set_title("Actual vs Predicted — OLS")
ax.legend()
fig.tight_layout()
plt.show()


## 6) Visual — Actual vs Predicted (Random Forest)


In [None]:
# Overlay the Random Forest forecast on the holdout actuals.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(test["timestamp"], y_test, label="Actual")
ax.plot(test["timestamp"], preds["rf"], label="Random Forest")
ax.set_xlabel("Time")
ax.set_ylabel("kWh")
ax.set_title("Actual vs Predicted — Random Forest")
ax.legend()
fig.tight_layout()
plt.show()


## 7) Visual — Actual vs Predicted (Gamma GLM)


In [None]:
# Overlay the Gamma GLM forecast on the holdout actuals.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(test["timestamp"], y_test, label="Actual")
ax.plot(test["timestamp"], preds["gamma"], label="Gamma GLM")
ax.set_xlabel("Time")
ax.set_ylabel("kWh")
ax.set_title("Actual vs Predicted — Gamma GLM")
ax.legend()
fig.tight_layout()
plt.show()


## 8) Visual — Quantile band (P10–P90) with Median (P50)


In [None]:
# Pull the three quantile forecasts out of the prediction dict.
q10, q50, q90 = (preds[key] for key in ("qr_10", "qr_50", "qr_90"))

# Order everything by timestamp so the median line and the shaded band are
# drawn left to right.
t = test["timestamp"].to_numpy()
order = np.argsort(t)
t_sorted = t[order]

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(t_sorted, q50[order], label="Median (P50)")
ax.fill_between(t_sorted, q10[order], q90[order], alpha=0.2, label="P10–P90 band")
ax.plot(t_sorted, y_test.to_numpy()[order], label="Actual")
ax.set_xlabel("Time")
ax.set_ylabel("kWh")
ax.set_title("Quantile Regression — Median with P10–P90 band")
ax.legend()
fig.tight_layout()
plt.show()


## 9) Visual — Residuals vs Temperature (Random Forest)


In [None]:
# Residuals are actual minus predicted, so positive values mean under-prediction.
resid_rf = y_test - preds["rf"]

fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(test["temp_F"], resid_rf, alpha=0.35, s=18)
ax.axhline(0, linewidth=1)  # zero-error reference line
ax.set_xlabel("Temp (°F)")
ax.set_ylabel("Residual (Actual − Pred)")
ax.set_title("Residuals vs Temperature — RF")
fig.tight_layout()
plt.show()


## 10) Calibration — Empirical coverage for quantiles


In [None]:
def empirical_coverage(y, qhat):
    """Return the fraction of observations at or below their predicted quantile.

    ``y`` and ``qhat`` are converted to NumPy arrays and compared element-wise
    with ``<=``; the mean of the resulting booleans is returned as a ``float``.
    """
    observed = np.asarray(y)
    predicted = np.asarray(qhat)
    at_or_below = np.less_equal(observed, predicted)
    return float(at_or_below.mean())

# Share of holdout observations falling at or below the P50 and P90 forecasts.
cov50, cov90 = (empirical_coverage(y_test, q) for q in (q50, q90))

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(["τ=0.50", "τ=0.90"], [cov50, cov90])
# Dashed reference lines at the nominal quantile levels.
for nominal in (0.5, 0.9):
    ax.axhline(nominal, linestyle="--", linewidth=1)
ax.set_ylim(0, 1)
ax.set_ylabel("P(y ≤ q̂τ)")
ax.set_title("Empirical Coverage — Quantile Regression")
fig.tight_layout()
plt.show()


## 11) Rolling RMSE — stability over time (OLS vs RF)


In [None]:
window = 24*7  # one week window


def rolling_rmse(y_true, y_pred, window):
    """Compute a trailing-window RMSE along two equally long 1-D series.

    Parameters
    ----------
    y_true, y_pred : array-like, 1-D, same length
        Observed and predicted values (converted to float arrays).
    window : int
        Number of consecutive points in each window; must be >= 1.

    Returns
    -------
    numpy.ndarray
        Float array with the same length as the inputs. Element ``i`` is the
        RMSE over positions ``i-window+1 .. i``; the first ``window-1``
        elements are NaN because no full window is available yet. If
        ``window`` exceeds the series length, every element is NaN.

    Raises
    ------
    ValueError
        If the inputs are not 1-D, differ in shape, or ``window < 1``.
    """
    y = np.asarray(y_true, dtype=float)
    yhat = np.asarray(y_pred, dtype=float)
    if y.ndim != 1 or y.shape != yhat.shape:
        raise ValueError(
            f"y_true and y_pred must be 1-D with equal length; got {y.shape} and {yhat.shape}"
        )
    if window < 1:
        raise ValueError(f"window must be >= 1; got {window}")

    out = np.full(y.shape, np.nan, dtype=float)
    if window <= y.size:
        sq_err = (y - yhat) ** 2
        # One row per full window; a strided view, so no data is copied and
        # the per-window mean is computed in native code instead of a Python loop.
        windows = np.lib.stride_tricks.sliding_window_view(sq_err, window)
        out[window - 1:] = np.sqrt(windows.mean(axis=-1))
    return out

# Trailing-window RMSE on the holdout for the two models being compared.
roll_rmse_ols = rolling_rmse(y_test, preds["ols"], window)
roll_rmse_rf  = rolling_rmse(y_test, preds["rf"],  window)

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(test["timestamp"], roll_rmse_ols, label="OLS (rolling RMSE)")
ax.plot(test["timestamp"], roll_rmse_rf,  label="RF (rolling RMSE)")
ax.set_xlabel("Time")
ax.set_ylabel("RMSE")
ax.set_title("Rolling RMSE (1-week window)")
ax.legend()
fig.tight_layout()
plt.show()


### Takeaways

- **OLS** provides a strong baseline but struggles with nonlinearities and varying variance.  
- **Regularization** stabilizes linear models when features are correlated or numerous.  
- **Feature Expansion** (polynomial, Fourier) can capture curvature without resorting to black-box models.  
- **Random Forest** models nonlinear interactions well, often reducing error in complex regimes.  
- **GLM (Gamma)** respects positive, heteroscedastic consumption and can improve calibration.  
- **Quantile Regression** gives **risk-aware** bands (P10/P90) for capacity planning and SLAs.
