
# Autoregressive (AR) Modeling for Power Consumption

This notebook builds an end-to-end AR time-series pipeline on *differenced* power consumption data:
- AIC-based lag selection (p = 0..14)
- Model fitting with the optimal AR(p)
- Residual diagnostics using the Ljung–Box test
- Forecasting on test data
- Evaluation using MAE, RMSE, and MAPE
- Visual comparison of Train, Actual, and Forecast


In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from statsmodels.tsa.ar_model import AutoReg
from statsmodels.stats.diagnostic import acorr_ljungbox
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [None]:

# Load dataset
# NOTE(review): the absolute path is environment-specific; change DATA_PATH when
# running this notebook somewhere else.
DATA_PATH = "/mnt/data/ML471_S2_Datafile_Concept(in).csv"
TRAIN_FRACTION = 0.8  # share of the differenced series used for training

df = pd.read_csv(DATA_PATH)

# Auto-detect the power column: the first column whose name contains "power"
# (case-insensitive) wins.
power_col = next((c for c in df.columns if "power" in c.lower()), None)

if power_col is None:
    raise ValueError("Power consumption column not found. Please rename it to include 'Power'.")

series = df[power_col].astype(float)

# First-order differencing. diff() leaves a NaN in the first row and dropna()
# removes it, which would leave labels starting at 1 instead of 0. Resetting
# to a clean 0-based RangeIndex keeps label-based and position-based indexing
# identical, so values forecast by position line up with `test` by label.
power_diff = series.diff().dropna().reset_index(drop=True)

# Chronological train/test split (no shuffling: order matters for time series)
split = int(len(power_diff) * TRAIN_FRACTION)
train = power_diff.iloc[:split]
test = power_diff.iloc[split:]

len(train), len(test)


In [None]:

# AIC-based grid search for AR(p), p = 0..14
MAX_LAG = 14

aic_scores = {}
fit_errors = {}  # lag order -> exception raised while fitting that order

for p in range(0, MAX_LAG + 1):
    try:
        # hold_back=MAX_LAG makes every candidate drop the same initial
        # observations, so all models are estimated on the same sample and
        # their AIC values are comparable. Without it each AR(p) uses a
        # different effective sample (nobs - p).
        # NOTE(review): `old_names` is documented as deprecated in recent
        # statsmodels releases - confirm against the installed version.
        model = AutoReg(train, lags=p, hold_back=MAX_LAG, old_names=False).fit()
        aic_scores[p] = model.aic
    except Exception as exc:
        # Best effort: a failing lag order is recorded as NaN and skipped,
        # but the reason is kept so it can be inspected or reported.
        aic_scores[p] = np.nan
        fit_errors[p] = exc

# If no candidate could be fitted, fail here with the underlying cause rather
# than letting a later cell trip over an empty table.
if len(fit_errors) == len(aic_scores):
    raise RuntimeError(
        f"AutoReg failed for every lag order 0..{MAX_LAG}; first error: {fit_errors[0]!r}"
    ) from fit_errors[0]

# One row per candidate order, best (lowest AIC) first; NaN rows sort last.
aic_df = pd.DataFrame({
    "p": list(aic_scores.keys()),
    "AIC": list(aic_scores.values())
}).sort_values("AIC")

aic_df


In [None]:

# Choose the lag order from the first row of the AIC table that has a usable
# score. This relies on aic_df already being sorted by ascending AIC.
scored_candidates = aic_df.dropna()
best_p = int(scored_candidates["p"].iloc[0])
best_p


In [None]:

# Build the AR specification with the selected lag order on the training
# data, estimate it, and display the fitted-model summary.
ar_spec = AutoReg(train, lags=best_p, old_names=False)
ar_model = ar_spec.fit()
ar_model.summary()


In [None]:

# Residual diagnostics - Ljung-Box Test
# The residuals come from a fitted AR(best_p) model, so the test must look
# beyond lag best_p and subtract the estimated AR terms from the degrees of
# freedom (model_df). Testing only lag 1 with model_df=0 is close to vacuous.
# The lag is capped at len(resid) - 1 so it never reaches the sample size.
# If the capped lag is <= best_p the p-value is not defined (presumably
# reported as NaN by statsmodels - confirm for the installed version).
resid = ar_model.resid
lb_lag = min(best_p + 10, len(resid) - 1)
lb = acorr_ljungbox(resid, lags=[lb_lag], model_df=best_p, return_df=True)
lb


In [None]:

# Forecast on test horizon
# start/end are positions counted from the beginning of the training sample,
# so this requests len(test) out-of-sample steps right after the last
# training observation.
start = len(train)
end = start + len(test) - 1
forecast = ar_model.predict(start=start, end=end, dynamic=False)

# Evaluation metrics
# Compare by position, not by index label: the labels of `forecast` are
# assigned by statsmodels and may not match the labels of `test`, in which
# case pandas arithmetic would silently pair the wrong observations.
actual_values = test.to_numpy(dtype=float)
forecast_values = np.asarray(forecast, dtype=float)

mae = mean_absolute_error(actual_values, forecast_values)
# sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
# keyword is not accepted by recent scikit-learn releases.
rmse = float(np.sqrt(mean_squared_error(actual_values, forecast_values)))

# MAPE is undefined where the actual value is 0, which can happen on a
# differenced series. Those points are excluded; if none remain, MAPE is NaN.
nonzero = actual_values != 0
if nonzero.any():
    pct_errors = np.abs(
        (actual_values[nonzero] - forecast_values[nonzero]) / actual_values[nonzero]
    )
    mape = float(np.mean(pct_errors) * 100)
else:
    mape = float("nan")

mae, rmse, mape


In [None]:

# Visualization: Train, Actual, Forecast
# Training values are drawn against their implicit positions 0..len(train)-1;
# the held-out values and the forecast share the positions directly after them.
test_positions = range(len(train), len(train) + len(test))

plt.figure(figsize=(10, 5))
plt.plot(train.values, label="Train")
for curve, curve_label in ((test, "Actual"), (forecast, "Forecast")):
    plt.plot(test_positions, curve.values, linestyle='--', label=curve_label)
plt.title(f"AR Forecast with Auto-selected Parameters (p={best_p})")
plt.legend()
plt.show()
