In [2]:
# Task 3: Energy Consumption Time Series Forecasting
# Using ARIMA and XGBoost (without Prophet)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Machine Learning Models
from statsmodels.tsa.arima.model import ARIMA
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# -------------------------------
# Step 1: Load and Clean Dataset
# -------------------------------

df = pd.read_csv(
    'household_power_consumption.csv',  # change path if needed
    sep=';', 
    low_memory=False, 
    na_values=['?']  # treat '?' as missing
)

# Clean column names
df.columns = df.columns.str.strip()

# Combine Date and Time into Datetime
df['Datetime'] = pd.to_datetime(
    df['Date'].str.strip() + ' ' + df['Time'].str.strip(),
    dayfirst=True, 
    errors='coerce'
)

# Drop invalid datetime rows
df = df.dropna(subset=['Datetime'])

# Convert target column to numeric
df['Global_active_power'] = pd.to_numeric(df['Global_active_power'], errors='coerce')

# Drop missing target values
df = df.dropna(subset=['Global_active_power'])

# Set datetime as index
df.set_index('Datetime', inplace=True)

# Resample hourly
ts = df['Global_active_power'].resample('H').mean()

# -------------------------------
# Step 2: Train-Test Split
# -------------------------------

train_size = int(len(ts) * 0.8)
train, test = ts[:train_size], ts[train_size:]

# -------------------------------
# Step 3: Forecast using ARIMA
# -------------------------------

# Fit ARIMA model (simple order, can tune)
arima_model = ARIMA(train, order=(5,1,0))
arima_fit = arima_model.fit()

# Forecast
arima_forecast = arima_fit.forecast(steps=len(test))
arima_forecast.index = test.index

# Evaluation
arima_rmse = np.sqrt(mean_squared_error(test, arima_forecast))
print(f"ARIMA RMSE: {arima_rmse:.3f}")

# Plot ARIMA
plt.figure(figsize=(12,5))
plt.plot(train, label='Train')
plt.plot(test, label='Test')
plt.plot(arima_forecast, label='ARIMA Forecast', color='red')
plt.title('ARIMA Forecast of Energy Consumption')
plt.xlabel('Datetime')
plt.ylabel('Global Active Power')
plt.legend()
plt.show()

# -------------------------------
# Step 4: Forecast using XGBoost (GradientBoostingRegressor)
# -------------------------------

# Feature Engineering: time-based features
df_ml = ts.reset_index()
df_ml['hour'] = df_ml['Datetime'].dt.hour
df_ml['day'] = df_ml['Datetime'].dt.day
df_ml['weekday'] = df_ml['Datetime'].dt.weekday

# Features and target
X = df_ml[['hour','day','weekday']]
y = df_ml['Global_active_power']

# Train-test split
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Fit Gradient Boosting Regressor
xgb_model = GradientBoostingRegressor()
xgb_model.fit(X_train, y_train)

# Predict
y_pred = xgb_model.predict(X_test)

# Evaluation
xgb_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"XGBoost RMSE: {xgb_rmse:.3f}")

# Plot XGBoost Forecast
plt.figure(figsize=(12,5))
plt.plot(y_test.values, label='Actual')
plt.plot(y_pred, label='XGBoost Predicted', color='red')
plt.title('XGBoost Forecast of Energy Consumption')
plt.xlabel('Datetime')
plt.ylabel('Global Active Power')
plt.legend()
plt.show()


KeyError: 'Date'