In [None]:
! pip install numpy pandas matplotlib seaborn scikit-learn statsmodels xgboost lightgbm tensorflow keras

In [None]:
# load data
import pandas as pd

# Zipped, semicolon-separated text file; pandas infers the zip compression
# from the ".zip" suffix of the URL.
url = "https://archive.ics.uci.edu/static/public/235/individual+household+electric+power+consumption.zip"

# Date and Time are read as plain strings and combined below with an explicit
# format, instead of the deprecated `infer_datetime_format` / dict-style
# `parse_dates` options. 'nan' and '?' cells are treated as missing values.
df = pd.read_csv(
    url,
    sep=";",
    dtype={"Date": str, "Time": str},
    low_memory=False,
    na_values=["nan", "?"],
)

# The explicit day-first format removes any day/month ambiguity and avoids
# per-element format guessing.
df["datetime"] = pd.to_datetime(df["Date"] + " " + df["Time"], format="%d/%m/%Y %H:%M:%S")
df = df.drop(columns=["Date", "Time"]).set_index("datetime")
print(df.head())

In [None]:
# inspect features

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Summary statistics (count, mean, std, quartiles, ...) of the columns.
print(df.describe())

In [None]:
# visualize data
import matplotlib.pyplot as plt

# One panel per selected column (subplots=True), drawn against the frame's index.
df.plot(
    y=["Global_active_power", "Voltage"],
    subplots=True,
    figsize=(10, 8),
)
plt.show()

In [40]:
# preprocess data
from sklearn.preprocessing import MinMaxScaler

# Imputation means and min/max scaling bounds are estimated on the leading 80%
# of the rows only -- the same fraction that the train/test split assigns to
# training -- so that no statistics from the held-out rows leak into the
# features the models are trained on. As a consequence, scaled values in the
# held-out rows may fall slightly outside [0, 1].
n_fit_rows = int(len(df) * 0.8)

# fill missing values (column means of the fitting rows)
df.fillna(df.iloc[:n_fit_rows].mean(), inplace=True)

# normalize data
scaler = MinMaxScaler()
scaler.fit(df.iloc[:n_fit_rows])
scaled_df = pd.DataFrame(scaler.transform(df), columns=df.columns, index=df.index)

# create lag features: the target value 1, 2 and 3 rows earlier
for lag in range(1, 4):
    scaled_df[f"lag_{lag}"] = scaled_df["Global_active_power"].shift(lag)

# shift() leaves NaN in the leading rows that have no complete lag history.
scaled_df.dropna(inplace=True)

In [41]:
# split data into train and test sets

# Positional 80/20 split without shuffling: the leading rows form the training
# set and the trailing rows form the test set.
target = "Global_active_power"
train_size = int(0.8 * len(scaled_df))
train = scaled_df.iloc[:train_size]
test = scaled_df.iloc[train_size:]

# Features are every column except the target; the target column is the label.
X_train = train.drop(columns=[target])
y_train = train[target]
X_test = test.drop(columns=[target])
y_test = test[target]

In [42]:
# choose a forecasting model like XGBoost or LightGBM
import xgboost as xgb

# Gradient-boosted regressor with the library's default hyperparameters;
# fit() returns the fitted estimator, so construction and training are chained.
model = xgb.XGBRegressor().fit(X_train, y_train)

# Predictions for every row of the test feature matrix.
predictions_xgb = model.predict(X_test)

In [None]:
# or choose a deep learning model like LSTM

import numpy as np

def create_sequences(data, target, sequence_length=3):
    """Build sliding-window samples for sequence models.

    Window ``i`` holds rows ``i .. i + sequence_length - 1`` of ``data`` and is
    paired with the value of ``target`` at position ``i + sequence_length``,
    i.e. the value immediately following the window.

    Parameters
    ----------
    data : pandas.DataFrame or array-like
        Feature rows, one time step per row.
    target : pandas.Series or array-like
        Values to predict, positionally aligned with ``data``.
    sequence_length : int, default 3
        Number of consecutive rows per window. Expected to be a positive integer.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with shape ``(n_windows, sequence_length, n_features)`` and ``y``
        with shape ``(n_windows,)``, where
        ``n_windows = max(len(data) - sequence_length, 0)``. When the input is
        too short to form a window, correctly shaped empty arrays are returned.

    Raises
    ------
    IndexError
        If ``target`` is shorter than ``data`` while at least one window exists.
    """
    values = np.asarray(data)
    targets = np.asarray(target)
    n_windows = len(values) - sequence_length

    if n_windows <= 0:
        # Keep the trailing dimensions so callers can still read X.shape[1:].
        empty_shape = (0, sequence_length) + values.shape[1:]
        return (
            np.empty(empty_shape, dtype=values.dtype),
            np.empty((0,), dtype=targets.dtype),
        )

    if len(targets) < len(values):
        raise IndexError("target must be at least as long as data")

    # sliding_window_view appends the window axis last:
    # (len - sequence_length + 1, n_features, sequence_length).
    # The final window is dropped because no target value follows it.
    windows = np.lib.stride_tricks.sliding_window_view(values, sequence_length, axis=0)
    # Move the window axis next to the sample axis and copy, so the result is
    # an ordinary writable array rather than a read-only strided view.
    X = np.array(np.moveaxis(windows[:n_windows], -1, 1))
    y = np.array(targets[sequence_length:sequence_length + n_windows])
    return X, y

# Window the train and test partitions separately, so every window (and the
# target that follows it) is drawn from a single partition.
sequence_length = 3
X_train_seq, y_train_seq = create_sequences(
    data=train, target=train[target], sequence_length=sequence_length
)
X_test_seq, y_test_seq = create_sequences(
    data=test, target=test[target], sequence_length=sequence_length
)

# build the LSTM model
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Seed the Python, NumPy and backend random generators so that weight
# initialisation and batch shuffling -- and with them the reported metrics --
# are repeatable between runs (as far as the backend allows).
keras.utils.set_random_seed(42)

# A single LSTM layer reads windows of shape (time steps, features); the Dense
# layer maps its output to one predicted value per window.
model = Sequential([
    LSTM(50, activation='relu', input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])),
    Dense(1)
])
model.compile(optimizer='adam', loss='mse')
model.fit(X_train_seq, y_train_seq, epochs=20, batch_size=32)

# One prediction per test window.
predictions_lstm = model.predict(X_test_seq)

In [None]:
# evaluate the models
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Sanity-check the array shapes before scoring.
print(X_test.shape, y_test.shape, predictions_xgb.shape, predictions_lstm.shape)

# XGBoost is scored against the full test target.
mse_xgb, mae_xgb = (
    mean_squared_error(y_test, predictions_xgb),
    mean_absolute_error(y_test, predictions_xgb),
)
print(f"XGBoost -> MSE: {mse_xgb}, MAE: {mae_xgb}")

# The LSTM is scored against the windowed targets it was asked to predict.
mse_lstm, mae_lstm = (
    mean_squared_error(y_test_seq, predictions_lstm),
    mean_absolute_error(y_test_seq, predictions_lstm),
)
print(f"LSTM    -> MSE: {mse_lstm}, MAE: {mae_lstm}")

In [None]:
# visualize the predictions

# The LSTM emits no prediction for the leading test rows that only serve as
# history for its first window: prediction j is the forecast for test row
# j + sequence_length. Its predictions therefore belong to the LAST
# len(predictions_lstm) timestamps of y_test, not the first ones.
lstm_offset = len(y_test) - len(predictions_lstm)
y_test_aligned = y_test.iloc[lstm_offset:]

# Convert predictions to Pandas Series carrying the timestamps they refer to
predictions_xgb_series = pd.Series(predictions_xgb, index=y_test.index)
predictions_lstm_series = pd.Series(predictions_lstm.flatten(), index=y_test_aligned.index)

# Select a specific time range, since the data is too large to plot
start = 0
end = 5000
# Positions of the same test rows inside the (shorter) LSTM series.
lstm_start = max(start - lstm_offset, 0)
lstm_end = max(end - lstm_offset, 0)

plt.figure(figsize=(15, 6))
plt.plot(y_test.iloc[start:end], label="actual", alpha=0.7)
plt.plot(predictions_xgb_series.iloc[start:end], label="XGBoost", alpha=0.7)
plt.plot(predictions_lstm_series.iloc[lstm_start:lstm_end], label="LSTM", alpha=0.7)
plt.title("Actual vs. predicted Global_active_power")
plt.xlabel("datetime")
plt.ylabel("Global_active_power (scaled)")
plt.legend()
plt.show()