<a href="https://colab.research.google.com/github/MehrdadDastouri/energy_consumption_xgboost/blob/main/energy_consumption_xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import matplotlib.pyplot as plt

# Read the raw records into a DataFrame.
# Any energy consumption dataset (e.g., from Kaggle or the UCI Machine Learning
# Repository) can be used, provided it is saved as "energy_data.csv" in the
# working directory.
data = pd.read_csv("energy_data.csv")
print("Dataset Preview:", data.head(), sep="\n")

# Report how many missing cells each column contains
missing_per_column = data.isna().sum()
print("\nChecking for missing values:", missing_per_column, sep="\n")

# Keep only the rows that are complete in every column
data = data.dropna(axis=0, how="any")

# Features and target
# Assuming 'Energy_Consumption' is the target column; every other column is
# treated as a feature.
X = data.drop(columns=["Energy_Consumption"])
y = data["Energy_Consumption"]

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# held-out rows contribute to the mean/variance used for training (data
# leakage), so the test set would no longer be truly unseen.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features: learn the statistics from the training rows only,
# then apply that same fitted transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix scaled with the training statistics; kept under its
# original name for any later code that expects it.
X_scaled = scaler.transform(X)

# Convert data into DMatrix format (the input type used by xgb.train)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Booster configuration passed to xgb.train
params = dict(
    objective="reg:squarederror",  # regression objective
    max_depth=6,                   # maximum depth of a tree
    eta=0.1,                       # learning rate
    subsample=0.8,                 # subsample ratio of the training rows
    colsample_bytree=0.8,          # subsample ratio of columns per tree
    seed=42,
)

# Fit the booster for a fixed number of boosting rounds
num_rounds = 100
print("\nTraining the XGBoost model...")
model = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_rounds)

# Predict on the held-out DMatrix
y_pred = model.predict(dtest)

# Score the predictions against the true test targets
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Emit the whole report in a single print call, one metric per line
report_lines = [
    "\nModel Performance:",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R-squared (R2): {r2:.2f}",
]
print("\n".join(report_lines))

# Scatter of predictions against the true values, with the y = x reference
# line (a perfect model would put every point on it)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.6, color="blue")

lowest, highest = y_test.min(), y_test.max()
ax.plot([lowest, highest], [lowest, highest], "--", color="red", linewidth=2)

ax.set_xlabel("Actual Energy Consumption")
ax.set_ylabel("Predicted Energy Consumption")
ax.set_title("Actual vs Predicted Energy Consumption")
plt.show()

# Feature importance
# With importance_type="weight", XGBoost documents the score as the number of
# times a feature is used to split the data across all trees.
importance = model.get_score(importance_type="weight")

# A booster trained on a plain NumPy array carries no real column names and
# reports positional ones ("f0", "f1", ...). Map those back to the DataFrame's
# column names so the chart is readable. If the booster already has real
# feature names, leave the keys untouched.
default_names = [f"f{i}" for i in range(X.shape[1])]
booster_names = getattr(model, "feature_names", None)
if booster_names is None or list(booster_names) == default_names:
    readable_names = dict(zip(default_names, map(str, X.columns)))
    importance = {
        readable_names.get(name, name): score
        for name, score in importance.items()
    }

# Build the table from a concrete list of (feature, score) pairs rather than
# from dict views, then rank from most to least important.
importance_df = pd.DataFrame(
    list(importance.items()), columns=["Feature", "Importance"]
).sort_values(by="Importance", ascending=False)

# Plot feature importance
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance_df["Feature"], importance_df["Importance"], color="skyblue")
ax.set_xlabel("Importance")
ax.set_ylabel("Features")
ax.set_title("Feature Importance")
ax.invert_yaxis()  # barh draws the first row at the bottom; flip so the top-ranked feature is on top
plt.show()