In [1]:
import pandas as pd
import chardet

with open('maharashtra_2024_daily_energy_forecasting_dataset.csv', 'rb') as f:
    raw_data = f.read()
result = chardet.detect(raw_data)
print(f"Detected encoding: {result['encoding']} with confidence {result['confidence']}")

# Load the dataset
df = pd.read_csv('maharashtra_2024_daily_energy_forecasting_dataset.csv', encoding=result['encoding'])

# Convert the 'Date' column to datetime objects
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

# Set the 'Date' column as the index
df.set_index('Date', inplace=True)

# Display the first few rows of the dataframe
print(df.head())

# Display information about the dataframe
print(df.info())

Detected encoding: ISO-8859-1 with confidence 0.73
           Day of Week  Season        City    Load Type  Temperature (Â°C)  \
Date                                                                         
2024-01-01      Monday  Winter      Nashik        Mixed          29.951667   
2024-01-02     Tuesday  Winter  Aurangabad        Mixed          29.735000   
2024-01-03   Wednesday  Winter        Pune  Residential          28.551250   
2024-01-04    Thursday  Winter      Nagpur   Industrial          30.765417   
2024-01-05      Friday  Winter      Nashik  Residential          29.030417   

            Humidity (%)  Rainfall (mm)  Cloud Cover (%)  Wind Speed (m/s)  \
Date                                                                         
2024-01-01          59.8            2.1             38.7          5.521667   
2024-01-02          61.3            8.8             35.2          5.666667   
2024-01-03          82.4            4.0             43.4          6.043750   
2024-01-04  

In [2]:
# Clean up the column name
df.rename(columns={'Temperature (Â°C)': 'Temperature (C)'}, inplace=True)

# Create lag features for Electric Load
df['load_lag_1'] = df['Electric Load (MW)'].shift(1)
df['load_lag_7'] = df['Electric Load (MW)'].shift(7)

# Create a lag feature for Temperature
df['temp_lag_1'] = df['Temperature (C)'].shift(1)

# Display the first 10 rows to see the new lag features
print(df.head(10))

# Display the last few rows to see how lag features are populated
print(df.tail())

           Day of Week  Season        City    Load Type  Temperature (C)  \
Date                                                                       
2024-01-01      Monday  Winter      Nashik        Mixed        29.951667   
2024-01-02     Tuesday  Winter  Aurangabad        Mixed        29.735000   
2024-01-03   Wednesday  Winter        Pune  Residential        28.551250   
2024-01-04    Thursday  Winter      Nagpur   Industrial        30.765417   
2024-01-05      Friday  Winter      Nashik  Residential        29.030417   
2024-01-06    Saturday  Winter  Aurangabad  Residential        29.453750   
2024-01-07      Sunday  Winter        Pune  Residential        29.453750   
2024-01-08      Monday  Winter      Nagpur   Industrial        29.659167   
2024-01-09     Tuesday  Winter        Pune   Industrial        29.392083   
2024-01-10   Wednesday  Winter      Nashik   Industrial        31.580417   

            Humidity (%)  Rainfall (mm)  Cloud Cover (%)  Wind Speed (m/s)  \
Date     

In [3]:
# Calculate rolling mean and standard deviation for Electric Load
df['load_rolling_mean_7'] = df['Electric Load (MW)'].rolling(window=7).mean()
df['load_rolling_std_7'] = df['Electric Load (MW)'].rolling(window=7).std()

# Calculate rolling mean for weather features
df['temp_rolling_mean_7'] = df['Temperature (C)'].rolling(window=7).mean()
df['wind_rolling_mean_7'] = df['Wind Speed (m/s)'].rolling(window=7).mean()
df['solar_rolling_mean_7'] = df['Solar Radiation (W/m²)'].rolling(window=7).mean()

# Display the first 10 rows to see the new rolling features
print(df.head(10))

           Day of Week  Season        City    Load Type  Temperature (C)  \
Date                                                                       
2024-01-01      Monday  Winter      Nashik        Mixed        29.951667   
2024-01-02     Tuesday  Winter  Aurangabad        Mixed        29.735000   
2024-01-03   Wednesday  Winter        Pune  Residential        28.551250   
2024-01-04    Thursday  Winter      Nagpur   Industrial        30.765417   
2024-01-05      Friday  Winter      Nashik  Residential        29.030417   
2024-01-06    Saturday  Winter  Aurangabad  Residential        29.453750   
2024-01-07      Sunday  Winter        Pune  Residential        29.453750   
2024-01-08      Monday  Winter      Nagpur   Industrial        29.659167   
2024-01-09     Tuesday  Winter        Pune   Industrial        29.392083   
2024-01-10   Wednesday  Winter      Nashik   Industrial        31.580417   

            Humidity (%)  Rainfall (mm)  Cloud Cover (%)  Wind Speed (m/s)  \
Date     

In [4]:
import numpy as np

# Create basic time-based features
df['dayofyear'] = df.index.dayofyear
df['quarter'] = df.index.quarter
df['is_weekend'] = (df.index.dayofweek >= 5).astype(int) # Saturday=5, Sunday=6

# Create cyclical features for day of year
day_of_year = df.index.dayofyear
df['dayofyear_sin'] = np.sin(2 * np.pi * day_of_year / 366)
df['dayofyear_cos'] = np.cos(2 * np.pi * day_of_year / 366)

# Display the dataframe with the new time-based features
print(df.head())

           Day of Week  Season        City    Load Type  Temperature (C)  \
Date                                                                       
2024-01-01      Monday  Winter      Nashik        Mixed        29.951667   
2024-01-02     Tuesday  Winter  Aurangabad        Mixed        29.735000   
2024-01-03   Wednesday  Winter        Pune  Residential        28.551250   
2024-01-04    Thursday  Winter      Nagpur   Industrial        30.765417   
2024-01-05      Friday  Winter      Nashik  Residential        29.030417   

            Humidity (%)  Rainfall (mm)  Cloud Cover (%)  Wind Speed (m/s)  \
Date                                                                         
2024-01-01          59.8            2.1             38.7          5.521667   
2024-01-02          61.3            8.8             35.2          5.666667   
2024-01-03          82.4            4.0             43.4          6.043750   
2024-01-04          70.1            3.4             26.9          6.080417   

In [5]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# --- 1. Handle NaN values ---
# We'll drop the rows with NaN values, which are at the beginning of the dataset
df.dropna(inplace=True)

# --- 2. Handle Categorical Features ---
# We'll use one-hot encoding to convert categorical features into numerical ones
categorical_features = ['Day of Week', 'Season', 'City', 'Load Type', 'Energy Consumption Trend']
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)


# --- 3. Split the Data ---
# We'll separate our features (X) from our target variable (y)
X = df_encoded.drop('Electric Load (MW)', axis=1)
y = df_encoded['Electric Load (MW)']

# We'll split the data into a training set and a testing set
# We use shuffle=False because we're working with time-series data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


# --- 4. Train the XGBoost Model ---
# We'll create and train our XGBoost regressor model
# n_estimators is the number of trees in the forest
# max_depth is how deep the trees can go
# learning_rate is a parameter that prevents overfitting
xgbr = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, max_depth=5, learning_rate=0.05)

xgbr.fit(X_train, y_train)


# --- 5. Evaluate the Model ---
# We'll make predictions on the test set
y_pred = xgbr.predict(X_test)

# We'll evaluate the model's performance
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"R² Score: {r2}")
print(f"Mean Squared Error: {mse}")

R² Score: 0.8438340862748777
Mean Squared Error: 1075.1098547388901


In [12]:
from sklearn.model_selection import GridSearchCV

# --- 1. Define the Hyperparameter Grid ---
# These are the settings we want to test
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [500, 1000, 1500],
    'colsample_bytree': [0.3, 0.7]
}

# --- 2. Set up GridSearchCV ---
# We'll create a new XGBoost model instance
xgbr_tuned = xgb.XGBRegressor(objective='reg:squarederror')

# Set up the Grid Search with 3-fold cross-validation
# cv=3 means the data is split into 3 parts for more robust evaluation
grid_search = GridSearchCV(estimator=xgbr_tuned,
                           param_grid=param_grid,
                           cv=3,
                           scoring='r2',
                           n_jobs=-1,
                           verbose=2)

# --- 3. Run the Grid Search ---
# This will start the process of finding the best parameters
grid_search.fit(X_train, y_train)

# --- 4. Get the Best Parameters ---
# Print out the best combination of settings found
print("Best parameters found: ", grid_search.best_params_)

# --- 5. Evaluate the Tuned Model ---
# Get the best model from the search
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred_tuned = best_model.predict(X_test)

# Evaluate the tuned model's performance
r2_tuned = r2_score(y_test, y_pred_tuned)
mse_tuned = mean_squared_error(y_test, y_pred_tuned)

print(f"\nTuned Model R² Score: {r2_tuned}")
print(f"Tuned Model Mean Squared Error: {mse_tuned}")

Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best parameters found:  {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 1000}

Tuned Model R² Score: 0.8311114575755825
Tuned Model Mean Squared Error: 1162.6976206381246


In [14]:
# Save the model to a file
best_model.save_model("best_xgb_model.json")

# We also need to save the columns of the training data
# This is important because the model expects the data to be in the same order
import joblib
joblib.dump(X_train.columns, 'training_columns.pkl')

print("Model and training columns saved successfully!")

Model and training columns saved successfully!


In [16]:
X_test.to_csv('X_test.csv')
y_test.to_csv('y_test.csv')