In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.1-py3-none-macosx_10_15_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.1-py3-none-macosx_10_15_x86_64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.1


In [3]:
# Import required libraries
import itertools
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit, train_test_split
from sklearn.preprocessing import StandardScaler
# Suppress every Python warning for the rest of the session, including ones
# emitted by the libraries imported above.
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator for reproducibility
np.random.seed(42)

In [5]:
# Read the merged dataset from disk
df = pd.read_csv("../../data/processed/final_merged_data.csv")

# Parse the 'time' column as datetimes and promote it to the index
df = df.assign(time=pd.to_datetime(df['time'])).set_index('time')

# Summarise what was loaded: shape, column names, then the first rows
print(f"Dataset shape: {df.shape}")
print("\nColumns in the dataset:", df.columns.tolist(), sep="\n")
print("\nSample of the data:")
df.head()

Dataset shape: (105192, 46)

Columns in the dataset:
['Fuel', 'MW', 'NAME', 'wind_speed', 'temperature_c', 'sky_coverage', 'precipitation_mm', 'ERCOT.LOAD_wind', 'ERCOT.WIND.GEN', 'Wind 1-hr MW change', 'Wind 1-hr % change', 'ERCOT.LOAD_solar', 'ERCOT.PVGR.GEN', 'Solar 1-hr MW change', 'Solar 1-hr % change', 'Unnamed: 0', 'datetime_col', 'ERCOT.LOAD', 'ERCOT.PVGR.GEN_solar', 'Total Solar Installed, MW', 'Solar Output, % of Load', 'Solar Output, % of Installed', 'Solar 1-hr MW change_solar', 'Solar 1-hr % change_solar', 'Daytime Hour', 'Ramping Daytime Hour', 'Timestamp', 'Biomass', 'Coal', 'Gas', 'Hydro', 'Nuclear', 'Datetime_hour', 'AGG LOAD SUMMARY', 'SUM TELEM GEN MW', 'SUM TELEM DCTIE MW', 'load_ramp', 'load_gen_ratio', 'load_minus_gen', 'tie_flow_pct', 'hour', 'day_of_week', 'is_peak_hour', '3hr_load_ma', '6hr_load_ma', '3hr_gen_ma']

Sample of the data:


Unnamed: 0_level_0,Fuel,MW,NAME,wind_speed,temperature_c,sky_coverage,precipitation_mm,ERCOT.LOAD_wind,ERCOT.WIND.GEN,Wind 1-hr MW change,...,load_ramp,load_gen_ratio,load_minus_gen,tie_flow_pct,hour,day_of_week,is_peak_hour,3hr_load_ma,6hr_load_ma,3hr_gen_ma
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01 01:00:00,Biomass,22.297569,"CORPUS CHRISTI INTERNATIONAL AIRPORT, TX US",1.0,24.4,99.0,,38124.261975,12067.479497,,...,-427.0129,0.999612,-17.8,0.000388,1.0,1.0,0.0,46206.865623,47140.703833,46224.23229
2022-01-01 01:00:00,Coal,10269.999131,"HOUSTON INTERCONTINENTAL AIRPORT, TX US",1.0,25.0,99.0,,37122.946803,12884.367833,816.888337,...,-650.30986,0.999613,-17.5,0.000387,1.0,1.0,0.0,45752.79717,46429.363925,45770.130503
2022-01-01 01:00:00,Gas,1116.244747,"MIDLAND INTERNATIONAL AIRPORT, TX US",1.0,11.7,99.0,0.0,35936.747949,14366.542968,1482.175134,...,-162.9137,0.99962,-17.1,0.00038,1.0,1.0,0.0,45339.385017,45963.987092,45356.851683
2022-01-01 01:00:00,Nuclear,5078.955677,"AUSTIN BERGSTROM INTERNATIONAL AIRPORT, TX US",1.0,25.6,99.0,,35132.555947,16463.459644,2096.916676,...,-1504.77453,0.999607,-17.1,0.000393,2.0,1.0,0.0,44566.718987,45386.792305,44583.95232
2022-01-01 02:00:00,Biomass,22.300089,"SAN ANTONIO INTERNATIONAL AIRPORT, TX US",1.0,25.6,99.0,,34602.74181,18337.533839,1874.074195,...,558.81113,0.999599,-17.7,0.000402,2.0,1.0,0.0,44197.093287,44974.945228,44214.393287


In [6]:
def prepare_data(df, target_col='Price ($/MWh)', test_size=0.2):
    """Split a frame into standardised train/test features and targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. The split is not shuffled, so the last rows become the
        test set; rows are assumed to already be in chronological order
        (TODO confirm the frame is sorted by its time index).
    target_col : str
        Name of the column to predict.
    test_size : float
        Forwarded to ``train_test_split``; size of the held-out tail.

    Returns
    -------
    X_train_scaled, X_test_scaled
        Feature matrices produced by a ``StandardScaler`` fitted on the
        training rows only.
    y_train, y_test
        The matching slices of ``df[target_col]``.
    scaler : StandardScaler
        The fitted scaler.
    feature_cols : list of str
        Names of the numeric feature columns, in the column order of the
        scaled matrices.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``df``.
    ValueError
        If ``df`` has no numeric column other than the target.
    """
    # Fail early with the list of available columns instead of a bare KeyError.
    if target_col not in df.columns:
        raise KeyError(
            f"Target column {target_col!r} not found in DataFrame; "
            f"available columns: {df.columns.tolist()}"
        )

    # Keep numeric predictors only: StandardScaler cannot scale text or
    # datetime columns, so those are excluded along with the target.
    feature_cols = (
        df.drop(columns=[target_col])
        .select_dtypes(include='number')
        .columns
        .tolist()
    )
    if not feature_cols:
        raise ValueError(
            f"No numeric feature columns available besides {target_col!r}"
        )

    # Split data into features and target
    X = df[feature_cols]
    y = df[target_col]

    # shuffle=False keeps row order, so the test set is the tail of the frame
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)

    # Fit the scaler on the training rows only, then apply it to both splits
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler, feature_cols

# Prepare data
# The target is named once here so it is easy to change. The check below fails
# with an actionable message when the loaded frame does not contain it.
TARGET_COL = 'Price ($/MWh)'
if TARGET_COL not in df.columns:
    raise KeyError(
        f"{TARGET_COL!r} is not a column of the loaded dataset, so there is no "
        "target to model. Merge the price series into the processed file or "
        "set TARGET_COL to the correct column before running the cells below."
    )

X_train, X_test, y_train, y_test, scaler, feature_cols = prepare_data(df, target_col=TARGET_COL)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

KeyError: 'Price ($/MWh)'

In [None]:
def evaluate_model(model, X_test, y_test, model_name):
    """Print MAE, RMSE and R² for a fitted model and return its predictions.

    ``model`` only needs a ``predict`` method; ``model_name`` is used as the
    heading of the printed report.
    """
    y_pred = model.predict(X_test)

    abs_err = mean_absolute_error(y_test, y_pred)
    # RMSE is the square root of the mean squared error
    root_sq_err = np.sqrt(mean_squared_error(y_test, y_pred))
    fit_quality = r2_score(y_test, y_pred)

    report_lines = [
        f"\n{model_name} Performance:",
        f"MAE: ${abs_err:.2f}/MWh",
        f"RMSE: ${root_sq_err:.2f}/MWh",
        f"R²: {fit_quality:.4f}",
    ]
    print("\n".join(report_lines))

    return y_pred

# Candidate regressors: each uses 200 trees/iterations and the same seed
models = {
    'XGBoost': xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=200,
        random_state=42,
    ),
    'LightGBM': lgb.LGBMRegressor(
        objective='regression',
        n_estimators=200,
        random_state=42,
    ),
    'CatBoost': CatBoostRegressor(
        iterations=200,
        verbose=0,
        random_state=42,
    ),
    'Random Forest': RandomForestRegressor(
        n_estimators=200,
        random_state=42,
    ),
}

# Fit every candidate on the training split and keep its test-set predictions
predictions = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    predictions[name] = evaluate_model(
        model=model,
        X_test=X_test,
        y_test=y_test,
        model_name=name,
    )

In [None]:
def plot_predictions(y_test, predictions):
    """Plot the actual target against every model's predictions.

    Parameters
    ----------
    y_test
        Object exposing ``.index`` (x values) and ``.values`` (actual y
        values), e.g. a pandas Series.
    predictions : dict
        Maps a model name to its predicted values; each entry is drawn
        against ``y_test.index`` and labelled with the model name.
    """
    plt.figure(figsize=(15, 8))
    plt.plot(y_test.index, y_test.values, label='Actual', color='black')

    # Cycle the palette: zipping against a fixed four-colour list would
    # silently drop the fifth and later models from the figure.
    palette = itertools.cycle(['blue', 'red', 'green', 'purple'])
    for (name, pred), color in zip(predictions.items(), palette):
        plt.plot(y_test.index, pred, label=name, alpha=0.7, color=color)

    plt.title('Electricity Price Forecast - Actual vs Predicted')
    plt.xlabel('Time')
    plt.ylabel('Price ($/MWh)')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

# Overlay the untuned models' test-set predictions on the actual values
plot_predictions(y_test=y_test, predictions=predictions)

In [None]:
def tune_hyperparameters(model, param_dist, X_train, y_train, model_name, n_iter=30, cv=None):
    """Tune a model with RandomizedSearchCV and return the best estimator.

    Parameters
    ----------
    model
        Estimator to tune.
    param_dist : dict
        Search space passed as ``param_distributions``.
    X_train, y_train
        Training data. With the default ``cv`` the rows are assumed to be in
        chronological order (TODO confirm against the caller's split).
    model_name : str
        Label used in the printed progress messages.
    n_iter : int, default 30
        Number of parameter settings sampled.
    cv : int, splitter or None, default None
        Cross-validation strategy. ``None`` uses ``TimeSeriesSplit(n_splits=3)``
        so every validation fold comes after the rows it was trained on.
        Pass ``cv=3`` to get plain 3-fold cross-validation instead.

    Returns
    -------
    The ``best_estimator_`` of the fitted search.
    """
    if cv is None:
        # Unshuffled K-fold would train on later rows and validate on earlier
        # ones; forward-chaining splits avoid that look-ahead.
        cv = TimeSeriesSplit(n_splits=3)

    print(f"\nTuning hyperparameters for {model_name}...")

    search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        scoring='neg_root_mean_squared_error',
        random_state=42,
        n_jobs=-1,
        verbose=1
    )

    search.fit(X_train, y_train)
    print(f"\nBest parameters for {model_name}:")
    print(search.best_params_)

    return search.best_estimator_

# Search space per model, keyed by the same names as `models`.
# Each value is a list of candidates for RandomizedSearchCV to sample from.
param_distributions = {
    'XGBoost': {
        'n_estimators': [100, 300, 500],
        'max_depth': [3, 5, 7, 10],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'gamma': [0, 0.1, 0.3],
        'reg_alpha': [0, 0.01, 0.1],
        'reg_lambda': [1, 1.5, 2]
    },
    'LightGBM': {
        'n_estimators': [100, 300, 500],
        'max_depth': [3, 5, 7, 10],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'num_leaves': [31, 63, 127],
        'subsample': [0.6, 0.8, 1.0],
        # LightGBM only applies row subsampling when the bagging frequency is
        # non-zero; without this entry the 'subsample' values have no effect.
        'subsample_freq': [1],
        'colsample_bytree': [0.6, 0.8, 1.0]
    },
    'CatBoost': {
        'iterations': [100, 300, 500],
        'depth': [3, 5, 7, 10],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'l2_leaf_reg': [1, 3, 5, 7],
        # 'subsample' is searched together with Bernoulli bootstrapping
        'bootstrap_type': ['Bernoulli'],
        'subsample': [0.6, 0.8, 1.0]
    },
    'Random Forest': {
        'n_estimators': [100, 300, 500],
        'max_depth': [3, 5, 7, 10, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    }
}

# Run the randomized search for every model using its own search space
tuned_models = {}
for name, model in models.items():
    tuned_models[name] = tune_hyperparameters(
        model=model,
        param_dist=param_distributions[name],
        X_train=X_train,
        y_train=y_train,
        model_name=name,
    )

# Score each tuned estimator on the test split and keep its predictions
tuned_predictions = {}
for name, model in tuned_models.items():
    tuned_predictions[name] = evaluate_model(
        model=model,
        X_test=X_test,
        y_test=y_test,
        model_name=f"Tuned {name}",
    )

# Overlay the tuned models' predictions on the actual values
plot_predictions(y_test=y_test, predictions=tuned_predictions)