# Advanced Solar Power Forecasting
This notebook trains and evaluates a hybrid ensemble model that combines **LightGBM** and **XGBoost**, tuned with both grid search and Bayesian optimization, to predict solar power generation. Note: only part of the code is publicly available.
The goal is to achieve high accuracy using only weather and location-based features — without solar irradiance data.

In [None]:
!pip install xgboost lightgbm scikit-optimize

import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error, explained_variance_score
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer
from itertools import combinations
import numpy as np 

In [None]:
# Prepare the dataset
# 'df' (the main DataFrame) must already be loaded; 'PolyPwr' is the target column.
y = df['PolyPwr']
X = df.drop(columns=['PolyPwr'])

# Two-stage split: hold out 15% of the rows for testing, then carve the
# validation set out of the remainder (0.176 of the remaining 85% ≈ 15% overall).
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.176, random_state=42
)

# 'Location' is not used as a model input, so strip it from every split.
# (X itself keeps the column.)
X_train, X_val, X_test = (
    split.drop(columns=['Location']) for split in (X_train, X_val, X_test)
)

In [None]:
# Baseline models: LightGBM and XGBoost with default hyperparameters,
# both fitted on the training split only.
lightgbm_model = LGBMRegressor(random_state=42)
lightgbm_model.fit(X_train, y_train)

xgboost_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgboost_model.fit(X_train, y_train)

# LightGBM predictions (validation, then test)
val_predictions_lgbm = lightgbm_model.predict(X_val)
test_predictions_lgbm = lightgbm_model.predict(X_test)

# XGBoost predictions (validation, then test)
val_predictions_xgb = xgboost_model.predict(X_val)
test_predictions_xgb = xgboost_model.predict(X_test)

In [None]:
# Ensemble meta-model (stacking)
# The base models' validation predictions are the two input columns of a
# linear-regression meta-model; the test split is used for scoring only.
stacked_val_predictions = np.column_stack(
    [val_predictions_lgbm, val_predictions_xgb]
)
meta_model = LinearRegression()
meta_model.fit(stacked_val_predictions, y_val)

# Same column layout for the test split, then predict with the meta-model
stacked_test_predictions = np.column_stack(
    [test_predictions_lgbm, test_predictions_xgb]
)
final_predictions = meta_model.predict(stacked_test_predictions)

# Report test-set mean squared error
mse = mean_squared_error(y_test, final_predictions)
print(f"Stacked Ensemble Model MSE: {mse}")

In [None]:
# Hyperparameter tuning for LightGBM (exhaustive grid search, 3-fold CV)
# NOTE(review): a tree capped at max_depth=d has at most 2**d leaves, so for
# max_depth=3 the three num_leaves values probably describe the same model —
# consider pruning the grid to avoid redundant fits.
param_grid_lgbm = dict(
    num_leaves=[31, 50, 70],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
)

lgbm = LGBMRegressor(random_state=42)
grid_search_lgbm = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid_lgbm,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search_lgbm.fit(X_train, y_train)

# best_score_ is the negated mean CV MSE: flip the sign, then take the root
print("Best LightGBM parameters:", grid_search_lgbm.best_params_)
print("Best RMSE:", (-grid_search_lgbm.best_score_) ** 0.5)

In [None]:
# Hyperparameter tuning for XGBoost (exhaustive grid search, 3-fold CV)
param_grid_xgb = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 200, 300],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
)

xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
grid_search_xgb = GridSearchCV(
    estimator=xgb_reg,
    param_grid=param_grid_xgb,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search_xgb.fit(X_train, y_train)

# best_score_ is the negated mean CV MSE: flip the sign, then take the root
print("Best XGBoost parameters:", grid_search_xgb.best_params_)
print("Best RMSE:", (-grid_search_xgb.best_score_) ** 0.5)

In [None]:
# Optimized models and evaluation
# The hyperparameters are written out literally here rather than read from
# the grid-search objects, so this cell does not require the searches to run.
lightgbm_model_opt = LGBMRegressor(
    num_leaves=31,
    max_depth=7,
    learning_rate=0.05,
    n_estimators=300,
    random_state=42,
)
lightgbm_model_opt.fit(X_train, y_train)

xgboost_model_opt = xgb.XGBRegressor(
    max_depth=7,
    learning_rate=0.05,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=1.0,
    random_state=42,
)
xgboost_model_opt.fit(X_train, y_train)

# Unweighted average of the two models' test-set predictions
preds_lgbm = lightgbm_model_opt.predict(X_test)
preds_xgb = xgboost_model_opt.predict(X_test)
ensemble_preds = (preds_lgbm + preds_xgb) / 2

# Score the averaged ensemble on the test split
mse = mean_squared_error(y_test, ensemble_preds)
r2 = r2_score(y_test, ensemble_preds)
print(f"Optimized Ensemble MSE: {mse}")
print(f"Optimized Ensemble R²: {r2}")

In [None]:
# Cross-Validation (Model Stability)
# TimeSeriesSplit always trains on earlier rows and validates on later ones,
# so it assumes the rows of X are in chronological order — TODO confirm.
tscv = TimeSeriesSplit(n_splits=5)
r2_scorer = make_scorer(r2_score)

# Cross-validate on the same feature set the models were trained on: 'Location'
# is removed from the train/val/test splits, so it must not be fed in here.
# errors='ignore' keeps the cell working if X no longer carries that column.
X_cv = X.drop(columns=['Location'], errors='ignore')

# Cross-validation for LightGBM
lgbm_cv_r2 = cross_val_score(lightgbm_model_opt, X_cv, y, cv=tscv, scoring=r2_scorer)
print(f"LightGBM CV R² Scores: {lgbm_cv_r2}")
print(f"Average: {np.mean(lgbm_cv_r2):.4f}")

# Cross-validation for XGBoost (same splitter and scorer as above)
xgb_cv_r2 = cross_val_score(xgboost_model_opt, X_cv, y, cv=tscv, scoring=r2_scorer)
print(f"XGBoost CV R² Scores: {xgb_cv_r2}")
print(f"Average: {np.mean(xgb_cv_r2):.4f}")

In [None]:
# Feature Engineering (Polynomial + Interactions)
# Columns that receive pairwise-interaction and squared terms
feature_names = ['AmbientTemp', 'Humidity']

def add_interaction_terms(df, feature_names):
    """Return a copy of ``df`` extended with interaction and squared columns.

    For every unordered pair ``(a, b)`` drawn from ``feature_names`` a column
    named ``"axb"`` holding ``df[a] * df[b]`` is added; afterwards a column
    named ``"<name>^2"`` holding ``df[name] ** 2`` is added for each feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``feature_names``.
    feature_names : iterable of str
        Names of the columns to combine.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns appended. The input frame is left
        untouched, so re-running the calling cell cannot corrupt earlier state.

    Raises
    ------
    KeyError
        If a name in ``feature_names`` is not a column of ``df``.
    """
    # Materialize once: the names are walked twice and may be a one-shot iterable.
    names = list(feature_names)
    # Work on a copy so the caller's frame is not mutated as a side effect.
    result = df.copy()
    for left, right in combinations(names, 2):
        result[f"{left}x{right}"] = result[left] * result[right]
    for name in names:
        result[f"{name}^2"] = result[name] ** 2
    return result

# NOTE(review): X, X_train, X_val and X_test were derived from df in an earlier
# cell, so the columns added here only reach the models if those splits are
# rebuilt after this line runs.
df = add_interaction_terms(df, feature_names)

In [None]:
# Bayesian Optimization (LightGBM)
from skopt import BayesSearchCV
from skopt.space import Real, Integer

# Search space: learning rate is sampled on a log scale, the rest uniformly.
search_spaces = {
    'learning_rate': Real(0.01, 0.3, 'log-uniform'),
    'n_estimators': Integer(100, 1000),
    'max_depth': Integer(3, 10),
    'num_leaves': Integer(20, 150),
    'colsample_bytree': Real(0.6, 1.0, 'uniform'),
    'subsample': Real(0.6, 1.0, 'uniform'),
}

# LightGBM only applies row subsampling when subsample_freq > 0; with the
# default of 0 the 'subsample' dimension above would have no effect at all.
lgbm = LGBMRegressor(random_state=42, subsample_freq=1)
# 'neg_mean_squared_error' is the built-in equivalent of
# make_scorer(mean_squared_error, greater_is_better=False) and matches the
# scoring used by the grid-search cells.
opt = BayesSearchCV(lgbm, search_spaces, scoring='neg_mean_squared_error',
                    n_iter=32, cv=3, n_jobs=-1, random_state=42)
opt.fit(X_train, y_train)

# best_score_ is the negated mean CV MSE, so flip the sign for reporting
print("Best parameters:", opt.best_params_)
print("Best MSE:", -opt.best_score_)

In [None]:
# Final Ensemble Meta-Model Evaluation
# Final stacking meta-model using optimized LightGBM & XGBoost.
# The meta-model is fitted on validation-split predictions (rows the base
# models were not trained on) and scored on the test split. Fitting and scoring
# it on the same test rows would leak the test labels and inflate every metric.
val_predictions_lgbm_opt = lightgbm_model_opt.predict(X_val)
val_predictions_xgb_opt = xgboost_model_opt.predict(X_val)
stacked_val_predictions_opt = np.column_stack(
    (val_predictions_lgbm_opt, val_predictions_xgb_opt)
)

meta_model = LinearRegression()
meta_model.fit(stacked_val_predictions_opt, y_val)

# Test-split predictions, stacked in the same column order as above
predictions_lgbm = lightgbm_model_opt.predict(X_test)
predictions_xgb = xgboost_model_opt.predict(X_test)
stacked_predictions = np.column_stack((predictions_lgbm, predictions_xgb))
final_predictions = meta_model.predict(stacked_predictions)

# Metrics (all computed on the held-out test split)
meta_mse = mean_squared_error(y_test, final_predictions)
meta_r2 = r2_score(y_test, final_predictions)
mae = mean_absolute_error(y_test, final_predictions)
# Derive RMSE from the MSE: the squared=False argument of mean_squared_error
# is deprecated and absent from recent scikit-learn releases.
rmse = meta_mse ** 0.5
explained_var = explained_variance_score(y_test, final_predictions)

print(f"MSE: {meta_mse}")
print(f"R²: {meta_r2}")
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"Explained Variance: {explained_var}")