## XGBoost (Extreme Gradient Boosting) Model

### Library

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from itertools import product
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from itertools import product


### Data Loading

In [None]:
# Read the dataset CSV from its Google Drive download link into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='https://drive.google.com/uc?id=1sbFAHLYmyOBhHZ8yHqct2yhViRRV9r3u',
)

In [None]:
# Preview the first five rows of the raw frame (last expression, so the notebook displays it).
df.head(n=5)

In [None]:
# Build the modelling frame by removing the 'TimeStamp' and 'WindSpeed_Bin' columns.
df_XGB = df.drop(['TimeStamp', 'WindSpeed_Bin'], axis='columns')

In [None]:
# Preview the first five rows of the modelling frame.
df_XGB.head(n=5)

### Model

In [None]:

class XGBoostRegressorModel:
    def __init__(self, params=None, num_boost_round=1000, early_stopping_rounds=10):
        # num_boost_round: max number of boosting iterations, early stopping: stop if validation error doesn't improve this many rounds
        self.params = {
            'objective': 'reg:squarederror',
            'eta': 0.05,               # learning_rate
            'max_depth': 6,             # max tree depth
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'seed': 42,
            'verbosity': 0
        }
        if params:
            self.params.update(params)  # allows to override defaults

        self.num_boost_round = num_boost_round
        self.early_stopping_rounds = early_stopping_rounds
        self.bst = None  # the trained Booster

    def _evaluate(self, y_true, y_pred):
        eps = 1e-2
        mask = y_true > eps

        if np.sum(mask) == 0:
            print("[Warning] No valid values after masking low y_true values.")
            return {}

        y_true = y_true[mask]
        y_pred = y_pred[mask]
        pct = (y_pred - y_true) / y_true

        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mean_y = np.mean(y_true)
        rmsep_approx = (rmse /mean_y) * 100
        r2 = r2_score(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        smape = np.mean(2.0 * np.abs(y_true - y_pred) / (np.abs(y_true) + np.abs(y_pred) + eps)) * 100

        return {
            'RMSE':  rmse,
            'RMSEP_approx (%)': rmsep_approx,
            'R2':    r2,
            'MAE':   mae,
            'SMAPE (%)': smape
        }


    def run(self, df, target_column='ActivePower_Mean'):

        # 1. Split features/target
        X = df.drop(columns=[target_column])
        y = df[target_column].values

        # 2. Train/val/test 70/15/15
        X_train, X_temp, y_train, y_temp = train_test_split(
            X, y, test_size=0.30, random_state=42)
        X_val, X_test, y_val, y_test = train_test_split(
            X_temp, y_temp, test_size=0.50, random_state=42)

        # 3. Build DMatrix objects, convert to DMatrix
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval   = xgb.DMatrix(X_val,   label=y_val)
        dtest  = xgb.DMatrix(X_test,  label=y_test)

        # 4. Train with early stopping on validation set, applies validation when validation RMSE plateaus
        evallist = [(dtrain, 'train'), (dval, 'validation')]
        self.bst = xgb.train(
            self.params,
            dtrain,
            num_boost_round=self.num_boost_round,
            evals=evallist,
            early_stopping_rounds=self.early_stopping_rounds,
            verbose_eval=False
        )

        # 5. Predict
        y_train_pred = self.bst.predict(dtrain)
        y_val_pred   = self.bst.predict(dval)
        y_test_pred  = self.bst.predict(dtest)

        # 6. Evaluate
        train_metrics = self._evaluate(y_train, y_train_pred)
        val_metrics   = self._evaluate(y_val,   y_val_pred)
        test_metrics  = self._evaluate(y_test,  y_test_pred)

        # 7. Report
        print("Training Metrics:   ", train_metrics)
        print("Validation Metrics: ", val_metrics)
        print("Test Metrics:       ", test_metrics)

        return {
            'train': train_metrics,
            'val':   val_metrics,
            'test':  test_metrics
        }

    def grid_search(self, df, target_column: str = 'ActivePower_Mean', param_grid: dict | None = None, primary_metric: str = 'RMSE') -> tuple[dict, list]:

      # 1) Prepare data
      X = df.drop(columns=[target_column])
      y = df[target_column].values

      X_train, X_temp, y_train, y_temp = train_test_split(
          X, y, test_size=0.30, random_state=42
      )
      X_val, _, y_val, _ = train_test_split(
          X_temp, y_temp, test_size=0.50, random_state=42
      )

      # 2) Convert to DMatrix
      dtrain = xgb.DMatrix(X_train, label=y_train)
      dval   = xgb.DMatrix(X_val,   label=y_val)

      # 3) Prepare to collect results
      all_results: list[dict] = []

      # 4) Iterate over every combination of parameters
      keys, values = zip(*param_grid.items())
      for combo in product(*values):
          # Build a dict for this combination
          trial_params = dict(zip(keys, combo))

          # 4a) Override just those in self.params
          original_params = self.params.copy()
          self.params.update(trial_params)

          # 4b) Train
          bst = xgb.train(
              params=self.params,
              dtrain=dtrain,
              num_boost_round=self.num_boost_round,
              evals=[(dtrain, 'train')],
              early_stopping_rounds=self.early_stopping_rounds,
              verbose_eval=False
          )

          # 4c) Predict on validation set
          y_val_pred = bst.predict(dval)

          # 4d) Evaluate metrics
          metrics = self._evaluate(y_val, y_val_pred)

          # Store trial result
          all_results.append({
              'params': trial_params.copy(),
              'metrics': metrics
          })

          # Restore original params
          self.params = original_params

      # 5) Find best by primary_metric (lowest value)
      best = min(all_results, key=lambda r: r['metrics'][primary_metric])

      print(f"Best params by {primary_metric}: {best['params']}")
      print(f"{primary_metric} = {best['metrics'][primary_metric]:.4f}")

      return best, all_results




### Application (small subset)

In [None]:
# Smoke test of the whole pipeline on a 10% sample (fixed seed, so the sample is repeatable).
df_small = df_XGB.sample(frac=0.1, random_state=42)

# One candidate per parameter, so the grid search trains exactly one model.
param_grid = dict(
    eta=[0.1],
    max_depth=[3],
    subsample=[0.8],
    colsample_bytree=[0.8],
    reg_alpha=[0.0],
    reg_lambda=[0.5],
)

# At most 50 boosting rounds, with an early-stopping patience of 5 rounds.
xgb_model = XGBoostRegressorModel(
    params={'verbosity': 0},
    num_boost_round=50,
    early_stopping_rounds=5,
)

# Search the (single-point) grid on the sample, ranking trials by RMSE.
best, all_results = xgb_model.grid_search(
    df_small,
    target_column='ActivePower_Mean',
    param_grid=param_grid,
    primary_metric='RMSE',
)

# Adopt the winning overrides, then do a full train/val/test run on the same sample.
xgb_model.params |= best['params']
final_results = xgb_model.run(df_small, target_column='ActivePower_Mean')

print("Quick‐test results:", final_results)


In [None]:
# Grid-search results table: one row per trial, hyper-parameters followed by metrics.
rows = [trial['params'] | trial['metrics'] for trial in all_results]
df_results = pd.DataFrame(rows)

# Order trials by ascending RMSE (best first) and renumber the rows from 0.
df_results = df_results.sort_values(by='RMSE', ignore_index=True)
df_results

### Application (entire dataset)

In [None]:
# Hyper-parameter grid: three candidates for each of six parameters (3**6 = 729 combinations).
param_grid = dict(
    eta=[0.05, 0.1, 0.2],
    max_depth=[4, 5, 6],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
    reg_alpha=[0.000001, 0.0001, 1.0],
    reg_lambda=[0.5, 10, 20],
)

# Fresh model: up to 500 boosting rounds, early-stopping patience of 20 rounds.
xgb_model = XGBoostRegressorModel(
    params={'verbosity': 0},
    num_boost_round=500,
    early_stopping_rounds=20,
)

# Score every combination on the validation split and keep the lowest RMSE.
# primary_metric must be a key of the metrics dict returned by _evaluate
# (e.g. 'RMSE', 'MAE', 'SMAPE (%)'); the search minimises it, so 'R2' is unsuitable.
best, all_results = xgb_model.grid_search(
    df_XGB,
    target_column='ActivePower_Mean',
    param_grid=param_grid,
    primary_metric='RMSE',
)

# Apply the winning overrides and run the full train/val/test evaluation.
xgb_model.params |= best['params']
final_results = xgb_model.run(df_XGB, target_column='ActivePower_Mean')


In [None]:
# Grid-search results table: one row per trial, hyper-parameters followed by metrics.
rows = [trial['params'] | trial['metrics'] for trial in all_results]
df_results = pd.DataFrame(rows)

# Order trials by ascending RMSE (best first) and renumber the rows from 0.
df_results = df_results.sort_values(by='RMSE', ignore_index=True)
df_results

In [None]:
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt

# Regularisation sensitivity study: sweep reg_alpha and reg_lambda one at a
# time and plot the best 5-fold cross-validated RMSE for each value.

# Prepare data: numeric features only
X = df_XGB.drop(columns=['ActivePower_Mean']).select_dtypes(include=[np.number])
y = df_XGB['ActivePower_Mean'].values
dtrain = xgb.DMatrix(X, label=y)


def _best_cv_rmse(reg_alpha, reg_lambda):
    """Return the lowest mean test RMSE over boosting rounds of a 5-fold CV.

    All parameters other than the two regularisation strengths are held fixed,
    so both sweeps below are evaluated under identical settings.
    """
    params = {
        'objective': 'reg:squarederror',
        'eta': 0.2,
        'max_depth': 5,
        'subsample': 0.8,
        'colsample_bytree': 0.7,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'verbosity': 0
    }
    cv = xgb.cv(
        params,
        dtrain,
        num_boost_round=500,
        nfold=5,
        metrics=('rmse',),
        early_stopping_rounds=20,
        as_pandas=True,
        seed=42,          # same folds for every candidate value
        verbose_eval=False
    )
    return cv['test-rmse-mean'].min()


# 1) Sweep reg_alpha (10 log-spaced values, 1e-6 .. 1) with reg_lambda fixed at 1.0
alphas = np.logspace(-6, 0, 10)
rmse_alpha = [_best_cv_rmse(alpha, 1.0) for alpha in alphas]

# 2) Sweep reg_lambda (10 log-spaced values, 1e-3 .. 1e2) with reg_alpha fixed at 1e-4
lambdas = np.logspace(-3, 2, 10)
rmse_lambda = [_best_cv_rmse(1e-4, lam) for lam in lambdas]

# 3) Plot results: one log-x panel per swept parameter
panels = [
    (alphas, rmse_alpha, 'CV RMSE vs. reg_alpha', 'reg_alpha (L1 penalty)'),
    (lambdas, rmse_lambda, 'CV RMSE vs. reg_lambda', 'reg_lambda (L2 penalty)'),
]

plt.figure(figsize=(12, 5))

for position, (strengths, scores, title, xlabel) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.semilogx(strengths, scores, marker='o')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Validation RMSE')
    plt.grid(True)

plt.tight_layout()
plt.show()
