# XGBoost

### Load data & split (bilinear resampling methodology)

In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# -----------------------------
# CONFIGURATION
# -----------------------------
# Path to the training CSV. The original machine-specific location is kept as
# the default so existing runs behave the same; set the TRAINING_DATA_CSV
# environment variable to run this notebook elsewhere without editing the cell.
DATA_CSV = os.environ.get(
    "TRAINING_DATA_CSV",
    r"D:\WiDS\Project_sun\excelData\trainingDataBilinear.csv",
)
TARGET_COLUMN = "pv_out"   # column the regressor is trained to predict
VAL_FRACTION = 0.30        # share of rows held out for validation
SPLIT_SEED = 42            # fixed seed so the split is reproducible

# -----------------------------
# LOAD DATA
# -----------------------------
df = pd.read_csv(DATA_CSV)

# Fail early with a readable message instead of the bare KeyError that
# df.drop would raise if the target column is absent.
if TARGET_COLUMN not in df.columns:
    raise KeyError(
        f"Target column {TARGET_COLUMN!r} not found in {DATA_CSV!r}; "
        f"available columns: {list(df.columns)}"
    )

# Every column except the target is used as a feature.
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

# -----------------------------
# TRAIN–VALIDATION SPLIT
# -----------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED
)

print("XGB Training samples:", X_train.shape[0])
print("XGB Validation samples:", X_val.shape[0])


XGB Training samples: 779
XGB Validation samples: 335


### Train XGBoost

In [2]:
# Hyperparameters for the regressor, gathered in one place so they are easy
# to read and tweak before the model is constructed.
xgb_params = {
    "n_estimators": 400,
    "max_depth": 5,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "reg:squarederror",
    "random_state": 42,
}

# Build the model and fit it on the training split only.
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# Persist the fitted estimator to the current working directory.
joblib.dump(xgb_model, "xgb_model_bilinear.pkl")
print("XGBoost model saved")

XGBoost model saved


### Validate & metrics

In [4]:
# Predict on the held-out validation features.
xgb_pred = xgb_model.predict(X_val)

# R² and RMSE (square root of the mean squared error) against the true targets.
xgb_r2 = r2_score(y_val, xgb_pred)
xgb_mse = mean_squared_error(y_val, xgb_pred)
xgb_rmse = np.sqrt(xgb_mse)

# Report both metrics rounded to three decimals.
print("XGBoost Results")
for metric_label, metric_value in (("R²   :", xgb_r2), ("RMSE :", xgb_rmse)):
    print(metric_label, round(metric_value, 3))


XGBoost Results
R²   : 0.849
RMSE : 0.716


### Feature importance

In [5]:
# Label the model's feature_importances_ with the feature column names,
# then order them from largest to smallest score.
importance_scores = xgb_model.feature_importances_
xgb_importance = pd.Series(importance_scores, index=X.columns)
xgb_importance = xgb_importance.sort_values(ascending=False)

print(xgb_importance)


norm_GHI      0.330663
norm_Cloud    0.227347
norm_Tempe    0.155017
norm_Rainf    0.115145
norm_Railw    0.045526
norm_Settl    0.044672
norm_Aspec    0.024890
norm_Road_    0.019936
norm_LULC     0.018632
norm_Slope    0.018171
dtype: float32
