# XGBoost

### Load data & split

In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# -----------------------------
# LOAD DATA
# -----------------------------
# The CSV location defaults to the original local path but can be overridden
# through the XGB_TRAINING_CSV environment variable, so the notebook can be
# re-run on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "XGB_TRAINING_CSV",
    r"D:\WiDS\Project_sun\excelData\trainingData.csv",
)
df = pd.read_csv(DATA_PATH)

# "PV" is the regression target; every other column is used as a feature.
X = df.drop(columns=["PV"])
y = df["PV"]

# -----------------------------
# TRAIN–VALIDATION SPLIT
# -----------------------------
# Hold out 30% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.30,
    random_state=42
)

print("XGB Training samples:", X_train.shape[0])
print("XGB Validation samples:", X_val.shape[0])


XGB Training samples: 779
XGB Validation samples: 335


### Train XGBoost

In [2]:
# Hyperparameters for the regressor, gathered in one mapping so they are easy
# to scan and adjust.
xgb_params = dict(
    objective="reg:squarederror",
    n_estimators=400,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

# Build the model and fit it on the training split only.
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# Persist the fitted model next to the notebook (path is relative to the
# current working directory).
joblib.dump(xgb_model, "xgb_model.pkl")
print("XGBoost model saved")

XGBoost model saved


### Validate & metrics

In [3]:
# Predict on the validation split and score the predictions.
xgb_pred = xgb_model.predict(X_val)

xgb_r2 = r2_score(y_val, xgb_pred)
xgb_mse = mean_squared_error(y_val, xgb_pred)
xgb_rmse = np.sqrt(xgb_mse)  # root of the MSE, i.e. RMSE

# Report both metrics rounded to three decimals.
print("XGBoost Results")
for label, value in (("R²   :", xgb_r2), ("RMSE :", xgb_rmse)):
    print(label, round(value, 3))


XGBoost Results
R²   : 0.84
RMSE : 0.737


### Feature importance

In [4]:
# Label each importance score from the fitted model with its feature column
# name, then order the features from most to least important.
importance_scores = xgb_model.feature_importances_
importance_by_feature = pd.Series(data=importance_scores, index=X.columns)
xgb_importance = importance_by_feature.sort_values(ascending=False)

print(xgb_importance)


GHI         0.329810
Cloud       0.249191
Temp        0.129720
Rainfall    0.103973
Settl       0.057300
Railways    0.051447
Road        0.023479
Aspect      0.020806
Slope       0.019036
LULC        0.015237
dtype: float32
