In [4]:
# Filename: xgboost_training.ipynb

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score, KFold

# Load the preprocessed data
# squeeze("columns") turns a single-column target frame into a Series but,
# unlike a bare squeeze(), never collapses the row axis, so a one-row file
# still yields a Series instead of a scalar.
X_train = pd.read_csv(r"../preprocessing/X_train.csv")
X_test = pd.read_csv(r"../preprocessing/X_test.csv")
y_train = pd.read_csv(r"../preprocessing/y_train.csv").squeeze("columns")
y_test = pd.read_csv(r"../preprocessing/y_test.csv").squeeze("columns")

# Fail early if features and targets do not line up row for row.
if len(X_train) != len(y_train) or len(X_test) != len(y_test):
    raise ValueError(
        "Feature/target row counts differ: "
        f"train {len(X_train)} vs {len(y_train)}, "
        f"test {len(X_test)} vs {len(y_test)}"
    )

# Ensure all features are numeric by dropping non-numeric columns
X_train = X_train.select_dtypes(include=[np.number])
X_test = X_test.select_dtypes(include=[np.number])

# The dtype filter runs on each split independently, so a column that parsed
# as numeric in one file but not in the other would leave the two frames with
# different feature sets. Surface that here, then put the test columns in the
# training order so both frames present identical features to the model.
column_diff = X_train.columns.symmetric_difference(X_test.columns)
if len(column_diff) > 0:
    raise ValueError(
        "Numeric feature columns differ between train and test: "
        f"{list(column_diff)}"
    )
X_test = X_test[X_train.columns]

# 1. Train the XGBoost Model without Hyperparameter Tuning
# Only the objective and the seed are passed explicitly; no other
# hyperparameter is set here.
baseline_params = {
    "objective": "reg:squarederror",
    "random_state": 42,
}
xgboost_model = xgb.XGBRegressor(**baseline_params)
xgboost_model.fit(X_train, y_train)

# 2. Evaluate the Model
# Predict on the test features and compare against y_test.
y_pred = xgboost_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)  # back on the scale of the target
r2 = r2_score(y_test, y_pred)

# Report in a fixed order: R2, then RMSE, then MAE.
test_metrics = {"R2 Score": r2, "RMSE": rmse, "MAE": mae}
for metric_label, metric_value in test_metrics.items():
    print(f"{metric_label}: {metric_value}")

# 10-fold shuffled cross-validation on the training split only.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
cv_results = cross_val_score(
    xgboost_model,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=kfold,
)

# The scorer reports negated MSE per fold, hence the sign flip. The value
# printed is the square root of the mean fold MSE, not the mean of the
# per-fold RMSEs.
mean_fold_mse = -np.mean(cv_results)
cv_rmse = np.sqrt(mean_fold_mse)

print(f"Cross-validated RMSE: {cv_rmse}")


R2 Score: 0.9958227265687337
RMSE: 0.5982930045856326
MAE: 0.3703226106166841
Cross-validated RMSE: 0.6774603451144942
