In [None]:
#Installing XGBoost
!pip install xgboost

In [None]:
#Importing all the needed libraries
import numpy as np
import pandas as pd
import xgboost as xg
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time

from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from openpyxl import load_workbook

In [None]:
#Loading the Excel workbook into a DataFrame
df = pd.read_excel('dataset_name.xlsx')

#Removing the unwanted columns; the result is kept as a separate DataFrame (df is left untouched)
df2 = df.drop(["Column 1", "Column 2"], axis="columns")

In [None]:
#Casting ID to string so the comparisons below match on text
df2["ID"] = df2["ID"].astype(str)

#Rows belonging to the three training IDs form the training data
is_train_id = df2["ID"].isin(["ID 1", "ID 2", "ID 3"])
df_train = df2.loc[is_train_id]

#Rows belonging to the fourth ID are held back as the unseen dataset
df_unseen = df2.loc[df2["ID"].eq("ID 4")]

In [None]:
#Columns removed from the model input; one shared list keeps the training and
#unseen feature sets identical
#NOTE(review): "Column 1" and "Column 2" are already dropped when df2 is created, so
#dropping them again raises KeyError if these names are literal - confirm the list
#NOTE(review): the target 'Column 4' and the string 'ID' column are not in this list,
#so they remain in X - confirm they are meant to be model inputs
non_feature_columns = ['Column 1', 'Column 2', 'Column 3']

#defining which columns to keep as model input
X = df_train.drop(columns = non_feature_columns)

#defining which columns are the model outputs
y = df_train['Column 4']

#defining inputs and outputs for unseen dataset
#(the original line was missing the closing bracket of the column list, a SyntaxError)
X_unseen = df_unseen.drop(columns = non_feature_columns)
y_unseen = df_unseen['Column 4']

#Setting up k-fold cross validation
k_fold = KFold(n_splits=10, random_state=66, shuffle=True)

In [None]:
#10-fold shuffled cross-validation with a fixed seed
k_fold = KFold(n_splits=10, shuffle=True, random_state=66)

#Pipeline: feature scaling followed by an XGBoost regressor
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('xg', xg.XGBRegressor(n_estimators=40, random_state=40)),
])

#The three metrics computed for every fold, in the order MAE, MSE, R²
fold_metrics = (mean_absolute_error, mean_squared_error, r2_score)

#Per-fold metric histories
mae_scores_train, mse_scores_train, r2_scores_train = [], [], []
mae_scores_test, mse_scores_test, r2_scores_test = [], [], []

for train_index, test_index in k_fold.split(X, y):
    #Slicing out the training and test rows for this fold
    #(these four names are read again by later cells, so they keep their names)
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    #A test fold with a single distinct target value is skipped
    if np.unique(y_test).size == 1:
        print("Skipping this fold due to only one class in test set.")
        continue

    #Fitting scaler + regressor on the training rows only
    pipeline.fit(X_train, y_train)

    #Predicting for both the training and the test rows
    pred_train = pipeline.predict(X_train)
    pred_test = pipeline.predict(X_test)

    #Computing MAE, MSE and R² for each side
    mae_train, mse_train, r2_train = (metric(y_train, pred_train) for metric in fold_metrics)
    mae_test, mse_test, r2_test = (metric(y_test, pred_test) for metric in fold_metrics)

    #Recording this fold's metrics in the matching history lists
    for history, value in zip(
        (mae_scores_train, mse_scores_train, r2_scores_train),
        (mae_train, mse_train, r2_train),
    ):
        history.append(value)
    for history, value in zip(
        (mae_scores_test, mse_scores_test, r2_scores_test),
        (mae_test, mse_test, r2_test),
    ):
        history.append(value)

    #Reporting this fold
    print(f"Train -> MAE: {mae_train:.6f}, MSE: {mse_train:.6f}, R²: {r2_train:.6f}")
    print(f"Test  -> MAE: {mae_test:.6f}, MSE: {mse_test:.6f}, R²: {r2_test:.6f}\n")

#Reporting the mean of every metric over the evaluated folds
print("\nAverage results for all folds:")
print(f"Train Mean MAE: {np.mean(mae_scores_train):.6f}, Test Mean MAE: {np.mean(mae_scores_test):.6f}")
print(f"Train Mean MSE: {np.mean(mse_scores_train):.6f}, Test Mean MSE: {np.mean(mse_scores_test):.6f}")
print(f"Train Mean R²: {np.mean(r2_scores_train):.6f}, Test Mean R²: {np.mean(r2_scores_test):.6f}")


In [None]:
#Defining a pipeline that includes feature scaling and an XGBoost regressor
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('xg', xg.XGBRegressor(random_state=66))
])

#Defining the hyperparameter grid for tuning the XGBoost model
#The grid holds 3*3*3*3*3*4*3*4*3 = 34,992 combinations; with 10-fold CV that is
#349,920 model fits, so expect a very long run time
parameters_grid = {
    'xg__n_estimators': [300, 500, 750],      #Number of boosting rounds (trees)
    'xg__max_depth': [3,4,5],                 #Maximum depth of each tree
    'xg__learning_rate': [0.05, 0.08, 0.1],   #Step size shrinkage used in updates
    'xg__subsample': [0.6, 0.8, 1],           #Fraction of training samples to use per tree
    'xg__colsample_bytree': [0.3, 0.4, 0.5],  #Fraction of features used in each tree
    'xg__gamma': [1, 2, 4, 5],                #Minimum loss reduction required to make a further split
    'xg__reg_alpha': [3,6, 9],                #L1 regularization term on weights
    'xg__reg_lambda': [10,15,20,27],          #L2 regularization term on weights
    'xg__min_child_weight': [5,10,15]         #Minimum sum of instance weight (hessian) needed in a child
}

#Making the train/test hold-out explicit instead of relying on loop variables left over
#from the cross-validation cell. This takes the last split produced by k_fold, which is
#the same split those leftover variables hold because k_fold uses a fixed integer seed
train_index, test_index = list(k_fold.split(X, y))[-1]
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

#Running GridSearchCV to perform hyperparameter tuning
CV_XGB = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters_grid,
    cv=k_fold,
    scoring='r2',
    n_jobs=-1                                 #Use all available CPU cores
)

#Fitting the search on the training part of the hold-out split only,
#so X_test / y_test stay untouched for the evaluation that follows
CV_XGB.fit(X_train, y_train)

#Printing the best combination of hyperparameters found during the search
print('Best parameters: ', CV_XGB.best_params_)

In [None]:
#Best pipeline found by the grid search
best_pipeline = CV_XGB.best_estimator_

#Best hyperparameters with the pipeline step prefix 'xg__' stripped from their names
best_params = {
    name.replace("xg__", ""): setting
    for name, setting in CV_XGB.best_params_.items()
}

#Predicting with the full pipeline on the training and test sets
y_train_pred, y_test_pred = best_pipeline.predict(X_train), best_pipeline.predict(X_test)

#MSE and R² on the training set
mse_train, r2_train = mean_squared_error(y_train, y_train_pred), r2_score(y_train, y_train_pred)

#MSE and R² on the test set
mse_test, r2_test = mean_squared_error(y_test, y_test_pred), r2_score(y_test, y_test_pred)

#Reporting the four metrics
print(f"Train Mean Squared Error: {mse_train:.4f}")
print(f"Train R-squared: {r2_train:.4f}")
print(f"Test Mean Squared Error: {mse_test:.4f}")
print(f"Test R-squared: {r2_test:.4f}")

In [None]:
#Scoring the unseen dataset with the best pipeline
y_unseen_pred = best_pipeline.predict(X_unseen)

#MSE and R² between the true and predicted unseen targets
mse_unseen, r2_unseen = (
    metric(y_unseen, y_unseen_pred) for metric in (mean_squared_error, r2_score)
)

#Reporting the unseen-data metrics
print(f'Unseen Mean Squared Error: {mse_unseen:.4f}')
print(f'Unseen R-squared: {r2_unseen:.4f}')


In [None]:
#Defining a function to log model evaluation metrics and hyperparameters to an Excel-file of chosen name
def log_results(model_name, params, mse_train, r2_train, mse_test, r2_test, mse_unseen, r2_unseen, filename="Results.xlsx"):
    """Append one row of hyperparameters and metrics to a per-model sheet of an Excel file.

    The row is written to the sheet named ``model_name`` inside ``filename``. If the
    file does not exist it is created; if the sheet already exists the new row is
    added below the rows already stored there.

    Parameters
    ----------
    model_name : str
        Name of the sheet the row is written to.
    params : dict
        Hyperparameter names and values; each key becomes a column.
    mse_train, r2_train, mse_test, r2_test, mse_unseen, r2_unseen : float
        Metrics stored in the columns ``MSE_Train``, ``R2_Train``, ``MSE_Test``,
        ``R2_Test``, ``MSE_Unseen`` and ``R2_Unseen``.
    filename : str, optional
        Path of the Excel workbook (default ``"Results.xlsx"``).

    Returns
    -------
    None
        A confirmation is printed only when the row was actually written. If the
        file cannot be written (PermissionError, e.g. the workbook is open in
        Excel) a warning is printed instead and nothing is logged.
    """

    #Creating DataFrame for this model run
    result = pd.DataFrame([{
        **params,
        "MSE_Train": mse_train,
        "R2_Train": r2_train,
        "MSE_Test": mse_test,
        "R2_Test": r2_test,
        "MSE_Unseen": mse_unseen,
        "R2_Unseen": r2_unseen
    }])

    #Introducing a short delay to avoid file conflicts if running in multiple notebooks
    #(best effort only - this is not a real lock)
    time.sleep(1)

    try:
        if os.path.exists(filename):
            #Reading the existing sheet BEFORE opening the writer, so the file is
            #not being read while it is already open for appending
            try:
                existing_df = pd.read_excel(filename, sheet_name=model_name, engine="openpyxl")
                df_combined = pd.concat([existing_df, result], ignore_index=True)
            except ValueError:
                df_combined = result  #If sheet does not exist, creating it

            #Writing the combined rows back from the top of the sheet
            with pd.ExcelWriter(filename, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
                df_combined.to_excel(writer, sheet_name=model_name, index=False)
        else:
            #If file doesn't exist, creating a new one
            with pd.ExcelWriter(filename, engine="openpyxl", mode="w") as writer:
                result.to_excel(writer, sheet_name=model_name, index=False)
    except PermissionError:
        #Warning the user and stopping here, so no false confirmation is printed
        print(f"Error: Close the Excel file ({filename}) before running the script again.")
        return

    #Printing a confirmation to ensure user results are logged
    print(f"Logged results for {model_name}: Train MSE={mse_train:.4f}, Test MSE={mse_test:.4f}, Unseen MSE={mse_unseen:.4f}")


In [None]:
#Collecting the XGBoost metrics under the keyword names log_results expects
xgb_metrics = {
    "mse_train": mse_train,
    "r2_train": r2_train,
    "mse_test": mse_test,
    "r2_test": r2_test,
    "mse_unseen": mse_unseen,
    "r2_unseen": r2_unseen,
}

#Logging the best hyperparameters together with the metrics on the "XGBoost" sheet
log_results(model_name="XGBoost", params=best_params, **xgb_metrics)