In [None]:
#Installing XGBoost
!pip install xgboost

In [None]:
#Importing necessary libraries
import numpy as np
import pandas as pd
import xgboost as xg
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time

from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,  precision_score, recall_score, f1_score
from xgboost import XGBRegressor
from sklearn.utils.class_weight import compute_sample_weight
from openpyxl import load_workbook

In [None]:
#Reading the Excel document into a df
#NOTE: the keyword must be the lowercase `sheet_name`; pandas rejects `Sheet_name` with a TypeError
df = pd.read_excel('dataset_name.xlsx', sheet_name='Sheet_name')

#Choosing which columns to keep and saving them in a new df
#(.copy() gives df2 its own data, so columns added to df2 later do not touch df)
columns_to_keep = ['Column 1', 'Column 2', 'Column 3', 'Column 4', 'Column 5', '...']
df2 = df[columns_to_keep].copy()

In [None]:
#Splitting df into two based on hole ID
#The hole ID is stored as text so it can be compared against the string IDs listed below
df2.loc[:, 'Hole_id_stat_str'] = df2['Hole_id_stat'].astype(str)

#Hole IDs that are held out of training and only used as unseen data
columns_unseen = ['209']

#Boolean mask computed once: True for rows that belong to a held-out hole
is_unseen = df2['Hole_id_stat_str'].isin(columns_unseen)
df_train = df2[~is_unseen]
df_unseen = df2[is_unseen]


In [None]:
#Defining which column is the model output
target_column = 'Column 4'

#Columns that must not be used as model input:
# - the columns that were already excluded,
# - the target itself (leaving it in X would leak the answer into the features),
# - the helper hole-ID string column, which only exists to split train/unseen data
columns_to_drop = ['Column 1', 'Column 2', 'Column 3', target_column, 'Hole_id_stat_str']

#Defining inputs and outputs for the training dataset
X = df_train.drop(columns=columns_to_drop)
y = df_train[target_column]

#Defining inputs and outputs for the unseen dataset (same columns as for training)
X_unseen = df_unseen.drop(columns=columns_to_drop)
y_unseen = df_unseen[target_column]

In [None]:
#Fitting a label encoder on the targets before splitting, so every class label becomes an integer
le = LabelEncoder()
y_fixed = le.fit_transform(y)
print("Encoded Classes:", le.classes_)

#Five shuffled folds with a fixed seed, so the split is the same on every run
k_fold = KFold(n_splits=5, shuffle=True, random_state=66)

#One list per split component; entry i holds the data of fold i
X_train_list = []
X_test_list = []
y_train_list = []
y_test_list = []

#Walking through the folds and storing each train/test split
for fold_num, (train_idx, test_idx) in enumerate(k_fold.split(X, y_fixed)):
    print(f"\nProcessing Fold {fold_num + 1}")

    #Row positions select the feature rows; the same positions index the encoded targets
    X_train = X.iloc[train_idx].copy()
    X_test = X.iloc[test_idx].copy()
    y_train = y_fixed[train_idx]
    y_test = y_fixed[test_idx]

    print(f"Unique labels in y_train: {np.unique(y_train)}")
    print(f"Unique labels in y_test: {np.unique(y_test)}")

    #Keeping this fold's split
    X_train_list.append(X_train)
    y_train_list.append(y_train)
    X_test_list.append(X_test)
    y_test_list.append(y_test)

#Only the first fold is used for training/testing in the rest of the notebook
first_fold = 0
X_train = X_train_list[first_fold]
X_test = X_test_list[first_fold]
y_train = y_train_list[first_fold]
y_test = y_test_list[first_fold]

#Printing the labels present in the selected fold as a final check
print(f"\nFINAL CHECK:")
print(f"Unique labels in FINAL y_train: {np.unique(y_train)}")
print(f"Unique labels in FINAL y_test: {np.unique(y_test)}")


In [None]:
#Standardizing input features: the scaler learns its statistics from the training data only
scaler = StandardScaler()
train_values = scaler.fit_transform(X_train)

#Wrapping the scaled training values in a DataFrame that keeps the original index and column names
X_train_scaled = pd.DataFrame(train_values, index=X_train.index, columns=X_train.columns)

#Applying the already fitted scaler to the test data (no refitting)
test_values = scaler.transform(X_test)
X_test_scaled = pd.DataFrame(test_values, index=X_test.index, columns=X_test.columns)

In [None]:
#Defining the XGBoost classifier
#XGBClassifier is reached through the `xg` alias because only XGBRegressor is imported by name,
#so the bare name XGBClassifier would raise a NameError
XGB = xg.XGBClassifier(
    objective="multi:softprob",             #Multi-class objective producing one probability per class
    eval_metric="mlogloss",                 #Multi-class log loss as evaluation metric
    num_class=len(np.unique(y_train)),      #Number of distinct encoded classes in the training fold
    random_state=42)                        #Fixed seed for reproducibility

#use_label_encoder is intentionally not passed: it is deprecated in recent XGBoost releases,
#and the labels are already integer-encoded with LabelEncoder before training

In [None]:
#Candidate values for every hyperparameter explored by the grid search
param_grid = dict(
    n_estimators=[75],
    max_depth=[5, 10, 15],
    learning_rate=[0.2, 0.5, 0.7],
    subsample=[0.4, 0.6],
    colsample_bytree=[0.2, 0.4],
    reg_alpha=[0, 1, 5],
    reg_lambda=[0, 1, 5],
)

#Per-sample weights that compensate for class imbalance in the training labels
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

#Exhaustive search over the grid: 3-fold cross-validation, scored on accuracy, using all CPU cores
CV_XGB = GridSearchCV(
    estimator=XGB,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

#Running the search on the scaled training data, forwarding the sample weights to each fit
CV_XGB.fit(X_train_scaled, y_train, sample_weight=sample_weights)

#Showing the hyperparameter combination that scored best
print('Best Parameters: ', CV_XGB.best_params_)

In [None]:
#The estimator refitted by GridSearchCV with the best hyperparameter combination
XGB_best = CV_XGB.best_estimator_

#Predicting the training and the test data with the best model, in that order
pred_train, pred_test = (
    XGB_best.predict(features) for features in (X_train_scaled, X_test_scaled)
)

In [None]:
#Calculating performance metrics for the training data
#All class-averaged metrics use weighted averaging and zero_division=0, the same settings as the
#values written by the log_results call, so printed and logged numbers agree
acc_train = accuracy_score(y_train, pred_train)
precision_train = precision_score(y_train, pred_train, average='weighted', zero_division=0)
recall_train = recall_score(y_train, pred_train, average='weighted', zero_division=0)
f1_train = f1_score(y_train, pred_train, average='weighted', zero_division=0)

#Printing all evaluation metrics with 4 decimals
print(f'Accuracy: {acc_train:.4f}')
print(f'Precision: {precision_train:.4f}')
print(f'Recall: {recall_train:.4f}')
print(f'F1 Score: {f1_train:.4f}')

#Every encoded class known to the label encoder, in order; passing these explicitly fixes the
#size and ordering of the confusion matrix even if a class is missing from this data subset
encoded_labels = np.arange(len(le.classes_))

#Computing the confusion matrix
cm = confusion_matrix(y_train, pred_train, labels=encoded_labels)

#Creating readable labels for the axes (one per row/column of the matrix)
class_labels = [f"Class {i}" for i in encoded_labels]

#Visualizing the confusion matrix
sns.heatmap(
    cm,
    annot=True,                 #Writing the count in every cell
    fmt="d",                    #Counts are shown as integers
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels)

plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

In [None]:
#Calculating performance metrics for test data
#Precision, recall and F1 all use weighted averaging with zero_division=0. Mixing macro and weighted
#averages (and zero_division=1, which counts undefined scores as perfect) made the printed values
#disagree with the ones written by the log_results call
acc_test = accuracy_score(y_test, pred_test)
precision_test = precision_score(y_test, pred_test, average='weighted', zero_division=0)
recall_test = recall_score(y_test, pred_test, average='weighted', zero_division=0)
f1_test = f1_score(y_test, pred_test, average='weighted', zero_division=0)

#Printing the evaluation metrics with 4 decimals
print(f'Accuracy: {acc_test:.4f}')
print(f'Precision: {precision_test:.4f}')
print(f'Recall: {recall_test:.4f}')
print(f'F1 Score: {f1_test:.4f}')

#Every encoded class known to the label encoder, in order. Without explicit labels the matrix is
#sized by the union of true and predicted labels, which can differ from the labels found in y_test
#alone and would then no longer match the tick labels
encoded_labels = np.arange(len(le.classes_))

#Creating a confusion matrix
cm = confusion_matrix(y_test, pred_test, labels=encoded_labels)

#Creating one axis label per row/column of the matrix
class_labels = [f"Class {i}" for i in encoded_labels]

#Plotting the confusion matrix
sns.heatmap(
    cm,
    annot=True,                 #Writing the count in every cell
    fmt="d",                    #Counts are shown as integers
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels)

plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

In [None]:
#Scaling the unseen inputs with the scaler fitted on the training data
#The result is wrapped in a DataFrame so the model receives the same column names it was trained on
X_unseen_scaled = pd.DataFrame(scaler.transform(X_unseen), index=X_unseen.index, columns=X_unseen.columns)
y_pred_unseen = XGB_best.predict(X_unseen_scaled)

#Encoding the unseen targets with the encoder fitted on the training targets
#(LabelEncoder.transform raises a ValueError if y_unseen contains a label it was not fitted on)
y_unseen_encoded = le.transform(y_unseen)

#Calculating performance metrics on the unseen data
acc_unseen = accuracy_score(y_unseen_encoded, y_pred_unseen)
precision_unseen = precision_score(y_unseen_encoded, y_pred_unseen, average='weighted', zero_division=0)
recall_unseen = recall_score(y_unseen_encoded, y_pred_unseen, average='weighted', zero_division=0)
f1_unseen = f1_score(y_unseen_encoded, y_pred_unseen, average='weighted', zero_division=0)

#Printing the evaluation metrics
print(f'Accuracy: {acc_unseen:.4f}')
print(f'Precision: {precision_unseen:.4f}')
print(f'Recall: {recall_unseen:.4f}')
print(f'F1 Score: {f1_unseen:.4f}')

#Every encoded class known to the label encoder, in order. The matrix is built from encoded labels,
#so the tick labels must be derived from the same encoded values; the raw y_unseen values can
#differ from them both in count and in meaning
encoded_labels = np.arange(len(le.classes_))

#Generating the confusion matrix
cm_unseen = confusion_matrix(y_unseen_encoded, y_pred_unseen, labels=encoded_labels)

#Creating one axis label per row/column of the matrix
class_labels = [f"Class {i}" for i in encoded_labels]

#Visualizing the confusion matrix
sns.heatmap(
    cm_unseen,
    annot=True,                 #Writing the count in every cell
    fmt="d",                    #Counts are shown as integers
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels)

plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

In [None]:
#Defining a function to log model evaluation metrics and hyperparameters to an Excel-file of chosen name
def log_results(model_name, params, accuracy_train, precision_train, recall_train, f1_train,
                accuracy_test, precision_test, recall_test, f1_test,
                accuracy_unseen, precision_unseen, recall_unseen, f1_unseen,
                filename="Results.xlsx"):
    """ Logs classification model results in separate sheets within the same Excel file, ensuring appending works correctly. """

    #Creating DataFrame for this model run
    result = pd.DataFrame([{
        **params, 
        "Accuracy_Train": accuracy_train,
        "Precision_Train": precision_train,
        "Recall_Train": recall_train,
        "F1_Train": f1_train,
        "Accuracy_Test": accuracy_test,
        "Precision_Test": precision_test,
        "Recall_Test": recall_test,
        "F1_Test": f1_test,
        "Accuracy_Unseen": accuracy_unseen,
        "Precision_Unseen": precision_unseen,
        "Recall_Unseen": recall_unseen,
        "F1_Unseen": f1_unseen
    }])

    #Introducing a short delay to avoid file conflicts if running in multiple notebooks
    time.sleep(1)

    #Checking if the file exists
    file_exists = os.path.exists(filename)

    if not file_exists:
        #If file doesn't exist, creating a new one
        with pd.ExcelWriter(filename, engine="openpyxl", mode="w") as writer:
            result.to_excel(writer, sheet_name=model_name, index=False)
    else:
        #If file exists, loading it properly before appending
        try:
            with pd.ExcelWriter(filename, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
                #Reading existing sheet 
                try:
                    existing_df = pd.read_excel(filename, sheet_name=model_name, engine="openpyxl")
                    df_combined = pd.concat([existing_df, result], ignore_index=True)
                except (FileNotFoundError, ValueError):
                    df_combined = result  #If sheet does not exist, creating it

                #Saving results
                df_combined.to_excel(writer, sheet_name=model_name, index=False)

        except PermissionError:
            #Printing error to warn user
            print(f"Error: Close the Excel file ({filename}) before running the script again.")

    #Printing a confirmation to ensure user results are logged
    print(f"Logged results for {model_name}: Train Accuracy={accuracy_train:.4f}, Test Accuracy={accuracy_test:.4f}, Unseen Accuracy={accuracy_unseen:.4f}")


In [None]:
#Using the function to log results from the XGBoost classifier model
log_results(
    model_name="XGBoost_class_stat",
    #Best hyperparameters are read directly from the fitted grid search
    #(no variable named best_params is defined in this notebook)
    params=CV_XGB.best_params_,

    accuracy_train=accuracy_score(y_train, pred_train),
    precision_train=precision_score(y_train, pred_train, average='weighted', zero_division=0),
    recall_train=recall_score(y_train, pred_train, average='weighted', zero_division=0),
    f1_train=f1_score(y_train, pred_train, average='weighted', zero_division=0),

    accuracy_test=accuracy_score(y_test, pred_test),
    precision_test=precision_score(y_test, pred_test, average='weighted', zero_division=0),
    recall_test=recall_score(y_test, pred_test, average='weighted', zero_division=0),
    f1_test=f1_score(y_test, pred_test, average='weighted', zero_division=0),

    accuracy_unseen=accuracy_score(y_unseen_encoded, y_pred_unseen),
    precision_unseen=precision_score(y_unseen_encoded, y_pred_unseen, average='weighted', zero_division=0),
    recall_unseen=recall_score(y_unseen_encoded, y_pred_unseen, average='weighted', zero_division=0),
    f1_unseen=f1_score(y_unseen_encoded, y_pred_unseen, average='weighted', zero_division=0)
)
