# XGBoost model

This notebook trains XGBoost classifiers to predict mortality. Several hyperparameter configurations are trained, saved to disk, and loaded again for validation.

In [44]:
# imports

import xgboost as xgb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm  # Import tqdm for progress bar
import threading  # Import threading for locking

import os
import sys
import asyncio

# Record the interpreter and library versions used for this run so the
# results can be traced back to a concrete environment.
print("Python version: \t", sys.version)
print("Pandas version: \t", pd.__version__)
print("NumPy version: \t\t", np.__version__)
print("Seaborn version: \t", sns.__version__)
print("XGBoost version: \t", xgb.__version__)

Python version: 	 3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]
Pandas version: 	 2.2.3
NumPy version: 		 2.2.4
Seaborn version: 	 0.13.2
XGBoost version: 	 3.0.0


In [45]:
# Load the training split from CSV: X_train is passed to the model as the
# feature matrix and y_train as the target. Paths are relative to the
# notebook's working directory.
# Note: pd.read_csv returns a DataFrame, so y_train is a single-column frame.

X_train = pd.read_csv(r"split data/X_train.csv")
y_train = pd.read_csv(r"split data/y_train.csv")

# Sanity check: report the shape and the total number of missing values of both frames.
print(f"X_train - Rows: {X_train.shape[0]}, Columns: {X_train.shape[1]}, Number of NaNs: {X_train.isna().sum().sum()}")
print(f"y_train - Rows: {y_train.shape[0]}, Columns: {y_train.shape[1]}, Number of NaNs: {y_train.isna().sum().sum()}")

X_train - Rows: 64199, Columns: 174, Number of NaNs: 0
y_train - Rows: 64199, Columns: 1, Number of NaNs: 0


In [65]:


# Hyperparameter grid for the XGBoost experiments.
#
# Every key maps to a list holding one value per configuration: the i-th entry
# of each list together forms configuration i.

N_CONFIGS = 4  # how many configurations are tried
hyperparameters = {
    # More trees give the model more capacity, but also more room to overfit.
    "n_estimators": [50, 100, 300, 500],
    # Deeper trees can capture stronger interactions between features.
    "max_depth": [3, 3, 5, 6],
    # The learning rate shrinks as the trees get deeper and more numerous.
    "learning_rate": [0.3, 0.3, 0.1, 0.01],
    # Binary classification with logistic loss, identical for every configuration.
    "objective": ["binary:logistic"] * N_CONFIGS,
    # -1 lets XGBoost use every available CPU core.
    "n_jobs": [-1] * N_CONFIGS,
}

async def train_modelvar(X_train: pd.DataFrame, y_train: pd.Series, savepath: str, i: int, hyperparameters: dict[str, object]) -> None:
    """
    Train one XGBoost classifier with the given hyperparameters and save it as JSON.

    The save location is validated and created *before* training, so an unusable
    ``savepath`` fails immediately instead of after an expensive fit. The blocking
    ``fit`` call is handed to a worker thread via ``asyncio.to_thread``; this lets
    several of these coroutines overlap when they are awaited together (e.g. with
    ``asyncio.gather``) instead of running strictly one after another.

    Parameters
    ----------
    X_train : training features, passed unchanged to ``XGBClassifier.fit``.
    y_train : training targets, passed unchanged to ``XGBClassifier.fit``.
    savepath : directory the model file is written to; created if missing.
        An empty string means the current working directory.
    i : index of the model variant, used in the file name ``model_{i}.json``.
    hyperparameters : keyword arguments forwarded to ``xgb.XGBClassifier``.

    Raises
    ------
    ValueError
        If ``savepath`` is not a string.
    """

    # Fail fast: reject a bad savepath before any training time is spent.
    if not isinstance(savepath, str):
        raise ValueError("savepath must be a string")

    # Create the target directory if needed. exist_ok avoids a race when several
    # variants are started at the same time; the empty string (current directory)
    # is skipped because os.makedirs("") would raise.
    if savepath and not os.path.isdir(savepath):
        os.makedirs(savepath, exist_ok=True)
        print(f"Created missing directory: {savepath}")

    # os.path.join copes with a missing trailing slash, so no manual fix-up is needed.
    model_file = os.path.join(savepath, f"model_{i}.json")

    # Train the model. fit() is a blocking call, so it runs in a worker thread to
    # keep the event loop free for the other variants.
    # NOTE: with n_jobs=-1 every concurrently training model requests all cores;
    # lower n_jobs if the resulting oversubscription slows training down.
    model = xgb.XGBClassifier(**hyperparameters)
    await asyncio.to_thread(model.fit, X_train, y_train)

    # Save the model to a file
    model.save_model(model_file)
    print(f"Saved model {i} to {model_file}")

async def train_variations(X_train: pd.DataFrame, y_train: pd.Series, savepath: str, hyperparameters: dict[str, list[object]]) -> None:
    """
    Train one model per hyperparameter configuration and save each of them to a file.

    ``hyperparameters`` maps every parameter name to a list with one value per
    configuration; configuration ``k`` is built from the ``k``-th entry of each
    list. The number of configurations is taken from these lists (not from a
    module-level constant), so the grid can be extended or shortened in one place.

    Parameters
    ----------
    X_train : training features, forwarded to ``train_modelvar``.
    y_train : training targets, forwarded to ``train_modelvar``.
    savepath : directory the model files are written to, forwarded to ``train_modelvar``.
    hyperparameters : mapping ``parameter name -> list of values``; all lists must
        have the same length.

    Raises
    ------
    ValueError
        If the value lists do not all have the same length.
    """

    # Every parameter needs exactly one value per configuration; a mismatch would
    # otherwise surface as an IndexError halfway through or silently drop entries.
    lengths = {param: len(values) for param, values in hyperparameters.items()}
    if len(set(lengths.values())) > 1:
        raise ValueError(f"all hyperparameter lists must have the same length, got: {lengths}")
    n_configs = next(iter(lengths.values()), 0)

    # create one coroutine per model variant
    tasks = []
    for model_variant_index in range(n_configs):
        paramvariation = {param: values[model_variant_index] for param, values in hyperparameters.items()}
        tasks.append(train_modelvar(X_train, y_train, savepath, model_variant_index, paramvariation))

    # run all training coroutines and wait until every one of them has finished
    await asyncio.gather(*tasks)

In [66]:

# Train one model per hyperparameter configuration and save each of them
# as model_<i>.json in the ./models/ directory.
# The top-level `await` relies on the notebook kernel's support for awaiting in a cell.

await train_variations(X_train, y_train, "./models/", hyperparameters)

Saved model 0 to ./models/model_0.json
Saved model 1 to ./models/model_1.json
Saved model 2 to ./models/model_2.json
Saved model 3 to ./models/model_3.json


## Validation

In [67]:
# load the models and evaluate them on validation data

# validation data
X_val = pd.read_csv(r"split data/X_val.csv")
y_val = pd.read_csv(r"split data/y_val.csv")


def _load_classifier(path: str) -> xgb.XGBClassifier:
    """Return a fresh XGBClassifier with the model stored at ``path`` loaded into it."""
    # load_model() fills the estimator in place and returns None, so the instance
    # has to be created and kept separately instead of chaining the two calls.
    classifier = xgb.XGBClassifier()
    classifier.load_model(path)
    return classifier


# models to validate
# The constructor hyperparameters are not passed again; they are not needed for validation.
models = [_load_classifier(f"./models/model_{i}.json") for i in range(N_CONFIGS)]

