# XGBoost model

This notebook trains XGBoost classifiers that predict mortality as a binary outcome.

In [1]:
# imports

import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from tqdm import tqdm

# Report the interpreter and library versions this notebook was run with,
# so results can be reproduced in a matching environment.
versions = (
    ("Python version: \t", sys.version),
    ("Pandas version: \t", pd.__version__),
    ("NumPy version: \t\t", np.__version__),
    ("Seaborn version: \t", sns.__version__),
    ("XGBoost version: \t", xgb.__version__),
)
for label, version in versions:
    print(label, version)

Python version: 	 3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]
Pandas version: 	 2.2.3
NumPy version: 		 2.2.4
Seaborn version: 	 0.13.2
XGBoost version: 	 3.0.0


In [3]:
# Load the training split.
# X_train holds the feature columns; y_train is read as a DataFrame as well
# (it is flattened with .values.ravel() when the model is fitted).

X_train = pd.read_csv(r"split data/X_train.csv")
y_train = pd.read_csv(r"split data/y_train.csv")

# Features and target must describe the same rows, otherwise fitting fails
# (or worse, silently pairs the wrong labels with the wrong samples).
assert len(X_train) == len(y_train), "X_train and y_train have different row counts"

# Summary statistics of the numeric columns.
print(X_train.describe())
print(y_train.describe())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
X_train.info()
y_train.info()

           num__age      num__bmi  num__elective_surgery   num__height  \
count  64199.000000  64199.000000           64199.000000  64199.000000   
mean      62.360407     29.305449               0.184894    169.678806   
std       16.454368      8.171771               0.388214     10.753492   
min       16.000000     14.844926               0.000000    137.200000   
25%       53.000000     23.804643               0.000000    162.560000   
50%       64.200000     27.932774               0.000000    170.100000   
75%       75.000000     33.121679               0.000000    177.800000   
max       89.000000     67.814990               1.000000    195.590000   

       num__pre_icu_los_days  num__readmission_status   num__weight  \
count           64199.000000                  64199.0  64199.000000   
mean                0.838438                      0.0     84.376884   
std                 2.434461                      0.0     24.772746   
min               -13.775000                     

In [None]:
# Train one XGBoost classifier per hyperparameter configuration and save each
# fitted model to disk as model_<i>.json (i starts at 1).

# The grid is column-oriented: position i of every list belongs to
# configuration i, so all lists must have the same length.
hyperparameters = {
    "n_estimators": [50, 100, 300, 500],  # more trees add capacity, but also make overfitting more likely
    "max_depth": [3, 3, 5, 6],  # deeper trees can capture more feature interactions
    "learning_rate": [0.3, 0.3, 0.1, 0.01],  # smaller steps for the larger, deeper ensembles
    "objective": ["binary:logistic"] * 4,  # binary classification with logistic loss for all models
    "n_jobs": [-1] * 4,  # use all available CPU cores
}

# The number of models is the length of the value lists, not the number of
# hyperparameter names. Fail early if the lists are ragged, because zip()
# below would otherwise silently drop configurations.
n_models = len(hyperparameters["n_estimators"])
assert all(len(values) == n_models for values in hyperparameters.values()), (
    "every hyperparameter list must contain exactly one value per model"
)

# Transpose the dict of lists into one dict of scalar settings per model.
model_configs = [
    dict(zip(hyperparameters, values))
    for values in zip(*hyperparameters.values())
]

for modelindex, params in enumerate(
    tqdm(model_configs, desc="Training models", unit="model"), start=1
):
    # tqdm.write prints above the progress bar without breaking it.
    tqdm.write(f"Training model {modelindex}/{n_models} with hyperparameters: {params}...")

    # `use_label_encoder` is obsolete in recent XGBoost releases, so it is no
    # longer passed.
    model = xgb.XGBClassifier(**params)

    # A single fit() call already runs all `n_estimators` boosting rounds.
    # Refitting in an "epoch" loop with xgb_model=model.get_booster() is not
    # needed: get_booster() raises on a model that has not been fitted yet,
    # and continuing training would add `n_estimators` extra trees per pass.
    model.fit(X_train, y_train.values.ravel(), verbose=False)

    model.save_model(f"model_{modelindex}.json")
