# Building Models to Predict Patient Readmission

## Loading and Cleaning Data

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
diabetic_df = pd.read_csv('../data/diabetic_data.csv')
diabetic_df

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),?,1,3,7,3,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,[80-90),?,1,4,5,5,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),?,1,1,7,1,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,443857166,31693671,Caucasian,Female,[80-90),?,2,3,7,10,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [3]:
# Dropping encounter and patient identifier columns
diabetic_df_clean = diabetic_df.drop(columns=['encounter_id', 'patient_nbr']).copy()

In [4]:
# Replacing missing values identified by '?' with nan
diabetic_df_clean.replace('?', np.nan, inplace=True)
diabetic_df_clean

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),,6,25,1,1,,Pediatrics-Endocrinology,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),,1,1,7,3,,,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),,1,1,7,2,,,...,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),,1,1,7,2,,,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),,1,1,7,1,,,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,AfricanAmerican,Male,[70-80),,1,3,7,3,MC,,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,AfricanAmerican,Female,[80-90),,1,4,5,5,MC,,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,Caucasian,Male,[70-80),,1,1,7,1,MC,,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,Caucasian,Female,[80-90),,2,3,7,10,MC,Surgery-General,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [5]:
# Dropping columns with high proportion of missing values and likely low predictive power
diabetic_df_clean.drop(columns=['weight', 'payer_code', 'medical_specialty'], inplace=True)

In [6]:
# Impute missing values in race column with mode of race
print(diabetic_df_clean.race.value_counts(dropna=False), '\n')

race_mode = diabetic_df_clean['race'].mode()[0]
print(f'Most common race: {race_mode}')
diabetic_df_clean['race'] = diabetic_df_clean['race'].replace(np.nan, race_mode)

race
Caucasian          76099
AfricanAmerican    19210
NaN                 2273
Hispanic            2037
Other               1506
Asian                641
Name: count, dtype: int64 

Most common race: Caucasian


In [7]:
# Dropping rows with `Unknown/Invalid` gender
len_1 = diabetic_df_clean.shape[0]
diabetic_df_clean = diabetic_df_clean[diabetic_df_clean['gender'] != 'Unknown/Invalid']
len_2 = diabetic_df_clean.shape[0]

print(f'Number of rows dropped: {len_1 - len_2}')

Number of rows dropped: 3


In [8]:
# Assessing percentages of missing values for `max_glu_serum` and `A1Cresult`
print('Percent of values in categories: \n')

print(
    100 * 
    diabetic_df_clean['max_glu_serum'].value_counts(dropna=False) /
    diabetic_df_clean['max_glu_serum'].value_counts(dropna=False).sum(),
    '\n'
)
print(
    100 * 
    diabetic_df_clean['A1Cresult'].value_counts(dropna=False) /
    diabetic_df_clean['A1Cresult'].value_counts(dropna=False).sum()
)

Percent of values in categories: 

max_glu_serum
NaN     94.746617
Norm     2.552008
>200     1.459273
>300     1.242102
Name: count, dtype: float64 

A1Cresult
NaN     83.276829
>8       8.073661
Norm     4.903550
>7       3.745959
Name: count, dtype: float64


In [9]:
# Replace missing values in max_glu_serum and A1Cresult columns with 'Missing' as new category
diabetic_df_clean.loc[:, ['max_glu_serum', 'A1Cresult']]  = diabetic_df_clean.loc[:, ['max_glu_serum', 'A1Cresult']] .fillna('Missing')

In [10]:
diabetic_df_clean

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,...,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,AfricanAmerican,Male,[70-80),1,3,7,3,51,0,16,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,AfricanAmerican,Female,[80-90),1,4,5,5,33,3,18,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,Caucasian,Male,[70-80),1,1,7,1,53,0,9,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,Caucasian,Female,[80-90),2,3,7,10,45,2,21,...,No,Up,No,No,No,No,No,Ch,Yes,NO


## Encoding Variables

In [11]:
diabetic_df_clean

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,...,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,AfricanAmerican,Male,[70-80),1,3,7,3,51,0,16,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,AfricanAmerican,Female,[80-90),1,4,5,5,33,3,18,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,Caucasian,Male,[70-80),1,1,7,1,53,0,9,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,Caucasian,Female,[80-90),2,3,7,10,45,2,21,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [12]:
# Encoding `race` using one-hot encoding
# diabetic_df_clean['race'].value_counts()
encoded_race = pd.get_dummies(diabetic_df_clean['race'], prefix='race')


# Encoding `gender` using one-hot encoding
# diabetic_df_clean['race'].value_counts()
encoded_gender = pd.get_dummies(diabetic_df_clean['gender'], prefix='gender')

# Encoding `age` using ordinal encoding
age_order = {
    '[0-10)': 0,
    '[10-20)': 1,
    '[20-30)': 2,
    '[30-40)': 3,
    '[40-50)': 4,
    '[50-60)': 5,
    '[60-70)': 6,
    '[70-80)': 7,
    '[80-90)': 8,
    '[90-100)': 9
}
encoded_age = diabetic_df_clean['age'].map(age_order)

# Encoding `admission_type_id` using one-hot encoding
# diabetic_df_clean['admission_type_id'].value_counts()
encoded_admission_type_id = pd.get_dummies(diabetic_df_clean['admission_type_id'], prefix='admission_type_id')

# Encoding `discharge_disposition_id` using binary encoding
from category_encoders import BinaryEncoder
discharge_disposition_encoder = BinaryEncoder(cols=['discharge_disposition_id'])
encoded_discharge_disposition_id = discharge_disposition_encoder.fit_transform(diabetic_df_clean['discharge_disposition_id'])

# Encoding `admission_source_id` using binary encoding
admission_source_encoder = BinaryEncoder(cols=['admission_source_id'])
encoded_admission_source_id = admission_source_encoder.fit_transform(diabetic_df_clean['admission_source_id'])

# Encoding `diag_1` using frequency encoding
diag_1_freq = diabetic_df_clean['diag_1'].value_counts()
diag_1_freq
encoded_diag_1 = diabetic_df_clean['diag_1'].map(diag_1_freq).to_frame(name='diag_1_freq')
# diabetic_df_clean['diag_1'].map(diag_1_freq).to_frame(name='diag_1_freq').join(diabetic_df_clean['diag_1']).drop_duplicates()

# Encoding `diag_2` using frequency encoding
diag_2_freq = diabetic_df_clean['diag_2'].value_counts()
diag_2_freq
encoded_diag_2 = diabetic_df_clean['diag_2'].map(diag_2_freq).to_frame(name='diag_2_freq')

# Encoding `diag_3` using frequency encoding
diag_3_freq = diabetic_df_clean['diag_3'].value_counts()
diag_3_freq
encoded_diag_3 = diabetic_df_clean['diag_3'].map(diag_3_freq).to_frame(name='diag_3_freq')

# Encoding `max_glu_serum` using ordinal encoding
max_glu_serum_order = {
    'Missing': -1,
    'Norm': 0,
    '>200': 1,
    '>300': 2
}
encoded_max_glu_serum = diabetic_df_clean['max_glu_serum'].map(max_glu_serum_order)

# Encoding `A1Cresult` using ordinal encoding
A1Cresult_order = {
    'Missing': -1,
    'Norm': 0,
    '>7': 1,
    '>8': 2
}
encoded_A1Cresult = diabetic_df_clean['A1Cresult'].map(A1Cresult_order)



In [13]:
# Encoding named medicine columns (from `metformin` to `metformin-pioglitazone`) using ordinal encoding
medicine_prescription_order = {
    'No': 0,
    'Down': 1,
    'Steady': 2,
    'Up': 3
}

# Get indices of named medicine columns
start_col = diabetic_df_clean.columns.get_loc('metformin')
end_col = diabetic_df_clean.columns.get_loc('metformin-pioglitazone')

# Loop through each named medicine column and encode using ordinal encoding
medicine_encoded_columns = pd.DataFrame()
for i in range(start_col, end_col + 1):
    colname = diabetic_df_clean.columns[i]
    encoded_colname = 'encoded_' + colname
    medicine_encoded_columns[encoded_colname] = diabetic_df_clean[colname].map(medicine_prescription_order)

# Checking medicine encoding has worked correctly
medicine_encoded_columns
# medicine_encoded_columns.join(diabetic_df_clean['metformin']).iloc[:, [0, -1]].value_counts()

Unnamed: 0,encoded_metformin,encoded_repaglinide,encoded_nateglinide,encoded_chlorpropamide,encoded_glimepiride,encoded_acetohexamide,encoded_glipizide,encoded_glyburide,encoded_tolbutamide,encoded_pioglitazone,...,encoded_troglitazone,encoded_tolazamide,encoded_examide,encoded_citoglipton,encoded_insulin,encoded_glyburide-metformin,encoded_glipizide-metformin,encoded_glimepiride-pioglitazone,encoded_metformin-rosiglitazone,encoded_metformin-pioglitazone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,3,0,0,0,0,0
2,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,3,0,0,0,0,0
4,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,2,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
101762,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
101763,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
101764,0,0,0,0,0,0,2,0,0,2,...,0,0,0,0,3,0,0,0,0,0


In [14]:
# Encoding `change` with one-hot encoding
encoded_change = pd.get_dummies(diabetic_df_clean['change'], prefix='change_')

# Checking `change` encoding worked correctly
encoded_change.join(diabetic_df_clean['change']).head(10)

Unnamed: 0,change__Ch,change__No,change
0,False,True,No
1,True,False,Ch
2,False,True,No
3,True,False,Ch
4,True,False,Ch
5,False,True,No
6,True,False,Ch
7,False,True,No
8,True,False,Ch
9,True,False,Ch


In [15]:
# Encoding `diabetesMed` with one-hot encoding
encoded_diabetesMed = pd.get_dummies(diabetic_df_clean['diabetesMed'], prefix='diabetesMed')

# Checking `diabetesMed` encoding worked correctly
encoded_diabetesMed.join(diabetic_df_clean['diabetesMed']).head(10)

Unnamed: 0,diabetesMed_No,diabetesMed_Yes,diabetesMed
0,True,False,No
1,False,True,Yes
2,False,True,Yes
3,False,True,Yes
4,False,True,Yes
5,False,True,Yes
6,False,True,Yes
7,False,True,Yes
8,False,True,Yes
9,False,True,Yes


In [16]:
# Encoding the target variable, `readmitted` using ordinal encoding - higher number means better outcome
readmitted_order = {
    '<30': 0,
    '>30': 1,
    'NO': 2
}

encoded_readmitted = diabetic_df_clean['readmitted'].map(readmitted_order)

# Checking `readmitted` encoding worked correctly 
diabetic_df_clean['readmitted'].to_frame(name='orig_readmitted').join(encoded_readmitted).head(10)

Unnamed: 0,orig_readmitted,readmitted
0,NO,2
1,>30,1
2,NO,2
3,NO,2
4,NO,2
5,>30,1
6,NO,2
7,>30,1
8,NO,2
9,NO,2


## Combining Encoded Variables into DataFrame

In [17]:
# Defining columns which were not encoded (same as original dataframe)
non_encoded_col_names = [
                    'time_in_hospital',
                    'num_lab_procedures', 
                    'num_procedures', 
                    'num_medications', 
                    'number_outpatient', 
                    'number_emergency', 
                    'number_inpatient', 
                    'number_diagnoses'
                    ]

# Getting names of columns which have been encoded
encoded_col_names = [col for col in diabetic_df_clean.columns if col not in non_encoded_col_names]

# Creating dictionary for name of column and variable name with encoded variables
encoded_vars_dict = {
    f'encoded_{name}': globals()[f'encoded_{name}']
    for name in encoded_col_names
    if f'encoded_{name}' in globals() and f'encoded_{name}' not in medicine_encoded_columns.columns
}

# Converting dictionary into dataframe with encoded variables
encoded_vars_df = pd.concat(encoded_vars_dict.values(), axis=1)

# Joining encoded variables with non-encoded variables
diabetic_df_clean_encoded = encoded_vars_df.join(diabetic_df_clean[non_encoded_col_names])

# Joining with named and encoded medicines dataframe
diabetic_df_clean_encoded = diabetic_df_clean_encoded.join(medicine_encoded_columns)

# Moving target variable, `readmitted` to be first column
cols = diabetic_df_clean_encoded.columns.to_list()
cols.remove('readmitted')
cols.insert(0, 'readmitted')
diabetic_df_clean_encoded = diabetic_df_clean_encoded[cols]

In [18]:
# Checking final clean, encoded dataframe
diabetic_df_clean_encoded.head(10)

Unnamed: 0,readmitted,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,gender_Female,gender_Male,age,admission_type_id_1,...,encoded_troglitazone,encoded_tolazamide,encoded_examide,encoded_citoglipton,encoded_insulin,encoded_glyburide-metformin,encoded_glipizide-metformin,encoded_glimepiride-pioglitazone,encoded_metformin-rosiglitazone,encoded_metformin-pioglitazone
0,2,False,False,True,False,False,True,False,0,False,...,0,0,0,0,0,0,0,0,0,0
1,1,False,False,True,False,False,True,False,1,True,...,0,0,0,0,3,0,0,0,0,0
2,2,True,False,False,False,False,True,False,2,True,...,0,0,0,0,0,0,0,0,0,0
3,2,False,False,True,False,False,False,True,3,True,...,0,0,0,0,3,0,0,0,0,0
4,2,False,False,True,False,False,False,True,4,True,...,0,0,0,0,2,0,0,0,0,0
5,1,False,False,True,False,False,False,True,5,False,...,0,0,0,0,2,0,0,0,0,0
6,2,False,False,True,False,False,False,True,6,False,...,0,0,0,0,2,0,0,0,0,0
7,1,False,False,True,False,False,False,True,7,True,...,0,0,0,0,0,0,0,0,0,0
8,2,False,False,True,False,False,True,False,8,False,...,0,0,0,0,2,0,0,0,0,0
9,2,False,False,True,False,False,True,False,9,False,...,0,0,0,0,2,0,0,0,0,0


## Creating Training and Testing Data

In [45]:
from sklearn.model_selection import train_test_split

X = diabetic_df_clean_encoded.iloc[:, 1:] # Input features
y = diabetic_df_clean_encoded.iloc[:, 0] # Target variable

# 80% train, 20% test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversampling to balance readmission classes
train_df = pd.concat([X_train, y_train], axis=1)
max_count = train_df['readmitted'].value_counts().max()
train_df_balanced = pd.concat([
    group.sample(n=max_count, replace=True, random_state=42)
    for _, group in train_df.groupby('readmitted')
])
# Shuffling balanced dataframe
train_df_balanced = train_df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Splitting balanced dataframe back into feature dataframe and target dataframe
X_train_balanced = train_df_balanced.drop(columns=['readmitted'])
y_train_balanced = train_df_balanced['readmitted']

## Creating Decision Tree Model

In [48]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Initialise model with classes balanced automatically
dt_model = DecisionTreeClassifier(max_depth=4, class_weight='balanced')

# Train model
dt_model.fit(X=X_train_balanced, y=y_train_balanced)

# Make predictions
y_pred = dt_model.predict(X_test)

# Evaluate performance
print('Accuracy:', accuracy_score(y_test, y_pred), '\n') # Accuracy
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred), '\n')
print('Classification Report:\n', classification_report(y_test, y_pred))

Accuracy: 0.4820910922222768 

Confusion Matrix:
 [[1079  259  931]
 [2531 1116 3421]
 [2158 1241 7617]] 

Classification Report:
               precision    recall  f1-score   support

           0       0.19      0.48      0.27      2269
           1       0.43      0.16      0.23      7068
           2       0.64      0.69      0.66     11016

    accuracy                           0.48     20353
   macro avg       0.42      0.44      0.39     20353
weighted avg       0.51      0.48      0.47     20353



## Creating Random Forest Model

In [52]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Create parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [300],
    'max_depth': [15],
    'min_samples_split': [2],
    'min_samples_leaf': [2],
}

# Initialise model with classes balanced automatically
rf_model = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
    class_weight='balanced'
)

# Create grid search
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=2,
    scoring='f1_weighted',
    verbose=2,
    n_jobs=-1
)

# Train model
grid_search.fit(X_train_balanced, y_train_balanced)

# Find best model
best_rf_model = grid_search.best_estimator_

# Make predictions
y_pred = best_rf_model.predict(X_test)

# Evaluate performance
print('Best Parameters:', grid_search.best_params_, '\n')
print('Accuracy:', accuracy_score(y_test, y_pred), '\n') # Accuracy
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred), '\n') # Confusion matrix
print('Classification Report:\n', classification_report(y_test, y_pred)) # Classification report

Fitting 2 folds for each of 1 candidates, totalling 2 fits
Best Parameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300} 

Accuracy: 0.5442932245860561 

Confusion Matrix:
 [[ 666  852  751]
 [1191 3165 2712]
 [1049 2720 7247]] 

Classification Report:
               precision    recall  f1-score   support

           0       0.23      0.29      0.26      2269
           1       0.47      0.45      0.46      7068
           2       0.68      0.66      0.67     11016

    accuracy                           0.54     20353
   macro avg       0.46      0.47      0.46     20353
weighted avg       0.55      0.54      0.55     20353



## Creating XGBoost Model

In [53]:
import xgboost as xgb

# Initialize model
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',   # multi-class classification
    num_class=3,                 # number of classes
    max_depth=10,                 # tree depth
    n_estimators=200,            # number of trees
    learning_rate=0.1,           # step size shrinkage
    use_label_encoder=False,     # to suppress warning
    eval_metric='mlogloss',      # evaluation metric during training
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=1
)

# Train model
xgb_model.fit(X_train_balanced, y_train_balanced)

# Predict
y_pred = xgb_model.predict(X_test)

# Evaluate performance
print('Accuracy:', accuracy_score(y_test, y_pred), '\n') # Accuracy
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred), '\n')
print('Classification Report:\n', classification_report(y_test, y_pred))

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.5310273669729279 

Confusion Matrix:
 [[ 602  932  735]
 [1180 3399 2489]
 [1131 3078 6807]] 

Classification Report:
               precision    recall  f1-score   support

           0       0.21      0.27      0.23      2269
           1       0.46      0.48      0.47      7068
           2       0.68      0.62      0.65     11016

    accuracy                           0.53     20353
   macro avg       0.45      0.45      0.45     20353
weighted avg       0.55      0.53      0.54     20353

