In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib

In [2]:
try:
    # Try loading the file from your laptop path
    df = pd.read_csv('C:\\Users\\Republic Of Gamers\\OneDrive\\Documents\\GitHub\\TSDN-BoyWithLuv\\Source\\Prototype\\New\\front_service_1.csv')
except FileNotFoundError:
    # If the file is not found, try loading from the PC path
    df = pd.read_csv('C:\\Users\\Republic Of Gamers\\OneDrive\\Documents\\GitHub\\TSDN-BoyWithLuv\\Source\\Prototype\\New\\front_service_1.csv')

In [3]:
df.head()

Unnamed: 0,death,hospdead,sfdm2,age,sex,dzgroup,dzclass,num.co,edu,income,...,bili,crea,sod,ph,glucose,bun,urine,adlp,adls,adlsc
0,0,0,,62.84998,male,Lung Cancer,Cancer,0,11.0,$11-$25k,...,0.199982,1.199951,141.0,7.459961,,,,7.0,7.0,7.0
1,1,1,<2 mo. follow-up,60.33899,female,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,$11-$25k,...,,5.5,132.0,7.25,,,,,1.0,1.0
2,1,0,<2 mo. follow-up,52.74698,female,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,under $11k,...,2.199707,2.0,134.0,7.459961,,,,1.0,0.0,0.0
3,1,0,no(M2 and SIP pres),42.38498,female,Lung Cancer,Cancer,2,11.0,under $11k,...,,0.799927,139.0,,,,,0.0,0.0,0.0
4,0,0,no(M2 and SIP pres),79.88495,female,ARF/MOSF w/Sepsis,ARF/MOSF,1,,,...,,0.799927,143.0,7.509766,,,,,2.0,2.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9105 entries, 0 to 9104
Data columns (total 45 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   death     9105 non-null   int64  
 1   hospdead  9105 non-null   int64  
 2   sfdm2     7705 non-null   object 
 3   age       9105 non-null   float64
 4   sex       9105 non-null   object 
 5   dzgroup   9105 non-null   object 
 6   dzclass   9105 non-null   object 
 7   num.co    9105 non-null   int64  
 8   edu       7471 non-null   float64
 9   income    6123 non-null   object 
 10  scoma     9104 non-null   float64
 11  charges   8933 non-null   float64
 12  totcst    8217 non-null   float64
 13  totmcst   5630 non-null   float64
 14  avtisst   9023 non-null   float64
 15  race      9063 non-null   object 
 16  sps       9104 non-null   float64
 17  aps       9104 non-null   float64
 18  surv2m    9104 non-null   float64
 19  surv6m    9104 non-null   float64
 20  hday      9105 non-null   int6

In [5]:
df.rename(columns={
    'death':'death_outside_hospital',
    'hospdead':'death_inside_hospital',
    'age': 'age_years',
    'sex': 'gender',
    'dzgroup': 'disease_group',
    'dzclass': 'disease_class',
    'num.co': 'num_comorbidities',
    'edu': 'education_years',
    'income': 'income_range',
    'scoma': 'coma_score',
    'charges': 'hospital_charges',
    'totcst': 'total_cost_to_charges_ratio',
    'totmcst': 'total_micro_cost',
    'avtisst': 'avg_tiss_score',
    'race': 'race_ethnicity',
    'sps': 'support_physiology_score',
    'aps': 'apache_score',
    'surv2m': 'survival_2_months',
    'surv6m': 'survival_6_months',
    'hday': 'hospital_day_of_study_entry',
    'diabetes': 'has_diabetes',
    'dementia': 'has_dementia',
    'ca': 'cancer_status',
    'prg2m': 'physician_survival_2_months',
    'prg6m': 'physician_survival_6_months',
    'dnr': 'do_not_resuscitate_status',
    'dnrday': 'day_of_dnr_order',
    'meanbp': 'mean_arterial_bp',
    'wblc': 'white_blood_cell_count',
    'hrt': 'heart_rate',
    'resp': 'respiratory_rate',
    'temp': 'body_temperature_celsius',
    'pafi': 'pao2_fio2_ratio',
    'alb': 'serum_albumin',
    'bili': 'bilirubin_level',
    'crea': 'serum_creatinine',
    'sod': 'serum_sodium',
    'ph': 'blood_ph',
    'glucose': 'glucose_level',
    'bun': 'blood_urea_nitrogen',
    'urine': 'urine_output',
    'adlp': 'adl_index_patient',
    'adls': 'adl_index_surrogate',
    'sfdm2': 'functional_disability_level',
    'adlsc': 'calibrated_adl_index_surrogate'
}, inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9105 entries, 0 to 9104
Data columns (total 45 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   death_outside_hospital          9105 non-null   int64  
 1   death_inside_hospital           9105 non-null   int64  
 2   functional_disability_level     7705 non-null   object 
 3   age_years                       9105 non-null   float64
 4   gender                          9105 non-null   object 
 5   disease_group                   9105 non-null   object 
 6   disease_class                   9105 non-null   object 
 7   num_comorbidities               9105 non-null   int64  
 8   education_years                 7471 non-null   float64
 9   income_range                    6123 non-null   object 
 10  coma_score                      9104 non-null   float64
 11  hospital_charges                8933 non-null   float64
 12  total_cost_to_charges_ratio     82

In [7]:
functional_disability_level_mapping = {
    '<2 mo. follow-up': 1, 
    'no(M2 and SIP pres)': 2, 
    'SIP>=30': 3,
    'adl>=4 (>=5 if sur)': 4, 
    'Coma or Intub':5 
}

gender_mapping = {
    'male': 1, 
    'female': 2
}

disease_group_mapping = {
    'Lung Cancer': 1,
    'Cirrhosis': 2,
    'ARF/MOSF w/Sepsis': 3,
    'Coma': 4,
    'CHF': 5,
    'Colon Cancer': 6,
    'COPD': 7,
    'MOSF w/Malig': 8
}

disease_class_mapping = {
    'Cancer': 1,
    'COPD/CHF/Cirrhosis': 2,
    'ARF/MOSF': 3,
    'Coma': 4
}

income_range_mapping = {
    '$11-$25k': 1,
    'under $11k': 2,
    '$25-$50k': 3,
    '>$50k': 4
}

race_ethnicity_mapping = {
    'other': 1,
    'white': 2,
    'black': 3,
    'hispanic': 4,
    'asian': 5
}

cancer_status_mapping = {
    'metastatic': 1,
    'no': 0,
    'yes': 2
}

dnr_status_mapping = {
    'no dnr': 0,
    'dnr after sadm': 1,
    'dnr before sadm': 2
}

In [8]:
df['functional_disability_level'] = df['functional_disability_level'].map(functional_disability_level_mapping)
df['gender'] = df['gender'].map(gender_mapping)
df['disease_group'] = df['disease_group'].map(disease_group_mapping)
df['disease_class'] = df['disease_class'].map(disease_class_mapping)
df['income_range'] = df['income_range'].map(income_range_mapping)
df['race_ethnicity'] = df['race_ethnicity'].map(race_ethnicity_mapping)
df['cancer_status'] = df['cancer_status'].map(cancer_status_mapping)
df['do_not_resuscitate_status'] = df['do_not_resuscitate_status'].map(dnr_status_mapping)

In [9]:
df['age_years'] = df['age_years'].astype(int)

In [10]:
survival_df = df.copy()

In [11]:
survival_df['death'] = ((survival_df['death_outside_hospital'] == 1) | (survival_df['death_inside_hospital'] == 1)).astype(int)

In [12]:
survival_df[['death','death_outside_hospital','death_inside_hospital']]

Unnamed: 0,death,death_outside_hospital,death_inside_hospital
0,0,0,0
1,1,1,1
2,1,1,0
3,1,1,0
4,0,0,0
...,...,...,...
9100,0,0,0
9101,0,0,0
9102,0,0,0
9103,1,1,1


In [13]:
features = survival_df[['death','age_years', 'gender', 'num_comorbidities', 'has_diabetes', 'has_dementia', 'cancer_status',
    'functional_disability_level', 'coma_score', 'support_physiology_score', 'apache_score',
    'mean_arterial_bp', 'heart_rate', 'respiratory_rate', 'body_temperature_celsius',
    'serum_sodium', 'serum_creatinine', 'do_not_resuscitate_status']]

In [14]:
features['do_not_resuscitate_status'] = features['do_not_resuscitate_status'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['do_not_resuscitate_status'] = features['do_not_resuscitate_status'].fillna(0)


In [15]:
features = features.dropna()

In [16]:
# Separate the majority and minority classes
death_majority_class= features[features['death'] == 1]
death_minority_class = features[features['death'] == 0]

# Undersample the majority class
death_majority_class_undersampled = death_majority_class.sample(n=len(death_minority_class), random_state=42)

# Combine the undersampled majority class with the minority class
death_undersampled_df = pd.concat([death_majority_class_undersampled, death_minority_class], axis=0)

# Shuffle the dataset
death_undersampled_df= death_undersampled_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the class distribution after undersampling
print(death_undersampled_df['death'].value_counts())

death
1    2298
0    2298
Name: count, dtype: int64


In [17]:
X = death_undersampled_df.drop(columns = ['death'])
y = death_undersampled_df['death']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [19]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb_model = XGBClassifier()
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print("Best parameters found: ", grid_search.best_params_)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters found:  {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 1.0}


In [20]:
# Extract the best parameters from the grid search
best_params = grid_search.best_params_

# Initialize the XGBRegressor with the best parameters
best_xgb = XGBClassifier(**best_params)

# Fit the model to the full training data
best_xgb.fit(X_train, y_train)

# Make predictions on the test set or future data
y_pred = best_xgb.predict(X_test)
y_pred_prob = best_xgb.predict_proba(X_test)[:,1]
# Display predictions
print("Predictions:", y_pred)

Predictions: [1 0 0 ... 0 0 0]


In [21]:
# Print evaluation metrics
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nROC AUC Score:", roc_auc_score(y_test, y_pred_prob))

Confusion Matrix:
 [[615  64]
 [196 504]]

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.91      0.83       679
           1       0.89      0.72      0.79       700

    accuracy                           0.81      1379
   macro avg       0.82      0.81      0.81      1379
weighted avg       0.82      0.81      0.81      1379


ROC AUC Score: 0.873460972017673


In [22]:
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize the model
rf = RandomForestClassifier()

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

# Fit to the data
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters found: ", grid_search.best_params_)

Fitting 3 folds for each of 162 candidates, totalling 486 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters found:  {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [23]:
# Extract the best parameters from the grid search
best_params = grid_search.best_params_

# Initialize the XGBRegressor with the best parameters
best_rf = RandomForestClassifier(**best_params)

# Fit the model to the full training data
best_rf.fit(X_train, y_train)

# Make predictions on the test set or future data
y_pred_rf = best_rf.predict(X_test)
y_pred_prob_rf = best_rf.predict_proba(X_test)[:,1]
# Display predictions
print("Predictions:", y_pred_rf)

Predictions: [1 0 0 ... 0 1 0]


In [24]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("\nROC AUC Score:", roc_auc_score(y_test, y_pred_prob_rf))

Confusion Matrix:
 [[609  70]
 [196 504]]

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.90      0.82       679
           1       0.88      0.72      0.79       700

    accuracy                           0.81      1379
   macro avg       0.82      0.81      0.81      1379
weighted avg       0.82      0.81      0.81      1379


ROC AUC Score: 0.8727771933515673


In [26]:
joblib.dump(best_xgb, 'xgb_survival_rate.pkl')

['xgb_survival_rate.pkl']