In [4]:
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef


In [71]:
# data=pd.read_csv("cleaned_data.csv")
# data=data.drop(columns=["icustay_id","hadm_id","intime"	,"outtime",	"dbsource",	"suspected_infection_time_poe",
# "antibiotic_time_poe","blood_culture_time","gender","ethnicity","icu_los","hosp_los","specimen_poe","first_service","subject_id"
# ])


# data['colloid_bolus']=data["colloid_bolus"].fillna(0)
# data=data.fillna(data.mean())
# # infection_specimen=OneHotEncoder().fit_transform(data["first_service"].to_numpy().reshape(-1,1))

# train_data,test_data=train_test_split(data,test_size=0.2,random_state=505)

# train_label=train_data["thirtyday_expire_flag"]
# train_data=train_data.drop(columns=["thirtyday_expire_flag","hospital_expire_flag"])

# test_label=test_data["thirtyday_expire_flag"]
# test_data=test_data.drop(columns=["thirtyday_expire_flag","hospital_expire_flag"])

In [3]:
def one_hot_encode_non_binary(df):
    # Identify columns that are categorical and non-binary (more than two unique values)
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    non_binary_cols = [col for col in categorical_cols if df[col].nunique() > 2]

    # Perform one-hot encoding only on non-binary columns
    df_encoded = pd.get_dummies(df, columns=non_binary_cols, drop_first=True)
    
    return df_encoded


data=pd.read_csv("all_patients_data.csv")
data=data.drop(columns=["SUBJECT_ID","HADM_ID","ICUSTAY_ID"	,"LOS_HOSPITAL","LOS_ICU"])
data=one_hot_encode_non_binary(data)
data["GENDER"]=(data["GENDER"]=='M')


train_data,test_data=train_test_split(data,test_size=0.2,random_state=505)

train_label=train_data["HOSPITAL_EXPIRE_FLAG"]
train_data=train_data.drop(columns=["HOSPITAL_EXPIRE_FLAG"])

test_label=test_data["HOSPITAL_EXPIRE_FLAG"]
test_data=test_data.drop(columns=["HOSPITAL_EXPIRE_FLAG"])

# Random Forest

In [5]:
random_forest_clf=RandomForestClassifier(n_estimators=200,min_samples_split=5)


param_grid_rf = {
    'n_estimators': [100, 200, 500],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'bootstrap': [True, False]
}
clf_rf = GridSearchCV(random_forest_clf, param_grid_rf,cv=5)

clf_rf.fit(train_data,train_label)

# random_forest_clf.score(test_data,test_label)

KeyboardInterrupt: 

In [58]:
pred_label=clf_rf.best_estimator_.predict(test_data)
print(classification_report(test_label, pred_label))
print(matthews_corrcoef(test_label,pred_label))

              precision    recall  f1-score   support

           0       0.85      0.97      0.91       723
           1       0.76      0.33      0.46       189

    accuracy                           0.84       912
   macro avg       0.80      0.65      0.68       912
weighted avg       0.83      0.84      0.81       912



1.Descriptive figs by group
2.Literature review

3.Outliers detection for expired patients()
4.Statistical tests for mortality rate on categorical covariates
Continuouse Features: whether feature space is seperable



Classification result: use Matthew correlation coef

# Logistic Regression (Prediction)

In [62]:
Logistic_clf=LogisticRegression(penalty='l1',solver="liblinear")


param_grid_logistic = {
    'C': [0.1,0.2,0.5,1,2],
}
clf_logistic = GridSearchCV(Logistic_clf, param_grid_logistic,cv=5)

clf_logistic.fit(train_data,train_label)

# random_forest_clf.score(test_data,test_label)

GridSearchCV(cv=5,
             estimator=LogisticRegression(penalty='l1', solver='liblinear'),
             param_grid={'C': [0.1, 0.2, 0.5, 1, 2]})

In [63]:
pred_label=clf_logistic.best_estimator_.predict(test_data)
print(classification_report(test_label, pred_label))

              precision    recall  f1-score   support

           0       0.85      0.95      0.90       723
           1       0.64      0.37      0.47       189

    accuracy                           0.83       912
   macro avg       0.75      0.66      0.68       912
weighted avg       0.81      0.83      0.81       912



# XBG boost

In [69]:

xbg_clf=xgb.XGBClassifier()


# param_grid_xgb = {
#     'n_estimators': [100, 200, 500],
#     'learning_rate': [0.001, 0.01, 0.1],
#     'max_depth': [3, 5, 10],
#     'subsample': [0.7, 0.8, 1.0],
#     'colsample_bytree': [0.7, 0.8, 1.0],
#     'gamma': [0, 0.1, 0.2],
#     'reg_alpha': [0, 0.01, 0.1],
#     'reg_lambda': [1, 1.5, 2]
# }

param_grid_xgb = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5]}
clf_xgb = GridSearchCV(xbg_clf, param_grid_xgb,cv=5)

clf_xgb.fit(train_data,train_label)


GridSearchCV(cv=5, estimator=XGBClassifier(),
             param_grid={'max_depth': [3, 5], 'n_estimators': [100, 200, 500]})

In [70]:
pred_label=clf_xgb.best_estimator_.predict(test_data)
print(classification_report(test_label, pred_label))

              precision    recall  f1-score   support

           0       0.86      0.96      0.91       723
           1       0.72      0.38      0.50       189

    accuracy                           0.84       912
   macro avg       0.79      0.67      0.70       912
weighted avg       0.83      0.84      0.82       912



# Logistic Regression (Inference)

In [55]:
Logistic_clf=sm.Logit(endog=train_label,exog=train_data.drop(columns=["race_other"]))
Logistic_result= Logistic_clf.fit_regularized(method='l1',alpha=1)

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.3425373708461775
            Iterations: 281
            Function evaluations: 384
            Gradient evaluations: 281


Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers


In [56]:
Logistic_result.summary()

0,1,2,3
Dep. Variable:,thirtyday_expire_flag,No. Observations:,3647.0
Model:,Logit,Df Residuals:,3558.0
Method:,MLE,Df Model:,88.0
Date:,"Wed, 16 Oct 2024",Pseudo R-squ.:,0.3045
Time:,15:24:07,Log-Likelihood:,-1240.4
converged:,True,LL-Null:,-1783.5
Covariance Type:,nonrobust,LLR p-value:,9.864000000000001e-172

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
suspected_infection_time_poe_days,-0.2116,0.195,-1.085,0.278,-0.594,0.171
positiveculture_poe,-0.1290,0.169,-0.762,0.446,-0.461,0.203
blood_culture_positive,0.0999,0.126,0.792,0.428,-0.147,0.347
age,0.0249,0.004,6.009,0.000,0.017,0.033
is_male,0.0266,0.114,0.233,0.815,-0.197,0.250
race_white,-0.3716,0.137,-2.711,0.007,-0.640,-0.103
race_black,-0.9565,0.246,-3.884,0.000,-1.439,-0.474
race_hispanic,-0.3674,0.352,-1.044,0.296,-1.057,0.322
metastatic_cancer,0.8565,0.195,4.401,0.000,0.475,1.238
