## Exploratory Modelling of Imputed Dataset (Clean)

In [27]:
# import dependencies
import pandas as pd
import numpy as np
import duckdb
from duckdb import sql
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

In [2]:
# import dataset
data = pd.read_csv('../data/CARES_data_imputedv1.csv', dtype={'RCRI_score':str})

In [3]:
# convert ICU variable to [0,1]
data['ICUAdmgt24h'] = data['ICUAdmgt24h'].replace({'yes': 1, 'no': 0})

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69667 entries, 0 to 69666
Data columns (total 24 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   GENDER                             69667 non-null  object 
 1   RCRI_score                         69667 non-null  object 
 2   Anemia category                    69667 non-null  object 
 3   Preoptransfusionwithin30days       69667 non-null  float64
 4   Intraop                            69667 non-null  float64
 5   Postopwithin30days                 69667 non-null  float64
 6   Transfusionintraandpostop          69667 non-null  float64
 7   AnaestypeCategory                  69667 non-null  object 
 8   PriorityCategory                   69667 non-null  object 
 9   TransfusionIntraandpostopCategory  69667 non-null  object 
 10  AGEcategory                        69667 non-null  object 
 11  Mortality                          69667 non-null  obj

### Multivariate Analysis for Categorical Response "ICUAdmg24h"

In [5]:
# Define categorical, numeric & target columns
# Define the target columns
target_cols = ['ICUAdmgt24h', 'thirtydaymortality', 'Mortality']

# Select categorical columns and exclude target columns
cat_cols = data.select_dtypes(include=['object', 'category']).columns.to_list()
cat_cols = [col for col in cat_cols if col not in target_cols]

# Select numerical columns and exclude target columns
num_cols = data.select_dtypes(include=['number']).columns.tolist()
num_cols = [col for col in num_cols if col not in target_cols]

In [6]:
# See what are the unique categories present in the categorical columns
summary_cat_data = {
    'Column': [],
    'Number of Unique Values': [],
    'Unique Values': []
}

for column in cat_cols:
    summary_cat_data['Column'].append(column)
    summary_cat_data['Number of Unique Values'].append(data[column].nunique())
    summary_cat_data['Unique Values'].append(data[column].unique())

summary_df = pd.DataFrame(summary_cat_data)
summary_df
# There are no NaN values in the columns

Unnamed: 0,Column,Number of Unique Values,Unique Values
0,GENDER,2,"[FEMALE, MALE]"
1,RCRI_score,7,"[0, 1, 2, 4, 3, 6, 5]"
2,Anemia category,3,"[mild, moderate/severe, none]"
3,AnaestypeCategory,2,"[GA, RA]"
4,PriorityCategory,2,"[Elective, Emergency]"
5,TransfusionIntraandpostopCategory,3,"[0 units, 1 unit, 2 or more units]"
6,AGEcategory,6,"[50-64, 65-74, 30-49, 75-84, 18-29, >=85]"
7,SurgRiskCategory,3,"[Low, Moderate, High]"
8,RaceCategory,4,"[Chinese, Indian, Others, Malay]"
9,CVARCRICategory,2,"[no, yes]"


In [7]:
# One-hot encode the 'AKIN' and 'age_category' columns with drop_first to prevent multicollinearity
encoded_data = pd.get_dummies(data, columns=cat_cols, drop_first=True)
encoded_data.columns

Index(['Preoptransfusionwithin30days', 'Intraop', 'Postopwithin30days',
       'Transfusionintraandpostop', 'Mortality', 'thirtydaymortality',
       'ICUAdmgt24h', 'GENDER_MALE', 'RCRI_score_1', 'RCRI_score_2',
       'RCRI_score_3', 'RCRI_score_4', 'RCRI_score_5', 'RCRI_score_6',
       'Anemia category_moderate/severe', 'Anemia category_none',
       'AnaestypeCategory_RA', 'PriorityCategory_Emergency',
       'TransfusionIntraandpostopCategory_1 unit',
       'TransfusionIntraandpostopCategory_2 or more units',
       'AGEcategory_30-49', 'AGEcategory_50-64', 'AGEcategory_65-74',
       'AGEcategory_75-84', 'AGEcategory_>=85', 'SurgRiskCategory_Low',
       'SurgRiskCategory_Moderate', 'RaceCategory_Indian',
       'RaceCategory_Malay', 'RaceCategory_Others', 'CVARCRICategory_yes',
       'IHDRCRICategory_yes', 'CHFRCRICategory_yes',
       'DMinsulinRCRICategory_yes', 'CreatinineRCRICategory_yes',
       'GradeofKidneyCategory_G2', 'GradeofKidneyCategory_G3',
       'GradeofKidney

In [8]:
# Function to get all numerical and boolean columns
def get_numeric_and_bool_columns(data):
    """
    Obtain all numerical and boolean columns from a dataset.

    Parameters:
    - data (pd.DataFrame): The dataset from which to extract columns.

    Returns:
    - list: A list of column names that are numerical or boolean.
    """
    # Select columns with numeric and bool data types
    numeric_and_bool_cols = data.select_dtypes(include=['number', 'bool']).columns.tolist()
    return numeric_and_bool_cols

# Get the list of numerical and boolean columns
encoded_col = get_numeric_and_bool_columns(encoded_data)

# Print the result
print("Numerical and boolean columns:", encoded_col)

Numerical and boolean columns: ['Preoptransfusionwithin30days', 'Intraop', 'Postopwithin30days', 'Transfusionintraandpostop', 'thirtydaymortality', 'ICUAdmgt24h', 'GENDER_MALE', 'RCRI_score_1', 'RCRI_score_2', 'RCRI_score_3', 'RCRI_score_4', 'RCRI_score_5', 'RCRI_score_6', 'Anemia category_moderate/severe', 'Anemia category_none', 'AnaestypeCategory_RA', 'PriorityCategory_Emergency', 'TransfusionIntraandpostopCategory_1 unit', 'TransfusionIntraandpostopCategory_2 or more units', 'AGEcategory_30-49', 'AGEcategory_50-64', 'AGEcategory_65-74', 'AGEcategory_75-84', 'AGEcategory_>=85', 'SurgRiskCategory_Low', 'SurgRiskCategory_Moderate', 'RaceCategory_Indian', 'RaceCategory_Malay', 'RaceCategory_Others', 'CVARCRICategory_yes', 'IHDRCRICategory_yes', 'CHFRCRICategory_yes', 'DMinsulinRCRICategory_yes', 'CreatinineRCRICategory_yes', 'GradeofKidneyCategory_G2', 'GradeofKidneyCategory_G3', 'GradeofKidneyCategory_G4-G5', 'RDW15.7_>15.7', 'ASAcategorybinned_II', 'ASAcategorybinned_III', 'ASAcatego

### Estimate Logistic Regression Model (for feature importance)

In [9]:
import statsmodels.api as sm

# Define the variables for the logistic regression model
X = encoded_data[['GENDER_MALE', 'RCRI_score_1', 'RCRI_score_2', 'RCRI_score_3', 'RCRI_score_4', 'RCRI_score_5', 'RCRI_score_6', 'Anemia category_moderate/severe', 'Anemia category_none', 'AGEcategory_30-49', 'AGEcategory_50-64', 'AGEcategory_65-74','AGEcategory_75-84', 'AGEcategory_>=85', 'RDW15.7_>15.7', 'ASAcategorybinned_II', 'ASAcategorybinned_III', 'ASAcategorybinned_IV-VI']]
y = encoded_data['ICUAdmgt24h']

# Convert the data to ensure compatibility with the logistic regression model package
X = X.astype(float)
y = y.astype(float)

X = sm.add_constant(X)

# Fit the logistic regression model
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# Display the summary of the logistic regression model
summary = result.summary()
print(summary)

         Current function value: 0.071261
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:            ICUAdmgt24h   No. Observations:                69667
Model:                          Logit   Df Residuals:                    69648
Method:                           MLE   Df Model:                           18
Date:                Tue, 06 Aug 2024   Pseudo R-squ.:                  0.1639
Time:                        19:43:52   Log-Likelihood:                -4964.5
converged:                      False   LL-Null:                       -5937.5
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const                              -6.1641      0.219    -28.101      0.000      -6.594      -5.734
GE



### Findings/Conclusions:
1. ASAcategorybinned_IV-VI
- Coefficient: 3.8703
- Interpretation: A presence of ASA PS Score of (IV-VI) increases the log-odds of ICU admission (24H) by 3.8703. This translates to an odds ratio of (e^3.8703\approx 47.96), indicating that an ASA PS Score of (IV-VI) increases the ods of ICU admission (24H) by approximately 50 times!, holding other factors constant.

2. RCRI Score 5
- Coefficient: 0.6166
- Interpretation: A presence of RCRI score 5 increases the log-odds of ICU admission (24H) by 0.6166. This translates to an odds ratio of (e^0.6166\approx 1.85), indicating that an RCRI score of 5 increases the ods of ICU admission (24H) by approximately 2 times, holding other factors constant. 

### Using Tree-Based Models for Exploratory Modelling & Feature Selection

In [43]:
def train_validate_test_split(data, cat_cols, num_cols, target_var, test_size=0.2, val_size=0.2, random_state=42):
    # Split the data into training and temporary sets
    train_data, temp_data = train_test_split(data, test_size=test_size, random_state=random_state, stratify=data[target_var])
    
    # Split the temporary set into validation and test sets
    val_size_adjusted = val_size / (1 - test_size)  # Adjust val_size to account for reduced size of temp_data
    val_data, test_data = train_test_split(temp_data, test_size=val_size_adjusted, random_state=random_state, stratify=temp_data[target_var])
    
    return train_data, val_data, test_data

def preprocess_and_train(data, cat_cols, num_cols, target_var):
    # Train-validation-test split
    train_data, val_data, test_data = train_validate_test_split(data, cat_cols, num_cols, target_var)
    
    # Separate features and target
    X_train = train_data.drop(columns=[target_var])
    y_train = train_data[target_var]
    X_val = val_data.drop(columns=[target_var])
    y_val = val_data[target_var]
    X_test = test_data.drop(columns=[target_var])
    y_test = test_data[target_var]
    
    # Preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
        ]
    )
    
    # Random Forest Classifier
    rf = RandomForestClassifier(random_state=42, class_weight='balanced')
    
    # Full pipeline with preprocessing and classifier
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', rf)
    ])
    
    # Hyperparameter tuning with RandomizedSearchCV
    param_distributions = {
        'classifier__n_estimators': np.array([300, 500]),
        'classifier__max_depth': np.array([10, 15]),
        'classifier__min_samples_split': np.arange(2, 8, 2),
        'classifier__min_samples_leaf': np.arange(3, 9, 2)
    }
    
    search = RandomizedSearchCV(pipeline, param_distributions, n_iter=10, scoring='roc_auc', cv=5, random_state=42, n_jobs=-1)
    search.fit(X_train, y_train)
    
    # Best model
    best_model = search.best_estimator_
    
    # Evaluate on validation set
    y_val_pred = best_model.predict(X_val)
    y_val_pred_proba = best_model.predict_proba(X_val)[:, 1]
    
    print("Validation Metrics:")
    print(f"Accuracy: {accuracy_score(y_val, y_val_pred):.4f}")
    print(f"Precision: {precision_score(y_val, y_val_pred):.4f}")
    print(f"Recall: {recall_score(y_val, y_val_pred):.4f}")
    print(f"F1 Score: {f1_score(y_val, y_val_pred):.4f}")
    print(f"ROC AUC Score: {roc_auc_score(y_val, y_val_pred_proba):.4f}")
    
    # Evaluate on test set
    y_test_pred = best_model.predict(X_test)
    y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]
    
    print("\nTest Metrics:")
    print(f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_test_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_test_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_test_pred):.4f}")
    print(f"ROC AUC Score: {roc_auc_score(y_test, y_test_pred_proba):.4f}")
    
    # Feature importance
    # Extract the feature importances from the best model
    feature_names = num_cols + list(best_model.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(cat_cols))
    feature_importances = best_model.named_steps['classifier'].feature_importances_
    
    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)
    
    print("\nFeature Importance:")
    print(feature_importance_df.head(10))
    
    return best_model

target_var = 'ICUAdmgt24h'  # Replace with target col
best_model = preprocess_and_train(data, cat_cols, num_cols, target_var)

Validation Metrics:
Accuracy: 0.9052
Precision: 0.1222
Recall: 0.7543
F1 Score: 0.2104
ROC AUC Score: 0.9137

Test Metrics:
Accuracy: 0.9033
Precision: 0.1202
Recall: 0.7458
F1 Score: 0.2071
ROC AUC Score: 0.8936

Feature Importance:
                                      Feature  Importance
29                      SurgRiskCategory_High    0.083279
30                       SurgRiskCategory_Low    0.066219
55                    ASAcategorybinned_IV-VI    0.064964
3                   Transfusionintraandpostop    0.060080
1                                     Intraop    0.058148
0                Preoptransfusionwithin30days    0.051405
54                      ASAcategorybinned_III    0.051358
20  TransfusionIntraandpostopCategory_0 units    0.049268
19                 PriorityCategory_Emergency    0.041926
18                  PriorityCategory_Elective    0.038978


#### Feature Importance for 'thirtydaymortality'

In [33]:
target_var = 'thirtydaymortality'
best_model2 = preprocess_and_train(data, cat_cols, num_cols, target_var)

Validation Metrics:
Accuracy: 0.9411
Precision: 0.0773
Recall: 0.6203
F1 Score: 0.1374
ROC AUC Score: 0.9333

Test Metrics:
Accuracy: 0.9455
Precision: 0.0859
Recall: 0.6538
F1 Score: 0.1518
ROC AUC Score: 0.9417

Feature Importance:
                            Feature  Importance
15             Anemia category_none    0.101140
14  Anemia category_moderate/severe    0.072899
18        PriorityCategory_Elective    0.064950
55          ASAcategorybinned_IV-VI    0.064546
19       PriorityCategory_Emergency    0.061991
53             ASAcategorybinned_II    0.049257
54            ASAcategorybinned_III    0.042659
49      GradeofKidneyCategory_G4-G5    0.038603
52              ASAcategorybinned_I    0.029874
46         GradeofKidneyCategory_G1    0.028834


### Correlations for Categorical Variables

**Important features (ICUAdmgt24h)**
- Categorical features: ALL except 'RaceCategory'
- Numerical features: 'RCRI_score', 'Preoptransfusionwithin30days','Intraop'.
- Keep 'SurgRiskCategory', 'ASAcategorybinned', 'Transfusionintraandpostop', 'Intraop', 'Preoptransfusionwithin30days', 'TransfusionIntraandpostopCategory', 'PriorityCategory'

**Important features ('thirtydaymortality')
- Categorical features: ALL except 'GENDER', 'AnaestypeCategory', 'RaceCategory'
- Numerical features: 'RCRI_score', 'Intraop'
- **From RF model:** Keep 'Anemia category', 'PriorityCategory', 'ASAcategorybinned', 'GradeofKidneyCategory'.

In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69667 entries, 0 to 69666
Data columns (total 24 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   GENDER                             69667 non-null  object 
 1   RCRI_score                         69667 non-null  object 
 2   Anemia category                    69667 non-null  object 
 3   Preoptransfusionwithin30days       69667 non-null  float64
 4   Intraop                            69667 non-null  float64
 5   Postopwithin30days                 69667 non-null  float64
 6   Transfusionintraandpostop          69667 non-null  float64
 7   AnaestypeCategory                  69667 non-null  object 
 8   PriorityCategory                   69667 non-null  object 
 9   TransfusionIntraandpostopCategory  69667 non-null  object 
 10  AGEcategory                        69667 non-null  object 
 11  Mortality                          69667 non-null  obj

In [34]:
selected_features = ['GENDER', 'RCRI_score', 'Anemia category', 'Preoptransfusionwithin30days', 'Intraop', 'Transfusionintraandpostop', 'AnaestypeCategory', 'PriorityCategory', 
                     'TransfusionIntraandpostopCategory', 'AGEcategory', 'SurgRiskCategory', 'GradeofKidneyCategory', 'RDW15.7', 'ASAcategorybinned']

In [35]:
# save new df for predictive modelling. 
final_data = data[selected_features + target_cols]
final_data.head()

Unnamed: 0,GENDER,Anemia category,Preoptransfusionwithin30days,Intraop,Transfusionintraandpostop,AnaestypeCategory,PriorityCategory,TransfusionIntraandpostopCategory,AGEcategory,SurgRiskCategory,GradeofKidneyCategory,RDW15.7,ASAcategorybinned,ICUAdmgt24h,thirtydaymortality,Mortality
0,FEMALE,mild,0.0,0.0,0.0,GA,Elective,0 units,50-64,Low,G1,<= 15.7,I,0,False,No death
1,MALE,moderate/severe,0.0,1.0,1.0,GA,Elective,1 unit,65-74,Moderate,G1,<= 15.7,I,0,False,No death
2,MALE,mild,0.0,0.0,0.0,GA,Elective,0 units,65-74,Low,G1,>15.7,II,0,False,Yes
3,MALE,none,0.0,0.0,0.0,GA,Emergency,0 units,50-64,Low,G1,<= 15.7,I,0,False,No death
4,FEMALE,none,0.0,0.0,0.0,GA,Elective,0 units,30-49,Moderate,G2,<= 15.7,II,0,False,No death


In [36]:
final_data.to_csv('../data/CARES_data_predictionv1.csv', index=False)