## Exploratory Modelling of Imputed Dataset (Clean)

In [40]:
# import dependencies
import pandas as pd
import duckdb
from duckdb import sql

In [19]:
# import dataset
data = pd.read_csv('../data/CARES_data_imputedv1.csv', dtype={'RCRI_score':str})

In [41]:
# convert ICU variable to [0,1]
data['ICUAdmgt24h'] = data['ICUAdmgt24h'].replace({'yes': 1, 'no': 0})

In [42]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69667 entries, 0 to 69666
Data columns (total 24 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   GENDER                             69667 non-null  object 
 1   RCRI_score                         69667 non-null  object 
 2   Anemia category                    69667 non-null  object 
 3   Preoptransfusionwithin30days       69667 non-null  float64
 4   Intraop                            69667 non-null  float64
 5   Postopwithin30days                 69667 non-null  float64
 6   Transfusionintraandpostop          69667 non-null  float64
 7   AnaestypeCategory                  69667 non-null  object 
 8   PriorityCategory                   69667 non-null  object 
 9   TransfusionIntraandpostopCategory  69667 non-null  object 
 10  AGEcategory                        69667 non-null  object 
 11  Mortality                          69667 non-null  obj

### Multivariate Analysis for Categorical Response "ICUAdmg24h"

In [44]:
# Define categorical, numeric & target columns
# Define the target columns
target_cols = ['ICUAdmgt24h', 'thirtydaymortality', 'Mortality']

# Select categorical columns and exclude target columns
cat_cols = data.select_dtypes(include=['object', 'category']).columns.to_list()
cat_cols = [col for col in cat_cols if col not in target_cols]

# Select numerical columns and exclude target columns
num_cols = data.select_dtypes(include=['number']).columns.tolist()
num_cols = [col for col in num_cols if col not in target_cols]

In [45]:
# See what are the unique categories present in the categorical columns
summary_cat_data = {
    'Column': [],
    'Number of Unique Values': [],
    'Unique Values': []
}

for column in cat_cols:
    summary_cat_data['Column'].append(column)
    summary_cat_data['Number of Unique Values'].append(data[column].nunique())
    summary_cat_data['Unique Values'].append(data[column].unique())

summary_df = pd.DataFrame(summary_cat_data)
summary_df
# There are no NaN values in the columns

Unnamed: 0,Column,Number of Unique Values,Unique Values
0,GENDER,2,"[FEMALE, MALE]"
1,RCRI_score,7,"[0, 1, 2, 4, 3, 6, 5]"
2,Anemia category,3,"[mild, moderate/severe, none]"
3,AnaestypeCategory,2,"[GA, RA]"
4,PriorityCategory,2,"[Elective, Emergency]"
5,TransfusionIntraandpostopCategory,3,"[0 units, 1 unit, 2 or more units]"
6,AGEcategory,6,"[50-64, 65-74, 30-49, 75-84, 18-29, >=85]"
7,SurgRiskCategory,3,"[Low, Moderate, High]"
8,RaceCategory,4,"[Chinese, Indian, Others, Malay]"
9,CVARCRICategory,2,"[no, yes]"


In [46]:
# One-hot encode the 'AKIN' and 'age_category' columns with drop_first to prevent multicollinearity
encoded_data = pd.get_dummies(data, columns=cat_cols, drop_first=True)
encoded_data.columns

Index(['Preoptransfusionwithin30days', 'Intraop', 'Postopwithin30days',
       'Transfusionintraandpostop', 'Mortality', 'thirtydaymortality',
       'ICUAdmgt24h', 'GENDER_MALE', 'RCRI_score_1', 'RCRI_score_2',
       'RCRI_score_3', 'RCRI_score_4', 'RCRI_score_5', 'RCRI_score_6',
       'Anemia category_moderate/severe', 'Anemia category_none',
       'AnaestypeCategory_RA', 'PriorityCategory_Emergency',
       'TransfusionIntraandpostopCategory_1 unit',
       'TransfusionIntraandpostopCategory_2 or more units',
       'AGEcategory_30-49', 'AGEcategory_50-64', 'AGEcategory_65-74',
       'AGEcategory_75-84', 'AGEcategory_>=85', 'SurgRiskCategory_Low',
       'SurgRiskCategory_Moderate', 'RaceCategory_Indian',
       'RaceCategory_Malay', 'RaceCategory_Others', 'CVARCRICategory_yes',
       'IHDRCRICategory_yes', 'CHFRCRICategory_yes',
       'DMinsulinRCRICategory_yes', 'CreatinineRCRICategory_yes',
       'GradeofKidneyCategory_G2', 'GradeofKidneyCategory_G3',
       'GradeofKidney

In [47]:
# Function to get all numerical and boolean columns
def get_numeric_and_bool_columns(data):
    """
    Obtain all numerical and boolean columns from a dataset.

    Parameters:
    - data (pd.DataFrame): The dataset from which to extract columns.

    Returns:
    - list: A list of column names that are numerical or boolean.
    """
    # Select columns with numeric and bool data types
    numeric_and_bool_cols = data.select_dtypes(include=['number', 'bool']).columns.tolist()
    return numeric_and_bool_cols

# Get the list of numerical and boolean columns
encoded_col = get_numeric_and_bool_columns(encoded_data)

# Print the result
print("Numerical and boolean columns:", encoded_col)

Numerical and boolean columns: ['Preoptransfusionwithin30days', 'Intraop', 'Postopwithin30days', 'Transfusionintraandpostop', 'thirtydaymortality', 'ICUAdmgt24h', 'GENDER_MALE', 'RCRI_score_1', 'RCRI_score_2', 'RCRI_score_3', 'RCRI_score_4', 'RCRI_score_5', 'RCRI_score_6', 'Anemia category_moderate/severe', 'Anemia category_none', 'AnaestypeCategory_RA', 'PriorityCategory_Emergency', 'TransfusionIntraandpostopCategory_1 unit', 'TransfusionIntraandpostopCategory_2 or more units', 'AGEcategory_30-49', 'AGEcategory_50-64', 'AGEcategory_65-74', 'AGEcategory_75-84', 'AGEcategory_>=85', 'SurgRiskCategory_Low', 'SurgRiskCategory_Moderate', 'RaceCategory_Indian', 'RaceCategory_Malay', 'RaceCategory_Others', 'CVARCRICategory_yes', 'IHDRCRICategory_yes', 'CHFRCRICategory_yes', 'DMinsulinRCRICategory_yes', 'CreatinineRCRICategory_yes', 'GradeofKidneyCategory_G2', 'GradeofKidneyCategory_G3', 'GradeofKidneyCategory_G4-G5', 'RDW15.7_>15.7', 'ASAcategorybinned_II', 'ASAcategorybinned_III', 'ASAcatego

### Estimate Logistic Regression Model (for feature importance)

In [55]:
import statsmodels.api as sm

# Define the variables for the logistic regression model
X = encoded_data[['GENDER_MALE', 'RCRI_score_1', 'RCRI_score_2', 'RCRI_score_3', 'RCRI_score_4', 'RCRI_score_5', 'RCRI_score_6', 'Anemia category_moderate/severe', 'Anemia category_none', 'AGEcategory_30-49', 'AGEcategory_50-64', 'AGEcategory_65-74','AGEcategory_75-84', 'AGEcategory_>=85', 'RDW15.7_>15.7', 'ASAcategorybinned_II', 'ASAcategorybinned_III', 'ASAcategorybinned_IV-VI']]
y = encoded_data['ICUAdmgt24h']

# Convert the data to ensure compatibility with the logistic regression model package
X = X.astype(float)
y = y.astype(float)

X = sm.add_constant(X)

# Fit the logistic regression model
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# Display the summary of the logistic regression model
summary = result.summary()
print(summary)

         Current function value: 0.071261
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:            ICUAdmgt24h   No. Observations:                69667
Model:                          Logit   Df Residuals:                    69648
Method:                           MLE   Df Model:                           18
Date:                Sat, 03 Aug 2024   Pseudo R-squ.:                  0.1639
Time:                        19:08:01   Log-Likelihood:                -4964.5
converged:                      False   LL-Null:                       -5937.5
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const                              -6.1641      0.219    -28.101      0.000      -6.594      -5.734
GE



### Findings/Conclusions:
1. ASAcategorybinned_IV-VI
- Coefficient: 3.8703
- Interpretation: A presence of ASA PS Score of (IV-VI) increases the log-odds of ICU admission (24H) by 3.8703. This translates to an odds ratio of (e^3.8703\approx 47.96), indicating that an ASA PS Score of (IV-VI) increases the ods of ICU admission (24H) by approximately 50 times!, holding other factors constant.