In [2]:
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
tqdm.pandas()

In [3]:
cares_data_path = "CARES_data_imputedv1.csv"
data = pd.read_csv(cares_data_path)
data.head()

Unnamed: 0,GENDER,RCRI_score,Anemia category,Preoptransfusionwithin30days,Intraop,Postopwithin30days,Transfusionintraandpostop,AnaestypeCategory,PriorityCategory,TransfusionIntraandpostopCategory,...,RaceCategory,CVARCRICategory,IHDRCRICategory,CHFRCRICategory,DMinsulinRCRICategory,CreatinineRCRICategory,GradeofKidneyCategory,RDW15.7,ASAcategorybinned,ICUAdmgt24h
0,FEMALE,0,mild,0.0,0.0,0.0,0.0,GA,Elective,0 units,...,Chinese,no,no,no,no,no,G1,<= 15.7,I,no
1,MALE,0,moderate/severe,0.0,1.0,0.0,1.0,GA,Elective,1 unit,...,Chinese,no,no,no,no,no,G1,<= 15.7,I,no
2,MALE,0,mild,0.0,0.0,0.0,0.0,GA,Elective,0 units,...,Chinese,no,no,no,no,no,G1,>15.7,II,no
3,MALE,0,none,0.0,0.0,0.0,0.0,GA,Emergency,0 units,...,Indian,no,no,no,no,no,G1,<= 15.7,I,no
4,FEMALE,0,none,0.0,0.0,0.0,0.0,GA,Elective,0 units,...,Chinese,no,no,no,no,no,G2,<= 15.7,II,no


In [4]:
data.isna().sum()

GENDER                               0
RCRI_score                           0
Anemia category                      0
Preoptransfusionwithin30days         0
Intraop                              0
Postopwithin30days                   0
Transfusionintraandpostop            0
AnaestypeCategory                    0
PriorityCategory                     0
TransfusionIntraandpostopCategory    0
AGEcategory                          0
Mortality                            0
thirtydaymortality                   0
SurgRiskCategory                     0
RaceCategory                         0
CVARCRICategory                      0
IHDRCRICategory                      0
CHFRCRICategory                      0
DMinsulinRCRICategory                0
CreatinineRCRICategory               0
GradeofKidneyCategory                0
RDW15.7                              0
ASAcategorybinned                    0
ICUAdmgt24h                          0
dtype: int64

In [5]:
data.shape

(69667, 24)

In [6]:
data.dtypes

GENDER                                object
RCRI_score                             int64
Anemia category                       object
Preoptransfusionwithin30days         float64
Intraop                              float64
Postopwithin30days                   float64
Transfusionintraandpostop            float64
AnaestypeCategory                     object
PriorityCategory                      object
TransfusionIntraandpostopCategory     object
AGEcategory                           object
Mortality                             object
thirtydaymortality                      bool
SurgRiskCategory                      object
RaceCategory                          object
CVARCRICategory                       object
IHDRCRICategory                       object
CHFRCRICategory                       object
DMinsulinRCRICategory                 object
CreatinineRCRICategory                object
GradeofKidneyCategory                 object
RDW15.7                               object
ASAcategor

<b>Gender

In [7]:
# Frequency Count: Counting the number of occurrences for each gender
gender_counts = data['GENDER'].value_counts()
print("Frequency Count for each Gender:")
print(gender_counts)

# Calculating Mortality Rate: Proportion of deaths for each gender
mortality_rate = data.groupby('GENDER')['Mortality'].apply(lambda x: (x == 'Yes').mean())
print("\nMortality Rate for each Gender:")
print(mortality_rate)

Frequency Count for each Gender:
GENDER
FEMALE    35099
MALE      34568
Name: count, dtype: int64

Mortality Rate for each Gender:
GENDER
FEMALE    0.067295
MALE      0.086149
Name: Mortality, dtype: float64


<b>Chi-Square Test of Independence<br>
Null Hypothesis (H0): There is no association between gender and mortality; they are independent.<br>
Alternative Hypothesis (H1): There is an association between gender and mortality; they are not independent.

In [8]:
from scipy import stats

In [9]:
# Create a contingency table
contingency_table = pd.crosstab(data['GENDER'], data['Mortality'])
print("Contingency Table:")
print(contingency_table)

Contingency Table:
Mortality  No death   Yes
GENDER                   
FEMALE        32737  2362
MALE          31590  2978


In [10]:
# Perform Chi-Square Test of Independence
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

In [11]:
print("\nChi-Square Test Results:")
print(f"Chi2 Statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of Freedom: {dof}")
print("Expected Frequencies:") #Expected frequencies (if there were no relationship between gender and mortality) 
print(expected)


Chi-Square Test Results:
Chi2 Statistic: 87.20269819554039
P-value: 9.7950080692153e-21
Degrees of Freedom: 1
Expected Frequencies:
[[32408.64933182  2690.35066818]
 [31918.35066818  2649.64933182]]


<b>The p-value is significantly smaller than the common significance level (e.g., 0.05), indicating that the observed association between gender and mortality is statistically significant.

<b>RCRI Score

In [12]:
# Converting Mortality column to a categorical type with explicit categories
data['Mortality'] = pd.Categorical(data['Mortality'], categories=['No death', 'Yes'])

# Summary Statistics for RCRI_score
mean_rcri = data['RCRI_score'].mean()
median_rcri = data['RCRI_score'].median()
std_rcri = data['RCRI_score'].std()
range_rcri = data['RCRI_score'].max() - data['RCRI_score'].min()

print("Summary Statistics for RCRI Score:")
print(f"Mean: {mean_rcri}")
print(f"Median: {median_rcri}")
print(f"Standard Deviation: {std_rcri}")
print(f"Range: {range_rcri}")

# Mortality Rate Calculation for each RCRI score
mortality_rate = data.groupby('RCRI_score')['Mortality'].apply(lambda x: (x == 'Yes').mean())
print("\nMortality Rate for each RCRI Score:")
print(mortality_rate)

Summary Statistics for RCRI Score:
Mean: 0.2744197396184707
Median: 0.0
Standard Deviation: 0.6091160443592442
Range: 6

Mortality Rate for each RCRI Score:
RCRI_score
0    0.057633
1    0.115774
2    0.220955
3    0.347761
4    0.368932
5    0.606061
6    0.333333
Name: Mortality, dtype: float64


<b>Logistic regression will allow us to assess the odds of mortality as a function of the RCRI score and understand how the risk changes with different scores.<br>
<i>Null Hypothesis (H0): The RCRI score does not predict the odds of mortality.<br>
Alternative Hypothesis (H1): The RCRI score predicts the odds of mortality.

In [13]:
# Convert 'Mortality' to a binary numeric variable (0 for 'No death', 1 for 'Yes')
data['Mortality_numeric'] = (data['Mortality'] == 'Yes').astype(int)

In [14]:
import statsmodels.api as sm

# Logistic Regression
X = sm.add_constant(data['RCRI_score'])  # Adding a constant (intercept)
y = data['Mortality_numeric']

model = sm.Logit(y, X)
result = model.fit()

print(result.summary())

Optimization terminated successfully.
         Current function value: 0.259198
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:      Mortality_numeric   No. Observations:                69667
Model:                          Logit   Df Residuals:                    69665
Method:                           MLE   Df Model:                            1
Date:                Sat, 10 Aug 2024   Pseudo R-squ.:                 0.04182
Time:                        12:28:13   Log-Likelihood:                -18058.
converged:                       True   LL-Null:                       -18846.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.7760      0.017   -162.477      0.000      -2.810      -2.743
RCRI_score     0.7058      0.

<b>RCRI Score: Each unit increase in the RCRI score increases the log-odds of mortality by approximately 0.7058.<br>
Odds Ratio: exp(0.7058) ≈ 2.03<br>
This means that for each additional point in the RCRI score, the odds of mortality are approximately 2 times higher.<br>

<b>Anemia Category

In [15]:
# Converting Mortality column to a binary numeric variable (0 for 'No death', 1 for 'Yes')
# data['Mortality_numeric'] = (data['Mortality'] == 'Yes').astype(int)

# Frequency Count for each Anemia Category
frequency_count = data['Anemia category'].value_counts()
print("Frequency Count for each Anemia Category:")
print(frequency_count)

# Mortality Rate Calculation for each Anemia Category
mortality_rate = data.groupby('Anemia category')['Mortality_numeric'].mean()
print("\nMortality Rate for each Anemia Category:")
print(mortality_rate)

Frequency Count for each Anemia Category:
Anemia category
none               48116
mild               11590
moderate/severe     9961
Name: count, dtype: int64

Mortality Rate for each Anemia Category:
Anemia category
mild               0.115617
moderate/severe    0.230800
none               0.035352
Name: Mortality_numeric, dtype: float64


In [16]:
# Convert Mortality column to a binary numeric variable (0 for 'No death', 1 for 'Yes')
# data['Mortality_numeric'] = (data['Mortality'] == 'Yes').astype(int)

# Create a contingency table
contingency_table = pd.crosstab(data['Anemia category'], data['Mortality_numeric'])

# Perform Chi-Square Test
chi2_stat, p_value, dof, expected = stats.chi2_contingency(contingency_table)

# Print results
print("Chi-Square Statistic:", chi2_stat)
print("P-value:", p_value)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:")
print(expected)

# Interpret results
alpha = 0.05
if p_value < alpha:
    print("There is a significant association between Anemia category and Mortality.")
else:
    print("There is no significant association between Anemia category and Mortality.")

Chi-Square Statistic: 4752.48001382974
P-value: 0.0
Degrees of Freedom: 2
Expected Frequencies:
[[10701.62243243   888.37756757]
 [ 9197.48585413   763.51414587]
 [44427.89171344  3688.10828656]]
There is a significant association between Anemia category and Mortality.


<b>Preop Transfusion within 30 Days

In [17]:
# Convert Mortality column to a binary numeric variable (0 for 'No death', 1 for 'Yes')
# data['Mortality_numeric'] = (data['Mortality'] == 'Yes').astype(int)

# Frequency Count for Preoperative Transfusion
frequency_count = data['Preoptransfusionwithin30days'].value_counts()
print("Frequency Count for Preoperative Transfusion:")
print(frequency_count)

Frequency Count for Preoperative Transfusion:
Preoptransfusionwithin30days
0.0     67882
1.0       958
2.0       436
3.0       176
4.0        73
5.0        44
7.0        23
6.0        23
8.0        19
10.0        7
11.0        7
9.0         6
13.0        5
14.0        4
17.0        1
12.0        1
20.0        1
21.0        1
Name: count, dtype: int64


In [18]:
from scipy.stats import fisher_exact

# Combine transfusion categories into two groups
data['Transfusion_Group'] = data['Preoptransfusionwithin30days'].apply(lambda x: 'No transfusion' if x == 0 else 'Any transfusion')

# Create a 2x2 contingency table
contingency_table = pd.crosstab(data['Transfusion_Group'], data['Mortality_numeric'])

# Perform Fisher's Exact Test
oddsratio, p_value = fisher_exact(contingency_table)

print("Fisher's Exact Test Results:")
print(f"Contingency Table:\n{contingency_table}")
print(f"Odds Ratio: {oddsratio}")
print(f"P-value: {p_value}")

Fisher's Exact Test Results:
Contingency Table:
Mortality_numeric      0     1
Transfusion_Group             
Any transfusion     1187   598
No transfusion     63140  4742
Odds Ratio: 0.14907557977547373
P-value: 4.704868993621319e-226


Explanation:<br>

Any transfusion: Patients who received at least one unit of transfusion.<br>
1187: Number of patients in this group who did not die.<br>
598: Number of patients in this group who died.<br><br>
No transfusion: Patients who received no transfusion.<br>
63140: Number of patients in this group who did not die.<br>
4742: Number of patients in this group who died.<br><br>
<b>The p-value is extremely small which indicates that there is a very strong statistical evidence against the null hypothesis.<br>
The odds of mortality in the transfusion group are about 0.15 times (or 15%) of the odds in the no transfusion group, suggesting a protective effect of transfusions against mortality.

<b>Transfusionintraandpostop

In [19]:
# Convert Mortality column to a binary numeric variable (0 for 'No death', 1 for 'Yes')
# data['Mortality_numeric'] = (data['Mortality'] == 'Yes').astype(int)

# Frequency Count for Preoperative Transfusion
frequency_count = data['Transfusionintraandpostop'].value_counts()
print("Frequency Count for Transfusion intra and post operation:")
print(frequency_count)

Frequency Count for Transfusion intra and post operation:
Transfusionintraandpostop
0.0     64953
1.0      3794
2.0       589
3.0       176
4.0        61
5.0        34
6.0        23
7.0        12
8.0         9
9.0         5
10.0        3
13.0        3
16.0        2
24.0        1
17.0        1
11.0        1
Name: count, dtype: int64


In [20]:
from scipy.stats import fisher_exact

# Combine transfusion categories into two groups
data['Transfusion_Group'] = data['Transfusionintraandpostop'].apply(lambda x: 'No transfusion' if x == 0 else 'Any transfusion')

# Create a 2x2 contingency table
contingency_table = pd.crosstab(data['Transfusion_Group'], data['Mortality_numeric'])

# Perform Fisher's Exact Test
oddsratio, p_value = fisher_exact(contingency_table)

print("Fisher's Exact Test Results:")
print(f"Contingency Table:\n{contingency_table}")
print(f"Odds Ratio: {oddsratio}")
print(f"P-value: {p_value}")

Fisher's Exact Test Results:
Contingency Table:
Mortality_numeric      0     1
Transfusion_Group             
Any transfusion     3520  1194
No transfusion     60807  4146
Odds Ratio: 0.20100833074874924
P-value: 0.0


Explanation:<br>

Any transfusion: Patients who received at least one unit of transfusion.<br>
3520: Number of patients who did not die.<br>
1194: Number of patients who died.<br>
No transfusion: Patients who received no transfusion.<br>
60807: Number of patients who did not die.<br>
4146: Number of patients who died.<br><br>
<b>The p-value is extremely small which indicates that there is a very strong statistical evidence against the null hypothesis.<br>
The odds of mortality for patients with any transfusion are about 20% of the odds for patients with no transfusion, suggesting a protective effect of transfusion against mortality.

<b>AnaestypeCategory

In [21]:
# Frequency count of each anesthesia type
frequency_count = data['AnaestypeCategory'].value_counts()

# Mortality rate for each anesthesia type
mortality_rate = data.groupby('AnaestypeCategory')['Mortality_numeric'].mean()

# Display the results
print("Frequency Count for each Anesthesia Type:")
print(frequency_count)
print("\nMortality Rate for each Anesthesia Type:")
print(mortality_rate)

Frequency Count for each Anesthesia Type:
AnaestypeCategory
GA    58364
RA    11303
Name: count, dtype: int64

Mortality Rate for each Anesthesia Type:
AnaestypeCategory
GA    0.070848
RA    0.106609
Name: Mortality_numeric, dtype: float64


<b>Frequency: General anesthesia is much more common than regional anesthesia in this dataset.<br>
Mortality Rate: The mortality rate is higher for patients who underwent surgery with regional anesthesia compared to those with general anesthesia (10.66% vs. 7.08%).

In [22]:
# Create a contingency table
contingency_table = pd.crosstab(data['AnaestypeCategory'], data['Mortality_numeric'])

# Perform Chi-Square test
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

# Display the results
print("\nChi-Square Test Results:")
print(f"Chi2 Statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of Freedom: {dof}")


Chi-Square Test Results:
Chi2 Statistic: 170.5895799900371
P-value: 5.500442270525351e-39
Degrees of Freedom: 1


<b>There is a statistically significant association between the type of anesthesia and mortality rates.

<b>Race Category

In [23]:
frequency_count = data['RaceCategory'].value_counts()
print("Frequency Count for each Priority Category:")
print(frequency_count)

Frequency Count for each Priority Category:
RaceCategory
Chinese    48605
Malay       7431
Others      7019
Indian      6612
Name: count, dtype: int64


In [24]:
# Contingency Table
contingency_table = pd.crosstab(data['RaceCategory'], data['Mortality_numeric'])
print("\nContingency Table:")
print(contingency_table)

# Chi-Square Test
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
print("\nChi-Square Test Results:")
print(f"Chi2 Statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of Freedom: {dof}")


Contingency Table:
Mortality_numeric      0     1
RaceCategory                  
Chinese            44350  4255
Indian              6270   342
Malay               6869   562
Others              6838   181

Chi-Square Test Results:
Chi2 Statistic: 396.19707389151836
P-value: 1.475320109998201e-85
Degrees of Freedom: 3


<b> There is a statistically significant association between the Race and patient mortality.

<b>Age Category

In [25]:
# Frequency Count
age_category_count = data['AGEcategory'].value_counts()
print("Frequency Count for each Age Category:")
print(age_category_count)

# Mortality Rate
age_category_mortality_rate = data.groupby('AGEcategory')['Mortality_numeric'].mean()
print("\nMortality Rate for each Age Category:")
print(age_category_mortality_rate)

Frequency Count for each Age Category:
AGEcategory
50-64    22110
30-49    19419
65-74    13139
18-29     6983
75-84     6724
>=85      1292
Name: count, dtype: int64



Mortality Rate for each Age Category:
AGEcategory
18-29    0.006015
30-49    0.023791
50-64    0.078019
65-74    0.117056
75-84    0.179952
>=85     0.280960
Name: Mortality_numeric, dtype: float64


In [26]:
# Contingency Table
contingency_table_age = pd.crosstab(data['AGEcategory'], data['Mortality_numeric'])
print("\nContingency Table:")
print(contingency_table_age)

# Chi-Square Test
chi2_age, p_age, dof_age, expected_age = stats.chi2_contingency(contingency_table_age)
print("\nChi-Square Test Results:")
print(f"Chi2 Statistic: {chi2_age}")
print(f"P-value: {p_age}")
print(f"Degrees of Freedom: {dof_age}")


Contingency Table:
Mortality_numeric      0     1
AGEcategory                   
18-29               6941    42
30-49              18957   462
50-64              20385  1725
65-74              11601  1538
75-84               5514  1210
>=85                 929   363

Chi-Square Test Results:
Chi2 Statistic: 3338.422281464878
P-value: 0.0
Degrees of Freedom: 5


<b> There is a statistically significant association between age and patient mortality.

<b> ASA category

In [27]:
# Frequency Count
asa_category_count = data['ASAcategorybinned'].value_counts()
print("Frequency Count for each ASA Category:")
print(asa_category_count)

# Mortality Rate
asa_category_mortality_rate = data.groupby('ASAcategorybinned')['Mortality_numeric'].mean()
print("\nMortality Rate for each ASA Category:")
print(asa_category_mortality_rate)

Frequency Count for each ASA Category:
ASAcategorybinned
II       39286
I        15870
III      13420
IV-VI     1091
Name: count, dtype: int64

Mortality Rate for each ASA Category:
ASAcategorybinned
I        0.010334
II       0.050247
III      0.205291
IV-VI    0.409716
Name: Mortality_numeric, dtype: float64


In [28]:
# Contingency Table
contingency_table_asa = pd.crosstab(data['ASAcategorybinned'], data['Mortality_numeric'])
print("\nContingency Table:")
print(contingency_table_asa)

# Chi-Square Test
chi2_asa, p_asa, dof_asa, expected_asa = stats.chi2_contingency(contingency_table_asa)
print("\nChi-Square Test Results:")
print(f"Chi2 Statistic: {chi2_asa}")
print(f"P-value: {p_asa}")
print(f"Degrees of Freedom: {dof_asa}")


Contingency Table:
Mortality_numeric      0     1
ASAcategorybinned             
I                  15706   164
II                 37312  1974
III                10665  2755
IV-VI                644   447

Chi-Square Test Results:
Chi2 Statistic: 6220.945926622753
P-value: 0.0
Degrees of Freedom: 3


<b> There is a statistically significant association between ASA category and patient mortality.

<b> ICU Admission within 24 Hours

In [29]:
# Frequency Count
icu_admission_count = data['ICUAdmgt24h'].value_counts()
print("Frequency Count for ICU Admission Status:")
print(icu_admission_count)

# Mortality Rate
icu_admission_mortality_rate = data.groupby('ICUAdmgt24h')['Mortality_numeric'].mean()
print("\nMortality Rate for ICU Admission Status:")
print(icu_admission_mortality_rate)

Frequency Count for ICU Admission Status:
ICUAdmgt24h
no     68498
yes     1169
Name: count, dtype: int64



Mortality Rate for ICU Admission Status:
ICUAdmgt24h
no     0.071316
yes    0.389222
Name: Mortality_numeric, dtype: float64


In [30]:
# Contingency Table
contingency_table_icu = pd.crosstab(data['ICUAdmgt24h'], data['Mortality_numeric'])
print("\nContingency Table:")
print(contingency_table_icu)

# Chi-Square Test
chi2_icu, p_icu, dof_icu, expected_icu = stats.chi2_contingency(contingency_table_icu)
print("\nChi-Square Test Results:")
print(f"Chi2 Statistic: {chi2_icu}")
print(f"P-value: {p_icu}")
print(f"Degrees of Freedom: {dof_icu}")
print("Expected Frequencies:")
print(expected_icu)


Contingency Table:
Mortality_numeric      0     1
ICUAdmgt24h                   
no                 63613  4885
yes                  714   455

Chi-Square Test Results:
Chi2 Statistic: 1636.7862069880257
P-value: 0.0
Degrees of Freedom: 1
Expected Frequencies:
[[63247.60426027  5250.39573973]
 [ 1079.39573973    89.60426027]]


<b> There is a statistically significant association between ICUAdmgt24h and patient mortality.

## Multivariate Logistic to factor in all

Logistic Regression Details
Single Dependent Variable: In logistic regression, even though the dependent variable (e.g., mortality) is binary (Yes/No), the model considers the effects of multiple independent variables simultaneously. This is why logistic regression is considered a multivariate technique.

Independent Variables: These can include a mix of categorical and continuous predictors. The goal is to understand how each predictor, while accounting for the presence of others, affects the probability of the outcome.

In [31]:
from sklearn.preprocessing import LabelEncoder

# Convert categorical variables to numeric using Label Encoding
label_encoders = {}
for col in ['GENDER', 'Anemia category', 'AnaestypeCategory', 'PriorityCategory',
            'TransfusionIntraandpostopCategory', 'AGEcategory', 'SurgRiskCategory',
            'RaceCategory', 'CVARCRICategory', 'IHDRCRICategory', 'CHFRCRICategory',
            'DMinsulinRCRICategory', 'CreatinineRCRICategory', 'GradeofKidneyCategory',
            'RDW15.7', 'ASAcategorybinned', 'ICUAdmgt24h']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))  # Ensure all data is string for encoding
    label_encoders[col] = le

# Convert target variable 'Mortality' to binary
data['Mortality_numeric'] = (data['Mortality'] == 'Yes').astype(int)

# Define the independent and dependent variables
X = data[['RCRI_score', 'Preoptransfusionwithin30days', 'Intraop', 'Postopwithin30days', 
          'Transfusionintraandpostop', 'AnaestypeCategory', 'PriorityCategory', 
          'TransfusionIntraandpostopCategory', 'AGEcategory', 'SurgRiskCategory', 
          'RaceCategory', 'CVARCRICategory', 'IHDRCRICategory', 'CHFRCRICategory', 
          'DMinsulinRCRICategory', 'CreatinineRCRICategory', 'GradeofKidneyCategory', 
          'RDW15.7', 'ASAcategorybinned', 'ICUAdmgt24h']]

y = data['Mortality_numeric']

# Add a constant to the model (intercept)
X = sm.add_constant(X)

# Ensure there are no missing values
X = X.dropna()
y = y.loc[X.index]  # Align y with X

# Fit the logistic regression model
model = sm.Logit(y, X)
result = model.fit()

# Print the summary of the logistic regression
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.211216
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:      Mortality_numeric   No. Observations:                69667
Model:                          Logit   Df Residuals:                    69646
Method:                           MLE   Df Model:                           20
Date:                Sat, 10 Aug 2024   Pseudo R-squ.:                  0.2192
Time:                        12:28:17   Log-Likelihood:                -14715.
converged:                       True   LL-Null:                       -18846.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                        coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const                                -4.8541      0.061    -79

Note:<br><b>Chi-Square Tests: These tests assess the association between two variables independently. They do not account for other variables that might also be influencing the outcome.
<b><br>
Logistic Regression: When multiple variables are included in a logistic regression model, the effect of each variable is assessed while holding other variables constant. If some variables are highly correlated with each other (multicollinearity), it can affect the coefficients and significance levels of individual predictors.

In the logistic regression results, variables with p-values less than the conventional significance level of 0.05 are considered statistically significant. In this case, variables such as RCRI_score, Preoptransfusionwithin30days, PriorityCategory, AGEcategory, SurgRiskCategory, RaceCategory, IHDRCRICategory, GradeofKidneyCategory, RDW15.7, ASAcategorybinned, and ICUAdmgt24h are significant, with p-values well below 0.05. This suggests that these variables have a meaningful association with the likelihood of mortality, meaning their effects are not likely due to random chance.

On the other hand, variables such as Intraop, Postopwithin30days, Transfusionintraandpostop, AnaestypeCategory, TransfusionIntraandpostopCategory, CVARCRICategory, CHFRCRICategory, DMinsulinRCRICategory, and CreatinineRCRICategory either have missing values or p-values greater than 0.05, suggesting that they do not have a statistically significant impact on mortality in this model. These non-significant results might differ from individual chi-square tests because the logistic regression model accounts for the effects of all predictors simultaneously, allowing for the assessment of each variable's effect while controlling for others. Chi-square tests, on the other hand, evaluate the association between each variable and mortality independently, which can sometimes lead to different conclusions due to the lack of control for confounding variables.

**ANOVA**

CONTINUOUS VARIABLES: <br><br>
RCRI_score                            
Preoptransfusionwithin30days          
Intraop                               
Postopwithin30days                    
Transfusionintraandpostop

Checking normality:

H0: Distributions are normal <br>
H1: Distributions are not normal

In [32]:
from scipy.stats import shapiro

# Perform Shapiro-Wilk test for normality on each variable
shapiro_rcri = shapiro(data['RCRI_score'])
shapiro_preop = shapiro(data['Preoptransfusionwithin30days'])
shapiro_intraop = shapiro(data['Intraop'])
shapiro_postop = shapiro(data['Postopwithin30days'])
shapiro_transfusion = shapiro(data['Transfusionintraandpostop'])

# Print the results
print(f"Shapiro-Wilk test for RCRI_score: W = {shapiro_rcri.statistic}, p-value = {shapiro_rcri.pvalue}")
print(f"Shapiro-Wilk test for Preoptransfusionwithin30days: W = {shapiro_preop.statistic}, p-value = {shapiro_preop.pvalue}")
print(f"Shapiro-Wilk test for Intraop: W = {shapiro_intraop.statistic}, p-value = {shapiro_intraop.pvalue}")
print(f"Shapiro-Wilk test for Postopwithin30days: W = {shapiro_postop.statistic}, p-value = {shapiro_postop.pvalue}")
print(f"Shapiro-Wilk test for Transfusionintraandpostop: W = {shapiro_transfusion.statistic}, p-value = {shapiro_transfusion.pvalue}")


Shapiro-Wilk test for RCRI_score: W = 0.5048271804523942, p-value = 1.0756507523228306e-150
Shapiro-Wilk test for Preoptransfusionwithin30days: W = 0.09460360964005898, p-value = 2.004707083965399e-170
Shapiro-Wilk test for Intraop: W = 0.27076789482163643, p-value = 3.29292460896796e-163
Shapiro-Wilk test for Postopwithin30days: W = 0.0551013263410397, p-value = 7.233372196203211e-172
Shapiro-Wilk test for Transfusionintraandpostop: W = 0.20399443396473826, p-value = 4.120821266066347e-166


  res = hypotest_fun_out(*samples, **kwds)


p-value is <0.05 for all the variables, so we can reject the null hypothesis for all.

Due to this, ANOVA cannot be performed as the data is not normally distributed. Kruskal-Wallis is non-parametric and normality need not be assumed. Also, for kruskal-wallis the dependent variable can be categorical so we can check against Mortality.

H0: The medians (or distributions) of the groups are equal. There is no significant difference between the groups.

H1: At least one group differs in its distribution from the others.

In [33]:
import pandas as pd
from scipy.stats import kruskal

# Define the DataFrame
# data = pd.read_csv('your_data_file.csv')

# Define the categorical variable and continuous variables
categorical_var = 'Mortality'
continuous_vars = [
    'RCRI_score',
    'Preoptransfusionwithin30days',
    'Intraop',
    'Postopwithin30days',
    'Transfusionintraandpostop'
]

def perform_kruskal_wallis(data, categorical_var, continuous_vars):
    results = {}
    
    # Get the unique groups in the categorical variable
    groups = data[categorical_var].unique()
    
    for var in continuous_vars:
        # Extract data for each group
        group_data = [data[data[categorical_var] == group][var].dropna() for group in groups]
        
        # Perform Kruskal-Wallis test
        kruskal_result = kruskal(*group_data)
        
        # Store the results
        results[var] = {
            'H-statistic': kruskal_result.statistic,
            'p-value': kruskal_result.pvalue
        }
    
    return results

# Perform the test
kruskal_wallis_results = perform_kruskal_wallis(data, categorical_var, continuous_vars)

# Print the results
for var, result in kruskal_wallis_results.items():
    print(f"Kruskal-Wallis test for {var}:")
    print(f"  H-statistic = {result['H-statistic']:.4f}")
    print(f"  p-value = {result['p-value']:.4f}")
    print("-" * 40)


Kruskal-Wallis test for RCRI_score:
  H-statistic = 1539.4005
  p-value = 0.0000
----------------------------------------
Kruskal-Wallis test for Preoptransfusionwithin30days:
  H-statistic = 1739.8585
  p-value = 0.0000
----------------------------------------
Kruskal-Wallis test for Intraop:
  H-statistic = 2217.7140
  p-value = 0.0000
----------------------------------------
Kruskal-Wallis test for Postopwithin30days:
  H-statistic = 1150.9022
  p-value = 0.0000
----------------------------------------
Kruskal-Wallis test for Transfusionintraandpostop:
  H-statistic = 2269.1963
  p-value = 0.0000
----------------------------------------


**Interpretation:**

There is a significant difference in all the considered continuous variables among different mortality groups since p-value <0.05 (null hypothesis is rejected)

Checking against ICUAdmgt24h:                   

In [38]:
import pandas as pd
from scipy.stats import kruskal

# Define the DataFrame
# data = pd.read_csv('your_data_file.csv')

# Define the categorical variable and continuous variables
categorical_var = 'ICUAdmgt24h'
continuous_vars = [
    'RCRI_score',
    'Preoptransfusionwithin30days',
    'Intraop',
    'Postopwithin30days',
    'Transfusionintraandpostop'
]

def perform_kruskal_wallis(data, categorical_var, continuous_vars):
    results = {}
    
    # Get the unique groups in the categorical variable
    groups = data[categorical_var].unique()
    
    for var in continuous_vars:
        # Extract data for each group
        group_data = [data[data[categorical_var] == group][var].dropna() for group in groups]
        
        # Perform Kruskal-Wallis test
        kruskal_result = kruskal(*group_data)
        
        # Store the results
        results[var] = {
            'H-statistic': kruskal_result.statistic,
            'p-value': kruskal_result.pvalue
        }
    
    return results

# Perform the test
kruskal_wallis_results = perform_kruskal_wallis(data, categorical_var, continuous_vars)

# Print the results
for var, result in kruskal_wallis_results.items():
    print(f"Kruskal-Wallis test for {var}:")
    print(f"  H-statistic = {result['H-statistic']:.4f}")
    print(f"  p-value = {result['p-value']:.9f}")
    print("-" * 40)


Kruskal-Wallis test for RCRI_score:
  H-statistic = 732.7198
  p-value = 0.000000000
----------------------------------------
Kruskal-Wallis test for Preoptransfusionwithin30days:
  H-statistic = 3833.2621
  p-value = 0.000000000
----------------------------------------
Kruskal-Wallis test for Intraop:
  H-statistic = 2313.5680
  p-value = 0.000000000
----------------------------------------
Kruskal-Wallis test for Postopwithin30days:
  H-statistic = 419.3417
  p-value = 0.000000000
----------------------------------------
Kruskal-Wallis test for Transfusionintraandpostop:
  H-statistic = 2317.3586
  p-value = 0.000000000
----------------------------------------


**Interpretation:**

There is a significant difference in all the considered continuous variables among yes/no groups in ICUAdmgt24h since p-value <0.05 (null hypothesis is rejected)