In [1]:
import pandas as pd
import seaborn as sns

import numpy as np
from scipy import stats

## Import Data

In [2]:
patient_df_capped = pd.read_csv('/Users/josh/Desktop/Macbook Working Files/Git Repos/650-DRAGON-SLAYERS/FILES/Josh-Dev/patient_df_capped.csv')

patient_df_capped

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,SUBJECT_ID,HOSPITAL_EXPIRE_FLAG,AGE_AT_ADMISSION,LOS,LOS_ICU_MEAN,WEIGHT_MEAN,HEARTRATE_MEAN,SBP_MEAN,...,CREATININE_MIN_VAL,HEMOGLOBIN_MIN_VAL,INR_MIN_VAL,LACTATE_MIN_VAL,PLATELET_MIN_VAL,POTASSIUM_MIN_VAL,SODIUM_MIN_VAL,GENDER_M,ADMISSION_TYPE_EMERGENCY,ADMISSION_TYPE_URGENT
0,0,0.0,18333.0,1,73.0,15.0,8.628850,95.617194,76.692308,119.746305,...,1.5,7.000000,1.2,0.600000,130.0,2.900000,134.0,0,1,0
1,1,1.0,55935.0,1,59.0,8.0,8.486200,105.777654,100.165899,107.517375,...,1.7,9.300000,1.8,3.643166,39.0,4.222697,135.0,0,1,0
2,2,2.0,32012.0,1,86.0,7.0,2.356450,62.750000,94.541401,124.607143,...,1.1,6.900000,1.3,2.300000,105.0,3.100000,135.0,0,1,0
3,3,3.0,8060.0,1,53.0,34.0,8.907775,76.020000,103.072883,106.230753,...,0.5,7.400000,0.8,1.100000,24.0,2.800000,127.0,0,1,0
4,4,4.0,17634.0,1,59.0,1.0,1.299800,84.400002,110.307692,68.079180,...,0.5,9.700000,1.3,3.643166,66.0,4.200000,131.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5171,5171,5176.0,70239.0,0,81.0,1.0,1.640700,87.600000,86.250000,123.419355,...,1.2,11.546604,1.0,1.300000,113.0,3.500000,140.0,1,1,0
5172,5172,5177.0,70359.0,0,81.0,13.0,5.027500,63.185366,105.358491,113.904762,...,0.4,6.800000,1.0,1.400000,247.0,3.000000,138.0,1,1,0
5173,5173,5178.0,43776.0,0,75.0,4.0,1.650000,67.700000,82.810811,115.675676,...,1.4,11.546604,1.2,1.600000,248.0,3.600000,132.0,1,1,0
5174,5174,5179.0,43925.0,0,76.0,14.0,3.272300,71.300000,93.837209,124.887640,...,0.8,6.100000,1.1,1.500000,226.0,3.000000,134.0,1,0,0


## Statistics

- 5 Number Summary:
  - Min
  - Q1
  - Q2
  - Q3
  - Max

- Mean, CI, P-Value


In [3]:
import pandas as pd
import numpy as np
from scipy import stats

def compute_grouped_statistics(df, group_col='HOSPITAL_EXPIRE_FLAG', exclude_cols=None, confidence=0.95):
    """
    Computes 5-Number Summary, Mean, 95% CI for the mean, and P-Value for each numeric column,
    grouped by the specified group column, excluding specified columns.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - group_col (str): The column name to group by.
    - exclude_cols (list or None): List of column names to exclude from analysis.
                                   Set to None to exclude no additional columns.
    - confidence (float): The confidence level for the CI.

    Returns:
    - stats_df (pd.DataFrame): DataFrame containing the computed statistics with p-values in scientific notation.
    """
    # Initialize exclude_cols if not provided
    if exclude_cols is None:
        exclude_cols = []
    
    # Ensure the group column exists
    if group_col not in df.columns:
        raise ValueError(f"Group column '{group_col}' not found in DataFrame.")
    
    # Combine group_col and exclude_cols for exclusion
    columns_to_exclude = [group_col] + exclude_cols
    
    # Select numeric columns excluding the specified columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns.drop(columns_to_exclude, errors='ignore')
    
    # Initialize a dictionary to store statistics
    stats_dict = {}
    
    # Get the unique groups
    groups = df[group_col].dropna().unique()
    groups_sorted = sorted(groups)  # Ensure consistent order
    
    if len(groups_sorted) != 2:
        raise ValueError(f"Expected exactly 2 groups in '{group_col}', found {len(groups_sorted)}.")
    
    group1, group2 = groups_sorted
    
    for col in numeric_cols:
        stats_dict[col] = {}
        data1 = df[df[group_col] == group1][col].dropna()
        data2 = df[df[group_col] == group2][col].dropna()
    
        for grp, data in zip([group1, group2], [data1, data2]):
            # 5-Number Summary
            min_val = data.min()
            q1 = data.quantile(0.25)
            median = data.median()
            q3 = data.quantile(0.75)
            max_val = data.max()
    
            # Mean
            mean = data.mean()
    
            # 95% Confidence Interval for the Mean
            sem = stats.sem(data)  # Standard Error of the Mean
            if len(data) > 1:
                ci_range = sem * stats.t.ppf((1 + confidence) / 2., len(data)-1)
                ci_lower = mean - ci_range
                ci_upper = mean + ci_range
            else:
                ci_lower, ci_upper = np.nan, np.nan  # Not enough data for CI
    
            # Store the statistics with group identifier
            stats_dict[col][f'Group {grp} Min'] = min_val
            stats_dict[col][f'Group {grp} Q1'] = q1
            stats_dict[col][f'Group {grp} Median'] = median
            stats_dict[col][f'Group {grp} Q3'] = q3
            stats_dict[col][f'Group {grp} Max'] = max_val
            stats_dict[col][f'Group {grp} Mean'] = mean
            stats_dict[col][f'Group {grp} 95% CI Lower'] = ci_lower
            stats_dict[col][f'Group {grp} 95% CI Upper'] = ci_upper
    
        # P-Value: Comparing the two groups using independent t-test
        # Assumption: Independent samples, normally distributed, unequal variance (Welch’s t-test)
        if len(data1) > 1 and len(data2) > 1:
            t_stat, p_val = stats.ttest_ind(data1, data2, equal_var=False, nan_policy='omit')
        else:
            p_val = np.nan  # Not enough data for p-value
    
        stats_dict[col][f'P-Value (Group {group1} vs Group {group2})'] = p_val
    
    # Convert the dictionary to a DataFrame
    stats_df = pd.DataFrame(stats_dict).T
    
    # Identify p-value columns
    pval_cols = [col for col in stats_df.columns if 'P-Value' in col]
    other_cols = [col for col in stats_df.columns if col not in pval_cols]
    
    # Round non-p-value columns to 4 decimal places
    stats_df[other_cols] = stats_df[other_cols].round(4)
    
    # Format p-value columns in scientific notation
    stats_df[pval_cols] = stats_df[pval_cols].applymap(
        lambda x: f"{x:.2e}" if pd.notnull(x) else x
    )
    
    return stats_df

# Example usage:
# Assuming patient_df_capped is your DataFrame and 'HOSPITAL_EXPIRE_FLAG' is the group column
# and you want to exclude 'SUBJECT_ID' from the analysis
stats_summary = compute_grouped_statistics(
    patient_df_capped,
    group_col='HOSPITAL_EXPIRE_FLAG',
    exclude_cols=['SUBJECT_ID']
)
print(stats_summary)


                          Group 0 Min  Group 0 Q1  Group 0 Median  Group 0 Q3  \
Unnamed: 0.1                   6.0000   2111.0000       3385.0000   4280.0000   
Unnamed: 0                     6.0000   2111.0000       3386.0000   4284.0000   
AGE_AT_ADMISSION              34.4988     55.0000         67.0000     79.0000   
LOS                            0.0000      6.0000         10.0000     19.0000   
LOS_ICU_MEAN                   0.2694      2.1793          3.5983      7.0484   
WEIGHT_MEAN                   32.6028     67.0000         79.8322     93.5273   
HEARTRATE_MEAN                62.4732     79.3333         87.5164     96.0102   
SBP_MEAN                      73.8104    107.0118        114.8595    124.4730   
DBP_MEAN                      34.6725     54.2326         59.7266     65.4819   
MAP_MEAN                      56.2651     70.7778         76.1223     82.1961   
RR_MEAN                       13.1422     17.7409         19.9758     22.2323   
TEMP_MEAN_C                 