### **Predicting Pediatric In-Hospital Mortality from Admission Data**

### *Import Necessary Libraries*

In [174]:
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt



### *Load the dataset*

In [248]:
# Import the CSV file named "SyntheticData_Training.csv" into a pandas DataFrame called Sepsis_df
Sepsis_df = pd.read_csv("SyntheticData_Training.csv")

### *Data Inspection*

In [250]:
# Display the first 5 rows of the DataFrame to quickly inspect the data
Sepsis_df.head()

Unnamed: 0,studyid_adm,agecalc_adm,height_cm_adm,weight_kg_adm,muac_mm_adm,hr_bpm_adm,rr_brpm_app_adm,sysbp_mmhg_adm,diasbp_mmhg_adm,temp_c_adm,...,cookloc_adm,lightfuel_adm,tobacco_adm,bednet_adm,hctpretransfusion_adm,hivstatus_adm,malariastatuspos_adm,lengthadm,caregiver_adm_new,inhospital_mortality
0,1,16.8,79.8,11.6,150.0,130.0,23.0,92.0,54.0,36.3,...,In a separate building/building space used as ...,Electric bulbs (national grid),Never,Always,Yes,HIV negative,No,2,Mother,0
1,2,46.1,93.0,13.6,151.0,115.0,35.0,72.0,42.0,36.8,...,In a separate building/building space used as ...,Tadooba,Never,Always,Yes,HIV negative,Yes,5,Mother,0
2,3,7.9,68.2,8.2,148.0,150.0,56.0,94.0,51.0,37.2,...,In a separate building/building space used as ...,Tadooba,Never,Always,Yes,HIV negative,No,4,Mother,0
3,4,38.2,95.0,12.0,138.0,134.0,38.0,94.0,57.0,37.6,...,In the house where you sleep,Electric bulbs (national grid),Never,Always,,HIV negative,Yes,5,Mother,0
4,5,16.1,83.0,12.0,165.0,163.0,61.0,107.0,73.0,38.7,...,Outdoors in the open,Electric bulbs (national grid),Never,Sometimes,Yes,HIV negative,Yes,7,Mother,0


*** Concise summary of the dataset ***

In [180]:
# # Configure pandas to display details for up to 150 columns in the .info() summary
# #  By default, pandas displays a maximum of 100 columns in .info()
# # If the DataFrame has more than 100 columns, the output is truncated.
# pd.set_option('display.max_info_columns', 150)
# Sepsis_df.info()

*** Supset dataframe for columns with missing values ***

In [252]:
# Step 1: Get columns that have at least one missing value
cols_with_missing = Sepsis_df.columns[Sepsis_df.isnull().any()].tolist()

# Step 2: Calculate the number of missing values and the percentage of missing values for each column
missing_values_count = Sepsis_df[cols_with_missing].isnull().sum()  # Count of missing values per column
missing_percentage = Sepsis_df[cols_with_missing].isnull().mean() * 100  # Percentage of missing values per column
missing_percentage = missing_percentage.round(2)

# Step 3: Combine the results into a DataFrame for better readability
missing_data_summary_df = pd.DataFrame({
    'Missing Values': missing_values_count,
    'Missing Percentage': missing_percentage
})

# Step 4: Filter to show only columns with missing values > 0%
missing_data_summary_df = missing_data_summary_df[missing_data_summary_df['Missing Percentage'] > 0]

# Print the summary of missing data for columns with missing values
print(missing_data_summary_df)


                          Missing Values  Missing Percentage
height_cm_adm                         14                0.52
weight_kg_adm                          9                0.34
muac_mm_adm                           13                0.48
hr_bpm_adm                             1                0.04
rr_brpm_app_adm                        5                0.19
sysbp_mmhg_adm                         2                0.07
diasbp_mmhg_adm                        1                0.04
temp_c_adm                             1                0.04
spo2site1_pc_oxi_adm                   4                0.15
spo2site2_pc_oxi_adm                 205                7.63
spo2other_adm                       2505               93.26
momage_adm                            19                0.71
momagefirstpreg_adm                  224                8.34
householdsize_adm                      3                0.11
alivechildren_adm                      4                0.15
deadchildren_adm        

### *Data Cleaning*

*** Check for duplicate Values ***

In [254]:
Sepsis_df.duplicated().sum()

0

*** Drop certain columns ***

In [256]:
cols_to_drop = [col for col in Sepsis_df.columns if col.startswith('admitabx_')]
Sepsis_df= Sepsis_df.drop(columns=cols_to_drop)


___
Dropped intervention columns: These variables represent treatments that, while powerful predictors, are excluded due to their post-admission nature. The model, however, relies on data available at the time of admission.
___

In [258]:
Sepsis_df=Sepsis_df.drop(columns=["spo2other_adm","lactate2_mmolpl_adm","nonexclbreastfed_adm","vaccmeaslessource_adm"])

___
* **"spo2other\_adm" dropped:** Represents oxygen saturation with another device (redundant), and has 93.3% missing values.
* **"lactate2\_mmolpl\_adm" dropped:** A similar measurement is available, with 82.54% missing values.
* **"nonexclbreastfed\_adm" dropped:** Represents non-exclusive breastfeeding details, with 96.91% missing values.
* **"vaccmeaslessource\_adm" dropped:** Represents the source of information regarding measles vaccination (found to be less important), with 35.22% missing values.
___

*** Clean Missing Values ***

*Clean numeric-columns*

In [260]:
# Step 1: Get columns that have at least one missing value
cols_with_missing = Sepsis_df.columns[Sepsis_df.isnull().any()].tolist()

# Step 2: Create a new DataFrame with only those columns
df_missing_subset = Sepsis_df[cols_with_missing]

In [262]:
# Step 1: Select numeric columns only
numeric_cols = df_missing_subset.select_dtypes(include='number')

# Step 2: Calculate skewness for each numeric column
skewness = numeric_cols.skew()

# Step 3: Filter columns where skewness is between -0.5 and +0.5
symmetric_cols = skewness[(skewness > -0.5) & (skewness < 0.5)].index.tolist()

print("Columns with fairly symmetric distributions:", symmetric_cols)

Columns with fairly symmetric distributions: ['muac_mm_adm', 'sysbp_mmhg_adm', 'diasbp_mmhg_adm', 'hematocrit_gpdl_adm']


___
Columns with fairly symmetric distributions (skewness between -0.5 and 0.5) were identified using the `.skew()` function and separated for imputation using the **mean**. For columns with more skewed distributions (skewness < -0.5 or > 0.5), the **median** will be used instead, as it's more robust to outliers.
___

In [264]:
# Step 1: Define the age group bins
def assign_age_group(age_months):
    if age_months < 6:
        return '3-6 month'
    elif age_months < 12:
        return '6-1yr'
    elif age_months < 24:
        return '1-2yr'
    elif age_months < 36:
        return '2-3yr'
    elif age_months < 48:
        return '3-4yr'
    elif age_months < 60:
        return '4-5yr'
    elif age_months < 72:
        return '5-6yr'
    else:
        return 'older'

# Step 2: Create 'age_group' column
Sepsis_df['age_group'] = Sepsis_df['agecalc_adm'].apply(assign_age_group)

# Step 3: Group by age_group and sex_adm, then impute hematocrit
Sepsis_df['hematocrit_gpdl_adm'] = Sepsis_df.groupby(['age_group', 'sex_adm'])['hematocrit_gpdl_adm']\
    .transform(lambda x: x.fillna(x.mean()))
print(Sepsis_df["hematocrit_gpdl_adm"].isnull().sum())

0


___
" hematocrit_gpdl_adm " was imputed using a logical, group-based approach. Since hematocrit levels vary by age and sex, first  patients were categorized into age groups and then grouped the data by both age_group and sex_adm.
Within each group, missing hematocrit values were filled using the mean of that group, ensuring the imputation respects biologically relevant differences.
___

In [266]:
Sepsis_df["hr_bpm_adm"]=Sepsis_df["hr_bpm_adm"].fillna(Sepsis_df["hr_bpm_adm"].median())
Sepsis_df["hr_bpm_adm"].isnull().sum()

0

___
"hr_bpm_adm" was imputed using median since found to be assymetric.
___

In [268]:
Sepsis_df["sysbp_mmhg_adm"]=Sepsis_df["sysbp_mmhg_adm"].fillna(Sepsis_df["sysbp_mmhg_adm"].mean())
Sepsis_df["sysbp_mmhg_adm"].isnull().sum()

0

___
"sysbp_mmhg_adm" was imputed using mean since found to be fairly symetric.
___

In [202]:
# Create shock index column
Sepsis_df['shockindex_heart_rate_sbp_adm'] = Sepsis_df['hr_bpm_adm'] / Sepsis_df['sysbp_mmhg_adm']
print(Sepsis_df["shockindex_heart_rate_sbp_adm"].isnull().sum())

0


___
"shockindex_heart_rate_sbp_adm" was created to aid imputation of missing values in the "lactate_mmolpl_adm" column. It was derived from "hr_bpm_adm" and "sysbp_mmhg_adm"  columns. 
___

In [270]:
# Create high-risk indicator based on SpO2 and shock index
Sepsis_df['high_risk'] = ((Sepsis_df['spo2site1_pc_oxi_adm'] < 90) | 
                          (Sepsis_df['shockindex_heart_rate_sbp_adm'] > 1)).astype(int)

# Impute lactate using median within risk groups (only one transform needed)
Sepsis_df['lactate_mmolpl_adm'] = Sepsis_df.groupby('high_risk')['lactate_mmolpl_adm']\
                                           .transform(lambda x: x.fillna(x.median()))
print(Sepsis_df["lactate_mmolpl_adm"].isnull().sum())

KeyError: 'shockindex_heart_rate_sbp_adm'

___
* Created a `high_risk` flag for patients with SpO₂ < 90 or shock index > 1.
  These thresholds are clinically meaningful indicators of potential instability.
* Imputed missing `lactate_mmolpl_adm` values using the median within each risk group.
  This ensures imputation is done in a context-sensitive way, preserving group-level distribution patterns.
  
___

In [206]:
# Use spo2site2 to fill missing values in spo2site1
Sepsis_df['spo2site1_pc_oxi_adm'] = Sepsis_df['spo2site1_pc_oxi_adm'].fillna(
    Sepsis_df['spo2site2_pc_oxi_adm']
)
# Use spo2site1 to fill missing values in spo2site2
Sepsis_df['spo2site2_pc_oxi_adm'] = Sepsis_df['spo2site2_pc_oxi_adm'].fillna(
    Sepsis_df['spo2site1_pc_oxi_adm']
)
print(Sepsis_df['spo2site1_pc_oxi_adm'].isnull().sum())
print(Sepsis_df['spo2site2_pc_oxi_adm'].isnull().sum())


4
4


In [207]:
# Create age group
Sepsis_df['age_group2'] = Sepsis_df['agecalc_adm'].apply(lambda x: '<1yr' if x < 1 else '≥1yr')

# Impute SpO2 site 1
Sepsis_df['spo2site1_pc_oxi_adm'] = Sepsis_df.groupby('age_group2')['spo2site1_pc_oxi_adm']\
                                             .transform(lambda x: x.fillna(x.median()))

# Impute SpO2 site 2
Sepsis_df['spo2site2_pc_oxi_adm'] = Sepsis_df.groupby('age_group2')['spo2site2_pc_oxi_adm']\
                                             .transform(lambda x: x.fillna(x.median()))
print(Sepsis_df['spo2site1_pc_oxi_adm'].isnull().sum())
print(Sepsis_df['spo2site2_pc_oxi_adm'].isnull().sum())

0
0


___
The columns **'spo2site1\_pc\_oxi\_adm'** and **'spo2site2\_pc\_oxi\_adm'** represent oxygen saturation levels (percentage of hemoglobin carrying oxygen) from two different body sites. Since both measure similar values, **cross imputation** was used to fill missing values. However, 4 missing values were observed, suggesting that 4 rows were missing data for both columns. To account for age-related physiological differences (e.g., respiratory rate, oxygen saturation, heart rate), an **age\_group2** variable was created to categorize patients as <1 year or ≥1 year. Missing SpO₂ values were then imputed using the median within each age group.
___

In [209]:
# Use sqi2 to fill missing values in sqi1
Sepsis_df['sqi1_perc_oxi_adm'] = Sepsis_df['sqi1_perc_oxi_adm'].fillna(
    Sepsis_df['sqi2_perc_oxi_adm']
)

# Use sqi1 to fill missing values in sqi2
Sepsis_df['sqi2_perc_oxi_adm'] = Sepsis_df['sqi2_perc_oxi_adm'].fillna(
    Sepsis_df['sqi1_perc_oxi_adm']
)
print(Sepsis_df['sqi1_perc_oxi_adm'].isnull().sum())
print(Sepsis_df['sqi2_perc_oxi_adm'].isnull().sum())

126
126


In [210]:
Sepsis_df['sqi1_perc_oxi_adm'] = Sepsis_df['sqi1_perc_oxi_adm'].fillna(Sepsis_df['sqi1_perc_oxi_adm'].median())
Sepsis_df['sqi2_perc_oxi_adm'] = Sepsis_df['sqi2_perc_oxi_adm'].fillna(Sepsis_df['sqi2_perc_oxi_adm'].median())
print(Sepsis_df['sqi1_perc_oxi_adm'].isnull().sum())
print(Sepsis_df['sqi2_perc_oxi_adm'].isnull().sum())

0
0


___
The columns **'sqi1\_perc\_oxi\_adm'** and **'sqi2\_perc\_oxi\_adm'** represent the best and second-best SpO₂ signal quality index at admission. 
* **`sqi1_perc_oxi_adm`** = Highest quality SpO₂ signal among all measured sites at admission.
* **`sqi2_perc_oxi_adm`** = Second-best quality SpO₂ signal from a different site at admission.
Since they are related, **cross-imputation** was applied. However, 126 rows remained missing for both columns, so the remaining missing values were filled using the median.
___

In [212]:
# Round age to match WHO months
Sepsis_df['age_months_rounded'] = Sepsis_df['agecalc_adm'].round().astype(int)

# Load WHO dataset for girls and prepare
who_girls = pd.read_csv("girls_length_height_for_age.csv")
who_clean = who_girls[['Months', 'Median']].rename(columns={
    'Months': 'age_months_rounded',
    'Median': 'who_median_height'
})
Sepsis_df = Sepsis_df.merge(who_clean, on='age_months_rounded', how='left')

# Impute missing heights for female patients
girls_mask = (Sepsis_df['sex_adm'] == 'Female') & (Sepsis_df['height_cm_adm'].isna())
Sepsis_df.loc[girls_mask, 'height_cm_adm'] = Sepsis_df.loc[girls_mask, 'who_median_height']

# Load WHO dataset for boys and prepare
who_boys = pd.read_csv("boys_length_height_for_age.csv")
who_clean_boys = who_boys[['Months', 'Median']].rename(columns={
    'Months': 'age_months_rounded',
    'Median': 'who_median_height_boys'
})
Sepsis_df = Sepsis_df.merge(who_clean_boys, on='age_months_rounded', how='left')

# Impute missing heights for male patients
boys_mask = (Sepsis_df['sex_adm'] == 'Male') & (Sepsis_df['height_cm_adm'].isna())
Sepsis_df.loc[boys_mask, 'height_cm_adm'] = Sepsis_df.loc[boys_mask, 'who_median_height_boys']
print(Sepsis_df['height_cm_adm'].isnull().sum())


0


___
'height_cm_adm' Missing height values were imputed using WHO median height-for-age data, separately for males and females. Age in months was rounded to align with WHO reference tables.
___

In [214]:
# Load WHO weight-for-age data for boys and girls
boys_wfa = pd.read_csv("boys_weight_for_age.csv")  
girls_wfa = pd.read_csv("girls_weight_for_age.csv") 

# Round the age to the nearest integer for matching with the WHO data
Sepsis_df['agecalc_adm_rounded'] = Sepsis_df['agecalc_adm'].round().astype(int)

# Function to impute missing weight values
def impute_weight(row):
    if pd.isna(row['weight_kg_adm']):
        if row['sex_adm'] == 'Male':
            # Get the median weight for boys based on the rounded age
            ref = boys_wfa[boys_wfa['Months'] == row['agecalc_adm_rounded']]['Median'].values[0]
        else:
            # Get the median weight for girls based on the rounded age
            ref = girls_wfa[girls_wfa['Months'] == row['agecalc_adm_rounded']]['Median'].values[0]
        return ref
    return row['weight_kg_adm']

# Apply the imputation to the dataset
Sepsis_df['weight_kg_adm'] = Sepsis_df.apply(impute_weight, axis=1)

# Check for missing values again
Sepsis_df['weight_kg_adm'].isna().sum()


0

___
**'weight\_kg\_adm'** Missing weight values were imputed using WHO median weight-for-age data, separately for males and females. Age in months was rounded to match the WHO reference tables.
___

In [216]:
# Suppress all warning messages to keep output clean
warnings.filterwarnings("ignore")

# Divide 'weight_kg_adm' into 5 equal-sized bins (quintiles) and assign to 'weight_group'
Sepsis_df['weight_group'] = pd.qcut(Sepsis_df['weight_kg_adm'], 5)

# Group by 'age_group', 'weight_group', and 'sex_adm', then fill missing 'muac_mm_adm' values with the group mean
Sepsis_df['muac_mm_adm'] = Sepsis_df.groupby(
    ['age_group', 'weight_group', 'sex_adm']
)['muac_mm_adm'].transform(lambda x: x.fillna(x.mean()))

# Print how many missing values remain in 'muac_mm_adm']\
print(Sepsis_df["muac_mm_adm"].isnull().sum())

0


___
**"muac_mm_adm"** was imputed by grouping the data based on age_group, weight quintiles (weight_group), and sex_adm, then filling missing values with the mean MUAC within each subgroup.
___

In [218]:
# Impute missing 'bcseye_adm' as 'NoFollow'
Sepsis_df['bcseye_adm'] = Sepsis_df['bcseye_adm'].fillna('Fails to watch or follow')

# Identify rows where 'bcsverbal_adm' is missing
missing_verbal = Sepsis_df['bcsverbal_adm'].isna()

# Display corresponding eye and motor responses for those rows (diagnostic step)
print(Sepsis_df[missing_verbal][['bcseye_adm', 'bcsmotor_adm']])

# Define a subgroup with specific eye and motor scores
subgroup_mask = (
    (Sepsis_df['bcseye_adm'] == 'Watches or follows') & 
    (Sepsis_df['bcsmotor_adm'] == 'Withdraws limb from painful stimulus')
)

# Get the most common (mode) verbal score from this subgroup
mode_verbal = Sepsis_df[subgroup_mask]['bcsverbal_adm'].mode()[0]

# Apply this most common verbal score to specific missing entries
Sepsis_df.loc[[507, 1205], 'bcsverbal_adm'] = mode_verbal
print(Sepsis_df["bcseye_adm"].isnull().sum())
print(Sepsis_df["bcsverbal_adm"].isnull().sum())


              bcseye_adm                          bcsmotor_adm
507   Watches or follows  Withdraws limb from painful stimulus
1205  Watches or follows  Withdraws limb from painful stimulus
0
0


___
Missing values in **`bcseye_adm`** were imputed with `"NoFollow"`(worst case).
For **`bcsverbal_adm`**, missing entries were imputed using the **most common value** from a defined subgroup (based on specific eye and motor responses: `"Watches or follows"` and `"Withdraws limb from painful stimulus"`). This approach ensures context-aware imputation.
___

In [220]:
# Round age to match WHO months
Sepsis_df['age_months_rounded'] = Sepsis_df['agecalc_adm'].round().astype(int)

# Load WHO dataset for girls and prepare
who_girls2 = pd.read_csv("acfa-girls-3-5-zscores.csv")
who_clean2 = who_girls2[['Month', 'M']].rename(columns={
    'Month': 'age_months_rounded',
    'M': 'who_mean_munac'
})
Sepsis_df = Sepsis_df.merge(who_clean2, on='age_months_rounded', how='left')

# Impute missing heights for female patients
girls_mask = (Sepsis_df['sex_adm'] == 'Female') & (Sepsis_df['muac_mm_adm'].isna())
Sepsis_df.loc[girls_mask, 'muac_mm_adm'] = Sepsis_df.loc[girls_mask, 'who_mean_munac']

# Load WHO dataset for boys and prepare
who_boys2 = pd.read_csv("acfa-boys-3-5-zscores.csv")
who_clean_boys2 = who_boys2[['Month','M']].rename(columns={
    'Month': 'age_months_rounded',
    'M': 'who_mean_munac_boys'
})
Sepsis_df = Sepsis_df.merge(who_clean_boys2, on='age_months_rounded', how='left')

# Impute missing heights for male patients
boys_mask = (Sepsis_df['sex_adm'] == 'Male') & (Sepsis_df['muac_mm_adm'].isna())
Sepsis_df.loc[boys_mask, 'muac_mm_adm'] = Sepsis_df.loc[boys_mask, 'who_mean_munac_boys']
print(Sepsis_df['muac_mm_adm'].isnull().sum())

FileNotFoundError: [Errno 2] No such file or directory: 'acfa-girls-3-5-zscores.csv'

___
`muac_mm_adm` Missing MUAC values were imputed using WHO mean MUAC-for-age data, separately for males and females. Age in months was rounded to align with WHO reference tables.
___

In [None]:
#Sepsis_df['hivstatus_adm'] = Sepsis_df['hivstatus_adm'].fillna('NotTested') diasbp_mmhg_adm

In [None]:
Sepsis_df["diasbp_mmhg_adm"] = Sepsis_df["diasbp_mmhg_adm"].fillna(Sepsis_df["diasbp_mmhg_adm"].mean())
print(Sepsis_df["diasbp_mmhg_adm"].isnull().sum())

___
The missing values in the diasbp_mmhg_adm column were imputed with the mean of the existing values since found to be fairly symmetric.
___

In [None]:
# Step 1: Get columns that have at least one missing value
cols_with_missing2 = Sepsis_df.columns[Sepsis_df.isnull().any()].tolist()

# Step 2: Create a new DataFrame with only those columns
df_missing_subset2 = Sepsis_df[cols_with_missing2]

# Step 3: Select numeric columns only
numeric_cols2 = df_missing_subset2.select_dtypes(include='number').columns

# Step 4: Fill missing values in numeric columns with the median of each column
df_missing_subset2[numeric_cols2] = df_missing_subset2[numeric_cols2].fillna(df_missing_subset2[numeric_cols2].median())
# Update the original Sepsis_df with the imputed numeric columns
Sepsis_df[numeric_cols2] = df_missing_subset2[numeric_cols2]



___
The remaining numeric columns with null values were imputed using the median, as the number of missing values was relatively small and found to be asymmetrically distributed. This approach ensures that the imputed values do not skew the data significantly.
___

*Clean non-numeric-columns*

In [None]:
Sepsis_df['hivstatus_adm'] = Sepsis_df['hivstatus_adm'].fillna('NotTested')
print(Sepsis_df["hivstatus_adm"].isnull().sum())

___
The column **'hivstatus\_adm'** was imputed with **'NotTested'** since it was challenging to reliably estimate the missing values using the same or related columns.
___

In [None]:
# Step 1: Plot 'hematocrit_gpdl_adm' levels before and after transfusion using a boxplot
plt.figure(figsize=(10, 6))
sns.boxplot(
    x='hctpretransfusion_adm', 
    y='hematocrit_gpdl_adm',  # Replace ''hematocrit_gpdl_adm'' with the actual column name for HCT values
    data=Sepsis_df
)
plt.title('Distribution of hematocrit_gpdl_adm Levels by Pre-Transfusion Status')
plt.xlabel('Before Transfusion (hctpretransfusion_adm)')
plt.ylabel('hematocrit_gpdl_adm Levels')
plt.show()


In [None]:
# First, calculate mode (most frequent value)
mode_value = Sepsis_df['hctpretransfusion_adm'].mode()[0]

# Define custom function
def impute_hctpretransfusion(row):
    if pd.isna(row['hctpretransfusion_adm']):
        if row['hematocrit_gpdl_adm'] < 10 or row['hematocrit_gpdl_adm'] > 50:
            return 'Yes'
        else:
            return mode_value
    else:
        return row['hctpretransfusion_adm']

# Apply the function to fill missing values
Sepsis_df['hctpretransfusion_adm'] = Sepsis_df.apply(impute_hctpretransfusion, axis=1)
print(Sepsis_df['hctpretransfusion_adm'].isnull().sum())


___
**"hctpretransfusion\_adm"** was imputed using a **distribution-based method** guided by a box plot analysis. Since the upper whisker for “No” responses was around 50 and the lower whisker around 10 (based on hematocrit values), missing entries with hematocrit values outside this range were classified as **"Yes"**. The remaining missing values were filled using the **mode**.
___


In [None]:
# Step 1: Select non-numeric columns with missing values
non_numeric_missing = Sepsis_df.select_dtypes(exclude='number').columns
non_numeric_missing = [col for col in non_numeric_missing if Sepsis_df[col].isnull().any()]

# Step 2: Fill each with its own mode
for col in non_numeric_missing:
    mode_val = Sepsis_df[col].mode()[0]
    Sepsis_df[col] = Sepsis_df[col].fillna(mode_val)

___
non numeric columns other than **'hivstatus\_adm'** and **"hctpretransfusion_adm"** were filled using mode
___

In [273]:
# Configure pandas to display details for up to 150 columns in the .info() summary
#  By default, pandas displays a maximum of 100 columns in .info()
# If the DataFrame has more than 100 columns, the output is truncated.
pd.set_option('display.max_info_columns', 150)
Sepsis_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2686 entries, 0 to 2685
Data columns (total 114 columns):
 #    Column                    Non-Null Count  Dtype  
---   ------                    --------------  -----  
 0    studyid_adm               2686 non-null   int64  
 1    agecalc_adm               2686 non-null   float64
 2    height_cm_adm             2672 non-null   float64
 3    weight_kg_adm             2677 non-null   float64
 4    muac_mm_adm               2673 non-null   float64
 5    hr_bpm_adm                2686 non-null   float64
 6    rr_brpm_app_adm           2681 non-null   float64
 7    sysbp_mmhg_adm            2686 non-null   float64
 8    diasbp_mmhg_adm           2685 non-null   float64
 9    temp_c_adm                2685 non-null   float64
 10   spo2site1_pc_oxi_adm      2682 non-null   float64
 11   spo2site2_pc_oxi_adm      2481 non-null   float64
 12   momage_adm                2667 non-null   float64
 13   momagefirstpreg_adm       2462 non-null   floa

In [275]:
Sepsis_df.columns.get_loc('inhospital_mortality')
Sepsis_df = Sepsis_df.drop(Sepsis_df.columns[113:],axis=1)
Sepsis_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2686 entries, 0 to 2685
Data columns (total 113 columns):
 #    Column                    Non-Null Count  Dtype  
---   ------                    --------------  -----  
 0    studyid_adm               2686 non-null   int64  
 1    agecalc_adm               2686 non-null   float64
 2    height_cm_adm             2672 non-null   float64
 3    weight_kg_adm             2677 non-null   float64
 4    muac_mm_adm               2673 non-null   float64
 5    hr_bpm_adm                2686 non-null   float64
 6    rr_brpm_app_adm           2681 non-null   float64
 7    sysbp_mmhg_adm            2686 non-null   float64
 8    diasbp_mmhg_adm           2685 non-null   float64
 9    temp_c_adm                2685 non-null   float64
 10   spo2site1_pc_oxi_adm      2682 non-null   float64
 11   spo2site2_pc_oxi_adm      2481 non-null   float64
 12   momage_adm                2667 non-null   float64
 13   momagefirstpreg_adm       2462 non-null   floa

___
                                                Observation : *** All columns do not contain missing value. ***
___

In [None]:
Sepsis_df_2 = pd.read_csv("sepsis_cleaned.csv")
Sepsis_df_2.columns.get_loc('inhospital_mortality')
Sepsis_df_2 = Sepsis_df_2.drop(Sepsis_df_2.columns[113:],axis=1)
Sepsis_df_2.info()
Sepsis_df_2['bcseye_adm'] = 'Fails to watch or follow' 'NoFollow'

In [None]:
# Show only variables with p < 0.05
significant = chi2_df[chi2_df['p-value'] < 0.01]
print("Significant associations with in-hospital mortality:")
print(significant)


In [None]:
from scipy import stats

from scipy.stats import chi2_contingency
from scipy.stats import mannwhitneyu

# 1. Separate categorical and continuous features
target = 'inhospital_mortality'
categorical_features = Sepsis_df_2.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
continuous_features = Sepsis_df_2.select_dtypes(include=['int64', 'float64']).drop(columns=[target]).columns.tolist()

# 2. Chi-Square test for categorical features
significant_cat = []
for col in categorical_features:
    contingency = pd.crosstab(Sepsis_df_2[col], Sepsis_df_2[target])
    if contingency.shape[0] > 1:
        chi2, p, _, _ = stats.chi2_contingency(contingency)
        if p < 0.01:
            significant_cat.append(col)

# 3. Mann-Whitney U Test for continuous features
significant_cont = []
for col in continuous_features:
    group0 = Sepsis_df_2[Sepsis_df_2[target] == 0][col]
    group1 = Sepsis_df_2[Sepsis_df_2[target] == 1][col]
    try:
        u_stat, p_value = stats.mannwhitneyu(group0, group1, alternative='two-sided')
        if p_value < 0.01:
            significant_cont.append(col)
    except:
        pass  # Skip if error due to too many NaNs or invalid inputs

# 4. Combine and review
significant_features = significant_cat + significant_cont
print(f"Total insignificant features (p < 0.01): {len(significant_features)}")
print(significant_features)

In [238]:
import pandas as pd

# Identify categorical columns
categorical_cols = Sepsis_df_2.select_dtypes(include='object').columns

# Count unique values in each categorical column
cat_unique_counts = Sepsis_df_2[categorical_cols].nunique().sort_values()

# Create a summary DataFrame
cat_summary = pd.DataFrame({
    'Categorical Variable': cat_unique_counts.index,
    'Unique Values Count': cat_unique_counts.values
})

# Display all rows
pd.set_option('display.max_rows', None)

# Print the full summary
print(cat_summary)


        Categorical Variable  Unique Values Count
0          symptoms_adm___18                    1
1        comorbidity_adm___4                    1
2        comorbidity_adm___2                    2
3        comorbidity_adm___3                    2
4        comorbidity_adm___5                    2
5        comorbidity_adm___6                    2
6        comorbidity_adm___7                    2
7        comorbidity_adm___8                    2
8        comorbidity_adm___9                    2
9       comorbidity_adm___10                    2
10      comorbidity_adm___11                    2
11      malariastatuspos_adm                    2
12          duedateknown_adm                    2
13       birthdetail_adm___1                    2
14       birthdetail_adm___2                    2
15       comorbidity_adm___1                    2
16                   sex_adm                    2
17       birthdetail_adm___5                    2
18       birthdetail_adm___6                    2


In [240]:
# Display a few example values from each categorical column
for col in categorical_cols:
    print(f"\nColumn: {col}")
    print(Sepsis_df_2[col].dropna().unique()[:5])  # Show up to 5 unique non-null values



Column: sex_adm
['Female' 'Male']

Column: spo2onoxy_adm
['No' 'Yes']

Column: oxygenavail_adm
['Oxygen available and not being used' 'Oxygen available and being used'
 'Oxygen not available' 'Oxygen available but not enough']

Column: respdistress_adm
['No' 'Yes']

Column: caprefill_adm
['No' 'Yes']

Column: bcseye_adm
['Watches or follows' 'Fails to watch or follow' 'NoFollow']

Column: bcsmotor_adm
['Localizes painful stimulus' 'Withdraws limb from painful stimulus'
 'No response or inappropriate response']

Column: bcsverbal_adm
['Cries appropriately with pain, or, if verbal, speaks'
 'Moan or abnormal cry with pain' 'No vocal response to pain']

Column: bcgscar_adm
['Yes' 'No']

Column: vaccmeasles_adm
['Yes' 'No' 'Unknown']

Column: vaccpneumoc_adm
['3 doses' '2 doses' 'Unknown' '0 doses' '1 dose']

Column: vaccpneumocsource_adm
['Self report' 'Card']

Column: vaccdpt_adm
['3 doses' '2 doses' 'Unknown' '0 doses' '1 dose']

Column: vaccdptsource_adm
['Self report' 'Card']

Column

In [242]:
# Initialize a counter
col_count = 0

# Display a few example values from each categorical column
for col in categorical_cols:
    print(f"\nColumn: {col}")
    print(Sepsis_df_2[col].dropna().unique()[:5])  # Show up to 5 unique non-null values
    col_count += 1

print(f"\nTotal categorical columns displayed: {col_count}")



Column: sex_adm
['Female' 'Male']

Column: spo2onoxy_adm
['No' 'Yes']

Column: oxygenavail_adm
['Oxygen available and not being used' 'Oxygen available and being used'
 'Oxygen not available' 'Oxygen available but not enough']

Column: respdistress_adm
['No' 'Yes']

Column: caprefill_adm
['No' 'Yes']

Column: bcseye_adm
['Watches or follows' 'Fails to watch or follow' 'NoFollow']

Column: bcsmotor_adm
['Localizes painful stimulus' 'Withdraws limb from painful stimulus'
 'No response or inappropriate response']

Column: bcsverbal_adm
['Cries appropriately with pain, or, if verbal, speaks'
 'Moan or abnormal cry with pain' 'No vocal response to pain']

Column: bcgscar_adm
['Yes' 'No']

Column: vaccmeasles_adm
['Yes' 'No' 'Unknown']

Column: vaccpneumoc_adm
['3 doses' '2 doses' 'Unknown' '0 doses' '1 dose']

Column: vaccpneumocsource_adm
['Self report' 'Card']

Column: vaccdpt_adm
['3 doses' '2 doses' 'Unknown' '0 doses' '1 dose']

Column: vaccdptsource_adm
['Self report' 'Card']

Column

In [None]:
from sklearn.preprocessing import LabelEncoder

# 1. Identify categorical features
categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Categorical features to encode: {len(categorical_features)} found")
print(categorical_features)

# 2. Initialize LabelEncoder
label_encoder = LabelEncoder()

# 3. Apply Label Encoding for binary categorical variables
for feature in categorical_features:
    unique_values = df[feature].nunique()
    
    if unique_values == 2:
        # Binary category - apply label encoding
        df[feature] = label_encoder.fit_transform(df[feature])
        print(f"Label encoded feature: {feature}")
    else:
        # Multiclass category - apply one-hot encoding
        df = pd.get_dummies(df, columns=[feature], prefix=[feature])
        print(f"One-hot encoded feature: {feature}")

# 4. Quick confirmation
print(f"Data shape after encoding: {df.shape}")
