In [1]:
from dragonFunctions import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import plotly.express as px
import pylab
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import chi2_contingency
from sklearn.model_selection import KFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt


# Data Loading

In [2]:
patient_df = pd.read_csv('/Users/josh/Desktop/Macbook Working Files/Git Repos/650-DRAGON-SLAYERS/FILES/Josh-Dev/patient_df.csv')

# Data Preprocessing

## Keep Only First Admission

In [3]:
patient_df = patient_df.drop_duplicates(subset='SUBJECT_ID', keep='first')

## Define 30-Day Mortality Boolean

In [4]:
patient_df['DOD'] = pd.to_datetime(patient_df['DOD'])
patient_df['ADMITTIME'] = pd.to_datetime(patient_df['ADMITTIME'])

# define 30-days mortality column and drop net income
patient_df['DIFF_DAYS'] = (patient_df['DOD'] - patient_df['ADMITTIME']).dt.days
patient_df['MORTALITY'] =patient_df['DIFF_DAYS'].apply(lambda x: 0 if x>30 else 1)

In [5]:
patient_df['MORTALITY'].value_counts()

1    3281
0    1274
Name: MORTALITY, dtype: int64

## Group Ethnicities

In [6]:
def consolidate_ethnicity(ethnicity):
    ethnicity = ethnicity.upper()  # Convert to uppercase for consistency
    
    # Grouping logic
    if 'WHITE' in ethnicity:
        return 'WHITE'
    elif 'BLACK' in ethnicity or 'AFRICAN AMERICAN' in ethnicity:
        return 'BLACK OR AFRICAN AMERICAN'
    elif 'ASIAN' in ethnicity:
        return 'ASIAN'
    elif 'HISPANIC' in ethnicity or 'LATINO' in ethnicity:
        return 'HISPANIC OR LATINO'
    elif 'AMERICAN INDIAN' in ethnicity or 'ALASKA NATIVE' in ethnicity:
        return 'AMERICAN INDIAN OR ALASKA NATIVE'
    elif 'NATIVE HAWAIIAN' in ethnicity or 'PACIFIC ISLANDER' in ethnicity:
        return 'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER'
    elif 'MIDDLE EASTERN' in ethnicity:
        return 'MIDDLE EASTERN'
    elif ('UNKNOWN' in ethnicity or 'NOT SPECIFIED' in ethnicity or
          'DECLINED TO ANSWER' in ethnicity or 'UNABLE TO OBTAIN' in ethnicity):
        return 'UNKNOWN/NOT SPECIFIED/DECLINED'
    else:
        return 'OTHER'
    

# Apply the consolidation to create a new column
patient_df['ETHNICITY_CONSOLIDATED'] = patient_df['ETHNICITY'].apply(consolidate_ethnicity)

print(patient_df['ETHNICITY_CONSOLIDATED'].value_counts())

WHITE                                        3360
BLACK OR AFRICAN AMERICAN                     436
UNKNOWN/NOT SPECIFIED/DECLINED                340
ASIAN                                         148
HISPANIC OR LATINO                            144
OTHER                                         117
AMERICAN INDIAN OR ALASKA NATIVE                4
MIDDLE EASTERN                                  4
NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER       2
Name: ETHNICITY_CONSOLIDATED, dtype: int64


## Drop Extraneous Columns

Unnecessary patient data and identifiers will be dropped from the `patient_df`

In [7]:
columns_to_drop = ['Unnamed: 0', 'SUBJECT_ID', 'DOB', 'DOD','HOSPITAL_EXPIRE_FLAG', 'HADM_ID', 'ADMITTIME', 'DIFF_DAYS', 'DISCHTIME', 'ETHNICITY','TEMP_MEAN_C', 'CHLORIDE_MAX_VAL', 'CREATININE_MAX_VAL', 'LACTATE_MAX_VAL', 'PLATELET_MAX_VAL','POTASSIUM_MAX_VAL']
existing_columns_to_drop = [col for col in columns_to_drop if col in patient_df.columns]

# Drop the columns
patient_df = patient_df.drop(columns=existing_columns_to_drop)

patient_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4555 entries, 0 to 5175
Data columns (total 32 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   GENDER                  4555 non-null   object 
 1   ADMISSION_TYPE          4555 non-null   object 
 2   AGE_AT_ADMISSION        4555 non-null   int64  
 3   LOS                     4555 non-null   int64  
 4   LOS_ICU_MEAN            4554 non-null   float64
 5   WEIGHT_MEAN             4323 non-null   float64
 6   HEARTRATE_MEAN          4550 non-null   float64
 7   SBP_MEAN                4551 non-null   float64
 8   DBP_MEAN                4551 non-null   float64
 9   MAP_MEAN                4551 non-null   float64
 10  RR_MEAN                 4551 non-null   float64
 11  TEMP_MIN_C              4543 non-null   float64
 12  TEMP_MAX_C              4543 non-null   float64
 13  OXYGEN_SAT_MEAN         4547 non-null   float64
 14  DIABETES                4555 non-null   

## Encoding Categorical Features

In [8]:
# Identify categorical columns
categorical_cols = patient_df.select_dtypes(include=['object', 'category']).columns.tolist()

df = pd.get_dummies(patient_df, columns=categorical_cols, drop_first=True)

## Handle `null` Values

In [9]:
null_counts = df.isnull().sum()

null_counts_df = null_counts.reset_index()
null_counts_df.columns = ['Column Name', 'Null Count']

total_rows = len(df)
null_counts = df.isnull().sum()
null_percent = (null_counts / total_rows) * 100

null_summary = pd.DataFrame({
    'Null Count': null_counts,
    'Null Percentage (%)': null_percent
})

null_summary

Unnamed: 0,Null Count,Null Percentage (%)
AGE_AT_ADMISSION,0,0.0
LOS,0,0.0
LOS_ICU_MEAN,1,0.021954
WEIGHT_MEAN,232,5.093304
HEARTRATE_MEAN,5,0.109769
SBP_MEAN,4,0.087816
DBP_MEAN,4,0.087816
MAP_MEAN,4,0.087816
RR_MEAN,4,0.087816
TEMP_MIN_C,12,0.263447


In [10]:
numerical_cols = df.select_dtypes(include=['number']).columns
mean_values = df[numerical_cols].mean()

df[numerical_cols] = df[numerical_cols].fillna(mean_values)
null_counts = df[numerical_cols].isnull().sum()

## Outlier Handling

We handle outliers using two functions: 

1. `preprocess_outliers(df, threshold=3)`
2. `compare_summary_stats(summary_before, summary_after)`



```python
def preprocess_outliers(df, threshold=3):
    """
    Detects and caps outliers in non-binary numerical columns of the DataFrame.
    
    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - threshold (float): Number of standard deviations to define outliers.
    
    Returns:
    - df_capped (pd.DataFrame): DataFrame with outliers capped.
    - summary_before (pd.DataFrame): Summary statistics before capping.
    - summary_after (pd.DataFrame): Summary statistics after capping.
    - capped_summary (pd.DataFrame): Count of capped values per column.
    """
```

```python
def compare_summary_stats(summary_before, summary_after):
    """
    Compares summary statistics before and after outlier handling.
    
    Parameters:
    - summary_before (pd.DataFrame): Summary statistics before handling outliers.
    - summary_after (pd.DataFrame): Summary statistics after handling outliers.
    
    Returns:
    - comparison_table (pd.DataFrame): Table showing before, after, and changes.
    """
```

In [11]:
# Preprocess outliers and capture all returned values
df_capped, summary_before, summary_after, capped_summary, binary_numerical_cols = preprocess_outliers(df, threshold=2)

# Correctly call compare_summary_stats with all required arguments
comparison_table = compare_summary_stats(summary_before, summary_after, capped_summary)

# Display or save the comparison table
comparison_table.to_csv('Outlier_Report.csv')

df_capped.to_csv('patient_df_capped.csv')


## EDA

We get the mean and p-values of the numberical features using the following functions: `mean_confidence_interval` and `get_stats`

```python

def mean_confidence_interval(data, confidence=0.95):
    """
    Calculate the mean and confidence interval for a list of numbers.
    """

def get_stats(df, group_col='MORTALITY', confidence=0.95):
    """
    Analyze numerical features in the dataframe, aggregating by the specified group column.
    
    Parameters:
    - df: pandas DataFrame
    - group_col: column name to group by (binary: 0 or 1)
    - confidence: confidence level for intervals
    
    Returns:
    - result_df: pandas DataFrame with mean (CI) for each group and p-value
    """
```

In [12]:
eda_df = get_stats(df_capped)
eda_df.sort_values('p-value')

eda_df.to_csv('Feature_Means_Report.csv')

# Model