In [93]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pylab
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import chi2_contingency
from sklearn.model_selection import KFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt


# Data Loading

In [94]:
# Read the patient-level extract into memory.
# NOTE(review): this is an absolute path on one specific machine; the cell will
# not run elsewhere until the file is reachable at this exact location.
patient_df = pd.read_csv(
    filepath_or_buffer='/Users/josh/Desktop/Macbook Working Files/Git Repos/650-DRAGON-SLAYERS/FILES/Josh-Dev/patient_df.csv',
)

# Column names, non-null counts and dtypes of the freshly loaded frame.
patient_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5176 entries, 0 to 5175
Data columns (total 45 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            5176 non-null   int64  
 1   SUBJECT_ID            5176 non-null   int64  
 2   GENDER                5176 non-null   object 
 3   DOB                   5176 non-null   object 
 4   DOD                   3278 non-null   object 
 5   HOSPITAL_EXPIRE_FLAG  5176 non-null   int64  
 6   HADM_ID               5176 non-null   int64  
 7   ADMITTIME             5176 non-null   object 
 8   DISCHTIME             5176 non-null   object 
 9   ADMISSION_TYPE        5176 non-null   object 
 10  ETHNICITY             5176 non-null   object 
 11  AGE_AT_ADMISSION      5176 non-null   int64  
 12  LOS                   5176 non-null   int64  
 13  LOS_ICU_MEAN          5175 non-null   float64
 14  WEIGHT_MEAN           4941 non-null   float64
 15  HEARTRATE_MEAN       

# Data Preprocessing

## Drop Extraneous Columns

Columns that are not needed for modeling (dates, the admission identifier `HADM_ID`, and `ETHNICITY`) are dropped from `patient_df`.

In [95]:
# Columns we do not want to carry forward.
columns_to_drop = ['DOB', 'DOD', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'ETHNICITY']

# Keep only the names that are actually present, so re-running this cell after
# the columns are already gone does not raise.
present_columns = set(patient_df.columns)
existing_columns_to_drop = [name for name in columns_to_drop if name in present_columns]

# NOTE(review): 'Unnamed: 0' and 'SUBJECT_ID' are not in the list above and
# therefore stay in the frame -- confirm that is intended.
patient_df = patient_df.drop(columns=existing_columns_to_drop)

patient_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5176 entries, 0 to 5175
Data columns (total 39 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            5176 non-null   int64  
 1   SUBJECT_ID            5176 non-null   int64  
 2   GENDER                5176 non-null   object 
 3   HOSPITAL_EXPIRE_FLAG  5176 non-null   int64  
 4   ADMISSION_TYPE        5176 non-null   object 
 5   AGE_AT_ADMISSION      5176 non-null   int64  
 6   LOS                   5176 non-null   int64  
 7   LOS_ICU_MEAN          5175 non-null   float64
 8   WEIGHT_MEAN           4941 non-null   float64
 9   HEARTRATE_MEAN        5171 non-null   float64
 10  SBP_MEAN              5172 non-null   float64
 11  DBP_MEAN              5172 non-null   float64
 12  MAP_MEAN              5172 non-null   float64
 13  RR_MEAN               5172 non-null   float64
 14  TEMP_MEAN_C           5164 non-null   float64
 15  TEMP_MIN_C           

## Encoding Categorical Features

In [96]:
# Names of every object/category column in the patient frame.
categorical_cols = list(
    patient_df.select_dtypes(include=['object', 'category']).columns
)

# One-hot encode those columns; drop_first=True omits the first level of each
# so the remaining indicator columns are not perfectly collinear.
df = pd.get_dummies(data=patient_df, columns=categorical_cols, drop_first=True)

## Handle `null` Values

In [97]:
# Missing values per column.
null_counts = df.isna().sum()

# Same counts as a plain two-column frame (column name, count).
null_counts_df = null_counts.reset_index()
null_counts_df.columns = ['Column Name', 'Null Count']

# Express each count as a share of all rows.
total_rows = df.shape[0]
null_percent = null_counts.div(total_rows).mul(100)

null_summary = pd.DataFrame(
    {
        'Null Count': null_counts,
        'Null Percentage (%)': null_percent,
    }
)

null_summary

Unnamed: 0,Null Count,Null Percentage (%)
Unnamed: 0,0,0.0
SUBJECT_ID,0,0.0
HOSPITAL_EXPIRE_FLAG,0,0.0
AGE_AT_ADMISSION,0,0.0
LOS,0,0.0
LOS_ICU_MEAN,1,0.01932
WEIGHT_MEAN,235,4.540185
HEARTRATE_MEAN,5,0.0966
SBP_MEAN,4,0.07728
DBP_MEAN,4,0.07728


In [98]:
# Mean-impute every numeric column using that column's own mean.
numerical_cols = df.select_dtypes(include=['number']).columns
mean_values = df[numerical_cols].mean()

df[numerical_cols] = df[numerical_cols].fillna(value=mean_values)

# Recount missing values in the numeric columns after imputation
# (this rebinds the earlier `null_counts`).
null_counts = df[numerical_cols].isna().sum()

## Outlier Handling

In [99]:
def preprocess_outliers(df, threshold=3):
    """
    Detects and caps outliers in non-binary numerical columns of the DataFrame.
    
    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - threshold (float): Number of standard deviations to define outliers.
    
    Returns:
    - df_capped (pd.DataFrame): DataFrame with outliers capped.
    - summary_before (pd.DataFrame): Summary statistics before capping.
    - summary_after (pd.DataFrame): Summary statistics after capping.
    - capped_summary (pd.DataFrame): Count of capped values per column.
    """
    # Identify numerical columns
    numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
    
    # Detect binary numerical columns
    binary_numerical_cols = []
    for col in numerical_cols:
        unique_values = df[col].dropna().unique()
        if len(unique_values) == 2:
            binary_numerical_cols.append(col)
    
    # Separate non-binary numerical columns
    non_binary_numerical_cols = [col for col in numerical_cols if col not in binary_numerical_cols]
    
    # Calculate mean and std dev
    mean_values = df[non_binary_numerical_cols].mean()
    std_values = df[non_binary_numerical_cols].std()
    
    # Define bounds
    lower_bound = mean_values - threshold * std_values
    upper_bound = mean_values + threshold * std_values
    
    # Detect outliers
    outliers = pd.DataFrame(False, index=df.index, columns=non_binary_numerical_cols)
    for col in non_binary_numerical_cols:
        outliers[col] = (df[col] < lower_bound[col]) | (df[col] > upper_bound[col])
    outliers_any = outliers.any(axis=1)
    
    # Summary statistics before capping
    summary_before = df[non_binary_numerical_cols].describe().transpose()
    summary_before = summary_before[['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']]
    summary_before.columns = ['Count', 'Mean', 'Std_Dev', 'Min', 'Q1', 'Median', 'Q3', 'Max']
    
    # Cap outliers
    df_capped = df.copy()
    for col in non_binary_numerical_cols:
        df_capped[col] = np.where(
            df_capped[col] < lower_bound[col],
            lower_bound[col],
            df_capped[col]
        )
        df_capped[col] = np.where(
            df_capped[col] > upper_bound[col],
            upper_bound[col],
            df_capped[col]
        )
    
    # Summary statistics after capping
    summary_after = df_capped[non_binary_numerical_cols].describe().transpose()
    summary_after = summary_after[['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']]
    summary_after.columns = ['Count', 'Mean_After', 'Std_Dev_After', 'Min', 'Q1', 'Median', 'Q3', 'Max']
    
    # Count of capped values per column
    capped_lower = (df[non_binary_numerical_cols] < lower_bound).sum()
    capped_upper = (df[non_binary_numerical_cols] > upper_bound).sum()
    capped_summary = pd.DataFrame({
        'Capped_Lower': capped_lower,
        'Capped_Upper': capped_upper
    })
    
    return df_capped, summary_before, summary_after, capped_summary, binary_numerical_cols

# Cap values lying more than 3 standard deviations from the column mean and
# keep the before/after statistics for the comparison below.
(
    df_capped,
    summary_before,
    summary_after,
    capped_summary,
    binary_numerical_cols,
) = preprocess_outliers(df=df, threshold=3)

In [100]:
def compare_summary_stats(summary_before, summary_after):
    """
    Build a side-by-side table of summary statistics before and after outlier handling.

    For each of mean, standard deviation, minimum and maximum the table holds
    the value before, the value after, the absolute change (after - before)
    and the change as a percentage of the "before" value.

    Parameters:
    - summary_before (pd.DataFrame): Statistics before handling outliers; must
      provide the columns 'Mean', 'Std_Dev', 'Min' and 'Max'.
    - summary_after (pd.DataFrame): Statistics after handling outliers; must
      provide the columns 'Mean_After', 'Std_Dev_After', 'Min' and 'Max'.

    Returns:
    - comparison_table (pd.DataFrame): Indexed like `summary_before`, with the
      four columns of each statistic grouped together.
    """
    # (display label, column in summary_before, column in summary_after)
    tracked_stats = (
        ('Mean', 'Mean', 'Mean_After'),
        ('Std Dev', 'Std_Dev', 'Std_Dev_After'),
        ('Min', 'Min', 'Min'),
        ('Max', 'Max', 'Max'),
    )

    comparison_table = pd.DataFrame(index=summary_before.index)

    for label, before_key, after_key in tracked_stats:
        before_col = f'{label} Before'
        after_col = f'{label} After'
        change_col = f'{label} Change'
        percent_col = f'{label} % Change'

        comparison_table[before_col] = summary_before[before_key]
        comparison_table[after_col] = summary_after[after_key]
        comparison_table[change_col] = comparison_table[after_col] - comparison_table[before_col]
        # Relative change; a zero "before" value yields inf/NaN rather than an error.
        comparison_table[percent_col] = (comparison_table[change_col] / comparison_table[before_col]) * 100

    return comparison_table

# Compare the statistics from before and after capping.
comparison_table = compare_summary_stats(
    summary_before=summary_before,
    summary_after=summary_after,
)

# Write the comparison table to 'Outlier.csv' in the current working directory
# (nothing is displayed inline by this cell).
comparison_table.to_csv('Outlier.csv')