In [80]:
# Import Libraries
import pandas as pd
import csv
import sqlite3
import matplotlib.pyplot as plt

In [81]:
# ML Imports
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder

In [82]:
# Warning Imports
import warnings
warnings.filterwarnings('ignore')

In [83]:
# Data Connection String:
data = "/Users/jp/Desktop/Capstone Project/Kaggle_Sirio_Libanes_ICU_Prediction.csv"
df = pd.read_csv(data, sep=',')


In [84]:
# Basic Data Information: 
print("Data Set:")
print(df.head())
print("\n")
print(df.tail())
print("\n")
print(df.info())
print("\nData Statistics:")
print(df.describe())
print(df.describe(include='all'))

Data Set:
   PATIENT_VISIT_IDENTIFIER  AGE_ABOVE65 AGE_PERCENTIL  GENDER  \
0                         0            1          60th       0   
1                         0            1          60th       0   
2                         0            1          60th       0   
3                         0            1          60th       0   
4                         0            1          60th       0   

   DISEASE GROUPING 1  DISEASE GROUPING 2  DISEASE GROUPING 3  \
0                 0.0                 0.0                 0.0   
1                 0.0                 0.0                 0.0   
2                 0.0                 0.0                 0.0   
3                 0.0                 0.0                 0.0   
4                 0.0                 0.0                 0.0   

   DISEASE GROUPING 4  DISEASE GROUPING 5  DISEASE GROUPING 6  ...  \
0                 0.0                 1.0                 1.0  ...   
1                 0.0                 1.0                 1.0 

In [85]:
## Data Cleaning and Preparation: 
# Check for Missing Values
missing_values = df.isnull().sum().sort_values(ascending=False)
missing_values.head(20)
print(missing_values.head(20))

# Check for Duplicate Values
duplicates = df.duplicated()
has_duplicates = duplicates.any()
print("\n")
print(f"Are there any duplicate rows? {'Yes' if has_duplicates else 'No'}")
duplicate_rows = df[duplicates]

# Check Correlation between All Variables 
print("\nCorrelation Between All Variables:")
print(df.corr)


P02_VENOUS_MIN        1104
LEUKOCYTES_MAX        1104
PC02_VENOUS_MEAN      1104
PC02_VENOUS_MIN       1104
PC02_VENOUS_MAX       1104
PC02_VENOUS_DIFF      1104
PCR_MEDIAN            1104
PCR_MEAN              1104
PCR_MIN               1104
PCR_MAX               1104
PCR_DIFF              1104
PH_ARTERIAL_MEDIAN    1104
PH_ARTERIAL_MEAN      1104
PH_ARTERIAL_MIN       1104
PH_ARTERIAL_MAX       1104
PH_ARTERIAL_DIFF      1104
PH_VENOUS_MEDIAN      1104
PH_VENOUS_MEAN        1104
PH_VENOUS_MIN         1104
PH_VENOUS_MAX         1104
dtype: int64


Are there any duplicate rows? No

Correlation Between All Variables:
<bound method DataFrame.corr of       PATIENT_VISIT_IDENTIFIER  AGE_ABOVE65 AGE_PERCENTIL  GENDER  \
0                            0            1          60th       0   
1                            0            1          60th       0   
2                            0            1          60th       0   
3                            0            1          60th       0   

In [86]:
# Preprocess Data for ML Model: 

# Drop columns with more than 50% missing values
threshold = len(df) * 0.5
df_dropped = df.dropna(thresh=threshold, axis=1)

# Separate numeric and non-numeric columns
numeric_cols = df_dropped.select_dtypes(include=['float64', 'int64']).columns
non_numeric_cols = df_dropped.select_dtypes(exclude=['float64', 'int64']).columns

# Apply mean imputation to numeric columns
data_imputed = df_dropped.copy()
data_imputed[numeric_cols] = data_imputed[numeric_cols].fillna(data_imputed[numeric_cols].mean())

# Imputation for columns with fewer missing values
#data_imputed = df_dropped.fillna(df_dropped.mean())

# Create binary columns indicating whether the original data was missing
missing_indicator = df_dropped.isnull().astype(int)
missing_indicator.columns = [f"{col}_MISSING" for col in missing_indicator.columns]

# Combine the imputed data with the missing indicators
df_final = pd.concat([data_imputed, missing_indicator], axis=1)

# Identify non-numeric columns that need encoding 
non_numeric_cols = ['AGE_PERCENTIL', 'GENDER', 'WINDOW']
# Apply one-hot encoding to these columns
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_features = pd.DataFrame(encoder.fit_transform(df_final[non_numeric_cols]),
                                columns=encoder.get_feature_names_out(non_numeric_cols)) 
# Drop original non-numeric columns from the dataset and concatenate encoded features
df_encoded = pd.concat([df_final.drop(columns=non_numeric_cols), encoded_features], axis=1)

# Display the shape of the final dataset and the first few rows to verify
df_final_shape = df_final.shape
df_final_head = df_final.head()

df_final_shape, df_final_head

# Select features and target variable
X = df_encoded.drop(columns=['ICU', 'PATIENT_VISIT_IDENTIFIER', 'ICU_MISSING'])
y = df_encoded['ICU']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Model Selection: Using Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)
classification_rep = classification_report(y_test, y_pred)

#accuracy, roc_auc, classification_rep
print("\nAccuracy Score:")
print(accuracy)
print("\nROC AUC Score:")
print(roc_auc)
print("\nClassification Report:")
print(classification_rep)



Accuracy Score:
0.8615916955017301

ROC AUC Score:
0.8823684789786485

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       413
           1       0.83      0.64      0.73       165

    accuracy                           0.86       578
   macro avg       0.85      0.80      0.82       578
weighted avg       0.86      0.86      0.86       578

