In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from data.data_loader import DataLoader
from preprocessing.ed_stay_preprocessor import EDStayPreprocessor
from preprocessing.admissions_preprocessor import AdmissionsPreprocessor
from preprocessing.diagnosis_preprocessor import DiagnosisPreprocessor
from preprocessing.triage_preprocessor import TriagePreprocessor
from preprocessing.vitalsigns_preprocessor import VitalSignsPreprocessor
from data.data_merger import DataMerger
from data.data_loader import DataLoader

# Import necessary libraries
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


In [4]:
# Define file paths
file_paths = {
    'edstays': "../ED/edstays.csv",
    'admissions': "../HOSP/admissions.csv",
    'transfers': "../HOSP/transfers.csv",
    'diagnosis': "../ED/diagnosis.csv",
    'triage': "../ED/triage.csv",
    'vitalsigns': '../ED/vitalsign.csv',
    'medrecon': '../ED/medrecon.csv',
    'disease_categories': "../Data/disease_categories.csv"
}

In [5]:
# Load data
ed_stays = DataLoader.load_csv(file_paths['edstays'])
admissions = DataLoader.load_csv(file_paths['admissions'])
diagnosis = DataLoader.load_csv(file_paths['diagnosis'])
triage = DataLoader.load_csv(file_paths['triage'])
vitalsigns = DataLoader.load_csv(file_paths['vitalsigns'])

In [6]:
ed_preprocessor = EDStayPreprocessor()
ed_stays_preprocessed = ed_preprocessor.preprocess(ed_stays)

admissions_preprocessor = AdmissionsPreprocessor()
admissions_preprocessed = admissions_preprocessor.preprocess(admissions)

In [7]:
diagnosis_preprocessor = DiagnosisPreprocessor()
diagnosis_preprocessed = diagnosis_preprocessor.preprocess(
    diagnosis,
    disease_categories_path=file_paths['disease_categories']
)

triage_preprocessor = TriagePreprocessor()
triage_preprocessed = triage_preprocessor.preprocess(triage)

vitalsigns_preprocessor = VitalSignsPreprocessor()
vitalsigns_preprocessed = vitalsigns_preprocessor.preprocess(vitalsigns)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category_code'] = df['icd_code'].str[:3]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['letter_code'] = df['category_code'].str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category_code'] = df['category_code'].astype('category')
A value is trying to be set on a copy of a slice fro

In [8]:
ed_stays_preprocessed.head()

Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,gender,race,arrival_transport,disposition,ed_los_hours
0,10000032,22595853.0,33258284,2180-05-06 19:17:00,2180-05-06 23:30:00,F,White/European Descent,AMBULANCE,ADMITTED,4.216667
1,10000032,22841357.0,38112554,2180-06-26 15:54:00,2180-06-26 21:31:00,F,White/European Descent,AMBULANCE,ADMITTED,5.616667
2,10000032,25742920.0,35968195,2180-08-05 20:58:00,2180-08-06 01:44:00,F,White/European Descent,AMBULANCE,ADMITTED,4.766667
3,10000032,29079034.0,32952584,2180-07-22 16:24:00,2180-07-23 05:54:00,F,White/European Descent,AMBULANCE,HOME,13.5
4,10000032,29079034.0,39399961,2180-07-23 05:54:00,2180-07-23 14:00:00,F,White/European Descent,AMBULANCE,ADMITTED,8.1


In [9]:
admissions_preprocessed.head()


Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag,admission_los_hours,is_dead
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,NaT,URGENT,TRANSFER FROM HOSPITAL,HOME,Medicaid,English,WIDOWED,White/European Descent,2180-05-06 19:17:00,2180-05-06 23:30:00,0,18.866667,False
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,NaT,EW EMER.,EMERGENCY ROOM,HOME,Medicaid,English,WIDOWED,White/European Descent,2180-06-26 15:54:00,2180-06-26 21:31:00,0,24.366667,False
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,NaT,EW EMER.,EMERGENCY ROOM,HOSPICE,Medicaid,English,WIDOWED,White/European Descent,2180-08-05 20:58:00,2180-08-06 01:44:00,0,42.1,False
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,NaT,EW EMER.,EMERGENCY ROOM,HOME,Medicaid,English,WIDOWED,White/European Descent,2180-07-23 05:54:00,2180-07-23 14:00:00,0,53.333333,False
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,NaT,EU OBSERVATION,EMERGENCY ROOM,,,English,SINGLE,White/European Descent,2160-03-03 21:55:00,2160-03-04 06:26:00,0,7.166667,False


In [10]:
triage_preprocessed.head()

Unnamed: 0,subject_id,stay_id,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,topic,topic_label
0,10000032,32952584,97.8,87.0,14.0,97.0,71.0,43.0,7,2,2,Respiratory & Trauma Symptoms
1,10000032,33258284,98.4,70.0,16.0,97.0,106.0,63.0,0,3,1,General Pain & Weakness
2,10000032,35968195,99.4,105.0,18.0,96.0,106.0,57.0,10,3,5,Limb & Head Pain
3,10000032,38112554,98.9,88.0,18.0,97.0,116.0,88.0,10,3,1,General Pain & Weakness
4,10000032,39399961,98.7,77.0,16.0,98.0,96.0,50.0,13,2,1,General Pain & Weakness


In [11]:
admissions_preprocessed.shape

(546028, 17)

In [12]:
# ED patients view
ed_patients_df = ed_stays_preprocessed.merge(admissions_preprocessed, on=['subject_id', 'hadm_id'], how='inner').merge(diagnosis_preprocessed, on=['subject_id', 'stay_id'], how='inner').merge(triage_preprocessed, on=['subject_id', 'stay_id'], how='inner')
ed_patients_df.head()

Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,gender,race_x,arrival_transport,disposition,ed_los_hours,...,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,topic,topic_label
0,10000084,23052089.0,35203156,2160-11-20 20:36:00,2160-11-21 03:20:00,M,White/European Descent,WALK IN,ADMITTED,6.733333,...,97.5,78.0,16.0,100.0,114.0,71.0,0,2,4,Abdominal & Chest Pain
1,10000084,23052089.0,35203156,2160-11-20 20:36:00,2160-11-21 03:20:00,M,White/European Descent,WALK IN,ADMITTED,6.733333,...,97.5,78.0,16.0,100.0,114.0,71.0,0,2,4,Abdominal & Chest Pain
2,10000084,23052089.0,35203156,2160-11-20 20:36:00,2160-11-21 03:20:00,M,White/European Descent,WALK IN,ADMITTED,6.733333,...,97.5,78.0,16.0,100.0,114.0,71.0,0,2,4,Abdominal & Chest Pain
3,10000084,23052089.0,35203156,2160-11-20 20:36:00,2160-11-21 03:20:00,M,White/European Descent,WALK IN,ADMITTED,6.733333,...,97.5,78.0,16.0,100.0,114.0,71.0,0,2,4,Abdominal & Chest Pain
4,10000084,23052089.0,35203156,2160-11-20 20:36:00,2160-11-21 03:20:00,M,White/European Descent,WALK IN,ADMITTED,6.733333,...,97.5,78.0,16.0,100.0,114.0,71.0,0,2,4,Abdominal & Chest Pain


In [13]:
ed_patients_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1469965 entries, 0 to 1469964
Data columns (total 39 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   subject_id            1469965 non-null  int64         
 1   hadm_id               1469965 non-null  float64       
 2   stay_id               1469965 non-null  int64         
 3   intime                1469965 non-null  datetime64[ns]
 4   outtime               1469965 non-null  datetime64[ns]
 5   gender                1469965 non-null  category      
 6   race_x                1469965 non-null  category      
 7   arrival_transport     1469965 non-null  category      
 8   disposition           1469965 non-null  category      
 9   ed_los_hours          1469965 non-null  float64       
 10  admittime             1469965 non-null  datetime64[ns]
 11  dischtime             1469965 non-null  datetime64[ns]
 12  deathtime             39106 non-null    da

In [14]:
X_cols = ['gender', 'race_x', 'arrival_transport', 'disposition', 'ed_los_hours', 'admission_type', 'admission_location',
          'insurance', 'language', 'marital_status', 'is_dead', 'category_code', 'category', 'temperature', 'heartrate', 'resprate', 'o2sat',
          'sbp', 'dbp', 'acuity', 'topic']

# Define the target column
y_col = 'is_dead'

X = ed_patients_df[X_cols]
y = ed_patients_df[y_col]

X


Unnamed: 0,gender,race_x,arrival_transport,disposition,ed_los_hours,admission_type,admission_location,insurance,language,marital_status,...,category_code,category,temperature,heartrate,resprate,o2sat,sbp,dbp,acuity,topic
0,M,White/European Descent,WALK IN,ADMITTED,6.733333,EW EMER.,WALK-IN/SELF REFERRAL,Medicare,English,MARRIED,...,R53,Other,97.5,78.0,16.0,100.0,114.0,71.0,2,4
1,M,White/European Descent,WALK IN,ADMITTED,6.733333,EW EMER.,WALK-IN/SELF REFERRAL,Medicare,English,MARRIED,...,G20,Nervous System,97.5,78.0,16.0,100.0,114.0,71.0,2,4
2,M,White/European Descent,WALK IN,ADMITTED,6.733333,EW EMER.,WALK-IN/SELF REFERRAL,Medicare,English,MARRIED,...,G20,Nervous System,97.5,78.0,16.0,100.0,114.0,71.0,2,4
3,M,White/European Descent,WALK IN,ADMITTED,6.733333,EW EMER.,WALK-IN/SELF REFERRAL,Medicare,English,MARRIED,...,G20,Nervous System,97.5,78.0,16.0,100.0,114.0,71.0,2,4
4,M,White/European Descent,WALK IN,ADMITTED,6.733333,EW EMER.,WALK-IN/SELF REFERRAL,Medicare,English,MARRIED,...,G20,Nervous System,97.5,78.0,16.0,100.0,114.0,71.0,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1469960,F,White/European Descent,WALK IN,ADMITTED,24.266667,EW EMER.,PHYSICIAN REFERRAL,Medicaid,English,SINGLE,...,K63,Digestive System,98.1,83.0,18.0,100.0,107.0,75.0,2,4
1469961,F,White/European Descent,WALK IN,ADMITTED,24.266667,EW EMER.,PHYSICIAN REFERRAL,Medicaid,English,SINGLE,...,K63,Digestive System,98.1,83.0,18.0,100.0,107.0,75.0,2,4
1469962,F,White/European Descent,WALK IN,ADMITTED,24.266667,EW EMER.,PHYSICIAN REFERRAL,Medicaid,English,SINGLE,...,K63,Digestive System,98.1,83.0,18.0,100.0,107.0,75.0,2,4
1469963,F,White/European Descent,WALK IN,ADMITTED,24.266667,EW EMER.,PHYSICIAN REFERRAL,Medicaid,English,SINGLE,...,K63,Digestive System,98.1,83.0,18.0,100.0,107.0,75.0,2,4


In [15]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify numeric and categorical columns
numeric_features = ['ed_los_hours', 'temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp', 'acuity']
categorical_features = ['gender', 'race_x', 'arrival_transport', 'disposition', 'admission_type', 'admission_location', 
                        'insurance', 'language', 'marital_status', 'category_code', 'category', 'topic']

# Create preprocessing steps for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit the preprocessor on the training data and transform both training and test data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Convert to dense arrays if needed (e.g., for certain models)
X_train_preprocessed = X_train_preprocessed.toarray()
X_test_preprocessed = X_test_preprocessed.toarray()

print("Preprocessed training data shape:", X_train_preprocessed.shape)
print("Preprocessed test data shape:", X_test_preprocessed.shape)



Preprocessed training data shape: (1175972, 1209)
Preprocessed test data shape: (293993, 1209)


In [16]:
# Import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize and train the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rf_classifier.fit(X_train_preprocessed, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test_preprocessed)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Classifier Accuracy: {accuracy:.4f}")

# Generate and print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance
feature_importance = rf_classifier.feature_importances_
feature_names = (numeric_features + 
                 preprocessor.named_transformers_['cat']
                 .named_steps['onehot']
                 .get_feature_names(categorical_features).tolist())

# Sort features by importance
sorted_idx = feature_importance.argsort()
top_10_idx = sorted_idx[-10:]

print("\nTop 10 Most Important Features:")
for idx in top_10_idx[::-1]:
    print(f"{feature_names[idx]}: {feature_importance[idx]:.4f}")


Random Forest Classifier Accuracy: 0.9735

Classification Report:
              precision    recall  f1-score   support

       False       0.97      1.00      0.99    286189
        True       0.00      0.00      0.00      7804

    accuracy                           0.97    293993
   macro avg       0.49      0.50      0.49    293993
weighted avg       0.95      0.97      0.96    293993



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


AttributeError: 'OneHotEncoder' object has no attribute 'get_feature_names'

: 