In [6]:
from google.colab import drive # Loading the Dataset
from datetime import datetime # Date & Time Manipulation
import pytz #Timezone Calculations

import pandas as pd
import os
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

from sklearn.compose import ColumnTransformer # Pipeline
from sklearn.impute import SimpleImputer # Pipeline
from sklearn.ensemble import RandomForestClassifier, IsolationForest # Pipeline
from sklearn.linear_model import LogisticRegression # Pipeline
from sklearn.pipeline import Pipeline, make_pipeline # Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler # Pipeline

import numpy as np # Data Imputation
from sklearn.impute import KNNImputer # KNN Imputation
from imblearn.over_sampling import SMOTE # Data Imputation
from collections import Counter # Data Imputation

from scipy.stats import chi2 # Checking for Outliers by Mahalanobis Distance

drive.mount('/content/drive')
file_path = '/content/drive/My Drive/COMP7015 AI Project Group/mimiciv_traindata.csv'
print ('\n-------------------------------------------------------------------')
df = pd.read_csv(file_path)

# KNN Imputation (Imputation for Missing Value)
knn_imputer = KNNImputer(n_neighbors=3)
df_imputed = knn_imputer.fit_transform(df)

# Convert the result back to a DataFrame
df_imputed = pd.DataFrame(df_imputed, columns=df.columns)

# Features & Target
X = df_imputed[[
    'Fraction inspired oxygen_mean', 'Fraction inspired oxygen_min', 'Fraction inspired oxygen_max', \
    'Glucose_mean', 'Glucose_min', 'Glucose_max', \
    'Heart Rate_mean', 'Heart Rate_min', 'Heart Rate_max',\
    'Mean blood pressure_mean', 'Mean blood pressure_min', 'Mean blood pressure_max', \
    'Diastolic blood pressure_mean', 'Diastolic blood pressure_min', 'Diastolic blood pressure_max',\
    'Systolic blood pressure_mean', 'Systolic blood pressure_min', 'Systolic blood pressure_max',\
    'Oxygen saturation_mean', 'Oxygen saturation_min', 'Oxygen saturation_max',\
    'Respiratory rate_mean', 'Respiratory rate_min', 'Respiratory rate_max',\
    'Temperature_mean', 'Temperature_min', 'Temperature_max',\
    'Weight_mean', 'Weight_min', 'Weight_max',\
    'pH_mean', 'pH_min', 'pH_max']]
y = df_imputed['mortality']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Outlier Detection with Isolation Forest
iso_forest = IsolationForest(contamination=0.1, random_state=42)
iso_forest.fit(X_train)

# Predict Outliers (-1 for outlier, 1 for inlier)
iso_outlier_predictions = iso_forest.predict(X_train)

# Identify Outliers
outliers = X_train[iso_outlier_predictions == -1]

# Display Original Data Size and Outliers Found
print (f'Size of training data: {X_train.shape}')
print (f'Outliers found: {outliers.shape}')

# Remove Outliers from the Training Data
X_train_cleaned = X_train[iso_outlier_predictions == 1]
y_train_cleaned = y_train[iso_outlier_predictions == 1]

print(f'Size of Cleaned X:', X_train_cleaned.shape)

# Applying SMOTE (Oversampling Technique)
smote = SMOTE(random_state = 42)
X_resampled, y_resampled = smote.fit_resample(X_train_cleaned, y_train_cleaned)
print ('\n-------------------------------------------------------------------')
print('Original Dataset Shape:', y_train_cleaned.value_counts())
print('\nResampled Dataset Shape:', y_resampled.value_counts())

# Create a pipeline
pipe = Pipeline([
        ('scaler', StandardScaler()), # Standardizes Features
        ('classifier', RandomForestClassifier(class_weight='balanced'))
                ])

# Fit the pipeline to the resampled training data
pipe.fit(X_resampled, y_resampled)

# Make predictions on the test set
y_pred = pipe.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print ('\n-------------------------------------------------------------------')
print(f'\nAccuracy with class weights balanced: {accuracy:.2f}')

# Print classification report (includes precision, recall, and F1 score)
print ('\n-------------------------------------------------------------------')
print('\nClassification Report with class weights balanced (0 - Alive / 1 - Dead):')
print('\n')
print(classification_report(y_test, y_pred))

# Calculate and print ROC-AUC score
y_pred_proba = pipe.predict_proba(X_test)[:, 1]  # Get predicted probabilities for the positive class
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f'\nROC-AUC Score with class weights balanced: {roc_auc:.2f}')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

-------------------------------------------------------------------
Size of training data: (4517, 33)
Outliers found: (452, 33)
Size of Cleaned X: (4065, 33)

-------------------------------------------------------------------
Original Dataset Shape: mortality
0.0    3084
1.0     981
Name: count, dtype: int64

Resampled Dataset Shape: mortality
1.0    3084
0.0    3084
Name: count, dtype: int64

-------------------------------------------------------------------

Accuracy with class weights balanced: 0.74

-------------------------------------------------------------------

Classification Report with class weights balanced (0 - Alive / 1 - Dead):


              precision    recall  f1-score   support

         0.0       0.80      0.87      0.83       850
         1.0       0.45      0.33      0.38       280

    accuracy                           0.74      1