In [3]:
from google.colab import drive # Loading the Dataset
from datetime import datetime # Date & Time Manipulation
import pytz #Timezone Calculations

import pandas as pd
import os
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer # Pipeline
from sklearn.impute import SimpleImputer # Pipeline
from sklearn.ensemble import RandomForestClassifier # Pipeline
from sklearn.linear_model import LogisticRegression # Pipeline
from sklearn.pipeline import Pipeline, make_pipeline # Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler # Pipeline

import numpy as np # Data Imputation
from sklearn.impute import KNNImputer # KNN Imputation
from imblearn.over_sampling import SMOTE # Data Imputation
from collections import Counter # Data Imputation

from scipy.stats import chi2 # Checking for Outliers by Mahalanobis Distance

drive.mount('/content/drive')
file_path = '/content/drive/My Drive/COMP7015 AI Project Group/mimiciv_traindata.csv'
print ('\n-------------------------------------------------------------------')
df = pd.read_csv(file_path)

#print(df.head())
#print ('\n-------------------------------------------------------------------')
#print(df.mortality.value_counts())
#print ('\n-------------------------------------------------------------------')
#print(df.mortality.value_counts(normalize=True))

# KNN Imputation
knn_imputer = KNNImputer(n_neighbors=3)
df_imputed = knn_imputer.fit_transform(df)

# Convert the result back to a DataFrame
df_imputed = pd.DataFrame(df_imputed, columns=df.columns)

# Features & Target
X = df_imputed[[
    'Fraction inspired oxygen_mean', 'Fraction inspired oxygen_min', 'Fraction inspired oxygen_max', \
    'Glucose_mean', 'Glucose_min', 'Glucose_max', \
    'Heart Rate_mean', 'Heart Rate_min', 'Heart Rate_max',\
    'Mean blood pressure_mean', 'Mean blood pressure_min', 'Mean blood pressure_max', \
    'Diastolic blood pressure_mean', 'Diastolic blood pressure_min', 'Diastolic blood pressure_max',\
    'Systolic blood pressure_mean', 'Systolic blood pressure_min', 'Systolic blood pressure_max',\
    'Oxygen saturation_mean', 'Oxygen saturation_min', 'Oxygen saturation_max',\
    'Respiratory rate_mean', 'Respiratory rate_min', 'Respiratory rate_max',\
    'Temperature_mean', 'Temperature_min', 'Temperature_max',\
    'Weight_mean', 'Weight_min', 'Weight_max',\
    'pH_mean', 'pH_min', 'pH_max']]
y = df_imputed['mortality']

# Applying SMOTE
smote = SMOTE(random_state = 42)
X_resampled, y_resampled = smote.fit_resample(X, y)
print('Original Dataset Shape:', y.value_counts())
print('\nResampled Dataset Shape:', y_resampled.value_counts())

# Create a pipeline
pipe = Pipeline([
      ('scaler', StandardScaler()), ('classifier', RandomForestClassifier())
                ])

# Fit the pipeline to the resampled training data
pipe.fit(X_resampled, y_resampled)



print ('\n-------------------------------------------------------------------')
# Calculate the mean and covariance matrix for features
mean_vector = X_resampled.mean().values
cov_matrix = X_resampled.cov().values

# Inverse of the covariance matrix
cov_matrix_inv = np.linalg.inv(cov_matrix)

# Function ot calculate Mahalanobis distance
def mah_dis(X, mean, cov_inv):
    X = np.asarray(mean, dtype = np.float64)
    mean = np.asarray(mean, dtype = np.float64)
    diff = X - mean
    return np.sqrt(np.dot(np.dot(diff, cov_inv), diff.T))

# Calculate Mahalanobis distances for each point in the resampled data
mah_distances = np.array([mah_dis(row.values, mean_vector, cov_matrix_inv) for _, row in X_resampled.iterrows()])

# Add Mahalanobis distances to the resampled DataFrame
resampled_df = pd.DataFrame(X_resampled, columns = X.columns)
resampled_df['mortality'] = y_resampled
resampled_df['mah_dis'] = mah_distances

# Determine the threshold for outliers (95% confidence level)
threshold = chi2.ppf(0.95, df=len(mean_vector))

# Checking for Outliers in the Resampled Data
resampled_df['is_outlier'] = resampled_df['mah_dis'] > np.sqrt(threshold)

# Display outliers along  with the target variable
outliers = resampled_df[resampled_df['is_outlier']]
print("\nOutliers detected based on Mahalanobis distance in the resampleddata:")
print(outliers[[*X.columns, 'mortality', 'mah_dis']])

print(f'First few rows of x_resampled:{X_resampled.head(10)}')
print(f'\nCovariance Matrix:{cov_matrix}')
print(f'\nMeanVector: {mean_vector}')
print(f'\nFirst few rows of Mah Distances: {mah_distances[:10]}')

print ('\n-------------------------------------------------------------------')
print(f'Hong Kong Current Day & Time: {datetime.now(pytz.timezone("Asia/Hong_Kong")).strftime("%d-%m-%Y %H:%M")}')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

-------------------------------------------------------------------
Original Dataset Shape: mortality
0.0    4232
1.0    1415
Name: count, dtype: int64

Resampled Dataset Shape: mortality
1.0    4232
0.0    4232
Name: count, dtype: int64

-------------------------------------------------------------------

Outliers detected based on Mahalanobis distance in the resampleddata:
Empty DataFrame
Columns: [Fraction inspired oxygen_mean, Fraction inspired oxygen_min, Fraction inspired oxygen_max, Glucose_mean, Glucose_min, Glucose_max, Heart Rate_mean, Heart Rate_min, Heart Rate_max, Mean blood pressure_mean, Mean blood pressure_min, Mean blood pressure_max, Diastolic blood pressure_mean, Diastolic blood pressure_min, Diastolic blood pressure_max, Systolic blood pressure_mean, Systolic blood pressure_min, Systolic blood pressure_max, Oxygen saturation_mean, Oxygen 