In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Notebook rich-display helper
from IPython.display import display

# NOTE(review): this ignores every warning category for the rest of the session,
# so deprecation notices from pandas / scikit-learn will not be shown either.
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Access data using Google BigQuery.
# The google.colab import means this cell is written for a Colab runtime.
from google.colab import auth
from google.cloud import bigquery

# Authenticate the current Colab user.
auth.authenticate_user()

# Set up environment variables.
# `project_id` is reused by the BigQuery client and the query helpers below;
# it is also exported as GOOGLE_CLOUD_PROJECT for the current process.
project_id = 'ml-applicationfor-health'
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id

In [None]:
# Create a BigQuery client bound to the configured project and make sure the
# group's working dataset exists.
from google.cloud.exceptions import NotFound

client = bigquery.Client(project=project_id)

dataset_id = "comp90089_group_assignment"

# Only a "not found" response should trigger dataset creation. Any other failure
# (expired credentials, missing permission, network error, manual interrupt) now
# propagates instead of being caught by a bare `except:` and misreported as a
# missing dataset.
try:
  client.get_dataset(dataset_id)  # Make an API request.
  print("Dataset {} already exists".format(dataset_id))
except NotFound:
  client.create_dataset(dataset_id)  # Make an API request.
  print("Dataset {} is not found, new one is created".format(dataset_id))

Dataset comp90089_group_assignment already exists


In [None]:
# Helper for pulling BigQuery results into pandas.
def run_query(query, project_id=project_id):
  """Run a standard-SQL query on BigQuery and return the result as a DataFrame.

  Args:
    query: SQL text to execute.
    project_id: Project used for the request. The default is the value of the
      notebook-level `project_id` at the time this cell is executed.

  Returns:
    Whatever `pd.io.gbq.read_gbq` returns for the query.
  """
  gbq_options = dict(project_id=project_id, dialect='standard')
  return pd.io.gbq.read_gbq(query, **gbq_options)

# Name of the source dataset
dataset = 'mimiciv'

In [None]:
# Persist a dataframe as a table in the group's BigQuery dataset
def save_query(df, table_name):
    """Upload `df` to `comp90089_group_assignment.<table_name>`.

    An existing table with the same name is replaced. The upload uses the
    notebook-level `project_id`.

    Args:
        df: DataFrame to upload.
        table_name: Name of the destination table inside the dataset.
    """
    destination = "comp90089_group_assignment.{}".format(table_name)
    df.to_gbq(destination, project_id=project_id, if_exists='replace')
    print("table saved")

In [None]:
# Load every column of the `ami_features_24h` table from the group's dataset.
query = """
SELECT *
FROM `comp90089_group_assignment.ami_features_24h`
"""
df = run_query(query)
# List the returned columns as a quick schema check.
print(df.columns)
# Optional (disabled): export the table to CSV and download it from Colab.
#df.to_csv('ami_features_24h.csv', index=False)
#from google.colab import files
#files.download('ami_features_24h.csv')

Index(['subject_id', 'hadm_id', 'stay_id', 'hospital_expire_flag', 'age',
       'gender', 'race', 'hypertension', 'diabetes', 'copd', 'ckd',
       'heart_failure', 'hr_mean', 'hr_min', 'hr_max', 'creatinine', 'lactate',
       'glucose', 'sodium', 'potassium', 'vasopressor_use', 'icu_expire_flag',
       'death_30d_post_discharge'],
      dtype='object')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Data pre-processing

In [None]:
# NOTE: development-only cell -- comment it out before submission.
# Mounts Google Drive and re-assigns `df` from a CSV file, replacing the
# DataFrame loaded from BigQuery in the previous cell.
# (Presumably the CSV is an export of the same table -- confirm.)
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/COMP90089/Group Assignment/ami_features_24h.csv')

Mounted at /content/drive


In [None]:
# Count missing entries per column, most incomplete columns first
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

Unnamed: 0,0
lactate,2017
vasopressor_use,663
glucose,31
creatinine,23
potassium,19
sodium,19
hr_min,8
hr_mean,8
hr_max,8
age,0


In [None]:
# use median to replace
# The filled column is assigned back to `df` rather than calling
# `df[col].fillna(..., inplace=True)`: the in-place form acts on an intermediate
# Series (chained assignment), is deprecated in pandas 2.x, and leaves `df`
# untouched once Copy-on-Write is active.
# NOTE(review): the medians are computed over all rows, i.e. before the
# train/test splits made later in the notebook.
median_imputed_cols = ['glucose', 'creatinine', 'potassium', 'sodium', 'hr_mean', 'hr_min', 'hr_max']
for col in median_imputed_cols:
    df[col] = df[col].fillna(df[col].median())

# vasopressor_use: put 0 to show no use
df['vasopressor_use'] = df['vasopressor_use'].fillna(0)

# too many lactate values missing, remove
# errors='ignore' keeps this cell re-runnable after the column is already gone.
df = df.drop(columns=['lactate'], errors='ignore')

# checking missing values
df.isnull().sum().sort_values(ascending=False)


Unnamed: 0,0
subject_id,0
hadm_id,0
stay_id,0
hospital_expire_flag,0
age,0
gender,0
race,0
hypertension,0
diabetes,0
copd,0


In [None]:
# Continuous measurements to standardise (zero mean, unit variance)
continuous_features = [
    'hr_mean', 'hr_min', 'hr_max',
    'creatinine', 'glucose',
    'sodium', 'potassium',
]

# NOTE(review): this scaler is fitted on every row of `df`, i.e. before the
# train/test splits below; each modelling section fits its own scaler again.
scaler = StandardScaler()
scaler.fit(df[continuous_features])
df[continuous_features] = scaler.transform(df[continuous_features])
df

Unnamed: 0,subject_id,hadm_id,stay_id,hospital_expire_flag,age,gender,race,hypertension,diabetes,copd,...,hr_mean,hr_min,hr_max,creatinine,glucose,sodium,potassium,vasopressor_use,icu_expire_flag,death_30d_post_discharge
0,12012181,26862482,39532390,0,0,F,WHITE,1,0,0,...,1.652390,2.240709,0.411594,-0.350878,-0.295980,0.015247,-0.157682,0.0,0,0
1,14204585,21383476,36095793,0,1,F,BLACK/CAPE VERDEAN,0,1,0,...,0.080195,0.677263,-0.488631,-0.350878,-0.295980,0.015247,-0.157682,0.0,0,0
2,10313200,24896438,32380148,0,0,F,WHITE - OTHER EUROPEAN,0,1,1,...,-0.519808,0.075938,-0.938744,-0.350878,-0.295980,0.015247,-0.157682,0.0,0,0
3,13680152,22145795,36046814,0,0,M,ASIAN,0,0,0,...,-0.943744,-0.705785,-0.983756,-0.350878,-0.295980,0.015247,-0.157682,0.0,0,0
4,17681159,23316013,34182123,0,4,M,WHITE,0,0,0,...,-0.219626,-0.044327,-0.488631,-0.350878,-0.295980,0.015247,-0.157682,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6592,15589457,24539514,32403533,0,1,F,BLACK/AFRICAN AMERICAN,1,0,1,...,-1.202748,-0.886182,-0.533643,0.324198,0.279266,5.332575,-1.243285,1.0,0,1
6593,12363746,29652401,33655764,0,9,M,BLACK/AFRICAN AMERICAN,1,1,0,...,2.092355,0.857661,2.752181,-0.310374,0.918227,5.359928,-1.221398,1.0,0,0
6594,16482991,23627270,37471366,0,0,F,BLACK/AFRICAN AMERICAN,0,0,0,...,0.062468,0.015806,-0.173552,0.243188,-0.725595,5.441985,-2.118772,0.0,0,0
6595,12392956,29708494,30502944,0,0,F,WHITE,1,1,0,...,1.039454,0.316468,2.842204,-0.026842,4.123953,6.864316,0.770334,1.0,0,1


In [None]:
def simplify_race(race_str):
    """Collapse a detailed race label into one of six coarse groups.

    Matching is case-insensitive and substring-based. Groups are tried in a
    fixed order and the first group with a matching keyword wins.

    Args:
        race_str: Raw race label, or a missing value (None / NaN).

    Returns:
        One of 'White', 'Black', 'Asian', 'Hispanic', 'Other' or 'Unknown'.
        Missing values give 'Unknown'; labels that match no keyword give 'Other'.
    """
    if pd.isnull(race_str):
        return 'Unknown'

    # Ordered (group, keywords) pairs -- earlier entries take precedence.
    keyword_groups = (
        ('White', ('WHITE',)),
        ('Black', ('BLACK',)),
        ('Asian', ('ASIAN',)),
        ('Hispanic', ('HISPANIC', 'LATINO')),
        ('Other', ('PORTUGUESE',
                   'AMERICAN INDIAN',
                   'NATIVE HAWAIIAN',
                   'SOUTH AMERICAN',
                   'MULTIPLE RACE')),
        ('Unknown', ('UNKNOWN',
                     'UNABLE TO OBTAIN',
                     'PATIENT DECLINED')),
    )

    label = race_str.upper()
    for group, keywords in keyword_groups:
        if any(keyword in label for keyword in keywords):
            return group

    return 'Other'  # catch-all fallback


In [None]:
# Derive the coarse race group for every row, then show the group sizes
race_groups = df['race'].apply(simplify_race)
df['race_grouped'] = race_groups
print(df['race_grouped'].value_counts())

race_grouped
White       4165
Unknown     1164
Black        642
Other        247
Asian        198
Hispanic     181
Name: count, dtype: int64


In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so it acts as the reference category
categorical_cols = ['gender', 'race_grouped']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


In [None]:
# Inspect the frame after one-hot encoding (the original `race` text column is still present)
df

Unnamed: 0,subject_id,hadm_id,stay_id,hospital_expire_flag,age,race,hypertension,diabetes,copd,ckd,...,potassium,vasopressor_use,icu_expire_flag,death_30d_post_discharge,gender_M,race_grouped_Black,race_grouped_Hispanic,race_grouped_Other,race_grouped_Unknown,race_grouped_White
0,12012181,26862482,39532390,0,0,WHITE,1,0,0,0,...,-0.157682,0.0,0,0,False,False,False,False,False,True
1,14204585,21383476,36095793,0,1,BLACK/CAPE VERDEAN,0,1,0,1,...,-0.157682,0.0,0,0,False,True,False,False,False,False
2,10313200,24896438,32380148,0,0,WHITE - OTHER EUROPEAN,0,1,1,1,...,-0.157682,0.0,0,0,False,False,False,False,False,True
3,13680152,22145795,36046814,0,0,ASIAN,0,0,0,0,...,-0.157682,0.0,0,0,True,False,False,False,False,False
4,17681159,23316013,34182123,0,4,WHITE,0,0,0,1,...,-0.157682,0.0,0,0,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6592,15589457,24539514,32403533,0,1,BLACK/AFRICAN AMERICAN,1,0,1,0,...,-1.243285,1.0,0,1,False,True,False,False,False,False
6593,12363746,29652401,33655764,0,9,BLACK/AFRICAN AMERICAN,1,1,0,0,...,-1.221398,1.0,0,0,True,True,False,False,False,False
6594,16482991,23627270,37471366,0,0,BLACK/AFRICAN AMERICAN,0,0,0,1,...,-2.118772,0.0,0,0,False,True,False,False,False,False
6595,12392956,29708494,30502944,0,0,WHITE,1,1,0,0,...,0.770334,1.0,0,1,False,False,False,False,False,True


## **3.1 ICU mortality**

In [None]:
# Columns that must not be used as predictors: the three outcome flags,
# the identifiers and the raw race text (its grouped dummies are kept)
icu_excluded_cols = ['icu_expire_flag', 'hospital_expire_flag', 'death_30d_post_discharge',
                     'subject_id', 'hadm_id', 'stay_id', 'race']
X_icu = df.drop(columns=icu_excluded_cols)
y_icu = df['icu_expire_flag']

# Stratified 80/20 split on the ICU mortality label
X_train_icu, X_test_icu, y_train_icu, y_test_icu = train_test_split(
    X_icu, y_icu, test_size=0.2, random_state=42, stratify=y_icu
)

# standardise continuous variable:
# int64/float64 columns whose training values are not exactly the pair {0, 1}
numeric_candidates = X_train_icu.select_dtypes(include=['int64', 'float64']).columns
num_cols = [col for col in numeric_candidates if set(X_train_icu[col].unique()) != {0, 1}]

# Fit on the training rows only, then apply the same transform to both splits
scaler = StandardScaler()
scaler.fit(X_train_icu[num_cols])
X_train_icu[num_cols] = scaler.transform(X_train_icu[num_cols])
X_test_icu[num_cols] = scaler.transform(X_test_icu[num_cols])

print("ICU mortality positive rate:", y_train_icu.mean())


ICU mortality positive rate: 0.13682016297138525


## **3.2 Hospital mortality**

In [None]:
# Check how balanced the hospital-mortality label is (proportion of each class)
df['hospital_expire_flag'].value_counts(normalize=True)


Unnamed: 0_level_0,proportion
hospital_expire_flag,Unnamed: 1_level_1
0,0.800364
1,0.199636


In [None]:
# Predictors exclude the three outcome flags, the identifiers and the raw race text
hos_excluded_cols = ['icu_expire_flag', 'hospital_expire_flag', 'death_30d_post_discharge',
                     'subject_id', 'hadm_id', 'stay_id', 'race']
X = df.drop(columns=hos_excluded_cols)
y = df['hospital_expire_flag']

# 80% training, 20% test (stratified on the label, fixed seed)
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train_hos, X_test_hos, y_train_hos, y_test_hos = train_test_split(X, y, **split_options)

# Report split sizes and label prevalence
split_summary = (
    ("Train shape:", X_train_hos.shape),
    ("Test shape:", X_test_hos.shape),
    ("Train positive rate:", y_train_hos.mean()),
    ("Test positive rate:", y_test_hos.mean()),
)
for caption, value in split_summary:
    print(caption, value)

Train shape: (5277, 20)
Test shape: (1320, 20)
Train positive rate: 0.19954519613416713
Test positive rate: 0.2


In [None]:
# Columns to standardise: int64/float64 columns whose training values are
# not exactly the pair {0, 1}
numeric_candidates = X_train_hos.select_dtypes(include=['int64', 'float64']).columns
num_cols = [col for col in numeric_candidates if set(X_train_hos[col].unique()) != {0, 1}]

# Fit on the training rows only, then apply the same transform to both splits
scaler = StandardScaler()
scaler.fit(X_train_hos[num_cols])
X_train_hos[num_cols] = scaler.transform(X_train_hos[num_cols])
X_test_hos[num_cols] = scaler.transform(X_test_hos[num_cols])


## **3.3 Post-discharge mortality**

In [None]:
# Target: death within 30 days after discharge
y_post = df['death_30d_post_discharge']
# Predictors exclude the three outcome flags, the identifiers and the raw race text
X_post = df.drop(columns=['icu_expire_flag', 'hospital_expire_flag', 'death_30d_post_discharge','subject_id','hadm_id','stay_id','race'])

X_train_post, X_test_post, y_train_post, y_test_post = train_test_split(
    X_post, y_post, test_size=0.2, random_state=42, stratify=y_post
)

# Derive the columns to scale from this section's own training split and fit a
# fresh scaler, instead of reusing `num_cols` / `scaler` left over from the
# hospital-mortality cells. This keeps the section runnable on its own and
# matches how the ICU section selects its columns.
num_cols = X_train_post.select_dtypes(include=['int64', 'float64']).columns
num_cols = [col for col in num_cols if set(X_train_post[col].unique()) != {0,1}]

scaler = StandardScaler()
X_train_post[num_cols] = scaler.fit_transform(X_train_post[num_cols])
X_test_post[num_cols] = scaler.transform(X_test_post[num_cols])

print("Post-discharge 30-day mortality positive rate:", y_train_post.mean())


Post-discharge 30-day mortality positive rate: 0.07238961531173015
