In [35]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import os
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

#plotly imports
#!pip install plotly==5.10.0
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.io as pio
pio.renderers.default = "colab"

# 
from scipy import stats
from scipy.spatial.distance import cdist

# Access data using Google BigQuery.
from google.colab import auth, drive
from google.cloud import bigquery

import warnings
warnings.filterwarnings('ignore')

def enable_plotly_in_cell():
  import IPython
  from plotly.offline import init_notebook_mode
  display(IPython.core.display.HTML('''<script src="/static/components/requirejs/require.js"></script>'''))
  init_notebook_mode(connected=False)

In [36]:
# authenticate
auth.authenticate_user()

# mount google drive
drive.mount('/content/drive')
DATA_DIR = 'drive/MyDrive/COMP90089/group_project/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
# Set up environment variables
project_id = 'comp90089'
if project_id == 'CHANGE-ME':
  raise ValueError('You must change project_id to your GCP project.')
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id

# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
  return pd.io.gbq.read_gbq(
      query,
      project_id=project_id,
      dialect='standard')

# set the dataset
# if you want to use the demo, change this to mimic_demo
dataset = 'mimiciv'

In [38]:
##Module ICU stay with VITAL sign, gender, age
df_Glucose = run_query("""
SELECT 
  LAB.subject_id, LAB.stay_id, LAB.glucose_max, 
  DEMO.anchor_age, DEMO.dod, DEMO.gender, 
  VITAL.dbp_mean, VITAL.sbp_mean, VITAL.glucose_mean, VITAL.heart_rate_mean, 
  VITAL.spo2_mean, VITAL.resp_rate_mean, VITAL.temperature_mean, 
  IND.apsiii, IND.glucose_score, ICUSTAY.los
FROM `physionet-data.mimiciv_derived.first_day_lab` AS LAB
INNER JOIN `physionet-data.mimiciv_hosp.patients` AS DEMO
ON LAB.subject_id = DEMO.subject_id
INNER JOIN `physionet-data.mimiciv_derived.first_day_vitalsign` AS VITAL
ON LAB.stay_id = VITAL.stay_id
INNER JOIN `physionet-data.mimiciv_derived.apsiii` AS IND
ON LAB.stay_id = IND.stay_id
INNER JOIN `physionet-data.mimiciv_icu.icustays` ICUSTAY
ON LAB.stay_id = ICUSTAY.stay_id
WHERE LAB.glucose_max >= 200
""")
df_Glucose.head()

Unnamed: 0,subject_id,stay_id,glucose_max,anchor_age,dod,gender,dbp_mean,sbp_mean,glucose_mean,heart_rate_mean,spo2_mean,resp_rate_mean,temperature_mean,apsiii,glucose_score,los
0,17145022,36446667,256.0,34,NaT,M,66.5,112.166667,210.842105,79.518519,,16.111111,36.31,52,5,1.092477
1,11693046,31636456,414.0,27,NaT,M,,,254.333333,,,,,20,5,10.672859
2,10122346,30908451,273.0,36,NaT,M,,,121.5,,,,,9,3,3.615185
3,18818535,38793668,275.0,29,NaT,F,77.0,156.730769,282.857143,93.16,,22.923077,36.741667,51,5,1.111238
4,12041046,31249096,277.0,36,NaT,M,64.32,112.88,221.272727,101.8,92.0,17.346154,37.18625,38,3,1.938414


In [39]:
df_bmi = run_query("""
SELECT subject_id, 
  AVG(CAST(result_value AS FLOAT64)) AS avg_bmi_value
FROM `physionet-data.mimiciv_hosp.omr` 
WHERE result_name="BMI (kg/m2)"
GROUP BY subject_id, result_name
""")
df_bmi.head()
df_bmi.describe()

Unnamed: 0,subject_id,avg_bmi_value
count,144861.0,144861.0
mean,15015330.0,31.508083
std,2887037.0,302.44295
min,10000030.0,0.0
25%,12504140.0,23.7
50%,15026140.0,27.226667
75%,17516560.0,31.72
max,19999830.0,107840.2


In [40]:
data_List = pd.DataFrame()

data_List = pd.merge(df_Glucose, df_bmi, on=["subject_id"], how = "left")

data_List.head()
data_List.describe()

Unnamed: 0,subject_id,stay_id,glucose_max,anchor_age,dbp_mean,sbp_mean,glucose_mean,heart_rate_mean,spo2_mean,resp_rate_mean,temperature_mean,apsiii,glucose_score,los,avg_bmi_value
count,14849.0,14849.0,14849.0,14849.0,14754.0,14754.0,14705.0,14826.0,14791.0,14818.0,14402.0,14849.0,14849.0,14849.0,9427.0
mean,14951450.0,34982040.0,315.499697,63.351337,63.537791,119.606764,269.150999,87.870111,96.691182,19.96701,36.79117,56.653512,3.563607,4.068648,30.558477
std,2883976.0,2883237.0,173.953706,16.061045,11.643143,18.017578,2885.334677,16.402283,3.122335,4.093006,0.658481,25.267327,0.934916,5.426979,44.926034
min,10002160.0,30000210.0,200.0,18.0,17.0,45.5,42.0,28.5,26.6,9.0,26.67,5.0,3.0,0.001551,3.4
25%,12468090.0,32480680.0,223.0,54.0,55.431609,106.729823,167.0,76.175078,95.583333,17.025507,36.572292,39.0,3.0,1.197037,24.325
50%,14918520.0,34976100.0,260.0,65.0,62.666667,117.2,197.5,86.924501,97.15,19.346154,36.805,51.0,3.0,2.18684,27.931818
75%,17441870.0,37451900.0,334.0,75.0,70.44,130.965209,233.647059,98.666667,98.51788,22.351389,37.078643,69.0,5.0,4.541898,33.114286
max,19999830.0,39997750.0,5840.0,91.0,129.571429,217.75,166855.166667,166.307692,100.0,40.88,40.104118,184.0,9.0,86.312488,2332.25


In [41]:
data_List.columns

Index(['subject_id', 'stay_id', 'glucose_max', 'anchor_age', 'dod', 'gender',
       'dbp_mean', 'sbp_mean', 'glucose_mean', 'heart_rate_mean', 'spo2_mean',
       'resp_rate_mean', 'temperature_mean', 'apsiii', 'glucose_score', 'los',
       'avg_bmi_value'],
      dtype='object')

In [42]:
# Fill null for BMI
data_List['avg_bmi_value'] = data_List['avg_bmi_value'].fillna(
                                      data_List['avg_bmi_value'].mean())

# One-hot Encoding: Female = 1, Male = 0
data_List['gender'] = pd.get_dummies(data_List['gender']) ['F'] 

# if dod is NAN, replace with 0, if dod has a value, replace with 1
data_List['dod'] = data_List['dod'].notnull().astype("int") 

In [43]:
## Copy data for q1 analysis
df = data_List.copy()


## For Non-numberical columns (gender, dod): count, unique, top, freq
df.describe(include='all')

## Define functions to Remove outliers ##
def remove_outliers(df, col_lst):
  for col in col_lst:
    # we remove items that are outside of 99.5% of quantile 
    lower_q, higher_q = df[col].quantile(0.005), df[col].quantile(0.995)
    filtered_df = df[(df[col] > lower_q) & (df[col] < higher_q)]

  return filtered_df    

df_filtered = remove_outliers(df, ['glucose_max', 'anchor_age',
       'dbp_mean', 'sbp_mean', 'glucose_mean', 'heart_rate_mean', 'spo2_mean',
       'resp_rate_mean', 'temperature_mean', 'los',
       'avg_bmi_value'] )

df_final = df_filtered.dropna() # Drop Nulls
df_final.describe(include='all')

# df_final.to_csv(index=False)


Unnamed: 0,subject_id,stay_id,glucose_max,anchor_age,dod,gender,dbp_mean,sbp_mean,glucose_mean,heart_rate_mean,spo2_mean,resp_rate_mean,temperature_mean,apsiii,glucose_score,los,avg_bmi_value
count,14077.0,14077.0,14077.0,14077.0,14077.0,14077.0,14077.0,14077.0,14077.0,14077.0,14077.0,14077.0,14077.0,14077.0,14077.0,14077.0,14077.0
mean,14961480.0,34975750.0,315.593024,63.346594,0.494139,0.458834,63.603645,119.974062,268.46866,87.898336,96.750843,19.912943,36.794914,56.276977,3.564254,4.016762,29.744926
std,2884766.0,2884496.0,174.402243,16.034933,0.499983,0.49832,11.538817,17.766778,2927.594228,16.266383,2.766991,4.045583,0.646371,25.020672,0.935059,5.275067,5.55213
min,10002160.0,30000210.0,200.0,18.0,0.0,0.0,17.0,50.863636,42.0,28.5,51.25,9.916667,30.666667,5.0,3.0,0.016979,17.0
25%,12475420.0,32462310.0,223.0,54.0,0.0,0.0,55.538462,107.0,167.111111,76.238095,95.6,17.0,36.573333,39.0,3.0,1.229317,26.375
50%,14944360.0,34965840.0,260.0,65.0,0.0,0.0,62.708333,117.571429,197.478261,87.0,97.136364,19.3125,36.805,50.0,3.0,2.183646,30.558477
75%,17449810.0,37447170.0,334.0,75.0,1.0,1.0,70.44,131.166667,233.25,98.68,98.5,22.269231,37.08,68.0,5.0,4.433576,30.558477
max,19999830.0,39997750.0,5840.0,91.0,1.0,1.0,126.458333,217.75,166855.166667,158.269231,100.0,40.88,40.104118,184.0,9.0,86.312488,57.3


In [44]:
# Exporting csv files to local repo 
df_final.to_csv(DATA_DIR+'hyperglycemic_patients.csv', index=False)