In [1]:
import pandas as pd
import os
from sqlalchemy import text
from sqlalchemy.engine import URL

from google.cloud import bigquery

In [2]:
# Create the BigQuery client; authentication relies on Application Default
# Credentials (ADC), so no key material appears in the notebook.
client = bigquery.Client()

# Smoke-test the connection: materialize the dataset listing up front so any
# auth/permission problem surfaces here rather than in a later query cell.
datasets = list(client.list_datasets())
project = client.project

if not datasets:
    print(f"No datasets found in project {project}.")
else:
    print(f"Datasets in project {project}:")
    for dataset in datasets:
        print(f"\t{dataset.dataset_id}")


Datasets in project physionet-data-435019:
	mimiciii


## Patient Query

In [3]:
# Cohort query: one row per (patient, admission) for every admission that
# carries ICD-9 code 99591 or 99592 in diagnoses_icd. DISTINCT collapses the
# duplicates produced when one admission has both codes.
#
# AGE_AT_ADMISSION: BigQuery's DATE_DIFF(..., YEAR) counts calendar-year
# boundaries crossed, not completed years, so on its own it over-states the age
# by one whenever the admission falls before that year's birthday. The IF(...)
# term subtracts 1 in exactly that case by comparing the 'MMDD' parts of the
# two dates, which yields the age in completed years (still an INT64 column).
#
# NOTE(review): MIMIC-III is understood to shift DOB for patients older than 89,
# which would produce ages around 300 here - confirm and cap downstream if so.
patientQuery = """
SELECT DISTINCT
    p.SUBJECT_ID,
    p.GENDER,
    p.DOB,
    p.DOD,
    p.EXPIRE_FLAG,
    a.HADM_ID,
    a.ADMITTIME,
    a.DISCHTIME,
    a.ADMISSION_TYPE,
    a.ETHNICITY,
    DATE_DIFF(DATE(a.ADMITTIME), DATE(p.DOB), YEAR)
        - IF(FORMAT_DATE('%m%d', DATE(a.ADMITTIME)) < FORMAT_DATE('%m%d', DATE(p.DOB)), 1, 0)
        AS AGE_AT_ADMISSION
FROM
    mimiciii.patients p
JOIN
    mimiciii.diagnoses_icd d
    ON p.SUBJECT_ID = d.SUBJECT_ID
JOIN
    mimiciii.admissions a
    ON d.HADM_ID = a.HADM_ID
WHERE
    d.ICD9_CODE IN ('99591', '99592');
"""

# Run the query and pull the full result set into pandas.
patient_df = client.query(patientQuery).to_dataframe()

### Length of Stay (Days)

In [4]:
# Coerce both admission timestamps to pandas datetimes so they can be
# subtracted; the columns are overwritten in place on patient_df.
for time_col in ('ADMITTIME', 'DISCHTIME'):
    patient_df[time_col] = pd.to_datetime(patient_df[time_col])

# Hospital length of stay in whole days: .dt.days keeps only the day component
# of the timedelta, so partial days are dropped rather than rounded.
stay_duration = patient_df['DISCHTIME'] - patient_df['ADMITTIME']
patient_df['LOS'] = stay_duration.dt.days

### Length of Stay in ICU (Days)

In [5]:
# Mean ICU length of stay per subject, averaged over every ICU stay (with a
# non-null LOS) of any subject that has ICD-9 code 99591 or 99592 recorded.
# The join is on SUBJECT_ID only, so stays from all of the subject's
# admissions contribute to the mean.
icu_los_query = """
SELECT 
    ICU.SUBJECT_ID, 
    AVG(ICU.LOS) AS LOS_ICU_MEAN
FROM 
    mimiciii.icustays AS ICU
INNER JOIN 
    mimiciii.diagnoses_icd AS D 
    ON D.SUBJECT_ID = ICU.SUBJECT_ID
WHERE 
    D.ICD9_CODE IN ('99591', '99592') 
    AND ICU.LOS IS NOT NULL
GROUP BY 
    ICU.SUBJECT_ID
ORDER BY 
    ICU.SUBJECT_ID ASC
"""

# Submit the query, then materialize the finished job as a DataFrame.
icu_los_job = client.query(icu_los_query)
icu_los_df = icu_los_job.to_dataframe()

# Preview the first rows as the cell output.
icu_los_df.head()

Unnamed: 0,SUBJECT_ID,LOS_ICU_MEAN
0,21,7.1332
1,38,25.5485
2,61,2.2841
3,62,2.8257
4,64,2.325


## Heart Rate Query

In [6]:
# Mean charted heart rate per subject for the sepsis cohort (ICD-9 99591/99592),
# pooling chartevents rows recorded under ITEMID 211 and 220045.
#
# The grouping is by SUBJECT_ID only. Grouping by ITEM.LABEL as well (without
# selecting it) returned two indistinguishable rows for any subject charted
# under both ITEMIDs, which would duplicate patient rows in a later merge on
# SUBJECT_ID. The d_items join is kept so the set of chartevents rows that
# contribute to the average is unchanged.
heart_rate_query = """
SELECT C.SUBJECT_ID, AVG(C.VALUENUM) AS HEARTRATE_MEAN
FROM mimiciii.chartevents AS C
INNER JOIN mimiciii.diagnoses_icd AS D ON D.SUBJECT_ID = C.SUBJECT_ID
JOIN mimiciii.d_items AS ITEM ON ITEM.ITEMID = C.ITEMID
WHERE D.ICD9_CODE IN ('99591', '99592') AND (C.ITEMID  IN (211, 220045))
GROUP BY C.SUBJECT_ID
ORDER BY C.SUBJECT_ID ASC
"""

# One row per SUBJECT_ID, so the frame is safe to left-merge onto patient_df.
heart_rate_df = client.query(heart_rate_query).to_dataframe()
# merge = pd.merge(patient_df, heart_rate_df, how='left', on='SUBJECT_ID')
# merge.head()

## Height Query

In [7]:
# Mean charted height per (subject, item) for the sepsis cohort
# (ICD-9 99591/99592), restricted to ITEMIDs 226707, 226730 and 1394.
#
# ORDER BY includes ITEMID as a tie-breaker: a subject can have one row per
# ITEMID, and the de-duplication below keeps the first row per subject, so
# without the tie-breaker the kept measurement could change between runs.
height_query = """
SELECT C.SUBJECT_ID, AVG(C.VALUENUM) AS HEIGHT_MEAN, ITEM.LABEL, ITEM.ITEMID
FROM mimiciii.chartevents AS C
INNER JOIN mimiciii.diagnoses_icd AS D ON D.SUBJECT_ID = C.SUBJECT_ID
JOIN mimiciii.d_items AS ITEM ON ITEM.ITEMID = C.ITEMID
WHERE D.ICD9_CODE IN ('99591', '99592') AND (C.ITEMID  IN (226707, 226730, 1394))
GROUP BY C.SUBJECT_ID, ITEM.LABEL, ITEM.ITEMID
ORDER BY C.SUBJECT_ID ASC, ITEM.ITEMID ASC
"""

height_df = client.query(height_query).to_dataframe()

# Normalize to centimeters. Rows with ITEMID 226730 are taken as already in cm;
# every other ITEMID is scaled by 2.54 (i.e. treated as inches). This is a
# vectorized column update rather than a row-wise apply, so it is faster and
# also behaves correctly when the query returns no rows.
height_df['HEIGHT_CM'] = height_df['HEIGHT_MEAN'].copy()
height_df.loc[height_df['ITEMID'] != 226730, 'HEIGHT_CM'] *= 2.54

# Keep the first row per subject (lowest ITEMID, given the ORDER BY above)
height_df_cleaned = height_df.drop_duplicates(subset=['SUBJECT_ID'])

# Select only relevant columns
height_df_cleaned = height_df_cleaned[['SUBJECT_ID', 'HEIGHT_CM']]
height_df_cleaned.head()

Unnamed: 0,SUBJECT_ID,HEIGHT_CM
0,124,172.72
2,305,157.48
4,402,168.0
6,406,160.0
8,502,178.0


## Weight Query

In [8]:
# Mean charted weight per (subject, item) for the sepsis cohort
# (ICD-9 99591/99592). ITEMID 226531 is recorded in pounds and converted to
# kilograms below; the remaining ITEMIDs are used as-is (treated as kg).
#
# ORDER BY includes ITEMID as a tie-breaker so that the "first row per subject"
# kept by the de-duplication below is the same on every run.
weight_query = """
SELECT C.SUBJECT_ID, AVG(C.VALUENUM) AS WEIGHT_MEAN, ITEM.LABEL, ITEM.ITEMID
FROM mimiciii.chartevents AS C
INNER JOIN mimiciii.diagnoses_icd AS D ON D.SUBJECT_ID = C.SUBJECT_ID
JOIN mimiciii.d_items AS ITEM ON ITEM.ITEMID = C.ITEMID
WHERE D.ICD9_CODE IN ('99591', '99592') AND (C.ITEMID  IN (226531, 763, 224639, 226512))
GROUP BY C.SUBJECT_ID, ITEM.LABEL, ITEM.ITEMID
ORDER BY C.SUBJECT_ID ASC, ITEM.ITEMID ASC
"""

weight_df = client.query(weight_query).to_dataframe()

# 1. Convert WEIGHT_MEAN from LB to KG where ITEMID is 226531.
#    This runs on an explicit copy: the raw query result in weight_df stays
#    untouched, and the in-place multiply is not applied to a filtered slice
#    (which raises SettingWithCopyWarning and may not take effect).
weight_kg_df = weight_df.copy()
weight_kg_df.loc[weight_kg_df['ITEMID'] == 226531, 'WEIGHT_MEAN'] *= 0.453592

# 2. Drop the row(s) holding the single largest mean weight as an outlier.
#    The maximum is taken after the unit conversion so that pound and kilogram
#    values are compared on the same scale.
max_weight = weight_kg_df['WEIGHT_MEAN'].max()
weight_df_cleaned = weight_kg_df[weight_kg_df['WEIGHT_MEAN'] != max_weight]

# 3. Keep the first row per subject and drop the item bookkeeping columns
weight_df_cleaned = weight_df_cleaned.drop_duplicates(subset=['SUBJECT_ID'])
weight_df_cleaned = weight_df_cleaned.drop(['LABEL', 'ITEMID'], axis=1)
weight_df_cleaned.head()

Unnamed: 0,SUBJECT_ID,WEIGHT_MEAN
0,21,64.599998
1,38,97.375
2,61,56.4
3,62,78.300001
4,64,69.0
