In [38]:
import pandas as pd
from google.cloud import bigquery


In [39]:
# Initialize the BigQuery client using ADC
client = bigquery.Client()

# Test the connection by listing your BigQuery datasets
# datasets = list(client.list_datasets())
# project = client.project

# if datasets:
    # print(f"Datasets in project {project}:")
    # for dataset in datasets:
        # print(f"\t{dataset.dataset_id}")
# else:
    # print(f"No datasets found in project {project}.")

## Patient Query and Baseline Feature Queries

This query retrieves the basic information of all patients that were diagnosed with Sepsis. Sepsis is identified with ICD9 CODE `99591` and `99592`.

In [40]:
patientQuery = """
SELECT DISTINCT
    p.SUBJECT_ID,
    p.GENDER,
    p.DOB,
    p.DOD,
    p.EXPIRE_FLAG,
    a.HADM_ID,
    a.ADMITTIME,
    a.DISCHTIME,
    a.ADMISSION_TYPE,
    a.ETHNICITY,
    DATE_DIFF(DATE(a.ADMITTIME), DATE(p.DOB), YEAR) AS AGE_AT_ADMISSION
FROM
    mimiciii.patients p
JOIN
    mimiciii.diagnoses_icd d
    ON p.SUBJECT_ID = d.SUBJECT_ID
JOIN
    mimiciii.admissions a
    ON d.HADM_ID = a.HADM_ID
WHERE
    d.ICD9_CODE IN ('99591', '99592');
"""

patient_df = client.query(patientQuery).to_dataframe()
patient_df

Unnamed: 0,SUBJECT_ID,GENDER,DOB,DOD,EXPIRE_FLAG,HADM_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,ETHNICITY,AGE_AT_ADMISSION
0,4096,M,2091-09-11,NaT,0,124383,2170-07-20 18:09:00,2170-07-26 13:30:00,EMERGENCY,BLACK/AFRICAN AMERICAN,79
1,72960,M,2111-11-27,NaT,0,164620,2187-10-22 00:09:00,2187-11-14 15:50:00,EMERGENCY,WHITE,76
2,74496,M,2101-08-11,NaT,0,138943,2142-06-17 17:11:00,2142-07-02 17:18:00,EMERGENCY,WHITE,41
3,10496,F,2099-01-14,2178-12-19,1,183010,2177-12-29 17:25:00,2178-01-07 17:30:00,EMERGENCY,WHITE,78
4,11008,M,2094-11-13,NaT,0,162989,2151-06-09 17:38:00,2151-06-18 13:49:00,EMERGENCY,WHITE,57
...,...,...,...,...,...,...,...,...,...,...,...
5176,31999,M,2063-01-01,NaT,0,198085,2151-01-26 15:22:00,2151-01-29 14:39:00,EMERGENCY,WHITE,88
5177,49407,F,2062-03-20,NaT,0,183964,2119-04-12 15:26:00,2119-04-19 15:35:00,EMERGENCY,WHITE,57
5178,53247,M,2114-01-16,NaT,0,149738,2163-08-06 12:07:00,2163-08-24 17:40:00,URGENT,WHITE,49
5179,59903,F,1871-05-06,2171-06-20,1,110458,2171-05-06 00:04:00,2171-06-08 11:30:00,EMERGENCY,UNABLE TO OBTAIN,300


### Length of Stay (Days)

In [41]:
# 1. Ensure 'ADMITTIME' and 'DISCHTIME' are in datetime format
patient_df['ADMITTIME'] = pd.to_datetime(patient_df['ADMITTIME'])
patient_df['DISCHTIME'] = pd.to_datetime(patient_df['DISCHTIME'])

# 2. Calculate Length of Stay (LOS) in days
patient_df['LOS'] = (patient_df['DISCHTIME'] - patient_df['ADMITTIME']).dt.days

### Length of Stay in ICU (Days)

In [42]:
icu_los_query = """
SELECT 
    ICU.SUBJECT_ID, 
    AVG(ICU.LOS) AS LOS_ICU_MEAN
FROM 
    mimiciii.icustays AS ICU
INNER JOIN 
    mimiciii.diagnoses_icd AS D 
    ON D.SUBJECT_ID = ICU.SUBJECT_ID
WHERE 
    D.ICD9_CODE IN ('99591', '99592') 
    AND ICU.LOS IS NOT NULL
GROUP BY 
    ICU.SUBJECT_ID
ORDER BY 
    ICU.SUBJECT_ID ASC
"""

# 4. Execute the Query and Load Results into a DataFrame
icu_los_df = client.query(icu_los_query).to_dataframe()

# 5. Display the Cleaned ICU LOS DataFrame
icu_los_df.head()

Unnamed: 0,SUBJECT_ID,LOS_ICU_MEAN
0,21,7.1332
1,38,25.5485
2,61,2.2841
3,62,2.8257
4,64,2.325


### Height Query

In [43]:
height_query = """
SELECT C.SUBJECT_ID, AVG(C.VALUENUM) AS HEIGHT_MEAN, ITEM.LABEL, ITEM.ITEMID
FROM mimiciii.chartevents AS C
INNER JOIN mimiciii.diagnoses_icd AS D ON D.SUBJECT_ID = C.SUBJECT_ID
JOIN mimiciii.d_items AS ITEM ON ITEM.ITEMID = C.ITEMID
WHERE D.ICD9_CODE IN ('99591', '99592') AND (C.ITEMID  IN (226707, 226730, 1394))
GROUP BY C.SUBJECT_ID, ITEM.LABEL, ITEM.ITEMID
ORDER BY C.SUBJECT_ID ASC
"""

height_df = client.query(height_query).to_dataframe()

# Convert HEIGHT_MEAN to centimeters where ITEMID is not 226730
height_df['HEIGHT_CM'] = height_df.apply(
    lambda row: row['HEIGHT_MEAN'] * 2.54 if row['ITEMID'] != 226730 else row['HEIGHT_MEAN'],
    axis=1
)

# Remove duplicate records 
height_df_cleaned = height_df.drop_duplicates(subset=['SUBJECT_ID'])

# Select only relevant columns
height_df_cleaned = height_df_cleaned[['SUBJECT_ID', 'HEIGHT_CM']]
height_df_cleaned.head()

Unnamed: 0,SUBJECT_ID,HEIGHT_CM
0,124,172.72
2,305,157.48
4,402,168.0
6,406,160.0
8,502,178.0


### Weight Query

In [44]:
# weight is in KG
weight_query = """
SELECT C.SUBJECT_ID, AVG(C.VALUENUM) AS WEIGHT_MEAN, ITEM.LABEL, ITEM.ITEMID
FROM mimiciii.chartevents AS C
INNER JOIN mimiciii.diagnoses_icd AS D ON D.SUBJECT_ID = C.SUBJECT_ID
JOIN mimiciii.d_items AS ITEM ON ITEM.ITEMID = C.ITEMID
WHERE D.ICD9_CODE IN ('99591', '99592') AND (C.ITEMID  IN (226531, 763, 224639, 226512))
GROUP BY C.SUBJECT_ID, ITEM.LABEL, ITEM.ITEMID
ORDER BY C.SUBJECT_ID ASC
"""

weight_df = client.query(weight_query).to_dataframe()
weight_df.head()

max_weight = weight_df['WEIGHT_MEAN'].max()
weight_df_cleaned = weight_df[weight_df['WEIGHT_MEAN'] != max_weight]

# 1. Convert WEIGHT_MEAN from LB to KG where ITEMID is 226531
weight_df_cleaned.loc[weight_df_cleaned['ITEMID'] == 226531, 'WEIGHT_MEAN'] *= 0.453592

# 2. Remove duplicate records 
weight_df_cleaned = weight_df_cleaned.drop_duplicates(subset=['SUBJECT_ID'])
weight_df_cleaned = weight_df_cleaned.drop(['LABEL', 'ITEMID'], axis=1)
weight_df_cleaned.head()

Unnamed: 0,SUBJECT_ID,WEIGHT_MEAN
0,21,64.599998
1,38,97.375
2,61,56.4
3,62,78.300001
4,64,69.0


## Vital Signs Queries

### Heart Rate Query

In [45]:
heart_rate_query = """
SELECT C.SUBJECT_ID, AVG(C.VALUENUM) AS HEARTRATE_MEAN
FROM mimiciii.chartevents AS C
INNER JOIN mimiciii.diagnoses_icd AS D ON D.SUBJECT_ID = C.SUBJECT_ID
JOIN mimiciii.d_items AS ITEM ON ITEM.ITEMID = C.ITEMID
WHERE D.ICD9_CODE IN ('99591', '99592') AND (C.ITEMID  IN (211, 220045))
GROUP BY C.SUBJECT_ID, ITEM.LABEL
ORDER BY C.SUBJECT_ID ASC
"""

heart_rate_df = client.query(heart_rate_query).to_dataframe()
# merge = pd.merge(patient_df, heart_rate_df, how='left', on='SUBJECT_ID')
# merge.head()

### Sys BP

In [46]:
sbp_query = """
SELECT 
    C.SUBJECT_ID, 
    AVG(C.VALUENUM) AS SBP_MEAN
FROM 
    mimiciii.chartevents AS C
INNER JOIN 
    mimiciii.diagnoses_icd AS D 
    ON D.SUBJECT_ID = C.SUBJECT_ID
JOIN 
    mimiciii.d_items AS ITEM 
    ON ITEM.ITEMID = C.ITEMID
WHERE 
    D.ICD9_CODE IN ('99591', '99592') 
    AND C.ITEMID IN (51, 422, 455, 6701, 220050, 220179, 225309)
    AND C.VALUENUM IS NOT NULL
GROUP BY 
    C.SUBJECT_ID
ORDER BY 
    C.SUBJECT_ID ASC
"""

# 4. Execute the Query and Load Results into a DataFrame
sbp_df = client.query(sbp_query).to_dataframe()

# 5. Display the Aggregated SBP DataFrame
sbp_df.head()


Unnamed: 0,SUBJECT_ID,SBP_MEAN
0,21,109.821244
1,38,87.641624
2,61,96.341176
3,62,99.012346
4,64,109.057971


### Dialysis BP

In [47]:
dbp_query = """
SELECT 
    C.SUBJECT_ID, 
    AVG(C.VALUENUM) AS DBP_MEAN
FROM 
    mimiciii.chartevents AS C
INNER JOIN 
    mimiciii.diagnoses_icd AS D 
    ON D.SUBJECT_ID = C.SUBJECT_ID
JOIN 
    mimiciii.d_items AS ITEM 
    ON ITEM.ITEMID = C.ITEMID
WHERE 
    D.ICD9_CODE IN ('99591', '99592') 
    AND C.ITEMID IN (8368, 8441, 8555, 220051, 220180, 225310)
    AND C.VALUENUM IS NOT NULL
GROUP BY 
    C.SUBJECT_ID
ORDER BY 
    C.SUBJECT_ID ASC
"""

# 4. Execute the Query and Load Results into a DataFrame
dbp_df = client.query(dbp_query).to_dataframe()

# 5. Display the Aggregated DBP DataFrame
dbp_df.head()


Unnamed: 0,SUBJECT_ID,DBP_MEAN
0,21,42.486979
1,38,51.809375
2,61,54.188235
3,62,61.049383
4,64,57.811594


### Mean Blood Pressure

In [48]:
map_query = """
SELECT 
    C.SUBJECT_ID, 
    AVG(C.VALUENUM) AS MAP_MEAN
FROM 
    mimiciii.chartevents AS C
INNER JOIN 
    mimiciii.diagnoses_icd AS D 
    ON D.SUBJECT_ID = C.SUBJECT_ID
JOIN 
    mimiciii.d_items AS ITEM 
    ON ITEM.ITEMID = C.ITEMID
WHERE 
    D.ICD9_CODE IN ('99591', '99592') 
    AND C.ITEMID IN (52, 456, 6702, 220052, 220181, 225312)
    AND C.VALUENUM IS NOT NULL
GROUP BY 
    C.SUBJECT_ID
ORDER BY 
    C.SUBJECT_ID ASC
"""

# 4. Execute the Query and Load Results into a DataFrame
map_df = client.query(map_query).to_dataframe()

# 5. Display the Aggregated MAP DataFrame
map_df.head()


Unnamed: 0,SUBJECT_ID,MAP_MEAN
0,21,68.012345
1,38,69.527168
2,61,68.239216
3,62,73.7037
4,64,74.893717


### Resp Rate 

In [49]:
rr_query = """
SELECT 
    C.SUBJECT_ID, 
    AVG(C.VALUENUM) AS RR_MEAN
FROM 
    mimiciii.chartevents AS C
INNER JOIN 
    mimiciii.diagnoses_icd AS D 
    ON D.SUBJECT_ID = C.SUBJECT_ID
JOIN 
    mimiciii.d_items AS ITEM 
    ON ITEM.ITEMID = C.ITEMID
WHERE 
    D.ICD9_CODE IN ('99591', '99592') 
    AND C.ITEMID IN (618, 224422, 224689, 224690, 220210)
    AND C.VALUENUM IS NOT NULL
GROUP BY 
    C.SUBJECT_ID
ORDER BY 
    C.SUBJECT_ID ASC
"""

# 4. Execute the Query and Load Results into a DataFrame
rr_df = client.query(rr_query).to_dataframe()

# 5. Display the Aggregated RR DataFrame
rr_df.head()


Unnamed: 0,SUBJECT_ID,RR_MEAN
0,21,17.92623
1,38,18.510109
2,61,21.111111
3,62,22.722892
4,64,18.194444


### Temperature

In [50]:
temp_query = """
WITH Temperature_Converted AS (
    SELECT
        C.SUBJECT_ID,
        C.CHARTTIME,
        -- Convert Fahrenheit to Celsius; leave Celsius measurements as is
        CASE 
            WHEN C.ITEMID IN (678, 679, 223761) THEN (C.VALUENUM - 32) * 5/9
            ELSE C.VALUENUM
        END AS TEMP_C
    FROM 
        mimiciii.chartevents AS C
    INNER JOIN 
        mimiciii.diagnoses_icd AS D 
        ON D.SUBJECT_ID = C.SUBJECT_ID
    WHERE 
        D.ICD9_CODE IN ('99591', '99592') 
        AND C.ITEMID IN (676, 677, 678, 679, 223762, 223761)
        AND C.VALUENUM IS NOT NULL
),

Temperature_Deduplicated AS (
    SELECT
        SUBJECT_ID,
        TIMESTAMP_TRUNC(CHARTTIME, HOUR) AS CHARTTIME_HOUR,
        AVG(TEMP_C) AS TEMP_C_Avg
    FROM 
        Temperature_Converted
    GROUP BY 
        SUBJECT_ID, CHARTTIME_HOUR
)

SELECT
    SUBJECT_ID,
    AVG(TEMP_C_Avg) AS TEMP_MEAN_C,
    MIN(TEMP_C_Avg) AS TEMP_MIN_C,
    MAX(TEMP_C_Avg) AS TEMP_MAX_C
FROM 
    Temperature_Deduplicated
GROUP BY 
    SUBJECT_ID
ORDER BY 
    SUBJECT_ID ASC
"""

# 4. Execute the Query and Load Results into a DataFrame
temp_df = client.query(temp_query).to_dataframe()

# 5. Display the Aggregated Temperature DataFrame
temp_df.head()


Unnamed: 0,SUBJECT_ID,TEMP_MEAN_C,TEMP_MIN_C,TEMP_MAX_C
0,21,36.491978,35.166683,37.944423
1,38,37.408217,34.611106,38.888895
2,61,37.458728,35.5,38.888895
3,62,36.361106,35.388894,37.111106
4,64,38.428572,36.444422,39.333318


### Oxygen Saturation

In [51]:
oxygen_sat_query = """
WITH Oxygen_Saturation_Converted AS (
    SELECT
        C.SUBJECT_ID,
        C.CHARTTIME,
        C.ITEMID,
        C.VALUENUM AS OXYGEN_SAT
    FROM 
        mimiciii.chartevents AS C
    INNER JOIN 
        mimiciii.diagnoses_icd AS D 
        ON D.SUBJECT_ID = C.SUBJECT_ID
    WHERE 
        D.ICD9_CODE IN ('99591', '99592') 
        AND C.ITEMID IN (646, 834, 220227, 220277)
        AND C.VALUENUM IS NOT NULL
),

Oxygen_Saturation_Deduplicated AS (
    SELECT
        SUBJECT_ID,
        TIMESTAMP_TRUNC(CHARTTIME, MINUTE) AS CHARTTIME_MINUTE,
        AVG(OXYGEN_SAT) AS OXYGEN_SAT_Avg
    FROM 
        Oxygen_Saturation_Converted
    GROUP BY 
        SUBJECT_ID, CHARTTIME_MINUTE
)

SELECT
    SUBJECT_ID,
    AVG(OXYGEN_SAT_Avg) AS OXYGEN_SAT_MEAN,
    MIN(OXYGEN_SAT_Avg) AS OXYGEN_SAT_MIN,
    MAX(OXYGEN_SAT_Avg) AS OXYGEN_SAT_MAX
FROM 
    Oxygen_Saturation_Deduplicated
GROUP BY 
    SUBJECT_ID
ORDER BY 
    SUBJECT_ID ASC
"""

# 4. Execute the Query and Load Results into a DataFrame
oxygen_sat_df = client.query(oxygen_sat_query).to_dataframe()

# 5. Display the Aggregated Oxygen Saturation DataFrame

oxygen_sat_df.drop(['OXYGEN_SAT_MAX', 'OXYGEN_SAT_MIN'], axis=1, inplace=True)

oxygen_sat_df.head()

Unnamed: 0,SUBJECT_ID,OXYGEN_SAT_MEAN
0,21,96.834286
1,38,99.021021
2,61,98.125
3,62,96.6
4,64,98.357143


## Accompanied Diseases

In [52]:
diabetes_query = """
SELECT DISTINCT
    SUBJECT_ID
FROM
    `physionet-data-435019.mimiciii.diagnoses_icd`
WHERE
    (
        icd9_code IN ("2535","3572","5881","7751","24900","24901","24910","24911","24920","24921","24930","24931","24940","24941","24950","24951","24960","24961","24970","24971","24980","24981","24990","24991","25000","25001","25002","25003","25010","25011","25012","25013","25020","25021","25022","25023","25030","25031","25032","25033","25040","25041","25042","25043","25050","25051","25052","25053","25060","25061","25062","25063","25070","25071","25072","25073","25080","25081","25082","25083","25090","25091","25092","25093","64800","64801","64802","64803","64804","V1221","V180","V771")
    )
    AND subject_id IN (
        SELECT SUBJECT_ID
        FROM `physionet-data-435019.mimiciii.diagnoses_icd`
        WHERE icd9_code IN ('99591', '99592') -- Sepsis codes
    )
ORDER BY SUBJECT_ID ASC
"""

# Execute the query and convert to DataFrame
df_diabetes_sepsis = client.query(diabetes_query).to_dataframe()

# Add 'DIABETES' column with value 1
df_diabetes_sepsis['DIABETES'] = 1

# Verify the DataFrame
df_diabetes_sepsis.head()

Unnamed: 0,SUBJECT_ID,DIABETES
0,21,1
1,117,1
2,157,1
3,191,1
4,234,1


## Common Sources of Infection

In [53]:
bio_query = """
SELECT * 
FROM `physionet-data-435019.mimiciii.microbiologyevents` m
JOIN `physionet-data-435019.mimiciii.diagnoses_icd` d
ON d.SUBJECT_ID = m.SUBJECT_ID
WHERE d.ICD9_CODE IN ('99591', '99592') 
ORDER BY d.SUBJECT_ID
"""

bio_df = client.query(bio_query).to_dataframe()
bio_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,SPEC_ITEMID,SPEC_TYPE_DESC,ORG_ITEMID,ORG_NAME,ISOLATE_NUM,...,AB_NAME,DILUTION_TEXT,DILUTION_COMPARISON,DILUTION_VALUE,INTERPRETATION,ROW_ID_1,SUBJECT_ID_1,HADM_ID_1,SEQ_NUM,ICD9_CODE
0,91,21,111970,2135-02-01,2135-02-01 12:06:00,70011,BLOOD CULTURE ( MYCO/F LYTIC BOTTLE),,,,...,,,,,,140,21,111970,11,99592
1,94,21,111970,2135-02-02,2135-02-02 12:15:00,70011,BLOOD CULTURE ( MYCO/F LYTIC BOTTLE),,,,...,,,,,,140,21,111970,11,99592
2,70,21,109451,2134-09-11,2134-09-11 09:35:00,70012,BLOOD CULTURE,,,,...,,,,,,140,21,111970,11,99592
3,73,21,109451,2134-09-19,2134-09-19 15:55:00,70012,BLOOD CULTURE,,,,...,,,,,,140,21,111970,11,99592
4,74,21,109451,2134-09-19,2134-09-19 16:20:00,70012,BLOOD CULTURE,,,,...,,,,,,140,21,111970,11,99592
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305623,631697,99991,151118,2184-12-27,2184-12-27 05:50:00,70079,URINE,,,,...,,,,,,651001,99991,151118,5,99592
305624,631701,99991,151118,2184-12-28,2184-12-28 13:30:00,70079,URINE,,,,...,,,,,,651001,99991,151118,5,99592
305625,631700,99991,151118,2184-12-28,2184-12-28 02:26:00,70091,MRSA SCREEN,80293,POSITIVE FOR METHICILLIN RESISTANT STAPH AUREUS,1,...,,,,,,651001,99991,151118,5,99592
305626,631703,99991,151118,2184-12-28,2184-12-28 21:45:00,70091,MRSA SCREEN,80293,POSITIVE FOR METHICILLIN RESISTANT STAPH AUREUS,1,...,,,,,,651001,99991,151118,5,99592


## Concatenating All Frames