In [42]:
# Import libraries
from datetime import timedelta
import os

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from IPython.display import display, HTML, Image
%matplotlib inline

plt.style.use('ggplot')
plt.rcParams.update({'font.size': 20})

# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

In [43]:
# authenticate
auth.authenticate_user()

In [44]:
# Set up environment variables
project_id = 'carbon-virtue-378402'
if project_id == 'CHANGE-ME':
  raise ValueError('You must change project_id to your GCP project.')
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id

# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
  return pd.io.gbq.read_gbq(
      query,
      project_id=project_id,
      dialect='standard')

# set the dataset
# if you want to use the demo, change this to mimic_demo
dataset = 'mimiciv'

In [45]:
# 1. Get basic patient info (admissions + demographics (excluding marital_status) + death status)
patient_info_query = """
SELECT adm.subject_id, adm.hadm_id, adm.admittime, adm.dischtime, pat.gender, pat.anchor_age AS age, adm.race, adm.hospital_expire_flag
FROM `physionet-data.mimiciv_hosp.admissions` AS adm
JOIN `physionet-data.mimiciv_hosp.patients` AS pat
ON adm.subject_id = pat.subject_id
WHERE adm.admittime IS NOT NULL
ORDER BY subject_id
"""
patient_info_df = run_query(patient_info_query)

patient_info_df['in_hospital_death'] = patient_info_df['hospital_expire_flag'] == 1

patient_info_df = patient_info_df.drop(columns=['hospital_expire_flag'])

# Calculate length of stay and have data only with positive L.O.F
patient_info_df['admittime'] = pd.to_datetime(patient_info_df['admittime'])
patient_info_df['dischtime'] = pd.to_datetime(patient_info_df['dischtime'])
patient_info_df['length_of_stay'] = (patient_info_df['dischtime'] - patient_info_df['admittime']).dt.total_seconds() / (60 * 60 * 24)

patient_info_df = patient_info_df[patient_info_df['length_of_stay'] > 0]

  return pd.io.gbq.read_gbq(


In [46]:
patient_info_df

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay
0,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,F,52,WHITE,False,1.015278
1,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,F,52,WHITE,False,2.222222
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,F,52,WHITE,False,1.754167
3,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,F,52,WHITE,False,0.786111
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,F,19,WHITE,False,0.298611
...,...,...,...,...,...,...,...,...,...
431226,19999828,29734428,2147-07-18 16:23:00,2147-08-04 18:10:00,F,46,WHITE,False,17.074306
431227,19999828,25744818,2149-01-08 16:44:00,2149-01-18 17:00:00,F,46,WHITE,False,10.011111
431228,19999840,26071774,2164-07-25 00:27:00,2164-07-28 12:15:00,M,58,WHITE,False,3.491667
431229,19999840,21033226,2164-09-10 13:47:00,2164-09-17 13:42:00,M,58,WHITE,True,6.996528


In [47]:
# Count number of patients
num_patients = patient_info_df['subject_id'].nunique()
print(f"Number of patients: {num_patients}")

Number of patients: 180677


In [48]:
# 2. Get high lipase level patients
lipase_item_query = """
SELECT itemid, label, fluid
FROM `physionet-data.mimiciv_hosp.d_labitems`
WHERE LOWER(label) LIKE '%lipase%'
"""
lipase_items_df = run_query(lipase_item_query)
print(lipase_items_df)
lipase_itemids = [50956, 50844, 51055, 51036] # blood and other fluid # Lipase item IDs [50956]# only blood

  return pd.io.gbq.read_gbq(


   itemid               label             fluid
0   50956              Lipase             Blood
1   50844     Lipase, Ascites           Ascites
2   51055     Lipase, Pleural           Pleural
3   51036  Lipase, Body Fluid  Other Body Fluid


In [49]:
# Retrieve lipase values
lipase_values_query = f"""
SELECT subject_id, hadm_id, charttime, valuenum AS lipase_level
FROM `physionet-data.mimiciv_hosp.labevents`
WHERE itemid IN ({', '.join(map(str, lipase_itemids))})
ORDER BY subject_id, charttime
"""
lipase_values_df = run_query(lipase_values_query)

# Merge lipase values with patient info and clean data
lipase_values_df = pd.merge(lipase_values_df, patient_info_df, on=['subject_id', 'hadm_id'], how='left')
lipase_values_df.dropna(subset=['age', 'lipase_level'], inplace=True)

# Identify patients with high lipase levels
def check_lipase(row):
    upper_limit = 140 if row['age'] < 60 else 151
    return row['lipase_level'] >= 3 * upper_limit

high_lipase_df = lipase_values_df[lipase_values_df.apply(check_lipase, axis=1)]

  return pd.io.gbq.read_gbq(


In [50]:
lipase_values_df

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay
4,10000084,23052089,2160-11-20 22:30:00,47.0,2160-11-21 01:56:00,2160-11-25 14:52:00,M,72,WHITE,False,4.538889
15,10000826,20032235,2146-12-06 04:24:00,33.0,2146-12-05 19:07:00,2146-12-12 16:30:00,F,32,WHITE,False,6.890972
16,10000826,28289260,2146-12-30 17:30:00,51.0,2146-12-31 00:43:00,2147-01-02 17:45:00,F,32,WHITE,False,2.709722
20,10001176,23334588,2186-11-28 22:00:00,21.0,2186-11-29 03:56:00,2186-12-02 15:00:00,F,64,WHITE,False,3.461111
25,10001338,22119639,2138-05-10 05:25:00,17.0,2138-05-09 19:47:00,2138-05-27 15:40:00,F,43,WHITE,False,17.828472
...,...,...,...,...,...,...,...,...,...,...,...
229741,19999303,23567530,2161-04-03 19:50:00,292.0,2161-04-03 15:40:00,2161-04-06 10:45:00,F,61,WHITE,False,2.795139
229742,19999303,23567530,2161-04-04 05:10:00,276.0,2161-04-03 15:40:00,2161-04-06 10:45:00,F,61,WHITE,False,2.795139
229752,19999828,29734428,2147-07-17 18:04:00,23.0,2147-07-18 16:23:00,2147-08-04 18:10:00,F,46,WHITE,False,17.074306
229754,19999840,26071774,2164-07-25 06:45:00,19.0,2164-07-25 00:27:00,2164-07-28 12:15:00,M,58,WHITE,False,3.491667


In [51]:
# Get item IDs for lipase tests in icu
lipase_item_query_icu = """
SELECT itemid, label, category
FROM `physionet-data.mimiciv_icu.d_items`
WHERE LOWER(label) LIKE '%lipase%'
"""
lipase_items_df_icu = run_query(lipase_item_query_icu)
print(lipase_items_df_icu)
lipase_itemids_icu = "225672"  # Lipase item IDs

  return pd.io.gbq.read_gbq(


   itemid   label category
0  225672  Lipase     Labs


In [52]:
# Retrieve lipase values icu

lipase_values_query_icu = f"""
SELECT subject_id, hadm_id, charttime, valuenum AS lipase_level
FROM `physionet-data.mimiciv_icu.chartevents`
WHERE itemid IN ({lipase_itemids_icu})
ORDER BY subject_id, charttime
"""
lipase_values_df_icu = run_query(lipase_values_query_icu)

# Merge lipase values with patient info and clean data
lipase_values_df_icu = pd.merge(lipase_values_df_icu, patient_info_df, on=['subject_id', 'hadm_id'], how='left')
lipase_values_df_icu.dropna(subset=['age', 'lipase_level'], inplace=True)

# Identify patients with high lipase levels in ICU
def check_lipase(row):
    upper_limit = 140 if row['age'] < 60 else 151
    return row['lipase_level'] >= 3 * upper_limit

high_lipase_df_icu = lipase_values_df_icu[lipase_values_df_icu.apply(check_lipase, axis=1)]

  return pd.io.gbq.read_gbq(


In [53]:
high_lipase_df_icu

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay
1,10004606,29242151,2159-02-20 18:30:00,1222.0,2159-02-20 13:43:00,2159-03-06 16:51:00,F,64,WHITE,False,14.130556
15,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944
16,10017531,22580355,2159-09-23 04:30:00,977.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944
55,10036086,28728587,2196-05-26 09:25:00,677.0,2196-05-20 02:47:00,2196-06-12 11:42:00,M,57,WHITE,False,23.371528
56,10036086,28728587,2196-05-28 03:39:00,777.0,2196-05-20 02:47:00,2196-06-12 11:42:00,M,57,WHITE,False,23.371528
...,...,...,...,...,...,...,...,...,...,...,...
17140,19882958,29628147,2182-08-31 03:10:00,787.0,2182-08-29 17:25:00,2182-09-03 13:50:00,M,83,WHITE,False,4.850694
17167,19899716,21665899,2143-09-03 17:16:00,587.0,2143-09-03 12:59:00,2143-09-08 18:00:00,M,62,BLACK/AFRICAN AMERICAN,False,5.209028
17178,19901341,23906609,2169-08-10 06:48:00,588.0,2169-08-06 14:52:00,2169-09-05 14:58:00,F,55,WHITE,False,30.004167
17189,19907884,24707264,2181-01-24 04:42:00,789.0,2181-01-23 21:32:00,2181-02-03 13:30:00,F,38,WHITE,False,10.665278


In [54]:
high_lipase_df_combined = pd.concat([high_lipase_df, high_lipase_df_icu])

# Drop duplicate rows
high_lipase_df_unique = high_lipase_df_combined.drop_duplicates()

In [55]:
len(high_lipase_df_unique)

4241

In [56]:
# Count number of patients with lipase levels recorded
num_patients = high_lipase_df_unique['subject_id'].nunique()
print(f"Number of unique patients with high lipase levels: {num_patients}")

Number of unique patients with high lipase levels: 2106


In [57]:
# 3. Get high CRP level patients
# Get item IDs of CRP in hosp & icu
crp_item_query_hosp = """
SELECT itemid, label, fluid
FROM `physionet-data.mimiciv_hosp.d_labitems`
WHERE LOWER(label) LIKE '%c-reactive%' or LOWER(label) LIKE '%protein%'
"""
crp_items_df_hosp = run_query(crp_item_query_hosp)
print("CRP Items in hosp:")
print(crp_items_df_hosp)

crp_item_query_icu = """
SELECT itemid, label, category
FROM `physionet-data.mimiciv_icu.d_items`
WHERE LOWER(label) LIKE '%c-reactive%' or LOWER(label) LIKE '%protein%'
"""
crp_items_df_icu = run_query(crp_item_query_icu)
print("CRP Items in icu:")
print(crp_items_df_icu)


  return pd.io.gbq.read_gbq(


CRP Items in hosp:
    itemid                       label                fluid
0    50864           Alpha-Fetoprotein                Blood
1    50889          C-Reactive Protein                Blood
2    50975     Protein Electrophoresis                Blood
3    50976              Protein, Total                Blood
4    53096              Protein, Total                Blood
5    51949        Total Protein, Stool                Stool
6    51068               24 hr Protein                Urine
7    51099    Protein/Creatinine Ratio                Urine
8    51102        Total Protein, Urine                Urine
9    51992                     Protein                Urine
10   50849      Total Protein, Ascites              Ascites
11   51059      Total Protein, Pleural              Pleural
12   51024  Total Protein, Joint Fluid          Joint Fluid
13   51043   Total Protein, Body Fluid     Other Body Fluid
14   51802          Total Protein, CSF  Cerebrospinal Fluid
15   51270          P

  return pd.io.gbq.read_gbq(


CRP Items in icu:
   itemid                       label                 category
0  229583                Beneprotein.  Nutrition - Supplements
1  220454                     Protein              Ingredients
2  220612   ZC Reactive Protein (CRP)                     Labs
3  220650               Total Protein                     Labs
4  227444    C Reactive Protein (CRP)                     Labs
5  226184  Estimated Protein Needs/Kg                  General
6  225970                 Beneprotein      Nutrition - Enteral
7  229296   Vital High Protein (Full)      Nutrition - Enteral


In [58]:
# hosp 50889          C-Reactive Protein                Blood
# icu  227444    C Reactive Protein (CRP)                Labs
crp_itemids = [50889, 51006]

# Get CRP value from hosp
crp_values_query_hosp = f"""
SELECT subject_id, hadm_id, charttime, valuenum AS crp_level
FROM `physionet-data.mimiciv_hosp.labevents`
WHERE itemid IN ({', '.join(map(str, crp_itemids))})
ORDER BY subject_id, charttime
"""
crp_values_df_hosp = run_query(crp_values_query_hosp)

# Get CRP value from icu
crp_values_query_icu = f"""
SELECT subject_id, hadm_id, charttime, valuenum AS crp_level
FROM `physionet-data.mimiciv_icu.chartevents`
WHERE itemid IN ({', '.join(map(str, crp_itemids))})
ORDER BY subject_id, charttime
"""
crp_values_df_icu = run_query(crp_values_query_icu)

crp_values_df = pd.concat([crp_values_df_hosp, crp_values_df_icu])
crp_values_df_unique = crp_values_df.drop_duplicates()

  return pd.io.gbq.read_gbq(
  crp_values_df = pd.concat([crp_values_df_hosp, crp_values_df_icu])


In [59]:
len(crp_values_df_unique)

3307824

In [60]:
# CRP > 150 mg/dL within 48 hrs
crp_values_df = pd.merge(crp_values_df, patient_info_df[['subject_id', 'hadm_id', 'admittime']], on=['subject_id', 'hadm_id'])
crp_values_df['charttime'] = pd.to_datetime(crp_values_df['charttime'])
crp_values_df['within_48h'] = (crp_values_df['charttime'] - crp_values_df['admittime']).dt.total_seconds() / (60 * 60) <= 48
crp_critical_df = crp_values_df[(crp_values_df['crp_level'] > 150) & (crp_values_df['within_48h'])]

In [62]:
len(crp_critical_df)

2944

In [61]:
# Count number of crp_critical patients
num_patients = crp_critical_df['subject_id'].nunique()
print(f"Number of unique patients with high CRP levels within 48 hrs: {num_patients}")

Number of unique patients with high CRP levels within 48 hrs: 2124


In [63]:
# 4. Get high amylase level patients
# Get amylase item IDs in hosp and icu
amylase_item_query_hosp = """
SELECT itemid, label, fluid
FROM `physionet-data.mimiciv_hosp.d_labitems`
WHERE LOWER(label) LIKE '%amylase%'
"""
amylase_items_df_hosp = run_query(amylase_item_query_hosp)
print("Amylase Items in hosp:")
print(amylase_items_df_hosp)

amylase_item_query_icu = """
SELECT itemid, label, category
FROM `physionet-data.mimiciv_icu.d_items`
WHERE LOWER(label) LIKE '%amylase%'
"""
amylase_items_df_icu = run_query(amylase_item_query_icu)
print("Amylase Items in icu:")
print(amylase_items_df_icu)

  return pd.io.gbq.read_gbq(


Amylase Items in hosp:
    itemid                            label                fluid
0    50867                          Amylase                Blood
1    53087                          Amylase                Blood
2    51930                   Amylase, Stool                Stool
3    51072                   Amylase, Urine                Urine
4    51073  Amylase/Creatinine Ratio, Urine                Urine
5    51963     Amylase/Creatinine Clearance                Urine
6    51964                   Amylase, Serum                Urine
7    51999                    Urine Amylase                Urine
8    50836                 Amylase, Ascites              Ascites
9    51047                 Amylase, Pleural              Pleural
10   51020             Amylase, Joint Fluid          Joint Fluid
11   51026              Amylase, Body Fluid     Other Body Fluid
12   51780                     Amylase, CSF  Cerebrospinal Fluid


  return pd.io.gbq.read_gbq(


Amylase Items in icu:
   itemid    label category
0  220581  Amylase     Labs


In [64]:
# hosp 50867                          Amylase                Blood
# hosp 53087                          Amylase                Blood
# hosp 51964                   Amylase, Serum                Urine
# icu 220581  Amylase     Labs
amylase_itemids = [50867, 53087, 51964, 220581]

# Get hosp records
amylase_values_query_hosp = f"""
SELECT subject_id, hadm_id, charttime, valuenum AS amylase_level
FROM `physionet-data.mimiciv_hosp.labevents`
WHERE itemid IN ({', '.join(map(str, amylase_itemids))})
ORDER BY subject_id, charttime
"""
amylase_values_df_hosp = run_query(amylase_values_query_hosp)

# Get icu records
amylase_values_query_icu = f"""
SELECT subject_id, hadm_id, charttime, valuenum AS amylase_level
FROM `physionet-data.mimiciv_icu.chartevents`
WHERE itemid IN ({', '.join(map(str, amylase_itemids))})
ORDER BY subject_id, charttime
"""
amylase_values_df_icu = run_query(amylase_values_query_icu)

# Concat hosp and icu
amylase_values_df = pd.concat([amylase_values_df_hosp, amylase_values_df_icu])
amylase_values_df_unique = amylase_values_df.drop_duplicates()

  return pd.io.gbq.read_gbq(


In [65]:
# amylase > 1000 IU/l
amylase_critical_df = amylase_values_df[amylase_values_df['amylase_level'] > 1000]

In [66]:
len(amylase_critical_df)

568

In [67]:
# Count number of high amylase level patients
num_patients = amylase_critical_df['subject_id'].nunique()
print(f"Number of unique patients with high amylase levels: {num_patients}")

Number of unique patients with high amylase levels: 303


In [68]:
# Outer join high lipase level records with high crp level records
combined_df = pd.merge(high_lipase_df_unique, crp_critical_df[['subject_id', 'hadm_id', 'charttime', 'crp_level']],
                       on=['subject_id', 'hadm_id'], how='outer', suffixes=('', '_crp'))

# Outer join high amylase level records with high lipase level records and high crp level records
combined_df = pd.merge(combined_df, amylase_critical_df[['subject_id', 'hadm_id', 'charttime', 'amylase_level']],
                       on=['subject_id', 'hadm_id'], how='outer', suffixes=('', '_amylase'))

In [69]:
print(combined_df.head())
print(f"Number of unique patients in the combined dataset: {combined_df['subject_id'].nunique()}")

   subject_id   hadm_id           charttime  lipase_level           admittime  \
0    10002976  27179825                 NaT           NaN                 NaT   
1    10003400  23559586                 NaT           NaN                 NaT   
2    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
3    10006431  24638489 2129-01-23 23:36:00         508.0 2129-01-24 01:08:00   
4    10006513  29846618                 NaT           NaN                 NaT   

            dischtime gender   age   race  in_hospital_death  length_of_stay  \
0                 NaT    NaN  <NA>    NaN               <NA>             NaN   
1                 NaT    NaN  <NA>    NaN               <NA>             NaN   
2 2159-03-06 16:51:00      F    64  WHITE              False       14.130556   
3 2129-01-30 16:50:00      F    66  WHITE              False        6.654167   
4                 NaT    NaN  <NA>    NaN               <NA>             NaN   

        charttime_crp  crp_level

In [70]:
# 5. Get AP ICD Info
ap_icd_query = f"""
SELECT subject_id, hadm_id, icd_code
FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
WHERE icd_code LIKE 'K85%' or icd_code = '5770'
ORDER BY subject_id
"""
ap_icd_df = run_query(ap_icd_query)

  return pd.io.gbq.read_gbq(


In [71]:
ap_icd_df

Unnamed: 0,subject_id,hadm_id,icd_code
0,10002807,28464737,K8590
1,10004606,29242151,K8510
2,10006431,24638489,K8580
3,10007795,25135483,5770
4,10007795,28477357,5770
...,...,...,...
5889,19990545,23106222,K8590
5890,19993764,23707485,K851
5891,19996968,29843339,5770
5892,19996968,28227793,5770


In [72]:
ap_icd_df['subject_id'].nunique()

3850

In [74]:
combined_df_with_ap = pd.merge(combined_df, ap_icd_df[['subject_id', 'hadm_id', 'icd_code']],
                               on=['subject_id', 'hadm_id'], how='left')
combined_df_with_ap['is_confirmed_ap'] = combined_df_with_ap['icd_code'].notna()

print(combined_df_with_ap.head())
num_confirmed_ap_patients = combined_df_with_ap[combined_df_with_ap['is_confirmed_ap'] == True]['subject_id'].nunique()
print(f"Number of unique patients with high levels who are confirmed with AP: {num_confirmed_ap_patients}")

num_rows = combined_df_with_ap.shape[0]
print(f"Number of rows in the combined dataset with AP confirmation: {num_rows}")

Number of unique patients with high levels: 4234
   subject_id   hadm_id           charttime  lipase_level           admittime  \
0    10002976  27179825                 NaT           NaN                 NaT   
1    10003400  23559586                 NaT           NaN                 NaT   
2    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
3    10006431  24638489 2129-01-23 23:36:00         508.0 2129-01-24 01:08:00   
4    10006513  29846618                 NaT           NaN                 NaT   

            dischtime gender   age   race  in_hospital_death  length_of_stay  \
0                 NaT    NaN  <NA>    NaN               <NA>             NaN   
1                 NaT    NaN  <NA>    NaN               <NA>             NaN   
2 2159-03-06 16:51:00      F    64  WHITE              False       14.130556   
3 2129-01-30 16:50:00      F    66  WHITE              False        6.654167   
4                 NaT    NaN  <NA>    NaN               <NA>    

In [40]:
# final_dataset.to_csv('AP_ICD_Lipase_Final_Dataset.csv', encoding='utf-8', index = False)