In [1]:
# Import libraries
from datetime import timedelta
import os

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from IPython.display import display, HTML, Image
%matplotlib inline

# Plot appearance: ggplot theme with a base font size of 20.
plt.style.use('ggplot')
plt.rcParams['font.size'] = 20

# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

In [2]:
# Authenticate the Colab user before any BigQuery query is issued below.
auth.authenticate_user()

In [3]:
# GCP project used for the BigQuery queries in this notebook.
project_id = 'project-mimic-430923'

# Guard against running with the template placeholder still in place.
placeholder_left_in_place = (project_id == 'CHANGE-ME')
if placeholder_left_in_place:
  raise ValueError('You must change project_id to your GCP project.')

# Expose the project to Google Cloud client libraries via the environment.
os.environ.update(GOOGLE_CLOUD_PROJECT=project_id)

# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
  """Run a SQL query on BigQuery and return the result as a pandas DataFrame.

  Uses the google-cloud-bigquery client (already imported by this notebook)
  instead of ``pd.io.gbq.read_gbq``, which pandas has deprecated and which
  emitted a warning on every call. The client executes queries in standard
  SQL by default, matching the previous ``dialect='standard'``.

  Args:
    query: SQL text to execute.
    project_id: GCP project the query job runs in; defaults to the
      notebook-level ``project_id``.

  Returns:
    A pandas DataFrame holding the query result rows.
  """
  client = bigquery.Client(project=project_id)
  return client.query(query).to_dataframe()

# set the dataset
# if you want to use the demo, change this to mimic_demo
# NOTE(review): the queries visible in this notebook hard-code
# `physionet-data.mimiciv_hosp` / `physionet-data.mimiciv_icu` and never read
# `dataset`, so changing this value alone does not switch them to the demo data.
dataset = 'mimiciv'

In [4]:
# 1. Basic per-admission patient info: admission/discharge times, demographics
#    (marital_status is not selected) and the hospital death flag.
patient_info_query = """
SELECT adm.subject_id, adm.hadm_id, adm.admittime, adm.dischtime, pat.gender, pat.anchor_age AS age, adm.race, adm.hospital_expire_flag
FROM `physionet-data.mimiciv_hosp.admissions` AS adm
JOIN `physionet-data.mimiciv_hosp.patients` AS pat
ON adm.subject_id = pat.subject_id
WHERE adm.admittime IS NOT NULL
"""
admissions_raw = run_query(patient_info_query)

# Expose hospital_expire_flag as a boolean `in_hospital_death` column and
# drop the original flag column.
patient_info_df = (
    admissions_raw
    .assign(in_hospital_death=admissions_raw['hospital_expire_flag'] == 1)
    .drop(columns=['hospital_expire_flag'])
)

  return pd.io.gbq.read_gbq(


In [5]:
# Show the admissions/demographics table built above.
patient_info_df

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,age,race,in_hospital_death
0,14962874,22614141,2165-10-15 15:59:00,2165-10-18 16:47:00,M,89,WHITE,False
1,13241600,22574379,2162-10-29 15:36:00,2162-10-30 16:00:00,M,89,WHITE,False
2,12945423,20185299,2122-07-08 05:40:00,2122-07-19 15:15:00,M,89,WHITE,False
3,16014534,21633959,2176-03-03 14:33:00,2176-03-11 13:57:00,F,89,WHITE,False
4,16014534,27188276,2176-03-27 11:53:00,2176-03-31 14:10:00,F,89,WHITE,False
...,...,...,...,...,...,...,...,...
431226,12545126,27135177,2187-11-01 15:47:00,2187-11-02 09:45:00,M,91,WHITE,False
431227,17579295,26848807,2162-11-16 07:15:00,2162-11-19 14:27:00,M,91,WHITE,False
431228,18563244,26485584,2136-08-18 14:48:00,2136-08-19 11:00:00,F,91,WHITE,False
431229,14865704,29199399,2156-11-12 13:59:00,2156-11-13 14:00:00,F,91,OTHER,False


In [6]:
# Count distinct patients (subject_id) across all admissions
subject_ids = patient_info_df['subject_id']
num_patients = subject_ids.nunique()
print(f"Number of patients: {num_patients}")

Number of patients: 180733


In [7]:
# 2. Look up the lab item IDs whose label mentions lipase
lipase_item_query = """
SELECT itemid, label, fluid
FROM `physionet-data.mimiciv_hosp.d_labitems`
WHERE LOWER(label) LIKE '%lipase%'
"""
lipase_items_df = run_query(lipase_item_query)
print(lipase_items_df)

# Item IDs used downstream: blood lipase plus the other-fluid variants.
# To restrict the analysis to blood only, use [50956].
lipase_itemids = [
    50956,  # Lipase (Blood)
    50844,  # Lipase, Ascites
    51055,  # Lipase, Pleural
    51036,  # Lipase, Body Fluid (Other Body Fluid)
]

  return pd.io.gbq.read_gbq(


   itemid               label             fluid
0   50956              Lipase             Blood
1   50844     Lipase, Ascites           Ascites
2   51055     Lipase, Pleural           Pleural
3   51036  Lipase, Body Fluid  Other Body Fluid


In [8]:
# 3. Retrieve lipase measurements from the hospital lab events
lipase_itemid_list = ', '.join(str(itemid) for itemid in lipase_itemids)
lipase_values_query = f"""
SELECT subject_id, hadm_id, charttime, valuenum AS lipase_level
FROM `physionet-data.mimiciv_hosp.labevents`
WHERE itemid IN ({lipase_itemid_list})
ORDER BY subject_id, charttime
"""
lab_lipase_raw = run_query(lipase_values_query)

# Attach patient info by (subject_id, hadm_id), then drop rows with a
# missing age or a missing lipase value.
lipase_values_df = (
    lab_lipase_raw
    .merge(patient_info_df, on=['subject_id', 'hadm_id'], how='left')
    .dropna(subset=['age', 'lipase_level'])
)

  return pd.io.gbq.read_gbq(


In [10]:
# Show the lab lipase measurements merged with patient info.
lipase_values_df

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death
4,10000084,23052089,2160-11-20 22:30:00,47.0,2160-11-21 01:56:00,2160-11-25 14:52:00,M,72,WHITE,False
15,10000826,20032235,2146-12-06 04:24:00,33.0,2146-12-05 19:07:00,2146-12-12 16:30:00,F,32,WHITE,False
16,10000826,28289260,2146-12-30 17:30:00,51.0,2146-12-31 00:43:00,2147-01-02 17:45:00,F,32,WHITE,False
20,10001176,23334588,2186-11-28 22:00:00,21.0,2186-11-29 03:56:00,2186-12-02 15:00:00,F,64,WHITE,False
25,10001338,22119639,2138-05-10 05:25:00,17.0,2138-05-09 19:47:00,2138-05-27 15:40:00,F,43,WHITE,False
...,...,...,...,...,...,...,...,...,...,...
229741,19999303,23567530,2161-04-03 19:50:00,292.0,2161-04-03 15:40:00,2161-04-06 10:45:00,F,61,WHITE,False
229742,19999303,23567530,2161-04-04 05:10:00,276.0,2161-04-03 15:40:00,2161-04-06 10:45:00,F,61,WHITE,False
229752,19999828,29734428,2147-07-17 18:04:00,23.0,2147-07-18 16:23:00,2147-08-04 18:10:00,F,46,WHITE,False
229754,19999840,26071774,2164-07-25 06:45:00,19.0,2164-07-25 00:27:00,2164-07-28 12:15:00,M,58,WHITE,False


In [11]:
# 3. Retrieve lipase values recorded in the ICU chart events

lipase_itemids_icu = "225672"  # Lipase item ID ICU

lipase_values_query_icu = f"""
SELECT subject_id, hadm_id, charttime, valuenum AS lipase_level
FROM `physionet-data.mimiciv_icu.chartevents`
WHERE itemid IN ({lipase_itemids_icu})
ORDER BY subject_id, charttime
"""
icu_lipase_raw = run_query(lipase_values_query_icu)

# Attach patient info by (subject_id, hadm_id), then drop rows with a
# missing age or a missing lipase value.
lipase_values_df_icu = (
    icu_lipase_raw
    .merge(patient_info_df, on=['subject_id', 'hadm_id'], how='left')
    .dropna(subset=['age', 'lipase_level'])
)

  return pd.io.gbq.read_gbq(


In [12]:
# Show the ICU lipase measurements merged with patient info.
lipase_values_df_icu

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death
0,10002428,23473524,2156-05-12 02:46:00,68.0,2156-05-11 14:49:00,2156-05-22 14:16:00,F,80,WHITE,False
1,10004606,29242151,2159-02-20 18:30:00,1222.0,2159-02-20 13:43:00,2159-03-06 16:51:00,F,64,WHITE,False
2,10004606,29242151,2159-02-21 03:58:00,442.0,2159-02-20 13:43:00,2159-03-06 16:51:00,F,64,WHITE,False
3,10004606,29242151,2159-02-22 01:36:00,67.0,2159-02-20 13:43:00,2159-03-06 16:51:00,F,64,WHITE,False
4,10004606,29242151,2159-02-23 02:32:00,40.0,2159-02-20 13:43:00,2159-03-06 16:51:00,F,64,WHITE,False
...,...,...,...,...,...,...,...,...,...,...
17372,19997752,29452285,2128-03-07 02:42:00,114.0,2128-02-28 21:28:00,2128-03-10 14:35:00,F,66,WHITE,False
17373,19998591,24349193,2185-07-03 23:24:00,28.0,2185-07-03 20:20:00,2185-08-03 14:42:00,F,52,WHITE,False
17374,19998591,24349193,2185-07-23 04:09:00,35.0,2185-07-03 20:20:00,2185-08-03 14:42:00,F,52,WHITE,False
17375,19998843,24842066,2187-02-08 04:34:00,132.0,2187-02-05 09:27:00,2187-02-08 17:28:00,M,45,UNKNOWN,False


In [13]:
# Stack the hospital-lab and ICU-chart lipase measurements.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each source keeps its own row labels, and those labels collide.
lipase_df_combined = pd.concat([lipase_values_df, lipase_values_df_icu], ignore_index=True)

# Drop duplicate rows
lipase_df_unique = lipase_df_combined.drop_duplicates()

In [14]:
# Count number of patients with lipase levels recorded.
# No lipase threshold has been applied at this point, so the message reports
# patients with any recorded lipase value rather than "high" values.
num_patients = lipase_df_unique['subject_id'].nunique()
print(f"Number of unique patients with lipase levels recorded: {num_patients}")

Number of unique patients with high lipase levels: 32942


In [15]:
# Diagnosis rows whose ICD code starts with K85 (acute pancreatitis).
# NOTE(review): K85 is an ICD-10 prefix and the query does not look at
# icd_version, so admissions coded under another ICD version are not picked
# up - confirm this is intended.
ap_icd_query = """
SELECT subject_id, hadm_id, icd_code
FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
WHERE icd_code LIKE 'K85%'
ORDER BY subject_id
"""
ap_icd_df = run_query(ap_icd_query)

  return pd.io.gbq.read_gbq(


In [16]:
# Show the K85 diagnosis rows.
ap_icd_df

Unnamed: 0,subject_id,hadm_id,icd_code
0,10002807,28464737,K8590
1,10004606,29242151,K8510
2,10006431,24638489,K8580
3,10008816,22267961,K859
4,10012206,23961896,K8510
...,...,...,...
1872,19939903,26616095,K851
1873,19968351,23732375,K8590
1874,19972266,24170910,K8590
1875,19990545,23106222,K8590


In [17]:
# Number of distinct patients with at least one K85 diagnosis row.
ap_icd_df['subject_id'].nunique()

1401

In [18]:
# Keep only lipase measurements taken in admissions that carry a K85 diagnosis row.
merged_ap_icd_lp_df = lipase_df_unique.merge(ap_icd_df, how='inner', on=['subject_id', 'hadm_id'])

# Length of stay in fractional days, computed on the admissions table.
for time_col in ('admittime', 'dischtime'):
    patient_info_df[time_col] = pd.to_datetime(patient_info_df[time_col])
stay_duration = patient_info_df['dischtime'] - patient_info_df['admittime']
patient_info_df['length_of_stay'] = stay_duration.dt.total_seconds() / (60 * 60 * 24)

# Bring length_of_stay onto every measurement row of the matching admission.
los_lookup = patient_info_df[['subject_id', 'hadm_id', 'length_of_stay']]
merged_ap_icd_lp_df = merged_ap_icd_lp_df.merge(los_lookup, how='left', on=['subject_id', 'hadm_id'])

In [19]:
# Show the lipase rows joined to K85 diagnoses, with length of stay attached.
merged_ap_icd_lp_df

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,icd_code,length_of_stay
0,10004606,29242151,2159-02-20 18:30:00,1222.0,2159-02-20 13:43:00,2159-03-06 16:51:00,F,64,WHITE,False,K8510,14.130556
1,10004606,29242151,2159-02-21 03:58:00,442.0,2159-02-20 13:43:00,2159-03-06 16:51:00,F,64,WHITE,False,K8510,14.130556
2,10004606,29242151,2159-02-22 01:36:00,67.0,2159-02-20 13:43:00,2159-03-06 16:51:00,F,64,WHITE,False,K8510,14.130556
3,10004606,29242151,2159-02-23 02:32:00,40.0,2159-02-20 13:43:00,2159-03-06 16:51:00,F,64,WHITE,False,K8510,14.130556
4,10004606,29242151,2159-02-24 00:18:00,41.0,2159-02-20 13:43:00,2159-03-06 16:51:00,F,64,WHITE,False,K8510,14.130556
...,...,...,...,...,...,...,...,...,...,...,...,...
2951,19990545,23106222,2139-10-26 11:40:00,62.0,2139-10-04 23:11:00,2139-10-29 15:45:00,F,43,ASIAN - ASIAN INDIAN,False,K8590,24.690278
2952,19993764,23707485,2167-08-29 03:41:00,28.0,2167-08-27 23:09:00,2167-08-30 16:38:00,M,76,WHITE,False,K851,2.728472
2953,11741025,28221700,2135-04-29 17:20:00,2872.0,2135-04-29 12:52:00,2135-05-02 14:40:00,M,91,WHITE,False,K851,3.075000
2954,12402348,25416130,2173-10-18 23:28:00,153.0,2173-10-18 23:26:00,2173-11-11 18:45:00,F,65,UNKNOWN,False,K8590,23.804861


In [20]:
# Count the patients who have a K85 (acute pancreatitis) diagnosis and at least
# one lipase measurement in the same admission. The frame was built by an inner
# join of lipase rows with K85 diagnosis rows; no imaging data is involved.
num_ap_icd_lp_df = merged_ap_icd_lp_df['subject_id'].nunique()
print(f"Number of patients with an acute pancreatitis (K85) diagnosis and a lipase measurement: {num_ap_icd_lp_df}")

# Count the number of rows in the dataset
num_rows = merged_ap_icd_lp_df.shape[0]
print(f"Number of rows in the dataset: {num_rows}")

Number of patients with high lipase levels who had a CT scan: 1003
Number of rows in the dataset: 2956


In [21]:
def select_row(group):
    """Pick one representative row from a group of rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must contain the boolean column
        ``in_hospital_death`` and the numeric column ``length_of_stay``.

    Returns
    -------
    pandas.Series
        The first row flagged ``in_hospital_death`` if any row is flagged;
        otherwise the row with the largest ``length_of_stay`` (the first one
        on ties). If every ``length_of_stay`` is missing, the first row.
    """
    died = group['in_hospital_death']
    if died.any():
        # Prefer the in-hospital death: first row flagged as a death
        return group[died].iloc[0]

    stay = group['length_of_stay']
    if stay.notna().any():
        # Otherwise, the row with the maximum length_of_stay
        return group.loc[stay.idxmax()]

    # No usable length_of_stay: idxmax has no valid label to return,
    # so fall back to the first row instead of failing.
    return group.iloc[0]

# Apply the function to each patient's rows and reset the index.
# The columns are selected explicitly after groupby so that `subject_id` stays
# part of each group passed to select_row (and therefore of the result) without
# relying on the deprecated implicit inclusion of grouping columns in apply.
ap_lp_filtered_df = (
    merged_ap_icd_lp_df
    .groupby(['subject_id'])[list(merged_ap_icd_lp_df.columns)]
    .apply(select_row)
    .reset_index(drop=True)
)

  ap_lp_filtered_df = merged_ap_icd_lp_df.groupby(['subject_id']).apply(select_row).reset_index(drop=True)


In [22]:
# Show the one-row-per-patient table produced by select_row.
ap_lp_filtered_df

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,icd_code,length_of_stay
0,10004606,29242151,2159-02-20 18:30:00,1222.0,2159-02-20 13:43:00,2159-03-06 16:51:00,F,64,WHITE,False,K8510,14.130556
1,10006431,24638489,2129-01-23 23:36:00,508.0,2129-01-24 01:08:00,2129-01-30 16:50:00,F,66,WHITE,False,K8580,6.654167
2,10012206,23961896,2127-07-03 20:39:00,168.0,2127-07-04 01:16:00,2127-07-14 17:00:00,M,45,WHITE,False,K8510,10.655556
3,10039708,23819016,2140-06-17 22:03:00,227.0,2140-06-18 00:22:00,2140-06-22 17:40:00,F,46,BLACK/AFRICAN AMERICAN,False,K859,4.720833
4,10058750,23135802,2149-09-07 06:59:00,35.0,2149-09-06 19:20:00,2149-09-12 14:45:00,M,40,WHITE,False,K8590,5.809028
...,...,...,...,...,...,...,...,...,...,...,...,...
998,19931286,24646409,2198-05-13 15:05:00,8.0,2198-05-13 12:04:00,2198-05-15 19:40:00,F,61,WHITE,False,K859,2.316667
999,19939903,26616095,2165-03-14 06:37:00,371.0,2165-03-13 21:33:00,2165-03-19 15:18:00,M,80,WHITE,False,K851,5.739583
1000,19972266,24170910,2177-01-27 23:30:00,33.0,2177-01-28 08:44:00,2177-01-29 11:06:00,M,43,WHITE,False,K8590,1.098611
1001,19990545,23106222,2139-10-06 08:30:00,1886.0,2139-10-04 23:11:00,2139-10-29 15:45:00,F,43,ASIAN - ASIAN INDIAN,False,K8590,24.690278


In [26]:
# Export the one-row-per-patient table to CSV (UTF-8, without the index column)
ap_lp_filtered_df.to_csv(
    'AP_ICD_Lipase_Dataset.csv',
    index=False,
    encoding='utf-8',
)

In [23]:
# 5. Retrieve comorbid conditions (all ICD codes of the selected admissions)
selected_hadm_ids = ap_lp_filtered_df['hadm_id'].unique()
hadm_ids = ', '.join(map(str, selected_hadm_ids))
comorbidity_query = f"""
SELECT subject_id, hadm_id, icd_code, seq_num, icd_version
FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
WHERE hadm_id IN ({hadm_ids})
ORDER BY subject_id, hadm_id, seq_num
"""
comorbidities_df = run_query(comorbidity_query)

# Attach every diagnosis row to its patient row. Both frames carry an icd_code
# column, so the merged frame holds icd_code_x (from ap_lp_filtered_df) and
# icd_code_y (from comorbidities_df).
merged_comorbidities_df = ap_lp_filtered_df.merge(
    comorbidities_df,
    how='left',
    on=['subject_id', 'hadm_id'],
)

# Display the final merged dataset
print(merged_comorbidities_df.head())

  return pd.io.gbq.read_gbq(


   subject_id   hadm_id           charttime  lipase_level           admittime  \
0    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
1    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
2    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
3    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
4    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   

            dischtime gender  age   race  in_hospital_death icd_code_x  \
0 2159-03-06 16:51:00      F   64  WHITE              False      K8510   
1 2159-03-06 16:51:00      F   64  WHITE              False      K8510   
2 2159-03-06 16:51:00      F   64  WHITE              False      K8510   
3 2159-03-06 16:51:00      F   64  WHITE              False      K8510   
4 2159-03-06 16:51:00      F   64  WHITE              False      K8510   

   length_of_stay icd_code_y  seq_num  icd_version  
0       14.1305

In [27]:
# Export the patient rows with their diagnosis codes to CSV (UTF-8, without the index column)
merged_comorbidities_df.to_csv(
    'AP_ICD_Lipase_Comorbidities_Dataset.csv',
    index=False,
    encoding='utf-8',
)

In [24]:
# Count the distinct (subject_id, hadm_id) pairs flagged as in-hospital deaths
died_mask = merged_comorbidities_df['in_hospital_death'] == True
deceased_admissions = merged_comorbidities_df.loc[died_mask, ['subject_id', 'hadm_id']].drop_duplicates()
num_patients_with_death_info = len(deceased_admissions)
print(f"Number of unique patients who died in the hospital: {num_patients_with_death_info}")

Number of unique patients who died in the hospital: 38


In [25]:
# 1. Count missing values per column
missing_data = merged_comorbidities_df.isna().sum()
print(missing_data)

subject_id           0
hadm_id              0
charttime            0
lipase_level         0
admittime            0
dischtime            0
gender               0
age                  0
race                 0
in_hospital_death    0
icd_code_x           0
length_of_stay       0
icd_code_y           0
seq_num              0
icd_version          0
dtype: int64
