In [1]:
# Import libraries
from datetime import timedelta
import os

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from IPython.display import display, HTML, Image
%matplotlib inline

plt.style.use('ggplot')
plt.rcParams.update({'font.size': 20})

# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

In [2]:
# authenticate
auth.authenticate_user()

In [3]:
# Set up environment variables
project_id = 'project-mimic-430923'
if project_id == 'CHANGE-ME':
  raise ValueError('You must change project_id to your GCP project.')
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id

# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
  return pd.io.gbq.read_gbq(
      query,
      project_id=project_id,
      dialect='standard')

# set the dataset
# if you want to use the demo, change this to mimic_demo
dataset = 'mimiciv'

In [4]:
# 1. Get basic patient info (admissions + demographics (excluding marital_status) + death status)
patient_info_query = """
SELECT adm.subject_id, adm.hadm_id, adm.admittime, adm.dischtime, pat.gender, pat.anchor_age AS age, adm.race, adm.hospital_expire_flag
FROM `physionet-data.mimiciv_hosp.admissions` AS adm
JOIN `physionet-data.mimiciv_hosp.patients` AS pat
ON adm.subject_id = pat.subject_id
WHERE adm.admittime IS NOT NULL
ORDER BY subject_id
"""
patient_info_df = run_query(patient_info_query)

patient_info_df['in_hospital_death'] = patient_info_df['hospital_expire_flag'] == 1

patient_info_df = patient_info_df.drop(columns=['hospital_expire_flag'])

# Calculate length of stay and have data only with positive L.O.F
patient_info_df['admittime'] = pd.to_datetime(patient_info_df['admittime'])
patient_info_df['dischtime'] = pd.to_datetime(patient_info_df['dischtime'])
patient_info_df['length_of_stay'] = (patient_info_df['dischtime'] - patient_info_df['admittime']).dt.total_seconds() / (60 * 60 * 24)

patient_info_df = patient_info_df[patient_info_df['length_of_stay'] > 0]

  return pd.io.gbq.read_gbq(


In [5]:
patient_info_df

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay
0,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,F,52,WHITE,False,1.015278
1,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,F,52,WHITE,False,2.222222
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,F,52,WHITE,False,1.754167
3,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,F,52,WHITE,False,0.786111
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,F,19,WHITE,False,0.298611
...,...,...,...,...,...,...,...,...,...
431226,19999828,29734428,2147-07-18 16:23:00,2147-08-04 18:10:00,F,46,WHITE,False,17.074306
431227,19999828,25744818,2149-01-08 16:44:00,2149-01-18 17:00:00,F,46,WHITE,False,10.011111
431228,19999840,26071774,2164-07-25 00:27:00,2164-07-28 12:15:00,M,58,WHITE,False,3.491667
431229,19999840,21033226,2164-09-10 13:47:00,2164-09-17 13:42:00,M,58,WHITE,True,6.996528


In [6]:
# Count number of patients
num_patients = patient_info_df['subject_id'].nunique()
print(f"Number of patients: {num_patients}")

Number of patients: 180677


In [7]:
# 2. Get item IDs for lipase tests
lipase_item_query = """
SELECT itemid, label, fluid
FROM `physionet-data.mimiciv_hosp.d_labitems`
WHERE LOWER(label) LIKE '%lipase%'
"""
lipase_items_df = run_query(lipase_item_query)
print(lipase_items_df)
lipase_itemids = [50956, 50844, 51055, 51036] # blood and other fluid # Lipase item IDs [50956]# only blood

  return pd.io.gbq.read_gbq(


   itemid               label             fluid
0   50956              Lipase             Blood
1   50844     Lipase, Ascites           Ascites
2   51055     Lipase, Pleural           Pleural
3   51036  Lipase, Body Fluid  Other Body Fluid


In [8]:
# 3. Retrieve lipase values
lipase_values_query = f"""
SELECT subject_id, hadm_id, charttime, valuenum AS lipase_level
FROM `physionet-data.mimiciv_hosp.labevents`
WHERE itemid IN ({', '.join(map(str, lipase_itemids))})
ORDER BY subject_id, charttime
"""
lipase_values_df = run_query(lipase_values_query)

# Merge lipase values with patient info and clean data
lipase_values_df = pd.merge(lipase_values_df, patient_info_df, on=['subject_id', 'hadm_id'], how='left')
lipase_values_df.dropna(subset=['age', 'lipase_level'], inplace=True)

# Identify patients with high lipase levels
def check_lipase(row):
    upper_limit = 140 if row['age'] < 60 else 151
    return row['lipase_level'] >= 3 * upper_limit

high_lipase_df = lipase_values_df[lipase_values_df.apply(check_lipase, axis=1)]

  return pd.io.gbq.read_gbq(


In [9]:
high_lipase_df

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay
85,10004606,29242151,2159-02-20 18:30:00,1222.0,2159-02-20 13:43:00,2159-03-06 16:51:00,F,64,WHITE,False,14.130556
170,10006431,24638489,2129-01-23 23:36:00,508.0,2129-01-24 01:08:00,2129-01-30 16:50:00,F,66,WHITE,False,6.654167
362,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944
363,10017531,22580355,2159-09-23 04:30:00,977.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944
427,10021357,25937617,2144-12-30 06:55:00,1249.0,2144-12-27 19:41:00,2145-01-04 19:54:00,F,91,WHITE,False,8.009028
...,...,...,...,...,...,...,...,...,...,...,...
229546,19990545,23106222,2139-10-14 06:11:00,527.0,2139-10-04 23:11:00,2139-10-29 15:45:00,F,43,ASIAN - ASIAN INDIAN,False,24.690278
229547,19990545,23106222,2139-10-15 04:40:00,753.0,2139-10-04 23:11:00,2139-10-29 15:45:00,F,43,ASIAN - ASIAN INDIAN,False,24.690278
229548,19990545,23106222,2139-10-16 04:50:00,650.0,2139-10-04 23:11:00,2139-10-29 15:45:00,F,43,ASIAN - ASIAN INDIAN,False,24.690278
229672,19996968,29843339,2125-01-23 06:25:00,508.0,2125-01-20 21:17:00,2125-01-23 14:42:00,M,32,BLACK/AFRICAN AMERICAN,False,2.725694


In [10]:
# 2. Get item IDs for lipase tests in icu
lipase_item_query_icu = """
SELECT itemid, label, category
FROM `physionet-data.mimiciv_icu.d_items`
WHERE LOWER(label) LIKE '%lipase%'
"""
lipase_items_df_icu = run_query(lipase_item_query_icu)
print(lipase_items_df_icu)
lipase_itemids_icu = "225672"  # Lipase item IDs

  return pd.io.gbq.read_gbq(


   itemid   label category
0  225672  Lipase     Labs


In [11]:
# 3. Retrieve lipase values icu

lipase_values_query_icu = f"""
SELECT subject_id, hadm_id, charttime, valuenum AS lipase_level
FROM `physionet-data.mimiciv_icu.chartevents`
WHERE itemid IN ({lipase_itemids_icu})
ORDER BY subject_id, charttime
"""
lipase_values_df_icu = run_query(lipase_values_query_icu)

# Merge lipase values with patient info and clean data
lipase_values_df_icu = pd.merge(lipase_values_df_icu, patient_info_df, on=['subject_id', 'hadm_id'], how='left')
lipase_values_df_icu.dropna(subset=['age', 'lipase_level'], inplace=True)

# Identify patients with high lipase levels in ICU
def check_lipase(row):
    upper_limit = 140 if row['age'] < 60 else 151
    return row['lipase_level'] >= 3 * upper_limit

high_lipase_df_icu = lipase_values_df_icu[lipase_values_df_icu.apply(check_lipase, axis=1)]

  return pd.io.gbq.read_gbq(


In [12]:
high_lipase_df_icu

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay
1,10004606,29242151,2159-02-20 18:30:00,1222.0,2159-02-20 13:43:00,2159-03-06 16:51:00,F,64,WHITE,False,14.130556
15,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944
16,10017531,22580355,2159-09-23 04:30:00,977.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944
55,10036086,28728587,2196-05-26 09:25:00,677.0,2196-05-20 02:47:00,2196-06-12 11:42:00,M,57,WHITE,False,23.371528
56,10036086,28728587,2196-05-28 03:39:00,777.0,2196-05-20 02:47:00,2196-06-12 11:42:00,M,57,WHITE,False,23.371528
...,...,...,...,...,...,...,...,...,...,...,...
17140,19882958,29628147,2182-08-31 03:10:00,787.0,2182-08-29 17:25:00,2182-09-03 13:50:00,M,83,WHITE,False,4.850694
17167,19899716,21665899,2143-09-03 17:16:00,587.0,2143-09-03 12:59:00,2143-09-08 18:00:00,M,62,BLACK/AFRICAN AMERICAN,False,5.209028
17178,19901341,23906609,2169-08-10 06:48:00,588.0,2169-08-06 14:52:00,2169-09-05 14:58:00,F,55,WHITE,False,30.004167
17189,19907884,24707264,2181-01-24 04:42:00,789.0,2181-01-23 21:32:00,2181-02-03 13:30:00,F,38,WHITE,False,10.665278


In [13]:
high_lipase_df_combined = pd.concat([high_lipase_df, high_lipase_df_icu])

# Drop duplicate rows
high_lipase_df_unique = high_lipase_df_combined.drop_duplicates()

In [14]:
# Count number of patients with lipase levels recorded
num_patients = high_lipase_df_unique['subject_id'].nunique()
print(f"Number of unique patients with high lipase levels: {num_patients}")

Number of unique patients with high lipase levels: 2106


In [15]:
high_lipase_df_unique

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay
85,10004606,29242151,2159-02-20 18:30:00,1222.0,2159-02-20 13:43:00,2159-03-06 16:51:00,F,64,WHITE,False,14.130556
170,10006431,24638489,2129-01-23 23:36:00,508.0,2129-01-24 01:08:00,2129-01-30 16:50:00,F,66,WHITE,False,6.654167
362,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944
363,10017531,22580355,2159-09-23 04:30:00,977.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944
427,10021357,25937617,2144-12-30 06:55:00,1249.0,2144-12-27 19:41:00,2145-01-04 19:54:00,F,91,WHITE,False,8.009028
...,...,...,...,...,...,...,...,...,...,...,...
13088,17509107,25893958,2122-06-07 02:05:00,999999.0,2122-06-05 00:17:00,2122-06-30 12:00:00,M,49,WHITE,False,25.488194
14505,18341278,28924376,2141-05-25 23:34:00,493.0,2141-05-25 21:27:00,2141-05-29 17:08:00,M,56,BLACK/AFRICAN AMERICAN,False,3.820139
15970,19201291,25546590,2184-01-15 02:08:00,492.0,2184-01-04 23:43:00,2184-01-22 13:15:00,M,41,BLACK/AFRICAN AMERICAN,False,17.563889
15971,19201291,25546590,2184-01-15 05:51:00,529.0,2184-01-04 23:43:00,2184-01-22 13:15:00,M,41,BLACK/AFRICAN AMERICAN,False,17.563889


In [16]:
high_lp_df = pd.merge(high_lipase_df_unique, patient_info_df[['subject_id', 'hadm_id']], on=['subject_id', 'hadm_id'], how='left')

In [17]:
high_lp_df

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay
0,10004606,29242151,2159-02-20 18:30:00,1222.0,2159-02-20 13:43:00,2159-03-06 16:51:00,F,64,WHITE,False,14.130556
1,10006431,24638489,2129-01-23 23:36:00,508.0,2129-01-24 01:08:00,2129-01-30 16:50:00,F,66,WHITE,False,6.654167
2,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944
3,10017531,22580355,2159-09-23 04:30:00,977.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944
4,10021357,25937617,2144-12-30 06:55:00,1249.0,2144-12-27 19:41:00,2145-01-04 19:54:00,F,91,WHITE,False,8.009028
...,...,...,...,...,...,...,...,...,...,...,...
4236,17509107,25893958,2122-06-07 02:05:00,999999.0,2122-06-05 00:17:00,2122-06-30 12:00:00,M,49,WHITE,False,25.488194
4237,18341278,28924376,2141-05-25 23:34:00,493.0,2141-05-25 21:27:00,2141-05-29 17:08:00,M,56,BLACK/AFRICAN AMERICAN,False,3.820139
4238,19201291,25546590,2184-01-15 02:08:00,492.0,2184-01-04 23:43:00,2184-01-22 13:15:00,M,41,BLACK/AFRICAN AMERICAN,False,17.563889
4239,19201291,25546590,2184-01-15 05:51:00,529.0,2184-01-04 23:43:00,2184-01-22 13:15:00,M,41,BLACK/AFRICAN AMERICAN,False,17.563889


In [18]:
def select_row(group):
    # Check if there's any row with hospital_expire_flag = 1
    if (group['in_hospital_death']).any():
        # Return the first row with hospital_expire_flag = 1
        return group[group['in_hospital_death']].iloc[0]
    else:
        # Otherwise, return the row with the maximum length_of_stay
        return group.loc[group['length_of_stay'].idxmax()]

# Apply the function to each group and reset the index
high_lp_filtered_df = high_lp_df.groupby(['subject_id']).apply(select_row).reset_index(drop=True)

  high_lp_filtered_df = high_lp_df.groupby(['subject_id']).apply(select_row).reset_index(drop=True)


In [19]:
high_lp_filtered_df

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay
0,10004606,29242151,2159-02-20 18:30:00,1222.0,2159-02-20 13:43:00,2159-03-06 16:51:00,F,64,WHITE,False,14.130556
1,10006431,24638489,2129-01-23 23:36:00,508.0,2129-01-24 01:08:00,2129-01-30 16:50:00,F,66,WHITE,False,6.654167
2,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944
3,10021357,25937617,2144-12-30 06:55:00,1249.0,2144-12-27 19:41:00,2145-01-04 19:54:00,F,91,WHITE,False,8.009028
4,10036086,28728587,2196-05-26 09:25:00,677.0,2196-05-20 02:47:00,2196-06-12 11:42:00,M,57,WHITE,False,23.371528
...,...,...,...,...,...,...,...,...,...,...,...
2101,19970491,22119205,2131-02-11 03:06:00,487.0,2131-02-10 18:25:00,2131-02-17 14:15:00,M,55,WHITE,False,6.826389
2102,19977727,23685838,2170-06-12 07:20:00,528.0,2170-06-10 22:54:00,2170-06-12 14:37:00,F,64,WHITE,False,1.654861
2103,19990545,23106222,2139-10-06 08:30:00,1886.0,2139-10-04 23:11:00,2139-10-29 15:45:00,F,43,ASIAN - ASIAN INDIAN,False,24.690278
2104,19996968,29843339,2125-01-23 06:25:00,508.0,2125-01-20 21:17:00,2125-01-23 14:42:00,M,32,BLACK/AFRICAN AMERICAN,False,2.725694


In [None]:
high_lp_filtered_df.to_csv('High_Lipase_Dataset.csv', encoding='utf-8', index = False)

In [None]:
# 5. Retrieve comorbid conditions (ICD codes)
hadm_ids = ', '.join(str(hadm_id) for hadm_id in high_lp_filtered_df['hadm_id'].unique())
comorbidity_query = f"""
SELECT subject_id, hadm_id, icd_code, seq_num, icd_version
FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
WHERE hadm_id IN ({hadm_ids})
ORDER BY subject_id, hadm_id, seq_num
"""
comorbidities_df = run_query(comorbidity_query)

# Merge comorbidities with the dataset
merged_comorbidities_df = pd.merge(high_lp_filtered_df, comorbidities_df, on=['subject_id', 'hadm_id'], how='left')

# Display the final merged dataset
print(merged_comorbidities_df.head())

  return pd.io.gbq.read_gbq(


   subject_id   hadm_id           charttime  lipase_level           admittime  \
0    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
1    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
2    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
3    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
4    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   

            dischtime gender  age   race  in_hospital_death  length_of_stay  \
0 2159-03-06 16:51:00      F   64  WHITE              False       14.130556   
1 2159-03-06 16:51:00      F   64  WHITE              False       14.130556   
2 2159-03-06 16:51:00      F   64  WHITE              False       14.130556   
3 2159-03-06 16:51:00      F   64  WHITE              False       14.130556   
4 2159-03-06 16:51:00      F   64  WHITE              False       14.130556   

  icd_code  seq_num  icd_version  
0  

In [None]:
merged_comorbidities_df.to_csv('High_Lipase_Comorbidities_Dataset.csv', encoding='utf-8', index = False)

In [20]:
ct_item_query = """
SELECT itemid, label
FROM `physionet-data.mimiciv_icu.d_items`
WHERE LOWER(label) LIKE '%computed tomography%'
   OR LOWER(label) LIKE '%abdomen%'
   OR LOWER(label) LIKE '%pelvis%'
   OR LOWER(label) LIKE '%scan%'
   OR LOWER(label) LIKE '%ct abdomen%'
"""
ct_items_df = run_query(ct_item_query)
print(ct_items_df)
ct_itemids = [221214, 229582, 229582]  # CT item IDs

  return pd.io.gbq.read_gbq(


   itemid                                      label
0  225999               Vanilla Scandi Shake (mixed)
1  226000             Chocolate Scandi Shake (mixed)
2  226002  Vanilla Lactose Free Scandi Shake (mixed)
3  221214                                    CT scan
4  225461                                     Pelvis
5  229582                           Portable CT scan
6  228714                            Bladder scanned
7  229371                      Bladder Scan Estimate
8  220462                                   Scandium


In [21]:
ct_scan_query = f"""
SELECT subject_id, hadm_id
FROM `physionet-data.mimiciv_icu.procedureevents`
WHERE itemid IN ({', '.join(map(str, ct_itemids))})
ORDER BY subject_id, starttime
"""
ct_scan_df = run_query(ct_scan_query)

  return pd.io.gbq.read_gbq(


In [22]:
ct_scan_df

Unnamed: 0,subject_id,hadm_id
0,10001217,24597018
1,10001217,27703517
2,10001884,26184834
3,10002155,28994087
4,10002428,28662225
...,...,...
22038,19999068,21606769
22039,19999442,26785317
22040,19999442,26785317
22041,19999828,25744818


In [23]:
ct_scan_icd_diag_query = f"""
SELECT subject_id, hadm_id
FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
WHERE icd_code LIKE 'R10%' or icd_code LIKE 'R935%'
ORDER BY subject_id
"""
ct_scan_icd_diag_df = run_query(ct_scan_icd_diag_query)

  return pd.io.gbq.read_gbq(


In [24]:
ct_scan_icd_diag_df

Unnamed: 0,subject_id,hadm_id
0,10004606,28731738
1,10005866,21636229
2,10006457,27072986
3,10010231,28743978
4,10014354,24357615
...,...,...
5644,19992875,28476580
5645,19995320,28017574
5646,19997062,20096107
5647,19997843,20277361


In [25]:
merge_scan_df = pd.concat([ct_scan_df, ct_scan_icd_diag_df]).drop_duplicates()

In [26]:
merge_scan_df

Unnamed: 0,subject_id,hadm_id
0,10001217,24597018
1,10001217,27703517
2,10001884,26184834
3,10002155,28994087
4,10002428,28662225
...,...,...
5644,19992875,28476580
5645,19995320,28017574
5646,19997062,20096107
5647,19997843,20277361


In [27]:
merged_ct_icd_df = pd.merge(high_lp_df, merge_scan_df, on=['subject_id', 'hadm_id'], how='inner')

In [28]:
merged_ct_icd_df = pd.merge(high_lp_df, merge_scan_df, on=['subject_id', 'hadm_id'], how='inner')

# Merge length of stay into the dataset
merged_ct_icd_df = pd.merge(merged_ct_icd_df, patient_info_df[['subject_id', 'hadm_id']], on=['subject_id', 'hadm_id'], how='left')

In [29]:
merged_ct_icd_df

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay
0,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944
1,10017531,22580355,2159-09-23 04:30:00,977.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944
2,10036086,28728587,2196-05-26 09:25:00,677.0,2196-05-20 02:47:00,2196-06-12 11:42:00,M,57,WHITE,False,23.371528
3,10036086,28728587,2196-05-28 03:39:00,777.0,2196-05-20 02:47:00,2196-06-12 11:42:00,M,57,WHITE,False,23.371528
4,10057482,25416257,2145-04-04 15:26:00,1429.0,2145-03-23 15:01:00,2145-04-26 17:23:00,F,76,WHITE,False,34.098611
...,...,...,...,...,...,...,...,...,...,...,...
946,17417573,26862398,2202-08-24 22:23:00,2318.0,2202-08-24 21:33:00,2202-09-04 16:10:00,M,41,WHITE,False,10.775694
947,17509107,25893958,2122-06-07 02:05:00,999999.0,2122-06-05 00:17:00,2122-06-30 12:00:00,M,49,WHITE,False,25.488194
948,19201291,25546590,2184-01-15 02:08:00,492.0,2184-01-04 23:43:00,2184-01-22 13:15:00,M,41,BLACK/AFRICAN AMERICAN,False,17.563889
949,19201291,25546590,2184-01-15 05:51:00,529.0,2184-01-04 23:43:00,2184-01-22 13:15:00,M,41,BLACK/AFRICAN AMERICAN,False,17.563889


In [30]:
high_lp_ct_df = merged_ct_icd_df.groupby(['subject_id']).apply(select_row).reset_index(drop=True)

  high_lp_ct_df = merged_ct_icd_df.groupby(['subject_id']).apply(select_row).reset_index(drop=True)


In [31]:
high_lp_ct_df.nunique()

Unnamed: 0,0
subject_id,337
hadm_id,337
charttime,337
lipase_level,280
admittime,337
dischtime,337
gender,2
age,69
race,20
in_hospital_death,2


In [None]:
high_lp_ct_df.to_csv('High_Lipase_CT_Dataset.csv', encoding='utf-8', index = False)

In [32]:
ap_icd_query = f"""
SELECT subject_id, hadm_id, icd_code
FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
WHERE icd_code LIKE 'K85%' or icd_code = '5770'
ORDER BY subject_id
"""
ap_icd_df = run_query(ap_icd_query)

  return pd.io.gbq.read_gbq(


In [33]:
ap_icd_df

Unnamed: 0,subject_id,hadm_id,icd_code
0,10002807,28464737,K8590
1,10004606,29242151,K8510
2,10006431,24638489,K8580
3,10007795,25135483,5770
4,10007795,28477357,5770
...,...,...,...
5889,19990545,23106222,K8590
5890,19993764,23707485,K851
5891,19996968,29843339,5770
5892,19996968,28227793,5770


In [34]:
ap_icd_df['subject_id'].nunique()

3850

In [35]:
merged_ap_lp_ct_df = pd.merge(merged_ct_icd_df, ap_icd_df, on=['subject_id', 'hadm_id'], how='inner')

# Merge length of stay into the dataset
merged_ap_lp_ct_df = pd.merge(merged_ap_lp_ct_df, patient_info_df[['subject_id', 'hadm_id']], on=['subject_id', 'hadm_id'], how='left')

In [36]:
merged_ap_lp_ct_df

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay,icd_code
0,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944,5770
1,10017531,22580355,2159-09-23 04:30:00,977.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944,5770
2,10036086,28728587,2196-05-26 09:25:00,677.0,2196-05-20 02:47:00,2196-06-12 11:42:00,M,57,WHITE,False,23.371528,5770
3,10036086,28728587,2196-05-28 03:39:00,777.0,2196-05-20 02:47:00,2196-06-12 11:42:00,M,57,WHITE,False,23.371528,5770
4,10143030,27543597,2180-05-29 02:31:00,563.0,2180-04-19 09:39:00,2180-07-23 05:44:00,M,63,WHITE,True,94.836806,5770
...,...,...,...,...,...,...,...,...,...,...,...,...
535,19655310,23438001,2147-03-30 19:00:00,25280.0,2147-03-14 07:15:00,2147-05-12 17:18:00,F,41,WHITE,False,59.418750,K8512
536,19676211,27134485,2134-05-15 05:26:00,980.0,2134-04-22 20:37:00,2134-05-21 18:57:00,F,79,WHITE,True,28.930556,K8590
537,19720119,28060710,2180-12-07 14:56:00,757.0,2180-12-01 00:04:00,2180-12-20 13:30:00,F,45,WHITE,False,19.559722,K8590
538,19720119,28060710,2180-12-08 10:49:00,794.0,2180-12-01 00:04:00,2180-12-20 13:30:00,F,45,WHITE,False,19.559722,K8590


In [37]:
# Count the number of patients with high lipase levels who had a CT scan
num_ap_icd_lp_df = merged_ap_lp_ct_df['subject_id'].nunique()
print(f"Number of patients with high lipase levels who had a CT scan or diagnosed with abdomen/pelvis pain or ct scan of abdomen and diagnosed with AP: {num_ap_icd_lp_df}")

# Count the number of rows in the dataset
num_rows = merged_ap_lp_ct_df.shape[0]
print(f"Number of rows in the dataset: {num_rows}")

Number of patients with high lipase levels who had a CT scan or diagnosed with abdomen/pelvis pain or ct scan of abdomen and diagnosed with AP: 156
Number of rows in the dataset: 540


In [38]:
high_lp_ap_ct_filtered_df = merged_ap_lp_ct_df.groupby(['subject_id']).apply(select_row).reset_index(drop=True)

  high_lp_ap_ct_filtered_df = merged_ap_lp_ct_df.groupby(['subject_id']).apply(select_row).reset_index(drop=True)


In [39]:
high_lp_ap_ct_filtered_df

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay,icd_code
0,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944,5770
1,10036086,28728587,2196-05-26 09:25:00,677.0,2196-05-20 02:47:00,2196-06-12 11:42:00,M,57,WHITE,False,23.371528,5770
2,10143030,27543597,2180-05-29 02:31:00,563.0,2180-04-19 09:39:00,2180-07-23 05:44:00,M,63,WHITE,True,94.836806,5770
3,10246670,26812375,2114-09-03 04:30:00,850.0,2114-08-25 20:34:00,2114-09-10 12:00:00,M,44,BLACK/AFRICAN AMERICAN,False,15.643056,5770
4,10309969,23107749,2182-01-21 06:45:00,747.0,2182-01-20 01:04:00,2182-02-15 14:00:00,F,68,WHITE,False,26.538889,5770
...,...,...,...,...,...,...,...,...,...,...,...,...
151,19363916,29438461,2141-01-27 08:57:00,506.0,2141-01-16 22:38:00,2141-03-01 14:20:00,M,67,WHITE,False,43.654167,5770
152,19619252,26298025,2182-10-11 15:48:00,1722.0,2182-10-07 19:05:00,2182-10-29 20:12:00,M,54,BLACK/AFRICAN AMERICAN,False,22.046528,K8590
153,19655310,23438001,2147-03-18 14:03:00,669.0,2147-03-14 07:15:00,2147-05-12 17:18:00,F,41,WHITE,False,59.418750,K8512
154,19676211,27134485,2134-05-15 05:26:00,980.0,2134-04-22 20:37:00,2134-05-21 18:57:00,F,79,WHITE,True,28.930556,K8590


In [None]:
high_lp_ap_ct_filtered_df.to_csv('High_LP_Scan_AP_Dataset.csv', encoding='utf-8', index = False)

In [49]:
# 5. Retrieve comorbid conditions (ICD codes)
hadm_ids = ', '.join(str(hadm_id) for hadm_id in high_lp_ap_ct_filtered_df['hadm_id'].unique())
comorbidity_query = f"""
SELECT subject_id, hadm_id, icd_code, seq_num, icd_version
FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
WHERE hadm_id IN ({hadm_ids})
ORDER BY subject_id, hadm_id, seq_num
"""
comorbidities_df = run_query(comorbidity_query)

# Merge comorbidities with the dataset
merged_comorbidities_df = pd.merge(high_lp_ap_ct_filtered_df, comorbidities_df, on=['subject_id', 'hadm_id'], how='left')

# Display the final merged dataset
print(merged_comorbidities_df.head())

  return pd.io.gbq.read_gbq(


   subject_id   hadm_id           charttime  lipase_level           admittime  \
0    10017531  22580355 2159-09-22 20:56:00        1164.0 2159-09-22 19:30:00   
1    10017531  22580355 2159-09-22 20:56:00        1164.0 2159-09-22 19:30:00   
2    10017531  22580355 2159-09-22 20:56:00        1164.0 2159-09-22 19:30:00   
3    10017531  22580355 2159-09-22 20:56:00        1164.0 2159-09-22 19:30:00   
4    10017531  22580355 2159-09-22 20:56:00        1164.0 2159-09-22 19:30:00   

            dischtime gender  age   race  in_hospital_death  length_of_stay  \
0 2159-10-24 13:40:00      M   63  WHITE              False       31.756944   
1 2159-10-24 13:40:00      M   63  WHITE              False       31.756944   
2 2159-10-24 13:40:00      M   63  WHITE              False       31.756944   
3 2159-10-24 13:40:00      M   63  WHITE              False       31.756944   
4 2159-10-24 13:40:00      M   63  WHITE              False       31.756944   

  icd_code_x icd_code_y  seq_num  icd_

In [60]:
merged_comorbidities_df = merged_comorbidities_df.rename(columns={"icd_code_x": "icd_code_AP", "icd_code_y": "icd_code"})

In [61]:
merged_comorbidities_df

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay,icd_code_AP,icd_code,seq_num,icd_version
0,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944,5770,5770,1,9
1,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944,5770,51881,2,9
2,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944,5770,5849,3,9
3,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944,5770,42822,4,9
4,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944,5770,7907,5,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3809,19720119,28060710,2180-12-07 14:56:00,757.0,2180-12-01 00:04:00,2180-12-20 13:30:00,F,45,WHITE,False,19.559722,K8590,F329,13,10
3810,19720119,28060710,2180-12-07 14:56:00,757.0,2180-12-01 00:04:00,2180-12-20 13:30:00,F,45,WHITE,False,19.559722,K8590,Z23,14,10
3811,19720119,28060710,2180-12-07 14:56:00,757.0,2180-12-01 00:04:00,2180-12-20 13:30:00,F,45,WHITE,False,19.559722,K8590,G932,15,10
3812,19720119,28060710,2180-12-07 14:56:00,757.0,2180-12-01 00:04:00,2180-12-20 13:30:00,F,45,WHITE,False,19.559722,K8590,Z781,16,10


In [62]:
icd_title_query = """
SELECT icd_code, icd_version, long_title
FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses`
"""
icd_titles_df = run_query(icd_title_query)

merged_comorbidities_df = pd.merge(merged_comorbidities_df, icd_titles_df, on=['icd_code', 'icd_version'], how='left')

  return pd.io.gbq.read_gbq(


In [63]:
merged_comorbidities_df

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay,icd_code_AP,icd_code,seq_num,icd_version,long_title
0,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944,5770,5770,1,9,Acute pancreatitis
1,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944,5770,51881,2,9,Acute respiratory failure
2,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944,5770,5849,3,9,"Acute kidney failure, unspecified"
3,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944,5770,42822,4,9,Chronic systolic heart failure
4,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944,5770,7907,5,9,Bacteremia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3809,19720119,28060710,2180-12-07 14:56:00,757.0,2180-12-01 00:04:00,2180-12-20 13:30:00,F,45,WHITE,False,19.559722,K8590,F329,13,10,"Major depressive disorder, single episode, uns..."
3810,19720119,28060710,2180-12-07 14:56:00,757.0,2180-12-01 00:04:00,2180-12-20 13:30:00,F,45,WHITE,False,19.559722,K8590,Z23,14,10,Encounter for immunization
3811,19720119,28060710,2180-12-07 14:56:00,757.0,2180-12-01 00:04:00,2180-12-20 13:30:00,F,45,WHITE,False,19.559722,K8590,G932,15,10,Benign intracranial hypertension
3812,19720119,28060710,2180-12-07 14:56:00,757.0,2180-12-01 00:04:00,2180-12-20 13:30:00,F,45,WHITE,False,19.559722,K8590,Z781,16,10,Physical restraint status


In [64]:
merged_comorbidities_df.to_csv('High_LP_Scan_AP_Comorbidities_Dataset.csv', encoding='utf-8', index = False)

In [None]:
# Count the number of unique patients who died in the hospital
num_patients_with_death_info = merged_comorbidities_df[merged_comorbidities_df['in_hospital_death'] == True][['subject_id', 'hadm_id']].drop_duplicates().shape[0]
print(f"Number of unique patients who died in the hospital: {num_patients_with_death_info}")

Number of unique patients who died in the hospital: 31


In [None]:
# 1. Handle missing values
missing_data = merged_comorbidities_df.isnull().sum()
print(missing_data)

subject_id           0
hadm_id              0
charttime            0
lipase_level         0
admittime            0
dischtime            0
gender               0
age                  0
race                 0
in_hospital_death    0
length_of_stay       0
icd_code_x           0
icd_code_y           0
seq_num              0
icd_version          0
dtype: int64


In [None]:
high_lp_ap_ct_filtered_df.describe()

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,age,length_of_stay
count,156.0,156.0,156,156.0,156,156,156.0,156.0
mean,14945030.0,25169960.0,2155-07-01 03:02:55.000000512,1742.339744,2155-06-26 12:40:45.384614912,2155-07-25 03:03:03.846154240,56.211538,28.598825
min,10017530.0,20024360.0,2110-05-27 02:39:00,426.0,2110-05-12 02:18:00,2110-06-05 16:35:00,18.0,0.408333
25%,12459170.0,22876980.0,2133-11-19 10:22:30,646.75,2133-11-15 22:18:30,2133-11-28 05:40:15.000000512,42.0,12.877431
50%,14887510.0,25129820.0,2157-08-05 21:17:30,983.0,2157-08-04 21:12:30,2157-08-12 17:38:00,55.5,22.001042
75%,17491830.0,27669080.0,2178-04-17 04:23:44.999999488,1668.75,2178-04-11 18:00:15.000000512,2178-06-01 09:11:15.000000512,70.0,32.838542
max,19720120.0,29956550.0,2208-06-24 11:36:00,28230.0,2208-06-24 10:20:00,2208-08-13 16:40:00,91.0,133.247917
std,2912395.0,2978428.0,,2819.816998,,,17.467393,25.002323


In [100]:
merged_ct_icd_nap_df = merged_ct_icd_df[~merged_ct_icd_df['subject_id'].isin(high_lp_ap_ct_filtered_df['subject_id'])]

In [101]:
merged_ct_icd_nap_df

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay
4,10057482,25416257,2145-04-04 15:26:00,1429.0,2145-03-23 15:01:00,2145-04-26 17:23:00,F,76,WHITE,False,34.098611
5,10057482,25416257,2145-04-04 17:51:00,1390.0,2145-03-23 15:01:00,2145-04-26 17:23:00,F,76,WHITE,False,34.098611
6,10057482,25416257,2145-04-05 01:00:00,1153.0,2145-03-23 15:01:00,2145-04-26 17:23:00,F,76,WHITE,False,34.098611
7,10057482,25416257,2145-04-06 01:07:00,1385.0,2145-03-23 15:01:00,2145-04-26 17:23:00,F,76,WHITE,False,34.098611
8,10057482,25416257,2145-04-07 02:52:00,1428.0,2145-03-23 15:01:00,2145-04-26 17:23:00,F,76,WHITE,False,34.098611
...,...,...,...,...,...,...,...,...,...,...,...
945,14331151,24487999,2187-08-15 03:33:00,999999.0,2187-07-26 20:27:00,2187-09-01 11:47:00,M,78,WHITE,False,36.638889
947,17509107,25893958,2122-06-07 02:05:00,999999.0,2122-06-05 00:17:00,2122-06-30 12:00:00,M,49,WHITE,False,25.488194
948,19201291,25546590,2184-01-15 02:08:00,492.0,2184-01-04 23:43:00,2184-01-22 13:15:00,M,41,BLACK/AFRICAN AMERICAN,False,17.563889
949,19201291,25546590,2184-01-15 05:51:00,529.0,2184-01-04 23:43:00,2184-01-22 13:15:00,M,41,BLACK/AFRICAN AMERICAN,False,17.563889


In [102]:
high_lp_ab_ap_filtered_nap = merged_ct_icd_nap_df.groupby(['subject_id']).apply(select_row).reset_index(drop=True)
high_lp_ab_ap_filtered_nap['label'] = 'NON-AP'

  high_lp_ab_ap_filtered_nap = merged_ct_icd_nap_df.groupby(['subject_id']).apply(select_row).reset_index(drop=True)


In [103]:
high_lp_ab_ap_filtered_nap

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay,label
0,10057482,25416257,2145-04-04 15:26:00,1429.0,2145-03-23 15:01:00,2145-04-26 17:23:00,F,76,WHITE,False,34.098611,NON-AP
1,10095570,28612694,2148-09-28 10:40:00,1316.0,2148-09-26 03:38:00,2148-09-29 10:50:00,M,63,ASIAN - CHINESE,True,3.300000,NON-AP
2,10122182,22489381,2145-06-20 13:38:00,2922.0,2145-06-20 11:49:00,2145-07-04 14:03:00,M,32,WHITE,False,14.093056,NON-AP
3,10164170,21635159,2163-06-06 15:40:00,2354.0,2163-06-01 03:27:00,2163-06-12 16:00:00,F,87,WHITE - OTHER EUROPEAN,True,11.522917,NON-AP
4,10199560,24622638,2175-01-11 01:53:00,487.0,2175-01-10 23:24:00,2175-01-14 12:15:00,M,70,WHITE,True,3.535417,NON-AP
...,...,...,...,...,...,...,...,...,...,...,...,...
176,19529415,21330056,2166-05-08 01:18:00,730.0,2166-04-24 20:22:00,2166-05-29 16:00:00,F,83,UNKNOWN,False,34.818056,NON-AP
177,19636128,22697287,2203-03-03 23:00:00,834.0,2203-03-04 06:19:00,2203-03-18 13:16:00,F,74,BLACK/CAPE VERDEAN,False,14.289583,NON-AP
178,19651093,26488509,2192-03-19 15:00:00,546.0,2192-03-17 21:05:00,2192-07-03 19:08:00,M,74,WHITE,True,107.918750,NON-AP
179,19669999,28614555,2148-08-09 23:40:00,2066.0,2148-08-10 02:24:00,2148-08-10 13:55:00,F,84,OTHER,True,0.479861,NON-AP


In [104]:
high_lp_ap_ct_filtered_df['label'] = 'AP'

In [105]:
final_dataset = pd.concat([high_lp_ap_ct_filtered_df, high_lp_ab_ap_filtered_nap]).reset_index(drop=True)

In [106]:
final_dataset

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay,icd_code,label
0,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944,5770,AP
1,10036086,28728587,2196-05-26 09:25:00,677.0,2196-05-20 02:47:00,2196-06-12 11:42:00,M,57,WHITE,False,23.371528,5770,AP
2,10143030,27543597,2180-05-29 02:31:00,563.0,2180-04-19 09:39:00,2180-07-23 05:44:00,M,63,WHITE,True,94.836806,5770,AP
3,10246670,26812375,2114-09-03 04:30:00,850.0,2114-08-25 20:34:00,2114-09-10 12:00:00,M,44,BLACK/AFRICAN AMERICAN,False,15.643056,5770,AP
4,10309969,23107749,2182-01-21 06:45:00,747.0,2182-01-20 01:04:00,2182-02-15 14:00:00,F,68,WHITE,False,26.538889,5770,AP
...,...,...,...,...,...,...,...,...,...,...,...,...,...
332,19529415,21330056,2166-05-08 01:18:00,730.0,2166-04-24 20:22:00,2166-05-29 16:00:00,F,83,UNKNOWN,False,34.818056,,NON-AP
333,19636128,22697287,2203-03-03 23:00:00,834.0,2203-03-04 06:19:00,2203-03-18 13:16:00,F,74,BLACK/CAPE VERDEAN,False,14.289583,,NON-AP
334,19651093,26488509,2192-03-19 15:00:00,546.0,2192-03-17 21:05:00,2192-07-03 19:08:00,M,74,WHITE,True,107.918750,,NON-AP
335,19669999,28614555,2148-08-09 23:40:00,2066.0,2148-08-10 02:24:00,2148-08-10 13:55:00,F,84,OTHER,True,0.479861,,NON-AP


In [107]:
final_dataset.to_csv('High_LP_Scan_AP_Final_Dataset.csv', encoding='utf-8', index = False)