In [None]:
# Import libraries
from datetime import timedelta
import os

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from IPython.display import display, HTML, Image
%matplotlib inline

plt.style.use('ggplot')
plt.rcParams.update({'font.size': 20})

# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

In [None]:
# authenticate
auth.authenticate_user()

In [None]:
# Set up environment variables
project_id = 'project-mimic-430923'
if project_id == 'CHANGE-ME':
  raise ValueError('You must change project_id to your GCP project.')
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id

# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
  """Run a BigQuery query and return the result as a pandas DataFrame.

  Uses the google-cloud-bigquery client imported at the top of the notebook
  instead of ``pd.io.gbq.read_gbq``, which pandas has deprecated and which
  emits a FutureWarning on every call.

  Args:
    query: SQL text to execute. The BigQuery client runs standard SQL by
      default, matching the previous ``dialect='standard'`` setting.
    project_id: GCP project used to run the query. Defaults to the value of
      the notebook-level ``project_id`` at the time this cell is executed.

  Returns:
    A pandas DataFrame holding the query result.
  """
  client = bigquery.Client(project=project_id)
  return client.query(query).to_dataframe()

# set the dataset
# if you want to use the demo, change this to mimic_demo
dataset = 'mimiciv'

In [None]:
# 1. Basic patient info: admissions joined to patient demographics
#    (marital_status is not selected) plus the in-hospital death flag.
# NOTE(review): `age` is taken straight from patients.anchor_age; confirm
# whether age at admission is what the later age < 60 split needs.
patient_info_query = """
SELECT adm.subject_id, adm.hadm_id, adm.admittime, adm.dischtime, pat.gender, pat.anchor_age AS age, adm.race, adm.hospital_expire_flag
FROM `physionet-data.mimiciv_hosp.admissions` AS adm
JOIN `physionet-data.mimiciv_hosp.patients` AS pat
ON adm.subject_id = pat.subject_id
WHERE adm.admittime IS NOT NULL
ORDER BY subject_id
"""

SECONDS_PER_DAY = 60 * 60 * 24

patient_info_df = (
    run_query(patient_info_query)
    .assign(
        # Boolean outcome column derived from the 0/1 admissions flag.
        in_hospital_death=lambda df: df['hospital_expire_flag'] == 1,
        admittime=lambda df: pd.to_datetime(df['admittime']),
        dischtime=lambda df: pd.to_datetime(df['dischtime']),
    )
    .drop(columns=['hospital_expire_flag'])
    .assign(
        # Length of stay in (fractional) days.
        length_of_stay=lambda df: (df['dischtime'] - df['admittime']).dt.total_seconds() / SECONDS_PER_DAY
    )
    # Keep only admissions with a strictly positive length of stay.
    .loc[lambda df: df['length_of_stay'] > 0]
)

  return pd.io.gbq.read_gbq(


In [None]:
patient_info_df

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay
0,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,F,52,WHITE,False,1.015278
1,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,F,52,WHITE,False,2.222222
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,F,52,WHITE,False,1.754167
3,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,F,52,WHITE,False,0.786111
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,F,19,WHITE,False,0.298611
...,...,...,...,...,...,...,...,...,...
431226,19999828,29734428,2147-07-18 16:23:00,2147-08-04 18:10:00,F,46,WHITE,False,17.074306
431227,19999828,25744818,2149-01-08 16:44:00,2149-01-18 17:00:00,F,46,WHITE,False,10.011111
431228,19999840,26071774,2164-07-25 00:27:00,2164-07-28 12:15:00,M,58,WHITE,False,3.491667
431229,19999840,21033226,2164-09-10 13:47:00,2164-09-17 13:42:00,M,58,WHITE,True,6.996528


In [None]:
# patient_info_query does not select marital_status, so indexing the column
# directly raises KeyError on a fresh kernel. Summarize it only when present.
if 'marital_status' in patient_info_df.columns:
    display(patient_info_df['marital_status'].value_counts())
else:
    print("marital_status is not a column of patient_info_df (it is not selected by patient_info_query).")

Unnamed: 0_level_0,count
marital_status,Unnamed: 1_level_1
MARRIED,181293
SINGLE,163213
WIDOWED,45829
DIVORCED,31663


In [None]:
# Count number of patients
num_patients = patient_info_df['subject_id'].nunique()
print(f"Number of patients: {num_patients}")

Number of patients: 180677


In [None]:
# 2. Get item IDs for lipase tests.
# Look up every lab dictionary item whose label mentions "lipase" so the
# hard-coded IDs below can be checked against the printed table.
lipase_item_query = """
SELECT itemid, label, fluid
FROM `physionet-data.mimiciv_hosp.d_labitems`
WHERE LOWER(label) LIKE '%lipase%'
"""
lipase_items_df = run_query(lipase_item_query)
print(lipase_items_df)
# Lipase item IDs for blood and other fluids: 50956 (Blood), 50844 (Ascites),
# 51055 (Pleural), 51036 (Other Body Fluid). Use [50956] for blood only.
# NOTE(review): the same age-based cut-offs are later applied to every fluid;
# confirm that non-blood lipase should be screened with those limits.
lipase_itemids = [50956, 50844, 51055, 51036]

  return pd.io.gbq.read_gbq(


   itemid               label             fluid
0   50956              Lipase             Blood
1   50844     Lipase, Ascites           Ascites
2   51055     Lipase, Pleural           Pleural
3   51036  Lipase, Body Fluid  Other Body Fluid


In [None]:
# 3. Retrieve lipase values from the hospital lab events table
lipase_values_query = f"""
SELECT subject_id, hadm_id, charttime, valuenum AS lipase_level
FROM `physionet-data.mimiciv_hosp.labevents`
WHERE itemid IN ({', '.join(map(str, lipase_itemids))})
ORDER BY subject_id, charttime
"""
lipase_values_df = run_query(lipase_values_query)

# Attach admission/demographic columns to each measurement, then discard rows
# lacking either an age (no matching admission) or a numeric lipase value.
merged_df = (
    lipase_values_df
    .merge(patient_info_df, on=['subject_id', 'hadm_id'], how='left')
    .dropna(subset=['age', 'lipase_level'])
)

# Identify patients with high lipase levels

def check_lipase(row, upper_limit_young=140, upper_limit_old=151,
                 age_cutoff=60, multiplier=3):
    """Return True when a lipase value is at least `multiplier` times the limit.

    The upper limit depends on age: `upper_limit_young` is used when
    ``row['age'] < age_cutoff`` and `upper_limit_old` otherwise. The defaults
    reproduce the original hard-coded rule (140 below age 60, 151 from age 60,
    threshold of 3x); alternative limits can be tried by passing different
    keyword values instead of editing the function.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with 'age' and
            'lipase_level' entries.
        upper_limit_young: Upper limit applied below `age_cutoff`.
        upper_limit_old: Upper limit applied at or above `age_cutoff`.
        age_cutoff: Age at which the limit switches.
        multiplier: Factor applied to the upper limit to get the threshold.

    Returns:
        Result of ``row['lipase_level'] >= multiplier * upper_limit``.
    """
    upper_limit = upper_limit_young if row['age'] < age_cutoff else upper_limit_old
    return row['lipase_level'] >= multiplier * upper_limit

high_lipase_df = merged_df[merged_df.apply(check_lipase, axis=1)]

  return pd.io.gbq.read_gbq(


In [None]:
high_lipase_df

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay
85,10004606,29242151,2159-02-20 18:30:00,1222.0,2159-02-20 13:43:00,2159-03-06 16:51:00,F,64,WHITE,False,14.130556
170,10006431,24638489,2129-01-23 23:36:00,508.0,2129-01-24 01:08:00,2129-01-30 16:50:00,F,66,WHITE,False,6.654167
362,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944
363,10017531,22580355,2159-09-23 04:30:00,977.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944
427,10021357,25937617,2144-12-30 06:55:00,1249.0,2144-12-27 19:41:00,2145-01-04 19:54:00,F,91,WHITE,False,8.009028
...,...,...,...,...,...,...,...,...,...,...,...
229546,19990545,23106222,2139-10-14 06:11:00,527.0,2139-10-04 23:11:00,2139-10-29 15:45:00,F,43,ASIAN - ASIAN INDIAN,False,24.690278
229547,19990545,23106222,2139-10-15 04:40:00,753.0,2139-10-04 23:11:00,2139-10-29 15:45:00,F,43,ASIAN - ASIAN INDIAN,False,24.690278
229548,19990545,23106222,2139-10-16 04:50:00,650.0,2139-10-04 23:11:00,2139-10-29 15:45:00,F,43,ASIAN - ASIAN INDIAN,False,24.690278
229672,19996968,29843339,2125-01-23 06:25:00,508.0,2125-01-20 21:17:00,2125-01-23 14:42:00,M,32,BLACK/AFRICAN AMERICAN,False,2.725694


In [None]:
check_lipase(merged_df.iloc[2000])
merged_df.iloc[2000]

In [None]:
# 3b. Retrieve lipase values charted in the ICU

lipase_itemids_icu = "225672"  # Lipase item IDs

lipase_values_query_icu = f"""
SELECT subject_id, hadm_id, charttime, valuenum AS lipase_level
FROM `physionet-data.mimiciv_icu.chartevents`
WHERE itemid IN ({lipase_itemids_icu})
ORDER BY subject_id, charttime
"""
lipase_values_df_icu = run_query(lipase_values_query_icu)

# Merge lipase values with patient info and drop rows without an age or a
# numeric lipase value.
merged_df_icu = pd.merge(lipase_values_df_icu, patient_info_df, on=['subject_id', 'hadm_id'], how='left')
merged_df_icu = merged_df_icu.dropna(subset=['age', 'lipase_level'])

# check_lipase is defined once, in the hospital lipase cell earlier in the
# notebook; it is reused here rather than redefined so the two cohorts cannot
# drift apart if the rule is edited.
high_lipase_df_icu = merged_df_icu[merged_df_icu.apply(check_lipase, axis=1)]

  return pd.io.gbq.read_gbq(


In [None]:
high_lipase_df_icu

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay
1,10004606,29242151,2159-02-20 18:30:00,1222.0,2159-02-20 13:43:00,2159-03-06 16:51:00,F,64,WHITE,False,14.130556
15,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944
16,10017531,22580355,2159-09-23 04:30:00,977.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944
55,10036086,28728587,2196-05-26 09:25:00,677.0,2196-05-20 02:47:00,2196-06-12 11:42:00,M,57,WHITE,False,23.371528
56,10036086,28728587,2196-05-28 03:39:00,777.0,2196-05-20 02:47:00,2196-06-12 11:42:00,M,57,WHITE,False,23.371528
...,...,...,...,...,...,...,...,...,...,...,...
17140,19882958,29628147,2182-08-31 03:10:00,787.0,2182-08-29 17:25:00,2182-09-03 13:50:00,M,83,WHITE,False,4.850694
17167,19899716,21665899,2143-09-03 17:16:00,587.0,2143-09-03 12:59:00,2143-09-08 18:00:00,M,62,BLACK/AFRICAN AMERICAN,False,5.209028
17178,19901341,23906609,2169-08-10 06:48:00,588.0,2169-08-06 14:52:00,2169-09-05 14:58:00,F,55,WHITE,False,30.004167
17189,19907884,24707264,2181-01-24 04:42:00,789.0,2181-01-23 21:32:00,2181-02-03 13:30:00,F,38,WHITE,False,10.665278


In [None]:
# Stack the hospital-lab and ICU high-lipase rows (original row labels are kept).
high_lipase_df_combined = pd.concat(objs=[high_lipase_df, high_lipase_df_icu])

# Remove rows that are identical in every column, keeping the first occurrence.
high_lipase_df_unique = high_lipase_df_combined.drop_duplicates(keep='first')

In [None]:
len(high_lipase_df_unique)

4241

In [None]:
high_lipase_df_combined

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay
85,10004606,29242151,2159-02-20 18:30:00,1222.0,2159-02-20 13:43:00,2159-03-06 16:51:00,F,64,WHITE,False,14.130556
170,10006431,24638489,2129-01-23 23:36:00,508.0,2129-01-24 01:08:00,2129-01-30 16:50:00,F,66,WHITE,False,6.654167
362,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944
363,10017531,22580355,2159-09-23 04:30:00,977.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944
427,10021357,25937617,2144-12-30 06:55:00,1249.0,2144-12-27 19:41:00,2145-01-04 19:54:00,F,91,WHITE,False,8.009028
...,...,...,...,...,...,...,...,...,...,...,...
17140,19882958,29628147,2182-08-31 03:10:00,787.0,2182-08-29 17:25:00,2182-09-03 13:50:00,M,83,WHITE,False,4.850694
17167,19899716,21665899,2143-09-03 17:16:00,587.0,2143-09-03 12:59:00,2143-09-08 18:00:00,M,62,BLACK/AFRICAN AMERICAN,False,5.209028
17178,19901341,23906609,2169-08-10 06:48:00,588.0,2169-08-06 14:52:00,2169-09-05 14:58:00,F,55,WHITE,False,30.004167
17189,19907884,24707264,2181-01-24 04:42:00,789.0,2181-01-23 21:32:00,2181-02-03 13:30:00,F,38,WHITE,False,10.665278


In [None]:
# Count number of patients with high lipase levels
num_patients = high_lipase_df_unique['subject_id'].nunique()
print(f"Number of unique patients with high lipase levels: {num_patients}")

Number of unique patients with high lipase levels: 2106


In [None]:
# Search the ICU item dictionary for candidate CT-scan items. The '%scan%'
# pattern is broad, so unrelated labels such as "Scandi Shake" or
# "Bladder scanned" also appear in the printed result.
ct_item_query = """
SELECT itemid, label
FROM `physionet-data.mimiciv_icu.d_items`
WHERE LOWER(label) LIKE '%computed tomography%'
   OR LOWER(label) LIKE '%abdomen%'
   OR LOWER(label) LIKE '%pelvis%'
   OR LOWER(label) LIKE '%scan%'
"""
ct_items_df = run_query(ct_item_query)
print(ct_items_df)
# CT item IDs picked from the printed lookup: 221214 ("CT scan") and
# 229582 ("Portable CT scan").
ct_itemids = [221214, 229582]

  return pd.io.gbq.read_gbq(


   itemid                                      label
0  225999               Vanilla Scandi Shake (mixed)
1  226000             Chocolate Scandi Shake (mixed)
2  226002  Vanilla Lactose Free Scandi Shake (mixed)
3  221214                                    CT scan
4  225461                                     Pelvis
5  229582                           Portable CT scan
6  228714                            Bladder scanned
7  229371                      Bladder Scan Estimate
8  220462                                   Scandium


In [None]:
ct_scan_query = f"""
SELECT subject_id, hadm_id, stay_id, starttime, endtime, itemid, value
FROM `physionet-data.mimiciv_icu.procedureevents`
WHERE itemid IN ({', '.join(map(str, ct_itemids))})
ORDER BY subject_id, starttime
"""
ct_scan_df = run_query(ct_scan_query)

  return pd.io.gbq.read_gbq(


In [None]:
ct_scan_df

Unnamed: 0,subject_id,hadm_id,stay_id,starttime,endtime,itemid,value
0,10001217,24597018,37067082,2157-11-21 14:00:00,2157-11-21 14:01:00,221214,1.0
1,10001217,27703517,34592300,2157-12-19 20:19:00,2157-12-19 20:20:00,221214,1.0
2,10001884,26184834,37510196,2131-01-11 15:15:00,2131-01-11 15:16:00,221214,1.0
3,10002155,28994087,31090461,2130-09-26 12:45:00,2130-09-26 12:46:00,221214,1.0
4,10002428,28662225,38875437,2156-04-19 23:00:00,2156-04-19 23:01:00,221214,1.0
...,...,...,...,...,...,...,...
22038,19999068,21606769,30143796,2161-08-27 17:35:00,2161-08-27 17:36:00,221214,1.0
22039,19999442,26785317,32336619,2148-11-19 19:45:00,2148-11-19 19:46:00,221214,1.0
22040,19999442,26785317,32336619,2148-11-20 05:26:00,2148-11-20 05:27:00,221214,1.0
22041,19999828,25744818,36075953,2149-01-09 21:30:00,2149-01-09 21:31:00,221214,1.0


In [None]:
print(ct_scan_df['subject_id'].nunique())

12494


In [None]:
# icd procedures
new_df = run_query("""
SELECT *
FROM `physionet-data.mimiciv_hosp.d_icd_procedures`
WHERE long_title LIKE "%abdomen%" or long_title LIKE "%pelvis%" or long_title LIKE "%Computerized axial tomography%" or long_title LIKE "%computerized axial tomography%"
""")
new_df

  return pd.io.gbq.read_gbq(


Unnamed: 0,icd_code,icd_version,long_title
0,5592,9,Percutaneous aspiration of kidney (pelvis)
1,560,9,Transurethral removal of obstruction from uret...
2,8703,9,Computerized axial tomography of head
3,8741,9,Computerized axial tomography of thorax
4,8771,9,Computerized axial tomography of kidney
5,8801,9,Computerized axial tomography of abdomen
6,8802,9,Other abdomen tomography
7,8819,9,Other x-ray of abdomen
8,8826,9,Other skeletal x-ray of pelvis and hip
9,8838,9,Other computerized axial tomography


In [None]:
icd_procedures_codes = [8801, 8802, 8838]
#icd_ap_code = ['K850']

# icd_codes_str is built from icd_diagnoses_codes, which the notebook otherwise
# only defines in a later cell; on a fresh kernel that raises NameError here.
# Fetch the same R10* / R935* diagnosis codes in this cell so it runs top to
# bottom.
icd_diagnoses_codes = run_query("""
SELECT icd_code
FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses`
WHERE icd_code LIKE "R10%" or icd_code LIKE "R935%"
""")['icd_code'].tolist()

# Quote each code so the list can be dropped into a SQL IN (...) clause.
icd_codes_str = ', '.join(f"'{code}'" for code in icd_diagnoses_codes)

In [None]:
# Pull every diagnosis row whose ICD code is listed in icd_codes_str.
# NOTE(review): the ct_scan_* names suggest CT procedures, but this reads the
# diagnoses_icd table using the codes in icd_codes_str — confirm the intent.
ct_scan_icd_query = f"""
SELECT subject_id, hadm_id, icd_code
FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
WHERE icd_code IN ({icd_codes_str})
ORDER BY subject_id
"""
ct_scan_icd_df = run_query(ct_scan_icd_query)

  return pd.io.gbq.read_gbq(


In [None]:
ct_scan_icd_df

Unnamed: 0,subject_id,hadm_id,icd_code
0,10004606,28731738,R109
1,10005866,21636229,R1084
2,10006457,27072986,R109
3,10010231,28743978,R1011
4,10014354,24357615,R1030
...,...,...,...
5644,19992875,28476580,R1084
5645,19995320,28017574,R1013
5646,19997062,20096107,R1011
5647,19997843,20277361,R1013


In [None]:
ct_scan_icd_df['subject_id'].nunique()

4085

In [None]:
# Keep high-lipase measurements from admissions that also have an ICU CT-scan
# procedure event. The join is on (subject_id, hadm_id) only, so every lipase
# row is paired with every CT event of the same admission.
# length_of_stay (and the other patient_info_df columns) already travel with
# high_lipase_df_unique, so no further merge with patient_info_df is needed;
# the previous second merge selected only the key columns and added nothing.
merged_ct_icd_df = pd.merge(high_lipase_df_unique, ct_scan_df, on=['subject_id', 'hadm_id'], how='inner')

In [None]:
merged_ct_icd_df

Unnamed: 0,subject_id,hadm_id,charttime,lipase_level,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay,stay_id,starttime,endtime,itemid,value
0,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944,35526828,2159-09-28 16:36:00,2159-09-28 16:37:00,221214,1.0
1,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944,35526828,2159-10-05 18:20:00,2159-10-05 18:21:00,221214,1.0
2,10017531,22580355,2159-09-22 20:56:00,1164.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944,35526828,2159-10-12 17:00:00,2159-10-12 17:01:00,221214,1.0
3,10017531,22580355,2159-09-23 04:30:00,977.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944,35526828,2159-09-28 16:36:00,2159-09-28 16:37:00,221214,1.0
4,10017531,22580355,2159-09-23 04:30:00,977.0,2159-09-22 19:30:00,2159-10-24 13:40:00,M,63,WHITE,False,31.756944,35526828,2159-10-05 18:20:00,2159-10-05 18:21:00,221214,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2338,17509107,25893958,2122-06-07 02:05:00,999999.0,2122-06-05 00:17:00,2122-06-30 12:00:00,M,49,WHITE,False,25.488194,33499859,2122-06-07 07:45:00,2122-06-07 07:46:00,221214,1.0
2339,17509107,25893958,2122-06-07 02:05:00,999999.0,2122-06-05 00:17:00,2122-06-30 12:00:00,M,49,WHITE,False,25.488194,33499859,2122-06-14 09:27:00,2122-06-14 09:28:00,221214,1.0
2340,19201291,25546590,2184-01-15 02:08:00,492.0,2184-01-04 23:43:00,2184-01-22 13:15:00,M,41,BLACK/AFRICAN AMERICAN,False,17.563889,39710961,2184-01-15 11:30:00,2184-01-15 11:31:00,221214,1.0
2341,19201291,25546590,2184-01-15 05:51:00,529.0,2184-01-04 23:43:00,2184-01-22 13:15:00,M,41,BLACK/AFRICAN AMERICAN,False,17.563889,39710961,2184-01-15 11:30:00,2184-01-15 11:31:00,221214,1.0


In [None]:
# Count the number of patients with high lipase levels who had a CT scan
num_patients_icd_ct = merged_ct_icd_df['subject_id'].nunique()
print(f"Number of patients with high lipase levels who had a CT scan: {num_patients_icd_ct}")

# Count the number of rows in the dataset
num_rows = merged_ct_icd_df.shape[0]
print(f"Number of rows in the dataset: {num_rows}")

Number of patients with high lipase levels who had a CT scan: 315
Number of rows in the dataset: 2343


In [None]:
# 1. Handle missing values
missing_data = merged_ct_icd_df.isnull().sum()
print(missing_data)

subject_id          0
hadm_id             0
charttime           0
lipase_level        0
admittime           0
dischtime           0
gender              0
age                 0
marital_status    108
race                0
icd_code            0
length_of_stay      0
dtype: int64


In [None]:
# marital_status is not selected by patient_info_query and is not added by any
# later merge, so indexing it directly raises KeyError on a fresh kernel.
# Summarize it only when the column is present.
if 'marital_status' in merged_ct_icd_df.columns:
    display(merged_ct_icd_df['marital_status'].value_counts())
else:
    print("marital_status is not a column of merged_ct_icd_df.")

Unnamed: 0_level_0,count
marital_status,Unnamed: 1_level_1
MARRIED,1364
SINGLE,933
WIDOWED,226
DIVORCED,174


In [None]:
# 5. Retrieve comorbid conditions (ICD codes)
# Build the comma-separated list of distinct admissions in the CT cohort.
hadm_ids = ', '.join(map(str, merged_ct_icd_df['hadm_id'].unique()))
comorbidity_query = f"""
SELECT subject_id, hadm_id, icd_code, seq_num, icd_version
FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
WHERE hadm_id IN ({hadm_ids})
ORDER BY subject_id, hadm_id, seq_num
"""
comorbidities_df = run_query(comorbidity_query)

# Attach every diagnosis of the admission to each cohort row (left join, so
# cohort rows without any diagnosis are kept).
merged_comorbidities_df = merged_ct_icd_df.merge(
    comorbidities_df,
    on=['subject_id', 'hadm_id'],
    how='left',
)

# Preview the merged dataset
print(merged_comorbidities_df.head())

  return pd.io.gbq.read_gbq(


   subject_id   hadm_id           charttime  lipase_level           admittime  \
0    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
1    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
2    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
3    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
4    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   

            dischtime gender  age marital_status   race  length_of_stay_x  \
0 2159-03-06 16:51:00      F   64        WIDOWED  WHITE         14.130556   
1 2159-03-06 16:51:00      F   64        WIDOWED  WHITE         14.130556   
2 2159-03-06 16:51:00      F   64        WIDOWED  WHITE         14.130556   
3 2159-03-06 16:51:00      F   64        WIDOWED  WHITE         14.130556   
4 2159-03-06 16:51:00      F   64        WIDOWED  WHITE         14.130556   

  icd_code_x  length_of_stay_y icd_code_y  seq_num

In [None]:
# 6. Retrieve mortality information from admissions table
mortality_query = """
SELECT subject_id, hadm_id, hospital_expire_flag
FROM `physionet-data.mimiciv_hosp.admissions`
"""
mortality_df = run_query(mortality_query)

# Add a column to indicate if the patient died during the hospital stay
mortality_df['in_hospital_death'] = mortality_df['hospital_expire_flag'] == 1

# in_hospital_death is already carried over from patient_info_df. Merging it
# again would yield in_hospital_death_x / in_hospital_death_y and break every
# later lookup of 'in_hospital_death', so merge only when the column is
# missing. The guard also makes the cell safe to re-run.
if 'in_hospital_death' not in merged_comorbidities_df.columns:
    merged_comorbidities_df = pd.merge(
        merged_comorbidities_df,
        mortality_df[['subject_id', 'hadm_id', 'in_hospital_death']],
        on=['subject_id', 'hadm_id'],
        how='left',
    )

# Display the first few rows of the updated dataset
print(merged_comorbidities_df.head())

  return pd.io.gbq.read_gbq(


   subject_id   hadm_id           charttime  lipase_level           admittime  \
0    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
1    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
2    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
3    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
4    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   

            dischtime gender  age marital_status   race  length_of_stay_x  \
0 2159-03-06 16:51:00      F   64        WIDOWED  WHITE         14.130556   
1 2159-03-06 16:51:00      F   64        WIDOWED  WHITE         14.130556   
2 2159-03-06 16:51:00      F   64        WIDOWED  WHITE         14.130556   
3 2159-03-06 16:51:00      F   64        WIDOWED  WHITE         14.130556   
4 2159-03-06 16:51:00      F   64        WIDOWED  WHITE         14.130556   

  icd_code_x  length_of_stay_y icd_code_y  seq_num

In [None]:
# Count the distinct (subject_id, hadm_id) pairs flagged as an in-hospital death
num_patients_with_death_info = len(
    merged_comorbidities_df
    .loc[merged_comorbidities_df['in_hospital_death'].eq(True), ['subject_id', 'hadm_id']]
    .drop_duplicates()
)
print(f"Number of unique patients who died in the hospital: {num_patients_with_death_info}")

Number of unique patients who died in the hospital: 38


In [None]:
new_df = run_query("""
SELECT *
FROM `physionet-data.mimiciv_hosp.d_icd_procedures`
WHERE icd_code LIKE "8801"
""")
new_df

  return pd.io.gbq.read_gbq(


Unnamed: 0,icd_code,icd_version,long_title
0,8801,9,Computerized axial tomography of abdomen


In [None]:
icd_procedures_codes = list(new_df['icd_code'])
icd_procedures_codes

['8801']

In [None]:
new_df = run_query("""
SELECT *
FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses`
WHERE icd_code LIKE "R10%" or icd_code LIKE "R935%"
""")
new_df

  return pd.io.gbq.read_gbq(


Unnamed: 0,icd_code,icd_version,long_title
0,R10,10,Abdominal and pelvic pain
1,R100,10,Acute abdomen
2,R101,10,Pain localized to upper abdomen
3,R1010,10,"Upper abdominal pain, unspecified"
4,R1011,10,Right upper quadrant pain
5,R1012,10,Left upper quadrant pain
6,R1013,10,Epigastric pain
7,R102,10,Pelvic and perineal pain
8,R103,10,Pain localized to other parts of lower abdomen
9,R1030,10,"Lower abdominal pain, unspecified"


In [None]:
icd_diagnoses_codes = list(new_df['icd_code'])
icd_diagnoses_codes

['R10',
 'R100',
 'R101',
 'R1010',
 'R1011',
 'R1012',
 'R1013',
 'R102',
 'R103',
 'R1030',
 'R1031',
 'R1032',
 'R1033',
 'R108',
 'R1081',
 'R10811',
 'R10812',
 'R10813',
 'R10814',
 'R10815',
 'R10816',
 'R10817',
 'R10819',
 'R1082',
 'R10821',
 'R10822',
 'R10823',
 'R10824',
 'R10825',
 'R10826',
 'R10827',
 'R10829',
 'R1083',
 'R1084',
 'R109',
 'R935']

In [None]:
new_df = run_query("""
SELECT *
FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses`
WHERE icd_code LIKE "K85%" or icd_code = "5770"
""")
new_df

  return pd.io.gbq.read_gbq(


Unnamed: 0,icd_code,icd_version,long_title
0,5770,9,Acute pancreatitis
1,K85,10,Acute pancreatitis
2,K850,10,Idiopathic acute pancreatitis
3,K8500,10,Idiopathic acute pancreatitis without necrosis...
4,K8501,10,Idiopathic acute pancreatitis with uninfected ...
5,K8502,10,Idiopathic acute pancreatitis with infected ne...
6,K851,10,Biliary acute pancreatitis
7,K8510,10,Biliary acute pancreatitis without necrosis or...
8,K8511,10,Biliary acute pancreatitis with uninfected nec...
9,K8512,10,Biliary acute pancreatitis with infected necrosis


In [None]:
icd_ap_codes = list(new_df['icd_code'])
icd_ap_codes

['5770',
 'K85',
 'K850',
 'K8500',
 'K8501',
 'K8502',
 'K851',
 'K8510',
 'K8511',
 'K8512',
 'K852',
 'K8520',
 'K8521',
 'K8522',
 'K853',
 'K8530',
 'K8531',
 'K8532',
 'K858',
 'K8580',
 'K8581',
 'K8582',
 'K859',
 'K8590',
 'K8591',
 'K8592']