In [1]:
# Import libraries
from datetime import timedelta
import os

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from IPython.display import display, HTML, Image
%matplotlib inline

plt.style.use('ggplot')
plt.rcParams.update({'font.size': 20})

# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

In [2]:
# authenticate
auth.authenticate_user()

In [3]:
# Set up environment variables
project_id = 'project-mimic-430923'
if project_id == 'CHANGE-ME':
  raise ValueError('You must change project_id to your GCP project.')
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id

# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
  return pd.io.gbq.read_gbq(
      query,
      project_id=project_id,
      dialect='standard')

# set the dataset
# if you want to use the demo, change this to mimic_demo
dataset = 'mimiciv'

In [4]:
# 1. Get basic patient info (admissions + demographics (excluding marital_status) + death status)
patient_info_query = """
SELECT adm.subject_id, adm.hadm_id, adm.admittime, adm.dischtime, pat.gender, pat.anchor_age AS age, adm.race, adm.hospital_expire_flag
FROM `physionet-data.mimiciv_hosp.admissions` AS adm
JOIN `physionet-data.mimiciv_hosp.patients` AS pat
ON adm.subject_id = pat.subject_id
WHERE adm.admittime IS NOT NULL
"""
patient_info_df = run_query(patient_info_query)

patient_info_df['in_hospital_death'] = patient_info_df['hospital_expire_flag'] == 1

patient_info_df = patient_info_df.drop(columns=['hospital_expire_flag'])

In [5]:
patient_info_df

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,age,race,in_hospital_death
0,14962874,22614141,2165-10-15 15:59:00,2165-10-18 16:47:00,M,89,WHITE,False
1,13241600,22574379,2162-10-29 15:36:00,2162-10-30 16:00:00,M,89,WHITE,False
2,12945423,20185299,2122-07-08 05:40:00,2122-07-19 15:15:00,M,89,WHITE,False
3,16014534,21633959,2176-03-03 14:33:00,2176-03-11 13:57:00,F,89,WHITE,False
4,16014534,27188276,2176-03-27 11:53:00,2176-03-31 14:10:00,F,89,WHITE,False
...,...,...,...,...,...,...,...,...
431226,12545126,27135177,2187-11-01 15:47:00,2187-11-02 09:45:00,M,91,WHITE,False
431227,17579295,26848807,2162-11-16 07:15:00,2162-11-19 14:27:00,M,91,WHITE,False
431228,18563244,26485584,2136-08-18 14:48:00,2136-08-19 11:00:00,F,91,WHITE,False
431229,14865704,29199399,2156-11-12 13:59:00,2156-11-13 14:00:00,F,91,OTHER,False


In [6]:
# Count number of patients
num_patients = patient_info_df['subject_id'].nunique()
print(f"Number of patients: {num_patients}")

Number of patients: 180733


In [7]:
not_ap_icd_query = f"""
SELECT subject_id, hadm_id, icd_code
FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
WHERE icd_code NOT LIKE 'K85%'
ORDER BY subject_id
"""
not_ap_icd_df = run_query(not_ap_icd_query)

In [8]:
not_ap_icd_df

Unnamed: 0,subject_id,hadm_id,icd_code
0,10000032,22595853,5723
1,10000032,22841357,07071
2,10000032,25742920,07054
3,10000032,29079034,45829
4,10000032,22595853,78959
...,...,...,...
4754444,19999987,23865745,41401
4754445,19999987,23865745,78039
4754446,19999987,23865745,0413
4754447,19999987,23865745,36846


In [9]:
not_ap_icd_df['subject_id'].nunique()

180631

In [10]:
not_ap_icd_first_df = not_ap_icd_df.drop_duplicates(subset='subject_id', keep='first')

In [13]:
#merged_ap_icd_df = pd.merge(patient_info_df, ap_icd_df, on=['subject_id', 'hadm_id'], how='inner')

# Calculate length of stay
patient_info_df['admittime'] = pd.to_datetime(patient_info_df['admittime'])
patient_info_df['dischtime'] = pd.to_datetime(patient_info_df['dischtime'])
patient_info_df['length_of_stay'] = (patient_info_df['dischtime'] - patient_info_df['admittime']).dt.total_seconds() / (60 * 60 * 24)

# Merge length of stay into the dataset
merged_not_ap_icd_df = pd.merge(patient_info_df, not_ap_icd_first_df[['subject_id', 'hadm_id']], on=['subject_id', 'hadm_id'], how='right')

In [14]:
merged_not_ap_icd_df

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,F,52,WHITE,False,0.786111
1,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,F,19,WHITE,False,0.298611
2,10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,M,72,WHITE,False,4.538889
3,10000108,27250926,2163-09-27 23:17:00,2163-09-28 09:04:00,M,25,WHITE,False,0.407639
4,10000117,22927623,2181-11-15 02:05:00,2181-11-15 14:52:00,F,48,WHITE,False,0.532639
...,...,...,...,...,...,...,...,...,...
180626,19999733,27674281,2152-07-08 23:29:00,2152-07-09 03:45:00,F,19,WHITE,False,0.177778
180627,19999784,21739106,2119-09-05 11:20:00,2119-09-08 19:00:00,M,57,BLACK/AFRICAN AMERICAN,False,3.319444
180628,19999828,25744818,2149-01-08 16:44:00,2149-01-18 17:00:00,F,46,WHITE,False,10.011111
180629,19999840,21033226,2164-09-10 13:47:00,2164-09-17 13:42:00,M,58,WHITE,True,6.996528


In [18]:
sampled_not_ap_df = merged_not_ap_icd_df.sample(n = 1500, random_state = 42).reset_index(drop = True)

In [19]:
# Count the number of patients with high lipase levels who had a CT scan
num_not_ap_icd_df = sampled_not_ap_df['subject_id'].nunique()
print(f"Number of patients with high lipase levels who had a CT scan: {num_not_ap_icd_df}")

# Count the number of rows in the dataset
num_rows = sampled_not_ap_df.shape[0]
print(f"Number of rows in the dataset: {num_rows}")

Number of patients with high lipase levels who had a CT scan: 1500
Number of rows in the dataset: 1500


In [20]:
sampled_not_ap_df

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay
0,12198375,21636889,2176-12-27 20:05:00,2177-01-02 18:40:00,M,77,OTHER,False,5.940972
1,18809075,25722919,2110-02-25 11:45:00,2110-02-27 16:22:00,M,49,WHITE,False,2.192361
2,11684680,25799199,2184-03-20 19:51:00,2184-03-27 19:57:00,F,50,WHITE,False,7.004167
3,16068315,21848937,2154-12-23 07:15:00,2154-12-24 17:28:00,F,46,WHITE,False,1.425694
4,10966239,27593189,2137-06-22 05:11:00,2137-06-25 17:30:00,F,24,BLACK/CAPE VERDEAN,False,3.513194
...,...,...,...,...,...,...,...,...,...
1495,16032226,20056878,2185-07-01 19:04:00,2185-07-24 16:58:00,M,73,WHITE,False,22.912500
1496,11252876,22252531,2173-10-17 00:50:00,2173-10-17 12:46:00,M,56,BLACK/AFRICAN AMERICAN,False,0.497222
1497,13287719,20198354,2182-01-15 18:32:00,2182-01-25 18:30:00,F,80,UNKNOWN,False,9.998611
1498,17632100,23756653,2134-09-15 01:41:00,2134-09-16 15:00:00,M,66,WHITE,False,1.554861


In [21]:
sampled_not_ap_df.to_csv('NON_AP_ICD_Dataset.csv', encoding='utf-8', index = False)

In [22]:
# 5. Retrieve comorbid conditions (ICD codes)
hadm_ids = ', '.join(str(hadm_id) for hadm_id in sampled_not_ap_df['hadm_id'].unique())
comorbidity_query = f"""
SELECT subject_id, hadm_id, icd_code, seq_num, icd_version
FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
WHERE hadm_id IN ({hadm_ids})
ORDER BY subject_id, hadm_id, seq_num
"""
comorbidities_df = run_query(comorbidity_query)

# Merge comorbidities with the dataset
merged_comorbidities_df = pd.merge(sampled_not_ap_df, comorbidities_df, on=['subject_id', 'hadm_id'], how='left')

# Display the final merged dataset
print(merged_comorbidities_df.head())

   subject_id   hadm_id           admittime           dischtime gender  age  \
0    12198375  21636889 2176-12-27 20:05:00 2177-01-02 18:40:00      M   77   
1    12198375  21636889 2176-12-27 20:05:00 2177-01-02 18:40:00      M   77   
2    12198375  21636889 2176-12-27 20:05:00 2177-01-02 18:40:00      M   77   
3    12198375  21636889 2176-12-27 20:05:00 2177-01-02 18:40:00      M   77   
4    12198375  21636889 2176-12-27 20:05:00 2177-01-02 18:40:00      M   77   

    race  in_hospital_death  length_of_stay icd_code  seq_num  icd_version  
0  OTHER              False        5.940972     2539        1            9  
1  OTHER              False        5.940972      486        2            9  
2  OTHER              False        5.940972     1629        3            9  
3  OTHER              False        5.940972     5853        4            9  
4  OTHER              False        5.940972    78830        5            9  


In [23]:
merged_comorbidities_df.to_csv('NON_AP_ICD_Comorbidities_Dataset.csv', encoding='utf-8', index = False)

In [24]:
# Count the number of unique patients who died in the hospital
num_patients_with_death_info = merged_comorbidities_df[merged_comorbidities_df['in_hospital_death'] == True][['subject_id', 'hadm_id']].drop_duplicates().shape[0]
print(f"Number of unique patients who died in the hospital: {num_patients_with_death_info}")

Number of unique patients who died in the hospital: 52


In [25]:
# 1. Handle missing values
missing_data = merged_comorbidities_df.isnull().sum()
print(missing_data)

subject_id           0
hadm_id              0
admittime            0
dischtime            0
gender               0
age                  0
race                 0
in_hospital_death    0
length_of_stay       0
icd_code             0
seq_num              0
icd_version          0
dtype: int64
