In [None]:
# Import libraries
from datetime import timedelta
import os

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from IPython.display import display, HTML, Image
%matplotlib inline

# Plot defaults for the whole notebook: ggplot style sheet and a base font size of 20.
plt.style.use('ggplot')
plt.rcParams.update({'font.size': 20})

# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

In [None]:
# authenticate
# Runs Colab's user-authentication helper. Presumably this supplies the Google
# credentials used by the BigQuery calls later in the notebook — confirm.
auth.authenticate_user()

In [None]:
# GCP project used for the BigQuery queries in this notebook.
project_id = 'project-mimic-430923'

# Stop early if the template placeholder was never replaced.
if project_id == 'CHANGE-ME':
  raise ValueError('You must change project_id to your GCP project.')

# Export the project id through the process environment as well.
os.environ.update(GOOGLE_CLOUD_PROJECT=project_id)

# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
  """Run a standard-SQL query on BigQuery and return the rows as a DataFrame.

  Args:
    query: SQL text to execute.
    project_id: GCP project the query job runs under. Defaults to the
      notebook-level ``project_id``.

  Returns:
    A pandas DataFrame holding the query result.
  """
  # pd.io.gbq.read_gbq is deprecated: pandas >= 2.2 emits a FutureWarning on
  # every call and the function is slated for removal. Use the BigQuery client
  # that this notebook already imports instead. The client runs standard SQL
  # unless legacy SQL is requested, which matches the former dialect='standard'.
  client = bigquery.Client(project=project_id)
  return client.query(query).to_dataframe()

# set the dataset
# if you want to use the demo, change this to mimic_demo
# NOTE(review): the queries visible in this notebook hard-code
# `physionet-data.mimiciv_hosp` and never interpolate `dataset`, so changing
# this value may have no effect — confirm against the rest of the notebook.
dataset = 'mimiciv'

In [None]:
# 1. Get basic patient info (admissions + demographics (excluding marital_status) + death status)
# One row per admission; `anchor_age` is exposed under the name `age`.
# NOTE(review): anchor_age may not be the age at this specific admission — confirm
# against the MIMIC-IV documentation before using it as such.
patient_info_query = """
SELECT adm.subject_id, adm.hadm_id, adm.admittime, adm.dischtime, pat.gender, pat.anchor_age AS age, adm.race, adm.hospital_expire_flag
FROM `physionet-data.mimiciv_hosp.admissions` AS adm
JOIN `physionet-data.mimiciv_hosp.patients` AS pat
ON adm.subject_id = pat.subject_id
WHERE adm.admittime IS NOT NULL
ORDER BY subject_id
"""

patient_info_df = (
    run_query(patient_info_query)
    # Boolean death indicator derived from the 0/1 flag, plus parsed timestamps.
    .assign(
        in_hospital_death=lambda frame: frame['hospital_expire_flag'] == 1,
        admittime=lambda frame: pd.to_datetime(frame['admittime']),
        dischtime=lambda frame: pd.to_datetime(frame['dischtime']),
    )
    # The raw flag is redundant once the boolean column exists.
    .drop(columns=['hospital_expire_flag'])
    # Length of stay (L.O.S.) in days: discharge minus admission, seconds -> days.
    .assign(
        length_of_stay=lambda frame: (
            (frame['dischtime'] - frame['admittime']).dt.total_seconds() / (60 * 60 * 24)
        )
    )
    # Keep only admissions whose length of stay is strictly positive.
    .loc[lambda frame: frame['length_of_stay'] > 0]
)

  return pd.io.gbq.read_gbq(


In [None]:
# Show the cleaned admissions frame as this cell's output.
patient_info_df

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay
0,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,F,52,WHITE,False,1.015278
1,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,F,52,WHITE,False,2.222222
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,F,52,WHITE,False,1.754167
3,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,F,52,WHITE,False,0.786111
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,F,19,WHITE,False,0.298611
...,...,...,...,...,...,...,...,...,...
431226,19999828,29734428,2147-07-18 16:23:00,2147-08-04 18:10:00,F,46,WHITE,False,17.074306
431227,19999828,25744818,2149-01-08 16:44:00,2149-01-18 17:00:00,F,46,WHITE,False,10.011111
431228,19999840,26071774,2164-07-25 00:27:00,2164-07-28 12:15:00,M,58,WHITE,False,3.491667
431229,19999840,21033226,2164-09-10 13:47:00,2164-09-17 13:42:00,M,58,WHITE,True,6.996528


In [None]:
# Count number of patients
# Distinct subject_id values among the retained admissions (missing ids are not counted).
num_patients = patient_info_df['subject_id'].nunique(dropna=True)
print(f"Number of patients: {num_patients}")

Number of patients: 180677


In [None]:
# Diagnosis rows for admissions that carry NO acute-pancreatitis (AP) code
# (ICD-10 K85*, ICD-9 5770).
#
# Fixes relative to the previous query:
#  * `NOT LIKE 'K85%' OR != '5770'` was true for every code (no code matches
#    both patterns), so nothing was ever excluded.
#  * Filtering individual rows is not enough: an AP admission would still be
#    returned through its other diagnosis rows. NOT EXISTS drops the whole
#    admission when any of its codes is an AP code.
#  * Ordering by hadm_id and seq_num as well makes the row order within a
#    subject reproducible, so a later "keep the first row per subject" step
#    picks the same row on every run.
# NOTE(review): exclusion is per admission; if the control cohort must exclude
# patients with AP in *any* admission, correlate on subject_id instead.
not_ap_icd_query = """
SELECT dx.subject_id, dx.hadm_id, dx.icd_code
FROM `physionet-data.mimiciv_hosp.diagnoses_icd` AS dx
WHERE NOT EXISTS (
  SELECT 1
  FROM `physionet-data.mimiciv_hosp.diagnoses_icd` AS ap
  WHERE ap.hadm_id = dx.hadm_id
    AND (ap.icd_code LIKE 'K85%' OR ap.icd_code = '5770')
)
ORDER BY dx.subject_id, dx.hadm_id, dx.seq_num
"""
not_ap_icd_df = run_query(not_ap_icd_query)

  return pd.io.gbq.read_gbq(


In [None]:
# Show the diagnosis rows returned by the query as this cell's output.
not_ap_icd_df

Unnamed: 0,subject_id,hadm_id,icd_code
0,10000032,22595853,5723
1,10000032,22841357,07071
2,10000032,25742920,07054
3,10000032,29079034,45829
4,10000032,22595853,78959
...,...,...,...
4756321,19999987,23865745,41401
4756322,19999987,23865745,78039
4756323,19999987,23865745,0413
4756324,19999987,23865745,36846


In [None]:
# Number of distinct subject_id values in the diagnosis rows.
not_ap_icd_df['subject_id'].nunique()

180640

In [None]:
# Keep a single diagnosis row per patient: the first one encountered for each
# subject_id, following the current row order of not_ap_icd_df.
not_ap_icd_first_df = not_ap_icd_df.loc[
    lambda rows: ~rows.duplicated(subset='subject_id', keep='first')
]

In [None]:
# Attach the admission details (including length of stay) to each patient's kept
# diagnosis row. Inner join: only (subject_id, hadm_id) pairs found in both frames remain.
merged_not_ap_icd_df = patient_info_df.merge(
    not_ap_icd_first_df,
    how='inner',
    on=['subject_id', 'hadm_id'],
)

In [None]:
# Show the merged admission + diagnosis frame as this cell's output.
merged_not_ap_icd_df

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay,icd_code
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,F,52,WHITE,False,0.786111,5723
1,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,F,19,WHITE,False,0.298611,30500
2,10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,M,72,WHITE,False,4.538889,G3183
3,10000108,27250926,2163-09-27 23:17:00,2163-09-28 09:04:00,M,25,WHITE,False,0.407639,5283
4,10000117,22927623,2181-11-15 02:05:00,2181-11-15 14:52:00,F,48,WHITE,False,0.532639,R1310
...,...,...,...,...,...,...,...,...,...,...
180553,19999733,27674281,2152-07-08 23:29:00,2152-07-09 03:45:00,F,19,WHITE,False,0.177778,9953
180554,19999784,21739106,2119-09-05 11:20:00,2119-09-08 19:00:00,M,57,BLACK/AFRICAN AMERICAN,False,3.319444,Z5111
180555,19999828,25744818,2149-01-08 16:44:00,2149-01-18 17:00:00,F,46,WHITE,False,10.011111,T8141XA
180556,19999840,21033226,2164-09-10 13:47:00,2164-09-17 13:42:00,M,58,WHITE,True,6.996528,3453


In [None]:
# Draw 3900 rows at random with a fixed seed (reproducible), then renumber the
# index from 0 so the old row labels are discarded.
sampled_not_ap_df = (
    merged_not_ap_icd_df
    .sample(n=3900, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Count the number of distinct patients in the sampled non-AP dataset.
# (The previous comment and message described a lipase/CT cohort, which is not
# what this frame contains.)
num_not_ap_icd_df = sampled_not_ap_df['subject_id'].nunique()
print(f"Number of patients in the sampled non-AP dataset: {num_not_ap_icd_df}")

# Count the number of rows in the dataset
num_rows = sampled_not_ap_df.shape[0]
print(f"Number of rows in the dataset: {num_rows}")

Number of patients with high lipase levels who had a CT scan: 3900
Number of rows in the dataset: 3900


In [None]:
# Show the sampled frame as this cell's output.
sampled_not_ap_df

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,age,race,in_hospital_death,length_of_stay,icd_code
0,12892826,23568814,2167-11-17 10:50:00,2167-11-21 10:20:00,F,32,WHITE,False,3.979167,99663
1,18455209,26680179,2114-12-25 15:52:00,2114-12-26 17:40:00,M,43,WHITE,False,1.075000,I81
2,17386331,23099205,2158-07-06 09:00:00,2158-07-08 13:24:00,F,34,BLACK/AFRICAN AMERICAN,False,2.183333,2182
3,15587926,21685507,2120-03-14 06:39:00,2120-03-23 16:15:00,F,28,WHITE,False,9.400000,64781
4,18889070,26674366,2125-11-18 04:24:00,2125-11-24 12:25:00,F,47,WHITE,False,6.334028,431
...,...,...,...,...,...,...,...,...,...,...
3895,10848709,20516522,2170-01-06 13:23:00,2170-01-07 14:20:00,F,75,WHITE,False,1.039583,99672
3896,10436234,24751769,2145-02-08 22:05:00,2145-03-02 18:30:00,M,84,WHITE,False,21.850694,T402X5A
3897,16960643,22322966,2152-01-21 07:56:00,2152-01-25 17:11:00,M,81,ASIAN - ASIAN INDIAN,False,4.385417,R339
3898,14493403,20877932,2192-08-16 19:49:00,2192-08-19 11:20:00,F,34,WHITE,False,2.646528,99812


In [None]:
# Write the sampled cohort to a UTF-8 CSV file, leaving out the DataFrame index.
sampled_not_ap_df.to_csv(
    'NOT_AP_ICD_Dataset.csv',
    index=False,
    encoding='utf-8',
)

In [None]:
# 5. Retrieve comorbid conditions (ICD codes)
# Comma-separated list of the sampled admission ids, spliced into the IN (...) filter.
hadm_ids = ', '.join(map(str, sampled_not_ap_df['hadm_id'].unique()))
comorbidity_query = f"""
SELECT subject_id, hadm_id, icd_code, seq_num, icd_version
FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
WHERE hadm_id IN ({hadm_ids})
ORDER BY subject_id, hadm_id, seq_num
"""
comorbidities_df = run_query(comorbidity_query)

# Merge comorbidities with the dataset
# Left join keeps every sampled admission. Both frames have an `icd_code` column,
# so pandas names them icd_code_x (from the sample) and icd_code_y (from this query).
merged_comorbidities_df = sampled_not_ap_df.merge(
    comorbidities_df,
    how='left',
    on=['subject_id', 'hadm_id'],
)

# Display the final merged dataset
print(merged_comorbidities_df.head())

  return pd.io.gbq.read_gbq(


   subject_id   hadm_id           admittime           dischtime gender  age  \
0    12892826  23568814 2167-11-17 10:50:00 2167-11-21 10:20:00      F   32   
1    12892826  23568814 2167-11-17 10:50:00 2167-11-21 10:20:00      F   32   
2    12892826  23568814 2167-11-17 10:50:00 2167-11-21 10:20:00      F   32   
3    12892826  23568814 2167-11-17 10:50:00 2167-11-21 10:20:00      F   32   
4    12892826  23568814 2167-11-17 10:50:00 2167-11-21 10:20:00      F   32   

    race  in_hospital_death  length_of_stay icd_code_x icd_code_y  seq_num  \
0  WHITE              False        3.979167      99663      99663        1   
1  WHITE              False        3.979167      99663      E8798        2   
2  WHITE              False        3.979167      99663       7243        3   
3  WHITE              False        3.979167      99663       7295        4   
4  WHITE              False        3.979167      99663      33829        5   

   icd_version  
0            9  
1            9  
2    

In [None]:
# Write the sample joined with its diagnosis rows to a UTF-8 CSV file, leaving
# out the DataFrame index.
merged_comorbidities_df.to_csv(
    'NOT_AP_ICD_Comorbidities_Dataset.csv',
    index=False,
    encoding='utf-8',
)

In [None]:
# Count the number of unique patients who died in the hospital
# The merged frame has one row per diagnosis, so reduce the in-hospital-death rows
# to distinct (subject_id, hadm_id) pairs before counting.
num_patients_with_death_info = len(
    merged_comorbidities_df
    .loc[
        merged_comorbidities_df['in_hospital_death'].eq(True),
        ['subject_id', 'hadm_id'],
    ]
    .drop_duplicates()
)
print(f"Number of unique patients who died in the hospital: {num_patients_with_death_info}")

Number of unique patients who died in the hospital: 110


In [None]:
# 1. Handle missing values
# Tally the missing entries in each column; this cell only reports the counts and
# does not impute or drop anything.
missing_data = merged_comorbidities_df.isna().sum()
print(missing_data)

subject_id           0
hadm_id              0
admittime            0
dischtime            0
gender               0
age                  0
race                 0
in_hospital_death    0
length_of_stay       0
icd_code_x           0
icd_code_y           0
seq_num              0
icd_version          0
dtype: int64
