In [110]:
# Connect to the Google Account

import subprocess
# Install the Google Cloud SDK
subprocess.run(["gcloud", "auth", "application-default", "login"])

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=5oeiMRdKxqmWTooVZme2fBxWId1GdK&access_type=offline&code_challenge=Sfq-u31YZBjCV7PSpdMyVGQj1vWUvnn2yC-9sUW2aNE&code_challenge_method=S256


Credentials saved to file: [/Users/zhuyu/.config/gcloud/application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).

Quota project "carbon-virtue-378402" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.


CompletedProcess(args=['gcloud', 'auth', 'application-default', 'login'], returncode=0)

In [111]:
from google.cloud import bigquery
import pandas as pd
import os

# Initialize BigQuery client
client = bigquery.Client()

# Function to run queries and return DataFrame
def run_query(query):
    return client.query(query).to_dataframe()

In [112]:
# 1. Get basic patient info (admissions + demographics (excluding marital_status) + death status)
patient_info_query = """
SELECT adm.subject_id, adm.hadm_id, adm.admittime, adm.dischtime, 
       pat.gender, pat.anchor_age AS approximate_age_at_admission, 
       adm.race, adm.hospital_expire_flag,
       DATETIME_DIFF(adm.admittime, DATETIME(pat.anchor_year, 1, 1, 0, 0, 0), YEAR) + pat.anchor_age AS actual_age
FROM `physionet-data.mimiciv_hosp.admissions` AS adm
JOIN `physionet-data.mimiciv_hosp.patients` AS pat
ON adm.subject_id = pat.subject_id
WHERE adm.admittime IS NOT NULL
ORDER BY subject_id
"""
patient_info_df = run_query(patient_info_query)

# Set in-hospital death status
patient_info_df['in_hospital_death'] = patient_info_df['hospital_expire_flag'] == 1
patient_info_df = patient_info_df.drop(columns=['hospital_expire_flag'])

# Calculate length of stay and keep data with positive L.O.S
patient_info_df['admittime'] = pd.to_datetime(patient_info_df['admittime'])
patient_info_df['dischtime'] = pd.to_datetime(patient_info_df['dischtime'])
patient_info_df['length_of_stay'] = (patient_info_df['dischtime'] - patient_info_df['admittime']).dt.total_seconds() / (60 * 60 * 24)
patient_info_df = patient_info_df[patient_info_df['length_of_stay'] > 0]

# Display a sample of the resulting DataFrame
print(patient_info_df.head())
print(f"Number of rows in the dataset: {patient_info_df.shape[0]}")



   subject_id   hadm_id           admittime           dischtime gender  \
0    10000032  22841357 2180-06-26 18:27:00 2180-06-27 18:49:00      F   
1    10000032  29079034 2180-07-23 12:35:00 2180-07-25 17:55:00      F   
2    10000032  25742920 2180-08-05 23:44:00 2180-08-07 17:50:00      F   
3    10000032  22595853 2180-05-06 22:23:00 2180-05-07 17:15:00      F   
4    10000068  25022803 2160-03-03 23:16:00 2160-03-04 06:26:00      F   

   approximate_age_at_admission   race  actual_age  in_hospital_death  \
0                            52  WHITE          52              False   
1                            52  WHITE          52              False   
2                            52  WHITE          52              False   
3                            52  WHITE          52              False   
4                            19  WHITE          19              False   

   length_of_stay  
0        1.015278  
1        2.222222  
2        1.754167  
3        0.786111  
4        0.29861

In [113]:
patient_info_df

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,approximate_age_at_admission,race,actual_age,in_hospital_death,length_of_stay
0,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,F,52,WHITE,52,False,1.015278
1,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,F,52,WHITE,52,False,2.222222
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,F,52,WHITE,52,False,1.754167
3,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,F,52,WHITE,52,False,0.786111
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,F,19,WHITE,19,False,0.298611
...,...,...,...,...,...,...,...,...,...,...
431226,19999828,29734428,2147-07-18 16:23:00,2147-08-04 18:10:00,F,46,WHITE,46,False,17.074306
431227,19999828,25744818,2149-01-08 16:44:00,2149-01-18 17:00:00,F,46,WHITE,48,False,10.011111
431228,19999840,26071774,2164-07-25 00:27:00,2164-07-28 12:15:00,M,58,WHITE,58,False,3.491667
431229,19999840,21033226,2164-09-10 13:47:00,2164-09-17 13:42:00,M,58,WHITE,58,True,6.996528


In [114]:
# Count number of patients
num_patients = patient_info_df['subject_id'].nunique()
print(f"Number of patients: {num_patients}")

Number of patients: 180677


In [115]:
# 2. Get high lipase level patients

# Get item IDs for lipase tests in ICU
lipase_item_query_icu = """
SELECT itemid, label, category
FROM `physionet-data.mimiciv_icu.d_items`
WHERE LOWER(label) LIKE '%lipase%'
"""
lipase_items_df_icu = run_query(lipase_item_query_icu)
print(lipase_items_df_icu)
lipase_itemids_icu = "225672"  # Lipase item IDs for ICU

# Get item IDs for lipase tests in hosp
lipase_item_query_hosp = """
SELECT itemid, label, fluid
FROM `physionet-data.mimiciv_hosp.d_labitems`
WHERE LOWER(label) LIKE '%lipase%'
"""
lipase_items_df_hosp = run_query(lipase_item_query_hosp)
print(lipase_items_df_hosp)
lipase_items_hosp = [50956, 50844, 51055, 51036] # blood and other fluid # Lipase item IDs [50956]# only blood

# Retrieve lipase values from ICU
lipase_values_query_icu = f"""
SELECT subject_id, hadm_id, charttime, valuenum AS lipase_level
FROM `physionet-data.mimiciv_icu.chartevents`
WHERE itemid IN ({lipase_itemids_icu})
ORDER BY subject_id, charttime
"""
lipase_values_df_icu = run_query(lipase_values_query_icu)

# Retrieve lipase values from hosp
lipase_values_query_hosp = f"""
SELECT subject_id, hadm_id, charttime, valuenum AS lipase_level
FROM `physionet-data.mimiciv_hosp.labevents`
WHERE itemid IN ({', '.join(map(str, lipase_items_hosp))})
ORDER BY subject_id, charttime
"""
lipase_values_df_hosp = run_query(lipase_values_query_hosp)

# Combine ICU and hosp lipase records, retaining the earliest record for each subject_id and hadm_id
lipase_values_df = pd.concat([lipase_values_df_icu, lipase_values_df_hosp], ignore_index=True)
lipase_values_df = lipase_values_df.sort_values(by=['subject_id', 'hadm_id', 'charttime']).drop_duplicates(subset=['subject_id', 'hadm_id'], keep='first')

# Merge with patient info and clean data
lipase_values_df = pd.merge(lipase_values_df, patient_info_df.copy(), on=['subject_id', 'hadm_id'], how='left', suffixes=('', '_lipase'))
lipase_values_df.dropna(subset=['actual_age', 'lipase_level'], inplace=True)

# Identify patients with high lipase levels
def check_lipase(row):
    upper_limit = 140 if row['actual_age'] < 60 else 151
    return row['lipase_level'] >= 3 * upper_limit

high_lipase_df = lipase_values_df[lipase_values_df.apply(check_lipase, axis=1)]



   itemid   label category
0  225672  Lipase     Labs




   itemid               label             fluid
0   50956              Lipase             Blood
1   50844     Lipase, Ascites           Ascites
2   51055     Lipase, Pleural           Pleural
3   51036  Lipase, Body Fluid  Other Body Fluid




In [116]:
# Display the result
print(high_lipase_df.head())
print(f"Number of high lipase cases: {high_lipase_df.shape[0]}")
print(f"Number of unique patients with high lipase levels: {high_lipase_df['subject_id'].nunique()}")

     subject_id   hadm_id           charttime  lipase_level  \
57     10004606  29242151 2159-02-20 18:30:00        1222.0   
94     10006431  24638489 2129-01-23 23:36:00         508.0   
227    10017531  22580355 2159-09-22 20:56:00        1164.0   
265    10021357  25937617 2144-12-30 06:55:00        1249.0   
449    10037818  21016472 2189-09-15 23:23:00        2519.0   

              admittime           dischtime gender  \
57  2159-02-20 13:43:00 2159-03-06 16:51:00      F   
94  2129-01-24 01:08:00 2129-01-30 16:50:00      F   
227 2159-09-22 19:30:00 2159-10-24 13:40:00      M   
265 2144-12-27 19:41:00 2145-01-04 19:54:00      F   
449 2189-09-16 02:13:00 2189-09-18 01:10:00      F   

     approximate_age_at_admission   race  actual_age  in_hospital_death  \
57                             64  WHITE          64              False   
94                             66  WHITE          67              False   
227                            63  WHITE          64              False

In [117]:
# Generate a CSV file with the high lipase cases
high_lipase_df.to_csv('high_lipase_cases.csv', index=False)

In [118]:
# 3. Get high amylase level patients
# Step 1: Retrieve Amylase Item IDs for both hosp and ICU
amylase_item_query_hosp = """
SELECT itemid, label, fluid
FROM `physionet-data.mimiciv_hosp.d_labitems`
WHERE LOWER(label) LIKE '%amylase%'
"""
amylase_items_df_hosp = run_query(amylase_item_query_hosp)
print("Amylase Items in hosp:")
print(amylase_items_df_hosp)

amylase_item_query_icu = """
SELECT itemid, label, category
FROM `physionet-data.mimiciv_icu.d_items`
WHERE LOWER(label) LIKE '%amylase%'
"""
amylase_items_df_icu = run_query(amylase_item_query_icu)
print("Amylase Items in icu:")
print(amylase_items_df_icu)

# Identified Amylase item IDs for both hosp and ICU
amylase_itemids = [50867, 53087, 51964, 220581]

# Step 2: Retrieve Amylase Values from hosp
amylase_values_query_hosp = f"""
SELECT subject_id, hadm_id, charttime, valuenum AS amylase_level
FROM `physionet-data.mimiciv_hosp.labevents`
WHERE itemid IN ({', '.join(map(str, amylase_itemids))})
ORDER BY subject_id, charttime
"""
amylase_values_df_hosp = run_query(amylase_values_query_hosp)

# Step 3: Retrieve Amylase Values from ICU
amylase_values_query_icu = f"""
SELECT subject_id, hadm_id, charttime, valuenum AS amylase_level
FROM `physionet-data.mimiciv_icu.chartevents`
WHERE itemid IN ({', '.join(map(str, amylase_itemids))})
ORDER BY subject_id, charttime
"""
amylase_values_df_icu = run_query(amylase_values_query_icu)

# Step 4: Concatenate ICU and hosp data, keep only the earliest record for each patient
amylase_values_df = pd.concat([amylase_values_df_hosp, amylase_values_df_icu], ignore_index=True)
amylase_values_df = amylase_values_df.sort_values(by=['subject_id', 'hadm_id', 'charttime']).drop_duplicates(subset=['subject_id', 'hadm_id'], keep='first')

# Step 5: Identify patients with high Amylase levels (e.g., > 1000 IU/L)
amylase_critical_df = amylase_values_df[amylase_values_df['amylase_level'] > 1000]

# Count number of unique patients with high amylase levels
num_patients = amylase_critical_df['subject_id'].nunique()
print(amylase_critical_df.head())
print(f"Number of high amylase cases: {amylase_critical_df.shape[0]}")
print(f"Number of unique patients with high amylase levels: {num_patients}")

# Step 6: Inner join high Amylase level records with high Lipase level records
hl_ha_df = pd.merge(
    high_lipase_df.copy(),
    amylase_critical_df[['subject_id', 'hadm_id', 'charttime', 'amylase_level']],
    on=['subject_id', 'hadm_id'],
    how='outer',
    suffixes=('_lipase','_amylase')
)



Amylase Items in hosp:
    itemid                            label                fluid
0    50867                          Amylase                Blood
1    53087                          Amylase                Blood
2    51930                   Amylase, Stool                Stool
3    51072                   Amylase, Urine                Urine
4    51073  Amylase/Creatinine Ratio, Urine                Urine
5    51963     Amylase/Creatinine Clearance                Urine
6    51964                   Amylase, Serum                Urine
7    51999                    Urine Amylase                Urine
8    50836                 Amylase, Ascites              Ascites
9    51047                 Amylase, Pleural              Pleural
10   51020             Amylase, Joint Fluid          Joint Fluid
11   51026              Amylase, Body Fluid     Other Body Fluid
12   51780                     Amylase, CSF  Cerebrospinal Fluid




Amylase Items in icu:
   itemid    label category
0  220581  Amylase     Labs




      subject_id   hadm_id           charttime  amylase_level
767     10112484      <NA> 2124-05-30 07:10:00         2173.0
1087    10149959  24022109 2144-11-28 09:15:00         1056.0
1560    10219697      <NA> 2114-09-17 11:29:00         2219.0
2407    10305478  20378167 2124-09-28 07:20:00         1129.0
2419    10305478  27897783 2126-06-24 07:30:00         1718.0
Number of high amylase cases: 236
Number of unique patients with high amylase levels: 222


In [119]:
# Display the combined dataset
print(hl_ha_df.head())
print(f"Number of rows in the combined dataset with high lipase or high amylase levels: {hl_ha_df.shape[0]}")
print(f"Number of unique patients in the combined dataset with high lipase or high amylase levels: {hl_ha_df['subject_id'].nunique()}")

   subject_id   hadm_id    charttime_lipase  lipase_level           admittime  \
0    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
1    10006431  24638489 2129-01-23 23:36:00         508.0 2129-01-24 01:08:00   
2    10017531  22580355 2159-09-22 20:56:00        1164.0 2159-09-22 19:30:00   
3    10021357  25937617 2144-12-30 06:55:00        1249.0 2144-12-27 19:41:00   
4    10037818  21016472 2189-09-15 23:23:00        2519.0 2189-09-16 02:13:00   

            dischtime gender  approximate_age_at_admission   race  actual_age  \
0 2159-03-06 16:51:00      F                            64  WHITE          64   
1 2129-01-30 16:50:00      F                            66  WHITE          67   
2 2159-10-24 13:40:00      M                            63  WHITE          64   
3 2145-01-04 19:54:00      F                            91  WHITE          91   
4 2189-09-18 01:10:00      F                            61  WHITE          61   

   in_hospital_death  leng

In [120]:
# 4. Get high CRP level patients
# Step 1: Retrieve CRP Item IDs for both hosp and ICU
crp_item_query_hosp = """
SELECT itemid, label, fluid
FROM `physionet-data.mimiciv_hosp.d_labitems`
WHERE LOWER(label) LIKE '%c-reactive%' or LOWER(label) LIKE '%protein%'
"""
crp_items_df_hosp = run_query(crp_item_query_hosp)
print("CRP Items in hosp:")
print(crp_items_df_hosp)

crp_item_query_icu = """
SELECT itemid, label, category
FROM `physionet-data.mimiciv_icu.d_items`
WHERE LOWER(label) LIKE '%c-reactive%' or LOWER(label) LIKE '%protein%'
"""
crp_items_df_icu = run_query(crp_item_query_icu)
print("CRP Items in icu:")
print(crp_items_df_icu)

crp_itemid_hosp = "50889"
crp_itemid_icu = "51006"

# Step 2: Retrieve CRP values from hosp
crp_values_query_hosp = f"""
SELECT subject_id, hadm_id, charttime, valuenum AS crp_level
FROM `physionet-data.mimiciv_hosp.labevents`
WHERE itemid IN ({crp_itemid_hosp})
ORDER BY subject_id, charttime
"""
crp_values_df_hosp = run_query(crp_values_query_hosp)

# Step 3: Retrieve CRP values from icu
crp_values_query_icu = f"""
SELECT subject_id, hadm_id, charttime, valuenum AS crp_level
FROM `physionet-data.mimiciv_icu.chartevents`
WHERE itemid IN ({crp_itemid_icu})
ORDER BY subject_id, charttime
"""
crp_values_df_icu = run_query(crp_values_query_icu)

# Step 4: Concatenate ICU and hosp data, keep only the earliest record for each patient
crp_values_df = pd.concat([crp_values_df_hosp, crp_values_df_icu], ignore_index=True)
crp_values_df = crp_values_df.sort_values(by=['subject_id', 'hadm_id', 'charttime']).drop_duplicates(subset=['subject_id', 'hadm_id'], keep='first')

# Step 5: Merge CRP values with patient info to calculate time difference within 48 hrs
crp_values_df = pd.merge(crp_values_df, patient_info_df[['subject_id', 'hadm_id', 'admittime']], on=['subject_id', 'hadm_id'])
crp_values_df['charttime'] = pd.to_datetime(crp_values_df['charttime'])
crp_values_df['within_48h'] = (crp_values_df['charttime'] - crp_values_df['admittime']).dt.total_seconds() / (60 * 60) <= 48

# Step 6: Filter records with CRP > 150 mg/dL within 48 hrs
crp_critical_df = crp_values_df[(crp_values_df['crp_level'] > 150) & (crp_values_df['within_48h'])]

# Count the number of unique patients with high CRP levels
num_patients = crp_critical_df['subject_id'].nunique()
print(f"Number of unique patients with high CRP levels within 48 hrs: {num_patients}")

# Step : Outer join CRP critical data with the existing combined dataset
hl_ha_hc_df = pd.merge(
    hl_ha_df.copy(),
    crp_critical_df[['subject_id', 'hadm_id', 'charttime', 'crp_level']],
    on=['subject_id', 'hadm_id'],
    how='outer',
    suffixes=('', '_crp')
)



CRP Items in hosp:
    itemid                       label                fluid
0    50864           Alpha-Fetoprotein                Blood
1    50889          C-Reactive Protein                Blood
2    50975     Protein Electrophoresis                Blood
3    50976              Protein, Total                Blood
4    53096              Protein, Total                Blood
5    51949        Total Protein, Stool                Stool
6    51068               24 hr Protein                Urine
7    51099    Protein/Creatinine Ratio                Urine
8    51102        Total Protein, Urine                Urine
9    51992                     Protein                Urine
10   50849      Total Protein, Ascites              Ascites
11   51059      Total Protein, Pleural              Pleural
12   51024  Total Protein, Joint Fluid          Joint Fluid
13   51043   Total Protein, Body Fluid     Other Body Fluid
14   51802          Total Protein, CSF  Cerebrospinal Fluid
15   51270          P



CRP Items in icu:
   itemid                       label                 category
0  229583                Beneprotein.  Nutrition - Supplements
1  220454                     Protein              Ingredients
2  220612   ZC Reactive Protein (CRP)                     Labs
3  220650               Total Protein                     Labs
4  227444    C Reactive Protein (CRP)                     Labs
5  226184  Estimated Protein Needs/Kg                  General
6  225970                 Beneprotein      Nutrition - Enteral
7  229296   Vital High Protein (Full)      Nutrition - Enteral




Number of unique patients with high CRP levels within 48 hrs: 1763


  crp_values_df = pd.concat([crp_values_df_hosp, crp_values_df_icu], ignore_index=True)


In [121]:
hl_ha_hc_df.set_index(['subject_id', 'hadm_id'], inplace=True)
hl_ha_hc_df.update(patient_info_df.set_index(['subject_id', 'hadm_id']))
hl_ha_hc_df.reset_index(inplace=True)

In [122]:
# Display the combined dataset
print(hl_ha_hc_df.head())
print(f"Number of rows in the combined dataset with high lipase or high amylase or high CRP levels: {hl_ha_hc_df.shape[0]}")
print(f"Number of unique patients in the combined dataset with high lipase or high amylase or high CRP levels: {hl_ha_hc_df['subject_id'].nunique()}")

   subject_id   hadm_id    charttime_lipase  lipase_level           admittime  \
0    10002976  27179825                 NaT           NaN 2145-02-28 19:44:00   
1    10003400  23559586                 NaT           NaN 2137-08-04 00:07:00   
2    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
3    10006431  24638489 2129-01-23 23:36:00         508.0 2129-01-24 01:08:00   
4    10006513  29846618                 NaT           NaN 2127-03-27 14:52:00   

            dischtime gender  approximate_age_at_admission  \
0 2145-03-05 16:45:00      M                            70   
1 2137-09-02 17:05:00      F                            72   
2 2159-03-06 16:51:00      F                            64   
3 2129-01-30 16:50:00      F                            66   
4 2127-03-28 15:20:00      M                            42   

                     race  actual_age  in_hospital_death  length_of_stay  \
0                   WHITE          71              False        

In [123]:
# 5. Get AP ICD Info
# Step 1: Retrieve records with AP diagnosis based on ICD codes
ap_icd_query = f"""
SELECT subject_id, hadm_id, icd_code, seq_num 
FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
WHERE icd_code LIKE 'K85%' OR icd_code = '5770'
ORDER BY subject_id, seq_num
"""
ap_icd_df = run_query(ap_icd_query)

# Keep only the first record for each patient
ap_icd_df = ap_icd_df.drop_duplicates(subset=['subject_id', 'hadm_id'], keep='first')

# Step 2: Merge AP diagnosis info with the existing combined dataset
combined_df_with_ap = pd.merge(
    hl_ha_hc_df.copy(),
    ap_icd_df[['subject_id', 'hadm_id', 'icd_code']],
    on=['subject_id', 'hadm_id'],
    how='left'
)

# Step 3: Label records with AP confirmation
combined_df_with_ap['is_confirmed_ap'] = combined_df_with_ap['icd_code'].notna()

# Display the resulting dataset
print(combined_df_with_ap.head())

# Count unique patients confirmed with AP
num_confirmed_ap_patients = combined_df_with_ap[combined_df_with_ap['is_confirmed_ap'] == True]['subject_id'].nunique()
print(f"Number of unique patients with high levels who are confirmed with AP: {num_confirmed_ap_patients}")

# Total number of rows in the combined dataset with AP confirmation
num_rows = combined_df_with_ap.shape[0]
print(f"Number of rows in the combined dataset with AP confirmation: {num_rows}")



   subject_id   hadm_id    charttime_lipase  lipase_level           admittime  \
0    10002976  27179825                 NaT           NaN 2145-02-28 19:44:00   
1    10003400  23559586                 NaT           NaN 2137-08-04 00:07:00   
2    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
3    10006431  24638489 2129-01-23 23:36:00         508.0 2129-01-24 01:08:00   
4    10006513  29846618                 NaT           NaN 2127-03-27 14:52:00   

            dischtime gender  approximate_age_at_admission  \
0 2145-03-05 16:45:00      M                            70   
1 2137-09-02 17:05:00      F                            72   
2 2159-03-06 16:51:00      F                            64   
3 2129-01-30 16:50:00      F                            66   
4 2127-03-28 15:20:00      M                            42   

                     race  actual_age  in_hospital_death  length_of_stay  \
0                   WHITE          71              False        

In [124]:
# 6. Get Weight Info
# Step 1: Retrieve weight data for ICU patients (only admit and daily weight)
weight_query = """
SELECT
    subject_id, hadm_id, stay_id, charttime,
    CASE WHEN itemid = 226512 THEN 'admit' ELSE 'daily' END AS weight_type,
    valuenum AS weight
FROM `physionet-data.mimiciv_icu.chartevents`
WHERE valuenum IS NOT NULL
  AND itemid IN (226512, 224639)  -- Admit Weight and Daily Weight
  AND valuenum > 0
ORDER BY subject_id, charttime
"""

# Step 2: Run the query to retrieve weight data
weight_df = run_query(weight_query)

# Step 3: Calculate the average weight per patient (admit and daily weights)
# Group by patient identifiers and calculate the mean weight
average_weight_df = weight_df.groupby(['subject_id', 'hadm_id']).agg(
    average_weight=('weight', 'mean')
).reset_index()

# Step 4: Merge the average weight data with the main combined dataset
combined_df_with_avg_weight = pd.merge(
    combined_df_with_ap.copy(),
    average_weight_df[['subject_id', 'hadm_id', 'average_weight']],
    on=['subject_id', 'hadm_id'],
    how='left'
)

# Display the resulting DataFrame
print(combined_df_with_avg_weight.head())
print(f"Number of rows with average weight data: {combined_df_with_avg_weight.shape[0]}")



   subject_id   hadm_id    charttime_lipase  lipase_level           admittime  \
0    10002976  27179825                 NaT           NaN 2145-02-28 19:44:00   
1    10003400  23559586                 NaT           NaN 2137-08-04 00:07:00   
2    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
3    10006431  24638489 2129-01-23 23:36:00         508.0 2129-01-24 01:08:00   
4    10006513  29846618                 NaT           NaN 2127-03-27 14:52:00   

            dischtime gender  approximate_age_at_admission  \
0 2145-03-05 16:45:00      M                            70   
1 2137-09-02 17:05:00      F                            72   
2 2159-03-06 16:51:00      F                            64   
3 2129-01-30 16:50:00      F                            66   
4 2127-03-28 15:20:00      M                            42   

                     race  actual_age  in_hospital_death  length_of_stay  \
0                   WHITE          71              False        

In [125]:
# 7. Get CCI Info
# Step 1: Query for Charlson Comorbidity Index (CCI) based on ICD codes
cci_query = """
WITH diag AS (
    SELECT 
        hadm_id,
        CASE WHEN icd_version = 9 THEN icd_code ELSE NULL END AS icd9_code,
        CASE WHEN icd_version = 10 THEN icd_code ELSE NULL END AS icd10_code
    FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
),
com AS (
    SELECT ad.hadm_id,
        -- Myocardial infarction
        MAX(CASE WHEN SUBSTR(icd9_code, 1, 3) IN ('410','412') OR SUBSTR(icd10_code, 1, 3) IN ('I21','I22') OR SUBSTR(icd10_code, 1, 4) = 'I252' THEN 1 ELSE 0 END) AS myocardial_infarct,
        -- Congestive heart failure
        MAX(CASE WHEN SUBSTR(icd9_code, 1, 3) = '428' OR SUBSTR(icd9_code, 1, 5) IN ('39891','40201','40211','40291','40401','40403','40411','40413','40491','40493') OR SUBSTR(icd9_code, 1, 4) BETWEEN '4254' AND '4259' OR SUBSTR(icd10_code, 1, 3) IN ('I43','I50') OR SUBSTR(icd10_code, 1, 4) IN ('I099','I110','I130','I132','I255','I420','I425','I426','I427','I428','I429','P290') THEN 1 ELSE 0 END) AS congestive_heart_failure,
        -- Peripheral vascular disease
        MAX(CASE WHEN SUBSTR(icd9_code, 1, 3) IN ('440','441') OR SUBSTR(icd9_code, 1, 4) IN ('0930','4373','4471','5571','5579','V434') OR SUBSTR(icd9_code, 1, 4) BETWEEN '4431' AND '4439' OR SUBSTR(icd10_code, 1, 3) IN ('I70','I71') OR SUBSTR(icd10_code, 1, 4) IN ('I731','I738','I739','I771','I790','I792','K551','K558','K559','Z958','Z959') THEN 1 ELSE 0 END) AS peripheral_vascular_disease,
        -- Additional comorbidity definitions (Cerebrovascular disease, Dementia, Chronic pulmonary disease, etc.)
        -- Add other conditions following similar MAX/CASE structure as above for each comorbidity
    FROM `physionet-data.mimiciv_hosp.admissions` ad
    LEFT JOIN diag ON ad.hadm_id = diag.hadm_id
    GROUP BY ad.hadm_id
),
ag AS (
    SELECT 
        hadm_id,
        age,
        CASE WHEN age <= 40 THEN 0 WHEN age <= 50 THEN 1 WHEN age <= 60 THEN 2 WHEN age <= 70 THEN 3 ELSE 4 END AS age_score
    FROM `physionet-data.mimiciv_derived.age`
)
SELECT 
    ad.subject_id,
    ad.hadm_id,
    ag.age_score,
    myocardial_infarct,
    congestive_heart_failure,
    peripheral_vascular_disease,
    -- Include all other comorbidities fields here...
    age_score + myocardial_infarct + congestive_heart_failure + peripheral_vascular_disease
    -- + add all the weighted conditions here as in your full CCI calculation
    AS charlson_comorbidity_index
FROM `physionet-data.mimiciv_hosp.admissions` ad
LEFT JOIN com ON ad.hadm_id = com.hadm_id
LEFT JOIN ag ON com.hadm_id = ag.hadm_id
"""

# Step 2: Run the query to get CCI data
cci_df = run_query(cci_query)

# Step 3: Merge CCI data with the main dataset based on `subject_id` and `hadm_id`
combined_df_with_ap = pd.merge(
    combined_df_with_avg_weight.copy(),
    cci_df[['subject_id', 'hadm_id', 'charlson_comorbidity_index']],
    on=['subject_id', 'hadm_id'],
    how='left'
)

# Step 4: Display the final combined dataset with CCI information
print(combined_df_with_ap.head())
print(f"Number of rows in the combined dataset with CCI: {combined_df_with_ap.shape[0]}")



   subject_id   hadm_id    charttime_lipase  lipase_level           admittime  \
0    10002976  27179825                 NaT           NaN 2145-02-28 19:44:00   
1    10003400  23559586                 NaT           NaN 2137-08-04 00:07:00   
2    10004606  29242151 2159-02-20 18:30:00        1222.0 2159-02-20 13:43:00   
3    10006431  24638489 2129-01-23 23:36:00         508.0 2129-01-24 01:08:00   
4    10006513  29846618                 NaT           NaN 2127-03-27 14:52:00   

            dischtime gender  approximate_age_at_admission  \
0 2145-03-05 16:45:00      M                            70   
1 2137-09-02 17:05:00      F                            72   
2 2159-03-06 16:51:00      F                            64   
3 2129-01-30 16:50:00      F                            66   
4 2127-03-28 15:20:00      M                            42   

                     race  actual_age  in_hospital_death  length_of_stay  \
0                   WHITE          71              False        

In [126]:
# 8. Get vital signs info
# Step 1: Get the list of unique subject_id and hadm_id
subject_ids = combined_df_with_ap['subject_id'].dropna().unique().tolist()
hadm_ids = combined_df_with_ap['hadm_id'].dropna().unique().tolist()

# Step 2: Define the time limit for the first 24 hours
time_limit = 24 * 60 * 60  # 24 hours in seconds

# Step 3: Modify each query to select only the first record within 24 hours of admission

# 1.1 Respiratory Rate (RR)
icu_respiratory_rate_query = f"""
SELECT icu.subject_id, icu.hadm_id, icu.charttime, icu.itemid, icu.valuenum AS respiratory_rate,
FROM `physionet-data.mimiciv_icu.chartevents` icu
JOIN `physionet-data.mimiciv_hosp.admissions` adm
ON icu.subject_id = adm.subject_id AND icu.hadm_id = adm.hadm_id
WHERE icu.itemid = 220210
AND TIMESTAMP_DIFF(icu.charttime, adm.admittime, SECOND) <= {time_limit}
AND icu.subject_id IN ({', '.join(map(str, subject_ids))})
AND icu.hadm_id IN ({', '.join(map(str, hadm_ids))})
ORDER BY icu.subject_id, icu.hadm_id, icu.charttime
"""
icu_respiratory_rate_df = run_query(icu_respiratory_rate_query)
icu_respiratory_rate_df = icu_respiratory_rate_df.drop_duplicates(subset=['subject_id', 'hadm_id'], keep='first')
print(icu_respiratory_rate_df)

# 1.2 Heart Rate (HR)
icu_heart_rate_query = f"""
SELECT icu.subject_id, icu.hadm_id, icu.charttime, icu.itemid, icu.valuenum AS heart_rate
FROM `physionet-data.mimiciv_icu.chartevents` icu
JOIN `physionet-data.mimiciv_hosp.admissions` adm
ON icu.subject_id = adm.subject_id AND icu.hadm_id = adm.hadm_id
WHERE icu.itemid = 220045
AND TIMESTAMP_DIFF(icu.charttime, adm.admittime, SECOND) <= {time_limit}
AND icu.subject_id IN ({', '.join(map(str, subject_ids))})
AND icu.hadm_id IN ({', '.join(map(str, hadm_ids))})
ORDER BY icu.subject_id, icu.hadm_id, icu.charttime
"""
icu_heart_rate_df = run_query(icu_heart_rate_query)
icu_heart_rate_df = icu_heart_rate_df.drop_duplicates(subset=['subject_id', 'hadm_id'], keep='first')
print(icu_heart_rate_df)

# 1.3 Oxygen Saturation (OS)
hosp_oxygen_saturation_query = f"""
SELECT hosp.subject_id, hosp.hadm_id, hosp.charttime, hosp.itemid, hosp.valuenum AS oxygen_saturation
FROM `physionet-data.mimiciv_hosp.labevents` hosp
JOIN `physionet-data.mimiciv_hosp.admissions` adm
ON hosp.subject_id = adm.subject_id AND hosp.hadm_id = adm.hadm_id
WHERE hosp.itemid = 50817
AND TIMESTAMP_DIFF(hosp.charttime, adm.admittime, SECOND) <= {time_limit}
AND hosp.subject_id IN ({', '.join(map(str, subject_ids))})
AND hosp.hadm_id IN ({', '.join(map(str, hadm_ids))})
ORDER BY hosp.subject_id, hosp.hadm_id, hosp.charttime
"""
hosp_oxygen_saturation_df = run_query(hosp_oxygen_saturation_query)
hosp_oxygen_saturation_df = hosp_oxygen_saturation_df.drop_duplicates(subset=['subject_id', 'hadm_id'], keep='first')
print(hosp_oxygen_saturation_df)

# 1.4 Temperature
hosp_temperature_query = f"""
SELECT hosp.subject_id, hosp.hadm_id, hosp.charttime, hosp.itemid, hosp.valuenum AS temperature
FROM `physionet-data.mimiciv_hosp.labevents` hosp
JOIN `physionet-data.mimiciv_hosp.admissions` adm
ON hosp.subject_id = adm.subject_id AND hosp.hadm_id = adm.hadm_id
WHERE hosp.itemid = 50825
AND TIMESTAMP_DIFF(hosp.charttime, adm.admittime, SECOND) <= {time_limit}
AND hosp.subject_id IN ({', '.join(map(str, subject_ids))})
AND hosp.hadm_id IN ({', '.join(map(str, hadm_ids))})
ORDER BY hosp.subject_id, hosp.hadm_id, hosp.charttime
"""
hosp_temperature_df = run_query(hosp_temperature_query)
hosp_temperature_df = hosp_temperature_df.drop_duplicates(subset=['subject_id', 'hadm_id'], keep='first')
print(hosp_temperature_df)

# Step 4: Merge these results with the main dataset
combined_df_with_vitals = combined_df_with_ap.copy()
combined_df_with_vitals = pd.merge(combined_df_with_vitals, icu_respiratory_rate_df[['subject_id', 'hadm_id', 'respiratory_rate']], on=['subject_id', 'hadm_id'], how='left')
combined_df_with_vitals = pd.merge(combined_df_with_vitals, icu_heart_rate_df[['subject_id', 'hadm_id', 'heart_rate']], on=['subject_id', 'hadm_id'], how='left')
combined_df_with_vitals = pd.merge(combined_df_with_vitals, hosp_oxygen_saturation_df[['subject_id', 'hadm_id', 'oxygen_saturation']], on=['subject_id', 'hadm_id'], how='left')
combined_df_with_vitals = pd.merge(combined_df_with_vitals, hosp_temperature_df[['subject_id', 'hadm_id', 'temperature']], on=['subject_id', 'hadm_id'], how='left')

# Display the final combined dataset
print(combined_df_with_vitals.head())
print(f"Number of rows in the combined dataset with vital signs: {combined_df_with_vitals.shape[0]}")



       subject_id   hadm_id           charttime  itemid  respiratory_rate
0        10004606  29242151 2159-02-20 18:19:00  220210              15.0
20       10011668  22181970 2131-06-14 17:17:00  220210              19.0
44       10015860  25085565 2186-09-15 18:00:00  220210              20.0
61       10017531  22580355 2159-09-22 19:35:00  220210              26.0
87       10049833  20762302 2168-05-24 16:44:00  220210              16.0
...           ...       ...                 ...     ...               ...
19991    19929625  29789943 2153-06-19 21:59:00  220210              18.0
20039    19934880  20689670 2166-11-23 11:39:00  220210              15.0
20062    19962418  25331514 2132-10-24 21:40:00  220210              16.0
20087    19970491  22119205 2131-02-11 01:43:00  220210              15.0
20108    19970491  25338284 2129-05-17 17:55:00  220210              23.0

[825 rows x 5 columns]




       subject_id   hadm_id           charttime  itemid  heart_rate
0        10004606  29242151 2159-02-20 18:19:00  220045       107.0
20       10011668  22181970 2131-06-14 17:17:00  220045       107.0
44       10015860  25085565 2186-09-15 18:00:00  220045       114.0
61       10017531  22580355 2159-09-22 19:35:00  220045       116.0
87       10049833  20762302 2168-05-24 16:44:00  220045       124.0
...           ...       ...                 ...     ...         ...
20255    19929625  29789943 2153-06-19 21:58:00  220045       112.0
20303    19934880  20689670 2166-11-23 11:39:00  220045        84.0
20326    19962418  25331514 2132-10-24 21:40:00  220045       109.0
20351    19970491  22119205 2131-02-11 01:42:00  220045       116.0
20372    19970491  25338284 2129-05-17 17:55:00  220045        69.0

[826 rows x 5 columns]




     subject_id   hadm_id           charttime  itemid  oxygen_saturation
0      10017531  22580355 2159-09-23 10:45:00   50817               95.0
1      10057482  25416257 2145-03-23 21:08:00   50817               66.0
11     10153439  22115349 2121-03-11 22:58:00   50817               94.0
14     10163709  29550274 2148-03-23 03:39:00   50817               96.0
16     10199560  24622638 2175-01-11 07:29:00   50817               44.0
..          ...       ...                 ...     ...                ...
635    19811045  27885031 2162-07-08 04:35:00   50817               79.0
637    19859524  27439975 2151-01-30 21:35:00   50817               50.0
638    19882958  29628147 2182-08-29 18:52:00   50817               80.0
639    19904101  23626019 2131-04-22 23:52:00   50817               97.0
640    19962418  25331514 2132-10-25 01:30:00   50817               69.0

[260 rows x 5 columns]




     subject_id   hadm_id           charttime  itemid  temperature
0      10064759  25061074 2173-03-22 19:50:00   50825         37.5
1      10153439  22115349 2121-03-11 22:58:00   50825         40.4
3      10157331  26293454 2175-05-31 10:19:00   50825         37.0
7      10163709  29550274 2148-03-22 23:16:00   50825         36.5
9      10199560  24622638 2175-01-11 02:09:00   50825         33.6
..          ...       ...                 ...     ...          ...
559    19734308  27089714 2166-02-28 03:45:00   50825         37.1
560    19776126  20550940 2184-03-19 22:42:00   50825         36.5
562    19811045  27885031 2162-07-07 22:50:00   50825         37.8
567    19873806  23963601 2156-10-07 06:08:00   50825         37.2
568    19962418  25331514 2132-10-24 21:46:00   50825         36.1

[253 rows x 5 columns]
   subject_id   hadm_id    charttime_lipase  lipase_level           admittime  \
0    10002976  27179825                 NaT           NaN 2145-02-28 19:44:00   
1    10003

In [130]:
# Count the number of unique patients with vital signs data
num_patients_with_vitals = combined_df_with_vitals['subject_id'].nunique()
print(f"Number of unique patients with vital signs data: {num_patients_with_vitals}")

# Count the number of rows in the combined dataset with vital signs data
num_rows_with_vitals = combined_df_with_vitals.shape[0]
print(f"Number of rows in the combined dataset with vital signs data: {num_rows_with_vitals}")

Number of unique patients with vital signs data: 3412
Number of rows in the combined dataset with vital signs data: 3722


In [129]:
nan_counts = combined_df_with_vitals.isna().sum()
# Columns without NaN values
print("Columns without NaN values:")
print(combined_df_with_vitals.columns[combined_df_with_vitals.isna().sum() == 0])
# Columns with NaN values and their counts
nan_columns = nan_counts[nan_counts > 0]
print("Columns with NaN values and their counts:")
print(nan_columns)

Columns without NaN values:
Index(['subject_id', 'is_confirmed_ap'], dtype='object')
Columns with NaN values and their counts:
hadm_id                           58
charttime_lipase                1948
lipase_level                    1948
admittime                         58
dischtime                         58
gender                            58
approximate_age_at_admission      58
race                              58
actual_age                        58
in_hospital_death                 58
length_of_stay                    58
charttime_amylase               3486
amylase_level                   3486
charttime                       1851
crp_level                       1851
icd_code                        2428
average_weight                  2701
charlson_comorbidity_index        58
respiratory_rate                2897
heart_rate                      2896
oxygen_saturation               3462
temperature                     3469
dtype: int64


In [131]:
print("Missing `hadm_id` in patient_info_df:", patient_info_df['hadm_id'].isna().sum())
print("Missing `hadm_id` in lipase_values_df:", lipase_values_df['hadm_id'].isna().sum())
print("Missing `hadm_id` in amylase_values_df:", amylase_values_df['hadm_id'].isna().sum())
print("Missing `hadm_id` in crp_values_df:", crp_values_df['hadm_id'].isna().sum())
# Continue for all intermediate dataframes

Missing `hadm_id` in patient_info_df: 0
Missing `hadm_id` in lipase_values_df: 0
Missing `hadm_id` in amylase_values_df: 19449
Missing `hadm_id` in crp_values_df: 0


In [133]:
# Delete those rows with missing hadm_id
combined_df_with_vitals = combined_df_with_vitals.dropna(subset=['hadm_id'])

nan_counts = combined_df_with_vitals.isna().sum()
# Columns without NaN values
print("Columns without NaN values:")
print(combined_df_with_vitals.columns[combined_df_with_vitals.isna().sum() == 0])
# Columns with NaN values and their counts
nan_columns = nan_counts[nan_counts > 0]
print("Columns with NaN values and their counts:")
print(nan_columns)

Columns without NaN values:
Index(['subject_id', 'hadm_id', 'admittime', 'dischtime', 'gender',
       'approximate_age_at_admission', 'race', 'actual_age',
       'in_hospital_death', 'length_of_stay', 'is_confirmed_ap',
       'charlson_comorbidity_index'],
      dtype='object')
Columns with NaN values and their counts:
charttime_lipase     1890
lipase_level         1890
charttime_amylase    3486
amylase_level        3486
charttime            1793
crp_level            1793
icd_code             2370
average_weight       2643
respiratory_rate     2839
heart_rate           2838
oxygen_saturation    3404
temperature          3411
dtype: int64


In [135]:
# Generate a CSV file with the final combined dataset
combined_df_with_vitals.to_csv('AP_ICD_CCI_dataset.csv', index=False)