In [53]:
import os
import pandas as pd

mimic_iv_cxr_parent = "/data/wang/junh/datasets/physionet.org/files/mimic-cxr-jpg/2.0.0"

mm_dir = "/data/wang/junh/datasets/multimodal"
preprocessing_dir = os.path.join(mm_dir, "preprocessing")
f_path = os.path.join(preprocessing_dir, "cxr_embeddings.pkl")

In [54]:
cxr_df = pd.read_pickle(f_path)
print(cxr_df.columns)

Index(['dicom_id', 'subject_id', 'study_id',
       'PerformedProcedureStepDescription', 'ViewPosition',
       'ProcedureCodeSequence_CodeMeaning', 'ViewCodeSequence_CodeMeaning',
       'PatientOrientationCodeSequence_CodeMeaning', 'densefeatures',
       'predictions'],
      dtype='object')


In [55]:
mimic_iv_cxr_parent = "/data/wang/junh/datasets/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-jpg-2.1.0.physionet.org"
f_path = os.path.join(mimic_iv_cxr_parent, "mimic-cxr-2.0.0-metadata.csv.gz")
meta_data_df = pd.read_csv(f_path, low_memory=False)

In [56]:
print(meta_data_df.columns)

Index(['dicom_id', 'subject_id', 'study_id',
       'PerformedProcedureStepDescription', 'ViewPosition', 'Rows', 'Columns',
       'StudyDate', 'StudyTime', 'ProcedureCodeSequence_CodeMeaning',
       'ViewCodeSequence_CodeMeaning',
       'PatientOrientationCodeSequence_CodeMeaning'],
      dtype='object')


In [57]:
print(meta_data_df.head())

                                       dicom_id  subject_id  study_id  \
0  02aa804e-bde0afdd-112c0b34-7bc16630-4e384014    10000032  50414267   
1  174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962    10000032  50414267   
2  2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab    10000032  53189527   
3  e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c    10000032  53189527   
4  68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714    10000032  53911762   

  PerformedProcedureStepDescription ViewPosition  Rows  Columns  StudyDate  \
0                CHEST (PA AND LAT)           PA  3056     2544   21800506   
1                CHEST (PA AND LAT)      LATERAL  3056     2544   21800506   
2                CHEST (PA AND LAT)           PA  3056     2544   21800626   
3                CHEST (PA AND LAT)      LATERAL  3056     2544   21800626   
4               CHEST (PORTABLE AP)           AP  2705     2539   21800723   

    StudyTime ProcedureCodeSequence_CodeMeaning ViewCodeSequence_CodeMeaning  \
0  213014.53

In [64]:
cxr_embeddings_df = cxr_df.merge(meta_data_df[['dicom_id', 'StudyDate', 'StudyTime']], on='dicom_id', how='left')

In [65]:
# Convert StudyDate to a string format (YYYY-MM-DD)
cxr_embeddings_df['StudyDate'] = pd.to_datetime(cxr_embeddings_df['StudyDate'].astype(str), format='%Y%m%d')

# Function to format StudyTime into HH:MM:SS and handle the 60-second case
def format_study_time(study_time):
    # Convert StudyTime to a six-digit time (HHMMSS) without fractional seconds
    study_time_str = '{:06.0f}'.format(study_time)  # Ensure it has 6 digits and no decimals
    hours = int(study_time_str[:2])
    minutes = int(study_time_str[2:4])
    seconds = int(study_time_str[4:6])
    
    # Handle cases where seconds are 60 by incrementing the minutes and resetting seconds to 00
    if seconds >= 60:
        seconds = 0
        minutes += 1
        
        # Handle overflow for minutes (i.e., if minutes become 60)
        if minutes >= 60:
            minutes = 0
            hours += 1
            
            # Handle overflow for hours (i.e., if hours become 24)
            if hours >= 24:
                hours = 0

    return f"{hours:02}:{minutes:02}:{seconds:02}"

# Apply the formatting function to the StudyTime column
cxr_embeddings_df['StudyTime'] = cxr_embeddings_df['StudyTime'].apply(format_study_time)

# Combine StudyDate and StudyTime into a single datetime column
cxr_embeddings_df['cxrtime'] = pd.to_datetime(cxr_embeddings_df['StudyDate'].astype(str) + ' ' + cxr_embeddings_df['StudyTime'])

# Display the resulting dataframe
print(cxr_embeddings_df[['StudyDate', 'StudyTime', 'cxrtime']])

        StudyDate StudyTime             cxrtime
0      2180-05-06  21:30:15 2180-05-06 21:30:15
1      2180-05-06  21:30:15 2180-05-06 21:30:15
2      2180-06-26  16:55:00 2180-06-26 16:55:00
3      2180-06-26  16:55:00 2180-06-26 16:55:00
4      2180-07-23  08:05:57 2180-07-23 08:05:57
...           ...       ...                 ...
377105 2152-07-08  22:45:50 2152-07-08 22:45:50
377106 2152-07-08  22:45:50 2152-07-08 22:45:50
377107 2145-11-04  05:14:48 2145-11-04 05:14:48
377108 2145-11-02  20:28:09 2145-11-02 20:28:09
377109 2145-11-03  05:05:08 2145-11-03 05:05:08

[377110 rows x 3 columns]


In [66]:
cxr_embeddings_df['cxrtime'] = pd.to_datetime(cxr_embeddings_df['cxrtime'])

mimic_iv_path = "/data/wang/junh/datasets/physionet.org/files/mimiciv/2.2"
icustays_df = pd.read_csv(os.path.join(mimic_iv_path, "icu", "icustays.csv.gz"), low_memory=False)
icustays_df['intime'] = pd.to_datetime(icustays_df['intime'])
icustays_df['outtime'] = pd.to_datetime(icustays_df['outtime'])

admissions_df = pd.read_csv(os.path.join(mimic_iv_path, "hosp", "admissions.csv.gz"), low_memory=False)
admissions_df['admittime'] = pd.to_datetime(admissions_df['admittime'])
admissions_df['dischtime'] = pd.to_datetime(admissions_df['dischtime'])

In [67]:
print(cxr_embeddings_df.head())

                                       dicom_id  subject_id  study_id  \
0  02aa804e-bde0afdd-112c0b34-7bc16630-4e384014    10000032  50414267   
1  174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962    10000032  50414267   
2  2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab    10000032  53189527   
3  e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c    10000032  53189527   
4  68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714    10000032  53911762   

  PerformedProcedureStepDescription ViewPosition  \
0                CHEST (PA AND LAT)           PA   
1                CHEST (PA AND LAT)      LATERAL   
2                CHEST (PA AND LAT)           PA   
3                CHEST (PA AND LAT)      LATERAL   
4               CHEST (PORTABLE AP)           AP   

  ProcedureCodeSequence_CodeMeaning ViewCodeSequence_CodeMeaning  \
0                CHEST (PA AND LAT)             postero-anterior   
1                CHEST (PA AND LAT)                      lateral   
2                CHEST (PA AND LAT)         

In [68]:
cxr_embeddings_df = cxr_embeddings_df.drop(columns=['StudyDate', 'StudyTime'])

In [69]:
from tqdm import tqdm

def calc_time_delta_hrs(icu_intime, charttime):
    return (charttime - icu_intime).total_seconds() / 3600

cxr_embeddings_df['hadm_id'] = None
cxr_embeddings_df['stay_id'] = None
cxr_embeddings_df['icu_time_delta'] = None
cxr_embeddings_df['hosp_time_delta'] = None

for index, row in tqdm(cxr_embeddings_df.iterrows(), total=cxr_embeddings_df.shape[0]):
    curr_pts_icustays = icustays_df[icustays_df['subject_id'] == row['subject_id']]
    
    for icu_index, icu_row in curr_pts_icustays.iterrows():
        if icu_row['intime'] <= row['cxrtime'] <= icu_row['outtime']:
            cxr_embeddings_df.loc[index, 'stay_id'] = icu_row['stay_id']
            cxr_embeddings_df.loc[index, 'icu_time_delta'] = calc_time_delta_hrs(icu_row['intime'], row['cxrtime'])
    
    curr_pts_admissions = admissions_df[admissions_df['subject_id'] == row['subject_id']]

    for hosp_index, hosp_row in curr_pts_admissions.iterrows():
        if hosp_row['admittime'] <= row['cxrtime'] <= hosp_row['dischtime']:
            cxr_embeddings_df.loc[index, 'hadm_id'] = hosp_row['hadm_id']
            cxr_embeddings_df.loc[index, 'hosp_time_delta'] = calc_time_delta_hrs(hosp_row['admittime'], row['cxrtime'])

100%|██████████| 377110/377110 [09:09<00:00, 686.12it/s]


In [70]:
cxr_embeddings_df.to_pickle(os.path.join(preprocessing_dir, "cxr_embeddings_stay.pkl"))

In [71]:
cxr_embeddings_df['stay_id'].isna().mean()

0.8424465010209222

In [72]:
cxr_embeddings_df = pd.read_pickle(os.path.join(preprocessing_dir, "cxr_embeddings_stay.pkl"))