In [1]:
import zipfile
import os
import numpy as np
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import boto3
import pandas as pd
from io import BytesIO
from io import StringIO
import json
import requests 

In [2]:
# Initialize S3 client
s3 = boto3.client('s3')

# Function to load data 
bucket_name = 'ml-stack-759578648427-us-west-2'

def from_s3(bucket_name, file_name):
    response = s3.get_object(Bucket=bucket_name, Key=file_name)
    ext = os.path.splitext(file_name)[1]
    if ext == '.gz':
        data = pd.read_csv(BytesIO(response['Body'].read()), compression='gzip')
    elif ext == '.csv':
        data = pd.read_csv(BytesIO(response['Body'].read()))
    else:
        data = pd.read_excel(BytesIO(response['Body'].read()), sheet_name='DX_to_CCSR_Mapping', header=1)
    return data

# Import Modeling Set

In [3]:
file_name = 'MIMIC/Cleaned/df_omr_pt_ad.csv'
df = from_s3(bucket_name, file_name) 
df['charttime'] = pd.to_datetime(df['charttime'])
subjects = df['subject_id'].unique()
df.head()

Unnamed: 0,subject_id,charttime,Height (Inches),Weight (Lbs),bp_systolic,bp_diastolic,gender,anchor_age,insurance,language,marital_status,race
0,10000826,2146-12-08 22:22:00,63.786,166.608,116.368,73.125,F,32,Private,English,SINGLE,WHITE
1,10000898,2187-09-26 08:00:00,62.251,162.011,126.0,76.0,F,79,,,,
2,10000980,2185-10-24 00:00:00,63.242,173.0,150.0,90.0,F,72,,,,
3,10001122,2142-04-08 10:48:00,62.926,170.767,131.007,72.597,F,70,,,,
4,10001401,2133-12-15 11:04:00,65.45,167.941,146.537,60.63,F,91,Medicare,English,MARRIED,WHITE


In [4]:
df[df.subject_id  == 10000980]

Unnamed: 0,subject_id,charttime,Height (Inches),Weight (Lbs),bp_systolic,bp_diastolic,gender,anchor_age,insurance,language,marital_status,race
2,10000980,2185-10-24,63.242,173.0,150.0,90.0,F,72,,,,


# + Diagnosis

Use ICD 9 to 10 crosswalk from: https://github.com/JeanNikiema/mimicinicd11/tree/main

Use broad category map from: https://hcup-us.ahrq.gov/toolssoftware/ccsr/dxccsr.jsp

In [11]:
file_name = 'MIMIC/Cleaned/icd_code_map.csv'
icd_code_map = from_s3(bucket_name, file_name) 

file_name = 'MIMIC/admissions.csv.gz'
ad = from_s3(bucket_name, file_name) 

file_name = 'MIMIC/diagnoses_icd.csv.gz'
diag = from_s3(bucket_name, file_name) 

# filter to possible patients only
diag = diag[diag['subject_id'].isin(subjects)]

# join addmission to get discharge time - prevent data leakage
diag = diag.merge(ad[['subject_id', 'hadm_id', 'dischtime']], on=['subject_id', 'hadm_id'], how='left')
del ad

# merge icd_code_mapping
diag_mapped = diag.merge(icd_code_map, on='icd_code', how='left')

# % matched
matched = diag_mapped[~diag_mapped.ccsr_category.isna()]
num_matched = diag_mapped[~diag_mapped.ccsr_category.isna()].icd_code.nunique() / diag_mapped.icd_code.nunique()
print(f"Matched {num_matched:.2%} codes")

# keep broad codes only
diag_mapped = diag_mapped[['subject_id', 'dischtime', 'ccsr_category']].drop_duplicates()
# fill diag_mapped ccsr_category with 99
diag_mapped.ccsr_category.fillna(99, inplace=True)
diag_mapped.head()

Matched 98.49% codes


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  diag_mapped.ccsr_category.fillna(99, inplace=True)


Unnamed: 0,subject_id,dischtime,ccsr_category
0,10000826,2146-12-12 16:30:00,DIG019
1,10000826,2146-12-12 16:30:00,MBD017
2,10000826,2146-12-12 16:30:00,RSP002
3,10000826,2146-12-12 16:30:00,SYM006
5,10000826,2146-12-12 16:30:00,GEN004


In [6]:
diag_pivot = diag_mapped.groupby(['subject_id', 'dischtime', 'ccsr_category']).size().reset_index()
diag_pivot = diag_pivot.pivot(index=['subject_id', 'dischtime'], columns='ccsr_category', values=0).reset_index()
diag_pivot.fillna(0, inplace=True)
diag_pivot.head()

ccsr_category,subject_id,dischtime,99,BLD001,BLD002,BLD003,BLD004,BLD005,BLD006,BLD007,...,SYM009,SYM010,SYM011,SYM012,SYM013,SYM014,SYM015,SYM016,SYM017,SYM018
0,10000826,2146-12-12 16:30:00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10000826,2146-12-24 19:55:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10000826,2147-01-02 17:45:00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10000980,2188-01-05 17:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10000980,2189-07-03 03:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# merge to sub/charttime data set
sub_time = df[['subject_id', 'charttime']].drop_duplicates()
df_diag = sub_time.merge(diag_pivot, on=['subject_id'], how='left')

# filter disch less than chartime
df_diag['dischtime'] = pd.to_datetime(df_diag['dischtime'])
df_diag['charttime'] = pd.to_datetime(df_diag['charttime'])
df_diag = df_diag[(df_diag.dischtime <= df_diag.charttime) | (df_diag.dischtime.isna() == True)]

# drop disch
df_diag.drop(columns=['dischtime'], inplace=True)

# count all diagnoses in medical hisotry up to charttime
df_diag = df_diag.groupby(['subject_id', 'charttime']).sum().reset_index()
df_diag.fillna(0, inplace=True)
df_diag.head()

Unnamed: 0,subject_id,charttime,99,BLD001,BLD002,BLD003,BLD004,BLD005,BLD006,BLD007,...,SYM009,SYM010,SYM011,SYM012,SYM013,SYM014,SYM015,SYM016,SYM017,SYM018
0,10000898,2187-09-26 08:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10001122,2142-04-08 10:48:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10001401,2133-12-15 11:04:00,0.0,1.0,0.0,1.0,2.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
3,10001401,2134-10-09 13:45:00,0.0,1.0,0.0,1.0,2.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
4,10001401,2135-09-20 13:04:00,0.0,1.0,0.0,1.0,2.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0


In [8]:
# join to final data set 
df_omr_pt_ad_diag = df.merge(df_diag, on=['subject_id', 'charttime'], how='left')
df_omr_pt_ad_diag.head()

Unnamed: 0,subject_id,charttime,Height (Inches),Weight (Lbs),bp_systolic,bp_diastolic,gender,anchor_age,insurance,language,...,SYM009,SYM010,SYM011,SYM012,SYM013,SYM014,SYM015,SYM016,SYM017,SYM018
0,10000826,2146-12-08 22:22:00,63.786,166.608,116.368,73.125,F,32,Private,English,...,,,,,,,,,,
1,10000898,2187-09-26 08:00:00,62.251,162.011,126.0,76.0,F,79,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10000980,2185-10-24 00:00:00,63.242,173.0,150.0,90.0,F,72,,,...,,,,,,,,,,
3,10001122,2142-04-08 10:48:00,62.926,170.767,131.007,72.597,F,70,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10001401,2133-12-15 11:04:00,65.45,167.941,146.537,60.63,F,91,Medicare,English,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0


In [9]:
df_omr_pt_ad_diag.describe()

Unnamed: 0,subject_id,charttime,Height (Inches),Weight (Lbs),bp_systolic,bp_diastolic,anchor_age,99,BLD001,BLD002,...,SYM009,SYM010,SYM011,SYM012,SYM013,SYM014,SYM015,SYM016,SYM017,SYM018
count,44918.0,44918,44916.0,44916.0,44915.0,44915.0,44918.0,30209.0,30209.0,30209.0,...,30209.0,30209.0,30209.0,30209.0,30209.0,30209.0,30209.0,30209.0,30209.0,30209.0
mean,15008362.972,2155-11-13 19:40:29.742196736,64.81,175.4,128.159,72.68,64.246,1.192,0.339,0.03,...,0.0,0.268,0.313,0.249,0.365,0.074,0.059,0.333,0.486,0.009
min,10000826.0,2109-03-19 11:40:00,0.0,0.0,70.0,8.0,17.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12495580.5,2135-12-22 12:53:30,62.5,147.863,120.722,68.969,53.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,15002051.5,2155-11-02 22:04:30,64.019,171.72,129.147,72.429,67.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,17544474.25,2175-11-29 19:46:00,67.75,193.633,134.25,76.59,79.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,19999987.0,2214-12-24 05:14:00,96.0,1213.18,236.0,135.0,106.0,51.0,40.0,27.0,...,1.0,43.0,24.0,21.0,30.0,8.0,10.0,26.0,32.0,7.0
std,2893254.342,,3.705,44.533,12.304,7.45,19.028,2.791,1.414,0.437,...,0.014,0.809,0.978,0.761,1.069,0.333,0.308,0.959,1.249,0.131


In [21]:
df_omr_pt_ad_diag[df_omr_pt_ad_diag.subject_id  == 10002114]

Unnamed: 0,subject_id,charttime,Height (Inches),Weight (Lbs),bp_systolic,bp_diastolic,gender,anchor_age,insurance,language,...,SYM009,SYM010,SYM011,SYM012,SYM013,SYM014,SYM015,SYM016,SYM017,SYM018
12,10002114,2162-02-17 21:53:00,69.097,204.136,128.14,76.176,M,56,,,...,,,,,,,,,,


In [22]:
diag[diag.subject_id  == 10002114].dischtime.min()

'2162-03-04 15:16:00'

In [18]:
s3_key = 'MIMIC/Cleaned/df_omr_pt_ad_diag.csv'

# Save the DataFrame to a CSV in memory (using StringIO)
csv_buffer = StringIO()
df_omr_pt_ad_diag.to_csv(csv_buffer, index=False)

# Upload the CSV data to S3
s3.put_object(Bucket=bucket_name, Key=s3_key, Body=csv_buffer.getvalue())

print(f"DataFrame successfully exported to s3://{bucket_name}/{s3_key}")

DataFrame successfully exported to s3://ml-stack-759578648427-us-west-2/MIMIC/Cleaned/df_omr_pt_ad_diag.csv
