In [2]:
import zipfile
import os
import numpy as np
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import boto3
import pandas as pd
from io import BytesIO
from io import StringIO
import json
import requests 

In [3]:
# Initialize S3 client
s3 = boto3.client('s3')

# Function to load data 
bucket_name = 'ml-stack-759578648427-us-west-2'

def from_s3(bucket_name, file_name):
    response = s3.get_object(Bucket=bucket_name, Key=file_name)
    ext = os.path.splitext(file_name)[1]
    if ext == '.gz':
        data = pd.read_csv(BytesIO(response['Body'].read()), compression='gzip')
    elif ext == '.csv':
        data = pd.read_csv(BytesIO(response['Body'].read()))
    else:
        data = pd.read_excel(BytesIO(response['Body'].read()), sheet_name='DX_to_CCSR_Mapping', header=1)
    return data

# Import Modeling Set

In [39]:
file_name = 'MIMIC/Cleaned/df_omr_pt_ad_diag.csv'
df = from_s3(bucket_name, file_name) 
df['charttime'] = pd.to_datetime(df['charttime'])
subjects = df['subject_id'].unique()
df.head()

Unnamed: 0,subject_id,charttime,Height (Inches),Weight (Lbs),bp_systolic,bp_diastolic,gender,anchor_age,insurance,language,...,SYM009,SYM010,SYM011,SYM012,SYM013,SYM014,SYM015,SYM016,SYM017,SYM018
0,10000826,2146-12-08 22:22:00,63.786,166.608,116.368,73.125,F,32,Private,English,...,,,,,,,,,,
1,10000898,2187-09-26 08:00:00,62.251,162.011,126.0,76.0,F,79,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10000980,2185-10-24 00:00:00,63.242,173.0,150.0,90.0,F,72,,,...,,,,,,,,,,
3,10001122,2142-04-08 10:48:00,62.926,170.767,131.007,72.597,F,70,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10001401,2133-12-15 11:04:00,65.45,167.941,146.537,60.63,F,91,Medicare,English,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0


# + Procedures


In [40]:
file_name = 'MIMIC/procedures_icd.csv.gz'
proc = from_s3(bucket_name, file_name) 
# filter subjects 
proc = proc[proc['subject_id'].isin(subjects)]
proc = proc[['subject_id', 'chartdate']]
proc['chartdate'] = pd.to_datetime(proc['chartdate'])
proc.head()

Unnamed: 0,subject_id,chartdate
13,10000826,2146-12-05
14,10000826,2146-12-11
15,10000826,2146-12-18
16,10000826,2146-12-24
17,10000826,2146-12-20


In [41]:
# get total number of procedures per chardate
proc_num = proc.groupby(['subject_id', 'chartdate']).size().reset_index()
# merge to sub/charttime data set
sub_time = df[['subject_id', 'charttime']].drop_duplicates()
df_proc = sub_time.merge(proc_num, on=['subject_id'], how='left')

# filter chartdate less than chartime
df_proc = df_proc[(df_proc.chartdate <= df_proc.charttime) | (df_proc.chartdate.isna() == True)]

# count num of procedures & get max chardate 
df_proc = df_proc.groupby(['subject_id', 'charttime']).agg({0:'sum', 'chartdate':'max'}).reset_index()
df_proc.rename({0:'procedure_ct'}, axis=1, inplace=True)
#df_proc.chartdate.fillna(df_proc.charttime, inplace = True)
df_proc['days_since_last_proc'] = (df_proc.charttime - df_proc.chartdate).dt.days
# drop disch
df_proc.drop(columns=['chartdate'], inplace=True)
df_proc.head()

Unnamed: 0,subject_id,charttime,procedure_ct,days_since_last_proc
0,10000826,2146-12-08 22:22:00,1.0,3.0
1,10000898,2187-09-26 08:00:00,0.0,
2,10001122,2142-04-08 10:48:00,0.0,
3,10001401,2133-12-15 11:04:00,23.0,155.0
4,10001401,2134-10-09 13:45:00,23.0,453.0


In [44]:
# join to final data set 
df_omr_pt_ad_diag_proc = df.merge(df_proc, on=['subject_id', 'charttime'], how='left')
df_omr_pt_ad_diag_proc.head()

Unnamed: 0,subject_id,charttime,Height (Inches),Weight (Lbs),bp_systolic,bp_diastolic,gender,anchor_age,insurance,language,...,SYM011,SYM012,SYM013,SYM014,SYM015,SYM016,SYM017,SYM018,procedure_ct,days_since_last_proc
0,10000826,2146-12-08 22:22:00,63.786,166.608,116.368,73.125,F,32,Private,English,...,,,,,,,,,1.0,3.0
1,10000898,2187-09-26 08:00:00,62.251,162.011,126.0,76.0,F,79,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,10000980,2185-10-24 00:00:00,63.242,173.0,150.0,90.0,F,72,,,...,,,,,,,,,,
3,10001122,2142-04-08 10:48:00,62.926,170.767,131.007,72.597,F,70,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,10001401,2133-12-15 11:04:00,65.45,167.941,146.537,60.63,F,91,Medicare,English,...,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,23.0,155.0


In [46]:
df.shape

(44918, 537)

In [45]:
df_omr_pt_ad_diag_proc.shape

(44918, 539)

In [47]:
s3_key = 'MIMIC/Cleaned/df_omr_pt_ad_diag_proc.csv'

# Save the DataFrame to a CSV in memory (using StringIO)
csv_buffer = StringIO()
df_omr_pt_ad_diag_proc.to_csv(csv_buffer, index=False)

# Upload the CSV data to S3
s3.put_object(Bucket=bucket_name, Key=s3_key, Body=csv_buffer.getvalue())

print(f"DataFrame successfully exported to s3://{bucket_name}/{s3_key}")

DataFrame successfully exported to s3://ml-stack-759578648427-us-west-2/MIMIC/Cleaned/df_omr_pt_ad_diag_proc.csv
