In [2]:
import zipfile
import os
import numpy as np
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import boto3
import pandas as pd
from io import BytesIO
from io import StringIO
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

In [3]:
# Bucket that the from_s3() calls in the cells below read from.
bucket_name = 'ml-stack-759578648427-us-west-2'

# Shared S3 client; from_s3() looks this name up as a module-level global.
s3 = boto3.client('s3')

def from_s3(bucket_name, file_name, sheet_name='DX_to_CCSR_Mapping', header=1):
    """Download an object from S3 and parse it into a pandas DataFrame.

    The parser is chosen from the final extension of ``file_name``,
    compared case-insensitively:

    * ``.gz``  -> gzip-compressed CSV
    * ``.csv`` -> plain CSV
    * anything else -> Excel workbook, read with ``sheet_name`` / ``header``

    Parameters
    ----------
    bucket_name : str
        Passed as ``Bucket`` to ``s3.get_object``.
    file_name : str
        Passed as ``Key`` to ``s3.get_object``; its extension picks the parser.
    sheet_name : str, default 'DX_to_CCSR_Mapping'
        Sheet to read. Only used on the Excel branch.
    header : int, default 1
        Header row index. Only used on the Excel branch.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Notes
    -----
    Relies on the module-level ``s3`` client instead of creating its own.
    """
    response = s3.get_object(Bucket=bucket_name, Key=file_name)
    # Read the body once; BytesIO gives pandas a file-like object to parse.
    payload = BytesIO(response['Body'].read())

    # Lower-case the extension so keys such as 'DATA.CSV' or 'x.CSV.GZ' are
    # not routed to the Excel reader by accident.
    ext = os.path.splitext(file_name)[1].lower()
    if ext == '.gz':
        # The buffer carries no filename, so the compression is stated explicitly.
        return pd.read_csv(payload, compression='gzip')
    if ext == '.csv':
        return pd.read_csv(payload)
    # Fallback kept from the original behaviour: everything else is read as Excel.
    return pd.read_excel(payload, sheet_name=sheet_name, header=header)

# Import Labeled Data

In [4]:
# Labeled microbiology results, stored under a gzip-compressed CSV key.
file_name = 'final_microbiology_df.csv.gz'
df = from_s3(bucket_name=bucket_name, file_name=file_name).assign(
    # Parse the timestamp column so it can be compared and joined as datetime.
    charttime=lambda labels: pd.to_datetime(labels['charttime'])
)
# Distinct subject ids, taken before the organism/antibiotic filter is applied.
subjects = df['subject_id'].unique()
df.head()

Unnamed: 0,subject_id,hadm_id,org_name,ab_name,charttime,interpretation,susceptible_flag
0,10000826,20032235.0,ESCHERICHIA COLI,AMPICILLIN,2146-12-08 22:22:00,S,1.0
1,10000826,20032235.0,ESCHERICHIA COLI,AMPICILLIN/SULBACTAM,2146-12-08 22:22:00,S,1.0
2,10000826,20032235.0,ESCHERICHIA COLI,CEFAZOLIN,2146-12-08 22:22:00,S,1.0
3,10000826,20032235.0,ESCHERICHIA COLI,CEFEPIME,2146-12-08 22:22:00,S,1.0
4,10000826,20032235.0,ESCHERICHIA COLI,CEFTAZIDIME,2146-12-08 22:22:00,S,1.0


In [5]:
# Final (organism, antibiotic) pairs kept for modeling.
# The order of the list is unchanged in case later cells iterate over it.
pairs = [
    ('ESCHERICHIA COLI', 'AMPICILLIN'),
    ('ESCHERICHIA COLI', 'AMPICILLIN/SULBACTAM'),
    ('ESCHERICHIA COLI', 'CEFAZOLIN'),
    ('ESCHERICHIA COLI', 'CEFEPIME'),
    ('ESCHERICHIA COLI', 'CEFTAZIDIME'),
    ('ESCHERICHIA COLI', 'CEFTRIAXONE'),
    ('ESCHERICHIA COLI', 'CIPROFLOXACIN'),
    ('ESCHERICHIA COLI', 'GENTAMICIN'),
    ('ESCHERICHIA COLI', 'MEROPENEM'),
    ('ESCHERICHIA COLI', 'NITROFURANTOIN'),
    ('ESCHERICHIA COLI', 'TOBRAMYCIN'),
    ('ESCHERICHIA COLI', 'TRIMETHOPRIM/SULFA'),
    ('PSEUDOMONAS AERUGINOSA', 'CEFEPIME'),
    ('PSEUDOMONAS AERUGINOSA', 'CEFTAZIDIME'),
    ('PSEUDOMONAS AERUGINOSA', 'CIPROFLOXACIN'),
    ('PSEUDOMONAS AERUGINOSA', 'GENTAMICIN'),
    ('PSEUDOMONAS AERUGINOSA', 'MEROPENEM'),
    ('PSEUDOMONAS AERUGINOSA', 'PIPERACILLIN/TAZO'),
    ('PSEUDOMONAS AERUGINOSA', 'TOBRAMYCIN'),
    ('ESCHERICHIA COLI', 'PIPERACILLIN/TAZO'),
    ('KLEBSIELLA PNEUMONIAE', 'AMPICILLIN/SULBACTAM'),
    ('KLEBSIELLA PNEUMONIAE', 'CEFAZOLIN'),
    ('KLEBSIELLA PNEUMONIAE', 'CEFEPIME'),
    ('KLEBSIELLA PNEUMONIAE', 'CEFTAZIDIME'),
    ('KLEBSIELLA PNEUMONIAE', 'CEFTRIAXONE'),
    ('KLEBSIELLA PNEUMONIAE', 'CIPROFLOXACIN'),
    ('KLEBSIELLA PNEUMONIAE', 'GENTAMICIN'),
    ('KLEBSIELLA PNEUMONIAE', 'MEROPENEM'),
    ('KLEBSIELLA PNEUMONIAE', 'NITROFURANTOIN'),
    ('KLEBSIELLA PNEUMONIAE', 'PIPERACILLIN/TAZO'),
    ('KLEBSIELLA PNEUMONIAE', 'TOBRAMYCIN'),
    ('KLEBSIELLA PNEUMONIAE', 'TRIMETHOPRIM/SULFA'),
]

# Keep only the rows whose (org_name, ab_name) combination is in `pairs`.
# A MultiIndex over the two columns lets pandas test membership in a single
# vectorized call, instead of building one Python tuple per row with
# apply(tuple, axis=1). The resulting boolean mask is the same.
pair_mask = pd.MultiIndex.from_frame(df[['org_name', 'ab_name']]).isin(pairs)
df = df[pair_mask]

# Features

In [8]:
# Raise the cap on how many items of a long sequence pandas will print.
pd.set_option('display.max_seq_items', 4000)

In [28]:
# Feature table, loaded from a plain-CSV key in the same bucket.
file_name = 'MIMIC/Cleaned/df_omr_pt_ad_diag_proc_pr.csv'
feat = (
    from_s3(bucket_name, file_name)
    # Replace the bare '99' header with a descriptive name
    # (presumably a diagnosis-category column, given the DIAG_ prefix; confirm).
    .rename(columns={'99': 'DIAG_99'})
    # Parse charttime as datetime, matching the conversion done on the labels.
    .assign(charttime=lambda frame: pd.to_datetime(frame['charttime']))
)
feat.head()

Unnamed: 0,subject_id,charttime,Height (Inches),Weight (Lbs),bp_systolic,bp_diastolic,gender,anchor_age,insurance,language,...,V03AC,V03AE,V03AF,V03AX,V04CC,V04CF,V04CG,V04CK,V06DC,V07AB
0,10000826,2146-12-08 22:22:00,63.786,166.608,116.368,73.125,F,32,Private,English,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10000898,2187-09-26 08:00:00,62.251,162.011,126.0,76.0,F,79,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10001122,2142-04-08 10:48:00,62.926,170.767,131.007,72.597,F,70,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10001401,2133-12-15 11:04:00,65.45,167.941,146.537,60.63,F,91,Medicare,English,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10001401,2134-10-09 13:45:00,65.5,168.283,146.155,60.552,F,92,Medicare,English,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Cast the demographic columns to object dtype so that the select_dtypes()
# call below picks them up as categorical.
for demographic_col in ('gender', 'insurance', 'language', 'marital_status', 'race'):
    feat[demographic_col] = feat[demographic_col].astype('object')

# One-hot encode every object-typed column of the feature table.
categorical_columns = feat.select_dtypes(include='object').columns
feat_encoded = pd.get_dummies(data=feat, columns=categorical_columns)
feat_encoded.head()

Unnamed: 0,subject_id,charttime,Height (Inches),Weight (Lbs),bp_systolic,bp_diastolic,anchor_age,DIAG_99,BLD001,BLD002,...,race_PATIENT DECLINED TO ANSWER,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN
0,10000826,2146-12-08 22:22:00,63.786,166.608,116.368,73.125,32,,,,...,False,False,False,False,False,True,False,False,False,False
1,10000898,2187-09-26 08:00:00,62.251,162.011,126.0,76.0,79,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
2,10001122,2142-04-08 10:48:00,62.926,170.767,131.007,72.597,70,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
3,10001401,2133-12-15 11:04:00,65.45,167.941,146.537,60.63,91,0.0,1.0,0.0,...,False,False,False,False,False,True,False,False,False,False
4,10001401,2134-10-09 13:45:00,65.5,168.283,146.155,60.552,92,0.0,1.0,0.0,...,False,False,False,False,False,True,False,False,False,False


In [31]:
# Missing-data check: preview the rows that have no NaN in any column.
# NOTE(review): the result is only displayed, never assigned, so feat_encoded
# still contains its NaN rows when it is merged further down. Confirm that
# this is intended.
feat_encoded.dropna(axis=0, how='any')

Unnamed: 0,subject_id,charttime,Height (Inches),Weight (Lbs),bp_systolic,bp_diastolic,anchor_age,DIAG_99,BLD001,BLD002,...,race_PATIENT DECLINED TO ANSWER,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN
3,10001401,2133-12-15 11:04:00,65.450,167.941,146.537,60.630,91,0.000,1.000,0.000,...,False,False,False,False,False,True,False,False,False,False
4,10001401,2134-10-09 13:45:00,65.500,168.283,146.155,60.552,92,0.000,1.000,0.000,...,False,False,False,False,False,True,False,False,False,False
5,10001401,2135-09-20 13:04:00,65.542,168.566,146.350,60.567,93,0.000,1.000,0.000,...,False,False,False,False,False,True,False,False,False,False
6,10001725,2116-10-25 13:07:00,62.201,158.384,119.786,74.410,52,1.000,0.000,0.000,...,False,False,False,False,False,True,False,False,False,False
7,10001725,2118-11-26 11:05:00,62.201,158.476,120.703,74.785,54,1.000,0.000,0.000,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35522,19997911,2194-07-20 10:45:00,56.125,120.387,127.844,59.969,85,0.000,0.000,0.000,...,False,False,False,False,False,True,False,False,False,False
35523,19998497,2144-01-13 11:29:00,61.557,147.863,135.221,69.256,87,5.000,0.000,0.000,...,False,False,False,False,False,True,False,False,False,False
35524,19998591,2185-07-11 02:27:00,63.947,173.560,125.759,76.545,53,1.000,0.000,0.000,...,False,False,False,False,False,True,False,False,False,False
35525,19999442,2148-11-27 22:13:00,75.750,220.000,122.000,82.000,43,1.000,0.000,0.000,...,False,False,False,False,False,True,False,False,False,False


# Final Modeling Set

In [32]:
# Attach the encoded features to every labeled row. The join is inner
# (pandas' default, spelled out here), so label rows without a feature row at
# the same subject_id and charttime are dropped.
merge_keys = ['subject_id', 'charttime']
df = df.merge(right=feat_encoded, how='inner', on=merge_keys)
df.head()

Unnamed: 0,subject_id,hadm_id,org_name,ab_name,charttime,interpretation,susceptible_flag,Height (Inches),Weight (Lbs),bp_systolic,...,race_PATIENT DECLINED TO ANSWER,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN
0,10000826,20032235.0,ESCHERICHIA COLI,AMPICILLIN,2146-12-08 22:22:00,S,1.0,63.786,166.608,116.368,...,False,False,False,False,False,True,False,False,False,False
1,10000826,20032235.0,ESCHERICHIA COLI,AMPICILLIN/SULBACTAM,2146-12-08 22:22:00,S,1.0,63.786,166.608,116.368,...,False,False,False,False,False,True,False,False,False,False
2,10000826,20032235.0,ESCHERICHIA COLI,CEFAZOLIN,2146-12-08 22:22:00,S,1.0,63.786,166.608,116.368,...,False,False,False,False,False,True,False,False,False,False
3,10000826,20032235.0,ESCHERICHIA COLI,CEFEPIME,2146-12-08 22:22:00,S,1.0,63.786,166.608,116.368,...,False,False,False,False,False,True,False,False,False,False
4,10000826,20032235.0,ESCHERICHIA COLI,CEFTAZIDIME,2146-12-08 22:22:00,S,1.0,63.786,166.608,116.368,...,False,False,False,False,False,True,False,False,False,False
