In [1]:
import zipfile
import os
import numpy as np
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import boto3
import pandas as pd
from io import BytesIO
from io import StringIO
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

In [2]:
# Bucket that holds the project data files
bucket_name = 'ml-stack-759578648427-us-west-2'

# Shared S3 client used by the loader defined below
s3 = boto3.client(service_name='s3')

def from_s3(bucket_name, file_name, sheet_name='DX_to_CCSR_Mapping', header=1):
    """Download an object from S3 and parse it into a DataFrame.

    The parser is chosen from the key's final extension (case-insensitive):
    ``.gz`` is read as a gzip-compressed CSV, ``.csv`` as a plain CSV, and
    anything else is handed to ``pd.read_excel``.

    Parameters
    ----------
    bucket_name : str
        Name of the S3 bucket.
    file_name : str
        Object key inside the bucket.
    sheet_name, header
        Forwarded to ``pd.read_excel`` for the Excel fallback only. The
        defaults reproduce the previously hard-coded values.

    Returns
    -------
    pandas.DataFrame
    """
    response = s3.get_object(Bucket=bucket_name, Key=file_name)
    # The body is a one-shot stream: read it once and parse from memory.
    payload = BytesIO(response['Body'].read())
    # Lower-case so keys such as 'DATA.CSV' are not sent to the Excel reader.
    ext = os.path.splitext(file_name)[1].lower()
    if ext == '.gz':
        return pd.read_csv(payload, compression='gzip')
    if ext == '.csv':
        return pd.read_csv(payload)
    return pd.read_excel(payload, sheet_name=sheet_name, header=header)

# Import Labeled Data

In [12]:
# Load the labelled microbiology table and parse its timestamp column.
file_name = 'final_microbiology_df.csv.gz'
df = from_s3(bucket_name, file_name).assign(
    charttime=lambda frame: pd.to_datetime(frame['charttime'])
)
# Distinct subject_id values present in the labelled data
subjects = df.subject_id.unique()
df.head()

Unnamed: 0,subject_id,hadm_id,org_name,ab_name,charttime,interpretation,susceptible_flag
0,10000826,20032235.0,ESCHERICHIA COLI,AMPICILLIN,2146-12-08 22:22:00,S,1.0
1,10000826,20032235.0,ESCHERICHIA COLI,AMPICILLIN/SULBACTAM,2146-12-08 22:22:00,S,1.0
2,10000826,20032235.0,ESCHERICHIA COLI,CEFAZOLIN,2146-12-08 22:22:00,S,1.0
3,10000826,20032235.0,ESCHERICHIA COLI,CEFEPIME,2146-12-08 22:22:00,S,1.0
4,10000826,20032235.0,ESCHERICHIA COLI,CEFTAZIDIME,2146-12-08 22:22:00,S,1.0


In [62]:
# Final (organism, antibiotic) pairs kept for modelling.
# The list is assembled per organism; element order matches the original literal.
_ECOLI = 'ESCHERICHIA COLI'
_PSEUDOMONAS = 'PSEUDOMONAS AERUGINOSA'
_KLEBSIELLA = 'KLEBSIELLA PNEUMONIAE'

pairs = (
    [(_ECOLI, drug) for drug in (
        'AMPICILLIN',
        'AMPICILLIN/SULBACTAM',
        'CEFAZOLIN',
        'CEFEPIME',
        'CEFTAZIDIME',
        'CEFTRIAXONE',
        'CIPROFLOXACIN',
        'GENTAMICIN',
        'MEROPENEM',
        'NITROFURANTOIN',
        'TOBRAMYCIN',
        'TRIMETHOPRIM/SULFA',
    )]
    + [(_PSEUDOMONAS, drug) for drug in (
        'CEFEPIME',
        'CEFTAZIDIME',
        'CIPROFLOXACIN',
        'GENTAMICIN',
        'MEROPENEM',
        'PIPERACILLIN/TAZO',
        'TOBRAMYCIN',
    )]
    + [(_ECOLI, 'PIPERACILLIN/TAZO')]
    + [(_KLEBSIELLA, drug) for drug in (
        'AMPICILLIN/SULBACTAM',
        'CEFAZOLIN',
        'CEFEPIME',
        'CEFTAZIDIME',
        'CEFTRIAXONE',
        'CIPROFLOXACIN',
        'GENTAMICIN',
        'MEROPENEM',
        'NITROFURANTOIN',
        'PIPERACILLIN/TAZO',
        'TOBRAMYCIN',
        'TRIMETHOPRIM/SULFA',
    )]
)

# Keep only rows whose (org_name, ab_name) combination is one of the final pairs.
# A MultiIndex membership test is vectorised, unlike building a Python tuple per
# row with apply(axis=1), and it also works on an empty frame.
df = df[pd.MultiIndex.from_frame(df[['org_name', 'ab_name']]).isin(pairs)]

# Majority Classifier - Baseline Metrics 

In [63]:
# One row per distinct (org_name, ab_name) combination present in df
org_ab_label = df.loc[:, ['org_name', 'ab_name']].drop_duplicates(keep='first')

In [65]:
# Majority-class baseline: for every (organism, antibiotic) label, predict the
# most frequent susceptible_flag for all rows and score that constant prediction.
baselines = []

for row in org_ab_label.itertuples(index=False):
    org, ab = row.org_name, row.ab_name
    # Boolean mask instead of a string-built query, so names containing quotes
    # cannot break (or alter) the filter expression.
    sub = df[(df['org_name'] == org) & (df['ab_name'] == ab)]
    y_true = sub['susceptible_flag']

    # Class 1 only when strictly more than half of the rows are 1; ties go to 0.
    maj_class = int(y_true.sum() / len(y_true) > 0.5)
    y_pred = [maj_class] * len(y_true)

    # When the constant prediction is 0 there are no predicted positives, so
    # precision/F1 are undefined. zero_division=0 returns the same 0.0 that the
    # default produced, without emitting an UndefinedMetricWarning.
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    metrics = {'bacteria':org, 'antibiotic':ab, 'accuracy':accuracy, 'precision':precision, 'recall':recall, 'f1_score':f1}
    baselines.append(metrics)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [66]:
# Collect the per-label metric dicts into one table (one row per label)
baseline_results = pd.DataFrame(data=baselines)
baseline_results

Unnamed: 0,bacteria,antibiotic,accuracy,precision,recall,f1_score
0,ESCHERICHIA COLI,AMPICILLIN,0.504,0.504,1.0,0.67
1,ESCHERICHIA COLI,AMPICILLIN/SULBACTAM,0.611,0.611,1.0,0.759
2,ESCHERICHIA COLI,CEFAZOLIN,0.806,0.806,1.0,0.892
3,ESCHERICHIA COLI,CEFEPIME,0.915,0.915,1.0,0.956
4,ESCHERICHIA COLI,CEFTAZIDIME,0.921,0.921,1.0,0.959
5,ESCHERICHIA COLI,CEFTRIAXONE,0.883,0.883,1.0,0.938
6,ESCHERICHIA COLI,CIPROFLOXACIN,0.732,0.732,1.0,0.846
7,ESCHERICHIA COLI,GENTAMICIN,0.897,0.897,1.0,0.946
8,ESCHERICHIA COLI,MEROPENEM,0.999,0.999,1.0,0.999
9,ESCHERICHIA COLI,NITROFURANTOIN,0.957,0.957,1.0,0.978


# Features

In [3]:
# Load the feature table from the cleaned MIMIC prefix of the bucket
file_name = 'MIMIC/Cleaned/df_omr_pt_ad_diag_proc_pr.csv'
feat = from_s3(bucket_name=bucket_name, file_name=file_name)
feat.head()

Unnamed: 0,subject_id,charttime,Height (Inches),Weight (Lbs),bp_systolic,bp_diastolic,gender,anchor_age,insurance,language,...,V03AC,V03AE,V03AF,V03AX,V04CC,V04CF,V04CG,V04CK,V06DC,V07AB
0,10000826,2146-12-08 22:22:00,63.786,166.608,116.368,73.125,F,32,Private,English,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10000898,2187-09-26 08:00:00,62.251,162.011,126.0,76.0,F,79,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10001122,2142-04-08 10:48:00,62.926,170.767,131.007,72.597,F,70,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10001401,2133-12-15 11:04:00,65.45,167.941,146.537,60.63,F,91,Medicare,English,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10001401,2134-10-09 13:45:00,65.5,168.283,146.155,60.552,F,92,Medicare,English,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Allow long sequences (e.g. the full column Index) to print without truncation
pd.set_option('display.max_seq_items', 4000)

In [28]:
# First label row for one example subject
label_cols = ['subject_id', 'org_name', 'ab_name', 'charttime', 'susceptible_flag']
df.loc[df['subject_id'] == 10001401, label_cols].head(1)

Unnamed: 0,subject_id,org_name,ab_name,charttime,susceptible_flag
52,10001401,ENTEROCOCCUS FAECALIS,AMPICILLIN,2134-10-09 13:45:00,1.0


In [None]:
# Numeric feature columns that are strictly positive for one example visit.
# Self-contained so it runs on a fresh kernel, and restricted to numeric columns
# because `> 0` is not defined for the string-valued columns of feat.
example_row = feat[(feat.subject_id == 10001401) & (feat.charttime == '2134-10-09 13:45:00')]
example_num = example_row.select_dtypes(include=['number'])
example_num.loc[:, example_num.gt(0).all()]

In [32]:
# Feature row(s) for one subject at one chart time, selected with a single mask
is_subject = feat['subject_id'] == 10001401
is_visit = feat['charttime'] == '2134-10-09 13:45:00'
ex = feat[is_subject & is_visit]

In [38]:
# Cast the identifier to str so it is treated as a label, not a numeric feature.
# assign() returns a new frame, so this does not write into a filtered slice of
# feat (which is what triggers SettingWithCopyWarning).
ex = ex.assign(subject_id=ex.subject_id.astype(str))

In [44]:
# Object-typed columns side by side with the numeric columns that are > 0 for ex.
# num is derived here so the cell does not depend on a cell further down.
num = ex.select_dtypes(include=['number'])
pd.concat([ex.select_dtypes(include=['object']), num.loc[:, num.gt(0).all()]], axis=1)

Unnamed: 0,subject_id,charttime,gender,insurance,language,marital_status,race,Height (Inches),Weight (Lbs),bp_systolic,...,M01AE,N01BB,N02AA,N02BE,N05BA,N05CH,P01AB,PR_99,R03AC,R03AL
4,10001401,2134-10-09 13:45:00,F,Medicare,English,MARRIED,WHITE,65.5,168.283,146.155,...,1.0,5.0,12.0,13.0,8.0,1.0,1.0,98.0,3.0,1.0


In [43]:
# Numeric part of ex, then only the columns whose values are all > 0
num = ex.select_dtypes(include='number')
positive_cols = (num > 0).all(axis=0)
num.loc[:, positive_cols]

Unnamed: 0,Height (Inches),Weight (Lbs),bp_systolic,bp_diastolic,anchor_age,BLD001,BLD003,BLD004,BLD007,CIR007,...,M01AE,N01BB,N02AA,N02BE,N05BA,N05CH,P01AB,PR_99,R03AC,R03AL
4,65.5,168.283,146.155,60.552,92,1.0,1.0,2.0,1.0,6.0,...,1.0,5.0,12.0,13.0,8.0,1.0,1.0,98.0,3.0,1.0


In [24]:
# Preview of the rows that have no missing value in any column
feat.dropna(axis=0, how='any').head()

Unnamed: 0,subject_id,charttime,Height (Inches),Weight (Lbs),bp_systolic,bp_diastolic,gender,anchor_age,insurance,language,...,V03AC,V03AE,V03AF,V03AX,V04CC,V04CF,V04CG,V04CK,V06DC,V07AB
3,10001401,2133-12-15 11:04:00,65.45,167.941,146.537,60.63,F,91,Medicare,English,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10001401,2134-10-09 13:45:00,65.5,168.283,146.155,60.552,F,92,Medicare,English,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,10001401,2135-09-20 13:04:00,65.542,168.566,146.35,60.567,F,93,Medicare,English,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,10001725,2116-10-25 13:07:00,62.201,158.384,119.786,74.41,F,52,Private,English,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,10001725,2118-11-26 11:05:00,62.201,158.476,120.703,74.785,F,54,Private,English,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Toy example of masking non-positive values in numeric columns.
# It uses its own names so that running the notebook top to bottom does not
# overwrite the microbiology `df`; pandas is already imported in the first cell.
sample_data = {
    'A': [1, 2, 3, 0],
    'B': [-1, -2, -3, -4],
    'C': [4, 5, 6, 7],
    'D': ['foo', 'bar', 'baz', 'qux']  # Non-numeric column
}

sample_df = pd.DataFrame(sample_data)

# Select only numeric columns
numeric_cols = sample_df.select_dtypes(include=['number'])

# Boolean-frame indexing keeps the shape: values <= 0 become NaN, and no rows or
# columns are dropped.
filtered_df = numeric_cols[numeric_cols > 0]

# Last expression gives the rich DataFrame display
filtered_df


Index(['subject_id', 'charttime', 'Height (Inches)', 'Weight (Lbs)',
       'bp_systolic', 'bp_diastolic', 'gender', 'anchor_age', 'insurance',
       'language', 'marital_status', 'race', '99', 'BLD001', 'BLD002',
       'BLD003', 'BLD004', 'BLD005', 'BLD006', 'BLD007', 'BLD008', 'BLD009',
       'BLD010', 'CIR001', 'CIR002', 'CIR003', 'CIR004', 'CIR005', 'CIR006',
       'CIR007', 'CIR008', 'CIR009', 'CIR010', 'CIR011', 'CIR012', 'CIR013',
       'CIR014', 'CIR015', 'CIR016', 'CIR017', 'CIR018', 'CIR019', 'CIR020',
       'CIR021', 'CIR022', 'CIR023', 'CIR024', 'CIR025', 'CIR026', 'CIR027',
       'CIR028', 'CIR029', 'CIR030', 'CIR031', 'CIR032', 'CIR033', 'CIR034',
       'CIR035', 'CIR036', 'CIR037', 'CIR038', 'CIR039', 'DEN001', 'DEN002',
       'DEN003', 'DIG001', 'DIG002', 'DIG003', 'DIG004', 'DIG005', 'DIG006',
       'DIG007', 'DIG008', 'DIG009', 'DIG010', 'DIG011', 'DIG012', 'DIG013',
       'DIG014', 'DIG015', 'DIG016', 'DIG017', 'DIG018', 'DIG019', 'DIG020',
       'DIG021'