In [28]:
import zipfile
import os
import numpy as np
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import boto3
import pandas as pd
from io import BytesIO
from io import StringIO
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

In [3]:
# Initialize S3 client
s3 = boto3.client('s3')

# Function to load data 
bucket_name = 'ml-stack-759578648427-us-west-2'

def from_s3(bucket_name, file_name):
    response = s3.get_object(Bucket=bucket_name, Key=file_name)
    ext = os.path.splitext(file_name)[1]
    if ext == '.gz':
        data = pd.read_csv(BytesIO(response['Body'].read()), compression='gzip')
    elif ext == '.csv':
        data = pd.read_csv(BytesIO(response['Body'].read()))
    else:
        data = pd.read_excel(BytesIO(response['Body'].read()), sheet_name='DX_to_CCSR_Mapping', header=1)
    return data

# Import Labeled Data

In [56]:
file_name = 'final_microbiology_df.csv.gz'
df = from_s3(bucket_name, file_name) 
df['charttime'] = pd.to_datetime(df['charttime'])
subjects = df['subject_id'].unique()
df.head()

Unnamed: 0,subject_id,hadm_id,org_name,ab_name,charttime,interpretation,susceptible_flag
0,10000826,20032235.0,ESCHERICHIA COLI,AMPICILLIN,2146-12-08 22:22:00,S,1.0
1,10000826,20032235.0,ESCHERICHIA COLI,AMPICILLIN/SULBACTAM,2146-12-08 22:22:00,S,1.0
2,10000826,20032235.0,ESCHERICHIA COLI,CEFAZOLIN,2146-12-08 22:22:00,S,1.0
3,10000826,20032235.0,ESCHERICHIA COLI,CEFEPIME,2146-12-08 22:22:00,S,1.0
4,10000826,20032235.0,ESCHERICHIA COLI,CEFTAZIDIME,2146-12-08 22:22:00,S,1.0


In [62]:
# final pairs 
pairs = [('ESCHERICHIA COLI', 'AMPICILLIN'),
 ('ESCHERICHIA COLI', 'AMPICILLIN/SULBACTAM'),
 ('ESCHERICHIA COLI', 'CEFAZOLIN'),
 ('ESCHERICHIA COLI', 'CEFEPIME'),
 ('ESCHERICHIA COLI', 'CEFTAZIDIME'),
 ('ESCHERICHIA COLI', 'CEFTRIAXONE'),
 ('ESCHERICHIA COLI', 'CIPROFLOXACIN'),
 ('ESCHERICHIA COLI', 'GENTAMICIN'),
 ('ESCHERICHIA COLI', 'MEROPENEM'),
 ('ESCHERICHIA COLI', 'NITROFURANTOIN'),
 ('ESCHERICHIA COLI', 'TOBRAMYCIN'),
 ('ESCHERICHIA COLI', 'TRIMETHOPRIM/SULFA'),
 ('PSEUDOMONAS AERUGINOSA', 'CEFEPIME'),
 ('PSEUDOMONAS AERUGINOSA', 'CEFTAZIDIME'),
 ('PSEUDOMONAS AERUGINOSA', 'CIPROFLOXACIN'),
 ('PSEUDOMONAS AERUGINOSA', 'GENTAMICIN'),
 ('PSEUDOMONAS AERUGINOSA', 'MEROPENEM'),
 ('PSEUDOMONAS AERUGINOSA', 'PIPERACILLIN/TAZO'),
 ('PSEUDOMONAS AERUGINOSA', 'TOBRAMYCIN'),
 ('ESCHERICHIA COLI', 'PIPERACILLIN/TAZO'),
 ('KLEBSIELLA PNEUMONIAE', 'AMPICILLIN/SULBACTAM'),
 ('KLEBSIELLA PNEUMONIAE', 'CEFAZOLIN'),
 ('KLEBSIELLA PNEUMONIAE', 'CEFEPIME'),
 ('KLEBSIELLA PNEUMONIAE', 'CEFTAZIDIME'),
 ('KLEBSIELLA PNEUMONIAE', 'CEFTRIAXONE'),
 ('KLEBSIELLA PNEUMONIAE', 'CIPROFLOXACIN'),
 ('KLEBSIELLA PNEUMONIAE', 'GENTAMICIN'),
 ('KLEBSIELLA PNEUMONIAE', 'MEROPENEM'),
 ('KLEBSIELLA PNEUMONIAE', 'NITROFURANTOIN'),
 ('KLEBSIELLA PNEUMONIAE', 'PIPERACILLIN/TAZO'),
 ('KLEBSIELLA PNEUMONIAE', 'TOBRAMYCIN'),
 ('KLEBSIELLA PNEUMONIAE', 'TRIMETHOPRIM/SULFA')]

df = df[df[['org_name', 'ab_name']].apply(tuple, axis=1).isin(pairs)]

# Majority Classifier - Baseline Metrics 

In [63]:
# get unique org & ab combos 
org_ab_label = df[['org_name', 'ab_name']].drop_duplicates()

In [65]:
# for each label calculate metrics for majority classifier 
baselines = []

for index, row in org_ab_label.iterrows():
    org = f"{row.org_name}"
    ab = f"{row.ab_name}"
    sub = df.query(f"org_name == '{org}' & ab_name == '{ab}'")
    y_true = sub.susceptible_flag 

    if (y_true.sum() / len(y_true)) > 0.5:
        maj_class = 1
    else:
        maj_class = 0
    
    y_pred = [maj_class] * len(y_true)
    
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    metrics = {'bacteria':org, 'antibiotic':ab, 'accuracy':accuracy, 'precision':precision, 'recall':recall, 'f1_score':f1}
    baselines.append(metrics)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [66]:
baseline_results = pd.DataFrame(baselines)
baseline_results

Unnamed: 0,bacteria,antibiotic,accuracy,precision,recall,f1_score
0,ESCHERICHIA COLI,AMPICILLIN,0.504,0.504,1.0,0.67
1,ESCHERICHIA COLI,AMPICILLIN/SULBACTAM,0.611,0.611,1.0,0.759
2,ESCHERICHIA COLI,CEFAZOLIN,0.806,0.806,1.0,0.892
3,ESCHERICHIA COLI,CEFEPIME,0.915,0.915,1.0,0.956
4,ESCHERICHIA COLI,CEFTAZIDIME,0.921,0.921,1.0,0.959
5,ESCHERICHIA COLI,CEFTRIAXONE,0.883,0.883,1.0,0.938
6,ESCHERICHIA COLI,CIPROFLOXACIN,0.732,0.732,1.0,0.846
7,ESCHERICHIA COLI,GENTAMICIN,0.897,0.897,1.0,0.946
8,ESCHERICHIA COLI,MEROPENEM,0.999,0.999,1.0,0.999
9,ESCHERICHIA COLI,NITROFURANTOIN,0.957,0.957,1.0,0.978
