# Algorithmically Assign Abx To CSNs based on Model Predictions

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pulp import *
import os, glob

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/Users/conorcorbin/.config/gcloud/application_default_credentials.json' 
os.environ['GCLOUD_PROJECT'] = 'mining-clinical-decisions' 
%load_ext google.cloud.bigquery

from google.cloud import bigquery
client=bigquery.Client()

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery




### Load Predictions Data For Each Abx Option

Predictions of coverage for each antibiotic selection are stored in csv files using the following directory schema.  This function reads in predictions for each classifier so that each row is CSN and has an estimated probability of coverage for each antibiotic selection.  We implement a test function that compute the AUROC of each model after the predictions have been read in and cross checks it with the AUROC that was computed and stored in an `auroc.txt` text file during the model training procedure

In [6]:
import pdb
from sklearn.metrics import roc_auc_score

def test_load_predictions(df):
    """
    Reads in output of df, computes AUROC for each classifier and asserts that it equals the AUROC
    listed in the auroc.txt file associate with the classfier's directory
    """
    base_path="/Users/conorcorbin/repos/er_infection/results/ast_models_bucket1/testing/{abx}"
    abx_options = ["Vancomycin",
               "Ampicillin",
               "Cefazolin",
               "Ceftriaxone",
               "Cefepime",
               "Zosyn",
               "Ciprofloxacin",
               "Meropenem",
               "Vancomycin_Meropenem",
               "Vancomycin_Zosyn",
               "Vancomycin_Cefepime",
               "Vancomycin_Ceftriaxone"
               ]
    for abx in abx_options:
        path = base_path.format(abx=abx)
        f_auroc = os.path.join(path, 'auroc.txt')
        with open(f_auroc, 'r') as f:
            auroc = round(float(f.read()), 3)
        
        computed_auroc = round(roc_auc_score(df['%s_label' % abx], df['%s_predictions' % abx]), 3)
        
        assert auroc == computed_auroc
        print("%s_auroc: %s"% (abx, str(auroc)))

def load_predictions():
    """Helper function that loads predictions from AST classifiers for test set data"""
    
    base_path="/Users/conorcorbin/repos/er_infection/results/ast_models_bucket1/testing/{abx}"
    abx_options = ["Vancomycin",
                   "Ampicillin",
                   "Cefazolin",
                   "Ceftriaxone",
                   "Cefepime",
                   "Zosyn",
                   "Ciprofloxacin",
                   "Meropenem",
                   "Vancomycin_Meropenem",
                   "Vancomycin_Zosyn",
                   "Vancomycin_Cefepime",
                   "Vancomycin_Ceftriaxone"
                   ]
    df = pd.DataFrame()
    for i, abx in enumerate(abx_options):
        path = base_path.format(abx=abx)
        f_path = glob.glob(os.path.join(path, '*predictions.csv'))[0]
        if i == 0:
            df = pd.read_csv(f_path)
            df = df[['anon_id', 'pat_enc_csn_id_coded', 'label', 'predictions']]
            df = df.rename(columns={'label' : '%s_label' % abx,
                                    'predictions' : '%s_predictions' % abx})
        else:
            df_preds = pd.read_csv(f_path)
            df_preds = df_preds[['anon_id', 'pat_enc_csn_id_coded', 'label', 'predictions']]
            df_preds = df_preds.rename(columns={'label' : '%s_label' % abx,
                                                'predictions' : '%s_predictions' % abx})
            df = df.merge(df_preds, how='left', on=['anon_id', 'pat_enc_csn_id_coded'])
    
    return df
    
df = load_predictions()
test_load_predictions(df)

Vancomycin_auroc: 0.717
Ampicillin_auroc: 0.621
Cefazolin_auroc: 0.674
Ceftriaxone_auroc: 0.688
Cefepime_auroc: 0.649
Zosyn_auroc: 0.641
Ciprofloxacin_auroc: 0.611
Meropenem_auroc: 0.685
Vancomycin_Meropenem_auroc: 0.731
Vancomycin_Zosyn_auroc: 0.699
Vancomycin_Cefepime_auroc: 0.697
Vancomycin_Ceftriaxone_auroc: 0.669


In [7]:
print(len(df))
df.head()

1320


Unnamed: 0,anon_id,pat_enc_csn_id_coded,Vancomycin_label,Vancomycin_predictions,Ampicillin_label,Ampicillin_predictions,Cefazolin_label,Cefazolin_predictions,Ceftriaxone_label,Ceftriaxone_predictions,...,Meropenem_label,Meropenem_predictions,Vancomycin_Meropenem_label,Vancomycin_Meropenem_predictions,Vancomycin_Zosyn_label,Vancomycin_Zosyn_predictions,Vancomycin_Cefepime_label,Vancomycin_Cefepime_predictions,Vancomycin_Ceftriaxone_label,Vancomycin_Ceftriaxone_predictions
0,JC2a03b24,131260812263,0,0.108161,0,0.462078,0,0.664275,1,0.79827,...,1,0.896608,1,0.98767,1,0.964979,1,0.961668,1,0.871461
1,JCe45a3c,131260883970,1,0.483241,1,0.538177,0,0.631024,0,0.784364,...,0,0.883272,1,0.987744,1,0.966207,1,0.970636,1,0.904466
2,JCd235bb,131261001599,0,0.21561,0,0.501686,1,0.631382,1,0.653581,...,1,0.815785,1,0.98887,1,0.965573,1,0.956648,1,0.897293
3,JCd29af0,131261001696,0,0.321022,0,0.351406,0,0.459186,0,0.424984,...,1,0.554059,1,0.980191,1,0.947425,1,0.948773,0,0.805817
4,JCd356bf,131261014293,1,0.421153,1,0.412029,1,0.528823,1,0.724792,...,1,0.839434,1,0.954408,1,0.963848,1,0.946225,1,0.840946


### Get clinician prescribing patterns
This SQL query gathers all abx medications ordered within the first 24 hours of admission that were administered to the patient in long format ( one row per administered med_description ) and then joins to our labels table so that we can cross check whether the administered antibiotic was sufficient to cover the patient. 

In [8]:
query = """
SELECT
    om.anon_id, om.pat_enc_csn_id_coded, om.order_med_id_coded, l.index_time, om.med_description,
    l.Ampicillin, l.Ciprofloxacin, l.Cefazolin, l.Ceftriaxone, l.Cefepime, l.Zosyn, l.Vancomycin,
    l.Meropenem, l.Vancomycin_Meropenem, l.Vancomycin_Zosyn, l.Vancomycin_Cefepime, l.Vancomycin_Ceftriaxone
FROM
    `mining-clinical-decisions.abx.abx_orders_given_and_stopped` om
INNER JOIN 
    `mining-clinical-decisions.abx.final_ast_labels` l
USING
    (pat_enc_csn_id_coded)
WHERE
    om.was_given = 1
ORDER BY 
    om.anon_id, om.pat_enc_csn_id_coded, om.order_time
"""
query_job = client.query(query)
df_abx = query_job.result().to_dataframe()
df_abx.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,order_med_id_coded,index_time,med_description,Ampicillin,Ciprofloxacin,Cefazolin,Ceftriaxone,Cefepime,Zosyn,Vancomycin,Meropenem,Vancomycin_Meropenem,Vancomycin_Zosyn,Vancomycin_Cefepime,Vancomycin_Ceftriaxone
0,JC29f92d0,131265245074,603371616,2019-02-20 10:49:00+00:00,CEFTRIAXONE 1 GRAM/100 ML NS MINIBAG PLUS,0,0,0,0,0,1,0,1,1,1,0,0
1,JC29f92d0,131265245074,603374106,2019-02-20 10:49:00+00:00,PIPERACILLIN-TAZOBACTAM-DEXTRS 3.375 GRAM/50 M...,0,0,0,0,0,1,0,1,1,1,0,0
2,JC29f92d0,131265245074,603418477,2019-02-20 10:49:00+00:00,ERTAPENEM 1 GRAM/50 ML NS MINIBAG PLUS,0,0,0,0,0,1,0,1,1,1,0,0
3,JC29f9afd,131114292966,476869102,2015-09-03 14:08:00+00:00,VANCOMYCIN IN D5W 1 GRAM/200 ML IV PGBK,1,0,1,1,1,1,1,1,1,1,1,1
4,JC29f9afd,131114292966,476869103,2015-09-03 14:08:00+00:00,PIPERACILLIN-TAZOBACTAM-DEXTRS 4.5 GRAM/100 ML...,1,0,1,1,1,1,1,1,1,1,1,1


### Aggregate antibiotic orders 
Here we aggregate the antibiotic orders so that one row in the desulting dataframe corresponds to a unique CSN. We do this by
1. Grouping by the CSN
2. Grabbing the first word (antibiotic name) from the med description
3. Aggregating the `med_description` column such that it is a single string with all antibiotics admistered to the patient, sorted in alphabetical order and separated by spaces. 
4. Only keep CSNs where the set of administered antibiotics is equal to one of the antbiotic selections we've trained classifiers for. 

#### List Most Commmon Antibiotic Combinations

In [9]:
concat_abx = lambda x : ' '.join(np.unique(sorted([a for a in x])))
df_common_abx = (df_abx
    .assign(med_description=lambda x: [a.split(' ')[0] for a in x.med_description]) # Only Take first word (abx)
    .assign(med_description=lambda x: [(a.replace('PIPERACILLIN-TAZOBACTAM-DEXTRS','PIPERACILLIN-TAZOBACTAM')
                                        .replace('VANCOMYCIN-WATER', 'VANCOMYCIN'))
                                       for a in x.med_description])
    .assign(year=lambda x: x.index_time.dt.year) # get year of each CSN - used to filter later on
    .groupby('pat_enc_csn_id_coded')
    .agg({'med_description' : concat_abx})
    .reset_index()
    .groupby('med_description')
    .agg(num_csns=('pat_enc_csn_id_coded', 'nunique'))
    .sort_values('num_csns', ascending=False)
           )
df_common_abx.head(20)

Unnamed: 0_level_0,num_csns
med_description,Unnamed: 1_level_1
CEFTRIAXONE,1886
PIPERACILLIN-TAZOBACTAM VANCOMYCIN,985
PIPERACILLIN-TAZOBACTAM,640
CIPROFLOXACIN,307
CEFTRIAXONE PIPERACILLIN-TAZOBACTAM,225
CEFTRIAXONE PIPERACILLIN-TAZOBACTAM VANCOMYCIN,211
LEVOFLOXACIN PIPERACILLIN-TAZOBACTAM VANCOMYCIN,191
AZITHROMYCIN CEFTRIAXONE,175
CEFTRIAXONE VANCOMYCIN,152
LEVOFLOXACIN,128


In [10]:
# Useful dictionaries to map corresponding. strings for the sameantibiotic selections
abx_map = {'Ceftriaxone' : "CEFTRIAXONE",
           'Vancomycin_Zosyn' : "PIPERACILLIN-TAZOBACTAM VANCOMYCIN",
           'Zosyn' : "PIPERACILLIN-TAZOBACTAM",
           'Vancomycin_Ceftriaxone' : "CEFTRIAXONE VANCOMYCIN",
           'Vancomycin_Cefepime' : "CEFEPIME VANCOMYCIN",
           'Cefepime' : "CEFEPIME",
           'Vancomycin' :  "VANCOMYCIN",
           'Meropenem' : "MEROPENEM",
           'Vancomycin_Meropenem' : "MEROPENEM VANCOMYCIN",
           'Cefazolin' : "CEFAZOLIN",
           'Ciprofloxacin' : "CIPROFLOXACIN",
           'Ampicillin' : 'AMPICILLIN'
          }
abx_map_inverse = {abx_map[key] : key for key in abx_map}
abx_map_inverse['CEFTRIAXONE PIPERACILLIN-TAZOBACTAM VANCOMYCIN'] = 'Vancomycin_Zosyn'
# abx_map_inverse['LEVOFLOXACIN PIPERACILLIN-TAZOBACTAM VANCOMYCIN'] = 'Vancomycin_Zosyn'
abx_map_inverse['AZITHROMYCIN PIPERACILLIN-TAZOBACTAM VANCOMYCIN'] = 'Vancomycin_Zosyn'
# abx_map_inverse['MEROPENEM PIPERACILLIN-TAZOBACTAM VANCOMYCIN'] = 'Vancomycin_Meropenem'
abx_map_inverse['AZITHROMYCIN CEFTRIAXONE'] = 'Ceftriaxone'
years = [2019]
# Lambda that aggregate Antibiotic orders after we've grouped by CSN
concat_abx = lambda x : ' '.join(np.unique(sorted([a for a in x])))

# 
df_drugs = (df_abx
    .assign(med_description=lambda x: [a.split(' ')[0] for a in x.med_description]) # Only Take first word (abx)
    .assign(med_description=lambda x: [(a.replace('PIPERACILLIN-TAZOBACTAM-DEXTRS','PIPERACILLIN-TAZOBACTAM')
                                        .replace('VANCOMYCIN-WATER', 'VANCOMYCIN'))
                                       for a in x.med_description])
    .assign(year=lambda x: x.index_time.dt.year) # get year of each CSN - used to filter later on
    .groupby('pat_enc_csn_id_coded')
    .agg({'med_description' : concat_abx,
          'year' : 'first',
          'Ampicillin' : 'first',
          'Ciprofloxacin' : 'first',
          'Cefazolin' : 'first',
          'Ceftriaxone' : 'first',
          'Cefepime' : 'first',
          'Zosyn' : 'first',
          'Vancomycin' : 'first',
          'Meropenem' : 'first',
          'Vancomycin_Ceftriaxone' : 'first',
          'Vancomycin_Cefepime' : 'first',
          'Vancomycin_Zosyn' : 'first',
          'Vancomycin_Meropenem' : 'first'})
    .reset_index()
    # Only look at test set data and CSNs where allowed antibiotic selection was administered
    .query("year == @years and med_description in @abx_map_inverse", engine='python') 
    .assign(med_description=lambda x: [abx_map_inverse[a] for a in x.med_description])
)

# Roughly 700 of the 1300 original CSNs in the test set
print(len(df_drugs))
df_drugs.head()

770


Unnamed: 0,pat_enc_csn_id_coded,med_description,year,Ampicillin,Ciprofloxacin,Cefazolin,Ceftriaxone,Cefepime,Zosyn,Vancomycin,Meropenem,Vancomycin_Ceftriaxone,Vancomycin_Cefepime,Vancomycin_Zosyn,Vancomycin_Meropenem
6847,131260812263,Ceftriaxone,2019,0,1,0,1,1,1,0,1,1,1,1,1
6855,131260883970,Vancomycin_Zosyn,2019,1,1,0,0,0,1,1,0,1,1,1,1
6858,131261001599,Zosyn,2019,0,1,1,1,1,1,0,1,1,1,1,1
6861,131261014293,Ceftriaxone,2019,1,1,1,1,1,1,1,1,1,1,1,1
6878,131261155365,Ceftriaxone,2019,1,1,1,1,1,1,0,1,1,1,1,1


### Merge this dataframe to predictions dataframe
After this step we should have a dataframe that has one row per CSN, each row should have the antibiotic selection actually administered to the patient, along with the predicted probability of said antibiotic selection covering the patient, and the ground truth as to whether it did. 

In [11]:
# Merge df_total to df on pat_enc_csn_id_coded
df_new = (df
    .merge(df_drugs, how='inner', on='pat_enc_csn_id_coded')
)

# Sanity check - make sure %abx_label columns are equal to %abx columns
for abx in abx_map:
    for i in range(len(df_new)):
        assert df_new[abx].values[i] == df_new['%s_label' % abx].values[i]
        
# Sanity check 2: compute AUROC of this subset of patients and compare to AUROC on full test set
base_path = "/Users/conorcorbin/repos/er_infection/results/ast_models_bucket1/testing/{abx}"
for abx in abx_map:
    computed_auroc = roc_auc_score(df_new['%s_label' % abx], df_new['%s_predictions' % abx])
    f_auroc = os.path.join(base_path.format(abx=abx), 'auroc.txt')
    with open(f_auroc, 'r') as f:
        auroc = float(f.read())
    print("{}: Full test set AUROC:{:.3f} Subset AUROC:{:.3f}".format(abx, auroc, computed_auroc))

Ceftriaxone: Full test set AUROC:0.688 Subset AUROC:0.697
Vancomycin_Zosyn: Full test set AUROC:0.699 Subset AUROC:0.672
Zosyn: Full test set AUROC:0.641 Subset AUROC:0.602
Vancomycin_Ceftriaxone: Full test set AUROC:0.669 Subset AUROC:0.674
Vancomycin_Cefepime: Full test set AUROC:0.697 Subset AUROC:0.739
Cefepime: Full test set AUROC:0.649 Subset AUROC:0.661
Vancomycin: Full test set AUROC:0.717 Subset AUROC:0.698
Meropenem: Full test set AUROC:0.685 Subset AUROC:0.678
Vancomycin_Meropenem: Full test set AUROC:0.731 Subset AUROC:0.766
Cefazolin: Full test set AUROC:0.674 Subset AUROC:0.675
Ciprofloxacin: Full test set AUROC:0.611 Subset AUROC:0.615
Ampicillin: Full test set AUROC:0.621 Subset AUROC:0.597


### Create Binary Integer Programming Problem Formulation and Solve
Here we specificy the problem formulation of the optimization process we wish to solve. The goal is to maximize the probability of covering the set of patients in the test set with the available antibiotic selections subject to the constraints that we assign each antibiotic selection a prespecified number of times, and that we only assign one antibiotic selection to each patient CSN. 

More technically, Let $N$ be the number of patient CSNs in our test set who were administered one of the 12 abx selections by clinicians, and let $K$ be the number of possible antibiotic selections.  Let $A$ be a matrix in $\mathbb{R}^{N\times K}$ such that $a_{ij}$ is 1 if antibiotic selection $j$ is selected for patient CSN $i$ and 0 otherwise. Let $\Phi$ be a matrix in $\mathbb{R}^{N \times K}$ such that $\phi_{ij}$ is the predicted probability that antibiotic $j$ will cover patient CSN $i$.  Let $C$ be a vector in $\mathbb{R}^K$ such that $c_j$ specifies the budget for anitbiotic selection $j$ - that is the number of times we are allowed to select antibiotic $j$ across our $N$ patient CSNs. Our problem formulation is as follows. 

$$  \underset{A}{\text{maximize}} \sum_{i=1}^{N} \sum_{j=1}^K \phi_{ij} a_{ij} $$

Subject to the following constraints:

$$ \sum_{j=1}^{K} a_{ij} = 1 \quad i = 1, ..., N $$

$$ \sum_{i=1}^{N} a_{ij} = c_j \quad j = 1, ...,  K $$

In the following code, we implenent and solve this optimization process using the pulp python package. 

#### How often was each options prescribed by clinicians?

In [12]:
(df_new
.groupby('med_description')
.agg(num_csns=('pat_enc_csn_id_coded', 'nunique'))
)

Unnamed: 0_level_0,num_csns
med_description,Unnamed: 1_level_1
Cefazolin,8
Cefepime,14
Ceftriaxone,404
Ciprofloxacin,8
Meropenem,9
Vancomycin,13
Vancomycin_Cefepime,23
Vancomycin_Ceftriaxone,31
Vancomycin_Meropenem,9
Vancomycin_Zosyn,149


In [13]:
abx_options = ["Vancomycin",
           "Ampicillin",
           "Cefazolin",
           "Ceftriaxone",
           "Cefepime",
           "Zosyn",
           "Ciprofloxacin",
           "Meropenem",
           "Vancomycin_Meropenem",
           "Vancomycin_Zosyn",
           "Vancomycin_Cefepime",
           "Vancomycin_Ceftriaxone"
           ]

abx_model = LpProblem("Antibiotics", LpMaximize)

# Create binary indicators for whether treatment is used
drug_inds = {}
for abx in abx_options:
    drug_inds[abx] = [LpVariable('%s_%d' % (abx, i), lowBound=0, upBound=1, cat='Binary')
                      for i in range(len(df_new))]

# Add objective function to model
per_csn_sum = []
for i in range(len(df_new)):
    _sum = 0
    for abx in abx_options:
        _sum += drug_inds[abx][i] * df_new['%s_predictions' % abx].values[i]
    per_csn_sum.append(_sum)
    
abx_model += lpSum(per_csn_sum)

# Add one selection constraint
for i in range(len(df_new)):
    selections = []
    for abx in abx_options:
        selections.append(drug_inds[abx][i])
    abx_model += lpSum(selections) == 1

# Add max assignment constraints
# abx_assignment_constraints = {"Vancomycin" : 13,
#                               "Ampicillin" : 0,
#                               "Cefazolin" : 8,
#                               "Ceftriaxone" : 367,
#                               "Cefepime" : 14,
#                               "Zosyn" : 102,
#                               "Ciprofloxacin" : 8,
#                               "Meropenem" : 9,
#                               "Vancomycin_Meropenem" : 9,
#                               "Vancomycin_Zosyn" :  113,
#                               "Vancomycin_Cefepime" : 23,
#                               "Vancomycin_Ceftriaxone" : 31
#                              }
abx_assignment_constraints = {"Vancomycin" : 13,
                              "Ampicillin" : 0,
                              "Cefazolin" : 8,
                              "Ceftriaxone" : 404,
                              "Cefepime" : 14,
                              "Zosyn" : 102,
                              "Ciprofloxacin" : 8,
                              "Meropenem" : 9,
                              "Vancomycin_Meropenem" : 9,
                              "Vancomycin_Zosyn" :  149,
                              "Vancomycin_Cefepime" : 23,
                              "Vancomycin_Ceftriaxone" : 31
                             }


for drug in drug_inds:
    abx_model += lpSum([drug_inds[drug][i] for i in range(len(df_new))]) == abx_assignment_constraints[drug]

# Solve model
abx_model.solve()
print("Status:", LpStatus[abx_model.status])

# Save selected antibiotic to df_new
abx_decisions = []
for i in range(len(df_new)):
    abx_decision = None
    for abx in abx_options:
        if drug_inds[abx][i].value() == 1:
            abx_decision = abx
    assert abx_decision is not None
    abx_decisions.append(abx_decision)
df_new['IP_med_description'] = abx_decisions


Status: Optimal


### Compare Performance to Clinician Performance
1. Write a function that takes in antibiotic selection and outputs a 1 if that selection covered the patient.  Simple to do, but annoying because of different ways we've named antibiotic selections.
2. Compute fraction of time each patient CSN was covered by the antibiotic selection. 

In [14]:
# Ugly helper function that just does some string mapping
def compute_was_covered(x, decision_column='med_description'):
    """
    Given med description, find appropriate label column and return whether patient was covered during CSN
    Returns "Not in abx options" if abx regimen isn't in our set of 12 options - useful for filtering later
    """
    if decision_column == 'med_description':
        med_description = x.med_description
    elif decision_column == 'random_med_description':
        med_description = x.random_med_description
    elif decision_column == 'IP_med_description':
        med_description = x.IP_med_description
    elif decision_column == 'guideline_med_description':
        med_description = x.guideline_med_description
    return x[med_description]
#     if med_description == "CEFTRIAXONE":
#         return x.Ceftriaxone
#     elif med_description == "PIPERACILLIN-TAZOBACTAM VANCOMYCIN":
#         return x.Vancomycin_Zosyn
#     elif med_description == "PIPERACILLIN-TAZOBACTAM":
#         return x.Zosyn
#     elif med_description == "CEFTRIAXONE VANCOMYCIN":
#         return x.Vancomycin_Ceftriaxone
#     elif med_description == "CEFEPIME VANCOMYCIN":
#         return x.Vancomycin_Cefepime
#     elif med_description == "CEFEPIME":
#         return x.Cefepime
#     elif med_description == "VANCOMYCIN":
#         return x.Vancomycin
#     elif med_description == "MEROPENEM":
#         return x.Meropenem
#     elif med_description == "MEROPENEM VANCOMYCIN":
#         return x.Vancomycin_Meropenem
#     elif med_description == "CEFAZOLIN":
#         return x.Cefazolin
#     elif med_description == "CIPROFLOXACIN":
#         return x.Ciprofloxacin
#     elif med_description == "AMPICILLIN":
#         return x.Ampicillin
#     else:
#         return "Not in abx options"
    
    
# Create flag for whether clinicians covered the patient during the csn, whether a random assignemnt covered patient
# CSN, and whether optimized assignment covered the patient CSN.

df_new = (df_new
    .assign(random_med_description=lambda x: np.random.choice(x.med_description, size=len(x.med_description), replace=False))
)
df_new = (df_new
    #.sample(frac=1.0, replace=True) # bootstrap each iteration
    .assign(was_covered_dr=df_new.apply(lambda x: compute_was_covered(x), axis=1))
    .assign(was_covered_random=df_new.apply(lambda x: compute_was_covered(x, 
                                                                          decision_column='random_med_description'),
                                                                          axis=1))
    .assign(was_covered_IP=df_new.apply(lambda x: compute_was_covered(x, 
                                                                      decision_column='IP_med_description'),
                                                                      axis=1))
)

clin_covered_rate = df_new['was_covered_dr'].sum() / len(df_new)
random_covered_rate = df_new['was_covered_random'].sum() / len(df_new)
ip_covered_rate = df_new['was_covered_IP'].sum() / len(df_new)

print(clin_covered_rate)
print(random_covered_rate)
print(ip_covered_rate)

df_new_random = (df_new
        .groupby('random_med_description')
        .agg(num_distinct_csns=('pat_enc_csn_id_coded', 'count'),
             num_times_covered_random=('was_covered_random', 'sum'))
        .reset_index()
        .assign(random_covered=lambda x: ['{}/{}'.format(c, t) for c, t in zip(x.num_times_covered_random,
                                                                               x.num_distinct_csns)])
        .rename(columns={'random_med_description' : 'med_description'})
)[['med_description', 'random_covered']]
                 
df_new_clinician = (df_new
        .groupby('med_description')
        .agg(num_distinct_csns=('pat_enc_csn_id_coded', 'count'),
             num_times_covered_dr=('was_covered_dr', 'sum'))
        .reset_index()
        .assign(dr_covered=lambda x: ['{}/{}'.format(c, t) for c, t in zip(x.num_times_covered_dr,
                                                                               x.num_distinct_csns)])
)[['med_description', 'dr_covered']]
                    
df_new_ip = (df_new
        .groupby('IP_med_description')
        .agg(num_distinct_csns=('pat_enc_csn_id_coded', 'count'),
             num_times_covered_IP=('was_covered_IP', 'sum'))
        .reset_index()
        .assign(IP_covered=lambda x: ['{}/{}'.format(c, t) for c, t in zip(x.num_times_covered_IP,
                                                                               x.num_distinct_csns)])
        .rename(columns={'IP_med_description' : 'med_description'})
)[['med_description', 'IP_covered']]

df_new_agg = (df_new_random
    .merge(df_new_clinician, how='inner', on='med_description')
    .merge(df_new_ip, how='inner', on='med_description')
)

df_new_agg

0.8428571428571429
0.787012987012987
0.8597402597402597


Unnamed: 0,med_description,random_covered,dr_covered,IP_covered
0,Cefazolin,6/8,5/8,8/8
1,Cefepime,12/14,11/14,12/14
2,Ceftriaxone,278/404,313/404,325/404
3,Ciprofloxacin,7/8,7/8,8/8
4,Meropenem,8/9,7/9,5/9
5,Vancomycin,2/13,9/13,8/13
6,Vancomycin_Cefepime,23/23,22/23,22/23
7,Vancomycin_Ceftriaxone,26/31,30/31,29/31
8,Vancomycin_Meropenem,9/9,9/9,8/9
9,Vancomycin_Zosyn,143/149,143/149,143/149


### Upload this final table to bigquery projet

In [15]:
df_final = (df_new
[['anon_id', 'pat_enc_csn_id_coded', 'med_description', 'IP_med_description', 'was_covered_dr', 'was_covered_IP']]
)

df_final.to_gbq(destination_table='abx.abx_allocation',
                project_id='mining-clinical-decisions',
                if_exists='replace')   

1it [00:04,  4.43s/it]


##### Find cases where optimized procedure orders Zosyn and clinicians prescribe vancomycin and zosyn and zosyn sufficient

In [15]:
query_str = "med_description == 'Vancomycin_Zosyn' AND IP_med_description == 'Zosyn' AND Zosyn_label == 1"
df_zosyn_ok = (df_new
[(df_new['med_description']=='Vancomycin_Zosyn') &
(df_new['IP_med_description']=='Zosyn') &
(df_new['Zosyn_label']==1)]
[['anon_id', 'pat_enc_csn_id_coded', 'med_description', 'IP_med_description', 'Zosyn_label', 'Vancomycin_Zosyn_label']]
)

In [16]:
df_zosyn_ok

Unnamed: 0,anon_id,pat_enc_csn_id_coded,med_description,IP_med_description,Zosyn_label,Vancomycin_Zosyn_label
22,JCe6159a,131261616681,Vancomycin_Zosyn,Zosyn,1,1
30,JCd2fc4d,131261807863,Vancomycin_Zosyn,Zosyn,1,1
40,JCea0302,131262039722,Vancomycin_Zosyn,Zosyn,1,1
48,JCcda7db,131262420251,Vancomycin_Zosyn,Zosyn,1,1
74,JCd3d869,131263011349,Vancomycin_Zosyn,Zosyn,1,1
80,JCea97b3,131263078721,Vancomycin_Zosyn,Zosyn,1,1
126,JCeaaa9a,131264012380,Vancomycin_Zosyn,Zosyn,1,1
127,JCec2c71,131264055666,Vancomycin_Zosyn,Zosyn,1,1
207,JCdb0553,131265867822,Vancomycin_Zosyn,Zosyn,1,1
248,JCe8d86c,131266988673,Vancomycin_Zosyn,Zosyn,1,1


In [None]:
schema = [
{'name': 'pat_enc_csn_id_coded', 'type': 'INTEGER'},
{'name': 'feature_type', 'type': 'STRING'},
{'name': 'feature', 'type': 'STRING'},
{'name': 'value', 'type': 'INTEGER'} # bag of words
]
df_zosyn_ok.to_gbq(
    destination_table="abx.zosyn_only_okay",
    project_id='mining-clinical-decisions',
    if_exists='append',
    table_schema=self.feature_matrix_schema
)

Unnamed: 0,anon_id,pat_enc_csn_id_coded,Vancomycin_label,Vancomycin_predictions,Ampicillin_label,Ampicillin_predictions,Cefazolin_label,Cefazolin_predictions,Ceftriaxone_label,Ceftriaxone_predictions,...,Meropenem,Vancomycin_Ceftriaxone,Vancomycin_Cefepime,Vancomycin_Zosyn,Vancomycin_Meropenem,IP_med_description,random_med_description,was_covered_dr,was_covered_random,was_covered_IP
0,JC2a03b24,131260812263,0,0.108161,0,0.462078,0,0.664275,1,0.79827,...,1,1,1,1,1,Ceftriaxone,Zosyn,1,1,1
1,JCe45a3c,131260883970,1,0.483241,1,0.538177,0,0.631024,0,0.784364,...,0,1,1,1,1,Ceftriaxone,Zosyn,1,1,0
2,JCd235bb,131261001599,0,0.21561,0,0.501686,1,0.631382,1,0.653581,...,1,1,1,1,1,Vancomycin_Zosyn,Ceftriaxone,1,1,1
3,JCd356bf,131261014293,1,0.421153,1,0.412029,1,0.528823,1,0.724792,...,1,1,1,1,1,Ceftriaxone,Zosyn,1,1,1
4,JCd1c07c,131261155365,0,0.064201,1,0.528883,1,0.749904,1,0.789198,...,1,1,1,1,1,Ceftriaxone,Ceftriaxone,1,1,1


In [57]:
%%bigquery df_uti_guidelines --project mining-clinical-decisions
WITH observations_with_pos_urine_cultures as (
    SELECT 
        DISTINCT a.*, cohort.index_time
    FROM
        `mining-clinical-decisions.abx.abx_allocation` a
    INNER JOIN 
        `mining-clinical-decisions.abx.final_cohort_table` cohort
    USING
        (pat_enc_csn_id_coded)
    INNER JOIN
        `mining-clinical-decisions.abx.culture_orders_within_24_hrs` c
    USING 
        (pat_enc_csn_id_coded)
    INNER JOIN 
        `mining-clinical-decisions.shc_core.culture_sensitivity` cs 
    USING 
        (order_proc_id_coded)
    WHERE
        cs.description = "URINE CULTURE"
),

observations_with_pos_other_cultures as (
    SELECT 
        DISTINCT pat_enc_csn_id_coded
    FROM
        `mining-clinical-decisions.abx.abx_allocation` a
    INNER JOIN 
        `mining-clinical-decisions.abx.culture_orders_within_24_hrs` c
    USING 
        (pat_enc_csn_id_coded)
    INNER JOIN 
        `mining-clinical-decisions.shc_core.culture_sensitivity` cs 
    USING 
        (order_proc_id_coded)
    WHERE
        cs.description <> "URINE CULTURE"
),

only_utis as (
SELECT 
    DISTINCT a.*
FROM 
    observations_with_pos_urine_cultures a 
WHERE
    a.pat_enc_csn_id_coded NOT IN (SELECT DISTINCT pat_enc_csn_id_coded 
                                   FROM observations_with_pos_other_cultures)
),

utis_with_neutra as (
    SELECT DISTINCT
        a.*,
        MAX(CASE WHEN lr.ord_num_value IS NULL THEN 0
            WHEN lr.ord_num_value < 10 THEN 1
            ELSE 0 END)
        OVER(PARTITION BY a.pat_enc_csn_id_coded) NEUTRA
    FROM
        only_utis a
    LEFT JOIN 
        (SELECT anon_id, cohort.pat_enc_csn_id_coded, lab_name, base_name, ord_num_value, result_time_utc 
         FROM shc_core.lab_result 
         INNER JOIN abx.final_cohort_table cohort
         USING (anon_id)
         WHERE base_name in ('NEUTRA', 'NEUTAB', 'ABSBAND', 'ABSNEUTBDYFL')
         AND result_time_utc <= cohort.index_time
         AND TIMESTAMP_ADD(result_time_utc, INTERVAL 24 Hour) >= cohort.index_time) lr
    USING 
        (pat_enc_csn_id_coded)

),

final_table as (
    SELECT DISTINCT
        a.*,
        MAX(CASE WHEN UPPER(cs.antibiotic) LIKE "%CEFTRIAXONE%" AND UPPER(cs.suscept) LIKE "%RESISTANT%" THEN 1 ELSE 0 END)
        OVER (PARTITION BY a.anon_id) AS prior_ceftriaxone_resistance,
        MAX(CASE WHEN UPPER(cs.antibiotic) LIKE "%PIPERACILLIN/TAZOBACTAM%" AND UPPER(cs.suscept) LIKE "%RESISTANT%" THEN 1 ELSE 0 END)
        OVER (PARTITION BY a.anon_id) AS prior_zosyn_resistance
    FROM
        utis_with_neutra a
    LEFT JOIN 
        (SELECT DISTINCT anon_id, cohort.index_time, antibiotic, suscept, result_time_jittered_utc
        FROM shc_core.culture_sensitivity
        INNER JOIN abx.final_cohort_table cohort
        USING (anon_id)
        WHERE result_time_jittered_utc < cohort.index_time) cs
    USING
        (anon_id, index_time)
)

SELECT
    *,
    CASE 
    WHEN NEUTRA = 0 and  prior_ceftriaxone_resistance = 0 THEN 'Ceftriaxone'
    WHEN NEUTRA = 0 and prior_ceftriaxone_resistance = 1 and prior_zosyn_resistance = 0 THEN 'Zosyn'
    WHEN NEUTRA = 0 and prior_ceftriaxone_resistance = 1 and prior_zosyn_resistance = 1 THEN 'Meropenem'
    WHEN NEUTRA = 1 and prior_zosyn_resistance = 1 THEN 'Meropenem'
    ELSE 'Zosyn'
    END guideline_med_description
FROM final_table




In [58]:
df_uti_guidelines.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,med_description,IP_med_description,was_covered_dr,was_covered_IP,index_time,NEUTRA,prior_ceftriaxone_resistance,prior_zosyn_resistance,guideline_med_description
0,JC29fd3dc,131262483875,Vancomycin_Zosyn,Ceftriaxone,1,1,2019-01-06 02:35:00+00:00,1,0,0,Zosyn
1,JCcd45b3,131277210163,Ceftriaxone,Ceftriaxone,1,1,2019-09-29 22:47:00+00:00,1,0,0,Zosyn
2,JC2a1bfa5,131268574477,Ceftriaxone,Vancomycin_Zosyn,1,1,2019-06-30 22:35:00+00:00,1,1,1,Meropenem
3,JCdf8793,131265312214,Ceftriaxone,Ceftriaxone,1,1,2019-02-27 16:43:00+00:00,0,0,0,Ceftriaxone
4,JCd13ceb,131272779729,Ceftriaxone,Ceftriaxone,1,1,2019-07-29 13:49:00+00:00,1,0,0,Zosyn


In [59]:
df_uti_new = (df_uti_guidelines
    [['anon_id', 'pat_enc_csn_id_coded', 'guideline_med_description']]
    .merge(df_new, how='inner')
)
print(df_uti_new.shape)
df_uti_new.head()

(476, 46)


Unnamed: 0,anon_id,pat_enc_csn_id_coded,guideline_med_description,Vancomycin_label,Vancomycin_predictions,Ampicillin_label,Ampicillin_predictions,Cefazolin_label,Cefazolin_predictions,Ceftriaxone_label,...,Meropenem,Vancomycin_Ceftriaxone,Vancomycin_Cefepime,Vancomycin_Zosyn,Vancomycin_Meropenem,IP_med_description,random_med_description,was_covered_dr,was_covered_random,was_covered_IP
0,JC29fd3dc,131262483875,Zosyn,0,0.298378,1,0.517242,1,0.750544,1,...,1,1,1,1,1,Ceftriaxone,Ceftriaxone,1,1,1
1,JCcd45b3,131277210163,Zosyn,0,0.09553,1,0.406455,1,0.712526,1,...,1,1,1,1,1,Ceftriaxone,Zosyn,1,1,1
2,JC2a1bfa5,131268574477,Meropenem,0,0.137741,0,0.47321,1,0.58104,1,...,1,1,1,1,1,Vancomycin_Zosyn,Vancomycin_Ceftriaxone,1,1,1
3,JCdf8793,131265312214,Ceftriaxone,0,0.154827,0,0.472927,1,0.613512,1,...,1,1,1,1,1,Ceftriaxone,Zosyn,1,1,1
4,JCd13ceb,131272779729,Zosyn,0,0.072778,1,0.426553,1,0.703634,1,...,1,1,1,1,1,Ceftriaxone,Vancomycin_Zosyn,1,1,1


In [60]:
df_uti_new = (df_uti_new
    .assign(was_covered_guideline=df_uti_new.apply(lambda x: compute_was_covered(x, 
                                                                          decision_column='guideline_med_description'),
                                                                          axis=1)
           )
)
clin_covered_rate = df_uti_new['was_covered_dr'].sum() / len(df_uti_new)
random_covered_rate = df_uti_new['was_covered_random'].sum() / len(df_uti_new)
ip_covered_rate = df_uti_new['was_covered_IP'].sum() / len(df_uti_new)
guideline_covered_rate = df_uti_new['was_covered_guideline'].sum() / len(df_uti_new)


In [61]:
print(clin_covered_rate)
print(random_covered_rate)
print(ip_covered_rate)
print(guideline_covered_rate)

0.8151260504201681
0.8067226890756303
0.8550420168067226
0.8466386554621849


### We note that this comparison is likely unfair because the the optimizer is using much broader spectrum antibiotics in this subset than what the guidelines recommend

##### Thus we do another analysis where we constrict the optimizer to the budget used by the guidelines themselves. 

In [62]:
print(df_uti_new.shape)

(476, 47)


In [63]:
df_uti_new['guideline_med_description'].value_counts()

Zosyn          247
Ceftriaxone    211
Meropenem       18
Name: guideline_med_description, dtype: int64

In [75]:
from tqdm import tqdm

def solve_lp_problem(df, budgets):
    """
    Creates a lp problem formulation with pulp for antibiotic selection and adds
    budget constraints and solves.
    """

    abx_options = ["Vancomycin",
                   "Ampicillin",
                   "Cefazolin",
                   "Ceftriaxone",
                   "Cefepime",
                   "Zosyn",
                   "Ciprofloxacin",
                   "Meropenem",
                   "Vancomycin_Meropenem",
                   "Vancomycin_Zosyn",
                   "Vancomycin_Cefepime",
                   "Vancomycin_Ceftriaxone"
                   ]

    abx_model = LpProblem("Antibiotics", LpMaximize)

    # Create binary indicators for whether treatment is used
    drug_inds = {}
    for abx in abx_options:
        drug_inds[abx] = [LpVariable('%s_%d' % (abx, i), lowBound=0, upBound=1, cat='Binary')
                          for i in range(len(df))]

    # Add objective function to model
    per_csn_sum = []
    for i in range(len(df)):
        _sum = 0
        for abx in abx_options:
            _sum += drug_inds[abx][i] * df['%s_predictions' % abx].values[i]
        per_csn_sum.append(_sum)

    abx_model += lpSum(per_csn_sum)

    # Add one selection constraint
    for i in range(len(df)):
        selections = []
        for abx in abx_options:
            selections.append(drug_inds[abx][i])
        abx_model += lpSum(selections) == 1
    
    for drug in drug_inds:
        abx_model += lpSum([drug_inds[drug][i] for i in range(len(df))]) == budgets[drug]

    # Solve model
    abx_model.solve()
    print("Status:", LpStatus[abx_model.status])

    # Save selected antibiotic to df_new
    abx_decisions = []
    for i in range(len(df)):
        abx_decision = None
        for abx in abx_options:
            if drug_inds[abx][i].value() == 1:
                abx_decision = abx
        assert abx_decision is not None
        abx_decisions.append(abx_decision)
    df['IP_med_description'] = abx_decisions
    
    return df

def boostrap_coverage_rates(df):
    """
    Get's guideline and lp coverage rates given a dataframe with med descriptions and assignments. 
    We'll do a stratified bootstrap (stratified by antibiotic) to build up two distributions in coverage
    rates. We'll return the confidence intervals around each estimate and a pvalue for testing the null 
    hypothesis that the ip assignment is equal to or less than the guideline assignment
    """
    ip_covered_rates = []
    guideline_covered_rates = []
    actual_ip_rate = df['was_covered_IP'].sum() / len(df)
    actual_guideline_rate = df['was_covered_guideline'].sum() / len(df)
    for i in tqdm(range(1000)):
        df_boot_guide = (df
            .groupby('guideline_med_description')
            .sample(frac=1., replace=True)
            .reset_index()
        )
        df_boot_ip = (df
            .groupby('IP_med_description')
            .sample(frac=1., replace=True)
            .reset_index()
        )
        ip_covered_rates.append(df_boot_ip['was_covered_IP'].sum() / len(df_boot_ip))
        guideline_covered_rates.append(df_boot_guide['was_covered_guideline'].sum() / len(df_boot_guide))
        
    ip_high = np.percentile(ip_covered_rates, 97.5)
    ip_low = np.percentile(ip_covered_rates, 2.5)
    guide_high = np.percentile(guideline_covered_rates, 97.5)
    guide_low = np.percentile(guideline_covered_rates, 2.5)
    
    ip_estimate = f"{round(actual_ip_rate, 3)}: [{round(ip_low, 3)}, {round(ip_high, 3)}]"
    guide_estimate = f"{round(actual_guideline_rate, 3)}: [{round(guide_low, 3)}, {round(guide_high, 3)}]"
    
    pval = len(
        [g for g in guideline_covered_rates if g >= actual_ip_rate]
    ) / len(guideline_covered_rates)
    
    return ip_estimate, guide_estimate, pval
    
    
def boostrap_clin_coverage_rates(df):
    """
    Get's guideline and lp coverage rates given a dataframe with med descriptions and assignments. 
    We'll do a stratified bootstrap (stratified by antibiotic) to build up two distributions in coverage
    rates. We'll return the confidence intervals around each estimate and a pvalue for testing the null 
    hypothesis that the ip assignment is equal to or less than the guideline assignment
    """
    ip_covered_rates = []
    guideline_covered_rates = []
    actual_ip_rate = df['was_covered_dr'].sum() / len(df)
    actual_guideline_rate = df['was_covered_guideline'].sum() / len(df)
    for i in tqdm(range(1000)):
        df_boot_guide = (df
            .groupby('guideline_med_description')
            .sample(frac=1., replace=True)
            .reset_index()
        )
        df_boot_ip = (df
            .groupby('med_description')
            .sample(frac=1., replace=True)
            .reset_index()
        )
        ip_covered_rates.append(df_boot_ip['was_covered_dr'].sum() / len(df_boot_ip))
        guideline_covered_rates.append(df_boot_guide['was_covered_guideline'].sum() / len(df_boot_guide))
        
    ip_high = np.percentile(ip_covered_rates, 97.5)
    ip_low = np.percentile(ip_covered_rates, 2.5)
    guide_high = np.percentile(guideline_covered_rates, 97.5)
    guide_low = np.percentile(guideline_covered_rates, 2.5)
    
    ip_estimate = f"{round(actual_ip_rate, 3)}: [{round(ip_low, 3)}, {round(ip_high, 3)}]"
    guide_estimate = f"{round(actual_guideline_rate, 3)}: [{round(guide_low, 3)}, {round(guide_high, 3)}]"
    
    pval = len(
        [g for g in guideline_covered_rates if g >= actual_ip_rate]
    ) / len(guideline_covered_rates)
    
    return ip_estimate, guide_estimate, pval


In [65]:
budgets  = {
  "Vancomycin" : 0,
  "Ampicillin" : 0,
  "Cefazolin" : 0,
  "Ceftriaxone" : 211,
  "Cefepime" : 0,
  "Zosyn" : 247,
  "Ciprofloxacin" : 0,
  "Meropenem" : 18,
  "Vancomycin_Meropenem" : 0,
  "Vancomycin_Zosyn" :  0,
  "Vancomycin_Cefepime" : 0,
  "Vancomycin_Ceftriaxone" : 0
}
df_uti_final = solve_lp_problem(df_uti_new, budgets)

Status: Optimal


In [66]:
df_uti_final = (df_uti_final
    .assign(was_covered_guideline=df_uti_final.apply(lambda x: compute_was_covered(x, 
                                                                          decision_column='guideline_med_description'),
                                                                          axis=1)
           )
    .assign(was_covered_IP=df_uti_final.apply(lambda x: compute_was_covered(x, 
                                                                          decision_column='IP_med_description'),
                                                                          axis=1)
           )

)
clin_covered_rate = df_uti_final['was_covered_dr'].sum() / len(df_uti_new)
random_covered_rate = df_uti_final['was_covered_random'].sum() / len(df_uti_new)
ip_covered_rate = df_uti_final['was_covered_IP'].sum() / len(df_uti_new)
guideline_covered_rate = df_uti_final['was_covered_guideline'].sum() / len(df_uti_new)


In [67]:
print(clin_covered_rate)
print(random_covered_rate)
print(ip_covered_rate)
print(guideline_covered_rate)

0.8151260504201681
0.8067226890756303
0.8907563025210085
0.8466386554621849


In [68]:
for abx in ['Ceftriaxone', 'Cefepime', 'Meropenem', 'Zosyn']:
    print(f"{abx}: {roc_auc_score(df_uti_final[f'{abx}_label'], df_uti_final[f'{abx}_predictions'])}")

Ceftriaxone: 0.6900497406229318
Cefepime: 0.6533521109322769
Meropenem: 0.6583880943177426
Zosyn: 0.5792461197339246


In [74]:
ip, guide, pval = boostrap_coverage_rates(df_uti_final)

 69%|██████▉   | 6903/10000 [02:56<01:19, 39.02it/s]


KeyboardInterrupt: 

In [70]:
ip

'0.891: [0.863, 0.916]'

In [71]:
guide

'0.847: [0.813, 0.876]'

In [72]:
pval

0.0

In [76]:
clin_rate, guide, pval = boostrap_clin_coverage_rates(df_uti_final)

100%|██████████| 1000/1000 [00:54<00:00, 18.50it/s]


In [77]:
clin_rate

'0.815: [0.782, 0.849]'

In [79]:
df_uti_final['med_description'].value_counts()

Ceftriaxone               335
Vancomycin_Zosyn           51
Zosyn                      36
Vancomycin_Ceftriaxone     10
Cefepime                    9
Vancomycin_Cefepime         8
Ciprofloxacin               7
Cefazolin                   7
Vancomycin_Meropenem        5
Meropenem                   5
Vancomycin                  3
Name: med_description, dtype: int64

### But now do this using cefepime as originally discussed

In [93]:
%%bigquery df_uti_guidelines --project mining-clinical-decisions
WITH observations_with_pos_urine_cultures as (
    SELECT 
        DISTINCT a.*, cohort.index_time
    FROM
        `mining-clinical-decisions.abx.abx_allocation` a
    INNER JOIN 
        `mining-clinical-decisions.abx.final_cohort_table` cohort
    USING
        (pat_enc_csn_id_coded)
    INNER JOIN
        `mining-clinical-decisions.abx.culture_orders_within_24_hrs` c
    USING 
        (pat_enc_csn_id_coded)
    INNER JOIN 
        `mining-clinical-decisions.shc_core.culture_sensitivity` cs 
    USING 
        (order_proc_id_coded)
    WHERE
        cs.description = "URINE CULTURE"
),

observations_with_pos_other_cultures as (
    SELECT 
        DISTINCT pat_enc_csn_id_coded
    FROM
        `mining-clinical-decisions.abx.abx_allocation` a
    INNER JOIN 
        `mining-clinical-decisions.abx.culture_orders_within_24_hrs` c
    USING 
        (pat_enc_csn_id_coded)
    INNER JOIN 
        `mining-clinical-decisions.shc_core.culture_sensitivity` cs 
    USING 
        (order_proc_id_coded)
    WHERE
        cs.description <> "URINE CULTURE"
),

only_utis as (
SELECT 
    DISTINCT a.*
FROM 
    observations_with_pos_urine_cultures a 
WHERE
    a.pat_enc_csn_id_coded NOT IN (SELECT DISTINCT pat_enc_csn_id_coded 
                                   FROM observations_with_pos_other_cultures)
),

utis_with_neutra as (
    SELECT DISTINCT
        a.*,
        MAX(CASE WHEN lr.ord_num_value IS NULL THEN 0
            WHEN lr.ord_num_value < 10 THEN 1
            ELSE 0 END)
        OVER(PARTITION BY a.pat_enc_csn_id_coded) NEUTRA
    FROM
        only_utis a
    LEFT JOIN 
        (SELECT anon_id, cohort.pat_enc_csn_id_coded, lab_name, base_name, ord_num_value, result_time_utc 
         FROM shc_core.lab_result 
         INNER JOIN abx.final_cohort_table cohort
         USING (anon_id)
         WHERE base_name in ('NEUTRA', 'NEUTAB', 'ABSBAND', 'ABSNEUTBDYFL')
         AND result_time_utc <= cohort.index_time
         AND TIMESTAMP_ADD(result_time_utc, INTERVAL 24 Hour) >= cohort.index_time) lr
    USING 
        (pat_enc_csn_id_coded)

),

final_table as (
    SELECT DISTINCT
        a.*,
        MAX(CASE WHEN UPPER(cs.antibiotic) LIKE "%CEFTRIAXONE%" AND UPPER(cs.suscept) LIKE "%RESISTANT%" THEN 1 ELSE 0 END)
        OVER (PARTITION BY a.anon_id) AS prior_ceftriaxone_resistance,
        MAX(CASE WHEN UPPER(cs.antibiotic) LIKE "%CEFEPIME%" AND UPPER(cs.suscept) LIKE "%RESISTANT%" THEN 1 ELSE 0 END)
        OVER (PARTITION BY a.anon_id) AS prior_cefepime_resistance
    FROM
        utis_with_neutra a
    LEFT JOIN 
        (SELECT DISTINCT anon_id, cohort.index_time, antibiotic, suscept, result_time_jittered_utc
        FROM shc_core.culture_sensitivity
        INNER JOIN abx.final_cohort_table cohort
        USING (anon_id)
        WHERE result_time_jittered_utc < cohort.index_time) cs
    USING
        (anon_id, index_time)
)

SELECT
    *,
    CASE 
    WHEN NEUTRA = 0 and  prior_ceftriaxone_resistance = 0 THEN 'Ceftriaxone'
    WHEN NEUTRA = 0 and prior_ceftriaxone_resistance = 1 and prior_cefepime_resistance = 0 THEN 'Cefepime'
    WHEN NEUTRA = 0 and prior_ceftriaxone_resistance = 1 and prior_cefepime_resistance = 1 THEN 'Meropenem'
    WHEN NEUTRA = 1 and prior_cefepime_resistance = 1 THEN 'Meropenem'
    ELSE 'Cefepime'
    END guideline_med_description
FROM final_table




In [94]:
df_uti_guidelines.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,med_description,IP_med_description,was_covered_dr,was_covered_IP,index_time,NEUTRA,prior_ceftriaxone_resistance,prior_cefepime_resistance,guideline_med_description
0,JCdb4134,131279966991,Vancomycin_Zosyn,Vancomycin_Zosyn,1,1,2019-11-25 18:27:00+00:00,0,1,0,Cefepime
1,JCd01f42,131262173637,Ceftriaxone,Ceftriaxone,1,1,2019-01-23 22:22:00+00:00,0,0,0,Ceftriaxone
2,JCd13ceb,131272779729,Ceftriaxone,Ceftriaxone,1,1,2019-07-29 13:49:00+00:00,1,0,0,Cefepime
3,JCd79080,131265141270,Ceftriaxone,Ceftriaxone,1,1,2019-03-27 20:24:00+00:00,1,0,0,Cefepime
4,JCdf8793,131265312214,Ceftriaxone,Ceftriaxone,1,1,2019-02-27 16:43:00+00:00,0,0,0,Ceftriaxone


In [95]:
df_uti_new = (df_uti_guidelines
    [['anon_id', 'pat_enc_csn_id_coded', 'guideline_med_description']]
    .merge(df_new, how='inner')
)
print(df_uti_new.shape)
df_uti_new.head()

(476, 46)


Unnamed: 0,anon_id,pat_enc_csn_id_coded,guideline_med_description,Vancomycin_label,Vancomycin_predictions,Ampicillin_label,Ampicillin_predictions,Cefazolin_label,Cefazolin_predictions,Ceftriaxone_label,...,Meropenem,Vancomycin_Ceftriaxone,Vancomycin_Cefepime,Vancomycin_Zosyn,Vancomycin_Meropenem,IP_med_description,random_med_description,was_covered_dr,was_covered_random,was_covered_IP
0,JCdb4134,131279966991,Cefepime,0,0.231372,1,0.249809,1,0.350487,1,...,1,1,1,1,1,Vancomycin_Zosyn,Ceftriaxone,1,1,1
1,JCd01f42,131262173637,Ceftriaxone,0,0.109541,1,0.465354,1,0.707414,1,...,1,1,1,1,1,Ceftriaxone,Ceftriaxone,1,1,1
2,JCd13ceb,131272779729,Cefepime,0,0.072778,1,0.426553,1,0.703634,1,...,1,1,1,1,1,Ceftriaxone,Vancomycin_Meropenem,1,1,1
3,JCd79080,131265141270,Cefepime,0,0.173595,1,0.379767,1,0.637087,1,...,1,1,1,1,1,Ceftriaxone,Ceftriaxone,1,1,1
4,JCdf8793,131265312214,Ceftriaxone,0,0.154827,0,0.472927,1,0.613512,1,...,1,1,1,1,1,Ceftriaxone,Ceftriaxone,1,1,1


In [96]:
df_uti_new = (df_uti_new
    .assign(was_covered_guideline=df_uti_new.apply(lambda x: compute_was_covered(x, 
                                                                          decision_column='guideline_med_description'),
                                                                          axis=1)
           )
)
clin_covered_rate = df_uti_new['was_covered_dr'].sum() / len(df_uti_new)
random_covered_rate = df_uti_new['was_covered_random'].sum() / len(df_uti_new)
ip_covered_rate = df_uti_new['was_covered_IP'].sum() / len(df_uti_new)
guideline_covered_rate = df_uti_new['was_covered_guideline'].sum() / len(df_uti_new)


In [97]:
print(clin_covered_rate)
print(random_covered_rate)
print(ip_covered_rate)
print(guideline_covered_rate)

0.8151260504201681
0.7983193277310925
0.8550420168067226
0.7899159663865546


### We note that this comparison is likely unfair because the the optimizer is using much broader spectrum antibiotics in this subset than what the guidelines recommend

##### Thus we do another analysis where we constrict the optimizer to the budget used by the guidelines themselves. 

In [98]:
print(df_uti_new.shape)

(476, 47)


In [99]:
df_uti_new['guideline_med_description'].value_counts()

Cefepime       254
Ceftriaxone    211
Meropenem       11
Name: guideline_med_description, dtype: int64

In [88]:
from tqdm import tqdm

def solve_lp_problem(df, budgets):
    """
    Creates a lp problem formulation with pulp for antibiotic selection and adds
    budget constraints and solves.
    """

    abx_options = ["Vancomycin",
                   "Ampicillin",
                   "Cefazolin",
                   "Ceftriaxone",
                   "Cefepime",
                   "Zosyn",
                   "Ciprofloxacin",
                   "Meropenem",
                   "Vancomycin_Meropenem",
                   "Vancomycin_Zosyn",
                   "Vancomycin_Cefepime",
                   "Vancomycin_Ceftriaxone"
                   ]

    abx_model = LpProblem("Antibiotics", LpMaximize)

    # Create binary indicators for whether treatment is used
    drug_inds = {}
    for abx in abx_options:
        drug_inds[abx] = [LpVariable('%s_%d' % (abx, i), lowBound=0, upBound=1, cat='Binary')
                          for i in range(len(df))]

    # Add objective function to model
    per_csn_sum = []
    for i in range(len(df)):
        _sum = 0
        for abx in abx_options:
            _sum += drug_inds[abx][i] * df['%s_predictions' % abx].values[i]
        per_csn_sum.append(_sum)

    abx_model += lpSum(per_csn_sum)

    # Add one selection constraint
    for i in range(len(df)):
        selections = []
        for abx in abx_options:
            selections.append(drug_inds[abx][i])
        abx_model += lpSum(selections) == 1
    
    for drug in drug_inds:
        abx_model += lpSum([drug_inds[drug][i] for i in range(len(df))]) == budgets[drug]

    # Solve model
    abx_model.solve()
    print("Status:", LpStatus[abx_model.status])

    # Save selected antibiotic to df_new
    abx_decisions = []
    for i in range(len(df)):
        abx_decision = None
        for abx in abx_options:
            if drug_inds[abx][i].value() == 1:
                abx_decision = abx
        assert abx_decision is not None
        abx_decisions.append(abx_decision)
    df['IP_med_description'] = abx_decisions
    
    return df

def boostrap_coverage_rates(df):
    """
    Get's guideline and lp coverage rates given a dataframe with med descriptions and assignments. 
    We'll do a stratified bootstrap (stratified by antibiotic) to build up two distributions in coverage
    rates. We'll return the confidence intervals around each estimate and a pvalue for testing the null 
    hypothesis that the ip assignment is equal to or less than the guideline assignment
    """
    ip_covered_rates = []
    guideline_covered_rates = []
    actual_ip_rate = df['was_covered_IP'].sum() / len(df)
    actual_guideline_rate = df['was_covered_guideline'].sum() / len(df)
    for i in tqdm(range(1000)):
        df_boot_guide = (df
            .groupby('guideline_med_description')
            .sample(frac=1., replace=True)
            .reset_index()
        )
        df_boot_ip = (df
            .groupby('IP_med_description')
            .sample(frac=1., replace=True)
            .reset_index()
        )
        ip_covered_rates.append(df_boot_ip['was_covered_IP'].sum() / len(df_boot_ip))
        guideline_covered_rates.append(df_boot_guide['was_covered_guideline'].sum() / len(df_boot_guide))
        
    ip_high = np.percentile(ip_covered_rates, 97.5)
    ip_low = np.percentile(ip_covered_rates, 2.5)
    guide_high = np.percentile(guideline_covered_rates, 97.5)
    guide_low = np.percentile(guideline_covered_rates, 2.5)
    
    ip_estimate = f"{round(actual_ip_rate, 2)}: [{round(ip_low, 2)}, {round(ip_high, 2)}]"
    guide_estimate = f"{round(actual_guideline_rate, 2)}: [{round(guide_low, 2)}, {round(guide_high, 2)}]"
    
    pval = len(
        [g for g in guideline_covered_rates if g >= actual_ip_rate]
    ) / len(guideline_covered_rates)
    
    return ip_estimate, guide_estimate, pval

In [100]:
budgets  = {
  "Vancomycin" : 0,
  "Ampicillin" : 0,
  "Cefazolin" : 0,
  "Ceftriaxone" : 211,
  "Cefepime" : 254,
  "Zosyn" : 0,
  "Ciprofloxacin" : 0,
  "Meropenem" : 11,
  "Vancomycin_Meropenem" : 0,
  "Vancomycin_Zosyn" :  0,
  "Vancomycin_Cefepime" : 0,
  "Vancomycin_Ceftriaxone" : 0
}
df_uti_final = solve_lp_problem(df_uti_new, budgets)

Status: Optimal


In [101]:
df_uti_final = (df_uti_final
    .assign(was_covered_guideline=df_uti_final.apply(lambda x: compute_was_covered(x, 
                                                                          decision_column='guideline_med_description'),
                                                                          axis=1)
           )
    .assign(was_covered_IP=df_uti_final.apply(lambda x: compute_was_covered(x, 
                                                                          decision_column='IP_med_description'),
                                                                          axis=1)
           )

)
clin_covered_rate = df_uti_final['was_covered_dr'].sum() / len(df_uti_new)
random_covered_rate = df_uti_final['was_covered_random'].sum() / len(df_uti_new)
ip_covered_rate = df_uti_final['was_covered_IP'].sum() / len(df_uti_new)
guideline_covered_rate = df_uti_final['was_covered_guideline'].sum() / len(df_uti_new)


In [102]:
print(clin_covered_rate)
print(random_covered_rate)
print(ip_covered_rate)
print(guideline_covered_rate)

0.8151260504201681
0.7983193277310925
0.7794117647058824
0.7899159663865546


In [103]:
for abx in ['Ceftriaxone', 'Cefepime', 'Meropenem', 'Zosyn']:
    print(f"{abx}: {roc_auc_score(df_uti_final[f'{abx}_label'], df_uti_final[f'{abx}_predictions'])}")

Ceftriaxone: 0.6900497406229318
Cefepime: 0.6533521109322769
Meropenem: 0.6583880943177426
Zosyn: 0.5792461197339246


In [104]:
ip, guide, pval = boostrap_coverage_rates(df_uti_final)

100%|██████████| 1000/1000 [00:23<00:00, 42.19it/s]


In [105]:
ip

'0.78: [0.74, 0.82]'

In [106]:
guide

'0.79: [0.75, 0.83]'

In [107]:
pval

0.707

### Repeat but now drop all cases where ENTEROCOCCUS was one of the bugs that grew

In [108]:
%%bigquery df_uti_guidelines --project mining-clinical-decisions
WITH observations_with_pos_urine_cultures as (
    SELECT 
        DISTINCT a.*, cohort.index_time
    FROM
        `mining-clinical-decisions.abx.abx_allocation` a
    INNER JOIN 
        `mining-clinical-decisions.abx.final_cohort_table` cohort
    USING
        (pat_enc_csn_id_coded)
    INNER JOIN
        `mining-clinical-decisions.abx.culture_orders_within_24_hrs` c
    USING 
        (pat_enc_csn_id_coded)
    INNER JOIN 
        `mining-clinical-decisions.shc_core.culture_sensitivity` cs 
    USING 
        (order_proc_id_coded)
    WHERE
        cs.description = "URINE CULTURE"
),

observations_with_pos_other_cultures as (
    SELECT 
        DISTINCT pat_enc_csn_id_coded
    FROM
        `mining-clinical-decisions.abx.abx_allocation` a
    INNER JOIN 
        `mining-clinical-decisions.abx.culture_orders_within_24_hrs` c
    USING 
        (pat_enc_csn_id_coded)
    INNER JOIN 
        `mining-clinical-decisions.shc_core.culture_sensitivity` cs 
    USING 
        (order_proc_id_coded)
    WHERE
        cs.description <> "URINE CULTURE"
),

observations_with_pos_enterococcus as (
    SELECT DISTINCT
        pat_enc_csn_id_coded
    FROM 
        `mining-clinical-decisions.abx.abx_allocation` a
    INNER JOIN 
        `mining-clinical-decisions.abx.culture_orders_within_24_hrs` c
    USING 
        (pat_enc_csn_id_coded)
    INNER JOIN 
        `mining-clinical-decisions.shc_core.culture_sensitivity` cs 
    USING 
        (order_proc_id_coded)
    WHERE
        UPPER(cs.organism) LIKE "%ENTEROCOCCUS%"
),

only_utis as (
SELECT 
    DISTINCT a.*
FROM 
    observations_with_pos_urine_cultures a 
WHERE
    a.pat_enc_csn_id_coded NOT IN (SELECT DISTINCT pat_enc_csn_id_coded 
                                   FROM observations_with_pos_other_cultures)
AND
    a.pat_enc_csn_id_coded NOT IN (SELECT DISTINCT pat_enc_csn_id_coded
                                   FROM observations_with_pos_enterococcus)
),

utis_with_neutra as (
    SELECT DISTINCT
        a.*,
        MAX(CASE WHEN lr.ord_num_value IS NULL THEN 0
            WHEN lr.ord_num_value < 10 THEN 1
            ELSE 0 END)
        OVER(PARTITION BY a.pat_enc_csn_id_coded) NEUTRA
    FROM
        only_utis a
    LEFT JOIN 
        (SELECT anon_id, cohort.pat_enc_csn_id_coded, lab_name, base_name, ord_num_value, result_time_utc 
         FROM shc_core.lab_result 
         INNER JOIN abx.final_cohort_table cohort
         USING (anon_id)
         WHERE base_name in ('NEUTRA', 'NEUTAB', 'ABSBAND', 'ABSNEUTBDYFL')
         AND result_time_utc <= cohort.index_time
         AND TIMESTAMP_ADD(result_time_utc, INTERVAL 24 Hour) >= cohort.index_time) lr
    USING 
        (pat_enc_csn_id_coded)

),

final_table as (
    SELECT DISTINCT
        a.*,
        MAX(CASE WHEN UPPER(cs.antibiotic) LIKE "%CEFTRIAXONE%" AND UPPER(cs.suscept) LIKE "%RESISTANT%" THEN 1 ELSE 0 END)
        OVER (PARTITION BY a.anon_id) AS prior_ceftriaxone_resistance,
        MAX(CASE WHEN UPPER(cs.antibiotic) LIKE "%CEFEPIME%" AND UPPER(cs.suscept) LIKE "%RESISTANT%" THEN 1 ELSE 0 END)
        OVER (PARTITION BY a.anon_id) AS prior_cefepime_resistance
    FROM
        utis_with_neutra a
    LEFT JOIN 
        (SELECT DISTINCT anon_id, cohort.index_time, antibiotic, suscept, result_time_jittered_utc
        FROM shc_core.culture_sensitivity
        INNER JOIN abx.final_cohort_table cohort
        USING (anon_id)
        WHERE result_time_jittered_utc < cohort.index_time) cs
    USING
        (anon_id, index_time)
)

SELECT
    *,
    CASE 
    WHEN NEUTRA = 0 and  prior_ceftriaxone_resistance = 0 THEN 'Ceftriaxone'
    WHEN NEUTRA = 0 and prior_ceftriaxone_resistance = 1 and prior_cefepime_resistance = 0 THEN 'Cefepime'
    WHEN NEUTRA = 0 and prior_ceftriaxone_resistance = 1 and prior_cefepime_resistance = 1 THEN 'Meropenem'
    WHEN NEUTRA = 1 and prior_cefepime_resistance = 1 THEN 'Meropenem'
    ELSE 'Cefepime'
    END guideline_med_description
FROM final_table




In [109]:
df_uti_guidelines.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,med_description,IP_med_description,was_covered_dr,was_covered_IP,index_time,NEUTRA,prior_ceftriaxone_resistance,prior_cefepime_resistance,guideline_med_description
0,JCda81cb,131267723331,Vancomycin_Zosyn,Vancomycin_Zosyn,1,1,2019-05-06 21:57:00+00:00,0,0,0,Ceftriaxone
1,JCcda7db,131262420251,Vancomycin_Zosyn,Zosyn,1,1,2019-02-09 14:21:00+00:00,0,0,0,Ceftriaxone
2,JCe09d08,131277291645,Ceftriaxone,Vancomycin_Zosyn,1,1,2019-10-24 04:41:00+00:00,1,0,0,Cefepime
3,JCe22085,131264672428,Ceftriaxone,Cefepime,0,1,2019-02-18 21:29:00+00:00,1,0,0,Cefepime
4,JCdec57d,131264827613,Ceftriaxone,Ceftriaxone,1,1,2019-03-05 06:21:00+00:00,0,0,0,Ceftriaxone


In [110]:
df_uti_new = (df_uti_guidelines
    [['anon_id', 'pat_enc_csn_id_coded', 'guideline_med_description']]
    .merge(df_new, how='inner')
)
print(df_uti_new.shape)
df_uti_new.head()

(403, 46)


Unnamed: 0,anon_id,pat_enc_csn_id_coded,guideline_med_description,Vancomycin_label,Vancomycin_predictions,Ampicillin_label,Ampicillin_predictions,Cefazolin_label,Cefazolin_predictions,Ceftriaxone_label,...,Meropenem,Vancomycin_Ceftriaxone,Vancomycin_Cefepime,Vancomycin_Zosyn,Vancomycin_Meropenem,IP_med_description,random_med_description,was_covered_dr,was_covered_random,was_covered_IP
0,JCda81cb,131267723331,Ceftriaxone,0,0.40874,0,0.479061,1,0.631823,1,...,1,1,1,1,1,Vancomycin_Zosyn,Ceftriaxone,1,1,1
1,JCcda7db,131262420251,Ceftriaxone,0,0.075222,1,0.498187,1,0.630462,1,...,1,1,1,1,1,Zosyn,Ceftriaxone,1,1,1
2,JCe09d08,131277291645,Cefepime,0,0.191309,0,0.420485,0,0.289654,1,...,1,1,1,1,1,Vancomycin_Zosyn,Vancomycin_Zosyn,1,1,1
3,JCe22085,131264672428,Cefepime,0,0.191883,0,0.405953,0,0.590927,0,...,1,0,1,0,1,Cefepime,Ceftriaxone,0,0,1
4,JCdec57d,131264827613,Ceftriaxone,0,0.111572,1,0.482054,1,0.632007,1,...,1,1,1,1,1,Ceftriaxone,Ceftriaxone,1,1,1


In [111]:
df_uti_new = (df_uti_new
    .assign(was_covered_guideline=df_uti_new.apply(lambda x: compute_was_covered(x, 
                                                                          decision_column='guideline_med_description'),
                                                                          axis=1)
           )
)
clin_covered_rate = df_uti_new['was_covered_dr'].sum() / len(df_uti_new)
random_covered_rate = df_uti_new['was_covered_random'].sum() / len(df_uti_new)
ip_covered_rate = df_uti_new['was_covered_IP'].sum() / len(df_uti_new)
guideline_covered_rate = df_uti_new['was_covered_guideline'].sum() / len(df_uti_new)


In [112]:
print(clin_covered_rate)
print(random_covered_rate)
print(ip_covered_rate)
print(guideline_covered_rate)

0.8933002481389578
0.8610421836228288
0.9106699751861043
0.9330024813895782


### We note that this comparison is likely unfair because the the optimizer is using much broader spectrum antibiotics in this subset than what the guidelines recommend

##### Thus we do another analysis where we constrict the optimizer to the budget used by the guidelines themselves. 

In [113]:
print(df_uti_new.shape)

(403, 47)


In [114]:
df_uti_new['guideline_med_description'].value_counts()

Cefepime       217
Ceftriaxone    178
Meropenem        8
Name: guideline_med_description, dtype: int64

In [88]:
from tqdm import tqdm

def solve_lp_problem(df, budgets):
    """
    Creates a lp problem formulation with pulp for antibiotic selection and adds
    budget constraints and solves.
    """

    abx_options = ["Vancomycin",
                   "Ampicillin",
                   "Cefazolin",
                   "Ceftriaxone",
                   "Cefepime",
                   "Zosyn",
                   "Ciprofloxacin",
                   "Meropenem",
                   "Vancomycin_Meropenem",
                   "Vancomycin_Zosyn",
                   "Vancomycin_Cefepime",
                   "Vancomycin_Ceftriaxone"
                   ]

    abx_model = LpProblem("Antibiotics", LpMaximize)

    # Create binary indicators for whether treatment is used
    drug_inds = {}
    for abx in abx_options:
        drug_inds[abx] = [LpVariable('%s_%d' % (abx, i), lowBound=0, upBound=1, cat='Binary')
                          for i in range(len(df))]

    # Add objective function to model
    per_csn_sum = []
    for i in range(len(df)):
        _sum = 0
        for abx in abx_options:
            _sum += drug_inds[abx][i] * df['%s_predictions' % abx].values[i]
        per_csn_sum.append(_sum)

    abx_model += lpSum(per_csn_sum)

    # Add one selection constraint
    for i in range(len(df)):
        selections = []
        for abx in abx_options:
            selections.append(drug_inds[abx][i])
        abx_model += lpSum(selections) == 1
    
    for drug in drug_inds:
        abx_model += lpSum([drug_inds[drug][i] for i in range(len(df))]) == budgets[drug]

    # Solve model
    abx_model.solve()
    print("Status:", LpStatus[abx_model.status])

    # Save selected antibiotic to df_new
    abx_decisions = []
    for i in range(len(df)):
        abx_decision = None
        for abx in abx_options:
            if drug_inds[abx][i].value() == 1:
                abx_decision = abx
        assert abx_decision is not None
        abx_decisions.append(abx_decision)
    df['IP_med_description'] = abx_decisions
    
    return df

def boostrap_coverage_rates(df):
    """
    Get's guideline and lp coverage rates given a dataframe with med descriptions and assignments. 
    We'll do a stratified bootstrap (stratified by antibiotic) to build up two distributions in coverage
    rates. We'll return the confidence intervals around each estimate and a pvalue for testing the null 
    hypothesis that the ip assignment is equal to or less than the guideline assignment
    """
    ip_covered_rates = []
    guideline_covered_rates = []
    actual_ip_rate = df['was_covered_IP'].sum() / len(df)
    actual_guideline_rate = df['was_covered_guideline'].sum() / len(df)
    for i in tqdm(range(1000)):
        df_boot_guide = (df
            .groupby('guideline_med_description')
            .sample(frac=1., replace=True)
            .reset_index()
        )
        df_boot_ip = (df
            .groupby('IP_med_description')
            .sample(frac=1., replace=True)
            .reset_index()
        )
        ip_covered_rates.append(df_boot_ip['was_covered_IP'].sum() / len(df_boot_ip))
        guideline_covered_rates.append(df_boot_guide['was_covered_guideline'].sum() / len(df_boot_guide))
        
    ip_high = np.percentile(ip_covered_rates, 97.5)
    ip_low = np.percentile(ip_covered_rates, 2.5)
    guide_high = np.percentile(guideline_covered_rates, 97.5)
    guide_low = np.percentile(guideline_covered_rates, 2.5)
    
    ip_estimate = f"{round(actual_ip_rate, 2)}: [{round(ip_low, 2)}, {round(ip_high, 2)}]"
    guide_estimate = f"{round(actual_guideline_rate, 2)}: [{round(guide_low, 2)}, {round(guide_high, 2)}]"
    
    pval = len(
        [g for g in guideline_covered_rates if g >= actual_ip_rate]
    ) / len(guideline_covered_rates)
    
    return ip_estimate, guide_estimate, pval

In [115]:
budgets  = {
  "Vancomycin" : 0,
  "Ampicillin" : 0,
  "Cefazolin" : 0,
  "Ceftriaxone" : 178,
  "Cefepime" : 217,
  "Zosyn" : 0,
  "Ciprofloxacin" : 0,
  "Meropenem" : 8,
  "Vancomycin_Meropenem" : 0,
  "Vancomycin_Zosyn" :  0,
  "Vancomycin_Cefepime" : 0,
  "Vancomycin_Ceftriaxone" : 0
}
df_uti_final = solve_lp_problem(df_uti_new, budgets)

Status: Optimal


In [116]:
df_uti_final = (df_uti_final
    .assign(was_covered_guideline=df_uti_final.apply(lambda x: compute_was_covered(x, 
                                                                          decision_column='guideline_med_description'),
                                                                          axis=1)
           )
    .assign(was_covered_IP=df_uti_final.apply(lambda x: compute_was_covered(x, 
                                                                          decision_column='IP_med_description'),
                                                                          axis=1)
           )

)
clin_covered_rate = df_uti_final['was_covered_dr'].sum() / len(df_uti_new)
random_covered_rate = df_uti_final['was_covered_random'].sum() / len(df_uti_new)
ip_covered_rate = df_uti_final['was_covered_IP'].sum() / len(df_uti_new)
guideline_covered_rate = df_uti_final['was_covered_guideline'].sum() / len(df_uti_new)


In [117]:
print(clin_covered_rate)
print(random_covered_rate)
print(ip_covered_rate)
print(guideline_covered_rate)

0.8933002481389578
0.8610421836228288
0.9255583126550868
0.9330024813895782


In [118]:
for abx in ['Ceftriaxone', 'Cefepime', 'Meropenem', 'Zosyn']:
    print(f"{abx}: {roc_auc_score(df_uti_final[f'{abx}_label'], df_uti_final[f'{abx}_predictions'])}")

Ceftriaxone: 0.7026796151425231
Cefepime: 0.6910603496897914
Meropenem: 0.8447236180904523
Zosyn: 0.5746481970096746


In [119]:
ip, guide, pval = boostrap_coverage_rates(df_uti_final)

100%|██████████| 1000/1000 [00:23<00:00, 42.22it/s]


In [120]:
ip

'0.93: [0.9, 0.95]'

In [121]:
guide

'0.93: [0.91, 0.96]'

In [122]:
pval

0.758

### Now repeat zosyn guideline but without the ENTEROCOCCUS

In [124]:
%%bigquery df_uti_guidelines --project mining-clinical-decisions
WITH observations_with_pos_urine_cultures as (
    SELECT 
        DISTINCT a.*, cohort.index_time
    FROM
        `mining-clinical-decisions.abx.abx_allocation` a
    INNER JOIN 
        `mining-clinical-decisions.abx.final_cohort_table` cohort
    USING
        (pat_enc_csn_id_coded)
    INNER JOIN
        `mining-clinical-decisions.abx.culture_orders_within_24_hrs` c
    USING 
        (pat_enc_csn_id_coded)
    INNER JOIN 
        `mining-clinical-decisions.shc_core.culture_sensitivity` cs 
    USING 
        (order_proc_id_coded)
    WHERE
        cs.description = "URINE CULTURE"
),

observations_with_pos_other_cultures as (
    SELECT 
        DISTINCT pat_enc_csn_id_coded
    FROM
        `mining-clinical-decisions.abx.abx_allocation` a
    INNER JOIN 
        `mining-clinical-decisions.abx.culture_orders_within_24_hrs` c
    USING 
        (pat_enc_csn_id_coded)
    INNER JOIN 
        `mining-clinical-decisions.shc_core.culture_sensitivity` cs 
    USING 
        (order_proc_id_coded)
    WHERE
        cs.description <> "URINE CULTURE"
),

observations_with_pos_enterococcus as (
    SELECT DISTINCT
        pat_enc_csn_id_coded
    FROM 
        `mining-clinical-decisions.abx.abx_allocation` a
    INNER JOIN 
        `mining-clinical-decisions.abx.culture_orders_within_24_hrs` c
    USING 
        (pat_enc_csn_id_coded)
    INNER JOIN 
        `mining-clinical-decisions.shc_core.culture_sensitivity` cs 
    USING 
        (order_proc_id_coded)
    WHERE
        UPPER(cs.organism) LIKE "%ENTEROCOCCUS%"
),


only_utis as (
SELECT 
    DISTINCT a.*
FROM 
    observations_with_pos_urine_cultures a 
WHERE
    a.pat_enc_csn_id_coded NOT IN (SELECT DISTINCT pat_enc_csn_id_coded 
                                   FROM observations_with_pos_other_cultures)
AND 
    a.pat_enc_csn_id_coded NOT IN (SELECT DISTINCT pat_enc_csn_id_coded 
                                   FROM observations_with_pos_enterococcus)

),

utis_with_neutra as (
    SELECT DISTINCT
        a.*,
        MAX(CASE WHEN lr.ord_num_value IS NULL THEN 0
            WHEN lr.ord_num_value < 10 THEN 1
            ELSE 0 END)
        OVER(PARTITION BY a.pat_enc_csn_id_coded) NEUTRA
    FROM
        only_utis a
    LEFT JOIN 
        (SELECT anon_id, cohort.pat_enc_csn_id_coded, lab_name, base_name, ord_num_value, result_time_utc 
         FROM shc_core.lab_result 
         INNER JOIN abx.final_cohort_table cohort
         USING (anon_id)
         WHERE base_name in ('NEUTRA', 'NEUTAB', 'ABSBAND', 'ABSNEUTBDYFL')
         AND result_time_utc <= cohort.index_time
         AND TIMESTAMP_ADD(result_time_utc, INTERVAL 24 Hour) >= cohort.index_time) lr
    USING 
        (pat_enc_csn_id_coded)

),

final_table as (
    SELECT DISTINCT
        a.*,
        MAX(CASE WHEN UPPER(cs.antibiotic) LIKE "%CEFTRIAXONE%" AND UPPER(cs.suscept) LIKE "%RESISTANT%" THEN 1 ELSE 0 END)
        OVER (PARTITION BY a.anon_id) AS prior_ceftriaxone_resistance,
        MAX(CASE WHEN UPPER(cs.antibiotic) LIKE "%PIPERACILLIN/TAZOBACTAM%" AND UPPER(cs.suscept) LIKE "%RESISTANT%" THEN 1 ELSE 0 END)
        OVER (PARTITION BY a.anon_id) AS prior_zosyn_resistance
    FROM
        utis_with_neutra a
    LEFT JOIN 
        (SELECT DISTINCT anon_id, cohort.index_time, antibiotic, suscept, result_time_jittered_utc
        FROM shc_core.culture_sensitivity
        INNER JOIN abx.final_cohort_table cohort
        USING (anon_id)
        WHERE result_time_jittered_utc < cohort.index_time) cs
    USING
        (anon_id, index_time)
)

SELECT
    *,
    CASE 
    WHEN NEUTRA = 0 and  prior_ceftriaxone_resistance = 0 THEN 'Ceftriaxone'
    WHEN NEUTRA = 0 and prior_ceftriaxone_resistance = 1 and prior_zosyn_resistance = 0 THEN 'Zosyn'
    WHEN NEUTRA = 0 and prior_ceftriaxone_resistance = 1 and prior_zosyn_resistance = 1 THEN 'Meropenem'
    WHEN NEUTRA = 1 and prior_zosyn_resistance = 1 THEN 'Meropenem'
    ELSE 'Zosyn'
    END guideline_med_description
FROM final_table




In [125]:
df_uti_guidelines.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,med_description,IP_med_description,was_covered_dr,was_covered_IP,index_time,NEUTRA,prior_ceftriaxone_resistance,prior_zosyn_resistance,guideline_med_description
0,JCcde394,131269348156,Ceftriaxone,Ceftriaxone,1,1,2019-06-27 02:59:00+00:00,0,0,0,Ceftriaxone
1,JCe4d962,131269280090,Ceftriaxone,Vancomycin_Zosyn,0,1,2019-05-13 03:38:00+00:00,1,0,0,Zosyn
2,JCdfd218,131278102592,Ceftriaxone,Zosyn,1,1,2019-10-22 17:58:00+00:00,0,0,0,Ceftriaxone
3,JCe75909,131265108077,Vancomycin_Zosyn,Ceftriaxone,1,1,2019-04-15 05:45:00+00:00,1,0,0,Zosyn
4,JCe0907f,131277794260,Ceftriaxone,Ceftriaxone,1,1,2019-11-04 20:06:00+00:00,0,0,0,Ceftriaxone


In [126]:
df_uti_new = (df_uti_guidelines
    [['anon_id', 'pat_enc_csn_id_coded', 'guideline_med_description']]
    .merge(df_new, how='inner')
)
print(df_uti_new.shape)
df_uti_new.head()

(403, 46)


Unnamed: 0,anon_id,pat_enc_csn_id_coded,guideline_med_description,Vancomycin_label,Vancomycin_predictions,Ampicillin_label,Ampicillin_predictions,Cefazolin_label,Cefazolin_predictions,Ceftriaxone_label,...,Meropenem,Vancomycin_Ceftriaxone,Vancomycin_Cefepime,Vancomycin_Zosyn,Vancomycin_Meropenem,IP_med_description,random_med_description,was_covered_dr,was_covered_random,was_covered_IP
0,JCcde394,131269348156,Ceftriaxone,0,0.129348,1,0.391012,1,0.826151,1,...,1,1,1,1,1,Ceftriaxone,Ceftriaxone,1,1,1
1,JCe4d962,131269280090,Zosyn,0,0.241762,0,0.400408,0,0.431113,0,...,1,0,1,1,1,Vancomycin_Zosyn,Cefepime,0,1,1
2,JCdfd218,131278102592,Ceftriaxone,0,0.381317,0,0.557518,1,0.68102,1,...,1,1,1,1,1,Zosyn,Ceftriaxone,1,1,1
3,JCe75909,131265108077,Zosyn,0,0.148747,0,0.459139,0,0.634669,1,...,1,1,1,1,1,Ceftriaxone,Vancomycin_Zosyn,1,1,1
4,JCe0907f,131277794260,Ceftriaxone,0,0.196151,1,0.507214,1,0.788909,1,...,1,1,1,1,1,Ceftriaxone,Ceftriaxone,1,1,1


In [127]:
df_uti_new = (df_uti_new
    .assign(was_covered_guideline=df_uti_new.apply(lambda x: compute_was_covered(x, 
                                                                          decision_column='guideline_med_description'),
                                                                          axis=1)
           )
)
clin_covered_rate = df_uti_new['was_covered_dr'].sum() / len(df_uti_new)
random_covered_rate = df_uti_new['was_covered_random'].sum() / len(df_uti_new)
ip_covered_rate = df_uti_new['was_covered_IP'].sum() / len(df_uti_new)
guideline_covered_rate = df_uti_new['was_covered_guideline'].sum() / len(df_uti_new)


In [128]:
print(clin_covered_rate)
print(random_covered_rate)
print(ip_covered_rate)
print(guideline_covered_rate)

0.8933002481389578
0.8610421836228288
0.9106699751861043
0.9106699751861043


### We note that this comparison is likely unfair because the the optimizer is using much broader spectrum antibiotics in this subset than what the guidelines recommend

##### Thus we do another analysis where we constrict the optimizer to the budget used by the guidelines themselves. 

In [129]:
print(df_uti_new.shape)

(403, 47)


In [130]:
df_uti_new['guideline_med_description'].value_counts()

Zosyn          210
Ceftriaxone    178
Meropenem       15
Name: guideline_med_description, dtype: int64

In [30]:
from tqdm import tqdm

def solve_lp_problem(df, budgets):
    """
    Creates a lp problem formulation with pulp for antibiotic selection and adds
    budget constraints and solves.
    """

    abx_options = ["Vancomycin",
                   "Ampicillin",
                   "Cefazolin",
                   "Ceftriaxone",
                   "Cefepime",
                   "Zosyn",
                   "Ciprofloxacin",
                   "Meropenem",
                   "Vancomycin_Meropenem",
                   "Vancomycin_Zosyn",
                   "Vancomycin_Cefepime",
                   "Vancomycin_Ceftriaxone"
                   ]

    abx_model = LpProblem("Antibiotics", LpMaximize)

    # Create binary indicators for whether treatment is used
    drug_inds = {}
    for abx in abx_options:
        drug_inds[abx] = [LpVariable('%s_%d' % (abx, i), lowBound=0, upBound=1, cat='Binary')
                          for i in range(len(df))]

    # Add objective function to model
    per_csn_sum = []
    for i in range(len(df)):
        _sum = 0
        for abx in abx_options:
            _sum += drug_inds[abx][i] * df['%s_predictions' % abx].values[i]
        per_csn_sum.append(_sum)

    abx_model += lpSum(per_csn_sum)

    # Add one selection constraint
    for i in range(len(df)):
        selections = []
        for abx in abx_options:
            selections.append(drug_inds[abx][i])
        abx_model += lpSum(selections) == 1
    
    for drug in drug_inds:
        abx_model += lpSum([drug_inds[drug][i] for i in range(len(df))]) == budgets[drug]

    # Solve model
    abx_model.solve()
    print("Status:", LpStatus[abx_model.status])

    # Save selected antibiotic to df_new
    abx_decisions = []
    for i in range(len(df)):
        abx_decision = None
        for abx in abx_options:
            if drug_inds[abx][i].value() == 1:
                abx_decision = abx
        assert abx_decision is not None
        abx_decisions.append(abx_decision)
    df['IP_med_description'] = abx_decisions
    
    return df

def boostrap_coverage_rates(df):
    """
    Get's guideline and lp coverage rates given a dataframe with med descriptions and assignments. 
    We'll do a stratified bootstrap (stratified by antibiotic) to build up two distributions in coverage
    rates. We'll return the confidence intervals around each estimate and a pvalue for testing the null 
    hypothesis that the ip assignment is equal to or less than the guideline assignment
    """
    ip_covered_rates = []
    guideline_covered_rates = []
    actual_ip_rate = df['was_covered_IP'].sum() / len(df)
    actual_guideline_rate = df['was_covered_guideline'].sum() / len(df)
    for i in tqdm(range(1000)):
        df_boot_guide = (df
            .groupby('guideline_med_description')
            .sample(frac=1., replace=True)
            .reset_index()
        )
        df_boot_ip = (df
            .groupby('IP_med_description')
            .sample(frac=1., replace=True)
            .reset_index()
        )
        ip_covered_rates.append(df_boot_ip['was_covered_IP'].sum() / len(df_boot_ip))
        guideline_covered_rates.append(df_boot_guide['was_covered_guideline'].sum() / len(df_boot_guide))
        
    ip_high = np.percentile(ip_covered_rates, 97.5)
    ip_low = np.percentile(ip_covered_rates, 2.5)
    guide_high = np.percentile(guideline_covered_rates, 97.5)
    guide_low = np.percentile(guideline_covered_rates, 2.5)
    
    ip_estimate = f"{round(actual_ip_rate, 2)}: [{round(ip_low, 3)}, {round(ip_high, 3)}]"
    guide_estimate = f"{round(actual_guideline_rate, 2)}: [{round(guide_low, 3)}, {round(guide_high, 3)}]"
    
    pval = len(
        [g for g in guideline_covered_rates if g >= actual_ip_rate]
    ) / len(guideline_covered_rates)
    
    return ip_estimate, guide_estimate, pval
    
    
    

In [131]:
budgets  = {
  "Vancomycin" : 0,
  "Ampicillin" : 0,
  "Cefazolin" : 0,
  "Ceftriaxone" : 178,
  "Cefepime" : 0,
  "Zosyn" : 210,
  "Ciprofloxacin" : 0,
  "Meropenem" : 15,
  "Vancomycin_Meropenem" : 0,
  "Vancomycin_Zosyn" :  0,
  "Vancomycin_Cefepime" : 0,
  "Vancomycin_Ceftriaxone" : 0
}
df_uti_final = solve_lp_problem(df_uti_new, budgets)

Status: Optimal


In [132]:
df_uti_final = (df_uti_final
    .assign(was_covered_guideline=df_uti_final.apply(lambda x: compute_was_covered(x, 
                                                                          decision_column='guideline_med_description'),
                                                                          axis=1)
           )
    .assign(was_covered_IP=df_uti_final.apply(lambda x: compute_was_covered(x, 
                                                                          decision_column='IP_med_description'),
                                                                          axis=1)
           )

)
clin_covered_rate = df_uti_final['was_covered_dr'].sum() / len(df_uti_new)
random_covered_rate = df_uti_final['was_covered_random'].sum() / len(df_uti_new)
ip_covered_rate = df_uti_final['was_covered_IP'].sum() / len(df_uti_new)
guideline_covered_rate = df_uti_final['was_covered_guideline'].sum() / len(df_uti_new)


In [133]:
print(clin_covered_rate)
print(random_covered_rate)
print(ip_covered_rate)
print(guideline_covered_rate)

0.8933002481389578
0.8610421836228288
0.9205955334987593
0.9106699751861043


In [134]:
for abx in ['Ceftriaxone', 'Cefepime', 'Meropenem', 'Zosyn']:
    print(f"{abx}: {roc_auc_score(df_uti_final[f'{abx}_label'], df_uti_final[f'{abx}_predictions'])}")

Ceftriaxone: 0.7026796151425231
Cefepime: 0.6910603496897914
Meropenem: 0.8447236180904523
Zosyn: 0.5746481970096746


In [135]:
ip, guide, pval = boostrap_coverage_rates(df_uti_final)

100%|██████████| 1000/1000 [00:25<00:00, 39.88it/s]


In [136]:
ip

'0.92: [0.89, 0.95]'

In [137]:
guide

'0.91: [0.88, 0.94]'

In [138]:
pval

0.275

### Antibiogram comparison

##### We note this is unfair because here we let the benchmark know the species type (which the lp method does not know). 
Assign each observation antibiotic efficacy estimates based on the antibiogram value of the bug(s). Take mean over each unique bug combo. 

In [17]:
%%bigquery df_biogram_baseline --project mining-clinical-decisions

WITH observations_and_bugs as (
    SELECT DISTINCT 
        a.*, cohort.index_time, cs.organism
    FROM
        `mining-clinical-decisions.abx.abx_allocation` a
    INNER JOIN 
        `mining-clinical-decisions.abx.final_cohort_table` cohort
    USING
        (pat_enc_csn_id_coded)
    INNER JOIN
        `mining-clinical-decisions.abx.culture_orders_within_24_hrs` c
    USING 
        (pat_enc_csn_id_coded)
    INNER JOIN 
        `mining-clinical-decisions.shc_core.culture_sensitivity` cs 
    USING 
        (order_proc_id_coded)
    ORDER BY
        pat_enc_csn_id_coded, organism

)
SELECT DISTINCT
    pat_enc_csn_id_coded,
    STRING_AGG(organism) bug
FROM
    observations_and_bugs
GROUP BY 
    pat_enc_csn_id_coded


In [18]:
print(df_biogram_baseline.shape)
df_biogram_baseline.head()

(770, 2)


Unnamed: 0,pat_enc_csn_id_coded,bug
0,131260812263,"CITROBACTER FREUNDII COMPLEX,PROTEUS MIRABILIS"
1,131260883970,ENTEROCOCCUS SPECIES
2,131261001599,ESCHERICHIA COLI
3,131261014293,STAPHYLOCOCCUS AUREUS
4,131261155365,ESCHERICHIA COLI


In [19]:
df_new.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,Vancomycin_label,Vancomycin_predictions,Ampicillin_label,Ampicillin_predictions,Cefazolin_label,Cefazolin_predictions,Ceftriaxone_label,Ceftriaxone_predictions,...,Meropenem,Vancomycin_Ceftriaxone,Vancomycin_Cefepime,Vancomycin_Zosyn,Vancomycin_Meropenem,IP_med_description,random_med_description,was_covered_dr,was_covered_random,was_covered_IP
0,JC2a03b24,131260812263,0,0.108161,0,0.462078,0,0.664275,1,0.79827,...,1,1,1,1,1,Ceftriaxone,Cefazolin,1,0,1
1,JCe45a3c,131260883970,1,0.483241,1,0.538177,0,0.631024,0,0.784364,...,0,1,1,1,1,Ceftriaxone,Ceftriaxone,1,0,0
2,JCd235bb,131261001599,0,0.21561,0,0.501686,1,0.631382,1,0.653581,...,1,1,1,1,1,Vancomycin_Zosyn,Ceftriaxone,1,1,1
3,JCd356bf,131261014293,1,0.421153,1,0.412029,1,0.528823,1,0.724792,...,1,1,1,1,1,Ceftriaxone,Ceftriaxone,1,1,1
4,JCd1c07c,131261155365,0,0.064201,1,0.528883,1,0.749904,1,0.789198,...,1,1,1,1,1,Ceftriaxone,Vancomycin_Ceftriaxone,1,1,1


In [20]:
df_new_with_bug = df_new.merge(df_biogram_baseline, how='inner', on='pat_enc_csn_id_coded')
print(df_new_with_bug.shape)
df_new_with_bug.head()

(770, 46)


Unnamed: 0,anon_id,pat_enc_csn_id_coded,Vancomycin_label,Vancomycin_predictions,Ampicillin_label,Ampicillin_predictions,Cefazolin_label,Cefazolin_predictions,Ceftriaxone_label,Ceftriaxone_predictions,...,Vancomycin_Ceftriaxone,Vancomycin_Cefepime,Vancomycin_Zosyn,Vancomycin_Meropenem,IP_med_description,random_med_description,was_covered_dr,was_covered_random,was_covered_IP,bug
0,JC2a03b24,131260812263,0,0.108161,0,0.462078,0,0.664275,1,0.79827,...,1,1,1,1,Ceftriaxone,Cefazolin,1,0,1,"CITROBACTER FREUNDII COMPLEX,PROTEUS MIRABILIS"
1,JCe45a3c,131260883970,1,0.483241,1,0.538177,0,0.631024,0,0.784364,...,1,1,1,1,Ceftriaxone,Ceftriaxone,1,0,0,ENTEROCOCCUS SPECIES
2,JCd235bb,131261001599,0,0.21561,0,0.501686,1,0.631382,1,0.653581,...,1,1,1,1,Vancomycin_Zosyn,Ceftriaxone,1,1,1,ESCHERICHIA COLI
3,JCd356bf,131261014293,1,0.421153,1,0.412029,1,0.528823,1,0.724792,...,1,1,1,1,Ceftriaxone,Ceftriaxone,1,1,1,STAPHYLOCOCCUS AUREUS
4,JCd1c07c,131261155365,0,0.064201,1,0.528883,1,0.749904,1,0.789198,...,1,1,1,1,Ceftriaxone,Vancomycin_Ceftriaxone,1,1,1,ESCHERICHIA COLI


In [21]:
print(df_new.shape)

(770, 45)


In [22]:
df_biogram_baseline.sort_values('pat_enc_csn_id_coded').head(20)

Unnamed: 0,pat_enc_csn_id_coded,bug
0,131260812263,"CITROBACTER FREUNDII COMPLEX,PROTEUS MIRABILIS"
1,131260883970,ENTEROCOCCUS SPECIES
2,131261001599,ESCHERICHIA COLI
3,131261014293,STAPHYLOCOCCUS AUREUS
4,131261155365,ESCHERICHIA COLI
5,131261167965,STAPHYLOCOCCUS AUREUS
6,131261225689,ESCHERICHIA COLI
7,131261364764,MORGANELLA MORGANII
8,131261423740,STREPTOCOCCUS MITIS GROUP
9,131261425594,MORGANELLA MORGANII


In [23]:
test = df_new_with_bug.groupby('bug')['Vancomycin_predictions'].transform('mean')
# for abx in antibiotic_options

In [24]:
assign_kwargs = {
    f"{abx}_predictions" : df_new_with_bug.groupby('bug')[f'{abx}_label'].transform('mean')
    for abx in abx_options
}
df_new_with_bug = df_new_with_bug.assign(**assign_kwargs)

In [27]:
budgets = {"Vancomycin" : 13,
          "Ampicillin" : 0,
          "Cefazolin" : 8,
          "Ceftriaxone" : 404,
          "Cefepime" : 14,
          "Zosyn" : 102,
          "Ciprofloxacin" : 8,
          "Meropenem" : 9,
          "Vancomycin_Meropenem" : 9,
          "Vancomycin_Zosyn" :  149,
          "Vancomycin_Cefepime" : 23,
          "Vancomycin_Ceftriaxone" : 31
 }
df_new_with_bug = solve_lp_problem(df_new_with_bug, budgets)

Status: Optimal


In [28]:
df_new_with_bug = (df_new_with_bug
    .assign(was_covered_IP=df_new_with_bug.apply(lambda x: compute_was_covered(x, 
                                                                          decision_column='IP_med_description'),
                                                                          axis=1)
           )

)
clin_covered_rate = df_new_with_bug['was_covered_dr'].sum() / len(df_new_with_bug)
random_covered_rate = df_new_with_bug['was_covered_random'].sum() / len(df_new_with_bug)
ip_covered_rate = df_new_with_bug['was_covered_IP'].sum() / len(df_new_with_bug)


In [29]:
print(clin_covered_rate)
print(random_covered_rate)
print(ip_covered_rate)


0.8428571428571429
0.787012987012987
0.948051948051948


In [190]:
df_new_with_bug.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,Vancomycin_label,Vancomycin_predictions,Ampicillin_label,Ampicillin_predictions,Cefazolin_label,Cefazolin_predictions,Ceftriaxone_label,Ceftriaxone_predictions,...,Vancomycin_Ceftriaxone,Vancomycin_Cefepime,Vancomycin_Zosyn,Vancomycin_Meropenem,IP_med_description,random_med_description,was_covered_dr,was_covered_random,was_covered_IP,bug
0,JC2a03b24,131260812263,0,0.0,0,0.0,0,0.0,1,1.0,...,1,1,1,1,Ceftriaxone,Zosyn,1,1,1,"CITROBACTER FREUNDII COMPLEX,PROTEUS MIRABILIS"
1,JCe45a3c,131260883970,1,0.942857,1,0.942857,0,0.0,0,0.0,...,1,1,1,1,Zosyn,Ceftriaxone,1,0,1,ENTEROCOCCUS SPECIES
2,JCd235bb,131261001599,0,0.0,0,0.488372,1,0.813953,1,0.883721,...,1,1,1,1,Ceftriaxone,Zosyn,1,1,1,ESCHERICHIA COLI
3,JCd356bf,131261014293,1,1.0,1,0.882353,1,1.0,1,0.882353,...,1,1,1,1,Vancomycin_Zosyn,Zosyn,1,1,1,STAPHYLOCOCCUS AUREUS
4,JCd1c07c,131261155365,0,0.0,1,0.488372,1,0.813953,1,0.883721,...,1,1,1,1,Zosyn,Vancomycin_Ceftriaxone,1,1,1,ESCHERICHIA COLI


In [191]:
df_new.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,Vancomycin_label,Vancomycin_predictions,Ampicillin_label,Ampicillin_predictions,Cefazolin_label,Cefazolin_predictions,Ceftriaxone_label,Ceftriaxone_predictions,...,Meropenem,Vancomycin_Ceftriaxone,Vancomycin_Cefepime,Vancomycin_Zosyn,Vancomycin_Meropenem,IP_med_description,random_med_description,was_covered_dr,was_covered_random,was_covered_IP
0,JC2a03b24,131260812263,0,0.108161,0,0.462078,0,0.664275,1,0.79827,...,1,1,1,1,1,Ceftriaxone,Zosyn,1,1,1
1,JCe45a3c,131260883970,1,0.483241,1,0.538177,0,0.631024,0,0.784364,...,0,1,1,1,1,Ceftriaxone,Ceftriaxone,1,0,0
2,JCd235bb,131261001599,0,0.21561,0,0.501686,1,0.631382,1,0.653581,...,1,1,1,1,1,Vancomycin_Zosyn,Zosyn,1,1,1
3,JCd356bf,131261014293,1,0.421153,1,0.412029,1,0.528823,1,0.724792,...,1,1,1,1,1,Ceftriaxone,Zosyn,1,1,1
4,JCd1c07c,131261155365,0,0.064201,1,0.528883,1,0.749904,1,0.789198,...,1,1,1,1,1,Ceftriaxone,Vancomycin_Ceftriaxone,1,1,1


In [183]:
df_new_with_bug['was_covered_IP'].sum()

662

In [184]:
df_new['was_covered_IP'].sum()

662

In [186]:
df_new.query('was_covered_IP == 0', engine='python').head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,Vancomycin_label,Vancomycin_predictions,Ampicillin_label,Ampicillin_predictions,Cefazolin_label,Cefazolin_predictions,Ceftriaxone_label,Ceftriaxone_predictions,...,Meropenem,Vancomycin_Ceftriaxone,Vancomycin_Cefepime,Vancomycin_Zosyn,Vancomycin_Meropenem,IP_med_description,random_med_description,was_covered_dr,was_covered_random,was_covered_IP
1,JCe45a3c,131260883970,1,0.483241,1,0.538177,0,0.631024,0,0.784364,...,0,1,1,1,1,Ceftriaxone,Ceftriaxone,1,0,0
9,JCe5fd00,131261425594,0,0.660258,0,0.583722,0,0.569683,1,0.690373,...,1,1,1,1,1,Vancomycin,Zosyn,1,1,0
19,JCd5d810,131261586041,0,0.170803,0,0.218224,0,0.530841,0,0.746683,...,0,1,1,1,1,Ceftriaxone,Ceftriaxone,1,0,0
25,JCcfaf6c,131261698993,0,0.105692,0,0.486886,0,0.752132,0,0.784198,...,1,0,1,1,1,Ceftriaxone,Vancomycin_Zosyn,0,1,0
28,JC2a25c15,131261798884,1,0.121255,1,0.435473,0,0.702215,0,0.781805,...,0,1,1,1,1,Ceftriaxone,Ceftriaxone,1,0,0


In [187]:
df_new_with_bug.query('was_covered_IP == 0', engine='python').head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,Vancomycin_label,Vancomycin_predictions,Ampicillin_label,Ampicillin_predictions,Cefazolin_label,Cefazolin_predictions,Ceftriaxone_label,Ceftriaxone_predictions,...,Vancomycin_Ceftriaxone,Vancomycin_Cefepime,Vancomycin_Zosyn,Vancomycin_Meropenem,IP_med_description,random_med_description,was_covered_dr,was_covered_random,was_covered_IP,bug
1,JCe45a3c,131260883970,1,0.942857,1,0.942857,0,0.0,0,0.0,...,1,1,1,1,Zosyn,Ceftriaxone,1,0,0,ENTEROCOCCUS SPECIES
9,JCe5fd00,131261425594,0,0.0,0,0.0,0,0.0,1,0.75,...,1,1,1,1,Vancomycin_Zosyn,Zosyn,1,1,0,MORGANELLA MORGANII
19,JCd5d810,131261586041,0,0.0,0,0.0,0,0.0,0,0.0,...,1,1,1,1,Zosyn,Ceftriaxone,1,0,0,"ENTEROCOCCUS SPECIES,KLEBSIELLA PNEUMONIAE"
25,JCcfaf6c,131261698993,0,0.0,0,0.0,0,0.0,0,0.0,...,0,1,1,1,Zosyn,Vancomycin_Zosyn,0,1,0,PROTEUS VULGARIS GROUP
28,JC2a25c15,131261798884,1,0.944444,1,1.0,0,0.0,0,0.0,...,1,1,1,1,Zosyn,Ceftriaxone,1,0,0,ENTEROCOCCUS FAECALIS


### Let's do this again but instead create antibiogram values only on the gram stain.

In [31]:
# Load in gram positive and gram negative text files
with open ('gram_positive.txt', 'r') as f:
    gram_positives = set([l.rstrip() for l in f.readlines()])
with open ('gram_negative.txt', 'r') as f:
    gram_negatives = set([l.rstrip() for l in f.readlines()])

# We'll drop observations that have more than one bug or whos species we don't identify
def get_gram_stain(a):
    """
    Get's gram stain for bugs
    """
    bugs = a.split(",")
    gram_positive = False
    gram_negative = False
    for bug in bugs:
        if bug in gram_positives:
            gram_positive = True
        if bug in gram_negatives:
            gram_negative = True
    if gram_positive and gram_negative:
        return 'both'
    elif gram_positive: 
        return 'positive'
    elif gram_negative:
        return 'negative'
    else:
        return 'unknown'
        

df_new_gram_stain = (df_new_with_bug
    .assign(gram_stain=lambda x: [get_gram_stain(a) for a in x.bug])
)
df_new_gram_stain['gram_stain'].value_counts()

negative    530
positive    175
both         64
unknown       1
Name: gram_stain, dtype: int64

In [32]:
assign_kwargs = {
    f"{abx}_predictions" : df_new_gram_stain.groupby('gram_stain')[f'{abx}_label'].transform('mean')
    for abx in abx_options
}
df_new_gram_stain = df_new_gram_stain.assign(**assign_kwargs)

In [33]:
budgets = {"Vancomycin" : 13,
          "Ampicillin" : 0,
          "Cefazolin" : 8,
          "Ceftriaxone" : 404,
          "Cefepime" : 14,
          "Zosyn" : 102,
          "Ciprofloxacin" : 8,
          "Meropenem" : 9,
          "Vancomycin_Meropenem" : 9,
          "Vancomycin_Zosyn" :  149,
          "Vancomycin_Cefepime" : 23,
          "Vancomycin_Ceftriaxone" : 31
 }
df_new_gram_stain = solve_lp_problem(df_new_gram_stain, budgets)

Status: Optimal


In [34]:
df_new_gram_stain = (df_new_gram_stain
    .assign(was_covered_IP=df_new_gram_stain.apply(lambda x: compute_was_covered(x, 
                                                                          decision_column='IP_med_description'),
                                                                          axis=1)
           )

)
clin_covered_rate = df_new_gram_stain['was_covered_dr'].sum() / len(df_new_gram_stain)
random_covered_rate = df_new_gram_stain['was_covered_random'].sum() / len(df_new_gram_stain)
ip_covered_rate = df_new_gram_stain['was_covered_IP'].sum() / len(df_new_gram_stain)


In [35]:
print(clin_covered_rate)
print(random_covered_rate)
print(ip_covered_rate)


0.8428571428571429
0.787012987012987
0.8935064935064935


In [36]:
# It's the comma
df_new_gram_stain.query("gram_stain == 'unknown'", engine='python').bug

316    STAPHYLOCOCCUS SPECIES - COAG NEGATIVE, NOT ST...
Name: bug, dtype: object

In [37]:
df_new_gram_stain.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,Vancomycin_label,Vancomycin_predictions,Ampicillin_label,Ampicillin_predictions,Cefazolin_label,Cefazolin_predictions,Ceftriaxone_label,Ceftriaxone_predictions,...,Vancomycin_Cefepime,Vancomycin_Zosyn,Vancomycin_Meropenem,IP_med_description,random_med_description,was_covered_dr,was_covered_random,was_covered_IP,bug,gram_stain
0,JC2a03b24,131260812263,0,0.0,0,0.35283,0,0.7,1,0.820755,...,1,1,1,Ciprofloxacin,Cefazolin,1,0,1,"CITROBACTER FREUNDII COMPLEX,PROTEUS MIRABILIS",negative
1,JCe45a3c,131260883970,1,0.948571,1,0.868571,0,0.485714,0,0.514286,...,1,1,1,Vancomycin_Zosyn,Ceftriaxone,1,0,1,ENTEROCOCCUS SPECIES,positive
2,JCd235bb,131261001599,0,0.0,0,0.35283,1,0.7,1,0.820755,...,1,1,1,Ceftriaxone,Ceftriaxone,1,1,1,ESCHERICHIA COLI,negative
3,JCd356bf,131261014293,1,0.948571,1,0.868571,1,0.485714,1,0.514286,...,1,1,1,Vancomycin,Ceftriaxone,1,1,1,STAPHYLOCOCCUS AUREUS,positive
4,JCd1c07c,131261155365,0,0.0,1,0.35283,1,0.7,1,0.820755,...,1,1,1,Ceftriaxone,Vancomycin_Ceftriaxone,1,1,1,ESCHERICHIA COLI,negative


In [54]:
import pdb
def boostrap_gram_stain_covereage(df):
    """
    Get's guideline and lp coverage rates given a dataframe with med descriptions and assignments. 
    We'll do a stratified bootstrap (stratified by antibiotic) to build up two distributions in coverage
    rates. We'll return the confidence intervals around each estimate and a pvalue for testing the null 
    hypothesis that the ip assignment is equal to or less than the guideline assignment
    """
    ip_covered_rates = []
    actual_ip_rate = df['was_covered_IP'].sum() / len(df)
    for i in tqdm(range(1000)):
        df_boot_ip = (df
            .groupby('IP_med_description')
            .sample(frac=1., replace=True)
            .reset_index()
        )
        ip_covered_rates.append(df_boot_ip['was_covered_IP'].sum() / len(df_boot_ip))
        
    ip_high = np.percentile(ip_covered_rates, 97.5)
    ip_low = np.percentile(ip_covered_rates, 2.5)
    
    ip_estimate = f"{round(actual_ip_rate, 3)}: [{round(ip_low, 3)}, {round(ip_high, 3)}]"
    
    return ip_estimate

def boostrap_clin_coverage_covereage(df):
    """
    Get's guideline and lp coverage rates given a dataframe with med descriptions and assignments. 
    We'll do a stratified bootstrap (stratified by antibiotic) to build up two distributions in coverage
    rates. We'll return the confidence intervals around each estimate and a pvalue for testing the null 
    hypothesis that the ip assignment is equal to or less than the guideline assignment
    """
    ip_covered_rates = []
    actual_ip_rate = df['was_covered_dr'].sum() / len(df)
    for i in tqdm(range(1000)):
        df_boot_ip = (df
            .groupby('IP_med_description')
            .sample(frac=1., replace=True)
            .reset_index()
        )
        ip_covered_rates.append(df_boot_ip['was_covered_dr'].sum() / len(df_boot_ip))
        
    ip_high = np.percentile(ip_covered_rates, 97.5)
    ip_low = np.percentile(ip_covered_rates, 2.5)
    
    ip_estimate = f"{round(actual_ip_rate, 3)}: [{round(ip_low, 3)}, {round(ip_high, 3)}]"
    
    return ip_estimate



In [44]:
ci = boostrap_gram_stain_covereage(df_new_gram_stain)

100%|██████████| 1000/1000 [00:45<00:00, 21.82it/s]


In [45]:
ci

'0.894: [0.871, 0.914]'

In [46]:
ci = boostrap_gram_stain_covereage(df_new_with_bug)

100%|██████████| 1000/1000 [00:40<00:00, 24.99it/s]


In [47]:
ci

'0.948: [0.932, 0.962]'

In [48]:
ci = boostrap_gram_stain_covereage(df_new)

100%|██████████| 1000/1000 [00:30<00:00, 32.98it/s]


In [49]:
ci

'0.86: [0.836, 0.883]'

In [55]:
boostrap_clin_coverage_covereage(df_new)

100%|██████████| 1000/1000 [00:28<00:00, 35.12it/s]


'0.843: [0.818, 0.869]'

In [56]:
df_new.head()

Unnamed: 0,anon_id,pat_enc_csn_id_coded,Vancomycin_label,Vancomycin_predictions,Ampicillin_label,Ampicillin_predictions,Cefazolin_label,Cefazolin_predictions,Ceftriaxone_label,Ceftriaxone_predictions,...,Meropenem,Vancomycin_Ceftriaxone,Vancomycin_Cefepime,Vancomycin_Zosyn,Vancomycin_Meropenem,IP_med_description,random_med_description,was_covered_dr,was_covered_random,was_covered_IP
0,JC2a03b24,131260812263,0,0.108161,0,0.462078,0,0.664275,1,0.79827,...,1,1,1,1,1,Ceftriaxone,Cefazolin,1,0,1
1,JCe45a3c,131260883970,1,0.483241,1,0.538177,0,0.631024,0,0.784364,...,0,1,1,1,1,Ceftriaxone,Ceftriaxone,1,0,0
2,JCd235bb,131261001599,0,0.21561,0,0.501686,1,0.631382,1,0.653581,...,1,1,1,1,1,Vancomycin_Zosyn,Ceftriaxone,1,1,1
3,JCd356bf,131261014293,1,0.421153,1,0.412029,1,0.528823,1,0.724792,...,1,1,1,1,1,Ceftriaxone,Ceftriaxone,1,1,1
4,JCd1c07c,131261155365,0,0.064201,1,0.528883,1,0.749904,1,0.789198,...,1,1,1,1,1,Ceftriaxone,Vancomycin_Ceftriaxone,1,1,1


In [89]:
%%bigquery df_ampC_producers --project mining-clinical-decisions
WITH all_bugs as (
SELECT DISTINCT
    c.anon_id, c.pat_enc_csn_id_coded, c.order_proc_id_coded, c.description, r.organism
FROM
    `mining-clinical-decisions.abx.culture_orders_within_24_hrs` c
INNER JOIN 
    `mining-clinical-decisions.shc_core.culture_sensitivity` r
USING
    (order_proc_id_coded)
INNER JOIN 
     (SELECT DISTINCT pat_enc_csn_id_coded from `mining-clinical-decisions.abx.final_ast_labels`
     WHERE EXTRACT(YEAR FROM index_time) < 2020) f
USING
    (pat_enc_csn_id_coded)
)

SELECT DISTINCT 
    anon_id, pat_enc_csn_id_coded,
    MAX(CASE WHEN LOWER(organism) LIKE "%enterobacter%" THEN 1 ELSE 0 END) enterobacter,
    MAX(CASE WHEN LOWER(organism) LIKE "%serratia%" THEN 1 ELSE 0 END) serratia,
    MAX(CASE WHEN LOWER(organism) LIKE "%citrobacter freundii%" THEN 1 ELSE 0 END) citrobacter_freundii,
    MAX(CASE WHEN LOWER(organism) LIKE "%aeromonas%" THEN 1 ELSE 0 END) aeromonas,
    MAX(CASE WHEN LOWER(organism) LIKE "%proteus%" THEN 1 ELSE 0 END) proteus,
    MAX(CASE WHEN LOWER(organism) LIKE "%providencia%" THEN 1 ELSE 0 END) providencia,
    MAX(CASE WHEN LOWER(organism) LIKE "%morganella morganii%" THEN 1 ELSE 0 END) morganella_morganii
FROM
    all_bugs
GROUP BY
    anon_id, pat_enc_csn_id_coded


In [90]:
df_ampC_producers.shape

(8342, 9)

In [91]:
df_ampC_producers[['anon_id', 'pat_enc_csn_id_coded']].drop_duplicates().shape

(8342, 2)

In [114]:
df_ampC_producers = df_ampC_producers.assign(ampC_sum=
lambda x: x.enterobacter + x.serratia + x.citrobacter_freundii + x.aeromonas + x.proteus + x.providencia
          + x.morganella_morganii)
df_ampC_producers = df_ampC_producers.assign(ampC_any=lambda x: [1 if a > 0 else 0 for a in x.ampC_sum])
df_ampC_producers['ampC_any'].value_counts()

0    7348
1     994
Name: ampC_any, dtype: int64

In [100]:
%%bigquery df_ampC_producers_with_ctx --project mining-clinical-decisions
WITH all_bugs as (
SELECT DISTINCT
    c.anon_id, c.pat_enc_csn_id_coded, c.order_proc_id_coded, c.description, r.organism
FROM
    `mining-clinical-decisions.abx.culture_orders_within_24_hrs` c
INNER JOIN 
    `mining-clinical-decisions.shc_core.culture_sensitivity` r
USING
    (order_proc_id_coded)
INNER JOIN 
     (SELECT DISTINCT pat_enc_csn_id_coded from `mining-clinical-decisions.abx.final_ast_labels`
     WHERE EXTRACT(YEAR FROM index_time) < 2020 AND Ceftriaxone=1) f
USING
    (pat_enc_csn_id_coded)
)

SELECT DISTINCT 
    anon_id, pat_enc_csn_id_coded,
    MAX(CASE WHEN LOWER(organism) LIKE "%enterobacter%" THEN 1 ELSE 0 END) enterobacter,
    MAX(CASE WHEN LOWER(organism) LIKE "%serratia%" THEN 1 ELSE 0 END) serratia,
    MAX(CASE WHEN LOWER(organism) LIKE "%citrobacter freundii%" THEN 1 ELSE 0 END) citrobacter_freundii,
    MAX(CASE WHEN LOWER(organism) LIKE "%aeromonas%" THEN 1 ELSE 0 END) aeromonas,
    MAX(CASE WHEN LOWER(organism) LIKE "%proteus%" THEN 1 ELSE 0 END) proteus,
    MAX(CASE WHEN LOWER(organism) LIKE "%providencia%" THEN 1 ELSE 0 END) providencia,
    MAX(CASE WHEN LOWER(organism) LIKE "%morganella morganii%" THEN 1 ELSE 0 END) morganella_morganii
FROM
    all_bugs
GROUP BY
    anon_id, pat_enc_csn_id_coded


In [101]:
df_ampC_producers_with_ctx.shape

(5715, 9)

In [102]:
df_ampC_producers_with_ctx.sum()

anon_id                 JC29f9afdJC29f9cb8JC29fadf0JC29fb7b9JC29fcd61J...
pat_enc_csn_id_coded                                      749449248729939
enterobacter                                                          161
serratia                                                               53
citrobacter_freundii                                                   74
aeromonas                                                              15
proteus                                                               352
providencia                                                            21
morganella_morganii                                                    47
dtype: object

In [113]:
df_ampC_producers_with_ctx = df_ampC_producers_with_ctx.assign(ampC_sum=
lambda x: x.enterobacter + x.serratia + x.citrobacter_freundii + x.aeromonas + x.proteus + x.providencia
          + x.morganella_morganii)
df_ampC_producers_with_ctx = df_ampC_producers_with_ctx.assign(ampC_any=lambda x: [1 if a > 0 else 0 for a in x.ampC_sum])
df_ampC_producers_with_ctx['ampC_any'].value_counts()

0    5014
1     701
Name: ampC_any, dtype: int64

In [120]:
test = [   '\item ICD9 995.92 Sepsis',
    '\item ICD9 995.92 Severe Sepsis', 
    '\item ICD9 481 Pneumococcal pneumonia',
    '\item ICD9 482 Other bacterial pneumonia',
    '\item ICD9 483 Pneumonia due to other specified organism',
    '\item ICD9 484 Pneumonia in infectious diseases classified elsewhere,'
    '\item ICD9 485 Bronchopneumonia org NOS',
   '\item ICD9 486 Pneumonia, organism NOS',
    '\item ICD9 590 Infections of kidney',
    '\item ICD10 A41 Other sepsis',
    '\item ICD10 J13 Pneumonia due to Streptococcus pneumoniae',
   '\item ICD10 J15 Bacterial pneumonia, not elsewhere classified',
    '\item ICD10 J16 Pneumonia due to other infectious organisms, not elsewhere classified',
    '\item ICD10 J17 Pneumonia in diseases classified elsewhere',
    '\item ICD10 J18 Pneumonia, unspecified organism',
    '\item ICD10 N10 Acute pyelonephritis',
    '\item ICD10 N11 Chronic tubulo-interstitial nephritis',
    '\item ICD10 N12 Tubulo-interstitial nephritis, not specified as acute or chronic',
    '\item ICD10 N39.0 Urinary tract infection, site not specified',
    '\item ICD10 J06 Acute upper respiratory infections of multiple and unspecified sites',
    '\item ICD10 A49 Bacterial infection of unspecified site',
    '\item ICD10 J22 Unspecified acute lower respiratory infection',
    '\item ICD10 R65.2 Severe sepsis (with and without septic shock)'
]
for t in test:
    text = t.split()[1:]
    item = t.split()[0]
    print(item + " \added[id=CKC]{" + ' '.join(text) + "}")

\item dded[id=CKC]{ICD9 995.92 Sepsis}
\item dded[id=CKC]{ICD9 995.92 Severe Sepsis}
\item dded[id=CKC]{ICD9 481 Pneumococcal pneumonia}
\item dded[id=CKC]{ICD9 482 Other bacterial pneumonia}
\item dded[id=CKC]{ICD9 483 Pneumonia due to other specified organism}
\item dded[id=CKC]{ICD9 484 Pneumonia in infectious diseases classified elsewhere,\item ICD9 485 Bronchopneumonia org NOS}
\item dded[id=CKC]{ICD9 486 Pneumonia, organism NOS}
\item dded[id=CKC]{ICD9 590 Infections of kidney}
\item dded[id=CKC]{ICD10 A41 Other sepsis}
\item dded[id=CKC]{ICD10 J13 Pneumonia due to Streptococcus pneumoniae}
\item dded[id=CKC]{ICD10 J15 Bacterial pneumonia, not elsewhere classified}
\item dded[id=CKC]{ICD10 J16 Pneumonia due to other infectious organisms, not elsewhere classified}
\item dded[id=CKC]{ICD10 J17 Pneumonia in diseases classified elsewhere}
\item dded[id=CKC]{ICD10 J18 Pneumonia, unspecified organism}
\item dded[id=CKC]{ICD10 N10 Acute pyelonephritis}
\item dded[id=CKC]