In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pulp import *
import pandas as pd
import os, glob

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/Users/conorcorbin/.config/gcloud/application_default_credentials.json' 
os.environ['GCLOUD_PROJECT'] = 'mining-clinical-decisions' 
%load_ext google.cloud.bigquery

from google.cloud import bigquery
client=bigquery.Client()



## UTI Labelling Cohort Project

**Objective**: The purpose of this notebook is to define a cohort and conduct basic exploratory-data-analysis (eda) for patients with suspected UTI. We plan to manually review a subset of these cases to find instances where the urine culture was negative but the patient needed antibiotics as well as instances where the urine culture was positive but the patient did not need antibiotics. We'll then use these expert labels to train phenotyping models that attempt to address infection status in a way that is more robust than simply relying on whether the urine culture was positive. 

**Cohort Description**: We include adult patients with a urine culture ordered at Stanford hospital and clinics. The urine culture must be the first urine culture ordered in a two week time period. There must be an order for some form of antibiotics (iv/im/oral) within four hours after the urine culture order. We only include urine culture orders that appear in lab results (there exists a strange artifact in the data where many order ids in `order_proc` never result (don't exist in `lab_results` - potentially orders that are placed but not signed, or just duplicate entries in shc_core). 

**Table Schema**: The SQL query below produces a table with the following schema. 
* `anon_id`: patient identifier
* `pat_enc_csn_id_coded`: csn associated with urine culture order
* `order_proc_id_coded`: id associated with order for urine culture
* `order_time_jittered_utc`: time of urine culture order
* `ordering_mode`: either inpatient, outpatient, or null
* `was_positive`: 1 if urine culture was positive, 0 otherwise
* `had_other_pos_culture`: 1 if some other culture (from any anatomical site) ordered within a 48 hour time window centered at the urine culture order time was positive, 0 otherwise
* `ordered_antibiotics`: comma separated list of antibiotics ordered for the patient during the four hour window after the order of the urine culture
* `organisms`: comma separated list of organisms that grew from the urine culture 
* `other_pos_sites`: comma separated list of other cultures from all anatomical sites that were positive within 48 hour time window centered at urine culture order time
* `other_organisms`: comma separated list of organisms that grew from these other positive cultures



In [163]:
%%bigquery df_uc_all
-- All urine culture orders at Stanford, order_id must be in lab_results
WITH urine_cultures AS (
    SELECT DISTINCT
        op.anon_id, op.pat_enc_csn_id_coded, op.order_proc_id_coded, op.order_time_jittered_utc, op.ordering_mode
    FROM 
        `mining-clinical-decisions.shc_core.order_proc` op
    INNER JOIN
        `mining-clinical-decisions.shc_core.lab_result` lr
    ON
        op.order_proc_id_coded = lr.order_id_coded
    WHERE
        op.order_type LIKE "Microbiology%"
        AND op.description LIKE "%URINE%"
),

-- Filter to adult only
adult_urine_cultures AS (
    SELECT DISTINCT
        uc.*
    FROM 
        urine_cultures uc
    INNER JOIN
        `shc_core.demographic` demo
    USING
        (anon_id)
    WHERE
        DATE_DIFF(CAST(uc.order_time_jittered_utc as DATE), demo.BIRTH_DATE_JITTERED, YEAR) >= 18
),

-- Must not have any urine culture orders (that go on to result in lab results) in prior two weeks
-- This finds urine cultures that do have other urine cultures orders in prior two weeks
order_in_prior_two_weeks AS (
    SELECT DISTINCT
         auc.order_proc_id_coded
    FROM 
        `mining-clinical-decisions.shc_core.order_proc` op
    INNER JOIN
        `mining-clinical-decisions.shc_core.lab_result` lr
    ON
        op.order_proc_id_coded = lr.order_id_coded
    INNER JOIN
        adult_urine_cultures auc 
    ON
        op.anon_id = auc.anon_id
    WHERE
        op.order_type LIKE "Microbiology%"
        AND op.description LIKE "%URINE%"
        AND auc.order_time_jittered_utc > op.order_time_jittered_utc
        AND TIMESTAMP_DIFF(auc.order_time_jittered_utc, op.order_time_jittered_utc, DAY) < 14
),

-- Set of antibiotics that show up in order_med (may not be totally all encompassing) 
include_abx AS (
    SELECT
        med_description
    FROM
        `mining-clinical-decisions.abx.abx_types` 
    WHERE
        is_include_abx = 1 OR is_oral = 1
),

-- Must have antibiotic order placed within 4 hours of the urine culture 
included_urine_cultures AS (
    SELECT DISTINCT
        auc.*, om.med_description
    FROM 
        adult_urine_cultures auc
    INNER JOIN
        `mining-clinical-decisions.shc_core.order_med` om
    USING
        (anon_id)
    WHERE 
        auc.order_proc_id_coded NOT IN (SELECT order_proc_id_coded  FROM order_in_prior_two_weeks)
    AND
        om.med_description IN (SELECT med_description FROM include_abx)
    AND 
        TIMESTAMP_DIFF(om.order_inst_utc, auc.order_time_jittered_utc, HOUR) BETWEEN 0 AND 4 
), 

-- Flag to indicated if the culture was positive, and resulting organisms
orders_that_were_positive AS (
    SELECT DISTINCT
        auc.order_proc_id_coded, 1 as was_positive, organism
    FROM
        included_urine_cultures auc
    INNER JOIN  
        (SELECT DISTINCT order_proc_id_coded, organism
        FROM `mining-clinical-decisions.shc_core.culture_sensitivity`) cs
    USING
        (order_proc_id_coded) 
),

--Other postive culture types ordered within window of urine culture order. window = -24 to 24 hours
orders_with_other_pos_cultures AS (
    SELECT DISTINCT
        auc.order_proc_id_coded, 1 as had_other_pos_culture, description other_pos_site, organism other_organism
    FROM
        included_urine_cultures auc
    INNER JOIN  
        `mining-clinical-decisions.shc_core.culture_sensitivity` cs
    USING
        (anon_id) 
    WHERE
        auc.order_proc_id_coded <> cs.order_proc_id_coded
    AND
        -- Some urine culture orders have two order ids where only one results, make sure we're not counting these.
        -- Likely not needed logic after forcing orders to appear in lab_results
        (auc.order_time_jittered_utc <> cs.order_time_jittered_utc OR cs.description NOT LIKE '%URINE%')
    AND
        TIMESTAMP_DIFF(cs.order_time_jittered_utc, auc.order_time_jittered_utc, HOUR)
    BETWEEN
        -24 AND 24
),

--Temporary building block that has medications and organisms in long form
orders_meds_and_bugs AS (
SELECT DISTINCT
    anon_id, pat_enc_csn_id_coded, order_proc_id_coded, order_time_jittered_utc, ordering_mode, 
    CASE WHEN was_positive = 1 THEN 1 ELSE 0 END was_positive,
    CASE WHEN had_other_pos_culture = 1 THEN 1 ELSE 0 END had_other_pos_culture,
    med_description,
    organism,
    other_pos_site,
    other_organism
FROM 
    included_urine_cultures
LEFT JOIN 
    orders_with_other_pos_cultures
USING 
    (order_proc_id_coded)
LEFT JOIN 
    orders_that_were_positive
USING
    (order_proc_id_coded)
),

-- Collapse medication orders to order id of the urine culture
concatenated_meds AS (
    SELECT DISTINCT
        order_proc_id_coded,
        STRING_AGG(DISTINCT med_description ORDER BY med_description) ordered_antibiotics
    FROM
        orders_meds_and_bugs
    GROUP BY
        order_proc_id_coded
),

-- Collpase organisms to order id of the urine culture
concatenated_bugs AS (
    SELECT DISTINCT
        order_proc_id_coded,
        STRING_AGG(DISTINCT organism ORDER BY organism) organisms
    FROM
        orders_meds_and_bugs
    GROUP BY
        order_proc_id_coded
),

-- Collpase other positive sites
concatenated_sites AS (
    SELECT DISTINCT
        order_proc_id_coded,
        STRING_AGG(DISTINCT other_pos_site ORDER BY other_pos_site) other_pos_sites
    FROM
        orders_meds_and_bugs
    GROUP BY
        order_proc_id_coded
),

-- Collpase other organisms
concatenated_other_bugs AS (
    SELECT DISTINCT
        order_proc_id_coded,
        STRING_AGG(DISTINCT other_organism ORDER BY other_organism) other_organisms
    FROM
        orders_meds_and_bugs
    GROUP BY
        order_proc_id_coded
)

SELECT DISTINCT
    anon_id, pat_enc_csn_id_coded, order_proc_id_coded, order_time_jittered_utc, ordering_mode, was_positive, had_other_pos_culture,
    ordered_antibiotics, organisms, other_pos_sites, other_organisms
FROM
    orders_meds_and_bugs
INNER JOIN 
    concatenated_meds
USING
    (order_proc_id_coded)
INNER JOIN 
    concatenated_bugs 
USING
    (order_proc_id_coded)
INNER JOIN 
    concatenated_sites  
USING
    (order_proc_id_coded)
INNER JOIN 
    concatenated_other_bugs  
USING
    (order_proc_id_coded)
ORDER BY 
    order_proc_id_coded


In [184]:
print(df_uc_all.shape)
df_uc_all.head(20)

(71504, 11)


Unnamed: 0,anon_id,pat_enc_csn_id_coded,order_proc_id_coded,order_time_jittered_utc,ordering_mode,was_positive,had_other_pos_culture,ordered_antibiotics,organisms,other_pos_sites,other_organisms
0,JCdffb05,15231199,328156918,2008-05-03 16:08:00+00:00,Inpatient,0,0,PIPERACILLIN-TAZOBACTAM-DEXTRS 3.375 GRAM/50 M...,,,
1,JCde3c0e,15216457,328157013,2008-04-17 16:10:00+00:00,Inpatient,0,0,ERYTHROMYCIN IVPB (CUSTOM DOSE),,,
2,JCe20edc,15249126,328172671,2008-04-11 00:44:00+00:00,Inpatient,0,0,"AZITHROMYCIN 250 MG PO TABS,PIPERACILLIN-TAZOB...",,,
3,JCe84e0f,15214669,328172680,2008-05-25 00:44:00+00:00,Inpatient,0,0,VANCOMYCIN 1 GM CENTRAL LINE IVPB,,,
4,JCe98741,15252666,328173037,2008-03-31 00:54:00+00:00,Outpatient,0,0,CEFAZOLIN IN DEXTROSE (ISO-OS) 1 GRAM/50 ML IV...,,,
5,JCe9868d,15404113,328175287,2008-04-15 01:53:00+00:00,Inpatient,0,0,PIPERACILLIN-TAZOBACTAM-DEXTRS 2.25 GRAM/50 ML...,,,
6,JCcf203c,15418831,328200593,2008-04-16 14:05:00+00:00,Inpatient,0,0,VANCOMYCIN IN DEXTROSE 1 GRAM/200 ML IV PGBK,,,
7,JCcdecda,15419187,328207135,2008-04-01 19:58:00+00:00,Inpatient,1,0,CIPROFLOXACIN IN D5W 400 MG/200 ML IV PGBK,ESCHERICHIA COLI,,
8,JCcf7213,15419723,328214067,2008-04-02 03:52:00+00:00,Inpatient,0,0,PIPERACILLIN-TAZOBACTAM-DEXTRS 2.25 GRAM/50 ML...,,,
9,JCcbefde,15419417,328225061,2008-05-16 09:45:00+00:00,Inpatient,0,0,METRONIDAZOLE IN NACL (ISO-OS) 500 MG/100 ML I...,,,


### Positive vs negative cultures

In [177]:
df_uc_all.was_positive.value_counts()

0    46173
1    25331
Name: was_positive, dtype: int64

### How many had other positive cultures at different anatomical sites?

In [178]:
df_uc_all.had_other_pos_culture.value_counts()

0    66101
1     5403
Name: had_other_pos_culture, dtype: int64

### Repeat within positive and negative urine cultures

In [179]:
df_uc_all[['was_positive', 'had_other_pos_culture']].value_counts()

was_positive  had_other_pos_culture
0             0                        43138
1             0                        22963
0             1                         3035
1             1                         2368
dtype: int64

### Of all cases where urine culture was negative but another culture was positive, what were most common anatomical sites of the positive cultures?


In [180]:
(df_uc_all
.query('was_positive == 0 and had_other_pos_culture == 1', engine='python')
['other_pos_sites']
).value_counts()


BLOOD CULTURE (AEROBIC & ANAEROBIC BOTTLES)                                                                       522
BLOOD CULTURE (2 AEROBIC BOTTLES)                                                                                 415
BLOOD CULTURE (AEROBIC & ANAEROBIC BOTTLE)                                                                        320
URINE CULTURE                                                                                                     273
RESPIRATORY CULTURE                                                                                               242
                                                                                                                 ... 
ANAEROBIC CULTURE,BIOPSY/TISSUE(GEN)GS                                                                              1
BLOOD CULTURE (2 AEROBIC BOTTLES),BLOOD CULTURE (AEROBIC & ANAEROBIC BOTTLES),FLUID CULTURE / BB GRAM STAIN         1
BLOOD CULTURE (AEROBIC & ANAEROBIC BOTTLES),FLUID CULTUR

### Most common antibiotic orders

In [181]:
df_uc_all.ordered_antibiotics.value_counts().head(20)

NITROFURANTOIN MONOHYD/M-CRYST 100 MG PO CAPS              10621
CIPROFLOXACIN HCL 500 MG PO TABS                            4580
SULFAMETHOXAZOLE-TRIMETHOPRIM 800-160 MG PO TABS            3992
CEPHALEXIN 500 MG PO CAPS                                   3680
CIPROFLOXACIN 500 MG PO TABS                                3110
CIPROFLOXACIN HCL 250 MG PO TABS                            1352
CEFTRIAXONE 1 GRAM/10 ML IV PUSH (VIAL ONLY)                1210
CIPROFLOXACIN IN D5W 400 MG/200 ML IV PGBK                   987
PIPERACILLIN-TAZOBACTAM-DEXTRS 3.375 GRAM/50 ML IV PGBK      866
CEFTRIAXONE 1 GRAM/50 ML MINI-BAG PLUS                       788
CEPHALEXIN 250 MG PO CAPS,CEPHALEXIN 500 MG PO CAPS          730
CIPROFLOXACIN 250 MG PO TABS                                 691
LEVOFLOXACIN 500 MG PO TABS                                  685
MACROBID 100 MG PO CAPS                                      600
CEFTRIAXONE 1 GRAM/100 ML NS MINIBAG PLUS                    549
NITROFURANTOIN (MACROCRYS

### Most common organisms

In [182]:
df_uc_all.organisms.value_counts(dropna=False).head(20)

NaN                                                    46173
ESCHERICHIA COLI                                       14831
KLEBSIELLA PNEUMONIAE                                   1931
ENTEROCOCCUS SPECIES                                    1856
PROTEUS MIRABILIS                                        933
ENTEROCOCCUS SPECIES,ESCHERICHIA COLI                    752
PSEUDOMONAS AERUGINOSA                                   487
STREPTOCOCCUS AGALACTIAE (GROUP B)                       350
ESCHERICHIA COLI,KLEBSIELLA PNEUMONIAE                   320
KLEBSIELLA OXYTOCA                                       240
STREPTOCOCCUS AGALACTIAE {GROUP B}                       210
ENTEROBACTER CLOACAE COMPLEX                             204
ZZZENTEROBACTER AEROGENES                                198
ESCHERICHIA COLI,PROTEUS MIRABILIS                       174
STAPHYLOCOCCUS AUREUS                                    154
CITROBACTER KOSERI                                       153
ENTEROCOCCUS FAECALIS   

### Cultures by year

In [183]:
df_uc_all.assign(year=lambda x: x.order_time_jittered_utc.dt.year)[['year']].value_counts()

year
2019    10693
2018    10648
2017     8578
2016     7695
2015     5737
2014     4226
2010     4140
2013     3985
2012     3939
2009     3846
2011     3786
2020     2231
2008     2000
dtype: int64