In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pulp import *
import os, glob

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/Users/conorcorbin/.config/gcloud/application_default_credentials.json' 
os.environ['GCLOUD_PROJECT'] = 'mining-clinical-decisions' 
%load_ext google.cloud.bigquery

from google.cloud import bigquery
client=bigquery.Client()

In [None]:
# For every culture order that has at least one matching row in
# culture_sensitivity (INNER JOIN on order_proc_id_coded), fetch the order
# description, the calendar year of the order, and the whole hours elapsed
# between the order time and the sensitivity result time.
# NOTE(review): DISTINCT applies to all four selected columns, so an order whose
# sensitivity rows carry different result times will appear on several rows --
# confirm that is intended before averaging hours downstream.
query="""
SELECT DISTINCT
    orders.order_proc_id_coded,
    orders.description,
    EXTRACT(YEAR FROM orders.order_time_jittered_utc) year,
    TIMESTAMP_DIFF(cs.result_time_jittered_utc, orders.order_time_jittered_utc, HOUR) hours_to_abx_sensitivities
FROM 
    mining-clinical-decisions.abx.culture_orders_within_24_hrs orders
INNER JOIN
    mining-clinical-decisions.shc_core.culture_sensitivity cs
USING
    (order_proc_id_coded)
"""
# Run the query and materialise the full result set as a pandas DataFrame.
query_job = client.query(query)
df = query_job.result().to_dataframe()
# Preview the first rows only.
df.head()

In [None]:
# Summarise hours-to-sensitivity-result by culture type and by dataset split.
# Raw order descriptions noted by the original author include 'URINE CULTURE',
# 'BLOOD CULTURE (AEROBIC & ANAEROBIC BOTTLE)' and
# 'BLOOD CULTURE (2 AEROBIC BOTTLES)'.
train_years = ['2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']
test_years = ['2019']


def _culture_category(description):
    """Collapse a raw order description into one of three culture categories.

    'BLOOD' is checked before 'URINE'; anything matching neither substring is
    reported as 'Other Fluid Culture'.
    """
    if 'BLOOD' in description:
        return 'Blood Culture'
    if 'URINE' in description:
        return 'Urine Culture'
    return 'Other Fluid Culture'


def _dataset_for_year(year):
    """Return 'Training Set' or 'Test Set' for an order year.

    Returns None when the year is missing or belongs to neither `train_years`
    nor `test_years`, so such rows can be excluded rather than being counted
    as test data by default.
    """
    if pd.isna(year):
        return None
    # int() first so a float year such as 2019.0 still matches '2019'.
    year_key = str(int(year))
    if year_key in train_years:
        return 'Training Set'
    if year_key in test_years:
        return 'Test Set'
    return None


df_time_to_results = (df
.assign(description=lambda x: x.description.map(_culture_category))
.assign(dataset=lambda x: x.year.map(_dataset_for_year))
# Keep only rows whose year falls in one of the two declared splits.
.dropna(subset=['dataset'])
.groupby(['description', 'dataset'])
.agg(mean_hours=('hours_to_abx_sensitivities', 'mean'),
     median_hours=('hours_to_abx_sensitivities', 'median'),
     std_hours=('hours_to_abx_sensitivities', 'std')
    )
.reset_index()
.sort_values(['dataset', 'description'], ascending=True)
)

In [None]:
# Display the summary table (mean / median / std of hours per culture type and dataset).
df_time_to_results

In [None]:
# Persist the summary table to the working directory. `index` is a boolean
# flag; False omits the row index from the file (previously passed as None,
# which only worked because None is falsy).
df_time_to_results.to_csv('time_to_results.csv', index=False)

In [None]:
### Extract method info for each positive culture (Kirby-Bauer or MIC)
# Selects the sensitivity rows whose `antibiotic` field contains "METHOD"
# (case-insensitive via UPPER), joined to the cohort's culture orders on
# order_proc_id_coded.
# NOTE(review): presumably these rows record the testing method in
# `sensitivity_value` rather than a drug result -- confirm against the table schema.
query="""
SELECT DISTINCT
    orders.order_proc_id_coded,
    orders.description,
    antibiotic,
    sensitivity_value,
FROM 
    mining-clinical-decisions.abx.culture_orders_within_24_hrs orders
INNER JOIN
    mining-clinical-decisions.shc_core.culture_sensitivity cs
USING
    (order_proc_id_coded)
WHERE
    UPPER(antibiotic) LIKE "%METHOD%"
"""
# Run the query and load the full result into a DataFrame.
query_job = client.query(query)
df_method = query_job.result().to_dataframe()
# Preview the first rows only.
df_method.head()

In [None]:
# Number of distinct culture orders observed for each `sensitivity_value`
# among the "METHOD" rows held in df_method.
(df_method
.groupby('sensitivity_value')['order_proc_id_coded']
.nunique()
.reset_index(name='num_orders')
)

In [None]:
### Prevalence in training set vs test set for each of the twelve classifiers
# Loads every column of the final label table and adds the calendar year of
# `index_time` as `year`.
# NOTE(review): `year` is presumably what separates training from test rows
# downstream -- confirm where that split is applied.
query="""
SELECT
    *, EXTRACT(YEAR FROM index_time) as year
FROM 
    mining-clinical-decisions.abx.final_ast_labels
"""
# Run the query and load the full result into a DataFrame.
query_job = client.query(query)
df_labels = query_job.result().to_dataframe()
# Preview the first rows only.
df_labels.head()

In [None]:
# The twelve label columns: eight single agents followed by four
# vancomycin combinations.
abx_columns = [
    'Ampicillin', 'Ciprofloxacin', 'Cefazolin', 'Ceftriaxone',
    'Cefepime', 'Zosyn', 'Vancomycin', 'Meropenem',
    'Vancomycin_Ceftriaxone', 'Vancomycin_Cefepime',
    'Vancomycin_Zosyn', 'Vancomycin_Meropenem',
]
# Column-wise mean of each label over all rows of df_labels.
# NOTE(review): presumably the labels are 0/1 so the mean is a prevalence -- confirm.
df_labels.loc[:, abx_columns].mean()