Labogram calculated using a 7-Day window

For each lab look in the previous 7 days. Calculate how many immediately preceding normal results there were in a row.
Group by 'preceding' count to get normal prevalence for each group of the labogram.

In [5]:
from google.cloud import bigquery
import pandas as pd
import os
import time
import numpy as np
import datetime

os.environ['GOOGLE_APPLICATION+CREDENTIALS'] = '/Users/nrabbani/.config/gcloud/application_default_credentials.json'
os.environ['GCLOUD_PROJECT'] = 'mining-clinical-decisions'
%load_ext google.cloud.bigquery

client=bigquery.Client()

pd.options.display.max_columns = 50
pd.options.display.max_rows = 100
pd.options.display.min_rows = 20

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery




In [9]:
def labQuery(basename, date_start, date_end, time_window = '7D'):
    query = '''
    SELECT pat_enc_csn_id_coded, base_name, result_flag, result_flag IS NULL AS normal,
        ord_value, reference_low, reference_high, result_time
        FROM `som-nero-phi-jonc101.shc_core_2021.lab_result`
    WHERE
        ordering_mode = 'Inpatient'
        AND order_type = 'Lab'
        AND ord_value IS NOT NULL
        AND lower(group_lab_name) NOT LIKE '%arterial%'

        AND base_name = '{basename}'
        AND result_time < '{date_end}'
        AND result_time >= '{date_start}'

    ORDER BY pat_enc_csn_id_coded asc, result_time asc
    '''.format_map({'basename': basename,
                    'date_start': (np.datetime64(date_start) - pd.Timedelta(time_window)).strftime('%Y-%m-%d'),
                    'date_end': date_end})
    query_job = client.query(query)
    df = query_job.to_dataframe()
    df.pat_enc_csn_id_coded = df.pat_enc_csn_id_coded.astype(int)
    df.reset_index(drop=True, inplace=True)    
    return df

In [12]:
def windowedLabCounts(df_lab, date_start, time_window = '7D'):
    df_lab.reset_index()
    totenc = len(df_lab.pat_enc_csn_id_coded.unique())
    enc_num = 0

    df_lab['preceding'] = 0
    for enc in df_lab.pat_enc_csn_id_coded.unique():
        enc_idx = (df_lab.pat_enc_csn_id_coded==enc)
        df = df_lab.loc[enc_idx, ['result_time', 'normal']]
        rownum = 0
        for (idx, row) in df.iterrows():
            if rownum>0:
                rtime = row.result_time
                dfslice = df.loc[(df.result_time <= rtime) & (df.result_time > rtime - pd.Timedelta(time_window))]
                if dfslice.shift().loc[idx, 'normal']==True:
                    y = dfslice.shift().normal
                    prec = y * (y.groupby((y != y.shift()).cumsum()).cumcount() + 1)
                    df_lab.loc[idx, 'preceding'] = prec.values[-1]
            else:
                df_lab.loc[idx, 'preceding'] = -1
            rownum = rownum + 1

        enc_num = enc_num + 1
    return df_lab.loc[df_lab.result_time >= date_start]

In [60]:
def unitTestWindow():
    query = '''
    SELECT pat_enc_csn_id_coded, base_name, result_flag, result_flag IS NULL AS normal,
        ord_value, reference_low, reference_high, result_time
        FROM `som-nero-phi-jonc101.shc_core_2021.lab_result`
    WHERE
        ordering_mode = 'Inpatient'
        AND order_type = 'Lab'
        AND ord_value IS NOT NULL
        AND lower(group_lab_name) NOT LIKE '%arterial%'

        AND base_name = 'NA'
        AND (pat_enc_csn_id_coded = 131290584921
        OR pat_enc_csn_id_coded = 131287053881)

    ORDER BY pat_enc_csn_id_coded asc, result_time asc
    '''
    query_job = client.query(query)
    df = query_job.to_dataframe()
    df.pat_enc_csn_id_coded = df.pat_enc_csn_id_coded.astype(int)
    df.reset_index(drop=True, inplace=True)  
    
    df = windowedLabCounts(df, '2020-01-01')
    unit_test = pd.read_csv('unit_test_results.csv')

    if (df.preceding == unit_test.preceding).all():
        print('Unit test passed')
        return True
    else:
        print('Unit test failed')
        return False

In [63]:
def labogramGroupings(df, basename, max_streak = 7): 
    df = df.groupby('preceding').agg({'base_name':'count', 'normal':'sum'}).rename(columns={'base_name':'cnt'})
    if len(df) > max_streak+2:
        df.iloc[max_streak+1] = df.iloc[max_streak+1:-1].sum()
        df['prop'] = df.normal / df.cnt
        df['basename'] = basename
        return df.iloc[0:max_streak+2][['basename', 'prop', 'normal', 'cnt']]
    df['prop'] = df.normal / df.cnt
    df['basename'] = basename
    return df[['basename', 'prop', 'normal', 'cnt']]

In [64]:
def calculateLabogram(basename, date_start, date_end):
    filename = basename + '_' + date_start + '_' + date_end
    df = labQuery(basename, date_start, date_end)
    df = windowedLabCounts(df, date_start)
    df = labogramGroupings(df, basename)
    df.to_csv('labogram.csv', mode='a', header=False, index=True)
    print(df)

In [151]:
date_start = '2020-01-01'
date_end = '2021-01-01'
labnames = ['ALB', 'ALKP', 'ALT', 'AST', 'AG', 'EGFR', 'BUN', 'CA', 'CL', 'CO2', 'CR', 'GLU', 'GLOB', 'K', 'NA', 'TBIL', 'TP', 'PHOS', 'MG', 'HCT', 'HGB', 'PLT', 'WBC', 'RBC', 'PT', 'INR', 'XPTT']

for labname in labnames:
    calculateLabogram(labname, date_start, date_end)

          basename      prop  normal     cnt
preceding                                   
-1             ALB  0.837009  110851  132437
 0             ALB  0.065996    4100   62125
 1             ALB  0.655087    9485   14479
 2             ALB  0.733176    4042    5513
 3             ALB  0.799864    2346    2933
 4             ALB  0.824772    1445    1752
 5             ALB  0.859574    1010    1175
 6             ALB  0.911765    1581    1734
 7             ALB  0.908502    2105    2317
          basename      prop  normal     cnt
preceding                                   
-1            ALKP  0.789522  103545  131149
 0            ALKP  0.107027    3919   36617
 1            ALKP  0.920057   12821   13935
 2            ALKP  0.942559    7778    8252
 3            ALKP  0.951199    5633    5922
 4            ALKP  0.949367    4200    4424
 5            ALKP  0.954139    3412    3576
 6            ALKP  0.960374    5962    6208
 7            ALKP  0.960298   10812   11259
          

          basename      prop  normal    cnt
preceding                                  
-1            PHOS  0.709227   17286  24373
 0            PHOS  0.414322   10750  25946
 1            PHOS  0.707765   10792  15248
 2            PHOS  0.782594    6933   8859
 3            PHOS  0.819064    4898   5980
 4            PHOS  0.849339    3467   4082
 5            PHOS  0.854987    2606   3048
 6            PHOS  0.891239    3286   3687
 7            PHOS  0.878209    5884   6700
          basename      prop  normal    cnt
preceding                                  
-1              MG  0.849455   38279  45063
 0              MG  0.434399    6400  14733
 1              MG  0.904415   15385  17011
 2              MG  0.937842   11316  12066
 3              MG  0.949590    9023   9502
 4              MG  0.955121    6938   7264
 5              MG  0.955943    5815   6083
 6              MG  0.964738    8372   8678
 7              MG  0.967439   31346  32401
          basename      prop  no

In [65]:
date_start = '2020-01-01'
date_end = '2021-01-01'
labnames = ['ALB', 'ALKP', 'ALT', 'AST', 'AG', 'EGFR', 'BUN', 'CA', 'CL', 'CO2', 'CR', 'GLU', 'GLOB', 'K', 'NA', 'TBIL', 'TP', 'PHOS', 'MG', 'HCT', 'HGB', 'PLT', 'WBC', 'RBC', 'PT', 'INR', 'XPTT']

calculateLabogram('NA', date_start, date_end)

          basename      prop  normal     cnt
preceding                                   
-1              NA  0.862697  129308  149888
 0              NA  0.249406   21115   84661
 1              NA  0.828911   36550   44094
 2              NA  0.874752   24277   27753
 3              NA  0.894802   17250   19278
 4              NA  0.907234   12704   14003
 5              NA  0.911765    9982   10948
 6              NA  0.920643   10302   11190
 7              NA  0.933346   43283   46374


## Add Confidence Intervals

In [80]:
from statsmodels.stats.proportion import proportion_confint

labogram_file = 'labogram_2022-04-19'
labogram_df = pd.read_csv(labogram_file+'.csv')
ci_df = pd.DataFrame(proportion_confint(labogram_df.normal, labogram_df.cnt, alpha=0.05)).T
labogram_df['cilb'] = ci_df[0]
labogram_df['ciub'] = ci_df[1]

labogram_df[['preceding', 'basename', 'prop', 'cilb', 'ciub', 'normal', 'cnt']].to_csv(labogram_file+'_ci.csv', index=False, float_format='%.4f')