### V3 OMOP Item Prevalence and Relative Risk (RR)

In [26]:
##Setting up Google sdk environment
# The two environment variables below are set before the BigQuery client is
# created further down in this cell.
# NOTE(review): both the credentials path and the paths below are absolute,
# user-specific locations — adjust them when running on another machine.
import os 
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/Users/wip/.config/gcloud/application_default_credentials.json'  
os.environ['GCLOUD_PROJECT'] = 'som-nero-phi-jonc101' 

import sys
import json

# Directory that receives the JSON maps written by later cells.
JSON_FILE_PATH = '/Users/wip/Recommender/V3/resource'
# Extend the import path so the project-local helper module imported below
# can be found (presumably StatsUtil lives in this directory — confirm).
sys.path.append('/Users/wip/Recommender/V3/python_scripts')
#from bigQueryUtil import BigQueryClient 
from StatsUtil import getStats

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

##Setting up BQ API
from google.cloud import bigquery
# Shared client used by every query helper in this notebook.
client = bigquery.Client()
# Source project/dataset that the queries in later cells read from
# (the `concept` table and the per-domain OMOP tables).
project_id = 'som-rit-phi-starr-prod'
dataset_id = 'starr_omop_cdm5_deid_latest'



In [2]:
# Notebook-wide configuration: naming pieces, destination dataset, and the
# placeholder values substituted into every SQL template.
prefix = 'V3'
year_start, year_end = '2014', '2020'

# Project/dataset that holds the cohort tables and receives the result tables.
save_project_id = 'som-nero-phi-jonc101'
save_dataset_id = 'wui_omop_peds'

# Cohort table names are "<prefix>_<label>_<year_start>_<year_end>".
table_id_outpt = '_'.join([prefix, 'Outpt_Cohort', year_start, year_end])
table_id_cohort = '_'.join([prefix, 'cohort', year_start, year_end])

# Values made available to str.format_map() in the SQL templates.
format_map_dict = dict(
    project_id=project_id,
    dataset_id=dataset_id,
    save_project_id=save_project_id,
    save_dataset_id=save_dataset_id,
    table_id_outpt=table_id_outpt,
    table_id_cohort=table_id_cohort,
    prefix=prefix,
)

# (source table, short name used in result-table names, concept-id column)
table_list = [
    (source_table, short_name, short_name + '_concept_id')
    for source_table, short_name in [
        ('measurement', 'measurement'),
        ('procedure_occurrence', 'procedure'),
        ('drug_exposure', 'drug'),
        ('condition_occurrence', 'condition'),
    ]
]



In [3]:
def save_query_table(sql, table_str):
    """Run ``sql`` and store its result in the BigQuery table ``table_str``.

    An existing destination table is replaced. The replacement is requested
    through the ``WRITE_TRUNCATE`` write disposition rather than by deleting
    the table before the query runs, so a query that fails no longer leaves
    the notebook without the previously saved table.

    Parameters
    ----------
    sql : str
        Query text to execute.
    table_str : str
        Destination table in ``project.dataset.table`` form.

    Raises
    ------
    Whatever ``query_job.result()`` raises when the query job fails.
    """
    job_config = bigquery.QueryJobConfig(
        destination=table_str,
        # Overwrite the destination only once the query has succeeded.
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )
    query_job = client.query(sql, job_config=job_config)
    query_job.result()  # wait for the job to finish before reporting success
    print("Query results loaded to the table {}".format(table_str))

In [4]:
def get_query_row_count(sql):
    """Execute ``sql`` and return the first column of its first result row.

    The notebook calls this with single-value ``SELECT COUNT(...)`` queries,
    so the returned value is that count. An empty result raises IndexError.
    """
    fetched_rows = list(client.query(sql).result())
    first_row = fetched_rows[0]
    return first_row[0]

In [5]:
def read_BQ(table_id, project=None, dataset=None):
    """Read a whole BigQuery table into a pandas DataFrame.

    Parameters
    ----------
    table_id : str
        Name of the table (without project/dataset).
    project : str, optional
        Project that owns the table. Defaults to the notebook-level
        ``save_project_id``.
    dataset : str, optional
        Dataset that owns the table. Defaults to the notebook-level
        ``save_dataset_id``.

    Returns
    -------
    pandas.DataFrame
        All rows and columns of the table.
    """
    # Resolve defaults at call time so the function follows the notebook's
    # configuration instead of a second, hard-coded copy of it.
    project = save_project_id if project is None else project
    dataset = save_dataset_id if dataset is None else dataset

    # Back-tick quoting keeps the hyphenated project id a single identifier.
    sql = """
        SELECT * FROM
            `{project}.{dataset}.{table_id}`
        """.format(project=project, dataset=dataset, table_id=table_id)
    query_job = client.query(sql)
    return query_job.to_dataframe()

In [33]:
# Distinct-patient totals used as denominators by the later cells.

# Patients present in the cohort table.
sqlCohort = """SELECT COUNT(DISTINCT(person_id))  
                FROM `{save_project_id}.{save_dataset_id}.{table_id_cohort}` 
                """.format(**format_map_dict)
N_cohort = get_query_row_count(sqlCohort)

# Patients present in the all-outpatient table.
sqlAll = """SELECT COUNT(DISTINCT(person_id))  
                FROM `{save_project_id}.{save_dataset_id}.{table_id_outpt}` 
                """.format(**format_map_dict)
N_All = get_query_row_count(sqlAll)

In [31]:
# For every clinical item seen in the outpatient visit table, count distinct
# patients, distinct visits and raw occurrences, one result table per domain.
# (Original note: "all outpatient pediatric visits 2015-2019"; the visit table
# actually used is `table_id_outpt`, whose name is built from year_start/year_end.)

table_list = [
    ('measurement', 'measurement', 'measurement_concept_id'),
    ('procedure_occurrence', 'procedure', 'procedure_concept_id'),
    ('drug_exposure', 'drug', 'drug_concept_id'),
    ('condition_occurrence', 'condition', 'condition_concept_id'),
]

for table, tableName, concept_id in table_list:
    # per-domain placeholders first, shared notebook placeholders on top
    feature_dict = {'table': table, 'concept_id': concept_id, **format_map_dict}

    sql = """
            WITH itemAllOutpt AS (        
                    SELECT 
                        x.person_id,
                        x.visit_occurrence_id,
                        x.{concept_id} as item_concept_id
                    FROM 
                        `{project_id}.{dataset_id}.{table}` x
                    INNER JOIN 
                        `{save_project_id}.{save_dataset_id}.{table_id_outpt}` c 
                        ON 
                            (x.person_id = c.person_id) AND
                            (x.visit_occurrence_id = c.visit_occurrence_id)
                  )

            SELECT item_concept_id,
                   COUNT(DISTINCT(person_id)) as num_pt, 
                   COUNT(DISTINCT(visit_occurrence_id)) as num_visit,
                   COUNT(*) as instance
            FROM itemAllOutpt 
            GROUP BY item_concept_id 
            ORDER BY num_pt DESC, num_visit DESC, instance DESC

    """.format_map(feature_dict)

    # result table: <prefix>_count_<domain>_Outpt in the save dataset
    save_table_id = prefix + "_count_" + tableName + "_Outpt"
    table_str = '.'.join([save_project_id, save_dataset_id, save_table_id])
    save_query_table(sql, table_str)

Query results loaded to the table som-nero-phi-jonc101.wui_omop_peds.V3_count_measurement_Outpt
Query results loaded to the table som-nero-phi-jonc101.wui_omop_peds.V3_count_procedure_Outpt
Query results loaded to the table som-nero-phi-jonc101.wui_omop_peds.V3_count_drug_Outpt
Query results loaded to the table som-nero-phi-jonc101.wui_omop_peds.V3_count_condition_Outpt


In [36]:
# getting baseline outpatient prevalence for each item
# (percentage of all outpatient patients, N_All, that have the item)
features =  ['measurement','procedure','drug','condition']

# Read every per-domain count table, then concatenate once instead of
# growing a DataFrame inside the loop.
count_frames = []
for feature in features:
    print('reading...{}'.format(feature))
    count_frames.append(read_BQ(table_id = prefix + '_count_'+ feature +'_Outpt'))
combined_df = pd.concat(count_frames)

# Drop concept id 0 (presumably the "no matching concept" placeholder — confirm);
# .copy() so the column assignment below does not write into a view.
combined_df = combined_df[combined_df['item_concept_id']!=0].copy()
combined_df["baseline_prevalence"] = combined_df["num_pt"] * 100 / N_All

# sort_values returns a new frame, so keep the result; the JSON map is then
# written in descending prevalence order.
final_df = (
    combined_df[["item_concept_id","baseline_prevalence"]]
    .sort_values(by = "baseline_prevalence", ascending = False)
)
final_dict = final_df.set_index("item_concept_id").to_dict()
baseline_prevalence_map = final_dict["baseline_prevalence"]
with open(JSON_FILE_PATH + '/baseline_prevalence_map.json', 'w') as fp:
    print('saving baseline prevalence map as json')
    json.dump(baseline_prevalence_map, fp)

reading...measurement




reading...procedure
reading...drug
reading...condition
saving baseline prevalence map as json


In [None]:
# cohort item counts - Primary care (PC) and Specialty care (SC)
# For each cohort and each item table, count distinct patients, distinct visits
# and raw occurrences per item, attach the concept name, and save the result as
# <prefix>_count_<table>_<PC|SC>_Cohort.

# NOTE: this rebinds `table_list` to 2-tuples (item table suffix, concept-id
# column); earlier cells use 3-tuples under the same name.
table_list =  [('measurement','measurement_concept_id'),
                 ('procedure','procedure_concept_id'),
                 ('drug','drug_concept_id'),
                 ('condition','condition_concept_id'),
                 ]

# (result-table suffix, keyword placed in front of IN):
#   'NOT' -> visits that are not listed as Specialty_visit_id in the cohort table (PC)
#   ''    -> visits that are listed there (SC)
cohorts = [('PC','NOT'),
            ('SC','')]

for c in cohorts:
    (cohort_suffix, cohort_str) = c

    for t in table_list:
        (table, concept_id) = t

        # Fresh placeholder dict per query so nothing leaks between iterations.
        feature_dict = {'cohort_str': cohort_str,
                        'table': table,
                        'concept_id': concept_id}
        feature_dict.update(format_map_dict)

        # The subquery filters out NULL ids: `x NOT IN (...)` evaluates to NULL
        # for every row as soon as the list contains a NULL, which would make
        # the PC tables come out empty. The filter does not change the IN case.
        sql = """
        WITH item AS 
            (SELECT 
               {concept_id} as item_concept_id,
               COUNT(DISTINCT(person_id)) as num_pt, 
               COUNT(DISTINCT(visit_id)) as num_visit,
               COUNT(*) as instance

        FROM  `{save_project_id}.{save_dataset_id}.{prefix}_{table}` item

        WHERE item.visit_id {cohort_str} IN 
                    (SELECT Specialty_visit_id 
                     FROM `{save_project_id}.{save_dataset_id}.{table_id_cohort}`
                     WHERE Specialty_visit_id IS NOT NULL)
        GROUP BY 
            {concept_id})

        SELECT 
            n.*, c.concept_name 
        FROM 
            item n 
        LEFT JOIN 
            `{project_id}.{dataset_id}.concept` c
        ON
            n.item_concept_id = c.concept_id
        ORDER BY 
            num_pt DESC, num_visit DESC, instance DESC
        """.format_map(feature_dict)

        save_table_id = prefix + "_count_" + table + "_" + cohort_suffix + "_Cohort"
        table_str = save_project_id + '.' + save_dataset_id + '.' + save_table_id
        save_query_table(sql, table_str)
    

In [29]:
# get relative risk of items based on how likely it will appear in the cohort  

def getItemRR(category, mode = 'PC', writeFile = False):
    """Compute relative risk and Fisher negative-log p-value per item.

    Reads the count tables ``<prefix>_count_<category>_<mode>_Cohort`` and
    ``<prefix>_count_<category>_NonCohort`` through ``read_BQ`` and, for each
    item of the cohort table, feeds these counts to ``getStats``:

    * ``nAB`` - patients in the cohort having the item
    * ``nA``  - ``N_cohort``
    * ``nB``  - patients having the item (cohort + non-cohort counts)
    * ``N``   - ``N_cohort + (N_All - N_cohort)``

    Items with concept id 0 are skipped.

    Parameters
    ----------
    category : str
        Item category used in the table names, e.g. ``'measurement'``.
    mode : str
        Cohort suffix used in the cohort table name (default ``'PC'``).
    writeFile : bool
        When true, also write ``{concept_id: (neglogP, rr)}`` as JSON to
        ``JSON_FILE_PATH/<category>_rrMap_Cohort<mode>.json``.

    Returns
    -------
    pandas.DataFrame
        Columns ``concept_id``, ``name``, ``neglogP``, ``rr``; one row per
        cohort item.
    """
    item_Cohort = read_BQ(table_id = prefix + '_count_'+ category +'_' + mode + '_Cohort')
    item_NonCohort = read_BQ(table_id = prefix + '_count_'+ category +'_NonCohort')

    # Population sizes do not depend on the item, so compute them once.
    N_noncohort = N_All - N_cohort
    nA = N_cohort
    N = N_cohort + N_noncohort

    # concept id -> non-cohort patient count, built once so every cohort row is
    # a dictionary lookup instead of a scan over the whole non-cohort frame.
    # setdefault keeps the first row for a repeated id.
    noncohort_pt = {}
    for cid, n_pt in zip(item_NonCohort["item_concept_id"].values,
                         item_NonCohort["num_pt"].values):
        noncohort_pt.setdefault(cid, n_pt)

    # calculate Fisher Negative Log p-value and relative risk in the Cohort
    fisherMap = {}
    fisherList = []
    for index, row in item_Cohort.iterrows():
        concept_id = row["item_concept_id"]
        if concept_id == 0:
            # concept 0 is never reported, so skip it before computing stats
            continue

        pt_cohort = row["num_pt"]
        concept_name = row["concept_name"]
        # items never seen outside the cohort count as 0 non-cohort patients
        pt_noncohort = noncohort_pt.get(concept_id, 0)

        nAB = pt_cohort
        nB = pt_cohort + pt_noncohort

        s = getStats(nAB = nAB, nA = nA, nB = nB, N = N)
        s.normalize()
        rr = s.calc('rr')
        neglogP = s.calc('fisher_neglog')

        fisherMap[concept_id] = (neglogP, rr)
        fisherList.append([concept_id, concept_name, neglogP, rr])

    if writeFile:
        with open(JSON_FILE_PATH + '/' + category + '_rrMap_Cohort' + mode + '.json', 'w') as fp:
            json.dump(fisherMap, fp)

    return pd.DataFrame(fisherList, columns=["concept_id","name","neglogP","rr"])


In [30]:
# Compute (and write to JSON) the relative-risk table of every item category,
# using getItemRR's default mode.
category_list = ['measurement', 'procedure', 'drug', 'condition']
DFlist = [getItemRR(category, writeFile=True) for category in category_list]