### V2 OMOP NLP RR with Fisher Neglog-P

In [None]:
## Setting up the Google Cloud SDK environment.
## Both values below are specific to one user's machine/project and must be edited
## before running this notebook anywhere else.
import os 
# Path to the application-default credentials JSON file used for authentication.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/wui/.config/gcloud/application_default_credentials.json' 
# Project name exported for the Google Cloud tooling; the result tables written by
# the cells in this notebook live under this same project.
os.environ['GCLOUD_PROJECT'] = 'som-nero-phi-jonc101' 

import sys
import json
sys.path.append('/home/wui/Codes')

from bigQueryUtil import BigQueryClient 
from StatsUtil import getStats

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Setting up the BigQuery API client
from google.cloud import bigquery
# Client is built with no explicit arguments; presumably it picks up credentials and
# project from the environment -- confirm against the google-cloud-bigquery docs.
client = bigquery.Client()
# Identifiers of the source OMOP project and dataset.
# NOTE(review): the queries visible in this notebook spell these names out literally
# instead of interpolating these two variables -- confirm whether they are still needed.
project_id = 'som-rit-phi-starr-prod'
dataset_id = 'starr_omop_cdm5_deid_latest'

In [None]:
# Non-cohort baseline: for every clinical item, count distinct patients, distinct
# visits and total instances over the outpatient pediatric visits 2015-2019,
# leaving out every patient that appears in the test cohort.
# One result table (V2_count_<name>_NonCohort) is written per OMOP source table.

# Each entry: (OMOP source table, short name used in the output table, concept-id column)
table_list = [
    ('measurement', 'measurement', 'measurement_concept_id'),
    ('procedure_occurrence', 'procedure', 'procedure_concept_id'),
    ('drug_exposure', 'drug', 'drug_concept_id'),
    ('condition_occurrence', 'condition', 'condition_concept_id'),
    ('device_exposure', 'device', 'device_concept_id'),
]

for table, tableName, concept in table_list:

    # The CTE keeps only rows that join to an outpatient visit and whose patient is
    # not in the test cohort; the outer query aggregates per concept id.
    sql = """
            WITH itemAllOutpt AS (        
                    SELECT 
                        x.person_id,
                        x.visit_occurrence_id,
                        x.{concept_id} as item_concept_id
                    FROM 
                        `som-rit-phi-starr-prod.starr_omop_cdm5_deid_latest.{table}` x
                    INNER JOIN 
                        `wui_omop_peds.V2_Outpt_Visit_2015_2019` c 
                        ON 
                            (x.person_id = c.person_id) AND
                            (x.visit_occurrence_id = c.visit_occurrence_id)
                    WHERE 
                        x.person_id NOT IN 
                        (SELECT person_id FROM `wui_omop_peds.V2_test_cohort`)
                  )

            SELECT item_concept_id,
                   COUNT(DISTINCT(person_id)) as num_pt, 
                   COUNT(DISTINCT(visit_occurrence_id)) as num_visit,
                   COUNT(*) as instance
            FROM itemAllOutpt 
            GROUP BY item_concept_id 
            ORDER BY num_pt DESC, num_visit DESC, instance DESC

    """.format(concept_id=concept, table=table)

    table_id = "som-nero-phi-jonc101.wui_omop_peds.V2_count_{}_NonCohort".format(tableName)

    # Remove any earlier copy of the destination table so the query can write to it.
    client.delete_table(table_id, not_found_ok=True)

    job_config = bigquery.QueryJobConfig(destination=table_id)
    query_job = client.query(sql, job_config=job_config)
    query_job.result()  # block until the query job has finished
    print("Query results loaded to the table {}".format(table_id))

In [None]:
# Cohort item counts - primary care.
# For each V2_test_<table> item table, count distinct patients, distinct visits and
# total instances per concept, using only rows whose visit_id is NOT listed as a
# Specialty_visit_id in V2_test_cohort. The concept name is attached from the OMOP
# concept table and the result is saved as V2_count_<table>_PC_Cohort.
#
# NOTE(review): in SQL, `x NOT IN (subquery)` matches no rows at all when the subquery
# returns a NULL -- confirm Specialty_visit_id is never NULL in V2_test_cohort.

# Each entry: (suffix of the V2_test_* item table, concept-id column)
table_list = [
    ('measurement', 'measurement_concept_id'),
    ('procedure', 'procedure_concept_id'),
    ('drug', 'drug_concept_id'),
    ('condition', 'condition_concept_id'),
    ('device', 'device_concept_id'),
]

for table, concept in table_list:

    sql = """
    WITH item AS 
        (SELECT 
               {concept_id} as item_concept_id,
               COUNT(DISTINCT(person_id)) as num_pt, 
               COUNT(DISTINCT(visit_id)) as num_visit,
               COUNT(*) as instance

        FROM  `som-nero-phi-jonc101.wui_omop_peds.V2_test_{table}` item

        WHERE item.visit_id NOT IN 
                    (SELECT Specialty_visit_id 
                     FROM `som-nero-phi-jonc101.wui_omop_peds.V2_test_cohort`)
        GROUP BY 
            {concept_id})

    SELECT 
        n.*, c.concept_name 
    FROM 
        item n 
    LEFT JOIN 
        `som-rit-phi-starr-prod.starr_omop_cdm5_deid_latest.concept` c
    ON
        n.item_concept_id = c.concept_id
    ORDER BY 
        num_pt DESC, num_visit DESC, instance DESC
    """.format(concept_id=concept, table=table)

    # Save the query result into the wui_omop_peds dataset via the project helper.
    bq = BigQueryClient(project_id='som-nero-phi-jonc101', dataset_id='wui_omop_peds')
    bq.saveQuerytoBQ(tableName='V2_count_{}_PC_Cohort'.format(table), sql=sql)

In [None]:
# Cohort item counts - specialty care.
# For each V2_test_<table> item table, count distinct patients, distinct visits and
# total instances per concept, using only rows whose visit_id IS listed as a
# Specialty_visit_id in V2_test_cohort. The concept name is attached from the OMOP
# concept table and the result is saved as V2_count_<table>_SC_Cohort.

# Each entry: (suffix of the V2_test_* item table, concept-id column)
table_list = [
    ('measurement', 'measurement_concept_id'),
    ('procedure', 'procedure_concept_id'),
    ('drug', 'drug_concept_id'),
    ('condition', 'condition_concept_id'),
    ('device', 'device_concept_id'),
]

for table, concept in table_list:

    sql = """
    WITH item AS 
        (SELECT 
               {concept_id} as item_concept_id,
               COUNT(DISTINCT(person_id)) as num_pt, 
               COUNT(DISTINCT(visit_id)) as num_visit,
               COUNT(*) as instance

        FROM  `som-nero-phi-jonc101.wui_omop_peds.V2_test_{table}` item

        WHERE item.visit_id IN 
                    (SELECT Specialty_visit_id 
                     FROM `som-nero-phi-jonc101.wui_omop_peds.V2_test_cohort`)
        GROUP BY 
            {concept_id})

    SELECT 
        n.*, c.concept_name 
    FROM 
        item n 
    LEFT JOIN 
        `som-rit-phi-starr-prod.starr_omop_cdm5_deid_latest.concept` c
    ON
        n.item_concept_id = c.concept_id
    ORDER BY 
        num_pt DESC, num_visit DESC, instance DESC
    """.format(concept_id=concept, table=table)

    # Save the query result into the wui_omop_peds dataset via the project helper.
    bq = BigQueryClient(project_id='som-nero-phi-jonc101', dataset_id='wui_omop_peds')
    bq.saveQuerytoBQ(tableName='V2_count_{}_SC_Cohort'.format(table), sql=sql)

In [None]:
# Compute, for each item, its relative risk and Fisher negative-log p-value, based on how likely the item is to appear in the cohort

def getItemRR(category: str, mode: str = 'SC', writeFile: bool = False) -> pd.DataFrame:
    """Compute relative risk and Fisher negative-log p-value for each cohort item.

    Reads the per-item patient counts from the BigQuery tables
    ``V2_count_<category>_<mode>_Cohort`` and ``V2_count_<category>_NonCohort``,
    and the distinct-patient totals of ``V2_test_cohort`` and
    ``V2_Outpt_Visit_2015_2019``. For every item in the cohort table it hands
    these counts to ``getStats``:

    * ``nAB`` -- cohort patients with the item
    * ``nA``  -- all cohort patients
    * ``nB``  -- patients with the item (cohort + non-cohort)
    * ``N``   -- all patients (cohort + non-cohort)

    Items missing from the non-cohort table count as 0 non-cohort patients.
    Rows whose concept id is 0 are left out of the result (presumably the OMOP
    "no matching concept" placeholder -- confirm).

    Args:
        category: Item category used in the table names (the notebook uses
            'measurement', 'procedure', 'drug', 'condition', 'device').
        mode: Cohort table variant, e.g. 'SC' or 'PC'.
        writeFile: When true, also dump ``{concept_id: (neglogP, rr)}`` as JSON
            to ``<category>_rrMap_Cohort<mode>.json`` in the working directory.

    Returns:
        DataFrame with columns ``concept_id``, ``name``, ``neglogP`` and ``rr``,
        one row per cohort item, in the order of the cohort table.
    """
    bq = BigQueryClient(project_id = 'som-nero-phi-jonc101', dataset_id = 'wui_omop_peds')

    item_Cohort = bq.readBQFile(tableName='V2_count_'+ category +'_' + mode + '_Cohort')
    item_NonCohort = bq.readBQFile(tableName = 'V2_count_'+ category +'_NonCohort')

    sqlCohort = """SELECT COUNT(DISTINCT(person_id))  
                FROM `som-nero-phi-jonc101.wui_omop_peds.V2_test_cohort` """
    sqlAll = """SELECT COUNT(DISTINCT(person_id))  
                FROM `som-nero-phi-jonc101.wui_omop_peds.V2_Outpt_Visit_2015_2019` """

    N_cohort = bq.getRowCountfromQuery(sqlCohort)
    N_All = bq.getRowCountfromQuery(sqlAll)
    N_noncohort = N_All - N_cohort
    # Population size is the same for every item, so compute it once.
    N = N_cohort + N_noncohort

    # Build the concept-id -> non-cohort patient count lookup once, instead of
    # scanning the whole non-cohort frame for every cohort row. setdefault keeps
    # the first occurrence, matching the previous `.iloc[0]` selection.
    pt_noncohort_by_concept = {}
    for noncohort_id, noncohort_pt in zip(item_NonCohort["item_concept_id"],
                                          item_NonCohort["num_pt"]):
        pt_noncohort_by_concept.setdefault(noncohort_id, noncohort_pt)

    # calculate Fisher Negative Log p-value and relative risk in the Cohort
    fisherMap = {}
    fisherList = []
    for _, row in item_Cohort.iterrows():
        concept_id = row["item_concept_id"]
        if concept_id == 0:
            # Concept 0 is never reported, so skip it before doing any statistics.
            continue

        pt_cohort = row["num_pt"]
        pt_noncohort = pt_noncohort_by_concept.get(concept_id, 0)

        s = getStats(nAB = pt_cohort,
                     nA = N_cohort,
                     nB = pt_cohort + pt_noncohort,
                     N = N)
        s.normalize()
        rr = s.calc('rr')
        neglogP = s.calc('fisher_neglog')

        fisherMap[concept_id] = (neglogP, rr)
        fisherList.append([concept_id, row["concept_name"], neglogP, rr])

    if writeFile:
        with open(category + '_rrMap_Cohort' + mode + '.json', 'w') as fp:
            json.dump(fisherMap, fp)

    return pd.DataFrame(fisherList, columns=["concept_id","name","neglogP","rr"])


In [None]:
# Build one RR / Fisher table per item category (default mode), also writing each
# category's JSON map to disk; DFlist keeps the frames in category_list order.
category_list = ['measurement', 'procedure', 'drug', 'condition', 'device']
DFlist = [getItemRR(category, writeFile=True) for category in category_list]