### V2 OMOP Peds NLP

In [None]:
##Setting up Google sdk environment
# Sets the credential file and project environment variables before any
# Google client is created.
# NOTE(review): the credential path and the sys.path entry below are absolute
# paths under one user's home directory; adjust them when running elsewhere.
import os 
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/wui/.config/gcloud/application_default_credentials.json' 
os.environ['GCLOUD_PROJECT'] = 'som-nero-phi-jonc101' 

import sys
import json
# Extend the import search path; presumably this is where bigQueryUtil lives.
sys.path.append('/home/wui/Codes')

# Project-local helper used by every query cell below (saveQuerytoBQ,
# readBQFile, getRowCountfromQuery).
from bigQueryUtil import BigQueryClient 

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

##Setting up BQ API
from google.cloud import bigquery
# NOTE(review): `client` is not referenced by any of the cells visible here.
client = bigquery.Client()
# Source project/dataset substituted into the `{project_id}.{dataset_id}`
# placeholders of the SQL templates in later cells.
project_id = 'som-rit-phi-starr-prod'
dataset_id = 'starr_omop_cdm5_deid_latest'

In [None]:
# note table -> note nlp (Primary Care)
# including office notes and telephone encounter notes
# exclude primary care visit in the final cohort
#
# Builds per-concept counts (distinct patients, distinct visits, total rows)
# from note_nlp for notes attached to primary-care visits whose title contains
# 'progress' or 'tele', skipping every note listed in V2_test_note.
#
# The source tables are addressed through the {project_id}.{dataset_id}
# placeholders so that format_map() actually applies the configured values.
#
# The NOT IN subquery filters out NULL note_ids: in SQL, `x NOT IN (..., NULL)`
# never evaluates to TRUE, so a single NULL would silently empty the result.
#
# NOTE(review): unlike the V2_all_nlp query, this one does not filter on
# term_exists = 'Y' -- confirm the two populations are meant to differ.

sql = """
WITH
     notePrimaryCare AS (
     SELECT
            x.person_id,
            x.visit_occurrence_id,
            x.note_id
        FROM
            `{project_id}.{dataset_id}.note` x
        INNER JOIN
            `wui_omop_peds.V2_PrimaryCare_Visit_2015_2019` c
            ON
                (x.person_id = c.person_id) AND
                (x.visit_occurrence_id = c.visit_occurrence_id)
        WHERE
            (lower(x.note_title) LIKE '%progress%' OR
             lower(x.note_title) LIKE '%tele%') AND
             (x.note_id NOT IN
                (SELECT note_id
                 FROM `wui_omop_peds.V2_test_note`
                 WHERE note_id IS NOT NULL))

      )

SELECT note_nlp_concept_id as item_concept_id,
       COUNT(DISTINCT(n.person_id)) as num_pt,
       COUNT(DISTINCT(n.visit_occurrence_id)) as num_visit,
       COUNT(*) as instance
FROM  `{project_id}.{dataset_id}.note_nlp` nlp
INNER JOIN
    notePrimaryCare n
ON
    nlp.note_id = n.note_id
GROUP BY note_nlp_concept_id
ORDER BY num_pt DESC, num_visit DESC, instance DESC

""".format_map({'project_id':project_id, 'dataset_id':dataset_id})

# Materialize the query result as a table in the working dataset.
bq = BigQueryClient(project_id = 'som-nero-phi-jonc101', dataset_id = 'wui_omop_peds')
bq.saveQuerytoBQ(tableName = 'V2_count_nlp_PrimaryCare', sql = sql)

In [None]:
# note table -> note nlp (Specialty Care)
# including office notes and telephone encounter notes
# exclude specialty care visit in the final cohort
#
# Builds per-concept counts (distinct patients, distinct visits, total rows)
# from note_nlp for notes attached to visits in V2_Endo_Visit_2015_2019 whose
# title contains 'progress' or 'tele', skipping every note listed in
# V2_test_note.
#
# The source tables are addressed through the {project_id}.{dataset_id}
# placeholders so that format_map() actually applies the configured values.
#
# The NOT IN subquery filters out NULL note_ids: in SQL, `x NOT IN (..., NULL)`
# never evaluates to TRUE, so a single NULL would silently empty the result.
#
# NOTE(review): unlike the V2_all_nlp query, this one does not filter on
# term_exists = 'Y' -- confirm the two populations are meant to differ.

sql = """
WITH noteSpecialtyCare AS (
     SELECT
            x.person_id,
            x.visit_occurrence_id,
            x.note_id
        FROM
            `{project_id}.{dataset_id}.note` x
        INNER JOIN
            `wui_omop_peds.V2_Endo_Visit_2015_2019` c
            ON
                (x.person_id = c.person_id) AND
                (x.visit_occurrence_id = c.visit_occurrence_id)
        WHERE
            (lower(x.note_title) LIKE '%progress%' OR
             lower(x.note_title) LIKE '%tele%') AND
             (x.note_id NOT IN
                (SELECT note_id
                 FROM `wui_omop_peds.V2_test_note`
                 WHERE note_id IS NOT NULL))
      )

SELECT note_nlp_concept_id as item_concept_id,
       COUNT(DISTINCT(n.person_id)) as num_pt,
       COUNT(DISTINCT(n.visit_occurrence_id)) as num_visit,
       COUNT(*) as instance
FROM  `{project_id}.{dataset_id}.note_nlp` nlp
INNER JOIN
    noteSpecialtyCare n
ON
    nlp.note_id = n.note_id
GROUP BY note_nlp_concept_id
ORDER BY num_pt DESC, num_visit DESC, instance DESC

""".format_map({'project_id':project_id, 'dataset_id':dataset_id})

# Materialize the query result as a table in the working dataset.
bq = BigQueryClient(project_id = 'som-nero-phi-jonc101', dataset_id = 'wui_omop_peds')
bq.saveQuerytoBQ(tableName = 'V2_count_nlp_SpecialtyCare', sql = sql)

In [None]:
# cohort NLP counts for prevalence - Primary care
#
# Per-concept counts (distinct patients, distinct visits, total rows) over the
# V2_test_nlp rows whose visit_id is NOT one of the cohort's
# Specialty_visit_id values.
#
# The subquery drops NULL Specialty_visit_id values: in SQL,
# `x NOT IN (..., NULL)` never evaluates to TRUE, so a single NULL in the
# cohort table would silently make this query return no rows.

sql = """
SELECT
       nlp_concept_id as item_concept_id,
       COUNT(DISTINCT(person_id)) as num_pt,
       COUNT(DISTINCT(visit_id)) as num_visit,
       COUNT(*) as instance

FROM  `som-nero-phi-jonc101.wui_omop_peds.V2_test_nlp` nlp

WHERE nlp.visit_id NOT IN
            (SELECT Specialty_visit_id
             FROM `som-nero-phi-jonc101.wui_omop_peds.V2_test_cohort`
             WHERE Specialty_visit_id IS NOT NULL)
GROUP BY
    nlp_concept_id
ORDER BY
    num_pt DESC, num_visit DESC, instance DESC
"""
# Materialize the query result as a table in the working dataset.
bq = BigQueryClient(project_id = 'som-nero-phi-jonc101', dataset_id = 'wui_omop_peds')
bq.saveQuerytoBQ(tableName = 'V2_count_nlp_cohortPC', sql = sql)

In [None]:
# cohort NLP counts for prevalence - Specialty care
#
# Per-concept counts (distinct patients, distinct visits, total rows) over the
# V2_test_nlp rows whose visit_id is one of the cohort's Specialty_visit_id
# values. The result is saved as table V2_count_nlp_cohortSC.

sql = """
SELECT 
       nlp_concept_id as item_concept_id,
       COUNT(DISTINCT(person_id)) as num_pt, 
       COUNT(DISTINCT(visit_id)) as num_visit,
       COUNT(*) as instance

FROM  `som-nero-phi-jonc101.wui_omop_peds.V2_test_nlp` nlp

WHERE nlp.visit_id IN 
            (SELECT Specialty_visit_id 
             FROM `som-nero-phi-jonc101.wui_omop_peds.V2_test_cohort`)
GROUP BY 
    nlp_concept_id 
ORDER BY 
    num_pt DESC, num_visit DESC, instance DESC
"""
# Materialize the query result as a table in the working dataset.
bq = BigQueryClient(project_id = 'som-nero-phi-jonc101', dataset_id = 'wui_omop_peds')
bq.saveQuerytoBQ(tableName = 'V2_count_nlp_cohortSC', sql = sql)

In [None]:
# get all notes from All pediatric outpatient visit
#
# Selects notes (title containing 'progress' or 'tele') of patients present in
# V2_Outpt_Visit_2015_2019 whose note_DATETIME lies 0-6 MONTH boundaries
# (DATETIME_DIFF) before one of that patient's visit_start_DATETIME values,
# restricted to notes whose own visit has visit_concept_id in (0, 9202, 581477).
#
# SELECT DISTINCT: the join to the visit table is on person_id plus a date
# window, so one note can match several visits of the same patient and would
# otherwise be emitted once per matching visit. Those duplicates would carry
# over into V2_all_nlp (joined on note_id) and inflate its COUNT(*) "instance"
# figures. All selected columns come from the note table, so DISTINCT yields
# one row per note.
#
# NOTE(review): because the WHERE clause filters on v.visit_concept_id, notes
# without a matching visit_occurrence row are dropped, i.e. the LEFT JOIN
# behaves like an INNER JOIN -- confirm this is intended.

sql = """

        SELECT DISTINCT
            x.person_id,
            x.visit_occurrence_id as visit_id,
            x.note_DATETIME,
            x.note_id,
            x.note_title
        FROM
            `{project_id}.{dataset_id}.note` x
        INNER JOIN
            `wui_omop_peds.V2_Outpt_Visit_2015_2019` c
            ON
                (x.person_id = c.person_id)

                AND

                (DATETIME_DIFF(c.visit_start_DATETIME, x.note_DATETIME, MONTH) BETWEEN 0 AND 6)

         LEFT JOIN
            `{project_id}.{dataset_id}.visit_occurrence` v
            ON
              x.visit_occurrence_id = v.visit_occurrence_id

         WHERE
            v.visit_concept_id IN (0,9202, 581477) AND
             (lower(x.note_title) LIKE '%progress%' OR
              lower(x.note_title) LIKE '%tele%')

""".format_map({'project_id':project_id, 'dataset_id':dataset_id})

# Materialize the query result as a table in the working dataset.
bq = BigQueryClient(project_id = 'som-nero-phi-jonc101', dataset_id = 'wui_omop_peds')
bq.saveQuerytoBQ(tableName = 'V2_all_note', sql = sql)

In [None]:
# NLP rows for every note in V2_all_note.
#
# Joins note_nlp to V2_all_note on note_id and keeps only rows with
# term_exists = 'Y'. Patient, visit, title and datetime come from
# V2_all_note; lexical_variant and the concept id come from note_nlp.
# The result is saved as table V2_all_nlp.
sql = """
        SELECT 
            n.person_id,
            n.visit_id,
            x.note_id,
            x.lexical_variant,
            n.note_title,
            n.note_DATETIME as nlp_DATETIME,
            x.note_nlp_concept_id as nlp_concept_id
        FROM 
            `{project_id}.{dataset_id}.note_nlp` x 
        INNER JOIN 
            `wui_omop_peds.V2_all_note` n 
            ON 
                x.note_id = n.note_id 
        WHERE 
            x.term_exists = 'Y'
""".format_map({'project_id':project_id, 'dataset_id':dataset_id})

# Materialize the query result as a table in the working dataset.
bq = BigQueryClient(project_id = 'som-nero-phi-jonc101', dataset_id = 'wui_omop_peds')
bq.saveQuerytoBQ(tableName = 'V2_all_nlp', sql = sql)

In [None]:
# Per-concept counts over all rows of V2_all_nlp: distinct patients (num_pt),
# distinct visits (num_visit) and total rows (instance), ordered by those
# three counts descending. The result is saved as table V2_count_nlp_all.
sql = """
SELECT 
       nlp_concept_id as item_concept_id,
       COUNT(DISTINCT(person_id)) as num_pt, 
       COUNT(DISTINCT(visit_id)) as num_visit,
       COUNT(*) as instance

FROM  `som-nero-phi-jonc101.wui_omop_peds.V2_all_nlp` nlp

GROUP BY 
    nlp_concept_id 

ORDER BY 
    num_pt DESC, num_visit DESC, instance DESC
"""
# Materialize the query result as a table in the working dataset.
bq = BigQueryClient(project_id = 'som-nero-phi-jonc101', dataset_id = 'wui_omop_peds')
bq.saveQuerytoBQ(tableName = 'V2_count_nlp_all', sql = sql)

In [None]:
# Calculate prevalence (percentage of patients) for NLP phrases in:
# nlpPC: primary care excluding the notes that are used in the cohort
# nlpSC: specialty care excluding the notes used in the cohort
# nlpCohortPC: cohort primary care
# nlpCohortSC: cohort specialty care
# nlpAll: all notes collected in V2_all_note
# and write one {item_concept_id: prevalence} JSON file per group.


def _prevalence_map(counts, total_patients):
    """Add a ``prevalence`` column to ``counts`` and return it as a dict.

    Parameters
    ----------
    counts : pandas.DataFrame
        Count table with ``item_concept_id`` and ``num_pt`` columns. It is
        modified in place: a ``prevalence`` column is added (or overwritten).
    total_patients : number
        Denominator, i.e. the patient count of the group.

    Returns
    -------
    dict
        Mapping ``item_concept_id -> num_pt * 100 / total_patients``.

    Raises
    ------
    ZeroDivisionError
        If ``total_patients`` is zero; fail loudly rather than emit inf/NaN
        values that would end up in the JSON files.
    """
    if total_patients == 0:
        raise ZeroDivisionError("patient count used as prevalence denominator is 0")
    # Vectorized arithmetic instead of a per-element lambda.
    counts["prevalence"] = counts["num_pt"] * 100 / total_patients
    return counts.set_index("item_concept_id")["prevalence"].to_dict()


bq = BigQueryClient(project_id = 'som-nero-phi-jonc101', dataset_id = 'wui_omop_peds')

# Per-concept count tables produced by the earlier cells.
nlpPC = bq.readBQFile(tableName='V2_count_nlp_PrimaryCare')
nlpSC = bq.readBQFile(tableName='V2_count_nlp_SpecialtyCare')
nlpCohortPC = bq.readBQFile(tableName='V2_count_nlp_cohortPC')
nlpCohortSC = bq.readBQFile(tableName='V2_count_nlp_cohortSC')
nlpAll = bq.readBQFile(tableName = 'V2_count_nlp_all')

# Denominators: number of distinct patients in each group.
sqlPC = """SELECT COUNT(DISTINCT(person_id))  
            FROM `som-nero-phi-jonc101.wui_omop_peds.V2_PrimaryCare_Visit_2015_2019` """
sqlSC = """SELECT COUNT(DISTINCT(person_id))  
            FROM `som-nero-phi-jonc101.wui_omop_peds.V2_Endo_Visit_2015_2019` """
sqlCohort = """SELECT COUNT(DISTINCT(person_id))  
            FROM `som-nero-phi-jonc101.wui_omop_peds.V2_test_cohort` """
sqlAll = """SELECT COUNT(DISTINCT(person_id))  
            FROM `som-nero-phi-jonc101.wui_omop_peds.V2_all_note` """

# NOTE(review): assumes getRowCountfromQuery returns the scalar produced by
# the COUNT query (not the number of result rows) -- confirm in bigQueryUtil.
numPt_PrimaryCare = bq.getRowCountfromQuery(sqlPC)
numPt_SpecialtyCare = bq.getRowCountfromQuery(sqlSC)
numPt_cohort = bq.getRowCountfromQuery(sqlCohort)
numPt_all = bq.getRowCountfromQuery(sqlAll)

# Both cohort tables share the same denominator (patients in V2_test_cohort).
nlpPC_prevMap = _prevalence_map(nlpPC, numPt_PrimaryCare)
nlpSC_prevMap = _prevalence_map(nlpSC, numPt_SpecialtyCare)
nlpCohortPC_prevMap = _prevalence_map(nlpCohortPC, numPt_cohort)
nlpCohortSC_prevMap = _prevalence_map(nlpCohortSC, numPt_cohort)
nlpAll_prevMap = _prevalence_map(nlpAll, numPt_all)

prevMap = [(nlpPC_prevMap, 'nlp_prevMap_PC'),
           (nlpSC_prevMap, 'nlp_prevMap_SC'),
           (nlpCohortPC_prevMap, 'nlp_prevMap_CohortPC'),
           (nlpCohortSC_prevMap, 'nlp_prevMap_CohortSC'),
           (nlpAll_prevMap, 'nlp_prevMap_All')]

# One JSON file per group, written to the current working directory.
for prevalence_by_concept, file_stem in prevMap:
    with open(file_stem + '.json', 'w') as fp:
        json.dump(prevalence_by_concept, fp)
