### V2 OMOP Peds Clinical Items Prevalence Outpatient 

In [3]:
##Setting up Google sdk environment
import os 
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/wui/.config/gcloud/application_default_credentials.json' 
os.environ['GCLOUD_PROJECT'] = 'som-nero-phi-jonc101' 

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

##Setting up BQ API
from google.cloud import bigquery
client = bigquery.Client()
project_id = 'som-rit-phi-starr-prod'
dataset_id = 'starr_omop_cdm5_deid_latest'

In [10]:
# get counts (number of patients, visits and instances) per each clinical items
# in all outpatient pediatric visits 2015-2019

table_list =  [('measurement','measurement_concept_id'),
                 ('procedure_occurrence','procedure_concept_id'),
                 ('drug_exposure','drug_concept_id'),
                 ('condition_occurrence','condition_concept_id'),
                 ('device_exposure','device_concept_id')]

for t in table_list:
    (table, concept) = t
    
    sql = """
            WITH itemAllOutpt AS (        
                    SELECT 
                        x.person_id,
                        x.visit_occurrence_id,
                        x.{concept_id} as item_concept_id
                    FROM 
                        `som-rit-phi-starr-prod.starr_omop_cdm5_deid_latest.{table}` x
                    INNER JOIN 
                        `wui_omop_peds.V2_Outpt_Visit_2015_2019` c 
                        ON 
                            (x.person_id = c.person_id) AND
                            (x.visit_occurrence_id = c.visit_occurrence_id)
                  )

            SELECT item_concept_id,
                   COUNT(DISTINCT(person_id)) as num_pt, 
                   COUNT(DISTINCT(visit_occurrence_id)) as num_visit,
                   COUNT(*) as instance
            FROM itemAllOutpt 
            GROUP BY item_concept_id 
            ORDER BY num_pt DESC, num_visit DESC, instance DESC

    """.format_map({'concept_id':concept, 'table': table })

    table_id = "som-nero-phi-jonc101.wui_omop_peds.V2_count_" + table
    job_config = bigquery.QueryJobConfig(destination=table_id)
    client.delete_table(table_id, not_found_ok = True)
    query_job = client.query(sql, job_config=job_config)  
    query_job.result() 
    print("Query results loaded to the table {}".format(table_id))

Query results loaded to the table som-nero-phi-jonc101.wui_omop_peds.V2_count_measurement
Query results loaded to the table som-nero-phi-jonc101.wui_omop_peds.V2_count_procedure_occurrence
Query results loaded to the table som-nero-phi-jonc101.wui_omop_peds.V2_count_drug_exposure
Query results loaded to the table som-nero-phi-jonc101.wui_omop_peds.V2_count_condition_occurrence
Query results loaded to the table som-nero-phi-jonc101.wui_omop_peds.V2_count_device_exposure


In [6]:
from bigQueryUtil import BigQueryClient 

bq = BigQueryClient(project_id = 'som-nero-phi-jonc101', dataset_id = 'wui_omop_peds')

# read the above table in a python dictionary format 
table_list =  ['measurement','procedure_occurrence','drug_exposure',
                'condition_occurrence','device_exposure','nlp']
df_list = []
for t in table_list:
    print('reading...{}'.format(t))
    df = bq.readBQFile(tableName='V2_count_' + t)
    df_list.append(df)



som-nero-phi-jonc101
reading...measurement
reading...procedure_occurrence
reading...drug_exposure
reading...condition_occurrence
reading...device_exposure
reading...nlp


In [7]:
import pandas as pd
combinedDF = pd.concat([df_list[0],df_list[1],df_list[2],df_list[3],df_list[4]])
# excluding NLP    

In [13]:
newDF = combinedDF[combinedDF['item_concept_id']!=0]

In [15]:
N = 445308 # total number of patients from V2_Outpt_Visit_2015_2019

In [18]:
newDF["prevalence"] = newDF["num_pt"].apply(lambda x : x*100/N)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [20]:
newDF = newDF[["item_concept_id","prevalence"]]

In [25]:
newDF.sort_values(by="prevalence",ascending=False)
x = newDF.set_index("item_concept_id").to_dict()

In [28]:
prevalence_map = x['prevalence']


{3025315: 72.86709423589964,
 3036277: 70.012665391145,
 3038553: 69.96685440189711,
 4301868: 54.38236007437549,
 3020891: 52.65524086699543,
 4154790: 51.44371985232693,
 4152194: 51.44371985232693,
 3000963: 18.47350597788497,
 42870592: 15.299523026759008,
 3023314: 13.055458244630682,
 3007461: 12.832691081229171,
 3019897: 12.50887026507496,
 3024731: 12.464406657863771,
 3035941: 12.461038202772015,
 3003338: 12.458568002371392,
 3002864: 11.834954683050832,
 3010813: 11.77656812812705,
 3011948: 11.51180755791497,
 3037511: 11.509786484859918,
 3026361: 11.402220485596485,
 3010457: 11.388971228902243,
 3009261: 11.305882669972243,
 3018010: 11.295777304696974,
 3022096: 11.167776011210218,
 3045414: 10.616023067180468,
 3004501: 10.569538386914227,
 3033575: 10.46174782397801,
 3004327: 10.460400441941308,
 3028615: 10.34295364107539,
 3006923: 10.171611558741365,
 3024128: 10.085828235737962,
 3016723: 9.448291968704806,
 2212093: 9.443351567903564,
 3022621: 9.43706378506561

In [29]:
import json
with open('item_prevalence_map.json', 'w') as fp:
    json.dump(prevalence_map, fp)