## V3 Feature Extraction

In [None]:
## Google SDK environment: credentials file and project are passed to the
## Google client libraries through environment variables.
import os

os.environ.update({
    'GOOGLE_APPLICATION_CREDENTIALS': '/Users/wip/.config/gcloud/application_default_credentials.json',
    'GCLOUD_PROJECT': 'som-nero-phi-jonc101',
})

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## BigQuery API setup
from google.cloud import bigquery

# Project and dataset that the source tables are read from.
project_id = 'som-rit-phi-starr-prod'
dataset_id = 'starr_omop_cdm5_deid_latest'

# Shared client used for every query in this notebook.
client = bigquery.Client()

In [None]:
# Run parameters: table-name prefix and the year range used in table names.
prefix = 'V3'
year_start = '2014'
year_end = '2020'

# Project and dataset that result tables are written to.
save_project_id = 'som-nero-phi-jonc101'
save_dataset_id = 'wui_omop_peds'

# Cohort table names follow <prefix>_<name>_<year_start>_<year_end>.
table_id_outpt = '_'.join([prefix, 'Outpt', 'Cohort', year_start, year_end])
table_id_endorefer = '_'.join([prefix, 'Endorefer', 'Cohort', year_start, year_end])
table_id_cohort = '_'.join([prefix, 'cohort', year_start, year_end])

# The demographic table name carries no year range.
table_id_demographic = '{}_demographic'.format(prefix)

# visit_concept_id values, pre-formatted as a SQL IN (...) list.
outpt_visit_concept = "(0,9202,581477,5083)"

# Placeholder values shared by every SQL template in this notebook; each key
# matches a {name} field inside the query strings.
format_map_dict = dict(
    project_id=project_id,
    dataset_id=dataset_id,
    save_project_id=save_project_id,
    save_dataset_id=save_dataset_id,
    year_start=year_start,
    year_end=year_end,
    table_id_endorefer=table_id_endorefer,
    table_id_cohort=table_id_cohort,
    outpt_visit_concept=outpt_visit_concept,
)

def save_query_table(sql, table_str):
    """Run ``sql`` and store its result in the BigQuery table ``table_str``.

    Any existing table with that name is replaced. The replacement is done
    with the ``WRITE_TRUNCATE`` write disposition rather than by deleting the
    table up front, so a query that fails leaves the previous table in place
    instead of leaving no table at all.

    Args:
        sql: Query text to execute.
        table_str: Fully qualified destination, ``project.dataset.table``.

    Raises:
        Whatever ``client.query`` / ``query_job.result`` raise when the job
        cannot be submitted or fails.
    """
    job_config = bigquery.QueryJobConfig(
        destination=table_str,
        # Overwrite on success only; the table is created if it is missing.
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )
    query_job = client.query(sql, job_config=job_config)
    query_job.result()  # Block until the job finishes (raises on failure).
    print("Query results loaded to the table {}".format(table_str))

In [None]:
# Cohort table: from the endocrine-referral rows with Endo_visit_rank = 1,
# keep one row per person -- the one ranked first by PrimaryCare_DATETIME.
sql = """

WITH rank_cohort AS 
    (
    SELECT 
        person_id, 
        PrimaryCare_visit_id,
        PrimaryCare_DATETIME,
        Specialty_visit_id,
        Specialty_DATETIME,
        ROW_NUMBER() OVER (PARTITION BY person_id ORDER BY PrimaryCare_DATETIME) AS visit_rank
    FROM 
        `{save_project_id}.{save_dataset_id}.{table_id_endorefer}`
    WHERE 
        Endo_visit_rank = 1
    )
SELECT 
    * 
FROM 
    rank_cohort
WHERE 
    visit_rank = 1
ORDER BY
    PrimaryCare_DATETIME   
""".format(**format_map_dict)

# Destination is <save project>.<save dataset>.<cohort table>.
table_str = '.'.join([save_project_id, save_dataset_id, table_id_cohort])
save_query_table(sql, table_str)


In [None]:
def extract_cohort_feature(feature_dict, feature):
    """Extract one feature type for the cohort and save it as a table.

    Rows of the feature table are joined to the cohort on ``person_id`` and
    kept when the feature datetime falls 0-180 days before the cohort's
    ``PrimaryCare_DATETIME`` or the row belongs to the cohort's specialty
    visit. Only rows whose visit has a ``visit_concept_id`` in
    ``outpt_visit_concept`` survive, and each row is labelled with the
    ``concept_name`` of its feature concept.

    Args:
        feature_dict: Mapping supplying every placeholder of the SQL template
            (``feature_columns``, ``feature_table_id``, ``feature_datetime``,
            ``feature_concept_id``, the project/dataset ids,
            ``table_id_cohort``, ``outpt_visit_concept``) plus
            ``save_table_id`` for the destination table.
        feature: Name of the feature type; not used inside this function.
    """
    query = """
    WITH feature_table AS
        (
        SELECT 
           {feature_columns}
        FROM 
            `{project_id}.{dataset_id}.{feature_table_id}` f 
        INNER JOIN 
            `{save_project_id}.{save_dataset_id}.{table_id_cohort}` c 
            ON 
                f.person_id = c.person_id AND
                (
                (DATETIME_DIFF(c.PrimaryCare_DATETIME, f.{feature_datetime}, DAY) BETWEEN 0 AND 180) 
                 OR 
                (f.visit_occurrence_id = c.Specialty_visit_id)
                )
        LEFT JOIN 
            `{project_id}.{dataset_id}.visit_occurrence` v
            ON
              f.visit_occurrence_id = v.visit_occurrence_id
        WHERE
            v.visit_concept_id IN {outpt_visit_concept}
        )
    
    SELECT
        ft.*,
        c.concept_name
    FROM feature_table ft
        LEFT JOIN 
            `{project_id}.{dataset_id}.concept` c ON ft.{feature_concept_id} = c.concept_id
    ORDER BY 
        ft.person_id,
        ft.visit_id
    """.format_map(feature_dict)

    # Destination is <save project>.<save dataset>.<save table>.
    destination_keys = ('save_project_id', 'save_dataset_id', 'save_table_id')
    destination = '.'.join([feature_dict[key] for key in destination_keys])
    save_query_table(query, destination)

In [None]:
# SELECT-list fragment for each feature type, substituted for the
# {feature_columns} placeholder of the extraction query. `f` is the alias of
# the feature table there, and every fragment exposes the visit id as visit_id.
feature_columns_dict = dict(
    measurement="""
                        f.person_id,
                        f.visit_occurrence_id as visit_id,
                        f.measurement_DATETIME,
                        f.value_as_number,
                        f.range_low,
                        f.range_high,
                        f.measurement_source_concept_id,
                        f.measurement_source_value,
                        f.measurement_concept_id,
                        """,
    procedure="""
                        f.person_id,
                        f.visit_occurrence_id as visit_id,
                        f.procedure_DATETIME,
                        f.procedure_source_concept_id,
                        f.procedure_source_value,
                        f.procedure_concept_id 
                        """,
    condition="""
                        f.person_id,
                        f.visit_occurrence_id as visit_id,
                        f.condition_start_DATETIME as condition_DATETIME,
                        f.condition_source_concept_id,
                        f.condition_source_value,
                        f.condition_concept_id
                        """,
    drug="""
                    f.person_id,
                    f.visit_occurrence_id as visit_id,
                    f.drug_exposure_start_DATETIME as drug_DATETIME,
                    f.drug_exposure_end_DATETIME as drug_end_DATETIME,
                    f.drug_exposure_id,
                    f.drug_source_concept_id,
                    f.drug_source_value,
                    f.drug_concept_id
                    """,
)


# One row per feature type:
# (feature, source table, datetime column, concept-id column).
_FEATURE_SPECS = (
    ('measurement', 'measurement', 'measurement_DATETIME', 'measurement_concept_id'),
    ('procedure', 'procedure_occurrence', 'procedure_DATETIME', 'procedure_concept_id'),
    ('condition', 'condition_occurrence', 'condition_start_DATETIME', 'condition_concept_id'),
    ('drug', 'drug_exposure', 'drug_exposure_start_DATETIME', 'drug_concept_id'),
)

# Source table that each feature type is read from.
feature_table_id_dict = {name: table for name, table, _, _ in _FEATURE_SPECS}

# Column compared against the cohort's primary-care datetime.
feature_datetime_dict = {name: when for name, _, when, _ in _FEATURE_SPECS}

# Column joined to the concept table to look up concept_name.
feature_concept_id_dict = {name: concept for name, _, _, concept in _FEATURE_SPECS}

# Output table name for each feature type: <prefix>_<feature>.
save_table_id_dict = {
    name: prefix + '_' + name
    for name in ('measurement', 'procedure', 'condition', 'drug')
}


In [None]:
# Extract and save one table per feature type.
features = ['measurement', 'procedure', 'condition', 'drug']

for feature in features:
    # Per-feature placeholders first; the shared ones are layered on top.
    feature_dict = dict(
        feature_columns=feature_columns_dict[feature],
        feature_table_id=feature_table_id_dict[feature],
        feature_datetime=feature_datetime_dict[feature],
        feature_concept_id=feature_concept_id_dict[feature],
        save_table_id=save_table_id_dict[feature],
    )
    feature_dict.update(format_map_dict)

    extract_cohort_feature(feature_dict, feature)

In [None]:
# Demographics table: person rows for the distinct cohort members, with the
# gender, race and ethnicity concept ids resolved to their concept names.

sql = """

WITH person_table AS
    (
        SELECT 
            p.person_id,
            p.birth_DATETIME,
            p.gender_concept_id,
            p.race_concept_id,
            p.ethnicity_concept_id
        FROM 
            `{project_id}.{dataset_id}.person` p 
        INNER JOIN 
            (SELECT DISTINCT(person_id) FROM `{save_project_id}.{save_dataset_id}.{table_id_cohort}`) c 
            ON 
                p.person_id = c.person_id 
    )
    
SELECT
    pt.person_id,
    pt.birth_DATETIME,
    pt.gender_concept_id,
    c1.concept_name as gender,
    pt.race_concept_id,
    c2.concept_name as race,
    pt.ethnicity_concept_id,
    c3.concept_name as ethnicity
FROM person_table pt
    LEFT JOIN 
        `{project_id}.{dataset_id}.concept` c1 ON pt.gender_concept_id = c1.concept_id
    LEFT JOIN 
        `{project_id}.{dataset_id}.concept` c2 ON pt.race_concept_id = c2.concept_id
    LEFT JOIN 
        `{project_id}.{dataset_id}.concept` c3 ON pt.ethnicity_concept_id = c3.concept_id
ORDER BY 
    pt.person_id
""".format(**format_map_dict)

# Destination is <save project>.<save dataset>.<demographic table>.
table_str = '.'.join([save_project_id, save_dataset_id, table_id_demographic])
save_query_table(sql, table_str)
