In [None]:
import pandas
import os

# This query represents dataset "controls_60_NOV_05_2024" for domain "person" and was generated for All of Us Registered Tier Dataset v8
#
# Summary of the SQL assembled below. Every table is read from the dataset
# named by the WORKSPACE_CDR environment variable (os.environ[...] raises
# KeyError if it is not set).
#   * Selects person_id, birth_datetime (aliased to date_of_birth) and the
#     gender / race / ethnicity / sex-at-birth concept ids from `person`, each
#     LEFT JOINed to `concept` to fetch the matching concept_name.
#   * Keeps only person_ids found in `cb_search_person` that meet all of:
#       - the age derived from `dob` and CURRENT_DATE is BETWEEN 60 AND 91 and
#         the person has no row in `death`;
#       - `person.race_concept_id` is 8527 (presumably "White", since a later
#         cell filters on race == 'White' -- TODO confirm against `concept`);
#       - the person has no non-standard (is_standard = 0) row in
#         `cb_search_all_events` whose concept_id belongs to a selectable
#         `cb_criteria` entry whose `path` contains the id of one of the
#         criteria rows matched by the hard-coded concept id list.
#   * NOTE: the age window is evaluated against CURRENT_DATE, so the rows
#     returned can differ depending on the day the query is run.
dataset_93497021_person_sql = """
    SELECT
        person.person_id,
        person.gender_concept_id,
        p_gender_concept.concept_name as gender,
        person.birth_datetime as date_of_birth,
        person.race_concept_id,
        p_race_concept.concept_name as race,
        person.ethnicity_concept_id,
        p_ethnicity_concept.concept_name as ethnicity,
        person.sex_at_birth_concept_id,
        p_sex_at_birth_concept.concept_name as sex_at_birth 
    FROM
        `""" + os.environ["WORKSPACE_CDR"] + """.person` person 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_gender_concept 
            ON person.gender_concept_id = p_gender_concept.concept_id 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_race_concept 
            ON person.race_concept_id = p_race_concept.concept_id 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_ethnicity_concept 
            ON person.ethnicity_concept_id = p_ethnicity_concept.concept_id 
    LEFT JOIN
        `""" + os.environ["WORKSPACE_CDR"] + """.concept` p_sex_at_birth_concept 
            ON person.sex_at_birth_concept_id = p_sex_at_birth_concept.concept_id  
    WHERE
        person.PERSON_ID IN (SELECT
            distinct person_id  
        FROM
            `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` cb_search_person  
        WHERE
            cb_search_person.person_id IN (SELECT
                person_id 
            FROM
                `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_person` p 
            WHERE
                DATE_DIFF(CURRENT_DATE, dob, YEAR) - IF(EXTRACT(MONTH FROM dob)*100 + EXTRACT(DAY FROM dob) > EXTRACT(MONTH FROM CURRENT_DATE)*100 + EXTRACT(DAY FROM CURRENT_DATE), 1, 0) BETWEEN 60 AND 91 
                AND NOT EXISTS (      SELECT
                    'x'      
                FROM
                    `""" + os.environ["WORKSPACE_CDR"] + """.death` d      
                WHERE
                    d.person_id = p.person_id ) ) 
            AND cb_search_person.person_id IN (SELECT
                person_id 
            FROM
                `""" + os.environ["WORKSPACE_CDR"] + """.person` p 
            WHERE
                race_concept_id IN (8527) ) 
            AND cb_search_person.person_id NOT IN (SELECT
                criteria.person_id 
            FROM
                (SELECT
                    DISTINCT person_id, entry_date, concept_id 
                FROM
                    `""" + os.environ["WORKSPACE_CDR"] + """.cb_search_all_events` 
                WHERE
                    (concept_id IN(SELECT
                        DISTINCT c.concept_id 
                    FROM
                        `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` c 
                    JOIN
                        (SELECT
                            CAST(cr.id as string) AS id       
                        FROM
                            `""" + os.environ["WORKSPACE_CDR"] + """.cb_criteria` cr       
                        WHERE
                            concept_id IN (44831122, 1568428, 45547675, 44826540, 1568088, 1568429, 1568284, 1568360, 1568087, 1568442, 35207459, 35207114, 1568372, 1568370, 35207329, 44819535, 1568441, 45547690, 1568408, 44819583, 1568297, 44824105, 1568279, 1568299, 1568423, 1568397, 1568298, 35207440, 1568282, 35207445, 1568406, 1568294, 1568415, 1568361, 1568394, 920130, 44821814, 1568293, 1568395, 35207501, 45566776, 45890911, 1568407, 1568400, 1568404, 1568393, 1568436, 1568286, 45552458, 44828400, 35207314, 35207429, 1568374, 1568280, 35207365, 35207460, 44830700, 1568435, 35207511, 44820755, 44826489, 1568373, 1568287, 1568412, 44830618, 44833435, 1568289, 44826537, 1568382, 1568402)       
                            AND full_text LIKE '%_rank1]%'      ) a 
                            ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                            OR c.path LIKE CONCAT('%.', a.id) 
                            OR c.path LIKE CONCAT(a.id, '.%') 
                            OR c.path = a.id) 
                    WHERE
                        is_standard = 0 
                        AND is_selectable = 1) 
                    AND is_standard = 0 )) criteria ) )"""

# Run the query in BigQuery and load the result into a DataFrame. The BigQuery
# Storage API is requested only when BIGQUERY_STORAGE_API_ENABLED is present in
# the environment.
dataset_93497021_person_df = pandas.read_gbq(
    dataset_93497021_person_sql,
    dialect="standard",
    use_bqstorage_api=os.environ.get("BIGQUERY_STORAGE_API_ENABLED") is not None,
    progress_bar_type="tqdm_notebook",
)

# Preview the first five rows of the result.
dataset_93497021_person_df.head()


In [None]:
# Bind the shorter name df1 to the query result. This is a second name for the
# same DataFrame object, not a copy.
df1 = dataset_93497021_person_df

In [None]:
# Sanity check: count how many rows carry each race label
df1['race'].value_counts()

In [None]:
# Sanity check: count how many rows carry each sex-at-birth label
df1['sex_at_birth'].value_counts()

In [None]:
# Only select individuals who are male or female and are listed as white to match UKB data,
# keeping just the columns needed downstream.
sex_list = ['Male', 'Female']
# Row filter and column selection happen in one .loc call, and .copy() makes the
# result an independent frame so the column assignment below does not write
# into a slice of the query result (avoids SettingWithCopyWarning).
df1 = df1.loc[
    (df1['race'] == 'White') & df1['sex_at_birth'].isin(sex_list),
    ['person_id', 'date_of_birth', 'race', 'sex_at_birth'],
].copy()
# Reduce date_of_birth to the text before the first space (the date part of the
# timestamp string). Taking element 0 of the split, rather than expanding into
# two columns, also works when the value has no space, so re-running this cell
# no longer fails with a column-count mismatch.
df1['date_of_birth'] = df1['date_of_birth'].astype(str).str.split(' ').str[0]
df1

In [None]:
# Load the case files created in the 01 notebooks so that anyone listed in them
# can be excluded from the controls.
import pandas as pd
cases1 = pd.read_csv('AD_cases_n666.csv')
# plain string literal: there is nothing to interpolate, so no f-prefix
cases2 = pd.read_csv('PD_cases_n1713.csv')
cases3 = pd.read_csv('DEM_cases_n2825.csv')
# Stack the three case tables and collect every person_id they contain.
cases = pd.concat([cases1, cases2, cases3])
cases_list = list(cases['person_id'])

In [None]:
# Controls (df2) are the rows of df1 whose person_id is absent from the cases
# list; rows whose person_id does appear there are collected in `remove`.
df2 = df1.loc[~df1['person_id'].isin(cases_list)]
remove = df1.loc[df1['person_id'].isin(cases_list)]
# Report how many rows were kept and how many were dropped.
print("controls: ", df2.shape[0])
print("removed: ", remove.shape[0])

In [None]:
# Write the controls to a CSV in the working directory: column names are
# included (the to_csv default) and the index is left out.
# NOTE(review): the row count in the file name is hard-coded -- confirm it
# matches the "controls" count printed for df2.
df2.to_csv('controls_60_n135891.csv', index=False)

In [None]:
import os
import subprocess
import numpy as np
import pandas as pd

In [None]:
# This snippet assumes you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe = df2   

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = 'controls_60_n135891.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name; os.environ[...] raises KeyError when WORKSPACE_BUCKET is
# unset, whereas os.getenv would return None and the destination below would
# become the local path "None/data/"
my_bucket = os.environ['WORKSPACE_BUCKET']

# copy csv file to the bucket
args = ["gsutil", "cp", f"./{destination_filename}", f"{my_bucket}/data/"]
output = subprocess.run(args, capture_output=True)

# a non-zero exit status means the copy did not succeed; stop here with the
# captured stderr instead of leaving a failed upload unnoticed
if output.returncode != 0:
    raise RuntimeError(
        f"gsutil cp failed with exit status {output.returncode}: "
        f"{output.stderr.decode('utf-8', errors='replace')}"
    )

# print output from gsutil
output.stderr

In [None]:
# This snippet assumes that you run setup first

# This code lists objects in your Google Bucket

# Get the bucket name; os.environ[...] raises KeyError when WORKSPACE_BUCKET is
# unset, rather than passing the text "None" to gsutil
my_bucket = os.environ['WORKSPACE_BUCKET']

# List objects in the bucket. The command is given as an argument list and run
# without a shell, so the bucket value is passed to gsutil as a single argument
# and is never parsed as shell syntax. check_output raises CalledProcessError
# if gsutil exits with a non-zero status.
print(subprocess.check_output(["gsutil", "ls", "-r", my_bucket]).decode('utf-8'))
