## Load libraries

In [177]:
# Use the os package to interact with the environment
import os
import sys

# Bring in Pandas for Dataframe functionality
import pandas as pd
from functools import reduce

# Bring some visualization functionality 
import seaborn as sns

# numpy for basics
import numpy as np

# Use StringIO for working with file contents
from io import StringIO

# Enable IPython to display matplotlib graphs
import matplotlib.pyplot as plt
%matplotlib inline

# Enable interaction with the FireCloud API
from firecloud import api as fapi

# Import the iPython HTML rendering for displaying links to Google Cloud Console
from IPython.core.display import display, HTML

# Import urllib modules for building URLs to Google Cloud Console
import urllib.parse

# BigQuery for querying data
from google.cloud import bigquery

## Create Python Functions to Interact With GCP/Terra

In [178]:
# Utility routine for printing a shell command before executing it
def shell_do(command):
    print(f'Executing: {command}', file=sys.stderr)
    !$command
    
def shell_return(command):
    print(f'Executing: {command}', file=sys.stderr)
    output = !$command
    return '\n'.join(output)

# Utility routine for printing a query before executing it
def bq_query(query):
    print(f'Executing: {query}', file=sys.stderr)
    return pd.read_gbq(query, project_id=BILLING_PROJECT_ID, dialect='standard')

# Utility routine for display a message and a link
def display_html_link(description, link_text, url):
    html = f'''
    <p>
    </p>
    <p>
    {description}
    <a target=_blank href="{url}">{link_text}</a>.
    </p>
    '''

    display(HTML(html))

# Utility routines for reading files from Google Cloud Storage
def gcs_read_file(path):
    """Return the contents of a file in GCS"""
    contents = !gsutil -u {BILLING_PROJECT_ID} cat {path}
    return '\n'.join(contents)
    
def gcs_read_csv(path, sep=None):
    """Return a DataFrame from the contents of a delimited file in GCS"""
    return pd.read_csv(StringIO(gcs_read_file(path)), sep=sep, engine='python')

# Utility routine for displaying a message and link to Cloud Console
def link_to_cloud_console_gcs(description, link_text, gcs_path):
    url = '{}?{}'.format(
        os.path.join('https://console.cloud.google.com/storage/browser',
                     gcs_path.replace("gs://","")),
        urllib.parse.urlencode({'userProject': BILLING_PROJECT_ID}))

    display_html_link(description, link_text, url)

## Initialize Workspace Variables

In [179]:
# Set up billing project and data path variables
BILLING_PROJECT_ID = os.environ['GOOGLE_PROJECT']
WORKSPACE_NAMESPACE = os.environ['WORKSPACE_NAMESPACE']
WORKSPACE_NAME = os.environ['WORKSPACE_NAME']
WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']

WORKSPACE_ATTRIBUTES = fapi.get_workspace(WORKSPACE_NAMESPACE, WORKSPACE_NAME).json().get('workspace',{}).get('attributes',{})

print(BILLING_PROJECT_ID)
print(WORKSPACE_NAME)
print(WORKSPACE_BUCKET)

terra-2b540c0c
PDD_Transcriptomic_Project_Trainees
gs://fc-48371aef-4547-45c8-9640-6bd27b0cd4a5


In [180]:
## AMP-PD v2.5
## Explicitly define release v2.5 path and other paths

AMP_RELEASE_PATH = 'gs://amp-pd-data/releases/2021_v2-5release_0510'
AMP_CLINICAL_RELEASE_PATH = f'{AMP_RELEASE_PATH}/clinical'
AMP_TRANSCRIPTOMICS_PATH_PICARD = 'gs://amp-pd-transcriptomics/releases/2021_v2-5release_0510/rnaseq/picard/metrics'
AMP_TRANSCRIPTOMICS_PATH_FEATURE_COUNTS = 'gs://amp-pd-transcriptomics/releases/2021_v2-5release_0510/rnaseq/subread/feature-counts'
AMP_TRANSCRIPTOMICS_PATH_SEQUENCING = 'gs://amp-pd-transcriptomics/releases/2021_v2-5release_0510/rnaseq/sequencing/metrics'

In [181]:
#Set working directory
WORK_DIR = f'/home/jupyter/'
print(WORK_DIR)

/home/jupyter/


In [182]:
##Copy files from bucket workspace to notebook workspace
#shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {AMP_RELEASE_PATH}/amp_pd_case_control.csv {WORK_DIR}')
#shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {AMP_RELEASE_PATH}/rna_sample_inventory.csv {WORK_DIR}')
#shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {AMP_CLINICAL_RELEASE_PATH}/Enrollment.csv {WORK_DIR}')
#shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {AMP_CLINICAL_RELEASE_PATH}/Demographics.csv {WORK_DIR}')
#shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {AMP_CLINICAL_RELEASE_PATH}/MOCA.csv {WORK_DIR}')
#shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {AMP_CLINICAL_RELEASE_PATH}/PD_Medical_History.csv {WORK_DIR}')
#shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {AMP_TRANSCRIPTOMICS_PATH_PICARD}/aggregated.alignment_summary_metrics.tsv {WORK_DIR}')
#shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {AMP_TRANSCRIPTOMICS_PATH_PICARD}/aggregated.rna_seq_metrics.tsv {WORK_DIR}')
#shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {AMP_TRANSCRIPTOMICS_PATH_FEATURE_COUNTS}/matrix.featureCounts.tsv {WORK_DIR}')
#shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {AMP_TRANSCRIPTOMICS_PATH_SEQUENCING}/rna_quality_metrics.csv {WORK_DIR}')

In [183]:
#Read all the different files needed with pd.read_excel() or pd.read_csv()
moca = pd.read_csv(f'{WORK_DIR}/MOCA.csv')
medical_history = pd.read_csv(f'{WORK_DIR}/PD_Medical_History.csv')
demographics = pd.read_csv(f'{WORK_DIR}/Demographics.csv')
case_control = pd.read_csv(f'{WORK_DIR}/amp_pd_case_control.csv')
plate = pd.read_csv(f'{WORK_DIR}/rna_quality_metrics.csv')
base = pd.read_csv(f'{WORK_DIR}/aggregated.rna_seq_metrics.tsv', sep = '\t')
strand = pd.read_csv(f'{WORK_DIR}/aggregated.alignment_summary_metrics.tsv', sep = '\t')
rnaseq = pd.read_csv(f'{WORK_DIR}/rna_sample_inventory.csv')

In [184]:
medical_history

Unnamed: 0,participant_id,GUID,visit_name,visit_month,diagnosis,initial_diagnosis,most_recent_diagnosis,change_in_diagnosis,change_in_diagnosis_months_after_baseline,surgery_for_parkinson_disease,pd_diagnosis_months_after_baseline,age_at_diagnosis,pd_medication_initiation_months_after_baseline,pd_medication_start_months_after_baseline,use_of_pd_medication,pd_medication_recent_use_months_after_baseline,on_levodopa,on_dopamine_agonist,on_other_pd_medications,diagnosis_type
0,BF-1001,PDNW781VHY,M0,0.0,No PD Nor Other Neurological Disorder,,,,,,,,,,,,,,,
1,BF-1002,PDCB969UGG,LOG,,,,,,,,,,,-39.0,,,,,,
2,BF-1002,PDCB969UGG,M0,0.0,Idiopathic PD,,,,,,-61.0,61.0,,,Yes,0.0,Yes,No,No,
3,BF-1003,PDLW805AHT,LOG,,,,,,,,,,,-57.0,,,,,,
4,BF-1003,PDLW805AHT,M0,0.0,Idiopathic PD,,,,,,-60.0,56.0,,,Yes,0.0,Yes,Yes,Yes,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28722,SY-PDZZ260HUM,PDZZ260HUM,M18,18.0,,,,,,,,,,,,,Yes,,Yes,
28723,SY-PDZZ260HUM,PDZZ260HUM,M24,24.0,,,,,,,,,,,,,Yes,,No,
28724,SY-PDZZ260HUM,PDZZ260HUM,M36,36.0,Parkinson's Disease,,,,,,,,,,,,,,,
28725,SY-PDZZ260HUM,PDZZ260HUM,M42,42.0,,,,,,,,,,,,,Yes,,No,


In [185]:
#Keep baseline visit with visit_name=="M0"
moca_m0 = moca[moca.visit_name=="M0"]
medical_history_m0 = medical_history[medical_history.visit_name=="M0"]
demographics_m0 = demographics[(demographics.visit_month==0) | (demographics.visit_month==-1)]

In [186]:
demographics_m0.columns

Index(['participant_id', 'GUID', 'visit_name', 'visit_month',
       'age_at_baseline', 'sex', 'ethnicity', 'race', 'education_level_years'],
      dtype='object')

In [187]:
demographics_m0.shape

(10773, 9)

In [188]:
#Keep the columns that we are interested in from the original datasets
moca_m0 = moca_m0[["participant_id", "GUID", "visit_name","visit_month","moca_total_score"]]
medical_history_m0 = medical_history_m0[["participant_id","GUID","visit_name","visit_month","diagnosis","initial_diagnosis","most_recent_diagnosis","age_at_diagnosis"]]

In [189]:
case_control

Unnamed: 0,participant_id,diagnosis_at_baseline,diagnosis_latest,case_control_other_at_baseline,case_control_other_latest
0,BF-1018,Essential Tremor,Essential Tremor,Other,Other
1,BF-1050,Other Neurological Disorder(s),Other Neurological Disorder(s),Other,Other
2,BF-1074,Other Neurological Disorder(s),Other Neurological Disorder(s),Other,Other
3,BF-1181,Other Neurological Disorder(s),Other Neurological Disorder(s),Other,Other
4,BF-1215,Essential Tremor,Essential Tremor,Other,Other
...,...,...,...,...,...
10767,PP-75512,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,Control
10768,PP-75520,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,Control
10769,PP-75547,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,Control
10770,PP-75550,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,Control


In [190]:
#Keep only those Case and Controls from the case_control_other_latest column.
case_control_PD_HC_M0_latest = case_control.loc[(case_control.case_control_other_latest=="Case") | (case_control.case_control_other_latest=="Control")]
case_control_PD_HC_M0_latest.case_control_other_latest.value_counts()

Control    4312
Case       3527
Name: case_control_other_latest, dtype: int64

In [191]:
case_control_PD_HC_M0_latest

Unnamed: 0,participant_id,diagnosis_at_baseline,diagnosis_latest,case_control_other_at_baseline,case_control_other_latest
2801,BF-1002,Idiopathic PD,Idiopathic PD,Case,Case
2802,BF-1003,Idiopathic PD,Idiopathic PD,Case,Case
2803,BF-1004,Idiopathic PD,Idiopathic PD,Case,Case
2804,BF-1006,Idiopathic PD,Idiopathic PD,Case,Case
2805,BF-1008,Idiopathic PD,Idiopathic PD,Case,Case
...,...,...,...,...,...
10767,PP-75512,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,Control
10768,PP-75520,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,Control
10769,PP-75547,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,Control
10770,PP-75550,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,Control


### MERGE DATASETS AT VISIT 0

In [192]:
#Merge all the different datasets on participant_id with how="outer" especified to keep all the participants_id. Not only those that are in common between the datasets (this would be the default option)

moca_medicalhistory = pd.merge(moca_m0,medical_history_m0,on="participant_id",how="outer")
moca_medicalhistory_demographics = pd.merge(moca_medicalhistory,demographics_m0,on="participant_id",how="outer")
clinical_data = pd.merge(moca_medicalhistory_demographics,case_control_PD_HC_M0_latest,on="participant_id",how="outer")

In [193]:
clinical_data.count()

participant_id                    10782
GUID_x                             2387
visit_name_x                       3362
visit_month_x                      3362
moca_total_score                   3361
GUID_y                             3611
visit_name_y                       9639
visit_month_y                      9639
diagnosis                          9639
initial_diagnosis                    16
most_recent_diagnosis                13
age_at_diagnosis                   2345
GUID                               3612
visit_name                        10782
visit_month                       10782
age_at_baseline                   10782
sex                               10782
ethnicity                          6177
race                              10763
education_level_years              6196
diagnosis_at_baseline              7849
diagnosis_latest                   7849
case_control_other_at_baseline     7849
case_control_other_latest          7849
dtype: int64

In [194]:
clinical_data['age_at_baseline'] = clinical_data['age_at_baseline'].astype(float)

In [195]:
#Create new variable age_at_diagnosis - age_at_baseline and add it as a covariate
clinical_data['disease_duration'] = clinical_data['age_at_baseline'] - clinical_data['age_at_diagnosis'] 

In [196]:
clinical_data

Unnamed: 0,participant_id,GUID_x,visit_name_x,visit_month_x,moca_total_score,GUID_y,visit_name_y,visit_month_y,diagnosis,initial_diagnosis,...,age_at_baseline,sex,ethnicity,race,education_level_years,diagnosis_at_baseline,diagnosis_latest,case_control_other_at_baseline,case_control_other_latest,disease_duration
0,BF-1001,PDNW781VHY,M0,0.0,28.0,PDNW781VHY,M0,0.0,No PD Nor Other Neurological Disorder,,...,55.0,Male,Not Hispanic or Latino,White,12-16 years,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,Control,
1,BF-1002,PDCB969UGG,M0,0.0,29.0,PDCB969UGG,M0,0.0,Idiopathic PD,,...,66.0,Female,Not Hispanic or Latino,White,12-16 years,Idiopathic PD,Idiopathic PD,Case,Case,5.0
2,BF-1003,PDLW805AHT,M0,0.0,30.0,PDLW805AHT,M0,0.0,Idiopathic PD,,...,61.0,Male,Not Hispanic or Latino,White,12-16 years,Idiopathic PD,Idiopathic PD,Case,Case,5.0
3,BF-1004,PDKW284DYW,M0,0.0,28.0,PDKW284DYW,M0,0.0,Idiopathic PD,,...,62.0,Male,Not Hispanic or Latino,White,12-16 years,Idiopathic PD,Idiopathic PD,Case,Case,7.0
4,BF-1005,PDTM274KX6,M0,0.0,27.0,PDTM274KX6,M0,0.0,No PD Nor Other Neurological Disorder,,...,61.0,Female,Not Hispanic or Latino,White,12-16 years,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,Control,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10777,PP-75537,,,,,,,,,,...,37.0,Female,Not Hispanic or Latino,White,12-16 years,Idiopathic PD,Idiopathic PD,Case,Case,
10778,PP-75547,,,,,,,,,,...,52.0,Male,Not Hispanic or Latino,White,Greater than 16 years,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,Control,
10779,PP-75550,,,,,,,,,,...,69.0,Male,Not Hispanic or Latino,White,Greater than 16 years,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,Control,
10780,PP-75564,,,,,,,,,,...,77.0,Female,Not Hispanic or Latino,White,12-16 years,Prodromal motor PD,Idiopathic PD,Other,Case,


## MERGE WITH TRANSCRIPTOME DATA

In [197]:
#Keep baseline visits with visit_month==0.0 and visit_month==0.5
rnaseq_m0_05 = rnaseq[(rnaseq.visit_month==0.0) | (rnaseq.visit_month==0.5)]
rnaseq_m0_05.participant_id.count()

3196

In [198]:
rnaseq_m0_05.shape

(3196, 3)

In [199]:
clinical_data.shape

(10782, 25)

In [200]:
#Merge both datasets on participant_id
clinical_rnaseq_data = pd.merge(clinical_data,rnaseq_m0_05, on="participant_id")

  


In [201]:
clinical_rnaseq_data.diagnosis.value_counts()

No PD Nor Other Neurological Disorder    815
Parkinson's Disease                      800
Idiopathic PD                            506
Progressive Supranuclear Palsy            55
Multiple System Atrophy                   54
Prodromal non-motor PD                    51
Essential Tremor                          28
Other Neurological Disorder(s)            15
Prodromal motor PD                         7
Corticobasal Degeneration                  5
Dementia With Lewy Bodies                  5
Parkinsonism                               2
Olivopontocerebellar Atrophy               1
Fahr's Syndrome                            1
Name: diagnosis, dtype: int64

In [202]:
clinical_rnaseq_data.count()

participant_id                    3196
GUID_x                            1640
visit_name_x                      2030
visit_month_x                     2030
moca_total_score                  2030
GUID_y                            1673
visit_name_y                      2345
visit_month_y                     2345
diagnosis                         2345
initial_diagnosis                   12
most_recent_diagnosis               12
age_at_diagnosis                  1075
GUID                              1674
visit_name                        3196
visit_month_x                     3196
age_at_baseline                   3196
sex                               3196
ethnicity                         3196
race                              3196
education_level_years             3196
diagnosis_at_baseline             2923
diagnosis_latest                  2923
case_control_other_at_baseline    2923
case_control_other_latest         2923
disease_duration                  1075
sample_id                

In [203]:
clinical_rnaseq_data.shape

(3196, 27)

In [204]:
clinical_rnaseq_data.diagnosis.value_counts()

No PD Nor Other Neurological Disorder    815
Parkinson's Disease                      800
Idiopathic PD                            506
Progressive Supranuclear Palsy            55
Multiple System Atrophy                   54
Prodromal non-motor PD                    51
Essential Tremor                          28
Other Neurological Disorder(s)            15
Prodromal motor PD                         7
Corticobasal Degeneration                  5
Dementia With Lewy Bodies                  5
Parkinsonism                               2
Olivopontocerebellar Atrophy               1
Fahr's Syndrome                            1
Name: diagnosis, dtype: int64

In [205]:
#Keep only those with No PD Nor Other Neurological Disorder, Parkinson's Disease and Idiopathic PD
clinical_rnaseq_data = clinical_rnaseq_data.loc[(clinical_rnaseq_data['diagnosis']=="No PD Nor Other Neurological Disorder") | (clinical_rnaseq_data['diagnosis']=="Parkinson's Disease") | (clinical_rnaseq_data['diagnosis']=="Idiopathic PD")]

In [206]:
#we are left with just 2121 cases that are of interest, those left out the previous filter are diagnosed with other condition
clinical_rnaseq_data.shape

(2121, 27)

In [207]:
clinical_rnaseq_data.count()

participant_id                    2121
GUID_x                            1496
visit_name_x                      1871
visit_month_x                     1871
moca_total_score                  1871
GUID_y                            1520
visit_name_y                      2121
visit_month_y                     2121
diagnosis                         2121
initial_diagnosis                    1
most_recent_diagnosis                1
age_at_diagnosis                   926
GUID                              1520
visit_name                        2121
visit_month_x                     2121
age_at_baseline                   2121
sex                               2121
ethnicity                         2121
race                              2121
education_level_years             2121
diagnosis_at_baseline             2103
diagnosis_latest                  2103
case_control_other_at_baseline    2103
case_control_other_latest         2103
disease_duration                   926
sample_id                

In [208]:
#If we just keep the MOCA scale we would have 1226 participants with normal cognition
clinical_rnaseq_data[clinical_rnaseq_data.moca_total_score > 25.0].shape[0]

1226

In [209]:
#If we just keep the MOCA scale we would have 64 participants with dementia
clinical_rnaseq_data[clinical_rnaseq_data.moca_total_score < 19.0].shape[0]

64

In [210]:
#If we just keep the MOCA scale we would have 399 participants with MCI
clinical_rnaseq_data[(clinical_rnaseq_data.moca_total_score > 19.0) & (clinical_rnaseq_data.moca_total_score < 25.0)].shape[0]

399

In [211]:
#Now we want to know how many of these participants with dementia are PD patients and how many controls
clinical_rnaseq_data_dementia = clinical_rnaseq_data.loc[(clinical_rnaseq_data['moca_total_score']<19)]
clinical_rnaseq_data_dementia.diagnosis.value_counts()

Parkinson's Disease                      41
Idiopathic PD                            18
No PD Nor Other Neurological Disorder     5
Name: diagnosis, dtype: int64

In [212]:
#Now we want to know how many of these cognitive normal participants are PD patients and how many controls
clinical_rnaseq_data_cognitive_normal = clinical_rnaseq_data.loc[(clinical_rnaseq_data['moca_total_score']>25)]
clinical_rnaseq_data_cognitive_normal.diagnosis.value_counts()

No PD Nor Other Neurological Disorder    621
Parkinson's Disease                      426
Idiopathic PD                            179
Name: diagnosis, dtype: int64

In [213]:
#Now we want to know how many of these MCI participants are PD patients and how many controls
clinical_rnaseq_data_mci = clinical_rnaseq_data.loc[(clinical_rnaseq_data['moca_total_score']<25) & (clinical_rnaseq_data['moca_total_score']>19) ]
clinical_rnaseq_data_mci.diagnosis.value_counts()

Parkinson's Disease                      220
No PD Nor Other Neurological Disorder    115
Idiopathic PD                             64
Name: diagnosis, dtype: int64

In [214]:
plate_edited = plate[["sample_id","RIN_Value","Plate"]]

In [215]:
bases_edited = base[["participant_id","sample_id","PCT_USABLE_BASES"]]

In [216]:
strand_edited = strand[["participant_id","sample_id","STRAND_BALANCE"]]

In [217]:
#We now merge the final_dataset with the plate, STRAND_BALANCE, and PCT_USABLE_BASES
clinical_rnaseq_data_plate = pd.merge(clinical_rnaseq_data,plate_edited,on="sample_id")
clinical_rnaseq_data_plate_base = pd.merge(clinical_rnaseq_data_plate,bases_edited,on="sample_id")



In [218]:
sample_meta_uncleaned = pd.merge(clinical_rnaseq_data_plate_base,strand_edited,on="sample_id")

In [219]:
sample_meta_uncleaned.count()

participant_id_x                  2121
GUID_x                            1496
visit_name_x                      1871
visit_month_x                     1871
moca_total_score                  1871
GUID_y                            1520
visit_name_y                      2121
visit_month_y                     2121
diagnosis                         2121
initial_diagnosis                    1
most_recent_diagnosis                1
age_at_diagnosis                   926
GUID                              1520
visit_name                        2121
visit_month_x                     2121
age_at_baseline                   2121
sex                               2121
ethnicity                         2121
race                              2121
education_level_years             2121
diagnosis_at_baseline             2103
diagnosis_latest                  2103
case_control_other_at_baseline    2103
case_control_other_latest         2103
disease_duration                   926
sample_id                

In [220]:
#See column headers
sample_meta_uncleaned.columns

Index(['participant_id_x', 'GUID_x', 'visit_name_x', 'visit_month_x',
       'moca_total_score', 'GUID_y', 'visit_name_y', 'visit_month_y',
       'diagnosis', 'initial_diagnosis', 'most_recent_diagnosis',
       'age_at_diagnosis', 'GUID', 'visit_name', 'visit_month_x',
       'age_at_baseline', 'sex', 'ethnicity', 'race', 'education_level_years',
       'diagnosis_at_baseline', 'diagnosis_latest',
       'case_control_other_at_baseline', 'case_control_other_latest',
       'disease_duration', 'sample_id', 'visit_month_y', 'RIN_Value', 'Plate',
       'participant_id_y', 'PCT_USABLE_BASES', 'participant_id',
       'STRAND_BALANCE'],
      dtype='object')

In [221]:
#Clean data - keep all the columns we need
sample_meta = sample_meta_uncleaned[["participant_id_x", "GUID_x", "visit_month_x", "sample_id", "PCT_USABLE_BASES", "Plate", "STRAND_BALANCE", "diagnosis_at_baseline", "case_control_other_at_baseline", "case_control_other_latest", "age_at_baseline", "age_at_diagnosis", "disease_duration", "sex", "ethnicity", "race", "education_level_years", "moca_total_score", "diagnosis_latest"]]

In [222]:
#Check headers
sample_meta.count()

participant_id_x                  2121
GUID_x                            1496
visit_month_x                     1871
visit_month_x                     2121
sample_id                         2121
PCT_USABLE_BASES                  2121
Plate                             2121
STRAND_BALANCE                    2121
diagnosis_at_baseline             2103
case_control_other_at_baseline    2103
case_control_other_latest         2103
age_at_baseline                   2121
age_at_diagnosis                   926
disease_duration                   926
sex                               2121
ethnicity                         2121
race                              2121
education_level_years             2121
moca_total_score                  1871
diagnosis_latest                  2103
dtype: int64

In [223]:
#Check csv file
sample_meta.columns

Index(['participant_id_x', 'GUID_x', 'visit_month_x', 'visit_month_x',
       'sample_id', 'PCT_USABLE_BASES', 'Plate', 'STRAND_BALANCE',
       'diagnosis_at_baseline', 'case_control_other_at_baseline',
       'case_control_other_latest', 'age_at_baseline', 'age_at_diagnosis',
       'disease_duration', 'sex', 'ethnicity', 'race', 'education_level_years',
       'moca_total_score', 'diagnosis_latest'],
      dtype='object')

In [224]:
#Rename headers
sample_meta.columns.values[0] = 'participant_id'
sample_meta.columns.values[1] = 'GUID'
sample_meta.columns.values[2] = 'visit_month_1'
sample_meta.columns.values[3] = 'visit_month_2'

In [225]:
#Recheck headers
sample_meta.count()

participant_id                    2121
GUID                              1496
visit_month_1                     1871
visit_month_2                     2121
sample_id                         2121
PCT_USABLE_BASES                  2121
Plate                             2121
STRAND_BALANCE                    2121
diagnosis_at_baseline             2103
case_control_other_at_baseline    2103
case_control_other_latest         2103
age_at_baseline                   2121
age_at_diagnosis                   926
disease_duration                   926
sex                               2121
ethnicity                         2121
race                              2121
education_level_years             2121
moca_total_score                  1871
diagnosis_latest                  2103
dtype: int64

In [226]:
#Select the GUID and visit_month that we need
sample_meta = sample_meta[["participant_id", "GUID", "visit_month_2", "sample_id", "PCT_USABLE_BASES", "Plate", "STRAND_BALANCE", "diagnosis_at_baseline", "diagnosis_latest", "case_control_other_at_baseline", "case_control_other_latest", "age_at_baseline", "age_at_diagnosis", "disease_duration", "sex", "ethnicity", "race", "education_level_years", "moca_total_score"]]
sample_meta.count()

participant_id                    2121
GUID                              1496
visit_month_2                     2121
sample_id                         2121
PCT_USABLE_BASES                  2121
Plate                             2121
STRAND_BALANCE                    2121
diagnosis_at_baseline             2103
diagnosis_latest                  2103
case_control_other_at_baseline    2103
case_control_other_latest         2103
age_at_baseline                   2121
age_at_diagnosis                   926
disease_duration                   926
sex                               2121
ethnicity                         2121
race                              2121
education_level_years             2121
moca_total_score                  1871
dtype: int64

In [227]:
#Rename the columns as we need
sample_meta.columns.values[2] = 'visit_month'
sample_meta.count()

participant_id                    2121
GUID                              1496
visit_month                       2121
sample_id                         2121
PCT_USABLE_BASES                  2121
Plate                             2121
STRAND_BALANCE                    2121
diagnosis_at_baseline             2103
diagnosis_latest                  2103
case_control_other_at_baseline    2103
case_control_other_latest         2103
age_at_baseline                   2121
age_at_diagnosis                   926
disease_duration                   926
sex                               2121
ethnicity                         2121
race                              2121
education_level_years             2121
moca_total_score                  1871
dtype: int64

In [228]:
sample_meta.shape

(2121, 19)

In [229]:
sample_meta.moca_total_score.value_counts()

28.0    284
27.0    276
29.0    254
26.0    236
30.0    164
25.0    162
24.0    153
23.0    100
22.0     75
21.0     39
20.0     32
18.0     28
19.0     20
31.0     12
17.0     11
16.0      6
12.0      4
10.0      3
15.0      3
9.0       2
14.0      2
13.0      2
8.0       1
6.0       1
11.0      1
Name: moca_total_score, dtype: int64

In [230]:
sample_meta.case_control_other_latest.value_counts()

Case       1294
Control     809
Name: case_control_other_latest, dtype: int64

In [231]:
#Define Cognition phenotype variable - assign PDnoCI 
sample_meta.loc[(sample_meta['moca_total_score'] >25) & (sample_meta['case_control_other_latest'] == "Case"), 'Cognition_phenotype'] = 'PDnoCI'
sample_meta.loc[(sample_meta['moca_total_score'] <19) & (sample_meta['case_control_other_latest'] == "Case"), 'Cognition_phenotype'] = 'PDD'
sample_meta.loc[(sample_meta['moca_total_score'] > 25) & (sample_meta['case_control_other_latest'] == "Control"), 'Cognition_phenotype'] = 'HCnoCI'
sample_meta.loc[(sample_meta['moca_total_score'] < 19) & (sample_meta['case_control_other_latest'] == "Control"), 'Cognition_phenotype'] = 'HCD'
sample_meta.loc[(sample_meta['moca_total_score'] >19) & (sample_meta['moca_total_score'] <25) & (sample_meta['case_control_other_latest'] == "Control"), 'Cognition_phenotype'] = 'HCMCI'
sample_meta.loc[(sample_meta['moca_total_score'] >19) & (sample_meta['moca_total_score'] <25) & (sample_meta['case_control_other_latest'] == "Case"), 'Cognition_phenotype'] = 'PDMCI'

In [232]:
sample_meta.count()

participant_id                    2121
GUID                              1496
visit_month                       2121
sample_id                         2121
PCT_USABLE_BASES                  2121
Plate                             2121
STRAND_BALANCE                    2121
diagnosis_at_baseline             2103
diagnosis_latest                  2103
case_control_other_at_baseline    2103
case_control_other_latest         2103
age_at_baseline                   2121
age_at_diagnosis                   926
disease_duration                   926
sex                               2121
ethnicity                         2121
race                              2121
education_level_years             2121
moca_total_score                  1871
Cognition_phenotype               1678
dtype: int64

In [233]:
#Check new variable counts
sample_meta.Cognition_phenotype.value_counts()

HCnoCI    618
PDnoCI    602
PDMCI     282
HCMCI     114
PDD        57
HCD         5
Name: Cognition_phenotype, dtype: int64

In [234]:
sample_meta.shape
#still having 2121 individuals

(2121, 20)

In [235]:
#Drop the rows with NA in the cognition phenotype variable
sample_meta.dropna(subset=["Cognition_phenotype"], how = "all", inplace=True)
#This is important because here we drop around 500 individuals
#this happended because we defined cognitition phenotype based on 2 variables
#MOCA (only 1800 values available) and Case_control_other_latest (2100 values available)
#So individuals that contain info for those 2 variables are kept

In [236]:
#Check new variable counts
sample_meta.Cognition_phenotype.value_counts()

HCnoCI    618
PDnoCI    602
PDMCI     282
HCMCI     114
PDD        57
HCD         5
Name: Cognition_phenotype, dtype: int64

In [237]:
sample_meta.shape

(1678, 20)

In [238]:
#Change the age variable from continuous to range
sample_meta.loc[sample_meta['age_at_baseline'] <55, 'age_baseline_range'] = 'under_55'
sample_meta.loc[(sample_meta['age_at_baseline'] >54) & (sample_meta['age_at_baseline'] < 66) , 'age_baseline_range'] = '55_to_65'
sample_meta.loc[sample_meta['age_at_baseline']>65, 'age_baseline_range'] = 'above_65'

In [239]:
sample_meta.head()

Unnamed: 0,participant_id,GUID,visit_month,sample_id,PCT_USABLE_BASES,Plate,STRAND_BALANCE,diagnosis_at_baseline,diagnosis_latest,case_control_other_at_baseline,...,age_at_baseline,age_at_diagnosis,disease_duration,sex,ethnicity,race,education_level_years,moca_total_score,Cognition_phenotype,age_baseline_range
0,BF-1001,PDNW781VHY,0,BF-1001-SVM0_5T1,0.379947,P341,0.628003,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,...,55.0,,,Male,Not Hispanic or Latino,White,12-16 years,28.0,HCnoCI,55_to_65
1,BF-1002,PDCB969UGG,0,BF-1002-SVM0_5T1,0.204625,P341,0.750852,Idiopathic PD,Idiopathic PD,Case,...,66.0,61.0,5.0,Female,Not Hispanic or Latino,White,12-16 years,29.0,PDnoCI,above_65
2,BF-1003,PDLW805AHT,0,BF-1003-SVM0_5T1,0.251916,P341,0.714203,Idiopathic PD,Idiopathic PD,Case,...,61.0,56.0,5.0,Male,Not Hispanic or Latino,White,12-16 years,30.0,PDnoCI,55_to_65
3,BF-1004,PDKW284DYW,0,BF-1004-SVM0_5T1,0.269126,P341,0.675027,Idiopathic PD,Idiopathic PD,Case,...,62.0,55.0,7.0,Male,Not Hispanic or Latino,White,12-16 years,28.0,PDnoCI,55_to_65
4,BF-1005,PDTM274KX6,0,BF-1005-SVM0_5T1,0.319619,P341,0.595464,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,...,61.0,,,Female,Not Hispanic or Latino,White,12-16 years,27.0,HCnoCI,55_to_65


In [240]:
sample_meta.age_baseline_range.value_counts()

above_65    763
55_to_65    530
under_55    385
Name: age_baseline_range, dtype: int64

In [241]:
sample_meta.count()

participant_id                    1678
GUID                              1333
visit_month                       1678
sample_id                         1678
PCT_USABLE_BASES                  1678
Plate                             1678
STRAND_BALANCE                    1678
diagnosis_at_baseline             1678
diagnosis_latest                  1678
case_control_other_at_baseline    1678
case_control_other_latest         1678
age_at_baseline                   1678
age_at_diagnosis                   800
disease_duration                   800
sex                               1678
ethnicity                         1678
race                              1678
education_level_years             1678
moca_total_score                  1678
Cognition_phenotype               1678
age_baseline_range                1678
dtype: int64

In [242]:
sample_meta_white = sample_meta.loc[sample_meta['race'] =="White"]

In [243]:
sample_meta_white.Cognition_phenotype.value_counts()

HCnoCI    571
PDnoCI    570
PDMCI     268
HCMCI     100
PDD        50
HCD         4
Name: Cognition_phenotype, dtype: int64

In [244]:
sample_meta.Cognition_phenotype.value_counts()

HCnoCI    618
PDnoCI    602
PDMCI     282
HCMCI     114
PDD        57
HCD         5
Name: Cognition_phenotype, dtype: int64

In [245]:
sample_meta_white['participant_id'].duplicated().any()

False

In [246]:
participant_id_M0 = sample_meta_white.participant_id

In [247]:
#participant_id_M0.to_csv(f'{WORKSPACE_BUCKET}/Participant_id_M0.csv', header="participant_id")

In [248]:
#sample_meta_white.to_csv(f'{WORKSPACE_BUCKET}/sample_meta_white_final_PDD_PDMCI_PDNC.csv')

In [249]:
sample_meta_white.head()


Unnamed: 0,participant_id,GUID,visit_month,sample_id,PCT_USABLE_BASES,Plate,STRAND_BALANCE,diagnosis_at_baseline,diagnosis_latest,case_control_other_at_baseline,...,age_at_baseline,age_at_diagnosis,disease_duration,sex,ethnicity,race,education_level_years,moca_total_score,Cognition_phenotype,age_baseline_range
0,BF-1001,PDNW781VHY,0,BF-1001-SVM0_5T1,0.379947,P341,0.628003,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,...,55.0,,,Male,Not Hispanic or Latino,White,12-16 years,28.0,HCnoCI,55_to_65
1,BF-1002,PDCB969UGG,0,BF-1002-SVM0_5T1,0.204625,P341,0.750852,Idiopathic PD,Idiopathic PD,Case,...,66.0,61.0,5.0,Female,Not Hispanic or Latino,White,12-16 years,29.0,PDnoCI,above_65
2,BF-1003,PDLW805AHT,0,BF-1003-SVM0_5T1,0.251916,P341,0.714203,Idiopathic PD,Idiopathic PD,Case,...,61.0,56.0,5.0,Male,Not Hispanic or Latino,White,12-16 years,30.0,PDnoCI,55_to_65
3,BF-1004,PDKW284DYW,0,BF-1004-SVM0_5T1,0.269126,P341,0.675027,Idiopathic PD,Idiopathic PD,Case,...,62.0,55.0,7.0,Male,Not Hispanic or Latino,White,12-16 years,28.0,PDnoCI,55_to_65
4,BF-1005,PDTM274KX6,0,BF-1005-SVM0_5T1,0.319619,P341,0.595464,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,...,61.0,,,Female,Not Hispanic or Latino,White,12-16 years,27.0,HCnoCI,55_to_65


In [250]:
sample_meta_white[['belonging_cohort', 'ID_number']] = sample_meta_white['participant_id'].str.split('-', 1, expand=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [251]:
sample_meta_white.belonging_cohort.value_counts()

PD    1068
PP     319
BF     176
Name: belonging_cohort, dtype: int64

In [252]:
sample_meta_white.head()

Unnamed: 0,participant_id,GUID,visit_month,sample_id,PCT_USABLE_BASES,Plate,STRAND_BALANCE,diagnosis_at_baseline,diagnosis_latest,case_control_other_at_baseline,...,disease_duration,sex,ethnicity,race,education_level_years,moca_total_score,Cognition_phenotype,age_baseline_range,belonging_cohort,ID_number
0,BF-1001,PDNW781VHY,0,BF-1001-SVM0_5T1,0.379947,P341,0.628003,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,...,,Male,Not Hispanic or Latino,White,12-16 years,28.0,HCnoCI,55_to_65,BF,1001
1,BF-1002,PDCB969UGG,0,BF-1002-SVM0_5T1,0.204625,P341,0.750852,Idiopathic PD,Idiopathic PD,Case,...,5.0,Female,Not Hispanic or Latino,White,12-16 years,29.0,PDnoCI,above_65,BF,1002
2,BF-1003,PDLW805AHT,0,BF-1003-SVM0_5T1,0.251916,P341,0.714203,Idiopathic PD,Idiopathic PD,Case,...,5.0,Male,Not Hispanic or Latino,White,12-16 years,30.0,PDnoCI,55_to_65,BF,1003
3,BF-1004,PDKW284DYW,0,BF-1004-SVM0_5T1,0.269126,P341,0.675027,Idiopathic PD,Idiopathic PD,Case,...,7.0,Male,Not Hispanic or Latino,White,12-16 years,28.0,PDnoCI,55_to_65,BF,1004
4,BF-1005,PDTM274KX6,0,BF-1005-SVM0_5T1,0.319619,P341,0.595464,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,...,,Female,Not Hispanic or Latino,White,12-16 years,27.0,HCnoCI,55_to_65,BF,1005


In [253]:
sample_meta_white_BF_cohort = sample_meta_white.loc[sample_meta_white.belonging_cohort=="BF"]

In [254]:
sample_meta_white_BF_cohort.shape

(176, 23)

In [255]:
sample_meta_white_BF_cohort.head()

Unnamed: 0,participant_id,GUID,visit_month,sample_id,PCT_USABLE_BASES,Plate,STRAND_BALANCE,diagnosis_at_baseline,diagnosis_latest,case_control_other_at_baseline,...,disease_duration,sex,ethnicity,race,education_level_years,moca_total_score,Cognition_phenotype,age_baseline_range,belonging_cohort,ID_number
0,BF-1001,PDNW781VHY,0,BF-1001-SVM0_5T1,0.379947,P341,0.628003,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,...,,Male,Not Hispanic or Latino,White,12-16 years,28.0,HCnoCI,55_to_65,BF,1001
1,BF-1002,PDCB969UGG,0,BF-1002-SVM0_5T1,0.204625,P341,0.750852,Idiopathic PD,Idiopathic PD,Case,...,5.0,Female,Not Hispanic or Latino,White,12-16 years,29.0,PDnoCI,above_65,BF,1002
2,BF-1003,PDLW805AHT,0,BF-1003-SVM0_5T1,0.251916,P341,0.714203,Idiopathic PD,Idiopathic PD,Case,...,5.0,Male,Not Hispanic or Latino,White,12-16 years,30.0,PDnoCI,55_to_65,BF,1003
3,BF-1004,PDKW284DYW,0,BF-1004-SVM0_5T1,0.269126,P341,0.675027,Idiopathic PD,Idiopathic PD,Case,...,7.0,Male,Not Hispanic or Latino,White,12-16 years,28.0,PDnoCI,55_to_65,BF,1004
4,BF-1005,PDTM274KX6,0,BF-1005-SVM0_5T1,0.319619,P341,0.595464,No PD Nor Other Neurological Disorder,No PD Nor Other Neurological Disorder,Control,...,,Female,Not Hispanic or Latino,White,12-16 years,27.0,HCnoCI,55_to_65,BF,1005


In [256]:
#sample_meta_white_BF_cohort.to_csv(f'{WORKSPACE_BUCKET}/sample_meta_white_BF_cohort_final_PDD_PDMCI_PDNC.csv')

In [257]:
sample_meta_white_BF_cohort.columns

Index(['participant_id', 'GUID', 'visit_month', 'sample_id',
       'PCT_USABLE_BASES', 'Plate', 'STRAND_BALANCE', 'diagnosis_at_baseline',
       'diagnosis_latest', 'case_control_other_at_baseline',
       'case_control_other_latest', 'age_at_baseline', 'age_at_diagnosis',
       'disease_duration', 'sex', 'ethnicity', 'race', 'education_level_years',
       'moca_total_score', 'Cognition_phenotype', 'age_baseline_range',
       'belonging_cohort', 'ID_number'],
      dtype='object')

In [258]:
sample_meta_white_BF_cohort.sex.value_counts()

Male      99
Female    77
Name: sex, dtype: int64

In [259]:
sample_meta_white_BF_cohort.Cognition_phenotype.value_counts()

PDnoCI    82
HCnoCI    75
PDMCI     15
PDD        2
HCMCI      2
Name: Cognition_phenotype, dtype: int64

In [260]:
# Define the condition
target_value = 'PDnoCI'
condition = sample_meta_white_BF_cohort['Cognition_phenotype'] == target_value

# Use boolean indexing to filter the DataFrame
subset = sample_meta_white_BF_cohort.loc[condition, 'age_at_baseline']

# Calculate basic statistics on the subset
mean_value = subset.mean()
median_value = subset.median()
std_dev_value = subset.std()

# Print the results
print(f"Mean: {mean_value}")
print(f"Median: {median_value}")
print(f"Standard Deviation: {std_dev_value}")


Mean: 67.65853658536585
Median: 68.0
Standard Deviation: 6.178855845573141


In [261]:
# Define the condition
target_value = 'PDMCI'
condition = sample_meta_white_BF_cohort['Cognition_phenotype'] == target_value

# Use boolean indexing to filter the DataFrame
subset = sample_meta_white_BF_cohort.loc[condition, 'age_at_baseline']

# Calculate basic statistics on the subset
mean_value = subset.mean()
median_value = subset.median()
std_dev_value = subset.std()

# Print the results
print(f"Mean: {mean_value}")
print(f"Median: {median_value}")
print(f"Standard Deviation: {std_dev_value}")

Mean: 72.33333333333333
Median: 72.0
Standard Deviation: 6.031662488978672


In [262]:
# Define the condition
target_value = 'PDD'
condition = sample_meta_white_BF_cohort['Cognition_phenotype'] == target_value

# Use boolean indexing to filter the DataFrame
subset = sample_meta_white_BF_cohort.loc[condition, 'age_at_baseline']

# Calculate basic statistics on the subset
mean_value = subset.mean()
median_value = subset.median()
std_dev_value = subset.std()

# Print the results
print(f"Mean: {mean_value}")
print(f"Median: {median_value}")
print(f"Standard Deviation: {std_dev_value}")

Mean: 71.5
Median: 71.5
Standard Deviation: 2.1213203435596424


In [269]:
sample_meta_white_BF_cohort_subsetting = sample_meta_white_BF_cohort.loc[sample_meta_white_BF_cohort.Cognition_phenotype=="PDMCI"]

In [270]:
sample_meta_white_BF_cohort_subsetting.sex.value_counts()

Male      11
Female     4
Name: sex, dtype: int64