# UKBiobank

* **Project:** ADRD Genetic Diversity in Biobanks
* **Version:** Python/3.9
* **Last Updated:** 22-August-2024

## Notebook Overview
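This notebook fetches the case and control cohorts, attaches ancestry information, removes related individuals, filters out participants without WGS data, combines pVCF chunks, normalizes and annotates the VCFs, computes allele frequencies, performs APOE genotyping, retrieves phenotype data, and examines resilience/protective variants.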

# Initialize Notebook

## Import packages

In [None]:
import pyspark
import dxdata
import dxpy
import pandas as pd
from datetime import date, datetime
import os 
import numpy as np
import random
import shutil
import glob
import requests
from functools import reduce

# Create a Spark context and wrap it in a SparkSession.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)


## Initialize variables

In [None]:
# Gene symbols of interest, listed alphabetically.
gene_names = [
    "APOE", "APP", "GBA1", "GBA1LP", "GRN",
    "MAPT", "PSEN1", "PSEN2", "SNCA", "TREM2",
]


## Initialize helper functions

In [None]:
def fetch_gene_info_ensembl(gene_names, species='human', genome_version='GRCh38'):
    """Look up genomic coordinates for gene symbols via the Ensembl REST API.

    Args:
        gene_names: Iterable of gene symbols to look up.
        species: Species name placed in the ``/lookup/symbol`` endpoint.
        genome_version: Assembly label stored with every result. ``GRCh37``
            (case-insensitive) is queried on Ensembl's dedicated GRCh37
            server so the coordinates match the label; any other value uses
            the default server, exactly as before.

    Returns:
        Dict mapping each successfully fetched input symbol to a dict with
        the keys ``gene_name``, ``chromosome`` (``chr``-prefixed), ``start``,
        ``end`` and ``genome_version``. Symbols whose lookup fails are
        reported with ``print`` and left out of the result.
    """
    # The default host answers with current-assembly coordinates; GRCh37 is
    # served from a separate host, so pick the host that matches the label.
    if str(genome_version).lower() == "grch37":
        server = "https://grch37.rest.ensembl.org"
    else:
        server = "https://rest.ensembl.org"

    # Loop-invariant request settings.
    headers = {"Content-Type": "application/json"}
    params = {"expand": "1"}

    gene_info_dict = {}

    for gene_name in gene_names:
        endpoint = f"/lookup/symbol/{species}/{gene_name}"

        try:
            # The timeout keeps an unresponsive server from hanging the notebook.
            response = requests.get(server + endpoint, headers=headers, params=params, timeout=30)
        except requests.RequestException as err:
            # Treat network errors like any other per-gene failure: report and move on.
            print(f"Fetching failed for {gene_name}: {err}")
            continue

        if not response.ok:
            print(f"Fetching failed for {gene_name}")
            continue

        data = response.json()
        gene_info_dict[gene_name] = {
            "gene_name": data.get("display_name", gene_name),
            "chromosome": f"chr{data['seq_region_name']}",
            "start": int(data["start"]),
            "end": int(data["end"]),
            "genome_version": genome_version
        }

    return gene_info_dict


# Fetch cohorts

## Grab the dataset containing participant information

In [None]:
# Find the single Dataset object in the project root whose name matches
# app*.dataset, load it, and keep a handle on its "participant" entity.
dispensed_dataset_id = dxpy.find_one_data_object(
    typename="Dataset",
    name="app*.dataset",
    name_mode="glob",
    folder="/",
)["id"]
dataset = dxdata.load_dataset(id=dispensed_dataset_id)
participant = dataset["participant"]


## Retrieve Cases

### AD + Non-AD Dementia Cohorts

#### Pull down the fields we need 
https://docs.google.com/document/d/1AebkQ-Nxrk63jhsDzZpn5QD-7EK4unsykHVj-saEm3U/edit?usp=sharing

In [None]:
# Fields retrieved for the case cohorts (order determines the column order).
field_names = [
    "eid",
    # sex, birth year, Townsend index, ethnicity, age at recruitment
    "p31", "p34", "p22189", "p22006", "p21022",
    # AD date, dementia date
    "p42020", "p42018",
    # genetic principal components 1-5
    "p22009_a1", "p22009_a2", "p22009_a3", "p22009_a4", "p22009_a5",
    # date of death
    "p40000_i0",
    # cognitive fields
    "p120042",
    "p26302_i2", "p26302_i3",
    "p21625_i2", "p21625_i3",
    "p62_i0", "p62_i1", "p62_i2",
    # currently not retrieved:
    #"p32104", "p32105", "p32106", "p32107", "p32108", "p32109",
]

# Pull the fields with coded values replaced, then convert the result to pandas.
cases_df = participant.retrieve_fields(
    names=field_names,
    coding_values="replace",
    engine=dxdata.connect(),
).toPandas()


#### Rename columns to be human-readable

In [None]:
cases_df = cases_df.rename(columns={
    'eid':'ID',
    'p31':'GENETIC_SEX', 
    'p34':'BIRTH_YEAR', 
    'p22189':'TOWNSEND', 
    'p22006':'ETHNICITY', 
    'p21022':'AGE_OF_RECRUIT',
    'p42020':'AD_DATE',
    'p42018':'DEM_DATE',
    'p22009_a1':'PC1',
    'p22009_a2':'PC2',
    'p22009_a3':'PC3',
    'p22009_a4':'PC4',
    'p22009_a5':'PC5',
    'p40000_i0':'DATE_OF_DEATH',
    "p120042":"COGNITIVE_SYMPTOMS_SEVERITY_PAST_WEEK",
    "p26302_i2":"SPECIFIC_COGNITIVE_ABILITY_2014",
    "p26302_i3":"SPECIFIC_COGNITIVE_ABILITY_2019",
    "p21625_i2":"TOUCHSCREEN_COGNITIVE_DURATION_2014",
    "p21625_i3":"TOUCHSCREEN_COGNITIVE_DURATION_2019",
    "p62_i0":"COGNITIVE_TEST_WILLINGNESS",
    "p62_i1":"COGNITIVE_TEST_WILLINGNESS",
    "p62_i2":"COGNITIVE_TEST_WILLINGNESS",
    #"p32104":"FORGETFULNESS_PAST_WEEK",
    #"p32105":"POOR_CONCENTRATION_PAST_WEEK",
    #"p32106":"TROUBLE_EXPRESSING_THOUGHTS_PAST_WEEK",
    #"p32107":"TROUBLE_FINDING_RIGHT_WORD_PAST_WEEK",
    #"p32108":"SLOW_THINKING_SPEED_PAST_WEEK",
    #"p32109":"TROUBLE_SOLVING_PROBLEMS_PAST_WEEK",
})


#### Find participants with AD and RD

In [None]:
# AD:
ad_df = cases_df[~cases_df[f'AD_DATE'].isna()]
ad_df = ad_df[[
    'ID', 'GENETIC_SEX', 'BIRTH_YEAR', 'TOWNSEND', 'ETHNICITY', 'AGE_OF_RECRUIT', 
    f'AD_DATE', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'DATE_OF_DEATH', 
    "COGNITIVE_SYMPTOMS_SEVERITY_PAST_WEEK", "SPECIFIC_COGNITIVE_ABILITY_2014",
    "SPECIFIC_COGNITIVE_ABILITY_2019", "TOUCHSCREEN_COGNITIVE_DURATION_2014",
    "TOUCHSCREEN_COGNITIVE_DURATION_2019", "COGNITIVE_TEST_WILLINGNESS",
    "COGNITIVE_TEST_WILLINGNESS", "COGNITIVE_TEST_WILLINGNESS",
]]
ad_df["ID"] = pd.to_numeric(ad_df["ID"])
    
# RD:
rd_df = cases_df[cases_df['AD_DATE'].isna() & ~cases_df['DEM_DATE'].isna()]
rd_df = rd_df[[
    'ID', 'GENETIC_SEX', 'BIRTH_YEAR', 'TOWNSEND', 'ETHNICITY', 'AGE_OF_RECRUIT', 
    'DEM_DATE', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'DATE_OF_DEATH', 
    "COGNITIVE_SYMPTOMS_SEVERITY_PAST_WEEK", "SPECIFIC_COGNITIVE_ABILITY_2014",
    "SPECIFIC_COGNITIVE_ABILITY_2019", "TOUCHSCREEN_COGNITIVE_DURATION_2014",
    "TOUCHSCREEN_COGNITIVE_DURATION_2019", "COGNITIVE_TEST_WILLINGNESS",
    "COGNITIVE_TEST_WILLINGNESS", "COGNITIVE_TEST_WILLINGNESS",
]]
rd_df["ID"] = pd.to_numeric(rd_df["ID"])


## Retrieve Controls

### Age 65+, no NDD, no parents with AD or PD

#### Retrieve field names of interest for each participant

In [None]:
# Date G10 first reported (huntington's disease),
# Date G11 first reported (hereditary ataxia), 
# Date G12 first reported (spinal muscular atrophy and related syndromes), 
# Date G13 first reported (systemic atrophies primarily affecting central nervous system in diseases classified elsewhere), 
# Date G14 first reported (postpolio syndrome), 
# Date G20 first reported (parkinson's disease), 
# Date G21 first reported (secondary parkinsonism), 
# Date G22 first reported (parkinsonism in diseases classified elsewhere), 
# Date G23 first reported (other degenerative diseases of basal ganglia), 
# Date G24 first reported (dystonia), 
# Date G25 first reported (other extrapyramidal and movement disorders), 
# Date G30 first reported (alzheimer's disease), 
# Date G31 first reported (other degenerative diseases of nervous system, not elsewhere classified), 
# Date G32 first reported (other degenerative disorders of nervous system in diseases classified elsewhere), 
# Date G35 first reported (multiple sclerosis), 
# Date G36 first reported (other acute disseminated demyelination), 
# Date G37 first reported (other demyelinating diseases of central nervous system), 
# Date G45 first reported (transient cerebral ischaemic attacks and related syndromes), 
# Date G46 first reported (vascular syndromes of brain in cerebrovascular diseases), 
# Date G50 first reported (disorders of trigeminal nerve), 
# Date G52 first reported (disorders of other cranial nerves), 
# Date G53 first reported (cranial nerve disorders in diseases classified elsewhere), 
# Date G54 first reported (nerve root and plexus disorders), 
# Date G55 first reported (nerve root and plexus compressions in diseases classified elsewhere), 
# Date G56 first reported (mononeuropathies of upper limb), 
# Date G57 first reported (mononeuropathies of lower limb), 
# Date G58 first reported (other mononeuropathies), 
# Date G59 first reported (mononeuropathy in diseases classified elsewhere), 
# Date G60 first reported (hereditary and idiopathic neuropathy), 
# Date G61 first reported (inflammatory polyneuropathy), 
# Date G62 first reported (other polyneuropathies), 
# Date G63 first reported (polyneuropathy in diseases classified elsewhere), 
# Date G64 first reported (other disorders of peripheral nervous system), 
# Date G70 first reported (myasthenia gravis and other myoneural disorders), 
# Date G71 first reported (primary disorders of muscles), 
# Date G72 first reported (other myopathies), 
# Date G73 first reported (disorders of myoneural junction and muscle in diseases classified elsewhere), 
# Date G80 first reported (infantile cerebral palsy), 
# Date G81 first reported (hemiplegia), 
# Date G82 first reported (paraplegia and tetraplegia), 
# Date G83 first reported (other paralytic syndromes), 
# Date G90 first reported (disorders of autonomic nervous system),
# Date G91 first reported (hydrocephalus), 
# Date G92 first reported (toxic encephalopathy), 
# Date G93 first reported (other disorders of brain), 
# Date G94 first reported (other disorders of brain in diseases classified elsewhere), 
# Date G96 first reported (other disorders of central nervous system), 
# Date G97 first reported (postprocedural disorders of nervous system, not elsewhere classified),  
# Date G98 first reported (other disorders of nervous system, not elsewhere classified), 
# Date G99 first reported (other disorders of nervous system in diseases classified elsewhere), 
# Date of all cause dementia report, 
# Date of alzheimer's disease report, 
# Date of vascular dementia report, 
# Date of frontotemporal dementia report, 
# Date of motor neurone disease report, 
# Date of all cause parkinsonism report, 
# Date of parkinson's disease report, 
# Date of progressive supranuclear palsy report, 
# Date of multiple system atrophy report, 
# Genetic ethnic grouping, 
# Age at recruitment, 
# Townsend deprivation index at recruitment, 
# Sex, 
# Genetic Principal components | Array 1, 
# Genetic Principal components | Array 2, 
# Genetic Principal components | Array 3, 
# Genetic Principal components | Array 4, 
# Genetic Principal components | Array 5

# Fields retrieved for the control cohort (order determines the column order).
# NOTE(review): the comment list above names 50 "Date Gxx first reported"
# entries, but only 46 p1310xx fields are requested here — confirm which
# codes are actually covered.
field_names = [
    'eid',
    # p1310xx fields
    'p131012', 'p131016', 'p131018', 'p131020', 'p131022', 'p131024', 'p131026', 'p131028',
    'p131030', 'p131036', 'p131038', 'p131040', 'p131042', 'p131046', 'p131056', 'p131058',
    'p131062', 'p131066', 'p131068', 'p131070', 'p131074', 'p131076', 'p131078', 'p131080',
    'p131082', 'p131084', 'p131086', 'p131088', 'p131090', 'p131092', 'p131094', 'p131096',
    'p131098', 'p131100', 'p131102', 'p131104', 'p131106', 'p131108', 'p131110', 'p131112',
    'p131114', 'p131116', 'p131120', 'p131122', 'p131124', 'p131126',
    # p420xx fields
    'p42018', 'p42020', 'p42022', 'p42024', 'p42028', 'p42030', 'p42032', 'p42034', 'p42036',
    # covariates
    'p22006', 'p21022', 'p22189', 'p31',
    'p22009_a1', 'p22009_a2', 'p22009_a3', 'p22009_a4', 'p22009_a5',
    'p34', 'p40000_i0',
    # parent illness fields
    'p20110_i0', 'p20110_i1', 'p20110_i2', 'p20110_i3',
    'p20107_i0', 'p20107_i1', 'p20107_i2', 'p20107_i3',
]

# Pull the fields with coded values replaced, then convert the result to pandas.
control_df = participant.retrieve_fields(
    names=field_names,
    coding_values="replace",
    engine=dxdata.connect(),
).toPandas()


#### Remove participants with any of the listed conditions

In [None]:
# Keep only participants for whom every one of these date fields is null.
control_df = control_df[
    control_df[[
        'p131012', 'p131016', 'p131018', 'p131020', 'p131022', 'p131024', 'p131026', 'p131028',
        'p131030', 'p131036', 'p131038', 'p131040', 'p131042', 'p131046', 'p131056', 'p131058',
        'p131062', 'p131066', 'p131068', 'p131070', 'p131074', 'p131076', 'p131078', 'p131080',
        'p131082', 'p131084', 'p131086', 'p131088', 'p131090', 'p131092', 'p131094', 'p131096',
        'p131098', 'p131100', 'p131102', 'p131104', 'p131106', 'p131108', 'p131110', 'p131112',
        'p131114', 'p131116', 'p131120', 'p131122', 'p131124', 'p131126',
        'p42018', 'p42020', 'p42022', 'p42024', 'p42028', 'p42030', 'p42032', 'p42034', 'p42036',
    ]].isnull().all(axis=1)
]


#### Remove participants whose parents have AD or PD

In [None]:
# Columns defining all instances of parent illness
parent_illness_cols = ['p20110_i0', 'p20110_i1', 'p20110_i2', 'p20110_i3', 'p20107_i0', 'p20107_i1', 'p20107_i2', 'p20107_i3']

# control_df comes from a boolean filter; take an explicit copy so the column
# assignments below do not raise SettingWithCopyWarning.
control_df = control_df.copy()

# Replace anything that is not a list (e.g. None) with an empty list
for illness_col in parent_illness_cols:
    control_df[illness_col] = control_df[illness_col].apply(lambda l: l if isinstance(l, list) else [])


def condition(row):
    """Return True when no parent-illness column of *row* lists AD/dementia or PD."""
    return all(
        "Alzheimer's disease/dementia" not in illnesses and "Parkinson's disease" not in illnesses
        for illnesses in row[parent_illness_cols]
    )


# Keep only participants who have never reported a parent with AD or PD
control_df = control_df[control_df.apply(condition, axis=1)]


#### Remove participants below the defined age threshold

In [None]:
# Keep participants whose p21022 value is at least 65.
control_df = control_df.loc[control_df['p21022'].ge(65)]


#### Rename columns

In [None]:
# Keep only the covariate columns. The explicit copy and the non-inplace
# rename give an independent frame, so assigning the ID column below does not
# raise SettingWithCopyWarning.
control_df = control_df[[
    'eid', 'p21022', 'p22189', 'p31',
    'p22009_a1', 'p22009_a2', 'p22009_a3', 'p22009_a4', 'p22009_a5',
    'p34', 'p22006', 'p40000_i0',
]].copy()
control_df = control_df.rename(columns={
    'eid':'ID',
    'p21022':'AGE_OF_RECRUIT', 
    'p22189':'TOWNSEND', 
    'p31':'GENETIC_SEX', 
    'p22009_a1':'PC1', 
    'p22009_a2':'PC2', 
    'p22009_a3':'PC3', 
    'p22009_a4':'PC4', 
    'p22009_a5':'PC5', 
    'p34':'BIRTH_YEAR', 
    'p22006':'ETHNICITY', 
    'p40000_i0':'DATE_OF_DEATH',
})
# IDs are converted to numeric so they can be compared with other numeric ID columns.
control_df["ID"] = pd.to_numeric(control_df["ID"])
control_df.info()


# Find ancestry information about each cohort

#### Read ancestry label mappings

In [None]:
# Tab-separated table of ancestry labels (read_table defaults to a tab separator).
ancestries = pd.read_table(
    "../../mnt/project/wgs_analysis/data/ukbb_imputed_genotypes_umap_linearsvc_predicted_labels.txt"
)


#### Add labels to cohort dataframes

In [None]:
# Attach the ancestry label to every cohort by joining ID against IID; the
# join key from the ancestry table is dropped afterwards.
control_df, ad_df, rd_df = (
    cohort.merge(ancestries[["IID", "label"]], left_on="ID", right_on="IID").drop(columns="IID")
    for cohort in (control_df, ad_df, rd_df)
)


#### Get list of IDs for each cohort

In [None]:
# Participant IDs of each cohort as plain Python lists.
ad_ids, rd_ids, control_ids = (
    cohort["ID"].tolist() for cohort in (ad_df, rd_df, control_df)
)


# Remove related individuals

#### Fetch relatedness data

In [None]:
# Load the space-separated relatedness table and keep only the pairs whose
# Kinship value exceeds 0.0884.
# NOTE(review): 0.0884 is presumably the kinship cutoff for close relatives — confirm.
full_related_df = pd.read_csv(
    '../../mnt/project/Bulk/Genotype Results/Genotype calls/ukb_rel.dat',
    sep=' ',
).loc[lambda pairs: pairs['Kinship'] > 0.0884]


#### Define cohorts to maximize cases included

In [None]:
# Cases are AD followed by RD; the full cohort is the cases followed by the controls.
case_ids = ad_ids + rd_ids
full_cohort_ids = case_ids + control_ids


#### Keep only rows with both participants in cohorts of interest

In [None]:
# Keep the related pairs in which both members belong to the cohorts, then
# renumber the rows from zero.
related_cohort_df = (
    full_related_df[full_related_df[['ID1', 'ID2']].isin(full_cohort_ids).all(axis=1)]
    .reset_index(drop=True)
)


#### Maximize the number of cases included

In [None]:
# Pairs listed as (control, case) are swapped so the control ends up in ID2,
# the column whose IDs are removed later; all other pairs stay unchanged.
control_listed_first = related_cohort_df["ID1"].isin(control_ids) & related_cohort_df["ID2"].isin(case_ids)
flipped_df = related_cohort_df[control_listed_first].rename(columns={"ID1": "ID2", "ID2": "ID1"})
related_cohort_df = pd.concat([related_cohort_df[~control_listed_first], flipped_df])


#### Get set of participants to remove

In [None]:
# Every ID2 of a related pair is dropped; a set gives fast membership tests.
ids_to_remove = set(related_cohort_df["ID2"].tolist())
print("Removing", len(ids_to_remove), "participants")


#### Filter ID lists accordingly

In [None]:
# Drop the removed participants from each cohort's ID list (order preserved),
# then rebuild the combined list.
ad_ids, rd_ids, control_ids = (
    [iid for iid in cohort_ids if iid not in ids_to_remove]
    for cohort_ids in (ad_ids, rd_ids, control_ids)
)
total_ids = ad_ids + rd_ids + control_ids


#### Save the IDs of each participant to a txt file

In [None]:
# One AD participant ID per line.
with open('ad_ids_pre_VCF.txt', 'w') as file:
    file.writelines(f"{iid}\n" for iid in ad_ids)
        

In [None]:
# One RD participant ID per line.
with open('rd_ids_pre_VCF.txt', 'w') as file:
    file.writelines(f"{iid}\n" for iid in rd_ids)


In [None]:
# One control participant ID per line.
with open('control_ids_pre_VCF.txt', 'w') as file:
    file.writelines(f"{iid}\n" for iid in control_ids)


In [None]:
# One participant ID per line, all cohorts combined.
with open('ids_pre_VCF.txt', 'w') as file:
    file.writelines(f"{iid}\n" for iid in total_ids)


# Filter out participants without WGS data

In [None]:
%%bash

# Submit a swiss-army-knife job that lists the sample IDs of one chr1 pVCF
# chunk (bcftools query -l) into pvcf_full_ids.txt under /wgs_analysis/results.
# NOTE(review): ${projectid} is not set in this cell — confirm it is exported
# in the environment.
dx run swiss-army-knife \
-iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr1/ukb24310_c1_b7761_v1.vcf.gz" \
-iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr1/ukb24310_c1_b7761_v1.vcf.gz.tbi" \
-icmd="bcftools query -l ukb24310_c1_b7761_v1.vcf.gz > pvcf_full_ids.txt" \
--instance-type mem1_hdd1_v2_x16 \
--destination "${projectid}:/wgs_analysis/results"


In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
# Fetch the pVCF sample list, then keep from each cohort ID file only the
# lines that match an ID in that list (fixed-string, whole-word matching).
!dx download /wgs_analysis/results/pvcf_full_ids.txt
!grep -Fwf pvcf_full_ids.txt ids_pre_VCF.txt > filtered_sample_ids.txt
!grep -Fwf pvcf_full_ids.txt ad_ids_pre_VCF.txt > filtered_ad_ids.txt
!grep -Fwf pvcf_full_ids.txt rd_ids_pre_VCF.txt > filtered_rd_ids.txt
!grep -Fwf pvcf_full_ids.txt control_ids_pre_VCF.txt > filtered_control_ids.txt


In [None]:
# Reload each cohort's filtered ID file as a list of ints (one ID per line).
with open('filtered_ad_ids.txt') as file:
    ad_ids = list(map(int, map(str.strip, file)))
with open('filtered_rd_ids.txt') as file:
    rd_ids = list(map(int, map(str.strip, file)))
with open('filtered_control_ids.txt') as file:
    control_ids = list(map(int, map(str.strip, file)))


#### Restrict each cohort dataframe to the participants with WGS data

In [None]:
# Keep in each cohort frame only the rows whose ID is in the matching ID list.
ad_df, rd_df, control_df = (
    frame[frame["ID"].isin(kept_ids)]
    for frame, kept_ids in ((ad_df, ad_ids), (rd_df, rd_ids), (control_df, control_ids))
)


In [None]:
# Report the size of every cohort; the padded labels keep the counts aligned.
print("Number of AD participants:       ", len(ad_ids), sep="")
print("Number of RD participants:       ", len(rd_ids), sep="")
print("Number of Control participants:  ", len(control_ids), sep="")


In [None]:
# Upload the filtered ID files to /wgs_analysis/results under shorter names.
!dx upload filtered_sample_ids.txt --path /wgs_analysis/results/sample_ids.txt
!dx upload filtered_ad_ids.txt --path /wgs_analysis/results/ad_ids.txt
!dx upload filtered_rd_ids.txt --path /wgs_analysis/results/rd_ids.txt
!dx upload filtered_control_ids.txt --path /wgs_analysis/results/control_ids.txt


# Save and print cohort statistics

In [None]:
# Write each cohort to CSV with a header row and without the index column.
control_df.to_csv("Controls.csv", index=False)
ad_df.to_csv('AD_cases.csv', index=False)
rd_df.to_csv('RD_cases.csv', index=False)


In [None]:
# Upload the cohort CSV files to /wgs_analysis/results.
! dx upload Controls.csv --path /wgs_analysis/results/Controls.csv
! dx upload AD_cases.csv --path /wgs_analysis/results/AD_cases.csv
! dx upload RD_cases.csv --path /wgs_analysis/results/RD_cases.csv


In [None]:
# Cohorts in the order they are reported: controls, AD, RD.
cohorts = (control_df, ad_df, rd_df)

# Value counts per cohort, first for the ancestry label and then for sex.
for column in ("label", "GENETIC_SEX"):
    for cohort in cohorts:
        print(cohort[column].value_counts())
    print("\n")

# Mean +/- standard deviation of the recruitment age, males first, then females.
for sex in ("Male", "Female"):
    for cohort in cohorts:
        ages = cohort[cohort["GENETIC_SEX"] == sex]["AGE_OF_RECRUIT"]
        print(f'{ages.mean()} +/- {ages.std()}')
print("\n")


# Fetch pVCF chunks for each gene of interest

## GBA (chr1: 155,225,002 - 155,254,507) (b: 7761 - 7763)

In [None]:
%%bash
# Submit one swiss-army-knife job per chr1 pVCF chunk b7761-b7763: each job
# subsets the chunk to the samples in sample_ids.txt (bcftools view -S) and
# writes GBA_b<N>.vcf.gz to /wgs_analysis/results/1_pvcf_chunks.
# NOTE(review): ${projectid} is not set in this cell — confirm it is exported.
for b_val in {7761..7763};
do
    dx run swiss-army-knife \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr1/ukb24310_c1_b${b_val}_v1.vcf.gz" \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr1/ukb24310_c1_b${b_val}_v1.vcf.gz.tbi" \
    -iin="/wgs_analysis/results/sample_ids.txt" \
    -icmd="bcftools view -O z -S sample_ids.txt ukb24310_c1_b${b_val}_v1.vcf.gz -o GBA_b${b_val}.vcf.gz" \
    --instance-type mem2_ssd1_v2_x32 \
    --destination "${projectid}:/wgs_analysis/results/1_pvcf_chunks"
done

## PSEN2 (chr1: 226,860,648 - 226,905,565) (b: 11343 - 11346)

In [None]:
%%bash
# Submit one swiss-army-knife job per chr1 pVCF chunk b11343-b11346: each job
# subsets the chunk to the samples in sample_ids.txt (bcftools view -S) and
# writes PSEN2_b<N>.vcf.gz to /wgs_analysis/results/1_pvcf_chunks.
# NOTE(review): ${projectid} is not set in this cell — confirm it is exported.
for b_val in {11343..11346};
do
    dx run swiss-army-knife \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr1/ukb24310_c1_b${b_val}_v1.vcf.gz" \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr1/ukb24310_c1_b${b_val}_v1.vcf.gz.tbi" \
    -iin="/wgs_analysis/results/sample_ids.txt" \
    -icmd="bcftools view -O z -S sample_ids.txt ukb24310_c1_b${b_val}_v1.vcf.gz -o PSEN2_b${b_val}.vcf.gz" \
    --instance-type mem2_ssd1_v2_x32 \
    --destination "${projectid}:/wgs_analysis/results/1_pvcf_chunks"
done


## SNCA (chr4: 89,716,632 - 89,848,254) (b: 4485 - 4493)

In [None]:
%%bash
# Submit one swiss-army-knife job per chr4 pVCF chunk b4485-b4493: each job
# subsets the chunk to the samples in sample_ids.txt (bcftools view -S) and
# writes SNCA_b<N>.vcf.gz to /wgs_analysis/results/1_pvcf_chunks.
# NOTE(review): ${projectid} is not set in this cell — confirm it is exported.
for b_val in {4485..4493};
do
    dx run swiss-army-knife \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr4/ukb24310_c4_b${b_val}_v1.vcf.gz" \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr4/ukb24310_c4_b${b_val}_v1.vcf.gz.tbi" \
    -iin="/wgs_analysis/results/sample_ids.txt" \
    -icmd="bcftools view -O z -S sample_ids.txt ukb24310_c4_b${b_val}_v1.vcf.gz -o SNCA_b${b_val}.vcf.gz" \
    --instance-type mem2_ssd1_v2_x32 \
    --destination "${projectid}:/wgs_analysis/results/1_pvcf_chunks"
done


## TREM2 (chr6: 41,148,607 - 41,173,076) (b: 2057 - 2059)

In [None]:
%%bash
# Submit one swiss-army-knife job per chr6 pVCF chunk b2057-b2059: each job
# subsets the chunk to the samples in sample_ids.txt (bcftools view -S) and
# writes TREM2_b<N>.vcf.gz to /wgs_analysis/results/1_pvcf_chunks.
# NOTE(review): ${projectid} is not set in this cell — confirm it is exported.
for b_val in {2057..2059};
do
    dx run swiss-army-knife \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr6/ukb24310_c6_b${b_val}_v1.vcf.gz" \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr6/ukb24310_c6_b${b_val}_v1.vcf.gz.tbi" \
    -iin="/wgs_analysis/results/sample_ids.txt" \
    -icmd="bcftools view -O z -S sample_ids.txt ukb24310_c6_b${b_val}_v1.vcf.gz -o TREM2_b${b_val}.vcf.gz" \
    --instance-type mem2_ssd1_v2_x32 \
    --destination "${projectid}:/wgs_analysis/results/1_pvcf_chunks"
done


## PSEN1 (chr14: 73,126,590 - 73,229,275) (b: 3656 - 3662)

In [None]:
%%bash
# Submit one swiss-army-knife job per chr14 pVCF chunk b3656-b3662: each job
# subsets the chunk to the samples in sample_ids.txt (bcftools view -S) and
# writes PSEN1_b<N>.vcf.gz to /wgs_analysis/results/1_pvcf_chunks.
# NOTE(review): ${projectid} is not set in this cell — confirm it is exported.
for b_val in {3656..3662};
do
    dx run swiss-army-knife \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr14/ukb24310_c14_b${b_val}_v1.vcf.gz" \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr14/ukb24310_c14_b${b_val}_v1.vcf.gz.tbi" \
    -iin="/wgs_analysis/results/sample_ids.txt" \
    -icmd="bcftools view -O z -S sample_ids.txt ukb24310_c14_b${b_val}_v1.vcf.gz -o PSEN1_b${b_val}.vcf.gz" \
    --instance-type mem2_ssd1_v2_x32 \
    --destination "${projectid}:/wgs_analysis/results/1_pvcf_chunks"
done


## GRN (chr17: 44,335,332 - 44,362,797) (b: 2216 - 2219)

In [None]:
%%bash
# Submit one swiss-army-knife job per chr17 pVCF chunk b2216-b2219: each job
# subsets the chunk to the samples in sample_ids.txt (bcftools view -S) and
# writes GRN_b<N>.vcf.gz to /wgs_analysis/results/1_pvcf_chunks.
# NOTE(review): ${projectid} is not set in this cell — confirm it is exported.
for b_val in {2216..2219};
do
    dx run swiss-army-knife \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr17/ukb24310_c17_b${b_val}_v1.vcf.gz" \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr17/ukb24310_c17_b${b_val}_v1.vcf.gz.tbi" \
    -iin="/wgs_analysis/results/sample_ids.txt" \
    -icmd="bcftools view -O z -S sample_ids.txt ukb24310_c17_b${b_val}_v1.vcf.gz -o GRN_b${b_val}.vcf.gz" \
    --instance-type mem2_ssd1_v2_x32 \
    --destination "${projectid}:/wgs_analysis/results/1_pvcf_chunks"
done


## MAPT (chr17: 45,884,685 - 46,035,185) (b: 2294 - 2302)

In [None]:
%%bash
# Submit one swiss-army-knife job per chr17 pVCF chunk b2294-b2302: each job
# subsets the chunk to the samples in sample_ids.txt (bcftools view -S) and
# writes MAPT_b<N>.vcf.gz to /wgs_analysis/results/1_pvcf_chunks.
# NOTE(review): ${projectid} is not set in this cell — confirm it is exported.
for b_val in {2294..2302};
do
    dx run swiss-army-knife \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr17/ukb24310_c17_b${b_val}_v1.vcf.gz" \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr17/ukb24310_c17_b${b_val}_v1.vcf.gz.tbi" \
    -iin="/wgs_analysis/results/sample_ids.txt" \
    -icmd="bcftools view -O z -S sample_ids.txt ukb24310_c17_b${b_val}_v1.vcf.gz -o MAPT_b${b_val}.vcf.gz" \
    --instance-type mem2_ssd1_v2_x32 \
    --destination "${projectid}:/wgs_analysis/results/1_pvcf_chunks"
done


## APOE (chr19: 44,895,840 - 44,919,238) (b: 2244 - 2246)

In [None]:
%%bash
# Submit one swiss-army-knife job per chr19 pVCF chunk b2244-b2246: each job
# subsets the chunk to the samples in sample_ids.txt (bcftools view -S) and
# writes APOE_b<N>.vcf.gz to /wgs_analysis/results/1_pvcf_chunks.
# NOTE(review): ${projectid} is not set in this cell — confirm it is exported.
for b_val in {2244..2246};
do
    dx run swiss-army-knife \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr19/ukb24310_c19_b${b_val}_v1.vcf.gz" \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr19/ukb24310_c19_b${b_val}_v1.vcf.gz.tbi" \
    -iin="/wgs_analysis/results/sample_ids.txt" \
    -icmd="bcftools view -O z -S sample_ids.txt ukb24310_c19_b${b_val}_v1.vcf.gz -o APOE_b${b_val}.vcf.gz" \
    --instance-type mem2_ssd1_v2_x32 \
    --destination "${projectid}:/wgs_analysis/results/1_pvcf_chunks"
done


## APP (chr21: 25,871,670 - 26,180,987) (b: 1293 - 1310)

In [None]:
%%bash
# Submit one swiss-army-knife job per chr21 pVCF chunk b1293-b1310: each job
# subsets the chunk to the samples in sample_ids.txt (bcftools view -S) and
# writes APP_b<N>.vcf.gz to /wgs_analysis/results/1_pvcf_chunks.
# NOTE(review): ${projectid} is not set in this cell — confirm it is exported.
for b_val in {1293..1310};
do
    dx run swiss-army-knife \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr21/ukb24310_c21_b${b_val}_v1.vcf.gz" \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr21/ukb24310_c21_b${b_val}_v1.vcf.gz.tbi" \
    -iin="/wgs_analysis/results/sample_ids.txt" \
    -icmd="bcftools view -O z -S sample_ids.txt ukb24310_c21_b${b_val}_v1.vcf.gz -o APP_b${b_val}.vcf.gz" \
    --instance-type mem2_ssd1_v2_x32 \
    --destination "${projectid}:/wgs_analysis/results/1_pvcf_chunks"
done


# Combine pVCF chunks into one file for each gene

In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

## GBA

In [None]:
%%bash

# Concatenate the GBA chunk VCFs (b7761-b7763, in that order) into GBA.vcf.gz
# with bcftools concat; the result goes to /wgs_analysis/results/2_pvcf_genes.
# NOTE(review): ${projectid} is not set in this cell — confirm it is exported.
dx run swiss-army-knife \
-iin="/wgs_analysis/results/1_pvcf_chunks/GBA_b7761.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/GBA_b7762.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/GBA_b7763.vcf.gz" \
-icmd="bcftools concat -O z GBA_b7761.vcf.gz GBA_b7762.vcf.gz GBA_b7763.vcf.gz -o GBA.vcf.gz" \
--instance-type mem2_ssd1_v2_x32 \
--destination "${projectid}:/wgs_analysis/results/2_pvcf_genes"


## PSEN2

In [None]:
%%bash

# Concatenate the PSEN2 chunk VCFs (b11343-b11346, in that order) into
# PSEN2.vcf.gz with bcftools concat; the result goes to
# /wgs_analysis/results/2_pvcf_genes.
# NOTE(review): ${projectid} is not set in this cell — confirm it is exported.
dx run swiss-army-knife \
-iin="/wgs_analysis/results/1_pvcf_chunks/PSEN2_b11343.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/PSEN2_b11344.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/PSEN2_b11345.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/PSEN2_b11346.vcf.gz" \
-icmd="bcftools concat -O z PSEN2_b11343.vcf.gz PSEN2_b11344.vcf.gz PSEN2_b11345.vcf.gz PSEN2_b11346.vcf.gz -o PSEN2.vcf.gz" \
--instance-type mem2_ssd1_v2_x32 \
--destination "${projectid}:/wgs_analysis/results/2_pvcf_genes"


## SNCA

In [None]:
%%bash

# Concatenate the SNCA chunk VCFs (b4485-b4493, in that order) into
# SNCA.vcf.gz with bcftools concat; the result goes to
# /wgs_analysis/results/2_pvcf_genes.
# NOTE(review): ${projectid} is not set in this cell — confirm it is exported.
dx run swiss-army-knife \
-iin="/wgs_analysis/results/1_pvcf_chunks/SNCA_b4485.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/SNCA_b4486.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/SNCA_b4487.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/SNCA_b4488.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/SNCA_b4489.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/SNCA_b4490.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/SNCA_b4491.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/SNCA_b4492.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/SNCA_b4493.vcf.gz" \
-icmd="bcftools concat -O z SNCA_b4485.vcf.gz SNCA_b4486.vcf.gz SNCA_b4487.vcf.gz SNCA_b4488.vcf.gz SNCA_b4489.vcf.gz SNCA_b4490.vcf.gz SNCA_b4491.vcf.gz SNCA_b4492.vcf.gz SNCA_b4493.vcf.gz -o SNCA.vcf.gz" \
--instance-type mem2_ssd1_v2_x32 \
--destination "${projectid}:/wgs_analysis/results/2_pvcf_genes"


## TREM2

In [None]:
%%bash

# Concatenate the TREM2 chunk VCFs (b2057-b2059, in that order) into
# TREM2.vcf.gz with bcftools concat; the result goes to
# /wgs_analysis/results/2_pvcf_genes.
# NOTE(review): ${projectid} is not set in this cell — confirm it is exported.
dx run swiss-army-knife \
-iin="/wgs_analysis/results/1_pvcf_chunks/TREM2_b2057.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/TREM2_b2058.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/TREM2_b2059.vcf.gz" \
-icmd="bcftools concat -O z TREM2_b2057.vcf.gz TREM2_b2058.vcf.gz TREM2_b2059.vcf.gz -o TREM2.vcf.gz" \
--instance-type mem2_ssd1_v2_x32 \
--destination "${projectid}:/wgs_analysis/results/2_pvcf_genes"


## PSEN1

In [None]:
%%bash

# Concatenate the PSEN1 chunk VCFs (b3656-b3662, in that order) into
# PSEN1.vcf.gz with bcftools concat; the result goes to
# /wgs_analysis/results/2_pvcf_genes.
# NOTE(review): ${projectid} is not set in this cell — confirm it is exported.
dx run swiss-army-knife \
-iin="/wgs_analysis/results/1_pvcf_chunks/PSEN1_b3656.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/PSEN1_b3657.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/PSEN1_b3658.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/PSEN1_b3659.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/PSEN1_b3660.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/PSEN1_b3661.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/PSEN1_b3662.vcf.gz" \
-icmd="bcftools concat -O z PSEN1_b3656.vcf.gz PSEN1_b3657.vcf.gz PSEN1_b3658.vcf.gz PSEN1_b3659.vcf.gz PSEN1_b3660.vcf.gz PSEN1_b3661.vcf.gz PSEN1_b3662.vcf.gz -o PSEN1.vcf.gz" \
--instance-type mem2_ssd1_v2_x32 \
--destination "${projectid}:/wgs_analysis/results/2_pvcf_genes"


## GRN

In [None]:
%%bash

# Concatenate the GRN chunk VCFs (b2216-b2219, in that order) into GRN.vcf.gz
# with bcftools concat; the result goes to /wgs_analysis/results/2_pvcf_genes.
# NOTE(review): ${projectid} is not set in this cell — confirm it is exported.
dx run swiss-army-knife \
-iin="/wgs_analysis/results/1_pvcf_chunks/GRN_b2216.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/GRN_b2217.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/GRN_b2218.vcf.gz" \
-iin="/wgs_analysis/results/1_pvcf_chunks/GRN_b2219.vcf.gz" \
-icmd="bcftools concat -O z GRN_b2216.vcf.gz GRN_b2217.vcf.gz GRN_b2218.vcf.gz GRN_b2219.vcf.gz -o GRN.vcf.gz" \
--instance-type mem2_ssd1_v2_x32 \
--destination "${projectid}:/wgs_analysis/results/2_pvcf_genes"


## MAPT

In [None]:
%%bash

# Stitch the MAPT pVCF chunks (blocks b2294-b2302) into a single per-gene VCF.
gene="MAPT"
iin_args=()
chunk_files=()
for block in {2294..2302}; do
    chunk="${gene}_b${block}.vcf.gz"
    iin_args+=("-iin=/wgs_analysis/results/1_pvcf_chunks/${chunk}")
    chunk_files+=("${chunk}")
done

# Chunks are handed to bcftools in ascending block order, matching the -iin list.
dx run swiss-army-knife \
    "${iin_args[@]}" \
    -icmd="bcftools concat -O z ${chunk_files[*]} -o ${gene}.vcf.gz" \
    --instance-type mem2_ssd1_v2_x32 \
    --destination "${projectid}:/wgs_analysis/results/2_pvcf_genes"


## APOE

In [None]:
%%bash

# Stitch the APOE pVCF chunks (blocks b2244-b2246) into a single per-gene VCF.
gene="APOE"
iin_args=()
chunk_files=()
for block in {2244..2246}; do
    chunk="${gene}_b${block}.vcf.gz"
    iin_args+=("-iin=/wgs_analysis/results/1_pvcf_chunks/${chunk}")
    chunk_files+=("${chunk}")
done

# Chunks are handed to bcftools in ascending block order, matching the -iin list.
dx run swiss-army-knife \
    "${iin_args[@]}" \
    -icmd="bcftools concat -O z ${chunk_files[*]} -o ${gene}.vcf.gz" \
    --instance-type mem2_ssd1_v2_x32 \
    --destination "${projectid}:/wgs_analysis/results/2_pvcf_genes"


## APP

In [None]:
%%bash

# Stitch the APP pVCF chunks (blocks b1293-b1310) into a single per-gene VCF.
gene="APP"
iin_args=()
chunk_files=()
for block in {1293..1310}; do
    chunk="${gene}_b${block}.vcf.gz"
    iin_args+=("-iin=/wgs_analysis/results/1_pvcf_chunks/${chunk}")
    chunk_files+=("${chunk}")
done

# Chunks are handed to bcftools in ascending block order, matching the -iin list.
dx run swiss-army-knife \
    "${iin_args[@]}" \
    -icmd="bcftools concat -O z ${chunk_files[*]} -o ${gene}.vcf.gz" \
    --instance-type mem2_ssd1_v2_x32 \
    --destination "${projectid}:/wgs_analysis/results/2_pvcf_genes"


# Concatenate genes together

In [None]:
%%bash

# Concatenate the per-gene VCFs into one combined VCF.
# The array order is the order in which bcftools concat receives the files.
genes=(GBA PSEN2 SNCA TREM2 PSEN1 GRN MAPT APOE APP)
iin_args=()
gene_files=()
for gene in "${genes[@]}"; do
    iin_args+=("-iin=/wgs_analysis/results/2_pvcf_genes/${gene}.vcf.gz")
    gene_files+=("${gene}.vcf.gz")
done

dx run swiss-army-knife \
    "${iin_args[@]}" \
    -icmd="bcftools concat -O z ${gene_files[*]} -o combined.vcf.gz" \
    --instance-type mem2_ssd1_v2_x32 \
    --destination "${projectid}:/wgs_analysis/results/3_pvcf_combined"


# Normalize VCFs before annotation

#### Split multiallelic sites into biallelic records

In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
%%bash

# Split multiallelic records of the combined pVCF into biallelic records
# (bcftools norm -m-both); the result is written uncompressed as biallelic.vcf
# into 4_normalized.
dx run swiss-army-knife \
-iin="/wgs_analysis/results/3_pvcf_combined/combined.vcf.gz" \
-icmd="bcftools norm -m-both -o biallelic.vcf combined.vcf.gz" \
--instance-type mem2_ssd1_v2_x32 \
--destination "${projectid}:/wgs_analysis/results/4_normalized"


#### Left-align and normalize

In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
%%bash

# Left-align and normalize biallelic.vcf against the GRCh38 reference FASTA
# (bcftools norm -f); the result is written uncompressed as normalized.vcf.
# Requires the previous split job to have finished, since it reads its output.
dx run swiss-army-knife \
-iin="/wgs_analysis/results/4_normalized/biallelic.vcf" \
-iin="/wgs_analysis/data/Homo_sapiens_assembly38.fasta" \
-icmd="bcftools norm -f Homo_sapiens_assembly38.fasta -o normalized.vcf biallelic.vcf" \
--instance-type mem2_ssd1_v2_x64 \
--destination "${projectid}:/wgs_analysis/results/4_normalized"


# Annotation

## Filter VCFs to only include a few participants

#### Get subset of participant IDs

In [None]:
# Pull the per-cohort participant ID lists (AD, RD, controls) from project
# storage into the working directory, replacing any local copies.
! dx download wgs_analysis/results/ad_ids.txt --overwrite
! dx download wgs_analysis/results/rd_ids.txt --overwrite
! dx download wgs_analysis/results/control_ids.txt --overwrite


In [None]:
# Build a tiny sample list for annotation: the first line of each cohort's
# ID file, one per output line (annotation only needs the variant sites,
# so a few participants are enough).
ad_ids = "ad_ids.txt"
rd_ids = "rd_ids.txt"
control_ids = "control_ids.txt"
output_file = "annot_ids.txt"

first_ids = []
for id_path in (ad_ids, rd_ids, control_ids):
    with open(id_path, "r") as src:
        first_ids.append(src.readline().strip())

# The output is only opened once every input has been read successfully.
with open(output_file, "w") as out:
    for first_id in first_ids:
        out.write(first_id + "\n")


In [None]:
# Publish the annotation sample list so the swiss-army-knife job can read it.
! dx upload annot_ids.txt --path wgs_analysis/results/annot_ids.txt


#### Get filtered VCFs

In [None]:
%%bash

# Subset normalized.vcf to the samples listed in annot_ids.txt (-S) and write
# the result compressed (-O z) as filtered.vcf.gz into 5_annotated.
dx run swiss-army-knife \
-iin="/wgs_analysis/results/4_normalized/normalized.vcf" \
-iin="/wgs_analysis/results/annot_ids.txt" \
-icmd="bcftools view -O z -S annot_ids.txt normalized.vcf -o filtered.vcf.gz" \
--instance-type mem2_ssd1_v2_x32 \
--destination "${projectid}:/wgs_analysis/results/5_annotated"


## Fetch Annovar libraries and reference genome data

In [None]:
%%capture

# Download and unpack ANNOVAR, then make its Perl scripts executable.
! wget http://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz
! tar -xzf annovar.latest.tar.gz
! chmod a+x ./annovar/*.pl
# Download the hg38 annotation databases used by the table_annovar protocol
# list in the annotation step (refGene, avsnp150, clinvar_20221231, dbnsfp30a,
# gnomad40_genome) into annovar/humandb/.
! annovar/annotate_variation.pl -downdb -buildver hg38 -webfrom annovar refGene annovar/humandb/
! annovar/annotate_variation.pl -downdb -buildver hg38 -webfrom annovar avsnp150 annovar/humandb/
! annovar/annotate_variation.pl -downdb -buildver hg38 -webfrom annovar clinvar_20221231 annovar/humandb/
! annovar/annotate_variation.pl -downdb -buildver hg38 -webfrom annovar dbnsfp30a annovar/humandb/
! annovar/annotate_variation.pl -downdb -buildver hg38 -webfrom annovar gnomad40_genome annovar/humandb/
# Fetch the reference genome FASTA plus its .fai index and sequence dictionary.
! dx download wgs_analysis/data/Homo_sapiens_assembly38.fasta --overwrite
! dx download wgs_analysis/data/Homo_sapiens_assembly38.fasta.fai --overwrite
! dx download wgs_analysis/data/Homo_sapiens_assembly38.dict --overwrite


## Perform annotation

In [None]:
%%bash

# Annotate the sample-filtered VCF (read from the mounted project folder) with
# ANNOVAR. --protocol and --operation are positional pairs: refGene is
# gene-based (g), the remaining four databases are filter-based (f).
# Output files are prefixed var_calling.annovar; missing values are written
# as "." (--nastring) and intermediate files are deleted (--remove).
annovar/table_annovar.pl ../../mnt/project/wgs_analysis/results/5_annotated/filtered.vcf.gz annovar/humandb/ \
--buildver hg38 \
--thread 96 \
--remove \
--protocol refGene,avsnp150,clinvar_20221231,dbnsfp30a,gnomad40_genome \
--operation g,f,f,f,f \
--nopolish \
--nastring . \
--out var_calling.annovar \
--vcfinput


In [None]:
annot_df = pd.read_csv(f"var_calling.annovar.hg38_multianno.txt", sep = '\t')
annot_df.to_csv(f"annotated.csv", index=False)
! dx upload annotated.csv --path wgs_analysis/results/5_annotated/annotated.csv


# Calculate allele frequencies

## Subset IDs for all cohort-ancestry combinations

In [None]:
# Load the three cohort tables from the mounted project folder.
results_dir = "../../mnt/project/wgs_analysis/results"
ad_df = pd.read_csv(f"{results_dir}/AD_cases.csv")
rd_df = pd.read_csv(f"{results_dir}/RD_cases.csv")
control_df = pd.read_csv(f"{results_dir}/Controls.csv")


In [None]:
# Keep only the participant ID and the ancestry label of each cohort table.
id_label_cols = ["ID", "label"]
ad_df = ad_df[id_label_cols]
rd_df = rd_df[id_label_cols]
control_df = control_df[id_label_cols]


In [None]:
for ancestry in ad_df["label"].unique():
    ids = ad_df[ad_df["label"] == ancestry]["ID"]
    with open(f"AD_{ancestry}.txt", 'w') as file:
        for iid in ids:
            file.write(f"{iid}\n")
    !dx upload AD_{ancestry}.txt --path wgs_analysis/data/ID_Files/AD_{ancestry}.txt

for ancestry in rd_df["label"].unique():
    ids = rd_df[rd_df["label"] == ancestry]["ID"]
    with open(f"RD_{ancestry}.txt", 'w') as file:
        for iid in ids:
            file.write(f"{iid}\n")
    !dx upload RD_{ancestry}.txt --path wgs_analysis/data/ID_Files/RD_{ancestry}.txt

for ancestry in control_df["label"].unique():
    ids = control_df[control_df["label"] == ancestry]["ID"]
    with open(f"Control_{ancestry}.txt", 'w') as file:
        for iid in ids:
            file.write(f"{iid}\n")
    !dx upload Control_{ancestry}.txt --path wgs_analysis/data/ID_Files/Control_{ancestry}.txt


## Get frequencies for each cohort-ancestry combination

In [None]:
%%bash

# Submit one plink2 allele-frequency job per cohort x ancestry combination
# (3 x 11 = 33 submissions), each restricted to that subgroup's ID file.
for cohort in {"AD","RD","Control"};
do
    for ancestry in {"AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"};
    do
        # --set-all-var-ids renames variants to chr<chrom>:<pos>:<ref>:<alt>; the
        # backslashes keep $r/$a from being expanded by this shell so plink2
        # receives them literally. --keep limits the samples and --freq writes
        # <cohort>_<ancestry>.afreq (consumed by the zygosity step).
        dx run swiss-army-knife \
        -iin="/wgs_analysis/results/4_normalized/normalized.vcf" \
        -iin="/wgs_analysis/data/ID_Files/${cohort}_${ancestry}.txt" \
        -icmd="plink2 --vcf normalized.vcf --set-all-var-ids 'chr@:#:\$r:\$a' --new-id-max-allele-len 999 --keep ${cohort}_${ancestry}.txt --freq --out ${cohort}_${ancestry}" \
        --instance-type mem1_hdd1_v2_x16 \
        --destination "${projectid}:/wgs_analysis/results/6_frequencies"
    done
done


## Get zygosity info

In [None]:
%%bash

# Submit one plink2 job per cohort x ancestry combination that exports
# per-sample additive genotype codes (--export A) plus --het statistics.
# Each job reads the .afreq file produced by the frequency step, so those jobs
# must have finished first.
for cohort in {"AD","RD","Control"};
do
    for ancestry in {"AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"};
    do
        # Same variant-ID template and sample restriction as the frequency jobs;
        # the exported <cohort>_<ancestry>.raw file is what the counting step reads.
        dx run swiss-army-knife \
        -iin="/wgs_analysis/results/4_normalized/normalized.vcf" \
        -iin="/wgs_analysis/data/ID_Files/${cohort}_${ancestry}.txt" \
        -iin="/wgs_analysis/results/6_frequencies/${cohort}_${ancestry}.afreq" \
        -icmd="plink2 --vcf normalized.vcf --set-all-var-ids 'chr@:#:\$r:\$a' --new-id-max-allele-len 999 --keep ${cohort}_${ancestry}.txt --read-freq ${cohort}_${ancestry}.afreq --export A --het --out ${cohort}_${ancestry}" \
        --instance-type mem1_hdd1_v2_x16 \
        --destination "${projectid}:/wgs_analysis/results/6_frequencies"
    done
done


## Find homozygous/heterozygous counts

In [None]:
# Download every cohort x ancestry .raw genotype export into the working
# directory. Combinations whose job produced no output will simply fail to
# download here.
for cohort in ["AD","RD","Control"]:
    for ancestry in ["AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"]:
        ! dx download /wgs_analysis/results/6_frequencies/{cohort}_{ancestry}.raw


In [None]:
%%bash

# For each cohort x ancestry .raw export, tally per variant how many samples
# carry genotype code 2, 1 and 0. Output counts_<cohort>_<ancestry>.tsv has
# four tab-separated rows: the variant header (columns 7+ of the .raw header),
# then the counts of 2s, of 1s and of 0s. Any other value (e.g. NA) is ignored.
for cohort in AD RD Control; do
    for ancestry in AAC AFR AJ AMR CAH CAS EAS EUR FIN MDE SAS; do
        raw_file="${cohort}_${ancestry}.raw"
        output_file="counts_${cohort}_${ancestry}.tsv"

        # Not every cohort x ancestry combination has data; skip instead of
        # writing a header-less counts file.
        if [[ ! -s "${raw_file}" ]]; then
            echo "Skipping ${cohort}_${ancestry}: ${raw_file} is missing or empty" >&2
            continue
        fi

        # Single pass: the first six columns are sample metadata, variants
        # start at column 7. The header row is echoed, data rows are tallied.
        awk -F'\t' -v first=7 '
        NR == 1 {
            ncol = NF
            for (i = first; i <= ncol; i++)
                printf("%s%s", $i, (i < ncol ? "\t" : "\n"))
            next
        }
        {
            for (i = first; i <= NF; i++) {
                if ($i == 2) n2[i]++
                else if ($i == 1) n1[i]++
                else if ($i == 0) n0[i]++
            }
        }
        END {
            for (i = first; i <= ncol; i++)
                printf("%d%s", n2[i], (i < ncol ? "\t" : "\n"))
            for (i = first; i <= ncol; i++)
                printf("%d%s", n1[i], (i < ncol ? "\t" : "\n"))
            for (i = first; i <= ncol; i++)
                printf("%d%s", n0[i], (i < ncol ? "\t" : "\n"))
        }' "${raw_file}" > "${output_file}"

        echo "Counts written to ${output_file}"
    done
done


In [None]:
# Upload each per-subgroup genotype-count table to 7_zygosity in project storage.
for cohort in ["AD","RD","Control"]:
    for ancestry in ["AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"]:
        ! dx upload counts_{cohort}_{ancestry}.tsv --path wgs_analysis/results/7_zygosity/counts_{cohort}_{ancestry}.tsv


In [None]:
# Re-download the count tables from project storage, so the notebook can be
# resumed from this point in a fresh session without redoing the counting.
for cohort in ["AD","RD","Control"]:
    for ancestry in ["AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"]:
        ! dx download wgs_analysis/results/7_zygosity/counts_{cohort}_{ancestry}.tsv


In [None]:
for cohort in ["AD","RD","Control"]:
    zyg_cohort = []
    for ancestry in ["AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"]:
        try:
            df = pd.read_csv(f"counts_{cohort}_{ancestry}.tsv", sep="\t")
            variant_ids = df.columns.values
            rename_dict = {}
            for vid in variant_ids:
                rename_dict[vid] = vid.split("_")[0]
            df.rename(rename_dict, axis=1, inplace=True)
            zyg_cohort.append(df)
        except:
            print(f"No data found at counts_{cohort}_{ancestry}.tsv")
    result_zyg = reduce(lambda x, y: x + y, zyg_cohort)
    result_zyg.to_csv(f"{cohort}_Final.csv", index=False)
    ! dx upload {cohort}_Final.csv --path wgs_analysis/results/7_zygosity/{cohort}_Final.csv


In [None]:
ad_zyg = pd.read_csv(f"AD_Final.csv")
rd_zyg = pd.read_csv(f"RD_Final.csv")
control_zyg = pd.read_csv(f"Control_Final.csv")

ad_zyg = ad_zyg.T
rd_zyg = rd_zyg.T
control_zyg = control_zyg.T

ad_zyg = ad_zyg.reset_index()
rd_zyg = rd_zyg.reset_index()
control_zyg = control_zyg.reset_index()

ad_zyg.rename(columns={'index': 'ID', 0: 'AD_Homozygous_Ref', 1: 'AD_Heterozygous', 2: 'AD_Homozygous_Alt'}, inplace=True)
rd_zyg.rename(columns={'index': 'ID', 0: 'RD_Homozygous_Ref', 1: 'RD_Heterozygous', 2: 'RD_Homozygous_Alt'}, inplace=True)
control_zyg.rename(columns={'index': 'ID', 0: 'Control_Homozygous_Ref', 1: 'Control_Heterozygous', 2: 'Control_Homozygous_Alt'}, inplace=True)

final_zyg = ad_zyg.merge(rd_zyg, on="ID")
final_zyg = final_zyg.merge(control_zyg, on="ID")

display(ad_zyg.head())
display(final_zyg.head())

final_zyg.to_csv("final_zygosity.csv", index=False)
! dx upload final_zygosity.csv --path wgs_analysis/results/7_zygosity/final_zygosity.csv


# Merge annotations with allele frequency outputs

## Merge frequencies for all ancestries across each cohort

In [None]:
def _load_afreq(path, cohort):
    """Read one plink2 .afreq file and tag its columns with ancestry and cohort.

    The ancestry code is taken from the file name (<cohort>_<ancestry>.afreq).
    Returns a frame with columns ID, ALT_FREQS_<ancestry>_<cohort> and
    OBS_CT_<ancestry>_<cohort>.
    """
    ancestry = path.split("_")[-1].split(".")[0]
    table = pd.read_csv(path, sep="\t")[["ID", "ALT_FREQS", "OBS_CT"]]
    return table.rename(columns={
        "ALT_FREQS": f"ALT_FREQS_{ancestry}_{cohort}",
        "OBS_CT": f"OBS_CT_{ancestry}_{cohort}",
    })


# For each cohort, join the frequency tables of all available ancestries on the
# variant ID and save the wide table as <cohort>.csv.
for cohort in ["AD","RD","Control"]:
    # glob returns files in arbitrary order; sort so the column order of the
    # output is the same on every run.
    freq_files = sorted(glob.glob(f"../../mnt/project/wgs_analysis/results/6_frequencies/{cohort}*.afreq"))
    if not freq_files:
        raise FileNotFoundError(
            f"No .afreq files found for cohort {cohort} in wgs_analysis/results/6_frequencies"
        )

    df = reduce(
        lambda left, right: left.merge(right, on="ID"),
        (_load_afreq(path, cohort) for path in freq_files),
    )

    df.to_csv(f"{cohort}.csv", index=False)


## Convert back to vcf format for external CADD calculation

In [None]:
# Start the CADD input table from the variant IDs of the AD frequency table.
ad_variant_ids = pd.read_csv("AD.csv")["ID"]
ids_list = list(ad_variant_ids)
df_for_cadd = pd.DataFrame({"ID": ids_list})


In [None]:
# Turn "chrom:pos:ref:alt" IDs into the five leading VCF columns
# (#CHROM, POS, ID, REF, ALT), with "." as the placeholder ID, and write them
# tab-separated for upload to the CADD web service.
vcf_fields = df_for_cadd["ID"].str.split(":", expand=True)
vcf_fields.columns = ["#CHROM", "POS", "REF", "ALT"]
vcf_fields.insert(2, "ID", ".")
df_for_cadd = vcf_fields
df_for_cadd.to_csv("for_CADD.vcf", index=False, sep="\t")


In [None]:
# Compress a copy for upload; -c writes to stdout so for_CADD.vcf is kept.
! gzip -c for_CADD.vcf > for_CADD.vcf.gz


In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
---------------- DOWNLOAD OUTPUT FILE AND PASS THROUGH EXTERNAL CADD SCORE CALCULATOR ----------------
------------------------------ (https://cadd.gs.washington.edu/upload) -------------------------------
--------------------------- RENAME RESULT TO "CADD.tsv.gz" AND UPLOAD HERE ---------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
! gzip -d CADD.tsv.gz
! grep -v "##" CADD.tsv > CADD_prelim.tsv
cadd_df = pd.read_csv("CADD_prelim.tsv", sep="\t")
cadd_df["#Chrom"] = "chr" + cadd_df["#Chrom"].astype(str)
cadd_df["ID"] = cadd_df["#Chrom"] + ":" + cadd_df["Pos"].astype(str) + ":" + cadd_df["Ref"] + ":" + cadd_df["Alt"]
cadd_df = cadd_df[["ID","PHRED"]]
cadd_df.rename({"PHRED":"CADD"}, axis=1, inplace=True)
cadd_df.to_csv("CADD_final.csv", index=False)


## Merge CADD scores, frequency files, and annotations

In [None]:
annot_df = pd.read_csv(f"../../mnt/project/wgs_analysis/results/5_annotated/annotated.csv", low_memory=False)
annot_df.insert(1, "ID", annot_df[["Chr","Start","Ref","Alt"]].astype(str).agg(':'.join, axis=1))
filtered_columns = [col for col in annot_df.columns if "Otherinfo" not in col]
annot_df = annot_df[filtered_columns]

ad_freq_df = pd.read_csv("AD.csv")
ad_freq_df.drop(columns="ID", inplace=True)
rd_freq_df = pd.read_csv("RD.csv")
rd_freq_df.drop(columns="ID", inplace=True)
control_freq_df = pd.read_csv("Control.csv")
display(control_freq_df.head())
control_freq_df = control_freq_df.merge(final_zyg, on="ID")
display(control_freq_df.head())
control_freq_df = control_freq_df.merge(cadd_df, on="ID", how="left")
display(control_freq_df.head())
control_freq_df.drop(columns="ID", inplace=True)

merged_df = pd.concat([annot_df,ad_freq_df], axis=1)
merged_df = pd.concat([merged_df,rd_freq_df], axis=1)
merged_df = pd.concat([merged_df,control_freq_df], axis=1)

merged_df.to_csv("merged.csv", index=False)
!dx upload merged.csv --path wgs_analysis/results/8_merged/merged.csv


In [None]:
# Look up GRCh38 coordinates for every gene of interest via the Ensembl REST
# helper and print each record for a quick visual check.
gene_info_dict = fetch_gene_info_ensembl(gene_names=gene_names, species='human', genome_version='GRCh38')
for gene_record in gene_info_dict.values():
    print(gene_record)


In [None]:
# Keep exonic/splicing variants that lie strictly inside one of the target
# genes (Start > gene start and Start < gene end, using Ensembl coordinates).
is_coding = merged_df["Func.refGene"].isin(["exonic", "splicing"])

criteria_list = []
for gene in ["APOE", "APP", "GBA", "GRN", "MAPT", "PSEN1", "PSEN2", "SNCA", "TREM2"]:
    # The gene is stored as "GBA" in file names but looked up as "GBA1".
    lookup_name = "GBA1" if gene == "GBA" else gene
    gene_info = gene_info_dict[lookup_name]
    inside_gene = (
        (merged_df["Chr"] == gene_info["chromosome"])
        & (merged_df["Start"] > gene_info["start"])
        & (merged_df["Start"] < gene_info["end"])
    )
    criteria_list.append(inside_gene & is_coding)

# A variant passes if it matches any gene's mask; every mask already includes
# the exonic/splicing requirement, so no further filtering is needed.
filter_criteria = reduce(lambda combined, mask: combined | mask, criteria_list)

filtered_df = merged_df[filter_criteria]

print(filtered_df.shape)
display(filtered_df)


In [None]:
# Keep variants observed (ALT frequency > 0) in at least one AD or RD ancestry
# subgroup and label each with the cohort(s) it was seen in.
ancestries = ["AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"]
ctrl_col_names = [f"ALT_FREQS_{ancestry}_Control" for ancestry in ancestries if f"ALT_FREQS_{ancestry}_Control" in filtered_df.columns.values]
ad_col_names = [f"ALT_FREQS_{ancestry}_AD" for ancestry in ancestries if f"ALT_FREQS_{ancestry}_AD" in filtered_df.columns.values]
rd_col_names = [f"ALT_FREQS_{ancestry}_RD" for ancestry in ancestries if f"ALT_FREQS_{ancestry}_RD" in filtered_df.columns.values]

# .copy() makes this an independent frame so the column assignments below
# modify it directly instead of a temporary slice of filtered_df.
final_filtered_df = filtered_df[(filtered_df[ad_col_names + rd_col_names]>0).any(axis=1)].copy()

seen_in_ad = (final_filtered_df[ad_col_names]>0).any(axis=1)
seen_in_rd = (final_filtered_df[rd_col_names]>0).any(axis=1)

# Use .loc rather than chained indexing (df["Disease"][mask] = ...), which
# raises SettingWithCopyWarning and has no effect under copy-on-write.
# Assignment order matters: "Both" must overwrite "AD"/"RD".
final_filtered_df["Disease"] = ""
final_filtered_df.loc[seen_in_ad, "Disease"] = "AD"
final_filtered_df.loc[seen_in_rd, "Disease"] = "RD"
final_filtered_df.loc[seen_in_ad & seen_in_rd, "Disease"] = "Both"
display(final_filtered_df.head())
final_filtered_df.to_csv("final_filtered.csv", index=False)


In [None]:
# Store the disease-labelled variant table in 8_merged.
! dx upload final_filtered.csv --path wgs_analysis/results/8_merged/final_filtered.csv


# Remove variants observed in controls

In [None]:
# Keep only variants with no heterozygous and no homozygous-alt control carriers.
no_control_het = final_filtered_df["Control_Heterozygous"].eq(0)
no_control_hom_alt = final_filtered_df["Control_Homozygous_Alt"].eq(0)
final_filtered_df = final_filtered_df[no_control_het & no_control_hom_alt]
final_filtered_df.to_csv("final_filtered_onlycases.csv", index=False)


In [None]:
# Store the case-only variant table in 8_merged.
! dx upload final_filtered_onlycases.csv --path wgs_analysis/results/8_merged/final_filtered_onlycases.csv


# APOE Genotyping

## Calculate APOE Genotypes

In [None]:
%%bash

# Submit one plink2 job per cohort x ancestry that extracts the chr19 window
# 44905791-44909393 for that subgroup's samples and writes it as a PLINK
# binary fileset (.bed/.bim/.fam) named <cohort>_<ancestry> into
# 9_apoe_genotyping.
for cohort in {"AD","RD","Control"};
do
    for ancestry in {"AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"};
    do
        # Variant IDs are rewritten to chr<chrom>:<pos>:<ref>:<alt>; the
        # backslashes keep $r/$a literal for plink2.
        dx run swiss-army-knife \
        -iin="/wgs_analysis/results/4_normalized/normalized.vcf" \
        -iin="/wgs_analysis/data/ID_Files/${cohort}_${ancestry}.txt" \
        -icmd="plink2 --vcf normalized.vcf --chr 19 --from-bp 44905791 --to-bp 44909393 --keep ${cohort}_${ancestry}.txt --set-all-var-ids 'chr@:#:\$r:\$a' --new-id-max-allele-len 999 --make-bed --out ${cohort}_${ancestry}" \
        --instance-type mem1_hdd1_v2_x16 \
        --destination "${projectid}:/wgs_analysis/results/9_apoe_genotyping"
    done
done


In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
%%bash

# From each subgroup's APOE-region fileset, keep only the variants listed in
# apoe_variants.txt (--extract) and write a new fileset
# apoe_snps_<cohort>_<ancestry>. Needs the region-extraction jobs to be done.
for cohort in {"AD","RD","Control"};
do
    for ancestry in {"AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"};
    do
        dx run swiss-army-knife \
        -iin="/wgs_analysis/results/9_apoe_genotyping/${cohort}_${ancestry}.bim" \
        -iin="/wgs_analysis/results/9_apoe_genotyping/${cohort}_${ancestry}.bed" \
        -iin="/wgs_analysis/results/9_apoe_genotyping/${cohort}_${ancestry}.fam" \
        -iin="/wgs_analysis/data/apoe_variants.txt" \
        -icmd="plink --bfile ${cohort}_${ancestry} --extract apoe_variants.txt --make-bed --out apoe_snps_${cohort}_${ancestry}" \
        --instance-type mem1_hdd1_v2_x16 \
        --destination "${projectid}:/wgs_analysis/results/9_apoe_genotyping"
    done
done


In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
%%bash

# Recode each apoe_snps_* fileset to text format with compound genotypes
# (--recode compound-genotypes); the resulting .ped file is what the APOE
# genotype caller downloads later. apoe_variants.txt is staged as an input
# but is not referenced by this command.
for cohort in {"AD","RD","Control"};
do
    for ancestry in {"AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"};
    do
        dx run swiss-army-knife \
        -iin="/wgs_analysis/results/9_apoe_genotyping/apoe_snps_${cohort}_${ancestry}.bim" \
        -iin="/wgs_analysis/results/9_apoe_genotyping/apoe_snps_${cohort}_${ancestry}.bed" \
        -iin="/wgs_analysis/results/9_apoe_genotyping/apoe_snps_${cohort}_${ancestry}.fam" \
        -iin="/wgs_analysis/data/apoe_variants.txt" \
        -icmd="plink --bfile apoe_snps_${cohort}_${ancestry} --recode compound-genotypes --out apoe_snps_${cohort}_${ancestry}" \
        --instance-type mem1_hdd1_v2_x16 \
        --destination "${projectid}:/wgs_analysis/results/9_apoe_genotyping"
    done
done


In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
# Fetch the APOE genotype-calling script and every subgroup's recoded .ped
# file; subgroups without output will fail to download and are skipped by the
# existence check in the next cell.
! dx download wgs_analysis/data/APOE_genotypes_PLINK_ped.py
for cohort in ["AD","RD","Control"]:
    for ancestry in ["AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"]:
        ! dx download wgs_analysis/results/9_apoe_genotyping/apoe_snps_{cohort}_{ancestry}.ped
        

In [None]:
# Run the APOE genotype caller on every subgroup whose .ped file was
# downloaded, and upload the resulting <prefix>.APOE_GENOTYPES.csv.
for cohort in ["AD","RD","Control"]:
    for ancestry in ["AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"]:
        # Subgroups with no .ped file (no data) are silently skipped.
        if os.path.exists(f"apoe_snps_{cohort}_{ancestry}.ped"):
            ! python APOE_genotypes_PLINK_ped.py -i apoe_snps_{cohort}_{ancestry}.ped -o apoe_final_{cohort}_{ancestry}
            ! dx upload apoe_final_{cohort}_{ancestry}.APOE_GENOTYPES.csv --path wgs_analysis/results/9_apoe_genotyping/apoe_final_{cohort}_{ancestry}.APOE_GENOTYPES.csv
        

## Combine these results into a table

In [None]:
# For each cohort, build three genotype x ancestry tables from the per-subgroup
# APOE genotype CSVs: raw counts, percentages, and a combined "count (pct%)"
# view. Each table is displayed, saved and uploaded to 9_apoe_genotyping.
for cohort in ["AD","RD","Control"]:
    cohort_counts = []
    included_ancestries = []
    for ancestry in ["AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"]:
        # Only ancestries with a genotype file on disk contribute a column.
        if os.path.exists(f"apoe_final_{cohort}_{ancestry}.APOE_GENOTYPES.csv"):
            apoe_geno = pd.read_csv(f"apoe_final_{cohort}_{ancestry}.APOE_GENOTYPES.csv")
            counts = apoe_geno['APOE_GENOTYPE'].value_counts()
            # Fix the row set and order; genotypes not observed get a count of
            # 0, and any genotype label outside this list is dropped.
            counts = counts.reindex(["e1/e1", "e1/e2", "e1/e4", "e2/e2", "e2/e3", "e2/e4 or e1/e3", "e3/e3", "e3/e4", "e4/e4"], fill_value=0)
            counts = counts.rename(ancestry)
            # Append a "total" row holding the sum over the listed genotypes.
            counts = pd.concat([counts, pd.Series([counts.sum()], index=['total'])])
            cohort_counts.append(counts)
            included_ancestries.append(ancestry)
    # Rebinds cohort_counts from a list of Series to a genotype x ancestry frame.
    cohort_counts = pd.concat(cohort_counts, axis=1)
    cohort_counts.columns = included_ancestries
    display(cohort_counts)
    # Column-wise percentage of each genotype; the "total" row is then restored
    # to the absolute count instead of 100.
    cohort_percentages = cohort_counts.div(cohort_counts.loc["total"], axis=1) * 100
    cohort_percentages.loc["total"] = cohort_counts.loc["total"]
    display(cohort_percentages)
    # String table of the form "count (xx.xx%)", again with plain totals.
    cohort_combined = cohort_counts.applymap(str) + ' (' + cohort_percentages.applymap(lambda x: f'{x:.2f}%') + ')'
    cohort_combined.loc["total"] = cohort_counts.loc["total"]
    display(cohort_combined)
    cohort_counts.to_csv(f"{cohort}_apoe_genotype_counts.csv")
    cohort_percentages.to_csv(f"{cohort}_apoe_genotype_percentages.csv")
    cohort_combined.to_csv(f"{cohort}_apoe_genotype_combined.csv")
    ! dx upload {cohort}_apoe_genotype_counts.csv --path wgs_analysis/results/9_apoe_genotyping/{cohort}_apoe_genotype_counts.csv
    ! dx upload {cohort}_apoe_genotype_percentages.csv --path wgs_analysis/results/9_apoe_genotyping/{cohort}_apoe_genotype_percentages.csv
    ! dx upload {cohort}_apoe_genotype_combined.csv --path wgs_analysis/results/9_apoe_genotyping/{cohort}_apoe_genotype_combined.csv
    

# Count control carriers of pathogenic variants

In [None]:
! dx download wgs_analysis/results/7_zygosity/counts_Control_AAC.tsv
! dx download wgs_analysis/results/7_zygosity/counts_Control_AFR.tsv
! dx download wgs_analysis/results/7_zygosity/counts_Control_AJ.tsv
! dx download wgs_analysis/results/7_zygosity/counts_Control_AMR.tsv
! dx download wgs_analysis/results/7_zygosity/counts_Control_CAH.tsv
! dx download wgs_analysis/results/7_zygosity/counts_Control_CAS.tsv
! dx download wgs_analysis/results/7_zygosity/counts_Control_EAS.tsv
! dx download wgs_analysis/results/7_zygosity/counts_Control_EUR.tsv
! dx download wgs_analysis/results/7_zygosity/counts_Control_FIN.tsv
! dx download wgs_analysis/results/7_zygosity/counts_Control_MDE.tsv
! dx download wgs_analysis/results/7_zygosity/counts_Control_SAS.tsv


In [None]:
# Build one two-column table (ID, <ancestry>) per ancestry holding the number
# of control carriers of each variant (heterozygous + homozygous-alt counts).
control_zyg = []
for ancestry in ["AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"]:
    try:
        df = pd.read_csv(f"counts_Control_{ancestry}.tsv", sep="\t")
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Same tolerance as the per-cohort summation cell: an ancestry without
        # a usable counts file is reported and left out.
        print(f"No data found at counts_Control_{ancestry}.tsv")
        continue
    # Column names look like "<variant id>_<allele>"; keep the variant ID.
    df = df.rename(columns=lambda vid: vid.split("_")[0])
    # Transpose to one row per variant; count rows 0/1/2 become columns.
    df = df.T.reset_index()
    df = df.rename(columns={
        'index': 'ID',
        0: f'{ancestry}_Homozygous_Ref',
        1: f'{ancestry}_Heterozygous',
        2: f'{ancestry}_Homozygous_Alt',
    })
    df[ancestry] = df[f'{ancestry}_Heterozygous'] + df[f'{ancestry}_Homozygous_Alt']
    control_zyg.append(df[["ID", ancestry]])


In [None]:
def _merge_on_id(left, right):
    """Inner-join two per-ancestry carrier tables on the variant ID."""
    return pd.merge(left, right, on="ID")


# Fold all per-ancestry tables into one wide table (one column per ancestry).
control_zyg_merged = reduce(_merge_on_id, control_zyg)


In [None]:
# "chr:pos" prefixes of the variants treated as pathogenic; they are matched
# against the start of the full "chr:pos:ref:alt" variant IDs in the next cell.
# NOTE(review): presumably GRCh38 positions in GBA1 (chr1), GRN (chr17) and
# SNCA (chr4) - confirm against the curated source list.
pathogenic_vars = [
    "chr1:155235196",
    "chr1:155235217",
    "chr1:155235252",
    "chr1:155235727",
    "chr1:155235790",
    "chr1:155235823",
    "chr1:155235843",
    "chr1:155236277",
    "chr1:155237453",
    "chr1:155238174",
    "chr1:155238214",
    "chr1:155238215",
    "chr1:155238260",
    "chr1:155238630",
    "chr1:155240629",
    "chr17:44350262",
    "chr17:44350800",
    "chr17:44351409",
    "chr4:89828156",
]


In [None]:
# Keep the control carrier counts of the pathogenic variants only.
# Variant IDs have the form "chr:pos:ref:alt", so each prefix is terminated
# with ":" to match the exact position ("chr1:123" must not match "chr1:1234").
pathogenic_prefixes = tuple(f"{chrpos}:" for chrpos in pathogenic_vars)
filtered_control_var_counts = control_zyg_merged[control_zyg_merged['ID'].str.startswith(pathogenic_prefixes)]
filtered_control_var_counts.to_csv("filtered_control_var_counts.csv", index=False)


In [None]:
# Store the pathogenic-variant control counts in 10_pathogenic_variants.
! dx upload filtered_control_var_counts.csv --path wgs_analysis/results/10_pathogenic_variants/filtered_control_var_counts.csv


# Resilience/protective variants

In [None]:
# Fetch the inputs of the resilience analysis: the case-only variant table,
# the full merged table, and the variant-to-protein mapping. Only the last
# download replaces an existing local copy.
! dx download wgs_analysis/results/8_merged/final_filtered_onlycases.csv
! dx download wgs_analysis/results/8_merged/merged.csv
! dx download wgs_analysis/data/protein_var_map.csv --overwrite


In [None]:
# Case-only variants with a CADD score of at least 20, and their IDs.
case_only_variants = pd.read_csv("final_filtered_onlycases.csv")
high_cadd = case_only_variants["CADD"] >= 20
filtered_var_counts = case_only_variants[high_cadd]
filtered_var_ids = list(filtered_var_counts["ID"])


In [None]:
# Add the plink-style variant ID of each row as a "VCF_ID" column.
# The IDs are taken from the Control_EUR frequency file and assigned by row
# position, so this relies on merged.csv and the .afreq file listing the same
# variants in the same order (a length mismatch raises in insert()).
! dx download wgs_analysis/results/6_frequencies/Control_EUR.afreq --overwrite
df_freq = pd.read_csv("Control_EUR.afreq", sep="\t")
df_merged = pd.read_csv("merged.csv")
df_merged.insert(2, "VCF_ID", list(df_freq["ID"]))


In [None]:
# Translate the selected annotation-style IDs into their VCF-style IDs.
is_selected = df_merged["ID"].isin(filtered_var_ids)
df_merged_filtered = df_merged[is_selected]
filtered_vcf_ids = list(df_merged_filtered["VCF_ID"])


In [None]:
file_path = 'variants_to_keep.txt'
with open(file_path, 'w') as file:
    for variant_id in filtered_vcf_ids:
        file.write(f"{variant_id}\n")
! dx upload variants_to_keep.txt --path wgs_analysis/results/11_phenotypic_data/variant_ids.txt
        

In [None]:
%%bash

# Compute allele frequencies for the selected variants (--extract) across all
# samples of normalized.vcf (no --keep); writes full_cohort.afreq.
dx run swiss-army-knife \
-iin="/wgs_analysis/results/4_normalized/normalized.vcf" \
-iin="/wgs_analysis/results/11_phenotypic_data/variant_ids.txt" \
-icmd="plink2 --vcf normalized.vcf --set-all-var-ids 'chr@:#:\$r:\$a' --new-id-max-allele-len 999 --extract variant_ids.txt --freq --out full_cohort" \
--instance-type mem1_hdd1_v2_x16 \
--destination "${projectid}:/wgs_analysis/results/11_phenotypic_data"


In [None]:
%%bash 

# Export per-sample additive genotype codes (--export A) and --het statistics
# for the selected variants across all samples; reads the full_cohort.afreq
# produced by the previous job, so that job must have finished first.
dx run swiss-army-knife \
-iin="/wgs_analysis/results/4_normalized/normalized.vcf" \
-iin="/wgs_analysis/results/11_phenotypic_data/full_cohort.afreq" \
-iin="/wgs_analysis/results/11_phenotypic_data/variant_ids.txt" \
-icmd="plink2 --vcf normalized.vcf --set-all-var-ids 'chr@:#:\$r:\$a' --new-id-max-allele-len 999 --extract variant_ids.txt --read-freq full_cohort.afreq --export A --het --out full_cohort" \
--instance-type mem1_hdd1_v2_x16 \
--destination "${projectid}:/wgs_analysis/results/11_phenotypic_data"


In [None]:
# Fetch the per-sample genotype export of the selected variants.
! dx download wgs_analysis/results/11_phenotypic_data/full_cohort.raw --overwrite


In [None]:
# Load the genotype export, reduce "<variant id>_<allele>" column names to the
# variant ID, and drop the sample-metadata columns other than IID.
df_raw = pd.read_csv("full_cohort.raw", sep="\t")
variant_ids = df_raw.columns.values
rename_dict = {vid: vid.split("_")[0] for vid in variant_ids}
df_raw = df_raw.rename(columns=rename_dict)
df_raw = df_raw.drop(columns=["FID","PAT","MAT","SEX","PHENOTYPE"])
# Missing genotypes are filled with 2, the value the next cell treats as
# "not a carrier".
df_raw = df_raw.fillna(2)


In [None]:
# Build three parallel lists with one entry per (variant, carrier) pair:
# the full variant ID, its "chr:pos" prefix, and the participant's IID.
# A participant counts as a carrier when their genotype code is not 2.
list_vars, list_vars_chrpos, list_ids = [], [], []
for variant in df_raw.columns[1:]:
    carrier_ids = df_raw.loc[df_raw[variant] != 2, 'IID'].tolist()
    chrpos = ":".join(variant.split(":")[:2])
    list_vars.extend([variant] * len(carrier_ids))
    list_vars_chrpos.extend([chrpos] * len(carrier_ids))
    list_ids.extend(carrier_ids)
    
#id_var_mapper = {}    
#for index, row in df_raw.iterrows():
#    iid = int(row['IID'])
#    
#    list_vars = [col for col in df_raw.columns[1:] if row[col] != 2]
#    if len(list_vars) > 0:
#        id_var_mapper[iid] = list_vars
#    

In [None]:
# Long-format carrier table: one row per (variant, participant) pair collected above.
pheno_columns = {
    "Variant_ID_Full": list_vars,
    "Variant_ID": list_vars_chrpos,
    "Participant_ID": list_ids,
}
df_pheno = pd.DataFrame(pheno_columns)


In [None]:
# Attach the columns of protein_var_map.csv to each carrier row by its chr:pos Variant_ID.
# A left join keeps carrier rows whose variant has no entry in the map.
# NOTE(review): the map is presumably a per-variant protein annotation - confirm its schema.
protein_var_map = pd.read_csv("protein_var_map.csv")
df_pheno = pd.merge(df_pheno, protein_var_map, how="left", on="Variant_ID")


In [None]:
display(df_pheno)

In [None]:
# Fetch the AD and RD case tables from the project results folder.
# NOTE(review): unlike the other `dx download` calls in this notebook, no --overwrite is
# passed here - confirm how a pre-existing local copy should be handled when re-running.
! dx download wgs_analysis/results/AD_cases.csv
! dx download wgs_analysis/results/RD_cases.csv


In [None]:
# Load both case tables, harmonise their column names (each table's own onset-date column
# becomes DATE_OF_ONSET; "label" -> ANCESTRY; "ID" -> Participant_ID) and stack them into a
# single frame with a fresh 0..n-1 index.
shared_renames = {"label": "ANCESTRY", "ID": "Participant_ID"}
df_ad = pd.read_csv("AD_cases.csv").rename(columns={"AD_DATE": "DATE_OF_ONSET", **shared_renames})
df_rd = pd.read_csv("RD_cases.csv").rename(columns={"DEM_DATE": "DATE_OF_ONSET", **shared_renames})

df_dem = pd.concat([df_ad, df_rd], axis=0, ignore_index=True)


In [None]:
# Derive age/time fields for every case.
# Birth is only known to the year (BIRTH_YEAR parsed with '%Y'), so both ages are approximate.
onset_dates = pd.to_datetime(df_dem['DATE_OF_ONSET'])
birth_dates = pd.to_datetime(df_dem["BIRTH_YEAR"], format='%Y')
# Where no death date is recorded, today's date is used as the end point.
end_dates = pd.to_datetime(df_dem['DATE_OF_DEATH'].fillna(datetime.today().strftime('%Y-%m-%d')))

# Whole years between birth and onset (floor division of day counts by mean year length).
df_dem["AGE_AT_ONSET"] = (onset_dates - birth_dates).dt.days // 365.242374
# Days from onset to death, or to today when no death date is recorded.
df_dem["DAYS_SINCE_ONSET"] = (end_dates - onset_dates).dt.days
# Whole years from birth to death, or to today when no death date is recorded.
df_dem["AGE"] = (end_dates - birth_dates).dt.days // 365.242374


In [None]:
print(df_pheno.shape)

# Demographic / phenotype fields attached to every carrier row.
# Each field is listed exactly once: selecting the same label repeatedly would create
# duplicate, identically named columns in the merged table and in the CSV written from it.
# TODO(review): the original selection repeated COGNITIVE_TEST_WILLINGNESS three times;
# if other fields were intended in those slots, add them here.
pheno_fields = [
    "Participant_ID", "GENETIC_SEX", "AGE", "DATE_OF_ONSET", "DATE_OF_DEATH", "AGE_AT_ONSET",
    "DAYS_SINCE_ONSET", "ANCESTRY", "COGNITIVE_SYMPTOMS_SEVERITY_PAST_WEEK",
    "SPECIFIC_COGNITIVE_ABILITY_2014", "SPECIFIC_COGNITIVE_ABILITY_2019",
    "TOUCHSCREEN_COGNITIVE_DURATION_2014", "TOUCHSCREEN_COGNITIVE_DURATION_2019",
    "COGNITIVE_TEST_WILLINGNESS",
]

# Inner join: only carriers that appear in the combined AD/RD case table are kept.
df_pheno_1 = df_pheno.merge(df_dem[pheno_fields], on="Participant_ID", how="inner")


In [None]:
# Preview the merged carrier/phenotype table and save it locally as pheno.csv.
# NOTE(review): to_csv is called without index=False, so the row index is written as the
# first (unnamed) column - confirm downstream readers expect that.
display(df_pheno_1)
df_pheno_1.to_csv("pheno.csv")


## Find variants expressed by anyone in the dataset

In [None]:
%%bash

# For every cohort x ancestry combination, submit a Swiss Army Knife job that runs plink2
# on the normalized VCF, restricted to the variants in protective_variants.txt (--extract)
# and to the participants in that group's ID file (--keep). It reads the group's .afreq
# file and writes the additive export (--export A) and --het output under the
# "<cohort>_<ancestry>" prefix.
cohorts=("AD" "RD" "Control")
ancestries=("AAC" "AFR" "AJ" "AMR" "CAH" "CAS" "EAS" "EUR" "FIN" "MDE" "SAS")

for cohort in "${cohorts[@]}"; do
    for ancestry in "${ancestries[@]}"; do
        dx run swiss-army-knife \
        -iin="/wgs_analysis/data/protective_variants.txt" \
        -iin="/wgs_analysis/results/4_normalized/normalized.vcf" \
        -iin="/wgs_analysis/data/ID_Files/${cohort}_${ancestry}.txt" \
        -iin="/wgs_analysis/results/6_frequencies/${cohort}_${ancestry}.afreq" \
        -icmd="plink2 --vcf normalized.vcf --set-all-var-ids 'chr@:#:\$r:\$a' --new-id-max-allele-len 999 --extract protective_variants.txt --keep ${cohort}_${ancestry}.txt --read-freq ${cohort}_${ancestry}.afreq --export A --het --out ${cohort}_${ancestry}" \
        --instance-type mem1_hdd1_v2_x16 \
        --destination "${projectid}:/wgs_analysis/results/12_protective_variants"
    done
done


In [None]:
cohort = "AD"
ancestry = "AAC"
! dx download wgs_analysis/results/12_protective_variants/{cohort}_{ancestry}.raw --overwrite
df = pd.read_csv(f"{cohort}_{ancestry}.raw", sep="\t")
df.drop(["FID","PAT","MAT","SEX","PHENOTYPE"], axis=1, inplace=True)
df.columns = df.columns.str.split('_').str[0]
for var_id in df.columns[1:]:
    participant_ids = list(df["IID"][df[var_id] < 2])
    if len(participant_ids) > 0:
        with open(f"{cohort}_{ancestry}.txt", "w") as file:
            for item in participant_ids:
                file.write(f"{item}\n")
        ! dx upload {cohort}_{ancestry}.txt --path wgs_analysis/results/12_protective_variants/{var_id.replace(":","_")}/{cohort}_{ancestry}.txt
display(df)


In [None]:
for cohort in ["AD","RD","Control"]:
    for ancestry in ["AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"]:
        ! dx download wgs_analysis/results/12_protective_variants/{cohort}_{ancestry}.raw --overwrite
        if os.path.exists(f"{cohort}_{ancestry}.raw"):
            df = pd.read_csv(f"{cohort}_{ancestry}.raw", sep="\t")
            df.drop(["FID","PAT","MAT","SEX","PHENOTYPE"], axis=1, inplace=True)
            df.columns = df.columns.str.split('_').str[0]
            for var_id in df.columns[1:]:
                participant_ids = list(df["IID"][df[var_id] < 2])
                if len(participant_ids) > 0:
                    with open(f"{cohort}_{ancestry}.txt", "w") as file:
                        for item in participant_ids:
                            file.write(f"{item}\n")
                    ! dx upload {cohort}_{ancestry}.txt --path wgs_analysis/results/12_protective_variants/{var_id.replace(":","_")}/{cohort}_{ancestry}.txt


## APOE genotyping for each variant

In [None]:
%%bash

# APOE genotyping, step 1: for every carrier ID list (*.txt) found under a variant's results
# folder, submit a plink2 job that keeps only those participants (--keep), restricts the
# normalized VCF to chr19:44905791-44909393 and writes a bed/bim/fam set named after the ID list.
# NOTE(review): chr19_44892887_C_T is left out of the active list here, although the later
# APOE steps iterate over it - confirm this is intentional.
var_ids=("chr19_44905307_A_T" "chr19_44908756_C_A" "chr21_25897620_C_T" "chr21_26171645_A_G" "chr21_26171723_T_C")

for var_id in "${var_ids[@]}"; do
    id_lists=($(ls ../../mnt/project/wgs_analysis/results/12_protective_variants/${var_id}/*.txt 2>/dev/null))

    for id_list in "${id_lists[@]}"; do
        filename=$(basename "$id_list" .txt)

        dx run swiss-army-knife \
            -iin="/wgs_analysis/results/4_normalized/normalized.vcf" \
            -iin="/wgs_analysis/results/12_protective_variants/${var_id}/${filename}.txt" \
            -icmd="plink2 --vcf normalized.vcf --chr 19 --from-bp 44905791 --to-bp 44909393 --keep ${filename}.txt --set-all-var-ids 'chr@:#:\$r:\$a' --new-id-max-allele-len 999 --make-bed --out ${filename}" \
            --instance-type mem1_hdd1_v2_x16 \
            --destination "${projectid}:/wgs_analysis/results/12_protective_variants/${var_id}"
    done
done


In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
%%bash

# APOE genotyping, step 2: for every bed/bim/fam set found under a variant's results folder,
# submit a plink job that keeps only the variants listed in apoe_variants.txt and writes the
# result as apoe_snps_<name>.{bed,bim,fam} in the same folder.
for var_id in {"chr19_44892887_C_T","chr19_44905307_A_T","chr19_44908756_C_A","chr21_25897620_C_T","chr21_26171645_A_G","chr21_26171723_T_C"};
do
    files=($(ls ../../mnt/project/wgs_analysis/results/12_protective_variants/${var_id}/*.bim 2>/dev/null))
    
    for file in "${files[@]}";
    do
        filename=$(basename "$file" .bim)

        # The *.bim listing also matches the apoe_snps_<name>.bim files this step itself
        # produces. Skip them so that re-running the cell does not submit jobs that would
        # create apoe_snps_apoe_snps_<name> outputs.
        [[ "$filename" == apoe_snps_* ]] && continue
        
        dx run swiss-army-knife \
        -iin="/wgs_analysis/results/12_protective_variants/${var_id}/${filename}.bim" \
        -iin="/wgs_analysis/results/12_protective_variants/${var_id}/${filename}.bed" \
        -iin="/wgs_analysis/results/12_protective_variants/${var_id}/${filename}.fam" \
        -iin="/wgs_analysis/data/apoe_variants.txt" \
        -icmd="plink --bfile ${filename} --extract apoe_variants.txt --make-bed --out apoe_snps_${filename}" \
        --instance-type mem1_hdd1_v2_x16 \
        --destination "${projectid}:/wgs_analysis/results/12_protective_variants/${var_id}"
    done
done


In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
%%bash

# APOE genotyping, step 3: for every original bed/bim/fam set <name> under a variant's
# results folder, submit a plink job that recodes the matching apoe_snps_<name> set with
# --recode compound-genotypes, writing the output under the apoe_snps_<name> prefix.
# apoe_variants.txt is staged as an input but is not referenced by the command itself.
for var_id in {"chr19_44892887_C_T","chr19_44905307_A_T","chr19_44908756_C_A","chr21_25897620_C_T","chr21_26171645_A_G","chr21_26171723_T_C"};
do
    files=($(ls ../../mnt/project/wgs_analysis/results/12_protective_variants/${var_id}/*.bim 2>/dev/null))
    
    for file in "${files[@]}";
    do
        filename=$(basename "$file" .bim)

        # The *.bim listing matches both <name>.bim and apoe_snps_<name>.bim. For the latter
        # the derived inputs would be apoe_snps_apoe_snps_<name>.*, which do not exist, so
        # only the un-prefixed names are used to drive job submission.
        [[ "$filename" == apoe_snps_* ]] && continue
        
        dx run swiss-army-knife \
        -iin="/wgs_analysis/results/12_protective_variants/${var_id}/apoe_snps_${filename}.bim" \
        -iin="/wgs_analysis/results/12_protective_variants/${var_id}/apoe_snps_${filename}.bed" \
        -iin="/wgs_analysis/results/12_protective_variants/${var_id}/apoe_snps_${filename}.fam" \
        -iin="/wgs_analysis/data/apoe_variants.txt" \
        -icmd="plink --bfile apoe_snps_${filename} --recode compound-genotypes --out apoe_snps_${filename}" \
        --instance-type mem1_hdd1_v2_x16 \
        --destination "${projectid}:/wgs_analysis/results/12_protective_variants/${var_id}"
    done
done


In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
for var_id in ["chr19_44892887_C_T","chr19_44905307_A_T","chr19_44908756_C_A","chr21_25897620_C_T","chr21_26171645_A_G","chr21_26171723_T_C"]:
    ! mkdir {var_id}
    
    ! dx download wgs_analysis/results/12_protective_variants/{var_id}/*.ped --overwrite
    ! mv *.ped {var_id}/
    

In [None]:
! dx download wgs_analysis/data/APOE_genotypes_PLINK_ped.py --overwrite
for var_id in ["chr19_44892887_C_T","chr19_44905307_A_T","chr19_44908756_C_A","chr21_25897620_C_T","chr21_26171645_A_G","chr21_26171723_T_C"]:
    for ped_file in glob.glob(f"{var_id}/*"):
        ! python APOE_genotypes_PLINK_ped.py -i {ped_file} -o {ped_file.replace("snps","final")[:-4]}
        ! dx upload {ped_file.replace("snps","final")[:-4]}.APOE_GENOTYPES.csv --path wgs_analysis/results/12_protective_variants/{ped_file.replace("snps","final")[:-4]}.APOE_GENOTYPES.csv
        

In [None]:
for var_id in ["chr19_44892887_C_T","chr19_44905307_A_T","chr19_44908756_C_A","chr21_25897620_C_T","chr21_26171645_A_G","chr21_26171723_T_C"]:
    print(var_id)
    for cohort in ["AD","RD","Control"]:
        print(cohort)
        cohort_counts = []
        included_ancestries = []
        for ancestry in ["AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"]:
            if os.path.exists(f"{var_id}/apoe_final_{cohort}_{ancestry}.APOE_GENOTYPES.csv"):
                apoe_geno = pd.read_csv(f"{var_id}/apoe_final_{cohort}_{ancestry}.APOE_GENOTYPES.csv")
                counts = apoe_geno['APOE_GENOTYPE'].value_counts()
                counts = counts.reindex(["e1/e1", "e1/e2", "e1/e4", "e2/e2", "e2/e3", "e2/e4 or e1/e3", "e3/e3", "e3/e4", "e4/e4"], fill_value=0)
                counts = counts.rename(ancestry)
                counts = pd.concat([counts, pd.Series([counts.sum()], index=['total'])])
                cohort_counts.append(counts)
                included_ancestries.append(ancestry)
        if len(cohort_counts) > 0:
            cohort_counts = pd.concat(cohort_counts, axis=1)
            cohort_counts.columns = included_ancestries
            display(cohort_counts)
            cohort_percentages = cohort_counts.div(cohort_counts.loc["total"], axis=1) * 100
            cohort_percentages.loc["total"] = cohort_counts.loc["total"]
            display(cohort_percentages)
            cohort_combined = cohort_counts.applymap(str) + ' (' + cohort_percentages.applymap(lambda x: f'{x:.2f}%') + ')'
            cohort_combined.loc["total"] = cohort_counts.loc["total"]
            display(cohort_combined)
            cohort_counts.to_csv(f"{var_id}/{cohort}_apoe_genotype_counts.csv")
            cohort_percentages.to_csv(f"{var_id}/{cohort}_apoe_genotype_percentages.csv")
            cohort_combined.to_csv(f"{var_id}/{cohort}_apoe_genotype_combined.csv")
            ! dx upload {var_id}/{cohort}_apoe_genotype_counts.csv --path wgs_analysis/results/12_protective_variants/{var_id}/{cohort}_apoe_genotype_counts.csv
            ! dx upload {var_id}/{cohort}_apoe_genotype_percentages.csv --path wgs_analysis/results/12_protective_variants/{var_id}/{cohort}_apoe_genotype_percentages.csv
            ! dx upload {var_id}/{cohort}_apoe_genotype_combined.csv --path wgs_analysis/results/12_protective_variants/{var_id}/{cohort}_apoe_genotype_combined.csv
