# UKBiobank

* **Project:** ADRD Genetic Diversity in Biobanks
* **Version:** Python/3.9
* **Last Updated:** 24-FEB-2024

## Notebook Overview
Fetch cohorts, ancestry info, remove related individuals, filter non-WGS, combine pVCFs, normalize VCFs, annotate, allele freqs, APOE genotyping, phenotype data, resilience/protective variants

# Initialize Notebook

## Import packages

In [None]:
import pyspark
import dxdata
import dxpy
import pandas as pd
from datetime import date, datetime
import os 
import numpy as np
import random
import shutil
import glob
import requests
from functools import reduce
import subprocess
import pprint

# Create the Spark context for this notebook and wrap it in a SparkSession.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)


## Initialize helper functions

In [None]:
def fetch_gene_info_ensembl(gene_names, species='human', genome_version='GRCh38', timeout=30):
    """Look up genomic coordinates for gene symbols through the Ensembl REST API.

    Args:
        gene_names: Iterable of gene symbols to look up.
        species: Species segment of the ``/lookup/symbol`` endpoint.
        genome_version: Label stored verbatim in each result. It is NOT used
            to choose the server or assembly; every request goes to
            ``https://rest.ensembl.org``.
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        Dict keyed by the requested gene symbol. Each value holds
        ``gene_name``, ``chromosome`` (prefixed with ``chr``), integer
        ``start`` / ``end`` and ``genome_version``. Genes whose lookup fails
        are reported on stdout and left out of the result.
    """
    gene_info_dict = {}
    server = "https://rest.ensembl.org"
    headers = {"Content-Type": "application/json"}

    for gene_name in gene_names:
        endpoint = f"/lookup/symbol/{species}/{gene_name}"

        # Transport failures (timeout, DNS, connection reset) are handled the
        # same way as an HTTP error status: report the gene and move on.
        try:
            response = requests.get(
                server + endpoint,
                headers=headers,
                params={"expand": "1"},
                timeout=timeout,
            )
        except requests.RequestException as err:
            print(f"Fetching failed for {gene_name}: {err}")
            continue

        if not response.ok:
            print(f"Fetching failed for {gene_name}")
            continue

        data = response.json()
        gene_info_dict[gene_name] = {
            "gene_name": data.get("display_name", gene_name),
            "chromosome": f"chr{data['seq_region_name']}",
            "start": int(data["start"]),
            "end": int(data["end"]),
            "genome_version": genome_version,
        }

    return gene_info_dict


## Initialize variables

In [None]:
gene_names = [
    "APOE",
    "APP",
    "GBA1",
    "GRN",
    "MAPT",
    "PSEN1",
    "PSEN2",
    "SNCA",
    "TARDBP",
    "TBK1",
    "TREM2",
]

results_dir = "/results/dementia_project"

! dx find projects --name "wgs_analyses" > projectid.txt
projectid = open("projectid.txt", "r")
projectid = projectid.read()
projectid = projectid.split(" : ")[0]

gene_info = fetch_gene_info_ensembl(gene_names)
gene_names = sorted(gene_info.keys(), key=lambda gene: (int(gene_info[gene]["chromosome"][3:]), gene_info[gene]["start"]))


## Grab participant data

In [None]:
# Find the single Dataset object in the project root whose name matches
# "app*.dataset", load it, and keep its "participant" entity for field retrieval.
dispensed_dataset_id = dxpy.find_one_data_object(typename="Dataset", name="app*.dataset", folder="/", name_mode="glob")["id"]
dataset = dxdata.load_dataset(id=dispensed_dataset_id)
participant = dataset["participant"]


In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
--------------------- CREATE "00_id_files" SUBDIRECTORY WITHIN RESULTS DIRECTORY ---------------------
---------------------- CREATE "08_merged" SUBDIRECTORY WITHIN RESULTS DIRECTORY ----------------------
--------------- CREATE "10_pathogenic_variants" SUBDIRECTORY WITHIN RESULTS DIRECTORY ----------------
----------------- CREATE "11_phenotypic_data" SUBDIRECTORY WITHIN RESULTS DIRECTORY ------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

# Fetch cohorts

## Retrieve Cases

In [None]:
# Pull down the fields we need 
# https://docs.google.com/document/d/1AebkQ-Nxrk63jhsDzZpn5QD-7EK4unsykHVj-saEm3U/edit?usp=sharing
# The readable name noted beside each field is the one the next cell renames it to.

field_names = [
    "eid",  # ID
    "p31",  # GENETIC_SEX
    "p34",  # BIRTH_YEAR
    "p21022",  # AGE_OF_RECRUIT
    "p42018",  # DEM_DATE
    "p42020",  # AD_DATE
    "p40000_i0",  # DATE_OF_DEATH
    "p120042",  # COGNITIVE_SYMPTOMS_SEVERITY_PAST_WEEK
]
# Retrieve the fields with coded values replaced, then convert the result to pandas.
df_cases = participant.retrieve_fields(names=field_names, coding_values="replace", engine=dxdata.connect())
df_cases = df_cases.toPandas()


In [None]:
# Give the retrieved fields readable column names.

case_column_names = {
    "eid": "ID",
    "p31": "GENETIC_SEX",
    "p34": "BIRTH_YEAR",
    "p21022": "AGE_OF_RECRUIT",
    "p42018": "DEM_DATE",
    "p42020": "AD_DATE",
    "p40000_i0": "DATE_OF_DEATH",
    "p120042": "COGNITIVE_SYMPTOMS_SEVERITY_PAST_WEEK",
}
df_cases = df_cases.rename(columns=case_column_names)

# Store participant IDs as numbers so they can be matched against other tables.
df_cases["ID"] = pd.to_numeric(df_cases["ID"])


In [None]:
# Split cases into AD (has an AD date) and RD (has a dementia date but no AD date):

has_ad_date = df_cases["AD_DATE"].notna()
has_dem_date = df_cases["DEM_DATE"].notna()

df_ad = df_cases[has_ad_date]
df_rd = df_cases[~has_ad_date & has_dem_date]


## Retrieve Controls (Age 65+, no NDD, no parents with AD or PD)

### Pull down the fields we need 

In [None]:
# Date G10 first reported (huntington's disease),
# Date G11 first reported (hereditary ataxia), 
# Date G12 first reported (spinal muscular atrophy and related syndromes), 
# Date G13 first reported (systemic atrophies primarily affecting central nervous system in diseases classified elsewhere), 
# Date G14 first reported (postpolio syndrome), 
# Date G20 first reported (parkinson's disease), 
# Date G21 first reported (secondary parkinsonism), 
# Date G22 first reported (parkinsonism in diseases classified elsewhere), 
# Date G23 first reported (other degenerative diseases of basal ganglia), 
# Date G24 first reported (dystonia), 
# Date G25 first reported (other extrapyramidal and movement disorders), 
# Date G30 first reported (alzheimer's disease), 
# Date G31 first reported (other degenerative diseases of nervous system, not elsewhere classified), 
# Date G32 first reported (other degenerative disorders of nervous system in diseases classified elsewhere), 
# Date G35 first reported (multiple sclerosis), 
# Date G36 first reported (other acute disseminated demyelination), 
# Date G37 first reported (other demyelinating diseases of central nervous system), 
# Date G45 first reported (transient cerebral ischaemic attacks and related syndromes), 
# Date G46 first reported (vascular syndromes of brain in cerebrovascular diseases), 
# Date G50 first reported (disorders of trigeminal nerve), 
# Date G52 first reported (disorders of other cranial nerves), 
# Date G53 first reported (cranial nerve disorders in diseases classified elsewhere), 
# Date G54 first reported (nerve root and plexus disorders), 
# Date G55 first reported (nerve root and plexus compressions in diseases classified elsewhere), 
# Date G56 first reported (mononeuropathies of upper limb), 
# Date G57 first reported (mononeuropathies of lower limb), 
# Date G58 first reported (other mononeuropathies), 
# Date G59 first reported (mononeuropathy in diseases classified elsewhere), 
# Date G60 first reported (hereditary and idiopathic neuropathy), 
# Date G61 first reported (inflammatory polyneuropathy), 
# Date G62 first reported (other polyneuropathies), 
# Date G63 first reported (polyneuropathy in diseases classified elsewhere), 
# Date G64 first reported (other disorders of peripheral nervous system), 
# Date G70 first reported (myasthenia gravis and other myoneural disorders), 
# Date G71 first reported (primary disorders of muscles), 
# Date G72 first reported (other myopathies), 
# Date G73 first reported (disorders of myoneural junction and muscle in diseases classified elsewhere), 
# Date G80 first reported (infantile cerebral palsy), 
# Date G81 first reported (hemiplegia), 
# Date G82 first reported (paraplegia and tetraplegia), 
# Date G83 first reported (other paralytic syndromes), 
# Date G90 first reported (disorders of autonomic nervous system),
# Date G91 first reported (hydrocephalus), 
# Date G92 first reported (toxic encephalopathy), 
# Date G93 first reported (other disorders of brain), 
# Date G94 first reported (other disorders of brain in diseases classified elsewhere), 
# Date G96 first reported (other disorders of central nervous system), 
# Date G97 first reported (postprocedural disorders of nervous system, not elsewhere classified),  
# Date G98 first reported (other disorders of nervous system, not elsewhere classified), 
# Date G99 first reported (other disorders of nervous system in diseases classified elsewhere), 
# Date of all cause dementia report, 
# Date of alzheimer's disease report, 
# Date of vascular dementia report, 
# Date of frontotemporal dementia report, 
# Date of motor neurone disease report, 
# Date of all cause parkinsonism report, 
# Date of parkinson's disease report, 
# Date of progressive supranuclear palsy report, 
# Date of multiple system atrophy report, 
# Age at recruitment, 
# Sex, 

# NOTE: besides the fields described in the comment list above, this request also
# pulls p34, p40000_i0 and p120042 (renamed to BIRTH_YEAR, DATE_OF_DEATH and
# COGNITIVE_SYMPTOMS_SEVERITY_PAST_WEEK further down) and four instances each of
# p20110 / p20107, which the notebook later uses as the parent-illness columns.
field_names = [
    "eid", "p21022", "p31", "p34", "p40000_i0", "p131012", "p131016", "p131018", "p131020", "p131022", "p131024", 
    "p131026", "p131028", "p131030", "p131036", "p131038", "p131040", "p131042", "p131046", "p131056", "p131058", 
    "p131062", "p131066", "p131068", "p131070", "p131074", "p131076", "p131078", "p131080", "p131082", "p131084", 
    "p131086", "p131088", "p131090", "p131092", "p131094", "p131096", "p131098", "p131100", "p131102", "p131104", 
    "p131106", "p131108", "p131110", "p131112", "p131114", "p131116", "p131120", "p131122", "p131124", "p131126",  
    "p42018", "p42020", "p42022", "p42024", "p42028", "p42030", "p42032", "p42034", "p42036", "p20110_i0", 
    "p20110_i1", "p20110_i2", "p20110_i3", "p20107_i0", "p20107_i1", "p20107_i2", "p20107_i3", "p120042",
]
# Retrieve the fields with coded values replaced, then convert the result to pandas.
df_controls = participant.retrieve_fields(names=field_names, coding_values="replace", engine=dxdata.connect())
df_controls = df_controls.toPandas()


### Remove participants with any of the listed conditions

In [None]:
# Date columns for every excluded condition: a non-null value in any of them
# means the participant has that condition on record.
condition_date_cols = [
    'p131012', 'p131016', 'p131018', 'p131020',
    'p131022', 'p131024', 'p131026', 'p131028',
    'p131030', 'p131036', 'p131038', 'p131040',
    'p131042', 'p131046', 'p131056', 'p131058',
    'p131062', 'p131066', 'p131068', 'p131070',
    'p131074', 'p131076', 'p131078', 'p131080',
    'p131082', 'p131084', 'p131086', 'p131088',
    'p131090', 'p131092', 'p131094', 'p131096',
    'p131098', 'p131100', 'p131102', 'p131104',
    'p131106', 'p131108', 'p131110', 'p131112',
    'p131114', 'p131116', 'p131120', 'p131122',
    'p131124', 'p131126', 'p42018', 'p42020',
    'p42022', 'p42024', 'p42028', 'p42030',
    'p42032', 'p42034', 'p42036',
]

# Keep only participants for whom every one of those dates is missing.
has_no_condition = df_controls[condition_date_cols].isnull().all(axis=1)
df_controls = df_controls[has_no_condition]


### Remove participants whose parents have AD or PD

In [None]:
# Columns defining all instances of parent illness
parent_illness_cols = ['p20110_i0', 'p20110_i1', 'p20110_i2', 'p20110_i3', 'p20107_i0', 'p20107_i1', 'p20107_i2', 'p20107_i3']

# Convert None values to empty lists
for illness_col in parent_illness_cols:
    df_controls[illness_col] = df_controls[illness_col].apply(lambda l: l if isinstance(l, list) else [])

# Define a condition as anybody who has never reported a parent as having AD or PD
condition = lambda participant: all(("Alzheimer's disease/dementia" not in illnesses and "Parkinson's disease" not in illnesses) for illnesses in participant[parent_illness_cols])

# Apply the condition to give all participants who have a parent who has/had AD or PD
df_controls = df_controls[df_controls.apply(condition, axis=1)]


### Remove participants below the defined age threshold

In [None]:
# Controls must be at least 65 in field p21022 (renamed AGE_OF_RECRUIT below).
meets_age_threshold = df_controls['p21022'] >= 65
df_controls = df_controls[meets_age_threshold]


### Rename columns

In [None]:
# Columns kept for controls, mapped to their readable names (order is preserved).
control_column_names = {
    "eid": "ID",
    "p21022": "AGE_OF_RECRUIT",
    "p31": "GENETIC_SEX",
    "p34": "BIRTH_YEAR",
    "p40000_i0": "DATE_OF_DEATH",
    "p120042": "COGNITIVE_SYMPTOMS_SEVERITY_PAST_WEEK",
}

# Select and rename in one non-inplace chain. This yields a fresh frame, so the
# ID assignment below never writes into a slice of the filtered table.
df_controls = df_controls[list(control_column_names)].rename(columns=control_column_names)
df_controls["ID"] = pd.to_numeric(df_controls["ID"])


# Find ancestry information about each cohort

## Read ancestry label mappings

In [None]:
! dx download /data/ukbb_imputed_genotypes_umap_linearsvc_predicted_labels.txt --overwrite
! dx download /data/ukbb_imputed_genotypes_proj_pca.txt --overwrite

df_ancestries = pd.read_csv("ukbb_imputed_genotypes_umap_linearsvc_predicted_labels.txt", sep="\t")
df_ancestries.rename(columns={"label":"ancestry", "IID":"ID"}, inplace=True)

df_pcs = pd.read_csv("ukbb_imputed_genotypes_proj_pca.txt", sep="\t")
df_pcs.rename(columns={"IID":"ID"}, inplace=True)

df_covar = df_ancestries.merge(df_pcs, on="ID")
df_covar = df_covar[["ID","ancestry","PC1","PC2","PC3","PC4","PC5"]]


## Add labels to cohort dataframes

In [None]:
# Inner-join each cohort with the covariates; participants without a
# covariate row drop out of the cohort.
df_controls = df_controls.merge(df_covar, how="inner", on="ID")
df_ad = df_ad.merge(df_covar, how="inner", on="ID")
df_rd = df_rd.merge(df_covar, how="inner", on="ID")


## Get list of IDs for each cohort

In [None]:
# Plain Python lists of participant IDs, one per cohort.
ids_controls, ids_ad, ids_rd = (
    cohort_frame["ID"].tolist() for cohort_frame in (df_controls, df_ad, df_rd)
)


# Remove related individuals

## Fetch relatedness data

In [None]:
! dx download '/Bulk/Genotype\ Results/Genotype\ calls/ukb_rel.dat' --overwrite
df_full_related = pd.read_csv('ukb_rel.dat', sep = ' ')
df_full_related = df_full_related[df_full_related['Kinship'] > 0.0884]


## Define cohorts to maximize cases included

In [None]:
# Cases are AD plus RD; the full cohort is the cases followed by the controls.
ids_cases = ids_ad + ids_rd
ids_full_cohort = ids_cases + ids_controls


## Keep only rows with both participants in cohorts of interest

In [None]:
# A pair is relevant only when both members belong to the study cohort.
first_in_cohort = df_full_related['ID1'].isin(ids_full_cohort)
second_in_cohort = df_full_related['ID2'].isin(ids_full_cohort)
df_related_cohort = df_full_related[first_in_cohort & second_in_cohort].reset_index(drop=True)


## Maximize the number of cases included

In [None]:
# Pairs listed as (control, case) are swapped to (case, control), so the
# member in ID2 (the one removed later) is the control, not the case.
control_then_case = df_related_cohort["ID1"].isin(ids_controls) & df_related_cohort["ID2"].isin(ids_cases)

df_flipped = df_related_cohort[control_then_case].copy()
df_flipped.rename(columns={"ID1": "ID2", "ID2": "ID1"}, inplace=True)

df_related_cohort = pd.concat([df_related_cohort[~control_then_case], df_flipped])


## Get set of participants to remove

In [None]:
# The second member (ID2) of every remaining related pair is dropped. The
# preceding cell swaps control/case pairs so that the control sits in ID2.
ids_to_remove = set(df_related_cohort["ID2"])
print(f"Removing {len(ids_to_remove)} participants")


## Filter ID lists accordingly

In [None]:
def _without_removed(ids):
    """Return *ids* in their original order, minus anything in ids_to_remove."""
    return [iid for iid in ids if iid not in ids_to_remove]


ids_ad = _without_removed(ids_ad)
ids_rd = _without_removed(ids_rd)
ids_controls = _without_removed(ids_controls)
ids_total = ids_ad + ids_rd + ids_controls


## Save the IDs of each participant to a txt file

In [None]:
# One AD participant ID per line.
with open('ad_ids_pre_VCF.txt', 'w') as file:
    file.writelines(f"{iid}\n" for iid in ids_ad)
        

In [None]:
# One RD participant ID per line.
with open('rd_ids_pre_VCF.txt', 'w') as file:
    file.writelines(f"{iid}\n" for iid in ids_rd)


In [None]:
# One control participant ID per line.
with open('control_ids_pre_VCF.txt', 'w') as file:
    file.writelines(f"{iid}\n" for iid in ids_controls)


In [None]:
# One participant ID per line, all cohorts combined.
with open('ids_pre_VCF.txt', 'w') as file:
    file.writelines(f"{iid}\n" for iid in ids_total)


# Filter out participants without WGS data

## Find participants without WGS data

In [None]:
# Launch a Swiss Army Knife job that writes every sample ID found in one
# DRAGEN pVCF chunk to pvcf_full_ids.txt in the results directory.

# Raw string: the backslash-escaped spaces must reach `dx` unchanged, and
# "\ " is not a valid escape sequence in an ordinary Python string literal.
pvcf_chr1_dir = r"/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr1"

cmd = "dx run swiss-army-knife "
cmd += f"-iin='{pvcf_chr1_dir}/ukb24310_c1_b1_v1.vcf.gz' "
cmd += f"-iin='{pvcf_chr1_dir}/ukb24310_c1_b1_v1.vcf.gz.tbi' "
# The former '/results/african_pd/combined_ids.txt' input was dropped: the
# command below never reads it, and the job cannot start if the file is missing.
cmd += "-icmd='bcftools query -l ukb24310_c1_b1_v1.vcf.gz > pvcf_full_ids.txt' "
cmd += "--instance-type mem1_hdd1_v2_x2 "
cmd += f"--destination '{projectid}:{results_dir}'"

result = subprocess.run(
    cmd,
    shell=True,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)

# Report launch failures the same way the other job cells in this notebook do.
if result.returncode != 0:
    print("Error running command:")
    print(result.stderr.decode("utf-8"))


In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

## Filter ID lists and clinical data to only include participants with WGS data

In [None]:
# Fetch the pVCF sample list, then keep only the cohort IDs that appear in it.
# grep flags: -F fixed strings, -w whole-word matches, -f patterns read from file.
! dx download {results_dir}/pvcf_full_ids.txt --overwrite
! grep -Fwf pvcf_full_ids.txt ids_pre_VCF.txt > filtered_sample_ids.txt
! grep -Fwf pvcf_full_ids.txt ad_ids_pre_VCF.txt > filtered_ad_ids.txt
! grep -Fwf pvcf_full_ids.txt rd_ids_pre_VCF.txt > filtered_rd_ids.txt
! grep -Fwf pvcf_full_ids.txt control_ids_pre_VCF.txt > filtered_control_ids.txt

# Publish the filtered ID lists to the results directory under their final names.
# NOTE(review): re-running this cell may leave several project files with the
# same name, because no existing file is removed first -- confirm and clean up if so.
! dx upload filtered_sample_ids.txt --path {results_dir}/sample_ids.txt
! dx upload filtered_ad_ids.txt --path {results_dir}/ad_ids.txt
! dx upload filtered_rd_ids.txt --path {results_dir}/rd_ids.txt
! dx upload filtered_control_ids.txt --path {results_dir}/control_ids.txt


In [None]:
def _read_id_file(path):
    """Return the integer IDs stored one per line in *path*."""
    with open(path, 'r') as handle:
        return [int(row.strip()) for row in handle]


ids_ad = _read_id_file('filtered_ad_ids.txt')
ids_rd = _read_id_file('filtered_rd_ids.txt')
ids_controls = _read_id_file('filtered_control_ids.txt')

print(f"Number of AD participants:       {len(ids_ad)}")
print(f"Number of RD participants:       {len(ids_rd)}")
print(f"Number of Control participants:  {len(ids_controls)}")


In [None]:
# Restrict each cohort table to the participants that are in the filtered ID lists.
ad_has_wgs = df_ad["ID"].isin(ids_ad)
rd_has_wgs = df_rd["ID"].isin(ids_rd)
control_has_wgs = df_controls["ID"].isin(ids_controls)

df_ad = df_ad.loc[ad_has_wgs]
df_rd = df_rd.loc[rd_has_wgs]
df_controls = df_controls.loc[control_has_wgs]


In [None]:
# AGE for cases = year of the diagnosis date minus birth year; cases with AGE
# below 20 are dropped. `assign` returns a new frame, so no column is ever
# written into a slice of a previously filtered table.
df_ad = df_ad.assign(AGE=pd.to_datetime(df_ad["AD_DATE"]).dt.year - df_ad["BIRTH_YEAR"])
df_ad = df_ad[df_ad["AGE"] >= 20]
df_rd = df_rd.assign(AGE=pd.to_datetime(df_rd["DEM_DATE"]).dt.year - df_rd["BIRTH_YEAR"])
df_rd = df_rd[df_rd["AGE"] >= 20]

# Latest diagnosis year among the retained cases; it stands in for the
# missing death year of controls with no recorded death.
most_recent_disease_year = max(
    pd.to_datetime(df_ad["AD_DATE"]).dt.year.max(),
    pd.to_datetime(df_rd["DEM_DATE"]).dt.year.max(),
)

# AGE for controls = year of death (or the stand-in year above) minus birth year.
control_end_year = pd.to_datetime(df_controls["DATE_OF_DEATH"]).dt.year.fillna(most_recent_disease_year)
df_controls = df_controls.assign(AGE=control_end_year - df_controls["BIRTH_YEAR"])


In [None]:
df_ad.to_csv(f'ad_cases.txt', header=True, index=False, sep="\t")
df_rd.to_csv(f'rd_cases.txt', header=True, index=False, sep="\t")
df_controls.to_csv("controls.txt", header=True, index=False, sep="\t")

! dx upload ad_cases.txt --path {results_dir}/ad_cases.txt
! dx upload rd_cases.txt --path {results_dir}/rd_cases.txt
! dx upload controls.txt --path {results_dir}/controls.txt


# Print cohort statistics

## Ancestry Distribution

In [None]:
# Participant counts per ancestry label, one section per cohort.
print("----- ANCESTRY DISTRIBUTION -----")
for section_title, cohort_frame in (("AD CASES:", df_ad), ("RD CASES:", df_rd), ("CONTROLS:", df_controls)):
    print(section_title)
    print(cohort_frame["ancestry"].value_counts())
    print("\n")


## Sex Distribution

In [None]:
# Participant counts per genetic sex, one section per cohort.
print("----- SEX DISTRIBUTION -----")
for section_title, cohort_frame in (("AD CASES:", df_ad), ("RD CASES:", df_rd), ("CONTROLS:", df_controls)):
    print(section_title)
    print(cohort_frame["GENETIC_SEX"].value_counts())
    print("\n")


## Age distribution

In [None]:
# Mean and standard deviation of AGE for each cohort and sex.
print("----- AGE DISTRIBUTION -----")
for cohort_label, cohort_frame in (("AD cases", df_ad), ("RD cases", df_rd), ("controls", df_controls)):
    for sex in ("Male", "Female"):
        ages = cohort_frame[cohort_frame["GENETIC_SEX"] == sex]["AGE"]
        # Pad the label to 17 characters so the numbers line up in one column.
        row_label = f"{sex} {cohort_label}:".ljust(17)
        print(f'{row_label}{ages.mean():.2f} +/- {ages.std():.2f}')
print("\n")


# Fetch pVCF chunks for each gene of interest

In [None]:
with open('gene_ranges.txt', 'w') as file:
    for gene_name in gene_names:
        chrom = gene_info[gene_name]["chromosome"]
        start = gene_info[gene_name]["start"]
        end = gene_info[gene_name]["end"]
        file.write(f"{chrom}\t{start}\t{end}\n")
! dx upload gene_ranges.txt --path {results_dir}/gene_ranges.txt


In [None]:
# For each gene, launch one Swiss Army Knife job per pVCF chunk that may
# overlap it. Each job subsets the chunk to the gene ranges and to the cohort
# samples, and writes <gene>_b<chunk>.vcf.gz to 01_pvcf_chunks.
for gene_name in gene_names:
    print(gene_name)
    start = gene_info[gene_name]["start"]
    end = gene_info[gene_name]["end"]
    # Chunk index = position // 20000, widened by one chunk on each side.
    # NOTE(review): start_bval is not clamped, so a gene within the first
    # 20 kb of a chromosome would request chunk -1 -- confirm if that can occur.
    start_bval = start // 20000 - 1
    end_bval = end // 20000 + 1
    chrom = gene_info[gene_name]['chromosome']
    print(f"Chromosome:    {chrom}")
    print(f"Start b-val:   {start_bval}")
    print(f"End b-val:     {end_bval}")

    # Raw f-string: the backslash-escaped spaces must reach `dx` unchanged, and
    # "\ " is not a valid escape sequence in an ordinary Python string literal.
    pvcf_dir = rf"/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/{chrom}"

    for b_val in range(start_bval, end_bval + 1):
        pvcf_name = f"ukb24310_c{chrom[3:]}_b{b_val}_v1.vcf.gz"

        cmd = "dx run swiss-army-knife "
        cmd += f"-iin='{pvcf_dir}/{pvcf_name}' "
        cmd += f"-iin='{pvcf_dir}/{pvcf_name}.tbi' "
        cmd += f"-iin='{results_dir}/gene_ranges.txt' "
        cmd += f"-iin='{results_dir}/sample_ids.txt' "
        cmd += f"-icmd='bcftools view -R gene_ranges.txt -O z -S sample_ids.txt {pvcf_name} -o {gene_name}_b{b_val}.vcf.gz' "
        cmd += "--instance-type mem2_ssd1_v2_x4 "
        cmd += f"--destination '{projectid}:{results_dir}/01_pvcf_chunks'"

        result = subprocess.run(
            cmd,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        if result.returncode != 0:
            print(f"Error running command for {gene_name} (b_val={b_val}):")
            print(result.stderr.decode("utf-8"))

        print(b_val)
    print()
    

# Combine pVCF chunks

In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
# Collect the chunk file names in gene order, using the same chunk range as
# the fetch step (position // 20000, widened by one chunk on each side).
chunk_files = []
for gene_name in gene_names:
    first_chunk = gene_info[gene_name]["start"] // 20000 - 1
    last_chunk = gene_info[gene_name]["end"] // 20000 + 1
    chunk_files.extend(f"{gene_name}_b{chunk}.vcf.gz" for chunk in range(first_chunk, last_chunk + 1))

# One job takes every chunk as input and concatenates them into concat.vcf.gz.
input_flags = "".join(f"-iin='{results_dir}/01_pvcf_chunks/{chunk_file}' " for chunk_file in chunk_files)
icmd = "-icmd='bcftools concat -O z " + "".join(f"{chunk_file} " for chunk_file in chunk_files) + "-o concat.vcf.gz' "

cmd = (
    "dx run swiss-army-knife "
    + input_flags
    + icmd
    + "--instance-type mem2_ssd1_v2_x32 "
    + f"--destination '{projectid}:{results_dir}/02_pvcf_concat'"
)

result = subprocess.run(cmd, shell=True, capture_output=True)

if result.returncode != 0:
    print("Error running command:")
    print(result.stderr.decode("utf-8"))
    

# Normalize VCFs before annotation

### Split multiallelic sites into biallelic records

In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
# Job: `bcftools norm -m-both` on the concatenated VCF, writing biallelic.vcf.gz.
cmd = " ".join([
    "dx run swiss-army-knife",
    f"-iin='{results_dir}/02_pvcf_concat/concat.vcf.gz'",
    "-icmd='bcftools norm -m-both -O z -o biallelic.vcf.gz concat.vcf.gz'",
    "--instance-type mem2_ssd1_v2_x16",
    f"--destination '{projectid}:{results_dir}/03_pvcf_normalized'",
])

result = subprocess.run(cmd, shell=True, capture_output=True)

if result.returncode != 0:
    print("Error running command:")
    print(result.stderr.decode("utf-8"))
    

### Left-align and normalize

In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
# Job: `bcftools norm -f <reference>` on the biallelic VCF, writing normalized.vcf.gz.
cmd = " ".join([
    "dx run swiss-army-knife",
    f"-iin='{results_dir}/03_pvcf_normalized/biallelic.vcf.gz'",
    "-iin='/data/Homo_sapiens_assembly38.fasta'",
    "-icmd='bcftools norm -f Homo_sapiens_assembly38.fasta -O z -o normalized.vcf.gz biallelic.vcf.gz'",
    "--instance-type mem2_ssd1_v2_x16",
    f"--destination '{projectid}:{results_dir}/03_pvcf_normalized'",
])

result = subprocess.run(cmd, shell=True, capture_output=True)

if result.returncode != 0:
    print("Error running command:")
    print(result.stderr.decode("utf-8"))


# Generate plink files for each ancestry-phenotype pair

## Subset IDs for all cohort-ancestry combinations

In [None]:
# Fetch the cohort tables from the results directory, replacing any local
# copies, so this section can run from a fresh session.
! dx download {results_dir}/ad_cases.txt --overwrite
! dx download {results_dir}/rd_cases.txt --overwrite
! dx download {results_dir}/controls.txt --overwrite


In [None]:
# Load the three tab-separated cohort tables.
df_ad, df_rd, df_controls = (
    pd.read_csv(table_path, sep="\t")
    for table_path in ("ad_cases.txt", "rd_cases.txt", "controls.txt")
)


In [None]:
# Only the participant ID and ancestry label are needed from here on.
id_ancestry_cols = ["ID", "ancestry"]
df_ad = df_ad[id_ancestry_cols]
df_rd = df_rd[id_ancestry_cols]
df_controls = df_controls[id_ancestry_cols]


In [None]:
pheno_ancestry_combos = []

for ancestry in df_ad["ancestry"].unique():
    ids = df_ad[df_ad["ancestry"] == ancestry]["ID"]
    with open(f"ad_ids_{ancestry}.txt", 'w') as file:
        for iid in ids:
            file.write(f"{iid}\n")
    ! dx upload ad_ids_{ancestry}.txt --path {results_dir}/00_id_files/ad_ids_{ancestry}.txt
    pheno_ancestry_combos.append(["ad", ancestry])

for ancestry in df_rd["ancestry"].unique():
    ids = df_rd[df_rd["ancestry"] == ancestry]["ID"]
    with open(f"rd_ids_{ancestry}.txt", 'w') as file:
        for iid in ids:
            file.write(f"{iid}\n")
    ! dx upload rd_ids_{ancestry}.txt --path {results_dir}/00_id_files/rd_ids_{ancestry}.txt
    pheno_ancestry_combos.append(["rd", ancestry])

for ancestry in df_controls["ancestry"].unique():
    ids = df_controls[df_controls["ancestry"] == ancestry]["ID"]
    with open(f"control_ids_{ancestry}.txt", 'w') as file:
        for iid in ids:
            file.write(f"{iid}\n")
    ! dx upload control_ids_{ancestry}.txt --path {results_dir}/00_id_files/control_ids_{ancestry}.txt
    pheno_ancestry_combos.append(["control", ancestry])


## Generate plink files

In [None]:
# One job per phenotype/ancestry pair: convert the normalized VCF to PLINK 2
# pgen files restricted to that group's IDs, with variant IDs set from the
# "chr@:#:$r:$a" template.
for pheno, ancestry in pheno_ancestry_combos:
    cmd = " ".join([
        "dx run swiss-army-knife",
        f"-iin='{results_dir}/03_pvcf_normalized/normalized.vcf.gz'",
        f"-iin='{results_dir}/00_id_files/{pheno}_ids_{ancestry}.txt'",
        f"-icmd='plink2 --vcf normalized.vcf.gz --set-all-var-ids \"chr@:#:\\$r:\\$a\" --new-id-max-allele-len 999 --keep {pheno}_ids_{ancestry}.txt --make-pgen --out {pheno}_{ancestry}'",
        "--instance-type mem2_ssd1_v2_x32",
        f"--destination '{projectid}:{results_dir}/04_plink'",
    ])

    result = subprocess.run(cmd, shell=True, capture_output=True)

    if result.returncode != 0:
        print("Error running command:")
        print(result.stderr.decode("utf-8"))


# Annotation

## Drop per-sample genotypes to create a sites-only VCF for annotation

In [None]:
# Job: `bcftools view -G` drops the per-sample genotype columns, leaving a
# sites-only VCF (annovar_input.vcf.gz) as the annotation input.
cmd = " ".join([
    "dx run swiss-army-knife",
    f"-iin='{results_dir}/03_pvcf_normalized/normalized.vcf.gz'",
    "-icmd='bcftools view -O z -G normalized.vcf.gz -o annovar_input.vcf.gz'",
    "--instance-type mem2_ssd1_v2_x4",
    f"--destination '{projectid}:{results_dir}/05_annotated'",
])

result = subprocess.run(cmd, shell=True, capture_output=True)

if result.returncode != 0:
    print("Error running command:")
    print(result.stderr.decode("utf-8"))


## Fetch Annovar libraries and reference genome data

In [None]:
%%capture

# Download and unpack ANNOVAR, then make its Perl scripts executable.
! wget http://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz
! tar -xzf annovar.latest.tar.gz
! chmod a+x ./annovar/*.pl
# Download the hg38 annotation databases into annovar/humandb/. The same
# database names are passed to --protocol in the annotation step.
! annovar/annotate_variation.pl -downdb -buildver hg38 -webfrom annovar refGene annovar/humandb/
! annovar/annotate_variation.pl -downdb -buildver hg38 -webfrom annovar avsnp151 annovar/humandb/
! annovar/annotate_variation.pl -downdb -buildver hg38 -webfrom annovar clinvar_20240917 annovar/humandb/
! annovar/annotate_variation.pl -downdb -buildver hg38 -webfrom annovar dbnsfp47a annovar/humandb/
! annovar/annotate_variation.pl -downdb -buildver hg38 -webfrom annovar dbnsfp47a_interpro annovar/humandb/
! annovar/annotate_variation.pl -downdb -buildver hg38 -webfrom annovar gnomad41_genome annovar/humandb/
# Reference genome FASTA with its .fai index and .dict sequence dictionary.
! dx download /data/Homo_sapiens_assembly38.fasta --overwrite
! dx download /data/Homo_sapiens_assembly38.fasta.fai --overwrite
! dx download /data/Homo_sapiens_assembly38.dict --overwrite


In [None]:
! dx download {results_dir}/05_annotated/annovar_input.vcf.gz


## Perform annotation

In [None]:
# Run table_annovar.pl locally on the sites-only VCF: gene-based annotation
# for refGene ("g"), filter-based ("f") for the remaining databases.
annovar_args = [
    "annovar/table_annovar.pl annovar_input.vcf.gz annovar/humandb/",
    "--buildver hg38",
    "--thread 36",
    "--remove",
    "--protocol refGene,avsnp151,clinvar_20240917,dbnsfp47a,dbnsfp47a_interpro,gnomad41_genome",
    "--operation g,f,f,f,f,f",
    "--nopolish",
    "--nastring .",
    "--out annotated",
    "--vcfinput",
]
cmd = " ".join(annovar_args) + " "

result = subprocess.run(cmd, shell=True, capture_output=True)

if result.returncode != 0:
    print("Error running command:")
    print(result.stderr.decode("utf-8"))


In [None]:
# Give the Annovar multianno table a shorter name and store it in the
# 05_annotated results folder.
! mv annotated.hg38_multianno.txt annotated.txt
! dx upload annotated.txt --path {results_dir}/05_annotated/annotated.txt


# Allele frequencies

In [None]:
# Submit one swiss-army-knife job per phenotype/ancestry cohort that runs
# plink2 --freq on the cohort's pgen/pvar/psam fileset; outputs land in
# the 06_frequencies folder.
for pheno_ancestry_combo in pheno_ancestry_combos:
    pheno, ancestry = pheno_ancestry_combo[0], pheno_ancestry_combo[1]
    cohort = f"{pheno}_{ancestry}"

    plink_inputs = [
        f"-iin='{results_dir}/04_plink/{cohort}.{ext}'"
        for ext in ("pgen", "pvar", "psam")
    ]
    cmd = " ".join([
        "dx run swiss-army-knife",
        *plink_inputs,
        f"-icmd='plink2 --pfile {cohort} --freq --out {cohort}'",
        "--instance-type mem2_ssd1_v2_x4",
        f"--destination '{projectid}:{results_dir}/06_frequencies'",
    ])

    result = subprocess.run(cmd, shell=True, capture_output=True)

    # A failed submission is reported; the loop continues with the next cohort.
    if result.returncode:
        print("Error running command:")
        print(result.stderr.decode("utf-8"))


In [None]:
pheno_ancestry_combos_arr = np.array(pheno_ancestry_combos)

df_merged = None
for pheno in ["ad","rd","control"]:
    ancestries = pheno_ancestry_combos_arr[pheno_ancestry_combos_arr[:,0] == pheno][:,1]

    for ancestry in ancestries:
        ! dx download {results_dir}/06_frequencies/{pheno}_{ancestry}.afreq --overwrite
        
        df = pd.read_csv(f"{pheno}_{ancestry}.afreq", sep="\t")
        df = df[["ID","ALT_FREQS","OBS_CT"]]
        df.rename({"ALT_FREQS":f"ALT_FREQS_{ancestry}_{pheno.upper()}", "OBS_CT":f"OBS_CT_{ancestry}_{pheno.upper()}"}, inplace=True, axis=1)

        if df_merged is None:
            df_merged = df
            print(df_merged.shape)
        else:
            df_merged = df_merged.merge(df, on="ID")
            print(df_merged.shape)
            
        ! rm {pheno}_{ancestry}.afreq

df_merged.to_csv(f"frequencies.txt", index=False, sep="\t")
! dx upload frequencies.txt --path {results_dir}/06_frequencies/frequencies.txt


# Zygosity

## Recode files

In [None]:
# Submit one swiss-army-knife job per cohort that exports additive
# genotype codes (plink2 --export A) using the previously computed
# allele frequencies; the .raw outputs land in the 07_zygosity folder.
for pheno_ancestry_combo in pheno_ancestry_combos:
    pheno, ancestry = pheno_ancestry_combo[0], pheno_ancestry_combo[1]
    cohort = f"{pheno}_{ancestry}"

    job_inputs = [
        f"-iin='{results_dir}/04_plink/{cohort}.{ext}'"
        for ext in ("pgen", "pvar", "psam")
    ]
    job_inputs.append(f"-iin='{results_dir}/06_frequencies/{cohort}.afreq'")

    cmd = " ".join([
        "dx run swiss-army-knife",
        *job_inputs,
        f"-icmd='plink2 --pfile {cohort} --read-freq {cohort}.afreq --export A --out {cohort}'",
        "--instance-type mem2_ssd1_v2_x4",
        f"--destination '{projectid}:{results_dir}/07_zygosity'",
    ])

    result = subprocess.run(cmd, shell=True, capture_output=True)

    # A failed submission is reported; the loop continues with the next cohort.
    if result.returncode:
        print("Error running command:")
        print(result.stderr.decode("utf-8"))


## Find homozygous/heterozygous counts

In [None]:
%%bash

# For every phenotype/ancestry cohort, download its PLINK .raw genotype
# export and write counts_<pheno>_<ancestry>.txt: the header row of variant
# columns followed by three rows holding, per variant, the number of samples
# coded 2, 1 and 0 (in that order). Cohorts without a .raw file are skipped.
for pheno in {"ad","rd","control"};
do
    for ancestry in {"AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"};
    do
        raw_file="${pheno}_${ancestry}.raw"

        dx download "/results/dementia_project/07_zygosity/${raw_file}" --overwrite ||
        {
            echo "No file found at /results/dementia_project/07_zygosity/${raw_file}"; continue;
        }

        output_file="counts_${pheno}_${ancestry}.txt"
        input_file="cut.raw"

        # Columns 1-6 are sample metadata; variant columns start at column 7.
        grep "FID" "${raw_file}" | cut -d$'\t' -f7- > header.tmp
        cut -d$'\t' -f7- "${raw_file}" > "${input_file}"

        # awk keeps its own per-column tallies. The header row (NR == 1) is
        # skipped explicitly; missing codes (e.g. NA) match no branch and are
        # therefore not counted. Unset tallies print as 0 via %d.
        awk -F'\t' '
        NR > 1 {
            for (i=1; i<=NF; i++) {
                if ($i == 2) count_2[i]++;
                else if ($i == 1) count_1[i]++;
                else if ($i == 0) count_0[i]++;
            }
        }
        END {
            for (i=1; i<=NF; i++) {
                printf("%d", count_2[i]);
                if (i<NF) printf("\t");
            }
            print "";

            for (i=1; i<=NF; i++) {
                printf("%d", count_1[i]);
                if (i<NF) printf("\t");
            }
            print "";

            for (i=1; i<=NF; i++) {
                printf("%d", count_0[i]);
                if (i<NF) printf("\t");
            }
            print "";
        }' "${input_file}" > count.tmp

        cat header.tmp count.tmp > "${output_file}"
        rm "${input_file}" count.tmp header.tmp "${raw_file}"
        echo "Counts have been appended to $output_file"
    done
done


In [None]:
# Upload each cohort's per-variant genotype count table (written by the
# bash cell above) to the 07_zygosity results folder.
for pheno_ancestry_combo in pheno_ancestry_combos:
    pheno = pheno_ancestry_combo[0]
    ancestry = pheno_ancestry_combo[1]
    ! dx upload counts_{pheno}_{ancestry}.txt --path {results_dir}/07_zygosity/counts_{pheno}_{ancestry}.txt


In [None]:
for pheno in ["ad","rd","control"]:
    zyg_pheno = []
    for ancestry in ["AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"]:
        try:
            df = pd.read_csv(f"counts_{pheno}_{ancestry}.txt", sep="\t")
            variant_ids = df.columns.values
            rename_dict = {}
            for vid in variant_ids:
                rename_dict[vid] = vid.split("_")[0]
            df.rename(rename_dict, axis=1, inplace=True)
            zyg_pheno.append(df)
        except:
            print(f"No data found at counts_{pheno}_{ancestry}.txt")
    result_zyg = reduce(lambda x, y: x + y, zyg_pheno)
    result_zyg.to_csv(f"{pheno}_counts.txt", index=False, sep="\t")
    ! dx upload {pheno}_counts.txt --path {results_dir}/07_zygosity/{pheno}_counts.txt


In [None]:
# Fetch the per-phenotype count tables produced by the previous cell.
! dx download {results_dir}/07_zygosity/ad_counts.txt
! dx download {results_dir}/07_zygosity/rd_counts.txt
! dx download {results_dir}/07_zygosity/control_counts.txt


In [None]:
ad_zyg = pd.read_csv(f"ad_counts.txt", sep="\t")
rd_zyg = pd.read_csv(f"rd_counts.txt", sep="\t")
control_zyg = pd.read_csv(f"control_counts.txt", sep="\t")

ad_zyg = ad_zyg.T
rd_zyg = rd_zyg.T
control_zyg = control_zyg.T

ad_zyg = ad_zyg.reset_index()
rd_zyg = rd_zyg.reset_index()
control_zyg = control_zyg.reset_index()

ad_zyg.rename(columns={'index': 'ID', 0: 'AD_Homozygous_Ref', 1: 'AD_Heterozygous', 2: 'AD_Homozygous_Alt'}, inplace=True)
rd_zyg.rename(columns={'index': 'ID', 0: 'RD_Homozygous_Ref', 1: 'RD_Heterozygous', 2: 'RD_Homozygous_Alt'}, inplace=True)
control_zyg.rename(columns={'index': 'ID', 0: 'Control_Homozygous_Ref', 1: 'Control_Heterozygous', 2: 'Control_Homozygous_Alt'}, inplace=True)

final_zyg = ad_zyg.merge(rd_zyg, on="ID")
final_zyg = final_zyg.merge(control_zyg, on="ID")

final_zyg.to_csv("zygosity.txt", index=False, sep="\t")
! dx upload zygosity.txt --path {results_dir}/07_zygosity/zygosity.txt


# Merge annotation, frequency, and zygosity files

In [None]:
# Fetch the three per-variant tables that the next cell combines.
! dx download {results_dir}/05_annotated/annotated.txt --overwrite
! dx download {results_dir}/06_frequencies/frequencies.txt --overwrite
! dx download {results_dir}/07_zygosity/zygosity.txt --overwrite


In [None]:
df_anno = pd.read_csv("annotated.txt", sep="\t")
df_anno.insert(1, "ID", df_anno[["Chr","Start","Ref","Alt"]].astype(str).agg(':'.join, axis=1))
df_anno = df_anno[[col for col in df_anno.columns if "Otherinfo" not in col]]

df_freq = pd.read_csv("frequencies.txt", sep="\t")
df_freq.drop(columns="ID", inplace=True)

df_zyg = pd.read_csv("zygosity.txt", sep="\t")
df_zyg.drop(columns="ID", inplace=True)

df_merged = pd.concat([df_anno, df_freq, df_zyg], axis=1)
df_merged.to_csv("merged.txt", index=False, sep="\t")
! dx upload merged.txt --path {results_dir}/08_merged/merged.txt


# Filter variants

## Only include exonic/splicing variants present in cases

In [None]:
df_filtered = df_merged[df_merged["Func.refGene"].isin(["exonic", "splicing"])]

ctrl_col_names = [f"ALT_FREQS_{ancestry}_CONTROL" for ancestry in ancestries if f"ALT_FREQS_{ancestry}_CONTROL" in df_filtered.columns.values]
ad_col_names = [f"ALT_FREQS_{ancestry}_AD" for ancestry in ancestries if f"ALT_FREQS_{ancestry}_AD" in df_filtered.columns.values]
rd_col_names = [f"ALT_FREQS_{ancestry}_RD" for ancestry in ancestries if f"ALT_FREQS_{ancestry}_RD" in df_filtered.columns.values]

df_filtered = df_filtered[(df_filtered[ad_col_names + rd_col_names]>0).any(axis=1)]
df_filtered["Disease"] = ""
df_filtered.loc[(df_filtered[ad_col_names]>0).any(axis=1), "Disease"] = "AD"
df_filtered.loc[(df_filtered[rd_col_names]>0).any(axis=1), "Disease"] = "RD"
df_filtered.loc[(df_filtered[ad_col_names]>0).any(axis=1) & (df_filtered[rd_col_names]>0).any(axis=1), "Disease"] = "Both"
display(df_filtered)

df_filtered.to_csv("variants_in_cases.txt", index=False, sep="\t")
! dx upload variants_in_cases.txt --path {results_dir}/08_merged/variants_in_cases.txt


## Remove variants present in controls

In [None]:
df_filtered = df_filtered[(df_filtered["Control_Heterozygous"] == 0) & (df_filtered["Control_Homozygous_Alt"] == 0)]
df_filtered.to_csv("variants_in_cases_nocontrols.txt", index=False, sep="\t")
! dx upload variants_in_cases_nocontrols.txt --path {results_dir}/08_merged/variants_in_cases_nocontrols.txt


# APOE Genotyping

In [None]:
# Submit one swiss-army-knife job per cohort that extracts the variants
# listed in apoe_variants.txt and exports them (bed fileset plus
# compound-genotypes export) into the 09_apoe_genotyping folder.
for pheno_ancestry_combo in pheno_ancestry_combos:
    pheno, ancestry = pheno_ancestry_combo[0], pheno_ancestry_combo[1]
    cohort = f"{pheno}_{ancestry}"

    job_inputs = [
        f"-iin='{results_dir}/04_plink/{cohort}.{ext}'"
        for ext in ("pvar", "psam", "pgen")
    ]
    job_inputs.append("-iin='/data/apoe_variants.txt'")

    cmd = " ".join([
        "dx run swiss-army-knife",
        *job_inputs,
        f"-icmd='plink2 --pfile {cohort} --extract apoe_variants.txt --make-bed --export compound-genotypes --out apoe_snps_{cohort}'",
        "--instance-type mem2_ssd1_v2_x4",
        f"--destination '{projectid}:{results_dir}/09_apoe_genotyping'",
    ])

    result = subprocess.run(cmd, shell=True, capture_output=True)

    # A failed submission is reported; the loop continues with the next cohort.
    if result.returncode:
        print("Error running command:")
        print(result.stderr.decode("utf-8"))


In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
# Download the APOE genotyping helper script, then for each cohort:
# fetch its exported .ped file, run the script on it, upload the resulting
# *.APOE_GENOTYPES.csv and delete the local .ped copy.
# The CSV files stay on local disk; the next cell reads them from there.
! dx download /data/APOE_genotypes_PLINK_ped.py --overwrite
for pheno_ancestry_combo in pheno_ancestry_combos:
    pheno = pheno_ancestry_combo[0]
    ancestry = pheno_ancestry_combo[1]
    ! dx download {results_dir}/09_apoe_genotyping/apoe_snps_{pheno}_{ancestry}.ped --overwrite
    ! python APOE_genotypes_PLINK_ped.py -i apoe_snps_{pheno}_{ancestry}.ped -o apoe_final_{pheno}_{ancestry}
    ! dx upload apoe_final_{pheno}_{ancestry}.APOE_GENOTYPES.csv --path {results_dir}/09_apoe_genotyping/apoe_final_{pheno}_{ancestry}.APOE_GENOTYPES.csv
    ! rm apoe_snps_{pheno}_{ancestry}.ped
    
    

In [None]:
arr_pheno_ancestry = np.array(pheno_ancestry_combos)
for pheno in ["ad","rd","control"]:
    ancestries = arr_pheno_ancestry[arr_pheno_ancestry[:,0] == pheno, 1]
    pheno_counts = []

    for ancestry in ancestries:
        apoe_geno = pd.read_csv(f"apoe_final_{pheno}_{ancestry}.APOE_GENOTYPES.csv")
        display(apoe_geno)
        counts = apoe_geno['APOE_GENOTYPE'].value_counts()
        counts = counts.reindex(["e1/e1", "e1/e2", "e1/e4", "e2/e2", "e2/e3", "e2/e4 or e1/e3", "e3/e3", "e3/e4", "e4/e4"], fill_value=0)
        counts = counts.rename(ancestry)
        counts = pd.concat([counts, pd.Series([counts.sum()], index=['total'])])
        pheno_counts.append(counts)

    pheno_counts = pd.concat(pheno_counts, axis=1)
    pheno_counts.columns = ancestries

    pheno_percentages = pheno_counts.div(pheno_counts.loc["total"], axis=1) * 100
    pheno_percentages.loc["total"] = pheno_counts.loc["total"]

    pheno_combined = pheno_counts.applymap(str) + ' (' + pheno_percentages.map(lambda x: f'{x:.2f}%') + ')'
    pheno_combined.loc["total"] = pheno_counts.loc["total"]

    pheno_counts.to_csv(f"{pheno}_apoe_genotype_counts.txt", sep="\t")
    pheno_percentages.to_csv(f"{pheno}_apoe_genotype_percentages.txt", sep="\t")
    pheno_combined.to_csv(f"{pheno}_apoe_genotype_combined.txt", sep="\t")

    ! dx upload {pheno}_apoe_genotype_counts.txt --path {results_dir}/09_apoe_genotyping/{pheno}_apoe_genotype_counts.txt
    ! dx upload {pheno}_apoe_genotype_percentages.txt --path {results_dir}/09_apoe_genotyping/{pheno}_apoe_genotype_percentages.txt
    ! dx upload {pheno}_apoe_genotype_combined.txt --path {results_dir}/09_apoe_genotyping/{pheno}_apoe_genotype_combined.txt


# Find number of controls with pathogenic variants

In [None]:
# Variant IDs in chr:pos:ref:alt form, used by a later cell as prefixes to
# select rows from the control genotype count tables.
# NOTE(review): positions appear to fall in GBA1 (chr1), GRN (chr17) and
# SNCA (chr4) — confirm, and record where this list came from.
pathogenic_vars = [
    "chr1:155235196:G:A",
    "chr1:155235217:C:G",
    "chr1:155235252:A:G",
    "chr1:155235727:C:G",
    "chr1:155235790:C:T",
    "chr1:155235823:C:T",
    "chr1:155235843:T:C",
    "chr1:155236277:G:A",
    "chr1:155237453:C:T",
    "chr1:155238174:C:T",
    "chr1:155238215:T:C",
    "chr1:155238260:G:C",
    "chr1:155238630:G:A",
    "chr1:155240629:C:T",
    "chr17:44350262:TAGTC:T",
    "chr17:44350800:CGTGA:C",
    "chr17:44351409:T:C",
    "chr4:89828156:A:C",
]


In [None]:
arr_pheno_ancestry = np.array(pheno_ancestry_combos)
ancestries = arr_pheno_ancestry[arr_pheno_ancestry[:,0] == "control", 1]
control_zyg = []

for ancestry in ancestries:
    ! dx download {results_dir}/07_zygosity/counts_control_{ancestry}.txt --overwrite
    
    df = pd.read_csv(f"counts_control_{ancestry}.txt", sep="\t")
    variant_ids = df.columns.values
    rename_dict = {}
    for vid in variant_ids:
        rename_dict[vid] = vid.split("_")[0]
    df.rename(rename_dict, axis=1, inplace=True)
    df = df.T
    df = df.reset_index()
    df.rename(columns={'index': 'ID', 0: f'{ancestry}_Homozygous_Ref', 1: f'{ancestry}_Heterozygous', 2: f'{ancestry}_Homozygous_Alt'}, inplace=True)
    df = df[df['ID'].str.startswith(tuple(pathogenic_vars))]
    df[ancestry] = df[f'{ancestry}_Heterozygous'] + df[f'{ancestry}_Homozygous_Alt']
    df = df[["ID",ancestry]]
    control_zyg.append(df)
    
    ! rm counts_control_{ancestry}.txt

control_zyg_merged = reduce(lambda left, right: pd.merge(left, right, on="ID"), control_zyg)


In [None]:
# Display the per-ancestry carrier counts for the pathogenic variants.
control_zyg_merged


In [None]:
# Save the carrier-count table as tab-separated text (no index column)
# and upload it to the 10_pathogenic_variants folder.
control_zyg_merged.to_csv("filtered_control_var_counts.txt", index=False, sep="\t")
! dx upload filtered_control_var_counts.txt --path {results_dir}/10_pathogenic_variants/filtered_control_var_counts.txt


# Phenotypic characteristics

## Download data tables

In [None]:
# Fetch the inputs for the phenotype section: the filtered and full merged
# variant tables, one .afreq file (read below for its VCF-style variant
# IDs), the protein/variant map, and the AD and RD case tables.
! dx download {results_dir}/08_merged/variants_in_cases_nocontrols.txt --overwrite
! dx download {results_dir}/08_merged/merged.txt --overwrite
! dx download {results_dir}/06_frequencies/control_EUR.afreq --overwrite
! dx download /data/protein_var_map.txt --overwrite
! dx download {results_dir}/ad_cases.txt --overwrite
! dx download {results_dir}/rd_cases.txt --overwrite


## Generate mapping between ID format from VCFs to Annovar

In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
df_freq = pd.read_csv("control_EUR.afreq", sep="\t")
vcf_ids = list(df_freq["ID"])

annovar_ids = list(pd.read_csv("merged.txt", sep="\t")["ID"])

id_mapping = pd.DataFrame({"vcf_ids": vcf_ids, "annovar_ids": annovar_ids})
ids_filtered = pd.read_csv("variants_in_cases_nocontrols.txt", sep="\t")[["ID"]]
ids_filtered.rename(columns={"ID":"annovar_ids"}, inplace=True)

id_mapping = id_mapping.merge(ids_filtered, on="annovar_ids")
id_mapping[["vcf_ids"]].to_csv("variants_to_keep.txt", header=None, index=None)
! dx upload variants_to_keep.txt --path {results_dir}/11_phenotypic_data/variant_ids.txt
id_mapping = id_mapping.set_index("vcf_ids")["annovar_ids"].to_dict()


## Fetch raw files for variants exclusively in cases

In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
# Launch a swiss-army-knife job that computes allele frequencies over the
# whole normalized VCF, restricted to the variants in variant_ids.txt.
# Variant IDs are rewritten as chr<chrom>:<pos>:<ref>:<alt>; the backslashes
# before $r/$a are part of the command text sent to the job.
plink_cmd = (
    'plink2 --vcf normalized.vcf.gz '
    '--set-all-var-ids "chr@:#:\\$r:\\$a" '
    '--new-id-max-allele-len 999 '
    '--extract variant_ids.txt '
    '--freq --out full_cohort'
)
cmd = " ".join([
    "dx run swiss-army-knife",
    f"-iin='{results_dir}/03_pvcf_normalized/normalized.vcf.gz'",
    f"-iin='{results_dir}/11_phenotypic_data/variant_ids.txt'",
    f"-icmd='{plink_cmd}'",
    "--instance-type mem2_ssd1_v2_x4",
    f"--destination '{projectid}:{results_dir}/11_phenotypic_data'",
])

result = subprocess.run(cmd, shell=True, capture_output=True)

# Report failures without stopping the notebook.
if result.returncode:
    print("Error running command:")
    print(result.stderr.decode("utf-8"))


In [None]:
# Launch a swiss-army-knife job that exports additive genotype codes
# (--export A) and --het statistics for the selected variants across the
# whole cohort. It takes full_cohort.afreq as an input, so the frequency
# job must have finished first.
plink_cmd = (
    'plink2 --vcf normalized.vcf.gz '
    '--set-all-var-ids "chr@:#:\\$r:\\$a" '
    '--new-id-max-allele-len 999 '
    '--extract variant_ids.txt '
    '--read-freq full_cohort.afreq '
    '--export A --het --out full_cohort'
)
cmd = " ".join([
    "dx run swiss-army-knife",
    f"-iin='{results_dir}/03_pvcf_normalized/normalized.vcf.gz'",
    f"-iin='{results_dir}/11_phenotypic_data/variant_ids.txt'",
    f"-iin='{results_dir}/11_phenotypic_data/full_cohort.afreq'",
    f"-icmd='{plink_cmd}'",
    "--instance-type mem2_ssd1_v2_x4",
    f"--destination '{projectid}:{results_dir}/11_phenotypic_data'",
])

result = subprocess.run(cmd, shell=True, capture_output=True)

# Report failures without stopping the notebook.
if result.returncode:
    print("Error running command:")
    print(result.stderr.decode("utf-8"))


In [None]:
# Fetch the genotype export written by the --export A job above; that job
# must have finished before this download can succeed.
! dx download {results_dir}/11_phenotypic_data/full_cohort.raw --overwrite


## Reformat variant IDs and fill NA values

In [None]:
# Load the genotype export, shorten each column header to the text before
# its first underscore, keep IID plus the variant columns, and replace
# missing genotype codes with 2.
df_raw = pd.read_csv("full_cohort.raw", sep="\t")

variant_ids = df_raw.columns.values
rename_dict = {vid: vid.split("_")[0] for vid in variant_ids}
df_raw = df_raw.rename(columns=rename_dict)

df_raw = df_raw.drop(columns=["FID","PAT","MAT","SEX","PHENOTYPE"])
# The next cell treats any value other than 2 as "has the variant", so
# missing calls are filled with 2 to exclude them.
df_raw = df_raw.fillna(2)


## Keep track of participants with each variant

In [None]:
# Build two parallel lists with one entry per (variant, participant) pair
# where the participant's genotype code for that variant is not 2.
list_vars = []
list_ids = []
# The first column is IID; every remaining column is a variant.
for variant in df_raw.columns[1:]:
    var_ids = df_raw.loc[df_raw[variant] != 2, 'IID'].tolist()
    list_vars.extend([variant] * len(var_ids))
    list_ids.extend(var_ids)
    
    

In [None]:
def _chrom_pos(full_id):
    """Return the first two ':'-separated fields of an ID, re-joined by ':'."""
    return ":".join(full_id.split(":", 2)[:2])


# One row per (variant, participant) pair.
df_pheno = pd.DataFrame({"variant_id_full": list_vars, "participant_id": list_ids})

# Translate VCF-style IDs to Annovar-style IDs, then derive the short
# chrom:pos key used by the protein/variant map.
df_pheno["variant_id_full"] = df_pheno["variant_id_full"].map(id_mapping)
df_pheno["variant_id"] = df_pheno["variant_id_full"].map(_chrom_pos)

protein_var_map = pd.read_csv("protein_var_map.txt", sep="\t")
# Inner join: variants without an entry in the map are dropped.
df_pheno = pd.merge(df_pheno, protein_var_map, on="variant_id")


## Fetch and save phenotypic data

In [None]:
# Load the AD and RD case tables, give both a common onset-date column and
# participant key, and stack them into one table with a fresh 0..n-1 index.
df_ad = pd.read_csv("ad_cases.txt", sep="\t").rename(
    columns={"AD_DATE": "DATE_OF_ONSET", "ID": "participant_id"}
)
df_rd = pd.read_csv("rd_cases.txt", sep="\t").rename(
    columns={"DEM_DATE": "DATE_OF_ONSET", "ID": "participant_id"}
)

df_dem = pd.concat([df_ad, df_rd], axis=0, ignore_index=True)


In [None]:
# Derive age and disease-duration columns. Each date column is parsed once.
onset_dates = pd.to_datetime(df_dem["DATE_OF_ONSET"])
most_recent_disease_date = onset_dates.max()

# BIRTH_YEAR holds only a year, so it parses as 1 January of that year.
birth_dates = pd.to_datetime(df_dem["BIRTH_YEAR"], format='%Y')
# Participants with no recorded death date are measured up to the latest
# onset date in the table.
end_dates = pd.to_datetime(df_dem['DATE_OF_DEATH'].fillna(most_recent_disease_date))

# Whole years, using the divisor 365.242374 days per year.
df_dem["AGE_AT_ONSET"] = (onset_dates - birth_dates).dt.days // 365.242374
df_dem["DAYS_SINCE_ONSET"] = (end_dates - onset_dates).dt.days
df_dem["AGE"] = (end_dates - birth_dates).dt.days // 365.242374


In [None]:
# Attach the selected participant-level columns to each variant/participant
# row. The inner join drops rows whose participant is not in the case table.
pheno_columns = [
    "participant_id",
    "GENETIC_SEX",
    "AGE",
    "DATE_OF_ONSET",
    "DATE_OF_DEATH",
    "AGE_AT_ONSET",
    "DAYS_SINCE_ONSET",
    "ancestry",
    "COGNITIVE_SYMPTOMS_SEVERITY_PAST_WEEK",
]
df_pheno = pd.merge(df_pheno, df_dem[pheno_columns], on="participant_id", how="inner")


In [None]:
# Show the final phenotype table, save it as tab-separated text and upload it.
# Unlike the other tables written in this notebook, no index=False is passed
# here, so the dataframe index is written as the first column.
display(df_pheno)
df_pheno.to_csv("pheno.txt", sep="\t")
! dx upload pheno.txt --path {results_dir}/11_phenotypic_data/pheno.txt
