# AMP-PD

* **Project:** ADRD Genetic Diversity in Biobanks
* **Version:** Python/3.9 and 3.10
* **Last Updated:** 22-August-2024

## Notebook Overview
Check variants, APOE genotyping, define cases/controls by ancestry, demographic data, resilience/protective variants

# Query AMP PD to check for variants of interest 

## Variables used 
- `${ANCESTRY}` = EUR, AFR, AMR, AAC, AJ, MDE, SAS, CAS, EAS, FIN, CAH
- `${COHORT}` = AD, Dementia, or Control; alternatively Case or Control
- `${COUNT}` = Number of total individuals in each ancestry
- `chr${}:Position:A1:A2` = Chromosome number, position, reference and alternative alleles

In [None]:
import pandas as pd

In [None]:
%%writefile allvar_keep.txt
chr${}:${Position}:${A1}:${A2}

In [None]:
%%bash
# Subset the ancestry-specific AMP PD pfile to the variants listed in
# allvar_keep.txt and write the result as a PLINK 1 binary fileset.
plink2 \
  --pfile ${WORK_DIR}/FILTERED.AMP_PD_${ANCESTRY} \
  --extract allvar_keep.txt \
  --make-bed \
  --out AMPDLB_vars_all_${ANCESTRY}

In [None]:
%%bash
module load plink/1.9
# Compute allele frequencies for the extracted variants, then print the report.
plink \
  --bfile AMPDLB_vars_all_${ANCESTRY} \
  --freq \
  --out AMPDLB_vars_all_${ANCESTRY}_freq
cat AMPDLB_vars_all_${ANCESTRY}_freq.frq

In [None]:
# Load the comma-separated covariate table and preview its first rows
covariate_csv = "${WORK_DIR}/COVFILE.csv"
qc_covar_DLB = pd.read_csv(covariate_csv, sep=",")
qc_covar_DLB.head()

In [None]:
# Cases are the rows whose DLB_PHENO equals 2; print a summary of the subset
is_case = qc_covar_DLB["DLB_PHENO"] == 2
qc_case_DLB = qc_covar_DLB.loc[is_case]
qc_case_DLB.info()

In [72]:
# Write the case IDs one per line. The header row is omitted on purpose: this
# file is turned into a PLINK --keep list, where a literal "ID" line would be
# read as a (non-existent) sample rather than as a column title.
qc_case_DLB_plink = qc_case_DLB[["ID"]]
qc_case_DLB_plink.to_csv("qc_case_DLB_plink.txt", sep=",", index=False, header=False)

In [73]:
# Append the first field to each line so that a single-column ID list becomes
# the two-column "ID ID" layout passed to PLINK's --keep in later cells
!awk '{print $0, $1}' qc_case_DLB_plink.txt > qc_case_DLB_ID_ID_plink.txt

In [None]:
# Controls are the rows whose DLB_PHENO equals 1; print a summary of the subset
is_control = qc_covar_DLB["DLB_PHENO"] == 1
qc_control_DLB = qc_covar_DLB.loc[is_control]
qc_control_DLB.info()

In [76]:
# Write the control IDs one per line. The header row is omitted on purpose:
# this file is turned into a PLINK --keep list, where a literal "ID" line would
# be read as a (non-existent) sample rather than as a column title.
qc_control_DLB_plink = qc_control_DLB[["ID"]]
qc_control_DLB_plink.to_csv("qc_control_DLB_plink.txt", sep=",", index=False, header=False)

In [77]:
# Append the first field to each line so that a single-column ID list becomes
# the two-column "ID ID" layout passed to PLINK's --keep in later cells
!awk '{print $0, $1}' qc_control_DLB_plink.txt > qc_control_DLB_ID_ID_plink.txt

In [None]:
%%bash
module load plink/1.9
# Restrict the extracted-variant fileset to the samples in the cohort keep list.
plink \
  --bfile AMPDLB_vars_all_${ANCESTRY} \
  --keep qc_${COHORT}_DLB_ID_ID_plink.txt \
  --make-bed \
  --out AMPDLB_vars_all_${ANCESTRY}_${COHORT}

In [None]:
%%bash
module load plink/1.9
# Compute allele frequencies within the cohort subset, then print the report.
plink \
  --bfile AMPDLB_vars_all_${ANCESTRY}_${COHORT} \
  --freq \
  --out AMPDLB_vars_all_${ANCESTRY}_${COHORT}
cat AMPDLB_vars_all_${ANCESTRY}_${COHORT}.frq

# Query AMP PD for APOE genotyping

In [None]:
import numpy as np
import pandas as pd
import sys
from functools import reduce
import argparse

In [None]:
%%writefile var_keepapoe.txt
chr19:44908684:T:C
chr19:44908822:C:T

In [None]:
%%bash
module load plink
# Pull the two APOE-defining variants listed in var_keepapoe.txt out of the
# ancestry-specific pfile and write them as a PLINK 1 binary fileset.
plink2 \
  --pfile ${WORK_DIR}/FILTERED.AMP_PD_${ANCESTRY} \
  --extract var_keepapoe.txt \
  --make-bed \
  --out AMP_PD_vars_${ANCESTRY}_apoe

In [None]:
%%bash
module load plink/1.9
# Restrict the APOE fileset to the samples in the cohort keep list.
plink \
  --bfile AMP_PD_vars_${ANCESTRY}_apoe \
  --keep qc_${COHORT}_DLB_ID_ID_plink.txt \
  --make-bed \
  --out AMP_DLB_vars_${ANCESTRY}_apoe_${COHORT}

In [None]:
%%bash
module load plink/1.9
# Export a .ped file with each genotype written as one two-character token,
# the layout APOE_genotypes_PLINK_ped.py reads.
plink \
  --bfile AMP_DLB_vars_${ANCESTRY}_apoe_${COHORT} \
  --recode compound-genotypes \
  --out AMP_DLB_vars_${ANCESTRY}_apoe_${COHORT}_recode

In [None]:
%%writefile APOE_genotypes_PLINK_ped.py
#!/bin/env python

# Determine APOE genotypes from PLINK output
    # January 2021
    # Mary B. Makarious, Makayla Portley, and Cornelis Blauwendraat (LNG/NIA/NINDS/NIH)
    # Script usage:
        # python APOE_genotypes_PLINK_ped.py -i INPUT.ped -o OUTPUT_NAME

## APOE Information
# https://www.snpedia.com/index.php/APOE

    # |          APOE GENO         	| rs429358 	| rs7412 	|             COMBINED             	|
    # |:--------------------------:	|:--------:	|:------:	|:--------------------------------:	|
    # |            e1/e1           	|    CC    	|   TT   	|               CC_TT              	|
    # |            e1/e2           	|    CT    	|   TT   	|          CT_TT or TC_TT          	|
    # |            e1/e4           	|    CC    	|   CT   	|          CC_CT or CC_TC          	|
    # |            e2/e2           	|    TT    	|   TT   	|               TT_TT              	|
    # |            e2/e3           	|    TT    	|   TC   	|          TT_TC or TT_CT          	|
    # | e2/e4 or e1/e3 (Ambiguous) 	|    TC    	|   TC   	| TC_TC or CT_CT or TC_CT or CT_TC 	|
    # |            e3/e3           	|    TT    	|   CC   	|               TT_CC              	|
    # |            e3/e4           	|    TC    	|   CC   	|          TC_CC or CT_CC          	|
    # |            e4/e4           	|    CC    	|   CC   	|               CC_CC              	|

# Import the necessary packages
import numpy as np
import pandas as pd
import sys
from functools import reduce
import argparse

# Initialize parser and add arguments
# Both options are mandatory: the input path is read immediately below and the
# output prefix is concatenated into the export file names, so omitting either
# would otherwise fail later with a less helpful error
parser = argparse.ArgumentParser()
parser.add_argument("--input", "-i", required=True, help="Input file name (with suffix)")
parser.add_argument("--output", "-o", required=True, help="Desired output name (without suffix)")
args = parser.parse_args()

# Read in the .ped file and force column names
# The two genotype columns are read as text: PLINK writes a missing compound
# genotype as "00", and if a whole column is missing type inference would turn
# it into the integer 0, which breaks the string concatenation below
header_text = ["FID", "IID", "PAT", "MAT", "SEX", "PHENO", "rs429358", "rs7412"]
input_ped_df = pd.read_csv(
    args.input,
    sep=" ",
    header=None,
    names=header_text,
    dtype={"rs429358": str, "rs7412": str},
)

# Make a combined column, gluing the genotypes from the rs429358 and rs7412 columns
input_ped_df['rs429358_rs7412'] = input_ped_df['rs429358'].astype(str) + '_' + input_ped_df['rs7412']

# Initialize a dictionary with the genotypes to search what genotype the alleles generate
# Heterozygous calls are listed in both allele orders (e.g. CT and TC)
apoe_genotypes_dict = {
    'CC_TT' : 'e1/e1',
    'CT_TT' : 'e1/e2',
    'TC_TT' : 'e1/e2',
    'CC_CT' : 'e1/e4',
    'CC_TC' : 'e1/e4',
    'TT_TT' : 'e2/e2',
    'TT_TC' : 'e2/e3',
    'TT_CT' : 'e2/e3',
    'TC_TC' : 'e2/e4 or e1/e3',
    'CT_CT' : 'e2/e4 or e1/e3',
    'TC_CT' : 'e2/e4 or e1/e3',
    'CT_TC' : 'e2/e4 or e1/e3',
    'TT_CC' : 'e3/e3',
    'TC_CC' : 'e3/e4',
    'CT_CC' : 'e3/e4',
    'CC_CC' : 'e4/e4'
}

# Map the combined column to the dictionary to extract the genotypes
input_ped_df['APOE_GENOTYPE'] = input_ped_df['rs429358_rs7412'].map(apoe_genotypes_dict)

# If any of the combined alleles weren't in the dictionary, the dataframe now has NaN values
# This happens if you have a 0 or missingness somewhere, resulting in an unsure genotype call
# Replace these with something more useful, and state the APOE genotype as "unknown"
# (applied to the whole frame, so any other missing field is labelled the same way)
input_ped_df = input_ped_df.fillna('unknown')

# Make a file of just the FID, IID, SEX, PHENO, and APOE genotype
# (the combined rs429358_rs7412 column is kept as well)
subset_geno_df = input_ped_df.drop(columns=['PAT', 'MAT', 'rs429358', 'rs7412'])

## Generate counts
def _genotype_tally(frame, count_label, percent_label):
    """Return one row per APOE genotype in `frame` with its count and its share (in %) of `frame`'s rows."""
    tally = frame['APOE_GENOTYPE'].value_counts().reset_index()
    tally.columns = ['APOE_GENOTYPE', count_label]
    tally[percent_label] = tally[count_label] / frame.shape[0] * 100
    return tally


# APOE genotype counts and percentages for the entire dataset
counts_df = _genotype_tally(subset_geno_df, 'TOTAL_COUNT', 'TOTAL_PERCENT')

# Split into missing phenotypes, controls, and cases
    # This assumes controls=1 and cases=2 (missing is -9)
phenotype = subset_geno_df['PHENO']
missing_pheno_df = subset_geno_df[phenotype == -9]
controls_df = subset_geno_df[phenotype == 1]
cases_df = subset_geno_df[phenotype == 2]

# APOE genotype counts and percentages within each phenotype subset
missing_pheno_counts_df = _genotype_tally(missing_pheno_df, 'MISSING_PHENO_COUNT', 'MISSING_PHENO_PERCENT')
controls_counts_df = _genotype_tally(controls_df, 'CONTROLS_COUNT', 'CONTROLS_PERCENT')
cases_counts_df = _genotype_tally(cases_df, 'CASES_COUNT', 'CASES_PERCENT')

# Merge the dataframes together for final summary counts file
# A left join anchored on the whole-dataset counts keeps every observed genotype.
# The default inner join dropped any genotype missing from one subset, and
# produced an empty summary whenever a subset had no rows at all (for example
# a file with no -9 phenotypes, or one that contains only cases).
dataframes_tomerge = [counts_df, missing_pheno_counts_df, controls_counts_df, cases_counts_df]
merged_summary_df = reduce(
    lambda left, right: pd.merge(left, right, on='APOE_GENOTYPE', how='left'),
    dataframes_tomerge,
)

# A genotype absent from a subset has no row there: report it as a zero count
# and zero percent, and restore integer counts after the NaN-induced float cast
merged_summary_df = merged_summary_df.fillna(0)
count_columns = [col for col in merged_summary_df.columns if col.endswith('_COUNT')]
merged_summary_df[count_columns] = merged_summary_df[count_columns].astype(int)

## Export
complete_df_output = args.output + ".APOE_GENOTYPES.csv"
counts_df_output = args.output + ".APOE_SUMMARY.csv"

# For each table: announce where it is going, then save it as a .csv
exports = (
    (f"Your complete genotype file has been saved here: {complete_df_output}", subset_geno_df, complete_df_output),
    (f"The summary counts have been saved here: {counts_df_output}", merged_summary_df, counts_df_output),
)
for announcement, table, destination in exports:
    print(announcement)
    table.to_csv(destination, index=False)

# Done!
print("Thanks!")


In [None]:
%%bash
# Call APOE genotypes from the recoded .ped file; the script writes
# <output>.APOE_GENOTYPES.csv and <output>.APOE_SUMMARY.csv.
python APOE_genotypes_PLINK_ped.py \
  -i AMP_DLB_vars_${ANCESTRY}_apoe_${COHORT}_recode.ped \
  -o AMP_DLB_vars_${ANCESTRY}_apoe_${COHORT}_recode.ped_test

In [None]:
import pandas as pd

# Load the per-sample APOE genotype calls into a pandas DataFrame
file_path = '${WORK_DIR}/AMP_DLB_vars_${ANCESTRY}_apoe_${COHORT}_recode.ped_test.APOE_GENOTYPES.csv'
df = pd.read_csv(file_path)

# Define the order of genotypes
genotype_order = [
    "e1/e1", "e1/e2", "e1/e4", "e2/e2", "e2/e3", "e2/e4 or e1/e3",
    "e3/e3", "e3/e4", "e4/e4"
]

# Count the occurrences of each genotype
genotype_counts = df['APOE_GENOTYPE'].value_counts()

# Start every listed genotype at zero so unobserved ones still get a row
genotype_counts_ordered = {genotype: 0 for genotype in genotype_order}

# Fill the counts for the observed genotypes; a bare "e2/e4" or "e1/e3" label
# is folded into the combined ambiguous category
for genotype, observed in genotype_counts.items():
    if genotype in genotype_counts_ordered:
        genotype_counts_ordered[genotype] = observed
    elif genotype in ("e2/e4", "e1/e3"):
        genotype_counts_ordered["e2/e4 or e1/e3"] += observed

# The total is every row in the file, including rows whose genotype is not in
# genotype_order (such as "unknown"), so the listed percentages can sum to
# less than 100
total_count = df.shape[0]

# Print the results with percentages
# The header matches the two tab-separated fields that each row actually emits
print("Genotype\tCount (Percentage)")
for genotype in genotype_order:
    count = genotype_counts_ordered[genotype]
    # An empty file would otherwise raise ZeroDivisionError
    percentage = (count / total_count) * 100 if total_count else 0.0
    print(f"{genotype}\t{count} ({percentage:.2f}%)")
print(f"Total\t{total_count} (100.00%)")


# Query AMP PD to define cases and controls in each ancestry and obtain demographic and phenotype data

## Define cases and controls in each ancestry

In [None]:
import pandas as pd

In [None]:
%%bash
# Restrict the full ancestry-specific pfile to the samples in the cohort keep
# list and write the result as a PLINK 1 binary fileset.
plink2 \
  --pfile ${WORK_DIR}/FILTERED.AMP_PD_${ANCESTRY} \
  --keep qc_${COHORT}_DLB_ID_ID_plink.txt \
  --make-bed \
  --out AMPDLB_${ANCESTRY}_${COHORT}

## Demographic and phenotype data

##### The code below is shown for controls; the same steps were repeated for cases

In [None]:
# Load the comma-separated covariate table and preview its first rows
covariate_csv = "${WORK_DIR}/COVFILE.csv"
qc_covarsex = pd.read_csv(covariate_csv, sep=",")
qc_covarsex.head()

In [None]:
# Controls are the rows whose DLB_PHENO equals 1; print a summary of the subset
is_control = qc_covarsex["DLB_PHENO"] == 1
qc_controlsex = qc_covarsex.loc[is_control]
qc_controlsex.info()

In [None]:
# Save the control covariates (header included, no index column) for the awk steps below
qc_controlsex.to_csv(path_or_buf='qc_controlsex_file.csv', index=False)

In [None]:
# Save the first field of the control .fam file for this ancestry.
# NOTE(review): cut splits on tabs by default, so this assumes a tab-delimited
# .fam; on a space-delimited file the whole line would pass through - confirm.
!cut -f 1 AMPDLB_${ANCESTRY}_control.fam > first_column${ANCESTRY}.txt

In [None]:
# Concatenate the per-ancestry ID lists into a single file.
# Presumably the previous cell has been run once per ancestry so that every
# listed file exists - verify before running.
!cat first_columnAAC.txt first_columnAMR.txt first_columnFIN.txt first_columnCAS.txt first_columnMDE.txt first_columnAFR.txt first_columnEAS.txt first_columnAJ.txt first_columnSAS.txt first_columnEUR.txt first_columnCAH.txt > merged_file.txt

In [None]:
!awk -F ',' 'NR==FNR{a[$1]; next} FNR==1 {print} $1 in a' merged_file.txt qc_controlsex_file.csv > filtered_qc_controlsex_file.csv
!awk -F',' 'NR==1 {print $0 > "header.csv"; next} $2 == 1 {print $0 > "controlmale2.csv"} $2 == 2 {print $0 > "controlfemale2.csv"}' filtered_qc_controlsex_file.csv
!awk -F',' 'NR>1 {sum += $3; sumsq += ($3)^2} END {mean = sum/NR; sd = sqrt(sumsq/NR - (sum/NR)^2); print "Mean age:", mean; print "Standard deviation of age:", sd}' controlmale2.csv
!awk -F',' 'NR>1 {sum += $3; sumsq += ($3)^2} END {mean = sum/NR; sd = sqrt(sumsq/NR - (sum/NR)^2); print "Mean age:", mean; print "Standard deviation of age:", sd}' controlfemale2.csv

In [None]:
%%bash
module load plink/1.9
# Export the cohort fileset as an additive-coded .raw text file
plink --bfile AMPDLB_${ANCESTRY}_${COHORT}  --recodeA  --out AMPDLB_${ANCESTRY}_${COHORT}_recode
# Keep only the rows of the .raw file where the chosen field equals 1.
# NOTE(review): ${COLUMN} sits inside single quotes, so the shell will not
# expand it; it looks like a placeholder to be replaced with an awk field
# reference (e.g. $7) before running - confirm.
awk '${COLUMN} == 1' AMPDLB_${ANCESTRY}_${COHORT}_recode.raw  > AMPDLB_${ANCESTRY}_${COHORT}_recode.raw.filtered.raw

# Query AMP PD for resilience and protective variants

In [None]:
import numpy as np
import pandas as pd
import sys
from functools import reduce
import argparse

In [None]:
%%writefile var_keepprotect.txt
    chr${}:${Position}:${A1}:${A2}

In [None]:
%%bash
module load plink
# Extract the resilience/protective variants listed in var_keepprotect.txt.
# The placeholder is spelled ${ANCESTRY}, matching the "Variables used" list
# and every other cell in this notebook.
plink2 --pfile ${WORK_DIR}/FILTERED.AMP_PD_${ANCESTRY} \
--extract var_keepprotect.txt --make-bed --out AMP_PD_vars_${ANCESTRY}_pro

In [None]:
%%bash
module load plink/1.9
# Restrict the protective-variant fileset to the samples in the cohort keep list
plink --bfile AMP_PD_vars_${ANCESTRY}_pro --keep ${WORK_DIR}/qc_${COHORT}_DLB_ID_ID_plink.txt --make-bed --out AMP_DLB_vars_${ANCESTRY}_pro_${COHORT}

In [None]:
%%bash
module load plink/1.9
# Export additive-coded allele counts (.raw) for the per-sample tabulation below
plink --bfile AMP_DLB_vars_${ANCESTRY}_pro_${COHORT} --recodeA --out AMP_DLB_vars_${ANCESTRY}_pro_${COHORT}_txt

In [None]:
import os
import pandas as pd

# Define file paths and ancestries
ancestries = ['FIN', 'EUR', 'AFR', 'AMR', 'EAS', 'SAS', 'MDE', 'AJ', 'AAC', 'CAS', 'CAH']
base_case_file = 'AMP_DLB_vars_{ancestry}_pro_${COHORT}_txt.raw'
base_apoe_file = '${WORK_DIR}/AMP_DLB_vars_{ancestry}_apoe_${COHORT}_recode.ped_test.APOE_GENOTYPES.csv'

# Initialize an empty list to collect the per-ancestry count tables
all_results = []

for ancestry in ancestries:
    # Generate the specific file paths for each ancestry
    case_file = base_case_file.format(ancestry=ancestry)
    apoe_file = base_apoe_file.format(ancestry=ancestry)

    # Skip ancestries for which either input is missing
    if not os.path.exists(case_file):
        print(f"Case file for {ancestry} not found, skipping...")
        continue
    if not os.path.exists(apoe_file):
        print(f"APOE genotype file for {ancestry} not found, skipping...")
        continue

    # Load the genotype data (whitespace-delimited; sep=r'\s+' replaces the
    # deprecated delim_whitespace=True)
    case_df = pd.read_csv(case_file, sep=r'\s+')

    # Load the APOE genotype file
    apoe_df = pd.read_csv(apoe_file)
    apoe_df = apoe_df[['FID', 'IID', 'APOE_GENOTYPE']]

    # Merge the genotype data with APOE genotypes
    case_merged_df = case_df.merge(apoe_df, on=['FID', 'IID'], how='left')

    # Select columns that represent variants (their names contain ':')
    variant_cols = [col for col in case_df.columns if ':' in col]

    # Build one (genotype, ancestry, variation) record per variant a sample
    # carries, or a single "No Variations" record when it carries none.
    # Missing dosages are skipped: NaN != 0 is True, so they were previously
    # counted as carriers, and they also turned the column into floats whose
    # "_1.0" suffix the old trailing-number clean-up could not strip.
    records = []
    dosage_matrix = case_merged_df[variant_cols].to_numpy()
    for apoe_genotype, dosages in zip(case_merged_df['APOE_GENOTYPE'], dosage_matrix):
        carried = [
            col
            for col, dosage in zip(variant_cols, dosages)
            if pd.notna(dosage) and dosage != 0  # Exclude wild-type (0) and missing calls
        ]
        for variation in carried or ["No Variations"]:
            records.append((apoe_genotype, ancestry, variation))

    exploded_df = pd.DataFrame(records, columns=['genotype', 'ancestry', 'variation'])

    # Count occurrences; groupby drops rows whose genotype is NaN, i.e. samples
    # that had no match in the APOE genotype file
    count_df = exploded_df.groupby(['genotype', 'ancestry', 'variation']).size().reset_index(name='count')

    # Append the current ancestry results to the list
    all_results.append(count_df)

# Fail with an explicit message instead of pandas' "No objects to concatenate"
if not all_results:
    raise FileNotFoundError("No ancestry had both a .raw genotype file and an APOE genotype file")

# Concatenate all results into a single DataFrame
final_results = pd.concat(all_results, ignore_index=True)

# Save the combined results to a single output file
combined_output_file = 'combined_${COHORT}_final_output.csv'
final_results.to_csv(combined_output_file, index=False)

print(f"Combined output saved to {combined_output_file}")


In [None]:
import pandas as pd
import os

# Read the CSV file into a pandas DataFrame
df = pd.read_csv('combined_${COHORT}_final_output.csv')

# Define the desired order of genotypes
genotype_order = [
    'unknown_unknown',
    'e1/e1',
    'e1/e2',
    'e1/e4',
    'e2/e2',
    'e2/e3',
    'e2/e4 or e1/e3',
    'e3/e3',
    'e3/e4',
    'e4/e4',
]

# Define the new order of columns
new_column_order = ['EUR', 'AFR', 'AMR', 'EAS', 'SAS', 'MDE', 'AJ', 'FIN', 'AAC', 'CAS', 'CAH']

# Define the total numbers for each ancestry
total_numbers = {
    'EUR': ${count},
    'AFR': ${count},
    'AMR': ${count},
    'EAS': ${count},
    'SAS': ${count},
    'MDE': ${count},
    'AJ': ${count},
    'FIN': ${count},
    'AAC': ${count},
    'CAS': ${count},
    'CAH': ${count}
}

# Create a directory to save the tables if it doesn't exist
output_dir = 'variant_tables'
os.makedirs(output_dir, exist_ok=True)

# Iterate over unique variations
for variation in df['variation'].unique():
    # Create a subset of the DataFrame for the current variation
    subset = df[df['variation'] == variation]

    # Pivot the subset to create the desired table
    pivot_table = subset.pivot_table(index='genotype', columns='ancestry', values='count', fill_value=0)

    # Ensure the index is in the specified genotype order
    pivot_table = pivot_table.reindex(genotype_order)

    # Reorder the columns
    pivot_table = pivot_table.reindex(columns=new_column_order)

    # Create a DataFrame for the total numbers and concatenate it as the last row
    total_df = pd.DataFrame(total_numbers, index=['total'])
    pivot_table = pd.concat([pivot_table, total_df])

    # Fill NaN values with 0 before converting to integers
    pivot_table = pivot_table.fillna(0).astype(int)

    # Calculate the percentage for each value, ignoring division by zero errors
    percentage_table = pivot_table.div(pivot_table.loc['total'], axis=1).replace([float('inf'), -float('inf'), float('nan')], 0) * 100

    # Convert integer values to strings for the formatted table
    formatted_values = pivot_table.astype(str)

    # Combine values and percentages, replacing "0 (nan%)" or "0 (0%)" with just "0"
    formatted_table = formatted_values + " (" + percentage_table.round(2).astype(str) + "%)"
    formatted_table = formatted_table.replace(r'0 \(nan%\)', '0', regex=True)
    formatted_table = formatted_table.replace(r'0 \(0.0%\)', '0', regex=True)

    # Format the 'total' row to display only values, not percentages
    formatted_table.loc['total'] = formatted_values.loc['total']

    # Save the formatted table to a CSV file
    output_file_path = os.path.join(output_dir, f'pivot_table_{variation}_value_percentage.csv')
    formatted_table.to_csv(output_file_path)

    print(f'Saved value and percentage pivot table for Variation: {variation} to {output_file_path}')
