# ADSP

* **Project:** ADRD Genetic Diversity in Biobanks
* **Version:** Python/3.9 and 3.10
* **Last Updated:** 22-August-2024

## Notebook Overview
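This notebook checks whether variants of interest are present, computes allele frequencies and missingness, derives APOE genotypes, summarizes demographic and phenotype data, and queries resilience/protective variants.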

# Query ADSP to check for variants of interest, allele frequency, and to calculate missingness

## Variables used 
- `${ANCESTRY}` = EUR, AFR, AMR, AAC, AJ, MDE, SAS, CAS, EAS, FIN, CAH
- `${COHORT}` = Cohort label, e.g. AD, Dementia, Control (or Case, Control)
- `${COUNT}` = Number of total individuals in each ancestry
- `${GENOTYPE}` = Different APOE genotypes
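- `chr${}:Position:A1:A2` = Variant ID: chromosome number, position, reference allele and alternative allele
- `${WORK_DIR}` = Directory containing the ADSP input files (genotype filesets, `.psam` files, covariates)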

In [10]:
import pandas as pd

In [1]:
## Check if the variants exist in the data or not
# -F treats the variant ID as a literal string and -w requires a whole-word match,
# so an ID such as chr1:100:A:G does not also match chr1:100:A:GT.
!grep -w -F -e "chr${}:position:A1:A2"  ${WORK_DIR}/chr${}.compact_filtered.r4.wgs.biallelic.pvar > new_output.txt 
!cat new_output.txt 

In [None]:
%%bash
# Pull the variant of interest out of the per-chromosome pfile into a new
# pgen/pvar/psam fileset, then print the resulting .pvar.
module load plink
# --pfile spelled with two dashes, consistent with the other plink2 calls
plink2 --pfile ${WORK_DIR}/chr${}.compact_filtered.r4.wgs.biallelic --snps chr${}:position:A1:A2 --make-pgen --out new_chr${}vars_UKB
cat new_chr${}vars_UKB.pvar

In [None]:
%%bash
## Check missingness
# The %%bash cell magic has to be the first line of the cell, so the heading
# comment now sits below it.
module load plink
plink2 --pfile new_chr${}vars_UKB --missing --out new_chr${}vars_UKB_missing
# Per-variant missingness report
cat new_chr${}vars_UKB_missing.vmiss

In [None]:
%%bash
## Check frequency
# The %%bash cell magic has to be the first line of the cell, so the heading
# comment now sits below it.
module load plink
plink2 --pfile new_chr${}vars_UKB --freq --out new_chr${}vars_UKB_freq
# Allele frequency report
cat new_chr${}vars_UKB_freq.afreq

## Keep only individuals who have variants

In [None]:
#!/bin/bash
# Extract every variant of interest from every listed per-chromosome pfile into
# its own PLINK1 binary fileset (.bed/.bim/.fam) under extracted_data_UKB/<file>/.
# NOTE(review): this cell has no %%bash magic; run it as a standalone script or
# add the magic as the first line.

# Load PLINK module
module load plink

# Define the variants
variants=("chr${}:position:A1:A2")

# Define the files
files=("chr${}.compact_filtered.r4.wgs.biallelic")

# Variant list handed to --extract. A unique temp file avoids clobbering an
# existing temp_variant.txt, and the trap removes it on every exit path.
variant_list=$(mktemp) || exit 1
trap 'rm -f -- "$variant_list"' EXIT

# Create a folder to store extracted data
mkdir -p extracted_data_UKB

# Loop over each file
for file in "${files[@]}"; do
    # Create a subfolder for each file
    out_dir="extracted_data_UKB/${file}"
    mkdir -p -- "$out_dir"

    # Loop over each variant and extract it
    for variant in "${variants[@]}"; do
        printf '%s\n' "$variant" > "$variant_list"
        prefix="extract_${file}_${variant}"

        # Only move results when plink2 succeeded; otherwise report and go on
        if ! plink2 --pfile "${WORK_DIR}/${file}" --extract "$variant_list" --make-bed --out "$prefix"; then
            printf 'plink2 failed for %s in %s; skipping\n' "$variant" "$file" >&2
            continue
        fi

        # Move the extracted files to the subfolder
        mv -- "${prefix}.bed" "${prefix}.bim" "${prefix}.fam" "$out_dir/"
    done
done


In [None]:
#!/bin/bash
# Recode one extracted single-variant fileset to additive coding (--recode A);
# the output prefix is the input name with a _recoded suffix.
# NOTE(review): the input is read from ${WORK_DIR}/<file>/..., whereas the
# extraction loop stores its filesets under extracted_data_UKB/<file>/ — confirm
# the path before running.

# Load PLINK module
module load plink

plink2 --bfile ${WORK_DIR}/chr${}.compact_filtered.r4.wgs.biallelic/extract_chr${}.compact_filtered.r4.wgs.biallelic_chr${}:position:A1:A2  --recode A --out extract_chr${}.compact_filtered.r4.wgs.biallelic_chr${}:position:A1:A2_recoded

In [8]:
# Keep only the rows of the recoded .raw file whose 7th column equals 0 or 1.
# NOTE(review): column 7 is presumably the allele count of the single exported
# variant; which allele PLINK counts depends on its export defaults — confirm
# that 0/1 selects the intended carriers.
awk '$7 == 0 || $7 == 1' extract_chr${}.compact_filtered.r4.wgs.biallelic_chr${}:position:A1:A2_recoded.raw  > extract_chr${}.compact_filtered.r4.wgs.biallelic_chr${}:position:A1:A2_recoded.raw.filtered.raw
# Show the rows that were kept
cat extract_chr${}.compact_filtered.r4.wgs.biallelic_chr${}:position:A1:A2_recoded.raw.filtered.raw

## Remove related individuals and calculate allele frequency

In [1]:
import pandas as pd

In [None]:
%%writefile var_keepnewUKB.txt
chr${}:${Position}:${A1}:${A2}

In [None]:
%%bash
# Extract the variants listed in var_keepnewUKB.txt from the per-chromosome
# pfile and write them as a PLINK1 binary fileset.
module load plink
plink2 \
  --pfile ${WORK_DIR}/chr${}.compact_filtered.r4.wgs.biallelic \
  --extract var_keepnewUKB.txt \
  --make-bed \
  --out adsp_varsnewUKB_chr${}

In [None]:
# Load the tab-separated sample table (.psam) for the selected ancestry.
# The variable was previously misspelled `ancesty`, which left `ancestry` undefined.
ancestry = pd.read_csv("${WORK_DIR}/FILTERED.merged_biallelic_${ANCESTRY}.psam", sep="\t")
ancestry.head()

In [165]:
# Write the FID/IID pairs of the ancestry sample table as a keep list for PLINK.
# Index the `ancestry` DataFrame loaded from the .psam file, not the ${ANCESTRY} placeholder.
ancestry_keep = ancestry[["#FID", "IID"]]
ancestry_keep.to_csv("adsp_${ANCESTRY}_keep.txt", sep="\t", index=False)

In [None]:
%%bash
# Restrict the extracted variants to the samples in the ancestry keep list
module load plink/1.9
plink \
  --bfile adsp_varsnewUKB_chr${} \
  --keep adsp_${ANCESTRY}_keep.txt \
  --make-bed \
  --out adsp_varsnewUKB_${ANCESTRY}

In [None]:
%%bash
# Drop the samples listed in this ancestry's .related file
module load plink/1.9
plink \
  --bfile adsp_varsnewUKB_${ANCESTRY} \
  --remove ${WORK_DIR}/REMOVE.FILTERED.merged_biallelic_${ANCESTRY}.related \
  --make-bed \
  --out adsp_varsnewUKB_${ANCESTRY}_unrelated

In [None]:
%%bash
# Allele frequencies in the remaining samples, then print the report
module load plink/1.9
plink \
  --bfile adsp_varsnewUKB_${ANCESTRY}_unrelated \
  --freq \
  --out adsp_varsnewUKB_${ANCESTRY}_freq_unrelated
cat adsp_varsnewUKB_${ANCESTRY}_freq_unrelated.frq

In [None]:
qc_covar = pd.read_csv("${WORK_DIR}/covars_for_QC.txt", sep="\t")
qc_covar.head()

In [None]:
qc_case = qc_covar[qc_covar["PHENO"]==2]
qc_case.info()

In [171]:
qc_case_plink = qc_case[["FID", "IID"]]
qc_case_plink.to_csv("qc_case_plink.txt", sep="\t", index=False)

In [None]:
qc_control = qc_covar[qc_covar["PHENO"]==1]
qc_control.info()

In [173]:
qc_control_plink = qc_control[["FID", "IID"]]
qc_control_plink.to_csv("qc_control_plink.txt", sep="\t", index=False)

In [None]:
%%bash
# Keep only the samples of the chosen cohort among the unrelated samples
module load plink/1.9
plink \
  --bfile adsp_varsnewUKB_${ANCESTRY}_unrelated \
  --keep qc_${COHORT}_plink.txt \
  --make-bed \
  --out adsp_varsnewUKB_${ANCESTRY}_${COHORT}_unrelated

In [None]:
%%bash
# Allele frequencies within that cohort, then print the report
module load plink/1.9
plink \
  --bfile adsp_varsnewUKB_${ANCESTRY}_${COHORT}_unrelated \
  --freq \
  --out adsp_varsnewUKB_${ANCESTRY}_${COHORT}_unrelated_freq
cat adsp_varsnewUKB_${ANCESTRY}_${COHORT}_unrelated_freq.frq

# Query ADSP for APOE genotyping

In [None]:
import numpy as np
import pandas as pd
import sys
from functools import reduce
import argparse

In [None]:
%%writefile var_keepapoe.txt
chr19:44908684:T:C
chr19:44908822:C:T

In [None]:
%%bash
# Extract the two variants listed in var_keepapoe.txt from the chr19 pfile into
# a PLINK1 binary fileset.
# Each %%bash cell starts a fresh shell, so the module is loaded here like in
# the other plink2 cells.
module load plink
plink2 --pfile ${WORK_DIR}/chr19.compact_filtered.r4.wgs.biallelic \
--extract var_keepapoe.txt --make-bed --out adsp_vars_chr19_apoe

In [None]:
# Sample table (.psam) of the selected ancestry, read as tab-separated text
ancestry = pd.read_csv(
    "${WORK_DIR}/FILTERED.merged_biallelic_${ANCESTRY}.psam",
    delimiter='\t',
)
ancestry.head()

In [None]:
# Write the FID/IID pairs of the ancestry sample table as a keep list for PLINK.
# Index the `ancestry` DataFrame loaded from the .psam file, not the ${ANCESTRY} placeholder.
ancestry_keep = ancestry[["#FID", "IID"]]
ancestry_keep.to_csv("adsp_${ANCESTRY}_keep.txt", sep="\t", index=False)

In [None]:
%%bash
# Restrict the APOE variants to the samples in the ancestry keep list
module load plink/1.9
plink \
  --bfile adsp_vars_chr19_apoe \
  --keep adsp_${ANCESTRY}_keep.txt \
  --make-bed \
  --out adsp_vars_${ANCESTRY}_apoe

In [None]:
%%bash
# Drop the samples listed in this ancestry's .related file
module load plink/1.9
plink \
  --bfile adsp_vars_${ANCESTRY}_apoe \
  --remove ${WORK_DIR}/REMOVE.FILTERED.merged_biallelic_${ANCESTRY}.related \
  --make-bed \
  --out adsp_vars_${ANCESTRY}_apoe_unrelated

In [None]:
%%bash
# Export a .ped with two-character (compound) genotype columns for the APOE script
module load plink/1.9
plink \
  --bfile adsp_vars_${ANCESTRY}_apoe_unrelated \
  --recode compound-genotypes \
  --out adsp_vars_${ANCESTRY}_apoe_unrelated_recode

In [None]:
%%writefile APOE_genotypes_PLINK_ped.py
#!/bin/env python

# Determine APOE genotypes from PLINK output
    # January 2021
    # Mary B. Makarious, Makayla Portley, and Cornelis Blauwendraat (LNG/NIA/NINDS/NIH)
    # Script usage:
        # python APOE_genotypes_PLINK_ped.py -i INPUT.ped -o OUTPUT_NAME
    # Outputs:
        # OUTPUT_NAME.APOE_GENOTYPES.csv : one row per sample with its APOE genotype
        # OUTPUT_NAME.APOE_SUMMARY.csv   : genotype counts/percentages overall and per phenotype

## APOE Information
# https://www.snpedia.com/index.php/APOE

    # |          APOE GENO          | rs429358 | rs7412 |             COMBINED             |
    # |:--------------------------:|:--------:|:------:|:--------------------------------:|
    # |            e1/e1           |    CC    |   TT   |               CC_TT              |
    # |            e1/e2           |    CT    |   TT   |          CT_TT or TC_TT          |
    # |            e1/e4           |    CC    |   CT   |          CC_CT or CC_TC          |
    # |            e2/e2           |    TT    |   TT   |               TT_TT              |
    # |            e2/e3           |    TT    |   TC   |          TT_TC or TT_CT          |
    # | e2/e4 or e1/e3 (Ambiguous) |    TC    |   TC   | TC_TC or CT_CT or TC_CT or CT_TC |
    # |            e3/e3           |    TT    |   CC   |               TT_CC              |
    # |            e3/e4           |    TC    |   CC   |          TC_CC or CT_CC          |
    # |            e4/e4           |    CC    |   CC   |               CC_CC              |

# Import the necessary packages
import numpy as np
import pandas as pd
import sys
from functools import reduce
import argparse

# Initialize parser and add arguments
parser = argparse.ArgumentParser()
parser.add_argument("--input", "-i", help="Input file name (with suffix)")
parser.add_argument("--output", "-o", help="Desired output name (without suffix)")
args = parser.parse_args()

# Read in the .ped file and force column names
header_text = ["FID", "IID", "PAT", "MAT", "SEX", "PHENO", "rs429358", "rs7412"]
input_ped_df = pd.read_csv(args.input, sep = " ", header=None, names=header_text)

# Make a combined column, gluing the genotypes from the rs429358 and rs7412 columns
# Both sides are cast to str so the concatenation also works if a genotype column was parsed as numeric
input_ped_df['rs429358_rs7412'] = input_ped_df['rs429358'].astype(str)+'_'+input_ped_df['rs7412'].astype(str)

# Initialize a dictionary with the genotypes to search what genotype the alleles generate
apoe_genotypes_dict = {
    'CC_TT' : 'e1/e1',
    'CT_TT' : 'e1/e2',
    'TC_TT' : 'e1/e2',
    'CC_CT' : 'e1/e4',
    'CC_TC' : 'e1/e4',
    'TT_TT' : 'e2/e2',
    'TT_TC' : 'e2/e3',
    'TT_CT' : 'e2/e3',
    'TC_TC' : 'e2/e4 or e1/e3',
    'CT_CT' : 'e2/e4 or e1/e3',
    'TC_CT' : 'e2/e4 or e1/e3',
    'CT_TC' : 'e2/e4 or e1/e3',
    'TT_CC' : 'e3/e3',
    'TC_CC' : 'e3/e4',
    'CT_CC' : 'e3/e4',
    'CC_CC' : 'e4/e4'
}

# Map the combined column to the dictionary to extract the genotypes
input_ped_df['APOE_GENOTYPE'] = input_ped_df['rs429358_rs7412'].map(apoe_genotypes_dict)

# If any of the combined alleles weren't in the dictionary, the dataframe now has NaN values
# This happens if you have a 0 or missingness somewhere, resulting in an unsure genotype call
# Replace these with something more useful, and state the APOE genotype as "unknown"
input_ped_df.replace(np.nan, 'unknown', regex=True, inplace=True)

# Make a file of just the FID, IID, SEX, PHENO, and APOE genotype
subset_geno_df = input_ped_df.drop(columns=['PAT', 'MAT', 'rs429358', 'rs7412'])

## Generate counts
# Generate APOE genotype counts and percentages for entire dataset
counts_df = pd.DataFrame(subset_geno_df['APOE_GENOTYPE'].value_counts().reset_index())
counts_df.columns = ['APOE_GENOTYPE', 'TOTAL_COUNT']
counts_df['TOTAL_PERCENT'] = counts_df['TOTAL_COUNT'] / subset_geno_df.shape[0] * 100

# Separate out into cases, controls, and missing phenotypes
    # This assumes controls=1 and cases=2 (missing is -9)

# Subset by phenotype
missing_pheno_df = subset_geno_df[subset_geno_df['PHENO'] == -9]
controls_df = subset_geno_df[subset_geno_df['PHENO'] == 1]
cases_df = subset_geno_df[subset_geno_df['PHENO'] == 2]

# Generate APOE genotype counts and percentages for missing phenotypes
missing_pheno_counts_df = pd.DataFrame(missing_pheno_df['APOE_GENOTYPE'].value_counts().reset_index())
missing_pheno_counts_df.columns = ['APOE_GENOTYPE', 'MISSING_PHENO_COUNT']
missing_pheno_counts_df['MISSING_PHENO_PERCENT'] = missing_pheno_counts_df['MISSING_PHENO_COUNT'] / missing_pheno_df.shape[0] * 100

# Generate APOE genotype counts and percentages for controls
controls_counts_df = pd.DataFrame(controls_df['APOE_GENOTYPE'].value_counts().reset_index())
controls_counts_df.columns = ['APOE_GENOTYPE', 'CONTROLS_COUNT']
controls_counts_df['CONTROLS_PERCENT'] = controls_counts_df['CONTROLS_COUNT'] / controls_df.shape[0] * 100

# Generate APOE genotype counts and percentages for cases
cases_counts_df = pd.DataFrame(cases_df['APOE_GENOTYPE'].value_counts().reset_index())
cases_counts_df.columns = ['APOE_GENOTYPE', 'CASES_COUNT']
cases_counts_df['CASES_PERCENT'] = cases_counts_df['CASES_COUNT'] / cases_df.shape[0] * 100

# Merge the dataframes together for final summary counts file
# counts_df lists every genotype present, so left-join the per-phenotype tables onto it.
# An inner join dropped any genotype absent from one subset, and emptied the whole
# summary when a subset (e.g. missing phenotypes) had no samples at all.
dataframes_tomerge = [counts_df, missing_pheno_counts_df, controls_counts_df, cases_counts_df]
merged_summary_df = reduce(lambda left,right: pd.merge(left,right,on='APOE_GENOTYPE',how='left'), dataframes_tomerge)

# Genotypes not observed in a subset have count 0 and 0 percent there
merged_summary_df = merged_summary_df.fillna(0)
count_columns = [column for column in merged_summary_df.columns if column.endswith('_COUNT')]
merged_summary_df[count_columns] = merged_summary_df[count_columns].astype(int)

## Export
complete_df_output = args.output + ".APOE_GENOTYPES.csv"
counts_df_output = args.output + ".APOE_SUMMARY.csv"

# Save out the complete dataframe as a .csv
print(f"Your complete genotype file has been saved here: {complete_df_output}")
subset_geno_df.to_csv(complete_df_output, index=False)

# Save out the counts as a .csv
print(f"The summary counts have been saved here: {counts_df_output}")
merged_summary_df.to_csv(counts_df_output, index=False)

# Done!
print("Thanks!")


In [None]:
%%bash
# Run the APOE genotyping script on the recoded .ped of this ancestry
python APOE_genotypes_PLINK_ped.py \
  -i adsp_vars_${ANCESTRY}_apoe_unrelated_recode.ped \
  -o adsp_vars_${ANCESTRY}_apoe_unrelated_recode_test

In [None]:
qc_covar = pd.read_csv("${WORK_DIR}/covars_for_QC.txt", sep="\t")
qc_covar.head()

In [None]:
qc_cases = qc_covar[qc_covar["PHENO"]==2]
qc_cases.info()

In [None]:
qc_cases_plink = qc_cases[["FID", "IID"]]
qc_cases_plink.to_csv("qc_cases_plink.txt", sep="\t", index=False)

In [None]:
qc_control = qc_covar[qc_covar["PHENO"]==1]
qc_control.info()

In [None]:
qc_control_plink = qc_control[["FID", "IID"]]
qc_control_plink.to_csv("qc_control_plink.txt", sep="\t", index=False)

In [None]:
# Attach the cohort membership list to the per-sample APOE genotype table and
# save the samples that appear in both.
import pandas as pd

# Read the CSV file into a DataFrame
# (per-sample output written by APOE_genotypes_PLINK_ped.py)
data = pd.read_csv('adsp_vars_${ANCESTRY}_apoe_unrelated_recode_test.APOE_GENOTYPES.csv')

# Read the txt file into a DataFrame with tab delimiter
# (FID/IID list of the chosen cohort)
qc_${COHORT} = pd.read_csv('qc_${COHORT}_plink.txt', delimiter='\t')

# Merge the two DataFrames based on FID and IID columns
# pd.merge defaults to an inner join, so only samples present in both tables remain
merged_data = pd.merge(data, qc_${COHORT}, on=['FID', 'IID'])

# Save the merged DataFrame as a new CSV file
merged_data.to_csv('adsp_vars_${ANCESTRY}_${COHORT}_apoe_unrelated.csv', index=False)

In [None]:
# Count how often the text '${GENOTYPE}' occurs anywhere in the cohort CSV.
# NOTE(review): this is a substring count over the whole file, so a genotype
# string contained in another value (e.g. inside 'e2/e4 or e1/e3') is counted too.
with open('adsp_vars_${ANCESTRY}_${COHORT}_apoe_unrelated.csv', 'r') as file:
    content = file.read()

# Number of non-overlapping occurrences of the genotype text
count = content.count('${GENOTYPE}')

# Report the result
print("Word count of '${GENOTYPE}' in adsp_vars_${ANCESTRY}_${COHORT}_apoe_unrelated.csv:", count)

# Query ADSP for demographic and phenotypic data

## Demographic data

In [None]:
import pandas as pd

In [None]:
%%bash
# Remove the samples listed in the ancestry's .related file from the merged
# ancestry pfile and write the result as a PLINK1 binary fileset.
# Each %%bash cell starts a fresh shell, so the module is loaded here like in
# the other plink2 cells.
module load plink
plink2 --pfile ${WORK_DIR}/FILTERED.merged_biallelic_${ANCESTRY} --remove ${WORK_DIR}/REMOVE.FILTERED.merged_biallelic_${ANCESTRY}.related --make-bed --out ADSP_${ANCESTRY}_unrelated

In [None]:
# Keep only the first two columns of the unrelated-sample .fam file
# (presumably FID and IID, per the PLINK .fam layout)
!awk '{print $1, $2}' ADSP_${ANCESTRY}_unrelated.fam > ADSP_${ANCESTRY}_unrelated_subset.fam

In [None]:
# Merge files for each group
# Concatenates the per-ancestry ID subsets (11 ancestries) into mergedIDS_file.fam;
# every listed file has to exist, i.e. the previous cells must have been run for each ancestry
!cat ADSP_AAC_unrelated_subset.fam ADSP_AFR_unrelated_subset.fam ADSP_AJ_unrelated_subset.fam ADSP_AMR_unrelated_subset.fam ADSP_CAH_unrelated_subset.fam ADSP_CAS_unrelated_subset.fam ADSP_EAS_unrelated_subset.fam ADSP_EUR_unrelated_subset.fam ADSP_FIN_unrelated_subset.fam ADSP_MDE_unrelated_subset.fam ADSP_SAS_unrelated_subset.fam > mergedIDS_file.fam

In [None]:
# Covariate table (tab-separated), loaded for a quick look at its columns
qc_covarsex = pd.read_csv(
    "${WORK_DIR}/covars_for_QC.txt",
    delimiter="\t",
)
qc_covarsex.head()

In [None]:
# Keep the covariate rows that contain, as a whole word, any first-column ID of
# the merged ID list. The IDs are piped to grep via `-f -` instead of process
# substitution `<(...)`, which not every shell used by `!` supports.
!awk '{print $1}' mergedIDS_file.fam | grep -w -F -f - ${WORK_DIR}/covars_for_QC.txt > filtered_covars_for_QC.txt
# Split on column 6: value 1 goes to controls.txt, value 2 to cases.txt
!awk '$6 == 1 {print > "controls.txt"} $6 == 2 {print > "cases.txt"}' filtered_covars_for_QC.txt
!wc controls.txt
!wc cases.txt

In [None]:
# Split controls on column 7: value 1 -> male_control.txt, value 2 -> female_controls.txt
# NOTE(review): column 7 is presumably SEX (1 = male, 2 = female) — confirm against the covariate header
!awk '$7 == 1 {print > "male_control.txt"} $7 == 2 {print > "female_controls.txt"}' controls.txt
!wc male_control.txt
!wc female_controls.txt

In [None]:
# Same split for cases: value 1 -> male_case.txt, value 2 -> female_case.txt
!awk '$7 == 1 {print > "male_case.txt"} $7 == 2 {print > "female_case.txt"}' cases.txt
!wc male_case.txt
!wc female_case.txt

In [None]:
# Mean and (population) standard deviation of column 8 for each sex/phenotype group.
# An empty or missing input no longer ends in a division by zero, and a variance
# that rounds to slightly below zero is clamped so sqrt() does not return nan.
# NOTE(review): column 8 is presumably AGE — confirm against the covariate header.
!awk '{sum += $8; sumsq += ($8)^2; count++} END {if (count == 0) {print "Femalecontrol: no rows"; exit} avg = sum/count; var = sumsq/count - (avg)^2; if (var < 0) var = 0; sd = sqrt(var); print "Femalecontrol: Average Age =", avg, "SD =", sd}' female_controls.txt
!awk '{sum += $8; sumsq += ($8)^2; count++} END {if (count == 0) {print "Malecontrols: no rows"; exit} avg = sum/count; var = sumsq/count - (avg)^2; if (var < 0) var = 0; sd = sqrt(var); print "Malecontrols: Average Age =", avg, "SD =", sd}' male_control.txt
!awk '{sum += $8; sumsq += ($8)^2; count++} END {if (count == 0) {print "Femalecase: no rows"; exit} avg = sum/count; var = sumsq/count - (avg)^2; if (var < 0) var = 0; sd = sqrt(var); print "Femalecase: Average Age =", avg, "SD =", sd}' female_case.txt
!awk '{sum += $8; sumsq += ($8)^2; count++} END {if (count == 0) {print "Malecase: no rows"; exit} avg = sum/count; var = sumsq/count - (avg)^2; if (var < 0) var = 0; sd = sqrt(var); print "Malecase: Average Age =", avg, "SD =", sd}' male_case.txt

## Phenotype data

In [None]:
%%bash
# Extract the variant of interest from the per-chromosome pfile into a PLINK1
# binary fileset named chr${}_variant.
module load plink
# --pfile spelled with two dashes, consistent with the other plink2 calls
plink2 --pfile ${WORK_DIR}/chr${}.compact_filtered.r4.wgs.biallelic --snps chr${}:position:A1:A2 --make-bed --out chr${}_variant

In [None]:

%%bash
# Recode the variant to additive coding, then keep the rows whose 7th column is 0 or 1
module load plink
plink2 \
  --bfile chr${}_variant \
  --recode A \
  --out chr${}_variant_recoded
awk '$7 == 0 || $7 == 1' chr${}_variant_recoded.raw \
  > chr${}_variant_recoded.raw.filtered.raw

In [None]:
%%bash
# Restrict the variant fileset to the samples of the chosen cohort
module load plink/1.9
plink \
  --bfile chr${}_variant \
  --keep qc_${COHORT}_plink.txt \
  --make-bed \
  --out chr${}_variant_${COHORT}

In [None]:
# Intersect the cohort .fam samples with the samples kept in the filtered .raw
# file and write the shared FID/IID pairs to a new .fam-style file.
import pandas as pd

# Load the .fam file
# sep=r'\s+' replaces the deprecated delim_whitespace=True; IDs are read as
# strings so both tables carry the same key dtype for the merge
fam_file = 'chr${}_variant_${COHORT}.fam'
fam_df = pd.read_csv(fam_file, sep=r'\s+', header=None, usecols=[0, 1], names=['FID', 'IID'], dtype=str)

# Load the .raw file without headers, taking the first two columns
raw_file = 'chr${}_variant_recoded.raw.filtered.raw'
raw_df = pd.read_csv(raw_file, sep=r'\s+', header=None, usecols=[0, 1], names=['FID', 'IID'], dtype=str)

# Merge the dataframes on FID and IID
merged_df = pd.merge(fam_df, raw_df, on=['FID', 'IID'], how='inner')

# Save the result back to a .fam file
# (only the FID and IID columns are written)
merged_df.to_csv('chr${}_variant_${COHORT}_filtered.fam', sep=' ', header=False, index=False)

In [None]:
%%writefile ${COHORT}_sampleids.txt

In [None]:
# Select the covariate rows that belong to the samples listed in
# ${COHORT}_sampleids.txt and save them to a tab-separated file.
import pandas as pd

# FID/IID are read as strings in both tables so the merge keys share one dtype
id_dtypes = {'FID': str, 'IID': str}

# Load the qc_covar file
qc_covar = pd.read_csv("${WORK_DIR}/covars_for_QC.txt", sep="\t", dtype=id_dtypes)

# Load the Cohort_sampleids file without headers
# sep=r'\s+' replaces the deprecated delim_whitespace=True
cohort_sampleids = pd.read_csv("${COHORT}_sampleids.txt", sep=r'\s+', header=None, names=['FID', 'IID'], dtype=id_dtypes)

# Merge the dataframes on FID and IID
merged_df = pd.merge(qc_covar, cohort_sampleids, on=['FID', 'IID'], how='inner')

# Save the result back to a file
merged_df.to_csv("qc_covar_${COHORT}_sampleids.txt", sep="\t", index=False)

# Query ADSP for resilience and protective variants

In [None]:
import numpy as np
import pandas as pd
import sys
from functools import reduce
import argparse

In [None]:
# Look up each variant of interest in the .pvar file of its chromosome.
import subprocess
import os

# Define variations
variations = [
    "chr${}:${Position}:${A1}:${A2}"
    ]
# Define file directory
file_dir = "${WORK_DIR}"

# Iterate over each variation
for variation in variations:
    # Extract chromosome from variation
    chromosome = variation.split(":")[0]
    # Construct file path
    # os.path.join adds the separator that plain string concatenation left out
    # when file_dir has no trailing slash
    file_path = os.path.join(file_dir, f"{chromosome}.compact_filtered.r4.wgs.biallelic.pvar")
    # Check if file exists
    if os.path.exists(file_path):
        # Grep for variation in file
        print(f"Searching for variation {variation} in file {file_path}")
        # -F: literal match, -w: whole word, so an ID does not also match
        # longer IDs that merely start with it
        subprocess.run(["grep", "-w", "-F", "-e", variation, file_path])
    else:
        print(f"File not found: {file_path}")


In [None]:
# For each variant: extract it with plink2, recode it to additive coding, and
# keep the .raw rows whose 7th column is 0 or 1.
import subprocess
import os

# Define the variants
variants = [
    "chr${}:${Position}:${A1}:${A2}"
    ]

# Define the directory containing the files for each chromosome
file_dir = "${WORK_DIR}"

# List to store the names of the created files
created_files = []

# Loop over each variant
for variant in variants:
    # Extract chromosome from variant
    chromosome = variant.split(":")[0]

    # Define the file for the current chromosome
    file = f"{chromosome}.compact_filtered.r4.wgs.biallelic"

    # Write the variant to a temporary file (one ID per line)
    with open("temp_variant.txt", "w") as f:
        f.write(variant + "\n")

    # Construct the output file name
    output_file = f"extract_{variant.replace(':', '_')}"

    # Extract the variant; the temporary file is removed even if plink2 cannot be started
    try:
        extract_result = subprocess.run([
            "plink2",
            "--pfile",
            os.path.join(file_dir, file),
            "--extract",
            "temp_variant.txt",
            "--make-bed",
            "--out",
            output_file
        ])
    finally:
        os.remove("temp_variant.txt")

    # Only successfully created filesets are passed on to the next steps
    if extract_result.returncode == 0:
        created_files.append(output_file)
    else:
        print(f"plink2 extraction failed for {variant} (exit code {extract_result.returncode}); skipping")

# Run the plink2 command for all created files
for file_name in created_files:
    subprocess.run([
        "plink2",
        "--bfile",
        file_name,
        "--recode",
        "A",
        "--out",
        f"{file_name}_recoded"
    ])

# Filter the recoded files
for file_name in created_files:
    recoded_file = f"{file_name}_recoded.raw"
    if os.path.exists(recoded_file):
        with open(f"{file_name}_filtered.raw", "w") as f_out:
            with open(recoded_file, "r") as f_in:
                for line in f_in:
                    fields = line.split()
                    # Lines with fewer than 7 fields (e.g. blank lines) are skipped
                    # instead of raising IndexError
                    if len(fields) > 6 and fields[6] in ("0", "1"):
                        f_out.write(line)


In [None]:
#!/bin/bash
# For every per-genotype/per-ancestry APOE CSV of the chosen cohort in csv_dir,
# count how many of its samples (first two columns) also appear in each
# *_filtered.raw file of the current directory, and write the counts to
# final${COHORT}_counts.csv with the columns Genotype,Ancestry,Variation,Count.

# Define ancestries and genotypes
declare -a ancestries=('eur' 'afr' 'amr' 'eas' 'sas' 'mde' 'aj' 'fin' 'aac' 'cas' 'cah')
declare -a genotypes=('00_00_unknown' '00_CC_unknown' '00_TC_unknown' 'TT_00_unknown' 'CT_00_unknown' 'CC_00_unknown'
    'e1_e1' 'e1_e2' 'e1_e4' 'e2_e2' 'e2_e3' 'e2e4_or_e1e3' 'e3_e3' 'e3_e4' 'e4_e4')

# Path to the directory containing the CSV files.
# Double quotes, so ${WORK_DIR} is expanded (single quotes kept the text literal).
csv_dir="${WORK_DIR}"

# Path to the current directory for raw files
raw_dir='./'

if [[ ! -d "$csv_dir" ]]; then
    echo "CSV directory not found: $csv_dir" >&2
    exit 1
fi

# Temporary file for intermediate results; removed on every exit path
tmp_file=$(mktemp) || exit 1
trap 'rm -f -- "$tmp_file"' EXIT

# Collect the input files with globs instead of parsing `ls` output
shopt -s nullglob
csv_files=("$csv_dir"/filtered_*_adsp_vars_*_"${COHORT}"_apoe_unrelated.csv)
raw_files=("${raw_dir%/}"/*_filtered.raw)
shopt -u nullglob

# Total number of CSV files, for progress tracking
total_files=${#csv_files[@]}
progress_counter=0

for csv_path in "${csv_files[@]}"; do
    file_name=${csv_path##*/}

    # Identify the genotype and ancestry encoded in the file name. The cohort
    # comes from ${COHORT}, matching the file selection above (it was
    # hard-coded to "cases" before).
    matched_genotype=''
    matched_ancestry=''
    for genotype in "${genotypes[@]}"; do
        for ancestry in "${ancestries[@]}"; do
            if [[ $file_name == "filtered_${genotype}_adsp_vars_${ancestry}_${COHORT}_apoe_unrelated.csv" ]]; then
                matched_genotype=$genotype
                matched_ancestry=$ancestry
                break 2
            fi
        done
    done

    if [[ -n $matched_genotype ]]; then
        # Sorted FID<TAB>IID keys of the CSV, built once per CSV file. The
        # separator keeps e.g. ("ab","c") distinct from ("a","bc").
        csv_ids=$(awk -F',' '{print $1 "\t" $2}' "$csv_path" | LC_ALL=C sort)

        for raw_path in "${raw_files[@]}"; do
            raw_name=${raw_path##*/}

            # Extract variations from the raw file name (fields 2-5 between underscores)
            variations=$(printf '%s\n' "$raw_name" | cut -d'_' -f2-5)

            # Number of keys present in both files; `wc -l` on the pipe yields 0 when none match
            count=$(LC_ALL=C comm -12 \
                <(printf '%s\n' "$csv_ids") \
                <(awk -F'\t' '{print $1 "\t" $2}' "$raw_path" | LC_ALL=C sort) | wc -l)

            printf '%s,%s,%s,%d\n' "$matched_genotype" "$matched_ancestry" "$variations" "$count" >> "$tmp_file"
        done
    fi

    # Progress is counted per matching CSV file, so it can reach total_files
    progress_counter=$((progress_counter + 1))
    echo "Processed $progress_counter/$total_files files"
done

# Define headers for the final CSV file
headers='Genotype,Ancestry,Variation,Count'

# Save results to a final CSV file
{
    echo "$headers"
    cat "$tmp_file"
} > "final${COHORT}_counts.csv"

In [None]:
# Reorder the rows and columns of every genotype table (.csv) in the directory
# and save each result next to the input with a .reordered.csv suffix.
import pandas as pd
import os

# Define the custom row order
custom_row_order = [
    '00_00_unknown',
    '00_CC_unknown',
    '00_TC_unknown',
    'TT_00_unknown',
    'CT_00_unknown',
    'CC_00_unknown',
    'e1_e1',
    'e1_e2',
    'e1_e4',
    'e2_e2',
    'e2_e3',
    'e2e4_or_e1e3',
    'e3_e3',
    'e3_e4',
    'e4_e4',
    'Total'
]

# Define the new order of columns
new_column_order = ['Genotype', 'Total', 'eur', 'afr', 'amr', 'eas', 'sas', 'mde', 'aj', 'fin', 'aac', 'cas', 'cah']

# Specify the directory containing the files
directory = '${WORK_DIR}'

# Map each row label to its desired position (built once, it does not depend on the file)
row_mapping = {row: index for index, row in enumerate(custom_row_order)}

# Get a list of all files in the directory
file_list = os.listdir(directory)

for file_name in file_list:
    # Only CSV inputs; outputs of an earlier run (.reordered.csv) are not processed again
    if not file_name.endswith('.csv') or file_name.endswith('.reordered.csv'):
        continue

    # Read the CSV file into a DataFrame
    file_path = os.path.join(directory, file_name)
    df = pd.read_csv(file_path)

    # Skip CSV files that are not genotype tables instead of failing with a KeyError
    missing_columns = [column for column in new_column_order if column not in df.columns]
    if missing_columns:
        print(f"Skipping {file_name}: missing columns {missing_columns}")
        continue

    # Apply the custom row order
    df['row_order'] = df['Genotype'].map(row_mapping)
    df = df.sort_values('row_order').drop('row_order', axis=1).reset_index(drop=True)

    # Reorder the columns
    df = df[new_column_order]

    # Save the DataFrame back to a CSV file with the .reordered.csv suffix
    new_file_name = os.path.splitext(file_name)[0] + '.reordered.csv'
    new_file_path = os.path.join(directory, new_file_name)
    df.to_csv(new_file_path, index=False)


In [None]:
import pandas as pd

# Read the CSV file into a pandas DataFrame
df = pd.read_csv('final${COHORT}_counts.csv')

# Define specific total counts for each ancestry
total_counts = {
    'eur': ${COUNT},
    'afr': ${COUNT},
    'amr': ${COUNT},
    'eas': ${COUNT},
    'sas': ${COUNT},
    'mde': ${COUNT},
    'aj': ${COUNT}7,
    'fin': ${COUNT},
    'aac': ${COUNT},
    'cas': ${COUNT},
    'cah': ${COUNT}
}

# Iterate over unique variations
for variation in df['Variation'].unique():
    # Create a subset of the DataFrame for the current variation
    subset = df[df['Variation'] == variation]

    # Pivot the subset to create the desired table
    pivot_table = subset.pivot(index='Genotype', columns='Ancestry', values='Count')

    # Add a total row for the table
    pivot_table.loc['Total'] = pivot_table.sum()

    # Add total counts at the end of each ancestry column
    for ancestry, total_count in total_counts.items():
        pivot_table.loc['Total', ancestry] = total_count

    # Calculate percentage for each value based on total count for each ancestry
    pivot_table_percentage = pivot_table.div(pivot_table.loc['Total']) * 100

    # Replace "NaN" values with 0
    pivot_table_percentage = pivot_table_percentage.fillna(0)

    # Combine the original counts with the percentages
    pivot_table_combined = pivot_table.astype(str) + ',' + pivot_table_percentage.round(2).astype(str) + '%'

    # Remove ",100.0%" from the total row
    pivot_table_combined.loc['Total'] = pivot_table_combined.loc['Total'].str.replace(',100.0%', '')

    # Remove ",0.0%" from the total row
    pivot_table_combined.loc['Total'] = pivot_table_combined.loc['Total'].str.replace(',0.0%', '')
    
    # Save the pivot_table_combined to a new CSV file
    pivot_table_combined.to_csv(f'${COHORT}_{variation}.csv')

    # Print a message indicating the file has been saved
    print(f'Table for Variation: {variation} saved to ${COHORT}_{variation}.csv')
    print('\n')
