# PRS 

* **Project:** ADRD Genetic Diversity in Biobanks
* **Version:** Python/3.9 and 3.10
* **Last Updated:** 10-FEB-2025

## Notebook Overview
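Calculate a polygenic risk score (PRS), select individuals in the upper quartile of the score, and run logistic regression on selected protective/disease-modifying variants.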

# Query ADSP to calculate polygenic risk score

In [10]:
import numpy as np
import pandas as pd
import sys
from functools import reduce
import argparse

In [None]:
%%bash
## Merge data
# Merge the per-chromosome PLINK2 filesets found under
# ${WORK_DIR}/UNFILTERED_PLINK into one fileset, ${WORK_DIR}/merged_UNQC_Files.
# Note: the %%bash magic has to be the first line of the cell, so the title
# comment lives below it.
module load plink/2.0

# Abort early instead of silently operating on "/UNFILTERED_PLINK".
: "${WORK_DIR:?WORK_DIR must be set}"

# Define input and output locations
INPUT_DIR="${WORK_DIR}/UNFILTERED_PLINK"
OUTPUT_DIR="${WORK_DIR}"
OUTPUT_PREFIX="merged_UNQC_Files"

# List file holding one fileset prefix (path without extension) per line
MERGE_LIST="${OUTPUT_DIR}/pgen_list.txt"

# Start from an empty list so re-running the cell does not append duplicates.
: > "${MERGE_LIST}"

for CHR in {1..22} X Y M; do
    PREFIX="${INPUT_DIR}/chr${CHR}.compact_filtered.r4.wgs.biallelic"
    # Only chromosomes whose .pgen file exists take part in the merge.
    if [[ -f "${PREFIX}.pgen" ]]; then
        printf '%s\n' "${PREFIX}" >> "${MERGE_LIST}"
    fi
done

# Run PLINK2 to merge all listed .pgen/.pvar/.psam filesets into one
plink2 --pmerge-list "${MERGE_LIST}" pfile \
       --make-pgen \
       --out "${OUTPUT_DIR}/${OUTPUT_PREFIX}"


In [None]:
# Write the first two whitespace-separated fields of every line of the EUR .fam file
# to keep_samples.txt (presumably FID and IID, going by the input file name; confirm).
# The resulting list is what the sample-filtering step passes to plink2 --keep.
! awk '{print $1, $2}' ${WORK_DIR}/FID_IID_PHENO_EUR.fam > keep_samples.txt

In [None]:
%%bash
module load plink/2.0

# Restrict the merged fileset to the samples listed in keep_samples.txt and
# write the result as a PLINK1 binary fileset (.bed/.bim/.fam).
# Input and output both live directly under WORK_DIR.
SRC_PFILE="${WORK_DIR}/merged_UNQC_Files"
DEST_BFILE="${WORK_DIR}/filtered_UNQC_Files"

plink2 --pfile "${SRC_PFILE}" --keep keep_samples.txt --make-bed --out "${DEST_BFILE}"

In [None]:
## Check score file
# Print the score file (the file handed to plink2 --score in the PRS step) for a visual check.
! cat AD_GRS_Kunkle_final_UNQC

In [None]:
%%bash
## Run PRS
# Score every sample of the filtered PLINK1 fileset with the weights in
# AD_GRS_Kunkle_final_UNQC. Results are written next to the notebook with the
# prefix PRS_results_UNQC_final (the .sscore file is read by the merge step).
# Note: the %%bash magic has to be the first line of the cell.
module load plink/2.0

# Abort early instead of silently looking for "/filtered_UNQC_Files".
: "${WORK_DIR:?WORK_DIR must be set}"

plink2 --bfile "${WORK_DIR}/filtered_UNQC_Files" \
       --score AD_GRS_Kunkle_final_UNQC \
       --memory 128000 \
       --out PRS_results_UNQC_final


In [None]:
## Merge PRS output with covariate file
import os

import pandas as pd

# File paths. Python does not expand shell-style variables inside string
# literals, so ${WORK_DIR} is resolved explicitly from the environment
# (it is left untouched when WORK_DIR is not set).
prs_file = "PRS_results_UNQC_final.sscore"
covars_file = os.path.expandvars("${WORK_DIR}/covars_alldata_with999forAGRandRACE_PCA.txt")
output_file = "merged_data_UNQC.txt"

# Load PRS file (whitespace-delimited). Everything is read as text so that
# sample IDs are never reinterpreted as numbers.
# sep=r"\s+" is the non-deprecated spelling of delim_whitespace=True.
prs_df = pd.read_csv(prs_file, sep=r"\s+", dtype=str)
prs_df.columns = prs_df.columns.str.strip()

# Rename '#FID' to 'FID' if needed (rename ignores labels that are absent)
prs_df = prs_df.rename(columns={"#FID": "FID"})

# Load Covariates file (tab-delimited)
covars_df = pd.read_csv(covars_file, sep="\t", dtype=str)
covars_df.columns = covars_df.columns.str.strip()

# Print column names for debugging
print("PRS File Columns:", prs_df.columns.tolist())
print("Covariates File Columns:", covars_df.columns.tolist())

# Ensure the join keys have matching (string) types in both tables
for frame in (prs_df, covars_df):
    for key in ("FID", "IID"):
        frame[key] = frame[key].astype(str)

# Merge on both FID and IID; only samples present in both tables are kept
merged_df = prs_df.merge(covars_df, on=["FID", "IID"], how="inner")

# Save merged file
merged_df.to_csv(output_file, sep="\t", index=False)

print(f"Merged file saved as: {output_file}")


In [None]:
## Normalize score to Z-score

import pandas as pd

# Load the merged file
file_path = "merged_data_UNQC.txt"
df = pd.read_csv(file_path, sep="\t")

# Ensure SCORE and PHENO are numeric (values that cannot be parsed become NaN)
df["SCORE1_AVG"] = pd.to_numeric(df["SCORE1_AVG"], errors="coerce")
df["PHENO"] = pd.to_numeric(df["PHENO"], errors="coerce")

# Reference distribution: scores of the controls (PHENO == 1)
control_scores = df.loc[df["PHENO"] == 1, "SCORE1_AVG"]
mean_controls = control_scores.mean()
sd_controls = control_scores.std()

# With fewer than two scored controls the SD is NaN, and with identical scores
# it is 0; either way every Z-score would silently become NaN/inf, so stop here.
if pd.isna(sd_controls) or sd_controls == 0:
    raise ValueError(
        "Cannot standardize SCORE1_AVG: the standard deviation among controls "
        f"(PHENO == 1) is {sd_controls}; check the PHENO coding and the score column."
    )

# Apply Z-score normalization relative to the controls
df["SCORE_Z"] = (df["SCORE1_AVG"] - mean_controls) / sd_controls

# Save the normalized dataset
output_file = "merged_data_zscore_UNQC.txt"
df.to_csv(output_file, sep="\t", index=False)

print(f"Z-score normalized data saved to: {output_file}")
print(f"Mean (Controls PHENO=1): {mean_controls:.6f}, SD (Controls PHENO=1): {sd_controls:.6f}")


In [None]:
## Select the upper 25th percentile

import pandas as pd

# Read the table that carries the SCORE_Z column
file_path = "merged_data_zscore_UNQC.txt"
df = pd.read_csv(file_path, sep="\t")

# Cut-off: 75th percentile of SCORE_Z
score_z = df["SCORE_Z"]
percentile_75 = score_z.quantile(0.75)

# Rows at or above the cut-off form the selected group
selected_df = df[score_z.ge(percentile_75)]

# Write the selected rows as a tab-separated table
output_file = "selected_upper_25th_percentile_UNQC.txt"
selected_df.to_csv(output_file, sep="\t", index=False)

print(f"Upper 25th percentile threshold: {percentile_75:.6f}")
print(f"Selected individuals saved to: {output_file}")


In [None]:
## Create pheno file
# Keep tab-separated columns 1, 2 and 9 of the selected-individuals table.
# NOTE(review): columns 1-2 are presumably FID/IID and column 9 presumably PHENO; this depends on
# the column order of the merged covariate table, so confirm it before reusing this cell.
# The output is written relative to the notebook's working directory.
! cut -f1,2,9 selected_upper_25th_percentile_UNQC.txt> 25th_percentile_pheno_UNQC_eur

In [None]:
## Check the count of cases and controls
import os

import pandas as pd

# Load the file. Python does not expand ${WORK_DIR} inside a string literal,
# so it is resolved from the environment here (left untouched when unset).
# NOTE(review): the pheno file is created relative to the notebook's working
# directory; confirm that directory is WORK_DIR.
file_path = os.path.expandvars("${WORK_DIR}/25th_percentile_pheno_UNQC_eur")

# Read the file assuming it's tab-separated
df = pd.read_csv(file_path, sep="\t")

# Count the occurrences of each value in the third column
counts = df.iloc[:, 2].value_counts()

# Print the counts (0 when a value never occurs)
print(f"Control (1): {counts.get(1, 0)}")
print(f"Case (2): {counts.get(2, 0)}")

# Extract protective/disease-modifying variants

In [None]:
%%bash
# Extract each listed variant from its per-chromosome PLINK2 fileset into its
# own small fileset, and record every successfully created fileset in
# ${WORK_DIR}/pmerge_list.txt for the merge step.
module load plink/2.0

# Abort early instead of silently operating on "/UNFILTERED_PLINK".
: "${WORK_DIR:?WORK_DIR must be set}"

# Define file directory
FILE_DIR="${WORK_DIR}/UNFILTERED_PLINK"

# Define variants. The text before the first ':' selects the per-chromosome
# fileset; the full string is the variant ID handed to --extract.
VARIANTS=(
    "chr19:1043104:G:A" "chr21:26171645:A:G" "chr21:26171723:T:C"
    "chr4:139008878:A:G" "chr11:121564878:T:C" "chr14:92466484:T:C"
    "chr20:56422512:G:A" "chr7:143410783:C:A" "chr15:50709337:T:G"
    "chr21:25897620:C:T" "chr16:81908423:C:G" "chr19:43100929:G:A"
    "chr19:44905307:A:T" "chr2:215424292:C:T" "chr2:215386857:G:A"
    "chr7:103472855:T:C" "chr19:44892887:C:T" "chr2:26135287:A:G"
    "chr17:46275856:T:G" "chr19:3405594:T:A" "chr19:44908756:C:A"
)

# List to store only successfully created filesets for merging (reset on every run)
MERGE_LIST="${WORK_DIR}/pmerge_list.txt"
: > "${MERGE_LIST}"

# Scratch file holding the single variant ID given to --extract; removed on exit
TEMP_FILE="$(mktemp "${WORK_DIR}/temp_variant.XXXXXX")" || exit 1
trap 'rm -f -- "${TEMP_FILE}"' EXIT

# Extract variants from original PLINK files
for VARIANT in "${VARIANTS[@]}"; do
    CHR="${VARIANT%%:*}"
    FILE="${FILE_DIR}/${CHR}.compact_filtered.r4.wgs.biallelic"
    OUTPUT_FILE="${WORK_DIR}/extract_${VARIANT//:/_}"

    # Write variant to the scratch file
    printf '%s\n' "${VARIANT}" > "${TEMP_FILE}"

    # Extract variant and create PLINK2 format (.pgen, .psam, .pvar).
    # The fileset is added to the merge list only when plink2 succeeded and
    # all three files exist and are non-empty; otherwise the merge step would
    # either find an empty list or trip over a missing fileset.
    if plink2 --pfile "${FILE}" --extract "${TEMP_FILE}" --make-pgen --out "${OUTPUT_FILE}" \
        && [[ -s "${OUTPUT_FILE}.pgen" && -s "${OUTPUT_FILE}.pvar" && -s "${OUTPUT_FILE}.psam" ]]; then
        printf '%s\n' "${OUTPUT_FILE}" >> "${MERGE_LIST}"
    else
        printf 'warning: %s was not extracted; leaving it out of the merge list\n' "${VARIANT}" >&2
    fi
done


In [None]:
%%bash
# Merge the per-variant PLINK2 filesets listed in ${WORK_DIR}/pmerge_list.txt
# into a single fileset, ${WORK_DIR}/merged_variants.
module load plink/2.0

# Abort early instead of silently looking for "/pmerge_list.txt".
: "${WORK_DIR:?WORK_DIR must be set}"

# Merge list and final merged output prefix
MERGE_LIST="${WORK_DIR}/pmerge_list.txt"
MERGED_FILE="${WORK_DIR}/merged_variants"

# Stop when the merge list is missing or empty
if [[ ! -s "${MERGE_LIST}" ]]; then
    echo "❌ No extracted variants with .psam files found. Exiting." >&2
    exit 1
fi

# Merge only successfully extracted PLINK2 files
plink2 --pmerge-list "${MERGE_LIST}" pfile --make-pgen --out "${MERGED_FILE}"

In [None]:
%%bash
# Convert the merged PLINK2 fileset to PLINK1 binary format (.bed/.bim/.fam),
# reusing the same output prefix.
module load plink/2.0

# Abort early instead of silently looking for "/merged_variants".
: "${WORK_DIR:?WORK_DIR must be set}"

# Define the merged PGEN fileset prefix
MERGED_FILE="${WORK_DIR}/merged_variants"

# Convert to BED format (quoted so a WORK_DIR containing spaces still works)
plink2 --pfile "${MERGED_FILE}" --make-bed --out "${MERGED_FILE}"

# Run glm

In [None]:
%%bash
module load plink/2.0

# Association test on the merged variants. The phenotype comes from the
# upper-quartile pheno file; the covariates (SCORE_Z, SEX, AGE, PC1-PC5) are
# taken from the selected-individuals table.
# The options are collected in an array and passed to plink2 unchanged.
glm_args=(
  --bfile "${WORK_DIR}/merged_variants"
  --double-id
  --pheno "${WORK_DIR}/25th_percentile_pheno_UNQC_eur"
  --adjust
  --ci 0.95
  --covar "${WORK_DIR}/selected_upper_25th_percentile_UNQC.txt"
  --covar-name SCORE_Z,SEX,AGE,PC1,PC2,PC3,PC4,PC5
  --threads 15
  --covar-variance-standardize
  --out "${WORK_DIR}/Logistic_FID_IID_PHENO_case_controls_UNQC_EUR"
  --glm omit-ref firth-fallback cols=+a1freq,+a1freqcc,+a1count,+totallele,+a1countcc,+totallelecc,+gcountcc,+err
  --silent
)

plink2 "${glm_args[@]}"

In [None]:
import os

import pandas as pd

# Python does not expand ${WORK_DIR} inside string literals, so both paths are
# resolved from the environment (left untouched when WORK_DIR is not set).
glm_path = os.path.expandvars('${WORK_DIR}/Logistic_FID_IID_PHENO_case_controls_UNQC_EUR.PHENO.glm.logistic.hybrid')
add_path = os.path.expandvars('${WORK_DIR}/ADD_filtered_EUR_PRS_25th_UNQC.txt')

# Load the data (tab-separated glm results)
data = pd.read_csv(glm_path, sep='\t')

# Filter rows where TEST == 'ADD'
add_filtered = data[data['TEST'] == 'ADD']

# Save to a new file
add_filtered.to_csv(add_path, sep='\t', index=False)


## Adjust by APOE

In [None]:
import os

import pandas as pd

# File paths. Python does not expand ${WORK_DIR} inside string literals, so it
# is resolved from the environment (left untouched when WORK_DIR is not set).
selected_file = os.path.expandvars("${WORK_DIR}/selected_upper_25th_percentile_UNQC.txt")
apoe_file = os.path.expandvars("${WORK_DIR}/covars_alldata_with999forAGRandRACE_PCA_APOE012.txt")
output_file = os.path.expandvars("${WORK_DIR}/selected_UNQC_with_APOE_status.txt")

# Load selected individuals file (all columns as text)
selected_df = pd.read_csv(selected_file, sep="\t", dtype=str)

# Load APOE file (all columns as text)
apoe_df = pd.read_csv(apoe_file, sep="\t", dtype=str)

# Left-join APOE status onto the selected individuals by FID and IID:
# every selected individual is kept, and APOE_STATUS_012 is missing for
# individuals that do not appear in the APOE file.
merged_df = selected_df.merge(apoe_df[["FID", "IID", "APOE_STATUS_012"]], on=["FID", "IID"], how="left")

# Save the updated file. Missing values are written as "NA" rather than as
# empty fields, so every row keeps the same number of non-empty tokens.
# NOTE(review): assumes the downstream tool accepts "NA" as a missing
# covariate value; confirm.
merged_df.to_csv(output_file, sep="\t", index=False, na_rep="NA")

print(f"Merged file saved to: {output_file}")


In [None]:
%%bash
module load plink/2.0

# Same association test as the unadjusted run, with APOE_STATUS_012 added to
# the covariates; covariates are read from the table that carries APOE status.
# The options are collected in an array and passed to plink2 unchanged.
glm_args=(
  --bfile "${WORK_DIR}/merged_variants"
  --double-id
  --pheno "${WORK_DIR}/25th_percentile_pheno_UNQC_eur"
  --adjust
  --ci 0.95
  --covar "${WORK_DIR}/selected_UNQC_with_APOE_status.txt"
  --covar-name SCORE_Z,APOE_STATUS_012,SEX,AGE,PC1,PC2,PC3,PC4,PC5
  --threads 15
  --covar-variance-standardize
  --out "${WORK_DIR}/Logistic_FID_IID_PHENO_case_controls_UNQC_APOE_EUR"
  --glm omit-ref firth-fallback cols=+a1freq,+a1freqcc,+a1count,+totallele,+a1countcc,+totallelecc,+gcountcc,+err
  --silent
)

plink2 "${glm_args[@]}"

In [None]:
import os

import pandas as pd

# Python does not expand ${WORK_DIR} inside string literals, so both paths are
# resolved from the environment (left untouched when WORK_DIR is not set).
glm_path = os.path.expandvars('${WORK_DIR}/Logistic_FID_IID_PHENO_case_controls_UNQC_APOE_EUR.PHENO.glm.logistic.hybrid')
add_path = os.path.expandvars('${WORK_DIR}/ADD_filtered_EUR_PRS_25th_UNQC_APOE.txt')

# Load the data (tab-separated glm results of the APOE-adjusted run)
data = pd.read_csv(glm_path, sep='\t')

# Filter rows where TEST == 'ADD'
add_filtered = data[data['TEST'] == 'ADD']

# Save to a new file
add_filtered.to_csv(add_path, sep='\t', index=False)