In [None]:
# Step 0: Get the columns from the GWAS file and process them (0_get_train_input.py)
import os
import subprocess
import pandas as pd
import scipy.stats as stats


def process_single_job(
    gwas_filepath,
    output_prefix,
    trait_name,
):
    """Convert one tab-separated GWAS result file into a cleaned Z-score table.

    Reads the columns ``#CHROM, POS, ID, REF, ALT, P, OBS_CT, A1_FREQ`` plus
    ``BETA`` (or ``OR`` when the file has no ``BETA`` column), derives a signed
    Z-score from the two-sided p-value and writes a tab-separated table with
    the columns ``CHR, SNP, BP, A1, A2, Z, N, A1FREQ``.

    Args:
        gwas_filepath: Path of the tab-separated GWAS file to read.
        output_prefix: Full path of the output file (used as-is, not as a
            prefix). Its parent directory is created when missing.
        trait_name: Label used only in the progress messages.

    Raises:
        ValueError: Raised by pandas when a required column is absent.
    """
    print(f"Processing {trait_name} from {gwas_filepath}")

    # Only the header is needed to decide which effect column is available.
    header = pd.read_csv(gwas_filepath, sep="\t", nrows=0).columns
    # BETA is signed around 0, an odds ratio around 1.
    effect_col, null_effect = ("BETA", 0) if "BETA" in header else ("OR", 1)

    df = pd.read_csv(
        gwas_filepath,
        sep="\t",
        usecols=["#CHROM", "POS", "ID", "REF", "ALT", effect_col, "P", "OBS_CT", "A1_FREQ"],
    )

    # Clip so that P == 0 does not turn into an infinite Z-score.
    df["P"] = df["P"].clip(lower=1e-300, upper=1.0)
    # Magnitude of the Z-score implied by a two-sided p-value.
    df["c"] = -stats.norm.ppf(df["P"] / 2)

    effect = df[effect_col]
    # Sign follows the effect direction. A missing effect must yield a missing
    # Z (and be dropped below) instead of silently counting as positive.
    df["z_score"] = df["c"].where(effect >= null_effect, -df["c"]).where(effect.notna())

    # NOTE(review): ALT is written out as A1 and REF as A2; confirm that ALT is
    # the allele BETA/OR and A1_FREQ refer to in these input files.
    df = df[["#CHROM", "ID", "POS", "ALT", "REF", "z_score", "OBS_CT", "A1_FREQ"]].rename(
        columns={
            "#CHROM": "CHR",
            "ID": "SNP",
            "POS": "BP",
            "ALT": "A1",
            "REF": "A2",
            "z_score": "Z",
            "OBS_CT": "N",
            "A1_FREQ": "A1FREQ",
        }
    )
    df = df.dropna(subset=["SNP", "N", "Z", "A1", "A2"])
    # remove duplicates
    df = df.drop_duplicates(subset=["SNP", "CHR", "BP"])

    # Make sure the destination directory exists before writing.
    out_dir = os.path.dirname(output_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(output_prefix, sep='\t', index=False, header=True)
    print(f"Processed {trait_name} and saved to {output_prefix}")

if __name__ == "__main__":
    # GWAS file-name prefix -> short trait name used for the cleaned output file.
    trait_dict = dict([
        ('p48', 'waist'),
        ('p50', 'height'),
        ('p102', 'pulse'),
        ('p4079', 'dbp'),
        ('p4080', 'sbp'),
        ('p20116', 'smoke'),
        ('p20117', 'drink'),
        ('p21001', 'bmi'),
        ('p30000', 'wbc'),
        ('p30010', 'rbc'),
        ('p30020', 'hb'),
        ('p30080', 'plt'),
        ('p30120', 'lymph'),
        ('p30130', 'mono'),
        ('p30140', 'neut'),
        ('p30150', 'eos'),
        ('p30620', 'alt'),
        ('p30650', 'ast'),
        ('p30670', 'bun'),
        ('p30690', 'cholesterol'),
        ('p30700', 'creatinine'),
        ('p30730', 'ggt'),
        ('p30740', 'glucose'),
        ('p30760', 'hdl'),
        ('p30780', 'ldl'),
        ('p30870', 'triglycerides'),
        ('p30880', 'ua'),
    ])
    sumstats_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/White_British/gwas/"
    output_base_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/polypred/train/EUR/0_cleaned/"

    # Input files are named like p102_int.merged.glm.linear; one cleaned
    # <name>.txt file is written per trait.
    for trait in trait_dict:
        name = trait_dict[trait]
        sumstats_filepath = os.path.join(sumstats_dir, f"{trait}_int.merged.glm.linear")
        cleaned_filepath = os.path.join(output_base_path, f"{name}.txt")
        process_single_job(
            gwas_filepath=sumstats_filepath,
            output_prefix=cleaned_filepath,
            trait_name=name,
        )
        


In [None]:
# Step 1: munge the sumstats from the cleaned GWAS file (1_munge_sumstats.py)
import os
import subprocess
import pandas as pd
import scipy.stats as stats

def process_single_job(
    sumstats_filepath,
    output_prefix,
    polyfun_exec
):
    """Run the script at ``polyfun_exec`` on one summary-statistics file.

    The script is started as ``python <polyfun_exec> --sumstats <in> --out <out>``.
    A non-zero exit status is reported on stdout and not re-raised.

    Args:
        sumstats_filepath: Value passed to ``--sumstats``.
        output_prefix: Value passed to ``--out``.
        polyfun_exec: Path of the Python script to execute.
    """
    argv = ["python", polyfun_exec]
    argv.extend(["--sumstats", sumstats_filepath])
    argv.extend(["--out", output_prefix])

    try:
        subprocess.run(argv, check=True)
    except subprocess.CalledProcessError as err:
        # Best effort: print the failing command and its error, then return.
        print(f"Error running command: {' '.join(argv)}")
        print(err)

if __name__ == "__main__":
    trait_list = [
        'waist', 'height', 'pulse', 'dbp',
        'sbp', 'smoke', 'drink', 'bmi',
        'wbc', 'rbc', 'hb', 'plt',
        'lymph', 'mono', 'neut', 'eos',
        'alt', 'ast', 'bun', 'cholesterol',
        'creatinine', 'ggt', 'glucose', 'hdl',
        'ldl', 'triglycerides', 'ua',
    ]
    sumstats_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/polypred/train/EUR/0_cleaned/"
    output_base_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/polypred/train/EUR/1_parquet/"
    polypred_exec = "/data1/jiapl_group/lishuhua/software/PRS/Polyfun/polyfun-master/munge_polyfun_sumstats.py"

    # One run of the munge script per trait: <trait>.txt in 0_cleaned is the
    # input and <trait>.txt in 1_parquet is the value handed to --out.
    for trait in trait_list:
        sumstats_filepath = os.path.join(sumstats_dir, f"{trait}.txt")
        output_prefix = os.path.join(output_base_path, f"{trait}.txt")
        job_kwargs = {
            "sumstats_filepath": sumstats_filepath,
            "output_prefix": output_prefix,
            "polyfun_exec": polypred_exec,
        }
        process_single_job(**job_kwargs)

In [None]:
# Step 1: create the _snpvar.txt file from the munged .txt file (1_add_snpvar.py)
import os
import subprocess
import pandas as pd
import scipy.stats as stats

def process_single_job(
    sumstats_filepath,
    output_prefix,
    polyfun_exec
):
    """Run the script at ``polyfun_exec`` with ``--allow-missing`` on one file.

    The script is started as
    ``python <polyfun_exec> --sumstats <in> --allow-missing --out <out>``.
    A non-zero exit status is reported on stdout and not re-raised.

    Args:
        sumstats_filepath: Value passed to ``--sumstats``.
        output_prefix: Value passed to ``--out``.
        polyfun_exec: Path of the Python script to execute.
    """
    options = (
        "--sumstats", sumstats_filepath,
        "--allow-missing",
        "--out", output_prefix,
    )
    full_command = ["python", polyfun_exec, *options]

    try:
        subprocess.run(full_command, check=True)
    except subprocess.CalledProcessError as failure:
        # Report the failure and return; the exception is swallowed on purpose.
        print(f"Error running command: {' '.join(full_command)}")
        print(failure)

if __name__ == "__main__":
    trait_list = [
        'waist', 'height', 'pulse', 'dbp',
        'sbp', 'smoke', 'drink', 'bmi',
        'wbc', 'rbc', 'hb', 'plt',
        'lymph', 'mono', 'neut', 'eos',
        'alt', 'ast', 'bun', 'cholesterol',
        'creatinine', 'ggt', 'glucose', 'hdl',
        'ldl', 'triglycerides', 'ua',
    ]
    # Input and output live in the same directory; only the file name differs.
    sumstats_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/polypred/train/EUR/1_parquet/"
    output_base_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/polypred/train/EUR/1_parquet/"
    polypred_exec = "/data1/jiapl_group/lishuhua/software/PRS/Polyfun/polyfun-master/extract_snpvar.py"

    # <trait>.txt is the input; <trait>_snpvar.txt is the value handed to --out.
    for trait in trait_list:
        sumstats_filepath = os.path.join(sumstats_dir, f"{trait}.txt")
        output_prefix = os.path.join(output_base_path, f"{trait}_snpvar.txt")
        job_kwargs = {
            "sumstats_filepath": sumstats_filepath,
            "output_prefix": output_prefix,
            "polyfun_exec": polypred_exec,
        }
        process_single_job(**job_kwargs)

In [None]:
# Step 2: create the fine-mapping jobs from the _snpvar.txt file (2_create_jobs.py)
import os
import subprocess
import pandas as pd
import scipy.stats as stats

def process_single_job(
    sumstats_filepath,
    output_prefix,
    jobs_output_prefix,
    polyfun_exec,
    n=336922,
    max_num_causal=10,
    memory=3,
    threads=12,
):
    """Run the script at ``polyfun_exec`` to create fine-mapping jobs for one file.

    The script is started with ``--method susie`` plus the options described
    below. A non-zero exit status is reported on stdout and not re-raised.

    Args:
        sumstats_filepath: Value passed to ``--sumstats``.
        output_prefix: Value passed to ``--out``.
        jobs_output_prefix: Value passed to ``--jobs-file``.
        polyfun_exec: Path of the Python script to execute.
        n: Value passed to ``--n`` (presumably the GWAS sample size; confirm
            against the script's usage). Defaults to the previously
            hard-coded 336922.
        max_num_causal: Value passed to ``--max-num-causal`` (default 10).
        memory: Value passed to ``--memory`` (default 3).
        threads: Value passed to ``--threads`` (default 12).
    """
    command = [
        "python", polyfun_exec,
        "--sumstats", sumstats_filepath,
        "--method", "susie",
        # subprocess needs strings, so numeric settings are converted here.
        "--n", str(n),
        "--max-num-causal", str(max_num_causal),
        "--jobs-file", jobs_output_prefix,
        "--memory", str(memory),
        "--threads", str(threads),
        "--out", output_prefix,
    ]
    # Run the command
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        # Best effort: report the failing command and return.
        print(f"Error running command: {' '.join(command)}")
        print(e)

if __name__ == "__main__":
    trait_list = ['waist', 'height', 'pulse', 'dbp', 'sbp', 'smoke', 'drink', 'bmi',
                  'wbc', 'rbc', 'hb', 'plt', 'lymph', 'mono', 'neut', 'eos',
                  'alt', 'ast', 'bun', 'cholesterol', 'creatinine', 'ggt',
                  'glucose', 'hdl', 'ldl', 'triglycerides', 'ua']
    sumstats_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/polypred/train/EUR/1_parquet/"
    output_base_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/polypred/train/EUR/2_finemapping_jobs/"
    polypred_exec = "/data1/jiapl_group/lishuhua/software/PRS/Polyfun/polyfun-master/create_finemapper_jobs.py"
    # Each trait gets its own sub-directory holding the "job" output prefix
    # and the <trait>_jobs.txt jobs file.
    for trait in trait_list:
        sumstats_filepath = os.path.join(sumstats_dir, f"{trait}_snpvar.txt")
        output_prefix = os.path.join(output_base_path, f"{trait}/job")
        # exist_ok avoids the check-then-create race and makes re-runs safe.
        os.makedirs(os.path.dirname(output_prefix), exist_ok=True)
        jobs_output_prefix = os.path.join(output_base_path, f"{trait}/{trait}_jobs.txt")
        process_single_job(
            sumstats_filepath=sumstats_filepath,
            output_prefix=output_prefix,
            jobs_output_prefix=jobs_output_prefix,
            polyfun_exec=polypred_exec
        )

In [None]:
import os
import subprocess
import pandas as pd
import scipy.stats as stats

def process_single_job(
    sumstats_filepath,
    output_prefix,
    temp_prefix,
    polyfun_exec
):
    """Run the script at ``polyfun_exec`` to aggregate results for one file.

    The script is started as ``python <polyfun_exec> --sumstats <in>
    --adjust-beta-freq --out-prefix <temp_prefix> --out <out>``.
    A non-zero exit status is reported on stdout and not re-raised.

    Args:
        sumstats_filepath: Value passed to ``--sumstats``.
        output_prefix: Value passed to ``--out``.
        temp_prefix: Value passed to ``--out-prefix``.
        polyfun_exec: Path of the Python script to execute.
    """
    launcher = ["python", polyfun_exec]
    arguments = (
        ["--sumstats", sumstats_filepath]
        + ["--adjust-beta-freq"]
        + ["--out-prefix", temp_prefix]
        + ["--out", output_prefix]
    )
    invocation = launcher + arguments

    try:
        subprocess.run(invocation, check=True)
    except subprocess.CalledProcessError as problem:
        # Print the failing command line and the error; do not propagate.
        print(f"Error running command: {' '.join(invocation)}")
        print(problem)

if __name__ == "__main__":
    # Only three traits are aggregated here. The full list used by the
    # earlier cells is kept for reference:
    #   'waist', 'height', 'pulse', 'dbp', 'sbp', 'smoke', 'drink', 'bmi',
    #   'wbc', 'rbc', 'hb', 'plt', 'lymph', 'mono', 'neut', 'eos',
    #   'alt', 'ast', 'bun', 'cholesterol', 'creatinine', 'ggt',
    #   'glucose', 'hdl', 'ldl', 'triglycerides', 'ua'
    trait_list = ['alt', 'ast', 'bmi']

    sumstats_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/polypred/train/EUR/1_parquet/"
    temp_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/polypred/train/EUR/2_finemapping_jobs/"
    output_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/polypred/train/EUR/3_aggregate/"
    polypred_exec = "/data1/jiapl_group/lishuhua/software/PRS/Polyfun/polyfun-master/aggregate_finemapper_results.py"

    # The "<trait>/job" prefix under 2_finemapping_jobs is handed to
    # --out-prefix; the aggregated table goes to 3_aggregate/<trait>.txt.
    for trait in trait_list:
        sumstats_filepath = os.path.join(sumstats_dir, f"{trait}_snpvar.txt")
        temp_prefix = os.path.join(temp_dir, f"{trait}/job")
        output_prefix = os.path.join(output_dir, f"{trait}.txt")
        job_kwargs = {
            "sumstats_filepath": sumstats_filepath,
            "output_prefix": output_prefix,
            "temp_prefix": temp_prefix,
            "polyfun_exec": polypred_exec,
        }
        process_single_job(**job_kwargs)

In [None]:
# Check whether the polypred jobs ran successfully by counting the *.gz and *.gz.log files:
# find . -type f -name "*.gz" | wc -l
# find . -type f -name "*.gz.log" | wc -l

In [None]:
# Manual single-trait run of aggregate_finemapper_results.py for "alt" (a shell command, not Python).
# NOTE(review): --out-prefix here is .../2_finemapping_jobs/alt, whereas the Python loop cell
# passes .../2_finemapping_jobs/alt/job for the same trait. Confirm which prefix is intended.
python /data1/jiapl_group/lishuhua/software/PRS/Polyfun/polyfun-master/aggregate_finemapper_results.py --sumstats /data1/jiapl_group/lishuhua/project/PRS_benchmark/software/polypred/train/EUR/1_parquet/alt_snpvar.txt --adjust-beta-freq --out-prefix /data1/jiapl_group/lishuhua/project/PRS_benchmark/software/polypred/train/EUR/2_finemapping_jobs/alt --out /data1/jiapl_group/lishuhua/project/PRS_benchmark/software/polypred/train/EUR/3_aggregate/alt.txt