In [None]:
import pandas as pd
import numpy as np
import os

# Directory holding the per-trait clump results (`<trait>.clumps`).
full_clump_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ct/res/full_model/EAS/"
# Directory holding the per-trait GWAS summary statistics.
full_eas_gwas_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/gwas/gwas/"
# The merged score tables are written into the same directory as the clump results.
output_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ct/res/full_model/EAS/"

# Full trait list, kept for reference:
# trait_list = ['waist', 'height', 'pulse', 'dbp', 'sbp', 'smoke', 'drink', 'bmi', 'wbc', 'rbc', 'hb', 'plt', 'lymph', 'mono', 'neut', 'eos', 'alt', 'ast', 'bun', 'cholesterol', 'creatinine', 'glucose', 'ggt', 'hdl', 'ldl', 'triglycerides', 'ua']
trait_list = ['alt']

# Traits whose summary statistics are read from the `.glm.logistic` file.
binary_traits = ['smoke', 'drink']
# Read the string-like join keys as text in both tables so the merge never
# fails (or silently misses rows) because pandas inferred different dtypes.
key_dtypes = {"#CHROM": str, "ID": str}
merge_keys = ["#CHROM", "POS", "ID"]

for trait in trait_list:
    print(f"Processing trait: {trait}")
    clump_file = os.path.join(full_clump_dir, f"{trait}.clumps")
    if trait in binary_traits:
        gwas_path = os.path.join(full_eas_gwas_dir, f"{trait}_raw.{trait}_raw.glm.logistic")
    else:
        gwas_path = os.path.join(full_eas_gwas_dir, f"{trait}_int.{trait}_int.glm.linear")

    # A missing clump file skips this trait instead of aborting the whole loop.
    if not os.path.exists(clump_file):
        print(f"File {clump_file} does not exist. Skipping.")
        continue

    # Only the variant identity is taken from the clump table.
    clumped = pd.read_csv(clump_file, sep="\t", dtype=key_dtypes)[merge_keys]

    gwas = pd.read_csv(gwas_path, sep="\t", dtype=key_dtypes)
    # If an OR column exists, convert the odds ratio into a log-odds BETA.
    if 'OR' in gwas.columns:
        gwas['BETA'] = np.log(gwas['OR'])
    # NOTE(review): the scoring cell passes ALT (column 5) to plink2 --score as
    # the effect allele. Confirm BETA in these files is reported for ALT and
    # not for a different A1 allele.
    gwas = gwas[["#CHROM", "POS", "ID", "REF", "ALT", "BETA", "P"]]

    merged = pd.merge(clumped, gwas, on=merge_keys, how="inner")
    if merged.shape[0] != clumped.shape[0]:
        print(f"Warning: Mismatch in SNP counts for trait {trait}. Clumped SNPs: {clumped.shape[0]}, Merged SNPs: {merged.shape[0]}")

    # A missing BETA would be written as an empty cell and an infinite one is
    # not a usable weight, so drop such rows and report how many were removed.
    beta_values = pd.to_numeric(merged["BETA"], errors="coerce")
    has_usable_beta = np.isfinite(beta_values)
    if not has_usable_beta.all():
        print(f"Warning: Dropping {int((~has_usable_beta).sum())} SNPs with missing or non-finite BETA for trait {trait}")
        merged = merged[has_usable_beta]

    output_path = os.path.join(output_dir, f"{trait}.tsv")
    merged.to_csv(output_path, sep="\t", index=False)

In [None]:
# calculate PRS based on the filtered clump files on EAS test data
# test data demo: /data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ct/test/alt/group_1/test
import pandas as pd
import os
import numpy as np

# Full trait list, kept for reference:
# trait_list = ['waist', 'height', 'pulse', 'dbp', 'sbp', 'smoke', 'drink', 'bmi', 'wbc', 'rbc', 'hb', 'plt', 'lymph', 'mono', 'neut', 'eos', 'alt', 'ast', 'bun', 'cholesterol', 'creatinine', 'glucose', 'ggt', 'hdl', 'ldl', 'triglycerides', 'ua']
trait_list = ['alt']
# Directory holding the per-trait score tables (`<trait>.tsv`).
clump_res_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ct/res/full_model/EAS/"
# PLINK binary fileset prefix of the test genotypes.
test_bfile_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/geno/Chinese/1_merged/merged"
output_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ct/res/test_out_sample/"

# Every trait scored in this cell writes into the same EAS sub-directory.
output_subdir = os.path.join(output_dir, "EAS/")

for trait in trait_list:
    print(f"Processing trait {trait}")
    clump_file = os.path.join(clump_res_dir, f"{trait}.tsv")
    if not os.path.exists(clump_file):
        print(f"File {clump_file} does not exist. Skipping.")
        continue
    os.makedirs(output_subdir, exist_ok=True)
    output_prefix = os.path.join(output_subdir, trait)
    # --score column indices: 3 = variant ID, 5 = allele, 6 = weight. With the
    # table layout #CHROM POS ID REF ALT BETA P these are ID, ALT and BETA.
    prs_command = f"plink2 --bfile {test_bfile_path} --score {clump_file} 3 5 6 header no-mean-imputation --out {output_prefix}"
    exit_status = os.system(prs_command)
    # Surface failures instead of silently moving on to the next trait.
    if exit_status != 0:
        print(f"Warning: plink2 exited with status {exit_status} for trait {trait}")

In [None]:
import pandas as pd
import numpy as np
import os

# Directory holding one sub-directory per trait with per-chromosome clump
# results (`<trait_name>/chr_<chrom>.clumps`).
full_clump_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ct/res/full_model/EUR/"
# Directory holding the per-trait, per-chromosome GWAS summary statistics.
full_eur_gwas_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/gwas/White_British/"
# The merged score tables are written into the same tree as the clump results.
output_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ct/res/full_model/EUR/"

# Maps the trait ID used in the GWAS file names to the trait name used for the
# clump directories and the output files.
trait_dict = {
    'p48': 'waist',
    'p50': 'height',
    'p102': 'pulse',
    'p4079': 'dbp',
    'p4080': 'sbp',
    'p20116': 'smoke',
    'p20117': 'drink',
    'p21001': 'bmi',
    'p30000': 'wbc',
    'p30010': 'rbc',
    'p30020': 'hb',
    'p30080': 'plt',
    'p30120': 'lymph',
    'p30130': 'mono',
    'p30140': 'neut',
    'p30150': 'eos',
    'p30620': 'alt',
    'p30650': 'ast',
    'p30670': 'bun',
    'p30690': 'cholesterol',
    'p30700': 'creatinine',
    'p30730': 'ggt',
    'p30740': 'glucose',
    'p30760': 'hdl',
    'p30780': 'ldl',
    'p30870': 'triglycerides',
    'p30880': 'ua',
}

# Traits whose summary statistics are read from the `.glm.logistic` file.
binary_traits = ['smoke', 'drink']
# Read the string-like join keys as text in both tables so the merge never
# fails (or silently misses rows) because pandas inferred different dtypes.
key_dtypes = {"#CHROM": str, "ID": str}
merge_keys = ["#CHROM", "POS", "ID"]

for trait_id, trait_name in trait_dict.items():
    # One merged table per chromosome that has a clump file.
    chrom_tables = []
    for chrom in range(1, 23):
        print(f"Processing trait: {trait_name} of chromosome chr{chrom}")
        clump_file = os.path.join(full_clump_dir, f"{trait_name}/chr_{chrom}.clumps")
        if not os.path.exists(clump_file):
            print(f"File {clump_file} does not exist. Skipping.")
            continue

        # NOTE(review): the logistic file name keeps the `_int` suffix here,
        # whereas the EAS cell uses `_raw` for these traits. Confirm the naming.
        if trait_name in binary_traits:
            gwas_path = os.path.join(full_eur_gwas_path, f"{trait_id}_int_chr{chrom}.{trait_id}_int.glm.logistic")
        else:
            gwas_path = os.path.join(full_eur_gwas_path, f"{trait_id}_int_chr{chrom}.{trait_id}_int.glm.linear")

        # Only the variant identity is taken from the clump table.
        clumped = pd.read_csv(clump_file, sep="\t", dtype=key_dtypes)[merge_keys]

        gwas = pd.read_csv(gwas_path, sep="\t", dtype=key_dtypes)
        # If an OR column exists, convert the odds ratio into a log-odds BETA.
        if 'OR' in gwas.columns:
            gwas['BETA'] = np.log(gwas['OR'])
        gwas = gwas[["#CHROM", "POS", "ID", "REF", "ALT", "BETA", "P"]]

        merged = pd.merge(clumped, gwas, on=merge_keys, how="inner")
        if merged.shape[0] != clumped.shape[0]:
            print(f"Warning: Mismatch in SNP counts for trait {trait_name} chr{chrom}. Clumped SNPs: {clumped.shape[0]}, Merged SNPs: {merged.shape[0]}")

        # A missing BETA would be written as an empty cell and an infinite one
        # is not a usable weight, so drop such rows and report the count.
        beta_values = pd.to_numeric(merged["BETA"], errors="coerce")
        has_usable_beta = np.isfinite(beta_values)
        if not has_usable_beta.all():
            print(f"Warning: Dropping {int((~has_usable_beta).sum())} SNPs with missing or non-finite BETA for trait {trait_name} chr{chrom}")
            merged = merged[has_usable_beta]

        chrom_tables.append(merged)

    if len(chrom_tables) > 0:
        # Stack the chromosomes into one table per trait.
        final_df = pd.concat(chrom_tables, ignore_index=True)
        final_output_path = os.path.join(output_dir, f"{trait_name}/{trait_name}.tsv")
        final_df.to_csv(final_output_path, sep="\t", index=False, header=True)
    else:
        # Make the absence of an output file visible in the cell output.
        print(f"Warning: No clump files found for trait {trait_name}. No output written.")

In [None]:
# calculate PRS based on the filtered clump files on EAS test data
# test data demo: /data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ct/test/alt/group_1/test
import pandas as pd
import os
import numpy as np

trait_list = ['waist', 'height', 'pulse', 'dbp', 'sbp', 'smoke', 'drink', 'bmi', 'wbc', 'rbc', 'hb', 'plt', 'lymph', 'mono', 'neut', 'eos', 'alt', 'ast', 'bun', 'cholesterol', 'creatinine', 'glucose', 'ggt', 'hdl', 'ldl', 'triglycerides', 'ua']
# Single-trait list for quick checks:
# trait_list = ['alt']
# Directory holding one sub-directory per trait with its score table
# (`<trait>/<trait>.tsv`).
clump_res_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ct/res/full_model/EUR/"
# PLINK binary fileset prefix of the test genotypes.
test_bfile_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/geno/Chinese/1_merged/merged"
output_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ct/res/test_out_sample/"

# Every trait scored in this cell writes into the same EUR sub-directory.
output_subdir = os.path.join(output_dir, "EUR/")

for trait in trait_list:
    print(f"Processing trait {trait}")
    clump_file = os.path.join(clump_res_dir, f"{trait}/{trait}.tsv")
    if not os.path.exists(clump_file):
        print(f"File {clump_file} does not exist. Skipping.")
        continue
    os.makedirs(output_subdir, exist_ok=True)
    output_prefix = os.path.join(output_subdir, trait)
    # --score column indices: 3 = variant ID, 5 = allele, 6 = weight.
    # NOTE(review): assumes the score table layout is
    # #CHROM POS ID REF ALT BETA P, so these are ID, ALT and BETA. Confirm.
    prs_command = f"plink2 --bfile {test_bfile_path} --score {clump_file} 3 5 6 header no-mean-imputation --out {output_prefix}"
    exit_status = os.system(prs_command)
    # Surface failures instead of silently moving on to the next trait.
    if exit_status != 0:
        print(f"Warning: plink2 exited with status {exit_status} for trait {trait}")