In [None]:
# Step 1: Change the input OR to BETA (in smoke and drink) for EAS population
import pandas as pd
import os
import numpy as np

eas_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/"
binary_trait = ['smoke', 'drink']


def odds_ratio_to_beta(or_values):
    """Convert a Series of odds ratios to log-odds effect sizes (BETA = ln(OR)).

    Parameters
    ----------
    or_values : pandas.Series
        Numeric odds ratios.

    Returns
    -------
    pandas.Series
        ln(OR) where OR > 0, 0.0 where OR <= 0 (the logarithm is undefined
        there), and NaN where OR is missing.
    """
    # Swap non-positive odds ratios for 1 so their log is exactly 0. NaN fails
    # the `<= 0` comparison, is left untouched, and therefore stays NaN.
    return np.log(or_values.mask(or_values <= 0, 1.0))


# For every binary trait and cross-validation group, read the logistic GWAS
# table, append a BETA column and write the result next to it with a
# ".glm.linear" suffix.
for trait in binary_trait:
    for group in range(1, 11):
        print(f"Processing group {group}, trait {trait}")
        gwas_dir = os.path.join(eas_base_dir, f"{trait}/group_{group}/gwas")
        trait_file = os.path.join(gwas_dir, "train.Pheno.glm.logistic")
        output_file = os.path.join(gwas_dir, "train.Pheno.glm.linear")
        if not os.path.exists(trait_file):
            print(f"File {trait_file} does not exist. Skipping.")
            continue
        df = pd.read_csv(trait_file, sep="\t")
        # Vectorised conversion: avoids calling a Python lambda once per row.
        df['BETA'] = odds_ratio_to_beta(df['OR'])
        df.to_csv(output_file, sep="\t", index=False, header=True)

In [None]:
# Extract the combined tune + test samples into a single PLINK dataset for EAS
import pandas as pd
import numpy as np
import os

# demo gwas: /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/alt/group_1/gwas/train.Pheno.glm.linear
trait_list = [
    'waist', 'height', 'pulse', 'dbp', 'sbp', 'smoke', 'drink', 'bmi',
    'wbc', 'rbc', 'hb', 'plt', 'lymph', 'mono', 'neut', 'eos',
    'alt', 'ast', 'bun', 'cholesterol', 'creatinine', 'glucose', 'ggt',
    'hdl', 'ldl', 'triglycerides', 'ua',
]
eas_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/"
output_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ct/"
eas_bfile_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/geno/CAS_final/CAS_merged_qc_final"


def build_keep_command(bfile_path, keep_id_path, output_prefix):
    """Return the plink2 command line that subsets a bfile to a sample list.

    Parameters
    ----------
    bfile_path : str
        Prefix of the input PLINK binary fileset.
    keep_id_path : str
        File passed to ``--keep``.
    output_prefix : str
        Prefix passed to ``--out``.

    Returns
    -------
    str
        The shell command, ready to be handed to ``os.system``.
    """
    return f"plink2 --bfile {bfile_path} --keep {keep_id_path} --make-bed --out {output_prefix}"


# For every trait and cross-validation group, write a PLINK fileset restricted
# to the IDs in combined_ids.txt. Only the combined list is extracted here;
# separate per-split (tune / test) extraction is not performed.
for trait in trait_list:
    for group in range(1, 11):
        print(f"Processing group {group}, trait {trait}")
        combine_id_path = os.path.join(eas_base_dir, f"{trait}/group_{group}/ids/combined_ids.txt")
        # Same guard as the other cells: skip groups whose input is missing
        # instead of creating an empty output directory and letting plink2 fail.
        if not os.path.exists(combine_id_path):
            print(f"File {combine_id_path} does not exist. Skipping.")
            continue

        test_output_dir = os.path.join(output_base_dir, f"test/{trait}/group_{group}/")
        os.makedirs(test_output_dir, exist_ok=True)
        combine_output_prefix = os.path.join(test_output_dir, "combine")

        combine_command = build_keep_command(eas_bfile_path, combine_id_path, combine_output_prefix)
        # os.system returns the command's exit status; report failures rather
        # than silently moving on, but keep processing the remaining groups.
        exit_status = os.system(combine_command)
        if exit_status != 0:
            print(f"plink2 exited with status {exit_status} for group {group}, trait {trait}")

In [None]:
# run C+T with different r2 and window size for EAS
import pandas as pd
import os

trait_list = [
    'waist', 'height', 'pulse', 'dbp', 'sbp', 'smoke', 'drink', 'bmi',
    'wbc', 'rbc', 'hb', 'plt', 'lymph', 'mono', 'neut', 'eos',
    'alt', 'ast', 'bun', 'cholesterol', 'creatinine', 'glucose', 'ggt',
    'hdl', 'ldl', 'triglycerides', 'ua',
]
r2_list = [0.1]
windows_list = [500]

eas_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/"
ct_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ct/"
output_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ct/res/clumped/"
# The same bfile is passed to --bfile for every trait and group, so it is
# defined once here instead of being reassigned on each loop iteration.
bfile_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/tlprs/reference/EAS_1kg/1000G.EAS.QC.hm3.ind"


def build_eas_clump_command(bfile_path, gwas_file, r2, window, output_prefix):
    """Return the plink2 clumping command line for one GWAS file.

    Parameters
    ----------
    bfile_path : str
        Prefix of the PLINK binary fileset passed to ``--bfile``.
    gwas_file : str
        Summary-statistics file passed to ``--clump``.
    r2 : float
        Value for ``--clump-r2``.
    window : int
        Value for ``--clump-kb``.
    output_prefix : str
        Prefix passed to ``--out``.

    Returns
    -------
    str
        The shell command, ready to be handed to ``os.system``.
    """
    return (
        f"plink2 --bfile {bfile_path} --clump {gwas_file} --clump-p1 1 "
        f"--clump-r2 {r2} --clump-kb {window} "
        f"--clump-snp-field ID --clump-field P --out {output_prefix}"
    )


# Clump every available trait/group GWAS once per (r2, window) combination.
for trait in trait_list:
    for group in range(1, 11):
        print(f"Processing group {group}, trait {trait}")
        gwas_file = os.path.join(eas_base_dir, f"{trait}/group_{group}/gwas/train.Pheno.glm.linear")
        if not os.path.exists(gwas_file):
            print(f"File {gwas_file} does not exist. Skipping.")
            continue
        for r2 in r2_list:
            for window in windows_list:
                output_dir = os.path.join(output_base_dir, f"{trait}/group_{group}/r2_{r2}_w_{window}/")
                os.makedirs(output_dir, exist_ok=True)
                output_prefix = os.path.join(output_dir, "res")
                clump_command = build_eas_clump_command(bfile_path, gwas_file, r2, window, output_prefix)
                # Report a non-zero exit status instead of ignoring it; the
                # remaining jobs still run.
                exit_status = os.system(clump_command)
                if exit_status != 0:
                    print(f"plink2 exited with status {exit_status} for {gwas_file}")

In [None]:
# Step 1: Change the input OR to BETA (in smoke and drink)
# /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/UKB_EUR/train/gwas/fold_1/gwas_chr14_binary.p20116_int.glm.logistic
import pandas as pd
import os
import numpy as np

eur_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/UKB_EUR/train/gwas/"
# Maps the trait identifier used in the GWAS file names to a readable trait
# name (the name is only used in progress messages).
binary_trait_dict = {
    'p20116': 'smoke',
    'p20117': 'drink'
}


# Defined locally so this cell runs on a fresh kernel without relying on
# definitions from any other cell.
def odds_ratio_to_beta(or_values):
    """Convert a Series of odds ratios to log-odds effect sizes (BETA = ln(OR)).

    Parameters
    ----------
    or_values : pandas.Series
        Numeric odds ratios.

    Returns
    -------
    pandas.Series
        ln(OR) where OR > 0, 0.0 where OR <= 0 (the logarithm is undefined
        there), and NaN where OR is missing.
    """
    # Swap non-positive odds ratios for 1 so their log is exactly 0. NaN fails
    # the `<= 0` comparison, is left untouched, and therefore stays NaN.
    return np.log(or_values.mask(or_values <= 0, 1.0))


# For every binary trait, fold and autosome (1-22), read the per-chromosome
# logistic GWAS table, append a BETA column and write the result next to it
# with a ".glm.linear" suffix.
for trait_id, trait in binary_trait_dict.items():
    for group in range(1, 11):
        print(f"Processing group {group}, trait {trait}")
        fold_dir = os.path.join(eur_base_dir, f"fold_{group}")
        for chrom in range(1, 23):
            print(f"Processing chromosome {chrom}")
            file_stem = f"gwas_chr{chrom}_binary.{trait_id}_int.glm"
            trait_file = os.path.join(fold_dir, f"{file_stem}.logistic")
            output_file = os.path.join(fold_dir, f"{file_stem}.linear")
            if not os.path.exists(trait_file):
                print(f"File {trait_file} does not exist. Skipping.")
                continue
            df = pd.read_csv(trait_file, sep="\t")
            # Vectorised conversion: avoids calling a Python lambda once per row.
            df['BETA'] = odds_ratio_to_beta(df['OR'])
            df.to_csv(output_file, sep="\t", index=False, header=True)

In [None]:
# run C+T with different r2 and window size
import pandas as pd
import os

# Maps the trait identifier used in the GWAS file names (presumably UK Biobank
# field IDs - TODO confirm) to the short trait name used in the output
# directory layout.
trait_dict = {
    'p48': 'waist',
    'p50': 'height',
    'p102': 'pulse',
    'p4079': 'dbp',
    'p4080': 'sbp',
    'p20116': 'smoke',
    'p20117': 'drink',
    'p21001': 'bmi',
    'p30000': 'wbc',
    'p30010': 'rbc',
    'p30020': 'hb',
    'p30080': 'plt',
    'p30120': 'lymph',
    'p30130': 'mono',
    'p30140': 'neut',
    'p30150': 'eos',
    'p30620': 'alt',
    'p30650': 'ast',
    'p30670': 'bun',
    'p30690': 'cholesterol',
    'p30700': 'creatinine',
    'p30730': 'ggt',
    'p30740': 'glucose',
    'p30760': 'hdl',
    'p30780': 'ldl',
    'p30870': 'triglycerides',
    'p30880': 'ua'
}
# Traits whose GWAS files carry the "_binary" infix in their file name.
binary_trait_ids = ('p20116', 'p20117')

r2_list = [0.1]
windows_list = [500]
# NOTE(review): only fold 10 is processed here, whereas the other cells loop
# over folds 1-10. Confirm whether this restriction is intentional before
# widening it to range(1, 11).
group_list = range(10, 11)

eur_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/UKB_EUR/train/gwas/"
eur_tune_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/UKB_EUR/tune/"
output_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ct/res/clumped/"


def build_eur_clump_command(tune_bfile, gwas_file, r2, window, output_prefix, threads=20):
    """Return the plink2 clumping command line for one per-chromosome GWAS file.

    Parameters
    ----------
    tune_bfile : str
        Prefix of the PLINK binary fileset passed to ``--bfile``.
    gwas_file : str
        Summary-statistics file passed to ``--clump``.
    r2 : float
        Value for ``--clump-r2``.
    window : int
        Value for ``--clump-kb``.
    output_prefix : str
        Prefix passed to ``--out``.
    threads : int, optional
        Value for ``--threads`` (default 20).

    Returns
    -------
    str
        The shell command, ready to be handed to ``os.system``.
    """
    return (
        f"plink2 --bfile {tune_bfile} --rm-dup exclude-mismatch --clump {gwas_file} "
        f"--clump-p1 1 --clump-r2 {r2} --clump-kb {window} "
        f"--clump-snp-field ID --clump-field P --threads {threads} --out {output_prefix}"
    )


# Clump each per-chromosome GWAS against the matching fold's tune genotypes,
# once per (r2, window) combination.
for trait_id, trait in trait_dict.items():
    # Binary traits use "gwas_chr{N}_binary.<id>..." file names; all other
    # traits use "gwas_chr{N}.<id>...".
    name_infix = "_binary" if trait_id in binary_trait_ids else ""
    for group in group_list:
        print(f"Processing group {group}, trait {trait}")
        for chrom in range(1, 23):
            gwas_file = os.path.join(
                eur_base_dir,
                f"fold_{group}/gwas_chr{chrom}{name_infix}.{trait_id}_int.glm.linear",
            )
            if not os.path.exists(gwas_file):
                print(f"File {gwas_file} does not exist. Skipping.")
                continue
            tune_bfile = os.path.join(eur_tune_dir, f"fold_{group}/chr{chrom}")
            for r2 in r2_list:
                for window in windows_list:
                    output_dir = os.path.join(output_base_dir, f"{trait}/group_{group}/r2_{r2}_w_{window}/")
                    os.makedirs(output_dir, exist_ok=True)
                    output_prefix = os.path.join(output_dir, f"ukb_chr{chrom}")
                    clump_command = build_eur_clump_command(tune_bfile, gwas_file, r2, window, output_prefix)
                    # Report a non-zero exit status instead of ignoring it;
                    # the remaining jobs still run.
                    exit_status = os.system(clump_command)
                    if exit_status != 0:
                        print(f"plink2 exited with status {exit_status} for {gwas_file}")