In [None]:
import os
import subprocess
import sys

import pandas as pd

# --- Input locations -------------------------------------------------------
# Directory of EUR phenotype files; the script reads "<trait_id>_int.txt" from it.
eur_pheno_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/trait/White_British/"
# Root of the EAS cross-validation data; the script reads
# "<trait>/group_<k>/pheno/train_pheno.txt" from it.
eas_pheno_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/"

# EAS summary statistics, one file per trait and group: "<trait>/group_<k>/gwas.txt".
eas_gwas_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/prscsx/train/EAS_CV/"
# EUR summary statistics, one file per trait: "<trait_id>_int.merged.txt".
eur_gwas_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/prscsx/train/EUR/"
# Root of the test genotype prefixes ("<trait>/group_<k>/test_geno"); the
# matching ".bim" file must exist and the prefix is passed as --bim_prefix.
test_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/prscsx/test/EAS/"
# --- Output and tool locations ---------------------------------------------
# Results are written under "<trait>/phi_<phi>/" inside this directory.
output_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/prscsx/res_2/beta/"
# Passed to PRScsx.py as --ref_dir.
reference_path = "/data1/jiapl_group/lishuhua/software/PRS/PRScsx/LD_reference/1kg/"
# Script launched with "python" for every trait/group/phi combination.
prs_script = "/data1/jiapl_group/lishuhua/software/PRS/PRScsx/PRScsx.py"
# Values passed one at a time as --phi; each gets its own output sub-directory.
phi_list = [1e-6, 1e-4, 1e-2, 1]

# Maps the phenotype identifier used in EUR file names ("<id>_int.txt",
# "<id>_int.merged.txt") to the short trait name used in EAS directory names
# and output paths. Insertion order determines the processing order.
# NOTE(review): identifiers look like UK Biobank field IDs -- confirm.
trait_dict = {
    "p48": "waist",
    "p50": "height",
    "p102": "pulse",
    "p4079": "dbp",
    "p4080": "sbp",
    "p20116": "smoke",
    "p20117": "drink",
    "p21001": "bmi",
    "p21002": "weight",
    "p30000": "wbc",
    "p30010": "rbc",
    "p30020": "hb",
    "p30080": "plt",
    "p30120": "lymph",
    "p30130": "mono",
    "p30140": "neut",
    "p30150": "eos",
    "p30620": "alt",
    "p30650": "ast",
    "p30670": "bun",
    "p30690": "cholesterol",
    "p30700": "creatinine",
    "p30730": "ggt",
    "p30740": "glucose",
    "p30760": "hdl",
    "p30780": "ldl",
    "p30870": "triglycerides",
    "p30880": "ua",
}

def get_sample_size(pheno_file):
    """Return how many samples in a phenotype file have a non-missing value.

    The file is parsed as tab-separated text with a header row. Its columns
    are relabelled to FID, IID and pheno, so it must hold exactly three
    columns (pandas raises ``ValueError`` on a length mismatch).
    """
    table = pd.read_csv(pheno_file, sep="\t", header=0)
    table.columns = ['FID', 'IID', 'pheno']
    # Series.count() skips missing entries, i.e. the number of rows a
    # notnull() filter would keep; int() yields a plain Python integer.
    return int(table['pheno'].count())

# Run PRS-CSx (EUR + EAS) for every trait, cross-validation group and phi.
#
# Inputs are validated at the outermost loop level they depend on, so each
# "not found" message is printed once and no output directory is created for
# a combination that is going to be skipped.
for trait_id, trait in trait_dict.items():
    eur_pheno_path = os.path.join(eur_pheno_dir, f"{trait_id}_int.txt")
    if not os.path.exists(eur_pheno_path):
        print(f"EUR phenotype file for {trait} not found, skipping.")
        continue
    # The EUR summary statistics depend only on the trait: check them here
    # rather than once per group/phi combination.
    eur_gwas_path = os.path.join(eur_gwas_dir, f"{trait_id}_int.merged.txt")
    if not os.path.exists(eur_gwas_path):
        print(f"EUR GWAS file for {trait} not found, skipping.")
        continue
    eur_sample = get_sample_size(eur_pheno_path)

    for group in range(1, 11):
        eas_pheno_path = os.path.join(
            eas_pheno_dir, f"{trait}/group_{group}/pheno/train_pheno.txt"
        )
        if not os.path.exists(eas_pheno_path):
            print(f"EAS phenotype file for {trait}, group {group} not found, skipping.")
            continue
        # These inputs are independent of phi, so validate them once per group.
        eas_gwas_path = os.path.join(eas_gwas_dir, f"{trait}/group_{group}/gwas.txt")
        if not os.path.exists(eas_gwas_path):
            print(f"EAS GWAS file for {trait}, group {group} not found, skipping.")
            continue
        test_bfile_path = os.path.join(test_base_dir, f"{trait}/group_{group}/test_geno")
        if not os.path.exists(test_bfile_path + ".bim"):
            print(f"Test genotype BIM file for {trait}, group {group} not found, skipping.")
            continue

        eas_sample = get_sample_size(eas_pheno_path)
        print(f"Trait: {trait}, Group: {group}, EUR sample size: {eur_sample}, EAS sample size: {eas_sample}")

        for phi in phi_list:
            output_dir = os.path.join(output_base_dir, f"{trait}/phi_{phi}/")
            os.makedirs(output_dir, exist_ok=True)
            # An argument list (no shell) keeps paths with spaces or shell
            # metacharacters intact and exposes the exit status.
            command = [
                "python",
                prs_script,
                f"--ref_dir={reference_path}",
                f"--bim_prefix={test_bfile_path}",
                f"--sst_file={eur_gwas_path},{eas_gwas_path}",
                f"--n_gwas={eur_sample},{eas_sample}",
                "--pop=EUR,EAS",
                f"--phi={phi}",
                f"--out_dir={output_dir}",
                f"--out_name=group_{group}",
            ]
            result = subprocess.run(command, check=False)
            if result.returncode != 0:
                # Report and carry on so one failed run does not abort the
                # remaining combinations.
                print(
                    f"PRS-CSx failed for {trait}, group {group}, phi {phi} "
                    f"(exit code {result.returncode})."
                )