In [None]:
import pandas as pd
import os
import numpy as np

gwas_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/"
gwas_output_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/prscsx/train/EAS_CV/"

trait_list = ['waist', 'height', 'pulse', 'dbp', 'sbp', 'smoke', 'drink', 'bmi', 'wbc', 'rbc', 'hb', 'plt', 'lymph', 'mono', 'neut', 'eos', 'alt', 'ast', 'bun', 'cholesterol', 'creatinine', 'glucose', 'ggt', 'hdl', 'ldl', 'triglycerides', 'ua']


def format_gwas_for_prscsx(gwas_df):
    """Reduce a GLM association table to the five summary-statistic columns.

    Parameters
    ----------
    gwas_df : pandas.DataFrame
        Association results with at least the columns ``ID``, ``REF``, ``ALT``
        and either ``BETA``/``SE`` or ``OR``/``LOG(OR)_SE``. Optional ``A1``
        and ``TEST`` columns are honoured when present.

    Returns
    -------
    pandas.DataFrame
        Columns ``SNP, A1, A2, BETA, SE`` (or ``SNP, A1, A2, OR, SE`` when the
        input has no ``BETA`` column), one row per retained variant, with rows
        containing missing values removed.

    Raises
    ------
    KeyError
        If a required input column is absent.
    """
    # When covariate rows are reported alongside the variant effect, keep only
    # the additive-genotype rows; otherwise each SNP would appear several times
    # with covariate coefficients. If no "ADD" rows exist (a different model was
    # fitted), leave the table untouched rather than emptying it.
    if "TEST" in gwas_df.columns:
        additive_rows = gwas_df["TEST"] == "ADD"
        if additive_rows.any():
            gwas_df = gwas_df[additive_rows]

    # Linear output carries BETA/SE; logistic output carries OR/LOG(OR)_SE.
    if "BETA" in gwas_df.columns:
        effect_col, se_col = "BETA", "SE"
    else:
        effect_col, se_col = "OR", "LOG(OR)_SE"

    # The plink2 file-format reference describes A1 as the allele counted in the
    # regression; it is not guaranteed to equal ALT. Use it as the effect allele
    # when available and take the other of REF/ALT as A2. Without an A1 column,
    # fall back to the ALT-as-effect-allele assumption.
    if "A1" in gwas_df.columns:
        effect_allele = gwas_df["A1"]
        other_allele = gwas_df["REF"].where(effect_allele != gwas_df["REF"], gwas_df["ALT"])
    else:
        effect_allele = gwas_df["ALT"]
        other_allele = gwas_df["REF"]

    sumstats = pd.DataFrame({
        "SNP": gwas_df["ID"],
        "A1": effect_allele,
        "A2": other_allele,
        effect_col: gwas_df[effect_col],
        "SE": gwas_df[se_col],
    })
    # Rows with a missing effect/SE would be written with empty fields, which
    # shifts columns for whitespace-splitting readers; drop them instead.
    return sumstats.dropna().reset_index(drop=True)


for trait in trait_list:
    # 'smoke' and 'drink' are read from the logistic output file; every other
    # trait is read from the linear output file.
    glm_model = "logistic" if trait in ['smoke', 'drink'] else "linear"
    for group in range(1, 11):
        print(f"Processing trait: {trait}, group: {group}")
        gwas_path = os.path.join(gwas_base_dir, f"{trait}/group_{group}/gwas/train.Pheno.glm.{glm_model}")
        if not os.path.exists(gwas_path):
            print(f"GWAS file not found: {gwas_path}")
            continue
        gwas_df = format_gwas_for_prscsx(pd.read_csv(gwas_path, sep="\t"))
        output_dir = os.path.join(gwas_output_dir, f"{trait}/group_{group}")
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, "gwas.txt")
        gwas_df.to_csv(output_path, sep="\t", index=False, header=True)

In [None]:
import pandas as pd
import os

ids_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/"
pheno_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/pheno/trait/data/"
eas_bfile_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/geno/CAS_final/CAS_merged_qc_final"
output_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/prscsx/test/EAS/"

trait_list = ['waist', 'height', 'pulse', 'dbp', 'sbp', 'smoke', 'drink', 'bmi', 'wbc', 'rbc', 'hb', 'plt', 'lymph', 'mono', 'neut', 'eos', 'alt', 'ast', 'bun', 'cholesterol', 'creatinine', 'glucose', 'ggt', 'hdl', 'ldl', 'triglycerides', 'ua']


def merge_test_pheno(ids_df, pheno_df):
    """Attach phenotype values to a list of individuals.

    Parameters
    ----------
    ids_df : pandas.DataFrame
        Individuals to keep, with string columns ``FID`` and ``IID``.
    pheno_df : pandas.DataFrame
        Phenotype table with string columns ``FID``, ``IID`` and a ``pheno``
        column.

    Returns
    -------
    tuple of (pandas.DataFrame, int)
        The left join of ``ids_df`` with ``pheno_df`` on ``FID``/``IID`` (row
        order of ``ids_df`` is kept), and the number of rows whose ``pheno`` is
        missing after the join.
    """
    merge_df = pd.merge(ids_df, pheno_df, on=["FID", "IID"], how="left")
    # A left join never drops rows, so unmatched individuals can only be seen
    # through their missing phenotype value, not through the row count.
    n_missing = int(merge_df["pheno"].isna().sum())
    return merge_df, n_missing


for trait in trait_list:
    pheno_path = os.path.join(pheno_base_dir, f"{trait}_raw.txt")
    if not os.path.exists(pheno_path):
        print(f"Phenotype file not found: {pheno_path}")
        continue
    pheno_df = pd.read_csv(pheno_path, sep="\t")
    pheno_df.columns = ["FID", "IID", "pheno"]
    # IDs are compared as strings so both tables use the same key dtype.
    pheno_df["FID"] = pheno_df["FID"].astype(str)
    pheno_df["IID"] = pheno_df["IID"].astype(str)
    for group in range(1, 11):
        ids_path = os.path.join(ids_base_dir, f"{trait}/group_{group}/ids/combined_ids.txt")
        if not os.path.exists(ids_path):
            print(f"IDs file not found: {ids_path}")
            continue
        ids_df = pd.read_csv(ids_path, sep="\t", header=None, names=["FID", "IID"])
        ids_df["FID"] = ids_df["FID"].astype(str)
        ids_df["IID"] = ids_df["IID"].astype(str)
        output_dir = os.path.join(output_base_dir, f"{trait}/group_{group}")
        os.makedirs(output_dir, exist_ok=True)
        # Genotype extraction is skipped when a previous run already produced the .bed file.
        if not os.path.exists(f"{output_dir}/test_geno.bed"):
            exit_status = os.system(f"plink2 --bfile {eas_bfile_path} --keep {ids_path} --make-bed --out {output_dir}/test_geno")
            # os.system does not raise on failure; surface a non-zero status explicitly.
            if exit_status != 0:
                print(f"Warning: plink2 exited with status {exit_status} for trait {trait}, group {group}")
        merge_df, n_missing = merge_test_pheno(ids_df, pheno_df)
        # More rows than IDs means the phenotype table has duplicate FID/IID keys.
        if merge_df.shape[0] != ids_df.shape[0]:
            print(f"Warning: Mismatch in number of individuals for trait {trait}, group {group}")
        if n_missing > 0:
            print(f"Warning: {n_missing} individuals without phenotype for trait {trait}, group {group}")
        pheno_output_path = os.path.join(output_dir, "test_pheno.txt")
        merge_df.to_csv(pheno_output_path, sep="\t", index=False, header=True)