# In [None]:
import os
import subprocess
import sys

import numpy as np
import pandas as pd

# Filesystem layout. The private roots below only exist to avoid repeating the
# same long prefixes; the public names keep exactly the same values as before.
_user_root = "/data1/jiapl_group/lishuhua"
_benchmark_root = f"{_user_root}/project/PRS_benchmark"
_tlprs_train_root = f"{_benchmark_root}/software/tlprs/train/EAS"

# Cross-validation input tree; passed to the functions below as `data_path`.
eas_base_dir = f"{_benchmark_root}/real_data/Cross_Validation/CAS/"
# PLINK fileset prefix handed to `--bfile`.
eas_plink_path = f"{_benchmark_root}/real_data/CAS/geno/CAS_final/CAS_merged_qc_final"
# Tab-separated covariate table, read once by the driver.
eas_covar_path = f"{_benchmark_root}/real_data/CAS/pheno/covariates.txt"
# Destination root for the validation table (pheno_covar.txt).
eas_output_dir = f"{_tlprs_train_root}/valid/"
# Destination root for the tuning outputs (covar.txt, pheno.txt, tune.*).
eas_lassosum_dir = f"{_tlprs_train_root}/tune/"
# PLINK executable.
plink_exec = f"{_user_root}/software/general/plink"

# Traits to process in this run. Each entry maps an identifier (presumably the
# source phenotype field ID -- confirm) to a short trait name. The driver loop
# at the bottom of the file only uses the values, which name the per-trait
# directories under the input and output roots.
# Commented-out entries are traits that are currently disabled; uncomment a
# line to include that trait in the run.
trait_dict = {
    # 'p48': 'waist',
    # 'p50': 'height',
    # 'p102': 'pulse',
    # 'p4079': 'dbp',
    # 'p4080': 'sbp',
    'p20116': 'smoke',
    'p20117': 'drink',
    # 'p21001': 'bmi',
    # 'p30000': 'wbc',
    # 'p30010': 'rbc',
    # 'p30020':'hb',
    # 'p30080': 'plt',
    # 'p30120': 'lymph',
    # 'p30130': 'mono',
    # 'p30140': 'neut',
    # 'p30150': 'eos',
    # 'p30620': 'alt',
    # 'p30650': 'ast',
    # 'p30670': 'bun',
    # 'p30690': 'cholesterol',
    # 'p30700': 'creatinine',
    # 'p30730': 'ggt',
    # 'p30740': 'glucose',
    # 'p30760': 'hdl',
    # 'p30780': 'ldl',
    # 'p30870': 'triglycerides',
    # 'p30880': 'ua'
}

def get_valid_data(trait, data_path, group_num, covar_df):
    """Write the combined phenotype + covariate table for one validation fold.

    Reads ``<data_path>/<trait>/group_<group_num>/pheno/test_pheno.txt``
    (tab-separated, with ``FID``, ``IID`` and ``Pheno`` columns), left-joins
    the covariates on ``FID``/``IID``, drops rows without a phenotype, and
    writes the result as a space-separated ``pheno_covar.txt`` under
    ``<eas_output_dir>/<trait>/group_<group_num>/``.

    Args:
        trait: Trait name; used as the directory name on both input and
            output sides.
        data_path: Root of the cross-validation input tree.
        group_num: Fold number used in the ``group_<n>`` directory name.
        covar_df: Covariate table whose ``FID``/``IID`` columns are strings.

    Returns:
        None in every case. A missing phenotype file or an empty merge is
        reported on stdout and the fold is skipped.
    """
    print(f"Processing trait: {trait} of group {group_num}")
    pheno_path = os.path.join(data_path, f"{trait}/group_{group_num}/pheno/test_pheno.txt")
    if not os.path.exists(pheno_path):
        print(f"Pheno file does not exist: {pheno_path}")
        return None

    pheno_df = pd.read_csv(pheno_path, sep='\t', header=0)
    # IDs are compared as strings so they match the covariate table's keys.
    pheno_df["FID"] = pheno_df["FID"].astype(str)
    pheno_df["IID"] = pheno_df["IID"].astype(str)

    # Left join: every phenotyped sample is kept even if it has no covariates.
    merge_df = pd.merge(pheno_df, covar_df, on=["FID", "IID"], how="left")
    merge_df = merge_df.dropna(subset=["FID", "IID", "Pheno"])
    if merge_df.empty:
        print(f"No data after merging for trait {trait} in group {group_num}")
        return None

    output_dir = f'{eas_output_dir}/{trait}/group_{group_num}/'
    os.makedirs(output_dir, exist_ok=True)

    if trait in ("smoke", "drink"):
        # Recode the two binary traits from 1/2 to 0/1 (presumably
        # control/case coding -- confirm against the phenotype source).
        merge_df["Pheno"] = merge_df["Pheno"].astype(int).replace({1: 0, 2: 1})

    # Samples without covariates carry NaN after the left join. In a
    # space-delimited file an empty field is indistinguishable from a
    # separator, so missing values are written explicitly as "NA".
    merge_df.to_csv(
        f'{output_dir}/pheno_covar.txt',
        sep=' ',
        index=False,
        header=True,
        na_rep="NA",
    )
    # NOTE: no PLINK genotype subset is produced for the validation split;
    # only the phenotype/covariate table is written.
    print(f"Data for trait {trait} in group {group_num} processed successfully.")

def get_tune_data(trait, data_path, group_num, covar_df):
    """Prepare covariates, phenotypes and genotypes for one tuning fold.

    Reads ``tune_pheno.txt`` and ``tune_ids.txt`` from
    ``<data_path>/<trait>/group_<group_num>/``, attaches covariates to the ID
    list and, when the phenotype, covariate and ID tables have the same number
    of rows, writes ``covar.txt`` and ``pheno.txt`` (tab-separated) and a PLINK
    ``tune`` fileset under ``<eas_lassosum_dir>/<trait>/group_<group_num>/``.

    Args:
        trait: Trait name; used as the directory name on both input and
            output sides.
        data_path: Root of the cross-validation input tree.
        group_num: Fold number used in the ``group_<n>`` directory name.
        covar_df: Covariate table whose ``FID``/``IID`` columns are strings.

    Returns:
        None in every case. Missing inputs, row-count mismatches and PLINK
        failures are reported on stdout and the fold is skipped.
    """
    print(f"Processing tune data for trait: {trait} of group {group_num}")
    pheno_path = os.path.join(data_path, f"{trait}/group_{group_num}/pheno/tune_pheno.txt")
    ids_path = os.path.join(data_path, f"{trait}/group_{group_num}/ids/tune_ids.txt")
    if not os.path.exists(pheno_path):
        print(f"Pheno file does not exist: {pheno_path}")
        return None
    # The ID list is read below and handed to PLINK; without this check a
    # missing file would raise and abort the whole trait/fold loop.
    if not os.path.exists(ids_path):
        print(f"IDs file does not exist: {ids_path}")
        return None

    # The ID file has no header: two tab-separated columns, FID then IID.
    ids_df = pd.read_csv(ids_path, sep='\t', header=None, names=["FID", "IID"])
    ids_df["FID"] = ids_df["FID"].astype(str)
    ids_df["IID"] = ids_df["IID"].astype(str)
    covar = pd.merge(ids_df, covar_df, on=["FID", "IID"], how="left")

    pheno_df = pd.read_csv(pheno_path, sep='\t', header=0)
    output_dir = f'{eas_lassosum_dir}/{trait}/group_{group_num}/'
    if trait in ("smoke", "drink"):
        # Recode the two binary traits from 1/2 to 0/1 (presumably
        # control/case coding -- confirm against the phenotype source).
        pheno_df["Pheno"] = pheno_df["Pheno"].astype(int).replace({1: 0, 2: 1})
    os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): only row counts are compared here; the row order of the
    # phenotype file versus the ID list is not verified -- confirm that the
    # downstream consumer does not rely on positional alignment.
    if not (pheno_df.shape[0] == covar.shape[0] == ids_df.shape[0]):
        print(f"Data mismatch for trait {trait} in group {group_num}: "
              f"pheno {pheno_df.shape[0]}, covar {covar.shape[0]}, ids {ids_df.shape[0]}")
        return None

    print(f"Data for trait {trait} in group {group_num} is consistent.")
    covar.to_csv(f'{output_dir}/covar.txt', sep='\t', index=False, header=True)
    pheno_df.to_csv(f'{output_dir}/pheno.txt', sep='\t', index=False, header=True)

    # Argument list instead of a shell string: paths are passed verbatim and
    # the exit status is available, so a failed PLINK run is not reported as
    # a success.
    plink_cmd = [
        plink_exec,
        "--bfile", eas_plink_path,
        "--keep", ids_path,
        "--make-bed",
        "--out", f"{output_dir}/tune",
    ]
    try:
        exit_code = subprocess.run(plink_cmd, check=False).returncode
    except OSError as err:
        # Executable missing or not runnable; report and move on to the next fold.
        print(f"PLINK could not be started for trait {trait} in group {group_num}: {err}")
        return None
    if exit_code != 0:
        print(f"PLINK failed for trait {trait} in group {group_num} (exit code {exit_code})")
        return None
    print(f"Tune data for trait {trait} in group {group_num} processed successfully.")
if __name__ == "__main__":
    # Load the covariate table once and share it across every trait and fold.
    covar_df = pd.read_csv(eas_covar_path, sep='\t', header=0)
    # Sample IDs are matched as strings in both preparation functions.
    covar_df = covar_df.astype({"FID": str, "IID": str})

    # Only the trait names (dict values) are needed; they select the
    # per-trait directories. Folds are numbered 1 through 10.
    for trait_name in trait_dict.values():
        for group_num in range(1, 11):
            get_valid_data(trait_name, eas_base_dir, group_num, covar_df)
            get_tune_data(trait_name, eas_base_dir, group_num, covar_df)