In [None]:
import pandas as pd
import os
import numpy as np

# Example GWAS input folder: /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/ast/group_1/gwas

# Root of the GWAS inputs. The driver loop in this cell reads
# <eas_base_dir>/<trait>/group_<n>/gwas/<glm result file>.
eas_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/"
# Root of the outputs. The driver loop in this cell writes
# <eas_output_dir>/<trait>/group_<n>/train_gwas.txt.
eas_output_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/tlprs/train/EAS/train/"

# Traits handled by this cell. Only the values (trait folder names) are used by
# the driver loop; the keys look like phenotype field identifiers (TODO confirm).
# Comment entries in or out to choose what a run covers. Every entry, active or
# not, ends with a comma so that toggling a line can never break the literal.
trait_dict = {
    # 'p48': 'waist',
    # 'p50': 'height',
    # 'p102': 'pulse',
    # 'p4079': 'dbp',
    # 'p4080': 'sbp',
    'p20116': 'smoke',
    'p20117': 'drink',
    # 'p21001': 'bmi',
    # 'p30000': 'wbc',
    # 'p30010': 'rbc',
    # 'p30020': 'hb',
    # 'p30080': 'plt',
    # 'p30120': 'lymph',
    # 'p30130': 'mono',
    # 'p30140': 'neut',
    # 'p30150': 'eos',
    # 'p30620': 'alt',
    # 'p30650': 'ast',
    # 'p30670': 'bun',
    # 'p30690': 'cholesterol',
    # 'p30700': 'creatinine',
    # 'p30730': 'ggt',
    # 'p30740': 'glucose',
    # 'p30760': 'hdl',
    # 'p30780': 'ldl',
    # 'p30870': 'triglycerides',
    # 'p30880': 'ua',
}

def get_train_data(trait, gwas_path, output_path):
    """Reformat one GWAS result table into the summary-statistics file at ``output_path``.

    The input is read as a tab-separated table that must contain the columns
    ``ID``, ``ALT``, ``OBS_CT``, ``P`` and either ``BETA`` or ``OR``. When
    ``BETA`` is absent the effect size is computed as ``log(OR)``. The output
    is space-separated with the header ``SNP A1 beta N p``.

    Rows are dropped when any output field is missing or when the effect size
    is not finite (``log(OR)`` is ``-inf`` for ``OR == 0`` and ``inf`` for an
    infinite ``OR``). P-values are clipped to ``[1e-300, 1]``, so an exact 0
    never reaches the output.

    Args:
        trait: Trait label; only used in the progress message.
        gwas_path: Path of the tab-separated GWAS table to read.
        output_path: Destination file. Its directory must already exist.

    Raises:
        ValueError: Raised by ``pandas.read_csv`` when a required column is
            missing from the input.
    """
    print(f"Processing trait: {trait} from {gwas_path} to {output_path}")

    # Read only the header to find out which effect-size column is available.
    header = pd.read_csv(gwas_path, sep='\t', nrows=0).columns
    effect_col = 'BETA' if 'BETA' in header else 'OR'
    df = pd.read_csv(gwas_path, sep='\t', usecols=['ID', 'ALT', effect_col, 'OBS_CT', 'P'])

    if effect_col == 'OR':
        # OR <= 0 yields -inf/NaN; those rows are removed below, so the NumPy
        # runtime warnings would only be noise.
        with np.errstate(divide='ignore', invalid='ignore'):
            df['BETA'] = np.log(df['OR'])

    # NOTE(review): ALT is exported as the effect allele (A1) - confirm that it
    # is the allele the GWAS effect size refers to.
    res_df = df[['ID', 'ALT', 'BETA', 'OBS_CT', 'P']].copy()
    # Infinite effects are not caught by dropna(), so turn them into NaN first.
    res_df['BETA'] = res_df['BETA'].replace([np.inf, -np.inf], np.nan)
    res_df['P'] = res_df['P'].clip(1e-300, 1)
    res_df.columns = ["SNP", "A1", "beta", "N", "p"]
    res_df = res_df.dropna(subset=['SNP', 'A1', 'beta', 'N', 'p'])
    res_df.to_csv(output_path, sep=' ', index=False, header=True)

if __name__ == "__main__":
    # For every enabled trait and each group 1..10, convert the GWAS result
    # file into train_gwas.txt. The output folder is created even when the
    # GWAS file turns out to be missing.
    for trait, trait_name in trait_dict.items():
        # smoke and drink use the logistic GWAS result file; all other traits
        # use the linear one.
        prefix = 'train.Pheno.glm.logistic' if trait_name in ['smoke', 'drink'] else 'train.Pheno.glm.linear'
        for group_num in range(1, 11):
            gwas_path = os.path.join(eas_base_dir, f'{trait_name}/group_{group_num}/gwas/{prefix}')
            output_path = os.path.join(eas_output_dir, f'{trait_name}/group_{group_num}/train_gwas.txt')

            out_dir = os.path.dirname(output_path)
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)

            if not os.path.exists(gwas_path):
                print(f"GWAS file not found for {trait_name} group {group_num}: {gwas_path}")
                continue
            get_train_data(trait_name, gwas_path, output_path)

: 

In [None]:
import pandas as pd
import os
import numpy as np

# Example input file: /data1/jiapl_group/lishuhua/project/PRS_benchmark/software/tlprs/beta/1_lassosum/ast/group_1.txt

# Root of the per-trait beta tables. The driver loop in this cell reads
# <eur_base_dir>/<trait>/group_<n>.txt.
eur_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/tlprs/beta/1_lassosum/"
# Root of the outputs. The driver loop in this cell writes
# <eur_output_dir>/<trait>/group_<n>/lasso_beta.txt.
eur_output_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/tlprs/train/EUR/lasso_beta/"

# Traits handled by this cell. Only the values (trait folder names) are used by
# the driver loop; the keys look like phenotype field identifiers (TODO confirm).
# Commented entries are skipped; uncomment a line to include that trait.
trait_dict = {
    # 'p48': 'waist',
    # 'p50': 'height',
    # 'p102': 'pulse',
    # 'p4079': 'dbp',
    # 'p4080': 'sbp',
    # 'p20116': 'smoke',
    # 'p20117': 'drink',
    # 'p21001': 'bmi',
    # 'p30000': 'wbc',
    # 'p30010': 'rbc',
    # 'p30020':'hb',
    # 'p30080': 'plt',
    # 'p30120': 'lymph',
    # 'p30130': 'mono',
    'p30140': 'neut',
    'p30150': 'eos',
    'p30620': 'alt',
    'p30650': 'ast',
    'p30670': 'bun',
    'p30690': 'cholesterol',
    'p30700': 'creatinine',
    'p30730': 'ggt',
    'p30740': 'glucose',
    'p30760': 'hdl',
    'p30780': 'ldl',
    'p30870': 'triglycerides',
    'p30880': 'ua'
}

def get_lasso_beta(trait, beta_path, output_path):
    """Extract the SNP, allele and ``v$best.beta`` columns of a beta table.

    ``beta_path`` is read as a tab-separated file with a header row. The
    columns ``snp``, ``A1`` and ``v$best.beta`` are kept, renamed to ``SNP``,
    ``A1`` and ``Beta``, rows with a missing value in any of them are removed,
    and the result is written space-separated to ``output_path``.

    Args:
        trait: Trait label; only used in the progress message.
        beta_path: Path of the tab-separated input table.
        output_path: Destination file. Its directory must already exist.
    """
    print(f"Processing trait: {trait} from {beta_path} to {output_path}")

    # Input column -> output column, in output order.
    column_map = {'snp': 'SNP', 'A1': 'A1', 'v$best.beta': 'Beta'}
    kept_names = list(column_map.values())

    full_table = pd.read_csv(beta_path, sep='\t', header=0)
    tidy_table = (
        full_table[list(column_map)]
        .rename(columns=column_map)
        .dropna(subset=kept_names)
    )
    tidy_table.to_csv(output_path, sep=' ', index=False, header=True)

if __name__ == "__main__":
    # For every enabled trait and each group 1..10, convert the beta table
    # into lasso_beta.txt. The output folder is created even when the input
    # table turns out to be missing.
    for trait, trait_name in trait_dict.items():
        for group_num in range(1, 11):
            print(f"Processing group {group_num} for trait {trait_name}")
            beta_path = os.path.join(eur_base_dir, f'{trait_name}/group_{group_num}.txt')
            output_path = os.path.join(eur_output_dir, f'{trait_name}/group_{group_num}/lasso_beta.txt')

            out_dir = os.path.dirname(output_path)
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)

            if not os.path.exists(beta_path):
                print(f"Lasso beta file not found for {trait_name} group {group_num}: {beta_path}")
                continue
            get_lasso_beta(trait_name, beta_path, output_path)

In [None]:
import pandas as pd
import os
import sys
import numpy as np

# Example output folder: /data1/jiapl_group/lishuhua/project/PRS_benchmark/software/tlprs/train/EAS/train/wbc/group_9/

# Root of the per-trait inputs. get_train_data in this cell looks for
# <eas_base_dir>/<trait>/group_<n>/ids/train_ids.txt.
eas_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/"
# Genotype fileset prefix handed to PLINK's --bfile option.
eas_plink_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/geno/CAS_final/CAS_merged_qc_final"
# Root of the outputs. PLINK is told to write <eas_output_dir>/<trait>/group_<n>/train.
eas_output_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/tlprs/train/EAS/train/"
# PLINK executable invoked by get_train_data in this cell.
plink_exec = "/data1/jiapl_group/lishuhua/software/general/plink"

# Traits handled by this cell. Only the values (trait folder names) are used by
# the driver loop; the keys look like phenotype field identifiers (TODO confirm).
trait_dict = {
    'p48': 'waist',
    'p50': 'height',
    'p102': 'pulse',
    'p4079': 'dbp',
    'p4080': 'sbp',
    'p20116': 'smoke',
    'p20117': 'drink',
    'p21001': 'bmi',
    'p30000': 'wbc',
    'p30010': 'rbc',
    'p30020':'hb',
    'p30080': 'plt',
    'p30120': 'lymph',
    'p30130': 'mono',
    'p30140': 'neut',
    'p30150': 'eos',
    'p30620': 'alt',
    'p30650': 'ast',
    'p30670': 'bun',
    'p30690': 'cholesterol',
    'p30700': 'creatinine',
    'p30730': 'ggt',
    'p30740': 'glucose',
    'p30760': 'hdl',
    'p30780': 'ldl',
    'p30870': 'triglycerides',
    'p30880': 'ua'
}

def get_train_data(trait, data_path, group_num):
    """Run PLINK to write the genotype subset for one trait/group's training IDs.

    Looks for ``<data_path>/<trait>/group_<group_num>/ids/train_ids.txt`` and,
    if it exists, runs ``plink --bfile <eas_plink_path> --keep <ids>
    --make-bed`` with the output prefix
    ``<eas_output_dir>/<trait>/group_<group_num>/train``. The output folder is
    created when missing. A missing IDs file, a PLINK executable that cannot
    be started and a non-zero PLINK exit status are all reported with a
    message instead of an exception, so a driver loop can carry on with the
    next trait/group.

    NOTE: an earlier cell of this notebook defines a different function with
    the same name; whichever cell ran last determines what the name refers to.

    Args:
        trait: Trait folder name.
        data_path: Root folder that holds the per-trait ``ids`` folders.
        group_num: Group number used in the ``group_<n>`` folder name.

    Returns:
        None in every case.
    """
    # Imported here because this cell's import block does not provide it.
    import subprocess

    print(f"Processing trait: {trait} of group {group_num}")
    ids_path = os.path.join(data_path, f"{trait}/group_{group_num}/ids/train_ids.txt")
    if not os.path.exists(ids_path):
        print(f"IDs file does not exist: {ids_path}")
        return None

    # Make sure the folder for the --out prefix exists before PLINK writes to it.
    out_dir = os.path.join(eas_output_dir, trait, f"group_{group_num}")
    os.makedirs(out_dir, exist_ok=True)

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    plink_cmd = [
        plink_exec,
        "--bfile", eas_plink_path,
        "--keep", ids_path,
        "--make-bed",
        "--out", os.path.join(out_dir, "train"),
    ]
    try:
        exit_status = subprocess.run(plink_cmd, check=False).returncode
    except OSError as exc:
        print(f"Could not start PLINK for trait {trait} in group {group_num}: {exc}")
        return None

    if exit_status != 0:
        print(f"PLINK failed with exit status {exit_status} for trait {trait} in group {group_num}.")
        return None
    print(f"Train data for trait {trait} in group {group_num} processed successfully.")

if __name__ == "__main__":
    # Build the training genotype subset for every trait in trait_dict and
    # each group 1..10; only the trait folder name (dict value) is passed on.
    group_ids = range(1, 11)
    for trait, trait_name in trait_dict.items():
        for group_num in group_ids:
            get_train_data(trait_name, eas_base_dir, group_num)