In [None]:
import os
import pandas as pd
import numpy as np
import subprocess
from multiprocessing import Pool, cpu_count

# run gwas for training data in each fold of EUR and EAS population
# For EAS

def process_gwas_single(
    pheno_filepath: str,
    bfile_filepath: str,
    covar_filepath: str,
    covar_list: str,
    output_prefix: str,
    trait: str,
    is_continuous: bool = True,
    threads_num: int = 48,
    plink_exec: str = "/data1/jiapl_group/lishuhua/software/general/plink2",
) -> None:
    """
    Run a PLINK 2 ``--glm`` association scan for a single phenotype file.

    Args:
        pheno_filepath (str): Path to the phenotype file; the column named
            "Pheno" is selected via ``--pheno-name``.
        bfile_filepath (str): Prefix of the PLINK binary fileset passed to
            ``--bfile``.
        covar_filepath (str): Path to the covariate file passed to ``--covar``.
        covar_list (str): Covariate names, passed as one ``--covar-name``
            argument (not a file path).
        output_prefix (str): Output directory; created if missing. Results are
            written with the prefix ``<output_prefix>/train``.
        trait (str): Trait name, used only in the progress message.
        is_continuous (bool): When False, the ``no-firth`` modifier is added to
            ``--glm``. Defaults to True.
        threads_num (int): Value passed to ``--threads``. Defaults to 48.
        plink_exec (str): Path to the plink2 executable.

    Raises:
        subprocess.CalledProcessError: If the PLINK process exits non-zero.
    """
    # exist_ok=True avoids a FileExistsError when several pool workers try to
    # create the same directory at the same time.
    os.makedirs(output_prefix, exist_ok=True)

    if is_continuous:
        print(f"Running continuous trait analysis for {trait}.")
        glm_modifiers = ["hide-covar", "cols=+a1freq"]
    else:
        print(f"Running binary trait analysis for {trait}.")
        glm_modifiers = ["hide-covar", "no-firth", "cols=+a1freq"]

    # The two trait types share every argument except the --glm modifiers.
    command = [
        plink_exec,
        "--bfile", bfile_filepath,
        "--pheno", pheno_filepath,
        "--pheno-name", "Pheno",
        "--covar", covar_filepath,
        "--covar-name", covar_list,
        "--glm", *glm_modifiers,
        "--threads", str(threads_num),
        "--no-input-missing-phenotype",
        "--covar-variance-standardize",
        "--out", f"{output_prefix}/train",
    ]
    subprocess.run(command, check=True)

if __name__ == "__main__":
    # Driver for the CAS cohort: queue one whole-genome GWAS per trait and
    # cross-validation fold (group_1 .. group_10), then run them in a pool.
    eas_pheno_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/"
    eas_bfile_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/geno/CAS_final/CAS_merged_qc_final"
    eas_covar_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/pheno/covariates.txt"
    eas_covar_list = "age,sex,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10"
    plink_exec = "/data1/jiapl_group/lishuhua/software/general/plink2"
    threads_per_job = 48
    trait_dict = {
        'p48': 'waist',
        'p50': 'height',
        'p102': 'pulse',
        'p4079': 'dbp',
        'p4080': 'sbp',
        'p20116': 'smoke',
        'p20117': 'drink',
        'p21001': 'bmi',
        'p21002': 'weight',
        'p30000': 'wbc',
        'p30010': 'rbc',
        'p30020': 'hb',
        'p30080': 'plt',
        'p30120': 'lymph',
        'p30130': 'mono',
        'p30140': 'neut',
        'p30150': 'eos',
        'p30620': 'alt',
        'p30650': 'ast',
        'p30670': 'bun',
        'p30690': 'cholesterol',
        'p30700': 'creatinine',
        'p30730': 'ggt',
        'p30740': 'glucose',
        'p30760': 'hdl',
        'p30780': 'ldl',
        'p30870': 'triglycerides',
        'p30880': 'ua',
    }
    # Traits analysed as binary; every other trait is treated as continuous.
    binary_traits = {"drink", "smoke"}

    tasks = []
    # Only the short trait names are needed; the field-ID keys are unused here.
    for name in trait_dict.values():
        is_continuous = name not in binary_traits
        for i in range(1, 11):
            pheno_filepath = f'{eas_pheno_base_dir}/{name}/group_{i}/pheno/train_pheno.txt'
            if not os.path.exists(pheno_filepath):
                print(f"Skipping {pheno_filepath} as it does not exist.")
                continue
            output_prefix = f"{eas_pheno_base_dir}/{name}/group_{i}/gwas"
            os.makedirs(output_prefix, exist_ok=True)
            tasks.append((
                pheno_filepath,
                eas_bfile_path,
                eas_covar_path,
                eas_covar_list,
                output_prefix,
                name,
                is_continuous,
                threads_per_job,
                plink_exec,
            ))

    # Use multiprocessing to run GWAS in parallel.
    # max(1, ...) keeps the pool valid on a single-core host, where
    # cpu_count() - 1 would be 0 and Pool would raise ValueError.
    # NOTE(review): each job asks PLINK for 48 threads while up to
    # cpu_count() - 1 jobs run at once; confirm this oversubscription is intended.
    # The context manager tears the workers down even if a job fails.
    with Pool(processes=max(1, cpu_count() - 1)) as pool:
        pool.starmap(process_gwas_single, tasks)
        pool.close()
        pool.join()

In [None]:
import os
import pandas as pd
import numpy as np
import subprocess
from multiprocessing import Pool, cpu_count

# Run GWAS for the training data in each fold of the EUR and EAS populations.
# For EUR (UKB White British), one job per chromosome.

def process_gwas_single(
    pheno_filepath: str,
    bfile_filepath: str,
    covar_filepath: str,
    covar_list: str,
    output_prefix: str,
    trait: str,
    chrom: int,
    is_continuous: bool = True,
    threads_num: int = 48,
    plink_exec: str = "/data1/jiapl_group/lishuhua/software/general/plink2",
) -> None:
    """
    Run a PLINK 2 ``--glm`` association scan for one phenotype file and chromosome.

    Args:
        pheno_filepath (str): Path to the phenotype file; the column named
            "Pheno" is selected via ``--pheno-name``.
        bfile_filepath (str): Prefix of the PLINK binary fileset passed to
            ``--bfile``.
        covar_filepath (str): Path to the covariate file passed to ``--covar``.
        covar_list (str): Covariate names, passed as one ``--covar-name``
            argument (not a file path).
        output_prefix (str): Output directory; created if missing. Results are
            written with the prefix ``<output_prefix>/train_chr<chrom>``.
        trait (str): Trait name, used only in the progress message.
        chrom (int): Chromosome label, used in the progress message and the
            output prefix. It is not passed to PLINK as a filter.
        is_continuous (bool): When False, the ``no-firth`` modifier is added to
            ``--glm``. Defaults to True.
        threads_num (int): Value passed to ``--threads``. Defaults to 48.
        plink_exec (str): Path to the plink2 executable.

    Raises:
        subprocess.CalledProcessError: If the PLINK process exits non-zero.
    """
    # exist_ok=True avoids a FileExistsError when several pool workers try to
    # create the same directory at the same time.
    os.makedirs(output_prefix, exist_ok=True)

    if is_continuous:
        print(f"Running continuous trait analysis for {trait} on chromosome chr{chrom}.")
        glm_modifiers = ["hide-covar", "cols=+a1freq"]
    else:
        print(f"Running binary trait analysis for {trait} on chromosome chr{chrom}.")
        glm_modifiers = ["hide-covar", "no-firth", "cols=+a1freq"]

    # The two trait types share every argument except the --glm modifiers.
    command = [
        plink_exec,
        "--bfile", bfile_filepath,
        "--pheno", pheno_filepath,
        "--pheno-name", "Pheno",
        "--covar", covar_filepath,
        "--covar-name", covar_list,
        "--glm", *glm_modifiers,
        "--threads", str(threads_num),
        "--no-input-missing-phenotype",
        "--covar-variance-standardize",
        "--out", f"{output_prefix}/train_chr{chrom}",
    ]
    subprocess.run(command, check=True)

if __name__ == "__main__":
    # Driver for the UKB EUR cohort: queue one GWAS per trait, cross-validation
    # fold (group_1 .. group_10) and autosome (chr1 .. chr22), then run them
    # in a pool.
    eur_pheno_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/UKB_EUR/"
    eur_bfile_base_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/geno/White_British/0_sample_qc/"
    eur_covar_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/covar/covars_white_british_final.tsv"
    eur_covar_list = "age,sex,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20"
    plink_exec = "/data1/jiapl_group/lishuhua/software/general/plink2"
    threads_per_job = 24

    trait_dict = {
        'p48': 'waist',
        'p50': 'height',
        'p102': 'pulse',
        'p4079': 'dbp',
        'p4080': 'sbp',
        'p20116': 'smoke',
        'p20117': 'drink',
        'p21001': 'bmi',
        'p21002': 'weight',
        'p30000': 'wbc',
        'p30010': 'rbc',
        'p30020': 'hb',
        'p30080': 'plt',
        'p30120': 'lymph',
        'p30130': 'mono',
        'p30140': 'neut',
        'p30150': 'eos',
        'p30620': 'alt',
        'p30650': 'ast',
        'p30670': 'bun',
        'p30690': 'cholesterol',
        'p30700': 'creatinine',
        'p30730': 'ggt',
        'p30740': 'glucose',
        'p30760': 'hdl',
        'p30780': 'ldl',
        'p30870': 'triglycerides',
        'p30880': 'ua',
    }
    # Traits analysed as binary; every other trait is treated as continuous.
    binary_traits = {"drink", "smoke"}

    tasks = []
    # Only the short trait names are needed; the field-ID keys are unused here.
    for name in trait_dict.values():
        is_continuous = name not in binary_traits
        for i in range(1, 11):
            pheno_filepath = f'{eur_pheno_base_dir}/{name}/group_{i}/pheno/train_pheno.txt'
            if not os.path.exists(pheno_filepath):
                print(f"Skipping {pheno_filepath} as it does not exist.")
                continue
            output_prefix = f"{eur_pheno_base_dir}/{name}/group_{i}/gwas"
            os.makedirs(output_prefix, exist_ok=True)
            for chrom in range(1, 23):
                eur_bfile_path = f"{eur_bfile_base_path}/chr{chrom}"
                tasks.append((
                    pheno_filepath,
                    eur_bfile_path,
                    eur_covar_path,
                    eur_covar_list,
                    output_prefix,
                    name,
                    chrom,
                    is_continuous,
                    threads_per_job,
                    plink_exec,
                ))

    # Use multiprocessing to run GWAS in parallel.
    # NOTE(review): 36 concurrent jobs x 24 PLINK threads each; confirm the
    # host has the cores and memory for this.
    # The context manager tears the workers down even if a job fails.
    with Pool(processes=36) as pool:
        pool.starmap(process_gwas_single, tasks)
        pool.close()
        pool.join()

In [None]:
import os
import pandas as pd
import numpy as np
import subprocess
from multiprocessing import Pool, cpu_count

# Run GWAS for the training data in each fold of the EUR and EAS populations.
# For EUR (UKB White British): resume run that skips chromosomes whose output already exists.

def process_gwas_single(
    pheno_filepath: str,
    bfile_filepath: str,
    covar_filepath: str,
    covar_list: str,
    output_prefix: str,
    trait: str,
    chrom: int,
    is_continuous: bool = True,
    threads_num: int = 2,
    plink_exec: str = "/data1/jiapl_group/lishuhua/software/general/plink2",
) -> None:
    """
    Run a PLINK 2 ``--glm`` association scan for one phenotype file and chromosome.

    Args:
        pheno_filepath (str): Path to the phenotype file; the column named
            "Pheno" is selected via ``--pheno-name``.
        bfile_filepath (str): Prefix of the PLINK binary fileset passed to
            ``--bfile``.
        covar_filepath (str): Path to the covariate file passed to ``--covar``.
        covar_list (str): Covariate names, passed as one ``--covar-name``
            argument (not a file path).
        output_prefix (str): Output directory; created if missing. Results are
            written with the prefix ``<output_prefix>/train_chr<chrom>``.
        trait (str): Trait name, used only in the progress message.
        chrom (int): Chromosome label, used in the progress message and the
            output prefix. It is not passed to PLINK as a filter.
        is_continuous (bool): When False, the ``no-firth`` modifier is added to
            ``--glm``. Defaults to True.
        threads_num (int): Value passed to ``--threads``. Defaults to 2.
        plink_exec (str): Path to the plink2 executable.

    Raises:
        subprocess.CalledProcessError: If the PLINK process exits non-zero.
    """
    # exist_ok=True avoids a FileExistsError when several pool workers try to
    # create the same directory at the same time.
    os.makedirs(output_prefix, exist_ok=True)

    if is_continuous:
        print(f"Running continuous trait analysis for {trait} on chromosome chr{chrom}.")
        glm_modifiers = ["hide-covar", "cols=+a1freq"]
    else:
        print(f"Running binary trait analysis for {trait} on chromosome chr{chrom}.")
        glm_modifiers = ["hide-covar", "no-firth", "cols=+a1freq"]

    # The two trait types share every argument except the --glm modifiers.
    command = [
        plink_exec,
        "--bfile", bfile_filepath,
        "--pheno", pheno_filepath,
        "--pheno-name", "Pheno",
        "--covar", covar_filepath,
        "--covar-name", covar_list,
        "--glm", *glm_modifiers,
        "--threads", str(threads_num),
        "--no-input-missing-phenotype",
        "--covar-variance-standardize",
        "--out", f"{output_prefix}/train_chr{chrom}",
    ]
    subprocess.run(command, check=True)

if __name__ == "__main__":
    # Resume driver for the UKB EUR cohort: queue one GWAS per trait,
    # cross-validation fold (group_1 .. group_10) and autosome (chr1 .. chr22),
    # skipping any combination whose result file already exists.
    eur_pheno_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/UKB_EUR/"
    eur_bfile_base_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/geno/White_British/0_sample_qc/"
    eur_covar_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/pheno/covar/covars_white_british_final.tsv"
    eur_covar_list = "age,sex,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20"
    plink_exec = "/data1/jiapl_group/lishuhua/software/general/plink2"
    threads_per_job = 4

    # NOTE(review): 'p21002' ('weight') is not listed here — confirm the
    # omission is intentional.
    trait_dict = {
        'p48': 'waist',
        'p50': 'height',
        'p102': 'pulse',
        'p4079': 'dbp',
        'p4080': 'sbp',
        'p20116': 'smoke',
        'p20117': 'drink',
        'p21001': 'bmi',
        'p30000': 'wbc',
        'p30010': 'rbc',
        'p30020': 'hb',
        'p30080': 'plt',
        'p30120': 'lymph',
        'p30130': 'mono',
        'p30140': 'neut',
        'p30150': 'eos',
        'p30620': 'alt',
        'p30650': 'ast',
        'p30670': 'bun',
        'p30690': 'cholesterol',
        'p30700': 'creatinine',
        'p30730': 'ggt',
        'p30740': 'glucose',
        'p30760': 'hdl',
        'p30780': 'ldl',
        'p30870': 'triglycerides',
        'p30880': 'ua',
    }
    # Traits analysed as binary; every other trait is treated as continuous.
    binary_traits = {"drink", "smoke"}

    tasks = []
    # Only the short trait names are needed; the field-ID keys are unused here.
    for name in trait_dict.values():
        is_continuous = name not in binary_traits
        # PLINK 2 names --glm output by regression type: ".glm.linear" for
        # quantitative phenotypes and ".glm.logistic" for case/control
        # phenotypes run with the "no-firth" modifier. Checking only
        # ".glm.linear" would never detect finished binary-trait runs.
        # Assumes the smoke/drink phenotype files are coded as case/control —
        # TODO confirm.
        glm_suffix = "glm.linear" if is_continuous else "glm.logistic"
        for i in range(1, 11):
            pheno_filepath = f'{eur_pheno_base_dir}/{name}/group_{i}/pheno/train_pheno.txt'
            if not os.path.exists(pheno_filepath):
                print(f"Skipping {pheno_filepath} as it does not exist.")
                continue
            output_prefix = f"{eur_pheno_base_dir}/{name}/group_{i}/gwas"
            os.makedirs(output_prefix, exist_ok=True)
            for chrom in range(1, 23):
                result_file = f"{output_prefix}/train_chr{chrom}.Pheno.{glm_suffix}"
                if os.path.exists(result_file):
                    print(f"Skipping {result_file} as it already exists.")
                    continue
                eur_bfile_path = f"{eur_bfile_base_path}/chr{chrom}"
                tasks.append((
                    pheno_filepath,
                    eur_bfile_path,
                    eur_covar_path,
                    eur_covar_list,
                    output_prefix,
                    name,
                    chrom,
                    is_continuous,
                    threads_per_job,
                    plink_exec,
                ))

    # Use multiprocessing to run GWAS in parallel.
    # The context manager tears the workers down even if a job fails.
    with Pool(processes=4) as pool:
        pool.starmap(process_gwas_single, tasks)
        pool.close()
        pool.join()