In [None]:
# Run PRS-CSx automatically for every trait
# Step 1: define the input/output paths and map each EUR trait ID to its EAS trait name

import pandas as pd
import sys
import os
import subprocess
from multiprocessing import Pool, cpu_count

# Directory searched for the EAS summary-statistics files (one per trait).
sst_eas_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/prscsx/train/EAS/"
# Directory searched for the EUR summary-statistics files (one per trait).
sst_eur_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/prscsx/train/EUR/"
# Passed to PRScsx.py as --bim_prefix (a file prefix, not a directory).
test_eas_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/geno/Chinese/1_merged/merged"
# Passed to PRScsx.py as --ref_dir.
reference_path = "/data1/jiapl_group/lishuhua/software/PRS/PRScsx/LD_reference/1kg/"
# Passed to PRScsx.py as --out_dir; also used as the location for per-trait logs.
output_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/prscsx/res/data/"
# The PRS-CSx entry script that is launched with `python` for every trait.
prs_script = "/data1/jiapl_group/lishuhua/software/PRS/PRScsx/PRScsx.py"

# Maps a trait ID to a short trait name.
# The ID (key) is used to build the EUR summary-statistics file name and the
# short name (value) is used to build the EAS summary-statistics file name;
# both appear in the PRS-CSx output name.
# NOTE(review): the 'pNNN' keys look like UK Biobank field IDs -- confirm.
trait_dict = {
    'p48': 'waist',
    'p50': 'height',
    'p102': 'pulse',
    'p4079': 'dbp',
    'p4080': 'sbp',
    # 'p20116' and 'p20117' are the two traits whose EAS file uses the
    # "_raw" suffix instead of "_int" when the task list is built.
    'p20116': 'smoke',
    'p20117': 'drink',
    'p21001': 'bmi',
    'p21002': 'weight',
    'p30000': 'wbc',
    'p30010': 'rbc',
    'p30020':'hb',
    'p30080': 'plt',
    'p30120': 'lymph',
    'p30130': 'mono',
    'p30140': 'neut',
    'p30150': 'eos',
    'p30620': 'alt',
    'p30650': 'ast',
    'p30670': 'bun',
    'p30690': 'cholesterol',
    'p30700': 'creatinine',
    'p30730': 'ggt',
    'p30740': 'glucose',
    'p30760': 'hdl',
    'p30780': 'ldl',
    'p30870': 'triglycerides',
    'p30880': 'ua'
}

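# Step 2: for each trait in trait_dict, run PRS-CSx
# Example summary-statistics file names (NOTE: the code below looks for the
# same prefixes with a ".txt" suffix, e.g. "waist_int.waist_int.txt" --
# presumably converted from these files; confirm):
# waist_int.waist_int.glm.linear
# smoke_raw.smoke_raw.glm.logistic
# p102_int.merged.glm.linear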

def run_single_job(sst_eur, sst_eas, output_path, trait, name):
    """Run PRS-CSx for one trait and capture its console output in a log file.

    The command line is assembled from the module-level ``prs_script``,
    ``reference_path`` and ``test_eas_path`` settings and always uses
    ``--pop=EUR,EAS``. Standard output and standard error of the child
    process are written to ``<output_path>/<name>_<trait>.log``.

    Args:
        sst_eur: Path to the EUR summary-statistics file.
        sst_eas: Path to the EAS summary-statistics file.
        output_path: Directory passed as ``--out_dir``; the log file is
            written there as well. It is created if it does not exist.
        trait: Trait ID; second part of the output name.
        name: Short trait name; first part of the output name.

    Returns:
        The exit status of the PRS-CSx process (0 means success).
    """
    print(f"Processing trait: {name} ({trait})")
    out_name = f"{name}_{trait}"

    # An argument list (no shell) avoids quoting problems and shell injection.
    cmd = [
        "python",
        prs_script,
        f"--ref_dir={reference_path}",
        f"--bim_prefix={test_eas_path}",
        # Order of the files must match the order given in --pop.
        f"--sst_file={sst_eur},{sst_eas}",
        "--pop=EUR,EAS",
        # The same n_gwas values are hard-coded for every trait.
        "--n_gwas=336922,2460",
        f"--out_dir={output_path}",
        f"--out_name={out_name}",
    ]

    # Several workers may get here at once; exist_ok makes that harmless.
    os.makedirs(output_path, exist_ok=True)
    log_path = os.path.join(output_path, f"{out_name}.log")

    # The original `os.system(cmd) > "...log 2>&1"` compared an int with a
    # str instead of redirecting output; redirect explicitly here.
    with open(log_path, "w") as log_file:
        result = subprocess.run(
            cmd, stdout=log_file, stderr=subprocess.STDOUT, check=False
        )

    if result.returncode != 0:
        print(
            f"Warning: PRS-CSx exited with code {result.returncode} "
            f"for {name} ({trait}); see {log_path}"
        )
    return result.returncode


# Build one argument tuple per trait whose EUR and EAS summary-statistics
# files both exist; traits with a missing file are reported and skipped.
tasks = []
for trait, name in trait_dict.items():
    # Two traits use the "_raw" EAS file; every other trait uses "_int".
    eas_suffix = "raw" if trait in ('p20116', 'p20117') else "int"
    sst_eas = os.path.join(sst_eas_dir, f"{name}_{eas_suffix}.{name}_{eas_suffix}.txt")
    sst_eur = os.path.join(sst_eur_dir, f"{trait}_int.merged.txt")
    # check if files exist
    if os.path.exists(sst_eas) and os.path.exists(sst_eur):
        tasks.append((sst_eur, sst_eas, output_path, trait, name))
    else:
        print(f"Warning: SST files for {name} ({trait}) do not exist. Skipping this trait.")

# Run all traits in parallel, one PRS-CSx process per task.
# NOTE: with the "spawn"/"forkserver" start methods this section must live
# under an `if __name__ == "__main__":` guard when run as a script.
if tasks:
    # Never start more workers than there are tasks; the context manager
    # releases the pool even if a job raises.
    with Pool(processes=min(cpu_count(), len(tasks))) as pool:
        pool.starmap(run_single_job, tasks)

Processing trait: waist (p48)
Processing trait: height (p50)
Processing trait: pulse (p102)
Processing trait: dbp (p4079)
Processing trait: sbp (p4080)
Processing trait: smoke (p20116)
Processing trait: drink (p20117)
Processing trait: bmi (p21001)
Processing trait: weight (p21002)
Processing trait: wbc (p30000)
Processing trait: rbc (p30010)
Processing trait: hb (p30020)
Processing trait: plt (p30080)
Processing trait: lymph (p30120)
Processing trait: mono (p30130)
Processing trait: neut (p30140)
Processing trait: eos (p30150)
Processing trait: alt (p30620)
Processing trait: ast (p30650)
Processing trait: bun (p30670)
Processing trait: cholesterol (p30690)
Processing trait: creatinine (p30700)
Processing trait: ggt (p30730)
Processing trait: glucose (p30740)
Processing trait: hdl (p30760)
Processing trait: ldl (p30780)
Processing trait: triglycerides (p30870)
Processing trait: ua (p30880)
