# In [None]:
import pandas as pd
import sys
import os
import subprocess
from multiprocessing import Pool, cpu_count

# Common roots, factored out so the individual locations below stay short.
_PROJECT_ROOT = "/data1/jiapl_group/lishuhua/project/PRS_benchmark"
_SDPRX_ROOT = "/data1/jiapl_group/lishuhua/software/PRS/SDPRX"

# Directories holding the summary-statistics files (trailing slash kept).
sst_eur_dir = f"{_PROJECT_ROOT}/software/sdprx/train/EUR/"
sst_eas_dir = f"{_PROJECT_ROOT}/software/sdprx/train/EAS/"

# NOTE: despite the "_dir" suffix this is the path of a .bim file; it is the
# value handed to SDPRX through --valid.
test_eas_dir = f"{_PROJECT_ROOT}/real_data/UKB/geno/Chinese/1_merged/merged.bim"

# Directory receiving SDPRX results and per-job logs.
output_path = f"{_PROJECT_ROOT}/software/sdprx/res/data/"

# Location handed to SDPRX through --load, and the SDPRX entry script.
ld_matrix_path = f"{_SDPRX_ROOT}/reference/EUR_EAS/"
prs_script = f"{_SDPRX_ROOT}/SDPRX.py"

# Trait key -> short trait name.
# The key is used to build the EUR summary-statistics file name and the
# short name to build the EAS file name; both appear in the output prefix.
# NOTE(review): keys look like UK Biobank field IDs prefixed with "p" - confirm.
trait_dict = {
    "p48": "waist",
    "p50": "height",
    "p102": "pulse",
    "p4079": "dbp",
    "p4080": "sbp",
    "p20116": "smoke",
    "p20117": "drink",
    "p21001": "bmi",
    "p21002": "weight",
    "p30000": "wbc",
    "p30010": "rbc",
    "p30020": "hb",
    "p30080": "plt",
    "p30120": "lymph",
    "p30130": "mono",
    "p30140": "neut",
    "p30150": "eos",
    "p30620": "alt",
    "p30650": "ast",
    "p30670": "bun",
    "p30690": "cholesterol",
    "p30700": "creatinine",
    "p30730": "ggt",
    "p30740": "glucose",
    "p30760": "hdl",
    "p30780": "ldl",
    "p30870": "triglycerides",
    "p30880": "ua",
}

def run_single_job(sst_eur, sst_eas, chrom, output_prefix, trait, name):
    """Run SDPRX for one trait on one chromosome and capture its output in a log.

    Args:
        sst_eur: Path passed to SDPRX as ``--ss1``.
        sst_eas: Path passed to SDPRX as ``--ss2``.
        chrom: Chromosome number passed to SDPRX as ``--chr``.
        output_prefix: File-name prefix inside ``output_path``; SDPRX receives
            ``<output_path>/<output_prefix>`` as ``--out`` and the combined
            stdout/stderr is written to ``<output_path>/<output_prefix>.log``.
        trait: Trait key, used only in the progress message.
        name: Short trait name, used only in the progress message.

    Returns:
        The exit status of the SDPRX process.
    """
    print(f"Running SDPRX for {name} ({trait}) with output prefix {output_prefix}")

    out_prefix = os.path.join(output_path, output_prefix)
    cmd = [
        "python", prs_script,
        "--ss1", sst_eur,
        "--ss2", sst_eas,
        "--N1", "336922",
        "--N2", "2460",
        "--force_shared", "TRUE",
        "--load", ld_matrix_path,
        "--valid", test_eas_dir,
        "--chr", str(chrom),
        "--rho", "0.8",
        "--out", out_prefix,
    ]

    # Make sure the log/result directory exists before opening the log file.
    os.makedirs(output_path, exist_ok=True)

    # Redirect through a file handle: writing `os.system(cmd) > "file 2>&1"`
    # is a Python comparison between an int and a str (TypeError), not a
    # shell redirect, so no log would ever be produced that way.
    with open(f"{out_prefix}.log", "w") as log_file:
        result = subprocess.run(cmd, stdout=log_file, stderr=subprocess.STDOUT)

    if result.returncode != 0:
        print(
            f"Warning: SDPRX exited with status {result.returncode} "
            f"for {output_prefix}; see {out_prefix}.log"
        )
    return result.returncode

# Build one (sst_eur, sst_eas, chrom, output_prefix, trait, name) task per
# trait and autosome (chromosomes 1-22); the jobs themselves are run later.
tasks = []
for trait, name in trait_dict.items():
    # p20116 and p20117 use the "_raw" EAS files; every other trait uses "_int".
    suffix = "raw" if trait in ("p20116", "p20117") else "int"
    sst_eas = os.path.join(sst_eas_dir, f"{name}_{suffix}.{name}_{suffix}.txt")
    sst_eur = os.path.join(sst_eur_dir, f"{trait}_int.merged.txt")

    # Skip traits for which either summary-statistics file is missing.
    if not (os.path.exists(sst_eas) and os.path.exists(sst_eur)):
        print(f"Warning: SST files for {name} ({trait}) do not exist. Skipping this trait.")
        continue

    # `chrom` (not `chr`) so the builtin chr() is not shadowed at module scope.
    tasks.extend(
        (sst_eur, sst_eas, chrom, f"{name}_{trait}_chr{chrom}", trait, name)
        for chrom in range(1, 23)
    )

# Guard the pool start-up: with the "spawn"/"forkserver" start methods the
# worker processes re-import this module, and an unguarded Pool() at import
# time would make every worker try to start its own pool.
if __name__ == "__main__":
    # Leave one CPU free, but never request fewer than one worker
    # (Pool raises ValueError for processes < 1, e.g. on a single-core host).
    n_workers = max(1, cpu_count() - 1)

    # starmap blocks until every task has finished, so leaving the `with`
    # block only releases the (already idle) workers, even on error.
    with Pool(processes=n_workers) as pool:
        pool.starmap(run_single_job, tasks)