In [None]:
# !!! test demo !!!
# ldsc: /data1/jiapl_group/lishuhua/software/general/ldsc

In [None]:
# Calculate LD scores for the UK Biobank genotypes, one autosome at a time.
# (Abandoned: there was not enough memory for this run.)
ldsc_dir=/data1/jiapl_group/lishuhua/software/general/ldsc
geno_dir=/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/geno/White_British/0_sample_qc
for chrom in $(seq 1 22); do
    echo "--- Calculating LD score for chr${chrom} ---"
    python "${ldsc_dir}/ldsc.py" \
        --l2 \
        --bfile "${geno_dir}/chr${chrom}" \
        --ld-wind-cm 1 \
        --out "${ldsc_dir}/LD_SCORE/UKB_baselineLD/ldscore_chr${chrom}"
done

In [None]:
# Step1: convert .glm files to .sumstats files
import pandas as pd
import os
import subprocess
from multiprocessing import Pool, cpu_count
# Define paths
# LDSC munge_sumstats.py script; process_single launches it once per GWAS file.
munge_script = "/data1/jiapl_group/lishuhua/software/general/ldsc/munge_sumstats.py"
# File handed to munge_sumstats.py through --merge-alleles.
merge_allele_file_path = "/data1/jiapl_group/lishuhua/software/general/ldsc/LD_SCORE/snpinfo_mult_1kg_hm3"
# Input directory scanned for *.glm.linear / *.glm.logistic files (UKB White British).
gwas_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/White_British/gwas/"
# Directory that receives the intermediate "<name>_clean.txt" tables.
temp_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/White_British/temp/"
# Directory used as the location of the munge_sumstats.py --out prefixes.
munged_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/White_British/munged/"

def convert_gwas_to_sumstats(gwas_file, out_file):
    """Rewrite one GWAS result table into the columns handed to munge_sumstats.py.

    The input is a tab-separated file with a header row whose column names
    follow PLINK 2 ``--glm`` output (``ID``, ``REF``, ``ALT``, ``OBS_CT``,
    ``P`` plus either ``BETA``/``SE`` or ``OR``/``LOG(OR)_SE``).  The output
    is a tab-separated file with the columns
    ``SNP, A2, A1, BETA|OR, SE, P, N`` (in that order).

    Allele assignment: when the input has an ``A1`` column, that allele is
    written as ``A1`` (the allele the effect size refers to) and the other
    one of REF/ALT is written as ``A2``.  The previous code always wrote REF
    as ``A1``, which reverses the effect direction whenever the counted
    allele is ALT.  Without an ``A1`` column the old REF->A1 / ALT->A2
    mapping is kept.

    Args:
        gwas_file: path of the GWAS result file to read.
        out_file: path of the cleaned table to write.

    Raises:
        ValueError: if a required column is missing from ``gwas_file``.
    """
    df = pd.read_csv(gwas_file, sep="\t", header=0, low_memory=False)
    print(df.head())

    # Linear models report BETA/SE, logistic models report OR/LOG(OR)_SE.
    if "BETA" in df.columns:
        effect_col, se_col = "BETA", "SE"
    else:
        effect_col, se_col = "OR", "LOG(OR)_SE"
    required_columns = ["ID", "ALT", "REF", effect_col, se_col, "P", "OBS_CT"]
    if not all(col in df.columns for col in required_columns):
        raise ValueError("GWAS file %s does not contain all required columns: %s" % (gwas_file, required_columns))

    ref_allele = df["REF"].astype(str)
    alt_allele = df["ALT"].astype(str)
    if "A1" in df.columns:
        # A1 is the counted allele: keep it as the effect allele and pick the
        # opposite one of REF/ALT as the non-effect allele.
        effect_allele = df["A1"].astype(str)
        other_allele = alt_allele.where(effect_allele == ref_allele, ref_allele)
    else:
        # NOTE(review): no A1 column to consult -- historical mapping retained;
        # confirm which allele the effect size refers to for such files.
        effect_allele = ref_allele
        other_allele = alt_allele

    # Build a fresh frame instead of renaming a slice in place, which avoided
    # neither the copy nor pandas' SettingWithCopyWarning.
    sumstats = pd.DataFrame(
        {
            "SNP": df["ID"],
            "A2": other_allele,
            "A1": effect_allele,
            effect_col: df[effect_col].astype(float),
            "SE": df[se_col].astype(float),
            "P": df["P"].astype(float),
            "N": df["OBS_CT"].astype(int),
        },
        columns=["SNP", "A2", "A1", effect_col, "SE", "P", "N"]
    )
    sumstats.to_csv(out_file, sep="\t", index=False, header=True)

def process_single(args):
    """Clean one GWAS file and run munge_sumstats.py on the result.

    Designed as a ``Pool.map`` worker, hence the single tuple argument.

    Args:
        args: tuple ``(gwas_file, temp_file, out_file, munge_script,
            merge_allele_file_path)``.  ``out_file`` only serves to derive the
            ``--out`` prefix: a trailing ``_sumstats.txt`` is stripped from it.

    Returns:
        str: a one-line status message; it starts with ``FAILED`` when
        munge_sumstats.py exits with a non-zero status.

    Raises:
        ValueError: propagated from convert_gwas_to_sumstats when the input
            lacks a required column.
    """
    gwas_file, temp_file, out_file, munge_script, merge_allele_file_path = args
    # Convert GWAS file to sumstats format
    convert_gwas_to_sumstats(gwas_file, temp_file)

    # Strip the suffix only at the end of the path; str.replace would also
    # rewrite a directory component that happens to contain it.
    suffix = "_sumstats.txt"
    if out_file.endswith(suffix):
        out_prefix = out_file[:-len(suffix)]
    else:
        out_prefix = out_file

    # Run the munge script
    cmd = [
        "python", munge_script,
        "--sumstats", temp_file,
        "--merge-alleles", merge_allele_file_path,
        "--out", out_prefix
    ]
    returncode = subprocess.call(cmd)
    if returncode != 0:
        # Report the failure instead of claiming success unconditionally.
        return "FAILED (munge_sumstats.py exit code %d) for %s" % (returncode, gwas_file)
    return "Processed %s to %s" % (gwas_file, out_file)

def batch_convert_gwas_to_sumstats(gwas_dir, munged_dir, temp_dir, munge_script, merge_allele_file_path, num_threads=None):
    """Convert and munge every ``.glm.linear`` / ``.glm.logistic`` file in a directory.

    Each matching file becomes one task for process_single, executed in a
    multiprocessing pool.  The status string of every task is printed at the
    end.

    Args:
        gwas_dir: directory scanned (non-recursively) for GWAS result files.
        munged_dir: directory for the munged outputs; created if missing.
        temp_dir: directory for the intermediate ``<name>_clean.txt`` files;
            created if missing.
        munge_script: path of munge_sumstats.py.
        merge_allele_file_path: file forwarded as ``--merge-alleles``.
        num_threads: upper bound on worker processes; defaults to the CPU
            count.  Never more workers than tasks are started.

    Returns:
        None.
    """
    for directory in (munged_dir, temp_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    suffixes = (".glm.linear", ".glm.logistic")
    tasks = []
    # Sorted so that the task order does not depend on the file system.
    for fname in sorted(os.listdir(gwas_dir)):
        for suffix in suffixes:
            if not fname.endswith(suffix):
                continue
            name = fname[:-len(suffix)]
            gwas_file = os.path.join(gwas_dir, fname)
            temp_file = os.path.join(temp_dir, "%s_clean.txt" % name)
            out_file = os.path.join(munged_dir, "%s_sumstats.txt" % name)
            print("Converting GWAS file %s to sumstats format." % gwas_file)
            tasks.append((gwas_file, temp_file, out_file, munge_script, merge_allele_file_path))
            break

    if not tasks:
        print("No .glm.linear or .glm.logistic files found in %s" % gwas_dir)
        return

    n_workers = min(num_threads or cpu_count(), len(tasks))
    pool = Pool(processes=n_workers)
    results = None
    try:
        results = pool.map(process_single, tasks)
        pool.close()
    finally:
        # If map() raised, stop the remaining workers instead of leaking them.
        if results is None:
            pool.terminate()
        pool.join()

    for result in results:
        print(result)

# Convert the UKB White British GWAS results (gwas_path above) to munged
# sumstats, requesting 128 worker processes.
batch_convert_gwas_to_sumstats(gwas_path, munged_dir, temp_dir, munge_script, merge_allele_file_path, num_threads=128)

In [None]:
# Step1: convert .glm files to .sumstats files
import pandas as pd
import os
import subprocess
from multiprocessing import Pool, cpu_count
# Define paths
# LDSC munge_sumstats.py script; process_single launches it once per GWAS file.
munge_script = "/data1/jiapl_group/lishuhua/software/general/ldsc/munge_sumstats.py"
# File handed to munge_sumstats.py through --merge-alleles.
merge_allele_file_path = "/data1/jiapl_group/lishuhua/software/general/ldsc/LD_SCORE/snpinfo_mult_1kg_hm3"
# Input directory scanned for *.glm.linear / *.glm.logistic files (UKB Chinese).
gwas_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/Chinese/gwas/"
# Directory that receives the intermediate "<name>_clean.txt" tables.
temp_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/Chinese/temp/"
# Directory used as the location of the munge_sumstats.py --out prefixes.
munged_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/Chinese/munged/"

def convert_gwas_to_sumstats(gwas_file, out_file):
    """Rewrite one GWAS result table into the columns handed to munge_sumstats.py.

    The input is a tab-separated file with a header row whose column names
    follow PLINK 2 ``--glm`` output (``ID``, ``REF``, ``ALT``, ``OBS_CT``,
    ``P`` plus either ``BETA``/``SE`` or ``OR``/``LOG(OR)_SE``).  The output
    is a tab-separated file with the columns
    ``SNP, A2, A1, BETA|OR, SE, P, N`` (in that order).

    Allele assignment: when the input has an ``A1`` column, that allele is
    written as ``A1`` (the allele the effect size refers to) and the other
    one of REF/ALT is written as ``A2``.  The previous code always wrote REF
    as ``A1``, which reverses the effect direction whenever the counted
    allele is ALT.  Without an ``A1`` column the old REF->A1 / ALT->A2
    mapping is kept.

    Args:
        gwas_file: path of the GWAS result file to read.
        out_file: path of the cleaned table to write.

    Raises:
        ValueError: if a required column is missing from ``gwas_file``.
    """
    df = pd.read_csv(gwas_file, sep="\t", header=0, low_memory=False)
    print(df.head())

    # Linear models report BETA/SE, logistic models report OR/LOG(OR)_SE.
    if "BETA" in df.columns:
        effect_col, se_col = "BETA", "SE"
    else:
        effect_col, se_col = "OR", "LOG(OR)_SE"
    required_columns = ["ID", "ALT", "REF", effect_col, se_col, "P", "OBS_CT"]
    if not all(col in df.columns for col in required_columns):
        raise ValueError("GWAS file %s does not contain all required columns: %s" % (gwas_file, required_columns))

    ref_allele = df["REF"].astype(str)
    alt_allele = df["ALT"].astype(str)
    if "A1" in df.columns:
        # A1 is the counted allele: keep it as the effect allele and pick the
        # opposite one of REF/ALT as the non-effect allele.
        effect_allele = df["A1"].astype(str)
        other_allele = alt_allele.where(effect_allele == ref_allele, ref_allele)
    else:
        # NOTE(review): no A1 column to consult -- historical mapping retained;
        # confirm which allele the effect size refers to for such files.
        effect_allele = ref_allele
        other_allele = alt_allele

    # Build a fresh frame instead of renaming a slice in place, which avoided
    # neither the copy nor pandas' SettingWithCopyWarning.
    sumstats = pd.DataFrame(
        {
            "SNP": df["ID"],
            "A2": other_allele,
            "A1": effect_allele,
            effect_col: df[effect_col].astype(float),
            "SE": df[se_col].astype(float),
            "P": df["P"].astype(float),
            "N": df["OBS_CT"].astype(int),
        },
        columns=["SNP", "A2", "A1", effect_col, "SE", "P", "N"]
    )
    sumstats.to_csv(out_file, sep="\t", index=False, header=True)

def process_single(args):
    """Clean one GWAS file and run munge_sumstats.py on the result.

    Designed as a ``Pool.map`` worker, hence the single tuple argument.

    Args:
        args: tuple ``(gwas_file, temp_file, out_file, munge_script,
            merge_allele_file_path)``.  ``out_file`` only serves to derive the
            ``--out`` prefix: a trailing ``_sumstats.txt`` is stripped from it.

    Returns:
        str: a one-line status message; it starts with ``FAILED`` when
        munge_sumstats.py exits with a non-zero status.

    Raises:
        ValueError: propagated from convert_gwas_to_sumstats when the input
            lacks a required column.
    """
    gwas_file, temp_file, out_file, munge_script, merge_allele_file_path = args
    # Convert GWAS file to sumstats format
    convert_gwas_to_sumstats(gwas_file, temp_file)

    # Strip the suffix only at the end of the path; str.replace would also
    # rewrite a directory component that happens to contain it.
    suffix = "_sumstats.txt"
    if out_file.endswith(suffix):
        out_prefix = out_file[:-len(suffix)]
    else:
        out_prefix = out_file

    # Run the munge script
    cmd = [
        "python", munge_script,
        "--sumstats", temp_file,
        "--merge-alleles", merge_allele_file_path,
        "--out", out_prefix
    ]
    returncode = subprocess.call(cmd)
    if returncode != 0:
        # Report the failure instead of claiming success unconditionally.
        return "FAILED (munge_sumstats.py exit code %d) for %s" % (returncode, gwas_file)
    return "Processed %s to %s" % (gwas_file, out_file)

def batch_convert_gwas_to_sumstats(gwas_dir, munged_dir, temp_dir, munge_script, merge_allele_file_path, num_threads=None):
    """Convert and munge every ``.glm.linear`` / ``.glm.logistic`` file in a directory.

    Each matching file becomes one task for process_single, executed in a
    multiprocessing pool.  The status string of every task is printed at the
    end.

    Args:
        gwas_dir: directory scanned (non-recursively) for GWAS result files.
        munged_dir: directory for the munged outputs; created if missing.
        temp_dir: directory for the intermediate ``<name>_clean.txt`` files;
            created if missing.
        munge_script: path of munge_sumstats.py.
        merge_allele_file_path: file forwarded as ``--merge-alleles``.
        num_threads: upper bound on worker processes; defaults to the CPU
            count.  Never more workers than tasks are started.

    Returns:
        None.
    """
    for directory in (munged_dir, temp_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    suffixes = (".glm.linear", ".glm.logistic")
    tasks = []
    # Sorted so that the task order does not depend on the file system.
    for fname in sorted(os.listdir(gwas_dir)):
        for suffix in suffixes:
            if not fname.endswith(suffix):
                continue
            name = fname[:-len(suffix)]
            gwas_file = os.path.join(gwas_dir, fname)
            temp_file = os.path.join(temp_dir, "%s_clean.txt" % name)
            out_file = os.path.join(munged_dir, "%s_sumstats.txt" % name)
            print("Converting GWAS file %s to sumstats format." % gwas_file)
            tasks.append((gwas_file, temp_file, out_file, munge_script, merge_allele_file_path))
            break

    if not tasks:
        print("No .glm.linear or .glm.logistic files found in %s" % gwas_dir)
        return

    n_workers = min(num_threads or cpu_count(), len(tasks))
    pool = Pool(processes=n_workers)
    results = None
    try:
        results = pool.map(process_single, tasks)
        pool.close()
    finally:
        # If map() raised, stop the remaining workers instead of leaking them.
        if results is None:
            pool.terminate()
        pool.join()

    for result in results:
        print(result)

# Convert the UKB Chinese GWAS results (gwas_path above) to munged sumstats,
# requesting 128 worker processes.
batch_convert_gwas_to_sumstats(gwas_path, munged_dir, temp_dir, munge_script, merge_allele_file_path, num_threads=128)

In [None]:
# Estimate SNP heritability (ldsc.py --h2) for each trait in the UKB White British and CAS cohorts
import subprocess
import os
from multiprocessing import Pool, cpu_count

# Directories holding the munged *.sumstats.gz inputs.
eur_munged_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/White_British/munged/"
eas_munged_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/gwas/munged/"
# EUR prefixes forwarded to ldsc.py as --ref-ld-chr and --w-ld-chr respectively.
eur_ld_ref_path = "/data1/jiapl_group/lishuhua/software/general/ldsc/LD_SCORE/EUR_baselineLD/baselineLD."
eur_w_ld_path = "/data1/jiapl_group/lishuhua/software/general/ldsc/LD_SCORE/EUR_ldscores/LDscore."
# EAS prefixes forwarded to ldsc.py as --ref-ld-chr and --w-ld-chr respectively.
eas_ld_ref_path = "/data1/jiapl_group/lishuhua/software/general/ldsc/LD_SCORE/EAS_baselineLD/baselineLD."
eas_w_ld_path = "/data1/jiapl_group/lishuhua/software/general/ldsc/LD_SCORE/EAS_ldscores/weights.EAS.hm3_noMHC."
# Directories that receive the ldsc.py --out prefixes.
eur_output_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/White_British/h2/"
eas_output_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/gwas/h2/"

def run_ldsc(sumstats_file, ld_ref, weights_ref, out_prefix):
    """Run ``ldsc.py --h2`` on one munged summary-statistics file.

    Args:
        sumstats_file: path passed to ``--h2``.
        ld_ref: prefix passed to ``--ref-ld-chr``.
        weights_ref: prefix passed to ``--w-ld-chr``.
        out_prefix: prefix passed to ``--out``.

    Returns:
        int: exit status of the ldsc.py process (0 means success).  The
        status used to be discarded, which hid failed runs.
    """
    cmd = [
        "python", "/data1/jiapl_group/lishuhua/software/general/ldsc/ldsc.py",
        "--h2", sumstats_file,
        "--ref-ld-chr", ld_ref,
        "--w-ld-chr", weights_ref,
        "--out", out_prefix,
    ]
    returncode = subprocess.call(cmd)
    if returncode != 0:
        print("ldsc.py --h2 exited with code %d for %s" % (returncode, sumstats_file))
    return returncode

# test command: python /p300s/jiapl_group/lishuhua/software/ldsc/ldsc.py --h2 /p300s/jiapl_group/lishuhua/PRS_benchmark/reference/shared_snp/simulated_gwas/munge_sumstats/EUR/sim_0.001_0.05_0.8_6.sumstats.gz --ref-ld-chr /p300s/jiapl_group/lishuhua/PRS_benchmark/software/ldsc/ldscore/eur_w_ld_hm3/LDscore/LDscore. --w-ld-chr /p300s/jiapl_group/lishuhua/PRS_benchmark/software/ldsc/ldscore/eur_w_ld_hm3/LDscore/LDscore. --out /p300s/jiapl_group/lishuhua/PRS_benchmark/reference/shared_snp/simulated_gwas/heritability/EUR/sim_0.001_0.05_0.8_6

# run ldsc to estimate heritability for EUR and EAS populations
def batch_run_ldsc(munged_dir, ld_ref, weights_ref, out_dir, num_threads=None):
    """Run run_ldsc on every ``*.sumstats.gz`` file of a directory in parallel.

    Args:
        munged_dir: directory scanned (non-recursively) for ``.sumstats.gz``
            files.
        ld_ref: prefix forwarded to run_ldsc as ``ld_ref``.
        weights_ref: prefix forwarded to run_ldsc as ``weights_ref``.
        out_dir: output directory; created if missing.  Each input
            ``<name>.sumstats.gz`` gets the output prefix ``<out_dir>/<name>``.
        num_threads: upper bound on worker processes; defaults to the CPU
            count.  Never more workers than tasks are started.

    Returns:
        None.

    Raises:
        OSError: if ``out_dir`` cannot be created for a reason other than
            already existing.
    """
    # ``os.errno`` was an undocumented alias that no longer exists in
    # Python 3.7+; use the errno module itself.
    import errno

    # Create output directory if it doesn't exist
    try:
        os.makedirs(out_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:  # If it already exists, ignore
            raise

    suffix = ".sumstats.gz"
    tasks = []
    for fname in sorted(os.listdir(munged_dir)):
        if fname.endswith(suffix):
            sumstats_file = os.path.join(munged_dir, fname)
            out_prefix = os.path.join(out_dir, fname[:-len(suffix)])
            tasks.append((sumstats_file, ld_ref, weights_ref, out_prefix))

    if not tasks:
        print("No %s files found in %s" % (suffix, munged_dir))
        return

    pool = Pool(processes=min(num_threads or cpu_count(), len(tasks)))
    # apply_async unpacks each task tuple into run_ldsc's positional arguments.
    pending = [pool.apply_async(run_ldsc, task) for task in tasks]
    pool.close()
    pool.join()

    # An exception raised inside a worker is only surfaced by get(); report it
    # rather than letting it disappear, but keep going for the other traits.
    for task, handle in zip(tasks, pending):
        try:
            handle.get()
        except Exception as exc:
            print("LDSC failed for %s: %s" % (task[0], exc))

# Run LDSC --h2 for the EUR (UKB White British) and EAS (CAS) munged files,
# each against its own LD-score reference, requesting 48 worker processes.
batch_run_ldsc(eur_munged_dir, eur_ld_ref_path, eur_w_ld_path, eur_output_path, num_threads=48)
batch_run_ldsc(eas_munged_dir, eas_ld_ref_path, eas_w_ld_path, eas_output_path, num_threads=48)

In [None]:
# Estimate SNP heritability (ldsc.py --h2) for each trait in the UKB Chinese (EAS) cohort
import subprocess
import os
from multiprocessing import Pool, cpu_count

# Directory holding the munged *.sumstats.gz inputs (UKB Chinese).
eas_munged_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/Chinese/munged/"
# EAS prefixes forwarded to ldsc.py as --ref-ld-chr and --w-ld-chr respectively.
eas_ld_ref_path = "/data1/jiapl_group/lishuhua/software/general/ldsc/LD_SCORE/EAS_baselineLD/baselineLD."
eas_w_ld_path = "/data1/jiapl_group/lishuhua/software/general/ldsc/LD_SCORE/EAS_ldscores/weights.EAS.hm3_noMHC."
# Directory that receives the ldsc.py --out prefixes.
eas_output_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/Chinese/h2/"

def run_ldsc(sumstats_file, ld_ref, weights_ref, out_prefix):
    """Run ``ldsc.py --h2`` on one munged summary-statistics file.

    Args:
        sumstats_file: path passed to ``--h2``.
        ld_ref: prefix passed to ``--ref-ld-chr``.
        weights_ref: prefix passed to ``--w-ld-chr``.
        out_prefix: prefix passed to ``--out``.

    Returns:
        int: exit status of the ldsc.py process (0 means success).  The
        status used to be discarded, which hid failed runs.
    """
    cmd = [
        "python", "/data1/jiapl_group/lishuhua/software/general/ldsc/ldsc.py",
        "--h2", sumstats_file,
        "--ref-ld-chr", ld_ref,
        "--w-ld-chr", weights_ref,
        "--out", out_prefix,
    ]
    returncode = subprocess.call(cmd)
    if returncode != 0:
        print("ldsc.py --h2 exited with code %d for %s" % (returncode, sumstats_file))
    return returncode

# test command: python /p300s/jiapl_group/lishuhua/software/ldsc/ldsc.py --h2 /p300s/jiapl_group/lishuhua/PRS_benchmark/reference/shared_snp/simulated_gwas/munge_sumstats/EUR/sim_0.001_0.05_0.8_6.sumstats.gz --ref-ld-chr /p300s/jiapl_group/lishuhua/PRS_benchmark/software/ldsc/ldscore/eur_w_ld_hm3/LDscore/LDscore. --w-ld-chr /p300s/jiapl_group/lishuhua/PRS_benchmark/software/ldsc/ldscore/eur_w_ld_hm3/LDscore/LDscore. --out /p300s/jiapl_group/lishuhua/PRS_benchmark/reference/shared_snp/simulated_gwas/heritability/EUR/sim_0.001_0.05_0.8_6

# run ldsc to estimate heritability for EUR and EAS populations
def batch_run_ldsc(munged_dir, ld_ref, weights_ref, out_dir, num_threads=None):
    """Run run_ldsc on every ``*.sumstats.gz`` file of a directory in parallel.

    Args:
        munged_dir: directory scanned (non-recursively) for ``.sumstats.gz``
            files.
        ld_ref: prefix forwarded to run_ldsc as ``ld_ref``.
        weights_ref: prefix forwarded to run_ldsc as ``weights_ref``.
        out_dir: output directory; created if missing.  Each input
            ``<name>.sumstats.gz`` gets the output prefix ``<out_dir>/<name>``.
        num_threads: upper bound on worker processes; defaults to the CPU
            count.  Never more workers than tasks are started.

    Returns:
        None.

    Raises:
        OSError: if ``out_dir`` cannot be created for a reason other than
            already existing.
    """
    # ``os.errno`` was an undocumented alias that no longer exists in
    # Python 3.7+; use the errno module itself.
    import errno

    # Create output directory if it doesn't exist
    try:
        os.makedirs(out_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:  # If it already exists, ignore
            raise

    suffix = ".sumstats.gz"
    tasks = []
    for fname in sorted(os.listdir(munged_dir)):
        if fname.endswith(suffix):
            sumstats_file = os.path.join(munged_dir, fname)
            out_prefix = os.path.join(out_dir, fname[:-len(suffix)])
            tasks.append((sumstats_file, ld_ref, weights_ref, out_prefix))

    if not tasks:
        print("No %s files found in %s" % (suffix, munged_dir))
        return

    pool = Pool(processes=min(num_threads or cpu_count(), len(tasks)))
    # apply_async unpacks each task tuple into run_ldsc's positional arguments.
    pending = [pool.apply_async(run_ldsc, task) for task in tasks]
    pool.close()
    pool.join()

    # An exception raised inside a worker is only surfaced by get(); report it
    # rather than letting it disappear, but keep going for the other traits.
    for task, handle in zip(tasks, pending):
        try:
            handle.get()
        except Exception as exc:
            print("LDSC failed for %s: %s" % (task[0], exc))

# Run LDSC --h2 for the EAS (UKB Chinese) munged files only, requesting 48
# worker processes.
batch_run_ldsc(eas_munged_dir, eas_ld_ref_path, eas_w_ld_path, eas_output_path, num_threads=48)

In [None]:
# Calculate pairwise genetic correlations (ldsc.py --rg) between traits in the UKB Chinese cohort
import subprocess
import os
from multiprocessing import Pool, cpu_count

# Inputs for the UKB Chinese (EAS) run.  The commented-out assignments are the
# alternative EUR / CAS settings and are not used by this cell.
# eur_munged_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/White_British/munged/"
# eas_munged_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/gwas/munged/"
# Directory holding the munged summary statistics.
eas_munged_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/Chinese/munged/"
# eur_ld_ref_path = "/data1/jiapl_group/lishuhua/software/general/ldsc/LD_SCORE/EUR_baselineLD/baselineLD."
# eur_w_ld_path = "/data1/jiapl_group/lishuhua/software/general/ldsc/LD_SCORE/EUR_ldscores/LDscore."
# Prefixes forwarded to ldsc.py as --ref-ld-chr and --w-ld-chr respectively.
eas_ld_ref_path = "/data1/jiapl_group/lishuhua/software/general/ldsc/LD_SCORE/EAS_baselineLD/baselineLD."
eas_w_ld_path = "/data1/jiapl_group/lishuhua/software/general/ldsc/LD_SCORE/EAS_ldscores/weights.EAS.hm3_noMHC."
# eur_output_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/White_British/h2/"
# eas_output_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/gwas/corr/"
# Directory that receives the "<trait>_<other>_corr" output prefixes.
eas_output_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/Chinese/corr/"

def run_ldsc(sumstats_file, ld_ref, weights_ref, out_prefix):
    """Run ``ldsc.py --rg`` for one set of munged summary-statistics files.

    Args:
        sumstats_file: value passed to ``--rg``; the caller in this cell
            supplies two file paths joined by a comma.
        ld_ref: prefix passed to ``--ref-ld-chr``.
        weights_ref: prefix passed to ``--w-ld-chr``.
        out_prefix: prefix passed to ``--out``.

    Returns:
        int: exit status of the ldsc.py process (0 means success).  The
        status used to be discarded, which hid failed runs.
    """
    cmd = [
        "python", "/data1/jiapl_group/lishuhua/software/general/ldsc/ldsc.py",
        "--rg", sumstats_file,
        "--ref-ld-chr", ld_ref,
        "--w-ld-chr", weights_ref,
        "--out", out_prefix,
    ]
    returncode = subprocess.call(cmd)
    if returncode != 0:
        print("ldsc.py --rg exited with code %d for %s" % (returncode, sumstats_file))
    return returncode

# Maps the "p<number>" identifier that appears in the munged file names to the
# short trait label that appears in the output file names.
trait_dict = {
    "p48": "waist",
    "p50": "height",
    "p102": "pulse",
    "p4079": "dbp",
    "p4080": "sbp",
    "p20116": "smoke",
    "p20117": "drink",
    "p21001": "bmi",
    "p30000": "wbc",
    "p30010": "rbc",
    "p30020": "hb",
    "p30080": "plt",
    "p30120": "lymph",
    "p30130": "mono",
    "p30140": "neut",
    "p30150": "eos",
    "p30620": "alt",
    "p30650": "ast",
    "p30670": "bun",
    "p30690": "cholesterol",
    "p30700": "creatinine",
    "p30730": "ggt",
    "p30740": "glucose",
    "p30760": "hdl",
    "p30780": "ldl",
    "p30870": "triglycerides",
    "p30880": "ua",
}
# Genetic correlation (run_ldsc / ldsc.py --rg) for every ordered pair of
# distinct traits.
# NOTE(review): every unordered pair is therefore run twice (A_B and B_A);
# left unchanged because later steps may look up either output name.

# Make sure the output directory exists before ldsc.py is asked to write there.
os.makedirs(eas_output_path, exist_ok=True)

for trait, trait_name in trait_dict.items():
    for other_trait, other_trait_name in trait_dict.items():
        if trait == other_trait:
            continue
        # Munged inputs are named like "p20116_int.merged.sumstats.gz" (the
        # same ".sumstats.gz" suffix the h2 cells scan for); the previous
        # ".sumstats.txt.gz" suffix does not follow that naming.
        eas_sums_1 = os.path.join(eas_munged_dir, f"{trait}_int.merged.sumstats.gz")
        eas_sums_2 = os.path.join(eas_munged_dir, f"{other_trait}_int.merged.sumstats.gz")
        # --rg takes the two files as one comma-separated argument.
        eas_sum_path = f'{eas_sums_1},{eas_sums_2}'
        eas_out_prefix = os.path.join(eas_output_path, f"{trait_name}_{other_trait_name}_corr")
        run_ldsc(eas_sum_path, eas_ld_ref_path, eas_w_ld_path, eas_out_prefix)

In [None]:
# -*- coding: utf-8 -*-
# Calculate pairwise genetic correlations (ldsc.py --rg) between traits in the UKB Chinese cohort
import subprocess
import os
from multiprocessing import Pool, cpu_count

# Inputs for the UKB Chinese (EAS) run.  The commented-out assignments are the
# alternative EUR / CAS settings and are not used by this cell.
# eur_munged_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/White_British/munged/"
# eas_munged_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/gwas/munged/"
# Directory holding the munged summary statistics.
eas_munged_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/Chinese/munged/"
# eur_ld_ref_path = "/data1/jiapl_group/lishuhua/software/general/ldsc/LD_SCORE/EUR_baselineLD/baselineLD."
# eur_w_ld_path = "/data1/jiapl_group/lishuhua/software/general/ldsc/LD_SCORE/EUR_ldscores/LDscore."
# Prefixes forwarded to ldsc.py as --ref-ld-chr and --w-ld-chr respectively.
eas_ld_ref_path = "/data1/jiapl_group/lishuhua/software/general/ldsc/LD_SCORE/EAS_baselineLD/baselineLD."
eas_w_ld_path = "/data1/jiapl_group/lishuhua/software/general/ldsc/LD_SCORE/EAS_ldscores/weights.EAS.hm3_noMHC."
# eur_output_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/White_British/h2/"
# eas_output_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/gwas/corr/"
# Directory that receives the "<trait>_<other>_corr" output prefixes.
eas_output_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/Chinese/corr/"

def run_ldsc(sumstats_file, ld_ref, weights_ref, out_prefix):
    """Run ``ldsc.py --rg`` for one set of munged summary-statistics files.

    Args:
        sumstats_file: value passed to ``--rg``; the caller in this cell
            supplies two file paths joined by a comma.
        ld_ref: prefix passed to ``--ref-ld-chr``.
        weights_ref: prefix passed to ``--w-ld-chr``.
        out_prefix: prefix passed to ``--out``.

    Returns:
        int: exit status of the ldsc.py process (0 means success).  The
        status used to be discarded, which hid failed runs.
    """
    cmd = [
        "python", "/data1/jiapl_group/lishuhua/software/general/ldsc/ldsc.py",
        "--rg", sumstats_file,
        "--ref-ld-chr", ld_ref,
        "--w-ld-chr", weights_ref,
        "--out", out_prefix,
    ]
    returncode = subprocess.call(cmd)
    if returncode != 0:
        print("ldsc.py --rg exited with code %d for %s" % (returncode, sumstats_file))
    return returncode

# Maps the "p<number>" identifier that appears in the munged file names to the
# short trait label that appears in the output file names.
trait_dict = {
    "p48": "waist",
    "p50": "height",
    "p102": "pulse",
    "p4079": "dbp",
    "p4080": "sbp",
    "p20116": "smoke",
    "p20117": "drink",
    "p21001": "bmi",
    "p30000": "wbc",
    "p30010": "rbc",
    "p30020": "hb",
    "p30080": "plt",
    "p30120": "lymph",
    "p30130": "mono",
    "p30140": "neut",
    "p30150": "eos",
    "p30620": "alt",
    "p30650": "ast",
    "p30670": "bun",
    "p30690": "cholesterol",
    "p30700": "creatinine",
    "p30730": "ggt",
    "p30740": "glucose",
    "p30760": "hdl",
    "p30780": "ldl",
    "p30870": "triglycerides",
    "p30880": "ua",
}

# Genetic correlation (run_ldsc / ldsc.py --rg) for every ordered pair of
# distinct traits.
# NOTE(review): every unordered pair is therefore run twice (A_B and B_A);
# left unchanged because later steps may look up either output name.

# Make sure the output directory exists before ldsc.py is asked to write there.
if not os.path.isdir(eas_output_path):
    os.makedirs(eas_output_path)

# .items() works on both Python 2 and Python 3; with 27 entries the extra list
# that Python 2 builds is negligible.
for trait, trait_name in trait_dict.items():
    for other_trait, other_trait_name in trait_dict.items():
        # print "Comparing {} with {}".format(trait_name, other_trait_name)
        if trait == other_trait:
            continue

        # str.format() is used instead of f-strings so the cell runs on Python 2.
        eas_sums_1 = os.path.join(eas_munged_dir, "{}_int.merged.sumstats.gz".format(trait))
        eas_sums_2 = os.path.join(eas_munged_dir, "{}_int.merged.sumstats.gz".format(other_trait))

        # --rg takes the two files as one comma-separated argument.
        eas_sum_path = '{},{}'.format(eas_sums_1, eas_sums_2)

        eas_out_prefix = os.path.join(eas_output_path, "{}_{}_corr".format(trait_name, other_trait_name))
        # Skip pairs that already have a log file from an earlier run.
        # NOTE(review): the log's existence is the only check, so a run that
        # failed after creating its log would not be retried -- confirm.
        if os.path.exists(eas_out_prefix + ".log"):
            print("File {} already exists, skipping...".format(eas_out_prefix + ".log"))
            continue

        run_ldsc(eas_sum_path, eas_ld_ref_path, eas_w_ld_path, eas_out_prefix)