In [7]:
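# Prerequisite (run in a shell beforehand): unzip the BBJ archives and
# decompress the per-trait autosomal summary statistics:
#   unzip "hum0197.v3.BBJ.*.zip"
#   find hum0197.v3.BBJ.*/ -type f -name "*.auto.txt.gz" -exec gunzip -f {} +
# Then build a lookup table of BBJ GWAS information (trait, sample size, file prefix).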
import pandas as pd

# Read the whitespace-delimited trait table: trait name, sample size, zip file name.
bbj_raw = pd.read_csv("bbj_dict.txt", sep=r"\s+", header=None, names=["trait", "n", "file"])

# Reduce each zip file name to its trait-specific prefix by removing the
# shared "hum0197.v3.BBJ." lead-in and the ".v1.zip" tail.
file_prefixes = [
    zip_name.replace(".v1.zip", "").replace("hum0197.v3.BBJ.", "")
    for zip_name in bbj_raw["file"]
]

# Keep only the columns that are written out.
bbj_info = bbj_raw.assign(file_prefix=file_prefixes)[["trait", "n", "file_prefix"]]
# display(bbj_info)
bbj_info.to_csv("bbj_info.tsv", sep='\t', index=False)

In [None]:
# -*- coding: utf-8 -*-
# Step 1: convert gwas file into .sumstats files
from __future__ import print_function
import pandas as pd
import os

# LDSC munge_sumstats.py script invoked by convert_to_sumstats().
munge_script = "/data1/jiapl_group/lishuhua/software/general/ldsc/munge_sumstats.py"
# Trait table (columns: trait, n, file_prefix) read by the main loop.
bbj_info_path = "/data1/jiapl_group/lishuhua/BBJ/bbj_info.tsv"
# SNP list used both to pre-filter the GWAS rows and as --merge-alleles.
merge_allele_file_path = "/data1/jiapl_group/lishuhua/software/general/ldsc/LD_SCORE/snpinfo_mult_1kg_hm3"
# Root directory holding one unzipped "hum0197.v3.BBJ.<prefix>.v1/" folder per trait.
gwas_base_dir = "/data1/jiapl_group/lishuhua/BBJ/general/"
# Destination for the munged output and for the intermediate filtered TSVs.
output_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/BBJ/munged/"
temp_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/BBJ/temp/"

# Create the output and temp directories if they do not exist yet.
for required_dir in (output_base_dir, temp_base_dir):
    if not os.path.exists(required_dir):
        os.makedirs(required_dir)

def process_gwas_file(gwas_file, merge_allele_file_list, output_file, chunksize=500000, n_sample=None):
    """Filter a tab-separated GWAS file to a SNP list and write a munge-ready TSV.

    The input is streamed in chunks. For every chunk the rows whose ``SNP`` is
    in ``merge_allele_file_list`` are kept, the allele and p-value columns are
    renamed (ALLELE1 -> A2, ALLELE0 -> A1, P_BOLT_LMM_INF -> P), and a
    constant ``N`` column is appended. The first chunk overwrites
    ``output_file`` and writes the header; later chunks are appended.

    Args:
        gwas_file: Path to the tab-separated GWAS file. It must contain the
            columns SNP, ALLELE1, ALLELE0, BETA, SE and P_BOLT_LMM_INF.
        merge_allele_file_list: Collection of SNP identifiers to keep (the
            caller passes a set).
        output_file: Path of the TSV to write; overwritten if it exists.
        chunksize: Number of input rows read per chunk.
        n_sample: Sample size stored in the ``N`` column. When omitted, the
            module-level ``n_sample`` variable is used so that existing
            callers that only set the global keep working.

    Raises:
        ValueError: If ``n_sample`` is neither passed nor defined at module
            level.
    """
    if n_sample is None:
        # Backward compatibility: earlier callers relied on a global n_sample.
        try:
            n_sample = globals()["n_sample"]
        except KeyError:
            raise ValueError(
                "n_sample must be passed to process_gwas_file "
                "(or defined as a module-level variable)"
            )

    required_columns = ["SNP", "ALLELE1", "ALLELE0", "BETA", "SE", "P_BOLT_LMM_INF"]
    # NOTE(review): BOLT-LMM documents BETA as the effect of ALLELE1, while
    # LDSC's munge_sumstats documents A1 as the allele the signed statistic
    # refers to. Mapping ALLELE1 -> A2 may therefore flip the effect sign;
    # confirm this is intended before reusing the output for anything that
    # depends on effect direction.
    column_map = {"ALLELE1": "A2", "ALLELE0": "A1", "P_BOLT_LMM_INF": "P"}

    chunk_iter = pd.read_csv(gwas_file, sep="\t", chunksize=chunksize, usecols=required_columns)
    wrote_header = False
    for chunk in chunk_iter:
        # Filter first, then rename and add N on the (smaller) kept rows.
        kept = chunk[chunk["SNP"].isin(merge_allele_file_list)]
        kept = kept.rename(columns=column_map).assign(N=n_sample)
        if wrote_header:
            # Later chunks: append without repeating the header.
            kept.to_csv(output_file, sep="\t", index=False, mode='a', header=False)
        else:
            # First chunk: overwrite any existing file and write the header.
            kept.to_csv(output_file, sep="\t", index=False, mode='w', header=True)
            wrote_header = True

    if not wrote_header:
        # The reader produced no chunks at all; still (over)write a
        # header-only file so a stale output from a previous run is not reused.
        header_only = pd.DataFrame(
            columns=[column_map.get(name, name) for name in required_columns] + ["N"]
        )
        header_only.to_csv(output_file, sep="\t", index=False, mode='w', header=True)

def convert_to_sumstats(input_file, output_file, n_sample):
    """Run LDSC's munge_sumstats.py on a pre-filtered GWAS TSV.

    The command is passed to the OS as an argument list rather than a shell
    string, so paths containing spaces or shell metacharacters are handled
    correctly.

    Args:
        input_file: Path of the TSV passed as ``--sumstats``.
        output_file: Output prefix passed as ``--out``.
        n_sample: Sample size passed as ``--N``.

    Returns:
        The exit status of the munge_sumstats.py process (0 on success).

    Raises:
        OSError: If the ``python`` executable cannot be started.
    """
    import subprocess  # local import keeps this cell self-contained

    command = [
        "python", munge_script,
        "--sumstats", str(input_file),
        "--N", str(n_sample),
        "--merge-alleles", merge_allele_file_path,
        "--out", str(output_file),
    ]
    exit_code = subprocess.call(command)
    if exit_code != 0:
        # Surface failures instead of silently discarding the exit status.
        print("WARNING: munge_sumstats.py exited with status {0} for {1}".format(exit_code, input_file))
    return exit_code

if __name__ == "__main__":
    # Driver: for every trait in the table, locate its GWAS file, filter it to
    # the merge-allele SNPs, and hand the result to munge_sumstats.py.
    bbj_info = pd.read_csv(bbj_info_path, sep="\t")
    merge_allele_file = pd.read_csv(merge_allele_file_path, sep="\t")
    # Despite the name this is a set, which keeps the per-chunk isin() lookup cheap.
    merge_allele_file_list = set(merge_allele_file["SNP"].tolist())

    skipped_traits = []
    # iterrows() yields (index, Series) pairs; the index is not needed here.
    for _, row in bbj_info.iterrows():
        trait = row["trait"]
        # process_gwas_file picks up `n_sample` from module scope rather than
        # from the arguments passed below, so this name must not be renamed.
        n_sample = int(row["n"])
        file_prefix = row["file_prefix"]

        gwas_file_dir = os.path.join(gwas_base_dir, "hum0197.v3.BBJ.{0}.v1/".format(file_prefix))

        # Search the directory for the GWASsummary_*.auto.txt file. The listing
        # is sorted so the choice is deterministic if several files match.
        gwas_file = None
        if os.path.isdir(gwas_file_dir):
            for file_name in sorted(os.listdir(gwas_file_dir)):
                if file_name.endswith(".auto.txt") and file_name.startswith("GWASsummary_"):
                    gwas_file = os.path.join(gwas_file_dir, file_name)
                    break

        if gwas_file is None:
            # Missing directory or no matching file: skip this trait instead of
            # passing None to pd.read_csv and aborting the whole batch.
            print("WARNING: no GWASsummary_*.auto.txt file found in {0}; skipping {1}".format(gwas_file_dir, trait))
            skipped_traits.append(trait)
            continue

        temp_output_file = os.path.join(temp_base_dir, "{0}.tsv".format(trait))
        final_output_file = os.path.join(output_base_dir, "{0}".format(trait))

        print("Processing {0}...".format(gwas_file))
        process_gwas_file(gwas_file, merge_allele_file_list, temp_output_file)
        convert_to_sumstats(temp_output_file, final_output_file, n_sample)
        print("Saved munged sumstats to {0}".format(final_output_file))

    if skipped_traits:
        print("Skipped {0} trait(s) with no input file: {1}".format(
            len(skipped_traits), ", ".join(str(name) for name in skipped_traits)))

In [None]:
import os
import glob

# Directory scanned for "*.sumstats.gz" inputs by the main block of this cell.
munged_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/BBJ/munged/"
# Prefixes passed to ldsc.py as --ref-ld-chr and --w-ld-chr respectively.
eas_ld_ref_path = "/data1/jiapl_group/lishuhua/software/general/ldsc/LD_SCORE/EAS_baselineLD/baselineLD."
eas_w_ld_path = "/data1/jiapl_group/lishuhua/software/general/ldsc/LD_SCORE/EAS_ldscores/weights.EAS.hm3_noMHC."
# Directory that receives the per-trait ldsc.py output (used as the --out prefix root).
output_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/BBJ/h2/"
# NOTE(review): despite its name this points at ldsc.py, not munge_sumstats.py,
# and it rebinds the same global name (and output_base_dir) used by the munging
# cell. Re-running that cell's functions after this one without re-running its
# configuration would pick up these values.
munge_script = "/data1/jiapl_group/lishuhua/software/general/ldsc/ldsc.py"

def compute_h2(trait, munged_sumstats, ld_ref, w_ld, out_dir):
    """Run ``ldsc.py --h2`` for one trait.

    The command is passed to the OS as an argument list rather than a shell
    string, so paths containing spaces or shell metacharacters are handled
    correctly.

    Args:
        trait: Trait name; used as the output file prefix inside ``out_dir``.
        munged_sumstats: Path of the munged summary statistics passed as ``--h2``.
        ld_ref: Prefix passed as ``--ref-ld-chr``.
        w_ld: Prefix passed as ``--w-ld-chr``.
        out_dir: Directory for the output; created if it does not exist.

    Returns:
        The exit status of the ldsc.py process (0 on success).

    Raises:
        OSError: If the ``python`` executable cannot be started.
    """
    import subprocess  # local import keeps this cell self-contained

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    out_file = os.path.join(out_dir, trait)
    command = [
        "python", munge_script,  # in this cell the global points at ldsc.py
        "--h2", str(munged_sumstats),
        "--ref-ld-chr", str(ld_ref),
        "--w-ld-chr", str(w_ld),
        "--out", out_file,
    ]
    exit_code = subprocess.call(command)
    if exit_code != 0:
        # Surface failures instead of silently discarding the exit status.
        print("WARNING: ldsc.py exited with status {0} for {1}".format(exit_code, trait))
    return exit_code

if __name__ == "__main__":
    # Make sure the results directory is present before any LDSC run.
    if not os.path.exists(output_base_dir):
        os.makedirs(output_base_dir)

    # Every munged file in the input directory is one trait to estimate.
    for sumstats_path in glob.glob(os.path.join(munged_base_dir, "*.sumstats.gz")):
        # The trait name is the file name with the sumstats suffix removed.
        trait = os.path.basename(sumstats_path).replace(".sumstats.gz", "")
        print("Computing h2 for {0}...".format(trait))
        compute_h2(
            trait,
            sumstats_path,
            eas_ld_ref_path,
            eas_w_ld_path,
            output_base_dir,
        )
        print("Saved h2 results for {0}.".format(trait))

In [None]:
import os
import re
import pandas as pd

def extract_h2_from_logs(folder_path):
    """Collect heritability estimates from every ``*.log`` file in a folder.

    Each log is scanned line by line for
    ``Total Observed scale h2: <h2> (<se>)``. Every match contributes one
    ``(trait, h2, se)`` tuple, where ``trait`` is the file name without its
    ``.log`` suffix. Files are visited in sorted name order so the result is
    deterministic.

    Args:
        folder_path: Directory containing the log files.

    Returns:
        A list of ``(trait, h2, se)`` tuples with ``h2`` and ``se`` as floats.
        The list is empty if the folder does not exist or nothing matched.
        Files that cannot be read or parsed are reported on stdout and skipped.
    """
    # Signed decimal with an optional exponent, so negative or
    # scientific-notation estimates are not silently dropped.
    number = r"-?[\d.]+(?:[eE][-+]?\d+)?"
    pattern = re.compile(
        r"Total Observed scale h2:\s*(" + number + r")\s*\((" + number + r")\)"
    )
    log_suffix = ".log"
    results = []

    if not os.path.exists(folder_path):
        print(f"Folder {folder_path} does not exist.")
        return results

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(log_suffix):
            continue
        file_path = os.path.join(folder_path, filename)
        # Strip only the trailing suffix; ".log" inside the name is kept.
        trait = filename[:-len(log_suffix)]
        # Pre-bind so the error message below works even when open() fails
        # before the first line is read.
        line_num = 0
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, start=1):
                    match = pattern.search(line)
                    if match:
                        h2_value = float(match.group(1))
                        se_value = float(match.group(2))
                        results.append((trait, h2_value, se_value))
        except (OSError, ValueError) as e:
            # OSError: unreadable path; ValueError: undecodable bytes
            # (UnicodeDecodeError) or a malformed number.
            print(f"Error reading {file_path} on line {line_num}: {e}")
    return results

if __name__ == "__main__":
    # Where the per-trait logs live, and where the summary table goes.
    h2_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/BBJ/h2/"
    results_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/BBJ/bbj_h2_results.tsv"

    # One row per (trait, h2, se) tuple found in the logs.
    h2_df = pd.DataFrame(
        extract_h2_from_logs(h2_dir),
        columns=["trait", "h2", "se"],
    )
    h2_df.to_csv(results_path, sep="\t", index=False, header=True)