In [None]:
import sys
from pathlib import Path
import pandas as pd
import os

# Continuous: train_chr1.Pheno.glm.linear
# Binary: train_chr14.Pheno.glm.logistic

# UKBB EUR SNP counts per chromosome. The tuple below is ordered by
# chromosome number (1..22); the resulting dict is keyed by the chromosome
# number as a string.
count_list = {
    str(chrom): n_snps
    for chrom, n_snps in enumerate(
        (
            598771, 655977, 557352, 577011, 508272, 535480,  # chr 1-6
            455119, 432227, 329660, 397640, 392275, 375538,  # chr 7-12
            289173, 254513, 213428, 226288, 197587, 219460,  # chr 13-18
            165968, 168391, 104766, 99925,                   # chr 19-22
        ),
        start=1,
    )
}

# Pieces of the per-chromosome input file name, e.g.
# "train" + "_chr" + "1" + ".Pheno.glm.linear".
FILE_PREFIX, FILE_SUFFIX = "train", "_chr"
# Extension of continuous-trait results (linear regression).
FILE_EXTENSION = ".Pheno.glm.linear"
# Extension of binary-trait results (logistic regression).
FILE_EXTENSION2 = ".Pheno.glm.logistic"

def _count_data_rows(path):
    """Return the number of lines in *path*, not counting the header line."""
    with open(path, 'r') as handle:
        return sum(1 for _ in handle) - 1


def _append_table(src_path, outfile, skip_header):
    """Copy *src_path* into *outfile*, optionally dropping its first line.

    If the last copied line has no trailing newline, one is added so that
    the first row of the next file cannot be glued onto it.
    """
    with open(src_path, 'r') as infile:
        if skip_header:
            next(infile, None)
        last_line = "\n"
        for last_line in infile:
            outfile.write(last_line)
        if not last_line.endswith("\n"):
            outfile.write("\n")


def merge_gwas_files(
    GWAS_DIR,
    OUTPUT_DIR,
    TRAIT,
    check_files=False
):
    """
    Check the 22 per-chromosome GWAS files of one trait and merge them.

    For chromosomes 1..22 the file
    ``{FILE_PREFIX}{FILE_SUFFIX}{chrom}{extension}`` must exist in
    ``GWAS_DIR`` and its number of data rows (lines minus the header) must
    equal the entry for that chromosome in ``count_list``. The extension is
    ``FILE_EXTENSION2`` for the traits "drink" and "smoke" and
    ``FILE_EXTENSION`` for every other trait. Checking stops at the first
    missing or mismatching file.

    When every file passes, they are concatenated in chromosome order into
    ``OUTPUT_DIR / "{TRAIT}.merged.glm.linear"``, keeping only the header of
    the first file. The merge is written to a temporary sibling file and
    renamed into place only on success, so a failed or interrupted merge
    never leaves a partial file under the final name.

    Args:
        GWAS_DIR: Directory holding the per-chromosome files (path-like).
        OUTPUT_DIR: Directory for the merged file (path-like); created if
            it does not exist.
        TRAIT: Trait name; selects the input extension and names the output.
        check_files: Initial value of the returned status flag. Kept for
            backward compatibility; callers normally leave it False.

    Returns:
        True if the merged file was written; otherwise the ``check_files``
        value that was passed in.
    """
    GWAS_DIR = Path(GWAS_DIR)
    OUTPUT_DIR = Path(OUTPUT_DIR)

    # Make sure the output directory exists.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    print("==================================================")
    print("Starting GWAS file merge process.")
    print(f"GWAS Directory:   {GWAS_DIR}")
    print(f"Output Directory: {OUTPUT_DIR}")
    print("==================================================")

    pheno = TRAIT
    print(f"\n--- Processing phenotype: [ {pheno} ] ---")

    # The extension depends only on the trait, so pick it once up front.
    if pheno in ["drink", "smoke"]:
        final_extension = FILE_EXTENSION2
    else:
        final_extension = FILE_EXTENSION

    all_files_exist = True
    files_to_merge = []
    # Safety check: every chromosome file must exist and have the expected
    # number of data rows.
    print("Checking for presence of all 22 chromosome files...")
    for i in range(1, 23):
        # Example: train_chr1.Pheno.glm.linear
        expected_file = GWAS_DIR / f"{FILE_PREFIX}{FILE_SUFFIX}{i}{final_extension}"
        if not expected_file.is_file():
            print(f"  [ERROR] Missing file: {expected_file}", file=sys.stderr)
            all_files_exist = False
            break  # One missing file is enough to abort the check.
        file_rows = count_list.get(str(i), 0)
        actual_rows = _count_data_rows(expected_file)
        if actual_rows != file_rows:
            print(f"  [ERROR] File {expected_file} has {actual_rows} rows, expected {file_rows} rows.", file=sys.stderr)
            all_files_exist = False
            break

        files_to_merge.append(expected_file)

    if not all_files_exist:
        # Incomplete input: report and leave without touching the output.
        print(f"  Skipping merge for phenotype [ {pheno} ] due to missing files.")
        return check_files

    print("  All 22 files found. Proceeding with merge.")
    # The output is always named .glm.linear, whatever the input extension,
    # to stay consistent with the original script.
    output_file = OUTPUT_DIR / f"{pheno}.merged.glm.linear"
    # Write next to the final file so the rename stays on one filesystem.
    tmp_file = output_file.with_name(output_file.name + ".tmp")
    try:
        with open(tmp_file, 'w') as outfile:
            for index, filepath in enumerate(files_to_merge):
                # Only the first file contributes its header line.
                _append_table(filepath, outfile, skip_header=index > 0)

        # Publish the merged file only if something was actually written.
        if tmp_file.stat().st_size > 0:
            tmp_file.replace(output_file)
            print("  Successfully merged. Output file created:")
            check_files = True
            print(f"  -> {output_file}")
        else:
            print(f"  [ERROR] Merging failed for phenotype [ {pheno} ]. Output file is empty.", file=sys.stderr)
    except OSError as e:
        print(f"  [ERROR] An I/O error occurred during merge: {e}", file=sys.stderr)
    finally:
        # Best-effort cleanup: after a successful rename the temporary file
        # no longer exists, which is the expected case.
        try:
            tmp_file.unlink()
        except OSError:
            pass
    return check_files

if __name__ == "__main__":
    # Root of the cross-validation tree; every trait/group directory and the
    # summary table live underneath it.
    CV_ROOT = Path("/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation")
    TRAIT_LIST = ['height', 'pulse', 'dbp', 'sbp', 'smoke', 'drink', 'bmi', 'waist', 'wbc', 'rbc', 'hb', 'plt', 'lymph', 'mono', 'neut', 'eos', 'alt', 'ast', 'bun', 'cholesterol', 'creatinine', 'glucose', 'ggt', 'hdl', 'ldl', 'triglycerides', 'ua']

    # One record per (trait, group). Collected in a plain list and turned
    # into a DataFrame once, instead of re-concatenating a growing frame on
    # every iteration.
    status_rows = []
    for trait in TRAIT_LIST:
        for i in range(1, 11):
            print(f"\nProcessing trait: {trait}, iteration: {i}")
            # Input and output directories of this trait/group.
            group_dir = CV_ROOT / "UKB_EUR" / trait / f"group_{i}"
            GWAS_DIR = group_dir / "gwas"
            OUTPUT_DIR = group_dir / "merged_gwas"
            if (OUTPUT_DIR / f"{trait}.merged.glm.linear").exists():
                # An existing merged file is trusted and not rebuilt.
                print(f"  Output file already exists for trait {trait}, group {i}. Skipping merge.")
                check_files = True
            else:
                check_files = merge_gwas_files(GWAS_DIR, OUTPUT_DIR, trait)
            status_rows.append({
                'trait': trait,
                'group': i,
                'status': 'merged' if check_files else 'skipped',
            })

    # Write the per-trait/group status table as a tab-separated file.
    meta_df = pd.DataFrame(status_rows, columns=['trait', 'group', 'status'])
    meta_output_path = CV_ROOT / "meta_gwas_ukb.tsv"
    meta_df.to_csv(meta_output_path, index=False, header=True, sep='\t')

print("\n==================================================")
print("Process finished.")
print("==================================================")