In [None]:
# Step 1: get EAS summary statistics

import pandas as pd
import os
import numpy as np

# eas_example: /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/bmi/group_1/gwas/train.Pheno.glm.linear

# sig_snps = df[["ID", "#CHROM", "ALT", "REF", "BETA", "SE", "OBS_CT"]]
# sig_snps.columns = ["rsid", "chr", "a1", "a0", "beta", "beta_se", "n_eff"]

# Root of the per-trait EAS (CAS) cross-validation folders read by this cell.
eas_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/"
# EUR (UKB) counterpart; defined here but not referenced by this cell's code.
eur_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/UKB_EUR/"
# Root under which the reformatted summary statistics are written.
output_gwas_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/prosper/train/"
# Header-less reference table; its first two columns are used as (chr, rsid).
snp_list_path = "/data1/jiapl_group/lishuhua/software/PRS/PROSPER/reference/ref_bim.txt"

# One sub-folder per trait is looked up under eas_base_dir.
trait_list = [
    'waist', 'height', 'pulse', 'dbp', 'sbp', 'smoke', 'drink', 'bmi', 'wbc',
    'rbc', 'hb', 'plt', 'lymph', 'mono', 'neut', 'eos', 'alt', 'ast', 'bun',
    'cholesterol', 'creatinine', 'glucose', 'ggt', 'hdl', 'ldl',
    'triglycerides', 'ua',
]

def get_summary_stats(trait, group_num, gwas_path, output_path, snp_list):
    """Reformat one GWAS result file and write the SNPs found in ``snp_list``.

    The tab-separated file at ``gwas_path`` is read. If it has a ``BETA``
    column, ``BETA``/``SE`` are used directly; otherwise ``OR`` and
    ``LOG(OR)_SE`` are read and the effect is taken as ``log(OR)``. The
    columns ``ID, #CHROM, ALT, REF, BETA, SE, OBS_CT`` are renamed to
    ``rsid, chr, a1, a0, beta, beta_se, n_eff``, inner-joined with
    ``snp_list`` on ``(chr, rsid)``, cleaned, and written tab-separated
    (with header) to ``output_path``.

    Args:
        trait: Trait label; used only in progress messages.
        group_num: Group label; used only in progress messages.
        gwas_path: Path of the tab-separated GWAS result file.
        output_path: Path of the tab-separated file to write.
        snp_list: DataFrame with ``chr`` and ``rsid`` columns.

    Returns:
        None. No file is written when the join yields no rows.
    """
    print(f"Processing trait: {trait}, group: {group_num}")

    # nrows=0 parses only the header line, which is all that is needed to
    # choose between the BETA layout and the OR layout.
    header = pd.read_csv(gwas_path, sep="\t", nrows=0).columns
    if "BETA" in header:
        df = pd.read_csv(gwas_path, sep='\t', usecols=['ID', '#CHROM', 'ALT', 'REF', 'BETA', 'SE', 'OBS_CT'])
    else:
        df = pd.read_csv(gwas_path, sep='\t', usecols=['ID', '#CHROM', 'ALT', 'REF', 'OR', 'LOG(OR)_SE', 'OBS_CT'])
        # log(0) is -inf and log(<0) is NaN; such rows are removed further
        # down, so the NumPy warnings would only be noise.
        with np.errstate(divide="ignore", invalid="ignore"):
            df['BETA'] = np.log(df['OR'])
        df['SE'] = df['LOG(OR)_SE']

    # Explicit copy: the frame is modified below and must not alias the
    # column selection it came from.
    df = df[["ID", "#CHROM", "ALT", "REF", "BETA", "SE", "OBS_CT"]].copy()
    df.columns = ["rsid", "chr", "a1", "a0", "beta", "beta_se", "n_eff"]

    # A non-numeric chromosome label (e.g. "X") makes one key column
    # object/string typed while the other stays integer, and pandas refuses
    # to merge such columns. Compare the keys as strings in that case;
    # purely numeric keys are left untouched. snp_list itself is not mutated.
    ref = snp_list
    if not (pd.api.types.is_numeric_dtype(df["chr"]) and pd.api.types.is_numeric_dtype(ref["chr"])):
        df["chr"] = df["chr"].astype(str)
        ref = ref.assign(chr=ref["chr"].astype(str))

    merge_df = pd.merge(df, ref, on=["chr", "rsid"], how="inner")
    if merge_df.empty:
        print(f"No SNPs found for trait: {trait}, group: {group_num}")
        return
    print(f"Found {len(merge_df)} SNPs for trait: {trait}, group: {group_num}")

    merge_df = merge_df[['rsid', 'chr', 'a1', 'a0', 'beta', 'beta_se', 'n_eff']]

    # Drop rows with a missing or infinite statistic (or a missing rsid)
    # BEFORE de-duplicating: otherwise a valid record is discarded whenever
    # an invalid record with the same rsid happens to come first.
    stats = merge_df[['beta', 'beta_se', 'n_eff']]
    invalid = (stats.isna() | stats.isin([np.inf, -np.inf])).any(axis=1) | merge_df['rsid'].isna()
    merge_df = merge_df[~invalid]
    merge_df = merge_df.drop_duplicates(subset=['rsid'])

    merge_df.to_csv(output_path, sep="\t", index=False, header=True)

if __name__ == "__main__":
    # The reference table has no header row; its first two columns are used
    # as the (chr, rsid) join keys.
    snp_list = pd.read_csv(snp_list_path, sep="\t", header=None, usecols=[0, 1])
    snp_list.columns = ["chr", "rsid"]

    for trait in trait_list:
        for group_num in range(1, 11):
            eas_gwas_path = os.path.join(eas_base_dir, trait, f"group_{group_num}", "gwas", "train.Pheno.glm.linear")
            if not os.path.exists(eas_gwas_path):
                # Report the gap instead of skipping silently, so a missing
                # input (e.g. a trait without a .linear result) is visible.
                print(f"GWAS file not found: {eas_gwas_path}")
                continue

            eas_output_path = os.path.join(output_gwas_dir, "EAS", trait, f"group_{group_num}.txt")
            # exist_ok avoids the check-then-create race of testing for the
            # directory first.
            os.makedirs(os.path.dirname(eas_output_path), exist_ok=True)
            get_summary_stats(trait, group_num, eas_gwas_path, eas_output_path, snp_list)

In [None]:
# Step 1: get EAS summary statistics
# NOTE: variant of the cell above that processes only the binary traits smoke and drink.

import pandas as pd
import os
import numpy as np

# eas_example: /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/bmi/group_1/gwas/train.Pheno.glm.linear
# eur_example: /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/UKB_EUR/height/group_3/merged_gwas/height.merged.glm.linear

# sig_snps = df[["ID", "#CHROM", "ALT", "REF", "BETA", "SE", "OBS_CT"]]
# sig_snps.columns = ["rsid", "chr", "a1", "a0", "beta", "beta_se", "n_eff"]

# Root of the per-trait EAS (CAS) cross-validation folders read by this cell.
eas_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/"
# EUR (UKB) counterpart; defined here but not referenced by this cell's code.
eur_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/UKB_EUR/"
# Root under which the reformatted summary statistics are written.
output_gwas_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/prosper/train/"
# Header-less reference table; its first two columns are used as (chr, rsid).
snp_list_path = "/data1/jiapl_group/lishuhua/software/PRS/PROSPER/reference/ref_bim.txt"

# Full trait list, kept for reference:
# trait_list = ['waist', 'height', 'pulse', 'dbp', 'sbp', 'smoke', 'drink', 'bmi', 'wbc', 'rbc', 'hb', 'plt', 'lymph', 'mono', 'neut', 'eos', 'alt', 'ast', 'bun', 'cholesterol', 'creatinine', 'glucose', 'ggt', 'hdl', 'ldl', 'triglycerides', 'ua']
# This cell is restricted to the two traits below.
trait_list = [
    'drink',
    'smoke',
]

def get_summary_stats(trait, group_num, gwas_path, output_path, snp_list, n_control=None, n_case=None):
    """Reformat one GWAS result file and write the SNPs found in ``snp_list``.

    The tab-separated file at ``gwas_path`` is read. If it has a ``BETA``
    column, ``BETA``/``SE`` are used directly; otherwise ``OR`` and
    ``LOG(OR)_SE`` are read and the effect is taken as ``log(OR)``. The
    columns ``ID, #CHROM, ALT, REF, BETA, SE, OBS_CT`` are renamed to
    ``rsid, chr, a1, a0, beta, beta_se, n_eff``, inner-joined with
    ``snp_list`` on ``(chr, rsid)``, cleaned, and written tab-separated
    (with header) to ``output_path``.

    Args:
        trait: Trait label; used only in messages.
        group_num: Group label; used only in messages.
        gwas_path: Path of the tab-separated GWAS result file.
        output_path: Path of the tab-separated file to write.
        snp_list: DataFrame with ``chr`` and ``rsid`` columns.
        n_control: Optional number of controls.
        n_case: Optional number of cases. When both counts are given and
            positive, ``n_eff`` is replaced for every SNP by
            ``4 / (1 / n_control + 1 / n_case)``; otherwise ``OBS_CT`` is
            kept.

    Returns:
        None. No file is written when the join yields no rows.
    """
    print(f"Processing trait: {trait}, group: {group_num}")

    # nrows=0 parses only the header line, which is all that is needed to
    # choose between the BETA layout and the OR layout.
    header = pd.read_csv(gwas_path, sep="\t", nrows=0).columns
    if "BETA" in header:
        df = pd.read_csv(gwas_path, sep='\t', usecols=['ID', '#CHROM', 'ALT', 'REF', 'BETA', 'SE', 'OBS_CT'])
    else:
        df = pd.read_csv(gwas_path, sep='\t', usecols=['ID', '#CHROM', 'ALT', 'REF', 'OR', 'LOG(OR)_SE', 'OBS_CT'])
        # log(0) is -inf and log(<0) is NaN; such rows are removed further
        # down, so the NumPy warnings would only be noise.
        with np.errstate(divide="ignore", invalid="ignore"):
            df['BETA'] = np.log(df['OR'])
        df['SE'] = df['LOG(OR)_SE']

    # Explicit copy: the frame is modified below and must not alias the
    # column selection it came from.
    df = df[["ID", "#CHROM", "ALT", "REF", "BETA", "SE", "OBS_CT"]].copy()
    df.columns = ["rsid", "chr", "a1", "a0", "beta", "beta_se", "n_eff"]

    if n_control is not None and n_case is not None:
        if n_control > 0 and n_case > 0:
            df['n_eff'] = 4 / (1 / n_control + 1 / n_case)
        else:
            # A zero count would divide by zero; keep OBS_CT and say so.
            print(f"Warning: Zero cases or controls for {trait}, group {group_num}. n_eff not recalculated.")

    # A non-numeric chromosome label (e.g. "X") makes one key column
    # object/string typed while the other stays integer, and pandas refuses
    # to merge such columns. Compare the keys as strings in that case;
    # purely numeric keys are left untouched. snp_list itself is not mutated.
    ref = snp_list
    if not (pd.api.types.is_numeric_dtype(df["chr"]) and pd.api.types.is_numeric_dtype(ref["chr"])):
        df["chr"] = df["chr"].astype(str)
        ref = ref.assign(chr=ref["chr"].astype(str))

    merge_df = pd.merge(df, ref, on=["chr", "rsid"], how="inner")
    if merge_df.empty:
        print(f"No SNPs found for trait: {trait}, group: {group_num}")
        return
    print(f"Found {len(merge_df)} SNPs for trait: {trait}, group: {group_num}")

    merge_df = merge_df[['rsid', 'chr', 'a1', 'a0', 'beta', 'beta_se', 'n_eff']]

    # Drop rows with a missing or infinite statistic (or a missing rsid)
    # BEFORE de-duplicating: otherwise a valid record is discarded whenever
    # an invalid record with the same rsid happens to come first.
    stats = merge_df[['beta', 'beta_se', 'n_eff']]
    invalid = (stats.isna() | stats.isin([np.inf, -np.inf])).any(axis=1) | merge_df['rsid'].isna()
    merge_df = merge_df[~invalid]
    merge_df = merge_df.drop_duplicates(subset=['rsid'])

    merge_df.to_csv(output_path, sep="\t", index=False, header=True)

if __name__ == "__main__":
    # The reference table has no header row; its first two columns are used
    # as the (chr, rsid) join keys.
    snp_list = pd.read_csv(snp_list_path, sep="\t", header=None, usecols=[0, 1])
    snp_list.columns = ["chr", "rsid"]

    for trait in trait_list:
        # smoke and drink are read from the logistic result file; every
        # other trait from the linear one.
        is_binary = trait == "smoke" or trait == "drink"
        gwas_name = "train.Pheno.glm.logistic" if is_binary else "train.Pheno.glm.linear"

        for group_num in range(1, 11):
            group_dir = os.path.join(eas_base_dir, trait, f"group_{group_num}")
            eas_gwas_path = os.path.join(group_dir, "gwas", gwas_name)
            if not os.path.exists(eas_gwas_path):
                print(f"GWAS file not found: {eas_gwas_path}")
                continue

            # Reset for every (trait, group). Previously these names were
            # only assigned when the phenotype file existed, so a missing
            # file raised NameError on the first group and silently reused
            # the previous group's counts afterwards.
            n_control, n_case = None, None
            if is_binary:
                eas_train_pheno_path = os.path.join(group_dir, "pheno", "train_pheno.txt")
                if not os.path.exists(eas_train_pheno_path):
                    print(f"Phenotype file not found for {trait}, group {group_num}. Cannot calculate n_eff.")
                    continue
                eas_train_pheno = pd.read_csv(eas_train_pheno_path, sep="\t", header=0)
                eas_train_pheno['Pheno'] = eas_train_pheno['Pheno'].astype(int)
                # Rows coded 1 are counted as controls and rows coded 2 as
                # cases -- assumed coding, confirm against the phenotype files.
                n_control = eas_train_pheno[eas_train_pheno['Pheno'] == 1].shape[0]
                n_case = eas_train_pheno[eas_train_pheno['Pheno'] == 2].shape[0]

            eas_output_path = os.path.join(output_gwas_dir, "EAS", trait, f"group_{group_num}.txt")
            # exist_ok avoids the check-then-create race of testing for the
            # directory first.
            os.makedirs(os.path.dirname(eas_output_path), exist_ok=True)
            get_summary_stats(trait, group_num, eas_gwas_path, eas_output_path, snp_list, n_control, n_case)

In [None]:
# Step 1: get EAS summary statistics
# Unified version: continuous and binary traits are handled by one code path.

import pandas as pd
import os
import numpy as np

# Base directories and file paths.
# Root of the per-trait EAS (CAS) cross-validation folders.
eas_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/"
# Root under which the reformatted summary statistics are written.
output_gwas_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/prosper/train/"
# Header-less reference table; its first two columns are used as (chr, rsid).
snp_list_path = "/data1/jiapl_group/lishuhua/software/PRS/PROSPER/reference/ref_bim.txt"

# Every trait to process, continuous and binary alike.
trait_list = [
    'waist', 'height', 'pulse', 'dbp', 'sbp', 'smoke', 'drink', 'bmi', 'wbc',
    'rbc', 'hb', 'plt', 'lymph', 'mono', 'neut', 'eos', 'alt', 'ast', 'bun',
    'cholesterol', 'creatinine', 'glucose', 'ggt', 'hdl', 'ldl',
    'triglycerides', 'ua',
]

# Binary traits, which get special handling in the driver loop.
binary_traits = [
    'smoke',
    'drink',
]

def get_summary_stats(trait, group_num, gwas_path, output_path, snp_list, n_control=None, n_case=None):
    """Reformat one GWAS result file and write the SNPs found in ``snp_list``.

    Works for both continuous and binary traits. The tab-separated file at
    ``gwas_path`` is read. If it has a ``BETA`` column, ``BETA``/``SE`` are
    used directly; otherwise ``OR`` and ``LOG(OR)_SE`` are read and the
    effect is taken as ``log(OR)``. The columns ``ID, #CHROM, ALT, REF,
    BETA, SE, OBS_CT`` are renamed to ``rsid, chr, a1, a0, beta, beta_se,
    n_eff``, inner-joined with ``snp_list`` on ``(chr, rsid)``, cleaned, and
    written tab-separated (with header) to ``output_path``.

    Args:
        trait: Trait label; used only in messages.
        group_num: Group label; used only in messages.
        gwas_path: Path of the tab-separated GWAS result file.
        output_path: Path of the tab-separated file to write.
        snp_list: DataFrame with ``chr`` and ``rsid`` columns.
        n_control: Optional number of controls.
        n_case: Optional number of cases. When both counts are given and
            positive, ``n_eff`` is replaced for every SNP by
            ``4 / (1 / n_control + 1 / n_case)``; otherwise ``OBS_CT`` is
            kept.

    Returns:
        None. No file is written when the join yields no rows.
    """
    print(f"Processing trait: {trait}, group: {group_num}")

    # Decide between the BETA layout and the OR layout from the header
    # alone; nrows=0 avoids parsing any data row for this probe.
    header = pd.read_csv(gwas_path, sep="\t", nrows=0).columns
    if "BETA" in header:
        use_cols = ['ID', '#CHROM', 'ALT', 'REF', 'BETA', 'SE', 'OBS_CT']
        df = pd.read_csv(gwas_path, sep='\t', usecols=use_cols)
    else:
        use_cols = ['ID', '#CHROM', 'ALT', 'REF', 'OR', 'LOG(OR)_SE', 'OBS_CT']
        df = pd.read_csv(gwas_path, sep='\t', usecols=use_cols)
        # Convert the odds ratio to an effect size, log(OR). log(0) is -inf
        # and log(<0) is NaN; such rows are removed further down, so the
        # NumPy warnings would only be noise.
        with np.errstate(divide="ignore", invalid="ignore"):
            df['BETA'] = np.log(df['OR'])
        df['SE'] = df['LOG(OR)_SE']

    # Standardise the column names. Explicit copy: the frame is modified
    # below and must not alias the column selection it came from.
    df = df[["ID", "#CHROM", "ALT", "REF", "BETA", "SE", "OBS_CT"]].copy()
    df.columns = ["rsid", "chr", "a1", "a0", "beta", "beta_se", "n_eff"]

    # With case/control counts available, recompute the effective sample size.
    if n_control is not None and n_case is not None:
        if n_control > 0 and n_case > 0:
            df['n_eff'] = 4 / (1 / n_control + 1 / n_case)
        else:
            print(f"Warning: Zero cases or controls for {trait}, group {group_num}. n_eff not recalculated.")

    # A non-numeric chromosome label (e.g. "X") makes one key column
    # object/string typed while the other stays integer, and pandas refuses
    # to merge such columns. Compare the keys as strings in that case;
    # purely numeric keys are left untouched. snp_list itself is not mutated.
    ref = snp_list
    if not (pd.api.types.is_numeric_dtype(df["chr"]) and pd.api.types.is_numeric_dtype(ref["chr"])):
        df["chr"] = df["chr"].astype(str)
        ref = ref.assign(chr=ref["chr"].astype(str))

    # Keep only SNPs present in the reference list.
    merge_df = pd.merge(df, ref, on=["chr", "rsid"], how="inner")
    if merge_df.empty:
        print(f"No SNPs found after merging for trait: {trait}, group: {group_num}")
        return
    print(f"Found {len(merge_df)} SNPs for trait: {trait}, group: {group_num}")

    merge_df = merge_df[['rsid', 'chr', 'a1', 'a0', 'beta', 'beta_se', 'n_eff']]

    # Drop rows with a missing or infinite statistic (or a missing rsid)
    # BEFORE de-duplicating: otherwise a valid record is discarded whenever
    # an invalid record with the same rsid happens to come first.
    stats = merge_df[['beta', 'beta_se', 'n_eff']]
    invalid = (stats.isna() | stats.isin([np.inf, -np.inf])).any(axis=1) | merge_df['rsid'].isna()
    merge_df = merge_df[~invalid]
    merge_df = merge_df.drop_duplicates(subset=['rsid'])

    # Save the result.
    merge_df.to_csv(output_path, sep="\t", index=False, header=True)

if __name__ == "__main__":
    # Reference SNP table: no header row, first two columns are the join keys.
    snp_list = pd.read_csv(snp_list_path, sep="\t", header=None, usecols=[0, 1])
    snp_list.columns = ["chr", "rsid"]

    # Walk every trait and every group 1..10.
    for trait in trait_list:
        for group_num in range(1, 11):
            group_dir = os.path.join(eas_base_dir, trait, f"group_{group_num}")

            # None means "keep OBS_CT as n_eff" (the continuous-trait case).
            n_control = None
            n_case = None

            if trait not in binary_traits:
                # Continuous trait: linear-regression output, no counts.
                file_suffix = "linear"
            else:
                # Binary trait: logistic-regression output plus case/control
                # counts taken from the training phenotype file.
                file_suffix = "logistic"
                eas_train_pheno_path = os.path.join(group_dir, "pheno", "train_pheno.txt")
                if not os.path.exists(eas_train_pheno_path):
                    print(f"Phenotype file not found for {trait}, group {group_num}. Cannot calculate n_eff.")
                    continue

                eas_train_pheno = pd.read_csv(eas_train_pheno_path, sep="\t", header=0)
                eas_train_pheno['Pheno'] = eas_train_pheno['Pheno'].astype(int)
                # Assumes the coding 1 = control, 2 = case.
                is_control = eas_train_pheno['Pheno'] == 1
                is_case = eas_train_pheno['Pheno'] == 2
                n_control = eas_train_pheno[is_control].shape[0]
                n_case = eas_train_pheno[is_case].shape[0]

            # Locate the GWAS summary-statistics file for this trait/group.
            gwas_filename = f"train.Pheno.glm.{file_suffix}"
            eas_gwas_path = os.path.join(group_dir, "gwas", gwas_filename)
            if not os.path.exists(eas_gwas_path):
                print(f"GWAS file not found: {eas_gwas_path}")
                continue

            # Make sure the output directory exists, then convert the file.
            eas_output_path = os.path.join(output_gwas_dir, "EAS", trait, f"group_{group_num}.txt")
            os.makedirs(os.path.dirname(eas_output_path), exist_ok=True)
            get_summary_stats(trait, group_num, eas_gwas_path, eas_output_path, snp_list, n_control, n_case)

In [None]:
# Step 1: get EUR summary statistics
# Unified version: continuous and binary traits are handled by one code path.

import pandas as pd
import os
import numpy as np

# Base directories and file paths.
# Root of the EUR (UKB) cross-validation data.
eur_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/UKB_EUR/"
# Root under which the reformatted summary statistics are written.
output_gwas_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/prosper/train/"
# Header-less reference table; its first two columns are used as (chr, rsid).
snp_list_path = "/data1/jiapl_group/lishuhua/software/PRS/PROSPER/reference/ref_bim.txt"

# Phenotype field id -> trait name. Only the active (uncommented) entries
# are processed; the remaining entries are kept for reference.
trait_dict = {
    # 'p48': 'waist',
    # 'p50': 'height',
    # 'p102': 'pulse',
    # 'p4079': 'dbp',
    # 'p4080': 'sbp',
    'p20116': 'smoke',
    'p20117': 'drink',
    # 'p21001': 'bmi',
    # 'p30000': 'wbc',
    # 'p30010': 'rbc',
    # 'p30020':'hb',
    # 'p30080': 'plt',
    # 'p30120': 'lymph',
    # 'p30130': 'mono',
    # 'p30140': 'neut',
    # 'p30150': 'eos',
    # 'p30620': 'alt',
    # 'p30650': 'ast',
    # 'p30670': 'bun',
    # 'p30690': 'cholesterol',
    # 'p30700': 'creatinine',
    # 'p30730': 'ggt',
    # 'p30740': 'glucose',
    # 'p30760': 'hdl',
    # 'p30780': 'ldl',
    # 'p30870': 'triglycerides',
    # 'p30880': 'ua'
}

# Binary traits, which get special handling in the driver loop.
binary_traits = [
    'smoke',
    'drink',
]

def get_summary_stats(trait, group_num, gwas_path, output_path, snp_list, n_control=None, n_case=None):
    """Reformat one GWAS result file and write the SNPs found in ``snp_list``.

    Works for both continuous and binary traits. The tab-separated file at
    ``gwas_path`` is read. If it has a ``BETA`` column, ``BETA``/``SE`` are
    used directly; otherwise ``OR`` and ``LOG(OR)_SE`` are read and the
    effect is taken as ``log(OR)``. The columns ``ID, #CHROM, ALT, REF,
    BETA, SE, OBS_CT`` are renamed to ``rsid, chr, a1, a0, beta, beta_se,
    n_eff``, inner-joined with ``snp_list`` on ``(chr, rsid)``, cleaned, and
    written tab-separated (with header) to ``output_path``.

    Args:
        trait: Trait label; used only in messages.
        group_num: Group label; used only in messages.
        gwas_path: Path of the tab-separated GWAS result file.
        output_path: Path of the tab-separated file to write.
        snp_list: DataFrame with ``chr`` and ``rsid`` columns.
        n_control: Optional number of controls.
        n_case: Optional number of cases. When both counts are given and
            positive, ``n_eff`` is replaced for every SNP by
            ``4 / (1 / n_control + 1 / n_case)``; otherwise ``OBS_CT`` is
            kept.

    Returns:
        None. No file is written when the join yields no rows.
    """
    print(f"Processing trait: {trait}, group: {group_num}")

    # Decide between the BETA layout and the OR layout from the header
    # alone; nrows=0 avoids parsing any data row for this probe.
    header = pd.read_csv(gwas_path, sep="\t", nrows=0).columns
    if "BETA" in header:
        use_cols = ['ID', '#CHROM', 'ALT', 'REF', 'BETA', 'SE', 'OBS_CT']
        df = pd.read_csv(gwas_path, sep='\t', usecols=use_cols)
    else:
        use_cols = ['ID', '#CHROM', 'ALT', 'REF', 'OR', 'LOG(OR)_SE', 'OBS_CT']
        df = pd.read_csv(gwas_path, sep='\t', usecols=use_cols)
        # Convert the odds ratio to an effect size, log(OR). log(0) is -inf
        # and log(<0) is NaN; such rows are removed further down, so the
        # NumPy warnings would only be noise.
        with np.errstate(divide="ignore", invalid="ignore"):
            df['BETA'] = np.log(df['OR'])
        df['SE'] = df['LOG(OR)_SE']

    # Standardise the column names. Explicit copy: the frame is modified
    # below and must not alias the column selection it came from.
    df = df[["ID", "#CHROM", "ALT", "REF", "BETA", "SE", "OBS_CT"]].copy()
    df.columns = ["rsid", "chr", "a1", "a0", "beta", "beta_se", "n_eff"]

    # With case/control counts available, recompute the effective sample size.
    if n_control is not None and n_case is not None:
        if n_control > 0 and n_case > 0:
            df['n_eff'] = 4 / (1 / n_control + 1 / n_case)
        else:
            print(f"Warning: Zero cases or controls for {trait}, group {group_num}. n_eff not recalculated.")

    # A non-numeric chromosome label (e.g. "X") makes one key column
    # object/string typed while the other stays integer, and pandas refuses
    # to merge such columns. Compare the keys as strings in that case;
    # purely numeric keys are left untouched. snp_list itself is not mutated.
    ref = snp_list
    if not (pd.api.types.is_numeric_dtype(df["chr"]) and pd.api.types.is_numeric_dtype(ref["chr"])):
        df["chr"] = df["chr"].astype(str)
        ref = ref.assign(chr=ref["chr"].astype(str))

    # Keep only SNPs present in the reference list.
    merge_df = pd.merge(df, ref, on=["chr", "rsid"], how="inner")
    if merge_df.empty:
        print(f"No SNPs found after merging for trait: {trait}, group: {group_num}")
        return
    print(f"Found {len(merge_df)} SNPs for trait: {trait}, group: {group_num}")

    merge_df = merge_df[['rsid', 'chr', 'a1', 'a0', 'beta', 'beta_se', 'n_eff']]

    # Drop rows with a missing or infinite statistic (or a missing rsid)
    # BEFORE de-duplicating: otherwise a valid record is discarded whenever
    # an invalid record with the same rsid happens to come first.
    stats = merge_df[['beta', 'beta_se', 'n_eff']]
    invalid = (stats.isna() | stats.isin([np.inf, -np.inf])).any(axis=1) | merge_df['rsid'].isna()
    merge_df = merge_df[~invalid]
    merge_df = merge_df.drop_duplicates(subset=['rsid'])

    # Save the result.
    merge_df.to_csv(output_path, sep="\t", index=False, header=True)

if __name__ == "__main__":
    # Reference SNP table: no header row, first two columns are the join keys.
    snp_list = pd.read_csv(snp_list_path, sep="\t", header=None, usecols=[0, 1])
    snp_list.columns = ["chr", "rsid"]

    # Walk every active trait and every fold 1..10.
    for trait_id, trait in trait_dict.items():
        # The "_int" form of the id is used both as the phenotype column
        # name and inside the GWAS file names.
        trait_id = trait_id + "_int"
        for group_num in range(1, 11):
            # None means "keep OBS_CT as n_eff" (the continuous-trait case).
            n_control, n_case = None, None

            # Phenotype example:
            # /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/UKB_EUR/pheno/fold_1/train_pheno_int_dropna_2.txt
            if trait in binary_traits:
                # Binary trait: logistic output, "_binary" tag in the file name.
                file_suffix = "logistic"
                chrom_prefix = "_binary"
                eur_train_pheno_path = os.path.join(eur_base_dir, f"pheno/fold_{group_num}", "train_pheno_int_dropna_2.txt")

                if not os.path.exists(eur_train_pheno_path):
                    print(f"Phenotype file not found for {trait}, group {group_num}. Cannot calculate n_eff.")
                    continue

                # Only this trait's column is needed for the counts, so do
                # not load the rest of the phenotype table.
                eur_train_pheno = pd.read_csv(eur_train_pheno_path, sep="\t", header=0, usecols=[trait_id])
                eur_train_pheno[trait_id] = eur_train_pheno[trait_id].astype(int)
                # Assumes the coding 1 = control, 2 = case.
                n_control = eur_train_pheno[eur_train_pheno[trait_id] == 1].shape[0]
                n_case = eur_train_pheno[eur_train_pheno[trait_id] == 2].shape[0]
            else:
                # Continuous trait: linear output, no extra tag.
                file_suffix = "linear"
                chrom_prefix = ""

            # One GWAS file, and one output file, per autosome.
            for chrom in range(1, 23):
                # eur_example: /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/UKB_EUR/train/gwas/fold_1/gwas_chr1.p102_int.glm.linear
                gwas_filename = f"gwas_chr{chrom}{chrom_prefix}.{trait_id}.glm.{file_suffix}"
                eur_gwas_path = os.path.join(eur_base_dir, "train", "gwas", f"fold_{group_num}", gwas_filename)
                if not os.path.exists(eur_gwas_path):
                    print(f"GWAS file not found: {eur_gwas_path}")
                    continue

                eur_output_path = os.path.join(output_gwas_dir, "EUR", trait, f"group_{group_num}_chr{chrom}.txt")
                if os.path.exists(eur_output_path):
                    # Outputs from an earlier run are never overwritten.
                    print(f"Output file already exists, skipping: {eur_output_path}")
                    continue

                # exist_ok avoids the check-then-create race of testing for
                # the directory first.
                os.makedirs(os.path.dirname(eur_output_path), exist_ok=True)
                get_summary_stats(trait, group_num, eur_gwas_path, eur_output_path, snp_list, n_control, n_case)