In [None]:
# Step 0: get the EUR GWAS and keep the SNPs in EUR reference files
# base path: /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/White_British/gwas/
# output base path: /data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ctsleb/train/EUR/
import pandas as pd
import os
import subprocess
import numpy as np

def process_single_file(file_path, ref_snp, output_path):
    """Format one GWAS result table and keep only SNPs present in a reference list.

    Args:
        file_path: Path to a tab-separated GWAS table whose name ends with
            ``.merged.glm.linear``. It must contain the columns ``#CHROM``,
            ``POS``, ``ID``, ``REF``, ``ALT`` and ``P`` plus either
            ``BETA``/``SE`` or ``OR``/``LOG(OR)_SE``.
        ref_snp: Path to a headerless tab-separated file; its first two
            columns are read as chromosome and SNP id.
        output_path: Destination of the tab-separated output with columns
            ``CHR, SNP, BP, A1, BETA, SE, P, rs_id``.

    Returns:
        None. No file is written (and a warning is printed to stderr) when
        ``file_path`` has an unexpected suffix or when no SNP overlaps with
        the reference list.
    """
    import sys  # local import: this cell does not import sys at the top

    if not file_path.endswith(".merged.glm.linear"):
        print(f"Warning: {file_path} does not end with '.merged.glm.linear'. Skipping this file.", file=sys.stderr)
        return

    ref_snp = pd.read_csv(ref_snp, sep="\t", header=None, usecols=[0, 1])
    ref_snp.columns = ["CHR", "rs_id"]

    # Inspect the header only to decide which effect-size columns exist.
    shared_cols = ["#CHROM", "POS", "ALT", "REF", "P", "ID"]
    header = pd.read_csv(file_path, sep="\t", nrows=0).columns
    if "BETA" in header:
        raw = pd.read_csv(file_path, sep="\t", usecols=shared_cols + ["BETA", "SE"])
    else:
        # Odds-ratio output: convert to the log scale so BETA/SE are comparable.
        raw = pd.read_csv(file_path, sep="\t", usecols=shared_cols + ["OR", "LOG(OR)_SE"])
        raw["BETA"] = np.log(raw["OR"])
        raw["SE"] = raw["LOG(OR)_SE"]

    # Build "<id>:<pos>:<alt>:<ref>" with column-wise string operations instead
    # of a row-wise apply; ids that do not start with "rs" use the chromosome.
    variant_id = raw["ID"].astype(str)
    prefix = variant_id.where(
        variant_id.str.startswith("rs", na=False), raw["#CHROM"].astype(str)
    )
    snp = (
        prefix
        + ":" + raw["POS"].astype(str)
        + ":" + raw["ALT"].astype(str)
        + ":" + raw["REF"].astype(str)
    )

    # Assemble a fresh frame so the casts never write into a slice of `raw`.
    df = pd.DataFrame(
        {
            "CHR": raw["#CHROM"].astype(int),
            "SNP": snp,
            "BP": raw["POS"].astype(int),
            "ALT": raw["ALT"],
            "BETA": raw["BETA"].astype(float),
            "SE": raw["SE"].astype(float),
            "P": raw["P"].astype(float),
            "rs_id": raw["ID"],
        }
    )
    print(df.head())

    merged_df = pd.merge(ref_snp, df, on=["CHR", "rs_id"], how="inner")
    if merged_df.empty:
        print(f"Warning: No overlapping SNPs found in {file_path}. Skipping this file.", file=sys.stderr)
        return

    merged_df = merged_df[["CHR", "SNP", "BP", "ALT", "BETA", "SE", "P", "rs_id"]]
    # The ALT allele is reported as the effect allele column A1.
    merged_df.columns = ["CHR", "SNP", "BP", "A1", "BETA", "SE", "P", "rs_id"]
    merged_df.to_csv(output_path, sep="\t", index=False, header=True)

if __name__ == "__main__":
    # Raw EUR GWAS results, destination of the formatted files, and the EUR
    # reference SNP list used for filtering.
    input_base_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/White_British/gwas/"
    output_base_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ctsleb/train/EUR/"
    ref_snp_file = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ctsleb/reference/eur_ref.txt"

    # Make sure the destination directory is available before writing.
    if not os.path.exists(output_base_path):
        os.makedirs(output_base_path)

    # Key: prefix of the input GWAS file name; value: stem of the output file.
    trait_dict = {
        'p48': 'waist',
        'p50': 'height',
        'p102': 'pulse',
        'p4079': 'dbp',
        'p4080': 'sbp',
        'p20116': 'smoke',
        'p20117': 'drink',
        'p21001': 'bmi',
        'p21002': 'weight',
        'p30000': 'wbc',
        'p30010': 'rbc',
        'p30020': 'hb',
        'p30080': 'plt',
        'p30120': 'lymph',
        'p30130': 'mono',
        'p30140': 'neut',
        'p30150': 'eos',
        'p30620': 'alt',
        'p30650': 'ast',
        'p30670': 'bun',
        'p30690': 'cholesterol',
        'p30700': 'creatinine',
        'p30730': 'ggt',
        'p30740': 'glucose',
        'p30760': 'hdl',
        'p30780': 'ldl',
        'p30870': 'triglycerides',
        'p30880': 'ua',
    }

    for trait in trait_dict:
        name = trait_dict[trait]
        input_file = os.path.join(input_base_path, f"{trait}_int.merged.glm.linear")
        output_file = os.path.join(output_base_path, f"{name}.txt")

        # Report and move on when a trait has no GWAS result on disk.
        if not os.path.exists(input_file):
            print(f"Input file {input_file} does not exist.")
            continue

        process_single_file(input_file, ref_snp_file, output_file)
        print(f"Processed {name} and saved to {output_file}")

In [None]:
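# Step 1: Format the EAS training GWAS of every cross-validation group and keep the SNPs in the EAS reference file
# NOTE(review): described as 5-fold cross-validation, but the loop below iterates group_1..group_10 -- confirm the number of folds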
# EAS cross validation result: /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/
# EAS output: /data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ctsleb/train/EAS
# EAS reference: /data1/jiapl_group/lishuhua/software/PRS/CT_SLEB/reference/EAS/merged
# overlap snp list: /data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ctsleb/reference

import pandas as pd
import sys
import os
import scipy.stats as stats
import numpy as np

def process_single_file(file_path, output_path, ref_snp_list):
    """Format one GWAS result table and keep only SNPs present in a reference list.

    Args:
        file_path: Path to a tab-separated GWAS table with a header row. It
            must contain ``#CHROM``, ``POS``, ``ID``, ``REF``, ``ALT`` and
            ``P`` plus either ``BETA``/``SE`` or ``OR``/``LOG(OR)_SE``.
        output_path: Destination of the tab-separated output with columns
            ``CHR, SNP, BP, A1, BETA, SE, P, rs_id``.
        ref_snp_list: DataFrame with the columns ``CHR`` and ``rs_id``; only
            variants matching on both are kept.

    Returns:
        None. When no SNP overlaps, a warning goes to stderr and no file is
        written.
    """
    raw = pd.read_csv(file_path, sep="\t", header=0, low_memory=False)

    # Odds-ratio output has no BETA column: derive it on the log scale.
    if "BETA" not in raw.columns:
        raw["BETA"] = np.log(raw["OR"])
        raw["SE"] = raw["LOG(OR)_SE"]

    # Build "<id>:<pos>:<alt>:<ref>" with column-wise string operations instead
    # of a row-wise apply; ids that do not start with "rs" use the chromosome.
    variant_id = raw["ID"].astype(str)
    prefix = variant_id.where(
        variant_id.str.startswith("rs", na=False), raw["#CHROM"].astype(str)
    )
    snp = (
        prefix
        + ":" + raw["POS"].astype(str)
        + ":" + raw["ALT"].astype(str)
        + ":" + raw["REF"].astype(str)
    )

    # Assemble a fresh frame so the casts never write into a slice of `raw`.
    df = pd.DataFrame(
        {
            "CHR": raw["#CHROM"].astype(int),
            "SNP": snp,
            "BP": raw["POS"].astype(int),
            "ALT": raw["ALT"],
            "BETA": raw["BETA"].astype(float),
            "SE": raw["SE"].astype(float),
            "P": raw["P"].astype(float),
            "rs_id": raw["ID"],
        }
    )

    merged_df = pd.merge(ref_snp_list, df, on=["CHR", "rs_id"], how="inner")
    if merged_df.empty:
        print(f"Warning: No overlapping SNPs found in {file_path}. Skipping this file.", file=sys.stderr)
        return
    print(f"Found {len(merged_df)} overlapping SNPs in {file_path}.")

    merged_df = merged_df[["CHR", "SNP", "BP", "ALT", "BETA", "SE", "P", "rs_id"]]
    # The ALT allele is reported as the effect allele column A1.
    merged_df.columns = ["CHR", "SNP", "BP", "A1", "BETA", "SE", "P", "rs_id"]
    merged_df.to_csv(output_path, sep="\t", index=False, header=True)

if __name__ == "__main__":
    TRAIT_LIST = ['waist', 'height', 'pulse', 'dbp', 'sbp', 'smoke', 'drink', 'bmi', 'wbc', 'rbc', 'hb', 'plt', 'lymph', 'mono', 'neut', 'eos', 'alt', 'ast', 'bun', 'cholesterol', 'creatinine', 'glucose', 'ggt', 'hdl', 'ldl', 'triglycerides', 'ua']
    # TRAIT_LIST = ['waist']

    # Reference SNP list: only the first two columns (chromosome, SNP id) are needed.
    eas_snp_list = pd.read_csv("/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ctsleb/reference/eas_ref.txt", sep="\t", header=None, usecols=[0, 1])
    eas_snp_list.columns = ["CHR", "rs_id"]

    # Example input:
    # /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/alt/group_1/gwas/train.Pheno.glm.linear
    for pheno in TRAIT_LIST:
        print(f"Processing trait: {pheno}")
        # Input and output directories for this trait.
        eas_input_dir = f"/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/{pheno}/"
        eas_output_dir = f"/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ctsleb/train/EAS/{pheno}/"
        os.makedirs(eas_output_dir, exist_ok=True)

        for i in range(1, 11):
            # Prefer the linear-model result; fall back to the logistic one.
            eas_input_file = f"{eas_input_dir}/group_{i}/gwas/train.Pheno.glm.linear"
            if not os.path.exists(eas_input_file):
                eas_input_file = f"{eas_input_dir}/group_{i}/gwas/train.Pheno.glm.logistic"
                if not os.path.exists(eas_input_file):
                    print(f"Input file {eas_input_file} does not exist. Skipping.")
                    continue

            eas_output_file = f"{eas_output_dir}/group_{i}.txt"
            print(f"Processing EAS file: {eas_input_file} -> {eas_output_file}")
            process_single_file(eas_input_file, eas_output_file, eas_snp_list)

In [None]:
# Process the tuning and validation datasets: combine their sample IDs and extract the matching EAS genotypes with plink
# ids_base_path: /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/height/group_1/ids/
# eas geno base path: /data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/geno/CAS_final/CAS_merged_qc_final
# snp list base path: /data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ctsleb/reference/
import pandas as pd
import os
import subprocess

def process_snp_list(snp_list_path, output_prefix):
    """Derive the SNP extract list and the SNP rename map from a reference file.

    Args:
        snp_list_path: Headerless tab-separated file with six columns, read
            as ``CHR, SNP, cm, BP, A1, A2``.
        output_prefix: Directory that receives ``eas_update_snp_list.txt``
            (old id, new id) and ``eas_extract_snp_list.txt`` (old id only).

    Returns:
        None. Both files are written without header or index.
    """
    snp_list = pd.read_csv(snp_list_path, sep='\t', header=None)
    snp_list.columns = ["CHR", "SNP", "cm", "BP", "A1", "A2"]

    # New id is "<id>:<bp>:<a1>:<a2>"; ids that do not start with "rs" use the
    # chromosome instead. Column-wise string operations replace the row-wise
    # apply, which is very slow on large SNP lists.
    snp_id = snp_list["SNP"].astype(str)
    prefix = snp_id.where(
        snp_id.str.startswith("rs", na=False), snp_list["CHR"].astype(str)
    )
    snp_list["check_snp"] = (
        prefix
        + ":" + snp_list["BP"].astype(str)
        + ":" + snp_list["A1"].astype(str)
        + ":" + snp_list["A2"].astype(str)
    )

    snp_list[["SNP", "check_snp"]].to_csv(f"{output_prefix}/eas_update_snp_list.txt", sep='\t', index=False, header=False)
    snp_list[["SNP"]].to_csv(f"{output_prefix}/eas_extract_snp_list.txt", sep='\t', index=False, header=False)

def process_ids_file(tuning_ids_path, valid_ids_path, output_path):
    """Concatenate the tuning and validation sample-id files into one file.

    Args:
        tuning_ids_path: Headerless tab-separated file with the tuning ids.
        valid_ids_path: Headerless tab-separated file with the validation ids.
        output_path: Destination; tuning rows come first, then validation
            rows, written tab-separated without header or index.

    Returns:
        None.
    """
    # Ids are labels, not numbers: read them as strings so type inference
    # cannot strip leading zeros or turn them into floats.
    tune_ids = pd.read_csv(tuning_ids_path, sep='\t', header=None, dtype=str)
    valid_ids = pd.read_csv(valid_ids_path, sep='\t', header=None, dtype=str)
    concat_ids = pd.concat([tune_ids, valid_ids], ignore_index=True)
    concat_ids.to_csv(output_path, sep='\t', index=False, header=False)

def process_geno_file(snp_list_path, ids_path, geno_path, output_path,
                      plink_path='/data1/jiapl_group/lishuhua/software/general/plink'):
    """Subset a genotype fileset to the given SNPs and samples by running plink.

    Args:
        snp_list_path: File passed to ``--extract``.
        ids_path: File passed to ``--keep``.
        geno_path: Input fileset prefix passed to ``--bfile``.
        output_path: Output prefix passed to ``--out``.
        plink_path: Executable to run; defaults to the path this script has
            always used.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    cmd = [
        plink_path,
        "--bfile", str(geno_path),
        "--extract", str(snp_list_path),
        "--keep", str(ids_path),
        "--make-bed",
        "--out", str(output_path),
    ]
    subprocess.run(cmd, check=True)

def update_snp_list(geno_path, snp_list_path, output_path,
                    plink_path='/data1/jiapl_group/lishuhua/software/general/plink'):
    """Rename the variants of a genotype fileset by running plink.

    Args:
        geno_path: Input fileset prefix passed to ``--bfile``.
        snp_list_path: Two-column (old id, new id) file passed to
            ``--update-name``.
        output_path: Output prefix passed to ``--out``.
        plink_path: Executable to run; defaults to the path this script has
            always used.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    cmd = [
        plink_path,
        "--bfile", str(geno_path),
        "--update-name", str(snp_list_path),
        "--make-bed",
        "--out", str(output_path),
    ]
    subprocess.run(cmd, check=True)

if __name__ == "__main__":
    # Build the plink --extract / --update-name lists once from the EAS
    # reference SNP file; every trait and group reuses them.
    eas_ref_snp_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ctsleb/reference/eas_ref.txt"
    eas_snp_output_prefix = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ctsleb/reference/"
    eas_geno_base_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/geno/CAS_final/CAS_merged_qc_final"
    process_snp_list(eas_ref_snp_path, eas_snp_output_prefix)
    extract_snp_list_path = f"{eas_snp_output_prefix}/eas_extract_snp_list.txt"
    update_snp_list_path = f"{eas_snp_output_prefix}/eas_update_snp_list.txt"

    TRAIT_LIST = ['waist', 'height', 'pulse', 'dbp', 'sbp', 'smoke', 'drink', 'bmi', 'wbc', 'rbc', 'hb', 'plt', 'lymph', 'mono', 'neut', 'eos', 'alt', 'ast', 'bun', 'cholesterol', 'creatinine', 'glucose', 'ggt', 'hdl', 'ldl', 'triglycerides', 'ua']
    # TRAIT_LIST = ['waist']

    for pheno in TRAIT_LIST:
        print(f"Processing trait: {pheno}")

        # The output directory depends only on the trait, so create it once
        # per trait rather than once per group.
        eas_geno_output_prefix = f"/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ctsleb/tuning/EAS/{pheno}/"
        os.makedirs(eas_geno_output_prefix, exist_ok=True)

        # EAS cross-validation groups
        for i in range(1, 11):
            ids_base_path = f"/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/Cross_Validation/CAS/{pheno}/group_{i}/ids/"
            tuning_ids_path = f"{ids_base_path}/tune_ids.txt"
            valid_ids_path = f"{ids_base_path}/test_ids.txt"
            output_ids_path = f"{ids_base_path}/combined_ids.txt"

            # Skip groups without id files, like the other cells skip missing
            # inputs, instead of aborting the whole run.
            missing_ids = [p for p in (tuning_ids_path, valid_ids_path) if not os.path.exists(p)]
            if missing_ids:
                print(f"Input file {missing_ids[0]} does not exist. Skipping.")
                continue

            # Combine tuning and validation sample ids into one keep-list.
            process_ids_file(tuning_ids_path, valid_ids_path, output_ids_path)

            # Subset the EAS genotypes, then rename the variants.
            output_1 = f"{eas_geno_output_prefix}/group_{i}_temp"
            output_2 = f"{eas_geno_output_prefix}/group_{i}_final"
            process_geno_file(extract_snp_list_path, output_ids_path, eas_geno_base_path, output_1)
            update_snp_list(output_1, update_snp_list_path, output_2)

In [None]:
# Remove the duplicate SNPs in EUR and EAS GWAS files
import pandas as pd
import os
import subprocess

# Directory with one sub-directory per trait holding the EAS group_<i>.txt GWAS files.
EAS_gwas_base_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ctsleb/train/EAS/"
# Directory holding the EUR <trait>.txt GWAS files.
EUR_gwas_base_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ctsleb/train/EUR/"

def process_single_file(file_path, output_path):
    """Copy a tab-separated table, keeping only the first row of each ``SNP`` value.

    Args:
        file_path: Tab-separated input with a header row containing ``SNP``.
        output_path: Destination of the de-duplicated table (header kept,
            index omitted).
    """
    table = pd.read_csv(file_path, sep="\t", header=0, low_memory=False)
    # `duplicated` flags every occurrence after the first one.
    repeated = table.duplicated(subset=["SNP"])
    unique_rows = table[~repeated]
    unique_rows.to_csv(output_path, sep="\t", index=False, header=True)

if __name__ == "__main__":
    TRAIT_LIST = ['waist', 'height', 'pulse', 'dbp', 'sbp', 'smoke', 'drink', 'bmi', 'wbc', 'rbc', 'hb', 'plt', 'lymph', 'mono', 'neut', 'eos', 'alt', 'ast', 'bun', 'cholesterol', 'creatinine', 'glucose', 'ggt', 'hdl', 'ldl', 'triglycerides', 'ua']
    # TRAIT_LIST = ['waist']

    for pheno in TRAIT_LIST:
        print(f"Processing trait: {pheno}")

        # EAS: one GWAS file per cross-validation group.
        for i in range(1, 11):
            eas_input_file = f"{EAS_gwas_base_path}/{pheno}/group_{i}.txt"
            eas_output_file = f"{EAS_gwas_base_path}/{pheno}/group_{i}_final.txt"
            if not os.path.exists(eas_input_file):
                print(f"EAS input file {eas_input_file} does not exist. Skipping.")
                continue
            process_single_file(eas_input_file, eas_output_file)
            print(f"Processed EAS file: {eas_input_file} -> {eas_output_file}")

        # EUR: a single GWAS file per trait.
        eur_input_file = f"{EUR_gwas_base_path}/{pheno}.txt"
        eur_output_file = f"{EUR_gwas_base_path}/{pheno}_final.txt"
        if not os.path.exists(eur_input_file):
            print(f"EUR input file {eur_input_file} does not exist. Skipping.")
            continue
        process_single_file(eur_input_file, eur_output_file)
        print(f"Processed EUR file: {eur_input_file} -> {eur_output_file}")