In [None]:
import pandas as pd
import os
import numpy as np

# Directory the per-trait GWAS tables are read from, and directory the
# reformatted summary statistics are written to.
gwas_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/Chinese/gwas/"
output_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/popcorn/gwas/EAS_UKB/"
# exist_ok=True creates the directory only when needed, without a separate
# (race-prone) existence check beforehand.
os.makedirs(output_dir, exist_ok=True)

def get_summary_stats(trait, gwas_path, output_path, ancestry):
    """Reformat one tab-separated GWAS result table and write it to ``output_path``.

    The input must contain the columns ``ID``, ``REF``, ``ALT``, ``A1_FREQ``
    and ``OBS_CT`` plus either ``BETA``/``SE`` or ``OR``/``LOG(OR)_SE``.
    When ``BETA`` is absent, the effect size is computed as ``log(OR)`` and
    ``LOG(OR)_SE`` is used as its standard error.

    The output is a tab-separated file with a header and the columns
    ``rsid, a1, a2, af, N, beta, SE`` (taken from ``ID, REF, ALT, A1_FREQ,
    OBS_CT, BETA, SE`` respectively). Rows with a missing or infinite
    ``beta``/``SE``, or a missing ``rsid``/``N``, are removed first; after
    that only the first remaining row per ``rsid`` is kept.

    Parameters
    ----------
    trait : str
        Trait label, used only in the progress message.
    gwas_path : str
        Path of the tab-separated GWAS result file to read.
    output_path : str
        Path of the tab-separated file to write.
    ancestry : str
        Ancestry label, used only in the progress message.

    Raises
    ------
    FileNotFoundError
        If ``gwas_path`` does not exist (raised by ``pandas.read_csv``).
    ValueError
        If a required column is missing (raised by ``pandas.read_csv``).
    """
    print(f"Processing trait: {trait} of ancestry: {ancestry}")

    # Read the header only, to find out which effect-size columns are present.
    header = pd.read_csv(gwas_path, sep="\t", nrows=0).columns
    shared_cols = ["ID", "ALT", "REF", "A1_FREQ", "OBS_CT"]
    if "BETA" in header:
        df = pd.read_csv(gwas_path, sep="\t", usecols=shared_cols + ["BETA", "SE"])
    else:
        df = pd.read_csv(gwas_path, sep="\t", usecols=shared_cols + ["OR", "LOG(OR)_SE"])
        # log(0) is -inf and log(negative) is NaN; both are filtered out below,
        # so the numpy warnings would only be noise.
        with np.errstate(divide="ignore", invalid="ignore"):
            df["BETA"] = np.log(df["OR"])
        df["SE"] = df["LOG(OR)_SE"]

    # NOTE(review): ALT is written as a2 and A1_FREQ as af, which assumes the
    # tested allele (A1) of every row is ALT -- confirm against the GWAS output
    # and the allele convention expected by the downstream tool.
    column_map = {
        "ID": "rsid",
        "REF": "a1",
        "ALT": "a2",
        "A1_FREQ": "af",
        "OBS_CT": "N",
        "BETA": "beta",
        "SE": "SE",
    }
    infinities = [np.inf, -np.inf]
    sumstats = (
        df[list(column_map)]
        .rename(columns=column_map)
        # Turn +/-inf into NaN so the dropna below removes them as well.
        .assign(
            beta=lambda d: d["beta"].replace(infinities, np.nan),
            SE=lambda d: d["SE"].replace(infinities, np.nan),
        )
        # Drop unusable rows BEFORE de-duplicating: otherwise an unusable first
        # occurrence of an rsid would shadow a later usable one and the SNP
        # would be lost altogether.
        .dropna(subset=["rsid", "beta", "SE", "N"])
        .drop_duplicates(subset=["rsid"])
    )
    sumstats.to_csv(output_path, sep="\t", index=False, header=True)


# Maps the prefix of each GWAS result file name to the short trait name used
# for the output file. The keys look like UK Biobank field identifiers with a
# "p" prefix -- TODO confirm.
trait_dict = {
    # body size
    "p48": "waist",
    "p50": "height",
    # pulse and blood pressure
    "p102": "pulse",
    "p4079": "dbp",
    "p4080": "sbp",
    # lifestyle
    "p20116": "smoke",
    "p20117": "drink",
    "p21001": "bmi",
    # blood cell measurements
    "p30000": "wbc",
    "p30010": "rbc",
    "p30020": "hb",
    "p30080": "plt",
    "p30120": "lymph",
    "p30130": "mono",
    "p30140": "neut",
    "p30150": "eos",
    # blood biochemistry
    "p30620": "alt",
    "p30650": "ast",
    "p30670": "bun",
    "p30690": "cholesterol",
    "p30700": "creatinine",
    "p30730": "ggt",
    "p30740": "glucose",
    "p30760": "hdl",
    "p30780": "ldl",
    "p30870": "triglycerides",
    "p30880": "ua",
}

# Convert the GWAS table of every trait in trait_dict. A missing input file is
# reported and skipped so that one absent trait does not abort the whole batch.
for trait, trait_name in trait_dict.items():
    gwas_file = os.path.join(gwas_base_dir, f"{trait}_int.merged.glm.linear")
    if not os.path.exists(gwas_file):
        print(f"GWAS file not found for trait {trait_name}, skipping...")
        continue
    print(f"Processing trait: {trait_name}")
    output_path = os.path.join(output_dir, f"{trait_name}.txt")
    get_summary_stats(trait_name, gwas_file, output_path, "EAS")

In [None]:
import os
import subprocess

import numpy as np
import pandas as pd

# Per-ancestry summary-statistics directories, the directory receiving the
# popcorn results, and the score file handed to popcorn via --cfile.
eas_gwas_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/popcorn/gwas/EAS_UKB/"
eur_gwas_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/popcorn/gwas/EUR/"
output_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/popcorn/res_for_ukb/"
score_file_path = "/data1/jiapl_group/lishuhua/software/general/Popcorn-master/score/EUR_EAS_all_gen_eff.cscore"

trait_list = [
    'waist', 'height', 'pulse', 'dbp', 'sbp', 'smoke', 'drink', 'bmi',
    'wbc', 'rbc', 'hb', 'plt', 'lymph', 'mono', 'neut', 'eos',
    'alt', 'ast', 'bun', 'cholesterol', 'creatinine', 'glucose', 'ggt',
    'hdl', 'ldl', 'triglycerides', 'ua',
]

# Run `popcorn fit` once per trait. A trait is skipped when either input file
# is missing or when its output file already exists.
for trait_name in trait_list:
    eas_sumstats_path = os.path.join(eas_gwas_base_dir, f"{trait_name}.txt")
    eur_sumstats_path = os.path.join(eur_gwas_base_dir, f"{trait_name}.txt")
    if not os.path.exists(eas_sumstats_path):
        print(f"EAS summary stats file not found for trait {trait_name}, skipping...")
        continue
    if not os.path.exists(eur_sumstats_path):
        print(f"EUR summary stats file not found for trait {trait_name}, skipping...")
        continue
    output_path = os.path.join(output_base_dir, f"{trait_name}.txt")
    if os.path.exists(output_path):
        print(f"Output file already exists for trait {trait_name}, skipping...")
        continue
    print(f"Processing trait {trait_name}...")

    # Created lazily, right before the first real run, so popcorn does not fail
    # merely because the results directory is missing.
    os.makedirs(output_base_dir, exist_ok=True)

    # Arguments are passed as a list (no shell), so paths containing spaces or
    # shell metacharacters reach popcorn unchanged.
    cmd = [
        "popcorn", "fit", "-v", "1",
        "--cfile", score_file_path,
        "--gen_effect",
        "--sfile1", eur_sumstats_path,
        "--sfile2", eas_sumstats_path,
        output_path,
    ]
    try:
        completed = subprocess.run(cmd)
    except OSError as exc:
        # The executable could not be launched at all (e.g. not on PATH);
        # every remaining trait would fail the same way, so stop here.
        print(f"Could not launch popcorn: {exc}")
        break
    if completed.returncode != 0:
        # Keep going with the other traits, but make the failure visible.
        print(f"popcorn exited with status {completed.returncode} for trait {trait_name}")