In [None]:
import pandas as pd
import numpy as np
import os

# Output columns (tab-separated, in this order): rsid  a1  a2  af  N  beta  SE

def get_summary_stats(trait, gwas_path, output_path, ancestry):
    """Convert one GWAS result table into the 7-column summary-statistics file.

    The input is a tab-separated table whose column names look like PLINK 2
    ``--glm`` output (TODO confirm). When a ``BETA`` column is present it is
    used together with ``SE``; otherwise the effect size is derived as
    ``log(OR)`` and the standard error is taken from ``LOG(OR)_SE``.

    Rows with a missing ``rsid``, ``beta``, ``SE`` or ``N`` are removed
    *before* de-duplicating on ``rsid``, so a complete record is never
    discarded in favour of an incomplete duplicate that precedes it.

    Args:
        trait: Trait label; only used in the progress message.
        gwas_path: Path of the tab-separated GWAS result file to read.
        output_path: Path of the tab-separated file to write (with header).
        ancestry: Ancestry label; only used in the progress message.

    Raises:
        FileNotFoundError: If ``gwas_path`` does not exist.
        ValueError: If a required column is missing from the input file.
    """
    print(f"Processing trait: {trait} of ancestry: {ancestry}")
    shared_cols = ['ID', 'ALT', 'REF', 'A1_FREQ', 'OBS_CT']
    # Read the header only, to find out which effect-size columns exist.
    header = pd.read_csv(gwas_path, sep="\t", nrows=0).columns
    if "BETA" in header:
        df = pd.read_csv(gwas_path, sep='\t', usecols=shared_cols + ['BETA', 'SE'])
    else:
        df = pd.read_csv(gwas_path, sep='\t', usecols=shared_cols + ['OR', 'LOG(OR)_SE'])
        # log(0) is -inf, which dropna() would keep; mark it as missing so the
        # row is filtered out below instead of reaching the output file.
        df['BETA'] = np.log(df['OR']).replace([np.inf, -np.inf], np.nan)
        df['SE'] = df['LOG(OR)_SE']
    # NOTE(review): REF/ALT are exported as a1/a2 and A1_FREQ as af; this
    # assumes the counted allele matches what the downstream tool expects.
    df = df[["ID", "REF", "ALT", "A1_FREQ", "OBS_CT", "BETA", "SE"]]
    df.columns = ["rsid", "a1", "a2", "af", "N", "beta", "SE"]
    # Filter incomplete rows first, then keep the first remaining row per rsid.
    df = df.dropna(subset=['rsid', 'beta', 'SE', 'N'])
    df = df.drop_duplicates(subset=['rsid'])
    df.to_csv(output_path, sep="\t", index=False, header=True)

if __name__ == "__main__":
    # Input directories (one per ancestry) and the root of the converted output.
    eas_gwas_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/gwas/gwas/"
    eur_gwas_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/White_British/gwas/"
    output_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/popcorn/gwas/"
    # Field ID (used in the EUR file names) -> short trait name (used in the
    # EAS file names and in every output file name).
    trait_dict = {
        'p48': 'waist',
        'p50': 'height',
        'p102': 'pulse',
        'p4079': 'dbp',
        'p4080': 'sbp',
        'p20116': 'smoke',
        'p20117': 'drink',
        'p21001': 'bmi',
        'p30000': 'wbc',
        'p30010': 'rbc',
        'p30020': 'hb',
        'p30080': 'plt',
        'p30120': 'lymph',
        'p30130': 'mono',
        'p30140': 'neut',
        'p30150': 'eos',
        'p30620': 'alt',
        'p30650': 'ast',
        'p30670': 'bun',
        'p30690': 'cholesterol',
        'p30700': 'creatinine',
        'p30730': 'ggt',
        'p30740': 'glucose',
        'p30760': 'hdl',
        'p30780': 'ldl',
        'p30870': 'triglycerides',
        'p30880': 'ua',
    }

    # Process the EAS population. File names look like
    # alt_int.alt_int.glm.linear; only the short trait name is needed here.
    eas_output_dir = os.path.join(output_dir, 'EAS')
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(eas_output_dir, exist_ok=True)
    for name in trait_dict.values():
        if name in ('smoke', 'drink'):
            # These two traits use the logistic results on the raw phenotype.
            eas_gwas_path = os.path.join(eas_gwas_dir, f'{name}_raw.{name}_raw.glm.logistic')
        else:
            eas_gwas_path = os.path.join(eas_gwas_dir, f'{name}_int.{name}_int.glm.linear')
        output_path = os.path.join(eas_output_dir, f'{name}.txt')
        get_summary_stats(name, eas_gwas_path, output_path, 'EAS')

    # Process the EUR population. File names look like
    # p102_int.merged.glm.linear, i.e. they are keyed by field ID.
    # NOTE(review): smoke/drink also use the linear results here, unlike EAS
    # -- confirm this is intended.
    eur_output_dir = os.path.join(output_dir, 'EUR')
    os.makedirs(eur_output_dir, exist_ok=True)
    for trait, name in trait_dict.items():
        eur_gwas_path = os.path.join(eur_gwas_dir, f'{trait}_int.merged.glm.linear')
        output_path = os.path.join(eur_output_dir, f'{name}.txt')
        get_summary_stats(name, eur_gwas_path, output_path, 'EUR')

In [None]:
import pandas as pd
import numpy as np
import os
import sys
import glob

# Header written to every file (tab-separated, in this order): rsid  a1  a2  af  N  beta  SE
def get_summary_stats(gwas_path, output_path):
    """Rewrite a tab-separated table with the standard 7-column header.

    The first line of ``gwas_path`` is consumed as a header and its labels are
    replaced positionally; the data rows are written out unchanged.

    Args:
        gwas_path: Path of the tab-separated file to read.
        output_path: Path of the tab-separated file to write (may be the
            same as ``gwas_path``; the input is fully read before writing).

    Raises:
        ValueError: If the input does not have exactly seven columns.
    """
    standard_header = ["rsid", "a1", "a2", "af", "N", "beta", "SE"]
    table = pd.read_csv(gwas_path, header=0, sep='\t')
    # Positional relabel; a column-count mismatch raises ValueError.
    relabelled = table.set_axis(standard_header, axis=1)
    relabelled.to_csv(output_path, index=False, header=True, sep="\t")

if __name__ == "__main__":
    # Root directory holding one sub-directory per ancestry.
    gwas_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/popcorn/gwas/"
    # Every *.txt file exactly one directory level below the root.
    gwas_files = glob.glob(os.path.join(gwas_dir, "*", "*.txt"))
    print(f"Found {len(gwas_files)} GWAS files to process.")
    for gwas_file in gwas_files:
        print(f"Processing GWAS file: {gwas_file}")
        # Input and output are the same path: each file is rewritten in place.
        get_summary_stats(gwas_file, gwas_file)