In [None]:
import pandas as pd
import os
import numpy as np

def process_single_file(gwas_file, output_path):
    """
    Turn one GWAS result table into a filtered summary-statistics file.

    Only files whose name ends with '.merged.glm.linear' are handled; any
    other path is ignored and nothing is written. The input is read as a
    tab-separated table. When it has a BETA column, BETA/SE are used as
    they are; otherwise the effect is taken as log(OR) and the standard
    error from LOG(OR)_SE.

    The written table has the columns SNP, A1, A2, freq, b, se, p, N
    (from ID, ALT, REF, A1_FREQ, BETA, SE, P, OBS_CT respectively).

    Parameters:
    gwas_file (str): Path to the GWAS file.
    output_path (str): Path of the tab-separated output file to write.

    Returns:
    None
    """
    # Guard clause: silently skip anything that is not a merged linear result.
    if not gwas_file.endswith('.merged.glm.linear'):
        return

    final_order = ['ID', 'ALT', 'REF', 'A1_FREQ', 'BETA', 'SE', 'P', 'OBS_CT']

    # Peek at the header only, to decide which effect-size columns exist.
    header = pd.read_csv(gwas_file, sep='\t', nrows=1).columns
    if "BETA" in header:
        raw = pd.read_csv(gwas_file, sep='\t', usecols=final_order)
    else:
        raw = pd.read_csv(
            gwas_file,
            sep='\t',
            usecols=['ID', 'ALT', 'REF', 'A1_FREQ', 'OR', 'LOG(OR)_SE', 'P', 'OBS_CT'],
        )
        # Put odds ratios on the log scale so both branches share one layout.
        raw = raw.assign(BETA=np.log(raw["OR"]), SE=raw["LOG(OR)_SE"])

    stats = raw[final_order]
    stats.columns = ['SNP', 'A1', 'A2', 'freq', 'b', 'se', 'p', 'N']

    # Missing sample counts become 0 so the column can be stored as integers.
    stats['N'] = stats['N'].fillna(0).astype(int)
    stats = stats.astype({'b': float, 'se': float, 'freq': float, 'SNP': str})

    # Keep the first row for every SNP identifier.
    stats = stats.drop_duplicates(subset=['SNP'])

    # Row filters: p above 1e-100, frequency above 0.01, strictly positive se.
    keep = (stats["p"] > 1e-100) & (stats["freq"] > 0.01) & (stats['se'] > 0)
    stats = stats[keep]
    stats = stats.dropna(subset=['b', 'se', 'p'])

    stats.to_csv(output_path, sep='\t', index=False)

if __name__ == "__main__":
    # Folder holding the per-trait GWAS results, and folder receiving the .ma files.
    gwas_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/White_British/gwas/"
    output_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/polypred/train/EUR/4_sbayesr/"

    # Trait code (prefix of the GWAS file name) -> short name used for the output file.
    # Entries left as comments are currently disabled.
    trait_dict = {
        # 'p48': 'waist',
        'p50': 'height',
        'p102': 'pulse',
        'p4079': 'dbp',
        'p4080': 'sbp',
        'p20116': 'smoke',
        'p20117': 'drink',
        'p21001': 'bmi',
        # 'p21002': 'weight',
        'p30000': 'wbc',
        'p30010': 'rbc',
        'p30020': 'hb',
        'p30080': 'plt',
        'p30120': 'lymph',
        'p30130': 'mono',
        'p30140': 'neut',
        'p30150': 'eos',
        'p30620': 'alt',
        'p30650': 'ast',
        'p30670': 'bun',
        'p30690': 'cholesterol',
        'p30700': 'creatinine',
        'p30730': 'ggt',
        'p30740': 'glucose',
        'p30760': 'hdl',
        'p30780': 'ldl',
        'p30870': 'triglycerides',
        'p30880': 'ua',
    }

    for trait, trait_name in trait_dict.items():
        gwas_file = os.path.join(gwas_base_dir, f"{trait}_int.merged.glm.linear")
        output_path = os.path.join(output_base_dir, f"{trait_name}.ma")

        # Report missing inputs and move on to the next trait.
        if not os.path.exists(gwas_file):
            print(f"File {gwas_file} does not exist.")
            continue

        process_single_file(gwas_file, output_path)
        print(f"Processed {gwas_file} and saved to {output_path}")

In [None]:
import pandas as pd
import os
import numpy as np

def process_single_file(gwas_file, output_path):
    """
    Process a single GWAS file to extract, filter and save the required data.

    Only files whose name ends with '.merged.glm.linear' are handled; any
    other path is ignored and nothing is written. The input is read as a
    tab-separated table. When it has a BETA column, BETA/SE are used
    directly; otherwise the effect is computed as log(OR) and the standard
    error is taken from LOG(OR)_SE.

    Rows are de-duplicated on the variant ID, variants on chromosome 6 with
    a position inside 8477797-33448354 (inclusive) are removed, and only rows
    with p > 1e-100, freq > 0.01, se > 0 and non-missing b/se/p are kept.
    The written table has the columns SNP, A1, A2, freq, b, se, p, N.

    Parameters:
    gwas_file (str): Path to the GWAS file.
    output_path (str): Path of the tab-separated output file to write.

    Returns:
    None
    """
    if gwas_file.endswith('.merged.glm.linear'):
        # Read only the header row first to find out which effect columns exist.
        if "BETA" in pd.read_csv(gwas_file, sep='\t', nrows=1).columns:
            df = pd.read_csv(gwas_file, sep='\t', usecols=['#CHROM', 'POS', 'ID', 'ALT', 'REF', 'A1_FREQ', 'BETA', 'SE', 'P', 'OBS_CT'])
            df = df[['#CHROM', 'POS', 'ID', 'ALT', 'REF', 'A1_FREQ', 'BETA', 'SE', 'P', 'OBS_CT']]
        else:
            df = pd.read_csv(gwas_file, sep='\t', usecols=['#CHROM', 'POS', 'ID', 'ALT', 'REF', 'A1_FREQ', 'OR', 'LOG(OR)_SE', 'P', 'OBS_CT'])
            # Put odds ratios on the log scale so both branches share one layout.
            df["BETA"] = np.log(df["OR"])
            df["SE"] = df["LOG(OR)_SE"]
            df = df[['#CHROM', 'POS', 'ID', 'ALT', 'REF', 'A1_FREQ', 'BETA', 'SE', 'P', 'OBS_CT']]
        df.columns = ['CHR', 'POS', 'SNP', 'A1', 'A2', 'freq', 'b', 'se', 'p', 'N']
        # Missing sample counts become 0 so the column can be stored as integers.
        df['N'] = df['N'].fillna(0).astype(int)
        df['b'] = df['b'].astype(float)
        df['se'] = df['se'].astype(float)
        df['freq'] = df['freq'].astype(float)
        df['SNP'] = df['SNP'].astype(str)
        # Keep the first row for every variant ID.
        df = df.drop_duplicates(subset=['SNP'])
        # Remove the variants in 8477797 - 33448354 on chromosome 6
        # (NOTE(review): presumably the extended MHC region - confirm).
        # read_csv parses the chromosome column as integers when every code is
        # numeric, in which case a comparison with the string '6' is True for
        # all rows and the window would never be applied. Coercing to numeric
        # matches 6, '6' and 6.0 alike; non-numeric codes (e.g. 'X') become
        # NaN and are therefore never treated as chromosome 6.
        on_chr6 = pd.to_numeric(df['CHR'], errors='coerce') == 6
        outside_window = (df['POS'] < 8477797) | (df['POS'] > 33448354)
        df = df[~on_chr6 | outside_window]
        df = df[df["p"] > 1e-100]
        df = df[df["freq"] > 0.01]
        # Standard errors must be strictly positive.
        df = df[df['se'] > 0]
        df = df.dropna(subset=['b', 'se', 'p'])
        # CHR/POS were only needed for the region filter; they are not written.
        res = df[['SNP', 'A1', 'A2', 'freq', 'b', 'se', 'p', 'N']]
        res.to_csv(output_path, sep='\t', index=False)

if __name__ == "__main__":
    # Folder holding the per-trait GWAS results, and folder receiving the .ma files.
    gwas_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/White_British/gwas/"
    output_base_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/polypred/train/EUR/4_sbayesr/"

    # Trait code (prefix of the GWAS file name) -> short name used for the output file.
    # Only the uncommented entries are processed in this run.
    trait_dict = {
        # 'p48': 'waist',
        'p50': 'height',
        # 'p102': 'pulse',
        # 'p4079': 'dbp',
        # 'p4080': 'sbp',
        # 'p20116': 'smoke',
        # 'p20117': 'drink',
        # 'p21001': 'bmi',
        # 'p21002': 'weight',
        # 'p30000': 'wbc',
        # 'p30010': 'rbc',
        # 'p30020': 'hb',
        # 'p30080': 'plt',
        # 'p30120': 'lymph',
        # 'p30130': 'mono',
        # 'p30140': 'neut',
        # 'p30150': 'eos',
        # 'p30620': 'alt',
        # 'p30650': 'ast',
        # 'p30670': 'bun',
        # 'p30690': 'cholesterol',
        # 'p30700': 'creatinine',
        # 'p30730': 'ggt',
        # 'p30740': 'glucose',
        # 'p30760': 'hdl',
        # 'p30780': 'ldl',
        # 'p30870': 'triglycerides',
        'p30880': 'ua',
    }

    for trait, trait_name in trait_dict.items():
        gwas_file = os.path.join(gwas_base_dir, f"{trait}_int.merged.glm.linear")
        output_path = os.path.join(output_base_dir, f"{trait_name}.ma")

        # Report missing inputs and move on to the next trait.
        if not os.path.exists(gwas_file):
            print(f"File {gwas_file} does not exist.")
            continue

        process_single_file(gwas_file, output_path)
        print(f"Processed {gwas_file} and saved to {output_path}")