In [None]:
# Step 1: get the training data for the two populations (EUR and EAS)
import pandas as pd
import sys
import os
import scipy.stats as stats

# Value of each supported effect column that corresponds to "no effect";
# effects below it get a negative Z-score sign.
_EFFECT_NULL = {"BETA": 0, "OR": 1}


def _read_gwas_table(file_path, preferred_effect, fallback_effect):
    """Load the columns needed for the conversion.

    Only the header is read first to decide which effect column is present:
    ``preferred_effect`` is used when the file has it, otherwise
    ``fallback_effect``.

    Returns:
        A ``(table, effect_col)`` tuple, where ``effect_col`` names the effect
        column that was actually loaded.

    Raises:
        ValueError: raised by ``pandas.read_csv`` when a required column
            (including the chosen effect column) is missing from the file.
    """
    header = pd.read_csv(file_path, sep='\t', nrows=0).columns
    effect_col = preferred_effect if preferred_effect in header else fallback_effect
    table = pd.read_csv(
        file_path,
        sep='\t',
        usecols=["ID", "REF", "ALT", effect_col, "P", "OBS_CT"],
    )
    return table, effect_col


def _signed_z(p_values, effect, null_value):
    """Turn two-sided p-values into Z-scores signed by the effect direction."""
    # Clip so that P == 0 does not map to an infinite Z-score.
    p = p_values.clip(1e-300, 1)
    magnitude = pd.Series(-stats.norm.ppf(p / 2), index=p.index)
    # +1 when effect >= null_value, -1 when it is below.
    direction = (effect >= null_value).astype(float) * 2 - 1
    # A missing effect has no known direction: leave Z as NaN so the row is
    # dropped later instead of silently receiving a positive sign.
    return magnitude * direction.where(effect.notna())


def process_single_file(file_path, output_path):
    """Convert one GWAS result table into a tab-separated SNP/A1/A2/Z/N file.

    Files ending in ``.glm.linear`` use the ``BETA`` column when present and
    fall back to ``OR``; files ending in ``.glm.logistic`` use ``OR`` when
    present and fall back to ``BETA``. Any other file name is reported as
    unsupported and nothing is written.

    The Z-score magnitude is derived from the two-sided p-value in ``P`` and
    its sign from the effect column (negative when BETA < 0 or OR < 1). Rows
    with a missing value in any output column are dropped.

    Args:
        file_path: path of the tab-separated input table; it must contain the
            columns ID, REF, ALT, P, OBS_CT and one of BETA / OR.
        output_path: path of the tab-separated file to write, with header
            ``SNP  A1  A2  Z  N``.

    Returns:
        None. The result is written to ``output_path`` and a status line is
        printed.
    """
    if file_path.endswith('.glm.linear'):
        effect_order = ("BETA", "OR")
    elif file_path.endswith('.glm.logistic'):
        effect_order = ("OR", "BETA")
    else:
        print(f"Unsupported file format: {file_path}")
        return

    table, effect_col = _read_gwas_table(file_path, *effect_order)
    z_score = _signed_z(table["P"], table[effect_col], _EFFECT_NULL[effect_col])

    # NOTE(review): ALT is written as the effect allele (A1) and REF as A2.
    # If these tables come from a tool that reports the tested allele in its
    # own column (e.g. an "A1" column), confirm it always equals ALT here.
    result = pd.DataFrame({
        "SNP": table["ID"],
        "A1": table["ALT"],
        "A2": table["REF"],
        "Z": z_score,
        "N": table["OBS_CT"],
    })
    # remove NA
    result = result.dropna(subset=["SNP", "N", "Z", "A1", "A2"])
    result.to_csv(output_path, sep='\t', index=False, header=True)
    print(f"Processed {file_path} and saved to {output_path}")

def process_directory(dir_path, output_dir):
    """Run ``process_single_file`` on every regular file directly in a directory.

    Entries that are not regular files are reported and skipped. The output
    file name is the input name with every ``.glm.linear`` and
    ``.glm.logistic`` occurrence replaced by ``.txt``.

    Args:
        dir_path: directory whose entries are listed (not recursive).
        output_dir: directory the converted files are written into.
    """
    for entry in os.listdir(dir_path):
        source = os.path.join(dir_path, entry)

        # Guard clause: anything that is not a regular file is only reported.
        if not os.path.isfile(source):
            print(f"Skipping non-file item: {source}")
            continue

        target_name = entry
        for suffix in ('.glm.linear', '.glm.logistic'):
            target_name = target_name.replace(suffix, '.txt')

        process_single_file(source, os.path.join(output_dir, target_name))

if __name__ == "__main__":
    # Input directories holding the GWAS result tables to convert.
    eur_gwas_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/merged_gwas/White_British/gwas/"
    eas_gwas_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/gwas/gwas/"
    # Output directories that receive the converted SNP/A1/A2/Z/N files.
    eur_output_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/sdprx/train/EUR/"
    eas_output_dir = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/sdprx/train/EAS/"

    # exist_ok=True creates the directory only when needed, without the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(eur_output_dir, exist_ok=True)
    os.makedirs(eas_output_dir, exist_ok=True)

    process_directory(eur_gwas_dir, eur_output_dir)
    process_directory(eas_gwas_dir, eas_output_dir)