In [None]:
import pandas as pd
import numpy as np
import subprocess

# Input/output locations used by the conversion driver below.
# Directory holding the per-chromosome EUR BIM files; the driver reads
# "chr<N>.bim" from it for N = 1..22.
eur_bim_file_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/UKB/geno/White_British/0_sample_qc/"
# Single EAS BIM file, passed to convert_to_ref_format as-is.
eas_bim_file_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/real_data/CAS/geno/CAS_final/CAS_merged_qc_final.bim"
# Reference BIM files whose SNP column the input variants are matched against
# (one for the EUR inputs, one for the EAS input).
eur_ref_file_path = "/data1/jiapl_group/lishuhua/software/PRS/CT_SLEB/reference/EUR/merged/chr_all.bim"
eas_ref_file_path = "/data1/jiapl_group/lishuhua/software/PRS/CT_SLEB/reference/EAS/merged/chr_all.bim"
# Directory that receives the filtered outputs ("eas_ref.txt", "eur_chr<N>_ref.txt").
output_base_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ctsleb/reference/"

def convert_to_ref_format(
    input_bim: str,
    output_bim: str,
    ref_bim: str,
) -> None:
    """
    Keep only the variants of a BIM file whose lookup ID occurs in a reference BIM.

    A lookup ID is built for every input variant and compared with the SNP
    column of the reference file:

    * ``<snp>:<pos>:<a2>:<a1>`` when the input SNP ID starts with ``rs``;
    * ``<chr>:<pos>:<a2>:<a1>`` otherwise.

    Rows whose lookup ID is found are written to ``output_bim`` with their
    original six BIM columns (the SNP ID itself is NOT rewritten), tab-separated
    and without a header. Shapes of the input, reference and filtered tables
    are printed along the way.

    Args:
        input_bim (str): Path to the input BIM file (tab-separated, no header).
        output_bim (str): Path to save the filtered BIM rows.
        ref_bim (str): Path to the reference BIM file (space-separated, no header).

    Returns:
        None. When no variant matches the reference, a message is printed and
        no output file is written.
    """
    bim_columns = ["chr", "snp", "cm", "pos", "a1", "a2"]

    bim_df = pd.read_csv(input_bim, sep="\t", header=None)
    ref_df = pd.read_csv(ref_bim, sep=" ", header=None)
    print(f"Input BIM file shape: {bim_df.shape}")
    print(f"Reference BIM file shape: {ref_df.shape}")
    bim_df.columns = bim_columns
    ref_df.columns = bim_columns

    # Build the lookup IDs in one pass over the columns. This formats each
    # value exactly like a per-row f-string would, but avoids the cost of
    # DataFrame.apply(axis=1), which materialises a Series for every row.
    check_snp = pd.Series(
        [
            f"{snp if str(snp).startswith('rs') else chrom}:{pos}:{a2}:{a1}"
            for chrom, snp, pos, a1, a2 in zip(
                bim_df["chr"],
                bim_df["snp"],
                bim_df["pos"],
                bim_df["a1"],
                bim_df["a2"],
            )
        ],
        index=bim_df.index,
        dtype=object,
    )

    # A membership test is all that is needed to find the overlap; a merge
    # would only add memory pressure (and duplicate rows for repeated IDs).
    in_reference = check_snp.isin(ref_df["snp"])
    if not in_reference.any():
        print("No overlapping SNPs found.")
        return

    # Keep the original six columns, in input order, for the matching rows.
    filtered_df = bim_df.loc[in_reference, bim_columns]
    print(f"Filtered BIM file shape: {filtered_df.shape}")
    filtered_df.to_csv(output_bim, sep="\t", header=False, index=False)

if __name__ == "__main__":
    # EAS: one BIM file is filtered against the EAS reference in a single call.
    convert_to_ref_format(
        eas_bim_file_path,
        f"{output_base_path}/eas_ref.txt",
        eas_ref_file_path,
    )

    # EUR: one BIM file per chromosome (chr1 through chr22), each filtered
    # against the same EUR reference and written to its own output file.
    for chrom in range(1, 23):
        eur_input = f"{eur_bim_file_path}/chr{chrom}.bim"
        eur_output = f"{output_base_path}/eur_chr{chrom}_ref.txt"
        convert_to_ref_format(
            input_bim=eur_input,
            output_bim=eur_output,
            ref_bim=eur_ref_file_path,
        )

In [None]:
# Merge the per-chromosome EUR outputs ("eur_chr<N>_ref.txt") into one file.
import glob
import os
import re
import subprocess

output_base_path = "/data1/jiapl_group/lishuhua/project/PRS_benchmark/software/ctsleb/reference/"


def _chrom_sort_key(path):
    """Order per-chromosome files numerically (chr2 before chr10)."""
    match = re.search(r"eur_chr(\d+)_ref\.txt$", path)
    # Files without a numeric chromosome, if any, are placed after the numbered ones.
    return (0, int(match.group(1)), path) if match else (1, 0, path)


# Expand the pattern in Python rather than in a shell: a plain shell glob
# yields lexicographic order (chr1, chr10, ..., chr2, ...) and needs shell=True.
chunk_paths = sorted(
    glob.glob(os.path.join(output_base_path, "eur_chr*_ref.txt")),
    key=_chrom_sort_key,
)
if not chunk_paths:
    # Without this guard `cat` would be started with no operands and wait on stdin.
    raise FileNotFoundError(
        f"No per-chromosome files matching eur_chr*_ref.txt in {output_base_path}"
    )

# The merged name "eur_ref.txt" does not match the pattern above, so a stale
# merged file from an earlier run is never fed back into itself.
with open(os.path.join(output_base_path, "eur_ref.txt"), "wb") as merged_file:
    subprocess.run(["cat", *chunk_paths], stdout=merged_file, check=True)
print("EUR reference files merged successfully.")