In [None]:
# create ref/mix from meth_atlas
import pandas as pd
import os

# Method names; main() uses each one as a directory name and as the prefix of
# the `<method>_15.csv` reference table.
METHODS = ['uxm', 'celfie', 'celfeer']
# Mixture distribution names; main() reads `mix_<dist>.csv` for each and writes
# its outputs into a directory named after it.
DISTRIBUTION = ['uniform_dis', 'cfsort_dis', 'dirichlet_dis']
# Headerless tab-separated table read by load_data(), which labels its four
# columns chrom, start, end, cpg_idx.
HG38_BED_PATH = '~/bed_hg38/bed_hg38.bed'

def load_data(file_path, sep='\t', header=None):
    """Read a four-column delimited table and label its columns.

    Args:
        file_path: Path or buffer handed to ``pd.read_table``.
        sep: Field delimiter (tab by default).
        header: Row number to use as the header, or ``None`` when the file
            has no header row.

    Returns:
        A DataFrame whose columns are named ``chrom``, ``start``, ``end`` and
        ``cpg_idx``, in that order.
    """
    bed_columns = ['chrom', 'start', 'end', 'cpg_idx']
    table = pd.read_table(file_path, sep=sep, header=header)
    # Labels are applied by position, so the table must have exactly four
    # columns; any other width raises a length-mismatch ValueError.
    return table.set_axis(bed_columns, axis=1)

def create_directory(directory_path):
    """Create ``directory_path`` (including missing parents) if it is absent.

    Args:
        directory_path: Directory to create.

    Raises:
        FileExistsError: If the path already exists but is not a directory.
    """
    # exist_ok=True makes the call idempotent without a separate existence
    # check, which could race with another process creating the directory.
    os.makedirs(directory_path, exist_ok=True)

def filter_and_merge_data(df_hg38, df_dirichlet, df_ref):
    """Restrict mixture and reference tables to shared CpGs and add coordinates.

    Args:
        df_hg38: Coordinate table with columns ``chrom``, ``start``, ``end``
            and ``cpg_idx``.
        df_dirichlet: Mixture table containing a ``cpg_idx`` column.
        df_ref: Reference table containing a ``cpg_idx`` column.

    Returns:
        A tuple ``(df_merged, df_merged_ref)``: the mixture table and the
        reference table, each limited to ``cpg_idx`` values present in both,
        left-merged with the coordinate columns and sorted by ``cpg_idx``.
        The input frames are not modified.
    """
    # Build the filtered coordinate table in one expression; calling
    # dropna(inplace=True) on a filtered slice triggers SettingWithCopyWarning.
    in_mixture = df_hg38['cpg_idx'].isin(df_dirichlet['cpg_idx'])
    coords = df_hg38[in_mixture].dropna()

    # Keep only the CpGs that appear in both the reference and the mixture.
    df_ref = df_ref[df_ref['cpg_idx'].isin(df_dirichlet['cpg_idx'])]
    df_dirichlet = df_dirichlet[df_dirichlet['cpg_idx'].isin(df_ref['cpg_idx'])]

    # NOTE(review): how='left' keeps CpGs that are missing from df_hg38, with
    # NaN chrom/start/end - confirm that this is intended.
    df_merged_ref = df_ref.merge(coords, on='cpg_idx', how='left')
    df_merged = df_dirichlet.merge(coords, on='cpg_idx', how='left')

    df_merged = df_merged.sort_values(by='cpg_idx')
    df_merged_ref = df_merged_ref.sort_values(by='cpg_idx')

    return df_merged, df_merged_ref

def save_data(df, file_path, sep='\t', index=False, header=True):
    """Write ``df`` as delimited text.

    Args:
        df: DataFrame to serialise.
        file_path: Destination path or writable buffer.
        sep: Field delimiter (tab by default).
        index: Whether to write the row index.
        header: Whether to write the column names; a falsy value such as
            ``None`` omits the header row.
    """
    csv_options = {'sep': sep, 'index': index, 'header': header}
    df.to_csv(file_path, **csv_options)

def process_distribution(df_merged, dist, rep_dir):
    """Write one headerless BED-style file per replicate column of ``dist``.

    For every replicate index ``i`` the column ``{dist}_{i:02d}_1`` is taken
    from ``df_merged``, cast to float, and written next to the coordinate
    columns to ``{rep_dir}/{dist}_{i:02d}.bed``.

    Args:
        df_merged: Table holding ``chrom``, ``start``, ``end``, ``cpg_idx``
            and the replicate columns.
        dist: Distribution name used as the column and file name prefix.
        rep_dir: Output directory; it must already exist.

    Raises:
        KeyError: If a coordinate column or a replicate column is missing.
    """
    # Replicates are numbered 0-99 for these two names and 1-100 otherwise.
    if dist in ('uniform_dis', 'dirichlet_0.5'):
        replicate_ids = range(0, 100)
    else:
        replicate_ids = range(1, 101)

    # The sort key is the same for every replicate, so order the rows once.
    ordered = df_merged.sort_values('cpg_idx')
    coords = ordered[['chrom', 'start', 'end', 'cpg_idx']]

    for i in replicate_ids:
        # `{i:02d}` already renders 99 and 100 unpadded, so no special case
        # is needed for the upper end of the range.
        col = f'{dist}_{i:02d}_1'
        # Work on an explicit copy and replace the column outright. Assigning
        # through `.loc[:, col]` on a slice emits SettingWithCopyWarning and,
        # on pandas >= 2.0, writes in place so an integer column can keep its
        # integer dtype despite the float cast.
        bed = coords.copy()
        bed[col] = ordered[col].astype(float)
        save_data(bed, f'{rep_dir}/{dist}_{i:02d}.bed', header=None)

def main():
    """Build the reference and per-replicate mixture files for every method.

    For each entry of ``METHODS`` and ``DISTRIBUTION`` this reads the mixture
    and reference CSVs, joins them with the hg38 coordinate table, writes the
    reference as ``{method}_15.bed`` (with header) and then writes one file
    per replicate through ``process_distribution``.
    """
    exchange_root = '/home/sty/cfDNA_benchmark/exchange_marker'
    df_hg38 = load_data(HG38_BED_PATH)

    for method in METHODS:
        # The reference table depends only on the method, so it is read once
        # here rather than once per distribution.
        df_ref = pd.read_csv(f'{exchange_root}/{method}/{method}_15.csv', na_filter=True).dropna()

        for dist in DISTRIBUTION:
            rep_dir = f'{exchange_root}/cfnome/{method}/{dist}'
            create_directory(rep_dir)

            df_dirichlet = pd.read_csv(f'{exchange_root}/{method}/mix_{dist}.csv', na_filter=True).dropna()

            df_merged, df_merged_ref = filter_and_merge_data(df_hg38, df_dirichlet, df_ref)

            # The merge appends chrom/start/end as the last three columns;
            # move them to the front of the reference table.
            cols = df_merged_ref.columns.tolist()
            df_merged_ref = df_merged_ref[cols[-3:] + cols[:-3]]

            save_data(df_merged_ref, f'{rep_dir}/{method}_15.bed')
            process_distribution(df_merged, dist, rep_dir)

# Run the pipeline only when this code is executed as the main module.
if __name__ == "__main__":
    main()

In [None]:
# create ref/mix from meth_atlas
import pandas as pd
import os
import subprocess
from concurrent.futures import ThreadPoolExecutor

# Method names; main() uses each one as a directory name and as the prefix of
# the `<method>_15.csv` reference table.
METHODS = ['uxm', 'celfie', 'celfeer']
# Mixture distribution names; main() reads `mix_<dist>.csv` for each. Only
# 'dirichlet_0.5' is handled by process_distribution() and process_deconv().
DISTRIBUTION = ['dirichlet_0.5']
# Headerless tab-separated table read by load_data(), which labels its four
# columns chrom, start, end, cpg_idx.
HG38_BED_PATH = '~/bed_hg38/bed_hg38.bed'

def load_data(file_path, sep='\t', header=None):
    """Read a four-column delimited table and label its columns.

    Args:
        file_path: Path or buffer handed to ``pd.read_table``.
        sep: Field delimiter (tab by default).
        header: Row number to use as the header, or ``None`` when the file
            has no header row.

    Returns:
        A DataFrame whose columns are named ``chrom``, ``start``, ``end`` and
        ``cpg_idx``, in that order.
    """
    bed_columns = ['chrom', 'start', 'end', 'cpg_idx']
    table = pd.read_table(file_path, sep=sep, header=header)
    # Labels are applied by position, so the table must have exactly four
    # columns; any other width raises a length-mismatch ValueError.
    return table.set_axis(bed_columns, axis=1)

def create_directory(directory_path):
    """Create ``directory_path`` (including missing parents) if it is absent.

    Args:
        directory_path: Directory to create.

    Raises:
        FileExistsError: If the path already exists but is not a directory.
    """
    # exist_ok=True makes the call idempotent without a separate existence
    # check, which could race with another process creating the directory.
    os.makedirs(directory_path, exist_ok=True)

def filter_and_merge_data(df_hg38, df_dirichlet, df_ref):
    """Restrict mixture and reference tables to shared CpGs and add coordinates.

    Args:
        df_hg38: Coordinate table with columns ``chrom``, ``start``, ``end``
            and ``cpg_idx``.
        df_dirichlet: Mixture table containing a ``cpg_idx`` column.
        df_ref: Reference table containing a ``cpg_idx`` column.

    Returns:
        A tuple ``(df_merged, df_merged_ref)``: the mixture table and the
        reference table, each limited to ``cpg_idx`` values present in both,
        left-merged with the coordinate columns and sorted by ``cpg_idx``.
        The input frames are not modified.
    """
    # Build the filtered coordinate table in one expression; calling
    # dropna(inplace=True) on a filtered slice triggers SettingWithCopyWarning.
    in_mixture = df_hg38['cpg_idx'].isin(df_dirichlet['cpg_idx'])
    coords = df_hg38[in_mixture].dropna()

    # Keep only the CpGs that appear in both the reference and the mixture.
    df_ref = df_ref[df_ref['cpg_idx'].isin(df_dirichlet['cpg_idx'])]
    df_dirichlet = df_dirichlet[df_dirichlet['cpg_idx'].isin(df_ref['cpg_idx'])]

    # NOTE(review): how='left' keeps CpGs that are missing from df_hg38, with
    # NaN chrom/start/end - confirm that this is intended.
    df_merged_ref = df_ref.merge(coords, on='cpg_idx', how='left')
    df_merged = df_dirichlet.merge(coords, on='cpg_idx', how='left')

    df_merged = df_merged.sort_values(by='cpg_idx')
    df_merged_ref = df_merged_ref.sort_values(by='cpg_idx')

    return df_merged, df_merged_ref

def save_data(df, file_path, sep='\t', index=False, header=True):
    """Write ``df`` as delimited text.

    Args:
        df: DataFrame to serialise.
        file_path: Destination path or writable buffer.
        sep: Field delimiter (tab by default).
        index: Whether to write the row index.
        header: Whether to write the column names; a falsy value such as
            ``None`` omits the header row.
    """
    csv_options = {'sep': sep, 'index': index, 'header': header}
    df.to_csv(file_path, **csv_options)

def process_distribution(df_merged, dist, rep_dir):
    """Write one headerless BED-style file per ``dirichlet`` replicate column.

    Only ``dist == 'dirichlet_0.5'`` is handled; any other value is a no-op.
    For each replicate index ``i`` in 0-99 the column ``dirichlet_{i:02d}_1``
    is cast to float and written next to the coordinate columns to
    ``{rep_dir}/dirichlet_{i:02d}.bed``.

    Args:
        df_merged: Table holding ``chrom``, ``start``, ``end``, ``cpg_idx``
            and the replicate columns.
        dist: Distribution name selecting whether anything is written.
        rep_dir: Output directory; it must already exist.

    Raises:
        KeyError: If a coordinate column or a replicate column is missing.
    """
    if dist != 'dirichlet_0.5':
        return

    # The sort key is the same for every replicate, so order the rows once.
    ordered = df_merged.sort_values('cpg_idx')
    coords = ordered[['chrom', 'start', 'end', 'cpg_idx']]

    for i in range(0, 100):
        # `{i:02d}` already renders 99 unpadded, so no special case is needed.
        col = f'dirichlet_{i:02d}_1'
        # Work on an explicit copy and replace the column outright. Assigning
        # through `.loc[:, col]` on a slice emits SettingWithCopyWarning and,
        # on pandas >= 2.0, writes in place so an integer column can keep its
        # integer dtype despite the float cast.
        bed = coords.copy()
        bed[col] = ordered[col].astype(float)
        save_data(bed, f'{rep_dir}/dirichlet_{i:02d}.bed', header=None)

def process_deconv(df_merged, dist, rep_dir, median):
    """Run the external deconvolution script on every replicate file.

    Only ``dist == 'dirichlet_0.5'`` is handled; any other value is a no-op.
    For each replicate index 0-99 the script is started as a subprocess with
    ``{rep_dir}/{median}_15.bed`` as the reference and
    ``{rep_dir}/dirichlet_{i:02d}.bed`` as the sample, writing
    ``{rep_dir}/dirichlet_{i:02d}.tsv``. Up to 15 subprocesses run at once.

    Args:
        df_merged: Unused; kept so existing callers keep working.
        dist: Distribution name selecting whether anything is run.
        rep_dir: Directory holding the input files and receiving the outputs.
        median: Method name used as the prefix of the reference file name.

    Warns:
        RuntimeWarning: If any subprocess exits with a non-zero status. The
            failing replicate indices are listed so they are not silently
            missing from later steps.
    """
    import warnings

    if dist != 'dirichlet_0.5':
        return

    def run_cmd(i):
        # Return the exit status so failures can be reported by the caller.
        cmd = [
            "python", '/home/sty/cfNOMe/methylation_deconvolution.py',
            f'{rep_dir}/{median}_15.bed',
            f'{rep_dir}/dirichlet_{i:02d}.bed',
            "--ineq", "--verbose", "--ref_header", "True",
            "-o", f"{rep_dir}/dirichlet_{i:02d}.tsv"
        ]
        return subprocess.run(cmd).returncode

    replicate_ids = range(0, 100)
    # Threads only wait on child processes here, so a thread pool is enough
    # to keep several subprocesses running in parallel.
    with ThreadPoolExecutor(max_workers=15) as executor:
        futures = [executor.submit(run_cmd, i) for i in replicate_ids]
        exit_codes = [future.result() for future in futures]

    failed = [i for i, code in zip(replicate_ids, exit_codes) if code != 0]
    if failed:
        warnings.warn(
            f'deconvolution failed for {len(failed)} replicate(s) in '
            f'{rep_dir}: {failed}',
            RuntimeWarning,
        )
            
def main():
    """Build the input files for every method and run the deconvolution.

    For each entry of ``METHODS`` and ``DISTRIBUTION`` this reads the mixture
    and reference CSVs, joins them with the hg38 coordinate table, writes the
    reference as ``{method}_15.bed`` (with header), writes one file per
    replicate through ``process_distribution`` and finally calls
    ``process_deconv`` on the same directory.
    """
    exchange_root = '/home/sty/cfDNA_benchmark/exchange_marker'
    df_hg38 = load_data(HG38_BED_PATH)

    for method in METHODS:
        # The reference table depends only on the method, so it is read once
        # here rather than once per distribution.
        df_ref = pd.read_csv(f'{exchange_root}/{method}/{method}_15.csv', na_filter=True).dropna()

        for dist in DISTRIBUTION:
            rep_dir = f'{exchange_root}/cfnome/{method}/{dist}'
            create_directory(rep_dir)

            df_dirichlet = pd.read_csv(f'{exchange_root}/{method}/mix_{dist}.csv', na_filter=True).dropna()

            df_merged, df_merged_ref = filter_and_merge_data(df_hg38, df_dirichlet, df_ref)

            # The merge appends chrom/start/end as the last three columns;
            # move them to the front of the reference table.
            cols = df_merged_ref.columns.tolist()
            df_merged_ref = df_merged_ref[cols[-3:] + cols[:-3]]

            save_data(df_merged_ref, f'{rep_dir}/{method}_15.bed')
            process_distribution(df_merged, dist, rep_dir)
            process_deconv(df_merged, dist, rep_dir, method)
# Run the pipeline only when this code is executed as the main module.
if __name__ == "__main__":
    main()

In [None]:
# run deconvolution

In [None]:
# calculate evaluation metrics