In [None]:
# Generating reference markers with different missing proportions by randomly introducing missingness. 
import pandas as pd
import numpy as np
import os
import random

# Median values to process: 15..50 in steps of 5, then 60..100 in steps of 10.
median_values = np.concatenate([np.arange(15, 51, 5), np.arange(60, 101, 10)])

# Fractions of reference rows to remove, and the number of random replicates
# generated for each fraction.
drop_fractions = [0.1, 0.2, 0.3, 0.4, 0.5]
n_replicates = 20

# Dedicated seeded generator: the replicates differ from one another (the
# generator keeps advancing) but a Restart & Run All reproduces the same files.
# A private instance is used so the global `random` state is left untouched.
rng = random.Random(0)

for median in median_values:
    # A 'drop' directory is created for every median. Nothing in this cell
    # writes into it; it is kept because other steps may rely on it existing.
    folder_path = f'ref_median_{median}/drop'
    os.makedirs(folder_path, exist_ok=True)

    # The source table depends only on the median, so it is read once here
    # instead of once per (fraction, replicate) pair.
    ref_file_path = f'ref_median_{median}/ref_depth_{median}_unique.csv'
    ref_full_df = pd.read_csv(ref_file_path)
    n_rows = len(ref_full_df)

    for percentage in drop_fractions:
        # One output directory per drop fraction, created once per fraction.
        drop_folder_path = f'ref_median_{median}/drop_{percentage}_col'
        os.makedirs(drop_folder_path, exist_ok=True)

        # Number of rows to remove; int() truncates toward zero.
        num_rows_to_remove = int(n_rows * percentage)

        for k in range(n_replicates):
            # Draw distinct row positions without replacement.
            indices_to_remove = rng.sample(range(n_rows), num_rows_to_remove)

            # Translate positions to index labels before dropping, so the
            # full table is never mutated and each replicate starts from it.
            ref_df = ref_full_df.drop(ref_full_df.index[indices_to_remove])

            # Write the reduced reference table without the index column.
            ref_drop_file_path = f'{drop_folder_path}/reference_median_{median}_{k}.csv'
            ref_df.to_csv(ref_drop_file_path, index=False)

In [None]:
# Generate data for mixture under uniform distribution 
import pandas as pd
import numpy as np
import os
import random

# Median values to process: 15..50 in steps of 5, then 60..100 in steps of 10.
median_values = np.concatenate([np.arange(15, 51, 5), np.arange(60, 101, 10)])

for median in median_values:
    # Output root for this median.
    folder_path = f'/cfDNA_benchmark/meth_atlas_data/uniform_dis/drop_col/ref_median_{median}'
    os.makedirs(folder_path, exist_ok=True)

    # The mixture table depends only on the median, so it is read once here
    # rather than once per (percent, replicate) pair.
    mix_file_path = f'/cfDNA_benchmark/meth_atlas_data/uniform_dis/uniform_mix/full_atlas_uniform_median_{median}.csv'
    mix_full_df = pd.read_csv(mix_file_path)

    for percent in [0.1, 0.2, 0.3, 0.4, 0.5]:
        # One output directory per drop fraction; created once, before the
        # replicate loop, so no further existence checks are needed below.
        drop_folder_path = f'{folder_path}/drop_{percent}_col'
        os.makedirs(drop_folder_path, exist_ok=True)

        # Directory holding the already-reduced reference replicates.
        # NOTE(review): these files must exist under this absolute path before
        # the cell runs - presumably produced by an earlier step; confirm.
        origin_drop_folder = f'/cfDNA_benchmark/meth_atlas_data/ref_median_{median}/drop_{percent}_col'

        for k in range(20):
            ref_drop_file_origin = f'{origin_drop_folder}/reference_median_{median}_{k}.csv'
            ref_df = pd.read_csv(ref_drop_file_origin)

            # Keep only the mixture rows whose 'cpg_idx' also appears in this
            # reference replicate. Boolean indexing returns a new frame, so the
            # full mixture table stays intact for the next replicate.
            mix_df = mix_full_df[mix_full_df['cpg_idx'].isin(ref_df['cpg_idx'])]

            # Save the reference replicate and its matching mixture side by side.
            ref_drop_file_path = f'{drop_folder_path}/reference_median_{median}_{k}.csv'
            mix_drop_file_path = f'{drop_folder_path}/fa_uniform_median_{median}_{k}.csv'

            ref_df.to_csv(ref_drop_file_path, index=False)
            mix_df.to_csv(mix_drop_file_path, index=False)

In [None]:
# Generate data for mixture under constrained random distribution (CRD)
import pandas as pd
import numpy as np
import os
import random

# Median values to process: 15..50 in steps of 5, then 60..100 in steps of 10.
median_values = np.concatenate([np.arange(15, 51, 5), np.arange(60, 101, 10)])

for median in median_values:
    # Output root for this median.
    folder_path = f'/cfDNA_benchmark/meth_atlas_data/crd_dis/drop_col/ref_median_{median}'
    os.makedirs(folder_path, exist_ok=True)

    # The mixture table depends only on the median, so it is read once here
    # rather than once per (percent, replicate) pair.
    mix_file_path = f'/cfDNA_benchmark/meth_atlas_data/crd_dis/crd_mix/full_atlas_crd_median_{median}.csv'
    mix_full_df = pd.read_csv(mix_file_path)

    for percent in [0.1, 0.2, 0.3, 0.4, 0.5]:
        # One output directory per drop fraction; created once, before the
        # replicate loop, so no further existence checks are needed below.
        drop_folder_path = f'{folder_path}/drop_{percent}_col'
        os.makedirs(drop_folder_path, exist_ok=True)

        # Directory holding the already-reduced reference replicates.
        # NOTE(review): these files must exist under this absolute path before
        # the cell runs - presumably produced by an earlier step; confirm.
        origin_drop_folder = f'/cfDNA_benchmark/meth_atlas_data/ref_median_{median}/drop_{percent}_col'

        for k in range(20):
            ref_drop_file_origin = f'{origin_drop_folder}/reference_median_{median}_{k}.csv'
            ref_df = pd.read_csv(ref_drop_file_origin)

            # Keep only the mixture rows whose 'cpg_idx' also appears in this
            # reference replicate. Boolean indexing returns a new frame, so the
            # full mixture table stays intact for the next replicate.
            mix_df = mix_full_df[mix_full_df['cpg_idx'].isin(ref_df['cpg_idx'])]

            # Save the reference replicate and its matching mixture side by side.
            ref_drop_file_path = f'{drop_folder_path}/reference_median_{median}_{k}.csv'
            mix_drop_file_path = f'{drop_folder_path}/fa_crd_median_{median}_{k}.csv'

            ref_df.to_csv(ref_drop_file_path, index=False)
            mix_df.to_csv(mix_drop_file_path, index=False)