In [None]:
#!/usr/bin/env python3
# Author: Tongyue Sun

In [1]:
import os
import shutil
# glob is needed for the wildcard matching of source files further down
import glob
# Base directory that receives one sub-folder per tissue (placeholder path)
parent_dir = '/target_output_dir'

# Tissue folder names to create up front (placeholder list)
folders = ['Tissue_names_list_here']

# Create every tissue folder; exist_ok=True makes re-running the cell harmless
for folder in folders:
    full_path = os.path.join(parent_dir, folder)
    os.makedirs(full_path, exist_ok=True)

# Directory holding the original data and base directory for the per-tissue copies
# (placeholder paths)
original_data_dir = "/original_data_path"
target_base_dir = "/target_output_dir"

# data_ID_Tissues.tsv layout: the first row holds the tissue (= target folder) of each
# column, the second row the RAW data ID (= file stem) of the same column.
with open('data_ID_Tissues.tsv', 'r') as tsv_file:
    lines = tsv_file.readlines()

if len(lines) < 2:
    raise ValueError("data_ID_Tissues.tsv must contain a tissue row and a data ID row")

folder_names = [name.strip() for name in lines[0].split('\t')]
file_stems = [stem.strip() for stem in lines[1].split('\t')]

# zip() below stops at the shorter row, so make a column mismatch visible
if len(folder_names) != len(file_stems):
    print(f"Warning: data_ID_Tissues.tsv has {len(folder_names)} tissue entries but "
          f"{len(file_stems)} data IDs; unmatched trailing columns are ignored.")

# Copy every file belonging to a data ID into the folder of its tissue
for folder_name, file_stem in zip(folder_names, file_stems):
    # An empty cell (e.g. caused by a trailing tab) would either target the base
    # directory itself or match every file starting with "_", so skip such columns
    if not folder_name or not file_stem:
        continue

    target_folder = os.path.join(target_base_dir, folder_name)
    os.makedirs(target_folder, exist_ok=True)

    # The trailing wildcard matches "<stem>_*.pat.gz" and any suffixed companion
    # such as "<stem>_*.pat.gz.csi"
    source_glob_pattern = os.path.join(original_data_dir, f"{file_stem}_*.pat.gz*")

    for src_file in glob.glob(source_glob_pattern):
        dst_file = os.path.join(target_folder, os.path.basename(src_file))
        shutil.copy(src_file, dst_file)

In [None]:
# Copy the .beta files of every data ID into the folder of its tissue.
# Relies on folder_names, file_stems, original_data_dir and target_base_dir
# defined by the cell that copies the .pat.gz files.
for folder_name, file_stem in zip(folder_names, file_stems):
    # An empty cell would either target the base directory itself or match every
    # file starting with "_", so skip such columns
    if not folder_name or not file_stem:
        continue

    target_folder = os.path.join(target_base_dir, folder_name)
    # exist_ok=True avoids the check-then-create race and keeps the cell re-runnable
    os.makedirs(target_folder, exist_ok=True)

    source_glob_pattern = os.path.join(original_data_dir, f"{file_stem}_*.beta")
    for src_file in glob.glob(source_glob_pattern):
        dst_file = os.path.join(target_folder, os.path.basename(src_file))
        shutil.copy(src_file, dst_file)


In [3]:
import os
import random
import shutil

# Root directory that contains one sub-folder per tissue (placeholder path)
root_dir = 'target directory paths'
ref_dir_name = 'ref'
mix_dir_name = 'mix'

# Seed for the random ref/mix split. None leaves the random module untouched
# (non-deterministic split); set an integer to get the same split on every run.
split_seed = None
if split_seed is not None:
    random.seed(split_seed)


def _move_pat_with_index(pat_names, src_dir, dst_dir):
    """Move each .pat.gz in `pat_names` and its .pat.gz.csi index from `src_dir` to `dst_dir`.

    A missing index file is reported and skipped so that one absent .csi does not
    abort the split half-way through.
    """
    for pat_file in pat_names:
        csi_file = pat_file + '.csi'
        shutil.move(os.path.join(src_dir, pat_file), os.path.join(dst_dir, pat_file))
        src_csi = os.path.join(src_dir, csi_file)
        if os.path.exists(src_csi):
            shutil.move(src_csi, os.path.join(dst_dir, csi_file))
        else:
            print(f"Index file '{src_csi}' not found; moved '{pat_file}' without it.")


# Split the .pat.gz files of every tissue folder evenly into 'ref' and 'mix'
for folder in folders:
    folder_path = os.path.join(root_dir, folder)

    if not os.path.isdir(folder_path):
        print(f"Folder '{folder_path}' does not exist; skipping.")
        continue

    # Sorted so that a fixed split_seed yields the same split regardless of the
    # order in which the file system lists the directory
    pat_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.pat.gz'))
    if len(pat_files) < 2:
        print(f"Folder '{folder}' does not have enough .pat.gz files.")
        continue

    # Create 'ref' and 'mix' subdirectories within the current folder
    ref_subdir = os.path.join(folder_path, ref_dir_name)
    mix_subdir = os.path.join(folder_path, mix_dir_name)
    os.makedirs(ref_subdir, exist_ok=True)
    os.makedirs(mix_subdir, exist_ok=True)

    # With an odd number of files one randomly chosen file stays behind, so that
    # 'ref' and 'mix' receive the same number of files
    num_to_move = len(pat_files) - (len(pat_files) % 2)
    half_num = num_to_move // 2

    random.shuffle(pat_files)
    ref_pat_files = pat_files[:half_num]
    mix_pat_files = pat_files[half_num:num_to_move]

    _move_pat_with_index(ref_pat_files, folder_path, ref_subdir)
    _move_pat_with_index(mix_pat_files, folder_path, mix_subdir)

In [None]:
import subprocess

# The folder where the mixed data is finally stored (placeholder path).
target_dir = 'Final folder'

# The genome version used for the data; identical for every tissue
assembly_version = 'hg38'

# The -p arguments below point inside these directories (presumably an output
# prefix -- confirm against the wgbstools documentation), so create them up front
os.makedirs(os.path.join(target_dir, 'ref'), exist_ok=True)
os.makedirs(os.path.join(target_dir, 'mix'), exist_ok=True)

# Merge the 'ref' and the 'mix' files of every tissue into one output per tissue.
# NOTE: the commands are plain strings run through the shell, so paths containing
# spaces or shell metacharacters will break them.
for folder in folders:
    ref_folder_path = os.path.join(root_dir, folder, 'ref')
    mix_folder_path = os.path.join(root_dir, folder, 'mix')
    ref_target_path = os.path.join(target_dir, 'ref', folder)
    mix_target_path = os.path.join(target_dir, 'mix', folder)

    if not os.path.exists(ref_folder_path) or not os.path.exists(mix_folder_path):
        print(f"Folder '{folder}' does not have both ref and mix subfolders.")
        continue

    # Execute the merge commands and report a non-zero exit status instead of
    # silently continuing
    ref_merge_cmd = f'wgbstools merge {ref_folder_path}/*.pat.gz --genome {assembly_version} -f -p {ref_target_path}'
    ref_result = subprocess.run(ref_merge_cmd, shell=True)
    if ref_result.returncode != 0:
        print(f"Merge of ref files for '{folder}' failed with exit code {ref_result.returncode}.")

    mix_merge_cmd = f'wgbstools merge {mix_folder_path}/*.pat.gz --genome {assembly_version} -f -p {mix_target_path}'
    mix_result = subprocess.run(mix_merge_cmd, shell=True)
    if mix_result.returncode != 0:
        print(f"Merge of mix files for '{folder}' failed with exit code {mix_result.returncode}.")

In [None]:
import subprocess
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

def run_cmd(cmd):
    """Run a shell command using `subprocess.run` and report a failing exit status.

    Args:
        cmd: Command line as a single string; it is handed to the shell
            (`shell=True`).

    Returns:
        The `subprocess.CompletedProcess` of the finished command.
    """
    completed = subprocess.run(cmd, shell=True)
    if completed.returncode != 0:
        # The futures returned by executor.submit are never inspected, so a
        # failure has to be made visible here or it goes unnoticed
        print(f"Command failed with exit code {completed.returncode}: {cmd}")
    return completed

input_directory = os.path.join(target_dir, 'mix')
crd_output_directory = os.path.join(target_dir, 'crd_dis')
matrix_filepath = os.path.join(target_dir, 'crd_dis.csv')

# The -p argument below points inside this directory, so create it up front
os.makedirs(crd_output_directory, exist_ok=True)

# Sorted list of the .pat.gz files in the input directory
file_list = sorted(f for f in os.listdir(input_directory) if f.endswith(".pat.gz"))
print(file_list)

# Space-separated input paths; identical for every column, so build them once
input_files_arg = " ".join(os.path.join(input_directory, f) for f in file_list)

# Read the mixing matrix (crd_dis.csv): first column is an ID column, every
# further column provides the rates of one mixture
df = pd.read_csv(matrix_filepath)

# Get the number of columns excluding the ID column
num_columns = df.shape[1] - 1

# One rate per row is passed alongside one path per file; presumably both counts
# have to agree for mix_pat -- confirm against the wgbstools documentation
if len(df) != len(file_list):
    print(f"Warning: {matrix_filepath} has {len(df)} rows but {len(file_list)} "
          f".pat.gz files were found in {input_directory}.")

# Iterate through columns and execute commands concurrently; leaving the
# with-block waits for all submitted commands to finish
with ThreadPoolExecutor(max_workers=10) as executor:
    for col_index in range(1, num_columns + 1):
        # Extract the data for the current column as a space-separated string
        column_data = " ".join(df.iloc[:, col_index].astype(str))

        # Output name for this column; zero-padded for consistent length
        uniform_param = f"crd_dis_{str(col_index).zfill(2)}"

        # Construct the mix_pat command
        mix_pat_command = [
            "wgbstools",
            "mix_pat",
            input_files_arg,
            "--rates",
            column_data,
            "-p",
            os.path.join(crd_output_directory, uniform_param),
            "--genome",
            "hg38",
            "-c",
            "22"
        ]

        # Combine the command elements into a single string (run through the
        # shell, so paths containing spaces will break the command)
        mix_cmd_run = " ".join(mix_pat_command)
        print(f"Executing command: {mix_cmd_run}")

        # Submit the command to be executed asynchronously
        executor.submit(run_cmd, mix_cmd_run)

In [None]:
def run_cmd(cmd):
    """Run a shell command using `subprocess.run` and report a failing exit status.

    Args:
        cmd: Command line as a single string; it is handed to the shell
            (`shell=True`).

    Returns:
        The `subprocess.CompletedProcess` of the finished command.
    """
    completed = subprocess.run(cmd, shell=True)
    if completed.returncode != 0:
        # The futures returned by executor.submit are never inspected, so a
        # failure has to be made visible here or it goes unnoticed
        print(f"Command failed with exit code {completed.returncode}: {cmd}")
    return completed

input_directory = os.path.join(target_dir, 'mix')
uniform_output_directory = os.path.join(target_dir, 'uniform_dis')
matrix_filepath = os.path.join(target_dir, 'uniform_matrix.csv')

# The -p argument below points inside this directory, so create it up front
os.makedirs(uniform_output_directory, exist_ok=True)

# Sorted list of the .pat.gz files in the input directory
file_list = sorted(f for f in os.listdir(input_directory) if f.endswith(".pat.gz"))
print(file_list)

# Space-separated input paths; identical for every column, so build them once
input_files_arg = " ".join(os.path.join(input_directory, f) for f in file_list)

# Read the mixing matrix (uniform_matrix.csv): first column is an ID column,
# every further column provides the rates of one mixture
df = pd.read_csv(matrix_filepath)

# Get the number of columns excluding the ID column
num_columns = df.shape[1] - 1

# One rate per row is passed alongside one path per file; presumably both counts
# have to agree for mix_pat -- confirm against the wgbstools documentation
if len(df) != len(file_list):
    print(f"Warning: {matrix_filepath} has {len(df)} rows but {len(file_list)} "
          f".pat.gz files were found in {input_directory}.")

# Iterate through columns and execute commands concurrently; leaving the
# with-block waits for all submitted commands to finish
with ThreadPoolExecutor(max_workers=10) as executor:
    for col_index in range(1, num_columns + 1):
        # Extract the data for the current column as a space-separated string
        column_data = " ".join(df.iloc[:, col_index].astype(str))

        # Output name for this column; zero-padded for consistent length
        uniform_param = f"uniform_dis_{str(col_index).zfill(2)}"

        # Construct the mix_pat command
        mix_pat_command = [
            "wgbstools",
            "mix_pat",
            input_files_arg,
            "--rates",
            column_data,
            "-p",
            os.path.join(uniform_output_directory, uniform_param),
            "--genome",
            "hg38",
            "-c",
            "22"
        ]

        # Combine the command elements into a single string (run through the
        # shell, so paths containing spaces will break the command)
        mix_cmd_run = " ".join(mix_pat_command)
        print(f"Executing command: {mix_cmd_run}")

        # Submit the command to be executed asynchronously
        executor.submit(run_cmd, mix_cmd_run)

In [None]:
# Generate .beta files for each dataset (directory).
# Every pat2beta call writes (-o) into the same directory it reads its .pat.gz from.
assembly_version = 'hg38'

uniform_dis_cmd = f'wgbstools pat2beta {uniform_output_directory}/*.pat.gz --genome {assembly_version} -f -o {uniform_output_directory}/'
subprocess.run(uniform_dis_cmd, shell=True)

# Output goes next to the crd_dis inputs (previously it was written into the
# uniform_dis directory)
crd_dis_cmd = f'wgbstools pat2beta {crd_output_directory}/*.pat.gz --genome {assembly_version} -f -o {crd_output_directory}/'
subprocess.run(crd_dis_cmd, shell=True)

# NOTE(review): ref_folder_path is whatever the last iteration of the merge loop
# left behind, i.e. the 'ref' sub-folder of the last tissue only. If the merged
# references under os.path.join(target_dir, 'ref') are meant, change the path here
# -- confirm the intended directory.
ref_beta_cmd = f'wgbstools pat2beta {ref_folder_path}/*.pat.gz --genome {assembly_version} -f -o {ref_folder_path}/'
subprocess.run(ref_beta_cmd, shell=True)