In [1]:
import numpy as np
import os
import glob

In [2]:
def generate_damage_profiles(template_file, output_dir, n, damage_type, positions, col_to_modify, start_range, end_range, mean, std_dev, end, rng=None):
    """
    Generate n damage profiles with a non-linear (geometric) damage decay.

    Only the column named by `col_to_modify` is rewritten, and only on the rows
    whose position label appears in `positions`; every other token of the
    template is copied through unchanged (re-joined with tabs).

    The template is assumed to have a header row of column labels
    (e.g. A>C, A>G, ...) followed by data rows of the form
    `<position> <value> <interval> <value> <interval> ...`, i.e. two
    whitespace-separated tokens per column. The replaced value is always
    followed by exactly one "[0..0]" interval token: an existing interval
    token is overwritten rather than duplicated.

    Args:
    - template_file: Path to the template file to read from.
    - output_dir: Directory where the generated profiles will be saved
      (created if missing).
    - n: Number of profiles to generate.
    - damage_type: Label used as the output filename prefix
      (e.g. 'high', 'mid', 'none').
    - positions: Sequence of position labels to modify, e.g.
      [-4, -3, -2, -1, 0] for 3p, [0, 1, 2, 3, 4] for 5p.
    - col_to_modify: Header label of the column to modify
      ('C>T' for 5p or 'G>A' for 3p).
    - start_range: (min, max) bounds the highest damage value is clipped to.
      A reversed pair is normalised to (min, max).
    - end_range: (min, max) bounds the lowest damage value is drawn from
      uniformly. A reversed pair is normalised to (min, max).
    - mean: Mean of the Gaussian the highest damage value is drawn from.
    - std_dev: Standard deviation of that Gaussian.
    - end: Suffix used in the output filename. If set to '3', the decay values
      are reversed before being assigned to `positions`, so the last entry of
      `positions` receives the highest value.
    - rng: Optional random source exposing `normal` and `uniform`
      (e.g. `np.random.default_rng(seed)`) for reproducible output.
      Defaults to NumPy's global `np.random` state.

    Raises:
    - ValueError: if the template is empty, `col_to_modify` is not in the
      header, or the label of a row to be modified is not an integer.

    Profiles are written to `<output_dir>/<damage_type>_<i>_<end>.dat` with i
    running from 1 to n.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Read template file structure
    with open(template_file, 'r') as f:
        template = f.readlines()
    if not template:
        raise ValueError(f"Template file is empty: {template_file}")

    # Column labels (e.g., A>C, A>G, etc.); the stripped header is written back verbatim
    header = template[0].strip()
    modify_col_idx = header.split().index(col_to_modify)

    # Each column occupies two tokens (value, interval) after the position label,
    # so column k has its value at token 2k+1 and its interval at token 2k+2.
    value_idx = modify_col_idx * 2 + 1
    interval_idx = value_idx + 1

    # Map each position label to its slot in the decay array (first occurrence wins)
    first_slot = {}
    for slot, pos in enumerate(positions):
        first_slot.setdefault(pos, slot)

    # Tokenise the template rows once; they are identical for every profile.
    rows = []
    for line in template[1:]:
        parts = line.split()
        if not parts:
            # Blank lines (e.g. a trailing newline) carry no data
            continue
        slot = None
        if len(parts) > value_idx:
            slot = first_slot.get(int(parts[0]))
        rows.append((parts, slot))

    # np.clip returns the upper bound for every input when min > max, which
    # would silently disable the Gaussian draw, so accept either ordering.
    start_lo, start_hi = sorted(start_range)
    end_lo, end_hi = sorted(end_range)

    if rng is None:
        rng = np.random

    for i in range(n):
        # Highest damage value: Gaussian draw clipped to the start range
        start_value = np.clip(rng.normal(loc=mean, scale=std_dev), start_lo, start_hi)

        # Lowest damage value: uniform draw from the end range
        end_value = rng.uniform(end_lo, end_hi)

        # Non-linear (geometric) decay between the two extremes
        decay = np.geomspace(start_value, end_value, num=len(positions))

        # For the 3' end the highest value belongs to the last listed position
        if end == '3':
            decay = decay[::-1]

        updated_lines = []
        for parts, slot in rows:
            updated_parts = list(parts)
            if slot is not None:
                has_interval = len(parts) > interval_idx and parts[interval_idx].startswith("[")
                # Overwrite value + existing interval; insert the interval only if absent
                stop = interval_idx + 1 if has_interval else interval_idx
                updated_parts[value_idx:stop] = [f"{decay[slot]:.6f}", "[0..0]"]
            updated_lines.append("\t".join(updated_parts))

        # Prepare output filename
        output_file = os.path.join(output_dir, f"{damage_type}_{i+1}_{end}.dat")

        # Write the header followed by the rewritten rows
        with open(output_file, 'w') as out:
            out.write(header + '\n')
            for updated_line in updated_lines:
                out.write(updated_line + '\n')


In [3]:
# Settings shared by both ends of a given damage level
high_damage = dict(
    output_dir='/home/data/damage/dhigh',
    n=1000,
    damage_type='high',
    start_range=(0.3, 0.6),   # Range for first position
    end_range=(0.05, 0.15),   # Range for last position
    mean=0.45,                # Mean value for Gaussian distribution
    std_dev=0.125,            # Standard deviation for Gaussian distribution
)
mid_damage = dict(
    output_dir='/home/data/damage/dmid',
    n=1000,
    damage_type='mid',
    start_range=(0.15, 0.25),  # Range for first position
    end_range=(0.03, 0.05),    # Range for last position
    mean=0.2,                  # Mean value for Gaussian distribution
    std_dev=0.1,               # Standard deviation for Gaussian distribution
)

# Settings shared by every damage level for a given read end
# (position 0 carries the highest damage in both cases)
five_prime = dict(positions=[0, 1, 2, 3, 4], col_to_modify='C>T', end="5")
three_prime = dict(positions=[-4, -3, -2, -1, 0], col_to_modify='G>A', end="3")

# High damage: 5' (C>T) then 3' (G>A)
generate_damage_profiles(template_file='/home/damage/dhigh5.dat', **high_damage, **five_prime)
generate_damage_profiles(template_file='/home/damage/dhigh3.dat', **high_damage, **three_prime)

# Mid damage: 5' (C>T) then 3' (G>A)
generate_damage_profiles(template_file='/home/damage/dmid5.dat', **mid_damage, **five_prime)
generate_damage_profiles(template_file='/home/damage/dmid3.dat', **mid_damage, **three_prime)



In [13]:
# "No damage" baseline for the 5' end (C>T column).
# Ranges are (min, max): with min > max, np.clip returns the upper bound for
# every draw, which would pin the first position to a single constant value.
generate_damage_profiles(
    template_file='/damage/template_dnone/none_5.dat',
    output_dir='/home/data/damage/dnone',
    n=1000,
    damage_type='none',
    positions=[0, 1, 2, 3, 4],  # Positions for 5p damage (index 0 is highest)
    col_to_modify='C>T',
    start_range=(0.03, 0.04),  # (min, max) range for first position
    end_range=(0.01, 0.03),    # (min, max) range for last position
    mean=0.03,                 # Mean value for Gaussian distribution
    std_dev=0.01,              # Standard deviation for Gaussian distribution
    end="5"
)

# "No damage" baseline for the 3' end (G>A column)
generate_damage_profiles(
    template_file='/damage/template_dnone/none_3.dat',
    output_dir='/home/data/damage/dnone',
    n=1000,
    damage_type='none',
    positions=[-4, -3, -2, -1, 0],  # Positions for 3p damage (index 0 is highest)
    col_to_modify='G>A',
    start_range=(0.03, 0.04),  # (min, max) range for first position
    end_range=(0.01, 0.03),    # (min, max) range for last position
    mean=0.03,                 # Mean value for Gaussian distribution
    std_dev=0.01,              # Standard deviation for Gaussian distribution
    end="3"
)


In [14]:
# Folder holding the generated profiles that still need cleaning
directory = "/home/data/damage/dnone"

# Every .dat profile in that folder (adjust the pattern to target other files)
dat_pattern = os.path.join(directory, "*.dat")
files = glob.glob(dat_pattern)

# Function to process each file
def process_file(filepath):
    """Strip '[0..0]' markers from a profile and tab-delimit it, in place.

    Every line has each '[0..0]' occurrence removed, and its remaining
    whitespace-separated fields re-joined with single tabs (which also drops
    leading and trailing whitespace). The result overwrites `filepath`, with
    lines separated by newlines and one trailing newline.
    """
    with open(filepath, "r") as source:
        # str.split() with no arguments already discards surrounding whitespace
        tabbed_rows = [
            "\t".join(raw_row.replace("[0..0]", "").split())
            for raw_row in source
        ]

    # Overwrite the original file with the cleaned rows
    with open(filepath, "w") as target:
        target.write("\n".join(tabbed_rows) + "\n")

# Clean each collected profile in place, then report completion
for dat_path in files:
    process_file(dat_path)

print("Processing completed!")


Processing completed!
