In [None]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Function to process and load data from a pair of 5p and 3p files
def load_and_combine_data(fivep_file, threep_file, all_combined_data, reverse, n_positions=5):
    """
    Read one 5p/3p pair of tab-separated tables and append their values, in place,
    to the per-position lists of ``all_combined_data``.

    Parameters:
    - fivep_file: Path of the 5p table; only its "C>T" column is used.
    - threep_file: Path of the 3p table; only its "G>A" column is used.
    - all_combined_data: Dictionary of position -> list of frequencies. Row ``i`` of the
      5p column is appended to key ``i``; row ``i`` of the (optionally reversed) 3p column
      is appended to key ``i + n_positions``. The keys must already exist.
    - reverse: If true, the 3p rows are reversed before being appended, so the last row
      of the file lands on key ``n_positions``.
    - n_positions: Maximum number of rows accepted per file, and the offset of the 3p keys
      (default 5, i.e. keys 0-4 for 5p and 5-9 for 3p).

    Raises:
    - ValueError: if either column has more than ``n_positions`` rows. Without this check
      extra 5p rows would be appended to the 3p keys without any error.
    """
    fivep_data = pd.read_csv(fivep_file, delimiter='\t')["C>T"]
    threep_data = pd.read_csv(threep_file, delimiter='\t')["G>A"]

    # Validate both inputs before touching the dictionary so a bad pair leaves it unchanged.
    for end_label, path, column in (("5p", fivep_file, fivep_data), ("3p", threep_file, threep_data)):
        if len(column) > n_positions:
            raise ValueError(
                f"{end_label} file {path!r} has {len(column)} rows; expected at most {n_positions}"
            )

    if reverse:
        threep_data = threep_data.iloc[::-1]

    # Series.tolist() yields plain Python scalars in row order.
    for i, value in enumerate(fivep_data.tolist()):
        all_combined_data[i].append(value)  # 5p values -> keys 0 .. n_positions-1

    for i, value in enumerate(threep_data.tolist()):
        all_combined_data[i + n_positions].append(value)  # 3p values -> keys n_positions ..



In [None]:
#all_combined_data

In [None]:
def plot_prof_kde(ax, all_combined_data, substitution_type='C>T', color='red', positions_range=(5, 10), xlabel="Position from 5' end"):
    """
    Draw a filled two-dimensional KDE of substitution frequency against position.

    Parameters:
    - ax: The axis to draw on.
    - all_combined_data: Dictionary of position -> list of frequencies.
    - substitution_type: Substitution name used in the y-axis label ('C>T' or 'G>A').
    - color: Colour passed to the KDE plot.
    - positions_range: Arguments for ``range`` selecting the dictionary keys to plot
      (e.g. (0, 5) selects keys 0-4, (5, 10) selects keys 5-9).
    - xlabel: Label for the x-axis (e.g. "Position from 5' end" or "Position from 3' end").
    """
    tick_positions = list(range(*positions_range))

    # Flatten the selected lists into parallel x (position) / y (frequency) sequences.
    xs, ys = [], []
    for position in tick_positions:
        values = all_combined_data[position]
        ys.extend(values)
        xs.extend([position] * len(values))

    # Density of the data points across position and frequency
    sns.kdeplot(x=xs, y=ys, ax=ax, fill=True, color=color, alpha=0.5, bw_adjust=0.5)

    # Axis cosmetics
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel(f'{substitution_type} Substitution Frequency', fontsize=12)
    ax.set_ylim(0, 0.7)
    ax.set_xticks(tick_positions)
    ax.grid(axis='y', linestyle='--')


In [None]:
def plot_prof_substitutions(ax, all_combined_data, substitution_type='C>T', color='red', positions_range=(5, 10), xlabel="Position from 5' end"):
    """
    Scatter every frequency at its position and overlay a dashed line through the
    per-position means.

    Parameters:
    - ax: The axis to plot on.
    - all_combined_data: Dictionary of position -> list of frequencies.
    - substitution_type: The substitution type to plot ('C>T' or 'G>A'). It is used as the
      y-axis label; for 'G>A' the x tick labels count up to 0 (e.g. -4 .. 0 for five
      positions), otherwise the dictionary keys themselves are used as labels.
    - color: The color for the scatter points.
    - positions_range: Arguments for ``range`` selecting the dictionary keys to plot
      (e.g. (0, 5) selects keys 0-4, (5, 10) selects keys 5-9).
    - xlabel: Label for the x-axis (e.g. "Position from 5' end" or "Position from 3' end").
    """
    positions = list(range(*positions_range))

    if substitution_type == 'G>A':
        # Derive the labels from the number of positions so that ticks and labels always
        # have the same length; five positions give [-4, -3, -2, -1, 0].
        x_labels = list(range(1 - len(positions), 1))
    else:
        x_labels = positions

    # Flatten the selected lists into parallel x (position) / y (frequency) sequences.
    x_vals, frequencies = [], []
    for pos in positions:
        values = all_combined_data[pos]
        x_vals.extend([pos] * len(values))
        frequencies.extend(values)

    ax.scatter(
        x_vals,
        frequencies,
        c=color,
        alpha=0.5,
        edgecolors='none'
    )

    # Mean per position (dashed line); a position without data yields NaN, which leaves a
    # gap in the line, without calling np.mean on an empty list.
    avg_frequencies = [
        np.mean(all_combined_data[pos]) if len(all_combined_data[pos]) else np.nan
        for pos in positions
    ]
    ax.plot(positions, avg_frequencies, 'k--')

    # Plot settings
    ax.set_xlabel(xlabel, fontsize=20)
    ax.set_ylabel(f'{substitution_type}', fontsize=20)
    ax.set_ylim(0, 0.65)
    ax.set_xticks(positions)
    ax.tick_params(axis='x', labelsize=20)
    ax.tick_params(axis='y', labelsize=20)
    ax.set_xticklabels(x_labels, fontsize=20)
    ax.grid(axis='y', linestyle='--')


In [None]:
# Function to generate and save substitution profile plots
def plot_substitution_profiles(all_combined_data, plot_path, damage_type="mid", title="Substitution Profiles: Simulation", method=""):
    """
    Draw the C>T and G>A substitution profiles side by side, save the figure as SVG
    and display it.

    Parameters:
    - all_combined_data: Dictionary containing the combined substitution data
      (keys 0-4 are plotted as C>T, keys 5-9 as G>A).
    - plot_path: Directory in which the figure is saved.
    - damage_type: Damage label used in the figure title and in the file name.
    - title: Leading part of the figure title.
    - method: Suffix used in the file name
      (``damprof_samples_{damage_type}_{method}.svg``).
    """
    # Two panels next to each other
    fig, axs = plt.subplots(1, 2, figsize=(8, 4))

    # (substitution, colour, dictionary keys, x-axis label) for the left and right panel
    panels = (
        ('C>T', 'red', (0, 5), "Position from 5' end"),
        ('G>A', 'blue', (5, 10), "Position from 3' end"),
    )
    for axis, (substitution, colour, key_range, axis_label) in zip(axs, panels):
        plot_prof_substitutions(
            axis,
            all_combined_data,
            substitution_type=substitution,
            color=colour,
            positions_range=key_range,
            xlabel=axis_label,
        )

    # Figure-level title; the layout rectangle keeps the top 5% free for it
    fig.suptitle(f"{title}: {damage_type} damage", fontsize=20)
    plt.tight_layout(rect=[0, 0, 1, 0.95])

    # Write the SVG, then render the figure
    output_file = f'{plot_path}/damprof_samples_{damage_type}_{method}.svg'
    plt.savefig(output_file, format="svg", bbox_inches="tight")
    plt.show()

### DAT profiles ###

In [None]:
# Baseline profiles for the "Mid" damage input files
damagedir = "/home/data/damage/dmid"
plot_path = "/home/submission/figs/01simulated"

# One list per key: keys 0-4 collect 5p C>T values, keys 5-9 collect 3p G>A values
all_combined_data = {i: [] for i in range(10)}

# Collect the 5p and 3p files
threep_files = sorted(glob.glob(f'{damagedir}/*_3.dat'))
fivep_files = sorted(glob.glob(f'{damagedir}/*_5.dat'))

# Pair each 5p file with the 3p file sharing its prefix. Zipping the two sorted lists
# would stop at the shorter one and can mis-pair files when one is missing or when the
# two suffixes sort the prefixes differently.
threep_lookup = set(threep_files)
for fivep_file in fivep_files:
    threep_file = fivep_file[:-len('_5.dat')] + '_3.dat'
    if threep_file not in threep_lookup:
        print(f"Skipping {fivep_file}: no matching 3p file {threep_file}")
        continue
    load_and_combine_data(fivep_file, threep_file, all_combined_data, False)

# Plot and save the results
plot_substitution_profiles(all_combined_data, plot_path, damage_type="Mid", title="Damage Profiles: Input Simulation", method="baseline")

In [None]:
# Baseline profiles for the "High" damage input files
damagedir = "/home/data/damage/dhigh"
plot_path = "/home/submission/figs/01simulated"

# One list per key: keys 0-4 collect 5p C>T values, keys 5-9 collect 3p G>A values
all_combined_data = {i: [] for i in range(10)}

# Collect the 5p and 3p files
threep_files = sorted(glob.glob(f'{damagedir}/*_3.dat'))
fivep_files = sorted(glob.glob(f'{damagedir}/*_5.dat'))

# Pair each 5p file with the 3p file sharing its prefix. Zipping the two sorted lists
# would stop at the shorter one and can mis-pair files when one is missing or when the
# two suffixes sort the prefixes differently.
threep_lookup = set(threep_files)
for fivep_file in fivep_files:
    threep_file = fivep_file[:-len('_5.dat')] + '_3.dat'
    if threep_file not in threep_lookup:
        print(f"Skipping {fivep_file}: no matching 3p file {threep_file}")
        continue
    load_and_combine_data(fivep_file, threep_file, all_combined_data, False)

# Plot and save the results
plot_substitution_profiles(all_combined_data, plot_path, damage_type="High", title="Damage Profiles: Input Simulation", method="baseline")

In [None]:
# Baseline profiles for the "None" damage input files
damagedir = "/home/data/damage/dnone"
plot_path = "/home/submission/figs/01simulated"

# One list per key: keys 0-4 collect 5p C>T values, keys 5-9 collect 3p G>A values
all_combined_data = {i: [] for i in range(10)}

# Collect the 5p and 3p files
threep_files = sorted(glob.glob(f'{damagedir}/*_3.dat'))
fivep_files = sorted(glob.glob(f'{damagedir}/*_5.dat'))

# Pair each 5p file with the 3p file sharing its prefix. Zipping the two sorted lists
# would stop at the shorter one and can mis-pair files when one is missing or when the
# two suffixes sort the prefixes differently.
threep_lookup = set(threep_files)
for fivep_file in fivep_files:
    threep_file = fivep_file[:-len('_5.dat')] + '_3.dat'
    if threep_file not in threep_lookup:
        print(f"Skipping {fivep_file}: no matching 3p file {threep_file}")
        continue
    load_and_combine_data(fivep_file, threep_file, all_combined_data, False)

# Plot and save the results
plot_substitution_profiles(all_combined_data, plot_path, damage_type="None",title="Damage Profiles: Input Simulation", method="baseline")

### Estimated Profiles ###

In [None]:
# Profiles read from the *.prof files: "High" damage subset.
# damagedir and plot_path set here are reused by the following cells.
damagedir = "/analysis/EMNGDN_all3"
plot_path = "/EMNGDN_all3"

# One list per key: keys 0-4 collect 5p C>T values, keys 5-9 collect 3p G>A values
all_combined_data = {i: [] for i in range(10)}

# Collect the 5p and 3p files
threep_files = sorted(glob.glob(f'{damagedir}/*_3p.prof'))
fivep_files = sorted(glob.glob(f'{damagedir}/*_5p.prof'))

# Pair each selected 5p file with the 3p file sharing its prefix. Zipping the two sorted
# lists would stop at the shorter one and can mis-pair files when one is missing or when
# the two suffixes sort the prefixes differently.
threep_lookup = set(threep_files)
for fivep_file in fivep_files:
    # The substring test runs against the whole path, directory included.
    if "dhigh" not in fivep_file:
        continue
    threep_file = fivep_file[:-len('_5p.prof')] + '_3p.prof'
    if threep_file not in threep_lookup:
        print(f"Skipping {fivep_file}: no matching 3p file {threep_file}")
        continue
    # reverse=True: the 3p rows are flipped before being appended to keys 5-9
    load_and_combine_data(fivep_file, threep_file, all_combined_data, True)

# Plot and save the results
plot_substitution_profiles(all_combined_data, plot_path, damage_type="High", title="Ground Truth Damage Profiles", method="estimated")

In [None]:
# Profiles read from the *.prof files: "Mid" damage subset.
# Uses damagedir and plot_path as set in an earlier cell.

# One list per key: keys 0-4 collect 5p C>T values, keys 5-9 collect 3p G>A values
all_combined_data = {i: [] for i in range(10)}

# Collect the 5p and 3p files
threep_files = sorted(glob.glob(f'{damagedir}/*_3p.prof'))
fivep_files = sorted(glob.glob(f'{damagedir}/*_5p.prof'))

# Pair each selected 5p file with the 3p file sharing its prefix. Zipping the two sorted
# lists would stop at the shorter one and can mis-pair files when one is missing or when
# the two suffixes sort the prefixes differently.
threep_lookup = set(threep_files)
for fivep_file in fivep_files:
    # The substring test runs against the whole path, directory included.
    # NOTE(review): the sibling cells filter on "dhigh" / "dnone"; "mid" is a looser match
    # than "dmid" would be - confirm against the actual file names.
    if "mid" not in fivep_file:
        continue
    threep_file = fivep_file[:-len('_5p.prof')] + '_3p.prof'
    if threep_file not in threep_lookup:
        print(f"Skipping {fivep_file}: no matching 3p file {threep_file}")
        continue
    # reverse=True: the 3p rows are flipped before being appended to keys 5-9
    load_and_combine_data(fivep_file, threep_file, all_combined_data, True)

# Plot and save the results
plot_substitution_profiles(all_combined_data, plot_path, damage_type="Mid", title="Ground Truth Damage Profiles", method="estimated")



In [None]:
# Profiles read from the *.prof files: no-damage subset.
# Uses damagedir and plot_path as set in an earlier cell.

# One list per key: keys 0-4 collect 5p C>T values, keys 5-9 collect 3p G>A values
all_combined_data = {i: [] for i in range(10)}

# Collect the 5p and 3p files
threep_files = sorted(glob.glob(f'{damagedir}/*_3p.prof'))
fivep_files = sorted(glob.glob(f'{damagedir}/*_5p.prof'))

# Pair each selected 5p file with the 3p file sharing its prefix. Zipping the two sorted
# lists would stop at the shorter one and can mis-pair files when one is missing or when
# the two suffixes sort the prefixes differently.
threep_lookup = set(threep_files)
for fivep_file in fivep_files:
    # The substring test runs against the whole path, directory included.
    if "dnone" not in fivep_file:
        continue
    threep_file = fivep_file[:-len('_5p.prof')] + '_3p.prof'
    if threep_file not in threep_lookup:
        print(f"Skipping {fivep_file}: no matching 3p file {threep_file}")
        continue
    # reverse=True: the 3p rows are flipped before being appended to keys 5-9
    load_and_combine_data(fivep_file, threep_file, all_combined_data, True)

# Plot and save the results
plot_substitution_profiles(all_combined_data, plot_path, damage_type="No", title="Ground Truth Damage Profiles", method="estimated")
