In [None]:
import pandas as pd
# Read the chimeric-barcode table of each library (p005, p006, p007).
# The three files live under one common root, so that root is spelled once
# here; change it in this single place when the data moves.
RAAV60_DIR = '/lunarc/nobackup/projects/lu2024-17-19/J_rAAven/BRAVE/raav-60'
p005_chimeric_barcodes = pd.read_csv(f'{RAAV60_DIR}/p005_starcode_no_chimeric/library_barcodes_chimeric.csv')
p006_chimeric_barcodes = pd.read_csv(f'{RAAV60_DIR}/p006_starcode_no_chimeric/library_barcodes_chimeric.csv')
p007_chimeric_barcodes = pd.read_csv(f'{RAAV60_DIR}/p007_starcode_no_chimeric/library_barcodes_chimeric.csv')

In [None]:
def create_count_tables(df, column_name):
    """
    Count how many rows of a DataFrame carry each distinct value of a column.

    Parameters:
    df (pd.DataFrame): The input DataFrame. It is not modified.
    column_name (str): The name of the column whose values are tallied.

    Returns:
    pd.DataFrame: A DataFrame with exactly two columns, `column_name` and
    'count', holding one row per distinct value and sorted by 'count' in
    descending order. Rows whose `column_name` value is missing are not
    tallied, because groupby drops missing keys by default.
    """
    # size() tallies the rows of each group directly, so there is no need to
    # copy the whole frame, add a helper column of ones and count every other
    # column only to discard those counts afterwards.
    count_df = df.groupby([column_name]).size().reset_index(name='count')

    return count_df.sort_values(by='count', ascending=False)

In [None]:
# For every library build two count tables: one keyed on the 'BC' column
# (the *_count_barcodes names) and one keyed on the 'Reads' column (the
# *_count_fragments names).
p005_count_barcodes, p005_count_fragments = (
    create_count_tables(p005_chimeric_barcodes, column) for column in ('BC', 'Reads')
)
p006_count_barcodes, p006_count_fragments = (
    create_count_tables(p006_chimeric_barcodes, column) for column in ('BC', 'Reads')
)
p007_count_barcodes, p007_count_fragments = (
    create_count_tables(p007_chimeric_barcodes, column) for column in ('BC', 'Reads')
)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

def plot_log_binned_histogram_bc(df):
    """
    Draw a bar chart of how many rows of `df` fall into each log2-spaced
    bin of the 'count' column.

    The bins are the half-open ranges [2, 4), [4, 8), ... labelled "2–3",
    "4–7", and so on. The last bin always contains the largest count, also
    when that count is an exact power of two. Counts below 2 fall outside
    every bin and are not plotted.

    NOTE(review): another cell of this notebook defines a three-DataFrame
    function under this same name; whichever definition was executed last is
    the one in effect.

    Parameters:
    df (pd.DataFrame): Must contain a 'count' column. It is not modified.

    Returns:
    module: The `plt` (matplotlib.pyplot) module, so the caller can go on to
    call savefig / show on it.

    Raises:
    ValueError: If the 'count' column is missing, or if there is no count of
    2 or more to bin.
    """
    # Ensure 'count' column exists
    if 'count' not in df.columns:
        raise ValueError("Input DataFrame must contain a 'count' column.")

    # Get the max count for binning; an empty column yields NaN here
    max_count = df['count'].max()
    if pd.isna(max_count) or max_count < 2:
        raise ValueError("Input DataFrame has no 'count' value of 2 or more to bin.")

    # Create log2-spaced bin edges starting from 2. bit_length() equals
    # floor(log2(max)) + 1, so the last edge is strictly greater than the
    # maximum; with right=False an edge equal to the maximum would leave the
    # maximum outside every bin.
    top_exponent = int(max_count).bit_length()
    bins = 2 ** np.arange(1, top_exponent + 1)
    bin_labels = [f"{int(lower)}–{int(upper) - 1}" for lower, upper in zip(bins[:-1], bins[1:])]

    # Bin the data without writing a helper column into the caller's frame
    binned = pd.cut(df['count'], bins=bins, right=False, labels=bin_labels)

    # Count how many entries fall into each bin, in bin order
    binned_counts = binned.value_counts().reindex(bin_labels, fill_value=0)

    # Plotting
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(bin_labels, binned_counts.values,
           color='darkred', edgecolor='white', linewidth=0.5)

    # Clean up the plot
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.set_xlabel('Number of Fragments Matching to Chimeric Barcodes (Log2 Bins)')
    ax.set_ylabel('Number of Barcodes (log 10 scale)')
    ax.set_title('Distribution of Chimeric Barcodes Matching to Multiple Fragments')
    ax.set_yscale('log')
    plt.xticks(rotation=45)
    plt.tight_layout()

    return plt


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

def plot_log_binned_histogram_bc(df1, df2, df3, labels=('DF1', 'DF2', 'DF3'), colors=('darkred', 'steelblue', 'darkgreen')):
    """
    Draw a grouped bar chart comparing, for three DataFrames, how many rows
    fall into each log2-spaced bin of their 'count' column.

    All three frames share the same bins: the half-open ranges [2, 4),
    [4, 8), ... labelled "2–3", "4–7", and so on. The last bin always
    contains the largest count of the three frames, also when that count is
    an exact power of two. Counts below 2 fall outside every bin and are not
    plotted.

    Parameters:
    df1, df2, df3 (pd.DataFrame): Each must contain a 'count' column. None of
        them is modified.
    labels (sequence of str): Legend labels; entries 0, 1, 2 go with df1,
        df2, df3.
    colors (sequence): Bar colors; entries 0, 1, 2 go with df1, df2, df3.

    Returns:
    module: The `plt` (matplotlib.pyplot) module, so the caller can go on to
    call savefig / show on it.

    Raises:
    ValueError: If any frame lacks a 'count' column, or if no frame holds a
    count of 2 or more.
    """
    frames = (df1, df2, df3)

    # Ensure all dataframes have 'count' column
    for frame in frames:
        if 'count' not in frame.columns:
            raise ValueError("All input DataFrames must contain a 'count' column.")

    # Global maximum over the three frames. NaN maxima (an empty 'count'
    # column) are skipped explicitly, because the builtin max() gives an
    # order-dependent answer when one of its arguments is NaN.
    maxima = [frame['count'].max() for frame in frames]
    maxima = [value for value in maxima if pd.notna(value)]
    if not maxima or max(maxima) < 2:
        raise ValueError("Input DataFrames have no 'count' value of 2 or more to bin.")

    # Common log2-spaced bin edges starting from 2. bit_length() equals
    # floor(log2(max)) + 1, so the last edge is strictly greater than the
    # maximum; with right=False an edge equal to the maximum would leave the
    # maximum outside every bin.
    top_exponent = int(max(maxima)).bit_length()
    bins = 2 ** np.arange(1, top_exponent + 1)
    bin_labels = [f"{int(lower)}–{int(upper) - 1}" for lower, upper in zip(bins[:-1], bins[1:])]

    # Bin one dataframe and return its per-bin row counts in bin order
    def get_binned_counts(frame):
        binned = pd.cut(frame['count'], bins=bins, right=False, labels=bin_labels)
        return binned.value_counts().reindex(bin_labels, fill_value=0)

    # Set width and spacing
    width = 0.25  # width of each bar
    group_spacing = 1.2  # spacing between bin groups

    # Three bars per group, tightly packed around the group center
    group_centers = np.arange(len(bin_labels)) * group_spacing
    offsets = (-width, 0.0, width)  # df1 left, df2 centered, df3 right

    # Plotting
    fig, ax = plt.subplots(figsize=(12, 6))
    for position, frame in enumerate(frames):
        ax.bar(group_centers + offsets[position], get_binned_counts(frame).values,
               width=width, label=labels[position], color=colors[position])

    # X-axis ticks and labels sit under the center bar of each group
    ax.set_xticks(group_centers)
    ax.set_xticklabels(bin_labels, rotation=45, ha='right')

    # Aesthetics
    ax.set_yscale('log')
    ax.set_xlabel('Number of Fragments Matching to Chimeric Barcodes')
    ax.set_ylabel('Number of Barcodes (log 10 scale)')
    ax.set_title('Distribution of Chimeric Barcodes Matching to Multiple Fragments')
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    plt.tight_layout()

    return plt


In [None]:
# Grouped barcode histogram for the three libraries; the function hands back
# the pyplot module, which is then used to write the PNG and display the figure.
plt = plot_log_binned_histogram_bc(
    p005_count_barcodes,
    p006_count_barcodes,
    p007_count_barcodes,
    labels=('p005', 'p006', 'p007'),
    colors=('#5F59B1', '#A7C1CBA8', '#CFB29CCC'),
)
plt.savefig(
    '../plots/Chimeric_Distribution/Chimeric_Barcodes_Distribution.png',
    dpi=600,
    bbox_inches='tight',
    transparent=True,
)
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

def plot_log_binned_histogram_frag(df1, df2, df3, labels=('DF1', 'DF2', 'DF3'), colors=('darkred', 'steelblue', 'darkgreen')):
    """
    Draw a grouped bar chart comparing, for three DataFrames, how many rows
    fall into each log2-spaced bin of their 'count' column, with axis labels
    and title worded for fragments.

    All three frames share the same bins: the half-open ranges [2, 4),
    [4, 8), ... labelled "2–3", "4–7", and so on. The last bin always
    contains the largest count of the three frames, also when that count is
    an exact power of two. Counts below 2 fall outside every bin and are not
    plotted.

    Parameters:
    df1, df2, df3 (pd.DataFrame): Each must contain a 'count' column. None of
        them is modified.
    labels (sequence of str): Legend labels; entries 0, 1, 2 go with df1,
        df2, df3.
    colors (sequence): Bar colors; entries 0, 1, 2 go with df1, df2, df3.

    Returns:
    module: The `plt` (matplotlib.pyplot) module, so the caller can go on to
    call savefig / show on it.

    Raises:
    ValueError: If any frame lacks a 'count' column, or if no frame holds a
    count of 2 or more.
    """
    frames = (df1, df2, df3)

    # Ensure all dataframes have 'count' column
    for frame in frames:
        if 'count' not in frame.columns:
            raise ValueError("All input DataFrames must contain a 'count' column.")

    # Global maximum over the three frames. NaN maxima (an empty 'count'
    # column) are skipped explicitly, because the builtin max() gives an
    # order-dependent answer when one of its arguments is NaN.
    maxima = [frame['count'].max() for frame in frames]
    maxima = [value for value in maxima if pd.notna(value)]
    if not maxima or max(maxima) < 2:
        raise ValueError("Input DataFrames have no 'count' value of 2 or more to bin.")

    # Common log2-spaced bin edges starting from 2. bit_length() equals
    # floor(log2(max)) + 1, so the last edge is strictly greater than the
    # maximum; with right=False an edge equal to the maximum would leave the
    # maximum outside every bin.
    top_exponent = int(max(maxima)).bit_length()
    bins = 2 ** np.arange(1, top_exponent + 1)
    bin_labels = [f"{int(lower)}–{int(upper) - 1}" for lower, upper in zip(bins[:-1], bins[1:])]

    # Bin one dataframe and return its per-bin row counts in bin order
    def get_binned_counts(frame):
        binned = pd.cut(frame['count'], bins=bins, right=False, labels=bin_labels)
        return binned.value_counts().reindex(bin_labels, fill_value=0)

    # Set width and spacing
    width = 0.25  # width of each bar
    group_spacing = 1.2  # spacing between bin groups

    # Three bars per group, tightly packed around the group center
    group_centers = np.arange(len(bin_labels)) * group_spacing
    offsets = (-width, 0.0, width)  # df1 left, df2 centered, df3 right

    # Plotting
    fig, ax = plt.subplots(figsize=(12, 6))
    for position, frame in enumerate(frames):
        ax.bar(group_centers + offsets[position], get_binned_counts(frame).values,
               width=width, label=labels[position], color=colors[position])

    # X-axis ticks and labels sit under the center bar of each group
    ax.set_xticks(group_centers)
    ax.set_xticklabels(bin_labels, rotation=45, ha='right')

    # Aesthetics
    ax.set_yscale('log')
    ax.set_xlabel('Number of Chimeric Barcodes Matching to Fragments')
    ax.set_ylabel('Number of Fragments (log 10 scale)')
    ax.set_title('Distribution of Fragments Matching to Chimeric Barcodes')
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    plt.tight_layout()

    return plt


In [None]:
# Grouped fragment histogram for the three libraries; the function hands back
# the pyplot module, which is then used to write the PNG and display the figure.
plt = plot_log_binned_histogram_frag(
    p005_count_fragments,
    p006_count_fragments,
    p007_count_fragments,
    labels=('p005', 'p006', 'p007'),
    colors=('#5F59B1', '#A7C1CBA8', '#CFB29CCC'),
)
plt.savefig(
    '../plots/Chimeric_Distribution/Fragments_Distribution.png',
    dpi=600,
    bbox_inches='tight',
    transparent=True,
)
plt.show()