# Read Length Distribution

## Setup


In [None]:

import itertools
import numpy as np


## Simulated Inputs


In [10]:
import numpy as np

def simulate_read_lengths(n_samples, params, min_length=18, max_length=35):
    """
    Simulate read lengths by sampling from a discretized mixture of normals.

    The mixture density is evaluated at every integer length in
    [min_length, max_length], normalized into a probability mass function,
    and sampled with replacement via np.random.choice.

    Args:
        n_samples (int): Number of read lengths to simulate.
        params (list): List of tuples (mean, std, weight) for each component distribution.
        min_length (int): Minimum read length (inclusive).
        max_length (int): Maximum read length (inclusive).

    Returns:
        np.ndarray: Array of simulated read lengths.

    Raises:
        ValueError: If min_length > max_length, or if the mixture density
            summed over the length range is not a positive finite number
            (e.g. empty params, or all components underflow to zero).
    """
    if min_length > max_length:
        raise ValueError(
            f"min_length ({min_length}) must not exceed max_length ({max_length})"
        )

    probs = [compute_mixture_density(length, params) for length in range(min_length, max_length + 1)]
    probs_sum = sum(probs)

    # Without this guard an empty `params` raises a bare ZeroDivisionError and
    # an underflowing mixture hands NaN probabilities to np.random.choice.
    if not np.isfinite(probs_sum) or probs_sum <= 0:
        raise ValueError(
            "mixture density over the requested length range must sum to a "
            f"positive finite value, got {probs_sum!r}"
        )

    probs = [p / probs_sum for p in probs]  # Normalize so the masses sum to 1

    read_lengths = np.random.choice(
        np.arange(min_length, max_length + 1),
        size=n_samples,
        p=probs
    )
    return read_lengths

def compute_mixture_density(x, params):
    """
    Evaluate a weighted sum of normal probability densities at x.

    Args:
        x (float): Location at which the mixture is evaluated.
        params (list): One (mean, std, weight) tuple per normal component.

    Returns:
        float: Sum over components of weight times the normal density at x.
            Weights are used as given (not renormalized); an empty params
            list yields 0.
    """
    total = 0
    for component in params:
        mu, sigma, w = component
        # Gaussian kernel exponent and its normalizing constant, kept separate
        # so the final expression reads as weight * pdf.
        exponent = -(x - mu)**2 / (2 * sigma**2)
        normalizer = np.sqrt(2 * np.pi) * sigma
        total += w * np.exp(exponent) / normalizer
    return total

# Example usage: draw 10,000 read lengths from a three-component Gaussian mixture.
n_samples = 10000
# Each tuple is (mean, std, weight); the three weights here sum to 1.0.
params = [
    (22, 2, 0.3),   # Component centered at 22 with std 2
    (26, 1, 0.4),   # Narrowest component, centered at 26, with the largest weight
    (30, 3, 0.3)    # Broadest component, centered at 30 (a single Gaussian, not bimodal)
]

read_lengths = simulate_read_lengths(n_samples, params)

In [14]:
import numpy as np

def simulate_normal_distributions(n_samples, n_distributions, min_length=18, max_length=35):
    """
    Simulate normally distributed read length distributions with varying standard deviations.

    Each distribution gets its own mean, drawn uniformly from the integers in
    [min_length, max_length), and its own standard deviation, drawn uniformly
    from [1, 5). Continuous draws are rounded to the nearest integer and then
    clipped to [min_length, max_length], so out-of-range draws accumulate at
    the two boundary lengths.

    Args:
        n_samples (int): Number of read lengths to simulate per distribution.
        n_distributions (int): Number of distributions to simulate.
        min_length (int): Minimum read length (inclusive).
        max_length (int): Maximum read length (inclusive).

    Returns:
        list: List of integer arrays, each containing simulated read lengths for one distribution.
    """
    distributions = []
    for _ in range(n_distributions):
        mean = np.random.randint(min_length, max_length)
        std_dev = np.random.uniform(1, 5)
        raw_lengths = np.random.normal(loc=mean, scale=std_dev, size=n_samples)
        # Round to nearest before casting: a bare astype(int) truncates and
        # would shift every simulated length down by about half a base.
        rounded_lengths = np.rint(raw_lengths)
        read_lengths = np.clip(rounded_lengths, min_length, max_length).astype(int)
        distributions.append(read_lengths)
    return distributions

# Example usage: 100 independent distributions of 1,000 read lengths each.
# NOTE: this rebinds n_samples, which the mixture example above set to 10000.
n_samples = 1000
n_normal_distributions = 100
normal_distributions = simulate_normal_distributions(n_samples, n_normal_distributions)


In [19]:
import plotly.graph_objects as go

# Render the first 9 simulated distributions, each as its own standalone
# histogram figure (separate figures, not a shared subplot grid).

for i in range(9):
    fig = go.Figure()

    # histnorm='probability' scales bar heights to the fraction of samples per bin.
    fig.add_trace(go.Histogram(x=normal_distributions[i], histnorm='probability'))

    fig.update_layout(
        title='Simulated Read Length Distributions',
        xaxis_title='Read Length',
        yaxis_title='Density',
        template='plotly_dark',
        barmode='overlay'
    )

    fig.show()
