# Read Length Distribution

## Setup


In [None]:

import itertools
import numpy as np


## Simulated Inputs


In [10]:
import numpy as np

def simulate_read_lengths(n_samples, params, min_length=18, max_length=35):
    """
    Simulate read lengths from a discretised mixture of normal distributions.

    The mixture density is evaluated at every integer length in
    [min_length, max_length], the values are normalised so they sum to one,
    and read lengths are then sampled (with replacement) from those integer
    lengths using the normalised values as probabilities.

    Args:
        n_samples (int): Number of read lengths to simulate.
        params (list): List of tuples (mean, std, weight) for each component distribution.
        min_length (int): Minimum read length (inclusive).
        max_length (int): Maximum read length (inclusive).

    Returns:
        np.ndarray: Array of simulated read lengths.

    Raises:
        ValueError: If ``params`` is empty, or if the mixture density summed
            over the length range is not a positive finite number (e.g. all
            components lie far outside the range, or a std of zero was given).
    """
    # len() rather than truthiness so array-like params are accepted too.
    if len(params) == 0:
        raise ValueError("params must contain at least one (mean, std, weight) tuple")

    densities = [
        compute_mixture_density(length, params)
        for length in range(min_length, max_length + 1)
    ]
    total = sum(densities)

    # Without this guard a zero/NaN total silently yields NaN probabilities
    # (or a ZeroDivisionError), which is much harder to diagnose.
    if densities and (not np.isfinite(total) or total <= 0):
        raise ValueError(
            "mixture density over the requested length range must sum to a "
            f"positive finite value, got {total!r}"
        )

    probs = [density / total for density in densities]  # Normalize probabilities

    return np.random.choice(
        np.arange(min_length, max_length + 1),
        size=n_samples,
        p=probs,
    )

def compute_mixture_density(x, params):
    """
    Evaluate a weighted sum of normal probability densities at x.

    Args:
        x (float): Point at which to compute the density.
        params (list): List of tuples (mean, std, weight) for each component distribution.

    Returns:
        float: Density of the mixture distribution at x (0 when params is empty).
    """
    # sqrt(2*pi) is shared by every component, so compute it once.
    root_two_pi = np.sqrt(2 * np.pi)

    total = 0
    for mu, sigma, w in params:
        exponent = -(x - mu)**2 / (2 * sigma**2)
        normaliser = root_two_pi * sigma
        # Weighted normal pdf of this component, added to the running total.
        total += w * np.exp(exponent) / normaliser
    return total

# Example usage: draw 10,000 read lengths from a three-component normal mixture.
n_samples = 10000
params = [
    (22, 2, 0.3),   # (mean, std, weight): single normal component centred at 22
    (26, 1, 0.4),   # single normal component centred at 26 (narrowest, heaviest)
    (30, 3, 0.3)    # single normal component centred at 30 (widest); not itself bimodal
]

read_lengths = simulate_read_lengths(n_samples, params)

In [2]:
import numpy as np

def simulate_normal_distributions(n_samples, n_distributions, min_length=18, max_length=35):
    """
    Simulate normally distributed read length distributions with varying standard deviations.

    For each distribution an integer mean is drawn uniformly from
    [min_length, max_length] and a standard deviation uniformly from [1, 5).
    Samples are drawn from that normal distribution, rounded to the nearest
    integer and clipped to [min_length, max_length], so values that fall
    outside the range accumulate on the boundary lengths.

    Args:
        n_samples (int): Number of read lengths to simulate per distribution.
        n_distributions (int): Number of distributions to simulate.
        min_length (int): Minimum read length (inclusive).
        max_length (int): Maximum read length (inclusive).

    Returns:
        list: List of arrays, each containing simulated read lengths for one distribution.
    """
    distributions = []
    for _ in range(n_distributions):
        # randint's upper bound is exclusive; +1 makes max_length a possible
        # mean and keeps min_length == max_length from raising.
        mean = np.random.randint(min_length, max_length + 1)
        std_dev = np.random.uniform(1, 5)
        draws = np.random.normal(loc=mean, scale=std_dev, size=n_samples)
        # Round before the integer cast: a bare astype(int) truncates, which
        # shifts every simulated length downward by about half a unit.
        read_lengths = np.clip(np.rint(draws), min_length, max_length).astype(int)
        distributions.append(read_lengths)
    return distributions

# Example usage: simulate 100 independent distributions of 1,000 read lengths each.
# NOTE: this rebinds the module-level `n_samples` assigned by the earlier
# mixture example (10000 -> 1000).
n_samples = 1000
n_normal_distributions = 100
normal_distributions = simulate_normal_distributions(n_samples, n_normal_distributions)


In [3]:
import plotly.graph_objects as go

# Draw one histogram figure for each of the first (up to) nine simulated
# distributions. Each distribution gets its own figure; these are not subplots
# and the distributions are taken in order, not picked at random.
# Slicing (rather than indexing 0..8) avoids an IndexError when fewer than
# nine distributions were simulated.
for lengths in normal_distributions[:9]:
    fig = go.Figure()

    # histnorm='probability' scales bar heights to the fraction of samples per bin.
    fig.add_trace(go.Histogram(x=lengths, histnorm='probability'))

    fig.update_layout(
        title='Simulated Read Length Distributions',
        xaxis_title='Read Length',
        # Label matches histnorm='probability' (bars are fractions, not densities).
        yaxis_title='Probability',
        barmode='overlay',
    )

    fig.show()


In [5]:
!pip install polars


Collecting polars
  Obtaining dependency information for polars from https://files.pythonhosted.org/packages/c6/b5/40e452ce05a01a7cde60bbbc86e9103ef0cbc1354832a3c7ccd0daf5591a/polars-0.20.30-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading polars-0.20.30-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Downloading polars-0.20.30-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (28.3 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 28.3/28.3 MB 5.4 MB/s eta 0:00:00
Installing collected packages: polars
Successfully installed polars-0.20.30


In [12]:
import polars as pl

# Read the gzipped GENCODE GTF as a headerless, tab-separated table, skipping
# lines that start with "#". With no header, the columns are addressed by
# positional names in the next cell ("column_3", "column_9").
gtf = pl.read_csv("~/Downloads/gencode.v45.annotation.gtf.gz", has_header=False, separator="\t", comment_prefix="#")

In [19]:
# Goal: get unique protein coding transcript ids.
# Step 1: keep only the rows whose feature type (column_3) is "CDS".
gtf_cds = gtf.filter(pl.col("column_3") == "CDS")

# Step 2: take the attribute column (column_9) apart: split on ";", explode to
# one row per attribute, split each attribute on " ", and explode again to one
# row per token.
# NOTE(review): the result is a single flat column in which keys ("gene_id",
# "transcript_id", ...) and values are interleaved; it is not yet a list of
# transcript ids and nothing here de-duplicates.
# NOTE(review): the printed head still shows stray quote characters --
# presumably str.replace removes only the first '"' of each value; confirm
# against the polars docs (str.replace_all) before relying on this column.
gtf_cds = gtf_cds.select(pl.col("column_9").str.split(";").explode().str.split(" ").explode().str.replace('"', "").alias("attribute"))

gtf_cds.head()

attribute
str
"""gene_id"""
"""ENSG00000186092.7"""""
""""""
"""transcript_id"""
"""ENST00000641515.2"""""


In [21]:
import pandas as pd 

def extract_protein_coding_transcripts(gtf_file):
    """
    Collect the unique IDs of protein-coding transcripts from a GTF file.

    Only rows whose feature column is ``transcript`` are considered. A row
    counts as protein-coding when its attribute string contains either
    ``transcript_biotype "protein_coding"`` or
    ``transcript_type "protein_coding"``; the value of its ``transcript_id``
    attribute is then collected with the surrounding quotes removed.

    Args:
        gtf_file: Path or file-like object accepted by ``pandas.read_csv``.
            Lines starting with ``#`` are treated as comments.

    Returns:
        set: Unique protein-coding transcript IDs (empty if none are found).
    """
    col_names = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']

    # Only two of the nine columns are used; loading just those (as strings)
    # keeps memory down on full-size annotation files.
    df = pd.read_csv(
        gtf_file,
        sep='\t',
        comment='#',
        names=col_names,
        header=None,
        usecols=['feature', 'attribute'],
        dtype=str,
    )

    # Attribute strings of transcript rows; rows with no attribute text are
    # dropped rather than crashing the substring checks below.
    attributes = df.loc[df['feature'] == 'transcript', 'attribute'].dropna()
    if attributes.empty:
        return set()

    # Vectorised substring tests replace the per-row Python loop.
    is_protein_coding = (
        attributes.str.contains('transcript_biotype "protein_coding"', regex=False)
        | attributes.str.contains('transcript_type "protein_coding"', regex=False)
    )

    # First `transcript_id <value>` attribute in each string, quotes stripped.
    transcript_ids = attributes[is_protein_coding].str.extract(
        r'(?:^|;)\s*transcript_id\s+"?([^";\s]+)',
        expand=False,
    )

    return set(transcript_ids.dropna())

# Collect the set of protein-coding transcript IDs from the local GENCODE v45 GTF.
unique_txs = extract_protein_coding_transcripts("~/Downloads/gencode.v45.annotation.gtf.gz")

In [23]:
# Number of unique protein-coding transcript IDs in the set.
len(unique_txs)

89110

In [2]:
!pip install pyranges1

ERROR: Ignored the following versions that require a different python version: 0.1.0 Requires-Python >=3.11.0; 0.1.1 Requires-Python >=3.11.0; 0.1.2 Requires-Python >=3.12.0; 0.1.3 Requires-Python >=3.12.0; 1.0.0 Requires-Python >=3.12.0; 1.0.1 Requires-Python >=3.12.0
ERROR: Could not find a version that satisfies the requirement pyranges1 (from versions: none)
ERROR: No matching distribution found for pyranges1

In [1]:
import pyranges 

ModuleNotFoundError: No module named 'pyranges'