# Initialise simulated data

In [2]:
import itertools

def generate_read_frame_distribution_permutations():
    """
    Generate all possible permutations of read frame distributions that sum to 100.
    
    Args:
        None

    Returns:
        simulated_read_frame_proportions (dict): Dictionary containing all possible read frame distributions that sum to 100.
    """
    # Generate permutations
    numbers = range(1, 101)
    permutations = []
    for perm in itertools.permutations(numbers, 3):
        if sum(perm) == 100:
            permutations.append(perm)

    # Simulate read frame proportions
    simulated_read_frame_proportions = {}
    for i, perm in enumerate(permutations):
        simulated_read_frame_proportions[i] = {0: perm[0], 1: perm[1], 2: perm[2]}

    return simulated_read_frame_proportions

In [3]:
import random

def generate_metagene(frame_ratios, start, stop, noise_factor=0.1, max_count=100):
    """
    Generate a metagene dictionary with varying degrees of periodicity based on frame ratios.
    
    Args:
        frame_ratios (dict): A dictionary representing the global ratio of reads per frame.
                             Keys are frame positions (0, 1, 2), and values are the corresponding ratios.
        start (int): The start position of the metagene.
        stop (int): The stop position of the metagene.
    
    Returns:
        dict: A dictionary representing the metagene, where keys are positions and values are counts.
    """
    metagene = {}
    
    # Initialize all positions with a small constant value
    for pos in range(start, stop):
        metagene[pos] = 1
    
    # Adjust counts based on frame ratios
    for pos in range(start, stop):
        frame = (pos - start) % 3
        metagene[pos] += int(frame_ratios[frame] * max_count)
    
    # Introduce some random noise
    for pos in range(start, stop):
        metagene[pos] += int(random.uniform(0, noise_factor * max_count))
    
    return metagene

In [4]:
start = 10
stop = 110

simulated_metagenes = {'start':{}, 'stop':{}}
simulated_metagenes['start'] = {
    i: generate_metagene(frame_ratios, start, stop) for i, frame_ratios in generate_read_frame_distribution_permutations().items()
}

In [5]:
!pip install spectrum




In [6]:
import numpy as np
from spectrum import dpss, pmtm

def compute_periodicity_score(metagene_dict, expected_period=3, NW=4, k=8):
    """
    Compute a periodicity score for the 'start' region of a metagene dictionary using a multitaper approach.

    Args:
        metagene_dict (dict): A nested dictionary with 'start' as a key, and its value as a dictionary
                              with positions on the mRNA transcript as keys and counts at that position as values.
        expected_period (int): The expected periodicity of the signal (default: 3, for codon reading frame).
        NW (float): The time-bandwidth product for the multitaper analysis (default: 4).
        k (int): The number of tapers to use in the multitaper analysis (default: 8).

    Returns:
        float: The periodicity score, which is the ratio of the amplitude of the periodic component to the
               mean amplitude of the spectrum.
    """
    multtaper_scores = {}
    for read_length in metagene_dict['start'].keys():
        if read_length % 500 == 0: 
            print(read_length, f"{round(read_length/len(metagene_dict['start'].keys()), 2) * 100} Complete")
        counts = list(metagene_dict['start'][read_length].values())

        # Convert counts to a numpy array
        ribo_seq_signal = np.array(counts)
        N = len(ribo_seq_signal)

        # Compute the discrete prolate spheroidal sequences (DPSS)
        tapers, eigen = dpss(N, NW, k)
        # Compute the multitaper spectrum
        Sk_complex, weights, eigenvalues = pmtm(ribo_seq_signal, e=eigen, v=tapers, NFFT=N, show=False)
        Sk = np.abs(Sk_complex) ** 2
        Sk = np.mean(Sk * np.transpose(weights), axis=0)

        # Identify the frequency bin corresponding to the expected periodicity
        freqs = np.linspace(0.0, 1.0 / (2.0 * N), N // 2)
        expected_freq_bin = np.argmin(np.abs(freqs - 1 / expected_period))

        # Calculate the periodicity score
        periodic_amplitude = Sk[expected_freq_bin]
        mean_amplitude = np.mean(Sk[:N // 2])
        multtaper_scores[read_length] = periodic_amplitude / mean_amplitude

    return multtaper_scores

In [7]:
multitaper_scores = compute_periodicity_score(simulated_metagenes)

0 0.0 Complete


500 11.0 Complete
1000 21.0 Complete
1500 32.0 Complete
2000 43.0 Complete
2500 53.0 Complete
3000 64.0 Complete
3500 74.0 Complete
4000 85.0 Complete
4500 96.0 Complete


In [11]:
# plot the periodicity scores

import plotly.graph_objects as go

# fig = go.Figure()

# for read_length, score in multitaper_scores.items():
#     fig.add_trace(go.Scatter(x=[read_length], y=[score], mode='markers', name=f'{read_length} nt'))

# fig.update_layout(title='Periodicity Scores for Simulated Metagenes',
                  
#                     xaxis_title='Read Length (nt)',
#                     yaxis_title='Periodicity Score',
#                     xaxis_type='log')

# fig.show()

In [13]:
multitaper_scores

{0: 1.2938844771548563e-08,
 1: 5.5831304602705264e-08,
 2: 1.9473774331528816e-07,
 3: 1.2431735575819395e-08,
 4: 5.24395338036429e-10,
 5: 3.257177780327887e-08,
 6: 3.724227126260531e-08,
 7: 8.966069723122161e-08,
 8: 9.740732554838125e-08,
 9: 1.773682144253928e-07,
 10: 4.375727612548943e-09,
 11: 1.880702466859876e-07,
 12: 2.4964377909244254e-08,
 13: 1.967437635764735e-08,
 14: 1.252382031744628e-07,
 15: 2.5764903292134052e-08,
 16: 5.127038930882147e-08,
 17: 1.6190613906726955e-07,
 18: 3.1712469471562274e-08,
 19: 9.814867868463438e-09,
 20: 2.808143585638139e-08,
 21: 2.0352490872582574e-07,
 22: 1.3242064649114385e-07,
 23: 3.0513886286799044e-08,
 24: 2.4732054589045015e-08,
 25: 8.803791562755223e-08,
 26: 3.164592371966724e-07,
 27: 2.0923262858907513e-07,
 28: 1.006650922330155e-07,
 29: 6.238131250428461e-08,
 30: 8.944604569904593e-08,
 31: 2.9804828499228586e-08,
 32: 1.8596609513833976e-07,
 33: 1.2371132027207853e-07,
 34: 3.847599428811023e-08,
 35: 8.39807287

In [12]:
# plot hist of scores 

fig = go.Figure()

fig.add_trace(go.Histogram(x=list(multitaper_scores.values()), nbinsx=50))

fig.show()