# Triplet Periodicity Metric Investigation

Investigation of how to devlop a metric to summaries triplet periodicity accurately 

## Import Functions

In [6]:
from RiboMetric.modules import (
    read_frame_distribution,
    a_site_calculation,
)

import pandas as pd
import math

In [7]:
path_to_test_data = "/home/jack/projects/RibosomeProfiler/tests/test_data/parsed_test.csv"

In [8]:
read_df_pre = pd.read_csv(path_to_test_data)
read_df = read_df_pre.loc[
    read_df_pre.index.repeat(read_df_pre["count"])
].reset_index(drop=True)
read_frame_dict = read_frame_distribution(a_site_calculation(read_df))

## Import the metric function for further modification 

In [24]:
def read_frame_distribution_metric(
    read_frame_distribution: dict,
        ) -> float:
    """       
    Calculate the read frame distribution metric from the output of
    the read_frame_distribution module.

    This metric is the Shannon entropy of the read frame distribution 

    Inputs:
        read_frame_distribution: Dictionary containing the output of the
                read_frame_distribution module
        
    Outputs:
        read_frame_distribution_metric: Shannon entropy of the read frame
                distribution
    """

    pseudocount = 1e-100  # to avoid log(0)

    entropies = {}
    for read_length in read_frame_distribution:
        total_count = sum(read_frame_distribution[read_length].values())
        max_entropy = math.log2(len(read_frame_distribution[read_length]))
        entropy = 0.0
        probs = []
        for frame, count in read_frame_distribution[read_length].items():
            prob = (count + pseudocount) / total_count
            probs.append(prob)
            entropy += max(-(prob * math.log2(prob)), 0.0)
        score = (max_entropy - entropy)/max_entropy

        entropies[read_length] = (score, total_count, probs)
    weighted_sum = 0.0
    for i in entropies:
        weighted_sum += entropies[i][0] * entropies[i][1]
    weighted_score = weighted_sum / sum([entropies[x][1] for x in entropies])
    return entropies

In [23]:
read_frame_dict = {
    28: {0: 10000, 1: 1, 2: 1},
    29: {0: 1, 10000: 1, 2: 1},
    30: {0: 1, 1: 1, 2: 10000},
    31: {0: 5000, 1: 5000, 2: 1},
    32: {0: 1, 1: 5000, 2: 5000},
    33: {0: 100, 1: 200, 2: 100},
    34: {0: 1100, 1: 10000, 2: 2200},
    35: {0: 1000, 1: 1000, 2: 1000},
}

In [31]:
read_frame_metric = read_frame_distribution_metric(read_frame_dict)

In [27]:
def max_min_normalisation(
    value: float,
    min_value: float,
    max_value: float,
    ) -> float:
    """
    Normalise a value between 0 and 1 based on the minimum and maximum
    values of a set of values.

    Inputs:
        value: Value to be normalised
        min_value: Minimum value of the set of values
        max_value: Maximum value of the set of values

    Outputs:
        normalised_value: Normalised value between 0 and 1
    """
    normalised_value = (value - min_value) / (max_value - min_value)
    return normalised_value

In [36]:
for i in read_frame_metric:
    print("len: ", i, "score: ", read_frame_metric[i][0], "Normalised score: ", max_min_normalisation(read_frame_metric[i][0], 0.2, 1))

len:  28 score:  0.9981415830150691 Normalised score:  0.9976769787688363
len:  29 score:  0.0 Normalised score:  -0.25
len:  30 score:  0.9981415830150691 Normalised score:  0.9976769787688363
len:  31 score:  0.3682040362414955 Normalised score:  0.21025504530186936
len:  32 score:  0.3682040362414956 Normalised score:  0.2102550453018695
len:  33 score:  0.05360536964281378 Normalised score:  -0.18299328794648276
len:  34 score:  0.34627280220652973 Normalised score:  0.18284100275816215
len:  35 score:  0.0 Normalised score:  -0.25


In [37]:
def max_min_2(x):
    # Define the minimum and maximum values for the original range
    min_value = 0.0  # Minimum possible value of the metric
    max_value = 1.0  # Maximum possible value of the metric

    # Define the new minimum and maximum values for the transformed range
    new_min = 0.2  # The desired minimum value in the transformed range
    new_max = 1.0  # The desired maximum value in the transformed range

    # Apply the rescaling transformation
    transformed_x = ((x - min_value) / (max_value - min_value)) * (new_max - new_min) + new_min

    return transformed_x


In [38]:
for i in read_frame_metric:
    print("len: ", i, "score: ", read_frame_metric[i][0], "Normalised score: ", max_min_2(read_frame_metric[i][0]))

len:  28 score:  0.9981415830150691 Normalised score:  0.9985132664120553
len:  29 score:  0.0 Normalised score:  0.2
len:  30 score:  0.9981415830150691 Normalised score:  0.9985132664120553
len:  31 score:  0.3682040362414955 Normalised score:  0.49456322899319644
len:  32 score:  0.3682040362414956 Normalised score:  0.4945632289931965
len:  33 score:  0.05360536964281378 Normalised score:  0.24288429571425105
len:  34 score:  0.34627280220652973 Normalised score:  0.4770182417652238
len:  35 score:  0.0 Normalised score:  0.2


In [40]:
for i in read_frame_metric:
    print("len: ", i, "score: ", read_frame_metric[i][0], "Normalised score: ", math.sqrt(read_frame_metric[i][0]))

len:  28 score:  0.9981415830150691 Normalised score:  0.9990703593917042
len:  29 score:  0.0 Normalised score:  0.0
len:  30 score:  0.9981415830150691 Normalised score:  0.9990703593917042
len:  31 score:  0.3682040362414955 Normalised score:  0.6067981841118969
len:  32 score:  0.3682040362414956 Normalised score:  0.606798184111897
len:  33 score:  0.05360536964281378 Normalised score:  0.23152833442758963
len:  34 score:  0.34627280220652973 Normalised score:  0.5884494899365023
len:  35 score:  0.0 Normalised score:  0.0
