In [1]:
from cogent3.maths.matrix_exponential_integration import expected_number_subs
from cogent3 import get_app, load_aligned_seqs
import numpy as np

from numpy import array

# cogent3 "model" app configured with the GTR substitution model.
# NOTE(review): time_het="max" presumably gives every edge its own rate
# parameters, and opt_args is presumably forwarded to the optimiser — confirm
# against the cogent3 documentation.
GTR_model = get_app(
    "model",
    sm="GTR",
    unique_trees=True,
    time_het="max",
    optimise_motif_probs=True,
    show_progress=False,
    opt_args={"max_restarts": 5},
)

# cogent3 "omit_degenerates" app; applied to each alignment before model fitting.
# gap_is_degen=True asks the app to treat gaps as degenerate characters too.
no_degenerates = get_app("omit_degenerates", moltype=None, gap_is_degen=True, motif_length=1)



In [2]:
import pathlib

# Collect every JSON file below the data directory (recursive glob).
# Path.glob gives no ordering guarantee (it follows the filesystem's directory
# order), so the result is sorted: paths[0] and paths[1] then refer to the same
# files on every machine and every run.
paths = sorted(pathlib.Path("../data/16s-10").glob("**/*.json"))

In [3]:
def get_matrix_pi(path):
    """Fit the GTR model to one alignment and return a rate matrix and motif probs.

    The alignment at ``path`` is loaded with ``load_aligned_seqs``, passed
    through the ``no_degenerates`` app and then fitted with ``GTR_model``.

    Parameters
    ----------
    path
        Location of the alignment file to load.

    Returns
    -------
    tuple
        ``(Q, pi)`` where ``Q`` is the uncalibrated (``calibrated=False``) rate
        matrix of the first node name reported by the fitted tree (root
        excluded) and ``pi`` is the motif probabilities of the fitted
        likelihood function.
    """
    filtered_alignment = no_degenerates(load_aligned_seqs(path))
    fitted = GTR_model(filtered_alignment)
    # Only the first non-root node is used; matrices of other edges are ignored.
    first_edge = fitted.tree.get_node_names(includeself=False)[0]
    rate_matrix = fitted.lf.get_rate_matrix_for_edge(first_edge, calibrated=False)
    motif_probs = fitted.lf.get_motif_probs()
    return rate_matrix, motif_probs


In [4]:
# Fit the model to the first two alignment files found under the data directory
# and keep each fit's rate matrix and motif probabilities.
# NOTE(review): raises IndexError if fewer than two files were found; Q1/pi1/Q2/pi2
# are not referenced by the later cells visible here.
Q1, pi1 = get_matrix_pi(paths[0])
Q2, pi2 = get_matrix_pi(paths[1])

In [46]:
# Ten evenly spaced time points from 0 to 0.6 (inclusive) at which the expected
# number of substitutions is evaluated.
t_range = np.linspace(0.0, 0.6, num=10)

# Alternative symmetric matrix, currently disabled:
# R = np.array([
#     [-4.5,  2.0,  1.0,  1.5],
#     [ 2.0, -4.5,  1.5,  1.0],
#     [ 1.0,  1.5, -4.5,  2.0],
#     [ 1.5,  1.0,  2.0, -4.5]
# ])

# Hand-written 4x4 rate matrix: off-diagonals are non-negative and every row
# sums to zero.
Q = np.array(
    [
        [-1.4, 0.1, 0.4, 0.9],
        [4, -6.9, 0.9, 2],
        [6.3, 2, -11.3, 3],
        [0.1, 0.1, 0.2, -0.4],
    ],
    dtype=float,
)

# Uniform distribution over the four states; used as the non-stationary start.
i = [0.25] * 4



In [47]:
# A stationary distribution satisfies pi @ Q = 0, i.e. Q.T @ pi = 0. Appending a
# row of ones adds the normalisation constraint sum(pi) = 1 to the same system.
A = np.concatenate((Q.T, np.ones((1, Q.shape[0]))), axis=0)

# Right-hand side: zeros for the balance equations, 1 for the normalisation row.
b = np.append(np.zeros(Q.shape[0]), 1.0)

# The system has one more equation than unknowns, so solve it in the
# least-squares sense and keep only the solution vector.
pi = np.linalg.lstsq(A, b, rcond=None)[0]

print("Stationary distribution pi:", pi)

Stationary distribution pi: [0.21178161 0.02034201 0.02231267 0.74556371]


In [48]:

def calculate_d(p, R):
    """Return the Euclidean norm of the product ``p . R``.

    Parameters
    ----------
    p
        Vector (e.g. a state distribution) multiplied on the left of ``R``.
    R
        Matrix (e.g. a rate matrix).

    Returns
    -------
    float
        ``numpy.linalg.norm(numpy.dot(p, R))``; for a 1-D product this is the
        2-norm.
    """
    product = np.dot(p, R)
    return np.linalg.norm(product)

# Grid search for the distribution p = (w1, w2, w3, w4) that maximises
# calculate_d(p, Q), subject to sum(p) = 1 and every component >= min_val.
resolution = 10  # Number of grid points per nested axis
min_val = 0.01  # Minimum value for each element in p to avoid extreme cases
float_tol = 1e-12  # Slack for round-off when testing the last component
largest_d = -np.inf
best_p = None

# The loop variables are deliberately NOT called a/b/c/d: `b` is the right-hand
# side vector of the stationary-distribution solve, and reusing the name here
# would silently overwrite it.
for w1 in np.linspace(min_val, 1 - 3*min_val, resolution):
    for w2 in np.linspace(min_val, 1 - 2*min_val - w1, resolution):
        for w3 in np.linspace(min_val, 1 - min_val - w1 - w2, resolution):
            w4 = 1 - (w1 + w2 + w3)  # Remaining value to ensure sum(p) = 1
            # At the upper end of the w3 range w4 equals min_val only up to
            # round-off, so compare with a tolerance instead of dropping
            # boundary points at random.
            if w4 >= min_val - float_tol:
                candidate = np.array([w1, w2, w3, w4])
                current_d = calculate_d(candidate, Q)
                if current_d > largest_d:
                    largest_d = current_d
                    best_p = candidate

print("Best p:", best_p)
print("Largest d:", largest_d)

Best p: [0.01 0.01 0.97 0.01]
Largest d: 13.023529245177745


In [49]:
# Expected number of substitutions at each time point in t_range for matrix Q.
# NOTE(review): the first argument of expected_number_subs is presumably the
# starting state distribution — confirm against cogent3.

# Start from pi, the stationary distribution solved for Q.
s_exp_numb_sub_value_1 = [expected_number_subs(pi, Q, t) for t in t_range]

# Start from i, the uniform (non-stationary) distribution.
n_exp_numb_sub_value_1 = [expected_number_subs(i, Q, t) for t in t_range]


# s_exp_numb_sub_value_2 = list()
# n_exp_numb_sub_value_2 = list()

# for t in t_range:
#     s_exp_numb_sub_value_2.append(expected_number_subs(i, R, t))


# for t in t_range:
#     n_exp_numb_sub_value_2.append(expected_number_subs(best_p, R, t)) 



In [50]:
import plotly.express as px

# Build long-format records (one row per series per time point) for px.line.
# zip() pairs each time point with its two values, so no index variable is
# needed. The previous `for i, t in enumerate(...)` overwrote the module-level
# `i` (the uniform start distribution passed to expected_number_subs), which
# broke any re-run of the cell that computes n_exp_numb_sub_value_1.
data1 = []
for t, stationary_value, non_stationary_value in zip(
    t_range, s_exp_numb_sub_value_1, n_exp_numb_sub_value_1
):
    data1.append({'t': t, 'value': stationary_value, 'series': 'Stationary1'})
    data1.append({'t': t, 'value': non_stationary_value, 'series': 'Non-Stationary1'})

# Plotting with Plotly Express: one line per 'series' value.
fig = px.line(data1, x='t', y='value', color='series', 
              labels={'value': 'the expected number of substitutions'}, 
              title='Expected Number of Substitutions Over Time')
fig.update_traces(line=dict(width=1))

fig.show()





In [51]:
from scipy.stats import linregress

# Fit a straight line (value against t) to each series; linregress results
# unpack as (slope, intercept, r-value, p-value, standard error of the slope).
fit_s = linregress(t_range, s_exp_numb_sub_value_1)
fit_n1 = linregress(t_range, n_exp_numb_sub_value_1)

# Stationary start
slope_s, intercept_s, r_value_s, p_value_s, std_err_s = fit_s
print("Linear regression for s_exp_numb_sub_value:")
print(f"Slope: {slope_s}, Intercept: {intercept_s}, R-squared: {r_value_s**2}, P-value: {p_value_s}")

# Non-stationary (uniform) start
slope_n1, intercept_n1, r_value_n1, p_value_n1, std_err_n1 = fit_n1
print("Linear regression for n_exp_numb_sub_value_1:")
print(f"Slope: {slope_n1}, Intercept: {intercept_n1}, R-squared: {r_value_n1**2}, P-value: {p_value_n1}")

Linear regression for s_exp_numb_sub_value:
Slope: 0.9872128037718817, Intercept: 1.6653345369377348e-16, R-squared: 1.0, P-value: 4.375000000000076e-80
Linear regression for n_exp_numb_sub_value_1:
Slope: 1.8963387714085524, Intercept: 0.16053826547633665, R-squared: 0.9598826580619967, P-value: 7.199087852630193e-07


In [52]:
# import plotly.express as px

# data2 = []
# for i, t in enumerate(t_range):
#     data2.append({'t': t, 'value': s_exp_numb_sub_value_2[i], 'series': 'Stationary2'})
#     data2.append({'t': t, 'value': n_exp_numb_sub_value_2[i], 'series': 'Non-Stationary2'})

# # Plotting with Plotly Express
# fig = px.line(data2, x='t', y='value', color='series', 
#               labels={'value': 'the expected number of substitutions'}, 
#               title='Expected Number of Substitutions Over Time')
# fig.update_traces(line=dict(width=1))

# fig.show()

In [53]:
# from scipy.stats import linregress

# # Perform linear regression for s_exp_numb_sub_value
# slope_s, intercept_s, r_value_s, p_value_s, std_err_s = linregress(t_range, s_exp_numb_sub_value_2)
# print("Linear regression for s_exp_numb_sub_value:")
# print(f"Slope: {slope_s}, Intercept: {intercept_s}, R-squared: {r_value_s**2}, P-value: {p_value_s}")

# # Perform linear regression for n_exp_numb_sub_value_1
# slope_n1, intercept_n1, r_value_n1, p_value_n1, std_err_n1 = linregress(t_range, n_exp_numb_sub_value_2)
# print("Linear regression for n_exp_numb_sub_value_1:")
# print(f"Slope: {slope_n1}, Intercept: {intercept_n1}, R-squared: {r_value_n1**2}, P-value: {p_value_n1}")