In [115]:
import numpy as np
from clock_project.simulation.magnitude_quantification import calculate_ENS, calculate_non_stationarity
from clock_project.maths.evolutionary_rate import calculate_stationary_distribution, matrix_calibration
from clock_project.simulation.wts import SeqSimulate, join_number_to_base_cogent3, taxanomic_triple_simulation, generate_rate_matrix
from cogent3.maths.measure import jsd
import scipy.stats as stats
from scipy.stats import entropy
from numpy.linalg import norm
import plotly.express as px
from cogent3 import make_seq, make_aligned_seqs

In [116]:
pi_balance = [0.25, 0.25, 0.25, 0.25]
p0_exmaple = [0.14380729, 0.13524402, 0.54646076, 0.17448793]
Q1_example = [[-1.0893902421302715,
   0.23524442440577992,
   0.6073829600692169,
   0.2467628576552747],
  [0.11923074320679726,
   -1.0039745326954068,
   0.37536091281124484,
   0.5093828766773647],
  [0.021233576733640704,
   0.0580102551134161,
   -0.4552505412791451,
   0.3760067094320883],
  [0.7389261227883659,
   0.4026144174860118,
   0.6342280785996449,
   -1.7757686188740225]]

In [117]:
t = 0.3
Q1 = generate_rate_matrix()
p0 = calculate_stationary_distribution(np.array(Q1))
ens1 = calculate_ENS(p0, Q1, t)
nabla1 = calculate_non_stationarity(p0, np.array(Q1), t)
ens_differnce_list = []
nabla_list = []
jsd_p0_p2_list = []
for i in range(2000):
    Q2 = generate_rate_matrix()
    p2 = calculate_stationary_distribution(np.array(Q2))
    jsd_p0_p2 = jsd(p0, p2)
    ens2 = calculate_ENS(p0, np.array(Q2), t)
    nabla2 = calculate_non_stationarity(p0, np.array(Q2), t)
    ens_difference = np.sqrt((ens1-ens2)**2)
    ens_differnce_list.append(ens_difference)
    nabla_list.append(abs(nabla2))
    jsd_p0_p2_list.append(jsd_p0_p2)

tau1, p_value1 = stats.kendalltau(jsd_p0_p2_list, ens_differnce_list)
tau2, p_value2 = stats.kendalltau(nabla_list, ens_differnce_list)
tau1, p_value1, tau2, p_value2

(0.1976968484242121,
 4.493253136935009e-40,
 0.369911955977989,
 1.0732752469371285e-135)

In [118]:
# fix = px.scatter(x = nabla_list, y = ens_differnce_list, labels={'x':'Non-stationarity', 'y':'ENS difference'}, title= "Relationship between molecular time departure and non-statioanirty")
# fix.show()
# fix.write_image('Relationship between molecular time departure and non-statioanirty.pdf')
jsd_fig = px.scatter(x = jsd_p0_p2_list, y = ens_differnce_list, labels={'x':'JSD (pi(0) vs pi2)', 'y':'ENS difference'}, title= "Relationship between molecular time departure and JSD (pi(0) vs pi2)")
jsd_fig.show()
# fix.write_image('Relationship between molecular time departure and non-statioanirty.pdf')

In [119]:
fix = px.scatter(x = nabla_list, y = ens_differnce_list, labels={'x':'nabla2', 'y':'ENS difference'}, title= "Relationship between molecular time departure and non-statioanirty of species 2")
fix.show()


In [120]:
Q2_list = [[[-1.700815372167229,
   0.704120966521702,
   0.320665623553365,
   0.676028782092162],
  [0.9250911793534413,
   -1.9231510196217974,
   0.6786233779364725,
   0.31943646233188344],
  [0.36573865898744795,
   0.020091493612716473,
   -1.1136797294927945,
   0.7278495768926301],
  [0.013780195496337602,
   0.857295433169693,
   0.42586180503762494,
   -1.2969374337036554]],
 [[-1.6340787384185207,
   0.3060886134147855,
   0.783835811986692,
   0.5441543130170431],
  [0.6125361895693239,
   -2.098742581603092,
   0.6598864852531082,
   0.82631990678066],
  [0.4653269761351445,
   0.25556051544449826,
   -1.633672608067478,
   0.9127851164878351],
  [0.6119337376514099,
   0.964916210848228,
   0.14361995493063073,
   -1.7204699034302686]],
 [[-1.5700442555531402,
   0.5403284982308515,
   0.5429936336163911,
   0.4867221237058977],
  [0.8086417319073701,
   -1.8732525958180992,
   0.5502138632538319,
   0.5143970006568973],
  [0.225297069333555,
   0.3010469441423364,
   -0.7568014278204186,
   0.23045741434452716],
  [0.07107447565921014,
   0.44769248313399324,
   0.9706191287766386,
   -1.489386087569842]],
 [[-1.5599890063474877,
   0.47422225527800843,
   0.5720360435799029,
   0.5137307074895764],
  [0.2748975461232005,
   -1.9808389792039975,
   0.993955457265237,
   0.71198597581556],
  [0.1491806926269797,
   0.30593308381280526,
   -0.4852550910050752,
   0.03014131456529024],
  [0.751570849108605,
   0.3066495590026567,
   0.258123995734863,
   -1.316344403846125]],
 [[-0.8679120563499468,
   0.31817505225935006,
   0.36818811057683204,
   0.1815488935137647],
  [0.19197894539813667,
   -1.0701566436484722,
   0.15585985735787553,
   0.7223178408924601],
  [0.8199215499481236,
   0.7401773346393692,
   -2.493635877714258,
   0.933536993126765],
  [0.8656477751286108,
   0.8799045158639973,
   0.6380255301584097,
   -2.383577821151018]],
 [[-1.7424254954116525,
   0.7418235566373264,
   0.8449279322866831,
   0.15567400648764307],
  [0.15937358605225388,
   -2.1070257569095823,
   0.999743039306367,
   0.9479091315509615],
  [0.2392704777044946,
   0.35962387164132803,
   -0.7825012044205142,
   0.18360685507469152],
  [0.7125278073703369,
   0.5008900838375965,
   0.07717616723892165,
   -1.2905940584468552]],
 [[-1.403164476589738,
   0.5482289721008847,
   0.4861198464823766,
   0.3688156580064768],
  [0.5417468216239316,
   -2.1761163229358123,
   0.9002825394125432,
   0.7340869618993375],
  [0.3920055704881658,
   0.44624943708686204,
   -0.9896586811395789,
   0.151403673564551],
  [0.37374118364399167,
   0.8521342809816953,
   0.1414006152716403,
   -1.3672760798973274]],
   [[-1.1052908546099478,
  0.8602359611814168,
  0.20570908184218714,
  0.03934581158634395],
 [0.8946428913787168,
  -1.52713148749881,
  0.47000489764719694,
  0.16248369847289634],
 [0.6491716388837093,
  0.8940407261733354,
  -2.4695507103061534,
  0.9263383452491085],
 [0.20190311493008833,
  0.7683160517870256,
  0.7341225704965476,
  -1.7043417372136616]]]

In [7]:
# nabla_list_example = []
# ens_differnce_list_example = []
# ens2_list = []
# for Q2 in Q2_list:
#     ens2 = calculate_ENS(p0_exmaple, np.array(Q2), t)
#     ens2_list.append(ens2)
#     nabla2 = calculate_non_stationarity(p0_exmaple, np.array(Q2), t)
#     ens_difference = np.sqrt((ens1-ens2)**2)
#     nabla_list_example.append(nabla2)
#     ens_differnce_list_example.append(ens_difference)

In [8]:
# fig_example = px.scatter(x = nabla_list_example, y = ens_differnce_list_example, labels={'x':'Non-stationarity', 'y':'ENS difference'}, title= "Relationship between molecular time departure and non-statioanirty")
# fig_example.show()


In [121]:
def entropy_calculation(pk):
    return -np.sum(pk*np.log2(pk))

In [123]:
t = 0.3
pi_list = []
ens1_list = []
jsd_list = []
information_list = []
for i in range(1000):
    Q1 = generate_rate_matrix()
    p0 = calculate_stationary_distribution(np.array(Q1))
    pi_list.append(p0)
    ens1 = calculate_ENS(p0, Q1, t)
    ens1_list.append(ens1)
    nabla1 = calculate_non_stationarity(p0, np.array(Q1), t)

for pi in pi_list:
    jsd_list.append(jsd(pi, pi_balance))
    information_list.append(entropy_calculation(pi_balance) - entropy_calculation(pi))


cor_list = []
for (pi, ens1) in zip(pi_list, ens1_list):
    ens_difference_list = []
    nabla_list = []
    jsd_p0_p2_list = []
    for i in range(2000):
        Q2 = generate_rate_matrix()
        p2 = calculate_stationary_distribution(np.array(Q2))
        ens2 = calculate_ENS(pi, np.array(Q2), t)
        nabla2 = calculate_non_stationarity(pi, np.array(Q2), t)
        ens_difference = np.sqrt((ens1-ens2)**2)
        ens_difference_list.append(ens_difference)
        nabla_list.append(abs(nabla2))
        jsd_p0_p2 = jsd(pi, p2)
        jsd_p0_p2_list.append(jsd_p0_p2)
    cor, p_value = stats.pearsonr(nabla_list, ens_difference_list)
    cor_list.append(cor)

KeyboardInterrupt: 

In [122]:
t = 0.3
pi_list = []
ens1_list = []
jsd_list = []
information_list = []
cor_nabla_list = []
cor_jsd_list = []
for i in range(1000):
    Q1 = generate_rate_matrix()
    p0 = calculate_stationary_distribution(np.array(Q1))
    pi_list.append(p0)
    ens1 = calculate_ENS(p0, Q1, t)
    ens1_list.append(ens1)
    nabla1 = calculate_non_stationarity(p0, np.array(Q1), t)
    jsd_list.append(jsd(p0, pi_balance))
    information_list.append(entropy_calculation(pi_balance) - entropy_calculation(p0))
    ens_difference_list = []
    nabla_list = []
    jsd_p0_p2_list = []
    for i in range(2000):
        Q2 = generate_rate_matrix()
        p2 = calculate_stationary_distribution(np.array(Q2))
        ens2 = calculate_ENS(p0, np.array(Q2), t)
        nabla2 = calculate_non_stationarity(p0, np.array(Q2), t)
        ens_difference = np.sqrt((ens1-ens2)**2)
        ens_difference_list.append(ens_difference)
        nabla_list.append(abs(nabla2))
        jsd_p0_p2 = jsd(p0, p2)
        jsd_p0_p2_list.append(jsd_p0_p2)
    cor1, p_value1 = stats.pearsonr(nabla_list, ens_difference_list)
    cor2, p_value2 = stats.pearsonr(jsd_p0_p2_list, ens_difference_list)
    cor_nabla_list.append(cor1)
    cor_jsd_list.append(cor2)


KeyboardInterrupt: 

In [12]:
entropy_correlation_fig = px.scatter(x = information_list, y = cor_nabla_list, labels={'x':'Information', 'y':'Pearson correlation factor'}, title= None)
# Update layout with labels and title
entropy_correlation_fig.update_layout(
    template='plotly_white',
    margin=dict(l=20, r=20, t=50, b=20),
    autosize=True,
    yaxis_title_font={'size': 20},  # Bigger x-axis title font size
    xaxis_title_font={'size': 20},  # Bigger x-axis title font size
    width=None 
)
entropy_correlation_fig.show()
entropy_correlation_fig.write_image("../../results/figures/Correlation_vs_information.pdf")


In [13]:
correlation_figure = px.violin(abs(np.array(cor_nabla_list)))
correlation_figure

In [201]:
nabla_list = []
ens_differnce_list = []
Q1 = generate_rate_matrix()
t = 0.2
jsd_value = jsd(p0_exmaple, pi_balance)
ens1 = calculate_ENS(p0_exmaple, Q1, t)
nabla1 = calculate_non_stationarity(p0_exmaple, np.array(Q1), t)
nabla_list_new = []
p1 = calculate_stationary_distribution(np.array(Q1))
jsd_p1_p2_list = []
for i in range(2000):
    Q2 = generate_rate_matrix()
    p2 = calculate_stationary_distribution(np.array(Q2))
    ens2 = calculate_ENS(p0_exmaple, np.array(Q2), t)
    nabla2 = calculate_non_stationarity(p0_exmaple, np.array(Q2), t)
    ens_difference = np.sqrt((ens1-ens2)**2)
    ens_differnce_list.append(ens_difference)
    Q1c = matrix_calibration(Q1)
    Q2c = matrix_calibration(Q2)
    factor = norm(np.array(Q2c)-np.array(Q1c))/(1+norm(np.array(Q2c)-np.array(Q1c)))
    nabla_list_new.append((abs(nabla2-nabla1)))
    nabla_list.append(abs(nabla2+nabla1))
    jsd_p1_p2 = jsd(p1, p2)
    jsd_p1_p2_list.append(jsd_p1_p2)

ens_nabla_fig = px.scatter(x = nabla_list_new, y = ens_differnce_list, labels={'x': 'My-statistics', 'y': 'ENS difference'}, title='Relationship between molecular time departure and non-stationarity')

ens_nabla_fig2 = px.scatter(x = nabla_list, y = ens_differnce_list, labels={'x': 'My-statistics', 'y': 'ENS difference'}, title='Relationship between molecular time departure and non-stationarity')

ens_jsd_fig = px.scatter(x = jsd_p1_p2_list, y = ens_differnce_list, labels={'x': 'JSD between pi1 and pi2', 'y': 'ENS difference'}, title='Relationship between molecular time departure and non-stationarity')

col, p_value = stats.pearsonr(jsd_p1_p2_list, ens_differnce_list)

In [203]:
np.dot(np.diag(Q1), p0_exmaple)

-1.1580061719994819

In [205]:
p0_exmaple

[0.14380729, 0.13524402, 0.54646076, 0.17448793]

In [204]:
Q1

array([[-2.06355021,  0.46073523,  0.7666536 ,  0.83616139],
       [ 0.54841625, -1.79918958,  0.51330384,  0.73746949],
       [ 0.39719357,  0.1322696 , -0.58364758,  0.05418441],
       [ 0.09522272,  0.92854349,  0.68971895, -1.71348516]])

In [202]:
ens_nabla_fig.show()

In [16]:
# simulation_ens_list = []
# sequence_list = []
# for Q2 in Q2_list:
#     simulator = SeqSimulate(np.array(Q2), 3000, 50, 1, p0_exmaple)
#     results = simulator.main(max_time = 0.2)
#     sequence = results[0][-1]
#     sequence_list.append(sequence)
#     result = simulator.average_substitution(max_time = 0.2)
#     simulation_ens_list.append(result[1])
# cogent3_seqs = {}
# for i, seq in enumerate(sequence_list):
#     cogent3_seq = join_number_to_base_cogent3(seq)
#     cogent3_seqs[f'seq{i}'] = cogent3_seq

# cogent3_seqs_align = make_aligned_seqs(data=cogent3_seqs, moltype="dna")
# cogent3_seqs_align

In [17]:
# np.corrcoef(ens_differnce_list, nabla_list)[0][1]

# rho, p_value_r = stats.spearmanr(nabla_list, ens_differnce_list)
# tau, p_value_t = stats.kendalltau(nabla_list, ens_differnce_list)
# import pingouin as pg

# # Assuming 'x' and 'y' are your data variables as defined above
# result = pg.corr(nabla_list, ens_differnce_list, method='bicor')

Conduct experiment on random non-stationary pairs 

In [65]:
t = 1.5
clock = t
nabla1_list = []
nabla2_list = []
num_process = 3000
ens1_list = []
ens2_list = []
Q_pair_list = []
nabla_list = []
ens_difference_list = []
ens_difference1_list = []
ens_difference2_list = []

for i in range(num_process):
    Q1 = generate_rate_matrix()
    Q2 = generate_rate_matrix()
    Q_pair_list.append((Q1, Q2))

for Q1, Q2 in Q_pair_list:
    ens1 = calculate_ENS(p0_exmaple, Q1, t)
    ens2 = calculate_ENS(p0_exmaple, Q2, t)
    nabla1 = calculate_non_stationarity(p0_exmaple, Q1, t)
    nabla2 = calculate_non_stationarity(p0_exmaple, Q2, t)
    ens_difference1 = np.sqrt((ens1-clock)**2)
    ens_difference2 = np.sqrt((ens2-clock)**2)
    ens_difference = np.sqrt((ens1-ens2)**2)
    ens_difference_list.append(ens_difference)
    ens_difference1_list.append(ens_difference1)
    ens_difference2_list.append(ens_difference2)

    ens1_list.append(ens1)
    nabla1_list.append(nabla1)
    ens2_list.append(ens2)
    nabla2_list.append(nabla2)
    factor = norm(np.array(Q2)-np.array(Q1))/(1+norm(np.array(Q2)-np.array(Q1)))
    nabla_list.append(factor*(abs(nabla2+nabla1)))


    


In [19]:
np.mean(ens2_list)

1.0185680590930006

In [195]:
non_stationary_pair_fig = px.scatter(x = nabla1_list, y = ens1_list, labels={'x':'nabla', 'y':'ENS difference'}, title= "Relationship between molecular time departure and non-statioanirty of 2 species")
non_stationary_pair_fig.show()

ValueError: All arguments should have the same length. The length of argument `y` is 1000, whereas the length of  previously-processed arguments ['x'] is 3000

Simulate the volution of 3 sequences and do the model fitting

In [21]:
interesting_Q_list = [[[-0.43467944426062116,
  0.17224055426099708,
  0.16310857726796912,
  0.09933031273165498],
 [0.8744884057309638,
  -2.508370983023648,
  0.658828378197184,
  0.9750541990955003],
 [0.37647486131169944,
  0.5579123848326735,
  -1.236392734665995,
  0.3020054885216221],
 [0.55032999104189,
  0.5277845913495285,
  0.2731367973092342,
  -1.3512513797006527]], 
  
  [[-1.0324510784624759,
  0.4705794305451246,
  0.1630045451999826,
  0.39886710271736864],
 [0.9008293238961476,
  -1.5034040948261855,
  0.38980500390391093,
  0.212769767026127],
 [0.2766980368416016,
  0.9871872426447907,
  -1.4601095568968394,
  0.19622427741044704],
 [0.9317938668773915,
  0.6880911187655664,
  0.18876499162501448,
  -1.8086499772679723]], 
  
  [[-0.0836110718980669,
  0.02928686007361185,
  0.04357023499776807,
  0.010753976826686976],
 [0.42617291612837976,
  -1.407700943023633,
  0.3404865281419312,
  0.641041498753322],
 [0.4709650739106473,
  0.7359344407599814,
  -1.270471438911452,
  0.06357192424082321],
 [0.5738417345245591,
  0.9707017736872785,
  0.8269744361549397,
  -2.371517944366777]], 
  
  [[-1.3874938561237078,
  0.35200366828334734,
  0.5322473376832975,
  0.5032428501570629],
 [0.12568809061238725,
  -0.22806904582294607,
  0.06943901192785681,
  0.032941943282702],
 [0.48709368515201384,
  0.33827174062165477,
  -0.854879002815376,
  0.029513577041707324],
 [0.8536063381533379,
  0.7822134852029212,
  0.6494128004287897,
  -2.2852326237850487]], 
  
  [[-2.136629625047819,
  0.9700439277212034,
  0.9157328283601847,
  0.2508528689664307],
 [0.10269693977219942,
  -0.2924420974628559,
  0.16441136011823673,
  0.02533379757241972],
 [0.9094847232073033,
  0.28484073919589353,
  -1.5520634331233163,
  0.3577379707201194],
 [0.3652230897725877,
  0.661147712658651,
  0.011537216013910346,
  -1.0379080184451492]], 
  
  [[-2.136629625047819,
  0.9700439277212034,
  0.9157328283601847,
  0.2508528689664307],
 [0.10269693977219942,
  -0.2924420974628559,
  0.16441136011823673,
  0.02533379757241972],
 [0.9094847232073033,
  0.28484073919589353,
  -1.5520634331233163,
  0.3577379707201194],
 [0.3652230897725877,
  0.661147712658651,
  0.011537216013910346,
  -1.0379080184451492]], 

[[-0.7070916577603472,
  0.18599021415027228,
  0.45684711552471124,
  0.0642543280853637],
 [0.09534346457249147,
  -0.7747103246960625,
  0.4658892931984096,
  0.21347756692516137],
 [0.07026922858025701,
  0.1998051126643552,
  -0.34182192709856724,
  0.071747585853955],
 [0.8926981689288437,
  0.03439508051023827,
  0.3950013493306266,
  -1.3220945987697086]]]

In [22]:
Q1_test = generate_rate_matrix()

Q2_test = generate_rate_matrix()
Q3_test = generate_rate_matrix()

p0_test = p0_exmaple

In [23]:
from cogent3.evolve.models import GN
from cogent3 import get_app, make_tree

aln, ens = taxanomic_triple_simulation(p0_test, Q1_test, Q2_test, Q3_test, 0.2, 0.4, 10000, 1, 123)
tree_topology = make_tree("((ingroup_edge1,ingroup_edge2),outgroup_edge3)")
GN_model = get_app("model", sm = "GN", tree = tree_topology, time_het = "max",
                show_progress = False)
res = GN_model(aln)


In [24]:
ens

[0.3807, 0.2056, 0.2879, 0.7168]

In [25]:
p0_exmaple

[0.14380729, 0.13524402, 0.54646076, 0.17448793]

In [27]:
matrix_dict = {n: res.lf.get_rate_matrix_for_edge(n, calibrated = True) for n in aln.names}