In [1]:
from cogent3 import load_aligned_seqs, get_app
import pathlib


In [6]:
# Collect every JSON alignment under the selected-alignments directory.
# The glob result is sorted so the order (and therefore the positional index
# each alignment receives in the lists built from `paths`) is reproducible
# across machines and filesystems instead of depending on directory order.
paths = sorted(pathlib.Path("../../data/dataset1_16s-10_initial_alignments/processed_data/selected_alns").glob("**/*.json"))

# cogent3 "model" app configured with the "GN" substitution model.
# NOTE(review): time_het="max" is understood to give every edge its own rate
# matrix, which is what the per-edge extraction further down relies on --
# confirm against the cogent3 `model` app documentation.
GN_model = get_app(
    "model",
    sm="GN",
    unique_trees=True,
    time_het="max",
    optimise_motif_probs=True,
    show_progress=False,
    opt_args={"max_restarts": 5},
)

# cogent3 "omit_degenerates" app, applied to each alignment before fitting.
# Gaps are treated as degenerate characters (gap_is_degen=True) and columns
# are assessed one position at a time (motif_length=1).
no_degenerates = get_app("omit_degenerates", moltype=None, gap_is_degen=True, motif_length=1)

In [9]:
def alns_to_matx_pi(path, model):
    """Fit ``model`` to the alignment at ``path`` and extract its parameters.

    The alignment is loaded with ``load_aligned_seqs`` and passed through the
    module-level ``no_degenerates`` app before being handed to ``model``.

    Parameters
    ----------
    path
        Location of the alignment file to load.
    model
        Callable that takes the filtered alignment and returns a result
        exposing ``.tree`` and ``.lf`` (e.g. the ``GN_model`` app).

    Returns
    -------
    tuple
        ``(matrices, motif_probs)`` where ``matrices`` maps each node name
        reported by the fitted tree (root excluded via ``includeself=False``)
        to the calibrated rate matrix for that edge, and ``motif_probs`` is
        whatever ``lf.get_motif_probs()`` returns for the fitted model.
    """
    filtered_aln = no_degenerates(load_aligned_seqs(path))
    fitted = model(filtered_aln)

    matrices = {}
    for edge in fitted.tree.get_node_names(includeself=False):
        matrices[edge] = fitted.lf.get_rate_matrix_for_edge(edge, calibrated=True)

    return matrices, fitted.lf.get_motif_probs()

# Fit every alignment exactly once and keep both outputs of that single fit.
# Calling alns_to_matx_pi twice per path (once per output) would double the
# cost of the model optimisation and pair rate matrices with motif
# probabilities that come from two separate optimiser runs.
matrix_dict_array = list()
motif_probs_dict_array = list()
for path in paths:
    matrices, motif_probs = alns_to_matx_pi(path, GN_model)
    matrix_dict_array.append(matrices)
    motif_probs_dict_array.append(motif_probs)


In [10]:
# Inspect the per-edge rate matrices obtained for the first alignment.
matrix_dict_array[0]

            T          C          A          G
 ---------------------------------------------
 T    -1.5788     0.2494     0.9428     0.3867
 C     0.7931    -1.1426     0.1821     0.1674
 A     0.0000     0.0980    -0.5393     0.4412
 G     0.0000     0.2226     0.6243    -0.8469
 ---------------------------------------------,
            T          C          A          G
 ---------------------------------------------
 T    -1.6392     1.0377     0.1438     0.4577
 C     0.2564    -1.1133     0.1260     0.7310
 A     0.0846     0.0966    -0.6663     0.4851
 G     0.0700     0.6191     0.0587    -0.7478
 ---------------------------------------------,
            T          C          A          G
 ---------------------------------------------
 T    -0.5081     0.0250     0.3783     0.1047
 C     1.2675    -1.6334     0.3308     0.0352
 A     0.0847     0.0000    -0.1006     0.0160
 G     0.3783     0.0325     1.0126    -1.4234
 ---------------------------------------------}

In [6]:
def process_compel_data(matrix_dict_array, motif_probs_dict_array):
    """Convert fitted parameters into plain nested lists, indexed by position.

    Parameters
    ----------
    matrix_dict_array
        Sequence of dicts, one per alignment, mapping an edge name to an
        object whose ``.array`` attribute supports ``.tolist()``.
    motif_probs_dict_array
        Sequence of objects, one per alignment, each with an ``.array``
        attribute supporting ``.tolist()``.

    Returns
    -------
    dict
        ``{i: {'motif_prob': [...], 'matrix': {edge: [[...], ...]}}}`` with
        one integer key ``i`` per entry of ``matrix_dict_array``.

    Raises
    ------
    IndexError
        If ``motif_probs_dict_array`` has fewer entries than
        ``matrix_dict_array``.
    """
    pi_as_lists = [probs.array.tolist() for probs in motif_probs_dict_array]
    q_as_lists = [
        {edge: rate_matrix.array.tolist() for edge, rate_matrix in per_aln.items()}
        for per_aln in matrix_dict_array
    ]

    # Entries are paired by position; the number of matrix dicts drives the output.
    return {
        idx: {'motif_prob': pi_as_lists[idx], 'matrix': edge_matrices}
        for idx, edge_matrices in enumerate(q_as_lists)
    }

# Bundle the fitted rate matrices and motif probabilities into one dict of
# plain lists, keyed by the position of each alignment in the input lists.
data = process_compel_data(matrix_dict_array, motif_probs_dict_array)

In [7]:
import json

# Write next to the input alignments: this is the location the simulation
# cells further down load `matrix_motif.json` from, so a fresh top-to-bottom
# run reads the file produced here rather than a stale copy elsewhere.
# Note that json.dump turns the integer keys of `data` into strings, so the
# entries are addressed as '0', '1', ... after reloading.
with open('../../data/dataset1_16s-10_initial_alignments/matrix_motif.json', 'w') as json_file:
    json.dump(data, json_file)

In [20]:
import sys
# NOTE(review): absolute, machine-specific path. Presumably it exposes the
# project's `src` directory so the `simulation` package below can be imported;
# this will not resolve on another machine -- consider installing the project
# package or deriving the path relative to the notebook.
sys.path.insert(0, '/Users/gulugulu/repos/PuningAnalysis/src')
# Project-local simulation helpers.
from simulation.waiting_time_simulator import generate_ancestor, average_substitution, get_descrip_stat, simulate_seq
from cogent3.maths.matrix_exponential_integration import expected_number_subs
import numpy as np

In [21]:
# Load the fitted parameters saved as JSON. After the JSON round trip the
# top-level keys are strings ('0', '1', ...) and all arrays are nested lists.
matrix_motif_path = '../../data/dataset1_16s-10_initial_alignments/matrix_motif.json'
with open(matrix_motif_path, 'r') as file:
    matrix_motif = json.load(file)

In [24]:
# --- Simulation inputs taken from the first fitted alignment (key '0') ---
first_entry = matrix_motif['0']
# Rate matrix of edge '200580', restored to a numpy array and wrapped in a
# dict under the key '0'.
Q_ = {'0': np.array(first_entry['matrix']['200580'])}
# Motif probabilities used to draw the ancestral sequence.
pi_0 = first_entry['motif_prob']

# --- Simulation settings ---
repeats = 100                     # passed to average_substitution for every time point
markov_order = 0
length = 1000                     # length of the ancestral sequence
time_range = [0.5, 1, 2, 3, 4]    # time values to simulate

# NOTE(review): no random seed is set anywhere visible; if generate_ancestor
# draws randomly, results will differ between runs -- confirm and seed.
ances_seq = generate_ancestor(length, pi_0)


In [25]:
# Simulation results, keyed by a "Length <length>, Time <t>" label.
results = {}

# Run the repeated simulation once per time value in `time_range`.
# Only the time varies here: the sequence length is fixed by `ances_seq`,
# which was generated from `length`, so that same variable is reused for the
# label rather than being re-assigned to a hard-coded value on every pass.
for t in time_range:
    print(t)  # progress marker
    ns_per_site_list, avg_ns_per_site = average_substitution(Q_, t, ances_seq, repeats, markov_order)
    # Store the results
    results[f"Length {length}, Time {t}"] = {
        'avg_ns_per_site': avg_ns_per_site,
        'ns_per_site_list': ns_per_site_list
    }



0.5
1
2
3
4


FileNotFoundError: [Errno 2] No such file or directory: '../../../results/length_1000.json'

In [26]:
# Save the simulation results. The output directory is created first so the
# write does not fail with FileNotFoundError when `results/` does not exist yet.
results_path = pathlib.Path('../../results/length_1000.json')
results_path.parent.mkdir(parents=True, exist_ok=True)
with open(results_path, 'w') as outfile:
    json.dump(results, outfile, indent=4)