In [1]:
from cogent3 import load_aligned_seqs, get_app, app_help, available_apps
import pathlib


In [2]:
paths = list(pathlib.Path("../data/16s-10").glob("**/*.json"))


In [3]:
path = paths[0]


In [4]:
path

PosixPath('../data/16s-10/selected_alns/6-100667_200580_279422.json')

In [5]:
aln = load_aligned_seqs(path)

In [6]:
aln

0,1
,0
200580.0,----AACGCTAGCGACAAGCCTAACACATGCAAGTTGTGCGAAGCAGCGAACGGGTGAGT
279422.0,....------G.T..GTG..------------.---------------............
100667.0,GCC..CT....TG.GGGCCG....GC.....G...CA..G.-.C.G...G....C.C...


In [7]:
app_help("omit_degenerates")
# available_apps("sample")

Overview
--------
Excludes alignment columns with degenerate characters. Can accomodate
reading frame.

Options for making the app
--------------------------
omit_degenerates_app = get_app(
    "omit_degenerates",
    moltype: Optional[str] = None,
    gap_is_degen: bool = True,
    motif_length: int = 1,
)

Parameters
----------
moltype : str
    molecular type, must be either DNA or RNA
gap_is_degen : bool
    include gap character in degenerate character set
motif_length : int
    sequences split into non-overlapping tuples of this size. If a
    tuple contains a degen character at any position the entire tuple
    is excluded

Examples
--------
Degenerate IUPAC base symbols represents a site position that can have
multiple possible nucleotides. For example, "Y" represents
pyrimidines where the site can be either "C" or "T".

Note: In molecular evolutionary and phylogenetic analyses, the gap
character "-" is considered to be any base "N".

Create sample data with degenerate characte

In [8]:
no_degenerates = get_app(
    "omit_degenerates",
    moltype = None,
    gap_is_degen = True,
    motif_length = 1,
)

In [9]:
aln1 = no_degenerates(aln)
aln1

0,1
,0
100667.0,ATGGGGGCCGAGGACGGCTCAGTAACACGTCGGTACCTACCCTCGGGAGGGGGATAACCC
279422.0,GGT.A.TGGC..A....G.G......G...G..A.T......ATCCCTAC......G.T.
200580.0,.GC.ACAAGC..A....G.G.....A.TACAA.A.T..G..T.TTA.TATTA..A..TAT


In [10]:
app_help("model")

Overview
--------
Define a substitution model + tree for maximum likelihood evaluation.

Options for making the app
--------------------------
model_app = get_app(
    "model",
    sm,
    tree=None,
    unique_trees=False,
    tree_func=None,
    name=None,
    optimise_motif_probs=False,
    sm_args=None,
    lf_args=None,
    time_het=None,
    param_rules=None,
    opt_args=None,
    lower=1e-06,
    upper=50,
    split_codons=False,
    show_progress=False,
    verbose=False,
)

Parameters
----------
sm : str or instance
    substitution model if string must be available via get_model()
tree
    if None, assumes a star phylogeny (only valid for 3 taxa). Can be a
    newick formatted tree, a path to a file containing one, or a Tree
    instance.
unique_trees : bool
    whether to specify a unique tree per alignment. Only applies if
    number of sequences equals 3.
tree_func: callable
    a callable that takes an alignment and returns a Tree instance.
    Overrides tree and unique_

In [11]:
GN_model = get_app("model", sm = "GN", 
                unique_trees = True, time_het = "max", optimise_motif_probs = True, 
                show_progress = True, opt_args = dict(max_restarts=5))

In [12]:
result = GN_model(aln1)

   0%|          |00:00<?

   0%|          |00:00<?

In [13]:
result

key,lnL,nfp,DLC,unique_Q
'GN',-3559.8498,39,True,True


In [14]:
result.lf


edge,parent,length,A>C,A>G,A>T,C>A,C>G,C>T
279422,root,0.0592,0.2535,1.1411,0.0,0.4709,0.4331,2.0511
100667,root,0.3943,0.2112,1.0599,0.1849,0.2753,1.5972,0.5602
200580,root,0.2845,0.0,0.1527,0.8086,3.1596,0.336,12.1076

G>A,G>C,G>T,T>A,T>C
1.6146,0.5758,0.0,2.4382,0.6451
0.1283,1.3527,0.153,0.3143,2.2674
9.6731,0.3106,3.6135,3.614,0.2391

A,C,G,T
0.2197,0.2469,0.3316,0.2017


In [15]:
edge_names = result.tree.get_node_names(includeself = False)
matrices = {n:result.lf.get_rate_matrix_for_edge(n, calibrated = True) for n in edge_names}
motif_probs = result.lf.get_motif_probs()

In [16]:
motif_probs


T,C,A,G
0.2017,0.2469,0.2197,0.3316


In [17]:
result.lf


edge,parent,length,A>C,A>G,A>T,C>A,C>G,C>T
279422,root,0.0592,0.2535,1.1411,0.0,0.4709,0.4331,2.0511
100667,root,0.3943,0.2112,1.0599,0.1849,0.2753,1.5972,0.5602
200580,root,0.2845,0.0,0.1527,0.8086,3.1596,0.336,12.1076

G>A,G>C,G>T,T>A,T>C
1.6146,0.5758,0.0,2.4382,0.6451
0.1283,1.3527,0.153,0.3143,2.2674
9.6731,0.3106,3.6135,3.614,0.2391

A,C,G,T
0.2197,0.2469,0.3316,0.2017


In [18]:
aln1.get_motif_probs()

{'T': 0.20274049217002238,
 'C': 0.22930648769574943,
 'A': 0.2569910514541387,
 'G': 0.31096196868008946}

In [19]:
type(motif_probs)

cogent3.util.dict_array.DictArray

In [20]:
type(list(matrices.values())[0])

cogent3.util.dict_array.DictArray

In [21]:
def to_array(darray):
    return darray.array




to_array(motif_probs)

array([0.20174636, 0.24692541, 0.21970177, 0.33162646])

In [22]:
matrices_arrays = {n: to_array(matrices[n]) for n in matrices}

In [23]:
matrices_arrays

{'279422': array([[-1.57883956e+00,  2.49420647e-01,  9.42761512e-01,
          3.86657404e-01],
        [ 7.93059922e-01, -1.14256195e+00,  1.82059184e-01,
          1.67442839e-01],
        [ 3.86657508e-07,  9.80320365e-02, -5.39262121e-01,
          4.41229698e-01],
        [ 3.86657501e-07,  2.22638516e-01,  6.24309330e-01,
         -8.46948233e-01]]),
 '100667': array([[-1.63919085,  1.03769811,  0.14382828,  0.45766446],
        [ 0.25640298, -1.1133397 ,  0.12597443,  0.73096229],
        [ 0.08461593,  0.09664797, -0.66633476,  0.48507086],
        [ 0.07000723,  0.61906724,  0.05873203, -0.7478065 ]]),
 '200580': array([[-5.08052261e-01,  2.50306145e-02,  3.78335455e-01,
          1.04686192e-01],
        [ 1.26749712e+00, -1.63343872e+00,  3.30767248e-01,
          3.51743525e-02],
        [ 8.46540546e-02,  1.04686200e-07, -1.00644002e-01,
          1.59898427e-02],
        [ 3.78286260e-01,  3.25173270e-02,  1.01264471e+00,
         -1.42344830e+00]])}

In [24]:
def alns_to_matx_pi(path, model):
    aln = load_aligned_seqs(path)
    aln1 = no_degenerates(aln)
    result = model(aln1)
    edge_names = result.tree.get_node_names(includeself = False)
    matrices = {n:result.lf.get_rate_matrix_for_edge(n, calibrated = True) for n in edge_names}
    motif_probs = result.lf.get_motif_probs()
    return matrices, motif_probs

In [25]:
maxtrix_dict_array = list()
motif_probs_dict_array = list()
for path in paths:
    maxtrix_dict_array.append(alns_to_matx_pi(paths[0], GN_model)[0])
    motif_probs_dict_array.append(alns_to_matx_pi(paths[0], GN_model)[1])

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

In [26]:
maxtrix_numpy_array = list()
motif_probs_numpy_array = list()
for matrices in maxtrix_dict_array:
    for matrix in list(matrices.values()):
        maxtrix_numpy_array.append(to_array(matrix))

for motif_prob in motif_probs_dict_array:
    motif_probs_numpy_array.append(to_array(motif_prob))



In [27]:
maxtrix_numpy_array

[array([[-1.57883956e+00,  2.49420647e-01,  9.42761512e-01,
          3.86657404e-01],
        [ 7.93059922e-01, -1.14256195e+00,  1.82059184e-01,
          1.67442839e-01],
        [ 3.86657508e-07,  9.80320365e-02, -5.39262121e-01,
          4.41229698e-01],
        [ 3.86657501e-07,  2.22638516e-01,  6.24309330e-01,
         -8.46948233e-01]]),
 array([[-1.63919085,  1.03769811,  0.14382828,  0.45766446],
        [ 0.25640298, -1.1133397 ,  0.12597443,  0.73096229],
        [ 0.08461593,  0.09664797, -0.66633476,  0.48507086],
        [ 0.07000723,  0.61906724,  0.05873203, -0.7478065 ]]),
 array([[-5.08052261e-01,  2.50306145e-02,  3.78335455e-01,
          1.04686192e-01],
        [ 1.26749712e+00, -1.63343872e+00,  3.30767248e-01,
          3.51743525e-02],
        [ 8.46540546e-02,  1.04686200e-07, -1.00644002e-01,
          1.59898427e-02],
        [ 3.78286260e-01,  3.25173270e-02,  1.01264471e+00,
         -1.42344830e+00]]),
 array([[-1.57883956e+00,  2.49420647e-01,  9.4276

In [28]:
motif_probs_numpy_array

[array([0.20174636, 0.24692541, 0.21970177, 0.33162646]),
 array([0.20174636, 0.24692541, 0.21970177, 0.33162646]),
 array([0.20174636, 0.24692541, 0.21970177, 0.33162646]),
 array([0.20174636, 0.24692541, 0.21970177, 0.33162646]),
 array([0.20174636, 0.24692541, 0.21970177, 0.33162646]),
 array([0.20174636, 0.24692541, 0.21970177, 0.33162646]),
 array([0.20174636, 0.24692541, 0.21970177, 0.33162646]),
 array([0.20174636, 0.24692541, 0.21970177, 0.33162646]),
 array([0.20174636, 0.24692541, 0.21970177, 0.33162646]),
 array([0.20174636, 0.24692541, 0.21970177, 0.33162646])]

In [29]:
maxtrix_dict_array

             T          C          A          G
  ---------------------------------------------
  T    -1.5788     0.2494     0.9428     0.3867
  C     0.7931    -1.1426     0.1821     0.1674
  A     0.0000     0.0980    -0.5393     0.4412
  G     0.0000     0.2226     0.6243    -0.8469
  ---------------------------------------------,
             T          C          A          G
  ---------------------------------------------
  T    -1.6392     1.0377     0.1438     0.4577
  C     0.2564    -1.1133     0.1260     0.7310
  A     0.0846     0.0966    -0.6663     0.4851
  G     0.0700     0.6191     0.0587    -0.7478
  ---------------------------------------------,
             T          C          A          G
  ---------------------------------------------
  T    -0.5081     0.0250     0.3783     0.1047
  C     1.2675    -1.6334     0.3308     0.0352
  A     0.0847     0.0000    -0.1006     0.0160
  G     0.3783     0.0325     1.0126    -1.4234
  ------------------------------------

In [30]:
motif_probs_dict_array

      T         C         A         G
 ------------------------------------
 0.2017    0.2469    0.2197    0.3316
 ------------------------------------,
      T         C         A         G
 ------------------------------------
 0.2017    0.2469    0.2197    0.3316
 ------------------------------------,
      T         C         A         G
 ------------------------------------
 0.2017    0.2469    0.2197    0.3316
 ------------------------------------,
      T         C         A         G
 ------------------------------------
 0.2017    0.2469    0.2197    0.3316
 ------------------------------------,
      T         C         A         G
 ------------------------------------
 0.2017    0.2469    0.2197    0.3316
 ------------------------------------,
      T         C         A         G
 ------------------------------------
 0.2017    0.2469    0.2197    0.3316
 ------------------------------------,
      T         C         A         G
 ------------------------------------
 0.201