In [1]:
import sys, os

In [2]:
sys.path.append(os.path.join(os.path.dirname(sys.path[0]),'../spectral-tree-inference/spectraltree'))

In [85]:
import numpy as np
import utils
import generation
import reconstruct_tree
import dendropy
import scipy

In [148]:
from dendropy.model.discrete import simulate_discrete_chars, Jc69, Hky85
from dendropy.calculate.treecompare import symmetric_difference

## Birth-death tree example

In [5]:
N = 400 # length of sequences
num_taxa = 32 # number of taxa
jc = generation.Jukes_Cantor() # transition matrix object
mutation_rate = [jc.p2t(0.95)]

In [8]:
reference_tree = utils.unrooted_birth_death_tree(num_taxa, birth_rate=1)

In [99]:
reference_tree

<Tree object at 0x11a860b70>

In [10]:
for x in reference_tree.preorder_edge_iter():
    x.length = 1

In [11]:
observations = generation.simulate_sequences_ordered(N, tree_model=reference_tree, seq_model=jc, mutation_rate=mutation_rate)

In [16]:
observations.shape

(32, 400)

In [13]:
snj = reconstruct_tree.SpectralNeighborJoining(reconstruct_tree.JC_similarity_matrix)
tree_rec = snj(observations, reference_tree.taxon_namespace)

In [20]:
RF,F1 = reconstruct_tree.compare_trees(tree_rec, reference_tree)
print("SNJ: ")
print("RF = ",RF)
print("F1% = ",F1)
print("")

SNJ: 
RF =  0
F1% =  100.0



In [111]:
nj = reconstruct_tree.NeighborJoining(reconstruct_tree.JC_similarity_matrix)
tree_rec = nj(observations,reference_tree.taxon_namespace)
RF,F1 = reconstruct_tree.compare_trees(tree_rec, reference_tree)
print("")
print("NJ: ")
print("RF = ",RF)
print("F1% = ",F1)
print("")


NJ: 
RF =  0
F1% =  100.0



## Read in H3N2 tree file

In [29]:
tree_path = "/Users/wangmeng/Documents/Research/Palacios/projectingvirus/data/H3N2_NY_Skygrid_cutoff10/skygrid_J2.newick"
fasta_path = "/Users/wangmeng/Documents/Research/Palacios/projectingvirus/data/alignedFasta/H3N2_NewYork.fasta"

In [33]:
H3N2_tree = dendropy.Tree.get(path=tree_path, schema="newick")

In [120]:
H3N2_tree.print_plot()

                                                                   /------- 274
                                                                   |           
                                                                   |     /- 225
/------------------------------------------------------------------+   /-+     
|                                                                  |   | \- 229
|                                                                  | /-+       
|                                                                  | | | /- 230
|                                                                  \-+ \-+     
|                                                                    |   \- 227
|                                                                    |         
|                                                                    \----- 226
|                                                                              
|---------------------------------------

In [35]:
H3N2_rna = dendropy.DnaCharacterMatrix.get(file=open(fasta_path, "r"), schema="fasta")

In [129]:
ch_list = list()
for t in H3N2_rna.taxon_namespace:
    ch_list.append([x.symbol for x in H3N2_rna[t]])

leafs_idx = [i.label[0] != " " for i in H3N2_rna.taxon_namespace]

In [82]:
np.unique(ch_list)
# https://www.bioinformatics.org/sms/iupac.html
# H: A or C or T
# K: G or T
# M: A or C
# N: any base
# R: A or G
# W: A or T
# Y: C or T

array(['-', 'A', 'C', 'G', 'H', 'K', 'M', 'N', 'R', 'T', 'W', 'Y'],
      dtype='<U1')

In [130]:
ch_list_num = np.array(ch_list)
ch_list_num = ch_list_num[leafs_idx]
ch_list_num = np.where(ch_list_num=='A', 1, ch_list_num)
ch_list_num = np.where(ch_list_num=='C', 2, ch_list_num)
ch_list_num = np.where(ch_list_num=='G', 3, ch_list_num)
ch_list_num = np.where(ch_list_num=='T', 4, ch_list_num)
ch_list_num = np.where(np.isin(ch_list_num, ['-', "H", "K", "M", "N", "R", "W", "Y"]), 
                       -1, ch_list_num)
ch_list_num = ch_list_num.astype('int')

In [131]:
ch_list_num.shape

(565, 1737)

In [83]:
def hamming_dist_missing_values(vals, missing_val =0):
    hamming_matrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(vals, metric='hamming'))
    missing_array = (vals==missing_val)
    pdist_xor = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(missing_array, lambda u,v: np.sum(np.logical_xor(u,v))))
    pdist_or = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(missing_array, lambda u,v: np.sum(np.logical_or(u,v))))

    # Fix Hamming matrix for missing values
    return (hamming_matrix*vals.shape[1] - pdist_xor) / (np.ones_like(hamming_matrix) * vals.shape[1] - pdist_or)

def JC_similarity_matrix_missing_values(observations, classes=None):
    assert classes is None
    if classes is None:
        classes = np.unique(observations)
    k = len(classes)
    hamming_matrix_corrected = hamming_dist_missing_values(observations)
    inside_log = 1 - hamming_matrix_corrected*k/(k-1)
    return inside_log**(k-1)

In [122]:
nj = reconstruct_tree.NeighborJoining(JC_similarity_matrix_missing_values)
tree_rec = nj(ch_list_num, H3N2_tree.taxon_namespace)


NJ: 
RF =  1120
F1% =  50.35460992907801



In [150]:
N = 1000
data = simulate_discrete_chars(N, H3N2_tree, Jc69(), mutation_rate = generation.Jukes_Cantor().p2t(0.95))

In [151]:
type(data)

dendropy.datamodel.charmatrixmodel.DnaCharacterMatrix

In [152]:
raxml = reconstruct_tree.RAxML()

In [153]:
raxml_tree = raxml(data)

FileNotFoundError: [Errno 2] No such file or directory: 'raxmlHPC': 'raxmlHPC'

In [132]:
nj = reconstruct_tree.NeighborJoining(reconstruct_tree.paralinear_similarity)

In [133]:
tree_rec = nj(ch_list_num, H3N2_tree.taxon_namespace)

In [134]:
tree_rec.print_plot()

                                                                   /------- 466
                                                            /------+           
                                                            |      \------- 28 
                                                    /-------+                  
                                                    |       |      /------- 147
                                                    |       \------+           
                                                    |              \------- 290
                                             /------+                          
                                             |      |              /------- 255
                                             |      |       /------+           
                                             |      |       |      \------- 546
                                             |      \-------+                  
                                        

In [135]:
RF,F1 = reconstruct_tree.compare_trees(tree_rec, H3N2_tree)
print("")
print("NJ: ")
print("RF = ",RF)
print("F1% = ",F1)
print("")


NJ: 
RF =  1124
F1% =  50.177304964539005



In [None]:
# plotly