# Quantify the runtime of RaXML

Mamie Wang 2020/06/18

The goal of this analysis is to find where spectral tree clustering breaks and quantify the runtime of each step. 

In [27]:
import sys, os

sys.path.append(os.path.join(os.path.dirname(sys.path[0]),'../spectral-tree-inference/spectraltree'))

import numpy as np
import utils
import generation
import reconstruct_tree
import dendropy
import scipy
import time
from itertools import product
import matplotlib.pyplot as plt

from dendropy.model.discrete import simulate_discrete_chars, Jc69, Hky85
from dendropy.calculate.treecompare import symmetric_difference
import character_matrix
import copy
import os, sys

# https://stackoverflow.com/questions/8391411/suppress-calls-to-print-python
class HiddenPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout

In [9]:
m = 128 # large tree: 512 nodes, mutation rate = 0.1
catepillar_tree = utils.lopsided_tree(m, edge_length = 1)
threshold = 64

In [10]:
data_HKY = simulate_discrete_chars(1000, catepillar_tree, Hky85(kappa = 1), mutation_rate=0.1)
ch_list = list()
for t in data_HKY.taxon_namespace:
    ch_list.append([x.symbol for x in data_HKY[t]])
ch_arr = np.array(ch_list)

In [11]:
spectral_method = reconstruct_tree.SpectralTreeReconstruction(reconstruct_tree.SpectralNeighborJoining, reconstruct_tree.HKY_similarity_matrix_missing_data)
with HiddenPrints():
    start_time = time.time()
    tree_rec = spectral_method.deep_spectral_tree_reonstruction(ch_arr, reconstruct_tree.HKY_similarity_matrix_missing_data, 
                                                            taxon_namespace = catepillar_tree.taxon_namespace, 
                                                            threshhold = threshold, min_split = 5)
    runtime = time.time() - start_time

RF,F1 = reconstruct_tree.compare_trees(tree_rec, catepillar_tree)

In [12]:
print("--- %s seconds ---" % runtime)
print("RF = ",RF)
print("F1% = ",F1) 

--- 1.9397449493408203 seconds ---
RF =  86
F1% =  83.07086614173227


## Check STR + RaXML time

We need to first find a set of parameter that breaks the STR + RaXML. 

In [5]:
m = 512
n = 200
threshold = 256

binary_tree = utils.balanced_binary(m, edge_length = 0.5)
data_HKY = simulate_discrete_chars(n, binary_tree, Hky85(kappa = 1), mutation_rate=0.1)

ch_list = list()
for t in data_HKY.taxon_namespace:
    ch_list.append([x.symbol for x in data_HKY[t]])
ch_arr = np.array(ch_list)

In [6]:
spectral_method = reconstruct_tree.SpectralTreeReconstruction(reconstruct_tree.RAxML,
                                                              reconstruct_tree.HKY_similarity_matrix)
start_time = time.time()
tree_rec = spectral_method.deep_spectral_tree_reonstruction(ch_arr, reconstruct_tree.HKY_similarity_matrix, 
                                                            taxon_namespace = binary_tree.taxon_namespace, 
                                                            threshhold = threshold,
                                                            raxml_args = "-T 2 --HKY85 -c 1", min_split = 5)
runtime = time.time() - start_time
print("--- %s seconds ---" % runtime)

partition
L1 size:  256
L2 size:  256
--- 70.92654275894165 seconds ---
--- 70.10468292236328 seconds ---
one - merging:  128  out of:  256
one - merging:  192  out of:  256
--- 160.1297469139099 seconds ---


In [7]:
RF,F1 = reconstruct_tree.compare_trees(tree_rec, binary_tree)

print("RF = ",RF)
print("F1% = ",F1) 

RF =  2
F1% =  99.90215264187869


The reconstructed tree have some mistakes. Let's check which part breaks. We can use previous test to test splitting and merging. 

## Splitting test

In [14]:
def check_is_bipartition(tree, bool_partition):
    bipartitions = [str(x)[::-1] for x in tree.encode_bipartitions()]
    partition_1 = "".join(list(bool_partition.astype('int').astype('str')))
    partition_2 = "".join(list((1 - bool_partition).astype('int').astype('str')))
    is_bipartition = (partition_1 in bipartitions) or (partition_2 in bipartitions)
    return is_bipartition

def test_split(sim, tree, min_split = 5):
    _, eigvec = np.linalg.eigh(sim)
    partition = reconstruct_tree.partition_taxa(eigvec[:,-2], sim, 1, min_split)
    is_bipartition = check_is_bipartition(tree, partition)
    return partition, is_bipartition

In [15]:
start_time = time.time()
sim = reconstruct_tree.HKY_similarity_matrix(ch_arr)
runtime = time.time() - start_time
print("--- %s seconds ---" % runtime)

partition, is_biparition = test_split(sim, binary_tree)

--- 17.784427165985107 seconds ---


In [17]:
is_biparition

True

In [22]:
np.sum(partition)

256

The splitting correct and is balanced. The 30 sec of runtime has about 18 sec coming from initial computing of the HKY similarity matrix.

## Merging test

In [30]:
def test_merge(partition, tree):
    taxon_namespace_label = np.array([x.label for x in tree.taxon_namespace])
    left_namespace = list(taxon_namespace_label[np.where(partition)[0]])
    left_taxa = dendropy.TaxonNamespace([taxon for taxon in tree.taxon_namespace
            if taxon.label in left_namespace])

    T_left = copy.deepcopy(tree).extract_tree_with_taxa_labels(labels = left_namespace)
    T_left.purge_taxon_namespace()
    s = T_left.as_string(schema = "newick")
    T_left = dendropy.Tree.get(data=s, schema="newick", taxon_namespace = left_taxa)
    
    right_namespace = list(taxon_namespace_label[np.where(np.logical_not(partition))[0]])
    right_taxa = dendropy.TaxonNamespace([taxon for taxon in tree.taxon_namespace
            if taxon.label in right_namespace])
    T_right = copy.deepcopy(tree).extract_tree_with_taxa_labels(labels = right_namespace)
    T_right.purge_taxon_namespace()
    s = T_right.as_string(schema = "newick")
    T_right = dendropy.Tree.get(data=s,
        schema="newick", taxon_namespace = right_taxa)
        
    start_time = time.time()
    joined_tree = reconstruct_tree.join_trees_with_spectral_root_finding(
            sim, T_left, T_right, taxon_namespace = tree.taxon_namespace)
    runtime = time.time() - start_time
    print("--- %s seconds ---" % runtime)
    return joined_tree

In [31]:
joined_tree = test_merge(partition, binary_tree)

one - merging:  128  out of:  256
two -  merging:  1  out of:  256
--- 9.967319011688232 seconds ---


In [32]:
RF,F1 = reconstruct_tree.compare_trees(tree_rec, binary_tree)

print("RF = ",RF)
print("F1% = ",F1) 

RF =  2
F1% =  99.90215264187869


The merging part have problem. 