# New QA tests

We are going to build new QA tests by reconstructing larger trees.

In [1]:
import sys
sys.path.append('../')
from qa_functions import *
from colorama import Fore
import time
import networkx as nx
import random
import re
import os

In [None]:
# Batch run: build a QA tree for every alignment stored in the numbered
# folders whose number is greater than 31.
#
# NOTE(review): this cell uses paths relative to the current directory
# ('alignments', './matrices', './trees', 'timer.csv'), while the later cells
# of this notebook use '../alignments', '../matrices', '../trees' and
# '../metrics/timer.csv' -- confirm which layout is intended before running.

# os.listdir() returns entries in arbitrary order, so the non-numeric 'test'
# folder cannot be removed by position. Keep only the numeric folders and
# visit them in ascending numeric order.
file_names = sorted(
    (name for name in os.listdir('alignments') if name.isdecimal()),
    key=int,
)

for folder in file_names:
    if int(folder) <= 31:
        continue

    files = sorted(os.listdir(f'alignments/{folder}'))
    for file in files:
        # The digits between '_' and the extension name every output file.
        # Skip entries without them instead of aborting the whole batch.
        id_match = re.search(r'_([0-9]+)\.', file)
        if id_match is None:
            print(f'Skipping alignments/{folder}/{file}: no numeric id in the file name')
            continue
        file_ext = id_match.group(1)

        print(f'alignments/{folder}/{file}')
        # Load the sequences and compute the distance matrix
        sequences, labels = load_sequences(f'./alignments/{folder}/{file}', format='fasta')
        distance_matrix = compute_distance_matrix(sequences)

        # Save the distance matrix
        matrix_file = f'matrix_{file_ext}'
        np.save(f'./matrices/{folder}/{matrix_file}', distance_matrix)

        # Begin NMcutQA; the Timer is passed in and its value is read afterwards
        timer = Timer(0.0)
        tree_qa = phylo_tree(distance_matrix, timer=timer)

        # Append one "folder,file,time" row to the timing log
        with open('timer.csv', 'a') as fp:
            fp.write(f'{folder},{file},{timer.value}\n')

        new_file = f'qa_tree_{file_ext}.newick'
        tree_qa.create_newick_file(f'./trees/{folder}/{new_file}')
        print(f'Finished {file} in {timer.value}ms')

        # Erase memory before loading the next alignment
        del sequences
        del labels
        del distance_matrix
        del tree_qa

In [2]:
# Single-alignment run: build the QA tree for one hand-picked file.
folder = '43'
file = 'assembled_sequences_49038.fasta'

print(f'alignments/{folder}/{file}')

# Read the FASTA alignment and compute its distance matrix
sequences, labels = load_sequences(f'../alignments/{folder}/{file}', format='fasta')
distance_matrix = compute_distance_matrix(sequences)

# Store the matrix under the numeric id taken from the file name
file_ext = re.search(r'_([0-9]+)\.', file).group(1)
new_file = f'matrix_{file_ext}'
matrix_file = new_file
np.save(f'../matrices/{folder}/{matrix_file}', distance_matrix)

# Run NMcutQA; the Timer is passed to phylo_tree and its value is read afterwards
timer = Timer(0.0)
tree_qa = phylo_tree(distance_matrix, timer=timer)

# Append one "folder,file,time" row to the timing log
with open('../metrics/timer.csv', 'a') as fp:
    fp.write(f'{folder},{file},{timer.value}\n')

# Write the resulting tree; `new_file` is reused by the comparison cell below
new_file = f'qa_tree_{file_ext}.newick'
tree_qa.create_newick_file(f'../trees/{folder}/{new_file}')
print(f'Finished {file} in {timer.value}ms')

# Release the large intermediates
del sequences, labels, distance_matrix, tree_qa

alignments/43/assembled_sequences_49038.fasta
Finished assembled_sequences_49038.fasta in 2446.8066400000002ms


In [8]:
# Compare the QA tree written by the single-file run with the reference tree
# stored next to it. Relies on `folder`, `new_file` and `file_ext` still being
# defined by that run.

# The handles are deliberately not called `file`, so the global `file` keeps
# holding the alignment file name instead of a closed file object.
with open(f'../trees/{folder}/{new_file}','r') as qa_handle:
    qa_tree = qa_handle.read()

with open(f'../trees/{folder}/tree_best_{file_ext}.newick','r') as reference_handle:
    biotree = reference_handle.read()

# Rewrite every 'taxonN' label as the number N-1
# (presumably to match the labels used in the QA tree -- TODO confirm).
biotree = re.sub(r'taxon([0-9]+)',lambda match: str(int(match.group(1)) - 1),biotree)

# Strip every branch length (':<number>'). The pattern accepts an optional
# sign, any number of integer digits, an optional fraction and an optional
# exponent with either sign, so values such as ':12.5', ':0', ':1e-05' or
# ':-0.001' are removed as well as the plain ':0.0123' form.
biotree = re.sub(r':[+-]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][+-]?[0-9]+)?',r'',biotree)

dist = treecmp(qa_tree,biotree)

print(f'The Robinson-Foulds distance bewteen the Ncut with QA tree and the Neighbor-Joining tree is \033[1m{dist}\033[22m')

The Robinson-Foulds distance bewteen the Ncut with QA tree and the Neighbor-Joining tree is [1m25.0[22m


We have generated alignments for trees of sizes 100 to 150. Recall that the expected number of subproblems is given by the expression:
$$\mu(x)=0.35x\cdot \log_2(10.82x).$$

Given that a problem usually takes around 3 seconds to be submitted to and run on the remote QPU, the expected total execution time will be:

In [16]:
def f(x):
    """Expected number of subproblems, mu(x) = 0.35 * x * log2(10.82 * x).

    Accepts a scalar or a NumPy array; arrays are evaluated element-wise.
    """
    linear_term = 0.35 * x
    return linear_term * np.log2(10.82 * x)


# Tree sizes used for the estimate: every second value from 100 up to 150
x = np.arange(100, 151, 2)

# About 3 seconds per subproblem on the remote QPU; seconds -> minutes -> hours
SECONDS_PER_SUBPROBLEM = 3
minutes_per_size = f(x) * SECONDS_PER_SUBPROBLEM / 60
total_hours = np.sum(minutes_per_size) / 60
print(f'The full execution will take approximately {round(total_hours, 3)} hours')

The full execution will take approximately 9.87 hours


We can run the program here, but it should be run in CESGA or similar.

In [None]:
# Batch run: build a QA tree for every alignment stored in the numbered
# folders 100 to 124 (inclusive).

# os.listdir() returns entries in arbitrary order, so the non-numeric 'test'
# folder cannot be removed by position. Keep only the numeric folders and
# visit them in ascending numeric order.
file_names = sorted(
    (name for name in os.listdir('../alignments') if name.isdecimal()),
    key=int,
)

for folder in file_names:
    if not 99 < int(folder) < 125:
        continue

    files = sorted(os.listdir(f'../alignments/{folder}'))
    for file in files:
        # The digits between '_' and the extension name every output file.
        # Skip entries without them instead of aborting the whole batch.
        id_match = re.search(r'_([0-9]+)\.', file)
        if id_match is None:
            print(f'Skipping alignments/{folder}/{file}: no numeric id in the file name')
            continue
        file_ext = id_match.group(1)

        print(f'alignments/{folder}/{file}')
        # Load the sequences and compute the distance matrix
        sequences, labels = load_sequences(f'../alignments/{folder}/{file}', format='fasta')
        distance_matrix = compute_distance_matrix(sequences)

        # Save the distance matrix
        matrix_file = f'matrix_{file_ext}'
        np.save(f'../matrices/{folder}/{matrix_file}', distance_matrix)
        print('Alignment file generated, generating the phylogenetic tree...')

        # Begin NMcutQA; the Timer is passed in and its value is read afterwards
        timer = Timer(0.0)
        tree_qa = phylo_tree(distance_matrix, timer=timer)

        # Append one "folder,file,time" row to the timing log
        with open('../metrics/timer.csv', 'a') as fp:
            fp.write(f'{folder},{file},{timer.value}\n')

        new_file = f'qa_tree_{file_ext}.newick'
        tree_qa.create_newick_file(f'../trees/{folder}/{new_file}')
        print(f'Finished {file} in {timer.value}ms\n---------------------------------------------------------\n')

        # Erase memory before loading the next alignment
        del sequences
        del labels
        del distance_matrix
        del tree_qa

alignments/100/assembled_sequences_53321.fasta
Alignment file generated, generating the phylogenetic tree...
