# Parse ASTRAL-pro3 trees

In [1]:
import sys, re, os
import pandas as pd
import numpy as np
from skbio.tree import TreeNode

Functions

In [2]:
def assign_supports(tree):
    """Move the labels of internal nodes into a ``support`` attribute.

    Every node yielded by ``tree.traverse()`` is modified in place:

    * tips and the root get ``support = None`` and keep their ``name``;
    * every other node gets ``support`` set to its current ``name``, and
      its ``name`` is then cleared to ``None``.

    Parameters
    ----------
    tree : object exposing ``traverse()``
        Tree whose nodes provide ``is_tip()``, ``is_root()`` and ``name``.

    Returns
    -------
    None
        The tree is edited in place.
    """
    for branch in tree.traverse():
        if branch.is_tip() or branch.is_root():
            # Tips and the root carry no support value
            branch.support = None
            continue
        # Right-hand side is evaluated first, so the old name is preserved
        branch.support, branch.name = branch.name, None

In [3]:
def order_nodes(tree, increase=True):
    """Return a copy of ``tree`` with children re-sorted by tip count.

    The input tree is not modified; all work happens on ``tree.copy()``.

    Parameters
    ----------
    tree : object exposing ``copy()``
        Tree whose nodes provide ``postorder()``, ``is_tip()``,
        ``children`` and ``append()``.
    increase : bool, optional
        Passed to ``sorted`` as ``reverse=increase``.
        NOTE(review): with ``increase=True`` the child with the most tips
        is re-attached first, which reads opposite to the parameter name
        if ``append`` adds at the end of ``children`` - confirm intent.

    Returns
    -------
    object
        The re-ordered copy of ``tree``.
    """
    ordered = tree.copy()

    # Pass 1: annotate each node with the number of tips below it
    # (a tip counts as 1; postorder guarantees children are done first).
    for node in ordered.postorder():
        node.n = 1 if node.is_tip() else sum(kid.n for kid in node.children)

    # Pass 2: detach the children of every internal node and re-attach
    # them in sorted order (the sort is stable, so ties keep their order).
    for node in ordered.postorder():
        if node.is_tip():
            continue
        ranked = sorted(node.children, key=lambda kid: kid.n, reverse=increase)
        node.children = []
        for kid in ranked:
            node.append(kid)

    # Pass 3: remove the temporary tip counts
    for node in ordered.postorder():
        del node.n

    return ordered

# Batch

In [4]:
def parse_tree_astralpro(dataPathIn, dataPathOut, k, p, rep, version):
    """Parse one ASTRAL-pro newick tree and write its derived files.

    Reads ``sps_k_{k}_p_{p}_rep_{rep}_astralpro{version}.nwk`` from
    ``dataPathIn`` and writes four files to ``dataPathOut``:

    * ``metadata_...tsv`` - one row per annotated internal node, with the
      ``key=value`` fields of its annotation as columns;
    * ``nid_...nwk``      - tree whose non-tip nodes are named ``N1``, ``N2``...;
    * ``lpp_...nwk``      - tree whose non-tip nodes are labelled with ``pp1``;
    * ``nlabels_...nwk``  - tree without non-tip labels.

    Parameters
    ----------
    dataPathIn : str
        Directory holding the input newick file.
    dataPathOut : str
        Directory receiving the output files.
    k, p, rep, version
        Values interpolated into the input and output file names.
    """
    # Load the raw newick string
    with open(f'{dataPathIn}/sps_k_{k}_p_{p}_rep_{rep}_astralpro{version}.nwk', 'r') as handle:
        newick = handle.read().strip()

    # Swap every quoted '[...]' annotation for a placeholder X1, X2, ...
    # and remember the annotation text under the matching index
    annotations = []

    def stash(match):
        annotations.append(match.group(1))
        return f'X{len(annotations)}'

    newick = re.sub(r'\'\[([^\[\]]+)\]\'', stash, newick)

    # Build the tree, move placeholders into `support`, then re-order children
    tree = TreeNode.read([newick])
    assign_supports(tree)
    tree = order_nodes(tree, increase=False)

    # Number the non-tip nodes N1, N2, ... in level order
    non_tip_nodes = (node for node in tree.levelorder(include_self=True)
                     if not node.is_tip())
    for number, node in enumerate(non_tip_nodes, start=1):
        node.name = f'N{number}'

    # Collect the annotation fields of every node that carries a placeholder
    records = []
    for node in tree.levelorder(include_self=True):
        if node.is_tip() or node.support is None:
            continue
        # Placeholder 'X<i>' maps back to annotations[i - 1]
        raw = annotations[int(node.support[1:]) - 1]
        fields = dict(item.split('=') for item in raw.split(';'))
        fields['node'] = node.name
        records.append(fields)

    # Metadata table indexed by node ID
    table = pd.DataFrame(records).set_index('node')
    table.to_csv(f'{dataPathOut}/metadata_sps_k_{k}_p_{p}_rep_{rep}_astralpro{version}.tsv', sep='\t')

    # Tree labelled with node IDs
    tree.write(f'{dataPathOut}/nid_sps_k_{k}_p_{p}_rep_{rep}_astralpro{version}.nwk')

    # Tree labelled with local posterior probabilities (column 'pp1')
    pp_by_node = table['pp1'].to_dict()
    labelled = tree.copy()
    for node in labelled.non_tips(include_self=True):
        if node.name not in pp_by_node:
            node.name = None
            continue
        value = pp_by_node[node.name]
        # The literal '1.0' is kept as is; anything else gets three decimals
        if value == '1.0':
            node.name = '1.0'
        else:
            node.name = f'{float(value):.3f}'
    labelled.write(f'{dataPathOut}/lpp_sps_k_{k}_p_{p}_rep_{rep}_astralpro{version}.nwk')

    # Tree with all non-tip labels removed
    for node in labelled.non_tips(include_self=True):
        node.name = None
    labelled.write(f'{dataPathOut}/nlabels_sps_k_{k}_p_{p}_rep_{rep}_astralpro{version}.nwk')

In [5]:
# Parameter grid for the batch runs
max_copies = [1, 5]      # values used for the '{copies}_copy' part of the paths
noises = [0.25, 0.5]     # values used for the 'noise_{noise}' part of the paths
G = 50
S = 10
replicates = np.arange(25, dtype=int)  # replicate indices 0..24
ks = np.linspace(1, G - 1, num=11, dtype=int)  # Number of marker genes
ps = list(range(-100, 101, 20))  # Exponent of power mean
astralpro_vs = [2]

In [6]:
%%time
# Parse every (copies, noise, replicate, k, p, version) combination.
# A failure on one tree is reported and the sweep continues with the next.
for copies in max_copies:
    print(f'Copies: {copies}')
    for noise in noises:
        # Input and output folders of this simulation setting
        dataPathIn = f'./toy/simulations_G_50_S_10/multicopy_trees_{copies}_copy_noise_{noise}/pipeline'
        dataPathOut = f'./toy/simulations_G_50_S_10/multicopy_trees_{copies}_copy_noise_{noise}/parsed'
        # Create the output folder without spawning a shell
        os.makedirs(dataPathOut, exist_ok=True)
        for rep in replicates:
            for k in ks:
                for p in ps:
                    for version in astralpro_vs:
                        try:
                            parse_tree_astralpro(dataPathIn, dataPathOut, k, p, rep, version)
                        except Exception as err:
                            # `Exception` (not a bare except) lets Ctrl-C and
                            # SystemExit stop the sweep; the cause is reported
                            # so failures can be diagnosed.
                            print(f'\tCannot be parsed. Error. k: {k}, p: {p}, rep :{rep} ({type(err).__name__}: {err})')

Copies: 1
Copies: 5
CPU times: user 29.6 s, sys: 5.67 s, total: 35.2 s
Wall time: 37 s


In [7]:
def parse_tree_astralpro_shuffled(dataPathIn, dataPathOut, k, p, rep, version):
    """Parse one shuffled ASTRAL-pro newick tree and write its derived files.

    Reads ``sps_shuffled_k_{k}_p_{p}_rep_{rep}_astralpro{version}.nwk`` from
    ``dataPathIn`` and writes four files to ``dataPathOut``:

    * ``metadata_sps_shuffled_...tsv`` - one row per annotated internal
      node, with the ``key=value`` fields of its annotation as columns;
    * ``nid_sps_shuffled_...nwk``      - tree whose non-tip nodes are named
      ``N1``, ``N2``...;
    * ``lpp_sps_shuffled_...nwk``      - tree whose non-tip nodes are
      labelled with ``pp1``;
    * ``nlabels_sps_shuffled_...nwk``  - tree without non-tip labels.

    Parameters
    ----------
    dataPathIn : str
        Directory holding the input newick file.
    dataPathOut : str
        Directory receiving the output files.
    k, p, rep, version
        Values interpolated into the input and output file names.
    """
    # Load the raw newick string
    with open(f'{dataPathIn}/sps_shuffled_k_{k}_p_{p}_rep_{rep}_astralpro{version}.nwk', 'r') as handle:
        newick = handle.read().strip()

    # Swap every quoted '[...]' annotation for a placeholder X1, X2, ...
    # and remember the annotation text under the matching index
    annotations = []

    def stash(match):
        annotations.append(match.group(1))
        return f'X{len(annotations)}'

    newick = re.sub(r'\'\[([^\[\]]+)\]\'', stash, newick)

    # Build the tree, move placeholders into `support`, then re-order children
    tree = TreeNode.read([newick])
    assign_supports(tree)
    tree = order_nodes(tree, increase=False)

    # Number the non-tip nodes N1, N2, ... in level order
    non_tip_nodes = (node for node in tree.levelorder(include_self=True)
                     if not node.is_tip())
    for number, node in enumerate(non_tip_nodes, start=1):
        node.name = f'N{number}'

    # Collect the annotation fields of every node that carries a placeholder
    records = []
    for node in tree.levelorder(include_self=True):
        if node.is_tip() or node.support is None:
            continue
        # Placeholder 'X<i>' maps back to annotations[i - 1]
        raw = annotations[int(node.support[1:]) - 1]
        fields = dict(item.split('=') for item in raw.split(';'))
        fields['node'] = node.name
        records.append(fields)

    # Metadata table indexed by node ID
    table = pd.DataFrame(records).set_index('node')
    table.to_csv(f'{dataPathOut}/metadata_sps_shuffled_k_{k}_p_{p}_rep_{rep}_astralpro{version}.tsv', sep='\t')

    # Tree labelled with node IDs
    tree.write(f'{dataPathOut}/nid_sps_shuffled_k_{k}_p_{p}_rep_{rep}_astralpro{version}.nwk')

    # Tree labelled with local posterior probabilities (column 'pp1')
    pp_by_node = table['pp1'].to_dict()
    labelled = tree.copy()
    for node in labelled.non_tips(include_self=True):
        if node.name not in pp_by_node:
            node.name = None
            continue
        value = pp_by_node[node.name]
        # The literal '1.0' is kept as is; anything else gets three decimals
        if value == '1.0':
            node.name = '1.0'
        else:
            node.name = f'{float(value):.3f}'
    labelled.write(f'{dataPathOut}/lpp_sps_shuffled_k_{k}_p_{p}_rep_{rep}_astralpro{version}.nwk')

    # Tree with all non-tip labels removed
    for node in labelled.non_tips(include_self=True):
        node.name = None
    labelled.write(f'{dataPathOut}/nlabels_sps_shuffled_k_{k}_p_{p}_rep_{rep}_astralpro{version}.nwk')

In [8]:
%%time
# Parse every shuffled tree of the (copies, noise, replicate, k, p, version)
# grid. A failure on one tree is reported and the sweep continues.
for copies in max_copies:
    print(f'Copies: {copies}')
    for noise in noises:
        # Input and output folders of this simulation setting
        dataPathIn = f'./toy/simulations_G_50_S_10/multicopy_trees_{copies}_copy_noise_{noise}/pipeline'
        dataPathOut = f'./toy/simulations_G_50_S_10/multicopy_trees_{copies}_copy_noise_{noise}/parsed'
        # Create the output folder without spawning a shell
        os.makedirs(dataPathOut, exist_ok=True)
        for rep in replicates:
            for k in ks:
                for p in ps:
                    for version in astralpro_vs:
                        try:
                            parse_tree_astralpro_shuffled(dataPathIn, dataPathOut, k, p, rep, version)
                        except Exception as err:
                            # `Exception` (not a bare except) lets Ctrl-C and
                            # SystemExit stop the sweep; the cause is reported
                            # so failures can be diagnosed.
                            print(f'\tCannot be parsed. Error. k: {k}, p: {p}, rep :{rep} ({type(err).__name__}: {err})')

Copies: 1
Copies: 5
CPU times: user 30.6 s, sys: 6.71 s, total: 37.3 s
Wall time: 40.5 s
