# GTDB bac120 tree (release 202)

This tree is used to sort bacterial genomes in an order that reflects their phylogenetic relationships.

In [1]:
import shutil
import urllib
import urllib.request

import ete3
import pandas as pd

from pyscripts.config import path2

In [2]:
# Remote location of the GTDB release 202 bac120 tree and the local path it is saved to.
ftp_source = 'https://data.ace.uq.edu.au/public/gtdb/data/releases/release202/202.0/bac120_r202.tree'
file_target = path2.metadata/'bac120_r202.tree'

# Stream the download into a temporary sibling file and move it into place only
# after the transfer has completed, so that an interrupted download never leaves
# a truncated tree at file_target for the later cells to parse.
file_partial = str(file_target) + '.part'
with urllib.request.urlopen(ftp_source) as response, open(file_partial, 'wb') as outfile:
    shutil.copyfileobj(response, outfile)
shutil.move(file_partial, str(file_target))

In [3]:
# Load the downloaded tree, keeping the (quoted) internal node labels as node names.
bac120_r202_tree = ete3.Tree(str(file_target), format=1, quoted_node_names=True)

# Internal node labels arrive in 'name' as "<support>:<taxonomy>", where the
# ":<taxonomy>" part is optional. Move each piece into its own attribute.
for node in bac120_r202_tree.traverse():
    if not (node.is_leaf() or node.is_root()):
        support_text, _, taxonomy = node.name.partition(':')
        # Divide by 100 to normalize the support value to fall between 0 and 1.
        node.support = float(support_text) / 100
        # Remove the space after each taxon separator; the name becomes ''
        # when the label carried no taxonomy.
        node.name = taxonomy.replace('; ', ';')

bac120_r202_tree.ladderize()

45555

In [4]:
# Store the processed tree as a pickle next to the downloaded tree file.
pickle_target = path2.metadata/'bac120_r202.tree.pkl.bz2'
pd.to_pickle(bac120_r202_tree, pickle_target)