In [1]:
import pandas as pd

%config InlineBackend.figure_format = 'svg'

In [2]:
# Load the full dataset from Parquet; the relative path is resolved against
# the kernel's current working directory.
df = pd.read_parquet('../data/dataset.parquet')
# Keep the lineage column under its own name; it is the input for the
# taxonomy extraction further down.
pangolin_lineages = df['pangolin_lineage']

In [3]:
# Preview the first ten rows to check the available columns and value formats.
df.head(10)

Unnamed: 0,strain,pangolin_lineage,date,date_submitted,sequence
0,hCoV-19/Benin/BJ-00332/2022,BA.1.1.1,2022-01-05,2022-08-05,AACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAAT...
1,hCoV-19/Benin/BJ-00171/2021,BA.1.1,2021-12-18,2022-08-05,AGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCT...
2,hCoV-19/Benin/BJ-00177/2021,BA.1.1,2021-12-18,2022-08-05,AGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCT...
3,hCoV-19/Benin/BJ-00047/2021,BA.1.15,2021-12-16,2022-08-05,AGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCT...
4,hCoV-19/Benin/BJ-00381/2021,BA.1.1,2021-12-31,2022-08-05,ACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAA...
5,hCoV-19/Benin/BJ-00270/2021,BA.1.1,2021-12-30,2022-08-05,TACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGAT...
6,hCoV-19/Benin/BJ-00152/2021,BA.1.1,2021-12-20,2022-08-05,AGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCT...
7,hCoV-19/Benin/BJ-00180/2021,BA.1.1,2021-12-18,2022-08-05,TACCTTCCTAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGAT...
8,hCoV-19/Benin/BJ-00213/2021,BA.1.1,2021-12-30,2022-08-05,AAGGTTTATACCTTCCTAGGTAACAAACCAACCAACTTTCGATCTC...
9,hCoV-19/Benin/BJ-00062/2021,BA.1.1,2021-12-16,2022-08-05,TACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGAT...


In [26]:
def extract_taxonomy(lineages, root="root"):
    """Build a parent -> children mapping from dot-separated lineage names.

    Every prefix of a lineage becomes a node, so ``"B.1.1"`` contributes
    ``root -> "B"``, ``"B" -> "B.1"`` and ``"B.1" -> "B.1.1"``.  Nodes that
    have no children (leaves) do not get a key of their own.

    Args:
        lineages: Iterable of lineage names.  Entries that are not strings
            (for example ``None`` or ``NaN`` coming from a pandas column with
            missing values) and blank strings are skipped.
        root: Name of the artificial top-level node.

    Returns:
        dict mapping a node name to the set of full names of its direct
        children.  The ``root`` key is always present, even for empty input.
    """
    hierarchy = {root: set()}
    for lineage in lineages:
        # Missing values have no ``split`` method and a blank string would
        # register an empty-named node under the root, so ignore both.
        if not isinstance(lineage, str) or not lineage.strip():
            continue

        parts = lineage.split(".")
        node = parts[0]
        hierarchy[root].add(node)
        for part in parts[1:]:
            # Children are stored under their full dotted name so that equal
            # suffixes in different branches never collide.
            child = node + "." + part
            hierarchy.setdefault(node, set()).add(child)
            node = child

    return hierarchy

def extract_level_nodes(hierarchy):
    """Group all child nodes of ``hierarchy`` by their depth.

    The depth of a node is read off its name: one plus the number of dots it
    contains, so ``"B"`` is at depth 1 and ``"B.1.1"`` at depth 3.  Only the
    child sets are inspected; the keys of ``hierarchy`` are not used.

    Args:
        hierarchy: Mapping of node name to an iterable of child node names.

    Returns:
        dict mapping depth (int) to the set of node names found at that depth.
    """
    nodes_by_depth = {}
    for child_names in hierarchy.values():
        for name in child_names:
            depth = 1 + name.count(".")
            nodes_by_depth.setdefault(depth, set()).add(name)

    return nodes_by_depth


In [32]:
# Parent -> children mapping derived from the dot-separated lineage names.
taxonomy = extract_taxonomy(pangolin_lineages)
# The same nodes regrouped by depth (1 = directly below the root).
levels = extract_level_nodes(taxonomy)

In [34]:
from ete3 import Tree, TreeStyle, TextFace, CircleFace, NodeStyle

# Build the tree directly from `taxonomy` instead of a hand-written Newick
# string, so the figure always reflects the loaded data and every lineage is
# attached below its real parent. "root" must match the `root` argument that
# was passed to extract_taxonomy (its default is used above).
tree = Tree(name="root")
pending = [tree]
while pending:
    parent = pending.pop()
    # Leaves have no entry in `taxonomy`; sorting keeps the layout
    # reproducible because the children are stored in unordered sets.
    for child_name in sorted(taxonomy.get(parent.name, ())):
        pending.append(parent.add_child(name=child_name))

# Newick serialisation of the tree (format=1 keeps internal node names),
# kept under the original variable name for any later use.
newick_str = tree.write(format=1, format_root_node=True)

# Define a custom tree style
tree_style = TreeStyle()
tree_style.mode = "r"  # Rectangular layout
tree_style.show_leaf_name = True  # Show leaf node names
tree_style.show_branch_length = False  # Hide branch lengths
tree_style.show_branch_support = False  # Hide branch support values

# Create a custom node style for leaf nodes
leaf_style = NodeStyle()
leaf_style["size"] = 10  # Adjust leaf node size
leaf_style["fgcolor"] = "black"  # Set leaf node color

# Create a custom node style for internal nodes
internal_style = NodeStyle()
internal_style["shape"] = "circle"  # Set internal node shape to circle
internal_style["fgcolor"] = "black"  # Set internal node color
internal_style["size"] = 10  # Adjust internal node size

# Apply the custom styles to every node. Tree.set_style only affects the node
# it is called on (the root), so the whole tree has to be traversed.
for node in tree.traverse():
    if node.is_leaf():
        node.set_style(leaf_style)
    else:
        node.set_style(internal_style)
        # show_leaf_name labels leaves only; label internal nodes explicitly.
        node.add_face(TextFace(node.name), column=0, position="branch-top")

# Render the tree
tree.render("phylogenetic_tree.png", tree_style=tree_style);