In [2]:
import sys
import inspect
from jw_utils import jw_draw_tree as jdt
from ete3 import ncbi_taxonomy
from ete3 import Tree
ncbi_tax = ncbi_taxonomy.NCBITaxa()
import json
from jw_utils import ncbi_datasets_fxs as nfx
from orthofinder_utils import dash_ortho_parser_d as dop
from Bio import Phylo
from plotly import graph_objects as go
from jw_utils import jw_ncbi_taxonomy as jnt
import os


In [3]:
# Species tree file; it is parsed as Newick by Phylo.read in a later cell.
sp_tree_path = './data/Species_Tree/SpeciesTree_rooted_node_labels.txt'
# NCBI summary JSON; accession_taxid_d() reads it to map accessions to taxids.
path_to_summary ='./data/summary_data/summaries.json'

In [4]:
def accession_taxid_d(path_to_summary):
    """Map every accession in an NCBI summary file to its taxid.

    Parameters
    ----------
    path_to_summary : str
        Path to the NCBI ``summary.json`` file.

    Returns
    -------
    dict
        ``{accession: tax_id}``, where ``tax_id`` is taken from
        ``record['org']['tax_id']`` of each entry returned by
        ``nfx.make_summary_dict``.
    """
    with open(path_to_summary, 'r') as handle:
        raw_summary = json.load(handle)
    records = nfx.make_summary_dict(raw_summary)
    return {accession: records[accession]['org']['tax_id'] for accession in records}



def get_rank_sciname_for_each_leaf(tree, rank, path_to_summary):
    """Find, for every leaf of ``tree``, the scientific name of its lineage at ``rank``.

    Leaf names are looked up as accessions in the summary file, e.g. for
    rank 'phylum' the result looks like ``{'GCF_######': 'Proteobacteria'}``.
    A leaf whose lineage has no entry at ``rank`` is mapped to ``None``.

    Parameters
    ----------
    tree : object exposing ``get_terminals()`` (a Bio.Phylo tree in this notebook)
    rank : str
        Taxonomic rank to report, e.g. 'phylum'.
    path_to_summary : str
        Path passed through to ``accession_taxid_d``.

    Returns
    -------
    dict
        ``{leaf_name: sci_name_or_None}``.
    """
    taxid_by_accession = accession_taxid_d(path_to_summary)
    sciname_by_leaf = {}
    for tip in tree.get_terminals():
        lineage_ranks = jnt.get_lineage_rank_dict(taxid_by_accession[tip.name])
        rank_taxid = lineage_ranks.get(rank)
        if not rank_taxid:
            # No taxid recorded at this rank for the leaf's lineage.
            sciname_by_leaf[tip.name] = None
            continue
        translated = ncbi_tax.get_taxid_translator([rank_taxid])
        sciname_by_leaf[tip.name] = list(translated.values())[0]
    return sciname_by_leaf



def get_list_of_all_nodes(subtree, clades=None):
    """Collect every clade below the root of a Bio.Phylo tree, leaves included.

    The walk is depth-first: each child of ``subtree.root`` is appended and,
    when it is not a leaf, its own descendants follow immediately.  Every
    clade appears exactly once.  The starting root clade itself is not part
    of the result, only its descendants are.

    Parameters
    ----------
    subtree : Phylo.Newick.Tree or Phylo.Newick.Clade
        Tree (or clade) whose descendants are listed.
    clades : list, optional
        Accumulator used by the recursion; callers normally leave it unset.
        When given, found clades are appended to it in place.

    Returns
    -------
    list
        The accumulated clade objects.

    Raises
    ------
    TypeError
        If ``subtree`` is neither a Newick Tree nor a Newick Clade.
    """
    if not isinstance(subtree, (Phylo.Newick.Tree, Phylo.Newick.Clade)):
        raise TypeError(f'object entered needs to be of type {Phylo.Newick.Tree} or {Phylo.Newick.Clade}, you entered {type(subtree)}' )

    if clades is None:
        clades = []
    for child in subtree.root:
        # Append once; the previous version appended terminal clades a second time.
        clades.append(child)
        if not child.is_terminal():
            get_list_of_all_nodes(child, clades)
    return clades



def get_nodes_assoc_with_ranks(tree, rank, path_to_summary):
    """Group node names by the single ``rank``-level sci_name shared by all their leaves.

    A clade is assigned a sci_name only when every leaf under it has the
    same sci_name at ``rank``; clades with mixed leaves are left out.

    Parameters
    ----------
    tree : Bio.Phylo tree
    rank : str
        Taxonomic rank, e.g. 'genus'.
    path_to_summary : str
        Path passed through to ``get_rank_sciname_for_each_leaf``.

    Returns
    -------
    dict
        ``{sci_name: [node_name, ...]}`` for every sci_name that labels at
        least one clade.
    """
    sciname_by_leaf = get_rank_sciname_for_each_leaf(tree, rank, path_to_summary)

    sciname_by_node = {}
    for clade in get_list_of_all_nodes(tree):
        distinct = {sciname_by_leaf[tip.name] for tip in clade.get_terminals()}
        if len(distinct) == 1:
            # All leaves agree, so the clade as a whole carries that name.
            (shared_name,) = distinct
            sciname_by_node[clade.name] = shared_name

    grouped = {sci_name: [] for sci_name in set(sciname_by_node.values())}
    for node_name, sci_name in sciname_by_node.items():
        grouped[sci_name].append(node_name)
    return grouped


def get_anc_node_each_rank(tree, rank, path_to_summary):
    """Report, for each sci_name at ``rank``, the node that is the MRCA of its leaves.

    For every sci_name returned by ``get_nodes_assoc_with_ranks`` the leaf
    clades carrying that name are collected and passed to
    ``tree.is_monophyletic``.

    Parameters
    ----------
    tree : Bio.Phylo tree
    rank : str
        Taxonomic rank, e.g. 'phylum'.
    path_to_summary : str
        Path passed through to ``get_nodes_assoc_with_ranks``.

    Returns
    -------
    dict
        ``{sci_name: node_name}`` when ``is_monophyletic`` returns a clade,
        otherwise ``{sci_name: 'not monophyletic'}``.
    """
    nodes_by_sciname = get_nodes_assoc_with_ranks(tree, rank, path_to_summary)
    ancestors = {}
    for sci_name, node_names in nodes_by_sciname.items():
        found = (tree.find_any(node_name) for node_name in node_names)
        # Only leaves are handed to is_monophyletic; internal nodes are skipped.
        tips = [node for node in found if node.is_terminal()]
        ancestor = tree.is_monophyletic(tips)
        ancestors[sci_name] = ancestor.name if ancestor else 'not monophyletic'
    return ancestors


In [6]:
# Parse the species tree (Newick) that the rest of the notebook works on.
tree = Phylo.read(sp_tree_path, format='newick')
# NOTE: `rank` is set to 'class' here, but the call below passes 'phylum' explicitly.
rank='class'
all_clades = get_list_of_all_nodes(tree)
    
# {phylum sci_name: name of its MRCA node, or 'not monophyletic'}
mrca_clades = get_anc_node_each_rank(tree, 'phylum', path_to_summary)

def make_mrcaClade_file(tree, path_to_summary, out_path='./ranks_mrca_clades.json', ranks=None):
    """Compute the MRCA node of every taxon at several ranks and save the table as JSON.

    Parameters
    ----------
    tree : Bio.Phylo tree
    path_to_summary : str
        Path passed through to ``get_anc_node_each_rank``.
    out_path : str, optional
        Where the JSON file is written.  Defaults to the location that
        ``read_mrca_file`` reads from.
    ranks : iterable of str, optional
        Ranks to process.  Defaults to 'no rank', 'superkingdom', 'family',
        'genus', 'phylum', 'class', 'order' and 'species'.

    Returns
    -------
    dict
        ``{rank: {sci_name: mrca_node_name_or_'not monophyletic'}}``, the same
        object that is dumped to ``out_path``.
    """
    if ranks is None:
        ranks = ['no rank', 'superkingdom', 'family', 'genus', 'phylum', 'class', 'order', 'species']
    rank_mrca_clades_d = {
        rank: get_anc_node_each_rank(tree, rank, path_to_summary) for rank in ranks
    }
    with open(out_path, 'w') as f:
        json.dump(rank_mrca_clades_d, f)
    return rank_mrca_clades_d

# Build the per-rank MRCA table, write it to ./ranks_mrca_clades.json and display it.
make_mrcaClade_file(tree, path_to_summary)

{'no rank': {'Pseudomonas amygdali pv. aesculi': 'GCF_001400675.1',
  'Pseudomonas syringae pv. helianthi': 'GCF_022557235.1',
  'Pseudomonas amygdali pv. loropetali': 'GCF_023207855.1',
  'Pseudomonas coronafaciens pv. coronafaciens': 'GCF_003699955.1',
  'Pseudomonas syringae pv. lapsa': 'N201',
  'Pseudomonas savastanoi pv. savastanoi': 'N183',
  'unclassified Pseudomonas': 'not monophyletic',
  'Pseudomonas syringae group pathovars incertae sedis': 'N130',
  'cellular organisms': 'not monophyletic',
  'Pseudomonas cannabina pv. alisalensis': 'GCF_016599635.1',
  'Pseudomonas syringae pv. atrofaciens': 'GCF_003047185.1'},
 'superkingdom': {'Bacteria': 'N0'},
 'family': {'Moraxellaceae': 'N2',
  'Pseudomonadaceae': 'N3',
  'Burkholderiaceae': 'GCF_002362295.1'},
 'genus': {'Acinetobacter': 'N2',
  'Halopseudomonas': 'not monophyletic',
  'Burkholderia': 'GCF_002362295.1',
  'Stutzerimonas': 'not monophyletic',
  'Pseudomonas': 'not monophyletic'},
 'phylum': {'Pseudomonadota': 'N0'},

In [5]:
def read_mrca_file(path='./ranks_mrca_clades.json'):
    """Load the per-rank MRCA table from a JSON file.

    Parameters
    ----------
    path : str, optional
        JSON file to read.  Defaults to the file written by
        ``make_mrcaClade_file``.

    Returns
    -------
    The object decoded from the JSON file.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist, e.g. because the file has not been
        generated yet.
    """
    with open(path, 'r') as f:
        return json.load(f)
# Load the per-rank MRCA table; ./ranks_mrca_clades.json must already exist
# (it is written by make_mrcaClade_file), otherwise this raises FileNotFoundError.
rank_mrca_clades_d = read_mrca_file()
rank_mrca_clades_d

FileNotFoundError: [Errno 2] No such file or directory: './ranks_mrca_clades.json'

In [6]:
def make_internal_node_d(tree):
    """Build a template annotation record for every node of ``tree``.

    Non-terminal nodes get the keys 'name', 'rank', 'taxid' and 'sci_name'
    (the last three as empty strings, to be filled in later); terminal nodes
    get only 'name'.  Records are keyed by node name.

    Parameters
    ----------
    tree : object exposing ``get_nonterminals()`` and ``get_terminals()``
        (a Bio.Phylo tree in this notebook).

    Returns
    -------
    dict
        ``{node_name: record}``.
    """
    node_dict = {}
    for node in tree.get_nonterminals():
        node_dict[node.name] = {'name':node.name, 'rank':'', 'taxid':'', 'sci_name':''}
    # Terminal nodes are added after internal ones, so a leaf sharing a name
    # with an internal node replaces that node's record.
    for node in tree.get_terminals():
        node_dict[node.name] = {'name':node.name}
    return node_dict


internal_node_dict = make_internal_node_d(tree)

# Save the template where get_hover_text() reads it back.
with open('./data/summary_data/internal_node_dict.json', 'w') as f:
    json.dump(internal_node_dict, f)
internal_node_dict

{'N0': {'name': 'N0', 'rank': '', 'taxid': '', 'sci_name': ''},
 'N1': {'name': 'N1', 'rank': '', 'taxid': '', 'sci_name': ''},
 'N3': {'name': 'N3', 'rank': '', 'taxid': '', 'sci_name': ''},
 'N7': {'name': 'N7', 'rank': '', 'taxid': '', 'sci_name': ''},
 'N15': {'name': 'N15', 'rank': '', 'taxid': '', 'sci_name': ''},
 'N22': {'name': 'N22', 'rank': '', 'taxid': '', 'sci_name': ''},
 'N30': {'name': 'N30', 'rank': '', 'taxid': '', 'sci_name': ''},
 'N8': {'name': 'N8', 'rank': '', 'taxid': '', 'sci_name': ''},
 'N16': {'name': 'N16', 'rank': '', 'taxid': '', 'sci_name': ''},
 'N23': {'name': 'N23', 'rank': '', 'taxid': '', 'sci_name': ''},
 'N4': {'name': 'N4', 'rank': '', 'taxid': '', 'sci_name': ''},
 'N9': {'name': 'N9', 'rank': '', 'taxid': '', 'sci_name': ''},
 'N17': {'name': 'N17', 'rank': '', 'taxid': '', 'sci_name': ''},
 'N24': {'name': 'N24', 'rank': '', 'taxid': '', 'sci_name': ''},
 'N31': {'name': 'N31', 'rank': '', 'taxid': '', 'sci_name': ''},
 'N39': {'name': 'N39', 

In [7]:
def get_hover_text(tree, node_dict_path='./data/summary_data/internal_node_dict.json'):
    """Return one plotly hover string per node of ``tree``.

    Terminal nodes are represented by their ``name`` only.  Non-terminal
    nodes get a ``<br>``-separated block with name, sci_name, rank and taxid
    taken from the JSON annotation file.  A non-terminal node that is missing
    from the file, or whose record lacks a field, gets an empty value for
    that field.

    Parameters
    ----------
    tree : object exposing ``depths()`` (a Bio.Phylo tree in this notebook)
    node_dict_path : str, optional
        JSON file mapping node name to a record with 'taxid', 'sci_name' and
        'rank'.  Defaults to the file written alongside
        ``make_internal_node_d``.

    Returns
    -------
    list of str
        Hover strings in the key order of ``tree.depths()``.
    """
    with open(node_dict_path, 'r') as f:
        int_node_dict = json.load(f)

    text = []
    # The key order of tree.depths() fixes the order of the returned list.
    for node in tree.depths().keys():
        if node.is_terminal():
            text.append(node.name)
            continue
        # .get() yields None for unknown nodes; fall back to an empty record
        # rather than failing on a None subscript.
        node_info = int_node_dict.get(node.name) or {}
        taxid = node_info.get('taxid', '')
        sci_name = node_info.get('sci_name', '')
        rank = node_info.get('rank', '')
        text.append(
            f'name: {node.name}<br>'
            f'sci_name: {sci_name}<br>'
            f'rank: {rank}<br>'
            f'taxid: {taxid}<br>'
        )
    return text

# One hover string per node of the tree, in the key order of tree.depths().
text = get_hover_text(tree)

In [8]:
# Hard-coded sample list of {'prop_id', 'value'} records, used to check how to index it
# (presumably copied from a Dash callback context - TODO confirm).
t=[{'prop_id': 'copynumber_bargraph.clickData', 'value': {'points': [{'curveNumber': 0, 'pointNumber': 116, 'pointIndex': 116, 'x': 0.0834335, 'y': 66.12944335937499, 'text': 'name: N30<br>sci_name: Bacteria<br>rank: superkingdom<br>taxid: 2<br>', 'marker.size': 6, 'marker.color': 'rgb(25,25,25)', 'bbox': {'x0': 538.77, 'x1': 544.77, 'y0': 609.8299999999999, 'y1': 615.8299999999999}}]}}, {'prop_id': 'name_drpdwn_keyword.value', 'value': 'GCF_001647695.1'}]
# The first record's 'prop_id' names the property the record belongs to.
t[0]['prop_id']

'copynumber_bargraph.clickData'

In [9]:
mrca_clades
# label/value records, one per sci_name (presumably for a dropdown - TODO confirm).
# Despite the name `cl_obj`, each value is a node-name string (or 'not monophyletic'),
# not a clade object.
options = [{'label':name,'value':cl_obj} for name, cl_obj in mrca_clades.items()]

In [10]:
# Recompute the phylum-level MRCA table and look up the node name for Pseudomonadota.
mrca_clades = get_anc_node_each_rank(tree, rank='phylum', path_to_summary = path_to_summary)
cl_to_highlight = mrca_clades['Pseudomonadota']
cl_to_highlight


'N0'

In [11]:
# NOTE(review): the recorded run of this cell failed with
# "create_tree() got an unexpected keyword argument 'hover_text'"; confirm which keyword
# jdt.create_tree accepts for hover text before re-running.
fig = jdt.create_tree(tree, hover_text = text)#, cl_to_highlight=cl_to_highlight)


TypeError: create_tree() got an unexpected keyword argument 'hover_text'

In [None]:
# Wrap `fig` in a plotly Figure so the notebook displays it; `fig` comes from the
# jdt.create_tree cell, whose recorded run raised a TypeError.
go.Figure(fig)