In [2]:
import json
import os
import re
import shutil

import pandas as pd
import plotly.graph_objects as go
from Bio import Phylo
from ete3 import ncbi_taxonomy

from jw_utils import jw_ncbi_taxonomy as jnt
from jw_utils import ncbi_datasets_fxs as nfx
from jw_utils import parse_gff as pgf
from orthofinder_utils import dash_app_preprocess as dap
from orthofinder_utils import dash_ortho_parser_d as dop
from orthofinder_utils import proteomes_for_orthofinder as pfo
# Shared ete3 NCBI taxonomy handle; get_rank_sciname_for_each_leaf uses it to turn
# taxids into scientific names via get_taxid_translator.
ncbi_tax = ncbi_taxonomy.NCBITaxa()

# Named colour strings (rgb / rgba / hex notation), looked up by short label.
colors = dict(
    t_blue='rgba(0,102,153,255)',
    t_green='rgba(61,174,43,255)',
    t_red='rgb(255,20,20)',
    seagreen='#2c8d42',
    orange='#F9B257',
)

ncbi.datasets module not found. To install, run `pip install ncbi-datasets-pylib`.


### Setup for starting an OrthoFinder dash app    
***This assumes you have already run OrthoFinder on a set of proteomes and that the resulting OrthoFinder data is in a folder called Proteomes.***  
1. CHANGE WORKING DIRECTORY of this notebook TO dash_app main folder   
2. make a directory inside the main /dash_app folder called data (dash_app/data).
    - `$ !mkdir ./data`
    - `$ !mkdir ./data/Proteomes`
    - `$ !mv ./Proteomes/* ./data/Proteomes`

3. Place the ncbi_dataset/data folder containing all the data downloaded from NCBI into ./data
    - dash_app/data/ncbi_dataset/data  
4. You will need to generate a summary file of all genomes. Using the terminal is the most   
reliable way I've found to do this.  
    - `$ !datasets summary genome accession --inputfile accessions.txt > summaries.json`
 
5. Run `dap.run_all()` in the cell below. If you haven't generated a summary file,  
an error will be thrown and the code to get the summary file via the NCBI datasets CLI will be printed out.


### Check that species in different directories are equivalent

In [23]:
def check_for_equivalent_species():
    """Verify that proteomes, assemblies and ./accessions.txt list the same genomes.

    Three sources are compared by assembly accession:
      * ``./data/Proteomes``         - every file ending in ``.faa``
      * ``./data/ncbi_dataset/data`` - every entry whose name starts with ``GC``
      * ``./accessions.txt``         - one accession per line (blank lines ignored)

    On success a confirmation is printed and ``./accessions.txt`` is copied to
    ``./data/summary_data/accessions.txt`` (the folder is created if needed).

    Raises:
        FileNotFoundError: if ``./accessions.txt`` does not exist.
        ValueError: if two of the sources disagree; the message lists the
            accessions that appear on only one side.
    """
    # Versioned accession at the start of a name. The version can have more than
    # one digit (e.g. GCF_000001405.40), so a fixed 15-character slice is not enough.
    accession_re = re.compile(r'GC[AF]_\d+\.\d+')

    def accession_of(name):
        match = accession_re.match(name)
        # Names that do not start with an accession keep the old 15-character prefix.
        return match.group(0) if match else name[:15]

    def mismatch_message(left_name, left, right_name, right):
        only_left = sorted(set(left) - set(right))
        only_right = sorted(set(right) - set(left))
        message = (f'{left_name} and {right_name} are not equivalent '
                   f'(only in {left_name}: {only_left}; only in {right_name}: {only_right})')
        if not only_left and not only_right:
            # Same accessions on both sides, so the lists can only differ by repeats.
            message += '; one of them contains duplicate entries'
        return message

    os.makedirs('./data/summary_data/', exist_ok=True)
    if not os.path.exists('./accessions.txt'):
        raise FileNotFoundError('"./accessions.txt" could not be found.')

    proteomes = sorted(accession_of(f) for f in os.listdir('./data/Proteomes') if f.endswith('.faa'))
    assemblies = sorted(accession_of(f) for f in os.listdir('./data/ncbi_dataset/data') if f.startswith('GC'))
    with open('./accessions.txt', 'r') as f:
        accessions = sorted(line.strip() for line in f if line.strip())

    # Two pairwise checks are enough: if assemblies match both other sources,
    # those two necessarily match each other.
    if accessions != assemblies:
        raise ValueError(mismatch_message('"./data/ncbi_dataset/data"', assemblies,
                                          '"./accessions.txt"', accessions))
    if assemblies != proteomes:
        raise ValueError(mismatch_message('"./data/ncbi_dataset/data"', assemblies,
                                          '"./data/Proteomes"', proteomes))

    print('All directories contain equivalent species')
    shutil.copy('./accessions.txt', './data/summary_data/accessions.txt')
    
# Pre-flight report, run whenever this cell executes: de-duplicate ./accessions.txt
# in place, then print every accession that is present on only one side
# (proteome files vs. accessions.txt).
_accession_re = re.compile(r'GC[AF]_\d+\.\d+')


def _accession_from_name(name):
    """Return the versioned accession that starts `name`, else its first 15 characters."""
    match = _accession_re.match(name)
    return match.group(0) if match else name[:15]


accs_p = [_accession_from_name(a) for a in os.listdir('./data/Proteomes/') if a.startswith('GC')]
acc_assemblies = [_accession_from_name(f) for f in os.listdir('./data/ncbi_dataset/data') if f.startswith('GC')]

with open('./accessions.txt', 'r') as f:
    # dict.fromkeys removes repeats but, unlike set(), keeps the order of the file;
    # blank lines are skipped so they are not written back as empty accessions.
    accs_a = list(dict.fromkeys(line.strip() for line in f if line.strip()))
with open('./accessions.txt', 'w') as f:
    f.writelines(f'{acc}\n' for acc in accs_a)

# Sets make the two membership scans linear instead of quadratic.
_accs_a_lookup = set(accs_a)
_accs_p_lookup = set(accs_p)

for acc in accs_p:
    if acc not in _accs_a_lookup:
        print(f'Proteome {acc} not in ./accessions.txt')
for acc in accs_a:
    if acc not in _accs_p_lookup:
        print(f'Accession {acc} from ./accessions.txt not in ./data/Proteomes/')

In [24]:
# Validate that proteomes, assemblies and accessions.txt agree. On success this also copies
# accessions.txt into ./data/summary_data/, which is the file the next command reads.
check_for_equivalent_species()
# Call the NCBI `datasets` command-line tool and save its genome summary output as
# ./data/summary_data/summaries.json (read later through `path_to_summary`).
!datasets summary genome accession --inputfile ./data/summary_data/accessions.txt > ./data/summary_data/summaries.json

All directories contain equivalent species
New version of client (14.28.0) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/mac/datasets


## Generate data files for dash app
### !!Change path for latest orthofinder results!!

In [25]:
# Make sure the OrthoFinder output folder exists. makedirs(..., exist_ok=True) also
# creates any missing parent folders and is a no-op when the folder is already there.
os.makedirs('./data/Proteomes/OrthoFinder', exist_ok=True)

In [26]:
# Directory the notebook is running from (not referenced again in the cells shown here).
path_to_parent=os.getcwd()
# OrthoFinder results folder to process. The Results_* name is specific to one run:
# update it whenever OrthoFinder is re-run.
path_to_results = './data/Proteomes/OrthoFinder/Results_May12_2'
# Generate the dash app data files from those results
# (implemented in orthofinder_utils.dash_app_preprocess; details not visible here).
dap.run_all(path_to_results)

  HOGs_df = pd.read_csv(path, sep = '\t')


all checks produce equivalent assembly identifiers


  orth_assemblies = list(pd.read_csv(os.path.join(path_to_results,


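# Three more files to make:  
1. JSON annotation file. This is the default if you want any extra annotations on internal nodes.
2. JSON file with the ancestral (MRCA) node of each taxon at each taxonomic rank (see "Make node rank lineage file" below).  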

### Make tree annotation file

In [27]:
# Species tree file, parsed below as Newick; per its file name it is rooted and has labelled nodes.
sp_tree_path = './data/Species_Tree/SpeciesTree_rooted_node_labels.txt'
# Genome summary JSON written by the `datasets summary genome` command in an earlier cell.
path_to_summary ='./data/summary_data/summaries.json'

def make_internal_node_d(tree, out_path='./data/summary_data/internal_node_dict.json'):
    """Build the node-annotation template for a tree and save it as JSON.

    Every internal (non-terminal) node gets an entry with empty ``rank``,
    ``taxid`` and ``sci_name`` fields ready to be filled in; every leaf gets an
    entry holding only its name. Entries are keyed by node name.

    Args:
        tree: object exposing ``get_nonterminals()`` and ``get_terminals()``
            whose items have a ``name`` attribute (e.g. a Bio.Phylo tree).
        out_path: destination of the JSON file. The default is the location
            this notebook has always written to.

    Returns:
        dict: the mapping that was written to ``out_path``.
    """
    internal_node_dict = {}
    for node in tree.get_nonterminals():
        internal_node_dict[node.name] = {'name': node.name, 'rank': '', 'taxid': '', 'sci_name': ''}
    for node in tree.get_terminals():
        internal_node_dict[node.name] = {'name': node.name}

    with open(out_path, 'w') as f:
        json.dump(internal_node_dict, f)
    # Previously the function ended in a bare expression and returned None.
    return internal_node_dict

# Parse the species tree (Newick) and write ./data/summary_data/internal_node_dict.json from it.
make_internal_node_d(Phylo.read(sp_tree_path, format='newick'))

### Make node rank lineage file

In [28]:
def accession_taxid_d(path_to_summary):
    """Map each accession in an NCBI summary JSON file to its taxid.

    The file at `path_to_summary` is parsed as JSON and handed to
    `nfx.make_summary_dict`; for every accession key of that result the value
    stored under ['org']['tax_id'] is kept.

    Returns {accession: tax_id}.
    """
    with open(path_to_summary, 'r') as summary_file:
        raw_summary = json.load(summary_file)
    summaries = nfx.make_summary_dict(raw_summary)
    return {accession: summaries[accession]['org']['tax_id'] for accession in summaries}



def get_rank_sciname_for_each_leaf(tree, rank, path_to_summary):
    """Look up, for every leaf of `tree`, the scientific name of its taxon at `rank`.

    Leaf names must be keys of the {accession: taxid} mapping built from
    `path_to_summary`. A leaf whose lineage has no entry for `rank` maps to None.
    e.g. rank='phylum' -> {'GCF_######': 'Proteobacteria'}
    """
    taxid_by_accession = accession_taxid_d(path_to_summary)
    sciname_by_leaf = {}
    for terminal in tree.get_terminals():
        lineage_ranks = jnt.get_lineage_rank_dict(taxid_by_accession[terminal.name])
        rank_taxid = lineage_ranks.get(rank)
        if not rank_taxid:
            # No taxon recorded at this rank for the leaf's lineage.
            sciname_by_leaf[terminal.name] = None
            continue
        translated = ncbi_tax.get_taxid_translator([rank_taxid])
        sciname_by_leaf[terminal.name] = list(translated.values())[0]
    return sciname_by_leaf



def get_list_of_all_nodes(subtree, clades=None):
    """Collect every clade below the root of a Bio.Phylo tree, leaves included.

    The tree is walked depth-first, each parent before its children. The root
    clade of `subtree` itself is not part of the result, and every descendant
    appears exactly once.

    Args:
        subtree: a Phylo.Newick.Tree or Phylo.Newick.Clade.
        clades: list to append to (used by the recursion); a new list is
            created when omitted.

    Returns:
        list: the accumulated clade objects.

    Raises:
        TypeError: if `subtree` is neither a Newick Tree nor a Newick Clade.
    """
    if not isinstance(subtree, (Phylo.Newick.Tree, Phylo.Newick.Clade)):
        raise TypeError(f'object entered needs to be of type {Phylo.Newick.Tree} or {Phylo.Newick.Clade}, you entered {type(subtree)}' )

    if clades is None:
        clades = []
    for cl in subtree.root:
        clades.append(cl)
        # Only internal nodes have children to descend into. Leaves used to be
        # appended a second time here, which duplicated them in the result.
        if not cl.is_terminal():
            get_list_of_all_nodes(cl, clades)
    return clades



def get_nodes_assoc_with_ranks(tree, rank, path_to_summary):
    """Group tree nodes by the single `rank`-level taxon shared by all their leaves.

    A node qualifies when every leaf below it has the same scientific name at
    `rank` (a leaf qualifies through itself); nodes whose leaves disagree are
    left out.

    Returns {sci_name: [node_name, ...]}.
    """
    sciname_by_leaf = get_rank_sciname_for_each_leaf(tree, rank, path_to_summary)

    sciname_by_node = {}
    for clade in get_list_of_all_nodes(tree):
        # A clade belongs to a taxon only if its leaves agree on one name.
        names_below = {sciname_by_leaf[leaf.name] for leaf in clade.get_terminals()}
        if len(names_below) == 1:
            sciname_by_node[clade.name] = next(iter(names_below))

    rank_nodes = {sci_name: [] for sci_name in set(sciname_by_node.values())}
    for node_name, sci_name in sciname_by_node.items():
        rank_nodes[sci_name].append(node_name)
    return rank_nodes


def get_anc_node_each_rank(tree, rank, path_to_summary):
    """Name the ancestral node of each taxon found at `rank`.

    For every taxon, the leaves assigned to it are passed to
    `tree.is_monophyletic`. When that yields a clade its name is recorded;
    otherwise the taxon is marked 'not monophyletic'.

    Returns {sci_name: node_name or 'not monophyletic'}.
    """
    nodes_by_taxon = get_nodes_assoc_with_ranks(tree, rank, path_to_summary)

    ancestor_by_taxon = {}
    for taxon, node_names in nodes_by_taxon.items():
        # Resolve names back to clade objects and keep only the leaves.
        resolved = (tree.find_any(node_name) for node_name in node_names)
        taxon_leaves = [node for node in resolved if node.is_terminal()]
        ancestor = tree.is_monophyletic(taxon_leaves)
        ancestor_by_taxon[taxon] = ancestor.name if ancestor else 'not monophyletic'
    return ancestor_by_taxon


def get_mrca_rank_dict(tree, path_to_summary):
    """Build the {rank: {sci_name: ancestral node name}} table for a fixed list of ranks.

    Ranks are processed in this order: 'no rank', 'superkingdom', 'family',
    'genus', 'phylum', 'class', 'order', 'species'. Each rank's inner mapping
    is whatever get_anc_node_each_rank returns for it.
    """
    ranks = ('no rank', 'superkingdom', 'family', 'genus', 'phylum', 'class', 'order', 'species')
    return {rank: get_anc_node_each_rank(tree, rank, path_to_summary) for rank in ranks}


def write_mrcaClade_file(out_filepath, tree_path, summary_path):
    """Compute the rank -> ancestral-node table for a Newick tree and save it as JSON.

    `tree_path` is read with Phylo.read(format='newick'), the table comes from
    get_mrca_rank_dict(tree, summary_path), and the result is dumped to
    `out_filepath`.
    """
    mrca_table = get_mrca_rank_dict(Phylo.read(tree_path, format='newick'), summary_path)
    with open(out_filepath, 'w') as out_file:
        json.dump(mrca_table, out_file)
        
def read_mrca_file(path='./ranks_mrca_clades.json'):
    """Load a JSON file and return its parsed content.

    Args:
        path: file to read. Defaults to './ranks_mrca_clades.json', the
            location this function has always read from.

    Returns:
        The object decoded by json.load.
    """
    with open(path, 'r') as f:
        return json.load(f)


In [29]:
sp_tree_path = './data/Species_Tree/SpeciesTree_rooted_node_labels.txt'
path_to_summary ='./data/summary_data/summaries.json'

tree = Phylo.read(sp_tree_path, format='newick')
# Compute the rank -> ancestral-node table once and save it as JSON.
write_mrcaClade_file('ranks_mrca_clades.json', sp_tree_path, path_to_summary)

# Reload the saved file so the displayed value is exactly what was written to disk.
# (The table used to be recomputed in memory here and then immediately overwritten
# by this line, repeating every taxonomy lookup for nothing.)
rank_mrca_clades_d = read_mrca_file()
rank_mrca_clades_d

{'no rank': {'cellular organisms': 'N0'},
 'superkingdom': {'Bacteria': 'N0'},
 'family': {'Moraxellaceae': 'N0'},
 'genus': {'Acinetobacter': 'N0'},
 'phylum': {'Pseudomonadota': 'N0'},
 'class': {'Gammaproteobacteria': 'N0'},
 'order': {'Moraxellales': 'N0'},
 'species': {'Acinetobacter colistiniresistens': 'GCF_000413935.1',
  'Acinetobacter baumannii': 'N4',
  'Acinetobacter gyllenbergii': 'GCF_001682515.1',
  'Acinetobacter baylyi': 'GCF_000046845.1',
  'Acinetobacter nosocomialis': 'GCF_005281455.1'}}