In [1]:
import sys
import inspect
from jw_utils import jw_draw_tree as jdt
from ete3 import ncbi_taxonomy
from ete3 import Tree
ncbi_tax = ncbi_taxonomy.NCBITaxa()
import json
from jw_utils import ncbi_datasets_fxs as nfx
from orthofinder_utils import dash_ortho_parser as dop
from Bio import Phylo
from plotly import graph_objects as go
from jw_utils import jw_ncbi_taxonomy as jnt
import os
from jw_utils import plotly_preferences as pprefs

In [20]:
# Colour palette: CSS-style rgb()/rgba() strings mapped to hex colour codes.
rgb_to_hex = {
    'rgba(132,137,145,1)': '#848991',
    'rgb(0,208,132)': '#00d084',
    'rgb(0,122,255)': '#007aff',
    'rgb(171,184,195)': '#abb8c3',
    'rgb(255,105,0)': '#ff6900',
    'rgb(252,185,0)': '#fcb900',
    'rgb(123,220,181)': '#7bdcb5',
    'rgb(142,209,252)': '#8ed1fc',
    'rgb(6,147,227)': '#0693e3',
    'rgb(155,81,224)': '#9b51e0',
    'rgb(207,46,46)': '#cf2e2e',
    'rgb(247,141,167)': '#f78da7',
}

# Just the hex codes, in the insertion order of the mapping above.
hex_colors = [*rgb_to_hex.values()]

12

In [2]:
def copynumber_bargraph_data_dict(HOGs, dop_obj, id_type='accession'):
    """Make a nested dict {HOG: {leaf_id: count}} for input into create_tree_w_bargraphs function.

    Parameters:
    HOGs (iterable): One or more HOGs. Iterated only once, so a generator is fine.
    dop_obj (dash_orthoparser object): must be generated at the same hierarchical level as the
        input HOGs. Used here through ``HOG_proteins_in_genome(HOG, accessions)`` (whose result
        must offer ``to_dict()``), ``accessions`` and, for id_type='name', ``accession_to_name``.
    id_type (str): Can be 'accession' or 'name'. Determines whether the inner dict keys are
        accessions or scientific names.

    Returns:
    dict: {HOG: {accession or name: count}}

    Raises:
    ValueError: if id_type is neither 'accession' nor 'name' (previously this returned None).
    """
    # Validate before doing any work so a typo fails loudly instead of yielding None.
    if id_type not in ('accession', 'name'):
        raise ValueError(f"id_type must be 'accession' or 'name', got {id_type!r}")

    data_dict = {
        HOG: dop_obj.HOG_proteins_in_genome(HOG, dop_obj.accessions).to_dict()
        for HOG in HOGs
    }
    if id_type == 'accession':
        return data_dict

    # Re-key each inner dict by name. Walking data_dict (not HOGs again) keeps this
    # correct when HOGs is a one-shot iterable.
    return {
        HOG: {dop_obj.accession_to_name[acc]: count for acc, count in counts.items()}
        for HOG, counts in data_dict.items()
    }

In [172]:
def _make_field_labels(name_list):
    """Build the iTOL 'FIELD_LABELS' line: the keyword followed by each name, comma-separated."""
    joined_names = ''.join(f'{label},' for label in name_list)
    # Stripping commas from both ends removes the trailing separator (and, for an
    # empty name list, the comma after the keyword).
    return f'FIELD_LABELS,{joined_names}'.strip(',')


def _make_field_colors(name_list,hexcolors=None):
    """Return the iTOL 'FIELD_COLORS' line with one hex colour per entry of name_list.

    Parameters:
    name_list (iterable): field names; only their number matters here.
    hexcolors (list, optional): palette of hex colour strings. A built-in 10-colour
        palette is used when this is None or empty.

    Returns:
    str: 'FIELD_COLORS' followed by the chosen colours, comma-separated.

    When there are more names than palette entries the palette is reused from its
    start (previously this raised IndexError).
    """
    if not hexcolors:
        hexcolors = ['#58D68D','#F4D03F','#F5B041','#AAB7B8','#566573','#A93226','#EC7063', '#A569BD', '#5DADE2','#48C9B0']
    palette_size = len(hexcolors)
    # Modulo indexing wraps around the palette instead of running off its end.
    chosen = [f'{hexcolors[i % palette_size]}' for i, _ in enumerate(name_list)]
    return ','.join(['FIELD_COLORS', *chosen]).strip(',')


def _make_legend_colors(name_list,hexcolors=None):
    """Return the iTOL 'LEGEND_COLORS' line with one hex colour per entry of name_list.

    Parameters:
    name_list (iterable): legend entry names; only their number matters here.
    hexcolors (list, optional): palette of hex colour strings. A built-in 10-colour
        palette is used when this is None or empty.

    Returns:
    str: 'LEGEND_COLORS' followed by the chosen colours, comma-separated.

    When there are more names than palette entries the palette is reused from its
    start (previously this raised IndexError).
    """
    if not hexcolors:
        hexcolors = ['#58D68D','#F4D03F','#F5B041','#AAB7B8','#566573','#A93226','#EC7063', '#A569BD', '#5DADE2','#48C9B0']
    palette_size = len(hexcolors)
    # Modulo indexing wraps around the palette instead of running off its end.
    chosen = [f'{hexcolors[i % palette_size]}' for i, _ in enumerate(name_list)]
    return ','.join(['LEGEND_COLORS', *chosen]).strip(',')

def _make_legend_labels(name_list):
    """Build the iTOL 'LEGEND_LABELS' line: the keyword followed by each name, comma-separated."""
    joined_names = ''.join(f'{label},' for label in name_list)
    # Stripping commas from both ends removes the trailing separator (and, for an
    # empty name list, the comma after the keyword).
    return f'LEGEND_LABELS,{joined_names}'.strip(',')


def make_multi_itol_bargraph_dataset(outfile_path, count_dict, name_list, dataset_label='dataset_label',
                                     color='#848991', legend_title='Dataset legend', hexcolors=None,):
    """
    Write an iTOL DATASET_MULTIBAR annotation file from per-leaf lists of counts.

    Parameters
    ----------
    outfile_path : str
        path of the dataset file to create (opened in 'w' mode)
    count_dict : dict
        {terminal node name: [count, ...]}; each entry becomes one comma-separated
        data row, node name first
    name_list : list
        field names, handed to the field/legend label and colour helpers
    dataset_label : str
        value written on the DATASET_LABEL line
    color : str
        value written on the COLOR line
    legend_title : str
        value written on the LEGEND_TITLE line
    hexcolors : list, optional
        palette passed to the colour helpers; a built-in 10-colour palette is used
        when None or empty

    """
    palette = hexcolors or ['#58D68D','#F4D03F','#F5B041','#AAB7B8','#566573','#A93226','#EC7063', '#A569BD', '#5DADE2','#48C9B0']

    with open(outfile_path, 'w') as out:

        def emit(text):
            # Every dataset line is newline-terminated.
            out.write(f'{text}\n')

        # Header section.
        emit('DATASET_MULTIBAR')
        emit('SEPARATOR COMMA')
        emit(f'DATASET_LABEL,{dataset_label}')
        emit(f'COLOR,{color}')
        emit(_make_field_colors(name_list, hexcolors=palette))
        emit(_make_field_labels(name_list))
        emit(f'LEGEND_TITLE,{legend_title}')
        # One shape code ('1') per field.
        emit('LEGEND_SHAPES,' + ','.join('1' for _ in name_list))
        emit(_make_legend_colors(name_list, hexcolors=palette))
        emit(_make_legend_labels(name_list))
        emit('ALIGN_FIELDS,1')
        emit('DATA')

        # One row per leaf: name followed by its counts.
        for leaf_name, leaf_counts in count_dict.items():
            row = f'{leaf_name},' + ''.join(str(value) + ',' for value in leaf_counts)
            emit(row.strip(','))
            



In [73]:
# Gene name -> HOG identifier for every gene of interest.
# NOTE(review): 'cupB2' and 'cupB3' both point at N0.HOG0000906 — confirm this is
# intended and not a copy-paste slip.
HOGs = {
    'crc': 'N0.HOG0003265',
    'cbrA': 'N0.HOG0003702',
    'pelA': 'N0.HOG0007706',
    'pelB': 'N0.HOG0007705',
    'pelC': 'N0.HOG0007650',
    'cupB1': 'N0.HOG0005381',
    'cupB2': 'N0.HOG0000906',
    'cupB3': 'N0.HOG0000906',
    'cupC1': 'N0.HOG0005710',
    'cupC2': 'N0.HOG0000907',
    'cupC3': 'N0.HOG0001149',
}

# Smaller selection of the same gene -> HOG pairs; the dict order sets the field order.
HOGs_truncated = {
    'cbrA': 'N0.HOG0003702',
    'crc': 'N0.HOG0003265',
    'pelA': 'N0.HOG0007706',
    'cupB1': 'N0.HOG0005381',
    'cupC2': 'N0.HOG0000907',
}
        


In [119]:
def make_itol_count_dict(tree, HOGs_to_name_d, dop_obj):
    """Collect, for every tree leaf, one count per HOG in a fixed column order.

    Parameters:
    tree: object whose ``get_terminals()`` yields nodes with a ``.name``; the leaf
        names become the keys of the returned count dict.
    HOGs_to_name_d (dict): {HOG: display name}; its keys select the HOGs to count.
    dop_obj: parser object forwarded to copynumber_bargraph_data_dict.

    Returns:
    tuple: (name_list, count_d) where name_list holds the display name of each HOG
        in column order and count_d is {leaf name: [count per HOG, same order]}.

    Every leaf gets exactly one value per HOG so the columns stay aligned with
    name_list: a leaf with no entry for a HOG is given 0. Accessions that are not
    leaves of the tree are left out with a warning (previously a KeyError).
    """
    import warnings

    HOG_dict = copynumber_bargraph_data_dict(HOGs_to_name_d.keys(), dop_obj, id_type='accession')
    count_d = {leaf_name: [] for leaf_name in [cl.name for cl in tree.get_terminals()]}
    name_list = []
    not_in_tree = set()
    for HOG, counts_by_acc in HOG_dict.items():
        name_list.append(HOGs_to_name_d[HOG])
        # Iterate the leaves (not the accessions) so each leaf gains one column per HOG.
        for leaf_name, leaf_counts in count_d.items():
            leaf_counts.append(counts_by_acc.get(leaf_name, 0))
        not_in_tree.update(acc for acc in counts_by_acc if acc not in count_d)
    if not_in_tree:
        warnings.warn(f'{len(not_in_tree)} accession(s) are not leaves of the tree and were skipped: '
                      f'{sorted(map(str, not_in_tree))}')
    return name_list, count_d
        
# Parser object built from the './data' directory; it supplies the accessions,
# accession-to-name mapping and per-HOG counts used below.
dop_obj = dop.DashOrthoParser('./data')
# Species tree (newick); its terminal names become the leaf keys of the count dicts.
tree =  Phylo.read('./data/Species_Tree/SpeciesTree_rooted_node_labels.txt', format='newick')

def make_count_dict_binary(count_dict):
    """Turn per-leaf count lists into presence/absence lists.

    Each count equal to 0 stays 0; every other count becomes 1. Keys and list
    lengths are unchanged and a new dict is returned.
    """
    return {
        leaf: [0 if n == 0 else 1 for n in counts]
        for leaf, counts in count_dict.items()
    }
 
# Flip the truncated gene -> HOG mapping into HOG -> gene so counts can be labelled by gene.
HOGs_to_name_d = {hog_id: gene_name for gene_name, hog_id in HOGs_truncated.items()}
# name_list: gene labels in column order; count_dict: {leaf: [count per gene]}.
name_list, count_dict = make_itol_count_dict(tree, HOGs_to_name_d, dop_obj)

In [120]:
# Output path for the multibar dataset built from the truncated HOG selection.
itol_outfile_path = './itol_tree/multibar_dataset_trunc.txt'
# Presence/absence (0/1) version of the per-leaf count lists.
binary_count_dict = make_count_dict_binary(count_dict)

In [124]:
# Write the presence/absence dataset; label, colour, legend title and palette use the function defaults.
make_multi_itol_bargraph_dataset(itol_outfile_path, count_dict=binary_count_dict, name_list=name_list)  

In [170]:
def relabel_itol_treeleafs(tree, relabel_dict, outfile_path):
    """Write an iTOL LABELS dataset that maps existing leaf names to new labels.

    Parameters:
    tree: object whose ``get_terminals()`` yields nodes with a ``.name``. It is used
        only to compare the number of leaves with the number of entries in
        relabel_dict; a mismatch triggers a warning but the file is still written.
    relabel_dict (dict): {current leaf name: replacement label}, one output row each.
    outfile_path (str): path of the file to create (opened in 'w' mode).
    """
    import warnings

    leaf_names = [leaf.name for leaf in tree.get_terminals()]
    if len(relabel_dict) != len(leaf_names):
        warnings.warn('The number of tree leafs and the number of dict key names to be replaced are not equal')

    header_lines = ('LABELS\n', 'SEPARATOR COMMA\n', 'DATA\n')
    with open(outfile_path, 'w') as out:
        out.writelines(header_lines)
        out.writelines(f'{current},{replacement}\n' for current, replacement in relabel_dict.items())
    



In [180]:
# Build short leaf labels of the form '<genus>. <species>. <strain>' keyed by accession.
relabel_dict = {}
for acc, name in dop_obj.accession_to_name.items():
    # Drop square brackets, then split on single spaces. Uses `name_parts` rather than
    # `name_list` so the gene-label list produced for the multibar dataset is not overwritten.
    name_parts = name.replace('[','').replace(']','').split(' ')
    # Indexing [1] assumes every name has at least a genus and a species token.
    genus = name_parts[0][:50]+'.'
    species = name_parts[1][:7]+'.'
    # Strain label is the last token, capped at 10 characters.
    strain = name_parts[-1][:10]
    relabel_dict[acc] = f'{genus} {species} {strain}'
print(len(relabel_dict))

outfile_path = './itol_tree/rename_dataset.txt'
relabel_itol_treeleafs(tree, relabel_dict, outfile_path)

234


In [192]:
import shutil
dest = './Proteomes'
parent_fp = './data/ncbi_dataset/data/'
# shutil.move does not create missing parent directories, so make sure the
# destination folder exists (a fresh checkout would otherwise fail here).
os.makedirs(dest, exist_ok=True)
# Each 'GC*' sub-folder is checked for a sequence_report.jsonl; when present it is
# moved to <dest>/<folder name>.
# NOTE(review): the variables say "proteome" but the file moved is
# 'sequence_report.jsonl' — confirm this is the intended file.
for folder in [f for f in os.listdir(parent_fp) if f.startswith('GC')]:
    proteome_fp = os.path.join(parent_fp,folder, 'sequence_report.jsonl')
    if os.path.exists(proteome_fp):
        n_dest_fp = os.path.join(dest, folder)
        shutil.move(proteome_fp, n_dest_fp)

In [49]:
def make_simple_itol_bargraph_dataset(template_path, file_path, data_dict):
    """
    Copy an iTOL dataset template into a new file and append one data row per leaf.

    Parameters
    ----------
    template_path : str
        path to the template file from itol; read in full before the output is opened
    file_path : str
        path of the new file to create (opened in 'w' mode)
    data_dict : dict
        {terminal node name: count}; each pair is appended as 'name,count'

    """
    with open(template_path, 'r') as template:
        header = template.read()

    # Make sure the template text ends with a newline so the first data row
    # starts on its own line.
    if header and not header.endswith('\n'):
        header += '\n'

    with open(file_path, 'w') as out:
        out.write(header)
        out.writelines(f'{leaf},{value}\n' for leaf, value in data_dict.items())
            
# Per-accession counts for the crc HOG: the single inner dict of the one-HOG result.
crc_data = list(copynumber_bargraph_data_dict([HOGs['crc']], dop_obj, id_type='accession').values())[0]
# NOTE(review): the template file name says "multibar" while the output is named
# "simplebar" — confirm this is the intended template.
template_path = './itol_tree/dataset_multibar_template.txt'
file_path = './itol_tree/crc_simplebar_dataset.txt'
# Template lines are copied first, then one 'accession,count' row per entry of crc_data.
make_simple_itol_bargraph_dataset(template_path, file_path, crc_data)