In [1]:
from opentree import OT
import os
import sys
import json

Download the current OpenTree taxonomy from https://tree.opentreeoflife.org/about/taxonomy-version/ott3.2

In [2]:
!ls ../../ott3.2

family.tsv    genus.tsv    synonyms.tsv        taxonomy.tsv
forwards.tsv  README.html  taxonomy_clean.tsv  version.txt


In [3]:
# Relative path to the taxonomy table inside the downloaded ott3.2 directory.
taxonomy_file = "../../ott3.2/taxonomy.tsv"
# Fail early if the taxonomy has not been downloaded to the expected location.
assert os.path.exists(taxonomy_file)

Families are convenient to find via string matching, because they all end in *aceae* or *idae*.
We do also list a rank for taxa in the taxonomy, but due to merging across resources, this can be a bit unreliable.

In [4]:
# Map each family-like name to the full list of fields from its taxonomy row.
# A name qualifies when it is a single word ending in "aceae" or "idae".
fam_dict = {}
with open(taxonomy_file) as taxonomy:  # context manager closes the handle when done
    for lin in taxonomy:
        lii = lin.split('\t|\t')
        name = lii[2]
        if ' ' in name:
            # Multi-word names are never treated as families.
            continue
        if name.endswith(("aceae", "idae")):
            fam_dict[name] = lii

In [5]:
len(fam_dict)

21933

In [6]:
fams_node_id = {}
fams_resp = {}
fams_in_tree = set()

There are around 22,000 families - so the query can be a bit slow.
To make it more efficient, we first check if each family is found in the tree at all, and store that data

## This takes a long time (because it calls the OpenTree APIs about 22,000 times).
## So after it runs we write the results out to JSON, so we can reload them in future instead of re-running.
# Query the synthetic tree once for every family that has no recorded node id yet.
# Results accumulate in fams_node_id / fams_resp / fams_in_tree, so an interrupted
# run can be resumed by re-running this cell.
not_in_tree = set()  # must exist before the loop: the failure branch below adds to it
for i, fam in enumerate(fam_dict, start=1):
    if i % 50 == 0:
        # Cheap progress indicator: one dot per 50 families.
        sys.stdout.write('.')
    if fam in fams_node_id:
        continue
    ott_id = fam_dict[fam][0]
    resp = OT.synth_node_info(ott_id=ott_id)
    try:
        nid = resp.response_dict.get('node_id')
        fams_in_tree.add(fam)
        fams_resp[fam] = resp.response_dict
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C still interrupts the loop cleanly.
        # The only failure expected here is a 400 response for a taxon absent from the tree.
        assert str(resp.response) == '<Response [400]>'
        not_in_tree.add(fam)
        nid = None
    fams_node_id[fam] = nid

## Example node info call for 'Juncaginaceae'
https://tree.opentreeoflife.org/taxonomy/browse?id=460579

https://tree.opentreeoflife.org/opentree/argus/ottol@460579


In [7]:
resp = OT.synth_node_info(ott_id = '460579')
resp.response_dict

{'node_id': 'mrcaott106181ott402055',
 'num_tips': 38,
 'query': 'ott460579',
 'source_id_map': {'ot_1391@Tr96262': {'git_sha': '3008105691283414a18a6c8a728263b2aa8e7960',
   'study_id': 'ot_1391',
   'tree_id': 'Tr96262'},
  'ot_311@tree1': {'git_sha': '3008105691283414a18a6c8a728263b2aa8e7960',
   'study_id': 'ot_311',
   'tree_id': 'tree1'},
  'ot_502@tree1': {'git_sha': '3008105691283414a18a6c8a728263b2aa8e7960',
   'study_id': 'ot_502',
   'tree_id': 'tree1'}},
 'supported_by': {'ot_311@tree1': 'node2913'},
 'synth_id': 'opentree12.3',
 'terminal': {'ot_1391@Tr96262': 'ott5715946', 'ot_502@tree1': 'ott5715946'}}

In [21]:
#with open('fam_synth_node_info.json', 'w') as outfile:
#    json.dump(fams_node_id, outfile)
    

In [22]:
#with open('fam_resp_dict.json', 'w') as outfile:
#    json.dump(fams_resp, outfile)

In [8]:
# Reload the cached lookups written by the (commented-out) dump cells above,
# closing each file as soon as it has been parsed.
with open('fam_synth_node_info.json') as infile:
    fams_node_id = json.load(infile)
with open('fam_resp_dict.json') as infile:
    fams_resp = json.load(infile)

In [9]:
# Partition the families by whether a synthetic-tree node id was recorded for them.
fams_in_tree = {family for family, node_id in fams_node_id.items() if node_id}
not_in_tree = set(fams_node_id) - fams_in_tree


In [10]:
# Group families by the synthetic-tree node they map to, and classify each family:
#   monophy      - the node id is exactly 'ott' + the family's own OTT id
#   non_monophy  - the family maps to some other node (e.g. an 'mrca...' node)
#   not_in_tree  - no node id was recorded
node_ids = {}
fams_in_tree = set()
not_in_tree = set()
monophy = set()
non_monophy = set()
for fam, nid in fams_node_id.items():
    ott_id = fam_dict[fam][0]
    if not nid:
        not_in_tree.add(fam)
        continue
    node_ids.setdefault(nid, set()).add(fam)
    fams_in_tree.add(fam)
    # Compare against the prefixed id explicitly; str.strip('ott') removes any run of
    # 'o'/'t' characters from both ends rather than the literal 'ott' prefix.
    if nid == 'ott' + ott_id:
        monophy.add(fam)
    else:
        non_monophy.add(fam)

In [11]:
len(non_monophy)+len(monophy)

10104

In [12]:
# Sanity check: every family classed as monophyletic maps to an 'ott...' node id.
assert all(fams_node_id[family].startswith('ott') for family in monophy)

In [13]:
# Write one row per family: name, OTT id, synthetic node id, and a note listing
# the families that share that node when more than one family maps to it.
with open('fam_synth_node_info.csv', 'w') as outfile:
    outfile.write("Family, ott_id, synthetic node id, notes\n")
    for fam, node_id in fams_node_id.items():
        ott_id = fam_dict[fam][0]
        notes = ''
        if node_id is not None and len(node_ids[node_id]) > 1:
            # Sort so the note is identical from run to run (set order is not stable).
            taxa = " - ".join(sorted(node_ids[node_id]))
            notes = 'Family maps to same node as ' + taxa
        outfile.write("{},{}, {}, {}\n".format(fam, ott_id, node_id, notes))

In [14]:
len(fams_in_tree)

10104

In [15]:
len(monophy)

9240

In [16]:
# Write the families that have no node in the synthetic tree as a tab-separated table.
with open("families_not_in_tree.tsv", 'w') as fi:
    # Header uses the same tab separator as the data rows.
    fi.write('name\tottid\tsources\tflags\n')
    for fam in not_in_tree:
        row = fam_dict[fam]
        # Fields are taken by position from the taxonomy row: first, fourth-from-last
        # and second-from-last.
        fi.write("{}\t{}\t{}\t{}\n".format(fam, row[0], row[-4], row[-2]))

In [17]:
# Request the synthetic tree induced by every node id that at least one family mapped to,
# with labels in "id" format.
synthtreeid = OT.synth_induced_tree(node_ids = list(node_ids.keys()), label_format="id")
# Save the induced tree in Newick format.
synthtreeid.tree.write(path="allfam_id_label.tre",schema="newick")

In [20]:
synth_tips = [leaf.taxon.label for leaf in synthtreeid.tree.leaf_node_iter()]

In [123]:
node_annotation = {}
for node in synthtreeid.tree:
    if node.label:
        node_annotation[node.label] = {}
    elif node.taxon:
        if node.taxon.label:
            node_annotation[node.taxon.label] = {}
    else:
        print(node)

In [125]:
# Reset every node's record to the same set of empty fields.
for nid in node_annotation:
    node_annotation[nid] = {
        'families': [],
        'studies': [],
        'strict_support': [],
        'conflict': [],
        'total_descendents': 0,
    }

In [126]:
len(node_annotation)

16810

In [29]:
# We have already pulled info on nodes that map to family names,
# so we can use those responses here and not poll OpenTree again.
nid_resp = {}
for fam, fam_info in fams_resp.items():
    nid = fam_info.get('node_id')
    if not nid:
        # A cached response without a node id must belong to a family outside the tree.
        assert fam in not_in_tree
        continue
    if nid in nid_resp:
        # Several families can share a node; their cached responses must agree.
        assert nid_resp[nid]['num_tips'] == fam_info['num_tips']
    else:
        nid_resp[nid] = fam_info
    # Record the family on its node. (This used the old name `tip_annotation`,
    # which is not defined anywhere in this notebook.)
    if nid in node_annotation:
        node_annotation[nid]['families'].append(fam)

In [127]:
## Nodes that no family mapped to have no cached response yet, so query
## node info for each of them to learn about concordance or conflict.
for nid in node_annotation:
    if nid not in nid_resp:
        nid_resp[nid] = OT.synth_node_info(node_id=nid).response_dict

In [128]:
with open('nid_resp.json', 'w') as outfile:
    json.dump(nid_resp, outfile)

In [129]:
len(nid_resp.keys())

16810

In [130]:
# Copy each tip's 'num_tips' into its annotation record and keep a running total.
total_descendents = 0
for tip in synth_tips:
    tip_count = nid_resp[tip]['num_tips']
    total_descendents += tip_count
    node_annotation[tip]['total_descendents'] = int(tip_count)

    

In [131]:
total_descendents

1725488

In [46]:
# Collect the leaf ids of the synthetic subtree below every induced-tree tip.
# The loop header had been commented out while the body was left indented, which
# is a syntax error; it is restored here and reads `node_annotation` throughout.
descendent_ids = set()
for tip in synth_tips:
    if node_annotation[tip]['total_descendents'] > 0:
        leaves = node_annotation[tip].get('desc_taxa')
        if not leaves:
            desc_subtree = OT.synth_subtree(node_id = tip, label_format ="id")
            leaves = [leaf.taxon.label for leaf in desc_subtree.tree.leaf_node_iter()]
            assert len(leaves) == node_annotation[tip]['total_descendents'], tip
            # Stored as a list (not a set) so node_annotation stays JSON-serialisable
            # for the json.dump cell further down.
            node_annotation[tip]['desc_taxa'] = leaves
        # Also counts leaves cached by an earlier run, so a re-run rebuilds the full set.
        descendent_ids.update(leaves)

In [None]:
#import dendropy

#tips_in_synth = open("tips_in_synth.txt","w")
#fi = '/home/ejmctavish/projects/otapi/OT2020/opentree12.3_tree/labelled_supertree/labelled_supertree.tre'
#total_synth = dendropy.Tree.get(path=fi, schema="newick")
#for leaf in total_synth.leaf_node_iter():
#    tips_in_synth.write(leaf.taxon.label + '\n')

In [67]:
# Read the list of synthetic-tree tip ids (one per line) into a set,
# closing the file once it has been consumed.
with open("tips_in_synth.txt") as fi:
    tip_ott_ids_in_synth = {lin.strip() for lin in fi}
    

In [68]:
len(tip_ott_ids_in_synth)

2391916

In [69]:
not_captured = tip_ott_ids_in_synth.difference(descendent_ids)

In [None]:
len(not_captured)

In [113]:
# For every tip not covered by the family-level tree, fetch its taxonomic lineage and
# file the tip under fams_not_captured[order][family] ('Missing' when a rank is absent).
# Re-use the containers from an earlier (possibly interrupted) run of this cell when
# they exist; otherwise start empty so the cell also works on a fresh kernel.
not_capture_info = globals().get('not_capture_info', {})
fams_not_captured = globals().get('fams_not_captured', {})
for ott_id in not_captured:
    if ott_id in not_capture_info:
        continue
    # Drop the literal 'ott' prefix (str.strip would treat 'ott' as a character set).
    numeric_id = int(ott_id[len('ott'):] if ott_id.startswith('ott') else ott_id)
    resp = OT.taxon_info(ott_id = numeric_id, include_lineage = True)
    not_capture_info[ott_id] = resp
    fam = 'Missing'
    order = 'Missing'
    for highertax in resp.response_dict['lineage']:
        if highertax['rank'] == 'order':
            order = highertax['name']
        if highertax['rank'] == 'family':
            fam = highertax['name']
    fams_not_captured.setdefault(order, {}).setdefault(fam, []).append(ott_id)
        
        
    

ERROR:root:Error in POST to https://api.opentreeoflife.org/v3/taxonomy/taxon_info
Traceback (most recent call last):
  File "/home/ejmctavish/projects/otapi/python-opentree/opentree/ws_wrapper.py", line 221, in _call_api
    ws_call_rec = self._http_request(url, http_method, data=data, headers=headers)
  File "/home/ejmctavish/projects/otapi/python-opentree/opentree/ws_wrapper.py", line 282, in _http_request
    resp = requests.request(http_method, url, headers=headers, data=json.dumps(data), allow_redirects=True)
  File "/home/ejmctavish/projects/otapi/OT2020/venv-ot2020/lib/python3.7/site-packages/requests/api.py", line 61, in request
    return session.request(method=method, url=url, **kwargs)
  File "/home/ejmctavish/projects/otapi/OT2020/venv-ot2020/lib/python3.7/site-packages/requests/sessions.py", line 530, in request
    resp = self.send(prep, **send_kwargs)
  File "/home/ejmctavish/projects/otapi/OT2020/venv-ot2020/lib/python3.7/site-packages/requests/sessions.py", line 643, i

KeyboardInterrupt: 

In [245]:
# Summarise, for every node, how many sources mention it ('studies'), how many
# strictly support it ('strict_support') and how many conflict with it ('conflict').
# A node whose only source is the taxonomy itself is counted as having none.
supported_tips = set()
unsupported_tips = set()
taxonomy_only = {'ott3.2draft9'}
for node in node_annotation:
    node_info = nid_resp[node]
    # Fall back to empty containers so a response lacking one of these keys
    # counts as zero instead of failing on None.
    supporting = node_info.get('source_id_map') or {}
    strict_support = node_info.get('supported_by') or {}
    conflict = node_info.get('conflicts_with') or []
    if supporting.keys() == taxonomy_only:
        node_annotation[node]['studies'] = 0
    else:
        node_annotation[node]['studies'] = len(supporting)
    if strict_support.keys() == taxonomy_only:
        node_annotation[node]['strict_support'] = 0
    else:
        node_annotation[node]['strict_support'] = len(strict_support)
    node_annotation[node]['conflict'] = len(conflict)


In [246]:
len(node_annotation)

16810

In [322]:
with open('node_annotation.json', 'w') as outfile:
    json.dump(node_annotation, outfile)

In [3]:
import json

# Reload the annotation table saved above, closing the file after parsing.
with open('node_annotation.json') as infile:
    node_annotation = json.load(infile)


In [4]:
# Summary statistics over the annotation table: the largest conflict and study counts
# over all nodes, and for tips the largest descendant count plus the sets of tips
# with zero, exactly one, or more than 10000 descendants.
# NOTE: `synth_tips` comes from the induced-tree cell; reloading node_annotation.json
# alone does not restore it.
max_conf = 0
max_support = 0
max_desc = 0
no_desc = set()
sing_desc = set()
huge_desc = set()
tip_ids = set(synth_tips)  # set membership instead of scanning the list for every node
for node, annotation in node_annotation.items():
    max_conf = max(max_conf, annotation['conflict'])
    max_support = max(max_support, annotation['studies'])
    if node not in tip_ids:
        continue
    # Descendant counts live in node_annotation (the old name `tip_annotation`
    # is not defined anywhere in this notebook).
    n_desc = annotation['total_descendents']
    max_desc = max(max_desc, n_desc)
    if n_desc == 0:
        no_desc.add(node)
    elif n_desc == 1:
        sing_desc.add(node)
    elif n_desc > 10000:
        huge_desc.add(node)

print(max_conf)
print(max_support)
print(max_desc)

NameError: name 'synth_tips' is not defined

In [284]:
len(no_desc)

119

In [285]:
no_desc

{'ott1065162',
 'ott119908',
 'ott231907',
 'ott245885',
 'ott2812304',
 'ott2872091',
 'ott3247106',
 'ott3356305',
 'ott3531441',
 'ott3532167',
 'ott3588511',
 'ott4023553',
 'ott4125916',
 'ott4139966',
 'ott4175023',
 'ott4187834',
 'ott4221168',
 'ott4524043',
 'ott4697503',
 'ott4697720',
 'ott4697789',
 'ott4698009',
 'ott4698047',
 'ott4698119',
 'ott4698763',
 'ott4698874',
 'ott4698945',
 'ott4699032',
 'ott4699059',
 'ott4699076',
 'ott4699276',
 'ott4699327',
 'ott4699641',
 'ott4699724',
 'ott4699826',
 'ott4700272',
 'ott4700364',
 'ott4700443',
 'ott4700790',
 'ott4701143',
 'ott4701442',
 'ott4701492',
 'ott4701526',
 'ott4701635',
 'ott4701742',
 'ott4701914',
 'ott4702117',
 'ott4702145',
 'ott4702368',
 'ott4702607',
 'ott4703029',
 'ott4703450',
 'ott4939687',
 'ott4941108',
 'ott4946049',
 'ott4991293',
 'ott4991506',
 'ott4994455',
 'ott5004048',
 'ott5021877',
 'ott5031587',
 'ott5099466',
 'ott5104667',
 'ott5122625',
 'ott5122688',
 'ott5122747',
 'ott5122770'

In [280]:
len(sing_desc)

1092

In [324]:
node_annotation['ott2959440']

{'families': [],
 'studies': 0,
 'strict_support': 0,
 'conflict': 0,
 'total_descendents': 11}

In [274]:
len(huge_desc)

15

In [286]:
huge_desc

{'mrcaott11244ott21732',
 'mrcaott413ott199885',
 'ott1014022',
 'ott116636',
 'ott271925',
 'ott305904',
 'ott46248',
 'ott508090',
 'ott568878',
 'ott587367',
 'ott65329',
 'ott678079',
 'ott7376225',
 'ott769762',
 'ott968124'}

In [325]:
# Write a DATASET_STYLE annotation file that colours each clade's branches by the
# number of studies recorded for the node: green scaled by studies/5, black if none.
startstr = """DATASET_STYLE
SEPARATOR TAB

#label is used in the legend table (can be changed later)
DATASET_LABEL\t{}

#dataset color (can be changed later)
COLOR\t#ffff00

DATA\n""".format("Support")
with open("support_anno.txt", 'w') as fi:
    fi.write(startstr)
    for node in node_annotation:
        n_studies = node_annotation[node]['studies']
        if n_studies:
            # Cap at 1 so counts above 5 cannot push the channel past 255 or alpha past 1.
            relsupport = min(n_studies / 5, 1.0)
            r = 0
            g = round(255 * relsupport)  # whole-number colour channel
            b = 0
            alpha = min(0.25 + relsupport, 1.0)
            color = "rgba({}, {}, {}, {})".format(r, g, b, alpha)
        else:
            color = "rgba(0, 0, 0, 0.25)"
        fi.write("{}\tbranch\tclade\t{}\t1\tnormal\n".format(node, color))

In [216]:
# Write a DATASET_STYLE annotation file that colours each clade's branches by the
# number of conflicting sources recorded for the node: red scaled by conflict/5.
startstr = """DATASET_STYLE
SEPARATOR TAB

#label is used in the legend table (can be changed later)
DATASET_LABEL\t{}

#dataset color (can be changed later)
COLOR\t#ffff00

DATA\n""".format("conflict")
with open("conflict_anno.txt", 'w') as fi:
    fi.write(startstr)
    for node in node_annotation:
        # Cap at 1 so counts above 5 cannot push the channel past 255 or alpha past 1.
        relconf = min(node_annotation[node]['conflict'] / 5, 1.0)
        r = round(255 * relconf)  # whole-number colour channel
        g = 0
        b = 0
        alpha = min(0.25 + relconf, 1.0)
        color = "rgba({}, {}, {}, {})".format(r, g, b, alpha)
        fi.write("{}\tbranch\tclade\t{}\t1\tnormal\n".format(node, color))

In [287]:
fi = open("descendent_heatmap.txt", 'w')
import math
startstr = """DATASET_HEATMAP
#In heatmaps, each ID is associated to multiple numeric values, which are displayed as a set of colored boxes defined by a color gradient
#lines starting with a hash are comments and ignored during parsing
#=================================================================#
#                    MANDATORY SETTINGS                           #
#=================================================================#
#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).
#SEPARATOR TAB
SEPARATOR SPACE
#SEPARATOR COMMA

#label is used in the legend table (can be changed later)
DATASET_LABEL LOG_number_of_descendante

#dataset color (can be changed later)
COLOR #ff0000

#define labels for each individual field column
FIELD_LABELS number_desc

#=================================================================#
#                    OPTIONAL SETTINGS                            #
#=================================================================#


#Heatmaps can have an optional Newick formatted tree assigned. Its leaf IDs must exactly match the dataset FIELD_LABELS.
#The tree will be used to sort the dataset fields, and will be displayed above the dataset. It can have branch lengths defined.
#All newlines and spaces should be stripped from the tree, and COMMA cannot be used as the dataset separator if a FIELD_TREE is provided.
#FIELD_TREE (((f1:0.2,f5:0.5):1,(f2:0.2,f3:0.3):1.2):0.5,(f4:0.1,f6:0.5):0.8):1;



#=================================================================#
#     all other optional settings can be set or changed later     #
#           in the web interface (under 'Datasets' tab)           #
#=================================================================#

#Each dataset can have a legend, which is defined using LEGEND_XXX fields below
#For each row in the legend, there should be one shape, color and label.
#Optionally, you can define an exact legend position using LEGEND_POSITION_X and LEGEND_POSITION_Y. To use automatic legend positioning, do NOT define these values
#Optionally, shape scaling can be present (LEGEND_SHAPE_SCALES). For each shape, you can define a scaling factor between 0 and 1.
#Shape should be a number between 1 and 6, or any protein domain shape definition.
#1: square
#2: circle
#3: star
#4: right pointing triangle
#5: left pointing triangle
#6: checkmark

#LEGEND_TITLE,Dataset legend
#LEGEND_POSITION_X,100
#LEGEND_POSITION_Y,100
#LEGEND_SHAPES,1,2,3
#LEGEND_COLORS,#ff0000,#00ff00,#0000ff
#LEGEND_LABELS,value1,value2,value3
#LEGEND_SHAPE_SCALES,1,1,0.5

#left margin, used to increase/decrease the spacing to the next dataset. Can be negative, causing datasets to overlap.
#MARGIN 0

#width of the individual boxes
#STRIP_WIDTH 25

#always show internal values; if set, values associated to internal nodes will be displayed even if these nodes are not collapsed. It could cause overlapping in the dataset display.
#SHOW_INTERNAL 0


#show dashed lines between leaf labels and the dataset
#DASHED_LINES 1

#if a FIELD_TREE is present, it can be hidden by setting this option to 0
#SHOW_TREE 1

#define the color for the NULL values in the dataset. Use the letter X in the data to define the NULL values
#COLOR_NAN #000000

#automatically create and display a legend based on the color gradients and values defined below
#AUTO_LEGEND 1


#define the heatmap gradient colors. Values in the dataset will be mapped onto the corresponding color gradient.
COLOR_MIN #0000ff
COLOR_MAX #ff0000

#you can specify a gradient with three colors (e.g red to yellow to green) by setting 'USE_MID_COLOR' to 1, and specifying the midpoint color
#USE_MID_COLOR 1
#COLOR_MID #ffff00

#By default, color gradients will be calculated based on dataset values. You can force different values to use in the calculation by setting the values below:
#USER_MIN_VALUE 0
#USER_MID_VALUE 500
#USER_MAX_VALUE 1000

#border width; if set above 0, a border of specified width (in pixels) will be drawn around individual cells
#BORDER_WIDTH,0

#border color; used only when BORDER_WIDTH is above 0
#BORDER_COLOR,#0000ff


#Internal tree nodes can be specified using IDs directly, or using the 'last common ancestor' method described in iTOL help pages
#=================================================================#
#       Actual data follows after the "DATA" keyword              #
#=================================================================#
DATA\n
"""
# Emit the header, then one "<tip> <value>" row per tip, where the value is the
# base-10 log of the tip's descendant count (-0.1 stands in for a count of zero).
fi.write(startstr)
for tip in synth_tips:
    n_desc = int(node_annotation[tip]['total_descendents'])
    heat = math.log(n_desc, 10) if n_desc != 0 else -0.1
    fi.write("{} {}\n".format(tip, heat))

fi.close()


In [235]:
#fam_dict = {}
#rank = set()
#fi = 
#for lin in open(taxonomy_file):
#        lii=lin.split('\t|\t')
#        ott_id = 'ott'+lii[0]
#        if 
#        rank.add(lii[3])

In [240]:
fi= open("label_anno.txt",'w')
startstr = """LABELS
#use this template to change the leaf labels, or define/change the internal node names (displayed in mouseover popups)

#lines starting with a hash are comments and ignored during parsing

#=================================================================#
#                    MANDATORY SETTINGS                           #
#=================================================================#
#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).


SEPARATOR COMMA

#Internal tree nodes can be specified using IDs directly, or using the 'last common ancestor' method described in iTOL help pages
#=================================================================#
#       Actual data follows after the "DATA" keyword              #
#=================================================================#
DATA
\n"""
# Emit the header, then an "<ott id>,<name>" row for every class, phylum or kingdom
# in the taxonomy that is also a node of the annotated tree.
fi.write(startstr)
with open(taxonomy_file) as taxonomy:  # closes the taxonomy handle after the scan
    for lin in taxonomy:
        lii = lin.split('\t|\t')
        ott_id = 'ott' + lii[0]
        name = lii[2]
        rank = lii[3]
        if rank in ('class', 'phylum', 'kingdom') and ott_id in node_annotation:
            fi.write("{},{}\n".format(ott_id, name))

fi.close()
                

In [316]:
import random
random.randint(1,255)

21

In [321]:
fi = open('phylum_colors.txt','w')
startstr="""TREE_COLORS
#use this template to define branch colors and styles, colored ranges and label colors/font styles/backgrounds
#lines starting with a hash are comments and ignored during parsing

#=================================================================#
#                    MANDATORY SETTINGS                           #
#=================================================================#
#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).


SEPARATOR SPACE

DATA
"""
import random

# Emit the header, then a coloured "range" row for every phylum in the taxonomy that
# is also a node of the annotated tree. Colours are drawn from a privately seeded
# generator so the same file is produced on every run.
color_rng = random.Random(0)
fi.write(startstr)
with open(taxonomy_file) as taxonomy:  # closes the taxonomy handle after the scan
    for lin in taxonomy:
        lii = lin.split('\t|\t')
        ott_id = 'ott' + lii[0]
        name = lii[2]
        rank = lii[3]
        if rank == 'phylum' and ott_id in node_annotation:
            r = color_rng.randint(1, 255)
            g = color_rng.randint(1, 255)
            b = color_rng.randint(1, 255)
            color = "rgba({},{},{},0.5)".format(r, g, b)
            fi.write("{} range {} {}\n".format(ott_id, color, name))

fi.close()

In [14]:
fi = open("kingdom_heatmap.txt", 'w')
import math
startstr = """DATASET_HEATMAP
#In heatmaps, each ID is associated to multiple numeric values, which are displayed as a set of colored boxes defined by a color gradient
#lines starting with a hash are comments and ignored during parsing
#=================================================================#
#                    MANDATORY SETTINGS                           #
#=================================================================#
#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).
#SEPARATOR TAB
SEPARATOR SPACE
#SEPARATOR COMMA

#label is used in the legend table (can be changed later)
DATASET_LABEL Kingdom

#dataset color (can be changed later)
COLOR #ff0000

#define labels for each individual field column
FIELD_LABELS number_desc

#=================================================================#
#                    OPTIONAL SETTINGS                            #
#=================================================================#


#Heatmaps can have an optional Newick formatted tree assigned. Its leaf IDs must exactly match the dataset FIELD_LABELS.
#The tree will be used to sort the dataset fields, and will be displayed above the dataset. It can have branch lengths defined.
#All newlines and spaces should be stripped from the tree, and COMMA cannot be used as the dataset separator if a FIELD_TREE is provided.
#FIELD_TREE (((f1:0.2,f5:0.5):1,(f2:0.2,f3:0.3):1.2):0.5,(f4:0.1,f6:0.5):0.8):1;



#=================================================================#
#     all other optional settings can be set or changed later     #
#           in the web interface (under 'Datasets' tab)           #
#=================================================================#

#Each dataset can have a legend, which is defined using LEGEND_XXX fields below
#For each row in the legend, there should be one shape, color and label.
#Optionally, you can define an exact legend position using LEGEND_POSITION_X and LEGEND_POSITION_Y. To use automatic legend positioning, do NOT define these values
#Optionally, shape scaling can be present (LEGEND_SHAPE_SCALES). For each shape, you can define a scaling factor between 0 and 1.
#Shape should be a number between 1 and 6, or any protein domain shape definition.
#1: square
#2: circle
#3: star
#4: right pointing triangle
#5: left pointing triangle
#6: checkmark

#LEGEND_TITLE,Dataset legend
#LEGEND_POSITION_X,100
#LEGEND_POSITION_Y,100
#LEGEND_SHAPES,1,2,3
#LEGEND_COLORS,#ff0000,#00ff00,#0000ff
#LEGEND_LABELS,value1,value2,value3
#LEGEND_SHAPE_SCALES,1,1,0.5

#left margin, used to increase/decrease the spacing to the next dataset. Can be negative, causing datasets to overlap.
#MARGIN 0

#width of the individual boxes
#STRIP_WIDTH 25

#always show internal values; if set, values associated to internal nodes will be displayed even if these nodes are not collapsed. It could cause overlapping in the dataset display.
#SHOW_INTERNAL 0


#show dashed lines between leaf labels and the dataset
#DASHED_LINES 1

#if a FIELD_TREE is present, it can be hidden by setting this option to 0
#SHOW_TREE 1

#define the color for the NULL values in the dataset. Use the letter X in the data to define the NULL values
#COLOR_NAN #000000

#automatically create and display a legend based on the color gradients and values defined below
#AUTO_LEGEND 1


#define the heatmap gradient colors. Values in the dataset will be mapped onto the corresponding color gradient.
COLOR_MIN #0000ff
COLOR_MAX #ff0000

#you can specify a gradient with three colors (e.g red to yellow to green) by setting 'USE_MID_COLOR' to 1, and specifying the midpoint color
#USE_MID_COLOR 1
#COLOR_MID #ffff00

#By default, color gradients will be calculated based on dataset values. You can force different values to use in the calculation by setting the values below:
#USER_MIN_VALUE 0
#USER_MID_VALUE 500
#USER_MAX_VALUE 1000

#border width; if set above 0, a border of specified width (in pixels) will be drawn around individual cells
#BORDER_WIDTH,0

#border color; used only when BORDER_WIDTH is above 0
#BORDER_COLOR,#0000ff


#Internal tree nodes can be specified using IDs directly, or using the 'last common ancestor' method described in iTOL help pages
#=================================================================#
#       Actual data follows after the "DATA" keyword              #
#=================================================================#
DATA\n
"""
# Emit the header, then one "<ott id> <value>" row per kingdom that is a node of the
# annotated tree. The value increases by 50 for every kingdom met in the taxonomy,
# whether or not that kingdom ends up being written.
fi.write(startstr)
val = 0
with open(taxonomy_file) as taxonomy:  # closes the taxonomy handle after the scan
    for lin in taxonomy:
        lii = lin.split('\t|\t')
        ott_id = 'ott' + lii[0]
        rank = lii[3]
        if rank == 'kingdom':
            val += 50
            if ott_id in node_annotation:
                # Write this row's own id; the original wrote `node`, a variable
                # left over from an unrelated earlier loop.
                fi.write("{} {}\n".format(ott_id, val))

fi.close()


In [4]:
fi = open('kingdom_colors.txt','w')
startstr="""TREE_COLORS
#use this template to define branch colors and styles, colored ranges and label colors/font styles/backgrounds
#lines starting with a hash are comments and ignored during parsing

#=================================================================#
#                    MANDATORY SETTINGS                           #
#=================================================================#
#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).


SEPARATOR SPACE

DATA
"""
import random

# Names of the major groups that receive a coloured range.
name_labels = ['Bacteria',
               'Archaea',
               'Chloroplastida',
               'Chordata',
               'Ecdysozoa',
               'Lophotrochozoa',
               'Cnidaria',
               'Fungi',
               'SAR',
               'Arthropoda',
               'Echinoidea',
               'Nematoda',
               'Porifera',
               'Excavata']

# Emit the header, then a coloured "range" row for every listed group that is also a
# node of the annotated tree. Colours are drawn from a privately seeded generator so
# the same file is produced on every run.
color_rng = random.Random(0)
fi.write(startstr)
with open(taxonomy_file) as taxonomy:  # closes the taxonomy handle after the scan
    for lin in taxonomy:
        lii = lin.split('\t|\t')
        ott_id = 'ott' + lii[0]
        name = lii[2]
        if name in name_labels and ott_id in node_annotation:
            r = color_rng.randint(1, 255)
            g = color_rng.randint(1, 255)
            b = color_rng.randint(1, 255)
            color = "rgba({},{},{},0.5)".format(r, g, b)
            fi.write("{} range {} {}\n".format(ott_id, color, name))

fi.close()

NameError: name 'node_annotation' is not defined

In [242]:
nid_resp['mrcaott26521ott26533']

{'node_id': 'mrcaott26521ott26533',
 'num_tips': 57,
 'query': 'mrcaott26521ott26533',
 'source_id_map': {'pg_437@tree6242': {'git_sha': '3008105691283414a18a6c8a728263b2aa8e7960',
   'study_id': 'pg_437',
   'tree_id': 'tree6242'}},
 'supported_by': {'pg_437@tree6242': 'node1082117'},
 'synth_id': 'opentree12.3'}

In [164]:
# Request the induced tree for a small test set of nodes, labelled by id.
# NOTE(review): `test_nodes` is not defined in any visible cell — it must be
# (re)defined before this cell can run on a fresh kernel.
testtreeid = OT.synth_induced_tree(node_ids = test_nodes, label_format="id")
# Save the test tree in Newick format.
testtreeid.tree.write(path="test_id_label.tre",schema="newick")

In [181]:
print(node_annotation['ott5750755'])
print(node_annotation['ott36015'])

{'families': [], 'studies': 2, 'strict_support': 2, 'conflict': 0, 'total_descendents': 30}
{'families': [], 'studies': 5, 'strict_support': 5, 'conflict': 0, 'total_descendents': 5}


In [206]:
# Write a DATASET_STYLE annotation file for the test nodes, colouring only the
# nodes that have at least one conflict: red scaled by conflict/5.
startstr = """DATASET_STYLE
SEPARATOR TAB

#label is used in the legend table (can be changed later)
DATASET_LABEL\t{}

#dataset color (can be changed later)
COLOR\t#ffff00

DATA\n""".format("conflict")
with open("test_conflict_anno.txt", 'w') as fi:
    fi.write(startstr)
    for node in test_nodes:
        n_conflict = node_annotation[node]['conflict']
        if n_conflict > 0:
            # Cap at 1 so counts above 5 cannot push the channel past 255 or alpha past 1.
            relconf = min(n_conflict / 5, 1.0)
            r = round(255 * relconf)  # whole-number colour channel
            g = 0
            b = 0
            alpha = min(0.25 + relconf, 1.0)
            color = "rgba({}, {}, {}, {})".format(r, g, b, alpha)
            fi.write("{}\tbranch\tclade\t{}\t1\tnormal\n".format(node, color))

In [203]:
# Write a DATASET_STYLE annotation file for the test nodes, colouring only the nodes
# that have at least one study: a blue-to-green blend scaled by studies/5.
startstr = """DATASET_STYLE
SEPARATOR TAB

#label is used in the legend table (can be changed later)
DATASET_LABEL\t{}

#dataset color (can be changed later)
COLOR\t#ffff00

DATA\n""".format("Support")
with open("test_support_anno.txt", 'w') as fi:
    fi.write(startstr)
    for node in test_nodes:
        n_studies = node_annotation[node]['studies']
        if n_studies:
            # Cap at 1 so counts above 5 cannot push a channel outside 0-255 or alpha past 1.
            relsupport = min(n_studies / 5, 1.0)
            r = 0
            g = round(255 * relsupport)        # whole-number colour channels
            b = round(255 * (1 - relsupport))
            alpha = min(0.25 + relsupport, 1.0)
            color = "rgba({}, {}, {}, {})".format(r, g, b, alpha)
            fi.write("{}\tbranch\tclade\t{}\t1\tnormal\n".format(node, color))

In [168]:
# Write an iTOL DATASET_HEATMAP annotation file with two value columns
# ("support" and "conflict", see FIELD_LABELS) for every node in test_nodes.
# Everything inside the triple-quoted string is emitted verbatim into the file;
# the '#' lines in it are iTOL template comments, not Python comments.
startstr = """DATASET_HEATMAP
#In heatmaps, each ID is associated to multiple numeric values, which are displayed as a set of colored boxes defined by a color gradient
#lines starting with a hash are comments and ignored during parsing
#=================================================================#
#                    MANDATORY SETTINGS                           #
#=================================================================#
#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).
#SEPARATOR TAB
SEPARATOR SPACE
#SEPARATOR COMMA

#label is used in the legend table (can be changed later)
DATASET_LABEL example_heatmap

#dataset color (can be changed later)
COLOR #ff0000

#define labels for each individual field column
FIELD_LABELS support conflict

#=================================================================#
#                    OPTIONAL SETTINGS                            #
#=================================================================#


#Heatmaps can have an optional Newick formatted tree assigned. Its leaf IDs must exactly match the dataset FIELD_LABELS.
#The tree will be used to sort the dataset fields, and will be displayed above the dataset. It can have branch lengths defined.
#All newlines and spaces should be stripped from the tree, and COMMA cannot be used as the dataset separator if a FIELD_TREE is provided.
#FIELD_TREE (((f1:0.2,f5:0.5):1,(f2:0.2,f3:0.3):1.2):0.5,(f4:0.1,f6:0.5):0.8):1;



#=================================================================#
#     all other optional settings can be set or changed later     #
#           in the web interface (under 'Datasets' tab)           #
#=================================================================#

#Each dataset can have a legend, which is defined using LEGEND_XXX fields below
#For each row in the legend, there should be one shape, color and label.
#Optionally, you can define an exact legend position using LEGEND_POSITION_X and LEGEND_POSITION_Y. To use automatic legend positioning, do NOT define these values
#Optionally, shape scaling can be present (LEGEND_SHAPE_SCALES). For each shape, you can define a scaling factor between 0 and 1.
#Shape should be a number between 1 and 6, or any protein domain shape definition.
#1: square
#2: circle
#3: star
#4: right pointing triangle
#5: left pointing triangle
#6: checkmark

#LEGEND_TITLE,Dataset legend
#LEGEND_POSITION_X,100
#LEGEND_POSITION_Y,100
#LEGEND_SHAPES,1,2,3
#LEGEND_COLORS,#ff0000,#00ff00,#0000ff
#LEGEND_LABELS,value1,value2,value3
#LEGEND_SHAPE_SCALES,1,1,0.5

#left margin, used to increase/decrease the spacing to the next dataset. Can be negative, causing datasets to overlap.
#MARGIN 0

#width of the individual boxes
#STRIP_WIDTH 25

#always show internal values; if set, values associated to internal nodes will be displayed even if these nodes are not collapsed. It could cause overlapping in the dataset display.
#SHOW_INTERNAL 0


#show dashed lines between leaf labels and the dataset
#DASHED_LINES 1

#if a FIELD_TREE is present, it can be hidden by setting this option to 0
#SHOW_TREE 1

#define the color for the NULL values in the dataset. Use the letter X in the data to define the NULL values
#COLOR_NAN #000000

#automatically create and display a legend based on the color gradients and values defined below
#AUTO_LEGEND 1


#define the heatmap gradient colors. Values in the dataset will be mapped onto the corresponding color gradient.
COLOR_MIN #ff0000
COLOR_MAX #0000ff

#you can specify a gradient with three colors (e.g red to yellow to green) by setting 'USE_MID_COLOR' to 1, and specifying the midpoint color
#USE_MID_COLOR 1
#COLOR_MID #ffff00

#By default, color gradients will be calculated based on dataset values. You can force different values to use in the calculation by setting the values below:
#USER_MIN_VALUE 0
#USER_MID_VALUE 500
#USER_MAX_VALUE 1000

#border width; if set above 0, a border of specified width (in pixels) will be drawn around individual cells
#BORDER_WIDTH,0

#border color; used only when BORDER_WIDTH is above 0
#BORDER_COLOR,#0000ff


#Internal tree nodes can be specified using IDs directly, or using the 'last common ancestor' method described in iTOL help pages
#=================================================================#
#       Actual data follows after the "DATA" keyword              #
#=================================================================#
DATA\n
"""

# The context manager closes (and flushes) the file even if a node is missing
# from node_annotation or lacks one of the two keys.
with open("test_heatmap.txt", 'w') as fi:
    fi.write(startstr)
    for node in test_nodes:
        support = node_annotation[node]['studies']
        conflict = node_annotation[node]['conflict']
        # Space-delimited to match the "SEPARATOR SPACE" line in the header.
        fi.write("{} {} {}\n".format(node, support, conflict))

In [81]:
# Four components (red, green, blue, alpha) need the rgba() form; this also
# matches the "rgba({}, {}, {}, {})" colours built for the support annotation.
color = 'rgba(255, 204, 153, 0.5)'
# NOTE: write_annotation is defined in a later cell of this notebook; that cell
# has to be run before this one.
write_annotation(not_captured, color, 'annotest', "annotest.txt")

In [None]:
# Write one tab-separated row per family in fams_in_tree to tip_annotation.tsv.
# fam_dict[fam] is the split taxonomy row for the family: index 0 is used as the
# OTT id and index 4 as the sources field (presumably the taxonomy's source-info
# column -- confirm against taxonomy.tsv).
with open("tip_annotation.tsv", 'w') as fi:
    # Header lists exactly the five fields written per row, with bare tabs so the
    # column names carry no padding spaces.
    fi.write('name\tottid\tsources\tnode_id\tnotes\n')
    for fam in fams_in_tree:
        ott_id = fam_dict[fam][0]
        sources = fam_dict[fam][4]
        node_id = fams_node_id[fam]
        notes = ''
        # node_ids is built elsewhere in the notebook; it is indexed by node id
        # and each entry is joined as a collection of names.
        if node_id is not None and len(node_ids[node_id]) > 1:
            taxa = " - ".join(node_ids[node_id])
            notes = 'Maps to same node as: ' + taxa
        fi.write("{}\t{}\t{}\t{}\t{}\n".format(fam, ott_id, sources, node_id, notes))

In [80]:
def write_annotation(labels, color, name, outputfile):
    """Append an iTOL DATASET_STYLE annotation block to ``outputfile``.

    A header (declaring TAB as the separator, the dataset label ``name`` and a
    fixed dataset colour ``#ffff00``) is written first, followed by one
    ``branch``/``clade`` style line per entry in ``labels``.

    Parameters
    ----------
    labels : iterable
        Node or tip identifiers; each becomes one data line.
    color : str
        Colour string written verbatim into every data line.
    name : str
        Value for the DATASET_LABEL header field.
    outputfile : str
        Path of the file to write. It is opened in append mode, so calling
        this twice with the same path stacks a second header and data block
        after the first.

    Returns
    -------
    None
    """
    # The header declares "SEPARATOR TAB", so the label and colour fields are
    # tab-delimited as well (they were comma-delimited before).
    startstr = """DATASET_STYLE
SEPARATOR TAB

#label is used in the legend table (can be changed later)
DATASET_LABEL\t{}

#dataset color (can be changed later)
COLOR\t#ffff00

DATA\n""".format(name)
    # Context manager guarantees the handle is flushed and closed on return
    # or on error.
    with open(outputfile, 'a') as fi:
        fi.write(startstr)
        for label in labels:
            fi.write("{}\tbranch\tclade\t{}\t1\tnormal\n".format(label,color))

In [45]:
# Write one tab-separated row per family in fams_in_tree to families_in_tree.tsv.
# fam_dict[fam] is the split taxonomy row for the family: index 0 is used as the
# OTT id and index 4 as the sources field (presumably the taxonomy's source-info
# column -- confirm against taxonomy.tsv).
with open("families_in_tree.tsv", 'w') as fi:
    # Header lists exactly the five fields written per row, with bare tabs so the
    # column names carry no padding spaces.
    fi.write('name\tottid\tsources\tnode_id\tnotes\n')
    for fam in fams_in_tree:
        ott_id = fam_dict[fam][0]
        sources = fam_dict[fam][4]
        node_id = fams_node_id[fam]
        notes = ''
        # node_ids is built elsewhere in the notebook; it is indexed by node id
        # and each entry is joined as a collection of names.
        if node_id is not None and len(node_ids[node_id]) > 1:
            taxa = " - ".join(node_ids[node_id])
            notes = 'Maps to same node as: ' + taxa
        fi.write("{}\t{}\t{}\t{}\t{}\n".format(fam, ott_id, sources, node_id, notes))

In [52]:
def remove_problem_characters(tree, prob_char = "():#", replace_w = '?'):
    """Substitute unwanted characters in the labels of a tree, in place.

    For every node yielded by iterating ``tree``: when the node has a truthy
    ``taxon``, that taxon's ``label`` is cleaned; otherwise, when the node has
    a truthy ``label`` of its own, that label is cleaned. Nodes with neither
    are left untouched.

    Parameters
    ----------
    tree : iterable of nodes
        Each node must expose ``taxon`` and ``label`` attributes.
    prob_char : str
        Characters to replace; each character is treated separately.
    replace_w : str
        Replacement text for every occurrence of a problem character.

    Returns
    -------
    None
    """
    unwanted = set(prob_char)

    def _scrub(holder):
        # Rewrite holder.label once per unwanted character.
        for bad in unwanted:
            holder.label = holder.label.replace(bad, replace_w)

    for nd in tree:
        if nd.taxon:
            _scrub(nd.taxon)
        elif nd.label:
            _scrub(nd)
    return None

In [53]:
import copy
# Work on a deep copy so the label edits below do not alter synthtree.tree itself.
local_tree = copy.deepcopy(synthtree.tree)

# Replace the default problem characters "():#" with '?' in the copy's labels.
remove_problem_characters(local_tree)
# Save the cleaned copy in newick format.
local_tree.write(path="allfam.tre",schema="newick")

In [59]:
# Reverse lookup from an 'ott<id>' label to the family name (id taken from
# index 0 of each fam_dict entry).
rev_fam = dict(('ott{}'.format(row[0]), name) for name, row in fam_dict.items())

# Relabel every node whose current label is a key of node_ids. The taxon label
# is used when the node has a taxon; otherwise the node's own label is used.
for node in local_tree:
    if node.taxon:
        label_holder = node.taxon
    elif node.label:
        label_holder = node
    else:
        continue
    if label_holder.label not in node_ids:
        continue
    fam = '-'.join(list(node_ids[label_holder.label]))
    label_holder.label = "{} broken - MRCA {}".format(fam, label_holder.label)

In [57]:
# Don't forget to cite your friendly phylogeneticists!
# Take the 'supporting_studies' entry from the synth tree response and show
# how many entries it holds.
studies = synthtree.response_dict['supporting_studies']
len(studies)

455

In [58]:
# Fetch the citation text for the supporting studies and save it to disk.
cites = OT.get_citations(studies) #this can be a bit slow
# The context manager closes the file even if the write fails.
with open("all_famcites.txt","w") as fi:
    fi.write(cites)

In [60]:
# Save local_tree (in its current, relabelled state) as a second newick file.
local_tree.write(path="allfam_labeled.tre",schema="newick")

In [23]:
#Table of values
# Each row is (caption, count). The tip count walks the leaf iterator without
# building an intermediate list.
summary_rows = (
    ('total families', len(fam_dict)),
    ('in tree', len(fams_in_tree)),
    ('not in tree', len(not_in_tree)),
    ('monophyletic', len(monophy)),
    ('not monophyletic', len(non_monophy)),
    ('tips in tree', sum(1 for _ in synthtree.tree.leaf_node_iter())),
)
for row_caption, row_count in summary_rows:
    print(row_caption, row_count)


total families 21933
in tree 10104
not in tree 11829
monophyletic 9240
not monophyletic 864
tips in tree 9306
