In [25]:
import copy
import json
import os
import sys

from opentree import OT

Download the current OpenTree taxonomy from https://tree.opentreeoflife.org/about/taxonomy-version/ott3.2

In [2]:
# IPython shell escape: list the files in the ../ott3.2 directory.
!ls ../ott3.2

README.html  forwards.tsv  synonyms.tsv  taxonomy.tsv  version.txt


In [3]:
# Path of the taxonomy table that the following cells parse.
taxonomy_file = "../ott3.2/taxonomy.tsv"
# Stop here if the file is not where the notebook expects it.
assert os.path.exists(taxonomy_file)

Families are convenient to find via string matching, because they all end in *aceae* or *idae*

In [5]:
# Build fam_dict: candidate family name -> the full list of fields of its
# taxonomy row. Rows are split on the '\t|\t' separator; index 2 is the name.
fam_dict = {}
# `with` guarantees the taxonomy file is closed once it has been read.
with open(taxonomy_file) as taxonomy_rows:
    for row in taxonomy_rows:
        fields = row.split('\t|\t')
        if len(fields) < 3:
            # Blank or truncated row: there is no name column to inspect.
            continue
        name = fields[2]
        if ' ' in name:
            # Multi-word names are never treated as families.
            continue
        # str.endswith accepts a tuple, covering both family-style suffixes.
        if name.endswith(("aceae", "idae")):
            fam_dict[name] = fields

However, because the taxonomy merges a variety of taxonomic resources, we do end up with some misplaced names.

In [11]:
# Names that matched a family-style suffix but whose field at index 3 is not
# 'family'; the value stored is whatever that field holds instead.
maybe_not_fams = {
    name: fields[3]
    for name, fields in fam_dict.items()
    if fields[3] != 'family'
}

In [12]:
# How many suffix-matched names are not ranked 'family'.
len(maybe_not_fams)

163

In [15]:
# Total number of suffix-matched names collected.
len(fam_dict)

21933

In [28]:
# Cache: family name -> synthetic-tree node id (None when the lookup failed).
# Kept in its own cell so re-running the query loop does not wipe it.
fams_node_id = {}

There are around 22,000 families, so the query can be a bit slow.
To make later steps more efficient, we first check whether each family is found in the tree at all, and store the result.

In [None]:
# Ask the synthetic-tree service about every family that is not already
# cached in fams_node_id, printing one dot per 50 families as progress.
fams_in_tree = set()
not_in_tree = set()
for i, fam in enumerate(fam_dict, start=1):
    if i % 50 == 0:
        sys.stdout.write('.')
    if fam in fams_node_id:
        # Answered by an earlier (possibly interrupted) run of this cell.
        # Both sets were just reset above, so rebuild this family's
        # membership from the cache instead of silently leaving it out.
        if fams_node_id[fam] is None:
            not_in_tree.add(fam)
        else:
            fams_in_tree.add(fam)
        continue
    ott_id = fam_dict[fam][0]
    resp = OT.synth_node_info(ott_id = ott_id)
    try:
        nid = resp.response_dict.get('node_id')
        fams_in_tree.add(fam)
    except Exception:
        # `except Exception` (not a bare `except`) lets Ctrl-C interrupt this
        # slow loop. The only failure tolerated is a 400 response --
        # presumably the service's answer for a taxon absent from the tree;
        # anything else trips the assertion.
        assert(str(resp.response) == '<Response [400]>')
        not_in_tree.add(fam)
        nid = None
    fams_node_id[fam] =  nid

............................................................................................................................................................................................

In [43]:
# Save the family -> node id lookup as JSON so the slow query loop need not
# be repeated. `json` is imported in the notebook's import cell.
with open('fam_synth_node_info.json', 'w') as outfile:
    json.dump(fams_node_id, outfile)

In [47]:
# Reverse lookup: synthetic node id -> list of family names resolving to it.
# Later cells index this by node id, call .keys() on it and test tree labels
# for membership, so it must be a dict rather than the bare .values() view.
# Families with no node id (None) are left out.
node_ids = {}
for fam_name, nid in fams_node_id.items():
    if nid is not None:
        node_ids.setdefault(nid, []).append(fam_name)

In [84]:
# Write one CSV row per queried family: name, OTT id, synthetic node id, and
# a note whenever several families share the same node.
# node_ids is indexed by node id here, so it must map node id -> family names.
with open('fam_synth_node_info.csv', 'w') as outfile:
    outfile.write("Family, ott_id, synthetic node id,notes\n")
    for fam, node_id in fams_node_id.items():
        ott_id = fam_dict[fam][0]
        notes = ''
        if node_id is not None and len(node_ids[node_id]) > 1:
            taxa = " - ".join(node_ids[node_id])
            # Trailing space added: the note used to run straight into the
            # first taxon name ("...same node asFooaceae").
            notes = 'Family maps to same node as ' + taxa
        outfile.write("{},{}, {}, {}\n".format(fam, ott_id, node_id, notes))

In [48]:
# Number of families recorded in fams_in_tree by the query loop.
len(fams_in_tree)

10104

In [94]:
# Tab-separated report of the families recorded in not_in_tree.
# `with` closes the file even if a row fails to format.
with open("families_not_in_tree.tsv",'w') as fi:
    # The header is tab-separated like the rows; it previously mixed a tab
    # with commas, so the columns did not line up.
    fi.write('name\tottid\tsources\tflags\n')
    # Sorted so the file is identical from run to run (set order is not).
    for fam in sorted(not_in_tree):
        fields = fam_dict[fam]
        # Negative indices count from the end of the split taxonomy row; the
        # header labels these two columns as sources and flags.
        fi.write("{}\t{}\t{}\t{}\n".format(fam, fields[0], fields[-4], fields[-2]))

In [93]:
# Tab-separated report of the families recorded in fams_in_tree, with a note
# whenever several families resolve to the same synthetic node.
# node_ids is indexed by node id here, so it must map node id -> family names.
with open("families_in_tree.tsv",'w') as fi:
    # Header cells no longer carry padding spaces around the tabs, so the
    # column names are exactly 'name', 'ottid', ...
    fi.write('name\tottid\tsources\tnode_id\tnotes\n')
    # Sorted so the file is identical from run to run (set order is not).
    for fam in sorted(fams_in_tree):
        ott_id = fam_dict[fam][0]
        sources = fam_dict[fam][4]
        node_id = fams_node_id[fam]
        notes = ''
        if node_id is not None and len(node_ids[node_id]) > 1:
            taxa = " - ".join(node_ids[node_id])
            notes = 'Maps to same node as: ' + taxa
        fi.write("{}\t{}\t{}\t{}\t{}\n".format(fam, ott_id, sources, node_id, notes))

In [56]:
# Request the tree induced by every node id held in node_ids.
# The .keys() call means node_ids has to be a dict (keyed by node id) here.
synthtree = OT.synth_induced_tree(node_ids = list(node_ids.keys()), label_format="name")

In [58]:
# Save the induced tree, labels untouched, in Newick format.
synthtree.tree.write(path="allfam.tre",schema="newick")

dict_keys(['broken', 'newick', 'supporting_studies'])


In [59]:
def remove_problem_characters(tree, prob_char = "():#", replace_w = '?'):
    """Replace troublesome characters in a tree's labels, in place.

    For every node yielded by iterating ``tree``: if the node has a truthy
    ``taxon``, the taxon's label is cleaned; otherwise, if the node has a
    truthy ``label`` of its own, that label is cleaned.

    Parameters
    ----------
    tree : iterable of nodes
        Each node must expose ``taxon`` and ``label`` attributes.
    prob_char : str
        Every character in this string is replaced.
    replace_w : str
        Replacement text substituted for each problem character.

    Returns
    -------
    None
        Labels are modified on the node and taxon objects themselves.
    """
    # One translation table replaces all problem characters in a single pass.
    # Unlike chained str.replace calls, the outcome cannot depend on the order
    # in which characters are visited when replace_w contains a problem
    # character itself.
    table = str.maketrans({char: replace_w for char in prob_char})
    for node in tree:
        if node.taxon:
            label = node.taxon.label
            # A taxon without a label has nothing to clean; skip it rather
            # than raise AttributeError on None.
            if label is not None:
                node.taxon.label = label.translate(table)
        elif node.label:
            node.label = node.label.translate(table)
    return None

In [60]:
# Work on a deep copy so synthtree.tree keeps its original labels.
# `copy` is imported in the notebook's import cell.
local_tree = copy.deepcopy(synthtree.tree)

remove_problem_characters(local_tree)
# Same path as the earlier write of synthtree.tree, so that file is replaced
# by the cleaned-label version.
local_tree.write(path="allfam.tre",schema="newick")

In [61]:
# Don't forget to cite your friendly phylogeneticists!
# The induced-tree response object in this notebook is `synthtree`; the name
# `tree` used here before is never defined.
studies = synthtree.response_dict['supporting_studies']
cites = OT.get_citations(studies) #this can be a bit slow
# `with` closes the file even if the write fails.
with open("all_famcites.txt","w") as fi:
    fi.write(cites)

In [80]:
# Reverse lookup 'ott<id>' -> family name, built from each row's first field.
rev_fam = dict(('ott{}'.format(fam_dict[name][0]), name) for name in fam_dict)

# Relabel every node whose current label is a key of node_ids with the
# families that map to it. A node's taxon label takes precedence; the node's
# own label is only considered when the node has no taxon.
for node in local_tree:
    if node.taxon:
        labelled = node.taxon
    elif node.label:
        labelled = node
    else:
        continue
    if labelled.label in node_ids:
        fam = '-'.join(list(node_ids[labelled.label]))
        labelled.label = "{} broken - MRCA {}".format(fam, labelled.label)

In [81]:
# Save the relabelled copy of the tree in Newick format.
local_tree.write(path="allfam_labeled.tre",schema="newick")

# Alternate approach:
Throw everything into it, and let some taxa be broken

In [None]:
# NOTE(review): `fams_ott_id` is not defined anywhere in the visible notebook;
# the .keys() call implies a dict keyed by OTT ids -- confirm where it comes from.
output = OT.synth_induced_tree(ott_ids=list(fams_ott_id.keys()),  label_format='name_and_id')

In [None]:
# The 'broken' entry of the induced-tree response. Later cells look entries up
# by 'ott<id>' strings and treat each value as the label of an MRCA node.
broken = output.response_dict['broken']
len(broken)

In [None]:
# Reverse lookup 'ott<id>' -> family name. fam_dict values are the full lists
# of taxonomy fields, so the OTT id is the first element; formatting the
# whole list would produce keys that never match an entry of `broken`.
rev_fam = {'ott{}'.format(fields[0]): name for name, fields in fam_dict.items()}

# Group broken families by the node their entry in `broken` points at.
rev_broken = {}
for ott_str, mrca in broken.items():
    rev_broken.setdefault(mrca, []).append(rev_fam[ott_str])


# Relabel every node whose label is one of those MRCA entries. A node's taxon
# label takes precedence; the node's own label is used only without a taxon.
# NOTE(review): labels are compared whole, while the tree was requested with
# label_format='name_and_id' -- confirm that labels of the form '<name> <id>'
# are not expected to match here.
for node in output.tree:
    if node.taxon:
        if node.taxon.label in rev_broken:
            fam = '-'.join(rev_broken[node.taxon.label])
            node.taxon.label = "{} broken - MRCA {}".format(fam, node.taxon.label)
    elif node.label:
        if node.label in rev_broken:
            fam = '-'.join(rev_broken[node.label])
            node.label = "{} broken - MRCA {}".format(fam, node.label)

In [None]:
# We can check which taxa are in the tree
# The last whitespace-separated token of each label is kept: one set for
# labels carried by a node's taxon, one for labels carried by the node itself.
synth_leaf_ott_ids = {
    node.taxon.label.split()[-1] for node in output.tree if node.taxon
}
synth_node_ids = {
    node.label.split()[-1] for node in output.tree if node.label
}


In [None]:
# Per-family summary: is the taxon broken, which node stands in for it, and
# does it (or that node) appear in the induced tree as a leaf or a node.
# `with` closes the file even if a row fails to format.
with open('fam_info_synth_1.csv','w') as outfi:
    outfi.write("fam_name, ott_id, broken_taxon, mrca_of_descendents, in_synth_tree\n")
    for fam_name in fam_dict:
        # fam_dict values are the full lists of taxonomy fields; the OTT id is
        # the first one. Using the whole list produced an 'ott[...]' string
        # that could never match `broken` or the id sets.
        ott_id = fam_dict[fam_name][0]
        brok = "-"
        mrca = "-"
        in_synth_tree = 'False'
        if ott_id != 'None':
            ott_str = "ott{}".format(ott_id)
            if ott_str in broken:
                brok = "T"
                mrca = broken[ott_str].strip()
                if mrca in synth_leaf_ott_ids:
                    in_synth_tree = 'Leaf'
                elif mrca in synth_node_ids:
                    in_synth_tree = 'Node'
            # A direct hit on the family's own id overrides the MRCA result.
            if ott_str in synth_leaf_ott_ids:
                in_synth_tree = 'Leaf'
            elif ott_str in synth_node_ids:
                in_synth_tree = 'Node'

        outfi.write("{},{},{},{},{}\n".format(fam_name, ott_id, brok, mrca, in_synth_tree))