## Getting a tree for families
The opentree module gives us access to the OpenTree Api's via python.  
This module is in development, this code was run using the version:  
https://github.com/OpenTreeOfLife/python-opentree sha 9e95e6952190c914a5

In [58]:
from opentree import OT
import copy

Get the list of families from a textfile.  

In [59]:
fam_fi=open("Family_tree_GF.txt").readlines()
fam_names = set()
for lin in fam_fi:
    lii = lin.strip()
    fam_names.add(lii)

In [25]:
len(fam_names)

916

Match names to taxon ids in the OpenTree taxonomy  
e.g https://tree.opentreeoflife.org/taxonomy/browse?name=Agaricostilbaceae

In [57]:
#fam_dict = dict()
for fam_name in fam_spps:
    if fam_name not in fam_dict:
        ott_id = OT.get_ottid_from_name(fam_name)
        fam_dict[fam_name] = ott_id 

In [27]:
fam_ott_ids = set()
no_match = set()
for fam_name in fam_dict:
    ott_id = fam_dict[fam_name]
    if ott_id:
        fam_ott_ids.add(ott_id)
    else:
        print("No ott_id for {}".format(fam_name))

No ott_id for Clostridiales Family XVII. Incertae Sedis
No ott_id for Clostridiales Family XII. Incertae Sedis
No ott_id for Clostridiales Family XIII. Incertae Sedis
No ott_id for Rep1


In [28]:
len(fam_dict)

916

In [29]:
len(fam_ott_ids)

910

In [30]:
try:
    fam_ott_ids.remove(None)
except:
    pass

## Take the ott ids for the families, and get a the synthetic tree.

In [31]:
output = OT.synth_induced_tree(ott_ids=list(fam_ott_ids),  label_format='name_and_id')

In [63]:
# We'll edit the names in some places, so a copy is useful
local_tree = copy.deepcopy(output.tree)

In [66]:
# Some characters in names mess up newick tree readers
def remove_problem_characters(tree, prob_char = "():#", replace_w = '?'):
    problem_characters = set(prob_char)
    for node in tree:
        if node.taxon:
            for char in problem_characters:
                node.taxon.label = node.taxon.label.replace(char,replace_w)
        elif node.label:
            for char in problem_characters:
                node.label = node.label.replace(char,replace_w)
    return None

In [67]:
remove_problem_characters(local_tree)
treefile = "FAM_cal_eDNA.tre"
local_tree.write(path = treefile, schema = "newick")

### Broken Taxa
Some family names are not found in OpenTree
because the members of that family do not form a monophyletic group.  
  
These are called "broken taxa" and the family name is mappedto the MRCA of all the members of that family in taxonomy.  
e.g. Agaricostilbaceae is not monophyletic, according to 
Zhao et al 2017 https://doi.org/10.1007/s13225-017-0381-5
https://tree.opentreeoflife.org/curator/study/view/ot_1784/?tab=trees&tree=Tr106951&conflict=ott
but the MRCA of all taxa in Agaricostilbaceae maps to
https://tree.opentreeoflife.org/opentree/argus/opentree12.3@mrcaott23276ott23291

It is possible for multiple family names to map to the same node in the tree.  
In those cases I hyphenated the names for that tip.

In [68]:
broken = output.response_dict['broken']
len(broken)

156

In [69]:
rev_fam =  {'ott{}'.format(v):k for k, v in fam_dict.items()}
rev_broken = {}

for k, v in broken.items():
    fam = rev_fam[k]
    if v not in rev_broken:
        rev_broken[v] = [fam]
    else:
        rev_broken[v].append(fam)


for node in output.tree:
    if node.taxon:
        if node.taxon.label in rev_broken:
            fam = '-'.join(rev_broken[node.taxon.label])
            node.taxon.label = "{} broken - MRCA {}".format(fam, node.taxon.label)
    elif node.label:
         if node.label in rev_broken:
            fam = '-'.join(rev_broken[node.label])
            node.label = "{} broken - MRCA {}".format(fam, node.label)

In [70]:
treefile = "FAM_cal_eDNA_mrca.tre"
output.tree.write(path = treefile, schema = "newick")

In [71]:
rev_broken

{'mrcaott23276ott23291': ['Agaricostilbaceae'],
 'mrcaott14541ott124912': ['Leucobryaceae'],
 'mrcaott6117ott105428': ['Nitrospiraceae'],
 'mrcaott2556ott3381': ['Mytilidae'],
 'mrcaott42379ott140961': ['Onuphidae'],
 'mrcaott8800ott405457': ['Helicobacteraceae'],
 'ott971709': ['Bionectriaceae',
  'Nectriaceae',
  'Herpotrichiellaceae',
  'Ophiostomataceae',
  'Magnaporthaceae',
  'Ophiocordycipitaceae',
  'Cordycipitaceae',
  'Cladosporiaceae',
  'Hypocreaceae',
  'Ceratocystidaceae',
  'Mycosphaerellaceae',
  'Clavicipitaceae',
  'Plectosphaerellaceae'],
 'ott51652': ['Hyaloscyphaceae', 'Helotiaceae', 'Sclerotiniaceae'],
 'mrcaott1403ott8433': ['Thermomonosporaceae'],
 'mrcaott23276ott60764': ['Chionosphaeraceae'],
 'mrcaott5864ott7040': ['Inocybaceae'],
 'mrcaott50ott184': ['Coriobacteriaceae'],
 'mrcaott10354ott17309': ['Pseudonocardiaceae'],
 'ott183800': ['Tremellaceae'],
 'mrcaott1093ott2355': ['Moraxellaceae'],
 'mrcaott103ott1562': ['Chromatiaceae', 'Alcanivoracaceae'],
 'mrc

In [73]:
# We can check which taxa are in the tree
synth_leaf_ott_ids = set()
synth_node_ids = set()

for node in output.tree:
    if node.taxon:
        synth_leaf_ott_ids.add(node.taxon.label.split()[-1])
    if node.label:
        synth_node_ids.add(node.label.split()[-1])


In [74]:
outfi = open('fam_info_synth_1.csv','w')
outfi.write("fam_name, ott_id, broken_taxon, mrca_of_descendents, in_synth_tree\n")
for fam_name in fam_dict:
    ott_id = fam_dict[fam_name]
    brok = "-"
    mrca = "-"
    in_synth_tree = 'False'
    if ott_id != 'None':
        ott_str = "ott{}".format(ott_id)
        if ott_str in broken:
            brok = "T"
            mrca = broken[ott_str].strip()
            if mrca in synth_leaf_ott_ids:
                in_synth_tree = 'Leaf'
            elif mrca in synth_node_ids:
                in_synth_tree = 'Node'
        if ott_str in synth_leaf_ott_ids:
            in_synth_tree = 'Leaf'
        elif ott_str in synth_node_ids:
            in_synth_tree = 'Node'
            
    outfi.write("{},{},{},{},{}\n".format(fam_name, ott_id, brok, mrca, in_synth_tree))
outfi.close()

In [75]:
missing_fams = set()
for fam_name in fam_dict:
    ott_id = fam_dict[fam_name]
    brok = "-"
    mrca = "-"
    in_synth_tree = 'False'
    if ott_id != 'None':
        ott_str = "ott{}".format(ott_id)
        if ott_str in broken:
            brok = "T"
            mrca = broken[ott_str].strip()
            if mrca in synth_leaf_ott_ids:
                in_synth_tree = 'Leaf'
            elif mrca in synth_node_ids:
                in_synth_tree = 'Node'
        if ott_str in synth_leaf_ott_ids:
            in_synth_tree = 'Leaf'
        elif ott_str in synth_node_ids:
            in_synth_tree = 'Node'
    if in_synth_tree == 'False':
        missing_fams.add(fam_name)

In [77]:
len(missing_fams)

264

## Missing families
Some families are still not found in the tree  
This is often due to taxonomy issues, or name conflicts across sources.  
For the families that are mssing from the tree, 
I pulled the asv transect data to find out what lower taxa  
were included representing each family


In [81]:
fi = open("all_asv_transects_for_EJMT.txt")
spp_reps = {}
for fam in missing_fams:
    spp_reps[fam] = set()
for lin in fi:
    lii = lin.split(';')
    while("" in lii) : 
        lii.remove("") 
    for htax in lii:
        if htax in missing_fams:
            spp_reps[htax].add(lii[-1].strip())

In [82]:
spp_reps['Oscillatoriaceae']

{'',
 'Lyngbya aestuarii',
 'Lyngbya polychroa',
 'Oscillatoria acuminata',
 'Oscillatoria tenuis',
 'Phormidium autumnale',
 'Phormidium chlorinum',
 'Phormidium laetevirens',
 'Phormidium sp. B-Tom',
 'Phormidium sp. PUPCCC 118.2'}

In [84]:
#spp_ids = {}
for fam in missing_fams:
    for spp_name in spp_reps[fam]:
        if spp_name not in spp_ids:
            ott_id = OT.get_ottid_from_name(spp_name)
            spp_ids[spp_name] = ott_id

In [85]:
fam_mrca = {}
for fam in spp_reps:
    ott_ids = set([spp_ids[spp] for spp in spp_reps[fam]])
    if None in ott_ids:
        ott_ids.remove(None)
    if len(ott_ids) < 1:
        print(fam)
        fam_mrca[fam] = None
    else:
        ret = 0
        while ret == 0:
            assert(ott_ids)
            call_record = OT.ws.tree_of_life_mrca(ott_ids=list(ott_ids))
            if 'unknown' in call_record.response_dict:
                for key in call_record.response_dict['unknown']:
                    ott_ids.remove(int(key.strip('ott')))
            elif 'mrca' in call_record.response_dict:
                ret = 1
                fam_mrca[fam] = call_record.response_dict['mrca']['node_id']              
            if len(ott_ids) < 1:
                print(fam)
                msgtemplate = 'Call to tree_of_life/mrca failed as all ids were pruned'
                print(msgtemplate)
                fam_mrca[fam] = None
                ret = 1
    

Minutisphaeraceae
Call to tree_of_life/mrca failed as all ids were pruned
Phyllophoraceae
Call to tree_of_life/mrca failed as all ids were pruned
Ancyromonadidae
Call to tree_of_life/mrca failed as all ids were pruned
Tontoniidae
Call to tree_of_life/mrca failed as all ids were pruned
Sanchytriaceae
Call to tree_of_life/mrca failed as all ids were pruned
Marinifilaceae
Call to tree_of_life/mrca failed as all ids were pruned
Coleofasciculaceae
Call to tree_of_life/mrca failed as all ids were pruned
Microbulbiferaceae
Phaeocystaceae
Balneolaceae
Call to tree_of_life/mrca failed as all ids were pruned
Rivulariaceae
Call to tree_of_life/mrca failed as all ids were pruned
Myxotrichaceae
Call to tree_of_life/mrca failed as all ids were pruned
Pseudanabaenaceae
Call to tree_of_life/mrca failed as all ids were pruned
Pseudoperisporiaceae
Call to tree_of_life/mrca failed as all ids were pruned
Labilitrichaceae
Call to tree_of_life/mrca failed as all ids were pruned
Tetrahymenidae
Microcystaceae

## Adding in MRCAs  
There are some families that we don't have records of any of the taxa listed.

In [86]:
fam_mrca_ids = [fam_mrca[fam] for fam in fam_mrca]

In [87]:
fam_ott_ids = ['ott{}'.format(idn) for idn in fam_ott_ids]

In [88]:
ids_plus = fam_mrca_ids + fam_ott_ids
ids_plus = set(ids_plus)
ids_plus.remove(None)

In [90]:
output_plus = OT.synth_induced_tree(node_ids = list(ids_plus), label_format='name_and_id')

In [91]:
tmp_tree = copy.deepcopy(output_plus.tree)

dict_keys(['broken', 'newick', 'supporting_studies'])


In [92]:
remove_problem_characters(tmp_tree)

In [93]:
treefile = "FAM_cal_eDNA_addedFams.tre"
tmp_tree.write(path = treefile, schema = "newick")

In [94]:
synth_leaf_ott_ids = set()
synth_node_ids = set()
for node in outputplus.tree:
    if node.taxon:
        synth_leaf_ott_ids.add(node.taxon.label.split()[-1])
    if node.label:
        synth_node_ids.add(node.label.split()[-1])


dict_keys(['broken', 'newick', 'supporting_studies'])


In [95]:
outfi = open('fam_info_synth_plus.csv','w')
outfi.write("fam_name, ott_id, broken_taxon, mrca_of_descendents, in_synth_tree, selected_descendents\n")
for fam_name in fam_dict:
    ott_id = fam_dict[fam_name]
    brok = "-"
    mrca = "-"
    in_synth_tree = 'False'
    selected_descendents = "-"
    if ott_id != 'None':
        ott_str = "ott{}".format(ott_id)
        if ott_str in broken:
            brok = "T"
            mrca = broken[ott_str].strip()
            if mrca in synth_leaf_ott_ids:
                in_synth_tree = 'Leaf'
            elif mrca in synth_node_ids:
                in_synth_tree = 'Node'
        if fam_name in fam_mrca:
            brok = "missing"
            mrca = fam_mrca[fam_name]
            if mrca in synth_leaf_ott_ids:
                in_synth_tree = 'Leaf'
            elif mrca in synth_node_ids:
                in_synth_tree = 'Node'
            selected_descendents = ";".join(list(set(spp_reps[fam_name])))
        if ott_str in synth_leaf_ott_ids:
            in_synth_tree = 'Leaf'
        elif ott_str in synth_node_ids:
            in_synth_tree = 'Node'
            
    outfi.write("{},{},{},{},{},{}\n".format(fam_name, ott_id, brok, mrca, in_synth_tree,selected_descendents))
outfi.close()

In [96]:
# Don't forget to cite your friendly phylogeneticists!
studies = output_plus.response_dict['supporting_studies']
cites = OT.get_citations(studies) #this can be a bit slow
fi = open("famcites.txt","w")
fi.write(cites)
fi.close()

In [97]:
len(studies)

208