In [1]:
from opentree import OT, taxonomy_helpers
import copy
import dendropy

In [2]:
# Grab the phylo only synth tree to get the ids in synth
#!wget https://files.opentreeoflife.org/synthesis/opentree13.4/output/grafted_solution/grafted_solution.tre


In [3]:
fi = open("grafted_solution.tre").readlines()
for lin in fi:
    lin = lin.replace('(',',')
    lin = lin.replace(')',',')
    lii = lin.split(',')
    synth_ottids = set(lii)

In [4]:
len(synth_ottids)

104749

In [5]:
# Get the grafted synth tree with Fabacea at the root
output = OT.synth_subtree('ott560323', label_format="id")
output.tree.write(path="legumes.tre",schema="newick")


In [6]:
pruned = copy.deepcopy(output.tree)

In [7]:
#pruned = orig_tree.extract_tree(suppress_unifurcations=False)
# Hmmmm forget why I ever needed this

In [8]:
taxa_to_retain = []
for leaf in pruned.leaf_nodes():
    tax = leaf.taxon
    if tax:
        ottid = tax.label
    else:
        ottid = None
    if ottid in synth_ottids:
        taxa_to_retain.append(tax)

In [9]:
leaf.taxon.label

'ott7633656'

In [10]:
taxa_to_prune = []
for tax in pruned.taxon_namespace:
    ottid = tax.label.split('_')[-1]
    if ottid not in synth_ottids:
        taxa_to_prune.append(tax)

In [11]:
len(taxa_to_prune)

19607

In [12]:
len(taxa_to_retain)

4872

In [13]:
pruned.retain_taxa(taxa_to_retain)

In [14]:
import json
dates = json.load(open("../../chronosynth/node_ages.json"))

In [15]:
#Check there is a date fro the root, or it will break
dates['node_ages']['ott560323']

[{'age': 69.14253099999999,
  'source_id': 'ot_1250@tree2',
  'source_node': 'node99521',
  'time_unit': 'Myr'}]

In [16]:
ages = open("ages",'w')

dated_nodes = set()
undated_nodes = set()
for node in pruned:
    lab = None
    if node.label:
        if node.label.startswith('mrca'):
            lab = node.label
        elif node.label.startswith('ott'):
            lab = node.label
        else:
            lab = node.label.split()[-1]
        if lab in dates['node_ages']:
            dated_nodes.add(lab)
            age_range = [float(source['age']) for source in dates['node_ages'][lab]]
            age_range.sort()
            age_est = sum(age_range) / len(age_range) 
            ages.write("{}\t{}\n".format(node.label, age_est))
        #    node.label = age_est
        else:
        #    node.label = "-"
            undated_nodes.add(lab)


ages.close()
len(dated_nodes)

119

In [17]:
node.label

In [18]:
trefile = "Phylo_only_legumes.tre"
pruned.write(path = trefile, schema = "newick")
            

In [19]:
! phylocom bladj -f Phylo_only_legumes.tre > Phylo_only_legumes_dated_ids.tre

In [20]:
taxonomy_helpers.download_taxonomy_file(version = 3.3, loc = '../..')

Taxonomy already available at ../../ott3.3

'../../ott3.3'

In [21]:
!head ../../ott3.3/taxonomy.tsv

uid	|	parent_uid	|	name	|	rank	|	sourceinfo	|	uniqname	|	flags	|	
805080	|		|	life	|	no rank	|	silva:0,ncbi:1,gbif:0,irmng:0	|		|		|	
93302	|	805080	|	cellular organisms	|	no rank	|	ncbi:131567	|		|		|	
844192	|	93302	|	Bacteria	|	domain	|	silva:A16379/#1,ncbi:2,worms:6,gbif:3,irmng:13,irmng:109739,irmng:109741	|		|		|	
248067	|	844192	|	Proteobacteria	|	phylum	|	silva:A16379/#2,ncbi:1224,worms:178054,gbif:111,irmng:192	|	Proteobacteria (phylum silva:A16379/#2)	|		|	
822744	|	248067	|	Gammaproteobacteria	|	class	|	silva:A16379/#3,ncbi:1236,worms:393018,gbif:362,irmng:1392	|		|	sibling_higher	|	
767311	|	822744	|	Pasteurellales	|	order	|	silva:A16379/#4,ncbi:135625,worms:394199,gbif:1472,irmng:11731	|		|	sibling_higher	|	
1098176	|	767311	|	Pasteurellaceae	|	family	|	silva:A16379/#5,ncbi:712,worms:394200,gbif:9536,irmng:104984	|	Pasteurellaceae (family silva:A16379/#5)	|		|	
470454	|	1098176	|	Haemophilus	|	genus	|	silva:A16379/#6,ncbi:724,worms:571392,gbif:3219815,irmng:1307220

In [22]:
ott_id_lookup = {}
for lin in open('../../ott3.3/taxonomy.tsv'):
    lii = lin.split('|')
    ott_id_lookup[lii[0].strip()] = lii[2].strip()

In [23]:
#-> relabel back to name labels
dated =dendropy.Tree.get(path='Phylo_only_legumes_dated_ids.tre', schema = 'newick')
for tax in dated.taxon_namespace:
    ottid = tax.label.strip('ott')
    taxname = ott_id_lookup[ottid]
    tax.label = taxonomy_helpers.remove_problem_characters(taxname)

In [24]:
dated.write(path='Phylo_only_legumes_dated_names.tre', schema = 'newick')

In [25]:
taxtree = OT.taxon_subtree(ott_id=560323, label_format="name")
taxtree.tree.write(path='Tax_only_legumes.tre', schema = 'newick')

## Get Citations

In [26]:
# Don't forget to cite your friendly phylogeneticists!
studies = output.response_dict['supporting_studies']
cites = OT.get_citations(studies) #this can be a bit slow
fi = open("legume_synth_cites.txt","w")
fi.write(cites)
fi.close()

In [27]:
#These are the citations used to build the synth tree
print(cites)

https://tree.opentreeoflife.org/curator/study/view/ot_1987?tab=trees&tree=tree1
Sinou, C., Cardinal‐McTeague, W., & Bruneau, A. (2020). Testing generic limits in Cercidoideae (Leguminosae): Insights from plastid and duplicated nuclear gene sequences. TAXON, 69(1), 67–86. doi:10.1002/tax.12207

http://dx.doi.org/10.1002/tax.12207

https://tree.opentreeoflife.org/curator/study/view/pg_2689?tab=trees&tree=tree6241
Drummond, C. S., Eastwood, R. J., Miotto, S. T., & Hughes, C. E. (2012). Multiple continental radiations and correlates of diversification in Lupinus (Leguminosae): testing for key innovation with incomplete taxon sampling. Systematic Biology 61(3), 443-460.
http://dx.doi.org/10.1093/sysbio/syr126

https://tree.opentreeoflife.org/curator/study/view/pg_2712?tab=trees&tree=tree6296
Wang, Hengchang, Michael J. Moore, Pamela S. Soltis, Charles D. Bell, Samuel F. Brockington, Roolse Alexandre, Charles C. Davis, Maribeth Latvis, Steven R. Manchester, and Douglas E. Soltis. 2009. Rosid

In [28]:
# Get all the studies that have trees conataining Fabacea
phylesystem_studies_resp = OT.find_trees("560323", search_property ='ot:ottId')

In [29]:
studies_in_synth = [treeid.split('@')[0] for treeid in studies]

In [30]:
#Studies in phylesystem but not in synth

In [31]:
phylesystem_only_studies = set()
for study in phylesystem_studies_resp.response_dict['matched_studies']:
    study_id = study['ot:studyId']
    if study_id not in studies_in_synth:
        phylesystem_only_studies.add(study_id)

In [32]:
len(phylesystem_only_studies)

88

In [33]:
cites_phyl = OT.get_citations(phylesystem_only_studies) #this can be a bit slow

In [34]:
print(cites_phyl)

https://tree.opentreeoflife.org/curator/study/view/ot_1826
Luckow M., Fortunato R., Sede S., & Livshultz T. 2005. The phylogenetic affinities of two mysterious monotypic mimosoids from southern South America. Systematic Botany, null.
http://dx.doi.org/10.1600/0363644054782206

https://tree.opentreeoflife.org/curator/study/view/pg_222
TestFakeCitation, Chris Baron,


https://tree.opentreeoflife.org/curator/study/view/pg_17
Burleigh, J. Gordon, Khidir W. Hilu, and Douglas E. Soltis. 2009. Inferring phylogenies with incomplete data sets: a 5-gene, 567-taxon analysis of angiosperms. BMC Evolutionary Biology 9 (January): 1-11.
http://dx.doi.org/10.1186/1471-2148-9-61

https://tree.opentreeoflife.org/curator/study/view/pg_626
Nickrent, D., Blarer, A., Qiu, Y. L., Vidal-Russell, R., & Anderson, F. (2004). Phylogenetic inference in Rafflesiales: the influence of rate heterogeneity and horizontal gene transfer. BMC Evolutionary Biology, 4(1), 40.
http://dx.doi.org/10.1186/1471-2148-4-40

https:

In [35]:
fi = open("curation_unfinished.txt","w")
fi.write(cites_phyl)
fi.close()

In [36]:
'''fi=open("legume_tax_info.tsv","w")
fi.write("SppName\tLabel\tOttId\tGbifId\tncbiId\tSynonyms\n")
for tip in taxa_to_retain:
    ott_id = tip.label.split()[-1].strip('ott')
    resp = OT.taxon_info(int(ott_id))    
    match = resp.response_dict
    spp = match['name']
    gbif_id = ""
    ncbi_id = ""
    for source in match['tax_sources']:
        if source.startswith("gbif"):
            gbif_id = gbif_id + source
        if source.startswith("ncbi"):
            ncbi_id = ncbi_id + source
    synonyms = ",".join(match['synonyms'])
    fi.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(spp, tip.label, ott_id, gbif_id, ncbi_id, synonyms))
fi.close()'''

'fi=open("legume_tax_info.tsv","w")\nfi.write("SppName\tLabel\tOttId\tGbifId\tncbiId\tSynonyms\n")\nfor tip in taxa_to_retain:\n    ott_id = tip.label.split()[-1].strip(\'ott\')\n    resp = OT.taxon_info(int(ott_id))    \n    match = resp.response_dict\n    spp = match[\'name\']\n    gbif_id = ""\n    ncbi_id = ""\n    for source in match[\'tax_sources\']:\n        if source.startswith("gbif"):\n            gbif_id = gbif_id + source\n        if source.startswith("ncbi"):\n            ncbi_id = ncbi_id + source\n    synonyms = ",".join(match[\'synonyms\'])\n    fi.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(spp, tip.label, ott_id, gbif_id, ncbi_id, synonyms))\nfi.close()'