In [1]:
from opentree import OT
import csv

In [2]:
# Fetch the taxonomic subtree rooted at OTT id 81461, with nodes labelled by id.
# NOTE(review): 81461 is presumably Aves, judging by the variable name — confirm.
ott_birds_subtree = OT.taxon_subtree(ott_id=81461, label_format="id")

In [3]:
# Gather every non-empty label in the subtree, whether it is stored on the
# node itself or on the taxon object attached to the node.
ott_birds_ids = set()
for node in ott_birds_subtree.tree:
    taxon_label = node.taxon.label if node.taxon else None
    ott_birds_ids.update(label for label in (node.label, taxon_label) if label)

In [4]:
# Number of distinct labels collected from the subtree.
len(ott_birds_ids)

27530

In [5]:
# Filter the full OTT taxonomy dump down to the rows whose id is in the bird
# subtree. Matching rows are copied verbatim to birds_of_ott.tsv and indexed
# in ott_bird_tax_dict by their id string.
ott_taxonomy_file = "../../ott3.3/taxonomy.tsv"
ott_bird_tax_dict = {}
header = ['ottid', 'parent_ottid', 'name','rank','source','uniqname','flags']
# Context managers guarantee that both handles are closed (and the output
# flushed) even if a row fails to parse.
with open(ott_taxonomy_file) as taxonomy_fi, open("birds_of_ott.tsv", "w") as bird_tax_file:
    # The first line is the column header; copy it through unchanged.
    head = taxonomy_fi.readline()
    bird_tax_file.write(head)
    for lin in taxonomy_fi:
        # Fields are separated by "\t|\t"; zip() keeps only as many fields as
        # there are names in `header`.
        lii = lin.split('\t|\t')
        # Subtree labels carry an "ott" prefix, the taxonomy ids do not.
        if "ott{}".format(lii[0]) in ott_birds_ids:
            ott_bird_tax_dict[lii[0]] = dict(zip(header, lii))
            bird_tax_file.write(lin)
        

In [6]:
# Number of taxonomy rows kept; compare with the number of subtree labels above.
len(ott_bird_tax_dict)

27530

In [7]:
# Map each taxon name to its OTT id; a name that occurs more than once keeps
# the id of the last record seen.
name_to_id = {
    record['name']: record['ottid']
    for record in ott_bird_tax_dict.values()
}

In [8]:
# Load the eBird taxonomy into ebird_all_tax_dict, keyed by the third column
# (SPECIES_CODE). Each value maps the column names in `header` to the row's
# fields.
##NOTE: Taxonomy file has commas within fields, quoted
header = ['TAXON_ORDER','CATEGORY','SPECIES_CODE','PRIMARY_COM_NAME','SCI_NAME','ORDER1','FAMILY','SPECIES_GROUP','REPORT_AS']
ebird_all_tax_dict = {}
# newline='' is what the csv module asks for so that it can handle line
# endings (including ones inside quoted fields) itself; the context manager
# closes the handle when parsing is done.
with open("eBird_Taxonomy_v2019.csv", newline='') as ebird_taxonomy_file:
    # Skip the file's own header line; column names come from `header`.
    ebird_taxonomy_file.readline()
    for lii in csv.reader(ebird_taxonomy_file, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL, skipinitialspace=True):
        if not lii:
            # csv.reader yields an empty list for a blank line.
            continue
        ebird_all_tax_dict[lii[2]] = dict(zip(header, lii))

In [9]:
# Spot-check one record: the entry stored under the species code 'helgui'.
ebird_all_tax_dict['helgui']

{'TAXON_ORDER': '864',
 'CATEGORY': 'species',
 'SPECIES_CODE': 'helgui',
 'PRIMARY_COM_NAME': 'Helmeted Guineafowl',
 'SCI_NAME': 'Numida meleagris',
 'ORDER1': 'Galliformes',
 'FAMILY': 'Numididae (Guineafowl)',
 'SPECIES_GROUP': 'Grouse, Quail, and Allies',
 'REPORT_AS': ''}

In [10]:
# Number of eBird records loaded (one per distinct SPECIES_CODE key).
len(ebird_all_tax_dict)

16513

In [11]:
# Reverse lookup: scientific name -> eBird species code. If two records share
# a scientific name, the later one wins.
# `tax` is kept as a plain loop variable because the final cell of the
# notebook displays whatever value it was left with.
ebird_name_to_code = {}
for tax, record in ebird_all_tax_dict.items():
    ebird_name_to_code[record['SCI_NAME']] = tax

In [12]:
# eBird code -> OTT id for every eBird record whose scientific name is
# identical to a name in the bird taxonomy.
exact_taxon_matches = {
    code: name_to_id[record['SCI_NAME']]
    for code, record in ebird_all_tax_dict.items()
    if record['SCI_NAME'] in name_to_id
}

In [13]:
# Peek at the first ten eBird codes that matched a taxonomy name exactly.
list(exact_taxon_matches.keys())[:10]

['ostric2',
 'ostric3',
 'grerhe1',
 'tabtin1',
 'higtin1',
 'hootin1',
 'grytin1',
 'soltin1',
 'blatin1',
 'gretin1']

In [15]:
# For every eBird record without an exact name match, ask the TNRS service
# (restricted to the "Birds" context) for its scientific name. Keep the result
# only when the first result set contains exactly one match.
synonym_exact_matched = {}
for spp, record in ebird_all_tax_dict.items():
    if spp in exact_taxon_matches or spp in synonym_exact_matched:
        continue
    ebname = record['SCI_NAME']
    match = OT.tnrs_match(names=[ebname], context_name="Birds")
    results = match.response_dict.get('results')
    if not results:
        continue
    candidates = results[0]['matches']
    if len(candidates) == 1:
        ott_id = candidates[0]['taxon']['ott_id']
        synonym_exact_matched[spp] = ott_id
    

In [16]:
# Number of eBird codes resolved through a single unambiguous TNRS match.
len(synonym_exact_matched)

908

In [17]:
# Persist the TNRS matches as "<ebird code>,<ott id>" rows under a header line.
# The context manager closes the file even if a write fails.
with open('exact_synonym_match_ott3.3.csv', 'w') as synmatchedfi:
    synmatchedfi.write("ebird_code, ott_id\n")
    for spp, matched_id in synonym_exact_matched.items():
        synmatchedfi.write("{},{}\n".format(spp, matched_id))
    

In [18]:
# Reload the TNRS matches from disk (eBird code -> OTT id, both as strings).
synonym_exact_matched = {}
with open('exact_synonym_match_ott3.3.csv') as synmatchedfi:
    # The file starts with an "ebird_code, ott_id" header line; skip it so it
    # is not stored as if it were a match.
    synmatchedfi.readline()
    for lin in synmatchedfi:
        lii = lin.strip().split(',')
        if len(lii) < 2:
            # Blank or malformed line: nothing to record.
            continue
        synonym_exact_matched[lii[0]] = lii[1]

In [19]:
# List the scientific names of all eBird records that have neither an exact
# name match nor a TNRS match, one name per line.
with open('unmatched.txt','w') as unmatchedfi:
    for spp, record in ebird_all_tax_dict.items():
        if spp in exact_taxon_matches or spp in synonym_exact_matched:
            continue
        unmatchedfi.write(record['SCI_NAME'])
        unmatchedfi.write('\n')

In [20]:
# Scientific name -> eBird species code (last record wins on duplicate names).
reverse_name = {
    record['SCI_NAME']: code
    for code, record in ebird_all_tax_dict.items()
}

In [21]:
# Read the fuzzy-match table (tab separated, one header line). Column 0 is
# looked up in reverse_name to get the eBird code and column 2 is stored as the
# matched id.
fuzzy_matches = {}
with open("output/fuzzy_matches.csv") as fuzzy_match_fi:
    fuzzy_match_fi.readline()
    for lin in fuzzy_match_fi:
        lii = lin.split('\t')
        if len(lii) < 3:
            # Blank or truncated line: no id column to read.
            continue
        raw_id = lii[2]
        # Strip surrounding whitespace (e.g. a trailing newline when this is
        # the last column) so the id can be used as a lookup key later.
        ott_id = raw_id.strip()
        # The length test on the raw field is kept as the "has an id" filter.
        if len(raw_id) > 1 and ott_id:
            spp = reverse_name[lii[0]]
            fuzzy_matches[spp] = ott_id

In [22]:
# Process the hand-curated match table (comma separated, one header line).
#  * Rows whose second column has at least two space-separated words are sent
#    to TNRS; a single unambiguous match is recorded in eliot_matches and
#    logged to synonym_ammendments.csv.
#  * All other rows are copied verbatim to taxon_ammendments.csv.
# Names for which TNRS returns no results are printed.
eliot_matches = {}
# One `with` for all three files so each is closed even if a lookup raises.
with open("final-eBird-ott7Mar2021.csv") as eliot_match_fi, \
        open("synonym_ammendments.csv", 'w') as new_synonmys, \
        open("taxon_ammendments.csv", 'w') as new_taxa:
    eliot_match_fi.readline()
    new_synonmys.write("ott_id, current_name, new_synonym\n")
    new_taxa.write("genus,new_taxon\n")
    for lin in eliot_match_fi:
        lii = lin.split(',')
        ebname = lii[0]
        synonym = lii[1].strip()
        if len(synonym.split(' ')) >= 2:
            match = OT.tnrs_match(names=[synonym], context_name="Birds")
            results = match.response_dict.get('results')
            if results:
                if len(results[0]['matches']) == 1:
                    ott_id = results[0]['matches'][0]['taxon']['ott_id']
                    code = ebird_name_to_code[ebname]
                    eliot_matches[code] = ott_id
                    new_synonmys.write("{},{},{}\n".format(ott_id, synonym, ebname))
            else:
                print(synonym)
        else:
            new_taxa.write(lin)

In [26]:
# Write one row per eBird record to combined_data_ott3.3.tsv: the eBird
# columns, the columns of the matched OTT taxon ('-' where absent) and the
# match type. all_ott_ids records, per matched OTT id, the (species code,
# scientific name) of the eBird record written last for that id.
header = ['TAXON_ORDER','CATEGORY','SPECIES_CODE','PRIMARY_COM_NAME','SCI_NAME','ORDER1','FAMILY','SPECIES_GROUP','REPORT_AS', 'ottid', 'parent_ottid', 'name','rank','source','uniqname','flags']
all_ott_ids = {}
# Checked in this order; a later source overrides an earlier one.
match_sources = (
    (exact_taxon_matches, "canonical_match"),
    (synonym_exact_matched, "synonym_match"),
    (fuzzy_matches, "fuzzy_match"),
    (eliot_matches, "hand_match_eliot"),
)
with open("combined_data_ott3.3.tsv", "w") as ofi:
    ofi.write('\t'.join(header))
    ofi.write('\tmatch_type\n')
    for spp in ebird_all_tax_dict:
        ott_id = None
        typ = ""
        for matches, label in match_sources:
            if spp in matches:
                ott_id = matches[spp]
                typ = label
        # Work on a copy so the OTT columns are not merged into the shared
        # eBird record.
        info = dict(ebird_all_tax_dict[spp])
        if ott_id:
            info.update(ott_bird_tax_dict[str(ott_id)])
            all_ott_ids[str(ott_id)] = (info['SPECIES_CODE'], info['SCI_NAME'])
        else:
            typ = "unmatched"
        for item in header:
            ofi.write(info.get(item,'-').strip())
            ofi.write('\t')
        ofi.write(typ)
        ofi.write('\n')
            
        

        
        

In [24]:
# Scratch check for a single species code: pick its OTT id using the same
# precedence as the combined table (later sources override earlier ones) and
# merge the OTT taxonomy fields into its eBird record.
# `info` is the very dict stored in ebird_all_tax_dict, so the merge modifies
# that record in place.
spp = 'helgui'
info = ebird_all_tax_dict[spp]
for candidates, label in (
    (exact_taxon_matches, "canonical_match"),
    (synonym_exact_matched, "synonym_match"),
    (fuzzy_matches, "fuzzy_match"),
    (eliot_matches, "hand_match_eliot"),
):
    if spp in candidates:
        ott_id = candidates[spp]
        typ = label

info.update(ott_bird_tax_dict[str(ott_id)])

In [25]:
# Display the merged record built in the cell above.
info

{'TAXON_ORDER': '864',
 'CATEGORY': 'species',
 'SPECIES_CODE': 'helgui',
 'PRIMARY_COM_NAME': 'Helmeted Guineafowl',
 'SCI_NAME': 'Numida meleagris',
 'ORDER1': 'Galliformes',
 'FAMILY': 'Numididae (Guineafowl)',
 'SPECIES_GROUP': 'Grouse, Quail, and Allies',
 'REPORT_AS': '',
 'ottid': '684046',
 'parent_ottid': '684050',
 'name': 'Numida meleagris',
 'rank': 'species',
 'source': 'ncbi:8996,gbif:2473341,irmng:10192853',
 'uniqname': '',
 'flags': ''}

In [27]:
# Distinct matched OTT ids minus the number of eBird records. all_ott_ids is
# keyed by OTT id and only filled for matched records, so unmatched records and
# several eBird codes sharing one id both push this difference below zero.
len(all_ott_ids) - len(ebird_all_tax_dict)

-5114

In [28]:
# Write every bird taxon that no eBird record was matched to, as tab separated
# rows under a header line.
header = ['ottid', 'parent_ottid', 'name','rank','source','uniqname','flags']
with open("ott_taxa_not_in_ebird.csv", 'w') as ofi:
    ofi.write('\t'.join(header))
    # Terminate the header line so the first data row starts on its own line.
    ofi.write('\n')
    for ott_id, record in ott_bird_tax_dict.items():
        if ott_id not in all_ott_ids:
            for item in header:
                ofi.write(record[item].strip())
                ofi.write('\t')
            ofi.write('\n')
        

In [None]:
# Request the synthetic tree induced by all matched OTT ids (ids the service
# does not know are ignored) and save it in newick format.
matched_ott_ids = list(all_ott_ids.keys())
output = OT.synth_induced_tree(
    ott_ids=matched_ott_ids,
    label_format='id',
    ignore_unknown_ids=True,
)
output.tree.write(path="all_ebird_id_label.tre", schema="newick")

In [None]:
import json
# From git@github.com:OpenTreeOfLife/ChronoSynth.git
# Load the node-age table; the context manager closes the file after parsing.
with open("../../ChronoSynth/node_ages.json") as node_ages_fi:
    dates = json.load(node_ages_fi)

In [None]:
# bladj demands a date for the root. I picked this one from Google...
# Each entry is a list of source records because the code that reads
# dates['node_ages'] iterates over the value and reads source['age'] and
# source['source_id'] from every element.
dates['node_ages']['ott81461'] = [{'age': 60,
  'source_id': 'google',
  'source_node': None,
  'time_unit': 'Myr'}]

# NOTE(review): the same placeholder date is assigned to ott241846; the reason
# this second node needs it is not visible here — confirm.
dates['node_ages']['ott241846'] = [{'age': 60,
  'source_id': 'google',
  'source_node': None,
  'time_unit': 'Myr'}]

In [None]:
trefile = "ebird_Aves_ID_annot.tre"
# Write the induced synthetic tree held in `output` to `trefile` in newick
# format. This cell only writes the tree; no citations are written here.
output.tree.write(path = trefile, schema = "newick")

In [None]:
# Write the "ages" file: one "<label>\t<age>" line for every labelled node of
# the induced tree that has dates in dates['node_ages']. The age written is the
# MEAN of all ages recorded for that node. Labels with and without dates are
# collected in dated_nodes / undated_nodes.
dated_nodes = set()
undated_nodes = set()
# The context manager closes (and therefore flushes) the file before any later
# cell or shell command reads it.
with open("ages",'w') as ages:
    for node in output.tree:
        taxon_label = node.taxon.label if node.taxon else None
        # A label can live on the node itself or on its attached taxon.
        for raw_label in (node.label, taxon_label):
            if not raw_label:
                continue
            if raw_label.startswith(('mrca', 'ott')):
                lab = raw_label
            else:
                # Otherwise use the last whitespace-separated token as the id.
                lab = raw_label.split()[-1]
            if lab in dates['node_ages']:
                dated_nodes.add(lab)
                age_range = sorted(float(source['age']) for source in dates['node_ages'][lab])
                age_est = sum(age_range) / len(age_range)
                # Write the label that was actually matched; node.label may be
                # empty when the match came from the taxon label.
                ages.write("{}\t{}\n".format(raw_label, age_est))
            else:
                undated_nodes.add(lab)


len(dated_nodes)

In [None]:
# Run phylocom's bladj on the tree file and capture its output as the dated tree.
# NOTE(review): bladj presumably reads the "ages" file from the working directory — confirm.
! phylocom bladj -f ebird_Aves_ID_annot.tre > all_ebird_blad.tre

In [None]:
ages = open("ages",'w')

dated_nodes = set()
undated_nodes = set()
for node in output.tree:
    lab = None
    if node.label:
        if node.label.startswith('mrca'):
            lab = node.label
        elif node.label.startswith('ott'):
            lab = node.label
        else:
            lab = node.label.split()[-1]
        if lab in dates['node_ages']:
            dated_nodes.add(lab)
            age_range = [float(source['age']) for source in dates['node_ages'][lab]]
            age_range.sort()
            age_est = sum(age_range) / len(age_range) 
            ages.write("{}\t{}\n".format(node.label, age_est))
        else:
            undated_nodes.add(lab)
    if node.taxon:
        if node.taxon.label:
            if node.taxon.label.startswith('mrca'):
                lab = node.taxon.label
            elif node.taxon.label.startswith('ott'):
                lab = node.taxon.label
            else:
                lab = node.taxon.label.split()[-1]
            if lab in dates['node_ages']:
                dated_nodes.add(lab)
                age_range = [float(source['age']) for source in dates['node_ages'][lab]]
                age_range.sort()
                age_est = age_range[:-1]
                ages.write("{}\t{}\n".format(node.label, age_est))
            else:
                undated_nodes.add(lab)


len(dated_nodes)

! phylocom bladj -f ebird_Aves_ID_annot.tre > all_ebird_blad_max.tre

In [None]:
ages = open("ages",'w')

dated_nodes = set()
undated_nodes = set()
for node in output.tree:
    lab = None
    if node.label:
        if node.label.startswith('mrca'):
            lab = node.label
        elif node.label.startswith('ott'):
            lab = node.label
        else:
            lab = node.label.split()[-1]
        if lab in dates['node_ages']:
            dated_nodes.add(lab)
            age_range = [float(source['age']) for source in dates['node_ages'][lab]]
            age_range.sort()
            age_est = sum(age_range) / len(age_range) 
            ages.write("{}\t{}\n".format(node.label, age_est))
        else:
            undated_nodes.add(lab)
    if node.taxon:
        if node.taxon.label:
            if node.taxon.label.startswith('mrca'):
                lab = node.taxon.label
            elif node.taxon.label.startswith('ott'):
                lab = node.taxon.label
            else:
                lab = node.taxon.label.split()[-1]
            if lab in dates['node_ages']:
                dated_nodes.add(lab)
                age_range = [float(source['age']) for source in dates['node_ages'][lab]]
                age_range.sort()
                age_est = age_range[0]
                ages.write("{}\t{}\n".format(node.label, age_est))
            else:
                undated_nodes.add(lab)


len(dated_nodes)

! phylocom bladj -f ebird_Aves_ID_annot.tre > all_ebird_blad_min.tre

In [None]:
import random
ages = open("ages",'w')

all_studies = set()
dated_nodes = set()
undated_nodes = set()
for node in output.tree:
    lab = None
    if node.label:
        if node.label.startswith('mrca'):
            lab = node.label
        elif node.label.startswith('ott'):
            lab = node.label
        else:
            lab = node.label.split()[-1]
        if lab in dates['node_ages']:
            studies = {source["source_id"].split('@')[0] for source in dates['node_ages'][lab]}
            all_studies = all_studies.union(studies)
            dated_nodes.add(lab)
            age_range = [float(source['age']) for source in dates['node_ages'][lab]]
            age_range.sort()
            age_est = sum(age_range) / len(age_range) 
            ages.write("{}\t{}\n".format(node.label, age_est))
        else:
            undated_nodes.add(lab)
    if node.taxon:
        if node.taxon.label:
            if node.taxon.label.startswith('mrca'):
                lab = node.taxon.label
            elif node.taxon.label.startswith('ott'):
                lab = node.taxon.label
            else:
                lab = node.taxon.label.split()[-1]
            if lab in dates['node_ages']:
                studies = {source["source_id"].split('@')[0] for source in dates['node_ages'][lab]}
                all_studies = all_studies.union(studies)
                dated_nodes.add(lab)
                age_range = [float(source['age']) for source in dates['node_ages'][lab]]
                age_range.sort()
                age_est = random.choice(age_range)
                ages.write("{}\t{}\n".format(node.label, age_est))
            else:
                undated_nodes.add(lab)


len(dated_nodes)

! phylocom bladj -f ebird_Aves_ID_annot.tre > all_ebird_blad_rand1.tre

In [None]:
# Number of distinct source_id prefixes (the text before '@') seen among the dated nodes.
len(all_studies)

In [None]:
import dendropy
# Read the tree written to all_ebird_blad.tre back in as a DendroPy tree.
dated_tree_path = "all_ebird_blad.tre"
bl_tree = dendropy.Tree.get_from_path(dated_tree_path, schema="newick")

In [None]:
# Will map "ott<id>" tree labels to eBird scientific names (filled in the next cell).
id_to_name = {}

In [None]:
# For every "ott<id>" label in the dated tree (on the node or on its taxon),
# look up the eBird scientific name recorded for that id in all_ott_ids and
# store it in id_to_name under the full label.
for node in bl_tree:
    ebird_name = None
    taxon_label = node.taxon.label if node.taxon else None
    for label in (node.label, taxon_label):
        if not label or not label.startswith('ott'):
            continue
        # Drop the literal "ott" prefix by slicing; str.strip('ott') would
        # remove any run of 'o'/'t' characters from both ends instead.
        bare_id = label[len('ott'):]
        # all_ott_ids values are (species code, scientific name) pairs.
        ebird_name = all_ott_ids.get(bare_id, [None, None])[1]
        if ebird_name:
            id_to_name[label] = ebird_name

In [None]:
# Every node of the dated tree, and the taxon labels of its leaves.
nodes = list(bl_tree)
tips = [leaf.taxon.label for leaf in bl_tree.leaf_node_iter()]

In [None]:
# Number of leaves in the dated tree.
len(tips)

In [None]:
# Total number of nodes (leaves and internal) in the dated tree.
len(nodes)

In [None]:
# Number of internal (non-leaf) nodes.
len(nodes)-len(tips)

In [None]:
# Sanity check: every leaf labelled with an "ott" id must have an eBird name.
for tip in (label for label in tips if label.startswith('ott')):
    assert tip in id_to_name

In [None]:
# Create an empty annotation entry for each node of the dated tree, keyed by
# the node's own label or, failing that, by its taxon label. Nodes with neither
# a label nor a taxon are printed.
node_annotation = {}
for node in bl_tree:
    if node.label:
        node_key = node.label
    elif node.taxon:
        node_key = node.taxon.label
    else:
        print(node)
        continue
    if node_key:
        node_annotation[node_key] = {}

In [None]:
# Give every annotation entry its three fields, each starting as an empty list.
for nid in node_annotation:
    node_annotation[nid] = {
        'studies': [],
        'strict_support': [],
        'conflict': [],
    }

In [None]:
# Collect the labels of the induced tree (node label first, taxon label as
# fallback; nodes with neither are printed), fetch the synthesis node info for
# all of them in one call, and index the response by node id. Each response
# entry also gets a 'dates' count: the number of age records for that node.
nid_annotation = {}
for node in output.tree:
    if node.label:
        node_key = node.label
    elif node.taxon:
        node_key = node.taxon.label
    else:
        print(node)
        continue
    if node_key:
        nid_annotation[node_key] = {}

resp = OT.synth_node_info(list(nid_annotation.keys())).response_dict
nid_resp = dict()
for node_info in resp:
    node_id = node_info['node_id']
    node_info['dates'] = len(dates['node_ages'].get(node_id, []))
    nid_resp[node_id] = node_info
    


In [None]:
# Fill node_annotation with per-node counts taken from the synthesis node info:
#   'studies'        - number of keys in source_id_map
#   'strict_support' - number of keys in supported_by
#   'conflict'       - number of entries in conflicts_with
#   'dates'          - number of age records (computed when nid_resp was built)
# A node whose only source is 'ott3.2draft9' counts as zero studies / support.
# NOTE(review): 'ott3.2draft9' is presumably the taxonomy's own source id —
# confirm it matches the synthesis release in use.
supported_tips = set()
unsupported_tips = set()
taxonomy_only = set(['ott3.2draft9'])
for node in node_annotation:
    node_info = nid_resp[node]
    # Missing fields are treated as empty rather than raising on None.
    supporting = node_info.get('source_id_map') or {}
    strict_support = node_info.get('supported_by') or {}
    conflict = node_info.get('conflicts_with') or []
    node_annotation[node]['dates'] = node_info['dates']
    if supporting.keys() == taxonomy_only:
        node_annotation[node]['studies'] = 0
    else:
        node_annotation[node]['studies'] = len(supporting.keys())
    if strict_support.keys() == taxonomy_only:
        node_annotation[node]['strict_support'] = 0
    else:
        node_annotation[node]['strict_support'] = len(strict_support.keys())
    node_annotation[node]['conflict'] = len(conflict)


In [None]:
# Largest conflict, study and date counts over all annotated nodes (never
# below zero, and zero when there are no nodes).
max_conf = max([0] + [anno['conflict'] for anno in node_annotation.values()])
max_support = max([0] + [anno['studies'] for anno in node_annotation.values()])
max_dates = max([0] + [anno['dates'] for anno in node_annotation.values()])

print(max_conf)
print(max_support)
print(max_dates)


In [None]:
# Write a DATASET_STYLE annotation file that colours each branch by its number
# of supporting studies: green, with intensity and opacity growing with the
# count (saturating at 5 studies); faint black for nodes with no studies and
# for all tips.
startstr = """DATASET_STYLE
SEPARATOR TAB

#label is used in the legend table (can be changed later)
DATASET_LABEL\t{}

#dataset color (can be changed later)
COLOR\t#ffff00

DATA\n""".format("Support")
with open("support_anno.txt", 'w') as fi:
    fi.write(startstr)
    for node in node_annotation:
        # Compute the label for every node, so unsupported nodes are written
        # under their own label.
        nodelab = node.replace(" ","_")
        if node_annotation[node]['studies']:
            # Cap at 5, like the conflict and date annotations, so the colour
            # channel stays within 0-255 and the alpha within 0-1.25.
            relsupport = min(node_annotation[node]['studies'],5)/5
            r = 0
            g = 255*relsupport
            b = 0
            color = "rgba({}, {}, {}, {})".format(r, g, b, 0.25+relsupport)
        else:
            color = "rgba(0, 0, 0, 0.25)"
        fi.write("{}\tbranch\tclade\t{}\t1\tnormal\n".format(nodelab,color))
    for tip in tips:
        color = "rgba(0, 0, 0, 0.25)"
        fi.write("{}\tbranch\tclade\t{}\t1\tnormal\n".format(tip,color))


In [None]:
# Write a DATASET_STYLE annotation file that colours each branch by its number
# of conflicting sources: red, with intensity and opacity growing with the
# count (saturating at 5); all tips are written again in faint black.
startstr = """DATASET_STYLE
SEPARATOR TAB

#label is used in the legend table (can be changed later)
DATASET_LABEL\t{}

#dataset color (can be changed later)
COLOR\t#ffff00

DATA\n""".format("conflict")
# The context manager closes the file even if a write fails part-way.
with open("conflict_anno.txt", 'w') as fi:
    fi.write(startstr)
    for node in node_annotation:
        nodelab = node.replace(" ","_")
        relconf = min(node_annotation[node]['conflict'],5)/5
        r = 255*relconf
        g = 0
        b = 0
        color = "rgba({}, {}, {}, {})".format(r, g, b, 0.25+relconf)
        fi.write("{}\tbranch\tclade\t{}\t1\tnormal\n".format(nodelab,color))
    for tip in tips:
        color = "rgba(0, 0, 0, 0.25)"
        fi.write("{}\tbranch\tclade\t{}\t1\tnormal\n".format(tip,color))

In [None]:
# Write a DATASET_STYLE annotation file that colours each branch by its number
# of age records: blue, with intensity and opacity growing with the count
# (saturating at 5).
startstr = """DATASET_STYLE
SEPARATOR TAB

#label is used in the legend table (can be changed later)
DATASET_LABEL\t{}

#dataset color (can be changed later)
COLOR\t#ffff00

DATA\n""".format("dates")
# The context manager closes the file even if a write fails part-way.
with open("date_anno.txt", 'w') as fi:
    fi.write(startstr)
    for node in node_annotation:
        nodelab = node.replace(" ","_")
        reldates = min(node_annotation[node]['dates'],5)/5
        r = 0
        g = 0
        b = 255*reldates
        color = "rgba({}, {}, {}, {})".format(r, g, b, 0.25+reldates)
        fi.write("{}\tbranch\tclade\t{}\t1\tnormal\n".format(nodelab,color))

In [None]:
# Write a LABELS annotation file (comma separated) that relabels each
# "ott<id>" node with the eBird scientific name stored in id_to_name.
startstr = """LABELS
#use this template to change the leaf labels, or define/change the internal node names (displayed in mouseover popups)

#lines starting with a hash are comments and ignored during parsing

#=================================================================#
#                    MANDATORY SETTINGS                           #
#=================================================================#
#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).

#SEPARATOR TAB
#SEPARATOR SPACE
SEPARATOR COMMA

#Internal tree nodes can be specified using IDs directly, or using the 'last common ancestor' method described in iTOL help pages
#=================================================================#
#       Actual data follows after the "DATA" keyword              #
#=================================================================#
DATA
#NODE_ID,LABEL\n"""
# The context manager closes the file even if a write fails part-way.
with open("relabel.txt", 'w') as fi:
    fi.write(startstr)

    for ottid, ebird_name in id_to_name.items():
        fi.write("{},{}\n".format(ottid, ebird_name))

In [None]:
# Spot-check whether one specific label is among the tree's leaf labels.
'ott892276' in tips

In [None]:
# NOTE(review): `tax` is not defined in this cell; it is only ever bound as a
# loop variable in earlier cells, so this shows whatever value the last such
# loop left behind. Looks like leftover debugging — consider removing.
tax