In [3]:
%%bash
# Align the unaligned schindel2017 sequences with MAFFT.
# --retree 2 --maxiterate 0 selects the fast progressive strategy (FFT-NS-2 in
# the MAFFT manual): two guide-tree passes and no iterative refinement.
# stdout is captured as the alignment file; stderr goes to a log.
unaligned_fasta=data/original/unaligned_schindel2017.fasta
aligned_fasta=data/processed/aligned_schindel2017.fasta
mafft_log=logs/mafft_usnm.log

mafft --thread 8 --retree 2 --maxiterate 0 --fft "$unaligned_fasta" > "$aligned_fasta" 2> "$mafft_log"

In [4]:
%%bash
# Build a tree from the MAFFT alignment with FastTree.
# -nt marks the input as a nucleotide alignment and -gtr selects the
# generalized time-reversible model. stdout is captured as the tree file;
# stderr goes to a log.
alignment=data/processed/aligned_schindel2017.fasta
tree_out=data/processed/usnm_bird_tree.nwk
fasttree_log=logs/fasttree_usnm.log

FastTree -gtr -nt "$alignment" > "$tree_out" 2> "$fasttree_log"

In [6]:
import pandas as pd

# Read only the specimen metadata columns used later in the notebook, then add
# `just_country`: the part of `country` before the first ':' (for example
# "Guyana: Northwest District" -> "Guyana").
cols_to_keep = ['accession', 'scientific_name', 'specimen_voucher', 'country']
usnm_specimens = pd.read_csv(
    'data/original/schindel2017.tsv',
    sep='\t',
    usecols=cols_to_keep,
).assign(
    just_country=lambda frame: frame['country'].str.split(':').str[0]
)
print(usnm_specimens.head())

  accession                     country          scientific_name  \
0  JQ176654                      Guyana  Xiphorhynchus obsoletus   
1  JQ176549  Guyana: Northwest District         Trogon violaceus   
2  JQ176510                      Guyana    Todirostrum maculatum   
3  JQ176359  Guyana: Northwest District    Tachyphonus luctuosus   
4  JQ176343                      Guyana       Tachornis squamata   

  specimen_voucher just_country  
0       KU:O:89742       Guyana  
1       KU:O:88933       Guyana  
2       KU:O:90939       Guyana  
3       KU:O:89078       Guyana  
4       KU:O:91651       Guyana  


In [7]:
# Accession -> metadata lookups, one plain dict per column. The generator is
# unpacked in the same order as the column names listed below.
org_dict, voucher_dict, country_dict = (
    usnm_specimens.set_index('accession')[column].to_dict()
    for column in ('scientific_name', 'specimen_voucher', 'just_country')
)

In [8]:
# Load the per-species distance summary and keep the rows whose `max_intra`
# exceeds `min_inter`.
# NOTE(review): presumably these are species without a barcode gap -- confirm.
dist_df = pd.read_csv(
    'data/processed/schindel2017_distance_summary.tsv',
    sep='\t',
)
intra_exceeds_inter = dist_df['max_intra'].gt(dist_df['min_inter'])
bad_dist = dist_df.loc[intra_exceeds_inter]
bad_orgs = bad_dist['scientific_name'].tolist()
print(len(bad_orgs))

25


In [14]:
from ete3 import Tree, TreeStyle, TextFace

# Load the tree from the file that the FastTree cell writes to
# (data/processed/usnm_bird_tree.nwk).
t = Tree("data/processed/usnm_bird_tree.nwk")

In [15]:
# Attach a text label (accession, voucher, scientific name, country) to every
# leaf. Leaves whose genus + species appears in `bad_orgs` are drawn in red
# italics.
#
# NOTE(review): add_face appends to a node's faces, so re-running this cell
# without first reloading `t` presumably stacks a second label on each leaf --
# confirm.

# Build the lookup once; testing membership against the list rescans it for
# every leaf.
flagged_species = set(bad_orgs)

for node in t.iter_leaves():
    seq_id = str(node.name)
    voucher = voucher_dict[seq_id]
    country = country_dict[seq_id]
    org_name = org_dict[seq_id]
    # pandas reads missing metadata as NaN, which str() renders as the literal
    # text "nan". Leave such fields out of the label instead.
    label_text = ', '.join(
        str(field)
        for field in (seq_id, voucher, org_name, country)
        if pd.notna(field)
    )
    # Compare on the first two words only, so names with extra trailing words
    # still match.
    genus_species = ' '.join(str(org_name).split(' ')[:2])
    if genus_species in flagged_species:
        label = TextFace(label_text, fsize=10, fgcolor='red', fstyle='italic')
    else:
        label = TextFace(label_text)
    node.add_face(label, column=0)

In [17]:
# Figure layout. The custom TextFace on each leaf already carries the
# accession, so the built-in leaf names are switched off to avoid printing
# them twice.
ts = TreeStyle()
ts.scale = 1200                # horizontal stretch applied to branch lengths
ts.branch_vertical_margin = 5  # vertical spacing between neighbouring branches
ts.margin_right = 24           # padding on the right edge of the figure
ts.show_leaf_name = False

# Write the figure as an 800 px wide SVG.
tree_file = t.render(
    "figures/fasttree_tree.svg",
    tree_style=ts,
    w=800,
    units='px',
)