In [1]:
import os
import pandas as pd
import numpy as np
from Bio import SeqIO, SeqRecord, Seq, SearchIO, AlignIO, Phylo
from Bio.Blast import NCBIWWW, NCBIXML
import Bio.Entrez
from Bio.Phylo.TreeConstruction import DistanceCalculator,DistanceTreeConstructor

In [2]:
#Function open fasta file
def open_fasta(filename):
    with open(filename) as handle:
        sequence_record = SeqIO.read(handle, 'fasta')
    return sequence_record

In [3]:
#Function for multi-taxid Blast
def blastn_with_taxid(sequence, filename, query_size = 50,list_taxid = []):
    result_handler, result_storer = None, None
    if len(list_taxid) <1:
        result_handler = NCBIWWW.qblast('blastn', 'nt', sequence) 
        result_storer = result_handler.read()
    else:
        entrez_query = ''
        for taxid in list_taxid:
            entrez_query += f'txid{taxid}[ORGN]'
            if taxid != list_taxid[-1]:
                entrez_query += ' OR '
        result_handler = NCBIWWW.qblast('blastn', 'nt', sequence, entrez_query= entrez_query, hitlist_size=query_size)
        result_storer = result_handler.read()
    with open(f'{filename}.xml', 'w') as savefile:
        savefile.write(result_storer)

In [5]:
#Creation of dictionary with all HSPs adding identity and other metrics

#Hits without at least a significant HSP are excluded



#!!!No more hsp or hsp combined



def blast_to_dictionary_plus_metrics(blastresult):
    blast_dictionary = {'ID' : [], 'Description' : [], 'Seq_length' : [], 'Accession' : [], 'Bitscore' : [], 'Bitscore_raw' : [], 
    'Evalue' : [], 'Hit_start' : [], 'Hit_end' : [], 'Query_frame' : [], 'Gap_num' : [], 'Aln_span' : [], 'Tot_aln_span':[], 'Identity' :[]}
    for result in blastresult:
        blast_dictionary['ID'].append(result.id)
        blast_dictionary['Description'].append(result.description)
        blast_dictionary['Seq_length'].append(result.seq_len)
        blast_dictionary['Accession'].append(result.accession)
        bitscore, bitscore_raw, evalue, hitstart, hitend, queryframe, gapnum, alnspan = '','','','','','','',''
        all_alnspan, all_gapnum = [],[] #seq_len is not required
        for hsp in result.hsps:
            bitscore += str(hsp.bitscore)
            bitscore_raw += str(hsp.bitscore_raw)
            evalue += str(hsp.evalue)
            hitstart += str(hsp.hit_start)
            hitend += str(hsp.hit_end)
            queryframe += str(hsp.query_frame)
            gapnum += str(hsp.gap_num)
            alnspan += str(hsp.aln_span)
            if hsp != result.hsps[-1]:
                bitscore += '/'
                bitscore_raw += '/' 
                evalue += '/'
                hitstart += '/'
                hitend += '/'
                queryframe += '/'
                gapnum += '/' 
                alnspan += '/' #I know it's not neat but it would give an error otherwise :(
            all_alnspan.append(int(hsp.aln_span))
            all_gapnum.append(int(hsp.gap_num))
        blast_dictionary['Bitscore'].append(bitscore)
        blast_dictionary['Bitscore_raw'].append(bitscore_raw)
        blast_dictionary['Evalue'].append(evalue)
        blast_dictionary['Hit_start'].append(hitstart)
        blast_dictionary['Hit_end'].append(hitend)
        blast_dictionary['Query_frame'].append(queryframe)
        blast_dictionary['Gap_num'].append(gapnum)
        blast_dictionary['Aln_span'].append(alnspan)
        tot_alnspan, tot_gapnum = int(), int()
        seq_len = int(result.seq_len)
        for span in all_alnspan:
            tot_alnspan += span
        for gap in all_gapnum:
            tot_gapnum += gap
        identity = (tot_alnspan - tot_gapnum)/seq_len*100
        blast_dictionary['Tot_aln_span'].append(tot_alnspan)
        blast_dictionary['Identity'].append(round(identity, 3))
    return blast_dictionary

In [8]:
#Open record
fasta_record = open_fasta('human_mx1.fas')
#Run Blast
taxid_list = ['9592', '9527', '9601'] #, '40674', '314147', '9531', '9544', '2008792'
blastn_with_taxid(fasta_record.seq, 'test_taxid_function', query_size=200,list_taxid = taxid_list)


In [9]:
#Re-import Blast results
results_handler = SearchIO.read('test_taxid_function.xml', 'blast-xml')
#Read XML and convert to dictionary
dictionary_blast_results_plus_metrics = blast_to_dictionary_plus_metrics(results_handler)
#Convert dictionary to DF
df_blast_results = pd.DataFrame.from_dict(dictionary_blast_results_plus_metrics)

#Clear DF


#Save it as .csv
#with open('csv_blast_results_long.csv', 'w') as savefile:
#        savefile.write(str(df_blast_results.to_csv()))


In [10]:

#Filter DF results by evalue
savable = False
#Check e-values
remove_index= []
for i in range(len(df_blast_results.index)):
    eval = df_blast_results.iloc[i]['Evalue']
    eval_storer = eval.split('/')
    #Check if there's at least one significant HSP per hit
    for e in eval_storer:
        if float(e) < 10**-10:
            savable = True
    if savable == False:
        remove_index.append(str(i))
    savable = False
        
if len(remove_index) >0:
    df_blast_results = df_blast_results.drop(df_blast_results.index[remove_index])
    df_blast_results = df_blast_results.reset_index(drop=True)


#Fileter by sequence length
query_length = len(fasta_record.seq)
lower, upper = query_length/2, query_length*3/2

df_blast_results = df_blast_results[df_blast_results['Seq_length'] > lower]
df_blast_results = df_blast_results.reset_index(drop=True)
df_blast_results = df_blast_results[df_blast_results['Seq_length'] < upper]
df_blast_results = df_blast_results.reset_index(drop=True)

df_blast_results = df_blast_results[df_blast_results['Identity'] > 50]
df_blast_results = df_blast_results.reset_index(drop=True)

"""for i in range(len(df_blast_results.index)):
    seq_len = int(df_blast_results.iloc[i]['Seq_length'])
    if seq_len < 1000 or seq_len<(query_length/2) or seq_len > (query_length*3/2):
        df_blast_results = df_blast_results.drop([i])
"""


"for i in range(len(df_blast_results.index)):\n    seq_len = int(df_blast_results.iloc[i]['Seq_length'])\n    if seq_len < 1000 or seq_len<(query_length/2) or seq_len > (query_length*3/2):\n        df_blast_results = df_blast_results.drop([i])\n"

In [90]:
#Check for reducing the number of sequences further?


In [11]:
#Function to retrieve all genbank from the result
def retrieve_all_genbank(list_of_entries):
    Bio.Entrez.email = 'A.N.Other@example.com'
    list_of_sequences = []
    for entry in list_of_entries:
        handler = Bio.Entrez.efetch(db='nucleotide', id=entry, rettype = 'gb', retmode = 'xml', retmax=1) #!!!Genbank format !! Returns JSON regardless
        #retmode = 'text'
        gb_info = Bio.Entrez.read(handler, 'genbank')
        list_of_sequences.append(gb_info)
    return list_of_sequences


#list_entries = ['JX297238', 'XM_005260982', 'NM_002462']
accession_list = df_blast_results['Accession'].tolist()
genbank_file_stack = retrieve_all_genbank(accession_list) 


In [12]:
#Create a dictionary with the genbank results

def find_taxon(list_feature_table):
    taxon = ''
    for feature in list_feature_table:
        if feature['GBFeature_key'] == 'source':
            source_wrap = feature['GBFeature_quals']
            for qualifier in source_wrap:
                if qualifier['GBQualifier_name'] == 'db_xref' and qualifier['GBQualifier_value'].startswith('taxon'):
                    taxon += (qualifier['GBQualifier_value'])
    return taxon    

def find_prot_id(list_feat_table):
    prot_id = ''
    for feature in list_feat_table:
        if feature['GBFeature_key'] == 'CDS':
            wrapper = feature['GBFeature_quals']
            for qualifier in wrapper:
                if qualifier['GBQualifier_name'] == 'protein_id':
                    prot_id += qualifier['GBQualifier_value']
    return prot_id

def find_prot_seq(list_feat_table):
    prot_seq = ''
    for feature in list_feat_table:
        if feature['GBFeature_key'] == 'CDS':
            wrapper = feature['GBFeature_quals']
            for qualifier in wrapper:
                if qualifier['GBQualifier_name'] == 'translation':
                    prot_seq += qualifier['GBQualifier_value']
    return prot_seq


def genbank_to_dictionary(list_of_genbank):
    dictionary_gen = {'Accession' : [], 'Accession_version': [], 'Gene_length' : [], 'Strandedness': [], 'Molecule_type':[], 'Organism':[], 'Taxonomy':[],
    'Nuc_sequence':[], 'Taxon':[], 'Protein_ID':[], 'Prot_sequence':[]} #'N_of_references':[]
    for gen in list_of_genbank:
        first_wrapper = gen[0]
        dictionary_gen['Accession'].append(first_wrapper['GBSeq_primary-accession'])
        dictionary_gen['Accession_version'].append(first_wrapper['GBSeq_accession-version'])
        dictionary_gen['Gene_length'].append(first_wrapper['GBSeq_length'])
        dictionary_gen['Strandedness'].append(first_wrapper['GBSeq_strandedness'])
        dictionary_gen['Molecule_type'].append(first_wrapper['GBSeq_moltype'])
        dictionary_gen['Organism'].append(first_wrapper['GBSeq_organism'])
        dictionary_gen['Taxonomy'].append(first_wrapper['GBSeq_taxonomy'])
        #dictionary_gen['N_of_references'].append(len(first_wrapper['GBSeq_references']))
        dictionary_gen['Nuc_sequence'].append(first_wrapper['GBSeq_sequence'])
        dictionary_gen['Taxon'].append(find_taxon(first_wrapper['GBSeq_feature-table']))
        dictionary_gen['Protein_ID'].append(find_prot_id(first_wrapper['GBSeq_feature-table']))
        dictionary_gen['Prot_sequence'].append(find_prot_seq(first_wrapper['GBSeq_feature-table']))
    return dictionary_gen


genbank_dict = genbank_to_dictionary(genbank_file_stack)
print(genbank_dict)

{'Accession': ['NM_001279761', 'XM_019017591', 'XM_019017590', 'XM_019017589', 'XM_019017588', 'XM_019017587', 'XM_019017586', 'XM_019017585', 'NM_001134146', 'CR860897', 'JX297238', 'JX297232', 'JX297248', 'JX297237', 'XM_015132838', 'XM_015446625', 'XM_015446624', 'XM_015446623', 'JX297233', 'NM_001079693', 'EF101561', 'XR_981666', 'XM_011725993', 'XM_011725991', 'XM_011725992', 'JX297239', 'XM_012037327', 'XM_011984185', 'XM_033232530', 'XM_033232529', 'XM_033232528', 'XM_033232527', 'XM_033232526', 'XM_033232525', 'XM_011954059', 'NM_001305954', 'XM_009202220', 'XM_031665755', 'XM_017956428', 'XM_031665754', 'XM_021935501', 'XM_003895462', 'XM_023191875', 'XM_023191869', 'XM_023191852', 'XM_023191847', 'XM_023191860', 'XM_023191853', 'XM_023191838', 'XM_007968715', 'XM_037984844', 'XM_037984843', 'XM_007968708', 'XM_007968698', 'XM_007968690', 'XM_025380298', 'XM_030914603', 'XM_017880224', 'XM_031005077', 'KT698238', 'KT698236', 'XM_024239517', 'XM_009233982', 'KT698235', 'KF92535

In [13]:
#Create DataFrame from dictionary
df_gb_results = pd.DataFrame.from_dict(genbank_dict)

In [94]:
#Check if there were excluded Taxids to re-run
#Obtain list of results' taxid
retrieved_taxids = df_gb_results['Taxon'].tolist()
retrieved_taxids = [taxid.replace('taxon:', '') for taxid in retrieved_taxids_raw]

retrieved_taxids = list(dict.fromkeys(retrieved_taxids))


#Check originally submitted list
#print(type(taxid_list[0]))
#print(type(retrieved_taxids[0]))
missing = [tax for tax in taxid_list if tax not in retrieved_taxids]

print(missing)
print(taxid_list)

['9592', '9527', '40674', '314147', '2008792']
['9592', '9527', '40674', '314147', '9531', '9544', '2008792']


In [95]:
#Re-run blast again to obtain sequences of missing Taxids

In [14]:
#Merge dictionaries
left = df_blast_results.loc[ : , ['Accession', 'ID', 'Description','Seq_length', 'Evalue', 'Bitscore', 'Tot_aln_span', 'Identity']]#.drop_duplicates('Accession', keep='first')
#left = left.drop_duplicates('Accession', inplace=True)
right = df_gb_results.loc[:,['Accession', 'Accession_version','Organism', 'Taxonomy', 'Taxon', 'Nuc_sequence','Protein_ID', 'Prot_sequence']]
#right = right.drop_duplicates('Accession', inplace=True)
df_combined = pd.merge(left, right, on='Accession')
#print(df_combined)


#Solve issue!!

In [15]:
#Filtering before multiple sequence alignment? 
#Only top 5-10 sequences per taxid? 


#Top 5!!!! BY IDENTITY - EVALUE - BITSCORE 


retrieved_taxon = df_combined['Taxon'].tolist()
#retrieved_taxon = [taxid.replace('taxon:', '') for taxid in retrieved_taxon] 
retrieved_taxon = list(dict.fromkeys(retrieved_taxon))
print(len(retrieved_taxon))

df_filtered = pd.DataFrame()

for taxon in retrieved_taxon:
    temp_df = df_combined[df_combined['Taxon'] == taxon]
    temp_df = temp_df.sort_values('Identity', ascending=False) #['Evalue', 'Identity', 'Bitscore'], ascending=[True, False, False] #Evalues/Bitscore have mashed results
    if len(temp_df) < 3:
        df_filtered = df_filtered.append(temp_df)
    else:
        df_filtered = df_filtered.append(temp_df[:3])

df_filtered = df_filtered.reset_index(drop=True)


27


In [16]:
df_filtered.to_csv('test_csv.csv', index=False)

In [69]:
#Prepare file for multiple alignmnet

query_protein = fasta_record.translate(to_stop=True)
fasta_for_alignment = ''
fasta_for_alignment += f">Query:{fasta_record.id}\n" #Need to clean the query ID 
fasta_for_alignment += f'{query_protein.seq}\n' #ABSOLUTELY NEED TO CHANGE TO PROTEIN SEQ
for i in range(len(df_filtered['Accession'])): 
    if len(df_filtered['Prot_sequence'][i]) < 1:
        continue
    fasta_for_alignment += f">{df_filtered['Accession'][i]}{df_filtered['Organism'][i]}\n"#'Accession'
    fasta_for_alignment += df_filtered['Prot_sequence'][i]
    if i != len(df_filtered['Accession']):
        fasta_for_alignment += '\n'


print(fasta_for_alignment)

#Write proteins for alignment in a file to feed to mofft
with open('sequences_for_alignment_filtered_v2.fasta', 'w') as savefile:
        savefile.write(fasta_for_alignment)

>Query:lcl|XM_005260978.4_cds_XP_005261035.1_1
MVVSEVDIAKADPAAASHPLLLNGDATVAQKNPGSVAENNLCSQYEEKVRPCIDLIDSLRALGVEQDLALPAIAVIGDQSSGKSSVLEALSGVALPRGSGIVTRCPLVLKLKKLVNEDKWRGKVSYQDYEIEISDASEVEKEINKAQNAIAGEGMGISHELITLEISSRDVPDLTLIDLPGITRVAVGNQPADIGYKIKTLIKKYIQRQETISLVVVPSNVDIATTEALSMAQEVDPEGDRTIGILTKPDLVDKGTEDKVVDVVRNLVFHLKKGYMIVKCRGQQEIQDQLSLSEALQREKIFFENHPYFRDLLEEGKATVPCLAEKLTSELITHICKSLPLLENQIKETHQRITEELQKYGVDIPEDENEKMFFLIDKVNAFNQDITALMQGEETVGEEDIRLFTRLRHEFHKWSTIIENNFQEGHKILSRKIQKFENQYRGRELPGFVNYRTFETIVKQQIKALEEPAVDMLHTVTDMVRLAFTDVSIKNFEEFFNLHRTAKSKIEDIRAEQEREGEKLIRLHFQMEQIVYCQDQVYRGALQKVREKELEEEKKKKSWDFGAFQSSSATDSSMEEIFQHLMAYHQEASKRISSHIPLIIQFFMLQTYGQQLQKAMLQLLQDKDTYSWLLKERSDTSDKRKFLKERLARLTQARRRLAQFPG
>AK315465Homo sapiens
MVVSEVDIAKADPAAASHPLLLNGDATVAQKNPGSVAENNLCSQYEEKVRPCIDLIDSLRALGVEQDLALPAIAVIGDQSSGKSSVLEALSGVALPRGSGIVTRCPLVLKLKKLVNEDKWRGKVSYQDYEIEISDASEVEKEINKAQNAIAGEGMGISHELITLEISSRDVPDLTLIDLPGITRVAVGNQPADIGYKIKTLIKKYIQRQETISLVVVPSNVDIATTEALSMAQEVDPEGDRTIGILTKPDLVDKGTEDKVVDVVRNLV

In [70]:
#Run MAFFT from command line using subprocess (can also use for ssh)
import subprocess
command_list = [r'/Users/Gioele/miniconda3/bin/mafft', '--distout', 'sequences_for_alignment_filtered_v2.fasta'] # --localpair checks sequences between them
process = subprocess.Popen(command_list, universal_newlines= True, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
stdout,stderr = process.communicate()

#Save alignment file
with open('aligned_mafft_filtered.fasta', 'w') as handle:
    handle.write(stdout)
#Also save standard error just in case
with open('aligned_mafft.stderr', 'w') as handle:
    handle.write(stderr)

In [2]:
#Open file with AlignIO module
mafft_alignment_filtered = AlignIO.read('aligned_mafft_filtered.fasta', 'fasta')
print(mafft_alignment_filtered)

Alignment with 117 rows and 767 columns
--------------------------------MVVSEVDIAKAD...PG- Query:lcl|XM_005260978.4_cds_XP_005261035.1_1
--------------------------------MVVSEVDIAKAD...PG- AK315465Homo
--------------------------------------------...PG- BC014222Homo
--------------------------------MVVSEVDIAKAD...PG- NM_001178046Homo
--------------------------------MVVSEVDIAKAD...PG- NM_001301762Pan
--------------------------------MVVSEVDIAKAD...PG- XM_034947821Pan
--------------------------------MVVSEVDIAKAD...PG- NM_001279836Pan
--------------------------------MVVSEVDIAKAD...PG- XM_009437900Pan
--------------------------------MVVSEVDIAKAD...PG- XM_009438452Pan
--------------------------------MVVSEVDIAKAD...PG- NM_001279761Gorilla
--------------------------------MVVSEVDIAKAD...PG- XM_031005077Gorilla
--------------------------------MVVSEVDIAKAD...PG- XM_019017589Gorilla
--------------------------------MVVSEVDIAKAD...PG- XM_019017591Gorilla
--------------------------------MVVSEVDIAKAD...P

In [72]:
#Find alignment matrix and break it down

def hat2_parser(path_to_file):
    with open(path_to_file) as handler:
        matrix=handler.read().splitlines()
    seq_n = int(matrix[1].strip())
    del matrix[0:3]
    matrix_identifier = matrix[0:seq_n]
    list_of_identifiers = []
    for line in matrix_identifier:
        line = line[line.index('=')+1:]
        list_of_identifiers.append(line)
    matrix_values = matrix[seq_n:]
    value_vector = []
    for values in matrix_values:
        while True:
            try:
                storer = values[values.index('.')-1:values.index('.')+4]
            except ValueError:
                break
            value_vector.append(storer)
            values = values[values.index('.')+4:]
            values = values.strip()
    matrix_dictionary = {#'index': list_of_identifiers #Possibly useless 
    }
    temp_value_vector = value_vector
    for a in range(len(matrix_identifier)-1):
        temp_list = []
        for i in range(a):#!!!!!!!!Possibly +1
            temp_list.append(None)
        temp_list.extend(temp_value_vector[:seq_n-a-1])
        temp_value_vector = temp_value_vector[seq_n-a-1:]
        matrix_dictionary.update({f'{list_of_identifiers[a]}': temp_list})
        matrix_df = pd.DataFrame(matrix_dictionary)
    matrix_df = matrix_df.T
    return matrix_df


mafft_matrix = hat2_parser('sequences_for_alignment_filtered_v2.fasta.hat2')
print(mafft_matrix)

                                                 0      1      2      3    \
Query:lcl|XM_005260978.4_cds_XP_005261035.1_1  0.005  0.006  0.000  0.021   
AK315465Homo sapiens                            None  0.011  0.005  0.027   
BC014222Homo sapiens                            None   None  0.006  0.028   
NM_001178046Homo sapiens                        None   None   None  0.021   
NM_001301762Pan paniscus                        None   None   None   None   
...                                              ...    ...    ...    ...   
XM_029544505Mus pahari                          None   None   None   None   
BC007127Mus musculus                            None   None   None   None   
KR362567Sturnira lilium                         None   None   None   None   
XM_026777542Microtus ochrogaster                None   None   None   None   
XM_026777543Microtus ochrogaster                None   None   None   None   

                                                 4      5      6      7    

In [73]:
#Similarity matrix to distance matrix
#d = 1/(s+1)

In [3]:
#Produce matrix and tree from MultipleSeqAlignment object from Phylo
#mafft_alignment_filtered

calculator = DistanceCalculator('identity') # dna_models, protein_models
distance_matrix_phylo = calculator.get_distance(mafft_alignment_filtered)

In [75]:
distance_matrix_phylo

DistanceMatrix(names=['Query:lcl|XM_005260978.4_cds_XP_005261035.1_1', 'AK315465Homo', 'BC014222Homo', 'NM_001178046Homo', 'NM_001301762Pan', 'XM_034947821Pan', 'NM_001279836Pan', 'XM_009437900Pan', 'XM_009438452Pan', 'NM_001279761Gorilla', 'XM_031005077Gorilla', 'XM_019017589Gorilla', 'XM_019017591Gorilla', 'JX297244Pongo', 'NM_001134146Pongo', 'CR860897Pongo', 'JX297247Symphalangus', 'JX297235Hylobates', 'NM_001280102Nomascus', 'XM_032174261Hylobates', 'XM_032174260Hylobates', 'XM_032174259Hylobates', 'JX297238Macaca', 'JX297232Chlorocebus', 'JX297248Trachypithecus', 'XM_033232529Trachypithecus', 'XM_033232528Trachypithecus', 'JX297237Macaca', 'XM_015446623Macaca', 'XM_015446625Macaca', 'NM_001079693Macaca', 'EF101561Macaca', 'XM_015132838Macaca', 'JX297233Colobus', 'XM_011725992Macaca', 'XM_011725991Macaca', 'JX297239Miopithecus', 'NM_001305954Cercocebus', 'XM_012037327Cercocebus', 'XM_011984185Mandrillus', 'XM_011954059Colobus', 'XM_031665754Papio', 'XM_003895462Papio', 'XM_0092022

In [4]:
constructor = DistanceTreeConstructor(calculator, 'nj') #upgma
tree = constructor.build_tree(mafft_alignment_filtered)

In [77]:
print(tree)

Tree(rooted=False)
    Clade(branch_length=0, name='Inner115')
        Clade(branch_length=2.7405391926871585e-05, name='Inner114')
            Clade(branch_length=0.00037641786378863566, name='Inner113')
                Clade(branch_length=0.0010646213297738087, name='Inner112')
                    Clade(branch_length=0.0008879839363744842, name='Inner111')
                        Clade(branch_length=0.0003865864938986988, name='Inner110')
                            Clade(branch_length=0.0028878717202821206, name='Inner98')
                                Clade(branch_length=0.0, name='XM_007968690Chlorocebus')
                                Clade(branch_length=0.0, name='Inner97')
                                    Clade(branch_length=0.0, name='XM_037984843Chlorocebus')
                                    Clade(branch_length=0.0, name='XM_007968715Chlorocebus')
                            Clade(branch_length=0.001023471174111645, name='JX297232Chlorocebus')
                      

In [6]:
Phylo.write(tree, "tree_filtered.xml", "phyloxml")

1

In [78]:
Phylo.draw_ascii(tree)

 , XM_007968690Chlorocebus
 |
 , XM_037984843Chlorocebus
 |
 | XM_007968715Chlorocebus
 |
 | JX297232Chlorocebus
 |
 , JX297239Miopithecus
 |
 |, XM_023191838Piliocolobus
 ,|
 |, XM_023191860Piliocolobus
 ||
 || XM_023191853Piliocolobus
 |
 |, XM_011954059Colobus
 ||
 || JX297233Colobus
 |
 |, XM_017880224Rhinopithecus
 ,|
 || XM_030914603Rhinopithecus
 |
 |, XM_033232528Trachypithecus
 ||
 || XM_033232529Trachypithecus
 ||
 || JX297248Trachypithecus
 |
 | , NM_001280102Nomascus
 | |
 | | __ XM_032174261Hylobates
 | ||
 | |, XM_032174259Hylobates
 | ||
 | || XM_032174260Hylobates
 | ||
 | |, JX297235Hylobates
 | ||
 |,|| JX297247Symphalangus
 |||
 ||, CR860897Pongo
 |||
 ||| NM_001134146Pongo
 |||
 ||| JX297244Pongo
 |||
 ||, XM_019017591Gorilla
 |||
 ||, XM_019017589Gorilla
 |||
 ||| NM_001279761Gorilla
 |||
 |||, NM_001279836Pan
 ||||
 ||,, XM_009438452Pan
 ||||
 |||| XM_009437900Pan
 ||||
 |||| NM_001301762Pan
 |||
 ||, NM_001178046Homo
 |||
 ||| Query:lcl|XM_005260978.4_cds_XP_0052

In [8]:
pip install matplotlib


Collecting matplotlib
  Using cached matplotlib-3.4.3-cp39-cp39-macosx_10_9_x86_64.whl (7.2 MB)
Collecting pillow>=6.2.0
  Using cached Pillow-8.3.2-cp39-cp39-macosx_10_10_x86_64.whl (3.0 MB)
Collecting pyparsing>=2.2.1
  Using cached pyparsing-2.4.7-py2.py3-none-any.whl (67 kB)
Collecting cycler>=0.10
  Using cached cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Collecting kiwisolver>=1.0.1
  Using cached kiwisolver-1.3.2-cp39-cp39-macosx_10_9_x86_64.whl (61 kB)
Installing collected packages: pyparsing, pillow, kiwisolver, cycler, matplotlib
Successfully installed cycler-0.10.0 kiwisolver-1.3.2 matplotlib-3.4.3 pillow-8.3.2 pyparsing-2.4.7
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
tree.ladderize()  # Flip branches so deeper clades are displayed at top
Phylo.draw(tree)