In [3]:
import os
from sys import stderr, stdout
import tempfile
import subprocess
from warnings import catch_warnings
import pandas as pd
import numpy as np
from Bio import SeqIO, SeqRecord, Seq, SearchIO, AlignIO, Phylo
from Bio.Blast import NCBIWWW, NCBIXML
import Bio.Entrez
from Bio.Phylo.TreeConstruction import DistanceCalculator,DistanceTreeConstructor
from pandas.core.frame import DataFrame
import logging
import time
import traceback

In [4]:
#Function to open fasta file of imput
def open_fasta(filename) -> SeqRecord:
    with open(filename) as handle:
        sequence_record = SeqIO.read(handle, 'fasta')
    logging.info('Opened sequence {}'.format(sequence_record.id))
    return sequence_record

In [5]:
#Function to run BLAST with taxid list
def blastp_with_list(sequence, list_taxid = [], query_size = 200):
    result_handler, result_storer = None, None
    #If list is empty run query without specific taxid
    if len(list_taxid) <1:
        result_handler = NCBIWWW.qblast('blastp', 'nr', sequence, hitlist_size=query_size) 
        result_storer = result_handler.read()
    #Prepare string of Entrez and parse it to qblast
    else:
        entrez_query = ''
        for taxid in list_taxid:
            entrez_query += 'txid{}[ORGN]'.format(taxid)
            if taxid != list_taxid[-1]:
                entrez_query += ' OR '
        result_handler = NCBIWWW.qblast('blastp', 'nr', sequence, entrez_query= entrez_query, hitlist_size=query_size)
        result_storer = result_handler.read()
    logging.info('BLASTp specifying {} taxid(s) completed'.format(len(list_taxid)))

    return result_storer

In [6]:

#Single blast query for threading
#Threading to be implemented 
def blastp_single_taxid(sequence, taxid, query_size = 20):
    entrez_query = 'txid{}[ORGN]'.format(taxid)
    result_handler = NCBIWWW.qblast('blastp', 'nr', sequence, entrez_query= entrez_query, hitlist_size=query_size)    
    result_storer = result_handler.read()
    return result_storer

In [7]:

#Function to create a temporary file from string and read it with SearchIO.read
def xml_string_to_handler(string):
    #make temporary file
    tmp = tempfile.NamedTemporaryFile(mode='a+')
    #write string
    tmp.write(string)
    handler = SearchIO.read(tmp.name, 'blast-xml')
    tmp.close()
    logging.info('Blast returned {} results'.format(len(handler)))
    return handler

In [8]:
#Creation of a dictionary with all HSPS
def blast_to_dictionary(blastresult):
    blast_dictionary = {'ID' : [], 'Description' : [], 'Seq_length' : [], 'Accession' : [], 'Bitscore' : [], 'Evalue' : [], 'Tot_aln_span':[], 'Identity' :[]}
    #Loop through results 
    for result in blastresult:
        blast_dictionary['ID'].append(result.id)
        blast_dictionary['Description'].append(result.description)
        blast_dictionary['Seq_length'].append(result.seq_len)
        blast_dictionary['Accession'].append(result.accession)
        #Store results of first HSP
        first_hsp = result.hsps[0]
        blast_dictionary['Bitscore'].append(first_hsp.bitscore)
        blast_dictionary['Evalue'].append(first_hsp.evalue)
        #Create variables to store results of multiple hsps
        all_alnspan, all_gapnum = [],[] 
        #Collect data of all hsps for each hit
        for hsp in result.hsps:
            all_alnspan.append(int(hsp.aln_span))
            all_gapnum.append(int(hsp.gap_num))
        #Calculate total alignment span and gaps to calculate identity
        tot_alnspan, tot_gapnum = int(), int()
        seq_len = int(result.seq_len) #DOUBLE CHECK 
        for span in all_alnspan:
            tot_alnspan += span
        for gap in all_gapnum:
            tot_gapnum += gap
        identity = ((tot_alnspan - tot_gapnum)/seq_len)*100
        blast_dictionary['Tot_aln_span'].append(tot_alnspan)
        blast_dictionary['Identity'].append(round(identity, 3))

    logging.info("{} entries were recorded from the BLASTp results".format(len(blast_dictionary['ID'])))
    return blast_dictionary

In [9]:
#Filter DF based on Evalue, sequence length and identity
def filter_df_blast(df:DataFrame, query:SeqRecord, evalue = 10**-10, difference_from_query = 50, identity_threshold=50) -> DataFrame:
    #Record initial dataframe length
    initial_len = len(df)

    remove_index = []
    #Check if main HSP is significant for threshold, store indexes of non-significant hits
    for i in range(len(df.index)):
        eval = df.iloc[i]['Evalue']
        if float(eval) > evalue:
            remove_index.append(str(i))
        
    #Copy DF
    df_to_return = df    
    #Remove indexes if list is not empty
    if len(remove_index)>0:
        df_to_return = df_to_return.drop(df.index[int(remove_index)])
        df_to_return = df_to_return.reset_index(drop=True)
    
    #Filter by +/- % of query length
    #Obtain query length
    query_length = len(query.seq)
    #Determine upper/lower threshold of acceptance for sequences 
    lower, upper = query_length*(difference_from_query/100), query_length*((difference_from_query + 100)/100)
    #Filter DF
    df_to_return = df_to_return[df_to_return['Seq_length'] > lower]
    df_to_return = df_to_return.reset_index(drop=True)
    df_to_return = df_to_return[df_to_return['Seq_length'] < upper]
    df_to_return = df_to_return.reset_index(drop=True)

    #Filter by identity
    df_to_return = df_to_return[df_to_return['Identity'] > identity_threshold]
    df_to_return = df_to_return.reset_index(drop=True)
    
    #Calculate how many sequences were removed
    final_len = len(df_to_return)

    logging.info("{} sequences were removed filtering the BLAST results DF, returning a DF with {} entries".format((initial_len-final_len),final_len))
    return df_to_return

In [21]:
#Retrieve all result's entries from efetch
def retrieve_all_efetch(list_of_entries, email):
    Bio.Entrez.email = email
    list_of_sequences = []
    for entry in list_of_entries:
        handler = Bio.Entrez.efetch(db='protein', id=entry, rettype = 'fasta',retmode = 'xml', retmax=1) #Returns JSON regardless
        gb_info = Bio.Entrez.read(handler, 'text')#Returns nested lists and dictionaries 
        list_of_sequences.append(gb_info)
    logging.info('{} protein entries were retrieved from NCBI protein database'.format(len(list_of_sequences)))
    return list_of_sequences

In [10]:
#Efetch to dictionary parser
def efetch_protein_to_dictionary(list_of_efetch):
    #Declare new dictionary
    dictionary = {'Accession':[],'Protein_ID':[], 'Taxid':[], 'Organism_name':[], 'Description':[], 'Seq_length':[], 'Prot_sequence':[]}
    for wrapper in list_of_efetch:
        try:
            #Cast into dictionary to avoid random exception
            result = dict(wrapper[0])
            acc_ver = result['TSeq_accver']
            accession = acc_ver.split('.')
            dictionary['Accession'].append(accession[0])
            dictionary['Protein_ID'].append(result['TSeq_accver'])
            dictionary['Taxid'].append(result['TSeq_taxid'])
            dictionary['Organism_name'].append(result['TSeq_orgname'])
            dictionary['Description'].append(result['TSeq_defline'])
            dictionary['Seq_length'].append(result['TSeq_length'])
            dictionary['Prot_sequence'].append(result['TSeq_sequence'])
        except KeyError:
            logging.error('A sequence failed parsing from EFetch')
            print('Could not parse one sequence from efetch')
    logging.info("{} sequences were parsed correctly".format(len(dictionary['Accession'])))
    return dictionary

In [11]:
def filter_df_taxon(df:DataFrame, n_of_sequences = 1) -> DataFrame:
    #Record initial length of DF
    initial_len = len(df)

    #Make a list of all retrieved taxons
    retrieved_taxids = df['Taxid'].tolist() 
    #Make list from dictionary to eliminate duplicates 
    retrieved_taxids = list(dict.fromkeys(retrieved_taxids))
        
    #Declare empty DF
    df_toreturn = pd.DataFrame()

    #For each taxon create a DF, sort and get best result to append to df_toreturn
    for taxid in retrieved_taxids:
        #Create temporary DF exclusive to taxon
        temp_df = df[df['Taxid'] == taxid]
        temp_df = temp_df.sort_values(['Evalue', 'Identity', 'Bitscore'], ascending=[True, False, False]) #('Identity', ascending=False)
        if len(temp_df) > 0:
            if len(temp_df) > n_of_sequences:
                df_toreturn = df_toreturn.append(temp_df[:n_of_sequences])
            else:
                df_toreturn = df_toreturn.append(temp_df)

    #Refactor indexes
    df_toreturn = df_toreturn.reset_index(drop=True)

    #Calculate final length and append info to logging 
    final_len = len(df_toreturn)
    logging.info("Filtering by taxon: {} unique taxid(s) collected, {} entries discarded".format(len(retrieved_taxids), (initial_len - final_len)))
    return df_toreturn


In [34]:
#Prepare string for alingment file
def fasta_for_alignment(query:SeqRecord, df:DataFrame) -> str:
    #Initialise string and add 
    string = ''
    if query.id != '<unknown id>':
        string += ">Query:{}\n".format(query.id)
    else:
        string += ">Query\n"
    string += "{}\n".format(query.seq)
    #Add all sequence IDs and aa sequence
    for i in range(len(df['Accession'])):
        #Check if protein is missing ------------------------------------------------Add to report? ERROR.LOG FILE!!!!!
        if len(df['Prot_sequence'][i]) <1:
            continue
        string += ">{}\n".format(df['Protein_ID'][i])#, df['Organism_name'][i], df['Taxid'][i], df['Description'][i])
        string += "{}".format(df['Prot_sequence'][i])
        #Check if is not the last element in the list 
        if i != len(df['Accession']):
            string += '\n'
    logging.info("{} sequences were prepared for alignment with the query".format(len(df)))
    return string



#Change 'defining string' from protein to protein name/species name/taxid!!!!!!!!!!!!!!
#For accession replace accession + name!!!

In [12]:
#Run mafft by saving the fasta sequences for alignment in a file and passing it to mafft 
def run_mafft_saving_file(fasta:str, mafft_directory:str, filename:str) -> str:
    #Write fasta file for alignment
    file = '{}.fasta'.format(filename)
    with open(file, 'w') as handle:
        handle.write(fasta)
    #Parse and run with mafft 
    command_list = [mafft_directory, '--distout', '{}'.format(file)]
    process = subprocess.Popen(command_list, universal_newlines= True, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    stdout, stderr = process.communicate()

    #Save standard error of mafft
    with open('aligned_mafft.stderr', 'w') as handle:
        handle.write(stderr)
    #Retrun alignment 
    return stdout

In [13]:
#Function to get protein sequence and save fasta file given an accession number
def get_fasta_from_accession(accession, email):

    #Get the efetch handler 
    Bio.Entrez.email = email
    handler = Bio.Entrez.efetch(db='protein', id=accession, rettype = 'fasta',retmode = 'xml', retmax=1) #Returns JSON regardless
    query_protein_efetch = Bio.Entrez.read(handler, 'text')#Returns nested lists and dictionaries 
    
    #Make a dictionary -- passed as list to recycle the efetch_protein_to_dictionary function
    dictionary_query = efetch_protein_to_dictionary([query_protein_efetch])

    #Make and save fasta file 
    fasta_string_query = ">{} \n{}".format(dictionary_query['Accession'][0], dictionary_query['Prot_sequence'][0])

    fasta_file_name = "{}_sequence.fasta".format(dictionary_query['Accession'][0])
    with open(fasta_file_name, 'w') as handle:
        handle.write(fasta_string_query)

    #Open sequence with open_fasta and return it 
    fasta_record_q = open_fasta(fasta_file_name)
    return fasta_record_q

In [15]:
def open_input(input, email):
    #Declare empty protein sequence
    protein_sequence = None
    #Check that the parsed string has a fasta extension, if it does pass it to open fasta function
    if input.endswith('.fas') or input.endswith('.fasta'):
        sequence = open_fasta(input)
        #Try translating the sequence, if an error is raised the sequence is already a peptide and can be returned
        try: 
            protein_sequence = sequence.translate(to_stop = True)
            logging.info('Nucleotide sequence was translated to protein')
        except Exception:
            protein_sequence = sequence
            logging.info('Protein sequence was opened')
    #If the string doesn't have an extension it will be passed to the get_fasta_from_accession function
    else:
        try:
            protein_sequence = get_fasta_from_accession(input, email)
            logging.info('Protein sequence was retrieved from NCBI protein database')
        except Exception as e:
            logging.error(traceback.format_exc())
            logging.error('The file must have a .fas or .fasta extension')

    return protein_sequence


In [16]:
def tree_from_alignment(alignment):
    calculator = DistanceCalculator('identity')
    distance_matrix = calculator.get_distance(alignment)

    constructor = DistanceTreeConstructor(calculator, 'nj')
    tree = constructor.build_tree(alignment)
    logging.info('Tree was produced with this and that method') #Placeholder to change
    return tree

In [17]:
#USER INPUTS
input_string = 'human_mx1.fas' #'sry_protein.fasta' 'QBA69874'
taxid_list = [] #['9592']
mafft_directory = r'/Users/Gioele/miniconda3/bin/mafft'
email = 'A.N.Other@example.com'
output_name = 'draft_v4_debug_3'


local_query = False
threading = False
query_size = 100

evalue_threshold = 10**-10
len_threshold = 50
identity_threshold = 50

sequences_per_taxon = 1

#Make tsv for figtree compatibility
make_tsv = False

#logging file 
logging.basicConfig(filename='{}.log'.format(output_name), filemode='w', format='%(levelname)s:%(message)s', level=logging.DEBUG) #logging refreshes every run and only displays type of message: message

#Filenames:
#Implemented
output_df = '{}_df.csv'.format(output_name)
output_alignment = '{}_alignment.fasta'.format(output_name)
output_xml_tree = '{}_xml_tree.xml'.format(output_name)
#To add
output_tree_newick = '{}_newick_tree.nwk'.format(output_name) #!!!!
output_tree_jpg = '{}_tree_image.jpg'.format(output_name)



In [18]:

#Open fasta record
fasta_record = open_input(input_string, email)
#Blast with list 
blast_results = blastp_with_list(fasta_record.seq, query_size=query_size, list_taxid=taxid_list)

In [None]:
#Blast w threading 


In [19]:
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!NEED TO DOUBLECHECK AND IMPLEMENT BLAST WITH THREADING 


#Parse handler to xml_string_to_handler to be able to feed it in SearchIO.read
handler_blast = xml_string_to_handler(blast_results)

#Make dictionary from handler
dictionary_blast = blast_to_dictionary(handler_blast)

#Transform dictionary into DF
results_df_blast = pd.DataFrame.from_dict(dictionary_blast)


#Filter DF by E-value / Bitscore / Identity
filtered_blast_df = filter_df_blast(results_df_blast, fasta_record, evalue=evalue_threshold, difference_from_query=len_threshold, identity_threshold=identity_threshold)

print("BLAST DF after filtering holds {} entries".format(len(results_df_blast)))



BLAST DF after filtering holds 100 entries


In [23]:

#Retrieve all results with efetch 
#Retrieve accession list
accession_list = filtered_blast_df['Accession'].tolist()
#Get results 
efetch_result = retrieve_all_efetch(accession_list, email)

# Make dictionary from efetch fasta results
dictionary_efetch = efetch_protein_to_dictionary(efetch_result)

#Transform dictionary into DF
efetch_df = pd.DataFrame.from_dict(dictionary_efetch)



#Merge blast and efetch DataFrames
left = filtered_blast_df.loc[:,['Accession', 'Seq_length', 'Evalue', 'Bitscore', 'Tot_aln_span', 'Identity']]
right = efetch_df.loc[:,['Accession', 'Protein_ID' ,'Taxid', 'Organism_name', 'Description', 'Prot_sequence']]
combined_df = pd.merge(left, right, on='Accession')


#Filter DF based on taxons
filtered_df = filter_df_taxon(combined_df, n_of_sequences=sequences_per_taxon)


#Save DF of sequences that are going to be aligned 
filtered_df.to_csv(output_df, index = False)


print('Filtered DF presents {} entries'.format(len(output_df)))


Could not parse one sequence from efetch
Could not parse one sequence from efetch
Filtered DF presents 23 entries


In [None]:
#Make TSV for figtree
tsv_df = filtered_df[['Protein_ID'] + [col for col in filtered_df.columns if col!= 'Protein_ID']]
tsv_df = tsv_df.drop('Prot_sequence', axis = 1)
tsv_df.to_csv('{}.tsv'.format(output_df), sep = '\t', index = False)



In [68]:
#Prepare for multiple alingment
fasta_string_for_alignment = fasta_for_alignment(fasta_record, filtered_df)

#Run mafft alignment saving file
mafft_alignment = run_mafft_saving_file(fasta_string_for_alignment, mafft_directory, 'multiple_seq_fasta')

#Save mafft alignment
with open(output_alignment, 'w') as savefile:
    savefile.write(mafft_alignment)


#Open AlignIO from fasta file 
alignment = AlignIO.read(output_alignment, 'fasta')

#Construct tree from alignment 
#To IMPLEMENT DIFFERENT METHODS FOR TREE BUILDING 
tree = tree_from_alignment(alignment)  


#Write tree in xml
Phylo.write(tree, output_xml_tree, 'phyloxml')

Phylo.write(tree, output_tree_newick, 'newick')


1

In [79]:
#Re import newick string to feed to ETE3
with open(output_tree_newick, 'r') as handle:
    newick_string = handle.read()


In [50]:
#Create a DF with only protein ids and organisms names and drop the index
dict_of_df = filtered_df[['Protein_ID', 'Organism_name']]
dict_of_df.reset_index(drop=True, inplace=True)

#Create a dictionary from DF
dict_of_df = dict_of_df.to_dict('records')


[{'Organism_name': 'Mustela putorius furo', 'Protein_ID': 'XP_004762249.1'}, {'Organism_name': 'Nyctereutes procyonoides', 'Protein_ID': 'CAD7693676.1'}, {'Organism_name': 'Chlorocebus sabaeus', 'Protein_ID': 'XP_007966881.2'}, {'Organism_name': 'Suricata suricatta', 'Protein_ID': 'XP_029794794.1'}, {'Organism_name': 'Puma yagouaroundi', 'Protein_ID': 'XP_040339814.1'}, {'Organism_name': 'Mustela erminea', 'Protein_ID': 'XP_032211404.1'}, {'Organism_name': 'Propithecus coquereli', 'Protein_ID': 'XP_012495571.1'}, {'Organism_name': 'Lynx canadensis', 'Protein_ID': 'XP_030185366.1'}, {'Organism_name': 'Felis catus', 'Protein_ID': 'XP_023094481.1'}, {'Organism_name': 'Acinonyx jubatus', 'Protein_ID': 'XP_026897378.1'}, {'Organism_name': 'Vulpes vulpes', 'Protein_ID': 'XP_025839180.1'}, {'Organism_name': 'Miopithecus talapoin', 'Protein_ID': 'AFU81311.1'}, {'Organism_name': 'Alouatta sara', 'Protein_ID': 'AFU81300.1'}, {'Organism_name': 'Sus scrofa', 'Protein_ID': 'BAD11809.1'}, {'Organism

In [70]:
#Replace all organisms names' spaces with underscores to avoid confusion
for entry in dict_of_df:
    entry['Organism_name'] = str(entry['Organism_name']).replace(' ', '_')

In [52]:
dict_of_df



[{'Organism_name': 'Mustela_putorius_furo', 'Protein_ID': 'XP_004762249.1'},
 {'Organism_name': 'Nyctereutes_procyonoides', 'Protein_ID': 'CAD7693676.1'},
 {'Organism_name': 'Chlorocebus_sabaeus', 'Protein_ID': 'XP_007966881.2'},
 {'Organism_name': 'Suricata_suricatta', 'Protein_ID': 'XP_029794794.1'},
 {'Organism_name': 'Puma_yagouaroundi', 'Protein_ID': 'XP_040339814.1'},
 {'Organism_name': 'Mustela_erminea', 'Protein_ID': 'XP_032211404.1'},
 {'Organism_name': 'Propithecus_coquereli', 'Protein_ID': 'XP_012495571.1'},
 {'Organism_name': 'Lynx_canadensis', 'Protein_ID': 'XP_030185366.1'},
 {'Organism_name': 'Felis_catus', 'Protein_ID': 'XP_023094481.1'},
 {'Organism_name': 'Acinonyx_jubatus', 'Protein_ID': 'XP_026897378.1'},
 {'Organism_name': 'Vulpes_vulpes', 'Protein_ID': 'XP_025839180.1'},
 {'Organism_name': 'Miopithecus_talapoin', 'Protein_ID': 'AFU81311.1'},
 {'Organism_name': 'Alouatta_sara', 'Protein_ID': 'AFU81300.1'},
 {'Organism_name': 'Sus_scrofa', 'Protein_ID': 'BAD11809.1'

In [80]:
#Change all instances of protein ID to Organism name | protein ID in both newick and MSA strings
#Used to make the ETE3 output more human readable
for entry in dict_of_df:
    pro_id = str(entry['Protein_ID'])
    org_name = str(entry['Organism_name'])
    new_entry = '{}|{}'.format(org_name, pro_id)
    print(new_entry)

    newick_string= newick_string.replace(pro_id, new_entry)
    mafft_alignment= mafft_alignment.replace(pro_id, new_entry)


Mustela_putorius_furo|XP_004762249.1
Nyctereutes_procyonoides|CAD7693676.1
Chlorocebus_sabaeus|XP_007966881.2
Suricata_suricatta|XP_029794794.1
Puma_yagouaroundi|XP_040339814.1
Mustela_erminea|XP_032211404.1
Propithecus_coquereli|XP_012495571.1
Lynx_canadensis|XP_030185366.1
Felis_catus|XP_023094481.1
Acinonyx_jubatus|XP_026897378.1
Vulpes_vulpes|XP_025839180.1
Miopithecus_talapoin|AFU81311.1
Alouatta_sara|AFU81300.1
Sus_scrofa|BAD11809.1
Macaca_nemestrina|XP_011724293.1
Sapajus_apella|XP_032132720.1
Prionailurus_bengalensis|XP_043449495.1
Vulpes_lagopus|XP_041591175.1
Panthera_leo|XP_042811978.1
Orycteropus_afer_afer|XP_007945509.1
Lagothrix_lagotricha|AFU81308.1
Macaca_sylvanus|AFU81310.1
Nomascus_leucogenys|XP_012357180.1
Pithecia_pithecia|AFU81315.1
Loxodonta_africana|XP_010596962.1
synthetic_construct|AAX36236.1
Lynx_pardinus|VFV35827.1
Manis_javanica|XP_036870729.1
Manis_pentadactyla|XP_036732799.1
Hyaena_hyaena|XP_039085427.1
Saimiri_sciureus|AFU81318.1
Hylobates_agilis|AFU81307

In [78]:
#print(newick_string)

print(output_alignment)

draft_v4_debug_3_alignment.fasta


In [81]:
from ete3 import PhyloTree, TreeStyle

#Link tree with MSA

msa_tree = PhyloTree(newick_string, quoted_node_names=True, format=1) 
#Need to root the tree!!!!!!!
#Returns the node that divides the current tree into two distance-balanced partitions.
R = msa_tree.get_midpoint_outgroup()
#Sets a descendant node as the outgroup of a tree
msa_tree.set_outgroup(R)

#Link tree to the MAFFT Alignment 
msa_tree.link_to_alignment(alignment=mafft_alignment, alg_format='fasta')
msa_tree

'''leaf_x = None
for leaf in msa_tree.iter_leaves():
    print('Name: {}, Sequence: {}'.format(leaf.name, leaf.sequence))
    leaf_x=leaf'''

"leaf_x = None\nfor leaf in msa_tree.iter_leaves():\n    print('Name: {}, Sequence: {}'.format(leaf.name, leaf.sequence))\n    leaf_x=leaf"

In [None]:
msa_tree.show(tree_style=TreeStyle())

In [82]:
#Change 'defining string' from protein to protein name/species name/taxid!!!!!!!!!!!!!!
#For accession replace accession + name!!!


msa_tree.render('msa_tree_midpoint.png')


{'faces': [[175.1005898789196,
   859.0,
   339.1005898789196,
   872.0,
   133,
   'Macaca_fascicularis|AFU81309.1'],
  [452.907947842285, 858.5, 453.907947842285, 872.5, 133, None],
  [149.81403290903444,
   131.0,
   309.81403290903444,
   144.0,
   23,
   'Vulpes_vulpes|XP_025839180.1'],
  [452.907947842285, 130.5, 453.907947842285, 144.5, 23, None],
  [102.19031356721514,
   105.0,
   270.1903135672151,
   118.0,
   19,
   'Hyaena_hyaena|XP_039085427.1'],
  [452.907947842285, 104.5, 453.907947842285, 118.5, 19, None],
  [109.62278795405153,
   118.0,
   285.62278795405155,
   131.0,
   20,
   'Suricata_suricatta|XP_029794794.1'],
  [452.907947842285, 117.5, 453.907947842285, 131.5, 20, None],
  [182.58708475628688,
   833.0,
   336.5870847562869,
   846.0,
   130,
   'Papio_anubis|XP_003895511.1'],
  [452.907947842285, 832.5, 453.907947842285, 846.5, 130, None],
  [181.07016454517228,
   443.0,
   416.07016454517225,
   456.0,
   72,
   'Saimiri_boliviensis_boliviensis|XP_00392764

In [None]:
#IF you want to add the taxids you need to add the one of the query too
#EXCEPTION!! SORRY THE FASTA FORMAT SHOULD PRESENT PROTEIN ID, FORMAT:
#>gi|2765634|emb|Z78509.1|PPZ78509 P.pearcei 5.8S rRNA gene and ITS1 and ITS2 DNA


In [None]:
from ete3 import Tree


t = Tree( "((a,b),c);" )
t.render("mytree.png", w=183, units="mm")

In [None]:
newick_string

In [None]:
from ete3 import Tree

t = Tree(newick_string, quoted_node_names=True, format=1)
t
#t.render('my_newick_tree.png', w= 200, units='mm')

In [85]:
from ete3 import NCBITaxa
ncbi = NCBITaxa()
descendants = ncbi.get_descendant_taxa(40674)
# to print names of the taxIDs in the descendants
print((descendants[:100]))


#Very fast after first time

[163843, 109228, 98314, 98318, 109229, 163861, 196630, 1081385, 1704005, 65606, 65607, 65609, 65610, 65613, 65614, 65615, 65616, 65618, 65621, 65624, 65626, 65628, 65629, 65631, 65632, 65634, 1851651, 458857, 458858, 458859, 458860, 458861, 458862, 1048687, 1048688, 655476, 1649342, 2750743, 1851652, 2588810, 92866, 1392492, 65694, 1851653, 190511, 2162857, 2162858, 2162859, 2162860, 2162861, 2162862, 2162864, 2162865, 2162866, 2162901, 2162903, 2162904, 1147103, 1147105, 1147106, 655592, 491753, 557290, 491755, 1048812, 1048813, 1294088, 48018, 327948, 327957, 1966371, 1966372, 2588968, 2588969, 1605932, 1966381, 1966382, 1966383, 1966384, 1966385, 1513475, 131388, 262468, 491852, 491853, 1048917, 1048918, 1048919, 1048920, 1048921, 1048922, 1048923, 1081694, 491872, 491874, 491876, 491878, 1343853, 1343854, 491888]


In [None]:
#Function to check if the Taxid is a species one (so not to be parsed to the lineage)

def check_if_species(list_of_taxa):
    not_species = []
    ranks = ncbi.get_rank(list_of_taxa)
    for taxid in list_of_taxa:
        if ranks[int(taxid)] != 'species':
            not_species.append(taxid)
    return not_species

In [86]:
taxid_list = ['9606', '9597', '9593', '9600', '9601', '61853', '9546', '9544', '9541', '54180']

#not_species = check_if_species(taxid_list)
#print(not_species)


species = []
for taxid in taxid_list:
    lineage_to_search = ncbi.get_lineage(int(taxid))
    for spec in lineage_to_search:
        species.append(int(spec))

print(species)

#checked_species = check_if_species(species)


ranks = ncbi.get_rank(species)


print(ranks)

[1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593, 7742, 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674, 32525, 9347, 1437010, 314146, 9443, 376913, 314293, 9526, 314295, 9604, 207598, 9605, 9606, 1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593, 7742, 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674, 32525, 9347, 1437010, 314146, 9443, 376913, 314293, 9526, 314295, 9604, 207598, 9596, 9597, 1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593, 7742, 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674, 32525, 9347, 1437010, 314146, 9443, 376913, 314293, 9526, 314295, 9604, 207598, 9592, 9593, 1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593, 7742, 7776, 117570, 117571, 8287, 1338369, 32523, 32524, 40674, 32525, 9347, 1437010, 314146, 9443, 376913, 314293, 9526, 314295, 9604, 607660, 9599, 9600, 1, 131567, 2759, 33154, 33208, 6072, 33213, 33511, 7711, 89593, 7742, 7776, 117570, 117571, 8287, 1338369, 32523, 32524

NameError: name 'check_if_species' is not defined

In [None]:
new_list = [163843, 109228, 98314, 98318, 109229, 163861, 196630, 1081385, 1704005, 65606, 65607, 65609, 65610, 65613, 65614, 65615, 65616, 65618, 65621, 65624, 65626, 65628, 65629, 65631, 65632, 65634, 1851651, 458857, 458858, 458859, 458860, 458861, 458862, 1048687, 1048688, 655476, 1649342, 2750743, 1851652, 2588810, 92866, 1392492, 65694, 1851653, 190511, 2162857, 2162858, 2162859, 2162860, 2162861, 2162862, 2162864, 2162865, 2162866, 2162901, 2162903, 2162904, 1147103, 1147105, 1147106, 655592, 491753, 557290, 491755, 1048812, 1048813, 1294088, 48018, 327948, 327957, 1966371, 1966372, 2588968, 2588969, 1605932, 1966381, 1966382, 1966383, 1966384, 1966385, 1513475, 131388, 262468, 491852, 491853, 1048917, 1048918, 1048919, 1048920, 1048921, 1048922, 1048923, 1081694, 491872, 491874, 491876, 491878, 1343853, 1343854, 491888]
print(len(new_list))