In [6]:
import os
from sys import stderr, stdout
import tempfile
import subprocess
from warnings import catch_warnings
import pandas as pd
import numpy as np
from Bio import SeqIO, SeqRecord, Seq, SearchIO, AlignIO, Phylo
from Bio.Blast import NCBIWWW, NCBIXML
import Bio.Entrez
from Bio.Phylo.TreeConstruction import DistanceCalculator,DistanceTreeConstructor
import logging
import traceback
from pandas.core.frame import DataFrame


In [15]:
#Function to open fasta file of input
def open_fasta(filename) -> "SeqRecord.SeqRecord":
    """Read the single record stored in a FASTA file.

    Parameters
    ----------
    filename : str or path-like
        Path of the FASTA file to open.

    Returns
    -------
    Bio.SeqRecord.SeqRecord
        Whatever ``SeqIO.read(handle, 'fasta')`` returns for the file.

    Notes
    -----
    Per Biopython's documentation ``SeqIO.read`` raises ``ValueError`` unless
    the file holds exactly one record.
    """
    # The annotation is quoted and points at the class: the bare name
    # ``SeqRecord`` brought in by ``from Bio import SeqRecord`` is the module.
    with open(filename) as handle:
        return SeqIO.read(handle, 'fasta')


#Function to run BLAST with taxid list
def blastp_with_list(sequence, list_taxid=None, query_size=200):
    """Run an online blastp search against ``nr`` and return the raw result text.

    Parameters
    ----------
    sequence
        Query passed straight through to ``NCBIWWW.qblast``.
    list_taxid : sequence, optional
        Taxonomy ids used to restrict the search. When omitted or empty the
        search is run without an Entrez restriction.
    query_size : int, optional
        Forwarded as ``hitlist_size`` (default 200).

    Returns
    -------
    str
        Whatever ``read()`` on the handle returned by ``qblast`` yields.
    """
    qblast_options = {'hitlist_size': query_size}

    # ``None`` replaces the former mutable ``[]`` default; both mean "no filter".
    if list_taxid is not None and len(list_taxid) > 0:
        # Joining avoids the previous "is this the last element?" comparison,
        # which dropped the ' OR ' separator whenever an earlier taxid was
        # equal to the last one in the list.
        qblast_options['entrez_query'] = ' OR '.join(
            f'txid{taxid}[ORGN]' for taxid in list_taxid
        )

    result_handler = NCBIWWW.qblast('blastp', 'nr', sequence, **qblast_options)
    try:
        return result_handler.read()
    finally:
        # Release the handle even if reading fails.
        result_handler.close()

In [5]:
#Efetch to dictionary parser
def efetch_protein_to_dictionary(list_of_efetch):
    """Collect selected fields of efetch protein records into column lists.

    Parameters
    ----------
    list_of_efetch : iterable
        Each item is an indexable wrapper whose element ``0`` can be cast to a
        dict holding the ``TSeq_*`` keys read below.

    Returns
    -------
    dict
        Keys ``Accession``, ``Protein_ID``, ``Taxid``, ``Organism_name``,
        ``Description``, ``Seq_length`` and ``Prot_sequence``; each maps to a
        list with one entry per successfully parsed record. ``Accession`` is
        ``TSeq_accver`` cut at the first ``.``.

    Notes
    -----
    A record that is empty or lacks one of the keys is skipped as a whole and
    a message is printed, so all lists always have the same length.
    """
    #Declare new dictionary
    dictionary = {'Accession':[],'Protein_ID':[], 'Taxid':[], 'Organism_name':[], 'Description':[], 'Seq_length':[], 'Prot_sequence':[]}
    for wrapper in list_of_efetch:
        try:
            #Cast into dictionary to avoid random exception
            result = dict(wrapper[0])
            acc_ver = result['TSeq_accver']
            # Read every field before touching the output lists: appending as
            # we go would leave the columns misaligned if a later key is missing.
            row = {
                'Accession': acc_ver.split('.')[0],
                'Protein_ID': acc_ver,
                'Taxid': result['TSeq_taxid'],
                'Organism_name': result['TSeq_orgname'],
                'Description': result['TSeq_defline'],
                'Seq_length': result['TSeq_length'],
                'Prot_sequence': result['TSeq_sequence'],
            }
        except (KeyError, IndexError):
            # IndexError covers an empty wrapper (no record at position 0).
            print('Could not parse one sequence from efetch \n')
            continue

        for column, value in row.items():
            dictionary[column].append(value)

    return dictionary

In [6]:
def get_fasta_from_accession(accession, email):

    #Get the efetch handler 
    Bio.Entrez.email = email
    handler = Bio.Entrez.efetch(db='protein', id=accession, rettype = 'fasta',retmode = 'xml', retmax=1) #Returns JSON regardless
    query_protein_efetch = Bio.Entrez.read(handler, 'text')#Returns nested lists and dictionaries 
    
    #Make a dictionary
    dictionary_query = efetch_protein_to_dictionary([query_protein_efetch])

    #Make and save fasta file 
    fasta_string_query = f">{dictionary_query['Accession'][0]} \n {dictionary_query['Prot_sequence'][0]}"

    fasta_file_name = f"{dictionary_query['Accession'][0]}_sequence.fasta"
    with open(fasta_file_name, 'w') as handle:
        handle.write(fasta_string_query)

    fasta_record_q = open_fasta(fasta_file_name)
    return fasta_record_q

In [7]:
def open_input(input, email):
    """Turn a FASTA path or a protein accession into a protein record.

    Parameters
    ----------
    input : str
        Either a path ending in ``.fas`` / ``.fasta`` or anything else, which
        is then treated as an accession for ``get_fasta_from_accession``.
    email : str
        Forwarded to ``get_fasta_from_accession``; unused for file input.

    Returns
    -------
    For file input: the translated record when ``translate(to_stop=True)``
    succeeds, otherwise the record exactly as read. For accession input: the
    fetched record, or ``None`` when fetching raised (the traceback is logged).
    """
    looks_like_fasta_path = input.endswith(('.fas', '.fasta'))

    if not looks_like_fasta_path:
        # Accession route: any failure is logged and reported as None.
        fetched = None
        try:
            fetched = get_fasta_from_accession(input, email)
            logging.info('Protein sequence was retrieved from NCBI protein database')
        except Exception:
            logging.error(traceback.format_exc())
        return fetched

    # File route: attempt a translation first; if that raises, the record is
    # handed back untouched and assumed to be a protein already.
    record = open_fasta(input)
    try:
        translated = record.translate(to_stop=True)
        logging.info('Nucleotide sequence was translated to protein')
        return translated
    except Exception:
        logging.info('Protein sequence was opened')
        return record

In [19]:
def xml_string_to_handler(string):
    """Parse a BLAST XML document held in a string with ``SearchIO.read``.

    Parameters
    ----------
    string : str
        BLAST XML text, e.g. the value returned by ``blastp_with_list``.

    Returns
    -------
    The object returned by ``SearchIO.read(path, 'blast-xml')``.

    Notes
    -----
    The text is written to a temporary file that is closed before it is parsed
    and removed afterwards, also when parsing raises. Closing first guarantees
    the whole string is on disk (the previous version reopened the file by
    name without flushing) and allows the reopen on platforms that refuse to
    open a file that is still held open.
    """
    #make temporary file; delete=False because it is reopened by name below
    tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.xml', encoding='utf-8', delete=False)
    try:
        #write string and close so the content is fully flushed
        with tmp:
            tmp.write(string)
        return SearchIO.read(tmp.name, 'blast-xml')
    finally:
        os.remove(tmp.name)

In [21]:
#Build per-hit columns (one entry per hit) from a parsed BLAST result
def blast_to_dictionary(blastresult):
    """Summarise every hit of a BLAST result as parallel column lists.

    Parameters
    ----------
    blastresult : iterable
        Yields hit objects exposing ``id``, ``description``, ``seq_len``,
        ``accession`` and ``hsps``; each HSP exposes ``bitscore``, ``evalue``,
        ``aln_span`` and ``gap_num``.

    Returns
    -------
    dict
        Keys ``ID``, ``Description``, ``Seq_length``, ``Accession``,
        ``Bitscore``, ``Evalue``, ``Tot_aln_span`` and ``Identity``.
        ``Bitscore``/``Evalue`` come from the first HSP of the hit;
        ``Tot_aln_span`` is the sum of ``aln_span`` over all HSPs;
        ``Identity`` is ``(total span - total gaps) / seq_len * 100`` rounded
        to three decimals.
    """
    columns = ('ID', 'Description', 'Seq_length', 'Accession',
               'Bitscore', 'Evalue', 'Tot_aln_span', 'Identity')
    table = {column: [] for column in columns}

    for hit in blastresult:
        row = [hit.id, hit.description, hit.seq_len, hit.accession]

        # Score and e-value are reported for the first HSP only.
        top_hsp = hit.hsps[0]
        row += [top_hsp.bitscore, top_hsp.evalue]

        # Accumulate span and gap counts over every HSP of this hit.
        span_total, gap_total = 0, 0
        for hsp in hit.hsps:
            span_total += int(hsp.aln_span)
            gap_total += int(hsp.gap_num)

        # NOTE(review): this divides by the hit's seq_len and never looks at
        # the number of identical positions, so it may be closer to a coverage
        # figure than to a percent identity - confirm the intended metric.
        percent = (span_total - gap_total) / int(hit.seq_len) * 100
        row += [span_total, round(percent, 3)]

        for column, value in zip(columns, row):
            table[column].append(value)

    return table

In [15]:
# Leftover shell check: only prints the help text of `conda init`.
!conda init --help

usage: conda init [-h] [--all] [--reverse] [--json] [-v] [-q] [-d]
                  [shells ...]

Initialize conda for shell interaction. [Experimental]

Options:

positional arguments:
  shells         One or more shells to be initialized. If not given, the
                 default value is 'bash' on unix and 'cmd.exe' on Windows. Use
                 the '--all' flag to initialize all shells. Currently
                 compatible shells are {bash, fish, powershell, tcsh, xonsh,
                 zsh}

optional arguments:
  -h, --help     Show this help message and exit.
  --all          Initialize all currently available shells.
  -d, --dry-run  Only display what would have been done.

setup type:
  --reverse      Undo past effects of conda init.

Output, Prompt, and Flow Control Options:
  --json         Report all output as json. Suitable for using conda
                 programmatically.
  -v, --verbose  Use once for info, twice for debug, three times for trace.
  -q, --quiet    D

In [1]:
# Scratch check of ete3 rendering: parses the Newick string "((a,b),c);" and
# renders it to mytree.png with a width of 183 mm.
# NOTE(review): this import sits outside the notebook's import cell.
from ete3 import Tree
t = Tree( "((a,b),c);" )
t.render("mytree.png", w=183, units="mm")

{'faces': [[625.3847069843147,
   42.78947995155837,
   645.1336977311878,
   82.28746144530456,
   3,
   'b'],
  [625.3847069843147,
   3.291498457812182,
   645.1336977311878,
   42.78947995155837,
   2,
   'a'],
  [319.27535040778173,
   82.28746144530456,
   335.7328426968426,
   121.78544293905074,
   4,
   'c']],
 'node_areas': {0: [3.2914984578121826,
   3.291498457812182,
   645.1336977311878,
   121.78544293905074],
  1: [13.16599383124873,
   3.291498457812182,
   645.1336977311878,
   82.28746144530456],
  2: [319.27535040778173,
   3.291498457812182,
   645.1336977311878,
   42.78947995155837],
  3: [319.27535040778173,
   42.78947995155837,
   645.1336977311878,
   82.28746144530456],
  4: [13.16599383124873,
   82.28746144530456,
   335.7328426968426,
   121.78544293905074]},
 'nodes': [[307.7551058054391,
   36.206483035934006,
   320.9210996366878,
   49.37247686718273,
   1,
   None],
  [613.8644623819721,
   55.9554737828071,
   627.0304562132208,
   69.12146761405583

In [11]:
# Resolve the query: the argument has no .fas/.fasta suffix, so open_input
# treats it as an accession and fetches it from NCBI with the given e-mail.
fasta_record = open_input('XP_011809449', 'A.N.Other@example.com')
fasta_record

SeqRecord(seq=Seq('MVLSEVDVVKADPAAASHPLLLNGDADVAQKSPGSVAENNLCSQYEEKVRPCID...FPG'), id='XP_011809449', name='XP_011809449', description='XP_011809449', dbxrefs=[])

In [16]:
# Online blastp of the query sequence with an empty taxid list (no organism
# filter). Despite its name, blast_handler holds the text that
# blastp_with_list read from the result handle, not the handle itself.
blast_handler = blastp_with_list(fasta_record.seq, [])

In [20]:
# Parse the BLAST XML text via SearchIO and display the length of the parsed result.
blast_result = xml_string_to_handler(blast_handler)
len(blast_result)

200

In [27]:
# Flatten the parsed result into column lists and print how many entries were collected.
dictionary_blast = blast_to_dictionary(blast_result)
print(len(dictionary_blast['ID']))


# Build a DataFrame from those columns; its length is displayed as a sanity check.
blast_df = pd.DataFrame.from_dict(dictionary_blast)
len(blast_df)

200


200