In [1]:
import os
import pandas as pd
import numpy as np
from Bio import SeqIO, SeqRecord, Seq, SearchIO, AlignIO, Phylo
from Bio.Blast import NCBIWWW, NCBIXML
import Bio.Entrez
from Bio.Phylo.TreeConstruction import DistanceCalculator,DistanceTreeConstructor
import threading
from itertools import repeat
import tempfile

In [2]:
#Load the record stored in a FASTA file
def open_fasta(filename):
    """Parse ``filename`` as FASTA and return the resulting record.

    The file is opened with the built-in ``open`` and handed to
    ``SeqIO.read`` using the ``'fasta'`` format. The file is closed again
    before the function returns.

    Parameters
    ----------
    filename : path of the FASTA file to read.

    Returns
    -------
    The object produced by ``SeqIO.read`` for that file.
    """
    with open(filename) as fasta_file:
        return SeqIO.read(fasta_file, 'fasta')

In [None]:
#Function for multi-taxid Blast
def blastn_with_taxid(sequence, list_taxid = [], query_size = 50):
    """Run one online ``blastn`` search against ``nt``, optionally limited to taxa.

    Parameters
    ----------
    sequence : query, passed unchanged to ``NCBIWWW.qblast``.
    list_taxid : taxonomy ids used to restrict the search. Each id becomes a
        ``txid<ID>[ORGN]`` term and the terms are combined with `` OR ``.
        An empty list (the default) or ``None`` runs an unrestricted search.
        The default list is never mutated.
    query_size : forwarded to ``qblast`` as ``hitlist_size``. It is now
        honoured for unrestricted searches too; previously it was dropped in
        that case and ``qblast`` fell back to its own default.

    Returns
    -------
    str : the text read from the handle returned by ``qblast``.
    """
    taxids = list(list_taxid or [])
    # str.join places a separator between every pair of terms. Deciding it
    # per element by comparing with the last id skipped the separator
    # whenever an earlier id was equal to the last one.
    entrez_query = ' OR '.join(f'txid{taxid}[ORGN]' for taxid in taxids)

    search_options = {'hitlist_size': query_size}
    if entrez_query:
        # Only send the filter when there is one, as in the unrestricted call.
        search_options['entrez_query'] = entrez_query

    result_handler = NCBIWWW.qblast('blastn', 'nt', sequence, **search_options)
    try:
        return result_handler.read()
    finally:
        # Release the handle once its content has been copied out.
        result_handler.close()

In [None]:
#Single-taxid BLAST search, used as the worker for the threaded run
def run_blast(fasta_record, taxid, query_size = 20):
    """Run one online ``blastn`` search against ``nt`` restricted to ``taxid``.

    Progress is reported on stdout: the taxid is printed before the search
    starts and ``'<taxid> done'`` once the result has been read.

    Parameters
    ----------
    fasta_record : object whose ``.seq`` attribute is used as the query.
    taxid : taxonomy id inserted into the ``txid<ID>[ORGN]`` Entrez filter.
    query_size : forwarded to ``qblast`` as ``hitlist_size``.

    Returns
    -------
    The content read from the handle returned by ``NCBIWWW.qblast``.
    """
    print(taxid)
    organism_filter = f'txid{taxid}[ORGN]'
    blast_output = NCBIWWW.qblast(
        'blastn',
        'nt',
        fasta_record.seq,
        entrez_query=organism_filter,
        hitlist_size=query_size,
    ).read()
    print(f'{taxid} done')
    return blast_output


In [None]:
def blast_with_threading(fasta_record, taxid_list, query_size = 20):
    """Run one ``run_blast`` search per taxid concurrently and parse the results.

    Parameters
    ----------
    fasta_record : record handed to ``run_blast`` for every search.
    taxid_list : iterable of taxonomy ids; one thread is started per id.
    query_size : forwarded to ``run_blast`` for every search.

    Returns
    -------
    list : one ``SearchIO.read(..., 'blast-xml')`` result per successful
        search, in the same order as ``taxid_list``. A search whose thread
        raised is absent from the list (the exception itself is reported by
        the ``threading`` machinery), so the list can be shorter than
        ``taxid_list``.
    """
    # Function-scope import so this cell does not depend on the import cell
    # being edited.
    from io import StringIO

    taxids = list(taxid_list)
    # One slot per taxid keeps the output in input order, whichever thread
    # finishes first.
    raw_results = [None] * len(taxids)

    def _search_into_slot(slot, taxid):
        # Each thread writes only to its own slot.
        raw_results[slot] = run_blast(fasta_record, taxid, query_size=query_size)

    threads = []
    for slot, taxid in enumerate(taxids):
        # Arguments are bound here through ``args``. A lambda closing over the
        # loop variable would read ``taxid`` only when the thread runs, by
        # which time the loop may already have moved on to another id.
        t = threading.Thread(target=_search_into_slot, args=(slot, taxid))
        t.start()
        threads.append(t)

    for thread in threads:
        thread.join()

    list_of_handlers = []
    for raw_xml in raw_results:
        if raw_xml is None:
            # The worker for this slot did not return; skip it.
            continue
        # Parse straight from memory. Writing to a temporary file and reading
        # it back by name without flushing could hand the parser an empty or
        # truncated file.
        list_of_handlers.append(SearchIO.read(StringIO(raw_xml), 'blast-xml'))

    return list_of_handlers




In [None]:
#Create a dictionary from a BLAST result: one entry per hit, listing all of its HSPs plus identity and other metrics

#Hits without at least a significant HSP are excluded

#!!!No more hsp or hsp combined

def blast_to_dictionary_plus_metrics(blastresult):
    """Flatten the hits of a BLAST result into a column-oriented dictionary.

    Parameters
    ----------
    blastresult : iterable of hits. Each hit must expose ``id``,
        ``description``, ``seq_len``, ``accession`` and ``hsps``; each HSP
        must expose ``bitscore``, ``bitscore_raw``, ``evalue``, ``hit_start``,
        ``hit_end``, ``query_frame``, ``gap_num`` and ``aln_span``.

    Returns
    -------
    dict : maps column name to a list with one entry per hit.
        - ``ID``, ``Description``, ``Seq_length``, ``Accession``: copied
          from the hit.
        - ``Bitscore`` ... ``Aln_span``: the per-HSP values as strings joined
          with ``'/'`` (empty string for a hit without HSPs).
        - ``Tot_aln_span``: sum of ``aln_span`` over the hit's HSPs.
        - ``Identity``: ``(Tot_aln_span - total gaps) / seq_len * 100``,
          rounded to 3 decimals.

    Raises
    ------
    ZeroDivisionError : if a hit reports a ``seq_len`` of 0.
    """
    # (output column, HSP attribute) pairs for the '/'-joined columns.
    hsp_columns = (
        ('Bitscore', 'bitscore'),
        ('Bitscore_raw', 'bitscore_raw'),
        ('Evalue', 'evalue'),
        ('Hit_start', 'hit_start'),
        ('Hit_end', 'hit_end'),
        ('Query_frame', 'query_frame'),
        ('Gap_num', 'gap_num'),
        ('Aln_span', 'aln_span'),
    )
    blast_dictionary = {
        'ID' : [], 'Description' : [], 'Seq_length' : [], 'Accession' : [],
        'Bitscore' : [], 'Bitscore_raw' : [], 'Evalue' : [], 'Hit_start' : [],
        'Hit_end' : [], 'Query_frame' : [], 'Gap_num' : [], 'Aln_span' : [],
        'Tot_aln_span' : [], 'Identity' : [],
    }
    for result in blastresult:
        hsps = list(result.hsps)

        blast_dictionary['ID'].append(result.id)
        blast_dictionary['Description'].append(result.description)
        blast_dictionary['Seq_length'].append(result.seq_len)
        blast_dictionary['Accession'].append(result.accession)

        # join() always puts exactly one '/' between consecutive HSP values.
        # Choosing the separator by comparing each HSP with the last one lost
        # separators whenever an HSP compared equal to the final one.
        for column, attribute in hsp_columns:
            joined = '/'.join(str(getattr(hsp, attribute)) for hsp in hsps)
            blast_dictionary[column].append(joined)

        tot_alnspan = sum(int(hsp.aln_span) for hsp in hsps)
        tot_gapnum = sum(int(hsp.gap_num) for hsp in hsps)
        seq_len = int(result.seq_len)
        # NOTE(review): this is (aligned columns - gaps) relative to the hit
        # length, not a count of identical positions - confirm that this is
        # the intended meaning of "Identity".
        identity = (tot_alnspan - tot_gapnum)/seq_len*100
        blast_dictionary['Tot_aln_span'].append(tot_alnspan)
        blast_dictionary['Identity'].append(round(identity, 3))
    return blast_dictionary

In [None]:
def dictionary_from_handler_list(list_of_handlers):
    """Combine several BLAST results into a single DataFrame.

    Every element of ``list_of_handlers`` is converted with
    ``blast_to_dictionary_plus_metrics`` and turned into a DataFrame; the
    per-result frames are then stacked row-wise.

    Parameters
    ----------
    list_of_handlers : iterable of BLAST results accepted by
        ``blast_to_dictionary_plus_metrics``.

    Returns
    -------
    pandas.DataFrame : all rows of all results, with a fresh 0..n-1 index.
        An empty DataFrame is returned when ``list_of_handlers`` is empty.
    """
    # Build every frame first and concatenate once. ``DataFrame.append``
    # returned a new frame that was thrown away, so the result was always
    # empty; the method is also gone from pandas 2.0.
    frames = [
        pd.DataFrame.from_dict(blast_to_dictionary_plus_metrics(handler))
        for handler in list_of_handlers
    ]
    if not frames:
        # pd.concat raises ValueError on an empty list; keep returning an
        # empty frame for empty input.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)



In [None]:
#Open record: load the query sequence from 'human_mx1.fas' (path is relative to the working directory)
fasta_record = open_fasta('human_mx1.fas')
#Earlier taxid selection, kept commented out for reference
#old_taxid_list = ['9592', '9527', '9601'] #, '40674', '314147', '9531', '9544', '2008792']
#Taxonomy ids to search; only the first id is active, the others are commented out at the end of the line
taxid_list = ['9606']#, '9597', '9593', '9600', '9601', '61853', '9546', '9544', '9541', '54180']


In [None]:
#Run Blast in one query: all ids in taxid_list go into a single search, with query_size=200
#NOTE: despite its name, handler_single_query holds the text read from the BLAST result handle, not a handle
handler_single_query = blastn_with_taxid(fasta_record.seq,list_taxid = taxid_list, query_size=200)



In [None]:
#Run blast with threading: one search per id in taxid_list, each in its own thread, with query_size=20
#The result is a list holding one parsed 'blast-xml' result (SearchIO.read output) per search
handler_threading_query = blast_with_threading(fasta_record, taxid_list, query_size=20)
