In [1]:
import os
import pandas as pd
import numpy as np
from Bio import SeqIO, SeqRecord, Seq, SearchIO, AlignIO, Phylo
from Bio.Blast import NCBIWWW, NCBIXML
import Bio.Entrez

In [2]:
def open_fasta(filename):
    with open(filename) as handle:
        sequence_record = SeqIO.read(handle, 'fasta')
    return sequence_record

fasta_record = open_fasta('human_mx1.fas')
print(fasta_record)

ID: lcl|XM_005260978.4_cds_XP_005261035.1_1
Name: lcl|XM_005260978.4_cds_XP_005261035.1_1
Description: lcl|XM_005260978.4_cds_XP_005261035.1_1 [gene=MX1] [db_xref=GeneID:4599] [protein=interferon-induced GTP-binding protein Mx1 isoform X1] [protein_id=XP_005261035.1] [location=458..2446] [gbkey=CDS]
Number of features: 0
Seq('ATGGTTGTTTCCGAAGTGGACATCGCAAAAGCTGATCCAGCTGCTGCATCCCAC...TAA')


In [3]:
#Function to run blast with a list of taxid in one query, if list is empty no taxid is specified
def blastn_with_taxid(sequence, filename, query_size = 50, list_taxid = []):
    result_handler, result_storer = None, None
    if len(list_taxid) <1:
        result_handler = NCBIWWW.qblast('blastn', 'nt', sequence) 
        result_storer = result_handler.read()
    else:
        entrez_query = ''
        for taxid in list_taxid:
            entrez_query += f'txid{taxid}[ORGN]'
            if taxid != taxid_list[-1]:
                entrez_query += ' OR '
        result_handler = NCBIWWW.qblast('blastn', 'nt', sequence, entrez_query= entrez_query, hitlist_size=query_size)
        result_storer = result_handler.read()
    with open(f'{filename}.xml', 'w') as savefile:
        savefile.write(result_storer)

taxid_list = ['9592','9527', '40674', '314147']

blastn_with_taxid(fasta_record.seq, 'test_taxid_function', query_size = 100, list_taxid = taxid_list)

#65.5 62.9 seconds 4 taxids query_size 100

In [9]:
#Function to run blast with a list of taxid in separate queries

def blast_single_taxid(sequence, filename, query_size = 50, list_taxid = []):
    result_handler, result_storer = None, None
    if len(list_taxid) <1:
        result_handler = NCBIWWW.qblast('blastn', 'nt', sequence) 
        result_storer = result_handler.read()
    else:
        for taxid in taxid_list:
            result_storer = []
            entrez = f'txid{taxid}[ORGN]'
            result_handler = NCBIWWW.qblast('blastn', 'nt', sequence, entrez_query= entrez, hitlist_size=query_size)
            result_storer.append(result_handler.read())
            print(f'{taxid} done')
    with open(f'{filename}.xml', 'w') as savefile:
        savefile.write('\n'.join(result_storer))

taxid_list = ['9592', '9527', '40674', '314147', '9531', '9544', '2008792'] 

blast_single_taxid(fasta_record.seq, 'test_taxid_function_single_7', query_size = 25,list_taxid = taxid_list)

#842 sec :( 4 taxids query_size 25 
#822 sec 3 taxid query_size 25

9592 done
9527 done


In [14]:
#Function to parallelise multiple queries
import multiprocessing

def blast_single_taxid_parallel(sequence, filename, query_size = 50, list_taxid = []):
    
    def run_blast(taxid):
        entrez = f'txid{taxid}[ORGN]'
        result_handler = NCBIWWW.qblast('blastn', 'nt', sequence, entrez_query= entrez, hitlist_size=query_size)
        result_storer = result_handler.read()
        print(f'{taxid} done')
        return result_storer
    
    pool_obj = multiprocessing.Pool()
    storer_list = pool_obj.map(run_blast, list_taxid) #for x in list_taxid
    return storer_list

taxid_list = ['9592', '9527', '40674', '314147']

result_parallel = blast_single_taxid_parallel(fasta_record.seq, 'useless_now', query_size=25, list_taxid=taxid_list)

print(result_parallel)


SyntaxError: invalid syntax (661066193.py, line 14)