In [3]:
import pandas as pd
import numpy as np
from Bio import SeqIO, SeqRecord, Seq, SearchIO
from Bio.Blast import NCBIWWW, NCBIXML

In [None]:
#Create sequence record
# Parse the FASTA file into a dictionary of records and keep the first one.
# (Original note: SeqIO.parse could be substituted by SeqIO.read.)
fasta_records = SeqIO.parse("human_mx1.fas", 'fasta')
seq_record_dictionary = SeqIO.to_dict(fasta_records)
first_record_key = list(seq_record_dictionary.keys())[0]
sequence_record = seq_record_dictionary[first_record_key]
sequence_record

In [None]:
#Create protein from sequence
# Transcribe the record's sequence to RNA, then translate it up to the first stop codon.
rna = sequence_record.seq.transcribe()
protein = rna.translate(to_stop=True)
protein

In [None]:
#Run BLASTn and store results in .xml file
# qblast returns a readable handle; the with-block closes it as soon as the
# XML text has been read, instead of leaving it open for the kernel's lifetime.
with NCBIWWW.qblast('blastn', 'nt', sequence_record.seq) as result_handler: #can add entrez_query='txid000000'
    result_storer = result_handler.read()
with open('search_from_parser2.xml', 'w') as savefile:
    savefile.write(result_storer)

In [4]:
#Import saved .xml file with SearchIO
results_searchio = SearchIO.read('search_from_parser2.xml', 'blast-xml')

#Unpacking of upper object (QueryResult); attributes are printed in this order:
#  seq_len - length of the DNA molecule submitted for analysis
#  program
#  version
#  target  - database
#  id      - query id ("No" is the default)
for query_attribute in ('seq_len', 'program', 'version', 'target', 'id'):
    print(getattr(results_searchio, query_attribute))
#with results_searchio.param.... can get the parameters used for the blast alignments

#for result in results_searchio:
#    print(result.id)

1989
blastn
2.12.0+
nt
No


In [None]:
#Unpacking of first result (Hit object); only one hit is shown to avoid excessive output
#Query ID, description and length are skipped because they were already unpacked from the upper object
result1 = results_searchio[0]
for hit_attribute in ('id', 'description', 'seq_len', 'accession'): #seq_len is the sequence length
    print(getattr(result1, hit_attribute))


In [None]:
# Print the id of every hit in the search result.
for hit in results_searchio:
    print(hit.id)

In [None]:
#Unpacking the HSPs (high scoring segment pairs) of the same Hit object
#(remember that blast uses local alignment and could be biased)
#Only the first HSP is inspected (original note: .hsps returns a list of length 1 here)
result1_hsp = result1.hsps[0]
hsp_attributes = (
    'bitscore', 'bitscore_raw', 'evalue',
    'query_start', 'query_end',
    'hit_start', 'hit_end',
    'query_frame', 'hit_frame',
    'ident_num', 'pos_num', 'gap_num',
    'aln_span', #!!! Length of alignment
    'query', #Query Seq object! important to run local alignments
)
for hsp_attribute in hsp_attributes:
    print(getattr(result1_hsp, hsp_attribute))

print('\nseparate\n')
# this object only stores the alignment sequence (1989 characters) instead of the hit one of 2776
print(result1_hsp.query.seq)

In [None]:
#make a dictionary (one entry per hit, built from the hit's first HSP only)

# Output column -> attribute read from the hit object.
hit_fields = {'id': 'id', 'description': 'description', 'seq_length': 'seq_len', 'accession': 'accession'}
# Columns read from the HSP; each column name equals the HSP attribute name.
hsp_fields = ('bitscore', 'bitscore_raw', 'evalue', 'hit_start', 'hit_end', 'query_frame', 'gap_num', 'aln_span')

blast_dictionary = {column: [] for column in (*hit_fields, *hsp_fields)}

for hit in results_searchio:
    for column, attribute in hit_fields.items():
        blast_dictionary[column].append(getattr(hit, attribute))
    first_hsp = hit.hsps[0] #a hit could carry more than 1 HSP; only the first one is recorded here
    for column in hsp_fields:
        blast_dictionary[column].append(getattr(first_hsp, column))


In [24]:
#Make a dictionary that deals with multiple HSPS
#This dictionary creates a different entry for each HSP

def redundant_blast_to_dictionary(blastresult):
    """Flatten a BLAST result into column lists with one entry per HSP.

    Hit-level values (id, description, seq_len, accession) are repeated for
    every HSP of that hit, so a hit with several HSPs yields several entries.

    Parameters
    ----------
    blastresult : iterable
        Hit objects exposing ``id``, ``description``, ``seq_len``,
        ``accession`` and an ``hsps`` sequence.

    Returns
    -------
    dict
        Column name -> list of values; all lists have the same length
        (the total number of HSPs).
    """
    # (output column, attribute on the hit object)
    hit_fields = (
        ('id', 'id'),
        ('description', 'description'),
        ('seq_length', 'seq_len'),
        ('accession', 'accession'),
    )
    # HSP columns share their name with the HSP attribute they are read from.
    hsp_fields = (
        'bitscore', 'bitscore_raw', 'evalue', 'hit_start',
        'hit_end', 'query_frame', 'gap_num', 'aln_span',
    )

    table = {column: [] for column, _ in hit_fields}
    table.update((column, []) for column in hsp_fields)

    for hit in blastresult:
        for hsp in hit.hsps:
            for column, attribute in hit_fields:
                table[column].append(getattr(hit, attribute))
            for column in hsp_fields:
                table[column].append(getattr(hsp, column))
    return table

#No multiple HSPs
#print(redundant_blast_to_dictionary(results_searchio))

#With multiple HSPs
# Load the multi-HSP search result in this cell so it runs on a fresh kernel
# (Restart & Run All) without relying on a later cell having been executed first.
results_mult_hsps = SearchIO.read('for_loop_search_txid9592.xml', 'blast-xml')
print(redundant_blast_to_dictionary(results_mult_hsps))

{'id': ['gi|525344602|ref|NM_001279761.1|', 'gi|1753056057|ref|XM_019017591.2|', 'gi|1753056056|ref|XM_019017590.2|', 'gi|1753056055|ref|XM_019017589.2|', 'gi|1753056054|ref|XM_019017588.2|', 'gi|1753056053|ref|XM_019017587.2|', 'gi|1753056052|ref|XM_019017586.2|', 'gi|1753056051|ref|XM_019017585.2|', 'gi|1753056058|ref|XM_031005077.1|', 'gi|1753056058|ref|XM_031005077.1|', 'gi|1753056672|ref|XM_019017900.2|', 'gi|1753056670|ref|XM_004062835.3|', 'gi|1753056668|ref|XM_019017898.2|', 'gi|948542510|gb|KT698238.1|', 'gi|1753053231|ref|XM_031004421.1|', 'gi|1753086199|ref|XR_004071381.1|', 'gi|1753086199|ref|XR_004071381.1|', 'gi|1753086198|ref|XR_004071380.1|', 'gi|1753086198|ref|XR_004071380.1|', 'gi|1753086196|ref|XM_031014265.1|', 'gi|1753086196|ref|XM_031014265.1|', 'gi|1753086194|ref|XM_031014264.1|', 'gi|1753086194|ref|XM_031014264.1|', 'gi|1753086192|ref|XM_031014263.1|', 'gi|1753086192|ref|XM_031014263.1|', 'gi|1753086190|ref|XM_031014262.1|', 'gi|1753086190|ref|XM_031014262.1|', 

In [10]:
#Make a dictionary that deals with multiple HSPs
#This dictionary joins the values of multiple HSPs of the same hit into a single '/'-separated entry

def blast_to_dictionary(blastresult):
    """Flatten a BLAST result into column lists with one entry per hit.

    Hit-level values (id, description, seq_len, accession) are stored as-is.
    For HSP-level values:

    * a hit with exactly one HSP stores the raw attribute value;
    * any other hit stores a string with the ``str()`` of each HSP's value
      joined by ``'/'``, in HSP order (an empty string if there are no HSPs).

    Parameters
    ----------
    blastresult : iterable
        Hit objects exposing ``id``, ``description``, ``seq_len``,
        ``accession`` and an ``hsps`` sequence.

    Returns
    -------
    dict
        Column name -> list of values, one value per hit.
    """
    # Output column -> attribute read from the hit object.
    hit_fields = {'id': 'id', 'description': 'description', 'seq_length': 'seq_len', 'accession': 'accession'}
    # HSP columns share their name with the HSP attribute they are read from.
    hsp_fields = ('bitscore', 'bitscore_raw', 'evalue', 'hit_start', 'hit_end', 'query_frame', 'gap_num', 'aln_span')

    blast_dictionary = {column: [] for column in (*hit_fields, *hsp_fields)}

    for result in blastresult:
        for column, attribute in hit_fields.items():
            blast_dictionary[column].append(getattr(result, attribute))

        hsps = result.hsps
        for column in hsp_fields:
            if len(hsps) == 1:
                value = getattr(hsps[0], column)
            else:
                # str.join places the separator by position, so HSPs that happen
                # to compare equal to the last one still get their '/'.
                value = '/'.join(str(getattr(hsp, column)) for hsp in hsps)
            blast_dictionary[column].append(value)

    return blast_dictionary

#No multiple HSPs
#print(blast_to_dictionary(results_searchio))

#With multiple HSPs
results_mult_hsps = SearchIO.read('for_loop_search_txid9592.xml', 'blast-xml')
mult_hsps_table = blast_to_dictionary(results_mult_hsps)
print(mult_hsps_table)



{'id': ['gi|525344602|ref|NM_001279761.1|', 'gi|1753056057|ref|XM_019017591.2|', 'gi|1753056056|ref|XM_019017590.2|', 'gi|1753056055|ref|XM_019017589.2|', 'gi|1753056054|ref|XM_019017588.2|', 'gi|1753056053|ref|XM_019017587.2|', 'gi|1753056052|ref|XM_019017586.2|', 'gi|1753056051|ref|XM_019017585.2|', 'gi|1753056058|ref|XM_031005077.1|', 'gi|1753056672|ref|XM_019017900.2|', 'gi|1753056670|ref|XM_004062835.3|', 'gi|1753056668|ref|XM_019017898.2|', 'gi|948542510|gb|KT698238.1|', 'gi|1753053231|ref|XM_031004421.1|', 'gi|1753086199|ref|XR_004071381.1|', 'gi|1753086198|ref|XR_004071380.1|', 'gi|1753086196|ref|XM_031014265.1|', 'gi|1753086194|ref|XM_031014264.1|', 'gi|1753086192|ref|XM_031014263.1|', 'gi|1753086190|ref|XM_031014262.1|', 'gi|1753086188|ref|XM_031014261.1|', 'gi|1753086186|ref|XM_031014260.1|', 'gi|1753086184|ref|XM_031014259.1|', 'gi|1753086182|ref|XM_031014258.1|', 'gi|1753086180|ref|XM_031014257.1|', 'gi|1753086178|ref|XM_031014256.1|', 'gi|1753024318|ref|XM_031012314.1|', 

In [None]:
#Print the number of HSPs of each hit (original note: no hit object has 2 HSPs)
for hit in results_searchio:
    print(len(hit.hsps))

In [None]:
#Create dataframe
# Each key of blast_dictionary becomes a column.
blast_dataframe = pd.DataFrame(blast_dictionary)
print(blast_dataframe)

In [None]:
#Save it as .csv
# Let pandas write the file directly. Pushing the to_csv() string through a
# text-mode handle can double the line terminators on Windows (blank rows),
# and the extra str() call was redundant.
blast_dataframe.to_csv('csv_blast_results.csv')


In [None]:
#Import saved .xml with NCBIXML
# The with-block closes the file once the record has been parsed.
with open('blastp_results.xml') as xml_parser:
    results_xml = NCBIXML.read(xml_parser)

In [None]:
#results_xml is a record.blast object
#It returns all the results as lists; print the score of every description
for description in results_xml.descriptions:
    result_1 = description.score
    print(result_1)