In [1]:
import pandas as pd
import numpy as np
from Bio import SeqIO, SeqRecord, Seq, SearchIO
from Bio.Blast import NCBIWWW, NCBIXML

In [2]:
# Build the sequence record: load every FASTA entry into an id -> SeqRecord
# mapping, then keep the first entry. (As the original note says, SeqIO.parse
# could be substituted by SeqIO.read here.)
seq_record_dictionary = SeqIO.to_dict(SeqIO.parse("human_mx1.fas", 'fasta'))
first_record_id = list(seq_record_dictionary)[0]
sequence_record = seq_record_dictionary[first_record_id]
sequence_record

SeqRecord(seq=Seq('ATGGTTGTTTCCGAAGTGGACATCGCAAAAGCTGATCCAGCTGCTGCATCCCAC...TAA'), id='lcl|XM_005260978.4_cds_XP_005261035.1_1', name='lcl|XM_005260978.4_cds_XP_005261035.1_1', description='lcl|XM_005260978.4_cds_XP_005261035.1_1 [gene=MX1] [db_xref=GeneID:4599] [protein=interferon-induced GTP-binding protein Mx1 isoform X1] [protein_id=XP_005261035.1] [location=458..2446] [gbkey=CDS]', dbxrefs=[])

In [None]:
# Derive the protein from the coding sequence: transcribe DNA -> RNA, then
# translate, stopping at the first stop codon (to_stop=True).
rna = sequence_record.seq.transcribe()
protein = rna.translate(to_stop=True)
protein

In [None]:
# Run BLASTn against the 'nt' database and store the raw XML result on disk.
# NOTE: qblast sends the query to NCBI over the network, so this cell needs
# internet access and may take a while; the cells below only read the saved
# .xml file. An Entrez filter can be added with entrez_query='txid000000'.
# The returned handle is closed as soon as its content has been read.
with NCBIWWW.qblast('blastn', 'nt', sequence_record.seq) as result_handler:
    result_storer = result_handler.read()

with open('search_from_parser2.xml', 'w') as savefile:
    savefile.write(result_storer)

In [3]:
# Import the saved .xml file with SearchIO; read() returns one QueryResult.
results_searchio = SearchIO.read('search_from_parser2.xml', 'blast-xml')

# Unpack the upper object (QueryResult), one attribute per printed line:
#   seq_len - length of the DNA molecule submitted for analysis
#   program - BLAST program
#   version - BLAST version
#   target  - database
#   id      - query id (per the original note, 'No' is the default)
# The parameters used for the BLAST alignments can be reached through
# results_searchio.param....
for query_attribute in ('seq_len', 'program', 'version', 'target', 'id'):
    print(getattr(results_searchio, query_attribute))

1989
blastn
2.12.0+
nt
No


In [4]:
# Unpack the first result (a Hit object). Only one hit is shown to avoid
# excessive output. Query id, description and length are skipped because they
# belong to the upper (QueryResult) object.
result1 = results_searchio[0]
for hit_attribute in ('id', 'description', 'seq_len', 'accession'):
    # seq_len is the length of the hit sequence
    print(getattr(result1, hit_attribute))


gi|1519315249|ref|NM_002462.5|
Homo sapiens MX dynamin like GTPase 1 (MX1), transcript variant 2, mRNA
2776
NM_002462


In [25]:
# List the id of every hit stored in the QueryResult, one per line.
for hit in results_searchio.hits:
    print(hit.id)

gi|1519315249|ref|NM_002462.5|
gi|1370481400|ref|XM_017028349.2|
gi|1034627218|ref|XM_005260982.2|
gi|1034627217|ref|XM_005260981.2|
gi|1034627215|ref|XM_017028350.1|
gi|1034627212|ref|XM_005260980.2|
gi|1034627211|ref|XM_005260979.2|
gi|1034627210|ref|XM_011529568.2|
gi|1034627209|ref|XM_005260978.4|
gi|544711167|ref|NM_001144925.2|
gi|1890270839|ref|NM_001178046.3|
gi|190135|gb|M33882.1|HUMPMX1A
gi|61354936|gb|AY888131.1|
gi|60825047|gb|AY893667.1|
gi|21619146|gb|BC032602.1|
gi|261858633|dbj|AB527675.1|
gi|325464350|gb|JF432729.1|
gi|60812884|gb|AY893199.1|
gi|649120027|gb|KJ897213.1|
gi|164695655|dbj|AK315465.1|
gi|188900|gb|M30817.1|HUMMXA
gi|21755829|dbj|AK096355.1|
gi|1849077349|ref|XM_034947820.1|
gi|1849077347|ref|XM_034947819.1|
gi|1849077342|ref|XM_008977603.2|
gi|1849077334|ref|XM_008977599.2|
gi|1849077327|ref|XM_034947818.1|
gi|1849077319|ref|XM_008977598.4|
gi|1849077312|ref|XM_003823901.5|
gi|1367238133|ref|XM_009438930.3|
gi|1367238130|ref|XM_009438452.3|
gi|1367238127|

In [42]:
# Unpack the HSP (high-scoring segment pair) of the same Hit object.
# Remember that BLAST uses local alignment and could be biased.
# .hsps returns a list (of length 1 for this hit), so take its first element.
result1_hsp = result1.hsps[0]

hsp_attributes = (
    'bitscore', 'bitscore_raw', 'evalue',
    'query_start', 'query_end',
    'hit_start', 'hit_end',
    'query_frame', 'hit_frame',
    'ident_num', 'pos_num', 'gap_num',
    'aln_span',  # !!! length of the alignment
    'query',     # aligned query record; important to run local alignments
)
for hsp_attribute in hsp_attributes:
    print(getattr(result1_hsp, hsp_attribute))

print('\nseparate\n')
# This object only stores the aligned sequence (1989 characters) instead of
# the full hit sequence of 2776.
print(result1_hsp.query.seq)

3588.19
3978
0.0
0
1989
333
2322
1
1
1989
1989
0
1989
ID: No
Name: aligned query sequence
Description: definition line
Number of features: 0
/molecule_type=DNA
Seq('ATGGTTGTTTCCGAAGTGGACATCGCAAAAGCTGATCCAGCTGCTGCATCCCAC...TAA')

separate

ATGGTTGTTTCCGAAGTGGACATCGCAAAAGCTGATCCAGCTGCTGCATCCCACCCTCTATTACTGAATGGAGATGCTACTGTGGCCCAGAAAAATCCAGGCTCGGTGGCTGAGAACAACCTGTGCAGCCAGTATGAGGAGAAGGTGCGCCCCTGCATCGACCTCATTGACTCCCTGCGGGCTCTAGGTGTGGAGCAGGACCTGGCCCTGCCAGCCATCGCCGTCATCGGGGACCAGAGCTCGGGCAAGAGCTCCGTGTTGGAGGCACTGTCAGGAGTTGCCCTTCCCAGAGGCAGCGGGATCGTGACCAGATGCCCGCTGGTGCTGAAACTGAAGAAACTTGTGAACGAAGATAAGTGGAGAGGCAAGGTCAGTTACCAGGACTACGAGATTGAGATTTCGGATGCTTCAGAGGTAGAAAAGGAAATTAATAAAGCCCAGAATGCCATCGCCGGGGAAGGAATGGGAATCAGTCATGAGCTAATCACCCTGGAGATCAGCTCCCGAGATGTCCCGGATCTGACTCTAATAGACCTTCCTGGCATAACCAGAGTGGCTGTGGGCAATCAGCCTGCTGACATTGGGTATAAGATCAAGACACTCATCAAGAAGTACATCCAGAGGCAGGAGACAATCAGCCTGGTGGTGGTCCCCAGTAATGTGGACATCGCCACCACAGAGGCTCTCAGCATGGCCCAGGAGGTGGACCCCGAGGGAGACAGGACCATCGGAATCTTGACGAAGCCTGATCTGGTGGACAA

In [34]:
# Collect one row per hit into a column-oriented dictionary (column -> list).
# Hit-level columns map a column name to the Hit attribute it is read from;
# HSP-level columns are named exactly like the HSP attribute they come from.
hit_columns = {
    'id': 'id',
    'description': 'description',
    'seq_length': 'seq_len',
    'accession': 'accession',
}
hsp_columns = (
    'bitscore', 'bitscore_raw', 'evalue',
    'hit_start', 'hit_end', 'query_frame',
    'gap_num', 'aln_span',
)

blast_dictionary = {column: [] for column in (*hit_columns, *hsp_columns)}

for hit in results_searchio:
    for column, hit_attribute in hit_columns.items():
        blast_dictionary[column].append(getattr(hit, hit_attribute))
    # Only the first HSP of each hit is recorded.
    first_hsp = hit.hsps[0]
    for column in hsp_columns:
        blast_dictionary[column].append(getattr(first_hsp, column))


In [35]:
# Turn the column -> list dictionary into a DataFrame (one row per hit)
# and print its plain-text representation.
blast_dataframe = pd.DataFrame(blast_dictionary)
print(blast_dataframe)

                                   id  \
0      gi|1519315249|ref|NM_002462.5|   
1   gi|1370481400|ref|XM_017028349.2|   
2   gi|1034627218|ref|XM_005260982.2|   
3   gi|1034627217|ref|XM_005260981.2|   
4   gi|1034627215|ref|XM_017028350.1|   
5   gi|1034627212|ref|XM_005260980.2|   
6   gi|1034627211|ref|XM_005260979.2|   
7   gi|1034627210|ref|XM_011529568.2|   
8   gi|1034627209|ref|XM_005260978.4|   
9    gi|544711167|ref|NM_001144925.2|   
10  gi|1890270839|ref|NM_001178046.3|   
11     gi|190135|gb|M33882.1|HUMPMX1A   
12         gi|61354936|gb|AY888131.1|   
13         gi|60825047|gb|AY893667.1|   
14         gi|21619146|gb|BC032602.1|   
15       gi|261858633|dbj|AB527675.1|   
16        gi|325464350|gb|JF432729.1|   
17         gi|60812884|gb|AY893199.1|   
18        gi|649120027|gb|KJ897213.1|   
19       gi|164695655|dbj|AK315465.1|   
20       gi|188900|gb|M30817.1|HUMMXA   
21        gi|21755829|dbj|AK096355.1|   
22  gi|1849077349|ref|XM_034947820.1|   
23  gi|184907734

In [38]:
# Save the dataframe as .csv (the index column is kept, as before).
# pandas writes the file itself: this avoids the redundant str() round-trip
# and the doubled carriage returns that appear on Windows when the string
# returned by to_csv() is pushed through a text-mode file handle.
blast_dataframe.to_csv('csv_blast_results.csv')


In [8]:
# Import a saved .xml file with NCBIXML; read() returns a single BLAST record.
# The file is opened in a with-block so the handle is closed once parsing is
# done.
# NOTE(review): 'blastp_results.xml' is not written by any cell shown in this
# notebook - confirm that the file exists before running this cell.
with open('blastp_results.xml') as xml_parser:
    results_xml = NCBIXML.read(xml_parser)

In [23]:
# results_xml is a BLAST record object whose results are exposed as lists.
# Print the score of every entry in its descriptions list, one per line.
for description in results_xml.descriptions:
    print(description.score)

3528.0
3527.0
3526.0
3521.0
3519.0
3517.0
3516.0
3511.0
3489.0
3488.0
3484.0
3477.0
3477.0
3476.0
3466.0
3424.0
3417.0
3412.0
3410.0
3365.0
3337.0
3329.0
3285.0
3282.0
3282.0
3281.0
3277.0
3276.0
3275.0
3273.0
3270.0
3269.0
3267.0
3267.0
3264.0
3264.0
3258.0
3256.0
3254.0
3251.0
3246.0
3223.0
3222.0
3211.0
3197.0
3176.0
3176.0
3169.0
3162.0
3160.0
