In [6]:
import pandas as pd
import numpy as np
from Bio import SeqIO, SeqRecord, Seq, SearchIO
from Bio.Blast import NCBIWWW, NCBIXML

In [4]:
# Build an {id: SeqRecord} dictionary from the FASTA file and keep its first
# entry as the working record.
# (SeqIO.read could be used in place of SeqIO.parse for a single-record file.)
seq_record_dictionary = SeqIO.to_dict(
    SeqIO.parse("human_mx1.fas", 'fasta')
)
sequence_record = list(seq_record_dictionary.values())[0]
sequence_record

SeqRecord(seq=Seq('ATGGTTGTTTCCGAAGTGGACATCGCAAAAGCTGATCCAGCTGCTGCATCCCAC...TAA'), id='lcl|XM_005260978.4_cds_XP_005261035.1_1', name='lcl|XM_005260978.4_cds_XP_005261035.1_1', description='lcl|XM_005260978.4_cds_XP_005261035.1_1 [gene=MX1] [db_xref=GeneID:4599] [protein=interferon-induced GTP-binding protein Mx1 isoform X1] [protein_id=XP_005261035.1] [location=458..2446] [gbkey=CDS]', dbxrefs=[])

In [5]:
# Derive the protein from the record's sequence: transcribe it to RNA, then
# translate the RNA, stopping at the first stop codon (to_stop=True).
rna = sequence_record.seq.transcribe()
protein = rna.translate(to_stop=True)
protein

Seq('MVVSEVDIAKADPAAASHPLLLNGDATVAQKNPGSVAENNLCSQYEEKVRPCID...FPG')

In [7]:
# Run BLASTn against the 'nt' database through NCBI's web service and store the
# raw XML response in a file, so that later cells can parse the saved report
# instead of querying NCBI again.
# An entrez_query=... argument can be added to qblast to restrict the search
# (e.g. by taxid).
result_handler = NCBIWWW.qblast('blastn', 'nt', sequence_record.seq)
try:
    result_storer = result_handler.read()
finally:
    # Always release the response handle, even if reading fails; once read it
    # is exhausted, so the text kept in result_storer is what gets saved.
    result_handler.close()

with open('search_from_parser2.xml', 'w') as savefile:
    savefile.write(result_storer)

In [27]:
# Load the saved BLAST XML report with SearchIO
results_searchio = SearchIO.read('search_from_parser2.xml', 'blast-xml')

# Unpack the upper object (QueryResult), one value per output line
print(
    results_searchio.seq_len,  # length of the DNA molecule submitted for analysis
    results_searchio.program,  # BLAST program that produced the report
    results_searchio.version,  # version of that program
    results_searchio.target,   # database that was searched
    results_searchio.id,       # query id ("No" is the default)
    sep="\n",
)
# The parameters used for the BLAST alignments can be retrieved through the
# results_searchio.param... attributes.

# To list the id of every result instead of a single one:
#for result in results_searchio:
#    print(result.id)

1989
blastn
2.12.0+
nt
No


In [28]:
# Unpack the first result (a Hit object). Only a single hit is shown to avoid
# excessive output; the query id, description and length are skipped because
# they were already unpacked from the upper (QueryResult) object.
result1 = results_searchio[0]
print(
    result1.id,
    result1.description,
    result1.seq_len,    # length of the hit sequence itself, not of the query
    result1.accession,
    sep="\n",
)


gi|1519315249|ref|NM_002462.5|
Homo sapiens MX dynamin like GTPase 1 (MX1), transcript variant 2, mRNA
2776
NM_002462


In [41]:
# Unpack the HSP (high scoring segment pair) of the same Hit object.
# Keep in mind that BLAST uses local alignment and could be biased.
# .hsps returns a list of length 1 for this hit, so take its only element.
result1_hsp = result1.hsps[0]
print(
    # scores
    result1_hsp.bitscore,
    result1_hsp.bitscore_raw,
    result1_hsp.evalue,
    # coordinates of the aligned region on the query and on the hit
    result1_hsp.query_start,
    result1_hsp.query_end,
    result1_hsp.hit_start,
    result1_hsp.hit_end,
    # frames
    result1_hsp.query_frame,
    result1_hsp.hit_frame,
    # alignment statistics
    result1_hsp.ident_num,
    result1_hsp.pos_num,
    result1_hsp.gap_num,
    result1_hsp.aln_span,
    # aligned query sequence record; important for running local alignments
    result1_hsp.query,
    sep="\n",
)

3588.19
3978
0.0
0
1989
333
2322
1
1
1989
1989
0
1989
ID: No
Name: aligned query sequence
Description: definition line
Number of features: 0
/molecule_type=DNA
Seq('ATGGTTGTTTCCGAAGTGGACATCGCAAAAGCTGATCCAGCTGCTGCATCCCAC...TAA')


In [None]:
#Import saved .xml with NCBIXML
