#Notebook for executing multiple organism filtered queries

In [7]:
import pandas as pd
import numpy as np
from Bio import SeqIO, SeqRecord, Seq, SearchIO
from Bio.Blast import NCBIWWW

In [8]:
#Create sequence record

with open('human_mx1.fas') as handle:
    sequence_record = SeqIO.read(handle, 'fasta')

sequence_record

SeqRecord(seq=Seq('ATGGTTGTTTCCGAAGTGGACATCGCAAAAGCTGATCCAGCTGCTGCATCCCAC...TAA'), id='lcl|XM_005260978.4_cds_XP_005261035.1_1', name='lcl|XM_005260978.4_cds_XP_005261035.1_1', description='lcl|XM_005260978.4_cds_XP_005261035.1_1 [gene=MX1] [db_xref=GeneID:4599] [protein=interferon-induced GTP-binding protein Mx1 isoform X1] [protein_id=XP_005261035.1] [location=458..2446] [gbkey=CDS]', dbxrefs=[])

In [None]:
#MULTIPLE ALIGNMENT STAIGTH FROM BLAST????
#CHANGE ALIGNMENT MATRIX?
#USE OF BLAT???

In [9]:
#Run BLASTn with multiple taxid in the same query
#all [filter] NOT(environmental samples[organism] OR metagenomes[orgn]) AND txid3702[ORGN] AND txid9606[ORGN]
result_handler = NCBIWWW.qblast('blastn', 'nt', sequence_record.seq, entrez_query= 'txid9527[ORGN] OR txid9592[ORGN]') #can add entrez_query='txid000000'
result_storer = result_handler.read()
with open('multiple_taxid_samerun.xml', 'w') as savefile:
        savefile.write(result_storer)

In [12]:
#Run BLASTn in for loop with one query per taxid
taxid_list = ['9527', '9592']

for id in taxid_list:
    handler = NCBIWWW.qblast('blastn', 'nt', sequence_record.seq, entrez_query= f'txid{id}[ORGN]')
    storer = handler.read()
    with open(f'for_loop_search_txid{id}.xml', 'w') as savefile:
        savefile.write(storer)


In [None]:
#Necessity to retrieve whole fasta files to perform multiple alignment

In [32]:
#Retrieve fasta files with Entrez module?
import Bio.Entrez

Bio.Entrez.email = 'A.N.Other@example.com'
fasta_handler = Bio.Entrez.efetch(db='nucleotide', id='JX297238', rettype = 'fasta')
fasta_search = SeqIO.read(fasta_handler, format= 'fasta')
print(fasta_search)
print(len(fasta_search.seq))


ID: JX297238.1
Name: JX297238.1
Description: JX297238.1 Macaca sylvanus myxovirus (influenza virus) resistance 1 (MxA) mRNA, complete cds
Number of features: 0
Seq('ATGGTTGTTTCCGAAGTGGACATTGTAAAAGCTGATCCAGCTGCTGCATCCCAA...TAA')
1986


In [34]:
#Function to retrieve all fastas from the result
def retrieve_all_fastas(list_of_entries):
    Bio.Entrez.email = 'A.N.Other@example.com'
    list_of_sequences = []
    for entry in list_of_entries:
        handler = Bio.Entrez.efetch(db='nucleotide', id=entry, rettype = 'fasta', retmax=1)
        fasta = SeqIO.read(handler, format='fasta')
        list_of_sequences.append(fasta)
    return list_of_sequences


list_entries = ['JX297238', 'XM_005260982', 'NM_002462']
f_dictionary = retrieve_all_fastas(list_entries)
print(f_dictionary)

[SeqRecord(seq=Seq('ATGGTTGTTTCCGAAGTGGACATTGTAAAAGCTGATCCAGCTGCTGCATCCCAA...TAA'), id='JX297238.1', name='JX297238.1', description='JX297238.1 Macaca sylvanus myxovirus (influenza virus) resistance 1 (MxA) mRNA, complete cds', dbxrefs=[]), SeqRecord(seq=Seq('GCGGGGTGAAAGAGGCGAAGCGAGAGCGGAGGCCGCACTCCAGCACTGCGCAGG...GAC'), id='XM_005260982.2', name='XM_005260982.2', description='XM_005260982.2 PREDICTED: Homo sapiens MX dynamin like GTPase 1 (MX1), transcript variant X8, mRNA', dbxrefs=[]), SeqRecord(seq=Seq('GCACTCCAGCACTGCGCAGGGACCGCCTTGGACCGCAGTTGCCGGCCAGGAATC...AGA'), id='NM_002462.5', name='NM_002462.5', description='NM_002462.5 Homo sapiens MX dynamin like GTPase 1 (MX1), transcript variant 2, mRNA', dbxrefs=[])]


In [43]:
#Could have done it making straight a dictionary but best keeping them separate for now
def fastas_to_dictionary(list_of_fastas):
    dictionary_f = {'ID': [], 'Name' : [], 'Description':[], 'N_features':[], 'Sequence': []}
    for fasta in list_of_fastas:
        dictionary_f['ID'].append(fasta.id)
        dictionary_f['Name'].append(fasta.name)
        dictionary_f['Description'].append(fasta.description)
        dictionary_f['N_features'].append(fasta.features)
        dictionary_f['Sequence'].append(fasta.seq)
    return dictionary_f

dict_entries = fastas_to_dictionary(f_dictionary)
print(dict_entries)


{'ID': ['JX297238.1', 'XM_005260982.2', 'NM_002462.5'], 'Name': ['JX297238.1', 'XM_005260982.2', 'NM_002462.5'], 'Description': ['JX297238.1 Macaca sylvanus myxovirus (influenza virus) resistance 1 (MxA) mRNA, complete cds', 'XM_005260982.2 PREDICTED: Homo sapiens MX dynamin like GTPase 1 (MX1), transcript variant X8, mRNA', 'NM_002462.5 Homo sapiens MX dynamin like GTPase 1 (MX1), transcript variant 2, mRNA'], 'N_features': [[], [], []], 'Sequence': [Seq('ATGGTTGTTTCCGAAGTGGACATTGTAAAAGCTGATCCAGCTGCTGCATCCCAA...TAA'), Seq('GCGGGGTGAAAGAGGCGAAGCGAGAGCGGAGGCCGCACTCCAGCACTGCGCAGG...GAC'), Seq('GCACTCCAGCACTGCGCAGGGACCGCCTTGGACCGCAGTTGCCGGCCAGGAATC...AGA')]}


In [45]:
#Create a list of entries from the query

results_mult_taxid = SearchIO.read('multiple_taxid_samerun.xml', 'blast-xml')
query_entry_list = []
for result in results_mult_taxid:
    query_entry_list.append(result.accession)

#print(query_entry_list)


In [47]:
query_fasta = retrieve_all_fastas(query_entry_list)
query_dictionary = fastas_to_dictionary(query_fasta)
#print(query_dictionary)

In [None]:
#NEXT!!! MULTIPLE ALIGNMENT + PRODUCING PHYLOGENETIC TREE

In [None]:
#Multiple alignment 
import Bio.AlignIO



In [None]:
#To retrieve fasta files of the hits? 
import urllib.request
url = 'https://www.uniprot.org/uniprot/Q9LZP9.fasta'
urllib.request.urlretrieve(url, "chain_N.faa")
