#Notebook for executing multiple organism filtered queries

In [6]:
import pandas as pd
import numpy as np
from Bio import SeqIO, SeqRecord, Seq, SearchIO
import Bio.Entrez
from Bio.Blast import NCBIWWW

In [88]:
#Create sequence record

# Parse the FASTA file 'human_mx1.fas' into a single SeqRecord that is used as
# the BLAST query in the cells below.
with open('human_mx1.fas') as handle:
    sequence_record = SeqIO.read(handle, 'fasta')

# Bare expression so the notebook displays the parsed record.
sequence_record

SeqRecord(seq=Seq('ATGGTTGTTTCCGAAGTGGACATCGCAAAAGCTGATCCAGCTGCTGCATCCCAC...TAA'), id='lcl|XM_005260978.4_cds_XP_005261035.1_1', name='lcl|XM_005260978.4_cds_XP_005261035.1_1', description='lcl|XM_005260978.4_cds_XP_005261035.1_1 [gene=MX1] [db_xref=GeneID:4599] [protein=interferon-induced GTP-binding protein Mx1 isoform X1] [protein_id=XP_005261035.1] [location=458..2446] [gbkey=CDS]', dbxrefs=[])

In [None]:
#MULTIPLE ALIGNMENT STRAIGHT FROM BLAST????
#CHANGE ALIGNMENT MATRIX?
#USE OF BLAT???

In [9]:
#Run BLASTn with multiple taxid in the same query
#all [filter] NOT(environmental samples[organism] OR metagenomes[orgn]) AND txid3702[ORGN] AND txid9606[ORGN]
# Submit one remote BLASTn search against 'nt'; the Entrez query ORs both taxids
# so a single request covers the two organisms.
result_handler = NCBIWWW.qblast('blastn', 'nt', sequence_record.seq, entrez_query= 'txid9527[ORGN] OR txid9592[ORGN]') #entrez_query restricts the search to the listed taxids
result_storer = result_handler.read()
# Save the returned text to disk; a later cell parses this file with SearchIO.
with open('multiple_taxid_samerun.xml', 'w') as savefile:
        savefile.write(result_storer)

In [12]:
#Run BLASTn in for loop with one query per taxid
taxid_list = ['9527', '9592']

# One BLASTn request per taxid; each result is written to its own XML file.
# The loop variable is called `taxid` rather than `id` so the built-in id()
# is not overwritten in the kernel namespace once this cell has run.
for taxid in taxid_list:
    handler = NCBIWWW.qblast('blastn', 'nt', sequence_record.seq, entrez_query=f'txid{taxid}[ORGN]')
    storer = handler.read()
    with open(f'for_loop_search_txid{taxid}.xml', 'w') as savefile:
        savefile.write(storer)


In [94]:
#Function to run blast with a list of taxid, if list is empty no taxid is specified
def blastn_with_taxid(sequence, list_taxid, filename):
    """Run a remote BLASTn search against 'nt' and save the result to disk.

    Parameters
    ----------
    sequence
        Query sequence, passed unchanged to ``NCBIWWW.qblast``.
    list_taxid
        Iterable of taxonomy ids. When it is empty the search is submitted
        without any ``entrez_query`` restriction; otherwise the ids are
        combined as ``txidA[ORGN] OR txidB[ORGN] ...``.
    filename
        Output path without extension; ``.xml`` is appended.

    Returns
    -------
    None. The text returned by the BLAST handle is written to
    ``{filename}.xml``.
    """
    # Build the filter from the argument itself. The previous implementation
    # compared each id with the *global* ``taxid_list[-1]`` to decide where to
    # put ' OR ', which failed when that global was missing or different, and
    # dropped a separator when the last id also occurred earlier in the list.
    entrez_query = ' OR '.join(f'txid{taxid}[ORGN]' for taxid in list_taxid)

    if entrez_query:
        result_handler = NCBIWWW.qblast('blastn', 'nt', sequence, entrez_query=entrez_query)
    else:
        # No taxid given: leave qblast's default (unrestricted) Entrez query.
        result_handler = NCBIWWW.qblast('blastn', 'nt', sequence)
    result_storer = result_handler.read()

    with open(f'{filename}.xml', 'w') as savefile:
        savefile.write(result_storer)

# Try blastn_with_taxid with a single taxid; the result is written to
# 'test_taxid_function.xml'. Note that this rebinds the global taxid_list.
taxid_list = ['9592']

blastn_with_taxid(sequence_record.seq, taxid_list, 'test_taxid_function')


In [95]:
# Parse the XML written by blastn_with_taxid to check that the search produced hits.
results_test_blast_function = SearchIO.read('test_taxid_function.xml', 'blast-xml')
# Bare expression so the notebook displays the parsed result summary.
results_test_blast_function

QueryResult(id='No', 50 hits)

In [None]:
#Necessity to retrieve whole fasta files to perform multiple alignment

In [32]:
#Retrieve fasta files with Entrez module?
import Bio.Entrez

# Contact address sent along with the Entrez requests.
Bio.Entrez.email = 'A.N.Other@example.com'
fasta_handler = Bio.Entrez.efetch(db='nucleotide', id='JX297238', rettype='fasta')
try:
    fasta_search = SeqIO.read(fasta_handler, format='fasta')
finally:
    # Release the network handle even if parsing fails.
    fasta_handler.close()
print(fasta_search)
print(len(fasta_search.seq))


ID: JX297238.1
Name: JX297238.1
Description: JX297238.1 Macaca sylvanus myxovirus (influenza virus) resistance 1 (MxA) mRNA, complete cds
Number of features: 0
Seq('ATGGTTGTTTCCGAAGTGGACATTGTAAAAGCTGATCCAGCTGCTGCATCCCAA...TAA')
1986


In [67]:
#Function to retrieve all fastas from the result
def retrieve_all_fastas(list_of_entries):
    """Download each nucleotide entry as FASTA and parse it into a SeqRecord.

    Parameters
    ----------
    list_of_entries
        Iterable of identifiers accepted by ``Bio.Entrez.efetch`` for the
        'nucleotide' database (the notebook passes accession strings).

    Returns
    -------
    list
        One parsed record per entry, in input order. An empty input yields
        an empty list and performs no request.

    Side effects: sets ``Bio.Entrez.email`` and performs one efetch request
    per entry.
    """
    Bio.Entrez.email = 'A.N.Other@example.com'
    list_of_sequences = []
    for entry in list_of_entries:
        # rettype='fasta': the record is requested in FASTA format.
        handler = Bio.Entrez.efetch(db='nucleotide', id=entry, rettype='fasta', retmax=1)
        try:
            list_of_sequences.append(SeqIO.read(handler, format='fasta'))
        finally:
            # Close each handle as soon as it is parsed so a long entry list
            # does not accumulate open connections.
            handler.close()
    return list_of_sequences


# Three example nucleotide accessions used to try out retrieve_all_fastas.
list_entries = ['JX297238', 'XM_005260982', 'NM_002462']
# NOTE: despite its name, f_dictionary holds the *list* returned by retrieve_all_fastas.
f_dictionary = retrieve_all_fastas(list_entries)
print(f_dictionary)

[SeqRecord(seq=Seq('ATGGTTGTTTCCGAAGTGGACATTGTAAAAGCTGATCCAGCTGCTGCATCCCAA...TAA'), id='JX297238.1', name='JX297238.1', description='JX297238.1 Macaca sylvanus myxovirus (influenza virus) resistance 1 (MxA) mRNA, complete cds', dbxrefs=[]), SeqRecord(seq=Seq('GCGGGGTGAAAGAGGCGAAGCGAGAGCGGAGGCCGCACTCCAGCACTGCGCAGG...GAC'), id='XM_005260982.2', name='XM_005260982.2', description='XM_005260982.2 PREDICTED: Homo sapiens MX dynamin like GTPase 1 (MX1), transcript variant X8, mRNA', dbxrefs=[]), SeqRecord(seq=Seq('GCACTCCAGCACTGCGCAGGGACCGCCTTGGACCGCAGTTGCCGGCCAGGAATC...AGA'), id='NM_002462.5', name='NM_002462.5', description='NM_002462.5 Homo sapiens MX dynamin like GTPase 1 (MX1), transcript variant 2, mRNA', dbxrefs=[])]


In [43]:
#Could have done it making straight a dictionary but best keeping them separate for now
def fastas_to_dictionary(list_of_fastas):
    """Turn a sequence of record objects into a column-oriented dictionary.

    Every record must expose the attributes ``id``, ``name``, ``description``,
    ``features`` and ``seq``. The result maps each column label to a list
    holding that attribute for every record, in input order.

    NOTE(review): the 'N_features' column receives the ``features`` attribute
    itself (a list in the notebook output), not a count - confirm that this
    is what downstream code expects.
    """
    # (column label, record attribute) pairs, in output column order.
    layout = (
        ('ID', 'id'),
        ('Name', 'name'),
        ('Description', 'description'),
        ('N_features', 'features'),
        ('Sequence', 'seq'),
    )
    table = {label: [] for label, _ in layout}
    # Single pass over the input so one-shot iterables are handled as well.
    for record in list_of_fastas:
        for label, attribute in layout:
            table[label].append(getattr(record, attribute))
    return table

# Convert the list of downloaded records (f_dictionary) into column lists.
dict_entries = fastas_to_dictionary(f_dictionary)
print(dict_entries)


{'ID': ['JX297238.1', 'XM_005260982.2', 'NM_002462.5'], 'Name': ['JX297238.1', 'XM_005260982.2', 'NM_002462.5'], 'Description': ['JX297238.1 Macaca sylvanus myxovirus (influenza virus) resistance 1 (MxA) mRNA, complete cds', 'XM_005260982.2 PREDICTED: Homo sapiens MX dynamin like GTPase 1 (MX1), transcript variant X8, mRNA', 'NM_002462.5 Homo sapiens MX dynamin like GTPase 1 (MX1), transcript variant 2, mRNA'], 'N_features': [[], [], []], 'Sequence': [Seq('ATGGTTGTTTCCGAAGTGGACATTGTAAAAGCTGATCCAGCTGCTGCATCCCAA...TAA'), Seq('GCGGGGTGAAAGAGGCGAAGCGAGAGCGGAGGCCGCACTCCAGCACTGCGCAGG...GAC'), Seq('GCACTCCAGCACTGCGCAGGGACCGCCTTGGACCGCAGTTGCCGGCCAGGAATC...AGA')]}


In [45]:
#Create a list of entries from the query

# Load the BLAST XML saved by the multi-taxid search and collect the
# `accession` attribute of every item yielded by the parsed result.
results_mult_taxid = SearchIO.read('multiple_taxid_samerun.xml', 'blast-xml')
query_entry_list = []
for result in results_mult_taxid:
    query_entry_list.append(result.accession)

#print(query_entry_list)


In [47]:
# Download the FASTA record of every accession collected from the BLAST result,
# then reshape the records into column lists.
query_fasta = retrieve_all_fastas(query_entry_list)
query_dictionary = fastas_to_dictionary(query_fasta)
#print(query_dictionary)

In [30]:
#Retrieve files from genbank

# Contact address sent along with the Entrez requests.
Bio.Entrez.email = 'A.N.Other@example.com'
gb_handler = Bio.Entrez.efetch(db='nucleotide', id='JX297238', rettype='gb', retmode='xml')
try:
    # Entrez.read has no format parameter: its second positional argument is
    # the `validate` flag, so the former 'genbank' string only acted as a
    # truthy value (the default). It is dropped to avoid suggesting otherwise.
    gb_content = Bio.Entrez.read(gb_handler)
finally:
    # Release the network handle even if parsing fails.
    gb_handler.close()
print(gb_content)

[{'GBSeq_locus': 'JX297238', 'GBSeq_length': '1986', 'GBSeq_strandedness': 'single', 'GBSeq_moltype': 'mRNA', 'GBSeq_topology': 'linear', 'GBSeq_division': 'PRI', 'GBSeq_update-date': '01-NOV-2012', 'GBSeq_create-date': '01-NOV-2012', 'GBSeq_definition': 'Macaca sylvanus myxovirus (influenza virus) resistance 1 (MxA) mRNA, complete cds', 'GBSeq_primary-accession': 'JX297238', 'GBSeq_accession-version': 'JX297238.1', 'GBSeq_other-seqids': ['gb|JX297238.1|', 'gi|408689532'], 'GBSeq_source': 'Macaca sylvanus (Barbary ape)', 'GBSeq_organism': 'Macaca sylvanus', 'GBSeq_taxonomy': 'Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Cercopithecidae; Cercopithecinae; Macaca', 'GBSeq_references': [{'GBReference_reference': '1', 'GBReference_position': '1..1986', 'GBReference_authors': ['Mitchell,P.S.', 'Patzina,C.', 'Emerman,M.', 'Haller,O.', 'Malik,H.S.', 'Kochs,G.'], 'GBReference_title': 'Evolution-Guided 

In [71]:
#Function to retrieve all genbank from the result
def retrieve_all_genbank(list_of_entries):
    """Download each nucleotide entry as GenBank XML and parse it.

    Parameters
    ----------
    list_of_entries
        Iterable of identifiers accepted by ``Bio.Entrez.efetch`` for the
        'nucleotide' database (the notebook passes accession strings).

    Returns
    -------
    list
        One ``Bio.Entrez.read`` result per entry, in input order. In the
        notebook output each result is itself a list whose first element is
        the record dictionary. An empty input yields an empty list.

    Side effects: sets ``Bio.Entrez.email`` and performs one efetch request
    per entry.
    """
    Bio.Entrez.email = 'A.N.Other@example.com'
    list_of_sequences = []
    for entry in list_of_entries:
        # rettype='gb' + retmode='xml': GenBank record delivered as XML.
        handler = Bio.Entrez.efetch(db='nucleotide', id=entry, rettype='gb', retmode='xml', retmax=1)
        try:
            # Entrez.read takes no format argument; the former 'genbank'
            # string was silently interpreted as the `validate` flag.
            list_of_sequences.append(Bio.Entrez.read(handler))
        finally:
            # Close each handle so a long entry list does not leak connections.
            handler.close()
    return list_of_sequences


# Same three example accessions as in the FASTA test, now fetched as GenBank XML.
list_entries = ['JX297238', 'XM_005260982', 'NM_002462']
genbank_file_stack = retrieve_all_genbank(list_entries)
print(genbank_file_stack)

[[{'GBSeq_locus': 'JX297238', 'GBSeq_length': '1986', 'GBSeq_strandedness': 'single', 'GBSeq_moltype': 'mRNA', 'GBSeq_topology': 'linear', 'GBSeq_division': 'PRI', 'GBSeq_update-date': '01-NOV-2012', 'GBSeq_create-date': '01-NOV-2012', 'GBSeq_definition': 'Macaca sylvanus myxovirus (influenza virus) resistance 1 (MxA) mRNA, complete cds', 'GBSeq_primary-accession': 'JX297238', 'GBSeq_accession-version': 'JX297238.1', 'GBSeq_other-seqids': ['gb|JX297238.1|', 'gi|408689532'], 'GBSeq_source': 'Macaca sylvanus (Barbary ape)', 'GBSeq_organism': 'Macaca sylvanus', 'GBSeq_taxonomy': 'Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Cercopithecidae; Cercopithecinae; Macaca', 'GBSeq_references': [{'GBReference_reference': '1', 'GBReference_position': '1..1986', 'GBReference_authors': ['Mitchell,P.S.', 'Patzina,C.', 'Emerman,M.', 'Haller,O.', 'Malik,H.S.', 'Kochs,G.'], 'GBReference_title': 'Evolution-Guided

In [86]:
#Create a dictionary with the genbank results


def find_taxon(list_feature_table):
    """Return the 'taxon...' db_xref value(s) found on 'source' features.

    Parameters
    ----------
    list_feature_table
        Iterable of feature dictionaries, each with a 'GBFeature_key' entry
        and, optionally, a 'GBFeature_quals' list of qualifier dictionaries
        ('GBQualifier_name' / 'GBQualifier_value').

    Returns
    -------
    str
        All matching qualifier values concatenated without separator, or ''
        when nothing matches.
    """
    taxon = ''
    for feature in list_feature_table:
        if feature['GBFeature_key'] != 'source':
            continue
        # A feature without a 'GBFeature_quals' entry is treated as having
        # no qualifiers instead of raising KeyError.
        for qualifier in feature.get('GBFeature_quals', []):
            if qualifier['GBQualifier_name'] == 'db_xref' and qualifier['GBQualifier_value'].startswith('taxon'):
                taxon += qualifier['GBQualifier_value']
    return taxon

def find_prot_id(list_feat_table):
    """Return the 'protein_id' qualifier value(s) found on 'CDS' features.

    Parameters
    ----------
    list_feat_table
        Iterable of feature dictionaries, each with a 'GBFeature_key' entry
        and, optionally, a 'GBFeature_quals' list of qualifier dictionaries
        ('GBQualifier_name' / 'GBQualifier_value').

    Returns
    -------
    str
        All matching values concatenated without separator, or '' when no
        CDS feature carries a protein_id.
    """
    prot_id = ''
    for feature in list_feat_table:
        if feature['GBFeature_key'] != 'CDS':
            continue
        # A feature without a 'GBFeature_quals' entry is treated as having
        # no qualifiers instead of raising KeyError.
        for qualifier in feature.get('GBFeature_quals', []):
            if qualifier['GBQualifier_name'] == 'protein_id':
                prot_id += qualifier['GBQualifier_value']
    return prot_id

def find_prot_seq(list_feat_table):
    """Return the 'translation' qualifier value(s) found on 'CDS' features.

    Parameters
    ----------
    list_feat_table
        Iterable of feature dictionaries, each with a 'GBFeature_key' entry
        and, optionally, a 'GBFeature_quals' list of qualifier dictionaries
        ('GBQualifier_name' / 'GBQualifier_value').

    Returns
    -------
    str
        All matching values concatenated without separator, or '' when no
        CDS feature carries a translation.
    """
    prot_seq = ''
    for feature in list_feat_table:
        if feature['GBFeature_key'] != 'CDS':
            continue
        # A feature without a 'GBFeature_quals' entry is treated as having
        # no qualifiers instead of raising KeyError.
        for qualifier in feature.get('GBFeature_quals', []):
            if qualifier['GBQualifier_name'] == 'translation':
                prot_seq += qualifier['GBQualifier_value']
    return prot_seq


def genbank_to_dictionary(list_of_genbank):
    """Flatten parsed GenBank results into a column-oriented dictionary.

    Each element of *list_of_genbank* is indexed with ``[0]`` to obtain the
    record mapping. Eight 'GBSeq_*' values are copied as they are; the taxon,
    protein id and protein sequence are derived from the record's
    'GBSeq_feature-table' through find_taxon, find_prot_id and find_prot_seq.

    Returns a dict mapping each column name to a list with one value per
    input element, in input order.
    """
    # (output column, record key) pairs copied verbatim, in column order.
    copied_fields = (
        ('Primary_accession', 'GBSeq_primary-accession'),
        ('Accession_version', 'GBSeq_accession-version'),
        ('Gene_length', 'GBSeq_length'),
        ('Strandedness', 'GBSeq_strandedness'),
        ('Molecule_type', 'GBSeq_moltype'),
        ('Organism', 'GBSeq_organism'),
        ('Taxonomy', 'GBSeq_taxonomy'),
        ('Nuc_sequence', 'GBSeq_sequence'),
    )
    column_names = [column for column, _ in copied_fields] + ['Taxon', 'Protein_ID', 'Prot_sequence']
    dictionary_gen = {column: [] for column in column_names}

    for parsed in list_of_genbank:
        record = parsed[0]
        for column, record_key in copied_fields:
            dictionary_gen[column].append(record[record_key])
        # Columns derived from the feature table.
        feature_rows = record['GBSeq_feature-table']
        dictionary_gen['Taxon'].append(find_taxon(feature_rows))
        dictionary_gen['Protein_ID'].append(find_prot_id(feature_rows))
        dictionary_gen['Prot_sequence'].append(find_prot_seq(feature_rows))
    return dictionary_gen


# Flatten the three downloaded GenBank results into column lists and show them.
genbank_dict = genbank_to_dictionary(genbank_file_stack)
print(genbank_dict)
        


{'Primary_accession': ['JX297238', 'XM_005260982', 'NM_002462'], 'Accession_version': ['JX297238.1', 'XM_005260982.2', 'NM_002462.5'], 'Gene_length': ['1986', '2731', '2776'], 'Strandedness': ['single', 'single', 'single'], 'Molecule_type': ['mRNA', 'mRNA', 'mRNA'], 'Organism': ['Macaca sylvanus', 'Homo sapiens', 'Homo sapiens'], 'Taxonomy': ['Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Cercopithecidae; Cercopithecinae; Macaca', 'Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; Homo', 'Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; Homo'], 'Nuc_sequence': ['atggttgtttccgaagtggacattgtaaaagctgatccagctgctgcatcccaacctctattactgaatggagatgctgacgtggcccagaaaagtccgggctcggtggctgagaacaacctgtgcagccagtatga

In [85]:
# Explore a single parsed GenBank result: gb_content is indexed with [0] to get
# the record mapping, from which the reference list and feature table are taken.
print(gb_content[0]['GBSeq_locus'])
gb_content1 = gb_content[0]
reference_list = gb_content1['GBSeq_references']
feature_table = gb_content1['GBSeq_feature-table']

def count_n_references(list):
    """Return the number of entries in the given reference list.

    The parameter keeps its original name ``list`` for backward
    compatibility, although it shadows the built-in inside this function.
    The previous body ignored the argument and measured the global
    ``reference_list`` instead.
    """
    return len(list)

def find_taxon(list_feature_table):
    """Return the 'taxon...' db_xref value(s) found on 'source' features.

    NOTE: a function with this name is also defined earlier in the notebook;
    whichever cell runs last wins. This variant appends a space after every
    value.

    Parameters
    ----------
    list_feature_table
        Iterable of feature dictionaries, each with a 'GBFeature_key' entry
        and, optionally, a 'GBFeature_quals' list of qualifier dictionaries
        ('GBQualifier_name' / 'GBQualifier_value').

    Returns
    -------
    str
        Every matching value followed by one space (so the result ends with
        a trailing space), or '' when nothing matches.
    """
    taxon = ''
    for feature in list_feature_table:
        if feature['GBFeature_key'] != 'source':
            continue
        # The leftover debug print('x') was removed. A feature without a
        # 'GBFeature_quals' entry is treated as having no qualifiers.
        for qualifier in feature.get('GBFeature_quals', []):
            if qualifier['GBQualifier_name'] == 'db_xref' and qualifier['GBQualifier_value'].startswith('taxon'):
                taxon += f"{qualifier['GBQualifier_value']} "
    return taxon


def find_prot_id(list_feat_table):
    """Return the 'protein_id' qualifier value(s) found on 'CDS' features.

    NOTE: a function with this name is also defined earlier in the notebook;
    whichever cell runs last wins.

    Parameters
    ----------
    list_feat_table
        Iterable of feature dictionaries, each with a 'GBFeature_key' entry
        and, optionally, a 'GBFeature_quals' list of qualifier dictionaries
        ('GBQualifier_name' / 'GBQualifier_value').

    Returns
    -------
    str
        All matching values concatenated without separator, or '' when no
        CDS feature carries a protein_id.
    """
    collected = []
    for feature in list_feat_table:
        if feature['GBFeature_key'] == 'CDS':
            # A feature without a 'GBFeature_quals' entry is treated as
            # having no qualifiers instead of raising KeyError.
            for qualifier in feature.get('GBFeature_quals', []):
                if qualifier['GBQualifier_name'] == 'protein_id':
                    collected.append(qualifier['GBQualifier_value'])
    return ''.join(collected)

def find_prot_seq(list_feat_table):
    """Return the 'translation' qualifier value(s) found on 'CDS' features.

    NOTE: a function with this name is also defined earlier in the notebook;
    whichever cell runs last wins.

    Parameters
    ----------
    list_feat_table
        Iterable of feature dictionaries, each with a 'GBFeature_key' entry
        and, optionally, a 'GBFeature_quals' list of qualifier dictionaries
        ('GBQualifier_name' / 'GBQualifier_value').

    Returns
    -------
    str
        All matching values concatenated without separator, or '' when no
        CDS feature carries a translation.
    """
    collected = []
    for feature in list_feat_table:
        if feature['GBFeature_key'] == 'CDS':
            # A feature without a 'GBFeature_quals' entry is treated as
            # having no qualifiers instead of raising KeyError.
            for qualifier in feature.get('GBFeature_quals', []):
                if qualifier['GBQualifier_name'] == 'translation':
                    collected.append(qualifier['GBQualifier_value'])
    return ''.join(collected)


# Print a few values pulled from the single record explored in this cell.
print(count_n_references(reference_list))
print(reference_list[0]['GBReference_xref'][0]['GBXref_id']) #!!! Not all records have DOIs, and they are not uniformly formatted
print(gb_content1['GBSeq_organism'])

# Values extracted from the record's feature table.
print(find_taxon(feature_table))
print(find_prot_id(feature_table))
print(find_prot_seq(feature_table))

JX297238
2
10.1016/j.chom.2012.09.005
Macaca sylvanus
x
taxon:9546 
AFU81310.1
MVVSEVDIVKADPAAASQPLLLNGDADVAQKSPGSVAENNLCSQYEEKVRPCIDLIDSLRALGVEQDLALPAIAVIGDQSSGKSSVLEALSGVALPRGSGIVTRCPLVLKLKKLVNEDEWRGKVSYQDYEIEILDASEVEEEINKAQNTIAGEGMGISHELITLEISSRDVPDLTLIDLPGITRVAVGNQPPDIGYKIKTLIRKYIQRQETINLVVVPSNVDIATTEALSMAQEVDPEGDRTIGILTKPDLVDKGTEDKVVDVVRNLVFHLKKGYMIVKCRGQQEIQDQLSLSEALQREKIFFEVHPHFRDLLEEGKATIPCLAEKLTSELIAHICKSLPLLENQIKESHQGITEELQKYGVDIPEDENEKMFFLIDKINAFNQDITALIQGEETVGEDDSRLFTRLRREFHKWGIIIENNLQEGHKITSRKMQKFENQYRGRELPGFVNYRTFETIVKQQIKALEEPAVNMLHTVTDMVRLAFTDVSMKNFEELFNLHRTAKSKIEDIRTEQEREGEKLIRLHFQMEQIVYCQDQVYRGALQKVREKELEEEKKKKSWDVGTFQSSSTDSSMEEIFQHLMAYHQEASKRISSHIPLIIQFFMLQTYGQQLQKAMLQLLQDKDTYSWLLKERSDTSDKRKFLKERLARLTQARRRLAQFPG


In [None]:
#
#
#
#Function to find sequences by taxid, Function to deal w multiple hsps, function to align and create a matrix
#
#
#

In [None]:
#Function to write a 'fasta' file with the sequences to align

In [None]:
#NEXT!!! MULTIPLE ALIGNMENT + PRODUCING PHYLOGENETIC TREE

#Probably best using ClustalW from the command line

In [None]:
#Multiple alignment 
import Bio.AlignIO


In [None]:
#To retrieve fasta files of the hits? 
import urllib.request
url = 'https://www.uniprot.org/uniprot/Q9LZP9.fasta'
# Download the file at `url` and store it locally as "chain_N.faa".
urllib.request.urlretrieve(url, "chain_N.faa")


In [None]:
#Entrez.read requires the file to be opened in binary mode!
>>> from Bio import Entrez
>>> handle = open("Entrez/esearch1.xml", "rb")  # opened in binary mode
>>> record = Entrez.read(handle)
>>> print(record['QueryTranslation'])
biopython[All Fields]
>>> handle.close()