In [None]:
#Notebook that sums up all the functions so far

In [None]:
#
#
#
#Function to find sequences by taxid, Function to deal with multiple hsps, function to align and create a matrix
#
#
#

In [1]:
import os
import pandas as pd
import numpy as np
from Bio import SeqIO, SeqRecord, Seq, SearchIO, AlignIO, Phylo
from Bio.Blast import NCBIWWW, NCBIXML
import Bio.Entrez

In [2]:
def open_fasta(filename):
    with open(filename) as handle:
        sequence_record = SeqIO.read(handle, 'fasta')
    return sequence_record

fasta_record = open_fasta('human_mx1.fas')
print(fasta_record)

ID: lcl|XM_005260978.4_cds_XP_005261035.1_1
Name: lcl|XM_005260978.4_cds_XP_005261035.1_1
Description: lcl|XM_005260978.4_cds_XP_005261035.1_1 [gene=MX1] [db_xref=GeneID:4599] [protein=interferon-induced GTP-binding protein Mx1 isoform X1] [protein_id=XP_005261035.1] [location=458..2446] [gbkey=CDS]
Number of features: 0
Seq('ATGGTTGTTTCCGAAGTGGACATCGCAAAAGCTGATCCAGCTGCTGCATCCCAC...TAA')


In [3]:
#Function to run blast with a list of taxid, if list is empty no taxid is specified
def blastn_with_taxid(sequence, filename, list_taxid = []):
    result_handler, result_storer = None, None
    if len(list_taxid) <1:
        result_handler = NCBIWWW.qblast('blastn', 'nt', sequence) 
        result_storer = result_handler.read()
    else:
        entrez_query = ''
        for taxid in list_taxid:
            entrez_query += f'txid{taxid}[ORGN]'
            if taxid != taxid_list[-1]:
                entrez_query += ' OR '
        result_handler = NCBIWWW.qblast('blastn', 'nt', sequence, entrez_query= entrez_query)
        result_storer = result_handler.read()
    with open(f'{filename}.xml', 'w') as savefile:
        savefile.write(result_storer)

taxid_list = ['9592']

blastn_with_taxid(fasta_record.seq, 'test_taxid_function', list_taxid = taxid_list)

In [4]:
#Creation of dictionary with all HSPs

def blast_to_dictionary(blastresult):
    blast_dictionary = {'id' : [], 'description' : [], 'seq_length' : [], 'accession' : [], 'bitscore' : [], 'bitscore_raw' : [], 
    'evalue' : [], 'hit_start' : [], 'hit_end' : [], 'query_frame' : [], 'gap_num' : [], 'aln_span' : []}
    for result in blastresult:
        blast_dictionary['id'].append(result.id)
        blast_dictionary['description'].append(result.description)
        blast_dictionary['seq_length'].append(result.seq_len)
        blast_dictionary['accession'].append(result.accession)
        bitscore, bitscore_raw, evalue, hitstart, hitend, queryframe, gapnum, alnspan = '','','','','','','',''
        for hsp in result.hsps:
            bitscore += str(hsp.bitscore)
            bitscore_raw += str(hsp.bitscore_raw)
            evalue += str(hsp.evalue)
            hitstart += str(hsp.hit_start)
            hitend += str(hsp.hit_end)
            queryframe += str(hsp.query_frame)
            gapnum += str(hsp.gap_num)
            alnspan += str(hsp.aln_span)
            if hsp != result.hsps[-1]:
                bitscore += '/'
                bitscore_raw += '/' 
                evalue += '/'
                hitstart += '/'
                hitend += '/'
                queryframe += '/'
                gapnum += '/' 
                alnspan += '/' #I know it's not neat but it would give an error otherwise :(
        blast_dictionary['bitscore'].append(bitscore)
        blast_dictionary['bitscore_raw'].append(bitscore_raw)
        blast_dictionary['evalue'].append(evalue)
        blast_dictionary['hit_start'].append(hitstart)
        blast_dictionary['hit_end'].append(hitend)
        blast_dictionary['query_frame'].append(queryframe)
        blast_dictionary['gap_num'].append(gapnum)
        blast_dictionary['aln_span'].append(alnspan)

    return blast_dictionary            


results_handler = SearchIO.read('test_taxid_function.xml', 'blast-xml')
dictionary_blast_results = blast_to_dictionary(results_handler)
print(dictionary_blast_results)


{'id': ['gi|525344602|ref|NM_001279761.1|', 'gi|1753056057|ref|XM_019017591.2|', 'gi|1753056056|ref|XM_019017590.2|', 'gi|1753056055|ref|XM_019017589.2|', 'gi|1753056054|ref|XM_019017588.2|', 'gi|1753056053|ref|XM_019017587.2|', 'gi|1753056052|ref|XM_019017586.2|', 'gi|1753056051|ref|XM_019017585.2|', 'gi|1753056058|ref|XM_031005077.1|', 'gi|1753056672|ref|XM_019017900.2|', 'gi|1753056670|ref|XM_004062835.3|', 'gi|1753056668|ref|XM_019017898.2|', 'gi|948542510|gb|KT698238.1|', 'gi|1753053231|ref|XM_031004421.1|', 'gi|1753086199|ref|XR_004071381.1|', 'gi|1753086198|ref|XR_004071380.1|', 'gi|1753086196|ref|XM_031014265.1|', 'gi|1753086194|ref|XM_031014264.1|', 'gi|1753086192|ref|XM_031014263.1|', 'gi|1753086190|ref|XM_031014262.1|', 'gi|1753086188|ref|XM_031014261.1|', 'gi|1753086186|ref|XM_031014260.1|', 'gi|1753086184|ref|XM_031014259.1|', 'gi|1753086182|ref|XM_031014258.1|', 'gi|1753086180|ref|XM_031014257.1|', 'gi|1753086178|ref|XM_031014256.1|', 'gi|1753024318|ref|XM_031012314.1|', 

In [20]:
#Creation of dictionary with all HSPs adding identity and other metrics

#Think about identity metrics useful for determining relatedness

def blast_to_dictionary_plus_metrics(blastresult):
    blast_dictionary = {'ID' : [], 'Description' : [], 'Seq_length' : [], 'Accession' : [], 'Bitscore' : [], 'Bitscore_raw' : [], 
    'Evalue' : [], 'Hit_start' : [], 'Hit_end' : [], 'Query_frame' : [], 'Gap_num' : [], 'Aln_span' : [], 'Tot_aln_span':[], 'Identity' :[]}
    for result in blastresult:
        blast_dictionary['ID'].append(result.id)
        blast_dictionary['Description'].append(result.description)
        blast_dictionary['Seq_length'].append(result.seq_len)
        blast_dictionary['Accession'].append(result.accession)
        bitscore, bitscore_raw, evalue, hitstart, hitend, queryframe, gapnum, alnspan = '','','','','','','',''
        all_alnspan, all_gapnum = [],[] #seq_len is not required
        for hsp in result.hsps:
            bitscore += str(hsp.bitscore)
            bitscore_raw += str(hsp.bitscore_raw)
            evalue += str(hsp.evalue)
            hitstart += str(hsp.hit_start)
            hitend += str(hsp.hit_end)
            queryframe += str(hsp.query_frame)
            gapnum += str(hsp.gap_num)
            alnspan += str(hsp.aln_span)
            if hsp != result.hsps[-1]:
                bitscore += '/'
                bitscore_raw += '/' 
                evalue += '/'
                hitstart += '/'
                hitend += '/'
                queryframe += '/'
                gapnum += '/' 
                alnspan += '/' #I know it's not neat but it would give an error otherwise :(
            all_alnspan.append(int(hsp.aln_span))
            all_gapnum.append(int(hsp.gap_num))
        blast_dictionary['Bitscore'].append(bitscore)
        blast_dictionary['Bitscore_raw'].append(bitscore_raw)
        blast_dictionary['Evalue'].append(evalue)
        blast_dictionary['Hit_start'].append(hitstart)
        blast_dictionary['Hit_end'].append(hitend)
        blast_dictionary['Query_frame'].append(queryframe)
        blast_dictionary['Gap_num'].append(gapnum)
        blast_dictionary['Aln_span'].append(alnspan)
        tot_alnspan, tot_gapnum = int(), int()
        seq_len = int(result.seq_len)
        for span in all_alnspan:
            tot_alnspan += span
        for gap in all_gapnum:
            tot_gapnum += gap
        identity = (tot_alnspan - tot_gapnum)/seq_len*100
        blast_dictionary['Tot_aln_span'].append(tot_alnspan)
        blast_dictionary['Identity'].append(round(identity, 3))
    return blast_dictionary            


results_handler = SearchIO.read('test_taxid_function.xml', 'blast-xml')
dictionary_blast_results_plus_metrics = blast_to_dictionary_plus_metrics(results_handler)
print(dictionary_blast_results_plus_metrics)

{'ID': ['gi|525344602|ref|NM_001279761.1|', 'gi|1753056057|ref|XM_019017591.2|', 'gi|1753056056|ref|XM_019017590.2|', 'gi|1753056055|ref|XM_019017589.2|', 'gi|1753056054|ref|XM_019017588.2|', 'gi|1753056053|ref|XM_019017587.2|', 'gi|1753056052|ref|XM_019017586.2|', 'gi|1753056051|ref|XM_019017585.2|', 'gi|1753056058|ref|XM_031005077.1|', 'gi|1753056672|ref|XM_019017900.2|', 'gi|1753056670|ref|XM_004062835.3|', 'gi|1753056668|ref|XM_019017898.2|', 'gi|948542510|gb|KT698238.1|', 'gi|1753053231|ref|XM_031004421.1|', 'gi|1753086199|ref|XR_004071381.1|', 'gi|1753086198|ref|XR_004071380.1|', 'gi|1753086196|ref|XM_031014265.1|', 'gi|1753086194|ref|XM_031014264.1|', 'gi|1753086192|ref|XM_031014263.1|', 'gi|1753086190|ref|XM_031014262.1|', 'gi|1753086188|ref|XM_031014261.1|', 'gi|1753086186|ref|XM_031014260.1|', 'gi|1753086184|ref|XM_031014259.1|', 'gi|1753086182|ref|XM_031014258.1|', 'gi|1753086180|ref|XM_031014257.1|', 'gi|1753086178|ref|XM_031014256.1|', 'gi|1753024318|ref|XM_031012314.1|', 

In [9]:
#Create DataFrame from dictionary
df_blast_results = pd.DataFrame.from_dict(dictionary_blast_results)
print(df_blast_results)

                                   id  \
0    gi|525344602|ref|NM_001279761.1|   
1   gi|1753056057|ref|XM_019017591.2|   
2   gi|1753056056|ref|XM_019017590.2|   
3   gi|1753056055|ref|XM_019017589.2|   
4   gi|1753056054|ref|XM_019017588.2|   
5   gi|1753056053|ref|XM_019017587.2|   
6   gi|1753056052|ref|XM_019017586.2|   
7   gi|1753056051|ref|XM_019017585.2|   
8   gi|1753056058|ref|XM_031005077.1|   
9   gi|1753056672|ref|XM_019017900.2|   
10  gi|1753056670|ref|XM_004062835.3|   
11  gi|1753056668|ref|XM_019017898.2|   
12        gi|948542510|gb|KT698238.1|   
13  gi|1753053231|ref|XM_031004421.1|   
14  gi|1753086199|ref|XR_004071381.1|   
15  gi|1753086198|ref|XR_004071380.1|   
16  gi|1753086196|ref|XM_031014265.1|   
17  gi|1753086194|ref|XM_031014264.1|   
18  gi|1753086192|ref|XM_031014263.1|   
19  gi|1753086190|ref|XM_031014262.1|   
20  gi|1753086188|ref|XM_031014261.1|   
21  gi|1753086186|ref|XM_031014260.1|   
22  gi|1753086184|ref|XM_031014259.1|   
23  gi|175308618

In [26]:
#Create complete dataframe
df_blast_results_plus_metrics = pd.DataFrame.from_dict(dictionary_blast_results_plus_metrics)
print(df_blast_results_plus_metrics)

                                   ID  \
0    gi|525344602|ref|NM_001279761.1|   
1   gi|1753056057|ref|XM_019017591.2|   
2   gi|1753056056|ref|XM_019017590.2|   
3   gi|1753056055|ref|XM_019017589.2|   
4   gi|1753056054|ref|XM_019017588.2|   
5   gi|1753056053|ref|XM_019017587.2|   
6   gi|1753056052|ref|XM_019017586.2|   
7   gi|1753056051|ref|XM_019017585.2|   
8   gi|1753056058|ref|XM_031005077.1|   
9   gi|1753056672|ref|XM_019017900.2|   
10  gi|1753056670|ref|XM_004062835.3|   
11  gi|1753056668|ref|XM_019017898.2|   
12        gi|948542510|gb|KT698238.1|   
13  gi|1753053231|ref|XM_031004421.1|   
14  gi|1753086199|ref|XR_004071381.1|   
15  gi|1753086198|ref|XR_004071380.1|   
16  gi|1753086196|ref|XM_031014265.1|   
17  gi|1753086194|ref|XM_031014264.1|   
18  gi|1753086192|ref|XM_031014263.1|   
19  gi|1753086190|ref|XM_031014262.1|   
20  gi|1753086188|ref|XM_031014261.1|   
21  gi|1753086186|ref|XM_031014260.1|   
22  gi|1753086184|ref|XM_031014259.1|   
23  gi|175308618

In [None]:
#Save it as .csv
with open('csv_blast_results.csv', 'w') as savefile:
        savefile.write(str(df_blast_results.to_csv()))

In [5]:
#Retrieve the list of the accession numbers from the BLAST results
accession_list = dictionary_blast_results['accession']

In [6]:
#Function to retrieve all genbank from the result
def retrieve_all_genbank(list_of_entries):
    Bio.Entrez.email = 'A.N.Other@example.com'
    list_of_sequences = []
    for entry in list_of_entries:
        handler = Bio.Entrez.efetch(db='nucleotide', id=entry, rettype = 'gb', retmode = 'xml', retmax=1) #!!!Genbank format !! Returns JSON regardless
        #retmode = 'text'
        gb_info = Bio.Entrez.read(handler, 'genbank')
        list_of_sequences.append(gb_info)
    return list_of_sequences


#list_entries = ['JX297238', 'XM_005260982', 'NM_002462']
genbank_file_stack = retrieve_all_genbank(accession_list) #accession_list
print(genbank_file_stack)


[[{'GBSeq_locus': 'NM_001279761', 'GBSeq_length': '1989', 'GBSeq_strandedness': 'single', 'GBSeq_moltype': 'mRNA', 'GBSeq_topology': 'linear', 'GBSeq_division': 'PRI', 'GBSeq_update-date': '29-JUN-2020', 'GBSeq_create-date': '19-JUL-2013', 'GBSeq_definition': 'Gorilla gorilla MX dynamin like GTPase 1 (MX1), mRNA', 'GBSeq_primary-accession': 'NM_001279761', 'GBSeq_accession-version': 'NM_001279761.1', 'GBSeq_other-seqids': ['ref|NM_001279761.1|', 'gi|525344602'], 'GBSeq_secondary-accessions': ['XM_004062836', 'XM_004062837', 'XM_004062838'], 'GBSeq_keywords': ['RefSeq'], 'GBSeq_source': 'Gorilla gorilla (western gorilla)', 'GBSeq_organism': 'Gorilla gorilla', 'GBSeq_taxonomy': 'Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; Gorilla', 'GBSeq_references': [{'GBReference_reference': '1', 'GBReference_position': '1..1989', 'GBReference_authors': ['Mitchell PS', 'Patzina C', 'Emerman M', 'H

In [65]:
#efetch fetches the genome information in fasta format
net_handle = Bio.Entrez.efetch(db="nucleotide", id='JX297238', rettype='gb', retmode="text")
#opens a writable file with the output file name
out_handle = open('allfas.gbk', "w")
#writes the fasta record in the opened file
out_handle.write(net_handle.read())
out_handle.close()
net_handle.close()


In [67]:
gb = Bio.SeqIO.parse('allfas.gbk', 'gb')
for g in gb:
    print(g)

ID: JX297238.1
Name: JX297238
Description: Macaca sylvanus myxovirus (influenza virus) resistance 1 (MxA) mRNA, complete cds
Number of features: 3
/molecule_type=mRNA
/topology=linear
/data_file_division=PRI
/date=01-NOV-2012
/accessions=['JX297238']
/sequence_version=1
/keywords=['']
/source=Macaca sylvanus (Barbary ape)
/organism=Macaca sylvanus
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Cercopithecidae', 'Cercopithecinae', 'Macaca']
/references=[Reference(title='Evolution-Guided Identification of Antiviral Specificity Determinants in the Broadly Acting Interferon-Induced Innate Immunity Factor MxA', ...), Reference(title='Direct Submission', ...)]
Seq('ATGGTTGTTTCCGAAGTGGACATTGTAAAAGCTGATCCAGCTGCTGCATCCCAA...TAA')


In [63]:
Bio.Entrez.email = 'A.N.Other@example.com'
handler = Bio.Entrez.efetch(db='nucleotide', id='JX297238', rettype = 'gb', retmode = 'text')
handler.write 
gb_info = Bio.Entrez.read(handler, 'genbank')
print(gb_info)

TypeError: file should be opened in binary mode

In [16]:
#Create a dictionary with the genbank results

def find_taxon(list_feature_table):
    taxon = ''
    for feature in list_feature_table:
        if feature['GBFeature_key'] == 'source':
            source_wrap = feature['GBFeature_quals']
            for qualifier in source_wrap:
                if qualifier['GBQualifier_name'] == 'db_xref' and qualifier['GBQualifier_value'].startswith('taxon'):
                    taxon += (qualifier['GBQualifier_value'])
    return taxon    

def find_prot_id(list_feat_table):
    prot_id = ''
    for feature in list_feat_table:
        if feature['GBFeature_key'] == 'CDS':
            wrapper = feature['GBFeature_quals']
            for qualifier in wrapper:
                if qualifier['GBQualifier_name'] == 'protein_id':
                    prot_id += qualifier['GBQualifier_value']
    return prot_id

def find_prot_seq(list_feat_table):
    prot_seq = ''
    for feature in list_feat_table:
        if feature['GBFeature_key'] == 'CDS':
            wrapper = feature['GBFeature_quals']
            for qualifier in wrapper:
                if qualifier['GBQualifier_name'] == 'translation':
                    prot_seq += qualifier['GBQualifier_value']
    return prot_seq


def genbank_to_dictionary(list_of_genbank):
    dictionary_gen = {'Accession' : [], 'Accession_version': [], 'Gene_length' : [], 'Strandedness': [], 'Molecule_type':[], 'Organism':[], 'Taxonomy':[],
    'Nuc_sequence':[], 'Taxon':[], 'Protein_ID':[], 'Prot_sequence':[]} #'N_of_references':[]
    for gen in list_of_genbank:
        first_wrapper = gen[0]
        dictionary_gen['Accession'].append(first_wrapper['GBSeq_primary-accession'])
        dictionary_gen['Accession_version'].append(first_wrapper['GBSeq_accession-version'])
        dictionary_gen['Gene_length'].append(first_wrapper['GBSeq_length'])
        dictionary_gen['Strandedness'].append(first_wrapper['GBSeq_strandedness'])
        dictionary_gen['Molecule_type'].append(first_wrapper['GBSeq_moltype'])
        dictionary_gen['Organism'].append(first_wrapper['GBSeq_organism'])
        dictionary_gen['Taxonomy'].append(first_wrapper['GBSeq_taxonomy'])
        #dictionary_gen['N_of_references'].append(len(first_wrapper['GBSeq_references']))
        dictionary_gen['Nuc_sequence'].append(first_wrapper['GBSeq_sequence'])
        dictionary_gen['Taxon'].append(find_taxon(first_wrapper['GBSeq_feature-table']))
        dictionary_gen['Protein_ID'].append(find_prot_id(first_wrapper['GBSeq_feature-table']))
        dictionary_gen['Prot_sequence'].append(find_prot_seq(first_wrapper['GBSeq_feature-table']))
    return dictionary_gen


genbank_dict = genbank_to_dictionary(genbank_file_stack)
print(genbank_dict)

{'Accession': ['NM_001279761', 'XM_019017591', 'XM_019017590', 'XM_019017589', 'XM_019017588', 'XM_019017587', 'XM_019017586', 'XM_019017585', 'XM_031005077', 'XM_019017900', 'XM_004062835', 'XM_019017898', 'KT698238', 'XM_031004421', 'XR_004071381', 'XR_004071380', 'XM_031014265', 'XM_031014264', 'XM_031014263', 'XM_031014262', 'XM_031014261', 'XM_031014260', 'XM_031014259', 'XM_031014258', 'XM_031014257', 'XM_031014256', 'XM_031012314', 'XM_019026686', 'XM_019026673', 'XM_019026671', 'XM_019026667', 'XM_019026694', 'XM_019026682', 'XM_019026675', 'XM_004056249', 'XM_031001099', 'XM_019038595', 'XM_004047763', 'XM_004053112', 'XM_004053108', 'XM_031011642', 'XM_031004208', 'XR_002008569', 'AC240968', 'XM_004044579', 'XM_004043944', 'XM_004043943', 'XM_004043942', 'XM_031005608', 'XM_019018257'], 'Accession_version': ['NM_001279761.1', 'XM_019017591.2', 'XM_019017590.2', 'XM_019017589.2', 'XM_019017588.2', 'XM_019017587.2', 'XM_019017586.2', 'XM_019017585.2', 'XM_031005077.1', 'XM_0190

In [27]:
#Create DataFrame from dictionary
df_gb_results = pd.DataFrame.from_dict(genbank_dict)
print(df_gb_results)
#Save it as .csv
with open('csv_genbank_results.csv', 'w') as savefile:
        savefile.write(str(df_gb_results.to_csv()))

       Accession Accession_version Gene_length Strandedness Molecule_type  \
0   NM_001279761    NM_001279761.1        1989       single          mRNA   
1   XM_019017591    XM_019017591.2        2838       single          mRNA   
2   XM_019017590    XM_019017590.2        2870       single          mRNA   
3   XM_019017589    XM_019017589.2        2773       single          mRNA   
4   XM_019017588    XM_019017588.2        2880       single          mRNA   
5   XM_019017587    XM_019017587.2        2911       single          mRNA   
6   XM_019017586    XM_019017586.2        2965       single          mRNA   
7   XM_019017585    XM_019017585.2        2974       single          mRNA   
8   XM_031005077    XM_031005077.1        2001       single          mRNA   
9   XM_019017900    XM_019017900.2        3238       single          mRNA   
10  XM_004062835    XM_004062835.3        3488       single          mRNA   
11  XM_019017898    XM_019017898.2        3075       single          mRNA   

In [28]:
#Create combined dictionary, best on DataFrame

left = df_blast_results_plus_metrics.loc[ : , ['Accession', 'Seq_length', 'Evalue', 'Bitscore', 'Tot_aln_span', 'Identity']] #Add Description too!!! (add everything)
right = df_gb_results.loc[:,['Accession', 'Organism', 'Taxonomy', 'Taxon', 'Protein_ID', 'Prot_sequence']]
df_combined = pd.merge(left, right, on='Accession')
print(df_combined)

       Accession  Seq_length           Evalue         Bitscore  Tot_aln_span  \
0   NM_001279761        1989              0.0          3470.97          1989   
1   XM_019017591        2838              0.0          3461.95          1989   
2   XM_019017590        2870              0.0          3461.95          1989   
3   XM_019017589        2773              0.0          3461.95          1989   
4   XM_019017588        2880              0.0          3461.95          1989   
5   XM_019017587        2911              0.0          3461.95          1989   
6   XM_019017586        2965              0.0          3461.95          1989   
7   XM_019017585        2974              0.0          3461.95          1989   
8   XM_031005077        2001  0.0/2.6086e-114  2208.61/416.963          1507   
9   XM_019017900        3238              0.0          902.069          1536   
10  XM_004062835        3488              0.0          902.069          1536   
11  XM_019017898        3075            

In [54]:
#Filter df based on total_alignment_span and identity 
#
# !!! WORTH FILTERING E-VALUES BEFORE RETRIEVING GENBANK FILES?
#

df_filtered = df_combined[df_combined['Identity'] > 50]
df_filtered = df_filtered.reset_index(drop=True) #!!!! ESSENTIAL!!!
print(df_filtered)


      Accession  Seq_length           Evalue         Bitscore  Tot_aln_span  \
0  NM_001279761        1989              0.0          3470.97          1989   
1  XM_019017591        2838              0.0          3461.95          1989   
2  XM_019017590        2870              0.0          3461.95          1989   
3  XM_019017589        2773              0.0          3461.95          1989   
4  XM_019017588        2880              0.0          3461.95          1989   
5  XM_019017587        2911              0.0          3461.95          1989   
6  XM_019017586        2965              0.0          3461.95          1989   
7  XM_019017585        2974              0.0          3461.95          1989   
8  XM_031005077        2001  0.0/2.6086e-114  2208.61/416.963          1507   
9      KT698238        2103              0.0          902.069          1536   

   Identity                 Organism  \
0   100.000          Gorilla gorilla   
1    70.085  Gorilla gorilla gorilla   
2    69.30

In [59]:
#print(df_filtered['Accession'][1])
fasta_for_alignment = ''
for i in range(len(df_filtered['Accession'])):
    if len(df_filtered['Prot_sequence'][i]) < 1:
        continue
    fasta_for_alignment += f">{df_filtered['Accession'][i]}\n"
    fasta_for_alignment += df_filtered['Prot_sequence'][i]
    if i != len(df_filtered['Accession']):
        fasta_for_alignment += '\n'

print(fasta_for_alignment)

#!!!! add sequence query

>NM_001279761
MVVSEVDIAKADPAAASHPLLLNGDANVAQKNPGLVAENNLCSQYEEKVRPCIDLIDSLRALGVEQDLALPAIAVIGDQSSGKSSVLEALSGVALPRGSGIVTRCPLVLKLKKLVNEDKWRGKVSYQDYEIEISDASEVEKEINKAQNTIAGEGMGISHELITLEVSSRDVPDLTLIDLPGITRVAVGNQPADIGYKIKTLIKKYIQRQETISLVVVPSNVDIATTEALSMAQEVDPEGDRTIGILTKPDLVDRGTEDKVVDVVRNLVFHLKKGYMIVKCRGQQEIQDQLSLSEALQREKIFFQDHPYFRDLLEEGKATVPCLAEKLTSELITHICKSLPLLENQIKESHQRITEELQKYGVDIPEDENEKMFFLIDKINAFNQDITALMQGEETVGEEDIRLFTRLRHEFHKWSTIIENNFQEGHKILSRKIQKFENQYRGRELPGFVNYRTFETIVKQQIKALEEPAVDMLHTVADMVRLAFTDVSIKNFEEFFNLHRTAKSKIEDIRAEQEREGEKLIRLHFQMEQIVYCQDHVYRGALQKVREKELEEEKKKKSWDFGAFQSSSATDSSMEEIFQHLMAYHQEASKRISSHIPLIIQFFMLQTYGQQLQKAMLQLLQDKDTYSWLLKERSDTSDKRKFLKERLARLTQARRRLAQFPG
>XM_019017591
MVVSEVDIAKADPAAASHPLLLNGDANVAQKNPGLVAENNLCSQYEEKVRPCIDLIDSLRALGVEQDLALPAIAVIGDQSSGKSSVLEALSGVALPRGSGIVTRCPLVLKLKKLVNEDKWRGKVSYQDYEIEISDASEVEKEINKAQNTIAGEGMGISHELITLEVSSRDVPDLTLIDLPGITRVAVGNQPADIGYKIKTLIKKYIQRQETISLVVVPSNVDIATTEALSMAQEVDPEGDRTIGILTKPDLVDRGTEDKVVDVVRNLVFHLKKGYMIVKCRGQQEIQDQLSLSEALQREKIFFQDHPYF

In [60]:
#Write proteins for alignment in a file to feed to mofft
with open('sequences_for_alignment_filtered.fasta', 'w') as savefile:
        savefile.write(fasta_for_alignment)

In [61]:
#Run MAFFT from command line using subprocess (can also use for ssh)
import subprocess
command_list = [r'/Users/Gioele/miniconda3/bin/mafft', '--distout', 'sequences_for_alignment_filtered.fasta']
process = subprocess.Popen(command_list, universal_newlines= True, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
stdout,stderr = process.communicate()
print(stdout,stderr)

>NM_001279761
--------------------MVVSEVDIAKADP-----------------AAASHPLLLN
GDANVAQKN----PGLVA-------ENNLCSQYEEKVRPCIDLIDSLRALGVEQDLALPA
IAVIGDQSSGKSSVLEALSGVALPRGSGIVTRCPLVLKLKKLVNEDKWRGKVSYQDYEIE
ISDASEVEKEINKAQNTIAGEGMGISHELITLEVSSRDVPDLTLIDLPGITRVAVGNQPA
DIGYKIKTLIKKYIQRQETISLVVVPSNVDIATTEALSMAQEVDPEGDRTIGILTKPDLV
DRGTEDKVVDVVRNLVFHLKKGYMIVKCRGQQEIQDQLSLSEALQREKIFFQDHPYFRDL
LEEGKATVPCLAEKLTSELITHICKSLPLLENQIKESHQRITEELQKYGVDIPEDENEKM
FFLIDKINAFNQDITALMQGEETVGEEDIRLFTRLRHEFHKWSTIIENNFQEGHKILSRK
IQKFENQYRGRELPGFVNYRTFETIVKQQIKALEEPAVDMLHTVADMVRLAFTDVSIKNF
EEFFNLHRTAKSKIEDIRAEQEREGEKLIRLHFQMEQIVYCQDHVYRGALQKVREKELEE
EKKKKSWDFGA----------FQSSSATDSSMEEIFQHLMAYHQEASKRISSHIPLIIQF
FMLQTYGQQLQKAMLQLLQDKDTYSWLLKERSDTSDKRKFLKERLARLT-QARRRL----
AQFPG
>XM_019017591
--------------------MVVSEVDIAKADP-----------------AAASHPLLLN
GDANVAQKN----PGLVA-------ENNLCSQYEEKVRPCIDLIDSLRALGVEQDLALPA
IAVIGDQSSGKSSVLEALSGVALPRGSGIVTRCPLVLKLKKLVNEDKWRGKVSYQDYEIE
ISDASEVEKEINKAQNTIAGEGMGISHELITLEVSSRDVPDLTLIDLPGIT

In [62]:
#Save alignment file and open it with AlignIO
with open('aligned_mafft_filtered.fasta', 'w') as handle:
    handle.write(stdout)

#Open file with AlignIO module
mafft_alignment_filtered = AlignIO.read('aligned_mafft_filtered.fasta', 'fasta')
print(mafft_alignment_filtered)

Alignment with 10 rows and 725 columns
--------------------MVVSEVDIAKADP-----------...FPG NM_001279761
--------------------MVVSEVDIAKADP-----------...FPG XM_019017591
--------------------MVVSEVDIAKADP-----------...FPG XM_019017590
--------------------MVVSEVDIAKADP-----------...FPG XM_019017589
--------------------MVVSEVDIAKADP-----------...FPG XM_019017588
--------------------MVVSEVDIAKADP-----------...FPG XM_019017587
--------------------MVVSEVDIAKADP-----------...FPG XM_019017586
--------------------MVVSEVDIAKADP-----------...FPG XM_019017585
--------------------MVVSEVDIAKADP-----------...VPG XM_031005077
MSKAHKPWPYRRRSQFSSRKYLKKEMNSFQQQPPPFGTVPPQMM...--- KT698238


In [None]:
#Matrix analyser

In [None]:
#ETA-3

In [20]:
#Write sequences in 'fasta' format
#Check for files not having protein sequences!! (usually RNA or DNA as molecule identifier instead of mRNA)

#String containing the format
prot_seq_for_alignment = ''
for i in range(len(genbank_dict['Primary_accession'])):
    if len(genbank_dict['Prot_sequence'][i]) == 0:
        print(f"{genbank_dict['Primary_accession'][i]} has no identified protein")
        continue
    prot_seq_for_alignment += f">{genbank_dict['Primary_accession'][i]}\n"
    prot_seq_for_alignment += genbank_dict['Prot_sequence'][i]
    if i != len(genbank_dict['Primary_accession']):
        prot_seq_for_alignment += '\n'

print(prot_seq_for_alignment)

XR_004071381 has no identified protein
XR_004071380 has no identified protein
XR_002008569 has no identified protein
AC240968 has no identified protein
>NM_001279761
MVVSEVDIAKADPAAASHPLLLNGDANVAQKNPGLVAENNLCSQYEEKVRPCIDLIDSLRALGVEQDLALPAIAVIGDQSSGKSSVLEALSGVALPRGSGIVTRCPLVLKLKKLVNEDKWRGKVSYQDYEIEISDASEVEKEINKAQNTIAGEGMGISHELITLEVSSRDVPDLTLIDLPGITRVAVGNQPADIGYKIKTLIKKYIQRQETISLVVVPSNVDIATTEALSMAQEVDPEGDRTIGILTKPDLVDRGTEDKVVDVVRNLVFHLKKGYMIVKCRGQQEIQDQLSLSEALQREKIFFQDHPYFRDLLEEGKATVPCLAEKLTSELITHICKSLPLLENQIKESHQRITEELQKYGVDIPEDENEKMFFLIDKINAFNQDITALMQGEETVGEEDIRLFTRLRHEFHKWSTIIENNFQEGHKILSRKIQKFENQYRGRELPGFVNYRTFETIVKQQIKALEEPAVDMLHTVADMVRLAFTDVSIKNFEEFFNLHRTAKSKIEDIRAEQEREGEKLIRLHFQMEQIVYCQDHVYRGALQKVREKELEEEKKKKSWDFGAFQSSSATDSSMEEIFQHLMAYHQEASKRISSHIPLIIQFFMLQTYGQQLQKAMLQLLQDKDTYSWLLKERSDTSDKRKFLKERLARLTQARRRLAQFPG
>XM_019017591
MVVSEVDIAKADPAAASHPLLLNGDANVAQKNPGLVAENNLCSQYEEKVRPCIDLIDSLRALGVEQDLALPAIAVIGDQSSGKSSVLEALSGVALPRGSGIVTRCPLVLKLKKLVNEDKWRGKVSYQDYEIEISDASEVEKEINKAQNTIAGEGMGI

In [21]:
#Write proteins for alignment in a file to feed to mofft
with open('sequences_for_alignment.fasta', 'w') as savefile:
        savefile.write(prot_seq_for_alignment)

In [2]:
#Alignment with ClustalW
from Bio.Align.Applications import ClustalwCommandline
clustalw_exe = r'/Users/Gioele/miniconda3/bin/clustalw2'
clustalw_cline = ClustalwCommandline(clustalw_exe, infile="sequences_for_alignment.fasta")
print(clustalw_cline)
stdout, stderr = clustalw_cline()


/Users/Gioele/miniconda3/bin/clustalw2 -infile=sequences_for_alignment.fasta




 CLUSTAL 2.1 Multiple Sequence Alignments


Sequence format is Pearson
Sequence 1: NM_001279761   662 aa
Sequence 2: XM_019017591   662 aa
Sequence 3: XM_019017590   662 aa
Sequence 4: XM_019017589   662 aa
Sequence 5: XM_019017588   662 aa
Sequence 6: XM_019017587   662 aa
Sequence 7: XM_019017586   662 aa
Sequence 8: XM_019017585   662 aa
Sequence 9: XM_031005077   521 aa
Sequence 10: XM_019017900   715 aa
Sequence 11: XM_004062835   715 aa
Sequence 12: XM_019017898   715 aa
Sequence 13: KT698238       701 aa
Sequence 14: XM_031004421   603 aa
Sequence 15: XM_031014265   851 aa
Sequence 16: XM_031014264   851 aa
Sequence 17: XM_031014263   851 aa
Sequence 18: XM_031014262   851 aa
Sequence 19: XM_031014261   851 aa
Sequence 20: XM_031014260   856 aa
Sequence 21: XM_031014259   864 aa
Sequence 22: XM_031014258   864 aa
Sequence 23: XM_031014257   868 aa
Sequence 24: XM_031014256   868 aa
Sequence 25: XM_0

In [4]:
#Read alignment of ClustalW
multiple_alignment = AlignIO.read('sequences_for_alignment.aln', 'clustal')
print(multiple_alignment)

Alignment with 46 rows and 1028 columns
---------------------------------MVVSEVDIAKA...--- NM_001279761
---------------------------------MVVSEVDIAKA...--- XM_019017591
---------------------------------MVVSEVDIAKA...--- XM_019017590
---------------------------------MVVSEVDIAKA...--- XM_019017589
---------------------------------MVVSEVDIAKA...--- XM_019017588
---------------------------------MVVSEVDIAKA...--- XM_019017587
---------------------------------MVVSEVDIAKA...--- XM_019017586
---------------------------------MVVSEVDIAKA...--- XM_019017585
---------------------------------MVVSEVDIAKA...--- XM_031005077
-------------MSKAHKPWPYRRRSQFSSRKYLKKEMNSFQQ...--- XM_019017900
-------------MSKAHKPWPYRRRSQFSSRKYLKKEMNSFQQ...--- KT698238
-------------MSKAHKPWPYRRRSQFSSRKYLKKEMNSFQQ...--- XM_004062835
-------------MSKAHKPWPYRRRSQFSSRKYLKKEMNSFQQ...--- XM_019017898
--------------------------------------------...--- XM_031014262
--------------------------------------------...--- XM_031014261
----

In [6]:
#Print tree automatically produced by ClustalW
tree_multiple_aln = Phylo.read('sequences_for_alignment.dnd', 'newick')
Phylo.draw_ascii(tree_multiple_aln)

                                               , NM_001279761
                                               |
                                               | XM_019017591
                                               |
                                               | XM_019017590
                                               |
                                               | XM_019017589
                                               |
                                               | XM_019017588
                                               |
                                               | XM_019017587
                                               |
                                          _____| XM_019017586
                                         |     |
                                    _____|     | XM_019017585
                                   |     |
                                   |     |_____ XM_031005077
                                   |
                    

In [30]:
#Multiple alignment with MAFFT
from Bio.Align.Applications import MafftCommandline
mafft_exe = r'/Users/Gioele/miniconda3/bin/mafft'
mafft_cline = MafftCommandline(mafft_exe, input="sequences_for_alignment.fasta")
print(mafft_cline)
stdout, stderr = mafft_cline()

#Mafft stores the alignment in the stdout
#print(stderr)
with open('aligned_mafft.fasta', 'w') as handle:
    handle.write(stdout)

#Open file with AlignIO module
mafft_alignment = AlignIO.read('aligned_mafft.fasta', 'fasta')
print(mafft_alignment)

/Users/Gioele/miniconda3/bin/mafft sequences_for_alignment.fasta
Alignment with 46 rows and 1387 columns
--------------------MVVSEVDIAKADP-----------...--- NM_001279761
--------------------MVVSEVDIAKADP-----------...--- XM_019017591
--------------------MVVSEVDIAKADP-----------...--- XM_019017590
--------------------MVVSEVDIAKADP-----------...--- XM_019017589
--------------------MVVSEVDIAKADP-----------...--- XM_019017588
--------------------MVVSEVDIAKADP-----------...--- XM_019017587
--------------------MVVSEVDIAKADP-----------...--- XM_019017586
--------------------MVVSEVDIAKADP-----------...--- XM_019017585
--------------------MVVSEVDIAKADP-----------...--- XM_031005077
MSKAHKPWPYRRRSQFSSRKYLKKEMNSFQQQPPPFGTVPPQMM...--- XM_019017900
MSKAHKPWPYRRRSQFSSRKYLKKEMNSFQQQPPPFGTVPPQMM...--- XM_004062835
MSKAHKPWPYRRRSQFSSRKYLKKEMNSFQQQPPPFGTVPPQMM...--- XM_019017898
MSKAHKPWPYRRRSQFSSRKYLKKEMNSFQQQPPPFGTVPPQMM...--- KT698238
--------------------------------------------...--- XM_031004421
---

In [31]:
#Multiple alignment with MAFFT obtaining tree and matrix
#To do!!

from Bio.Align.Applications import MafftCommandline
mafft_exe = r'/Users/Gioele/miniconda3/bin/mafft --distout' #!!!
mafft_cline = MafftCommandline(mafft_exe, input="sequences_for_alignment.fasta") #--distout produces .hat2 file 
print(mafft_cline)
stdout, stderr = mafft_cline()

#Mafft stores the alignment in the stdout
#print(stderr)
with open('aligned_mafft.fasta', 'w') as handle:
    handle.write(stdout)

#Open file with AlignIO module
mafft_alignment = AlignIO.read('aligned_mafft.fasta', 'fasta')
print(mafft_alignment)

"/Users/Gioele/miniconda3/bin/mafft --distout" sequences_for_alignment.fasta


ApplicationError: Non-zero return code 127 from '"/Users/Gioele/miniconda3/bin/mafft --distout" sequences_for_alignment.fasta', message '/bin/sh: /Users/Gioele/miniconda3/bin/mafft --distout: No such file or directory'

In [None]:
#ISSUES:
#Produce the command correctly, will it be run from the terminal with os?  
#Cannot find IT-free(?) (the package for drawing trees)


In [37]:
import pandas as pd
def hat2_parser(path_to_file):
    with open(path_to_file) as handler:
        matrix=handler.read().splitlines()
    seq_n = int(matrix[1].strip())
    del matrix[0:3]
    matrix_identifier = matrix[0:seq_n]
    list_of_identifiers = []
    for line in matrix_identifier:
        line = line[line.index('=')+1:]
        list_of_identifiers.append(line)
    matrix_values = matrix[seq_n:]
    value_vector = []
    for values in matrix_values:
        while True:
            try:
                storer = values[values.index('.')-1:values.index('.')+4]
            except ValueError:
                break
            value_vector.append(storer)
            values = values[values.index('.')+4:]
            values = values.strip()
    matrix_dictionary = {#'index': list_of_identifiers #Possibly useless 
    }
    temp_value_vector = value_vector
    for a in range(len(matrix_identifier)-1):
        temp_list = []
        for i in range(a):#!!!!!!!!Possibly +1
            temp_list.append('NaN')
        temp_list.extend(temp_value_vector[:seq_n-a-1])
        temp_value_vector = temp_value_vector[seq_n-a-1:]
        matrix_dictionary.update({f'{list_of_identifiers[a]}': temp_list})
        matrix_df = pd.DataFrame(matrix_dictionary)
    matrix_df = matrix_df.T
    return matrix_df


print(hat2_parser('Sequences_for_alignment.fasta.hat2'))

                 0      1      2      3      4      5      6      7      8   \
NM_001279761  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.319  0.683   
XM_019017591    NaN  0.000  0.000  0.000  0.000  0.000  0.000  0.319  0.683   
XM_019017590    NaN    NaN  0.000  0.000  0.000  0.000  0.000  0.319  0.683   
XM_019017589    NaN    NaN    NaN  0.000  0.000  0.000  0.000  0.319  0.683   
XM_019017588    NaN    NaN    NaN    NaN  0.000  0.000  0.000  0.319  0.683   
XM_019017587    NaN    NaN    NaN    NaN    NaN  0.000  0.000  0.319  0.683   
XM_019017586    NaN    NaN    NaN    NaN    NaN    NaN  0.000  0.319  0.683   
XM_019017585    NaN    NaN    NaN    NaN    NaN    NaN    NaN  0.319  0.683   
XM_031005077    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  0.845   
XM_019017900    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
XM_004062835    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
XM_019017898    NaN    NaN    NaN    NaN    NaN    N