In [1]:
from Bio import Entrez
import matplotlib.pyplot as plt
import matplotlib._color_data as mcd
import pandas as pd
import altair as alt
import math
import random
# Contact e-mail that Bio.Entrez sends along with its NCBI requests.
Entrez.email = "lukas.becker@hhu.de"

# Every CSS4 colour name known to matplotlib, minus "lightgrey".
overlap = list(mcd.CSS4_COLORS)
overlap.remove("lightgrey")

In [2]:
# All three inputs of the reciprocal BLAST run live in the same result directory.
_eps_blast_dir = "../data/curvibacter_rec_blasts/eps_operon_vs_all_complete_chromosome/"

reciprocal_best_hits_file = _eps_blast_dir + "reciprocal_best_hits_protein_ids.txt"
blastp_fw_table = _eps_blast_dir + "blastp_fw_out.table"
query_file = _eps_blast_dir + "curvibacter_aep_eps_operon.faa"

In [3]:
# Reciprocal best hits are read with their header row; the forward BLAST table
# has no header, so its column names are assigned explicitly.
rec_prot = pd.read_table(reciprocal_best_hits_file)
fw_res = pd.read_table(blastp_fw_table, header=None)
fw_res.columns = [
    "qseqid", "sseqid", "evalue", "bitscore", "qgi", "sgi",
    "sacc", "staxids", "sscinames", "scomnames", "stitle",
]

# Keep only the part of each accession before the first "." in both key columns.
for accession_column in ('qseqid', 'sacc'):
    fw_res[accession_column] = fw_res[accession_column].map(lambda line: line.split('.')[0])

# Give the reciprocal-hit id columns the names of the BLAST columns they are merged on.
rec_prot = rec_prot.rename(
    columns={"forward_genome_id": "sacc", "backward_genome_id": "qseqid"}
)

# Inner join on (subject, query), then keep the first row per subject accession.
result_data = (
    rec_prot
    .merge(fw_res, how='inner', on=['sacc', 'qseqid'])
    .drop_duplicates('sacc', keep='first')
)

In [4]:
# Preview the first rows of the merged reciprocal-best-hit / forward-BLAST table.
result_data.head()

Unnamed: 0,sacc,qseqid,sseqid,evalue,bitscore,qgi,sgi,staxids,sscinames,scomnames,stitle
0,WP_086914179,WP_087496569,WP_086914179.1_GCF_002157165.1_ASM215716v1,2.22e-150,432.0,0,0,553814,Acidovorax carolinensis,Acidovorax carolinensis,polyprenyl synthetase family protein [Acidovor...
1,WP_187305962,WP_087496569,WP_187305962.1_GCF_002214645.1_ASM221464v1,1.31e-143,416.0,0,0,1546149,Diaphorobacter polyhydroxybutyrativorans,Diaphorobacter polyhydroxybutyrativorans,polyprenyl synthetase family protein [Diaphoro...
2,WP_036939373,WP_087496569,WP_036939373.1_GCF_003812525.1_ASM381252v1,3.44e-105,318.0,0,0,585,Proteus vulgaris,Proteus vulgaris,MULTISPECIES: octaprenyl diphosphate synthase ...
3,WP_128113047,WP_087496569,WP_128113047.1_GCF_900096755.1_Fsp1.4,1.04e-109,330.0,0,0,576610,Polynucleobacter necessarius,Polynucleobacter necessarius,polyprenyl synthetase family protein [Polynucl...
4,WP_201348422,WP_087496569,WP_201348422.1_GCF_016592555.1_ASM1659255v1,1.97e-106,322.0,0,0,1441457,Neptunomonas japonica JAMM 1380,Neptunomonas japonica JAMM 1380,octaprenyl diphosphate synthase [Neptunomonas ...


In [9]:
df = result_data.copy()

# Upper bound on hits kept per query (value carried over unchanged; its rationale
# is not documented in this notebook).
MAX_HITS_PER_QUERY = 4999
# Number of taxids sent to NCBI per efetch call and retries allowed per call.
TAXIDS_PER_REQUEST = 500
MAX_FETCH_ATTEMPTS = 10
# Lineage ranks that become columns of the result table, in column order.
TAXONOMIC_RANKS = ('genus', 'superfamily', 'family', 'order', 'phylum', 'class')


def _read_query_descriptions(fasta_path):
    """Map the version-less id of every FASTA header to its description text."""
    descriptions = {}
    with open(fasta_path, "r") as fasta:
        for line in fasta:
            if not line.startswith(">"):
                continue
            identifier, _, description = line[1:].rstrip().partition(' ')
            descriptions[identifier.split('.')[0]] = description
    return descriptions


def _fetch_taxonomy_records(taxids, query):
    """Fetch taxonomy records for ``taxids`` chunk by chunk, retrying failed calls.

    Records are returned in the order NCBI delivers them. If the last attempt for
    a chunk fails, the original exception is re-raised unchanged.
    """
    records = []
    for begin in range(0, len(taxids), TAXIDS_PER_REQUEST):
        chunk = taxids[begin:begin + TAXIDS_PER_REQUEST]
        print("\t {} to {}".format(begin, begin + len(chunk)))
        for attempt in range(MAX_FETCH_ATTEMPTS):
            try:
                print("length ids : {}".format(len(chunk)))
                handle = Entrez.efetch(id=chunk, db="taxonomy", retmode="xml")
                try:
                    fetched = Entrez.read(handle)
                finally:
                    # Close the connection even when the XML cannot be parsed.
                    handle.close()
            except Exception as error:
                print("attempt : {} queries : {} error : {}".format(attempt, query, error))
                if attempt == MAX_FETCH_ATTEMPTS - 1:
                    # Re-raise the real error instead of an empty, generic Exception.
                    raise
            else:
                records.extend(fetched)
                break
    return records


def _lineage_ranks(record):
    """Return {rank: ScientificName} for TAXONOMIC_RANKS, 'unknown' where absent.

    Only the first lineage entry of each rank is used, so every record yields
    exactly one value per rank and the resulting columns stay aligned.
    """
    ranks = dict.fromkeys(TAXONOMIC_RANKS, 'unknown')
    # NOTE(review): .get() guards records without a 'LineageEx' entry - confirm
    # whether NCBI ever returns such records for these taxids.
    for node in record.get('LineageEx', []):
        rank = node['Rank']
        if rank in ranks and ranks[rank] == 'unknown':
            ranks[rank] = node['ScientificName']
    return ranks


queries = _read_query_descriptions(query_file)

dataframes = []
unique_queries = list(df['qseqid'].unique())
for query in unique_queries:
    print("processing : {}".format(query))
    dataframe = df.loc[df['qseqid'] == query].copy()
    dataframe['sacc'] = dataframe['sacc'].map(lambda protid: protid.split(".")[0])
    dataframe = dataframe.drop_duplicates(subset=['sacc'], keep="first")
    if len(dataframe) >= MAX_HITS_PER_QUERY:
        # .copy() so the column assignments below do not write into a slice.
        dataframe = dataframe.iloc[:MAX_HITS_PER_QUERY].copy()
        print("New Length : {}".format(len(dataframe)))

    # One taxid per hit: the first entry of a ';'-separated string, or the integer itself.
    staxids = []
    for ids in dataframe['staxids']:
        if isinstance(ids, str):
            staxids.append(ids.split(";")[0])
        elif isinstance(ids, int):
            staxids.append(ids)

    result_record = _fetch_taxonomy_records(staxids, query)

    # Records are matched to hits by position, so the counts must agree.
    # Skip only this query (with a visible message) rather than silently
    # abandoning all remaining queries.
    if len(result_record) != len(dataframe):
        print("[!] skipping {} : {} taxonomy records for {} hits".format(
            query, len(result_record), len(dataframe)))
        continue

    lineage_ranks = [_lineage_ranks(rec) for rec in result_record]
    for rank in TAXONOMIC_RANKS:
        dataframe[rank] = [ranks[rank] for ranks in lineage_ranks]
    dataframe['query_info'] = queries[query]
    dataframes.append(dataframe)

result_df = pd.concat(dataframes)


cols = math.ceil(math.sqrt(len(result_df['qseqid'].unique())))

processing : WP_087496569
	 0 to 500
length ids : 500
	 500 to 1000
length ids : 500
	 1000 to 1500
length ids : 500
	 1500 to 1934
length ids : 434
True
processing : WP_087496568
	 0 to 322
length ids : 322
True
processing : WP_087496564
	 0 to 500
length ids : 500
	 500 to 712
length ids : 212
True
processing : WP_087496563
	 0 to 500
length ids : 500
	 500 to 1000
length ids : 500
	 1000 to 1500
length ids : 500
	 1500 to 2000
length ids : 500
	 2000 to 2500
length ids : 500
	 2500 to 3000
length ids : 500
	 3000 to 3500
length ids : 500
	 3500 to 3812
length ids : 312
True
processing : WP_087496562
	 0 to 126
length ids : 126
True
processing : WP_087496561
	 0 to 174
length ids : 174
True
processing : WP_087496560
	 0 to 500
length ids : 500
	 500 to 1000
length ids : 500
	 1000 to 1500
length ids : 500
	 1500 to 2000
length ids : 500
	 2000 to 2500
length ids : 500
	 2500 to 3000
length ids : 500
	 3000 to 3065
length ids : 65
True
processing : WP_087496559
	 0 to 500
length ids :

In [39]:
# Save the annotated hit table (with its column names as header) to result_dataframe.csv.
result_df.to_csv('result_dataframe.csv',header=list(result_df.columns))

In [5]:
# Reload the saved table; the first CSV column is used as the index.
result_df = pd.read_csv('result_dataframe.csv',header=0,index_col=0)

In [6]:
# Number of distinct values in the 'phylum' column.
len(result_df['phylum'].unique())

33

In [136]:
# One row per phylum: the data behind the clickable phylum legend.
rf = result_df.drop_duplicates(subset='phylum', keep="first")
selection = alt.selection_multi(fields=['phylum'])

# One faceted bar chart (hit count per phylum) for every table in `dataframes`.
bars = []
for query_hits in dataframes:
    chart = alt.Chart(query_hits).mark_bar(tooltip=True)
    chart = chart.encode(
        alt.Y("count()"),
        alt.X("phylum"),
        color=alt.Color('phylum', legend=None),
        tooltip=['count()', 'max(bitscore)', 'min(evalue)'],
    )
    chart = chart.transform_filter(selection).interactive()
    bars.append(chart.facet(facet='query_info'))

# The rectangle legend carries the selection that filters every bar chart.
make_selector = (
    alt.Chart(rf)
    .mark_rect()
    .encode(y='phylum', color='phylum')
    .add_selection(selection)
)

# Legend first, then all bar charts side by side.
graphics = make_selector | bars[0]
for bar in bars[1:]:
    graphics |= bar
graphics

# filtering for species with hits on every input query sequence

1. The cells above provide a list called `dataframes`, which holds one pandas result table per query sequence.

2. Let's loop through those dataframes and count, for every taxonomic node, the number of query sequences that have a hit in it. The counts are stored in a dictionary with the taxonomic nodes as keys and the number of query sequences with a hit as values.

3. The cells above also provide a dataframe called `result_df`, which holds the full result table (after execution of the Biopython commands).

In [10]:
# For every taxid, count the number of tables in `dataframes` in which it occurs.
full_operon_hits = {}
for count, df in enumerate(dataframes, start=1):
    print("[*] Parsing pandas dataframe : {}".format(count))
    for taxonomic_node in df['staxids'].unique():
        full_operon_hits[taxonomic_node] = full_operon_hits.get(taxonomic_node, 0) + 1
    print('\t[+] Done parsing df : {}'.format(count))

[*] Parsing pandas dataframe : 1
	[+] Done parsing df : 1
[*] Parsing pandas dataframe : 2
	[+] Done parsing df : 2
[*] Parsing pandas dataframe : 3
	[+] Done parsing df : 3
[*] Parsing pandas dataframe : 4
	[+] Done parsing df : 4
[*] Parsing pandas dataframe : 5
	[+] Done parsing df : 5
[*] Parsing pandas dataframe : 6
	[+] Done parsing df : 6
[*] Parsing pandas dataframe : 7
	[+] Done parsing df : 7
[*] Parsing pandas dataframe : 8
	[+] Done parsing df : 8
[*] Parsing pandas dataframe : 9
	[+] Done parsing df : 9
[*] Parsing pandas dataframe : 10
	[+] Done parsing df : 10
[*] Parsing pandas dataframe : 11
	[+] Done parsing df : 11
[*] Parsing pandas dataframe : 12
	[+] Done parsing df : 12


In [11]:
# Which taxonomic nodes have the full operon, i.e. a hit for every query sequence?
# The threshold is the number of distinct queries in result_df rather than a
# hard-coded 12, so it stays correct when the query set changes.
required_hits = result_df['qseqid'].nunique()
target_taxids = []
for hit, hit_count in full_operon_hits.items():
    if hit_count >= required_hits:
        target_taxids.append(hit)
        print("[+] foung : {}".format(hit))

[+] foung : 2697032
[+] foung : 395495
[+] foung : 401471
[+] foung : 2588534
[+] foung : 864828
[+] foung : 946483
[+] foung : 1288494
[+] foung : 582744
[+] foung : 2231055
[+] foung : 1844971
[+] foung : 887061
[+] foung : 1266925
[+] foung : 583345


In [130]:
# Number of taxids that passed the full-operon filter.
len(target_taxids)

13

In [131]:
# Fetch the records of all target taxids from NCBI's taxonomy database.
handle = Entrez.efetch(db='taxonomy',id=target_taxids,retmode="xml")
try:
    records = Entrez.read(handle)
finally:
    # Close the connection even when parsing the response fails.
    handle.close()

In [132]:
# Print each fetched taxon followed by the names listed in its lineage.
for taxon in records:
    print("ScientificName : {}\n\t LineageEx __ : ".format(taxon['ScientificName']))
    for ancestor in taxon['LineageEx']:
        print("\t\t{}".format(ancestor['ScientificName']))

ScientificName : Xylophilus rhododendri
	 LineageEx __ : 
		cellular organisms
		Bacteria
		Proteobacteria
		Betaproteobacteria
		Burkholderiales
		Burkholderiales genera incertae sedis
		Xylophilus
ScientificName : Leptothrix cholodnii SP-6
	 LineageEx __ : 
		cellular organisms
		Bacteria
		Proteobacteria
		Betaproteobacteria
		Burkholderiales
		Burkholderiales genera incertae sedis
		Leptothrix
		Leptothrix cholodnii
ScientificName : Undibacterium parvum
	 LineageEx __ : 
		cellular organisms
		Bacteria
		Proteobacteria
		Betaproteobacteria
		Burkholderiales
		Oxalobacteraceae
		Undibacterium
ScientificName : Methylophilus medardicus
	 LineageEx __ : 
		cellular organisms
		Bacteria
		Proteobacteria
		Betaproteobacteria
		Nitrosomonadales
		Methylophilaceae
		Methylophilus
ScientificName : Massilia umbonata
	 LineageEx __ : 
		cellular organisms
		Bacteria
		Proteobacteria
		Betaproteobacteria
		Burkholderiales
		Oxalobacteraceae
		Massilia
ScientificName : Candidatus Symbiobacter m

In [133]:
# NOTE: `dataframes` is rebound here - from now on it holds one table per
# target taxid (the rows of result_df with that staxids value).
# taxids_dict starts with a None placeholder for every target taxid.
taxids_dict = dict.fromkeys(target_taxids)
dataframes = [result_df[result_df['staxids'] == taxid] for taxid in target_taxids]

In [111]:
# Fetch the protein records behind every per-taxid table and file them in taxids_dict.
for df in dataframes:
    # Key by the raw taxid, not its string form: taxids_dict is initialised with
    # the raw taxids and read back with an integer key, so a str key would add a
    # second, never-read entry and leave the placeholder at None.
    taxid = df['staxids'].unique()[0]
    handle = Entrez.efetch(db="protein",id=list(df['sacc']),retmode='xml')
    try:
        record = Entrez.read(handle)
    finally:
        # Close the connection even when parsing the response fails.
        handle.close()
    taxids_dict[taxid] = record

In [113]:
# Number of keys currently stored in taxids_dict.
len(taxids_dict.keys())

186

In [135]:
import os

# For every query protein, download its hits from the tables in `dataframes`
# and write them to <query id>/target_fasta_sequences.faa.
df = pd.concat(dataframes)
for qid in df['qseqid'].unique():
    directory = str(qid)
    # exist_ok keeps the cell re-runnable; os.mkdir fails once the folder exists.
    os.makedirs(directory, exist_ok=True)
    target_genes = list(df[df['qseqid'] == qid]['sacc'])
    handle = Entrez.efetch(db='protein',id=target_genes,retmode='xml')
    try:
        record = Entrez.read(handle)
    finally:
        # Close the connection even when parsing the response fails.
        handle.close()

    # The context manager closes the file even if a record lacks an expected field.
    with open(os.path.join(directory, 'target_fasta_sequences.faa'), 'w') as output:
        for sequence in record:
            GBSeq_locus = sequence['GBSeq_locus']
            GBSeq_definition = sequence['GBSeq_definition']
            organism = sequence['GBSeq_organism']
            taxonomy = sequence['GBSeq_taxonomy']
            seq = sequence['GBSeq_sequence']

            # Header layout: >locus definition organism taxonomy
            output.write('>'+GBSeq_locus+' '+GBSeq_definition+' '+organism+' '+taxonomy+"\n")
            output.write(seq+"\n")

In [118]:
# Write every fetched protein of each target taxid to its own FASTA file,
# inside a directory named after the taxid.
for df in dataframes:
    taxid = df['staxids'].unique()[0]
    key = str(taxid)
    directory = key
    # exist_ok keeps the cell re-runnable; os.mkdir fails once the folder exists.
    os.makedirs(directory, exist_ok=True)

    # The records may be filed under the raw taxid or under its string form,
    # depending on how taxids_dict was filled; accept either.
    sequences = taxids_dict.get(taxid)
    if sequences is None:
        sequences = taxids_dict.get(key)
    if sequences is None:
        raise KeyError("no protein records stored for taxid {}".format(key))

    for sequence in sequences:
        GBSeq_locus = sequence['GBSeq_locus']
        GBSeq_definition = sequence['GBSeq_definition']
        organism = sequence['GBSeq_organism']
        taxonomy = sequence['GBSeq_taxonomy']
        seq = sequence['GBSeq_sequence']

        # Mode 'w' is required: open() defaults to read mode, which can neither
        # create the file nor write to it.
        with open(os.path.join(directory, str(GBSeq_locus)), 'w') as output:
            output.write('>'+GBSeq_locus+' '+GBSeq_definition+' '+organism+' '+taxonomy+"\n")
            output.write(seq+"\n")

{'GBSeq_locus': 'WP_013647022', 'GBSeq_length': '322', 'GBSeq_moltype': 'AA', 'GBSeq_topology': 'linear', 'GBSeq_division': 'BCT', 'GBSeq_update-date': '20-JUN-2019', 'GBSeq_create-date': '18-MAY-2013', 'GBSeq_definition': 'octaprenyl diphosphate synthase [Nitrosomonas sp. AL212]', 'GBSeq_primary-accession': 'WP_013647022', 'GBSeq_accession-version': 'WP_013647022.1', 'GBSeq_other-seqids': ['ref|WP_013647022.1|', 'gi|503412361'], 'GBSeq_keywords': ['RefSeq'], 'GBSeq_source': 'Nitrosomonas sp. AL212', 'GBSeq_organism': 'Nitrosomonas sp. AL212', 'GBSeq_taxonomy': 'Bacteria; Proteobacteria; Betaproteobacteria; Nitrosomonadales; Nitrosomonadaceae; Nitrosomonas', 'GBSeq_comment': 'REFSEQ: This record represents a single, non-redundant, protein sequence which may be annotated on many different RefSeq genomes from the same, or different, species.; ; ##Evidence-For-Name-Assignment-START## ; Evidence Category :: HMM ; Evidence Accession :: NF008140.0 ; Evidence Source :: NCBI Protein Cluster (P

In [None]:
# https://www.ncbi.nlm.nih.gov/bioproject?LinkName=assembly_bioproject&from_uid=9754341 - Rhodoferax sp. AJA081-3
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4053972/ - Candidatus Symbiobacter mobilis CR

In [None]:
#downloading fasta files based on targets list
def get_sequences_as_fasta(targets,output_filepath,query_name):
    """Download protein records from NCBI and write them to a FASTA file.

    The accessions are fetched from the Entrez ``protein`` database in chunks
    of 500 ids, and each chunk is attempted up to 10 times. Every record is
    written as ``>locus definition organism taxonomy`` followed by its
    sequence - the same layout the other FASTA-writing cells of this notebook
    produce.

    Parameters
    ----------
    targets : iterable of str
        Protein accessions to download.
    output_filepath : str
        FASTA file to create (an existing file is overwritten).
    query_name : str
        Label of the query the targets belong to; used in progress messages.

    Returns
    -------
    int
        0 on success.

    Raises
    ------
    Exception
        Wraps any failure, including a chunk that could not be fetched after
        10 attempts; the underlying error is chained as ``__cause__``.
    """
    chunk_size = 500
    max_attempts = 10
    try:
        targets = list(targets)
        end = len(targets)
        with open(output_filepath, 'w') as output:
            # Walk over the targets list in chunks of `chunk_size` accessions.
            for begin in range(0, end, chunk_size):
                splitted_ids = targets[begin:begin + chunk_size]
                print('downloading all hits on {}'.format(query_name))
                print("\t {} to {}".format(begin, begin + len(splitted_ids)))
                for attempt in range(max_attempts):
                    try:
                        print("length ids : {}".format(len(splitted_ids)))
                        # Request only the current chunk, not the whole list.
                        handle = Entrez.efetch(db="protein",id=splitted_ids,retmode='xml')
                        try:
                            record = Entrez.read(handle)
                        finally:
                            handle.close()
                    except Exception as e:
                        print("attempt : {} queries : {}".format(attempt,query_name))
                        if attempt == max_attempts - 1:
                            raise Exception('not successfull after 10 attempts') from e
                    else:
                        for sequence in record:
                            output.write('>'+sequence['GBSeq_locus']+' '+sequence['GBSeq_definition']
                                         +' '+sequence['GBSeq_organism']+' '+sequence['GBSeq_taxonomy']+"\n")
                            output.write(sequence['GBSeq_sequence']+"\n")
                        break
        return 0
    except Exception as e:
        raise Exception('Exception occurred : {}'.format(e)) from e

# msa transformation and phylogeny

In [49]:
# Both phylogeny inputs live in the same BLAST result directory.
_phylogeny_dir = "../data/curvibacter_rec_blasts/eps_operon_vs_burkholderiales_cyanobacteria/"

tree = _phylogeny_dir + "domain_nj_tree.newick"
msa_file = _phylogeny_dir + "epsA_orthologs.msa"

In [None]:
# Copy the alignment to msa_corrected_header.msa, replacing every header line
# with a running ">ID:<n>" label; all other lines are copied unchanged.
count = 1
# The context manager closes both files even if reading or writing fails midway.
with open(msa_file,'r') as ifile, open("msa_corrected_header.msa",'w') as ofile:
    for line in ifile:
        if ">" in line:
            ofile.write(">ID:{}\n".format(count))
            count+=1
        else:
            ofile.write(line)

In [47]:
# Preview the first rows of the annotated hit table.
result_df.head()

Unnamed: 0,sacc,qseqid,sseqid,evalue,bitscore,qgi,sgi,staxids,sscinames,scomnames,stitle,genus,superfamily,family,query_info
0,WP_048346605,WP_087496569,WP_048346605.1_GCF_001040845.1_ASM104084v1,4.4699999999999994e-44,158.0,0,0,32052,Synechococcus sp. WH 8020,Synechococcus sp. WH 8020,solanesyl diphosphate synthase [Synechococcus ...,Synechococcus,unknown,Synechococcaceae,polyprenyl synthetase family protein
1,WP_162574985,WP_087496569,WP_162574985.1_GCF_901827155.1_ASM90182715v1,3.1999999999999998e-24,105.0,0,0,434009,Variovorax sp. PBL-H6,Variovorax sp. PBL-H6,polyprenyl synthetase family protein [Variovor...,Variovorax,unknown,Comamonadaceae,polyprenyl synthetase family protein
2,WP_013800821,WP_087496569,WP_013800821.1_GCF_000214395.1_ASM21439v1,6.32e-141,405.0,0,0,742013,Delftia sp. Cs1-4,Delftia sp. Cs1-4,polyprenyl synthetase family protein [Delftia ...,Delftia,unknown,Comamonadaceae,polyprenyl synthetase family protein
3,WP_096547122,WP_087496569,WP_096547122.1_GCF_002368135.1_ASM236813v1,8.65e-48,167.0,0,0,1973482,Raphidiopsis curvata NIES-932,Raphidiopsis curvata NIES-932,solanesyl diphosphate synthase [Raphidiopsis c...,Raphidiopsis,unknown,Aphanizomenonaceae,polyprenyl synthetase family protein
4,WP_046661468,WP_087496569,WP_046661468.1_GCF_001704955.2_ASM170495v2,9.980000000000001e-45,159.0,0,0,1698524,Microcystis aeruginosa NIES-2481,Microcystis aeruginosa NIES-2481,solanesyl diphosphate synthase [Microcystis ae...,Microcystis,unknown,Microcystaceae,polyprenyl synthetase family protein


In [137]:
# For every header of the alignment, look the accession up in result_df and
# record (staxids Series, header description, genus, family). The header
# descriptions are also written to taxonomy_file.txt, one per line.
wp_to_taxids = {}
# The context manager closes both files even if a lookup below fails.
with open(msa_file,'r') as ifile, open("taxonomy_file.txt",'w') as output:
    for line in ifile:
        if ">" in line:
            # strip() removes the newline that stays attached to the accession
            # when the header has no description after it.
            wp = line.split(" ")[0].split(">")[1].strip()
            taxonomy = ' '.join(line.split(" ")[1:]).strip()
            hits = result_df[result_df['sacc'] == wp]
            if hits.empty:
                raise ValueError("accession {!r} from {} not found in result_df".format(wp, msa_file))
            taxid = hits['staxids']
            genus = hits['genus'].unique()[0]
            # Previously `family = genus = ...` overwrote genus with the family name.
            family = hits['family'].unique()[0]
            wp_to_taxids[wp] = (taxid,taxonomy, genus,family)
            output.write(taxonomy+"\n")

In [146]:
# Read the Newick tree file into a list of lines; the file is closed even if
# reading fails.
with open(tree,"r") as treefile:
    lines = treefile.readlines()

In [117]:
import re

In [147]:
# Replace every protein accession in the first tree line with the first taxid
# recorded for it in wp_to_taxids.
for wp, annotation in wp_to_taxids.items():
    taxid_label = str(annotation[0].unique()[0])
    # re.escape: the accession is literal text, not a pattern - an unescaped "."
    # (or any other regex metacharacter) would match unintended characters.
    lines[0] = re.sub(re.escape(wp), taxid_label, lines[0])

In [148]:
# Write the relabelled first tree line to annotNewick.newick; the file is
# closed even if the write fails.
with open("annotNewick.newick","w") as output:
    output.write(lines[0])