In [41]:
#Environment init
from Bio.Blast import NCBIWWW, NCBIXML
from Bio import SeqIO, AlignIO
from Bio.Emboss.Applications import NeedleCommandline as needle
from bokeh.models import ColumnDataSource, CDSView, BooleanFilter
from bokeh.models import Title, LinearAxis, Range1d
from bokeh.layouts import gridplot
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook, export_png
import prody as pd
import numpy as np
import time, sys, os, glob

In [3]:
#Progress bar function
def progressbar(it, prefix="", size=60, file=sys.stdout):
    """Yield the items of ``it`` while drawing a text progress bar on ``file``.

    Parameters
    ----------
    it : iterable
        Items to iterate over. Iterables without ``len()`` (e.g. generators)
        are materialised into a list first so that the total is known.
    prefix : str
        Text written in front of the bar.
    size : int
        Width of the bar in characters.
    file : file-like
        Stream providing ``write`` and ``flush``; defaults to ``sys.stdout``.

    Yields
    ------
    The items of ``it``, in their original order.
    """
    # The bar needs the total up front; fall back to a list for unsized iterables.
    items = it if hasattr(it, "__len__") else list(it)
    count = len(items)

    def render(done):
        # An empty input is drawn as a complete bar instead of dividing by zero.
        filled = size if count == 0 else int(size * done / count)
        file.write("%s[%s%s] %i/%i\r" % (prefix, "#" * filled, "." * (size - filled), done, count))
        file.flush()

    render(0)
    for done, item in enumerate(items, start=1):
        yield item
        # The bar advances only after the consumer has finished with the item.
        render(done)
    file.write("\n")
    file.flush()

In [4]:
#bokeh init: send bokeh output to the notebook
output_notebook()

In [5]:
def dataExtraction(blastfile):
    """Read a BLAST XML result and derive the E-value profile of its hits.

    Only the first HSP of every alignment is used.

    Parameters
    ----------
    blastfile : str
        Path to a BLAST XML file that is parsed with ``NCBIXML.read``.

    Returns
    -------
    tuple
        ``(hits, log_eval, d_eval, index)`` where ``hits`` are the alignment
        titles, ``log_eval`` the natural log of each E-value (``-420`` stands
        in for an E-value of exactly 0), ``d_eval`` the first difference of
        ``log_eval`` padded at the front so that it has the same length as
        ``log_eval``, and ``index`` the hit positions ``0 .. n-1``.
        All four lists are empty when the record contains no alignments.
    """
    # Close the file as soon as the record has been parsed.
    with open(blastfile) as result_handle:
        record = NCBIXML.read(result_handle)

    hits = []
    evalue = []
    for aln in record.alignments:
        hits.append(aln.title)
        evalue.append(aln.hsps[0].expect)

    # log(0) is undefined, so an E-value of 0 is mapped to the floor value -420.
    log_eval = [-420 if e == 0 else np.log(e) for e in evalue]

    # First difference of the log E-values (discrete 1st derivative).
    d_eval = list(np.diff(log_eval))
    if log_eval:
        # Repeat the first difference so d_eval lines up with log_eval;
        # with a single hit there is no difference to repeat, so pad with 0.0.
        d_eval.insert(0, d_eval[0] if d_eval else 0.0)

    index = list(range(len(evalue)))
    return (hits, log_eval, d_eval, index)

In [22]:
# glob returns matches in arbitrary order; sort so every run processes the files in the same order.
mylist = sorted(glob.glob("Blast_PDB_XML/*.xml"))

In [23]:
# Map each name (file-name prefix before the first "_") to the tuple returned by dataExtraction.
datalist_PDB = {}
for file in progressbar(mylist, "Extracting parameters: ", 40):
    # basename() instead of split("/")[1] so nested or Windows-style paths still work.
    name = os.path.basename(file).split("_")[0]
    datalist_PDB[name] = dataExtraction(file)

Extracting parameters: [########################################] 13/13


In [24]:
# glob returns matches in arbitrary order; sort so every run processes the files in the same order.
mylist2 = sorted(glob.glob("Blast_XML/*.xml"))

In [25]:
# Map each name (file-name prefix before the first "_") to the tuple returned by dataExtraction.
datalist_Swiss = {}
for file in progressbar(mylist2, "Extracting parameters: ", 40):
    # basename() instead of split("/")[1] so nested or Windows-style paths still work.
    name = os.path.basename(file).split("_")[0]
    datalist_Swiss[name] = dataExtraction(file)

Extracting parameters: [########################################] 13/13


In [26]:
# Show the first two hit titles stored for ALK1 in datalist_Swiss (tuple element 0 is the list of titles).
datalist_Swiss["ALK1"][0][:2]

['sp|P37023.2| RecName: Full=Serine/threonine-protein kinase receptor R3; Short=SKR3; AltName: Full=Activin receptor-like kinase 1; Short=ALK-1; AltName: Full=TGF-B superfamily receptor type I; Short=TSR-I; Flags: Precursor [Homo sapiens]',
 'sp|Q5RAN0.1| RecName: Full=Serine/threonine-protein kinase receptor R3; Short=SKR3; AltName: Full=Activin receptor-like kinase 1; Short=ALK-1; Flags: Precursor [Pongo abelii]']

In [27]:
# Show the first two hit titles stored for ALK1 in datalist_PDB (tuple element 0 is the list of titles).
datalist_PDB["ALK1"][0][:2]

['gi|301015950|pdb|3MY0|A Chain A, Crystal Structure Of The Acvrl1 (alk1) Kinase Domain Bound To Ldn- 193189 >gi|301015951|pdb|3MY0|B Chain B, Crystal Structure Of The Acvrl1 (alk1) Kinase Domain Bound To Ldn- 193189 >gi|301015952|pdb|3MY0|C Chain C, Crystal Structure Of The Acvrl1 (alk1) Kinase Domain Bound To Ldn- 193189 >gi|301015953|pdb|3MY0|D Chain D, Crystal Structure Of The Acvrl1 (alk1) Kinase Domain Bound To Ldn- 193189 >gi|301015954|pdb|3MY0|E Chain E, Crystal Structure Of The Acvrl1 (alk1) Kinase Domain Bound To Ldn- 193189 >gi|301015955|pdb|3MY0|F Chain F, Crystal Structure Of The Acvrl1 (alk1) Kinase Domain Bound To Ldn- 193189 >gi|301015956|pdb|3MY0|G Chain G, Crystal Structure Of The Acvrl1 (alk1) Kinase Domain Bound To Ldn- 193189 >gi|301015957|pdb|3MY0|H Chain H, Crystal Structure Of The Acvrl1 (alk1) Kinase Domain Bound To Ldn- 193189 >gi|301015958|pdb|3MY0|I Chain I, Crystal Structure Of The Acvrl1 (alk1) Kinase Domain Bound To Ldn- 193189 >gi|301015959|pdb|3MY0|J Ch

In [62]:
def plotting_V2(dataset1,dataset2):
    """Export one PNG per name comparing the E-value profiles of two datasets.

    Parameters
    ----------
    dataset1 : dict
        Maps a name to ``(hits, log_eval, d_eval, index)``. Its curves are
        drawn with the "SwissProt" legend labels.
    dataset2 : dict
        Same layout as ``dataset1``. Its curves are drawn with the "PDB"
        legend labels.

    Raises
    ------
    ValueError
        If the two datasets do not contain exactly the same names.

    Notes
    -----
    Both profiles are cut to the length of the shorter one. Names for which
    either profile is empty are skipped. Figures are written to
    ``Plots/Comparative/plot_<name>.png``; the directory is created if needed.
    """
    names1 = dataset1.keys()
    names2 = dataset2.keys()

    if names1 != names2:
        raise ValueError("No identical data")

    # Make sure the output directory exists before the first export.
    os.makedirs("Plots/Comparative", exist_ok=True)

    for name in progressbar(names1,prefix="Plotting :"):

        # The hit titles (first tuple element) are not needed for plotting.
        _, log_eval1, d_eval1, index1 = dataset1[name]
        _, log_eval2, d_eval2, index2 = dataset2[name]

        # All columns share one ColumnDataSource, so cut both profiles to the
        # shorter one (explicit length comparison instead of comparing lists).
        n_common = min(len(index1), len(index2))
        if n_common == 0:
            # Nothing to draw, and min()/max() below would fail on empty lists.
            continue

        data = {"index_Swiss": index1[:n_common],
                "index_PDB": index2[:n_common],
                "evalue_SwissProt": log_eval1[:n_common],
                "derivate_SwissProt": d_eval1[:n_common],
                "evalue_PDB": log_eval2[:n_common],
                "derivate_PDB": d_eval2[:n_common]}

        source = ColumnDataSource(data=data)

        #Generating plot

        # Common range of both derivative curves for the secondary y-axis.
        minY = min(min(data["derivate_SwissProt"]), min(data["derivate_PDB"]))
        maxY = max(max(data["derivate_SwissProt"]), max(data["derivate_PDB"]))

        p = figure(plot_height=600, plot_width=800,title="Comparative between PDB and Swissprot for %s"%(name), title_location="above",
                   toolbar_location=None)

        #Adding titles and labels
        p.title.align = "center"
        p.xaxis.axis_label = "Sequence Index"
        p.yaxis.axis_label = "E-Value (log)"

        #Plotting lines
        p.extra_y_ranges = {"linear": Range1d(start=minY-10, end=maxY+10)}
        # Show the scale of the derivative curves; without this axis the
        # "linear" range is used for drawing but never displayed.
        p.add_layout(LinearAxis(y_range_name="linear", axis_label="1st derivative of log E-value"), "right")
        p.line(x="index_Swiss", y="derivate_SwissProt", source=source, line_width=2, line_color="red", y_range_name="linear",
               line_dash='dashed',legend="1st Derivate (SwissProt)")
        p.line(x="index_Swiss", y="evalue_SwissProt", source=source, line_width=2, line_color="blue",legend="E-Value (SwissProt)")
        p.line(x="index_PDB", y="derivate_PDB", source=source, line_width=2, line_color="orange",
               y_range_name="linear", line_dash='dashed',legend="1st Derivate (PDB)")
        p.line(x="index_PDB", y="evalue_PDB", source=source, line_width=2, line_color="green",legend="E-Value (PDB)")
        export_png(p, filename="Plots/Comparative/plot_{0}.png".format(name))

In [63]:
# plotting_V2 labels its first argument "SwissProt" and its second "PDB", so pass them in that order.
plotting_V2(datalist_Swiss,datalist_PDB)

Plotting :[############################################################] 13/13


In [79]:
def plotting_V2_cutoff(dataset1,dataset2,min_gap=100):
    """Export one PNG per name, cropped on the x-axis at a derivative-based cutoff.

    For each profile the cutoff candidate is the first local maximum of the
    derivative that lies at least ``min_gap`` positions after the global
    maximum of that derivative. The smaller of the two candidates is used.
    If a profile has no such local maximum, its candidate is the full
    (common) length, i.e. that profile does not shorten the plot.

    Parameters
    ----------
    dataset1 : dict
        Maps a name to ``(hits, log_eval, d_eval, index)``. Its curves are
        drawn with the "SwissProt" legend labels.
    dataset2 : dict
        Same layout as ``dataset1``. Its curves are drawn with the "PDB"
        legend labels.
    min_gap : int
        Minimum distance between the global maximum of the derivative and an
        accepted local maximum. Defaults to 100.

    Raises
    ------
    ValueError
        If the two datasets do not contain exactly the same names.

    Notes
    -----
    Both profiles are cut to the length of the shorter one. Names for which
    either profile is empty are skipped. Figures are written to
    ``Plots/Comparative/plot_<name>_cutoff.png``; the directory is created
    if needed.
    """
    def first_peak_after_max(d_eval):
        # Position of the first local maximum at least min_gap after the
        # global maximum; len(d_eval) when there is none.
        top = d_eval.index(max(d_eval))
        # Start at >= 1 and stop before the last element so both neighbours
        # exist (no wrap-around through index -1); positions are absolute,
        # so repeated values cannot point to an earlier occurrence.
        for pos in range(max(top + min_gap, 1), len(d_eval) - 1):
            if d_eval[pos - 1] < d_eval[pos] > d_eval[pos + 1]:
                return pos
        return len(d_eval)

    names1 = dataset1.keys()
    names2 = dataset2.keys()

    if names1 != names2:
        raise ValueError("No identical data")

    # Make sure the output directory exists before the first export.
    os.makedirs("Plots/Comparative", exist_ok=True)

    for name in progressbar(names1,prefix="Plotting :"):

        # The hit titles (first tuple element) are not needed for plotting.
        _, log_eval1, d_eval1, index1 = dataset1[name]
        _, log_eval2, d_eval2, index2 = dataset2[name]

        # All columns share one ColumnDataSource, so cut both profiles to the
        # shorter one (explicit length comparison instead of comparing lists).
        n_common = min(len(index1), len(index2))
        if n_common == 0:
            # Nothing to draw, and max()/min() below would fail on empty lists.
            continue

        index1, index2 = index1[:n_common], index2[:n_common]
        log_eval1, log_eval2 = log_eval1[:n_common], log_eval2[:n_common]
        d_eval1, d_eval2 = d_eval1[:n_common], d_eval2[:n_common]

        data = {"index_Swiss": index1,
                "index_PDB": index2,
                "evalue_SwissProt": log_eval1,
                "derivate_SwissProt": d_eval1,
                "evalue_PDB": log_eval2,
                "derivate_PDB": d_eval2}

        source = ColumnDataSource(data=data)

        # Computed fresh for every name, so no value can leak in from a
        # previous loop iteration.
        cutoff_val = min(first_peak_after_max(d_eval1), first_peak_after_max(d_eval2))

        #Generating plot

        # Common range of both derivative curves for the secondary y-axis.
        minY = min(min(d_eval1), min(d_eval2))
        maxY = max(max(d_eval1), max(d_eval2))

        p = figure(plot_height=600, plot_width=800,title="Comparative between PDB and Swissprot for {0} with cutoff at {1}".format(name,cutoff_val),
                   title_location="above", x_range=Range1d(start=0, end=cutoff_val),
                   toolbar_location=None)

        #Adding titles and labels
        p.title.align = "center"
        p.xaxis.axis_label = "Sequence Index"
        p.yaxis.axis_label = "E-Value (log)"
        #Filtering: keep only the rows in front of the cutoff
        view1 = CDSView(source=source, filters=[BooleanFilter([i < cutoff_val for i in index1])])
        view2 = CDSView(source=source, filters=[BooleanFilter([i < cutoff_val for i in index2])])
        #Plotting lines
        p.extra_y_ranges = {"linear": Range1d(start=minY-10, end=maxY+10)}
        # Show the scale of the derivative curves; without this axis the
        # "linear" range is used for drawing but never displayed.
        p.add_layout(LinearAxis(y_range_name="linear", axis_label="1st derivative of log E-value"), "right")

        p.line(x="index_Swiss", y="derivate_SwissProt", source=source,view=view1, line_width=2,
               line_color="red", y_range_name="linear", line_dash='dashed',legend="1st Derivate (SwissProt)")
        p.line(x="index_Swiss", y="evalue_SwissProt", source=source,view=view1, line_width=2, line_color="blue",legend="E-Value (SwissProt)")

        p.line(x="index_PDB", y="derivate_PDB", source=source,view=view2, line_width=2,
               line_color="purple", y_range_name="linear", line_dash='dashed',legend="1st Derivate (PDB)")
        p.line(x="index_PDB", y="evalue_PDB", source=source,view=view2, line_width=2, line_color="green",legend="E-Value (PDB)")
        export_png(p, filename="Plots/Comparative/plot_{0}_cutoff.png".format(name))

In [80]:
# plotting_V2_cutoff labels its first argument "SwissProt" and its second "PDB", so pass them in that order.
plotting_V2_cutoff(datalist_Swiss,datalist_PDB)

Plotting :[############################################################] 13/13
