# Protein template library builder for AVCR and partners

In [1]:
#Enviroment init
from Bio.Blast import NCBIWWW, NCBIXML
from Bio import SeqIO, AlignIO
from Bio.Align.Applications import MafftCommandline as mafft
from bokeh.models import ColumnDataSource, CDSView, BooleanFilter
from bokeh.models import Title, LinearAxis, Range1d
from bokeh.layouts import gridplot
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
import prody as pd
import numpy as np
import time, sys, os, glob

In [2]:
#Progress bar function
def progressbar(it, prefix="", size=60, file=sys.stdout):
    count = len(it)
    def show(j):
        x = int(size*j/count)
        file.write("%s[%s%s] %i/%i\r" % (prefix, "#"*x, "."*(size-x), j, count))
        file.flush()        
    show(0)
    for i, item in enumerate(it):
        yield item
        show(i+1)
    file.write("\n")
    file.flush()

In [3]:
#bokeh init
output_notebook()

In [4]:
target_handle ={
    "ALK1":"P37023",
    "ALK2":"Q04771",
    "ALK3":"P36894",
    "ALK4":"P36896",
    "ALK5":"P36897",
    "ALK6":"Q05438",
    "ALK7":"Q8NER5",
    "BMPR2":"Q13873",
    "ActR-IIA":"Q7SXW6",
    "ActR-IIB":"Q56A35",
    "TGFR2":"P37173",
    "AMH-RII":"Q16671",
    "EGFR":"P00533"
}

In [5]:
target_name = target_handle.keys()
rewrite = False #Keep False if not needed
tol = 1e2
hitlist = 2000 #Default is 50
database = "pdb"
try:
    os.mkdir("Blast_PDB_XML")
except:
    pass

for target in progressbar(target_name, "Running BLASTP: ", 40):
    filename ="Targets/"+target+".fasta"
    result_filename = "Blast_PDB_XML/" + target + "_blast.xml"
    if os.path.exists(result_filename):
        if rewrite:
            with open(filename) as file:
                query =file.read()
                handle = NCBIWWW.qblast("blastp",database,query,
                                        expect=tol, hitlist_size=hitlist)
                print("Rewriting BLAST results of %s" %(target))
                open(result_filename,"w+").write(handle.read())
                print("Done!") 
    else:
        with open(filename) as file:
            query =file.read()
            handle = NCBIWWW.qblast("blastp",database,query,
                                    expect=tol, hitlist_size=hitlist)
            open(result_filename,"w+").write(handle.read())
    time.sleep(0.5)

Running BLASTP: [########################################] 13/13


In [6]:
mylist = [f for f in glob.glob("Blast_PDB_XML/*.xml")]

In [8]:
os.mkdir("Plots/PDB")

In [9]:
def BLASTGraphicEval(blastfile,cutoff=False,cutoff_val=50):
    from Bio.Blast import NCBIXML
    from bokeh.models import ColumnDataSource, CDSView, BooleanFilter
    from bokeh.models import Title, LinearAxis, Range1d
    from bokeh.plotting import figure, show
    from bokeh.io import export_png
    import numpy as np
    
    #Extracting Evalues
    result_handle = open(blastfile)
    record = NCBIXML.read(result_handle)

    evalue=[]
    for aln in record.alignments:
        hsp = aln.hsps[0] #just first hsp to avoid multiple hit scanning
        val = hsp.expect
        evalue.append(hsp.expect)
    
    #Calulating log of Evalue
    log_eval=[]
    for i in evalue:
        if i == 0:
            log_eval.append(-420) #Arbitrary value, because log(0) is undefined
        else:
            log_eval.append(np.log(i))
    #Calculating Difference of Evalues (1st derivate)
    d_eval= list(np.diff(log_eval))
    d_eval.insert(0,d_eval[0]) #Adding first value as 0 to square up with the index
    index = list(range(len(evalue)))
    #print("Current lenght is",len(evalue))
    #Saving data in bokeh plot data
    data = {
        "index": index,
        "evalue": log_eval,
        "derivate": d_eval
    }
    source = ColumnDataSource(data=data)
    
    #Generating plot
    minY=min(data["derivate"])
    maxY=max(data["derivate"])
    name=blastfile.split("/")[1].split("_")[0]
    if cutoff:
        #Generating plot figure
        p = figure(plot_height=600, plot_width=800,
                   title="{0} Blast against PDB with cutoff at {1}".format(name,cutoff_val), 
                   title_location="above", x_range=Range1d(start=0, end=cutoff_val),
                   toolbar_location=None)
        #Adding titles and labels
        p.title.align = "center"
        p.xaxis.axis_label = "Sequence Index"
        p.yaxis.axis_label = "E-Value (log)"
        #Filtering
        booleans = [True if index < cutoff_val else False for index in source.data['index']]
        view = CDSView(source=source, filters=[BooleanFilter(booleans)])
        #Plotting lines
        p.extra_y_ranges = {"linear": Range1d(start=minY-10, end=maxY+10)}
        p.line(x="index", y="derivate", source=source, line_width=1, line_color="red", view=view, y_range_name="linear", line_dash='dashed',legend="1st Derivate" )
        p.line(x="index", y="evalue", source=source, view=view, line_width=2, line_color="blue",legend="E-Value")
        #Export to file
        export_png(p, filename="Plots/PDB/plot_{}_cutoff.png".format(name))
    else:
        #Generating plot figure
        p = figure(plot_height=600, plot_width=800,
                   title="%s Blast against PDB"%(name), title_location="above",
                   toolbar_location=None)
        #Adding titles and labels
        p.title.align = "center"
        p.xaxis.axis_label = "Sequence Index"
        p.yaxis.axis_label = "E-Value (log)"
        #Plotting lines
        p.extra_y_ranges = {"linear": Range1d(start=minY-10, end=maxY+10)}
        p.line(x="index", y="derivate", source=source, line_width=2, line_color="red", y_range_name="linear", line_dash='dashed',legend="1st Derivate")
        p.line(x="index", y="evalue", source=source, line_width=2, line_color="blue",legend="E-Value")
        #Export to file
        export_png(p, filename="Plots/PDB/plot_{}.png".format(name))

In [10]:
for file in mylist:
    print("Plotting "+file)
    BLASTGraphicEval(file)

Plotting Blast_PDB_XML/ALK7_blast.xml
Plotting Blast_PDB_XML/ALK6_blast.xml
Plotting Blast_PDB_XML/TGFR2_blast.xml
Plotting Blast_PDB_XML/ALK5_blast.xml
Plotting Blast_PDB_XML/ALK4_blast.xml
Plotting Blast_PDB_XML/ActR-IIA_blast.xml
Plotting Blast_PDB_XML/EGFR_blast.xml
Plotting Blast_PDB_XML/ALK2_blast.xml
Plotting Blast_PDB_XML/AMH-RII_blast.xml
Plotting Blast_PDB_XML/ActR-IIB_blast.xml
Plotting Blast_PDB_XML/BMPR2_blast.xml
Plotting Blast_PDB_XML/ALK3_blast.xml
Plotting Blast_PDB_XML/ALK1_blast.xml


In [11]:
def dataExtraction(blastfile):
    result_handle = open(blastfile)
    record = NCBIXML.read(result_handle)

    evalue=[]
    for aln in record.alignments:
        hsp = aln.hsps[0]
        val = hsp.expect
        evalue.append(hsp.expect)
    #Calulating log of Evalue
    log_eval=[]
    for i in evalue:
        if i == 0:
            log_eval.append(-420)
        else:
            log_eval.append(np.log(i))
    #Calculating Difference of Evalues (1st derivate)
    d_eval= list(np.diff(log_eval))
    d_eval.insert(0,d_eval[0])
    index = list(range(len(evalue)))
    return (log_eval,d_eval,index)

In [12]:
datalist = []
for file in progressbar(mylist, "Extracting parameters: ", 40):
    temp = dataExtraction(file)
    datalist.append(temp)

Extracting parameters: [########################################] 13/13


In [13]:
cutoff_list = []
for data in progressbar(datalist, "Extracting parameters: ", 40):
    d_max = max(data[1])
    index = data[1].index(d_max)
    d_data = data[1][index+1:]
    length=len(d_data)-1
    for i in range(length):
        if (d_data[i-1] < d_data[i] and d_data[i+1] < d_data[i]):
            index_d = data[1].index(d_data[i])
            if index_d - index < 100:
                continue
            else:
                break
    cutoff_list.append(index_d)

Extracting parameters: [########################################] 13/13


In [15]:
print(cutoff_list)

[117, 117, 101, 118, 117, 105, 206, 119, 101, 103, 101, 118, 121]


In [16]:
for i in progressbar(range(len(cutoff_list)), "Plotting: ", 40):
    BLASTGraphicEval(mylist[i],cutoff=True,cutoff_val=cutoff_list[i])

Plotting: [########################################] 13/13


In [17]:
os.mkdir("Templates")

In [22]:
def Template_Getter(blastfile,cutoff=50):
    i = 0
    ids = []
    name=blastfile.split("/")[1].split("_")[0]
    try:
        os.mkdir("Templates/{0}".format(name))
    except:
        pass
    result_handle = open(blastfile)
    record = NCBIXML.read(result_handle)
    for alignment in record.alignments:
        for hsp in alignment.hsps:
            temp_str = alignment.title
            tag = temp_str.split("|")[3]
            ids.append(tag)
            if i > cutoff:
                break
            i +=1
    temp = []
    for i in progressbar(ids, "Downloading:", 40):
        pd.fetchPDB(i, compressed=False, folder="Templates/{0}".format(name))

In [None]:
for i in range(len(cutoff_list)):
    print("Working with",mylist[i])
    Template_Getter(mylist[i],cutoff=cutoff_list[i])

Working with Blast_PDB_XML/ALK7_blast.xml
Downloading:[........................................] 0/2000

@> Connecting wwPDB FTP server RCSB PDB (USA).
@> 1b6c downloaded (Templates/ALK7/1b6c.pdb)
@> PDB download via FTP completed (1 downloaded, 0 failed).


Downloading:[........................................] 1/2000

@> Connecting wwPDB FTP server RCSB PDB (USA).
@> 1py5 downloaded (Templates/ALK7/1py5.pdb)
@> PDB download via FTP completed (1 downloaded, 0 failed).


Downloading:[........................................] 2/2000

@> Connecting wwPDB FTP server RCSB PDB (USA).
@> 3tzm downloaded (Templates/ALK7/3tzm.pdb)
@> PDB download via FTP completed (1 downloaded, 0 failed).


Downloading:[........................................] 3/2000

@> Connecting wwPDB FTP server RCSB PDB (USA).
@> 4x2g downloaded (Templates/ALK7/4x2g.pdb)
@> PDB download via FTP completed (1 downloaded, 0 failed).


Downloading:[........................................] 4/2000

@> Connecting wwPDB FTP server RCSB PDB (USA).
@> 2wot downloaded (Templates/ALK7/2wot.pdb)
@> PDB download via FTP completed (1 downloaded, 0 failed).


Downloading:[........................................] 5/2000

@> Connecting wwPDB FTP server RCSB PDB (USA).
