# Protein sequence library builder for ACVR and partners

## List of target proteins

* ALK1
* ALK2
* ALK3
* ALK4
* ALK5
* ALK6
* ALK7
* BMPR2
* ActR-IIA
* ActR-IIB
* TGFR2
* AMH-RII
* EGFR (control) 

In [70]:
# Environment init
from Bio.Blast import NCBIWWW, NCBIXML
from Bio import SeqIO, AlignIO, ExPASy
from Bio.Align.Applications import MafftCommandline as mafft
from bokeh.models import ColumnDataSource, CDSView, BooleanFilter
from bokeh.models import Title, LinearAxis, Range1d
from bokeh.layouts import gridplot
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
import numpy as np
import time, os
output_notebook()

In [16]:
# Target display name -> accession passed to ExPASy.get_sprot_raw.
# Insertion order is the order in which targets are downloaded and BLASTed.
# NOTE(review): the accessions were not verified here — confirm each one
# points to the intended entry/species before relying on the results.
target_handle = dict(
    [
        ("ALK1", "P37023"),
        ("ALK2", "Q04771"),
        ("ALK3", "P36894"),
        ("ALK4", "P36896"),
        ("ALK5", "P36897"),
        ("ALK6", "Q05438"),
        ("ALK7", "Q8NER5"),
        ("BMPR2", "Q13873"),
        ("ActR-IIA", "Q7SXW6"),
        ("ActR-IIB", "Q56A35"),
        ("TGFR2", "P37173"),
        ("AMH-RII", "Q16671"),
        ("EGFR", "P00533"),
    ]
)

In [21]:
# Download every target record and store it as Targets/<name>.fasta.
# exist_ok=True tolerates only "directory already exists"; any other failure
# (e.g. permissions) is raised instead of being silently swallowed.
os.makedirs("Targets", exist_ok=True)
for target, accession in target_handle.items():
    print("Now getting",target,"sequence file")
    filename = "Targets/"+ target + ".fasta"
    handle = ExPASy.get_sprot_raw(accession)
    try:
        # The record is fetched in Swiss-Prot text format and parsed as a single entry.
        temp = SeqIO.read(handle, "swiss")
    finally:
        # Always release the network handle, even if parsing fails.
        handle.close()
    print("Writing",target,"fasta file")
    SeqIO.write(temp, filename, "fasta")
    print("Done!")
    time.sleep(0.5) # Short pause between requests so we do not hammer the server

Now getting ALK1 sequence file
Writing ALK1 fasta file
Done!
Now getting ALK2 sequence file
Writing ALK2 fasta file
Done!
Now getting ALK3 sequence file
Writing ALK3 fasta file
Done!
Now getting ALK4 sequence file
Writing ALK4 fasta file
Done!
Now getting ALK5 sequence file
Writing ALK5 fasta file
Done!
Now getting ALK6 sequence file
Writing ALK6 fasta file
Done!
Now getting ALK7 sequence file
Writing ALK7 fasta file
Done!
Now getting BMPR2 sequence file
Writing BMPR2 fasta file
Done!
Now getting ActR-IIA sequence file
Writing ActR-IIA fasta file
Done!
Now getting ActR-IIB sequence file
Writing ActR-IIB fasta file
Done!
Now getting TGFR2 sequence file
Writing TGFR2 fasta file
Done!
Now getting AMH-RII sequence file
Writing AMH-RII fasta file
Done!
Now getting EGFR sequence file
Writing EGFR fasta file
Done!


In [95]:
# Run a remote blastp search against SwissProt for every target and cache the
# raw XML under Blast_XML/<name>_blast.xml.
target_name = target_handle.keys()
rewrite=True #Keep False if not needed: True re-runs BLAST even when a result file exists
tol=0.01 #Evalue of e-2
hitlist = 500 #Default is 50
# Make sure the output directory exists before the first result is written.
os.makedirs("Blast_XML", exist_ok=True)
for target in target_name:
    filename ="Targets/"+target+".fasta"
    result_filename = "Blast_XML/" + target + "_blast.xml"
    cached = os.path.exists(result_filename)
    if cached and not rewrite:
        print("Results for %s were found, skiping..." %(target))
    else:
        print("Running BLAST against SwissProt using %s as query" %(target))
        with open(filename) as file:
            query = file.read()
        handle = NCBIWWW.qblast("blastp","swissprot",query,
                                expect=tol,hitlist_size=hitlist)
        try:
            action = "Rewriting" if cached else "Saving"
            print("%s BLAST results of %s" %(action, target))
            # Context manager guarantees the XML is flushed and the file closed.
            with open(result_filename,"w") as result_file:
                result_file.write(handle.read())
        finally:
            handle.close()
        print("Done!")
    # Pause between targets (also after a skip, as before).
    time.sleep(0.5)

Running BLAST against SwissProt using ALK1 as query
Rewriting BLAST results of ALK1
Done!
Running BLAST against SwissProt using ALK2 as query


KeyboardInterrupt: 

In [25]:
# Ad-hoc check: does the BLAST result file named by `result_filename` exist?
# `result_filename` is assigned inside the BLAST loop, so its value depends on
# which target that loop processed last.
os.path.exists(result_filename)

In [61]:
# Load the saved ALK1 BLAST result and collect the E-value of every HSP.
# The context manager closes the XML file once the record has been parsed.
with open("Blast_XML/ALK1_blast.xml") as result_handle:
    record = NCBIXML.read(result_handle)

# One entry per HSP (an alignment may contribute several), in report order.
evalue = [hsp.expect for aln in record.alignments for hsp in aln.hsps]

In [62]:
# Number of collected E-values: one entry per HSP, so this can exceed the number of alignments.
len(evalue)

503

In [91]:
# Plot the natural log of each HSP E-value together with its first difference,
# once for all hits and once restricted to the first 100 hits.

# An E-value of exactly 0 has no logarithm; -400 is used as a stand-in floor.
log_eval = [-400 if e == 0 else np.log(e) for e in evalue]

# First difference of the log E-values. The first difference is repeated at the
# front so the series has the same length as log_eval; with a single value
# there is no difference to repeat, so 0.0 is used, and with no values the
# series stays empty.
d_eval = list(np.diff(log_eval))
if log_eval:
    d_eval.insert(0, d_eval[0] if d_eval else 0.0)

index1 = list(range(len(evalue)))
data = {
    "index": index1,
    "evalue": log_eval,
    "derivate": d_eval
}
source = ColumnDataSource(data=data)

# Left panel: full hit list.
p = figure(plot_height=300, plot_width=300,
           title="E-value (log)", title_location="left")
p.title.align = "center"
p.add_layout(Title(text="Sequence Index", align="center"), "below")
p.add_layout(Title(text="ALK1 Blast against SwissProt", align="center"), "above")
# The derivative is drawn against its own y-range so it does not flatten the E-value curve.
p.extra_y_ranges = {"linear": Range1d(start=-50, end=150)}
# Optional: uncomment to show an axis for the derivative range.
#p.add_layout(LinearAxis(y_range_name="linear"), 'left')
p.line(x="index",y="derivate",source=source, line_width=1, line_color="red",y_range_name="linear")
p.line(x="index",y="evalue",source=source, line_width=2)

# Right panel: same data, limited to hits with index below 100.
p_filtered =figure(plot_height=300, plot_width=300,
                  x_range=Range1d(start=0, end=100))
booleans = [hit_index < 100 for hit_index in source.data['index']]
view = CDSView(source=source, filters=[BooleanFilter(booleans)])
p_filtered.extra_y_ranges = {"linear": Range1d(start=-50, end=150)}
p_filtered.line(x="index",y="derivate",source=source, line_width=1, line_color="red", view=view, y_range_name="linear" )
p_filtered.line(x="index", y="evalue", source=source, view=view, line_width=2)
show(gridplot([[p, p_filtered]]))

In [93]:
# Load the saved ALK2 BLAST result and collect the E-value of every HSP.
# The context manager closes the XML file once the record has been parsed.
with open("Blast_XML/ALK2_blast.xml") as result_handle:
    record = NCBIXML.read(result_handle)

# One entry per HSP (an alignment may contribute several), in report order.
evalue = [hsp.expect for aln in record.alignments for hsp in aln.hsps]

In [94]:
# Number of HSP E-values parsed from the ALK2 result file.
# NOTE(review): a count far below the ALK1 one would suggest this XML was
# produced with a different hitlist_size — confirm whether the ALK2 BLAST
# needs to be re-run with the current settings.
len(evalue)

50