# Template preparation for homology modelling

In [1]:
import glob
import os
import subprocess
import sys
import time
from io import StringIO

import numpy as np
import prody as pd
from Bio import SeqIO, AlignIO
from Bio.Blast import NCBIXML
pd.confProDy(verbosity='none')

In [2]:
#Progress bar function
def progressbar(it, prefix="", size=60, file=sys.stdout):
    """Yield the items of ``it`` while drawing a text progress bar.

    Parameters
    ----------
    it : sized iterable
        Collection to iterate over; ``len(it)`` is used as the total.
    prefix : str
        Text written in front of the bar.
    size : int
        Width of the bar in characters.
    file : file-like
        Stream the bar is written to (anything with ``write`` and ``flush``).

    Yields
    ------
    The items of ``it`` in order. The bar is redrawn after each item has
    been consumed, and a newline is written once the iteration is finished.
    """
    count = len(it)

    def show(j):
        # An empty collection has nothing to fill in; without this guard the
        # division below raises ZeroDivisionError before the first item.
        x = int(size*j/count) if count else 0
        # "\r" returns to the start of the line so the next call overwrites it.
        file.write("%s[%s%s] %i/%i\r" % (prefix, "#"*x, "."*(size-x), j, count))
        file.flush()

    show(0)
    for i, item in enumerate(it):
        yield item
        show(i+1)
    file.write("\n")
    file.flush()

In [18]:
# Protein to model. The name is substituted into the BLAST XML path, the
# target FASTA path and the Templates/ sub-folder used by the cells below.
Target = "EGFR"

In [18]:
def Template_Getter(blastfile):
    """Download the top PDB templates listed in a BLAST XML report.

    The E-value of the first HSP of every hit is converted to its natural
    logarithm (an E-value of exactly 0 is mapped to -420). The cutoff is the
    position of the first positive jump in log(E-value) that occurs before
    the largest jump; when no positive jump precedes the largest one, the
    position of the largest jump is used instead. The identifiers of the
    first ``cutoff + 2`` hits are then fetched with ProDy into
    ``Templates/<Target>``.

    Parameters
    ----------
    blastfile : str
        Path to the BLAST XML report, parsed with ``NCBIXML.read``.

    Raises
    ------
    ValueError
        If the report contains no alignments.

    Notes
    -----
    The output folder name comes from the notebook-level ``Target`` variable.
    The identifier of a hit is the fourth ``|``-separated field of its title.
    """
    # The context manager closes the report once it has been parsed.
    with open(blastfile) as result_handle:
        record = NCBIXML.read(result_handle)

    alignments = record.alignments
    if not alignments:
        raise ValueError("No alignments found in {0}".format(blastfile))

    evalue = [aln.hsps[0].expect for aln in alignments]
    # Calculating log of E-value; log(0) is undefined, so 0 is mapped to -420.
    log_eval = [-420 if e == 0 else np.log(e) for e in evalue]

    if len(log_eval) > 1:
        # Difference of consecutive log E-values (1st derivative). The first
        # value is duplicated so the list has one entry per hit.
        d_eval = list(np.diff(log_eval))
        d_eval.insert(0, d_eval[0])
        maximun = d_eval.index(max(d_eval))
        # First positive jump before the largest one; fall back to the
        # largest jump itself when nothing positive precedes it.
        cutoff = next(
            (pos for pos, step in enumerate(d_eval[:maximun]) if step > 0),
            maximun,
        )
    else:
        # A single hit has no jumps to analyse.
        cutoff = 0

    folder = "Templates/{0}".format(Target)
    # Creates missing parent folders too and is a no-op if the folder exists.
    os.makedirs(folder, exist_ok=True)

    # NOTE(review): this keeps the hits at positions 0..cutoff+1, i.e. up to
    # one position past the cutoff index, exactly as the original loop did.
    # Confirm that this is the intended selection.
    ids = [aln.title.split("|")[3] for aln in alignments[:cutoff + 2]]

    for pdb_id in progressbar(ids, "Downloading:", 40):
        pd.fetchPDB(pdb_id, compressed=False, folder=folder)

In [19]:
# Select and download the templates listed in the target's BLAST XML report.
Template_Getter("../Library/Blast_PDB_XML/{0}_blast.xml".format(Target))

Downloading:[########################################] 86/86


In [64]:
def Template_Cleaner(blastfile, template_dir):
    """Write a single-chain copy of every downloaded template.

    The BLAST XML report is used to look up, for each hit identifier, the
    chain named in the hit title. Every ``*.pdb`` file in ``template_dir``
    whose upper-cased base name matches an identifier is parsed with ProDy,
    indexed by that chain, and written to
    ``<template_dir>/Clean/<id><chain>.pdb``.

    Parameters
    ----------
    blastfile : str
        Path to the BLAST XML report, parsed with ``NCBIXML.read``.
    template_dir : str
        Folder containing the downloaded template ``.pdb`` files.
    """
    # The context manager closes the report once it has been parsed.
    with open(blastfile) as result_handle:
        record = NCBIXML.read(result_handle)

    data = {}
    for aln in progressbar(record.alignments, "Data Extraction:", 40):
        fields = aln.title.split("|")
        tag = fields[3]
        chain = fields[4].split(" ")[0]
        # NOTE(review): an identifier that appears in several hits keeps the
        # chain of its last hit. Confirm that this is intended.
        data[tag] = chain

    # Make sure the output folder exists before anything is written to it.
    out_dir = os.path.join(template_dir, "Clean")
    os.makedirs(out_dir, exist_ok=True)

    template_list = glob.glob("{0}/*.pdb".format(template_dir))
    for template in progressbar(template_list, "Cleaning:", 40):
        # basename works for any depth of template_dir, unlike a fixed
        # "/"-split index.
        tag = os.path.basename(template).split(".")[0].upper()
        if tag in data:
            chain = data[tag]
            full = pd.parsePDB(template)
            clean = full[chain]
            pd.writePDB(os.path.join(out_dir, tag.lower() + chain + ".pdb"), clean)

In [65]:
# Write the single-chain copies of the downloaded templates into Templates/<Target>/Clean.
Template_Cleaner("../Library/Blast_PDB_XML/{0}_blast.xml".format(Target),"Templates/{0}".format(Target))

Data Extraction:[########################################] 2000/2000
Cleaning:[########################################] 86/86


In [3]:
# Folder holding the single-chain template files written by Template_Cleaner.
# Derived from Target so that it follows the protein being modelled.
clean_dir = "Templates/{0}/Clean/".format(Target)

In [4]:
# Three-letter residue name -> one-letter code.
code3b = {
    "GLY": "G",
    "ALA": "A",
    "LEU": "L",
    "ILE": "I",
    "ARG": "R",
    "LYS": "K",
    "MET": "M",
    "CYS": "C",
    "TYR": "Y",
    "THR": "T",
    "PRO": "P",
    "SER": "S",
    "TRP": "W",
    "ASP": "D",
    "GLU": "E",
    "ASN": "N",
    "GLN": "Q",
    "PHE": "F",
    "HIS": "H",
    "VAL": "V",
    # Unknown residues are written as X.
    "UNK": "X",
    # Additional residue names that share the one-letter code of HIS.
    "HSE": "H",
    "HSD": "H",
}

In [5]:
# Paths of every .pdb file located directly inside clean_dir.
clean_pattern = "{0}*.pdb".format(clean_dir)
clean_template_list = glob.glob(clean_pattern)

In [11]:
# Folder for the per-template FASTA files. exist_ok keeps the cell safe to
# re-run once the folder is already there.
os.makedirs(os.path.join(clean_dir, "Sequence"), exist_ok=True)

In [15]:
# Write one FASTA file per cleaned template. The header is the file's base
# name and the sequence is built from the residue names found in the file.
sequence_dir = os.path.join(clean_dir, "Sequence")
for pdb_file in progressbar(clean_template_list, "Extracing Sequence:", 40):
    pdb = pd.parsePDB(pdb_file)
    namepdb = os.path.split(pdb_file)[-1].split(".")[0]
    # Residues whose name is not a key of code3b contribute nothing to the
    # sequence (same result as the former try/except, without a bare except).
    protAA = "".join(code3b.get(res.getResname(), "") for res in pdb.iterResidues())
    with open(os.path.join(sequence_dir, namepdb + ".fa"), "w") as handle:
        handle.write(">"+namepdb+"\n"+protAA+"\n")

Extracing Sequence:[########################################] 86/86


In [17]:
# Concatenate every per-template FASTA file into cleanTemplate.fa.
# Done in Python instead of `cat *.fa` so that a cleanTemplate.fa left over
# from an earlier run is never read back into itself, and so the cell does
# not depend on a shell.
sequence_dir = os.path.join(clean_dir, "Sequence")
combined_fasta = os.path.join(sequence_dir, "cleanTemplate.fa")
template_fastas = sorted(
    path
    for path in glob.glob(os.path.join(sequence_dir, "*.fa"))
    if os.path.basename(path) != "cleanTemplate.fa"
)
with open(combined_fasta, "w") as combined_handle:
    for path in template_fastas:
        with open(path) as fasta_handle:
            combined_handle.write(fasta_handle.read())

In [25]:
## Append Target Sequence to clean template file

combined_fasta = os.path.join(clean_dir, "Sequence", "cleanTemplate.fa")

# Read the target record first; the context manager closes the file.
with open("../Sequences/{0}.fasta".format(Target)) as target_file:
    target_record = target_file.read()

# "a+" allows the existing content to be read before appending, so running
# this cell twice does not add the target sequence a second time.
with open(combined_fasta, "a+") as handle:
    handle.seek(0)
    if target_record not in handle.read():
        handle.write(target_record)

In [28]:
# Align the templates plus the target with MAFFT. The program is called
# through subprocess because the Bio.Align.Applications command-line wrappers
# are deprecated in Biopython.
alignment_input = os.path.join(clean_dir, "Sequence", "cleanTemplate.fa")
mafft_cmd = ["mafft", alignment_input]
print(" ".join(mafft_cmd))
# check=True raises CalledProcessError if MAFFT exits with a non-zero status.
mafft_run = subprocess.run(
    mafft_cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    universal_newlines=True,
    check=True,
)
stdout, stderr = mafft_run.stdout, mafft_run.stderr
# The captured standard output is parsed as a FASTA alignment.
align = AlignIO.read(StringIO(stdout), "fasta")
AlignIO.write(align, "salida_clustal.aln", "clustal")
print(align)

mafft Templates/EGFR/Clean/Sequence/cleanTemplate.fa
SingleLetterAlphabet() alignment with 87 rows and 1243 columns
-------------------------EEKKVCQGTSNKLTQLGTF...--- 1ivoA
--------------------------------------------...--- 1m14A
----------------------------AVCP------------...--- 1m6bA
------------------------LEEKKVCQGTSNKLTQLGTF...--- 1moxA
---------------------------TQVCTGTDMKLRLPASP...--- 1n8yC
---------------------------TQVCTGTDMKLRLPASP...--- 1n8zC
--------------------------EKKVCQGTSNKLTQLGTF...--- 1nqlA
---------------------------TQVCTGTDMKLRLPASP...--- 1s78A
--------------------------------------------...--- 1xkkA
-------------------------EEKKVCQGTSNKLTQLGTF...--- 1yy9A
---------------------------QSVCAGTENKLSSLSDL...--- 2ahxA
--------------------------------------------...--- 2eb2A
--------------------------------------------...--- 2eb3A
--------------------------------------------...--- 2gs2A
--------------------------------------------...--- 2gs7A
-----------------------------

In [29]:
# Save the same alignment object in FASTA format as well.
AlignIO.write(align, "salida_clustal.fa", "fasta")

1