## Virtual screening Dataset

Here we run 3pHLA-score and all the other scoring functions (AutoDock, Vina, Vinardo, DOPE, FoldX and GradDock) on Dataset2, the virtual screening dataset.

In [19]:
import pandas as pd
import numpy as np
import pickle as pk

### Get the data

In [4]:
# Load the virtual screening dataset table (one row per peptide-HLA complex).
vs_dataset = pd.read_csv("../Datasets/virtual_screening_dataset.csv")

In [6]:

def reformat_path(path):
    """Turn an absolute path from the original workstation into a relative one.

    A path that starts with ``/home/anja/Documents/`` has that prefix replaced
    by ``../../``. Any other path is returned unchanged.

    Parameters
    ----------
    path : str
        Path to a structure file as stored in the dataset.

    Returns
    -------
    str
        The rewritten path, or ``path`` itself when the prefix is absent.
    """
    prefix = "/home/anja/Documents/"
    new = "../../"
    # Anchor the match at the start of the string: the slice below removes
    # the first len(prefix) characters, which is only correct when the prefix
    # actually sits there (a mere substring test would mangle other paths).
    if not path.startswith(prefix):
        return path
    return new + path[len(prefix):]


# Rewrite the stored paths with reformat_path so they are relative to this notebook,
# then display the frame.
vs_dataset["path"] = vs_dataset["path"].apply(reformat_path)
vs_dataset

Unnamed: 0.1,Unnamed: 0,allele,peptide,ba,binder,path,allele_type
0,0,A0101,QSDFHNNRY,0.574375,1,../../jayvee_data/singleconf/all_data/A0101-QS...,HLA-A
1,1,A0101,NSELLNDRY,0.574375,1,../../jayvee_data/singleconf/all_data/A0101-NS...,HLA-A
2,2,A0101,GSDYINANY,0.574375,1,../../jayvee_data/singleconf/all_data/A0101-GS...,HLA-A
3,3,A0101,IINESLLFY,0.574375,1,../../jayvee_data/singleconf/all_data/A0101-II...,HLA-A
4,4,A0101,LTEYLSTHY,0.574375,1,../../jayvee_data/singleconf/all_data/A0101-LT...,HLA-A
...,...,...,...,...,...,...,...
33594,37927,B5701,DEEGQDDKD,1.000000,0,../../rdf_mount/decoymulticonf/confs/B5701-DEE...,decoy
33595,37999,B5701,SEISAFKTC,1.000000,0,../../rdf_mount/decoymulticonf/confs/B5701-SEI...,decoy
33596,38016,B5701,KQKQNRPIP,1.000000,0,../../rdf_mount/decoymulticonf/confs/B5701-KQK...,decoy
33597,38031,B5701,QLLSGAHWM,1.000000,0,../../rdf_mount/decoymulticonf/confs/B5701-QLL...,decoy


### Get the features

In [12]:
def pppene_to_array(tmp):
    """Parse the text form of an energy table into a 9x20 float array.

    Parentheses are removed, surrounding square brackets are stripped, spaces
    are dropped and newlines become commas; the resulting comma-separated
    numbers are then read with numpy and reshaped to (9, 20).

    Parameters
    ----------
    tmp : str
        Text holding exactly 180 comma-separated numbers.

    Returns
    -------
    numpy.ndarray
        Float array of shape (9, 20).
    """
    # Step 1: delete every "(" and ")".
    without_parens = tmp.translate(str.maketrans("", "", "()"))
    # Step 2: peel the enclosing brackets (only at the very ends of the text).
    unbracketed = without_parens.strip("[]")
    # Step 3: delete spaces and turn row breaks into commas in a single pass.
    flat_csv = unbracketed.translate(str.maketrans({" ": None, "\n": ","}))
    values = np.fromstring(flat_csv, dtype=float, sep=", ")
    return values.reshape(9, 20)

# Load the featurization table; its "energies" column is stored as text in the CSV
# and is parsed into a (9, 20) float array per row by pppene_to_array.
vs_dataset_feat = pd.read_csv("../Featurization/rosettaPPPEnergies_vsData.csv")
vs_dataset_feat["energies"] = vs_dataset_feat["energies"].apply(pppene_to_array)

In [22]:
# Keep only the columns used downstream. `.copy()` makes the selection an
# independent frame, so adding the "path" column below does not trigger
# pandas' SettingWithCopyWarning (assignment onto a slice of another frame).
vs_dataset_feat = vs_dataset_feat[["allele", "peptide", "binder", "ba", "energies", "total_energy"]].copy()
# The assignment aligns on the row index.
# NOTE(review): assumes both tables list the complexes in the same order - confirm.
vs_dataset_feat["path"] = vs_dataset["path"]
vs_dataset_feat

Unnamed: 0,allele,peptide,binder,ba,energies,total_energy
0,A0101,QSDFHNNRY,1,0.574375,"[[-8.51093845, 15.4041086, 10.38764766, 22.450...",117.505987
1,A0101,NSELLNDRY,1,0.574375,"[[-7.22049579, 0.62790443, 9.04920189, 3.79385...",140.622286
2,A0101,GSDYINANY,1,0.574375,"[[-3.76399482, 1.17818395, 5.40936779, 0.01747...",53.403350
3,A0101,IINESLLFY,1,0.574375,"[[-8.9314899, 18.72980029, 5.33243888, 7.09671...",113.081328
4,A0101,LTEYLSTHY,1,0.574375,"[[-8.04471031, 9.87286781, 6.7829591, 43.73890...",69.983761
...,...,...,...,...,...,...
33594,B5701,DEEGQDDKD,0,1.000000,"[[-7.46551424, 6.65815792, 9.07724956, 0.97602...",86.125001
33595,B5701,SEISAFKTC,0,1.000000,"[[-3.36141123, 4.80272104, 3.70995033, 0.76738...",123.700118
33596,B5701,KQKQNRPIP,0,1.000000,"[[-7.90284005, 3.15168452, 8.07754314, 12.2471...",185.081672
33597,B5701,QLLSGAHWM,0,1.000000,"[[-7.92011332, 2.36616947, 8.00779003, 51.2077...",53.568404


## Score with 3pHLA-score

In [None]:
# Folder of Experiment 1; score_3pHLA looks for "<root_dir>/final_REGRmodels/<allele>ppp.pkl" under it.
root_dir = "../Experiment1 - train ref2015-score, standard-pHLA-score and 3pHLA-score/"

def get_energies(X):
    """Flatten an energy table into the 171-element feature vector.

    The rows are rolled forward by 4, the first 9 rows and first 19 columns
    are kept, the rows are rolled back by 4, and the result is flattened.
    For an input with exactly 9 rows the two rolls cancel out, so this is
    simply ``X[:, :19]`` flattened.

    Parameters
    ----------
    X : array-like, 2-D

    Returns
    -------
    numpy.ndarray
        1-D array of length 9 * 19.
    """
    n_rows, n_cols = 9, 19
    shifted = np.roll(X, 4, axis=0)
    window = shifted[:n_rows, :n_cols]
    restored = np.roll(window, -4, axis=0)
    return restored.reshape(n_rows * n_cols)

# Regression models that have already been unpickled, keyed by allele name.
_MODELS_3PHLA = {}


def _load_3pHLA_model(allele):
    """Return the regression model for `allele`, reading the pickle only once."""
    if allele not in _MODELS_3PHLA:
        model_9n = root_dir+"/final_REGRmodels/"+allele+"ppp.pkl"
        # NOTE: unpickling can execute arbitrary code - only load model files
        # that come from a trusted source.
        with open(model_9n, 'rb') as model_file:
            _MODELS_3PHLA[allele] = pk.load(model_file)
    return _MODELS_3PHLA[allele]


def score_3pHLA(row):
    """Predict the 3pHLA-score for one dataset row.

    The per-allele model stored at
    ``<root_dir>/final_REGRmodels/<allele>ppp.pkl`` is applied to the feature
    vector that `get_energies` builds from ``row["energies"]``.

    Compared with loading the pickle inside every call, the model is cached
    per allele (the dataset has tens of thousands of rows but few alleles),
    the file handle is closed after reading, and the row is no longer printed
    on every call.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide the keys "allele" and "energies".

    Returns
    -------
    The first element of ``model.predict([features])``.
    """
    model_9 = _load_3pHLA_model(row["allele"])
    features = get_energies(row["energies"])
    return model_9.predict([features])[0]

# Distinct allele names in the dataset (not referenced again in the cells visible here).
alleles = vs_dataset_feat["allele"].unique()

# Score every row; score_3pHLA receives one row at a time because axis=1.
vs_dataset_feat["3pHLA-score"] = vs_dataset_feat.apply(score_3pHLA, axis=1)

vs_dataset_feat

In [31]:
# Copy the scores into the original dataset table (aligned on the row index)
# and display the columns of interest.
vs_dataset["3pHLA-score"] = vs_dataset_feat["3pHLA-score"]
vs_dataset[["allele", "peptide", "ba", "binder", "path", "allele_type", "3pHLA-score"]]

Unnamed: 0,allele,peptide,ba,binder,path,allele_type,3pHLA-score
0,A0101,QSDFHNNRY,0.574375,1,../../jayvee_data/singleconf/all_data/A0101-QS...,HLA-A,0.547367
1,A0101,NSELLNDRY,0.574375,1,../../jayvee_data/singleconf/all_data/A0101-NS...,HLA-A,0.489661
2,A0101,GSDYINANY,0.574375,1,../../jayvee_data/singleconf/all_data/A0101-GS...,HLA-A,0.505538
3,A0101,IINESLLFY,0.574375,1,../../jayvee_data/singleconf/all_data/A0101-II...,HLA-A,0.341056
4,A0101,LTEYLSTHY,0.574375,1,../../jayvee_data/singleconf/all_data/A0101-LT...,HLA-A,0.503116
...,...,...,...,...,...,...,...
33594,B5701,DEEGQDDKD,1.000000,0,../../rdf_mount/decoymulticonf/confs/B5701-DEE...,decoy,0.128606
33595,B5701,SEISAFKTC,1.000000,0,../../rdf_mount/decoymulticonf/confs/B5701-SEI...,decoy,0.100223
33596,B5701,KQKQNRPIP,1.000000,0,../../rdf_mount/decoymulticonf/confs/B5701-KQK...,decoy,0.084480
33597,B5701,QLLSGAHWM,1.000000,0,../../rdf_mount/decoymulticonf/confs/B5701-QLL...,decoy,0.085546


In [32]:
# Checkpoint the 3pHLA scores before running the slower scoring functions.
# to_csv also writes the row index as an unnamed first column.
vs_dataset.to_csv("Results2_tmp_3phla.csv")

## Score with Vina, Vinardo, AutoDock

In [None]:
import HLA_Arena as arena
# Initialize scoring
import pandas as pd
import seaborn as sns

def score_vvad(row, func):
    """Rescore one complex with the scoring function named `func`.

    Prints the row, then passes ``row["path"]`` and `func` to
    ``arena.rescore_complex_simple_smina`` and returns whatever that call
    returns.
    """
    print(row)
    return arena.rescore_complex_simple_smina(row["path"], func)
    
# Scoring-function names handed to score_vvad. The loop below makes one full
# pass over the dataset per name and stores the result in a "<name>-score" column.
funcs = ["ad4_scoring", "vina", "vinardo"]

for f in funcs:
    print("scoring with: "+f)
    # apply() runs inside this iteration, so the lambda reads the current value of f.
    vs_dataset_feat[f+"-score"] = vs_dataset_feat.apply(lambda x: score_vvad(x, f), axis=1)

## Score with DOPE

In [None]:
from modeller import *
from modeller.automodel import *
from modeller.scripts import complete_pdb

def score_dope(fpath):
    """Compute DOPE scores for the structure stored at `fpath`.

    The structure is read with ``complete_pdb`` and three selections are
    assessed with ``assess_dope``: chain index 2 alone, chains 0 and 1
    together, and all three chains.
    NOTE(review): the variable names suggest chain 2 is the ligand (peptide)
    and chains 0-1 the receptor - confirm against the input files.

    Parameters
    ----------
    fpath : str
        Path to the PDB file.

    Returns
    -------
    tuple
        ``(score_full, score_diff)`` with
        ``score_diff = score_full - (score_rec + score_lig)``.
        If a DOPE assessment raises, the failure is printed and ``(0, 0)``
        is returned instead.
    """
    print("-----------------------------------------------")
    print("SCORING WITH DOPE")
    print(fpath)
    #score
    env = environ()
    env.libs.topology.read(file='$(LIB)/top_heav.lib')
    env.libs.parameters.read(file='$(LIB)/par.lib')

    mdl = complete_pdb(env, fpath)

    atmsel_lig = selection(mdl.chains[2])
    atmsel_rec = selection(mdl.chains[0], mdl.chains[1])
    atmsel_full = selection(mdl.chains[0], mdl.chains[1], mdl.chains[2])
    try:
        score_lig = atmsel_lig.assess_dope()
        score_rec = atmsel_rec.assess_dope()
        score_full = atmsel_full.assess_dope()
    except Exception as err:
        # Catch Exception rather than everything, so that interrupting the
        # kernel (KeyboardInterrupt) still stops a long run, and report the
        # cause instead of hiding it.
        print("Failed on: "+fpath)
        print(repr(err))
        return (0, 0)

    score_diff = score_full - (score_rec + score_lig)

    return (score_full, score_diff)

# score_dope returns a tuple, so each entry of this column holds (score_full, score_diff).
vs_dataset_feat["dope-score"] = vs_dataset_feat.apply(lambda row: score_dope(row["path"]), axis=1)

## Score with FoldX

In [None]:
# Move the working directory one level up: the FoldX cells below use paths relative
# to it (./script_rescore_FoldX.sh, FoldX_results.csv). Re-running this cell moves up again.
%cd ..

In [None]:
import subprocess

def find_pHLA(file_path):
    """Extract the complex name from a structure path.

    * Decoy conformations live under
      ``.../rdf_mount/decoymulticonf/confs/<name>/full_system_confs/...``;
      the folder name ``<name>`` is returned.
    * For any other path the file name is returned with everything from the
      first ``.pdb`` onwards removed.

    The decoy folder is located by its root-independent part, so the result
    is the same whether the path starts with ``../../`` or has already been
    rewritten to ``/data/`` by ``reformat_path``. (Searching for the literal
    ``../../rdf_mount/...`` prefix fails on rewritten paths and yields a name
    with a leading "/".)

    Parameters
    ----------
    file_path : str

    Returns
    -------
    str
    """
    marker = "rdf_mount/decoymulticonf/confs/"
    suffix = "/full_system_confs"

    marker_pos = file_path.find(marker)
    if marker_pos != -1:
        start = marker_pos + len(marker)
        end = file_path.find(suffix, start)
        if end != -1:
            return file_path[start:end]

    # Single-conformation (or unrecognised) path: use the file name.
    file_name = file_path[file_path.rfind("/")+1:]
    ext_pos = file_name.find(".pdb")
    # Without this guard a missing ".pdb" (find() == -1) would cut off the
    # last character of the name.
    return file_name if ext_pos == -1 else file_name[:ext_pos]
    

def score_FX(row):
    """Run the FoldX rescoring script on the complex described by `row`.

    The structure at ``row["path"]`` is read with ``complete_pdb`` only to
    look up two chain names (chain index 0 and chain index 2), which are
    passed to ``./script_rescore_FoldX.sh`` together with the path and the
    complex name from ``find_pHLA``.
    NOTE(review): the variable names suggest chain 0 is the receptor and
    chain 2 the peptide - confirm against the input files.

    Nothing is returned; the script's return code is printed, followed by an
    error notice when it is non-zero.
    NOTE(review): presumably the script itself appends the energy to
    FoldX_results.csv - confirm in script_rescore_FoldX.sh.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide the key "path".
    """
    print("processing "+row["path"])
    #score
    env = environ()
    env.libs.topology.read(file='$(LIB)/top_heav.lib')
    env.libs.parameters.read(file='$(LIB)/par.lib')

    mdl = complete_pdb(env, row["path"])
    chain_rec = mdl.chains[0].name
    chain_pep = mdl.chains[2].name

    cname = find_pHLA(row["path"])
    command = "./script_rescore_FoldX.sh "+chain_pep+" "+chain_rec+" "+row["path"]+" "+cname
    # subprocess.run drains the stdout pipe while it waits. Popen(...).wait()
    # with stdout=PIPE can block forever once the child fills the pipe buffer.
    completed = subprocess.run(command, shell=True, stdout=subprocess.PIPE)
    ret = completed.returncode
    print(ret)
    if not ret == 0: print("Error with execution")
    print("done with "+row["path"])
    
def reformat_path(path):
    """Rewrite a notebook-relative path to its location under ``/data/``.

    A path that starts with ``../../`` has that prefix replaced by ``/data/``.
    Any other path is returned unchanged, so applying the function twice is
    harmless.

    NOTE: this definition replaces the ``reformat_path`` defined earlier in
    the notebook, which handled a different prefix.

    Parameters
    ----------
    path : str

    Returns
    -------
    str
    """
    old = "../../"
    new = "/data/"
    # Anchor the match at the start: the slice below drops the first len(old)
    # characters, which is only right when `old` really is the prefix.
    if not path.startswith(old):
        return path
    return new + path[len(old):]

In [None]:
# Start a fresh FoldX_results.csv that contains only the header line.
# The shell's `>` creates or truncates the file, so no separate `touch` is
# needed, and subprocess.run waits for the command to finish (no child process
# is left un-reaped and no unused pipe is opened).
process = subprocess.run("echo \"name, ene\" > FoldX_results.csv", shell=True)
process.returncode

In [None]:
# Point the paths at /data/ (see reformat_path in the cell above), then run FoldX
# on every row. score_FX returns nothing, so the displayed Series holds only None.
vs_dataset_feat["path"] = vs_dataset_feat["path"].apply(reformat_path)
vs_dataset_feat.apply(lambda row: score_FX(row), axis=1) 

In [None]:
# Read the FoldX energies back. The header was written as "name, ene", so the
# energy column label keeps its leading space: " ene".
res_tmp = pd.read_csv("FoldX_results.csv")
# Index the frame by column; calling .apply on the bare list [" ene"] raises
# AttributeError.
res_tmp["foldx-score"] = res_tmp[" ene"].apply(lambda x: float(x))
# NOTE(review): rows are matched by position, which assumes the results file
# holds one line per dataset row, in dataset order - confirm.
vs_dataset_feat["foldx-score"] = res_tmp["foldx-score"]

## Score with GradDock

In [None]:
# Switch to the GradDock evaluation directory (absolute path of the environment this was
# run in); ./GD_run.sh and GD_results.csv in the cells below are resolved relative to it.
%cd /graddock/GD/evaluate
%pwd

In [None]:
import subprocess
## Access ligand/receptor files

def extract_receptor_ligand(filename):
    """Split a complex into separate ligand and receptor PDB files.

    The structure is read with ``complete_pdb``. Chain index 2 is written to
    ``<stem>_ligand.pdb`` and chains 0 and 1 to ``<stem>_receptor.pdb``, where
    ``<stem>`` is `filename` up to its first ".pdb".

    Parameters
    ----------
    filename : str
        Path of the complex PDB file.

    Returns
    -------
    tuple of str
        ``(ligand_path, receptor_path)`` of the two files written.
    """
    env = environ()
    env.libs.topology.read(file='$(LIB)/top_heav.lib')
    env.libs.parameters.read(file='$(LIB)/par.lib')
    mdl = complete_pdb(env, filename)

    ligand_atoms = selection(mdl.chains[2])
    receptor_atoms = selection(mdl.chains[0], mdl.chains[1])

    # Both output names share the part of the path that precedes ".pdb".
    stem = filename[:filename.find(".pdb")]
    lig_name = stem+"_ligand.pdb"
    rec_name = stem+"_receptor.pdb"

    ligand_atoms.write(lig_name)
    receptor_atoms.write(rec_name)

    return (lig_name, rec_name)
    
def get_rec_lig_name(path):
    """Unfinished helper; it is not called anywhere in the visible notebook.

    FIXME(review): for a path containing "singleconf" the body reads `lfname`
    and `rfname`, which are neither parameters nor locals of this function, so
    it raises NameError unless globals with those names exist. Even then the
    computed names are discarded and an empty tuple is returned. For any other
    path the function returns None. Complete it or remove it.
    """
    
    if "singleconf" in path:
        # NOTE(review): lfname / rfname are not defined in this scope.
        lname = lfname[lfname.rfind("/")+1:]
        rname = rfname[rfname.rfind("/")+1:]
        # NOTE(review): lname / rname are never used; the result is always ().
        return ()
    
def score_GD(row):
    """Run the GradDock script on the complex described by `row`.

    The complex at ``row["path"]`` is split into temporary ligand and receptor
    PDB files (see ``extract_receptor_ligand``), ``./GD_run.sh`` is called with
    the two file names and their folder, and the temporary files are removed
    afterwards - also when the script call itself fails.

    The printed return code, and the "Error with execution" notice, refer to
    ``GD_run.sh`` itself rather than to the cleanup step.

    Nothing is returned.
    NOTE(review): presumably GD_run.sh appends its scores to GD_results.csv -
    confirm in the script.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide the key "path".
    """
    import os  # local import: `os` is not imported in the visible notebook cells

    print("processing "+row["path"])
    (lfname, rfname) = extract_receptor_ligand(row["path"])
    folder_src = row["path"][:row["path"].rfind("/")]
    lname = lfname[lfname.rfind("/")+1:]
    rname = rfname[rfname.rfind("/")+1:]
    command = "./GD_run.sh "+lname+" "+rname+" "+folder_src+" > GD_progress.txt"
    try:
        # shell=True is required for the "> GD_progress.txt" redirection.
        # subprocess.run waits for the script and drains the pipe.
        completed = subprocess.run(command, shell=True, stdout=subprocess.PIPE)
    finally:
        # Remove the temporary split files without spawning a shell per file.
        for tmp_file in (lfname, rfname):
            try:
                os.remove(tmp_file)
            except OSError as err:
                print("Could not remove "+tmp_file+": "+str(err))
    ret = completed.returncode
    print(ret)
    if not ret == 0: print("Error with execution")
    print("done with "+row["path"])
    
def reformat_path(path):
    """Rewrite a notebook-relative path to its location under ``/data/``.

    A path that starts with ``../../`` has that prefix replaced by ``/data/``.
    Any other path is returned unchanged, so applying the function twice is
    harmless.

    NOTE: a function with this name is also defined in earlier cells of the
    notebook; whichever definition ran last is the one in effect.

    Parameters
    ----------
    path : str

    Returns
    -------
    str
    """
    old = "../../"
    new = "/data/"
    # Anchor the match at the start: the slice below drops the first len(old)
    # characters, which is only right when `old` really is the prefix.
    if not path.startswith(old):
        return path
    return new + path[len(old):]

In [None]:
# Start a fresh GD_results.csv that contains only the header line.
# The `>` redirection is what sends the header into the file; without it echo
# just prints the text (file name included) and the file gets no header.
# `>` also creates or truncates the file, so no separate `touch` is needed, and
# subprocess.run waits for the command to finish.
process = subprocess.run("echo \"name, complex, diff\" > GD_results.csv", shell=True)
process.returncode

In [None]:
# Point the paths at /data/ and run GradDock on every row. The frame scored
# here is vs_dataset_feat; no variable called `dataset` is defined in the
# visible notebook. reformat_path leaves already-rewritten paths untouched, so
# repeating this step after the FoldX section is harmless.
vs_dataset_feat["path"] = vs_dataset_feat["path"].apply(reformat_path)
vs_dataset_feat.apply(lambda row: score_GD(row), axis=1)

In [None]:
# Read the GradDock scores back. The intended header is "name, complex, diff",
# so the column label keeps its leading space: " diff".
res_tmp = pd.read_csv("GD_results.csv")
# Index the frame by column; calling .apply on the bare list [" diff"] raises
# AttributeError.
res_tmp["graddock-score"] = res_tmp[" diff"].apply(lambda x: float(x))
# NOTE(review): rows are matched by position, which assumes the results file
# holds one line per dataset row, in dataset order - confirm.
vs_dataset_feat["graddock-score"] = res_tmp["graddock-score"]

In [None]:
# Save the table with all score columns. The file is written relative to the current
# working directory, which the %cd cells above have changed.
vs_dataset_feat.to_csv("Results2.csv")