In [None]:
# define imports
import numpy as np
import numpy.random
import pandas as pd
import random

import xml.etree.ElementTree as ET
#rd kit
from rdkit.Chem import Draw, AllChem, MACCSkeys
from rdkit import Chem, DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols
#sklearn
from sklearn import datasets
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
# load the preprocessed protein embeddings from the .npz archive into a plain dict
# (key -> array); the keys are later looked up with the values of the 'target' column
protein_archive = np.load('protein.npz', mmap_mode='r')
embedds = dict(protein_archive)

## Import Dataset from .xml file

In [None]:
# empty DataFrame with one column per piece of information collected for a drug/target pair
dataset_columns = ['name', 'smiles', 'molecule', 'fingerprint', 'target', 'SeqVec']
DataSet = pd.DataFrame(columns=dataset_columns)

In [None]:
# read the XML database and keep its root element; the drugs are the root's children
database_tree = ET.parse('full_database.xml')
root = database_tree.getroot()
# SMILES of the drugs that may be used to build artificial negative interactions
neg_targets = list()

In [None]:
# Collect one row per (small-molecule drug, target polypeptide) pair from the XML.
# Row layout follows the DataSet columns:
#   name, smiles ("X" when none is found), molecule placeholder "0",
#   fingerprint placeholder "0", target polypeptide id ("0" when the drug has
#   no target), SeqVec placeholder "0".
DRUGBANK_NS = '{http://www.drugbank.ca}'

# counter for stats: number of drug/target rows that were added
counter = 0
# rows are gathered in a list and added to the DataFrame in one go
# (appending with .loc row by row copies the frame again and again)
drug_rows = []

#for all drugs in databank
for drug in root:

    #only drugs of type "small molecule" are used
    if drug.attrib.get('type') != "small molecule":
        continue

    #getting the drugs name
    drug_name = drug.find(DRUGBANK_NS + 'name').text

    #go thru properties searching for the smiles; a missing properties element
    #is treated like "no smiles found" instead of crashing
    smiles = ''
    calculated = drug.find(DRUGBANK_NS + 'calculated-properties')
    for prop in (calculated if calculated is not None else ()):
        if prop.findtext(DRUGBANK_NS + 'kind') == "SMILES":
            smiles = prop.findtext(DRUGBANK_NS + 'value') or ''

    #"X" marks drugs without smiles
    smiles_cell = smiles if smiles else "X"

    #a drug without a <targets> element adds no row at all
    targets = drug.find(DRUGBANK_NS + 'targets')
    if targets is None:
        continue

    #tarnumber counts the number of targets the drug has
    tarnumber = 0
    target_elements = targets.findall(DRUGBANK_NS + 'target')

    #if the drug has no target a single row with an empty target cell is added
    if not target_elements:
        drug_rows.append([drug_name, smiles_cell, "0", "0", "0", "0"])

    #otherwise one row per target; only a target without polypeptide is skipped,
    #the remaining targets of the drug are still read
    for target in target_elements:
        polypeptide = target.find(DRUGBANK_NS + 'polypeptide')
        if polypeptide is None:
            continue
        drug_rows.append([drug_name, smiles_cell, "0", "0", polypeptide.get('id'), "0"])
        counter = counter + 1
        tarnumber = tarnumber + 1

    #adding the smiles of drugs that have between 3 and 10 targets to a list;
    #drugs without smiles are left out, an empty smiles is useless for sampling
    if smiles and 2 < tarnumber < 11:
        neg_targets.append(smiles)

#add the collected rows behind whatever DataSet already contains
#(dtype=object keeps the cells able to hold molecule/fingerprint/array objects later)
new_rows = pd.DataFrame(drug_rows, columns=list(DataSet.columns), dtype=object)
if DataSet.empty:
    DataSet = new_rows
else:
    DataSet = pd.concat([DataSet, new_rows], ignore_index=True)

print(len(DataSet))
print(counter)
print(len(neg_targets))

In [None]:
#adding as many negative interactions as there are positive
# Each artificial row pairs a random smiles from neg_targets with a random key
# of embedds. Rows are marked with 0 as the drug name so they can be told apart
# from real interactions. If the drawn pair already exists in the dataset the
# target is set to 0, so the row gets ignored further on.
# NOTE: the draws use the global `random` state; seed it beforehand for
# reproducible results.
datalen = len(DataSet)

#the candidate targets do not change, so the key list is built only once
candidate_targets = list(embedds)

#all (smiles, target) pairs seen so far; a set lookup replaces scanning the whole
#DataFrame for every draw, and a pair is reported once no matter how often it occurs
known_pairs = set(zip(DataSet['smiles'], DataSet['target']))

negative_rows = []
for _ in range(datalen):
    #selecting a random smiles and a random target for the interaction
    smi = random.choice(neg_targets)
    tgt = random.choice(candidate_targets)

    if (smi, tgt) in known_pairs:
        #duplicates get marked with a 0 as target so they get ignored further on
        target_cell = 0
    else:
        target_cell = tgt
        known_pairs.add((smi, tgt))

    #name 0, smiles, molecule placeholder, fingerprint placeholder, target, SeqVec placeholder
    negative_rows.append([0, smi, "0", "0", target_cell, "0"])

#adding the interactions to the dataset in one step
if negative_rows:
    negative_frame = pd.DataFrame(negative_rows, columns=list(DataSet.columns), dtype=object)
    DataSet = pd.concat([DataSet, negative_frame], ignore_index=True)
    

In [None]:
#filling in the seqvec
# Rows whose target is not a key of embedds (e.g. the "0"/0 placeholders) keep
# their "0" placeholder. Only this expected case is skipped; any other error is
# no longer silently swallowed.
for index, target in DataSet['target'].items():
    if target in embedds:
        DataSet.at[index, 'SeqVec'] = embedds[target]

In [None]:
#turn every smiles string into a molecule object and store it in the 'molecule' column
for row_label, smiles_text in DataSet['smiles'].items():
    parsed_molecule = Chem.MolFromSmiles(smiles_text)
    DataSet.at[row_label, 'molecule'] = parsed_molecule

In [None]:
#create fingerprint from molecule
# The fingerprint is computed from the molecule object stored in the row.
# (Before, a list [index, 'moelcule'] was passed through the never imported name
# `rdkit`; the resulting error was swallowed and no fingerprint was ever stored.)
for index, mol in DataSet['molecule'].items():
    #rows without a usable molecule (None or the "0" placeholder) keep the
    #"0" fingerprint placeholder and are filtered out later on
    if mol is None or isinstance(mol, str):
        continue
    DataSet.at[index, 'fingerprint'] = Chem.RDKFingerprint(mol, fpSize=1024)
        

In [None]:
#store the finished dataset on disk so the time consuming cells above can be skipped next time
DataSet.to_pickle("DataSet2", compression='infer', protocol=4)

# Preparing the Data for machine learning

In [None]:
# Build the machine learning samples: one [feature vector, label] entry per usable row.
# The feature vector is the fingerprint followed by the SeqVec; the label is 0 for
# the made up negative interactions (name == 0) and 1 for all other rows.
ml_data = []
count0 = 0
count1 = 0
for index, row in DataSet.iterrows():
    fingerprint = DataSet.at[index, 'fingerprint']
    #if there is no fingerprint (placeholder "0") the entry gets ignored
    if len(fingerprint) <= 3:
        continue

    seqvec = DataSet.at[index, 'SeqVec']
    #if there is no target embedding (placeholder "0") the entry gets ignored
    if len(seqvec) <= 5:
        continue

    #concatenating both vectors and converting the result entirely to floats
    #(the former conversion loop only rebound a local name and changed nothing)
    features = np.asarray(np.concatenate((fingerprint, seqvec), axis=0), dtype=float)

    #checking for the made up negative interactions
    if DataSet.at[index, 'name'] == 0:
        label = 0
        count0 = count0 + 1
    else:
        label = 1
        count1 = count1 + 1

    ml_data.append([features, label])


#shuffling the samples in place
np.random.shuffle(ml_data)

# Baseline Prediction

In [None]:
def get_distrib(data):
    """Return the percentage of entries in `data` whose label equals 1.

    data: iterable of entries where entry[1] is the class label.
    Raises ZeroDivisionError when `data` contains no entries.
    """
    labels = [entry[1] for entry in data]
    positive = sum(1 for label in labels if label == 1)
    total = len(labels)
    #scaling the share of positive entries into a percentage
    return (positive / total) * 100



def ZeroRuleBaseline(distribution):
    """Return a random baseline prediction (1 or 0).

    distribution: how often class 1 should be predicted out of 100 draws
    (a percentage, e.g. the value returned by get_distrib).

    A number between 1 and 100 is drawn; 1 is returned when it is at most
    `distribution`, otherwise 0. random.randint includes both bounds, so the
    upper bound has to be 100 (with 101 a distribution of 100 could still
    yield 0 and all probabilities were slightly off).
    """
    tempPred = random.randint(1, 100)
    if tempPred <= distribution:
        return 1
    return 0


# ML

In [None]:
#split ml_data into feature vectors X and labels y;
#only entries whose feature vector has exactly 2048 values are kept
usable_entries = [entry for entry in ml_data if len(entry[0]) == 2048]
X = [entry[0] for entry in usable_entries]
y = [entry[1] for entry in usable_entries]
                

In [None]:
# Hold out 20% of the samples as test set; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

size_report = "Dataset sizes:\nWhole set: {}\nTraining Set: {}\nTest Set: {}"
print(size_report.format(len(y), len(y_train), len(y_test)))


In [None]:
# Cross-validated grid search over the SVM settings on the training set
from sklearn.svm import SVC

# 5-fold stratified cross-validation
cv = StratifiedKFold(n_splits=5)

# Support vector classifier; probability=True makes predict_proba available later
classifier = SVC(probability=True)

# Parameter grid: a single value for decision_function_shape is tried
params = {'decision_function_shape': ['ovo']}

# Run the search, also recording the scores on the training folds
grid = GridSearchCV(classifier, params, cv=cv, return_train_score=True)
grid.fit(X_train, y_train)

# Show the cross-validation results as a table
cv_results = pd.DataFrame(grid.cv_results_)
print(cv_results)


In [None]:
# Evaluate the best estimator found by the grid search on the held-out test set
best_classifier = grid.best_estimator_

# Predicted labels; `predictions` is the name the analysis cells below read
y_pred = best_classifier.predict(X_test)
predictions = y_pred

# Mean accuracy on the test set
pred_score = best_classifier.score(X_test, y_test)

# Confusion matrix (rows: true labels, columns: predicted labels)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Acc: {}'.format(round(pred_score, 3)))

In [None]:
#calculate the AUC for the model and estimate its standard deviation by bootstrapping
from sklearn.metrics import roc_auc_score

probas = best_classifier.predict_proba(X_test)

# roc_auc_score needs the score of the POSITIVE class (label 1). The columns of
# predict_proba follow best_classifier.classes_, so the column of label 1 is
# looked up explicitly (column 0 is the probability of class 0 and inverts the AUC).
positive_column = list(best_classifier.classes_).index(1)
proba_predictions = [entry[positive_column] for entry in probas]

auc = roc_auc_score(y_test, proba_predictions)
print("AUC: {}".format(auc))

# bootstrap: resample the test set with replacement as often as it has entries
AUCs = []
n_test = len(y_test)
for i in range(n_test):
    picks = [random.randint(0, n_test - 1) for j in range(n_test)]
    truths = [y_test[pick] for pick in picks]
    preds = [proba_predictions[pick] for pick in picks]
    # the AUC is undefined when a resample contains only one class; skip it
    # instead of letting roc_auc_score raise
    if len(set(truths)) < 2:
        continue
    AUCs.append(roc_auc_score(truths, preds))
std_auc = np.std(AUCs)
print(std_auc)

In [None]:
def calc_std_errs(y_true=None, y_pred=None):
    """Print bootstrap standard deviations of the performance measurements.

    y_true: true labels; defaults to the notebook variable `y_test`.
    y_pred: predicted labels; defaults to the notebook variable `predictions`.

    The labels are resampled with replacement len(y_true) times, each resample
    having len(y_true) entries. For every resample precision, negative precision,
    recall, negative coverage, f1, overall accuracy and MCC are computed from its
    confusion counts (label 1 is the positive class); the standard deviation of
    each measurement over all resamples is printed. Returns None.

    A measurement whose denominator is 0 in a resample counts as 0.0.
    """
    if y_true is None:
        y_true = y_test
    if y_pred is None:
        y_pred = predictions

    def ratio(numerator, denominator):
        # guarded division: degenerate resamples must not abort the bootstrap
        return numerator / denominator if denominator else 0.0

    iterations = len(y_true)
    overall_accs = []
    precisions = []
    neg_precs = []
    recalls = []
    neg_covs = []
    f1s = []
    mccs = []
    for i in range(iterations):
        tp = 0
        tn = 0
        fp = 0
        fn = 0
        for j in range(iterations):
            pick = random.randint(0, iterations - 1)
            truth = y_true[pick]
            pred = y_pred[pick]
            if truth == 1:
                if pred == 1:
                    tp = tp + 1
                else:
                    fn = fn + 1
            else:
                if pred == 1:
                    fp = fp + 1
                else:
                    tn = tn + 1
        #formulas of performance measurements
        precision = ratio(tp, tp + fp)
        precisions.append(precision)
        neg_precs.append(ratio(tn, tn + fn))
        recall = ratio(tp, tp + fn)
        recalls.append(recall)
        neg_covs.append(ratio(tn, tn + fp))
        f1s.append(ratio(2 * precision * recall, precision + recall))
        overall_accs.append(ratio(tp + tn, tp + fp + tn + fn))
        # MCC of THIS resample, from its confusion counts. Computing it on the
        # full test set (as before) gives the same value every round, i.e. std 0.
        mcc_denominator = ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
        mccs.append(ratio(tp * tn - fp * fn, mcc_denominator))
    #calculate standard deviation of the performance measurements
    std_prec = np.std(precisions)
    std_neg_prec = np.std(neg_precs)
    std_recall = np.std(recalls)
    std_neg_cov = np.std(neg_covs)
    std_f1 = np.std(f1s)
    std_overall_acc = np.std(overall_accs)
    std_mcc = np.std(mccs)
    print("std_prec: {}\nstd_neg_prec: {}\nstd_recall: {}\nstd_neg_cov: {}\nstd_f1: {}\nstd_overall_acc: {}\nstd_mcc: {}".format(std_prec, std_neg_prec, std_recall, std_neg_cov, std_f1, std_overall_acc, std_mcc))


In [None]:
#analysis of the predictions: confusion counts with label 1 as the positive class
tp = tn = fp = fn = 0

for truth, pred in zip(y_test, predictions):
    if truth == 1 and pred == 1:
        tp += 1
    elif truth == 1:
        fn += 1
    elif pred == 1:
        fp += 1
    else:
        tn += 1

count_report = "True Positive: {}\nTrue Negative: {}\nFalse Positive: {}\nFalse Negative: {}"
print(count_report.format(tp, tn, fp, fn))

#calculating performance scores; each score is printed right after it is computed
precision = tp/(tp+fp)
print("Precision:", precision, sep="\n")

neg_prec = tn/(tn+fn)
print("negative Precision:", neg_prec, sep="\n")

recall = tp/(tp+fn)
print("Recall:", recall, sep="\n")

neg_cov = tn/(tn+fp)
print("negative coverage:", neg_cov, sep="\n")

f1 = 2*precision*recall/(precision+recall)
print("f1-score:", f1, sep="\n")

overall_acc = (tp+tn)/(tp+fp+tn+fn)
print("overall accuracy:", overall_acc, sep="\n")

print("MCC:")
print(matthews_corrcoef(y_test, predictions))

#standard deviations of the scores
calc_std_errs()

In [None]:
def calc_std_errs_baseline(y_true=None, y_pred=None):
    """Print bootstrap standard deviations of the baseline's performance measurements.

    y_true: true labels; defaults to the notebook variable `y_test`.
    y_pred: baseline predictions; defaults to the notebook variable
        `baseline_predicts`.

    The labels are resampled with replacement len(y_true) times, each resample
    having len(y_true) entries. For every resample precision, negative precision,
    recall, negative coverage, f1, overall accuracy and MCC are computed from its
    confusion counts (label 1 is the positive class); the standard deviation of
    each measurement over all resamples is printed. Returns None.

    A measurement whose denominator is 0 in a resample counts as 0.0.
    """
    if y_true is None:
        y_true = y_test
    if y_pred is None:
        y_pred = baseline_predicts

    def ratio(numerator, denominator):
        # guarded division: degenerate resamples must not abort the bootstrap
        return numerator / denominator if denominator else 0.0

    iterations = len(y_true)
    overall_accs = []
    precisions = []
    neg_precs = []
    recalls = []
    neg_covs = []
    f1s = []
    mccs = []
    for i in range(iterations):
        tp = 0
        tn = 0
        fp = 0
        fn = 0
        for j in range(iterations):
            pick = random.randint(0, iterations - 1)
            truth = y_true[pick]
            pred = y_pred[pick]
            if truth == 1:
                if pred == 1:
                    tp = tp + 1
                else:
                    fn = fn + 1
            else:
                if pred == 1:
                    fp = fp + 1
                else:
                    tn = tn + 1
        #formulas of performance measurements
        precision = ratio(tp, tp + fp)
        precisions.append(precision)
        neg_precs.append(ratio(tn, tn + fn))
        recall = ratio(tp, tp + fn)
        recalls.append(recall)
        neg_covs.append(ratio(tn, tn + fp))
        f1s.append(ratio(2 * precision * recall, precision + recall))
        overall_accs.append(ratio(tp + tn, tp + fp + tn + fn))
        # MCC of THIS resample, from its confusion counts. Computing it on the
        # full test set (as before) gives the same value every round, i.e. std 0.
        mcc_denominator = ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
        mccs.append(ratio(tp * tn - fp * fn, mcc_denominator))
    #calculate standard deviation of the performance measurements
    std_prec = np.std(precisions)
    std_neg_prec = np.std(neg_precs)
    std_recall = np.std(recalls)
    std_neg_cov = np.std(neg_covs)
    std_f1 = np.std(f1s)
    std_overall_acc = np.std(overall_accs)
    std_mcc = np.std(mccs)
    print("std_prec: {}\nstd_neg_prec: {}\nstd_recall: {}\nstd_neg_cov: {}\nstd_f1: {}\nstd_overall_acc: {}\nstd_mcc: {}".format(std_prec, std_neg_prec, std_recall, std_neg_cov, std_f1, std_overall_acc, std_mcc))


In [None]:
#performance of the baseline: one random prediction per test entry, drawn with the
#share of positive entries in ml_data as probability for predicting 1
dist = get_distrib(ml_data)
baseline_predicts = [ZeroRuleBaseline(dist) for _ in range(len(y_test))]

#confusion counts with label 1 as the positive class
tp = tn = fp = fn = 0
for truth, pred in zip(y_test, baseline_predicts):
    if truth == 1 and pred == 1:
        tp += 1
    elif truth == 1:
        fn += 1
    elif pred == 1:
        fp += 1
    else:
        tn += 1

count_report = "True Positive: {}\nTrue Negative: {}\nFalse Positive: {}\nFalse Negative: {}"
print(count_report.format(tp, tn, fp, fn))

#calculating performance scores; each score is printed right after it is computed
precision = tp/(tp+fp)
print("Precision:", precision, sep="\n")

neg_prec = tn/(tn+fn)
print("negative Precision:", neg_prec, sep="\n")

recall = tp/(tp+fn)
print("Recall:", recall, sep="\n")

neg_cov = tn/(tn+fp)
print("negative coverage:", neg_cov, sep="\n")

f1 = 2*precision*recall/(precision+recall)
print("f1-score:", f1, sep="\n")

overall_acc = (tp+tn)/(tp+fp+tn+fn)
print("overall accuracy:", overall_acc, sep="\n")

print("MCC:")
print(matthews_corrcoef(y_test, baseline_predicts))

#standard deviations of the baseline scores
calc_std_errs_baseline()


In [None]:
#restore a previously pickled dataset instead of rebuilding it
#NOTE(review): the saving cell writes "DataSet2", this cell reads "DataSet2_fpsize1k" -
#confirm which file is the intended one
DataSet = pd.read_pickle("DataSet2_fpsize1k")