<a href="https://colab.research.google.com/github/KA0335/DeepMirror/blob/main/Benchmark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#installing all dependencies
#DeepP
!pip install git+https://github.com/bp-kelley/descriptastorus 
!pip install DeepPurpose
!!pip install PyTDC

In [None]:
# Clone the DeepPurpose repository for reference only (source browsing);
# the importable package itself comes from the pip install in the first cell.
!git clone https://github.com/kexinhuang12345/DeepPurpose.git

In [None]:
#Importing all libraries
from sklearn import tree
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from DeepPurpose import utils, CompoundPred
from tdc.single_pred import ADME
from os import mkdir
from numpy import absolute
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from xgboost import XGBRegressor

In [None]:
#MPNN - A supervised GNN
def GraphDL(X,y,encoding, data):
  """Train a DeepPurpose compound-property model and save it to disk.

  Args:
    X: drug inputs forwarded to ``utils.data_process`` as ``X_drug``.
    y: labels forwarded to ``utils.data_process``.
    encoding: name of the drug encoding (the notebook passes ``'MPNN'``).
    data: dataset name; used to build the save path
      ``<data>/<encoding><data>`` relative to the current working directory.

  Returns:
    None. The trained model is written via ``model.save_model``.
  """
  # 'TDC' is the seed name for the dataset split.
  splits = utils.data_process(X_drug=X,
                              y=y,
                              drug_encoding=encoding,
                              random_seed='TDC')
  train_split, val_split, test_split = splits

  # Hyper-parameters for the MPNN run, gathered in one place.
  hyperparams = dict(drug_encoding=encoding,
                     train_epoch=15,
                     LR=0.002,
                     batch_size=32,
                     mpnn_hidden_size=32,
                     mpnn_depth=2)
  net = CompoundPred.model_initialize(**utils.generate_config(**hyperparams))

  # Training
  net.train(train_split, val_split, test_split)
  net.save_model('/'.join([data, encoding + data]))

In [None]:
#method to plot XGBoost model
def plot_roc(model, X_test, y_test,encoding, data):
    """Plot the ROC curve of a fitted classifier and save it as a PNG.

    Args:
        model: fitted classifier exposing ``predict_proba``.
        X_test: held-out features; converted with ``np.array`` before scoring.
        y_test: true labels for ``X_test``.
        encoding: encoding name, used as the first part of the file name.
        data: dataset name, used as the second part of the file name.

    Returns:
        None. The figure is written to ``<encoding>_<data>.png`` in the
        current working directory.
    """
    # calculate the fpr and tpr for all thresholds of the classification
    probabilities = model.predict_proba(np.array(X_test))
    predictions = probabilities[:, 1]  # second column of predict_proba
    fpr, tpr, _ = metrics.roc_curve(y_test, predictions)
    roc_auc = metrics.auc(fpr, tpr)

    # Draw on a fresh figure for every call. The original used the implicit
    # pyplot "current figure", so calling this function several times within
    # one cell stacked the curves of earlier datasets into later PNGs.
    fig, ax = plt.subplots()
    ax.set_title('Receiver Operating Characteristic')
    ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    ax.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal
    ax.legend(loc='lower right')
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')
    # The figure is intentionally left open so the notebook still displays it.
    fig.savefig(encoding+"_"+data+'.png', bbox_inches='tight')

#Method for regression- XGBoost
def XGBR(X, y,encoding, data):
  """Cross-validate an XGBoost regressor and report its mean absolute error.

  Args:
    X: feature matrix.
    y: regression targets.
    encoding: unused; kept so the signature mirrors ``XGBC``.
    data: unused; kept so the signature mirrors ``XGBC``.

  Returns:
    The mean MAE over all folds. The original printed this value but returned
    nothing, so callers storing the result ended up with ``None``.
  """
  model = XGBRegressor(n_estimators=2000, max_depth=30, eta=0.5, subsample=0.7, colsample_bytree=0.2)
  # define model evaluation method: 10-fold CV repeated 10 times, fixed seed
  cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=1)
  # evaluate model
  scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
  # the scorer yields negated MAE; force scores to be positive
  scores = absolute(scores)
  mean_mae = scores.mean()
  print('Mean MAE: %.3f (%.3f)' % (mean_mae, scores.std()) )
  return mean_mae

#Method for classification- XGBoost
def XGBC(X, y,encoding, data, random_state=1):
  """Fit an XGBoost classifier on an 80/20 split and return test accuracy.

  Also saves a ROC plot for the held-out split via ``plot_roc``.

  Args:
    X: feature matrix.
    y: class labels.
    encoding: encoding name, forwarded to ``plot_roc`` for the file name.
    data: dataset name, forwarded to ``plot_roc`` for the file name.
    random_state: seed for the train/test split. The original split was
      unseeded, so the reported accuracy changed on every run; the default
      matches the seed used for cross-validation in ``XGBR``.

  Returns:
    Accuracy on the held-out 20% split.
  """
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.20, random_state=random_state)
  model = XGBClassifier()
  model.fit(X_train, y_train)
  # predict() already yields class labels, so the per-element rounding of the
  # original was a no-op; the unused confusion matrix was dropped as well.
  predictions = model.predict(X_test)
  # evaluate predictions
  accuracy = accuracy_score(y_test, predictions)
  plot_roc(model, X_test, y_test, encoding, data)
  return accuracy

#preprocessing the data and making it ready for XGBoost
def trainings(X,y,encoding, data, reg):
  """Encode the drugs with DeepPurpose and train/evaluate an XGBoost model.

  The train/val/test frames from ``utils.data_process`` are merged back into
  one frame because ``XGBC`` / ``XGBR`` do their own splitting.

  Args:
    X: drug inputs forwarded to ``utils.data_process`` as ``X_drug``.
    y: labels forwarded to ``utils.data_process``.
    encoding: drug encoding name. Each ``drug_encoding`` entry must be a
      2-item sequence whose first item is the feature vector
      (the notebook only calls this with ``'Transformer'``).
    data: dataset name, forwarded to the XGBoost helpers.
    reg: truthy for regression (``XGBR``), falsy for classification (``XGBC``).

  Returns:
    Whatever the selected helper returns. The original dropped the regression
    result and always returned ``None`` in that branch.
  """
  train, val, test = utils.data_process(X_drug = X,
                                        y = y,
                                        drug_encoding = encoding,
                                        random_seed = 'TDC')# seed name for the dataset

  # Dropping the SMILES string as we have the encodings now. Same row order
  # as the original (train, test, val). pd.concat replaces DataFrame.append,
  # which was removed in pandas 2.0.
  combined = pd.concat(
      [frame.drop(['SMILES'], axis=1) for frame in (train, test, val)])

  # Separating the labels: everything except the encoding column. Selecting by
  # name does not depend on 'drug_encoding' being the last column.
  labels = combined.drop(columns=['drug_encoding'])

  # Keep only the first component of every encoding and stack into a 2-D
  # (n_samples, n_features) array; the width is inferred instead of being
  # hard-coded to 50.
  n_samples = len(combined)
  features = np.array([enc[0] for enc in combined['drug_encoding']])
  features = features.reshape(n_samples, -1)

  if not reg:
    return XGBC(features, labels, encoding, data)
  return XGBR(features, labels, encoding, data)

#training the transformer model
def transformer(X,y,encoding, data):
  """Train a DeepPurpose compound-property model and save it to disk.

  Args:
    X: drug inputs forwarded to ``utils.data_process`` as ``X_drug``.
    y: labels forwarded to ``utils.data_process``.
    encoding: name of the drug encoding (the notebook passes ``'Transformer'``).
    data: dataset name; used to build the save path
      ``<data>/<encoding><data>`` relative to the current working directory.

  Returns:
    None. The trained model is written via ``model.save_model``.
  """
  # 'TDC' is the seed name for the dataset split.
  train_part, val_part, test_part = utils.data_process(
      X_drug=X, y=y, drug_encoding=encoding, random_seed='TDC')

  settings = utils.generate_config(
      drug_encoding=encoding,
      train_epoch=15,
      LR=0.001,
      batch_size=128,
  )
  learner = CompoundPred.model_initialize(**settings)

  #Training
  learner.train(train_part, val_part, test_part)
  target_path = data + '/' + (encoding + data)
  learner.save_model(target_path)

In [None]:
#graph neural network
# Classification benchmark: for every dataset train the MPNN and Transformer
# models, then an XGBoost classifier on the Transformer encoding, and collect
# the XGBoost accuracy per dataset in `metric`.
metric = {}
#dataset for classification
            #A          #M              #D   (presumably the ADME category of each dataset -- confirm)
dataset = ['HIA_Hou','CYP2C19_Veith','BBB_Martins']
reg = False
for data in dataset:
  # Create the folder at the same absolute path that is entered below.
  # The original `mkdir(data)` was relative to the current directory and
  # raised FileExistsError whenever the cell was re-run.
  workdir = '/content/'+data
  os.makedirs(workdir, exist_ok=True)
  os.chdir(workdir)
  X, y = ADME(name = data).get_data(format = 'DeepPurpose')
  encoding = 'MPNN'
  GraphDL(X,y,encoding, data)
  encoding = 'Transformer'
  transformer(X,y,encoding, data)
  # XGBoost is trained on the Transformer encoding (encoding is still 'Transformer').
  acc = trainings(X,y,encoding, data, reg)
  metric[data] = acc


In [None]:
# Return to /content: the dataset loop above leaves the working directory
# inside the folder of the last dataset it processed.
%cd "/content/"

/content


In [None]:
#Regression
# Regression benchmark on a single dataset: train the MPNN and Transformer
# models, then cross-validate an XGBoost regressor on the Transformer encoding.
        #E   (presumably the ADME category of the dataset -- confirm)
data = 'Half_Life_Obach'
# Create the folder at the same absolute path that is entered below.
# The original `mkdir(data)` was relative to the current directory and
# raised FileExistsError whenever the cell was re-run.
workdir = '/content/'+data
os.makedirs(workdir, exist_ok=True)
os.chdir(workdir)
X, y = ADME(name = data).get_data(format = 'DeepPurpose')
encoding = 'MPNN'
GraphDL(X,y,encoding, data)
encoding = 'Transformer'
transformer(X,y,encoding, data)
reg = True
# XGBoost is trained on the Transformer encoding (encoding is still 'Transformer').
acc = trainings(X,y,encoding, data, reg)
metric[data] = acc