In [None]:
#install tpot library (for automl)
!pip install tpot 

Collecting tpot
  Downloading TPOT-0.11.7-py3-none-any.whl (87 kB)
[K     |████████████████████████████████| 87 kB 3.0 MB/s 
Collecting stopit>=1.1.1
  Downloading stopit-1.1.2.tar.gz (18 kB)
Collecting update-checker>=0.16
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting deap>=1.2
  Downloading deap-1.3.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (160 kB)
[K     |████████████████████████████████| 160 kB 17.4 MB/s 
Collecting xgboost>=1.1.0
  Downloading xgboost-1.5.1-py3-none-manylinux2014_x86_64.whl (173.5 MB)
[K     |████████████████████████████████| 173.5 MB 9.9 kB/s 
Building wheels for collected packages: stopit
  Building wheel for stopit (setup.py) ... done
  Created wheel for stopit: filename=stopit-1.1.2-py3-none-any.whl size=11952 sha256=21622bd18ed1375250ca1e0c52e307d5e8d92d1b2da924f22c3cbcd4d0b1dbd9
  Stored in directory: /root/.cache/pip/wheels/e2/d2/79/eaf81edb391e27c87f51b8ef90

In [None]:
#import necessary libraries
import pandas as pd 
import numpy as np

#import machine learning/deep learning libraries for training
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from tpot import TPOTClassifier
from keras.models import Sequential
from keras.layers import Dense

from numpy import mean
from numpy import std

#import libraries for reporting metrics 
import sklearn
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the tab-separated enzyme table and count the positive / negative cases.
all_data = pd.read_csv("all_list_enzyme.csv", sep="\t")

# Count rows per class straight from the label column, so the result does not
# depend on which column happens to be second or on missing values in it.
pos_cnt = int((all_data['Class'] == 1).sum())
neg_cnt = int((all_data['Class'] == 0).sum())

In [None]:
# All rows labelled as the negative class (Class == 0).
negative = all_data.loc[all_data['Class'].eq(0)]

In [None]:
# Neural network: a small fully connected binary classifier.
# 7 inputs -> 12 ReLU units -> 8 ReLU units -> 1 sigmoid output.
model = Sequential([
    Dense(12, input_dim=7, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked during training.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
  # Search space for the logistic-regression grid search:
  # three solvers, L2 penalty only, five regularisation strengths.
  solvers = ['newton-cg', 'lbfgs', 'liblinear']
  penalty = ['l2']
  c_values = [100, 10, 1.0, 0.1, 0.01]
  grid2 = {'solver': solvers, 'penalty': penalty, 'C': c_values}

  # Search space for the random-forest grid search:
  # number of trees and the rule for how many features each split considers.
  n_estimators = [10, 100, 1000]
  max_features = ['sqrt', 'log2']
  grid = {'n_estimators': n_estimators, 'max_features': max_features}

In [None]:
# Metric accumulators, one independent list per (model, metric) pair.
# Prefixes: rf     = random forest
#           lr     = logistic regression
#           model  = neural network
#           automl = AutoML as implemented by the tpot library
# Suffixes: auc, acc (accuracy), precision, f1, recall.
rf_auc_res, rf_acc_res, rf_precision_res, rf_f1_res, rf_recall_res = [], [], [], [], []
lr_auc_res, lr_acc_res, lr_precision_res, lr_f1_res, lr_recall_res = [], [], [], [], []
model_auc_res, model_acc_res, model_precision_res, model_f1_res, model_recall_res = [], [], [], [], []
automl_auc_res, automl_acc_res, automl_precision_res, automl_f1_res, automl_recall_res = [], [], [], [], []

def _record_metrics(y_true, y_pred, y_score,
                    auc_res, acc_res, precision_res, recall_res, f1_res):
  """Score one model on the held-out split and append each metric to its list.

  y_true  -- ground-truth binary labels of the test split
  y_pred  -- hard 0/1 predictions for the same rows
  y_score -- predicted score/probability of the positive class (used for ROC AUC)
  The remaining five arguments are the result lists for that model; each one
  is extended in place by exactly one value.
  """
  auc_res.append(roc_auc_score(y_true, y_score))
  acc_res.append(accuracy_score(y_true, y_pred))
  precision_res.append(precision_score(y_true, y_pred))
  recall_res.append(recall_score(y_true, y_pred))
  f1_res.append(f1_score(y_true, y_pred))


# Label and identifier columns that are not passed to the models as features.
non_feature_cols = ['Class', 'Drug1', 'Drug2']

# The per-class subsets are the same for every run, so select them once.
negative = all_data[all_data['Class']==0]
all_positive = all_data[all_data['Class']==1]

for i in range(5):
  # Balanced data set for this run: 70000 randomly drawn rows of each class.
  # NOTE(review): the draws are unseeded, so scores differ between executions.
  neg = negative.sample(70000)
  positive = all_positive.sample(70000)
  new_data = pd.concat([neg, positive])
  y = new_data['Class']
  X = new_data.drop(columns=non_feature_cols)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

  # Random forest: grid-search the hyperparameters, then score the refit best estimator.
  rf = RandomForestClassifier()
  xtree = GridSearchCV(estimator=rf, param_grid=grid, n_jobs=-1, scoring='accuracy',error_score=0)
  xtree.fit(X_train, y_train)
  y_pred_xtree = xtree.predict(X_test)
  yhat_probs_xtree = xtree.predict_proba(X_test)
  _record_metrics(y_test, y_pred_xtree, yhat_probs_xtree[:, 1],
                  rf_auc_res, rf_acc_res, rf_precision_res, rf_recall_res, rf_f1_res)

  # Logistic regression: same procedure with its own search space.
  lr = LogisticRegression()
  grid_search = GridSearchCV(estimator=lr, param_grid=grid2, n_jobs=-1,scoring='accuracy',error_score=0)
  logistic = grid_search.fit(X_train, y_train)
  y_pred_logistic = logistic.predict(X_test)
  yhat_probs_logistic = logistic.predict_proba(X_test)
  _record_metrics(y_test, y_pred_logistic, yhat_probs_logistic[:, 1],
                  lr_auc_res, lr_acc_res, lr_precision_res, lr_recall_res, lr_f1_res)

  # Neural network: rebuild the network from the existing architecture so every
  # run starts from freshly initialised weights. Re-fitting the same instance
  # would carry weights over from earlier runs, whose training rows can overlap
  # with this run's test split.
  model = Sequential.from_config(model.get_config())
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.fit(X_train, y_train, epochs=40, batch_size=10, verbose=0)
  # One forward pass yields the probabilities; the classes are thresholded from them.
  yhat_probs = model.predict(X_test, verbose=0)[:, 0]
  yhat_classes = (yhat_probs > 0.5).astype("int32")
  _record_metrics(y_test, yhat_classes, yhat_probs,
                  model_auc_res, model_acc_res, model_precision_res, model_recall_res, model_f1_res)

  # AutoML: let TPOT search for a pipeline, then score it like the other models.
  tpot = TPOTClassifier(generations=5, population_size=10,scoring='accuracy', verbosity=2)
  tpot.fit(X_train, y_train)
  y_pred_tpot = tpot.predict(X_test)
  yhat_probs_tpot = tpot.predict_proba(X_test)
  _record_metrics(y_test, y_pred_tpot, yhat_probs_tpot[:, 1],
                  automl_auc_res, automl_acc_res, automl_precision_res, automl_recall_res, automl_f1_res)

  # New candidates: negative rows that were not drawn into this run's sample.
  # Concatenating the sample onto the full negative set and dropping every row
  # that occurs more than once leaves only the rows outside the sample.
  df_diff = (
      pd.concat([negative, neg])
      .drop_duplicates(keep=False)
      .reset_index(drop=True)
  )
  df_diff1 = df_diff.drop(columns=non_feature_cols)
  # Predictions and probabilities are both computed on the candidate rows so
  # that each output line describes the drug pair it is written next to.
  candidate_pred = pd.DataFrame(tpot.predict(df_diff1))
  candidate_prob = pd.DataFrame(tpot.predict_proba(df_diff1))
  df_res1 = pd.concat([df_diff[['Drug1', 'Drug2']], candidate_pred, candidate_prob], axis=1)
  # The file is rewritten on every run, so it ends up holding the candidates
  # scored by the pipeline of the final run.
  df_res1.to_csv("prediction_res.csv",sep="\t", index=None, header=None)

In [None]:
# Print the raw per-run scores, one list per line.
# Order: random forest, logistic regression, neural network, AutoML;
# within each model: AUC, accuracy, precision, F1, recall.
for scores in (
    rf_auc_res, rf_acc_res, rf_precision_res, rf_f1_res, rf_recall_res,
    lr_auc_res, lr_acc_res, lr_precision_res, lr_f1_res, lr_recall_res,
    model_auc_res, model_acc_res, model_precision_res, model_f1_res, model_recall_res,
    automl_auc_res, automl_acc_res, automl_precision_res, automl_f1_res, automl_recall_res,
):
  print(scores)

In [None]:
# Print the mean of each metric across the five runs, one value per line.
# Order: random forest, logistic regression, neural network, AutoML;
# within each model: AUC, accuracy, precision, F1, recall.
for scores in (
    rf_auc_res, rf_acc_res, rf_precision_res, rf_f1_res, rf_recall_res,
    lr_auc_res, lr_acc_res, lr_precision_res, lr_f1_res, lr_recall_res,
    model_auc_res, model_acc_res, model_precision_res, model_f1_res, model_recall_res,
    automl_auc_res, automl_acc_res, automl_precision_res, automl_f1_res, automl_recall_res,
):
  print(mean(scores))