## Package installation

In [None]:
!wget -c https://repo.continuum.io/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh
!chmod +x Miniconda3-py37_4.8.3-Linux-x86_64.sh
!time bash ./Miniconda3-py37_4.8.3-Linux-x86_64.sh -b -f -p /usr/local
#!time conda install -q -y -c conda-forge rdkit
!time conda install -c rdkit rdkit

import sys
sys.path.append('/usr/local/lib/python3.7/site-packages/')

In [None]:
# Install DeepChem (pre-release build, --pre) together with DGL and DGL-LifeSci.
# NOTE(review): DGL / DGL-LifeSci are presumably required by the GATModel and
# GCNModel used further down — confirm.
# NOTE(review): "dgl-cu110" looks like the CUDA 11.0 build of DGL — confirm it
# matches the CUDA version of the runtime.
!pip install --pre deepchem #
!pip install dgl-cu110
!pip install dgllife

In [None]:
# xgboost is imported lazily by the 'xgb' branch of benchmark_classification.
!pip install xgboost


## Import libraries

In [None]:
import itertools
import warnings

import numpy as np
import pandas as pd

import deepchem as dc
from deepchem.molnet.preset_hyper_parameters import hps
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import cohen_kappa_score
warnings.filterwarnings("ignore")

## Load Data

In [None]:
def load_data(model, data):
    """Featurize a CSV file with the featurizer that matches ``model``.

    Parameters
    ----------
    model : str
        Name of the model the data is prepared for. It only selects the
        featurizer:
        'graphconv', 'dag' -> ConvMolFeaturizer;
        'tf', 'irv', 'tf_robust', 'kernelsvm', 'rf', 'logreg', 'xgb'
        -> CircularFingerprint;
        'gat', 'gcn' -> MolGraphConvFeaturizer;
        'mpnn', 'weave' -> WeaveFeaturizer;
        'textcnn' -> no featurizer (None is handed to the loader).
    data : str
        Path of the CSV file. It is read with a ``smiles`` feature column and
        a single task column named ``label``.

    Returns
    -------
    tuple
        ``(dataset, [None])``: the featurized dataset and a one-element list
        holding ``None`` in place of a transformer (no transformation is
        applied here).

    Raises
    ------
    ValueError
        If ``model`` is not one of the names listed above.
    """
    if model in ["graphconv", "dag"]:
        featurizer = dc.feat.ConvMolFeaturizer()
    elif model in ["tf", "irv", "tf_robust", "kernelsvm", "rf", "logreg", "xgb"]:
        featurizer = dc.feat.CircularFingerprint()
    elif model in ["gat", "gcn"]:
        featurizer = dc.feat.MolGraphConvFeaturizer()
    elif model in ["mpnn", "weave"]:
        # 'weave' is accepted by benchmark_classification, so it needs a
        # featurizer here as well.
        featurizer = dc.feat.WeaveFeaturizer()
    elif model == "textcnn":
        # NOTE(review): confirm that CSVLoader accepts featurizer=None for the
        # installed DeepChem version.
        featurizer = None
    else:
        # Fail early with a clear message instead of hitting an unbound
        # `featurizer` a few lines further down.
        raise ValueError("Unknown model name for featurization: %r" % (model,))

    tasks = ['label']
    loader = dc.data.CSVLoader(tasks=tasks, feature_field="smiles", featurizer=featurizer)
    dataset = loader.create_dataset(data)
    transformer = None

    return (dataset, [transformer])

In [None]:
# Path of the CSV file handed to load_data.
data = "/mydrive/drug_discovery/data.csv"
# The first argument only selects the featurizer inside load_data
# ("dag" -> ConvMolFeaturizer).
# NOTE(review): later cells train other model families (xgb, GCN, RF), which
# presumably need this argument changed to the matching model name before
# re-running — confirm.
dataset, transformers = load_data("dag", data)
splitter = dc.splits.RandomSplitter()
# Random split with frac_train=0.90 and a fixed seed.
train_dataset, test_dataset = splitter.train_test_split(dataset, frac_train = 0.90, seed=123)

In [None]:
# Row counts of each split; the total is simply their sum.
n_train = train_dataset.X.shape[0]
n_test = test_dataset.X.shape[0]

print("Dataset size  : ", n_train + n_test)
print("Trainset size : ", n_train)
print("Testset size  : ", n_test)

# Fit Model

## Benchmark classification

In [None]:
def benchmark_classification(train_dataset,
                             valid_dataset,
                             test_dataset,
                             tasks,
                             transformers,
                             n_features,
                             metric,
                             model,
                             test=False,
                             hyper_parameters=None,
                             seed=123):
  """Train one classification model and score it on the given datasets.

  Parameters
  ----------
  train_dataset : dataset struct
      Dataset used to train the model; it is also scored after training.
  valid_dataset : dataset struct
      Dataset used only for model evaluation (and hyperparameter tuning).
  test_dataset : dataset struct
      Dataset used only for model evaluation. It is only scored when `test`
      is True and may be None otherwise.
  tasks : list of str
      List of targets (tasks, datasets).
  transformers : dc.trans.Transformer struct
      Transformers meant for model evaluation. Currently not used by this
      function.
  n_features : int
      Number of features, or length of the binary fingerprints. For 'mpnn'
      it is indexed as (n_atom_feat, n_pair_feat).
  metric : list of dc.metrics.Metric
      Metrics used for evaluation.
  model : str
      Model to build, one of
      'rf', 'tf', 'tf_robust', 'logreg', 'irv', 'graphconv', 'dag', 'xgb',
      'weave', 'kernelsvm', 'textcnn', 'mpnn', 'gat', 'gcn'.
  test : bool, optional
      Whether to also compute the scores on `test_dataset`.
  hyper_parameters : dict, optional (default=None)
      Hyperparameters of the selected model; None means use the presets in
      `hps`. 'gat' and 'gcn' use values hard-coded below and ignore it.
  seed : int, optional
      Random seed passed to the models that accept one. For 'xgb' it is
      replaced by hyper_parameters['seed'].

  Returns
  -------
  train_scores : dict
      {model name: scores returned by model.evaluate on the training set}
  valid_scores : dict
      {model name: scores returned by model.evaluate on the validation set}
  test_scores : dict
      {model name: scores returned by model.evaluate on the test set};
      empty when `test` is False.
  """
  train_scores = {}
  valid_scores = {}
  test_scores = {}

  assert model in [
      'rf', 'tf', 'tf_robust', 'logreg', 'irv', 'graphconv', 'dag', 'xgb',
      'weave', 'kernelsvm', 'textcnn', 'mpnn', 'gat', 'gcn'
  ], "Unsupported model: %r" % (model,)
  if hyper_parameters is None and model not in ['gat', 'gcn']:
    hyper_parameters = hps[model]
  # `model` is rebound to the model object below; keep the name for the
  # score dictionaries.
  model_name = model

  if model_name == "gat":
    nb_epoch = 40
    model = dc.models.GATModel(1,
                 mode='classification',
                 batch_size=32,
                 learning_rate=0.0001,
                 dropout=0.25,
                 )
  elif model_name == "gcn":
    nb_epoch = 40
    model = dc.models.GCNModel(1,
                 mode='classification',
                 batch_size=32,
                 learning_rate=0.001,
                 dropout=0.1,
                 )

  elif model_name == 'tf':
    layer_sizes = hyper_parameters['layer_sizes']
    weight_init_stddevs = hyper_parameters['weight_init_stddevs']
    bias_init_consts = hyper_parameters['bias_init_consts']
    dropouts = hyper_parameters['dropouts']
    penalty = hyper_parameters['penalty']
    penalty_type = hyper_parameters['penalty_type']
    batch_size = hyper_parameters['batch_size']
    nb_epoch = hyper_parameters['nb_epoch']
    learning_rate = hyper_parameters['learning_rate']

    # Building tensorflow MultitaskDNN model.
    # The input width comes from the caller (it used to be hard-coded to
    # 2048, which silently ignored `n_features`).
    model = dc.models.MultitaskClassifier(
        len(tasks),
        n_features,
        layer_sizes=layer_sizes,
        weight_init_stddevs=weight_init_stddevs,
        bias_init_consts=bias_init_consts,
        dropouts=dropouts,
        weight_decay_penalty=penalty,
        weight_decay_penalty_type=penalty_type,
        batch_size=batch_size,
        learning_rate=learning_rate,
        random_seed=seed)

  elif model_name == 'tf_robust':
    layer_sizes = hyper_parameters['layer_sizes']
    weight_init_stddevs = hyper_parameters['weight_init_stddevs']
    bias_init_consts = hyper_parameters['bias_init_consts']
    dropouts = hyper_parameters['dropouts']

    bypass_layer_sizes = hyper_parameters['bypass_layer_sizes']
    bypass_weight_init_stddevs = hyper_parameters['bypass_weight_init_stddevs']
    bypass_bias_init_consts = hyper_parameters['bypass_bias_init_consts']
    bypass_dropouts = hyper_parameters['bypass_dropouts']

    penalty = hyper_parameters['penalty']
    penalty_type = hyper_parameters['penalty_type']
    batch_size = hyper_parameters['batch_size']
    nb_epoch = hyper_parameters['nb_epoch']
    learning_rate = hyper_parameters['learning_rate']

    # Building tensorflow robust MultitaskDNN model
    model = dc.models.RobustMultitaskClassifier(
        len(tasks),
        n_features,
        layer_sizes=layer_sizes,
        weight_init_stddevs=weight_init_stddevs,
        bias_init_consts=bias_init_consts,
        dropouts=dropouts,
        bypass_layer_sizes=bypass_layer_sizes,
        bypass_weight_init_stddevs=bypass_weight_init_stddevs,
        bypass_bias_init_consts=bypass_bias_init_consts,
        bypass_dropouts=bypass_dropouts,
        weight_decay_penalty=penalty,
        weight_decay_penalty_type=penalty_type,
        batch_size=batch_size,
        learning_rate=learning_rate,
        random_seed=seed)

  elif model_name == 'logreg':
    penalty = hyper_parameters['penalty']
    penalty_type = hyper_parameters['penalty_type']
    # nb_epoch=None means fit() is called without an epoch count.
    nb_epoch = None

    # Building scikit logistic regression model
    def model_builder(model_dir):
      sklearn_model = LogisticRegression(
          penalty=penalty_type,
          C=1. / penalty,
          class_weight="balanced",
          n_jobs=-1)
      return dc.models.sklearn_models.SklearnModel(
          sklearn_model, model_dir)

    model = dc.models.multitask.SingletaskToMultitask(
        tasks, model_builder)

  elif model_name == 'irv':
    penalty = hyper_parameters['penalty']
    batch_size = hyper_parameters['batch_size']
    nb_epoch = hyper_parameters['nb_epoch']
    learning_rate = hyper_parameters['learning_rate']
    n_K = hyper_parameters['n_K']

    # Transform fingerprints to IRV features
    transformer = dc.trans.IRVTransformer(n_K, len(tasks), train_dataset)
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    if test:
      test_dataset = transformer.transform(test_dataset)

    # Building tensorflow IRV model
    model = dc.models.MultitaskIRVClassifier(
        len(tasks),
        K=n_K,
        penalty=penalty,
        batch_size=batch_size,
        learning_rate=learning_rate,
        random_seed=seed,
        mode='classification')

  elif model_name == 'graphconv':
    batch_size = hyper_parameters['batch_size']
    nb_epoch = hyper_parameters['nb_epoch']
    learning_rate = hyper_parameters['learning_rate']
    n_filters = hyper_parameters['n_filters']
    n_fully_connected_nodes = hyper_parameters['n_fully_connected_nodes']

    model = dc.models.GraphConvModel(
        len(tasks),
        graph_conv_layers=[n_filters] * 2,
        dense_layer_size=n_fully_connected_nodes,
        batch_size=batch_size,
        learning_rate=learning_rate,
        random_seed=seed,
        mode='classification')

  elif model_name == 'dag':
    batch_size = hyper_parameters['batch_size']
    nb_epoch = hyper_parameters['nb_epoch']
    learning_rate = hyper_parameters['learning_rate']
    n_graph_feat = hyper_parameters['n_graph_feat']
    default_max_atoms = hyper_parameters['default_max_atoms']

    # Largest molecule over the datasets in use, capped at default_max_atoms.
    max_atoms_train = max([mol.get_num_atoms() for mol in train_dataset.X])
    max_atoms_valid = max([mol.get_num_atoms() for mol in valid_dataset.X])
    if test :
      max_atoms_test = max([mol.get_num_atoms() for mol in test_dataset.X])
      max_atoms = max([max_atoms_train, max_atoms_valid, max_atoms_test])
    else:
      max_atoms = max([max_atoms_train, max_atoms_valid])
    max_atoms = min([max_atoms, default_max_atoms])
    print('Maximum number of atoms: %i' % max_atoms)
    reshard_size = 256
    transformer = dc.trans.DAGTransformer(max_atoms=max_atoms)
    train_dataset.reshard(reshard_size)
    train_dataset = transformer.transform(train_dataset)
    valid_dataset.reshard(reshard_size)
    valid_dataset = transformer.transform(valid_dataset)
    if test:
      test_dataset.reshard(reshard_size)
      test_dataset = transformer.transform(test_dataset)

    model = dc.models.DAGModel(
        len(tasks),
        max_atoms=max_atoms,
        n_atom_feat=75,
        n_graph_feat=n_graph_feat,
        n_outputs=30,
        batch_size=batch_size,
        learning_rate=learning_rate,
        random_seed=seed,
        use_queue=False,
        mode='classification')

  elif model_name == 'weave':
    batch_size = hyper_parameters['batch_size']
    nb_epoch = hyper_parameters['nb_epoch']
    learning_rate = hyper_parameters['learning_rate']
    n_graph_feat = hyper_parameters['n_graph_feat']
    n_pair_feat = hyper_parameters['n_pair_feat']

    model = dc.models.WeaveModel(
        len(tasks),
        n_atom_feat=n_features,
        n_pair_feat=n_pair_feat,
        n_hidden=50,
        n_graph_feat=n_graph_feat,
        batch_size=batch_size,
        learning_rate=learning_rate,
        use_queue=False,
        random_seed=seed,
        mode='classification')

  elif model_name == 'textcnn':
    batch_size = hyper_parameters['batch_size']
    nb_epoch = hyper_parameters['nb_epoch']
    learning_rate = hyper_parameters['learning_rate']
    n_embedding = hyper_parameters['n_embedding']
    filter_sizes = hyper_parameters['filter_sizes']
    num_filters = hyper_parameters['num_filters']

    # The character dictionary is built from every dataset that was actually
    # supplied; test_dataset may legitimately be None when test=False.
    char_dict_sources = [train_dataset, valid_dataset]
    if test_dataset is not None:
      char_dict_sources.append(test_dataset)
    all_data = dc.data.DiskDataset.merge(char_dict_sources)
    char_dict, length = dc.models.TextCNNModel.build_char_dict(all_data)

    model = dc.models.TextCNNModel(
        len(tasks),
        char_dict,
        seq_length=length,
        n_embedding=n_embedding,
        filter_sizes=filter_sizes,
        num_filters=num_filters,
        learning_rate=learning_rate,
        batch_size=batch_size,
        use_queue=False,
        random_seed=seed,
        mode='classification')

  elif model_name == 'mpnn':
    batch_size = hyper_parameters['batch_size']
    nb_epoch = hyper_parameters['nb_epoch']
    learning_rate = hyper_parameters['learning_rate']
    T = hyper_parameters['T']
    M = hyper_parameters['M']

    model = dc.models.MPNNModel(
        len(tasks),
        n_atom_feat=n_features[0],
        n_pair_feat=n_features[1],
        n_hidden=n_features[0],
        T=T,
        M=M,
        batch_size=batch_size,
        learning_rate=learning_rate,
        use_queue=False,
        mode="classification")

  elif model_name == 'rf':
    n_estimators = hyper_parameters['n_estimators']
    nb_epoch = None

    # Building scikit random forest model
    def model_builder(model_dir):
      sklearn_model = RandomForestClassifier(
          class_weight="balanced", n_estimators=n_estimators, n_jobs=-1)
      return dc.models.sklearn_models.SklearnModel(
          sklearn_model, model_dir)

    model = dc.models.multitask.SingletaskToMultitask(
        tasks, model_builder)

  elif model_name == 'kernelsvm':
    C = hyper_parameters['C']
    gamma = hyper_parameters['gamma']
    nb_epoch = None

    # Building scikit learn Kernel SVM model
    def model_builder(model_dir):
      sklearn_model = SVC(
          C=C, gamma=gamma, class_weight="balanced", probability=True)
      return dc.models.SklearnModel(sklearn_model, model_dir)

    model = dc.models.multitask.SingletaskToMultitask(
        tasks, model_builder)

  elif model_name == 'xgb':
    max_depth = hyper_parameters['max_depth']
    learning_rate = hyper_parameters['learning_rate']
    n_estimators = hyper_parameters['n_estimators']
    gamma = hyper_parameters['gamma']
    min_child_weight = hyper_parameters['min_child_weight']
    max_delta_step = hyper_parameters['max_delta_step']
    subsample = hyper_parameters['subsample']
    colsample_bytree = hyper_parameters['colsample_bytree']
    colsample_bylevel = hyper_parameters['colsample_bylevel']
    reg_alpha = hyper_parameters['reg_alpha']
    reg_lambda = hyper_parameters['reg_lambda']
    scale_pos_weight = hyper_parameters['scale_pos_weight']
    base_score = hyper_parameters['base_score']
    # The hyperparameter seed takes precedence over the `seed` argument.
    seed = hyper_parameters['seed']
    early_stopping_rounds = hyper_parameters['early_stopping_rounds']
    nb_epoch = None

    esr = {'early_stopping_rounds': early_stopping_rounds}

    # Building xgboost classification model
    def model_builder(model_dir):
      import xgboost
      xgboost_model = xgboost.XGBClassifier(
          max_depth=max_depth,
          learning_rate=learning_rate,
          n_estimators=n_estimators,
          gamma=gamma,
          min_child_weight=min_child_weight,
          max_delta_step=max_delta_step,
          subsample=subsample,
          colsample_bytree=colsample_bytree,
          colsample_bylevel=colsample_bylevel,
          reg_alpha=reg_alpha,
          reg_lambda=reg_lambda,
          scale_pos_weight=scale_pos_weight,
          base_score=base_score,
          seed=seed)
      return dc.models.GBDTModel(
          xgboost_model, model_dir, **esr)

    model = dc.models.multitask.SingletaskToMultitask(
        tasks, model_builder)

  # Branches that set nb_epoch to None are fitted without an epoch count.
  if nb_epoch is None:
    model.fit(train_dataset)
  else:
    model.fit(train_dataset, nb_epoch=nb_epoch)

  train_scores[model_name] = model.evaluate(train_dataset, metric)
  valid_scores[model_name] = model.evaluate(valid_dataset, metric)
  if test:
    test_scores[model_name] = model.evaluate(test_dataset, metric)

  return train_scores, valid_scores, test_scores


## Instantiate model

In [None]:
# Metrics reported for every evaluated split: ROC-AUC, F1 and recall.
metric = [
    dc.metrics.Metric(score_fn)
    for score_fn in (
        dc.metrics.roc_auc_score,
        dc.metrics.f1_score,
        dc.metrics.recall_score,
    )
]

# benchmark_classification returns (train, valid, test) score dicts. The
# held-out split is passed as valid_dataset, so the second value holds its
# scores; the third stays empty because test=False.
train_scores, test_scores, _valid_scores = benchmark_classification(
    train_dataset=train_dataset,
    valid_dataset=test_dataset,
    test_dataset=None,
    tasks=[1],
    transformers=transformers,
    n_features=2048,
    metric=metric,
    model='xgb',
    test=False,
    hyper_parameters=None,
    seed=123,
)

# Hyperparameter Tuning
One of the most important aspects of machine learning is hyperparameter tuning. Many machine learning models have a number of hyperparameters that control aspects of the model. These hyperparameters typically cannot be learned directly by the same learning algorithm used for the rest of learning and have to be set in an alternate fashion.

### GCNModel Optimisation

In [None]:
# Exhaustive grid search over GCNModel hyperparameters, scored on the test split.
nb_epoch = [120, 40]
batch_size = [64, 32, 16]
learning_rate = [0.001, 0.0001]
dropout = [0.15, 0.10]
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)

results_path = "/mydrive/drug_discovery/GCNModel_Opt.csv"
result_columns = ["epoch", "dropout", "learning_rate", "batch_size",
                  "accuracy", "auc roc score"]
records = []  # one dict per evaluated configuration
y_test = test_dataset.y  # identical for every configuration

# product() walks the grid in the same order as nested loops would
# (epochs outermost, dropout innermost).
for ep, bsiz, lgr, dp in itertools.product(nb_epoch, batch_size,
                                           learning_rate, dropout):
  print("Curent params : ", ep, bsiz, lgr, dp)
  print("Instanciate model...\n")
  model = dc.models.GCNModel(1,
           mode='classification',
           batch_size=bsiz,
           learning_rate=lgr,
           dropout=dp,
           )
  print("Fitting model...")
  model.fit(train_dataset, nb_epoch=ep)
  test_score = model.evaluate(test_dataset, [metric])

  # Hard class labels from the predicted scores, as a float column so they
  # compare against y_test.
  y_test_pred = model.predict(test_dataset)
  y_tst_pred = np.argmax(y_test_pred, axis=1)
  y_test_prediction = np.expand_dims(y_tst_pred, -1).astype('float64')

  records.append({
      "epoch": ep,
      "dropout": dp,
      "learning_rate": lgr,
      "batch_size": bsiz,
      "accuracy": round(accuracy_score(y_test, y_test_prediction), 2),
      "auc roc score": round(test_score["roc_auc_score"], 2),
  })

  # Checkpoint after every configuration so an interrupted run keeps its
  # results; the frame also stays available in the notebook afterwards.
  parameters_results = pd.DataFrame(records, columns=result_columns)
  parameters_results.to_csv(results_path, index=False)


### RF Model Optimisation

In [None]:
# Exhaustive grid search over random-forest hyperparameters, scored on the test split.
n_estimators = [10, 100, 300, 500, 600, 1000]
max_features = ['sqrt', 'log2']
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)

results_path = "/mydrive/drug_discovery/RF_Opt.csv"
result_columns = ["n_estimators", "max_features", "accuracy", "auc roc score"]
records = []  # one dict per evaluated configuration
y_test = test_dataset.y  # identical for every configuration

# product() walks the grid in the same order as nested loops would
# (n_estimators outermost).
for n_est, max_f in itertools.product(n_estimators, max_features):
  print("Curent params : ", n_est, max_f)
  print("Instanciate model...")

  # Building scikit random forest model. The current grid values are bound
  # as defaults so the builder does not depend on the loop variables later.
  def model_builder(model_dir, n_est=n_est, max_f=max_f):
    sklearn_model = RandomForestClassifier(
        class_weight="balanced", n_estimators=n_est, max_features=max_f, n_jobs=-1)
    return dc.models.sklearn_models.SklearnModel(
        sklearn_model, model_dir)

  model = dc.models.multitask.SingletaskToMultitask(
      [1], model_builder)
  print("Fitting model...\n")
  model.fit(train_dataset)

  test_score = model.evaluate(test_dataset, [metric])

  # Take the first (only) task, then the highest-scoring class per sample.
  y_test_pred = model.predict(test_dataset)
  y_tst_pred = np.argmax(y_test_pred[:, 0], axis=1)
  y_test_prediction = np.expand_dims(y_tst_pred, -1).astype('float64')

  records.append({
      "n_estimators": n_est,
      "max_features": max_f,
      "accuracy": round(accuracy_score(y_test, y_test_prediction), 2),
      "auc roc score": round(test_score["roc_auc_score"], 2),
  })

  # Checkpoint after every configuration so an interrupted run keeps its
  # results; the frame also stays available in the notebook afterwards.
  parameters_results = pd.DataFrame(records, columns=result_columns)
  parameters_results.to_csv(results_path, index=False)

### DAGModel Optimisation

In [None]:
# Exhaustive grid search over DAGModel hyperparameters, scored on the test split.
batch_size = [64, 32]  #hyper_parameters['batch_size']
nb_epoch = [50, 40] #hyper_parameters['nb_epoch']
learning_rate = [0.0005, 0.001] #hyper_parameters['learning_rate']
n_graph_feat =  [30, 20] #hyper_parameters['n_graph_feat']
default_max_atoms = 60 #hyper_parameters['default_max_atoms']
seed = 123

# Largest molecule over both splits, capped at default_max_atoms.
max_atoms_train = max([mol.get_num_atoms() for mol in train_dataset.X])
max_atoms_test = max([mol.get_num_atoms() for mol in test_dataset.X])
max_atoms = max([max_atoms_train, max_atoms_test])
max_atoms = min([max_atoms, default_max_atoms])
print('Maximum number of atoms: %i' % max_atoms)

# NOTE: this rebinds train_dataset / test_dataset to their DAG-transformed
# versions, so re-running the cell transforms already-transformed data.
reshard_size = 256
transformer = dc.trans.DAGTransformer(max_atoms=max_atoms)
train_dataset.reshard(reshard_size)
train_dataset = transformer.transform(train_dataset)
test_dataset.reshard(reshard_size)
test_dataset = transformer.transform(test_dataset)
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)

results_path = "/mydrive/drug_discovery/DAGModel_Opt.csv"
result_columns = ["epoch", "n_graph_feat", "learning_rate", "batch_size",
                  "accuracy", "auc roc score"]
records = []  # one dict per evaluated configuration
y_test = test_dataset.y  # identical for every configuration

# product() walks the grid in the same order as nested loops would
# (batch size outermost, n_graph_feat innermost).
for btch_s, nb_ep, lrg_r, n_gph in itertools.product(batch_size, nb_epoch,
                                                     learning_rate, n_graph_feat):
  print("current conf : ", nb_ep, btch_s, lrg_r, n_gph)
  print("instantiate the model...")
  model = dc.models.DAGModel(
      1,
      max_atoms=max_atoms,
      n_atom_feat=75,
      n_graph_feat=n_gph,
      n_outputs=30,
      batch_size=btch_s,
      learning_rate=lrg_r,
      random_seed=seed,
      use_queue=False,
      mode='classification')

  print("Fitting the model...")
  model.fit(train_dataset, nb_epoch=nb_ep)
  print("evaluate the model...")
  test_score = model.evaluate(test_dataset, [metric])

  # Take the first (only) task, then the highest-scoring class per sample.
  y_test_pred = model.predict(test_dataset)
  y_tst_pred = np.argmax(y_test_pred[:, 0], axis=1)
  y_test_prediction = np.expand_dims(y_tst_pred, -1).astype('float64')

  records.append({
      "epoch": nb_ep,
      "n_graph_feat": n_gph,
      "learning_rate": lrg_r,
      "batch_size": btch_s,
      "accuracy": round(accuracy_score(y_test, y_test_prediction), 2),
      "auc roc score": round(test_score["roc_auc_score"], 2),
  })

  # Checkpoint after every configuration so an interrupted run keeps its
  # results; the frame also stays available in the notebook afterwards.
  print("save the model...\n")
  parameters_results = pd.DataFrame(records, columns=result_columns)
  parameters_results.to_csv(results_path, index=False)


# Load best model

### RF

In [None]:
def model_builder(model_dir):
  """Wrap a balanced 300-tree random forest as a DeepChem SklearnModel stored in `model_dir`."""
  forest = RandomForestClassifier(
      n_estimators=300,
      max_features='sqrt',
      class_weight="balanced",
      n_jobs=-1,
  )
  return dc.models.sklearn_models.SklearnModel(forest, model_dir)


# One forest per task; the task list here holds the single entry 1.
model = dc.models.multitask.SingletaskToMultitask([1], model_builder)
model.fit(train_dataset)


### GCN

In [None]:
# Settings of the retained GCN configuration (single task, 40 epochs).
gcn_settings = dict(
    mode='classification',
    batch_size=64,
    learning_rate=0.001,
    dropout=0.1,
)
model = dc.models.GCNModel(1, **gcn_settings)
model.fit(train_dataset, nb_epoch=40)

### DAG

In [None]:
# Retained DAG configuration: cap the molecule size, DAG-transform both
# splits, then train for 40 epochs.
seed = 123
default_max_atoms = 60
reshard_size = 256

# Largest molecule over both splits, capped at default_max_atoms.
max_atoms_train = max(mol.get_num_atoms() for mol in train_dataset.X)
max_atoms_test = max(mol.get_num_atoms() for mol in test_dataset.X)
max_atoms = min(max(max_atoms_train, max_atoms_test), default_max_atoms)
print('Maximum number of atoms: %i' % max_atoms)

# Reshard and transform the training split first, then the test split;
# both names are rebound to their transformed versions.
transformer = dc.trans.DAGTransformer(max_atoms=max_atoms)
train_dataset.reshard(reshard_size)
train_dataset = transformer.transform(train_dataset)
test_dataset.reshard(reshard_size)
test_dataset = transformer.transform(test_dataset)

dag_settings = dict(
    max_atoms=max_atoms,
    n_atom_feat=75,
    n_graph_feat=30,
    n_outputs=30,
    batch_size=64,
    learning_rate=0.0005,
    random_seed=seed,
    use_queue=False,
    mode='classification',
)
model = dc.models.DAGModel(1, **dag_settings)
model.fit(train_dataset, nb_epoch=40)

# Evaluation

In [None]:
def _predicted_labels(raw_pred):
  """Turn raw model predictions into a (n_samples, 1) float64 label column.

  The hyperparameter-search cells handle two prediction layouts: one with a
  task axis, indexed with ``[:, 0]`` (RF, DAG), and one without it (GCN).
  A 3-D input has its first task selected; a 2-D input is used as is. The
  label is the index of the highest-scoring class.
  """
  scores = np.asarray(raw_pred)
  if scores.ndim == 3:
    scores = scores[:, 0]
  return np.expand_dims(np.argmax(scores, axis=1), -1).astype('float64')


# Labels and hard predictions for the model trained in the cell run last.
y_train = train_dataset.y
y_train_pred = model.predict(train_dataset)
y_train_prediction = _predicted_labels(y_train_pred)

y_test = test_dataset.y
y_test_pred = model.predict(test_dataset)
y_test_prediction = _predicted_labels(y_test_pred)

In [None]:

def _print_dc_scores(dc_metric):
  """Print the train and test score of `model` for one DeepChem metric."""
  print('Training set score:', model.evaluate(train_dataset, [dc_metric]))
  print('Test set score:', model.evaluate(test_dataset, [dc_metric]))


# Metrics computed by DeepChem directly from the model.
print("###################### roc auc score ##############\n")
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
_print_dc_scores(metric)

print("\n##################### Recall ####################")
metric = dc.metrics.Metric(dc.metrics.recall_score)
_print_dc_scores(metric)

# Metrics computed by scikit-learn from the hard label predictions.
for banner, score_fn in (
    ("\n################### precision ################################", precision_score),
    ("\n################### MCC ################################", matthews_corrcoef),
    ("\n################### cohen's kappa ################################", cohen_kappa_score),
    ("\n################### Accuracy ################################", accuracy_score),
    ("################### Balanced accuracy ####################", balanced_accuracy_score),
):
  print(banner)
  print("'Training set score:", score_fn(y_train, y_train_prediction))
  print("'Test set score:", score_fn(y_test, y_test_prediction))

print("\n################### F1 score ####################\n")
metric = dc.metrics.Metric(dc.metrics.f1_score)
_print_dc_scores(metric)