In [1]:
from hyperopt import fmin, hp, tpe, Trials
import numpy as np

import os
import sys

BASEDIR = os.path.dirname(os.getcwd())
sys.path.append(BASEDIR)

from copy import deepcopy
from dgl.dataloading import GraphDataLoader
from src.model.attentivefp_postnet import attentivefpPostNet
from src.utils.mol.attfp_graph import MoleculeDataset, collate_fn
from src.config.attentivefp_postnet import attentivefpPostNetArgs
from src.pipeline.ensemble import training_ensemble_models
from src.utils.basic.logger import Writer
from src.utils.model.metrics import accuracy, roc_auc_score, prc_auc, F1, MCC, expected_calibration_error, OverconfidentFalseRate, OverconfidentFalseNegatives, Brier
import torch
import pandas as pd

# Initialize

In [2]:
# Cap the number of CPU threads PyTorch uses for intra-op parallelism.
torch.set_num_threads(4)

# Name of the data/results sub-directory, and the GPU index stored in the model config as `gpu`.
target_name, gpu_num = "pgp", 0

# Fixed hyperparameter set evaluated by this notebook; every entry is copied
# onto the model config by attribute name inside `func`.
h_p = dict(
    hidden_size=600,
    p_dropout=0.05,
    T=2,
    radius=3,
    ffn_num_layers=3,
    init_lr=0.002,
    latent_dim=6,
    n_density=2,
    density_type='iaf_flow',
)

# Hyperparameters that `func` casts to int before applying them.
INT_KEYS = [
    'hidden_size',
    'ffn_num_layers',
    'n_density',
    'latent_dim',
]

# Training function

In [3]:
# Where models, predictions and the history log are written, and where the
# input CSVs for this target are read from.
SAVEDIR = os.path.join(BASEDIR, "results", target_name, 'AttFpPost')
DATADIR = os.path.join(BASEDIR, "data", target_name)

# Make sure the output directory exists before the log file is opened, in case
# Writer does not create missing parent directories (e.g. on a fresh checkout).
os.makedirs(SAVEDIR, exist_ok=True)

# Round counter interpolated into the "ROUND {n}" lines that func writes to the log.
n = 0
logger = Writer(os.path.join(SAVEDIR, "history.log"))
_N_FOLDS = 10             # folds stored as tenfold/train_{i}.csv and tenfold/valid_{i}.csv
_LOADER_BATCH_SIZE = 128  # batch size used by every GraphDataLoader built below

# (key in the per-split score dict, label written to the log); order fixes the log order.
_METRIC_LABELS = (
    ("roc-auc", "ROC-AUC"),
    ("prc-auc", "PRC-AUC"),
    ("accuracy", "ACC"),
    ("MCC", "MCC"),
    ("F1", "F1"),
    ("ECE", "ECE"),
    ("OFR", "OFR"),
    ("OFN", "OFN"),
    ("Brier", "Brier"),
)

# Metrics read back from the *_prediction_performance.csv files; the remaining
# ones (ECE, OFR, OFN, Brier) are recomputed from the saved predictions.
_CSV_METRICS = ("roc-auc", "prc-auc", "accuracy", "MCC", "F1")


def _make_loader(dataset, shuffle):
    """Build a GraphDataLoader for ``dataset`` and attach its SMILES as ``loader.smiles``."""
    loader = GraphDataLoader(dataset=dataset, collate_fn=collate_fn,
                             batch_size=_LOADER_BATCH_SIZE, drop_last=False, shuffle=shuffle)
    # Each SMILES string is wrapped in a one-element list, as in the original code.
    loader.smiles = [[smiles] for smiles in dataset.smiles_list]
    return loader


def _calibration_scores(labels, predictions):
    """Compute ECE, OFR, OFN and Brier for one set of labels/predictions."""
    # Note the argument order: OFR/OFN take (prediction, label), the others (label, prediction).
    return {
        "ECE": expected_calibration_error(labels, predictions, bins=10),
        "OFR": OverconfidentFalseRate(predictions, labels),
        "OFN": OverconfidentFalseNegatives(predictions, labels),
        "Brier": Brier(labels, predictions),
    }


def _read_split(model_dir, split):
    """Load labels, predictions and all metric values saved for ``split`` ("valid" or "test")."""
    # Each file is read once per fold instead of once per metric.
    predictions_df = pd.read_csv(os.path.join(model_dir, f"{split}_prediction.csv"))
    performance_df = pd.read_csv(os.path.join(model_dir, f"{split}_prediction_performance.csv"))

    labels = predictions_df["property_label"].to_numpy()
    predictions = predictions_df["property_pred"].to_numpy()

    scores = {name: performance_df[name].iloc[0] for name in _CSV_METRICS}
    scores.update(_calibration_scores(labels, predictions))
    return labels, predictions, scores


def _log_fold_summary(round_idx, split, scores):
    """Log mean +/- std across folds for every metric of one split ("Valid" or "Test")."""
    for key, label in _METRIC_LABELS:
        values = scores[key]
        # Labels are left-padded to 7 characters so the columns line up in the log.
        logger(f'ROUND {round_idx} {split} {label:<7} {np.mean(values)} +/- {np.std(values)}')
    logger(' ')


def func(hyperparams):
    """Train one model per fold with ``hyperparams`` and return the negated mean validation ROC-AUC.

    For each of the ten folds a model is trained via ``training_ensemble_models``
    (``ensemble_num=1``) into ``SAVEDIR/fold_{i}``. The prediction and performance
    CSVs under ``fold_{i}/model_0`` are then read back, per-fold validation/test
    metrics are summarised as mean +/- std in the log, and the fold-averaged test
    prediction is scored as an ensemble.

    Args:
        hyperparams: mapping of config attribute name to value. Keys listed in
            ``INT_KEYS`` are cast to ``int``; keys absent from the mapping keep
            the config default. The mapping itself is not modified.

    Returns:
        ``-mean(valid ROC-AUC over folds)``, i.e. a value to be minimised.

    Side effects:
        Writes to ``logger``, prints the resolved config, writes training output
        under ``SAVEDIR``, and advances the module-level round counter ``n`` by
        one after a successful evaluation so repeated calls are told apart in
        the log.
    """
    global n

    logger(" ")
    logger(" ")

    # Work on a copy so the caller's dict (e.g. the module-level h_p) is left untouched.
    params = dict(hyperparams)
    for key in INT_KEYS:
        if key in params:
            params[key] = int(params[key])

    hyper_args = deepcopy(attentivefpPostNetArgs().parse_args([], known_only=True))
    for key, value in params.items():
        setattr(hyper_args, key, value)

    # Fixed settings applied on top of the searched hyperparameters.
    hyper_args.dataset_type = "classification"
    hyper_args.latent_dim = int(hyper_args.latent_dim)
    hyper_args.metric = "roc-auc"
    hyper_args.extra_metrics = ["MCC", "prc-auc", "accuracy", "F1"]
    hyper_args.ffn_hidden_size = hyper_args.hidden_size
    hyper_args.early_stopping_num = 30
    hyper_args.gpu = gpu_num
    hyper_args.log_frequency = 20
    # NOTE(review): the config says 64 but the loaders below are built with
    # batch_size=128 -- confirm which value is intended.
    hyper_args.batch_size = 64
    hyper_args.at_least_epoch = 60
    print(hyper_args)

    save_dir = SAVEDIR

    # ---- Train one model per fold ------------------------------------------------
    for fold in range(_N_FOLDS):
        train_dataset = MoleculeDataset(os.path.join(DATADIR, "tenfold", f"train_{fold}.csv"))
        valid_dataset = MoleculeDataset(os.path.join(DATADIR, "tenfold", f"valid_{fold}.csv"))
        test_dataset = MoleculeDataset(os.path.join(DATADIR, "ind_152.csv"))

        train_dataloader = _make_loader(train_dataset, shuffle=True)
        valid_dataloader = _make_loader(valid_dataset, shuffle=False)
        test_dataloader = _make_loader(test_dataset, shuffle=False)

        # Class counts of the training fold as [#negatives, #positives], stored on the config as N.
        train_targets = [target for _, target in train_dataset]
        n_positive = np.sum(train_targets).astype(np.int64)
        hyper_args.N = torch.tensor([len(train_targets) - n_positive, n_positive],
                                    dtype=torch.float64)

        training_ensemble_models(os.path.join(save_dir, f"fold_{fold}"),
                                 attentivefpPostNet,
                                 hyper_args,
                                 train_dataloader,
                                 valid_dataloader=valid_dataloader,
                                 test_dataloader=test_dataloader,
                                 ensemble_num=1)

    # ---- Collect per-fold results ------------------------------------------------
    # The accumulators are created once here rather than inside the training loop.
    valid_scores = {key: [] for key, _ in _METRIC_LABELS}
    test_scores = {key: [] for key, _ in _METRIC_LABELS}
    test_prediction = []  # per-fold test predictions, averaged for the ensemble
    test_label = None     # taken from the last fold read; every fold uses ind_152.csv

    for fold in range(_N_FOLDS):
        model_dir = os.path.join(save_dir, f"fold_{fold}", "model_0")

        _, _, fold_valid = _read_split(model_dir, "valid")
        test_label, fold_test_prediction, fold_test = _read_split(model_dir, "test")
        test_prediction.append(fold_test_prediction)

        for key, _ in _METRIC_LABELS:
            valid_scores[key].append(fold_valid[key])
            test_scores[key].append(fold_test[key])

    # ---- Report --------------------------------------------------------------------
    _log_fold_summary(n, "Valid", valid_scores)
    _log_fold_summary(n, "Test", test_scores)

    # Average the fold predictions once and score the resulting ensemble.
    ensemble_prediction = np.mean(test_prediction, axis=0)
    logger(f'ROUND {n} Ensemble Test ROC-AUC {roc_auc_score(test_label, ensemble_prediction)}')
    logger(f'ROUND {n} Ensemble Test PRC-AUC {prc_auc(test_label, ensemble_prediction)}')
    logger(f'ROUND {n} Ensemble Test ACC {accuracy(test_label, ensemble_prediction)}')
    logger(f'ROUND {n} Ensemble Test MCC {MCC(test_label, ensemble_prediction)}')
    logger(f'ROUND {n} Ensemble Test F1 {F1(test_label, ensemble_prediction)}')
    logger(f'ROUND {n} Ensemble Test ECE {expected_calibration_error(test_label, ensemble_prediction, bins=10)}')
    logger(f'ROUND {n} Ensemble Test OFR {OverconfidentFalseRate(ensemble_prediction, test_label)}')
    logger(f'ROUND {n} Ensemble Test OFN {OverconfidentFalseNegatives(ensemble_prediction, test_label)}')
    logger(f'ROUND {n} Ensemble Test Brier {Brier(test_label, ensemble_prediction)}')
    logger(' ')

    mean_valid_roc = np.mean(valid_scores["roc-auc"])

    # Advance the round counter; previously `global n` was declared but n never
    # changed, so every call was logged as ROUND 0.
    n += 1

    return -mean_valid_roc

# Run one ten-fold evaluation with the fixed hyperparameters in h_p; the value
# displayed for this cell is func's return value, the negated mean validation ROC-AUC.
func(h_p)

07-17 04:22:15	 
07-17 04:22:15	 


{'N': [100, 100],
 'T': 2,
 'activation': 'ReLU',
 'aggregation': 'mean',
 'aggregation_norm': 100,
 'at_least_epoch': 60,
 'atom_descriptors': None,
 'atom_descriptors_size': None,
 'atom_messages': False,
 'batch_size': 64,
 'bias': False,
 'cuda': True,
 'dataset_type': 'classification',
 'density_type': 'iaf_flow',
 'depth': 3,
 'device': device(type='cuda', index=0),
 'dropout': 0.0,
 'early_stopping_num': 30,
 'empty_cache': False,
 'explicit_h': False,
 'extra_metrics': ['MCC', 'prc-auc', 'accuracy', 'F1'],
 'features_only': False,
 'features_size': None,
 'ffn_hidden_size': 600,
 'ffn_num_layers': 3,
 'fingerprint_dim': 200,
 'gpu': 0,
 'grad_clip': None,
 'hidden_size': 600,
 'init_lr': 0.002,
 'input_feature_dim': 200,
 'latent_dim': 6,
 'log_frequency': 20,
 'loss_func': <function UCE_loss at 0x7f41bd940820>,
 'metric': 'roc-auc',
 'metrics': ['roc-auc', 'MCC', 'prc-auc', 'accuracy', 'F1'],
 'minimize_score': False,
 'mpn_shared': False,
 'multiclass_num_classes': 3,
 'n_den

1850it [00:04, 430.52it/s]
206it [00:00, 463.48it/s]
152it [00:00, 448.47it/s]
07-17 04:22:20	Training 0th model.
07-17 04:23:50	
07-17 04:23:50	Start Evaluating on Evaluation Set, EPOCH 60 BATCH 900
100%|██████████| 2/2 [00:00<00:00, 18.96it/s]
07-17 04:23:50	roc-auc: 0.567
07-17 04:23:50	MCC: 0.446
07-17 04:23:50	prc-auc: 0.631
07-17 04:23:50	accuracy: 0.728
07-17 04:23:50	F1: 0.815
07-17 04:23:52	
07-17 04:23:52	Start Evaluating on Evaluation Set, EPOCH 62 BATCH 920
100%|██████████| 2/2 [00:00<00:00, 19.19it/s]
07-17 04:23:53	roc-auc: 0.777
07-17 04:23:53	MCC: 0.488
07-17 04:23:53	prc-auc: 0.801
07-17 04:23:53	accuracy: 0.748
07-17 04:23:53	F1: 0.826
07-17 04:23:55	
07-17 04:23:55	Start Evaluating on Evaluation Set, EPOCH 63 BATCH 940
100%|██████████| 2/2 [00:00<00:00, 19.52it/s]
07-17 04:23:55	roc-auc: 0.907
07-17 04:23:55	MCC: 0.655
07-17 04:23:55	prc-auc: 0.928
07-17 04:23:55	accuracy: 0.835
07-17 04:23:55	F1: 0.875
07-17 04:23:57	
07-17 04:23:57	Start Evaluating on Evaluation Se

-0.9204818515604511