In [1]:
from hyperopt import fmin, hp, tpe, Trials
import numpy as np

import os
import sys

BASEDIR = os.path.dirname(os.getcwd())
sys.path.append(BASEDIR)

from copy import deepcopy
from dgl.dataloading import GraphDataLoader
from src.model.attentivefp import attentivefp
from src.utils.mol.attfp_graph import MoleculeDataset, collate_fn
from src.config.attentivefp import attentivefpArgs
from src.pipeline.ensemble import training_ensemble_models
from src.utils.basic.logger import Writer
from src.utils.model.metrics import accuracy, roc_auc_score, prc_auc
import torch
import pandas as pd

# Initialize

In [2]:
# Limit the number of CPU threads PyTorch may use.
torch.set_num_threads(4)

# Target/dataset identifier and the GPU index passed on to the training config.
target_name, gpu_num = "NFtest", 0

# Baseline hyperparameters used for a single reference run of the objective.
h_p = dict(
    hidden_size=300,
    p_dropout=0.1,
    dropout=0.1,
    T=2,
    radius=3,
    fingerprint_dim=150,
    ffn_num_layers=3,
    init_lr=0.001,
)

# Hyperparameters that the objective casts to int before use.
# NOTE(review): 'fingerprint_dim' is integer-valued above but is not listed
# here -- confirm the search space always yields an int for it.
INT_KEYS = [
    'hidden_size',
    'radius',
    'T',
    'ffn_num_layers',
]

# Training function

In [None]:
# Output (results) and input (data) directories for the selected target.
SAVEDIR = os.path.join(BASEDIR, "results", target_name, 'AttFp')
DATADIR = os.path.join(BASEDIR, "data", target_name)

# Make sure the results directory exists before the history log is opened in
# it; Writer's implementation is not visible here, so it may not create
# missing parent directories on its own.
os.makedirs(SAVEDIR, exist_ok=True)

# Round counter, incremented by func() on every call. Re-running this cell
# resets it to 0, so later rounds would reuse the ROUND_1, ROUND_2, ... folders.
n = 0

# Shared logger; func() calls it with one message string at a time.
logger = Writer(os.path.join(SAVEDIR, "history.log"))
def _build_loader(dataset, shuffle):
    """Wrap ``dataset`` in a GraphDataLoader and attach its SMILES as ``.smiles``."""
    loader = GraphDataLoader(dataset=dataset, collate_fn=collate_fn, batch_size=512,
                             drop_last=False, shuffle=shuffle)
    # One single-element list per molecule; presumably consumed by the training
    # pipeline when it writes prediction files -- TODO confirm.
    loader.smiles = [[s] for s in dataset.smiles_list]
    return loader


def func(hyperparams, n_repeats=5):
    """Train and evaluate AttentiveFP for one hyperparameter set.

    Each call is one "round": the module-level counter ``n`` is incremented,
    ``n_repeats`` single models are trained on the same train/valid/test CSV
    files, and the per-repeat metrics are read back from disk, logged and
    aggregated. Note that the ``fold_{i}`` output folders are repeated runs on
    an identical split, not cross-validation folds.

    Parameters
    ----------
    hyperparams : dict
        Mapping of config attribute name -> value. Keys listed in ``INT_KEYS``
        are cast to ``int``. The mapping itself is not modified; a converted
        copy is used internally.
    n_repeats : int, optional
        Number of independent training runs per round (default 5).

    Returns
    -------
    float
        The negated mean validation ROC-AUC over the repeats, so that a
        minimiser (presumably hyperopt's ``fmin``) maximises ROC-AUC.

    Raises
    ------
    ValueError
        If ``n_repeats`` is smaller than 1.
    """
    global n

    if n_repeats < 1:
        raise ValueError(f"n_repeats must be >= 1, got {n_repeats}")

    logger(" ")
    logger(" ")

    n = n + 1
    logger(f"ROUND {n}")

    BASESAVEDIR = os.path.join(SAVEDIR, f"ROUND_{n}")

    # Work on a converted copy so the caller's dict is left untouched. Key
    # order is preserved because it determines the output folder name.
    hyperparams = {key: int(value) if key in INT_KEYS else value
                   for key, value in hyperparams.items()}

    # A fresh config object is parsed on every call, so no extra copy is needed.
    hyper_args = attentivefpArgs().parse_args([], known_only=True)
    folder_name = '_'.join(f'{key}_{value}' for key, value in hyperparams.items())
    save_dir = os.path.join(BASESAVEDIR, folder_name)

    for key, value in hyperparams.items():
        setattr(hyper_args, key, value)

    # Fixed (non-searched) settings applied on top of the hyperparameters.
    # NOTE(review): batch_size is set to 128 here while the data loaders are
    # built with batch_size=512 -- confirm which value is intended.
    fixed_settings = {
        "dataset_type": "classification",
        "metric": "roc-auc",
        "extra_metrics": ["prc-auc", "accuracy"],
        "ffn_hidden_size": hyper_args.hidden_size,
        "early_stopping_num": 10,
        "gpu": gpu_num,
        "log_frequency": 20,
        "batch_size": 128,
        "at_least_epoch": 10,
    }
    for key, value in fixed_settings.items():
        setattr(hyper_args, key, value)
    print(hyper_args)

    # The datasets are identical for every repeat, so load them once; only the
    # loaders are rebuilt per repeat.
    train_dataset = MoleculeDataset(os.path.join(DATADIR, "NFtrain.csv"))
    valid_dataset = MoleculeDataset(os.path.join(DATADIR, "NFvalid.csv"))
    test_dataset  = MoleculeDataset(os.path.join(DATADIR, "NFtest.csv"))

    for i in range(n_repeats):
        training_ensemble_models(os.path.join(save_dir, f"fold_{i}"),
                                 attentivefp,
                                 hyper_args,
                                 _build_loader(train_dataset, shuffle=True),
                                 valid_dataloader=_build_loader(valid_dataset, shuffle=False),
                                 test_dataloader=_build_loader(test_dataset, shuffle=False),
                                 ensemble_num=1)

    # Accumulators are created once, before the collection loop.
    test_prediction = []
    valid_ROC, valid_PRC, valid_ACC = [], [], []
    test_ROC, test_PRC, test_ACC = [], [], []
    test_label = None

    for i in range(n_repeats):
        temp_dir = os.path.join(save_dir, f"fold_{i}", "model_0")

        # Read every result file once per repeat.
        predictions = pd.read_csv(os.path.join(temp_dir, "test_prediction.csv"))
        valid_perf = pd.read_csv(os.path.join(temp_dir, "valid_prediction_performance.csv"))
        test_perf = pd.read_csv(os.path.join(temp_dir, "test_prediction_performance.csv"))

        test_prediction.append(predictions["property_pred"].to_numpy())
        # Every repeat evaluates the same test CSV, so the labels of the last
        # repeat are used for the ensemble metrics.
        test_label = predictions["property_label"].to_numpy()

        valid_ROC.append(valid_perf["roc-auc"].iloc[0])
        valid_PRC.append(valid_perf["prc-auc"].iloc[0])
        valid_ACC.append(valid_perf["accuracy"].iloc[0])

        test_ROC.append(test_perf["roc-auc"].iloc[0])
        test_PRC.append(test_perf["prc-auc"].iloc[0])
        test_ACC.append(test_perf["accuracy"].iloc[0])

    logger(f'ROUND {n} Valid ROC-AUC {np.mean(valid_ROC)} +/- {np.std(valid_ROC)}')
    logger(f'ROUND {n} Valid PRC-AUC {np.mean(valid_PRC)} +/- {np.std(valid_PRC)}')
    logger(f'ROUND {n} Valid ACC     {np.mean(valid_ACC)} +/- {np.std(valid_ACC)}')
    logger(' ')
    logger(f'ROUND {n} Test ROC-AUC {np.mean(test_ROC)} +/- {np.std(test_ROC)}')
    logger(f'ROUND {n} Test PRC-AUC {np.mean(test_PRC)} +/- {np.std(test_PRC)}')
    logger(f'ROUND {n} Test ACC     {np.mean(test_ACC)} +/- {np.std(test_ACC)}')
    logger(' ')

    # Ensemble prediction: element-wise mean of the per-repeat test predictions.
    ensemble_prediction = np.mean(test_prediction, axis=0)
    logger(f'ROUND {n} Ensemble Test ROC-AUC {roc_auc_score(test_label, ensemble_prediction)}')
    logger(f'ROUND {n} Ensemble Test PRC-AUC {prc_auc(test_label, ensemble_prediction)}')
    logger(f'ROUND {n} Ensemble Test ACC {accuracy(test_label, ensemble_prediction)}')
    logger(' ')

    return -np.mean(valid_ROC)

# Run the objective once with the baseline hyperparameters in h_p. As the last
# expression of the cell, its return value (the negated mean validation
# ROC-AUC) is shown as the cell output.
func(h_p)