In [1]:
from hyperopt import fmin, hp, tpe, Trials
import numpy as np

import os
import sys

BASEDIR = os.path.dirname(os.getcwd())
sys.path.append(BASEDIR)

from copy import deepcopy
from functools import partial
from src.model.attentivefp_postnet import attentivefpPostNet
from src.utils.mol.attfp_graph import MoleculeDataset, MoleculeDataLoader
from src.config.attentivefp_postnet import attentivefpPostNetArgs
from src.pipeline.ensemble import training_ensemble_models
from src.utils.basic.logger import Writer
from src.utils.model.metrics import accuracy, roc_auc_score, prc_auc, EF1, MCC, bedroc_score
import torch
import pandas as pd

# Initialize

In [2]:
# Limit the number of CPU threads torch uses.
torch.set_num_threads(4)

# Index of the GPU handed to the model arguments (hyper_args.gpu).
gpu_num = 0

# PCBA targets; one training run is performed for each entry.
target_list = [
    'MAPK1',
    'FEN1',
    'PKM2',
    'GBA',
    'ALDH1',
    'VDR',
    'KAT2A',
]

# Hyperparameter names whose values are cast to int before being applied.
INT_KEYS = [
    'hidden_size',
    'ffn_num_layers',
    'n_density',
    'latent_dim',
]

# Default hyperparameters (no search is performed in this notebook section).
h_p = dict(
    hidden_size=300,
    p_dropout=0.1,
    dropout=0.1,
    T=2,
    radius=3,
    ffn_num_layers=3,
    init_lr=0.001,
    latent_dim=6,
    n_density=6,
    density_type='iaf_flow',
)

# Training function

In [None]:
NUM_ROUNDS = 5  # how many times the whole cross-validation procedure is repeated per target
NUM_FOLDS = 5   # folds read as {target}_train_{i}.csv / {target}_valid_{i}.csv
# Column names read from the *_prediction_performance.csv files.
METRIC_COLUMNS = ("roc-auc", "prc-auc", "accuracy", "MCC", "EF1", "BEDROC")

for target_name in target_list:

    SAVEDIR = os.path.join(BASEDIR, "results", "PCBA", target_name, 'AttFpPost')
    DATADIR = os.path.join(BASEDIR, "data", "PCBA", target_name)
    # Writer is defined outside this notebook; create the directory up front so that
    # opening history.log does not depend on whether Writer creates it.
    os.makedirs(SAVEDIR, exist_ok=True)
    logger = Writer(os.path.join(SAVEDIR, "history.log"))

    def func(hyperparams):
        """Train and evaluate attentivefpPostNet on the current target.

        Runs NUM_ROUNDS rounds; each round trains one model per fold, then reads the
        per-fold result CSVs back and logs mean/std of the validation and test
        metrics plus the metrics of the fold-averaged test prediction.

        Reads the module-level names SAVEDIR, DATADIR, target_name, logger,
        INT_KEYS and gpu_num.

        Args:
            hyperparams: mapping of argument name -> value; every entry is set as an
                attribute on the model arguments. Keys listed in INT_KEYS are cast
                to int. The mapping itself is not modified.

        Returns:
            The negated mean validation "roc-auc" over the folds of the last round.
        """
        logger(" ")
        logger(" ")

        # Work on a copy so the caller's dict (the shared module-level h_p) stays untouched.
        hyperparams = dict(hyperparams)
        for key in INT_KEYS:
            hyperparams[key] = int(hyperparams[key])

        # A local loop instead of a module-level counter: the function can be called
        # more than once and the value returned below is always bound.
        for round_idx in range(1, NUM_ROUNDS + 1):
            logger(f"ROUND {round_idx}")

            save_dir = os.path.join(SAVEDIR, f"ROUND_{round_idx}")

            hyper_args = deepcopy(attentivefpPostNetArgs().parse_args([], known_only=True))
            for key, value in hyperparams.items():
                setattr(hyper_args, key, value)

            hyper_args.dataset_type = "classification"
            hyper_args.latent_dim = int(hyper_args.latent_dim)
            hyper_args.metric = "BEDROC"
            hyper_args.extra_metrics = ["roc-auc", "MCC", "prc-auc", "accuracy", "EF1"]
            hyper_args.ffn_hidden_size = hyper_args.hidden_size
            hyper_args.early_stopping_num = 50
            hyper_args.gpu = gpu_num
            hyper_args.log_frequency = 300
            hyper_args.batch_size = 1024
            print(hyper_args)

            for fold in range(NUM_FOLDS):

                train_dataset = MoleculeDataset(os.path.join(DATADIR, f"{target_name}_train_{fold}.csv"))
                valid_dataset = MoleculeDataset(os.path.join(DATADIR, f"{target_name}_valid_{fold}.csv"))
                test_dataset = MoleculeDataset(os.path.join(DATADIR, f"{target_name}_test.csv"))

                train_dataloader = MoleculeDataLoader(dataset=train_dataset, batch_size=hyper_args.batch_size, shuffle=True , class_balance=True)
                train_dataloader.smiles = [[s] for s in train_dataset.smiles_list]
                valid_dataloader = MoleculeDataLoader(dataset=valid_dataset, batch_size=hyper_args.batch_size, shuffle=False)
                valid_dataloader.smiles = [[s] for s in valid_dataset.smiles_list]
                test_dataloader  = MoleculeDataLoader(dataset=test_dataset,  batch_size=hyper_args.batch_size, shuffle=False)
                test_dataloader.smiles = [[s] for s in test_dataset.smiles_list]

                train_targets = [target for _, target in train_dataset]

                # N = [len - sum, sum] of the training targets.
                # NOTE(review): presumably labels are 0/1, making this
                # [negative count, positive count] - confirm against MoleculeDataset.
                target_sum = np.sum(train_targets).astype(np.int64)
                hyper_args.N = torch.tensor([len(train_targets) - target_sum, target_sum],
                                            dtype=torch.float64)

                training_ensemble_models(os.path.join(save_dir, f"fold_{fold}"),
                                        attentivefpPostNet,
                                        hyper_args,
                                        train_dataloader,
                                        valid_dataloader=valid_dataloader,
                                        test_dataloader=test_dataloader,
                                        ensemble_num=1)

            # Collect the results written to disk by the training runs of this round.
            test_prediction = []
            valid_scores = {column: [] for column in METRIC_COLUMNS}
            test_scores = {column: [] for column in METRIC_COLUMNS}
            test_label = None

            for fold in range(NUM_FOLDS):
                model_dir = os.path.join(save_dir, f"fold_{fold}", "model_0")

                # Parse every result file once per fold instead of once per metric.
                test_pred_df = pd.read_csv(os.path.join(model_dir, "test_prediction.csv"))
                valid_perf_df = pd.read_csv(os.path.join(model_dir, "valid_prediction_performance.csv"))
                test_perf_df = pd.read_csv(os.path.join(model_dir, "test_prediction_performance.csv"))

                test_prediction.append(test_pred_df["property_pred"].to_numpy())
                for column in METRIC_COLUMNS:
                    valid_scores[column].append(valid_perf_df[column].iloc[0])
                    test_scores[column].append(test_perf_df[column].iloc[0])

                # Overwritten on every fold; the labels of the last fold are the ones used below.
                # NOTE(review): every fold evaluates {target}_test.csv, so the labels are
                # presumably identical across folds - confirm row order is preserved.
                test_label = test_pred_df["property_label"].to_numpy()

            logger(f'ROUND {round_idx} Valid ROC-AUC {np.mean(valid_scores["roc-auc"])} +/- {np.std(valid_scores["roc-auc"])}')
            logger(f'ROUND {round_idx} Valid PRC-AUC {np.mean(valid_scores["prc-auc"])} +/- {np.std(valid_scores["prc-auc"])}')
            logger(f'ROUND {round_idx} Valid ACC     {np.mean(valid_scores["accuracy"])} +/- {np.std(valid_scores["accuracy"])}')
            logger(f'ROUND {round_idx} Valid MCC     {np.mean(valid_scores["MCC"])} +/- {np.std(valid_scores["MCC"])}')
            logger(f'ROUND {round_idx} Valid EF1     {np.mean(valid_scores["EF1"])} +/- {np.std(valid_scores["EF1"])}')
            logger(f'ROUND {round_idx} Valid BEDROC  {np.mean(valid_scores["BEDROC"])} +/- {np.std(valid_scores["BEDROC"])}')

            logger(' ')
            logger(f'ROUND {round_idx} Test ROC-AUC {np.mean(test_scores["roc-auc"])} +/- {np.std(test_scores["roc-auc"])}')
            logger(f'ROUND {round_idx} Test PRC-AUC {np.mean(test_scores["prc-auc"])} +/- {np.std(test_scores["prc-auc"])}')
            logger(f'ROUND {round_idx} Test ACC     {np.mean(test_scores["accuracy"])} +/- {np.std(test_scores["accuracy"])}')
            logger(f'ROUND {round_idx} Test MCC     {np.mean(test_scores["MCC"])} +/- {np.std(test_scores["MCC"])}')
            logger(f'ROUND {round_idx} Test EF1     {np.mean(test_scores["EF1"])}  +/- {np.std(test_scores["EF1"])}')
            logger(f'ROUND {round_idx} Test BEDROC  {np.mean(test_scores["BEDROC"])}  +/- {np.std(test_scores["BEDROC"])}')

            # Ensemble = element-wise mean of the per-fold test predictions, computed once.
            ensemble_prediction = np.mean(test_prediction, axis=0)

            logger(' ')
            logger(f'ROUND {round_idx} Ensemble Test ROC-AUC {roc_auc_score(test_label, ensemble_prediction)}')
            logger(f'ROUND {round_idx} Ensemble Test PRC-AUC {prc_auc(test_label, ensemble_prediction)}')
            logger(f'ROUND {round_idx} Ensemble Test ACC {accuracy(test_label, ensemble_prediction)}')
            logger(f'ROUND {round_idx} Ensemble Test MCC {MCC(test_label, ensemble_prediction)}')
            logger(f'ROUND {round_idx} Ensemble Test EF1 {EF1(test_label, ensemble_prediction)}')
            logger(f'ROUND {round_idx} Ensemble Test BEDROC {bedroc_score(test_label, ensemble_prediction)}')
            logger(' ')

        # NOTE(review): hyper_args.metric is "BEDROC" but the value returned here is
        # based on ROC-AUC - confirm which one is intended.
        return -np.mean(valid_scores["roc-auc"])

    func(h_p)