# This script is an example of training ML models and predicting the uncertainty of given reactions

Before running the script below, you should first install chemprop v1.7.1.
This script illustrates how to train models and make predictions. More details can be found in the chemprop documentation.

In [None]:
import chemprop
import os
import numpy as np
import pandas as pd

In [None]:
# Model training
# Executing this cell trains a model on the reaction database with
# 5-fold cross-validation.
_data_options = [
    '--data_path', 'Ea-B97.csv',  # database used for training
    '--save_dir', 'test_dropout',  # saving directory
    '--dataset_type', 'regression',
]
_reaction_options = [
    '--empty_cache',
    '--reaction',
    '--explicit_h',
    '--gpu', '0',  # delete this entry when no GPU is available
]
_column_options = [
    '--smiles_columns', 'rxn',  # column of Ea-B97.csv holding the reaction SMILES
    '--target_columns', 'dE0',  # column of Ea-B97.csv holding the target value
]
_split_options = [
    '--num_folds', '5',
    '--split_type', 'cv',  # cross-validation split
]
_model_options = [
    '--epochs', '50',
    '--dropout', '0.1',
    '--ensemble_size', '1',  # use 5 when training an ensemble model
    '--loss_function', 'mve',  # use 'evidential' for evidential learning
]

# Concatenate the groups in the order they are listed above.
arguments = (
    _data_options
    + _reaction_options
    + _column_options
    + _split_options
    + _model_options
)
args = chemprop.args.TrainArgs().parse_args(arguments)
mean_score, std_score = chemprop.train.cross_validate(
    args=args,
    train_func=chemprop.train.run_training,
)

The following script is used to make predictions with trained models.
Here we provide the trained .pt files of the ensemble models.

In [None]:
def get_ens_unc(ensemble_dir: str,
                save_path: str,
                data_path: str,
                num_models: int = 5,
                num_folds: int = 1):
    """
    Predict with every model of an ensemble and save the ensemble uncertainty.

    For each fold, the checkpoints in ``<ensemble_dir>/fold_<i>/model_<j>`` are
    evaluated on ``data_path`` and the results are written to
    ``<save_path>/ensemble_fold_<i>_pred.csv`` with three columns:

    * ``pred``: mean of the predicted values over the models
    * ``ensemble_epistemic``: variance of the predicted values over the models
    * ``ensemble_aleatoric``: mean of the per-model 'mve' uncertainty output

    :param ensemble_dir: the directory of ensemble models
    :param save_path: the save path of predicted results (must already exist)
    :param data_path: the path of predicting data
    :param num_models: the number of ensemble size (at least 1)
    :param num_folds: the number of folds
    :raises ValueError: if ``num_models`` is smaller than 1
    """
    if num_models < 1:
        raise ValueError(f'num_models must be at least 1, got {num_models}')

    for fold_i in range(num_folds):
        member_preds = []
        for model_i in range(num_models):
            cp_dir = os.path.join(ensemble_dir, f'fold_{fold_i}', f'model_{model_i}')
            arguments = [
                '--test_path', data_path,
                # chemprop's own prediction file is not needed, so discard it
                '--preds_path', os.devnull,
                '--checkpoint_dir', cp_dir,
                '--uncertainty_method', 'mve',
            ]

            args = chemprop.args.PredictArgs().parse_args(arguments)
            preds = chemprop.train.make_predictions(args=args, return_uncertainty=True)
            # Always convert to an array so that indexing below also works for a
            # single model; expected shape is (2, n_samples, 1) -- index 0 holds
            # the predicted values, index 1 the uncertainty.
            member_preds.append(np.array(preds))

        # Stack the models along the last axis: (2, n_samples, num_models)
        pred_fold = np.concatenate(member_preds, axis=-1)

        value = pred_fold[0, :, :]
        unc = pred_fold[1, :, :]

        pred_value = np.mean(value, axis=-1, keepdims=True)
        epi = np.var(value, axis=-1, keepdims=True)
        ale = np.mean(unc, axis=-1, keepdims=True)
        data = np.concatenate([pred_value, epi, ale], axis=-1)
        names = ['pred', 'ensemble_epistemic', 'ensemble_aleatoric']
        ens_df = pd.DataFrame(data=data, columns=names)
        ens_df.to_csv(os.path.join(save_path, f'ensemble_fold_{fold_i}_pred.csv'), index=False)


def get_dropout_unc(dropout_dir: str,
                    save_path: str,
                    data_path: str,
                    num_samples: int = 5,
                    num_folds: int = 1,
                    dropout_p: float = 0.1):
    """
    Predict with dropout sampling and save the dropout uncertainty.

    For each fold, the checkpoint in ``<dropout_dir>/fold_<i>/model_0`` is
    evaluated on ``data_path`` with dropout sampling and the results are written
    to ``<save_path>/dropout_fold_<i>_pred.csv`` with three columns:

    * ``pred``: mean of the predicted values over the dropout samples
    * ``dropout_epistemic``: variance of the predicted values over the samples
    * ``dropout_aleatoric``: mean of the per-sample 'mve' uncertainty output

    :param dropout_dir: the directory of sampling models
    :param save_path: the save path of predicted results (must already exist)
    :param data_path: the path of predicting data
    :param num_samples: the number of sampling size
    :param num_folds: the number of folds
    :param dropout_p: the dropout probability used while sampling
    :raises ValueError: if no raw dropout predictions are returned
    """
    for fold_i in range(num_folds):
        cp_dir = os.path.join(dropout_dir, f'fold_{fold_i}', 'model_0')
        arguments = [
            '--test_path', data_path,
            '--checkpoint_dir', cp_dir,
            # Discard chemprop's own prediction file. The previous relative
            # 'dev/null' wrote a real file below the working directory.
            '--preds_path', os.devnull,
            '--uncertainty_method', 'mve',
            '--uncertainty_dropout_p', f'{dropout_p}',
            '--dropout_sampling_size', f'{num_samples}',
        ]

        args = chemprop.args.PredictArgs().parse_args(arguments)
        preds = chemprop.make_dropout_predictions(args=args, return_raw_pred=True)

        # preds[0] is iterated as one entry per dropout sample; each entry is
        # presumably shaped (2, n_samples, 1) like the ensemble case -- TODO confirm.
        # Converting every entry keeps the indexing below valid for one sample too.
        sample_preds = [np.array(pred_model) for pred_model in preds[0]]
        if not sample_preds:
            raise ValueError('make_dropout_predictions returned no raw predictions')
        pred_fold = np.concatenate(sample_preds, axis=-1)

        value = pred_fold[0, :, :]
        unc = pred_fold[1, :, :]

        pred_value = np.mean(value, axis=-1, keepdims=True)
        epi = np.var(value, axis=-1, keepdims=True)
        ale = np.mean(unc, axis=-1, keepdims=True)
        data = np.concatenate([pred_value, epi, ale], axis=-1)
        names = ['pred', 'dropout_epistemic', 'dropout_aleatoric']
        dropout_df = pd.DataFrame(data=data, columns=names)
        dropout_df.to_csv(os.path.join(save_path, f'dropout_fold_{fold_i}_pred.csv'), index=False)


def get_evidential_unc(evidential_dir: str,
                       save_path: str,
                       data_path: str,
                       num_folds: int = 1):
    """
    Predict with evidential models and save the evidential uncertainty.

    For each fold, the checkpoint in ``<evidential_dir>/fold_<i>/model_0`` is
    evaluated twice on ``data_path`` (once per uncertainty method) and the
    results are written to ``<save_path>/evidential_fold_<i>_pred.csv`` with the
    columns ``pred``, ``evidential_epistemic`` and ``evidential_aleatoric``.

    :param evidential_dir: the directory of evidential models
    :param save_path: the save path of predicted results (must already exist)
    :param data_path: the path of predicting data
    :param num_folds: the number of folds
    """
    for fold_i in range(num_folds):
        cp_dir = os.path.join(evidential_dir, f'fold_{fold_i}', 'model_0')
        arguments = [
            '--test_path', data_path,
            # chemprop's own prediction file is not needed, so discard it
            '--preds_path', os.devnull,
            '--checkpoint_dir', cp_dir,
            '--uncertainty_method', 'evidential_epistemic',
        ]

        args = chemprop.args.PredictArgs().parse_args(arguments)
        # First pass: predicted values and epistemic uncertainty
        preds_epi = chemprop.train.make_predictions(args=args, return_uncertainty=True)
        # Second pass with the same arguments, only the uncertainty method is swapped
        args.uncertainty_method = 'evidential_aleatoric'
        preds_ale = chemprop.train.make_predictions(args=args, return_uncertainty=True)

        # Keep only the uncertainty part (index 1) of the second pass
        ale = np.array(preds_ale)[1:2, :, :]
        # Expected shape (3, n_samples, 1): values, epistemic, aleatoric
        stacked = np.concatenate([np.array(preds_epi), ale], axis=0)
        # Drop only the trailing axis; an unrestricted squeeze would also remove
        # the sample axis when the data set contains a single row.
        pred_fold = np.transpose(np.squeeze(stacked, axis=-1))
        df_names = ['pred', 'evidential_epistemic', 'evidential_aleatoric']
        evi_df = pd.DataFrame(data=pred_fold, columns=df_names)
        evi_df.to_csv(os.path.join(save_path, f'evidential_fold_{fold_i}_pred.csv'), index=False)

## Running the following cells produces the uncertainty predictions
Note: we provide trained models (in the `trained_models` directory) for testing. Users are encouraged to train their own models using the scripts provided above.

In [None]:
# Running this cell writes the ensemble prediction results (.csv file) to the ensemble_test directory
save_path = 'ensemble_test'
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create race of os.path.exists + os.mkdir
os.makedirs(save_path, exist_ok=True)
cp_path = 'trained_models/ensemble'
get_ens_unc(ensemble_dir=cp_path, save_path=save_path, data_path='test_smiles.csv')

In [None]:
# Running this cell writes the evidential prediction results (.csv file) to the evidential_test directory
save_path = 'evidential_test'
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create race of os.path.exists + os.mkdir
os.makedirs(save_path, exist_ok=True)
cp_path = 'trained_models/evidential'
get_evidential_unc(evidential_dir=cp_path, save_path=save_path, data_path='test_smiles.csv')

In [None]:
# Running this cell writes the dropout prediction results (.csv file) to the dropout_test directory
save_path = 'dropout_test'
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create race of os.path.exists + os.mkdir
os.makedirs(save_path, exist_ok=True)
cp_path = 'trained_models/dropout'
get_dropout_unc(dropout_dir=cp_path, save_path=save_path, data_path='test_smiles.csv')