In [1]:
import os
# Must be set before TensorFlow is imported (it is pulled in by the chemvae
# imports below); '2' hides TensorFlow's INFO and WARNING log messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Enter the project directory only if we are not already inside it, so that
# re-running this cell in a live kernel does not raise FileNotFoundError.
if os.path.basename(os.getcwd()) != 'Molecule_VAE':
    os.chdir('Molecule_VAE')
import json
import pandas as pd
from lime import lime_tabular

In [3]:
from chemvae.vae_utils import VAEUtils
from chemvae import hyperparameters
import chemvae.mol_utils as mu

In [4]:
class Wrapper(object):
    """Adapter that exposes a VAE property predictor through ``predict``.

    The wrapped model must provide ``encode`` and ``predict_prop_Z``; inputs
    are handed to ``encode`` unchanged.
    """

    def __init__(self, model, index=None):
        self.index = index
        self._model = model

    def _properties(self, dataset):
        """Encode ``dataset`` and return the property predictions for it."""
        latent = self._model.encode(dataset)
        return self._model.predict_prop_Z(latent)

    def predict(self, dataset):
        """
        Predict properties for inputs in one-hot format (not SMILES strings).

        When ``index`` was given, only that property column is returned, since
        only a single property can be explained at a time; otherwise every
        column is returned.
        """
        predictions = self._properties(dataset)
        if self.index is None:
            return predictions[:]
        return predictions[:, self.index]

    def predict_all(self, dataset):
        """Return the predictions for all properties."""
        return self._properties(dataset)

In [5]:
# Load the VAE experiment configuration.
# Trained models for other properties (swap the directory to explain them):
#   'models/server/train_3'  # MP
#   'models/server/train_2'  # BP
#   'models/server/train_6'  # viscosity
#   'models/server/train_7'  # dielectric_constants
args = {'exp_file': 'exp.json', 'directory': 'models/server/train_5'}  # PSA

# The experiment file lives inside the model directory when one is given.
if args['directory'] is not None:
    args['exp_file'] = os.path.join(args['directory'], args['exp_file'])
params = hyperparameters.load_params(args['exp_file'], verbose=False)
classes = params['reg_prop_tasks']

# Character vocabulary; it doubles as the feature names given to LIME.
with open(params['char_file'], 'r', encoding='UTF-8') as f:
    CHARS = json.load(f)
feature_names = CHARS
NCHARS = len(CHARS)
CHAR_INDICES = {char: position for position, char in enumerate(CHARS)}


def _load_split(data_path):
    """Load one data file and return (smiles, targets, one-hot encoded smiles)."""
    smiles, targets = mu.load_smiles_and_data_df(
        data_path,
        params['MAX_LEN'],
        reg_tasks=params['reg_prop_tasks'],
        normalize_out=params["data_normalization_out"],
    )
    one_hot = mu.smiles_to_hot(smiles, params['MAX_LEN'], params['PADDING'], CHAR_INDICES, NCHARS)
    return smiles, targets, one_hot


# Load the data: training split first, then the validation split.
smiles_train, Y_train, X_train = _load_split(params['data_file'])
smiles_test, Y_test, X_test = _load_split(params['val_data_file'])


# Load the trained model.
vae = VAEUtils(directory=args['directory'])



Using standarized functions? True
Standarization: estimating mu and std values ...done!


In [6]:
# Only one property can be explained at a time; `index` selects its column.
index = 0
Y_train_one, Y_test_one = Y_train[:, index], Y_test[:, index]
# Wrap the model so that `model.predict` returns just the selected property.
model = Wrapper(model=vae, index=index)

In [7]:
# LIME explainer fitted on the one-hot training data: regression mode, with
# discretization and feature selection both switched off.
explainer_options = dict(
    feature_names=feature_names,
    discretize_continuous=False,
    mode='regression',
    feature_selection='none',
)
explainer = lime_tabular.RecurrentTabularExplainer(X_train, **explainer_options)

In [8]:
# Molecule sets used in other runs (uncomment exactly one to reproduce it).
# smiles_list = ['COCCC#N', 'CCCCC#N', 'CC(C)CC#N', 'CCCCCC#N', 'CC(C)CCC#N', 'CCCCCCC#N', 'CCC#N', 'CCCC#N', 'CC(C)C#N']  # all nitriles

# smiles_list = ['COCCC#N', 'OCCC#N', 'CCCC#N', 'COCC#N', 'COCC', 'COCCCN', 'COCCC#C', 'CCC#N', 'CC#N', 'COCCF', 'COCC(F)F', 'COCC(F)(F)F']  # MPN
smiles_list = ['COCCF', 'COCC(F)F', 'COCC(F)(F)F']  # MPN

# smiles_list = ['CCCCC#N', 'CCCCC#C', 'CCCCCN', 'CCCC', 'CCC']  # CCCC#N, CCC#N, CC#N already computed, no need to redo  # valeronitrile

# smiles_list = ['CC(C)CC#N', 'CC(C)C#N', 'CC(C)CC#C', 'CC(C)CCN', 'CC(C)C']  # CCCC#N, CCC#N, CC#N, 'CCC' already computed, no need to redo  # isovaleronitrile

# smiles_list = ['CCCCCC#N', 'CCCCCC#C', 'CCCCCCN', 'CCCCC']  # CCCCC#N, CCCC#N, CCC#N, CC#N, CCCC already computed, no need to redo  # hexanenitrile

# smiles_list = ['CC(C)CCC#N', 'CCCCC#N', 'CCCC#N', 'CCC#N', 'CC#N', 'CC(C)CCC#C', 'CC(C)CCCN', 'CC(C)CC', 'CC(C)C']  # still missing CC(C)CC#N and CC(C)C#N  # isocapronitrile

# smiles_list = ['CCCCCCC#N', 'CCCCCCC#C', 'CCCCCCCN', 'CCCCCC']  # CCCCCC#N, CCCCC#N, CCCC#N, CCC#N, CC#N, CCCCC already computed, no need to redo  # heptanenitrile

# Symbols that carry chemical meaning; 'n' and 's' have no corresponding
# molecule. Defined once here (it is loop-invariant and not read in this cell).
element = ["o", "s", "S", "B", "#", "I", "l", "O", "H", "c", "=", "n", "P", "C", "F", "r", "N", "+"]

# NOTE: this replaces the validation-set X_test loaded earlier with the one-hot
# encoding of the molecules to explain.
X_test = mu.smiles_to_hot(smiles_list, params['MAX_LEN'], params['PADDING'], CHAR_INDICES, NCHARS)

# RecurrentTabularExplainer labels each feature '<name>_t-<k>', where k counts
# timesteps backwards from the last one, so sequence position = last_step - k.
# Derived from the data the explainer was built on instead of hard-coding 119.
last_step = X_train.shape[1] - 1

# Portable output location (the original literal only worked on Windows);
# created on demand so the first run does not fail on a missing directory.
output_dir = os.path.join('utils', 'Explaination', 'Functional_analysis')
os.makedirs(output_dir, exist_ok=True)

print('{:20s} : {}'.format('Properties', classes))
for index_instance, smiles in enumerate(smiles_list):

    # --- Predicted value ---
    canon_smiles_2 = mu.canon_smiles(smiles)  # canonical SMILES representation
    print('{:20s} : {}'.format('Input', canon_smiles_2))

    X_2 = vae.smiles_to_hot(canon_smiles_2, canonize_smiles=False)
    z_2 = vae.encode(X_2)
    y_2 = vae.predict_prop_Z(z_2)[0]
    print('{:20s} : {}'.format("Predict:", y_2))

    # --- Per-character importance ---
    # The explanation is computed on the SMILES exactly as written in
    # smiles_list (X_test), so positions are matched against that string.
    smiles_len = len(smiles)
    exp = explainer.explain_instance(X_test[index_instance], model.predict)

    exp_smiles = []
    exp_list = exp.as_list()
    for item in exp_list:
        # Split on the last '_t-' so that a feature name containing '-' or '_'
        # is still parsed correctly; anything after a space is ignored.
        elem, step = item[0].rsplit('_t-', 1)
        loc = last_step - int(step.split(' ')[0])

        # Keep only the weight of the character actually present at this position.
        if 0 <= loc < smiles_len and smiles[loc] == elem:
            exp_smiles.append((loc, elem, item))
    exp_smiles.sort(key=lambda entry: entry[0])

    # --- Save as CSV: position, character, LIME weight ---
    out_list = [[str(loc), elem, str(item[1])] for loc, elem, item in exp_smiles]
    # Files are grouped under the first molecule of the set and the first
    # property name, and told apart by the instance index.
    csv_name = '{}-{}-{}.csv'.format(smiles_list[0], classes[0], index_instance)
    pd.DataFrame(out_list).to_csv(os.path.join(output_dir, csv_name))

Properties           : ['PSA']
Input                : COCCF
Predict:             : [8.30926853]
Input                : COCC(F)F
Predict:             : [9.18047326]
Input                : COCC(F)(F)F
Predict:             : [7.9944353]
