create sampling tool to open model and weights files and test the autoencoder
t1m0thy committed Jan 6, 2017
1 parent 6f70f8c commit 619aa7d
Showing 2 changed files with 131 additions and 1 deletion.
27 changes: 26 additions & 1 deletion README.md
@@ -5,4 +5,29 @@ A project to enable optimization of molecules by transforming them to and from a
This is the code used for this paper: https://arxiv.org/abs/1610.02415


-This code requires a fork of Keras that forked from the dev version around approximately version 0.3.2 and Theano 0.8.2. We want to point you to the work of Max Hodak who re-implemented this tool based on the paper. For beginning your own project, you may have greater success starting there. https://github.com/maxhodak/keras-molecules
+This code requires a fork of Keras, branched from the development version at approximately 0.3.2, and Theano > 0.8.2 (recently, to test on OS X 10.12.2, we have been running Theano 0.9.0.dev4). We want to point you to the work of Max Hodak, who re-implemented this tool based on the paper; for beginning your own project, you may have greater success starting there: https://github.com/maxhodak/keras-molecules


# To test the weights generated in the paper (limited to 5000 test SMILES)
python sample_autoencoder.py \
../data/best_vae_model.json \
../data/best_vae_annealed_weights.h5 \
../data/250k_rndm_zinc_drugs_clean.smi \
../data/zinc_char_list.json \
-l5000


This should produce output close to the following (values will vary with the random selection of 5000 samples from the test file):

Using Theano backend.
('Training set size is', 5000)
Training set size is 5000, after filtering to max length of 120
('total chars:', 35)
Loss: 0.834809958935, Accuracy: 0.948206666667
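
The accuracy reported above is per-character categorical accuracy over the one-hot encoded SMILES. A rough sketch of the equivalent computation, assuming `preds` holds the model's softmax output (illustrative, not code from this commit):

    import numpy as np

    preds = model.predict(test_set)  # shape (n_samples, max_len, n_chars)
    # fraction of character positions whose predicted argmax matches the input
    accuracy = np.mean(np.argmax(preds, axis=-1) == np.argmax(test_set, axis=-1))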


# To train a new model (for a quick test, limit to 5000 training SMILES)
python train_autoencoder.py \
../data/250k_rndm_zinc_drugs_clean.smi \
../data/zinc_char_list.json \
-l5000
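
Both scripts read the same two data files, whose formats this README does not spell out. A sketch of what sample_autoencoder.py (added below) appears to expect, with illustrative contents:

    import json

    # zinc_char_list.json: assumed to be a flat JSON array of SMILES characters;
    # each character's index becomes its one-hot column in load_test_data
    with open('../data/zinc_char_list.json') as f:
        char_list = json.load(f)  # e.g. ["C", "c", "N", "O", "(", ")", "1", "=", ...]

    # the .smi file is plain text, one SMILES string per line
    with open('../data/250k_rndm_zinc_drugs_clean.smi') as f:
        smiles = [line.strip() for line in f]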
105 changes: 105 additions & 0 deletions autoencoder/sample_autoencoder.py
@@ -0,0 +1,105 @@
import argparse
import json
import logging
import os
from random import shuffle

import h5py
from keras.models import model_from_json
import numpy as np
from train_autoencoder import smile_convert

def adapt_model_dict(
model_dict,
regularizer_scale=1,
rnd_seed=None,
temperature=1,
output_sample=False
):
"""
Add in some custom options to the model json output from keras
"""
updated = model_dict.copy()
if "variationaldense" in updated:
if "regularizer_scale" not in updated:
logging.info('Adding a regularizer_scale = {} to the VAE layer'.format(regularizer_scale))
updated["regularizer_scale"] = regularizer_scale
if "output_sample" not in updated:
logging.info('Adding output_sample = {} to the VAE layer'.format(output_sample))
updated["output_sample"] = output_sample

if "terminalgru" in updated:
if "rnd_seed" not in updated:
logging.info('Adding a rnd_seed parameter of {}'.format(rnd_seed))
updated["rnd_seed"] = rnd_seed
if "temperature" not in updated:
logging.info('Adding a temperature parameter of {}'.format(temperature))
updated["temperature"] = temperature
return updated
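
# The keys patched in above appear to belong to custom layers from the required
# Keras fork ("variationaldense": the VAE sampling layer; "terminalgru": the
# decoder GRU). Models serialized before these options existed get defaults
# filled in here so that model_from_json can rebuild the layers.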

def set_weights_from_file(weights_file, model):
with h5py.File(weights_file, mode='r') as fp:
for k in range(fp.attrs['nb_layers']):
g = fp['layer_{}'.format(k)]
weights = [g['param_{}'.format(p)] for p in range(g.attrs['nb_params'])]
w_shape = [i.shape for i in weights]
logging.debug('Weights for this layer have shapes {}'.format(w_shape))
try:
model.layers[k].set_weights(weights)
except AssertionError:
                logging.exception('Failed to load weights on layer {}; '
                                  'leaving them at their random initialization'.format(k))
continue
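
# set_weights_from_file assumes the legacy Keras HDF5 weight layout:
#   f.attrs['nb_layers']                -> number of layer groups
#   f['layer_{k}'].attrs['nb_params']   -> number of weight arrays in layer k
#   f['layer_{k}']['param_{p}']         -> the p-th weight array of layer k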

def load_test_data(test_path, n_chars, max_len, char_list, limit=None):
with open(test_path, 'r') as f:
smiles = f.readlines()
smiles = [s.strip() for s in smiles]
if limit is not None:
smiles = smiles[:limit]
print('Training set size is', len(smiles))
    smiles = [s for s in (smile_convert(i) for i in smiles) if s]  # drop SMILES rejected by smile_convert
print('Training set size is {}, after filtering to max length of {}'.format(len(smiles), max_len))
shuffle(smiles)

print(('total chars:', n_chars))

cleaned_data = np.zeros((len(smiles), max_len, n_chars), dtype=np.float32)

char_lookup = dict((c, i) for i, c in enumerate(char_list))

for i, smile in enumerate(smiles):
for t, char in enumerate(smile):
cleaned_data[i, t, char_lookup[char]] = 1

return cleaned_data
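
# Example of the one-hot layout built above: with char_list = ['C', 'O', '='],
# the SMILES "C=O" becomes a (max_len, 3) array whose first three rows are
#   [1, 0, 0]   # 'C'
#   [0, 0, 1]   # '='
#   [0, 1, 0]   # 'O'
# and whose remaining rows stay all zeros (implicit padding).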

if __name__ == "__main__":

parser = argparse.ArgumentParser(description='Sample a trained autoencoder.')
parser.add_argument('model_file', type=str,
help='a file path of a model json file')
parser.add_argument('weights_file', type=str,
help='a file path of a weights file')
parser.add_argument('test_file', type=str,
help='a file path of a smiles list file to sample from')
parser.add_argument('char_file', type=str,
help='a file path of a char index json')
parser.add_argument('--limit', '-l', type=int, default=5000,
help='limit test data to this count')

args = parser.parse_args()

    with open(args.model_file, 'r') as f:
        model_dict = json.load(f)
model_dict = adapt_model_dict(model_dict)

model = model_from_json(json.dumps(model_dict))
set_weights_from_file(args.weights_file, model)

max_len = model_dict["layers"][0]["batch_input_shape"][1]
n_chars = model_dict["layers"][0]["batch_input_shape"][2]

    with open(args.char_file) as f:
        char_list = json.load(f)
test_set = load_test_data(args.test_file, n_chars, max_len, char_list, limit=args.limit)
loss, accuracy = model.test_on_batch(test_set, test_set, sample_weight=None, accuracy=True)
print("Loss: {}, Accuracy: {}".format(loss, accuracy))
