# SAUCIE
*Sparse Autoencoders for Unsupervised Clustering, Imputation, and Embedding*

In [1]:
%matplotlib inline

import glob
import numpy as np
import os
import plotting
import tensorflow as tf
import matplotlib.pyplot as plt
import saucie
import saucie_utils as utils

from collections import OrderedDict
from saucie import Saucie
from tensorflow.python import debug as tf_debug

## Model Config

In [2]:
# Dataset identity and on-disk locations for this run.
dataset = 'zika'
data_path = '/data/krishnan/zika_data/gated/combined.npz'
config_path = 'saucie_models/zika/17-09-26-runs/0/model.config'

# Load the data first, then rebuild the model and its config from the saved file.
data = utils.load_dataset(dataset, data_path)
model, config = saucie.load_model_from_config(dataset, config_path)

# Show every (key, value) pair of the loaded config, one per line.
print('\n'.join(map(str, config.items())))

TypeError: __init__() missing 1 required positional argument: 'opt_method'

## Training Config

In [3]:
# TRAINING FLAGS
# These are notebook-level globals: most are passed to train() explicitly, but
# tb_graph / tb_summs are read directly as globals inside train().
batch_size = 100     # number of samples per training batch
num_epochs = 20      # number of epochs to train; converted to optimizer steps before calling train()
patience = 5         # early stopping: epochs allowed without test-loss improvement
log_every = 100      # training loss logging frequency, in steps
save_every = 200     # checkpointing frequency, in steps
tb_graph = True      # logs graph to TensorBoard if True
tb_summs = True      # logs loss summaries to TensorBoard if True
debug = False        # wrap the session in the tfdbg CLI debugger if True
verbose = False      # sets tf.logging verbosity to DEBUG if True, INFO otherwise
gpu_mem = 0.45       # fraction (0-1) of GPU memory to allocate, not a percentage

# PLOTTING FLAGS
thresh = .5          # threshold used to binarize activations of id regularized layers
save_plots = False    # saves plots to <model.save_path>/plots if True

## Training Methods
Edit the `train` and `make_plots` functions below to change what is saved, printed, trained, etc.

In [5]:
def train(model, sess, data, batch_size, num_steps, thresh=0.5, patience=None,
          log_freq=100, ckpt_freq=100, save_plots=True):
    """Run the training loop with periodic checkpoints, per-epoch testing and early stopping.

    TensorBoard logging is controlled by the notebook-level flags ``tb_graph``
    and ``tb_summs``, which this function reads as globals.

    Args:
        model: Saucie instance to train
        sess: tf.Session object to run all ops with
        data: utils.DataSet object to load batches and test data from
        batch_size: size of batches to train with
        num_steps: number of optimizer iteration steps
        thresh: threshold for binarization, forwarded to make_plots
        patience: number of epochs of training allowed without improvement
        log_freq: number of steps before printing training loss
        ckpt_freq: number of steps before checkpointing model
        save_plots: boolean determining whether or not to save plots

    Returns:
        The loss mapping from the most recent test evaluation, or None if
        training stopped before any epoch boundary triggered a test pass.
    """
    model.epochs_trained = data.epochs_trained = model.current_epoch_.eval(sess)
    graph = sess.graph
    loss_tensors = model.loss_tensors_dict(graph)
    train_ops = dict(losses=loss_tensors, opt=model.optimize)
    test_ops = dict(losses=loss_tensors)
    test_feed_dict = {model.x_: data.test_data, model.is_training_: False}
    # Initialised up front so the final `return` is safe even when the loop
    # never reaches an epoch boundary (e.g. num_steps <= current_step).
    test_losses = None
    best_test_losses = None
    epochs_since_improved = 0
    current_step = model.global_step_.eval(sess)
    sparse_config = model._model_config['sparse_config']
    id_lam = sparse_config.id_lam
    l1_lam = sparse_config.l1_lam
    # Indices of the layers with a non-zero id regularization coefficient.
    cluster_layers = id_lam.nonzero()[0].tolist()

    print('Saving all run data to: {}'.format(model.save_path))

    train_writer = test_writer = None
    if tb_graph or tb_summs:
        train_writer = tf.summary.FileWriter(model.save_path + '/logs/train', graph=graph)
        test_writer = tf.summary.FileWriter(model.save_path + '/logs/test', graph=graph)
        tf.logging.debug('Saving graph to TensorBoard in {}/logs'.format(model.save_path))

    if tb_summs:
        # Only scalar losses get a summary; list-valued entries are skipped.
        loss_summs = [tf.summary.scalar(name, loss)
                      for name, loss in loss_tensors.items()
                      if not isinstance(loss, list)]
        loss_summs = tf.summary.merge(loss_summs)
        train_ops['loss_summs'] = loss_summs
        test_ops['loss_summs'] = loss_summs
        tf.logging.debug('Saving loss summaries to TensorBoard in {}/logs'.format(model.save_path))

    if save_plots:
        plot_folder = model.save_path + '/plots'
        if not os.path.exists(plot_folder):
            os.makedirs(plot_folder)
        plot_ops = OrderedDict(emb=model.encoder)
        plot_ops['cluster_acts'] = tf.get_collection('id_normalized_activations')

    try:
        for step in range(current_step + 1, num_steps + 1):
            batch = data.next_batch(batch_size)
            if data.labeled:
                # Labels are not used by the training ops.
                batch, _ = batch
            feed_dict = {model.x_: batch, model.is_training_: True}
            train_dict = sess.run(train_ops, feed_dict=feed_dict)
            train_losses = train_dict['losses']
            if 'loss_summs' in train_dict:
                train_writer.add_summary(train_dict['loss_summs'], step)
            log_str = ('epoch/step: {}/{}, '.format(model.epochs_trained, step)
                       + utils.make_dict_str(train_losses))
            tf.logging.log_every_n(tf.logging.INFO, log_str, log_freq)

            if ckpt_freq and (step % ckpt_freq) == 0:
                tf.logging.info('Saving model, after step {}'.format(step))
                model.save_model(sess, 'model', step=step)
                if save_plots:
                    tf.logging.debug('Plotting middle layer embedding')
                    # Evaluate on the test set rather than the current training
                    # batch: make_plots pairs the cluster assignments with
                    # data.test_data for the heatmap, so rows must correspond.
                    plot_dict = sess.run(plot_ops, feed_dict=test_feed_dict)
                    make_plots(cluster_layers, id_lam, l1_lam, plot_folder, plot_dict, data,
                               'cluster_layer-{}.png', thresh=thresh)

            # data.epochs_trained changes when next_batch crosses an epoch
            # boundary; that is the trigger for a test pass.
            if model.epochs_trained != data.epochs_trained:
                model.epochs_trained = sess.run(tf.assign(model.current_epoch_, data.epochs_trained))
                test_dict = sess.run(test_ops, feed_dict=test_feed_dict)
                test_losses = test_dict['losses']
                if 'loss_summs' in test_dict:
                    test_writer.add_summary(test_dict['loss_summs'], step)
                log_str = ('TESTING -- epoch: {}, '.format(model.epochs_trained)
                           + utils.make_dict_str(test_losses))
                tf.logging.info(log_str)
                if best_test_losses is None or best_test_losses['loss'] > test_losses['loss']:
                    model.saver.save(sess, model.save_path + '/best.model')
                    tf.logging.info('Best model saved after {} epochs'.format(model.epochs_trained))
                    best_test_losses = test_losses
                    epochs_since_improved = 0
                    if save_plots:
                        tf.logging.debug('Plotting best middle layer embedding')
                        plot_dict = sess.run(plot_ops, feed_dict=test_feed_dict)
                        make_plots(cluster_layers, id_lam, l1_lam, plot_folder, plot_dict, data,
                                   'best-cluster_layer-{}.png', thresh=thresh)
                else:
                    epochs_since_improved += 1
                if patience and epochs_since_improved == patience:
                    tf.logging.info('Early stopping, test loss did not improve for {} epochs.'.format(epochs_since_improved))
                    tf.logging.info('Best test loss: epoch {}: '.format(model.epochs_trained - epochs_since_improved)
                                    + utils.make_dict_str(best_test_losses))
                    break
    finally:
        # Flush and release the TensorBoard event files even if training fails.
        for writer in (train_writer, test_writer):
            if writer is not None:
                writer.close()

    tf.logging.info('Trained for {} epochs'.format(model.epochs_trained))

    print('Saved all run data to: {}'.format(model.save_path))
    return test_losses


def make_plots(cluster_layers, id_lam, l1_lam, plot_folder, plot_dict, data,
               title_fmt='clust_layer-{}.png', thresh=0.5):
    """Save an embedding plot (and, when available, a heatmap) per clustered layer.

    Args:
        cluster_layers: list of layer indices, one per entry of
            plot_dict['cluster_acts'], in the same order
        id_lam: per-layer id regularization coefficients, indexed by layer
        l1_lam: per-layer l1 regularization coefficients, indexed by layer
        plot_folder: directory the image files are written to
        plot_dict: evaluated plot ops with keys 'emb' and 'cluster_acts'
        data: dataset object; data.test_data is used for the heatmap when the
            instance has a '_colnames' attribute
        title_fmt: file-name template formatted with the layer index
        thresh: threshold passed to utils.binarize for the activations
    """
    emb = plot_dict['emb']
    for i, acts in enumerate(plot_dict['cluster_acts']):
        hl_idx = cluster_layers[i]
        save_file = plot_folder + '/emb-' + title_fmt.format(hl_idx)
        title = 'Embedding, clustered layer-{}, id_lam/l1_lam={:5.4E}/{:5.4E}'.format(hl_idx, id_lam[hl_idx], l1_lam[hl_idx])
        clusts = utils.binarize(acts, thresh)
        # Row-wise maximum activation, computed once for the three debug lines.
        max_acts = acts.max(axis=1)
        tf.logging.debug('Top 5 activated neurons: {}'.format(max_acts[:5]))
        tf.logging.debug('Mean max activation: {}'.format(max_acts.mean()))
        tf.logging.debug('Bottom 5 max activated neurons: {}'.format(max_acts[-5:]))
        plotting.plot_embedding2D(emb, clusts, save_file, title)
        if '_colnames' in data.__dict__:
            save_file = plot_folder + '/heatmap-' + title_fmt.format(hl_idx)
            plotting.plot_cluster_heatmap(data.test_data, clusts, data._colnames, data._markers, save_file)
    if not cluster_layers:
        # No clustered layers: plot the embedding with every point in one group.
        plotting.plot_embedding2D(emb, np.zeros(len(emb)), plot_folder + '/emb.png', 'Embedding, no clusters')

## Run Training

In [6]:
# Close the session left over from a previous run, if there is one. On a fresh
# kernel `sess` does not exist yet, so an unconditional close would raise
# NameError and break Restart & Run All.
if 'sess' in globals():
    sess.close()
tf.reset_default_graph()

In [7]:
# DEBUG verbosity also surfaces the tf.logging.debug messages emitted by train().
tf.logging.set_verbosity(tf.logging.DEBUG if verbose else tf.logging.INFO)

model = Saucie(**config)
plot_dir = model.save_path + '/plots'
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_mem)
sess = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))
if debug:
    # Wrap only after the new session exists. Wrapping before creation acted on
    # the stale session and was discarded by the assignment above.
    sess = tf_debug.LocalCLIDebugWrapperSession(sess)
    sess.add_tensor_filter('has_inf_or_nan', tf_debug.has_inf_or_nan)
model.build(sess)

# train() counts optimizer steps, so convert the epoch budget into steps.
steps_per_epoch = data.num_samples // batch_size
num_steps = steps_per_epoch * num_epochs
train(model, sess, data, batch_size, num_steps, thresh, patience, log_every, save_every, save_plots)

Saving all run data to: ./saucie_models/mnist/17-09-24-runs/1
INFO:tensorflow:epoch/step: 0/1, loss: 1.47476, recons_loss: 0.795778, id_loss: 0.678984
INFO:tensorflow:epoch/step: 0/101, loss: 0.282465, recons_loss: 0.233635, id_loss: 0.0488292
INFO:tensorflow:Saving model, after step 200
INFO:tensorflow:epoch/step: 0/201, loss: 0.226997, recons_loss: 0.202185, id_loss: 0.0248121
INFO:tensorflow:epoch/step: 0/301, loss: 0.215149, recons_loss: 0.198394, id_loss: 0.0167547
INFO:tensorflow:Saving model, after step 400
INFO:tensorflow:epoch/step: 0/401, loss: 0.207423, recons_loss: 0.19098, id_loss: 0.016443
INFO:tensorflow:epoch/step: 0/501, loss: 0.211424, recons_loss: 0.196827, id_loss: 0.0145967
INFO:tensorflow:TESTING -- epoch: 1, loss: 4.7308, recons_loss: 0.287358, id_loss: 4.44344
INFO:tensorflow:Best model saved after 1 epochs
INFO:tensorflow:Saving model, after step 600
INFO:tensorflow:epoch/step: 1/601, loss: 0.210243, recons_loss: 0.197404, id_loss: 0.0128394
INFO:tensorflow:epo

INFO:tensorflow:epoch/step: 10/5801, loss: 0.192489, recons_loss: 0.190618, id_loss: 0.00187045
INFO:tensorflow:epoch/step: 10/5901, loss: 0.186515, recons_loss: 0.184092, id_loss: 0.00242302
INFO:tensorflow:Saving model, after step 6000
INFO:tensorflow:epoch/step: 10/6001, loss: 0.176138, recons_loss: 0.174788, id_loss: 0.0013493
INFO:tensorflow:TESTING -- epoch: 11, loss: 1.2583, recons_loss: 0.381696, id_loss: 0.876602
INFO:tensorflow:Best model saved after 11 epochs
INFO:tensorflow:epoch/step: 11/6101, loss: 0.17814, recons_loss: 0.174749, id_loss: 0.00339095
INFO:tensorflow:Saving model, after step 6200
INFO:tensorflow:epoch/step: 11/6201, loss: 0.185492, recons_loss: 0.178528, id_loss: 0.00696445
INFO:tensorflow:epoch/step: 11/6301, loss: 0.172362, recons_loss: 0.170685, id_loss: 0.00167715
INFO:tensorflow:Saving model, after step 6400
INFO:tensorflow:epoch/step: 11/6401, loss: 0.196343, recons_loss: 0.195229, id_loss: 0.00111381
INFO:tensorflow:epoch/step: 11/6501, loss: 0.16975

OrderedDict([('loss', 1.1918824),
             ('recons_loss', 0.47788247),
             ('id_loss', 0.71399993)])

In [8]:
plot_folder = model.save_path + '/plots'

if not os.path.exists(plot_folder):
    os.makedirs(plot_folder)
save_file = plot_folder + '/embedding.png'

plot_ops = OrderedDict(emb=model.encoder)
plot_ops['cluster_acts'] = tf.get_collection('id_normalized_activations')

# Plot a random subsample of at most 5000 rows. Sampling without replacement
# raises ValueError if more rows are requested than exist, so clamp the size.
num_rows = len(data.data)
subs = np.random.choice(np.arange(num_rows), min(5000, num_rows), replace=False)
test_feed_dict = {model.x_: data.data[subs,:],
                  model.is_training_: False}

plot_dict = sess.run(plot_ops, feed_dict=test_feed_dict)

if plot_dict['cluster_acts']:
    # Use the first id regularized layer for the cluster assignments.
    acts = plot_dict['cluster_acts'][0]
    clusts = utils.binarize(acts, thresh)
else:
    # No id regularized layers: put every point in a single group, the same
    # fallback make_plots uses.
    clusts = np.zeros(len(plot_dict['emb']))
plotting.plot_embedding2D(plot_dict['emb'], clusts, save_file, 'Bottleneck layer embedding')

Unique binary clusters: 14


  (prop.get_family(), self.defaultFamily[fontext]))
