From 3a414b1013ef6e82a0314ad89737c538e8902fa2 Mon Sep 17 00:00:00 2001 From: Kipok Date: Tue, 29 May 2018 13:34:53 -0700 Subject: [PATCH 001/102] Add collect_if_horovod function --- open_seq2seq/utils/funcs.py | 35 ++++++++++++++--------------------- open_seq2seq/utils/utils.py | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 21 deletions(-) diff --git a/open_seq2seq/utils/funcs.py b/open_seq2seq/utils/funcs.py index ba8a5d069..b5cb1929f 100644 --- a/open_seq2seq/utils/funcs.py +++ b/open_seq2seq/utils/funcs.py @@ -9,7 +9,8 @@ from .hooks import PrintSamplesHook, RunEvaluationHook, PrintLossAndTimeHook, \ BroadcastGlobalVariablesHook -from open_seq2seq.utils.utils import deco_print, get_results_for_epoch +from open_seq2seq.utils.utils import deco_print, get_results_for_epoch, \ + collect_if_horovod from tensorflow.python import debug as tf_debug @@ -127,27 +128,19 @@ def train(train_model, eval_model=None, debug_port=None): total_objects += np.sum(fetches_vals[i + 1]) step += 1 - if hvd is not None: - deco_print("Finished training on rank {}".format(hvd.rank())) - else: - deco_print("Finished training") + if len(fetches) > 1: + total_objects = collect_if_horovod(total_objects, hvd, mode="sum") - if train_model.on_horovod: - ending = " on worker {}".format(hvd.rank()) - else: - ending = "" - if step > bench_start: - deco_print( - "Avg time per step{}: {:.3f}s".format( - ending, 1.0 * total_time / (step - bench_start)) - ) - if len(fetches) > 1: - deco_print( - "Avg objects per second{}: {:.3f}".format( - ending, 1.0 * total_objects / total_time) - ) - else: - deco_print("Not enough steps for benchmarking{}".format(ending)) + if master_worker: + deco_print("Finished training") + if step > bench_start: + avg_time = 1.0 * total_time / (step - bench_start) + deco_print("Avg time per step: {:.3f}s".format(avg_time)) + if len(fetches) > 1: + avg_objects = 1.0 * total_objects / total_time + deco_print("Avg objects per second: {:.3f}".format(avg_objects)) + else: + deco_print("Not enough steps for benchmarking") def restore_and_get_results(model, checkpoint, mode): diff --git a/open_seq2seq/utils/utils.py b/open_seq2seq/utils/utils.py index e02eae74a..92f2abd75 100644 --- a/open_seq2seq/utils/utils.py +++ b/open_seq2seq/utils/utils.py @@ -31,6 +31,42 @@ def clip_sparse(value, size): dense_shape_clipped) +def collect_if_horovod(value, hvd, mode='sum'): + """Collects values from all workers if run on Horovod. + Note, that on all workers except first this function will return None. + + Args: + value: value to collect. + hvd: horovod.tensorflow module or None + mode: could be "sum", "mean" or "gather", indicating reduce_sum or gather. + For "sum" and "mean" value has to be numerical, for "gather", value has + to be iterable. + + Returns: + collected results if run on Horovod or value otherwise. 
+ """ + if hvd is None: + return value + + import mpi4py.rc + mpi4py.rc.initialize = False + from mpi4py import MPI + + values = MPI.COMM_WORLD.gather(value) + # synchronize all workers + MPI.COMM_WORLD.Barrier() + + if MPI.COMM_WORLD.Get_rank() != 0: + return None + + if mode == 'sum': + return np.sum(values) + elif mode == 'mean': + return np.mean(values) + elif mode == 'gather': + return [item for sl in values for item in sl] + + def clip_last_batch(last_batch, true_size): last_batch_clipped = [] for val in last_batch: From f8fc8c904eada7209bbc065b3321a8242f220055 Mon Sep 17 00:00:00 2001 From: Kipok Date: Tue, 29 May 2018 17:57:25 -0700 Subject: [PATCH 002/102] Fix parallel evaluation in tower mode --- open_seq2seq/utils/utils.py | 145 ++++++++++++++++++++++++++++-------- 1 file changed, 115 insertions(+), 30 deletions(-) diff --git a/open_seq2seq/utils/utils.py b/open_seq2seq/utils/utils.py index 92f2abd75..11976dec3 100644 --- a/open_seq2seq/utils/utils.py +++ b/open_seq2seq/utils/utils.py @@ -77,6 +77,108 @@ def clip_last_batch(last_batch, true_size): return last_batch_clipped +def iterate_data(model, sess, compute_loss, mode, verbose): + total_time = 0.0 + bench_start = model.params.get('bench_start', 10) + results_per_batch = [] + + size_defined = model.get_data_layer().get_size_in_samples() is not None + if size_defined: + dl_sizes = [] + + if compute_loss: + total_loss = 0.0 + + total_samples = [] + fetches = [] + + for worker_id in range(model.num_gpus): + cur_fetches = [ + model.get_data_layer(worker_id).input_tensors, + model.get_output_tensors(worker_id), + ] + if compute_loss: + cur_fetches.append(model.eval_losses[worker_id]) + fetches.append(cur_fetches) + total_samples.append(0.0) + if size_defined: + dl_sizes.append(model.get_data_layer(worker_id).get_size_in_samples) + + sess.run([model.get_data_layer(i).iterator.initializer + for i in range(model.num_gpus)]) + + step = 0 + + while True: + tm = time.time() + if size_defined: + fetches_vals = sess.run(fetches) + else: + # if size is not defined we have to process fetches sequentially, so not + # to lose data when exception is thrown on one data layer + fetches_vals = [] + for one_fetch in fetches: + try: + fetches_vals.append(sess.run(one_fetch)) + except tf.errors.OutOfRangeError: + continue + + if step >= bench_start: + total_time += time.time() - tm + + skip_workers = 0 + + # looping over num_gpus. 
In Horovod case this loop is "dummy", + # since num_gpus = 1 + for worker_id, fetches_val in enumerate(fetches_vals): + if compute_loss: + inputs, outputs, loss = fetches_val + else: + inputs, outputs = fetches_val + + # assuming any element of inputs["source_tensors"] .shape[0] is batch size + batch_size = inputs["source_tensors"][0].shape[0] + total_samples[worker_id] += batch_size + + if size_defined: + # this data_layer is finished + if total_samples[worker_id] - batch_size > dl_sizes[worker_id]: + skip_workers += 1 + continue + + # this data_layer is at the last batch with few more elements, cutting + if total_samples[worker_id] > dl_sizes[worker_id]: + last_batch_size = dl_sizes[worker_id] % batch_size + inputs["source_tensors"] = model.clip_last_batch( + inputs["source_tensors"], last_batch_size, + ) + if 'target_tensors' in inputs: + inputs["target_tensors"] = model.clip_last_batch( + inputs["target_tensors"], last_batch_size, + ) + outputs = model.clip_last_batch(outputs, last_batch_size) + + if compute_loss: + total_loss += loss * batch_size + + if mode == 'eval': + results_per_batch.append(model.evaluate(inputs, outputs)) + elif mode == 'infer': + results_per_batch.append(model.infer(inputs, outputs)) + else: + raise ValueError("Unknown mode: {}".format(mode)) + + if len(fetches_vals) == 0 or skip_workers == model.num_gpus: + break + step += 1 + + if compute_loss: + return results_per_batch, total_loss, np.sum(total_samples) + else: + return results_per_batch + + + def iterate_data_layer(model, dl_id, sess, compute_loss, mode, verbose): total_time = 0.0 bench_start = model.params.get('bench_start', 10) @@ -129,7 +231,7 @@ def iterate_data_layer(model, dl_id, sess, compute_loss, mode, verbose): if last_batch_size != 0: cross_over = 1 else: - # setting data_size to be infinity and assume + # setting data_size to be infinity and assuming # that tf.errors.OutOfRangeError will be raised data_size = 1000000000000 @@ -195,31 +297,14 @@ def iterate_data_layer(model, dl_id, sess, compute_loss, mode, verbose): def get_results_for_epoch(model, sess, compute_loss, mode, verbose=False): - if model.on_horovod: - if compute_loss: - results_per_batch, total_loss, total_samples = iterate_data_layer( - model, 0, sess, compute_loss, mode, verbose, - ) - else: - results_per_batch = iterate_data_layer( - model, 0, sess, compute_loss, mode, verbose, - ) + if compute_loss: + results_per_batch, total_loss, total_samples = iterate_data( + model, sess, compute_loss, mode, verbose, + ) else: - results_per_batch_all = [] - total_loss_all = [] - total_samples_all = [] - for dl_id in range(model.num_gpus): - if compute_loss: - results_per_batch, total_loss, total_samples = iterate_data_layer( - model, dl_id, sess, compute_loss, mode, verbose, - ) - total_loss_all.append(total_loss) - total_samples_all.append(total_samples) - else: - results_per_batch = iterate_data_layer( - model, dl_id, sess, compute_loss, mode, verbose, - ) - results_per_batch_all.append(results_per_batch) + results_per_batch = iterate_data( + model, sess, compute_loss, mode, verbose, + ) if model.on_horovod: import mpi4py.rc @@ -239,11 +324,11 @@ def get_results_for_epoch(model, sess, compute_loss, mode, verbose=False): else: return None - if compute_loss: - total_loss = np.sum(total_loss_all) - total_samples = np.sum(total_samples_all) - # moving GPU dimension into the batch dimension - results_per_batch = [item for sl in results_per_batch_all for item in sl] + if compute_loss: + total_loss = np.sum(total_loss_all) + total_samples = 
np.sum(total_samples_all) + # moving GPU dimension into the batch dimension + results_per_batch = [item for sl in results_per_batch_all for item in sl] if compute_loss: total_loss /= total_samples From 88a74f1cf425dba9e2e29d3f683788fb6e69443f Mon Sep 17 00:00:00 2001 From: Kipok Date: Wed, 30 May 2018 13:21:45 -0700 Subject: [PATCH 003/102] Add verbose to new evaluation --- open_seq2seq/utils/utils.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/open_seq2seq/utils/utils.py b/open_seq2seq/utils/utils.py index 11976dec3..fd4b2df96 100644 --- a/open_seq2seq/utils/utils.py +++ b/open_seq2seq/utils/utils.py @@ -102,13 +102,19 @@ def iterate_data(model, sess, compute_loss, mode, verbose): fetches.append(cur_fetches) total_samples.append(0.0) if size_defined: - dl_sizes.append(model.get_data_layer(worker_id).get_size_in_samples) + dl_sizes.append(model.get_data_layer(worker_id).get_size_in_samples()) sess.run([model.get_data_layer(i).iterator.initializer for i in range(model.num_gpus)]) step = 0 + if verbose: + if model.on_horovod: + ending = "on worker {}".format(model.hvd.rank()) + else: + ending = "" + while True: tm = time.time() if size_defined: @@ -168,10 +174,30 @@ def iterate_data(model, sess, compute_loss, mode, verbose): else: raise ValueError("Unknown mode: {}".format(mode)) + if verbose: + if size_defined: + data_size = np.sum(dl_sizes) + if data_size > 10 and step % (data_size // 10) == 0: + deco_print("Processed {}/{} batches{}".format( + (step + 1) * model.num_gpus, data_size, ending)) + else: + deco_print("Processed {} batches{}".format(step + 1, ending), end='\r') + if len(fetches_vals) == 0 or skip_workers == model.num_gpus: break step += 1 + if verbose: + if step > bench_start: + deco_print( + "Avg time per step: {:.3}s{}".format( + 1.0 * total_time / (step - bench_start), ending), + ) + else: + deco_print( + "Not enough steps for benchmarking{}".format(ending) + ) + if compute_loss: return results_per_batch, total_loss, np.sum(total_samples) else: From a33ed4b569398e563bd4543a5301e6b7baa34c74 Mon Sep 17 00:00:00 2001 From: Kipok Date: Wed, 30 May 2018 15:36:35 -0700 Subject: [PATCH 004/102] Fix speech2text inference --- open_seq2seq/data/speech2text/speech2text.py | 44 ++++--- open_seq2seq/models/speech2text.py | 12 +- open_seq2seq/utils/utils.py | 127 +------------------ open_seq2seq/utils/utils_test.py | 16 ++- 4 files changed, 46 insertions(+), 153 deletions(-) diff --git a/open_seq2seq/data/speech2text/speech2text.py b/open_seq2seq/data/speech2text/speech2text.py index 938338672..faf7acb7c 100644 --- a/open_seq2seq/data/speech2text/speech2text.py +++ b/open_seq2seq/data/speech2text/speech2text.py @@ -32,7 +32,7 @@ def get_optional_params(): 'pad_to': int, }) - def __init__(self, params, model, num_workers=None, worker_id=None): + def __init__(self, params, model, num_workers, worker_id): """Speech-to-text data layer constructor. See parent class for arguments description. 
@@ -87,14 +87,8 @@ def __init__(self, params, model, num_workers=None, worker_id=None): def split_data(self, data): """Method that performs data split for evaluation.""" - if self.params['mode'] != 'train' and self._num_workers is not None: - size = len(data) - start = size // self._num_workers * self._worker_id - if self._worker_id == self._num_workers - 1: - end = size - else: - end = size // self._num_workers * (self._worker_id + 1) - return data[start:end] + if self.params['mode'] != 'train': + return data[self._worker_id::self._num_workers] else: return data @@ -105,12 +99,12 @@ def iterator(self): def build_graph(self): """Builds data processing graph using ``tf.data`` API.""" - self._dataset = tf.data.Dataset.from_tensor_slices(self._files) - if self.params['shuffle']: - self._dataset = self._dataset.shuffle(self._size) - self._dataset = self._dataset.repeat() - if self.params['mode'] != 'infer': + self._dataset = tf.data.Dataset.from_tensor_slices(self._files) + if self.params['shuffle']: + self._dataset = self._dataset.shuffle(self._size) + self._dataset = self._dataset.repeat() + self._dataset = self._dataset.map( lambda line: tf.py_func( self._parse_audio_transcript_element, @@ -125,18 +119,25 @@ def build_graph(self): padded_shapes=([None, self.params['num_audio_features']], 1, [None], 1) ) else: + indices = self.split_data( + np.array(map(lambda num: str(num), range(len(self.all_files)))) + ) + self._dataset = tf.data.Dataset.from_tensor_slices( + np.hstack((indices[:, np.newaxis], self._files[:, np.newaxis])) + ) + self._dataset = self._dataset.repeat() self._dataset = self._dataset.map( lambda line: tf.py_func( self._parse_audio_element, [line], - [self.params['dtype'], tf.int32], + [self.params['dtype'], tf.int32, tf.int32], stateful=False, ), num_parallel_calls=8, ) self._dataset = self._dataset.padded_batch( self.params['batch_size'], - padded_shapes=([None, self.params['num_audio_features']], 1) + padded_shapes=([None, self.params['num_audio_features']], 1, 1) ) self._iterator = self._dataset.prefetch(8).make_initializable_iterator() @@ -148,7 +149,9 @@ def build_graph(self): y.set_shape([self.params['batch_size'], None]) y_length = tf.reshape(y_length, [self.params['batch_size']]) else: - x, x_length = self._iterator.get_next() + x, x_length, x_id = self._iterator.get_next() + x_id = tf.reshape(x_id, [self.params['batch_size']]) + x.set_shape([self.params['batch_size'], None, self.params['num_audio_features']]) x_length = tf.reshape(x_length, [self.params['batch_size']]) @@ -157,6 +160,8 @@ def build_graph(self): self._input_tensors["source_tensors"] = [x, x_length] if self.params['mode'] != 'infer': self._input_tensors['target_tensors'] = [y, y_length] + else: + self._input_tensors['source_ids'] = [x_id] def _parse_audio_transcript_element(self, element): """Parses tf.data element from TextLineDataset into audio and text. @@ -183,7 +188,7 @@ def _parse_audio_transcript_element(self, element): np.int32(target), \ np.int32([len(target)]) - def _parse_audio_element(self, audio_filename): + def _parse_audio_element(self, id_and_audio_filename): """Parses audio from file and returns array of audio features. 
Args: @@ -192,6 +197,7 @@ def _parse_audio_element(self, audio_filename): Returns: tuple: source audio features as ``np.array``, length of source sequence, """ + idx, audio_filename = id_and_audio_filename pad_to = self.params.get('pad_to', 8) source = get_speech_features_from_file( audio_filename, self.params['num_audio_features'], pad_to, @@ -199,7 +205,7 @@ def _parse_audio_element(self, audio_filename): augmentation=self.params.get('augmentation', None), ) return source.astype(self.params['dtype'].as_numpy_dtype()), \ - np.int32([len(source)]) + np.int32([len(source)]), np.int32([idx]) @property def input_tensors(self): diff --git a/open_seq2seq/models/speech2text.py b/open_seq2seq/models/speech2text.py index 8a5c061e0..a4275b55b 100644 --- a/open_seq2seq/models/speech2text.py +++ b/open_seq2seq/models/speech2text.py @@ -120,13 +120,21 @@ def infer(self, input_values, output_values): ) for sample_id in range(len(decoded_texts)): preds.append("".join(decoded_texts[sample_id])) - return preds + return preds, input_values['source_ids'] def finalize_inference(self, results_per_batch, output_file): preds = [] + ids = [] - for result in results_per_batch: + for result, idx in results_per_batch: preds.extend(result) + ids.extend(idx) + + preds = np.array(preds) + ids = np.hstack(ids) + # restoring the correct order + preds = preds[np.argsort(ids)] + pd.DataFrame( { 'wav_filename': self.get_data_layer().all_files, diff --git a/open_seq2seq/utils/utils.py b/open_seq2seq/utils/utils.py index fd4b2df96..6565fd87c 100644 --- a/open_seq2seq/utils/utils.py +++ b/open_seq2seq/utils/utils.py @@ -155,13 +155,8 @@ def iterate_data(model, sess, compute_loss, mode, verbose): # this data_layer is at the last batch with few more elements, cutting if total_samples[worker_id] > dl_sizes[worker_id]: last_batch_size = dl_sizes[worker_id] % batch_size - inputs["source_tensors"] = model.clip_last_batch( - inputs["source_tensors"], last_batch_size, - ) - if 'target_tensors' in inputs: - inputs["target_tensors"] = model.clip_last_batch( - inputs["target_tensors"], last_batch_size, - ) + for key, value in inputs.items(): + inputs[key] = model.clip_last_batch(value, last_batch_size) outputs = model.clip_last_batch(outputs, last_batch_size) if compute_loss: @@ -204,124 +199,6 @@ def iterate_data(model, sess, compute_loss, mode, verbose): return results_per_batch - -def iterate_data_layer(model, dl_id, sess, compute_loss, mode, verbose): - total_time = 0.0 - bench_start = model.params.get('bench_start', 10) - results_per_batch = [] - - if model.on_horovod: - data_layer = model.get_data_layer() - if compute_loss: - loss_tensor = model.eval_losses[0] - output_tensors = model.get_output_tensors() - else: - data_layer = model.get_data_layer(dl_id) - if compute_loss: - loss_tensor = model.eval_losses[dl_id] - output_tensors = model.get_output_tensors(dl_id) - - sess.run(data_layer.iterator.initializer) - - fetches = [ - data_layer.input_tensors, - output_tensors, - ] - - if compute_loss: - fetches.append(loss_tensor) - total_loss = 0.0 - total_samples = 0.0 - - size_defined = data_layer.get_size_in_samples() is not None - - if size_defined: - data_size = data_layer.get_size_in_samples() // \ - data_layer.params['batch_size'] - last_batch_size = data_layer.get_size_in_samples() % \ - data_layer.params['batch_size'] - - if model.on_horovod: - worker_id = model.hvd.rank() - else: - worker_id = dl_id - - cross_over = 0 - if size_defined: - if data_size == 0: - raise ValueError( - "Batch size is bigger than dataset size: {} > 
{}".format( - data_layer.params['batch_size'], data_layer.get_size_in_samples() - ) - ) - if last_batch_size != 0: - cross_over = 1 - else: - # setting data_size to be infinity and assuming - # that tf.errors.OutOfRangeError will be raised - data_size = 1000000000000 - - for step in range(data_size + cross_over): - tm = time.time() - try: - if compute_loss: - inputs, outputs, loss = sess.run(fetches) - else: - inputs, outputs = sess.run(fetches) - except tf.errors.OutOfRangeError: - break - if step >= bench_start: - total_time += time.time() - tm - - # assuming any element of inputs["source_tensors"][ shape[0] is batch size - batch_size = inputs["source_tensors"][0].shape[0] - - if compute_loss: - total_loss += loss * batch_size - total_samples += batch_size - - if size_defined and step == data_size: - inputs["source_tensors"] = model.clip_last_batch( - inputs["source_tensors"], last_batch_size, - ) - if 'target_tensors' in inputs: - inputs["target_tensors"] = model.clip_last_batch( - inputs["target_tensors"], last_batch_size, - ) - outputs = model.clip_last_batch(outputs, last_batch_size) - - if mode == 'eval': - results_per_batch.append(model.evaluate(inputs, outputs)) - elif mode == 'infer': - results_per_batch.append(model.infer(inputs, outputs)) - else: - raise ValueError("Unknown mode: {}".format(mode)) - - if verbose: - if size_defined: - if data_size > 10 and step % (data_size // 10) == 0: - deco_print("Processed {}/{} batches on worker {}".format( - step + 1, data_size, worker_id)) - else: - deco_print("Processed {} batches".format(step + 1), end='\r') - - if verbose: - if step > bench_start: - deco_print( - "Avg time per step: {:.3}s on worker {}".format( - 1.0 * total_time / (step - bench_start), worker_id), - ) - else: - deco_print( - "Not enough steps for benchmarking on worker {}".format(worker_id) - ) - - if compute_loss: - return results_per_batch, total_loss, total_samples - else: - return results_per_batch - - def get_results_for_epoch(model, sess, compute_loss, mode, verbose=False): if compute_loss: results_per_batch, total_loss, total_samples = iterate_data( diff --git a/open_seq2seq/utils/utils_test.py b/open_seq2seq/utils/utils_test.py index 65f345d2f..6990ab1ed 100644 --- a/open_seq2seq/utils/utils_test.py +++ b/open_seq2seq/utils/utils_test.py @@ -27,7 +27,7 @@ def setUp(self): def tearDown(self): pass - def test_get_batches_for_epoch(self): + def test_get_results_for_epoch(self): # this will take all gpu memory, but that's probably fine for tests gpus = get_available_gpus() length_list = [] @@ -40,16 +40,18 @@ def test_get_batches_for_epoch(self): with tf.Graph().as_default() as g: self.eval_config['batch_size_per_gpu'] = bs self.eval_config['num_gpus'] = num_gpus - model = base_model(params=self.eval_config, mode="eval", hvd=None) + model = base_model(params=self.eval_config, mode="infer", hvd=None) model.compile() - model.evaluate = lambda inputs, outputs: inputs - model.finalize_evaluation = lambda results: results + model.infer = lambda inputs, outputs: inputs + model.finalize_inference = lambda results: results with self.test_session(g, use_gpu=True) as sess: sess.run(tf.global_variables_initializer()) - inputs_per_batch = get_results_for_epoch(model, sess, False, "eval") - length_list.append(np.hstack([inp['source_tensors'][1] - for inp in inputs_per_batch])) + inputs_per_batch = get_results_for_epoch(model, sess, False, "infer") + length = np.hstack([inp['source_tensors'][1] + for inp in inputs_per_batch]) + ids = np.hstack([inp['source_ids'] for inp in 
inputs_per_batch]) + length_list.append(length[np.argsort(ids)]) for i in range(len(length_list) - 1): npt.assert_allclose(length_list[i], length_list[i + 1]) From cb556a585da0b65d28d2fc7d52c3851ac1a7d369 Mon Sep 17 00:00:00 2001 From: Kipok Date: Wed, 30 May 2018 15:43:05 -0700 Subject: [PATCH 005/102] Switch to collect_if_horovod in get_results_for_epoch --- open_seq2seq/utils/utils.py | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/open_seq2seq/utils/utils.py b/open_seq2seq/utils/utils.py index 6565fd87c..ed92b19db 100644 --- a/open_seq2seq/utils/utils.py +++ b/open_seq2seq/utils/utils.py @@ -209,35 +209,22 @@ def get_results_for_epoch(model, sess, compute_loss, mode, verbose=False): model, sess, compute_loss, mode, verbose, ) - if model.on_horovod: - import mpi4py.rc - mpi4py.rc.initialize = False - from mpi4py import MPI - - if compute_loss: - total_samples_all = MPI.COMM_WORLD.gather(total_samples) - total_loss_all = MPI.COMM_WORLD.gather(total_loss) - results_per_batch_all = MPI.COMM_WORLD.gather(results_per_batch) - - MPI.COMM_WORLD.Barrier() - if MPI.COMM_WORLD.Get_rank() != 0: - # returning dummy tuple of correct shape - if compute_loss: - return None, None - else: - return None + if compute_loss: + total_samples = collect_if_horovod(total_samples, model.hvd, 'sum') + total_loss = collect_if_horovod(total_loss, model.hvd, 'sum') + results_per_batch = collect_if_horovod(results_per_batch, model.hvd, 'gather') + if results_per_batch is None: + # returning dummy tuple of correct shape if not in master worker if compute_loss: - total_loss = np.sum(total_loss_all) - total_samples = np.sum(total_samples_all) - # moving GPU dimension into the batch dimension - results_per_batch = [item for sl in results_per_batch_all for item in sl] + return None, None + else: + return None if compute_loss: - total_loss /= total_samples - return results_per_batch, total_loss - - return results_per_batch + return results_per_batch, total_loss / total_samples + else: + return results_per_batch def log_summaries_from_dict(dict_to_log, output_dir, step): From 5224f283c6e422c732d1590c45b2941a1e10f983 Mon Sep 17 00:00:00 2001 From: Kipok Date: Wed, 30 May 2018 16:09:46 -0700 Subject: [PATCH 006/102] Switch back to full split data --- open_seq2seq/data/speech2text/speech2text.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/open_seq2seq/data/speech2text/speech2text.py b/open_seq2seq/data/speech2text/speech2text.py index faf7acb7c..fcd87fb3c 100644 --- a/open_seq2seq/data/speech2text/speech2text.py +++ b/open_seq2seq/data/speech2text/speech2text.py @@ -86,9 +86,14 @@ def __init__(self, params, model, num_workers, worker_id): self._input_tensors = None def split_data(self, data): - """Method that performs data split for evaluation.""" - if self.params['mode'] != 'train': - return data[self._worker_id::self._num_workers] + if self.params['mode'] != 'train' and self._num_workers is not None: + size = len(data) + start = size // self._num_workers * self._worker_id + if self._worker_id == self._num_workers - 1: + end = size + else: + end = size // self._num_workers * (self._worker_id + 1) + return data[start:end] else: return data From 55698471a4c4013a0bff2189b83a06a9ed55249c Mon Sep 17 00:00:00 2001 From: Kipok Date: Wed, 30 May 2018 16:25:29 -0700 Subject: [PATCH 007/102] Docs fix --- open_seq2seq/data/speech2text/speech2text.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/open_seq2seq/data/speech2text/speech2text.py b/open_seq2seq/data/speech2text/speech2text.py index fcd87fb3c..173edb334 100644 --- a/open_seq2seq/data/speech2text/speech2text.py +++ b/open_seq2seq/data/speech2text/speech2text.py @@ -197,10 +197,11 @@ def _parse_audio_element(self, id_and_audio_filename): """Parses audio from file and returns array of audio features. Args: - audio_filename: audio file name. + id_and_audio_filename: tuple of sample id and corresponding audio file name. Returns: tuple: source audio features as ``np.array``, length of source sequence, + sample id. """ idx, audio_filename = id_and_audio_filename pad_to = self.params.get('pad_to', 8) From d3f0b3267494f797842c778c70910057e036ac17 Mon Sep 17 00:00:00 2001 From: Kipok Date: Wed, 30 May 2018 16:34:20 -0700 Subject: [PATCH 008/102] Add benchmark objects to eval and infer --- open_seq2seq/models/image2label.py | 2 +- open_seq2seq/models/model.py | 15 ++++++++++++++- open_seq2seq/models/speech2text.py | 2 +- open_seq2seq/models/text2text.py | 2 +- open_seq2seq/utils/utils.py | 31 ++++++++++++++++++++++-------- 5 files changed, 40 insertions(+), 12 deletions(-) diff --git a/open_seq2seq/models/image2label.py b/open_seq2seq/models/image2label.py index bc932b6d8..884044d2d 100644 --- a/open_seq2seq/models/image2label.py +++ b/open_seq2seq/models/image2label.py @@ -60,7 +60,7 @@ def evaluate(self, input_values, output_values): top5 = np.sum(labels[:, np.newaxis] == np.argpartition(logits, -5)[:, -5:]) return total, top1, top5 - def get_num_objects_per_step(self, worker_id=0): + def _get_num_objects_per_step(self, worker_id=0): """Returns number of images in current batch, i.e. batch size.""" data_layer = self.get_data_layer(worker_id) num_images = tf.shape(data_layer.input_tensors['source_tensors'][0])[0] diff --git a/open_seq2seq/models/model.py b/open_seq2seq/models/model.py index f7fccd29d..78d025139 100644 --- a/open_seq2seq/models/model.py +++ b/open_seq2seq/models/model.py @@ -275,6 +275,7 @@ class docs. self.loss = None self.train_op = None self.eval_losses = None + self._num_objects_per_step = None def compile(self, force_var_reuse=False): """TensorFlow graph is built here.""" @@ -339,6 +340,12 @@ def compile(self, force_var_reuse=False): if self._mode == "eval": self.eval_losses = [loss] + try: + self._num_objects_per_step = [self._get_num_objects_per_step(worker_id) + for worker_id in range(self.num_gpus)] + except NotImplementedError: + pass + if self._mode == "train": if 'lr_policy' not in self.params: lr_policy = None @@ -621,7 +628,7 @@ def get_tf_dtype(self): else: return self.params['dtype'] - def get_num_objects_per_step(self, worker_id=0): + def _get_num_objects_per_step(self, worker_id=0): """Define this method if you need benchmarking functionality. 
For example, for translation models, this method should return number of tokens in current batch, for image recognition model should return number @@ -636,6 +643,12 @@ def get_num_objects_per_step(self, worker_id=0): """ raise NotImplementedError() + def get_num_objects_per_step(self, worker_id=0): + if self._num_objects_per_step: + return self._num_objects_per_step[worker_id] + else: + raise NotImplementedError() + @property def params(self): """Parameters used to construct the model (dictionary).""" diff --git a/open_seq2seq/models/speech2text.py b/open_seq2seq/models/speech2text.py index a4275b55b..976817bb2 100644 --- a/open_seq2seq/models/speech2text.py +++ b/open_seq2seq/models/speech2text.py @@ -143,7 +143,7 @@ def finalize_inference(self, results_per_batch, output_file): columns=['wav_filename', 'predicted_transcript'], ).to_csv(output_file, index=False) - def get_num_objects_per_step(self, worker_id=0): + def _get_num_objects_per_step(self, worker_id=0): """Returns number of audio frames in current batch.""" data_layer = self.get_data_layer(worker_id) num_frames = tf.reduce_sum(data_layer.input_tensors['source_tensors'][1]) diff --git a/open_seq2seq/models/text2text.py b/open_seq2seq/models/text2text.py index dd563963e..14c183c47 100644 --- a/open_seq2seq/models/text2text.py +++ b/open_seq2seq/models/text2text.py @@ -214,7 +214,7 @@ def finalize_evaluation(self, results_per_batch): return {} - def get_num_objects_per_step(self, worker_id=0): + def _get_num_objects_per_step(self, worker_id=0): """Returns number of source tokens + number of target tokens in batch.""" data_layer = self.get_data_layer(worker_id) # sum of source length in batch diff --git a/open_seq2seq/utils/utils.py b/open_seq2seq/utils/utils.py index ed92b19db..d8f5f72e3 100644 --- a/open_seq2seq/utils/utils.py +++ b/open_seq2seq/utils/utils.py @@ -92,6 +92,7 @@ def iterate_data(model, sess, compute_loss, mode, verbose): total_samples = [] fetches = [] + # on horovod num_gpus is 1 for worker_id in range(model.num_gpus): cur_fetches = [ model.get_data_layer(worker_id).input_tensors, @@ -99,19 +100,25 @@ def iterate_data(model, sess, compute_loss, mode, verbose): ] if compute_loss: cur_fetches.append(model.eval_losses[worker_id]) - fetches.append(cur_fetches) - total_samples.append(0.0) if size_defined: dl_sizes.append(model.get_data_layer(worker_id).get_size_in_samples()) + try: + total_objects = 0.0 + cur_fetches.append(model.get_num_objects_per_step(worker_id)) + except NotImplementedError: + total_objects = None + deco_print("WARNING: Can't compute number of objects per step, since " + "train model does not define get_num_objects_per_step method.") + fetches.append(cur_fetches) + total_samples.append(0.0) sess.run([model.get_data_layer(i).iterator.initializer for i in range(model.num_gpus)]) step = 0 - if verbose: if model.on_horovod: - ending = "on worker {}".format(model.hvd.rank()) + ending = " on worker {}".format(model.hvd.rank()) else: ending = "" @@ -138,9 +145,12 @@ def iterate_data(model, sess, compute_loss, mode, verbose): # since num_gpus = 1 for worker_id, fetches_val in enumerate(fetches_vals): if compute_loss: - inputs, outputs, loss = fetches_val + inputs, outputs, loss = fetches_val[:3] else: - inputs, outputs = fetches_val + inputs, outputs = fetches_val[:2] + + if total_objects is not None: + total_objects += np.sum(fetches_val[-1]) # assuming any element of inputs["source_tensors"] .shape[0] is batch size batch_size = inputs["source_tensors"][0].shape[0] @@ -171,8 +181,9 @@ def 
iterate_data(model, sess, compute_loss, mode, verbose): if verbose: if size_defined: - data_size = np.sum(dl_sizes) - if data_size > 10 and step % (data_size // 10) == 0: + data_size = int(np.ceil(np.sum(dl_sizes) / + model.params['batch_size_per_gpu'])) + if data_size > 10 and (step * model.num_gpus) % (data_size // 10) == 0: deco_print("Processed {}/{} batches{}".format( (step + 1) * model.num_gpus, data_size, ending)) else: @@ -188,6 +199,10 @@ def iterate_data(model, sess, compute_loss, mode, verbose): "Avg time per step: {:.3}s{}".format( 1.0 * total_time / (step - bench_start), ending), ) + if total_objects is not None: + avg_objects = 1.0 * total_objects / total_time + deco_print("Avg objects per second{}: {:.3f}".format(avg_objects, + ending)) else: deco_print( "Not enough steps for benchmarking{}".format(ending) From 7a68b94dc52a1107364fd91777986a4e5f217277 Mon Sep 17 00:00:00 2001 From: Kipok Date: Thu, 31 May 2018 11:46:09 -0700 Subject: [PATCH 009/102] Fix bugs with data_size --- open_seq2seq/utils/utils.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/open_seq2seq/utils/utils.py b/open_seq2seq/utils/utils.py index d8f5f72e3..f8934ddaf 100644 --- a/open_seq2seq/utils/utils.py +++ b/open_seq2seq/utils/utils.py @@ -116,6 +116,7 @@ def iterate_data(model, sess, compute_loss, mode, verbose): for i in range(model.num_gpus)]) step = 0 + processed_batches = 0 if verbose: if model.on_horovod: ending = " on worker {}".format(model.hvd.rank()) @@ -169,6 +170,8 @@ def iterate_data(model, sess, compute_loss, mode, verbose): inputs[key] = model.clip_last_batch(value, last_batch_size) outputs = model.clip_last_batch(outputs, last_batch_size) + processed_batches += 1 + if compute_loss: total_loss += loss * batch_size @@ -179,30 +182,32 @@ def iterate_data(model, sess, compute_loss, mode, verbose): else: raise ValueError("Unknown mode: {}".format(mode)) + if len(fetches_vals) == 0 or skip_workers == model.num_gpus: + break + if verbose: if size_defined: - data_size = int(np.ceil(np.sum(dl_sizes) / - model.params['batch_size_per_gpu'])) - if data_size > 10 and (step * model.num_gpus) % (data_size // 10) == 0: + data_size = int(np.sum(np.ceil(np.array(dl_sizes) / + model.params['batch_size_per_gpu']))) + if step == 0 or (data_size > 10 and + processed_batches % (data_size // 10) == 0): deco_print("Processed {}/{} batches{}".format( - (step + 1) * model.num_gpus, data_size, ending)) + processed_batches, data_size, ending)) else: - deco_print("Processed {} batches{}".format(step + 1, ending), end='\r') - - if len(fetches_vals) == 0 or skip_workers == model.num_gpus: - break + deco_print("Processed {} batches{}".format(processed_batches, ending), + end='\r') step += 1 if verbose: if step > bench_start: deco_print( - "Avg time per step: {:.3}s{}".format( - 1.0 * total_time / (step - bench_start), ending), + "Avg time per step{}: {:.3}s".format( + ending, 1.0 * total_time / (step - bench_start)), ) if total_objects is not None: avg_objects = 1.0 * total_objects / total_time - deco_print("Avg objects per second{}: {:.3f}".format(avg_objects, - ending)) + deco_print("Avg objects per second{}: {:.3f}".format(ending, + avg_objects)) else: deco_print( "Not enough steps for benchmarking{}".format(ending) From 18c82f6d8af1a6fc807e80bff6b2e1400f632a44 Mon Sep 17 00:00:00 2001 From: Kipok Date: Thu, 31 May 2018 14:20:36 -0700 Subject: [PATCH 010/102] Add normalization to imagenet preprocessing --- .../data/image2label/imagenet_preprocessing.py | 13 
+++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/open_seq2seq/data/image2label/imagenet_preprocessing.py b/open_seq2seq/data/image2label/imagenet_preprocessing.py index 559257359..ac3a2f490 100644 --- a/open_seq2seq/data/image2label/imagenet_preprocessing.py +++ b/open_seq2seq/data/image2label/imagenet_preprocessing.py @@ -123,12 +123,12 @@ def _central_crop(image, crop_height, crop_width): image, [crop_top, crop_left, 0], [crop_height, crop_width, -1]) -def _mean_image_subtraction(image, means, num_channels): - """Subtracts the given means from each image channel. +def _mean_image_subtraction_and_normalization(image, means, num_channels): + """Subtracts the given means from each image channel and divides by 127.5. For example: means = [123.68, 116.779, 103.939] - image = _mean_image_subtraction(image, means) + image = _mean_image_subtraction_and_normalization(image, means) Note that the rank of `image` must be known. @@ -138,7 +138,7 @@ def _mean_image_subtraction(image, means, num_channels): num_channels: number of color channels in the image that will be distorted. Returns: - the centered image. + the centered image and normalized image. Raises: ValueError: If the rank of `image` is unknown, if `image` has a rank other @@ -154,7 +154,7 @@ def _mean_image_subtraction(image, means, num_channels): # We have a 1-D tensor of means; convert to 3-D. means = tf.expand_dims(tf.expand_dims(means, 0), 0) - return image - means + return (image - means) / 127.5 def _smallest_size_at_least(height, width, resize_min): @@ -261,7 +261,8 @@ def preprocess_image(image_buffer, bbox, output_height, output_width, image.set_shape([output_height, output_width, num_channels]) - return _mean_image_subtraction(image, _CHANNEL_MEANS, num_channels) + return _mean_image_subtraction_and_normalization(image, _CHANNEL_MEANS, + num_channels) def _parse_example_proto(example_serialized): From b3730c71edaaac000ebac2331c47d05771d3baff Mon Sep 17 00:00:00 2001 From: Kipok Date: Thu, 31 May 2018 14:28:12 -0700 Subject: [PATCH 011/102] Add test configs --- .../image2label/resnet-50-v2-mp1.py | 62 +++++++++++++++++++ .../image2label/resnet-50-v2-mp2.py | 62 +++++++++++++++++++ .../image2label/resnet-50-v2-mp3.py | 62 +++++++++++++++++++ .../image2label/resnet-50-v2-mp4.py | 62 +++++++++++++++++++ .../image2label/resnet-50-v2-mp5.py | 62 +++++++++++++++++++ 5 files changed, 310 insertions(+) create mode 100644 example_configs/image2label/resnet-50-v2-mp1.py create mode 100644 example_configs/image2label/resnet-50-v2-mp2.py create mode 100644 example_configs/image2label/resnet-50-v2-mp3.py create mode 100644 example_configs/image2label/resnet-50-v2-mp4.py create mode 100644 example_configs/image2label/resnet-50-v2-mp5.py diff --git a/example_configs/image2label/resnet-50-v2-mp1.py b/example_configs/image2label/resnet-50-v2-mp1.py new file mode 100644 index 000000000..f00de256a --- /dev/null +++ b/example_configs/image2label/resnet-50-v2-mp1.py @@ -0,0 +1,62 @@ +from open_seq2seq.models import Image2Label +from open_seq2seq.encoders import ResNetEncoder +from open_seq2seq.decoders import FullyConnectedDecoder +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.data import ImagenetDataLayer +from open_seq2seq.optimizers.lr_policies import piecewise_constant +import tensorflow as tf + + +base_model = Image2Label + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 100, + + "num_gpus": 8, + "batch_size_per_gpu": 32, + "dtype": "mixed", + "loss_scale": 10.0, + + 
"save_summaries_steps": 2000, + "print_loss_steps": 100, + "print_samples_steps": 2000, + "eval_steps": 5000, + "save_checkpoint_steps": 5000, + "logdir": "experiments/resnet50-imagenet", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": piecewise_constant, + "lr_policy_params": { + "learning_rate": 0.1, + "boundaries": [30, 60, 80, 90], + "decay_rates": [0.1, 0.01, 0.001, 1e-4], + }, + + "initializer": tf.variance_scaling_initializer, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0001, + }, + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + "encoder": ResNetEncoder, + "encoder_params": { + 'resnet_size': 50, + "regularize_bn": False, + }, + "decoder": FullyConnectedDecoder, + "decoder_params": { + "output_dim": 1001, + }, + "loss": CrossEntropyLoss, + "data_layer": ImagenetDataLayer, + "data_layer_params": { + "data_dir": "data/tf-imagenet", + }, +} diff --git a/example_configs/image2label/resnet-50-v2-mp2.py b/example_configs/image2label/resnet-50-v2-mp2.py new file mode 100644 index 000000000..b8ec5e9e3 --- /dev/null +++ b/example_configs/image2label/resnet-50-v2-mp2.py @@ -0,0 +1,62 @@ +from open_seq2seq.models import Image2Label +from open_seq2seq.encoders import ResNetEncoder +from open_seq2seq.decoders import FullyConnectedDecoder +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.data import ImagenetDataLayer +from open_seq2seq.optimizers.lr_policies import piecewise_constant +import tensorflow as tf + + +base_model = Image2Label + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 100, + + "num_gpus": 8, + "batch_size_per_gpu": 32, + "dtype": "mixed", + "loss_scale": 100.0, + + "save_summaries_steps": 2000, + "print_loss_steps": 100, + "print_samples_steps": 2000, + "eval_steps": 5000, + "save_checkpoint_steps": 5000, + "logdir": "experiments/resnet50-imagenet", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": piecewise_constant, + "lr_policy_params": { + "learning_rate": 0.1, + "boundaries": [30, 60, 80, 90], + "decay_rates": [0.1, 0.01, 0.001, 1e-4], + }, + + "initializer": tf.variance_scaling_initializer, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0001, + }, + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + "encoder": ResNetEncoder, + "encoder_params": { + 'resnet_size': 50, + "regularize_bn": False, + }, + "decoder": FullyConnectedDecoder, + "decoder_params": { + "output_dim": 1001, + }, + "loss": CrossEntropyLoss, + "data_layer": ImagenetDataLayer, + "data_layer_params": { + "data_dir": "data/tf-imagenet", + }, +} diff --git a/example_configs/image2label/resnet-50-v2-mp3.py b/example_configs/image2label/resnet-50-v2-mp3.py new file mode 100644 index 000000000..ee20eba95 --- /dev/null +++ b/example_configs/image2label/resnet-50-v2-mp3.py @@ -0,0 +1,62 @@ +from open_seq2seq.models import Image2Label +from open_seq2seq.encoders import ResNetEncoder +from open_seq2seq.decoders import FullyConnectedDecoder +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.data import ImagenetDataLayer +from open_seq2seq.optimizers.lr_policies import piecewise_constant +import tensorflow as tf + + +base_model = Image2Label + +base_params = { + "random_seed": 0, + "use_horovod": 
False, + "num_epochs": 100, + + "num_gpus": 8, + "batch_size_per_gpu": 32, + "dtype": "mixed", + "loss_scale": 1000.0, + + "save_summaries_steps": 2000, + "print_loss_steps": 100, + "print_samples_steps": 2000, + "eval_steps": 5000, + "save_checkpoint_steps": 5000, + "logdir": "experiments/resnet50-imagenet", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": piecewise_constant, + "lr_policy_params": { + "learning_rate": 0.1, + "boundaries": [30, 60, 80, 90], + "decay_rates": [0.1, 0.01, 0.001, 1e-4], + }, + + "initializer": tf.variance_scaling_initializer, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0001, + }, + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + "encoder": ResNetEncoder, + "encoder_params": { + 'resnet_size': 50, + "regularize_bn": False, + }, + "decoder": FullyConnectedDecoder, + "decoder_params": { + "output_dim": 1001, + }, + "loss": CrossEntropyLoss, + "data_layer": ImagenetDataLayer, + "data_layer_params": { + "data_dir": "data/tf-imagenet", + }, +} diff --git a/example_configs/image2label/resnet-50-v2-mp4.py b/example_configs/image2label/resnet-50-v2-mp4.py new file mode 100644 index 000000000..89deede0f --- /dev/null +++ b/example_configs/image2label/resnet-50-v2-mp4.py @@ -0,0 +1,62 @@ +from open_seq2seq.models import Image2Label +from open_seq2seq.encoders import ResNetEncoder +from open_seq2seq.decoders import FullyConnectedDecoder +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.data import ImagenetDataLayer +from open_seq2seq.optimizers.lr_policies import piecewise_constant +import tensorflow as tf + + +base_model = Image2Label + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 100, + + "num_gpus": 8, + "batch_size_per_gpu": 32, + "dtype": "mixed", + "automatic_loss_scaling": "Backoff", + + "save_summaries_steps": 2000, + "print_loss_steps": 100, + "print_samples_steps": 2000, + "eval_steps": 5000, + "save_checkpoint_steps": 5000, + "logdir": "experiments/resnet50-imagenet", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": piecewise_constant, + "lr_policy_params": { + "learning_rate": 0.1, + "boundaries": [30, 60, 80, 90], + "decay_rates": [0.1, 0.01, 0.001, 1e-4], + }, + + "initializer": tf.variance_scaling_initializer, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0001, + }, + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + "encoder": ResNetEncoder, + "encoder_params": { + 'resnet_size': 50, + "regularize_bn": False, + }, + "decoder": FullyConnectedDecoder, + "decoder_params": { + "output_dim": 1001, + }, + "loss": CrossEntropyLoss, + "data_layer": ImagenetDataLayer, + "data_layer_params": { + "data_dir": "data/tf-imagenet", + }, +} diff --git a/example_configs/image2label/resnet-50-v2-mp5.py b/example_configs/image2label/resnet-50-v2-mp5.py new file mode 100644 index 000000000..8098c2221 --- /dev/null +++ b/example_configs/image2label/resnet-50-v2-mp5.py @@ -0,0 +1,62 @@ +from open_seq2seq.models import Image2Label +from open_seq2seq.encoders import ResNetEncoder +from open_seq2seq.decoders import FullyConnectedDecoder +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.data import ImagenetDataLayer +from open_seq2seq.optimizers.lr_policies 
import piecewise_constant +import tensorflow as tf + + +base_model = Image2Label + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 100, + + "num_gpus": 8, + "batch_size_per_gpu": 32, + "dtype": "mixed", + "automatic_loss_scaling": "Logmax", + + "save_summaries_steps": 2000, + "print_loss_steps": 100, + "print_samples_steps": 2000, + "eval_steps": 5000, + "save_checkpoint_steps": 5000, + "logdir": "experiments/resnet50-imagenet", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": piecewise_constant, + "lr_policy_params": { + "learning_rate": 0.1, + "boundaries": [30, 60, 80, 90], + "decay_rates": [0.1, 0.01, 0.001, 1e-4], + }, + + "initializer": tf.variance_scaling_initializer, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0001, + }, + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + "encoder": ResNetEncoder, + "encoder_params": { + 'resnet_size': 50, + "regularize_bn": False, + }, + "decoder": FullyConnectedDecoder, + "decoder_params": { + "output_dim": 1001, + }, + "loss": CrossEntropyLoss, + "data_layer": ImagenetDataLayer, + "data_layer_params": { + "data_dir": "data/tf-imagenet", + }, +} From 7456bc5236ca720539c67ab64674ac179c2e0d94 Mon Sep 17 00:00:00 2001 From: Kipok Date: Thu, 31 May 2018 14:39:41 -0700 Subject: [PATCH 012/102] Renaming --- .../image2label/{resnet-50-v2-mp1.py => resnet-50-v2-mp_000.py} | 0 .../image2label/{resnet-50-v2-mp2.py => resnet-50-v2-mp_001.py} | 0 .../image2label/{resnet-50-v2-mp3.py => resnet-50-v2-mp_002.py} | 0 .../image2label/{resnet-50-v2-mp4.py => resnet-50-v2-mp_003.py} | 0 .../image2label/{resnet-50-v2-mp5.py => resnet-50-v2-mp_004.py} | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename example_configs/image2label/{resnet-50-v2-mp1.py => resnet-50-v2-mp_000.py} (100%) rename example_configs/image2label/{resnet-50-v2-mp2.py => resnet-50-v2-mp_001.py} (100%) rename example_configs/image2label/{resnet-50-v2-mp3.py => resnet-50-v2-mp_002.py} (100%) rename example_configs/image2label/{resnet-50-v2-mp4.py => resnet-50-v2-mp_003.py} (100%) rename example_configs/image2label/{resnet-50-v2-mp5.py => resnet-50-v2-mp_004.py} (100%) diff --git a/example_configs/image2label/resnet-50-v2-mp1.py b/example_configs/image2label/resnet-50-v2-mp_000.py similarity index 100% rename from example_configs/image2label/resnet-50-v2-mp1.py rename to example_configs/image2label/resnet-50-v2-mp_000.py diff --git a/example_configs/image2label/resnet-50-v2-mp2.py b/example_configs/image2label/resnet-50-v2-mp_001.py similarity index 100% rename from example_configs/image2label/resnet-50-v2-mp2.py rename to example_configs/image2label/resnet-50-v2-mp_001.py diff --git a/example_configs/image2label/resnet-50-v2-mp3.py b/example_configs/image2label/resnet-50-v2-mp_002.py similarity index 100% rename from example_configs/image2label/resnet-50-v2-mp3.py rename to example_configs/image2label/resnet-50-v2-mp_002.py diff --git a/example_configs/image2label/resnet-50-v2-mp4.py b/example_configs/image2label/resnet-50-v2-mp_003.py similarity index 100% rename from example_configs/image2label/resnet-50-v2-mp4.py rename to example_configs/image2label/resnet-50-v2-mp_003.py diff --git a/example_configs/image2label/resnet-50-v2-mp5.py b/example_configs/image2label/resnet-50-v2-mp_004.py similarity index 100% rename from example_configs/image2label/resnet-50-v2-mp5.py rename to 
example_configs/image2label/resnet-50-v2-mp_004.py From 8e945aad065ae8d60be881b632400a4ebfc22c0d Mon Sep 17 00:00:00 2001 From: Kipok Date: Thu, 31 May 2018 16:13:16 -0700 Subject: [PATCH 013/102] Fix regularizer in encoder/decoder --- open_seq2seq/decoders/decoder.py | 36 ++++++++++++++++++-------------- open_seq2seq/encoders/encoder.py | 36 ++++++++++++++++++-------------- 2 files changed, 40 insertions(+), 32 deletions(-) diff --git a/open_seq2seq/decoders/decoder.py b/open_seq2seq/decoders/decoder.py index 2c2ae46fc..e46d6ea79 100644 --- a/open_seq2seq/decoders/decoder.py +++ b/open_seq2seq/decoders/decoder.py @@ -87,22 +87,6 @@ def __init__(self, params, model, name="decoder", mode='train'): else: self._params['dtype'] = tf.float32 - if 'regularizer' not in self._params: - if self._model and 'regularizer' in self._model.params: - self._params['regularizer'] = self._model.params['regularizer'] - self._params['regularizer_params'] = self._model.params['regularizer_params'] - - if 'regularizer' in self._params: - init_dict = self._params.get('regularizer_params', {}) - self._params['regularizer'] = self._params['regularizer'](**init_dict) - if self._params['dtype'] == 'mixed': - self._params['regularizer'] = mp_regularizer_wrapper( - self._params['regularizer'], - ) - - if self._params['dtype'] == 'mixed': - self._params['dtype'] = tf.float16 - self._name = name self._mode = mode @@ -117,6 +101,26 @@ def decode(self, input_dict): Returns: see :meth:`self._decode() <_decode>` docs. """ + if 'regularizer' not in self._params: + if self._model and 'regularizer' in self._model.params: + self._params['regularizer'] = copy.deepcopy( + self._model.params['regularizer'] + ) + self._params['regularizer_params'] = copy.deepcopy( + self._model.params['regularizer_params'] + ) + + if 'regularizer' in self._params: + init_dict = self._params.get('regularizer_params', {}) + self._params['regularizer'] = self._params['regularizer'](**init_dict) + if self._params['dtype'] == 'mixed': + self._params['regularizer'] = mp_regularizer_wrapper( + self._params['regularizer'], + ) + + if self._params['dtype'] == 'mixed': + self._params['dtype'] = tf.float16 + if 'initializer' in self.params: init_dict = self.params.get('initializer_params', {}) initializer = self.params['initializer'](**init_dict) diff --git a/open_seq2seq/encoders/encoder.py b/open_seq2seq/encoders/encoder.py index 555b456a8..689308cef 100644 --- a/open_seq2seq/encoders/encoder.py +++ b/open_seq2seq/encoders/encoder.py @@ -87,22 +87,6 @@ def __init__(self, params, model, name="encoder", mode='train'): else: self._params['dtype'] = tf.float32 - if 'regularizer' not in self._params: - if self._model and 'regularizer' in self._model.params: - self._params['regularizer'] = self._model.params['regularizer'] - self._params['regularizer_params'] = self._model.params['regularizer_params'] - - if 'regularizer' in self._params: - init_dict = self._params.get('regularizer_params', {}) - self._params['regularizer'] = self._params['regularizer'](**init_dict) - if self._params['dtype'] == 'mixed': - self._params['regularizer'] = mp_regularizer_wrapper( - self._params['regularizer'], - ) - - if self._params['dtype'] == 'mixed': - self._params['dtype'] = tf.float16 - self._name = name self._mode = mode @@ -117,6 +101,26 @@ def encode(self, input_dict): Returns: see :meth:`self._encode() <_encode>` docs. 
""" + if 'regularizer' not in self._params: + if self._model and 'regularizer' in self._model.params: + self._params['regularizer'] = copy.deepcopy( + self._model.params['regularizer'] + ) + self._params['regularizer_params'] = copy.deepcopy( + self._model.params['regularizer_params'] + ) + + if 'regularizer' in self._params: + init_dict = self._params.get('regularizer_params', {}) + self._params['regularizer'] = self._params['regularizer'](**init_dict) + if self._params['dtype'] == 'mixed': + self._params['regularizer'] = mp_regularizer_wrapper( + self._params['regularizer'], + ) + + if self._params['dtype'] == 'mixed': + self._params['dtype'] = tf.float16 + if 'initializer' in self.params: init_dict = self.params.get('initializer_params', {}) initializer = self.params['initializer'](**init_dict) From c7d225e5bdc114c16ececae95836a13b2e1fa913 Mon Sep 17 00:00:00 2001 From: Kipok Date: Thu, 31 May 2018 16:16:48 -0700 Subject: [PATCH 014/102] Fix typo --- example_configs/image2label/resnet-50-v2-mp_004.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example_configs/image2label/resnet-50-v2-mp_004.py b/example_configs/image2label/resnet-50-v2-mp_004.py index 8098c2221..0c9228731 100644 --- a/example_configs/image2label/resnet-50-v2-mp_004.py +++ b/example_configs/image2label/resnet-50-v2-mp_004.py @@ -17,7 +17,7 @@ "num_gpus": 8, "batch_size_per_gpu": 32, "dtype": "mixed", - "automatic_loss_scaling": "Logmax", + "automatic_loss_scaling": "LogMax", "save_summaries_steps": 2000, "print_loss_steps": 100, From 2a7527d1e57666b8e83c686ec0d629fd084b8246 Mon Sep 17 00:00:00 2001 From: Kipok Date: Thu, 31 May 2018 19:54:46 -0700 Subject: [PATCH 015/102] Add basic alexnet --- example_configs/image2label/alexnet_owt.py | 59 +++++++++++++++++ .../image2label/imagenet_preprocessing.py | 49 +++----------- open_seq2seq/encoders/alexnet_encoder.py | 65 +++++++++++++++++++ open_seq2seq/encoders/resnet_blocks.py | 15 +++++ 4 files changed, 147 insertions(+), 41 deletions(-) create mode 100644 example_configs/image2label/alexnet_owt.py create mode 100644 open_seq2seq/encoders/alexnet_encoder.py diff --git a/example_configs/image2label/alexnet_owt.py b/example_configs/image2label/alexnet_owt.py new file mode 100644 index 000000000..449c1b76a --- /dev/null +++ b/example_configs/image2label/alexnet_owt.py @@ -0,0 +1,59 @@ +from open_seq2seq.models import Image2Label +from open_seq2seq.encoders.alexnet_encoder import AlexNetEncoder +from open_seq2seq.decoders import FullyConnectedDecoder +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.data import ImagenetDataLayer +from open_seq2seq.optimizers.lr_policies import piecewise_constant +import tensorflow as tf + + +base_model = Image2Label + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 100, + + "num_gpus": 8, + "batch_size_per_gpu": 128, + "dtype": tf.float32, + + "save_summaries_steps": 2000, + "print_loss_steps": 100, + "print_samples_steps": 2000, + "eval_steps": 5000, + "save_checkpoint_steps": 5000, + "logdir": "experiments/alexnet-imagenet", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": piecewise_constant, + "lr_policy_params": { + "learning_rate": 0.02, + "boundaries": [30, 60, 80, 90], + "decay_rates": [0.1, 0.01, 0.001, 1e-4], + }, + + "initializer": tf.variance_scaling_initializer, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0001, + }, + "summaries": ['learning_rate', 'variables', 'gradients', 
'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + "encoder": AlexNetEncoder, + "encoder_params": {}, + + "decoder": FullyConnectedDecoder, + "decoder_params": { + "output_dim": 1001, + }, + "loss": CrossEntropyLoss, + "data_layer": ImagenetDataLayer, + "data_layer_params": { + "data_dir": "data/tf-imagenet", + }, +} diff --git a/open_seq2seq/data/image2label/imagenet_preprocessing.py b/open_seq2seq/data/image2label/imagenet_preprocessing.py index 559257359..e3a0714bc 100644 --- a/open_seq2seq/data/image2label/imagenet_preprocessing.py +++ b/open_seq2seq/data/image2label/imagenet_preprocessing.py @@ -13,22 +13,17 @@ # limitations under the License. # ============================================================================== """Provides utilities to preprocess images. - Training images are sampled using the provided bounding boxes, and subsequently cropped to the sampled bounding box. Images are additionally flipped randomly, then resized to the target output size (without aspect-ratio preservation). - Images used during evaluation are resized (with aspect-ratio preservation) and centrally cropped. - All images undergo mean color subtraction. - Note that these steps are colloquially referred to as "ResNet preprocessing," and they differ from "VGG preprocessing," which does not use bounding boxes and instead does an aspect-preserving resize followed by random crop during training. (These both differ from "Inception preprocessing," which introduces color distortion steps.) - """ from __future__ import absolute_import @@ -54,21 +49,17 @@ def _decode_crop_and_flip(image_buffer, bbox, num_channels): """Crops the given image to a random part of the image, and randomly flips. - We use the fused decode_and_crop op, which performs better than the two ops used separately in series, but note that this requires that the image be passed in as an un-decoded string Tensor. - Args: image_buffer: scalar string Tensor representing the raw JPEG image buffer. bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] where each coordinate is [0, 1) and the coordinates are arranged as [ymin, xmin, ymax, xmax]. num_channels: Integer depth of the image buffer for decoding. - Returns: 3-D tensor with cropped image. - """ # A large fraction of image datasets contain a human-annotated bounding box # delineating the region of the image containing the object of interest. We @@ -103,12 +94,10 @@ def _decode_crop_and_flip(image_buffer, bbox, num_channels): def _central_crop(image, crop_height, crop_width): """Performs central crops of the given image list. - Args: image: a 3-D image tensor crop_height: the height of the image following the crop. crop_width: the width of the image following the crop. - Returns: 3-D tensor with cropped image. """ @@ -123,23 +112,18 @@ def _central_crop(image, crop_height, crop_width): image, [crop_top, crop_left, 0], [crop_height, crop_width, -1]) -def _mean_image_subtraction(image, means, num_channels): - """Subtracts the given means from each image channel. - +def _mean_image_subtraction_and_normalization(image, means, num_channels): + """Subtracts the given means from each image channel and divides by 127.5. For example: means = [123.68, 116.779, 103.939] - image = _mean_image_subtraction(image, means) - + image = _mean_image_subtraction_and_normalization(image, means) Note that the rank of `image` must be known. - Args: image: a tensor of size [height, width, C]. means: a C-vector of values to subtract from each channel. 
num_channels: number of color channels in the image that will be distorted. - Returns: - the centered image. - + the centered image and normalized image. Raises: ValueError: If the rank of `image` is unknown, if `image` has a rank other than three or if the number of channels in `image` doesn't match the @@ -154,21 +138,18 @@ def _mean_image_subtraction(image, means, num_channels): # We have a 1-D tensor of means; convert to 3-D. means = tf.expand_dims(tf.expand_dims(means, 0), 0) - return image - means + return (image - means) / 127.5 def _smallest_size_at_least(height, width, resize_min): """Computes new shape with the smallest side equal to `smallest_side`. - Computes new shape with the smallest side equal to `smallest_side` while preserving the original aspect ratio. - Args: height: an int32 scalar tensor indicating the current height. width: an int32 scalar tensor indicating the current width. resize_min: A python integer or scalar `Tensor` indicating the size of the smallest side after resize. - Returns: new_height: an int32 scalar tensor indicating the new height. new_width: an int32 scalar tensor indicating the new width. @@ -190,12 +171,10 @@ def _smallest_size_at_least(height, width, resize_min): def _aspect_preserving_resize(image, resize_min): """Resize images preserving the original aspect ratio. - Args: image: A 3-D image `Tensor`. resize_min: A python integer or scalar `Tensor` indicating the size of the smallest side after resize. - Returns: resized_image: A 3-D tensor containing the resized image. """ @@ -209,15 +188,12 @@ def _aspect_preserving_resize(image, resize_min): def _resize_image(image, height, width): """Simple wrapper around tf.resize_images. - This is primarily to make sure we use the same `ResizeMethod` and other details each time. - Args: image: A 3-D image `Tensor`. height: The target height for the resized image. width: The target width for the resized image. - Returns: resized_image: A 3-D tensor containing the resized image. The first two dimensions have the shape [height, width]. @@ -230,11 +206,9 @@ def _resize_image(image, height, width): def preprocess_image(image_buffer, bbox, output_height, output_width, num_channels, is_training=False): """Preprocesses the given image. - Preprocessing includes decoding, cropping, and resizing for both training and eval images. Training preprocessing, however, introduces some random distortion of the image to improve accuracy. - Args: image_buffer: scalar string Tensor representing the raw JPEG image buffer. bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] @@ -245,7 +219,6 @@ def preprocess_image(image_buffer, bbox, output_height, output_width, num_channels: Integer depth of the image buffer for decoding. is_training: `True` if we're preprocessing the image for training and `False` otherwise. - Returns: A preprocessed image. """ @@ -261,16 +234,15 @@ def preprocess_image(image_buffer, bbox, output_height, output_width, image.set_shape([output_height, output_width, num_channels]) - return _mean_image_subtraction(image, _CHANNEL_MEANS, num_channels) + return _mean_image_subtraction_and_normalization(image, _CHANNEL_MEANS, + num_channels) def _parse_example_proto(example_serialized): """Parses an Example proto containing a training example of an image. - The output of the build_image_data.py image preprocessing script is a dataset containing serialized Example protocol buffers. 
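The _mean_image_subtraction_and_normalization change above additionally divides by 127.5, so decoded uint8 pixels land roughly in the [-1, 1] range instead of being merely mean-centered. A minimal numeric sketch of the effect, assuming the ImageNet channel means used elsewhere in this file:

    import numpy as np

    means = np.array([123.68, 116.779, 103.939])         # assumed _CHANNEL_MEANS
    darkest, brightest = np.zeros(3), np.full(3, 255.0)  # uint8 extremes
    print((darkest - means) / 127.5)     # ~[-0.97, -0.92, -0.82]
    print((brightest - means) / 127.5)   # ~[ 1.03,  1.08,  1.18]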
Each Example proto contains the following fields (values are included as examples): - image/height: 462 image/width: 581 image/colorspace: 'RGB' @@ -286,11 +258,9 @@ def _parse_example_proto(example_serialized): image/format: 'JPEG' image/filename: 'ILSVRC2012_val_00041207.JPEG' image/encoded: - Args: example_serialized: scalar Tensor tf.string containing a serialized Example protocol buffer. - Returns: image_buffer: Tensor tf.string containing the contents of a JPEG file. label: Tensor tf.int32 containing the label. @@ -336,15 +306,12 @@ def _parse_example_proto(example_serialized): def parse_record(raw_record, is_training): """Parses a record containing a training example of an image. - The input record is parsed into a label and image, and the image is passed through preprocessing steps (cropping, flipping, and so on). - Args: raw_record: scalar Tensor tf.string containing a serialized Example protocol buffer. is_training: A boolean denoting whether the input is for training. - Returns: Tuple with processed image tensor and one-hot-encoded label tensor. """ @@ -360,4 +327,4 @@ def parse_record(raw_record, is_training): label = tf.one_hot(tf.reshape(label, shape=[]), _NUM_CLASSES) - return image, label \ No newline at end of file + return image, label diff --git a/open_seq2seq/encoders/alexnet_encoder.py b/open_seq2seq/encoders/alexnet_encoder.py new file mode 100644 index 000000000..24081ce40 --- /dev/null +++ b/open_seq2seq/encoders/alexnet_encoder.py @@ -0,0 +1,65 @@ +# Copyright (c) 2018 NVIDIA Corporation +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals +from six.moves import range + +import tensorflow as tf +from .resnet_blocks import conv, pool +from .encoder import Encoder + + +class AlexNetEncoder(Encoder): + @staticmethod + def get_required_params(): + return dict(Encoder.get_required_params(), **{ + }) + + @staticmethod + def get_optional_params(): + return dict(Encoder.get_optional_params(), **{ + }) + + def __init__(self, params, model, name="resnet_encoder", mode='train'): + super(AlexNetEncoder, self).__init__(params, model, name, mode) + + def _encode(self, input_dict): + regularizer = self.params.get('regularizer', None) + data_format = self.params.get('data_format', 'channels_first') + if self.mode == 'train': + dropout_keep_prob = self.params.get('dropout_keep_prob', 0.5) + else: + dropout_keep_prob = 1.0 + + x = input_dict['source_tensors'][0] + + if data_format == 'channels_first': + x = tf.transpose(x, [0, 3, 1, 2]) + + x = conv(x, filters=64, kernel_size=(11, 11), strides=(4, 4), + data_format=data_format, padding='VALID', regularizer=regularizer) + x = pool(x, pool_size=(3, 3), data_format=data_format) + x = conv(x, filters=192, kernel_size=(5, 5), + data_format=data_format, regularizer=regularizer) + x = pool(x, pool_size=(3, 3), data_format=data_format) + x = conv(x, filters=384, kernel_size=(3, 3), + data_format=data_format, regularizer=regularizer) + x = conv(x, filters=256, kernel_size=(3, 3), + data_format=data_format, regularizer=regularizer) + x = conv(x, filters=256, kernel_size=(3, 3), + data_format=data_format, regularizer=regularizer) + x = pool(x, pool_size=(3, 3), data_format=data_format) + + if data_format == 'channels_first': + x = tf.transpose(x, [0, 2, 3, 1]) + input_shape = x.get_shape().as_list() + num_inputs = input_shape[1] * input_shape[2] * input_shape[3] + x = tf.reshape(x, [-1, num_inputs]) + + x = tf.layers.dense(x, 4096, activation=tf.nn.relu, + kernel_regularizer=regularizer) + 
x = tf.nn.dropout(x=x, keep_prob=dropout_keep_prob) + x = tf.layers.dense(x, 4096, activation=tf.nn.relu, + kernel_regularizer=regularizer) + x = tf.nn.dropout(x=x, keep_prob=dropout_keep_prob) + + return {'outputs': x} diff --git a/open_seq2seq/encoders/resnet_blocks.py b/open_seq2seq/encoders/resnet_blocks.py index 37dbc77da..e55a9841c 100644 --- a/open_seq2seq/encoders/resnet_blocks.py +++ b/open_seq2seq/encoders/resnet_blocks.py @@ -35,6 +35,21 @@ import tensorflow as tf +def conv(inputs, filters, kernel_size, + data_format, regularizer, strides=(1, 1), padding="SAME"): + output = tf.layers.conv2d( + inputs=inputs, filters=filters, kernel_size=kernel_size, strides=strides, + padding=padding, use_bias=True, + data_format=data_format, kernel_regularizer=regularizer) + output = tf.nn.relu(output) + return output + + +def pool(inputs, pool_size, data_format, strides=(2, 2), padding='VALID'): + return tf.layers.max_pooling2d(inputs, pool_size=pool_size, strides=strides, + padding=padding, data_format=data_format) + + ################################################################################ # Convenience functions for building the ResNet model. ################################################################################ From 3bd0b6503baa65941d0ec8474c7b32e781b2ab88 Mon Sep 17 00:00:00 2001 From: Kipok Date: Thu, 31 May 2018 19:54:59 -0700 Subject: [PATCH 016/102] Fix regularizer bug in encoder/decoder --- open_seq2seq/decoders/decoder.py | 40 +++++++++++++++++-------------- open_seq2seq/encoders/encoder.py | 41 ++++++++++++++++++-------------- 2 files changed, 45 insertions(+), 36 deletions(-) diff --git a/open_seq2seq/decoders/decoder.py b/open_seq2seq/decoders/decoder.py index e46d6ea79..92d8f7283 100644 --- a/open_seq2seq/decoders/decoder.py +++ b/open_seq2seq/decoders/decoder.py @@ -89,6 +89,7 @@ def __init__(self, params, model, name="decoder", mode='train'): self._name = name self._mode = mode + self._compiled = False def decode(self, input_dict): """Wrapper around :meth:`self._decode() <_decode>` method. @@ -101,25 +102,26 @@ def decode(self, input_dict): Returns: see :meth:`self._decode() <_decode>` docs. 
""" - if 'regularizer' not in self._params: - if self._model and 'regularizer' in self._model.params: - self._params['regularizer'] = copy.deepcopy( - self._model.params['regularizer'] - ) - self._params['regularizer_params'] = copy.deepcopy( - self._model.params['regularizer_params'] - ) - - if 'regularizer' in self._params: - init_dict = self._params.get('regularizer_params', {}) - self._params['regularizer'] = self._params['regularizer'](**init_dict) - if self._params['dtype'] == 'mixed': - self._params['regularizer'] = mp_regularizer_wrapper( - self._params['regularizer'], - ) + if not self._compiled: + if 'regularizer' not in self._params: + if self._model and 'regularizer' in self._model.params: + self._params['regularizer'] = copy.deepcopy( + self._model.params['regularizer'] + ) + self._params['regularizer_params'] = copy.deepcopy( + self._model.params['regularizer_params'] + ) + + if 'regularizer' in self._params: + init_dict = self._params.get('regularizer_params', {}) + self._params['regularizer'] = self._params['regularizer'](**init_dict) + if self._params['dtype'] == 'mixed': + self._params['regularizer'] = mp_regularizer_wrapper( + self._params['regularizer'], + ) - if self._params['dtype'] == 'mixed': - self._params['dtype'] = tf.float16 + if self._params['dtype'] == 'mixed': + self._params['dtype'] = tf.float16 if 'initializer' in self.params: init_dict = self.params.get('initializer_params', {}) @@ -127,6 +129,8 @@ def decode(self, input_dict): else: initializer = None + self._compiled = True + with tf.variable_scope(self._name, initializer=initializer, dtype=self.params['dtype']): return self._decode(self._cast_types(input_dict)) diff --git a/open_seq2seq/encoders/encoder.py b/open_seq2seq/encoders/encoder.py index 689308cef..c2e969c65 100644 --- a/open_seq2seq/encoders/encoder.py +++ b/open_seq2seq/encoders/encoder.py @@ -89,6 +89,7 @@ def __init__(self, params, model, name="encoder", mode='train'): self._name = name self._mode = mode + self._compiled = False def encode(self, input_dict): """Wrapper around :meth:`self._encode() <_encode>` method. @@ -101,31 +102,35 @@ def encode(self, input_dict): Returns: see :meth:`self._encode() <_encode>` docs. 
""" - if 'regularizer' not in self._params: - if self._model and 'regularizer' in self._model.params: - self._params['regularizer'] = copy.deepcopy( - self._model.params['regularizer'] - ) - self._params['regularizer_params'] = copy.deepcopy( - self._model.params['regularizer_params'] - ) - - if 'regularizer' in self._params: - init_dict = self._params.get('regularizer_params', {}) - self._params['regularizer'] = self._params['regularizer'](**init_dict) - if self._params['dtype'] == 'mixed': - self._params['regularizer'] = mp_regularizer_wrapper( - self._params['regularizer'], - ) + if not self._compiled: + if 'regularizer' not in self._params: + if self._model and 'regularizer' in self._model.params: + self._params['regularizer'] = copy.deepcopy( + self._model.params['regularizer'] + ) + self._params['regularizer_params'] = copy.deepcopy( + self._model.params['regularizer_params'] + ) + + if 'regularizer' in self._params: + init_dict = self._params.get('regularizer_params', {}) + self._params['regularizer'] = self._params['regularizer'](**init_dict) + if self._params['dtype'] == 'mixed': + self._params['regularizer'] = mp_regularizer_wrapper( + self._params['regularizer'], + ) - if self._params['dtype'] == 'mixed': - self._params['dtype'] = tf.float16 + if self._params['dtype'] == 'mixed': + self._params['dtype'] = tf.float16 if 'initializer' in self.params: init_dict = self.params.get('initializer_params', {}) initializer = self.params['initializer'](**init_dict) else: initializer = None + + self._compiled = True + with tf.variable_scope(self._name, initializer=initializer, dtype=self.params['dtype']): return self._encode(self._cast_types(input_dict)) From f0fff6790c755854239369bc229892738e63af6e Mon Sep 17 00:00:00 2001 From: Kipok Date: Thu, 31 May 2018 19:56:20 -0700 Subject: [PATCH 017/102] Fix regularizer bug in encoder/decoder --- open_seq2seq/decoders/decoder.py | 40 +++++++++++++++++-------------- open_seq2seq/encoders/encoder.py | 41 ++++++++++++++++++-------------- 2 files changed, 45 insertions(+), 36 deletions(-) diff --git a/open_seq2seq/decoders/decoder.py b/open_seq2seq/decoders/decoder.py index e46d6ea79..92d8f7283 100644 --- a/open_seq2seq/decoders/decoder.py +++ b/open_seq2seq/decoders/decoder.py @@ -89,6 +89,7 @@ def __init__(self, params, model, name="decoder", mode='train'): self._name = name self._mode = mode + self._compiled = False def decode(self, input_dict): """Wrapper around :meth:`self._decode() <_decode>` method. @@ -101,25 +102,26 @@ def decode(self, input_dict): Returns: see :meth:`self._decode() <_decode>` docs. 
""" - if 'regularizer' not in self._params: - if self._model and 'regularizer' in self._model.params: - self._params['regularizer'] = copy.deepcopy( - self._model.params['regularizer'] - ) - self._params['regularizer_params'] = copy.deepcopy( - self._model.params['regularizer_params'] - ) - - if 'regularizer' in self._params: - init_dict = self._params.get('regularizer_params', {}) - self._params['regularizer'] = self._params['regularizer'](**init_dict) - if self._params['dtype'] == 'mixed': - self._params['regularizer'] = mp_regularizer_wrapper( - self._params['regularizer'], - ) + if not self._compiled: + if 'regularizer' not in self._params: + if self._model and 'regularizer' in self._model.params: + self._params['regularizer'] = copy.deepcopy( + self._model.params['regularizer'] + ) + self._params['regularizer_params'] = copy.deepcopy( + self._model.params['regularizer_params'] + ) + + if 'regularizer' in self._params: + init_dict = self._params.get('regularizer_params', {}) + self._params['regularizer'] = self._params['regularizer'](**init_dict) + if self._params['dtype'] == 'mixed': + self._params['regularizer'] = mp_regularizer_wrapper( + self._params['regularizer'], + ) - if self._params['dtype'] == 'mixed': - self._params['dtype'] = tf.float16 + if self._params['dtype'] == 'mixed': + self._params['dtype'] = tf.float16 if 'initializer' in self.params: init_dict = self.params.get('initializer_params', {}) @@ -127,6 +129,8 @@ def decode(self, input_dict): else: initializer = None + self._compiled = True + with tf.variable_scope(self._name, initializer=initializer, dtype=self.params['dtype']): return self._decode(self._cast_types(input_dict)) diff --git a/open_seq2seq/encoders/encoder.py b/open_seq2seq/encoders/encoder.py index 689308cef..c2e969c65 100644 --- a/open_seq2seq/encoders/encoder.py +++ b/open_seq2seq/encoders/encoder.py @@ -89,6 +89,7 @@ def __init__(self, params, model, name="encoder", mode='train'): self._name = name self._mode = mode + self._compiled = False def encode(self, input_dict): """Wrapper around :meth:`self._encode() <_encode>` method. @@ -101,31 +102,35 @@ def encode(self, input_dict): Returns: see :meth:`self._encode() <_encode>` docs. 
""" - if 'regularizer' not in self._params: - if self._model and 'regularizer' in self._model.params: - self._params['regularizer'] = copy.deepcopy( - self._model.params['regularizer'] - ) - self._params['regularizer_params'] = copy.deepcopy( - self._model.params['regularizer_params'] - ) - - if 'regularizer' in self._params: - init_dict = self._params.get('regularizer_params', {}) - self._params['regularizer'] = self._params['regularizer'](**init_dict) - if self._params['dtype'] == 'mixed': - self._params['regularizer'] = mp_regularizer_wrapper( - self._params['regularizer'], - ) + if not self._compiled: + if 'regularizer' not in self._params: + if self._model and 'regularizer' in self._model.params: + self._params['regularizer'] = copy.deepcopy( + self._model.params['regularizer'] + ) + self._params['regularizer_params'] = copy.deepcopy( + self._model.params['regularizer_params'] + ) + + if 'regularizer' in self._params: + init_dict = self._params.get('regularizer_params', {}) + self._params['regularizer'] = self._params['regularizer'](**init_dict) + if self._params['dtype'] == 'mixed': + self._params['regularizer'] = mp_regularizer_wrapper( + self._params['regularizer'], + ) - if self._params['dtype'] == 'mixed': - self._params['dtype'] = tf.float16 + if self._params['dtype'] == 'mixed': + self._params['dtype'] = tf.float16 if 'initializer' in self.params: init_dict = self.params.get('initializer_params', {}) initializer = self.params['initializer'](**init_dict) else: initializer = None + + self._compiled = True + with tf.variable_scope(self._name, initializer=initializer, dtype=self.params['dtype']): return self._encode(self._cast_types(input_dict)) From c1e0a80110e3cdeb62aa7506d537941e4b97d703 Mon Sep 17 00:00:00 2001 From: Kipok Date: Thu, 31 May 2018 20:07:35 -0700 Subject: [PATCH 018/102] Add more configs --- .../{alexnet_owt.py => alexnet_000.py} | 2 +- example_configs/image2label/alexnet_001.py | 59 +++++++++++++++++++ example_configs/image2label/alexnet_002.py | 59 +++++++++++++++++++ example_configs/image2label/alexnet_003.py | 58 ++++++++++++++++++ example_configs/image2label/alexnet_004.py | 58 ++++++++++++++++++ example_configs/image2label/alexnet_005.py | 58 ++++++++++++++++++ 6 files changed, 293 insertions(+), 1 deletion(-) rename example_configs/image2label/{alexnet_owt.py => alexnet_000.py} (98%) create mode 100644 example_configs/image2label/alexnet_001.py create mode 100644 example_configs/image2label/alexnet_002.py create mode 100644 example_configs/image2label/alexnet_003.py create mode 100644 example_configs/image2label/alexnet_004.py create mode 100644 example_configs/image2label/alexnet_005.py diff --git a/example_configs/image2label/alexnet_owt.py b/example_configs/image2label/alexnet_000.py similarity index 98% rename from example_configs/image2label/alexnet_owt.py rename to example_configs/image2label/alexnet_000.py index 449c1b76a..39a341589 100644 --- a/example_configs/image2label/alexnet_owt.py +++ b/example_configs/image2label/alexnet_000.py @@ -40,7 +40,7 @@ "regularizer": tf.contrib.layers.l2_regularizer, "regularizer_params": { - 'scale': 0.0001, + 'scale': 0.0005, }, "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', 'variable_norm', 'gradient_norm', 'global_gradient_norm'], diff --git a/example_configs/image2label/alexnet_001.py b/example_configs/image2label/alexnet_001.py new file mode 100644 index 000000000..bf15055db --- /dev/null +++ b/example_configs/image2label/alexnet_001.py @@ -0,0 +1,59 @@ +from open_seq2seq.models 
import Image2Label +from open_seq2seq.encoders.alexnet_encoder import AlexNetEncoder +from open_seq2seq.decoders import FullyConnectedDecoder +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.data import ImagenetDataLayer +from open_seq2seq.optimizers.lr_policies import piecewise_constant +import tensorflow as tf + + +base_model = Image2Label + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 100, + + "num_gpus": 8, + "batch_size_per_gpu": 128, + "dtype": tf.float32, + + "save_summaries_steps": 2000, + "print_loss_steps": 100, + "print_samples_steps": 2000, + "eval_steps": 5000, + "save_checkpoint_steps": 5000, + "logdir": "experiments/alexnet-imagenet", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": piecewise_constant, + "lr_policy_params": { + "learning_rate": 0.01, + "boundaries": [30, 60, 80, 90], + "decay_rates": [0.1, 0.01, 0.001, 1e-4], + }, + + "initializer": tf.variance_scaling_initializer, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0005, + }, + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + "encoder": AlexNetEncoder, + "encoder_params": {}, + + "decoder": FullyConnectedDecoder, + "decoder_params": { + "output_dim": 1001, + }, + "loss": CrossEntropyLoss, + "data_layer": ImagenetDataLayer, + "data_layer_params": { + "data_dir": "data/tf-imagenet", + }, +} diff --git a/example_configs/image2label/alexnet_002.py b/example_configs/image2label/alexnet_002.py new file mode 100644 index 000000000..fa2be9fa8 --- /dev/null +++ b/example_configs/image2label/alexnet_002.py @@ -0,0 +1,59 @@ +from open_seq2seq.models import Image2Label +from open_seq2seq.encoders.alexnet_encoder import AlexNetEncoder +from open_seq2seq.decoders import FullyConnectedDecoder +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.data import ImagenetDataLayer +from open_seq2seq.optimizers.lr_policies import piecewise_constant +import tensorflow as tf + + +base_model = Image2Label + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 100, + + "num_gpus": 8, + "batch_size_per_gpu": 128, + "dtype": tf.float32, + + "save_summaries_steps": 2000, + "print_loss_steps": 100, + "print_samples_steps": 2000, + "eval_steps": 5000, + "save_checkpoint_steps": 5000, + "logdir": "experiments/alexnet-imagenet", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": piecewise_constant, + "lr_policy_params": { + "learning_rate": 0.05, + "boundaries": [30, 60, 80, 90], + "decay_rates": [0.1, 0.01, 0.001, 1e-4], + }, + + "initializer": tf.variance_scaling_initializer, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0005, + }, + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + "encoder": AlexNetEncoder, + "encoder_params": {}, + + "decoder": FullyConnectedDecoder, + "decoder_params": { + "output_dim": 1001, + }, + "loss": CrossEntropyLoss, + "data_layer": ImagenetDataLayer, + "data_layer_params": { + "data_dir": "data/tf-imagenet", + }, +} diff --git a/example_configs/image2label/alexnet_003.py b/example_configs/image2label/alexnet_003.py new file mode 100644 index 000000000..7d83a13d9 --- /dev/null +++ b/example_configs/image2label/alexnet_003.py @@ -0,0 +1,58 @@ +from open_seq2seq.models import 
Image2Label +from open_seq2seq.encoders.alexnet_encoder import AlexNetEncoder +from open_seq2seq.decoders import FullyConnectedDecoder +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.data import ImagenetDataLayer +from open_seq2seq.optimizers.lr_policies import poly_decay +import tensorflow as tf + + +base_model = Image2Label + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 100, + + "num_gpus": 8, + "batch_size_per_gpu": 128, + "dtype": tf.float32, + + "save_summaries_steps": 2000, + "print_loss_steps": 100, + "print_samples_steps": 2000, + "eval_steps": 5000, + "save_checkpoint_steps": 5000, + "logdir": "experiments/alexnet-imagenet", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": poly_decay, + "lr_policy_params": { + "learning_rate": 0.02, + "power": 2.0, + }, + + "initializer": tf.variance_scaling_initializer, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0005, + }, + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + "encoder": AlexNetEncoder, + "encoder_params": {}, + + "decoder": FullyConnectedDecoder, + "decoder_params": { + "output_dim": 1001, + }, + "loss": CrossEntropyLoss, + "data_layer": ImagenetDataLayer, + "data_layer_params": { + "data_dir": "data/tf-imagenet", + }, +} diff --git a/example_configs/image2label/alexnet_004.py b/example_configs/image2label/alexnet_004.py new file mode 100644 index 000000000..22fdd65db --- /dev/null +++ b/example_configs/image2label/alexnet_004.py @@ -0,0 +1,58 @@ +from open_seq2seq.models import Image2Label +from open_seq2seq.encoders.alexnet_encoder import AlexNetEncoder +from open_seq2seq.decoders import FullyConnectedDecoder +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.data import ImagenetDataLayer +from open_seq2seq.optimizers.lr_policies import poly_decay +import tensorflow as tf + + +base_model = Image2Label + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 100, + + "num_gpus": 8, + "batch_size_per_gpu": 128, + "dtype": tf.float32, + + "save_summaries_steps": 2000, + "print_loss_steps": 100, + "print_samples_steps": 2000, + "eval_steps": 5000, + "save_checkpoint_steps": 5000, + "logdir": "experiments/alexnet-imagenet", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": poly_decay, + "lr_policy_params": { + "learning_rate": 0.02, + "power": 1.0, + }, + + "initializer": tf.variance_scaling_initializer, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0005, + }, + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + "encoder": AlexNetEncoder, + "encoder_params": {}, + + "decoder": FullyConnectedDecoder, + "decoder_params": { + "output_dim": 1001, + }, + "loss": CrossEntropyLoss, + "data_layer": ImagenetDataLayer, + "data_layer_params": { + "data_dir": "data/tf-imagenet", + }, +} diff --git a/example_configs/image2label/alexnet_005.py b/example_configs/image2label/alexnet_005.py new file mode 100644 index 000000000..da1a4931d --- /dev/null +++ b/example_configs/image2label/alexnet_005.py @@ -0,0 +1,58 @@ +from open_seq2seq.models import Image2Label +from open_seq2seq.encoders.alexnet_encoder import AlexNetEncoder +from open_seq2seq.decoders import FullyConnectedDecoder +from open_seq2seq.losses 
import CrossEntropyLoss +from open_seq2seq.data import ImagenetDataLayer +from open_seq2seq.optimizers.lr_policies import poly_decay +import tensorflow as tf + + +base_model = Image2Label + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 100, + + "num_gpus": 8, + "batch_size_per_gpu": 128, + "dtype": tf.float32, + + "save_summaries_steps": 2000, + "print_loss_steps": 100, + "print_samples_steps": 2000, + "eval_steps": 5000, + "save_checkpoint_steps": 5000, + "logdir": "experiments/alexnet-imagenet", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": poly_decay, + "lr_policy_params": { + "learning_rate": 0.02, + "power": 0.5, + }, + + "initializer": tf.variance_scaling_initializer, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0005, + }, + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + "encoder": AlexNetEncoder, + "encoder_params": {}, + + "decoder": FullyConnectedDecoder, + "decoder_params": { + "output_dim": 1001, + }, + "loss": CrossEntropyLoss, + "data_layer": ImagenetDataLayer, + "data_layer_params": { + "data_dir": "data/tf-imagenet", + }, +} From a179a0a13b733a56cdc6bc07c1bd03ff75cbb303 Mon Sep 17 00:00:00 2001 From: Kipok Date: Fri, 1 Jun 2018 18:34:17 -0700 Subject: [PATCH 019/102] Fix regularization loss scaling bug --- open_seq2seq/optimizers/mp_wrapper.py | 23 +++++++++--------- open_seq2seq/optimizers/optimizers.py | 34 +++++---------------------- 2 files changed, 18 insertions(+), 39 deletions(-) diff --git a/open_seq2seq/optimizers/mp_wrapper.py b/open_seq2seq/optimizers/mp_wrapper.py index f81f7138d..5ed715f3b 100644 --- a/open_seq2seq/optimizers/mp_wrapper.py +++ b/open_seq2seq/optimizers/mp_wrapper.py @@ -10,23 +10,27 @@ class MixedPrecisionOptimizerWrapper(tf.train.Optimizer): - def __init__(self, optimizer, automatic_loss_scaler=None): + def __init__(self, optimizer, loss_scale=None): super(MixedPrecisionOptimizerWrapper, self).__init__( optimizer._use_locking, optimizer._name + '-MP', ) self._optimizer = optimizer self._fp32_to_fp16 = {} - self._loss_scaler = automatic_loss_scaler + if loss_scale is None: + self._loss_scale = 1.0 + elif isinstance(loss_scale, float): + self._loss_scale = loss_scale + elif isinstance(loss_scale, AutomaticLossScaler): + self._loss_scaler = loss_scale + self._loss_scale = self._loss_scaler.loss_scale def compute_gradients(self, loss, var_list=None, gate_gradients=tf.train.Optimizer.GATE_OP, aggregation_method=None, colocate_gradients_with_ops=False, grad_loss=None): - if self._loss_scaler: - loss *= self._loss_scaler.loss_scale - + loss *= self._loss_scale grads_and_vars_fp16 = self._optimizer.compute_gradients( loss, var_list=var_list, gate_gradients=gate_gradients, @@ -59,7 +63,7 @@ def compute_gradients(self, loss, var_list=None, fp32_grad = tf.cast(grad, tf.float32) # adding regularization part with respect to fp32 copy if var.name in reg_funcs: - fp32_grad += tf.gradients( + fp32_grad += self._loss_scale * tf.gradients( tf.contrib.layers.apply_regularization( reg_funcs[var.name], [fp32_var], @@ -70,11 +74,8 @@ def compute_gradients(self, loss, var_list=None, else: grads_and_vars_fp32.append((grad, var)) - # Unscale gradients if necessary - if self._loss_scaler: - grads_and_vars_fp32 = _scale_grads(grads_and_vars_fp32, - 1. 
/ self._loss_scaler.loss_scale) - + grads_and_vars_fp32 = _scale_grads(grads_and_vars_fp32, + 1.0 / self._loss_scale) return grads_and_vars_fp32 def apply_gradients(self, grads_and_vars, global_step=None, name=None): diff --git a/open_seq2seq/optimizers/optimizers.py b/open_seq2seq/optimizers/optimizers.py index d56c843bc..c448e9e04 100644 --- a/open_seq2seq/optimizers/optimizers.py +++ b/open_seq2seq/optimizers/optimizers.py @@ -323,32 +323,24 @@ class should be sub-class of `tf.Optimizer` that implements variables = vars_.trainable_variables() if automatic_loss_scaling is not None: - if not automatic_loss_scaling in AutomaticLossScaler.SUPPORTED_ALGOS: + if automatic_loss_scaling not in AutomaticLossScaler.SUPPORTED_ALGOS: raise ValueError("Unknown automatic loss scaling algorithm: %s." % automatic_loss_sclaing) if dtype != "mixed": raise ValueError("Automatic loss scaling can be used only with " "dtype=mixed.") - loss_scaler = AutomaticLossScaler(algorithm=automatic_loss_scaling) - else: - loss_scaler = None + loss_scale = AutomaticLossScaler(algorithm=automatic_loss_scaling) if dtype == 'mixed': - opt = MixedPrecisionOptimizerWrapper( - opt, - automatic_loss_scaler=loss_scaler, - ) + opt = MixedPrecisionOptimizerWrapper(opt, loss_scale=loss_scale) if on_horovod: opt = DistributedOptimizer(opt) # Compute gradients. gradients = opt.compute_gradients( - loss if loss_scale == 1.0 else loss * loss_scale, - variables, - colocate_gradients_with_ops=colocate_gradients_with_ops) - - if loss_scale != 1.0: - gradients = _multiply_gradients_const(gradients, 1.0 / loss_scale) + loss, variables, + colocate_gradients_with_ops=colocate_gradients_with_ops, + ) # Optionally add gradient noise. if gradient_noise_scale is not None: @@ -616,17 +608,3 @@ def _multiply_gradients(grads_and_vars, gradient_multipliers): grad *= multiplier multiplied_grads_and_vars.append((grad, var)) return multiplied_grads_and_vars - - -def _multiply_gradients_const(grads_and_vars, multiplier): - """Multiply specified gradients.""" - multiplied_grads_and_vars = [] - for grad, var in grads_and_vars: - if grad is not None: - if isinstance(grad, ops.IndexedSlices): - grad_values = grad.values * multiplier - grad = ops.IndexedSlices(grad_values, grad.indices, grad.dense_shape) - else: - grad *= multiplier - multiplied_grads_and_vars.append((grad, var)) - return multiplied_grads_and_vars From 7d3f0832586fd07b9d2f7bb13738c8d312e57edb Mon Sep 17 00:00:00 2001 From: Kipok Date: Fri, 1 Jun 2018 18:49:41 -0700 Subject: [PATCH 020/102] Bug fix --- open_seq2seq/optimizers/mp_wrapper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/open_seq2seq/optimizers/mp_wrapper.py b/open_seq2seq/optimizers/mp_wrapper.py index 5ed715f3b..e863a72af 100644 --- a/open_seq2seq/optimizers/mp_wrapper.py +++ b/open_seq2seq/optimizers/mp_wrapper.py @@ -17,6 +17,7 @@ def __init__(self, optimizer, loss_scale=None): ) self._optimizer = optimizer self._fp32_to_fp16 = {} + self._loss_scaler = None if loss_scale is None: self._loss_scale = 1.0 elif isinstance(loss_scale, float): From d53d98ec192b7debf61842baff0bf3808beae2b0 Mon Sep 17 00:00:00 2001 From: Kipok Date: Mon, 4 Jun 2018 12:59:09 -0700 Subject: [PATCH 021/102] Change cast to saturate_cast --- open_seq2seq/optimizers/mp_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/open_seq2seq/optimizers/mp_wrapper.py b/open_seq2seq/optimizers/mp_wrapper.py index e863a72af..92942b304 100644 --- a/open_seq2seq/optimizers/mp_wrapper.py +++ b/open_seq2seq/optimizers/mp_wrapper.py @@ -90,7 
+90,7 @@ def apply_ops_wrapper(): if var.name in self._fp32_to_fp16: dst_var = self._fp32_to_fp16[var.name] apply_ops.append( - tf.assign(dst_var, tf.cast(var, tf.float16))) + tf.assign(dst_var, tf.saturate_cast(var, tf.float16))) if apply_ops: return tf.group(apply_ops) return update_op From bf2e8295c72b33d477a07ea678c7a8e55dd110bc Mon Sep 17 00:00:00 2001 From: Kipok Date: Mon, 4 Jun 2018 13:45:34 -0700 Subject: [PATCH 022/102] Add image_size, num_classes params for image2label --- example_configs/image2label/resnet-50-v2.py | 3 +- open_seq2seq/data/image2label/image2label.py | 9 ++++- .../image2label/imagenet_preprocessing.py | 34 +++++++++++++------ open_seq2seq/encoders/resnet_encoder.py | 1 + 4 files changed, 35 insertions(+), 12 deletions(-) diff --git a/example_configs/image2label/resnet-50-v2.py b/example_configs/image2label/resnet-50-v2.py index 856e44f21..43981118c 100644 --- a/example_configs/image2label/resnet-50-v2.py +++ b/example_configs/image2label/resnet-50-v2.py @@ -51,11 +51,12 @@ }, "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1001, + "output_dim": 1000, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", + "image_size": 224, }, } diff --git a/open_seq2seq/data/image2label/image2label.py b/open_seq2seq/data/image2label/image2label.py index c37696772..98d4dcf55 100644 --- a/open_seq2seq/data/image2label/image2label.py +++ b/open_seq2seq/data/image2label/image2label.py @@ -24,6 +24,8 @@ def get_optional_params(): return dict(DataLayer.get_optional_params(), **{ 'num_parallel_calls': int, 'shuffle_buffer': int, + 'image_size': int, + 'num_classes': int, }) def __init__(self, params, model, num_workers, worker_id): @@ -76,7 +78,12 @@ def build_graph(self): dataset = dataset.repeat() dataset = dataset.map( - lambda value: parse_record(value, self.params['mode'] == 'train'), + lambda value: parse_record( + raw_record=value, + is_training=self.params['mode'] == 'train', + image_size=self.params.get('image_size', 224), + num_classes=self.params.get('num_classes', 1000), + ), num_parallel_calls=self.params.get('num_parallel_calls', 16), ) diff --git a/open_seq2seq/data/image2label/imagenet_preprocessing.py b/open_seq2seq/data/image2label/imagenet_preprocessing.py index f76797a5c..07f5012ef 100644 --- a/open_seq2seq/data/image2label/imagenet_preprocessing.py +++ b/open_seq2seq/data/image2label/imagenet_preprocessing.py @@ -42,22 +42,20 @@ # _RESIZE_MIN x (_RESIZE_MIN * 2). _RESIZE_MIN = 256 -_DEFAULT_IMAGE_SIZE = 224 -_NUM_CHANNELS = 3 -_NUM_CLASSES = 1001 - def _decode_crop_and_flip(image_buffer, bbox, num_channels): """Crops the given image to a random part of the image, and randomly flips. We use the fused decode_and_crop op, which performs better than the two ops used separately in series, but note that this requires that the image be passed in as an un-decoded string Tensor. + Args: image_buffer: scalar string Tensor representing the raw JPEG image buffer. bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] where each coordinate is [0, 1) and the coordinates are arranged as [ymin, xmin, ymax, xmax]. num_channels: Integer depth of the image buffer for decoding. + Returns: 3-D tensor with cropped image. """ @@ -94,10 +92,12 @@ def _decode_crop_and_flip(image_buffer, bbox, num_channels): def _central_crop(image, crop_height, crop_width): """Performs central crops of the given image list. 
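Regarding the switch from tf.cast to tf.saturate_cast in the patch above: when the float32 master weights are copied back into the float16 variables, saturate_cast first clamps values to the representable range of the target dtype, so magnitudes beyond float16's maximum (about 65504) become finite extremes rather than inf. A small sketch of the difference:

    import tensorflow as tf

    x = tf.constant([70000.0, -70000.0, 1.0], dtype=tf.float32)
    plain = tf.cast(x, tf.float16)             # -> [inf, -inf, 1.0]
    clamped = tf.saturate_cast(x, tf.float16)  # -> [65504., -65504., 1.0]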
+ Args: image: a 3-D image tensor crop_height: the height of the image following the crop. crop_width: the width of the image following the crop. + Returns: 3-D tensor with cropped image. """ @@ -150,11 +150,13 @@ def _smallest_size_at_least(height, width, resize_min): """Computes new shape with the smallest side equal to `smallest_side`. Computes new shape with the smallest side equal to `smallest_side` while preserving the original aspect ratio. + Args: height: an int32 scalar tensor indicating the current height. width: an int32 scalar tensor indicating the current width. resize_min: A python integer or scalar `Tensor` indicating the size of the smallest side after resize. + Returns: new_height: an int32 scalar tensor indicating the new height. new_width: an int32 scalar tensor indicating the new width. @@ -176,10 +178,12 @@ def _smallest_size_at_least(height, width, resize_min): def _aspect_preserving_resize(image, resize_min): """Resize images preserving the original aspect ratio. + Args: image: A 3-D image `Tensor`. resize_min: A python integer or scalar `Tensor` indicating the size of the smallest side after resize. + Returns: resized_image: A 3-D tensor containing the resized image. """ @@ -195,10 +199,12 @@ def _resize_image(image, height, width): """Simple wrapper around tf.resize_images. This is primarily to make sure we use the same `ResizeMethod` and other details each time. + Args: image: A 3-D image `Tensor`. height: The target height for the resized image. width: The target width for the resized image. + Returns: resized_image: A 3-D tensor containing the resized image. The first two dimensions have the shape [height, width]. @@ -214,6 +220,7 @@ def preprocess_image(image_buffer, bbox, output_height, output_width, Preprocessing includes decoding, cropping, and resizing for both training and eval images. Training preprocessing, however, introduces some random distortion of the image to improve accuracy. + Args: image_buffer: scalar string Tensor representing the raw JPEG image buffer. bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] @@ -224,6 +231,7 @@ def preprocess_image(image_buffer, bbox, output_height, output_width, num_channels: Integer depth of the image buffer for decoding. is_training: `True` if we're preprocessing the image for training and `False` otherwise. + Returns: A preprocessed image. """ @@ -263,9 +271,11 @@ def _parse_example_proto(example_serialized): image/format: 'JPEG' image/filename: 'ILSVRC2012_val_00041207.JPEG' image/encoded: + Args: example_serialized: scalar Tensor tf.string containing a serialized Example protocol buffer. + Returns: image_buffer: Tensor tf.string containing the contents of a JPEG file. label: Tensor tf.int32 containing the label. @@ -309,14 +319,18 @@ def _parse_example_proto(example_serialized): return features['image/encoded'], label, bbox -def parse_record(raw_record, is_training): +def parse_record(raw_record, is_training, image_size=224, num_classes=1000): """Parses a record containing a training example of an image. The input record is parsed into a label and image, and the image is passed through preprocessing steps (cropping, flipping, and so on). + Args: raw_record: scalar Tensor tf.string containing a serialized - Example protocol buffer. + Example protocol buffer. is_training: A boolean denoting whether the input is for training. + image_size (int): size that images should be resized to. + num_classes (int): number of output classes. 
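A hypothetical data-layer configuration using the new parameters, mirroring the parse_record call in build_graph above (the values are illustrative; 227 is the AlexNet-style crop size while the ResNet configs keep the 224 default):

    # Hypothetical data-layer params exercising the new options.
    data_layer_params = {
        "data_dir": "data/tf-imagenet",
        "image_size": 227,    # crop size passed through to parse_record
        "num_classes": 1000,  # matches the default one-hot depth
    }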
+ Returns: Tuple with processed image tensor and one-hot-encoded label tensor. """ @@ -325,11 +339,11 @@ def parse_record(raw_record, is_training): image = preprocess_image( image_buffer=image_buffer, bbox=bbox, - output_height=_DEFAULT_IMAGE_SIZE, - output_width=_DEFAULT_IMAGE_SIZE, - num_channels=_NUM_CHANNELS, + output_height=image_size, + output_width=image_size, + num_channels=3, is_training=is_training) - label = tf.one_hot(tf.reshape(label, shape=[]), _NUM_CLASSES) + label = tf.one_hot(tf.reshape(label, shape=[]), num_classes) return image, label diff --git a/open_seq2seq/encoders/resnet_encoder.py b/open_seq2seq/encoders/resnet_encoder.py index 8ad1fff89..3526d3876 100644 --- a/open_seq2seq/encoders/resnet_encoder.py +++ b/open_seq2seq/encoders/resnet_encoder.py @@ -135,6 +135,7 @@ def _encode(self, input_dict): regularizer=regularizer, bn_regularizer=bn_regularizer, bn_momentum=bn_momentum, bn_epsilon=bn_epsilon, ) + print(inputs.shape) if version == 2: inputs = batch_norm(inputs, training, data_format, regularizer=bn_regularizer, From fec684cee03dca237992914250aff4baa5bda392 Mon Sep 17 00:00:00 2001 From: Kipok Date: Mon, 4 Jun 2018 14:09:09 -0700 Subject: [PATCH 023/102] Configs update --- example_configs/image2label/alexnet_000.py | 3 ++- example_configs/image2label/alexnet_001.py | 3 ++- example_configs/image2label/alexnet_002.py | 3 ++- example_configs/image2label/alexnet_003.py | 3 ++- example_configs/image2label/alexnet_004.py | 3 ++- example_configs/image2label/alexnet_005.py | 3 ++- example_configs/image2label/resnet-50-v2-mp_000.py | 3 ++- example_configs/image2label/resnet-50-v2-mp_001.py | 5 +++-- example_configs/image2label/resnet-50-v2-mp_002.py | 3 ++- example_configs/image2label/resnet-50-v2-mp_003.py | 3 ++- example_configs/image2label/resnet-50-v2-mp_004.py | 3 ++- open_seq2seq/encoders/resnet_encoder.py | 1 - 12 files changed, 23 insertions(+), 13 deletions(-) diff --git a/example_configs/image2label/alexnet_000.py b/example_configs/image2label/alexnet_000.py index 39a341589..6151de8ce 100644 --- a/example_configs/image2label/alexnet_000.py +++ b/example_configs/image2label/alexnet_000.py @@ -49,11 +49,12 @@ "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1001, + "output_dim": 1000, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", + "image_size": 227, }, } diff --git a/example_configs/image2label/alexnet_001.py b/example_configs/image2label/alexnet_001.py index bf15055db..0ad65d06a 100644 --- a/example_configs/image2label/alexnet_001.py +++ b/example_configs/image2label/alexnet_001.py @@ -49,11 +49,12 @@ "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1001, + "output_dim": 1000, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", + "image_size": 227, }, } diff --git a/example_configs/image2label/alexnet_002.py b/example_configs/image2label/alexnet_002.py index fa2be9fa8..9ae591a71 100644 --- a/example_configs/image2label/alexnet_002.py +++ b/example_configs/image2label/alexnet_002.py @@ -49,11 +49,12 @@ "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1001, + "output_dim": 1000, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", + "image_size": 227, }, } diff --git a/example_configs/image2label/alexnet_003.py b/example_configs/image2label/alexnet_003.py index 7d83a13d9..e7efbaaec 100644 --- 
a/example_configs/image2label/alexnet_003.py +++ b/example_configs/image2label/alexnet_003.py @@ -48,11 +48,12 @@ "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1001, + "output_dim": 1000, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", + "image_size": 227, }, } diff --git a/example_configs/image2label/alexnet_004.py b/example_configs/image2label/alexnet_004.py index 22fdd65db..b7f2c91cc 100644 --- a/example_configs/image2label/alexnet_004.py +++ b/example_configs/image2label/alexnet_004.py @@ -48,11 +48,12 @@ "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1001, + "output_dim": 1000, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", + "image_size": 227, }, } diff --git a/example_configs/image2label/alexnet_005.py b/example_configs/image2label/alexnet_005.py index da1a4931d..5a3254411 100644 --- a/example_configs/image2label/alexnet_005.py +++ b/example_configs/image2label/alexnet_005.py @@ -48,11 +48,12 @@ "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1001, + "output_dim": 1000, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", + "image_size": 227, }, } diff --git a/example_configs/image2label/resnet-50-v2-mp_000.py b/example_configs/image2label/resnet-50-v2-mp_000.py index f00de256a..57f2f19da 100644 --- a/example_configs/image2label/resnet-50-v2-mp_000.py +++ b/example_configs/image2label/resnet-50-v2-mp_000.py @@ -52,11 +52,12 @@ }, "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1001, + "output_dim": 1000, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", + "image_size": 224, }, } diff --git a/example_configs/image2label/resnet-50-v2-mp_001.py b/example_configs/image2label/resnet-50-v2-mp_001.py index b8ec5e9e3..75941bdb9 100644 --- a/example_configs/image2label/resnet-50-v2-mp_001.py +++ b/example_configs/image2label/resnet-50-v2-mp_001.py @@ -50,13 +50,14 @@ 'resnet_size': 50, "regularize_bn": False, }, - "decoder": FullyConnectedDecoder, + "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1001, + "output_dim": 1000, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", + "image_size": 224, }, } diff --git a/example_configs/image2label/resnet-50-v2-mp_002.py b/example_configs/image2label/resnet-50-v2-mp_002.py index ee20eba95..a50d8f1c2 100644 --- a/example_configs/image2label/resnet-50-v2-mp_002.py +++ b/example_configs/image2label/resnet-50-v2-mp_002.py @@ -52,11 +52,12 @@ }, "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1001, + "output_dim": 1000, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", + "image_size": 224, }, } diff --git a/example_configs/image2label/resnet-50-v2-mp_003.py b/example_configs/image2label/resnet-50-v2-mp_003.py index 89deede0f..6101c51e8 100644 --- a/example_configs/image2label/resnet-50-v2-mp_003.py +++ b/example_configs/image2label/resnet-50-v2-mp_003.py @@ -52,11 +52,12 @@ }, "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1001, + "output_dim": 1000, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", + "image_size": 224, }, } diff --git 
a/example_configs/image2label/resnet-50-v2-mp_004.py b/example_configs/image2label/resnet-50-v2-mp_004.py index 0c9228731..475de58a3 100644 --- a/example_configs/image2label/resnet-50-v2-mp_004.py +++ b/example_configs/image2label/resnet-50-v2-mp_004.py @@ -52,11 +52,12 @@ }, "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1001, + "output_dim": 1000, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", + "image_size": 224, }, } diff --git a/open_seq2seq/encoders/resnet_encoder.py b/open_seq2seq/encoders/resnet_encoder.py index 3526d3876..8ad1fff89 100644 --- a/open_seq2seq/encoders/resnet_encoder.py +++ b/open_seq2seq/encoders/resnet_encoder.py @@ -135,7 +135,6 @@ def _encode(self, input_dict): regularizer=regularizer, bn_regularizer=bn_regularizer, bn_momentum=bn_momentum, bn_epsilon=bn_epsilon, ) - print(inputs.shape) if version == 2: inputs = batch_norm(inputs, training, data_format, regularizer=bn_regularizer, From 017d986c339c1d077cf2848b29e1afefed4cdf31 Mon Sep 17 00:00:00 2001 From: Kipok Date: Mon, 4 Jun 2018 18:03:43 -0700 Subject: [PATCH 024/102] Replace alexnet encoder with cnn encoder --- example_configs/image2label/alexnet_000.py | 60 --------------- example_configs/image2label/alexnet_001.py | 60 --------------- example_configs/image2label/alexnet_002.py | 60 --------------- example_configs/image2label/alexnet_003.py | 59 --------------- example_configs/image2label/alexnet_004.py | 50 +++++++++++- example_configs/image2label/alexnet_005.py | 59 --------------- open_seq2seq/encoders/alexnet_encoder.py | 65 ---------------- open_seq2seq/encoders/cnn_encoder.py | 88 ++++++++++++++++++++++ open_seq2seq/encoders/resnet_blocks.py | 15 ---- 9 files changed, 135 insertions(+), 381 deletions(-) delete mode 100644 example_configs/image2label/alexnet_000.py delete mode 100644 example_configs/image2label/alexnet_001.py delete mode 100644 example_configs/image2label/alexnet_002.py delete mode 100644 example_configs/image2label/alexnet_003.py delete mode 100644 example_configs/image2label/alexnet_005.py delete mode 100644 open_seq2seq/encoders/alexnet_encoder.py create mode 100644 open_seq2seq/encoders/cnn_encoder.py diff --git a/example_configs/image2label/alexnet_000.py b/example_configs/image2label/alexnet_000.py deleted file mode 100644 index 6151de8ce..000000000 --- a/example_configs/image2label/alexnet_000.py +++ /dev/null @@ -1,60 +0,0 @@ -from open_seq2seq.models import Image2Label -from open_seq2seq.encoders.alexnet_encoder import AlexNetEncoder -from open_seq2seq.decoders import FullyConnectedDecoder -from open_seq2seq.losses import CrossEntropyLoss -from open_seq2seq.data import ImagenetDataLayer -from open_seq2seq.optimizers.lr_policies import piecewise_constant -import tensorflow as tf - - -base_model = Image2Label - -base_params = { - "random_seed": 0, - "use_horovod": False, - "num_epochs": 100, - - "num_gpus": 8, - "batch_size_per_gpu": 128, - "dtype": tf.float32, - - "save_summaries_steps": 2000, - "print_loss_steps": 100, - "print_samples_steps": 2000, - "eval_steps": 5000, - "save_checkpoint_steps": 5000, - "logdir": "experiments/alexnet-imagenet", - - "optimizer": "Momentum", - "optimizer_params": { - "momentum": 0.90, - }, - "lr_policy": piecewise_constant, - "lr_policy_params": { - "learning_rate": 0.02, - "boundaries": [30, 60, 80, 90], - "decay_rates": [0.1, 0.01, 0.001, 1e-4], - }, - - "initializer": tf.variance_scaling_initializer, - - "regularizer": 
tf.contrib.layers.l2_regularizer, - "regularizer_params": { - 'scale': 0.0005, - }, - "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - "encoder": AlexNetEncoder, - "encoder_params": {}, - - "decoder": FullyConnectedDecoder, - "decoder_params": { - "output_dim": 1000, - }, - "loss": CrossEntropyLoss, - "data_layer": ImagenetDataLayer, - "data_layer_params": { - "data_dir": "data/tf-imagenet", - "image_size": 227, - }, -} diff --git a/example_configs/image2label/alexnet_001.py b/example_configs/image2label/alexnet_001.py deleted file mode 100644 index 0ad65d06a..000000000 --- a/example_configs/image2label/alexnet_001.py +++ /dev/null @@ -1,60 +0,0 @@ -from open_seq2seq.models import Image2Label -from open_seq2seq.encoders.alexnet_encoder import AlexNetEncoder -from open_seq2seq.decoders import FullyConnectedDecoder -from open_seq2seq.losses import CrossEntropyLoss -from open_seq2seq.data import ImagenetDataLayer -from open_seq2seq.optimizers.lr_policies import piecewise_constant -import tensorflow as tf - - -base_model = Image2Label - -base_params = { - "random_seed": 0, - "use_horovod": False, - "num_epochs": 100, - - "num_gpus": 8, - "batch_size_per_gpu": 128, - "dtype": tf.float32, - - "save_summaries_steps": 2000, - "print_loss_steps": 100, - "print_samples_steps": 2000, - "eval_steps": 5000, - "save_checkpoint_steps": 5000, - "logdir": "experiments/alexnet-imagenet", - - "optimizer": "Momentum", - "optimizer_params": { - "momentum": 0.90, - }, - "lr_policy": piecewise_constant, - "lr_policy_params": { - "learning_rate": 0.01, - "boundaries": [30, 60, 80, 90], - "decay_rates": [0.1, 0.01, 0.001, 1e-4], - }, - - "initializer": tf.variance_scaling_initializer, - - "regularizer": tf.contrib.layers.l2_regularizer, - "regularizer_params": { - 'scale': 0.0005, - }, - "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - "encoder": AlexNetEncoder, - "encoder_params": {}, - - "decoder": FullyConnectedDecoder, - "decoder_params": { - "output_dim": 1000, - }, - "loss": CrossEntropyLoss, - "data_layer": ImagenetDataLayer, - "data_layer_params": { - "data_dir": "data/tf-imagenet", - "image_size": 227, - }, -} diff --git a/example_configs/image2label/alexnet_002.py b/example_configs/image2label/alexnet_002.py deleted file mode 100644 index 9ae591a71..000000000 --- a/example_configs/image2label/alexnet_002.py +++ /dev/null @@ -1,60 +0,0 @@ -from open_seq2seq.models import Image2Label -from open_seq2seq.encoders.alexnet_encoder import AlexNetEncoder -from open_seq2seq.decoders import FullyConnectedDecoder -from open_seq2seq.losses import CrossEntropyLoss -from open_seq2seq.data import ImagenetDataLayer -from open_seq2seq.optimizers.lr_policies import piecewise_constant -import tensorflow as tf - - -base_model = Image2Label - -base_params = { - "random_seed": 0, - "use_horovod": False, - "num_epochs": 100, - - "num_gpus": 8, - "batch_size_per_gpu": 128, - "dtype": tf.float32, - - "save_summaries_steps": 2000, - "print_loss_steps": 100, - "print_samples_steps": 2000, - "eval_steps": 5000, - "save_checkpoint_steps": 5000, - "logdir": "experiments/alexnet-imagenet", - - "optimizer": "Momentum", - "optimizer_params": { - "momentum": 0.90, - }, - "lr_policy": piecewise_constant, - "lr_policy_params": { - "learning_rate": 0.05, - "boundaries": [30, 60, 80, 90], - "decay_rates": [0.1, 0.01, 0.001, 1e-4], - }, - - "initializer": 
tf.variance_scaling_initializer, - - "regularizer": tf.contrib.layers.l2_regularizer, - "regularizer_params": { - 'scale': 0.0005, - }, - "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - "encoder": AlexNetEncoder, - "encoder_params": {}, - - "decoder": FullyConnectedDecoder, - "decoder_params": { - "output_dim": 1000, - }, - "loss": CrossEntropyLoss, - "data_layer": ImagenetDataLayer, - "data_layer_params": { - "data_dir": "data/tf-imagenet", - "image_size": 227, - }, -} diff --git a/example_configs/image2label/alexnet_003.py b/example_configs/image2label/alexnet_003.py deleted file mode 100644 index e7efbaaec..000000000 --- a/example_configs/image2label/alexnet_003.py +++ /dev/null @@ -1,59 +0,0 @@ -from open_seq2seq.models import Image2Label -from open_seq2seq.encoders.alexnet_encoder import AlexNetEncoder -from open_seq2seq.decoders import FullyConnectedDecoder -from open_seq2seq.losses import CrossEntropyLoss -from open_seq2seq.data import ImagenetDataLayer -from open_seq2seq.optimizers.lr_policies import poly_decay -import tensorflow as tf - - -base_model = Image2Label - -base_params = { - "random_seed": 0, - "use_horovod": False, - "num_epochs": 100, - - "num_gpus": 8, - "batch_size_per_gpu": 128, - "dtype": tf.float32, - - "save_summaries_steps": 2000, - "print_loss_steps": 100, - "print_samples_steps": 2000, - "eval_steps": 5000, - "save_checkpoint_steps": 5000, - "logdir": "experiments/alexnet-imagenet", - - "optimizer": "Momentum", - "optimizer_params": { - "momentum": 0.90, - }, - "lr_policy": poly_decay, - "lr_policy_params": { - "learning_rate": 0.02, - "power": 2.0, - }, - - "initializer": tf.variance_scaling_initializer, - - "regularizer": tf.contrib.layers.l2_regularizer, - "regularizer_params": { - 'scale': 0.0005, - }, - "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - "encoder": AlexNetEncoder, - "encoder_params": {}, - - "decoder": FullyConnectedDecoder, - "decoder_params": { - "output_dim": 1000, - }, - "loss": CrossEntropyLoss, - "data_layer": ImagenetDataLayer, - "data_layer_params": { - "data_dir": "data/tf-imagenet", - "image_size": 227, - }, -} diff --git a/example_configs/image2label/alexnet_004.py b/example_configs/image2label/alexnet_004.py index b7f2c91cc..37bee06f5 100644 --- a/example_configs/image2label/alexnet_004.py +++ b/example_configs/image2label/alexnet_004.py @@ -1,5 +1,5 @@ from open_seq2seq.models import Image2Label -from open_seq2seq.encoders.alexnet_encoder import AlexNetEncoder +from open_seq2seq.encoders.cnn_encoder import CNNEncoder from open_seq2seq.decoders import FullyConnectedDecoder from open_seq2seq.losses import CrossEntropyLoss from open_seq2seq.data import ImagenetDataLayer @@ -43,8 +43,52 @@ }, "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - "encoder": AlexNetEncoder, - "encoder_params": {}, + "encoder": CNNEncoder, + "encoder_params": { + 'data_format': 'channels_first', + 'cnn_layers': [ + (tf.layers.conv2d, { + 'filters': 64, 'kernel_size': (11, 11), + 'strides': (4, 4), 'padding': 'VALID', + 'activation': tf.nn.relu, + }), + (tf.layers.max_pooling2d, { + 'pool_size': (3, 3), 'strides': (2, 2), + }), + (tf.layers.conv2d, { + 'filters': 192, 'kernel_size': (5, 5), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.max_pooling2d, { + 
'pool_size': (3, 3), 'strides': (2, 2), + }), + (tf.layers.conv2d, { + 'filters': 384, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 256, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 256, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.max_pooling2d, { + 'pool_size': (3, 3), 'strides': (2, 2), + }), + ], + 'fc_layers': [ + (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), + (tf.nn.dropout, {'keep_prob': 0.5}), + (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), + (tf.nn.dropout, {'keep_prob': 0.5}), + ], + }, "decoder": FullyConnectedDecoder, "decoder_params": { diff --git a/example_configs/image2label/alexnet_005.py b/example_configs/image2label/alexnet_005.py deleted file mode 100644 index 5a3254411..000000000 --- a/example_configs/image2label/alexnet_005.py +++ /dev/null @@ -1,59 +0,0 @@ -from open_seq2seq.models import Image2Label -from open_seq2seq.encoders.alexnet_encoder import AlexNetEncoder -from open_seq2seq.decoders import FullyConnectedDecoder -from open_seq2seq.losses import CrossEntropyLoss -from open_seq2seq.data import ImagenetDataLayer -from open_seq2seq.optimizers.lr_policies import poly_decay -import tensorflow as tf - - -base_model = Image2Label - -base_params = { - "random_seed": 0, - "use_horovod": False, - "num_epochs": 100, - - "num_gpus": 8, - "batch_size_per_gpu": 128, - "dtype": tf.float32, - - "save_summaries_steps": 2000, - "print_loss_steps": 100, - "print_samples_steps": 2000, - "eval_steps": 5000, - "save_checkpoint_steps": 5000, - "logdir": "experiments/alexnet-imagenet", - - "optimizer": "Momentum", - "optimizer_params": { - "momentum": 0.90, - }, - "lr_policy": poly_decay, - "lr_policy_params": { - "learning_rate": 0.02, - "power": 0.5, - }, - - "initializer": tf.variance_scaling_initializer, - - "regularizer": tf.contrib.layers.l2_regularizer, - "regularizer_params": { - 'scale': 0.0005, - }, - "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - "encoder": AlexNetEncoder, - "encoder_params": {}, - - "decoder": FullyConnectedDecoder, - "decoder_params": { - "output_dim": 1000, - }, - "loss": CrossEntropyLoss, - "data_layer": ImagenetDataLayer, - "data_layer_params": { - "data_dir": "data/tf-imagenet", - "image_size": 227, - }, -} diff --git a/open_seq2seq/encoders/alexnet_encoder.py b/open_seq2seq/encoders/alexnet_encoder.py deleted file mode 100644 index 24081ce40..000000000 --- a/open_seq2seq/encoders/alexnet_encoder.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2018 NVIDIA Corporation -from __future__ import absolute_import, division, print_function -from __future__ import unicode_literals -from six.moves import range - -import tensorflow as tf -from .resnet_blocks import conv, pool -from .encoder import Encoder - - -class AlexNetEncoder(Encoder): - @staticmethod - def get_required_params(): - return dict(Encoder.get_required_params(), **{ - }) - - @staticmethod - def get_optional_params(): - return dict(Encoder.get_optional_params(), **{ - }) - - def __init__(self, params, model, name="resnet_encoder", mode='train'): - super(AlexNetEncoder, self).__init__(params, model, name, mode) - - def _encode(self, input_dict): - regularizer = self.params.get('regularizer', None) - data_format = 
self.params.get('data_format', 'channels_first') - if self.mode == 'train': - dropout_keep_prob = self.params.get('dropout_keep_prob', 0.5) - else: - dropout_keep_prob = 1.0 - - x = input_dict['source_tensors'][0] - - if data_format == 'channels_first': - x = tf.transpose(x, [0, 3, 1, 2]) - - x = conv(x, filters=64, kernel_size=(11, 11), strides=(4, 4), - data_format=data_format, padding='VALID', regularizer=regularizer) - x = pool(x, pool_size=(3, 3), data_format=data_format) - x = conv(x, filters=192, kernel_size=(5, 5), - data_format=data_format, regularizer=regularizer) - x = pool(x, pool_size=(3, 3), data_format=data_format) - x = conv(x, filters=384, kernel_size=(3, 3), - data_format=data_format, regularizer=regularizer) - x = conv(x, filters=256, kernel_size=(3, 3), - data_format=data_format, regularizer=regularizer) - x = conv(x, filters=256, kernel_size=(3, 3), - data_format=data_format, regularizer=regularizer) - x = pool(x, pool_size=(3, 3), data_format=data_format) - - if data_format == 'channels_first': - x = tf.transpose(x, [0, 2, 3, 1]) - input_shape = x.get_shape().as_list() - num_inputs = input_shape[1] * input_shape[2] * input_shape[3] - x = tf.reshape(x, [-1, num_inputs]) - - x = tf.layers.dense(x, 4096, activation=tf.nn.relu, - kernel_regularizer=regularizer) - x = tf.nn.dropout(x=x, keep_prob=dropout_keep_prob) - x = tf.layers.dense(x, 4096, activation=tf.nn.relu, - kernel_regularizer=regularizer) - x = tf.nn.dropout(x=x, keep_prob=dropout_keep_prob) - - return {'outputs': x} diff --git a/open_seq2seq/encoders/cnn_encoder.py b/open_seq2seq/encoders/cnn_encoder.py new file mode 100644 index 000000000..2f498929d --- /dev/null +++ b/open_seq2seq/encoders/cnn_encoder.py @@ -0,0 +1,88 @@ +# Copyright (c) 2018 NVIDIA Corporation +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals +from six.moves import range + +import tensorflow as tf +import copy +from .encoder import Encoder +from open_seq2seq.utils.utils import deco_print + + +def build_layer(inputs, layer, layer_params, data_format, regularizer): + layer_built = False + + for reg_name in ['regularizer', 'kernel_regularizer', + 'gamma_regularizer', None]: + if layer_built: + break + for try_data_format in [True, False]: + cur_params = copy.deepcopy(layer_params) + if try_data_format: + cur_params.update({'data_format': data_format}) + if reg_name is not None: + cur_params.update({reg_name: regularizer}) + try: + outputs = layer(inputs, **cur_params) + layer_built = True + break + except TypeError as e: + if "got an unexpected keyword argument '{}'".format(reg_name) in e.__str__(): + continue + if "got an unexpected keyword argument 'data_format'" in e.__str__(): + continue + raise + + if not layer_built: + cur_params = copy.deepcopy(layer_params) + outputs = layer(inputs, **cur_params) + + if hasattr(layer, '_tf_api_names'): + layer_name = layer._tf_api_names[0] + else: + layer_name = layer + deco_print("Building layer: {}(inputs, {})".format( + layer_name, + ", ".join("{}={}".format(key, value) for key, value in cur_params.items()) + )) + return outputs + + +class CNNEncoder(Encoder): + @staticmethod + def get_required_params(): + return dict(Encoder.get_required_params(), **{ + 'cnn_layers': list, + }) + + @staticmethod + def get_optional_params(): + return dict(Encoder.get_optional_params(), **{ + 'data_format': ['channels_first', 'channels_last'], + 'fc_layers': list, + }) + + def __init__(self, params, model, name="cnn_encoder", mode='train'): + super(CNNEncoder, 
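The build_layer helper introduced here works by probing: it retries the layer call with fewer keyword arguments whenever a TypeError reports an unexpected keyword, so config entries stay free of data_format and regularizer boilerplate. A simplified, standalone sketch of that probing pattern in plain Python, with a hypothetical toy_layer standing in for a tf.layers function (not taken from the patch itself):

def call_with_optional_kwargs(layer, inputs, params, **optional):
    # Try the extra keywords first; drop them all if the callable rejects one.
    try:
        return layer(inputs, **dict(params, **optional))
    except TypeError as e:
        if 'unexpected keyword argument' in str(e):
            return layer(inputs, **params)
        raise

def toy_layer(inputs, units, scale=1.0):
    # Accepts 'scale' but, unlike tf.layers.conv2d, has no 'data_format'.
    return [v * units * scale for v in inputs]

print(call_with_optional_kwargs(toy_layer, [1.0, 2.0], {'units': 3}, scale=0.5))
# -> [1.5, 3.0]
print(call_with_optional_kwargs(toy_layer, [1.0, 2.0], {'units': 3},
                                data_format='channels_first'))
# -> [3.0, 6.0]; the unsupported keyword is silently dropped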
self).__init__(params, model, name, mode) + + def _encode(self, input_dict): + regularizer = self.params.get('regularizer', None) + data_format = self.params.get('data_format', 'channels_first') + + x = input_dict['source_tensors'][0] + if data_format == 'channels_first': + x = tf.transpose(x, [0, 3, 1, 2]) + + for layer, layer_params in self.params['cnn_layers']: + x = build_layer(x, layer, layer_params, data_format, regularizer) + + if data_format == 'channels_first': + x = tf.transpose(x, [0, 2, 3, 1]) + input_shape = x.get_shape().as_list() + num_inputs = input_shape[1] * input_shape[2] * input_shape[3] + x = tf.reshape(x, [-1, num_inputs]) + + for layer, layer_params in self.params.get('fc_layers', []): + x = build_layer(x, layer, layer_params, data_format, regularizer) + + return {'outputs': x} diff --git a/open_seq2seq/encoders/resnet_blocks.py b/open_seq2seq/encoders/resnet_blocks.py index e55a9841c..37dbc77da 100644 --- a/open_seq2seq/encoders/resnet_blocks.py +++ b/open_seq2seq/encoders/resnet_blocks.py @@ -35,21 +35,6 @@ import tensorflow as tf -def conv(inputs, filters, kernel_size, - data_format, regularizer, strides=(1, 1), padding="SAME"): - output = tf.layers.conv2d( - inputs=inputs, filters=filters, kernel_size=kernel_size, strides=strides, - padding=padding, use_bias=True, - data_format=data_format, kernel_regularizer=regularizer) - output = tf.nn.relu(output) - return output - - -def pool(inputs, pool_size, data_format, strides=(2, 2), padding='VALID'): - return tf.layers.max_pooling2d(inputs, pool_size=pool_size, strides=strides, - padding=padding, data_format=data_format) - - ################################################################################ # Convenience functions for building the ResNet model. ################################################################################ From bddbf4d94d3af11d5ccc48c26f20dc04c8dbdf2a Mon Sep 17 00:00:00 2001 From: Kipok Date: Mon, 4 Jun 2018 18:50:21 -0700 Subject: [PATCH 025/102] Config fix --- example_configs/image2label/alexnet_004.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example_configs/image2label/alexnet_004.py b/example_configs/image2label/alexnet_004.py index 37bee06f5..ae57264b6 100644 --- a/example_configs/image2label/alexnet_004.py +++ b/example_configs/image2label/alexnet_004.py @@ -14,8 +14,8 @@ "use_horovod": False, "num_epochs": 100, - "num_gpus": 8, - "batch_size_per_gpu": 128, + "num_gpus": 4, + "batch_size_per_gpu": 256, "dtype": tf.float32, "save_summaries_steps": 2000, From 96463646bd9ceaa1d59fa4139d9756ab8fd03a2c Mon Sep 17 00:00:00 2001 From: Kipok Date: Tue, 5 Jun 2018 13:51:29 -0700 Subject: [PATCH 026/102] Add basic cifar 10 dataset --- example_configs/image2label/test_cifar.py | 115 +++++++++++++++ open_seq2seq/data/image2label/image2label.py | 143 +++++++++++++++++++ open_seq2seq/encoders/cnn_encoder.py | 17 ++- 3 files changed, 270 insertions(+), 5 deletions(-) create mode 100644 example_configs/image2label/test_cifar.py diff --git a/example_configs/image2label/test_cifar.py b/example_configs/image2label/test_cifar.py new file mode 100644 index 000000000..5b7b436af --- /dev/null +++ b/example_configs/image2label/test_cifar.py @@ -0,0 +1,115 @@ +from open_seq2seq.models import Image2Label +from open_seq2seq.encoders.cnn_encoder import CNNEncoder +from open_seq2seq.decoders import FullyConnectedDecoder +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.data.image2label.image2label import CifarDataLayer +from open_seq2seq.optimizers.lr_policies 
import poly_decay +import tensorflow as tf + + +base_model = Image2Label + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 100, + + "num_gpus": 1, + "batch_size_per_gpu": 32, + "dtype": tf.float32, + + "save_summaries_steps": 2000, + "print_loss_steps": 100, + "print_samples_steps": 2000, + "eval_steps": 5000, + "save_checkpoint_steps": 5000, + "logdir": "experiments/alexnet-imagenet", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": poly_decay, + "lr_policy_params": { + "learning_rate": 0.001, + "power": 1.0, + }, + + "initializer": tf.variance_scaling_initializer, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0002, + }, + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + "encoder": CNNEncoder, + "encoder_params": { + 'data_format': 'channels_first', + 'cnn_layers': [ + # block 1 + (tf.layers.conv2d, { + 'filters': 128, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 128, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 128, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': None, 'use_bias': False, + }), + (tf.layers.batch_normalization, {'momentum': 0.9, 'epsilon': 0.0001}), + (tf.nn.relu, {}), + (tf.layers.max_pooling2d, { + 'pool_size': 3, 'strides': 2, 'padding': 'SAME', + }), + # block 2 + (tf.layers.conv2d, { + 'filters': 256, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 256, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 256, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': None, 'use_bias': False, + }), + (tf.layers.batch_normalization, {'momentum': 0.9, 'epsilon': 0.0001}), + (tf.nn.relu, {}), + (tf.layers.max_pooling2d, { + 'pool_size': 3, 'strides': 2, 'padding': 'SAME', + }), + # block 3 + (tf.layers.conv2d, { + 'filters': 320, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 320, 'kernel_size': (1, 1), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + ], + }, + + "decoder": FullyConnectedDecoder, + "decoder_params": { + "output_dim": 10, + }, + "loss": CrossEntropyLoss, + "data_layer": CifarDataLayer, + "data_layer_params": { + "data_dir": "data/cifar10_data/cifar-10-batches-bin", + }, +} diff --git a/open_seq2seq/data/image2label/image2label.py b/open_seq2seq/data/image2label/image2label.py index 98d4dcf55..32e643c43 100644 --- a/open_seq2seq/data/image2label/image2label.py +++ b/open_seq2seq/data/image2label/image2label.py @@ -7,11 +7,154 @@ import os import tensorflow as tf +import numpy as np from open_seq2seq.data.data_layer import DataLayer from .imagenet_preprocessing import parse_record +class CifarDataLayer(DataLayer): + _HEIGHT = 28 + _WIDTH = 28 + _NUM_CHANNELS = 3 + _DEFAULT_IMAGE_BYTES = 32 * 32 * 3 + # The record is the image plus a one-byte label + _RECORD_BYTES = _DEFAULT_IMAGE_BYTES + 1 + _NUM_CLASSES = 10 + _NUM_DATA_FILES = 5 + + _NUM_IMAGES = { + 'train': 50000, + 'validation': 10000, + } + + @staticmethod + def get_required_params(): + return 
dict(DataLayer.get_required_params(), **{ + 'data_dir': str, + }) + + @staticmethod + def get_optional_params(): + return dict(DataLayer.get_optional_params(), **{ + 'num_parallel_calls': int, + 'shuffle_buffer': int, + 'image_size': int, + 'num_classes': int, + }) + + def __init__(self, params, model, num_workers, worker_id): + super(CifarDataLayer, self).__init__(params, model, + num_workers, worker_id) + if self.params['mode'] == 'infer': + raise ValueError('Inference is not supported on CifarDataLayer') + + if self.params['mode'] == 'train': + filenames = [ + os.path.join(self.params['data_dir'], 'data_batch_{}.bin'.format(i)) + for i in range(1, self._NUM_DATA_FILES + 1) + ] + else: + filenames = [os.path.join(self.params['data_dir'], 'test_batch.bin')] + + self.file_names = filenames + self._train_size = 50000 + self._valid_size = 10000 + self._iterator = None + self._input_tensors = None + + def preprocess_image(self, image, is_training): + """Preprocess a single image of layout [height, width, depth].""" + if is_training: + # Resize the image to add four extra pixels on each side. + image = tf.image.resize_image_with_crop_or_pad( + image, self._HEIGHT + 8, self._WIDTH + 8) + + # Randomly crop a [_HEIGHT, _WIDTH] section of the image. + image = tf.random_crop(image, [self._HEIGHT, self._WIDTH, + self._NUM_CHANNELS]) + + # Randomly flip the image horizontally. + image = tf.image.random_flip_left_right(image) + + else: + image = tf.image.resize_image_with_crop_or_pad( + image, self._HEIGHT, self._WIDTH) + + # Subtract off the mean and divide by the variance of the pixels. + image = tf.image.per_image_standardization(image) + + return image + + def parse_record(self, raw_record, is_training, num_classes=10): + """Parse CIFAR-10 image and label from a raw record.""" + # Convert bytes to a vector of uint8 that is record_bytes long. + record_vector = tf.decode_raw(raw_record, tf.uint8) + + # The first byte represents the label, which we convert from uint8 to int32 + # and then to one-hot. + label = tf.cast(record_vector[0], tf.int32) + + # The remaining bytes after the label represent the image, which we reshape + # from [depth * height * width] to [depth, height, width]. + depth_major = tf.reshape(record_vector[1:self._RECORD_BYTES], + [3, 32, 32]) + + # Convert from [depth, height, width] to [height, width, depth], and cast as + # float32. 
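For reference, parse_record in this data layer follows the standard CIFAR-10 binary layout: every record is 3073 bytes, one label byte followed by 3072 image bytes stored depth-major as 3 x 32 x 32. A NumPy-only sketch of the same decoding, using a dummy all-zero record in place of real bytes from data_batch_*.bin:

import numpy as np

raw = np.zeros(3073, dtype=np.uint8).tobytes()    # stand-in for one record
record = np.frombuffer(raw, dtype=np.uint8)       # shape (3073,)
label = int(record[0])                            # class id in [0, 9]
depth_major = record[1:].reshape(3, 32, 32)       # [depth, height, width]
image = depth_major.transpose(1, 2, 0).astype(np.float32)  # [height, width, depth]
print(label, image.shape)                         # 0 (32, 32, 3)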
+ image = tf.cast(tf.transpose(depth_major, [1, 2, 0]), tf.float32) + + image = self.preprocess_image(image, is_training) + label = tf.one_hot(tf.reshape(label, shape=[]), num_classes) + + return image, label + + def build_graph(self): + dataset = tf.data.FixedLengthRecordDataset(self.file_names, + self._RECORD_BYTES) + + dataset = dataset.prefetch(buffer_size=self.params['batch_size']) + if self.params['shuffle']: + # shuffling images + dataset = dataset.shuffle(buffer_size=self.params.get('shuffle_buffer', + 1500)) + dataset = dataset.repeat() + + dataset = dataset.map( + lambda value: self.parse_record( + raw_record=value, + is_training=self.params['mode'] == 'train', + ), + num_parallel_calls=self.params.get('num_parallel_calls', 16), + ) + + dataset = dataset.batch(self.params['batch_size']) + dataset = dataset.prefetch(1) + + self._iterator = dataset.make_initializable_iterator() + inputs, labels = self.iterator.get_next() + if self.params['mode'] == 'train': + tf.summary.image('augmented_images', inputs, max_outputs=1) + self._input_tensors = { + 'source_tensors': [inputs], + 'target_tensors': [labels], + } + + @property + def input_tensors(self): + return self._input_tensors + + @property + def iterator(self): + return self._iterator + + def get_size_in_samples(self): + if self.params['mode'] == 'train': + return self._train_size + else: + return len(np.arange(self._valid_size)[self._worker_id::self._num_workers]) + + class ImagenetDataLayer(DataLayer): @staticmethod def get_required_params(): diff --git a/open_seq2seq/encoders/cnn_encoder.py b/open_seq2seq/encoders/cnn_encoder.py index 2f498929d..a33a43e25 100644 --- a/open_seq2seq/encoders/cnn_encoder.py +++ b/open_seq2seq/encoders/cnn_encoder.py @@ -78,11 +78,18 @@ def _encode(self, input_dict): if data_format == 'channels_first': x = tf.transpose(x, [0, 2, 3, 1]) - input_shape = x.get_shape().as_list() - num_inputs = input_shape[1] * input_shape[2] * input_shape[3] - x = tf.reshape(x, [-1, num_inputs]) - for layer, layer_params in self.params.get('fc_layers', []): - x = build_layer(x, layer, layer_params, data_format, regularizer) + fc_layers = self.params.get('fc_layers', []) + + # if fully connected layers exist, flattening the output and applying them + if fc_layers: + input_shape = x.get_shape().as_list() + num_inputs = input_shape[1] * input_shape[2] * input_shape[3] + x = tf.reshape(x, [-1, num_inputs]) + for layer, layer_params in fc_layers: + x = build_layer(x, layer, layer_params, data_format, regularizer) + else: + # if there are no fully connected layers, doing average pooling + x = tf.reduce_mean(x, [1, 2]) return {'outputs': x} From c47d64646e5d6209dbe90a98ece16db8b742285a Mon Sep 17 00:00:00 2001 From: Kipok Date: Tue, 5 Jun 2018 13:53:17 -0700 Subject: [PATCH 027/102] Test config with 1001 classes --- example_configs/image2label/alexnet_004.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/example_configs/image2label/alexnet_004.py b/example_configs/image2label/alexnet_004.py index ae57264b6..efcd93479 100644 --- a/example_configs/image2label/alexnet_004.py +++ b/example_configs/image2label/alexnet_004.py @@ -92,12 +92,13 @@ "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1000, + "output_dim": 1001, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", "image_size": 227, + "num_classes": 1001, }, } From ec4c29b15cb8627347ed0839e6d335cc0a42c0f8 Mon Sep 17 00:00:00 2001 From: Kipok Date: Tue, 5 Jun 2018 
15:45:17 -0700 Subject: [PATCH 028/102] Add config --- example_configs/image2label/alexnet_004m.py | 104 ++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 example_configs/image2label/alexnet_004m.py diff --git a/example_configs/image2label/alexnet_004m.py b/example_configs/image2label/alexnet_004m.py new file mode 100644 index 000000000..e2b3f4ee8 --- /dev/null +++ b/example_configs/image2label/alexnet_004m.py @@ -0,0 +1,104 @@ +from open_seq2seq.models import Image2Label +from open_seq2seq.encoders.cnn_encoder import CNNEncoder +from open_seq2seq.decoders import FullyConnectedDecoder +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.data import ImagenetDataLayer +from open_seq2seq.optimizers.lr_policies import poly_decay +import tensorflow as tf + + +base_model = Image2Label + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 100, + + "num_gpus": 4, + "batch_size_per_gpu": 256, + "dtype": tf.float32, + + "save_summaries_steps": 2000, + "print_loss_steps": 100, + "print_samples_steps": 2000, + "eval_steps": 5000, + "save_checkpoint_steps": 5000, + "logdir": "experiments/alexnet-imagenet", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": poly_decay, + "lr_policy_params": { + "learning_rate": 0.02, + "power": 1.0, + }, + + "initializer": tf.variance_scaling_initializer, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0005, + }, + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + "encoder": CNNEncoder, + "encoder_params": { + 'data_format': 'channels_first', + 'cnn_layers': [ + (tf.layers.conv2d, { + 'filters': 64, 'kernel_size': (11, 11), + 'strides': (4, 4), 'padding': 'VALID', + 'activation': tf.nn.relu, + }), + (tf.layers.max_pooling2d, { + 'pool_size': (3, 3), 'strides': (2, 2), + }), + (tf.layers.conv2d, { + 'filters': 192, 'kernel_size': (5, 5), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.max_pooling2d, { + 'pool_size': (3, 3), 'strides': (2, 2), + }), + (tf.layers.conv2d, { + 'filters': 384, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 256, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 256, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.max_pooling2d, { + 'pool_size': (3, 3), 'strides': (2, 2), + }), + ], + 'fc_layers': [ + (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), + (tf.nn.dropout, {'keep_prob': 0.5}), + (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), + (tf.nn.dropout, {'keep_prob': 0.5}), + ], + }, + + "decoder": FullyConnectedDecoder, + "decoder_params": { + "output_dim": 1000, + }, + "loss": CrossEntropyLoss, + "data_layer": ImagenetDataLayer, + "data_layer_params": { + "data_dir": "data/tf-imagenet", + "image_size": 227, + "num_classes": 1000, + }, +} From 23134c4f31e641140b4a550bacb9ad82706b33ab Mon Sep 17 00:00:00 2001 From: Kipok Date: Tue, 5 Jun 2018 18:36:15 -0700 Subject: [PATCH 029/102] Fix with dropout --- example_configs/image2label/alexnet_004.py | 49 +++++++++++++++++++++ example_configs/image2label/alexnet_004m.py | 49 +++++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git 
a/example_configs/image2label/alexnet_004.py b/example_configs/image2label/alexnet_004.py index efcd93479..e8989635a 100644 --- a/example_configs/image2label/alexnet_004.py +++ b/example_configs/image2label/alexnet_004.py @@ -102,3 +102,52 @@ "num_classes": 1001, }, } + +eval_params = { + "encoder": CNNEncoder, + "encoder_params": { + 'data_format': 'channels_first', + 'cnn_layers': [ + (tf.layers.conv2d, { + 'filters': 64, 'kernel_size': (11, 11), + 'strides': (4, 4), 'padding': 'VALID', + 'activation': tf.nn.relu, + }), + (tf.layers.max_pooling2d, { + 'pool_size': (3, 3), 'strides': (2, 2), + }), + (tf.layers.conv2d, { + 'filters': 192, 'kernel_size': (5, 5), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.max_pooling2d, { + 'pool_size': (3, 3), 'strides': (2, 2), + }), + (tf.layers.conv2d, { + 'filters': 384, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 256, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 256, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.max_pooling2d, { + 'pool_size': (3, 3), 'strides': (2, 2), + }), + ], + 'fc_layers': [ + (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), + (tf.nn.dropout, {'keep_prob': 1.0}), + (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), + (tf.nn.dropout, {'keep_prob': 1.0}), + ], + }, +} diff --git a/example_configs/image2label/alexnet_004m.py b/example_configs/image2label/alexnet_004m.py index e2b3f4ee8..c50cde41a 100644 --- a/example_configs/image2label/alexnet_004m.py +++ b/example_configs/image2label/alexnet_004m.py @@ -102,3 +102,52 @@ "num_classes": 1000, }, } + +eval_params = { + "encoder": CNNEncoder, + "encoder_params": { + 'data_format': 'channels_first', + 'cnn_layers': [ + (tf.layers.conv2d, { + 'filters': 64, 'kernel_size': (11, 11), + 'strides': (4, 4), 'padding': 'VALID', + 'activation': tf.nn.relu, + }), + (tf.layers.max_pooling2d, { + 'pool_size': (3, 3), 'strides': (2, 2), + }), + (tf.layers.conv2d, { + 'filters': 192, 'kernel_size': (5, 5), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.max_pooling2d, { + 'pool_size': (3, 3), 'strides': (2, 2), + }), + (tf.layers.conv2d, { + 'filters': 384, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 256, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 256, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.max_pooling2d, { + 'pool_size': (3, 3), 'strides': (2, 2), + }), + ], + 'fc_layers': [ + (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), + (tf.nn.dropout, {'keep_prob': 1.0}), + (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), + (tf.nn.dropout, {'keep_prob': 1.0}), + ], + }, +} From 42381afb0d813a04ae7a8f7b83f65587f4e508a9 Mon Sep 17 00:00:00 2001 From: Kipok Date: Tue, 5 Jun 2018 18:38:46 -0700 Subject: [PATCH 030/102] Test 1001 with resnet --- example_configs/image2label/resnet-50-v2-mp_000.py | 3 ++- example_configs/image2label/resnet-50-v2-mp_001.py | 5 +++-- example_configs/image2label/resnet-50-v2-mp_002.py | 3 ++- example_configs/image2label/resnet-50-v2-mp_003.py | 3 ++- 
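The eval_params blocks above override the encoder so that dropout is switched off at evaluation time: with keep_prob=1.0, tf.nn.dropout keeps every unit and rescales by 1/1.0, so it acts as an identity op. A tiny TF 1.x check of that behaviour, illustrative only:

import tensorflow as tf

x = tf.ones([2, 3])
y = tf.nn.dropout(x, keep_prob=1.0)   # no-op when keep_prob is exactly 1.0
with tf.Session() as sess:
    print(sess.run(tf.reduce_all(tf.equal(x, y))))   # True

A later patch in this series moves these configs to tf.layers.dropout, which takes a training flag instead and makes the separate eval override unnecessary.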
example_configs/image2label/resnet-50-v2-mp_004.py | 3 ++- 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/example_configs/image2label/resnet-50-v2-mp_000.py b/example_configs/image2label/resnet-50-v2-mp_000.py index 57f2f19da..594dc798a 100644 --- a/example_configs/image2label/resnet-50-v2-mp_000.py +++ b/example_configs/image2label/resnet-50-v2-mp_000.py @@ -52,12 +52,13 @@ }, "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1000, + "output_dim": 1001, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", "image_size": 224, + "num_classes": 1001, }, } diff --git a/example_configs/image2label/resnet-50-v2-mp_001.py b/example_configs/image2label/resnet-50-v2-mp_001.py index 75941bdb9..ee3ac8151 100644 --- a/example_configs/image2label/resnet-50-v2-mp_001.py +++ b/example_configs/image2label/resnet-50-v2-mp_001.py @@ -50,14 +50,15 @@ 'resnet_size': 50, "regularize_bn": False, }, - "decoder": FullyConnectedDecoder, + "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1000, + "output_dim": 1001, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", "image_size": 224, + "num_classes": 1001, }, } diff --git a/example_configs/image2label/resnet-50-v2-mp_002.py b/example_configs/image2label/resnet-50-v2-mp_002.py index a50d8f1c2..c7d19fd1b 100644 --- a/example_configs/image2label/resnet-50-v2-mp_002.py +++ b/example_configs/image2label/resnet-50-v2-mp_002.py @@ -52,12 +52,13 @@ }, "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1000, + "output_dim": 1001, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", "image_size": 224, + "num_classes": 1001, }, } diff --git a/example_configs/image2label/resnet-50-v2-mp_003.py b/example_configs/image2label/resnet-50-v2-mp_003.py index 6101c51e8..2b77d60f6 100644 --- a/example_configs/image2label/resnet-50-v2-mp_003.py +++ b/example_configs/image2label/resnet-50-v2-mp_003.py @@ -52,12 +52,13 @@ }, "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1000, + "output_dim": 1001, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", "image_size": 224, + "num_classes": 1001, }, } diff --git a/example_configs/image2label/resnet-50-v2-mp_004.py b/example_configs/image2label/resnet-50-v2-mp_004.py index 475de58a3..aba35b18d 100644 --- a/example_configs/image2label/resnet-50-v2-mp_004.py +++ b/example_configs/image2label/resnet-50-v2-mp_004.py @@ -52,12 +52,13 @@ }, "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1000, + "output_dim": 1001, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", "image_size": 224, + "num_classes": 1001, }, } From 4ac948cba8b5e91df5329c325a3ef3e1e833e270 Mon Sep 17 00:00:00 2001 From: Kipok Date: Wed, 6 Jun 2018 11:10:48 -0700 Subject: [PATCH 031/102] Fix bug with labels going from 1 to 1000 --- open_seq2seq/data/image2label/imagenet_preprocessing.py | 3 ++- open_seq2seq/models/image2label.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/open_seq2seq/data/image2label/imagenet_preprocessing.py b/open_seq2seq/data/image2label/imagenet_preprocessing.py index 07f5012ef..0bcda0c20 100644 --- a/open_seq2seq/data/image2label/imagenet_preprocessing.py +++ 
b/open_seq2seq/data/image2label/imagenet_preprocessing.py @@ -344,6 +344,7 @@ def parse_record(raw_record, is_training, image_size=224, num_classes=1000): num_channels=3, is_training=is_training) - label = tf.one_hot(tf.reshape(label, shape=[]), num_classes) + # subtracting 1 to make labels go from 0 to 999 + label = tf.one_hot(tf.reshape(label - 1, shape=[]), num_classes) return image, label diff --git a/open_seq2seq/models/image2label.py b/open_seq2seq/models/image2label.py index 884044d2d..9e7e4380a 100644 --- a/open_seq2seq/models/image2label.py +++ b/open_seq2seq/models/image2label.py @@ -56,8 +56,9 @@ def evaluate(self, input_values, output_values): labels = np.where(labels == 1)[1] total = logits.shape[0] - top1 = np.sum(np.argmax(logits, axis=1) == labels) - top5 = np.sum(labels[:, np.newaxis] == np.argpartition(logits, -5)[:, -5:]) + top1 = np.sum(np.equal(np.argmax(logits, axis=1), labels)) + top5 = np.sum(np.equal(labels[:, np.newaxis], + np.argpartition(logits, -5)[:, -5:])) return total, top1, top5 def _get_num_objects_per_step(self, worker_id=0): From fd2cde723b250d3fdfb3fdc12ac3b6f52df40658 Mon Sep 17 00:00:00 2001 From: Kipok Date: Wed, 6 Jun 2018 13:10:51 -0700 Subject: [PATCH 032/102] Change build_layer to use signature --- example_configs/image2label/alexnet_004.py | 63 +------- example_configs/image2label/alexnet_004m.py | 153 -------------------- open_seq2seq/encoders/cnn_encoder.py | 76 +++++----- open_seq2seq/models/model.py | 11 +- 4 files changed, 53 insertions(+), 250 deletions(-) delete mode 100644 example_configs/image2label/alexnet_004m.py diff --git a/example_configs/image2label/alexnet_004.py b/example_configs/image2label/alexnet_004.py index e8989635a..3dce00b63 100644 --- a/example_configs/image2label/alexnet_004.py +++ b/example_configs/image2label/alexnet_004.py @@ -12,7 +12,7 @@ base_params = { "random_seed": 0, "use_horovod": False, - "num_epochs": 100, + "num_epochs": 120, "num_gpus": 4, "batch_size_per_gpu": 256, @@ -31,7 +31,7 @@ }, "lr_policy": poly_decay, "lr_policy_params": { - "learning_rate": 0.02, + "learning_rate": 0.04, "power": 1.0, }, @@ -84,70 +84,21 @@ ], 'fc_layers': [ (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), - (tf.nn.dropout, {'keep_prob': 0.5}), + (tf.layers.dropout, {'rate': 0.5}), (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), - (tf.nn.dropout, {'keep_prob': 0.5}), + (tf.layers.dropout, {'rate': 0.5}), ], }, "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1001, + "output_dim": 1000, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", "image_size": 227, - "num_classes": 1001, + "num_classes": 1000, }, -} - -eval_params = { - "encoder": CNNEncoder, - "encoder_params": { - 'data_format': 'channels_first', - 'cnn_layers': [ - (tf.layers.conv2d, { - 'filters': 64, 'kernel_size': (11, 11), - 'strides': (4, 4), 'padding': 'VALID', - 'activation': tf.nn.relu, - }), - (tf.layers.max_pooling2d, { - 'pool_size': (3, 3), 'strides': (2, 2), - }), - (tf.layers.conv2d, { - 'filters': 192, 'kernel_size': (5, 5), - 'strides': (1, 1), 'padding': 'SAME', - 'activation': tf.nn.relu, - }), - (tf.layers.max_pooling2d, { - 'pool_size': (3, 3), 'strides': (2, 2), - }), - (tf.layers.conv2d, { - 'filters': 384, 'kernel_size': (3, 3), - 'strides': (1, 1), 'padding': 'SAME', - 'activation': tf.nn.relu, - }), - (tf.layers.conv2d, { - 'filters': 256, 'kernel_size': (3, 3), - 'strides': (1, 1), 'padding': 'SAME', - 'activation': tf.nn.relu, - 
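Two related fixes above: ImageNet labels are shifted from 1..1000 down to 0..999 before the one-hot encoding, and evaluate() counts top-1 and top-5 hits with np.argmax and np.argpartition on those 0-based indices. A small self-contained NumPy example of the same counting, with made-up logits, two samples, and k=2 instead of 5 to keep the toy readable:

import numpy as np

logits = np.array([[0.1, 0.5, 0.2, 0.9, 0.3],
                   [0.8, 0.1, 0.7, 0.2, 0.6]])
labels = np.array([3, 2])                        # already 0-based
top1 = np.sum(np.equal(np.argmax(logits, axis=1), labels))
topk_idx = np.argpartition(logits, -2)[:, -2:]   # indices of the 2 largest per row
topk = np.sum(np.equal(labels[:, np.newaxis], topk_idx))
print(top1, topk)                                # 1 2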
}), - (tf.layers.conv2d, { - 'filters': 256, 'kernel_size': (3, 3), - 'strides': (1, 1), 'padding': 'SAME', - 'activation': tf.nn.relu, - }), - (tf.layers.max_pooling2d, { - 'pool_size': (3, 3), 'strides': (2, 2), - }), - ], - 'fc_layers': [ - (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), - (tf.nn.dropout, {'keep_prob': 1.0}), - (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), - (tf.nn.dropout, {'keep_prob': 1.0}), - ], - }, -} +} \ No newline at end of file diff --git a/example_configs/image2label/alexnet_004m.py b/example_configs/image2label/alexnet_004m.py deleted file mode 100644 index c50cde41a..000000000 --- a/example_configs/image2label/alexnet_004m.py +++ /dev/null @@ -1,153 +0,0 @@ -from open_seq2seq.models import Image2Label -from open_seq2seq.encoders.cnn_encoder import CNNEncoder -from open_seq2seq.decoders import FullyConnectedDecoder -from open_seq2seq.losses import CrossEntropyLoss -from open_seq2seq.data import ImagenetDataLayer -from open_seq2seq.optimizers.lr_policies import poly_decay -import tensorflow as tf - - -base_model = Image2Label - -base_params = { - "random_seed": 0, - "use_horovod": False, - "num_epochs": 100, - - "num_gpus": 4, - "batch_size_per_gpu": 256, - "dtype": tf.float32, - - "save_summaries_steps": 2000, - "print_loss_steps": 100, - "print_samples_steps": 2000, - "eval_steps": 5000, - "save_checkpoint_steps": 5000, - "logdir": "experiments/alexnet-imagenet", - - "optimizer": "Momentum", - "optimizer_params": { - "momentum": 0.90, - }, - "lr_policy": poly_decay, - "lr_policy_params": { - "learning_rate": 0.02, - "power": 1.0, - }, - - "initializer": tf.variance_scaling_initializer, - - "regularizer": tf.contrib.layers.l2_regularizer, - "regularizer_params": { - 'scale': 0.0005, - }, - "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - "encoder": CNNEncoder, - "encoder_params": { - 'data_format': 'channels_first', - 'cnn_layers': [ - (tf.layers.conv2d, { - 'filters': 64, 'kernel_size': (11, 11), - 'strides': (4, 4), 'padding': 'VALID', - 'activation': tf.nn.relu, - }), - (tf.layers.max_pooling2d, { - 'pool_size': (3, 3), 'strides': (2, 2), - }), - (tf.layers.conv2d, { - 'filters': 192, 'kernel_size': (5, 5), - 'strides': (1, 1), 'padding': 'SAME', - 'activation': tf.nn.relu, - }), - (tf.layers.max_pooling2d, { - 'pool_size': (3, 3), 'strides': (2, 2), - }), - (tf.layers.conv2d, { - 'filters': 384, 'kernel_size': (3, 3), - 'strides': (1, 1), 'padding': 'SAME', - 'activation': tf.nn.relu, - }), - (tf.layers.conv2d, { - 'filters': 256, 'kernel_size': (3, 3), - 'strides': (1, 1), 'padding': 'SAME', - 'activation': tf.nn.relu, - }), - (tf.layers.conv2d, { - 'filters': 256, 'kernel_size': (3, 3), - 'strides': (1, 1), 'padding': 'SAME', - 'activation': tf.nn.relu, - }), - (tf.layers.max_pooling2d, { - 'pool_size': (3, 3), 'strides': (2, 2), - }), - ], - 'fc_layers': [ - (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), - (tf.nn.dropout, {'keep_prob': 0.5}), - (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), - (tf.nn.dropout, {'keep_prob': 0.5}), - ], - }, - - "decoder": FullyConnectedDecoder, - "decoder_params": { - "output_dim": 1000, - }, - "loss": CrossEntropyLoss, - "data_layer": ImagenetDataLayer, - "data_layer_params": { - "data_dir": "data/tf-imagenet", - "image_size": 227, - "num_classes": 1000, - }, -} - -eval_params = { - "encoder": CNNEncoder, - "encoder_params": { - 'data_format': 'channels_first', - 
'cnn_layers': [ - (tf.layers.conv2d, { - 'filters': 64, 'kernel_size': (11, 11), - 'strides': (4, 4), 'padding': 'VALID', - 'activation': tf.nn.relu, - }), - (tf.layers.max_pooling2d, { - 'pool_size': (3, 3), 'strides': (2, 2), - }), - (tf.layers.conv2d, { - 'filters': 192, 'kernel_size': (5, 5), - 'strides': (1, 1), 'padding': 'SAME', - 'activation': tf.nn.relu, - }), - (tf.layers.max_pooling2d, { - 'pool_size': (3, 3), 'strides': (2, 2), - }), - (tf.layers.conv2d, { - 'filters': 384, 'kernel_size': (3, 3), - 'strides': (1, 1), 'padding': 'SAME', - 'activation': tf.nn.relu, - }), - (tf.layers.conv2d, { - 'filters': 256, 'kernel_size': (3, 3), - 'strides': (1, 1), 'padding': 'SAME', - 'activation': tf.nn.relu, - }), - (tf.layers.conv2d, { - 'filters': 256, 'kernel_size': (3, 3), - 'strides': (1, 1), 'padding': 'SAME', - 'activation': tf.nn.relu, - }), - (tf.layers.max_pooling2d, { - 'pool_size': (3, 3), 'strides': (2, 2), - }), - ], - 'fc_layers': [ - (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), - (tf.nn.dropout, {'keep_prob': 1.0}), - (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), - (tf.nn.dropout, {'keep_prob': 1.0}), - ], - }, -} diff --git a/open_seq2seq/encoders/cnn_encoder.py b/open_seq2seq/encoders/cnn_encoder.py index 2f498929d..30e2e9b7b 100644 --- a/open_seq2seq/encoders/cnn_encoder.py +++ b/open_seq2seq/encoders/cnn_encoder.py @@ -5,46 +5,44 @@ import tensorflow as tf import copy + +try: + from inspect import signature +except ImportError: + from funcsigs import signature + from .encoder import Encoder from open_seq2seq.utils.utils import deco_print -def build_layer(inputs, layer, layer_params, data_format, regularizer): - layer_built = False - - for reg_name in ['regularizer', 'kernel_regularizer', - 'gamma_regularizer', None]: - if layer_built: - break - for try_data_format in [True, False]: - cur_params = copy.deepcopy(layer_params) - if try_data_format: - cur_params.update({'data_format': data_format}) - if reg_name is not None: - cur_params.update({reg_name: regularizer}) - try: - outputs = layer(inputs, **cur_params) - layer_built = True - break - except TypeError as e: - if "got an unexpected keyword argument '{}'".format(reg_name) in e.__str__(): - continue - if "got an unexpected keyword argument 'data_format'" in e.__str__(): - continue - raise - - if not layer_built: - cur_params = copy.deepcopy(layer_params) - outputs = layer(inputs, **cur_params) - - if hasattr(layer, '_tf_api_names'): - layer_name = layer._tf_api_names[0] - else: - layer_name = layer - deco_print("Building layer: {}(inputs, {})".format( - layer_name, - ", ".join("{}={}".format(key, value) for key, value in cur_params.items()) - )) +def build_layer(inputs, layer, layer_params, data_format, + regularizer, training, verbose=True): + layer_params_cp = copy.deepcopy(layer_params) + for reg_name in ['regularizer', 'kernel_regularizer', 'gamma_regularizer']: + if reg_name not in layer_params_cp and \ + reg_name in signature(layer).parameters: + layer_params_cp.update({reg_name: regularizer}) + + if 'data_format' not in layer_params_cp and \ + 'data_format' in signature(layer).parameters: + layer_params_cp.update({'data_format': data_format}) + + if 'training' not in layer_params_cp and \ + 'training' in signature(layer).parameters: + layer_params_cp.update({'training': training}) + + outputs = layer(inputs, **layer_params_cp) + + if verbose: + if hasattr(layer, '_tf_api_names'): + layer_name = layer._tf_api_names[0] + else: + layer_name = layer + deco_print("Building layer: 
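The rewritten build_layer here replaces the try/except probing with inspect.signature: data_format, a regularizer argument, or the training flag is forwarded only when the callable actually declares that parameter (with funcsigs as the Python 2 fallback). A minimal standalone check of that idea, using a hypothetical conv_like function and no TensorFlow dependency:

from inspect import signature   # funcsigs.signature on Python 2

def accepted(layer, names):
    params = signature(layer).parameters
    return [n for n in names if n in params]

def conv_like(inputs, filters, data_format='channels_last',
              kernel_regularizer=None):
    return inputs

print(accepted(conv_like, ['data_format', 'kernel_regularizer', 'training']))
# -> ['data_format', 'kernel_regularizer']; 'training' would not be injected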
{}(inputs, {})".format( + layer_name, + ", ".join("{}={}".format(key, value) + for key, value in layer_params_cp.items()) + )) return outputs @@ -74,7 +72,8 @@ def _encode(self, input_dict): x = tf.transpose(x, [0, 3, 1, 2]) for layer, layer_params in self.params['cnn_layers']: - x = build_layer(x, layer, layer_params, data_format, regularizer) + x = build_layer(x, layer, layer_params, data_format, + regularizer, self.mode == 'train') if data_format == 'channels_first': x = tf.transpose(x, [0, 2, 3, 1]) @@ -83,6 +82,7 @@ def _encode(self, input_dict): x = tf.reshape(x, [-1, num_inputs]) for layer, layer_params in self.params.get('fc_layers', []): - x = build_layer(x, layer, layer_params, data_format, regularizer) + x = build_layer(x, layer, layer_params, data_format, + regularizer, self.mode == 'train') return {'outputs': x} diff --git a/open_seq2seq/models/model.py b/open_seq2seq/models/model.py index f79052608..244bddd4b 100644 --- a/open_seq2seq/models/model.py +++ b/open_seq2seq/models/model.py @@ -10,6 +10,11 @@ import copy import time +try: + from inspect import signature +except ImportError: + from funcsigs import signature + from open_seq2seq.utils.utils import deco_print, clip_last_batch from open_seq2seq.optimizers import optimize_loss, get_regularization_loss from open_seq2seq.utils.utils import check_params @@ -353,10 +358,10 @@ def compile(self, force_var_reuse=False): lr_params = self.params.get('lr_policy_params', {}) # adding default decay_steps = max_steps if lr_policy supports it and # different value is not provided - if 'decay_steps' in self.params['lr_policy'].__code__.co_varnames and \ - 'decay_steps' not in lr_params: + func_params = signature(self.params['lr_policy']).parameters + if 'decay_steps' in func_params and 'decay_steps' not in lr_params: lr_params['decay_steps'] = self._last_step - if 'steps_per_epoch' in self.params['lr_policy'].__code__.co_varnames and \ + if 'steps_per_epoch' in func_params and \ 'steps_per_epoch' not in lr_params and 'num_epochs' in self.params: lr_params['steps_per_epoch'] = self.steps_in_epoch lr_policy = lambda gs: self.params['lr_policy'](global_step=gs, From e778a616ee53211dc6e5bf02cffe8a95cc6de79f Mon Sep 17 00:00:00 2001 From: Kipok Date: Wed, 6 Jun 2018 13:25:46 -0700 Subject: [PATCH 033/102] Add config --- .../image2label/resnet-50-v2-mp_005.py | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 example_configs/image2label/resnet-50-v2-mp_005.py diff --git a/example_configs/image2label/resnet-50-v2-mp_005.py b/example_configs/image2label/resnet-50-v2-mp_005.py new file mode 100644 index 000000000..b98c6c191 --- /dev/null +++ b/example_configs/image2label/resnet-50-v2-mp_005.py @@ -0,0 +1,64 @@ +from open_seq2seq.models import Image2Label +from open_seq2seq.encoders import ResNetEncoder +from open_seq2seq.decoders import FullyConnectedDecoder +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.data import ImagenetDataLayer +from open_seq2seq.optimizers.lr_policies import piecewise_constant +import tensorflow as tf + + +base_model = Image2Label + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 100, + + "num_gpus": 8, + "batch_size_per_gpu": 32, + "dtype": "mixed", + "loss_scale": 1.0, + + "save_summaries_steps": 2000, + "print_loss_steps": 100, + "print_samples_steps": 2000, + "eval_steps": 5000, + "save_checkpoint_steps": 5000, + "logdir": "experiments/resnet50-imagenet", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + 
"lr_policy": piecewise_constant, + "lr_policy_params": { + "learning_rate": 0.1, + "boundaries": [30, 60, 80, 90], + "decay_rates": [0.1, 0.01, 0.001, 1e-4], + }, + + "initializer": tf.variance_scaling_initializer, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0001, + }, + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + "encoder": ResNetEncoder, + "encoder_params": { + 'resnet_size': 50, + "regularize_bn": False, + }, + "decoder": FullyConnectedDecoder, + "decoder_params": { + "output_dim": 1001, + }, + "loss": CrossEntropyLoss, + "data_layer": ImagenetDataLayer, + "data_layer_params": { + "data_dir": "data/tf-imagenet", + "image_size": 224, + "num_classes": 1001, + }, +} From 6eb7bfee22741b519dec32e715a67d4c9234ab64 Mon Sep 17 00:00:00 2001 From: Kipok Date: Wed, 6 Jun 2018 13:55:12 -0700 Subject: [PATCH 034/102] Add cifar10_download_and_extract.py --- example_configs/image2label/test_cifar.py | 6 +- .../cifar10_download_and_extract.py | 63 +++++++++++++++++++ 2 files changed, 66 insertions(+), 3 deletions(-) create mode 100644 open_seq2seq/data/image2label/cifar10_download_and_extract.py diff --git a/example_configs/image2label/test_cifar.py b/example_configs/image2label/test_cifar.py index 5b7b436af..c42eb1e18 100644 --- a/example_configs/image2label/test_cifar.py +++ b/example_configs/image2label/test_cifar.py @@ -12,7 +12,7 @@ base_params = { "random_seed": 0, "use_horovod": False, - "num_epochs": 100, + "num_epochs": 200, "num_gpus": 1, "batch_size_per_gpu": 32, @@ -23,7 +23,7 @@ "print_samples_steps": 2000, "eval_steps": 5000, "save_checkpoint_steps": 5000, - "logdir": "experiments/alexnet-imagenet", + "logdir": "experiments/test-cifar", "optimizer": "Momentum", "optimizer_params": { @@ -110,6 +110,6 @@ "loss": CrossEntropyLoss, "data_layer": CifarDataLayer, "data_layer_params": { - "data_dir": "data/cifar10_data/cifar-10-batches-bin", + "data_dir": "data/cifar-10-batches-bin", }, } diff --git a/open_seq2seq/data/image2label/cifar10_download_and_extract.py b/open_seq2seq/data/image2label/cifar10_download_and_extract.py new file mode 100644 index 000000000..ee4f48942 --- /dev/null +++ b/open_seq2seq/data/image2label/cifar10_download_and_extract.py @@ -0,0 +1,63 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Downloads and extracts the binary version of the CIFAR-10 dataset.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os +import sys +import tarfile + +from six.moves import urllib +import tensorflow as tf + +DATA_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz' + +parser = argparse.ArgumentParser() + +parser.add_argument( + '--data_dir', type=str, default='data/', + help='Directory to download data and extract the tarball') + + +def main(_): + """Download and extract the tarball from Alex's website.""" + if not os.path.exists(FLAGS.data_dir): + os.makedirs(FLAGS.data_dir) + + filename = DATA_URL.split('/')[-1] + filepath = os.path.join(FLAGS.data_dir, filename) + + if not os.path.exists(filepath): + def _progress(count, block_size, total_size): + sys.stdout.write('\r>> Downloading %s %.1f%%' % ( + filename, 100.0 * count * block_size / total_size)) + sys.stdout.flush() + + filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, _progress) + print() + statinfo = os.stat(filepath) + print('Successfully downloaded', filename, statinfo.st_size, 'bytes.') + + tarfile.open(filepath, 'r:gz').extractall(FLAGS.data_dir) + + +if __name__ == '__main__': + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(argv=[sys.argv[0]] + unparsed) From b322f647c5f5d15a0f0d5e43172ee2a148e6fb83 Mon Sep 17 00:00:00 2001 From: Kipok Date: Wed, 6 Jun 2018 18:10:54 -0700 Subject: [PATCH 035/102] Add assert for nans --- open_seq2seq/optimizers/mp_wrapper.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/open_seq2seq/optimizers/mp_wrapper.py b/open_seq2seq/optimizers/mp_wrapper.py index 633e3a3e5..cd24dbb09 100644 --- a/open_seq2seq/optimizers/mp_wrapper.py +++ b/open_seq2seq/optimizers/mp_wrapper.py @@ -61,7 +61,14 @@ def compute_gradients(self, loss, var_list=None, "FP32_MASTER_COPIES"], ) self._fp32_to_fp16[fp32_var.name] = var - fp32_grad = tf.cast(grad, tf.float32) + assert_grad_nan = tf.Assert( + tf.is_finite(tf.reduce_sum(grad)), + [grad, var], + summarize=1000, + name="nan_assert_for/{}".format(var.name.split(':')[0]), + ) + with tf.control_dependencies([assert_grad_nan]): + fp32_grad = tf.cast(grad, tf.float32) # adding regularization part with respect to fp32 copy if var.name in reg_funcs: fp32_grad += self._loss_scale * tf.gradients( From 8a5fe93bd84d4649c4d95a8189aa1d4e60c53856 Mon Sep 17 00:00:00 2001 From: Kipok Date: Thu, 7 Jun 2018 10:47:50 -0700 Subject: [PATCH 036/102] Remove assert on nan --- open_seq2seq/optimizers/mp_wrapper.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/open_seq2seq/optimizers/mp_wrapper.py b/open_seq2seq/optimizers/mp_wrapper.py index cd24dbb09..633e3a3e5 100644 --- a/open_seq2seq/optimizers/mp_wrapper.py +++ b/open_seq2seq/optimizers/mp_wrapper.py @@ -61,14 +61,7 @@ def compute_gradients(self, loss, var_list=None, "FP32_MASTER_COPIES"], ) self._fp32_to_fp16[fp32_var.name] = var - assert_grad_nan = tf.Assert( - tf.is_finite(tf.reduce_sum(grad)), - [grad, var], - summarize=1000, - name="nan_assert_for/{}".format(var.name.split(':')[0]), - ) - with tf.control_dependencies([assert_grad_nan]): - fp32_grad = tf.cast(grad, tf.float32) + fp32_grad = tf.cast(grad, tf.float32) # adding regularization part with respect to fp32 copy if var.name in reg_funcs: fp32_grad += self._loss_scale * tf.gradients( From 
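The patches around this point add, tweak, and finally remove a debugging guard in the mixed-precision wrapper: a tf.Assert attached to each float16 gradient that fires as soon as the gradient stops being finite, so the offending variable shows up in the error instead of silently corrupting the fp32 master copy. A rough TF 1.x sketch of that guard, for illustration only (the committed versions asserted on tf.reduce_sum and later tf.reduce_max of the gradient):

import tensorflow as tf

def cast_with_finite_check(grad, var):
    # Fail fast, reporting the variable name, if any gradient entry is NaN/Inf.
    check = tf.Assert(tf.reduce_all(tf.is_finite(grad)),
                      [var.name, tf.reduce_max(grad)],
                      summarize=16)
    with tf.control_dependencies([check]):
        return tf.cast(grad, tf.float32)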
6147b5d511885b09a627d7ee65f38d3d7ed004f5 Mon Sep 17 00:00:00 2001 From: Kipok Date: Thu, 7 Jun 2018 10:50:26 -0700 Subject: [PATCH 037/102] Debug assert on nan --- open_seq2seq/optimizers/mp_wrapper.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/open_seq2seq/optimizers/mp_wrapper.py b/open_seq2seq/optimizers/mp_wrapper.py index 633e3a3e5..b139ec0bb 100644 --- a/open_seq2seq/optimizers/mp_wrapper.py +++ b/open_seq2seq/optimizers/mp_wrapper.py @@ -61,7 +61,14 @@ def compute_gradients(self, loss, var_list=None, "FP32_MASTER_COPIES"], ) self._fp32_to_fp16[fp32_var.name] = var - fp32_grad = tf.cast(grad, tf.float32) + assert_grad_nan = tf.Assert( + tf.is_finite(tf.reduce_max(grad)), + [tf.reduce_max(grad), grad, var], + summarize=10000, + name="nan_assert_for/{}".format(var.name.split(':')[0]), + ) + with tf.control_dependencies([assert_grad_nan]): + fp32_grad = tf.cast(grad, tf.float32) # adding regularization part with respect to fp32 copy if var.name in reg_funcs: fp32_grad += self._loss_scale * tf.gradients( From 988800ae0d8140690a6769e036baeb45e672be9b Mon Sep 17 00:00:00 2001 From: Kipok Date: Fri, 8 Jun 2018 09:08:53 -0700 Subject: [PATCH 038/102] Remove assert on nans --- open_seq2seq/optimizers/mp_wrapper.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/open_seq2seq/optimizers/mp_wrapper.py b/open_seq2seq/optimizers/mp_wrapper.py index b139ec0bb..633e3a3e5 100644 --- a/open_seq2seq/optimizers/mp_wrapper.py +++ b/open_seq2seq/optimizers/mp_wrapper.py @@ -61,14 +61,7 @@ def compute_gradients(self, loss, var_list=None, "FP32_MASTER_COPIES"], ) self._fp32_to_fp16[fp32_var.name] = var - assert_grad_nan = tf.Assert( - tf.is_finite(tf.reduce_max(grad)), - [tf.reduce_max(grad), grad, var], - summarize=10000, - name="nan_assert_for/{}".format(var.name.split(':')[0]), - ) - with tf.control_dependencies([assert_grad_nan]): - fp32_grad = tf.cast(grad, tf.float32) + fp32_grad = tf.cast(grad, tf.float32) # adding regularization part with respect to fp32 copy if var.name in reg_funcs: fp32_grad += self._loss_scale * tf.gradients( From 91b7978b85c67c1c4fa3d8c2cfaba1a2b4ad802b Mon Sep 17 00:00:00 2001 From: Kipok Date: Fri, 8 Jun 2018 09:12:24 -0700 Subject: [PATCH 039/102] Rename/remove configs --- .../{alexnet_004.py => alexnet_owt.py} | 0 .../{test_cifar.py => cifar-nv.py} | 0 ...net-50-v2-mp_003.py => resnet-50-v2-mp.py} | 0 .../image2label/resnet-50-v2-mp_000.py | 64 ------------------- .../image2label/resnet-50-v2-mp_001.py | 64 ------------------- .../image2label/resnet-50-v2-mp_002.py | 64 ------------------- .../image2label/resnet-50-v2-mp_004.py | 64 ------------------- .../image2label/resnet-50-v2-mp_005.py | 64 ------------------- 8 files changed, 320 deletions(-) rename example_configs/image2label/{alexnet_004.py => alexnet_owt.py} (100%) rename example_configs/image2label/{test_cifar.py => cifar-nv.py} (100%) rename example_configs/image2label/{resnet-50-v2-mp_003.py => resnet-50-v2-mp.py} (100%) delete mode 100644 example_configs/image2label/resnet-50-v2-mp_000.py delete mode 100644 example_configs/image2label/resnet-50-v2-mp_001.py delete mode 100644 example_configs/image2label/resnet-50-v2-mp_002.py delete mode 100644 example_configs/image2label/resnet-50-v2-mp_004.py delete mode 100644 example_configs/image2label/resnet-50-v2-mp_005.py diff --git a/example_configs/image2label/alexnet_004.py b/example_configs/image2label/alexnet_owt.py similarity index 100% rename from example_configs/image2label/alexnet_004.py rename to 
example_configs/image2label/alexnet_owt.py diff --git a/example_configs/image2label/test_cifar.py b/example_configs/image2label/cifar-nv.py similarity index 100% rename from example_configs/image2label/test_cifar.py rename to example_configs/image2label/cifar-nv.py diff --git a/example_configs/image2label/resnet-50-v2-mp_003.py b/example_configs/image2label/resnet-50-v2-mp.py similarity index 100% rename from example_configs/image2label/resnet-50-v2-mp_003.py rename to example_configs/image2label/resnet-50-v2-mp.py diff --git a/example_configs/image2label/resnet-50-v2-mp_000.py b/example_configs/image2label/resnet-50-v2-mp_000.py deleted file mode 100644 index cf1b04bfd..000000000 --- a/example_configs/image2label/resnet-50-v2-mp_000.py +++ /dev/null @@ -1,64 +0,0 @@ -from open_seq2seq.models import Image2Label -from open_seq2seq.encoders import ResNetEncoder -from open_seq2seq.decoders import FullyConnectedDecoder -from open_seq2seq.losses import CrossEntropyLoss -from open_seq2seq.data import ImagenetDataLayer -from open_seq2seq.optimizers.lr_policies import piecewise_constant -import tensorflow as tf - - -base_model = Image2Label - -base_params = { - "random_seed": 0, - "use_horovod": False, - "num_epochs": 100, - - "num_gpus": 8, - "batch_size_per_gpu": 32, - "dtype": "mixed", - "loss_scale": 10.0, - - "save_summaries_steps": 2000, - "print_loss_steps": 100, - "print_samples_steps": 2000, - "eval_steps": 5000, - "save_checkpoint_steps": 5000, - "logdir": "experiments/resnet50-imagenet", - - "optimizer": "Momentum", - "optimizer_params": { - "momentum": 0.90, - }, - "lr_policy": piecewise_constant, - "lr_policy_params": { - "learning_rate": 0.1, - "boundaries": [30, 60, 80, 90], - "decay_rates": [0.1, 0.01, 0.001, 1e-4], - }, - - "initializer": tf.variance_scaling_initializer, - - "regularizer": tf.contrib.layers.l2_regularizer, - "regularizer_params": { - 'scale': 0.0001, - }, - "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - "encoder": ResNetEncoder, - "encoder_params": { - 'resnet_size': 50, - "regularize_bn": False, - }, - "decoder": FullyConnectedDecoder, - "decoder_params": { - "output_dim": 1000, - }, - "loss": CrossEntropyLoss, - "data_layer": ImagenetDataLayer, - "data_layer_params": { - "data_dir": "data/tf-imagenet", - "image_size": 224, - "num_classes": 1000, - }, -} diff --git a/example_configs/image2label/resnet-50-v2-mp_001.py b/example_configs/image2label/resnet-50-v2-mp_001.py deleted file mode 100644 index e0482f42c..000000000 --- a/example_configs/image2label/resnet-50-v2-mp_001.py +++ /dev/null @@ -1,64 +0,0 @@ -from open_seq2seq.models import Image2Label -from open_seq2seq.encoders import ResNetEncoder -from open_seq2seq.decoders import FullyConnectedDecoder -from open_seq2seq.losses import CrossEntropyLoss -from open_seq2seq.data import ImagenetDataLayer -from open_seq2seq.optimizers.lr_policies import piecewise_constant -import tensorflow as tf - - -base_model = Image2Label - -base_params = { - "random_seed": 0, - "use_horovod": False, - "num_epochs": 100, - - "num_gpus": 8, - "batch_size_per_gpu": 32, - "dtype": "mixed", - "loss_scale": 100.0, - - "save_summaries_steps": 2000, - "print_loss_steps": 100, - "print_samples_steps": 2000, - "eval_steps": 5000, - "save_checkpoint_steps": 5000, - "logdir": "experiments/resnet50-imagenet", - - "optimizer": "Momentum", - "optimizer_params": { - "momentum": 0.90, - }, - "lr_policy": piecewise_constant, - "lr_policy_params": { - 
"learning_rate": 0.1, - "boundaries": [30, 60, 80, 90], - "decay_rates": [0.1, 0.01, 0.001, 1e-4], - }, - - "initializer": tf.variance_scaling_initializer, - - "regularizer": tf.contrib.layers.l2_regularizer, - "regularizer_params": { - 'scale': 0.0001, - }, - "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - "encoder": ResNetEncoder, - "encoder_params": { - 'resnet_size': 50, - "regularize_bn": False, - }, - "decoder": FullyConnectedDecoder, - "decoder_params": { - "output_dim": 1000, - }, - "loss": CrossEntropyLoss, - "data_layer": ImagenetDataLayer, - "data_layer_params": { - "data_dir": "data/tf-imagenet", - "image_size": 224, - "num_classes": 1000, - }, -} diff --git a/example_configs/image2label/resnet-50-v2-mp_002.py b/example_configs/image2label/resnet-50-v2-mp_002.py deleted file mode 100644 index 5d3960a0c..000000000 --- a/example_configs/image2label/resnet-50-v2-mp_002.py +++ /dev/null @@ -1,64 +0,0 @@ -from open_seq2seq.models import Image2Label -from open_seq2seq.encoders import ResNetEncoder -from open_seq2seq.decoders import FullyConnectedDecoder -from open_seq2seq.losses import CrossEntropyLoss -from open_seq2seq.data import ImagenetDataLayer -from open_seq2seq.optimizers.lr_policies import piecewise_constant -import tensorflow as tf - - -base_model = Image2Label - -base_params = { - "random_seed": 0, - "use_horovod": False, - "num_epochs": 100, - - "num_gpus": 8, - "batch_size_per_gpu": 32, - "dtype": "mixed", - "loss_scale": 1000.0, - - "save_summaries_steps": 2000, - "print_loss_steps": 100, - "print_samples_steps": 2000, - "eval_steps": 5000, - "save_checkpoint_steps": 5000, - "logdir": "experiments/resnet50-imagenet", - - "optimizer": "Momentum", - "optimizer_params": { - "momentum": 0.90, - }, - "lr_policy": piecewise_constant, - "lr_policy_params": { - "learning_rate": 0.1, - "boundaries": [30, 60, 80, 90], - "decay_rates": [0.1, 0.01, 0.001, 1e-4], - }, - - "initializer": tf.variance_scaling_initializer, - - "regularizer": tf.contrib.layers.l2_regularizer, - "regularizer_params": { - 'scale': 0.0001, - }, - "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - "encoder": ResNetEncoder, - "encoder_params": { - 'resnet_size': 50, - "regularize_bn": False, - }, - "decoder": FullyConnectedDecoder, - "decoder_params": { - "output_dim": 1000, - }, - "loss": CrossEntropyLoss, - "data_layer": ImagenetDataLayer, - "data_layer_params": { - "data_dir": "data/tf-imagenet", - "image_size": 224, - "num_classes": 1000, - }, -} diff --git a/example_configs/image2label/resnet-50-v2-mp_004.py b/example_configs/image2label/resnet-50-v2-mp_004.py deleted file mode 100644 index df2a422ff..000000000 --- a/example_configs/image2label/resnet-50-v2-mp_004.py +++ /dev/null @@ -1,64 +0,0 @@ -from open_seq2seq.models import Image2Label -from open_seq2seq.encoders import ResNetEncoder -from open_seq2seq.decoders import FullyConnectedDecoder -from open_seq2seq.losses import CrossEntropyLoss -from open_seq2seq.data import ImagenetDataLayer -from open_seq2seq.optimizers.lr_policies import piecewise_constant -import tensorflow as tf - - -base_model = Image2Label - -base_params = { - "random_seed": 0, - "use_horovod": False, - "num_epochs": 100, - - "num_gpus": 8, - "batch_size_per_gpu": 32, - "dtype": "mixed", - "automatic_loss_scaling": "LogMax", - - "save_summaries_steps": 2000, - "print_loss_steps": 100, - 
"print_samples_steps": 2000, - "eval_steps": 5000, - "save_checkpoint_steps": 5000, - "logdir": "experiments/resnet50-imagenet", - - "optimizer": "Momentum", - "optimizer_params": { - "momentum": 0.90, - }, - "lr_policy": piecewise_constant, - "lr_policy_params": { - "learning_rate": 0.1, - "boundaries": [30, 60, 80, 90], - "decay_rates": [0.1, 0.01, 0.001, 1e-4], - }, - - "initializer": tf.variance_scaling_initializer, - - "regularizer": tf.contrib.layers.l2_regularizer, - "regularizer_params": { - 'scale': 0.0001, - }, - "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - "encoder": ResNetEncoder, - "encoder_params": { - 'resnet_size': 50, - "regularize_bn": False, - }, - "decoder": FullyConnectedDecoder, - "decoder_params": { - "output_dim": 1000, - }, - "loss": CrossEntropyLoss, - "data_layer": ImagenetDataLayer, - "data_layer_params": { - "data_dir": "data/tf-imagenet", - "image_size": 224, - "num_classes": 1000, - }, -} diff --git a/example_configs/image2label/resnet-50-v2-mp_005.py b/example_configs/image2label/resnet-50-v2-mp_005.py deleted file mode 100644 index cdae3fec4..000000000 --- a/example_configs/image2label/resnet-50-v2-mp_005.py +++ /dev/null @@ -1,64 +0,0 @@ -from open_seq2seq.models import Image2Label -from open_seq2seq.encoders import ResNetEncoder -from open_seq2seq.decoders import FullyConnectedDecoder -from open_seq2seq.losses import CrossEntropyLoss -from open_seq2seq.data import ImagenetDataLayer -from open_seq2seq.optimizers.lr_policies import piecewise_constant -import tensorflow as tf - - -base_model = Image2Label - -base_params = { - "random_seed": 0, - "use_horovod": False, - "num_epochs": 100, - - "num_gpus": 8, - "batch_size_per_gpu": 32, - "dtype": "mixed", - "loss_scale": 1.0, - - "save_summaries_steps": 2000, - "print_loss_steps": 100, - "print_samples_steps": 2000, - "eval_steps": 5000, - "save_checkpoint_steps": 5000, - "logdir": "experiments/resnet50-imagenet", - - "optimizer": "Momentum", - "optimizer_params": { - "momentum": 0.90, - }, - "lr_policy": piecewise_constant, - "lr_policy_params": { - "learning_rate": 0.1, - "boundaries": [30, 60, 80, 90], - "decay_rates": [0.1, 0.01, 0.001, 1e-4], - }, - - "initializer": tf.variance_scaling_initializer, - - "regularizer": tf.contrib.layers.l2_regularizer, - "regularizer_params": { - 'scale': 0.0001, - }, - "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - "encoder": ResNetEncoder, - "encoder_params": { - 'resnet_size': 50, - "regularize_bn": False, - }, - "decoder": FullyConnectedDecoder, - "decoder_params": { - "output_dim": 1000, - }, - "loss": CrossEntropyLoss, - "data_layer": ImagenetDataLayer, - "data_layer_params": { - "data_dir": "data/tf-imagenet", - "image_size": 224, - "num_classes": 1000, - }, -} From 0669bd98a7126de321a876020189d014f4a5c4bd Mon Sep 17 00:00:00 2001 From: Kipok Date: Fri, 8 Jun 2018 15:16:41 -0700 Subject: [PATCH 040/102] Fix bugs with evaluation/inference processing --- open_seq2seq/utils/utils.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/open_seq2seq/utils/utils.py b/open_seq2seq/utils/utils.py index f8934ddaf..54cb37e77 100644 --- a/open_seq2seq/utils/utils.py +++ b/open_seq2seq/utils/utils.py @@ -125,26 +125,29 @@ def iterate_data(model, sess, compute_loss, mode, verbose): while True: tm = time.time() + fetches_vals = {} if size_defined: - 
fetches_vals = sess.run(fetches) + fetches_to_run = {} + # removing finished data layers + for worker_id in range(model.num_gpus): + if total_samples[worker_id] < dl_sizes[worker_id]: + fetches_to_run[worker_id] = fetches[worker_id] + fetches_vals = sess.run(fetches_to_run) else: # if size is not defined we have to process fetches sequentially, so not # to lose data when exception is thrown on one data layer - fetches_vals = [] - for one_fetch in fetches: + for worker_id, one_fetch in enumerate(fetches): try: - fetches_vals.append(sess.run(one_fetch)) + fetches_vals[worker_id] = sess.run(one_fetch) except tf.errors.OutOfRangeError: continue if step >= bench_start: total_time += time.time() - tm - skip_workers = 0 - # looping over num_gpus. In Horovod case this loop is "dummy", # since num_gpus = 1 - for worker_id, fetches_val in enumerate(fetches_vals): + for worker_id, fetches_val in fetches_vals.items(): if compute_loss: inputs, outputs, loss = fetches_val[:3] else: @@ -158,11 +161,6 @@ def iterate_data(model, sess, compute_loss, mode, verbose): total_samples[worker_id] += batch_size if size_defined: - # this data_layer is finished - if total_samples[worker_id] - batch_size > dl_sizes[worker_id]: - skip_workers += 1 - continue - # this data_layer is at the last batch with few more elements, cutting if total_samples[worker_id] > dl_sizes[worker_id]: last_batch_size = dl_sizes[worker_id] % batch_size @@ -182,7 +180,7 @@ def iterate_data(model, sess, compute_loss, mode, verbose): else: raise ValueError("Unknown mode: {}".format(mode)) - if len(fetches_vals) == 0 or skip_workers == model.num_gpus: + if len(fetches_vals) == 0: break if verbose: From a6fd0295b928171448d7a818318fb6f8b627d9de Mon Sep 17 00:00:00 2001 From: Kipok Date: Fri, 8 Jun 2018 15:22:23 -0700 Subject: [PATCH 041/102] Fix #133 for batch_size=1 --- open_seq2seq/models/text2text.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/open_seq2seq/models/text2text.py b/open_seq2seq/models/text2text.py index 14c183c47..40707bdfe 100644 --- a/open_seq2seq/models/text2text.py +++ b/open_seq2seq/models/text2text.py @@ -219,6 +219,11 @@ def _get_num_objects_per_step(self, worker_id=0): data_layer = self.get_data_layer(worker_id) # sum of source length in batch num_tokens = tf.reduce_sum(data_layer.input_tensors['source_tensors'][1]) - # sum of target length in batch - num_tokens += tf.reduce_sum(data_layer.input_tensors['target_tensors'][1]) + if self.mode != "infer": + # sum of target length in batch + num_tokens += tf.reduce_sum(data_layer.input_tensors['target_tensors'][1]) + else: + # TODO: this is not going to be correct when batch size > 1, since it will + # count padding? 
+ num_tokens += tf.reduce_sum(tf.shape(self.get_output_tensors(worker_id)[0])) return num_tokens From f02e45ed7327e7950c7553a526e6a70a49195d17 Mon Sep 17 00:00:00 2001 From: Kipok Date: Fri, 8 Jun 2018 15:33:33 -0700 Subject: [PATCH 042/102] Fix bug with unnecessary savings --- open_seq2seq/utils/hooks.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/open_seq2seq/utils/hooks.py b/open_seq2seq/utils/hooks.py index b841d3945..5ee72799f 100644 --- a/open_seq2seq/utils/hooks.py +++ b/open_seq2seq/utils/hooks.py @@ -92,7 +92,7 @@ def after_run(self, run_context, run_values): dict_to_log = self._model.maybe_print_logs(input_values, output_values) # optionally logging to tensorboard any values # returned from maybe_print_logs - if dict_to_log: + if self._model.params['save_summaries_steps'] and dict_to_log: log_summaries_from_dict( dict_to_log, self._model.params['logdir'], @@ -197,7 +197,8 @@ def after_run(self, run_context, run_values): dict_to_log['eval_loss'] = total_loss # saving the best validation model - if total_loss < self._best_eval_loss: + if self._model.params['save_checkpoint_steps'] and \ + total_loss < self._best_eval_loss: self._best_eval_loss = total_loss self._eval_saver.save( run_context.session, @@ -208,7 +209,7 @@ def after_run(self, run_context, run_values): # optionally logging to tensorboard any values # returned from maybe_print_logs - if dict_to_log: + if self._model.params['save_summaries_steps']: log_summaries_from_dict( dict_to_log, self._model.params['logdir'], From 33dbbe604b86c424fa0ccccc905249668a322d7a Mon Sep 17 00:00:00 2001 From: Kipok Date: Fri, 8 Jun 2018 15:48:32 -0700 Subject: [PATCH 043/102] Make command line overwrite non-base configs --- run.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/run.py b/run.py index ec8efe4d6..1df544670 100644 --- a/run.py +++ b/run.py @@ -171,12 +171,14 @@ def main(): if args.mode == 'train' or args.mode == 'train_eval': if 'train_params' in config_module: train_config.update(copy.deepcopy(config_module['train_params'])) + nested_update(train_config, nest_dict(vars(config_update))) if hvd is None or hvd.rank() == 0: deco_print("Training config:") pprint.pprint(train_config) if args.mode == 'eval' or args.mode == 'train_eval': if 'eval_params' in config_module: eval_config.update(copy.deepcopy(config_module['eval_params'])) + nested_update(eval_config, nest_dict(vars(config_update))) if hvd is None or hvd.rank() == 0: deco_print("Evaluation config:") pprint.pprint(eval_config) @@ -185,6 +187,7 @@ def main(): raise ValueError("\"infer_output_file\" command line parameter is " "required in inference mode") infer_config.update(copy.deepcopy(config_module['infer_params'])) + nested_update(infer_config, nest_dict(vars(config_update))) if hvd is None or hvd.rank() == 0: deco_print("Inference config:") pprint.pprint(infer_config) From abc0a00e32f3dbf90fd8addedec26b616b73b66e Mon Sep 17 00:00:00 2001 From: Kipok Date: Fri, 8 Jun 2018 16:51:44 -0700 Subject: [PATCH 044/102] Remove unused code from optimizers.py --- open_seq2seq/models/model.py | 7 - open_seq2seq/optimizers/optimizers.py | 467 ++++++-------------------- 2 files changed, 99 insertions(+), 375 deletions(-) diff --git a/open_seq2seq/models/model.py b/open_seq2seq/models/model.py index 244bddd4b..7f71d8941 100644 --- a/open_seq2seq/models/model.py +++ b/open_seq2seq/models/model.py @@ -372,16 +372,9 @@ def compile(self, force_var_reuse=False): dtype=self.params['dtype'], optimizer=self.params['optimizer'], 
optimizer_params=self.params.get('optimizer_params', {}), - gradient_noise_scale=None, - gradient_multipliers=None, clip_gradients=self.params.get('max_grad_norm', None), learning_rate_decay_fn=lr_policy, - update_ops=None, - variables=None, - name="Loss_Optimization", summaries=self.params.get('summaries', None), - colocate_gradients_with_ops=True, - increment_global_step=True, larc_params=self.params.get('larc_params', None), loss_scale=self.params.get('loss_scale', 1.0), automatic_loss_scaling=self.params.get('automatic_loss_scaling', None), diff --git a/open_seq2seq/optimizers/optimizers.py b/open_seq2seq/optimizers/optimizers.py index c448e9e04..58d8f6e6a 100644 --- a/open_seq2seq/optimizers/optimizers.py +++ b/open_seq2seq/optimizers/optimizers.py @@ -27,23 +27,8 @@ import six import tensorflow as tf - -from tensorflow.contrib import framework as contrib_framework -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import clip_ops from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import init_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops -from tensorflow.python.ops import variable_scope as vs -from tensorflow.python.ops import variables as vars_ -from tensorflow.python.summary import summary -from tensorflow.python.training import moving_averages -from tensorflow.python.training import optimizer as optimizer_ -from tensorflow.python.training import training as train + from .automatic_loss_scaler import AutomaticLossScaler from .mp_wrapper import MixedPrecisionOptimizerWrapper @@ -51,12 +36,12 @@ OPTIMIZER_CLS_NAMES = { - "Adagrad": train.AdagradOptimizer, - "Adam": train.AdamOptimizer, - "Ftrl": train.FtrlOptimizer, - "Momentum": train.MomentumOptimizer, - "RMSProp": train.RMSPropOptimizer, - "SGD": train.GradientDescentOptimizer, + "Adagrad": tf.train.AdagradOptimizer, + "Adam": tf.train.AdamOptimizer, + "Ftrl": tf.train.FtrlOptimizer, + "Momentum": tf.train.MomentumOptimizer, + "RMSProp": tf.train.RMSPropOptimizer, + "SGD": tf.train.GradientDescentOptimizer, } OPTIMIZER_SUMMARIES = [ @@ -81,10 +66,10 @@ def get_regularization_loss(scope=None, name="total_regularization_loss"): """ losses = tf.losses.get_regularization_losses(scope) if losses: - return math_ops.add_n(list(map(lambda x: tf.cast(x, tf.float32), losses)), - name=name) + return tf.add_n(list(map(lambda x: tf.cast(x, tf.float32), losses)), + name=name) else: - return constant_op.constant(0.0) + return tf.constant(0.0) class DistributedOptimizer(tf.train.Optimizer): @@ -156,176 +141,79 @@ def optimize_loss(loss, optimizer, optimizer_params, learning_rate_decay_fn, - global_step=None, dtype=tf.float32, - gradient_noise_scale=None, - gradient_multipliers=None, clip_gradients=None, - update_ops=None, - variables=None, - name=None, summaries=None, - colocate_gradients_with_ops=False, - increment_global_step=True, larc_params=None, loss_scale=1.0, automatic_loss_scaling=None, on_horovod=False): """Given loss and parameters for optimizer, returns a training op. - Various ways of passing optimizers include: - - - by string specifying the name of the optimizer. See OPTIMIZER_CLS_NAMES - for full list. E.g. `optimize_loss(..., optimizer='Adam')`. - - by function taking learning rate `Tensor` as argument and returning an - `Optimizer` instance. E.g. 
`optimize_loss(..., - optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5))`. - Alternatively, if `learning_rate` is `None`, the function takes no - arguments. E.g. `optimize_loss(..., learning_rate=None, - optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`. - - by a subclass of `Optimizer` having a single-argument constructor - (the argument is the learning rate), such as AdamOptimizer or - AdagradOptimizer. E.g. `optimize_loss(..., - optimizer=tf.train.AdagradOptimizer)`. - - by an instance of a subclass of `Optimizer`. - E.g., `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`. - Args: loss: Scalar `Tensor`. - global_step: Scalar int `Tensor`, step counter to update on each step - unless `increment_global_step` is `False`. If not supplied, - it will be fetched from the default graph (see - `tf.train.get_global_step` for details). If it has - not been created, no step will be incremented with each weight - update. `learning_rate_decay_fn` requires `global_step`. - learning_rate: float or `Tensor`, magnitude of update per each training - step. Can be `None`. - optimizer: string, class or optimizer instance, used as trainer. - string should be name of optimizer, like 'SGD', - 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant. - class should be sub-class of `tf.Optimizer` that implements - `compute_gradients` and `apply_gradients` functions. - optimizer instance should be instantiation of `tf.Optimizer` - sub-class and have `compute_gradients` and `apply_gradients` - functions. - gradient_noise_scale: float or None, adds 0-mean normal noise scaled by this - value. - gradient_multipliers: dict of variables or variable names to floats. - If present, gradients for specified - variables will be multiplied by given constant. - clip_gradients: float, callable or `None`. If float, is provided, a global - clipping is applied to prevent the norm of the gradient to exceed this - value. Alternatively, a callable can be provided e.g.: adaptive_clipping. - This callable takes a `list` of `(gradients, variables)` `tuple`s and - returns the same thing with the gradients modified. - learning_rate_decay_fn: function, takes `learning_rate` and `global_step` - `Tensor`s, returns `Tensor`. - Can be used to implement any learning rate decay - functions. - For example: `tf.train.exponential_decay`. - Ignored if `learning_rate` is not supplied. - update_ops: list of update `Operation`s to execute at each step. If `None`, - uses elements of UPDATE_OPS collection. The order of execution - between `update_ops` and `loss` is non-deterministic. - variables: list of variables to optimize or - `None` to use all trainable variables. - name: The name for this operation is used to scope operations and summaries. + optimizer: string or class of optimizer, used as trainer. + string should be name of optimizer, like 'SGD', + 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant. + class should be sub-class of `tf.Optimizer` that implements + `compute_gradients` and `apply_gradients` functions. + optimizer_params: parameters of the optimizer. + clip_gradients: float, max gradient norm to clip to. + learning_rate_decay_fn: function, takes `global_step` + `Tensor`s, returns `Tensor`. + Can be used to implement any learning rate decay + functions. + For example: `tf.train.exponential_decay`. + Ignored if `learning_rate` is not supplied. summaries: List of internal quantities to visualize on tensorboard. If not - set only the loss and the learning rate will be reported. 
The - complete list is in OPTIMIZER_SUMMARIES. - colocate_gradients_with_ops: If True, try colocating gradients with the - corresponding op. - increment_global_step: Whether to increment `global_step`. If your model - calls `optimize_loss` multiple times per training step (e.g. to optimize - different parts of the model), use this arg to avoid incrementing - `global_step` more times than necessary. - LARC_mode: 'scale' or 'clip' - LARC_nu: If not None, LARC re-scaling will be - applied https://arxiv.org/pdf/1708.03888.pdf with nu=LARC_nu + set only the loss and the learning rate will be reported. The + complete list is in OPTIMIZER_SUMMARIES. + larc_params: If not None, LARC re-scaling will + be applied with corresponding parameters. automatic_loss_scaling: if not None, use the corresponding automatic - loss scaling algorithm. Must be one of 'Backoff' - of 'LogMax'. `dtype` must be "mixed" to use ALS. + loss scaling algorithm. Must be one of 'Backoff' + of 'LogMax'. `dtype` must be "mixed" to use ALS. + Returns: Training op. - - Raises: - ValueError: if: - * `loss` is an invalid type or shape. - * `global_step` is an invalid type or shape. - * `learning_rate` is an invalid type or value. - * `optimizer` has the wrong type. - * `clip_gradients` is neither float nor callable. - * `learning_rate` and `learning_rate_decay_fn` are supplied, but no - `global_step` is available. - * `gradients` is empty. """ - loss = ops.convert_to_tensor(loss) - contrib_framework.assert_scalar(loss) - if global_step is None: - global_step = tf.train.get_or_create_global_step() + if summaries is None: + summaries = ["learning_rate", "global_gradient_norm"] else: - tf.train.assert_global_step(global_step) - with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]): - # Update ops take UPDATE_OPS collection if not provided. - if update_ops is None: - update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS)) - # Make sure update ops are ran before computing loss. - if update_ops: - loss = control_flow_ops.with_dependencies(list(update_ops), loss) - - if summaries is None: - summaries = ["learning_rate", "global_gradient_norm"] - else: - for summ in summaries: - if summ not in OPTIMIZER_SUMMARIES: - raise ValueError("Summaries should be one of [%s], you provided %s." % - (", ".join(OPTIMIZER_SUMMARIES), summ)) - if global_step is None: - raise ValueError("global_step is required for learning_rate_decay_fn.") - lr = learning_rate_decay_fn(global_step) + for summ in summaries: + if summ not in OPTIMIZER_SUMMARIES: + raise ValueError( + "Summaries should be one of [{}], you provided {}.".format( + ", ".join(OPTIMIZER_SUMMARIES), summ, + )) + if clip_gradients is not None and larc_params is not None: + raise AttributeError( + "LARC and gradient norm clipping should not be used together" + ) + + global_step = tf.train.get_or_create_global_step() + lr = learning_rate_decay_fn(global_step) + if "learning_rate" in summaries: + tf.summary.scalar("learning_rate", lr) - if "learning_rate" in summaries: - summary.scalar("learning_rate", lr) + with tf.variable_scope("Loss_Optimization"): + update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) + loss = control_flow_ops.with_dependencies(list(update_ops), loss) # Create optimizer, given specified parameters. if isinstance(optimizer, six.string_types): - if lr is None: - raise ValueError("Learning rate is None, but should be specified if " - "optimizer is string (%s)." 
% optimizer) if optimizer not in OPTIMIZER_CLS_NAMES: raise ValueError( - "Optimizer name should be one of [%s], you provided %s." % - (", ".join(OPTIMIZER_CLS_NAMES), optimizer)) - opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr, **optimizer_params) - elif (isinstance(optimizer, type) and - issubclass(optimizer, optimizer_.Optimizer)): - if lr is None: - raise ValueError("Learning rate is None, but should be specified if " - "optimizer is class (%s)." % optimizer) - opt = optimizer(learning_rate=lr, **optimizer_params) - elif isinstance(optimizer, optimizer_.Optimizer): - opt = optimizer - elif callable(optimizer): - if lr is not None: - opt = optimizer(lr, **optimizer_params) - else: - opt = optimizer(**optimizer_params) - if not isinstance(opt, optimizer_.Optimizer): - raise ValueError("Unrecognized optimizer: function should return " - "subclass of Optimizer. Got %s." % str(opt)) - else: - raise ValueError("Unrecognized optimizer: should be string, " - "subclass of Optimizer, instance of " - "subclass of Optimizer or function with one argument. " - "Got %s." % str(optimizer)) - # All trainable variables, if specific variables are not specified. - if variables is None: - variables = vars_.trainable_variables() + "Optimizer name should be one of [{}], you provided {}.".format( + ", ".join(OPTIMIZER_CLS_NAMES), optimizer + )) + optimizer = OPTIMIZER_CLS_NAMES[optimizer] + opt = optimizer(learning_rate=lr, **optimizer_params) if automatic_loss_scaling is not None: if automatic_loss_scaling not in AutomaticLossScaler.SUPPORTED_ALGOS: raise ValueError("Unknown automatic loss scaling algorithm: %s." - % automatic_loss_sclaing) + % automatic_loss_scaling) if dtype != "mixed": raise ValueError("Automatic loss scaling can be used only with " "dtype=mixed.") @@ -337,54 +225,27 @@ class should be sub-class of `tf.Optimizer` that implements opt = DistributedOptimizer(opt) # Compute gradients. - gradients = opt.compute_gradients( - loss, variables, - colocate_gradients_with_ops=colocate_gradients_with_ops, - ) - - # Optionally add gradient noise. - if gradient_noise_scale is not None: - gradients = _add_scaled_noise_to_gradients(gradients, - gradient_noise_scale) + grads_and_vars = opt.compute_gradients(loss, + colocate_gradients_with_ops=True) - # Multiply some gradients. - if gradient_multipliers is not None: - gradients = _multiply_gradients(gradients, gradient_multipliers) - if not gradients: - raise ValueError( - "Empty list of (gradient, var) pairs encountered. This is most " - "likely to be caused by an improper value of gradient_multipliers.") - - if "global_gradient_norm" in summaries or "gradient_norm" in summaries: - summary.scalar( - "global_norm/gradient_norm", - clip_ops.global_norm(list(map( - lambda x: tf.cast(x, tf.float32), - list(zip(*gradients))[0]) - )), + if "global_gradient_norm" in summaries: + tf.summary.scalar( + "global_gradient_norm", + _global_norm_with_cast(grads_and_vars), ) # Optionally clip gradients by global norm. 
- if clip_gradients is not None and larc_params is not None: - raise AttributeError( - "LARC and gradient norm clipping should not be used together" - ) - if isinstance(clip_gradients, float): - gradients = _clip_gradients_by_norm(gradients, clip_gradients) - elif callable(clip_gradients): - gradients = clip_gradients(gradients) - elif clip_gradients is not None: - raise ValueError( - "Unknown type %s for clip_gradients" % type(clip_gradients)) + if clip_gradients is not None: + grads_and_vars = _clip_gradients_by_norm(grads_and_vars, clip_gradients) # Add histograms for variables, gradients and gradient norms. - for gradient, variable in gradients: - if isinstance(gradient, ops.IndexedSlices): + for gradient, variable in grads_and_vars: + if isinstance(gradient, tf.IndexedSlices): grad_values = gradient.values else: grad_values = gradient - if isinstance(variable, ops.IndexedSlices): + if isinstance(variable, tf.IndexedSlices): var_values = variable.values else: var_values = variable @@ -392,24 +253,20 @@ class should be sub-class of `tf.Optimizer` that implements if grad_values is not None: var_name = variable.name.replace(":", "_") if "gradients" in summaries: - summary.histogram("gradients/%s" % var_name, mask_nans(grad_values)) + # need to mask nans for automatic loss scaling + # TODO: add summary for skipped update!!! + tf.summary.histogram("gradients/%s" % var_name, mask_nans(grad_values)) if "gradient_norm" in summaries: - summary.scalar("gradient_norm/%s" % var_name, - clip_ops.global_norm([grad_values])) + tf.summary.scalar("gradient_norm/%s" % var_name, tf.norm(grad_values)) if "variables" in summaries: - summary.histogram("variables/%s" % var_name, var_values) + tf.summary.histogram("variables/%s" % var_name, var_values) if "variable_norm" in summaries: - summary.scalar("variable_norm/%s" % var_name, - clip_ops.global_norm([var_values])) - - if clip_gradients is not None and ("global_gradient_norm" in summaries or - "gradient_norm" in summaries): - summary.scalar( - "global_norm/clipped_gradient_norm", - clip_ops.global_norm(list(map( - lambda x: tf.cast(x, tf.float32), - list(zip(*gradients))[0]) - )), + tf.summary.scalar("variable_norm/%s" % var_name, tf.norm(var_values)) + + if clip_gradients is not None and "global_gradient_norm" in summaries: + tf.summary.scalar( + "global_clipped_gradient_norm", + _global_norm_with_cast(grads_and_vars), ) # LARC gradient re-scaling @@ -428,7 +285,7 @@ class should be sub-class of `tf.Optimizer` that implements min_update = larc_params.get('min_update', 1e-7) eps = larc_params.get('epsilon', 1e-7) - for idx, (g, v) in enumerate(gradients): + for idx, (g, v) in enumerate(grads_and_vars): var_dtype = v.dtype v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2) g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2) @@ -439,8 +296,8 @@ class should be sub-class of `tf.Optimizer` that implements min_update, ) if "larc_summaries" in summaries: - summary.scalar('larc_clip_on/{}'.format(v.name), - tf.cast(tf.less(larc_grad_update, 1.0), tf.int32)) + tf.summary.scalar('larc_clip_on/{}'.format(v.name), + tf.cast(tf.less(larc_grad_update, 1.0), tf.int32)) larc_grad_update = tf.minimum(larc_grad_update, 1.0) else: larc_grad_update = tf.maximum( @@ -448,163 +305,37 @@ class should be sub-class of `tf.Optimizer` that implements min_update, ) larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype) - gradients[idx] = (larc_grad_update * g, v) + grads_and_vars[idx] = (larc_grad_update * g, v) # adding additional summary if "larc_summaries" in 
summaries: - summary.scalar('larc_grad_update/{}'.format(v.name), larc_grad_update) - summary.scalar("larc_final_lr/{}".format(v.name), - tf.cast(lr, var_dtype) * larc_grad_update) + tf.summary.scalar('larc_grad_update/{}'.format(v.name), + larc_grad_update) + tf.summary.scalar("larc_final_lr/{}".format(v.name), + tf.cast(lr, var_dtype) * larc_grad_update) # Create gradient updates. grad_updates = opt.apply_gradients( - gradients, - global_step=global_step if increment_global_step else None, - name="train") + grads_and_vars, + global_step=global_step, + name="train", + ) - # # Ensure the train_tensor computes grad_updates. + # Ensure the train_tensor computes grad_updates. train_tensor = control_flow_ops.with_dependencies([grad_updates], loss) return train_tensor +def _global_norm_with_cast(grads_and_vars): + return tf.global_norm(list(map( + lambda x: tf.cast(x, tf.float32), + list(zip(*grads_and_vars))[0]) + )) + + def _clip_gradients_by_norm(grads_and_vars, clip_gradients): """Clips gradients by global norm.""" gradients, variables = zip(*grads_and_vars) - clipped_gradients, _ = clip_ops.clip_by_global_norm(gradients, clip_gradients) + clipped_gradients, _ = tf.clip_by_global_norm(gradients, clip_gradients) return list(zip(clipped_gradients, variables)) - - -def _adaptive_max_norm(norm, std_factor, decay, global_step, epsilon, name): - """Find max_norm given norm and previous average.""" - with vs.variable_scope(name, "AdaptiveMaxNorm", [norm]): - log_norm = math_ops.log(norm + epsilon) - - def moving_average(name, value, decay): - moving_average_variable = vs.get_variable( - name, - shape=value.get_shape(), - dtype=value.dtype, - initializer=init_ops.zeros_initializer(), - trainable=False) - return moving_averages.assign_moving_average( - moving_average_variable, value, decay, zero_debias=False) - - # quicker adaptation at the beginning - if global_step is not None: - n = math_ops.to_float(global_step) - decay = math_ops.minimum(decay, n / (n + 1.)) - - # update averages - mean = moving_average("mean", log_norm, decay) - sq_mean = moving_average("sq_mean", math_ops.square(log_norm), decay) - - variance = sq_mean - math_ops.square(mean) - std = math_ops.sqrt(math_ops.maximum(epsilon, variance)) - max_norms = math_ops.exp(mean + std_factor * std) - return max_norms, mean - - -def adaptive_clipping_fn(std_factor=2., - decay=0.95, - static_max_norm=None, - global_step=None, - report_summary=False, - epsilon=1e-8, - name=None): - """Adapt the clipping value using statistics on the norms. - - Implement adaptive gradient as presented in section 3.2.1 of - https://arxiv.org/abs/1412.1602. - - Keeps a moving average of the mean and std of the log(norm) of the gradient. - If the norm exceeds `exp(mean + std_factor*std)` then all gradients will be - rescaled such that the global norm becomes `exp(mean)`. - - Args: - std_factor: Python scaler (or tensor). - `max_norm = exp(mean + std_factor*std)` - decay: The smoothing factor of the moving averages. - static_max_norm: If provided, will threshold the norm to this value as an - extra safety. - global_step: Optional global_step. If provided, `decay = decay*n/(n+1)`. - This provides a quicker adaptation of the mean for the first steps. - report_summary: If `True`, will add histogram summaries of the `max_norm`. - epsilon: Small value chosen to avoid zero variance. - name: The name for this operation is used to scope operations and summaries. - - Returns: - A function for applying gradient clipping. 
- """ - - def gradient_clipping(grads_and_vars): - """Internal function for adaptive clipping.""" - grads, variables = zip(*grads_and_vars) - - norm = clip_ops.global_norm(grads) - - max_norm, log_mean = _adaptive_max_norm(norm, std_factor, decay, - global_step, epsilon, name) - - # reports the max gradient norm for debugging - if report_summary: - summary.scalar("global_norm/adaptive_max_gradient_norm", max_norm) - - # factor will be 1. if norm is smaller than max_norm - factor = array_ops.where(norm < max_norm, - array_ops.ones_like(norm), - math_ops.exp(log_mean) / norm) - - if static_max_norm is not None: - factor = math_ops.minimum(static_max_norm / norm, factor) - - # apply factor - clipped_grads = [] - for grad in grads: - if grad is None: - clipped_grads.append(None) - elif isinstance(grad, ops.IndexedSlices): - clipped_grads.append( - ops.IndexedSlices(grad.values * factor, grad.indices, - grad.dense_shape)) - else: - clipped_grads.append(grad * factor) - - return list(zip(clipped_grads, variables)) - - return gradient_clipping - - -def _add_scaled_noise_to_gradients(grads_and_vars, gradient_noise_scale): - """Adds scaled noise from a 0-mean normal distribution to gradients.""" - gradients, variables = zip(*grads_and_vars) - noisy_gradients = [] - for gradient in gradients: - if gradient is None: - noisy_gradients.append(None) - continue - if isinstance(gradient, ops.IndexedSlices): - gradient_shape = gradient.dense_shape - else: - gradient_shape = gradient.get_shape() - noise = random_ops.truncated_normal(gradient_shape) * gradient_noise_scale - noisy_gradients.append(gradient + noise) - return list(zip(noisy_gradients, variables)) - - -def _multiply_gradients(grads_and_vars, gradient_multipliers): - """Multiply specified gradients.""" - multiplied_grads_and_vars = [] - for grad, var in grads_and_vars: - if grad is not None and \ - (var in gradient_multipliers or var.name in gradient_multipliers): - key = var if var in gradient_multipliers else var.name - multiplier = constant_op.constant( - gradient_multipliers[key], dtype=dtypes.float32) - if isinstance(grad, ops.IndexedSlices): - grad_values = grad.values * multiplier - grad = ops.IndexedSlices(grad_values, grad.indices, grad.dense_shape) - else: - grad *= multiplier - multiplied_grads_and_vars.append((grad, var)) - return multiplied_grads_and_vars From cca27222f8da0eca3630245a27f807055246638f Mon Sep 17 00:00:00 2001 From: Kipok Date: Fri, 8 Jun 2018 17:59:02 -0700 Subject: [PATCH 045/102] Make a single parameter for static/dynamic scaling --- .../image2label/resnet-50-v2-mp.py | 2 +- .../text2text/en-de-gnmt-like-4GPUs.py | 8 ++--- example_configs/text2text/en-de-nmt-small.py | 4 +-- example_configs/text2text/nmt-reversal-TT.py | 4 +-- example_configs/text2text/transformer-big.py | 2 +- open_seq2seq/models/model.py | 16 ++++----- .../optimizers/automatic_loss_scaler.py | 18 +++++----- open_seq2seq/optimizers/mp_wrapper.py | 5 +-- open_seq2seq/optimizers/optimizers.py | 34 ++++++++----------- 9 files changed, 42 insertions(+), 51 deletions(-) diff --git a/example_configs/image2label/resnet-50-v2-mp.py b/example_configs/image2label/resnet-50-v2-mp.py index 2cf9abb48..3666f9b7d 100644 --- a/example_configs/image2label/resnet-50-v2-mp.py +++ b/example_configs/image2label/resnet-50-v2-mp.py @@ -17,7 +17,7 @@ "num_gpus": 8, "batch_size_per_gpu": 32, "dtype": "mixed", - "automatic_loss_scaling": "Backoff", + "loss_scaling": "Backoff", "save_summaries_steps": 2000, "print_loss_steps": 100, diff --git 
a/example_configs/text2text/en-de-gnmt-like-4GPUs.py b/example_configs/text2text/en-de-gnmt-like-4GPUs.py index d6a904dd3..91a05f2e7 100644 --- a/example_configs/text2text/en-de-gnmt-like-4GPUs.py +++ b/example_configs/text2text/en-de-gnmt-like-4GPUs.py @@ -37,12 +37,12 @@ "use_staircase_decay": True, "min_lr": 0.0000005, }, - #"summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + # "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], "max_grad_norm": 32768.0, "dtype": tf.float32, - #"dtype": "mixed", - #"automatic_loss_scaling": "Backoff", + # "dtype": "mixed", + # "loss_scaling": "Backoff", "encoder": GNMTLikeEncoderWithEmbedding, "encoder_params": { "initializer": tf.random_uniform_initializer, diff --git a/example_configs/text2text/en-de-nmt-small.py b/example_configs/text2text/en-de-nmt-small.py index 0e01d82b9..a8e1a43b0 100644 --- a/example_configs/text2text/en-de-nmt-small.py +++ b/example_configs/text2text/en-de-nmt-small.py @@ -36,8 +36,8 @@ "larc_eta": 0.001, }, "dtype": tf.float32, - #"dtype": "mixed", - #"automatic_loss_scaling": "Backoff", + # "dtype": "mixed", + # "loss_scaling": "Backoff", "encoder": BidirectionalRNNEncoderWithEmbedding, "encoder_params": { diff --git a/example_configs/text2text/nmt-reversal-TT.py b/example_configs/text2text/nmt-reversal-TT.py index a995d6b6f..61ae37ebb 100644 --- a/example_configs/text2text/nmt-reversal-TT.py +++ b/example_configs/text2text/nmt-reversal-TT.py @@ -29,8 +29,8 @@ "save_checkpoint_steps": 300, "logdir": "ReversalTask-Transformer-MP", "dtype": tf.float32, - #"dtype": "mixed", - #"automatic_loss_scaling": "Backoff", + # "dtype": "mixed", + # "loss_scaling": "Backoff", "optimizer": tf.contrib.opt.LazyAdamOptimizer, "optimizer_params": { diff --git a/example_configs/text2text/transformer-big.py b/example_configs/text2text/transformer-big.py index fd819bca3..6f2e3e52e 100644 --- a/example_configs/text2text/transformer-big.py +++ b/example_configs/text2text/transformer-big.py @@ -33,7 +33,7 @@ "logdir": "Transformer-FP32", "dtype": tf.float32, # "dtype": "mixed", - # "automatic_loss_scaling": "Backoff", + # "loss_scaling": "Backoff", "optimizer": tf.contrib.opt.LazyAdamOptimizer, "optimizer_params": { "beta1": 0.9, diff --git a/open_seq2seq/models/model.py b/open_seq2seq/models/model.py index 7f71d8941..9705345f9 100644 --- a/open_seq2seq/models/model.py +++ b/open_seq2seq/models/model.py @@ -80,8 +80,7 @@ class :meth:`__init__` method. 'lr_policy_params': dict, 'max_grad_norm': float, 'larc_params': dict, - 'loss_scale': float, - 'automatic_loss_scaling': [None, 'Backoff', 'LogMax'], + 'loss_scaling': None, # float, "Backoff" or "LogMax" 'summaries': list, } @@ -161,11 +160,11 @@ class docs. * **max_grad_norm** (float) --- maximum value of gradient norm. Clipping will be performed if some gradients exceed this value (this is checked for each variable independently). - * **loss_scale** (float) --- static loss scale to use. For details see - :ref:`mixed precision training ` section in docs. - * **automatic_loss_scaling** --- automatic loss scaling mode. Could be - either None, "Backoff" or "Logmax". For details see - :ref:`mixed precision training ` section in docs. + * **loss_scaling** --- could be float or string. If float, static loss + scaling is applied. If string, the corresponding automatic + loss scaling algorithm is used. 
Must be one of 'Backoff' + of 'LogMax' (case insensitive). Only used when dtype="mixed". For details + see :ref:`mixed precision training ` section in docs. * **summaries** (list) --- which summaries to log. Could contain "learning_rate", "gradients", "gradient_norm", "global_gradient_norm", "variables", "variable_norm". @@ -376,8 +375,7 @@ def compile(self, force_var_reuse=False): learning_rate_decay_fn=lr_policy, summaries=self.params.get('summaries', None), larc_params=self.params.get('larc_params', None), - loss_scale=self.params.get('loss_scale', 1.0), - automatic_loss_scaling=self.params.get('automatic_loss_scaling', None), + loss_scaling=self.params.get('loss_scaling', 1.0), on_horovod=self.on_horovod, ) tf.summary.scalar(name="train_loss", tensor=self.loss) diff --git a/open_seq2seq/optimizers/automatic_loss_scaler.py b/open_seq2seq/optimizers/automatic_loss_scaler.py index 740e63508..247f387ed 100644 --- a/open_seq2seq/optimizers/automatic_loss_scaler.py +++ b/open_seq2seq/optimizers/automatic_loss_scaler.py @@ -8,25 +8,25 @@ import tensorflow as tf -class AutomaticLossScaler: - SUPPORTED_ALGOS = ['Backoff', 'LogMax'] +class AutomaticLossScaler(object): + SUPPORTED_ALGOS = ['backoff', 'logmax'] def __init__(self, algorithm='Backoff', scale_min=1.0, scale_max=2.**24): - if algorithm == 'Backoff': + algorithm = algorithm.lower() + if algorithm == 'backoff': self.scaler = BackoffScaler(scale_min=scale_min, scale_max=scale_max, step_factor=2.0, step_window=2000) - elif algorithm == 'LogMax': + elif algorithm == 'logmax': self.scaler = LogMaxScaler(scale_min=scale_min, scale_max=scale_max, log_max=16., beta1=0.99, beta2=0.999, - overflow_std_dev=3.09) # ppf(.999) + overflow_std_dev=3.09) # ppf(.999) else: - raise ValueError('Unknown dynamic scaling algorithm: %s' - % algorithm_name) + raise ValueError('Unknown scaling algorithm: {}'.format(algorithm)) def update_op(self, has_nan, amax): return self.scaler.update_op(has_nan, amax) @@ -55,7 +55,7 @@ def check_grads(grads_and_vars): return has_nan, amax -class BackoffScaler: +class BackoffScaler(object): def __init__(self, scale_min, scale_max, step_factor, step_window): self.scale_min = scale_min self.scale_max = scale_max @@ -105,7 +105,7 @@ def loss_scale(self): return self.scale -class LogMaxScaler: +class LogMaxScaler(object): def __init__(self, scale_min, scale_max, log_max, beta1, beta2, overflow_std_dev): self.scale_min = scale_min self.scale_max = scale_max diff --git a/open_seq2seq/optimizers/mp_wrapper.py b/open_seq2seq/optimizers/mp_wrapper.py index 633e3a3e5..f947a3e9a 100644 --- a/open_seq2seq/optimizers/mp_wrapper.py +++ b/open_seq2seq/optimizers/mp_wrapper.py @@ -80,7 +80,6 @@ def compute_gradients(self, loss, var_list=None, return grads_and_vars_fp32 def apply_gradients(self, grads_and_vars, global_step=None, name=None): - def apply_ops_wrapper(): update_op = self._optimizer.apply_gradients(grads_and_vars, global_step, name) @@ -101,9 +100,7 @@ def apply_ops_wrapper(): loss_scale_update_op = self._loss_scaler.update_op(grad_has_nans, grad_amax) with tf.control_dependencies([loss_scale_update_op]): - return tf.cond(should_skip_update, - tf.no_op, - apply_ops_wrapper) + return tf.cond(should_skip_update, tf.no_op, apply_ops_wrapper) else: return apply_ops_wrapper() diff --git a/open_seq2seq/optimizers/optimizers.py b/open_seq2seq/optimizers/optimizers.py index 58d8f6e6a..c022ff946 100644 --- a/open_seq2seq/optimizers/optimizers.py +++ b/open_seq2seq/optimizers/optimizers.py @@ -145,8 +145,7 @@ def optimize_loss(loss, 
clip_gradients=None, summaries=None, larc_params=None, - loss_scale=1.0, - automatic_loss_scaling=None, + loss_scaling=1.0, on_horovod=False): """Given loss and parameters for optimizer, returns a training op. @@ -158,24 +157,27 @@ def optimize_loss(loss, class should be sub-class of `tf.Optimizer` that implements `compute_gradients` and `apply_gradients` functions. optimizer_params: parameters of the optimizer. - clip_gradients: float, max gradient norm to clip to. + dtype: model dtype (tf.float16, tf.float32 or "mixed"). learning_rate_decay_fn: function, takes `global_step` `Tensor`s, returns `Tensor`. Can be used to implement any learning rate decay functions. For example: `tf.train.exponential_decay`. Ignored if `learning_rate` is not supplied. + clip_gradients: float, max gradient norm to clip to. summaries: List of internal quantities to visualize on tensorboard. If not set only the loss and the learning rate will be reported. The complete list is in OPTIMIZER_SUMMARIES. larc_params: If not None, LARC re-scaling will be applied with corresponding parameters. - automatic_loss_scaling: if not None, use the corresponding automatic - loss scaling algorithm. Must be one of 'Backoff' - of 'LogMax'. `dtype` must be "mixed" to use ALS. + loss_scaling: could be float or string. If float, static loss scaling + is applied. If string, the corresponding automatic + loss scaling algorithm is used. Must be one of 'Backoff' + of 'LogMax' (case insensitive). Only used when dtype="mixed". + on_horovod: whether the model is run on horovod. Returns: - Training op. + training op. """ if summaries is None: summaries = ["learning_rate", "global_gradient_norm"] @@ -210,23 +212,18 @@ class should be sub-class of `tf.Optimizer` that implements optimizer = OPTIMIZER_CLS_NAMES[optimizer] opt = optimizer(learning_rate=lr, **optimizer_params) - if automatic_loss_scaling is not None: - if automatic_loss_scaling not in AutomaticLossScaler.SUPPORTED_ALGOS: - raise ValueError("Unknown automatic loss scaling algorithm: %s." - % automatic_loss_scaling) - if dtype != "mixed": - raise ValueError("Automatic loss scaling can be used only with " - "dtype=mixed.") - loss_scale = AutomaticLossScaler(algorithm=automatic_loss_scaling) + if isinstance(loss_scaling, six.string_types): + loss_scaling = AutomaticLossScaler(algorithm=loss_scaling) if dtype == 'mixed': - opt = MixedPrecisionOptimizerWrapper(opt, loss_scale=loss_scale) + opt = MixedPrecisionOptimizerWrapper(opt, loss_scale=loss_scaling) if on_horovod: opt = DistributedOptimizer(opt) # Compute gradients. - grads_and_vars = opt.compute_gradients(loss, - colocate_gradients_with_ops=True) + grads_and_vars = opt.compute_gradients( + loss, colocate_gradients_with_ops=True, + ) if "global_gradient_norm" in summaries: tf.summary.scalar( @@ -254,7 +251,6 @@ class should be sub-class of `tf.Optimizer` that implements var_name = variable.name.replace(":", "_") if "gradients" in summaries: # need to mask nans for automatic loss scaling - # TODO: add summary for skipped update!!! 
tf.summary.histogram("gradients/%s" % var_name, mask_nans(grad_values)) if "gradient_norm" in summaries: tf.summary.scalar("gradient_norm/%s" % var_name, tf.norm(grad_values)) From c8d9f171322858bc8a7e32d19d141f21c3263646 Mon Sep 17 00:00:00 2001 From: Kipok Date: Mon, 11 Jun 2018 11:06:41 -0700 Subject: [PATCH 046/102] Fix bug with map on python3 --- open_seq2seq/data/speech2text/speech2text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/open_seq2seq/data/speech2text/speech2text.py b/open_seq2seq/data/speech2text/speech2text.py index 173edb334..f6f039fc1 100644 --- a/open_seq2seq/data/speech2text/speech2text.py +++ b/open_seq2seq/data/speech2text/speech2text.py @@ -125,7 +125,7 @@ def build_graph(self): ) else: indices = self.split_data( - np.array(map(lambda num: str(num), range(len(self.all_files)))) + np.array(list(map(lambda num: str(num), range(len(self.all_files))))) ) self._dataset = tf.data.Dataset.from_tensor_slices( np.hstack((indices[:, np.newaxis], self._files[:, np.newaxis])) From 7a16cf4a28e47ec35e68982fb032995faaefda04 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Mon, 11 Jun 2018 11:26:18 -0700 Subject: [PATCH 047/102] Update automatic_loss_scaler.py --- open_seq2seq/optimizers/automatic_loss_scaler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/open_seq2seq/optimizers/automatic_loss_scaler.py b/open_seq2seq/optimizers/automatic_loss_scaler.py index 247f387ed..5da055d79 100644 --- a/open_seq2seq/optimizers/automatic_loss_scaler.py +++ b/open_seq2seq/optimizers/automatic_loss_scaler.py @@ -12,7 +12,7 @@ class AutomaticLossScaler(object): SUPPORTED_ALGOS = ['backoff', 'logmax'] def __init__(self, algorithm='Backoff', scale_min=1.0, scale_max=2.**24): - algorithm = algorithm.lower() + algorithm = algorithm.lower().strip() if algorithm == 'backoff': self.scaler = BackoffScaler(scale_min=scale_min, scale_max=scale_max, From 2e159e10ea6c410c927033bf7570d1181fe59c24 Mon Sep 17 00:00:00 2001 From: Kipok Date: Mon, 11 Jun 2018 15:37:38 -0700 Subject: [PATCH 048/102] Add iter_size to optimize_loss --- open_seq2seq/optimizers/optimizers.py | 78 ++++++++++++++++++--- open_seq2seq/optimizers/optimizers_test.py | 81 ++++++++++++++++++++++ 2 files changed, 150 insertions(+), 9 deletions(-) create mode 100644 open_seq2seq/optimizers/optimizers_test.py diff --git a/open_seq2seq/optimizers/optimizers.py b/open_seq2seq/optimizers/optimizers.py index c022ff946..b5406f328 100644 --- a/open_seq2seq/optimizers/optimizers.py +++ b/open_seq2seq/optimizers/optimizers.py @@ -137,6 +137,28 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None): return self._optimizer.apply_gradients(grads_and_vars, global_step, name) +def reduce_gradients(grads_and_vars, on_horovod): + if on_horovod: + from horovod.common import size + from horovod.tensorflow import allreduce + + if size() > 1: + averaged_grads_and_vars = [] + with tf.name_scope("all_reduce"): + for grad, var in grads_and_vars: + if grad is not None: + avg_grad = allreduce(grad) + averaged_grads_and_vars.append((avg_grad, var)) + else: + averaged_grads_and_vars.append((None, var)) + return averaged_grads_and_vars + else: + return grads_and_vars + else: + # TODO: implement this + pass + + def optimize_loss(loss, optimizer, optimizer_params, @@ -146,7 +168,9 @@ def optimize_loss(loss, summaries=None, larc_params=None, loss_scaling=1.0, - on_horovod=False): + on_horovod=False, + iter_size=1, + skip_update_ph=None): """Given loss and parameters for optimizer, returns a training op. 
Args: @@ -217,8 +241,6 @@ class should be sub-class of `tf.Optimizer` that implements if dtype == 'mixed': opt = MixedPrecisionOptimizerWrapper(opt, loss_scale=loss_scaling) - if on_horovod: - opt = DistributedOptimizer(opt) # Compute gradients. grads_and_vars = opt.compute_gradients( @@ -310,12 +332,50 @@ class should be sub-class of `tf.Optimizer` that implements tf.summary.scalar("larc_final_lr/{}".format(v.name), tf.cast(lr, var_dtype) * larc_grad_update) - # Create gradient updates. - grad_updates = opt.apply_gradients( - grads_and_vars, - global_step=global_step, - name="train", - ) + if on_horovod: + if iter_size > 1: + grads_and_vars_accum = [] + accum_ops = [] + for grad, var in grads_and_vars: + grad_accum = tf.get_variable( + grad.name.split(":")[0] + "_accum", shape=grad.shape, + dtype=grad.dtype, initializer=tf.zeros_initializer(), + trainable=False, + ) + accum_ops.append(tf.assign(grad_accum, grad_accum + grad / iter_size)) + grads_and_vars_accum.append((grad_accum, var)) + + accum_op = tf.group(accum_ops) + + def clear_op(): + with tf.control_dependencies([accum_op]): + red_grad_updates = opt.apply_gradients( + reduce_gradients(grads_and_vars_accum, on_horovod=True), + global_step=global_step, + name="train", + ) + + with tf.control_dependencies([red_grad_updates]): + return tf.group([tf.assign(g, tf.zeros_like(g)) + for g, v in grads_and_vars_accum]) + + grad_updates = tf.cond( + pred=skip_update_ph, + true_fn=lambda: accum_op, + false_fn=clear_op, + ) + else: + grad_updates = opt.apply_gradients( + reduce_gradients(grads_and_vars, on_horovod=True), + global_step=global_step, + name="train", + ) + else: + grad_updates = opt.apply_gradients( + grads_and_vars, + global_step=global_step, + name="train", + ) # Ensure the train_tensor computes grad_updates. 
train_tensor = control_flow_ops.with_dependencies([grad_updates], loss) diff --git a/open_seq2seq/optimizers/optimizers_test.py b/open_seq2seq/optimizers/optimizers_test.py new file mode 100644 index 000000000..0f4737edd --- /dev/null +++ b/open_seq2seq/optimizers/optimizers_test.py @@ -0,0 +1,81 @@ +# Copyright (c) 2017 NVIDIA Corporation +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals +from six.moves import range + +import tensorflow as tf +import numpy as np +import numpy.testing as npt + +from open_seq2seq.optimizers import optimize_loss +from .lr_policies import fixed_lr + + +class IterSizeTests(tf.test.TestCase): + def setUp(self): + try: + import horovod.tensorflow as hvd + hvd.init() + except ImportError: + print("Horovod not installed skipping IterSizeTests") + + def tearDown(self): + pass + + def test_updates(self): + dtype = tf.float32 + with tf.Graph().as_default() as g: + n_samples = 10 + n_hid = 10 + var_dtype = tf.float32 if dtype == tf.float32 else tf.float16 + + np.random.seed(0) + X = np.random.rand(n_samples, n_hid) + y = np.random.rand(n_samples, 1) + w = np.linalg.solve(X.T.dot(X), X.T.dot(y)) + + x_ph = tf.placeholder(var_dtype, [n_samples, n_hid]) + y_ph = tf.placeholder(var_dtype, [n_samples, 1]) + + y_pred = tf.layers.dense(x_ph, 1, use_bias=False) + loss = tf.losses.mean_squared_error(y_ph, y_pred) + loss += tf.losses.get_regularization_loss() + skip_update_ph = tf.placeholder(tf.bool) + iter_size = 8 + train_op = optimize_loss(loss, "SGD", {}, + lambda gs: fixed_lr(gs, 0.1), dtype=dtype, + iter_size=iter_size, on_horovod=True, + skip_update_ph=skip_update_ph) + grad_accum = [var for var in tf.global_variables() if 'accum' in var.name][0] + var = tf.trainable_variables()[0] + with self.test_session(g, use_gpu=True) as sess: + sess.run(tf.global_variables_initializer()) + for _ in range(3): + g, v = sess.run([grad_accum, var]) + npt.assert_allclose(g, np.zeros(g.shape)) + + true_g = 2 * (X.T.dot(X).dot(v) - X.T.dot(y)) / X.shape[0] / iter_size + + sess.run(train_op, {x_ph: X, y_ph: y, skip_update_ph: True}) + g_new, v_new = sess.run([grad_accum, var]) + npt.assert_allclose(g_new, true_g, atol=1e-7) + npt.assert_allclose(v_new, v) + + sess.run(train_op, {x_ph: X, y_ph: y, skip_update_ph: True}) + g_new, v_new = sess.run([grad_accum, var]) + npt.assert_allclose(g_new, true_g * 2, atol=1e-7) + npt.assert_allclose(v_new, v) + + sess.run(train_op, {x_ph: X, y_ph: y, skip_update_ph: True}) + g_new, v_new = sess.run([grad_accum, var]) + npt.assert_allclose(g_new, true_g * 3, atol=1e-7) + npt.assert_allclose(v_new, v) + + sess.run(train_op, {x_ph: X, y_ph: y, skip_update_ph: False}) + g_new, v_new = sess.run([grad_accum, var]) + npt.assert_allclose(g_new, np.zeros(g.shape)) + npt.assert_allclose(v_new, v - 0.1 * true_g * 4, atol=1e-7) + + +if __name__ == '__main__': + tf.test.main() From ca684243e3f8da65d54ac6b34f0da192ea345171 Mon Sep 17 00:00:00 2001 From: Kipok Date: Mon, 11 Jun 2018 16:39:01 -0700 Subject: [PATCH 049/102] Add iter_size to model/funcs --- open_seq2seq/models/model.py | 10 ++++++++++ open_seq2seq/optimizers/optimizers.py | 2 +- open_seq2seq/utils/funcs.py | 6 +++++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/open_seq2seq/models/model.py b/open_seq2seq/models/model.py index 9705345f9..eb9c85f64 100644 --- a/open_seq2seq/models/model.py +++ b/open_seq2seq/models/model.py @@ -82,6 +82,7 @@ class :meth:`__init__` method. 
'larc_params': dict, 'loss_scaling': None, # float, "Backoff" or "LogMax" 'summaries': list, + 'iter_size': int, } def __init__(self, params, mode="train", hvd=None): @@ -168,6 +169,8 @@ class docs. * **summaries** (list) --- which summaries to log. Could contain "learning_rate", "gradients", "gradient_norm", "global_gradient_norm", "variables", "variable_norm". + * **iter_size** (int) --- same as in nvcaffe, the gradients will be + accumulated for ``iter_size`` number of steps before applying update. * **larc_params** --- dictionary with parameters for LARC (or LARS) optimization algorithms. Can contain the following parameters: @@ -276,10 +279,12 @@ class docs. self._output = None else: self._outputs = [None] * self.num_gpus + self.loss = None self.train_op = None self.eval_losses = None self._num_objects_per_step = None + self.skip_update_ph = None def compile(self, force_var_reuse=False): """TensorFlow graph is built here.""" @@ -366,6 +371,9 @@ def compile(self, force_var_reuse=False): lr_policy = lambda gs: self.params['lr_policy'](global_step=gs, **lr_params) + if self.params.get('iter_size', 1) > 1: + self.skip_update_ph = tf.placeholder(tf.bool) + self.train_op = optimize_loss( loss=tf.cast(self.loss, tf.float32) + get_regularization_loss(), dtype=self.params['dtype'], @@ -377,6 +385,8 @@ def compile(self, force_var_reuse=False): larc_params=self.params.get('larc_params', None), loss_scaling=self.params.get('loss_scaling', 1.0), on_horovod=self.on_horovod, + iter_size=self.params.get('iter_size', 1), + skip_update_ph=self.skip_update_ph, ) tf.summary.scalar(name="train_loss", tensor=self.loss) if self.steps_in_epoch: diff --git a/open_seq2seq/optimizers/optimizers.py b/open_seq2seq/optimizers/optimizers.py index b5406f328..40724db53 100644 --- a/open_seq2seq/optimizers/optimizers.py +++ b/open_seq2seq/optimizers/optimizers.py @@ -340,7 +340,7 @@ class should be sub-class of `tf.Optimizer` that implements grad_accum = tf.get_variable( grad.name.split(":")[0] + "_accum", shape=grad.shape, dtype=grad.dtype, initializer=tf.zeros_initializer(), - trainable=False, + trainable=False, validate_shape=bool(grad.get_shape()) ) accum_ops.append(tf.assign(grad_accum, grad_accum + grad / iter_size)) grads_and_vars_accum.append((grad_accum, var)) diff --git a/open_seq2seq/utils/funcs.py b/open_seq2seq/utils/funcs.py index b5cb1929f..dabca636b 100644 --- a/open_seq2seq/utils/funcs.py +++ b/open_seq2seq/utils/funcs.py @@ -118,7 +118,11 @@ def train(train_model, eval_model=None, debug_port=None): break tm = time.time() try: - fetches_vals = sess.run(fetches) + feed_dict = {} + iter_size = train_model.params.get('iter_size', 1) + if iter_size > 1: + feed_dict[train_model.skip_update_ph] = step % iter_size != 0 + fetches_vals = sess.run(fetches, feed_dict) except tf.errors.OutOfRangeError: break if step >= bench_start: From 3cfcb8d7183b61a0e0a07742ba3612a42f0680a1 Mon Sep 17 00:00:00 2001 From: Kipok Date: Mon, 11 Jun 2018 17:03:48 -0700 Subject: [PATCH 050/102] Undo bug with nested_update --- run.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/run.py b/run.py index 1df544670..93e5f1a23 100644 --- a/run.py +++ b/run.py @@ -171,14 +171,12 @@ def main(): if args.mode == 'train' or args.mode == 'train_eval': if 'train_params' in config_module: train_config.update(copy.deepcopy(config_module['train_params'])) - nested_update(train_config, nest_dict(vars(config_update))) if hvd is None or hvd.rank() == 0: deco_print("Training config:") pprint.pprint(train_config) if args.mode == 
'eval' or args.mode == 'train_eval': if 'eval_params' in config_module: eval_config.update(copy.deepcopy(config_module['eval_params'])) - nested_update(eval_config, nest_dict(vars(config_update))) if hvd is None or hvd.rank() == 0: deco_print("Evaluation config:") pprint.pprint(eval_config) @@ -187,7 +185,7 @@ def main(): raise ValueError("\"infer_output_file\" command line parameter is " "required in inference mode") infer_config.update(copy.deepcopy(config_module['infer_params'])) - nested_update(infer_config, nest_dict(vars(config_update))) + if hvd is None or hvd.rank() == 0: deco_print("Inference config:") pprint.pprint(infer_config) From a061a6e7df900f06fb26624f2b5a0497c417a818 Mon Sep 17 00:00:00 2001 From: Kipok Date: Mon, 11 Jun 2018 18:13:09 -0700 Subject: [PATCH 051/102] Add skip hooks to funcs --- example_configs/speech2text/ds2_toy_config.py | 16 ++++++++++------ open_seq2seq/optimizers/optimizers.py | 1 + open_seq2seq/utils/funcs.py | 8 +++++++- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/example_configs/speech2text/ds2_toy_config.py b/example_configs/speech2text/ds2_toy_config.py index cf9c01bf4..86506bbc5 100644 --- a/example_configs/speech2text/ds2_toy_config.py +++ b/example_configs/speech2text/ds2_toy_config.py @@ -11,14 +11,18 @@ base_params = { "random_seed": 0, - "use_horovod": False, - "num_epochs": 100, - "num_gpus": 2, - "batch_size_per_gpu": 2, + # "use_horovod": False, + "use_horovod": True, + "iter_size": 2, + + "num_epochs": 10, + + "num_gpus": 1, + "batch_size_per_gpu": 5, "save_summaries_steps": 10, - "print_loss_steps": 10, + "print_loss_steps": 1, "print_samples_steps": 20, "eval_steps": 50, "save_checkpoint_steps": 50, @@ -64,7 +68,7 @@ "rnn_unidirectional": False, "row_conv": False, "row_conv_width": 8, - "use_cudnn_rnn": True, + "use_cudnn_rnn": False, "dropout_keep_prob": 1.0, diff --git a/open_seq2seq/optimizers/optimizers.py b/open_seq2seq/optimizers/optimizers.py index 40724db53..692fabc5e 100644 --- a/open_seq2seq/optimizers/optimizers.py +++ b/open_seq2seq/optimizers/optimizers.py @@ -332,6 +332,7 @@ class should be sub-class of `tf.Optimizer` that implements tf.summary.scalar("larc_final_lr/{}".format(v.name), tf.cast(lr, var_dtype) * larc_grad_update) + # TODO: move this to the top! 
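    # Accumulation path summary: while skip_update_ph is True each worker only
    # adds grad / iter_size into its persistent "*_accum" variables; on the
    # step where skip_update_ph is False the accumulated gradients are
    # allreduced across workers, applied, and the accumulators are zeroed.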
if on_horovod: if iter_size > 1: grads_and_vars_accum = [] diff --git a/open_seq2seq/utils/funcs.py b/open_seq2seq/utils/funcs.py index dabca636b..a4bd2ee88 100644 --- a/open_seq2seq/utils/funcs.py +++ b/open_seq2seq/utils/funcs.py @@ -122,7 +122,13 @@ def train(train_model, eval_model=None, debug_port=None): iter_size = train_model.params.get('iter_size', 1) if iter_size > 1: feed_dict[train_model.skip_update_ph] = step % iter_size != 0 - fetches_vals = sess.run(fetches, feed_dict) + if step % iter_size == 0: + fetches_vals = sess.run(fetches, feed_dict) + else: + # necessary to skip "no-update" steps when iter_size > 1 + def run_with_no_hooks(step_context): + return step_context.session.run(fetches, feed_dict) + fetches_vals = sess.run_step_fn(run_with_no_hooks) except tf.errors.OutOfRangeError: break if step >= bench_start: From 8508e306d0dcd653f1d3785994edbcc11dad218b Mon Sep 17 00:00:00 2001 From: Kipok Date: Tue, 12 Jun 2018 10:56:56 -0700 Subject: [PATCH 052/102] Make iter_size processing work on raw grads --- open_seq2seq/optimizers/optimizers.py | 275 +++++++++++--------------- 1 file changed, 116 insertions(+), 159 deletions(-) diff --git a/open_seq2seq/optimizers/optimizers.py b/open_seq2seq/optimizers/optimizers.py index 692fabc5e..6172536d5 100644 --- a/open_seq2seq/optimizers/optimizers.py +++ b/open_seq2seq/optimizers/optimizers.py @@ -72,71 +72,6 @@ def get_regularization_loss(scope=None, name="total_regularization_loss"): return tf.constant(0.0) -class DistributedOptimizer(tf.train.Optimizer): - """An optimizer that wraps another tf.Optimizer, using an allreduce to - average gradient values before applying gradients to model weights.""" - - def __init__(self, optimizer, name=None, use_locking=False, device_dense='', - device_sparse=''): - """Construct a new DistributedOptimizer, which uses another optimizer - under the hood for computing single-process gradient values and - applying gradient updates after the gradient values have been averaged - across all the Horovod ranks. - Args: - optimizer: - Optimizer to use for computing gradients and applying updates. - name: - Optional name prefix for the operations created when applying - gradients. Defaults to "Distributed" followed by the provided - optimizer type. - use_locking: - Whether to use locking when updating variables. - See Optimizer.__init__ for more info. - device_dense: - Device to be used for dense tensors. Uses GPU by default - if Horovod was build with HOROVOD_GPU_ALLREDUCE. - device_sparse: - Device to be used for sparse tensors. Uses GPU by default - if Horovod was build with HOROVOD_GPU_ALLGATHER. - """ - if name is None: - name = "Distributed{}".format(type(optimizer).__name__) - - self._optimizer = optimizer - self._device_dense = device_dense - self._device_sparse = device_sparse - super(DistributedOptimizer, self).__init__( - name=name, use_locking=use_locking) - - def compute_gradients(self, *args, **kwargs): - """Compute gradients of all trainable variables. - See Optimizer.compute_gradients() for more info. - In DistributedOptimizer, compute_gradients() is overriden to also - allreduce the gradients before returning them. 
- """ - gradients = self._optimizer.compute_gradients(*args, **kwargs) - from horovod.common import size - from horovod.tensorflow import allreduce - - if size() > 1: - averaged_gradients = [] - with tf.name_scope(self._name + "_Allreduce"): - for grad, var in gradients: - if grad is not None: - avg_grad = allreduce(grad, device_dense=self._device_dense, - device_sparse=self._device_sparse) - averaged_gradients.append((avg_grad, var)) - else: - averaged_gradients.append((None, var)) - return averaged_gradients - else: - return gradients - - def apply_gradients(self, grads_and_vars, global_step=None, name=None): - """Calls this same method on the underlying optimizer.""" - return self._optimizer.apply_gradients(grads_and_vars, global_step, name) - - def reduce_gradients(grads_and_vars, on_horovod): if on_horovod: from horovod.common import size @@ -247,92 +182,7 @@ class should be sub-class of `tf.Optimizer` that implements loss, colocate_gradients_with_ops=True, ) - if "global_gradient_norm" in summaries: - tf.summary.scalar( - "global_gradient_norm", - _global_norm_with_cast(grads_and_vars), - ) - - # Optionally clip gradients by global norm. - if clip_gradients is not None: - grads_and_vars = _clip_gradients_by_norm(grads_and_vars, clip_gradients) - - # Add histograms for variables, gradients and gradient norms. - for gradient, variable in grads_and_vars: - if isinstance(gradient, tf.IndexedSlices): - grad_values = gradient.values - else: - grad_values = gradient - - if isinstance(variable, tf.IndexedSlices): - var_values = variable.values - else: - var_values = variable - - if grad_values is not None: - var_name = variable.name.replace(":", "_") - if "gradients" in summaries: - # need to mask nans for automatic loss scaling - tf.summary.histogram("gradients/%s" % var_name, mask_nans(grad_values)) - if "gradient_norm" in summaries: - tf.summary.scalar("gradient_norm/%s" % var_name, tf.norm(grad_values)) - if "variables" in summaries: - tf.summary.histogram("variables/%s" % var_name, var_values) - if "variable_norm" in summaries: - tf.summary.scalar("variable_norm/%s" % var_name, tf.norm(var_values)) - - if clip_gradients is not None and "global_gradient_norm" in summaries: - tf.summary.scalar( - "global_clipped_gradient_norm", - _global_norm_with_cast(grads_and_vars), - ) - - # LARC gradient re-scaling - if larc_params is not None: - check_params( - config=larc_params, - required_dict={'larc_eta': float}, - optional_dict={ - 'larc_mode': ['clip', 'scale'], - 'min_update': float, - 'epsilon': float - }, - ) - larc_eta = larc_params['larc_eta'] - larc_mode = larc_params.get('larc_mode', 'clip') - min_update = larc_params.get('min_update', 1e-7) - eps = larc_params.get('epsilon', 1e-7) - - for idx, (g, v) in enumerate(grads_and_vars): - var_dtype = v.dtype - v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2) - g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2) - - if larc_mode == 'clip': - larc_grad_update = tf.maximum( - larc_eta * v_norm / (lr * (g_norm + eps)), - min_update, - ) - if "larc_summaries" in summaries: - tf.summary.scalar('larc_clip_on/{}'.format(v.name), - tf.cast(tf.less(larc_grad_update, 1.0), tf.int32)) - larc_grad_update = tf.minimum(larc_grad_update, 1.0) - else: - larc_grad_update = tf.maximum( - larc_eta * v_norm / (g_norm + eps), - min_update, - ) - larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype) - grads_and_vars[idx] = (larc_grad_update * g, v) - - # adding additional summary - if "larc_summaries" in summaries: - 
tf.summary.scalar('larc_grad_update/{}'.format(v.name), - larc_grad_update) - tf.summary.scalar("larc_final_lr/{}".format(v.name), - tf.cast(lr, var_dtype) * larc_grad_update) - - # TODO: move this to the top! + # TODO: apply iter_size to float16 gradients? if on_horovod: if iter_size > 1: grads_and_vars_accum = [] @@ -348,12 +198,17 @@ class should be sub-class of `tf.Optimizer` that implements accum_op = tf.group(accum_ops) - def clear_op(): + def update_and_clear_op(): with tf.control_dependencies([accum_op]): red_grad_updates = opt.apply_gradients( - reduce_gradients(grads_and_vars_accum, on_horovod=True), + post_process_gradients( + reduce_gradients(grads_and_vars_accum, on_horovod=True), + lr=lr, + clip_gradients=clip_gradients, + larc_params=larc_params, + summaries=summaries, + ), global_step=global_step, - name="train", ) with tf.control_dependencies([red_grad_updates]): @@ -363,19 +218,29 @@ def clear_op(): grad_updates = tf.cond( pred=skip_update_ph, true_fn=lambda: accum_op, - false_fn=clear_op, + false_fn=update_and_clear_op, ) else: grad_updates = opt.apply_gradients( - reduce_gradients(grads_and_vars, on_horovod=True), + post_process_gradients( + reduce_gradients(grads_and_vars, on_horovod=True), + lr=lr, + clip_gradients=clip_gradients, + larc_params=larc_params, + summaries=summaries, + ), global_step=global_step, - name="train", ) else: grad_updates = opt.apply_gradients( - grads_and_vars, + post_process_gradients( + grads_and_vars, + lr=lr, + clip_gradients=clip_gradients, + larc_params=larc_params, + summaries=summaries, + ), global_step=global_step, - name="train", ) # Ensure the train_tensor computes grad_updates. @@ -384,6 +249,98 @@ def clear_op(): return train_tensor +def post_process_gradients(grads_and_vars, summaries, lr, + clip_gradients, larc_params): + """Applies post processing to gradients, i.e. clipping, LARC, summaries.""" + if "global_gradient_norm" in summaries: + tf.summary.scalar( + "global_gradient_norm", + _global_norm_with_cast(grads_and_vars), + ) + + # Optionally clip gradients by global norm. + if clip_gradients is not None: + grads_and_vars = _clip_gradients_by_norm(grads_and_vars, clip_gradients) + + # Add histograms for variables, gradients and gradient norms. 
+ for gradient, variable in grads_and_vars: + if isinstance(gradient, tf.IndexedSlices): + grad_values = gradient.values + else: + grad_values = gradient + + if isinstance(variable, tf.IndexedSlices): + var_values = variable.values + else: + var_values = variable + + if grad_values is not None: + var_name = variable.name.replace(":", "_") + if "gradients" in summaries: + # need to mask nans for automatic loss scaling + tf.summary.histogram("gradients/%s" % var_name, mask_nans(grad_values)) + if "gradient_norm" in summaries: + tf.summary.scalar("gradient_norm/%s" % var_name, tf.norm(grad_values)) + if "variables" in summaries: + tf.summary.histogram("variables/%s" % var_name, var_values) + if "variable_norm" in summaries: + tf.summary.scalar("variable_norm/%s" % var_name, tf.norm(var_values)) + + if clip_gradients is not None and "global_gradient_norm" in summaries: + tf.summary.scalar( + "global_clipped_gradient_norm", + _global_norm_with_cast(grads_and_vars), + ) + + # LARC gradient re-scaling + if larc_params is not None: + check_params( + config=larc_params, + required_dict={'larc_eta': float}, + optional_dict={ + 'larc_mode': ['clip', 'scale'], + 'min_update': float, + 'epsilon': float + }, + ) + larc_eta = larc_params['larc_eta'] + larc_mode = larc_params.get('larc_mode', 'clip') + min_update = larc_params.get('min_update', 1e-7) + eps = larc_params.get('epsilon', 1e-7) + + grads_and_vars_larc = [None] * len(grads_and_vars) + for idx, (g, v) in enumerate(grads_and_vars): + var_dtype = v.dtype + v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2) + g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2) + + if larc_mode == 'clip': + larc_grad_update = tf.maximum( + larc_eta * v_norm / (lr * (g_norm + eps)), + min_update, + ) + if "larc_summaries" in summaries: + tf.summary.scalar('larc_clip_on/{}'.format(v.name), + tf.cast(tf.less(larc_grad_update, 1.0), tf.int32)) + larc_grad_update = tf.minimum(larc_grad_update, 1.0) + else: + larc_grad_update = tf.maximum( + larc_eta * v_norm / (g_norm + eps), + min_update, + ) + larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype) + grads_and_vars_larc[idx] = (larc_grad_update * g, v) + + # adding additional summary + if "larc_summaries" in summaries: + tf.summary.scalar('larc_grad_update/{}'.format(v.name), + larc_grad_update) + tf.summary.scalar("larc_final_lr/{}".format(v.name), + tf.cast(lr, var_dtype) * larc_grad_update) + grads_and_vars = grads_and_vars_larc + return grads_and_vars + + def _global_norm_with_cast(grads_and_vars): return tf.global_norm(list(map( lambda x: tf.cast(x, tf.float32), From aed86d50b446fc623674aba3a3e18ed357fd956f Mon Sep 17 00:00:00 2001 From: Kipok Date: Tue, 12 Jun 2018 11:44:30 -0700 Subject: [PATCH 053/102] Fix iter_size for cudnn rnn layers --- example_configs/speech2text/ds2_toy_config.py | 10 ++---- open_seq2seq/models/model.py | 1 + open_seq2seq/models/speech2text_test.py | 34 +++++++++++++++++-- open_seq2seq/optimizers/optimizers.py | 13 ++++--- open_seq2seq/optimizers/optimizers_test.py | 13 ++++--- 5 files changed, 52 insertions(+), 19 deletions(-) diff --git a/example_configs/speech2text/ds2_toy_config.py b/example_configs/speech2text/ds2_toy_config.py index 86506bbc5..8ae96493d 100644 --- a/example_configs/speech2text/ds2_toy_config.py +++ b/example_configs/speech2text/ds2_toy_config.py @@ -11,14 +11,10 @@ base_params = { "random_seed": 0, + "use_horovod": False, + "num_epochs": 100, - # "use_horovod": False, - "use_horovod": True, - "iter_size": 2, - - "num_epochs": 10, - - "num_gpus": 1, 
+ "num_gpus": 2, "batch_size_per_gpu": 5, "save_summaries_steps": 10, diff --git a/open_seq2seq/models/model.py b/open_seq2seq/models/model.py index eb9c85f64..b11ae234c 100644 --- a/open_seq2seq/models/model.py +++ b/open_seq2seq/models/model.py @@ -273,6 +273,7 @@ class docs. self._steps_in_epoch //= self._hvd.size() else: self._steps_in_epoch //= self.num_gpus + self._steps_in_epoch //= self._params.get('iter_size', 1) self._last_step = self._params['num_epochs'] * self._steps_in_epoch if self.on_horovod: diff --git a/open_seq2seq/models/speech2text_test.py b/open_seq2seq/models/speech2text_test.py index 687bda883..feed83b75 100644 --- a/open_seq2seq/models/speech2text_test.py +++ b/open_seq2seq/models/speech2text_test.py @@ -27,11 +27,11 @@ def setUp(self): def tearDown(self): pass - def run_model(self, train_config, eval_config): + def run_model(self, train_config, eval_config, hvd=None): with tf.Graph().as_default() as g: - train_model = base_model(params=train_config, mode="train", hvd=None) + train_model = base_model(params=train_config, mode="train", hvd=hvd) train_model.compile() - eval_model = base_model(params=eval_config, mode="eval", hvd=None) + eval_model = base_model(params=eval_config, mode="eval", hvd=hvd) eval_model.compile(force_var_reuse=True) train(train_model, eval_model) @@ -99,6 +99,34 @@ def test_convergence(self): self.assertLess(eval_loss, 200.0) self.assertLess(eval_dict['Eval WER'], 0.1) + def test_convergence_with_iter_size(self): + try: + import horovod.tensorflow as hvd + hvd.init() + except ImportError: + print("Horovod not installed skipping test_convergence_with_iter_size") + return + + for dtype in [tf.float32, "mixed"]: + train_config, eval_config = self.prepare_config() + train_config.update({ + "dtype": dtype, + "iter_size": 5, + "batch_size_per_gpu": 2, + "use_horovod": True, + }) + eval_config.update({ + "dtype": dtype, + "iter_size": 5, + "batch_size_per_gpu": 2, + "use_horovod": True, + }) + loss, eval_loss, eval_dict = self.run_model(train_config, eval_config, hvd) + + self.assertLess(loss, 5.0) + self.assertLess(eval_loss, 200.0) + self.assertLess(eval_dict['Eval WER'], 0.1) + def test_infer(self): train_config, infer_config = self.prepare_config() train_config['num_epochs'] = 200 diff --git a/open_seq2seq/optimizers/optimizers.py b/open_seq2seq/optimizers/optimizers.py index 6172536d5..ab2688634 100644 --- a/open_seq2seq/optimizers/optimizers.py +++ b/open_seq2seq/optimizers/optimizers.py @@ -188,10 +188,15 @@ class should be sub-class of `tf.Optimizer` that implements grads_and_vars_accum = [] accum_ops = [] for grad, var in grads_and_vars: - grad_accum = tf.get_variable( - grad.name.split(":")[0] + "_accum", shape=grad.shape, - dtype=grad.dtype, initializer=tf.zeros_initializer(), - trainable=False, validate_shape=bool(grad.get_shape()) + # necessary to use tf.Variable directly to instantiate cudnn rnn cells + # which don't have explicit shape. 
+ grad_accum = tf.Variable( + initial_value=tf.zeros_like(var), + name=grad.name.split(":")[0] + "_accum", + expected_shape=grad.shape, + dtype=grad.dtype, + trainable=False, + validate_shape=bool(grad.get_shape()) ) accum_ops.append(tf.assign(grad_accum, grad_accum + grad / iter_size)) grads_and_vars_accum.append((grad_accum, var)) diff --git a/open_seq2seq/optimizers/optimizers_test.py b/open_seq2seq/optimizers/optimizers_test.py index 0f4737edd..bf386edf9 100644 --- a/open_seq2seq/optimizers/optimizers_test.py +++ b/open_seq2seq/optimizers/optimizers_test.py @@ -13,16 +13,19 @@ class IterSizeTests(tf.test.TestCase): def setUp(self): - try: - import horovod.tensorflow as hvd - hvd.init() - except ImportError: - print("Horovod not installed skipping IterSizeTests") + pass def tearDown(self): pass def test_updates(self): + try: + import horovod.tensorflow as hvd + hvd.init() + except ImportError: + print("Horovod not installed skipping test_updates") + return + dtype = tf.float32 with tf.Graph().as_default() as g: n_samples = 10 From e528214c0cc3f88a9138e3c292af3bef250971c1 Mon Sep 17 00:00:00 2001 From: Kipok Date: Tue, 12 Jun 2018 12:37:09 -0700 Subject: [PATCH 054/102] Add checks for correct params --- open_seq2seq/models/model.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/open_seq2seq/models/model.py b/open_seq2seq/models/model.py index b11ae234c..175d76a28 100644 --- a/open_seq2seq/models/model.py +++ b/open_seq2seq/models/model.py @@ -187,6 +187,9 @@ class docs. self._params = copy.deepcopy(params) + if self._params.get('iter_size', 1) > 1 and hvd is None: + raise ValueError("iter_size is only supported in Horovod mode") + # parameter checks self._mode = mode if self._mode not in ["train", "infer", "eval"]: @@ -274,6 +277,8 @@ class docs. 
else: self._steps_in_epoch //= self.num_gpus self._steps_in_epoch //= self._params.get('iter_size', 1) + if self._steps_in_epoch == 0: + raise ValueError("Overall batch size is too big for this dataset.") self._last_step = self._params['num_epochs'] * self._steps_in_epoch if self.on_horovod: From 2428307dd7b14430e62135d0b2c43e2867d8ba81 Mon Sep 17 00:00:00 2001 From: Kipok Date: Tue, 12 Jun 2018 12:51:58 -0700 Subject: [PATCH 055/102] Fixed config --- example_configs/speech2text/ds2_toy_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/example_configs/speech2text/ds2_toy_config.py b/example_configs/speech2text/ds2_toy_config.py index 8ae96493d..9905bfb81 100644 --- a/example_configs/speech2text/ds2_toy_config.py +++ b/example_configs/speech2text/ds2_toy_config.py @@ -15,10 +15,10 @@ "num_epochs": 100, "num_gpus": 2, - "batch_size_per_gpu": 5, + "batch_size_per_gpu": 2, "save_summaries_steps": 10, - "print_loss_steps": 1, + "print_loss_steps": 10 "print_samples_steps": 20, "eval_steps": 50, "save_checkpoint_steps": 50, @@ -64,7 +64,7 @@ "rnn_unidirectional": False, "row_conv": False, "row_conv_width": 8, - "use_cudnn_rnn": False, + "use_cudnn_rnn": True, "dropout_keep_prob": 1.0, From d120cb7852d1a1463cea111b6ba6fbbdd1d8c929 Mon Sep 17 00:00:00 2001 From: Kipok Date: Tue, 12 Jun 2018 12:52:31 -0700 Subject: [PATCH 056/102] Fixed config --- example_configs/speech2text/ds2_toy_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example_configs/speech2text/ds2_toy_config.py b/example_configs/speech2text/ds2_toy_config.py index 9905bfb81..cf9c01bf4 100644 --- a/example_configs/speech2text/ds2_toy_config.py +++ b/example_configs/speech2text/ds2_toy_config.py @@ -18,7 +18,7 @@ "batch_size_per_gpu": 2, "save_summaries_steps": 10, - "print_loss_steps": 10 + "print_loss_steps": 10, "print_samples_steps": 20, "eval_steps": 50, "save_checkpoint_steps": 50, From 5468950c5760554c1965a5f2a35a160beff3af61 Mon Sep 17 00:00:00 2001 From: Kipok Date: Tue, 12 Jun 2018 14:45:41 -0700 Subject: [PATCH 057/102] Fix bug with verbose on last evaluation --- open_seq2seq/utils/utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/open_seq2seq/utils/utils.py b/open_seq2seq/utils/utils.py index 54cb37e77..54d22893b 100644 --- a/open_seq2seq/utils/utils.py +++ b/open_seq2seq/utils/utils.py @@ -180,20 +180,20 @@ def iterate_data(model, sess, compute_loss, mode, verbose): else: raise ValueError("Unknown mode: {}".format(mode)) - if len(fetches_vals) == 0: - break - if verbose: if size_defined: data_size = int(np.sum(np.ceil(np.array(dl_sizes) / model.params['batch_size_per_gpu']))) - if step == 0 or (data_size > 10 and - processed_batches % (data_size // 10) == 0): + if step == 0 or len(fetches_vals) == 0 or \ + (data_size > 10 and processed_batches % (data_size // 10) == 0): deco_print("Processed {}/{} batches{}".format( processed_batches, data_size, ending)) else: deco_print("Processed {} batches{}".format(processed_batches, ending), end='\r') + + if len(fetches_vals) == 0: + break step += 1 if verbose: From b9c101921a23e1ad0d596a2208614b5d4e905765 Mon Sep 17 00:00:00 2001 From: Kipok Date: Tue, 12 Jun 2018 15:01:23 -0700 Subject: [PATCH 058/102] Fix docs, raise exception instead of pass --- open_seq2seq/models/model.py | 5 +++-- open_seq2seq/optimizers/optimizers.py | 4 +--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/open_seq2seq/models/model.py b/open_seq2seq/models/model.py index 175d76a28..53fc7955b 100644 --- 
a/open_seq2seq/models/model.py +++ b/open_seq2seq/models/model.py @@ -169,8 +169,9 @@ class docs. * **summaries** (list) --- which summaries to log. Could contain "learning_rate", "gradients", "gradient_norm", "global_gradient_norm", "variables", "variable_norm". - * **iter_size** (int) --- same as in nvcaffe, the gradients will be - accumulated for ``iter_size`` number of steps before applying update. + * **iter_size** (int) --- use this parameter to emulate large batches. + The gradients will be accumulated for ``iter_size`` number of steps before + applying update. * **larc_params** --- dictionary with parameters for LARC (or LARS) optimization algorithms. Can contain the following parameters: diff --git a/open_seq2seq/optimizers/optimizers.py b/open_seq2seq/optimizers/optimizers.py index ab2688634..0af6dbdac 100644 --- a/open_seq2seq/optimizers/optimizers.py +++ b/open_seq2seq/optimizers/optimizers.py @@ -90,8 +90,7 @@ def reduce_gradients(grads_and_vars, on_horovod): else: return grads_and_vars else: - # TODO: implement this - pass + raise NotImplementedError("Reduce in tower-mode is not implemented.") def optimize_loss(loss, @@ -182,7 +181,6 @@ class should be sub-class of `tf.Optimizer` that implements loss, colocate_gradients_with_ops=True, ) - # TODO: apply iter_size to float16 gradients? if on_horovod: if iter_size > 1: grads_and_vars_accum = [] From bd6a9a93c26a00e2394cc680d7ffa83d9007e668 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Tue, 12 Jun 2018 15:06:44 -0700 Subject: [PATCH 059/102] Various changes Transformer data layer: * allow option to batch in examples and preserve order for inference * add parse_output function to clean and detokenize inference output * add process data function from MLPerf Text2Text: * allow inference with batch size > 1 --- detokenizer.perl | 373 +++++++++++++++ .../text2text/transformer-base-test.py | 133 ++++++ ...transformer-big.py => transformer-base.py} | 0 open_seq2seq/data/text2text/__init__.py | 1 + open_seq2seq/data/text2text/parse_output.py | 50 ++ open_seq2seq/data/text2text/process_data.py | 433 ++++++++++++++++++ open_seq2seq/data/text2text/t2t.py | 17 +- open_seq2seq/data/text2text/text2text.py | 5 +- open_seq2seq/models/text2text.py | 33 +- requirements.txt | 2 +- run.py | 2 +- 11 files changed, 1025 insertions(+), 24 deletions(-) create mode 100755 detokenizer.perl create mode 100644 example_configs/text2text/transformer-base-test.py rename example_configs/text2text/{transformer-big.py => transformer-base.py} (100%) create mode 100644 open_seq2seq/data/text2text/parse_output.py create mode 100644 open_seq2seq/data/text2text/process_data.py diff --git a/detokenizer.perl b/detokenizer.perl new file mode 100755 index 000000000..41299baf2 --- /dev/null +++ b/detokenizer.perl @@ -0,0 +1,373 @@ +#!/usr/bin/env perl + +# $Id: detokenizer.perl 4134 2011-08-08 15:30:54Z bgottesman $ +# Sample De-Tokenizer +# written by Josh Schroeder, based on code by Philipp Koehn +# further modifications by Ondrej Bojar +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
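#
# Illustrative usage (the output filename here is just an example): this script
# can be run on the tokenized text produced by parse_output.py (its default
# output file), e.g.
#   perl detokenizer.perl -l de < tokenized_output.txt > detokenized_output.txt
#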
+ +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +use warnings; +use strict; +use utf8; # tell perl this script file is in UTF-8 (see all funny punct below) + +my $language = "en"; +my $QUIET = 0; +my $HELP = 0; +my $UPPERCASE_SENT = 0; +my $PENN = 0; + +while (@ARGV) { + $_ = shift; + /^-b$/ && ($| = 1, next); + /^-l$/ && ($language = shift, next); + /^-q$/ && ($QUIET = 1, next); + /^-h$/ && ($HELP = 1, next); + /^-u$/ && ($UPPERCASE_SENT = 1, next); + /^-penn$/ && ($PENN = 1, next); +} + +if ($HELP) { + print "Usage ./detokenizer.perl (-l [en|fr|it|cs|...]) < tokenizedfile > detokenizedfile\n"; + print "Options:\n"; + print " -u ... uppercase the first char in the final sentence.\n"; + print " -q ... don't report detokenizer revision.\n"; + print " -b ... disable Perl buffering.\n"; + print " -penn ... assume input is tokenized as per tokenizer.perl's -penn option.\n"; + exit; +} + +if ($language !~ /^(cs|en|fr|it|fi)$/) { + print STDERR "Warning: No built-in rules for language $language.\n" +} + +if ($PENN && $language ne "en") { + print STDERR "Error: -penn option only supported for English text.\n"; + exit; +} + +if (!$QUIET) { + print STDERR "Detokenizer Version ".'$Revision: 4134 $'."\n"; + print STDERR "Language: $language\n"; +} + +while() { + if (/^<.+>$/ || /^\s*$/) { + #don't try to detokenize XML/HTML tag lines + print $_; + } elsif ($PENN) { + print &detokenize_penn($_); + } else { + print &detokenize($_); + } +} + + +sub ucsecondarg { + # uppercase the second argument + my $arg1 = shift; + my $arg2 = shift; + return $arg1.uc($arg2); +} + +sub deescape { + # de-escape special chars + my ($text) = @_; + $text =~ s/\&bar;/\|/g; # factor separator (legacy) + $text =~ s/\|/\|/g; # factor separator + $text =~ s/\</\/g; # xml + $text =~ s/\&bra;/\[/g; # syntax non-terminal (legacy) + $text =~ s/\&ket;/\]/g; # syntax non-terminal (legacy) + $text =~ s/\"/\"/g; # xml + $text =~ s/\'/\'/g; # xml + $text =~ s/\[/\[/g; # syntax non-terminal + $text =~ s/\]/\]/g; # syntax non-terminal + $text =~ s/\&/\&/g; # escape escape + return $text; +} + +sub detokenize { + my($text) = @_; + chomp($text); + $text = " $text "; + $text =~ s/ \@\-\@ /-/g; + $text = &deescape($text); + + my $word; + my $i; + my @words = split(/ /,$text); + $text = ""; + my %quoteCount = ("\'"=>0,"\""=>0); + my $prependSpace = " "; + for ($i=0;$i<(scalar(@words));$i++) { + if (&startsWithCJKChar($words[$i])) { + if (($i > 0 && &endsWithCJKChar($words[$i-1])) && ($language ne "ko")) { + # perform left shift if this is a second consecutive CJK (Chinese/Japanese/Korean) word + $text=$text.$words[$i]; + } else { + # ... 
but do nothing special if this is a CJK word that doesn't follow a CJK word + $text=$text.$prependSpace.$words[$i]; + } + $prependSpace = " "; + } elsif ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) { + #perform right shift on currency and other random punctuation items + $text = $text.$prependSpace.$words[$i]; + $prependSpace = ""; + } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){ + if (($language eq "fr") && ($words[$i] =~ /^[\?\!\:\;\\\%]$/)) { + #these punctuations are prefixed with a non-breakable space in french + $text .= " "; } + #perform left shift on punctuation items + $text=$text.$words[$i]; + $prependSpace = " "; + } elsif (($language eq "en") && ($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) { + #left-shift the contraction for English + $text=$text.$words[$i]; + $prependSpace = " "; + } elsif (($language eq "cs") && ($i>1) && ($words[$i-2] =~ /^[0-9]+$/) && ($words[$i-1] =~ /^[.,]$/) && ($words[$i] =~ /^[0-9]+$/)) { + #left-shift floats in Czech + $text=$text.$words[$i]; + $prependSpace = " "; + } elsif ((($language eq "fr") ||($language eq "it")) && ($i<=(scalar(@words)-2)) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) { + #right-shift the contraction for French and Italian + $text = $text.$prependSpace.$words[$i]; + $prependSpace = ""; + } elsif (($language eq "cs") && ($i<(scalar(@words)-3)) + && ($words[$i] =~ /[\p{IsAlpha}]$/) + && ($words[$i+1] =~ /^[-–]$/) + && ($words[$i+2] =~ /^li$|^mail.*/i) + ) { + #right-shift "-li" in Czech and a few Czech dashed words (e-mail) + $text = $text.$prependSpace.$words[$i].$words[$i+1]; + $i++; # advance over the dash + $prependSpace = ""; + } elsif ($words[$i] =~ /^[\'\"„“`]+$/) { + #combine punctuation smartly + my $normalized_quo = $words[$i]; + $normalized_quo = '"' if $words[$i] =~ /^[„“”]+$/; + $quoteCount{$normalized_quo} = 0 + if !defined $quoteCount{$normalized_quo}; + if ($language eq "cs" && $words[$i] eq "„") { + # this is always the starting quote in Czech + $quoteCount{$normalized_quo} = 0; + } + if ($language eq "cs" && $words[$i] eq "“") { + # this is usually the ending quote in Czech + $quoteCount{$normalized_quo} = 1; + } + if (($quoteCount{$normalized_quo} % 2) eq 0) { + if(($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) { + #single quote for posesssives ending in s... "The Jones' house" + #left shift + $text=$text.$words[$i]; + $prependSpace = " "; + } else { + #right shift + $text = $text.$prependSpace.$words[$i]; + $prependSpace = ""; + $quoteCount{$normalized_quo} ++; + + } + } else { + #left shift + $text=$text.$words[$i]; + $prependSpace = " "; + $quoteCount{$normalized_quo} ++; + + } + + } elsif (($language eq "fi") && ($words[$i-1] =~ /:$/) && ($words[$i] =~ /^(N|n|A|a|Ä|ä|ssa|Ssa|ssä|Ssä|sta|stä|Sta|Stä|hun|Hun|hyn|Hyn|han|Han|hän|Hän|hön|Hön|un|Un|yn|Yn|an|An|än|Än|ön|Ön|seen|Seen|lla|Lla|llä|Llä|lta|Lta|ltä|Ltä|lle|Lle|ksi|Ksi|kse|Kse|tta|Tta|ine|Ine)(ni|si|mme|nne|nsa)?(ko|kö|han|hän|pa|pä|kaan|kään|kin)?$/)) { + # Finnish : without intervening space if followed by case suffix + # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ... + $text=$text. 
lc $words[$i]; + $prependSpace = " "; + } else { + $text=$text.$prependSpace.$words[$i]; + $prependSpace = " "; + } + } + + # clean up spaces at head and tail of each line as well as any double-spacing + $text =~ s/ +/ /g; + $text =~ s/\n /\n/g; + $text =~ s/ \n/\n/g; + $text =~ s/^ //g; + $text =~ s/ $//g; + + #add trailing break + $text .= "\n" unless $text =~ /\n$/; + + $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT; + + return $text; +} + +sub detokenize_penn { + my($text) = @_; + + chomp($text); + $text = " $text "; + $text =~ s/ \@\-\@ /-/g; + $text =~ s/ \@\/\@ /\//g; + $text = &deescape($text); + + # merge de-contracted forms except where the second word begins with an + # apostrophe (those are handled later) + $text =~ s/ n't /n't /g; + $text =~ s/ N'T /N'T /g; + $text =~ s/ ([Cc])an not / $1annot /g; + $text =~ s/ ([Dd])' ye / $1'ye /g; + $text =~ s/ ([Gg])im me / $1imme /g; + $text =~ s/ ([Gg])on na / $1onna /g; + $text =~ s/ ([Gg])ot ta / $1otta /g; + $text =~ s/ ([Ll])em me / $1emme /g; + $text =~ s/ '([Tt]) is / '$1is /g; + $text =~ s/ '([Tt]) was / '$1was /g; + $text =~ s/ ([Ww])an na / $1anna /g; + + # restore brackets + $text =~ s/-LRB-/\(/g; + $text =~ s/-RRB-/\)/g; + $text =~ s/-LSB-/\[/g; + $text =~ s/-RSB-/\]/g; + $text =~ s/-LCB-/{/g; + $text =~ s/-RCB-/}/g; + + my $i; + my @words = split(/ /,$text); + $text = ""; + my $prependSpace = " "; + for ($i=0;$i<(scalar(@words));$i++) { + if ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) { + # perform right shift on currency and other random punctuation items + $text = $text.$prependSpace.$words[$i]; + $prependSpace = ""; + } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){ + # perform left shift on punctuation items + $text=$text.$words[$i]; + $prependSpace = " "; + } elsif (($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) { + # left-shift the contraction + $text=$text.$words[$i]; + $prependSpace = " "; + } elsif ($words[$i] eq "`") { # Assume that punctuation has been normalized and is one of `, ``, ', '' only + # opening single quote: convert to straight quote and right-shift + $text = $text.$prependSpace."\'"; + $prependSpace = ""; + } elsif ($words[$i] eq "``") { + # opening double quote: convert to straight quote and right-shift + $text = $text.$prependSpace."\""; + $prependSpace = ""; + } elsif ($words[$i] eq "\'") { + # closing single quote: convert to straight quote and left shift + $text = $text."\'"; + $prependSpace = " "; + } elsif ($words[$i] eq "\'\'") { + # closing double quote: convert to straight quote and left shift + $text = $text."\""; + $prependSpace = " "; + } else { + $text = $text.$prependSpace.$words[$i]; + $prependSpace = " "; + } + } + + # clean up spaces at head and tail of each line as well as any double-spacing + $text =~ s/ +/ /g; + $text =~ s/\n /\n/g; + $text =~ s/ \n/\n/g; + $text =~ s/^ //g; + $text =~ s/ $//g; + + # add trailing break + $text .= "\n" unless $text =~ /\n$/; + + $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT; + + return $text; +} + +sub startsWithCJKChar { + my ($str) = @_; + return 0 if length($str) == 0; + my $firstChar = substr($str, 0, 1); + return &charIsCJK($firstChar); +} + +sub endsWithCJKChar { + my ($str) = @_; + return 0 if length($str) == 0; + my $lastChar = substr($str, length($str)-1, 1); + return &charIsCJK($lastChar); +} + +# Given a string consisting of one character, returns true iff the character +# is a CJK (Chinese/Japanese/Korean) character +sub 
charIsCJK { + my ($char) = @_; + # $char should be a string of length 1 + my $codepoint = &codepoint_dec($char); + + # The following is based on http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane + + # Hangul Jamo (1100–11FF) + return 1 if (&between_hexes($codepoint, '1100', '11FF')); + + # CJK Radicals Supplement (2E80–2EFF) + # Kangxi Radicals (2F00–2FDF) + # Ideographic Description Characters (2FF0–2FFF) + # CJK Symbols and Punctuation (3000–303F) + # Hiragana (3040–309F) + # Katakana (30A0–30FF) + # Bopomofo (3100–312F) + # Hangul Compatibility Jamo (3130–318F) + # Kanbun (3190–319F) + # Bopomofo Extended (31A0–31BF) + # CJK Strokes (31C0–31EF) + # Katakana Phonetic Extensions (31F0–31FF) + # Enclosed CJK Letters and Months (3200–32FF) + # CJK Compatibility (3300–33FF) + # CJK Unified Ideographs Extension A (3400–4DBF) + # Yijing Hexagram Symbols (4DC0–4DFF) + # CJK Unified Ideographs (4E00–9FFF) + # Yi Syllables (A000–A48F) + # Yi Radicals (A490–A4CF) + return 1 if (&between_hexes($codepoint, '2E80', 'A4CF')); + + # Phags-pa (A840–A87F) + return 1 if (&between_hexes($codepoint, 'A840', 'A87F')); + + # Hangul Syllables (AC00–D7AF) + return 1 if (&between_hexes($codepoint, 'AC00', 'D7AF')); + + # CJK Compatibility Ideographs (F900–FAFF) + return 1 if (&between_hexes($codepoint, 'F900', 'FAFF')); + + # CJK Compatibility Forms (FE30–FE4F) + return 1 if (&between_hexes($codepoint, 'FE30', 'FE4F')); + + # Range U+FF65–FFDC encodes halfwidth forms, of Katakana and Hangul characters + return 1 if (&between_hexes($codepoint, 'FF65', 'FFDC')); + + # Supplementary Ideographic Plane 20000–2FFFF + return 1 if (&between_hexes($codepoint, '20000', '2FFFF')); + + return 0; +} + +# Returns the code point of a Unicode char, represented as a decimal number +sub codepoint_dec { + if (my $char = shift) { + return unpack('U0U*', $char); + } +} + +sub between_hexes { + my ($num, $left, $right) = @_; + return $num >= hex($left) && $num <= hex($right); +} diff --git a/example_configs/text2text/transformer-base-test.py b/example_configs/text2text/transformer-base-test.py new file mode 100644 index 000000000..c59758b23 --- /dev/null +++ b/example_configs/text2text/transformer-base-test.py @@ -0,0 +1,133 @@ +from __future__ import absolute_import, division, print_function +from open_seq2seq.models import Text2Text +from open_seq2seq.encoders import TransformerEncoder +from open_seq2seq.decoders import TransformerDecoder +from open_seq2seq.data.text2text.text2text import TransformerDataLayer +from open_seq2seq.losses import PaddedCrossEntropyLossWithSmoothing +from open_seq2seq.data.text2text.text2text import SpecialTextTokens +from open_seq2seq.data.text2text.tokenizer import EOS_ID +from open_seq2seq.optimizers.lr_policies import transformer_policy +import tensorflow as tf + +""" +This configuration file describes a variant of Transformer model from +https://arxiv.org/abs/1706.03762 +""" + +base_model = Text2Text +d_model = 512 +num_layers = 6 + +data_root = "/tmp/translate_ende/" + +base_params = { + "use_horovod": False, + "num_gpus": 2, + "batch_size_per_gpu": 4096, # this size is in tokens + "max_steps": 500000, + "save_summaries_steps": 50, + "print_loss_steps": 50, + "print_samples_steps": 50, + "eval_steps": 4001, + "save_checkpoint_steps": 1000, + "logdir": "Transformer-FP32", + "dtype": tf.float32, + # "dtype": "mixed", + # "loss_scaling": "Backoff", + "optimizer": tf.contrib.opt.LazyAdamOptimizer, + "optimizer_params": { + "beta1": 0.9, + "beta2": 0.997, + "epsilon": 1e-09, 
+ }, + + "lr_policy": transformer_policy, + "lr_policy_params": { + "learning_rate": 2.0, + "warmup_steps": 16000, + "d_model": d_model, + }, + + # "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + "encoder": TransformerEncoder, + "encoder_params": { + "encoder_layers": num_layers, + "hidden_size": d_model, + "num_heads": 8, + "attention_dropout": 0.1, + "filter_size": 4 * d_model, + "relu_dropout": 0.1, + "layer_postprocess_dropout": 0.1, + "pad_embeddings_2_eight": True, + }, + + "decoder": TransformerDecoder, + "decoder_params": { + "layer_postprocess_dropout": 0.1, + "num_hidden_layers": num_layers, + "hidden_size": d_model, + "num_heads": 8, + "attention_dropout": 0.1, + "relu_dropout": 0.1, + "filter_size": 4 * d_model, + "beam_size": 4, + "alpha": 0.6, + "extra_decode_length": 50, + "EOS_ID": EOS_ID, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + }, + + "loss": PaddedCrossEntropyLossWithSmoothing, + "loss_params": { + "label_smoothing": 0.1, + } +} + +train_params = { + "data_layer": TransformerDataLayer, + "data_layer_params": { + 'data_dir': data_root, + 'file_pattern': "*train*", + 'src_vocab_file': data_root + "vocab.ende.32768", + 'max_length': 256, + 'shuffle': True, + 'repeat': 100000, + 'mode': 'train', + "delimiter": ' ', + }, +} + +eval_params = { + "batch_size_per_gpu": 256, + "data_layer": TransformerDataLayer, + "data_layer_params": { + 'data_dir': data_root, + 'file_pattern': "*dev*", + 'src_vocab_file': data_root + "vocab.ende.32768", + 'max_length': 256, + 'shuffle': False, + 'repeat': 1, + 'mode': 'train', + "delimiter": ' ', + }, +} + +infer_params = { + "batch_size_per_gpu": 64, # it is now in samples, not tokens + "batch_in_tokens": False, # this is necessary to preserve the order + "data_layer": TransformerDataLayer, + "data_layer_params": { + 'data_dir': data_root, + 'file_pattern': "*test*", + 'src_vocab_file': data_root + "vocab.ende.32768", + 'max_length': 256, + 'shuffle': False, + 'repeat': 1, + 'mode': 'train', + "delimiter": ' ', + }, +} \ No newline at end of file diff --git a/example_configs/text2text/transformer-big.py b/example_configs/text2text/transformer-base.py similarity index 100% rename from example_configs/text2text/transformer-big.py rename to example_configs/text2text/transformer-base.py diff --git a/open_seq2seq/data/text2text/__init__.py b/open_seq2seq/data/text2text/__init__.py index e69de29bb..15904c843 100644 --- a/open_seq2seq/data/text2text/__init__.py +++ b/open_seq2seq/data/text2text/__init__.py @@ -0,0 +1 @@ +from . 
import tokenizer \ No newline at end of file diff --git a/open_seq2seq/data/text2text/parse_output.py b/open_seq2seq/data/text2text/parse_output.py new file mode 100644 index 000000000..cfc4b80a9 --- /dev/null +++ b/open_seq2seq/data/text2text/parse_output.py @@ -0,0 +1,50 @@ +# Copyright (c) 2017 NVIDIA Corporation +""" +This file takes output of the inference stage produced using +TransformerDataLayer and converts it to simple tokenized text +""" +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals + +import argparse +import sys +import tokenizer + +def main(argv): + + with open(FLAGS.input_file, 'r') as in_file: + def trim(token): + return token[1:-1] + + print("******Reading from file: {}".format(FLAGS.input_file)) + with open(FLAGS.output_file, 'w') as out_file: + print("******Writing to file: {}".format(FLAGS.output_file)) + for line in in_file: + # merge and split by _ + escaped_tokens = "".join([trim(t) for t in line.strip().split(" ")]) + escaped_tokens = escaped_tokens.split("_") + + # unescape + unescaped_tokens = [] + for token in escaped_tokens: + if token: + unescaped_tokens.append(tokenizer._unescape_token(token)) + + # join and write + out_file.write(tokenizer._join_tokens_to_string(unescaped_tokens)+'\n') + print("******All done!") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--input_file", "-if", type=str, default="", + help="output of the inference stage produced using model with " + "TransformerDataLayer", + metavar="") + parser.add_argument( + "--output_file", "-of", type=str, default="tokenized_output.txt", + help="where to save output", + metavar="") + FLAGS, unparsed = parser.parse_known_args() + main(sys.argv) \ No newline at end of file diff --git a/open_seq2seq/data/text2text/process_data.py b/open_seq2seq/data/text2text/process_data.py new file mode 100644 index 000000000..2c7303f3a --- /dev/null +++ b/open_seq2seq/data/text2text/process_data.py @@ -0,0 +1,433 @@ +# Copyright 2018 MLBenchmark Group. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Download and preprocess WMT17 ende training and evaluation datasets.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os +import random +import sys +import tarfile +import urllib + +import six +import tensorflow as tf +import urllib.request + +import tokenizer + +# Data sources for training/evaluating the transformer translation model. +# If any of the training sources are changed, then either: +# 1) use the flag `--search` to find the best min count or +# 2) update the _TRAIN_DATA_MIN_COUNT constant. +# min_count is the minimum number of times a token must appear in the data +# before it is added to the vocabulary. 
"Best min count" refers to the value +# that generates a vocabulary set that is closest in size to _TARGET_VOCAB_SIZE. +_TRAIN_DATA_SOURCES = [ + { + "url": "http://data.statmt.org/wmt17/translation-task/" + "training-parallel-nc-v12.tgz", + "input": "news-commentary-v12.de-en.en", + "target": "news-commentary-v12.de-en.de", + }, + { + "url": "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz", + "input": "commoncrawl.de-en.en", + "target": "commoncrawl.de-en.de", + }, + { + "url": "http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz", + "input": "europarl-v7.de-en.en", + "target": "europarl-v7.de-en.de", + }, +] +# Use pre-defined minimum count to generate subtoken vocabulary. +_TRAIN_DATA_MIN_COUNT = 6 + + +_EVAL_DATA_SOURCES = [ + { + "url": "http://data.statmt.org/wmt17/translation-task/dev.tgz", + "input": "newstest2013.en", + "target": "newstest2013.de", + } +] + +_TEST_DATA_SOURCES = [ + { + "url": "https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/newstest2014.en", + "input": "newstest2014.en", + "target": "newstest2014.en", + } +] + +# Vocabulary constants +_TARGET_VOCAB_SIZE = 32768 # Number of subtokens in the vocabulary list. +_TARGET_THRESHOLD = 327 # Accept vocabulary if size is within this threshold +_VOCAB_FILE = "vocab.ende.%d" % _TARGET_VOCAB_SIZE + +# Strings to inclue in the generated files. +_PREFIX = "wmt32k" +_TRAIN_TAG = "train" +_EVAL_TAG = "dev" # Following WMT and Tensor2Tensor conventions, in which the + # evaluation datasets are tagged as "dev" for development. +_TEST_TAG = "test" + +# Number of files to split train and evaluation data +_TRAIN_SHARDS = 100 +_EVAL_SHARDS = 1 +_TEST_SHARDS = 1 + +def find_file(path, filename, max_depth=5): + """Returns full filepath if the file is in path or a subdirectory.""" + for root, dirs, files in os.walk(path): + if filename in files: + return os.path.join(root, filename) + + # Don't search past max_depth + depth = root[len(path) + 1:].count(os.sep) + if depth > max_depth: + del dirs[:] # Clear dirs + return None + + +############################################################################### +# Download and extraction functions +############################################################################### +def get_raw_files(raw_dir, data_source): + """Return raw files from source. Downloads/extracts if needed. + + Args: + raw_dir: string directory to store raw files + data_source: dictionary with + {"url": url of compressed dataset containing input and target files + "input": file with data in input language + "target": file with data in target language} + + Returns: + dictionary with + {"inputs": list of files containing data in input language + "targets": list of files containing corresponding data in target language + } + """ + raw_files = { + "inputs": [], + "targets": [], + } # keys + for d in data_source: + input_file, target_file = download_and_extract( + raw_dir, d["url"], d["input"], d["target"]) + raw_files["inputs"].append(input_file) + raw_files["targets"].append(target_file) + return raw_files + + +def download_report_hook(count, block_size, total_size): + """Report hook for download progress. + + Args: + count: current block number + block_size: block size + total_size: total size + """ + percent = int(count * block_size * 100 / total_size) + print("\r%d%%" % percent + " completed", end="\r") + + +def download_from_url(path, url): + """Download content from a url. 
+ + Args: + path: string directory where file will be downloaded + url: string url + + Returns: + Full path to downloaded file + """ + filename = url.split("/")[-1] + found_file = find_file(path, filename, max_depth=0) + if found_file is None: + filename = os.path.join(path, filename) + tf.logging.info("Downloading from %s to %s." % (url, filename)) + inprogress_filepath = filename + ".incomplete" + inprogress_filepath, _ = urllib.request.urlretrieve( + url, inprogress_filepath, reporthook=download_report_hook) + # Print newline to clear the carriage return from the download progress. + print() + tf.gfile.Rename(inprogress_filepath, filename) + return filename + else: + tf.logging.info("Already downloaded: %s (at %s)." % (url, found_file)) + return found_file + + +def download_and_extract(path, url, input_filename, target_filename): + """Extract files from downloaded compressed archive file. + + Args: + path: string directory where the files will be downloaded + url: url containing the compressed input and target files + input_filename: name of file containing data in source language + target_filename: name of file containing data in target language + + Returns: + Full paths to extracted input and target files. + + Raises: + OSError: if the the download/extraction fails. + """ + # Check if extracted files already exist in path + input_file = find_file(path, input_filename) + target_file = find_file(path, target_filename) + if input_file and target_file: + tf.logging.info("Already downloaded and extracted %s." % url) + return input_file, target_file + + # Download archive file if it doesn't already exist. + compressed_file = download_from_url(path, url) + + # Extract compressed files + tf.logging.info("Extracting %s." % compressed_file) + with tarfile.open(compressed_file, "r:gz") as corpus_tar: + corpus_tar.extractall(path) + + # Return filepaths of the requested files. + input_file = find_file(path, input_filename) + target_file = find_file(path, target_filename) + + if input_file and target_file: + return input_file, target_file + + raise OSError("Download/extraction failed for url %s to path %s" % + (url, path)) + + +def txt_line_iterator(path): + """Iterate through lines of file.""" + with tf.gfile.Open(path) as f: + for line in f: + yield line.strip() + + +def compile_files(raw_dir, raw_files, tag): + """Compile raw files into a single file for each language. + + Args: + raw_dir: Directory containing downloaded raw files. + raw_files: Dict containing filenames of input and target data. + {"inputs": list of files containing data in input language + "targets": list of files containing corresponding data in target language + } + tag: String to append to the compiled filename. + + Returns: + Full path of compiled input and target files. + """ + tf.logging.info("Compiling files with tag %s." % tag) + filename = "%s-%s" % (_PREFIX, tag) + input_compiled_file = os.path.join(raw_dir, filename + ".lang1") + target_compiled_file = os.path.join(raw_dir, filename + ".lang2") + + with tf.gfile.Open(input_compiled_file, mode="w") as input_writer: + with tf.gfile.Open(target_compiled_file, mode="w") as target_writer: + for i in range(len(raw_files["inputs"])): + input_file = raw_files["inputs"][i] + target_file = raw_files["targets"][i] + + tf.logging.info("Reading files %s and %s." 
% (input_file, target_file)) + write_file(input_writer, input_file) + write_file(target_writer, target_file) + return input_compiled_file, target_compiled_file + + +def write_file(writer, filename): + """Write all of lines from file using the writer.""" + for line in txt_line_iterator(filename): + writer.write(line) + writer.write("\n") + + +############################################################################### +# Data preprocessing +############################################################################### +def encode_and_save_files( + subtokenizer, data_dir, raw_files, tag, total_shards): + """Save data from files as encoded Examples in TFrecord format. + + Args: + subtokenizer: Subtokenizer object that will be used to encode the strings. + data_dir: The directory in which to write the examples + raw_files: A tuple of (input, target) data files. Each line in the input and + the corresponding line in target file will be saved in a tf.Example. + tag: String that will be added onto the file names. + total_shards: Number of files to divide the data into. + + Returns: + List of all files produced. + """ + # Create a file for each shard. + filepaths = [shard_filename(data_dir, tag, n + 1, total_shards) + for n in range(total_shards)] + + if all_exist(filepaths): + tf.logging.info("Files with tag %s already exist." % tag) + return filepaths + + tf.logging.info("Saving files with tag %s." % tag) + input_file = raw_files[0] + target_file = raw_files[1] + + # Write examples to each shard in round robin order. + tmp_filepaths = [fname + ".incomplete" for fname in filepaths] + writers = [tf.python_io.TFRecordWriter(fname) for fname in tmp_filepaths] + counter, shard = 0, 0 + for counter, (input_line, target_line) in enumerate(zip( + txt_line_iterator(input_file), txt_line_iterator(target_file))): + if counter > 0 and counter % 100000 == 0: + tf.logging.info("\tSaving case %d." 
% counter) + example = dict_to_example( + {"inputs": subtokenizer.encode(input_line, add_eos=True), + "targets": subtokenizer.encode(target_line, add_eos=True)}) + writers[shard].write(example.SerializeToString()) + shard = (shard + 1) % total_shards + for writer in writers: + writer.close() + + for tmp_name, final_name in zip(tmp_filepaths, filepaths): + tf.gfile.Rename(tmp_name, final_name) + + tf.logging.info("Saved %d Examples", counter) + return filepaths + + +def shard_filename(path, tag, shard_num, total_shards): + """Create filename for data shard.""" + return os.path.join( + path, "%s-%s-%.5d-of-%.5d" % (_PREFIX, tag, shard_num, total_shards)) + + +def shuffle_records(fname): + """Shuffle records in a single file.""" + tf.logging.info("Shuffling records in file %s" % fname) + + # Rename file prior to shuffling + tmp_fname = fname + ".unshuffled" + tf.gfile.Rename(fname, tmp_fname) + + reader = tf.python_io.tf_record_iterator(tmp_fname) + records = [] + for record in reader: + records.append(record) + if len(records) % 100000 == 0: + tf.logging.info("\tRead: %d", len(records)) + + random.shuffle(records) + + # Write shuffled records to original file name + with tf.python_io.TFRecordWriter(fname) as w: + for count, record in enumerate(records): + w.write(record) + if count > 0 and count % 100000 == 0: + tf.logging.info("\tWriting record: %d" % count) + + tf.gfile.Remove(tmp_fname) + + +def dict_to_example(dictionary): + """Converts a dictionary of string->int to a tf.Example.""" + features = {} + for k, v in six.iteritems(dictionary): + features[k] = tf.train.Feature(int64_list=tf.train.Int64List(value=v)) + return tf.train.Example(features=tf.train.Features(feature=features)) + + +def all_exist(filepaths): + """Returns true if all files in the list exist.""" + for fname in filepaths: + if not tf.gfile.Exists(fname): + return False + return True + + +def make_dir(path): + if not tf.gfile.Exists(path): + tf.logging.info("Creating directory %s" % path) + tf.gfile.MakeDirs(path) + + +def main(unused_argv): + """Obtain training and evaluation data for the Transformer model.""" + tf.logging.set_verbosity(tf.logging.INFO) + + make_dir(FLAGS.raw_dir) + make_dir(FLAGS.data_dir) + + # Get paths of download/extracted training and evaluation files. + tf.logging.info("Step 1/4: Downloading data from source") + train_files = get_raw_files(FLAGS.raw_dir, _TRAIN_DATA_SOURCES) + eval_files = get_raw_files(FLAGS.raw_dir, _EVAL_DATA_SOURCES) + test_files = get_raw_files(FLAGS.raw_dir, _TEST_DATA_SOURCES) + + # Create subtokenizer based on the training files. + tf.logging.info("Step 2/4: Creating subtokenizer and building vocabulary") + train_files_flat = train_files["inputs"] + train_files["targets"] + vocab_file = os.path.join(FLAGS.data_dir, _VOCAB_FILE) + subtokenizer = tokenizer.Subtokenizer.init_from_files( + vocab_file, train_files_flat, _TARGET_VOCAB_SIZE, _TARGET_THRESHOLD, + min_count=None if FLAGS.search else _TRAIN_DATA_MIN_COUNT) + + tf.logging.info("Step 3/4: Compiling training and evaluation data") + compiled_train_files = compile_files(FLAGS.raw_dir, train_files, _TRAIN_TAG) + compiled_eval_files = compile_files(FLAGS.raw_dir, eval_files, _EVAL_TAG) + compiled_test_files = compile_files(FLAGS.raw_dir, test_files, _TEST_TAG) + + # Tokenize and save data as Examples in the TFRecord format. 
+ tf.logging.info("Step 4/4: Preprocessing and saving data") + train_tfrecord_files = encode_and_save_files( + subtokenizer, FLAGS.data_dir, compiled_train_files, _TRAIN_TAG, + _TRAIN_SHARDS) + encode_and_save_files( + subtokenizer, FLAGS.data_dir, compiled_eval_files, _EVAL_TAG, + _EVAL_SHARDS) + encode_and_save_files( + subtokenizer, FLAGS.data_dir, compiled_test_files, _TEST_TAG, + _TEST_SHARDS) + + for fname in train_tfrecord_files: + shuffle_records(fname) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--data_dir", "-dd", type=str, default="/tmp/translate_ende", + help="[default: %(default)s] Directory for where the " + "translate_ende_wmt32k dataset is saved.", + metavar="
") + parser.add_argument( + "--raw_dir", "-rd", type=str, default="/tmp/translate_ende_raw", + help="[default: %(default)s] Path where the raw data will be downloaded " + "and extracted.", + metavar="") + parser.add_argument( + "--search", action="store_true", + help="If set, use binary search to find the vocabulary set with size" + "closest to the target size (%d)." % _TARGET_VOCAB_SIZE) + + FLAGS, unparsed = parser.parse_known_args() + main(sys.argv) \ No newline at end of file diff --git a/open_seq2seq/data/text2text/t2t.py b/open_seq2seq/data/text2text/t2t.py index c24ac23e6..22330bc04 100644 --- a/open_seq2seq/data/text2text/t2t.py +++ b/open_seq2seq/data/text2text/t2t.py @@ -45,8 +45,9 @@ is the list of training files. Second, while reading records using `parallel_interleave`, the `sloppy` argument is used to generate randomness in the order of the examples. -""" +3. Modified slightly to fit OpenSeq2Seq needs +""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -191,7 +192,7 @@ def batching_fn(bucket_id, grouped_dataset): def _read_and_batch_from_files( file_pattern, batch_size, max_length, num_cpu_cores, shuffle, repeat, - num_workers, worker_id): + num_workers, worker_id, batch_in_tokens): """Create dataset where each item is a dict of "inputs" and "targets". Args: @@ -204,6 +205,10 @@ def _read_and_batch_from_files( repeated forever. num_workers: Number of workers or number of Horovod workers worker_id: Worker id or Horovod rank + batch_in_tokens: whether to batch_size means amounts in tokens or sentence + pairs. batching in tokens is more efficient as it reduces PADs. batching in + sentences should be used in inference mode since order of + sentences is important Returns: tf.data.Dataset object containing examples loaded from the files. @@ -230,8 +235,12 @@ def _read_and_batch_from_files( # Remove examples where the input or target length exceeds the maximum length, dataset = dataset.filter(lambda x, y: _filter_max_length((x, y), max_length)) - # Batch such that each batch has examples of similar length. - dataset = _batch_examples(dataset, batch_size, max_length) + if batch_in_tokens: + # Batch such that each batch has examples of similar length. + dataset = _batch_examples(dataset, batch_size, max_length) + else: + # Examples can have different lenghts + dataset = dataset.padded_batch(batch_size, ([None], [None])) dataset = dataset.repeat(repeat) # Prefetch the next element to improve speed of input pipeline. 
diff --git a/open_seq2seq/data/text2text/text2text.py b/open_seq2seq/data/text2text/text2text.py index b2dcfcc6e..9c91e3f8e 100644 --- a/open_seq2seq/data/text2text/text2text.py +++ b/open_seq2seq/data/text2text/text2text.py @@ -166,7 +166,7 @@ def tgt_token_to_id(line): [SpecialTextTokens.EOS_ID.value], self._pad_lengths_to_eight), dtype="int32") _sources = tf.data.TextLineDataset(self.source_file)\ - .map(lambda line: tf.py_func(func=src_token_to_id,inp=[line], + .map(lambda line: tf.py_func(func=src_token_to_id, inp=[line], Tout=[tf.int32], stateful=False), num_parallel_calls=self._map_parallel_calls) \ .map(lambda tokens: (tokens, tf.size(tokens)), @@ -301,7 +301,8 @@ def build_graph(self): shuffle=self.params['shuffle'], repeat=self.params['repeat'], num_workers=self._num_workers, - worker_id=self._worker_id) + worker_id=self._worker_id, + batch_in_tokens=self.params.get('batch_in_tokens', True)) self._iterator = self.batched_dataset.make_initializable_iterator() x, y = self.iterator.get_next() diff --git a/open_seq2seq/models/text2text.py b/open_seq2seq/models/text2text.py index 40707bdfe..baef575a3 100644 --- a/open_seq2seq/models/text2text.py +++ b/open_seq2seq/models/text2text.py @@ -80,22 +80,23 @@ def infer(self, input_values, output_values): input_strings, output_strings = [], [] input_values = input_values['source_tensors'] for input_sample, output_sample in zip(input_values, output_values): - output_strings.append(text_ids_to_string( - output_sample[0], - self.get_data_layer().params['target_idx2seq'], - S_ID=self.decoder.params['GO_SYMBOL'], - EOS_ID=self.decoder.params['END_SYMBOL'], - PAD_ID=self.decoder.params['PAD_SYMBOL'], - ignore_special=True, delim=' ', - )) - input_strings.append(text_ids_to_string( - input_sample[0], - self.get_data_layer().params['source_idx2seq'], - S_ID=self.decoder.params['GO_SYMBOL'], - EOS_ID=self.decoder.params['END_SYMBOL'], - PAD_ID=self.decoder.params['PAD_SYMBOL'], - ignore_special=True, delim=' ', - )) + for i in range(0, input_sample.shape[0]): # iterate over batch dimension + output_strings.append(text_ids_to_string( + output_sample[i], + self.get_data_layer().params['target_idx2seq'], + S_ID=self.decoder.params['GO_SYMBOL'], + EOS_ID=self.decoder.params['END_SYMBOL'], + PAD_ID=self.decoder.params['PAD_SYMBOL'], + ignore_special=True, delim=' ', + )) + input_strings.append(text_ids_to_string( + input_sample[i], + self.get_data_layer().params['source_idx2seq'], + S_ID=self.decoder.params['GO_SYMBOL'], + EOS_ID=self.decoder.params['END_SYMBOL'], + PAD_ID=self.decoder.params['PAD_SYMBOL'], + ignore_special=True, delim=' ', + )) return input_strings, output_strings def finalize_inference(self, results_per_batch, output_file): diff --git a/requirements.txt b/requirements.txt index 3f6411f5c..e02f835c9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,6 @@ numpy nltk==3.2.5 resampy python_speech_features -pandas +pandas==0.23.0 six mpi4py diff --git a/run.py b/run.py index 1df544670..63f24b13d 100644 --- a/run.py +++ b/run.py @@ -187,7 +187,7 @@ def main(): raise ValueError("\"infer_output_file\" command line parameter is " "required in inference mode") infer_config.update(copy.deepcopy(config_module['infer_params'])) - nested_update(infer_config, nest_dict(vars(config_update))) + # nested_update(infer_config, nest_dict(vars(config_update))) if hvd is None or hvd.rank() == 0: deco_print("Inference config:") pprint.pprint(infer_config) From f408c849884fa460ccc408e83dd4920c503dfe03 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev 
Date: Tue, 12 Jun 2018 17:29:43 -0700 Subject: [PATCH 060/102] minor config bugfix --- example_configs/text2text/transformer-base-test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example_configs/text2text/transformer-base-test.py b/example_configs/text2text/transformer-base-test.py index c59758b23..ec8037ed4 100644 --- a/example_configs/text2text/transformer-base-test.py +++ b/example_configs/text2text/transformer-base-test.py @@ -118,11 +118,11 @@ infer_params = { "batch_size_per_gpu": 64, # it is now in samples, not tokens - "batch_in_tokens": False, # this is necessary to preserve the order "data_layer": TransformerDataLayer, "data_layer_params": { 'data_dir': data_root, 'file_pattern': "*test*", + 'batch_in_tokens': False, # this is necessary to preserve the order 'src_vocab_file': data_root + "vocab.ende.32768", 'max_length': 256, 'shuffle': False, From 4ce9a9d078d41af1a960f0e5bf16e373f69d5117 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Wed, 13 Jun 2018 13:43:46 -0700 Subject: [PATCH 061/102] flexible rnn cells in configs --- .../text2text/en-de-gnmt-like-4GPUs.py | 27 +- example_configs/text2text/nmt-reversal-RR.py | 29 +- .../text2text/transformer-base-test-mp.py | 133 ++++++++ .../text2text/transformer-base-test.py | 2 +- open_seq2seq/data/text2text/text2text.py | 1 + open_seq2seq/decoders/rnn_decoders.py | 102 ++++--- open_seq2seq/encoders/rnn_encoders.py | 115 ++++--- open_seq2seq/parts/rnns/utils.py | 287 ++++++++++-------- 8 files changed, 446 insertions(+), 250 deletions(-) create mode 100644 example_configs/text2text/transformer-base-test-mp.py diff --git a/example_configs/text2text/en-de-gnmt-like-4GPUs.py b/example_configs/text2text/en-de-gnmt-like-4GPUs.py index 91a05f2e7..895552d65 100644 --- a/example_configs/text2text/en-de-gnmt-like-4GPUs.py +++ b/example_configs/text2text/en-de-gnmt-like-4GPUs.py @@ -50,8 +50,13 @@ "minval": -0.1, "maxval": 0.1, }, - "encoder_cell_type": "lstm", - "encoder_cell_units": 1024, + #"encoder_cell_type": "lstm", + #"encoder_cell_units": 1024, + "core_cell": tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell, + "core_cell_params": { + "num_units": 1024, + # "forget_bias": 1.0, + }, "encoder_layers": 7, "encoder_dp_input_keep_prob": 0.8, "encoder_dp_output_keep_prob": 1.0, @@ -66,8 +71,13 @@ "minval": -0.1, "maxval": 0.1, }, - "decoder_cell_type": "lstm", - "decoder_cell_units": 1024, + #"decoder_cell_type": "lstm", + #"decoder_cell_units": 1024, + "core_cell": tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell, + "core_cell_params": { + "num_units": 1024, + # "forget_bias": 1.0, + }, "decoder_layers": 8, "decoder_dp_input_keep_prob": 0.8, "decoder_dp_output_keep_prob": 1.0, @@ -128,8 +138,13 @@ "decoder_params": { "beam_width": 10, "length_penalty": 1.0, - "decoder_cell_type": "lstm", - "decoder_cell_units": 1024, + #"decoder_cell_type": "lstm", + #"decoder_cell_units": 1024, + "core_cell": tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell, + "core_cell_params": { + "num_units": 1024, + # "forget_bias": 1.0, + }, "decoder_layers": 8, "decoder_dp_input_keep_prob": 0.8, "decoder_dp_output_keep_prob": 1.0, diff --git a/example_configs/text2text/nmt-reversal-RR.py b/example_configs/text2text/nmt-reversal-RR.py index 8e6b038a7..ba0ec1ded 100644 --- a/example_configs/text2text/nmt-reversal-RR.py +++ b/example_configs/text2text/nmt-reversal-RR.py @@ -40,8 +40,13 @@ "encoder": BidirectionalRNNEncoderWithEmbedding, "encoder_params": { - "encoder_cell_type": "lstm", - "encoder_cell_units": 128, + #"encoder_cell_type": "lstm", + 
#"encoder_cell_units": 128, + "core_cell": tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell,#tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 128, + #"forget_bias": 1.0, + }, "encoder_layers": 1, "encoder_dp_input_keep_prob": 0.8, "encoder_dp_output_keep_prob": 1.0, @@ -51,8 +56,14 @@ "decoder": RNNDecoderWithAttention, "decoder_params": { - "decoder_cell_type": "lstm", - "decoder_cell_units": 128, + #"decoder_cell_type": "lstm", + #"decoder_cell_units": 128, + "core_cell": tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell, + # tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 128, + # "forget_bias": 1.0, + }, "decoder_layers": 1, "decoder_dp_input_keep_prob": 0.8, "decoder_dp_output_keep_prob": 1.0, @@ -106,8 +117,14 @@ "batch_size_per_gpu": 1, "decoder": BeamSearchRNNDecoderWithAttention, "decoder_params": { - "decoder_cell_type": "lstm", - "decoder_cell_units": 128, + #"decoder_cell_type": "lstm", + #"decoder_cell_units": 128, + "core_cell": tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell, + # tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 128, + # "forget_bias": 1.0, + }, "decoder_layers": 1, "decoder_dp_input_keep_prob": 0.8, "decoder_dp_output_keep_prob": 1.0, diff --git a/example_configs/text2text/transformer-base-test-mp.py b/example_configs/text2text/transformer-base-test-mp.py new file mode 100644 index 000000000..8b91e990b --- /dev/null +++ b/example_configs/text2text/transformer-base-test-mp.py @@ -0,0 +1,133 @@ +from __future__ import absolute_import, division, print_function +from open_seq2seq.models import Text2Text +from open_seq2seq.encoders import TransformerEncoder +from open_seq2seq.decoders import TransformerDecoder +from open_seq2seq.data.text2text.text2text import TransformerDataLayer +from open_seq2seq.losses import PaddedCrossEntropyLossWithSmoothing +from open_seq2seq.data.text2text.text2text import SpecialTextTokens +from open_seq2seq.data.text2text.tokenizer import EOS_ID +from open_seq2seq.optimizers.lr_policies import transformer_policy +import tensorflow as tf + +""" +This configuration file describes a variant of Transformer model from +https://arxiv.org/abs/1706.03762 +""" + +base_model = Text2Text +d_model = 512 +num_layers = 6 + +data_root = "/tmp/translate_ende/" + +base_params = { + "use_horovod": False, + "num_gpus": 1, + "batch_size_per_gpu": 4096, # this size is in tokens + "max_steps": 500000, + "save_summaries_steps": 50, + "print_loss_steps": 50, + "print_samples_steps": 50, + "eval_steps": 4001, + "save_checkpoint_steps": 1000, + "logdir": "Transformer-MP", + #"dtype": tf.float32, + "dtype": "mixed", + "loss_scaling": "Backoff", + "optimizer": tf.contrib.opt.LazyAdamOptimizer, + "optimizer_params": { + "beta1": 0.9, + "beta2": 0.997, + "epsilon": 1e-09, + }, + + "lr_policy": transformer_policy, + "lr_policy_params": { + "learning_rate": 2.0, + "warmup_steps": 16000, + "d_model": d_model, + }, + + # "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + "encoder": TransformerEncoder, + "encoder_params": { + "encoder_layers": num_layers, + "hidden_size": d_model, + "num_heads": 8, + "attention_dropout": 0.1, + "filter_size": 4 * d_model, + "relu_dropout": 0.1, + "layer_postprocess_dropout": 0.1, + "pad_embeddings_2_eight": True, + }, + + "decoder": TransformerDecoder, + "decoder_params": { + "layer_postprocess_dropout": 0.1, + "num_hidden_layers": num_layers, + "hidden_size": d_model, + "num_heads": 8, + "attention_dropout": 0.1, 
+ "relu_dropout": 0.1, + "filter_size": 4 * d_model, + "beam_size": 4, + "alpha": 0.6, + "extra_decode_length": 50, + "EOS_ID": EOS_ID, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + }, + + "loss": PaddedCrossEntropyLossWithSmoothing, + "loss_params": { + "label_smoothing": 0.1, + } +} + +train_params = { + "data_layer": TransformerDataLayer, + "data_layer_params": { + 'data_dir': data_root, + 'file_pattern': "*train*", + 'src_vocab_file': data_root + "vocab.ende.32768", + 'max_length': 256, + 'shuffle': True, + 'repeat': 100000, + 'mode': 'train', + "delimiter": ' ', + }, +} + +eval_params = { + "batch_size_per_gpu": 256, + "data_layer": TransformerDataLayer, + "data_layer_params": { + 'data_dir': data_root, + 'file_pattern': "*dev*", + 'src_vocab_file': data_root + "vocab.ende.32768", + 'max_length': 256, + 'shuffle': False, + 'repeat': 1, + 'mode': 'train', + "delimiter": ' ', + }, +} + +infer_params = { + "batch_size_per_gpu": 64, # it is now in samples, not tokens + "data_layer": TransformerDataLayer, + "data_layer_params": { + 'data_dir': data_root, + 'file_pattern': "*test*", + 'batch_in_tokens': False, # this is necessary to preserve the order + 'src_vocab_file': data_root + "vocab.ende.32768", + 'max_length': 256, + 'shuffle': False, + 'repeat': 1, + 'mode': 'train', + "delimiter": ' ', + }, +} \ No newline at end of file diff --git a/example_configs/text2text/transformer-base-test.py b/example_configs/text2text/transformer-base-test.py index ec8037ed4..0ba99384e 100644 --- a/example_configs/text2text/transformer-base-test.py +++ b/example_configs/text2text/transformer-base-test.py @@ -22,7 +22,7 @@ base_params = { "use_horovod": False, - "num_gpus": 2, + "num_gpus": 1, "batch_size_per_gpu": 4096, # this size is in tokens "max_steps": 500000, "save_summaries_steps": 50, diff --git a/open_seq2seq/data/text2text/text2text.py b/open_seq2seq/data/text2text/text2text.py index 9c91e3f8e..2c21056a8 100644 --- a/open_seq2seq/data/text2text/text2text.py +++ b/open_seq2seq/data/text2text/text2text.py @@ -255,6 +255,7 @@ def get_optional_params(): 'num_cpu_cores': int, 'tgt_vocab_file': str, 'm_padding': bool, + 'batch_in_tokens': bool, }) def __init__(self, params, model, num_workers=1, worker_id=0): diff --git a/open_seq2seq/decoders/rnn_decoders.py b/open_seq2seq/decoders/rnn_decoders.py index a0fa1b10b..6ad093a3e 100644 --- a/open_seq2seq/decoders/rnn_decoders.py +++ b/open_seq2seq/decoders/rnn_decoders.py @@ -10,7 +10,7 @@ from open_seq2seq.parts.rnns.gnmt import GNMTAttentionMultiCell, \ gnmt_residual_fn -from open_seq2seq.parts.rnns.utils import create_rnn_cell +from open_seq2seq.parts.rnns.utils import single_cell from open_seq2seq.parts.rnns.attention_wrapper import BahdanauAttention, \ LuongAttention, \ AttentionWrapper @@ -30,8 +30,10 @@ def get_required_params(): 'tgt_emb_size': int, 'attention_layer_size': int, 'attention_type': ['bahdanau', 'luong', 'gnmt', 'gnmt_v2'], - 'decoder_cell_units': int, - 'decoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'], + #'decoder_cell_units': int, + #'decoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'], + 'core_cell': None, + 'core_cell_params': dict, 'decoder_layers': int, 'decoder_use_skip_connections': bool, 'batch_size': int, @@ -65,8 +67,8 @@ def __init__(self, params, model, * **END_SYMBOL** (int) --- END symbol id, must be the same as used in data layer. * **tgt_emb_size** (int) --- embedding size to use. 
- * **decoder_cell_units** (int) - number of units in RNN - * **decoder_cell_type** (string) - RNN type: lstm, gru, glstm, etc. + * **core_cell_params** (int) - parameters for RNN class + * **core_cell** (string) - RNN class. * **decoder_dp_input_keep_prob** (float) - dropout input keep probability. * **decoder_dp_output_keep_prob** (float) - dropout output keep probability. * **decoder_use_skip_connections** (bool) - use residual connections or not. @@ -184,8 +186,8 @@ def _decode(self, input_dict): self._tgt_vocab_size, use_bias=False, ) - cell_params = copy.deepcopy(self.params) - cell_params["num_units"] = self.params['decoder_cell_units'] + #cell_params = copy.deepcopy(self.params) + #cell_params["num_units"] = self.params['decoder_cell_units'] if self._mode == "train": dp_input_keep_prob = self.params['decoder_dp_input_keep_prob'] @@ -194,22 +196,17 @@ def _decode(self, input_dict): dp_input_keep_prob = 1.0 dp_output_keep_prob = 1.0 - if self.params['attention_type'].startswith('gnmt'): - residual_connections = False - wrap_to_multi_rnn = False - else: - residual_connections = self.params['decoder_use_skip_connections'] - wrap_to_multi_rnn = True - - self._decoder_cells = create_rnn_cell( - cell_type=self.params['decoder_cell_type'], - cell_params=cell_params, - num_layers=self.params['decoder_layers'], - dp_input_keep_prob=dp_input_keep_prob, - dp_output_keep_prob=dp_output_keep_prob, - residual_connections=residual_connections, - wrap_to_multi_rnn=wrap_to_multi_rnn, - ) + residual_connections = self.params['decoder_use_skip_connections'] + + # list of cells + self._decoder_cells = [ + single_cell(cell_class=self.params['core_cell'], + cell_params=self.params.get('core_cell_params', {}), + dp_input_keep_prob=dp_input_keep_prob, + dp_output_keep_prob=dp_output_keep_prob, + # residual connections are added a little differently for GNMT + residual_connections=False if self.params['attention_type'].startswith('gnmt') else residual_connections, + ) for _ in range(self.params['decoder_layers'])] attention_mechanism = self._build_attention( encoder_outputs, @@ -217,7 +214,6 @@ def _decode(self, input_dict): ) if self.params['attention_type'].startswith('gnmt'): attention_cell = self._decoder_cells.pop(0) - # attention_cell = tf.contrib.seq2seq.AttentionWrapper( attention_cell = AttentionWrapper( attention_cell, attention_mechanism=attention_mechanism, @@ -225,12 +221,12 @@ def _decode(self, input_dict): output_attention=False, name="gnmt_attention") attentive_decoder_cell = GNMTAttentionMultiCell( - attention_cell, self._add_residual_wrapper(self._decoder_cells), + attention_cell, self._add_residual_wrapper(self._decoder_cells) if residual_connections else self._decoder_cells, use_new_attention=(self.params['attention_type'] == 'gnmt_v2')) else: # attentive_decoder_cell = tf.contrib.seq2seq.AttentionWrapper( attentive_decoder_cell = AttentionWrapper( - cell=self._decoder_cells, + cell=tf.contrib.rnn.MultiRNNCell(self._decoder_cells), attention_mechanism=attention_mechanism, ) if self._mode == "train": @@ -371,8 +367,8 @@ def _decode(self, input_dict): self._tgt_vocab_size, use_bias=False, ) - cell_params = copy.deepcopy(self.params) - cell_params["num_units"] = self.params['decoder_cell_units'] + #cell_params = copy.deepcopy(self.params) + #cell_params["num_units"] = self.params['decoder_cell_units'] if self._mode == "train": dp_input_keep_prob = self.params['decoder_dp_input_keep_prob'] @@ -381,22 +377,34 @@ def _decode(self, input_dict): dp_input_keep_prob = 1.0 dp_output_keep_prob = 1.0 
- if self.params['attention_type'].startswith('gnmt'): - residual_connections = False - wrap_to_multi_rnn = False - else: - residual_connections = self.params['decoder_use_skip_connections'] - wrap_to_multi_rnn = True - - self._decoder_cells = create_rnn_cell( - cell_type=self.params['decoder_cell_type'], - cell_params=cell_params, - num_layers=self.params['decoder_layers'], - dp_input_keep_prob=dp_input_keep_prob, - dp_output_keep_prob=dp_output_keep_prob, - residual_connections=residual_connections, - wrap_to_multi_rnn=wrap_to_multi_rnn, - ) + #if self.params['attention_type'].startswith('gnmt'): + # residual_connections = False + # wrap_to_multi_rnn = False + #else: + # residual_connections = self.params['decoder_use_skip_connections'] + # wrap_to_multi_rnn = True + + #self._decoder_cells = create_rnn_cell( + # cell_type=self.params['decoder_cell_type'], + # cell_params=cell_params, + # num_layers=self.params['decoder_layers'], + # dp_input_keep_prob=dp_input_keep_prob, + # dp_output_keep_prob=dp_output_keep_prob, + # residual_connections=residual_connections, + # wrap_to_multi_rnn=wrap_to_multi_rnn, + #) + residual_connections = self.params['decoder_use_skip_connections'] + # list of cells + self._decoder_cells = [ + single_cell(cell_class=self.params['core_cell'], + cell_params=self.params.get('core_cell_params', {}), + dp_input_keep_prob=dp_input_keep_prob, + dp_output_keep_prob=dp_output_keep_prob, + # residual connections are added a little differently for GNMT + residual_connections=False if self.params[ + 'attention_type'].startswith( + 'gnmt') else residual_connections, + ) for _ in range(self.params['decoder_layers'])] tiled_enc_outputs = tf.contrib.seq2seq.tile_batch( encoder_outputs, @@ -420,11 +428,11 @@ def _decode(self, input_dict): output_attention=False, name="gnmt_attention") attentive_decoder_cell = GNMTAttentionMultiCell( - attention_cell, self._add_residual_wrapper(self._decoder_cells), + attention_cell, self._add_residual_wrapper(self._decoder_cells) if residual_connections else self._decoder_cells, use_new_attention=(self.params['attention_type'] == 'gnmt_v2')) - else: + else: # non-GNMT attentive_decoder_cell = AttentionWrapper( - cell=self._decoder_cells, + cell=tf.contrib.rnn.MultiRNNCell(self._decoder_cells), attention_mechanism=attention_mechanism, ) batch_size_tensor = tf.constant(self._batch_size) diff --git a/open_seq2seq/encoders/rnn_encoders.py b/open_seq2seq/encoders/rnn_encoders.py index 472ba85a5..ed7fe2d61 100644 --- a/open_seq2seq/encoders/rnn_encoders.py +++ b/open_seq2seq/encoders/rnn_encoders.py @@ -5,10 +5,9 @@ from __future__ import absolute_import, division, print_function from __future__ import unicode_literals -import copy import tensorflow as tf -from open_seq2seq.parts.rnns.utils import create_rnn_cell +from open_seq2seq.parts.rnns.utils import single_cell from .encoder import Encoder @@ -22,8 +21,8 @@ def get_required_params(): return dict(Encoder.get_required_params(), **{ 'src_vocab_size': int, 'src_emb_size': int, - 'encoder_cell_units': int, - 'encoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'], + 'core_cell': None, + 'core_cell_params': dict, 'encoder_layers': int, 'encoder_use_skip_connections': bool, }) @@ -87,10 +86,6 @@ def _encode(self, input_dict): source_sequence = input_dict['source_tensors'][0] source_length = input_dict['source_tensors'][1] - - cell_params = copy.deepcopy(self.params) - cell_params["num_units"] = self.params['encoder_cell_units'] - self._enc_emb_w = tf.get_variable( name="EncoderEmbeddingMatrix", 
shape=[self._src_vocab_size, self._src_emb_size], @@ -104,14 +99,16 @@ def _encode(self, input_dict): dp_input_keep_prob = 1.0 dp_output_keep_prob = 1.0 - self._encoder_cell_fw = create_rnn_cell( - cell_type=self.params['encoder_cell_type'], - cell_params=cell_params, - num_layers=self.params['encoder_layers'], - dp_input_keep_prob=dp_input_keep_prob, - dp_output_keep_prob=dp_output_keep_prob, - residual_connections=self.params['encoder_use_skip_connections'], - ) + fwd_cells = [ + single_cell(cell_class=self.params['core_cell'], + cell_params=self.params.get('core_cell_params', {}), + dp_input_keep_prob=dp_input_keep_prob, + dp_output_keep_prob=dp_output_keep_prob, + residual_connections=self.params[ + 'encoder_use_skip_connections'] + ) for _ in range(self.params['encoder_layers'])] + + self._encoder_cell_fw = tf.contrib.rnn.MultiRNNCell(fwd_cells) time_major = self.params.get("time_major", False) use_swap_memory = self.params.get("use_swap_memory", False) @@ -157,10 +154,10 @@ def get_required_params(): return dict(Encoder.get_required_params(), **{ 'src_vocab_size': int, 'src_emb_size': int, - 'encoder_cell_units': int, - 'encoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'], 'encoder_layers': int, 'encoder_use_skip_connections': bool, + 'core_cell': None, + 'core_cell_params': dict, }) @staticmethod @@ -227,9 +224,6 @@ def _encode(self, input_dict): dtype=tf.float32 ) - cell_params = copy.deepcopy(self.params) - cell_params["num_units"] = self.params['encoder_cell_units'] - if self._mode == "train": dp_input_keep_prob = self.params['encoder_dp_input_keep_prob'] dp_output_keep_prob = self.params['encoder_dp_output_keep_prob'] @@ -237,25 +231,27 @@ def _encode(self, input_dict): dp_input_keep_prob = 1.0 dp_output_keep_prob = 1.0 + fwd_cells = [ + single_cell(cell_class=self.params['core_cell'], + cell_params=self.params.get('core_cell_params', {}), + dp_input_keep_prob=dp_input_keep_prob, + dp_output_keep_prob=dp_output_keep_prob, + residual_connections=self.params['encoder_use_skip_connections'] + ) for _ in range(self.params['encoder_layers'])] + bwd_cells = [ + single_cell(cell_class=self.params['core_cell'], + cell_params=self.params.get('core_cell_params', {}), + dp_input_keep_prob=dp_input_keep_prob, + dp_output_keep_prob=dp_output_keep_prob, + residual_connections=self.params['encoder_use_skip_connections'] + ) for _ in range(self.params['encoder_layers'])] + + with tf.variable_scope("FW"): - self._encoder_cell_fw = create_rnn_cell( - cell_type=self.params['encoder_cell_type'], - cell_params=cell_params, - num_layers=self.params['encoder_layers'], - dp_input_keep_prob=dp_input_keep_prob, - dp_output_keep_prob=dp_output_keep_prob, - residual_connections=self.params['encoder_use_skip_connections'] - ) + self._encoder_cell_fw = tf.contrib.rnn.MultiRNNCell(fwd_cells) with tf.variable_scope("BW"): - self._encoder_cell_bw = create_rnn_cell( - cell_type=self.params['encoder_cell_type'], - cell_params=cell_params, - num_layers=self.params['encoder_layers'], - dp_input_keep_prob=dp_input_keep_prob, - dp_output_keep_prob=dp_output_keep_prob, - residual_connections=self.params['encoder_use_skip_connections'] - ) + self._encoder_cell_bw = tf.contrib.rnn.MultiRNNCell(bwd_cells) embedded_inputs = tf.cast(tf.nn.embedding_lookup( self.enc_emb_w, @@ -301,8 +297,10 @@ def get_required_params(): return dict(Encoder.get_required_params(), **{ 'src_vocab_size': int, 'src_emb_size': int, - 'encoder_cell_units': int, - 'encoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'], + 'core_cell': None, + 
'core_cell_params': dict, + #'encoder_cell_units': int, + #'encoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'], 'encoder_layers': int, 'encoder_use_skip_connections': bool, }) @@ -353,27 +351,24 @@ def _encode(self, input_dict): if self.params['encoder_layers'] < 2: raise ValueError("GNMT encoder must have at least 2 layers") - cell_params = copy.deepcopy(self.params) - cell_params["num_units"] = self.params['encoder_cell_units'] + #cell_params = copy.deepcopy(self.params) + #cell_params["num_units"] = self.params['encoder_cell_units'] with tf.variable_scope("Level1FW"): - self._encoder_l1_cell_fw = create_rnn_cell( - cell_type=self.params['encoder_cell_type'], - cell_params=cell_params, - num_layers=1, + self._encoder_l1_cell_fw = single_cell( + cell_class=self.params['core_cell'], + cell_params=self.params.get('core_cell_params', {}), dp_input_keep_prob=1.0, dp_output_keep_prob=1.0, - residual_connections=False, - ) + residual_connections=False) + with tf.variable_scope("Level1BW"): - self._encoder_l1_cell_bw = create_rnn_cell( - cell_type=self.params['encoder_cell_type'], - cell_params=cell_params, - num_layers=1, + self._encoder_l1_cell_bw = single_cell( + cell_class=self.params['core_cell'], + cell_params=self.params.get('core_cell_params', {}), dp_input_keep_prob=1.0, dp_output_keep_prob=1.0, - residual_connections=False, - ) + residual_connections=False) if self._mode == "train": dp_input_keep_prob = self.params['encoder_dp_input_keep_prob'] @@ -383,15 +378,13 @@ def _encode(self, input_dict): dp_output_keep_prob = 1.0 with tf.variable_scope("UniDirLevel"): - self._encoder_cells = create_rnn_cell( - cell_type=self.params['encoder_cell_type'], - cell_params=cell_params, - num_layers=self.params['encoder_layers'] - 1, + self._encoder_cells = [single_cell( + cell_class=self.params['core_cell'], + cell_params=self.params.get('core_cell_params', {}), dp_input_keep_prob=dp_input_keep_prob, dp_output_keep_prob=dp_output_keep_prob, - residual_connections=False, - wrap_to_multi_rnn=False, - ) + residual_connections=False) for _ in range(self.params['encoder_layers'] - 1)] + # add residual connections starting from the third layer for idx, cell in enumerate(self._encoder_cells): if idx > 0: @@ -422,7 +415,7 @@ def _encode(self, input_dict): inputs=encoder_l1_outputs, sequence_length=source_length, swap_memory=use_swap_memory, - time_major = time_major, + time_major=time_major, dtype=encoder_l1_outputs.dtype, ) diff --git a/open_seq2seq/parts/rnns/utils.py b/open_seq2seq/parts/rnns/utils.py index a16bab936..d0599b184 100644 --- a/open_seq2seq/parts/rnns/utils.py +++ b/open_seq2seq/parts/rnns/utils.py @@ -11,134 +11,163 @@ import tensorflow as tf -def create_rnn_cell(cell_type, - cell_params, - num_layers=1, - dp_input_keep_prob=1.0, - dp_output_keep_prob=1.0, - residual_connections=False, - wrap_to_multi_rnn=True): - """ - TODO: MOVE THIS properly to utils. Write doc - :param cell_type: - :param cell_params: - :param num_layers: - :param dp_input_keep_prob: - :param dp_output_keep_prob: - :param residual_connections: - :return: - """ - def single_cell(cell_params): - # TODO: This method is ugly - redo - size = cell_params["num_units"] - proj_size = None if "proj_size" not in cell_params else cell_params["proj_size"] +def single_cell(cell_class, + cell_params, + dp_input_keep_prob=1.0, + dp_output_keep_prob=1.0, + residual_connections=False): + """Creates an instance of the rnn cell. 
+ Such cell describes one step one layer and can include residual connection + and/or dropout - if cell_type == "lstm": - if not residual_connections: - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return tf.nn.rnn_cell.LSTMCell(num_units=size, - num_proj=proj_size, - forget_bias=1.0) - else: - return DropoutWrapper(tf.nn.rnn_cell.LSTMCell(num_units=size, - num_proj=proj_size, - forget_bias=1.0), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob) - else: # residual connection required - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return ResidualWrapper(tf.nn.rnn_cell.LSTMCell(num_units=size, - num_proj=proj_size, - forget_bias=1.0)) - else: - return ResidualWrapper(DropoutWrapper( - tf.nn.rnn_cell.LSTMCell( - num_units=size, - num_proj=proj_size, - forget_bias=1.0, - ), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob, - )) - elif cell_type == "gru": - if not residual_connections: - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return tf.nn.rnn_cell.GRUCell(num_units=size) - else: - return DropoutWrapper( - tf.nn.rnn_cell.GRUCell(num_units=size), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob, - ) - else: # residual connection required - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return ResidualWrapper(tf.nn.rnn_cell.GRUCell(num_units=size)) - else: - return ResidualWrapper(DropoutWrapper( - tf.nn.rnn_cell.GRUCell(num_units=size), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob), - ) - elif cell_type == "glstm": - num_groups = cell_params["num_groups"] - if not residual_connections: - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return GLSTMCell(num_units=size, - number_of_groups=num_groups, - num_proj=proj_size, - forget_bias=1.0) - else: - return DropoutWrapper(GLSTMCell(num_units=size, - number_of_groups=num_groups, - num_proj=proj_size, - forget_bias=1.0), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob) - else: # residual connection required - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return ResidualWrapper(GLSTMCell(num_units=size, - number_of_groups=num_groups, - num_proj=proj_size, - forget_bias=1.0)) - else: - return ResidualWrapper(DropoutWrapper( - GLSTMCell( - num_units=size, - number_of_groups=num_groups, - num_proj=proj_size, - forget_bias=1.0, - ), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob, - )) - elif cell_type == "slstm": - if not residual_connections: - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return BasicSLSTMCell(num_units=size) - else: - return DropoutWrapper(BasicSLSTMCell(num_units=size), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob - ) - else: # residual connection required - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return ResidualWrapper(BasicSLSTMCell(num_units=size)) - else: - return ResidualWrapper(DropoutWrapper( - BasicSLSTMCell(num_units=size), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob, - )) - else: - raise ValueError("Unknown RNN cell class: {}".format(cell_type)) + Args: + cell_class: Tensorflow RNN cell class + cell_params (dict): cell parameters + dp_input_keep_prob (float): (default: 1.0) input dropout keep probability + dp_output_keep_prob (float): (default: 1.0) output dropout keep probability + residual_connections (bool): whether to add residual connection 
- if num_layers > 1: - if wrap_to_multi_rnn: - return MultiRNNCell([single_cell(cell_params) for _ in range(num_layers)]) - else: - cells = [] # for GNMT-like attention in decoder - for i in range(num_layers): - cells.append(single_cell(cell_params)) - return cells - else: - return single_cell(cell_params) + Returns: + TF RNN instance + """ + cell = cell_class(**cell_params) + if residual_connections: + cell = ResidualWrapper(cell) + if dp_input_keep_prob != 1.0 or dp_output_keep_prob != 1.0: + cell = DropoutWrapper(cell, input_keep_prob=dp_input_keep_prob, + output_keep_prob=dp_output_keep_prob) + return cell + + + +# def create_rnn_cell(cell_type, +# cell_params, +# num_layers=1, +# dp_input_keep_prob=1.0, +# dp_output_keep_prob=1.0, +# residual_connections=False, +# wrap_to_multi_rnn=True): +# """ +# TODO: MOVE THIS properly to utils. Write doc +# :param cell_type: +# :param cell_params: +# :param num_layers: +# :param dp_input_keep_prob: +# :param dp_output_keep_prob: +# :param residual_connections: +# :return: +# """ +# def single_cell(cell_params): +# # TODO: This method is ugly - redo +# size = cell_params["num_units"] +# proj_size = None if "proj_size" not in cell_params else cell_params["proj_size"] +# +# if cell_type == "lstm": +# if not residual_connections: +# if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: +# return tf.nn.rnn_cell.LSTMCell(num_units=size, +# num_proj=proj_size, +# forget_bias=1.0) +# else: +# return DropoutWrapper(tf.nn.rnn_cell.LSTMCell(num_units=size, +# num_proj=proj_size, +# forget_bias=1.0), +# input_keep_prob=dp_input_keep_prob, +# output_keep_prob=dp_output_keep_prob) +# else: # residual connection required +# if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: +# return ResidualWrapper(tf.nn.rnn_cell.LSTMCell(num_units=size, +# num_proj=proj_size, +# forget_bias=1.0)) +# else: +# return ResidualWrapper(DropoutWrapper( +# tf.nn.rnn_cell.LSTMCell( +# num_units=size, +# num_proj=proj_size, +# forget_bias=1.0, +# ), +# input_keep_prob=dp_input_keep_prob, +# output_keep_prob=dp_output_keep_prob, +# )) +# elif cell_type == "gru": +# if not residual_connections: +# if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: +# return tf.nn.rnn_cell.GRUCell(num_units=size) +# else: +# return DropoutWrapper( +# tf.nn.rnn_cell.GRUCell(num_units=size), +# input_keep_prob=dp_input_keep_prob, +# output_keep_prob=dp_output_keep_prob, +# ) +# else: # residual connection required +# if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: +# return ResidualWrapper(tf.nn.rnn_cell.GRUCell(num_units=size)) +# else: +# return ResidualWrapper(DropoutWrapper( +# tf.nn.rnn_cell.GRUCell(num_units=size), +# input_keep_prob=dp_input_keep_prob, +# output_keep_prob=dp_output_keep_prob), +# ) +# elif cell_type == "glstm": +# num_groups = cell_params["num_groups"] +# if not residual_connections: +# if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: +# return GLSTMCell(num_units=size, +# number_of_groups=num_groups, +# num_proj=proj_size, +# forget_bias=1.0) +# else: +# return DropoutWrapper(GLSTMCell(num_units=size, +# number_of_groups=num_groups, +# num_proj=proj_size, +# forget_bias=1.0), +# input_keep_prob=dp_input_keep_prob, +# output_keep_prob=dp_output_keep_prob) +# else: # residual connection required +# if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: +# return ResidualWrapper(GLSTMCell(num_units=size, +# number_of_groups=num_groups, +# num_proj=proj_size, +# forget_bias=1.0)) +# else: +# return 
ResidualWrapper(DropoutWrapper( +# GLSTMCell( +# num_units=size, +# number_of_groups=num_groups, +# num_proj=proj_size, +# forget_bias=1.0, +# ), +# input_keep_prob=dp_input_keep_prob, +# output_keep_prob=dp_output_keep_prob, +# )) +# elif cell_type == "slstm": +# if not residual_connections: +# if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: +# return BasicSLSTMCell(num_units=size) +# else: +# return DropoutWrapper(BasicSLSTMCell(num_units=size), +# input_keep_prob=dp_input_keep_prob, +# output_keep_prob=dp_output_keep_prob +# ) +# else: # residual connection required +# if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: +# return ResidualWrapper(BasicSLSTMCell(num_units=size)) +# else: +# return ResidualWrapper(DropoutWrapper( +# BasicSLSTMCell(num_units=size), +# input_keep_prob=dp_input_keep_prob, +# output_keep_prob=dp_output_keep_prob, +# )) +# else: +# raise ValueError("Unknown RNN cell class: {}".format(cell_type)) +# +# if num_layers > 1: +# if wrap_to_multi_rnn: +# return MultiRNNCell([single_cell(cell_params) for _ in range(num_layers)]) +# else: +# cells = [] # for GNMT-like attention in decoder +# for i in range(num_layers): +# cells.append(single_cell(cell_params)) +# return cells +# else: +# return single_cell(cell_params) From a23dbe49c5ae79f6e09af6c39f908a95609f0796 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Wed, 13 Jun 2018 18:14:07 -0700 Subject: [PATCH 062/102] 8 GPU config for GNMT with cuDNN compatible cells --- .../en-de-gnmt-like-8GPUs-horovod.py | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py diff --git a/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py b/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py new file mode 100644 index 000000000..66be1543f --- /dev/null +++ b/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py @@ -0,0 +1,164 @@ +from __future__ import absolute_import, division, print_function +import tensorflow as tf + +from open_seq2seq.models import Text2Text +from open_seq2seq.encoders import GNMTLikeEncoderWithEmbedding +from open_seq2seq.decoders import RNNDecoderWithAttention, \ + BeamSearchRNNDecoderWithAttention +from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer +from open_seq2seq.losses import BasicSequenceLoss +from open_seq2seq.data.text2text.text2text import SpecialTextTokens +from open_seq2seq.optimizers.lr_policies import exp_decay + +data_root = "/data/wmt16_s2s/" + +base_model = Text2Text + +base_params = { + "use_horovod": True, + "num_gpus": 1, + "max_steps": 34000, + "batch_size_per_gpu": 128, + "save_summaries_steps": 50, + "print_loss_steps": 48, + "print_samples_steps": 48, + "eval_steps": 1000, + "save_checkpoint_steps": 2001, + "logdir": "GNMT-MP", + "optimizer": "Adam", + "optimizer_params": {}, + # luong10 decay scheme + "lr_policy": exp_decay, + "lr_policy_params": { + "learning_rate": 0.002, + "begin_decay_at": 17000, + "decay_steps": 1700, + "decay_rate": 0.5, + "use_staircase_decay": True, + "min_lr": 0.0000005, + }, + #"summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + #"max_grad_norm": 32768.0, + #"dtype": tf.float32, + "dtype": "mixed", + "automatic_loss_scaling": "Backoff", + "encoder": GNMTLikeEncoderWithEmbedding, + "encoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": 
tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell, + "core_cell_params": { + "num_units": 1024, + }, + "encoder_layers": 7, + "encoder_dp_input_keep_prob": 0.8, + "encoder_dp_output_keep_prob": 1.0, + "encoder_use_skip_connections": True, + "src_emb_size": 1024, + }, + + "decoder": RNNDecoderWithAttention, + "decoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell, + "core_cell_params": { + "num_units": 1024, + }, + "decoder_layers": 8, + "decoder_dp_input_keep_prob": 0.8, + "decoder_dp_output_keep_prob": 1.0, + "decoder_use_skip_connections": True, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + + "tgt_emb_size": 1024, + "attention_type": "gnmt_v2", + "attention_layer_size": 1024, + }, + + "loss": BasicSequenceLoss, + "loss_params": { + "offset_target_by_one": True, + "average_across_timestep": True, + "do_mask": True + } +} + +train_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": True, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"train.tok.clean.bpe.32000.en", + "target_file": data_root+"train.tok.clean.bpe.32000.de", + "delimiter": " ", + "shuffle": True, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "max_length": 50, + }, +} +eval_params = { + "batch_size_per_gpu": 16, + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": True, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"newstest2013.tok.bpe.32000.en", + "target_file": data_root+"newstest2013.tok.bpe.32000.de", + "delimiter": " ", + "shuffle": False, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "max_length": 32, + }, +} + +infer_params = { + "batch_size_per_gpu": 1, + "decoder": BeamSearchRNNDecoderWithAttention, + "decoder_params": { + "beam_width": 10, + "length_penalty": 1.0, + "core_cell": tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell, + "core_cell_params": { + "num_units": 1024, + }, + "decoder_layers": 8, + "decoder_dp_input_keep_prob": 0.8, + "decoder_dp_output_keep_prob": 1.0, + "decoder_use_skip_connections": True, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + "tgt_emb_size": 1024, + "attention_type": "gnmt_v2", + "attention_layer_size": 1024, + }, + + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": True, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"newstest2014.tok.bpe.32000.en", + # this is intentional + "target_file": data_root+"newstest2014.tok.bpe.32000.en", + "delimiter": " ", + "shuffle": False, + "repeat": False, + "max_length": 512, + }, +} From 4b028762c68a5f853f71be76f3801cee186d044d Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Wed, 13 Jun 2018 18:25:48 -0700 Subject: [PATCH 063/102] fix config --- example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py b/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py index 66be1543f..72311d704 100644 --- 
a/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py +++ b/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py @@ -42,7 +42,7 @@ #"max_grad_norm": 32768.0, #"dtype": tf.float32, "dtype": "mixed", - "automatic_loss_scaling": "Backoff", + "loss_scaling": "Backoff", "encoder": GNMTLikeEncoderWithEmbedding, "encoder_params": { "initializer": tf.random_uniform_initializer, From 8d27c98d91efd01b61c45daab7018a26c6af879c Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Wed, 13 Jun 2018 19:51:41 -0700 Subject: [PATCH 064/102] put type back to float32 --- example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py b/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py index 72311d704..5ba14c539 100644 --- a/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py +++ b/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py @@ -40,9 +40,9 @@ #"summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], #"max_grad_norm": 32768.0, - #"dtype": tf.float32, - "dtype": "mixed", - "loss_scaling": "Backoff", + "dtype": tf.float32, + #"dtype": "mixed", + #"loss_scaling": "Backoff", "encoder": GNMTLikeEncoderWithEmbedding, "encoder_params": { "initializer": tf.random_uniform_initializer, From 77845f51308b64f00a3fe948603f1c5b68e0ca06 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Thu, 14 Jun 2018 17:48:05 -0700 Subject: [PATCH 065/102] cuDNN accelerated GNMT encoder and new configs --- .../en-de-gnmt-like-8GPUs-horovod.py | 28 ++- ...-de-gnmt-like-8GPUs-horovod_iter_size10.py | 163 ++++++++++++++++++ open_seq2seq/decoders/rnn_decoders.py | 6 +- open_seq2seq/encoders/__init__.py | 3 +- open_seq2seq/encoders/rnn_encoders.py | 150 +++++++++++++++- 5 files changed, 331 insertions(+), 19 deletions(-) create mode 100644 example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size10.py diff --git a/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py b/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py index 5ba14c539..885a0e661 100644 --- a/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py +++ b/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py @@ -2,7 +2,7 @@ import tensorflow as tf from open_seq2seq.models import Text2Text -from open_seq2seq.encoders import GNMTLikeEncoderWithEmbedding +from open_seq2seq.encoders import GNMTLikeEncoderWithEmbedding_cuDNN from open_seq2seq.decoders import RNNDecoderWithAttention, \ BeamSearchRNNDecoderWithAttention from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer @@ -24,13 +24,13 @@ "print_samples_steps": 48, "eval_steps": 1000, "save_checkpoint_steps": 2001, - "logdir": "GNMT-MP", + "logdir": "GNMT-MP-cuDNN-enc", "optimizer": "Adam", "optimizer_params": {}, # luong10 decay scheme "lr_policy": exp_decay, "lr_policy_params": { - "learning_rate": 0.002, + "learning_rate": 0.001, "begin_decay_at": 17000, "decay_steps": 1700, "decay_rate": 0.5, @@ -40,24 +40,20 @@ #"summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], #"max_grad_norm": 32768.0, - "dtype": tf.float32, - #"dtype": "mixed", - #"loss_scaling": "Backoff", - "encoder": GNMTLikeEncoderWithEmbedding, + #"dtype": tf.float32, + "dtype": "mixed", + "loss_scaling": "Backoff", + "encoder": GNMTLikeEncoderWithEmbedding_cuDNN, "encoder_params": { 
"initializer": tf.random_uniform_initializer, "initializer_params": { "minval": -0.1, "maxval": 0.1, }, - "core_cell": tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell, - "core_cell_params": { - "num_units": 1024, - }, + "encoder_cell_type": "lstm", + "encoder_cell_units": 1024, "encoder_layers": 7, - "encoder_dp_input_keep_prob": 0.8, "encoder_dp_output_keep_prob": 1.0, - "encoder_use_skip_connections": True, "src_emb_size": 1024, }, @@ -68,10 +64,12 @@ "minval": -0.1, "maxval": 0.1, }, - "core_cell": tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell, + "core_cell": tf.nn.rnn_cell.LSTMCell, "core_cell_params": { - "num_units": 1024, + "num_units": 1024, + "forget_bias": 1.0, }, + "decoder_layers": 8, "decoder_dp_input_keep_prob": 0.8, "decoder_dp_output_keep_prob": 1.0, diff --git a/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size10.py b/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size10.py new file mode 100644 index 000000000..373de0af4 --- /dev/null +++ b/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size10.py @@ -0,0 +1,163 @@ +from __future__ import absolute_import, division, print_function +import tensorflow as tf + +from open_seq2seq.models import Text2Text +from open_seq2seq.encoders import GNMTLikeEncoderWithEmbedding_cuDNN +from open_seq2seq.decoders import RNNDecoderWithAttention, \ + BeamSearchRNNDecoderWithAttention +from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer +from open_seq2seq.losses import BasicSequenceLoss +from open_seq2seq.data.text2text.text2text import SpecialTextTokens +from open_seq2seq.optimizers.lr_policies import exp_decay + +data_root = "/data/wmt16_s2s/" + +base_model = Text2Text + +base_params = { + "use_horovod": True, + "iter_size": 10, + "num_gpus": 1, + "max_steps": 3400, + "batch_size_per_gpu": 128, + "save_summaries_steps": 50, + "print_loss_steps": 48, + "print_samples_steps": 48, + "eval_steps": 1000, + "save_checkpoint_steps": 2001, + "logdir": "GNMT-MP-cuDNN-enc", + "optimizer": "Adam", + "optimizer_params": {}, + # luong10 decay scheme + "lr_policy": exp_decay, + "lr_policy_params": { + "learning_rate": 0.001, + "begin_decay_at": 1700, + "decay_steps": 170, + "decay_rate": 0.5, + "use_staircase_decay": True, + "min_lr": 0.0000005, + }, + #"summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + #"max_grad_norm": 32768.0, + #"dtype": tf.float32, + "dtype": "mixed", + "loss_scaling": "Backoff", + "encoder": GNMTLikeEncoderWithEmbedding_cuDNN, + "encoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "encoder_cell_type": "lstm", + "encoder_cell_units": 1024, + "encoder_layers": 7, + "encoder_dp_output_keep_prob": 1.0, + "src_emb_size": 1024, + }, + + "decoder": RNNDecoderWithAttention, + "decoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 1024, + "forget_bias": 1.0, + }, + + "decoder_layers": 8, + "decoder_dp_input_keep_prob": 0.8, + "decoder_dp_output_keep_prob": 1.0, + "decoder_use_skip_connections": True, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + + "tgt_emb_size": 1024, + "attention_type": "gnmt_v2", + "attention_layer_size": 1024, + }, + + "loss": BasicSequenceLoss, + "loss_params": { + "offset_target_by_one": 
True, + "average_across_timestep": True, + "do_mask": True + } +} + +train_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": True, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"train.tok.clean.bpe.32000.en", + "target_file": data_root+"train.tok.clean.bpe.32000.de", + "delimiter": " ", + "shuffle": True, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "max_length": 50, + }, +} +eval_params = { + "batch_size_per_gpu": 16, + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": True, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"newstest2013.tok.bpe.32000.en", + "target_file": data_root+"newstest2013.tok.bpe.32000.de", + "delimiter": " ", + "shuffle": False, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "max_length": 32, + }, +} + +infer_params = { + "batch_size_per_gpu": 1, + "decoder": BeamSearchRNNDecoderWithAttention, + "decoder_params": { + "beam_width": 10, + "length_penalty": 1.0, + "core_cell": tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell, + "core_cell_params": { + "num_units": 1024, + }, + "decoder_layers": 8, + "decoder_dp_input_keep_prob": 0.8, + "decoder_dp_output_keep_prob": 1.0, + "decoder_use_skip_connections": True, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + "tgt_emb_size": 1024, + "attention_type": "gnmt_v2", + "attention_layer_size": 1024, + }, + + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": True, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"newstest2014.tok.bpe.32000.en", + # this is intentional + "target_file": data_root+"newstest2014.tok.bpe.32000.en", + "delimiter": " ", + "shuffle": False, + "repeat": False, + "max_length": 512, + }, +} diff --git a/open_seq2seq/decoders/rnn_decoders.py b/open_seq2seq/decoders/rnn_decoders.py index 6ad093a3e..25c892287 100644 --- a/open_seq2seq/decoders/rnn_decoders.py +++ b/open_seq2seq/decoders/rnn_decoders.py @@ -279,7 +279,8 @@ def _decode(self, input_dict): output_time_major=time_major, ) - return {'logits': final_outputs.rnn_output, + return {'logits': final_outputs.rnn_output if not time_major else + tf.transpose(final_outputs.rnn_output, perm=[1, 0, 2]), 'samples': [tf.argmax(final_outputs.rnn_output, axis=-1)], 'final_state': final_state, 'final_sequence_lengths': final_sequence_lengths} @@ -464,7 +465,8 @@ def _decode(self, input_dict): output_time_major=time_major, ) - return {'logits': final_outputs.predicted_ids[:, :, 0], + return {'logits': final_outputs.predicted_ids[:, :, 0] if not time_major else + tf.transpose(final_outputs.predicted_ids[:, :, 0], perm=[1, 0, 2]), 'samples': [final_outputs.predicted_ids[:, :, 0]], 'final_state': final_state, 'final_sequence_lengths': final_sequence_lengths} diff --git a/open_seq2seq/encoders/__init__.py b/open_seq2seq/encoders/__init__.py index 7d0a39923..e8a919a9f 100644 --- a/open_seq2seq/encoders/__init__.py +++ b/open_seq2seq/encoders/__init__.py @@ -6,7 +6,8 @@ from .encoder import Encoder from .rnn_encoders import UnidirectionalRNNEncoderWithEmbedding, \ BidirectionalRNNEncoderWithEmbedding, \ - GNMTLikeEncoderWithEmbedding + GNMTLikeEncoderWithEmbedding,\ + 
GNMTLikeEncoderWithEmbedding_cuDNN from .transformer_encoder import TransformerEncoder from .ds2_encoder import DeepSpeech2Encoder from .resnet_encoder import ResNetEncoder diff --git a/open_seq2seq/encoders/rnn_encoders.py b/open_seq2seq/encoders/rnn_encoders.py index ed7fe2d61..25838730c 100644 --- a/open_seq2seq/encoders/rnn_encoders.py +++ b/open_seq2seq/encoders/rnn_encoders.py @@ -9,7 +9,7 @@ from open_seq2seq.parts.rnns.utils import single_cell from .encoder import Encoder - +from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops class UnidirectionalRNNEncoderWithEmbedding(Encoder): """ @@ -435,3 +435,151 @@ def src_emb_size(self): @property def enc_emb_w(self): return self._enc_emb_w + +class GNMTLikeEncoderWithEmbedding_cuDNN(Encoder): + """ + Encoder similar to the one used in + GNMT model: https://arxiv.org/abs/1609.08144. + Must have at least 2 layers. Uses cuDNN RNN blocks for efficiency + """ + + @staticmethod + def get_required_params(): + return dict(Encoder.get_required_params(), **{ + 'src_vocab_size': int, + 'src_emb_size': int, + 'encoder_cell_units': int, + 'encoder_cell_type': ['lstm', 'gru'], + 'encoder_layers': int, + #'core_cell': None, + #'core_cell_params': dict, + }) + + @staticmethod + def get_optional_params(): + return dict(Encoder.get_optional_params(), **{ + 'encoder_dp_output_keep_prob': float, + }) + + def __init__(self, params, model, + name="gnmt_encoder_with_emb_cudnn", mode='train'): + """ + Encodes data into representation + :param params: a Python dictionary. + Must define: + * src_inputs - a Tensor of shape [batch_size, time] or [time, batch_size] + (depending on time_major param) + * src_lengths - a Tensor of shape [batch_size] + :return: a Python dictionary with: + * encoder_outputs - a Tensor of shape + [batch_size, time, representation_dim] + or [time, batch_size, representation_dim] + * encoder_state - a Tensor of shape [batch_size, dim] + * src_lengths - (copy ref from input) a Tensor of shape [batch_size] + """ + super(GNMTLikeEncoderWithEmbedding_cuDNN, self).__init__( + params, model, name=name, mode=mode, + ) + + self._src_vocab_size = self.params['src_vocab_size'] + self._src_emb_size = self.params['src_emb_size'] + + def _encode(self, input_dict): + source_sequence = input_dict['source_tensors'][0] + source_length = input_dict['source_tensors'][1] + self._enc_emb_w = tf.get_variable( + name="EncoderEmbeddingMatrix", + shape=[self._src_vocab_size, self._src_emb_size], + dtype=tf.float32 + ) + + if self.params['encoder_layers'] < 2: + raise ValueError("GNMT encoder must have at least 2 layers") + + if self._mode == "train": + dp_output_keep_prob = self.params['encoder_dp_output_keep_prob'] + else: + dp_output_keep_prob = 1.0 + + # source_sequence is of [batch, time] shape + embedded_inputs = tf.cast(tf.nn.embedding_lookup( + self.enc_emb_w, + tf.transpose(source_sequence), # cudnn wants [time, batch, ...] 
+ ), self.params['dtype']) + + with tf.variable_scope("Bi_Directional_Layer"): + direction = cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION + if self.params['encoder_cell_type'] == "gru": + bidirectional_block = tf.contrib.cudnn_rnn.CudnnGRU( + num_layers=1, + num_units=self.params['encoder_cell_units'], + direction=direction, + dropout=0.0, + dtype=self.params['dtype'], + name="cudnn_gru_bidi", + ) + elif self.params['encoder_cell_type'] == "lstm": + bidirectional_block = tf.contrib.cudnn_rnn.CudnnLSTM( + num_layers=1, + num_units=self.params['encoder_cell_units'], + direction=direction, + dropout=0.0, + dtype=self.params['dtype'], + name="cudnn_lstm_bidi", + ) + else: + raise ValueError( + "{} is not a valid encoder_cell_type for cudnn_rnn layers" + .format(self.params['encoder_cell_type']) + ) + bidi_output, bidi_state = bidirectional_block(embedded_inputs) + + with tf.variable_scope("Uni_Directional_Layer"): + direction = cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION + layer_input = bidi_output + for ind in range(self.params['encoder_layers'] - 1): + with tf.variable_scope("uni_layer_{}".format(ind)): + if self.params['encoder_cell_type'] == "gru": + unidirectional_block = tf.contrib.cudnn_rnn.CudnnGRU( + num_layers=1, + num_units=self.params['encoder_cell_units'], + direction=direction, + dropout=1.0 - dp_output_keep_prob, + dtype=self.params['dtype'], + name="cudnn_gru_uni_{}".format(ind), + ) + elif self.params['encoder_cell_type'] == "lstm": + unidirectional_block = tf.contrib.cudnn_rnn.CudnnLSTM( + num_layers=1, + num_units=self.params['encoder_cell_units'], + direction=direction, + dropout=1.0 - dp_output_keep_prob, + dtype=self.params['dtype'], + name="cudnn_lstm_uni_{}".format(ind), + ) + layer_output, encoder_state = unidirectional_block( + layer_input) + if ind > 0: # add residual connection + layer_output = layer_input + layer_output + layer_input = layer_output + + return {'outputs': tf.transpose(layer_input, perm=[1, 0, 2]), + 'state': None, + 'src_lengths': source_length, + 'encoder_input': source_sequence} + + + @property + def src_vocab_size(self): + return self._src_vocab_size + + @property + def src_emb_size(self): + return self._src_emb_size + + @property + def enc_emb_w(self): + return self._enc_emb_w + + + From 0087cadc21b723f49bff2c56416a2412e4db11e7 Mon Sep 17 00:00:00 2001 From: Kipok Date: Fri, 15 Jun 2018 11:06:53 -0700 Subject: [PATCH 066/102] Make iter_size work with IndexedSlices --- open_seq2seq/optimizers/optimizers.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/open_seq2seq/optimizers/optimizers.py b/open_seq2seq/optimizers/optimizers.py index 0af6dbdac..9f10fdc61 100644 --- a/open_seq2seq/optimizers/optimizers.py +++ b/open_seq2seq/optimizers/optimizers.py @@ -191,12 +191,18 @@ class should be sub-class of `tf.Optimizer` that implements grad_accum = tf.Variable( initial_value=tf.zeros_like(var), name=grad.name.split(":")[0] + "_accum", - expected_shape=grad.shape, + expected_shape=var.shape, dtype=grad.dtype, trainable=False, - validate_shape=bool(grad.get_shape()) + validate_shape=bool(var.get_shape()) ) - accum_ops.append(tf.assign(grad_accum, grad_accum + grad / iter_size)) + if isinstance(grad, tf.IndexedSlices): + add_grads = tf.scatter_nd_add(grad_accum, grad.indices, + grad.values / iter_size) + else: + add_grads = grad_accum + grad / iter_size + + accum_ops.append(tf.assign(grad_accum, add_grads)) grads_and_vars_accum.append((grad_accum, var)) accum_op = tf.group(accum_ops) From 8c2fb4635d07cc5ab7b3fc38ba1b971ab34d6dcd Mon Sep 17 00:00:00
2001 From: Oleksii Kuchaiev Date: Fri, 15 Jun 2018 11:11:30 -0700 Subject: [PATCH 067/102] Update optimizers.py --- open_seq2seq/optimizers/optimizers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/open_seq2seq/optimizers/optimizers.py b/open_seq2seq/optimizers/optimizers.py index 9f10fdc61..c9045b14f 100644 --- a/open_seq2seq/optimizers/optimizers.py +++ b/open_seq2seq/optimizers/optimizers.py @@ -192,7 +192,7 @@ class should be sub-class of `tf.Optimizer` that implements initial_value=tf.zeros_like(var), name=grad.name.split(":")[0] + "_accum", expected_shape=var.shape, - dtype=grad.dtype, + dtype=var.dtype, trainable=False, validate_shape=bool(var.get_shape()) ) From f296a8dd88f09a093fb5f2e4605939282af9c0ff Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Fri, 15 Jun 2018 11:38:30 -0700 Subject: [PATCH 068/102] bugfix --- open_seq2seq/optimizers/optimizers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/open_seq2seq/optimizers/optimizers.py b/open_seq2seq/optimizers/optimizers.py index c9045b14f..9f10fdc61 100644 --- a/open_seq2seq/optimizers/optimizers.py +++ b/open_seq2seq/optimizers/optimizers.py @@ -192,7 +192,7 @@ class should be sub-class of `tf.Optimizer` that implements initial_value=tf.zeros_like(var), name=grad.name.split(":")[0] + "_accum", expected_shape=var.shape, - dtype=var.dtype, + dtype=grad.dtype, trainable=False, validate_shape=bool(var.get_shape()) ) From 4303f64a28d8e2bd4b1a69ae61e3a062338102e4 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Fri, 15 Jun 2018 12:36:02 -0700 Subject: [PATCH 069/102] Update optimizers.py --- open_seq2seq/optimizers/optimizers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/open_seq2seq/optimizers/optimizers.py b/open_seq2seq/optimizers/optimizers.py index c9045b14f..9f10fdc61 100644 --- a/open_seq2seq/optimizers/optimizers.py +++ b/open_seq2seq/optimizers/optimizers.py @@ -192,7 +192,7 @@ class should be sub-class of `tf.Optimizer` that implements initial_value=tf.zeros_like(var), name=grad.name.split(":")[0] + "_accum", expected_shape=var.shape, - dtype=var.dtype, + dtype=grad.dtype, trainable=False, validate_shape=bool(var.get_shape()) ) From 0f50fc21b533ead9810a4a465561342990015d12 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Fri, 15 Jun 2018 13:39:09 -0700 Subject: [PATCH 070/102] more configs --- .../en-de-gnmt-like-8GPUs-horovod.py | 2 +- ...-de-gnmt-like-8GPUs-horovod_iter_size10.py | 2 +- ...de-gnmt-like-8GPUs-horovod_iter_size100.py | 163 ++++++++++++++++++ ...n-de-gnmt-like-8GPUs-horovod_iter_size5.py | 163 ++++++++++++++++++ example_configs/text2text/nmt-reversal-RR.py | 8 +- 5 files changed, 333 insertions(+), 5 deletions(-) create mode 100644 example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size100.py create mode 100644 example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size5.py diff --git a/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py b/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py index 885a0e661..d892ae8e9 100644 --- a/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py +++ b/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py @@ -130,7 +130,7 @@ "decoder_params": { "beam_width": 10, "length_penalty": 1.0, - "core_cell": tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell, + "core_cell": tf.nn.rnn_cell.LSTMCell, "core_cell_params": { "num_units": 1024, }, diff --git a/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size10.py 
b/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size10.py index 373de0af4..2d72678f6 100644 --- a/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size10.py +++ b/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size10.py @@ -131,7 +131,7 @@ "decoder_params": { "beam_width": 10, "length_penalty": 1.0, - "core_cell": tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell, + "core_cell": tf.nn.rnn_cell.LSTMCell, "core_cell_params": { "num_units": 1024, }, diff --git a/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size100.py b/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size100.py new file mode 100644 index 000000000..6cbd3d66c --- /dev/null +++ b/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size100.py @@ -0,0 +1,163 @@ +from __future__ import absolute_import, division, print_function +import tensorflow as tf + +from open_seq2seq.models import Text2Text +from open_seq2seq.encoders import GNMTLikeEncoderWithEmbedding_cuDNN +from open_seq2seq.decoders import RNNDecoderWithAttention, \ + BeamSearchRNNDecoderWithAttention +from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer +from open_seq2seq.losses import BasicSequenceLoss +from open_seq2seq.data.text2text.text2text import SpecialTextTokens +from open_seq2seq.optimizers.lr_policies import exp_decay + +data_root = "/data/wmt16_s2s/" + +base_model = Text2Text + +base_params = { + "use_horovod": True, + "iter_size": 100, + "num_gpus": 1, + "max_steps": 340, + "batch_size_per_gpu": 128, + "save_summaries_steps": 50, + "print_loss_steps": 48, + "print_samples_steps": 48, + "eval_steps": 1000, + "save_checkpoint_steps": 2001, + "logdir": "GNMT-MP-cuDNN-enc", + "optimizer": "Adam", + "optimizer_params": {}, + # luong10 decay scheme + "lr_policy": exp_decay, + "lr_policy_params": { + "learning_rate": 0.001, + "begin_decay_at": 170, + "decay_steps": 17, + "decay_rate": 0.5, + "use_staircase_decay": True, + "min_lr": 0.0000005, + }, + #"summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + #"max_grad_norm": 32768.0, + #"dtype": tf.float32, + "dtype": "mixed", + "loss_scaling": "Backoff", + "encoder": GNMTLikeEncoderWithEmbedding_cuDNN, + "encoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "encoder_cell_type": "lstm", + "encoder_cell_units": 1024, + "encoder_layers": 7, + "encoder_dp_output_keep_prob": 1.0, + "src_emb_size": 1024, + }, + + "decoder": RNNDecoderWithAttention, + "decoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 1024, + "forget_bias": 1.0, + }, + + "decoder_layers": 8, + "decoder_dp_input_keep_prob": 0.8, + "decoder_dp_output_keep_prob": 1.0, + "decoder_use_skip_connections": True, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + + "tgt_emb_size": 1024, + "attention_type": "gnmt_v2", + "attention_layer_size": 1024, + }, + + "loss": BasicSequenceLoss, + "loss_params": { + "offset_target_by_one": True, + "average_across_timestep": True, + "do_mask": True + } +} + +train_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": True, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + 
"source_file": data_root+"train.tok.clean.bpe.32000.en", + "target_file": data_root+"train.tok.clean.bpe.32000.de", + "delimiter": " ", + "shuffle": True, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "max_length": 50, + }, +} +eval_params = { + "batch_size_per_gpu": 16, + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": True, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"newstest2013.tok.bpe.32000.en", + "target_file": data_root+"newstest2013.tok.bpe.32000.de", + "delimiter": " ", + "shuffle": False, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "max_length": 32, + }, +} + +infer_params = { + "batch_size_per_gpu": 1, + "decoder": BeamSearchRNNDecoderWithAttention, + "decoder_params": { + "beam_width": 10, + "length_penalty": 1.0, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 1024, + }, + "decoder_layers": 8, + "decoder_dp_input_keep_prob": 0.8, + "decoder_dp_output_keep_prob": 1.0, + "decoder_use_skip_connections": True, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + "tgt_emb_size": 1024, + "attention_type": "gnmt_v2", + "attention_layer_size": 1024, + }, + + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": True, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"newstest2014.tok.bpe.32000.en", + # this is intentional + "target_file": data_root+"newstest2014.tok.bpe.32000.en", + "delimiter": " ", + "shuffle": False, + "repeat": False, + "max_length": 512, + }, +} diff --git a/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size5.py b/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size5.py new file mode 100644 index 000000000..3608fe997 --- /dev/null +++ b/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size5.py @@ -0,0 +1,163 @@ +from __future__ import absolute_import, division, print_function +import tensorflow as tf + +from open_seq2seq.models import Text2Text +from open_seq2seq.encoders import GNMTLikeEncoderWithEmbedding_cuDNN +from open_seq2seq.decoders import RNNDecoderWithAttention, \ + BeamSearchRNNDecoderWithAttention +from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer +from open_seq2seq.losses import BasicSequenceLoss +from open_seq2seq.data.text2text.text2text import SpecialTextTokens +from open_seq2seq.optimizers.lr_policies import exp_decay + +data_root = "/data/wmt16_s2s/" + +base_model = Text2Text + +base_params = { + "use_horovod": True, + "iter_size": 5, + "num_gpus": 1, + "max_steps": 6800, + "batch_size_per_gpu": 128, + "save_summaries_steps": 50, + "print_loss_steps": 48, + "print_samples_steps": 48, + "eval_steps": 1000, + "save_checkpoint_steps": 2001, + "logdir": "GNMT-MP-cuDNN-enc", + "optimizer": "Adam", + "optimizer_params": {}, + # luong10 decay scheme + "lr_policy": exp_decay, + "lr_policy_params": { + "learning_rate": 0.001, + "begin_decay_at": 3400, + "decay_steps": 340, + "decay_rate": 0.5, + "use_staircase_decay": True, + "min_lr": 0.0000005, + }, + #"summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + #"max_grad_norm": 32768.0, + #"dtype": tf.float32, + "dtype": "mixed", + "loss_scaling": "Backoff", 
+ "encoder": GNMTLikeEncoderWithEmbedding_cuDNN, + "encoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "encoder_cell_type": "lstm", + "encoder_cell_units": 1024, + "encoder_layers": 7, + "encoder_dp_output_keep_prob": 1.0, + "src_emb_size": 1024, + }, + + "decoder": RNNDecoderWithAttention, + "decoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 1024, + "forget_bias": 1.0, + }, + + "decoder_layers": 8, + "decoder_dp_input_keep_prob": 0.8, + "decoder_dp_output_keep_prob": 1.0, + "decoder_use_skip_connections": True, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + + "tgt_emb_size": 1024, + "attention_type": "gnmt_v2", + "attention_layer_size": 1024, + }, + + "loss": BasicSequenceLoss, + "loss_params": { + "offset_target_by_one": True, + "average_across_timestep": True, + "do_mask": True + } +} + +train_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": True, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"train.tok.clean.bpe.32000.en", + "target_file": data_root+"train.tok.clean.bpe.32000.de", + "delimiter": " ", + "shuffle": True, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "max_length": 50, + }, +} +eval_params = { + "batch_size_per_gpu": 16, + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": True, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"newstest2013.tok.bpe.32000.en", + "target_file": data_root+"newstest2013.tok.bpe.32000.de", + "delimiter": " ", + "shuffle": False, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "max_length": 32, + }, +} + +infer_params = { + "batch_size_per_gpu": 1, + "decoder": BeamSearchRNNDecoderWithAttention, + "decoder_params": { + "beam_width": 10, + "length_penalty": 1.0, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 1024, + }, + "decoder_layers": 8, + "decoder_dp_input_keep_prob": 0.8, + "decoder_dp_output_keep_prob": 1.0, + "decoder_use_skip_connections": True, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + "tgt_emb_size": 1024, + "attention_type": "gnmt_v2", + "attention_layer_size": 1024, + }, + + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": True, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"newstest2014.tok.bpe.32000.en", + # this is intentional + "target_file": data_root+"newstest2014.tok.bpe.32000.en", + "delimiter": " ", + "shuffle": False, + "repeat": False, + "max_length": 512, + }, +} diff --git a/example_configs/text2text/nmt-reversal-RR.py b/example_configs/text2text/nmt-reversal-RR.py index ba0ec1ded..0babbc646 100644 --- a/example_configs/text2text/nmt-reversal-RR.py +++ b/example_configs/text2text/nmt-reversal-RR.py @@ -18,6 +18,7 @@ base_params = { "use_horovod": False, + #"iter_size": 10, # set this to number of available GPUs "num_gpus": 1, "batch_size_per_gpu": 64, @@ -36,13 +37,14 @@ 'learning_rate': 0.001 
}, "max_grad_norm": 3.0, - "dtype": tf.float32, + #"dtype": tf.float32, + "dtype": "mixed", "encoder": BidirectionalRNNEncoderWithEmbedding, "encoder_params": { #"encoder_cell_type": "lstm", #"encoder_cell_units": 128, - "core_cell": tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell,#tf.nn.rnn_cell.LSTMCell, + "core_cell": tf.nn.rnn_cell.LSTMCell, "core_cell_params": { "num_units": 128, #"forget_bias": 1.0, @@ -58,7 +60,7 @@ "decoder_params": { #"decoder_cell_type": "lstm", #"decoder_cell_units": 128, - "core_cell": tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell, + "core_cell": tf.nn.rnn_cell.LSTMCell, # tf.nn.rnn_cell.LSTMCell, "core_cell_params": { "num_units": 128, From 03f8b8d7a3a5d10eb1e06c6ec7a0986bf5d87688 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Fri, 15 Jun 2018 16:33:31 -0700 Subject: [PATCH 071/102] addressing review comments --- open_seq2seq/decoders/rnn_decoders.py | 6 +- open_seq2seq/parts/rnns/utils.py | 134 -------------------------- 2 files changed, 2 insertions(+), 138 deletions(-) diff --git a/open_seq2seq/decoders/rnn_decoders.py b/open_seq2seq/decoders/rnn_decoders.py index 25c892287..522ef28a9 100644 --- a/open_seq2seq/decoders/rnn_decoders.py +++ b/open_seq2seq/decoders/rnn_decoders.py @@ -30,10 +30,7 @@ def get_required_params(): 'tgt_emb_size': int, 'attention_layer_size': int, 'attention_type': ['bahdanau', 'luong', 'gnmt', 'gnmt_v2'], - #'decoder_cell_units': int, - #'decoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'], 'core_cell': None, - 'core_cell_params': dict, 'decoder_layers': int, 'decoder_use_skip_connections': bool, 'batch_size': int, @@ -42,6 +39,7 @@ def get_required_params(): @staticmethod def get_optional_params(): return dict(Decoder.get_optional_params(), **{ + 'core_cell_params': dict, 'bahdanau_normalize': bool, 'luong_scale': bool, 'decoder_dp_input_keep_prob': float, @@ -67,7 +65,7 @@ def __init__(self, params, model, * **END_SYMBOL** (int) --- END symbol id, must be the same as used in data layer. * **tgt_emb_size** (int) --- embedding size to use. - * **core_cell_params** (int) - parameters for RNN class + * **core_cell_params** (dict) - parameters for RNN class * **core_cell** (string) - RNN class. * **decoder_dp_input_keep_prob** (float) - dropout input keep probability. * **decoder_dp_output_keep_prob** (float) - dropout output keep probability. diff --git a/open_seq2seq/parts/rnns/utils.py b/open_seq2seq/parts/rnns/utils.py index d0599b184..53e011e64 100644 --- a/open_seq2seq/parts/rnns/utils.py +++ b/open_seq2seq/parts/rnns/utils.py @@ -37,137 +37,3 @@ def single_cell(cell_class, cell = DropoutWrapper(cell, input_keep_prob=dp_input_keep_prob, output_keep_prob=dp_output_keep_prob) return cell - - - -# def create_rnn_cell(cell_type, -# cell_params, -# num_layers=1, -# dp_input_keep_prob=1.0, -# dp_output_keep_prob=1.0, -# residual_connections=False, -# wrap_to_multi_rnn=True): -# """ -# TODO: MOVE THIS properly to utils. 
Write doc -# :param cell_type: -# :param cell_params: -# :param num_layers: -# :param dp_input_keep_prob: -# :param dp_output_keep_prob: -# :param residual_connections: -# :return: -# """ -# def single_cell(cell_params): -# # TODO: This method is ugly - redo -# size = cell_params["num_units"] -# proj_size = None if "proj_size" not in cell_params else cell_params["proj_size"] -# -# if cell_type == "lstm": -# if not residual_connections: -# if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: -# return tf.nn.rnn_cell.LSTMCell(num_units=size, -# num_proj=proj_size, -# forget_bias=1.0) -# else: -# return DropoutWrapper(tf.nn.rnn_cell.LSTMCell(num_units=size, -# num_proj=proj_size, -# forget_bias=1.0), -# input_keep_prob=dp_input_keep_prob, -# output_keep_prob=dp_output_keep_prob) -# else: # residual connection required -# if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: -# return ResidualWrapper(tf.nn.rnn_cell.LSTMCell(num_units=size, -# num_proj=proj_size, -# forget_bias=1.0)) -# else: -# return ResidualWrapper(DropoutWrapper( -# tf.nn.rnn_cell.LSTMCell( -# num_units=size, -# num_proj=proj_size, -# forget_bias=1.0, -# ), -# input_keep_prob=dp_input_keep_prob, -# output_keep_prob=dp_output_keep_prob, -# )) -# elif cell_type == "gru": -# if not residual_connections: -# if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: -# return tf.nn.rnn_cell.GRUCell(num_units=size) -# else: -# return DropoutWrapper( -# tf.nn.rnn_cell.GRUCell(num_units=size), -# input_keep_prob=dp_input_keep_prob, -# output_keep_prob=dp_output_keep_prob, -# ) -# else: # residual connection required -# if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: -# return ResidualWrapper(tf.nn.rnn_cell.GRUCell(num_units=size)) -# else: -# return ResidualWrapper(DropoutWrapper( -# tf.nn.rnn_cell.GRUCell(num_units=size), -# input_keep_prob=dp_input_keep_prob, -# output_keep_prob=dp_output_keep_prob), -# ) -# elif cell_type == "glstm": -# num_groups = cell_params["num_groups"] -# if not residual_connections: -# if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: -# return GLSTMCell(num_units=size, -# number_of_groups=num_groups, -# num_proj=proj_size, -# forget_bias=1.0) -# else: -# return DropoutWrapper(GLSTMCell(num_units=size, -# number_of_groups=num_groups, -# num_proj=proj_size, -# forget_bias=1.0), -# input_keep_prob=dp_input_keep_prob, -# output_keep_prob=dp_output_keep_prob) -# else: # residual connection required -# if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: -# return ResidualWrapper(GLSTMCell(num_units=size, -# number_of_groups=num_groups, -# num_proj=proj_size, -# forget_bias=1.0)) -# else: -# return ResidualWrapper(DropoutWrapper( -# GLSTMCell( -# num_units=size, -# number_of_groups=num_groups, -# num_proj=proj_size, -# forget_bias=1.0, -# ), -# input_keep_prob=dp_input_keep_prob, -# output_keep_prob=dp_output_keep_prob, -# )) -# elif cell_type == "slstm": -# if not residual_connections: -# if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: -# return BasicSLSTMCell(num_units=size) -# else: -# return DropoutWrapper(BasicSLSTMCell(num_units=size), -# input_keep_prob=dp_input_keep_prob, -# output_keep_prob=dp_output_keep_prob -# ) -# else: # residual connection required -# if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: -# return ResidualWrapper(BasicSLSTMCell(num_units=size)) -# else: -# return ResidualWrapper(DropoutWrapper( -# BasicSLSTMCell(num_units=size), -# input_keep_prob=dp_input_keep_prob, -# output_keep_prob=dp_output_keep_prob, -# 
)) -# else: -# raise ValueError("Unknown RNN cell class: {}".format(cell_type)) -# -# if num_layers > 1: -# if wrap_to_multi_rnn: -# return MultiRNNCell([single_cell(cell_params) for _ in range(num_layers)]) -# else: -# cells = [] # for GNMT-like attention in decoder -# for i in range(num_layers): -# cells.append(single_cell(cell_params)) -# return cells -# else: -# return single_cell(cell_params) From 2e0e72d329d3072582e90e64611cb28addec3f85 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Mon, 18 Jun 2018 16:22:24 -0700 Subject: [PATCH 072/102] add nmt small config --- example_configs/text2text/en-de-nmt-small.py | 26 ++++++++++++-------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/example_configs/text2text/en-de-nmt-small.py b/example_configs/text2text/en-de-nmt-small.py index a8e1a43b0..fd3a34691 100644 --- a/example_configs/text2text/en-de-nmt-small.py +++ b/example_configs/text2text/en-de-nmt-small.py @@ -10,7 +10,7 @@ from open_seq2seq.data.text2text.text2text import SpecialTextTokens from open_seq2seq.optimizers.lr_policies import fixed_lr -data_root = "[REPLACE THIS TO THE PATH WITH YOUR WMT DATA]" +data_root = "/data/wmt16_s2s/" # This model should run fine on single GPU such as 1080ti or better base_model = Text2Text @@ -18,12 +18,12 @@ base_params = { "use_horovod": False, "num_gpus": 1, - "max_steps": 160082, + "max_steps": 100000, "batch_size_per_gpu": 128, "save_summaries_steps": 50, "print_loss_steps": 48, "print_samples_steps": 48, - "eval_steps": 1000, + "eval_steps": 3000, "save_checkpoint_steps": 2001, "logdir": "nmt-small-en-de", "optimizer": "Adam", @@ -32,9 +32,9 @@ "lr_policy_params": { "learning_rate": 0.001, }, - "larc_params": { - "larc_eta": 0.001, - }, + #"larc_params": { + # "larc_eta": 0.001, + #}, "dtype": tf.float32, # "dtype": "mixed", # "loss_scaling": "Backoff", @@ -42,8 +42,11 @@ "encoder": BidirectionalRNNEncoderWithEmbedding, "encoder_params": { "initializer": tf.glorot_uniform_initializer, - "encoder_cell_type": "lstm", - "encoder_cell_units": 512, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 512, + "forget_bias": 1.0, + }, "encoder_layers": 2, "encoder_dp_input_keep_prob": 0.8, "encoder_dp_output_keep_prob": 1.0, @@ -55,8 +58,11 @@ "decoder": RNNDecoderWithAttention, "decoder_params": { "initializer": tf.glorot_uniform_initializer, - "decoder_cell_type": "lstm", - "decoder_cell_units": 512, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 512, + "forget_bias": 1.0, + }, "decoder_layers": 2, "decoder_dp_input_keep_prob": 0.8, "decoder_dp_output_keep_prob": 1.0, From 319470a2257366b642e132bf667bfbeb939f4aa6 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Tue, 19 Jun 2018 15:42:18 -0700 Subject: [PATCH 073/102] Add to_8_padding to transformer data layer, embedding and loss --- open_seq2seq/data/text2text/t2t.py | 31 ++++++++++++++---- open_seq2seq/data/text2text/text2text.py | 32 ++----------------- open_seq2seq/encoders/transformer_encoder.py | 2 +- open_seq2seq/losses/sequence_loss.py | 8 ++++- .../parts/transformer/embedding_layer.py | 11 ++++--- 5 files changed, 41 insertions(+), 43 deletions(-) diff --git a/open_seq2seq/data/text2text/t2t.py b/open_seq2seq/data/text2text/t2t.py index 22330bc04..8a2632c45 100644 --- a/open_seq2seq/data/text2text/t2t.py +++ b/open_seq2seq/data/text2text/t2t.py @@ -73,7 +73,7 @@ def _load_records(filename): return tf.data.TFRecordDataset(filename, buffer_size=_READ_RECORD_BUFFER) -def _parse_example(serialized_example): +def 
_parse_example(serialized_example, pad_2_eight=False): """Return inputs and targets Tensors from a serialized tf.Example.""" data_fields = { "inputs": tf.VarLenFeature(tf.int64), @@ -82,6 +82,17 @@ def _parse_example(serialized_example): parsed = tf.parse_single_example(serialized_example, data_fields) inputs = tf.sparse_tensor_to_dense(parsed["inputs"]) targets = tf.sparse_tensor_to_dense(parsed["targets"]) + + if pad_2_eight: + inputs = tf.cond(tf.equal(tf.shape(inputs)[0] % 8, 0), + true_fn=lambda: inputs, + false_fn=lambda: tf.pad(inputs, + paddings=[[0, 8 - tf.shape(inputs)[0] % 8]])) + targets = tf.cond(tf.equal(tf.shape(targets)[0] % 8, 0), + true_fn=lambda: targets, + false_fn=lambda: tf.pad(targets, + paddings=[[0, 8 - tf.shape(targets)[0] % 8]])) + return inputs, targets @@ -129,7 +140,7 @@ def _create_min_max_boundaries( return buckets_min, buckets_max -def _batch_examples(dataset, batch_size, max_length): +def _batch_examples(dataset, batch_size, max_length, pad_2_eight=True): """Group examples by similar lengths, and return batched dataset. Each batch of similar-length examples are padded to the same length, and may @@ -155,7 +166,12 @@ def _batch_examples(dataset, batch_size, max_length): # Create list of batch sizes for each bucket_id, so that # bucket_batch_size[bucket_id] * buckets_max[bucket_id] <= batch_size - bucket_batch_sizes = [batch_size // x for x in buckets_max] + if pad_2_eight: # pad to 8 for HMMA + bucket_batch_sizes = [ + batch_size // x if batch_size // x % 8 == 0 else batch_size // x + ( + 8 - batch_size // x % 8) for x in buckets_max] + else: + bucket_batch_sizes = [batch_size // x for x in buckets_max] # bucket_id will be a tensor, so convert this list to a tensor as well. bucket_batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64) @@ -177,7 +193,6 @@ def window_size_fn(bucket_id): def batching_fn(bucket_id, grouped_dataset): """Batch and add padding to a dataset of elements with similar lengths.""" bucket_batch_size = window_size_fn(bucket_id) - # Batch the dataset and add padding so that all input sequences in the # examples have the same length, and all target sequences have the same # lengths as well. Resulting lengths of inputs and targets can differ. @@ -192,7 +207,7 @@ def batching_fn(bucket_id, grouped_dataset): def _read_and_batch_from_files( file_pattern, batch_size, max_length, num_cpu_cores, shuffle, repeat, - num_workers, worker_id, batch_in_tokens): + num_workers, worker_id, batch_in_tokens, pad2eight=True): """Create dataset where each item is a dict of "inputs" and "targets". Args: @@ -209,6 +224,7 @@ def _read_and_batch_from_files( pairs. batching in tokens is more efficient as it reduces PADs. batching in sentences should be used in inference mode since order of sentences is important + pad2eight: if True, it will pad both dimensions to be divisible by 8 Returns: tf.data.Dataset object containing examples loaded from the files. @@ -229,7 +245,7 @@ def _read_and_batch_from_files( # Parse each tf.Example into a dictionary # TODO: Look into prefetch_input_elements for performance optimization. - dataset = dataset.map(_parse_example, + dataset = dataset.map(lambda x: _parse_example(x, pad_2_eight=pad2eight), num_parallel_calls=num_cpu_cores) # Remove examples where the input or target length exceeds the maximum length, @@ -237,7 +253,8 @@ def _read_and_batch_from_files( if batch_in_tokens: # Batch such that each batch has examples of similar length. 
- dataset = _batch_examples(dataset, batch_size, max_length) + dataset = _batch_examples(dataset, batch_size, max_length, + pad_2_eight=pad2eight) else: # Examples can have different lenghts dataset = dataset.padded_batch(batch_size, ([None], [None])) diff --git a/open_seq2seq/data/text2text/text2text.py b/open_seq2seq/data/text2text/text2text.py index 2c21056a8..1206d5be7 100644 --- a/open_seq2seq/data/text2text/text2text.py +++ b/open_seq2seq/data/text2text/text2text.py @@ -254,7 +254,7 @@ def get_optional_params(): 'repeat': int, 'num_cpu_cores': int, 'tgt_vocab_file': str, - 'm_padding': bool, + 'pad_data_to_eight': bool, 'batch_in_tokens': bool, }) @@ -303,38 +303,12 @@ def build_graph(self): repeat=self.params['repeat'], num_workers=self._num_workers, worker_id=self._worker_id, - batch_in_tokens=self.params.get('batch_in_tokens', True)) + batch_in_tokens=self.params.get('batch_in_tokens', True), + pad2eight=self.params.get('pad_data_to_eight', False)) self._iterator = self.batched_dataset.make_initializable_iterator() x, y = self.iterator.get_next() - if self.params.get('m_padding', False): - # MAGIC PADDING - x = tf.cond(tf.equal(tf.shape(x)[1] % 8, 0), - true_fn = lambda: x, - false_fn = lambda: tf.pad(x, - paddings=[[0, 0], - [0, 8 - tf.shape(x)[1] % 8]])) - - y = tf.cond(tf.equal(tf.shape(y)[1] % 8, 0), - true_fn = lambda: y, - false_fn = lambda: tf.pad(y, - paddings=[[0, 0], - [0, 8 - tf.shape(y)[1] % 8]])) - - x = tf.cond(tf.equal(tf.shape(x)[0] % 8, 0), - true_fn = lambda: x, - false_fn = lambda: tf.pad(x, - paddings=[[0, 8 - tf.shape(x)[0] % 8], - [0, 0]])) - - y = tf.cond(tf.equal(tf.shape(y)[0] % 8, 0), - true_fn=lambda: y, - false_fn=lambda: tf.pad(y, - paddings=[[0, 8 - tf.shape(y)[0] % 8], - [0, 0]])) - # ENDOF MAGIC PADDING - len_x = tf.count_nonzero(x, axis=1, dtype=tf.int32) len_y = tf.count_nonzero(y, axis=1, dtype=tf.int32) if self.params['mode'] == 'train' or self.params['mode'] == 'eval': diff --git a/open_seq2seq/encoders/transformer_encoder.py b/open_seq2seq/encoders/transformer_encoder.py index 342b992e4..cd2d6edbd 100644 --- a/open_seq2seq/encoders/transformer_encoder.py +++ b/open_seq2seq/encoders/transformer_encoder.py @@ -79,7 +79,7 @@ def _encode(self, input_dict): # prepare encoder graph self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights( self.params["src_vocab_size"], self.params["hidden_size"], - pad2eight=self.params.get('pad_embeddings_2_eight', False)) + pad_vocab_to_eight=self.params.get('pad_embeddings_2_eight', False)) for _ in range(self.params['encoder_layers']): # Create sublayers for each layer. 
diff --git a/open_seq2seq/losses/sequence_loss.py b/open_seq2seq/losses/sequence_loss.py index 718b053c2..f665a05c4 100644 --- a/open_seq2seq/losses/sequence_loss.py +++ b/open_seq2seq/losses/sequence_loss.py @@ -252,11 +252,17 @@ def get_optional_params(): 'batch_size': int, 'tgt_vocab_size': int, 'label_smoothing': float, + 'pad_embeddings_2_eight': bool, }) def __init__(self, params, model, name="padded_cross_entropy_with_smoothing"): super(PaddedCrossEntropyLossWithSmoothing, self).__init__(params, model, name) - self._tgt_vocab_size = self.params["tgt_vocab_size"] + if self.params.get('pad_embeddings_2_eight', False): + self._tgt_vocab_size = self.params["tgt_vocab_size"] if self.params[ + "tgt_vocab_size"] % 8 == 0 else \ + self.params["tgt_vocab_size"] + (8 - self.params["tgt_vocab_size"] % 8) + else: + self._tgt_vocab_size = self.params["tgt_vocab_size"] self._label_smoothing = self.params.get("label_smoothing", 0.0) def _compute_loss(self, input_dict): diff --git a/open_seq2seq/parts/transformer/embedding_layer.py b/open_seq2seq/parts/transformer/embedding_layer.py index a4fef6147..23f7c2177 100644 --- a/open_seq2seq/parts/transformer/embedding_layer.py +++ b/open_seq2seq/parts/transformer/embedding_layer.py @@ -26,14 +26,15 @@ class EmbeddingSharedWeights(tf.layers.Layer): """Calculates input embeddings and pre-softmax linear with shared weights.""" - def __init__(self, vocab_size, hidden_size, pad2eight=False): + def __init__(self, vocab_size, hidden_size, pad_vocab_to_eight=False): super(EmbeddingSharedWeights, self).__init__() - self.vocab_size = vocab_size + self.hidden_size = hidden_size padf = lambda x: x if x % 8 == 0 else x + 8 - x % 8 - if pad2eight: - self.hidden_size = padf(hidden_size) + if pad_vocab_to_eight: + self.vocab_size = padf(vocab_size) else: - self.hidden_size = hidden_size + self.vocab_size = vocab_size + def build(self, _): with tf.variable_scope("embedding_and_softmax", reuse=tf.AUTO_REUSE): From 9648eecff1be5cd8407835cb4ab101e8c51b7dbc Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Tue, 19 Jun 2018 15:53:36 -0700 Subject: [PATCH 074/102] move configs a little --- .../de-en/de-en-gnmt-like-8GPUs-horovod.py | 162 ++++++++++++++++++ .../{ => en-de}/en-de-gnmt-like-4GPUs.py | 0 .../en-de-gnmt-like-8GPUs-horovod.py | 8 +- ...-de-gnmt-like-8GPUs-horovod_iter_size10.py | 0 ...de-gnmt-like-8GPUs-horovod_iter_size100.py | 0 ...n-de-gnmt-like-8GPUs-horovod_iter_size5.py | 0 .../text2text/{ => en-de}/en-de-nmt-small.py | 2 +- .../{ => en-de}/transformer-base-test-mp.py | 0 .../{ => en-de}/transformer-base-test.py | 0 .../text2text/{ => en-de}/transformer-base.py | 0 10 files changed, 167 insertions(+), 5 deletions(-) create mode 100644 example_configs/text2text/de-en/de-en-gnmt-like-8GPUs-horovod.py rename example_configs/text2text/{ => en-de}/en-de-gnmt-like-4GPUs.py (100%) rename example_configs/text2text/{ => en-de}/en-de-gnmt-like-8GPUs-horovod.py (96%) rename example_configs/text2text/{ => en-de}/en-de-gnmt-like-8GPUs-horovod_iter_size10.py (100%) rename example_configs/text2text/{ => en-de}/en-de-gnmt-like-8GPUs-horovod_iter_size100.py (100%) rename example_configs/text2text/{ => en-de}/en-de-gnmt-like-8GPUs-horovod_iter_size5.py (100%) rename example_configs/text2text/{ => en-de}/en-de-nmt-small.py (99%) rename example_configs/text2text/{ => en-de}/transformer-base-test-mp.py (100%) rename example_configs/text2text/{ => en-de}/transformer-base-test.py (100%) rename example_configs/text2text/{ => en-de}/transformer-base.py (100%) diff --git 
a/example_configs/text2text/de-en/de-en-gnmt-like-8GPUs-horovod.py b/example_configs/text2text/de-en/de-en-gnmt-like-8GPUs-horovod.py new file mode 100644 index 000000000..b8902853b --- /dev/null +++ b/example_configs/text2text/de-en/de-en-gnmt-like-8GPUs-horovod.py @@ -0,0 +1,162 @@ +from __future__ import absolute_import, division, print_function +import tensorflow as tf + +from open_seq2seq.models import Text2Text +from open_seq2seq.encoders import GNMTLikeEncoderWithEmbedding_cuDNN +from open_seq2seq.decoders import RNNDecoderWithAttention, \ + BeamSearchRNNDecoderWithAttention +from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer +from open_seq2seq.losses import BasicSequenceLoss +from open_seq2seq.data.text2text.text2text import SpecialTextTokens +from open_seq2seq.optimizers.lr_policies import exp_decay + +data_root = "/data/wmt16_s2s/" + +base_model = Text2Text + +base_params = { + "use_horovod": True, + "num_gpus": 1, # each Horovod process will occupy single GPU + "max_steps": 34000, + "batch_size_per_gpu": 128, + "save_summaries_steps": 50, + "print_loss_steps": 48, + "print_samples_steps": 48, + "eval_steps": 1000, + "save_checkpoint_steps": 2001, + "logdir": "GNMT-like-de-en", + "optimizer": "Adam", + "optimizer_params": {}, + # luong10 decay scheme + "lr_policy": exp_decay, + "lr_policy_params": { + "learning_rate": 0.0008, + "begin_decay_at": 17000, + "decay_steps": 1700, + "decay_rate": 0.5, + "use_staircase_decay": True, + "min_lr": 0.0000005, + }, + #"summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + #"max_grad_norm": 32768.0, + #"dtype": tf.float32, + "dtype": "mixed", + "loss_scaling": "Backoff", + "encoder": GNMTLikeEncoderWithEmbedding_cuDNN, + "encoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "encoder_cell_type": "lstm", + "encoder_cell_units": 1024, + "encoder_layers": 7, + "encoder_dp_output_keep_prob": 1.0, + "src_emb_size": 1024, + }, + + "decoder": RNNDecoderWithAttention, + "decoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 1024, + "forget_bias": 1.0, + }, + + "decoder_layers": 8, + "decoder_dp_input_keep_prob": 0.8, + "decoder_dp_output_keep_prob": 1.0, + "decoder_use_skip_connections": True, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + + "tgt_emb_size": 1024, + "attention_type": "gnmt_v2", + "attention_layer_size": 1024, + }, + + "loss": BasicSequenceLoss, + "loss_params": { + "offset_target_by_one": True, + "average_across_timestep": True, + "do_mask": True + } +} + +train_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": True, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"train.tok.clean.bpe.32000.de", + "target_file": data_root+"train.tok.clean.bpe.32000.en", + "delimiter": " ", + "shuffle": True, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "max_length": 50, + }, +} +eval_params = { + "batch_size_per_gpu": 16, + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": True, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", 
+ "source_file": data_root+"newstest2013.tok.bpe.32000.de", + "target_file": data_root+"newstest2013.tok.bpe.32000.en", + "delimiter": " ", + "shuffle": False, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "max_length": 32, + }, +} + +infer_params = { + "batch_size_per_gpu": 1, + "decoder": BeamSearchRNNDecoderWithAttention, + "decoder_params": { + "beam_width": 10, + "length_penalty": 1.0, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 1024, + }, + "decoder_layers": 8, + "decoder_dp_input_keep_prob": 0.8, + "decoder_dp_output_keep_prob": 1.0, + "decoder_use_skip_connections": True, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + "tgt_emb_size": 1024, + "attention_type": "gnmt_v2", + "attention_layer_size": 1024, + }, + + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": False, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"newstest2014.tok.bpe.32000.de", + # this is intentional + "target_file": data_root+"newstest2014.tok.bpe.32000.de", + "delimiter": " ", + "shuffle": False, + "repeat": False, + "max_length": 512, + }, +} diff --git a/example_configs/text2text/en-de-gnmt-like-4GPUs.py b/example_configs/text2text/en-de/en-de-gnmt-like-4GPUs.py similarity index 100% rename from example_configs/text2text/en-de-gnmt-like-4GPUs.py rename to example_configs/text2text/en-de/en-de-gnmt-like-4GPUs.py diff --git a/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py b/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod.py similarity index 96% rename from example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py rename to example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod.py index d892ae8e9..0a7bc9513 100644 --- a/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod.py +++ b/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod.py @@ -16,7 +16,7 @@ base_params = { "use_horovod": True, - "num_gpus": 1, + "num_gpus": 1, # each Horovod process will occupy single GPU "max_steps": 34000, "batch_size_per_gpu": 128, "save_summaries_steps": 50, @@ -24,13 +24,13 @@ "print_samples_steps": 48, "eval_steps": 1000, "save_checkpoint_steps": 2001, - "logdir": "GNMT-MP-cuDNN-enc", + "logdir": "GNMT-like-en-de", "optimizer": "Adam", "optimizer_params": {}, # luong10 decay scheme "lr_policy": exp_decay, "lr_policy_params": { - "learning_rate": 0.001, + "learning_rate": 0.0008, "begin_decay_at": 17000, "decay_steps": 1700, "decay_rate": 0.5, @@ -148,7 +148,7 @@ "data_layer": ParallelTextDataLayer, "data_layer_params": { - "pad_vocab_to_eight": True, + "pad_vocab_to_eight": False, "src_vocab_file": data_root+"vocab.bpe.32000", "tgt_vocab_file": data_root+"vocab.bpe.32000", "source_file": data_root+"newstest2014.tok.bpe.32000.en", diff --git a/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size10.py b/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod_iter_size10.py similarity index 100% rename from example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size10.py rename to example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod_iter_size10.py diff --git a/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size100.py b/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod_iter_size100.py similarity index 100% rename from 
example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size100.py rename to example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod_iter_size100.py diff --git a/example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size5.py b/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod_iter_size5.py similarity index 100% rename from example_configs/text2text/en-de-gnmt-like-8GPUs-horovod_iter_size5.py rename to example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod_iter_size5.py diff --git a/example_configs/text2text/en-de-nmt-small.py b/example_configs/text2text/en-de/en-de-nmt-small.py similarity index 99% rename from example_configs/text2text/en-de-nmt-small.py rename to example_configs/text2text/en-de/en-de-nmt-small.py index fd3a34691..9266827f7 100644 --- a/example_configs/text2text/en-de-nmt-small.py +++ b/example_configs/text2text/en-de/en-de-nmt-small.py @@ -30,7 +30,7 @@ "optimizer_params": {}, "lr_policy": fixed_lr, "lr_policy_params": { - "learning_rate": 0.001, + "learning_rate": 0.0008, }, #"larc_params": { # "larc_eta": 0.001, diff --git a/example_configs/text2text/transformer-base-test-mp.py b/example_configs/text2text/en-de/transformer-base-test-mp.py similarity index 100% rename from example_configs/text2text/transformer-base-test-mp.py rename to example_configs/text2text/en-de/transformer-base-test-mp.py diff --git a/example_configs/text2text/transformer-base-test.py b/example_configs/text2text/en-de/transformer-base-test.py similarity index 100% rename from example_configs/text2text/transformer-base-test.py rename to example_configs/text2text/en-de/transformer-base-test.py diff --git a/example_configs/text2text/transformer-base.py b/example_configs/text2text/en-de/transformer-base.py similarity index 100% rename from example_configs/text2text/transformer-base.py rename to example_configs/text2text/en-de/transformer-base.py From e0af7281bb05ea53ae1eaa0190b39650c1ff520e Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Tue, 19 Jun 2018 15:58:20 -0700 Subject: [PATCH 075/102] adjust batch size in gnmt configs --- .../text2text/de-en/de-en-gnmt-like-8GPUs-horovod.py | 2 +- .../text2text/en-de/en-de-gnmt-like-8GPUs-horovod.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/example_configs/text2text/de-en/de-en-gnmt-like-8GPUs-horovod.py b/example_configs/text2text/de-en/de-en-gnmt-like-8GPUs-horovod.py index b8902853b..a64b4ea9a 100644 --- a/example_configs/text2text/de-en/de-en-gnmt-like-8GPUs-horovod.py +++ b/example_configs/text2text/de-en/de-en-gnmt-like-8GPUs-horovod.py @@ -18,7 +18,7 @@ "use_horovod": True, "num_gpus": 1, # each Horovod process will occupy single GPU "max_steps": 34000, - "batch_size_per_gpu": 128, + "batch_size_per_gpu": 64, "save_summaries_steps": 50, "print_loss_steps": 48, "print_samples_steps": 48, diff --git a/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod.py b/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod.py index 0a7bc9513..2244756c4 100644 --- a/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod.py +++ b/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod.py @@ -18,7 +18,7 @@ "use_horovod": True, "num_gpus": 1, # each Horovod process will occupy single GPU "max_steps": 34000, - "batch_size_per_gpu": 128, + "batch_size_per_gpu": 64, "save_summaries_steps": 50, "print_loss_steps": 48, "print_samples_steps": 48, From 072ff5b0d60d16a6f8f7241587ad3d95f0ff53a0 Mon Sep 17 00:00:00 2001 From: Kipok Date: Wed, 20 Jun 2018 10:31:53 -0700 Subject: 
[PATCH 076/102] Add print_bench_info_steps parameter --- open_seq2seq/models/model.py | 8 ++++++++ open_seq2seq/utils/funcs.py | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/open_seq2seq/models/model.py b/open_seq2seq/models/model.py index 53fc7955b..b84136b5b 100644 --- a/open_seq2seq/models/model.py +++ b/open_seq2seq/models/model.py @@ -60,6 +60,7 @@ class :meth:`__init__` method. 'save_summaries_steps': None, # could be int or None 'print_loss_steps': None, # could be int or None 'print_samples_steps': None, # could be int or None + 'print_bench_info_steps': None, # could be int or None 'save_checkpoint_steps': None, # could be int or None 'eval_steps': int, @@ -126,6 +127,11 @@ def __init__(self, params, mode="train", hvd=None): * **print_samples_steps** (int or None) --- how often to print training samples (input sequences, correct answers and model predictions). Setting it to None disables samples printing. + * **print_bench_info_steps** (int or None) --- how often to print training + benchmarking information (average number of objects processed per step). + Setting it to None disables intermediate benchmarking printing, but + the average information across the whole training will always be printed + after the last iteration. * **save_checkpoint_steps** (int or None) --- how often to save model checkpoints. Setting it to None disables checkpoint saving. * **eval_steps** (int) --- how often to run evaluation during training. @@ -212,6 +218,8 @@ class docs. self._params['save_checkpoint_steps'] = None if 'save_summaries_steps' not in self._params: self._params['save_summaries_steps'] = None + if 'print_bench_info_steps' not in self._params: + self._params['print_bench_info_steps'] = None # checking that frequencies of samples and loss are aligned s_fr = self._params['print_samples_steps'] diff --git a/open_seq2seq/utils/funcs.py b/open_seq2seq/utils/funcs.py index a4bd2ee88..de3bf96ae 100644 --- a/open_seq2seq/utils/funcs.py +++ b/open_seq2seq/utils/funcs.py @@ -136,6 +136,14 @@ def run_with_no_hooks(step_context): if len(fetches) > 1: for i in range(train_model.num_gpus): total_objects += np.sum(fetches_vals[i + 1]) + if train_model.params['print_bench_info_steps'] is not None: + if step % train_model.params['print_bench_info_steps'] == 0: + total_objects_cur = collect_if_horovod(total_objects, hvd, + mode="sum") + if master_worker: + avg_objects = 1.0 * total_objects_cur / total_time + deco_print("Avg objects per second: {:.3f}".format(avg_objects)) + step += 1 if len(fetches) > 1: From a807c211a8506089098889edf9a3712c946a5ce2 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Wed, 20 Jun 2018 10:46:38 -0700 Subject: [PATCH 077/102] modify configs --- .../de-en/de-en-gnmt-like-8GPUs-horovod.py | 11 +- .../en-de/en-de-gnmt-like-8GPUs-horovod.py | 11 +- ...-de-gnmt-like-8GPUs-horovod_iter_size10.py | 163 ------------------ ...de-gnmt-like-8GPUs-horovod_iter_size100.py | 163 ------------------ ...n-de-gnmt-like-8GPUs-horovod_iter_size5.py | 163 ------------------ 5 files changed, 10 insertions(+), 501 deletions(-) delete mode 100644 example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod_iter_size10.py delete mode 100644 example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod_iter_size100.py delete mode 100644 example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod_iter_size5.py diff --git a/example_configs/text2text/de-en/de-en-gnmt-like-8GPUs-horovod.py b/example_configs/text2text/de-en/de-en-gnmt-like-8GPUs-horovod.py index a64b4ea9a..40b2222e3 
100644 --- a/example_configs/text2text/de-en/de-en-gnmt-like-8GPUs-horovod.py +++ b/example_configs/text2text/de-en/de-en-gnmt-like-8GPUs-horovod.py @@ -39,10 +39,9 @@ }, #"summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - #"max_grad_norm": 32768.0, - #"dtype": tf.float32, - "dtype": "mixed", - "loss_scaling": "Backoff", + "dtype": tf.float32, + #"dtype": "mixed", + #"loss_scaling": "Backoff", "encoder": GNMTLikeEncoderWithEmbedding_cuDNN, "encoder_params": { "initializer": tf.random_uniform_initializer, @@ -93,7 +92,7 @@ train_params = { "data_layer": ParallelTextDataLayer, "data_layer_params": { - "pad_vocab_to_eight": True, + "pad_vocab_to_eight": False, "src_vocab_file": data_root+"vocab.bpe.32000", "tgt_vocab_file": data_root+"vocab.bpe.32000", "source_file": data_root+"train.tok.clean.bpe.32000.de", @@ -110,7 +109,7 @@ "batch_size_per_gpu": 16, "data_layer": ParallelTextDataLayer, "data_layer_params": { - "pad_vocab_to_eight": True, + "pad_vocab_to_eight": False, "src_vocab_file": data_root+"vocab.bpe.32000", "tgt_vocab_file": data_root+"vocab.bpe.32000", "source_file": data_root+"newstest2013.tok.bpe.32000.de", diff --git a/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod.py b/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod.py index 2244756c4..2826b4a86 100644 --- a/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod.py +++ b/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod.py @@ -39,10 +39,9 @@ }, #"summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - #"max_grad_norm": 32768.0, - #"dtype": tf.float32, - "dtype": "mixed", - "loss_scaling": "Backoff", + "dtype": tf.float32, + #"dtype": "mixed", + #"loss_scaling": "Backoff", "encoder": GNMTLikeEncoderWithEmbedding_cuDNN, "encoder_params": { "initializer": tf.random_uniform_initializer, @@ -93,7 +92,7 @@ train_params = { "data_layer": ParallelTextDataLayer, "data_layer_params": { - "pad_vocab_to_eight": True, + "pad_vocab_to_eight": False, "src_vocab_file": data_root+"vocab.bpe.32000", "tgt_vocab_file": data_root+"vocab.bpe.32000", "source_file": data_root+"train.tok.clean.bpe.32000.en", @@ -110,7 +109,7 @@ "batch_size_per_gpu": 16, "data_layer": ParallelTextDataLayer, "data_layer_params": { - "pad_vocab_to_eight": True, + "pad_vocab_to_eight": False, "src_vocab_file": data_root+"vocab.bpe.32000", "tgt_vocab_file": data_root+"vocab.bpe.32000", "source_file": data_root+"newstest2013.tok.bpe.32000.en", diff --git a/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod_iter_size10.py b/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod_iter_size10.py deleted file mode 100644 index 2d72678f6..000000000 --- a/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod_iter_size10.py +++ /dev/null @@ -1,163 +0,0 @@ -from __future__ import absolute_import, division, print_function -import tensorflow as tf - -from open_seq2seq.models import Text2Text -from open_seq2seq.encoders import GNMTLikeEncoderWithEmbedding_cuDNN -from open_seq2seq.decoders import RNNDecoderWithAttention, \ - BeamSearchRNNDecoderWithAttention -from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer -from open_seq2seq.losses import BasicSequenceLoss -from open_seq2seq.data.text2text.text2text import SpecialTextTokens -from open_seq2seq.optimizers.lr_policies import exp_decay - -data_root = "/data/wmt16_s2s/" - 
-base_model = Text2Text - -base_params = { - "use_horovod": True, - "iter_size": 10, - "num_gpus": 1, - "max_steps": 3400, - "batch_size_per_gpu": 128, - "save_summaries_steps": 50, - "print_loss_steps": 48, - "print_samples_steps": 48, - "eval_steps": 1000, - "save_checkpoint_steps": 2001, - "logdir": "GNMT-MP-cuDNN-enc", - "optimizer": "Adam", - "optimizer_params": {}, - # luong10 decay scheme - "lr_policy": exp_decay, - "lr_policy_params": { - "learning_rate": 0.001, - "begin_decay_at": 1700, - "decay_steps": 170, - "decay_rate": 0.5, - "use_staircase_decay": True, - "min_lr": 0.0000005, - }, - #"summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - #"max_grad_norm": 32768.0, - #"dtype": tf.float32, - "dtype": "mixed", - "loss_scaling": "Backoff", - "encoder": GNMTLikeEncoderWithEmbedding_cuDNN, - "encoder_params": { - "initializer": tf.random_uniform_initializer, - "initializer_params": { - "minval": -0.1, - "maxval": 0.1, - }, - "encoder_cell_type": "lstm", - "encoder_cell_units": 1024, - "encoder_layers": 7, - "encoder_dp_output_keep_prob": 1.0, - "src_emb_size": 1024, - }, - - "decoder": RNNDecoderWithAttention, - "decoder_params": { - "initializer": tf.random_uniform_initializer, - "initializer_params": { - "minval": -0.1, - "maxval": 0.1, - }, - "core_cell": tf.nn.rnn_cell.LSTMCell, - "core_cell_params": { - "num_units": 1024, - "forget_bias": 1.0, - }, - - "decoder_layers": 8, - "decoder_dp_input_keep_prob": 0.8, - "decoder_dp_output_keep_prob": 1.0, - "decoder_use_skip_connections": True, - "GO_SYMBOL": SpecialTextTokens.S_ID.value, - "END_SYMBOL": SpecialTextTokens.EOS_ID.value, - - "tgt_emb_size": 1024, - "attention_type": "gnmt_v2", - "attention_layer_size": 1024, - }, - - "loss": BasicSequenceLoss, - "loss_params": { - "offset_target_by_one": True, - "average_across_timestep": True, - "do_mask": True - } -} - -train_params = { - "data_layer": ParallelTextDataLayer, - "data_layer_params": { - "pad_vocab_to_eight": True, - "src_vocab_file": data_root+"vocab.bpe.32000", - "tgt_vocab_file": data_root+"vocab.bpe.32000", - "source_file": data_root+"train.tok.clean.bpe.32000.en", - "target_file": data_root+"train.tok.clean.bpe.32000.de", - "delimiter": " ", - "shuffle": True, - "repeat": True, - "map_parallel_calls": 16, - "prefetch_buffer_size": 8, - "max_length": 50, - }, -} -eval_params = { - "batch_size_per_gpu": 16, - "data_layer": ParallelTextDataLayer, - "data_layer_params": { - "pad_vocab_to_eight": True, - "src_vocab_file": data_root+"vocab.bpe.32000", - "tgt_vocab_file": data_root+"vocab.bpe.32000", - "source_file": data_root+"newstest2013.tok.bpe.32000.en", - "target_file": data_root+"newstest2013.tok.bpe.32000.de", - "delimiter": " ", - "shuffle": False, - "repeat": True, - "map_parallel_calls": 16, - "prefetch_buffer_size": 1, - "max_length": 32, - }, -} - -infer_params = { - "batch_size_per_gpu": 1, - "decoder": BeamSearchRNNDecoderWithAttention, - "decoder_params": { - "beam_width": 10, - "length_penalty": 1.0, - "core_cell": tf.nn.rnn_cell.LSTMCell, - "core_cell_params": { - "num_units": 1024, - }, - "decoder_layers": 8, - "decoder_dp_input_keep_prob": 0.8, - "decoder_dp_output_keep_prob": 1.0, - "decoder_use_skip_connections": True, - "GO_SYMBOL": SpecialTextTokens.S_ID.value, - "END_SYMBOL": SpecialTextTokens.EOS_ID.value, - "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, - "tgt_emb_size": 1024, - "attention_type": "gnmt_v2", - "attention_layer_size": 1024, - }, - - "data_layer": 
ParallelTextDataLayer, - "data_layer_params": { - "pad_vocab_to_eight": True, - "src_vocab_file": data_root+"vocab.bpe.32000", - "tgt_vocab_file": data_root+"vocab.bpe.32000", - "source_file": data_root+"newstest2014.tok.bpe.32000.en", - # this is intentional - "target_file": data_root+"newstest2014.tok.bpe.32000.en", - "delimiter": " ", - "shuffle": False, - "repeat": False, - "max_length": 512, - }, -} diff --git a/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod_iter_size100.py b/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod_iter_size100.py deleted file mode 100644 index 6cbd3d66c..000000000 --- a/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod_iter_size100.py +++ /dev/null @@ -1,163 +0,0 @@ -from __future__ import absolute_import, division, print_function -import tensorflow as tf - -from open_seq2seq.models import Text2Text -from open_seq2seq.encoders import GNMTLikeEncoderWithEmbedding_cuDNN -from open_seq2seq.decoders import RNNDecoderWithAttention, \ - BeamSearchRNNDecoderWithAttention -from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer -from open_seq2seq.losses import BasicSequenceLoss -from open_seq2seq.data.text2text.text2text import SpecialTextTokens -from open_seq2seq.optimizers.lr_policies import exp_decay - -data_root = "/data/wmt16_s2s/" - -base_model = Text2Text - -base_params = { - "use_horovod": True, - "iter_size": 100, - "num_gpus": 1, - "max_steps": 340, - "batch_size_per_gpu": 128, - "save_summaries_steps": 50, - "print_loss_steps": 48, - "print_samples_steps": 48, - "eval_steps": 1000, - "save_checkpoint_steps": 2001, - "logdir": "GNMT-MP-cuDNN-enc", - "optimizer": "Adam", - "optimizer_params": {}, - # luong10 decay scheme - "lr_policy": exp_decay, - "lr_policy_params": { - "learning_rate": 0.001, - "begin_decay_at": 170, - "decay_steps": 17, - "decay_rate": 0.5, - "use_staircase_decay": True, - "min_lr": 0.0000005, - }, - #"summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - #"max_grad_norm": 32768.0, - #"dtype": tf.float32, - "dtype": "mixed", - "loss_scaling": "Backoff", - "encoder": GNMTLikeEncoderWithEmbedding_cuDNN, - "encoder_params": { - "initializer": tf.random_uniform_initializer, - "initializer_params": { - "minval": -0.1, - "maxval": 0.1, - }, - "encoder_cell_type": "lstm", - "encoder_cell_units": 1024, - "encoder_layers": 7, - "encoder_dp_output_keep_prob": 1.0, - "src_emb_size": 1024, - }, - - "decoder": RNNDecoderWithAttention, - "decoder_params": { - "initializer": tf.random_uniform_initializer, - "initializer_params": { - "minval": -0.1, - "maxval": 0.1, - }, - "core_cell": tf.nn.rnn_cell.LSTMCell, - "core_cell_params": { - "num_units": 1024, - "forget_bias": 1.0, - }, - - "decoder_layers": 8, - "decoder_dp_input_keep_prob": 0.8, - "decoder_dp_output_keep_prob": 1.0, - "decoder_use_skip_connections": True, - "GO_SYMBOL": SpecialTextTokens.S_ID.value, - "END_SYMBOL": SpecialTextTokens.EOS_ID.value, - - "tgt_emb_size": 1024, - "attention_type": "gnmt_v2", - "attention_layer_size": 1024, - }, - - "loss": BasicSequenceLoss, - "loss_params": { - "offset_target_by_one": True, - "average_across_timestep": True, - "do_mask": True - } -} - -train_params = { - "data_layer": ParallelTextDataLayer, - "data_layer_params": { - "pad_vocab_to_eight": True, - "src_vocab_file": data_root+"vocab.bpe.32000", - "tgt_vocab_file": data_root+"vocab.bpe.32000", - "source_file": data_root+"train.tok.clean.bpe.32000.en", - 
"target_file": data_root+"train.tok.clean.bpe.32000.de", - "delimiter": " ", - "shuffle": True, - "repeat": True, - "map_parallel_calls": 16, - "prefetch_buffer_size": 8, - "max_length": 50, - }, -} -eval_params = { - "batch_size_per_gpu": 16, - "data_layer": ParallelTextDataLayer, - "data_layer_params": { - "pad_vocab_to_eight": True, - "src_vocab_file": data_root+"vocab.bpe.32000", - "tgt_vocab_file": data_root+"vocab.bpe.32000", - "source_file": data_root+"newstest2013.tok.bpe.32000.en", - "target_file": data_root+"newstest2013.tok.bpe.32000.de", - "delimiter": " ", - "shuffle": False, - "repeat": True, - "map_parallel_calls": 16, - "prefetch_buffer_size": 1, - "max_length": 32, - }, -} - -infer_params = { - "batch_size_per_gpu": 1, - "decoder": BeamSearchRNNDecoderWithAttention, - "decoder_params": { - "beam_width": 10, - "length_penalty": 1.0, - "core_cell": tf.nn.rnn_cell.LSTMCell, - "core_cell_params": { - "num_units": 1024, - }, - "decoder_layers": 8, - "decoder_dp_input_keep_prob": 0.8, - "decoder_dp_output_keep_prob": 1.0, - "decoder_use_skip_connections": True, - "GO_SYMBOL": SpecialTextTokens.S_ID.value, - "END_SYMBOL": SpecialTextTokens.EOS_ID.value, - "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, - "tgt_emb_size": 1024, - "attention_type": "gnmt_v2", - "attention_layer_size": 1024, - }, - - "data_layer": ParallelTextDataLayer, - "data_layer_params": { - "pad_vocab_to_eight": True, - "src_vocab_file": data_root+"vocab.bpe.32000", - "tgt_vocab_file": data_root+"vocab.bpe.32000", - "source_file": data_root+"newstest2014.tok.bpe.32000.en", - # this is intentional - "target_file": data_root+"newstest2014.tok.bpe.32000.en", - "delimiter": " ", - "shuffle": False, - "repeat": False, - "max_length": 512, - }, -} diff --git a/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod_iter_size5.py b/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod_iter_size5.py deleted file mode 100644 index 3608fe997..000000000 --- a/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod_iter_size5.py +++ /dev/null @@ -1,163 +0,0 @@ -from __future__ import absolute_import, division, print_function -import tensorflow as tf - -from open_seq2seq.models import Text2Text -from open_seq2seq.encoders import GNMTLikeEncoderWithEmbedding_cuDNN -from open_seq2seq.decoders import RNNDecoderWithAttention, \ - BeamSearchRNNDecoderWithAttention -from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer -from open_seq2seq.losses import BasicSequenceLoss -from open_seq2seq.data.text2text.text2text import SpecialTextTokens -from open_seq2seq.optimizers.lr_policies import exp_decay - -data_root = "/data/wmt16_s2s/" - -base_model = Text2Text - -base_params = { - "use_horovod": True, - "iter_size": 5, - "num_gpus": 1, - "max_steps": 6800, - "batch_size_per_gpu": 128, - "save_summaries_steps": 50, - "print_loss_steps": 48, - "print_samples_steps": 48, - "eval_steps": 1000, - "save_checkpoint_steps": 2001, - "logdir": "GNMT-MP-cuDNN-enc", - "optimizer": "Adam", - "optimizer_params": {}, - # luong10 decay scheme - "lr_policy": exp_decay, - "lr_policy_params": { - "learning_rate": 0.001, - "begin_decay_at": 3400, - "decay_steps": 340, - "decay_rate": 0.5, - "use_staircase_decay": True, - "min_lr": 0.0000005, - }, - #"summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - #"max_grad_norm": 32768.0, - #"dtype": tf.float32, - "dtype": "mixed", - "loss_scaling": "Backoff", - "encoder": 
GNMTLikeEncoderWithEmbedding_cuDNN, - "encoder_params": { - "initializer": tf.random_uniform_initializer, - "initializer_params": { - "minval": -0.1, - "maxval": 0.1, - }, - "encoder_cell_type": "lstm", - "encoder_cell_units": 1024, - "encoder_layers": 7, - "encoder_dp_output_keep_prob": 1.0, - "src_emb_size": 1024, - }, - - "decoder": RNNDecoderWithAttention, - "decoder_params": { - "initializer": tf.random_uniform_initializer, - "initializer_params": { - "minval": -0.1, - "maxval": 0.1, - }, - "core_cell": tf.nn.rnn_cell.LSTMCell, - "core_cell_params": { - "num_units": 1024, - "forget_bias": 1.0, - }, - - "decoder_layers": 8, - "decoder_dp_input_keep_prob": 0.8, - "decoder_dp_output_keep_prob": 1.0, - "decoder_use_skip_connections": True, - "GO_SYMBOL": SpecialTextTokens.S_ID.value, - "END_SYMBOL": SpecialTextTokens.EOS_ID.value, - - "tgt_emb_size": 1024, - "attention_type": "gnmt_v2", - "attention_layer_size": 1024, - }, - - "loss": BasicSequenceLoss, - "loss_params": { - "offset_target_by_one": True, - "average_across_timestep": True, - "do_mask": True - } -} - -train_params = { - "data_layer": ParallelTextDataLayer, - "data_layer_params": { - "pad_vocab_to_eight": True, - "src_vocab_file": data_root+"vocab.bpe.32000", - "tgt_vocab_file": data_root+"vocab.bpe.32000", - "source_file": data_root+"train.tok.clean.bpe.32000.en", - "target_file": data_root+"train.tok.clean.bpe.32000.de", - "delimiter": " ", - "shuffle": True, - "repeat": True, - "map_parallel_calls": 16, - "prefetch_buffer_size": 8, - "max_length": 50, - }, -} -eval_params = { - "batch_size_per_gpu": 16, - "data_layer": ParallelTextDataLayer, - "data_layer_params": { - "pad_vocab_to_eight": True, - "src_vocab_file": data_root+"vocab.bpe.32000", - "tgt_vocab_file": data_root+"vocab.bpe.32000", - "source_file": data_root+"newstest2013.tok.bpe.32000.en", - "target_file": data_root+"newstest2013.tok.bpe.32000.de", - "delimiter": " ", - "shuffle": False, - "repeat": True, - "map_parallel_calls": 16, - "prefetch_buffer_size": 1, - "max_length": 32, - }, -} - -infer_params = { - "batch_size_per_gpu": 1, - "decoder": BeamSearchRNNDecoderWithAttention, - "decoder_params": { - "beam_width": 10, - "length_penalty": 1.0, - "core_cell": tf.nn.rnn_cell.LSTMCell, - "core_cell_params": { - "num_units": 1024, - }, - "decoder_layers": 8, - "decoder_dp_input_keep_prob": 0.8, - "decoder_dp_output_keep_prob": 1.0, - "decoder_use_skip_connections": True, - "GO_SYMBOL": SpecialTextTokens.S_ID.value, - "END_SYMBOL": SpecialTextTokens.EOS_ID.value, - "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, - "tgt_emb_size": 1024, - "attention_type": "gnmt_v2", - "attention_layer_size": 1024, - }, - - "data_layer": ParallelTextDataLayer, - "data_layer_params": { - "pad_vocab_to_eight": True, - "src_vocab_file": data_root+"vocab.bpe.32000", - "tgt_vocab_file": data_root+"vocab.bpe.32000", - "source_file": data_root+"newstest2014.tok.bpe.32000.en", - # this is intentional - "target_file": data_root+"newstest2014.tok.bpe.32000.en", - "delimiter": " ", - "shuffle": False, - "repeat": False, - "max_length": 512, - }, -} From a2cdabbe660c6b42e6bc432e5d4f1e261eb0848c Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Wed, 20 Jun 2018 11:03:31 -0700 Subject: [PATCH 078/102] add more configs --- .../en-de/en-de-gnmt-like-8GPUs-fast.py | 163 ++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-fast.py diff --git a/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-fast.py 
b/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-fast.py new file mode 100644 index 000000000..e538978e2 --- /dev/null +++ b/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-fast.py @@ -0,0 +1,163 @@ +from __future__ import absolute_import, division, print_function +import tensorflow as tf + +from open_seq2seq.models import Text2Text +from open_seq2seq.encoders import GNMTLikeEncoderWithEmbedding_cuDNN +from open_seq2seq.decoders import RNNDecoderWithAttention, \ + BeamSearchRNNDecoderWithAttention +from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer +from open_seq2seq.losses import BasicSequenceLoss +from open_seq2seq.data.text2text.text2text import SpecialTextTokens +from open_seq2seq.optimizers.lr_policies import exp_decay + +data_root = "/data/wmt16_s2s/" + +base_model = Text2Text +pad_vocabs_2_eight = True + +base_params = { + "use_horovod": True, + "num_gpus": 1, # each Horovod process will occupy single GPU + "max_steps": 34000, + "batch_size_per_gpu": 64, + "iter_size": 2, # This will make virtual batch_per_gpu = batch_size_per_gpu * iter_size + "save_summaries_steps": 50, + "print_loss_steps": 48, + "print_samples_steps": 48, + "eval_steps": 1000, + "save_checkpoint_steps": 2001, + "logdir": "GNMT-like-en-de", + "optimizer": "Adam", + "optimizer_params": {}, + # luong10 decay scheme + "lr_policy": exp_decay, + "lr_policy_params": { + "learning_rate": 0.0008, + "begin_decay_at": 17000, + "decay_steps": 1700, + "decay_rate": 0.5, + "use_staircase_decay": True, + "min_lr": 0.0000005, + }, + #"summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + #"dtype": tf.float32, + "dtype": "mixed", + "loss_scaling": "Backoff", + "encoder": GNMTLikeEncoderWithEmbedding_cuDNN, + "encoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "encoder_cell_type": "lstm", + "encoder_cell_units": 1024, + "encoder_layers": 7, + "encoder_dp_output_keep_prob": 1.0, + "src_emb_size": 1024, + }, + + "decoder": RNNDecoderWithAttention, + "decoder_params": { + "initializer": tf.random_uniform_initializer, + "initializer_params": { + "minval": -0.1, + "maxval": 0.1, + }, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 1024, + "forget_bias": 1.0, + }, + + "decoder_layers": 8, + "decoder_dp_input_keep_prob": 0.8, + "decoder_dp_output_keep_prob": 1.0, + "decoder_use_skip_connections": True, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + + "tgt_emb_size": 1024, + "attention_type": "gnmt_v2", + "attention_layer_size": 1024, + }, + + "loss": BasicSequenceLoss, + "loss_params": { + "offset_target_by_one": True, + "average_across_timestep": True, + "do_mask": True + } +} + +train_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": pad_vocabs_2_eight, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"train.tok.clean.bpe.32000.en", + "target_file": data_root+"train.tok.clean.bpe.32000.de", + "delimiter": " ", + "shuffle": True, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 8, + "max_length": 50, + }, +} +eval_params = { + "batch_size_per_gpu": 16, + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": pad_vocabs_2_eight, + "src_vocab_file": data_root+"vocab.bpe.32000", + 
"tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"newstest2013.tok.bpe.32000.en", + "target_file": data_root+"newstest2013.tok.bpe.32000.de", + "delimiter": " ", + "shuffle": False, + "repeat": True, + "map_parallel_calls": 16, + "prefetch_buffer_size": 1, + "max_length": 32, + }, +} + +infer_params = { + "batch_size_per_gpu": 8, + "decoder": BeamSearchRNNDecoderWithAttention, + "decoder_params": { + "beam_width": 10, + "length_penalty": 1.0, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 1024, + }, + "decoder_layers": 8, + "decoder_dp_input_keep_prob": 0.8, + "decoder_dp_output_keep_prob": 1.0, + "decoder_use_skip_connections": True, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + "tgt_emb_size": 1024, + "attention_type": "gnmt_v2", + "attention_layer_size": 1024, + }, + + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": pad_vocabs_2_eight, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"newstest2014.tok.bpe.32000.en", + # this is intentional + "target_file": data_root+"newstest2014.tok.bpe.32000.en", + "delimiter": " ", + "shuffle": False, + "repeat": False, + "max_length": 512, + }, +} From c7ea9c85e98c7dc3142fa2f57cbf0d8e5083f483 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Fri, 22 Jun 2018 17:37:48 -0700 Subject: [PATCH 079/102] update text2text configs --- .../de-en/de-en-gnmt-like-8GPUs-horovod.py | 161 ----------------- .../{en-de => }/en-de-gnmt-like-4GPUs.py | 37 ++-- .../text2text/{en-de => }/en-de-nmt-small.py | 31 ++-- .../en-de/en-de-gnmt-like-8GPUs-fast.py | 163 ------------------ .../en-de/en-de-gnmt-like-8GPUs-horovod.py | 161 ----------------- .../en-de/transformer-base-test-mp.py | 133 -------------- .../text2text/en-de/transformer-base-test.py | 133 -------------- example_configs/text2text/nmt-reversal-RR.py | 16 +- example_configs/text2text/nmt-reversal-RT.py | 6 +- ...transformer-base.py => transformer-big.py} | 2 +- 10 files changed, 43 insertions(+), 800 deletions(-) delete mode 100644 example_configs/text2text/de-en/de-en-gnmt-like-8GPUs-horovod.py rename example_configs/text2text/{en-de => }/en-de-gnmt-like-4GPUs.py (84%) rename example_configs/text2text/{en-de => }/en-de-nmt-small.py (90%) delete mode 100644 example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-fast.py delete mode 100644 example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod.py delete mode 100644 example_configs/text2text/en-de/transformer-base-test-mp.py delete mode 100644 example_configs/text2text/en-de/transformer-base-test.py rename example_configs/text2text/{en-de/transformer-base.py => transformer-big.py} (98%) diff --git a/example_configs/text2text/de-en/de-en-gnmt-like-8GPUs-horovod.py b/example_configs/text2text/de-en/de-en-gnmt-like-8GPUs-horovod.py deleted file mode 100644 index 40b2222e3..000000000 --- a/example_configs/text2text/de-en/de-en-gnmt-like-8GPUs-horovod.py +++ /dev/null @@ -1,161 +0,0 @@ -from __future__ import absolute_import, division, print_function -import tensorflow as tf - -from open_seq2seq.models import Text2Text -from open_seq2seq.encoders import GNMTLikeEncoderWithEmbedding_cuDNN -from open_seq2seq.decoders import RNNDecoderWithAttention, \ - BeamSearchRNNDecoderWithAttention -from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer -from open_seq2seq.losses import 
BasicSequenceLoss -from open_seq2seq.data.text2text.text2text import SpecialTextTokens -from open_seq2seq.optimizers.lr_policies import exp_decay - -data_root = "/data/wmt16_s2s/" - -base_model = Text2Text - -base_params = { - "use_horovod": True, - "num_gpus": 1, # each Horovod process will occupy single GPU - "max_steps": 34000, - "batch_size_per_gpu": 64, - "save_summaries_steps": 50, - "print_loss_steps": 48, - "print_samples_steps": 48, - "eval_steps": 1000, - "save_checkpoint_steps": 2001, - "logdir": "GNMT-like-de-en", - "optimizer": "Adam", - "optimizer_params": {}, - # luong10 decay scheme - "lr_policy": exp_decay, - "lr_policy_params": { - "learning_rate": 0.0008, - "begin_decay_at": 17000, - "decay_steps": 1700, - "decay_rate": 0.5, - "use_staircase_decay": True, - "min_lr": 0.0000005, - }, - #"summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - "dtype": tf.float32, - #"dtype": "mixed", - #"loss_scaling": "Backoff", - "encoder": GNMTLikeEncoderWithEmbedding_cuDNN, - "encoder_params": { - "initializer": tf.random_uniform_initializer, - "initializer_params": { - "minval": -0.1, - "maxval": 0.1, - }, - "encoder_cell_type": "lstm", - "encoder_cell_units": 1024, - "encoder_layers": 7, - "encoder_dp_output_keep_prob": 1.0, - "src_emb_size": 1024, - }, - - "decoder": RNNDecoderWithAttention, - "decoder_params": { - "initializer": tf.random_uniform_initializer, - "initializer_params": { - "minval": -0.1, - "maxval": 0.1, - }, - "core_cell": tf.nn.rnn_cell.LSTMCell, - "core_cell_params": { - "num_units": 1024, - "forget_bias": 1.0, - }, - - "decoder_layers": 8, - "decoder_dp_input_keep_prob": 0.8, - "decoder_dp_output_keep_prob": 1.0, - "decoder_use_skip_connections": True, - "GO_SYMBOL": SpecialTextTokens.S_ID.value, - "END_SYMBOL": SpecialTextTokens.EOS_ID.value, - - "tgt_emb_size": 1024, - "attention_type": "gnmt_v2", - "attention_layer_size": 1024, - }, - - "loss": BasicSequenceLoss, - "loss_params": { - "offset_target_by_one": True, - "average_across_timestep": True, - "do_mask": True - } -} - -train_params = { - "data_layer": ParallelTextDataLayer, - "data_layer_params": { - "pad_vocab_to_eight": False, - "src_vocab_file": data_root+"vocab.bpe.32000", - "tgt_vocab_file": data_root+"vocab.bpe.32000", - "source_file": data_root+"train.tok.clean.bpe.32000.de", - "target_file": data_root+"train.tok.clean.bpe.32000.en", - "delimiter": " ", - "shuffle": True, - "repeat": True, - "map_parallel_calls": 16, - "prefetch_buffer_size": 8, - "max_length": 50, - }, -} -eval_params = { - "batch_size_per_gpu": 16, - "data_layer": ParallelTextDataLayer, - "data_layer_params": { - "pad_vocab_to_eight": False, - "src_vocab_file": data_root+"vocab.bpe.32000", - "tgt_vocab_file": data_root+"vocab.bpe.32000", - "source_file": data_root+"newstest2013.tok.bpe.32000.de", - "target_file": data_root+"newstest2013.tok.bpe.32000.en", - "delimiter": " ", - "shuffle": False, - "repeat": True, - "map_parallel_calls": 16, - "prefetch_buffer_size": 1, - "max_length": 32, - }, -} - -infer_params = { - "batch_size_per_gpu": 1, - "decoder": BeamSearchRNNDecoderWithAttention, - "decoder_params": { - "beam_width": 10, - "length_penalty": 1.0, - "core_cell": tf.nn.rnn_cell.LSTMCell, - "core_cell_params": { - "num_units": 1024, - }, - "decoder_layers": 8, - "decoder_dp_input_keep_prob": 0.8, - "decoder_dp_output_keep_prob": 1.0, - "decoder_use_skip_connections": True, - "GO_SYMBOL": SpecialTextTokens.S_ID.value, - "END_SYMBOL": 
SpecialTextTokens.EOS_ID.value, - "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, - "tgt_emb_size": 1024, - "attention_type": "gnmt_v2", - "attention_layer_size": 1024, - }, - - "data_layer": ParallelTextDataLayer, - "data_layer_params": { - "pad_vocab_to_eight": False, - "src_vocab_file": data_root+"vocab.bpe.32000", - "tgt_vocab_file": data_root+"vocab.bpe.32000", - "source_file": data_root+"newstest2014.tok.bpe.32000.de", - # this is intentional - "target_file": data_root+"newstest2014.tok.bpe.32000.de", - "delimiter": " ", - "shuffle": False, - "repeat": False, - "max_length": 512, - }, -} diff --git a/example_configs/text2text/en-de/en-de-gnmt-like-4GPUs.py b/example_configs/text2text/en-de-gnmt-like-4GPUs.py similarity index 84% rename from example_configs/text2text/en-de/en-de-gnmt-like-4GPUs.py rename to example_configs/text2text/en-de-gnmt-like-4GPUs.py index 895552d65..6d13440f0 100644 --- a/example_configs/text2text/en-de/en-de-gnmt-like-4GPUs.py +++ b/example_configs/text2text/en-de-gnmt-like-4GPUs.py @@ -10,7 +10,8 @@ from open_seq2seq.data.text2text.text2text import SpecialTextTokens from open_seq2seq.optimizers.lr_policies import exp_decay -data_root = "[REPLACE THIS TO THE PATH WITH YOUR WMT DATA]" +#data_root = "[REPLACE THIS TO THE PATH WITH YOUR WMT DATA]" +data_root = "/mnt/D1/Data/Translate/wmt16/" base_model = Text2Text @@ -37,12 +38,12 @@ "use_staircase_decay": True, "min_lr": 0.0000005, }, - # "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + #"summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], "max_grad_norm": 32768.0, "dtype": tf.float32, - # "dtype": "mixed", - # "loss_scaling": "Backoff", + #"dtype": "mixed", + #"automatic_loss_scaling": "Backoff", "encoder": GNMTLikeEncoderWithEmbedding, "encoder_params": { "initializer": tf.random_uniform_initializer, @@ -50,12 +51,10 @@ "minval": -0.1, "maxval": 0.1, }, - #"encoder_cell_type": "lstm", - #"encoder_cell_units": 1024, - "core_cell": tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell, + "core_cell": tf.nn.rnn_cell.LSTMCell, "core_cell_params": { - "num_units": 1024, - # "forget_bias": 1.0, + "num_units": 1024, + "forget_bias": 1.0, }, "encoder_layers": 7, "encoder_dp_input_keep_prob": 0.8, @@ -71,12 +70,10 @@ "minval": -0.1, "maxval": 0.1, }, - #"decoder_cell_type": "lstm", - #"decoder_cell_units": 1024, - "core_cell": tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell, + "core_cell": tf.nn.rnn_cell.LSTMCell, "core_cell_params": { - "num_units": 1024, - # "forget_bias": 1.0, + "num_units": 1024, + "forget_bias": 1.0, }, "decoder_layers": 8, "decoder_dp_input_keep_prob": 0.8, @@ -138,12 +135,10 @@ "decoder_params": { "beam_width": 10, "length_penalty": 1.0, - #"decoder_cell_type": "lstm", - #"decoder_cell_units": 1024, - "core_cell": tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell, + "core_cell": tf.nn.rnn_cell.LSTMCell, "core_cell_params": { - "num_units": 1024, - # "forget_bias": 1.0, + "num_units": 1024, + "forget_bias": 1.0, }, "decoder_layers": 8, "decoder_dp_input_keep_prob": 0.8, @@ -170,4 +165,4 @@ "repeat": False, "max_length": 512, }, -} +} \ No newline at end of file diff --git a/example_configs/text2text/en-de/en-de-nmt-small.py b/example_configs/text2text/en-de-nmt-small.py similarity index 90% rename from example_configs/text2text/en-de/en-de-nmt-small.py rename to example_configs/text2text/en-de-nmt-small.py index 9266827f7..16ae97895 100644 
--- a/example_configs/text2text/en-de/en-de-nmt-small.py +++ b/example_configs/text2text/en-de-nmt-small.py @@ -10,7 +10,7 @@ from open_seq2seq.data.text2text.text2text import SpecialTextTokens from open_seq2seq.optimizers.lr_policies import fixed_lr -data_root = "/data/wmt16_s2s/" +data_root = "[REPLACE THIS TO THE PATH WITH YOUR WMT DATA]" # This model should run fine on single GPU such as 1080ti or better base_model = Text2Text @@ -18,34 +18,34 @@ base_params = { "use_horovod": False, "num_gpus": 1, - "max_steps": 100000, + "max_steps": 160082, "batch_size_per_gpu": 128, "save_summaries_steps": 50, "print_loss_steps": 48, "print_samples_steps": 48, - "eval_steps": 3000, + "eval_steps": 1000, "save_checkpoint_steps": 2001, "logdir": "nmt-small-en-de", "optimizer": "Adam", "optimizer_params": {}, "lr_policy": fixed_lr, "lr_policy_params": { - "learning_rate": 0.0008, + "learning_rate": 0.001, + }, + "larc_params": { + "larc_eta": 0.001, }, - #"larc_params": { - # "larc_eta": 0.001, - #}, "dtype": tf.float32, - # "dtype": "mixed", - # "loss_scaling": "Backoff", + #"dtype": "mixed", + #"automatic_loss_scaling": "Backoff", "encoder": BidirectionalRNNEncoderWithEmbedding, "encoder_params": { "initializer": tf.glorot_uniform_initializer, "core_cell": tf.nn.rnn_cell.LSTMCell, "core_cell_params": { - "num_units": 512, - "forget_bias": 1.0, + "num_units": 512, + "forget_bias": 1.0, }, "encoder_layers": 2, "encoder_dp_input_keep_prob": 0.8, @@ -122,8 +122,11 @@ "decoder_params": { "beam_width": 10, "length_penalty": 1.0, - "decoder_cell_type": "lstm", - "decoder_cell_units": 512, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 512, + "forget_bias": 1.0, + }, "decoder_layers": 2, "decoder_dp_input_keep_prob": 0.8, "decoder_dp_output_keep_prob": 1.0, @@ -148,4 +151,4 @@ "max_length": 256, "prefetch_buffer_size": 1, }, -} +} \ No newline at end of file diff --git a/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-fast.py b/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-fast.py deleted file mode 100644 index e538978e2..000000000 --- a/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-fast.py +++ /dev/null @@ -1,163 +0,0 @@ -from __future__ import absolute_import, division, print_function -import tensorflow as tf - -from open_seq2seq.models import Text2Text -from open_seq2seq.encoders import GNMTLikeEncoderWithEmbedding_cuDNN -from open_seq2seq.decoders import RNNDecoderWithAttention, \ - BeamSearchRNNDecoderWithAttention -from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer -from open_seq2seq.losses import BasicSequenceLoss -from open_seq2seq.data.text2text.text2text import SpecialTextTokens -from open_seq2seq.optimizers.lr_policies import exp_decay - -data_root = "/data/wmt16_s2s/" - -base_model = Text2Text -pad_vocabs_2_eight = True - -base_params = { - "use_horovod": True, - "num_gpus": 1, # each Horovod process will occupy single GPU - "max_steps": 34000, - "batch_size_per_gpu": 64, - "iter_size": 2, # This will make virtual batch_per_gpu = batch_size_per_gpu * iter_size - "save_summaries_steps": 50, - "print_loss_steps": 48, - "print_samples_steps": 48, - "eval_steps": 1000, - "save_checkpoint_steps": 2001, - "logdir": "GNMT-like-en-de", - "optimizer": "Adam", - "optimizer_params": {}, - # luong10 decay scheme - "lr_policy": exp_decay, - "lr_policy_params": { - "learning_rate": 0.0008, - "begin_decay_at": 17000, - "decay_steps": 1700, - "decay_rate": 0.5, - "use_staircase_decay": True, - "min_lr": 0.0000005, - }, - #"summaries": 
['learning_rate', 'variables', 'gradients', 'larc_summaries', - # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - #"dtype": tf.float32, - "dtype": "mixed", - "loss_scaling": "Backoff", - "encoder": GNMTLikeEncoderWithEmbedding_cuDNN, - "encoder_params": { - "initializer": tf.random_uniform_initializer, - "initializer_params": { - "minval": -0.1, - "maxval": 0.1, - }, - "encoder_cell_type": "lstm", - "encoder_cell_units": 1024, - "encoder_layers": 7, - "encoder_dp_output_keep_prob": 1.0, - "src_emb_size": 1024, - }, - - "decoder": RNNDecoderWithAttention, - "decoder_params": { - "initializer": tf.random_uniform_initializer, - "initializer_params": { - "minval": -0.1, - "maxval": 0.1, - }, - "core_cell": tf.nn.rnn_cell.LSTMCell, - "core_cell_params": { - "num_units": 1024, - "forget_bias": 1.0, - }, - - "decoder_layers": 8, - "decoder_dp_input_keep_prob": 0.8, - "decoder_dp_output_keep_prob": 1.0, - "decoder_use_skip_connections": True, - "GO_SYMBOL": SpecialTextTokens.S_ID.value, - "END_SYMBOL": SpecialTextTokens.EOS_ID.value, - - "tgt_emb_size": 1024, - "attention_type": "gnmt_v2", - "attention_layer_size": 1024, - }, - - "loss": BasicSequenceLoss, - "loss_params": { - "offset_target_by_one": True, - "average_across_timestep": True, - "do_mask": True - } -} - -train_params = { - "data_layer": ParallelTextDataLayer, - "data_layer_params": { - "pad_vocab_to_eight": pad_vocabs_2_eight, - "src_vocab_file": data_root+"vocab.bpe.32000", - "tgt_vocab_file": data_root+"vocab.bpe.32000", - "source_file": data_root+"train.tok.clean.bpe.32000.en", - "target_file": data_root+"train.tok.clean.bpe.32000.de", - "delimiter": " ", - "shuffle": True, - "repeat": True, - "map_parallel_calls": 16, - "prefetch_buffer_size": 8, - "max_length": 50, - }, -} -eval_params = { - "batch_size_per_gpu": 16, - "data_layer": ParallelTextDataLayer, - "data_layer_params": { - "pad_vocab_to_eight": pad_vocabs_2_eight, - "src_vocab_file": data_root+"vocab.bpe.32000", - "tgt_vocab_file": data_root+"vocab.bpe.32000", - "source_file": data_root+"newstest2013.tok.bpe.32000.en", - "target_file": data_root+"newstest2013.tok.bpe.32000.de", - "delimiter": " ", - "shuffle": False, - "repeat": True, - "map_parallel_calls": 16, - "prefetch_buffer_size": 1, - "max_length": 32, - }, -} - -infer_params = { - "batch_size_per_gpu": 8, - "decoder": BeamSearchRNNDecoderWithAttention, - "decoder_params": { - "beam_width": 10, - "length_penalty": 1.0, - "core_cell": tf.nn.rnn_cell.LSTMCell, - "core_cell_params": { - "num_units": 1024, - }, - "decoder_layers": 8, - "decoder_dp_input_keep_prob": 0.8, - "decoder_dp_output_keep_prob": 1.0, - "decoder_use_skip_connections": True, - "GO_SYMBOL": SpecialTextTokens.S_ID.value, - "END_SYMBOL": SpecialTextTokens.EOS_ID.value, - "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, - "tgt_emb_size": 1024, - "attention_type": "gnmt_v2", - "attention_layer_size": 1024, - }, - - "data_layer": ParallelTextDataLayer, - "data_layer_params": { - "pad_vocab_to_eight": pad_vocabs_2_eight, - "src_vocab_file": data_root+"vocab.bpe.32000", - "tgt_vocab_file": data_root+"vocab.bpe.32000", - "source_file": data_root+"newstest2014.tok.bpe.32000.en", - # this is intentional - "target_file": data_root+"newstest2014.tok.bpe.32000.en", - "delimiter": " ", - "shuffle": False, - "repeat": False, - "max_length": 512, - }, -} diff --git a/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod.py b/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod.py deleted file mode 100644 index 2826b4a86..000000000 --- 
a/example_configs/text2text/en-de/en-de-gnmt-like-8GPUs-horovod.py +++ /dev/null @@ -1,161 +0,0 @@ -from __future__ import absolute_import, division, print_function -import tensorflow as tf - -from open_seq2seq.models import Text2Text -from open_seq2seq.encoders import GNMTLikeEncoderWithEmbedding_cuDNN -from open_seq2seq.decoders import RNNDecoderWithAttention, \ - BeamSearchRNNDecoderWithAttention -from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer -from open_seq2seq.losses import BasicSequenceLoss -from open_seq2seq.data.text2text.text2text import SpecialTextTokens -from open_seq2seq.optimizers.lr_policies import exp_decay - -data_root = "/data/wmt16_s2s/" - -base_model = Text2Text - -base_params = { - "use_horovod": True, - "num_gpus": 1, # each Horovod process will occupy single GPU - "max_steps": 34000, - "batch_size_per_gpu": 64, - "save_summaries_steps": 50, - "print_loss_steps": 48, - "print_samples_steps": 48, - "eval_steps": 1000, - "save_checkpoint_steps": 2001, - "logdir": "GNMT-like-en-de", - "optimizer": "Adam", - "optimizer_params": {}, - # luong10 decay scheme - "lr_policy": exp_decay, - "lr_policy_params": { - "learning_rate": 0.0008, - "begin_decay_at": 17000, - "decay_steps": 1700, - "decay_rate": 0.5, - "use_staircase_decay": True, - "min_lr": 0.0000005, - }, - #"summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - "dtype": tf.float32, - #"dtype": "mixed", - #"loss_scaling": "Backoff", - "encoder": GNMTLikeEncoderWithEmbedding_cuDNN, - "encoder_params": { - "initializer": tf.random_uniform_initializer, - "initializer_params": { - "minval": -0.1, - "maxval": 0.1, - }, - "encoder_cell_type": "lstm", - "encoder_cell_units": 1024, - "encoder_layers": 7, - "encoder_dp_output_keep_prob": 1.0, - "src_emb_size": 1024, - }, - - "decoder": RNNDecoderWithAttention, - "decoder_params": { - "initializer": tf.random_uniform_initializer, - "initializer_params": { - "minval": -0.1, - "maxval": 0.1, - }, - "core_cell": tf.nn.rnn_cell.LSTMCell, - "core_cell_params": { - "num_units": 1024, - "forget_bias": 1.0, - }, - - "decoder_layers": 8, - "decoder_dp_input_keep_prob": 0.8, - "decoder_dp_output_keep_prob": 1.0, - "decoder_use_skip_connections": True, - "GO_SYMBOL": SpecialTextTokens.S_ID.value, - "END_SYMBOL": SpecialTextTokens.EOS_ID.value, - - "tgt_emb_size": 1024, - "attention_type": "gnmt_v2", - "attention_layer_size": 1024, - }, - - "loss": BasicSequenceLoss, - "loss_params": { - "offset_target_by_one": True, - "average_across_timestep": True, - "do_mask": True - } -} - -train_params = { - "data_layer": ParallelTextDataLayer, - "data_layer_params": { - "pad_vocab_to_eight": False, - "src_vocab_file": data_root+"vocab.bpe.32000", - "tgt_vocab_file": data_root+"vocab.bpe.32000", - "source_file": data_root+"train.tok.clean.bpe.32000.en", - "target_file": data_root+"train.tok.clean.bpe.32000.de", - "delimiter": " ", - "shuffle": True, - "repeat": True, - "map_parallel_calls": 16, - "prefetch_buffer_size": 8, - "max_length": 50, - }, -} -eval_params = { - "batch_size_per_gpu": 16, - "data_layer": ParallelTextDataLayer, - "data_layer_params": { - "pad_vocab_to_eight": False, - "src_vocab_file": data_root+"vocab.bpe.32000", - "tgt_vocab_file": data_root+"vocab.bpe.32000", - "source_file": data_root+"newstest2013.tok.bpe.32000.en", - "target_file": data_root+"newstest2013.tok.bpe.32000.de", - "delimiter": " ", - "shuffle": False, - "repeat": True, - "map_parallel_calls": 16, - 
"prefetch_buffer_size": 1, - "max_length": 32, - }, -} - -infer_params = { - "batch_size_per_gpu": 1, - "decoder": BeamSearchRNNDecoderWithAttention, - "decoder_params": { - "beam_width": 10, - "length_penalty": 1.0, - "core_cell": tf.nn.rnn_cell.LSTMCell, - "core_cell_params": { - "num_units": 1024, - }, - "decoder_layers": 8, - "decoder_dp_input_keep_prob": 0.8, - "decoder_dp_output_keep_prob": 1.0, - "decoder_use_skip_connections": True, - "GO_SYMBOL": SpecialTextTokens.S_ID.value, - "END_SYMBOL": SpecialTextTokens.EOS_ID.value, - "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, - "tgt_emb_size": 1024, - "attention_type": "gnmt_v2", - "attention_layer_size": 1024, - }, - - "data_layer": ParallelTextDataLayer, - "data_layer_params": { - "pad_vocab_to_eight": False, - "src_vocab_file": data_root+"vocab.bpe.32000", - "tgt_vocab_file": data_root+"vocab.bpe.32000", - "source_file": data_root+"newstest2014.tok.bpe.32000.en", - # this is intentional - "target_file": data_root+"newstest2014.tok.bpe.32000.en", - "delimiter": " ", - "shuffle": False, - "repeat": False, - "max_length": 512, - }, -} diff --git a/example_configs/text2text/en-de/transformer-base-test-mp.py b/example_configs/text2text/en-de/transformer-base-test-mp.py deleted file mode 100644 index 8b91e990b..000000000 --- a/example_configs/text2text/en-de/transformer-base-test-mp.py +++ /dev/null @@ -1,133 +0,0 @@ -from __future__ import absolute_import, division, print_function -from open_seq2seq.models import Text2Text -from open_seq2seq.encoders import TransformerEncoder -from open_seq2seq.decoders import TransformerDecoder -from open_seq2seq.data.text2text.text2text import TransformerDataLayer -from open_seq2seq.losses import PaddedCrossEntropyLossWithSmoothing -from open_seq2seq.data.text2text.text2text import SpecialTextTokens -from open_seq2seq.data.text2text.tokenizer import EOS_ID -from open_seq2seq.optimizers.lr_policies import transformer_policy -import tensorflow as tf - -""" -This configuration file describes a variant of Transformer model from -https://arxiv.org/abs/1706.03762 -""" - -base_model = Text2Text -d_model = 512 -num_layers = 6 - -data_root = "/tmp/translate_ende/" - -base_params = { - "use_horovod": False, - "num_gpus": 1, - "batch_size_per_gpu": 4096, # this size is in tokens - "max_steps": 500000, - "save_summaries_steps": 50, - "print_loss_steps": 50, - "print_samples_steps": 50, - "eval_steps": 4001, - "save_checkpoint_steps": 1000, - "logdir": "Transformer-MP", - #"dtype": tf.float32, - "dtype": "mixed", - "loss_scaling": "Backoff", - "optimizer": tf.contrib.opt.LazyAdamOptimizer, - "optimizer_params": { - "beta1": 0.9, - "beta2": 0.997, - "epsilon": 1e-09, - }, - - "lr_policy": transformer_policy, - "lr_policy_params": { - "learning_rate": 2.0, - "warmup_steps": 16000, - "d_model": d_model, - }, - - # "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - - "encoder": TransformerEncoder, - "encoder_params": { - "encoder_layers": num_layers, - "hidden_size": d_model, - "num_heads": 8, - "attention_dropout": 0.1, - "filter_size": 4 * d_model, - "relu_dropout": 0.1, - "layer_postprocess_dropout": 0.1, - "pad_embeddings_2_eight": True, - }, - - "decoder": TransformerDecoder, - "decoder_params": { - "layer_postprocess_dropout": 0.1, - "num_hidden_layers": num_layers, - "hidden_size": d_model, - "num_heads": 8, - "attention_dropout": 0.1, - "relu_dropout": 0.1, - "filter_size": 4 * d_model, - "beam_size": 4, - "alpha": 0.6, - 
"extra_decode_length": 50, - "EOS_ID": EOS_ID, - "GO_SYMBOL": SpecialTextTokens.S_ID.value, - "END_SYMBOL": SpecialTextTokens.EOS_ID.value, - "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, - }, - - "loss": PaddedCrossEntropyLossWithSmoothing, - "loss_params": { - "label_smoothing": 0.1, - } -} - -train_params = { - "data_layer": TransformerDataLayer, - "data_layer_params": { - 'data_dir': data_root, - 'file_pattern': "*train*", - 'src_vocab_file': data_root + "vocab.ende.32768", - 'max_length': 256, - 'shuffle': True, - 'repeat': 100000, - 'mode': 'train', - "delimiter": ' ', - }, -} - -eval_params = { - "batch_size_per_gpu": 256, - "data_layer": TransformerDataLayer, - "data_layer_params": { - 'data_dir': data_root, - 'file_pattern': "*dev*", - 'src_vocab_file': data_root + "vocab.ende.32768", - 'max_length': 256, - 'shuffle': False, - 'repeat': 1, - 'mode': 'train', - "delimiter": ' ', - }, -} - -infer_params = { - "batch_size_per_gpu": 64, # it is now in samples, not tokens - "data_layer": TransformerDataLayer, - "data_layer_params": { - 'data_dir': data_root, - 'file_pattern': "*test*", - 'batch_in_tokens': False, # this is necessary to preserve the order - 'src_vocab_file': data_root + "vocab.ende.32768", - 'max_length': 256, - 'shuffle': False, - 'repeat': 1, - 'mode': 'train', - "delimiter": ' ', - }, -} \ No newline at end of file diff --git a/example_configs/text2text/en-de/transformer-base-test.py b/example_configs/text2text/en-de/transformer-base-test.py deleted file mode 100644 index 0ba99384e..000000000 --- a/example_configs/text2text/en-de/transformer-base-test.py +++ /dev/null @@ -1,133 +0,0 @@ -from __future__ import absolute_import, division, print_function -from open_seq2seq.models import Text2Text -from open_seq2seq.encoders import TransformerEncoder -from open_seq2seq.decoders import TransformerDecoder -from open_seq2seq.data.text2text.text2text import TransformerDataLayer -from open_seq2seq.losses import PaddedCrossEntropyLossWithSmoothing -from open_seq2seq.data.text2text.text2text import SpecialTextTokens -from open_seq2seq.data.text2text.tokenizer import EOS_ID -from open_seq2seq.optimizers.lr_policies import transformer_policy -import tensorflow as tf - -""" -This configuration file describes a variant of Transformer model from -https://arxiv.org/abs/1706.03762 -""" - -base_model = Text2Text -d_model = 512 -num_layers = 6 - -data_root = "/tmp/translate_ende/" - -base_params = { - "use_horovod": False, - "num_gpus": 1, - "batch_size_per_gpu": 4096, # this size is in tokens - "max_steps": 500000, - "save_summaries_steps": 50, - "print_loss_steps": 50, - "print_samples_steps": 50, - "eval_steps": 4001, - "save_checkpoint_steps": 1000, - "logdir": "Transformer-FP32", - "dtype": tf.float32, - # "dtype": "mixed", - # "loss_scaling": "Backoff", - "optimizer": tf.contrib.opt.LazyAdamOptimizer, - "optimizer_params": { - "beta1": 0.9, - "beta2": 0.997, - "epsilon": 1e-09, - }, - - "lr_policy": transformer_policy, - "lr_policy_params": { - "learning_rate": 2.0, - "warmup_steps": 16000, - "d_model": d_model, - }, - - # "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', - # 'variable_norm', 'gradient_norm', 'global_gradient_norm'], - - "encoder": TransformerEncoder, - "encoder_params": { - "encoder_layers": num_layers, - "hidden_size": d_model, - "num_heads": 8, - "attention_dropout": 0.1, - "filter_size": 4 * d_model, - "relu_dropout": 0.1, - "layer_postprocess_dropout": 0.1, - "pad_embeddings_2_eight": True, - }, - - "decoder": TransformerDecoder, - 
"decoder_params": { - "layer_postprocess_dropout": 0.1, - "num_hidden_layers": num_layers, - "hidden_size": d_model, - "num_heads": 8, - "attention_dropout": 0.1, - "relu_dropout": 0.1, - "filter_size": 4 * d_model, - "beam_size": 4, - "alpha": 0.6, - "extra_decode_length": 50, - "EOS_ID": EOS_ID, - "GO_SYMBOL": SpecialTextTokens.S_ID.value, - "END_SYMBOL": SpecialTextTokens.EOS_ID.value, - "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, - }, - - "loss": PaddedCrossEntropyLossWithSmoothing, - "loss_params": { - "label_smoothing": 0.1, - } -} - -train_params = { - "data_layer": TransformerDataLayer, - "data_layer_params": { - 'data_dir': data_root, - 'file_pattern': "*train*", - 'src_vocab_file': data_root + "vocab.ende.32768", - 'max_length': 256, - 'shuffle': True, - 'repeat': 100000, - 'mode': 'train', - "delimiter": ' ', - }, -} - -eval_params = { - "batch_size_per_gpu": 256, - "data_layer": TransformerDataLayer, - "data_layer_params": { - 'data_dir': data_root, - 'file_pattern': "*dev*", - 'src_vocab_file': data_root + "vocab.ende.32768", - 'max_length': 256, - 'shuffle': False, - 'repeat': 1, - 'mode': 'train', - "delimiter": ' ', - }, -} - -infer_params = { - "batch_size_per_gpu": 64, # it is now in samples, not tokens - "data_layer": TransformerDataLayer, - "data_layer_params": { - 'data_dir': data_root, - 'file_pattern': "*test*", - 'batch_in_tokens': False, # this is necessary to preserve the order - 'src_vocab_file': data_root + "vocab.ende.32768", - 'max_length': 256, - 'shuffle': False, - 'repeat': 1, - 'mode': 'train', - "delimiter": ' ', - }, -} \ No newline at end of file diff --git a/example_configs/text2text/nmt-reversal-RR.py b/example_configs/text2text/nmt-reversal-RR.py index 0babbc646..ba76b2b12 100644 --- a/example_configs/text2text/nmt-reversal-RR.py +++ b/example_configs/text2text/nmt-reversal-RR.py @@ -37,17 +37,15 @@ 'learning_rate': 0.001 }, "max_grad_norm": 3.0, - #"dtype": tf.float32, - "dtype": "mixed", + "dtype": tf.float32, + #"dtype": "mixed", "encoder": BidirectionalRNNEncoderWithEmbedding, "encoder_params": { - #"encoder_cell_type": "lstm", - #"encoder_cell_units": 128, "core_cell": tf.nn.rnn_cell.LSTMCell, "core_cell_params": { "num_units": 128, - #"forget_bias": 1.0, + "forget_bias": 1.0, }, "encoder_layers": 1, "encoder_dp_input_keep_prob": 0.8, @@ -58,10 +56,7 @@ "decoder": RNNDecoderWithAttention, "decoder_params": { - #"decoder_cell_type": "lstm", - #"decoder_cell_units": 128, "core_cell": tf.nn.rnn_cell.LSTMCell, - # tf.nn.rnn_cell.LSTMCell, "core_cell_params": { "num_units": 128, # "forget_bias": 1.0, @@ -121,11 +116,10 @@ "decoder_params": { #"decoder_cell_type": "lstm", #"decoder_cell_units": 128, - "core_cell": tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell, - # tf.nn.rnn_cell.LSTMCell, + "core_cell": tf.nn.rnn_cell.LSTMCell, "core_cell_params": { "num_units": 128, - # "forget_bias": 1.0, + "forget_bias": 1.0, }, "decoder_layers": 1, "decoder_dp_input_keep_prob": 0.8, diff --git a/example_configs/text2text/nmt-reversal-RT.py b/example_configs/text2text/nmt-reversal-RT.py index 0b04d9a3b..0d1247171 100644 --- a/example_configs/text2text/nmt-reversal-RT.py +++ b/example_configs/text2text/nmt-reversal-RT.py @@ -44,8 +44,10 @@ "encoder": BidirectionalRNNEncoderWithEmbedding, "encoder_params": { - "encoder_cell_type": "lstm", - "encoder_cell_units": 128, + "core_cell_params": { + "num_units": 128, + "forget_bias": 1.0, + }, "encoder_layers": 1, "encoder_dp_input_keep_prob": 0.8, "encoder_dp_output_keep_prob": 1.0, diff --git 
a/example_configs/text2text/en-de/transformer-base.py b/example_configs/text2text/transformer-big.py similarity index 98% rename from example_configs/text2text/en-de/transformer-base.py rename to example_configs/text2text/transformer-big.py index 6f2e3e52e..fd819bca3 100644 --- a/example_configs/text2text/en-de/transformer-base.py +++ b/example_configs/text2text/transformer-big.py @@ -33,7 +33,7 @@ "logdir": "Transformer-FP32", "dtype": tf.float32, # "dtype": "mixed", - # "loss_scaling": "Backoff", + # "automatic_loss_scaling": "Backoff", "optimizer": tf.contrib.opt.LazyAdamOptimizer, "optimizer_params": { "beta1": 0.9, From 2f00013c913eccc4353f4132f94a981f7862220f Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Fri, 22 Jun 2018 17:39:05 -0700 Subject: [PATCH 080/102] adjust config --- example_configs/text2text/en-de-gnmt-like-4GPUs.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/example_configs/text2text/en-de-gnmt-like-4GPUs.py b/example_configs/text2text/en-de-gnmt-like-4GPUs.py index 6d13440f0..585bf4389 100644 --- a/example_configs/text2text/en-de-gnmt-like-4GPUs.py +++ b/example_configs/text2text/en-de-gnmt-like-4GPUs.py @@ -10,8 +10,7 @@ from open_seq2seq.data.text2text.text2text import SpecialTextTokens from open_seq2seq.optimizers.lr_policies import exp_decay -#data_root = "[REPLACE THIS TO THE PATH WITH YOUR WMT DATA]" -data_root = "/mnt/D1/Data/Translate/wmt16/" +data_root = "[REPLACE THIS TO THE PATH WITH YOUR WMT DATA]" base_model = Text2Text From 6568ee98f79fb9e9aa292a6ef79ede3d5bac453f Mon Sep 17 00:00:00 2001 From: Kipok Date: Mon, 25 Jun 2018 09:51:38 -0700 Subject: [PATCH 081/102] Add training_step to print_logs and finialize_evaluation --- open_seq2seq/models/image2label.py | 4 ++-- open_seq2seq/models/model.py | 7 +++++-- open_seq2seq/models/speech2text.py | 4 ++-- open_seq2seq/models/speech2text_test.py | 2 +- open_seq2seq/models/text2text.py | 4 ++-- open_seq2seq/utils/hooks.py | 4 ++-- 6 files changed, 14 insertions(+), 11 deletions(-) diff --git a/open_seq2seq/models/image2label.py b/open_seq2seq/models/image2label.py index 9e7e4380a..1c8565487 100644 --- a/open_seq2seq/models/image2label.py +++ b/open_seq2seq/models/image2label.py @@ -12,7 +12,7 @@ class Image2Label(EncoderDecoderModel): - def maybe_print_logs(self, input_values, output_values): + def maybe_print_logs(self, input_values, output_values, training_step): labels = input_values['target_tensors'][0] logits = output_values[0] @@ -31,7 +31,7 @@ def maybe_print_logs(self, input_values, output_values): "Train batch top-5": top5, } - def finalize_evaluation(self, results_per_batch): + def finalize_evaluation(self, results_per_batch, training_step=None): top1 = 0.0 top5 = 0.0 total = 0.0 diff --git a/open_seq2seq/models/model.py b/open_seq2seq/models/model.py index b84136b5b..0d5761ee6 100644 --- a/open_seq2seq/models/model.py +++ b/open_seq2seq/models/model.py @@ -457,7 +457,7 @@ def _build_forward_pass_graph(self, input_tensors, gpu_id=0): """ pass - def maybe_print_logs(self, input_values, output_values): + def maybe_print_logs(self, input_values, output_values, training_step): """This method can be used to print logs that help to visualize training. For example, you can print sample input sequences and their corresponding predictions. 
This method will be called every ``print_samples_steps`` @@ -475,6 +475,7 @@ def maybe_print_logs(self, input_values, output_values): output_values: evaluation of :meth:`self.get_output_tensors(0) `, that is, output tensors for one batch on the *first* GPU. + training_step (int): Current training step. Returns: dict: dictionary with values that need to be logged to TensorBoard @@ -519,7 +520,7 @@ def evaluate(self, input_values, output_values): """ return [] - def finalize_evaluation(self, results_per_batch): + def finalize_evaluation(self, results_per_batch, training_step=None): """This method can be used in conjunction with :meth:`self.evaluate()` to calculate evaluation metrics. @@ -542,6 +543,8 @@ def finalize_evaluation(self, results_per_batch): results_per_batch (list): aggregation of values returned from all calls to :meth:`self.evaluate()` method (number of calls will be equal to number of evaluation batches). + training_step (int): current training step. Will only be passed if mode + is "train_eval". Returns: dict: dictionary with values that need to be logged to TensorBoard diff --git a/open_seq2seq/models/speech2text.py b/open_seq2seq/models/speech2text.py index 976817bb2..62f1cf001 100644 --- a/open_seq2seq/models/speech2text.py +++ b/open_seq2seq/models/speech2text.py @@ -49,7 +49,7 @@ def _create_decoder(self): ) return super(Speech2Text, self)._create_decoder() - def maybe_print_logs(self, input_values, output_values): + def maybe_print_logs(self, input_values, output_values, training_step): y, len_y = input_values['target_tensors'] decoded_sequence = output_values y_one_sample = y[0] @@ -74,7 +74,7 @@ def maybe_print_logs(self, input_values, output_values): 'Sample WER': sample_wer, } - def finalize_evaluation(self, results_per_batch): + def finalize_evaluation(self, results_per_batch, training_step=None): total_word_lev = 0.0 total_word_count = 0.0 for word_lev, word_count in results_per_batch: diff --git a/open_seq2seq/models/speech2text_test.py b/open_seq2seq/models/speech2text_test.py index feed83b75..b069a8fe7 100644 --- a/open_seq2seq/models/speech2text_test.py +++ b/open_seq2seq/models/speech2text_test.py @@ -300,7 +300,7 @@ def test_maybe_functions(self): inp_dict = {'source_tensors': [input_values[0][0], input_values[0][1]], 'target_tensors': [input_values[0][2], input_values[0][3]]} - output_dict = model.maybe_print_logs(inp_dict, output_values[0]) + output_dict = model.maybe_print_logs(inp_dict, output_values[0], 0) self.assertEqual(output_dict['Sample WER'], 0.4) diff --git a/open_seq2seq/models/text2text.py b/open_seq2seq/models/text2text.py index baef575a3..8ef385339 100644 --- a/open_seq2seq/models/text2text.py +++ b/open_seq2seq/models/text2text.py @@ -111,7 +111,7 @@ def finalize_inference(self, results_per_batch, output_file): deco_print("") step += 1 - def maybe_print_logs(self, input_values, output_values): + def maybe_print_logs(self, input_values, output_values, training_step): x, len_x = input_values['source_tensors'] y, len_y = input_values['target_tensors'] samples = output_values[0] @@ -201,7 +201,7 @@ def evaluate(self, input_values, output_values): return preds, targets - def finalize_evaluation(self, results_per_batch): + def finalize_evaluation(self, results_per_batch, training_step=None): preds, targets = [], [] for preds_cur, targets_cur in results_per_batch: if self.params.get('eval_using_bleu', True): diff --git a/open_seq2seq/utils/hooks.py b/open_seq2seq/utils/hooks.py index 5ee72799f..037ba4346 100644 --- a/open_seq2seq/utils/hooks.py 
+++ b/open_seq2seq/utils/hooks.py @@ -89,7 +89,7 @@ def after_run(self, run_context, run_values): self._timer.update_last_triggered_step(self._iter_count - 1) input_values, output_values = results - dict_to_log = self._model.maybe_print_logs(input_values, output_values) + dict_to_log = self._model.maybe_print_logs(input_values, output_values, step) # optionally logging to tensorboard any values # returned from maybe_print_logs if self._model.params['save_summaries_steps'] and dict_to_log: @@ -193,7 +193,7 @@ def after_run(self, run_context, run_values): if not self._model.on_horovod or self._model.hvd.rank() == 0: deco_print("Validation loss: {:.4f}".format(total_loss), offset=4) - dict_to_log = self._model.finalize_evaluation(results_per_batch) + dict_to_log = self._model.finalize_evaluation(results_per_batch, step) dict_to_log['eval_loss'] = total_loss # saving the best validation model From 16725d0686dd88242dcc7caa85a90c354665fb96 Mon Sep 17 00:00:00 2001 From: Kipok Date: Mon, 25 Jun 2018 10:31:33 -0700 Subject: [PATCH 082/102] Add nested_updates for different modes --- open_seq2seq/utils/utils.py | 9 ++++++++- run.py | 7 ++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/open_seq2seq/utils/utils.py b/open_seq2seq/utils/utils.py index 54d22893b..b9db4ff6d 100644 --- a/open_seq2seq/utils/utils.py +++ b/open_seq2seq/utils/utils.py @@ -315,7 +315,14 @@ def nest_dict(flat_dict): def nested_update(org_dict, upd_dict): for key, value in upd_dict.items(): if isinstance(value, dict): - nested_update(org_dict[key], value) + if key in org_dict: + if not isinstance(org_dict[key], dict): + raise ValueError( + "Mismatch between org_dict and upd_dict at node {}".format(key) + ) + nested_update(org_dict[key], value) + else: + org_dict[key] = value else: org_dict[key] = value diff --git a/run.py b/run.py index 93e5f1a23..8949b2b10 100644 --- a/run.py +++ b/run.py @@ -170,13 +170,13 @@ def main(): if args.mode == 'train' or args.mode == 'train_eval': if 'train_params' in config_module: - train_config.update(copy.deepcopy(config_module['train_params'])) + nested_update(train_config, copy.deepcopy(config_module['train_params'])) if hvd is None or hvd.rank() == 0: deco_print("Training config:") pprint.pprint(train_config) if args.mode == 'eval' or args.mode == 'train_eval': if 'eval_params' in config_module: - eval_config.update(copy.deepcopy(config_module['eval_params'])) + nested_update(eval_config, copy.deepcopy(config_module['eval_params'])) if hvd is None or hvd.rank() == 0: deco_print("Evaluation config:") pprint.pprint(eval_config) @@ -184,7 +184,8 @@ def main(): if args.infer_output_file is None: raise ValueError("\"infer_output_file\" command line parameter is " "required in inference mode") - infer_config.update(copy.deepcopy(config_module['infer_params'])) + if "infer_params" in config_module: + nested_update(infer_config, copy.deepcopy(config_module['infer_params'])) if hvd is None or hvd.rank() == 0: deco_print("Inference config:") From a9fa9ee15512bc07e4b571322dd18c26307e76b5 Mon Sep 17 00:00:00 2001 From: Kipok Date: Mon, 25 Jun 2018 14:51:45 -0700 Subject: [PATCH 083/102] Rename samples -> outputs in decoder_output --- open_seq2seq/decoders/decoder.py | 3 ++- open_seq2seq/decoders/fc_decoders.py | 12 +++++----- open_seq2seq/decoders/rnn_decoders.py | 10 ++++---- open_seq2seq/decoders/transformer_decoder.py | 24 ++++++++------------ open_seq2seq/models/encoder_decoder.py | 8 +++---- open_seq2seq/models/model.py | 8 +++---- 6 files changed, 31 insertions(+), 34 
deletions(-) diff --git a/open_seq2seq/decoders/decoder.py b/open_seq2seq/decoders/decoder.py index 92d8f7283..4eb811b3b 100644 --- a/open_seq2seq/decoders/decoder.py +++ b/open_seq2seq/decoders/decoder.py @@ -168,7 +168,8 @@ def _decode(self, input_dict): { "logits": logits that will be passed to Loss - "samples": actual decoded output, e.g. characters instead of logits + "outputs": list with actual decoded outputs, e.g. characters + instead of logits } """ pass diff --git a/open_seq2seq/decoders/fc_decoders.py b/open_seq2seq/decoders/fc_decoders.py index 55d59a8f7..46106b135 100644 --- a/open_seq2seq/decoders/fc_decoders.py +++ b/open_seq2seq/decoders/fc_decoders.py @@ -54,7 +54,7 @@ def _decode(self, input_dict): { 'logits': logits with the shape=[batch_size, output_dim] - 'samples': [logits] (same as logits but wrapped in list) + 'outputs': [logits] (same as logits but wrapped in list) } """ inputs = input_dict['encoder_output']['outputs'] @@ -67,7 +67,7 @@ def _decode(self, input_dict): kernel_regularizer=regularizer, name='fully_connected', ) - return {'logits': logits, 'samples': [logits]} + return {'logits': logits, 'outputs': [logits]} class FullyConnectedTimeDecoder(Decoder): @@ -97,7 +97,7 @@ def __init__(self, params, model, * **tgt_vocab_size** (int) --- target vocabulary size, i.e. number of output features. * **logits_to_outputs_func** --- function that maps produced logits to - decoder samples, i.e. actual text sequences. + decoder outputs, i.e. actual text sequences. """ super(FullyConnectedTimeDecoder, self).__init__(params, model, name, mode) @@ -119,7 +119,7 @@ def _decode(self, input_dict): { 'logits': logits with the shape=[time length, batch_size, tgt_vocab_size] - 'samples': logits_to_outputs_func(logits, input_dict) + 'outputs': logits_to_outputs_func(logits, input_dict) } """ inputs = input_dict['encoder_output']['outputs'] @@ -146,9 +146,9 @@ def _decode(self, input_dict): logits = tf.transpose(logits, [1, 0, 2]) if 'logits_to_outputs_func' in self.params: - samples = self.params['logits_to_outputs_func'](logits, input_dict) + outputs = self.params['logits_to_outputs_func'](logits, input_dict) return { - 'samples': samples, + 'outputs': outputs, 'logits': logits, 'src_length': input_dict['encoder_output']['src_length'], } diff --git a/open_seq2seq/decoders/rnn_decoders.py b/open_seq2seq/decoders/rnn_decoders.py index 522ef28a9..096181d89 100644 --- a/open_seq2seq/decoders/rnn_decoders.py +++ b/open_seq2seq/decoders/rnn_decoders.py @@ -278,8 +278,8 @@ def _decode(self, input_dict): ) return {'logits': final_outputs.rnn_output if not time_major else - tf.transpose(final_outputs.rnn_output, perm=[1, 0, 2]), - 'samples': [tf.argmax(final_outputs.rnn_output, axis=-1)], + tf.transpose(final_outputs.rnn_output, perm=[1, 0, 2]), + 'outputs': [tf.argmax(final_outputs.rnn_output, axis=-1)], 'final_state': final_state, 'final_sequence_lengths': final_sequence_lengths} @@ -438,7 +438,7 @@ def _decode(self, input_dict): embedding_fn = lambda ids: tf.cast( tf.nn.embedding_lookup(self._dec_emb_w, ids), dtype=self.params['dtype']) - #decoder = tf.contrib.seq2seq.BeamSearchDecoder( + # decoder = tf.contrib.seq2seq.BeamSearchDecoder( decoder = BeamSearchDecoder( cell=attentive_decoder_cell, embedding=embedding_fn, @@ -464,7 +464,7 @@ def _decode(self, input_dict): ) return {'logits': final_outputs.predicted_ids[:, :, 0] if not time_major else - tf.transpose(final_outputs.predicted_ids[:, :, 0], perm=[1, 0, 2]), - 'samples': [final_outputs.predicted_ids[:, :, 0]], + 
tf.transpose(final_outputs.predicted_ids[:, :, 0], perm=[1, 0, 2]), + 'outputs': [final_outputs.predicted_ids[:, :, 0]], 'final_state': final_state, 'final_sequence_lengths': final_sequence_lengths} diff --git a/open_seq2seq/decoders/transformer_decoder.py b/open_seq2seq/decoders/transformer_decoder.py index 2300b3550..dd7a2080a 100644 --- a/open_seq2seq/decoders/transformer_decoder.py +++ b/open_seq2seq/decoders/transformer_decoder.py @@ -10,7 +10,8 @@ from open_seq2seq.parts.transformer import utils, attention_layer, \ ffn_layer, beam_search from open_seq2seq.parts.transformer.common import PrePostProcessingWrapper, \ - LayerNormalization + LayerNormalization + class TransformerDecoder(Decoder): @staticmethod @@ -71,7 +72,7 @@ def __init__(self, params, model, self.layers = [] def _decode(self, input_dict): - #targets = input_dict['tgt_sequence'] + # targets = input_dict['tgt_sequence'] targets = input_dict['target_tensors'][0] if 'target_tensors' \ in input_dict else None encoder_outputs = input_dict['encoder_output']['outputs'] @@ -110,13 +111,12 @@ def _decode(self, input_dict): else: logits = self.decode_pass(targets, encoder_outputs, inputs_attention_bias) return {"logits": logits, - "samples": [tf.argmax(logits, axis=-1)], + "outputs": [tf.argmax(logits, axis=-1)], "final_state": None, "final_sequence_lengths": None} - def _call(self, decoder_inputs, encoder_outputs, decoder_self_attention_bias, - attention_bias, cache=None): + attention_bias, cache=None): for n, layer in enumerate(self.layers): self_attention_layer = layer[0] enc_dec_attention_layer = layer[1] @@ -128,7 +128,7 @@ def _call(self, decoder_inputs, encoder_outputs, decoder_self_attention_bias, with tf.variable_scope(layer_name): with tf.variable_scope("self_attention"): # TODO: Figure out why this is needed - #decoder_self_attention_bias = tf.cast(x=decoder_self_attention_bias, + # decoder_self_attention_bias = tf.cast(x=decoder_self_attention_bias, # dtype=decoder_inputs.dtype) decoder_inputs = self_attention_layer( decoder_inputs, decoder_self_attention_bias, cache=layer_cache) @@ -140,7 +140,6 @@ def _call(self, decoder_inputs, encoder_outputs, decoder_self_attention_bias, return self.output_normalization(decoder_inputs) - def decode_pass(self, targets, encoder_outputs, inputs_attention_bias): """Generate logits for each value in the target sequence. 
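With this rename, the dictionary returned by every decoder's _decode() carries its decoded results under "outputs" instead of "samples", and the value is expected to be a list (model.compile() now rejects anything that is neither None nor a list). A minimal sketch of the contract for a hypothetical custom decoder, assuming a helper _compute_logits that is not part of the actual API:

def _decode(self, input_dict):
  logits = self._compute_logits(input_dict)  # hypothetical helper
  return {
      "logits": logits,                         # consumed by the loss
      "outputs": [tf.argmax(logits, axis=-1)],  # a list (or None), never a bare tensor
  }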
@@ -163,7 +162,7 @@ def decode_pass(self, targets, encoder_outputs, inputs_attention_bias): decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :] with tf.name_scope("add_pos_encoding"): length = tf.shape(decoder_inputs)[1] - #decoder_inputs += utils.get_position_encoding( + # decoder_inputs += utils.get_position_encoding( # length, self.params["hidden_size"]) decoder_inputs += tf.cast(utils.get_position_encoding( length, self.params["hidden_size"]), dtype=self.params['dtype']) @@ -172,11 +171,10 @@ def decode_pass(self, targets, encoder_outputs, inputs_attention_bias): decoder_inputs, 1 - self.params["layer_postprocess_dropout"]) # Run values - #decoder_self_attention_bias = tf.cast(x=utils.get_decoder_self_attention_bias( + # decoder_self_attention_bias = tf.cast(x=utils.get_decoder_self_attention_bias( # length), dtype=decoder_inputs.dtype) decoder_self_attention_bias = utils.get_decoder_self_attention_bias(length) - # do decode outputs = self._call(decoder_inputs=decoder_inputs, encoder_outputs=encoder_outputs, @@ -191,7 +189,7 @@ def _get_symbols_to_logits_fn(self, max_decode_length): timing_signal = utils.get_position_encoding( max_decode_length + 1, self.params["hidden_size"]) - #decoder_self_attention_bias = tf.cast(x=utils.get_decoder_self_attention_bias( + # decoder_self_attention_bias = tf.cast(x=utils.get_decoder_self_attention_bias( # max_decode_length), dtype=self.params['dtype']) decoder_self_attention_bias = utils.get_decoder_self_attention_bias( max_decode_length) @@ -279,8 +277,6 @@ def predict(self, encoder_outputs, encoder_decoder_attention_bias): # tf.shape(top_decoded_ids)[1], # self.params["tgt_vocab_size"]]), "logits": logits, - "samples": [top_decoded_ids], + "outputs": [top_decoded_ids], "final_state": None, "final_sequence_lengths": None} - - diff --git a/open_seq2seq/models/encoder_decoder.py b/open_seq2seq/models/encoder_decoder.py index cc74f0871..4a3858b3f 100644 --- a/open_seq2seq/models/encoder_decoder.py +++ b/open_seq2seq/models/encoder_decoder.py @@ -130,8 +130,8 @@ def _build_forward_pass_graph(self, input_tensors, gpu_id=0): Returns: tuple: tuple containing loss tensor as returned from - ``loss.compute_loss()`` and samples tensor, which is taken from - ``decoder.decode()['samples']``. When ``mode == 'infer'``, loss will + ``loss.compute_loss()`` and list of outputs tensors, which is taken from + ``decoder.decode()['outputs']``. When ``mode == 'infer'``, loss will be None. """ if not isinstance(input_tensors, dict) or \ @@ -159,7 +159,7 @@ def _build_forward_pass_graph(self, input_tensors, gpu_id=0): if self.mode == "train": decoder_input['target_tensors'] = target_tensors decoder_output = self.decoder.decode(input_dict=decoder_input) - decoder_samples = decoder_output.get("samples", None) + model_outputs = decoder_output.get("outputs", None) if self.mode == "train" or self.mode == "eval": with tf.variable_scope("Loss"): @@ -171,7 +171,7 @@ def _build_forward_pass_graph(self, input_tensors, gpu_id=0): else: deco_print("Inference Mode. 
Loss part of graph isn't built.") loss = None - return loss, decoder_samples + return loss, model_outputs @property def encoder(self): diff --git a/open_seq2seq/models/model.py b/open_seq2seq/models/model.py index 0d5761ee6..588dd8450 100644 --- a/open_seq2seq/models/model.py +++ b/open_seq2seq/models/model.py @@ -331,7 +331,7 @@ def compile(self, force_var_reuse=False): ) if self._outputs[gpu_cnt] is not None and \ not isinstance(self._outputs[gpu_cnt], list): - raise ValueError('Decoder samples have to be either None or list') + raise ValueError('Decoder outputs have to be either None or list') if self._mode == "train" or self._mode == "eval": losses.append(loss) # end of for gpu_ind loop @@ -357,7 +357,7 @@ def compile(self, force_var_reuse=False): loss, self._output = self._build_forward_pass_graph(input_tensors, gpu_id=0) if self._output is not None and not isinstance(self._output, list): - raise ValueError('Decoder samples have to be either None or list') + raise ValueError('Decoder outputs have to be either None or list') if self._mode == "train": self.loss = loss @@ -442,7 +442,7 @@ def _build_forward_pass_graph(self, input_tensors, gpu_id=0): is constructed. For Horovod this is always zero. Returns: - tuple: tuple containing loss tensor and samples tensor. + tuple: tuple containing loss tensor and list of outputs tensors. Loss tensor will be automatically provided to the optimizer and corresponding :attr:`train_op` will be created. @@ -452,7 +452,7 @@ def _build_forward_pass_graph(self, input_tensors, gpu_id=0): this happens inside :class:`utils.hooks.RunEvaluationHook` to fetch output values for evaluation. - Both loss and samples can be None when corresponding part of the graph + Both loss and outputs can be None when corresponding part of the graph is not built. """ pass From 91702d1a8722e88964c7afc5fe92482708b3df19 Mon Sep 17 00:00:00 2001 From: Vitaly Lavrukhin Date: Wed, 27 Jun 2018 14:06:28 -0700 Subject: [PATCH 084/102] Added audio normalization, switched to padding with zeros --- open_seq2seq/data/speech2text/speech_utils.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/open_seq2seq/data/speech2text/speech_utils.py b/open_seq2seq/data/speech2text/speech_utils.py index 73bd9f1a9..53357cc21 100644 --- a/open_seq2seq/data/speech2text/speech_utils.py +++ b/open_seq2seq/data/speech2text/speech_utils.py @@ -42,6 +42,13 @@ def get_speech_features_from_file(filename, num_features, pad_to=8, ) +def normalize_signal(signal): + """ + Normalize float32 signal to [-1, 1] range + """ + return signal / np.max(np.abs(signal)) + + def augment_audio_signal(signal, fs, augmentation): """Function that performs audio signal augmentation. @@ -53,7 +60,7 @@ def augment_audio_signal(signal, fs, augmentation): Returns: np.array: np.array with augmented audio signal. 
""" - signal_float = signal.astype(np.float32) / 32768.0 + signal_float = normalize_signal(signal.astype(np.float32)) if augmentation['time_stretch_ratio'] > 0: # time stretch (might be slow) @@ -72,7 +79,7 @@ def augment_audio_signal(signal, fs, augmentation): signal_float += np.random.randn(signal_float.shape[0]) * \ 10.0 ** (noise_level_db / 20.0) - return (signal_float * 32768.0).astype(np.int16) + return (normalize_signal(signal_float) * 32767.0).astype(np.int16) def get_speech_features(signal, fs, num_features, pad_to=8, @@ -118,7 +125,7 @@ def get_speech_features(signal, fs, num_features, pad_to=8, if pad_to > 0: if length % pad_to != 0: pad_size = (pad_to - length % pad_to) * n_window_stride - signal = np.pad(signal, (0, pad_size), mode='reflect') + signal = np.pad(signal, (0, pad_size), mode='constant') if features_type == 'spectrogram': frames = psf.sigproc.framesig(sig=signal, From 845a5850a41b55566915e551428e0066f16c1a6d Mon Sep 17 00:00:00 2001 From: Vitaly Lavrukhin Date: Wed, 27 Jun 2018 14:22:03 -0700 Subject: [PATCH 085/102] Added AUTOTUNE flag to prefetch in data layers --- open_seq2seq/data/image2label/image2label.py | 4 ++-- open_seq2seq/data/speech2text/speech2text.py | 2 +- open_seq2seq/data/text2text/text2text.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/open_seq2seq/data/image2label/image2label.py b/open_seq2seq/data/image2label/image2label.py index 32e643c43..9255946f8 100644 --- a/open_seq2seq/data/image2label/image2label.py +++ b/open_seq2seq/data/image2label/image2label.py @@ -129,7 +129,7 @@ def build_graph(self): ) dataset = dataset.batch(self.params['batch_size']) - dataset = dataset.prefetch(1) + dataset = dataset.prefetch(tf.contrib.AUTOTUNE) self._iterator = dataset.make_initializable_iterator() inputs, labels = self.iterator.get_next() @@ -231,7 +231,7 @@ def build_graph(self): ) dataset = dataset.batch(self.params['batch_size']) - dataset = dataset.prefetch(1) + dataset = dataset.prefetch(tf.contrib.AUTOTUNE) self._iterator = dataset.make_initializable_iterator() inputs, labels = self.iterator.get_next() diff --git a/open_seq2seq/data/speech2text/speech2text.py b/open_seq2seq/data/speech2text/speech2text.py index f6f039fc1..f4f174af2 100644 --- a/open_seq2seq/data/speech2text/speech2text.py +++ b/open_seq2seq/data/speech2text/speech2text.py @@ -145,7 +145,7 @@ def build_graph(self): padded_shapes=([None, self.params['num_audio_features']], 1, 1) ) - self._iterator = self._dataset.prefetch(8).make_initializable_iterator() + self._iterator = self._dataset.prefetch(tf.contrib.AUTOTUNE).make_initializable_iterator() if self.params['mode'] != 'infer': x, x_length, y, y_length = self._iterator.get_next() diff --git a/open_seq2seq/data/text2text/text2text.py b/open_seq2seq/data/text2text/text2text.py index 1206d5be7..6604a2dcb 100644 --- a/open_seq2seq/data/text2text/text2text.py +++ b/open_seq2seq/data/text2text/text2text.py @@ -77,7 +77,7 @@ def __init__(self, params, model, num_workers=1, worker_id=0): self._delimiter = self.params.get('delimiter', ' ') self._map_parallel_calls = self.params.get('map_parallel_calls', 8) self._pad_lengths_to_eight = self.params.get('pad_lengths_to_eight', False) - self._prefetch_buffer_size = self.params.get('prefetch_buffer_size', 4) + self._prefetch_buffer_size = self.params.get('prefetch_buffer_size', tf.contrib.AUTOTUNE) self._num_workers = num_workers self._worker_id = worker_id if self._pad_lengths_to_eight and not (self.params['max_length'] % 8 == 0): From 
479f1ce3a021e46f3be01e6e67efb9020e63d80a Mon Sep 17 00:00:00 2001 From: Kipok Date: Wed, 27 Jun 2018 18:23:10 -0700 Subject: [PATCH 086/102] Fix typo with AUTOTUNE --- open_seq2seq/data/image2label/image2label.py | 4 ++-- open_seq2seq/data/speech2text/speech2text.py | 3 ++- open_seq2seq/data/text2text/text2text.py | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/open_seq2seq/data/image2label/image2label.py b/open_seq2seq/data/image2label/image2label.py index 9255946f8..a16023327 100644 --- a/open_seq2seq/data/image2label/image2label.py +++ b/open_seq2seq/data/image2label/image2label.py @@ -129,7 +129,7 @@ def build_graph(self): ) dataset = dataset.batch(self.params['batch_size']) - dataset = dataset.prefetch(tf.contrib.AUTOTUNE) + dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE) self._iterator = dataset.make_initializable_iterator() inputs, labels = self.iterator.get_next() @@ -231,7 +231,7 @@ def build_graph(self): ) dataset = dataset.batch(self.params['batch_size']) - dataset = dataset.prefetch(tf.contrib.AUTOTUNE) + dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE) self._iterator = dataset.make_initializable_iterator() inputs, labels = self.iterator.get_next() diff --git a/open_seq2seq/data/speech2text/speech2text.py b/open_seq2seq/data/speech2text/speech2text.py index f4f174af2..20dc94bd0 100644 --- a/open_seq2seq/data/speech2text/speech2text.py +++ b/open_seq2seq/data/speech2text/speech2text.py @@ -145,7 +145,8 @@ def build_graph(self): padded_shapes=([None, self.params['num_audio_features']], 1, 1) ) - self._iterator = self._dataset.prefetch(tf.contrib.AUTOTUNE).make_initializable_iterator() + self._iterator = self._dataset.prefetch(tf.contrib.data.AUTOTUNE)\ + .make_initializable_iterator() if self.params['mode'] != 'infer': x, x_length, y, y_length = self._iterator.get_next() diff --git a/open_seq2seq/data/text2text/text2text.py b/open_seq2seq/data/text2text/text2text.py index 6604a2dcb..4f87ae2ad 100644 --- a/open_seq2seq/data/text2text/text2text.py +++ b/open_seq2seq/data/text2text/text2text.py @@ -77,7 +77,8 @@ def __init__(self, params, model, num_workers=1, worker_id=0): self._delimiter = self.params.get('delimiter', ' ') self._map_parallel_calls = self.params.get('map_parallel_calls', 8) self._pad_lengths_to_eight = self.params.get('pad_lengths_to_eight', False) - self._prefetch_buffer_size = self.params.get('prefetch_buffer_size', tf.contrib.AUTOTUNE) + self._prefetch_buffer_size = self.params.get('prefetch_buffer_size', + tf.contrib.data.AUTOTUNE) self._num_workers = num_workers self._worker_id = worker_id if self._pad_lengths_to_eight and not (self.params['max_length'] % 8 == 0): From b810de81c237c719e7545b8b1937abbc8d6d5a9c Mon Sep 17 00:00:00 2001 From: Kipok Date: Thu, 28 Jun 2018 10:57:21 -0700 Subject: [PATCH 087/102] Code style fixes for run.py --- run.py | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/run.py b/run.py index 8949b2b10..b2cef97bf 100644 --- a/run.py +++ b/run.py @@ -3,9 +3,7 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals -from six.moves import range -import tensorflow as tf import datetime import argparse import ast @@ -16,6 +14,10 @@ import sys import shutil +import tensorflow as tf +from six.moves import range + + from open_seq2seq.utils.utils import deco_print, flatten_dict, \ nest_dict, nested_update, get_git_diff, \ get_git_hash, Logger @@ -66,9 +68,10 @@ def main(): # with command line arguments 
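The prefetch change and its follow-up fix boil down to the same pattern in each data layer; a minimal sketch, where dataset and batch_size stand in for the layer's own objects:

dataset = dataset.batch(batch_size)
# tf.contrib.data.AUTOTUNE lets tf.data size the prefetch buffer at runtime
# instead of hard-coding prefetch(1) or prefetch(8); the constant lives under
# tf.contrib.data, not tf.contrib, which is what patch 086 corrects
dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
iterator = dataset.make_initializable_iterator()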
that were passed to the script parser_unk = argparse.ArgumentParser() for pm, value in flatten_dict(base_config).items(): - if type(value) is int or type(value) is float or type(value) is str: + if isinstance(value, int) or isinstance(value, float) or \ + isinstance(value, str): parser_unk.add_argument('--' + pm, default=value, type=type(value)) - elif type(value) is bool: + elif isinstance(value, bool): parser_unk.add_argument('--' + pm, default=value, type=ast.literal_eval) config_update = parser_unk.parse_args(unknown) nested_update(base_config, nest_dict(vars(config_update))) @@ -97,8 +100,8 @@ def main(): checkpoint = tf.train.latest_checkpoint(ckpt_dir) if checkpoint is None: raise IOError( - "There is no valid TensorFlow checkpoint in the " - "{} directory. Can't load model".format(ckpt_dir) + "There is no valid TensorFlow checkpoint in the " + "{} directory. Can't load model".format(ckpt_dir) ) else: if args.continue_learning: @@ -111,12 +114,14 @@ def main(): checkpoint = tf.train.latest_checkpoint(ckpt_dir) if checkpoint is None: raise IOError( - "There is no valid TensorFlow checkpoint in the " - "{} directory. Can't load model".format(ckpt_dir) + "There is no valid TensorFlow checkpoint in the " + "{} directory. Can't load model".format(ckpt_dir) ) else: raise IOError( - "{} does not exist or is empty, can't restore model".format(ckpt_dir) + "{} does not exist or is empty, can't restore model".format( + ckpt_dir + ) ) except IOError as e: if args.no_dir_check: @@ -140,24 +145,26 @@ def main(): tm_suf = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') shutil.copy( - args.config_file, - os.path.join(logdir, 'config_{}.py'.format(tm_suf)), + args.config_file, + os.path.join(logdir, 'config_{}.py'.format(tm_suf)), ) - with open(os.path.join(logdir, 'cmd-args_{}.log'.format(tm_suf)), 'w') as f: + with open(os.path.join(logdir, 'cmd-args_{}.log'.format(tm_suf)), + 'w') as f: f.write(" ".join(sys.argv)) - with open(os.path.join(logdir, 'git-info_{}.log'.format(tm_suf)), 'w') as f: + with open(os.path.join(logdir, 'git-info_{}.log'.format(tm_suf)), + 'w') as f: f.write('commit hash: {}'.format(get_git_hash())) f.write(get_git_diff()) old_stdout = sys.stdout old_stderr = sys.stderr stdout_log = open( - os.path.join(logdir, 'stdout_{}.log'.format(tm_suf)), 'a', 1 + os.path.join(logdir, 'stdout_{}.log'.format(tm_suf)), 'a', 1 ) stderr_log = open( - os.path.join(logdir, 'stderr_{}.log'.format(tm_suf)), 'a', 1 + os.path.join(logdir, 'stderr_{}.log'.format(tm_suf)), 'a', 1 ) sys.stdout = Logger(sys.stdout, stdout_log) sys.stderr = Logger(sys.stderr, stderr_log) @@ -218,7 +225,7 @@ def main(): deco_print("Starting training from scratch") else: deco_print( - "Restored checkpoint from {}. Resuming training".format(checkpoint), + "Restored checkpoint from {}. 
Resuming training".format(checkpoint), ) elif args.mode == 'eval' or args.mode == 'infer': if hvd is None or hvd.rank() == 0: From 681c5f7d8e17b6aa093e912d839ae193a30a835c Mon Sep 17 00:00:00 2001 From: Kipok Date: Thu, 28 Jun 2018 13:36:16 -0700 Subject: [PATCH 088/102] Fix unicode bug in python2 --- run.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/run.py b/run.py index b2cef97bf..8dd3506a6 100644 --- a/run.py +++ b/run.py @@ -16,6 +16,7 @@ import tensorflow as tf from six.moves import range +from six import string_types from open_seq2seq.utils.utils import deco_print, flatten_dict, \ @@ -69,7 +70,7 @@ def main(): parser_unk = argparse.ArgumentParser() for pm, value in flatten_dict(base_config).items(): if isinstance(value, int) or isinstance(value, float) or \ - isinstance(value, str): + isinstance(value, string_types): parser_unk.add_argument('--' + pm, default=value, type=type(value)) elif isinstance(value, bool): parser_unk.add_argument('--' + pm, default=value, type=ast.literal_eval) From 8b895cfdbc44fabde6ae5172d8d68e6268d1ecde Mon Sep 17 00:00:00 2001 From: Kipok Date: Thu, 28 Jun 2018 17:45:10 -0700 Subject: [PATCH 089/102] Fix tests for the new code --- open_seq2seq/data/speech2text/speech_utils.py | 3 ++- open_seq2seq/data/speech2text/speech_utils_test.py | 2 +- open_seq2seq/models/speech2text_test.py | 9 +++++---- open_seq2seq/test_utils/test_speech_config.py | 5 ++--- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/open_seq2seq/data/speech2text/speech_utils.py b/open_seq2seq/data/speech2text/speech_utils.py index 53357cc21..1ef485807 100644 --- a/open_seq2seq/data/speech2text/speech_utils.py +++ b/open_seq2seq/data/speech2text/speech_utils.py @@ -156,7 +156,8 @@ def get_speech_features(signal, fs, num_features, pad_to=8, else: raise ValueError('Unknown features type: {}'.format(features_type)) - assert features.shape[0] % pad_to == 0 + if pad_to > 0: + assert features.shape[0] % pad_to == 0 m = np.mean(features) s = np.std(features) features = (features - m) / s diff --git a/open_seq2seq/data/speech2text/speech_utils_test.py b/open_seq2seq/data/speech2text/speech_utils_test.py index 6c400877d..e457578cb 100644 --- a/open_seq2seq/data/speech2text/speech_utils_test.py +++ b/open_seq2seq/data/speech2text/speech_utils_test.py @@ -118,7 +118,7 @@ def test_get_speech_features_with_sine(self): fs = 16000.0 t = np.arange(0, 0.5, 1.0 / fs) signal = np.sin(2 * np.pi * 4000 * t) - features = get_speech_features(signal, fs, 161) + features = get_speech_features(signal, fs, 161, pad_to=0) npt.assert_allclose( np.abs(features - features[0]), np.zeros_like(features), diff --git a/open_seq2seq/models/speech2text_test.py b/open_seq2seq/models/speech2text_test.py index b069a8fe7..60f046578 100644 --- a/open_seq2seq/models/speech2text_test.py +++ b/open_seq2seq/models/speech2text_test.py @@ -96,7 +96,7 @@ def test_convergence(self): loss, eval_loss, eval_dict = self.run_model(train_config, eval_config) self.assertLess(loss, 5.0) - self.assertLess(eval_loss, 200.0) + self.assertLess(eval_loss, 30.0) self.assertLess(eval_dict['Eval WER'], 0.1) def test_convergence_with_iter_size(self): @@ -114,6 +114,7 @@ def test_convergence_with_iter_size(self): "iter_size": 5, "batch_size_per_gpu": 2, "use_horovod": True, + "num_epochs": 200, }) eval_config.update({ "dtype": dtype, @@ -123,13 +124,13 @@ def test_convergence_with_iter_size(self): }) loss, eval_loss, eval_dict = self.run_model(train_config, eval_config, hvd) - self.assertLess(loss, 5.0) - 
self.assertLess(eval_loss, 200.0) + self.assertLess(loss, 10.0) + self.assertLess(eval_loss, 30.0) self.assertLess(eval_dict['Eval WER'], 0.1) def test_infer(self): train_config, infer_config = self.prepare_config() - train_config['num_epochs'] = 200 + train_config['num_epochs'] = 250 infer_config['batch_size_per_gpu'] = 4 with tf.Graph().as_default() as g: diff --git a/open_seq2seq/test_utils/test_speech_config.py b/open_seq2seq/test_utils/test_speech_config.py index 3cf326a17..7f1c48e26 100644 --- a/open_seq2seq/test_utils/test_speech_config.py +++ b/open_seq2seq/test_utils/test_speech_config.py @@ -13,7 +13,7 @@ base_params = { "random_seed": 0, "use_horovod": False, - "num_epochs": 111, + "num_epochs": 150, "num_gpus": 1, "batch_size_per_gpu": 10, @@ -32,7 +32,6 @@ "lr_policy_params": { "learning_rate": 0.001, "power": 2, - "decay_steps": 500, }, "larc_params": { "larc_eta": 0.001, @@ -71,7 +70,7 @@ }, "activation_fn": lambda x: tf.minimum(tf.nn.relu(x), 20.0), "data_format": "channels_first", - "bn_momentum": 0.1, + "bn_momentum": 0.001, }, "decoder": FullyConnectedCTCDecoder, From 8387bb2c5f0a0c5e14eb5e2304dd0309dc198ff3 Mon Sep 17 00:00:00 2001 From: Kipok Date: Fri, 29 Jun 2018 09:14:23 -0700 Subject: [PATCH 090/102] Update docs and fix run.py bug --- docs/sources/source/models-and-recipes.rst | 19 ++++++++----------- run.py | 4 ++-- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/docs/sources/source/models-and-recipes.rst b/docs/sources/source/models-and-recipes.rst index 0d7337568..9c7ad7e67 100644 --- a/docs/sources/source/models-and-recipes.rst +++ b/docs/sources/source/models-and-recipes.rst @@ -67,41 +67,38 @@ Original Deep Speech 2 model description: https://arxiv.org/abs/1512.02595. The table below contains description and results of Deep Speech 2 based models available in OpenSeq2Seq. -WER-512 and WER-2048 is word error rate obtained with beam width of 512 and 2048 -correspondingly. For beam width of 2048 we also used ``batch_size_per_gpu = 1`` +WER is the word error rate obtained on a dev-clean subset of LibriSpeech using +greedy decoder (``decoder_params/use_language_model = False``). +For the final evaluation we used ``batch_size_per_gpu = 1`` to eliminate the effect of `cudnn padding issue `_. For more details about model descriptions and training setup, have a look at the `configuration files `_. .. list-table:: - :widths: 1 1 1 1 1 1 + :widths: 1 1 1 1 1 :header-rows: 1 * - Config file - - WER-512 - - WER-2048 + - WER - Training setup and additional comments - Short description of the model - Checkpoint * - `ds2_large_8gpus.py `_ - - 4.90% - - 4.59% + - 14.89% - This model was trained for 50 epochs using SGD with Momentum and LARC on the full LibriSpeech in a few days using Horovod on eight GPUs. - This model has 2 convolutional layers and 5 bidirectional GRU layers with 800 units. - `link `_ * - `ds2_medium_4gpus.py `_ - - 6.12% - - 5.49% + - 22.60% - This model was trained for 50 epochs using Adam on the full LibriSpeech in a few days using Horovod on four GPUs. - This model has 3 convolutional layers and 3 unidirectional GRU layers with 1024 units. - `link `_ * - `ds2_small_1gpu.py `_ - - 11.77% - - 9.32% + - 39.08% - This model was trained for 12 epochs using Adam on a "clean" subset of LibriSpeech in less than a day using a single GPU. 
- This model has 2 convolutional layers and 2 bidirectional diff --git a/run.py b/run.py index 8dd3506a6..335a09d7c 100644 --- a/run.py +++ b/run.py @@ -69,10 +69,10 @@ def main(): # with command line arguments that were passed to the script parser_unk = argparse.ArgumentParser() for pm, value in flatten_dict(base_config).items(): - if isinstance(value, int) or isinstance(value, float) or \ + if type(value) == int or type(value) == float or \ isinstance(value, string_types): parser_unk.add_argument('--' + pm, default=value, type=type(value)) - elif isinstance(value, bool): + elif type(value) == bool: parser_unk.add_argument('--' + pm, default=value, type=ast.literal_eval) config_update = parser_unk.parse_args(unknown) nested_update(base_config, nest_dict(vars(config_update))) From ce076768585da65876cde4b488f213fc7c3bac04 Mon Sep 17 00:00:00 2001 From: Kipok Date: Fri, 29 Jun 2018 09:24:38 -0700 Subject: [PATCH 091/102] Fix API docs structure --- docs/sources/source/api-docs/encoders.rst | 9 +++++++++ docs/sources/source/api-docs/parts.transformer.rst | 8 -------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/sources/source/api-docs/encoders.rst b/docs/sources/source/api-docs/encoders.rst index 362287eb6..ba77b5532 100644 --- a/docs/sources/source/api-docs/encoders.rst +++ b/docs/sources/source/api-docs/encoders.rst @@ -53,3 +53,12 @@ resnet\_blocks :members: :undoc-members: :show-inheritance: + + +cnn\_encoder +-------------------------------- + +.. automodule:: encoders.cnn_encoder + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/sources/source/api-docs/parts.transformer.rst b/docs/sources/source/api-docs/parts.transformer.rst index 7dab39e8a..8fa9237fd 100644 --- a/docs/sources/source/api-docs/parts.transformer.rst +++ b/docs/sources/source/api-docs/parts.transformer.rst @@ -22,14 +22,6 @@ beam\_search :undoc-members: :show-inheritance: -beam\_search\_test ---------------------------------------------------------- - -.. 
automodule:: parts.transformer.beam_search_test - :members: - :undoc-members: - :show-inheritance: - common --------------------------------------------- From 905156f0b782ae2e73804a6c3eccd817769002f5 Mon Sep 17 00:00:00 2001 From: Vahid Noroozi Date: Fri, 29 Jun 2018 09:47:39 -0700 Subject: [PATCH 092/102] WIP: Conv seq2seq implementation (#152) Conv seq2seq implementation --- .../text2text/en-de/en-de-convs2s.py | 178 +++++++++ .../{ => en-de}/en-de-gnmt-like-4GPUs.py | 0 .../text2text/{ => en-de}/en-de-nmt-small.py | 0 .../text2text/{ => en-de}/transformer-big.py | 10 +- .../text2text/toy-reversal/nmt-reversal-CC.py | 147 +++++++ .../text2text/toy-reversal/nmt-reversal-CR.py | 160 ++++++++ .../text2text/toy-reversal/nmt-reversal-RC.py | 140 +++++++ .../{ => toy-reversal}/nmt-reversal-RR.py | 0 .../{ => toy-reversal}/nmt-reversal-RT.py | 0 .../{ => toy-reversal}/nmt-reversal-TT.py | 0 open_seq2seq/decoders/__init__.py | 5 + open_seq2seq/decoders/convs2s_decoder.py | 367 ++++++++++++++++++ open_seq2seq/encoders/__init__.py | 3 + open_seq2seq/encoders/convs2s_encoder.py | 221 +++++++++++ open_seq2seq/models/text2text_test.py | 4 +- open_seq2seq/parts/convs2s/__init__.py | 3 + .../parts/convs2s/attention_wn_layer.py | 90 +++++ open_seq2seq/parts/convs2s/conv_wn_layer.py | 103 +++++ open_seq2seq/parts/convs2s/ffn_wn_layer.py | 68 ++++ .../parts/transformer/embedding_layer.py | 41 +- open_seq2seq/parts/transformer/utils.py | 26 +- open_seq2seq/utils/utils.py | 6 +- run.py | 2 +- 23 files changed, 1540 insertions(+), 34 deletions(-) create mode 100644 example_configs/text2text/en-de/en-de-convs2s.py rename example_configs/text2text/{ => en-de}/en-de-gnmt-like-4GPUs.py (100%) rename example_configs/text2text/{ => en-de}/en-de-nmt-small.py (100%) rename example_configs/text2text/{ => en-de}/transformer-big.py (96%) create mode 100644 example_configs/text2text/toy-reversal/nmt-reversal-CC.py create mode 100644 example_configs/text2text/toy-reversal/nmt-reversal-CR.py create mode 100644 example_configs/text2text/toy-reversal/nmt-reversal-RC.py rename example_configs/text2text/{ => toy-reversal}/nmt-reversal-RR.py (100%) rename example_configs/text2text/{ => toy-reversal}/nmt-reversal-RT.py (100%) rename example_configs/text2text/{ => toy-reversal}/nmt-reversal-TT.py (100%) create mode 100644 open_seq2seq/decoders/convs2s_decoder.py create mode 100644 open_seq2seq/encoders/convs2s_encoder.py create mode 100644 open_seq2seq/parts/convs2s/__init__.py create mode 100644 open_seq2seq/parts/convs2s/attention_wn_layer.py create mode 100644 open_seq2seq/parts/convs2s/conv_wn_layer.py create mode 100644 open_seq2seq/parts/convs2s/ffn_wn_layer.py diff --git a/example_configs/text2text/en-de/en-de-convs2s.py b/example_configs/text2text/en-de/en-de-convs2s.py new file mode 100644 index 000000000..60ff79ec5 --- /dev/null +++ b/example_configs/text2text/en-de/en-de-convs2s.py @@ -0,0 +1,178 @@ +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals + +import tensorflow as tf + +from open_seq2seq.models import Text2Text +from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer + +from open_seq2seq.data.text2text.text2text import SpecialTextTokens +from open_seq2seq.data.text2text.tokenizer import EOS_ID + +from open_seq2seq.encoders import ConvS2SEncoder +from open_seq2seq.decoders import ConvS2SDecoder + +from open_seq2seq.losses import BasicSequenceLoss, PaddedCrossEntropyLossWithSmoothing + +from open_seq2seq.optimizers.lr_policies import 
transformer_policy + +# REPLACE THIS TO THE PATH WITH YOUR WMT DATA +data_root = "./wmt16_en_dt/" + +base_model = Text2Text +num_layers = 15 +d_model = 768 +max_length = 128 + +batch_size = 64 +num_gpus = 4 +epoch_num = 30 + +base_params = { + "use_horovod": False, + "num_gpus": num_gpus, + # set max_step to achieve the given epoch_num, 4.5M is the size of the dataset + "max_steps": int((4500000 / (num_gpus * batch_size)) * epoch_num), + "batch_size_per_gpu": batch_size, + "save_summaries_steps": 50, + "print_loss_steps": 50, + "print_samples_steps": 50, + "eval_steps": 4000, + "save_checkpoint_steps": 1000, + "logdir": "RealData-CC", + + + "optimizer": "Adam", + "optimizer_params": {}, + "lr_policy": transformer_policy, + "lr_policy_params": { + "learning_rate": 9, + "max_lr": 1e-3, + "warmup_steps": 4000, + "d_model": d_model, + }, + + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + + "max_grad_norm": 0.1, + #"dtype": tf.float32, + "dtype": "mixed", + "loss_scaling": "Backoff", + + "encoder": ConvS2SEncoder, + "encoder_params": { + "encoder_layers": num_layers, + + "src_emb_size": d_model, + "pad_embeddings_2_eight": False, + "att_layer_num": num_layers, + + # original paper + #"conv_nchannels_kwidth": [(512, 3)]*10 + [(768, 3)]*3 + [(2048, 1)]*2, + + # fairseq config + "conv_nchannels_kwidth": [(512, 3)]*9 + [(1024, 3)]*4 + [(2048, 1)]*2, + + "embedding_dropout_keep_prob": 0.8, + "hidden_dropout_keep_prob": 0.8, + + "max_input_length": max_length, + + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + }, + + + "decoder": ConvS2SDecoder, + "decoder_params": { + "decoder_layers": num_layers, + + "shared_embed": True, + "tgt_emb_size": d_model, + "pad_embeddings_2_eight": False, + "out_emb_size": d_model, + + # original paper + #"conv_nchannels_kwidth": [(512, 3)]*10 + [(768, 3)]*3 + [(2048, 1)]*2, + + # fairseq config + "conv_nchannels_kwidth": [(512, 3)]*9 + [(1024, 3)]*4 + [(2048, 1)]*2, + + "embedding_dropout_keep_prob": 0.8, + "hidden_dropout_keep_prob": 0.8, + "out_dropout_keep_prob": 0.8, + + "max_input_length": max_length, + "extra_decode_length": 56, + "beam_size": 5, + "alpha": 0.6, + + "EOS_ID": EOS_ID, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + + }, + + "loss": BasicSequenceLoss, + "loss_params": { + "offset_target_by_one": True, + "average_across_timestep": True, + "do_mask": True + } + +} + +train_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": False, + "src_vocab_file": data_root + "vocab.bpe.32000", + "tgt_vocab_file": data_root + "vocab.bpe.32000", + "source_file": data_root+"train.tok.clean.bpe.32000.en", + "target_file": data_root+"train.tok.clean.bpe.32000.de", + "delimiter": " ", + "shuffle": False, + "repeat": True, + "map_parallel_calls": 8, + "prefetch_buffer_size": 4, + "max_length": max_length, + }, +} + +eval_params = { + "batch_size_per_gpu": 64, + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": False, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"newstest2014.tok.bpe.32000.en", + "target_file": data_root+"newstest2014.tok.bpe.32000.de", + "delimiter": " ", + "shuffle": False, + "repeat": True, + "max_length": 64, + }, + +} + +infer_params = { + "batch_size_per_gpu": 1, + "data_layer": ParallelTextDataLayer, + 
"data_layer_params": { + "pad_vocab_to_eight": False, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"newstest2013.tok.bpe.32000.en", + # this is intentional to be sure that model is not using target + "target_file": data_root+"newstest2013.tok.bpe.32000.en", + "delimiter": " ", + "shuffle": False, + "repeat": False, + "max_length": max_length, + }, +} + + diff --git a/example_configs/text2text/en-de-gnmt-like-4GPUs.py b/example_configs/text2text/en-de/en-de-gnmt-like-4GPUs.py similarity index 100% rename from example_configs/text2text/en-de-gnmt-like-4GPUs.py rename to example_configs/text2text/en-de/en-de-gnmt-like-4GPUs.py diff --git a/example_configs/text2text/en-de-nmt-small.py b/example_configs/text2text/en-de/en-de-nmt-small.py similarity index 100% rename from example_configs/text2text/en-de-nmt-small.py rename to example_configs/text2text/en-de/en-de-nmt-small.py diff --git a/example_configs/text2text/transformer-big.py b/example_configs/text2text/en-de/transformer-big.py similarity index 96% rename from example_configs/text2text/transformer-big.py rename to example_configs/text2text/en-de/transformer-big.py index fd819bca3..a1a56039c 100644 --- a/example_configs/text2text/transformer-big.py +++ b/example_configs/text2text/en-de/transformer-big.py @@ -18,7 +18,8 @@ d_model = 512 num_layers = 6 -data_root = "[REPLACE THIS TO THE PATH WITH YOUR WMT DATA]" +# REPLACE THIS TO THE PATH WITH YOUR WMT DATA +data_root = "./wmt16_en_dt/" base_params = { "use_horovod": False, @@ -31,9 +32,10 @@ "eval_steps": 4001, "save_checkpoint_steps": 4000, "logdir": "Transformer-FP32", - "dtype": tf.float32, - # "dtype": "mixed", - # "automatic_loss_scaling": "Backoff", + #"dtype": tf.float32, + "dtype": "mixed", + "loss_scaling": "Backoff", + "optimizer": tf.contrib.opt.LazyAdamOptimizer, "optimizer_params": { "beta1": 0.9, diff --git a/example_configs/text2text/toy-reversal/nmt-reversal-CC.py b/example_configs/text2text/toy-reversal/nmt-reversal-CC.py new file mode 100644 index 000000000..a74b94c9d --- /dev/null +++ b/example_configs/text2text/toy-reversal/nmt-reversal-CC.py @@ -0,0 +1,147 @@ +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals + +import tensorflow as tf + +from open_seq2seq.models import Text2Text + +from open_seq2seq.decoders import ConvS2SDecoder +from open_seq2seq.encoders import ConvS2SEncoder + +from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer +from open_seq2seq.losses import BasicSequenceLoss + +from open_seq2seq.data.text2text.text2text import SpecialTextTokens +from open_seq2seq.data.text2text.tokenizer import EOS_ID +from open_seq2seq.optimizers.lr_policies import fixed_lr + +""" +This configuration file describes fully convolutional model (ConvS2S) +on the toy task of reversing sequences +""" + +base_model = Text2Text +d_model = 128 +num_layers = 2 + +base_params = { + "use_horovod": False, + "num_gpus": 1, + "batch_size_per_gpu": 64, + "max_steps": 1000, + "save_summaries_steps": 10, + "print_loss_steps": 10, + "print_samples_steps": 20, + "eval_steps": 50, + "save_checkpoint_steps": 200, + + "logdir": "ReversalTask-CC", + + "optimizer": "Adam", + "optimizer_params": {"epsilon": 1e-9}, + "lr_policy": fixed_lr, + "lr_policy_params": { + 'learning_rate': 1e-3 + }, + + "max_grad_norm": 3.0, + "dtype": tf.float32, + + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 
'gradient_norm', 'global_gradient_norm'], + + "encoder": ConvS2SEncoder, + "encoder_params": { + "encoder_layers": num_layers, + + "src_emb_size": d_model, + "embedding_dropout_keep_prob": 0.9, + "pad_embeddings_2_eight": False, + "att_layer_num": num_layers, + + "conv_nchannels_kwidth": [(d_model, 3)] * num_layers, + + "hidden_dropout_keep_prob": 0.9, + + "max_input_length": 100, + + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + }, + + "decoder": ConvS2SDecoder, + "decoder_params": { + "decoder_layers": num_layers, + + "shared_embed": True, + "tgt_emb_size": d_model, + "embedding_dropout_keep_prob": 0.9, + "pad_embeddings_2_eight": False, + + "conv_nchannels_kwidth": [(d_model, 3)] * num_layers, + + "hidden_dropout_keep_prob": 0.9, + "out_dropout_keep_prob": 0.9, + + "max_input_length": 120, + "extra_decode_length": 10, + "beam_size": 5, + "alpha": 0.6, + + "EOS_ID": EOS_ID, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + }, + + "loss": BasicSequenceLoss, + "loss_params": { + "offset_target_by_one": True, + "average_across_timestep": True, + "do_mask": True + } +} + +train_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "src_vocab_file": "toy_text_data/vocab/source.txt", + "tgt_vocab_file": "toy_text_data/vocab/target.txt", + "source_file": "toy_text_data/train/source.txt", + "target_file": "toy_text_data/train/target.txt", + "shuffle": True, + "repeat": True, + "max_length": 56, + "delimiter": " ", + }, +} + +eval_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "src_vocab_file": "toy_text_data/vocab/source.txt", + "tgt_vocab_file": "toy_text_data/vocab/target.txt", + "source_file": "toy_text_data/dev/source.txt", + "target_file": "toy_text_data/dev/target.txt", + "shuffle": False, + "repeat": True, + "max_length": 56, + "delimiter": " ", + }, +} + + +infer_params = { + "batch_size_per_gpu": 1, + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "src_vocab_file": "toy_text_data/vocab/source.txt", + "tgt_vocab_file": "toy_text_data/vocab/source.txt", + "source_file": "toy_text_data/test/source.txt", + # this is intentional to be sure model is not using ground truth + "target_file": "toy_text_data/test/source.txt", + "shuffle": False, + "repeat": False, + "max_length": 256, + "delimiter": " ", + }, +} diff --git a/example_configs/text2text/toy-reversal/nmt-reversal-CR.py b/example_configs/text2text/toy-reversal/nmt-reversal-CR.py new file mode 100644 index 000000000..cf6ab6e7b --- /dev/null +++ b/example_configs/text2text/toy-reversal/nmt-reversal-CR.py @@ -0,0 +1,160 @@ +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals + +import tensorflow as tf + +from open_seq2seq.models import Text2Text +from open_seq2seq.decoders import RNNDecoderWithAttention, BeamSearchRNNDecoderWithAttention +from open_seq2seq.encoders import ConvS2SEncoder + +from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer +from open_seq2seq.losses import BasicSequenceLoss + +from open_seq2seq.data.text2text.text2text import SpecialTextTokens +from open_seq2seq.optimizers.lr_policies import fixed_lr + +""" +This configuration file describes convolutional encoder and rnn decoder with attention +on the toy task of reversing sequences +""" + +base_model = Text2Text +d_model = 128 +num_layers = 2 + +base_params = { + "use_horovod": False, + "num_gpus": 1, + "batch_size_per_gpu": 64, + 
"max_steps": 1000, + "save_summaries_steps": 10, + "print_loss_steps": 10, + "print_samples_steps": 20, + "eval_steps": 50, + "save_checkpoint_steps": 200, + + "logdir": "ReversalTask-CR", + + "optimizer": "Adam", + "optimizer_params": {"epsilon": 1e-9}, + "lr_policy": fixed_lr, + "lr_policy_params": { + 'learning_rate': 1e-3 + }, + + "max_grad_norm": 3.0, + "dtype": tf.float32, + + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + "encoder": ConvS2SEncoder, + "encoder_params": { + "encoder_layers": num_layers, + + "src_emb_size": d_model, + "att_layer_num": num_layers, + "embedding_dropout_keep_prob": 0.9, + "pad_embeddings_2_eight": True, + + "hidden_dropout_keep_prob": 0.9, + + "conv_nchannels_kwidth": [(d_model, 3)] * num_layers, + + "max_input_length": 100, + + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + }, + + "decoder": RNNDecoderWithAttention, + "decoder_params": { + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": d_model, + }, + "decoder_layers": num_layers, + + "decoder_dp_input_keep_prob": 0.8, + "decoder_dp_output_keep_prob": 1.0, + "decoder_use_skip_connections": False, + + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + + "tgt_emb_size": d_model, + "attention_type": "luong", + "luong_scale": False, + "attention_layer_size": 128, + }, + + "loss": BasicSequenceLoss, + "loss_params": { + "offset_target_by_one": True, + "average_across_timestep": True, + "do_mask": True + } +} + +train_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "src_vocab_file": "toy_text_data/vocab/source.txt", + "tgt_vocab_file": "toy_text_data/vocab/target.txt", + "source_file": "toy_text_data/train/source.txt", + "target_file": "toy_text_data/train/target.txt", + "shuffle": True, + "repeat": True, + "max_length": 56, + "delimiter": " ", + }, +} + +eval_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "src_vocab_file": "toy_text_data/vocab/source.txt", + "tgt_vocab_file": "toy_text_data/vocab/target.txt", + "source_file": "toy_text_data/dev/source.txt", + "target_file": "toy_text_data/dev/target.txt", + "shuffle": False, + "repeat": True, + "max_length": 56, + "delimiter": " ", + }, +} + +infer_params = { + "batch_size_per_gpu": 1, + "decoder": BeamSearchRNNDecoderWithAttention, + "decoder_params": { + "decoder_cell_type": "lstm", + "decoder_cell_units": d_model, + "decoder_layers": num_layers, + "decoder_dp_input_keep_prob": 0.8, + "decoder_dp_output_keep_prob": 1.0, + "decoder_use_skip_connections": False, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + "tgt_emb_size": d_model, + "attention_type": "luong", + "luong_scale": False, + "attention_layer_size": d_model, + "beam_width": 5, + "length_penalty": 1.0, + }, + + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "src_vocab_file": "toy_text_data/vocab/source.txt", + "tgt_vocab_file": "toy_text_data/vocab/source.txt", + "source_file": "toy_text_data/test/source.txt", + "target_file": "toy_text_data/test/source.txt", + "shuffle": False, + "repeat": False, + "max_length": 256, + "delimiter": " ", + }, + +} diff --git a/example_configs/text2text/toy-reversal/nmt-reversal-RC.py b/example_configs/text2text/toy-reversal/nmt-reversal-RC.py new file mode 100644 index 
000000000..5ed166fdb --- /dev/null +++ b/example_configs/text2text/toy-reversal/nmt-reversal-RC.py @@ -0,0 +1,140 @@ +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals + +import tensorflow as tf + +from open_seq2seq.models import Text2Text +from open_seq2seq.encoders import BidirectionalRNNEncoderWithEmbedding +from open_seq2seq.decoders import ConvS2SDecoder + +from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer +from open_seq2seq.losses import BasicSequenceLoss + +from open_seq2seq.data.text2text.tokenizer import EOS_ID +from open_seq2seq.data.text2text.text2text import SpecialTextTokens +from open_seq2seq.optimizers.lr_policies import fixed_lr + +""" +This configuration file describes bidirectional rnn based encoder and convolutional decoder +on the toy task of reversing sequences +""" + +base_model = Text2Text +d_model = 128 +num_layers = 2 + +base_params = { + "use_horovod": False, + "num_gpus": 1, + "batch_size_per_gpu": 64, + "max_steps": 1000, + "save_summaries_steps": 10, + "print_loss_steps": 10, + "print_samples_steps": 20, + "eval_steps": 50, + "save_checkpoint_steps": 200, + + "logdir": "ReversalTask-RC", + + "optimizer": "Adam", + "optimizer_params": {"epsilon": 1e-9}, + "lr_policy": fixed_lr, + "lr_policy_params": { + 'learning_rate': 1e-3 + }, + + "max_grad_norm": 3.0, + "dtype": tf.float32, + + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + "encoder": BidirectionalRNNEncoderWithEmbedding, + "encoder_params": { + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": int(d_model/2), + }, + + "encoder_layers": num_layers, + "encoder_dp_input_keep_prob": 0.8, + "encoder_dp_output_keep_prob": 1.0, + "encoder_use_skip_connections": False, + "src_emb_size": d_model, + }, + + "decoder": ConvS2SDecoder, + "decoder_params": { + "decoder_layers": num_layers, + + "shared_embed": True, + "tgt_emb_size": d_model, + + "conv_nchannels_kwidth": [(d_model, 3)] * num_layers, + + "embedding_dropout_keep_prob": 0.9, + "hidden_dropout_keep_prob": 0.9, + "out_dropout_keep_prob": 0.9, + + "max_input_length": 100, + "extra_decode_length": 10, + "beam_size": 5, + "alpha": 0.6, + + "EOS_ID": EOS_ID, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + }, + + "loss": BasicSequenceLoss, + "loss_params": { + "offset_target_by_one": True, + "average_across_timestep": True, + "do_mask": True + } +} + +train_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "src_vocab_file": "toy_text_data/vocab/source.txt", + "tgt_vocab_file": "toy_text_data/vocab/target.txt", + "source_file": "toy_text_data/train/source.txt", + "target_file": "toy_text_data/train/target.txt", + "shuffle": True, + "repeat": True, + "max_length": 56, + "delimiter": " ", + }, +} + +eval_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "src_vocab_file": "toy_text_data/vocab/source.txt", + "tgt_vocab_file": "toy_text_data/vocab/target.txt", + "source_file": "toy_text_data/dev/source.txt", + "target_file": "toy_text_data/dev/target.txt", + "shuffle": False, + "repeat": True, + "max_length": 56, + "delimiter": " ", + }, +} + +infer_params = { + "batch_size_per_gpu": 1, + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "src_vocab_file": "toy_text_data/vocab/source.txt", + "tgt_vocab_file": 
"toy_text_data/vocab/source.txt", + "source_file": "toy_text_data/test/source.txt", + # this is intentional to be sure model is not using ground truth + "target_file": "toy_text_data/test/source.txt", + "shuffle": False, + "repeat": False, + "max_length": 256, + "delimiter": " ", + }, +} diff --git a/example_configs/text2text/nmt-reversal-RR.py b/example_configs/text2text/toy-reversal/nmt-reversal-RR.py similarity index 100% rename from example_configs/text2text/nmt-reversal-RR.py rename to example_configs/text2text/toy-reversal/nmt-reversal-RR.py diff --git a/example_configs/text2text/nmt-reversal-RT.py b/example_configs/text2text/toy-reversal/nmt-reversal-RT.py similarity index 100% rename from example_configs/text2text/nmt-reversal-RT.py rename to example_configs/text2text/toy-reversal/nmt-reversal-RT.py diff --git a/example_configs/text2text/nmt-reversal-TT.py b/example_configs/text2text/toy-reversal/nmt-reversal-TT.py similarity index 100% rename from example_configs/text2text/nmt-reversal-TT.py rename to example_configs/text2text/toy-reversal/nmt-reversal-TT.py diff --git a/open_seq2seq/decoders/__init__.py b/open_seq2seq/decoders/__init__.py index 3601364a7..09c95dedf 100644 --- a/open_seq2seq/decoders/__init__.py +++ b/open_seq2seq/decoders/__init__.py @@ -6,5 +6,10 @@ from .decoder import Decoder from .rnn_decoders import RNNDecoderWithAttention, \ BeamSearchRNNDecoderWithAttention + from .transformer_decoder import TransformerDecoder from .fc_decoders import FullyConnectedCTCDecoder, FullyConnectedDecoder + +from .convs2s_decoder import ConvS2SDecoder +#from .convs2s_decoder_old import ConvS2SDecoder + diff --git a/open_seq2seq/decoders/convs2s_decoder.py b/open_seq2seq/decoders/convs2s_decoder.py new file mode 100644 index 000000000..1944845f8 --- /dev/null +++ b/open_seq2seq/decoders/convs2s_decoder.py @@ -0,0 +1,367 @@ +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals + +import tensorflow as tf +import math +from .decoder import Decoder + +from open_seq2seq.parts.transformer import beam_search + +from open_seq2seq.parts.transformer import embedding_layer +from open_seq2seq.parts.transformer.utils import get_padding + +from open_seq2seq.parts.convs2s import ffn_wn_layer, conv_wn_layer, attention_wn_layer + +# Default value used if max_input_length is not given +MAX_INPUT_LENGTH = 128 + + +class ConvS2SDecoder(Decoder): + + @staticmethod + def get_required_params(): + """Static method with description of required parameters. + + Returns: + dict: + Dictionary containing all the parameters that **have to** be + included into the ``params`` parameter of the + class :meth:`__init__` method. + """ + return dict( + Decoder.get_required_params(), **{ + 'batch_size': int, + 'decoder_layers': int, + 'tgt_emb_size': int, + 'tgt_vocab_size': int, + 'shared_embed': bool, + 'embedding_dropout_keep_prob': float, + 'conv_nchannels_kwidth': list, + 'hidden_dropout_keep_prob': float, + 'out_dropout_keep_prob': float, + 'beam_size': int, + 'alpha': float, + 'extra_decode_length': int, + 'EOS_ID': int, + }) + + @staticmethod + def get_optional_params(): + """Static method with description of optional parameters. + + Returns: + dict: + Dictionary containing all the parameters that **can** be + included into the ``params`` parameter of the + class :meth:`__init__` method. 
+ """ + return dict( + Decoder.get_optional_params(), + **{ + 'pad_embeddings_2_eight': bool, + + # if not provided, tgt_emb_size is used as the default value + 'out_emb_size': int, + 'max_input_length': int, + 'GO_SYMBOL': int, + 'PAD_SYMBOL': int, + 'END_SYMBOL': int, + }) + + def _cast_types(self, input_dict): + return input_dict + + def __init__(self, params, model, name="convs2s_decoder", mode='train'): + super(ConvS2SDecoder, self).__init__(params, model, name, mode) + self.embedding_softmax_layer = None + self.position_embedding_layer = None + self.layers = [] + self._tgt_vocab_size = self.params['tgt_vocab_size'] + self._tgt_emb_size = self.params['tgt_emb_size'] + self._mode = mode + self._pad_sym = self.params.get('PAD_SYMBOL', 0) + self._pad2eight = params.get('pad_embeddings_2_eight', False) + + def _decode(self, input_dict): + targets = input_dict['target_tensors'][0] \ + if 'target_tensors' in input_dict else None + + encoder_outputs = input_dict['encoder_output']['outputs'] + encoder_outputs_b = input_dict['encoder_output'].get( + 'outputs_b', encoder_outputs) + + inputs_attention_bias = input_dict['encoder_output'].get( + 'inputs_attention_bias_cs2s', None) + + with tf.name_scope("decode"): + # prepare decoder layers + if len(self.layers) == 0: + knum_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[0] + kwidth_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[1] + + # preparing embedding layers + with tf.variable_scope("embedding"): + if 'embedding_softmax_layer' in input_dict['encoder_output'] \ + and self.params['shared_embed']: + self.embedding_softmax_layer = \ + input_dict['encoder_output']['embedding_softmax_layer'] + else: + self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights( + vocab_size=self._tgt_vocab_size, + hidden_size=self._tgt_emb_size, + pad_vocab_to_eight=self._pad2eight, + init_var=0.1, + embed_scale=False, + pad_sym=self._pad_sym, + mask_paddings=False) + + with tf.variable_scope("pos_embedding"): + if 'position_embedding_layer' in input_dict['encoder_output'] \ + and self.params['shared_embed']: + self.position_embedding_layer = \ + input_dict['encoder_output']['position_embedding_layer'] + else: + self.position_embedding_layer = embedding_layer.EmbeddingSharedWeights( + vocab_size=self.params.get("max_input_length", + MAX_INPUT_LENGTH), + hidden_size=self._tgt_emb_size, + pad_vocab_to_eight=self._pad2eight, + init_var=0.1, + embed_scale=False, + pad_sym=self._pad_sym, + mask_paddings=False) + + # linear projection before cnn layers + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + self._tgt_emb_size, + knum_list[0], + dropout=self.params["embedding_dropout_keep_prob"], + var_scope_name="linear_mapping_before_cnn_layers")) + + for i in range(self.params['decoder_layers']): + in_dim = knum_list[i] if i == 0 else knum_list[i - 1] + out_dim = knum_list[i] + + # linear projection is needed for residual connections if + # input and output of a cnn layer do not match + if in_dim != out_dim: + linear_proj = ffn_wn_layer.FeedFowardNetworkNormalized( + in_dim, + out_dim, + var_scope_name="linear_mapping_cnn_" + str(i + 1), + dropout=1.0) + else: + linear_proj = None + + conv_layer = conv_wn_layer.Conv1DNetworkNormalized( + in_dim, + out_dim, + kernel_width=kwidth_list[i], + mode=self.mode, + layer_id=i + 1, + hidden_dropout=self.params["hidden_dropout_keep_prob"], + conv_padding="VALID", + decode_padding=True) + + att_layer = attention_wn_layer.AttentionLayerNormalized( + out_dim, + 
embed_size=self._tgt_emb_size, + layer_id=i + 1, + add_res=True) + + self.layers.append([linear_proj, conv_layer, att_layer]) + + # linear projection after cnn layers + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + knum_list[self.params['decoder_layers'] - 1], + self.params.get("out_emb_size", self._tgt_emb_size), + dropout=1.0, + var_scope_name="linear_mapping_after_cnn_layers")) + + if not self.params['shared_embed']: + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + self.params.get("out_emb_size", self._tgt_emb_size), + self._tgt_vocab_size, + dropout=self.params["out_dropout_keep_prob"], + var_scope_name="linear_mapping_to_vocabspace")) + else: + # if embedding is shared, + # the shared embedding is used as the final linear projection to vocab space + self.layers.append(None) + + if targets is None: + return self.predict(encoder_outputs, encoder_outputs_b, + inputs_attention_bias) + else: + logits = self.decode_pass(targets, encoder_outputs, encoder_outputs_b, + inputs_attention_bias) + return { + "logits": logits, + "outputs": [tf.argmax(logits, axis=-1)], + "final_state": None, + "final_sequence_lengths": None + } + + def decode_pass(self, targets, encoder_outputs, encoder_outputs_b, + inputs_attention_bias): + """Generate logits for each value in the target sequence. + + Args: + targets: target values for the output sequence. + int tensor with shape [batch_size, target_length] + encoder_outputs: continuous representation of input sequence. + float tensor with shape [batch_size, input_length, hidden_size] + float tensor with shape [batch_size, input_length, hidden_size] + encoder_outputs_b: continuous representation of input sequence + which includes the source embeddings. + float tensor with shape [batch_size, input_length, hidden_size] + inputs_attention_bias: float tensor with shape [batch_size, 1, input_length] + + Returns: + float32 tensor with shape [batch_size, target_length, vocab_size] + """ + + # Prepare inputs to decoder layers by applying embedding + # and adding positional encoding. 
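# Note: unlike the Transformer decoder, which adds the fixed sinusoidal
# utils.get_position_encoding, ConvS2S feeds the position indices
# tf.range(0, target_length) through a learned position_embedding_layer
# and adds the looked-up vectors to the token embeddings.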
+ decoder_inputs = self.embedding_softmax_layer(targets) + + with tf.name_scope("add_pos_encoding"): + pos_input = tf.range( + 0, tf.shape(decoder_inputs)[1], delta=1, dtype=tf.int32, name='range') + pos_encoding = self.position_embedding_layer(pos_input) + decoder_inputs = decoder_inputs + tf.cast( + x=pos_encoding, dtype=decoder_inputs.dtype) + + if self.mode == "train": + decoder_inputs = tf.nn.dropout(decoder_inputs, + self.params["embedding_dropout_keep_prob"]) + + # mask the paddings in the target + inputs_padding = get_padding( + targets, padding_value=self._pad_sym, dtype=decoder_inputs.dtype) + decoder_inputs *= tf.expand_dims(1.0 - inputs_padding, 2) + + # do decode + logits = self._call( + decoder_inputs=decoder_inputs, + encoder_outputs_a=encoder_outputs, + encoder_outputs_b=encoder_outputs_b, + input_attention_bias=inputs_attention_bias) + + return logits + + def _call(self, decoder_inputs, encoder_outputs_a, encoder_outputs_b, + input_attention_bias): + # run input into the decoder layers and returns the logits + target_embed = decoder_inputs + with tf.variable_scope("linear_layer_before_cnn_layers"): + outputs = self.layers[0](decoder_inputs) + + for i in range(1, len(self.layers) - 2): + linear_proj, conv_layer, att_layer = self.layers[i] + + with tf.variable_scope("layer_%d" % i): + if linear_proj is not None: + res_inputs = linear_proj(outputs) + else: + res_inputs = outputs + + with tf.variable_scope("conv_layer"): + outputs = conv_layer(outputs) + + with tf.variable_scope("attention_layer"): + outputs = att_layer(outputs, target_embed, encoder_outputs_a, + encoder_outputs_b, input_attention_bias) + outputs = (outputs + res_inputs) * math.sqrt(0.5) + + with tf.variable_scope("linear_layer_after_cnn_layers"): + outputs = self.layers[-2](outputs) + + if self.mode == "train": + outputs = tf.nn.dropout(outputs, self.params["out_dropout_keep_prob"]) + + with tf.variable_scope("pre_softmax_projection"): + if self.layers[-1] is None: + logits = self.embedding_softmax_layer.linear(outputs) + else: + logits = self.layers[-1](outputs) + + return tf.cast(logits, dtype=tf.float32) + + def predict(self, encoder_outputs, encoder_outputs_b, inputs_attention_bias): + """Return predicted sequence.""" + batch_size = tf.shape(encoder_outputs)[0] + input_length = tf.shape(encoder_outputs)[1] + max_decode_length = input_length + self.params["extra_decode_length"] + + symbols_to_logits_fn = self._get_symbols_to_logits_fn() + + # Create initial set of IDs that will be passed into symbols_to_logits_fn. + initial_ids = tf.zeros( + [batch_size], dtype=tf.int32) + self.params["GO_SYMBOL"] + + cache = {} + # Add encoder outputs and attention bias to the cache. + cache["encoder_outputs"] = encoder_outputs + cache["encoder_outputs_b"] = encoder_outputs_b + if inputs_attention_bias is not None: + cache["inputs_attention_bias"] = inputs_attention_bias + + # Use beam search to find the top beam_size sequences and scores. 
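+    # A sketch of the expected beam search contract (inferred from how the
+    # results are used below):
+    #   decoded_ids: [batch_size, beam_size, max_decode_length], beams sorted
+    #                best-first, so [:, 0, :] is the top hypothesis
+    #   scores:      [batch_size, beam_size] beam scores
+    # symbols_to_logits_fn is called once per step and must return logits of
+    # shape [batch_size * beam_size, vocab_size] for the next position.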
+ decoded_ids, scores = beam_search.sequence_beam_search( + symbols_to_logits_fn=symbols_to_logits_fn, + initial_ids=initial_ids, + initial_cache=cache, + vocab_size=self.params["tgt_vocab_size"], + beam_size=self.params["beam_size"], + alpha=self.params["alpha"], + max_decode_length=max_decode_length, + eos_id=self.params["EOS_ID"]) + + # Get the top sequence for each batch element + top_decoded_ids = decoded_ids[:, 0, :] + top_scores = scores[:, 0] + + # this isn't particularly efficient + logits = self.decode_pass(top_decoded_ids, encoder_outputs, + encoder_outputs_b, inputs_attention_bias) + + return { + "logits": logits, + "outputs": [top_decoded_ids], + "final_state": None, + "final_sequence_lengths": None + } + + def _get_symbols_to_logits_fn(self): + """Returns a decoding function that calculates logits of the next tokens.""" + + def symbols_to_logits_fn(ids, i, cache): + """Generate logits for next potential IDs. + + Args: + ids: Current decoded sequences. + int tensor with shape [batch_size * beam_size, i - 1] + i: Loop index + cache: dictionary of values storing the encoder output, encoder-decoder + attention bias, and previous decoder attention values. + + Returns: + Tuple of + (logits with shape [batch_size * beam_size, vocab_size], + updated cache values) + """ + + # pass the decoded ids from the beginneing up to the current into the decoder + # not efficient + decoder_outputs = self.decode_pass(ids, cache.get("encoder_outputs"), + cache.get("encoder_outputs_b"), + cache.get("inputs_attention_bias")) + + logits = decoder_outputs[:, i, :] + return logits, cache + + return symbols_to_logits_fn diff --git a/open_seq2seq/encoders/__init__.py b/open_seq2seq/encoders/__init__.py index e8a919a9f..bae57227e 100644 --- a/open_seq2seq/encoders/__init__.py +++ b/open_seq2seq/encoders/__init__.py @@ -11,3 +11,6 @@ from .transformer_encoder import TransformerEncoder from .ds2_encoder import DeepSpeech2Encoder from .resnet_encoder import ResNetEncoder + +from .convs2s_encoder import ConvS2SEncoder +#from .convs2s_encoder_old import ConvS2SEncoder diff --git a/open_seq2seq/encoders/convs2s_encoder.py b/open_seq2seq/encoders/convs2s_encoder.py new file mode 100644 index 000000000..13c9e204e --- /dev/null +++ b/open_seq2seq/encoders/convs2s_encoder.py @@ -0,0 +1,221 @@ +# Copyright (c) 2018 NVIDIA Corporation +""" +Conv-based encoder +""" +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals + +import tensorflow as tf +import math +from .encoder import Encoder + +from open_seq2seq.parts.transformer import embedding_layer +from open_seq2seq.parts.transformer.utils import get_padding_bias, get_padding +from open_seq2seq.parts.convs2s import ffn_wn_layer, conv_wn_layer + +# Default value used if max_input_length is not given +MAX_INPUT_LENGTH = 128 + + +class ConvS2SEncoder(Encoder): + """ + Fully convolutional Encoder of ConvS2S + """ + + @staticmethod + def get_required_params(): + return dict( + Encoder.get_required_params(), **{ + "encoder_layers": int, + "src_emb_size": int, + "src_vocab_size": int, + "pad_embeddings_2_eight": bool, + "conv_nchannels_kwidth": list, + "embedding_dropout_keep_prob": float, + "hidden_dropout_keep_prob": float, + }) + + @staticmethod + def get_optional_params(): + return dict( + Encoder.get_optional_params(), **{ + "att_layer_num": int, + 'max_input_length': int, + 'PAD_SYMBOL': int, + }) + + def __init__(self, + params, + model, + name="convs2s_encoder_with_emb", + mode='train'): + super(ConvS2SEncoder, 
self).__init__(params, model, name=name, mode=mode) + + self._src_vocab_size = self.params['src_vocab_size'] + self._src_emb_size = self.params['src_emb_size'] + self.layers = [] + self._mode = mode + self._pad_sym = self.params.get('PAD_SYMBOL', 0) + self._pad2eight = params.get('pad_embeddings_2_eight', False) + + def _encode(self, input_dict): + inputs = input_dict['source_tensors'][0] + source_length = input_dict['source_tensors'][1] + + with tf.variable_scope("encode"): + # prepare encoder graph + if len(self.layers) == 0: + knum_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[0] + kwidth_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[1] + + with tf.variable_scope("embedding"): + self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights( + vocab_size=self._src_vocab_size, + hidden_size=self._src_emb_size, + pad_vocab_to_eight=self._pad2eight, + init_var=0.1, + embed_scale=False, + pad_sym=self._pad_sym, + mask_paddings=False) + + with tf.variable_scope("pos_embedding"): + self.position_embedding_layer = embedding_layer.EmbeddingSharedWeights( + vocab_size=self.params.get("max_input_length", MAX_INPUT_LENGTH), + hidden_size=self._src_emb_size, + pad_vocab_to_eight=self._pad2eight, + init_var=0.1, + embed_scale=False, + pad_sym=self._pad_sym, + mask_paddings=False) + + # linear projection before cnn layers + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + self._src_emb_size, + knum_list[0], + dropout=self.params["embedding_dropout_keep_prob"], + var_scope_name="linear_mapping_before_cnn_layers")) + + for i in range(self.params['encoder_layers']): + in_dim = knum_list[i] if i == 0 else knum_list[i - 1] + out_dim = knum_list[i] + + # linear projection is needed for residual connections if + # input and output of a cnn layer do not match + if in_dim != out_dim: + linear_proj = ffn_wn_layer.FeedFowardNetworkNormalized( + in_dim, + out_dim, + var_scope_name="linear_mapping_cnn_" + str(i + 1), + dropout=1.0) + else: + linear_proj = None + + conv_layer = conv_wn_layer.Conv1DNetworkNormalized( + in_dim, + out_dim, + kernel_width=kwidth_list[i], + mode=self.mode, + layer_id=i + 1, + hidden_dropout=self.params["hidden_dropout_keep_prob"], + conv_padding="SAME", + decode_padding=False) + + self.layers.append([linear_proj, conv_layer]) + + # linear projection after cnn layers + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + knum_list[self.params['encoder_layers'] - 1], + self._src_emb_size, + dropout=1.0, + var_scope_name="linear_mapping_after_cnn_layers")) + + encoder_inputs = self.embedding_softmax_layer(inputs) + inputs_attention_bias = get_padding_bias( + inputs, res_rank=3, pad_sym=self._pad_sym) + + with tf.name_scope("add_pos_encoding"): + pos_input = tf.range( + 0, + tf.shape(encoder_inputs)[1], + delta=1, + dtype=tf.int32, + name='range') + pos_encoding = self.position_embedding_layer(pos_input) + encoder_inputs = encoder_inputs + tf.cast( + x=pos_encoding, dtype=encoder_inputs.dtype) + + if self.mode == "train": + encoder_inputs = tf.nn.dropout( + encoder_inputs, self.params["embedding_dropout_keep_prob"]) + + # mask the paddings in the input given to cnn layers + inputs_padding = get_padding( + inputs, self._pad_sym, dtype=encoder_inputs.dtype) + padding_mask = tf.expand_dims(1 - inputs_padding, 2) + encoder_inputs *= padding_mask + + # disables padding masks in middle layers + # padding_mask = None + outputs, outputs_b, final_state = self._call(encoder_inputs, padding_mask) + + return { + 'outputs': outputs, + 
'outputs_b': outputs_b, + 'inputs_attention_bias_cs2s': inputs_attention_bias, + 'state': final_state, + 'src_lengths': source_length, # should it include paddings or not? + 'embedding_softmax_layer': self.embedding_softmax_layer, + # TODO: Should we share position embedding? + # 'position_embedding_layer': self.position_embedding_layer, + 'encoder_input': inputs + } + + def _call(self, encoder_inputs, padding_mask): + # Run inputs through the sublayers. + with tf.variable_scope("linear_layer_before_cnn_layers"): + outputs = self.layers[0](encoder_inputs) + + for i in range(1, len(self.layers) - 1): + linear_proj, conv_layer = self.layers[i] + + with tf.variable_scope("layer_%d" % i): + if padding_mask is not None: + outputs *= padding_mask + if linear_proj is not None: + res_inputs = linear_proj(outputs) + else: + res_inputs = outputs + outputs = conv_layer(outputs) + outputs = (outputs + res_inputs) * math.sqrt(0.5) + + with tf.variable_scope("linear_layer_after_cnn_layers"): + outputs = self.layers[-1](outputs) + + if padding_mask is not None: + outputs *= padding_mask + + # Gradients are scaled as the gradients from + # all decoder attention layers enters the encoder + scale = 1.0 / ( + 2.0 * self.params.get("att_layer_num", self.params["encoder_layers"])) + outputs = (1.0 - scale) * tf.stop_gradient(outputs) + scale * outputs + + outputs_b = (outputs + encoder_inputs) * math.sqrt(0.5) + + if padding_mask is not None: + outputs_b *= padding_mask + + # Average of the encoder outputs is calculated as the final state of the encoder + # it can be used for decoders which just accept the final state + final_state = tf.reduce_mean(outputs_b, 1) + return outputs, outputs_b, final_state + + @property + def src_vocab_size(self): + return self._src_vocab_size + + @property + def src_emb_size(self): + return self._src_emb_size diff --git a/open_seq2seq/models/text2text_test.py b/open_seq2seq/models/text2text_test.py index 75c77f220..cbd244134 100644 --- a/open_seq2seq/models/text2text_test.py +++ b/open_seq2seq/models/text2text_test.py @@ -21,7 +21,7 @@ def tearDown(self): def test_train(self): config_module = runpy.run_path( - "./example_configs/text2text/nmt-reversal-RR.py") + "./example_configs/text2text/toy-reversal/nmt-reversal-RR.py") train_config = config_module['base_params'] if 'train_params' in config_module: train_config.update(config_module['train_params']) @@ -70,7 +70,7 @@ def test_train(self): print("Attempting BasicSeq2SeqWithAttention on Horovod") hvd.init() config_module = runpy.run_path( - "./example_configs/text2text/nmt-reversal-RR.py") + "./example_configs/text2text/toy-reversal/nmt-reversal-RR.py") train_config = config_module['base_params'] if 'train_params' in config_module: train_config.update(config_module['train_params']) diff --git a/open_seq2seq/parts/convs2s/__init__.py b/open_seq2seq/parts/convs2s/__init__.py new file mode 100644 index 000000000..f6874261b --- /dev/null +++ b/open_seq2seq/parts/convs2s/__init__.py @@ -0,0 +1,3 @@ +from . import ffn_wn_layer +from . import conv_wn_layer +from . import attention_wn_layer diff --git a/open_seq2seq/parts/convs2s/attention_wn_layer.py b/open_seq2seq/parts/convs2s/attention_wn_layer.py new file mode 100644 index 000000000..89d9c3c6e --- /dev/null +++ b/open_seq2seq/parts/convs2s/attention_wn_layer.py @@ -0,0 +1,90 @@ +"""Implementation of the attention layer for convs2s. 
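+In short: the decoder state is projected into the target embedding space,
+combined with the target embeddings, matched against the first encoder
+outputs (keys) with a dot product, and the resulting attention over the
+second encoder outputs (values) is projected back to the input dimension.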
+Inspired from https://github.com/tobyyouup/conv_seq2seq""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import tensorflow as tf +import math +from open_seq2seq.parts.convs2s.ffn_wn_layer import FeedFowardNetworkNormalized + + +class AttentionLayerNormalized(tf.layers.Layer): + """Attention layer for convs2s with weight normalization""" + + def __init__(self, in_dim, embed_size, layer_id, add_res): + """initializes the attention layer. + It uses weight normalization for linear projections + (Salimans & Kingma, 2016) w = g * v/2-norm(v) + + Args: + in_dim: int last dimension of the inputs + embed_size: int target embedding size + layer_id: int the id of current convolution layer + add_res: bool whether residual connection should be added or not + """ + super(AttentionLayerNormalized, self).__init__() + + self.add_res = add_res + with tf.variable_scope("attention_layer_" + str(layer_id)): + + # linear projection layer to project the attention input to target space + self.tgt_embed_proj = FeedFowardNetworkNormalized( + in_dim, + embed_size, + dropout=1.0, + var_scope_name="att_linear_mapping_tgt_embed") + + # linear projection layer to project back to the input space + self.out_proj = FeedFowardNetworkNormalized( + embed_size, + in_dim, + dropout=1.0, + var_scope_name="att_linear_mapping_out") + + def call(self, input, target_embed, encoder_output_a, encoder_output_b, + input_attention_bias): + """Calculates the attention vectors. + + Args: + input: A float32 tensor with shape [batch_size, length, in_dim] + target_embed: A float32 tensor with shape [batch_size, length, in_dim] + containing the target embeddings + encoder_output_a: A float32 tensor with shape [batch_size, length, out_dim] + containing the first encoder outputs, uses as the keys + encoder_output_b: A float32 tensor with shape [batch_size, length, src_emb_dim] + containing the second encoder outputs, uses as the values + input_attention_bias: A float32 tensor with shape [batch_size, length, 1] + containing the bias used to mask the paddings + + Returns: + float32 tensor with shape [batch_size, length, out_dim]. + """ + + h_proj = self.tgt_embed_proj(input) + d_proj = (h_proj + target_embed) * math.sqrt(0.5) + att_score = tf.matmul(d_proj, encoder_output_a, transpose_b=True) + + # Masking need to be done in float32. Added to support mixed-precision training. + att_score = tf.cast(x=att_score, dtype=tf.float32) + + # mask out the paddings + if input_attention_bias is not None: + att_score = att_score + input_attention_bias + + att_score = tf.nn.softmax(att_score) + + # Cast back to original type + att_score = tf.cast(x=att_score, dtype=encoder_output_b.dtype) + + length = tf.cast(tf.shape(encoder_output_b), encoder_output_b.dtype) + output = tf.matmul(att_score, encoder_output_b) * \ + length[1] * tf.cast(tf.sqrt(1.0 / length[1]), dtype=encoder_output_b.dtype) + output = self.out_proj(output) + + if self.add_res: + output = (output + input) * math.sqrt(0.5) + + return output diff --git a/open_seq2seq/parts/convs2s/conv_wn_layer.py b/open_seq2seq/parts/convs2s/conv_wn_layer.py new file mode 100644 index 000000000..1c18a3b19 --- /dev/null +++ b/open_seq2seq/parts/convs2s/conv_wn_layer.py @@ -0,0 +1,103 @@ +"""Implementation of a 1d convolutional layer with weight normalization. 
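+The convolution emits 2 * out_dim channels which are split in half and
+combined with a gated linear unit, i.e. output = A * sigmoid(B) where
+[A, B] is the channel-wise split of the convolution output.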
+Inspired from https://github.com/tobyyouup/conv_seq2seq""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import tensorflow as tf +import math + + +class Conv1DNetworkNormalized(tf.layers.Layer): + """1D convolutional layer with weight normalization""" + + def __init__(self, in_dim, out_dim, kernel_width, mode, layer_id, + hidden_dropout, conv_padding, decode_padding): + """initializes the 1D convolution layer. + It uses weight normalization (Salimans & Kingma, 2016) w = g * v/2-norm(v) + + Args: + in_dim: int last dimension of the inputs + out_dim: int new dimension for the output + kernel_width: int width of kernel + mode: str the current mode + layer_id: int the id of current convolution layer + hidden_dropout: float the keep-dropout value used on the input. + Give 1.0 if no dropout. + It is used to initialize the weights of convolution. + conv_padding: str the type of padding done for convolution + decode_padding: bool specifies if this convolution layer is in decoder or not + in decoder padding is done explicitly before convolution + """ + + super(Conv1DNetworkNormalized, self).__init__() + self.mode = mode + self.conv_padding = conv_padding + self.decode_padding = decode_padding + self.hidden_dropout = hidden_dropout + self.kernel_width = kernel_width + + with tf.variable_scope("conv_layer_" + str(layer_id)): + V_std = math.sqrt(4.0 * hidden_dropout / (kernel_width * in_dim)) + self.V = tf.get_variable( + 'V', + shape=[kernel_width, in_dim, 2 * out_dim], + initializer=tf.random_normal_initializer(mean=0, stddev=V_std), + trainable=True) + self.V_norm = tf.norm(self.V.initialized_value(), axis=[0, 1]) + self.g = tf.get_variable('g', initializer=self.V_norm, trainable=True) + self.b = tf.get_variable( + 'b', + shape=[2 * out_dim], + initializer=tf.zeros_initializer(), + trainable=True) + + self.W = tf.reshape(self.g, [1, 1, 2 * out_dim]) * tf.nn.l2_normalize( + self.V, [0, 1]) + + def call(self, input): + """Applies convolution with gated linear units on x. + + Args: + x: A float32 tensor with shape [batch_size, length, in_dim] + + Returns: + float32 tensor with shape [batch_size, length, out_dim]. + """ + x = input + if self.mode == "train": + x = tf.nn.dropout(x, self.hidden_dropout) + + if self.decode_padding: + x = tf.pad( + x, [[0, 0], [self.kernel_width - 1, self.kernel_width - 1], [0, 0]], + "CONSTANT") + + output = tf.nn.bias_add( + tf.nn.conv1d( + value=x, filters=self.W, stride=1, padding=self.conv_padding), + self.b) + + if self.decode_padding and self.kernel_width > 1: + output = output[:, 0:-self.kernel_width + 1, :] + + output = self.gated_linear_units(output) + + return output + + def gated_linear_units(self, inputs): + """Gated Linear Units (GLU) on x. + + Args: + x: A float32 tensor with shape [batch_size, length, 2*out_dim] + Returns: + float32 tensor with shape [batch_size, length, out_dim]. + """ + input_shape = inputs.get_shape().as_list() + assert len(input_shape) == 3 + input_pass = inputs[:, :, 0:int(input_shape[2] / 2)] + input_gate = inputs[:, :, int(input_shape[2] / 2):] + input_gate = tf.sigmoid(input_gate) + return tf.multiply(input_pass, input_gate) diff --git a/open_seq2seq/parts/convs2s/ffn_wn_layer.py b/open_seq2seq/parts/convs2s/ffn_wn_layer.py new file mode 100644 index 000000000..27da7a159 --- /dev/null +++ b/open_seq2seq/parts/convs2s/ffn_wn_layer.py @@ -0,0 +1,68 @@ +"""Implementation of fully connected network with weight normalization. 
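+Each output column of the weight matrix V is rescaled by a learned gain g,
+so the layer computes y = x * V * (g / ||V||_2) + b with the norm taken
+per output unit.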
+Inspired from https://github.com/tobyyouup/conv_seq2seq""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import tensorflow as tf +import math + + +class FeedFowardNetworkNormalized(tf.layers.Layer): + """Fully connected feedforward network with weight normalization""" + + def __init__(self, in_dim, out_dim, dropout, var_scope_name): + """initializes the linear layer. + This layer projects from in_dim-dimenstional space to out_dim-dimentional space. + It uses weight normalization (Salimans & Kingma, 2016) w = g * v/2-norm(v) + + Args: + in_dim: int last dimension of the inputs + out_dim: int new dimension for the output + dropout: float the keep-dropout value used in the previous layer. + It is used to initialize the weights. Give 1.0 if no dropout. + var_scope_name: str the scope name for the weight variables + """ + super(FeedFowardNetworkNormalized, self).__init__() + self.out_dim = out_dim + self.in_dim = in_dim + + with tf.variable_scope(var_scope_name): + V_initializer = \ + tf.random_normal_initializer(mean=0, stddev=math.sqrt(dropout * 1.0 / in_dim)) + self.V = tf.get_variable( + 'V', + shape=[in_dim, out_dim], + initializer=V_initializer, + trainable=True) + self.V_norm = tf.norm(self.V.initialized_value(), axis=0) + self.g = tf.get_variable('g', initializer=self.V_norm, trainable=True) + self.b = tf.get_variable( + 'b', + shape=[out_dim], + initializer=tf.zeros_initializer(), + trainable=True) + + def call(self, x): + """Projects x with its linear transformation. + + Args: + x: A float32 tensor with shape [batch_size, length, in_dim] + + Returns: + float32 tensor with shape [batch_size, length, out_dim]. + """ + batch_size = tf.shape(x)[0] + + x = tf.reshape(x, [-1, self.in_dim]) + output = tf.matmul(x, self.V) + output = tf.reshape(output, [batch_size, -1, self.out_dim]) + + # x*(v*(g/2-norm(v))) + b + scaler = tf.div(self.g, tf.norm(self.V, axis=0)) + output = tf.reshape(scaler, [1, self.out_dim]) * output + \ + tf.reshape(self.b, [1, self.out_dim]) + + return output diff --git a/open_seq2seq/parts/transformer/embedding_layer.py b/open_seq2seq/parts/transformer/embedding_layer.py index 23f7c2177..0966cdde7 100644 --- a/open_seq2seq/parts/transformer/embedding_layer.py +++ b/open_seq2seq/parts/transformer/embedding_layer.py @@ -26,24 +26,31 @@ class EmbeddingSharedWeights(tf.layers.Layer): """Calculates input embeddings and pre-softmax linear with shared weights.""" - def __init__(self, vocab_size, hidden_size, pad_vocab_to_eight=False): + def __init__(self, vocab_size, hidden_size, pad_vocab_to_eight=False, init_var=None, + embed_scale=True, pad_sym=0, mask_paddings=True): super(EmbeddingSharedWeights, self).__init__() self.hidden_size = hidden_size + self.embed_scale = embed_scale + self.pad_sym = pad_sym + self.mask_paddings = mask_paddings + padf = lambda x: x if x % 8 == 0 else x + 8 - x % 8 if pad_vocab_to_eight: self.vocab_size = padf(vocab_size) else: self.vocab_size = vocab_size + if init_var is None: + self.init_var = hidden_size ** -0.5 + else: + self.init_var = init_var def build(self, _): with tf.variable_scope("embedding_and_softmax", reuse=tf.AUTO_REUSE): # Create and initialize weights. The random normal initializer was chosen # randomly, and works well. 
- self.shared_weights = tf.get_variable( - "weights", [self.vocab_size, self.hidden_size], - initializer=tf.random_normal_initializer( - 0., self.hidden_size ** -0.5)) + self.shared_weights = tf.get_variable("weights", [self.vocab_size, self.hidden_size], + initializer=tf.random_normal_initializer(0., self.init_var)) self.built = True @@ -59,18 +66,18 @@ def call(self, x): """ with tf.name_scope("embedding"): embeddings = tf.gather(self.shared_weights, x) - - # Scale embedding by the sqrt of the hidden size - embeddings *= self.hidden_size ** 0.5 - - # Create binary array of size [batch_size, length] - # where 1 = padding, 0 = not padding - padding = model_utils.get_padding(x) - - # Set all padding embedding values to 0 - #embeddings *= tf.expand_dims(1 - padding, -1) - embeddings *= tf.cast(tf.expand_dims(1 - padding, -1), - dtype=embeddings.dtype) + if self.embed_scale: + # Scale embedding by the sqrt of the hidden size + embeddings *= self.hidden_size ** 0.5 + + if self.mask_paddings: + # Create binary array of size [batch_size, length] + # where 1 = padding, 0 = not padding + padding = model_utils.get_padding(x, padding_value=self.pad_sym) + + # Set all padding embedding values to 0 + #embeddings *= tf.expand_dims(1 - padding, -1) + embeddings *= tf.cast(tf.expand_dims(1.0 - padding, -1), dtype=embeddings.dtype) return embeddings def linear(self, x): diff --git a/open_seq2seq/parts/transformer/utils.py b/open_seq2seq/parts/transformer/utils.py index 467ff319a..cef61af1d 100644 --- a/open_seq2seq/parts/transformer/utils.py +++ b/open_seq2seq/parts/transformer/utils.py @@ -75,23 +75,23 @@ def get_decoder_self_attention_bias(length): return decoder_bias -def get_padding(x, padding_value=0): +def get_padding(x, padding_value=0, dtype=tf.float32): """Return float tensor representing the padding values in x. Args: x: int tensor with any shape padding_value: int value that + dtype: type of the output Returns: flaot tensor with same shape as x containing values 0 or 1. 0 -> non-padding, 1 -> padding """ with tf.name_scope("padding"): - return tf.to_float(tf.equal(x, padding_value)) - #return tf.cast(tf.equal(x, padding_value), dtype=x.dtype) + return tf.cast(tf.equal(x, padding_value), dtype=dtype) -def get_padding_bias(x): +def get_padding_bias(x, res_rank=4, pad_sym=0): """Calculate bias tensor from padding values in tensor. Bias tensor that is added to the pre-softmax multi-headed attention logits, @@ -100,14 +100,22 @@ def get_padding_bias(x): Args: x: int tensor with shape [batch_size, length] + res_rank: int indicates the rank of attention_bias. + dtype: type of the output attention_bias + pad_sym: int the symbol used for padding Returns: - Attention bias tensor of shape [batch_size, 1, 1, length]. 
+ Attention bias tensor of shape + [batch_size, 1, 1, length] if res_rank = 4 - for Transformer + or [batch_size, 1, length] if res_rank = 3 - for ConvS2S """ with tf.name_scope("attention_bias"): - padding = get_padding(x) + padding = get_padding(x, padding_value=pad_sym) attention_bias = padding * _NEG_INF - attention_bias = tf.expand_dims( - tf.expand_dims(attention_bias, axis=1), axis=1) + if res_rank == 4: + attention_bias = tf.expand_dims(tf.expand_dims(attention_bias, axis=1), axis=1) + elif res_rank == 3: + attention_bias = tf.expand_dims(attention_bias, axis=1) + else: + raise ValueError("res_rank should be 3 or 4 but got {}".format(res_rank)) return attention_bias - diff --git a/open_seq2seq/utils/utils.py b/open_seq2seq/utils/utils.py index b9db4ff6d..5900783c8 100644 --- a/open_seq2seq/utils/utils.py +++ b/open_seq2seq/utils/utils.py @@ -4,6 +4,7 @@ from six.moves import range from six import string_types +import six import tensorflow as tf import subprocess import numpy as np @@ -335,7 +336,10 @@ def mask_nans(x): def deco_print(line, offset=0, start="*** ", end='\n'): - print(start + " " * offset + line, end=end) + if six.PY2: + print((start + " " * offset + line).encode('utf-8'), end=end) + else: + print(start + " " * offset + line, end=end) def array_to_string(row, vocab, delim=' '): diff --git a/run.py b/run.py index 8dd3506a6..60108f799 100644 --- a/run.py +++ b/run.py @@ -260,4 +260,4 @@ def main(): if __name__ == '__main__': - main() + main() \ No newline at end of file From ccf95bbeeeaf8c87fda5db6d47982d35e00049ea Mon Sep 17 00:00:00 2001 From: Kipok Date: Fri, 29 Jun 2018 10:13:22 -0700 Subject: [PATCH 093/102] Update API docs structure --- docs/sources/source/api-docs/decoders.rst | 8 +++++ docs/sources/source/api-docs/encoders.rst | 8 +++++ .../sources/source/api-docs/parts.convs2s.rst | 31 +++++++++++++++++++ docs/sources/source/api-docs/parts.rst | 1 + docs/sources/source/models-and-recipes.rst | 24 +++++++------- 5 files changed, 59 insertions(+), 13 deletions(-) create mode 100644 docs/sources/source/api-docs/parts.convs2s.rst diff --git a/docs/sources/source/api-docs/decoders.rst b/docs/sources/source/api-docs/decoders.rst index 07c22c247..681f5d1e1 100644 --- a/docs/sources/source/api-docs/decoders.rst +++ b/docs/sources/source/api-docs/decoders.rst @@ -37,3 +37,11 @@ transformer\_decoders :members: :undoc-members: :show-inheritance: + +convs2s\_decoder +------------------------------------- + +.. automodule:: decoders.convs2s_decoder + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/sources/source/api-docs/encoders.rst b/docs/sources/source/api-docs/encoders.rst index ba77b5532..faff6961b 100644 --- a/docs/sources/source/api-docs/encoders.rst +++ b/docs/sources/source/api-docs/encoders.rst @@ -38,6 +38,14 @@ transformer\_encoders :undoc-members: :show-inheritance: +convs2s\_encoder +------------------------------------- + +.. automodule:: encoders.convs2s_encoder + :members: + :undoc-members: + :show-inheritance: + resnet\_encoder ---------------------------------- diff --git a/docs/sources/source/api-docs/parts.convs2s.rst b/docs/sources/source/api-docs/parts.convs2s.rst new file mode 100644 index 000000000..226652c72 --- /dev/null +++ b/docs/sources/source/api-docs/parts.convs2s.rst @@ -0,0 +1,31 @@ +convs2s +======================================= + +.. 
automodule:: parts.convs2s + :members: + :undoc-members: + :show-inheritance: + +attention\_wn\_layer +------------------------------------------------------- + +.. automodule:: parts.convs2s.attention_wn_layer + :members: + :undoc-members: + :show-inheritance: + +conv\_wn\_layer +------------------------------------------------------- + +.. automodule:: parts.convs2s.conv_wn_layer + :members: + :undoc-members: + :show-inheritance: + +ffn\_wn\_layer +------------------------------------------------------- + +.. automodule:: parts.convs2s.ffn_wn_layer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/sources/source/api-docs/parts.rst b/docs/sources/source/api-docs/parts.rst index 3f85cb82a..f6ad481d9 100644 --- a/docs/sources/source/api-docs/parts.rst +++ b/docs/sources/source/api-docs/parts.rst @@ -10,3 +10,4 @@ parts parts.rnns parts.transformer + parts.convs2s \ No newline at end of file diff --git a/docs/sources/source/models-and-recipes.rst b/docs/sources/source/models-and-recipes.rst index 9c7ad7e67..4f6035158 100644 --- a/docs/sources/source/models-and-recipes.rst +++ b/docs/sources/source/models-and-recipes.rst @@ -3,9 +3,6 @@ Models and recipes ================== -.. This section will contain information about different models that OpenSeq2Seq -.. supports, exact config parameters to train them, final training/validation/test -.. metrics and links to checkpoints (tensorboards also?) of trained models. .. note:: Currently OpenSeq2Seq has model implementations for machine translation and @@ -13,12 +10,12 @@ Models and recipes We recommend you use :ref:`mixed precision training ` when training on Volta GPUs. -To train models you can use the following -commands (don't forget to substitute valid config_file path there). +To train models you can use the following commands (don't forget to substitute +valid config_file path there and number of GPUs if using Horovod). With Horovod (highly recommended when using multiple GPUs):: - mpirun --allow-run-as-root --mca orte_base_help_aggregate 0 -mca btl ^openib -np 4 -H localhost:4 -bind-to none -map-by slot -x LD_LIBRARY_PATH python run.py --config_file=... --mode=train_eval --use_horovod=True --enable_logs + mpiexec --allow-run-as-root -np python run.py --config_file=... --mode=train_eval --use_horovod=True --enable_logs Without Horovod:: @@ -38,17 +35,17 @@ Machine translation - Training setup and additional comments - Short description of the model - Checkpoint - * - `en-de-nmt-small.py `_ + * - `en-de-nmt-small.py `_ - 20.23 - This model should train on a single GPU such as 1080Ti. It is trained using Adam optimizer. - RNN-based. Bi-directional encoder with 2 layers and. GNMT-like decoder with 2 layers and attention. Uses LSTM cells of size 512. - `link `_ - * - `en-de-gnmt-like-4GPUs.py `_ + * - `en-de-gnmt-like-4GPUs.py `_ - 23.89 - This model was trained on 4 GPUs with Adam optimizer and learning rate decay. - RNN-based. This is GNMT-like model which tries to match the one described in https://arxiv.org/abs/1609.08144 as close as possible. - `link `_ - * - `transformer-big.py `_ + * - `transformer-big.py `_ - 26.17 - This model was trained on 4 GPUs with Adam optimizer and learning rate decay. - Transformer "big" model. 
This model does not have any RNN layers @@ -61,11 +58,8 @@ We measure BLEU score on newstest2014.tok.de file using ``multi-bleu.perl`` scri Speech recognition ------------------ -Deep Speech 2 based models -~~~~~~~~~~~~~~~~~~~~~~~~~~ -Original Deep Speech 2 model description: https://arxiv.org/abs/1512.02595. The table below contains description and results of -Deep Speech 2 based models available in OpenSeq2Seq. +speech recognition models available in OpenSeq2Seq. WER is the word error rate obtained on a dev-clean subset of LibriSpeech using greedy decoder (``decoder_params/use_language_model = False``). @@ -104,3 +98,7 @@ have a look at the `configuration files `_ + +Original Deep Speech 2 model description: https://arxiv.org/abs/1512.02595 . + +Original Wav2Letter model description: https://arxiv.org/abs/1609.03193 . \ No newline at end of file From be51b903c4f66d8bd29110b26b5465d2525c14d4 Mon Sep 17 00:00:00 2001 From: Ravi Teja Gadde Date: Fri, 29 Jun 2018 10:26:38 -0700 Subject: [PATCH 094/102] Wave2Letter V1 Implementation (#146) Wave2Letter Implementation --- .../speech2text/w2l_large_8gpus.py | 161 +++++++++++++ .../speech2text/w2l_large_8gpus_mp.py | 162 +++++++++++++ open_seq2seq/data/speech2text/speech2text.py | 2 +- open_seq2seq/data/speech2text/speech_utils.py | 11 + .../data/speech2text/speech_utils_test.py | 2 +- open_seq2seq/encoders/__init__.py | 2 +- open_seq2seq/encoders/ds2_encoder.py | 227 ++++++++---------- open_seq2seq/encoders/w2l_encoder.py | 155 ++++++++++++ open_seq2seq/models/speech2text_ds2_test.py | 55 +++++ open_seq2seq/models/speech2text_test.py | 139 +++++------ open_seq2seq/models/speech2text_w2l_test.py | 46 ++++ open_seq2seq/parts/cnns/__init__.py | 1 + open_seq2seq/parts/cnns/conv_blocks.py | 90 +++++++ .../test_speech_configs/__init__.py | 0 .../ds2_test_config.py} | 0 .../test_speech_configs/w2l_test_config.py | 103 ++++++++ open_seq2seq/utils/utils_test.py | 12 +- 17 files changed, 960 insertions(+), 208 deletions(-) create mode 100644 example_configs/speech2text/w2l_large_8gpus.py create mode 100644 example_configs/speech2text/w2l_large_8gpus_mp.py create mode 100644 open_seq2seq/encoders/w2l_encoder.py create mode 100644 open_seq2seq/models/speech2text_ds2_test.py create mode 100644 open_seq2seq/models/speech2text_w2l_test.py create mode 100644 open_seq2seq/parts/cnns/__init__.py create mode 100644 open_seq2seq/parts/cnns/conv_blocks.py create mode 100644 open_seq2seq/test_utils/test_speech_configs/__init__.py rename open_seq2seq/test_utils/{test_speech_config.py => test_speech_configs/ds2_test_config.py} (100%) create mode 100644 open_seq2seq/test_utils/test_speech_configs/w2l_test_config.py diff --git a/example_configs/speech2text/w2l_large_8gpus.py b/example_configs/speech2text/w2l_large_8gpus.py new file mode 100644 index 000000000..897bde1c9 --- /dev/null +++ b/example_configs/speech2text/w2l_large_8gpus.py @@ -0,0 +1,161 @@ +import tensorflow as tf +from open_seq2seq.models import Speech2Text +from open_seq2seq.encoders import Wave2LetterEncoder +from open_seq2seq.decoders import FullyConnectedCTCDecoder +from open_seq2seq.data import Speech2TextDataLayer +from open_seq2seq.losses import CTCLoss +from open_seq2seq.optimizers.lr_policies import poly_decay + + +base_model = Speech2Text + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 50, + + "num_gpus": 8, + "batch_size_per_gpu": 32, + + "save_summaries_steps": 100, + "print_loss_steps": 10, + "print_samples_steps": 2200, + "eval_steps": 2200, + 
"save_checkpoint_steps": 1000, + "logdir": "w2l_log_folder", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": poly_decay, + "lr_policy_params": { + "learning_rate": 0.001, + "power": 0.5, + }, + "larc_params": { + "larc_eta": 0.001, + }, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0005 + }, + + #"max_grad_norm": 15.0, + "dtype": tf.float32, + + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + "encoder": Wave2LetterEncoder, + "encoder_params": { + "convnet_layers": [ + { + "type": "conv1d", "repeat" : 5, + "kernel_size": [7], "stride": [1], + "num_channels": 200, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [11], "stride": [1], + "num_channels": 400, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [15], "stride": [1], + "num_channels": 400, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [19], "stride": [1], + "num_channels": 400, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [23], "stride": [1], + "num_channels": 600, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 1, + "kernel_size": [29], "stride": [1], + "num_channels": 800, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 1, + "kernel_size": [1], "stride": [1], + "num_channels": 1000, "padding": "SAME" + }, + ], + + "dropout_keep_prob": 0.8, + + "initializer": tf.contrib.layers.xavier_initializer, + "initializer_params": { + 'uniform': False, + }, + "normalization" : "batch_norm", + "activation_fn" : lambda x: tf.minimum(tf.nn.relu(x), 20.0), + "data_format": "channels_last", + }, + + "decoder": FullyConnectedCTCDecoder, + "decoder_params": { + "initializer": tf.contrib.layers.xavier_initializer, + "use_language_model": True, + + # params for decoding the sequence with language model + "beam_width": 512, + "lm_weight": 2.0, + "word_count_weight": 1.5, + "valid_word_count_weight": 2.5, + + "decoder_library_path": "ctc_decoder_with_lm/libctc_decoder_with_kenlm.so", + "lm_binary_path": "language_model/lm.binary", + "lm_trie_path": "language_model/trie", + "alphabet_config_path": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + }, + "loss": CTCLoss, + "loss_params": {}, +} + +train_params = { + "data_layer": Speech2TextDataLayer, + "data_layer_params": { + "num_audio_features": 40, + "input_type": "logfbank", + "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + "dataset_files": [ + "/data/librispeech/librivox-train-clean-100.csv", + "/data/librispeech/librivox-train-clean-360.csv", + "/data/librispeech/librivox-train-other-500.csv", + ], + "shuffle": True, + }, +} + +eval_params = { + "data_layer": Speech2TextDataLayer, + "data_layer_params": { + "num_audio_features": 40, + "input_type": "logfbank", + "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + "dataset_files": [ + "/data/librispeech/librivox-dev-clean.csv", + ], + "shuffle": False, + }, +} + +infer_params = { + "data_layer": Speech2TextDataLayer, + "data_layer_params": { + "num_audio_features": 40, + "input_type": "logfbank", + "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + "dataset_files": [ + "/data/librispeech/librivox-test-clean.csv", + ], + "shuffle": False, + }, +} diff --git a/example_configs/speech2text/w2l_large_8gpus_mp.py b/example_configs/speech2text/w2l_large_8gpus_mp.py new file mode 
100644 index 000000000..342fcd6ff --- /dev/null +++ b/example_configs/speech2text/w2l_large_8gpus_mp.py @@ -0,0 +1,162 @@ +import tensorflow as tf +from open_seq2seq.models import Speech2Text +from open_seq2seq.encoders import Wave2LetterEncoder +from open_seq2seq.decoders import FullyConnectedCTCDecoder +from open_seq2seq.data import Speech2TextDataLayer +from open_seq2seq.losses import CTCLoss +from open_seq2seq.optimizers.lr_policies import poly_decay + + +base_model = Speech2Text + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 50, + + "num_gpus": 8, + "batch_size_per_gpu": 32, + + "save_summaries_steps": 100, + "print_loss_steps": 10, + "print_samples_steps": 2200, + "eval_steps": 2200, + "save_checkpoint_steps": 1000, + "logdir": "w2l_log_folder", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": poly_decay, + "lr_policy_params": { + "learning_rate": 0.001, + "power": 0.5, + }, + "larc_params": { + "larc_eta": 0.001, + }, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0005 + }, + + #"max_grad_norm": 15.0, + "dtype": "mixed", + "loss_scaling": "Backoff", + + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + "encoder": Wave2LetterEncoder, + "encoder_params": { + "convnet_layers": [ + { + "type": "conv1d", "repeat" : 5, + "kernel_size": [7], "stride": [1], + "num_channels": 200, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [11], "stride": [1], + "num_channels": 400, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [15], "stride": [1], + "num_channels": 400, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [19], "stride": [1], + "num_channels": 400, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [23], "stride": [1], + "num_channels": 600, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 1, + "kernel_size": [29], "stride": [1], + "num_channels": 800, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 1, + "kernel_size": [1], "stride": [1], + "num_channels": 1000, "padding": "SAME" + }, + ], + + "dropout_keep_prob": 0.8, + + "initializer": tf.contrib.layers.xavier_initializer, + "initializer_params": { + 'uniform': False, + }, + "normalization" : "batch_norm", + "activation_fn" : lambda x: tf.minimum(tf.nn.relu(x), 20.0), + "data_format": "channels_last", + }, + + "decoder": FullyConnectedCTCDecoder, + "decoder_params": { + "initializer": tf.contrib.layers.xavier_initializer, + "use_language_model": True, + + # params for decoding the sequence with language model + "beam_width": 512, + "lm_weight": 2.0, + "word_count_weight": 1.5, + "valid_word_count_weight": 2.5, + + "decoder_library_path": "ctc_decoder_with_lm/libctc_decoder_with_kenlm.so", + "lm_binary_path": "language_model/lm.binary", + "lm_trie_path": "language_model/trie", + "alphabet_config_path": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + }, + "loss": CTCLoss, + "loss_params": {}, +} + +train_params = { + "data_layer": Speech2TextDataLayer, + "data_layer_params": { + "num_audio_features": 40, + "input_type": "logfbank", + "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + "dataset_files": [ + "/data/librispeech/librivox-train-clean-100.csv", + "/data/librispeech/librivox-train-clean-360.csv", + "/data/librispeech/librivox-train-other-500.csv", + ], + "shuffle": 
True, + }, +} + +eval_params = { + "data_layer": Speech2TextDataLayer, + "data_layer_params": { + "num_audio_features": 40, + "input_type": "logfbank", + "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + "dataset_files": [ + "/data/librispeech/librivox-dev-clean.csv", + ], + "shuffle": False, + }, +} + +infer_params = { + "data_layer": Speech2TextDataLayer, + "data_layer_params": { + "num_audio_features": 40, + "input_type": "logfbank", + "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + "dataset_files": [ + "/data/librispeech/librivox-test-clean.csv", + ], + "shuffle": False, + }, +} diff --git a/open_seq2seq/data/speech2text/speech2text.py b/open_seq2seq/data/speech2text/speech2text.py index 20dc94bd0..7baeb85a8 100644 --- a/open_seq2seq/data/speech2text/speech2text.py +++ b/open_seq2seq/data/speech2text/speech2text.py @@ -20,7 +20,7 @@ class Speech2TextDataLayer(DataLayer): def get_required_params(): return dict(DataLayer.get_required_params(), **{ 'num_audio_features': int, - 'input_type': ['spectrogram', 'mfcc'], + 'input_type': ['spectrogram', 'mfcc', 'logfbank'], 'vocab_file': str, 'dataset_files': list, }) diff --git a/open_seq2seq/data/speech2text/speech_utils.py b/open_seq2seq/data/speech2text/speech_utils.py index 1ef485807..d93d54348 100644 --- a/open_seq2seq/data/speech2text/speech_utils.py +++ b/open_seq2seq/data/speech2text/speech_utils.py @@ -153,6 +153,17 @@ def get_speech_features(signal, fs, num_features, pad_to=8, preemph=0.97, ceplifter=2*num_features, appendEnergy=False) + + elif features_type == 'logfbank': + features = psf.logfbank(signal=signal, + samplerate=fs, + winlen=window_size, + winstep=window_stride, + nfilt=num_features, + nfft=512, + lowfreq=0, highfreq=fs/2, + preemph=0.97) + else: raise ValueError('Unknown features type: {}'.format(features_type)) diff --git a/open_seq2seq/data/speech2text/speech_utils_test.py b/open_seq2seq/data/speech2text/speech_utils_test.py index e457578cb..33b8ca7ea 100644 --- a/open_seq2seq/data/speech2text/speech_utils_test.py +++ b/open_seq2seq/data/speech2text/speech_utils_test.py @@ -47,7 +47,7 @@ def test_get_speech_features_from_file(self): for num_features in [161, 120]: for window_stride in [10e-3, 5e-3, 40e-3]: for window_size in [20e-3, 30e-3]: - for features_type in ['spectrogram', 'mfcc']: + for features_type in ['spectrogram', 'mfcc', 'logfbank']: fs, signal = wave.read(filename) n_window_size = int(fs * window_size) n_window_stride = int(fs * window_stride) diff --git a/open_seq2seq/encoders/__init__.py b/open_seq2seq/encoders/__init__.py index bae57227e..827ef1c32 100644 --- a/open_seq2seq/encoders/__init__.py +++ b/open_seq2seq/encoders/__init__.py @@ -11,6 +11,6 @@ from .transformer_encoder import TransformerEncoder from .ds2_encoder import DeepSpeech2Encoder from .resnet_encoder import ResNetEncoder - +from .w2l_encoder import Wave2LetterEncoder from .convs2s_encoder import ConvS2SEncoder #from .convs2s_encoder_old import ConvS2SEncoder diff --git a/open_seq2seq/encoders/ds2_encoder.py b/open_seq2seq/encoders/ds2_encoder.py index ec093fbae..0fd455d00 100644 --- a/open_seq2seq/encoders/ds2_encoder.py +++ b/open_seq2seq/encoders/ds2_encoder.py @@ -7,41 +7,14 @@ from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops from .encoder import Encoder - - -def conv2d_bn_actv(name, inputs, filters, kernel_size, activation_fn, strides, - padding, regularizer, training, data_format, bn_momentum, - bn_epsilon): - """Helper function that applies convolution, batch norm and 
activation.""" - conv = tf.layers.conv2d( - name="{}".format(name), - inputs=inputs, - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - kernel_regularizer=regularizer, - use_bias=False, - data_format=data_format, - ) - bn = tf.layers.batch_normalization( - name="{}/bn".format(name), - inputs=conv, - gamma_regularizer=regularizer, - training=training, - axis=-1 if data_format == 'channels_last' else 1, - momentum=bn_momentum, - epsilon=bn_epsilon, - ) - output = activation_fn(bn) - return output +from open_seq2seq.parts.cnns.conv_blocks import conv_bn_actv def rnn_cell(rnn_cell_dim, layer_type, dropout_keep_prob=1.0): """Helper function that creates RNN cell.""" if layer_type == "layernorm_lstm": cell = tf.contrib.rnn.LayerNormBasicLSTMCell( - num_units=rnn_cell_dim, dropout_keep_prob=dropout_keep_prob) + num_units=rnn_cell_dim, dropout_keep_prob=dropout_keep_prob) else: if layer_type == "lstm": cell = tf.nn.rnn_cell.BasicLSTMCell(rnn_cell_dim) @@ -55,7 +28,7 @@ def rnn_cell(rnn_cell_dim, layer_type, dropout_keep_prob=1.0): raise ValueError("Error: not supported rnn type:{}".format(layer_type)) cell = tf.nn.rnn_cell.DropoutWrapper( - cell, output_keep_prob=dropout_keep_prob) + cell, output_keep_prob=dropout_keep_prob) return cell @@ -75,28 +48,28 @@ def row_conv(name, input_layer, batch, channels, width, activation_fn, x = tf.cast(x, tf.float32) cast_back = True filters = tf.get_variable( - name+'/w', - shape=[width, 1, channels, 1], - regularizer=regularizer, - dtype=tf.float32, + name + '/w', + shape=[width, 1, channels, 1], + regularizer=regularizer, + dtype=tf.float32, ) strides = [1, 1, 1, 1] y = tf.nn.depthwise_conv2d( - name=name + '/conv', - input=x, - filter=filters, - strides=strides, - padding='SAME', - data_format='NHWC' if data_format == 'channels_last' else 'NCHW', + name=name + '/conv', + input=x, + filter=filters, + strides=strides, + padding='SAME', + data_format='NHWC' if data_format == 'channels_last' else 'NCHW', ) bn = tf.layers.batch_normalization( - name="{}/bn".format(name), - inputs=y, - gamma_regularizer=regularizer, - training=training, - axis=-1 if data_format == 'channels_last' else 1, - momentum=bn_momentum, - epsilon=bn_epsilon, + name="{}/bn".format(name), + inputs=y, + gamma_regularizer=regularizer, + training=training, + axis=-1 if data_format == 'channels_last' else 1, + momentum=bn_momentum, + epsilon=bn_epsilon, ) output = activation_fn(bn) if data_format == 'channels_first': @@ -112,25 +85,25 @@ class DeepSpeech2Encoder(Encoder): @staticmethod def get_required_params(): return dict(Encoder.get_required_params(), **{ - 'dropout_keep_prob': float, - 'conv_layers': list, - 'activation_fn': None, # any valid callable - 'num_rnn_layers': int, - 'row_conv': bool, - 'n_hidden': int, - 'use_cudnn_rnn': bool, - 'rnn_cell_dim': int, - 'rnn_type': ['layernorm_lstm', 'lstm', 'gru', 'cudnn_gru', 'cudnn_lstm'], - 'rnn_unidirectional': bool, + 'dropout_keep_prob': float, + 'conv_layers': list, + 'activation_fn': None, # any valid callable + 'num_rnn_layers': int, + 'row_conv': bool, + 'n_hidden': int, + 'use_cudnn_rnn': bool, + 'rnn_cell_dim': int, + 'rnn_type': ['layernorm_lstm', 'lstm', 'gru', 'cudnn_gru', 'cudnn_lstm'], + 'rnn_unidirectional': bool, }) @staticmethod def get_optional_params(): return dict(Encoder.get_optional_params(), **{ - 'row_conv_width': int, - 'data_format': ['channels_first', 'channels_last'], - 'bn_momentum': float, - 'bn_epsilon': float, + 'row_conv_width': int, + 'data_format': ['channels_first', 
'channels_last'], + 'bn_momentum': float, + 'bn_epsilon': float, }) def __init__(self, params, model, name="ds2_encoder", mode='train'): @@ -214,8 +187,8 @@ def _encode(self, input_dict): top_layer = input_layer else: top_layer = tf.transpose(input_layer, [0, 3, 1, 2]) - - # ----- Convolutional layers ----------------------------------------------- + + # ----- Convolutional layers --------------------------------------------- conv_layers = self.params['conv_layers'] for idx_conv in range(len(conv_layers)): @@ -229,19 +202,20 @@ def _encode(self, input_dict): else: src_length = (src_length + strides[0] - 1) // strides[0] - top_layer = conv2d_bn_actv( - name="conv{}".format(idx_conv + 1), - inputs=top_layer, - filters=ch_out, - kernel_size=kernel_size, - activation_fn=self.params['activation_fn'], - strides=strides, - padding=padding, - regularizer=regularizer, - training=training, - data_format=data_format, - bn_momentum=bn_momentum, - bn_epsilon=bn_epsilon, + top_layer = conv_bn_actv( + type="conv2d", + name="conv{}".format(idx_conv + 1), + inputs=top_layer, + filters=ch_out, + kernel_size=kernel_size, + activation_fn=self.params['activation_fn'], + strides=strides, + padding=padding, + regularizer=regularizer, + training=training, + data_format=data_format, + bn_momentum=bn_momentum, + bn_epsilon=bn_epsilon, ) if data_format == 'channels_first': top_layer = tf.transpose(top_layer, [0, 2, 3, 1]) @@ -267,55 +241,56 @@ def _encode(self, input_dict): if rnn_type == "cudnn_gru" or rnn_type == "gru": rnn_block = tf.contrib.cudnn_rnn.CudnnGRU( - num_layers=num_rnn_layers, - num_units=rnn_cell_dim, - direction=direction, - dropout=1.0 - dropout_keep_prob, - dtype=rnn_input.dtype, - name="cudnn_gru", + num_layers=num_rnn_layers, + num_units=rnn_cell_dim, + direction=direction, + dropout=1.0 - dropout_keep_prob, + dtype=rnn_input.dtype, + name="cudnn_gru", ) elif rnn_type == "cudnn_lstm" or rnn_type == "lstm": rnn_block = tf.contrib.cudnn_rnn.CudnnLSTM( - num_layers=num_rnn_layers, - num_units=rnn_cell_dim, - direction=direction, - dropout=1.0 - dropout_keep_prob, - dtype=rnn_input.dtype, - name="cudnn_lstm", + num_layers=num_rnn_layers, + num_units=rnn_cell_dim, + direction=direction, + dropout=1.0 - dropout_keep_prob, + dtype=rnn_input.dtype, + name="cudnn_lstm", ) else: raise ValueError( - "{} is not a valid rnn_type for cudnn_rnn layers".format(rnn_type) + "{} is not a valid rnn_type for cudnn_rnn layers".format( + rnn_type) ) top_layer, state = rnn_block(rnn_input) top_layer = tf.transpose(top_layer, [1, 0, 2]) else: rnn_input = top_layer multirnn_cell_fw = tf.nn.rnn_cell.MultiRNNCell( - [rnn_cell(rnn_cell_dim=rnn_cell_dim, layer_type=rnn_type, - dropout_keep_prob=dropout_keep_prob) - for _ in range(num_rnn_layers)] + [rnn_cell(rnn_cell_dim=rnn_cell_dim, layer_type=rnn_type, + dropout_keep_prob=dropout_keep_prob) + for _ in range(num_rnn_layers)] ) if self.params['rnn_unidirectional']: top_layer, state = tf.nn.dynamic_rnn( - cell=multirnn_cell_fw, - inputs=rnn_input, - sequence_length=src_length, - dtype=rnn_input.dtype, - time_major=False, + cell=multirnn_cell_fw, + inputs=rnn_input, + sequence_length=src_length, + dtype=rnn_input.dtype, + time_major=False, ) else: multirnn_cell_bw = tf.nn.rnn_cell.MultiRNNCell( - [rnn_cell(rnn_cell_dim=rnn_cell_dim, layer_type=rnn_type, - dropout_keep_prob=dropout_keep_prob) - for _ in range(num_rnn_layers)] + [rnn_cell(rnn_cell_dim=rnn_cell_dim, layer_type=rnn_type, + dropout_keep_prob=dropout_keep_prob) + for _ in range(num_rnn_layers)] ) top_layer, state 
= tf.nn.bidirectional_dynamic_rnn( - cell_fw=multirnn_cell_fw, cell_bw=multirnn_cell_bw, - inputs=rnn_input, - sequence_length=src_length, - dtype=rnn_input.dtype, - time_major=False + cell_fw=multirnn_cell_fw, cell_bw=multirnn_cell_bw, + inputs=rnn_input, + sequence_length=src_length, + dtype=rnn_input.dtype, + time_major=False ) # concat 2 tensors [B, T, n_cell_dim] --> [B, T, 2*n_cell_dim] top_layer = tf.concat(top_layer, 2) @@ -324,41 +299,41 @@ def _encode(self, input_dict): if self.params['row_conv']: channels = top_layer.get_shape().as_list()[-1] top_layer = row_conv( - name="row_conv", - input_layer=top_layer, - batch=batch_size, - channels=channels, - activation_fn=self.params['activation_fn'], - width=self.params['row_conv_width'], - regularizer=regularizer, - training=training, - data_format=data_format, - bn_momentum=bn_momentum, - bn_epsilon=bn_epsilon, + name="row_conv", + input_layer=top_layer, + batch=batch_size, + channels=channels, + activation_fn=self.params['activation_fn'], + width=self.params['row_conv_width'], + regularizer=regularizer, + training=training, + data_format=data_format, + bn_momentum=bn_momentum, + bn_epsilon=bn_epsilon, ) # Reshape [B, T, C] --> [B*T, C] c = top_layer.get_shape().as_list()[-1] top_layer = tf.reshape(top_layer, [-1, c]) - # --- hidden layer with clipped ReLU activation and dropout----------------- + # --- hidden layer with clipped ReLU activation and dropout--------------- top_layer = tf.layers.dense( - inputs=top_layer, - units=self.params['n_hidden'], - kernel_regularizer=regularizer, - activation=self.params['activation_fn'], - name='fully_connected', + inputs=top_layer, + units=self.params['n_hidden'], + kernel_regularizer=regularizer, + activation=self.params['activation_fn'], + name='fully_connected', ) outputs = tf.nn.dropout(x=top_layer, keep_prob=dropout_keep_prob) # reshape from [B*T,A] --> [B, T, A]. # Output shape: [batch_size, n_steps, n_hidden] outputs = tf.reshape( - outputs, - [batch_size, -1, self.params['n_hidden']], + outputs, + [batch_size, -1, self.params['n_hidden']], ) return { - 'outputs': outputs, - 'src_length': src_length, + 'outputs': outputs, + 'src_length': src_length, } diff --git a/open_seq2seq/encoders/w2l_encoder.py b/open_seq2seq/encoders/w2l_encoder.py new file mode 100644 index 000000000..75b5b6a56 --- /dev/null +++ b/open_seq2seq/encoders/w2l_encoder.py @@ -0,0 +1,155 @@ +# Copyright (c) 2018 NVIDIA Corporation +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals +from six.moves import range + +import tensorflow as tf + +from .encoder import Encoder +from open_seq2seq.parts.cnns.conv_blocks import * + + +class Wave2LetterEncoder(Encoder): + """Wave2Letter like encoder. Fully convolutional model""" + + @staticmethod + def get_required_params(): + return dict(Encoder.get_required_params(), **{ + 'dropout_keep_prob': float, + 'convnet_layers': list, + 'activation_fn': None, # any valid callable + }) + + @staticmethod + def get_optional_params(): + return dict(Encoder.get_optional_params(), **{ + 'data_format': ['channels_first', 'channels_last'], + 'normalization': [None, 'batch_norm'], + 'bn_momentum': float, + 'bn_epsilon': float, + }) + + def __init__(self, params, model, name="w2l_encoder", mode='train'): + """Wave2Letter like encoder constructor. + + See parent class for arguments description. + + Config parameters: + + * **dropout_keep_prop** (float) --- keep probability for dropout. 
+ * **convnet_layers** (list) --- list with the description of convolutional + layers. For example:: + "convnet_layers": [ + { + "type": "conv1d", "repeat" : 5, + "kernel_size": [7], "stride": [1], + "num_channels": 250, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [11], "stride": [1], + "num_channels": 500, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 1, + "kernel_size": [32], "stride": [1], + "num_channels": 1000, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 1, + "kernel_size": [1], "stride": [1], + "num_channels": 1000, "padding": "SAME" + }, + ] + * **activation_fn** --- activation function to use. + * **data_format** (string) --- could be either "channels_first" or + "channels_last". Defaults to "channels_last". + * **normalization** --- normalization to use. Accepts [None, 'batch_norm']. + Use None if you don't want to use normalization. Defaults to 'batch_norm'. + * **bn_momentum** (float) --- momentum for batch norm. Defaults to 0.90. + * **bn_epsilon** (float) --- epsilon for batch norm. Defaults to 1e-3. + """ + super(Wave2LetterEncoder, self).__init__(params, model, name, mode) + + def _encode(self, input_dict): + """Creates TensorFlow graph for Wav2Letter like encoder. + + Args: + input_dict (dict): input dictionary that has to contain + the following fields:: + input_dict = { + "source_tensors": [ + src_sequence (shape=[batch_size, sequence length, num features]), + src_length (shape=[batch_size]) + ] + } + + Returns: + dict: dictionary with the following tensors:: + + { + 'outputs': hidden state, shape=[batch_size, sequence length, n_hidden] + 'src_length': tensor, shape=[batch_size] + } + """ + + source_sequence, src_length = input_dict['source_tensors'] + + training = (self._mode == "train") + dropout_keep_prob = self.params['dropout_keep_prob'] if training else 1.0 + regularizer = self.params.get('regularizer', None) + data_format = self.params.get('data_format', 'channels_last') + normalization = self.params.get('normalization', 'batch_norm') + + normalization_params = {} + if normalization == None: + conv_block = conv_actv + elif normalization == "batch_norm": + conv_block = conv_bn_actv + normalization_params['bn_momentum'] = self.params.get( + 'bn_momentum', 0.90) + normalization_params['bn_epsilon'] = self.params.get('bn_epsilon', 1e-3) + + conv_inputs = source_sequence + batch_size = conv_inputs.get_shape().as_list()[0] + if data_format == 'channels_last': + conv_feats = conv_inputs # B T F + else: + conv_feats = tf.transpose(conv_inputs, [0, 2, 1]) # B F T + + # ----- Convolutional layers --------------------------------------------- + convnet_layers = self.params['convnet_layers'] + + for idx_convnet in range(len(convnet_layers)): + layer_type = convnet_layers[idx_convnet]['type'] + layer_repeat = convnet_layers[idx_convnet]['repeat'] + ch_out = convnet_layers[idx_convnet]['num_channels'] + kernel_size = convnet_layers[idx_convnet]['kernel_size'] + strides = convnet_layers[idx_convnet]['stride'] + padding = convnet_layers[idx_convnet]['padding'] + + for idx_layer in range(layer_repeat): + conv_feats = conv_block( + type=layer_type, + name="conv{}{}".format( + idx_convnet + 1, idx_layer + 1), + inputs=conv_feats, + filters=ch_out, + kernel_size=kernel_size, + activation_fn=self.params['activation_fn'], + strides=strides, + padding=padding, + regularizer=regularizer, + training=training, + data_format=data_format, + **normalization_params + ) + outputs = tf.nn.dropout(x=conv_feats, 
keep_prob=dropout_keep_prob) + + if data_format == 'channels_first': + outputs = tf.transpose(outputs, [0, 2, 1]) + + return { + 'outputs': outputs, + 'src_length': src_length, + } diff --git a/open_seq2seq/models/speech2text_ds2_test.py b/open_seq2seq/models/speech2text_ds2_test.py new file mode 100644 index 000000000..519e4039c --- /dev/null +++ b/open_seq2seq/models/speech2text_ds2_test.py @@ -0,0 +1,55 @@ +# Copyright (c) 2017 NVIDIA Corporation +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals +from six.moves import range + +import tensorflow as tf +import numpy as np +import copy +import numpy.testing as npt +import tempfile +import os +import pandas as pd + +from .speech2text_test import Speech2TextModelTests +from open_seq2seq.test_utils.test_speech_configs.ds2_test_config import base_params, \ + train_params, \ + eval_params, \ + base_model + + +class DS2ModelTests(Speech2TextModelTests): + + def setUp(self): + self.base_model = base_model + self.base_params = base_params + self.train_params = train_params + self.eval_params = eval_params + + def tearDown(self): + pass + + def test_regularizer(self): + return self.regularizer_test() + + def test_convergence(self): + return self.convergence_test(5.0, 30.0, 0.1) + + def test_convergence_with_iter_size(self): + return self.convergence_with_iter_size_test() + + def test_infer(self): + return self.infer_test() + + def test_mp_collection(self): + return self.mp_collection_test(14, 7) + + def test_levenshtein(self): + return self.levenshtein_test() + + def test_maybe_functions(self): + return self.maybe_functions_test() + + +if __name__ == '__main__': + tf.test.main() diff --git a/open_seq2seq/models/speech2text_test.py b/open_seq2seq/models/speech2text_test.py index 60f046578..7493e90ac 100644 --- a/open_seq2seq/models/speech2text_test.py +++ b/open_seq2seq/models/speech2text_test.py @@ -12,26 +12,17 @@ import pandas as pd from .speech2text import levenshtein -from open_seq2seq.test_utils.test_speech_config import base_params, \ - train_params, \ - eval_params, \ - base_model from open_seq2seq.utils import train, evaluate, infer from open_seq2seq.utils.utils import get_available_gpus class Speech2TextModelTests(tf.test.TestCase): - def setUp(self): - pass - - def tearDown(self): - pass def run_model(self, train_config, eval_config, hvd=None): with tf.Graph().as_default() as g: - train_model = base_model(params=train_config, mode="train", hvd=hvd) + train_model = self.base_model(params=train_config, mode="train", hvd=hvd) train_model.compile() - eval_model = base_model(params=eval_config, mode="eval", hvd=hvd) + eval_model = self.base_model(params=eval_config, mode="eval", hvd=hvd) eval_model.compile(force_var_reuse=True) train(train_model, eval_model) @@ -50,33 +41,34 @@ def run_model(self, train_config, eval_config, hvd=None): eval_loss = np.mean(eval_losses) weights_new = sess.run(tf.trainable_variables()) - # checking that the weights has not changed from just computing the loss + # checking that the weights has not changed from just computing the + # loss for w, w_new in zip(weights, weights_new): npt.assert_allclose(w, w_new) eval_dict = evaluate(eval_model, checkpoint) return loss, eval_loss, eval_dict def prepare_config(self): - base_params['logdir'] = tempfile.mktemp() - train_config = copy.deepcopy(base_params) - eval_config = copy.deepcopy(base_params) - train_config.update(copy.deepcopy(train_params)) - eval_config.update(copy.deepcopy(eval_params)) + 
self.base_params['logdir'] = tempfile.mktemp() + train_config = copy.deepcopy(self.base_params) + eval_config = copy.deepcopy(self.base_params) + train_config.update(copy.deepcopy(self.train_params)) + eval_config.update(copy.deepcopy(self.eval_params)) return train_config, eval_config - def test_regularizer(self): + def regularizer_test(self): for dtype in [tf.float16, tf.float32, 'mixed']: train_config, eval_config = self.prepare_config() train_config['num_epochs'] = 60 train_config.update({ - "dtype": dtype, - "regularizer": tf.contrib.layers.l2_regularizer, - "regularizer_params": { - 'scale': 1e4, - }, + "dtype": dtype, + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 1e4, + }, }) eval_config.update({ - "dtype": dtype, + "dtype": dtype, }) loss, eval_loss, eval_dict = self.run_model(train_config, eval_config) @@ -84,22 +76,22 @@ def test_regularizer(self): self.assertGreaterEqual(eval_loss, 500.0) self.assertGreaterEqual(eval_dict['Eval WER'], 0.95) - def test_convergence(self): + def convergence_test(self, train_loss_threshold, eval_loss_threshold, eval_wer_threshold): for dtype in [tf.float32, "mixed"]: train_config, eval_config = self.prepare_config() train_config.update({ - "dtype": dtype, + "dtype": dtype, }) eval_config.update({ - "dtype": dtype, + "dtype": dtype, }) loss, eval_loss, eval_dict = self.run_model(train_config, eval_config) - self.assertLess(loss, 5.0) - self.assertLess(eval_loss, 30.0) - self.assertLess(eval_dict['Eval WER'], 0.1) + self.assertLess(loss, train_loss_threshold) + self.assertLess(eval_loss, eval_loss_threshold) + self.assertLess(eval_dict['Eval WER'], eval_wer_threshold) - def test_convergence_with_iter_size(self): + def convergence_with_iter_size_test(self): try: import horovod.tensorflow as hvd hvd.init() @@ -110,25 +102,26 @@ def test_convergence_with_iter_size(self): for dtype in [tf.float32, "mixed"]: train_config, eval_config = self.prepare_config() train_config.update({ - "dtype": dtype, - "iter_size": 5, - "batch_size_per_gpu": 2, - "use_horovod": True, - "num_epochs": 200, + "dtype": dtype, + "iter_size": 5, + "batch_size_per_gpu": 2, + "use_horovod": True, + "num_epochs": 200, }) eval_config.update({ - "dtype": dtype, - "iter_size": 5, - "batch_size_per_gpu": 2, - "use_horovod": True, + "dtype": dtype, + "iter_size": 5, + "batch_size_per_gpu": 2, + "use_horovod": True, }) - loss, eval_loss, eval_dict = self.run_model(train_config, eval_config, hvd) + loss, eval_loss, eval_dict = self.run_model( + train_config, eval_config, hvd) self.assertLess(loss, 10.0) self.assertLess(eval_loss, 30.0) self.assertLess(eval_dict['Eval WER'], 0.1) - def test_infer(self): + def infer_test(self): train_config, infer_config = self.prepare_config() train_config['num_epochs'] = 250 infer_config['batch_size_per_gpu'] = 4 @@ -143,24 +136,26 @@ def test_infer(self): infer_config['num_gpus'] = 1 with tf.Graph().as_default(): - train_model = base_model(params=train_config, mode="train", hvd=None) + train_model = self.base_model( + params=train_config, mode="train", hvd=None) train_model.compile() train(train_model, None) with tf.Graph().as_default(): - infer_model = base_model(params=infer_config, mode="infer", hvd=None) + infer_model = self.base_model( + params=infer_config, mode="infer", hvd=None) infer_model.compile() print(train_model.params['logdir']) output_file = os.path.join(train_model.params['logdir'], 'infer_out.csv') infer( - infer_model, - tf.train.latest_checkpoint(train_model.params['logdir']), - output_file, + 
infer_model, + tf.train.latest_checkpoint(train_model.params['logdir']), + output_file, ) pred_csv = pd.read_csv(output_file) true_csv = pd.read_csv( - 'open_seq2seq/test_utils/toy_speech_data/toy_data.csv', + 'open_seq2seq/test_utils/toy_speech_data/toy_data.csv', ) for pred_row, true_row in zip(pred_csv.as_matrix(), true_csv.as_matrix()): # checking file name @@ -168,20 +163,20 @@ def test_infer(self): # checking prediction self.assertEqual(pred_row[-1], true_row[-1]) - def test_mp_collection(self): + def mp_collection_test(self, num_train_vars, num_master_copies): train_config, eval_config = self.prepare_config() train_config['dtype'] = 'mixed' with tf.Graph().as_default(): - model = base_model(params=train_config, mode="train", hvd=None) + model = self.base_model(params=train_config, mode="train", hvd=None) model.compile() - self.assertEqual(len(tf.trainable_variables()), 14) + self.assertEqual(len(tf.trainable_variables()), num_train_vars) self.assertEqual( - len(tf.get_collection('FP32_MASTER_COPIES')), - 7, # minus batch norm beta and gamma and row_conv vars + len(tf.get_collection('FP32_MASTER_COPIES')), + num_master_copies, # minus batch norm beta and gamma and row_conv vars ) - def test_levenshtein(self): + def levenshtein_test(self): s1 = 'this is a great day' s2 = 'this is great day' self.assertEqual(levenshtein(s1.split(), s2.split()), 1) @@ -210,28 +205,28 @@ def test_levenshtein(self): self.assertEqual(levenshtein(s1, s2), 11) self.assertEqual(levenshtein(s2, s1), 11) - def test_maybe_functions(self): + def maybe_functions_test(self): train_config, eval_config = self.prepare_config() with tf.Graph().as_default(): - model = base_model(params=train_config, mode="train", hvd=None) + model = self.base_model(params=train_config, mode="train", hvd=None) model.compile() model._gpu_ids = range(5) model.params['batch_size_per_gpu'] = 2 char2idx = model.get_data_layer().params['char2idx'] inputs = [ - ['this is a great day', 'london is the capital of great britain'], - ['ooo', 'lll'], - ['a b c\' asdf', 'blah blah bblah'], - ['this is great day', 'london capital gret britain'], - ['aaaaaaaasdfdasdf', 'df d sdf asd fd f sdf df blah\' blah'], + ['this is a great day', 'london is the capital of great britain'], + ['ooo', 'lll'], + ['a b c\' asdf', 'blah blah bblah'], + ['this is great day', 'london capital gret britain'], + ['aaaaaaaasdfdasdf', 'df d sdf asd fd f sdf df blah\' blah'], ] outputs = [ - ['this is great a day', 'london capital gret britain'], - ['ooo', 'lll'], - ['aaaaaaaasdfdasdf', 'df d sdf asd fd f sdf df blah blah'], - ['this is a great day', 'london is the capital of great britain'], - ['a b c\' asdf', 'blah blah\' bblah'], + ['this is great a day', 'london capital gret britain'], + ['ooo', 'lll'], + ['aaaaaaaasdfdasdf', 'df d sdf asd fd f sdf df blah blah'], + ['this is a great day', 'london is the capital of great britain'], + ['a b c\' asdf', 'blah blah\' bblah'], ] y = [None] * len(inputs) len_y = [None] * len(inputs) @@ -248,7 +243,7 @@ def test_maybe_functions(self): len_y[gpu_id][sample_id] = num_letters for letter_id in range(num_letters): y[gpu_id][sample_id, letter_id] = char2idx[ - inputs[gpu_id][sample_id][letter_id] + inputs[gpu_id][sample_id][letter_id] ] num_gpus = len(outputs) @@ -262,7 +257,7 @@ def test_maybe_functions(self): num_letters = len(outputs[gpu_id][sample_id]) for letter_id in range(num_letters): values[gpu_id].append( - char2idx[outputs[gpu_id][sample_id][letter_id]] + char2idx[outputs[gpu_id][sample_id][letter_id]] ) 
indices[gpu_id].append(np.array([sample_id, letter_id])) values[gpu_id] = np.array(values[gpu_id], dtype=np.int) @@ -272,8 +267,8 @@ def test_maybe_functions(self): len_x = [None] * len(y) input_values = list(zip(x, len_x, y, len_y)) output_values = [ - [tf.SparseTensorValue(indices[i], values[i], dense_shape[i])] - for i in range(num_gpus) + [tf.SparseTensorValue(indices[i], values[i], dense_shape[i])] + for i in range(num_gpus) ] results = [] @@ -303,7 +298,3 @@ def test_maybe_functions(self): 'target_tensors': [input_values[0][2], input_values[0][3]]} output_dict = model.maybe_print_logs(inp_dict, output_values[0], 0) self.assertEqual(output_dict['Sample WER'], 0.4) - - -if __name__ == '__main__': - tf.test.main() diff --git a/open_seq2seq/models/speech2text_w2l_test.py b/open_seq2seq/models/speech2text_w2l_test.py new file mode 100644 index 000000000..88f20eee8 --- /dev/null +++ b/open_seq2seq/models/speech2text_w2l_test.py @@ -0,0 +1,46 @@ +# Copyright (c) 2017 NVIDIA Corporation +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals +from six.moves import range + +import tensorflow as tf +import numpy as np +import copy +import numpy.testing as npt +import tempfile +import os +import pandas as pd + +from .speech2text_test import Speech2TextModelTests +from open_seq2seq.test_utils.test_speech_configs.w2l_test_config import base_params, \ + train_params, \ + eval_params, \ + base_model + + +class DS2ModelTests(Speech2TextModelTests): + + def setUp(self): + self.base_model = base_model + self.base_params = base_params + self.train_params = train_params + self.eval_params = eval_params + + def tearDown(self): + pass + + def test_convergence(self): + return self.convergence_test(5.0, 30.0, 0.1) + + def test_convergence_with_iter_size(self): + return self.convergence_with_iter_size_test() + + def test_infer(self): + return self.infer_test() + + def test_mp_collection(self): + return self.mp_collection_test(14, 6) + + +if __name__ == '__main__': + tf.test.main() diff --git a/open_seq2seq/parts/cnns/__init__.py b/open_seq2seq/parts/cnns/__init__.py new file mode 100644 index 000000000..856829f6e --- /dev/null +++ b/open_seq2seq/parts/cnns/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2018 NVIDIA Corporation diff --git a/open_seq2seq/parts/cnns/conv_blocks.py b/open_seq2seq/parts/cnns/conv_blocks.py new file mode 100644 index 000000000..c098162b2 --- /dev/null +++ b/open_seq2seq/parts/cnns/conv_blocks.py @@ -0,0 +1,90 @@ +# Copyright (c) 2018 NVIDIA Corporation +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from six.moves import range + +import tensorflow as tf + + +def conv_actv(type, name, inputs, filters, kernel_size, activation_fn, strides, + padding, regularizer, training, data_format): + """Helper function that applies convolution and activation. 
+ Args: + type: the following types are supported + 'conv1d', 'conv2d' + """ + if type == "conv1d": + layer = tf.layers.conv1d + elif type == "conv2d": + layer = tf.layers.conv2d + + conv = layer( + name="{}".format(name), + inputs=inputs, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + kernel_regularizer=regularizer, + use_bias=False, + data_format=data_format, + ) + + output = conv + if activation_fn is not None: + output = activation_fn(output) + return output + + +def conv_bn_actv(type, name, inputs, filters, kernel_size, activation_fn, strides, + padding, regularizer, training, data_format, bn_momentum, + bn_epsilon): + """Helper function that applies convolution, batch norm and activation. + Accepts inputs in 'channels_last' format only. + Args: + type: the following types are supported + 'conv1d', 'conv2d' + """ + if type == "conv1d": + layer = tf.layers.conv1d + elif type == "conv2d": + layer = tf.layers.conv2d + + conv = layer( + name="{}".format(name), + inputs=inputs, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + kernel_regularizer=regularizer, + use_bias=False, + data_format=data_format, + ) + + # trick to make batchnorm work for mixed precision training. + # To-Do check if batchnorm works smoothly for >4 dimensional tensors + squeeze = False + if type == "conv1d": + conv = tf.expand_dims(conv, axis=1) # NWC --> NHWC + squeeze = True + + bn = tf.layers.batch_normalization( + name="{}/bn".format(name), + inputs=conv, + gamma_regularizer=regularizer, + training=training, + axis=-1 if data_format == 'channels_last' else 1, + momentum=bn_momentum, + epsilon=bn_epsilon, + ) + + if squeeze: + bn = tf.squeeze(bn, axis=1) + + output = bn + if activation_fn is not None: + output = activation_fn(output) + return output diff --git a/open_seq2seq/test_utils/test_speech_configs/__init__.py b/open_seq2seq/test_utils/test_speech_configs/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/open_seq2seq/test_utils/test_speech_config.py b/open_seq2seq/test_utils/test_speech_configs/ds2_test_config.py similarity index 100% rename from open_seq2seq/test_utils/test_speech_config.py rename to open_seq2seq/test_utils/test_speech_configs/ds2_test_config.py diff --git a/open_seq2seq/test_utils/test_speech_configs/w2l_test_config.py b/open_seq2seq/test_utils/test_speech_configs/w2l_test_config.py new file mode 100644 index 000000000..7709154d9 --- /dev/null +++ b/open_seq2seq/test_utils/test_speech_configs/w2l_test_config.py @@ -0,0 +1,103 @@ +from __future__ import absolute_import, division, print_function +import tensorflow as tf +from open_seq2seq.models import Speech2Text +from open_seq2seq.encoders import Wave2LetterEncoder +from open_seq2seq.decoders import FullyConnectedCTCDecoder +from open_seq2seq.data import Speech2TextDataLayer +from open_seq2seq.losses import CTCLoss +from open_seq2seq.optimizers.lr_policies import poly_decay + + +base_model = Speech2Text + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 200, + + "num_gpus": 1, + "batch_size_per_gpu": 10, + "save_summaries_steps": 10, + "print_loss_steps": 10, + "print_samples_steps": 20, + "eval_steps": 50, + "save_checkpoint_steps": 50, + "logdir": "tmp_log_folder", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": poly_decay, + "lr_policy_params": { + "learning_rate": 0.001, + "power": 2, + "decay_steps": 500, + }, + "larc_params": { + "larc_eta": 0.001, + }, + "dtype": 
tf.float32, + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + "encoder": Wave2LetterEncoder, + "encoder_params": { + "convnet_layers": [ + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [7], "stride": [1], + "num_channels": 200, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 1, + "kernel_size": [1], "stride": [1], + "num_channels": 400, "padding": "SAME" #n_hidden = num_channels + }, + ], + + "dropout_keep_prob": 0.9, + + "initializer": tf.contrib.layers.xavier_initializer, + "initializer_params": { + 'uniform': False, + }, + "activation_fn": lambda x: tf.minimum(tf.nn.relu(x), 20.0), + "data_format": "channels_last", + "bn_momentum": 0.001, + }, + + "decoder": FullyConnectedCTCDecoder, + "decoder_params": { + "initializer": tf.contrib.layers.xavier_initializer, + "use_language_model": False, + }, + "loss": CTCLoss, + "loss_params": {}, +} + +train_params = { + "data_layer": Speech2TextDataLayer, + "data_layer_params": { + "num_audio_features": 40, + "input_type": "logfbank", + "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + "dataset_files": [ + "open_seq2seq/test_utils/toy_speech_data/toy_data.csv", + ], + "shuffle": True, + }, +} + +eval_params = { + "data_layer": Speech2TextDataLayer, + "data_layer_params": { + "num_audio_features": 40, + "input_type": "logfbank", + "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + "dataset_files": [ + "open_seq2seq/test_utils/toy_speech_data/toy_data.csv", + ], + "shuffle": False, + }, +} diff --git a/open_seq2seq/utils/utils_test.py b/open_seq2seq/utils/utils_test.py index 6990ab1ed..3ab3709ce 100644 --- a/open_seq2seq/utils/utils_test.py +++ b/open_seq2seq/utils/utils_test.py @@ -9,14 +9,15 @@ import numpy as np import numpy.testing as npt -from open_seq2seq.test_utils.test_speech_config import base_params, \ - train_params, \ - eval_params, \ - base_model +from open_seq2seq.test_utils.test_speech_configs.ds2_test_config import base_params, \ + train_params, \ + eval_params, \ + base_model from open_seq2seq.utils.utils import get_results_for_epoch, get_available_gpus class UtilsTests(tf.test.TestCase): + def setUp(self): base_params['logdir'] = tempfile.mktemp() self.train_config = copy.deepcopy(base_params) @@ -47,7 +48,8 @@ def test_get_results_for_epoch(self): with self.test_session(g, use_gpu=True) as sess: sess.run(tf.global_variables_initializer()) - inputs_per_batch = get_results_for_epoch(model, sess, False, "infer") + inputs_per_batch = get_results_for_epoch( + model, sess, False, "infer") length = np.hstack([inp['source_tensors'][1] for inp in inputs_per_batch]) ids = np.hstack([inp['source_ids'] for inp in inputs_per_batch]) From 495c57c977f8f166020cba015d70f30052d3e9ad Mon Sep 17 00:00:00 2001 From: Kipok Date: Fri, 29 Jun 2018 10:42:04 -0700 Subject: [PATCH 095/102] Update API in docs and simplified tests --- docs/sources/source/api-docs/encoders.rst | 8 +++++++ docs/sources/source/api-docs/parts.cnns.rst | 15 +++++++++++++ docs/sources/source/api-docs/parts.rst | 3 ++- .../source/installation-instructions.rst | 2 +- open_seq2seq/models/speech2text_ds2_test.py | 14 +++---------- open_seq2seq/models/speech2text_w2l_test.py | 21 +++---------------- open_seq2seq/parts/cnns/conv_blocks.py | 8 ++++--- 7 files changed, 37 insertions(+), 34 deletions(-) create mode 100644 docs/sources/source/api-docs/parts.cnns.rst diff --git a/docs/sources/source/api-docs/encoders.rst 
b/docs/sources/source/api-docs/encoders.rst index faff6961b..5c11d1f26 100644 --- a/docs/sources/source/api-docs/encoders.rst +++ b/docs/sources/source/api-docs/encoders.rst @@ -22,6 +22,14 @@ ds2\_encoder :undoc-members: :show-inheritance: +w2l\_encoder +---------------------------- + +.. automodule:: encoders.w2l_encoder + :members: + :undoc-members: + :show-inheritance: + rnn\_encoders ----------------------------- diff --git a/docs/sources/source/api-docs/parts.cnns.rst b/docs/sources/source/api-docs/parts.cnns.rst new file mode 100644 index 000000000..631cb86c1 --- /dev/null +++ b/docs/sources/source/api-docs/parts.cnns.rst @@ -0,0 +1,15 @@ +cnns +======================================= + +.. automodule:: parts.cnns + :members: + :undoc-members: + :show-inheritance: + +conv\_blocks +------------------------------------------------------- + +.. automodule:: parts.cnns.conv_blocks + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/sources/source/api-docs/parts.rst b/docs/sources/source/api-docs/parts.rst index f6ad481d9..6a57d9287 100644 --- a/docs/sources/source/api-docs/parts.rst +++ b/docs/sources/source/api-docs/parts.rst @@ -10,4 +10,5 @@ parts parts.rnns parts.transformer - parts.convs2s \ No newline at end of file + parts.convs2s + parts.cnns \ No newline at end of file diff --git a/docs/sources/source/installation-instructions.rst b/docs/sources/source/installation-instructions.rst index ba09bbccc..0e4c7d110 100644 --- a/docs/sources/source/installation-instructions.rst +++ b/docs/sources/source/installation-instructions.rst @@ -32,7 +32,7 @@ run unittests:: python -m unittest discover -s open_seq2seq -p '*_test.py' -It might take up to 10 minutes. You should see a lot of output, but no errors +It might take up to 30 minutes. You should see a lot of output, but no errors in the end. .. 
_installation_speech: diff --git a/open_seq2seq/models/speech2text_ds2_test.py b/open_seq2seq/models/speech2text_ds2_test.py index 519e4039c..7a433c109 100644 --- a/open_seq2seq/models/speech2text_ds2_test.py +++ b/open_seq2seq/models/speech2text_ds2_test.py @@ -1,21 +1,13 @@ # Copyright (c) 2017 NVIDIA Corporation from __future__ import absolute_import, division, print_function from __future__ import unicode_literals -from six.moves import range import tensorflow as tf -import numpy as np -import copy -import numpy.testing as npt -import tempfile -import os -import pandas as pd + +from open_seq2seq.test_utils.test_speech_configs.ds2_test_config import \ + base_params, train_params, eval_params, base_model from .speech2text_test import Speech2TextModelTests -from open_seq2seq.test_utils.test_speech_configs.ds2_test_config import base_params, \ - train_params, \ - eval_params, \ - base_model class DS2ModelTests(Speech2TextModelTests): diff --git a/open_seq2seq/models/speech2text_w2l_test.py b/open_seq2seq/models/speech2text_w2l_test.py index 88f20eee8..de18448a3 100644 --- a/open_seq2seq/models/speech2text_w2l_test.py +++ b/open_seq2seq/models/speech2text_w2l_test.py @@ -1,24 +1,15 @@ # Copyright (c) 2017 NVIDIA Corporation from __future__ import absolute_import, division, print_function from __future__ import unicode_literals -from six.moves import range import tensorflow as tf -import numpy as np -import copy -import numpy.testing as npt -import tempfile -import os -import pandas as pd +from open_seq2seq.test_utils.test_speech_configs.w2l_test_config import \ + base_params, train_params, eval_params, base_model from .speech2text_test import Speech2TextModelTests -from open_seq2seq.test_utils.test_speech_configs.w2l_test_config import base_params, \ - train_params, \ - eval_params, \ - base_model -class DS2ModelTests(Speech2TextModelTests): +class W2LModelTests(Speech2TextModelTests): def setUp(self): self.base_model = base_model @@ -32,12 +23,6 @@ def tearDown(self): def test_convergence(self): return self.convergence_test(5.0, 30.0, 0.1) - def test_convergence_with_iter_size(self): - return self.convergence_with_iter_size_test() - - def test_infer(self): - return self.infer_test() - def test_mp_collection(self): return self.mp_collection_test(14, 6) diff --git a/open_seq2seq/parts/cnns/conv_blocks.py b/open_seq2seq/parts/cnns/conv_blocks.py index c098162b2..31087d6e9 100644 --- a/open_seq2seq/parts/cnns/conv_blocks.py +++ b/open_seq2seq/parts/cnns/conv_blocks.py @@ -11,6 +11,7 @@ def conv_actv(type, name, inputs, filters, kernel_size, activation_fn, strides, padding, regularizer, training, data_format): """Helper function that applies convolution and activation. + Args: type: the following types are supported 'conv1d', 'conv2d' @@ -38,11 +39,12 @@ def conv_actv(type, name, inputs, filters, kernel_size, activation_fn, strides, return output -def conv_bn_actv(type, name, inputs, filters, kernel_size, activation_fn, strides, - padding, regularizer, training, data_format, bn_momentum, - bn_epsilon): +def conv_bn_actv(type, name, inputs, filters, kernel_size, activation_fn, + strides, padding, regularizer, training, data_format, + bn_momentum, bn_epsilon): """Helper function that applies convolution, batch norm and activation. Accepts inputs in 'channels_last' format only. 
+ Args: type: the following types are supported 'conv1d', 'conv2d' From 21a5c776d36aa242891b5de9124d5e60d949acc8 Mon Sep 17 00:00:00 2001 From: Kipok Date: Fri, 29 Jun 2018 11:24:57 -0700 Subject: [PATCH 096/102] Update models-and-recipes with new models --- docs/sources/source/models-and-recipes.rst | 45 ++++++++++++++++++---- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/docs/sources/source/models-and-recipes.rst b/docs/sources/source/models-and-recipes.rst index 4f6035158..5e2a1ebbd 100644 --- a/docs/sources/source/models-and-recipes.rst +++ b/docs/sources/source/models-and-recipes.rst @@ -6,8 +6,10 @@ Models and recipes .. note:: Currently OpenSeq2Seq has model implementations for machine translation and - automatic speech recognition. All models work both in float32 and mixed precision. - We recommend you use :ref:`mixed precision training ` when training on Volta GPUs. + automatic speech recognition. + All models work both in float32 and mixed precision. + We recommend you use :ref:`mixed precision training ` + when training on Volta GPUs. To train models you can use the following commands (don't forget to substitute @@ -26,6 +28,16 @@ The description of implemented models is available in the next sections: Machine translation ------------------- +The table below contains description and results of +machine translation models available in OpenSeq2Seq. +Currently, we have GNMT-based model, Transformer-based models and +ConvS2S-based models. + +We measure BLEU score on newstest2014.tok.de file using ``multi-bleu.perl`` script from Mosses. +For more details about model descriptions and training setup, +have a look at the `configuration files `_. + + .. list-table:: :widths: 1 1 1 1 1 :header-rows: 1 @@ -50,23 +62,31 @@ Machine translation - This model was trained on 4 GPUs with Adam optimizer and learning rate decay. - Transformer "big" model. This model does not have any RNN layers - `link `_ + * - `en-de-convs2s.py `_ + - xx.xx + - This model was trained on 4 GPUs with Adam optimizer, learning rate decay and warm-up. + - This is an implementation of the ConvS2S model proposed in https://arxiv.org/abs/1705.03122. + - Coming soon. -GNMT model description can be found `here `_. -Transformer model description can be found `here `_. -We measure BLEU score on newstest2014.tok.de file using ``multi-bleu.perl`` script from Mosses. +GNMT model description: https://arxiv.org/abs/1609.08144. + +Transformer model description: https://arxiv.org/abs/1706.03762. + +ConvS2S model description: https://arxiv.org/abs/1705.03122. Speech recognition ------------------ The table below contains description and results of speech recognition models available in OpenSeq2Seq. +Currently, we have DeepSpeech2-based models and Wav2Letter-based models. WER is the word error rate obtained on a dev-clean subset of LibriSpeech using greedy decoder (``decoder_params/use_language_model = False``). For the final evaluation we used ``batch_size_per_gpu = 1`` to eliminate the effect of `cudnn padding issue `_. For more details about model descriptions and training setup, -have a look at the `configuration files `_. +have a look at the `configuration files `_. .. list-table:: :widths: 1 1 1 1 1 @@ -98,7 +118,16 @@ have a look at the `configuration files `_ + * - `w2l_large_8gpus.py `_ + - 15.38% + - This model was trained for 18 epochs (with early stopping based on + validation loss) using SGD with Momentum and LARC on + the full LibriSpeech in a few days on eight GPUs. 
+ - The model has 19 convolutional layers (200--1000 units, 7--21 kernel size). + We use batch norm between all layers. + - Coming soon. + -Original Deep Speech 2 model description: https://arxiv.org/abs/1512.02595 . +Deep Speech 2 model description: https://arxiv.org/abs/1512.02595. -Original Wav2Letter model description: https://arxiv.org/abs/1609.03193 . \ No newline at end of file +Wav2Letter model description: https://arxiv.org/abs/1609.03193, https://arxiv.org/abs/1712.09444. From 45aa3d109097ec3d779eb52732f79d7d4a741893 Mon Sep 17 00:00:00 2001 From: Kipok Date: Fri, 29 Jun 2018 11:27:34 -0700 Subject: [PATCH 097/102] Fix wrong links --- docs/sources/source/models-and-recipes.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sources/source/models-and-recipes.rst b/docs/sources/source/models-and-recipes.rst index 5e2a1ebbd..52507f6ab 100644 --- a/docs/sources/source/models-and-recipes.rst +++ b/docs/sources/source/models-and-recipes.rst @@ -35,7 +35,7 @@ ConvS2S-based models. We measure BLEU score on newstest2014.tok.de file using ``multi-bleu.perl`` script from Mosses. For more details about model descriptions and training setup, -have a look at the `configuration files `_. +have a look at the `configuration files `_. .. list-table:: @@ -86,7 +86,7 @@ greedy decoder (``decoder_params/use_language_model = False``). For the final evaluation we used ``batch_size_per_gpu = 1`` to eliminate the effect of `cudnn padding issue `_. For more details about model descriptions and training setup, -have a look at the `configuration files `_. +have a look at the `configuration files `_. .. list-table:: :widths: 1 1 1 1 1 From 3974cb8e278e933b5f803d78457bd91015446d59 Mon Sep 17 00:00:00 2001 From: Vitaly Lavrukhin Date: Fri, 29 Jun 2018 12:03:51 -0700 Subject: [PATCH 098/102] Updated docs - Updated greedy WERs - Uploaded new checkpoints --- docs/sources/source/models-and-recipes.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/sources/source/models-and-recipes.rst b/docs/sources/source/models-and-recipes.rst index 52507f6ab..70de3f5ce 100644 --- a/docs/sources/source/models-and-recipes.rst +++ b/docs/sources/source/models-and-recipes.rst @@ -98,12 +98,12 @@ have a look at the `configuration files `_ - - 14.89% + - 9.28% - This model was trained for 50 epochs using SGD with Momentum and LARC on the full LibriSpeech in a few days using Horovod on eight GPUs. - This model has 2 convolutional layers and 5 bidirectional GRU layers with 800 units. - - `link `_ + - `link `_ * - `ds2_medium_4gpus.py `_ - 22.60% - This model was trained for 50 epochs using Adam on the full @@ -119,13 +119,13 @@ have a look at the `configuration files `_ * - `w2l_large_8gpus.py `_ - - 15.38% + - 15.44% - This model was trained for 18 epochs (with early stopping based on validation loss) using SGD with Momentum and LARC on the full LibriSpeech in a few days on eight GPUs. - The model has 19 convolutional layers (200--1000 units, 7--21 kernel size). We use batch norm between all layers. - - Coming soon. + - `link `_ Deep Speech 2 model description: https://arxiv.org/abs/1512.02595. 
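The WER numbers above are greedy-decoder numbers, i.e. they assume
``decoder_params/use_language_model = False`` and ``batch_size_per_gpu = 1`` as
described in the table introduction. A minimal sketch of the corresponding
overrides, assuming the usual ``base_params`` dictionary layout of the
speech2text example configs, would be::

    # illustrative only -- not part of this patch
    base_params["batch_size_per_gpu"] = 1
    base_params["decoder_params"]["use_language_model"] = False

Everything else is taken unchanged from the published configuration file.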
From e98698c61b87fa957f0b0866fc61129f82eb6624 Mon Sep 17 00:00:00 2001 From: Vitaly Lavrukhin Date: Fri, 29 Jun 2018 12:04:45 -0700 Subject: [PATCH 099/102] Update w2l_large_8gpus.py --- example_configs/speech2text/w2l_large_8gpus.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/example_configs/speech2text/w2l_large_8gpus.py b/example_configs/speech2text/w2l_large_8gpus.py index 897bde1c9..e093d06b8 100644 --- a/example_configs/speech2text/w2l_large_8gpus.py +++ b/example_configs/speech2text/w2l_large_8gpus.py @@ -126,9 +126,9 @@ "input_type": "logfbank", "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", "dataset_files": [ - "/data/librispeech/librivox-train-clean-100.csv", - "/data/librispeech/librivox-train-clean-360.csv", - "/data/librispeech/librivox-train-other-500.csv", + "data/librispeech/librivox-train-clean-100.csv", + "data/librispeech/librivox-train-clean-360.csv", + "data/librispeech/librivox-train-other-500.csv", ], "shuffle": True, }, @@ -141,7 +141,7 @@ "input_type": "logfbank", "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", "dataset_files": [ - "/data/librispeech/librivox-dev-clean.csv", + "data/librispeech/librivox-dev-clean.csv", ], "shuffle": False, }, @@ -154,7 +154,7 @@ "input_type": "logfbank", "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", "dataset_files": [ - "/data/librispeech/librivox-test-clean.csv", + "data/librispeech/librivox-test-clean.csv", ], "shuffle": False, }, From 45def43e1809a446e4df2fd1d3fe4a6860c558ac Mon Sep 17 00:00:00 2001 From: Vitaly Lavrukhin Date: Fri, 29 Jun 2018 12:05:31 -0700 Subject: [PATCH 100/102] Update w2l_large_8gpus_mp.py --- example_configs/speech2text/w2l_large_8gpus_mp.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/example_configs/speech2text/w2l_large_8gpus_mp.py b/example_configs/speech2text/w2l_large_8gpus_mp.py index 342fcd6ff..b3514f452 100644 --- a/example_configs/speech2text/w2l_large_8gpus_mp.py +++ b/example_configs/speech2text/w2l_large_8gpus_mp.py @@ -127,9 +127,9 @@ "input_type": "logfbank", "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", "dataset_files": [ - "/data/librispeech/librivox-train-clean-100.csv", - "/data/librispeech/librivox-train-clean-360.csv", - "/data/librispeech/librivox-train-other-500.csv", + "data/librispeech/librivox-train-clean-100.csv", + "data/librispeech/librivox-train-clean-360.csv", + "data/librispeech/librivox-train-other-500.csv", ], "shuffle": True, }, @@ -142,7 +142,7 @@ "input_type": "logfbank", "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", "dataset_files": [ - "/data/librispeech/librivox-dev-clean.csv", + "data/librispeech/librivox-dev-clean.csv", ], "shuffle": False, }, @@ -155,7 +155,7 @@ "input_type": "logfbank", "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", "dataset_files": [ - "/data/librispeech/librivox-test-clean.csv", + "data/librispeech/librivox-test-clean.csv", ], "shuffle": False, }, From 911d6cbcb6bb4e42224e5ac48dbbc63dd6280524 Mon Sep 17 00:00:00 2001 From: Kipok Date: Fri, 29 Jun 2018 12:11:30 -0700 Subject: [PATCH 101/102] Add docs for cnn_encoder --- open_seq2seq/encoders/cnn_encoder.py | 75 ++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/open_seq2seq/encoders/cnn_encoder.py b/open_seq2seq/encoders/cnn_encoder.py index 301197012..dc382a586 100644 --- a/open_seq2seq/encoders/cnn_encoder.py +++ b/open_seq2seq/encoders/cnn_encoder.py @@ -1,4 +1,8 @@ # Copyright (c) 
2018 NVIDIA Corporation +""" +This module contains classes and functions to build "general" convolutional +neural networks from the description of arbitrary "layers". +""" from __future__ import absolute_import, division, print_function from __future__ import unicode_literals from six.moves import range @@ -17,6 +21,32 @@ def build_layer(inputs, layer, layer_params, data_format, regularizer, training, verbose=True): + """This function builds a layer from the layer function and it's parameters. + + It will automatically add regularizer parameter to the layer_params if the + layer supports regularization. To check this, it will look for the + "regularizer", "kernel_regularizer" and "gamma_regularizer" names in this + order in the ``layer`` call signature. If one of this parameters is supported + it will pass regularizer object as a value for that parameter. Based on the + same "checking signature" technique "data_format" and "training" parameters + will try to be added. + + Args: + inputs: input Tensor that will be passed to the layer. Note that layer has + to accept input as the first parameter. + layer: layer function or class with ``__call__`` method defined. + layer_params (dict): parameters passed to the ``layer``. + data_format (string): data format ("channels_first" or "channels_last") + that will be tried to be passed as an additional argument. + regularizer: regularizer instance that will be tried to be passed as an + additional argument. + training (bool): whether layer is built in training mode. Will be tried to + be passed as an additional argument. + verbose (bool): whether to print information about built layers. + + Returns: + Tensor with layer output. + """ layer_params_cp = copy.deepcopy(layer_params) for reg_name in ['regularizer', 'kernel_regularizer', 'gamma_regularizer']: if reg_name not in layer_params_cp and \ @@ -47,6 +77,8 @@ def build_layer(inputs, layer, layer_params, data_format, class CNNEncoder(Encoder): + """General CNN encoder that can be used to construct various different models. + """ @staticmethod def get_required_params(): return dict(Encoder.get_required_params(), **{ @@ -61,6 +93,49 @@ def get_optional_params(): }) def __init__(self, params, model, name="cnn_encoder", mode='train'): + """CNN Encoder constructor. + + See parent class for arguments description. + + Config parameters: + + * **cnn_layers** (list) --- list with the description of "convolutional" + layers. For example:: + "conv_layers": [ + (tf.layers.conv2d, { + 'filters': 64, 'kernel_size': (11, 11), + 'strides': (4, 4), 'padding': 'VALID', + 'activation': tf.nn.relu, + }), + (tf.layers.max_pooling2d, { + 'pool_size': (3, 3), 'strides': (2, 2), + }), + (tf.layers.conv2d, { + 'filters': 192, 'kernel_size': (5, 5), + 'strides': (1, 1), 'padding': 'SAME', + }), + (tf.layers.batch_normalization, {'momentum': 0.9, 'epsilon': 0.0001}), + (tf.nn.relu, {}), + ] + Note that you don't need to provide "regularizer", "training" and + "data_format" parameters since they will be automatically added. + + * **cnn_layers** (list) --- list with the description of "fully-connected" + layers. The only different from convolutional layers is that the input + will be automatically reshaped to 2D (batch size x num features). 
+ For example:: + 'fc_layers': [ + (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), + (tf.layers.dropout, {'rate': 0.5}), + (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), + (tf.layers.dropout, {'rate': 0.5}), + ], + Note that you don't need to provide "regularizer", "training" and + "data_format" parameters since they will be automatically added. + + * **data_format** (string) --- could be either "channels_first" or + "channels_last". Defaults to "channels_first". + """ super(CNNEncoder, self).__init__(params, model, name, mode) def _encode(self, input_dict): From e4b1c3215503aa0da69aa54c6c778889d34c9d6b Mon Sep 17 00:00:00 2001 From: Kipok Date: Fri, 29 Jun 2018 13:24:53 -0700 Subject: [PATCH 102/102] Docs rebuilt --- .../data/image2label/image2label.html | 154 +++- .../image2label/imagenet_preprocessing.html | 45 +- .../data/speech2text/speech2text.html | 45 +- .../data/speech2text/speech_utils.html | 27 +- docs/html/_modules/data/text2text/t2t.html | 44 +- .../_modules/data/text2text/text2text.html | 39 +- .../_modules/decoders/convs2s_decoder.html | 602 ++++++++++++++ docs/html/_modules/decoders/decoder.html | 45 +- docs/html/_modules/decoders/fc_decoders.html | 12 +- docs/html/_modules/decoders/rnn_decoders.html | 112 +-- docs/html/_modules/encoders/cnn_encoder.html | 405 +++++++++ .../_modules/encoders/convs2s_encoder.html | 456 +++++++++++ docs/html/_modules/encoders/ds2_encoder.html | 227 +++-- docs/html/_modules/encoders/encoder.html | 43 +- docs/html/_modules/encoders/rnn_encoders.html | 265 ++++-- docs/html/_modules/encoders/w2l_encoder.html | 390 +++++++++ docs/html/_modules/index.html | 9 +- docs/html/_modules/losses/sequence_loss.html | 8 +- .../html/_modules/models/encoder_decoder.html | 8 +- docs/html/_modules/models/image2label.html | 11 +- docs/html/_modules/models/model.html | 91 ++- docs/html/_modules/models/speech2text.html | 18 +- docs/html/_modules/models/text2text.html | 48 +- .../optimizers/automatic_loss_scaler.html | 18 +- docs/html/_modules/optimizers/mp_wrapper.html | 32 +- docs/html/_modules/optimizers/optimizers.html | 772 ++++++------------ .../html/_modules/parts/cnns/conv_blocks.html | 327 ++++++++ .../parts/convs2s/attention_wn_layer.html | 325 ++++++++ .../_modules/parts/convs2s/conv_wn_layer.html | 338 ++++++++ .../_modules/parts/convs2s/ffn_wn_layer.html | 303 +++++++ docs/html/_modules/parts/rnns/utils.html | 157 +--- .../parts/transformer/beam_search_test.html | 337 -------- .../parts/transformer/embedding_layer.html | 50 +- .../_modules/parts/transformer/utils.html | 26 +- docs/html/_modules/utils/funcs.html | 55 +- docs/html/_modules/utils/hooks.html | 11 +- docs/html/_modules/utils/utils.html | 306 ++++--- docs/html/_sources/api-docs/decoders.rst.txt | 8 + docs/html/_sources/api-docs/encoders.rst.txt | 25 + .../html/_sources/api-docs/parts.cnns.rst.txt | 15 + .../_sources/api-docs/parts.convs2s.rst.txt | 31 + docs/html/_sources/api-docs/parts.rst.txt | 2 + .../api-docs/parts.transformer.rst.txt | 8 - .../installation-instructions.rst.txt | 2 +- docs/html/_sources/models-and-recipes.rst.txt | 86 +- docs/html/api-docs/data.image2label.html | 154 +++- docs/html/api-docs/data.speech2text.html | 20 +- docs/html/api-docs/data.text2text.html | 13 +- docs/html/api-docs/decoders.html | 99 ++- docs/html/api-docs/encoders.html | 425 +++++++++- docs/html/api-docs/models.html | 187 +++-- docs/html/api-docs/modules.html | 15 +- docs/html/api-docs/optimizers.html | 245 +----- docs/html/api-docs/parts.cnns.html | 307 +++++++ 
docs/html/api-docs/parts.convs2s.html | 460 +++++++++++ docs/html/api-docs/parts.html | 13 +- docs/html/api-docs/parts.rnns.html | 35 +- docs/html/api-docs/parts.transformer.html | 72 +- docs/html/api-docs/utils.html | 33 +- docs/html/genindex.html | 226 +++-- .../using-existing-models.html | 18 +- docs/html/installation-instructions.html | 2 +- docs/html/models-and-recipes.html | 94 ++- docs/html/objects.inv | Bin 4963 -> 5283 bytes docs/html/py-modindex.html | 55 +- docs/html/searchindex.js | 2 +- 66 files changed, 6615 insertions(+), 2198 deletions(-) create mode 100644 docs/html/_modules/decoders/convs2s_decoder.html create mode 100644 docs/html/_modules/encoders/cnn_encoder.html create mode 100644 docs/html/_modules/encoders/convs2s_encoder.html create mode 100644 docs/html/_modules/encoders/w2l_encoder.html create mode 100644 docs/html/_modules/parts/cnns/conv_blocks.html create mode 100644 docs/html/_modules/parts/convs2s/attention_wn_layer.html create mode 100644 docs/html/_modules/parts/convs2s/conv_wn_layer.html create mode 100644 docs/html/_modules/parts/convs2s/ffn_wn_layer.html delete mode 100644 docs/html/_modules/parts/transformer/beam_search_test.html create mode 100644 docs/html/_sources/api-docs/parts.cnns.rst.txt create mode 100644 docs/html/_sources/api-docs/parts.convs2s.rst.txt create mode 100644 docs/html/api-docs/parts.cnns.html create mode 100644 docs/html/api-docs/parts.convs2s.html diff --git a/docs/html/_modules/data/image2label/image2label.html b/docs/html/_modules/data/image2label/image2label.html index 8f82b1463..b83262df4 100644 --- a/docs/html/_modules/data/image2label/image2label.html +++ b/docs/html/_modules/data/image2label/image2label.html @@ -162,11 +162,154 @@

Source code for data.image2label.image2label

import os
 import tensorflow as tf
+import numpy as np
 
 from open_seq2seq.data.data_layer import DataLayer
 from .imagenet_preprocessing import parse_record
 
 
+
[docs]class CifarDataLayer(DataLayer): + _HEIGHT = 28 + _WIDTH = 28 + _NUM_CHANNELS = 3 + _DEFAULT_IMAGE_BYTES = 32 * 32 * 3 + # The record is the image plus a one-byte label + _RECORD_BYTES = _DEFAULT_IMAGE_BYTES + 1 + _NUM_CLASSES = 10 + _NUM_DATA_FILES = 5 + + _NUM_IMAGES = { + 'train': 50000, + 'validation': 10000, + } + +
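# Aside (illustrative, not part of the class above): the constants encode the
# CIFAR-10 binary layout -- one label byte followed by a 32x32x3 image, so
#   record_bytes = 1 + 3 * 32 * 32  # = 3073 bytes per record
# which is exactly _RECORD_BYTES.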
[docs] @staticmethod + def get_required_params(): + return dict(DataLayer.get_required_params(), **{ + 'data_dir': str, + })
+ +
[docs] @staticmethod + def get_optional_params(): + return dict(DataLayer.get_optional_params(), **{ + 'num_parallel_calls': int, + 'shuffle_buffer': int, + 'image_size': int, + 'num_classes': int, + })
+ + def __init__(self, params, model, num_workers, worker_id): + super(CifarDataLayer, self).__init__(params, model, + num_workers, worker_id) + if self.params['mode'] == 'infer': + raise ValueError('Inference is not supported on CifarDataLayer') + + if self.params['mode'] == 'train': + filenames = [ + os.path.join(self.params['data_dir'], 'data_batch_{}.bin'.format(i)) + for i in range(1, self._NUM_DATA_FILES + 1) + ] + else: + filenames = [os.path.join(self.params['data_dir'], 'test_batch.bin')] + + self.file_names = filenames + self._train_size = 50000 + self._valid_size = 10000 + self._iterator = None + self._input_tensors = None + +
[docs] def preprocess_image(self, image, is_training): + """Preprocess a single image of layout [height, width, depth].""" + if is_training: + # Resize the image to add four extra pixels on each side. + image = tf.image.resize_image_with_crop_or_pad( + image, self._HEIGHT + 8, self._WIDTH + 8) + + # Randomly crop a [_HEIGHT, _WIDTH] section of the image. + image = tf.random_crop(image, [self._HEIGHT, self._WIDTH, + self._NUM_CHANNELS]) + + # Randomly flip the image horizontally. + image = tf.image.random_flip_left_right(image) + + else: + image = tf.image.resize_image_with_crop_or_pad( + image, self._HEIGHT, self._WIDTH) + + # Subtract off the mean and divide by the variance of the pixels. + image = tf.image.per_image_standardization(image) + + return image
+ +
[docs] def parse_record(self, raw_record, is_training, num_classes=10): + """Parse CIFAR-10 image and label from a raw record.""" + # Convert bytes to a vector of uint8 that is record_bytes long. + record_vector = tf.decode_raw(raw_record, tf.uint8) + + # The first byte represents the label, which we convert from uint8 to int32 + # and then to one-hot. + label = tf.cast(record_vector[0], tf.int32) + + # The remaining bytes after the label represent the image, which we reshape + # from [depth * height * width] to [depth, height, width]. + depth_major = tf.reshape(record_vector[1:self._RECORD_BYTES], + [3, 32, 32]) + + # Convert from [depth, height, width] to [height, width, depth], and cast as + # float32. + image = tf.cast(tf.transpose(depth_major, [1, 2, 0]), tf.float32) + + image = self.preprocess_image(image, is_training) + label = tf.one_hot(tf.reshape(label, shape=[]), num_classes) + + return image, label
+ +
[docs] def build_graph(self): + dataset = tf.data.FixedLengthRecordDataset(self.file_names, + self._RECORD_BYTES) + + dataset = dataset.prefetch(buffer_size=self.params['batch_size']) + if self.params['shuffle']: + # shuffling images + dataset = dataset.shuffle(buffer_size=self.params.get('shuffle_buffer', + 1500)) + dataset = dataset.repeat() + + dataset = dataset.map( + lambda value: self.parse_record( + raw_record=value, + is_training=self.params['mode'] == 'train', + ), + num_parallel_calls=self.params.get('num_parallel_calls', 16), + ) + + dataset = dataset.batch(self.params['batch_size']) + dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE) + + self._iterator = dataset.make_initializable_iterator() + inputs, labels = self.iterator.get_next() + if self.params['mode'] == 'train': + tf.summary.image('augmented_images', inputs, max_outputs=1) + self._input_tensors = { + 'source_tensors': [inputs], + 'target_tensors': [labels], + }
+ + @property + def input_tensors(self): + return self._input_tensors + + @property + def iterator(self): + return self._iterator + +
[docs] def get_size_in_samples(self): + if self.params['mode'] == 'train': + return self._train_size + else: + return len(np.arange(self._valid_size)[self._worker_id::self._num_workers])
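# Aside (illustrative, not part of this patch): the eval-mode slicing above
# deals validation samples out round-robin across workers; e.g. with the
# 10000-sample split and 4 workers,
#   len(np.arange(10000)[1::4])  # == 2500 samples for worker_id == 1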
+ +
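# A minimal usage sketch (hypothetical paths, not part of this patch): the new
# data layer plugs into a config the same way as the other data layers in this
# series, e.g.
#
#   "data_layer": CifarDataLayer,
#   "data_layer_params": {
#       "data_dir": "data/cifar-10-batches-bin",  # assumed local path
#       "shuffle_buffer": 1500,
#   },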
[docs]class ImagenetDataLayer(DataLayer):
[docs] @staticmethod def get_required_params(): @@ -179,6 +322,8 @@

Source code for data.image2label.image2label

return dict(DataLayer.get_optional_params(), **{
       'num_parallel_calls': int,
       'shuffle_buffer': int,
+      'image_size': int,
+      'num_classes': int,
     })
def __init__(self, params, model, num_workers, worker_id): @@ -231,12 +376,17 @@

Source code for data.image2label.image2label

dataset = dataset.repeat()
 
     dataset = dataset.map(
-      lambda value: parse_record(value, self.params['mode'] == 'train'),
+      lambda value: parse_record(
+        raw_record=value,
+        is_training=self.params['mode'] == 'train',
+        image_size=self.params.get('image_size', 224),
+        num_classes=self.params.get('num_classes', 1000),
+      ),
       num_parallel_calls=self.params.get('num_parallel_calls', 16),
     )
 
     dataset = dataset.batch(self.params['batch_size'])
-    dataset = dataset.prefetch(1)
+    dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
 
     self._iterator = dataset.make_initializable_iterator()
     inputs, labels = self.iterator.get_next()
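# Usage note (illustrative, not part of this patch): the two new optional
# parameters surface values that were previously hard-coded, so a config can
# now request a different resolution or label count, e.g.
#
#   "data_layer_params": {
#       "data_dir": "data/imagenet-tfrecords",  # hypothetical path
#       "image_size": 299,
#       "num_classes": 1000,
#   },
#
# with image_size defaulting to 224 and num_classes to 1000 when omitted.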
diff --git a/docs/html/_modules/data/image2label/imagenet_preprocessing.html b/docs/html/_modules/data/image2label/imagenet_preprocessing.html
index f5f425d67..6c7ad01bf 100644
--- a/docs/html/_modules/data/image2label/imagenet_preprocessing.html
+++ b/docs/html/_modules/data/image2label/imagenet_preprocessing.html
@@ -168,22 +168,17 @@ 

Source code for data.image2label.imagenet_preprocessing

# limitations under the License. # ============================================================================== """Provides utilities to preprocess images. - Training images are sampled using the provided bounding boxes, and subsequently cropped to the sampled bounding box. Images are additionally flipped randomly, then resized to the target output size (without aspect-ratio preservation). - Images used during evaluation are resized (with aspect-ratio preservation) and centrally cropped. - All images undergo mean color subtraction. - Note that these steps are colloquially referred to as "ResNet preprocessing," and they differ from "VGG preprocessing," which does not use bounding boxes and instead does an aspect-preserving resize followed by random crop during training. (These both differ from "Inception preprocessing," which introduces color distortion steps.) - """ from __future__ import absolute_import @@ -202,14 +197,9 @@

Source code for data.image2label.imagenet_preprocessing

# _RESIZE_MIN x (_RESIZE_MIN * 2). _RESIZE_MIN = 256 -_DEFAULT_IMAGE_SIZE = 224 -_NUM_CHANNELS = 3 -_NUM_CLASSES = 1001 -
[docs]def _decode_crop_and_flip(image_buffer, bbox, num_channels): """Crops the given image to a random part of the image, and randomly flips. - We use the fused decode_and_crop op, which performs better than the two ops used separately in series, but note that this requires that the image be passed in as an un-decoded string Tensor. @@ -223,7 +213,6 @@

Source code for data.image2label.imagenet_preprocessing

Returns: 3-D tensor with cropped image. - """ # A large fraction of image datasets contain a human-annotated bounding box # delineating the region of the image containing the object of interest. We @@ -278,12 +267,12 @@

Source code for data.image2label.imagenet_preprocessing

image, [crop_top, crop_left, 0], [crop_height, crop_width, -1])
-
[docs]def _mean_image_subtraction(image, means, num_channels): - """Subtracts the given means from each image channel. +
[docs]def _mean_image_subtraction_and_normalization(image, means, num_channels): + """Subtracts the given means from each image channel and divides by 127.5. For example: means = [123.68, 116.779, 103.939] - image = _mean_image_subtraction(image, means) + image = _mean_image_subtraction_and_normalization(image, means) Note that the rank of `image` must be known. @@ -293,7 +282,7 @@

Source code for data.image2label.imagenet_preprocessing

num_channels: number of color channels in the image that will be distorted. Returns: - the centered image. + the centered image and normalized image. Raises: ValueError: If the rank of `image` is unknown, if `image` has a rank other @@ -309,12 +298,11 @@

Source code for data.image2label.imagenet_preprocessing

# We have a 1-D tensor of means; convert to 3-D. means = tf.expand_dims(tf.expand_dims(means, 0), 0) - return image - means
+ return (image - means) / 127.5
[docs]def _smallest_size_at_least(height, width, resize_min): """Computes new shape with the smallest side equal to `smallest_side`. - Computes new shape with the smallest side equal to `smallest_side` while preserving the original aspect ratio. @@ -364,7 +352,6 @@

Source code for data.image2label.imagenet_preprocessing

[docs]def _resize_image(image, height, width): """Simple wrapper around tf.resize_images. - This is primarily to make sure we use the same `ResizeMethod` and other details each time. @@ -385,7 +372,6 @@

Source code for data.image2label.imagenet_preprocessing

[docs]def preprocess_image(image_buffer, bbox, output_height, output_width, num_channels, is_training=False): """Preprocesses the given image. - Preprocessing includes decoding, cropping, and resizing for both training and eval images. Training preprocessing, however, introduces some random distortion of the image to improve accuracy. @@ -416,16 +402,15 @@

Source code for data.image2label.imagenet_preprocessing

image.set_shape([output_height, output_width, num_channels]) - return _mean_image_subtraction(image, _CHANNEL_MEANS, num_channels)
+  return _mean_image_subtraction_and_normalization(image, _CHANNEL_MEANS,
+                                                    num_channels)
[docs]def _parse_example_proto(example_serialized): """Parses an Example proto containing a training example of an image. - The output of the build_image_data.py image preprocessing script is a dataset containing serialized Example protocol buffers. Each Example proto contains the following fields (values are included as examples): - image/height: 462 image/width: 581 image/colorspace: 'RGB' @@ -489,16 +474,17 @@

Source code for data.image2label.imagenet_preprocessing

return features['image/encoded'], label, bbox
-
[docs]def parse_record(raw_record, is_training): +
[docs]def parse_record(raw_record, is_training, image_size=224, num_classes=1000): """Parses a record containing a training example of an image. - The input record is parsed into a label and image, and the image is passed through preprocessing steps (cropping, flipping, and so on). Args: raw_record: scalar Tensor tf.string containing a serialized - Example protocol buffer. + Example protocol buffer. is_training: A boolean denoting whether the input is for training. + image_size (int): size that images should be resized to. + num_classes (int): number of output classes. Returns: Tuple with processed image tensor and one-hot-encoded label tensor. @@ -508,12 +494,13 @@

Source code for data.image2label.imagenet_preprocessing

image = preprocess_image( image_buffer=image_buffer, bbox=bbox, - output_height=_DEFAULT_IMAGE_SIZE, - output_width=_DEFAULT_IMAGE_SIZE, - num_channels=_NUM_CHANNELS, + output_height=image_size, + output_width=image_size, + num_channels=3, is_training=is_training) - label = tf.one_hot(tf.reshape(label, shape=[]), _NUM_CLASSES) + # subtracting 1 to make labels go from 0 to 999 + label = tf.one_hot(tf.reshape(label - 1, shape=[]), num_classes) return image, label
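For reference, the effect of the two preprocessing changes above, mean subtraction followed by division by 127.5 and shifting the 1-based TFRecord labels to 0-based before one-hot encoding, can be sketched in plain NumPy (the channel means are the ones used by the surrounding code):

    import numpy as np

    _CHANNEL_MEANS = np.array([123.68, 116.779, 103.939])  # R, G, B means

    def normalize_image(image):
      """Center the image and scale it roughly into [-1, 1]."""
      return (image - _CHANNEL_MEANS) / 127.5

    def one_hot_label(raw_label, num_classes=1000):
      """TFRecord labels run from 1 to 1000; shift to 0..999 before one-hot."""
      one_hot = np.zeros(num_classes, dtype=np.float32)
      one_hot[raw_label - 1] = 1.0
      return one_hot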
diff --git a/docs/html/_modules/data/speech2text/speech2text.html b/docs/html/_modules/data/speech2text/speech2text.html index 4f55f45b1..3f527aaab 100644 --- a/docs/html/_modules/data/speech2text/speech2text.html +++ b/docs/html/_modules/data/speech2text/speech2text.html @@ -175,7 +175,7 @@

Source code for data.speech2text.speech2text

def get_required_params():
     return dict(DataLayer.get_required_params(), **{
       'num_audio_features': int,
-      'input_type': ['spectrogram', 'mfcc'],
+      'input_type': ['spectrogram', 'mfcc', 'logfbank'],
       'vocab_file': str,
       'dataset_files': list,
     })
@@ -187,7 +187,7 @@

Source code for data.speech2text.speech2text

'pad_to': int,
     })
-
[docs] def __init__(self, params, model, num_workers=None, worker_id=None): +
[docs] def __init__(self, params, model, num_workers, worker_id): """Speech-to-text data layer constructor. See parent class for arguments description. @@ -241,7 +241,6 @@

Source code for data.speech2text.speech2text

self._input_tensors = None
[docs] def split_data(self, data): - """Method that performs data split for evaluation.""" if self.params['mode'] != 'train' and self._num_workers is not None: size = len(data) start = size // self._num_workers * self._worker_id @@ -260,12 +259,12 @@

Source code for data.speech2text.speech2text

[docs]  def build_graph(self):
     """Builds data processing graph using ``tf.data`` API."""
-    self._dataset = tf.data.Dataset.from_tensor_slices(self._files)
-    if self.params['shuffle']:
-      self._dataset = self._dataset.shuffle(self._size)
-    self._dataset = self._dataset.repeat()
-
     if self.params['mode'] != 'infer':
+      self._dataset = tf.data.Dataset.from_tensor_slices(self._files)
+      if self.params['shuffle']:
+        self._dataset = self._dataset.shuffle(self._size)
+      self._dataset = self._dataset.repeat()
+
       self._dataset = self._dataset.map(
         lambda line: tf.py_func(
           self._parse_audio_transcript_element,
@@ -280,21 +279,29 @@ 

Source code for data.speech2text.speech2text

padded_shapes=([None, self.params['num_audio_features']], 1, [None], 1)
       )
     else:
+      indices = self.split_data(
+        np.array(list(map(lambda num: str(num), range(len(self.all_files)))))
+      )
+      self._dataset = tf.data.Dataset.from_tensor_slices(
+        np.hstack((indices[:, np.newaxis], self._files[:, np.newaxis]))
+      )
+      self._dataset = self._dataset.repeat()
       self._dataset = self._dataset.map(
         lambda line: tf.py_func(
           self._parse_audio_element,
           [line],
-          [self.params['dtype'], tf.int32],
+          [self.params['dtype'], tf.int32, tf.int32],
           stateful=False,
         ),
         num_parallel_calls=8,
       )
       self._dataset = self._dataset.padded_batch(
         self.params['batch_size'],
-        padded_shapes=([None, self.params['num_audio_features']], 1)
+        padded_shapes=([None, self.params['num_audio_features']], 1, 1)
       )
 
-    self._iterator = self._dataset.prefetch(8).make_initializable_iterator()
+    self._iterator = self._dataset.prefetch(tf.contrib.data.AUTOTUNE)\
+                         .make_initializable_iterator()
 
     if self.params['mode'] != 'infer':
       x, x_length, y, y_length = self._iterator.get_next()
@@ -303,7 +310,9 @@ 

Source code for data.speech2text.speech2text

y.set_shape([self.params['batch_size'], None])
       y_length = tf.reshape(y_length, [self.params['batch_size']])
     else:
-      x, x_length = self._iterator.get_next()
+      x, x_length, x_id = self._iterator.get_next()
+      x_id = tf.reshape(x_id, [self.params['batch_size']])
+
     x.set_shape([self.params['batch_size'], None,
                  self.params['num_audio_features']])
     x_length = tf.reshape(x_length, [self.params['batch_size']])
@@ -311,7 +320,9 @@ 

Source code for data.speech2text.speech2text

self._input_tensors = {}
     self._input_tensors["source_tensors"] = [x, x_length]
     if self.params['mode'] != 'infer':
-      self._input_tensors['target_tensors'] = [y, y_length]
+      self._input_tensors['target_tensors'] = [y, y_length]
+    else:
+      self._input_tensors['source_ids'] = [x_id]
[docs] def _parse_audio_transcript_element(self, element): """Parses tf.data element from TextLineDataset into audio and text. @@ -338,15 +349,17 @@

Source code for data.speech2text.speech2text

np.int32(target), \
            np.int32([len(target)])
-
[docs] def _parse_audio_element(self, audio_filename): +
[docs] def _parse_audio_element(self, id_and_audio_filename): """Parses audio from file and returns array of audio features. Args: - audio_filename: audio file name. + id_and_audio_filename: tuple of sample id and corresponding audio file name. Returns: tuple: source audio features as ``np.array``, length of source sequence, + sample id. """ + idx, audio_filename = id_and_audio_filename pad_to = self.params.get('pad_to', 8) source = get_speech_features_from_file( audio_filename, self.params['num_audio_features'], pad_to, @@ -354,7 +367,7 @@

Source code for data.speech2text.speech2text

augmentation=self.params.get('augmentation', None),
     )
     return source.astype(self.params['dtype'].as_numpy_dtype()), \
-           np.int32([len(source)])
+           np.int32([len(source)]), np.int32([idx])
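In infer mode each dataset element is now an (id, filename) pair so that outputs can later be restored to the original file order. A rough sketch of the new return contract; the zero array below stands in for the features produced by get_speech_features_from_file:

    import numpy as np

    def parse_audio_element(id_and_audio_filename, num_audio_features=64):
      """Sketch: returns (features, source length, sample id) for one element."""
      idx, audio_filename = id_and_audio_filename
      # placeholder for get_speech_features_from_file(audio_filename, ...)
      source = np.zeros((100, num_audio_features), dtype=np.float32)
      return source, np.int32([len(source)]), np.int32([idx])

    features, length, sample_id = parse_audio_element(("3", "sample.wav"))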
@property def input_tensors(self): diff --git a/docs/html/_modules/data/speech2text/speech_utils.html b/docs/html/_modules/data/speech2text/speech_utils.html index 92f7dd343..0d9bfc77d 100644 --- a/docs/html/_modules/data/speech2text/speech_utils.html +++ b/docs/html/_modules/data/speech2text/speech_utils.html @@ -197,6 +197,13 @@

Source code for data.speech2text.speech_utils

)
+
+[docs]def normalize_signal(signal):
+  """
+  Normalize float32 signal to [-1, 1] range
+  """
+  return signal / np.max(np.abs(signal))
+ +
[docs]def augment_audio_signal(signal, fs, augmentation): """Function that performs audio signal augmentation. @@ -208,7 +215,7 @@

Source code for data.speech2text.speech_utils

Returns: np.array: np.array with augmented audio signal. """ - signal_float = signal.astype(np.float32) / 32768.0 + signal_float = normalize_signal(signal.astype(np.float32)) if augmentation['time_stretch_ratio'] > 0: # time stretch (might be slow) @@ -227,7 +234,7 @@

Source code for data.speech2text.speech_utils

signal_float += np.random.randn(signal_float.shape[0]) * \ 10.0 ** (noise_level_db / 20.0) - return (signal_float * 32768.0).astype(np.int16)
+ return (normalize_signal(signal_float) * 32767.0).astype(np.int16)
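The augmentation path now rescales the signal to a peak of 1.0 instead of assuming 16-bit scaling, and normalizes again before converting back to int16 so the augmented signal cannot clip. A small self-contained illustration of that round trip with a synthetic signal:

    import numpy as np

    def normalize_signal(signal):
      """Scale a float signal so that its peak magnitude is 1 (assumes non-silence)."""
      return signal / np.max(np.abs(signal))

    signal_int16 = (np.random.randn(16000) * 5000.0).astype(np.int16)
    signal_float = normalize_signal(signal_int16.astype(np.float32))
    # ... time stretch / noise augmentation would happen here ...
    augmented = (normalize_signal(signal_float) * 32767.0).astype(np.int16)
    assert np.max(np.abs(augmented)) <= 32767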
[docs]def get_speech_features(signal, fs, num_features, pad_to=8, @@ -273,7 +280,7 @@

Source code for data.speech2text.speech_utils

if pad_to > 0: if length % pad_to != 0: pad_size = (pad_to - length % pad_to) * n_window_stride - signal = np.pad(signal, (0, pad_size), mode='reflect') + signal = np.pad(signal, (0, pad_size), mode='constant') if features_type == 'spectrogram': frames = psf.sigproc.framesig(sig=signal, @@ -301,10 +308,22 @@

Source code for data.speech2text.speech_utils

    preemph=0.97, ceplifter=2*num_features, appendEnergy=False)
+
+  elif features_type == 'logfbank':
+    features = psf.logfbank(signal=signal,
+                            samplerate=fs,
+                            winlen=window_size,
+                            winstep=window_stride,
+                            nfilt=num_features,
+                            nfft=512,
+                            lowfreq=0, highfreq=fs/2,
+                            preemph=0.97)
+
   else:
     raise ValueError('Unknown features type: {}'.format(features_type))

-  assert features.shape[0] % pad_to == 0
+  if pad_to > 0:
+    assert features.shape[0] % pad_to == 0
   m = np.mean(features)
   s = np.std(features)
   features = (features - m) / s
diff --git a/docs/html/_modules/data/text2text/t2t.html b/docs/html/_modules/data/text2text/t2t.html
index 8caf1ebfe..adbbbff9f 100644
--- a/docs/html/_modules/data/text2text/t2t.html
+++ b/docs/html/_modules/data/text2text/t2t.html
@@ -200,8 +200,9 @@

Source code for data.text2text.t2t

    is the list of training files. Second, while reading records using
    `parallel_interleave`, the `sloppy` argument is used to generate randomness
    in the order of the examples.
-"""
 
+3. Modified slightly to fit OpenSeq2Seq needs
+"""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -227,7 +228,7 @@ 

Source code for data.text2text.t2t

   return tf.data.TFRecordDataset(filename, buffer_size=_READ_RECORD_BUFFER)
-
[docs]def _parse_example(serialized_example): +
[docs]def _parse_example(serialized_example, pad_2_eight=False): """Return inputs and targets Tensors from a serialized tf.Example.""" data_fields = { "inputs": tf.VarLenFeature(tf.int64), @@ -236,6 +237,17 @@

Source code for data.text2text.t2t

   parsed = tf.parse_single_example(serialized_example, data_fields)
   inputs = tf.sparse_tensor_to_dense(parsed["inputs"])
   targets = tf.sparse_tensor_to_dense(parsed["targets"])
+
+  if pad_2_eight:
+    inputs = tf.cond(tf.equal(tf.shape(inputs)[0] % 8, 0),
+                     true_fn=lambda:  inputs,
+                     false_fn=lambda: tf.pad(inputs,
+                                      paddings=[[0, 8 - tf.shape(inputs)[0] % 8]]))
+    targets = tf.cond(tf.equal(tf.shape(targets)[0] % 8, 0),
+                     true_fn=lambda:  targets,
+                     false_fn=lambda: tf.pad(targets,
+                                      paddings=[[0, 8 - tf.shape(targets)[0] % 8]]))
+
   return inputs, targets
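Padding inputs and targets to a multiple of 8 keeps the matrix dimensions aligned with the 8-wide tiles preferred by mixed-precision (HMMA / Tensor Core) kernels. The arithmetic is simply rounding a length up to the next multiple of 8, for example:

    def pad_amount(length, multiple=8):
      """Number of PAD symbols needed to make length divisible by multiple."""
      return 0 if length % multiple == 0 else multiple - length % multiple

    assert pad_amount(24) == 0
    assert pad_amount(27) == 5   # 27 tokens are padded up to 32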
@@ -283,7 +295,7 @@

Source code for data.text2text.t2t

   return buckets_min, buckets_max
-
[docs]def _batch_examples(dataset, batch_size, max_length): +
[docs]def _batch_examples(dataset, batch_size, max_length, pad_2_eight=True): """Group examples by similar lengths, and return batched dataset. Each batch of similar-length examples are padded to the same length, and may @@ -309,7 +321,12 @@

Source code for data.text2text.t2t

 
   # Create list of batch sizes for each bucket_id, so that
   # bucket_batch_size[bucket_id] * buckets_max[bucket_id] <= batch_size
-  bucket_batch_sizes = [batch_size // x for x in buckets_max]
+  if pad_2_eight: # pad to 8 for HMMA
+    bucket_batch_sizes = [
+      batch_size // x if batch_size // x % 8 == 0 else batch_size // x + (
+            8 - batch_size // x % 8) for x in buckets_max]
+  else:
+    bucket_batch_sizes = [batch_size // x for x in buckets_max]
   # bucket_id will be a tensor, so convert this list to a tensor as well.
   bucket_batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64)
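The same rounding is applied to the per-bucket batch sizes when pad_2_eight is set: instead of taking batch_size // max_length directly, the result is rounded up to the next multiple of 8. In plain Python, with illustrative numbers:

    def bucket_batch_size(batch_size, bucket_max_length, pad_2_eight=True):
      size = batch_size // bucket_max_length
      if pad_2_eight and size % 8 != 0:
        size += 8 - size % 8
      return size

    # e.g. a 4096-token batch and a bucket of sentences up to 70 tokens long
    assert bucket_batch_size(4096, 70, pad_2_eight=False) == 58
    assert bucket_batch_size(4096, 70, pad_2_eight=True) == 64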
 
@@ -331,7 +348,6 @@ 

Source code for data.text2text.t2t

   def batching_fn(bucket_id, grouped_dataset):
     """Batch and add padding to a dataset of elements with similar lengths."""
     bucket_batch_size = window_size_fn(bucket_id)
-
     # Batch the dataset and add padding so that all input sequences in the
     # examples have the same length, and all target sequences have the same
     # lengths as well. Resulting lengths of inputs and targets can differ.
@@ -346,7 +362,7 @@ 

Source code for data.text2text.t2t

 
 
[docs]def _read_and_batch_from_files( file_pattern, batch_size, max_length, num_cpu_cores, shuffle, repeat, - num_workers, worker_id): + num_workers, worker_id, batch_in_tokens, pad2eight=True): """Create dataset where each item is a dict of "inputs" and "targets". Args: @@ -359,6 +375,11 @@

Source code for data.text2text.t2t

       repeated forever.
     num_workers: Number of workers or number of Horovod workers
     worker_id: Worker id or Horovod rank
+    batch_in_tokens: whether batch_size is measured in tokens or in sentence
+      pairs. Batching in tokens is more efficient as it reduces the number of
+      PADs, while batching in sentences should be used in inference mode since
+      the order of sentences is important.
+    pad2eight: if True, pad both dimensions to be divisible by 8.
 
   Returns:
     tf.data.Dataset object containing examples loaded from the files.
@@ -379,14 +400,19 @@ 

Source code for data.text2text.t2t

 
   # Parse each tf.Example into a dictionary
   # TODO: Look into prefetch_input_elements for performance optimization.
-  dataset = dataset.map(_parse_example,
+  dataset = dataset.map(lambda x: _parse_example(x, pad_2_eight=pad2eight),
                         num_parallel_calls=num_cpu_cores)
 
   # Remove examples where the input or target length exceeds the maximum length,
   dataset = dataset.filter(lambda x, y: _filter_max_length((x, y), max_length))
 
-  # Batch such that each batch has examples of similar length.
-  dataset = _batch_examples(dataset, batch_size, max_length)
+  if batch_in_tokens:
+    # Batch such that each batch has examples of similar length.
+    dataset = _batch_examples(dataset, batch_size, max_length,
+                              pad_2_eight=pad2eight)
+  else:
+    # Examples can have different lengths
+    dataset = dataset.padded_batch(batch_size, ([None], [None]))
   dataset = dataset.repeat(repeat)
 
   # Prefetch the next element to improve speed of input pipeline.
diff --git a/docs/html/_modules/data/text2text/text2text.html b/docs/html/_modules/data/text2text/text2text.html
index 6af9cecb6..c59b0fc53 100644
--- a/docs/html/_modules/data/text2text/text2text.html
+++ b/docs/html/_modules/data/text2text/text2text.html
@@ -232,7 +232,8 @@ 

Source code for data.text2text.text2text

     self._delimiter = self.params.get('delimiter', ' ')
     self._map_parallel_calls = self.params.get('map_parallel_calls', 8)
     self._pad_lengths_to_eight = self.params.get('pad_lengths_to_eight', False)
-    self._prefetch_buffer_size = self.params.get('prefetch_buffer_size', 4)
+    self._prefetch_buffer_size = self.params.get('prefetch_buffer_size',
+                                                 tf.contrib.data.AUTOTUNE)
     self._num_workers = num_workers
     self._worker_id = worker_id
     if self._pad_lengths_to_eight and not (self.params['max_length'] % 8 == 0):
@@ -321,7 +322,7 @@ 

Source code for data.text2text.text2text

              [SpecialTextTokens.EOS_ID.value], self._pad_lengths_to_eight), dtype="int32")
 
     _sources = tf.data.TextLineDataset(self.source_file)\
-      .map(lambda line: tf.py_func(func=src_token_to_id,inp=[line],
+      .map(lambda line: tf.py_func(func=src_token_to_id, inp=[line],
                                    Tout=[tf.int32], stateful=False),
            num_parallel_calls=self._map_parallel_calls) \
       .map(lambda tokens: (tokens, tf.size(tokens)),
@@ -409,7 +410,8 @@ 

Source code for data.text2text.text2text

       'repeat': int,
       'num_cpu_cores': int,
       'tgt_vocab_file': str,
-      'm_padding': bool,
+      'pad_data_to_eight': bool,
+      'batch_in_tokens': bool,
     })
def __init__(self, params, model, num_workers=1, worker_id=0): @@ -456,38 +458,13 @@

Source code for data.text2text.text2text

       shuffle=self.params['shuffle'],
       repeat=self.params['repeat'],
       num_workers=self._num_workers,
-      worker_id=self._worker_id)
+      worker_id=self._worker_id,
+      batch_in_tokens=self.params.get('batch_in_tokens', True),
+      pad2eight=self.params.get('pad_data_to_eight', False))
 
     self._iterator = self.batched_dataset.make_initializable_iterator()
     x, y = self.iterator.get_next()
 
-    if self.params.get('m_padding', False):
-      # MAGIC PADDING
-      x = tf.cond(tf.equal(tf.shape(x)[1] % 8, 0),
-                  true_fn = lambda: x,
-                  false_fn = lambda: tf.pad(x,
-                                            paddings=[[0, 0],
-                                                      [0, 8 - tf.shape(x)[1] % 8]]))
-
-      y = tf.cond(tf.equal(tf.shape(y)[1] % 8, 0),
-                  true_fn = lambda: y,
-                  false_fn = lambda: tf.pad(y,
-                                            paddings=[[0, 0],
-                                                      [0, 8 - tf.shape(y)[1] % 8]]))
-
-      x = tf.cond(tf.equal(tf.shape(x)[0] % 8, 0),
-                  true_fn = lambda: x,
-                  false_fn = lambda: tf.pad(x,
-                                            paddings=[[0, 8 - tf.shape(x)[0] % 8],
-                                                      [0, 0]]))
-
-      y = tf.cond(tf.equal(tf.shape(y)[0] % 8, 0),
-                  true_fn=lambda: y,
-                  false_fn=lambda: tf.pad(y,
-                                          paddings=[[0, 8 - tf.shape(y)[0] % 8],
-                                                    [0, 0]]))
-      # ENDOF MAGIC PADDING
-
     len_x = tf.count_nonzero(x, axis=1, dtype=tf.int32)
     len_y = tf.count_nonzero(y, axis=1, dtype=tf.int32)
     if self.params['mode'] == 'train' or self.params['mode'] == 'eval':
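With the in-graph "MAGIC PADDING" removed, batching and padding behaviour is controlled entirely from the data-layer configuration. A hypothetical config fragment using the new options; only the keys relevant to this change are shown and the values are illustrative:

    data_layer_params = {
        "batch_size": 4096,          # counted in tokens when batch_in_tokens is True
        "batch_in_tokens": True,     # use False in inference to keep sentence order
        "pad_data_to_eight": True,   # pad both dimensions to multiples of 8
    }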
diff --git a/docs/html/_modules/decoders/convs2s_decoder.html b/docs/html/_modules/decoders/convs2s_decoder.html
new file mode 100644
index 000000000..1b084dd87
--- /dev/null
+++ b/docs/html/_modules/decoders/convs2s_decoder.html
@@ -0,0 +1,602 @@
+  decoders.convs2s_decoder — OpenSeq2Seq 0.2 documentation

Source code for decoders.convs2s_decoder

+from __future__ import absolute_import, division, print_function
+from __future__ import unicode_literals
+
+import tensorflow as tf
+import math
+from .decoder import Decoder
+
+from open_seq2seq.parts.transformer import beam_search
+
+from open_seq2seq.parts.transformer import embedding_layer
+from open_seq2seq.parts.transformer.utils import get_padding
+
+from open_seq2seq.parts.convs2s import ffn_wn_layer, conv_wn_layer, attention_wn_layer
+
+# Default value used if max_input_length is not given
+MAX_INPUT_LENGTH = 128
+
+
+
[docs]class ConvS2SDecoder(Decoder): + +
[docs] @staticmethod + def get_required_params(): + """Static method with description of required parameters. + + Returns: + dict: + Dictionary containing all the parameters that **have to** be + included into the ``params`` parameter of the + class :meth:`__init__` method. + """ + return dict( + Decoder.get_required_params(), **{ + 'batch_size': int, + 'decoder_layers': int, + 'tgt_emb_size': int, + 'tgt_vocab_size': int, + 'shared_embed': bool, + 'embedding_dropout_keep_prob': float, + 'conv_nchannels_kwidth': list, + 'hidden_dropout_keep_prob': float, + 'out_dropout_keep_prob': float, + 'beam_size': int, + 'alpha': float, + 'extra_decode_length': int, + 'EOS_ID': int, + })
+ +
[docs] @staticmethod + def get_optional_params(): + """Static method with description of optional parameters. + + Returns: + dict: + Dictionary containing all the parameters that **can** be + included into the ``params`` parameter of the + class :meth:`__init__` method. + """ + return dict( + Decoder.get_optional_params(), + **{ + 'pad_embeddings_2_eight': bool, + + # if not provided, tgt_emb_size is used as the default value + 'out_emb_size': int, + 'max_input_length': int, + 'GO_SYMBOL': int, + 'PAD_SYMBOL': int, + 'END_SYMBOL': int, + })
+ + def _cast_types(self, input_dict): + return input_dict + + def __init__(self, params, model, name="convs2s_decoder", mode='train'): + super(ConvS2SDecoder, self).__init__(params, model, name, mode) + self.embedding_softmax_layer = None + self.position_embedding_layer = None + self.layers = [] + self._tgt_vocab_size = self.params['tgt_vocab_size'] + self._tgt_emb_size = self.params['tgt_emb_size'] + self._mode = mode + self._pad_sym = self.params.get('PAD_SYMBOL', 0) + self._pad2eight = params.get('pad_embeddings_2_eight', False) + + def _decode(self, input_dict): + targets = input_dict['target_tensors'][0] \ + if 'target_tensors' in input_dict else None + + encoder_outputs = input_dict['encoder_output']['outputs'] + encoder_outputs_b = input_dict['encoder_output'].get( + 'outputs_b', encoder_outputs) + + inputs_attention_bias = input_dict['encoder_output'].get( + 'inputs_attention_bias_cs2s', None) + + with tf.name_scope("decode"): + # prepare decoder layers + if len(self.layers) == 0: + knum_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[0] + kwidth_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[1] + + # preparing embedding layers + with tf.variable_scope("embedding"): + if 'embedding_softmax_layer' in input_dict['encoder_output'] \ + and self.params['shared_embed']: + self.embedding_softmax_layer = \ + input_dict['encoder_output']['embedding_softmax_layer'] + else: + self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights( + vocab_size=self._tgt_vocab_size, + hidden_size=self._tgt_emb_size, + pad_vocab_to_eight=self._pad2eight, + init_var=0.1, + embed_scale=False, + pad_sym=self._pad_sym, + mask_paddings=False) + + with tf.variable_scope("pos_embedding"): + if 'position_embedding_layer' in input_dict['encoder_output'] \ + and self.params['shared_embed']: + self.position_embedding_layer = \ + input_dict['encoder_output']['position_embedding_layer'] + else: + self.position_embedding_layer = embedding_layer.EmbeddingSharedWeights( + vocab_size=self.params.get("max_input_length", + MAX_INPUT_LENGTH), + hidden_size=self._tgt_emb_size, + pad_vocab_to_eight=self._pad2eight, + init_var=0.1, + embed_scale=False, + pad_sym=self._pad_sym, + mask_paddings=False) + + # linear projection before cnn layers + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + self._tgt_emb_size, + knum_list[0], + dropout=self.params["embedding_dropout_keep_prob"], + var_scope_name="linear_mapping_before_cnn_layers")) + + for i in range(self.params['decoder_layers']): + in_dim = knum_list[i] if i == 0 else knum_list[i - 1] + out_dim = knum_list[i] + + # linear projection is needed for residual connections if + # input and output of a cnn layer do not match + if in_dim != out_dim: + linear_proj = ffn_wn_layer.FeedFowardNetworkNormalized( + in_dim, + out_dim, + var_scope_name="linear_mapping_cnn_" + str(i + 1), + dropout=1.0) + else: + linear_proj = None + + conv_layer = conv_wn_layer.Conv1DNetworkNormalized( + in_dim, + out_dim, + kernel_width=kwidth_list[i], + mode=self.mode, + layer_id=i + 1, + hidden_dropout=self.params["hidden_dropout_keep_prob"], + conv_padding="VALID", + decode_padding=True) + + att_layer = attention_wn_layer.AttentionLayerNormalized( + out_dim, + embed_size=self._tgt_emb_size, + layer_id=i + 1, + add_res=True) + + self.layers.append([linear_proj, conv_layer, att_layer]) + + # linear projection after cnn layers + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + knum_list[self.params['decoder_layers'] - 1], + 
self.params.get("out_emb_size", self._tgt_emb_size), + dropout=1.0, + var_scope_name="linear_mapping_after_cnn_layers")) + + if not self.params['shared_embed']: + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + self.params.get("out_emb_size", self._tgt_emb_size), + self._tgt_vocab_size, + dropout=self.params["out_dropout_keep_prob"], + var_scope_name="linear_mapping_to_vocabspace")) + else: + # if embedding is shared, + # the shared embedding is used as the final linear projection to vocab space + self.layers.append(None) + + if targets is None: + return self.predict(encoder_outputs, encoder_outputs_b, + inputs_attention_bias) + else: + logits = self.decode_pass(targets, encoder_outputs, encoder_outputs_b, + inputs_attention_bias) + return { + "logits": logits, + "outputs": [tf.argmax(logits, axis=-1)], + "final_state": None, + "final_sequence_lengths": None + } + +
[docs] def decode_pass(self, targets, encoder_outputs, encoder_outputs_b, + inputs_attention_bias): + """Generate logits for each value in the target sequence. + + Args: + targets: target values for the output sequence. + int tensor with shape [batch_size, target_length] + encoder_outputs: continuous representation of input sequence. + float tensor with shape [batch_size, input_length, hidden_size] + float tensor with shape [batch_size, input_length, hidden_size] + encoder_outputs_b: continuous representation of input sequence + which includes the source embeddings. + float tensor with shape [batch_size, input_length, hidden_size] + inputs_attention_bias: float tensor with shape [batch_size, 1, input_length] + + Returns: + float32 tensor with shape [batch_size, target_length, vocab_size] + """ + + # Prepare inputs to decoder layers by applying embedding + # and adding positional encoding. + decoder_inputs = self.embedding_softmax_layer(targets) + + with tf.name_scope("add_pos_encoding"): + pos_input = tf.range( + 0, tf.shape(decoder_inputs)[1], delta=1, dtype=tf.int32, name='range') + pos_encoding = self.position_embedding_layer(pos_input) + decoder_inputs = decoder_inputs + tf.cast( + x=pos_encoding, dtype=decoder_inputs.dtype) + + if self.mode == "train": + decoder_inputs = tf.nn.dropout(decoder_inputs, + self.params["embedding_dropout_keep_prob"]) + + # mask the paddings in the target + inputs_padding = get_padding( + targets, padding_value=self._pad_sym, dtype=decoder_inputs.dtype) + decoder_inputs *= tf.expand_dims(1.0 - inputs_padding, 2) + + # do decode + logits = self._call( + decoder_inputs=decoder_inputs, + encoder_outputs_a=encoder_outputs, + encoder_outputs_b=encoder_outputs_b, + input_attention_bias=inputs_attention_bias) + + return logits
+ + def _call(self, decoder_inputs, encoder_outputs_a, encoder_outputs_b, + input_attention_bias): + # run input into the decoder layers and returns the logits + target_embed = decoder_inputs + with tf.variable_scope("linear_layer_before_cnn_layers"): + outputs = self.layers[0](decoder_inputs) + + for i in range(1, len(self.layers) - 2): + linear_proj, conv_layer, att_layer = self.layers[i] + + with tf.variable_scope("layer_%d" % i): + if linear_proj is not None: + res_inputs = linear_proj(outputs) + else: + res_inputs = outputs + + with tf.variable_scope("conv_layer"): + outputs = conv_layer(outputs) + + with tf.variable_scope("attention_layer"): + outputs = att_layer(outputs, target_embed, encoder_outputs_a, + encoder_outputs_b, input_attention_bias) + outputs = (outputs + res_inputs) * math.sqrt(0.5) + + with tf.variable_scope("linear_layer_after_cnn_layers"): + outputs = self.layers[-2](outputs) + + if self.mode == "train": + outputs = tf.nn.dropout(outputs, self.params["out_dropout_keep_prob"]) + + with tf.variable_scope("pre_softmax_projection"): + if self.layers[-1] is None: + logits = self.embedding_softmax_layer.linear(outputs) + else: + logits = self.layers[-1](outputs) + + return tf.cast(logits, dtype=tf.float32) + +
[docs] def predict(self, encoder_outputs, encoder_outputs_b, inputs_attention_bias): + """Return predicted sequence.""" + batch_size = tf.shape(encoder_outputs)[0] + input_length = tf.shape(encoder_outputs)[1] + max_decode_length = input_length + self.params["extra_decode_length"] + + symbols_to_logits_fn = self._get_symbols_to_logits_fn() + + # Create initial set of IDs that will be passed into symbols_to_logits_fn. + initial_ids = tf.zeros( + [batch_size], dtype=tf.int32) + self.params["GO_SYMBOL"] + + cache = {} + # Add encoder outputs and attention bias to the cache. + cache["encoder_outputs"] = encoder_outputs + cache["encoder_outputs_b"] = encoder_outputs_b + if inputs_attention_bias is not None: + cache["inputs_attention_bias"] = inputs_attention_bias + + # Use beam search to find the top beam_size sequences and scores. + decoded_ids, scores = beam_search.sequence_beam_search( + symbols_to_logits_fn=symbols_to_logits_fn, + initial_ids=initial_ids, + initial_cache=cache, + vocab_size=self.params["tgt_vocab_size"], + beam_size=self.params["beam_size"], + alpha=self.params["alpha"], + max_decode_length=max_decode_length, + eos_id=self.params["EOS_ID"]) + + # Get the top sequence for each batch element + top_decoded_ids = decoded_ids[:, 0, :] + top_scores = scores[:, 0] + + # this isn't particularly efficient + logits = self.decode_pass(top_decoded_ids, encoder_outputs, + encoder_outputs_b, inputs_attention_bias) + + return { + "logits": logits, + "outputs": [top_decoded_ids], + "final_state": None, + "final_sequence_lengths": None + }
+ +
[docs] def _get_symbols_to_logits_fn(self): + """Returns a decoding function that calculates logits of the next tokens.""" + + def symbols_to_logits_fn(ids, i, cache): + """Generate logits for next potential IDs. + + Args: + ids: Current decoded sequences. + int tensor with shape [batch_size * beam_size, i - 1] + i: Loop index + cache: dictionary of values storing the encoder output, encoder-decoder + attention bias, and previous decoder attention values. + + Returns: + Tuple of + (logits with shape [batch_size * beam_size, vocab_size], + updated cache values) + """ + + # pass the decoded ids from the beginneing up to the current into the decoder + # not efficient + decoder_outputs = self.decode_pass(ids, cache.get("encoder_outputs"), + cache.get("encoder_outputs_b"), + cache.get("inputs_attention_bias")) + + logits = decoder_outputs[:, i, :] + return logits, cache + + return symbols_to_logits_fn
+
+ + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/html/_modules/decoders/decoder.html b/docs/html/_modules/decoders/decoder.html index 0751e74a4..bd4c98f82 100644 --- a/docs/html/_modules/decoders/decoder.html +++ b/docs/html/_modules/decoders/decoder.html @@ -242,24 +242,9 @@

Source code for decoders.decoder

       else:
         self._params['dtype'] = tf.float32
 
-    if 'regularizer' not in self._params:
-      if self._model and 'regularizer' in self._model.params:
-        self._params['regularizer'] = self._model.params['regularizer']
-        self._params['regularizer_params'] = self._model.params['regularizer_params']
-
-    if 'regularizer' in self._params:
-      init_dict = self._params.get('regularizer_params', {})
-      self._params['regularizer'] = self._params['regularizer'](**init_dict)
-      if self._params['dtype'] == 'mixed':
-        self._params['regularizer'] = mp_regularizer_wrapper(
-          self._params['regularizer'],
-        )
-
-    if self._params['dtype'] == 'mixed':
-      self._params['dtype'] = tf.float16
-
     self._name = name
-    self._mode = mode
+    self._mode = mode
+    self._compiled = False
[docs] def decode(self, input_dict): """Wrapper around :meth:`self._decode() <_decode>` method. @@ -272,12 +257,35 @@

Source code for decoders.decoder

     Returns:
       see :meth:`self._decode() <_decode>` docs.
     """
+    if not self._compiled:
+      if 'regularizer' not in self._params:
+        if self._model and 'regularizer' in self._model.params:
+          self._params['regularizer'] = copy.deepcopy(
+            self._model.params['regularizer']
+          )
+          self._params['regularizer_params'] = copy.deepcopy(
+            self._model.params['regularizer_params']
+          )
+
+      if 'regularizer' in self._params:
+        init_dict = self._params.get('regularizer_params', {})
+        self._params['regularizer'] = self._params['regularizer'](**init_dict)
+        if self._params['dtype'] == 'mixed':
+          self._params['regularizer'] = mp_regularizer_wrapper(
+            self._params['regularizer'],
+          )
+
+      if self._params['dtype'] == 'mixed':
+        self._params['dtype'] = tf.float16
+      
     if 'initializer' in self.params:
       init_dict = self.params.get('initializer_params', {})
       initializer = self.params['initializer'](**init_dict)
     else:
       initializer = None
 
+    self._compiled = True
+
     with tf.variable_scope(self._name, initializer=initializer,
                            dtype=self.params['dtype']):
       return self._decode(self._cast_types(input_dict))
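Regularizer and dtype handling is now deferred from the constructor to the first decode() call and guarded by a _compiled flag, so that reusing the same decoder object does not wrap the regularizer twice. The pattern, reduced to its core (everything outside the flag handling is schematic):

    class LazyCompiledDecoder(object):
      """Sketch of the compile-on-first-call pattern used in decode() above."""

      def __init__(self, params):
        self._params = dict(params)
        self._compiled = False

      def decode(self, input_dict):
        if not self._compiled:
          # one-time setup: instantiate the regularizer from its parameters
          if 'regularizer' in self._params:
            init_dict = self._params.get('regularizer_params', {})
            self._params['regularizer'] = self._params['regularizer'](**init_dict)
          self._compiled = True
        return self._decode(input_dict)

      def _decode(self, input_dict):
        return input_dict  # placeholder for the real decoding graph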
@@ -315,7 +323,8 @@

Source code for decoders.decoder

 
           {
             "logits": logits that will be passed to Loss
-            "samples": actual decoded output, e.g. characters instead of logits
+            "outputs": list with actual decoded outputs, e.g. characters
+                       instead of logits
           }
     """
     pass
diff --git a/docs/html/_modules/decoders/fc_decoders.html b/docs/html/_modules/decoders/fc_decoders.html index e2aecfac0..c8680966c 100644 --- a/docs/html/_modules/decoders/fc_decoders.html +++ b/docs/html/_modules/decoders/fc_decoders.html @@ -209,7 +209,7 @@

Source code for decoders.fc_decoders

 
         {
           'logits': logits with the shape=[batch_size, output_dim]
-          'samples': [logits] (same as logits but wrapped in list)
+          'outputs': [logits] (same as logits but wrapped in list)
         }
     """
     inputs = input_dict['encoder_output']['outputs']
@@ -222,7 +222,7 @@ 

Source code for decoders.fc_decoders

       kernel_regularizer=regularizer,
       name='fully_connected',
     )
-    return {'logits': logits, 'samples': [logits]}
+    return {'logits': logits, 'outputs': [logits]}
[docs]class FullyConnectedTimeDecoder(Decoder): @@ -252,7 +252,7 @@

Source code for decoders.fc_decoders

     * **tgt_vocab_size** (int) --- target vocabulary size, i.e. number of
       output features.
     * **logits_to_outputs_func** --- function that maps produced logits to
-      decoder samples, i.e. actual text sequences.
+      decoder outputs, i.e. actual text sequences.
     """
     super(FullyConnectedTimeDecoder, self).__init__(params, model, name, mode)
@@ -274,7 +274,7 @@

Source code for decoders.fc_decoders

 
         {
           'logits': logits with the shape=[time length, batch_size, tgt_vocab_size]
-          'samples': logits_to_outputs_func(logits, input_dict)
+          'outputs': logits_to_outputs_func(logits, input_dict)
         }
     """
     inputs = input_dict['encoder_output']['outputs']
@@ -301,9 +301,9 @@ 

Source code for decoders.fc_decoders

     logits = tf.transpose(logits, [1, 0, 2])
 
     if 'logits_to_outputs_func' in self.params:
-      samples = self.params['logits_to_outputs_func'](logits, input_dict)
+      outputs = self.params['logits_to_outputs_func'](logits, input_dict)
       return {
-        'samples': samples,
+        'outputs': outputs,
         'logits': logits,
         'src_length': input_dict['encoder_output']['src_length'],
       }
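Because the output dictionary key changed from 'samples' to 'outputs', a user-supplied logits_to_outputs_func keeps working unchanged; its result simply lands under the new key. A hedged example of such a function for CTC-style greedy decoding; the exact fields available in input_dict may differ from this sketch:

    import tensorflow as tf

    def ctc_greedy_outputs(logits, input_dict):
      """Illustrative logits_to_outputs_func: greedy CTC decode of time-major logits."""
      # logits are assumed to have shape [time, batch, vocab] at this point
      seq_len = tf.cast(input_dict['encoder_output']['src_length'], tf.int32)
      decoded, _ = tf.nn.ctc_greedy_decoder(logits, seq_len)
      return [tf.sparse_tensor_to_dense(decoded[0])]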
diff --git a/docs/html/_modules/decoders/rnn_decoders.html b/docs/html/_modules/decoders/rnn_decoders.html
index f5cd124d2..e049842a6 100644
--- a/docs/html/_modules/decoders/rnn_decoders.html
+++ b/docs/html/_modules/decoders/rnn_decoders.html
@@ -165,7 +165,7 @@ 

Source code for decoders.rnn_decoders

 
 from open_seq2seq.parts.rnns.gnmt import GNMTAttentionMultiCell, \
                                                            gnmt_residual_fn
-from open_seq2seq.parts.rnns.utils import create_rnn_cell
+from open_seq2seq.parts.rnns.utils import single_cell
 from open_seq2seq.parts.rnns.attention_wrapper import BahdanauAttention, \
                                                  LuongAttention, \
                                                  AttentionWrapper
@@ -185,8 +185,7 @@ 

Source code for decoders.rnn_decoders

       'tgt_emb_size': int,
       'attention_layer_size': int,
       'attention_type': ['bahdanau', 'luong', 'gnmt', 'gnmt_v2'],
-      'decoder_cell_units': int,
-      'decoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'],
+      'core_cell': None,
       'decoder_layers': int,
       'decoder_use_skip_connections': bool,
       'batch_size': int,
@@ -195,6 +194,7 @@ 

Source code for decoders.rnn_decoders

 
[docs] @staticmethod def get_optional_params(): return dict(Decoder.get_optional_params(), **{ + 'core_cell_params': dict, 'bahdanau_normalize': bool, 'luong_scale': bool, 'decoder_dp_input_keep_prob': float, @@ -220,8 +220,8 @@

Source code for decoders.rnn_decoders

     * **END_SYMBOL** (int) --- END symbol id, must be the same as used in
       data layer.
     * **tgt_emb_size** (int) --- embedding size to use.
-    * **decoder_cell_units** (int) - number of units in RNN
-    * **decoder_cell_type** (string) - RNN type: lstm, gru, glstm, etc.
+    * **core_cell_params** (dict) - parameters for RNN class
+    * **core_cell** (string) - RNN class.
     * **decoder_dp_input_keep_prob** (float) - dropout input keep probability.
     * **decoder_dp_output_keep_prob** (float) - dropout output keep probability.
     * **decoder_use_skip_connections** (bool) - use residual connections or not.
@@ -339,8 +339,8 @@ 

Source code for decoders.rnn_decoders

       self._tgt_vocab_size, use_bias=False,
     )
 
-    cell_params = copy.deepcopy(self.params)
-    cell_params["num_units"] = self.params['decoder_cell_units']
+    #cell_params = copy.deepcopy(self.params)
+    #cell_params["num_units"] = self.params['decoder_cell_units']
 
     if self._mode == "train":
       dp_input_keep_prob = self.params['decoder_dp_input_keep_prob']
@@ -349,22 +349,17 @@ 

Source code for decoders.rnn_decoders

       dp_input_keep_prob = 1.0
       dp_output_keep_prob = 1.0
 
-    if self.params['attention_type'].startswith('gnmt'):
-      residual_connections = False
-      wrap_to_multi_rnn = False
-    else:
-      residual_connections = self.params['decoder_use_skip_connections']
-      wrap_to_multi_rnn = True
-
-    self._decoder_cells = create_rnn_cell(
-      cell_type=self.params['decoder_cell_type'],
-      cell_params=cell_params,
-      num_layers=self.params['decoder_layers'],
-      dp_input_keep_prob=dp_input_keep_prob,
-      dp_output_keep_prob=dp_output_keep_prob,
-      residual_connections=residual_connections,
-      wrap_to_multi_rnn=wrap_to_multi_rnn,
-    )
+    residual_connections = self.params['decoder_use_skip_connections']
+
+    # list of cells
+    self._decoder_cells = [
+      single_cell(cell_class=self.params['core_cell'],
+                  cell_params=self.params.get('core_cell_params', {}),
+                  dp_input_keep_prob=dp_input_keep_prob,
+                  dp_output_keep_prob=dp_output_keep_prob,
+                  # residual connections are added a little differently for GNMT
+                  residual_connections=False if self.params['attention_type'].startswith('gnmt') else residual_connections,
+                  ) for _ in range(self.params['decoder_layers'])]
 
     attention_mechanism = self._build_attention(
       encoder_outputs,
@@ -372,7 +367,6 @@ 

Source code for decoders.rnn_decoders

     )
     if self.params['attention_type'].startswith('gnmt'):
       attention_cell = self._decoder_cells.pop(0)
-      # attention_cell = tf.contrib.seq2seq.AttentionWrapper(
       attention_cell = AttentionWrapper(
         attention_cell,
         attention_mechanism=attention_mechanism,
@@ -380,12 +374,12 @@ 

Source code for decoders.rnn_decoders

         output_attention=False,
         name="gnmt_attention")
       attentive_decoder_cell = GNMTAttentionMultiCell(
-        attention_cell, self._add_residual_wrapper(self._decoder_cells),
+        attention_cell, self._add_residual_wrapper(self._decoder_cells) if residual_connections else self._decoder_cells,
         use_new_attention=(self.params['attention_type'] == 'gnmt_v2'))
     else:
       # attentive_decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
       attentive_decoder_cell = AttentionWrapper(
-        cell=self._decoder_cells,
+        cell=tf.contrib.rnn.MultiRNNCell(self._decoder_cells),
         attention_mechanism=attention_mechanism,
       )
     if self._mode == "train":
@@ -438,8 +432,9 @@ 

Source code for decoders.rnn_decoders

       output_time_major=time_major,
     )
 
-    return {'logits': final_outputs.rnn_output,
-            'samples': [tf.argmax(final_outputs.rnn_output, axis=-1)],
+    return {'logits': final_outputs.rnn_output if not time_major else
+            tf.transpose(final_outputs.rnn_output, perm=[1, 0, 2]),
+            'outputs': [tf.argmax(final_outputs.rnn_output, axis=-1)],
             'final_state': final_state,
             'final_sequence_lengths': final_sequence_lengths}
@@ -526,8 +521,8 @@

Source code for decoders.rnn_decoders

       self._tgt_vocab_size, use_bias=False,
     )
 
-    cell_params = copy.deepcopy(self.params)
-    cell_params["num_units"] = self.params['decoder_cell_units']
+    #cell_params = copy.deepcopy(self.params)
+    #cell_params["num_units"] = self.params['decoder_cell_units']
 
     if self._mode == "train":
       dp_input_keep_prob = self.params['decoder_dp_input_keep_prob']
@@ -536,22 +531,34 @@ 

Source code for decoders.rnn_decoders

       dp_input_keep_prob = 1.0
       dp_output_keep_prob = 1.0
 
-    if self.params['attention_type'].startswith('gnmt'):
-      residual_connections = False
-      wrap_to_multi_rnn = False
-    else:
-      residual_connections = self.params['decoder_use_skip_connections']
-      wrap_to_multi_rnn = True
-
-    self._decoder_cells = create_rnn_cell(
-      cell_type=self.params['decoder_cell_type'],
-      cell_params=cell_params,
-      num_layers=self.params['decoder_layers'],
-      dp_input_keep_prob=dp_input_keep_prob,
-      dp_output_keep_prob=dp_output_keep_prob,
-      residual_connections=residual_connections,
-      wrap_to_multi_rnn=wrap_to_multi_rnn,
-    )
+    #if self.params['attention_type'].startswith('gnmt'):
+    #  residual_connections = False
+    #  wrap_to_multi_rnn = False
+    #else:
+    #  residual_connections = self.params['decoder_use_skip_connections']
+    #  wrap_to_multi_rnn = True
+
+    #self._decoder_cells = create_rnn_cell(
+    #  cell_type=self.params['decoder_cell_type'],
+    #  cell_params=cell_params,
+    #  num_layers=self.params['decoder_layers'],
+    #  dp_input_keep_prob=dp_input_keep_prob,
+    #  dp_output_keep_prob=dp_output_keep_prob,
+    #  residual_connections=residual_connections,
+    #  wrap_to_multi_rnn=wrap_to_multi_rnn,
+    #)
+    residual_connections = self.params['decoder_use_skip_connections']
+    # list of cells
+    self._decoder_cells = [
+      single_cell(cell_class=self.params['core_cell'],
+                  cell_params=self.params.get('core_cell_params', {}),
+                  dp_input_keep_prob=dp_input_keep_prob,
+                  dp_output_keep_prob=dp_output_keep_prob,
+                  # residual connections are added a little differently for GNMT
+                  residual_connections=False if self.params[
+                    'attention_type'].startswith(
+                    'gnmt') else residual_connections,
+                  ) for _ in range(self.params['decoder_layers'])]
 
     tiled_enc_outputs = tf.contrib.seq2seq.tile_batch(
       encoder_outputs,
@@ -575,18 +582,18 @@ 

Source code for decoders.rnn_decoders

         output_attention=False,
         name="gnmt_attention")
       attentive_decoder_cell = GNMTAttentionMultiCell(
-        attention_cell, self._add_residual_wrapper(self._decoder_cells),
+        attention_cell, self._add_residual_wrapper(self._decoder_cells) if residual_connections else self._decoder_cells,
         use_new_attention=(self.params['attention_type'] == 'gnmt_v2'))
-    else:
+    else: # non-GNMT
       attentive_decoder_cell = AttentionWrapper(
-        cell=self._decoder_cells,
+        cell=tf.contrib.rnn.MultiRNNCell(self._decoder_cells),
         attention_mechanism=attention_mechanism,
       )
     batch_size_tensor = tf.constant(self._batch_size)
     embedding_fn = lambda ids: tf.cast(
       tf.nn.embedding_lookup(self._dec_emb_w, ids),
       dtype=self.params['dtype'])
-    #decoder = tf.contrib.seq2seq.BeamSearchDecoder(
+    # decoder = tf.contrib.seq2seq.BeamSearchDecoder(
     decoder = BeamSearchDecoder(
       cell=attentive_decoder_cell,
       embedding=embedding_fn,
@@ -611,8 +618,9 @@ 

Source code for decoders.rnn_decoders

       output_time_major=time_major,
     )
 
-    return {'logits': final_outputs.predicted_ids[:, :, 0],
-            'samples': [final_outputs.predicted_ids[:, :, 0]],
+    return {'logits': final_outputs.predicted_ids[:, :, 0] if not time_major else
+            tf.transpose(final_outputs.predicted_ids[:, :, 0], perm=[1, 0]),
+            'outputs': [final_outputs.predicted_ids[:, :, 0]],
             'final_state': final_state,
             'final_sequence_lengths': final_sequence_lengths}
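The two decoders above now take an RNN cell class plus its constructor arguments instead of the old decoder_cell_type / decoder_cell_units strings. A hypothetical config fragment using the new interface (cell class and sizes are only examples):

    import tensorflow as tf

    decoder_params = {
        "core_cell": tf.nn.rnn_cell.LSTMCell,     # a cell class, no longer a string
        "core_cell_params": {"num_units": 512},   # forwarded to the cell constructor
        "decoder_layers": 2,
        "attention_type": "gnmt_v2",
        "decoder_use_skip_connections": True,
    }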
diff --git a/docs/html/_modules/encoders/cnn_encoder.html b/docs/html/_modules/encoders/cnn_encoder.html new file mode 100644 index 000000000..fa53218d1 --- /dev/null +++ b/docs/html/_modules/encoders/cnn_encoder.html @@ -0,0 +1,405 @@ + + + + + + + + + + + encoders.cnn_encoder — OpenSeq2Seq 0.2 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Source code for encoders.cnn_encoder

+# Copyright (c) 2018 NVIDIA Corporation
+"""
+This module contains classes and functions to build "general" convolutional
+neural networks from the description of arbitrary "layers".
+"""
+from __future__ import absolute_import, division, print_function
+from __future__ import unicode_literals
+from six.moves import range
+
+import tensorflow as tf
+import copy
+
+try:
+    from inspect import signature
+except ImportError:
+    from funcsigs import signature
+
+from .encoder import Encoder
+from open_seq2seq.utils.utils import deco_print
+
+
+
[docs]def build_layer(inputs, layer, layer_params, data_format, + regularizer, training, verbose=True): + """This function builds a layer from the layer function and it's parameters. + + It will automatically add regularizer parameter to the layer_params if the + layer supports regularization. To check this, it will look for the + "regularizer", "kernel_regularizer" and "gamma_regularizer" names in this + order in the ``layer`` call signature. If one of this parameters is supported + it will pass regularizer object as a value for that parameter. Based on the + same "checking signature" technique "data_format" and "training" parameters + will try to be added. + + Args: + inputs: input Tensor that will be passed to the layer. Note that layer has + to accept input as the first parameter. + layer: layer function or class with ``__call__`` method defined. + layer_params (dict): parameters passed to the ``layer``. + data_format (string): data format ("channels_first" or "channels_last") + that will be tried to be passed as an additional argument. + regularizer: regularizer instance that will be tried to be passed as an + additional argument. + training (bool): whether layer is built in training mode. Will be tried to + be passed as an additional argument. + verbose (bool): whether to print information about built layers. + + Returns: + Tensor with layer output. + """ + layer_params_cp = copy.deepcopy(layer_params) + for reg_name in ['regularizer', 'kernel_regularizer', 'gamma_regularizer']: + if reg_name not in layer_params_cp and \ + reg_name in signature(layer).parameters: + layer_params_cp.update({reg_name: regularizer}) + + if 'data_format' not in layer_params_cp and \ + 'data_format' in signature(layer).parameters: + layer_params_cp.update({'data_format': data_format}) + + if 'training' not in layer_params_cp and \ + 'training' in signature(layer).parameters: + layer_params_cp.update({'training': training}) + + outputs = layer(inputs, **layer_params_cp) + + if verbose: + if hasattr(layer, '_tf_api_names'): + layer_name = layer._tf_api_names[0] + else: + layer_name = layer + deco_print("Building layer: {}(inputs, {})".format( + layer_name, + ", ".join("{}={}".format(key, value) + for key, value in layer_params_cp.items()) + )) + return outputs
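build_layer decides which optional arguments to forward by inspecting the layer's call signature. The same check can be reproduced in isolation; the TF layer functions below are just examples of layers that do or do not accept a regularizer argument:

    import tensorflow as tf
    from inspect import signature  # funcsigs provides the backport on Python 2

    def accepts_arg(layer_fn, name):
      """True if layer_fn has a parameter called name."""
      return name in signature(layer_fn).parameters

    assert accepts_arg(tf.layers.conv2d, "kernel_regularizer")
    assert accepts_arg(tf.layers.batch_normalization, "gamma_regularizer")
    assert not accepts_arg(tf.layers.max_pooling2d, "kernel_regularizer")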
+ + +
[docs]class CNNEncoder(Encoder): + """General CNN encoder that can be used to construct various different models. + """ +
[docs] @staticmethod + def get_required_params(): + return dict(Encoder.get_required_params(), **{ + 'cnn_layers': list, + })
+ +
[docs] @staticmethod + def get_optional_params(): + return dict(Encoder.get_optional_params(), **{ + 'data_format': ['channels_first', 'channels_last'], + 'fc_layers': list, + })
+ +
[docs] def __init__(self, params, model, name="cnn_encoder", mode='train'): + """CNN Encoder constructor. + + See parent class for arguments description. + + Config parameters: + + * **cnn_layers** (list) --- list with the description of "convolutional" + layers. For example:: + "conv_layers": [ + (tf.layers.conv2d, { + 'filters': 64, 'kernel_size': (11, 11), + 'strides': (4, 4), 'padding': 'VALID', + 'activation': tf.nn.relu, + }), + (tf.layers.max_pooling2d, { + 'pool_size': (3, 3), 'strides': (2, 2), + }), + (tf.layers.conv2d, { + 'filters': 192, 'kernel_size': (5, 5), + 'strides': (1, 1), 'padding': 'SAME', + }), + (tf.layers.batch_normalization, {'momentum': 0.9, 'epsilon': 0.0001}), + (tf.nn.relu, {}), + ] + Note that you don't need to provide "regularizer", "training" and + "data_format" parameters since they will be automatically added. + + * **cnn_layers** (list) --- list with the description of "fully-connected" + layers. The only different from convolutional layers is that the input + will be automatically reshaped to 2D (batch size x num features). + For example:: + 'fc_layers': [ + (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), + (tf.layers.dropout, {'rate': 0.5}), + (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), + (tf.layers.dropout, {'rate': 0.5}), + ], + Note that you don't need to provide "regularizer", "training" and + "data_format" parameters since they will be automatically added. + + * **data_format** (string) --- could be either "channels_first" or + "channels_last". Defaults to "channels_first". + """ + super(CNNEncoder, self).__init__(params, model, name, mode)
+ + def _encode(self, input_dict): + regularizer = self.params.get('regularizer', None) + data_format = self.params.get('data_format', 'channels_first') + + x = input_dict['source_tensors'][0] + if data_format == 'channels_first': + x = tf.transpose(x, [0, 3, 1, 2]) + + for layer, layer_params in self.params['cnn_layers']: + x = build_layer(x, layer, layer_params, data_format, + regularizer, self.mode == 'train') + + if data_format == 'channels_first': + x = tf.transpose(x, [0, 2, 3, 1]) + + fc_layers = self.params.get('fc_layers', []) + + # if fully connected layers exist, flattening the output and applying them + if fc_layers: + input_shape = x.get_shape().as_list() + num_inputs = input_shape[1] * input_shape[2] * input_shape[3] + x = tf.reshape(x, [-1, num_inputs]) + for layer, layer_params in fc_layers: + x = build_layer(x, layer, layer_params, data_format, regularizer, + self.mode == 'train') + else: + # if there are no fully connected layers, doing average pooling + x = tf.reduce_mean(x, [1, 2]) + + return {'outputs': x}
+
+ + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/html/_modules/encoders/convs2s_encoder.html b/docs/html/_modules/encoders/convs2s_encoder.html new file mode 100644 index 000000000..d870f4b10 --- /dev/null +++ b/docs/html/_modules/encoders/convs2s_encoder.html @@ -0,0 +1,456 @@ + + + + + + + + + + + encoders.convs2s_encoder — OpenSeq2Seq 0.2 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Source code for encoders.convs2s_encoder

+# Copyright (c) 2018 NVIDIA Corporation
+"""
+Conv-based encoder
+"""
+from __future__ import absolute_import, division, print_function
+from __future__ import unicode_literals
+
+import tensorflow as tf
+import math
+from .encoder import Encoder
+
+from open_seq2seq.parts.transformer import embedding_layer
+from open_seq2seq.parts.transformer.utils import get_padding_bias, get_padding
+from open_seq2seq.parts.convs2s import ffn_wn_layer, conv_wn_layer
+
+# Default value used if max_input_length is not given
+MAX_INPUT_LENGTH = 128
+
+
+
[docs]class ConvS2SEncoder(Encoder): + """ + Fully convolutional Encoder of ConvS2S + """ + +
[docs] @staticmethod + def get_required_params(): + return dict( + Encoder.get_required_params(), **{ + "encoder_layers": int, + "src_emb_size": int, + "src_vocab_size": int, + "pad_embeddings_2_eight": bool, + "conv_nchannels_kwidth": list, + "embedding_dropout_keep_prob": float, + "hidden_dropout_keep_prob": float, + })
+ +
[docs] @staticmethod + def get_optional_params(): + return dict( + Encoder.get_optional_params(), **{ + "att_layer_num": int, + 'max_input_length': int, + 'PAD_SYMBOL': int, + })
+ + def __init__(self, + params, + model, + name="convs2s_encoder_with_emb", + mode='train'): + super(ConvS2SEncoder, self).__init__(params, model, name=name, mode=mode) + + self._src_vocab_size = self.params['src_vocab_size'] + self._src_emb_size = self.params['src_emb_size'] + self.layers = [] + self._mode = mode + self._pad_sym = self.params.get('PAD_SYMBOL', 0) + self._pad2eight = params.get('pad_embeddings_2_eight', False) + + def _encode(self, input_dict): + inputs = input_dict['source_tensors'][0] + source_length = input_dict['source_tensors'][1] + + with tf.variable_scope("encode"): + # prepare encoder graph + if len(self.layers) == 0: + knum_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[0] + kwidth_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[1] + + with tf.variable_scope("embedding"): + self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights( + vocab_size=self._src_vocab_size, + hidden_size=self._src_emb_size, + pad_vocab_to_eight=self._pad2eight, + init_var=0.1, + embed_scale=False, + pad_sym=self._pad_sym, + mask_paddings=False) + + with tf.variable_scope("pos_embedding"): + self.position_embedding_layer = embedding_layer.EmbeddingSharedWeights( + vocab_size=self.params.get("max_input_length", MAX_INPUT_LENGTH), + hidden_size=self._src_emb_size, + pad_vocab_to_eight=self._pad2eight, + init_var=0.1, + embed_scale=False, + pad_sym=self._pad_sym, + mask_paddings=False) + + # linear projection before cnn layers + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + self._src_emb_size, + knum_list[0], + dropout=self.params["embedding_dropout_keep_prob"], + var_scope_name="linear_mapping_before_cnn_layers")) + + for i in range(self.params['encoder_layers']): + in_dim = knum_list[i] if i == 0 else knum_list[i - 1] + out_dim = knum_list[i] + + # linear projection is needed for residual connections if + # input and output of a cnn layer do not match + if in_dim != out_dim: + linear_proj = ffn_wn_layer.FeedFowardNetworkNormalized( + in_dim, + out_dim, + var_scope_name="linear_mapping_cnn_" + str(i + 1), + dropout=1.0) + else: + linear_proj = None + + conv_layer = conv_wn_layer.Conv1DNetworkNormalized( + in_dim, + out_dim, + kernel_width=kwidth_list[i], + mode=self.mode, + layer_id=i + 1, + hidden_dropout=self.params["hidden_dropout_keep_prob"], + conv_padding="SAME", + decode_padding=False) + + self.layers.append([linear_proj, conv_layer]) + + # linear projection after cnn layers + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + knum_list[self.params['encoder_layers'] - 1], + self._src_emb_size, + dropout=1.0, + var_scope_name="linear_mapping_after_cnn_layers")) + + encoder_inputs = self.embedding_softmax_layer(inputs) + inputs_attention_bias = get_padding_bias( + inputs, res_rank=3, pad_sym=self._pad_sym) + + with tf.name_scope("add_pos_encoding"): + pos_input = tf.range( + 0, + tf.shape(encoder_inputs)[1], + delta=1, + dtype=tf.int32, + name='range') + pos_encoding = self.position_embedding_layer(pos_input) + encoder_inputs = encoder_inputs + tf.cast( + x=pos_encoding, dtype=encoder_inputs.dtype) + + if self.mode == "train": + encoder_inputs = tf.nn.dropout( + encoder_inputs, self.params["embedding_dropout_keep_prob"]) + + # mask the paddings in the input given to cnn layers + inputs_padding = get_padding( + inputs, self._pad_sym, dtype=encoder_inputs.dtype) + padding_mask = tf.expand_dims(1 - inputs_padding, 2) + encoder_inputs *= padding_mask + + # disables padding masks in middle layers + # padding_mask = None 
+ outputs, outputs_b, final_state = self._call(encoder_inputs, padding_mask) + + return { + 'outputs': outputs, + 'outputs_b': outputs_b, + 'inputs_attention_bias_cs2s': inputs_attention_bias, + 'state': final_state, + 'src_lengths': source_length, # should it include paddings or not? + 'embedding_softmax_layer': self.embedding_softmax_layer, + # TODO: Should we share position embedding? + # 'position_embedding_layer': self.position_embedding_layer, + 'encoder_input': inputs + } + + def _call(self, encoder_inputs, padding_mask): + # Run inputs through the sublayers. + with tf.variable_scope("linear_layer_before_cnn_layers"): + outputs = self.layers[0](encoder_inputs) + + for i in range(1, len(self.layers) - 1): + linear_proj, conv_layer = self.layers[i] + + with tf.variable_scope("layer_%d" % i): + if padding_mask is not None: + outputs *= padding_mask + if linear_proj is not None: + res_inputs = linear_proj(outputs) + else: + res_inputs = outputs + outputs = conv_layer(outputs) + outputs = (outputs + res_inputs) * math.sqrt(0.5) + + with tf.variable_scope("linear_layer_after_cnn_layers"): + outputs = self.layers[-1](outputs) + + if padding_mask is not None: + outputs *= padding_mask + + # Gradients are scaled as the gradients from + # all decoder attention layers enters the encoder + scale = 1.0 / ( + 2.0 * self.params.get("att_layer_num", self.params["encoder_layers"])) + outputs = (1.0 - scale) * tf.stop_gradient(outputs) + scale * outputs + + outputs_b = (outputs + encoder_inputs) * math.sqrt(0.5) + + if padding_mask is not None: + outputs_b *= padding_mask + + # Average of the encoder outputs is calculated as the final state of the encoder + # it can be used for decoders which just accept the final state + final_state = tf.reduce_mean(outputs_b, 1) + return outputs, outputs_b, final_state + + @property + def src_vocab_size(self): + return self._src_vocab_size + + @property + def src_emb_size(self): + return self._src_emb_size
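A note on the two scalings in _call above: residual sums are multiplied by sqrt(0.5) to keep the variance of the sum close to that of each addend, and the encoder output is passed through a stop_gradient identity so that only a fraction of the gradient flows back from the decoder attention layers. A minimal stand-alone sketch of both tricks, with made-up shapes and an assumed att_layer_num of 8 (none of these values come from this patch):

import math
import tensorflow as tf

att_layer_num = 8                                 # hypothetical value
outputs = tf.random_normal([4, 20, 256])          # stand-in for the conv stack output
encoder_inputs = tf.random_normal([4, 20, 256])   # stand-in for the embedded inputs

# Residual sum scaled by sqrt(0.5) so its variance stays roughly constant.
outputs_b = (outputs + encoder_inputs) * math.sqrt(0.5)

# Forward value is unchanged since (1 - s) * x + s * x == x, but the gradient
# flowing back into `outputs` is multiplied by s, compensating for the gradients
# arriving from every decoder attention layer.
s = 1.0 / (2.0 * att_layer_num)
outputs = (1.0 - s) * tf.stop_gradient(outputs) + s * outputs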
\ No newline at end of file
diff --git a/docs/html/_modules/encoders/ds2_encoder.html b/docs/html/_modules/encoders/ds2_encoder.html
index b6bbbdd1c..e0d79ff53 100644
--- a/docs/html/_modules/encoders/ds2_encoder.html
+++ b/docs/html/_modules/encoders/ds2_encoder.html
@@ -162,41 +162,14 @@

Source code for encoders.ds2_encoder

 from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
 
 from .encoder import Encoder
-
-
-
[docs]def conv2d_bn_actv(name, inputs, filters, kernel_size, activation_fn, strides, - padding, regularizer, training, data_format, bn_momentum, - bn_epsilon): - """Helper function that applies convolution, batch norm and activation.""" - conv = tf.layers.conv2d( - name="{}".format(name), - inputs=inputs, - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - kernel_regularizer=regularizer, - use_bias=False, - data_format=data_format, - ) - bn = tf.layers.batch_normalization( - name="{}/bn".format(name), - inputs=conv, - gamma_regularizer=regularizer, - training=training, - axis=-1 if data_format == 'channels_last' else 1, - momentum=bn_momentum, - epsilon=bn_epsilon, - ) - output = activation_fn(bn) - return output
+from open_seq2seq.parts.cnns.conv_blocks import conv_bn_actv
[docs]def rnn_cell(rnn_cell_dim, layer_type, dropout_keep_prob=1.0): """Helper function that creates RNN cell.""" if layer_type == "layernorm_lstm": cell = tf.contrib.rnn.LayerNormBasicLSTMCell( - num_units=rnn_cell_dim, dropout_keep_prob=dropout_keep_prob) + num_units=rnn_cell_dim, dropout_keep_prob=dropout_keep_prob) else: if layer_type == "lstm": cell = tf.nn.rnn_cell.BasicLSTMCell(rnn_cell_dim) @@ -210,7 +183,7 @@

Source code for encoders.ds2_encoder

       raise ValueError("Error: not supported rnn type:{}".format(layer_type))
 
     cell = tf.nn.rnn_cell.DropoutWrapper(
-      cell, output_keep_prob=dropout_keep_prob)
+        cell, output_keep_prob=dropout_keep_prob)
   return cell
@@ -230,28 +203,28 @@

Source code for encoders.ds2_encoder

     x = tf.cast(x, tf.float32)
     cast_back = True
   filters = tf.get_variable(
-    name+'/w',
-    shape=[width, 1, channels, 1],
-    regularizer=regularizer,
-    dtype=tf.float32,
+      name + '/w',
+      shape=[width, 1, channels, 1],
+      regularizer=regularizer,
+      dtype=tf.float32,
   )
   strides = [1, 1, 1, 1]
   y = tf.nn.depthwise_conv2d(
-    name=name + '/conv',
-    input=x,
-    filter=filters,
-    strides=strides,
-    padding='SAME',
-    data_format='NHWC' if data_format == 'channels_last' else 'NCHW',
+      name=name + '/conv',
+      input=x,
+      filter=filters,
+      strides=strides,
+      padding='SAME',
+      data_format='NHWC' if data_format == 'channels_last' else 'NCHW',
   )
   bn = tf.layers.batch_normalization(
-    name="{}/bn".format(name),
-    inputs=y,
-    gamma_regularizer=regularizer,
-    training=training,
-    axis=-1 if data_format == 'channels_last' else 1,
-    momentum=bn_momentum,
-    epsilon=bn_epsilon,
+      name="{}/bn".format(name),
+      inputs=y,
+      gamma_regularizer=regularizer,
+      training=training,
+      axis=-1 if data_format == 'channels_last' else 1,
+      momentum=bn_momentum,
+      epsilon=bn_epsilon,
   )
   output = activation_fn(bn)
   if data_format == 'channels_first':
@@ -267,25 +240,25 @@ 

Source code for encoders.ds2_encoder

 
[docs] @staticmethod def get_required_params(): return dict(Encoder.get_required_params(), **{ - 'dropout_keep_prob': float, - 'conv_layers': list, - 'activation_fn': None, # any valid callable - 'num_rnn_layers': int, - 'row_conv': bool, - 'n_hidden': int, - 'use_cudnn_rnn': bool, - 'rnn_cell_dim': int, - 'rnn_type': ['layernorm_lstm', 'lstm', 'gru', 'cudnn_gru', 'cudnn_lstm'], - 'rnn_unidirectional': bool, + 'dropout_keep_prob': float, + 'conv_layers': list, + 'activation_fn': None, # any valid callable + 'num_rnn_layers': int, + 'row_conv': bool, + 'n_hidden': int, + 'use_cudnn_rnn': bool, + 'rnn_cell_dim': int, + 'rnn_type': ['layernorm_lstm', 'lstm', 'gru', 'cudnn_gru', 'cudnn_lstm'], + 'rnn_unidirectional': bool, })
[docs] @staticmethod def get_optional_params(): return dict(Encoder.get_optional_params(), **{ - 'row_conv_width': int, - 'data_format': ['channels_first', 'channels_last'], - 'bn_momentum': float, - 'bn_epsilon': float, + 'row_conv_width': int, + 'data_format': ['channels_first', 'channels_last'], + 'bn_momentum': float, + 'bn_epsilon': float, })
[docs] def __init__(self, params, model, name="ds2_encoder", mode='train'): @@ -369,8 +342,8 @@

Source code for encoders.ds2_encoder

       top_layer = input_layer
     else:
       top_layer = tf.transpose(input_layer, [0, 3, 1, 2])
-    
-    # ----- Convolutional layers -----------------------------------------------
+
+    # ----- Convolutional layers ---------------------------------------------
     conv_layers = self.params['conv_layers']
 
     for idx_conv in range(len(conv_layers)):
@@ -384,19 +357,20 @@ 

Source code for encoders.ds2_encoder

       else:
         src_length = (src_length + strides[0] - 1) // strides[0]
 
-      top_layer = conv2d_bn_actv(
-        name="conv{}".format(idx_conv + 1),
-        inputs=top_layer,
-        filters=ch_out,
-        kernel_size=kernel_size,
-        activation_fn=self.params['activation_fn'],
-        strides=strides,
-        padding=padding,
-        regularizer=regularizer,
-        training=training,
-        data_format=data_format,
-        bn_momentum=bn_momentum,
-        bn_epsilon=bn_epsilon,
+      top_layer = conv_bn_actv(
+          type="conv2d",
+          name="conv{}".format(idx_conv + 1),
+          inputs=top_layer,
+          filters=ch_out,
+          kernel_size=kernel_size,
+          activation_fn=self.params['activation_fn'],
+          strides=strides,
+          padding=padding,
+          regularizer=regularizer,
+          training=training,
+          data_format=data_format,
+          bn_momentum=bn_momentum,
+          bn_epsilon=bn_epsilon,
       )
     if data_format == 'channels_first':
       top_layer = tf.transpose(top_layer, [0, 2, 3, 1])
@@ -422,55 +396,56 @@ 

Source code for encoders.ds2_encoder

 
         if rnn_type == "cudnn_gru" or rnn_type == "gru":
           rnn_block = tf.contrib.cudnn_rnn.CudnnGRU(
-            num_layers=num_rnn_layers,
-            num_units=rnn_cell_dim,
-            direction=direction,
-            dropout=1.0 - dropout_keep_prob,
-            dtype=rnn_input.dtype,
-            name="cudnn_gru",
+              num_layers=num_rnn_layers,
+              num_units=rnn_cell_dim,
+              direction=direction,
+              dropout=1.0 - dropout_keep_prob,
+              dtype=rnn_input.dtype,
+              name="cudnn_gru",
           )
         elif rnn_type == "cudnn_lstm" or rnn_type == "lstm":
           rnn_block = tf.contrib.cudnn_rnn.CudnnLSTM(
-            num_layers=num_rnn_layers,
-            num_units=rnn_cell_dim,
-            direction=direction,
-            dropout=1.0 - dropout_keep_prob,
-            dtype=rnn_input.dtype,
-            name="cudnn_lstm",
+              num_layers=num_rnn_layers,
+              num_units=rnn_cell_dim,
+              direction=direction,
+              dropout=1.0 - dropout_keep_prob,
+              dtype=rnn_input.dtype,
+              name="cudnn_lstm",
           )
         else:
           raise ValueError(
-            "{} is not a valid rnn_type for cudnn_rnn layers".format(rnn_type)
+              "{} is not a valid rnn_type for cudnn_rnn layers".format(
+                  rnn_type)
           )
         top_layer, state = rnn_block(rnn_input)
         top_layer = tf.transpose(top_layer, [1, 0, 2])
       else:
         rnn_input = top_layer
         multirnn_cell_fw = tf.nn.rnn_cell.MultiRNNCell(
-          [rnn_cell(rnn_cell_dim=rnn_cell_dim, layer_type=rnn_type,
-                    dropout_keep_prob=dropout_keep_prob)
-           for _ in range(num_rnn_layers)]
+            [rnn_cell(rnn_cell_dim=rnn_cell_dim, layer_type=rnn_type,
+                      dropout_keep_prob=dropout_keep_prob)
+             for _ in range(num_rnn_layers)]
         )
         if self.params['rnn_unidirectional']:
           top_layer, state = tf.nn.dynamic_rnn(
-            cell=multirnn_cell_fw,
-            inputs=rnn_input,
-            sequence_length=src_length,
-            dtype=rnn_input.dtype,
-            time_major=False,
+              cell=multirnn_cell_fw,
+              inputs=rnn_input,
+              sequence_length=src_length,
+              dtype=rnn_input.dtype,
+              time_major=False,
           )
         else:
           multirnn_cell_bw = tf.nn.rnn_cell.MultiRNNCell(
-            [rnn_cell(rnn_cell_dim=rnn_cell_dim, layer_type=rnn_type,
-                      dropout_keep_prob=dropout_keep_prob)
-             for _ in range(num_rnn_layers)]
+              [rnn_cell(rnn_cell_dim=rnn_cell_dim, layer_type=rnn_type,
+                        dropout_keep_prob=dropout_keep_prob)
+               for _ in range(num_rnn_layers)]
           )
           top_layer, state = tf.nn.bidirectional_dynamic_rnn(
-            cell_fw=multirnn_cell_fw, cell_bw=multirnn_cell_bw,
-            inputs=rnn_input,
-            sequence_length=src_length,
-            dtype=rnn_input.dtype,
-            time_major=False
+              cell_fw=multirnn_cell_fw, cell_bw=multirnn_cell_bw,
+              inputs=rnn_input,
+              sequence_length=src_length,
+              dtype=rnn_input.dtype,
+              time_major=False
           )
           # concat 2 tensors [B, T, n_cell_dim] --> [B, T, 2*n_cell_dim]
           top_layer = tf.concat(top_layer, 2)
@@ -479,43 +454,43 @@ 

Source code for encoders.ds2_encoder

     if self.params['row_conv']:
       channels = top_layer.get_shape().as_list()[-1]
       top_layer = row_conv(
-        name="row_conv",
-        input_layer=top_layer,
-        batch=batch_size,
-        channels=channels,
-        activation_fn=self.params['activation_fn'],
-        width=self.params['row_conv_width'],
-        regularizer=regularizer,
-        training=training,
-        data_format=data_format,
-        bn_momentum=bn_momentum,
-        bn_epsilon=bn_epsilon,
+          name="row_conv",
+          input_layer=top_layer,
+          batch=batch_size,
+          channels=channels,
+          activation_fn=self.params['activation_fn'],
+          width=self.params['row_conv_width'],
+          regularizer=regularizer,
+          training=training,
+          data_format=data_format,
+          bn_momentum=bn_momentum,
+          bn_epsilon=bn_epsilon,
       )
 
     # Reshape [B, T, C] --> [B*T, C]
     c = top_layer.get_shape().as_list()[-1]
     top_layer = tf.reshape(top_layer, [-1, c])
 
-    # --- hidden layer with clipped ReLU activation and dropout-----------------
+    # --- hidden layer with clipped ReLU activation and dropout---------------
     top_layer = tf.layers.dense(
-      inputs=top_layer,
-      units=self.params['n_hidden'],
-      kernel_regularizer=regularizer,
-      activation=self.params['activation_fn'],
-      name='fully_connected',
+        inputs=top_layer,
+        units=self.params['n_hidden'],
+        kernel_regularizer=regularizer,
+        activation=self.params['activation_fn'],
+        name='fully_connected',
     )
     outputs = tf.nn.dropout(x=top_layer, keep_prob=dropout_keep_prob)
 
     # reshape from  [B*T,A] --> [B, T, A].
     # Output shape: [batch_size, n_steps, n_hidden]
     outputs = tf.reshape(
-      outputs,
-      [batch_size, -1, self.params['n_hidden']],
+        outputs,
+        [batch_size, -1, self.params['n_hidden']],
     )
 
     return {
-      'outputs': outputs,
-      'src_length': src_length,
+        'outputs': outputs,
+        'src_length': src_length,
     }
diff --git a/docs/html/_modules/encoders/encoder.html b/docs/html/_modules/encoders/encoder.html
index 85a680c45..4172b435e 100644
--- a/docs/html/_modules/encoders/encoder.html
+++ b/docs/html/_modules/encoders/encoder.html
@@ -242,24 +242,9 @@

Source code for encoders.encoder

       else:
         self._params['dtype'] = tf.float32
 
-    if 'regularizer' not in self._params:
-      if self._model and 'regularizer' in self._model.params:
-        self._params['regularizer'] = self._model.params['regularizer']
-        self._params['regularizer_params'] = self._model.params['regularizer_params']
-
-    if 'regularizer' in self._params:
-      init_dict = self._params.get('regularizer_params', {})
-      self._params['regularizer'] = self._params['regularizer'](**init_dict)
-      if self._params['dtype'] == 'mixed':
-        self._params['regularizer'] = mp_regularizer_wrapper(
-          self._params['regularizer'],
-        )
-
-    if self._params['dtype'] == 'mixed':
-      self._params['dtype'] = tf.float16
-
     self._name = name
-    self._mode = mode
+ self._mode = mode + self._compiled = False
[docs] def encode(self, input_dict): """Wrapper around :meth:`self._encode() <_encode>` method. @@ -272,11 +257,35 @@

Source code for encoders.encoder

     Returns:
       see :meth:`self._encode() <_encode>` docs.
     """
+    if not self._compiled:
+      if 'regularizer' not in self._params:
+        if self._model and 'regularizer' in self._model.params:
+          self._params['regularizer'] = copy.deepcopy(
+            self._model.params['regularizer']
+          )
+          self._params['regularizer_params'] = copy.deepcopy(
+            self._model.params['regularizer_params']
+          )
+
+      if 'regularizer' in self._params:
+        init_dict = self._params.get('regularizer_params', {})
+        self._params['regularizer'] = self._params['regularizer'](**init_dict)
+        if self._params['dtype'] == 'mixed':
+          self._params['regularizer'] = mp_regularizer_wrapper(
+            self._params['regularizer'],
+          )
+
+      if self._params['dtype'] == 'mixed':
+        self._params['dtype'] = tf.float16
+
     if 'initializer' in self.params:
       init_dict = self.params.get('initializer_params', {})
       initializer = self.params['initializer'](**init_dict)
     else:
       initializer = None
+
+    self._compiled = True
+
     with tf.variable_scope(self._name, initializer=initializer,
                            dtype=self.params['dtype']):
       return self._encode(self._cast_types(input_dict))
diff --git a/docs/html/_modules/encoders/rnn_encoders.html b/docs/html/_modules/encoders/rnn_encoders.html
index 4e3bdea06..93a9f354b 100644
--- a/docs/html/_modules/encoders/rnn_encoders.html
+++ b/docs/html/_modules/encoders/rnn_encoders.html
@@ -160,12 +160,11 @@

Source code for encoders.rnn_encoders

 from __future__ import absolute_import, division, print_function
 from __future__ import unicode_literals
 
-import copy
 import tensorflow as tf
 
-from open_seq2seq.parts.rnns.utils import create_rnn_cell
+from open_seq2seq.parts.rnns.utils import single_cell
 from .encoder import Encoder
-
+from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
 
 
[docs]class UnidirectionalRNNEncoderWithEmbedding(Encoder): """ @@ -177,8 +176,8 @@

Source code for encoders.rnn_encoders

     return dict(Encoder.get_required_params(), **{
       'src_vocab_size': int,
       'src_emb_size': int,
-      'encoder_cell_units': int,
-      'encoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'],
+      'core_cell': None,
+      'core_cell_params': dict,
       'encoder_layers': int,
       'encoder_use_skip_connections': bool,
     })
@@ -242,10 +241,6 @@

Source code for encoders.rnn_encoders

     source_sequence = input_dict['source_tensors'][0]
     source_length = input_dict['source_tensors'][1]
 
-
-    cell_params = copy.deepcopy(self.params)
-    cell_params["num_units"] = self.params['encoder_cell_units']
-
     self._enc_emb_w = tf.get_variable(
       name="EncoderEmbeddingMatrix",
       shape=[self._src_vocab_size, self._src_emb_size],
@@ -259,14 +254,16 @@ 

Source code for encoders.rnn_encoders

       dp_input_keep_prob = 1.0
       dp_output_keep_prob = 1.0
 
-    self._encoder_cell_fw = create_rnn_cell(
-      cell_type=self.params['encoder_cell_type'],
-      cell_params=cell_params,
-      num_layers=self.params['encoder_layers'],
-      dp_input_keep_prob=dp_input_keep_prob,
-      dp_output_keep_prob=dp_output_keep_prob,
-      residual_connections=self.params['encoder_use_skip_connections'],
-    )
+    fwd_cells = [
+      single_cell(cell_class=self.params['core_cell'],
+                  cell_params=self.params.get('core_cell_params', {}),
+                  dp_input_keep_prob=dp_input_keep_prob,
+                  dp_output_keep_prob=dp_output_keep_prob,
+                  residual_connections=self.params[
+                    'encoder_use_skip_connections']
+                  ) for _ in range(self.params['encoder_layers'])]
+
+    self._encoder_cell_fw = tf.contrib.rnn.MultiRNNCell(fwd_cells)
 
     time_major = self.params.get("time_major", False)
     use_swap_memory = self.params.get("use_swap_memory", False)
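For reference, a hypothetical config fragment showing how the new core_cell/core_cell_params keys consumed above might be filled in; the cell class and sizes are illustrative only, not values taken from this patch:

import tensorflow as tf

encoder_params = {
    "core_cell": tf.nn.rnn_cell.LSTMCell,    # an RNNCell class (assumed example)
    "core_cell_params": {"num_units": 512},  # forwarded to the cell constructor
    "encoder_layers": 2,
    "encoder_use_skip_connections": False,
    "src_vocab_size": 32000,
    "src_emb_size": 512,
}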
@@ -312,10 +309,10 @@ 

Source code for encoders.rnn_encoders

     return dict(Encoder.get_required_params(), **{
       'src_vocab_size': int,
       'src_emb_size': int,
-      'encoder_cell_units': int,
-      'encoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'],
       'encoder_layers': int,
       'encoder_use_skip_connections': bool,
+      'core_cell': None,
+      'core_cell_params': dict,
     })
[docs] @staticmethod @@ -382,9 +379,6 @@

Source code for encoders.rnn_encoders

       dtype=tf.float32
     )
 
-    cell_params = copy.deepcopy(self.params)
-    cell_params["num_units"] = self.params['encoder_cell_units']
-
     if self._mode == "train":
       dp_input_keep_prob = self.params['encoder_dp_input_keep_prob']
       dp_output_keep_prob = self.params['encoder_dp_output_keep_prob']
@@ -392,25 +386,27 @@ 

Source code for encoders.rnn_encoders

       dp_input_keep_prob = 1.0
       dp_output_keep_prob = 1.0
 
+    fwd_cells = [
+      single_cell(cell_class=self.params['core_cell'],
+                  cell_params=self.params.get('core_cell_params', {}),
+                  dp_input_keep_prob=dp_input_keep_prob,
+                  dp_output_keep_prob=dp_output_keep_prob,
+                  residual_connections=self.params['encoder_use_skip_connections']
+                  ) for _ in range(self.params['encoder_layers'])]
+    bwd_cells = [
+      single_cell(cell_class=self.params['core_cell'],
+                  cell_params=self.params.get('core_cell_params', {}),
+                  dp_input_keep_prob=dp_input_keep_prob,
+                  dp_output_keep_prob=dp_output_keep_prob,
+                  residual_connections=self.params['encoder_use_skip_connections']
+                  ) for _ in range(self.params['encoder_layers'])]
+
+
     with tf.variable_scope("FW"):
-      self._encoder_cell_fw = create_rnn_cell(
-        cell_type=self.params['encoder_cell_type'],
-        cell_params=cell_params,
-        num_layers=self.params['encoder_layers'],
-        dp_input_keep_prob=dp_input_keep_prob,
-        dp_output_keep_prob=dp_output_keep_prob,
-        residual_connections=self.params['encoder_use_skip_connections']
-      )
+      self._encoder_cell_fw = tf.contrib.rnn.MultiRNNCell(fwd_cells)
 
     with tf.variable_scope("BW"):
-      self._encoder_cell_bw = create_rnn_cell(
-        cell_type=self.params['encoder_cell_type'],
-        cell_params=cell_params,
-        num_layers=self.params['encoder_layers'],
-        dp_input_keep_prob=dp_input_keep_prob,
-        dp_output_keep_prob=dp_output_keep_prob,
-        residual_connections=self.params['encoder_use_skip_connections']
-      )
+      self._encoder_cell_bw = tf.contrib.rnn.MultiRNNCell(bwd_cells)
 
     embedded_inputs = tf.cast(tf.nn.embedding_lookup(
       self.enc_emb_w,
@@ -456,8 +452,10 @@ 

Source code for encoders.rnn_encoders

     return dict(Encoder.get_required_params(), **{
       'src_vocab_size': int,
       'src_emb_size': int,
-      'encoder_cell_units': int,
-      'encoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'],
+      'core_cell': None,
+      'core_cell_params': dict,
+      #'encoder_cell_units': int,
+      #'encoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'],
       'encoder_layers': int,
       'encoder_use_skip_connections': bool,
     })
@@ -508,27 +506,24 @@

Source code for encoders.rnn_encoders

     if self.params['encoder_layers'] < 2:
       raise ValueError("GNMT encoder must have at least 2 layers")
 
-    cell_params = copy.deepcopy(self.params)
-    cell_params["num_units"] = self.params['encoder_cell_units']
+    #cell_params = copy.deepcopy(self.params)
+    #cell_params["num_units"] = self.params['encoder_cell_units']
 
     with tf.variable_scope("Level1FW"):
-      self._encoder_l1_cell_fw = create_rnn_cell(
-        cell_type=self.params['encoder_cell_type'],
-        cell_params=cell_params,
-        num_layers=1,
+      self._encoder_l1_cell_fw = single_cell(
+        cell_class=self.params['core_cell'],
+        cell_params=self.params.get('core_cell_params', {}),
         dp_input_keep_prob=1.0,
         dp_output_keep_prob=1.0,
-        residual_connections=False,
-      )
+        residual_connections=False)
+
     with tf.variable_scope("Level1BW"):
-      self._encoder_l1_cell_bw = create_rnn_cell(
-        cell_type=self.params['encoder_cell_type'],
-        cell_params=cell_params,
-        num_layers=1,
+      self._encoder_l1_cell_bw = single_cell(
+        cell_class=self.params['core_cell'],
+        cell_params=self.params.get('core_cell_params', {}),
         dp_input_keep_prob=1.0,
         dp_output_keep_prob=1.0,
-        residual_connections=False,
-      )
+        residual_connections=False)
 
     if self._mode == "train":
       dp_input_keep_prob = self.params['encoder_dp_input_keep_prob']
@@ -538,15 +533,13 @@ 

Source code for encoders.rnn_encoders

       dp_output_keep_prob = 1.0
 
     with tf.variable_scope("UniDirLevel"):
-      self._encoder_cells = create_rnn_cell(
-        cell_type=self.params['encoder_cell_type'],
-        cell_params=cell_params,
-        num_layers=self.params['encoder_layers'] - 1,
+      self._encoder_cells = [single_cell(
+        cell_class=self.params['core_cell'],
+        cell_params=self.params.get('core_cell_params', {}),
         dp_input_keep_prob=dp_input_keep_prob,
         dp_output_keep_prob=dp_output_keep_prob,
-        residual_connections=False,
-        wrap_to_multi_rnn=False,
-      )
+        residual_connections=False) for _ in range(self.params['encoder_layers'] - 1)]
+
       # add residual connections starting from the third layer
       for idx, cell in enumerate(self._encoder_cells):
         if idx > 0:
@@ -577,7 +570,7 @@ 

Source code for encoders.rnn_encoders

       inputs=encoder_l1_outputs,
       sequence_length=source_length,
       swap_memory=use_swap_memory,
-      time_major = time_major,
+      time_major=time_major,
       dtype=encoder_l1_outputs.dtype,
     )
 
@@ -597,6 +590,154 @@ 

Source code for encoders.rnn_encoders

   @property
   def enc_emb_w(self):
     return self._enc_emb_w
+ +
[docs]class GNMTLikeEncoderWithEmbedding_cuDNN(Encoder): + """ + Encoder similar to the one used in + GNMT model: https://arxiv.org/abs/1609.08144. + Must have at least 2 layers. Uses cuDNN RNN blocks for efficiency + """ + +
[docs] @staticmethod + def get_required_params(): + return dict(Encoder.get_required_params(), **{ + 'src_vocab_size': int, + 'src_emb_size': int, + 'encoder_cell_units': int, + 'encoder_cell_type': ['lstm', 'gru'], + 'encoder_layers': int, + #'core_cell': None, + #'core_cell_params': dict, + })
+ +
[docs] @staticmethod + def get_optional_params(): + return dict(Encoder.get_optional_params(), **{ + 'encoder_dp_output_keep_prob': float, + })
+ +
[docs] def __init__(self, params, model, + name="gnmt_encoder_with_emb_cudnn", mode='train'): + """ + Encodes data into representation + :param params: a Python dictionary. + Must define: + * src_inputs - a Tensor of shape [batch_size, time] or [time, batch_size] + (depending on time_major param) + * src_lengths - a Tensor of shape [batch_size] + :return: a Python dictionary with: + * encoder_outputs - a Tensor of shape + [batch_size, time, representation_dim] + or [time, batch_size, representation_dim] + * encoder_state - a Tensor of shape [batch_size, dim] + * src_lengths - (copy ref from input) a Tensor of shape [batch_size] + """ + super(GNMTLikeEncoderWithEmbedding_cuDNN, self).__init__( + params, model, name=name, mode=mode, + ) + + self._src_vocab_size = self.params['src_vocab_size'] + self._src_emb_size = self.params['src_emb_size']
+
+  def _encode(self, input_dict):
+    source_sequence = input_dict['source_tensors'][0]
+    source_length = input_dict['source_tensors'][1]
+    self._enc_emb_w = tf.get_variable(
+      name="EncoderEmbeddingMatrix",
+      shape=[self._src_vocab_size, self._src_emb_size],
+      dtype=tf.float32
+    )
+
+    if self.params['encoder_layers'] < 2:
+      raise ValueError("GNMT encoder must have at least 2 layers")
+
+    if self._mode == "train":
+      dp_output_keep_prob = self.params['encoder_dp_output_keep_prob']
+    else:
+      dp_output_keep_prob = 1.0
+
+    # source_sequence is of [batch, time] shape
+    embedded_inputs = tf.cast(tf.nn.embedding_lookup(
+      self.enc_emb_w,
+      tf.transpose(source_sequence),  # cudnn wants [time, batch, ...]
+    ), self.params['dtype'])
+
+    with tf.variable_scope("Bi_Directional_Layer"):
+      direction = cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION
+      if self.params['encoder_cell_type'] == "gru":
+        bidirectional_block = tf.contrib.cudnn_rnn.CudnnGRU(
+          num_layers=1,
+          num_units=self.params['encoder_cell_units'],
+          direction=direction,
+          dropout=0.0,
+          dtype=self.params['dtype'],
+          name="cudnn_gru_bidi",
+        )
+      elif self.params['encoder_cell_type'] == "lstm":
+        bidirectional_block = tf.contrib.cudnn_rnn.CudnnLSTM(
+          num_layers=1,
+          num_units=self.params['encoder_cell_units'],
+          direction=direction,
+          dropout=0.0,
+          dtype=self.params['dtype'],
+          name="cudnn_lstm_bidi",
+        )
+      else:
+        raise ValueError(
+          "{} is not a valid rnn_type for cudnn_rnn layers"
+          .format(self.params['encoder_cell_type'])
+        )
+      bidi_output, bidi_state = bidirectional_block(embedded_inputs)
+
+    with tf.variable_scope("Uni_Directional_Layer"):
+      direction = cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION
+      layer_input = bidi_output
+      for ind in range(self.params['encoder_layers'] - 1):
+        with tf.variable_scope("uni_layer_{}".format(ind)):
+          if self.params['encoder_cell_type'] == "gru":
+            unidirectional_block = tf.contrib.cudnn_rnn.CudnnGRU(
+              num_layers=1,
+              num_units=self.params['encoder_cell_units'],
+              direction=direction,
+              dropout=1.0 - dp_output_keep_prob,
+              dtype=self.params['dtype'],
+              name="cudnn_gru_uni_{}".format(ind),
+            )
+          elif self.params['encoder_cell_type'] == "lstm":
+            unidirectional_block = tf.contrib.cudnn_rnn.CudnnLSTM(
+              num_layers=1,
+              num_units=self.params['encoder_cell_units'],
+              direction=direction,
+              dropout=1.0 - dp_output_keep_prob,
+              dtype=self.params['dtype'],
+              name="cudnn_lstm_uni_{}".format(ind),
+            )
+          layer_output, encoder_state = unidirectional_block(layer_input)
+          if ind > 0:  # add residual connection
+            layer_output = layer_input + layer_output
+          layer_input = layer_output
+
+    return {'outputs': tf.transpose(layer_input, perm=[1, 0, 2]),
+            'state': None,
+            'src_lengths': source_length,
+            'encoder_input': source_sequence}
+
+
+  @property
+  def src_vocab_size(self):
+    return self._src_vocab_size
+
+  @property
+  def src_emb_size(self):
+    return self._src_emb_size
+
+  @property
+  def enc_emb_w(self):
+    return self._enc_emb_w
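A hypothetical parameter dictionary matching the required and optional params this cuDNN-based GNMT encoder declares; the concrete numbers are placeholders only, not values from this patch:

gnmt_cudnn_encoder_params = {
    "src_vocab_size": 32000,             # placeholder vocabulary size
    "src_emb_size": 512,
    "encoder_cell_type": "lstm",         # "lstm" or "gru"
    "encoder_cell_units": 512,
    "encoder_layers": 4,                 # must be at least 2
    "encoder_dp_output_keep_prob": 0.8,  # dropout keep prob between the unidirectional layers
}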
diff --git a/docs/html/_modules/encoders/w2l_encoder.html b/docs/html/_modules/encoders/w2l_encoder.html
new file mode 100644
index 000000000..c7e0c51ac
--- /dev/null
+++ b/docs/html/_modules/encoders/w2l_encoder.html
@@ -0,0 +1,390 @@
+encoders.w2l_encoder — OpenSeq2Seq 0.2 documentation

Source code for encoders.w2l_encoder

+# Copyright (c) 2018 NVIDIA Corporation
+from __future__ import absolute_import, division, print_function
+from __future__ import unicode_literals
+from six.moves import range
+
+import tensorflow as tf
+
+from .encoder import Encoder
+from open_seq2seq.parts.cnns.conv_blocks import *
+
+
+
[docs]class Wave2LetterEncoder(Encoder): + """Wave2Letter like encoder. Fully convolutional model""" + +
[docs] @staticmethod + def get_required_params(): + return dict(Encoder.get_required_params(), **{ + 'dropout_keep_prob': float, + 'convnet_layers': list, + 'activation_fn': None, # any valid callable + })
+ +
[docs] @staticmethod + def get_optional_params(): + return dict(Encoder.get_optional_params(), **{ + 'data_format': ['channels_first', 'channels_last'], + 'normalization': [None, 'batch_norm'], + 'bn_momentum': float, + 'bn_epsilon': float, + })
+ +
[docs] def __init__(self, params, model, name="w2l_encoder", mode='train'): + """Wave2Letter like encoder constructor. + + See parent class for arguments description. + + Config parameters: + + * **dropout_keep_prop** (float) --- keep probability for dropout. + * **convnet_layers** (list) --- list with the description of convolutional + layers. For example:: + "convnet_layers": [ + { + "type": "conv1d", "repeat" : 5, + "kernel_size": [7], "stride": [1], + "num_channels": 250, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [11], "stride": [1], + "num_channels": 500, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 1, + "kernel_size": [32], "stride": [1], + "num_channels": 1000, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 1, + "kernel_size": [1], "stride": [1], + "num_channels": 1000, "padding": "SAME" + }, + ] + * **activation_fn** --- activation function to use. + * **data_format** (string) --- could be either "channels_first" or + "channels_last". Defaults to "channels_last". + * **normalization** --- normalization to use. Accepts [None, 'batch_norm']. + Use None if you don't want to use normalization. Defaults to 'batch_norm'. + * **bn_momentum** (float) --- momentum for batch norm. Defaults to 0.90. + * **bn_epsilon** (float) --- epsilon for batch norm. Defaults to 1e-3. + """ + super(Wave2LetterEncoder, self).__init__(params, model, name, mode)
+ +
[docs] def _encode(self, input_dict): + """Creates TensorFlow graph for Wav2Letter like encoder. + + Args: + input_dict (dict): input dictionary that has to contain + the following fields:: + input_dict = { + "source_tensors": [ + src_sequence (shape=[batch_size, sequence length, num features]), + src_length (shape=[batch_size]) + ] + } + + Returns: + dict: dictionary with the following tensors:: + + { + 'outputs': hidden state, shape=[batch_size, sequence length, n_hidden] + 'src_length': tensor, shape=[batch_size] + } + """ + + source_sequence, src_length = input_dict['source_tensors'] + + training = (self._mode == "train") + dropout_keep_prob = self.params['dropout_keep_prob'] if training else 1.0 + regularizer = self.params.get('regularizer', None) + data_format = self.params.get('data_format', 'channels_last') + normalization = self.params.get('normalization', 'batch_norm') + + normalization_params = {} + if normalization == None: + conv_block = conv_actv + elif normalization == "batch_norm": + conv_block = conv_bn_actv + normalization_params['bn_momentum'] = self.params.get( + 'bn_momentum', 0.90) + normalization_params['bn_epsilon'] = self.params.get('bn_epsilon', 1e-3) + + conv_inputs = source_sequence + batch_size = conv_inputs.get_shape().as_list()[0] + if data_format == 'channels_last': + conv_feats = conv_inputs # B T F + else: + conv_feats = tf.transpose(conv_inputs, [0, 2, 1]) # B F T + + # ----- Convolutional layers --------------------------------------------- + convnet_layers = self.params['convnet_layers'] + + for idx_convnet in range(len(convnet_layers)): + layer_type = convnet_layers[idx_convnet]['type'] + layer_repeat = convnet_layers[idx_convnet]['repeat'] + ch_out = convnet_layers[idx_convnet]['num_channels'] + kernel_size = convnet_layers[idx_convnet]['kernel_size'] + strides = convnet_layers[idx_convnet]['stride'] + padding = convnet_layers[idx_convnet]['padding'] + + for idx_layer in range(layer_repeat): + conv_feats = conv_block( + type=layer_type, + name="conv{}{}".format( + idx_convnet + 1, idx_layer + 1), + inputs=conv_feats, + filters=ch_out, + kernel_size=kernel_size, + activation_fn=self.params['activation_fn'], + strides=strides, + padding=padding, + regularizer=regularizer, + training=training, + data_format=data_format, + **normalization_params + ) + outputs = tf.nn.dropout(x=conv_feats, keep_prob=dropout_keep_prob) + + if data_format == 'channels_first': + outputs = tf.transpose(outputs, [0, 2, 1]) + + return { + 'outputs': outputs, + 'src_length': src_length, + }
\ No newline at end of file
diff --git a/docs/html/_modules/index.html b/docs/html/_modules/index.html
index 979fef39a..26c5b9829 100644
--- a/docs/html/_modules/index.html
+++ b/docs/html/_modules/index.html
@@ -160,14 +160,18 @@

All modules for which code is available

  • data.text2text.text2text
  • data.text2text.tokenizer
  • data.utils
  • +
  • decoders.convs2s_decoder
  • decoders.decoder
  • decoders.fc_decoders
  • decoders.rnn_decoders
  • +
  • encoders.cnn_encoder
  • +
  • encoders.convs2s_encoder
  • encoders.ds2_encoder
  • encoders.encoder
  • encoders.resnet_blocks
  • encoders.resnet_encoder
  • encoders.rnn_encoders
  • +
  • encoders.w2l_encoder
  • losses.cross_entropy_loss
  • losses.ctc_loss
  • losses.loss
  • @@ -181,6 +185,10 @@

    All modules for which code is available

  • optimizers.lr_policies
  • optimizers.mp_wrapper
  • optimizers.optimizers
  • +
  • parts.cnns.conv_blocks
  • +
  • parts.convs2s.attention_wn_layer
  • +
  • parts.convs2s.conv_wn_layer
  • +
  • parts.convs2s.ffn_wn_layer
  • parts.rnns.attention_wrapper
  • parts.rnns.flstm
  • parts.rnns.glstm
  • @@ -190,7 +198,6 @@

    All modules for which code is available

  • parts.rnns.utils
  • parts.transformer.attention_layer
  • parts.transformer.beam_search
  • -
  • parts.transformer.beam_search_test
  • parts.transformer.common
  • parts.transformer.embedding_layer
  • parts.transformer.ffn_layer
  • diff --git a/docs/html/_modules/losses/sequence_loss.html b/docs/html/_modules/losses/sequence_loss.html index 8906dc30a..e1f239446 100644 --- a/docs/html/_modules/losses/sequence_loss.html +++ b/docs/html/_modules/losses/sequence_loss.html @@ -407,11 +407,17 @@

    Source code for losses.sequence_loss

           'batch_size': int,
           'tgt_vocab_size': int,
           'label_smoothing': float,
    +      'pad_embeddings_2_eight': bool,
         })
    def __init__(self, params, model, name="padded_cross_entropy_with_smoothing"): super(PaddedCrossEntropyLossWithSmoothing, self).__init__(params, model, name) - self._tgt_vocab_size = self.params["tgt_vocab_size"] + if self.params.get('pad_embeddings_2_eight', False): + self._tgt_vocab_size = self.params["tgt_vocab_size"] if self.params[ + "tgt_vocab_size"] % 8 == 0 else \ + self.params["tgt_vocab_size"] + (8 - self.params["tgt_vocab_size"] % 8) + else: + self._tgt_vocab_size = self.params["tgt_vocab_size"] self._label_smoothing = self.params.get("label_smoothing", 0.0) def _compute_loss(self, input_dict): diff --git a/docs/html/_modules/models/encoder_decoder.html b/docs/html/_modules/models/encoder_decoder.html index 1fdcf3f46..205a65200 100644 --- a/docs/html/_modules/models/encoder_decoder.html +++ b/docs/html/_modules/models/encoder_decoder.html @@ -285,8 +285,8 @@
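To make the pad_embeddings_2_eight arithmetic in PaddedCrossEntropyLossWithSmoothing above concrete, a small stand-alone sketch of the same rounding; the vocabulary sizes are arbitrary (padding to a multiple of 8 is generally done for fp16 / tensor-core friendly GEMM shapes):

def pad_vocab_to_eight(vocab_size):
    # Mirrors the expression above: multiples of 8 are left alone, otherwise round up.
    if vocab_size % 8 == 0:
        return vocab_size
    return vocab_size + (8 - vocab_size % 8)

print(pad_vocab_to_eight(32000))  # 32000 (already a multiple of 8)
print(pad_vocab_to_eight(32317))  # 32320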

    Source code for models.encoder_decoder

     
         Returns:
           tuple: tuple containing loss tensor as returned from
    -      ``loss.compute_loss()`` and samples tensor, which is taken from
    -      ``decoder.decode()['samples']``. When ``mode == 'infer'``, loss will
    +      ``loss.compute_loss()`` and list of outputs tensors, which is taken from
    +      ``decoder.decode()['outputs']``. When ``mode == 'infer'``, loss will
           be None.
         """
         if not isinstance(input_tensors, dict) or \
    @@ -314,7 +314,7 @@ 

    Source code for models.encoder_decoder

           if self.mode == "train":
             decoder_input['target_tensors'] = target_tensors
           decoder_output = self.decoder.decode(input_dict=decoder_input)
    -      decoder_samples = decoder_output.get("samples", None)
    +      model_outputs = decoder_output.get("outputs", None)
     
           if self.mode == "train" or self.mode == "eval":
             with tf.variable_scope("Loss"):
    @@ -326,7 +326,7 @@ 

    Source code for models.encoder_decoder

           else:
             deco_print("Inference Mode. Loss part of graph isn't built.")
             loss = None
    -      return loss, decoder_samples
    + return loss, model_outputs
    @property def encoder(self): diff --git a/docs/html/_modules/models/image2label.html b/docs/html/_modules/models/image2label.html index 50f0d80be..813c9b37c 100644 --- a/docs/html/_modules/models/image2label.html +++ b/docs/html/_modules/models/image2label.html @@ -167,7 +167,7 @@

    Source code for models.image2label

     
     
     
    [docs]class Image2Label(EncoderDecoderModel): -
    [docs] def maybe_print_logs(self, input_values, output_values): +
    [docs] def maybe_print_logs(self, input_values, output_values, training_step): labels = input_values['target_tensors'][0] logits = output_values[0] @@ -186,7 +186,7 @@

    Source code for models.image2label

           "Train batch top-5": top5,
         }
    -
    [docs] def finalize_evaluation(self, results_per_batch): +
    [docs] def finalize_evaluation(self, results_per_batch, training_step=None): top1 = 0.0 top5 = 0.0 total = 0.0 @@ -211,11 +211,12 @@

    Source code for models.image2label

         labels = np.where(labels == 1)[1]
     
         total = logits.shape[0]
    -    top1 = np.sum(np.argmax(logits, axis=1) == labels)
    -    top5 = np.sum(labels[:, np.newaxis] == np.argpartition(logits, -5)[:, -5:])
    +    top1 = np.sum(np.equal(np.argmax(logits, axis=1), labels))
    +    top5 = np.sum(np.equal(labels[:, np.newaxis],
    +                           np.argpartition(logits, -5)[:, -5:]))
         return total, top1, top5
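A small self-contained illustration of the top-1/top-5 counting above; the logits and labels are invented. np.argpartition(logits, -5) places the indices of the five largest logits in the last five columns (in arbitrary order), which is all a membership test needs:

import numpy as np

logits = np.array([[0.1, 0.3, 0.2, 0.9, 0.0, 0.1, 0.4, 0.2, 0.1, 0.0],
                   [0.5, 0.1, 0.7, 0.2, 0.6, 0.9, 0.1, 0.3, 0.8, 0.4]])
labels = np.array([3, 8])

top1 = np.sum(np.equal(np.argmax(logits, axis=1), labels))
top5 = np.sum(np.equal(labels[:, np.newaxis],
                       np.argpartition(logits, -5)[:, -5:]))
print(top1, top5)  # 1 2 -- sample 0 is right at top-1, sample 1 only makes the top-5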
    -
    [docs] def get_num_objects_per_step(self, worker_id=0): +
    [docs] def _get_num_objects_per_step(self, worker_id=0): """Returns number of images in current batch, i.e. batch size.""" data_layer = self.get_data_layer(worker_id) num_images = tf.shape(data_layer.input_tensors['source_tensors'][0])[0] diff --git a/docs/html/_modules/models/model.html b/docs/html/_modules/models/model.html index 97595e7d6..ed8c775af 100644 --- a/docs/html/_modules/models/model.html +++ b/docs/html/_modules/models/model.html @@ -165,6 +165,11 @@

    Source code for models.model

     import copy
     import time
     
    +try:
    +    from inspect import signature
    +except ImportError:
    +    from funcsigs import signature
    +
     from open_seq2seq.utils.utils import deco_print, clip_last_batch
     from open_seq2seq.optimizers import optimize_loss, get_regularization_loss
     from open_seq2seq.utils.utils import check_params
    @@ -210,6 +215,7 @@ 

    Source code for models.model

           'save_summaries_steps': None,  # could be int or None
           'print_loss_steps': None,  # could be int or None
           'print_samples_steps': None,  # could be int or None
    +      'print_bench_info_steps': None,  # could be int or None
           'save_checkpoint_steps': None,  # could be int or None
           'eval_steps': int,
     
    @@ -230,9 +236,9 @@ 

    Source code for models.model

           'lr_policy_params': dict,
           'max_grad_norm': float,
           'larc_params': dict,
    -      'loss_scale': float,
    -      'automatic_loss_scaling': [None, 'Backoff', 'LogMax'],
    +      'loss_scaling': None,  # float, "Backoff" or "LogMax"
           'summaries': list,
    +      'iter_size': int,
         }
    [docs] def __init__(self, params, mode="train", hvd=None): @@ -276,6 +282,11 @@

    Source code for models.model

         * **print_samples_steps** (int or None) --- how often to print training
           samples (input sequences, correct answers and model predictions).
           Setting it to None disables samples printing.
    +    * **print_bench_info_steps** (int or None) --- how often to print training
    +      benchmarking information (average number of objects processed per step).
+      Setting it to None disables intermediate benchmarking printing; the
+      averages over the whole training run are still printed after the last
+      iteration.
         * **save_checkpoint_steps** (int or None) --- how often to save model
           checkpoints. Setting it to None disables checkpoint saving.
         * **eval_steps** (int) --- how often to run evaluation during training.
    @@ -311,14 +322,17 @@ 

    Source code for models.model

         * **max_grad_norm** (float) --- maximum value of gradient norm. Clipping
           will be performed if some gradients exceed this value (this is checked
           for each variable independently).
    -    * **loss_scale** (float) --- static loss scale to use. For details see
    -      :ref:`mixed precision training <mixed_precision>` section in docs.
    -    * **automatic_loss_scaling** --- automatic loss scaling mode. Could be
    -      either None, "Backoff" or "Logmax". For details see
    -      :ref:`mixed precision training <mixed_precision>` section in docs.
    +    * **loss_scaling** --- could be float or string. If float, static loss
    +      scaling is applied. If string, the corresponding automatic
    +      loss scaling algorithm is used. Must be one of 'Backoff'
+      or 'LogMax' (case insensitive). Only used when dtype="mixed". For details
    +      see :ref:`mixed precision training <mixed_precision>` section in docs.
         * **summaries** (list) --- which summaries to log. Could contain
           "learning_rate", "gradients", "gradient_norm", "global_gradient_norm",
           "variables", "variable_norm".
    +    * **iter_size** (int) --- use this parameter to emulate large batches.
    +      The gradients will be accumulated for ``iter_size`` number of steps before
+      applying the update.
         * **larc_params** --- dictionary with parameters for LARC (or LARS)
           optimization algorithms. Can contain the following parameters:
     
    @@ -335,6 +349,9 @@ 

    Source code for models.model

     
         self._params = copy.deepcopy(params)
     
    +    if self._params.get('iter_size', 1) > 1 and hvd is None:
    +      raise ValueError("iter_size is only supported in Horovod mode")
    +
         # parameter checks
         self._mode = mode
         if self._mode not in ["train", "infer", "eval"]:
    @@ -356,6 +373,8 @@ 

    Source code for models.model

           self._params['save_checkpoint_steps'] = None
         if 'save_summaries_steps' not in self._params:
           self._params['save_summaries_steps'] = None
    +    if 'print_bench_info_steps' not in self._params:
    +      self._params['print_bench_info_steps'] = None
     
         # checking that frequencies of samples and loss are aligned
         s_fr = self._params['print_samples_steps']
    @@ -421,15 +440,21 @@ 

    Source code for models.model

               self._steps_in_epoch //= self._hvd.size()
             else:
               self._steps_in_epoch //= self.num_gpus
    +        self._steps_in_epoch //= self._params.get('iter_size', 1)
    +        if self._steps_in_epoch == 0:
    +          raise ValueError("Overall batch size is too big for this dataset.")
             self._last_step = self._params['num_epochs'] * self._steps_in_epoch
     
         if self.on_horovod:
           self._output = None
         else:
           self._outputs = [None] * self.num_gpus
    +
         self.loss = None
         self.train_op = None
    -    self.eval_losses = None
    + self.eval_losses = None + self._num_objects_per_step = None + self.skip_update_ph = None
    [docs] def compile(self, force_var_reuse=False): """TensorFlow graph is built here.""" @@ -461,7 +486,7 @@

    Source code for models.model

               )
               if self._outputs[gpu_cnt] is not None and \
                  not isinstance(self._outputs[gpu_cnt], list):
    -            raise ValueError('Decoder samples have to be either None or list')
    +            raise ValueError('Decoder outputs have to be either None or list')
               if self._mode == "train" or self._mode == "eval":
                 losses.append(loss)
           # end of for gpu_ind loop
    @@ -487,13 +512,19 @@ 

    Source code for models.model

             loss, self._output = self._build_forward_pass_graph(input_tensors,
                                                                 gpu_id=0)
             if self._output is not None and not isinstance(self._output, list):
    -          raise ValueError('Decoder samples have to be either None or list')
    +          raise ValueError('Decoder outputs have to be either None or list')
     
             if self._mode == "train":
               self.loss = loss
             if self._mode == "eval":
               self.eval_losses = [loss]
     
    +    try:
    +      self._num_objects_per_step = [self._get_num_objects_per_step(worker_id)
    +                                    for worker_id in range(self.num_gpus)]
    +    except NotImplementedError:
    +      pass
    +
         if self._mode == "train":
           if 'lr_policy' not in self.params:
             lr_policy = None
    @@ -501,34 +532,31 @@ 

    Source code for models.model

             lr_params = self.params.get('lr_policy_params', {})
             # adding default decay_steps = max_steps if lr_policy supports it and
             # different value is not provided
    -        if 'decay_steps' in self.params['lr_policy'].__code__.co_varnames and \
    -           'decay_steps' not in lr_params:
    +        func_params = signature(self.params['lr_policy']).parameters
    +        if 'decay_steps' in func_params and 'decay_steps' not in lr_params:
               lr_params['decay_steps'] = self._last_step
    -        if 'steps_per_epoch' in self.params['lr_policy'].__code__.co_varnames and \
    +        if 'steps_per_epoch' in func_params and \
                'steps_per_epoch' not in lr_params and 'num_epochs' in self.params:
               lr_params['steps_per_epoch'] = self.steps_in_epoch
             lr_policy = lambda gs: self.params['lr_policy'](global_step=gs,
                                                             **lr_params)
     
    +      if self.params.get('iter_size', 1) > 1:
    +        self.skip_update_ph = tf.placeholder(tf.bool)
    +
           self.train_op = optimize_loss(
             loss=tf.cast(self.loss, tf.float32) + get_regularization_loss(),
             dtype=self.params['dtype'],
             optimizer=self.params['optimizer'],
             optimizer_params=self.params.get('optimizer_params', {}),
    -        gradient_noise_scale=None,
    -        gradient_multipliers=None,
             clip_gradients=self.params.get('max_grad_norm', None),
             learning_rate_decay_fn=lr_policy,
    -        update_ops=None,
    -        variables=None,
    -        name="Loss_Optimization",
             summaries=self.params.get('summaries', None),
    -        colocate_gradients_with_ops=True,
    -        increment_global_step=True,
             larc_params=self.params.get('larc_params', None),
    -        loss_scale=self.params.get('loss_scale', 1.0),
    -        automatic_loss_scaling=self.params.get('automatic_loss_scaling', None),
    +        loss_scaling=self.params.get('loss_scaling', 1.0),
             on_horovod=self.on_horovod,
    +        iter_size=self.params.get('iter_size', 1),
    +        skip_update_ph=self.skip_update_ph,
           )
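To make the iter_size semantics concrete, a back-of-the-envelope sketch with invented numbers: gradients are accumulated for iter_size steps before one update is applied, so the emulated batch grows accordingly.

per_gpu_batch = 32   # hypothetical batch size per worker
num_workers = 8      # hypothetical Horovod world size
iter_size = 4
effective_batch = per_gpu_batch * num_workers * iter_size
print(effective_batch)  # 1024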
           tf.summary.scalar(name="train_loss", tensor=self.loss)
           if self.steps_in_epoch:
    @@ -569,7 +597,7 @@ 

    Source code for models.model

               is constructed. For Horovod this is always zero.
     
         Returns:
    -      tuple: tuple containing loss tensor and samples tensor.
    +      tuple: tuple containing loss tensor and list of outputs tensors.
     
           Loss tensor will be automatically provided to the optimizer and
           corresponding :attr:`train_op` will be created.
    @@ -579,12 +607,12 @@ 

    Source code for models.model

           this happens inside :class:`utils.hooks.RunEvaluationHook`
           to fetch output values for evaluation.
     
    -      Both loss and samples can be None when corresponding part of the graph
    +      Both loss and outputs can be None when corresponding part of the graph
           is not built.
         """
         pass
    -
    [docs] def maybe_print_logs(self, input_values, output_values): +
    [docs] def maybe_print_logs(self, input_values, output_values, training_step): """This method can be used to print logs that help to visualize training. For example, you can print sample input sequences and their corresponding predictions. This method will be called every ``print_samples_steps`` @@ -602,6 +630,7 @@

    Source code for models.model

           output_values: evaluation of
               :meth:`self.get_output_tensors(0) <get_output_tensors>`,
               that is, output tensors for one batch on the *first* GPU.
    +      training_step (int): Current training step.
     
         Returns:
           dict: dictionary with values that need to be logged to TensorBoard
    @@ -646,7 +675,7 @@ 

    Source code for models.model

         """
         return []
    -
    [docs] def finalize_evaluation(self, results_per_batch): +
    [docs] def finalize_evaluation(self, results_per_batch, training_step=None): """This method can be used in conjunction with :meth:`self.evaluate()<evaluate>` to calculate evaluation metrics. @@ -669,6 +698,8 @@

    Source code for models.model

           results_per_batch (list): aggregation of values returned from all calls
               to :meth:`self.evaluate()<evaluate>` method (number of calls will be
               equal to number of evaluation batches).
    +      training_step (int): current training step. Will only be passed if mode
    +          is "train_eval".
     
         Returns:
           dict: dictionary with values that need to be logged to TensorBoard
    @@ -776,7 +807,7 @@ 

    Source code for models.model

         else:
           return self.params['dtype']
    -
    [docs] def get_num_objects_per_step(self, worker_id=0): +
    [docs] def _get_num_objects_per_step(self, worker_id=0): """Define this method if you need benchmarking functionality. For example, for translation models, this method should return number of tokens in current batch, for image recognition model should return number @@ -791,6 +822,12 @@

    Source code for models.model

         """
         raise NotImplementedError()
    +
    [docs] def get_num_objects_per_step(self, worker_id=0): + if self._num_objects_per_step: + return self._num_objects_per_step[worker_id] + else: + raise NotImplementedError()
    + @property def params(self): """Parameters used to construct the model (dictionary).""" diff --git a/docs/html/_modules/models/speech2text.html b/docs/html/_modules/models/speech2text.html index 80796538c..ee88e2bf3 100644 --- a/docs/html/_modules/models/speech2text.html +++ b/docs/html/_modules/models/speech2text.html @@ -204,7 +204,7 @@

    Source code for models.speech2text

         )
         return super(Speech2Text, self)._create_decoder()
     
    -
    [docs] def maybe_print_logs(self, input_values, output_values): +
    [docs] def maybe_print_logs(self, input_values, output_values, training_step): y, len_y = input_values['target_tensors'] decoded_sequence = output_values y_one_sample = y[0] @@ -229,7 +229,7 @@

    Source code for models.speech2text

           'Sample WER': sample_wer,
         }
    -
    [docs] def finalize_evaluation(self, results_per_batch): +
    [docs] def finalize_evaluation(self, results_per_batch, training_step=None): total_word_lev = 0.0 total_word_count = 0.0 for word_lev, word_count in results_per_batch: @@ -275,13 +275,21 @@

    Source code for models.speech2text

         )
         for sample_id in range(len(decoded_texts)):
           preds.append("".join(decoded_texts[sample_id]))
    -    return preds
    + return preds, input_values['source_ids']
    [docs] def finalize_inference(self, results_per_batch, output_file): preds = [] + ids = [] - for result in results_per_batch: + for result, idx in results_per_batch: preds.extend(result) + ids.extend(idx) + + preds = np.array(preds) + ids = np.hstack(ids) + # restoring the correct order + preds = preds[np.argsort(ids)] + pd.DataFrame( { 'wav_filename': self.get_data_layer().all_files, @@ -290,7 +298,7 @@

    Source code for models.speech2text

           columns=['wav_filename', 'predicted_transcript'],
         ).to_csv(output_file, index=False)
    -
    [docs] def get_num_objects_per_step(self, worker_id=0): +
    [docs] def _get_num_objects_per_step(self, worker_id=0): """Returns number of audio frames in current batch.""" data_layer = self.get_data_layer(worker_id) num_frames = tf.reduce_sum(data_layer.input_tensors['source_tensors'][1]) diff --git a/docs/html/_modules/models/text2text.html b/docs/html/_modules/models/text2text.html index 7efd1f609..fdf38dcdf 100644 --- a/docs/html/_modules/models/text2text.html +++ b/docs/html/_modules/models/text2text.html @@ -235,22 +235,23 @@

    Source code for models.text2text

         input_strings, output_strings = [], []
         input_values = input_values['source_tensors']
         for input_sample, output_sample in zip(input_values, output_values):
    -      output_strings.append(text_ids_to_string(
    -        output_sample[0],
    -        self.get_data_layer().params['target_idx2seq'],
    -        S_ID=self.decoder.params['GO_SYMBOL'],
    -        EOS_ID=self.decoder.params['END_SYMBOL'],
    -        PAD_ID=self.decoder.params['PAD_SYMBOL'],
    -        ignore_special=True, delim=' ',
    -      ))
    -      input_strings.append(text_ids_to_string(
    -        input_sample[0],
    -        self.get_data_layer().params['source_idx2seq'],
    -        S_ID=self.decoder.params['GO_SYMBOL'],
    -        EOS_ID=self.decoder.params['END_SYMBOL'],
    -        PAD_ID=self.decoder.params['PAD_SYMBOL'],
    -        ignore_special=True, delim=' ',
    -      ))
    +      for i in range(0, input_sample.shape[0]): # iterate over batch dimension
    +        output_strings.append(text_ids_to_string(
    +          output_sample[i],
    +          self.get_data_layer().params['target_idx2seq'],
    +          S_ID=self.decoder.params['GO_SYMBOL'],
    +          EOS_ID=self.decoder.params['END_SYMBOL'],
    +          PAD_ID=self.decoder.params['PAD_SYMBOL'],
    +          ignore_special=True, delim=' ',
    +        ))
    +        input_strings.append(text_ids_to_string(
    +          input_sample[i],
    +          self.get_data_layer().params['source_idx2seq'],
    +          S_ID=self.decoder.params['GO_SYMBOL'],
    +          EOS_ID=self.decoder.params['END_SYMBOL'],
    +          PAD_ID=self.decoder.params['PAD_SYMBOL'],
    +          ignore_special=True, delim=' ',
    +        ))
         return input_strings, output_strings
    [docs] def finalize_inference(self, results_per_batch, output_file): @@ -265,7 +266,7 @@

    Source code for models.text2text

                 deco_print("")
               step += 1
    -
    [docs] def maybe_print_logs(self, input_values, output_values): +
    [docs] def maybe_print_logs(self, input_values, output_values, training_step): x, len_x = input_values['source_tensors'] y, len_y = input_values['target_tensors'] samples = output_values[0] @@ -355,7 +356,7 @@

    Source code for models.text2text

     
         return preds, targets
    -
    [docs] def finalize_evaluation(self, results_per_batch): +
    [docs] def finalize_evaluation(self, results_per_batch, training_step=None): preds, targets = [], [] for preds_cur, targets_cur in results_per_batch: if self.params.get('eval_using_bleu', True): @@ -369,13 +370,18 @@

    Source code for models.text2text

     
         return {}
    -
    [docs] def get_num_objects_per_step(self, worker_id=0): +
    [docs] def _get_num_objects_per_step(self, worker_id=0): """Returns number of source tokens + number of target tokens in batch.""" data_layer = self.get_data_layer(worker_id) # sum of source length in batch num_tokens = tf.reduce_sum(data_layer.input_tensors['source_tensors'][1]) - # sum of target length in batch - num_tokens += tf.reduce_sum(data_layer.input_tensors['target_tensors'][1]) + if self.mode != "infer": + # sum of target length in batch + num_tokens += tf.reduce_sum(data_layer.input_tensors['target_tensors'][1]) + else: + # TODO: this is not going to be correct when batch size > 1, since it will + # count padding? + num_tokens += tf.reduce_sum(tf.shape(self.get_output_tensors(worker_id)[0])) return num_tokens
    diff --git a/docs/html/_modules/optimizers/automatic_loss_scaler.html b/docs/html/_modules/optimizers/automatic_loss_scaler.html index d2f005f41..e13af41f7 100644 --- a/docs/html/_modules/optimizers/automatic_loss_scaler.html +++ b/docs/html/_modules/optimizers/automatic_loss_scaler.html @@ -163,25 +163,25 @@

    Source code for optimizers.automatic_loss_scaler

    import tensorflow as tf -
    [docs]class AutomaticLossScaler: - SUPPORTED_ALGOS = ['Backoff', 'LogMax'] +
    [docs]class AutomaticLossScaler(object): + SUPPORTED_ALGOS = ['backoff', 'logmax'] def __init__(self, algorithm='Backoff', scale_min=1.0, scale_max=2.**24): - if algorithm == 'Backoff': + algorithm = algorithm.lower().strip() + if algorithm == 'backoff': self.scaler = BackoffScaler(scale_min=scale_min, scale_max=scale_max, step_factor=2.0, step_window=2000) - elif algorithm == 'LogMax': + elif algorithm == 'logmax': self.scaler = LogMaxScaler(scale_min=scale_min, scale_max=scale_max, log_max=16., beta1=0.99, beta2=0.999, - overflow_std_dev=3.09) # ppf(.999) + overflow_std_dev=3.09) # ppf(.999) else: - raise ValueError('Unknown dynamic scaling algorithm: %s' - % algorithm_name) + raise ValueError('Unknown scaling algorithm: {}'.format(algorithm))
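A small usage note on the normalization above: the algorithm name is lower-cased and stripped before dispatch, so the two constructions below should build the same BackoffScaler (a sketch, not taken from the patch):

scaler_a = AutomaticLossScaler(algorithm='Backoff')
scaler_b = AutomaticLossScaler(algorithm=' backoff ')  # case and surrounding spaces are ignored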
    [docs] def update_op(self, has_nan, amax): return self.scaler.update_op(has_nan, amax)
    @@ -210,7 +210,7 @@

    Source code for optimizers.automatic_loss_scaler

    return has_nan, amax
    -
    [docs]class BackoffScaler: +
    [docs]class BackoffScaler(object): def __init__(self, scale_min, scale_max, step_factor, step_window): self.scale_min = scale_min self.scale_max = scale_max @@ -260,7 +260,7 @@

    Source code for optimizers.automatic_loss_scaler

    return self.scale
    -
    [docs]class LogMaxScaler: +
    [docs]class LogMaxScaler(object): def __init__(self, scale_min, scale_max, log_max, beta1, beta2, overflow_std_dev): self.scale_min = scale_min self.scale_max = scale_max diff --git a/docs/html/_modules/optimizers/mp_wrapper.html b/docs/html/_modules/optimizers/mp_wrapper.html index 5ab6be4bf..34c0746f5 100644 --- a/docs/html/_modules/optimizers/mp_wrapper.html +++ b/docs/html/_modules/optimizers/mp_wrapper.html @@ -165,23 +165,28 @@

    Source code for optimizers.mp_wrapper

     
     
     
    [docs]class MixedPrecisionOptimizerWrapper(tf.train.Optimizer): - def __init__(self, optimizer, automatic_loss_scaler=None): + def __init__(self, optimizer, loss_scale=None): super(MixedPrecisionOptimizerWrapper, self).__init__( optimizer._use_locking, optimizer._name + '-MP', ) self._optimizer = optimizer self._fp32_to_fp16 = {} - self._loss_scaler = automatic_loss_scaler + self._loss_scaler = None + if loss_scale is None: + self._loss_scale = 1.0 + elif isinstance(loss_scale, float): + self._loss_scale = loss_scale + elif isinstance(loss_scale, AutomaticLossScaler): + self._loss_scaler = loss_scale + self._loss_scale = self._loss_scaler.loss_scale
    [docs] def compute_gradients(self, loss, var_list=None, gate_gradients=tf.train.Optimizer.GATE_OP, aggregation_method=None, colocate_gradients_with_ops=False, grad_loss=None): - if self._loss_scaler: - loss *= self._loss_scaler.loss_scale - + loss *= self._loss_scale grads_and_vars_fp16 = self._optimizer.compute_gradients( loss, var_list=var_list, gate_gradients=gate_gradients, @@ -214,7 +219,7 @@

    Source code for optimizers.mp_wrapper

               fp32_grad = tf.cast(grad, tf.float32)
               # adding regularization part with respect to fp32 copy
               if var.name in reg_funcs:
    -            fp32_grad += tf.gradients(
    +            fp32_grad += self._loss_scale * tf.gradients(
                   tf.contrib.layers.apply_regularization(
                     reg_funcs[var.name],
                     [fp32_var],
    @@ -225,15 +230,11 @@ 

    Source code for optimizers.mp_wrapper

             else:
               grads_and_vars_fp32.append((grad, var))
     
    -    # Unscale gradients if necessary
    -    if self._loss_scaler:
    -      grads_and_vars_fp32 = _scale_grads(grads_and_vars_fp32,
    -                                         1. / self._loss_scaler.loss_scale)
    -
    +    grads_and_vars_fp32 = _scale_grads(grads_and_vars_fp32,
    +                                       1.0 / self._loss_scale)
         return grads_and_vars_fp32
    [docs] def apply_gradients(self, grads_and_vars, global_step=None, name=None): - def apply_ops_wrapper(): update_op = self._optimizer.apply_gradients(grads_and_vars, global_step, name) @@ -243,7 +244,7 @@

    Source code for optimizers.mp_wrapper

               if var.name in self._fp32_to_fp16:
                 dst_var = self._fp32_to_fp16[var.name]
                 apply_ops.append(
    -              tf.assign(dst_var, tf.cast(var, tf.float16)))
    +              tf.assign(dst_var, tf.saturate_cast(var, tf.float16)))
           if apply_ops:
             return tf.group(apply_ops)
           return update_op
    @@ -254,9 +255,7 @@ 

    Source code for optimizers.mp_wrapper

           loss_scale_update_op = self._loss_scaler.update_op(grad_has_nans,
                                                              grad_amax)
           with tf.control_dependencies([loss_scale_update_op]):
    -        return tf.cond(should_skip_update,
    -                       tf.no_op,
    -                       apply_ops_wrapper)
    +        return tf.cond(should_skip_update, tf.no_op, apply_ops_wrapper)
         else:
           return apply_ops_wrapper()
    @@ -284,6 +283,7 @@

    Source code for optimizers.mp_wrapper

             grad *= scale
         scaled_grads_and_vars.append((grad, var))
       return scaled_grads_and_vars
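For reference, the reworked constructor above accepts either nothing (scale 1.0), a static float, or an AutomaticLossScaler instance. A minimal usage sketch (the optimizer choice and constants are arbitrary; module paths are assumed to be the usual open_seq2seq.optimizers ones):

import tensorflow as tf
from open_seq2seq.optimizers.mp_wrapper import MixedPrecisionOptimizerWrapper
from open_seq2seq.optimizers.automatic_loss_scaler import AutomaticLossScaler

base_opt = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9)

# static loss scaling: the loss is multiplied by 512 inside compute_gradients()
# and the fp32 gradients are divided back by the same constant afterwards
opt_static = MixedPrecisionOptimizerWrapper(base_opt, loss_scale=512.0)

# dynamic loss scaling: the scale is adjusted by the Backoff algorithm
opt_dynamic = MixedPrecisionOptimizerWrapper(
    base_opt, loss_scale=AutomaticLossScaler(algorithm='Backoff'))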
    +
     
    diff --git a/docs/html/_modules/optimizers/optimizers.html b/docs/html/_modules/optimizers/optimizers.html index 794d8c680..a995d301e 100644 --- a/docs/html/_modules/optimizers/optimizers.html +++ b/docs/html/_modules/optimizers/optimizers.html @@ -182,23 +182,8 @@

    Source code for optimizers.optimizers

     
     import six
     import tensorflow as tf
    -
    -from tensorflow.contrib import framework as contrib_framework
    -from tensorflow.python.framework import constant_op
    -from tensorflow.python.framework import dtypes
    -from tensorflow.python.framework import ops
    -from tensorflow.python.ops import array_ops
    -from tensorflow.python.ops import clip_ops
     from tensorflow.python.ops import control_flow_ops
    -from tensorflow.python.ops import init_ops
    -from tensorflow.python.ops import math_ops
    -from tensorflow.python.ops import random_ops
    -from tensorflow.python.ops import variable_scope as vs
    -from tensorflow.python.ops import variables as vars_
    -from tensorflow.python.summary import summary
    -from tensorflow.python.training import moving_averages
    -from tensorflow.python.training import optimizer as optimizer_
    -from tensorflow.python.training import training as train
    +
     
     from .automatic_loss_scaler import AutomaticLossScaler
     from .mp_wrapper import MixedPrecisionOptimizerWrapper
    @@ -206,12 +191,12 @@ 

    Source code for optimizers.optimizers

     
     
     OPTIMIZER_CLS_NAMES = {
    -  "Adagrad": train.AdagradOptimizer,
    -  "Adam": train.AdamOptimizer,
    -  "Ftrl": train.FtrlOptimizer,
    -  "Momentum": train.MomentumOptimizer,
    -  "RMSProp": train.RMSPropOptimizer,
    -  "SGD": train.GradientDescentOptimizer,
    +  "Adagrad": tf.train.AdagradOptimizer,
    +  "Adam": tf.train.AdamOptimizer,
    +  "Ftrl": tf.train.FtrlOptimizer,
    +  "Momentum": tf.train.MomentumOptimizer,
    +  "RMSProp": tf.train.RMSPropOptimizer,
    +  "SGD": tf.train.GradientDescentOptimizer,
     }
     
     OPTIMIZER_SUMMARIES = [
    @@ -236,555 +221,302 @@ 

    Source code for optimizers.optimizers

       """
       losses = tf.losses.get_regularization_losses(scope)
       if losses:
    -    return math_ops.add_n(list(map(lambda x: tf.cast(x, tf.float32), losses)),
    -                          name=name)
    +    return tf.add_n(list(map(lambda x: tf.cast(x, tf.float32), losses)),
    +                    name=name)
       else:
    -    return constant_op.constant(0.0)
    - - -
    [docs]class DistributedOptimizer(tf.train.Optimizer): - """An optimizer that wraps another tf.Optimizer, using an allreduce to - average gradient values before applying gradients to model weights.""" - -
    [docs] def __init__(self, optimizer, name=None, use_locking=False, device_dense='', - device_sparse=''): - """Construct a new DistributedOptimizer, which uses another optimizer - under the hood for computing single-process gradient values and - applying gradient updates after the gradient values have been averaged - across all the Horovod ranks. - Args: - optimizer: - Optimizer to use for computing gradients and applying updates. - name: - Optional name prefix for the operations created when applying - gradients. Defaults to "Distributed" followed by the provided - optimizer type. - use_locking: - Whether to use locking when updating variables. - See Optimizer.__init__ for more info. - device_dense: - Device to be used for dense tensors. Uses GPU by default - if Horovod was build with HOROVOD_GPU_ALLREDUCE. - device_sparse: - Device to be used for sparse tensors. Uses GPU by default - if Horovod was build with HOROVOD_GPU_ALLGATHER. - """ - if name is None: - name = "Distributed{}".format(type(optimizer).__name__) - - self._optimizer = optimizer - self._device_dense = device_dense - self._device_sparse = device_sparse - super(DistributedOptimizer, self).__init__( - name=name, use_locking=use_locking)
    - -
    [docs] def compute_gradients(self, *args, **kwargs): - """Compute gradients of all trainable variables. - See Optimizer.compute_gradients() for more info. - In DistributedOptimizer, compute_gradients() is overriden to also - allreduce the gradients before returning them. - """ - gradients = self._optimizer.compute_gradients(*args, **kwargs) + return tf.constant(0.0)
    + + +
    [docs]def reduce_gradients(grads_and_vars, on_horovod): + if on_horovod: from horovod.common import size from horovod.tensorflow import allreduce if size() > 1: - averaged_gradients = [] - with tf.name_scope(self._name + "_Allreduce"): - for grad, var in gradients: + averaged_grads_and_vars = [] + with tf.name_scope("all_reduce"): + for grad, var in grads_and_vars: if grad is not None: - avg_grad = allreduce(grad, device_dense=self._device_dense, - device_sparse=self._device_sparse) - averaged_gradients.append((avg_grad, var)) + avg_grad = allreduce(grad) + averaged_grads_and_vars.append((avg_grad, var)) else: - averaged_gradients.append((None, var)) - return averaged_gradients + averaged_grads_and_vars.append((None, var)) + return averaged_grads_and_vars else: - return gradients
    - -
    [docs] def apply_gradients(self, grads_and_vars, global_step=None, name=None): - """Calls this same method on the underlying optimizer.""" - return self._optimizer.apply_gradients(grads_and_vars, global_step, name)
    + return grads_and_vars + else: + raise NotImplementedError("Reduce in tower-mode is not implemented.")
    [docs]def optimize_loss(loss, optimizer, optimizer_params, learning_rate_decay_fn, - global_step=None, dtype=tf.float32, - gradient_noise_scale=None, - gradient_multipliers=None, clip_gradients=None, - update_ops=None, - variables=None, - name=None, summaries=None, - colocate_gradients_with_ops=False, - increment_global_step=True, larc_params=None, - loss_scale=1.0, - automatic_loss_scaling=None, - on_horovod=False): + loss_scaling=1.0, + on_horovod=False, + iter_size=1, + skip_update_ph=None): """Given loss and parameters for optimizer, returns a training op. - Various ways of passing optimizers include: - - - by string specifying the name of the optimizer. See OPTIMIZER_CLS_NAMES - for full list. E.g. `optimize_loss(..., optimizer='Adam')`. - - by function taking learning rate `Tensor` as argument and returning an - `Optimizer` instance. E.g. `optimize_loss(..., - optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5))`. - Alternatively, if `learning_rate` is `None`, the function takes no - arguments. E.g. `optimize_loss(..., learning_rate=None, - optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`. - - by a subclass of `Optimizer` having a single-argument constructor - (the argument is the learning rate), such as AdamOptimizer or - AdagradOptimizer. E.g. `optimize_loss(..., - optimizer=tf.train.AdagradOptimizer)`. - - by an instance of a subclass of `Optimizer`. - E.g., `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`. - Args: loss: Scalar `Tensor`. - global_step: Scalar int `Tensor`, step counter to update on each step - unless `increment_global_step` is `False`. If not supplied, - it will be fetched from the default graph (see - `tf.train.get_global_step` for details). If it has - not been created, no step will be incremented with each weight - update. `learning_rate_decay_fn` requires `global_step`. - learning_rate: float or `Tensor`, magnitude of update per each training - step. Can be `None`. - optimizer: string, class or optimizer instance, used as trainer. - string should be name of optimizer, like 'SGD', - 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant. - class should be sub-class of `tf.Optimizer` that implements - `compute_gradients` and `apply_gradients` functions. - optimizer instance should be instantiation of `tf.Optimizer` - sub-class and have `compute_gradients` and `apply_gradients` - functions. - gradient_noise_scale: float or None, adds 0-mean normal noise scaled by this - value. - gradient_multipliers: dict of variables or variable names to floats. - If present, gradients for specified - variables will be multiplied by given constant. - clip_gradients: float, callable or `None`. If float, is provided, a global - clipping is applied to prevent the norm of the gradient to exceed this - value. Alternatively, a callable can be provided e.g.: adaptive_clipping. - This callable takes a `list` of `(gradients, variables)` `tuple`s and - returns the same thing with the gradients modified. - learning_rate_decay_fn: function, takes `learning_rate` and `global_step` - `Tensor`s, returns `Tensor`. - Can be used to implement any learning rate decay - functions. - For example: `tf.train.exponential_decay`. - Ignored if `learning_rate` is not supplied. - update_ops: list of update `Operation`s to execute at each step. If `None`, - uses elements of UPDATE_OPS collection. The order of execution - between `update_ops` and `loss` is non-deterministic. 
- variables: list of variables to optimize or - `None` to use all trainable variables. - name: The name for this operation is used to scope operations and summaries. + optimizer: string or class of optimizer, used as trainer. + string should be name of optimizer, like 'SGD', + 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant. + class should be sub-class of `tf.Optimizer` that implements + `compute_gradients` and `apply_gradients` functions. + optimizer_params: parameters of the optimizer. + dtype: model dtype (tf.float16, tf.float32 or "mixed"). + learning_rate_decay_fn: function, takes `global_step` + `Tensor`s, returns `Tensor`. + Can be used to implement any learning rate decay + functions. + For example: `tf.train.exponential_decay`. + Ignored if `learning_rate` is not supplied. + clip_gradients: float, max gradient norm to clip to. summaries: List of internal quantities to visualize on tensorboard. If not - set only the loss and the learning rate will be reported. The - complete list is in OPTIMIZER_SUMMARIES. - colocate_gradients_with_ops: If True, try colocating gradients with the - corresponding op. - increment_global_step: Whether to increment `global_step`. If your model - calls `optimize_loss` multiple times per training step (e.g. to optimize - different parts of the model), use this arg to avoid incrementing - `global_step` more times than necessary. - LARC_mode: 'scale' or 'clip' - LARC_nu: If not None, LARC re-scaling will be - applied https://arxiv.org/pdf/1708.03888.pdf with nu=LARC_nu - automatic_loss_scaling: if not None, use the corresponding automatic - loss scaling algorithm. Must be one of 'Backoff' - of 'LogMax'. `dtype` must be "mixed" to use ALS. + set only the loss and the learning rate will be reported. The + complete list is in OPTIMIZER_SUMMARIES. + larc_params: If not None, LARC re-scaling will + be applied with corresponding parameters. + loss_scaling: could be float or string. If float, static loss scaling + is applied. If string, the corresponding automatic + loss scaling algorithm is used. Must be one of 'Backoff' + of 'LogMax' (case insensitive). Only used when dtype="mixed". + on_horovod: whether the model is run on horovod. + Returns: - Training op. - - Raises: - ValueError: if: - * `loss` is an invalid type or shape. - * `global_step` is an invalid type or shape. - * `learning_rate` is an invalid type or value. - * `optimizer` has the wrong type. - * `clip_gradients` is neither float nor callable. - * `learning_rate` and `learning_rate_decay_fn` are supplied, but no - `global_step` is available. - * `gradients` is empty. + training op. """ - loss = ops.convert_to_tensor(loss) - contrib_framework.assert_scalar(loss) - if global_step is None: - global_step = tf.train.get_or_create_global_step() + if summaries is None: + summaries = ["learning_rate", "global_gradient_norm"] else: - tf.train.assert_global_step(global_step) - with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]): - # Update ops take UPDATE_OPS collection if not provided. - if update_ops is None: - update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS)) - # Make sure update ops are ran before computing loss. - if update_ops: - loss = control_flow_ops.with_dependencies(list(update_ops), loss) - - if summaries is None: - summaries = ["learning_rate", "global_gradient_norm"] - else: - for summ in summaries: - if summ not in OPTIMIZER_SUMMARIES: - raise ValueError("Summaries should be one of [%s], you provided %s." 
% - (", ".join(OPTIMIZER_SUMMARIES), summ)) - if global_step is None: - raise ValueError("global_step is required for learning_rate_decay_fn.") - lr = learning_rate_decay_fn(global_step) - - if "learning_rate" in summaries: - summary.scalar("learning_rate", lr) + for summ in summaries: + if summ not in OPTIMIZER_SUMMARIES: + raise ValueError( + "Summaries should be one of [{}], you provided {}.".format( + ", ".join(OPTIMIZER_SUMMARIES), summ, + )) + if clip_gradients is not None and larc_params is not None: + raise AttributeError( + "LARC and gradient norm clipping should not be used together" + ) + + global_step = tf.train.get_or_create_global_step() + lr = learning_rate_decay_fn(global_step) + if "learning_rate" in summaries: + tf.summary.scalar("learning_rate", lr) + + with tf.variable_scope("Loss_Optimization"): + update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) + loss = control_flow_ops.with_dependencies(list(update_ops), loss) # Create optimizer, given specified parameters. if isinstance(optimizer, six.string_types): - if lr is None: - raise ValueError("Learning rate is None, but should be specified if " - "optimizer is string (%s)." % optimizer) if optimizer not in OPTIMIZER_CLS_NAMES: raise ValueError( - "Optimizer name should be one of [%s], you provided %s." % - (", ".join(OPTIMIZER_CLS_NAMES), optimizer)) - opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr, **optimizer_params) - elif (isinstance(optimizer, type) and - issubclass(optimizer, optimizer_.Optimizer)): - if lr is None: - raise ValueError("Learning rate is None, but should be specified if " - "optimizer is class (%s)." % optimizer) - opt = optimizer(learning_rate=lr, **optimizer_params) - elif isinstance(optimizer, optimizer_.Optimizer): - opt = optimizer - elif callable(optimizer): - if lr is not None: - opt = optimizer(lr, **optimizer_params) - else: - opt = optimizer(**optimizer_params) - if not isinstance(opt, optimizer_.Optimizer): - raise ValueError("Unrecognized optimizer: function should return " - "subclass of Optimizer. Got %s." % str(opt)) - else: - raise ValueError("Unrecognized optimizer: should be string, " - "subclass of Optimizer, instance of " - "subclass of Optimizer or function with one argument. " - "Got %s." % str(optimizer)) - # All trainable variables, if specific variables are not specified. - if variables is None: - variables = vars_.trainable_variables() - - if automatic_loss_scaling is not None: - if not automatic_loss_scaling in AutomaticLossScaler.SUPPORTED_ALGOS: - raise ValueError("Unknown automatic loss scaling algorithm: %s." - % automatic_loss_sclaing) - if dtype != "mixed": - raise ValueError("Automatic loss scaling can be used only with " - "dtype=mixed.") - loss_scaler = AutomaticLossScaler(algorithm=automatic_loss_scaling) - else: - loss_scaler = None + "Optimizer name should be one of [{}], you provided {}.".format( + ", ".join(OPTIMIZER_CLS_NAMES), optimizer + )) + optimizer = OPTIMIZER_CLS_NAMES[optimizer] + opt = optimizer(learning_rate=lr, **optimizer_params) + + if isinstance(loss_scaling, six.string_types): + loss_scaling = AutomaticLossScaler(algorithm=loss_scaling) if dtype == 'mixed': - opt = MixedPrecisionOptimizerWrapper( - opt, - automatic_loss_scaler=loss_scaler, - ) - if on_horovod: - opt = DistributedOptimizer(opt) + opt = MixedPrecisionOptimizerWrapper(opt, loss_scale=loss_scaling) # Compute gradients. 
- gradients = opt.compute_gradients( - loss if loss_scale == 1.0 else loss * loss_scale, - variables, - colocate_gradients_with_ops=colocate_gradients_with_ops) - - if loss_scale != 1.0: - gradients = _multiply_gradients_const(gradients, 1.0 / loss_scale) - - # Optionally add gradient noise. - if gradient_noise_scale is not None: - gradients = _add_scaled_noise_to_gradients(gradients, - gradient_noise_scale) - - # Multiply some gradients. - if gradient_multipliers is not None: - gradients = _multiply_gradients(gradients, gradient_multipliers) - if not gradients: - raise ValueError( - "Empty list of (gradient, var) pairs encountered. This is most " - "likely to be caused by an improper value of gradient_multipliers.") - - if "global_gradient_norm" in summaries or "gradient_norm" in summaries: - summary.scalar( - "global_norm/gradient_norm", - clip_ops.global_norm(list(map( - lambda x: tf.cast(x, tf.float32), - list(zip(*gradients))[0]) - )), - ) + grads_and_vars = opt.compute_gradients( + loss, colocate_gradients_with_ops=True, + ) - # Optionally clip gradients by global norm. - if clip_gradients is not None and larc_params is not None: - raise AttributeError( - "LARC and gradient norm clipping should not be used together" - ) - if isinstance(clip_gradients, float): - gradients = _clip_gradients_by_norm(gradients, clip_gradients) - elif callable(clip_gradients): - gradients = clip_gradients(gradients) - elif clip_gradients is not None: - raise ValueError( - "Unknown type %s for clip_gradients" % type(clip_gradients)) - - # Add histograms for variables, gradients and gradient norms. - for gradient, variable in gradients: - if isinstance(gradient, ops.IndexedSlices): - grad_values = gradient.values - else: - grad_values = gradient - - if isinstance(variable, ops.IndexedSlices): - var_values = variable.values + if on_horovod: + if iter_size > 1: + grads_and_vars_accum = [] + accum_ops = [] + for grad, var in grads_and_vars: + # necessary to use tf.Variable directly to instantiate cudnn rnn cells + # which don't have explicit shape. 
+ grad_accum = tf.Variable( + initial_value=tf.zeros_like(var), + name=grad.name.split(":")[0] + "_accum", + expected_shape=var.shape, + dtype=grad.dtype, + trainable=False, + validate_shape=bool(var.get_shape()) + ) + if isinstance(grad, tf.IndexedSlices): + add_grads = tf.scatter_nd_add(grad_accum, grad.indices, + grad.values / iter_size) + else: + add_grads = grad_accum + grad / iter_size + + accum_ops.append(tf.assign(grad_accum, add_grads)) + grads_and_vars_accum.append((grad_accum, var)) + + accum_op = tf.group(accum_ops) + + def update_and_clear_op(): + with tf.control_dependencies([accum_op]): + red_grad_updates = opt.apply_gradients( + post_process_gradients( + reduce_gradients(grads_and_vars_accum, on_horovod=True), + lr=lr, + clip_gradients=clip_gradients, + larc_params=larc_params, + summaries=summaries, + ), + global_step=global_step, + ) + + with tf.control_dependencies([red_grad_updates]): + return tf.group([tf.assign(g, tf.zeros_like(g)) + for g, v in grads_and_vars_accum]) + + grad_updates = tf.cond( + pred=skip_update_ph, + true_fn=lambda: accum_op, + false_fn=update_and_clear_op, + ) else: - var_values = variable - - if grad_values is not None: - var_name = variable.name.replace(":", "_") - if "gradients" in summaries: - summary.histogram("gradients/%s" % var_name, mask_nans(grad_values)) - if "gradient_norm" in summaries: - summary.scalar("gradient_norm/%s" % var_name, - clip_ops.global_norm([grad_values])) - if "variables" in summaries: - summary.histogram("variables/%s" % var_name, var_values) - if "variable_norm" in summaries: - summary.scalar("variable_norm/%s" % var_name, - clip_ops.global_norm([var_values])) - - if clip_gradients is not None and ("global_gradient_norm" in summaries or - "gradient_norm" in summaries): - summary.scalar( - "global_norm/clipped_gradient_norm", - clip_ops.global_norm(list(map( - lambda x: tf.cast(x, tf.float32), - list(zip(*gradients))[0]) - )), - ) - - # LARC gradient re-scaling - if larc_params is not None: - check_params( - config=larc_params, - required_dict={'larc_eta': float}, - optional_dict={ - 'larc_mode': ['clip', 'scale'], - 'min_update': float, - 'epsilon': float - }, + grad_updates = opt.apply_gradients( + post_process_gradients( + reduce_gradients(grads_and_vars, on_horovod=True), + lr=lr, + clip_gradients=clip_gradients, + larc_params=larc_params, + summaries=summaries, + ), + global_step=global_step, + ) + else: + grad_updates = opt.apply_gradients( + post_process_gradients( + grads_and_vars, + lr=lr, + clip_gradients=clip_gradients, + larc_params=larc_params, + summaries=summaries, + ), + global_step=global_step, ) - larc_eta = larc_params['larc_eta'] - larc_mode = larc_params.get('larc_mode', 'clip') - min_update = larc_params.get('min_update', 1e-7) - eps = larc_params.get('epsilon', 1e-7) - - for idx, (g, v) in enumerate(gradients): - var_dtype = v.dtype - v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2) - g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2) - - if larc_mode == 'clip': - larc_grad_update = tf.maximum( - larc_eta * v_norm / (lr * (g_norm + eps)), - min_update, - ) - if "larc_summaries" in summaries: - summary.scalar('larc_clip_on/{}'.format(v.name), - tf.cast(tf.less(larc_grad_update, 1.0), tf.int32)) - larc_grad_update = tf.minimum(larc_grad_update, 1.0) - else: - larc_grad_update = tf.maximum( - larc_eta * v_norm / (g_norm + eps), - min_update, - ) - larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype) - gradients[idx] = (larc_grad_update * g, v) - # adding additional summary - 
if "larc_summaries" in summaries: - summary.scalar('larc_grad_update/{}'.format(v.name), larc_grad_update) - summary.scalar("larc_final_lr/{}".format(v.name), - tf.cast(lr, var_dtype) * larc_grad_update) - - # Create gradient updates. - grad_updates = opt.apply_gradients( - gradients, - global_step=global_step if increment_global_step else None, - name="train") - - # # Ensure the train_tensor computes grad_updates. + # Ensure the train_tensor computes grad_updates. train_tensor = control_flow_ops.with_dependencies([grad_updates], loss) return train_tensor
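A hedged usage sketch of the slimmed-down optimize_loss signature, roughly as training code might call it (loss and skip_update_ph are assumed to exist already; all hyper-parameter values are made up and only keyword names visible in the definition above are used):

train_op = optimize_loss(
    loss=loss,
    optimizer="Momentum",
    optimizer_params={"momentum": 0.9},
    learning_rate_decay_fn=lambda step: tf.train.exponential_decay(
        0.01, step, decay_steps=1000, decay_rate=0.9),
    dtype="mixed",                  # enables MixedPrecisionOptimizerWrapper
    clip_gradients=5.0,             # mutually exclusive with larc_params
    summaries=["learning_rate", "global_gradient_norm"],
    loss_scaling="Backoff",         # string -> AutomaticLossScaler, float -> static scale
    on_horovod=True,
    iter_size=4,                    # accumulate gradients for 4 steps per update
    skip_update_ph=skip_update_ph,  # placeholder gating the actual weight update
)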
    -
    [docs]def _clip_gradients_by_norm(grads_and_vars, clip_gradients): - """Clips gradients by global norm.""" - gradients, variables = zip(*grads_and_vars) - clipped_gradients, _ = clip_ops.clip_by_global_norm(gradients, clip_gradients) - return list(zip(clipped_gradients, variables))
    - - -
    [docs]def _adaptive_max_norm(norm, std_factor, decay, global_step, epsilon, name): - """Find max_norm given norm and previous average.""" - with vs.variable_scope(name, "AdaptiveMaxNorm", [norm]): - log_norm = math_ops.log(norm + epsilon) - - def moving_average(name, value, decay): - moving_average_variable = vs.get_variable( - name, - shape=value.get_shape(), - dtype=value.dtype, - initializer=init_ops.zeros_initializer(), - trainable=False) - return moving_averages.assign_moving_average( - moving_average_variable, value, decay, zero_debias=False) - - # quicker adaptation at the beginning - if global_step is not None: - n = math_ops.to_float(global_step) - decay = math_ops.minimum(decay, n / (n + 1.)) - - # update averages - mean = moving_average("mean", log_norm, decay) - sq_mean = moving_average("sq_mean", math_ops.square(log_norm), decay) - - variance = sq_mean - math_ops.square(mean) - std = math_ops.sqrt(math_ops.maximum(epsilon, variance)) - max_norms = math_ops.exp(mean + std_factor * std) - return max_norms, mean
    - - -
    [docs]def adaptive_clipping_fn(std_factor=2., - decay=0.95, - static_max_norm=None, - global_step=None, - report_summary=False, - epsilon=1e-8, - name=None): - """Adapt the clipping value using statistics on the norms. - - Implement adaptive gradient as presented in section 3.2.1 of - https://arxiv.org/abs/1412.1602. - - Keeps a moving average of the mean and std of the log(norm) of the gradient. - If the norm exceeds `exp(mean + std_factor*std)` then all gradients will be - rescaled such that the global norm becomes `exp(mean)`. - - Args: - std_factor: Python scaler (or tensor). - `max_norm = exp(mean + std_factor*std)` - decay: The smoothing factor of the moving averages. - static_max_norm: If provided, will threshold the norm to this value as an - extra safety. - global_step: Optional global_step. If provided, `decay = decay*n/(n+1)`. - This provides a quicker adaptation of the mean for the first steps. - report_summary: If `True`, will add histogram summaries of the `max_norm`. - epsilon: Small value chosen to avoid zero variance. - name: The name for this operation is used to scope operations and summaries. - - Returns: - A function for applying gradient clipping. - """ - - def gradient_clipping(grads_and_vars): - """Internal function for adaptive clipping.""" - grads, variables = zip(*grads_and_vars) +
    [docs]def post_process_gradients(grads_and_vars, summaries, lr, + clip_gradients, larc_params): + """Applies post processing to gradients, i.e. clipping, LARC, summaries.""" + if "global_gradient_norm" in summaries: + tf.summary.scalar( + "global_gradient_norm", + _global_norm_with_cast(grads_and_vars), + ) - norm = clip_ops.global_norm(grads) + # Optionally clip gradients by global norm. + if clip_gradients is not None: + grads_and_vars = _clip_gradients_by_norm(grads_and_vars, clip_gradients) - max_norm, log_mean = _adaptive_max_norm(norm, std_factor, decay, - global_step, epsilon, name) - - # reports the max gradient norm for debugging - if report_summary: - summary.scalar("global_norm/adaptive_max_gradient_norm", max_norm) - - # factor will be 1. if norm is smaller than max_norm - factor = array_ops.where(norm < max_norm, - array_ops.ones_like(norm), - math_ops.exp(log_mean) / norm) - - if static_max_norm is not None: - factor = math_ops.minimum(static_max_norm / norm, factor) + # Add histograms for variables, gradients and gradient norms. + for gradient, variable in grads_and_vars: + if isinstance(gradient, tf.IndexedSlices): + grad_values = gradient.values + else: + grad_values = gradient - # apply factor - clipped_grads = [] - for grad in grads: - if grad is None: - clipped_grads.append(None) - elif isinstance(grad, ops.IndexedSlices): - clipped_grads.append( - ops.IndexedSlices(grad.values * factor, grad.indices, - grad.dense_shape)) + if isinstance(variable, tf.IndexedSlices): + var_values = variable.values + else: + var_values = variable + + if grad_values is not None: + var_name = variable.name.replace(":", "_") + if "gradients" in summaries: + # need to mask nans for automatic loss scaling + tf.summary.histogram("gradients/%s" % var_name, mask_nans(grad_values)) + if "gradient_norm" in summaries: + tf.summary.scalar("gradient_norm/%s" % var_name, tf.norm(grad_values)) + if "variables" in summaries: + tf.summary.histogram("variables/%s" % var_name, var_values) + if "variable_norm" in summaries: + tf.summary.scalar("variable_norm/%s" % var_name, tf.norm(var_values)) + + if clip_gradients is not None and "global_gradient_norm" in summaries: + tf.summary.scalar( + "global_clipped_gradient_norm", + _global_norm_with_cast(grads_and_vars), + ) + + # LARC gradient re-scaling + if larc_params is not None: + check_params( + config=larc_params, + required_dict={'larc_eta': float}, + optional_dict={ + 'larc_mode': ['clip', 'scale'], + 'min_update': float, + 'epsilon': float + }, + ) + larc_eta = larc_params['larc_eta'] + larc_mode = larc_params.get('larc_mode', 'clip') + min_update = larc_params.get('min_update', 1e-7) + eps = larc_params.get('epsilon', 1e-7) + + grads_and_vars_larc = [None] * len(grads_and_vars) + for idx, (g, v) in enumerate(grads_and_vars): + var_dtype = v.dtype + v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2) + g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2) + + if larc_mode == 'clip': + larc_grad_update = tf.maximum( + larc_eta * v_norm / (lr * (g_norm + eps)), + min_update, + ) + if "larc_summaries" in summaries: + tf.summary.scalar('larc_clip_on/{}'.format(v.name), + tf.cast(tf.less(larc_grad_update, 1.0), tf.int32)) + larc_grad_update = tf.minimum(larc_grad_update, 1.0) else: - clipped_grads.append(grad * factor) - - return list(zip(clipped_grads, variables)) + larc_grad_update = tf.maximum( + larc_eta * v_norm / (g_norm + eps), + min_update, + ) + larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype) + grads_and_vars_larc[idx] = 
(larc_grad_update * g, v) + + # adding additional summary + if "larc_summaries" in summaries: + tf.summary.scalar('larc_grad_update/{}'.format(v.name), + larc_grad_update) + tf.summary.scalar("larc_final_lr/{}".format(v.name), + tf.cast(lr, var_dtype) * larc_grad_update) + grads_and_vars = grads_and_vars_larc + return grads_and_vars
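Restated as a formula (a description of the LARC branch above, not an extension of it): for each variable v with gradient g, learning rate lr and configured eta, epsilon and min_update, the 'clip' mode computes

\lambda_v = \min\Big(\max\Big(\frac{\eta\,\lVert v\rVert_2}{\mathrm{lr}\cdot(\lVert g\rVert_2 + \epsilon)},\ \mathrm{min\_update}\Big),\ 1\Big), \qquad g \leftarrow \lambda_v\, g

while the 'scale' mode uses \lambda_v = \max\big(\eta\,\lVert v\rVert_2 / (\lVert g\rVert_2 + \epsilon),\ \mathrm{min\_update}\big) with no division by the learning rate and no clamp to 1.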
    + + +def _global_norm_with_cast(grads_and_vars): + return tf.global_norm(list(map( + lambda x: tf.cast(x, tf.float32), + list(zip(*grads_and_vars))[0]) + )) - return gradient_clipping
    - -
    [docs]def _add_scaled_noise_to_gradients(grads_and_vars, gradient_noise_scale): - """Adds scaled noise from a 0-mean normal distribution to gradients.""" +
    [docs]def _clip_gradients_by_norm(grads_and_vars, clip_gradients): + """Clips gradients by global norm.""" gradients, variables = zip(*grads_and_vars) - noisy_gradients = [] - for gradient in gradients: - if gradient is None: - noisy_gradients.append(None) - continue - if isinstance(gradient, ops.IndexedSlices): - gradient_shape = gradient.dense_shape - else: - gradient_shape = gradient.get_shape() - noise = random_ops.truncated_normal(gradient_shape) * gradient_noise_scale - noisy_gradients.append(gradient + noise) - return list(zip(noisy_gradients, variables))
    - - -
    [docs]def _multiply_gradients(grads_and_vars, gradient_multipliers): - """Multiply specified gradients.""" - multiplied_grads_and_vars = [] - for grad, var in grads_and_vars: - if grad is not None and \ - (var in gradient_multipliers or var.name in gradient_multipliers): - key = var if var in gradient_multipliers else var.name - multiplier = constant_op.constant( - gradient_multipliers[key], dtype=dtypes.float32) - if isinstance(grad, ops.IndexedSlices): - grad_values = grad.values * multiplier - grad = ops.IndexedSlices(grad_values, grad.indices, grad.dense_shape) - else: - grad *= multiplier - multiplied_grads_and_vars.append((grad, var)) - return multiplied_grads_and_vars
    - - -
    [docs]def _multiply_gradients_const(grads_and_vars, multiplier): - """Multiply specified gradients.""" - multiplied_grads_and_vars = [] - for grad, var in grads_and_vars: - if grad is not None: - if isinstance(grad, ops.IndexedSlices): - grad_values = grad.values * multiplier - grad = ops.IndexedSlices(grad_values, grad.indices, grad.dense_shape) - else: - grad *= multiplier - multiplied_grads_and_vars.append((grad, var)) - return multiplied_grads_and_vars
    + clipped_gradients, _ = tf.clip_by_global_norm(gradients, clip_gradients) + return list(zip(clipped_gradients, variables))
    diff --git a/docs/html/_modules/parts/cnns/conv_blocks.html b/docs/html/_modules/parts/cnns/conv_blocks.html new file mode 100644 index 000000000..fc1db4850 --- /dev/null +++ b/docs/html/_modules/parts/cnns/conv_blocks.html @@ -0,0 +1,327 @@ + + + + + + + + + + + parts.cnns.conv_blocks — OpenSeq2Seq 0.2 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +

    Source code for parts.cnns.conv_blocks

    +# Copyright (c) 2018 NVIDIA Corporation
    +from __future__ import absolute_import
    +from __future__ import division
    +from __future__ import print_function
    +from __future__ import unicode_literals
    +from six.moves import range
    +
    +import tensorflow as tf
    +
    +
    +
    [docs]def conv_actv(type, name, inputs, filters, kernel_size, activation_fn, strides, + padding, regularizer, training, data_format): + """Helper function that applies convolution and activation. + + Args: + type: the following types are supported + 'conv1d', 'conv2d' + """ + if type == "conv1d": + layer = tf.layers.conv1d + elif type == "conv2d": + layer = tf.layers.conv2d + + conv = layer( + name="{}".format(name), + inputs=inputs, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + kernel_regularizer=regularizer, + use_bias=False, + data_format=data_format, + ) + + output = conv + if activation_fn is not None: + output = activation_fn(output) + return output
    + + +
    [docs]def conv_bn_actv(type, name, inputs, filters, kernel_size, activation_fn, + strides, padding, regularizer, training, data_format, + bn_momentum, bn_epsilon): + """Helper function that applies convolution, batch norm and activation. + Accepts inputs in 'channels_last' format only. + + Args: + type: the following types are supported + 'conv1d', 'conv2d' + """ + if type == "conv1d": + layer = tf.layers.conv1d + elif type == "conv2d": + layer = tf.layers.conv2d + + conv = layer( + name="{}".format(name), + inputs=inputs, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + kernel_regularizer=regularizer, + use_bias=False, + data_format=data_format, + ) + + # trick to make batchnorm work for mixed precision training. + # To-Do check if batchnorm works smoothly for >4 dimensional tensors + squeeze = False + if type == "conv1d": + conv = tf.expand_dims(conv, axis=1) # NWC --> NHWC + squeeze = True + + bn = tf.layers.batch_normalization( + name="{}/bn".format(name), + inputs=conv, + gamma_regularizer=regularizer, + training=training, + axis=-1 if data_format == 'channels_last' else 1, + momentum=bn_momentum, + epsilon=bn_epsilon, + ) + + if squeeze: + bn = tf.squeeze(bn, axis=1) + + output = bn + if activation_fn is not None: + output = activation_fn(output) + return output
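A usage sketch of the helper above (the input tensor, regularizer scale and every hyper-parameter value are illustrative assumptions):

# `audio_features` is a [batch, time, channels] float tensor, `is_training` a bool
conv1 = conv_bn_actv(
    type="conv1d",
    name="conv1",
    inputs=audio_features,
    filters=256,
    kernel_size=11,
    activation_fn=tf.nn.relu,
    strides=2,
    padding="SAME",
    regularizer=tf.contrib.layers.l2_regularizer(0.0005),
    training=is_training,
    data_format="channels_last",
    bn_momentum=0.99,
    bn_epsilon=1e-3,
)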
    + +
    + +
    + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/html/_modules/parts/convs2s/attention_wn_layer.html b/docs/html/_modules/parts/convs2s/attention_wn_layer.html new file mode 100644 index 000000000..ee507bc84 --- /dev/null +++ b/docs/html/_modules/parts/convs2s/attention_wn_layer.html @@ -0,0 +1,325 @@ + + + + + + + + + + + parts.convs2s.attention_wn_layer — OpenSeq2Seq 0.2 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + +

    Source code for parts.convs2s.attention_wn_layer

    +"""Implementation of the attention layer for convs2s.
+Inspired by https://github.com/tobyyouup/conv_seq2seq"""
    +
    +from __future__ import absolute_import
    +from __future__ import division
    +from __future__ import print_function
    +from __future__ import unicode_literals
    +
    +import tensorflow as tf
    +import math
    +from open_seq2seq.parts.convs2s.ffn_wn_layer import FeedFowardNetworkNormalized
    +
    +
    +
    [docs]class AttentionLayerNormalized(tf.layers.Layer): + """Attention layer for convs2s with weight normalization""" + +
    [docs] def __init__(self, in_dim, embed_size, layer_id, add_res): + """initializes the attention layer. + It uses weight normalization for linear projections + (Salimans & Kingma, 2016) w = g * v/2-norm(v) + + Args: + in_dim: int last dimension of the inputs + embed_size: int target embedding size + layer_id: int the id of current convolution layer + add_res: bool whether residual connection should be added or not + """ + super(AttentionLayerNormalized, self).__init__() + + self.add_res = add_res + with tf.variable_scope("attention_layer_" + str(layer_id)): + + # linear projection layer to project the attention input to target space + self.tgt_embed_proj = FeedFowardNetworkNormalized( + in_dim, + embed_size, + dropout=1.0, + var_scope_name="att_linear_mapping_tgt_embed") + + # linear projection layer to project back to the input space + self.out_proj = FeedFowardNetworkNormalized( + embed_size, + in_dim, + dropout=1.0, + var_scope_name="att_linear_mapping_out")
    + +
    [docs] def call(self, input, target_embed, encoder_output_a, encoder_output_b, + input_attention_bias): + """Calculates the attention vectors. + + Args: + input: A float32 tensor with shape [batch_size, length, in_dim] + target_embed: A float32 tensor with shape [batch_size, length, in_dim] + containing the target embeddings + encoder_output_a: A float32 tensor with shape [batch_size, length, out_dim] + containing the first encoder outputs, uses as the keys + encoder_output_b: A float32 tensor with shape [batch_size, length, src_emb_dim] + containing the second encoder outputs, uses as the values + input_attention_bias: A float32 tensor with shape [batch_size, length, 1] + containing the bias used to mask the paddings + + Returns: + float32 tensor with shape [batch_size, length, out_dim]. + """ + + h_proj = self.tgt_embed_proj(input) + d_proj = (h_proj + target_embed) * math.sqrt(0.5) + att_score = tf.matmul(d_proj, encoder_output_a, transpose_b=True) + + # Masking need to be done in float32. Added to support mixed-precision training. + att_score = tf.cast(x=att_score, dtype=tf.float32) + + # mask out the paddings + if input_attention_bias is not None: + att_score = att_score + input_attention_bias + + att_score = tf.nn.softmax(att_score) + + # Cast back to original type + att_score = tf.cast(x=att_score, dtype=encoder_output_b.dtype) + + length = tf.cast(tf.shape(encoder_output_b), encoder_output_b.dtype) + output = tf.matmul(att_score, encoder_output_b) * \ + length[1] * tf.cast(tf.sqrt(1.0 / length[1]), dtype=encoder_output_b.dtype) + output = self.out_proj(output) + + if self.add_res: + output = (output + input) * math.sqrt(0.5) + + return output
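Schematically (restating the code above, with the biases of the weight-normalized projections omitted): with h the layer input, e the target embedding, K = encoder_output_a, V = encoder_output_b, b_mask the attention bias and L the source length,

d = (W_t h + e)\,\sqrt{0.5}, \qquad a = \mathrm{softmax}(d K^{\top} + b_{mask}), \qquad c = (a V)\, L\, \sqrt{1/L}, \qquad \mathrm{out} = W_o\, c

followed by the optional residual out \leftarrow (\mathrm{out} + h)\sqrt{0.5} when add_res is set; note that L\sqrt{1/L} simplifies to \sqrt{L}.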
    + +
    + +
    + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/html/_modules/parts/convs2s/conv_wn_layer.html b/docs/html/_modules/parts/convs2s/conv_wn_layer.html new file mode 100644 index 000000000..39bf548d9 --- /dev/null +++ b/docs/html/_modules/parts/convs2s/conv_wn_layer.html @@ -0,0 +1,338 @@ + + + + + + + + + + + parts.convs2s.conv_wn_layer — OpenSeq2Seq 0.2 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + +

    Source code for parts.convs2s.conv_wn_layer

    +"""Implementation of a 1d convolutional layer with weight normalization.
+Inspired by https://github.com/tobyyouup/conv_seq2seq"""
    +
    +from __future__ import absolute_import
    +from __future__ import division
    +from __future__ import print_function
    +from __future__ import unicode_literals
    +
    +import tensorflow as tf
    +import math
    +
    +
    +
    [docs]class Conv1DNetworkNormalized(tf.layers.Layer): + """1D convolutional layer with weight normalization""" + +
    [docs] def __init__(self, in_dim, out_dim, kernel_width, mode, layer_id, + hidden_dropout, conv_padding, decode_padding): + """initializes the 1D convolution layer. + It uses weight normalization (Salimans & Kingma, 2016) w = g * v/2-norm(v) + + Args: + in_dim: int last dimension of the inputs + out_dim: int new dimension for the output + kernel_width: int width of kernel + mode: str the current mode + layer_id: int the id of current convolution layer + hidden_dropout: float the keep-dropout value used on the input. + Give 1.0 if no dropout. + It is used to initialize the weights of convolution. + conv_padding: str the type of padding done for convolution + decode_padding: bool specifies if this convolution layer is in decoder or not + in decoder padding is done explicitly before convolution + """ + + super(Conv1DNetworkNormalized, self).__init__() + self.mode = mode + self.conv_padding = conv_padding + self.decode_padding = decode_padding + self.hidden_dropout = hidden_dropout + self.kernel_width = kernel_width + + with tf.variable_scope("conv_layer_" + str(layer_id)): + V_std = math.sqrt(4.0 * hidden_dropout / (kernel_width * in_dim)) + self.V = tf.get_variable( + 'V', + shape=[kernel_width, in_dim, 2 * out_dim], + initializer=tf.random_normal_initializer(mean=0, stddev=V_std), + trainable=True) + self.V_norm = tf.norm(self.V.initialized_value(), axis=[0, 1]) + self.g = tf.get_variable('g', initializer=self.V_norm, trainable=True) + self.b = tf.get_variable( + 'b', + shape=[2 * out_dim], + initializer=tf.zeros_initializer(), + trainable=True) + + self.W = tf.reshape(self.g, [1, 1, 2 * out_dim]) * tf.nn.l2_normalize( + self.V, [0, 1])
    + +
    [docs] def call(self, input): + """Applies convolution with gated linear units on x. + + Args: + x: A float32 tensor with shape [batch_size, length, in_dim] + + Returns: + float32 tensor with shape [batch_size, length, out_dim]. + """ + x = input + if self.mode == "train": + x = tf.nn.dropout(x, self.hidden_dropout) + + if self.decode_padding: + x = tf.pad( + x, [[0, 0], [self.kernel_width - 1, self.kernel_width - 1], [0, 0]], + "CONSTANT") + + output = tf.nn.bias_add( + tf.nn.conv1d( + value=x, filters=self.W, stride=1, padding=self.conv_padding), + self.b) + + if self.decode_padding and self.kernel_width > 1: + output = output[:, 0:-self.kernel_width + 1, :] + + output = self.gated_linear_units(output) + + return output
    + +
    [docs] def gated_linear_units(self, inputs): + """Gated Linear Units (GLU) on x. + + Args: + x: A float32 tensor with shape [batch_size, length, 2*out_dim] + Returns: + float32 tensor with shape [batch_size, length, out_dim]. + """ + input_shape = inputs.get_shape().as_list() + assert len(input_shape) == 3 + input_pass = inputs[:, :, 0:int(input_shape[2] / 2)] + input_gate = inputs[:, :, int(input_shape[2] / 2):] + input_gate = tf.sigmoid(input_gate) + return tf.multiply(input_pass, input_gate)
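The split-and-multiply above is a gated linear unit: with the last dimension of size 2d,

\mathrm{GLU}(x) = x_{[\ldots,\,0:d]} \odot \sigma\big(x_{[\ldots,\,d:2d]}\big)

which is exactly what the slicing followed by tf.sigmoid and tf.multiply implements.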
    +
    + +
    + +
    + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/html/_modules/parts/convs2s/ffn_wn_layer.html b/docs/html/_modules/parts/convs2s/ffn_wn_layer.html new file mode 100644 index 000000000..dca35bdc1 --- /dev/null +++ b/docs/html/_modules/parts/convs2s/ffn_wn_layer.html @@ -0,0 +1,303 @@ + + + + + + + + + + + parts.convs2s.ffn_wn_layer — OpenSeq2Seq 0.2 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + +

    Source code for parts.convs2s.ffn_wn_layer

    +"""Implementation of fully connected network with weight normalization.
+Inspired by https://github.com/tobyyouup/conv_seq2seq"""
    +
    +from __future__ import absolute_import
    +from __future__ import division
    +from __future__ import print_function
    +from __future__ import unicode_literals
    +
    +import tensorflow as tf
    +import math
    +
    +
    +
    [docs]class FeedFowardNetworkNormalized(tf.layers.Layer): + """Fully connected feedforward network with weight normalization""" + +
    [docs] def __init__(self, in_dim, out_dim, dropout, var_scope_name): + """initializes the linear layer. + This layer projects from in_dim-dimenstional space to out_dim-dimentional space. + It uses weight normalization (Salimans & Kingma, 2016) w = g * v/2-norm(v) + + Args: + in_dim: int last dimension of the inputs + out_dim: int new dimension for the output + dropout: float the keep-dropout value used in the previous layer. + It is used to initialize the weights. Give 1.0 if no dropout. + var_scope_name: str the scope name for the weight variables + """ + super(FeedFowardNetworkNormalized, self).__init__() + self.out_dim = out_dim + self.in_dim = in_dim + + with tf.variable_scope(var_scope_name): + V_initializer = \ + tf.random_normal_initializer(mean=0, stddev=math.sqrt(dropout * 1.0 / in_dim)) + self.V = tf.get_variable( + 'V', + shape=[in_dim, out_dim], + initializer=V_initializer, + trainable=True) + self.V_norm = tf.norm(self.V.initialized_value(), axis=0) + self.g = tf.get_variable('g', initializer=self.V_norm, trainable=True) + self.b = tf.get_variable( + 'b', + shape=[out_dim], + initializer=tf.zeros_initializer(), + trainable=True)
    + +
    [docs] def call(self, x): + """Projects x with its linear transformation. + + Args: + x: A float32 tensor with shape [batch_size, length, in_dim] + + Returns: + float32 tensor with shape [batch_size, length, out_dim]. + """ + batch_size = tf.shape(x)[0] + + x = tf.reshape(x, [-1, self.in_dim]) + output = tf.matmul(x, self.V) + output = tf.reshape(output, [batch_size, -1, self.out_dim]) + + # x*(v*(g/2-norm(v))) + b + scaler = tf.div(self.g, tf.norm(self.V, axis=0)) + output = tf.reshape(scaler, [1, self.out_dim]) * output + \ + tf.reshape(self.b, [1, self.out_dim]) + + return output
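Per output unit j, the projection above is standard weight normalization (this restates the code and its inline comment):

y_j = \frac{g_j}{\lVert V_{:,j}\rVert_2}\,(x \cdot V_{:,j}) + b_j

so the direction of each column of V is decoupled from its learned scale g_j.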
    +
    + +
    + +
    + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/html/_modules/parts/rnns/utils.html b/docs/html/_modules/parts/rnns/utils.html index 0752d0e09..0406f0b3e 100644 --- a/docs/html/_modules/parts/rnns/utils.html +++ b/docs/html/_modules/parts/rnns/utils.html @@ -166,137 +166,32 @@

    Source code for parts.rnns.utils

     import tensorflow as tf
     
     
    -
    [docs]def create_rnn_cell(cell_type, - cell_params, - num_layers=1, - dp_input_keep_prob=1.0, - dp_output_keep_prob=1.0, - residual_connections=False, - wrap_to_multi_rnn=True): - """ - TODO: MOVE THIS properly to utils. Write doc - :param cell_type: - :param cell_params: - :param num_layers: - :param dp_input_keep_prob: - :param dp_output_keep_prob: - :param residual_connections: - :return: - """ - def single_cell(cell_params): - # TODO: This method is ugly - redo - size = cell_params["num_units"] - proj_size = None if "proj_size" not in cell_params else cell_params["proj_size"] - - if cell_type == "lstm": - if not residual_connections: - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return tf.nn.rnn_cell.LSTMCell(num_units=size, - num_proj=proj_size, - forget_bias=1.0) - else: - return DropoutWrapper(tf.nn.rnn_cell.LSTMCell(num_units=size, - num_proj=proj_size, - forget_bias=1.0), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob) - else: # residual connection required - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return ResidualWrapper(tf.nn.rnn_cell.LSTMCell(num_units=size, - num_proj=proj_size, - forget_bias=1.0)) - else: - return ResidualWrapper(DropoutWrapper( - tf.nn.rnn_cell.LSTMCell( - num_units=size, - num_proj=proj_size, - forget_bias=1.0, - ), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob, - )) - elif cell_type == "gru": - if not residual_connections: - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return tf.nn.rnn_cell.GRUCell(num_units=size) - else: - return DropoutWrapper( - tf.nn.rnn_cell.GRUCell(num_units=size), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob, - ) - else: # residual connection required - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return ResidualWrapper(tf.nn.rnn_cell.GRUCell(num_units=size)) - else: - return ResidualWrapper(DropoutWrapper( - tf.nn.rnn_cell.GRUCell(num_units=size), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob), - ) - elif cell_type == "glstm": - num_groups = cell_params["num_groups"] - if not residual_connections: - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return GLSTMCell(num_units=size, - number_of_groups=num_groups, - num_proj=proj_size, - forget_bias=1.0) - else: - return DropoutWrapper(GLSTMCell(num_units=size, - number_of_groups=num_groups, - num_proj=proj_size, - forget_bias=1.0), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob) - else: # residual connection required - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return ResidualWrapper(GLSTMCell(num_units=size, - number_of_groups=num_groups, - num_proj=proj_size, - forget_bias=1.0)) - else: - return ResidualWrapper(DropoutWrapper( - GLSTMCell( - num_units=size, - number_of_groups=num_groups, - num_proj=proj_size, - forget_bias=1.0, - ), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob, - )) - elif cell_type == "slstm": - if not residual_connections: - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return BasicSLSTMCell(num_units=size) - else: - return DropoutWrapper(BasicSLSTMCell(num_units=size), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob - ) - else: # residual connection required - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return ResidualWrapper(BasicSLSTMCell(num_units=size)) - else: - return 
ResidualWrapper(DropoutWrapper( - BasicSLSTMCell(num_units=size), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob, - )) - else: - raise ValueError("Unknown RNN cell class: {}".format(cell_type)) - - if num_layers > 1: - if wrap_to_multi_rnn: - return MultiRNNCell([single_cell(cell_params) for _ in range(num_layers)]) - else: - cells = [] # for GNMT-like attention in decoder - for i in range(num_layers): - cells.append(single_cell(cell_params)) - return cells - else: - return single_cell(cell_params)
    +
[docs]def single_cell(cell_class,
+                cell_params,
+                dp_input_keep_prob=1.0,
+                dp_output_keep_prob=1.0,
+                residual_connections=False):
+  """Creates an instance of the rnn cell.
+  Such a cell describes one step of one layer and can include a residual
+  connection and/or dropout.
+
+  Args:
+    cell_class: Tensorflow RNN cell class
+    cell_params (dict): cell parameters
+    dp_input_keep_prob (float): (default: 1.0) input dropout keep probability
+    dp_output_keep_prob (float): (default: 1.0) output dropout keep probability
+    residual_connections (bool): whether to add residual connection
+
+  Returns:
+    TF RNN instance
+  """
+  cell = cell_class(**cell_params)
+  if residual_connections:
+    cell = ResidualWrapper(cell)
+  if dp_input_keep_prob != 1.0 or dp_output_keep_prob != 1.0:
+    cell = DropoutWrapper(cell, input_keep_prob=dp_input_keep_prob,
+                          output_keep_prob=dp_output_keep_prob)
+  return cell
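A minimal usage sketch (not part of this patch): stacking single_cell instances into a MultiRNNCell, roughly what the removed create_rnn_cell did with wrap_to_multi_rnn=True. The import path and the parameter values below are assumptions made for illustration.

    import tensorflow as tf
    from open_seq2seq.parts.rnns.utils import single_cell  # assumed module path

    cell_params = {"num_units": 512}
    # build a 2-layer LSTM stack with output dropout and a residual
    # connection on the second layer
    cells = [
        single_cell(tf.nn.rnn_cell.LSTMCell, cell_params,
                    dp_output_keep_prob=0.8,
                    residual_connections=(i > 0))
        for i in range(2)
    ]
    multi_cell = tf.nn.rnn_cell.MultiRNNCell(cells)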
diff --git a/docs/html/_modules/parts/transformer/beam_search_test.html b/docs/html/_modules/parts/transformer/beam_search_test.html
deleted file mode 100644
index 3c9e88cfe..000000000
--- a/docs/html/_modules/parts/transformer/beam_search_test.html
+++ /dev/null
@@ -1,337 +0,0 @@
-parts.transformer.beam_search_test — OpenSeq2Seq 0.2 documentation

    Source code for parts.transformer.beam_search_test

    -# Copyright 2018 MLBenchmark Group. All Rights Reserved.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -# ==============================================================================
    -"""Test beam search helper methods."""
    -
    -from __future__ import absolute_import
    -from __future__ import division
    -from __future__ import print_function
    -
    -import tensorflow as tf
    -
    -from . import beam_search
    -#import beam_search
    -
    -
    -
    [docs]class BeamSearchHelperTests(tf.test.TestCase): - -
    [docs] def test_expand_to_beam_size(self): - x = tf.ones([7, 4, 2, 5]) - x = beam_search._expand_to_beam_size(x, 3) - with self.test_session() as sess: - shape = sess.run(tf.shape(x)) - self.assertAllEqual([7, 3, 4, 2, 5], shape)
    - -
    [docs] def test_shape_list(self): - y = tf.constant(4.0) - x = tf.ones([7, tf.to_int32(tf.sqrt(y)), 2, 5]) - shape = beam_search._shape_list(x) - self.assertIsInstance(shape[0], int) - self.assertIsInstance(shape[1], tf.Tensor) - self.assertIsInstance(shape[2], int) - self.assertIsInstance(shape[3], int)
    - -
    [docs] def test_get_shape_keep_last_dim(self): - y = tf.constant(4.0) - x = tf.ones([7, tf.to_int32(tf.sqrt(y)), 2, 5]) - shape = beam_search._get_shape_keep_last_dim(x) - self.assertAllEqual([None, None, None, 5], - shape.as_list())
    - -
    [docs] def test_flatten_beam_dim(self): - x = tf.ones([7, 4, 2, 5]) - x = beam_search._flatten_beam_dim(x) - with self.test_session() as sess: - shape = sess.run(tf.shape(x)) - self.assertAllEqual([28, 2, 5], shape)
    - -
    [docs] def test_unflatten_beam_dim(self): - x = tf.ones([28, 2, 5]) - x = beam_search._unflatten_beam_dim(x, 7, 4) - with self.test_session() as sess: - shape = sess.run(tf.shape(x)) - self.assertAllEqual([7, 4, 2, 5], shape)
    - -
    [docs] def test_gather_beams(self): - x = tf.reshape(tf.range(24), [2, 3, 4]) - # x looks like: [[[ 0 1 2 3] - # [ 4 5 6 7] - # [ 8 9 10 11]] - # - # [[12 13 14 15] - # [16 17 18 19] - # [20 21 22 23]]] - - y = beam_search._gather_beams(x, [[1, 2], [0, 2]], 2, 2) - with self.test_session() as sess: - y = sess.run(y) - - self.assertAllEqual([[[4, 5, 6, 7], - [8, 9, 10, 11]], - [[12, 13, 14, 15], - [20, 21, 22, 23]]], - y)
    - -
    [docs] def test_gather_topk_beams(self): - x = tf.reshape(tf.range(24), [2, 3, 4]) - x_scores = [[0, 1, 1], [1, 0, 1]] - - y = beam_search._gather_topk_beams(x, x_scores, 2, 2) - with self.test_session() as sess: - y = sess.run(y) - - self.assertAllEqual([[[4, 5, 6, 7], - [8, 9, 10, 11]], - [[12, 13, 14, 15], - [20, 21, 22, 23]]], - y)
    - - -if __name__ == "__main__": - tf.test.main() -
\ No newline at end of file
diff --git a/docs/html/_modules/parts/transformer/embedding_layer.html b/docs/html/_modules/parts/transformer/embedding_layer.html
index f51ff5d3c..5e5cef04e 100644
--- a/docs/html/_modules/parts/transformer/embedding_layer.html
+++ b/docs/html/_modules/parts/transformer/embedding_layer.html
@@ -181,23 +181,31 @@

    Source code for parts.transformer.embedding_layer

[docs]class EmbeddingSharedWeights(tf.layers.Layer):
   """Calculates input embeddings and pre-softmax linear with shared weights."""

-  def __init__(self, vocab_size, hidden_size, pad2eight=False):
+  def __init__(self, vocab_size, hidden_size, pad_vocab_to_eight=False, init_var=None,
+               embed_scale=True, pad_sym=0, mask_paddings=True):
     super(EmbeddingSharedWeights, self).__init__()
-    self.vocab_size = vocab_size
+    self.hidden_size = hidden_size
+    self.embed_scale = embed_scale
+    self.pad_sym = pad_sym
+    self.mask_paddings = mask_paddings
+    padf = lambda x: x if x % 8 == 0 else x + 8 - x % 8
-    if pad2eight:
-      self.hidden_size = padf(hidden_size)
+    if pad_vocab_to_eight:
+      self.vocab_size = padf(vocab_size)
+    else:
+      self.vocab_size = vocab_size
+
+    if init_var is None:
+      self.init_var = hidden_size ** -0.5
     else:
-      self.hidden_size = hidden_size
+      self.init_var = init_var
[docs]  def build(self, _):
     with tf.variable_scope("embedding_and_softmax", reuse=tf.AUTO_REUSE):
       # Create and initialize weights. The random normal initializer was chosen
       # randomly, and works well.
-      self.shared_weights = tf.get_variable(
-          "weights", [self.vocab_size, self.hidden_size],
-          initializer=tf.random_normal_initializer(
-              0., self.hidden_size ** -0.5))
+      self.shared_weights = tf.get_variable("weights", [self.vocab_size, self.hidden_size],
+                                            initializer=tf.random_normal_initializer(0., self.init_var))
     self.built = True
    @@ -213,18 +221,18 @@

    Source code for parts.transformer.embedding_layer

    """ with tf.name_scope("embedding"): embeddings = tf.gather(self.shared_weights, x) - - # Scale embedding by the sqrt of the hidden size - embeddings *= self.hidden_size ** 0.5 - - # Create binary array of size [batch_size, length] - # where 1 = padding, 0 = not padding - padding = model_utils.get_padding(x) - - # Set all padding embedding values to 0 - #embeddings *= tf.expand_dims(1 - padding, -1) - embeddings *= tf.cast(tf.expand_dims(1 - padding, -1), - dtype=embeddings.dtype) + if self.embed_scale: + # Scale embedding by the sqrt of the hidden size + embeddings *= self.hidden_size ** 0.5 + + if self.mask_paddings: + # Create binary array of size [batch_size, length] + # where 1 = padding, 0 = not padding + padding = model_utils.get_padding(x, padding_value=self.pad_sym) + + # Set all padding embedding values to 0 + #embeddings *= tf.expand_dims(1 - padding, -1) + embeddings *= tf.cast(tf.expand_dims(1.0 - padding, -1), dtype=embeddings.dtype) return embeddings
    [docs] def linear(self, x): diff --git a/docs/html/_modules/parts/transformer/utils.html b/docs/html/_modules/parts/transformer/utils.html index 14c3a5604..166eeefe4 100644 --- a/docs/html/_modules/parts/transformer/utils.html +++ b/docs/html/_modules/parts/transformer/utils.html @@ -230,23 +230,23 @@

    Source code for parts.transformer.utils

       return decoder_bias
    -
-[docs]def get_padding(x, padding_value=0):
+[docs]def get_padding(x, padding_value=0, dtype=tf.float32):
   """Return float tensor representing the padding values in x.

   Args:
     x: int tensor with any shape
     padding_value: int value that is used as padding
+    dtype: type of the output

   Returns:
     float tensor with same shape as x containing values 0 or 1.
       0 -> non-padding, 1 -> padding
   """
   with tf.name_scope("padding"):
-    return tf.to_float(tf.equal(x, padding_value))
-    #return tf.cast(tf.equal(x, padding_value), dtype=x.dtype)
+    return tf.cast(tf.equal(x, padding_value), dtype=dtype)
    -
    [docs]def get_padding_bias(x): +
    [docs]def get_padding_bias(x, res_rank=4, pad_sym=0): """Calculate bias tensor from padding values in tensor. Bias tensor that is added to the pre-softmax multi-headed attention logits, @@ -255,17 +255,25 @@

    Source code for parts.transformer.utils

     
       Args:
         x: int tensor with shape [batch_size, length]
    +    res_rank: int indicates the rank of attention_bias.
    +    dtype: type of the output attention_bias
    +    pad_sym: int the symbol used for padding
     
       Returns:
    -    Attention bias tensor of shape [batch_size, 1, 1, length].
    +    Attention bias tensor of shape
    +    [batch_size, 1, 1, length] if  res_rank = 4 - for Transformer
    +    or [batch_size, 1, length] if res_rank = 3 - for ConvS2S
       """
       with tf.name_scope("attention_bias"):
    -    padding = get_padding(x)
    +    padding = get_padding(x, padding_value=pad_sym)
         attention_bias = padding * _NEG_INF
    -    attention_bias = tf.expand_dims(
    -        tf.expand_dims(attention_bias, axis=1), axis=1)
    +    if res_rank == 4:
    +      attention_bias = tf.expand_dims(tf.expand_dims(attention_bias, axis=1), axis=1)
    +    elif res_rank == 3:
    +      attention_bias = tf.expand_dims(attention_bias, axis=1)
    +    else:
    +      raise ValueError("res_rank should be 3 or 4 but got {}".format(res_rank))
       return attention_bias
    -
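An illustrative sketch (not part of this patch; the module path is assumed) of the two output ranks of get_padding_bias described above:

    import tensorflow as tf
    from open_seq2seq.parts.transformer import utils as t_utils  # assumed path

    x = tf.constant([[5, 3, 0, 0],
                     [7, 0, 0, 0]])  # 0 is the padding symbol
    bias_t = t_utils.get_padding_bias(x)              # Transformer: [2, 1, 1, 4]
    bias_c = t_utils.get_padding_bias(x, res_rank=3)  # ConvS2S:     [2, 1, 4]

    with tf.Session() as sess:
      print(sess.run(tf.shape(bias_t)))  # [2 1 1 4]
      print(sess.run(tf.shape(bias_c)))  # [2 1 4]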
    diff --git a/docs/html/_modules/utils/funcs.html b/docs/html/_modules/utils/funcs.html index e18613100..28a2ac808 100644 --- a/docs/html/_modules/utils/funcs.html +++ b/docs/html/_modules/utils/funcs.html @@ -164,7 +164,8 @@

    Source code for utils.funcs

     
     from .hooks import PrintSamplesHook, RunEvaluationHook, PrintLossAndTimeHook, \
                        BroadcastGlobalVariablesHook
    -from open_seq2seq.utils.utils import deco_print, get_results_for_epoch
    +from open_seq2seq.utils.utils import deco_print, get_results_for_epoch, \
    +                                     collect_if_horovod
     from tensorflow.python import debug as tf_debug
     
     
    @@ -272,7 +273,17 @@ 

    Source code for utils.funcs

             break
           tm = time.time()
           try:
    -        fetches_vals = sess.run(fetches)
    +        feed_dict = {}
    +        iter_size = train_model.params.get('iter_size', 1)
    +        if iter_size > 1:
    +          feed_dict[train_model.skip_update_ph] = step % iter_size != 0
    +        if step % iter_size == 0:
    +          fetches_vals = sess.run(fetches, feed_dict)
    +        else:
    +          # necessary to skip "no-update" steps when iter_size > 1
    +          def run_with_no_hooks(step_context):
    +            return step_context.session.run(fetches, feed_dict)
    +          fetches_vals = sess.run_step_fn(run_with_no_hooks)
           except tf.errors.OutOfRangeError:
             break
           if step >= bench_start:
    @@ -280,29 +291,29 @@ 

    Source code for utils.funcs

             if len(fetches) > 1:
               for i in range(train_model.num_gpus):
                 total_objects += np.sum(fetches_vals[i + 1])
    +          if train_model.params['print_bench_info_steps'] is not None:
    +            if step % train_model.params['print_bench_info_steps'] == 0:
    +              total_objects_cur = collect_if_horovod(total_objects, hvd,
    +                                                     mode="sum")
    +              if master_worker:
    +                avg_objects = 1.0 * total_objects_cur / total_time
    +                deco_print("Avg objects per second: {:.3f}".format(avg_objects))
    +
           step += 1
     
    -  if hvd is not None:
    -    deco_print("Finished training on rank {}".format(hvd.rank()))
    -  else:
    -    deco_print("Finished training")
    +  if len(fetches) > 1:
    +    total_objects = collect_if_horovod(total_objects, hvd, mode="sum")
     
    -  if train_model.on_horovod:
    -    ending = " on worker {}".format(hvd.rank())
    -  else:
    -    ending = ""
    -  if step > bench_start:
    -    deco_print(
    -      "Avg time per step{}: {:.3f}s".format(
    -        ending, 1.0 * total_time / (step - bench_start))
    -    )
    -    if len(fetches) > 1:
    -      deco_print(
    -        "Avg objects per second{}: {:.3f}".format(
    -          ending, 1.0 * total_objects / total_time)
    -      )
    -  else:
    -    deco_print("Not enough steps for benchmarking{}".format(ending))
+  if master_worker:
+    deco_print("Finished training")
+    if step > bench_start:
+      avg_time = 1.0 * total_time / (step - bench_start)
+      deco_print("Avg time per step: {:.3f}s".format(avg_time))
+      if len(fetches) > 1:
+        avg_objects = 1.0 * total_objects / total_time
+        deco_print("Avg objects per second: {:.3f}".format(avg_objects))
+    else:
+      deco_print("Not enough steps for benchmarking")
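A tiny sketch (illustration only; the numbers are made up and the gradient-accumulation semantics of skip_update_ph are presumed from its name) of which steps in the hunk above take the hooked sess.run path versus the hook-free run_step_fn path when iter_size > 1:

    iter_size = 4
    for step in range(8):
        skip_update = step % iter_size != 0
        path = "run_step_fn (no hooks)" if skip_update else "sess.run (hooks fire)"
        print(step, "skip_update_ph =", skip_update, "->", path)
    # steps 0 and 4 run the hooked update step; steps 1-3 and 5-7
    # presumably only accumulate gradients without applying them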
    [docs]def restore_and_get_results(model, checkpoint, mode): diff --git a/docs/html/_modules/utils/hooks.html b/docs/html/_modules/utils/hooks.html index 60851a684..e4c74a11a 100644 --- a/docs/html/_modules/utils/hooks.html +++ b/docs/html/_modules/utils/hooks.html @@ -244,10 +244,10 @@

    Source code for utils.hooks

         self._timer.update_last_triggered_step(self._iter_count - 1)
     
         input_values, output_values = results
    -    dict_to_log = self._model.maybe_print_logs(input_values, output_values)
    +    dict_to_log = self._model.maybe_print_logs(input_values, output_values, step)
         # optionally logging to tensorboard any values
         # returned from maybe_print_logs
    -    if dict_to_log:
    +    if self._model.params['save_summaries_steps'] and dict_to_log:
           log_summaries_from_dict(
             dict_to_log,
             self._model.params['logdir'],
    @@ -348,11 +348,12 @@ 

    Source code for utils.hooks

         if not self._model.on_horovod or self._model.hvd.rank() == 0:
           deco_print("Validation loss: {:.4f}".format(total_loss), offset=4)
     
    -      dict_to_log = self._model.finalize_evaluation(results_per_batch)
    +      dict_to_log = self._model.finalize_evaluation(results_per_batch, step)
           dict_to_log['eval_loss'] = total_loss
     
           # saving the best validation model
    -      if total_loss < self._best_eval_loss:
    +      if self._model.params['save_checkpoint_steps'] and \
    +         total_loss < self._best_eval_loss:
             self._best_eval_loss = total_loss
             self._eval_saver.save(
               run_context.session,
    @@ -363,7 +364,7 @@ 

    Source code for utils.hooks

     
           # optionally logging to tensorboard any values
           # returned from maybe_print_logs
    -      if dict_to_log:
    +      if self._model.params['save_summaries_steps']:
             log_summaries_from_dict(
               dict_to_log,
               self._model.params['logdir'],
    diff --git a/docs/html/_modules/utils/utils.html b/docs/html/_modules/utils/utils.html
    index 412bf547e..1dc41f439 100644
    --- a/docs/html/_modules/utils/utils.html
    +++ b/docs/html/_modules/utils/utils.html
    @@ -159,6 +159,7 @@ 

    Source code for utils.utils

     from six.moves import range
     from six import string_types
     
    +import six
     import tensorflow as tf
     import subprocess
     import numpy as np
    @@ -186,6 +187,42 @@ 

    Source code for utils.utils

                                   dense_shape_clipped)
    +
    [docs]def collect_if_horovod(value, hvd, mode='sum'): + """Collects values from all workers if run on Horovod. + Note, that on all workers except first this function will return None. + + Args: + value: value to collect. + hvd: horovod.tensorflow module or None + mode: could be "sum", "mean" or "gather", indicating reduce_sum or gather. + For "sum" and "mean" value has to be numerical, for "gather", value has + to be iterable. + + Returns: + collected results if run on Horovod or value otherwise. + """ + if hvd is None: + return value + + import mpi4py.rc + mpi4py.rc.initialize = False + from mpi4py import MPI + + values = MPI.COMM_WORLD.gather(value) + # synchronize all workers + MPI.COMM_WORLD.Barrier() + + if MPI.COMM_WORLD.Get_rank() != 0: + return None + + if mode == 'sum': + return np.sum(values) + elif mode == 'mean': + return np.mean(values) + elif mode == 'gather': + return [item for sl in values for item in sl]
    + +
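A short usage sketch (illustration only): because collect_if_horovod returns None on every worker except rank 0, the caller only prints on the master worker. The variables total_objects, total_time and hvd are assumed to exist as in the training loop.

    total_objects = collect_if_horovod(total_objects, hvd, mode="sum")
    if total_objects is not None:  # None on all non-master workers
      deco_print("Avg objects per second: {:.3f}".format(
          total_objects / total_time))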
    [docs]def clip_last_batch(last_batch, true_size): last_batch_clipped = [] for val in last_batch: @@ -196,179 +233,172 @@

    Source code for utils.utils

       return last_batch_clipped
    -
    [docs]def iterate_data_layer(model, dl_id, sess, compute_loss, mode, verbose): +
    [docs]def iterate_data(model, sess, compute_loss, mode, verbose): total_time = 0.0 bench_start = model.params.get('bench_start', 10) results_per_batch = [] - if model.on_horovod: - data_layer = model.get_data_layer() - if compute_loss: - loss_tensor = model.eval_losses[0] - output_tensors = model.get_output_tensors() - else: - data_layer = model.get_data_layer(dl_id) - if compute_loss: - loss_tensor = model.eval_losses[dl_id] - output_tensors = model.get_output_tensors(dl_id) - - sess.run(data_layer.iterator.initializer) - - fetches = [ - data_layer.input_tensors, - output_tensors, - ] + size_defined = model.get_data_layer().get_size_in_samples() is not None + if size_defined: + dl_sizes = [] if compute_loss: - fetches.append(loss_tensor) total_loss = 0.0 - total_samples = 0.0 - size_defined = data_layer.get_size_in_samples() is not None + total_samples = [] + fetches = [] - if size_defined: - data_size = data_layer.get_size_in_samples() // \ - data_layer.params['batch_size'] - last_batch_size = data_layer.get_size_in_samples() % \ - data_layer.params['batch_size'] + # on horovod num_gpus is 1 + for worker_id in range(model.num_gpus): + cur_fetches = [ + model.get_data_layer(worker_id).input_tensors, + model.get_output_tensors(worker_id), + ] + if compute_loss: + cur_fetches.append(model.eval_losses[worker_id]) + if size_defined: + dl_sizes.append(model.get_data_layer(worker_id).get_size_in_samples()) + try: + total_objects = 0.0 + cur_fetches.append(model.get_num_objects_per_step(worker_id)) + except NotImplementedError: + total_objects = None + deco_print("WARNING: Can't compute number of objects per step, since " + "train model does not define get_num_objects_per_step method.") + fetches.append(cur_fetches) + total_samples.append(0.0) + + sess.run([model.get_data_layer(i).iterator.initializer + for i in range(model.num_gpus)]) + + step = 0 + processed_batches = 0 + if verbose: + if model.on_horovod: + ending = " on worker {}".format(model.hvd.rank()) + else: + ending = "" - if model.on_horovod: - worker_id = model.hvd.rank() - else: - worker_id = dl_id + while True: + tm = time.time() + fetches_vals = {} + if size_defined: + fetches_to_run = {} + # removing finished data layers + for worker_id in range(model.num_gpus): + if total_samples[worker_id] < dl_sizes[worker_id]: + fetches_to_run[worker_id] = fetches[worker_id] + fetches_vals = sess.run(fetches_to_run) + else: + # if size is not defined we have to process fetches sequentially, so not + # to lose data when exception is thrown on one data layer + for worker_id, one_fetch in enumerate(fetches): + try: + fetches_vals[worker_id] = sess.run(one_fetch) + except tf.errors.OutOfRangeError: + continue - cross_over = 0 - if size_defined: - if data_size == 0: - raise ValueError( - "Batch size is bigger than dataset size: {} > {}".format( - data_layer.params['batch_size'], data_layer.get_size_in_samples() - ) - ) - if last_batch_size != 0: - cross_over = 1 - else: - # setting data_size to be infinity and assume - # that tf.errors.OutOfRangeError will be raised - data_size = 1000000000000 + if step >= bench_start: + total_time += time.time() - tm - for step in range(data_size + cross_over): - tm = time.time() - try: + # looping over num_gpus. 
In Horovod case this loop is "dummy", + # since num_gpus = 1 + for worker_id, fetches_val in fetches_vals.items(): if compute_loss: - inputs, outputs, loss = sess.run(fetches) + inputs, outputs, loss = fetches_val[:3] else: - inputs, outputs = sess.run(fetches) - except tf.errors.OutOfRangeError: - break - if step >= bench_start: - total_time += time.time() - tm + inputs, outputs = fetches_val[:2] - # assuming any element of inputs["source_tensors"][ shape[0] is batch size - batch_size = inputs["source_tensors"][0].shape[0] + if total_objects is not None: + total_objects += np.sum(fetches_val[-1]) - if compute_loss: - total_loss += loss * batch_size - total_samples += batch_size + # assuming any element of inputs["source_tensors"] .shape[0] is batch size + batch_size = inputs["source_tensors"][0].shape[0] + total_samples[worker_id] += batch_size - if size_defined and step == data_size: - inputs["source_tensors"] = model.clip_last_batch( - inputs["source_tensors"], last_batch_size, - ) - if 'target_tensors' in inputs: - inputs["target_tensors"] = model.clip_last_batch( - inputs["target_tensors"], last_batch_size, - ) - outputs = model.clip_last_batch(outputs, last_batch_size) - - if mode == 'eval': - results_per_batch.append(model.evaluate(inputs, outputs)) - elif mode == 'infer': - results_per_batch.append(model.infer(inputs, outputs)) - else: - raise ValueError("Unknown mode: {}".format(mode)) + if size_defined: + # this data_layer is at the last batch with few more elements, cutting + if total_samples[worker_id] > dl_sizes[worker_id]: + last_batch_size = dl_sizes[worker_id] % batch_size + for key, value in inputs.items(): + inputs[key] = model.clip_last_batch(value, last_batch_size) + outputs = model.clip_last_batch(outputs, last_batch_size) + + processed_batches += 1 + + if compute_loss: + total_loss += loss * batch_size + + if mode == 'eval': + results_per_batch.append(model.evaluate(inputs, outputs)) + elif mode == 'infer': + results_per_batch.append(model.infer(inputs, outputs)) + else: + raise ValueError("Unknown mode: {}".format(mode)) if verbose: if size_defined: - if data_size > 10 and step % (data_size // 10) == 0: - deco_print("Processed {}/{} batches on worker {}".format( - step + 1, data_size, worker_id)) + data_size = int(np.sum(np.ceil(np.array(dl_sizes) / + model.params['batch_size_per_gpu']))) + if step == 0 or len(fetches_vals) == 0 or \ + (data_size > 10 and processed_batches % (data_size // 10) == 0): + deco_print("Processed {}/{} batches{}".format( + processed_batches, data_size, ending)) else: - deco_print("Processed {} batches".format(step + 1), end='\r') + deco_print("Processed {} batches{}".format(processed_batches, ending), + end='\r') + + if len(fetches_vals) == 0: + break + step += 1 if verbose: if step > bench_start: deco_print( - "Avg time per step: {:.3}s on worker {}".format( - 1.0 * total_time / (step - bench_start), worker_id), + "Avg time per step{}: {:.3}s".format( + ending, 1.0 * total_time / (step - bench_start)), ) + if total_objects is not None: + avg_objects = 1.0 * total_objects / total_time + deco_print("Avg objects per second{}: {:.3f}".format(ending, + avg_objects)) else: deco_print( - "Not enough steps for benchmarking on worker {}".format(worker_id) + "Not enough steps for benchmarking{}".format(ending) ) if compute_loss: - return results_per_batch, total_loss, total_samples + return results_per_batch, total_loss, np.sum(total_samples) else: return results_per_batch
    [docs]def get_results_for_epoch(model, sess, compute_loss, mode, verbose=False): - if model.on_horovod: - if compute_loss: - results_per_batch, total_loss, total_samples = iterate_data_layer( - model, 0, sess, compute_loss, mode, verbose, - ) - else: - results_per_batch = iterate_data_layer( - model, 0, sess, compute_loss, mode, verbose, - ) + if compute_loss: + results_per_batch, total_loss, total_samples = iterate_data( + model, sess, compute_loss, mode, verbose, + ) else: - results_per_batch_all = [] - total_loss_all = [] - total_samples_all = [] - for dl_id in range(model.num_gpus): - if compute_loss: - results_per_batch, total_loss, total_samples = iterate_data_layer( - model, dl_id, sess, compute_loss, mode, verbose, - ) - total_loss_all.append(total_loss) - total_samples_all.append(total_samples) - else: - results_per_batch = iterate_data_layer( - model, dl_id, sess, compute_loss, mode, verbose, - ) - results_per_batch_all.append(results_per_batch) + results_per_batch = iterate_data( + model, sess, compute_loss, mode, verbose, + ) - if model.on_horovod: - import mpi4py.rc - mpi4py.rc.initialize = False - from mpi4py import MPI + if compute_loss: + total_samples = collect_if_horovod(total_samples, model.hvd, 'sum') + total_loss = collect_if_horovod(total_loss, model.hvd, 'sum') + results_per_batch = collect_if_horovod(results_per_batch, model.hvd, 'gather') + if results_per_batch is None: + # returning dummy tuple of correct shape if not in master worker if compute_loss: - total_samples_all = MPI.COMM_WORLD.gather(total_samples) - total_loss_all = MPI.COMM_WORLD.gather(total_loss) - results_per_batch_all = MPI.COMM_WORLD.gather(results_per_batch) - - MPI.COMM_WORLD.Barrier() - if MPI.COMM_WORLD.Get_rank() != 0: - # returning dummy tuple of correct shape - if compute_loss: - return None, None - else: - return None - - if compute_loss: - total_loss = np.sum(total_loss_all) - total_samples = np.sum(total_samples_all) - # moving GPU dimension into the batch dimension - results_per_batch = [item for sl in results_per_batch_all for item in sl] + return None, None + else: + return None if compute_loss: - total_loss /= total_samples - return results_per_batch, total_loss - - return results_per_batch
    + return results_per_batch, total_loss / total_samples + else: + return results_per_batch
    [docs]def log_summaries_from_dict(dict_to_log, output_dir, step): @@ -441,7 +471,14 @@

    Source code for utils.utils

     
[docs]def nested_update(org_dict, upd_dict):
   for key, value in upd_dict.items():
     if isinstance(value, dict):
-      nested_update(org_dict[key], value)
+      if key in org_dict:
+        if not isinstance(org_dict[key], dict):
+          raise ValueError(
+              "Mismatch between org_dict and upd_dict at node {}".format(key)
+          )
+        nested_update(org_dict[key], value)
+      else:
+        org_dict[key] = value
     else:
       org_dict[key] = value
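A small sketch of the updated merge semantics: keys missing from org_dict are now created instead of raising a KeyError, while a type mismatch still raises ValueError. The config keys below are made up for illustration.

    org = {"optimizer_params": {"learning_rate": 0.001}}
    upd = {"optimizer_params": {"lr_policy_params": {"warmup_steps": 200}},
           "batch_size_per_gpu": 32}
    nested_update(org, upd)
    # org == {"optimizer_params": {"learning_rate": 0.001,
    #                              "lr_policy_params": {"warmup_steps": 200}},
    #         "batch_size_per_gpu": 32}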
    @@ -454,7 +491,10 @@

    Source code for utils.utils

     
     
     
[docs]def deco_print(line, offset=0, start="*** ", end='\n'):
-  print(start + " " * offset + line, end=end)
+  if six.PY2:
+    print((start + " " * offset + line).encode('utf-8'), end=end)
+  else:
+    print(start + " " * offset + line, end=end)
    [docs]def array_to_string(row, vocab, delim=' '): diff --git a/docs/html/_sources/api-docs/decoders.rst.txt b/docs/html/_sources/api-docs/decoders.rst.txt index 07c22c247..681f5d1e1 100644 --- a/docs/html/_sources/api-docs/decoders.rst.txt +++ b/docs/html/_sources/api-docs/decoders.rst.txt @@ -37,3 +37,11 @@ transformer\_decoders :members: :undoc-members: :show-inheritance: + +convs2s\_decoder +------------------------------------- + +.. automodule:: decoders.convs2s_decoder + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/html/_sources/api-docs/encoders.rst.txt b/docs/html/_sources/api-docs/encoders.rst.txt index 362287eb6..5c11d1f26 100644 --- a/docs/html/_sources/api-docs/encoders.rst.txt +++ b/docs/html/_sources/api-docs/encoders.rst.txt @@ -22,6 +22,14 @@ ds2\_encoder :undoc-members: :show-inheritance: +w2l\_encoder +---------------------------- + +.. automodule:: encoders.w2l_encoder + :members: + :undoc-members: + :show-inheritance: + rnn\_encoders ----------------------------- @@ -38,6 +46,14 @@ transformer\_encoders :undoc-members: :show-inheritance: +convs2s\_encoder +------------------------------------- + +.. automodule:: encoders.convs2s_encoder + :members: + :undoc-members: + :show-inheritance: + resnet\_encoder ---------------------------------- @@ -53,3 +69,12 @@ resnet\_blocks :members: :undoc-members: :show-inheritance: + + +cnn\_encoder +-------------------------------- + +.. automodule:: encoders.cnn_encoder + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/html/_sources/api-docs/parts.cnns.rst.txt b/docs/html/_sources/api-docs/parts.cnns.rst.txt new file mode 100644 index 000000000..631cb86c1 --- /dev/null +++ b/docs/html/_sources/api-docs/parts.cnns.rst.txt @@ -0,0 +1,15 @@ +cnns +======================================= + +.. automodule:: parts.cnns + :members: + :undoc-members: + :show-inheritance: + +conv\_blocks +------------------------------------------------------- + +.. automodule:: parts.cnns.conv_blocks + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/html/_sources/api-docs/parts.convs2s.rst.txt b/docs/html/_sources/api-docs/parts.convs2s.rst.txt new file mode 100644 index 000000000..226652c72 --- /dev/null +++ b/docs/html/_sources/api-docs/parts.convs2s.rst.txt @@ -0,0 +1,31 @@ +convs2s +======================================= + +.. automodule:: parts.convs2s + :members: + :undoc-members: + :show-inheritance: + +attention\_wn\_layer +------------------------------------------------------- + +.. automodule:: parts.convs2s.attention_wn_layer + :members: + :undoc-members: + :show-inheritance: + +conv\_wn\_layer +------------------------------------------------------- + +.. automodule:: parts.convs2s.conv_wn_layer + :members: + :undoc-members: + :show-inheritance: + +ffn\_wn\_layer +------------------------------------------------------- + +.. 
automodule:: parts.convs2s.ffn_wn_layer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/html/_sources/api-docs/parts.rst.txt b/docs/html/_sources/api-docs/parts.rst.txt index 3f85cb82a..6a57d9287 100644 --- a/docs/html/_sources/api-docs/parts.rst.txt +++ b/docs/html/_sources/api-docs/parts.rst.txt @@ -10,3 +10,5 @@ parts parts.rnns parts.transformer + parts.convs2s + parts.cnns \ No newline at end of file diff --git a/docs/html/_sources/api-docs/parts.transformer.rst.txt b/docs/html/_sources/api-docs/parts.transformer.rst.txt index 7dab39e8a..8fa9237fd 100644 --- a/docs/html/_sources/api-docs/parts.transformer.rst.txt +++ b/docs/html/_sources/api-docs/parts.transformer.rst.txt @@ -22,14 +22,6 @@ beam\_search :undoc-members: :show-inheritance: -beam\_search\_test ---------------------------------------------------------- - -.. automodule:: parts.transformer.beam_search_test - :members: - :undoc-members: - :show-inheritance: - common --------------------------------------------- diff --git a/docs/html/_sources/installation-instructions.rst.txt b/docs/html/_sources/installation-instructions.rst.txt index ba09bbccc..0e4c7d110 100644 --- a/docs/html/_sources/installation-instructions.rst.txt +++ b/docs/html/_sources/installation-instructions.rst.txt @@ -32,7 +32,7 @@ run unittests:: python -m unittest discover -s open_seq2seq -p '*_test.py' -It might take up to 10 minutes. You should see a lot of output, but no errors +It might take up to 30 minutes. You should see a lot of output, but no errors in the end. .. _installation_speech: diff --git a/docs/html/_sources/models-and-recipes.rst.txt b/docs/html/_sources/models-and-recipes.rst.txt index 0d7337568..70de3f5ce 100644 --- a/docs/html/_sources/models-and-recipes.rst.txt +++ b/docs/html/_sources/models-and-recipes.rst.txt @@ -3,22 +3,21 @@ Models and recipes ================== -.. This section will contain information about different models that OpenSeq2Seq -.. supports, exact config parameters to train them, final training/validation/test -.. metrics and links to checkpoints (tensorboards also?) of trained models. .. note:: Currently OpenSeq2Seq has model implementations for machine translation and - automatic speech recognition. All models work both in float32 and mixed precision. - We recommend you use :ref:`mixed precision training ` when training on Volta GPUs. + automatic speech recognition. + All models work both in float32 and mixed precision. + We recommend you use :ref:`mixed precision training ` + when training on Volta GPUs. -To train models you can use the following -commands (don't forget to substitute valid config_file path there). +To train models you can use the following commands (don't forget to substitute +valid config_file path there and number of GPUs if using Horovod). With Horovod (highly recommended when using multiple GPUs):: - mpirun --allow-run-as-root --mca orte_base_help_aggregate 0 -mca btl ^openib -np 4 -H localhost:4 -bind-to none -map-by slot -x LD_LIBRARY_PATH python run.py --config_file=... --mode=train_eval --use_horovod=True --enable_logs + mpiexec --allow-run-as-root -np python run.py --config_file=... --mode=train_eval --use_horovod=True --enable_logs Without Horovod:: @@ -29,6 +28,16 @@ The description of implemented models is available in the next sections: Machine translation ------------------- +The table below contains description and results of +machine translation models available in OpenSeq2Seq. 
+Currently, we have GNMT-based model, Transformer-based models and +ConvS2S-based models. + +We measure BLEU score on newstest2014.tok.de file using ``multi-bleu.perl`` script from Mosses. +For more details about model descriptions and training setup, +have a look at the `configuration files `_. + + .. list-table:: :widths: 1 1 1 1 1 :header-rows: 1 @@ -38,72 +47,87 @@ Machine translation - Training setup and additional comments - Short description of the model - Checkpoint - * - `en-de-nmt-small.py `_ + * - `en-de-nmt-small.py `_ - 20.23 - This model should train on a single GPU such as 1080Ti. It is trained using Adam optimizer. - RNN-based. Bi-directional encoder with 2 layers and. GNMT-like decoder with 2 layers and attention. Uses LSTM cells of size 512. - `link `_ - * - `en-de-gnmt-like-4GPUs.py `_ + * - `en-de-gnmt-like-4GPUs.py `_ - 23.89 - This model was trained on 4 GPUs with Adam optimizer and learning rate decay. - RNN-based. This is GNMT-like model which tries to match the one described in https://arxiv.org/abs/1609.08144 as close as possible. - `link `_ - * - `transformer-big.py `_ + * - `transformer-big.py `_ - 26.17 - This model was trained on 4 GPUs with Adam optimizer and learning rate decay. - Transformer "big" model. This model does not have any RNN layers - `link `_ + * - `en-de-convs2s.py `_ + - xx.xx + - This model was trained on 4 GPUs with Adam optimizer, learning rate decay and warm-up. + - This is an implementation of the ConvS2S model proposed in https://arxiv.org/abs/1705.03122. + - Coming soon. -GNMT model description can be found `here `_. -Transformer model description can be found `here `_. -We measure BLEU score on newstest2014.tok.de file using ``multi-bleu.perl`` script from Mosses. +GNMT model description: https://arxiv.org/abs/1609.08144. + +Transformer model description: https://arxiv.org/abs/1706.03762. + +ConvS2S model description: https://arxiv.org/abs/1705.03122. Speech recognition ------------------ -Deep Speech 2 based models -~~~~~~~~~~~~~~~~~~~~~~~~~~ -Original Deep Speech 2 model description: https://arxiv.org/abs/1512.02595. The table below contains description and results of -Deep Speech 2 based models available in OpenSeq2Seq. +speech recognition models available in OpenSeq2Seq. +Currently, we have DeepSpeech2-based models and Wav2Letter-based models. -WER-512 and WER-2048 is word error rate obtained with beam width of 512 and 2048 -correspondingly. For beam width of 2048 we also used ``batch_size_per_gpu = 1`` +WER is the word error rate obtained on a dev-clean subset of LibriSpeech using +greedy decoder (``decoder_params/use_language_model = False``). +For the final evaluation we used ``batch_size_per_gpu = 1`` to eliminate the effect of `cudnn padding issue `_. For more details about model descriptions and training setup, -have a look at the `configuration files `_. +have a look at the `configuration files `_. .. list-table:: - :widths: 1 1 1 1 1 1 + :widths: 1 1 1 1 1 :header-rows: 1 * - Config file - - WER-512 - - WER-2048 + - WER - Training setup and additional comments - Short description of the model - Checkpoint * - `ds2_large_8gpus.py `_ - - 4.90% - - 4.59% + - 9.28% - This model was trained for 50 epochs using SGD with Momentum and LARC on the full LibriSpeech in a few days using Horovod on eight GPUs. - This model has 2 convolutional layers and 5 bidirectional GRU layers with 800 units. 
- - `link `_ + - `link `_ * - `ds2_medium_4gpus.py `_ - - 6.12% - - 5.49% + - 22.60% - This model was trained for 50 epochs using Adam on the full LibriSpeech in a few days using Horovod on four GPUs. - This model has 3 convolutional layers and 3 unidirectional GRU layers with 1024 units. - `link `_ * - `ds2_small_1gpu.py `_ - - 11.77% - - 9.32% + - 39.08% - This model was trained for 12 epochs using Adam on a "clean" subset of LibriSpeech in less than a day using a single GPU. - This model has 2 convolutional layers and 2 bidirectional GRU layers with 512 units. - `link `_ + * - `w2l_large_8gpus.py `_ + - 15.44% + - This model was trained for 18 epochs (with early stopping based on + validation loss) using SGD with Momentum and LARC on + the full LibriSpeech in a few days on eight GPUs. + - The model has 19 convolutional layers (200--1000 units, 7--21 kernel size). + We use batch norm between all layers. + - `link `_ + + +Deep Speech 2 model description: https://arxiv.org/abs/1512.02595. + +Wav2Letter model description: https://arxiv.org/abs/1609.03193, https://arxiv.org/abs/1712.09444. diff --git a/docs/html/api-docs/data.image2label.html b/docs/html/api-docs/data.image2label.html index 53efdff46..27f21f736 100644 --- a/docs/html/api-docs/data.image2label.html +++ b/docs/html/api-docs/data.image2label.html @@ -186,6 +186,108 @@

    image2label

    +
    +class data.image2label.image2label.CifarDataLayer(params, model, num_workers, worker_id)[source]
    +

    Bases: open_seq2seq.data.data_layer.DataLayer

    +
    +
    +build_graph()[source]
    +

    Here all TensorFlow graph construction should happen.

    +
    + +
    +
    +static get_optional_params()[source]
    +

    Static method with description of optional parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that can be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +static get_required_params()[source]
    +

    Static method with description of required parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that have to be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +get_size_in_samples()[source]
    +

    Should return the dataset size in samples. +That is, the number of objects in the dataset. This method is used to +calculate a valid epoch size. If this method is not defined, you will need +to make sure that your dataset for evaluation is created only for +one epoch. You will also not be able to use num_epochs parameter in the +base config.

    + +++ + + + + + +
    Returns:dataset size in samples.
    Return type:int
    +
    + +
    +
    +input_tensors
    +

Dictionary containing input tensors.
+This dictionary has to define the following keys: source_tensors,
+which should contain all tensors describing the input object (i.e. tensors
+that are passed to the encoder, e.g. input sequence and input length). And
+when self.params['mode'] != "infer" data layer should also define
+target_tensors which is the list of all tensors related to the
+corresponding target object (i.e. tensors that are passed to the decoder and
+loss, e.g. target sequence and target length). Note that all tensors have
+to be created inside self.build_graph() method.

    +
    + +
    +
    +iterator
    +

    tf.data.Dataset iterator. +Should be created by self.build_graph().

    +
    + +
    +
    +parse_record(raw_record, is_training, num_classes=10)[source]
    +

    Parse CIFAR-10 image and label from a raw record.

    +
    + +
    +
    +preprocess_image(image, is_training)[source]
    +

    Preprocess a single image of layout [height, width, depth].

    +
    + +
    + +
    class data.image2label.image2label.ImagenetDataLayer(params, model, num_workers, worker_id)[source]

    Bases: open_seq2seq.data.data_layer.DataLayer

    @@ -283,14 +385,14 @@

    image2label

    imagenet_preprocessing

    -

    Provides utilities to preprocess images.

    -

    Training images are sampled using the provided bounding boxes, and subsequently +

    Provides utilities to preprocess images. +Training images are sampled using the provided bounding boxes, and subsequently cropped to the sampled bounding box. Images are additionally flipped randomly, -then resized to the target output size (without aspect-ratio preservation).

    -

    Images used during evaluation are resized (with aspect-ratio preservation) and -centrally cropped.

    -

    All images undergo mean color subtraction.

    -

    Note that these steps are colloquially referred to as “ResNet preprocessing,” +then resized to the target output size (without aspect-ratio preservation). +Images used during evaluation are resized (with aspect-ratio preservation) and +centrally cropped. +All images undergo mean color subtraction. +Note that these steps are colloquially referred to as “ResNet preprocessing,” and they differ from “VGG preprocessing,” which does not use bounding boxes and instead does an aspect-preserving resize followed by random crop during training. (These both differ from “Inception preprocessing,” which introduces @@ -345,8 +447,8 @@

    image2label
    data.image2label.imagenet_preprocessing._decode_crop_and_flip(image_buffer, bbox, num_channels)[source]
    -

    Crops the given image to a random part of the image, and randomly flips.

    -

    We use the fused decode_and_crop op, which performs better than the two ops +

    Crops the given image to a random part of the image, and randomly flips. +We use the fused decode_and_crop op, which performs better than the two ops used separately in series, but note that this requires that the image be passed in as an un-decoded string Tensor.

    @@ -370,13 +472,13 @@

    image2label -
    -data.image2label.imagenet_preprocessing._mean_image_subtraction(image, means, num_channels)[source]
    -

    Subtracts the given means from each image channel.

    +
    +data.image2label.imagenet_preprocessing._mean_image_subtraction_and_normalization(image, means, num_channels)[source]
    +

    Subtracts the given means from each image channel and divides by 127.5.

    For example:
    means = [123.68, 116.779, 103.939] -image = _mean_image_subtraction(image, means)
    +image = _mean_image_subtraction_and_normalization(image, means)

    Note that the rank of image must be known.

    @@ -390,7 +492,7 @@

    image2label

    - +equal to number of evaluation batches). +
  • training_step (int) – current training step. Will only be passed if mode +is “train_eval”.
  • + +
    - + - +
    Returns:

    the centered image.

    +
    Returns:

    the centered image and normalized image.

    Raises:

    ValueError – If the rank of image is unknown, if image has a rank other @@ -405,8 +507,8 @@

    image2label
    data.image2label.imagenet_preprocessing._parse_example_proto(example_serialized)[source]
    -

    Parses an Example proto containing a training example of an image.

    -

    The output of the build_image_data.py image preprocessing script is a dataset +

    Parses an Example proto containing a training example of an image. +The output of the build_image_data.py image preprocessing script is a dataset containing serialized Example protocol buffers. Each Example proto contains the following fields (values are included as examples):

    @@ -448,8 +550,8 @@

    image2label
    data.image2label.imagenet_preprocessing._resize_image(image, height, width)[source]
    -

    Simple wrapper around tf.resize_images.

    -

    This is primarily to make sure we use the same ResizeMethod and other +

    Simple wrapper around tf.resize_images. +This is primarily to make sure we use the same ResizeMethod and other details each time.

    @@ -480,8 +582,8 @@

    image2label
    data.image2label.imagenet_preprocessing._smallest_size_at_least(height, width, resize_min)[source]
    -

    Computes new shape with the smallest side equal to smallest_side.

    -

    Computes new shape with the smallest side equal to smallest_side while +

    Computes new shape with the smallest side equal to smallest_side. +Computes new shape with the smallest side equal to smallest_side while preserving the original aspect ratio.

    @@ -508,9 +610,9 @@

    image2label
    -data.image2label.imagenet_preprocessing.parse_record(raw_record, is_training)[source]
    -

    Parses a record containing a training example of an image.

    -

    The input record is parsed into a label and image, and the image is passed +data.image2label.imagenet_preprocessing.parse_record(raw_record, is_training, image_size=224, num_classes=1000)[source] +

    Parses a record containing a training example of an image. +The input record is parsed into a label and image, and the image is passed through preprocessing steps (cropping, flipping, and so on).

    @@ -520,6 +622,8 @@

    image2label
    data.image2label.imagenet_preprocessing.preprocess_image(image_buffer, bbox, output_height, output_width, num_channels, is_training=False)[source]
    -

    Preprocesses the given image.

    -

    Preprocessing includes decoding, cropping, and resizing for both training +

    Preprocesses the given image. +Preprocessing includes decoding, cropping, and resizing for both training and eval images. Training preprocessing, however, introduces some random distortion of the image to improve accuracy.

    diff --git a/docs/html/api-docs/data.speech2text.html b/docs/html/api-docs/data.speech2text.html index 3dbeab305..c6ac82900 100644 --- a/docs/html/api-docs/data.speech2text.html +++ b/docs/html/api-docs/data.speech2text.html @@ -187,12 +187,12 @@

    speech2text

    -class data.speech2text.speech2text.Speech2TextDataLayer(params, model, num_workers=None, worker_id=None)[source]
    +class data.speech2text.speech2text.Speech2TextDataLayer(params, model, num_workers, worker_id)[source]

    Bases: open_seq2seq.data.data_layer.DataLayer

    Speech-to-text data layer class.

    -__init__(params, model, num_workers=None, worker_id=None)[source]
    +__init__(params, model, num_workers, worker_id)[source]

    Speech-to-text data layer constructor.

    See parent class for arguments description.

    Config parameters:

    @@ -223,15 +223,16 @@

    speech2text
    -_parse_audio_element(audio_filename)[source]
    +_parse_audio_element(id_and_audio_filename)[source]

    Parses audio from file and returns array of audio features.

    - + - + @@ -335,8 +336,7 @@

    speech2text
    split_data(data)[source]
    -

    Method that performs data split for evaluation.

    -
    +
    @@ -439,6 +439,12 @@

    speech2text +
    +data.speech2text.speech_utils.normalize_signal(signal)[source]
    +

    Normalize float32 signal to [-1, 1] range

    +
    + diff --git a/docs/html/api-docs/data.text2text.html b/docs/html/api-docs/data.text2text.html index 63610cc2d..2e34a5c9c 100644 --- a/docs/html/api-docs/data.text2text.html +++ b/docs/html/api-docs/data.text2text.html @@ -227,10 +227,12 @@ parallel_interleave, the sloppy argument is used to generate randomness in the order of the examples.

    +
  • Modified slightly to fit OpenSeq2Seq needs

    +
  • -data.text2text.t2t._batch_examples(dataset, batch_size, max_length)[source]
    +data.text2text.t2t._batch_examples(dataset, batch_size, max_length, pad_2_eight=True)[source]

    Group examples by similar lengths, and return batched dataset.

    Each batch of similar-length examples are padded to the same length, and may have different number of elements in each batch, such that:

    @@ -303,13 +305,13 @@
    -data.text2text.t2t._parse_example(serialized_example)[source]
    +data.text2text.t2t._parse_example(serialized_example, pad_2_eight=False)[source]

    Return inputs and targets Tensors from a serialized tf.Example.

    -data.text2text.t2t._read_and_batch_from_files(file_pattern, batch_size, max_length, num_cpu_cores, shuffle, repeat, num_workers, worker_id)[source]
    +data.text2text.t2t._read_and_batch_from_files(file_pattern, batch_size, max_length, num_cpu_cores, shuffle, repeat, num_workers, worker_id, batch_in_tokens, pad2eight=True)[source]

    Create dataset where each item is a dict of “inputs” and “targets”.

    Parameters:audio_filename – audio file name.
    Parameters:id_and_audio_filename – tuple of sample id and corresponding audio file name.
    Returns:source audio features as np.array, length of source sequence,
    Returns:source audio features as np.array, length of source sequence, +sample id.
    Return type:tuple
    @@ -325,6 +327,11 @@ repeated forever.
  • num_workers – Number of workers or number of Horovod workers
  • worker_id – Worker id or Horovod rank
  • +
  • batch_in_tokens – whether to batch_size means amounts in tokens or sentence
  • +
  • batching in tokens is more efficient as it reduces PADs. batching in (pairs.) –
  • +
  • should be used in inference mode since order of (sentences) –
  • +
  • is important (sentences) –
  • +
  • pad2eight – if True, it will pad both dimensions to be divisible by 8
  • diff --git a/docs/html/api-docs/decoders.html b/docs/html/api-docs/decoders.html index edeafe227..de9b48e7c 100644 --- a/docs/html/api-docs/decoders.html +++ b/docs/html/api-docs/decoders.html @@ -100,6 +100,7 @@
  • fc_decoders
  • rnn_decoders
  • transformer_decoders
  • +
  • convs2s_decoder
  • losses
  • @@ -269,7 +270,8 @@ - @@ -328,6 +336,26 @@
    Returns:dictionary of decoder outputs. Typically this will be just:
    {
       "logits": logits that will be passed to Loss
    -  "samples": actual decoded output, e.g. characters instead of logits
    +  "outputs": list with actual decoded outputs, e.g. characters
    +             instead of logits
     }
     
    @@ -469,7 +471,7 @@
    Returns:dictionary with the following tensors:
    {
       'logits': logits with the shape=[batch_size, output_dim]
    -  'samples': [logits] (same as logits but wrapped in list)
    +  'outputs': [logits] (same as logits but wrapped in list)
     }
     
    @@ -517,7 +519,7 @@
  • tgt_vocab_size (int) — target vocabulary size, i.e. number of output features.
  • logits_to_outputs_func — function that maps produced logits to -decoder samples, i.e. actual text sequences.
  • +decoder outputs, i.e. actual text sequences. @@ -543,7 +545,7 @@
    Returns:dictionary with the following tensors:
    {
       'logits': logits with the shape=[time length, batch_size, tgt_vocab_size]
    -  'samples': logits_to_outputs_func(logits, input_dict)
    +  'outputs': logits_to_outputs_func(logits, input_dict)
     }
     
    @@ -703,8 +705,8 @@
  • END_SYMBOL (int) — END symbol id, must be the same as used in data layer.
  • tgt_emb_size (int) — embedding size to use.
  • -
  • decoder_cell_units (int) - number of units in RNN
  • -
  • decoder_cell_type (string) - RNN type: lstm, gru, glstm, etc.
  • +
  • core_cell_params (dict) - parameters for RNN class
  • +
  • core_cell (string) - RNN class.
  • decoder_dp_input_keep_prob (float) - dropout input keep probability.
  • decoder_dp_output_keep_prob (float) - dropout output keep probability.
  • decoder_use_skip_connections (bool) - use residual connections or not.
  • @@ -814,6 +816,91 @@

    transformer_decoders

    +
    +
    +

    convs2s_decoder

    +
    +
    +class decoders.convs2s_decoder.ConvS2SDecoder(params, model, name='convs2s_decoder', mode='train')[source]
    +

    Bases: decoders.decoder.Decoder

    +
    +
    +_get_symbols_to_logits_fn()[source]
    +

    Returns a decoding function that calculates logits of the next tokens.

    +
    + +
    +
    +decode_pass(targets, encoder_outputs, encoder_outputs_b, inputs_attention_bias)[source]
    +

    Generate logits for each value in the target sequence.

    + +++ + + + + + +
    Parameters:
      +
    • targets – target values for the output sequence. +int tensor with shape [batch_size, target_length]
    • +
    • encoder_outputs – continuous representation of input sequence. +float tensor with shape [batch_size, input_length, hidden_size] +float tensor with shape [batch_size, input_length, hidden_size]
    • +
    • encoder_outputs_b – continuous representation of input sequence +which includes the source embeddings. +float tensor with shape [batch_size, input_length, hidden_size]
    • +
    • inputs_attention_bias – float tensor with shape [batch_size, 1, input_length]
    • +
    +
    Returns:

    float32 tensor with shape [batch_size, target_length, vocab_size]

    +
    +
    + +
    +
    +static get_optional_params()[source]
    +

    Static method with description of optional parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that can be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +static get_required_params()[source]
    +

    Static method with description of required parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that have to be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +predict(encoder_outputs, encoder_outputs_b, inputs_attention_bias)[source]
    +

    Return predicted sequence.

    +
    + +
    +
    diff --git a/docs/html/api-docs/encoders.html b/docs/html/api-docs/encoders.html index 64d919f02..eb4f9bd7c 100644 --- a/docs/html/api-docs/encoders.html +++ b/docs/html/api-docs/encoders.html @@ -97,10 +97,13 @@
  • encoders
  • decoders
  • @@ -496,12 +499,6 @@ -
    -
    -encoders.ds2_encoder.conv2d_bn_actv(name, inputs, filters, kernel_size, activation_fn, strides, padding, regularizer, training, data_format, bn_momentum, bn_epsilon)[source]
    -

    Helper function that applies convolution, batch norm and activation.

    -
    -
    encoders.ds2_encoder.rnn_cell(rnn_cell_dim, layer_type, dropout_keep_prob=1.0)[source]
    @@ -514,6 +511,137 @@

    Helper function that applies “row” or “in plane” convolution.

    + +
    +

    w2l_encoder

    +
    +
    +class encoders.w2l_encoder.Wave2LetterEncoder(params, model, name='w2l_encoder', mode='train')[source]
    +

    Bases: encoders.encoder.Encoder

    +

    Wave2Letter like encoder. Fully convolutional model

    +
    +
    +__init__(params, model, name='w2l_encoder', mode='train')[source]
    +

    Wave2Letter like encoder constructor.

    +

    See parent class for arguments description.

    +

    Config parameters:

    +
      +
    • dropout_keep_prop (float) — keep probability for dropout.

      +
    • +
    • convnet_layers (list) — list with the description of convolutional +layers. For example:

      +
      "convnet_layers": [
      +  {
      +    "type": "conv1d", "repeat" : 5,
      +    "kernel_size": [7], "stride": [1],
      +    "num_channels": 250, "padding": "SAME"
      +  },
      +  {
      +    "type": "conv1d", "repeat" : 3,
      +    "kernel_size": [11], "stride": [1],
      +    "num_channels": 500, "padding": "SAME"
      +  },
      +  {
      +    "type": "conv1d", "repeat" : 1,
      +    "kernel_size": [32], "stride": [1],
      +    "num_channels": 1000, "padding": "SAME"
      +  },
      +  {
      +    "type": "conv1d", "repeat" : 1,
      +    "kernel_size": [1], "stride": [1],
      +    "num_channels": 1000, "padding": "SAME"
      +  },
      +]
      +
      +
      +
    • +
    • activation_fn — activation function to use.

      +
    • +
    • data_format (string) — could be either “channels_first” or +“channels_last”. Defaults to “channels_last”.

      +
    • +
    • normalization — normalization to use. Accepts [None, ‘batch_norm’]. +Use None if you don’t want to use normalization. Defaults to ‘batch_norm’.

      +
    • +
    • bn_momentum (float) — momentum for batch norm. Defaults to 0.90.

      +
    • +
    • bn_epsilon (float) — epsilon for batch norm. Defaults to 1e-3.

      +
    • +
    +
    + +
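    A minimal configuration sketch combining the parameters listed above (the dict name and all values are illustrative assumptions, not a tested recipe):

    import tensorflow as tf

    # hypothetical Wave2LetterEncoder parameters, for illustration only
    encoder_params = {
        "convnet_layers": [
            {"type": "conv1d", "repeat": 3, "kernel_size": [11], "stride": [1],
             "num_channels": 256, "padding": "SAME"},
            {"type": "conv1d", "repeat": 1, "kernel_size": [29], "stride": [1],
             "num_channels": 512, "padding": "SAME"},
        ],
        "dropout_keep_prob": 0.8,
        "activation_fn": tf.nn.relu,
        "data_format": "channels_last",
        "normalization": "batch_norm",
        "bn_momentum": 0.90,
        "bn_epsilon": 1e-3,
    }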
    +
    +_encode(input_dict)[source]
    +

    Creates TensorFlow graph for Wav2Letter like encoder.

    + +++ + + + + + + + +
    Parameters:input_dict (dict) –

    input dictionary that has to contain +the following fields:

    +
    input_dict = {
    +  "source_tensors": [
    +    src_sequence (shape=[batch_size, sequence length, num features]),
    +    src_length (shape=[batch_size])
    +  ]
    +}
    +
    +
    +
    Returns:dictionary with the following tensors:
    {
    +  'outputs': hidden state, shape=[batch_size, sequence length, n_hidden]
    +  'src_length': tensor, shape=[batch_size]
    +}
    +
    +
    +
    Return type:dict
    +
    + +
    +
    +static get_optional_params()[source]
    +

    Static method with description of optional parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that can be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +static get_required_params()[source]
    +

    Static method with description of required parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that have to be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +

    rnn_encoders

    @@ -733,6 +861,98 @@ +
    +
    +class encoders.rnn_encoders.GNMTLikeEncoderWithEmbedding_cuDNN(params, model, name='gnmt_encoder_with_emb_cudnn', mode='train')[source]
    +

    Bases: encoders.encoder.Encoder

    +

    Encoder similar to the one used in +GNMT model: https://arxiv.org/abs/1609.08144. +Must have at least 2 layers. Uses cuDNN RNN blocks for efficiency

    +
    +
    +__init__(params, model, name='gnmt_encoder_with_emb_cudnn', mode='train')[source]
    +

    Encodes data into representation. +params – a Python dictionary. +Must define:

    +
    +
      +
    • +
      src_inputs - a Tensor of shape [batch_size, time] or [time, batch_size]
      +
      (depending on time_major param)
      +
      +
    • +
    • src_lengths - a Tensor of shape [batch_size]
    • +
    +
    + +++ + + + +
    Returns: a Python dictionary with:
    • encoder_outputs – a Tensor of shape [batch_size, time, representation_dim] or [time, batch_size, representation_dim]
    • encoder_state – a Tensor of shape [batch_size, dim]
    • src_lengths – (copy ref from input) a Tensor of shape [batch_size]

    +
    +
    + +
    +
    +enc_emb_w
    +
    + +
    +
    +static get_optional_params()[source]
    +

    Static method with description of optional parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that can be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +static get_required_params()[source]
    +

    Static method with description of required parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that have to be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +src_emb_size
    +
    + +
    +
    +src_vocab_size
    +
    + +
    +
    class encoders.rnn_encoders.UnidirectionalRNNEncoderWithEmbedding(params, model, name='unidir_rnn_encoder_with_emb', mode='train')[source]
    @@ -851,6 +1071,63 @@

    transformer_encoders

    +
    +
    +

    convs2s_encoder

    +

    Conv-based encoder

    +
    +
    +class encoders.convs2s_encoder.ConvS2SEncoder(params, model, name='convs2s_encoder_with_emb', mode='train')[source]
    +

    Bases: encoders.encoder.Encoder

    +

    Fully convolutional Encoder of ConvS2S

    +
    +
    +static get_optional_params()[source]
    +

    Static method with description of optional parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that can be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +static get_required_params()[source]
    +

    Static method with description of required parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that have to be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +src_emb_size
    +
    + +
    +
    +src_vocab_size
    +
    + +
    +

    resnet_encoder

    @@ -1138,6 +1415,142 @@

    transformer_encoders +

    +
    +

    cnn_encoder

    +

    This module contains classes and functions to build “general” convolutional +neural networks from the description of arbitrary “layers”.

    +
    +
    +class encoders.cnn_encoder.CNNEncoder(params, model, name='cnn_encoder', mode='train')[source]
    +

    Bases: encoders.encoder.Encoder

    +

    General CNN encoder that can be used to construct various different models.

    +
    +
    +__init__(params, model, name='cnn_encoder', mode='train')[source]
    +

    CNN Encoder constructor.

    +

    See parent class for arguments description.

    +

    Config parameters:

    +
      +
    • cnn_layers (list) — list with the description of “convolutional” +layers. For example:

      +
      "conv_layers": [
      +    (tf.layers.conv2d, {
      +        'filters': 64, 'kernel_size': (11, 11),
      +        'strides': (4, 4), 'padding': 'VALID',
      +        'activation': tf.nn.relu,
      +    }),
      +    (tf.layers.max_pooling2d, {
      +        'pool_size': (3, 3), 'strides': (2, 2),
      +    }),
      +    (tf.layers.conv2d, {
      +        'filters': 192, 'kernel_size': (5, 5),
      +        'strides': (1, 1), 'padding': 'SAME',
      +    }),
      +    (tf.layers.batch_normalization, {'momentum': 0.9, 'epsilon': 0.0001}),
      +    (tf.nn.relu, {}),
      +]
      +
      +
      +

      Note that you don’t need to provide “regularizer”, “training” and +“data_format” parameters since they will be automatically added.

      +
    • +
    • fc_layers (list) — list with the description of “fully-connected” +layers. The only difference from convolutional layers is that the input +will be automatically reshaped to 2D (batch size x num features). +For example:

      +
      'fc_layers': [
      +    (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}),
      +    (tf.layers.dropout, {'rate': 0.5}),
      +    (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}),
      +    (tf.layers.dropout, {'rate': 0.5}),
      +],
      +
      +
      +

      Note that you don’t need to provide “regularizer”, “training” and +“data_format” parameters since they will be automatically added.

      +
    • +
    • data_format (string) — could be either “channels_first” or +“channels_last”. Defaults to “channels_first”.

      +
    • +
    +
    + +
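    A hedged sketch of how the two lists above might be combined in a single encoder configuration (the dict name and values are illustrative assumptions):

    import tensorflow as tf

    encoder_params = {
        "cnn_layers": [
            (tf.layers.conv2d, {"filters": 64, "kernel_size": (11, 11),
                                "strides": (4, 4), "padding": "VALID",
                                "activation": tf.nn.relu}),
            (tf.layers.max_pooling2d, {"pool_size": (3, 3), "strides": (2, 2)}),
        ],
        "fc_layers": [
            (tf.layers.dense, {"units": 4096, "activation": tf.nn.relu}),
            (tf.layers.dropout, {"rate": 0.5}),
        ],
        "data_format": "channels_first",
    }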
    +
    +static get_optional_params()[source]
    +

    Static method with description of optional parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that can be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +static get_required_params()[source]
    +

    Static method with description of required parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that have to be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    + +
    +
    +encoders.cnn_encoder.build_layer(inputs, layer, layer_params, data_format, regularizer, training, verbose=True)[source]
    +

    This function builds a layer from the layer function and its parameters.

    +

    It will automatically add the regularizer parameter to layer_params if the +layer supports regularization. To check this, it will look for the +“regularizer”, “kernel_regularizer” and “gamma_regularizer” names, in this +order, in the layer call signature. If one of these parameters is supported, +the regularizer object will be passed as its value. The “data_format” and +“training” parameters are added using the same signature-checking technique.

    + +++ + + + + + +
    Parameters:
      +
    • inputs – input Tensor that will be passed to the layer. Note that layer has +to accept input as the first parameter.
    • +
    • layer – layer function or class with __call__ method defined.
    • +
    • layer_params (dict) – parameters passed to the layer.
    • +
    • data_format (string) – data format (“channels_first” or “channels_last”) +that will be tried to be passed as an additional argument.
    • +
    • regularizer – regularizer instance that will be tried to be passed as an +additional argument.
    • +
    • training (bool) – whether layer is built in training mode. Will be tried to +be passed as an additional argument.
    • +
    • verbose (bool) – whether to print information about built layers.
    • +
    +
    Returns:

    Tensor with layer output.

    +
    +
    +
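    A hedged usage sketch of build_layer based on the signature documented above; the import path and the placeholder input are assumptions:

    import tensorflow as tf
    from open_seq2seq.encoders.cnn_encoder import build_layer  # assumed import path

    inputs = tf.placeholder(tf.float32, shape=[None, 224, 224, 3])
    outputs = build_layer(
        inputs,
        tf.layers.conv2d,                                     # layer callable
        {"filters": 64, "kernel_size": (3, 3), "padding": "SAME"},
        data_format="channels_last",
        regularizer=tf.contrib.layers.l2_regularizer(1e-4),  # forwarded only if the layer supports it
        training=True,
    )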
    diff --git a/docs/html/api-docs/models.html b/docs/html/api-docs/models.html index 0b49514d6..61c902754 100644 --- a/docs/html/api-docs/models.html +++ b/docs/html/api-docs/models.html @@ -236,6 +236,11 @@
  • print_samples_steps (int or None) — how often to print training samples (input sequences, correct answers and model predictions). Setting it to None disables samples printing.
  • +
  • print_bench_info_steps (int or None) — how often to print training +benchmarking information (average number of objects processed per step). +Setting it to None disables intermediate benchmarking printing, but +the average information across the whole training will always be printed +after the last iteration.
  • save_checkpoint_steps (int or None) — how often to save model checkpoints. Setting it to None disables checkpoint saving.
  • eval_steps (int) — how often to run evaluation during training. @@ -271,14 +276,17 @@
  • max_grad_norm (float) — maximum value of gradient norm. Clipping will be performed if some gradients exceed this value (this is checked for each variable independently).
  • -
  • loss_scale (float) — static loss scale to use. For details see -mixed precision training section in docs.
  • -
  • automatic_loss_scaling — automatic loss scaling mode. Could be -either None, “Backoff” or “Logmax”. For details see -mixed precision training section in docs.
  • +
  • loss_scaling — could be float or string. If float, static loss +scaling is applied. If string, the corresponding automatic +loss scaling algorithm is used. Must be one of ‘Backoff’ +or ‘LogMax’ (case insensitive). Only used when dtype=”mixed”. For details +see mixed precision training section in docs.
  • summaries (list) — which summaries to log. Could contain “learning_rate”, “gradients”, “gradient_norm”, “global_gradient_norm”, “variables”, “variable_norm”.
  • +
  • iter_size (int) — use this parameter to emulate large batches. +The gradients will be accumulated for iter_size number of steps before +applying update.
  • larc_params — dictionary with parameters for LARC (or LARS) optimization algorithms. Can contain the following parameters:
    • larc_mode — Could be either “scale” (LARS) or “clip” (LARC). @@ -309,14 +317,14 @@
  • Returns:

    tuple containing loss tensor and samples tensor.

    +
    Returns:

    tuple containing loss tensor and list of outputs tensors.

    Loss tensor will be automatically provided to the optimizer and corresponding train_op will be created.

    Samples tensors are stored in the _outputs attribute and can be accessed by calling get_output_tensors() function. For example, this happens inside utils.hooks.RunEvaluationHook to fetch output values for evaluation.

    -

    Both loss and samples can be None when corresponding part of the graph +

    Both loss and outputs can be None when corresponding part of the graph is not built.

    +
    +
    +_get_num_objects_per_step(worker_id=0)[source]
    +

Define this method if you need benchmarking functionality. +For example, for translation models, this method should return the number of +tokens in the current batch; for image recognition models it should return the number +of images in the current batch.

    + +++ + + + + + +
    Parameters:worker_id (int) – id of the worker to get data layer from +(not used for Horovod).
    Returns:tf.Tensor with number of objects in batch.
    +
    +
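    A hedged sketch of such an override for an image model (the import path and the data layer field names are assumptions for illustration):

    import tensorflow as tf
    from open_seq2seq.models.model import Model  # assumed import path

    class MyImageModel(Model):
      def _get_num_objects_per_step(self, worker_id=0):
        data_layer = self.get_data_layer(worker_id)
        # for image recognition, the number of objects is simply the batch size
        images = data_layer.input_tensors['source_tensors'][0]  # assumed field layout
        return tf.shape(images)[0]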
    clip_last_batch(last_batch, true_size)[source]
    @@ -410,7 +438,7 @@
    -finalize_evaluation(results_per_batch)[source]
    +finalize_evaluation(results_per_batch, training_step=None)[source]

    This method can be used in conjunction with self.evaluate() to calculate evaluation metrics. @@ -432,14 +460,21 @@

    Parameters:results_per_batch (list) – aggregation of values returned from all calls +
    Parameters:
      +
    • results_per_batch (list) – aggregation of values returned from all calls to self.evaluate() method (number of calls will be -equal to number of evaluation batches).
    Returns:dictionary with values that need to be logged to TensorBoard -(can be empty).
    Returns:

    dictionary with values that need to be logged to TensorBoard +(can be empty).

    +
    Return type:dict
    Return type:

    dict

    +
    @@ -491,22 +526,7 @@
    get_num_objects_per_step(worker_id=0)[source]
    -

    Define this method if you need benchmarking functionality. -For example, for translation models, this method should return number of -tokens in current batch, for image recognition model should return number -of images in current batch.

    - --- - - - - - -
    Parameters:worker_id (int) – id of the worker to get data layer from -(not used for Horovod).
    Returns:tf.Tensor with number of objects in batch.
    -
    +

    @@ -619,7 +639,7 @@
    -maybe_print_logs(input_values, output_values)[source]
    +maybe_print_logs(input_values, output_values, training_step)[source]

    This method can be used to print logs that help to visualize training. For example, you can print sample input sequences and their corresponding predictions. This method will be called every print_samples_steps @@ -639,6 +659,7 @@

  • output_values – evaluation of self.get_output_tensors(0), that is, output tensors for one batch on the first GPU.
  • +
  • training_step (int) – Current training step.
  • @@ -770,8 +791,8 @@ Returns:

    tuple containing loss tensor as returned from -loss.compute_loss() and samples tensor, which is taken from -decoder.decode()['samples']. When mode == 'infer', loss will +loss.compute_loss() and list of outputs tensors, which is taken from +decoder.decode()['outputs']. When mode == 'infer', loss will be None.

    @@ -893,6 +914,12 @@
    class models.speech2text.Speech2Text(params, mode='train', hvd=None)[source]

    Bases: models.encoder_decoder.EncoderDecoderModel

    +
    +
    +_get_num_objects_per_step(worker_id=0)[source]
    +

    Returns number of audio frames in current batch.

    +
    +
    evaluate(input_values, output_values)[source]
    @@ -942,7 +969,7 @@
    -finalize_evaluation(results_per_batch)[source]
    +finalize_evaluation(results_per_batch, training_step=None)[source]

    This method can be used in conjunction with self.evaluate() to calculate evaluation metrics. @@ -964,14 +991,21 @@ -Parameters:results_per_batch (list) – aggregation of values returned from all calls +Parameters:

      +
    • results_per_batch (list) – aggregation of values returned from all calls to self.evaluate() method (number of calls will be -equal to number of evaluation batches). +equal to number of evaluation batches).
    • +
    • training_step (int) – current training step. Will only be passed if mode +is “train_eval”.
    • +
    + -Returns:dictionary with values that need to be logged to TensorBoard -(can be empty). +Returns:

    dictionary with values that need to be logged to TensorBoard +(can be empty).

    + -Return type:dict +Return type:

    dict

    + @@ -1000,12 +1034,6 @@
    -
    -
    -get_num_objects_per_step(worker_id=0)[source]
    -

    Returns number of audio frames in current batch.

    -
    -
    infer(input_values, output_values)[source]
    @@ -1043,7 +1071,7 @@
    -maybe_print_logs(input_values, output_values)[source]
    +maybe_print_logs(input_values, output_values, training_step)[source]

    This method can be used to print logs that help to visualize training. For example, you can print sample input sequences and their corresponding predictions. This method will be called every print_samples_steps @@ -1063,6 +1091,7 @@

  • output_values – evaluation of self.get_output_tensors(0), that is, output tensors for one batch on the first GPU.
  • +
  • training_step (int) – Current training step.
  • @@ -1099,6 +1128,12 @@ class models.text2text.Text2Text(params, mode='train', hvd=None)[source]

    Bases: models.encoder_decoder.EncoderDecoderModel

    An example class implementing classical text-to-text model.

    +
    +
    +_get_num_objects_per_step(worker_id=0)[source]
    +

    Returns number of source tokens + number of target tokens in batch.

    +
    +
    evaluate(input_values, output_values)[source]
    @@ -1148,7 +1183,7 @@
    -finalize_evaluation(results_per_batch)[source]
    +finalize_evaluation(results_per_batch, training_step=None)[source]

    This method can be used in conjunction with self.evaluate() to calculate evaluation metrics. @@ -1170,14 +1205,21 @@ -Parameters:results_per_batch (list) – aggregation of values returned from all calls +Parameters:

      +
    • results_per_batch (list) – aggregation of values returned from all calls to self.evaluate() method (number of calls will be -equal to number of evaluation batches). +equal to number of evaluation batches).
    • +
    • training_step (int) – current training step. Will only be passed if mode +is “train_eval”.
    • +
    + -Returns:dictionary with values that need to be logged to TensorBoard -(can be empty). +Returns:

    dictionary with values that need to be logged to TensorBoard +(can be empty).

    + -Return type:dict +Return type:

    dict

    + @@ -1206,12 +1248,6 @@
    -
    -
    -get_num_objects_per_step(worker_id=0)[source]
    -

    Returns number of source tokens + number of target tokens in batch.

    -
    -
    infer(input_values, output_values)[source]
    @@ -1249,7 +1285,7 @@
    -maybe_print_logs(input_values, output_values)[source]
    +maybe_print_logs(input_values, output_values, training_step)[source]

    This method can be used to print logs that help to visualize training. For example, you can print sample input sequences and their corresponding predictions. This method will be called every print_samples_steps @@ -1269,6 +1305,7 @@

  • output_values – evaluation of self.get_output_tensors(0), that is, output tensors for one batch on the first GPU.
  • +
  • training_step (int) – Current training step.
  • @@ -1317,6 +1354,12 @@
    class models.image2label.Image2Label(params, mode='train', hvd=None)[source]

    Bases: models.encoder_decoder.EncoderDecoderModel

    +
    +
    +_get_num_objects_per_step(worker_id=0)[source]
    +

    Returns number of images in current batch, i.e. batch size.

    +
    +
    evaluate(input_values, output_values)[source]
    @@ -1366,7 +1409,7 @@
    -finalize_evaluation(results_per_batch)[source]
    +finalize_evaluation(results_per_batch, training_step=None)[source]

    This method can be used in conjunction with self.evaluate() to calculate evaluation metrics. @@ -1388,28 +1431,29 @@ -Parameters:results_per_batch (list) – aggregation of values returned from all calls +Parameters:

      +
    • results_per_batch (list) – aggregation of values returned from all calls to self.evaluate() method (number of calls will be -equal to number of evaluation batches). +equal to number of evaluation batches).
    • +
    • training_step (int) – current training step. Will only be passed if mode +is “train_eval”.
    • +
    + -Returns:dictionary with values that need to be logged to TensorBoard -(can be empty). +Returns:

    dictionary with values that need to be logged to TensorBoard +(can be empty).

    + -Return type:dict +Return type:

    dict

    +
    -
    -
    -get_num_objects_per_step(worker_id=0)[source]
    -

    Returns number of images in current batch, i.e. batch size.

    -
    -
    -maybe_print_logs(input_values, output_values)[source]
    +maybe_print_logs(input_values, output_values, training_step)[source]

    This method can be used to print logs that help to visualize training. For example, you can print sample input sequences and their corresponding predictions. This method will be called every print_samples_steps @@ -1429,6 +1473,7 @@

  • output_values – evaluation of self.get_output_tensors(0), that is, output tensors for one batch on the first GPU.
  • +
  • training_step (int) – Current training step.
  • diff --git a/docs/html/api-docs/modules.html b/docs/html/api-docs/modules.html index e6e21831a..d3c3a83bc 100644 --- a/docs/html/api-docs/modules.html +++ b/docs/html/api-docs/modules.html @@ -202,10 +202,13 @@
  • encoders
  • decoders
  • losses
  • utils
      diff --git a/docs/html/api-docs/optimizers.html b/docs/html/api-docs/optimizers.html index ac34b5501..38768754e 100644 --- a/docs/html/api-docs/optimizers.html +++ b/docs/html/api-docs/optimizers.html @@ -179,123 +179,12 @@

      optimizers

      Optimizer ops for use in layers and tf.learn.

      -
      -
      -class optimizers.optimizers.DistributedOptimizer(optimizer, name=None, use_locking=False, device_dense='', device_sparse='')[source]
      -

      Bases: tensorflow.python.training.optimizer.Optimizer

      -

      An optimizer that wraps another tf.Optimizer, using an allreduce to -average gradient values before applying gradients to model weights.

      -
      -
      -__init__(optimizer, name=None, use_locking=False, device_dense='', device_sparse='')[source]
      -

      Construct a new DistributedOptimizer, which uses another optimizer -under the hood for computing single-process gradient values and -applying gradient updates after the gradient values have been averaged -across all the Horovod ranks. -:param optimizer: Optimizer to use for computing gradients and applying updates. -:param name: Optional name prefix for the operations created when applying

      -
      -
      gradients. Defaults to “Distributed” followed by the provided -optimizer type.
      - --- - - - -
      Parameters:
        -
      • use_locking – Whether to use locking when updating variables. -See Optimizer.__init__ for more info.
      • -
      • device_dense – Device to be used for dense tensors. Uses GPU by default -if Horovod was build with HOROVOD_GPU_ALLREDUCE.
      • -
      • device_sparse – Device to be used for sparse tensors. Uses GPU by default -if Horovod was build with HOROVOD_GPU_ALLGATHER.
      • -
      -
      -
      - -
      -
      -apply_gradients(grads_and_vars, global_step=None, name=None)[source]
      -

      Calls this same method on the underlying optimizer.

      -
      - -
      -
      -compute_gradients(*args, **kwargs)[source]
      -

      Compute gradients of all trainable variables. -See Optimizer.compute_gradients() for more info. -In DistributedOptimizer, compute_gradients() is overriden to also -allreduce the gradients before returning them.

      -
      - -
      - -
      -
      -optimizers.optimizers._adaptive_max_norm(norm, std_factor, decay, global_step, epsilon, name)[source]
      -

      Find max_norm given norm and previous average.

      -
      - -
      -
      -optimizers.optimizers._add_scaled_noise_to_gradients(grads_and_vars, gradient_noise_scale)[source]
      -

      Adds scaled noise from a 0-mean normal distribution to gradients.

      -
      -
      optimizers.optimizers._clip_gradients_by_norm(grads_and_vars, clip_gradients)[source]

      Clips gradients by global norm.

      -
      -
      -optimizers.optimizers._multiply_gradients(grads_and_vars, gradient_multipliers)[source]
      -

      Multiply specified gradients.

      -
      - -
      -
      -optimizers.optimizers._multiply_gradients_const(grads_and_vars, multiplier)[source]
      -

      Multiply specified gradients.

      -
      - -
      -
      -optimizers.optimizers.adaptive_clipping_fn(std_factor=2.0, decay=0.95, static_max_norm=None, global_step=None, report_summary=False, epsilon=1e-08, name=None)[source]
      -

      Adapt the clipping value using statistics on the norms.

      -

      Implement adaptive gradient as presented in section 3.2.1 of -https://arxiv.org/abs/1412.1602.

      -

      Keeps a moving average of the mean and std of the log(norm) of the gradient. -If the norm exceeds exp(mean + std_factor*std) then all gradients will be -rescaled such that the global norm becomes exp(mean).

      - --- - - - - - -
      Parameters:
        -
      • std_factor – Python scaler (or tensor). -max_norm = exp(mean + std_factor*std)
      • -
      • decay – The smoothing factor of the moving averages.
      • -
      • static_max_norm – If provided, will threshold the norm to this value as an -extra safety.
      • -
      • global_step – Optional global_step. If provided, decay = decay*n/(n+1). -This provides a quicker adaptation of the mean for the first steps.
      • -
      • report_summary – If True, will add histogram summaries of the max_norm.
      • -
      • epsilon – Small value chosen to avoid zero variance.
      • -
      • name – The name for this operation is used to scope operations and summaries.
      • -
      -
      Returns:

      A function for applying gradient clipping.

      -
      -
      -
      optimizers.optimizers.get_regularization_loss(scope=None, name='total_regularization_loss')[source]
      @@ -314,137 +203,65 @@

      optimizers
      -optimizers.optimizers.optimize_loss(loss, optimizer, optimizer_params, learning_rate_decay_fn, global_step=None, dtype=tf.float32, gradient_noise_scale=None, gradient_multipliers=None, clip_gradients=None, update_ops=None, variables=None, name=None, summaries=None, colocate_gradients_with_ops=False, increment_global_step=True, larc_params=None, loss_scale=1.0, automatic_loss_scaling=None, on_horovod=False)[source]
      +optimizers.optimizers.optimize_loss(loss, optimizer, optimizer_params, learning_rate_decay_fn, dtype=tf.float32, clip_gradients=None, summaries=None, larc_params=None, loss_scaling=1.0, on_horovod=False, iter_size=1, skip_update_ph=None)[source]

      Given loss and parameters for optimizer, returns a training op.

      -

      Various ways of passing optimizers include:

      -
        -
      • -
        by string specifying the name of the optimizer. See OPTIMIZER_CLS_NAMES
        -

        for full list. E.g. optimize_loss(…, optimizer=’Adam’).

        -
        -
        -
      • -
      • -
        by function taking learning rate Tensor as argument and returning an
        -

        Optimizer instance. E.g. optimize_loss(…, -optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5)).

        -
        -
        -

        Alternatively, if learning_rate is None, the function takes no -arguments. E.g. `optimize_loss(…, learning_rate=None,

        -
        -

        optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`.

        -
        -
      • -
      • -
        by a subclass of Optimizer having a single-argument constructor
        -

        (the argument is the learning rate), such as AdamOptimizer or -AdagradOptimizer. E.g. optimize_loss(…, -optimizer=tf.train.AdagradOptimizer).

        -
        -
        -
      • -
      • -
        by an instance of a subclass of Optimizer.
        -

        E.g., optimize_loss(…, optimizer=tf.train.AdagradOptimizer(0.5)).

        -
        -
        -
      • -
      - - -
      Parameters:
      • loss – Scalar Tensor.
      • -
      • global_step – Scalar int Tensor, step counter to update on each step -unless increment_global_step is False. If not supplied, -it will be fetched from the default graph (see -tf.train.get_global_step for details). If it has -not been created, no step will be incremented with each weight -update. learning_rate_decay_fn requires global_step.
      • -
      • learning_rate – float or Tensor, magnitude of update per each training -step. Can be None.
      • -
      • optimizer

        string, class or optimizer instance, used as trainer. -string should be name of optimizer, like ‘SGD’,

        -
        -
        ’Adam’, ‘Adagrad’. Full list in OPTIMIZER_CLS_NAMES constant.
        -
        -
        class should be sub-class of tf.Optimizer that implements
        -
        compute_gradients and apply_gradients functions.
        -
        optimizer instance should be instantiation of tf.Optimizer
        -
        sub-class and have compute_gradients and apply_gradients -functions.
        -
        -
      • -
      • gradient_noise_scale – float or None, adds 0-mean normal noise scaled by this -value.
      • -
      • gradient_multipliers – dict of variables or variable names to floats. -If present, gradients for specified -variables will be multiplied by given constant.
      • -
      • clip_gradients – float, callable or None. If float, is provided, a global -clipping is applied to prevent the norm of the gradient to exceed this -value. Alternatively, a callable can be provided e.g.: adaptive_clipping. -This callable takes a list of (gradients, variables) `tuple`s and -returns the same thing with the gradients modified.
      • -
      • learning_rate_decay_fn – function, takes learning_rate and global_step +
      • optimizer – string or class of optimizer, used as trainer. +string should be name of optimizer, like ‘SGD’, +‘Adam’, ‘Adagrad’. Full list in OPTIMIZER_CLS_NAMES constant. +class should be sub-class of tf.Optimizer that implements +compute_gradients and apply_gradients functions.
      • +
      • optimizer_params – parameters of the optimizer.
      • +
      • dtype – model dtype (tf.float16, tf.float32 or “mixed”).
      • +
      • learning_rate_decay_fn – function, takes global_step Tensor`s, returns `Tensor. Can be used to implement any learning rate decay functions. For example: tf.train.exponential_decay. Ignored if learning_rate is not supplied.
      • -
      • update_ops – list of update Operation`s to execute at each step. If `None, -uses elements of UPDATE_OPS collection. The order of execution -between update_ops and loss is non-deterministic.
      • -
      • variables – list of variables to optimize or -None to use all trainable variables.
      • -
      • name – The name for this operation is used to scope operations and summaries.
      • +
      • clip_gradients – float, max gradient norm to clip to.
      • summaries – List of internal quantities to visualize on tensorboard. If not set only the loss and the learning rate will be reported. The complete list is in OPTIMIZER_SUMMARIES.
      • -
      • colocate_gradients_with_ops – If True, try colocating gradients with the -corresponding op.
      • -
      • increment_global_step – Whether to increment global_step. If your model -calls optimize_loss multiple times per training step (e.g. to optimize -different parts of the model), use this arg to avoid incrementing -global_step more times than necessary.
      • -
      • LARC_mode – ‘scale’ or ‘clip’
      • -
      • LARC_nu – If not None, LARC re-scaling will be -applied https://arxiv.org/pdf/1708.03888.pdf with nu=LARC_nu
      • -
      • automatic_loss_scaling – if not None, use the corresponding automatic -loss scaling algorithm. Must be one of ‘Backoff’ -of ‘LogMax’. dtype must be “mixed” to use ALS.
      • +
      • larc_params – If not None, LARC re-scaling will +be applied with corresponding parameters.
      • +
      • loss_scaling – could be float or string. If float, static loss scaling +is applied. If string, the corresponding automatic +loss scaling algorithm is used. Must be one of ‘Backoff’ +or ‘LogMax’ (case insensitive). Only used when dtype=”mixed”.
      • +
      • on_horovod – whether the model is run on horovod.
      Returns:

      Training op.

      -
      Raises:

      ValueError – if: -* loss is an invalid type or shape. -* global_step is an invalid type or shape. -* learning_rate is an invalid type or value. -* optimizer has the wrong type. -* clip_gradients is neither float nor callable. -* learning_rate and learning_rate_decay_fn are supplied, but no

      -
      -

      global_step is available.

      -
      -
        -
      • gradients is empty.
      • -
      +
      Returns:

      training op.

      +
      +
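      A hedged usage sketch based on the optimize_loss signature documented above (the import path, optimizer name and all values are assumptions, not a recommended recipe):

      import tensorflow as tf
      from open_seq2seq.optimizers.optimizers import optimize_loss  # assumed import path

      w = tf.get_variable("w", shape=[10], initializer=tf.zeros_initializer())
      loss = tf.reduce_mean(tf.square(w - 1.0))      # stand-in scalar loss

      train_op = optimize_loss(
          loss=loss,
          optimizer="Adam",                          # name from OPTIMIZER_CLS_NAMES
          optimizer_params={"epsilon": 1e-08},
          learning_rate_decay_fn=lambda gs: tf.train.exponential_decay(
              0.001, gs, decay_steps=10000, decay_rate=0.5),
          dtype=tf.float32,
          clip_gradients=5.0,
          on_horovod=False,
      )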
      +optimizers.optimizers.post_process_gradients(grads_and_vars, summaries, lr, clip_gradients, larc_params)[source]
      +

      Applies post processing to gradients, i.e. clipping, LARC, summaries.

      +
      + +
      +
      +optimizers.optimizers.reduce_gradients(grads_and_vars, on_horovod)[source]
      +
      +

      mp_wrapper

      -class optimizers.mp_wrapper.MixedPrecisionOptimizerWrapper(optimizer, automatic_loss_scaler=None)[source]
      +class optimizers.mp_wrapper.MixedPrecisionOptimizerWrapper(optimizer, loss_scale=None)[source]

      Bases: tensorflow.python.training.optimizer.Optimizer
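      A hedged sketch of wrapping a standard optimizer, assuming the constructor shown above accepts a float loss_scale for static loss scaling (import path assumed):

      import tensorflow as tf
      from open_seq2seq.optimizers.mp_wrapper import MixedPrecisionOptimizerWrapper  # assumed import path

      base_opt = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9)
      mp_opt = MixedPrecisionOptimizerWrapper(base_opt, loss_scale=512.0)  # float loss_scale is an assumption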

      @@ -547,7 +364,7 @@

      optimizersobject

      -SUPPORTED_ALGOS = ['Backoff', 'LogMax']
      +SUPPORTED_ALGOS = ['backoff', 'logmax']

      diff --git a/docs/html/api-docs/parts.cnns.html b/docs/html/api-docs/parts.cnns.html new file mode 100644 index 000000000..cc9297c39 --- /dev/null +++ b/docs/html/api-docs/parts.cnns.html @@ -0,0 +1,307 @@ +cnns — OpenSeq2Seq 0.2 documentation
      + + + + +
      + + + + + +
      + +
      + + + + + + + + + + + + + + + + + +
      + + + + +
      +
      +
      +
      + +
      +

      cnns

      +
      +

      conv_blocks

      +
      +
      +parts.cnns.conv_blocks.conv_actv(type, name, inputs, filters, kernel_size, activation_fn, strides, padding, regularizer, training, data_format)[source]
      +

      Helper function that applies convolution and activation.

      + +++ + + + +
      Parameters:type – the following types are supported +‘conv1d’, ‘conv2d’
      +
      + +
      +
      +parts.cnns.conv_blocks.conv_bn_actv(type, name, inputs, filters, kernel_size, activation_fn, strides, padding, regularizer, training, data_format, bn_momentum, bn_epsilon)[source]
      +

      Helper function that applies convolution, batch norm and activation. +Accepts inputs in ‘channels_last’ format only.

      + +++ + + + +
      Parameters:type – the following types are supported +‘conv1d’, ‘conv2d’
      +
      + +
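      A hedged usage sketch of conv_bn_actv based on the signature above (the import path and the argument forms are assumptions):

      import tensorflow as tf
      from open_seq2seq.parts.cnns.conv_blocks import conv_bn_actv  # assumed import path

      x = tf.placeholder(tf.float32, shape=[None, 200, 64])   # [batch, time, features]
      y = conv_bn_actv(
          type="conv1d", name="conv1", inputs=x, filters=250, kernel_size=[11],
          activation_fn=tf.nn.relu, strides=[1], padding="SAME",
          regularizer=None, training=True, data_format="channels_last",
          bn_momentum=0.90, bn_epsilon=1e-3,
      )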
      +
      + + +
      + +
      + + +
      +
      + +
      + +
      + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/html/api-docs/parts.convs2s.html b/docs/html/api-docs/parts.convs2s.html new file mode 100644 index 000000000..264585c1d --- /dev/null +++ b/docs/html/api-docs/parts.convs2s.html @@ -0,0 +1,460 @@ + + + + + + + + + + + convs2s — OpenSeq2Seq 0.2 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      + + + + +
      + + + + + +
      + +
      + + + + + + + + + + + + + + + + + +
      + + + + +
      +
      +
      +
      + +
      +

      convs2s

      +
      +

      attention_wn_layer

      +

      Implementation of the attention layer for convs2s. +Inspired from https://github.com/tobyyouup/conv_seq2seq

      +
      +
      +class parts.convs2s.attention_wn_layer.AttentionLayerNormalized(in_dim, embed_size, layer_id, add_res)[source]
      +

      Bases: tensorflow.python.layers.base.Layer

      +

      Attention layer for convs2s with weight normalization

      +
      +
      +__init__(in_dim, embed_size, layer_id, add_res)[source]
      +

      initializes the attention layer. +It uses weight normalization for linear projections +(Salimans & Kingma, 2016): w = g * v / ||v||_2

      + +++ + + + +
      Parameters:
        +
      • in_dim – int last dimension of the inputs
      • +
      • embed_size – int target embedding size
      • +
      • layer_id – int the id of current convolution layer
      • +
      • add_res – bool whether residual connection should be added or not
      • +
      +
      +
      + +
      +
      +call(input, target_embed, encoder_output_a, encoder_output_b, input_attention_bias)[source]
      +

      Calculates the attention vectors.

      + +++ + + + + + +
      Parameters:
        +
      • input – A float32 tensor with shape [batch_size, length, in_dim]
      • +
      • target_embed – A float32 tensor with shape [batch_size, length, in_dim] +containing the target embeddings
      • +
      • encoder_output_a – A float32 tensor with shape [batch_size, length, out_dim] +containing the first encoder outputs, used as the keys
      • +
      • encoder_output_b – A float32 tensor with shape [batch_size, length, src_emb_dim] +containing the second encoder outputs, used as the values
      • +
      • input_attention_bias – A float32 tensor with shape [batch_size, length, 1] +containing the bias used to mask the paddings
      • +
      +
      Returns:

      float32 tensor with shape [batch_size, length, out_dim].

      +
      +
      + +
      + +
      +
      +

      conv_wn_layer

      +

      Implementation of a 1d convolutional layer with weight normalization. +Inspired from https://github.com/tobyyouup/conv_seq2seq

      +
      +
      +class parts.convs2s.conv_wn_layer.Conv1DNetworkNormalized(in_dim, out_dim, kernel_width, mode, layer_id, hidden_dropout, conv_padding, decode_padding)[source]
      +

      Bases: tensorflow.python.layers.base.Layer

      +

      1D convolutional layer with weight normalization

      +
      +
      +__init__(in_dim, out_dim, kernel_width, mode, layer_id, hidden_dropout, conv_padding, decode_padding)[source]
      +

      initializes the 1D convolution layer. +It uses weight normalization (Salimans & Kingma, 2016): w = g * v / ||v||_2

      + +++ + + + +
      Parameters:
        +
      • in_dim – int last dimension of the inputs
      • +
      • out_dim – int new dimension for the output
      • +
      • kernel_width – int width of kernel
      • +
      • mode – str the current mode
      • +
      • layer_id – int the id of current convolution layer
      • +
      • hidden_dropout – float the keep-dropout value used on the input. +Give 1.0 if no dropout. +It is used to initialize the weights of convolution.
      • +
      • conv_padding – str the type of padding done for convolution
      • +
      • decode_padding – bool specifies if this convolution layer is in the decoder or not; +in the decoder, padding is done explicitly before convolution
      • +
      +
      +
      + +
      +
      +call(input)[source]
      +

      Applies convolution with gated linear units on x.

      + +++ + + + + + +
      Parameters:x – A float32 tensor with shape [batch_size, length, in_dim]
      Returns:float32 tensor with shape [batch_size, length, out_dim].
      +
      + +
      +
      +gated_linear_units(inputs)[source]
      +

      Gated Linear Units (GLU) on x.

      + +++ + + + + + +
      Parameters:x – A float32 tensor with shape [batch_size, length, 2*out_dim]
      Returns:float32 tensor with shape [batch_size, length, out_dim].
      +
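      For reference, the GLU computation described above can be sketched as follows (an illustration of the formula, not the layer's internal code):

      import tensorflow as tf

      def glu(x):
        # x: [batch_size, length, 2*out_dim]; split the last axis in half
        # and gate one half with the sigmoid of the other
        a, b = tf.split(x, 2, axis=-1)
        return a * tf.sigmoid(b)        # [batch_size, length, out_dim]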
      + +
      + +
      +
      +

      ffn_wn_layer

      +

      Implementation of fully connected network with weight normalization. +Inspired from https://github.com/tobyyouup/conv_seq2seq

      +
      +
      +class parts.convs2s.ffn_wn_layer.FeedFowardNetworkNormalized(in_dim, out_dim, dropout, var_scope_name)[source]
      +

      Bases: tensorflow.python.layers.base.Layer

      +

      Fully connected feedforward network with weight normalization

      +
      +
      +__init__(in_dim, out_dim, dropout, var_scope_name)[source]
      +

      initializes the linear layer. +This layer projects from in_dim-dimensional space to out_dim-dimensional space. +It uses weight normalization (Salimans & Kingma, 2016): w = g * v / ||v||_2

      + +++ + + + +
      Parameters:
        +
      • in_dim – int last dimension of the inputs
      • +
      • out_dim – int new dimension for the output
      • +
      • dropout – float the keep-dropout value used in the previous layer. +It is used to initialize the weights. Give 1.0 if no dropout.
      • +
      • var_scope_name – str the scope name for the weight variables
      • +
      +
      +
      + +
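      A hedged illustration of the weight normalization referenced above, w = g * v / ||v||_2 (variable names and scoping are assumptions, not the layer's actual implementation):

      import tensorflow as tf

      def weight_norm_dense(x, in_dim, out_dim, name="ffn_wn"):
        with tf.variable_scope(name):
          v = tf.get_variable("v", shape=[in_dim, out_dim])
          g = tf.get_variable("g", shape=[out_dim])
          w = g * v / tf.norm(v, axis=0)      # normalize each output column of v to unit 2-norm
          return tf.tensordot(x, w, axes=1)   # handles [batch, length, in_dim] inputs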
      +
      +call(x)[source]
      +

      Projects x with its linear transformation.

      + +++ + + + + + +
      Parameters:x – A float32 tensor with shape [batch_size, length, in_dim]
      Returns:float32 tensor with shape [batch_size, length, out_dim].
      +
      + +
      + +
      +
      + + +
      + +
      + + +
      +
      + +
      + +
      + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/html/api-docs/parts.html b/docs/html/api-docs/parts.html index 214851326..06963ed73 100644 --- a/docs/html/api-docs/parts.html +++ b/docs/html/api-docs/parts.html @@ -101,6 +101,8 @@
    • parts
    • utils
    • @@ -189,13 +191,22 @@
    • transformer
    • +
    • convs2s +
    • +
    • cnns +
  • diff --git a/docs/html/api-docs/parts.rnns.html b/docs/html/api-docs/parts.rnns.html index 1950858be..583c41dea 100644 --- a/docs/html/api-docs/parts.rnns.html +++ b/docs/html/api-docs/parts.rnns.html @@ -110,6 +110,8 @@
  • transformer
  • +
  • convs2s
  • +
  • cnns
  • utils
  • @@ -1521,16 +1523,29 @@

    utils

    -
    -parts.rnns.utils.create_rnn_cell(cell_type, cell_params, num_layers=1, dp_input_keep_prob=1.0, dp_output_keep_prob=1.0, residual_connections=False, wrap_to_multi_rnn=True)[source]
    -

    TODO: MOVE THIS properly to utils. Write doc -:param cell_type: -:param cell_params: -:param num_layers: -:param dp_input_keep_prob: -:param dp_output_keep_prob: -:param residual_connections: -:return:

    +
    +parts.rnns.utils.single_cell(cell_class, cell_params, dp_input_keep_prob=1.0, dp_output_keep_prob=1.0, residual_connections=False)[source]
    +

    Creates an instance of the RNN cell. +Such a cell describes one step of one layer and can include a residual connection +and/or dropout.

    + +++ + + + + + +
    Parameters:
      +
    • cell_class – Tensorflow RNN cell class
    • +
    • cell_params (dict) – cell parameters
    • +
    • dp_input_keep_prob (float) – (default: 1.0) input dropout keep probability
    • +
    • dp_output_keep_prob (float) – (default: 1.0) output dropout keep probability
    • +
    • residual_connections (bool) – whether to add residual connection
    • +
    +
    Returns:

    TF RNN instance

    +
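    A hedged usage sketch of single_cell (TF 1.x API; the import path and the assumption that cell_params are passed as keyword arguments to cell_class are not confirmed here):

    import tensorflow as tf
    from open_seq2seq.parts.rnns.utils import single_cell  # assumed import path

    cell = single_cell(
        cell_class=tf.nn.rnn_cell.LSTMCell,
        cell_params={"num_units": 512},
        dp_input_keep_prob=1.0,
        dp_output_keep_prob=0.8,
        residual_connections=False,
    )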
    diff --git a/docs/html/api-docs/parts.transformer.html b/docs/html/api-docs/parts.transformer.html index 55d5d135c..e891a2849 100644 --- a/docs/html/api-docs/parts.transformer.html +++ b/docs/html/api-docs/parts.transformer.html @@ -33,7 +33,7 @@ - + @@ -103,13 +103,14 @@
  • transformer
  • +
  • convs2s
  • +
  • cnns
  • utils
  • @@ -675,51 +676,6 @@
    - -
    -

    beam_search_test

    -

    Test beam search helper methods.

    -
    -
    -class parts.transformer.beam_search_test.BeamSearchHelperTests(methodName='runTest')[source]
    -

    Bases: tensorflow.python.framework.test_util.TensorFlowTestCase

    -
    -
    -test_expand_to_beam_size()[source]
    -
    - -
    -
    -test_flatten_beam_dim()[source]
    -
    - -
    -
    -test_gather_beams()[source]
    -
    - -
    -
    -test_gather_topk_beams()[source]
    -
    - -
    -
    -test_get_shape_keep_last_dim()[source]
    -
    - -
    -
    -test_shape_list()[source]
    -
    - -
    -
    -test_unflatten_beam_dim()[source]
    -
    - -
    -

    common

    @@ -770,7 +726,7 @@

    Implementation of embedding layer with shared weights.

    -class parts.transformer.embedding_layer.EmbeddingSharedWeights(vocab_size, hidden_size, pad2eight=False)[source]
    +class parts.transformer.embedding_layer.EmbeddingSharedWeights(vocab_size, hidden_size, pad_vocab_to_eight=False, init_var=None, embed_scale=True, pad_sym=0, mask_paddings=True)[source]

    Bases: tensorflow.python.layers.base.Layer

    Calculates input embeddings and pre-softmax linear with shared weights.

    @@ -875,7 +831,7 @@
    -parts.transformer.utils.get_padding(x, padding_value=0)[source]
    +parts.transformer.utils.get_padding(x, padding_value=0, dtype=tf.float32)[source]

    Return float tensor representing the padding values in x.

    @@ -884,6 +840,7 @@ @@ -901,7 +858,7 @@
    -parts.transformer.utils.get_padding_bias(x)[source]
    +parts.transformer.utils.get_padding_bias(x, res_rank=4, pad_sym=0)[source]

    Calculate bias tensor from padding values in tensor.

    Bias tensor that is added to the pre-softmax multi-headed attention logits, which has shape [batch_size, num_heads, length, length]. The tensor is zero at @@ -910,9 +867,18 @@

    - + - +
    Parameters:
    • x – int tensor with any shape
    • padding_value – int value that
    • +
    • dtype – type of the output
    Parameters:x – int tensor with shape [batch_size, length]
    Parameters:
      +
    • x – int tensor with shape [batch_size, length]
    • +
    • res_rank – int indicates the rank of attention_bias.
    • +
    • dtype – type of the output attention_bias
    • +
    • pad_sym – int the symbol used for padding
    • +
    +
    Returns:Attention bias tensor of shape [batch_size, 1, 1, length].
    Returns:

    Attention bias tensor of shape +[batch_size, 1, 1, length] if res_rank = 4 - for Transformer +or [batch_size, 1, length] if res_rank = 3 - for ConvS2S

    +
    @@ -955,7 +921,7 @@
    +
    +
    +utils.utils.collect_if_horovod(value, hvd, mode='sum')[source]
    +

    Collects values from all workers if run on Horovod. +Note that on all workers except the first, this function will return None.

    + +++ + + + + + +
    Parameters:
      +
    • value – value to collect.
    • +
    • hvd – horovod.tensorflow module or None
    • +
    • mode – could be “sum”, “mean” or “gather”, indicating reduce_sum or gather. +For “sum” and “mean” value has to be numerical, for “gather”, value has +to be iterable.
    • +
    +
    Returns:

    collected results if run on Horovod or value otherwise.

    +
    +
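    A hedged usage sketch of collect_if_horovod as documented above (import path assumed; the numbers are illustrative):

    from open_seq2seq.utils.utils import collect_if_horovod  # assumed import path

    try:
        import horovod.tensorflow as hvd
    except ImportError:
        hvd = None

    total_objects = 12345.0                                   # per-worker value
    total_objects = collect_if_horovod(total_objects, hvd, mode="sum")
    if total_objects is not None:                             # only the first worker gets the reduced value
        print("total objects across workers:", total_objects)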
    +
    utils.utils.deco_print(line, offset=0, start='*** ', end='\n')[source]
    @@ -536,8 +561,8 @@

    utils

    -
    -utils.utils.iterate_data_layer(model, dl_id, sess, compute_loss, mode, verbose)[source]
    +
    +utils.utils.iterate_data(model, sess, compute_loss, mode, verbose)[source]
    @@ -579,7 +604,7 @@

    utils diff --git a/docs/html/genindex.html b/docs/html/genindex.html index 426fae298..593fd7f67 100644 --- a/docs/html/genindex.html +++ b/docs/html/genindex.html @@ -201,6 +201,8 @@

    _

  • (decoders.rnn_decoders.BeamSearchRNNDecoderWithAttention method)
  • (decoders.rnn_decoders.RNNDecoderWithAttention method) +
  • +
  • (encoders.cnn_encoder.CNNEncoder method)
  • (encoders.ds2_encoder.DeepSpeech2Encoder method)
  • @@ -209,8 +211,12 @@

    _

  • (encoders.rnn_encoders.BidirectionalRNNEncoderWithEmbedding method)
  • (encoders.rnn_encoders.GNMTLikeEncoderWithEmbedding method) +
  • +
  • (encoders.rnn_encoders.GNMTLikeEncoderWithEmbedding_cuDNN method)
  • (encoders.rnn_encoders.UnidirectionalRNNEncoderWithEmbedding method) +
  • +
  • (encoders.w2l_encoder.Wave2LetterEncoder method)
  • (losses.ctc_loss.CTCLoss method)
  • @@ -224,7 +230,11 @@

    _

  • (models.model.Model method)
  • -
  • (optimizers.optimizers.DistributedOptimizer method) +
  • (parts.convs2s.attention_wn_layer.AttentionLayerNormalized method) +
  • +
  • (parts.convs2s.conv_wn_layer.Conv1DNetworkNormalized method) +
  • +
  • (parts.convs2s.ffn_wn_layer.FeedFowardNetworkNormalized method)
  • (parts.rnns.attention_wrapper.AttentionWrapper method)
  • @@ -249,10 +259,6 @@

    _

  • (utils.hooks.BroadcastGlobalVariablesHook method)
  • -
  • _adaptive_max_norm() (in module optimizers.optimizers) -
  • -
  • _add_scaled_noise_to_gradients() (in module optimizers.optimizers) -
  • _aspect_preserving_resize() (in module data.image2label.imagenet_preprocessing)
  • _batch_examples() (in module data.text2text.t2t) @@ -327,6 +333,8 @@

    _

  • (encoders.rnn_encoders.BidirectionalRNNEncoderWithEmbedding method)
  • (encoders.rnn_encoders.UnidirectionalRNNEncoderWithEmbedding method) +
  • +
  • (encoders.w2l_encoder.Wave2LetterEncoder method)
  • _escape_token() (in module data.text2text.tokenizer) @@ -358,6 +366,18 @@

    _

  • _get_new_alive_state() (parts.transformer.beam_search.SequenceBeamSearch method)
  • _get_new_finished_state() (parts.transformer.beam_search.SequenceBeamSearch method) +
  • +
  • _get_num_objects_per_step() (models.image2label.Image2Label method) + +
  • +
  • _get_symbols_to_logits_fn() (decoders.convs2s_decoder.ConvS2SDecoder method)
  • _grow_alive_seq() (parts.transformer.beam_search.SequenceBeamSearch method)
  • @@ -379,13 +399,9 @@

    _

  • _maybe_split_batch_beams() (parts.rnns.rnn_beam_search_decoder.BeamSearchDecoder method)
  • -
  • _mean_image_subtraction() (in module data.image2label.imagenet_preprocessing) +
  • _mean_image_subtraction_and_normalization() (in module data.image2label.imagenet_preprocessing)
  • _merge_batch_beams() (parts.rnns.rnn_beam_search_decoder.BeamSearchDecoder method) -
  • -
  • _multiply_gradients() (in module optimizers.optimizers) -
  • -
  • _multiply_gradients_const() (in module optimizers.optimizers)
  • _native_to_unicode() (in module data.text2text.tokenizer)
  • @@ -433,8 +449,6 @@

    _

    A

    @@ -703,6 +733,8 @@

    E

      +
    • encoders.encoder (module) +
    • encoders.resnet_blocks (module)
    • encoders.resnet_encoder (module)
    • encoders.rnn_encoders (module) +
    • +
    • encoders.w2l_encoder (module)
    • END_OF_CHOICE (data.text2text.text2text.SpecialTextTokens attribute)
    • @@ -760,6 +798,8 @@

      F

      + -
      • parts.rnns.rnn_beam_search_decoder (module)
      • parts.rnns.slstm (module) @@ -1238,8 +1322,6 @@

        P

      • parts.transformer.attention_layer (module)
      • parts.transformer.beam_search (module) -
      • -
      • parts.transformer.beam_search_test (module)
      • parts.transformer.common (module)
      • @@ -1252,11 +1334,19 @@

        P

      • piecewise_constant() (in module optimizers.lr_policies)
      • poly_decay() (in module optimizers.lr_policies) +
      • +
      • post_process_gradients() (in module optimizers.optimizers) +
      • +
      • predict() (decoders.convs2s_decoder.ConvS2SDecoder method)
      • PrePostProcessingWrapper (class in parts.transformer.common)
      • -
      • preprocess_image() (in module data.image2label.imagenet_preprocessing) +
      • preprocess_image() (data.image2label.image2label.CifarDataLayer method) + +
      • PrintLossAndTimeHook (class in utils.hooks)
      • PrintSamplesHook (class in utils.hooks) @@ -1267,14 +1357,16 @@

        P

        R

      • split_heads() (parts.transformer.attention_layer.Attention method)
      • -
      • src_emb_size (encoders.rnn_encoders.BidirectionalRNNEncoderWithEmbedding attribute) +
      • src_emb_size (encoders.convs2s_encoder.ConvS2SEncoder attribute)
        • -
        • src_vocab_size (encoders.rnn_encoders.BidirectionalRNNEncoderWithEmbedding attribute) +
        • src_vocab_size (encoders.convs2s_encoder.ConvS2SEncoder attribute)
            +
          • (encoders.rnn_encoders.BidirectionalRNNEncoderWithEmbedding attribute) +
          • (encoders.rnn_encoders.GNMTLikeEncoderWithEmbedding attribute) +
          • +
          • (encoders.rnn_encoders.GNMTLikeEncoderWithEmbedding_cuDNN attribute)
          • (encoders.rnn_encoders.UnidirectionalRNNEncoderWithEmbedding attribute)
          • @@ -1359,30 +1461,16 @@

            S

            T

            - +
            • train() (in module utils.funcs)
            • train_input_fn() (in module data.text2text.t2t) @@ -1426,6 +1514,10 @@

              U

              W

              + + + + + + + + + + + + + @@ -392,6 +412,36 @@

              Python Module Index

              + + + + + + + + + + + + + + + + + + - - -
              • write() (utils.utils.Logger method)
              • diff --git a/docs/html/in-depth-tutorials/using-existing-models.html b/docs/html/in-depth-tutorials/using-existing-models.html index 54eb6113a..43480f24c 100644 --- a/docs/html/in-depth-tutorials/using-existing-models.html +++ b/docs/html/in-depth-tutorials/using-existing-models.html @@ -304,6 +304,11 @@

                How to run modelsmixed precision training section in docs. -
              • automatic_loss_scaling — automatic loss scaling mode. Could be -either None, “Backoff” or “Logmax”. For details see -mixed precision training section in docs.
              • +
              • loss_scaling — could be float or string. If float, static loss +scaling is applied. If string, the corresponding automatic +loss scaling algorithm is used. Must be one of ‘Backoff’ +or ‘LogMax’ (case insensitive). Only used when dtype=”mixed”. For details +see mixed precision training section in docs.
              • summaries (list) — which summaries to log. Could contain “learning_rate”, “gradients”, “gradient_norm”, “global_gradient_norm”, “variables”, “variable_norm”.
              • +
              • iter_size (int) — use this parameter to emulate large batches. +The gradients will be accumulated for iter_size number of steps before +applying update.
              • larc_params — dictionary with parameters for LARC (or LARS) optimization algorithms. Can contain the following parameters:
                • larc_mode — Could be either “scale” (LARS) or “clip” (LARC). diff --git a/docs/html/installation-instructions.html b/docs/html/installation-instructions.html index 4b0e1fba4..fc90be005 100644 --- a/docs/html/installation-instructions.html +++ b/docs/html/installation-instructions.html @@ -195,7 +195,7 @@

                  Running tests
                  python -m unittest discover -s open_seq2seq -p '*_test.py'
                   
                  -

                  It might take up to 10 minutes. You should see a lot of output, but no errors +

                  It might take up to 30 minutes. You should see a lot of output, but no errors in the end.

                  diff --git a/docs/html/models-and-recipes.html b/docs/html/models-and-recipes.html index 272cfa0bb..3e016d9c8 100644 --- a/docs/html/models-and-recipes.html +++ b/docs/html/models-and-recipes.html @@ -89,10 +89,7 @@
                • Getting started
                • Models and recipes
                • Distributed training
                • @@ -168,13 +165,15 @@

                  Note

                  Currently OpenSeq2Seq has model implementations for machine translation and -automatic speech recognition. All models work both in float32 and mixed precision. -We recommend you use mixed precision training when training on Volta GPUs.

                  +automatic speech recognition. +All models work both in float32 and mixed precision. +We recommend you use mixed precision training +when training on Volta GPUs.

                  -

                  To train models you can use the following -commands (don’t forget to substitute valid config_file path there).

                  +

                  To train models you can use the following commands (don’t forget to substitute +valid config_file path there and number of GPUs if using Horovod).

                  With Horovod (highly recommended when using multiple GPUs):

                  -mpirun --allow-run-as-root --mca orte_base_help_aggregate 0 -mca btl ^openib -np 4 -H localhost:4 -bind-to none -map-by slot -x LD_LIBRARY_PATH python run.py --config_file=... --mode=train_eval --use_horovod=True --enable_logs
                  +mpiexec --allow-run-as-root -np <num_gpus> python run.py --config_file=... --mode=train_eval --use_horovod=True --enable_logs
                   

                  Without Horovod:
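                  The non-Horovod command itself is unchanged by this patch and therefore not shown in the hunk below. As a rough illustration only, assuming the same run.py flags used in the Horovod command above and with the GPU count taken from num_gpus in the config rather than from mpiexec:

                  python run.py --config_file=... --mode=train_eval --enable_logs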

                  @@ -184,6 +183,13 @@

                  The description of implemented models is available in the next sections:

                  Machine translation


                  +The table below contains descriptions and results of
                  +machine translation models available in OpenSeq2Seq.
                  +Currently, we have GNMT-based, Transformer-based and
                  +ConvS2S-based models.


                  +We measure BLEU score on the newstest2014.tok.de file using the multi-bleu.perl script from Moses.
                  +For more details about model descriptions and training setup,
                  +have a look at the configuration files.
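                  For reference, that measurement is a single call to the Moses script; the output file name below is a placeholder, not a path from this repository:

                  perl multi-bleu.perl newstest2014.tok.de < model_output.tok.de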

                  @@ -201,55 +207,58 @@

                  Machine translation

                  en-de-nmt-small.py | 20.23 | This model should train on a single GPU such as a 1080Ti. It is trained using Adam optimizer. | RNN-based. Bi-directional encoder with 2 layers and GNMT-like decoder with 2 layers and attention. Uses LSTM cells of size 512. | link
                  en-de-gnmt-like-4GPUs.py | 23.89 | This model was trained on 4 GPUs with Adam optimizer and learning rate decay. | RNN-based. This is a GNMT-like model which tries to match the one described in https://arxiv.org/abs/1609.08144 as close as possible. | link
                  transformer-big.py | 26.17 | This model was trained on 4 GPUs with Adam optimizer and learning rate decay. | Transformer “big” model. This model does not have any RNN layers. | link
                  en-de-convs2s.py | xx.xx | This model was trained on 4 GPUs with Adam optimizer, learning rate decay and warm-up. | This is an implementation of the ConvS2S model proposed in https://arxiv.org/abs/1705.03122. | Coming soon.

                  -GNMT model description can be found here.
                  -Transformer model description can be found here.
                  -We measure BLEU score on newstest2014.tok.de file using multi-bleu.perl script from Mosses.


                  GNMT model description: https://arxiv.org/abs/1609.08144.


                  Transformer model description: https://arxiv.org/abs/1706.03762.


                  ConvS2S model description: https://arxiv.org/abs/1705.03122.

                  Speech recognition


                  Deep Speech 2 based models


                  -Original Deep Speech 2 model description: https://arxiv.org/abs/1512.02595.
                  -The table below contains description and results of
                  -Deep Speech 2 based models available in OpenSeq2Seq.


                  -WER-512 and WER-2048 is word error rate obtained with beam width of 512 and 2048
                  -correspondingly. For beam width of 2048 we also used batch_size_per_gpu = 1

                  +The table below contains descriptions and results of
                  +speech recognition models available in OpenSeq2Seq.
                  +Currently, we have DeepSpeech2-based models and Wav2Letter-based models.


                  +WER is the word error rate obtained on a dev-clean subset of LibriSpeech using
                  +greedy decoder (decoder_params/use_language_model = False).
                  +For the final evaluation we used batch_size_per_gpu = 1
                  +to eliminate the effect of the cudnn padding issue.
                   For more details about model descriptions and training setup,
                   have a look at the configuration files.
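                  The two evaluation settings mentioned above correspond to two config values. The snippet below is only a sketch: base_params is assumed to be the usual OpenSeq2Seq config dictionary, and everything unrelated to these two options is omitted.

                  base_params = {
                      # Greedy decoding: disable the language model in the decoder.
                      "decoder_params": {
                          "use_language_model": False,
                      },
                      # One utterance per GPU, to avoid the cudnn padding effect noted above.
                      "batch_size_per_gpu": 1,
                  }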



                  Deep Speech 2 based models

                  -Config file | WER-512 | WER-2048 | Training setup and additional comments | Short description of the model | Checkpoint
                  +Config file | WER | Training setup and additional comments | Short description of the model | Checkpoint
                  -ds2_large_8gpus.py | 4.90% (WER-512) | 4.59% (WER-2048)
                  +ds2_large_8gpus.py | 9.28% | This model was trained for 50 epochs using SGD with Momentum and LARC on the full LibriSpeech in a few days using Horovod on eight GPUs. | This model has 2 convolutional layers and 5 bidirectional GRU layers with 800 units. | link
                  -ds2_medium_4gpus.py | 6.12% (WER-512) | 5.49% (WER-2048)
                  +ds2_medium_4gpus.py | 22.60% | This model was trained for 50 epochs using Adam on the full LibriSpeech in a few days using Horovod on four GPUs. | This model has 3 convolutional layers and 3 unidirectional … | link


                  -ds2_small_1gpu.py | 11.77% (WER-512) | 9.32% (WER-2048)
                  +ds2_small_1gpu.py | 39.08% | This model was trained for 12 epochs using Adam on a “clean” subset of LibriSpeech in less than a day using a single GPU. | This model has 2 convolutional layers and 2 bidirectional GRU layers with 512 units. | link
                  +w2l_large_8gpus.py | 15.44% | This model was trained for 18 epochs (with early stopping based on validation loss) using SGD with Momentum and LARC on the full LibriSpeech in a few days on eight GPUs. | The model has 19 convolutional layers (200–1000 units, 7–21 kernel size). We use batch norm between all layers. | link

                  Deep Speech 2 model description: https://arxiv.org/abs/1512.02595.


                  Wav2Letter model description: https://arxiv.org/abs/1609.03193, https://arxiv.org/abs/1712.09444.

                  diff --git a/docs/html/objects.inv b/docs/html/objects.inv
                  index 6559ef1e7b336eb68d4010b2a768563e69ebc44a..929c2fa7d1428d375a060158c403acd1b906096f 100644
                  GIT binary patch
                  delta 5202
                  (binary delta data omitted)

              Python Module Index

              decoders
                  + decoders.convs2s_decoder +
                  @@ -266,6 +271,16 @@

              Python Module Index

              encoders
                  + encoders.cnn_encoder +
                  + encoders.convs2s_encoder +
                  @@ -291,6 +306,11 @@

              Python Module Index

                  encoders.rnn_encoders
                  + encoders.w2l_encoder +
               
              l
              parts
                  + parts.cnns +
                  + parts.cnns.conv_blocks +
                  + parts.convs2s +
                  + parts.convs2s.attention_wn_layer +
                  + parts.convs2s.conv_wn_layer +
                  + parts.convs2s.ffn_wn_layer +
                  @@ -447,11 +497,6 @@

              Python Module Index

                  parts.transformer.beam_search
                  - parts.transformer.beam_search_test -
                  diff --git a/docs/html/searchindex.js b/docs/html/searchindex.js index d1ffd100f..52eb7af85 100644 --- a/docs/html/searchindex.js +++ b/docs/html/searchindex.js @@ -1 +1 @@ -Search.setIndex({docnames:["api-docs/data","api-docs/data.image2label","api-docs/data.speech2text","api-docs/data.text2text","api-docs/decoders","api-docs/encoders","api-docs/losses","api-docs/models","api-docs/modules","api-docs/optimizers","api-docs/parts","api-docs/parts.rnns","api-docs/parts.transformer","api-docs/utils","distr-training","extending","extending/adding-new-data-layer","extending/adding-new-decoder","extending/adding-new-encoder","extending/adding-new-loss","getting-started","getting-started/asr","getting-started/nmt","in-depth-tutorials","in-depth-tutorials/internal-structure","in-depth-tutorials/using-existing-models","index","installation-instructions","mixed-precision","models-and-recipes"],envversion:53,filenames:["api-docs/data.rst","api-docs/data.image2label.rst","api-docs/data.speech2text.rst","api-docs/data.text2text.rst","api-docs/decoders.rst","api-docs/encoders.rst","api-docs/losses.rst","api-docs/models.rst","api-docs/modules.rst","api-docs/optimizers.rst","api-docs/parts.rst","api-docs/parts.rnns.rst","api-docs/parts.transformer.rst","api-docs/utils.rst","distr-training.rst","extending.rst","extending/adding-new-data-layer.rst","extending/adding-new-decoder.rst","extending/adding-new-encoder.rst","extending/adding-new-loss.rst","getting-started.rst","getting-started/asr.rst","getting-started/nmt.rst","in-depth-tutorials.rst","in-depth-tutorials/internal-structure.rst","in-depth-tutorials/using-existing-models.rst","index.rst","installation-instructions.rst","mixed-precision.rst","models-and-recipes.rst"],objects:{"":{data:[0,0,0,"-"],decoders:[4,0,0,"-"],encoders:[5,0,0,"-"],losses:[6,0,0,"-"],models:[7,0,0,"-"],optimizers:[9,0,0,"-"],parts:[10,0,0,"-"],utils:[13,0,0,"-"]},"data.data_layer":{DataLayer:[0,1,1,""]},"data.data_layer.DataLayer":{__init__:[0,2,1,""],build_graph:[0,2,1,""],get_optional_params:[0,3,1,""],get_required_params:[0,3,1,""],get_size_in_samples:[0,2,1,""],input_tensors:[0,4,1,""],iterator:[0,4,1,""],params:[0,4,1,""]},"data.image2label":{image2label:[1,0,0,"-"],imagenet_preprocessing:[1,0,0,"-"]},"data.image2label.image2label":{ImagenetDataLayer:[1,1,1,""]},"data.image2label.image2label.ImagenetDataLayer":{build_graph:[1,2,1,""],get_optional_params:[1,3,1,""],get_required_params:[1,3,1,""],get_size_in_samples:[1,2,1,""],input_tensors:[1,4,1,""],iterator:[1,4,1,""],split_data:[1,2,1,""]},"data.image2label.imagenet_preprocessing":{_aspect_preserving_resize:[1,5,1,""],_central_crop:[1,5,1,""],_decode_crop_and_flip:[1,5,1,""],_mean_image_subtraction:[1,5,1,""],_parse_example_proto:[1,5,1,""],_resize_image:[1,5,1,""],_smallest_size_at_least:[1,5,1,""],parse_record:[1,5,1,""],preprocess_image:[1,5,1,""]},"data.speech2text":{speech2text:[2,0,0,"-"],speech_utils:[2,0,0,"-"]},"data.speech2text.speech2text":{Speech2TextDataLayer:[2,1,1,""]},"data.speech2text.speech2text.Speech2TextDataLayer":{__init__:[2,2,1,""],_parse_audio_element:[2,2,1,""],_parse_audio_transcript_element:[2,2,1,""],build_graph:[2,2,1,""],get_optional_params:[2,3,1,""],get_required_params:[2,3,1,""],get_size_in_samples:[2,2,1,""],input_tensors:[2,4,1,""],iterator:[2,4,1,""],split_data:[2,2,1,""]},"data.speech2text.speech_utils":{augment_audio_signal:[2,5,1,""],get_speech_features:[2,5,1,""],get_speech_features_from_file:[2,5,1,""]},"data.text2text":{t2t:[3,0,0,"-"],text2text:[3,0,0,"-"],toke
nizer:[3,0,0,"-"]},"data.text2text.t2t":{_batch_examples:[3,5,1,""],_create_min_max_boundaries:[3,5,1,""],_filter_max_length:[3,5,1,""],_get_example_length:[3,5,1,""],_load_records:[3,5,1,""],_parse_example:[3,5,1,""],_read_and_batch_from_files:[3,5,1,""],eval_input_fn:[3,5,1,""],train_input_fn:[3,5,1,""]},"data.text2text.text2text":{ParallelTextDataLayer:[3,1,1,""],SpecialTextTokens:[3,1,1,""],TransformerDataLayer:[3,1,1,""]},"data.text2text.text2text.ParallelTextDataLayer":{build_graph:[3,2,1,""],get_optional_params:[3,3,1,""],get_required_params:[3,3,1,""],get_size_in_samples:[3,2,1,""],input_tensors:[3,4,1,""],iterator:[3,4,1,""]},"data.text2text.text2text.SpecialTextTokens":{END_OF_CHOICE:[3,4,1,""],EOS_ID:[3,4,1,""],OUT_OF_BUCKET:[3,4,1,""],PAD_ID:[3,4,1,""],S_ID:[3,4,1,""],UNK_ID:[3,4,1,""]},"data.text2text.text2text.TransformerDataLayer":{build_graph:[3,2,1,""],get_optional_params:[3,3,1,""],get_required_params:[3,3,1,""],input_tensors:[3,4,1,""],iterator:[3,4,1,""]},"data.text2text.tokenizer":{Subtokenizer:[3,1,1,""],_count_and_gen_subtokens:[3,5,1,""],_count_tokens:[3,5,1,""],_escape_token:[3,5,1,""],_filter_and_bucket_subtokens:[3,5,1,""],_gen_new_subtoken_list:[3,5,1,""],_generate_alphabet_dict:[3,5,1,""],_generate_subtokens:[3,5,1,""],_generate_subtokens_with_target_vocab_size:[3,5,1,""],_join_tokens_to_string:[3,5,1,""],_list_to_index_dict:[3,5,1,""],_load_vocab_file:[3,5,1,""],_native_to_unicode:[3,5,1,""],_save_vocab_file:[3,5,1,""],_split_string_to_tokens:[3,5,1,""],_split_token_to_subtokens:[3,5,1,""],_unescape_token:[3,5,1,""],_unicode_to_native:[3,5,1,""]},"data.text2text.tokenizer.Subtokenizer":{__init__:[3,2,1,""],_subtoken_ids_to_tokens:[3,2,1,""],_token_to_subtoken_ids:[3,2,1,""],decode:[3,2,1,""],encode:[3,2,1,""],init_from_files:[3,3,1,""]},"data.utils":{load_pre_existing_vocabulary:[0,5,1,""],pad_vocab_to_eight:[0,5,1,""]},"decoders.decoder":{Decoder:[4,1,1,""]},"decoders.decoder.Decoder":{__init__:[4,2,1,""],_cast_types:[4,2,1,""],_decode:[4,2,1,""],decode:[4,2,1,""],get_optional_params:[4,3,1,""],get_required_params:[4,3,1,""],mode:[4,4,1,""],name:[4,4,1,""],params:[4,4,1,""]},"decoders.fc_decoders":{FullyConnectedCTCDecoder:[4,1,1,""],FullyConnectedDecoder:[4,1,1,""],FullyConnectedTimeDecoder:[4,1,1,""]},"decoders.fc_decoders.FullyConnectedCTCDecoder":{__init__:[4,2,1,""],get_optional_params:[4,3,1,""],get_required_params:[4,3,1,""]},"decoders.fc_decoders.FullyConnectedDecoder":{__init__:[4,2,1,""],_decode:[4,2,1,""],get_required_params:[4,3,1,""]},"decoders.fc_decoders.FullyConnectedTimeDecoder":{__init__:[4,2,1,""],_decode:[4,2,1,""],get_optional_params:[4,3,1,""],get_required_params:[4,3,1,""]},"decoders.rnn_decoders":{BeamSearchRNNDecoderWithAttention:[4,1,1,""],RNNDecoderWithAttention:[4,1,1,""]},"decoders.rnn_decoders.BeamSearchRNNDecoderWithAttention":{__init__:[4,2,1,""],_decode:[4,2,1,""],get_optional_params:[4,3,1,""]},"decoders.rnn_decoders.RNNDecoderWithAttention":{__init__:[4,2,1,""],_build_attention:[4,2,1,""],_decode:[4,2,1,""],get_optional_params:[4,3,1,""],get_required_params:[4,3,1,""]},"encoders.ds2_encoder":{DeepSpeech2Encoder:[5,1,1,""],conv2d_bn_actv:[5,5,1,""],rnn_cell:[5,5,1,""],row_conv:[5,5,1,""]},"encoders.ds2_encoder.DeepSpeech2Encoder":{__init__:[5,2,1,""],_encode:[5,2,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""]},"encoders.encoder":{Encoder:[5,1,1,""]},"encoders.encoder.Encoder":{__init__:[5,2,1,""],_cast_types:[5,2,1,""],_encode:[5,2,1,""],encode:[5,2,1,""],get_optional_params:[5,3,1,""],get_required_params:
[5,3,1,""],mode:[5,4,1,""],name:[5,4,1,""],params:[5,4,1,""]},"encoders.resnet_blocks":{batch_norm:[5,5,1,""],block_layer:[5,5,1,""],bottleneck_block_v1:[5,5,1,""],bottleneck_block_v2:[5,5,1,""],building_block_v1:[5,5,1,""],building_block_v2:[5,5,1,""],conv2d_fixed_padding:[5,5,1,""],fixed_padding:[5,5,1,""]},"encoders.resnet_encoder":{ResNetEncoder:[5,1,1,""]},"encoders.resnet_encoder.ResNetEncoder":{get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""]},"encoders.rnn_encoders":{BidirectionalRNNEncoderWithEmbedding:[5,1,1,""],GNMTLikeEncoderWithEmbedding:[5,1,1,""],UnidirectionalRNNEncoderWithEmbedding:[5,1,1,""]},"encoders.rnn_encoders.BidirectionalRNNEncoderWithEmbedding":{__init__:[5,2,1,""],_encode:[5,2,1,""],enc_emb_w:[5,4,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""],src_emb_size:[5,4,1,""],src_vocab_size:[5,4,1,""]},"encoders.rnn_encoders.GNMTLikeEncoderWithEmbedding":{__init__:[5,2,1,""],enc_emb_w:[5,4,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""],src_emb_size:[5,4,1,""],src_vocab_size:[5,4,1,""]},"encoders.rnn_encoders.UnidirectionalRNNEncoderWithEmbedding":{__init__:[5,2,1,""],_encode:[5,2,1,""],enc_emb_w:[5,4,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""],src_emb_size:[5,4,1,""],src_vocab_size:[5,4,1,""]},"losses.cross_entropy_loss":{CrossEntropyLoss:[6,1,1,""]},"losses.ctc_loss":{CTCLoss:[6,1,1,""],dense_to_sparse:[6,5,1,""]},"losses.ctc_loss.CTCLoss":{__init__:[6,2,1,""],_compute_loss:[6,2,1,""],get_optional_params:[6,3,1,""]},"losses.loss":{Loss:[6,1,1,""]},"losses.loss.Loss":{__init__:[6,2,1,""],_cast_types:[6,2,1,""],_compute_loss:[6,2,1,""],compute_loss:[6,2,1,""],get_optional_params:[6,3,1,""],get_required_params:[6,3,1,""],name:[6,4,1,""],params:[6,4,1,""]},"losses.sequence_loss":{BasicSequenceLoss:[6,1,1,""],CrossEntropyWithSmoothing:[6,1,1,""],PaddedCrossEntropyLossWithSmoothing:[6,1,1,""]},"losses.sequence_loss.BasicSequenceLoss":{__init__:[6,2,1,""],_compute_loss:[6,2,1,""],get_optional_params:[6,3,1,""],get_required_params:[6,3,1,""]},"losses.sequence_loss.CrossEntropyWithSmoothing":{__init__:[6,2,1,""],_compute_loss:[6,2,1,""],get_optional_params:[6,3,1,""],get_required_params:[6,3,1,""]},"losses.sequence_loss.PaddedCrossEntropyLossWithSmoothing":{get_optional_params:[6,3,1,""]},"models.encoder_decoder":{EncoderDecoderModel:[7,1,1,""]},"models.encoder_decoder.EncoderDecoderModel":{__init__:[7,2,1,""],_build_forward_pass_graph:[7,2,1,""],_create_decoder:[7,2,1,""],_create_encoder:[7,2,1,""],_create_loss:[7,2,1,""],decoder:[7,4,1,""],encoder:[7,4,1,""],get_optional_params:[7,3,1,""],get_required_params:[7,3,1,""],loss_computator:[7,4,1,""]},"models.image2label":{Image2Label:[7,1,1,""]},"models.image2label.Image2Label":{evaluate:[7,2,1,""],finalize_evaluation:[7,2,1,""],get_num_objects_per_step:[7,2,1,""],maybe_print_logs:[7,2,1,""]},"models.model":{Model:[7,1,1,""]},"models.model.Model":{__init__:[7,2,1,""],_build_forward_pass_graph:[7,2,1,""],clip_last_batch:[7,2,1,""],compile:[7,2,1,""],evaluate:[7,2,1,""],finalize_evaluation:[7,2,1,""],finalize_inference:[7,2,1,""],get_data_layer:[7,2,1,""],get_num_objects_per_step:[7,2,1,""],get_optional_params:[7,3,1,""],get_output_tensors:[7,2,1,""],get_required_params:[7,3,1,""],get_tf_dtype:[7,2,1,""],hvd:[7,4,1,""],infer:[7,2,1,""],last_step:[7,4,1,""],maybe_print_logs:[7,2,1,""],mode:[7,4,1,""],num_gpus:[7,4,1,""],on_horovod:[7,4,1,""],params:[7,4,1,""],steps_in_epoch:[7,4,1,""]},"models.speech2text":{Speech2Text:[7,1,1,""],levenshtein:[7,5,1,""],spars
e_tensor_to_chars:[7,5,1,""]},"models.speech2text.Speech2Text":{evaluate:[7,2,1,""],finalize_evaluation:[7,2,1,""],finalize_inference:[7,2,1,""],get_num_objects_per_step:[7,2,1,""],infer:[7,2,1,""],maybe_print_logs:[7,2,1,""]},"models.text2text":{Text2Text:[7,1,1,""],calculate_bleu:[7,5,1,""],transform_for_bleu:[7,5,1,""]},"models.text2text.Text2Text":{evaluate:[7,2,1,""],finalize_evaluation:[7,2,1,""],finalize_inference:[7,2,1,""],get_num_objects_per_step:[7,2,1,""],infer:[7,2,1,""],maybe_print_logs:[7,2,1,""]},"optimizers.automatic_loss_scaler":{AutomaticLossScaler:[9,1,1,""],BackoffScaler:[9,1,1,""],LogMaxScaler:[9,1,1,""]},"optimizers.automatic_loss_scaler.AutomaticLossScaler":{SUPPORTED_ALGOS:[9,4,1,""],check_grads:[9,3,1,""],loss_scale:[9,4,1,""],update_op:[9,2,1,""]},"optimizers.automatic_loss_scaler.BackoffScaler":{loss_scale:[9,4,1,""],update_op:[9,2,1,""]},"optimizers.automatic_loss_scaler.LogMaxScaler":{loss_scale:[9,4,1,""],update_op:[9,2,1,""]},"optimizers.lr_policies":{exp_decay:[9,5,1,""],fixed_lr:[9,5,1,""],piecewise_constant:[9,5,1,""],poly_decay:[9,5,1,""],transformer_policy:[9,5,1,""]},"optimizers.mp_wrapper":{MixedPrecisionOptimizerWrapper:[9,1,1,""],mp_regularizer_wrapper:[9,5,1,""]},"optimizers.mp_wrapper.MixedPrecisionOptimizerWrapper":{apply_gradients:[9,2,1,""],compute_gradients:[9,2,1,""]},"optimizers.optimizers":{DistributedOptimizer:[9,1,1,""],_adaptive_max_norm:[9,5,1,""],_add_scaled_noise_to_gradients:[9,5,1,""],_clip_gradients_by_norm:[9,5,1,""],_multiply_gradients:[9,5,1,""],_multiply_gradients_const:[9,5,1,""],adaptive_clipping_fn:[9,5,1,""],get_regularization_loss:[9,5,1,""],optimize_loss:[9,5,1,""]},"optimizers.optimizers.DistributedOptimizer":{__init__:[9,2,1,""],apply_gradients:[9,2,1,""],compute_gradients:[9,2,1,""]},"parts.rnns":{attention_wrapper:[11,0,0,"-"],flstm:[11,0,0,"-"],glstm:[11,0,0,"-"],gnmt:[11,0,0,"-"],rnn_beam_search_decoder:[11,0,0,"-"],slstm:[11,0,0,"-"],utils:[11,0,0,"-"]},"parts.rnns.attention_wrapper":{AttentionMechanism:[11,1,1,""],AttentionWrapper:[11,1,1,""],AttentionWrapperState:[11,1,1,""],BahdanauAttention:[11,1,1,""],BahdanauMonotonicAttention:[11,1,1,""],LuongAttention:[11,1,1,""],LuongMonotonicAttention:[11,1,1,""],hardmax:[11,5,1,""],monotonic_attention:[11,5,1,""],safe_cumprod:[11,5,1,""]},"parts.rnns.attention_wrapper.AttentionMechanism":{alignments_size:[11,4,1,""],state_size:[11,4,1,""]},"parts.rnns.attention_wrapper.AttentionWrapper":{__init__:[11,2,1,""],_item_or_tuple:[11,2,1,""],call:[11,2,1,""],output_size:[11,4,1,""],state_size:[11,4,1,""],zero_state:[11,2,1,""]},"parts.rnns.attention_wrapper.AttentionWrapperState":{clone:[11,2,1,""]},"parts.rnns.attention_wrapper.BahdanauAttention":{__init__:[11,2,1,""]},"parts.rnns.attention_wrapper.BahdanauMonotonicAttention":{__init__:[11,2,1,""]},"parts.rnns.attention_wrapper.LuongAttention":{__init__:[11,2,1,""]},"parts.rnns.attention_wrapper.LuongMonotonicAttention":{__init__:[11,2,1,""]},"parts.rnns.flstm":{FLSTMCell:[11,1,1,""]},"parts.rnns.flstm.FLSTMCell":{__init__:[11,2,1,""],call:[11,2,1,""],output_size:[11,4,1,""],state_size:[11,4,1,""]},"parts.rnns.glstm":{GLSTMCell:[11,1,1,""]},"parts.rnns.glstm.GLSTMCell":{__init__:[11,2,1,""],_get_input_for_group:[11,2,1,""],call:[11,2,1,""],output_size:[11,4,1,""],state_size:[11,4,1,""]},"parts.rnns.gnmt":{GNMTAttentionMultiCell:[11,1,1,""],gnmt_residual_fn:[11,5,1,""]},"parts.rnns.gnmt.GNMTAttentionMultiCell":{__init__:[11,2,1,""]},"parts.rnns.rnn_beam_search_decoder":{BeamSearchDecoder:[11,1,1,""],BeamSearchDecoderOutput:[11,1
,1,""],BeamSearchDecoderState:[11,1,1,""],FinalBeamSearchDecoderOutput:[11,1,1,""],tile_batch:[11,5,1,""]},"parts.rnns.rnn_beam_search_decoder.BeamSearchDecoder":{__init__:[11,2,1,""],_maybe_merge_batch_beams:[11,2,1,""],_maybe_split_batch_beams:[11,2,1,""],_merge_batch_beams:[11,2,1,""],_split_batch_beams:[11,2,1,""],batch_size:[11,4,1,""],finalize:[11,2,1,""],initialize:[11,2,1,""],output_dtype:[11,4,1,""],output_size:[11,4,1,""],step:[11,2,1,""],tracks_own_finished:[11,4,1,""]},"parts.rnns.slstm":{BasicSLSTMCell:[11,1,1,""],_linear:[11,5,1,""]},"parts.rnns.slstm.BasicSLSTMCell":{__init__:[11,2,1,""],call:[11,2,1,""],output_size:[11,4,1,""],state_size:[11,4,1,""]},"parts.rnns.utils":{create_rnn_cell:[11,5,1,""]},"parts.transformer":{attention_layer:[12,0,0,"-"],beam_search:[12,0,0,"-"],beam_search_test:[12,0,0,"-"],common:[12,0,0,"-"],embedding_layer:[12,0,0,"-"],ffn_layer:[12,0,0,"-"],utils:[12,0,0,"-"]},"parts.transformer.attention_layer":{Attention:[12,1,1,""],SelfAttention:[12,1,1,""]},"parts.transformer.attention_layer.Attention":{call:[12,2,1,""],combine_heads:[12,2,1,""],split_heads:[12,2,1,""]},"parts.transformer.attention_layer.SelfAttention":{call:[12,2,1,""]},"parts.transformer.beam_search":{SequenceBeamSearch:[12,1,1,""],_StateKeys:[12,1,1,""],_expand_to_beam_size:[12,5,1,""],_flatten_beam_dim:[12,5,1,""],_gather_beams:[12,5,1,""],_gather_topk_beams:[12,5,1,""],_length_normalization:[12,5,1,""],_shape_list:[12,5,1,""],_unflatten_beam_dim:[12,5,1,""],sequence_beam_search:[12,5,1,""]},"parts.transformer.beam_search.SequenceBeamSearch":{_continue_search:[12,2,1,""],_create_initial_state:[12,2,1,""],_get_new_alive_state:[12,2,1,""],_get_new_finished_state:[12,2,1,""],_grow_alive_seq:[12,2,1,""],_search_step:[12,2,1,""],search:[12,2,1,""]},"parts.transformer.beam_search._StateKeys":{ALIVE_CACHE:[12,4,1,""],ALIVE_LOG_PROBS:[12,4,1,""],ALIVE_SEQ:[12,4,1,""],CUR_INDEX:[12,4,1,""],FINISHED_FLAGS:[12,4,1,""],FINISHED_SCORES:[12,4,1,""],FINISHED_SEQ:[12,4,1,""]},"parts.transformer.beam_search_test":{BeamSearchHelperTests:[12,1,1,""]},"parts.transformer.beam_search_test.BeamSearchHelperTests":{test_expand_to_beam_size:[12,2,1,""],test_flatten_beam_dim:[12,2,1,""],test_gather_beams:[12,2,1,""],test_gather_topk_beams:[12,2,1,""],test_get_shape_keep_last_dim:[12,2,1,""],test_shape_list:[12,2,1,""],test_unflatten_beam_dim:[12,2,1,""]},"parts.transformer.common":{LayerNormalization:[12,1,1,""],PrePostProcessingWrapper:[12,1,1,""]},"parts.transformer.common.LayerNormalization":{build:[12,2,1,""],call:[12,2,1,""]},"parts.transformer.embedding_layer":{EmbeddingSharedWeights:[12,1,1,""]},"parts.transformer.embedding_layer.EmbeddingSharedWeights":{build:[12,2,1,""],call:[12,2,1,""],linear:[12,2,1,""]},"parts.transformer.ffn_layer":{FeedFowardNetwork:[12,1,1,""]},"parts.transformer.ffn_layer.FeedFowardNetwork":{call:[12,2,1,""]},"parts.transformer.utils":{get_decoder_self_attention_bias:[12,5,1,""],get_padding:[12,5,1,""],get_padding_bias:[12,5,1,""],get_position_encoding:[12,5,1,""]},"utils.funcs":{evaluate:[13,5,1,""],infer:[13,5,1,""],restore_and_get_results:[13,5,1,""],train:[13,5,1,""]},"utils.hooks":{BroadcastGlobalVariablesHook:[13,1,1,""],PrintLossAndTimeHook:[13,1,1,""],PrintSamplesHook:[13,1,1,""],RunEvaluationHook:[13,1,1,""]},"utils.hooks.BroadcastGlobalVariablesHook":{__init__:[13,2,1,""],after_create_session:[13,2,1,""],begin:[13,2,1,""]},"utils.hooks.PrintLossAndTimeHook":{after_run:[13,2,1,""],before_run:[13,2,1,""],begin:[13,2,1,""]},"utils.hooks.PrintSamplesHook":{after_run:[13,2,1
,""],before_run:[13,2,1,""],begin:[13,2,1,""]},"utils.hooks.RunEvaluationHook":{after_run:[13,2,1,""],before_run:[13,2,1,""],begin:[13,2,1,""]},"utils.utils":{Logger:[13,1,1,""],array_to_string:[13,5,1,""],cast_types:[13,5,1,""],check_params:[13,5,1,""],clip_last_batch:[13,5,1,""],clip_sparse:[13,5,1,""],deco_print:[13,5,1,""],flatten_dict:[13,5,1,""],get_available_gpus:[13,5,1,""],get_git_diff:[13,5,1,""],get_git_hash:[13,5,1,""],get_results_for_epoch:[13,5,1,""],iterate_data_layer:[13,5,1,""],log_summaries_from_dict:[13,5,1,""],mask_nans:[13,5,1,""],nest_dict:[13,5,1,""],nested_update:[13,5,1,""],text_ids_to_string:[13,5,1,""]},"utils.utils.Logger":{flush:[13,2,1,""],write:[13,2,1,""]},data:{data_layer:[0,0,0,"-"],image2label:[1,0,0,"-"],speech2text:[2,0,0,"-"],text2text:[3,0,0,"-"],utils:[0,0,0,"-"]},decoders:{decoder:[4,0,0,"-"],fc_decoders:[4,0,0,"-"],rnn_decoders:[4,0,0,"-"]},encoders:{ds2_encoder:[5,0,0,"-"],encoder:[5,0,0,"-"],resnet_blocks:[5,0,0,"-"],resnet_encoder:[5,0,0,"-"],rnn_encoders:[5,0,0,"-"]},losses:{cross_entropy_loss:[6,0,0,"-"],ctc_loss:[6,0,0,"-"],loss:[6,0,0,"-"],sequence_loss:[6,0,0,"-"]},models:{encoder_decoder:[7,0,0,"-"],image2label:[7,0,0,"-"],model:[7,0,0,"-"],speech2text:[7,0,0,"-"],text2text:[7,0,0,"-"]},optimizers:{automatic_loss_scaler:[9,0,0,"-"],lr_policies:[9,0,0,"-"],mp_wrapper:[9,0,0,"-"],optimizers:[9,0,0,"-"]},parts:{rnns:[11,0,0,"-"],transformer:[12,0,0,"-"]},utils:{funcs:[13,0,0,"-"],hooks:[13,0,0,"-"],utils:[13,0,0,"-"]}},objnames:{"0":["py","module","Python module"],"1":["py","class","Python class"],"2":["py","method","Python method"],"3":["py","staticmethod","Python static method"],"4":["py","attribute","Python attribute"],"5":["py","function","Python function"]},objtypes:{"0":"py:module","1":"py:class","2":"py:method","3":"py:staticmethod","4":"py:attribute","5":"py:function"},terms:{"106gb":21,"1080ti":29,"16xlarg":28,"1e6":3,"1e9":12,"1x1":5,"224gb":21,"2xlarg":28,"4gpu":29,"55gb":21,"5gb":27,"8xlarg":28,"\u03b1":22,"\u03b4":22,"\u03b5":22,"\u03b6":22,"\u03ba":22,"abstract":[0,4,5,6,7],"boolean":[1,5,11],"byte":3,"case":[0,4,5,6,7,11,14,16,25,28],"char":0,"class":[0,1,2,3,4,5,6,7,9,11,12,13,16,25],"default":[3,5,6,9,11,13,25,27,28],"enum":3,"export":21,"final":[4,7,11,13,21,25],"float":[1,2,4,5,7,9,11,12,25,28],"function":[0,2,4,5,6,7,9,11,12,13,25,28],"import":[25,28],"int":[0,1,2,3,4,5,7,9,11,12,25],"long":11,"new":[0,1,3,9,11,12,13,23,25,26],"return":[0,1,2,3,4,5,6,7,9,11,12,13,16,28],"short":[11,25,29],"static":[0,1,2,3,4,5,6,7,9,11,16,25,28],"true":[0,1,3,5,6,7,9,11,12,14,25,29],"try":[9,22,25,27],"while":[1,3,16,22,25,28],ALS:9,AWS:28,Adding:23,And:[0,1,3,28],But:7,EOS:12,For:[1,2,3,4,5,7,9,11,13,16,21,22,25,26,28,29],IDs:3,Its:11,NOT:11,One:[22,25,28],Such:16,That:[0,1,3,4,7],The:[0,1,3,5,7,9,11,12,13,16,21,25,27,28,29],There:25,These:[1,4,7,11,13,28],Used:[3,7,12],Uses:[9,13,29],Using:23,With:[14,16,29],__init__:[0,1,2,3,4,5,6,7,9,11,13,16,25],_adaptive_max_norm:9,_add_scaled_noise_to_gradi:9,_aspect_preserving_res:1,_baseattentionmechan:11,_basemonotonicattentionmechan:11,_batch_exampl:3,_build_attent:4,_build_forward_pass_graph:7,_building_block_v1:5,_building_block_v2:5,_cast_typ:[4,5,6],_central_crop:1,_clip_gradients_by_norm:9,_compute_loss:6,_continue_search:12,_count_and_gen_subtoken:3,_count_token:3,_create_decod:7,_create_encod:7,_create_initial_st:12,_create_loss:7,_create_min_max_boundari:3,_decod:4,_decode_crop_and_flip:1,_distributed_appli:9,_encod:5,_escape_token:3,_expand_to_beam_s:12,_filter_and_bucket_subtoken:3,_fil
ter_max_length:3,_flatten_beam_dim:12,_gather_beam:12,_gather_topk_beam:12,_gather_tre:11,_gen_new_subtoken_list:3,_generate_alphabet_dict:3,_generate_subtoken:3,_generate_subtokens_with_target_vocab_s:3,_get_example_length:3,_get_input_for_group:11,_get_new_alive_st:12,_get_new_finished_st:12,_grow_alive_seq:12,_item_or_tupl:11,_join_tokens_to_str:3,_length_norm:12,_linear:11,_list_to_index_dict:3,_load_record:3,_load_vocab_fil:3,_maybe_merge_batch_beam:11,_maybe_split_batch_beam:11,_mean_image_subtract:1,_merge_batch_beam:11,_monotonic_probability_fn:11,_multiply_gradi:9,_multiply_gradients_const:9,_native_to_unicod:3,_output:7,_parse_audio_el:2,_parse_audio_transcript_el:2,_parse_exampl:3,_parse_example_proto:1,_read_and_batch_from_fil:3,_resize_imag:1,_save_vocab_fil:3,_search_step:12,_shape_list:12,_smallest_size_at_least:1,_split_batch_beam:11,_split_string_to_token:3,_split_token_to_subtoken:3,_statekei:12,_subtoken_ids_to_token:3,_test:27,_token_to_subtoken_id:3,_unescape_token:3,_unflatten_beam_dim:12,_unicode_to_n:3,abl:[0,1,3,21,27],about:[13,29],abov:[11,22,28],abs:[5,9,11,29],absolut:28,acceler:11,accept:[3,11],access:[0,4,5,6,7],accord:11,accuraci:[1,7,27,28],achiev:[11,27],across:[3,6,7,9,25,28],activ:[5,11,28],activation_fn:5,actual:[4,7,11,25],adagrad:[7,9,25],adagradoptim:9,adam:[7,9,25,29],adamoptim:9,adapt:[5,7,9,25],adaptive_clip:9,adaptive_clipping_fn:9,add:[3,4,5,7,9,11,13,25,28],add_eo:3,added:[0,3,7,11,12,13,25],adding:13,addit:[2,7,9,11,12,21,25,28,29],addition:[1,25],adjust:[7,9,22,25,28],after:[0,1,5,9,11,13,21,22,25,27],after_create_sess:13,after_run:13,again:[25,27],aggreg:7,aggregation_method:9,aggregationmethod:9,alben:28,algorithm:[7,9,25,28],align:11,alignment_histori:11,alignments_s:11,aliv:12,alive_cach:12,alive_log_prob:12,alive_seq:12,all:[0,1,2,3,4,5,6,7,9,11,12,13,16,22,25,27,28,29],allow:[3,22,29],allreduc:9,along:[5,11],alpha:12,alphabet:[3,4],alphabet_config_path:4,alreadi:[11,26],also:[0,1,3,7,9,13,14,22,25,26,27,28,29],altern:[9,27,28],alwai:[7,9,28],amax:9,amount:3,analog:7,analysi:2,ani:[3,4,5,7,9,11,12,13,25,28,29],anoth:[3,9,11],answer:[7,25],anymor:13,anyth:9,api:[2,26],appear:3,append:[3,25],appli:[3,5,9,11,12,28],apply_gradi:9,approach:[7,11,14,28],apt:[21,27],arbitrari:9,architectur:[22,28],archiv:21,aren:3,arg:[9,11,13],argmax:11,argument:[2,3,4,5,6,7,9,11,12,13,14,21,25],arithmet:28,around:[1,4,5,6,21,27,28],arrai:[2,3,7],arrang:1,array_op:11,array_to_str:13,articl:11,artifici:4,arxiv:[5,9,11,28,29],aspect:1,assign:[0,4],assum:[11,21,27],assumpt:[7,11],attend:11,attent:[4,11,12,22,29],attention_cel:11,attention_depth:11,attention_dropout:12,attention_lay:[8,10],attention_layer_s:11,attention_mechan:11,attention_or_cell_output:11,attention_st:11,attention_typ:4,attention_wrapp:[8,10],attentioninputwrapp:11,attentionmechan:11,attentionwrapp:11,attentionwrapperst:11,attribut:[7,25],audio:[2,7,21],audio_filenam:2,augment:2,augment_audio_sign:2,auto:6,automat:[4,5,6,7,9,25,27,29],automatic_loss_sc:[7,9,25,28],automatic_loss_scal:8,automaticlossscal:9,autoregress:12,avail:[9,22,25,26,28,29],averag:[6,7,9],average_across_timestep:6,avoid:[9,11],axi:[7,11],back:[12,28],backoff:[7,9,25,28],backoffscal:9,backpropag:28,backslash:3,bahadanau:11,bahdanau:[4,11],bahdanau_norm:4,bahdanauattent:11,bahdanaumonotonicattent:11,bandwidth:28,base:[0,1,2,3,4,5,6,7,9,11,12,13,14,21,22,25,26,27],base_model:25,base_param:[25,28],basic:[6,11,26],basic_sequence_loss:6,basicsequenceloss:6,basicslstmcel:11,batch:[3,4,5,6,7,9,11,12,16,25],batch_norm:5,batch_siz:[2,3
,4,5,6,11,12],batch_size_per_gpu:[6,7,22,25,29],batches_per_epoch:9,bazel:27,bbox:1,beahvior:11,beam:[4,11,12,29],beam_indic:12,beam_search:[8,10],beam_search_decoder_output:11,beam_search_test:[8,10],beam_siz:12,beam_width:[4,11],beamsearch:11,beamsearchdecod:11,beamsearchdecoderoutput:11,beamsearchdecoderst:11,beamsearchhelpertest:12,beamsearchrnndecoderwithattent:4,becaus:[3,12,22,28],becom:[2,9],been:[9,11,12,13,28],befor:[3,5,9,11,13,22,28],before_run:13,begin:[3,11,13,28],begin_decay_at:9,behavior:[11,28],being:[3,23],below:[27,28,29],bench_start:25,bench_step:25,benchmark:[7,25,28],benefici:28,bengio:11,besid:7,best:[11,12,27],beta1:9,beta2:9,better:[1,12],between:[3,7,9,11,12],bhadanau:11,bia:[11,12],bias:11,bias_initi:11,bidir_rnn_encoder_with_emb:5,bidirect:29,bidirectionalrnnencoderwithembed:5,big:[27,29],bigger:28,bin:27,binari:[3,21,27],bind:29,bleu:[7,29],blob:12,block:[5,25],block_fn:5,block_lay:5,blue:22,bn_epsilon:5,bn_momentum:5,bn_regular:5,bodi:12,bool:[0,4,5,6,7,9,11,12,25],boost:27,bori:28,both:[1,7,11,25,28,29],bottleneck:5,bottleneck_block:5,bottleneck_block_v1:5,bottleneck_block_v2:5,bottom:11,bound:1,boundari:[3,9],boundary_scal:3,box:1,bpe_us:7,broadcast:13,broadcastglobalvariableshook:13,btl:29,bucket:3,buckets_max:3,buckets_min:3,buffer:1,build:[2,4,9,12,13,20,26,27,28],build_graph:[0,1,2,3],build_image_data:1,build_lm:21,build_pip_packag:27,building_block:5,building_block_v1:5,building_block_v2:5,built:[7,25,26],c_state:11,cach:12,calcul:[0,1,3,7,11,12],calculate_bleu:7,call:[4,5,6,7,9,11,12,13,16,28],callabl:[9,11],callback:13,can:[0,1,2,3,4,5,6,7,9,11,13,14,16,22,25,26,27,28,29],candiat:3,candid:3,cannot:[7,11,12,25],cast:[4,5,6,28],cast_typ:13,cat:22,cell:[4,5,11,29],cell_input_fn:11,cell_param:11,cell_stat:11,cell_typ:11,center:1,central:1,chang:[11,13,14,22,28],channel:[1,5],channels_first:5,channels_last:5,charact:[3,4],check:[7,11,25,26,27,28],check_grad:9,check_param:13,checkpoint:[7,11,13,25,29],child:25,cho:11,choos:[11,28],chosen:9,christoph:11,classic:7,clean:[21,29],cleaned_fil:22,clip:[7,9,25],clip_gradi:9,clip_last_batch:[7,13],clip_spars:13,clone:[11,27],close:[3,11,29],cloud:28,cmake:27,cnn:26,code:[3,7,11,25],coeffici:[9,28],colin:11,collect:[7,9,12,28],colloqui:1,coloc:9,colocate_gradients_with_op:9,color:1,colorspac:1,column:11,com:[12,27],combin:[7,9,12,26],combine_head:12,command:[14,21,22,25,27,29],comment:29,commit:25,common:[8,10,28],commonli:28,compar:[5,28],compat:9,compil:[7,25],complet:[7,9,17,18,19,24,25,27],complex:28,compon:11,compos:11,compress:21,comput:[1,6,7,9,11,12,28],compute_gradi:[9,28],compute_loss:[6,7,13],concat:11,concaten:[7,11],concret:28,config:[0,1,2,3,4,5,6,7,9,13,14,23,27,29],config_fil:[21,22,25,27,29],configur:[7,14,22,25,26,27,29],conflict:11,conjunct:[7,25],connect:[4,5,7,12],conrib:11,consecut:3,consist:[4,13],constant:[9,28],constraint:11,construct:[0,1,3,4,5,6,7,9,11,13],constructor:[0,2,4,5,6,7,9,11,25],consumpt:28,contain:[0,1,2,3,4,5,6,7,9,11,12,13,22,25,27,28,29],content:[1,4,5,6],context:11,continu:[12,25],continue_learn:25,contrib:11,control:[7,11],conv2d:5,conv2d_bn_actv:5,conv2d_fixed_pad:5,conv_lay:5,conveni:25,convent:5,converg:28,convert:[2,3,11,21,28],convolut:[5,29],coord:[1,13],coordin:[1,13],copi:[5,7,11,28],copt:27,core:[3,28],correct:[7,11,12,21,22,25],correctli:[7,27],correspond:[0,1,3,7,9,11,12,16,22,25,27,28],correspondingli:[9,16,29],cosin:12,could:[0,2,4,5,6,7,9,25],count:[0,3,4,25],counter:9,cover:25,cpu:[3,28],creat:[0,1,3,4,5,6,7,9,11,12,13,20,25,28],create_rnn_cel:11,create_toy_d
ata:22,creation:[7,28],crop:1,crop_height:1,crop_width:1,cross:6,cross_entropy_loss:8,cross_entropy_with_smooth:6,crossentropyloss:6,crossentropywithsmooth:6,csv:[2,21],ctc:[4,6],ctc_decoder_with_lm:27,ctc_greedy_decod:4,ctc_loss:8,ctcloss:6,cuda:[27,28],cudnn:29,cudnn_gru:5,cudnn_lstm:5,cudnnlstm:11,cumprod:11,cumsum:11,cumul:11,cur_index:12,current:[1,4,5,7,9,11,12,25,29],custom:[27,28],cut:[4,7],d_model:9,dai:29,data:[1,2,3,4,5,6,7,8,11,13,15,21,22,23,25,28],data_fil:3,data_format:5,data_lay:[1,2,3,4,5,6,7,8,25],data_layer_param:[7,25],data_root:22,datalay:[0,1,2,3,7,16,25],dataset:[0,1,2,3,7,16,20,22],dataset_fil:2,david:28,dct:13,debug:25,debug_port:[13,25],debugger_port:25,dec:5,decai:[9,28,29],decay_r:9,decay_step:9,deco_print:13,decod:[0,1,3,5,6,7,8,11,12,15,21,22,23,25,26,29],decode_and_crop:1,decoder_cell_typ:4,decoder_cell_unit:4,decoder_dp_input_keep_prob:4,decoder_dp_output_keep_prob:4,decoder_initial_st:11,decoder_library_path:4,decoder_output:6,decoder_param:[4,7],decoder_use_skip_connect:4,decreas:[3,28],deep:[5,11,28],deepbench:28,deepspeech2encod:[5,25],deepspeech:[5,25,27],defaultdict:3,defin:[0,1,3,4,5,7,9,11,12,22,25,28],definit:5,degre:28,delet:21,delim:[7,13],denomin:[7,25],denot:1,dens:[7,9,11],dense_tensor:6,dense_to_spars:6,depend:[5,11],deprec:11,depth:[1,11,26],deriv:[0,4,5,6,7,25],describ:[0,1,3,4,5,6,7,11,25,27,28,29],descript:[0,1,2,3,4,5,6,7,25,29],design:[26,28],desir:28,detail:[1,2,4,5,7,9,11,25,26,29],determin:[11,12],determinist:9,dev:[21,22,27],deviat:11,devic:[9,13],device_dens:9,device_spars:9,diamo:28,dict:[0,1,2,3,4,5,6,7,9,12,25],dict_to_log:13,dictionari:[0,1,2,3,4,5,6,7,12,16,25],did:22,diederik:11,diff:25,differ:[1,3,5,9,11,12,13,22,25,26,27,28],dim:[4,5,6,11],dimens:[1,4,5,11,12],dimension:[9,11],direct:[5,29],directori:[7,25,27],disabl:[7,11,22,25,27,28],discov:27,disk:22,displai:25,distanc:7,distort:1,distribut:[3,7,9,11,25,26,27,28],distributedoptim:9,divid:12,divis:[0,2,7,11],dl_id:13,dnn:28,do_mask:6,doc:[4,5,6,7,11,25],docker:[27,28],docstr:11,document:[11,25,26,28],doe:[1,4,6,7,11,22,25,27,28,29],doesn:1,domain:2,don:[3,12,29],done:[22,27],dot:12,dougla:11,download:[21,22],download_lm:27,downsampl:5,dp_input_keep_prob:11,dp_output_keep_prob:11,draw:12,drawn:3,dropout:[4,5],dropout_keep_prob:5,dropout_keep_prop:5,ds2_encod:[8,25],ds2_large_8gpu:29,ds2_librispeech_larc_config:21,ds2_medium_4gpu:29,ds2_small_1gpu:29,ds2_toy_data_config:[21,27],dtype:[0,4,5,6,7,9,11,13,25,28],due:11,dure:[1,3,4,7,11,12,13,22,25,28],dynam:[7,11,28],dynamic_decod:11,dzmitri:11,each:[0,1,3,6,7,9,11,12,13,16,25,28],eager:9,earli:11,easi:26,easili:22,eck:11,effect:[11,29],effici:[11,26],eight:29,either:[0,2,4,5,6,7,9,11,25,28],element:[2,3,7,9,11],elimin:29,els:[6,9],elsen:28,embed:[4,5,11,12],embedding_lay:[8,10],embedding_lookup:11,embedding_s:12,embeddingsharedweight:12,emit:11,emnlp:11,empti:[7,9,11,16,25],enabl:[7,9,11,14,25,27],enable_log:[25,29],enc_emb_w:5,encod:[0,1,3,4,6,7,8,11,12,15,22,23,25,26,29],encoder_cell_typ:5,encoder_cell_unit:5,encoder_decod:[4,5,6,8,25],encoder_dp_input_keep_prob:5,encoder_dp_output_keep_prob:5,encoder_final_st:11,encoder_lay:5,encoder_output:[4,5,11],encoder_param:[5,7],encoder_sequence_length:4,encoder_st:[5,11],encoder_use_skip_connect:5,encoderdecodermodel:[7,25],encorc:11,end:[3,4,11,12,13,22,27,28],end_compat:9,end_learning_r:9,end_of_choic:3,end_symbol:4,end_token:11,energi:11,enforc:11,english:[3,20],enough:[22,25,28],ensur:[3,11,12,13,28],entri:11,entropi:6,enumer:3,eos:12,eos_id:[3,12,13],epoch:[0,1,3,7,9,25,29],epsi
lon:[5,7,9,12,25],equal:[1,7,11,25],equival:[9,11],erich:28,error:[7,11,21,27,29],escap:3,especi:28,essenti:13,estim:28,etc:[4,5,7,16,25,26],etl:16,eval:[1,4,5,7,21,25],eval_input_fn:3,eval_model:13,eval_param:25,eval_step:[7,25],evalu:[0,1,2,3,7,13,21,22,25],evenli:3,event:[25,28],everi:[5,7],every_step:13,everyth:[21,25,26,27],exact:[11,25],exactli:11,exampl:[0,1,2,3,4,5,7,9,11,16,21,22,25,27,28],example_config:[21,22,25,27],example_seri:1,exce:[7,9,25],except:[5,11,13,25,27,28],execut:[7,9,14,22,25],exist:[0,4,11,23,28],exp:[9,11],exp_decai:9,expect:6,experi:[22,25,26],experiment:26,explicit:5,explicitli:[11,28],exponenti:9,exponential_decai:9,express:25,extend:26,extens:28,extra:9,extract:[2,16,21],fact_siz:11,factor:[9,11,12,28],fail:27,fairli:25,fals:[0,1,3,4,5,6,7,9,11,12,13,27],familiar:26,fc_decod:8,featur:[2,4,5,6,26],features_typ:2,fed:16,feed:[11,13],feed_dictionari:16,feedforward:12,feedfowardnetwork:12,feel:21,fetch:[7,9],few:29,ffn_layer:[8,10],field:[1,2,4,5,6,11],file:[0,1,2,3,4,7,21,22,25,27,29],file_byte_limit:3,file_pattern:3,file_with_bpe_segment:22,filenam:[1,2,3],filepath:3,filter:[3,5,9],filter_s:12,final_output:4,final_sequence_length:4,final_st:[4,11],finalbeamdecoderoutput:11,finalbeamsearchdecoderoutput:11,finalize_evalu:7,finalize_infer:7,find:[9,12,26],finish:[11,12,22,25],finished_flag:12,finished_scor:12,finished_seq:12,first:[0,1,3,5,7,9,11,12,14,21,22,25,28],fix:9,fixed_lr:9,fixed_pad:5,flag:[11,12],flaot:12,flat_dict:13,flatten_dict:13,flexibl:26,flip:1,float16:[0,4,5,6,7,25,28],float32:[0,4,5,6,7,9,12,25,28,29],flstm:[8,10],flstmcell:11,flush:13,folder:[21,22,25,27],follow:[0,1,3,4,5,6,7,9,16,21,22,25,27,28,29],forc:25,force_var_reus:7,forev:3,forget:[11,29],forget_bia:11,form:[0,3,11],format:[1,3,5],formul:12,forward:[7,28],found:[3,29],four:29,fp32:28,fraction:3,frame:[2,7],framework:12,free:21,frequenc:[2,3],frequent:3,from:[0,1,2,3,4,5,6,7,9,11,12,13,16,25,27,28,29],ftrl:[7,25],full:[5,9,28,29],fulli:[4,5,12],fully_connected_ctc_decod:4,fully_connected_decod:4,fully_connected_time_decod:4,fullyconnectedctcdecod:[4,21],fullyconnecteddecod:4,fullyconnectedtimedecod:4,func:8,furthermor:28,fuse:1,futur:12,ganesh:28,garcia:28,gate:[9,11],gate_gradi:9,gate_graph:9,gate_non:9,gate_op:9,gather:12,gen_input_tensor:16,gener:[3,4,7,11,12,25],generate_tri:27,geometr:12,german:[3,20],get:[7,9,12,16,21,22,26,27],get_available_gpu:13,get_data_lay:7,get_decoder_self_attention_bia:12,get_git_diff:13,get_git_hash:13,get_global_step:9,get_next:16,get_num_objects_per_step:7,get_optional_param:[0,1,2,3,4,5,6,7,16,25],get_output_tensor:7,get_pad:12,get_padding_bia:12,get_position_encod:12,get_regularization_loss:9,get_required_param:[0,1,2,3,4,5,6,7,16,25],get_results_for_epoch:13,get_size_in_sampl:[0,1,2,3,16],get_speech_featur:2,get_speech_features_from_fil:2,get_tf_dtyp:7,get_vari:28,get_wmt16_en_dt:22,getter:28,ginsburg:[11,28],git:[25,27],github:[12,27],given:[1,9,11,12,28],global:[9,13],global_gradient_norm:[7,25],global_step:9,glstm:[4,8,10],glstmcell:11,gnmt:[4,5,8,10,29],gnmt_encoder_with_emb:5,gnmt_residual_fn:11,gnmt_v2:4,gnmtattentionmulticel:11,gnmtlikeencoderwithembed:5,go_symbol:4,goal:22,going:[4,5,17,18,19,24,25,27],good:25,gpu:[0,6,7,9,13,14,16,22,25,26,27,28,29],gpu_id:[7,25],grad_loss:9,gradient:[7,9,11,25,28],gradient_multipli:9,gradient_noise_scal:9,gradient_norm:[7,25],grads_and_var:9,gram:21,graph:[0,1,2,3,4,5,6,7,9,13,25,28],graphic:28,graphkei:9,greater:5,gregori:28,group:[3,11],group_batch_s:3,group_id:11,group_siz:11,grow:12,gru:[4,5,29],guarant
e:[3,11],half:28,halv:28,handl:11,happen:[0,1,3,4,5,6,7,16],hard:11,hardmax:11,has:[0,1,2,3,4,5,6,7,9,11,12,13,28,29],has_nan:9,hash:25,hat:9,have:[0,1,2,3,4,5,6,7,9,11,12,16,22,25,26,27,28,29],head:12,height:1,height_in:5,help:[7,21,25],helper:[5,12],henc:28,here:[0,1,3,4,5,6,7,11,16,25,28,29],hetland:7,hidden:[4,5,11],hidden_s:12,hieu:11,high:28,higher:11,highest:12,highli:29,histogram:9,histori:11,hold:[9,12],hood:9,hook:[7,8],horovod:[0,3,6,7,9,13,14,25,29],horovod_gpu_allgath:9,horovod_gpu_allreduc:9,horovod_gpu_broadcast:13,hot:[1,6,11],houston:28,how:[7,9,11,12,20,22,23],howev:[1,28],http:[5,7,9,11,12,27,29],human:13,hvd:[7,25],hyperparamet:28,iclr:[11,28],icml:11,ident:[5,11],ids:[0,2,3,7,11,12,13,25],idx2char:7,ignor:[0,7,9,11,14,25],ignore_speci:[7,13],illeg:12,illustr:28,ilsvrc2012_val_00041207:1,imag:[1,5,7],image2label:[0,8,25],image_buff:1,imagenet_preprocess:[0,8],imagenetdatalay:1,implement:[6,7,9,11,12,16,25,29],impli:11,import_librivox:21,improv:[1,3,28],incept:1,includ:[0,1,2,3,4,5,6,7,9,11,21,28],increas:[12,28],increment:9,increment_global_step:9,independ:[5,6,7,25],index:[11,12],indexedslic:9,indic:[1,3,12,13],inf:11,infer:[0,1,3,4,5,7,11,13,21,25],infer_output_fil:[21,22,25],infer_param:25,infin:12,info:9,inform:[11,12,13,25,26],inherit:[0,4,5,6,7,16],init_from_fil:3,initi:[3,4,5,7,9,11,12,13,25,28],initial_cach:12,initial_cell_st:11,initial_id:12,initial_st:11,initializer_param:[4,5,7,25],inner:[11,12],input:[0,1,2,3,4,5,6,7,11,12,25,26],input_dict:[4,5,6,13],input_lay:5,input_s:11,input_sequence_length:11,input_tensor:[0,1,2,3,4,5,6,7],input_typ:2,input_valu:7,insid:[0,1,3,7,22,28],inspect:28,inspir:11,instabl:11,instal:[21,26],instanc:[0,4,5,6,7,9,11,28],instanti:9,instead:[1,4,9,11,27,28],instruct:[21,26],insur:12,int32:[1,11,12],int64:[11,12],intact:5,integ:[1,3,5,11],inter:28,intermedi:28,intern:[9,11,23],introduc:[1,5,28],invalid:9,invalidargu:11,invari:12,invers:3,involv:12,is_train:1,issu:29,item:[3,11,12],iter:[0,1,2,3,7,13,16,28],iterate_data_lay:13,its:[3,11,12,21,25],jian:5,join:3,jointli:11,jonah:28,jpeg:1,jul:5,just:[4,5,27],kaim:5,keep:[4,5,6,9,13,28],kei:[0,1,3,5,7,9,12],kenlm:[21,27],kept:28,kernel:5,kernel_initi:11,kernel_s:5,key_channel:12,keyword:12,kind:25,kingma:11,knee:1,known:[1,3,11],kpu:27,kuchaev:28,kuchaiev:11,kwarg:[9,11,12],kyunghyun:11,label:[1,4,6],lambda:[9,11,28],languag:[4,20],language_model:27,lar:[7,25],larc:[7,9,25,29],larc_eta:[7,25],larc_mod:[7,9,25],larc_nu:9,larc_param:[7,9,25],larg:[11,28],largest:12,last:[4,5,7],last_batch:[7,13],last_step:[7,13],latenc:28,later:28,latter:11,launch:[13,22],layer:[0,1,2,3,4,5,7,9,11,12,15,23,25,28,29],layer_typ:5,layernorm:12,layernorm_lstm:5,ld_library_path:29,lead:27,learn:[5,7,9,11,22,25,29],learnabl:11,learning_r:[7,9,25],learning_rate_decay_fn:9,least:[5,12],left:11,length:[0,1,2,3,4,5,6,11,12],length_i:12,length_penalty_weight:11,length_x:12,less:[3,28,29],level:[25,28],levenshtein:7,libboost:27,libctc_decoder_with_kenlm:27,librari:4,librispeech:[20,29],librivox:21,libsox:21,libtensorflow_cc:27,libtensorflow_framework:27,like:[5,9,12,16,22,29],limit:28,line:[0,3,13,14,22,25],linear:[4,11,12],link:[27,29],list:[0,1,2,3,4,5,6,7,9,11,12,16,25,26],liu:11,live:12,lm_binary_path:4,lm_trie_path:4,lm_weight:4,load:[0,3,16],load_pre_existing_vocabulari:0,localhost:29,locat:[12,22,27],lock:9,log:[7,9,11,12,23,28],log_fil:13,log_max:9,log_summaries_from_dict:13,logdir:[7,22,25],logger:13,logic:[7,12,16],logit:[4,6,11,12],logits_to_outputs_func:4,logmax:[7,9,25,28],logmaxscal:9,lognorm:28,logspac:
11,longer:13,longest:3,look:[7,25,26,29],loop:12,lose:28,loss:[0,1,3,4,7,8,9,15,23,25],loss_comput:7,loss_input_dict:6,loss_param:7,loss_scal:[7,9,25,28],lot:[21,22,27],lower:[3,28],lr_polici:[7,8,25],lr_policy_param:[7,25],ls_dir:21,lst:3,lstm:[4,5,11,29],lstmstatetupl:11,luong:[4,11],luong_scal:4,luongattent:11,luongmonotonicattent:11,m_state:11,machin:[11,20],magnitud:9,mai:[3,11,12],main:[4,5,6,22,25,26],maintain:[12,28],major:11,make:[0,1,3,11,21,22,26,27],malform:9,man:11,mani:28,manner:11,manual:[11,27,28],map:[0,3,4,5,11,29],mark:[3,11,12],mask:[6,11,12],mask_nan:[6,13],master:[12,28],match:[1,3,5,11,28,29],matric:11,matrix:[11,12],max:3,max_decode_length:12,max_grad_norm:[7,25],max_length:3,max_lr:9,max_norm:9,max_pool2d:5,max_step:[7,25],max_subtoken_length:3,max_tim:11,max_timescal:12,maxim:9,maximum:[3,7,11,12,25,28],mayb:11,maybe_print_log:7,mca:29,mean:[1,9,28],measur:[22,29],mechan:[4,11,12],memori:[0,11,22,28],memory_sequence_length:11,mention:28,merg:11,method:[0,1,2,3,4,5,6,7,9,11,12,13,16,25,28],methodnam:12,methodolog:28,metric:7,mfcc:2,michael:28,micikeviciu:28,might:[22,25,27,28],milli:2,min:3,min_boundari:3,min_count:3,min_idx:0,min_lr:9,min_timescal:12,min_upd:[7,25],minh:11,mini:16,minibatch:11,minim:[7,9,25],minimum:[0,3,11,12],minumum:3,minut:27,mismanag:11,misspel:21,mix:[4,5,7,9,11,12,25,26,29],mixedprecisionoptimizerwrapp:[9,28],mkdir:[21,27],modal:26,mode:[0,1,3,4,5,7,11,13,14,21,22,25,27,29],model:[0,1,2,3,4,5,6,8,9,11,12,13,20,22,23,26,28],model_param:28,modifi:[4,5,6,9,13,28],modul:[4,7,9,11,25],modular:26,momentum:[5,7,9,25,29],momentumoptim:9,monoton:11,monotonic_attent:11,more:[9,11,12,22,26,28,29],moreov:25,moss:[22,29],most:[3,7,16,25,28],move:[9,11,25],mozilla:27,mp_regularizer_wrapp:[9,28],mp_wrapper:8,mpi4pi:27,mpi:7,mpirun:[14,29],msg:13,much:[3,12,22,27],multi:[7,12,14,16,22,26,29],multicel:11,multihead:12,multipl:[9,11,12,29],multipli:[9,11,28],multirnncel:11,must:[0,1,3,4,5,6,9,11,12,25],mutli:14,myfavoriteattentionmechan:11,n03623198:1,n_hidden:5,name:[2,3,4,5,6,7,9,11,25,28],namedtupl:11,nan:6,narang:28,nativ:3,nearli:28,necessari:[0,5,7,9,13,25,27,28],necessarili:12,need:[0,1,3,4,5,7,9,12,14,16,21,22,25,27,28],neg:[11,12],neither:9,nest:[11,12,25],nest_dict:13,nested_upd:13,network:[5,11,12,28],neural:[11,28],new_beam_s:12,new_cach:12,new_height:1,new_log_prob:12,new_seq:12,new_width:1,newli:11,newstest2014:[22,29],next:[7,11,25,27,28,29],next_batch_feed_dict:16,next_input:11,next_stat:11,nmt:[22,29],nmt_revers:22,no_dir_check:25,noam:9,node:[14,26],nois:[9,11],noise_level_max:2,noise_level_min:2,non:[9,11,12],none:[0,2,3,4,5,6,7,9,11,12,13,16,25,28,29],nor:9,norm:[5,7,9,25],normal:[4,5,9,11,12,28],note:[0,1,3,4,5,6,7,9,11,21,22,25,28],now:[11,21,25,27,28],num:[4,5,6],num_audio_featur:2,num_box:1,num_channel:[1,5],num_cpu_cor:3,num_epoch:[0,1,3,7,9,25],num_featur:[2,4],num_gpu:[7,14,22,25],num_head:12,num_iter:3,num_lay:11,num_proj:11,num_rnn_lay:5,num_time_step:2,num_unit:11,num_work:[0,1,2,3,16],number:[0,1,2,3,4,5,6,7,9,11,12,14,25,28],number_of_group:11,numer:[7,11,25,28],numpi:2,nvidia:[26,27,28],object:[0,1,3,4,5,6,7,9,11,12,13,16,28],obtain:[27,28,29],occur:11,offici:26,offset:13,offset_target_by_on:6,often:[7,25,28],old:12,oleksii:28,on_horovod:[7,9],onc:[3,11,13,22],one:[0,1,3,4,5,6,7,9,11,12,13,22,25,28,29],ones:26,onli:[0,1,3,4,7,9,11,13,14,22,25,28],onlin:11,open:7,open_seq2seq:[1,2,3,7,27],openib:29,openseq2seq:[3,14,21,22,25,28,29],oper:[4,5,9,11,13,27,28],ops:[1,9,11,13],opt:27,optim:[7,8,25,29],optimize_loss:9,optimizer_cls_na
m:9,optimizer_param:[7,9,25],optimizer_summari:9,option:[0,1,2,3,4,5,6,7,9,11,16,25],optional_dict:13,order:[3,5,9,11,21,22,27,28],org:[5,7,9,11,29],org_dict:13,origin:[1,3,5,11,12,13,21,29],orte_base_help_aggreg:29,other:[1,4,7,9,11,13,21,25,27,28],otherwis:[1,7,9,11,22,25,27,28],our:[26,27,28],out:[11,12,22,26],out_of_bucket:3,output:[1,4,5,6,7,11,12,13,16,21,22,25,26,27],output_attent:11,output_dim:[4,11],output_dir:13,output_dtyp:11,output_fil:[7,13],output_height:1,output_lay:11,output_s:11,output_time_major:11,output_valu:7,output_width:1,outsid:3,over:[6,28],overal:13,overcom:28,overflow:28,overflow_std_dev:9,overrid:11,overridden:11,overriden:9,overwrit:[7,25],overwritten:25,own:20,p_choose_i:11,packag:[4,5],pad2eight:12,pad:[0,1,2,3,5,12,16,29],pad_id:[3,13],pad_to:2,pad_vocab_to_eight:0,padded_cross_entropy_with_smooth:6,padded_input_length:3,padded_length:3,padded_target_length:3,paddedcrossentropylosswithsmooth:6,padding_valu:12,page:[26,27],pair:[7,9,11],paper:22,parallel:[3,11,12,22],parallel_interleav:3,paralleltextdatalay:3,param:[0,1,2,3,4,5,6,7,9,11,12,13,16,25],paramet:[0,1,2,3,4,5,6,7,9,11,12,13,14,16,22,23,27,28],parent:[0,2,4,5,6,25],pars:[1,2],parse_record:1,part:[1,4,6,7,8,9,11,12,21,25,28],particular:11,partli:11,pass:[0,1,3,4,5,6,7,9,11,12,13,25,28],past:11,path:[0,2,3,4,7,25,29],pauliu:28,pdf:[5,9],penal:11,per:[2,3,6,9],perform:[1,2,4,5,6,7,11,22,25,28],period:[25,28],perl:[22,29],peter:11,pham:11,piecewis:9,piecewise_const:9,pip:27,pip_packag:27,pipelin:3,place:[3,25],placehold:16,plane:5,pleas:[11,21],point:[3,11,13,22,28],polici:[7,9,25],poly_decai:9,polynomi:9,polynomial_decai:9,popul:[7,9],posit:[5,11,12],possibl:[7,11,25,28,29],post:12,power:[9,11],practic:28,pre:[0,11,12,22],preactiv:5,precis:[4,5,7,25,26,29],pred:7,predict:[7,11,12,13,25],predicted_id:11,preevious_attent:11,prefer:27,prefix:[3,9],prepar:[11,25],prepostprocessingwrapp:12,preprint:28,preprocess:[1,21],preprocess_imag:1,presenc:28,present:9,preserv:1,prevent:9,previou:[9,11,12],previous_attent:11,primarili:1,principl:28,print:[7,13,25],print_loss_step:[7,25],print_samples_step:[7,25],printlossandtimehook:13,printsampleshook:13,prior:[3,11],probability_fn:11,probabl:[4,5,11,12,25,28],problem:[11,22,28],proce:25,process:[0,1,2,3,9,11,12,13,22,27],produc:[4,5,6,11],product:[11,12,26],progress:22,project:[5,11,26],projection_shortcut:5,propag:[11,28],proper:11,properli:11,properti:[11,12],propos:[5,11],proto:1,protocol:1,provabl:12,proven:28,provid:[1,3,4,5,7,9,11,13,21,28],pull:12,put:27,python:[1,3,4,5,7,9,11,12,13,16,21,22,25,27,29],quantiti:9,queri:11,quicker:9,raffel:11,rais:[1,9,11,13],random:[1,3,7,11,13,25],random_se:[7,25],randomli:1,rang:[11,28],rank:[1,3,9,11,13],rare:28,rate:[7,9,25,29],rather:[0,5,7,25,28],ratio:1,raw:[1,2,5],raw_record:1,raw_str:3,reach:12,read:[0,3,11,25],read_char:0,readabl:13,real:22,realli:4,reason:11,receiv:13,recent:28,recip:[26,28],recogn:21,recognit:[5,7,20,27],recommend:[9,11,16,27,28,29],record:[1,3],recov:13,recurr:[11,28],recurs:11,redefin:28,reduc:[11,22],reduce_mean:6,ref:5,refer:[1,3,12],regress:6,regular:[4,5,7,9,25],regularizer_param:[4,5,7,25],relat:[0,1,3,11],relu:5,relu_dropout:12,remov:[7,22],ren:5,reparameter:11,repeat:[3,11],replac:[3,11,21],report:9,report_summari:9,repositori:27,repres:[1,11,12],represent:[4,5],representation_dim:5,request:13,request_stop:13,requir:[0,1,2,3,4,5,6,7,9,11,16,25,27,28],required_dict:13,rescal:9,research:26,reserv:3,reserved_token:3,reshap:[11,12],residu:[4,5,11],residual_connect:11,resiz:1,resize_imag:1,resize
_min:1,resized_imag:1,resizemethod:1,resnet:[1,5],resnet_block:8,resnet_encod:8,resnetencod:5,respect:[11,28],rest:27,restor:[11,13,25],restore_and_get_result:13,result:[7,11,12,13,21,22,27,28,29],results_per_batch:7,retriev:28,reus:[11,28],revers:20,rgb:1,right:11,rmsprop:[7,25],rnn:[4,5,8,10,22,26,29],rnn_beam_search_decod:[8,10],rnn_cell:5,rnn_cell_dim:5,rnn_cell_impl:11,rnn_decod:8,rnn_decoder_with_attent:4,rnn_encod:8,rnn_type:5,rnn_unidirect:5,rnncell:11,rnndecoderwithattent:4,robust:28,ron:11,root:[13,29],root_rank:13,row:[5,7,11,13],row_conv:5,row_conv_width:5,rule:28,run:[4,5,7,11,12,13,14,21,23,28,29],run_context:13,run_valu:13,runevaluationhook:[7,13],runtest:12,runtim:11,runtimeerror:9,s_id:[3,13],safe:25,safe_cumprod:11,safeti:9,sai:25,saliman:11,same:[0,1,3,4,5,6,9,11,12,13],sampl:[0,1,3,4,7,11,13,16,25],save:[3,7,25],save_checkpoint_step:[7,25],save_summaries_step:[7,25],scalar:[1,9,11,12],scale:[3,4,7,9,11,12,25],scale_max:9,scale_min:9,scaler:9,scan:11,scheme:[3,9],scope:[4,5,6,9,11],score:[7,11,12,29],score_bias_init:11,score_mask_valu:11,score_or_log_prob:12,script:[1,14,21,22,25,27,29],search:[3,4,11,12],second:[2,3,9,11,13,28],section:[4,5,7,9,12,17,18,19,24,25,26,27,29],sed:22,see:[2,4,5,6,7,9,11,22,25,26,27,28],seed:[7,11,25],select:[7,28],self:[0,1,3,4,5,6,7,11,12,16,25],selfattent:12,semi:3,send:13,separ:[1,25],seq2seq:[6,11],seq:11,sequenc:[0,1,2,3,4,5,6,7,11,12,20,25,26,27],sequence_beam_search:12,sequence_length:[6,11],sequence_loss:8,sequencebeamsearch:12,seri:1,serial:[1,3],serialized_exampl:3,sess:[7,13],session:13,session_run_hook:13,sessionrunarg:13,sessionruncontext:13,sessionrunhook:13,sessionrunvalu:13,set:[3,4,5,6,7,9,11,13,14,25,26,27,28],setup:[21,29],sgd:[7,9,25,29],shaoq:5,shape:[1,2,3,4,5,6,9,11,12,16],sharan:28,share:[12,25,28],shift:28,shortcut:5,shorter:12,should:[0,1,3,4,5,6,7,9,11,12,13,16,21,22,25,27,28,29],shuffl:[0,3,11,16],side:1,sigmoid:11,sigmoid_nois:11,sigmoid_noise_se:11,signal:[2,13],signatur:11,significantli:3,similar:[3,5],simpl:[1,4,11,14,22,25,28],simplest:27,sinc:[7,11,21,25,27,28],sine:12,singl:[3,5,9,11,12,22,29],singleton:6,singular:11,situat:[13,28],size:[0,1,2,3,4,5,6,7,11,12,13,16,21,25,27,28,29],skip:[25,27,28],slice:11,sloppi:3,slot:29,slowest:11,slstm:[8,10],small:[7,9,11,22,25,28,29],smallest:1,smallest_sid:1,smooth:[6,9],softmax:[6,11,12],solut:11,some:[1,4,5,7,9,11,12,25,26,27,28],someth:27,sometim:25,soon:[11,17,18,19,24,25],sort:3,sourc:[0,1,2,3,4,5,6,7,9,11,12,13,25,27],source_length:2,source_sequ:2,source_tensor:[0,1,2,3,5,7],sox:21,spars:[7,9],sparse_tensor_to_char:7,sparsemax:11,sparsetensorvalu:7,spatial:5,specialtexttoken:3,specif:[2,4,5,25,27],specifi:[2,7,9,12,14,16,25],spectrogram:2,speech2text:[0,8,21,25,27],speech2textdatalay:2,speech:[2,7,20,26],speech_util:[0,8],speed:[3,28],speedup:28,split:[2,3,4,11,12],split_data:[1,2],split_head:12,src:22,src_emb_siz:5,src_input:[4,5],src_length:[4,5,6],src_sequenc:5,src_vocab_s:5,stabil:[7,25],stack:11,stai:28,staircas:9,stamp:25,standard:[5,7,11,28],start:[9,11,12,13,14,22,25,26],start_input:11,start_token:11,state:[4,5,11,12],state_is_tupl:11,state_s:11,static_max_norm:9,statist:[9,28],std:9,std_factor:9,stderr:25,stdout:25,step:[1,7,9,11,13,25,27],step_factor:9,step_window:9,steps_in_epoch:7,steps_per_epoch:9,still:12,stop:[11,13],store:[3,7,11,12],str:[0,2,3,4,5,6,7],stream:13,strength:12,stride:[2,5],string:[1,2,3,4,5,7,9,13,25],structur:[11,12,23],style:11,sub:9,subclass:9,subfold:25,submit:28,subsequ:[1,11,25],subset:[11,29],substitut:29,subtoken:[3,12],subt
oken_count:3,subtoken_dict:3,subtoken_list:3,subtract:1,sudo:[21,27],suffer:11,suggest:28,sum:[6,11],sum_i:11,summar:28,summari:[7,9,25],sun:5,suppli:9,support:[0,4,5,6,7,11,14,16,25,26,27,28],supported_algo:9,sure:[0,1,3,21,22],symbol:[0,4],symbols_to_logits_fn:12,symlink:27,synset:1,system:22,t2t:[0,8],tab:25,tabl:29,taht:[0,1,3],take:[4,5,6,9,11,12,13,21,22,27],taken:7,tanh:11,target:[0,1,2,3,4,6,7,13,22],target_length:2,target_s:3,target_sequ:[2,6],target_tensor:[0,1,2,3,4,6,7],target_vocab_s:3,task:20,tensor2tensor:12,tensor:[0,1,2,3,4,5,6,7,9,11,12,13,16,28],tensorarrai:11,tensorboard:[7,9,22,25],tensorflow:[0,1,3,4,5,6,7,9,11,12,13,25,26,28],tensorflow_pkg:27,tensorflowtestcas:12,tensorshap:11,term:[9,11],termin:12,tesla:28,test:[12,22],test_expand_to_beam_s:12,test_flatten_beam_dim:12,test_gather_beam:12,test_gather_topk_beam:12,test_get_shape_keep_last_dim:12,test_shape_list:12,test_unflatten_beam_dim:12,test_util:12,text2text:[0,8,22,25],text:[1,2,3,4,7,13,21,26,27],text_ids_to_str:13,textlinedataset:2,tfrecord:3,tgt:22,tgt_emb_siz:4,tgt_input:4,tgt_length:[4,6],tgt_sequenc:6,tgt_vocab_s:[4,6],than:[1,3,5,9,11,12,22,28,29],thang:11,thei:[1,3,11,28],them:[9,21,27,28],thi:[0,1,3,4,5,6,7,9,11,12,13,16,17,18,19,21,22,24,25,26,27,28,29],thing:[3,9,22,25,28],those:[7,11],thread:13,three:[1,3],threshold:[3,9],through:[1,11,12,26],thu:[3,4,25],tile:[11,12],tile_batch:11,tiled_encoder_final_st:11,tiled_encoder_output:11,tiled_input:11,tiled_sequence_length:11,tim:11,time:[1,3,4,5,6,9,11,13,21,25,27,28],time_major:5,time_stretch_ratio:2,timestep:[6,11,28],titan:28,tmp:27,todo:[11,16,27],togeth:[7,13,28],toi:[20,21,27],tok:[22,29],token:[0,7,8,11,12],token_count:3,tool:27,toolkit:26,top:[3,11,12],topic:25,total:[0,9,28],total_regularization_loss:9,tower:[7,14],toy_text_data:22,tra:3,track:[11,13],tracks_own_finish:11,train:[0,1,3,4,5,7,9,11,12,13,20,22,25,26,27,29],train_ev:[7,21,22,25,27,29],train_input_fn:3,train_model:13,train_op:7,train_param:25,trainabl:9,trainable_vari:9,trainer:9,transform:[3,4,8,9,10,16,22,29],transform_for_bleu:7,transformer_decod:8,transformer_encod:8,transformer_polici:9,transformerdatalay:3,translat:[3,7,11,12,20],transpos:12,treat:[7,9,25],tri:29,trick:11,trie:[4,21,27],true_batch_s:11,true_siz:[7,13],tupl:[1,2,7,9,11,12],tutori:[25,26],twice:3,two:[1,3,11,12,13,14,21,28],txt:[22,27],type:[0,1,2,3,4,5,6,7,9,11,12,28],typeerror:[9,11],typic:[0,4,5,6,28],ubuntu:27,ultim:5,unbatch:3,unchang:[11,12],under:9,underflow:[11,28],undergo:1,underli:[2,9,28],underlin:3,understand:25,unescap:3,uni:5,unicod:3,unidir_rnn_encoder_with_emb:5,unidirect:29,unidirectionalrnnencoderwithembed:5,uniqu:12,unit:[4,5,11,29],unittest:27,unk_id:3,unknown:1,unless:9,unspecifi:11,upcom:13,upd_dict:13,updat:[3,7,9,11,25,28],update_op:9,use:[0,1,3,4,5,6,7,9,11,14,16,21,22,25,27,28,29],use_horovod:[7,14,25,29],use_language_model:[4,27],use_lock:9,use_new_attent:11,use_staircase_decai:9,use_swap_memori:5,used:[0,1,3,4,5,6,7,9,11,12,13,22,25,28,29],useful:[7,25,26],user:[11,28],uses:[6,9,11,21,22,28],using:[1,2,3,5,7,9,11,13,14,16,22,26,27,28,29],usual:[4,11,21,27,28],util:[1,7,8,10,28],utter:21,v100:28,valid:[0,1,3,4,5,7,9,11,13,25,29],valid_word_count_weight:4,valu:[1,3,7,9,11,12,13,25,28],value_channel:12,valueerror:[1,9,11],var_list:9,variabl:[3,4,5,6,7,9,11,12,13,25,28],variable_norm:[7,25],varianc:[9,28],variant:5,varieti:28,variou:[4,5,9,25,26],vector:[1,11],venkatesh:28,verbos:13,veri:[11,22],versa:11,version:[3,4,11,27,28],vgg:1,via:11,vice:11,view:[3,22],visual:[7,9],vocab:[0,3,7,13],v
ocab_fil:[2,3],vocab_s:12,vocabulari:[0,2,3,4,5,6,13,22],volta:[28,29],wai:[3,7,9,27,28],want:[21,22,25,27],warm:9,warmup_step:9,wave:2,wavelength:12,weight:[4,5,9,11,12,13,28],weiss:11,well:[4,5,7,22],wer:29,were:[5,11,22],what:23,when:[0,1,3,5,7,9,11,12,13,14,25,27,28,29],whenev:[25,28],where:[1,3,7,9,11,12,16,22,25,28],whether:[0,1,3,4,5,6,7,9,11,12,25],which:[0,1,3,4,5,6,7,9,11,12,13,16,22,25,28,29],whl:27,whose:11,width:[1,4,5,29],width_in:5,window:2,window_s:2,window_strid:2,within:5,without:[1,4,5,12,25,28,29],wmt:22,word:[0,3,4,7,29],word_count_weight:4,work:[7,22,25,27,29],worker:[3,6,7,13],worker_id:[0,1,2,3,7,16],workshop:11,wors:27,worst:[11,12],worth:28,wrap:[3,4,9,11,13,28],wrap_to_multi_rnn:11,wrapper:[1,4,5,6,11,12,28],write:[11,13,25],wrong:[9,11],xiangyu:5,xmax:1,xmin:1,ymax:1,ymin:1,yoshua:11,you:[0,1,3,7,9,11,12,13,14,16,21,22,25,26,27,28,29],your:[0,1,3,9,16,20,22,25,27],yourself:28,zero:[7,9,11,12],zero_st:11,zhang:5},titles:["data","image2label","speech2text","text2text","decoders","encoders","losses","models","API documentation","optimizers","parts","rnns","transformer","utils","Distributed training","Adding new models","Adding new data layer","Adding new decoder","Adding new encoder","Adding new loss","Getting started","Speech Recognition","Machine Translation","In-depth tutorials","Internal structure","Using existing models","OpenSeq2Seq","Installation instructions","Mixed precision training","Models and recipes"],titleterms:{"new":[15,16,17,18,19],Adding:[15,16,17,18,19],Using:25,add:27,adventur:22,api:8,attention_lay:12,attention_wrapp:11,automat:28,automatic_loss_scal:9,base:29,beam_search:12,beam_search_test:12,being:25,bleu:22,bpe:22,build:21,clean:22,common:12,comput:22,config:25,creat:22,cross_entropy_loss:6,ctc:27,ctc_loss:6,data:[0,16],data_lay:0,dataset:21,decod:[4,17,27],deep:29,depth:23,detail:28,distribut:14,document:8,download:27,ds2_encod:5,embedding_lay:12,enabl:28,encod:[5,18],encoder_decod:7,english:22,exist:25,fc_decod:4,feel:22,ffn_layer:12,flstm:11,func:13,gener:27,german:22,get:20,glstm:11,gnmt:11,hook:13,horovod:27,how:[21,25,27,28],image2label:[1,7],imagenet_preprocess:1,implement:28,infer:22,instal:27,instruct:27,intern:24,languag:[21,27],layer:16,librispeech:21,log:25,loss:[6,19,28],lr_polici:9,machin:[22,29],mix:28,model:[7,15,21,25,27,29],mp_wrapper:9,openseq2seq:[26,27],optim:[9,28],own:21,paramet:25,part:10,precis:28,prerequisit:28,recip:29,recognit:[21,29],regular:28,resnet_block:5,resnet_encod:5,revers:22,rnn:11,rnn_beam_search_decod:11,rnn_decod:4,rnn_encod:5,run:[22,25,27],scale:28,score:22,segment:22,sequenc:22,sequence_loss:6,slstm:11,speech2text:[2,7],speech:[21,27,29],speech_util:2,start:20,structur:24,t2t:3,task:22,tensorflow:27,test:27,text2text:[3,7],toi:22,token:3,train:[14,21,28],transform:12,transformer_decod:4,transformer_encod:5,translat:[22,29],tutori:23,util:[0,11,12,13],what:25,your:21}}) \ No newline at end of file 
+Search.setIndex({docnames:["api-docs/data","api-docs/data.image2label","api-docs/data.speech2text","api-docs/data.text2text","api-docs/decoders","api-docs/encoders","api-docs/losses","api-docs/models","api-docs/modules","api-docs/optimizers","api-docs/parts","api-docs/parts.cnns","api-docs/parts.convs2s","api-docs/parts.rnns","api-docs/parts.transformer","api-docs/utils","distr-training","extending","extending/adding-new-data-layer","extending/adding-new-decoder","extending/adding-new-encoder","extending/adding-new-loss","getting-started","getting-started/asr","getting-started/nmt","in-depth-tutorials","in-depth-tutorials/internal-structure","in-depth-tutorials/using-existing-models","index","installation-instructions","mixed-precision","models-and-recipes"],envversion:53,filenames:["api-docs/data.rst","api-docs/data.image2label.rst","api-docs/data.speech2text.rst","api-docs/data.text2text.rst","api-docs/decoders.rst","api-docs/encoders.rst","api-docs/losses.rst","api-docs/models.rst","api-docs/modules.rst","api-docs/optimizers.rst","api-docs/parts.rst","api-docs/parts.cnns.rst","api-docs/parts.convs2s.rst","api-docs/parts.rnns.rst","api-docs/parts.transformer.rst","api-docs/utils.rst","distr-training.rst","extending.rst","extending/adding-new-data-layer.rst","extending/adding-new-decoder.rst","extending/adding-new-encoder.rst","extending/adding-new-loss.rst","getting-started.rst","getting-started/asr.rst","getting-started/nmt.rst","in-depth-tutorials.rst","in-depth-tutorials/internal-structure.rst","in-depth-tutorials/using-existing-models.rst","index.rst","installation-instructions.rst","mixed-precision.rst","models-and-recipes.rst"],objects:{"":{data:[0,0,0,"-"],decoders:[4,0,0,"-"],encoders:[5,0,0,"-"],losses:[6,0,0,"-"],models:[7,0,0,"-"],optimizers:[9,0,0,"-"],parts:[10,0,0,"-"],utils:[15,0,0,"-"]},"data.data_layer":{DataLayer:[0,1,1,""]},"data.data_layer.DataLayer":{__init__:[0,2,1,""],build_graph:[0,2,1,""],get_optional_params:[0,3,1,""],get_required_params:[0,3,1,""],get_size_in_samples:[0,2,1,""],input_tensors:[0,4,1,""],iterator:[0,4,1,""],params:[0,4,1,""]},"data.image2label":{image2label:[1,0,0,"-"],imagenet_preprocessing:[1,0,0,"-"]},"data.image2label.image2label":{CifarDataLayer:[1,1,1,""],ImagenetDataLayer:[1,1,1,""]},"data.image2label.image2label.CifarDataLayer":{build_graph:[1,2,1,""],get_optional_params:[1,3,1,""],get_required_params:[1,3,1,""],get_size_in_samples:[1,2,1,""],input_tensors:[1,4,1,""],iterator:[1,4,1,""],parse_record:[1,2,1,""],preprocess_image:[1,2,1,""]},"data.image2label.image2label.ImagenetDataLayer":{build_graph:[1,2,1,""],get_optional_params:[1,3,1,""],get_required_params:[1,3,1,""],get_size_in_samples:[1,2,1,""],input_tensors:[1,4,1,""],iterator:[1,4,1,""],split_data:[1,2,1,""]},"data.image2label.imagenet_preprocessing":{_aspect_preserving_resize:[1,5,1,""],_central_crop:[1,5,1,""],_decode_crop_and_flip:[1,5,1,""],_mean_image_subtraction_and_normalization:[1,5,1,""],_parse_example_proto:[1,5,1,""],_resize_image:[1,5,1,""],_smallest_size_at_least:[1,5,1,""],parse_record:[1,5,1,""],preprocess_image:[1,5,1,""]},"data.speech2text":{speech2text:[2,0,0,"-"],speech_utils:[2,0,0,"-"]},"data.speech2text.speech2text":{Speech2TextDataLayer:[2,1,1,""]},"data.speech2text.speech2text.Speech2TextDataLayer":{__init__:[2,2,1,""],_parse_audio_element:[2,2,1,""],_parse_audio_transcript_element:[2,2,1,""],build_graph:[2,2,1,""],get_optional_params:[2,3,1,""],get_required_params:[2,3,1,""],get_size_in_samples:[2,2,1,""],input_tensors:[2,4,1,""],iterator:[2,4,1,""],split
_data:[2,2,1,""]},"data.speech2text.speech_utils":{augment_audio_signal:[2,5,1,""],get_speech_features:[2,5,1,""],get_speech_features_from_file:[2,5,1,""],normalize_signal:[2,5,1,""]},"data.text2text":{t2t:[3,0,0,"-"],text2text:[3,0,0,"-"],tokenizer:[3,0,0,"-"]},"data.text2text.t2t":{_batch_examples:[3,5,1,""],_create_min_max_boundaries:[3,5,1,""],_filter_max_length:[3,5,1,""],_get_example_length:[3,5,1,""],_load_records:[3,5,1,""],_parse_example:[3,5,1,""],_read_and_batch_from_files:[3,5,1,""],eval_input_fn:[3,5,1,""],train_input_fn:[3,5,1,""]},"data.text2text.text2text":{ParallelTextDataLayer:[3,1,1,""],SpecialTextTokens:[3,1,1,""],TransformerDataLayer:[3,1,1,""]},"data.text2text.text2text.ParallelTextDataLayer":{build_graph:[3,2,1,""],get_optional_params:[3,3,1,""],get_required_params:[3,3,1,""],get_size_in_samples:[3,2,1,""],input_tensors:[3,4,1,""],iterator:[3,4,1,""]},"data.text2text.text2text.SpecialTextTokens":{END_OF_CHOICE:[3,4,1,""],EOS_ID:[3,4,1,""],OUT_OF_BUCKET:[3,4,1,""],PAD_ID:[3,4,1,""],S_ID:[3,4,1,""],UNK_ID:[3,4,1,""]},"data.text2text.text2text.TransformerDataLayer":{build_graph:[3,2,1,""],get_optional_params:[3,3,1,""],get_required_params:[3,3,1,""],input_tensors:[3,4,1,""],iterator:[3,4,1,""]},"data.text2text.tokenizer":{Subtokenizer:[3,1,1,""],_count_and_gen_subtokens:[3,5,1,""],_count_tokens:[3,5,1,""],_escape_token:[3,5,1,""],_filter_and_bucket_subtokens:[3,5,1,""],_gen_new_subtoken_list:[3,5,1,""],_generate_alphabet_dict:[3,5,1,""],_generate_subtokens:[3,5,1,""],_generate_subtokens_with_target_vocab_size:[3,5,1,""],_join_tokens_to_string:[3,5,1,""],_list_to_index_dict:[3,5,1,""],_load_vocab_file:[3,5,1,""],_native_to_unicode:[3,5,1,""],_save_vocab_file:[3,5,1,""],_split_string_to_tokens:[3,5,1,""],_split_token_to_subtokens:[3,5,1,""],_unescape_token:[3,5,1,""],_unicode_to_native:[3,5,1,""]},"data.text2text.tokenizer.Subtokenizer":{__init__:[3,2,1,""],_subtoken_ids_to_tokens:[3,2,1,""],_token_to_subtoken_ids:[3,2,1,""],decode:[3,2,1,""],encode:[3,2,1,""],init_from_files:[3,3,1,""]},"data.utils":{load_pre_existing_vocabulary:[0,5,1,""],pad_vocab_to_eight:[0,5,1,""]},"decoders.convs2s_decoder":{ConvS2SDecoder:[4,1,1,""]},"decoders.convs2s_decoder.ConvS2SDecoder":{_get_symbols_to_logits_fn:[4,2,1,""],decode_pass:[4,2,1,""],get_optional_params:[4,3,1,""],get_required_params:[4,3,1,""],predict:[4,2,1,""]},"decoders.decoder":{Decoder:[4,1,1,""]},"decoders.decoder.Decoder":{__init__:[4,2,1,""],_cast_types:[4,2,1,""],_decode:[4,2,1,""],decode:[4,2,1,""],get_optional_params:[4,3,1,""],get_required_params:[4,3,1,""],mode:[4,4,1,""],name:[4,4,1,""],params:[4,4,1,""]},"decoders.fc_decoders":{FullyConnectedCTCDecoder:[4,1,1,""],FullyConnectedDecoder:[4,1,1,""],FullyConnectedTimeDecoder:[4,1,1,""]},"decoders.fc_decoders.FullyConnectedCTCDecoder":{__init__:[4,2,1,""],get_optional_params:[4,3,1,""],get_required_params:[4,3,1,""]},"decoders.fc_decoders.FullyConnectedDecoder":{__init__:[4,2,1,""],_decode:[4,2,1,""],get_required_params:[4,3,1,""]},"decoders.fc_decoders.FullyConnectedTimeDecoder":{__init__:[4,2,1,""],_decode:[4,2,1,""],get_optional_params:[4,3,1,""],get_required_params:[4,3,1,""]},"decoders.rnn_decoders":{BeamSearchRNNDecoderWithAttention:[4,1,1,""],RNNDecoderWithAttention:[4,1,1,""]},"decoders.rnn_decoders.BeamSearchRNNDecoderWithAttention":{__init__:[4,2,1,""],_decode:[4,2,1,""],get_optional_params:[4,3,1,""]},"decoders.rnn_decoders.RNNDecoderWithAttention":{__init__:[4,2,1,""],_build_attention:[4,2,1,""],_decode:[4,2,1,""],get_optional_params:[4,3,1,""],get_required_
params:[4,3,1,""]},"encoders.cnn_encoder":{CNNEncoder:[5,1,1,""],build_layer:[5,5,1,""]},"encoders.cnn_encoder.CNNEncoder":{__init__:[5,2,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""]},"encoders.convs2s_encoder":{ConvS2SEncoder:[5,1,1,""]},"encoders.convs2s_encoder.ConvS2SEncoder":{get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""],src_emb_size:[5,4,1,""],src_vocab_size:[5,4,1,""]},"encoders.ds2_encoder":{DeepSpeech2Encoder:[5,1,1,""],rnn_cell:[5,5,1,""],row_conv:[5,5,1,""]},"encoders.ds2_encoder.DeepSpeech2Encoder":{__init__:[5,2,1,""],_encode:[5,2,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""]},"encoders.encoder":{Encoder:[5,1,1,""]},"encoders.encoder.Encoder":{__init__:[5,2,1,""],_cast_types:[5,2,1,""],_encode:[5,2,1,""],encode:[5,2,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""],mode:[5,4,1,""],name:[5,4,1,""],params:[5,4,1,""]},"encoders.resnet_blocks":{batch_norm:[5,5,1,""],block_layer:[5,5,1,""],bottleneck_block_v1:[5,5,1,""],bottleneck_block_v2:[5,5,1,""],building_block_v1:[5,5,1,""],building_block_v2:[5,5,1,""],conv2d_fixed_padding:[5,5,1,""],fixed_padding:[5,5,1,""]},"encoders.resnet_encoder":{ResNetEncoder:[5,1,1,""]},"encoders.resnet_encoder.ResNetEncoder":{get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""]},"encoders.rnn_encoders":{BidirectionalRNNEncoderWithEmbedding:[5,1,1,""],GNMTLikeEncoderWithEmbedding:[5,1,1,""],GNMTLikeEncoderWithEmbedding_cuDNN:[5,1,1,""],UnidirectionalRNNEncoderWithEmbedding:[5,1,1,""]},"encoders.rnn_encoders.BidirectionalRNNEncoderWithEmbedding":{__init__:[5,2,1,""],_encode:[5,2,1,""],enc_emb_w:[5,4,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""],src_emb_size:[5,4,1,""],src_vocab_size:[5,4,1,""]},"encoders.rnn_encoders.GNMTLikeEncoderWithEmbedding":{__init__:[5,2,1,""],enc_emb_w:[5,4,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""],src_emb_size:[5,4,1,""],src_vocab_size:[5,4,1,""]},"encoders.rnn_encoders.GNMTLikeEncoderWithEmbedding_cuDNN":{__init__:[5,2,1,""],enc_emb_w:[5,4,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""],src_emb_size:[5,4,1,""],src_vocab_size:[5,4,1,""]},"encoders.rnn_encoders.UnidirectionalRNNEncoderWithEmbedding":{__init__:[5,2,1,""],_encode:[5,2,1,""],enc_emb_w:[5,4,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""],src_emb_size:[5,4,1,""],src_vocab_size:[5,4,1,""]},"encoders.w2l_encoder":{Wave2LetterEncoder:[5,1,1,""]},"encoders.w2l_encoder.Wave2LetterEncoder":{__init__:[5,2,1,""],_encode:[5,2,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""]},"losses.cross_entropy_loss":{CrossEntropyLoss:[6,1,1,""]},"losses.ctc_loss":{CTCLoss:[6,1,1,""],dense_to_sparse:[6,5,1,""]},"losses.ctc_loss.CTCLoss":{__init__:[6,2,1,""],_compute_loss:[6,2,1,""],get_optional_params:[6,3,1,""]},"losses.loss":{Loss:[6,1,1,""]},"losses.loss.Loss":{__init__:[6,2,1,""],_cast_types:[6,2,1,""],_compute_loss:[6,2,1,""],compute_loss:[6,2,1,""],get_optional_params:[6,3,1,""],get_required_params:[6,3,1,""],name:[6,4,1,""],params:[6,4,1,""]},"losses.sequence_loss":{BasicSequenceLoss:[6,1,1,""],CrossEntropyWithSmoothing:[6,1,1,""],PaddedCrossEntropyLossWithSmoothing:[6,1,1,""]},"losses.sequence_loss.BasicSequenceLoss":{__init__:[6,2,1,""],_compute_loss:[6,2,1,""],get_optional_params:[6,3,1,""],get_required_params:[6,3,1,""]},"losses.sequence_loss.CrossEntropyWithSmoothing":{__init__:[6,2,1,""],_compute_loss:[6,2,1,""],get_optional_params:[6,3,1,""],get_required_params:[6,3,1,""]},"losses.seque
nce_loss.PaddedCrossEntropyLossWithSmoothing":{get_optional_params:[6,3,1,""]},"models.encoder_decoder":{EncoderDecoderModel:[7,1,1,""]},"models.encoder_decoder.EncoderDecoderModel":{__init__:[7,2,1,""],_build_forward_pass_graph:[7,2,1,""],_create_decoder:[7,2,1,""],_create_encoder:[7,2,1,""],_create_loss:[7,2,1,""],decoder:[7,4,1,""],encoder:[7,4,1,""],get_optional_params:[7,3,1,""],get_required_params:[7,3,1,""],loss_computator:[7,4,1,""]},"models.image2label":{Image2Label:[7,1,1,""]},"models.image2label.Image2Label":{_get_num_objects_per_step:[7,2,1,""],evaluate:[7,2,1,""],finalize_evaluation:[7,2,1,""],maybe_print_logs:[7,2,1,""]},"models.model":{Model:[7,1,1,""]},"models.model.Model":{__init__:[7,2,1,""],_build_forward_pass_graph:[7,2,1,""],_get_num_objects_per_step:[7,2,1,""],clip_last_batch:[7,2,1,""],compile:[7,2,1,""],evaluate:[7,2,1,""],finalize_evaluation:[7,2,1,""],finalize_inference:[7,2,1,""],get_data_layer:[7,2,1,""],get_num_objects_per_step:[7,2,1,""],get_optional_params:[7,3,1,""],get_output_tensors:[7,2,1,""],get_required_params:[7,3,1,""],get_tf_dtype:[7,2,1,""],hvd:[7,4,1,""],infer:[7,2,1,""],last_step:[7,4,1,""],maybe_print_logs:[7,2,1,""],mode:[7,4,1,""],num_gpus:[7,4,1,""],on_horovod:[7,4,1,""],params:[7,4,1,""],steps_in_epoch:[7,4,1,""]},"models.speech2text":{Speech2Text:[7,1,1,""],levenshtein:[7,5,1,""],sparse_tensor_to_chars:[7,5,1,""]},"models.speech2text.Speech2Text":{_get_num_objects_per_step:[7,2,1,""],evaluate:[7,2,1,""],finalize_evaluation:[7,2,1,""],finalize_inference:[7,2,1,""],infer:[7,2,1,""],maybe_print_logs:[7,2,1,""]},"models.text2text":{Text2Text:[7,1,1,""],calculate_bleu:[7,5,1,""],transform_for_bleu:[7,5,1,""]},"models.text2text.Text2Text":{_get_num_objects_per_step:[7,2,1,""],evaluate:[7,2,1,""],finalize_evaluation:[7,2,1,""],finalize_inference:[7,2,1,""],infer:[7,2,1,""],maybe_print_logs:[7,2,1,""]},"optimizers.automatic_loss_scaler":{AutomaticLossScaler:[9,1,1,""],BackoffScaler:[9,1,1,""],LogMaxScaler:[9,1,1,""]},"optimizers.automatic_loss_scaler.AutomaticLossScaler":{SUPPORTED_ALGOS:[9,4,1,""],check_grads:[9,3,1,""],loss_scale:[9,4,1,""],update_op:[9,2,1,""]},"optimizers.automatic_loss_scaler.BackoffScaler":{loss_scale:[9,4,1,""],update_op:[9,2,1,""]},"optimizers.automatic_loss_scaler.LogMaxScaler":{loss_scale:[9,4,1,""],update_op:[9,2,1,""]},"optimizers.lr_policies":{exp_decay:[9,5,1,""],fixed_lr:[9,5,1,""],piecewise_constant:[9,5,1,""],poly_decay:[9,5,1,""],transformer_policy:[9,5,1,""]},"optimizers.mp_wrapper":{MixedPrecisionOptimizerWrapper:[9,1,1,""],mp_regularizer_wrapper:[9,5,1,""]},"optimizers.mp_wrapper.MixedPrecisionOptimizerWrapper":{apply_gradients:[9,2,1,""],compute_gradients:[9,2,1,""]},"optimizers.optimizers":{_clip_gradients_by_norm:[9,5,1,""],get_regularization_loss:[9,5,1,""],optimize_loss:[9,5,1,""],post_process_gradients:[9,5,1,""],reduce_gradients:[9,5,1,""]},"parts.cnns":{conv_blocks:[11,0,0,"-"]},"parts.cnns.conv_blocks":{conv_actv:[11,5,1,""],conv_bn_actv:[11,5,1,""]},"parts.convs2s":{attention_wn_layer:[12,0,0,"-"],conv_wn_layer:[12,0,0,"-"],ffn_wn_layer:[12,0,0,"-"]},"parts.convs2s.attention_wn_layer":{AttentionLayerNormalized:[12,1,1,""]},"parts.convs2s.attention_wn_layer.AttentionLayerNormalized":{__init__:[12,2,1,""],call:[12,2,1,""]},"parts.convs2s.conv_wn_layer":{Conv1DNetworkNormalized:[12,1,1,""]},"parts.convs2s.conv_wn_layer.Conv1DNetworkNormalized":{__init__:[12,2,1,""],call:[12,2,1,""],gated_linear_units:[12,2,1,""]},"parts.convs2s.ffn_wn_layer":{FeedFowardNetworkNormalized:[12,1,1,""]},"parts.convs2s.ffn_wn_l
ayer.FeedFowardNetworkNormalized":{__init__:[12,2,1,""],call:[12,2,1,""]},"parts.rnns":{attention_wrapper:[13,0,0,"-"],flstm:[13,0,0,"-"],glstm:[13,0,0,"-"],gnmt:[13,0,0,"-"],rnn_beam_search_decoder:[13,0,0,"-"],slstm:[13,0,0,"-"],utils:[13,0,0,"-"]},"parts.rnns.attention_wrapper":{AttentionMechanism:[13,1,1,""],AttentionWrapper:[13,1,1,""],AttentionWrapperState:[13,1,1,""],BahdanauAttention:[13,1,1,""],BahdanauMonotonicAttention:[13,1,1,""],LuongAttention:[13,1,1,""],LuongMonotonicAttention:[13,1,1,""],hardmax:[13,5,1,""],monotonic_attention:[13,5,1,""],safe_cumprod:[13,5,1,""]},"parts.rnns.attention_wrapper.AttentionMechanism":{alignments_size:[13,4,1,""],state_size:[13,4,1,""]},"parts.rnns.attention_wrapper.AttentionWrapper":{__init__:[13,2,1,""],_item_or_tuple:[13,2,1,""],call:[13,2,1,""],output_size:[13,4,1,""],state_size:[13,4,1,""],zero_state:[13,2,1,""]},"parts.rnns.attention_wrapper.AttentionWrapperState":{clone:[13,2,1,""]},"parts.rnns.attention_wrapper.BahdanauAttention":{__init__:[13,2,1,""]},"parts.rnns.attention_wrapper.BahdanauMonotonicAttention":{__init__:[13,2,1,""]},"parts.rnns.attention_wrapper.LuongAttention":{__init__:[13,2,1,""]},"parts.rnns.attention_wrapper.LuongMonotonicAttention":{__init__:[13,2,1,""]},"parts.rnns.flstm":{FLSTMCell:[13,1,1,""]},"parts.rnns.flstm.FLSTMCell":{__init__:[13,2,1,""],call:[13,2,1,""],output_size:[13,4,1,""],state_size:[13,4,1,""]},"parts.rnns.glstm":{GLSTMCell:[13,1,1,""]},"parts.rnns.glstm.GLSTMCell":{__init__:[13,2,1,""],_get_input_for_group:[13,2,1,""],call:[13,2,1,""],output_size:[13,4,1,""],state_size:[13,4,1,""]},"parts.rnns.gnmt":{GNMTAttentionMultiCell:[13,1,1,""],gnmt_residual_fn:[13,5,1,""]},"parts.rnns.gnmt.GNMTAttentionMultiCell":{__init__:[13,2,1,""]},"parts.rnns.rnn_beam_search_decoder":{BeamSearchDecoder:[13,1,1,""],BeamSearchDecoderOutput:[13,1,1,""],BeamSearchDecoderState:[13,1,1,""],FinalBeamSearchDecoderOutput:[13,1,1,""],tile_batch:[13,5,1,""]},"parts.rnns.rnn_beam_search_decoder.BeamSearchDecoder":{__init__:[13,2,1,""],_maybe_merge_batch_beams:[13,2,1,""],_maybe_split_batch_beams:[13,2,1,""],_merge_batch_beams:[13,2,1,""],_split_batch_beams:[13,2,1,""],batch_size:[13,4,1,""],finalize:[13,2,1,""],initialize:[13,2,1,""],output_dtype:[13,4,1,""],output_size:[13,4,1,""],step:[13,2,1,""],tracks_own_finished:[13,4,1,""]},"parts.rnns.slstm":{BasicSLSTMCell:[13,1,1,""],_linear:[13,5,1,""]},"parts.rnns.slstm.BasicSLSTMCell":{__init__:[13,2,1,""],call:[13,2,1,""],output_size:[13,4,1,""],state_size:[13,4,1,""]},"parts.rnns.utils":{single_cell:[13,5,1,""]},"parts.transformer":{attention_layer:[14,0,0,"-"],beam_search:[14,0,0,"-"],common:[14,0,0,"-"],embedding_layer:[14,0,0,"-"],ffn_layer:[14,0,0,"-"],utils:[14,0,0,"-"]},"parts.transformer.attention_layer":{Attention:[14,1,1,""],SelfAttention:[14,1,1,""]},"parts.transformer.attention_layer.Attention":{call:[14,2,1,""],combine_heads:[14,2,1,""],split_heads:[14,2,1,""]},"parts.transformer.attention_layer.SelfAttention":{call:[14,2,1,""]},"parts.transformer.beam_search":{SequenceBeamSearch:[14,1,1,""],_StateKeys:[14,1,1,""],_expand_to_beam_size:[14,5,1,""],_flatten_beam_dim:[14,5,1,""],_gather_beams:[14,5,1,""],_gather_topk_beams:[14,5,1,""],_length_normalization:[14,5,1,""],_shape_list:[14,5,1,""],_unflatten_beam_dim:[14,5,1,""],sequence_beam_search:[14,5,1,""]},"parts.transformer.beam_search.SequenceBeamSearch":{_continue_search:[14,2,1,""],_create_initial_state:[14,2,1,""],_get_new_alive_state:[14,2,1,""],_get_new_finished_state:[14,2,1,""],_grow_alive_seq:[14,2,1,""],_search_ste
p:[14,2,1,""],search:[14,2,1,""]},"parts.transformer.beam_search._StateKeys":{ALIVE_CACHE:[14,4,1,""],ALIVE_LOG_PROBS:[14,4,1,""],ALIVE_SEQ:[14,4,1,""],CUR_INDEX:[14,4,1,""],FINISHED_FLAGS:[14,4,1,""],FINISHED_SCORES:[14,4,1,""],FINISHED_SEQ:[14,4,1,""]},"parts.transformer.common":{LayerNormalization:[14,1,1,""],PrePostProcessingWrapper:[14,1,1,""]},"parts.transformer.common.LayerNormalization":{build:[14,2,1,""],call:[14,2,1,""]},"parts.transformer.embedding_layer":{EmbeddingSharedWeights:[14,1,1,""]},"parts.transformer.embedding_layer.EmbeddingSharedWeights":{build:[14,2,1,""],call:[14,2,1,""],linear:[14,2,1,""]},"parts.transformer.ffn_layer":{FeedFowardNetwork:[14,1,1,""]},"parts.transformer.ffn_layer.FeedFowardNetwork":{call:[14,2,1,""]},"parts.transformer.utils":{get_decoder_self_attention_bias:[14,5,1,""],get_padding:[14,5,1,""],get_padding_bias:[14,5,1,""],get_position_encoding:[14,5,1,""]},"utils.funcs":{evaluate:[15,5,1,""],infer:[15,5,1,""],restore_and_get_results:[15,5,1,""],train:[15,5,1,""]},"utils.hooks":{BroadcastGlobalVariablesHook:[15,1,1,""],PrintLossAndTimeHook:[15,1,1,""],PrintSamplesHook:[15,1,1,""],RunEvaluationHook:[15,1,1,""]},"utils.hooks.BroadcastGlobalVariablesHook":{__init__:[15,2,1,""],after_create_session:[15,2,1,""],begin:[15,2,1,""]},"utils.hooks.PrintLossAndTimeHook":{after_run:[15,2,1,""],before_run:[15,2,1,""],begin:[15,2,1,""]},"utils.hooks.PrintSamplesHook":{after_run:[15,2,1,""],before_run:[15,2,1,""],begin:[15,2,1,""]},"utils.hooks.RunEvaluationHook":{after_run:[15,2,1,""],before_run:[15,2,1,""],begin:[15,2,1,""]},"utils.utils":{Logger:[15,1,1,""],array_to_string:[15,5,1,""],cast_types:[15,5,1,""],check_params:[15,5,1,""],clip_last_batch:[15,5,1,""],clip_sparse:[15,5,1,""],collect_if_horovod:[15,5,1,""],deco_print:[15,5,1,""],flatten_dict:[15,5,1,""],get_available_gpus:[15,5,1,""],get_git_diff:[15,5,1,""],get_git_hash:[15,5,1,""],get_results_for_epoch:[15,5,1,""],iterate_data:[15,5,1,""],log_summaries_from_dict:[15,5,1,""],mask_nans:[15,5,1,""],nest_dict:[15,5,1,""],nested_update:[15,5,1,""],text_ids_to_string:[15,5,1,""]},"utils.utils.Logger":{flush:[15,2,1,""],write:[15,2,1,""]},data:{data_layer:[0,0,0,"-"],image2label:[1,0,0,"-"],speech2text:[2,0,0,"-"],text2text:[3,0,0,"-"],utils:[0,0,0,"-"]},decoders:{convs2s_decoder:[4,0,0,"-"],decoder:[4,0,0,"-"],fc_decoders:[4,0,0,"-"],rnn_decoders:[4,0,0,"-"]},encoders:{cnn_encoder:[5,0,0,"-"],convs2s_encoder:[5,0,0,"-"],ds2_encoder:[5,0,0,"-"],encoder:[5,0,0,"-"],resnet_blocks:[5,0,0,"-"],resnet_encoder:[5,0,0,"-"],rnn_encoders:[5,0,0,"-"],w2l_encoder:[5,0,0,"-"]},losses:{cross_entropy_loss:[6,0,0,"-"],ctc_loss:[6,0,0,"-"],loss:[6,0,0,"-"],sequence_loss:[6,0,0,"-"]},models:{encoder_decoder:[7,0,0,"-"],image2label:[7,0,0,"-"],model:[7,0,0,"-"],speech2text:[7,0,0,"-"],text2text:[7,0,0,"-"]},optimizers:{automatic_loss_scaler:[9,0,0,"-"],lr_policies:[9,0,0,"-"],mp_wrapper:[9,0,0,"-"],optimizers:[9,0,0,"-"]},parts:{cnns:[11,0,0,"-"],convs2s:[12,0,0,"-"],rnns:[13,0,0,"-"],transformer:[14,0,0,"-"]},utils:{funcs:[15,0,0,"-"],hooks:[15,0,0,"-"],utils:[15,0,0,"-"]}},objnames:{"0":["py","module","Python module"],"1":["py","class","Python class"],"2":["py","method","Python method"],"3":["py","staticmethod","Python static method"],"4":["py","attribute","Python attribute"],"5":["py","function","Python 
function"]},objtypes:{"0":"py:module","1":"py:class","2":"py:method","3":"py:staticmethod","4":"py:attribute","5":"py:function"},terms:{"106gb":23,"1080ti":31,"16xlarg":30,"1e6":3,"1e9":14,"1x1":5,"224gb":23,"2xlarg":30,"4gpu":31,"55gb":23,"5gb":29,"8xlarg":30,"\u03b1":24,"\u03b4":24,"\u03b5":24,"\u03b6":24,"\u03ba":24,"abstract":[0,4,5,6,7],"boolean":[1,5,13],"byte":3,"case":[0,4,5,6,7,9,13,16,18,27,30],"char":0,"class":[0,1,2,3,4,5,6,7,9,12,13,14,15,18,27],"default":[3,5,6,9,13,15,27,29,30],"enum":3,"export":23,"final":[4,7,13,15,23,27,31],"float":[1,2,4,5,7,9,12,13,14,27,30],"function":[0,2,4,5,6,7,9,11,13,14,15,27,30],"import":[3,27,30],"int":[0,1,2,3,4,5,7,9,12,13,14,27],"long":13,"new":[0,1,3,12,13,14,15,25,27,28],"return":[0,1,2,3,4,5,6,7,9,12,13,14,15,18,30],"short":[13,27,31],"static":[0,1,2,3,4,5,6,7,9,13,18,27,30],"true":[0,1,3,5,6,7,9,13,14,16,27,31],"try":[5,9,24,27,29],"while":[1,3,18,24,27,30],AWS:30,Adding:25,And:[0,1,3,30],But:7,EOS:14,For:[1,2,3,4,5,7,9,13,15,18,23,24,27,28,30,31],IDs:3,Its:13,NOT:13,One:[24,27,30],Such:[13,18],That:[0,1,3,4,7],The:[0,1,3,5,7,9,13,14,15,18,23,27,29,30,31],There:27,These:[1,4,7,13,15,30],Use:5,Used:[3,7,14],Uses:[5,15,31],Using:25,Will:[5,7],With:[16,18,31],__call__:5,__init__:[0,1,2,3,4,5,6,7,12,13,15,18,27],_aspect_preserving_res:1,_baseattentionmechan:13,_basemonotonicattentionmechan:13,_batch_exampl:3,_build_attent:4,_build_forward_pass_graph:7,_building_block_v1:5,_building_block_v2:5,_cast_typ:[4,5,6],_central_crop:1,_clip_gradients_by_norm:9,_compute_loss:6,_continue_search:14,_count_and_gen_subtoken:3,_count_token:3,_create_decod:7,_create_encod:7,_create_initial_st:14,_create_loss:7,_create_min_max_boundari:3,_decod:4,_decode_crop_and_flip:1,_distributed_appli:9,_encod:5,_escape_token:3,_expand_to_beam_s:14,_filter_and_bucket_subtoken:3,_filter_max_length:3,_flatten_beam_dim:14,_gather_beam:14,_gather_topk_beam:14,_gather_tre:13,_gen_new_subtoken_list:3,_generate_alphabet_dict:3,_generate_subtoken:3,_generate_subtokens_with_target_vocab_s:3,_get_example_length:3,_get_input_for_group:13,_get_new_alive_st:14,_get_new_finished_st:14,_get_num_objects_per_step:7,_get_symbols_to_logits_fn:4,_grow_alive_seq:14,_item_or_tupl:13,_join_tokens_to_str:3,_length_norm:14,_linear:13,_list_to_index_dict:3,_load_record:3,_load_vocab_fil:3,_maybe_merge_batch_beam:13,_maybe_split_batch_beam:13,_mean_image_subtraction_and_norm:1,_merge_batch_beam:13,_monotonic_probability_fn:13,_native_to_unicod:3,_output:7,_parse_audio_el:2,_parse_audio_transcript_el:2,_parse_exampl:3,_parse_example_proto:1,_read_and_batch_from_fil:3,_resize_imag:1,_save_vocab_fil:3,_search_step:14,_shape_list:14,_smallest_size_at_least:1,_split_batch_beam:13,_split_string_to_token:3,_split_token_to_subtoken:3,_statekei:14,_subtoken_ids_to_token:3,_test:29,_token_to_subtoken_id:3,_unescape_token:3,_unflatten_beam_dim:14,_unicode_to_n:3,abl:[0,1,3,23,29],about:[5,15,31],abov:[13,24,30],abs:[5,13,31],absolut:30,acceler:13,accept:[3,5,11,13],access:[0,4,5,6,7],accord:13,accumul:[7,27],accuraci:[1,7,29,30],achiev:[13,29],across:[3,6,7,27,30],activ:[5,11,13,30],activation_fn:[5,11],actual:[4,7,13,27],adagrad:[7,9,27],adam:[7,9,27,31],adapt:[5,7,27],add:[3,4,5,7,9,13,15,27,30],add_eo:3,add_r:12,added:[0,3,5,7,12,13,14,15,27],adding:15,addit:[2,5,7,9,13,14,23,27,30,31],addition:[1,27],adjust:[7,9,24,27,30],after:[0,1,5,7,9,13,15,23,24,27,29],after_create_sess:15,after_run:15,again:[27,29],aggreg:7,aggregation_method:9,aggregationmethod:9,alben:30,algorithm:[7,9,27,30],align:13,alignment_his
tori:13,alignments_s:13,aliv:14,alive_cach:14,alive_log_prob:14,alive_seq:14,all:[0,1,2,3,4,5,6,7,13,14,15,18,24,27,29,30,31],allow:[3,24,31],along:[5,13],alpha:14,alphabet:[3,4],alphabet_config_path:4,alreadi:[13,28],also:[0,1,3,7,9,15,16,24,27,28,29,30],altern:[29,30],alwai:[7,9,27,30],amax:9,amount:3,analog:7,analysi:2,ani:[3,4,5,7,9,13,14,15,27,30,31],anoth:[3,13],answer:[7,27],anymor:15,anyth:9,api:[2,28],appear:3,append:[3,27],appli:[3,5,7,9,11,12,13,14,27,30],apply_gradi:9,approach:[7,13,16,30],apt:[23,29],arbitrari:[5,9],architectur:[24,30],archiv:23,aren:3,arg:[13,15],argmax:13,argument:[2,3,4,5,6,7,9,13,14,15,16,23,27],arithmet:30,around:[1,4,5,6,23,29,30],arrai:[2,3,7],arrang:1,array_op:13,array_to_str:15,articl:13,artifici:4,arxiv:[5,9,13,30,31],aspect:1,assign:[0,4],assum:[13,23,29],assumpt:[7,13],attend:13,attent:[4,12,13,14,24,31],attention_bia:14,attention_cel:13,attention_depth:13,attention_dropout:14,attention_lay:[8,10],attention_layer_s:13,attention_mechan:13,attention_or_cell_output:13,attention_st:13,attention_typ:4,attention_wn_lay:[8,10],attention_wrapp:[8,10],attentioninputwrapp:13,attentionlayernorm:12,attentionmechan:13,attentionwrapp:13,attentionwrapperst:13,attribut:[7,27],audio:[2,7,23],augment:2,augment_audio_sign:2,auto:6,automat:[4,5,6,7,9,27,29,31],automatic_loss_sc:30,automatic_loss_scal:8,automaticlossscal:9,autoregress:14,avail:[24,27,28,30,31],averag:[6,7,27],average_across_timestep:6,avoid:13,axi:[7,13],back:[14,30],backoff:[7,9,27,30],backoffscal:9,backpropag:30,backslash:3,bahadanau:13,bahdanau:[4,13],bahdanau_norm:4,bahdanauattent:13,bahdanaumonotonicattent:13,bandwidth:30,base:[0,1,2,3,4,5,6,7,9,12,13,14,15,16,23,24,27,28,29,31],base_model:27,base_param:[27,30],basic:[6,13,28],basic_sequence_loss:6,basicsequenceloss:6,basicslstmcel:13,batch:[3,4,5,6,7,9,11,13,14,18,27,31],batch_in_token:3,batch_norm:5,batch_siz:[2,3,4,5,6,12,13,14],batch_size_per_gpu:[6,7,24,27,31],batches_per_epoch:9,bazel:29,bbox:1,beahvior:13,beam:[4,13,14],beam_indic:14,beam_search:[8,10],beam_search_decoder_output:13,beam_siz:14,beam_width:[4,13],beamsearch:13,beamsearchdecod:13,beamsearchdecoderoutput:13,beamsearchdecoderst:13,beamsearchrnndecoderwithattent:4,becaus:[3,14,24,30],becom:2,been:[9,13,14,15,30],befor:[3,5,7,12,13,15,24,27,30],before_run:15,begin:[3,13,15,30],begin_decay_at:9,behavior:[13,30],being:[3,25],below:[29,30,31],bench_start:27,bench_step:27,benchmark:[7,27,30],benefici:30,bengio:13,besid:7,best:[13,14,29],beta1:9,beta2:9,better:[1,14],between:[3,7,13,14,31],bhadanau:13,bia:[12,13,14],bias:13,bias_initi:13,bidir_rnn_encoder_with_emb:5,bidirect:31,bidirectionalrnnencoderwithembed:5,big:[29,31],bigger:30,bin:29,binari:[3,23,29],bleu:[7,31],blob:14,block:[5,27],block_fn:5,block_lay:5,blue:24,bn_epsilon:[5,11],bn_momentum:[5,11],bn_regular:5,bodi:14,bool:[0,4,5,6,7,9,12,13,14,27],boost:29,bori:30,both:[1,3,7,13,27,30,31],bottleneck:5,bottleneck_block:5,bottleneck_block_v1:5,bottleneck_block_v2:5,bottom:13,bound:1,boundari:[3,9],boundary_scal:3,box:1,bpe_us:7,broadcast:15,broadcastglobalvariableshook:15,bucket:3,buckets_max:3,buckets_min:3,buffer:1,build:[2,4,5,14,15,22,28,29,30],build_graph:[0,1,2,3],build_image_data:1,build_lay:5,build_lm:23,build_pip_packag:29,building_block:5,building_block_v1:5,building_block_v2:5,built:[5,7,27,28],c_state:13,cach:14,calcul:[0,1,3,4,7,12,13,14],calculate_bleu:7,call:[4,5,6,7,9,12,13,14,15,18,30],callabl:[9,13],callback:15,can:[0,1,2,3,4,5,6,7,9,13,15,16,18,24,27,28,29,30,31],candiat:3,candid:3,cannot:[7,13,14,27],cast:[4,
5,6,30],cast_typ:15,cat:24,cell:[4,5,13,31],cell_class:13,cell_input_fn:13,cell_param:13,cell_stat:13,center:1,central:1,chang:[13,15,16,24,30],channel:[1,5],channels_first:5,channels_last:[5,11],charact:[3,4],check:[5,7,13,27,28,29,30],check_grad:9,check_param:15,checkpoint:[7,13,15,27,31],child:27,cho:13,choos:[13,30],christoph:13,cifar:1,cifardatalay:1,classic:7,clean:[23,31],cleaned_fil:24,clip:[7,9,27],clip_gradi:9,clip_last_batch:[7,15],clip_spars:15,clone:[13,29],close:[3,13,31],cloud:30,cmake:29,cnn:[5,8,10,28],cnn_encod:8,cnn_layer:5,cnnencod:5,code:[3,7,13,27],coeffici:[9,30],colin:13,collect:[7,9,14,15,30],collect_if_horovod:15,colloqui:1,coloc:9,colocate_gradients_with_op:9,color:1,colorspac:1,column:13,com:[12,14,29],combin:[7,9,14,28],combine_head:14,come:31,command:[16,23,24,27,29,31],comment:31,commit:27,common:[8,10,30],commonli:30,compar:[5,30],compat:9,compil:[7,27],complet:[7,9,19,20,21,26,27,29],complex:30,compon:13,compos:13,compress:23,comput:[1,6,7,9,13,14,30],compute_gradi:[9,30],compute_loss:[6,7,15],concat:13,concaten:[7,13],concret:30,config:[0,1,2,3,4,5,6,7,9,15,16,25,29,31],config_fil:[23,24,27,29,31],configur:[7,16,24,27,28,29,31],conflict:13,conjunct:[7,27],connect:[4,5,7,12,13,14],conrib:13,consecut:3,consist:[4,15],constant:[9,30],constraint:13,construct:[0,1,3,4,5,6,7,13,15],constructor:[0,2,4,5,6,7,9,13,27],consumpt:30,contain:[0,1,2,3,4,5,6,7,9,12,13,14,15,24,27,29,30,31],content:[1,4,5,6],context:13,continu:[4,14,27],continue_learn:27,contrib:13,control:[7,13],conv1d:[5,11],conv1dnetworknorm:12,conv2d:[5,11],conv2d_fixed_pad:5,conv:5,conv_actv:11,conv_block:[8,10],conv_bn_actv:11,conv_lay:5,conv_pad:12,conv_seq2seq:12,conv_wn_lay:[8,10],conveni:27,convent:5,converg:30,convert:[2,3,13,23,30],convnet_lay:5,convolut:[5,11,12,31],convs2:[5,8,10,14,31],convs2s_decod:8,convs2s_encod:8,convs2s_encoder_with_emb:5,convs2sdecod:4,convs2sencod:5,coord:[1,15],coordin:[1,15],copi:[5,7,13,30],copt:29,core:[3,30],core_cel:4,core_cell_param:4,correct:[7,13,14,23,24,27],correctli:[7,29],correspond:[0,1,2,3,7,9,13,14,18,24,27,29,30],correspondingli:[9,18],cosin:14,could:[0,2,4,5,6,7,9,15,27],count:[0,3,4,27],cover:27,cpu:[3,30],creat:[0,1,3,4,5,6,7,13,14,15,22,27,30],create_toy_data:24,creation:[7,30],crop:1,crop_height:1,crop_width:1,cross:6,cross_entropy_loss:8,cross_entropy_with_smooth:6,crossentropyloss:6,crossentropywithsmooth:6,csv:[2,23],ctc:[4,6],ctc_decoder_with_lm:29,ctc_greedy_decod:4,ctc_loss:8,ctcloss:6,cuda:[29,30],cudnn:[5,31],cudnn_gru:5,cudnn_lstm:5,cudnnlstm:13,cumprod:13,cumsum:13,cumul:13,cur_index:14,current:[1,4,5,7,9,12,13,14,27,31],custom:[29,30],cut:[4,7],d_model:9,dai:31,data:[1,2,3,4,5,6,7,8,13,15,17,23,24,25,27,30],data_fil:3,data_format:[5,11],data_lay:[1,2,3,4,5,6,7,8,27],data_layer_param:[7,27],data_root:24,datalay:[0,1,2,3,7,18,27],dataset:[0,1,2,3,7,18,22,24],dataset_fil:2,david:30,dct:15,debug:27,debug_port:[15,27],debugger_port:27,dec:5,decai:[9,30,31],decay_r:9,decay_step:9,deco_print:15,decod:[0,1,3,5,6,7,8,12,13,14,17,23,24,25,27,28,31],decode_and_crop:1,decode_pad:12,decode_pass:4,decoder_cell_typ:4,decoder_cell_unit:4,decoder_dp_input_keep_prob:4,decoder_dp_output_keep_prob:4,decoder_initial_st:13,decoder_library_path:4,decoder_output:6,decoder_param:[4,7,31],decoder_use_skip_connect:4,decreas:[3,30],deep:[5,13,30,31],deepbench:30,deepspeech2:31,deepspeech2encod:[5,27],deepspeech:[5,27,29],defaultdict:3,defin:[0,1,3,4,5,7,9,13,14,24,27,30],definit:5,degre:30,delet:23,delim:[7,15],denomin:[7,27],denot:1,dens:[5,7,13],dense_tensor:6,
dense_to_spars:6,depend:[5,13],deprec:13,depth:[1,13,28],deriv:[0,4,5,6,7,27],describ:[0,1,3,4,5,6,7,13,27,29,30,31],descript:[0,1,2,3,4,5,6,7,27,31],design:[28,30],desir:30,detail:[1,2,4,5,7,13,27,28,31],determin:[13,14],dev:[23,24,29,31],deviat:13,devic:15,diamo:30,dict:[0,1,2,3,4,5,6,7,13,14,27],dict_to_log:15,dictionari:[0,1,2,3,4,5,6,7,14,18,27],did:24,diederik:13,diff:27,differ:[1,3,5,13,14,15,24,27,28,29,30],dim:[4,5,6,13],dimens:[1,3,4,5,12,13,14],dimension:[9,13],dimenst:12,diment:12,direct:[5,31],directori:[7,27,29],disabl:[7,13,24,27,29,30],discov:29,disk:24,displai:27,distanc:7,distort:1,distribut:[3,7,13,27,28,29,30],divid:[1,14],divis:[0,2,3,7,13],dnn:30,do_mask:6,doc:[4,5,6,7,27],docker:[29,30],docstr:13,document:[13,27,28,30],doe:[1,4,6,7,13,24,27,29,30,31],doesn:1,domain:2,don:[3,5,14,31],done:[12,24,29],dot:14,dougla:13,download:[23,24],download_lm:29,downsampl:5,dp_input_keep_prob:13,dp_output_keep_prob:13,draw:14,drawn:3,dropout:[4,5,12,13],dropout_keep_prob:5,dropout_keep_prop:5,ds2_encod:[8,27],ds2_large_8gpu:31,ds2_librispeech_larc_config:23,ds2_medium_4gpu:31,ds2_small_1gpu:31,ds2_toy_data_config:[23,29],dtype:[0,4,5,6,7,9,13,14,15,27,30],due:13,dure:[1,3,4,7,13,14,15,24,27,30],dynam:[7,13,30],dynamic_decod:13,dzmitri:13,each:[0,1,3,4,6,7,9,13,14,15,18,27,30],eager:9,earli:[13,31],easi:28,easili:24,eck:13,effect:[13,31],effici:[3,5,13,28],eight:31,either:[0,2,4,5,6,7,9,13,27,30],element:[2,3,7,13],elimin:31,els:[6,9],elsen:30,embed:[4,5,12,13,14],embed_s:12,embed_scal:14,embedding_lay:[8,10],embedding_lookup:13,embedding_s:14,embeddingsharedweight:14,emit:13,emnlp:13,empti:[7,13,18,27],emul:[7,27],enabl:[7,9,13,16,27,29],enable_log:[27,31],enc_emb_w:5,encod:[0,1,3,4,6,7,8,12,13,14,17,24,25,27,28,31],encoder_cell_typ:5,encoder_cell_unit:5,encoder_decod:[4,5,6,8,27],encoder_dp_input_keep_prob:5,encoder_dp_output_keep_prob:5,encoder_final_st:13,encoder_lay:5,encoder_output:[4,5,13],encoder_output_a:12,encoder_output_b:12,encoder_outputs_b:4,encoder_param:[5,7],encoder_sequence_length:4,encoder_st:[5,13],encoder_use_skip_connect:5,encoderdecodermodel:[7,27],encorc:13,end:[3,4,13,14,15,24,29,30],end_compat:9,end_learning_r:9,end_of_choic:3,end_symbol:4,end_token:13,energi:13,enforc:13,english:[3,22],enough:[24,27,30],ensur:[3,13,14,15,30],entri:13,entropi:6,enumer:3,eos:14,eos_id:[3,14,15],epoch:[0,1,3,7,9,27,31],epsilon:[5,7,14,27],equal:[1,7,13,27],equival:[9,13],erich:30,error:[7,13,23,29,31],escap:3,especi:30,essenti:15,estim:30,etc:[4,5,7,18,27,28],etl:18,eval:[1,4,5,7,23,27],eval_input_fn:3,eval_model:15,eval_param:27,eval_step:[7,27],evalu:[0,1,3,7,15,23,24,27,31],evenli:3,event:[27,30],everi:[5,7],every_step:15,everyth:[23,27,28,29],exact:[13,27],exactli:13,exampl:[0,1,2,3,4,5,7,9,13,18,23,24,27,29,30],example_config:[23,24,27,29],example_seri:1,exce:[7,27],except:[5,13,15,27,29,30],execut:[7,9,16,24,27],exist:[0,4,13,25,30],exp:13,exp_decai:9,expect:6,experi:[24,27,28],experiment:28,explicit:5,explicitli:[12,13,30],exponenti:9,exponential_decai:9,express:27,extend:28,extens:30,extract:[2,18,23],fact_siz:13,factor:[13,14,30],fail:29,fairli:27,fals:[0,1,3,4,5,6,7,9,13,14,15,29,31],familiar:28,fc_decod:8,fc_layer:5,featur:[2,4,5,6,28],features_typ:2,fed:18,feed:[13,15],feed_dictionari:18,feedforward:[12,14],feedfowardnetwork:14,feedfowardnetworknorm:12,feel:23,fetch:7,few:31,ffn_layer:[8,10],ffn_wn_layer:[8,10],field:[1,2,4,5,6,13],file:[0,1,2,3,4,7,23,24,27,29,31],file_byte_limit:3,file_pattern:3,file_with_bpe_segment:24,filenam:[1,2,3],filepath:3,filter:[3,5,9,11]
,filter_s:14,final_output:4,final_sequence_length:4,final_st:[4,13],finalbeamdecoderoutput:13,finalbeamsearchdecoderoutput:13,finalize_evalu:7,finalize_infer:7,find:[14,28],finish:[13,14,24,27],finished_flag:14,finished_scor:14,finished_seq:14,first:[0,1,3,5,7,9,12,13,14,15,16,23,24,27,30],fit:3,fix:9,fixed_lr:9,fixed_pad:5,flag:[13,14],flaot:14,flat_dict:15,flatten_dict:15,flexibl:28,flip:1,float16:[0,4,5,6,7,9,27,30],float32:[0,2,4,5,6,7,9,12,14,27,30,31],flstm:[8,10],flstmcell:13,flush:15,folder:[23,24,27,29],follow:[0,1,3,4,5,6,7,11,18,23,24,27,29,30,31],forc:27,force_var_reus:7,forev:3,forget:[13,31],forget_bia:13,form:[0,3,13],format:[1,3,5,11],formul:14,forward:[7,30],found:3,four:31,fp32:30,fraction:3,frame:[2,7],free:23,frequenc:[2,3],frequent:3,from:[0,1,2,3,4,5,6,7,9,12,13,14,15,18,27,29,30,31],ftrl:[7,27],full:[5,9,30,31],fulli:[4,5,12,14],fully_connected_ctc_decod:4,fully_connected_decod:4,fully_connected_time_decod:4,fullyconnectedctcdecod:[4,23],fullyconnecteddecod:4,fullyconnectedtimedecod:4,func:8,furthermor:30,fuse:1,futur:14,gamma_regular:5,ganesh:30,garcia:30,gate:[9,12,13],gate_gradi:9,gate_graph:9,gate_non:9,gate_op:9,gated_linear_unit:12,gather:[14,15],gen_input_tensor:18,gener:[3,4,5,7,13,14,27],generate_tri:29,geometr:14,german:[3,22],get:[7,9,14,18,23,24,28,29],get_available_gpu:15,get_data_lay:7,get_decoder_self_attention_bia:14,get_git_diff:15,get_git_hash:15,get_next:18,get_num_objects_per_step:7,get_optional_param:[0,1,2,3,4,5,6,7,18,27],get_output_tensor:7,get_pad:14,get_padding_bia:14,get_position_encod:14,get_regularization_loss:9,get_required_param:[0,1,2,3,4,5,6,7,18,27],get_results_for_epoch:15,get_size_in_sampl:[0,1,2,3,18],get_speech_featur:2,get_speech_features_from_fil:2,get_tf_dtyp:7,get_vari:30,get_wmt16_en_dt:24,getter:30,ginsburg:[13,30],git:[27,29],github:[12,14,29],give:12,given:[1,9,13,14,30],global:[9,15],global_gradient_norm:[7,27],global_step:9,glstm:[4,8,10],glstmcell:13,glu:12,gnmt:[4,5,8,10,31],gnmt_encoder_with_emb:5,gnmt_encoder_with_emb_cudnn:5,gnmt_residual_fn:13,gnmt_v2:4,gnmtattentionmulticel:13,gnmtlikeencoderwithembed:5,gnmtlikeencoderwithembedding_cudnn:5,go_symbol:4,goal:24,going:[4,5,19,20,21,26,27,29],good:27,gpu:[0,6,7,15,16,18,24,27,28,29,30,31],gpu_id:[7,27],grad_loss:9,gradient:[7,9,13,27,30],gradient_norm:[7,27],grads_and_var:9,gram:23,graph:[0,1,2,3,4,5,6,7,9,15,27,30],graphic:30,graphkei:9,greater:5,greedi:31,gregori:30,group:[3,13],group_batch_s:3,group_id:13,group_siz:13,grow:14,gru:[4,5,31],guarante:[3,13],half:30,halv:30,handl:13,happen:[0,1,3,4,5,6,7,18],hard:13,hardmax:13,has:[0,1,2,3,4,5,6,7,13,14,15,30,31],has_nan:9,hash:27,hat:9,have:[0,1,2,3,4,5,6,7,9,13,14,18,24,27,28,29,30,31],head:14,height:1,height_in:5,help:[7,23,27],helper:[5,11,14],henc:30,here:[0,1,3,4,5,6,7,13,18,27,30],hetland:7,hidden:[4,5,13],hidden_dropout:12,hidden_s:[4,14],hieu:13,high:30,higher:13,highest:14,highli:31,histori:13,hold:[9,14],hook:[7,8],horovod:[0,3,6,7,9,15,16,27,31],horovod_gpu_broadcast:15,hot:[1,6,13],houston:30,how:[7,9,13,14,22,24,25],howev:[1,30],http:[5,7,9,12,13,14,29,31],human:15,hvd:[7,15,27],hyperparamet:30,iclr:[13,30],icml:13,id_and_audio_filenam:2,ident:[5,13],ids:[0,2,3,7,13,14,15,27],idx2char:7,ignor:[0,7,9,13,16,27],ignore_speci:[7,15],illeg:14,illustr:30,ilsvrc2012_val_00041207:1,imag:[1,5,7],image2label:[0,8,27],image_buff:1,image_s:1,imagenet_preprocess:[0,8],imagenetdatalay:1,implement:[6,7,9,12,13,14,18,27,31],impli:13,import_librivox:23,improv:[1,3,30],in_dim:12,incept:1,includ:[0,1,2,3,4,5,6,7,13,23,30],i
ncreas:[14,30],increment:9,independ:[5,6,7,27],index:[13,14],indexedslic:9,indic:[1,3,14,15],inf:13,infer:[0,1,3,4,5,7,13,15,23,27],infer_output_fil:[23,24,27],infer_param:27,infin:14,inform:[5,7,13,14,15,27,28],inherit:[0,4,5,6,7,18],init_from_fil:3,init_var:14,initi:[3,4,5,7,9,12,13,14,15,27,30],initial_cach:14,initial_cell_st:13,initial_id:14,initial_st:13,initializer_param:[4,5,7,27],inner:[13,14],input:[0,1,2,3,4,5,6,7,11,12,13,14,27,28],input_attention_bia:12,input_dict:[4,5,6,15],input_lay:5,input_length:4,input_s:13,input_sequence_length:13,input_tensor:[0,1,2,3,4,5,6,7],input_typ:2,input_valu:7,inputs_attention_bia:4,insensit:[7,9,27],insid:[0,1,3,7,24,30],inspect:30,inspir:[12,13],instabl:13,instal:[23,28],instanc:[0,4,5,6,7,13,30],instead:[1,4,9,13,29,30],instruct:[23,28],insur:14,int32:[1,13,14],int64:[13,14],intact:5,integ:[1,3,5,13],inter:30,intermedi:[7,27,30],intern:[9,13,25],introduc:[1,5,30],invalid:9,invalidargu:13,invari:14,invers:3,involv:14,is_train:1,issu:31,item:[3,13,14],iter:[0,1,2,3,7,15,18,27,30],iter_s:[7,9,27],iterate_data:15,its:[3,12,13,14,23,27],jian:5,join:3,jointli:13,jonah:30,jpeg:1,jul:5,just:[4,5,29],kaim:5,keep:[4,5,6,12,13,15,30],kei:[0,1,3,5,7,9,12,14],kenlm:[23,29],kept:30,kernel:[5,12,31],kernel_initi:13,kernel_regular:5,kernel_s:[5,11],kernel_width:12,key_channel:14,keyword:14,kind:27,kingma:[12,13],knee:1,known:[1,3,13],kpu:29,kuchaev:30,kuchaiev:13,kwarg:[13,14],kyunghyun:13,label:[1,4,6],lambda:[13,30],languag:[4,22],language_model:29,lar:[7,27],larc:[7,9,27,31],larc_eta:[7,27],larc_mod:[7,27],larc_param:[7,9,27],larg:[7,13,27,30],largest:14,last:[4,5,7,12,27],last_batch:[7,15],last_step:[7,15],latenc:30,later:30,latter:13,launch:[15,24],layer:[0,1,2,3,4,5,7,9,12,13,14,17,25,27,30,31],layer_id:12,layer_param:5,layer_typ:5,layernorm:14,layernorm_lstm:5,layout:1,lead:29,learn:[5,7,9,13,24,27,31],learnabl:13,learning_r:[7,9,27],learning_rate_decay_fn:9,least:[5,14],left:13,length:[0,1,2,3,4,5,6,12,13,14],length_i:14,length_penalty_weight:13,length_x:14,less:[3,30,31],level:[27,30],levenshtein:7,libboost:29,libctc_decoder_with_kenlm:29,librari:4,librispeech:[22,31],librivox:23,libsox:23,libtensorflow_cc:29,libtensorflow_framework:29,like:[5,9,14,18,24,31],limit:30,line:[0,3,15,16,24,27],linear:[4,12,13,14],link:[29,31],list:[0,1,2,3,4,5,6,7,9,13,14,18,27,28],liu:13,live:14,lm_binary_path:4,lm_trie_path:4,lm_weight:4,load:[0,3,18],load_pre_existing_vocabulari:0,locat:[14,24,29],log:[7,13,14,25,30],log_fil:15,log_max:9,log_summaries_from_dict:15,logdir:[7,24,27],logger:15,logic:[7,14,18],logit:[4,6,13,14],logits_to_outputs_func:4,logmax:[7,9,27,30],logmaxscal:9,lognorm:30,logspac:13,longer:15,longest:3,look:[5,7,27,28,31],loop:14,lose:30,loss:[0,1,3,4,7,8,9,17,25,27,31],loss_comput:7,loss_input_dict:6,loss_param:7,loss_scal:[7,9,27,30],lot:[23,24,29],lower:[3,30],lr_polici:[7,8,27],lr_policy_param:[7,27],ls_dir:23,lst:3,lstm:[4,5,13,31],lstmstatetupl:13,luong:[4,13],luong_scal:4,luongattent:13,luongmonotonicattent:13,m_state:13,machin:[13,22],mai:[3,13,14],main:[4,5,6,24,27,28],maintain:[14,30],major:13,make:[0,1,3,13,23,24,28,29],malform:9,man:13,mani:30,manner:13,manual:[13,29,30],map:[0,3,4,5,13],mark:[3,13,14],mask:[6,12,13,14],mask_nan:[6,15],mask_pad:14,master:[14,30],match:[1,3,5,13,30,31],matric:13,matrix:[13,14],max:[3,9],max_decode_length:14,max_grad_norm:[7,27],max_length:3,max_lr:9,max_pool2d:5,max_pooling2d:5,max_step:[7,27],max_subtoken_length:3,max_tim:13,max_timescal:14,maxim:9,maximum:[3,7,13,14,27,30],mayb:13,maybe_print_log:7,mean:
[Generated Sphinx search index (searchindex.js) omitted: a minified term-to-page mapping for the OpenSeq2Seq documentation. The indexed pages are: data, image2label, speech2text, text2text, decoders, encoders, losses, models, API documentation, optimizers, parts, cnns, convs2s, rnns, transformer, utils, Distributed training, Adding new models, Adding new data layer, Adding new decoder, Adding new encoder, Adding new loss, Getting started, Speech Recognition, Machine Translation, In-depth tutorials, Internal structure, Using existing models, OpenSeq2Seq, Installation instructions, Mixed precision training, and Models and recipes.]
\ No newline at end of file