diff --git a/detokenizer.perl b/detokenizer.perl new file mode 100755 index 000000000..41299baf2 --- /dev/null +++ b/detokenizer.perl @@ -0,0 +1,373 @@ +#!/usr/bin/env perl + +# $Id: detokenizer.perl 4134 2011-08-08 15:30:54Z bgottesman $ +# Sample De-Tokenizer +# written by Josh Schroeder, based on code by Philipp Koehn +# further modifications by Ondrej Bojar +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +use warnings; +use strict; +use utf8; # tell perl this script file is in UTF-8 (see all funny punct below) + +my $language = "en"; +my $QUIET = 0; +my $HELP = 0; +my $UPPERCASE_SENT = 0; +my $PENN = 0; + +while (@ARGV) { + $_ = shift; + /^-b$/ && ($| = 1, next); + /^-l$/ && ($language = shift, next); + /^-q$/ && ($QUIET = 1, next); + /^-h$/ && ($HELP = 1, next); + /^-u$/ && ($UPPERCASE_SENT = 1, next); + /^-penn$/ && ($PENN = 1, next); +} + +if ($HELP) { + print "Usage ./detokenizer.perl (-l [en|fr|it|cs|...]) < tokenizedfile > detokenizedfile\n"; + print "Options:\n"; + print " -u ... uppercase the first char in the final sentence.\n"; + print " -q ... don't report detokenizer revision.\n"; + print " -b ... disable Perl buffering.\n"; + print " -penn ... assume input is tokenized as per tokenizer.perl's -penn option.\n"; + exit; +} + +if ($language !~ /^(cs|en|fr|it|fi)$/) { + print STDERR "Warning: No built-in rules for language $language.\n" +} + +if ($PENN && $language ne "en") { + print STDERR "Error: -penn option only supported for English text.\n"; + exit; +} + +if (!$QUIET) { + print STDERR "Detokenizer Version ".'$Revision: 4134 $'."\n"; + print STDERR "Language: $language\n"; +} + +while() { + if (/^<.+>$/ || /^\s*$/) { + #don't try to detokenize XML/HTML tag lines + print $_; + } elsif ($PENN) { + print &detokenize_penn($_); + } else { + print &detokenize($_); + } +} + + +sub ucsecondarg { + # uppercase the second argument + my $arg1 = shift; + my $arg2 = shift; + return $arg1.uc($arg2); +} + +sub deescape { + # de-escape special chars + my ($text) = @_; + $text =~ s/\&bar;/\|/g; # factor separator (legacy) + $text =~ s/\|/\|/g; # factor separator + $text =~ s/\</\/g; # xml + $text =~ s/\&bra;/\[/g; # syntax non-terminal (legacy) + $text =~ s/\&ket;/\]/g; # syntax non-terminal (legacy) + $text =~ s/\"/\"/g; # xml + $text =~ s/\'/\'/g; # xml + $text =~ s/\[/\[/g; # syntax non-terminal + $text =~ s/\]/\]/g; # syntax non-terminal + $text =~ s/\&/\&/g; # escape escape + return $text; +} + +sub detokenize { + my($text) = @_; + chomp($text); + $text = " $text "; + $text =~ s/ \@\-\@ /-/g; + $text = &deescape($text); + + my $word; + my $i; + my @words = split(/ /,$text); + $text = ""; + my %quoteCount = ("\'"=>0,"\""=>0); + my $prependSpace = " "; + for ($i=0;$i<(scalar(@words));$i++) { + if (&startsWithCJKChar($words[$i])) { + if (($i > 0 && &endsWithCJKChar($words[$i-1])) && ($language ne "ko")) { + # perform left shift if this is a second consecutive CJK (Chinese/Japanese/Korean) word + $text=$text.$words[$i]; + } else { + # ... 
but do nothing special if this is a CJK word that doesn't follow a CJK word + $text=$text.$prependSpace.$words[$i]; + } + $prependSpace = " "; + } elsif ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) { + #perform right shift on currency and other random punctuation items + $text = $text.$prependSpace.$words[$i]; + $prependSpace = ""; + } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){ + if (($language eq "fr") && ($words[$i] =~ /^[\?\!\:\;\\\%]$/)) { + #these punctuations are prefixed with a non-breakable space in french + $text .= " "; } + #perform left shift on punctuation items + $text=$text.$words[$i]; + $prependSpace = " "; + } elsif (($language eq "en") && ($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) { + #left-shift the contraction for English + $text=$text.$words[$i]; + $prependSpace = " "; + } elsif (($language eq "cs") && ($i>1) && ($words[$i-2] =~ /^[0-9]+$/) && ($words[$i-1] =~ /^[.,]$/) && ($words[$i] =~ /^[0-9]+$/)) { + #left-shift floats in Czech + $text=$text.$words[$i]; + $prependSpace = " "; + } elsif ((($language eq "fr") ||($language eq "it")) && ($i<=(scalar(@words)-2)) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) { + #right-shift the contraction for French and Italian + $text = $text.$prependSpace.$words[$i]; + $prependSpace = ""; + } elsif (($language eq "cs") && ($i<(scalar(@words)-3)) + && ($words[$i] =~ /[\p{IsAlpha}]$/) + && ($words[$i+1] =~ /^[-–]$/) + && ($words[$i+2] =~ /^li$|^mail.*/i) + ) { + #right-shift "-li" in Czech and a few Czech dashed words (e-mail) + $text = $text.$prependSpace.$words[$i].$words[$i+1]; + $i++; # advance over the dash + $prependSpace = ""; + } elsif ($words[$i] =~ /^[\'\"„“`]+$/) { + #combine punctuation smartly + my $normalized_quo = $words[$i]; + $normalized_quo = '"' if $words[$i] =~ /^[„“”]+$/; + $quoteCount{$normalized_quo} = 0 + if !defined $quoteCount{$normalized_quo}; + if ($language eq "cs" && $words[$i] eq "„") { + # this is always the starting quote in Czech + $quoteCount{$normalized_quo} = 0; + } + if ($language eq "cs" && $words[$i] eq "“") { + # this is usually the ending quote in Czech + $quoteCount{$normalized_quo} = 1; + } + if (($quoteCount{$normalized_quo} % 2) eq 0) { + if(($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) { + #single quote for posesssives ending in s... "The Jones' house" + #left shift + $text=$text.$words[$i]; + $prependSpace = " "; + } else { + #right shift + $text = $text.$prependSpace.$words[$i]; + $prependSpace = ""; + $quoteCount{$normalized_quo} ++; + + } + } else { + #left shift + $text=$text.$words[$i]; + $prependSpace = " "; + $quoteCount{$normalized_quo} ++; + + } + + } elsif (($language eq "fi") && ($words[$i-1] =~ /:$/) && ($words[$i] =~ /^(N|n|A|a|Ä|ä|ssa|Ssa|ssä|Ssä|sta|stä|Sta|Stä|hun|Hun|hyn|Hyn|han|Han|hän|Hän|hön|Hön|un|Un|yn|Yn|an|An|än|Än|ön|Ön|seen|Seen|lla|Lla|llä|Llä|lta|Lta|ltä|Ltä|lle|Lle|ksi|Ksi|kse|Kse|tta|Tta|ine|Ine)(ni|si|mme|nne|nsa)?(ko|kö|han|hän|pa|pä|kaan|kään|kin)?$/)) { + # Finnish : without intervening space if followed by case suffix + # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ... + $text=$text. 
lc $words[$i]; + $prependSpace = " "; + } else { + $text=$text.$prependSpace.$words[$i]; + $prependSpace = " "; + } + } + + # clean up spaces at head and tail of each line as well as any double-spacing + $text =~ s/ +/ /g; + $text =~ s/\n /\n/g; + $text =~ s/ \n/\n/g; + $text =~ s/^ //g; + $text =~ s/ $//g; + + #add trailing break + $text .= "\n" unless $text =~ /\n$/; + + $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT; + + return $text; +} + +sub detokenize_penn { + my($text) = @_; + + chomp($text); + $text = " $text "; + $text =~ s/ \@\-\@ /-/g; + $text =~ s/ \@\/\@ /\//g; + $text = &deescape($text); + + # merge de-contracted forms except where the second word begins with an + # apostrophe (those are handled later) + $text =~ s/ n't /n't /g; + $text =~ s/ N'T /N'T /g; + $text =~ s/ ([Cc])an not / $1annot /g; + $text =~ s/ ([Dd])' ye / $1'ye /g; + $text =~ s/ ([Gg])im me / $1imme /g; + $text =~ s/ ([Gg])on na / $1onna /g; + $text =~ s/ ([Gg])ot ta / $1otta /g; + $text =~ s/ ([Ll])em me / $1emme /g; + $text =~ s/ '([Tt]) is / '$1is /g; + $text =~ s/ '([Tt]) was / '$1was /g; + $text =~ s/ ([Ww])an na / $1anna /g; + + # restore brackets + $text =~ s/-LRB-/\(/g; + $text =~ s/-RRB-/\)/g; + $text =~ s/-LSB-/\[/g; + $text =~ s/-RSB-/\]/g; + $text =~ s/-LCB-/{/g; + $text =~ s/-RCB-/}/g; + + my $i; + my @words = split(/ /,$text); + $text = ""; + my $prependSpace = " "; + for ($i=0;$i<(scalar(@words));$i++) { + if ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) { + # perform right shift on currency and other random punctuation items + $text = $text.$prependSpace.$words[$i]; + $prependSpace = ""; + } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){ + # perform left shift on punctuation items + $text=$text.$words[$i]; + $prependSpace = " "; + } elsif (($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) { + # left-shift the contraction + $text=$text.$words[$i]; + $prependSpace = " "; + } elsif ($words[$i] eq "`") { # Assume that punctuation has been normalized and is one of `, ``, ', '' only + # opening single quote: convert to straight quote and right-shift + $text = $text.$prependSpace."\'"; + $prependSpace = ""; + } elsif ($words[$i] eq "``") { + # opening double quote: convert to straight quote and right-shift + $text = $text.$prependSpace."\""; + $prependSpace = ""; + } elsif ($words[$i] eq "\'") { + # closing single quote: convert to straight quote and left shift + $text = $text."\'"; + $prependSpace = " "; + } elsif ($words[$i] eq "\'\'") { + # closing double quote: convert to straight quote and left shift + $text = $text."\""; + $prependSpace = " "; + } else { + $text = $text.$prependSpace.$words[$i]; + $prependSpace = " "; + } + } + + # clean up spaces at head and tail of each line as well as any double-spacing + $text =~ s/ +/ /g; + $text =~ s/\n /\n/g; + $text =~ s/ \n/\n/g; + $text =~ s/^ //g; + $text =~ s/ $//g; + + # add trailing break + $text .= "\n" unless $text =~ /\n$/; + + $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT; + + return $text; +} + +sub startsWithCJKChar { + my ($str) = @_; + return 0 if length($str) == 0; + my $firstChar = substr($str, 0, 1); + return &charIsCJK($firstChar); +} + +sub endsWithCJKChar { + my ($str) = @_; + return 0 if length($str) == 0; + my $lastChar = substr($str, length($str)-1, 1); + return &charIsCJK($lastChar); +} + +# Given a string consisting of one character, returns true iff the character +# is a CJK (Chinese/Japanese/Korean) character +sub 
charIsCJK { + my ($char) = @_; + # $char should be a string of length 1 + my $codepoint = &codepoint_dec($char); + + # The following is based on http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane + + # Hangul Jamo (1100–11FF) + return 1 if (&between_hexes($codepoint, '1100', '11FF')); + + # CJK Radicals Supplement (2E80–2EFF) + # Kangxi Radicals (2F00–2FDF) + # Ideographic Description Characters (2FF0–2FFF) + # CJK Symbols and Punctuation (3000–303F) + # Hiragana (3040–309F) + # Katakana (30A0–30FF) + # Bopomofo (3100–312F) + # Hangul Compatibility Jamo (3130–318F) + # Kanbun (3190–319F) + # Bopomofo Extended (31A0–31BF) + # CJK Strokes (31C0–31EF) + # Katakana Phonetic Extensions (31F0–31FF) + # Enclosed CJK Letters and Months (3200–32FF) + # CJK Compatibility (3300–33FF) + # CJK Unified Ideographs Extension A (3400–4DBF) + # Yijing Hexagram Symbols (4DC0–4DFF) + # CJK Unified Ideographs (4E00–9FFF) + # Yi Syllables (A000–A48F) + # Yi Radicals (A490–A4CF) + return 1 if (&between_hexes($codepoint, '2E80', 'A4CF')); + + # Phags-pa (A840–A87F) + return 1 if (&between_hexes($codepoint, 'A840', 'A87F')); + + # Hangul Syllables (AC00–D7AF) + return 1 if (&between_hexes($codepoint, 'AC00', 'D7AF')); + + # CJK Compatibility Ideographs (F900–FAFF) + return 1 if (&between_hexes($codepoint, 'F900', 'FAFF')); + + # CJK Compatibility Forms (FE30–FE4F) + return 1 if (&between_hexes($codepoint, 'FE30', 'FE4F')); + + # Range U+FF65–FFDC encodes halfwidth forms, of Katakana and Hangul characters + return 1 if (&between_hexes($codepoint, 'FF65', 'FFDC')); + + # Supplementary Ideographic Plane 20000–2FFFF + return 1 if (&between_hexes($codepoint, '20000', '2FFFF')); + + return 0; +} + +# Returns the code point of a Unicode char, represented as a decimal number +sub codepoint_dec { + if (my $char = shift) { + return unpack('U0U*', $char); + } +} + +sub between_hexes { + my ($num, $left, $right) = @_; + return $num >= hex($left) && $num <= hex($right); +} diff --git a/docs/html/_modules/data/image2label/image2label.html b/docs/html/_modules/data/image2label/image2label.html index 8f82b1463..b83262df4 100644 --- a/docs/html/_modules/data/image2label/image2label.html +++ b/docs/html/_modules/data/image2label/image2label.html @@ -162,11 +162,154 @@

Source code for data.image2label.image2label

import os
 import tensorflow as tf
+import numpy as np
 
 from open_seq2seq.data.data_layer import DataLayer
 from .imagenet_preprocessing import parse_record
 
 
+
[docs]class CifarDataLayer(DataLayer): + _HEIGHT = 28 + _WIDTH = 28 + _NUM_CHANNELS = 3 + _DEFAULT_IMAGE_BYTES = 32 * 32 * 3 + # The record is the image plus a one-byte label + _RECORD_BYTES = _DEFAULT_IMAGE_BYTES + 1 + _NUM_CLASSES = 10 + _NUM_DATA_FILES = 5 + + _NUM_IMAGES = { + 'train': 50000, + 'validation': 10000, + } + +
[docs] @staticmethod + def get_required_params(): + return dict(DataLayer.get_required_params(), **{ + 'data_dir': str, + })
+ +
[docs] @staticmethod + def get_optional_params(): + return dict(DataLayer.get_optional_params(), **{ + 'num_parallel_calls': int, + 'shuffle_buffer': int, + 'image_size': int, + 'num_classes': int, + })
+ + def __init__(self, params, model, num_workers, worker_id): + super(CifarDataLayer, self).__init__(params, model, + num_workers, worker_id) + if self.params['mode'] == 'infer': + raise ValueError('Inference is not supported on CifarDataLayer') + + if self.params['mode'] == 'train': + filenames = [ + os.path.join(self.params['data_dir'], 'data_batch_{}.bin'.format(i)) + for i in range(1, self._NUM_DATA_FILES + 1) + ] + else: + filenames = [os.path.join(self.params['data_dir'], 'test_batch.bin')] + + self.file_names = filenames + self._train_size = 50000 + self._valid_size = 10000 + self._iterator = None + self._input_tensors = None + +
[docs] def preprocess_image(self, image, is_training): + """Preprocess a single image of layout [height, width, depth].""" + if is_training: + # Resize the image to add four extra pixels on each side. + image = tf.image.resize_image_with_crop_or_pad( + image, self._HEIGHT + 8, self._WIDTH + 8) + + # Randomly crop a [_HEIGHT, _WIDTH] section of the image. + image = tf.random_crop(image, [self._HEIGHT, self._WIDTH, + self._NUM_CHANNELS]) + + # Randomly flip the image horizontally. + image = tf.image.random_flip_left_right(image) + + else: + image = tf.image.resize_image_with_crop_or_pad( + image, self._HEIGHT, self._WIDTH) + + # Subtract off the mean and divide by the variance of the pixels. + image = tf.image.per_image_standardization(image) + + return image
+ +
[docs] def parse_record(self, raw_record, is_training, num_classes=10): + """Parse CIFAR-10 image and label from a raw record.""" + # Convert bytes to a vector of uint8 that is record_bytes long. + record_vector = tf.decode_raw(raw_record, tf.uint8) + + # The first byte represents the label, which we convert from uint8 to int32 + # and then to one-hot. + label = tf.cast(record_vector[0], tf.int32) + + # The remaining bytes after the label represent the image, which we reshape + # from [depth * height * width] to [depth, height, width]. + depth_major = tf.reshape(record_vector[1:self._RECORD_BYTES], + [3, 32, 32]) + + # Convert from [depth, height, width] to [height, width, depth], and cast as + # float32. + image = tf.cast(tf.transpose(depth_major, [1, 2, 0]), tf.float32) + + image = self.preprocess_image(image, is_training) + label = tf.one_hot(tf.reshape(label, shape=[]), num_classes) + + return image, label
+ +
[docs] def build_graph(self): + dataset = tf.data.FixedLengthRecordDataset(self.file_names, + self._RECORD_BYTES) + + dataset = dataset.prefetch(buffer_size=self.params['batch_size']) + if self.params['shuffle']: + # shuffling images + dataset = dataset.shuffle(buffer_size=self.params.get('shuffle_buffer', + 1500)) + dataset = dataset.repeat() + + dataset = dataset.map( + lambda value: self.parse_record( + raw_record=value, + is_training=self.params['mode'] == 'train', + ), + num_parallel_calls=self.params.get('num_parallel_calls', 16), + ) + + dataset = dataset.batch(self.params['batch_size']) + dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE) + + self._iterator = dataset.make_initializable_iterator() + inputs, labels = self.iterator.get_next() + if self.params['mode'] == 'train': + tf.summary.image('augmented_images', inputs, max_outputs=1) + self._input_tensors = { + 'source_tensors': [inputs], + 'target_tensors': [labels], + }
+ + @property + def input_tensors(self): + return self._input_tensors + + @property + def iterator(self): + return self._iterator + +
[docs] def get_size_in_samples(self): + if self.params['mode'] == 'train': + return self._train_size + else: + return len(np.arange(self._valid_size)[self._worker_id::self._num_workers])
+ +
[docs]class ImagenetDataLayer(DataLayer):
[docs] @staticmethod def get_required_params(): @@ -179,6 +322,8 @@

Source code for data.image2label.image2label

return dict(DataLayer.get_optional_params(), **{
       'num_parallel_calls': int,
       'shuffle_buffer': int,
+      'image_size': int,
+      'num_classes': int,
     })
def __init__(self, params, model, num_workers, worker_id): @@ -231,12 +376,17 @@

Source code for data.image2label.image2label

dataset = dataset.repeat()
 
     dataset = dataset.map(
-      lambda value: parse_record(value, self.params['mode'] == 'train'),
+      lambda value: parse_record(
+        raw_record=value,
+        is_training=self.params['mode'] == 'train',
+        image_size=self.params.get('image_size', 224),
+        num_classes=self.params.get('num_classes', 1000),
+      ),
       num_parallel_calls=self.params.get('num_parallel_calls', 16),
     )
 
     dataset = dataset.batch(self.params['batch_size'])
-    dataset = dataset.prefetch(1)
+    dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
 
     self._iterator = dataset.make_initializable_iterator()
     inputs, labels = self.iterator.get_next()
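For reference, the CIFAR-10 binary format that `CifarDataLayer.parse_record` above decodes is one label byte followed by 3072 image bytes stored as [depth, height, width]. A minimal NumPy sketch of the same decoding outside the tf.data pipeline (the helper name and file path are illustrative assumptions, not part of the data layer):

    import numpy as np

    _RECORD_BYTES = 1 + 32 * 32 * 3  # one label byte plus a CHW uint8 image

    def decode_cifar10_record(raw_record):
        """Decode one CIFAR-10 record the same way parse_record does."""
        record = np.frombuffer(raw_record, dtype=np.uint8)
        label = int(record[0])
        # bytes 1..3072 are [depth, height, width]; transpose to [height, width, depth]
        image = record[1:_RECORD_BYTES].reshape(3, 32, 32).transpose(1, 2, 0)
        return image.astype(np.float32), label

    # usage with a hypothetical local copy of the dataset:
    # with open('cifar-10-batches-bin/data_batch_1.bin', 'rb') as f:
    #     image, label = decode_cifar10_record(f.read(_RECORD_BYTES))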
diff --git a/docs/html/_modules/data/image2label/imagenet_preprocessing.html b/docs/html/_modules/data/image2label/imagenet_preprocessing.html
index f5f425d67..6c7ad01bf 100644
--- a/docs/html/_modules/data/image2label/imagenet_preprocessing.html
+++ b/docs/html/_modules/data/image2label/imagenet_preprocessing.html
@@ -168,22 +168,17 @@ 

Source code for data.image2label.imagenet_preprocessing

# limitations under the License. # ============================================================================== """Provides utilities to preprocess images. - Training images are sampled using the provided bounding boxes, and subsequently cropped to the sampled bounding box. Images are additionally flipped randomly, then resized to the target output size (without aspect-ratio preservation). - Images used during evaluation are resized (with aspect-ratio preservation) and centrally cropped. - All images undergo mean color subtraction. - Note that these steps are colloquially referred to as "ResNet preprocessing," and they differ from "VGG preprocessing," which does not use bounding boxes and instead does an aspect-preserving resize followed by random crop during training. (These both differ from "Inception preprocessing," which introduces color distortion steps.) - """ from __future__ import absolute_import @@ -202,14 +197,9 @@

Source code for data.image2label.imagenet_preprocessing

# _RESIZE_MIN x (_RESIZE_MIN * 2). _RESIZE_MIN = 256 -_DEFAULT_IMAGE_SIZE = 224 -_NUM_CHANNELS = 3 -_NUM_CLASSES = 1001 -
[docs]def _decode_crop_and_flip(image_buffer, bbox, num_channels): """Crops the given image to a random part of the image, and randomly flips. - We use the fused decode_and_crop op, which performs better than the two ops used separately in series, but note that this requires that the image be passed in as an un-decoded string Tensor. @@ -223,7 +213,6 @@

Source code for data.image2label.imagenet_preprocessing

Returns: 3-D tensor with cropped image. - """ # A large fraction of image datasets contain a human-annotated bounding box # delineating the region of the image containing the object of interest. We @@ -278,12 +267,12 @@

Source code for data.image2label.imagenet_preprocessing

image, [crop_top, crop_left, 0], [crop_height, crop_width, -1])
-
[docs]def _mean_image_subtraction(image, means, num_channels): - """Subtracts the given means from each image channel. +
[docs]def _mean_image_subtraction_and_normalization(image, means, num_channels): + """Subtracts the given means from each image channel and divides by 127.5. For example: means = [123.68, 116.779, 103.939] - image = _mean_image_subtraction(image, means) + image = _mean_image_subtraction_and_normalization(image, means) Note that the rank of `image` must be known. @@ -293,7 +282,7 @@

Source code for data.image2label.imagenet_preprocessing

num_channels: number of color channels in the image that will be distorted. Returns: - the centered image. + the centered and normalized image. Raises: ValueError: If the rank of `image` is unknown, if `image` has a rank other

Source code for data.image2label.imagenet_preprocessing

# We have a 1-D tensor of means; convert to 3-D. means = tf.expand_dims(tf.expand_dims(means, 0), 0) - return image - means
+ return (image - means) / 127.5
[docs]def _smallest_size_at_least(height, width, resize_min): """Computes new shape with the smallest side equal to `smallest_side`. - Computes new shape with the smallest side equal to `smallest_side` while preserving the original aspect ratio. @@ -364,7 +352,6 @@

Source code for data.image2label.imagenet_preprocessing

[docs]def _resize_image(image, height, width): """Simple wrapper around tf.resize_images. - This is primarily to make sure we use the same `ResizeMethod` and other details each time. @@ -385,7 +372,6 @@

Source code for data.image2label.imagenet_preprocessing

[docs]def preprocess_image(image_buffer, bbox, output_height, output_width, num_channels, is_training=False): """Preprocesses the given image. - Preprocessing includes decoding, cropping, and resizing for both training and eval images. Training preprocessing, however, introduces some random distortion of the image to improve accuracy. @@ -416,16 +402,15 @@

Source code for data.image2label.imagenet_preprocessing

image.set_shape([output_height, output_width, num_channels]) - return _mean_image_subtraction(image, _CHANNEL_MEANS, num_channels)
+ return _mean_image_subtraction_and_normalization(image, _CHANNEL_MEANS, + num_channels)
[docs]def _parse_example_proto(example_serialized): """Parses an Example proto containing a training example of an image. - The output of the build_image_data.py image preprocessing script is a dataset containing serialized Example protocol buffers. Each Example proto contains the following fields (values are included as examples): - image/height: 462 image/width: 581 image/colorspace: 'RGB' @@ -489,16 +474,17 @@

Source code for data.image2label.imagenet_preprocessing

return features['image/encoded'], label, bbox
-
[docs]def parse_record(raw_record, is_training): +
[docs]def parse_record(raw_record, is_training, image_size=224, num_classes=1000): """Parses a record containing a training example of an image. - The input record is parsed into a label and image, and the image is passed through preprocessing steps (cropping, flipping, and so on). Args: raw_record: scalar Tensor tf.string containing a serialized - Example protocol buffer. + Example protocol buffer. is_training: A boolean denoting whether the input is for training. + image_size (int): size that images should be resized to. + num_classes (int): number of output classes. Returns: Tuple with processed image tensor and one-hot-encoded label tensor. @@ -508,12 +494,13 @@

Source code for data.image2label.imagenet_preprocessing

image = preprocess_image( image_buffer=image_buffer, bbox=bbox, - output_height=_DEFAULT_IMAGE_SIZE, - output_width=_DEFAULT_IMAGE_SIZE, - num_channels=_NUM_CHANNELS, + output_height=image_size, + output_width=image_size, + num_channels=3, is_training=is_training) - label = tf.one_hot(tf.reshape(label, shape=[]), _NUM_CLASSES) + # subtracting 1 to make labels go from 0 to 999 + label = tf.one_hot(tf.reshape(label - 1, shape=[]), num_classes) return image, label
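The updated `parse_record` above shifts the raw 1-based ImageNet labels down by one before one-hot encoding, so a 1000-way classifier sees classes 0 through 999. A tiny NumPy sketch of that mapping (the helper name is illustrative):

    import numpy as np

    def one_hot_shifted(raw_label, num_classes=1000):
        """One-hot encode a 1-based label as a 0-based vector (mirrors `label - 1` above)."""
        vec = np.zeros(num_classes, dtype=np.float32)
        vec[raw_label - 1] = 1.0
        return vec

    assert one_hot_shifted(1).argmax() == 0       # first class maps to index 0
    assert one_hot_shifted(1000).argmax() == 999  # last class maps to index 999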
diff --git a/docs/html/_modules/data/speech2text/speech2text.html b/docs/html/_modules/data/speech2text/speech2text.html index 4f55f45b1..3f527aaab 100644 --- a/docs/html/_modules/data/speech2text/speech2text.html +++ b/docs/html/_modules/data/speech2text/speech2text.html @@ -175,7 +175,7 @@

Source code for data.speech2text.speech2text

def get_required_params():
     return dict(DataLayer.get_required_params(), **{
       'num_audio_features': int,
-      'input_type': ['spectrogram', 'mfcc'],
+      'input_type': ['spectrogram', 'mfcc', 'logfbank'],
       'vocab_file': str,
       'dataset_files': list,
     })
@@ -187,7 +187,7 @@

Source code for data.speech2text.speech2text

'pad_to': int,
     })
-
[docs] def __init__(self, params, model, num_workers=None, worker_id=None): +
[docs] def __init__(self, params, model, num_workers, worker_id): """Speech-to-text data layer constructor. See parent class for arguments description. @@ -241,7 +241,6 @@

Source code for data.speech2text.speech2text

self._input_tensors = None
[docs] def split_data(self, data): - """Method that performs data split for evaluation.""" if self.params['mode'] != 'train' and self._num_workers is not None: size = len(data) start = size // self._num_workers * self._worker_id @@ -260,12 +259,12 @@

Source code for data.speech2text.speech2text

[docs]  def build_graph(self):
     """Builds data processing graph using ``tf.data`` API."""
-    self._dataset = tf.data.Dataset.from_tensor_slices(self._files)
-    if self.params['shuffle']:
-      self._dataset = self._dataset.shuffle(self._size)
-    self._dataset = self._dataset.repeat()
-
     if self.params['mode'] != 'infer':
+      self._dataset = tf.data.Dataset.from_tensor_slices(self._files)
+      if self.params['shuffle']:
+        self._dataset = self._dataset.shuffle(self._size)
+      self._dataset = self._dataset.repeat()
+
       self._dataset = self._dataset.map(
         lambda line: tf.py_func(
           self._parse_audio_transcript_element,
@@ -280,21 +279,29 @@ 

Source code for data.speech2text.speech2text

padded_shapes=([None, self.params['num_audio_features']], 1, [None], 1)
       )
     else:
+      indices = self.split_data(
+        np.array(list(map(lambda num: str(num), range(len(self.all_files)))))
+      )
+      self._dataset = tf.data.Dataset.from_tensor_slices(
+        np.hstack((indices[:, np.newaxis], self._files[:, np.newaxis]))
+      )
+      self._dataset = self._dataset.repeat()
       self._dataset = self._dataset.map(
         lambda line: tf.py_func(
           self._parse_audio_element,
           [line],
-          [self.params['dtype'], tf.int32],
+          [self.params['dtype'], tf.int32, tf.int32],
           stateful=False,
         ),
         num_parallel_calls=8,
       )
       self._dataset = self._dataset.padded_batch(
         self.params['batch_size'],
-        padded_shapes=([None, self.params['num_audio_features']], 1)
+        padded_shapes=([None, self.params['num_audio_features']], 1, 1)
       )
 
-    self._iterator = self._dataset.prefetch(8).make_initializable_iterator()
+    self._iterator = self._dataset.prefetch(tf.contrib.data.AUTOTUNE)\
+                         .make_initializable_iterator()
 
     if self.params['mode'] != 'infer':
       x, x_length, y, y_length = self._iterator.get_next()
@@ -303,7 +310,9 @@ 

Source code for data.speech2text.speech2text

y.set_shape([self.params['batch_size'], None])
       y_length = tf.reshape(y_length, [self.params['batch_size']])
     else:
-      x, x_length = self._iterator.get_next()
+      x, x_length, x_id = self._iterator.get_next()
+      x_id = tf.reshape(x_id, [self.params['batch_size']])
+
     x.set_shape([self.params['batch_size'], None,
                  self.params['num_audio_features']])
     x_length = tf.reshape(x_length, [self.params['batch_size']])
@@ -311,7 +320,9 @@ 

Source code for data.speech2text.speech2text

self._input_tensors = {}
     self._input_tensors["source_tensors"] = [x, x_length]
     if self.params['mode'] != 'infer':
-      self._input_tensors['target_tensors'] = [y, y_length]
+      self._input_tensors['target_tensors'] = [y, y_length]
+    else:
+      self._input_tensors['source_ids'] = [x_id]
[docs] def _parse_audio_transcript_element(self, element): """Parses tf.data element from TextLineDataset into audio and text. @@ -338,15 +349,17 @@

Source code for data.speech2text.speech2text

np.int32(target), \
            np.int32([len(target)])
-
[docs] def _parse_audio_element(self, audio_filename): +
[docs] def _parse_audio_element(self, id_and_audio_filename): """Parses audio from file and returns array of audio features. Args: - audio_filename: audio file name. + id_and_audio_filename: tuple of sample id and corresponding audio file name. Returns: tuple: source audio features as ``np.array``, length of source sequence, + sample id. """ + idx, audio_filename = id_and_audio_filename pad_to = self.params.get('pad_to', 8) source = get_speech_features_from_file( audio_filename, self.params['num_audio_features'], pad_to, @@ -354,7 +367,7 @@

Source code for data.speech2text.speech2text

augmentation=self.params.get('augmentation', None),
     )
     return source.astype(self.params['dtype'].as_numpy_dtype()), \
-           np.int32([len(source)])
+ np.int32([len(source)]), np.int32([idx])
@property def input_tensors(self): diff --git a/docs/html/_modules/data/speech2text/speech_utils.html b/docs/html/_modules/data/speech2text/speech_utils.html index 92f7dd343..0d9bfc77d 100644 --- a/docs/html/_modules/data/speech2text/speech_utils.html +++ b/docs/html/_modules/data/speech2text/speech_utils.html @@ -197,6 +197,13 @@

Source code for data.speech2text.speech_utils

)
+
[docs]def normalize_signal(signal): + """ + Normalize float32 signal to [-1, 1] range + """ + return signal / np.max(np.abs(signal))
+ +
[docs]def augment_audio_signal(signal, fs, augmentation): """Function that performs audio signal augmentation. @@ -208,7 +215,7 @@

Source code for data.speech2text.speech_utils

Returns: np.array: np.array with augmented audio signal. """ - signal_float = signal.astype(np.float32) / 32768.0 + signal_float = normalize_signal(signal.astype(np.float32)) if augmentation['time_stretch_ratio'] > 0: # time stretch (might be slow) @@ -227,7 +234,7 @@

Source code for data.speech2text.speech_utils

signal_float += np.random.randn(signal_float.shape[0]) * \ 10.0 ** (noise_level_db / 20.0) - return (signal_float * 32768.0).astype(np.int16)
+ return (normalize_signal(signal_float) * 32767.0).astype(np.int16)
[docs]def get_speech_features(signal, fs, num_features, pad_to=8, @@ -273,7 +280,7 @@

Source code for data.speech2text.speech_utils

if pad_to > 0: if length % pad_to != 0: pad_size = (pad_to - length % pad_to) * n_window_stride - signal = np.pad(signal, (0, pad_size), mode='reflect') + signal = np.pad(signal, (0, pad_size), mode='constant') if features_type == 'spectrogram': frames = psf.sigproc.framesig(sig=signal, @@ -301,10 +308,22 @@

Source code for data.speech2text.speech_utils

preemph=0.97, ceplifter=2*num_features, appendEnergy=False) + + elif features_type == 'logfbank': + features = psf.logfbank(signal=signal, + samplerate=fs, + winlen=window_size, + winstep=window_stride, + nfilt=num_features, + nfft=512, + lowfreq=0, highfreq=fs/2, + preemph=0.97) + else: raise ValueError('Unknown features type: {}'.format(features_type)) - assert features.shape[0] % pad_to == 0 + if pad_to > 0: + assert features.shape[0] % pad_to == 0 m = np.mean(features) s = np.std(features) features = (features - m) / s diff --git a/docs/html/_modules/data/text2text/t2t.html b/docs/html/_modules/data/text2text/t2t.html index 8caf1ebfe..adbbbff9f 100644 --- a/docs/html/_modules/data/text2text/t2t.html +++ b/docs/html/_modules/data/text2text/t2t.html @@ -200,8 +200,9 @@

Source code for data.text2text.t2t

    is the list of training files. Second, while reading records using
    `parallel_interleave`, the `sloppy` argument is used to generate randomness
    in the order of the examples.
-"""
 
+3. Modified slightly to fit OpenSeq2Seq needs
+"""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -227,7 +228,7 @@ 

Source code for data.text2text.t2t

   return tf.data.TFRecordDataset(filename, buffer_size=_READ_RECORD_BUFFER)
-
[docs]def _parse_example(serialized_example): +
[docs]def _parse_example(serialized_example, pad_2_eight=False): """Return inputs and targets Tensors from a serialized tf.Example.""" data_fields = { "inputs": tf.VarLenFeature(tf.int64), @@ -236,6 +237,17 @@

Source code for data.text2text.t2t

   parsed = tf.parse_single_example(serialized_example, data_fields)
   inputs = tf.sparse_tensor_to_dense(parsed["inputs"])
   targets = tf.sparse_tensor_to_dense(parsed["targets"])
+
+  if pad_2_eight:
+    inputs = tf.cond(tf.equal(tf.shape(inputs)[0] % 8, 0),
+                     true_fn=lambda:  inputs,
+                     false_fn=lambda: tf.pad(inputs,
+                                      paddings=[[0, 8 - tf.shape(inputs)[0] % 8]]))
+    targets = tf.cond(tf.equal(tf.shape(targets)[0] % 8, 0),
+                     true_fn=lambda:  targets,
+                     false_fn=lambda: tf.pad(targets,
+                                      paddings=[[0, 8 - tf.shape(targets)[0] % 8]]))
+
   return inputs, targets
@@ -283,7 +295,7 @@

Source code for data.text2text.t2t

   return buckets_min, buckets_max
-
[docs]def _batch_examples(dataset, batch_size, max_length): +
[docs]def _batch_examples(dataset, batch_size, max_length, pad_2_eight=True): """Group examples by similar lengths, and return batched dataset. Each batch of similar-length examples are padded to the same length, and may @@ -309,7 +321,12 @@

Source code for data.text2text.t2t

 
   # Create list of batch sizes for each bucket_id, so that
   # bucket_batch_size[bucket_id] * buckets_max[bucket_id] <= batch_size
-  bucket_batch_sizes = [batch_size // x for x in buckets_max]
+  if pad_2_eight: # pad to 8 for HMMA
+    bucket_batch_sizes = [
+      batch_size // x if batch_size // x % 8 == 0 else batch_size // x + (
+            8 - batch_size // x % 8) for x in buckets_max]
+  else:
+    bucket_batch_sizes = [batch_size // x for x in buckets_max]
   # bucket_id will be a tensor, so convert this list to a tensor as well.
   bucket_batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64)
 
@@ -331,7 +348,6 @@ 

Source code for data.text2text.t2t

   def batching_fn(bucket_id, grouped_dataset):
     """Batch and add padding to a dataset of elements with similar lengths."""
     bucket_batch_size = window_size_fn(bucket_id)
-
     # Batch the dataset and add padding so that all input sequences in the
     # examples have the same length, and all target sequences have the same
     # lengths as well. Resulting lengths of inputs and targets can differ.
@@ -346,7 +362,7 @@ 

Source code for data.text2text.t2t

 
 
[docs]def _read_and_batch_from_files( file_pattern, batch_size, max_length, num_cpu_cores, shuffle, repeat, - num_workers, worker_id): + num_workers, worker_id, batch_in_tokens, pad2eight=True): """Create dataset where each item is a dict of "inputs" and "targets". Args: @@ -359,6 +375,11 @@

Source code for data.text2text.t2t

       repeated forever.
     num_workers: Number of workers or number of Horovod workers
     worker_id: Worker id or Horovod rank
+    batch_in_tokens: whether batch_size counts tokens or sentence pairs.
+    Batching in tokens is more efficient as it reduces the number of PADs;
+    batching in sentences should be used in inference mode, since the order of
+    sentences matters there.
+    pad2eight: if True, it will pad both dimensions to be divisible by 8
 
   Returns:
     tf.data.Dataset object containing examples loaded from the files.
@@ -379,14 +400,19 @@ 

Source code for data.text2text.t2t

 
   # Parse each tf.Example into a dictionary
   # TODO: Look into prefetch_input_elements for performance optimization.
-  dataset = dataset.map(_parse_example,
+  dataset = dataset.map(lambda x: _parse_example(x, pad_2_eight=pad2eight),
                         num_parallel_calls=num_cpu_cores)
 
   # Remove examples where the input or target length exceeds the maximum length,
   dataset = dataset.filter(lambda x, y: _filter_max_length((x, y), max_length))
 
-  # Batch such that each batch has examples of similar length.
-  dataset = _batch_examples(dataset, batch_size, max_length)
+  if batch_in_tokens:
+    # Batch such that each batch has examples of similar length.
+    dataset = _batch_examples(dataset, batch_size, max_length,
+                              pad_2_eight=pad2eight)
+  else:
+    # Examples can have different lengths
+    dataset = dataset.padded_batch(batch_size, ([None], [None]))
   dataset = dataset.repeat(repeat)
 
   # Prefetch the next element to improve speed of input pipeline.
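The `pad_2_eight` changes above (in `_parse_example` and `_batch_examples`) amount to rounding sequence lengths and bucket batch sizes up to the next multiple of 8 so that mixed-precision Tensor Core (HMMA) kernels see aligned shapes. A plain-Python sketch of that arithmetic (function names are illustrative):

    def pad_amount(length, multiple=8):
        """Number of PAD positions needed to make `length` divisible by `multiple`."""
        remainder = length % multiple
        return 0 if remainder == 0 else multiple - remainder

    def round_up(value, multiple=8):
        """Round `value` up to the next multiple of `multiple` (as for bucket batch sizes)."""
        return value + pad_amount(value, multiple)

    assert pad_amount(13) == 3    # a length-13 sequence gets 3 extra PAD tokens
    assert round_up(100) == 104   # a bucket batch size of 100 is bumped to 104
    assert round_up(96) == 96     # already divisible by 8, left unchanged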
diff --git a/docs/html/_modules/data/text2text/text2text.html b/docs/html/_modules/data/text2text/text2text.html
index 6af9cecb6..c59b0fc53 100644
--- a/docs/html/_modules/data/text2text/text2text.html
+++ b/docs/html/_modules/data/text2text/text2text.html
@@ -232,7 +232,8 @@ 

Source code for data.text2text.text2text

     self._delimiter = self.params.get('delimiter', ' ')
     self._map_parallel_calls = self.params.get('map_parallel_calls', 8)
     self._pad_lengths_to_eight = self.params.get('pad_lengths_to_eight', False)
-    self._prefetch_buffer_size = self.params.get('prefetch_buffer_size', 4)
+    self._prefetch_buffer_size = self.params.get('prefetch_buffer_size',
+                                                 tf.contrib.data.AUTOTUNE)
     self._num_workers = num_workers
     self._worker_id = worker_id
     if self._pad_lengths_to_eight and not (self.params['max_length'] % 8 == 0):
@@ -321,7 +322,7 @@ 

Source code for data.text2text.text2text

              [SpecialTextTokens.EOS_ID.value], self._pad_lengths_to_eight), dtype="int32")
 
     _sources = tf.data.TextLineDataset(self.source_file)\
-      .map(lambda line: tf.py_func(func=src_token_to_id,inp=[line],
+      .map(lambda line: tf.py_func(func=src_token_to_id, inp=[line],
                                    Tout=[tf.int32], stateful=False),
            num_parallel_calls=self._map_parallel_calls) \
       .map(lambda tokens: (tokens, tf.size(tokens)),
@@ -409,7 +410,8 @@ 

Source code for data.text2text.text2text

       'repeat': int,
       'num_cpu_cores': int,
       'tgt_vocab_file': str,
-      'm_padding': bool,
+      'pad_data_to_eight': bool,
+      'batch_in_tokens': bool,
     })
def __init__(self, params, model, num_workers=1, worker_id=0): @@ -456,38 +458,13 @@

Source code for data.text2text.text2text

       shuffle=self.params['shuffle'],
       repeat=self.params['repeat'],
       num_workers=self._num_workers,
-      worker_id=self._worker_id)
+      worker_id=self._worker_id,
+      batch_in_tokens=self.params.get('batch_in_tokens', True),
+      pad2eight=self.params.get('pad_data_to_eight', False))
 
     self._iterator = self.batched_dataset.make_initializable_iterator()
     x, y = self.iterator.get_next()
 
-    if self.params.get('m_padding', False):
-      # MAGIC PADDING
-      x = tf.cond(tf.equal(tf.shape(x)[1] % 8, 0),
-                  true_fn = lambda: x,
-                  false_fn = lambda: tf.pad(x,
-                                            paddings=[[0, 0],
-                                                      [0, 8 - tf.shape(x)[1] % 8]]))
-
-      y = tf.cond(tf.equal(tf.shape(y)[1] % 8, 0),
-                  true_fn = lambda: y,
-                  false_fn = lambda: tf.pad(y,
-                                            paddings=[[0, 0],
-                                                      [0, 8 - tf.shape(y)[1] % 8]]))
-
-      x = tf.cond(tf.equal(tf.shape(x)[0] % 8, 0),
-                  true_fn = lambda: x,
-                  false_fn = lambda: tf.pad(x,
-                                            paddings=[[0, 8 - tf.shape(x)[0] % 8],
-                                                      [0, 0]]))
-
-      y = tf.cond(tf.equal(tf.shape(y)[0] % 8, 0),
-                  true_fn=lambda: y,
-                  false_fn=lambda: tf.pad(y,
-                                          paddings=[[0, 8 - tf.shape(y)[0] % 8],
-                                                    [0, 0]]))
-      # ENDOF MAGIC PADDING
-
     len_x = tf.count_nonzero(x, axis=1, dtype=tf.int32)
     len_y = tf.count_nonzero(y, axis=1, dtype=tf.int32)
     if self.params['mode'] == 'train' or self.params['mode'] == 'eval':
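The new `batch_in_tokens` option passed above keeps the length-bucketed batching of `_read_and_batch_from_files` for training and falls back to a plain `padded_batch` for inference, where sentence order must be preserved. A toy sketch, with made-up lengths, of why bucketing similar lengths reduces PAD waste:

    def pad_fraction(batch_lengths):
        """Fraction of positions in a padded batch that are PAD tokens."""
        max_len = max(batch_lengths)
        total = max_len * len(batch_lengths)
        return (total - sum(batch_lengths)) / total

    lengths = [5, 7, 8, 30, 31, 32]   # hypothetical sentence lengths

    print(pad_fraction(lengths))            # one mixed batch: ~0.41 of positions are PAD
    print(pad_fraction([5, 7, 8]))          # bucketed short batch: ~0.17
    print(pad_fraction([30, 31, 32]))       # bucketed long batch:  ~0.03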
diff --git a/docs/html/_modules/decoders/convs2s_decoder.html b/docs/html/_modules/decoders/convs2s_decoder.html
new file mode 100644
index 000000000..1b084dd87
--- /dev/null
+++ b/docs/html/_modules/decoders/convs2s_decoder.html
@@ -0,0 +1,602 @@

Source code for decoders.convs2s_decoder

+from __future__ import absolute_import, division, print_function
+from __future__ import unicode_literals
+
+import tensorflow as tf
+import math
+from .decoder import Decoder
+
+from open_seq2seq.parts.transformer import beam_search
+
+from open_seq2seq.parts.transformer import embedding_layer
+from open_seq2seq.parts.transformer.utils import get_padding
+
+from open_seq2seq.parts.convs2s import ffn_wn_layer, conv_wn_layer, attention_wn_layer
+
+# Default value used if max_input_length is not given
+MAX_INPUT_LENGTH = 128
+
+
+
[docs]class ConvS2SDecoder(Decoder): + +
[docs] @staticmethod + def get_required_params(): + """Static method with description of required parameters. + + Returns: + dict: + Dictionary containing all the parameters that **have to** be + included into the ``params`` parameter of the + class :meth:`__init__` method. + """ + return dict( + Decoder.get_required_params(), **{ + 'batch_size': int, + 'decoder_layers': int, + 'tgt_emb_size': int, + 'tgt_vocab_size': int, + 'shared_embed': bool, + 'embedding_dropout_keep_prob': float, + 'conv_nchannels_kwidth': list, + 'hidden_dropout_keep_prob': float, + 'out_dropout_keep_prob': float, + 'beam_size': int, + 'alpha': float, + 'extra_decode_length': int, + 'EOS_ID': int, + })
+ +
[docs] @staticmethod + def get_optional_params(): + """Static method with description of optional parameters. + + Returns: + dict: + Dictionary containing all the parameters that **can** be + included into the ``params`` parameter of the + class :meth:`__init__` method. + """ + return dict( + Decoder.get_optional_params(), + **{ + 'pad_embeddings_2_eight': bool, + + # if not provided, tgt_emb_size is used as the default value + 'out_emb_size': int, + 'max_input_length': int, + 'GO_SYMBOL': int, + 'PAD_SYMBOL': int, + 'END_SYMBOL': int, + })
+ + def _cast_types(self, input_dict): + return input_dict + + def __init__(self, params, model, name="convs2s_decoder", mode='train'): + super(ConvS2SDecoder, self).__init__(params, model, name, mode) + self.embedding_softmax_layer = None + self.position_embedding_layer = None + self.layers = [] + self._tgt_vocab_size = self.params['tgt_vocab_size'] + self._tgt_emb_size = self.params['tgt_emb_size'] + self._mode = mode + self._pad_sym = self.params.get('PAD_SYMBOL', 0) + self._pad2eight = params.get('pad_embeddings_2_eight', False) + + def _decode(self, input_dict): + targets = input_dict['target_tensors'][0] \ + if 'target_tensors' in input_dict else None + + encoder_outputs = input_dict['encoder_output']['outputs'] + encoder_outputs_b = input_dict['encoder_output'].get( + 'outputs_b', encoder_outputs) + + inputs_attention_bias = input_dict['encoder_output'].get( + 'inputs_attention_bias_cs2s', None) + + with tf.name_scope("decode"): + # prepare decoder layers + if len(self.layers) == 0: + knum_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[0] + kwidth_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[1] + + # preparing embedding layers + with tf.variable_scope("embedding"): + if 'embedding_softmax_layer' in input_dict['encoder_output'] \ + and self.params['shared_embed']: + self.embedding_softmax_layer = \ + input_dict['encoder_output']['embedding_softmax_layer'] + else: + self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights( + vocab_size=self._tgt_vocab_size, + hidden_size=self._tgt_emb_size, + pad_vocab_to_eight=self._pad2eight, + init_var=0.1, + embed_scale=False, + pad_sym=self._pad_sym, + mask_paddings=False) + + with tf.variable_scope("pos_embedding"): + if 'position_embedding_layer' in input_dict['encoder_output'] \ + and self.params['shared_embed']: + self.position_embedding_layer = \ + input_dict['encoder_output']['position_embedding_layer'] + else: + self.position_embedding_layer = embedding_layer.EmbeddingSharedWeights( + vocab_size=self.params.get("max_input_length", + MAX_INPUT_LENGTH), + hidden_size=self._tgt_emb_size, + pad_vocab_to_eight=self._pad2eight, + init_var=0.1, + embed_scale=False, + pad_sym=self._pad_sym, + mask_paddings=False) + + # linear projection before cnn layers + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + self._tgt_emb_size, + knum_list[0], + dropout=self.params["embedding_dropout_keep_prob"], + var_scope_name="linear_mapping_before_cnn_layers")) + + for i in range(self.params['decoder_layers']): + in_dim = knum_list[i] if i == 0 else knum_list[i - 1] + out_dim = knum_list[i] + + # linear projection is needed for residual connections if + # input and output of a cnn layer do not match + if in_dim != out_dim: + linear_proj = ffn_wn_layer.FeedFowardNetworkNormalized( + in_dim, + out_dim, + var_scope_name="linear_mapping_cnn_" + str(i + 1), + dropout=1.0) + else: + linear_proj = None + + conv_layer = conv_wn_layer.Conv1DNetworkNormalized( + in_dim, + out_dim, + kernel_width=kwidth_list[i], + mode=self.mode, + layer_id=i + 1, + hidden_dropout=self.params["hidden_dropout_keep_prob"], + conv_padding="VALID", + decode_padding=True) + + att_layer = attention_wn_layer.AttentionLayerNormalized( + out_dim, + embed_size=self._tgt_emb_size, + layer_id=i + 1, + add_res=True) + + self.layers.append([linear_proj, conv_layer, att_layer]) + + # linear projection after cnn layers + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + knum_list[self.params['decoder_layers'] - 1], + 
self.params.get("out_emb_size", self._tgt_emb_size), + dropout=1.0, + var_scope_name="linear_mapping_after_cnn_layers")) + + if not self.params['shared_embed']: + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + self.params.get("out_emb_size", self._tgt_emb_size), + self._tgt_vocab_size, + dropout=self.params["out_dropout_keep_prob"], + var_scope_name="linear_mapping_to_vocabspace")) + else: + # if embedding is shared, + # the shared embedding is used as the final linear projection to vocab space + self.layers.append(None) + + if targets is None: + return self.predict(encoder_outputs, encoder_outputs_b, + inputs_attention_bias) + else: + logits = self.decode_pass(targets, encoder_outputs, encoder_outputs_b, + inputs_attention_bias) + return { + "logits": logits, + "outputs": [tf.argmax(logits, axis=-1)], + "final_state": None, + "final_sequence_lengths": None + } + +
[docs] def decode_pass(self, targets, encoder_outputs, encoder_outputs_b, + inputs_attention_bias): + """Generate logits for each value in the target sequence. + + Args: + targets: target values for the output sequence. + int tensor with shape [batch_size, target_length] + encoder_outputs: continuous representation of input sequence. + float tensor with shape [batch_size, input_length, hidden_size] + float tensor with shape [batch_size, input_length, hidden_size] + encoder_outputs_b: continuous representation of input sequence + which includes the source embeddings. + float tensor with shape [batch_size, input_length, hidden_size] + inputs_attention_bias: float tensor with shape [batch_size, 1, input_length] + + Returns: + float32 tensor with shape [batch_size, target_length, vocab_size] + """ + + # Prepare inputs to decoder layers by applying embedding + # and adding positional encoding. + decoder_inputs = self.embedding_softmax_layer(targets) + + with tf.name_scope("add_pos_encoding"): + pos_input = tf.range( + 0, tf.shape(decoder_inputs)[1], delta=1, dtype=tf.int32, name='range') + pos_encoding = self.position_embedding_layer(pos_input) + decoder_inputs = decoder_inputs + tf.cast( + x=pos_encoding, dtype=decoder_inputs.dtype) + + if self.mode == "train": + decoder_inputs = tf.nn.dropout(decoder_inputs, + self.params["embedding_dropout_keep_prob"]) + + # mask the paddings in the target + inputs_padding = get_padding( + targets, padding_value=self._pad_sym, dtype=decoder_inputs.dtype) + decoder_inputs *= tf.expand_dims(1.0 - inputs_padding, 2) + + # do decode + logits = self._call( + decoder_inputs=decoder_inputs, + encoder_outputs_a=encoder_outputs, + encoder_outputs_b=encoder_outputs_b, + input_attention_bias=inputs_attention_bias) + + return logits
+ + def _call(self, decoder_inputs, encoder_outputs_a, encoder_outputs_b, + input_attention_bias): + # run input into the decoder layers and returns the logits + target_embed = decoder_inputs + with tf.variable_scope("linear_layer_before_cnn_layers"): + outputs = self.layers[0](decoder_inputs) + + for i in range(1, len(self.layers) - 2): + linear_proj, conv_layer, att_layer = self.layers[i] + + with tf.variable_scope("layer_%d" % i): + if linear_proj is not None: + res_inputs = linear_proj(outputs) + else: + res_inputs = outputs + + with tf.variable_scope("conv_layer"): + outputs = conv_layer(outputs) + + with tf.variable_scope("attention_layer"): + outputs = att_layer(outputs, target_embed, encoder_outputs_a, + encoder_outputs_b, input_attention_bias) + outputs = (outputs + res_inputs) * math.sqrt(0.5) + + with tf.variable_scope("linear_layer_after_cnn_layers"): + outputs = self.layers[-2](outputs) + + if self.mode == "train": + outputs = tf.nn.dropout(outputs, self.params["out_dropout_keep_prob"]) + + with tf.variable_scope("pre_softmax_projection"): + if self.layers[-1] is None: + logits = self.embedding_softmax_layer.linear(outputs) + else: + logits = self.layers[-1](outputs) + + return tf.cast(logits, dtype=tf.float32) + +
[docs] def predict(self, encoder_outputs, encoder_outputs_b, inputs_attention_bias): + """Return predicted sequence.""" + batch_size = tf.shape(encoder_outputs)[0] + input_length = tf.shape(encoder_outputs)[1] + max_decode_length = input_length + self.params["extra_decode_length"] + + symbols_to_logits_fn = self._get_symbols_to_logits_fn() + + # Create initial set of IDs that will be passed into symbols_to_logits_fn. + initial_ids = tf.zeros( + [batch_size], dtype=tf.int32) + self.params["GO_SYMBOL"] + + cache = {} + # Add encoder outputs and attention bias to the cache. + cache["encoder_outputs"] = encoder_outputs + cache["encoder_outputs_b"] = encoder_outputs_b + if inputs_attention_bias is not None: + cache["inputs_attention_bias"] = inputs_attention_bias + + # Use beam search to find the top beam_size sequences and scores. + decoded_ids, scores = beam_search.sequence_beam_search( + symbols_to_logits_fn=symbols_to_logits_fn, + initial_ids=initial_ids, + initial_cache=cache, + vocab_size=self.params["tgt_vocab_size"], + beam_size=self.params["beam_size"], + alpha=self.params["alpha"], + max_decode_length=max_decode_length, + eos_id=self.params["EOS_ID"]) + + # Get the top sequence for each batch element + top_decoded_ids = decoded_ids[:, 0, :] + top_scores = scores[:, 0] + + # this isn't particularly efficient + logits = self.decode_pass(top_decoded_ids, encoder_outputs, + encoder_outputs_b, inputs_attention_bias) + + return { + "logits": logits, + "outputs": [top_decoded_ids], + "final_state": None, + "final_sequence_lengths": None + }
+ +
[docs] def _get_symbols_to_logits_fn(self): + """Returns a decoding function that calculates logits of the next tokens.""" + + def symbols_to_logits_fn(ids, i, cache): + """Generate logits for next potential IDs. + + Args: + ids: Current decoded sequences. + int tensor with shape [batch_size * beam_size, i - 1] + i: Loop index + cache: dictionary of values storing the encoder output, encoder-decoder + attention bias, and previous decoder attention values. + + Returns: + Tuple of + (logits with shape [batch_size * beam_size, vocab_size], + updated cache values) + """ + + # pass the decoded ids from the beginning up to the current position into the decoder + # not efficient + decoder_outputs = self.decode_pass(ids, cache.get("encoder_outputs"), + cache.get("encoder_outputs_b"), + cache.get("inputs_attention_bias")) + + logits = decoder_outputs[:, i, :] + return logits, cache + + return symbols_to_logits_fn
+
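For context on the `conv_nchannels_kwidth` parameter used in `_decode` above: it is a list of (num_channels, kernel_width) pairs, one per convolutional block, and the `zip(*...)` calls split it into the two per-layer lists. A tiny sketch with made-up values:

    # hypothetical value for 'conv_nchannels_kwidth' (three decoder layers)
    conv_nchannels_kwidth = [(512, 3), (512, 3), (1024, 5)]

    knum_list = list(zip(*conv_nchannels_kwidth))[0]    # (512, 512, 1024) -> channels per layer
    kwidth_list = list(zip(*conv_nchannels_kwidth))[1]  # (3, 3, 5)        -> kernel widths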
+ +
+ +
+ + +
+
+ +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/html/_modules/decoders/decoder.html b/docs/html/_modules/decoders/decoder.html index 0751e74a4..bd4c98f82 100644 --- a/docs/html/_modules/decoders/decoder.html +++ b/docs/html/_modules/decoders/decoder.html @@ -242,24 +242,9 @@

Source code for decoders.decoder

       else:
         self._params['dtype'] = tf.float32
 
-    if 'regularizer' not in self._params:
-      if self._model and 'regularizer' in self._model.params:
-        self._params['regularizer'] = self._model.params['regularizer']
-        self._params['regularizer_params'] = self._model.params['regularizer_params']
-
-    if 'regularizer' in self._params:
-      init_dict = self._params.get('regularizer_params', {})
-      self._params['regularizer'] = self._params['regularizer'](**init_dict)
-      if self._params['dtype'] == 'mixed':
-        self._params['regularizer'] = mp_regularizer_wrapper(
-          self._params['regularizer'],
-        )
-
-    if self._params['dtype'] == 'mixed':
-      self._params['dtype'] = tf.float16
-
     self._name = name
-    self._mode = mode
+    self._mode = mode
+    self._compiled = False
[docs] def decode(self, input_dict): """Wrapper around :meth:`self._decode() <_decode>` method. @@ -272,12 +257,35 @@

Source code for decoders.decoder

     Returns:
       see :meth:`self._decode() <_decode>` docs.
     """
+    if not self._compiled:
+      if 'regularizer' not in self._params:
+        if self._model and 'regularizer' in self._model.params:
+          self._params['regularizer'] = copy.deepcopy(
+            self._model.params['regularizer']
+          )
+          self._params['regularizer_params'] = copy.deepcopy(
+            self._model.params['regularizer_params']
+          )
+
+      if 'regularizer' in self._params:
+        init_dict = self._params.get('regularizer_params', {})
+        self._params['regularizer'] = self._params['regularizer'](**init_dict)
+        if self._params['dtype'] == 'mixed':
+          self._params['regularizer'] = mp_regularizer_wrapper(
+            self._params['regularizer'],
+          )
+
+      if self._params['dtype'] == 'mixed':
+        self._params['dtype'] = tf.float16
+      
     if 'initializer' in self.params:
       init_dict = self.params.get('initializer_params', {})
       initializer = self.params['initializer'](**init_dict)
     else:
       initializer = None
 
+    self._compiled = True
+
     with tf.variable_scope(self._name, initializer=initializer,
                            dtype=self.params['dtype']):
       return self._decode(self._cast_types(input_dict))
@@ -315,7 +323,8 @@

Source code for decoders.decoder

 
           {
             "logits": logits that will be passed to Loss
-            "samples": actual decoded output, e.g. characters instead of logits
+            "outputs": list with actual decoded outputs, e.g. characters
+                       instead of logits
           }
     """
     pass
diff --git a/docs/html/_modules/decoders/fc_decoders.html b/docs/html/_modules/decoders/fc_decoders.html index e2aecfac0..c8680966c 100644 --- a/docs/html/_modules/decoders/fc_decoders.html +++ b/docs/html/_modules/decoders/fc_decoders.html @@ -209,7 +209,7 @@

Source code for decoders.fc_decoders

 
         {
           'logits': logits with the shape=[batch_size, output_dim]
-          'samples': [logits] (same as logits but wrapped in list)
+          'outputs': [logits] (same as logits but wrapped in list)
         }
     """
     inputs = input_dict['encoder_output']['outputs']
@@ -222,7 +222,7 @@ 

Source code for decoders.fc_decoders

       kernel_regularizer=regularizer,
       name='fully_connected',
     )
-    return {'logits': logits, 'samples': [logits]}
+    return {'logits': logits, 'outputs': [logits]}

[docs]class FullyConnectedTimeDecoder(Decoder):

@@ -252,7 +252,7 @@

Source code for decoders.fc_decoders

     * **tgt_vocab_size** (int) --- target vocabulary size, i.e. number of
       output features.
     * **logits_to_outputs_func** --- function that maps produced logits to
-      decoder samples, i.e. actual text sequences.
+      decoder outputs, i.e. actual text sequences.
     """
     super(FullyConnectedTimeDecoder, self).__init__(params, model, name, mode)
@@ -274,7 +274,7 @@

Source code for decoders.fc_decoders

 
         {
           'logits': logits with the shape=[time length, batch_size, tgt_vocab_size]
-          'samples': logits_to_outputs_func(logits, input_dict)
+          'outputs': logits_to_outputs_func(logits, input_dict)
         }
     """
     inputs = input_dict['encoder_output']['outputs']
@@ -301,9 +301,9 @@ 

Source code for decoders.fc_decoders

     logits = tf.transpose(logits, [1, 0, 2])
 
     if 'logits_to_outputs_func' in self.params:
-      samples = self.params['logits_to_outputs_func'](logits, input_dict)
+      outputs = self.params['logits_to_outputs_func'](logits, input_dict)
       return {
-        'samples': samples,
+        'outputs': outputs,
         'logits': logits,
         'src_length': input_dict['encoder_output']['src_length'],
       }
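As the renamed keys above show, whatever `logits_to_outputs_func` produces is now returned under `'outputs'` (a list) rather than `'samples'`. A small numpy sketch of a greedy version of such a function, assuming time-major logits of shape [time, batch, vocab] as in the docstring; in practice a CTC or beam-search decoder is typically plugged in here::

    import numpy as np

    def greedy_logits_to_outputs(logits, input_dict=None):
      # hypothetical example: argmax over the vocabulary axis
      best_ids = np.argmax(logits, axis=-1)   # [time, batch]
      return [best_ids.T]                     # list of [batch, time] outputs

    logits = np.random.randn(7, 2, 5)         # time=7, batch=2, vocab=5
    print(greedy_logits_to_outputs(logits)[0].shape)   # (2, 7)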
diff --git a/docs/html/_modules/decoders/rnn_decoders.html b/docs/html/_modules/decoders/rnn_decoders.html
index f5cd124d2..e049842a6 100644
--- a/docs/html/_modules/decoders/rnn_decoders.html
+++ b/docs/html/_modules/decoders/rnn_decoders.html
@@ -165,7 +165,7 @@ 

Source code for decoders.rnn_decoders

 
 from open_seq2seq.parts.rnns.gnmt import GNMTAttentionMultiCell, \
                                                            gnmt_residual_fn
-from open_seq2seq.parts.rnns.utils import create_rnn_cell
+from open_seq2seq.parts.rnns.utils import single_cell
 from open_seq2seq.parts.rnns.attention_wrapper import BahdanauAttention, \
                                                  LuongAttention, \
                                                  AttentionWrapper
@@ -185,8 +185,7 @@ 

Source code for decoders.rnn_decoders

       'tgt_emb_size': int,
       'attention_layer_size': int,
       'attention_type': ['bahdanau', 'luong', 'gnmt', 'gnmt_v2'],
-      'decoder_cell_units': int,
-      'decoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'],
+      'core_cell': None,
       'decoder_layers': int,
       'decoder_use_skip_connections': bool,
       'batch_size': int,
@@ -195,6 +194,7 @@ 

Source code for decoders.rnn_decoders

 
[docs]  @staticmethod
  def get_optional_params():
    return dict(Decoder.get_optional_params(), **{
+        'core_cell_params': dict,
        'bahdanau_normalize': bool,
        'luong_scale': bool,
        'decoder_dp_input_keep_prob': float,
@@ -220,8 +220,8 @@

Source code for decoders.rnn_decoders

     * **END_SYMBOL** (int) --- END symbol id, must be the same as used in
       data layer.
     * **tgt_emb_size** (int) --- embedding size to use.
-    * **decoder_cell_units** (int) - number of units in RNN
-    * **decoder_cell_type** (string) - RNN type: lstm, gru, glstm, etc.
+    * **core_cell_params** (dict) - constructor parameters for the RNN cell class.
+    * **core_cell** - RNN cell class to use.
     * **decoder_dp_input_keep_prob** (float) - dropout input keep probability.
     * **decoder_dp_output_keep_prob** (float) - dropout output keep probability.
     * **decoder_use_skip_connections** (bool) - use residual connections or not.
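The old `decoder_cell_units`/`decoder_cell_type` pair is replaced by a cell class plus its constructor arguments. Assuming TensorFlow 1.x (as used throughout these modules), a hypothetical decoder config fragment might look like this; all values are illustrative::

    import tensorflow as tf

    decoder_params = {
        'core_cell': tf.nn.rnn_cell.LSTMCell,     # the RNN cell class itself
        'core_cell_params': {'num_units': 512},   # kwargs for its constructor
        'decoder_layers': 2,
        'decoder_use_skip_connections': False,
        'attention_type': 'gnmt_v2',
    }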
@@ -339,8 +339,8 @@ 

Source code for decoders.rnn_decoders

       self._tgt_vocab_size, use_bias=False,
     )
 
-    cell_params = copy.deepcopy(self.params)
-    cell_params["num_units"] = self.params['decoder_cell_units']
+    #cell_params = copy.deepcopy(self.params)
+    #cell_params["num_units"] = self.params['decoder_cell_units']
 
     if self._mode == "train":
       dp_input_keep_prob = self.params['decoder_dp_input_keep_prob']
@@ -349,22 +349,17 @@ 

Source code for decoders.rnn_decoders

       dp_input_keep_prob = 1.0
       dp_output_keep_prob = 1.0
 
-    if self.params['attention_type'].startswith('gnmt'):
-      residual_connections = False
-      wrap_to_multi_rnn = False
-    else:
-      residual_connections = self.params['decoder_use_skip_connections']
-      wrap_to_multi_rnn = True
-
-    self._decoder_cells = create_rnn_cell(
-      cell_type=self.params['decoder_cell_type'],
-      cell_params=cell_params,
-      num_layers=self.params['decoder_layers'],
-      dp_input_keep_prob=dp_input_keep_prob,
-      dp_output_keep_prob=dp_output_keep_prob,
-      residual_connections=residual_connections,
-      wrap_to_multi_rnn=wrap_to_multi_rnn,
-    )
+    residual_connections = self.params['decoder_use_skip_connections']
+
+    # list of cells
+    self._decoder_cells = [
+      single_cell(cell_class=self.params['core_cell'],
+                  cell_params=self.params.get('core_cell_params', {}),
+                  dp_input_keep_prob=dp_input_keep_prob,
+                  dp_output_keep_prob=dp_output_keep_prob,
+                  # residual connections are added a little differently for GNMT
+                  residual_connections=False if self.params['attention_type'].startswith('gnmt') else residual_connections,
+                  ) for _ in range(self.params['decoder_layers'])]
 
     attention_mechanism = self._build_attention(
       encoder_outputs,
@@ -372,7 +367,6 @@ 

Source code for decoders.rnn_decoders

     )
     if self.params['attention_type'].startswith('gnmt'):
       attention_cell = self._decoder_cells.pop(0)
-      # attention_cell = tf.contrib.seq2seq.AttentionWrapper(
       attention_cell = AttentionWrapper(
         attention_cell,
         attention_mechanism=attention_mechanism,
@@ -380,12 +374,12 @@ 

Source code for decoders.rnn_decoders

         output_attention=False,
         name="gnmt_attention")
       attentive_decoder_cell = GNMTAttentionMultiCell(
-        attention_cell, self._add_residual_wrapper(self._decoder_cells),
+        attention_cell, self._add_residual_wrapper(self._decoder_cells) if residual_connections else self._decoder_cells,
         use_new_attention=(self.params['attention_type'] == 'gnmt_v2'))
     else:
       # attentive_decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
       attentive_decoder_cell = AttentionWrapper(
-        cell=self._decoder_cells,
+        cell=tf.contrib.rnn.MultiRNNCell(self._decoder_cells),
         attention_mechanism=attention_mechanism,
       )
     if self._mode == "train":
@@ -438,8 +432,9 @@ 

Source code for decoders.rnn_decoders

       output_time_major=time_major,
     )
 
-    return {'logits': final_outputs.rnn_output,
-            'samples': [tf.argmax(final_outputs.rnn_output, axis=-1)],
+    return {'logits': final_outputs.rnn_output if not time_major else
+            tf.transpose(final_outputs.rnn_output, perm=[1, 0, 2]),
+            'outputs': [tf.argmax(final_outputs.rnn_output, axis=-1)],
             'final_state': final_state,
             'final_sequence_lengths': final_sequence_lengths}
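The rewritten `_decode` now builds a plain Python list of per-layer cells and only wraps it in `MultiRNNCell` on the non-GNMT path (the GNMT path pops the first cell for the attention wrapper instead). A hedged sketch of that construction, with a simplified stand-in for `open_seq2seq.parts.rnns.utils.single_cell`, assuming TensorFlow 1.x::

    import tensorflow as tf

    def single_cell(cell_class, cell_params, dp_input_keep_prob=1.0,
                    dp_output_keep_prob=1.0, residual_connections=False):
      # simplified stand-in: build one cell and optionally wrap it
      cell = cell_class(**cell_params)
      if dp_input_keep_prob != 1.0 or dp_output_keep_prob != 1.0:
        cell = tf.nn.rnn_cell.DropoutWrapper(
            cell,
            input_keep_prob=dp_input_keep_prob,
            output_keep_prob=dp_output_keep_prob,
        )
      if residual_connections:
        cell = tf.nn.rnn_cell.ResidualWrapper(cell)
      return cell

    cells = [
        single_cell(tf.nn.rnn_cell.GRUCell, {'num_units': 256},
                    dp_output_keep_prob=0.8)
        for _ in range(3)
    ]
    stacked = tf.nn.rnn_cell.MultiRNNCell(cells)   # non-GNMT path wraps here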
@@ -526,8 +521,8 @@

Source code for decoders.rnn_decoders

       self._tgt_vocab_size, use_bias=False,
     )
 
-    cell_params = copy.deepcopy(self.params)
-    cell_params["num_units"] = self.params['decoder_cell_units']
+    #cell_params = copy.deepcopy(self.params)
+    #cell_params["num_units"] = self.params['decoder_cell_units']
 
     if self._mode == "train":
       dp_input_keep_prob = self.params['decoder_dp_input_keep_prob']
@@ -536,22 +531,34 @@ 

Source code for decoders.rnn_decoders

       dp_input_keep_prob = 1.0
       dp_output_keep_prob = 1.0
 
-    if self.params['attention_type'].startswith('gnmt'):
-      residual_connections = False
-      wrap_to_multi_rnn = False
-    else:
-      residual_connections = self.params['decoder_use_skip_connections']
-      wrap_to_multi_rnn = True
-
-    self._decoder_cells = create_rnn_cell(
-      cell_type=self.params['decoder_cell_type'],
-      cell_params=cell_params,
-      num_layers=self.params['decoder_layers'],
-      dp_input_keep_prob=dp_input_keep_prob,
-      dp_output_keep_prob=dp_output_keep_prob,
-      residual_connections=residual_connections,
-      wrap_to_multi_rnn=wrap_to_multi_rnn,
-    )
+    #if self.params['attention_type'].startswith('gnmt'):
+    #  residual_connections = False
+    #  wrap_to_multi_rnn = False
+    #else:
+    #  residual_connections = self.params['decoder_use_skip_connections']
+    #  wrap_to_multi_rnn = True
+
+    #self._decoder_cells = create_rnn_cell(
+    #  cell_type=self.params['decoder_cell_type'],
+    #  cell_params=cell_params,
+    #  num_layers=self.params['decoder_layers'],
+    #  dp_input_keep_prob=dp_input_keep_prob,
+    #  dp_output_keep_prob=dp_output_keep_prob,
+    #  residual_connections=residual_connections,
+    #  wrap_to_multi_rnn=wrap_to_multi_rnn,
+    #)
+    residual_connections = self.params['decoder_use_skip_connections']
+    # list of cells
+    self._decoder_cells = [
+      single_cell(cell_class=self.params['core_cell'],
+                  cell_params=self.params.get('core_cell_params', {}),
+                  dp_input_keep_prob=dp_input_keep_prob,
+                  dp_output_keep_prob=dp_output_keep_prob,
+                  # residual connections are added a little differently for GNMT
+                  residual_connections=False if self.params[
+                    'attention_type'].startswith(
+                    'gnmt') else residual_connections,
+                  ) for _ in range(self.params['decoder_layers'])]
 
     tiled_enc_outputs = tf.contrib.seq2seq.tile_batch(
       encoder_outputs,
@@ -575,18 +582,18 @@ 

Source code for decoders.rnn_decoders

         output_attention=False,
         name="gnmt_attention")
       attentive_decoder_cell = GNMTAttentionMultiCell(
-        attention_cell, self._add_residual_wrapper(self._decoder_cells),
+        attention_cell, self._add_residual_wrapper(self._decoder_cells) if residual_connections else self._decoder_cells,
         use_new_attention=(self.params['attention_type'] == 'gnmt_v2'))
-    else:
+    else: # non-GNMT
       attentive_decoder_cell = AttentionWrapper(
-        cell=self._decoder_cells,
+        cell=tf.contrib.rnn.MultiRNNCell(self._decoder_cells),
         attention_mechanism=attention_mechanism,
       )
     batch_size_tensor = tf.constant(self._batch_size)
     embedding_fn = lambda ids: tf.cast(
       tf.nn.embedding_lookup(self._dec_emb_w, ids),
       dtype=self.params['dtype'])
-    #decoder = tf.contrib.seq2seq.BeamSearchDecoder(
+    # decoder = tf.contrib.seq2seq.BeamSearchDecoder(
     decoder = BeamSearchDecoder(
       cell=attentive_decoder_cell,
       embedding=embedding_fn,
@@ -611,8 +618,9 @@ 

Source code for decoders.rnn_decoders

       output_time_major=time_major,
     )
 
-    return {'logits': final_outputs.predicted_ids[:, :, 0],
-            'samples': [final_outputs.predicted_ids[:, :, 0]],
+    return {'logits': final_outputs.predicted_ids[:, :, 0] if not time_major else
+            tf.transpose(final_outputs.predicted_ids[:, :, 0], perm=[1, 0, 2]),
+            'outputs': [final_outputs.predicted_ids[:, :, 0]],
             'final_state': final_state,
             'final_sequence_lengths': final_sequence_lengths}
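`BeamSearchDecoder` returns `predicted_ids` shaped [batch_size, time, beam_width], and the hunk above keeps only the top-scoring beam via the `[:, :, 0]` slice (beams are assumed to be sorted best-first). A tiny numpy illustration of that selection::

    import numpy as np

    predicted_ids = np.random.randint(0, 30, size=(2, 6, 4))  # [batch, time, beam]
    best_beam = predicted_ids[:, :, 0]                        # [batch, time]
    print(best_beam.shape)                                    # (2, 6)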
diff --git a/docs/html/_modules/encoders/cnn_encoder.html b/docs/html/_modules/encoders/cnn_encoder.html
new file mode 100644
index 000000000..fa53218d1
--- /dev/null
+++ b/docs/html/_modules/encoders/cnn_encoder.html
@@ -0,0 +1,405 @@
+ encoders.cnn_encoder — OpenSeq2Seq 0.2 documentation

Source code for encoders.cnn_encoder

+# Copyright (c) 2018 NVIDIA Corporation
+"""
+This module contains classes and functions to build "general" convolutional
+neural networks from the description of arbitrary "layers".
+"""
+from __future__ import absolute_import, division, print_function
+from __future__ import unicode_literals
+from six.moves import range
+
+import tensorflow as tf
+import copy
+
+try:
+    from inspect import signature
+except ImportError:
+    from funcsigs import signature
+
+from .encoder import Encoder
+from open_seq2seq.utils.utils import deco_print
+
+
+
[docs]def build_layer(inputs, layer, layer_params, data_format, + regularizer, training, verbose=True): + """This function builds a layer from the layer function and it's parameters. + + It will automatically add regularizer parameter to the layer_params if the + layer supports regularization. To check this, it will look for the + "regularizer", "kernel_regularizer" and "gamma_regularizer" names in this + order in the ``layer`` call signature. If one of this parameters is supported + it will pass regularizer object as a value for that parameter. Based on the + same "checking signature" technique "data_format" and "training" parameters + will try to be added. + + Args: + inputs: input Tensor that will be passed to the layer. Note that layer has + to accept input as the first parameter. + layer: layer function or class with ``__call__`` method defined. + layer_params (dict): parameters passed to the ``layer``. + data_format (string): data format ("channels_first" or "channels_last") + that will be tried to be passed as an additional argument. + regularizer: regularizer instance that will be tried to be passed as an + additional argument. + training (bool): whether layer is built in training mode. Will be tried to + be passed as an additional argument. + verbose (bool): whether to print information about built layers. + + Returns: + Tensor with layer output. + """ + layer_params_cp = copy.deepcopy(layer_params) + for reg_name in ['regularizer', 'kernel_regularizer', 'gamma_regularizer']: + if reg_name not in layer_params_cp and \ + reg_name in signature(layer).parameters: + layer_params_cp.update({reg_name: regularizer}) + + if 'data_format' not in layer_params_cp and \ + 'data_format' in signature(layer).parameters: + layer_params_cp.update({'data_format': data_format}) + + if 'training' not in layer_params_cp and \ + 'training' in signature(layer).parameters: + layer_params_cp.update({'training': training}) + + outputs = layer(inputs, **layer_params_cp) + + if verbose: + if hasattr(layer, '_tf_api_names'): + layer_name = layer._tf_api_names[0] + else: + layer_name = layer + deco_print("Building layer: {}(inputs, {})".format( + layer_name, + ", ".join("{}={}".format(key, value) + for key, value in layer_params_cp.items()) + )) + return outputs
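`build_layer` above works by inspecting the callable's signature and injecting `regularizer`, `data_format` and `training` only when the layer actually accepts them. A standalone sketch of that introspection trick with purely hypothetical layer functions::

    import copy
    try:
      from inspect import signature      # Python 3
    except ImportError:
      from funcsigs import signature     # Python 2 backport

    def call_with_known_kwargs(layer, inputs, layer_params, **extra):
      # pass each extra kwarg only if `layer` has a parameter of that name
      params = copy.deepcopy(layer_params)
      accepted = signature(layer).parameters
      for key, value in extra.items():
        if key not in params and key in accepted:
          params[key] = value
      return layer(inputs, **params)

    def dropout_like(x, rate=0.5, training=False):   # accepts `training`
      return [v * (1.0 - rate) for v in x] if training else x

    def scale(x, factor=2.0):                        # does not accept `training`
      return [v * factor for v in x]

    print(call_with_known_kwargs(dropout_like, [1.0, 2.0], {}, training=True))  # [0.5, 1.0]
    print(call_with_known_kwargs(scale, [1.0, 2.0], {}, training=True))         # [2.0, 4.0]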
+ + +
[docs]class CNNEncoder(Encoder): + """General CNN encoder that can be used to construct various different models. + """ +
[docs] @staticmethod + def get_required_params(): + return dict(Encoder.get_required_params(), **{ + 'cnn_layers': list, + })
+ +
[docs] @staticmethod + def get_optional_params(): + return dict(Encoder.get_optional_params(), **{ + 'data_format': ['channels_first', 'channels_last'], + 'fc_layers': list, + })
+ +
[docs] def __init__(self, params, model, name="cnn_encoder", mode='train'): + """CNN Encoder constructor. + + See parent class for arguments description. + + Config parameters: + + * **cnn_layers** (list) --- list with the description of "convolutional" + layers. For example:: + "conv_layers": [ + (tf.layers.conv2d, { + 'filters': 64, 'kernel_size': (11, 11), + 'strides': (4, 4), 'padding': 'VALID', + 'activation': tf.nn.relu, + }), + (tf.layers.max_pooling2d, { + 'pool_size': (3, 3), 'strides': (2, 2), + }), + (tf.layers.conv2d, { + 'filters': 192, 'kernel_size': (5, 5), + 'strides': (1, 1), 'padding': 'SAME', + }), + (tf.layers.batch_normalization, {'momentum': 0.9, 'epsilon': 0.0001}), + (tf.nn.relu, {}), + ] + Note that you don't need to provide "regularizer", "training" and + "data_format" parameters since they will be automatically added. + + * **cnn_layers** (list) --- list with the description of "fully-connected" + layers. The only different from convolutional layers is that the input + will be automatically reshaped to 2D (batch size x num features). + For example:: + 'fc_layers': [ + (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), + (tf.layers.dropout, {'rate': 0.5}), + (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), + (tf.layers.dropout, {'rate': 0.5}), + ], + Note that you don't need to provide "regularizer", "training" and + "data_format" parameters since they will be automatically added. + + * **data_format** (string) --- could be either "channels_first" or + "channels_last". Defaults to "channels_first". + """ + super(CNNEncoder, self).__init__(params, model, name, mode)
+ + def _encode(self, input_dict): + regularizer = self.params.get('regularizer', None) + data_format = self.params.get('data_format', 'channels_first') + + x = input_dict['source_tensors'][0] + if data_format == 'channels_first': + x = tf.transpose(x, [0, 3, 1, 2]) + + for layer, layer_params in self.params['cnn_layers']: + x = build_layer(x, layer, layer_params, data_format, + regularizer, self.mode == 'train') + + if data_format == 'channels_first': + x = tf.transpose(x, [0, 2, 3, 1]) + + fc_layers = self.params.get('fc_layers', []) + + # if fully connected layers exist, flattening the output and applying them + if fc_layers: + input_shape = x.get_shape().as_list() + num_inputs = input_shape[1] * input_shape[2] * input_shape[3] + x = tf.reshape(x, [-1, num_inputs]) + for layer, layer_params in fc_layers: + x = build_layer(x, layer, layer_params, data_format, regularizer, + self.mode == 'train') + else: + # if there are no fully connected layers, doing average pooling + x = tf.reduce_mean(x, [1, 2]) + + return {'outputs': x}
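After the convolutional stack (and the transpose back to channels_last), `_encode` either flattens everything into one feature axis when `fc_layers` are given, or falls back to global average pooling over the spatial dimensions. A small numpy illustration of the two resulting shapes (sizes are arbitrary)::

    import numpy as np

    x = np.random.randn(4, 7, 7, 32)        # [B, H, W, C] after the conv layers

    flat = x.reshape(x.shape[0], -1)        # with fc_layers: [B, H*W*C] = (4, 1568)
    pooled = x.mean(axis=(1, 2))            # without fc_layers: [B, C] = (4, 32)
    print(flat.shape, pooled.shape)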
+
diff --git a/docs/html/_modules/encoders/convs2s_encoder.html b/docs/html/_modules/encoders/convs2s_encoder.html
new file mode 100644
index 000000000..d870f4b10
--- /dev/null
+++ b/docs/html/_modules/encoders/convs2s_encoder.html
@@ -0,0 +1,456 @@
+ encoders.convs2s_encoder — OpenSeq2Seq 0.2 documentation

Source code for encoders.convs2s_encoder

+# Copyright (c) 2018 NVIDIA Corporation
+"""
+Conv-based encoder
+"""
+from __future__ import absolute_import, division, print_function
+from __future__ import unicode_literals
+
+import tensorflow as tf
+import math
+from .encoder import Encoder
+
+from open_seq2seq.parts.transformer import embedding_layer
+from open_seq2seq.parts.transformer.utils import get_padding_bias, get_padding
+from open_seq2seq.parts.convs2s import ffn_wn_layer, conv_wn_layer
+
+# Default value used if max_input_length is not given
+MAX_INPUT_LENGTH = 128
+
+
+
[docs]class ConvS2SEncoder(Encoder): + """ + Fully convolutional Encoder of ConvS2S + """ + +
[docs] @staticmethod + def get_required_params(): + return dict( + Encoder.get_required_params(), **{ + "encoder_layers": int, + "src_emb_size": int, + "src_vocab_size": int, + "pad_embeddings_2_eight": bool, + "conv_nchannels_kwidth": list, + "embedding_dropout_keep_prob": float, + "hidden_dropout_keep_prob": float, + })
+ +
[docs] @staticmethod + def get_optional_params(): + return dict( + Encoder.get_optional_params(), **{ + "att_layer_num": int, + 'max_input_length': int, + 'PAD_SYMBOL': int, + })
+ + def __init__(self, + params, + model, + name="convs2s_encoder_with_emb", + mode='train'): + super(ConvS2SEncoder, self).__init__(params, model, name=name, mode=mode) + + self._src_vocab_size = self.params['src_vocab_size'] + self._src_emb_size = self.params['src_emb_size'] + self.layers = [] + self._mode = mode + self._pad_sym = self.params.get('PAD_SYMBOL', 0) + self._pad2eight = params.get('pad_embeddings_2_eight', False) + + def _encode(self, input_dict): + inputs = input_dict['source_tensors'][0] + source_length = input_dict['source_tensors'][1] + + with tf.variable_scope("encode"): + # prepare encoder graph + if len(self.layers) == 0: + knum_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[0] + kwidth_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[1] + + with tf.variable_scope("embedding"): + self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights( + vocab_size=self._src_vocab_size, + hidden_size=self._src_emb_size, + pad_vocab_to_eight=self._pad2eight, + init_var=0.1, + embed_scale=False, + pad_sym=self._pad_sym, + mask_paddings=False) + + with tf.variable_scope("pos_embedding"): + self.position_embedding_layer = embedding_layer.EmbeddingSharedWeights( + vocab_size=self.params.get("max_input_length", MAX_INPUT_LENGTH), + hidden_size=self._src_emb_size, + pad_vocab_to_eight=self._pad2eight, + init_var=0.1, + embed_scale=False, + pad_sym=self._pad_sym, + mask_paddings=False) + + # linear projection before cnn layers + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + self._src_emb_size, + knum_list[0], + dropout=self.params["embedding_dropout_keep_prob"], + var_scope_name="linear_mapping_before_cnn_layers")) + + for i in range(self.params['encoder_layers']): + in_dim = knum_list[i] if i == 0 else knum_list[i - 1] + out_dim = knum_list[i] + + # linear projection is needed for residual connections if + # input and output of a cnn layer do not match + if in_dim != out_dim: + linear_proj = ffn_wn_layer.FeedFowardNetworkNormalized( + in_dim, + out_dim, + var_scope_name="linear_mapping_cnn_" + str(i + 1), + dropout=1.0) + else: + linear_proj = None + + conv_layer = conv_wn_layer.Conv1DNetworkNormalized( + in_dim, + out_dim, + kernel_width=kwidth_list[i], + mode=self.mode, + layer_id=i + 1, + hidden_dropout=self.params["hidden_dropout_keep_prob"], + conv_padding="SAME", + decode_padding=False) + + self.layers.append([linear_proj, conv_layer]) + + # linear projection after cnn layers + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + knum_list[self.params['encoder_layers'] - 1], + self._src_emb_size, + dropout=1.0, + var_scope_name="linear_mapping_after_cnn_layers")) + + encoder_inputs = self.embedding_softmax_layer(inputs) + inputs_attention_bias = get_padding_bias( + inputs, res_rank=3, pad_sym=self._pad_sym) + + with tf.name_scope("add_pos_encoding"): + pos_input = tf.range( + 0, + tf.shape(encoder_inputs)[1], + delta=1, + dtype=tf.int32, + name='range') + pos_encoding = self.position_embedding_layer(pos_input) + encoder_inputs = encoder_inputs + tf.cast( + x=pos_encoding, dtype=encoder_inputs.dtype) + + if self.mode == "train": + encoder_inputs = tf.nn.dropout( + encoder_inputs, self.params["embedding_dropout_keep_prob"]) + + # mask the paddings in the input given to cnn layers + inputs_padding = get_padding( + inputs, self._pad_sym, dtype=encoder_inputs.dtype) + padding_mask = tf.expand_dims(1 - inputs_padding, 2) + encoder_inputs *= padding_mask + + # disables padding masks in middle layers + # padding_mask = None 
+ outputs, outputs_b, final_state = self._call(encoder_inputs, padding_mask) + + return { + 'outputs': outputs, + 'outputs_b': outputs_b, + 'inputs_attention_bias_cs2s': inputs_attention_bias, + 'state': final_state, + 'src_lengths': source_length, # should it include paddings or not? + 'embedding_softmax_layer': self.embedding_softmax_layer, + # TODO: Should we share position embedding? + # 'position_embedding_layer': self.position_embedding_layer, + 'encoder_input': inputs + } + + def _call(self, encoder_inputs, padding_mask): + # Run inputs through the sublayers. + with tf.variable_scope("linear_layer_before_cnn_layers"): + outputs = self.layers[0](encoder_inputs) + + for i in range(1, len(self.layers) - 1): + linear_proj, conv_layer = self.layers[i] + + with tf.variable_scope("layer_%d" % i): + if padding_mask is not None: + outputs *= padding_mask + if linear_proj is not None: + res_inputs = linear_proj(outputs) + else: + res_inputs = outputs + outputs = conv_layer(outputs) + outputs = (outputs + res_inputs) * math.sqrt(0.5) + + with tf.variable_scope("linear_layer_after_cnn_layers"): + outputs = self.layers[-1](outputs) + + if padding_mask is not None: + outputs *= padding_mask + + # Gradients are scaled as the gradients from + # all decoder attention layers enters the encoder + scale = 1.0 / ( + 2.0 * self.params.get("att_layer_num", self.params["encoder_layers"])) + outputs = (1.0 - scale) * tf.stop_gradient(outputs) + scale * outputs + + outputs_b = (outputs + encoder_inputs) * math.sqrt(0.5) + + if padding_mask is not None: + outputs_b *= padding_mask + + # Average of the encoder outputs is calculated as the final state of the encoder + # it can be used for decoders which just accept the final state + final_state = tf.reduce_mean(outputs_b, 1) + return outputs, outputs_b, final_state + + @property + def src_vocab_size(self): + return self._src_vocab_size + + @property + def src_emb_size(self): + return self._src_emb_size
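The `(1.0 - scale) * tf.stop_gradient(outputs) + scale * outputs` expression above leaves the forward value unchanged while multiplying the gradient that flows back into the encoder by `scale`. A minimal TensorFlow 1.x check of that identity (the `scale` value is illustrative)::

    import tensorflow as tf

    x = tf.constant([1.0, 2.0, 3.0])
    scale = 0.25   # e.g. 1 / (2 * number_of_attention_layers)

    y = (1.0 - scale) * tf.stop_gradient(x) + scale * x
    grad = tf.gradients(tf.reduce_sum(y), x)[0]

    with tf.Session() as sess:
      print(sess.run(y))      # [1. 2. 3.]       -- forward pass unchanged
      print(sess.run(grad))   # [0.25 0.25 0.25] -- gradient scaled down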
diff --git a/docs/html/_modules/encoders/ds2_encoder.html b/docs/html/_modules/encoders/ds2_encoder.html
index b6bbbdd1c..e0d79ff53 100644
--- a/docs/html/_modules/encoders/ds2_encoder.html
+++ b/docs/html/_modules/encoders/ds2_encoder.html
@@ -162,41 +162,14 @@

Source code for encoders.ds2_encoder

 from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
 
 from .encoder import Encoder
-
-
-
[docs]def conv2d_bn_actv(name, inputs, filters, kernel_size, activation_fn, strides, - padding, regularizer, training, data_format, bn_momentum, - bn_epsilon): - """Helper function that applies convolution, batch norm and activation.""" - conv = tf.layers.conv2d( - name="{}".format(name), - inputs=inputs, - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - kernel_regularizer=regularizer, - use_bias=False, - data_format=data_format, - ) - bn = tf.layers.batch_normalization( - name="{}/bn".format(name), - inputs=conv, - gamma_regularizer=regularizer, - training=training, - axis=-1 if data_format == 'channels_last' else 1, - momentum=bn_momentum, - epsilon=bn_epsilon, - ) - output = activation_fn(bn) - return output
+from open_seq2seq.parts.cnns.conv_blocks import conv_bn_actv
[docs]def rnn_cell(rnn_cell_dim, layer_type, dropout_keep_prob=1.0):
  """Helper function that creates RNN cell."""
  if layer_type == "layernorm_lstm":
    cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
-      num_units=rnn_cell_dim, dropout_keep_prob=dropout_keep_prob)
+        num_units=rnn_cell_dim, dropout_keep_prob=dropout_keep_prob)
  else:
    if layer_type == "lstm":
      cell = tf.nn.rnn_cell.BasicLSTMCell(rnn_cell_dim)
@@ -210,7 +183,7 @@

Source code for encoders.ds2_encoder

       raise ValueError("Error: not supported rnn type:{}".format(layer_type))
 
     cell = tf.nn.rnn_cell.DropoutWrapper(
-      cell, output_keep_prob=dropout_keep_prob)
+        cell, output_keep_prob=dropout_keep_prob)
   return cell
@@ -230,28 +203,28 @@

Source code for encoders.ds2_encoder

     x = tf.cast(x, tf.float32)
     cast_back = True
   filters = tf.get_variable(
-    name+'/w',
-    shape=[width, 1, channels, 1],
-    regularizer=regularizer,
-    dtype=tf.float32,
+      name + '/w',
+      shape=[width, 1, channels, 1],
+      regularizer=regularizer,
+      dtype=tf.float32,
   )
   strides = [1, 1, 1, 1]
   y = tf.nn.depthwise_conv2d(
-    name=name + '/conv',
-    input=x,
-    filter=filters,
-    strides=strides,
-    padding='SAME',
-    data_format='NHWC' if data_format == 'channels_last' else 'NCHW',
+      name=name + '/conv',
+      input=x,
+      filter=filters,
+      strides=strides,
+      padding='SAME',
+      data_format='NHWC' if data_format == 'channels_last' else 'NCHW',
   )
   bn = tf.layers.batch_normalization(
-    name="{}/bn".format(name),
-    inputs=y,
-    gamma_regularizer=regularizer,
-    training=training,
-    axis=-1 if data_format == 'channels_last' else 1,
-    momentum=bn_momentum,
-    epsilon=bn_epsilon,
+      name="{}/bn".format(name),
+      inputs=y,
+      gamma_regularizer=regularizer,
+      training=training,
+      axis=-1 if data_format == 'channels_last' else 1,
+      momentum=bn_momentum,
+      epsilon=bn_epsilon,
   )
   output = activation_fn(bn)
   if data_format == 'channels_first':
@@ -267,25 +240,25 @@ 

Source code for encoders.ds2_encoder

 
[docs] @staticmethod def get_required_params(): return dict(Encoder.get_required_params(), **{ - 'dropout_keep_prob': float, - 'conv_layers': list, - 'activation_fn': None, # any valid callable - 'num_rnn_layers': int, - 'row_conv': bool, - 'n_hidden': int, - 'use_cudnn_rnn': bool, - 'rnn_cell_dim': int, - 'rnn_type': ['layernorm_lstm', 'lstm', 'gru', 'cudnn_gru', 'cudnn_lstm'], - 'rnn_unidirectional': bool, + 'dropout_keep_prob': float, + 'conv_layers': list, + 'activation_fn': None, # any valid callable + 'num_rnn_layers': int, + 'row_conv': bool, + 'n_hidden': int, + 'use_cudnn_rnn': bool, + 'rnn_cell_dim': int, + 'rnn_type': ['layernorm_lstm', 'lstm', 'gru', 'cudnn_gru', 'cudnn_lstm'], + 'rnn_unidirectional': bool, })
[docs] @staticmethod def get_optional_params(): return dict(Encoder.get_optional_params(), **{ - 'row_conv_width': int, - 'data_format': ['channels_first', 'channels_last'], - 'bn_momentum': float, - 'bn_epsilon': float, + 'row_conv_width': int, + 'data_format': ['channels_first', 'channels_last'], + 'bn_momentum': float, + 'bn_epsilon': float, })
[docs] def __init__(self, params, model, name="ds2_encoder", mode='train'): @@ -369,8 +342,8 @@

Source code for encoders.ds2_encoder

       top_layer = input_layer
     else:
       top_layer = tf.transpose(input_layer, [0, 3, 1, 2])
-    
-    # ----- Convolutional layers -----------------------------------------------
+
+    # ----- Convolutional layers ---------------------------------------------
     conv_layers = self.params['conv_layers']
 
     for idx_conv in range(len(conv_layers)):
@@ -384,19 +357,20 @@ 

Source code for encoders.ds2_encoder

       else:
         src_length = (src_length + strides[0] - 1) // strides[0]
 
-      top_layer = conv2d_bn_actv(
-        name="conv{}".format(idx_conv + 1),
-        inputs=top_layer,
-        filters=ch_out,
-        kernel_size=kernel_size,
-        activation_fn=self.params['activation_fn'],
-        strides=strides,
-        padding=padding,
-        regularizer=regularizer,
-        training=training,
-        data_format=data_format,
-        bn_momentum=bn_momentum,
-        bn_epsilon=bn_epsilon,
+      top_layer = conv_bn_actv(
+          type="conv2d",
+          name="conv{}".format(idx_conv + 1),
+          inputs=top_layer,
+          filters=ch_out,
+          kernel_size=kernel_size,
+          activation_fn=self.params['activation_fn'],
+          strides=strides,
+          padding=padding,
+          regularizer=regularizer,
+          training=training,
+          data_format=data_format,
+          bn_momentum=bn_momentum,
+          bn_epsilon=bn_epsilon,
       )
     if data_format == 'channels_first':
       top_layer = tf.transpose(top_layer, [0, 2, 3, 1])
@@ -422,55 +396,56 @@ 

Source code for encoders.ds2_encoder

 
         if rnn_type == "cudnn_gru" or rnn_type == "gru":
           rnn_block = tf.contrib.cudnn_rnn.CudnnGRU(
-            num_layers=num_rnn_layers,
-            num_units=rnn_cell_dim,
-            direction=direction,
-            dropout=1.0 - dropout_keep_prob,
-            dtype=rnn_input.dtype,
-            name="cudnn_gru",
+              num_layers=num_rnn_layers,
+              num_units=rnn_cell_dim,
+              direction=direction,
+              dropout=1.0 - dropout_keep_prob,
+              dtype=rnn_input.dtype,
+              name="cudnn_gru",
           )
         elif rnn_type == "cudnn_lstm" or rnn_type == "lstm":
           rnn_block = tf.contrib.cudnn_rnn.CudnnLSTM(
-            num_layers=num_rnn_layers,
-            num_units=rnn_cell_dim,
-            direction=direction,
-            dropout=1.0 - dropout_keep_prob,
-            dtype=rnn_input.dtype,
-            name="cudnn_lstm",
+              num_layers=num_rnn_layers,
+              num_units=rnn_cell_dim,
+              direction=direction,
+              dropout=1.0 - dropout_keep_prob,
+              dtype=rnn_input.dtype,
+              name="cudnn_lstm",
           )
         else:
           raise ValueError(
-            "{} is not a valid rnn_type for cudnn_rnn layers".format(rnn_type)
+              "{} is not a valid rnn_type for cudnn_rnn layers".format(
+                  rnn_type)
           )
         top_layer, state = rnn_block(rnn_input)
         top_layer = tf.transpose(top_layer, [1, 0, 2])
       else:
         rnn_input = top_layer
         multirnn_cell_fw = tf.nn.rnn_cell.MultiRNNCell(
-          [rnn_cell(rnn_cell_dim=rnn_cell_dim, layer_type=rnn_type,
-                    dropout_keep_prob=dropout_keep_prob)
-           for _ in range(num_rnn_layers)]
+            [rnn_cell(rnn_cell_dim=rnn_cell_dim, layer_type=rnn_type,
+                      dropout_keep_prob=dropout_keep_prob)
+             for _ in range(num_rnn_layers)]
         )
         if self.params['rnn_unidirectional']:
           top_layer, state = tf.nn.dynamic_rnn(
-            cell=multirnn_cell_fw,
-            inputs=rnn_input,
-            sequence_length=src_length,
-            dtype=rnn_input.dtype,
-            time_major=False,
+              cell=multirnn_cell_fw,
+              inputs=rnn_input,
+              sequence_length=src_length,
+              dtype=rnn_input.dtype,
+              time_major=False,
           )
         else:
           multirnn_cell_bw = tf.nn.rnn_cell.MultiRNNCell(
-            [rnn_cell(rnn_cell_dim=rnn_cell_dim, layer_type=rnn_type,
-                      dropout_keep_prob=dropout_keep_prob)
-             for _ in range(num_rnn_layers)]
+              [rnn_cell(rnn_cell_dim=rnn_cell_dim, layer_type=rnn_type,
+                        dropout_keep_prob=dropout_keep_prob)
+               for _ in range(num_rnn_layers)]
           )
           top_layer, state = tf.nn.bidirectional_dynamic_rnn(
-            cell_fw=multirnn_cell_fw, cell_bw=multirnn_cell_bw,
-            inputs=rnn_input,
-            sequence_length=src_length,
-            dtype=rnn_input.dtype,
-            time_major=False
+              cell_fw=multirnn_cell_fw, cell_bw=multirnn_cell_bw,
+              inputs=rnn_input,
+              sequence_length=src_length,
+              dtype=rnn_input.dtype,
+              time_major=False
           )
           # concat 2 tensors [B, T, n_cell_dim] --> [B, T, 2*n_cell_dim]
           top_layer = tf.concat(top_layer, 2)
@@ -479,43 +454,43 @@ 

Source code for encoders.ds2_encoder

     if self.params['row_conv']:
       channels = top_layer.get_shape().as_list()[-1]
       top_layer = row_conv(
-        name="row_conv",
-        input_layer=top_layer,
-        batch=batch_size,
-        channels=channels,
-        activation_fn=self.params['activation_fn'],
-        width=self.params['row_conv_width'],
-        regularizer=regularizer,
-        training=training,
-        data_format=data_format,
-        bn_momentum=bn_momentum,
-        bn_epsilon=bn_epsilon,
+          name="row_conv",
+          input_layer=top_layer,
+          batch=batch_size,
+          channels=channels,
+          activation_fn=self.params['activation_fn'],
+          width=self.params['row_conv_width'],
+          regularizer=regularizer,
+          training=training,
+          data_format=data_format,
+          bn_momentum=bn_momentum,
+          bn_epsilon=bn_epsilon,
       )
 
     # Reshape [B, T, C] --> [B*T, C]
     c = top_layer.get_shape().as_list()[-1]
     top_layer = tf.reshape(top_layer, [-1, c])
 
-    # --- hidden layer with clipped ReLU activation and dropout-----------------
+    # --- hidden layer with clipped ReLU activation and dropout---------------
     top_layer = tf.layers.dense(
-      inputs=top_layer,
-      units=self.params['n_hidden'],
-      kernel_regularizer=regularizer,
-      activation=self.params['activation_fn'],
-      name='fully_connected',
+        inputs=top_layer,
+        units=self.params['n_hidden'],
+        kernel_regularizer=regularizer,
+        activation=self.params['activation_fn'],
+        name='fully_connected',
     )
     outputs = tf.nn.dropout(x=top_layer, keep_prob=dropout_keep_prob)
 
     # reshape from  [B*T,A] --> [B, T, A].
     # Output shape: [batch_size, n_steps, n_hidden]
     outputs = tf.reshape(
-      outputs,
-      [batch_size, -1, self.params['n_hidden']],
+        outputs,
+        [batch_size, -1, self.params['n_hidden']],
     )
 
     return {
-      'outputs': outputs,
-      'src_length': src_length,
+        'outputs': outputs,
+        'src_length': src_length,
     }
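The fully connected layer in this encoder is applied by folding time into the batch dimension and unfolding it again afterwards, as the reshapes above show. A small numpy sketch of the [B, T, C] -> [B*T, C] -> dense -> [B, T, n_hidden] round trip (sizes are arbitrary)::

    import numpy as np

    batch_size, time_steps, channels, n_hidden = 2, 5, 8, 16
    top_layer = np.random.randn(batch_size, time_steps, channels)
    weights = np.random.randn(channels, n_hidden)

    flat = top_layer.reshape(-1, channels)              # [B*T, C]
    dense = flat.dot(weights)                           # [B*T, n_hidden]
    outputs = dense.reshape(batch_size, -1, n_hidden)   # [B, T, n_hidden]
    print(outputs.shape)                                # (2, 5, 16)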
diff --git a/docs/html/_modules/encoders/encoder.html b/docs/html/_modules/encoders/encoder.html
index 85a680c45..4172b435e 100644
--- a/docs/html/_modules/encoders/encoder.html
+++ b/docs/html/_modules/encoders/encoder.html
@@ -242,24 +242,9 @@

Source code for encoders.encoder

       else:
         self._params['dtype'] = tf.float32
 
-    if 'regularizer' not in self._params:
-      if self._model and 'regularizer' in self._model.params:
-        self._params['regularizer'] = self._model.params['regularizer']
-        self._params['regularizer_params'] = self._model.params['regularizer_params']
-
-    if 'regularizer' in self._params:
-      init_dict = self._params.get('regularizer_params', {})
-      self._params['regularizer'] = self._params['regularizer'](**init_dict)
-      if self._params['dtype'] == 'mixed':
-        self._params['regularizer'] = mp_regularizer_wrapper(
-          self._params['regularizer'],
-        )
-
-    if self._params['dtype'] == 'mixed':
-      self._params['dtype'] = tf.float16
-
     self._name = name
-    self._mode = mode
+ self._mode = mode + self._compiled = False
[docs] def encode(self, input_dict): """Wrapper around :meth:`self._encode() <_encode>` method. @@ -272,11 +257,35 @@

Source code for encoders.encoder

     Returns:
       see :meth:`self._encode() <_encode>` docs.
     """
+    if not self._compiled:
+      if 'regularizer' not in self._params:
+        if self._model and 'regularizer' in self._model.params:
+          self._params['regularizer'] = copy.deepcopy(
+            self._model.params['regularizer']
+          )
+          self._params['regularizer_params'] = copy.deepcopy(
+            self._model.params['regularizer_params']
+          )
+
+      if 'regularizer' in self._params:
+        init_dict = self._params.get('regularizer_params', {})
+        self._params['regularizer'] = self._params['regularizer'](**init_dict)
+        if self._params['dtype'] == 'mixed':
+          self._params['regularizer'] = mp_regularizer_wrapper(
+            self._params['regularizer'],
+          )
+
+      if self._params['dtype'] == 'mixed':
+        self._params['dtype'] = tf.float16
+
     if 'initializer' in self.params:
       init_dict = self.params.get('initializer_params', {})
       initializer = self.params['initializer'](**init_dict)
     else:
       initializer = None
+
+    self._compiled = True
+
     with tf.variable_scope(self._name, initializer=initializer,
                            dtype=self.params['dtype']):
       return self._encode(self._cast_types(input_dict))
diff --git a/docs/html/_modules/encoders/rnn_encoders.html b/docs/html/_modules/encoders/rnn_encoders.html
index 4e3bdea06..93a9f354b 100644
--- a/docs/html/_modules/encoders/rnn_encoders.html
+++ b/docs/html/_modules/encoders/rnn_encoders.html
@@ -160,12 +160,11 @@

Source code for encoders.rnn_encoders

 from __future__ import absolute_import, division, print_function
 from __future__ import unicode_literals
 
-import copy
 import tensorflow as tf
 
-from open_seq2seq.parts.rnns.utils import create_rnn_cell
+from open_seq2seq.parts.rnns.utils import single_cell
 from .encoder import Encoder
-
+from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
 
 
[docs]class UnidirectionalRNNEncoderWithEmbedding(Encoder): """ @@ -177,8 +176,8 @@

Source code for encoders.rnn_encoders

     return dict(Encoder.get_required_params(), **{
       'src_vocab_size': int,
       'src_emb_size': int,
-      'encoder_cell_units': int,
-      'encoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'],
+      'core_cell': None,
+      'core_cell_params': dict,
       'encoder_layers': int,
       'encoder_use_skip_connections': bool,
     })
@@ -242,10 +241,6 @@

Source code for encoders.rnn_encoders

     source_sequence = input_dict['source_tensors'][0]
     source_length = input_dict['source_tensors'][1]
 
-
-    cell_params = copy.deepcopy(self.params)
-    cell_params["num_units"] = self.params['encoder_cell_units']
-
     self._enc_emb_w = tf.get_variable(
       name="EncoderEmbeddingMatrix",
       shape=[self._src_vocab_size, self._src_emb_size],
@@ -259,14 +254,16 @@ 

Source code for encoders.rnn_encoders

       dp_input_keep_prob = 1.0
       dp_output_keep_prob = 1.0
 
-    self._encoder_cell_fw = create_rnn_cell(
-      cell_type=self.params['encoder_cell_type'],
-      cell_params=cell_params,
-      num_layers=self.params['encoder_layers'],
-      dp_input_keep_prob=dp_input_keep_prob,
-      dp_output_keep_prob=dp_output_keep_prob,
-      residual_connections=self.params['encoder_use_skip_connections'],
-    )
+    fwd_cells = [
+      single_cell(cell_class=self.params['core_cell'],
+                  cell_params=self.params.get('core_cell_params', {}),
+                  dp_input_keep_prob=dp_input_keep_prob,
+                  dp_output_keep_prob=dp_output_keep_prob,
+                  residual_connections=self.params[
+                    'encoder_use_skip_connections']
+                  ) for _ in range(self.params['encoder_layers'])]
+
+    self._encoder_cell_fw = tf.contrib.rnn.MultiRNNCell(fwd_cells)
 
     time_major = self.params.get("time_major", False)
     use_swap_memory = self.params.get("use_swap_memory", False)
@@ -312,10 +309,10 @@ 

Source code for encoders.rnn_encoders

     return dict(Encoder.get_required_params(), **{
       'src_vocab_size': int,
       'src_emb_size': int,
-      'encoder_cell_units': int,
-      'encoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'],
       'encoder_layers': int,
       'encoder_use_skip_connections': bool,
+      'core_cell': None,
+      'core_cell_params': dict,
     })
[docs] @staticmethod @@ -382,9 +379,6 @@

Source code for encoders.rnn_encoders

       dtype=tf.float32
     )
 
-    cell_params = copy.deepcopy(self.params)
-    cell_params["num_units"] = self.params['encoder_cell_units']
-
     if self._mode == "train":
       dp_input_keep_prob = self.params['encoder_dp_input_keep_prob']
       dp_output_keep_prob = self.params['encoder_dp_output_keep_prob']
@@ -392,25 +386,27 @@ 

Source code for encoders.rnn_encoders

       dp_input_keep_prob = 1.0
       dp_output_keep_prob = 1.0
 
+    fwd_cells = [
+      single_cell(cell_class=self.params['core_cell'],
+                  cell_params=self.params.get('core_cell_params', {}),
+                  dp_input_keep_prob=dp_input_keep_prob,
+                  dp_output_keep_prob=dp_output_keep_prob,
+                  residual_connections=self.params['encoder_use_skip_connections']
+                  ) for _ in range(self.params['encoder_layers'])]
+    bwd_cells = [
+      single_cell(cell_class=self.params['core_cell'],
+                  cell_params=self.params.get('core_cell_params', {}),
+                  dp_input_keep_prob=dp_input_keep_prob,
+                  dp_output_keep_prob=dp_output_keep_prob,
+                  residual_connections=self.params['encoder_use_skip_connections']
+                  ) for _ in range(self.params['encoder_layers'])]
+
+
     with tf.variable_scope("FW"):
-      self._encoder_cell_fw = create_rnn_cell(
-        cell_type=self.params['encoder_cell_type'],
-        cell_params=cell_params,
-        num_layers=self.params['encoder_layers'],
-        dp_input_keep_prob=dp_input_keep_prob,
-        dp_output_keep_prob=dp_output_keep_prob,
-        residual_connections=self.params['encoder_use_skip_connections']
-      )
+      self._encoder_cell_fw = tf.contrib.rnn.MultiRNNCell(fwd_cells)
 
     with tf.variable_scope("BW"):
-      self._encoder_cell_bw = create_rnn_cell(
-        cell_type=self.params['encoder_cell_type'],
-        cell_params=cell_params,
-        num_layers=self.params['encoder_layers'],
-        dp_input_keep_prob=dp_input_keep_prob,
-        dp_output_keep_prob=dp_output_keep_prob,
-        residual_connections=self.params['encoder_use_skip_connections']
-      )
+      self._encoder_cell_bw = tf.contrib.rnn.MultiRNNCell(bwd_cells)
 
     embedded_inputs = tf.cast(tf.nn.embedding_lookup(
       self.enc_emb_w,
@@ -456,8 +452,10 @@ 

Source code for encoders.rnn_encoders

     return dict(Encoder.get_required_params(), **{
       'src_vocab_size': int,
       'src_emb_size': int,
-      'encoder_cell_units': int,
-      'encoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'],
+      'core_cell': None,
+      'core_cell_params': dict,
+      #'encoder_cell_units': int,
+      #'encoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'],
       'encoder_layers': int,
       'encoder_use_skip_connections': bool,
     })
@@ -508,27 +506,24 @@

Source code for encoders.rnn_encoders

     if self.params['encoder_layers'] < 2:
       raise ValueError("GNMT encoder must have at least 2 layers")
 
-    cell_params = copy.deepcopy(self.params)
-    cell_params["num_units"] = self.params['encoder_cell_units']
+    #cell_params = copy.deepcopy(self.params)
+    #cell_params["num_units"] = self.params['encoder_cell_units']
 
     with tf.variable_scope("Level1FW"):
-      self._encoder_l1_cell_fw = create_rnn_cell(
-        cell_type=self.params['encoder_cell_type'],
-        cell_params=cell_params,
-        num_layers=1,
+      self._encoder_l1_cell_fw = single_cell(
+        cell_class=self.params['core_cell'],
+        cell_params=self.params.get('core_cell_params', {}),
         dp_input_keep_prob=1.0,
         dp_output_keep_prob=1.0,
-        residual_connections=False,
-      )
+        residual_connections=False)
+
     with tf.variable_scope("Level1BW"):
-      self._encoder_l1_cell_bw = create_rnn_cell(
-        cell_type=self.params['encoder_cell_type'],
-        cell_params=cell_params,
-        num_layers=1,
+      self._encoder_l1_cell_bw = single_cell(
+        cell_class=self.params['core_cell'],
+        cell_params=self.params.get('core_cell_params', {}),
         dp_input_keep_prob=1.0,
         dp_output_keep_prob=1.0,
-        residual_connections=False,
-      )
+        residual_connections=False)
 
     if self._mode == "train":
       dp_input_keep_prob = self.params['encoder_dp_input_keep_prob']
@@ -538,15 +533,13 @@ 

Source code for encoders.rnn_encoders

       dp_output_keep_prob = 1.0
 
     with tf.variable_scope("UniDirLevel"):
-      self._encoder_cells = create_rnn_cell(
-        cell_type=self.params['encoder_cell_type'],
-        cell_params=cell_params,
-        num_layers=self.params['encoder_layers'] - 1,
+      self._encoder_cells = [single_cell(
+        cell_class=self.params['core_cell'],
+        cell_params=self.params.get('core_cell_params', {}),
         dp_input_keep_prob=dp_input_keep_prob,
         dp_output_keep_prob=dp_output_keep_prob,
-        residual_connections=False,
-        wrap_to_multi_rnn=False,
-      )
+        residual_connections=False) for _ in range(self.params['encoder_layers'] - 1)]
+
       # add residual connections starting from the third layer
       for idx, cell in enumerate(self._encoder_cells):
         if idx > 0:
@@ -577,7 +570,7 @@ 

Source code for encoders.rnn_encoders

       inputs=encoder_l1_outputs,
       sequence_length=source_length,
       swap_memory=use_swap_memory,
-      time_major = time_major,
+      time_major=time_major,
       dtype=encoder_l1_outputs.dtype,
     )
 
@@ -597,6 +590,154 @@ 

Source code for encoders.rnn_encoders

   @property
   def enc_emb_w(self):
     return self._enc_emb_w
+ +
[docs]class GNMTLikeEncoderWithEmbedding_cuDNN(Encoder): + """ + Encoder similar to the one used in + GNMT model: https://arxiv.org/abs/1609.08144. + Must have at least 2 layers. Uses cuDNN RNN blocks for efficiency + """ + +
[docs] @staticmethod + def get_required_params(): + return dict(Encoder.get_required_params(), **{ + 'src_vocab_size': int, + 'src_emb_size': int, + 'encoder_cell_units': int, + 'encoder_cell_type': ['lstm', 'gru'], + 'encoder_layers': int, + #'core_cell': None, + #'core_cell_params': dict, + })
+ +
[docs] @staticmethod + def get_optional_params(): + return dict(Encoder.get_optional_params(), **{ + 'encoder_dp_output_keep_prob': float, + })
+ +
[docs] def __init__(self, params, model, + name="gnmt_encoder_with_emb_cudnn", mode='train'): + """ + Encodes data into representation + :param params: a Python dictionary. + Must define: + * src_inputs - a Tensor of shape [batch_size, time] or [time, batch_size] + (depending on time_major param) + * src_lengths - a Tensor of shape [batch_size] + :return: a Python dictionary with: + * encoder_outputs - a Tensor of shape + [batch_size, time, representation_dim] + or [time, batch_size, representation_dim] + * encoder_state - a Tensor of shape [batch_size, dim] + * src_lengths - (copy ref from input) a Tensor of shape [batch_size] + """ + super(GNMTLikeEncoderWithEmbedding_cuDNN, self).__init__( + params, model, name=name, mode=mode, + ) + + self._src_vocab_size = self.params['src_vocab_size'] + self._src_emb_size = self.params['src_emb_size']
+ + def _encode(self, input_dict): + source_sequence = input_dict['source_tensors'][0] + source_length = input_dict['source_tensors'][1] + self._enc_emb_w = tf.get_variable( + name="EncoderEmbeddingMatrix", + shape=[self._src_vocab_size, self._src_emb_size], + dtype=tf.float32 + ) + + if self.params['encoder_layers'] < 2: + raise ValueError("GNMT encoder must have at least 2 layers") + + if self._mode == "train": + dp_output_keep_prob = self.params['encoder_dp_output_keep_prob'] + else: + dp_output_keep_prob = 1.0 + + # source_sequence is of [batch, time] shape + embedded_inputs = tf.cast(tf.nn.embedding_lookup( + self.enc_emb_w, + tf.transpose(source_sequence), # cudnn wants [time, batch, ...] + ), self.params['dtype']) + + with tf.variable_scope("Bi_Directional_Layer"): + direction = cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION + if self.params['encoder_cell_type'] == "gru": + bidirectional_block = tf.contrib.cudnn_rnn.CudnnGRU( + num_layers=1, + num_units=self.params['encoder_cell_units'], + direction=direction, + dropout=0.0, + dtype=self.params['dtype'], + name="cudnn_gru_bidi", + ) + elif self.params['encoder_cell_type'] == "lstm": + bidirectional_block = tf.contrib.cudnn_rnn.CudnnLSTM( + num_layers=1, + num_units=self.params['encoder_cell_units'], + direction=direction, + dropout=0.0, + dtype=self.params['dtype'], + name="cudnn_lstm_bidi", + ) + else: + raise ValueError( + "{} is not a valid rnn_type for cudnn_rnn layers" + .format(self.params['encoder_cell_units']) + ) + bidi_output, bidi_state = bidirectional_block(embedded_inputs) + + with tf.variable_scope("Uni_Directional_Layer"): + direction = cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION + layer_input = bidi_output + for ind in range(self.params['encoder_layers'] - 1): + with tf.variable_scope("uni_layer_{}".format(ind)): + if self.params['encoder_cell_type'] == "gru": + unidirectional_block = tf.contrib.cudnn_rnn.CudnnGRU( + num_layers=1, + num_units=self.params['encoder_cell_units'], + direction=direction, + dropout=1.0 - dp_output_keep_prob, + dtype=self.params['dtype'], + name="cudnn_gru_uni_".format(ind), + ) + elif self.params['encoder_cell_type'] == "lstm": + unidirectional_block = tf.contrib.cudnn_rnn.CudnnLSTM( + num_layers=1, + num_units=self.params['encoder_cell_units'], + direction=direction, + dropout=1.0 - dp_output_keep_prob, + dtype=self.params['dtype'], + name="cudnn_lstm_uni_".format(ind), + ) + layer_output, encoder_state = unidirectional_block( + layer_input) + if ind > 0: # add residual connection + layer_output = layer_input + layer_output + layer_input = layer_output + + return {'outputs': tf.transpose(layer_input, perm=[1, 0, 2]), + 'state': None, + 'src_lengths': source_length, + 'encoder_input': source_sequence} + + + @property + def src_vocab_size(self): + return self._src_vocab_size + + @property + def src_emb_size(self): + return self._src_emb_size + + @property + def enc_emb_w(self): + return self._enc_emb_w
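Unlike the cell-class based encoders above, the new `GNMTLikeEncoderWithEmbedding_cuDNN` keeps string/int cell settings because the cuDNN blocks are configured by type name and unit count. A hypothetical encoder config fragment (all values illustrative)::

    encoder_params = {
        'src_vocab_size': 32000,
        'src_emb_size': 512,
        'encoder_cell_type': 'lstm',   # 'lstm' or 'gru'
        'encoder_cell_units': 512,
        'encoder_layers': 4,           # >= 2: one bidirectional + 3 unidirectional layers
        'encoder_dp_output_keep_prob': 0.8,
    }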
diff --git a/docs/html/_modules/encoders/w2l_encoder.html b/docs/html/_modules/encoders/w2l_encoder.html
new file mode 100644
index 000000000..c7e0c51ac
--- /dev/null
+++ b/docs/html/_modules/encoders/w2l_encoder.html
@@ -0,0 +1,390 @@
+ encoders.w2l_encoder — OpenSeq2Seq 0.2 documentation

Source code for encoders.w2l_encoder

+# Copyright (c) 2018 NVIDIA Corporation
+from __future__ import absolute_import, division, print_function
+from __future__ import unicode_literals
+from six.moves import range
+
+import tensorflow as tf
+
+from .encoder import Encoder
+from open_seq2seq.parts.cnns.conv_blocks import *
+
+
+
[docs]class Wave2LetterEncoder(Encoder): + """Wave2Letter like encoder. Fully convolutional model""" + +
[docs] @staticmethod + def get_required_params(): + return dict(Encoder.get_required_params(), **{ + 'dropout_keep_prob': float, + 'convnet_layers': list, + 'activation_fn': None, # any valid callable + })
+ +
[docs] @staticmethod + def get_optional_params(): + return dict(Encoder.get_optional_params(), **{ + 'data_format': ['channels_first', 'channels_last'], + 'normalization': [None, 'batch_norm'], + 'bn_momentum': float, + 'bn_epsilon': float, + })
+ +
[docs] def __init__(self, params, model, name="w2l_encoder", mode='train'): + """Wave2Letter like encoder constructor. + + See parent class for arguments description. + + Config parameters: + + * **dropout_keep_prop** (float) --- keep probability for dropout. + * **convnet_layers** (list) --- list with the description of convolutional + layers. For example:: + "convnet_layers": [ + { + "type": "conv1d", "repeat" : 5, + "kernel_size": [7], "stride": [1], + "num_channels": 250, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [11], "stride": [1], + "num_channels": 500, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 1, + "kernel_size": [32], "stride": [1], + "num_channels": 1000, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 1, + "kernel_size": [1], "stride": [1], + "num_channels": 1000, "padding": "SAME" + }, + ] + * **activation_fn** --- activation function to use. + * **data_format** (string) --- could be either "channels_first" or + "channels_last". Defaults to "channels_last". + * **normalization** --- normalization to use. Accepts [None, 'batch_norm']. + Use None if you don't want to use normalization. Defaults to 'batch_norm'. + * **bn_momentum** (float) --- momentum for batch norm. Defaults to 0.90. + * **bn_epsilon** (float) --- epsilon for batch norm. Defaults to 1e-3. + """ + super(Wave2LetterEncoder, self).__init__(params, model, name, mode)
+ +
[docs] def _encode(self, input_dict): + """Creates TensorFlow graph for Wav2Letter like encoder. + + Args: + input_dict (dict): input dictionary that has to contain + the following fields:: + input_dict = { + "source_tensors": [ + src_sequence (shape=[batch_size, sequence length, num features]), + src_length (shape=[batch_size]) + ] + } + + Returns: + dict: dictionary with the following tensors:: + + { + 'outputs': hidden state, shape=[batch_size, sequence length, n_hidden] + 'src_length': tensor, shape=[batch_size] + } + """ + + source_sequence, src_length = input_dict['source_tensors'] + + training = (self._mode == "train") + dropout_keep_prob = self.params['dropout_keep_prob'] if training else 1.0 + regularizer = self.params.get('regularizer', None) + data_format = self.params.get('data_format', 'channels_last') + normalization = self.params.get('normalization', 'batch_norm') + + normalization_params = {} + if normalization == None: + conv_block = conv_actv + elif normalization == "batch_norm": + conv_block = conv_bn_actv + normalization_params['bn_momentum'] = self.params.get( + 'bn_momentum', 0.90) + normalization_params['bn_epsilon'] = self.params.get('bn_epsilon', 1e-3) + + conv_inputs = source_sequence + batch_size = conv_inputs.get_shape().as_list()[0] + if data_format == 'channels_last': + conv_feats = conv_inputs # B T F + else: + conv_feats = tf.transpose(conv_inputs, [0, 2, 1]) # B F T + + # ----- Convolutional layers --------------------------------------------- + convnet_layers = self.params['convnet_layers'] + + for idx_convnet in range(len(convnet_layers)): + layer_type = convnet_layers[idx_convnet]['type'] + layer_repeat = convnet_layers[idx_convnet]['repeat'] + ch_out = convnet_layers[idx_convnet]['num_channels'] + kernel_size = convnet_layers[idx_convnet]['kernel_size'] + strides = convnet_layers[idx_convnet]['stride'] + padding = convnet_layers[idx_convnet]['padding'] + + for idx_layer in range(layer_repeat): + conv_feats = conv_block( + type=layer_type, + name="conv{}{}".format( + idx_convnet + 1, idx_layer + 1), + inputs=conv_feats, + filters=ch_out, + kernel_size=kernel_size, + activation_fn=self.params['activation_fn'], + strides=strides, + padding=padding, + regularizer=regularizer, + training=training, + data_format=data_format, + **normalization_params + ) + outputs = tf.nn.dropout(x=conv_feats, keep_prob=dropout_keep_prob) + + if data_format == 'channels_first': + outputs = tf.transpose(outputs, [0, 2, 1]) + + return { + 'outputs': outputs, + 'src_length': src_length, + }
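Putting the config parameters documented above together, a hypothetical `encoder_params` fragment for `Wave2LetterEncoder` could look like the following; the layer sizes are illustrative, not a recommended architecture::

    import tensorflow as tf

    encoder_params = {
        'dropout_keep_prob': 0.8,
        'activation_fn': tf.nn.relu,
        'normalization': 'batch_norm',   # or None to skip normalization
        'data_format': 'channels_last',
        'convnet_layers': [
            {'type': 'conv1d', 'repeat': 3, 'kernel_size': [11],
             'stride': [1], 'num_channels': 256, 'padding': 'SAME'},
            {'type': 'conv1d', 'repeat': 1, 'kernel_size': [1],
             'stride': [1], 'num_channels': 512, 'padding': 'SAME'},
        ],
    }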
\ No newline at end of file
diff --git a/docs/html/_modules/index.html b/docs/html/_modules/index.html
index 979fef39a..26c5b9829 100644
--- a/docs/html/_modules/index.html
+++ b/docs/html/_modules/index.html
@@ -160,14 +160,18 @@

All modules for which code is available

  • data.text2text.text2text
  • data.text2text.tokenizer
  • data.utils
  • +
  • decoders.convs2s_decoder
  • decoders.decoder
  • decoders.fc_decoders
  • decoders.rnn_decoders
  • +
  • encoders.cnn_encoder
  • +
  • encoders.convs2s_encoder
  • encoders.ds2_encoder
  • encoders.encoder
  • encoders.resnet_blocks
  • encoders.resnet_encoder
  • encoders.rnn_encoders
  • +
  • encoders.w2l_encoder
  • losses.cross_entropy_loss
  • losses.ctc_loss
  • losses.loss
  • @@ -181,6 +185,10 @@

    All modules for which code is available

  • optimizers.lr_policies
  • optimizers.mp_wrapper
  • optimizers.optimizers
  • +
  • parts.cnns.conv_blocks
  • +
  • parts.convs2s.attention_wn_layer
  • +
  • parts.convs2s.conv_wn_layer
  • +
  • parts.convs2s.ffn_wn_layer
  • parts.rnns.attention_wrapper
  • parts.rnns.flstm
  • parts.rnns.glstm
  • @@ -190,7 +198,6 @@

    All modules for which code is available

  • parts.rnns.utils
  • parts.transformer.attention_layer
  • parts.transformer.beam_search
  • -
  • parts.transformer.beam_search_test
  • parts.transformer.common
  • parts.transformer.embedding_layer
  • parts.transformer.ffn_layer
  • diff --git a/docs/html/_modules/losses/sequence_loss.html b/docs/html/_modules/losses/sequence_loss.html index 8906dc30a..e1f239446 100644 --- a/docs/html/_modules/losses/sequence_loss.html +++ b/docs/html/_modules/losses/sequence_loss.html @@ -407,11 +407,17 @@

    Source code for losses.sequence_loss

           'batch_size': int,
           'tgt_vocab_size': int,
           'label_smoothing': float,
    +      'pad_embeddings_2_eight': bool,
         })
    def __init__(self, params, model, name="padded_cross_entropy_with_smoothing"):
      super(PaddedCrossEntropyLossWithSmoothing, self).__init__(params, model, name)
-     self._tgt_vocab_size = self.params["tgt_vocab_size"]
+     if self.params.get('pad_embeddings_2_eight', False):
+       self._tgt_vocab_size = self.params["tgt_vocab_size"] if self.params[
+           "tgt_vocab_size"] % 8 == 0 else \
+           self.params["tgt_vocab_size"] + (8 - self.params["tgt_vocab_size"] % 8)
+     else:
+       self._tgt_vocab_size = self.params["tgt_vocab_size"]
      self._label_smoothing = self.params.get("label_smoothing", 0.0)

    def _compute_loss(self, input_dict):

diff --git a/docs/html/_modules/models/encoder_decoder.html b/docs/html/_modules/models/encoder_decoder.html
index 1fdcf3f46..205a65200 100644
--- a/docs/html/_modules/models/encoder_decoder.html
+++ b/docs/html/_modules/models/encoder_decoder.html
@@ -285,8 +285,8 @@

    Source code for models.encoder_decoder

     
         Returns:
           tuple: tuple containing loss tensor as returned from
    -      ``loss.compute_loss()`` and samples tensor, which is taken from
    -      ``decoder.decode()['samples']``. When ``mode == 'infer'``, loss will
    +      ``loss.compute_loss()`` and list of outputs tensors, which is taken from
    +      ``decoder.decode()['outputs']``. When ``mode == 'infer'``, loss will
           be None.
         """
         if not isinstance(input_tensors, dict) or \
    @@ -314,7 +314,7 @@ 

    Source code for models.encoder_decoder

           if self.mode == "train":
             decoder_input['target_tensors'] = target_tensors
           decoder_output = self.decoder.decode(input_dict=decoder_input)
    -      decoder_samples = decoder_output.get("samples", None)
    +      model_outputs = decoder_output.get("outputs", None)
     
           if self.mode == "train" or self.mode == "eval":
             with tf.variable_scope("Loss"):
    @@ -326,7 +326,7 @@ 

    Source code for models.encoder_decoder

           else:
             deco_print("Inference Mode. Loss part of graph isn't built.")
             loss = None
    -      return loss, decoder_samples
    + return loss, model_outputs
    @property def encoder(self): diff --git a/docs/html/_modules/models/image2label.html b/docs/html/_modules/models/image2label.html index 50f0d80be..813c9b37c 100644 --- a/docs/html/_modules/models/image2label.html +++ b/docs/html/_modules/models/image2label.html @@ -167,7 +167,7 @@

    Source code for models.image2label

     
     
     
    [docs]class Image2Label(EncoderDecoderModel): -
    [docs] def maybe_print_logs(self, input_values, output_values): +
    [docs] def maybe_print_logs(self, input_values, output_values, training_step): labels = input_values['target_tensors'][0] logits = output_values[0] @@ -186,7 +186,7 @@

    Source code for models.image2label

           "Train batch top-5": top5,
         }
    -
    [docs] def finalize_evaluation(self, results_per_batch): +
    [docs] def finalize_evaluation(self, results_per_batch, training_step=None): top1 = 0.0 top5 = 0.0 total = 0.0 @@ -211,11 +211,12 @@

    Source code for models.image2label

         labels = np.where(labels == 1)[1]
     
         total = logits.shape[0]
    -    top1 = np.sum(np.argmax(logits, axis=1) == labels)
    -    top5 = np.sum(labels[:, np.newaxis] == np.argpartition(logits, -5)[:, -5:])
    +    top1 = np.sum(np.equal(np.argmax(logits, axis=1), labels))
    +    top5 = np.sum(np.equal(labels[:, np.newaxis],
    +                           np.argpartition(logits, -5)[:, -5:]))
         return total, top1, top5
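To see what the np.equal-based counting above produces, here is a small self-contained sketch of top-1/top-5 counting on toy data; the shapes mirror the logits and labels used above, and the numbers are made up.

import numpy as np

# Toy batch: 3 examples, 10 classes; labels are class indices.
logits = np.random.rand(3, 10)
labels = np.array([2, 7, 4])

total = logits.shape[0]
# top-1: the argmax over classes matches the label
top1 = np.sum(np.equal(np.argmax(logits, axis=1), labels))
# top-5: the label appears among the 5 largest logits of its row
top5 = np.sum(np.equal(labels[:, np.newaxis],
                       np.argpartition(logits, -5)[:, -5:]))
print(total, top1, top5)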
    -
    [docs] def get_num_objects_per_step(self, worker_id=0): +
    [docs] def _get_num_objects_per_step(self, worker_id=0): """Returns number of images in current batch, i.e. batch size.""" data_layer = self.get_data_layer(worker_id) num_images = tf.shape(data_layer.input_tensors['source_tensors'][0])[0] diff --git a/docs/html/_modules/models/model.html b/docs/html/_modules/models/model.html index 97595e7d6..ed8c775af 100644 --- a/docs/html/_modules/models/model.html +++ b/docs/html/_modules/models/model.html @@ -165,6 +165,11 @@

    Source code for models.model

     import copy
     import time
     
    +try:
    +    from inspect import signature
    +except ImportError:
    +    from funcsigs import signature
    +
     from open_seq2seq.utils.utils import deco_print, clip_last_batch
     from open_seq2seq.optimizers import optimize_loss, get_regularization_loss
     from open_seq2seq.utils.utils import check_params
    @@ -210,6 +215,7 @@ 

    Source code for models.model

           'save_summaries_steps': None,  # could be int or None
           'print_loss_steps': None,  # could be int or None
           'print_samples_steps': None,  # could be int or None
    +      'print_bench_info_steps': None,  # could be int or None
           'save_checkpoint_steps': None,  # could be int or None
           'eval_steps': int,
     
    @@ -230,9 +236,9 @@ 

    Source code for models.model

           'lr_policy_params': dict,
           'max_grad_norm': float,
           'larc_params': dict,
    -      'loss_scale': float,
    -      'automatic_loss_scaling': [None, 'Backoff', 'LogMax'],
    +      'loss_scaling': None,  # float, "Backoff" or "LogMax"
           'summaries': list,
    +      'iter_size': int,
         }
    [docs] def __init__(self, params, mode="train", hvd=None): @@ -276,6 +282,11 @@

    Source code for models.model

         * **print_samples_steps** (int or None) --- how often to print training
           samples (input sequences, correct answers and model predictions).
           Setting it to None disables samples printing.
    +    * **print_bench_info_steps** (int or None) --- how often to print training
    +      benchmarking information (average number of objects processed per step).
    +      Setting it to None disables intermediate benchmarking printing, but
    +      the average information across the whole training will always be printed
    +      after the last iteration.
         * **save_checkpoint_steps** (int or None) --- how often to save model
           checkpoints. Setting it to None disables checkpoint saving.
         * **eval_steps** (int) --- how often to run evaluation during training.
    @@ -311,14 +322,17 @@ 

    Source code for models.model

         * **max_grad_norm** (float) --- maximum value of gradient norm. Clipping
           will be performed if some gradients exceed this value (this is checked
           for each variable independently).
    -    * **loss_scale** (float) --- static loss scale to use. For details see
    -      :ref:`mixed precision training <mixed_precision>` section in docs.
    -    * **automatic_loss_scaling** --- automatic loss scaling mode. Could be
    -      either None, "Backoff" or "Logmax". For details see
    -      :ref:`mixed precision training <mixed_precision>` section in docs.
    +    * **loss_scaling** --- could be float or string. If float, static loss
    +      scaling is applied. If string, the corresponding automatic
+      loss scaling algorithm is used. Must be one of 'Backoff'
+      or 'LogMax' (case insensitive). Only used when dtype="mixed". For details
    +      see :ref:`mixed precision training <mixed_precision>` section in docs.
         * **summaries** (list) --- which summaries to log. Could contain
           "learning_rate", "gradients", "gradient_norm", "global_gradient_norm",
           "variables", "variable_norm".
    +    * **iter_size** (int) --- use this parameter to emulate large batches.
    +      The gradients will be accumulated for ``iter_size`` number of steps before
    +      applying update.
         * **larc_params** --- dictionary with parameters for LARC (or LARS)
           optimization algorithms. Can contain the following parameters:
     
    @@ -335,6 +349,9 @@ 

    Source code for models.model

     
         self._params = copy.deepcopy(params)
     
    +    if self._params.get('iter_size', 1) > 1 and hvd is None:
    +      raise ValueError("iter_size is only supported in Horovod mode")
    +
         # parameter checks
         self._mode = mode
         if self._mode not in ["train", "infer", "eval"]:
    @@ -356,6 +373,8 @@ 

    Source code for models.model

           self._params['save_checkpoint_steps'] = None
         if 'save_summaries_steps' not in self._params:
           self._params['save_summaries_steps'] = None
    +    if 'print_bench_info_steps' not in self._params:
    +      self._params['print_bench_info_steps'] = None
     
         # checking that frequencies of samples and loss are aligned
         s_fr = self._params['print_samples_steps']
    @@ -421,15 +440,21 @@ 

    Source code for models.model

               self._steps_in_epoch //= self._hvd.size()
             else:
               self._steps_in_epoch //= self.num_gpus
    +        self._steps_in_epoch //= self._params.get('iter_size', 1)
    +        if self._steps_in_epoch == 0:
    +          raise ValueError("Overall batch size is too big for this dataset.")
             self._last_step = self._params['num_epochs'] * self._steps_in_epoch
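As a worked example of the epoch arithmetic above (all numbers are hypothetical): with 100,000 training examples, a per-worker batch size of 32, 8 Horovod workers and iter_size=2, the schedule comes out as follows.

# Hypothetical numbers, only to show how iter_size shrinks steps_in_epoch.
num_examples = 100000
batch_size_per_worker = 32
num_workers = 8            # hvd.size() or num_gpus
iter_size = 2
num_epochs = 50

steps_in_epoch = num_examples // batch_size_per_worker
steps_in_epoch //= num_workers
steps_in_epoch //= iter_size   # updates are applied once per iter_size steps
if steps_in_epoch == 0:
    raise ValueError("Overall batch size is too big for this dataset.")
last_step = num_epochs * steps_in_epoch
print(steps_in_epoch, last_step)   # 195 9750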
     
         if self.on_horovod:
           self._output = None
         else:
           self._outputs = [None] * self.num_gpus
    +
         self.loss = None
         self.train_op = None
    -    self.eval_losses = None
    + self.eval_losses = None + self._num_objects_per_step = None + self.skip_update_ph = None
    [docs] def compile(self, force_var_reuse=False): """TensorFlow graph is built here.""" @@ -461,7 +486,7 @@

    Source code for models.model

               )
               if self._outputs[gpu_cnt] is not None and \
                  not isinstance(self._outputs[gpu_cnt], list):
    -            raise ValueError('Decoder samples have to be either None or list')
    +            raise ValueError('Decoder outputs have to be either None or list')
               if self._mode == "train" or self._mode == "eval":
                 losses.append(loss)
           # end of for gpu_ind loop
    @@ -487,13 +512,19 @@ 

    Source code for models.model

             loss, self._output = self._build_forward_pass_graph(input_tensors,
                                                                 gpu_id=0)
             if self._output is not None and not isinstance(self._output, list):
    -          raise ValueError('Decoder samples have to be either None or list')
    +          raise ValueError('Decoder outputs have to be either None or list')
     
             if self._mode == "train":
               self.loss = loss
             if self._mode == "eval":
               self.eval_losses = [loss]
     
    +    try:
    +      self._num_objects_per_step = [self._get_num_objects_per_step(worker_id)
    +                                    for worker_id in range(self.num_gpus)]
    +    except NotImplementedError:
    +      pass
    +
         if self._mode == "train":
           if 'lr_policy' not in self.params:
             lr_policy = None
    @@ -501,34 +532,31 @@ 

    Source code for models.model

             lr_params = self.params.get('lr_policy_params', {})
             # adding default decay_steps = max_steps if lr_policy supports it and
             # different value is not provided
    -        if 'decay_steps' in self.params['lr_policy'].__code__.co_varnames and \
    -           'decay_steps' not in lr_params:
    +        func_params = signature(self.params['lr_policy']).parameters
    +        if 'decay_steps' in func_params and 'decay_steps' not in lr_params:
               lr_params['decay_steps'] = self._last_step
    -        if 'steps_per_epoch' in self.params['lr_policy'].__code__.co_varnames and \
    +        if 'steps_per_epoch' in func_params and \
                'steps_per_epoch' not in lr_params and 'num_epochs' in self.params:
               lr_params['steps_per_epoch'] = self.steps_in_epoch
             lr_policy = lambda gs: self.params['lr_policy'](global_step=gs,
                                                             **lr_params)
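The switch from __code__.co_varnames to signature() above makes the decay_steps/steps_per_epoch check work for any callable, including wrapped or partial functions. A small standalone sketch of that pattern; the poly_decay policy here is hypothetical and only serves the illustration.

try:
    from inspect import signature   # Python 3
except ImportError:
    from funcsigs import signature  # Python 2 backport

def poly_decay(global_step, learning_rate, decay_steps, power=1.0):
    # hypothetical lr policy, used only for this example
    return learning_rate * (1 - float(global_step) / decay_steps) ** power

func_params = signature(poly_decay).parameters
lr_params = {"learning_rate": 0.01}
if 'decay_steps' in func_params and 'decay_steps' not in lr_params:
    lr_params['decay_steps'] = 1000  # e.g. default to the last training step
print(sorted(lr_params))             # ['decay_steps', 'learning_rate']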
     
    +      if self.params.get('iter_size', 1) > 1:
    +        self.skip_update_ph = tf.placeholder(tf.bool)
    +
           self.train_op = optimize_loss(
             loss=tf.cast(self.loss, tf.float32) + get_regularization_loss(),
             dtype=self.params['dtype'],
             optimizer=self.params['optimizer'],
             optimizer_params=self.params.get('optimizer_params', {}),
    -        gradient_noise_scale=None,
    -        gradient_multipliers=None,
             clip_gradients=self.params.get('max_grad_norm', None),
             learning_rate_decay_fn=lr_policy,
    -        update_ops=None,
    -        variables=None,
    -        name="Loss_Optimization",
             summaries=self.params.get('summaries', None),
    -        colocate_gradients_with_ops=True,
    -        increment_global_step=True,
             larc_params=self.params.get('larc_params', None),
    -        loss_scale=self.params.get('loss_scale', 1.0),
    -        automatic_loss_scaling=self.params.get('automatic_loss_scaling', None),
    +        loss_scaling=self.params.get('loss_scaling', 1.0),
             on_horovod=self.on_horovod,
    +        iter_size=self.params.get('iter_size', 1),
    +        skip_update_ph=self.skip_update_ph,
           )
           tf.summary.scalar(name="train_loss", tensor=self.loss)
           if self.steps_in_epoch:
    @@ -569,7 +597,7 @@ 

    Source code for models.model

               is constructed. For Horovod this is always zero.
     
         Returns:
    -      tuple: tuple containing loss tensor and samples tensor.
    +      tuple: tuple containing loss tensor and list of outputs tensors.
     
           Loss tensor will be automatically provided to the optimizer and
           corresponding :attr:`train_op` will be created.
    @@ -579,12 +607,12 @@ 

    Source code for models.model

           this happens inside :class:`utils.hooks.RunEvaluationHook`
           to fetch output values for evaluation.
     
    -      Both loss and samples can be None when corresponding part of the graph
    +      Both loss and outputs can be None when corresponding part of the graph
           is not built.
         """
         pass
    -
    [docs] def maybe_print_logs(self, input_values, output_values): +
    [docs] def maybe_print_logs(self, input_values, output_values, training_step): """This method can be used to print logs that help to visualize training. For example, you can print sample input sequences and their corresponding predictions. This method will be called every ``print_samples_steps`` @@ -602,6 +630,7 @@

    Source code for models.model

           output_values: evaluation of
               :meth:`self.get_output_tensors(0) <get_output_tensors>`,
               that is, output tensors for one batch on the *first* GPU.
    +      training_step (int): Current training step.
     
         Returns:
           dict: dictionary with values that need to be logged to TensorBoard
    @@ -646,7 +675,7 @@ 

    Source code for models.model

         """
         return []
    -
    [docs] def finalize_evaluation(self, results_per_batch): +
    [docs] def finalize_evaluation(self, results_per_batch, training_step=None): """This method can be used in conjunction with :meth:`self.evaluate()<evaluate>` to calculate evaluation metrics. @@ -669,6 +698,8 @@

    Source code for models.model

           results_per_batch (list): aggregation of values returned from all calls
               to :meth:`self.evaluate()<evaluate>` method (number of calls will be
               equal to number of evaluation batches).
    +      training_step (int): current training step. Will only be passed if mode
    +          is "train_eval".
     
         Returns:
           dict: dictionary with values that need to be logged to TensorBoard
    @@ -776,7 +807,7 @@ 

    Source code for models.model

         else:
           return self.params['dtype']
    -
    [docs] def get_num_objects_per_step(self, worker_id=0): +
    [docs] def _get_num_objects_per_step(self, worker_id=0): """Define this method if you need benchmarking functionality. For example, for translation models, this method should return number of tokens in current batch, for image recognition model should return number @@ -791,6 +822,12 @@

    Source code for models.model

         """
         raise NotImplementedError()
    +
[docs] def get_num_objects_per_step(self, worker_id=0):
+    if self._num_objects_per_step:
+      return self._num_objects_per_step[worker_id]
+    else:
+      raise NotImplementedError()
    + @property def params(self): """Parameters used to construct the model (dictionary).""" diff --git a/docs/html/_modules/models/speech2text.html b/docs/html/_modules/models/speech2text.html index 80796538c..ee88e2bf3 100644 --- a/docs/html/_modules/models/speech2text.html +++ b/docs/html/_modules/models/speech2text.html @@ -204,7 +204,7 @@

    Source code for models.speech2text

         )
         return super(Speech2Text, self)._create_decoder()
     
    -
    [docs] def maybe_print_logs(self, input_values, output_values): +
    [docs] def maybe_print_logs(self, input_values, output_values, training_step): y, len_y = input_values['target_tensors'] decoded_sequence = output_values y_one_sample = y[0] @@ -229,7 +229,7 @@

    Source code for models.speech2text

           'Sample WER': sample_wer,
         }
    -
    [docs] def finalize_evaluation(self, results_per_batch): +
    [docs] def finalize_evaluation(self, results_per_batch, training_step=None): total_word_lev = 0.0 total_word_count = 0.0 for word_lev, word_count in results_per_batch: @@ -275,13 +275,21 @@

    Source code for models.speech2text

         )
         for sample_id in range(len(decoded_texts)):
           preds.append("".join(decoded_texts[sample_id]))
    -    return preds
    + return preds, input_values['source_ids']
[docs] def finalize_inference(self, results_per_batch, output_file):
    preds = []
+    ids = []

-    for result in results_per_batch:
+    for result, idx in results_per_batch:
      preds.extend(result)
+      ids.extend(idx)
+
+    preds = np.array(preds)
+    ids = np.hstack(ids)
+    # restoring the correct order
+    preds = preds[np.argsort(ids)]
+
    pd.DataFrame(
      {
        'wav_filename': self.get_data_layer().all_files,
@@ -290,7 +298,7 @@

    Source code for models.speech2text

           columns=['wav_filename', 'predicted_transcript'],
         ).to_csv(output_file, index=False)
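The np.argsort(ids) step above puts predictions back into dataset order after batches may have been produced out of order; a tiny self-contained illustration with made-up ids and transcripts.

import numpy as np

# Predictions arrive grouped by batch, not in dataset order.
preds = np.array(["transcript_c", "transcript_a", "transcript_b"])
ids = np.hstack([np.array([2]), np.array([0, 1])])  # original sample indices

preds = preds[np.argsort(ids)]   # back to dataset order
print(preds)                     # ['transcript_a' 'transcript_b' 'transcript_c']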
    -
    [docs] def get_num_objects_per_step(self, worker_id=0): +
    [docs] def _get_num_objects_per_step(self, worker_id=0): """Returns number of audio frames in current batch.""" data_layer = self.get_data_layer(worker_id) num_frames = tf.reduce_sum(data_layer.input_tensors['source_tensors'][1]) diff --git a/docs/html/_modules/models/text2text.html b/docs/html/_modules/models/text2text.html index 7efd1f609..fdf38dcdf 100644 --- a/docs/html/_modules/models/text2text.html +++ b/docs/html/_modules/models/text2text.html @@ -235,22 +235,23 @@

    Source code for models.text2text

         input_strings, output_strings = [], []
         input_values = input_values['source_tensors']
         for input_sample, output_sample in zip(input_values, output_values):
    -      output_strings.append(text_ids_to_string(
    -        output_sample[0],
    -        self.get_data_layer().params['target_idx2seq'],
    -        S_ID=self.decoder.params['GO_SYMBOL'],
    -        EOS_ID=self.decoder.params['END_SYMBOL'],
    -        PAD_ID=self.decoder.params['PAD_SYMBOL'],
    -        ignore_special=True, delim=' ',
    -      ))
    -      input_strings.append(text_ids_to_string(
    -        input_sample[0],
    -        self.get_data_layer().params['source_idx2seq'],
    -        S_ID=self.decoder.params['GO_SYMBOL'],
    -        EOS_ID=self.decoder.params['END_SYMBOL'],
    -        PAD_ID=self.decoder.params['PAD_SYMBOL'],
    -        ignore_special=True, delim=' ',
    -      ))
    +      for i in range(0, input_sample.shape[0]): # iterate over batch dimension
    +        output_strings.append(text_ids_to_string(
    +          output_sample[i],
    +          self.get_data_layer().params['target_idx2seq'],
    +          S_ID=self.decoder.params['GO_SYMBOL'],
    +          EOS_ID=self.decoder.params['END_SYMBOL'],
    +          PAD_ID=self.decoder.params['PAD_SYMBOL'],
    +          ignore_special=True, delim=' ',
    +        ))
    +        input_strings.append(text_ids_to_string(
    +          input_sample[i],
    +          self.get_data_layer().params['source_idx2seq'],
    +          S_ID=self.decoder.params['GO_SYMBOL'],
    +          EOS_ID=self.decoder.params['END_SYMBOL'],
    +          PAD_ID=self.decoder.params['PAD_SYMBOL'],
    +          ignore_special=True, delim=' ',
    +        ))
         return input_strings, output_strings
    [docs] def finalize_inference(self, results_per_batch, output_file): @@ -265,7 +266,7 @@

    Source code for models.text2text

                 deco_print("")
               step += 1
    -
    [docs] def maybe_print_logs(self, input_values, output_values): +
    [docs] def maybe_print_logs(self, input_values, output_values, training_step): x, len_x = input_values['source_tensors'] y, len_y = input_values['target_tensors'] samples = output_values[0] @@ -355,7 +356,7 @@

    Source code for models.text2text

     
         return preds, targets
    -
    [docs] def finalize_evaluation(self, results_per_batch): +
    [docs] def finalize_evaluation(self, results_per_batch, training_step=None): preds, targets = [], [] for preds_cur, targets_cur in results_per_batch: if self.params.get('eval_using_bleu', True): @@ -369,13 +370,18 @@

    Source code for models.text2text

     
         return {}
    -
    [docs] def get_num_objects_per_step(self, worker_id=0): +
    [docs] def _get_num_objects_per_step(self, worker_id=0): """Returns number of source tokens + number of target tokens in batch.""" data_layer = self.get_data_layer(worker_id) # sum of source length in batch num_tokens = tf.reduce_sum(data_layer.input_tensors['source_tensors'][1]) - # sum of target length in batch - num_tokens += tf.reduce_sum(data_layer.input_tensors['target_tensors'][1]) + if self.mode != "infer": + # sum of target length in batch + num_tokens += tf.reduce_sum(data_layer.input_tensors['target_tensors'][1]) + else: + # TODO: this is not going to be correct when batch size > 1, since it will + # count padding? + num_tokens += tf.reduce_sum(tf.shape(self.get_output_tensors(worker_id)[0])) return num_tokens
    diff --git a/docs/html/_modules/optimizers/automatic_loss_scaler.html b/docs/html/_modules/optimizers/automatic_loss_scaler.html index d2f005f41..e13af41f7 100644 --- a/docs/html/_modules/optimizers/automatic_loss_scaler.html +++ b/docs/html/_modules/optimizers/automatic_loss_scaler.html @@ -163,25 +163,25 @@

    Source code for optimizers.automatic_loss_scaler

    import tensorflow as tf -
    [docs]class AutomaticLossScaler: - SUPPORTED_ALGOS = ['Backoff', 'LogMax'] +
    [docs]class AutomaticLossScaler(object): + SUPPORTED_ALGOS = ['backoff', 'logmax'] def __init__(self, algorithm='Backoff', scale_min=1.0, scale_max=2.**24): - if algorithm == 'Backoff': + algorithm = algorithm.lower().strip() + if algorithm == 'backoff': self.scaler = BackoffScaler(scale_min=scale_min, scale_max=scale_max, step_factor=2.0, step_window=2000) - elif algorithm == 'LogMax': + elif algorithm == 'logmax': self.scaler = LogMaxScaler(scale_min=scale_min, scale_max=scale_max, log_max=16., beta1=0.99, beta2=0.999, - overflow_std_dev=3.09) # ppf(.999) + overflow_std_dev=3.09) # ppf(.999) else: - raise ValueError('Unknown dynamic scaling algorithm: %s' - % algorithm_name) + raise ValueError('Unknown scaling algorithm: {}'.format(algorithm))
    [docs] def update_op(self, has_nan, amax): return self.scaler.update_op(has_nan, amax)
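Because the algorithm name is now lower-cased and stripped, 'Backoff', 'backoff' and 'LOGMAX' all resolve to the same scalers. A short usage sketch; the import path is an assumption based on the module names shown in these docs.

# Assumes OpenSeq2Seq is installed; the module path mirrors
# optimizers/automatic_loss_scaler.py documented above.
from open_seq2seq.optimizers.automatic_loss_scaler import AutomaticLossScaler

scaler = AutomaticLossScaler(algorithm='Backoff')   # same as 'backoff'
# In optimize_loss, a string "loss_scaling" value is turned into such a
# scaler, while a float is used directly as a static loss scale.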
    @@ -210,7 +210,7 @@

    Source code for optimizers.automatic_loss_scaler

    return has_nan, amax
    -
    [docs]class BackoffScaler: +
    [docs]class BackoffScaler(object): def __init__(self, scale_min, scale_max, step_factor, step_window): self.scale_min = scale_min self.scale_max = scale_max @@ -260,7 +260,7 @@

    Source code for optimizers.automatic_loss_scaler

    return self.scale
    -
    [docs]class LogMaxScaler: +
    [docs]class LogMaxScaler(object): def __init__(self, scale_min, scale_max, log_max, beta1, beta2, overflow_std_dev): self.scale_min = scale_min self.scale_max = scale_max diff --git a/docs/html/_modules/optimizers/mp_wrapper.html b/docs/html/_modules/optimizers/mp_wrapper.html index 66489ab33..34c0746f5 100644 --- a/docs/html/_modules/optimizers/mp_wrapper.html +++ b/docs/html/_modules/optimizers/mp_wrapper.html @@ -235,7 +235,6 @@

    Source code for optimizers.mp_wrapper

         return grads_and_vars_fp32
    [docs] def apply_gradients(self, grads_and_vars, global_step=None, name=None): - def apply_ops_wrapper(): update_op = self._optimizer.apply_gradients(grads_and_vars, global_step, name) @@ -256,9 +255,7 @@

    Source code for optimizers.mp_wrapper

           loss_scale_update_op = self._loss_scaler.update_op(grad_has_nans,
                                                              grad_amax)
           with tf.control_dependencies([loss_scale_update_op]):
    -        return tf.cond(should_skip_update,
    -                       tf.no_op,
    -                       apply_ops_wrapper)
    +        return tf.cond(should_skip_update, tf.no_op, apply_ops_wrapper)
         else:
           return apply_ops_wrapper()
    diff --git a/docs/html/_modules/optimizers/optimizers.html b/docs/html/_modules/optimizers/optimizers.html index 980ce9ba3..a995d301e 100644 --- a/docs/html/_modules/optimizers/optimizers.html +++ b/docs/html/_modules/optimizers/optimizers.html @@ -182,23 +182,8 @@

    Source code for optimizers.optimizers

     
     import six
     import tensorflow as tf
    -
    -from tensorflow.contrib import framework as contrib_framework
    -from tensorflow.python.framework import constant_op
    -from tensorflow.python.framework import dtypes
    -from tensorflow.python.framework import ops
    -from tensorflow.python.ops import array_ops
    -from tensorflow.python.ops import clip_ops
     from tensorflow.python.ops import control_flow_ops
    -from tensorflow.python.ops import init_ops
    -from tensorflow.python.ops import math_ops
    -from tensorflow.python.ops import random_ops
    -from tensorflow.python.ops import variable_scope as vs
    -from tensorflow.python.ops import variables as vars_
    -from tensorflow.python.summary import summary
    -from tensorflow.python.training import moving_averages
    -from tensorflow.python.training import optimizer as optimizer_
    -from tensorflow.python.training import training as train
    +
     
     from .automatic_loss_scaler import AutomaticLossScaler
     from .mp_wrapper import MixedPrecisionOptimizerWrapper
    @@ -206,12 +191,12 @@ 

    Source code for optimizers.optimizers

     
     
     OPTIMIZER_CLS_NAMES = {
    -  "Adagrad": train.AdagradOptimizer,
    -  "Adam": train.AdamOptimizer,
    -  "Ftrl": train.FtrlOptimizer,
    -  "Momentum": train.MomentumOptimizer,
    -  "RMSProp": train.RMSPropOptimizer,
    -  "SGD": train.GradientDescentOptimizer,
    +  "Adagrad": tf.train.AdagradOptimizer,
    +  "Adam": tf.train.AdamOptimizer,
    +  "Ftrl": tf.train.FtrlOptimizer,
    +  "Momentum": tf.train.MomentumOptimizer,
    +  "RMSProp": tf.train.RMSPropOptimizer,
    +  "SGD": tf.train.GradientDescentOptimizer,
     }
     
     OPTIMIZER_SUMMARIES = [
    @@ -236,534 +221,302 @@ 

    Source code for optimizers.optimizers

       """
       losses = tf.losses.get_regularization_losses(scope)
       if losses:
    -    return math_ops.add_n(list(map(lambda x: tf.cast(x, tf.float32), losses)),
    -                          name=name)
    +    return tf.add_n(list(map(lambda x: tf.cast(x, tf.float32), losses)),
    +                    name=name)
       else:
    -    return constant_op.constant(0.0)
    - - -
    [docs]class DistributedOptimizer(tf.train.Optimizer): - """An optimizer that wraps another tf.Optimizer, using an allreduce to - average gradient values before applying gradients to model weights.""" - -
    [docs] def __init__(self, optimizer, name=None, use_locking=False, device_dense='', - device_sparse=''): - """Construct a new DistributedOptimizer, which uses another optimizer - under the hood for computing single-process gradient values and - applying gradient updates after the gradient values have been averaged - across all the Horovod ranks. - Args: - optimizer: - Optimizer to use for computing gradients and applying updates. - name: - Optional name prefix for the operations created when applying - gradients. Defaults to "Distributed" followed by the provided - optimizer type. - use_locking: - Whether to use locking when updating variables. - See Optimizer.__init__ for more info. - device_dense: - Device to be used for dense tensors. Uses GPU by default - if Horovod was build with HOROVOD_GPU_ALLREDUCE. - device_sparse: - Device to be used for sparse tensors. Uses GPU by default - if Horovod was build with HOROVOD_GPU_ALLGATHER. - """ - if name is None: - name = "Distributed{}".format(type(optimizer).__name__) - - self._optimizer = optimizer - self._device_dense = device_dense - self._device_sparse = device_sparse - super(DistributedOptimizer, self).__init__( - name=name, use_locking=use_locking)
    - -
    [docs] def compute_gradients(self, *args, **kwargs): - """Compute gradients of all trainable variables. - See Optimizer.compute_gradients() for more info. - In DistributedOptimizer, compute_gradients() is overriden to also - allreduce the gradients before returning them. - """ - gradients = self._optimizer.compute_gradients(*args, **kwargs) + return tf.constant(0.0)
    + + +
    [docs]def reduce_gradients(grads_and_vars, on_horovod): + if on_horovod: from horovod.common import size from horovod.tensorflow import allreduce if size() > 1: - averaged_gradients = [] - with tf.name_scope(self._name + "_Allreduce"): - for grad, var in gradients: + averaged_grads_and_vars = [] + with tf.name_scope("all_reduce"): + for grad, var in grads_and_vars: if grad is not None: - avg_grad = allreduce(grad, device_dense=self._device_dense, - device_sparse=self._device_sparse) - averaged_gradients.append((avg_grad, var)) + avg_grad = allreduce(grad) + averaged_grads_and_vars.append((avg_grad, var)) else: - averaged_gradients.append((None, var)) - return averaged_gradients + averaged_grads_and_vars.append((None, var)) + return averaged_grads_and_vars else: - return gradients
    - -
    [docs] def apply_gradients(self, grads_and_vars, global_step=None, name=None): - """Calls this same method on the underlying optimizer.""" - return self._optimizer.apply_gradients(grads_and_vars, global_step, name)
    + return grads_and_vars + else: + raise NotImplementedError("Reduce in tower-mode is not implemented.")
    [docs]def optimize_loss(loss, optimizer, optimizer_params, learning_rate_decay_fn, - global_step=None, dtype=tf.float32, - gradient_noise_scale=None, - gradient_multipliers=None, clip_gradients=None, - update_ops=None, - variables=None, - name=None, summaries=None, - colocate_gradients_with_ops=False, - increment_global_step=True, larc_params=None, - loss_scale=1.0, - automatic_loss_scaling=None, - on_horovod=False): + loss_scaling=1.0, + on_horovod=False, + iter_size=1, + skip_update_ph=None): """Given loss and parameters for optimizer, returns a training op. - Various ways of passing optimizers include: - - - by string specifying the name of the optimizer. See OPTIMIZER_CLS_NAMES - for full list. E.g. `optimize_loss(..., optimizer='Adam')`. - - by function taking learning rate `Tensor` as argument and returning an - `Optimizer` instance. E.g. `optimize_loss(..., - optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5))`. - Alternatively, if `learning_rate` is `None`, the function takes no - arguments. E.g. `optimize_loss(..., learning_rate=None, - optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`. - - by a subclass of `Optimizer` having a single-argument constructor - (the argument is the learning rate), such as AdamOptimizer or - AdagradOptimizer. E.g. `optimize_loss(..., - optimizer=tf.train.AdagradOptimizer)`. - - by an instance of a subclass of `Optimizer`. - E.g., `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`. - Args: loss: Scalar `Tensor`. - global_step: Scalar int `Tensor`, step counter to update on each step - unless `increment_global_step` is `False`. If not supplied, - it will be fetched from the default graph (see - `tf.train.get_global_step` for details). If it has - not been created, no step will be incremented with each weight - update. `learning_rate_decay_fn` requires `global_step`. - learning_rate: float or `Tensor`, magnitude of update per each training - step. Can be `None`. - optimizer: string, class or optimizer instance, used as trainer. - string should be name of optimizer, like 'SGD', - 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant. - class should be sub-class of `tf.Optimizer` that implements - `compute_gradients` and `apply_gradients` functions. - optimizer instance should be instantiation of `tf.Optimizer` - sub-class and have `compute_gradients` and `apply_gradients` - functions. - gradient_noise_scale: float or None, adds 0-mean normal noise scaled by this - value. - gradient_multipliers: dict of variables or variable names to floats. - If present, gradients for specified - variables will be multiplied by given constant. - clip_gradients: float, callable or `None`. If float, is provided, a global - clipping is applied to prevent the norm of the gradient to exceed this - value. Alternatively, a callable can be provided e.g.: adaptive_clipping. - This callable takes a `list` of `(gradients, variables)` `tuple`s and - returns the same thing with the gradients modified. - learning_rate_decay_fn: function, takes `learning_rate` and `global_step` - `Tensor`s, returns `Tensor`. - Can be used to implement any learning rate decay - functions. - For example: `tf.train.exponential_decay`. - Ignored if `learning_rate` is not supplied. - update_ops: list of update `Operation`s to execute at each step. If `None`, - uses elements of UPDATE_OPS collection. The order of execution - between `update_ops` and `loss` is non-deterministic. 
- variables: list of variables to optimize or - `None` to use all trainable variables. - name: The name for this operation is used to scope operations and summaries. + optimizer: string or class of optimizer, used as trainer. + string should be name of optimizer, like 'SGD', + 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant. + class should be sub-class of `tf.Optimizer` that implements + `compute_gradients` and `apply_gradients` functions. + optimizer_params: parameters of the optimizer. + dtype: model dtype (tf.float16, tf.float32 or "mixed"). + learning_rate_decay_fn: function, takes `global_step` + `Tensor`s, returns `Tensor`. + Can be used to implement any learning rate decay + functions. + For example: `tf.train.exponential_decay`. + Ignored if `learning_rate` is not supplied. + clip_gradients: float, max gradient norm to clip to. summaries: List of internal quantities to visualize on tensorboard. If not - set only the loss and the learning rate will be reported. The - complete list is in OPTIMIZER_SUMMARIES. - colocate_gradients_with_ops: If True, try colocating gradients with the - corresponding op. - increment_global_step: Whether to increment `global_step`. If your model - calls `optimize_loss` multiple times per training step (e.g. to optimize - different parts of the model), use this arg to avoid incrementing - `global_step` more times than necessary. - LARC_mode: 'scale' or 'clip' - LARC_nu: If not None, LARC re-scaling will be - applied https://arxiv.org/pdf/1708.03888.pdf with nu=LARC_nu - automatic_loss_scaling: if not None, use the corresponding automatic - loss scaling algorithm. Must be one of 'Backoff' - of 'LogMax'. `dtype` must be "mixed" to use ALS. + set only the loss and the learning rate will be reported. The + complete list is in OPTIMIZER_SUMMARIES. + larc_params: If not None, LARC re-scaling will + be applied with corresponding parameters. + loss_scaling: could be float or string. If float, static loss scaling + is applied. If string, the corresponding automatic + loss scaling algorithm is used. Must be one of 'Backoff' + of 'LogMax' (case insensitive). Only used when dtype="mixed". + on_horovod: whether the model is run on horovod. + Returns: - Training op. - - Raises: - ValueError: if: - * `loss` is an invalid type or shape. - * `global_step` is an invalid type or shape. - * `learning_rate` is an invalid type or value. - * `optimizer` has the wrong type. - * `clip_gradients` is neither float nor callable. - * `learning_rate` and `learning_rate_decay_fn` are supplied, but no - `global_step` is available. - * `gradients` is empty. + training op. """ - loss = ops.convert_to_tensor(loss) - contrib_framework.assert_scalar(loss) - if global_step is None: - global_step = tf.train.get_or_create_global_step() + if summaries is None: + summaries = ["learning_rate", "global_gradient_norm"] else: - tf.train.assert_global_step(global_step) - with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]): - # Update ops take UPDATE_OPS collection if not provided. - if update_ops is None: - update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS)) - # Make sure update ops are ran before computing loss. - if update_ops: - loss = control_flow_ops.with_dependencies(list(update_ops), loss) - - if summaries is None: - summaries = ["learning_rate", "global_gradient_norm"] - else: - for summ in summaries: - if summ not in OPTIMIZER_SUMMARIES: - raise ValueError("Summaries should be one of [%s], you provided %s." 
% - (", ".join(OPTIMIZER_SUMMARIES), summ)) - if global_step is None: - raise ValueError("global_step is required for learning_rate_decay_fn.") - lr = learning_rate_decay_fn(global_step) + for summ in summaries: + if summ not in OPTIMIZER_SUMMARIES: + raise ValueError( + "Summaries should be one of [{}], you provided {}.".format( + ", ".join(OPTIMIZER_SUMMARIES), summ, + )) + if clip_gradients is not None and larc_params is not None: + raise AttributeError( + "LARC and gradient norm clipping should not be used together" + ) + + global_step = tf.train.get_or_create_global_step() + lr = learning_rate_decay_fn(global_step) + if "learning_rate" in summaries: + tf.summary.scalar("learning_rate", lr) - if "learning_rate" in summaries: - summary.scalar("learning_rate", lr) + with tf.variable_scope("Loss_Optimization"): + update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) + loss = control_flow_ops.with_dependencies(list(update_ops), loss) # Create optimizer, given specified parameters. if isinstance(optimizer, six.string_types): - if lr is None: - raise ValueError("Learning rate is None, but should be specified if " - "optimizer is string (%s)." % optimizer) if optimizer not in OPTIMIZER_CLS_NAMES: raise ValueError( - "Optimizer name should be one of [%s], you provided %s." % - (", ".join(OPTIMIZER_CLS_NAMES), optimizer)) - opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr, **optimizer_params) - elif (isinstance(optimizer, type) and - issubclass(optimizer, optimizer_.Optimizer)): - if lr is None: - raise ValueError("Learning rate is None, but should be specified if " - "optimizer is class (%s)." % optimizer) - opt = optimizer(learning_rate=lr, **optimizer_params) - elif isinstance(optimizer, optimizer_.Optimizer): - opt = optimizer - elif callable(optimizer): - if lr is not None: - opt = optimizer(lr, **optimizer_params) - else: - opt = optimizer(**optimizer_params) - if not isinstance(opt, optimizer_.Optimizer): - raise ValueError("Unrecognized optimizer: function should return " - "subclass of Optimizer. Got %s." % str(opt)) - else: - raise ValueError("Unrecognized optimizer: should be string, " - "subclass of Optimizer, instance of " - "subclass of Optimizer or function with one argument. " - "Got %s." % str(optimizer)) - # All trainable variables, if specific variables are not specified. - if variables is None: - variables = vars_.trainable_variables() - - if automatic_loss_scaling is not None: - if automatic_loss_scaling not in AutomaticLossScaler.SUPPORTED_ALGOS: - raise ValueError("Unknown automatic loss scaling algorithm: %s." - % automatic_loss_sclaing) - if dtype != "mixed": - raise ValueError("Automatic loss scaling can be used only with " - "dtype=mixed.") - loss_scale = AutomaticLossScaler(algorithm=automatic_loss_scaling) + "Optimizer name should be one of [{}], you provided {}.".format( + ", ".join(OPTIMIZER_CLS_NAMES), optimizer + )) + optimizer = OPTIMIZER_CLS_NAMES[optimizer] + opt = optimizer(learning_rate=lr, **optimizer_params) + + if isinstance(loss_scaling, six.string_types): + loss_scaling = AutomaticLossScaler(algorithm=loss_scaling) if dtype == 'mixed': - opt = MixedPrecisionOptimizerWrapper(opt, loss_scale=loss_scale) - if on_horovod: - opt = DistributedOptimizer(opt) + opt = MixedPrecisionOptimizerWrapper(opt, loss_scale=loss_scaling) # Compute gradients. 
- gradients = opt.compute_gradients( - loss, variables, - colocate_gradients_with_ops=colocate_gradients_with_ops, + grads_and_vars = opt.compute_gradients( + loss, colocate_gradients_with_ops=True, ) - # Optionally add gradient noise. - if gradient_noise_scale is not None: - gradients = _add_scaled_noise_to_gradients(gradients, - gradient_noise_scale) - - # Multiply some gradients. - if gradient_multipliers is not None: - gradients = _multiply_gradients(gradients, gradient_multipliers) - if not gradients: - raise ValueError( - "Empty list of (gradient, var) pairs encountered. This is most " - "likely to be caused by an improper value of gradient_multipliers.") - - if "global_gradient_norm" in summaries or "gradient_norm" in summaries: - summary.scalar( - "global_norm/gradient_norm", - clip_ops.global_norm(list(map( - lambda x: tf.cast(x, tf.float32), - list(zip(*gradients))[0]) - )), - ) - - # Optionally clip gradients by global norm. - if clip_gradients is not None and larc_params is not None: - raise AttributeError( - "LARC and gradient norm clipping should not be used together" - ) - if isinstance(clip_gradients, float): - gradients = _clip_gradients_by_norm(gradients, clip_gradients) - elif callable(clip_gradients): - gradients = clip_gradients(gradients) - elif clip_gradients is not None: - raise ValueError( - "Unknown type %s for clip_gradients" % type(clip_gradients)) - - # Add histograms for variables, gradients and gradient norms. - for gradient, variable in gradients: - if isinstance(gradient, ops.IndexedSlices): - grad_values = gradient.values - else: - grad_values = gradient - - if isinstance(variable, ops.IndexedSlices): - var_values = variable.values + if on_horovod: + if iter_size > 1: + grads_and_vars_accum = [] + accum_ops = [] + for grad, var in grads_and_vars: + # necessary to use tf.Variable directly to instantiate cudnn rnn cells + # which don't have explicit shape. 
+ grad_accum = tf.Variable( + initial_value=tf.zeros_like(var), + name=grad.name.split(":")[0] + "_accum", + expected_shape=var.shape, + dtype=grad.dtype, + trainable=False, + validate_shape=bool(var.get_shape()) + ) + if isinstance(grad, tf.IndexedSlices): + add_grads = tf.scatter_nd_add(grad_accum, grad.indices, + grad.values / iter_size) + else: + add_grads = grad_accum + grad / iter_size + + accum_ops.append(tf.assign(grad_accum, add_grads)) + grads_and_vars_accum.append((grad_accum, var)) + + accum_op = tf.group(accum_ops) + + def update_and_clear_op(): + with tf.control_dependencies([accum_op]): + red_grad_updates = opt.apply_gradients( + post_process_gradients( + reduce_gradients(grads_and_vars_accum, on_horovod=True), + lr=lr, + clip_gradients=clip_gradients, + larc_params=larc_params, + summaries=summaries, + ), + global_step=global_step, + ) + + with tf.control_dependencies([red_grad_updates]): + return tf.group([tf.assign(g, tf.zeros_like(g)) + for g, v in grads_and_vars_accum]) + + grad_updates = tf.cond( + pred=skip_update_ph, + true_fn=lambda: accum_op, + false_fn=update_and_clear_op, + ) else: - var_values = variable - - if grad_values is not None: - var_name = variable.name.replace(":", "_") - if "gradients" in summaries: - summary.histogram("gradients/%s" % var_name, mask_nans(grad_values)) - if "gradient_norm" in summaries: - summary.scalar("gradient_norm/%s" % var_name, - clip_ops.global_norm([grad_values])) - if "variables" in summaries: - summary.histogram("variables/%s" % var_name, var_values) - if "variable_norm" in summaries: - summary.scalar("variable_norm/%s" % var_name, - clip_ops.global_norm([var_values])) - - if clip_gradients is not None and ("global_gradient_norm" in summaries or - "gradient_norm" in summaries): - summary.scalar( - "global_norm/clipped_gradient_norm", - clip_ops.global_norm(list(map( - lambda x: tf.cast(x, tf.float32), - list(zip(*gradients))[0]) - )), - ) - - # LARC gradient re-scaling - if larc_params is not None: - check_params( - config=larc_params, - required_dict={'larc_eta': float}, - optional_dict={ - 'larc_mode': ['clip', 'scale'], - 'min_update': float, - 'epsilon': float - }, + grad_updates = opt.apply_gradients( + post_process_gradients( + reduce_gradients(grads_and_vars, on_horovod=True), + lr=lr, + clip_gradients=clip_gradients, + larc_params=larc_params, + summaries=summaries, + ), + global_step=global_step, + ) + else: + grad_updates = opt.apply_gradients( + post_process_gradients( + grads_and_vars, + lr=lr, + clip_gradients=clip_gradients, + larc_params=larc_params, + summaries=summaries, + ), + global_step=global_step, ) - larc_eta = larc_params['larc_eta'] - larc_mode = larc_params.get('larc_mode', 'clip') - min_update = larc_params.get('min_update', 1e-7) - eps = larc_params.get('epsilon', 1e-7) - - for idx, (g, v) in enumerate(gradients): - var_dtype = v.dtype - v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2) - g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2) - - if larc_mode == 'clip': - larc_grad_update = tf.maximum( - larc_eta * v_norm / (lr * (g_norm + eps)), - min_update, - ) - if "larc_summaries" in summaries: - summary.scalar('larc_clip_on/{}'.format(v.name), - tf.cast(tf.less(larc_grad_update, 1.0), tf.int32)) - larc_grad_update = tf.minimum(larc_grad_update, 1.0) - else: - larc_grad_update = tf.maximum( - larc_eta * v_norm / (g_norm + eps), - min_update, - ) - larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype) - gradients[idx] = (larc_grad_update * g, v) - # adding additional summary - 
if "larc_summaries" in summaries: - summary.scalar('larc_grad_update/{}'.format(v.name), larc_grad_update) - summary.scalar("larc_final_lr/{}".format(v.name), - tf.cast(lr, var_dtype) * larc_grad_update) - - # Create gradient updates. - grad_updates = opt.apply_gradients( - gradients, - global_step=global_step if increment_global_step else None, - name="train") - - # # Ensure the train_tensor computes grad_updates. + # Ensure the train_tensor computes grad_updates. train_tensor = control_flow_ops.with_dependencies([grad_updates], loss) return train_tensor
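The iter_size branch above is gradient accumulation: gradients are summed into non-trainable accumulator variables while skip_update_ph is fed as True, and only on every iter_size-th step are they reduced, applied and cleared. A framework-free sketch of that accumulate-then-apply idea, with made-up gradient values.

# Plain-Python illustration of the accumulate-then-apply pattern behind iter_size.
iter_size = 4
grad_accum = 0.0                  # plays the role of a grad_accum variable

def fake_gradient(step):
    # stand-in for opt.compute_gradients(); values are made up
    return 0.1 * (step + 1)

for step in range(8):
    grad_accum += fake_gradient(step) / iter_size   # accum_op
    skip_update = (step + 1) % iter_size != 0       # value fed to skip_update_ph
    if not skip_update:
        # update_and_clear_op: apply the averaged gradient, then zero the buffer
        print("applying averaged gradient:", round(grad_accum, 3))
        grad_accum = 0.0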
    -
    [docs]def _clip_gradients_by_norm(grads_and_vars, clip_gradients): - """Clips gradients by global norm.""" - gradients, variables = zip(*grads_and_vars) - clipped_gradients, _ = clip_ops.clip_by_global_norm(gradients, clip_gradients) - return list(zip(clipped_gradients, variables))
    - - -
    [docs]def _adaptive_max_norm(norm, std_factor, decay, global_step, epsilon, name): - """Find max_norm given norm and previous average.""" - with vs.variable_scope(name, "AdaptiveMaxNorm", [norm]): - log_norm = math_ops.log(norm + epsilon) - - def moving_average(name, value, decay): - moving_average_variable = vs.get_variable( - name, - shape=value.get_shape(), - dtype=value.dtype, - initializer=init_ops.zeros_initializer(), - trainable=False) - return moving_averages.assign_moving_average( - moving_average_variable, value, decay, zero_debias=False) - - # quicker adaptation at the beginning - if global_step is not None: - n = math_ops.to_float(global_step) - decay = math_ops.minimum(decay, n / (n + 1.)) - - # update averages - mean = moving_average("mean", log_norm, decay) - sq_mean = moving_average("sq_mean", math_ops.square(log_norm), decay) - - variance = sq_mean - math_ops.square(mean) - std = math_ops.sqrt(math_ops.maximum(epsilon, variance)) - max_norms = math_ops.exp(mean + std_factor * std) - return max_norms, mean
    - - -
    [docs]def adaptive_clipping_fn(std_factor=2., - decay=0.95, - static_max_norm=None, - global_step=None, - report_summary=False, - epsilon=1e-8, - name=None): - """Adapt the clipping value using statistics on the norms. - - Implement adaptive gradient as presented in section 3.2.1 of - https://arxiv.org/abs/1412.1602. - - Keeps a moving average of the mean and std of the log(norm) of the gradient. - If the norm exceeds `exp(mean + std_factor*std)` then all gradients will be - rescaled such that the global norm becomes `exp(mean)`. - - Args: - std_factor: Python scaler (or tensor). - `max_norm = exp(mean + std_factor*std)` - decay: The smoothing factor of the moving averages. - static_max_norm: If provided, will threshold the norm to this value as an - extra safety. - global_step: Optional global_step. If provided, `decay = decay*n/(n+1)`. - This provides a quicker adaptation of the mean for the first steps. - report_summary: If `True`, will add histogram summaries of the `max_norm`. - epsilon: Small value chosen to avoid zero variance. - name: The name for this operation is used to scope operations and summaries. - - Returns: - A function for applying gradient clipping. - """ - - def gradient_clipping(grads_and_vars): - """Internal function for adaptive clipping.""" - grads, variables = zip(*grads_and_vars) - - norm = clip_ops.global_norm(grads) - - max_norm, log_mean = _adaptive_max_norm(norm, std_factor, decay, - global_step, epsilon, name) +
    [docs]def post_process_gradients(grads_and_vars, summaries, lr, + clip_gradients, larc_params): + """Applies post processing to gradients, i.e. clipping, LARC, summaries.""" + if "global_gradient_norm" in summaries: + tf.summary.scalar( + "global_gradient_norm", + _global_norm_with_cast(grads_and_vars), + ) - # reports the max gradient norm for debugging - if report_summary: - summary.scalar("global_norm/adaptive_max_gradient_norm", max_norm) + # Optionally clip gradients by global norm. + if clip_gradients is not None: + grads_and_vars = _clip_gradients_by_norm(grads_and_vars, clip_gradients) - # factor will be 1. if norm is smaller than max_norm - factor = array_ops.where(norm < max_norm, - array_ops.ones_like(norm), - math_ops.exp(log_mean) / norm) + # Add histograms for variables, gradients and gradient norms. + for gradient, variable in grads_and_vars: + if isinstance(gradient, tf.IndexedSlices): + grad_values = gradient.values + else: + grad_values = gradient - if static_max_norm is not None: - factor = math_ops.minimum(static_max_norm / norm, factor) + if isinstance(variable, tf.IndexedSlices): + var_values = variable.values + else: + var_values = variable + + if grad_values is not None: + var_name = variable.name.replace(":", "_") + if "gradients" in summaries: + # need to mask nans for automatic loss scaling + tf.summary.histogram("gradients/%s" % var_name, mask_nans(grad_values)) + if "gradient_norm" in summaries: + tf.summary.scalar("gradient_norm/%s" % var_name, tf.norm(grad_values)) + if "variables" in summaries: + tf.summary.histogram("variables/%s" % var_name, var_values) + if "variable_norm" in summaries: + tf.summary.scalar("variable_norm/%s" % var_name, tf.norm(var_values)) + + if clip_gradients is not None and "global_gradient_norm" in summaries: + tf.summary.scalar( + "global_clipped_gradient_norm", + _global_norm_with_cast(grads_and_vars), + ) - # apply factor - clipped_grads = [] - for grad in grads: - if grad is None: - clipped_grads.append(None) - elif isinstance(grad, ops.IndexedSlices): - clipped_grads.append( - ops.IndexedSlices(grad.values * factor, grad.indices, - grad.dense_shape)) + # LARC gradient re-scaling + if larc_params is not None: + check_params( + config=larc_params, + required_dict={'larc_eta': float}, + optional_dict={ + 'larc_mode': ['clip', 'scale'], + 'min_update': float, + 'epsilon': float + }, + ) + larc_eta = larc_params['larc_eta'] + larc_mode = larc_params.get('larc_mode', 'clip') + min_update = larc_params.get('min_update', 1e-7) + eps = larc_params.get('epsilon', 1e-7) + + grads_and_vars_larc = [None] * len(grads_and_vars) + for idx, (g, v) in enumerate(grads_and_vars): + var_dtype = v.dtype + v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2) + g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2) + + if larc_mode == 'clip': + larc_grad_update = tf.maximum( + larc_eta * v_norm / (lr * (g_norm + eps)), + min_update, + ) + if "larc_summaries" in summaries: + tf.summary.scalar('larc_clip_on/{}'.format(v.name), + tf.cast(tf.less(larc_grad_update, 1.0), tf.int32)) + larc_grad_update = tf.minimum(larc_grad_update, 1.0) else: - clipped_grads.append(grad * factor) - - return list(zip(clipped_grads, variables)) + larc_grad_update = tf.maximum( + larc_eta * v_norm / (g_norm + eps), + min_update, + ) + larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype) + grads_and_vars_larc[idx] = (larc_grad_update * g, v) + + # adding additional summary + if "larc_summaries" in summaries: + 
tf.summary.scalar('larc_grad_update/{}'.format(v.name), + larc_grad_update) + tf.summary.scalar("larc_final_lr/{}".format(v.name), + tf.cast(lr, var_dtype) * larc_grad_update) + grads_and_vars = grads_and_vars_larc + return grads_and_vars
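To make the LARC 'clip' branch above concrete, here is a small numeric sketch of the per-variable update scale it computes; tensors are replaced by toy numpy arrays and all hyper-parameters are illustrative.

import numpy as np

larc_eta, min_update, eps = 0.001, 1e-7, 1e-7
lr = 0.1
v = np.array([3.0, 4.0])   # variable, ||v|| = 5
g = np.array([0.6, 0.8])   # gradient, ||g|| = 1

v_norm = np.linalg.norm(v)
g_norm = np.linalg.norm(g)
larc_grad_update = max(larc_eta * v_norm / (lr * (g_norm + eps)), min_update)
larc_grad_update = min(larc_grad_update, 1.0)   # 'clip' mode caps the scale at 1
scaled_g = larc_grad_update * g
print(larc_grad_update, scaled_g)               # ~0.05, [0.03 0.04]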
    + + +def _global_norm_with_cast(grads_and_vars): + return tf.global_norm(list(map( + lambda x: tf.cast(x, tf.float32), + list(zip(*grads_and_vars))[0]) + )) - return gradient_clipping
    - -
    [docs]def _add_scaled_noise_to_gradients(grads_and_vars, gradient_noise_scale): - """Adds scaled noise from a 0-mean normal distribution to gradients.""" +
    [docs]def _clip_gradients_by_norm(grads_and_vars, clip_gradients): + """Clips gradients by global norm.""" gradients, variables = zip(*grads_and_vars) - noisy_gradients = [] - for gradient in gradients: - if gradient is None: - noisy_gradients.append(None) - continue - if isinstance(gradient, ops.IndexedSlices): - gradient_shape = gradient.dense_shape - else: - gradient_shape = gradient.get_shape() - noise = random_ops.truncated_normal(gradient_shape) * gradient_noise_scale - noisy_gradients.append(gradient + noise) - return list(zip(noisy_gradients, variables))
    - - -
    [docs]def _multiply_gradients(grads_and_vars, gradient_multipliers): - """Multiply specified gradients.""" - multiplied_grads_and_vars = [] - for grad, var in grads_and_vars: - if grad is not None and \ - (var in gradient_multipliers or var.name in gradient_multipliers): - key = var if var in gradient_multipliers else var.name - multiplier = constant_op.constant( - gradient_multipliers[key], dtype=dtypes.float32) - if isinstance(grad, ops.IndexedSlices): - grad_values = grad.values * multiplier - grad = ops.IndexedSlices(grad_values, grad.indices, grad.dense_shape) - else: - grad *= multiplier - multiplied_grads_and_vars.append((grad, var)) - return multiplied_grads_and_vars
    - + clipped_gradients, _ = tf.clip_by_global_norm(gradients, clip_gradients) + return list(zip(clipped_gradients, variables))
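A minimal sketch of how the rewritten post_process_gradients above might be wired into an optimizer step. The toy variable, loss, learning rate and LARC values are assumptions for illustration; only the function's signature and accepted keys come from the code above::

    import tensorflow as tf

    # toy variable and loss so that compute_gradients has something to work on
    w = tf.get_variable("w", shape=[10], initializer=tf.zeros_initializer())
    loss = tf.reduce_sum(tf.square(w - 1.0))

    lr = tf.constant(0.001)
    opt = tf.train.MomentumOptimizer(lr, momentum=0.9)

    grads_and_vars = opt.compute_gradients(loss)
    grads_and_vars = post_process_gradients(
        grads_and_vars,
        summaries=["global_gradient_norm", "gradient_norm"],
        lr=lr,
        clip_gradients=5.0,                # global-norm clipping threshold (illustrative)
        larc_params={"larc_eta": 0.001},   # LARC in its default 'clip' mode
    )
    train_op = opt.apply_gradients(grads_and_vars)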
diff --git a/docs/html/_modules/parts/cnns/conv_blocks.html new file mode 100644 index 000000000..fc1db4850 --- /dev/null +++ b/docs/html/_modules/parts/cnns/conv_blocks.html @@ -0,0 +1,327 @@
+ [Sphinx HTML theme boilerplate omitted; page title: parts.cnns.conv_blocks — OpenSeq2Seq 0.2 documentation]

    Source code for parts.cnns.conv_blocks

    +# Copyright (c) 2018 NVIDIA Corporation
    +from __future__ import absolute_import
    +from __future__ import division
    +from __future__ import print_function
    +from __future__ import unicode_literals
    +from six.moves import range
    +
    +import tensorflow as tf
    +
    +
    +
    [docs]def conv_actv(type, name, inputs, filters, kernel_size, activation_fn, strides, + padding, regularizer, training, data_format): + """Helper function that applies convolution and activation. + + Args: + type: the following types are supported + 'conv1d', 'conv2d' + """ + if type == "conv1d": + layer = tf.layers.conv1d + elif type == "conv2d": + layer = tf.layers.conv2d + + conv = layer( + name="{}".format(name), + inputs=inputs, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + kernel_regularizer=regularizer, + use_bias=False, + data_format=data_format, + ) + + output = conv + if activation_fn is not None: + output = activation_fn(output) + return output
    + + +
    [docs]def conv_bn_actv(type, name, inputs, filters, kernel_size, activation_fn, + strides, padding, regularizer, training, data_format, + bn_momentum, bn_epsilon): + """Helper function that applies convolution, batch norm and activation. + Accepts inputs in 'channels_last' format only. + + Args: + type: the following types are supported + 'conv1d', 'conv2d' + """ + if type == "conv1d": + layer = tf.layers.conv1d + elif type == "conv2d": + layer = tf.layers.conv2d + + conv = layer( + name="{}".format(name), + inputs=inputs, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + kernel_regularizer=regularizer, + use_bias=False, + data_format=data_format, + ) + + # trick to make batchnorm work for mixed precision training. + # To-Do check if batchnorm works smoothly for >4 dimensional tensors + squeeze = False + if type == "conv1d": + conv = tf.expand_dims(conv, axis=1) # NWC --> NHWC + squeeze = True + + bn = tf.layers.batch_normalization( + name="{}/bn".format(name), + inputs=conv, + gamma_regularizer=regularizer, + training=training, + axis=-1 if data_format == 'channels_last' else 1, + momentum=bn_momentum, + epsilon=bn_epsilon, + ) + + if squeeze: + bn = tf.squeeze(bn, axis=1) + + output = bn + if activation_fn is not None: + output = activation_fn(output) + return output
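As a usage illustration only (the shapes, filter counts and batch-norm settings below are made up), conv_bn_actv can be applied to a 'channels_last' 1D feature map like this::

    import tensorflow as tf

    # a batch of 8 sequences, 200 time steps, 64 features
    inputs = tf.placeholder(tf.float32, shape=[8, 200, 64])

    out = conv_bn_actv(
        type="conv1d",
        name="conv_block_1",
        inputs=inputs,
        filters=128,
        kernel_size=11,
        activation_fn=tf.nn.relu,
        strides=1,
        padding="SAME",
        regularizer=tf.contrib.layers.l2_regularizer(0.0005),
        training=True,
        data_format="channels_last",
        bn_momentum=0.99,
        bn_epsilon=1e-3,
    )
    # out has shape [8, 200, 128]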
\ No newline at end of file
diff --git a/docs/html/_modules/parts/convs2s/attention_wn_layer.html new file mode 100644 index 000000000..ee507bc84 --- /dev/null +++ b/docs/html/_modules/parts/convs2s/attention_wn_layer.html @@ -0,0 +1,325 @@
+ [Sphinx HTML theme boilerplate omitted; page title: parts.convs2s.attention_wn_layer — OpenSeq2Seq 0.2 documentation]

    Source code for parts.convs2s.attention_wn_layer

    +"""Implementation of the attention layer for convs2s.
    +Inspired from https://github.com/tobyyouup/conv_seq2seq"""
    +
    +from __future__ import absolute_import
    +from __future__ import division
    +from __future__ import print_function
    +from __future__ import unicode_literals
    +
    +import tensorflow as tf
    +import math
    +from open_seq2seq.parts.convs2s.ffn_wn_layer import FeedFowardNetworkNormalized
    +
    +
    +
    [docs]class AttentionLayerNormalized(tf.layers.Layer): + """Attention layer for convs2s with weight normalization""" + +
    [docs] def __init__(self, in_dim, embed_size, layer_id, add_res): + """initializes the attention layer. + It uses weight normalization for linear projections + (Salimans & Kingma, 2016) w = g * v/2-norm(v) + + Args: + in_dim: int last dimension of the inputs + embed_size: int target embedding size + layer_id: int the id of current convolution layer + add_res: bool whether residual connection should be added or not + """ + super(AttentionLayerNormalized, self).__init__() + + self.add_res = add_res + with tf.variable_scope("attention_layer_" + str(layer_id)): + + # linear projection layer to project the attention input to target space + self.tgt_embed_proj = FeedFowardNetworkNormalized( + in_dim, + embed_size, + dropout=1.0, + var_scope_name="att_linear_mapping_tgt_embed") + + # linear projection layer to project back to the input space + self.out_proj = FeedFowardNetworkNormalized( + embed_size, + in_dim, + dropout=1.0, + var_scope_name="att_linear_mapping_out")
    + +
    [docs] def call(self, input, target_embed, encoder_output_a, encoder_output_b, + input_attention_bias): + """Calculates the attention vectors. + + Args: + input: A float32 tensor with shape [batch_size, length, in_dim] + target_embed: A float32 tensor with shape [batch_size, length, in_dim] + containing the target embeddings + encoder_output_a: A float32 tensor with shape [batch_size, length, out_dim] + containing the first encoder outputs, uses as the keys + encoder_output_b: A float32 tensor with shape [batch_size, length, src_emb_dim] + containing the second encoder outputs, uses as the values + input_attention_bias: A float32 tensor with shape [batch_size, length, 1] + containing the bias used to mask the paddings + + Returns: + float32 tensor with shape [batch_size, length, out_dim]. + """ + + h_proj = self.tgt_embed_proj(input) + d_proj = (h_proj + target_embed) * math.sqrt(0.5) + att_score = tf.matmul(d_proj, encoder_output_a, transpose_b=True) + + # Masking need to be done in float32. Added to support mixed-precision training. + att_score = tf.cast(x=att_score, dtype=tf.float32) + + # mask out the paddings + if input_attention_bias is not None: + att_score = att_score + input_attention_bias + + att_score = tf.nn.softmax(att_score) + + # Cast back to original type + att_score = tf.cast(x=att_score, dtype=encoder_output_b.dtype) + + length = tf.cast(tf.shape(encoder_output_b), encoder_output_b.dtype) + output = tf.matmul(att_score, encoder_output_b) * \ + length[1] * tf.cast(tf.sqrt(1.0 / length[1]), dtype=encoder_output_b.dtype) + output = self.out_proj(output) + + if self.add_res: + output = (output + input) * math.sqrt(0.5) + + return output
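A hedged usage sketch of the attention layer above. The placeholder shapes are arbitrary, and in_dim is set equal to embed_size so that all projections line up; passing None for input_attention_bias simply skips the padding mask::

    import tensorflow as tf

    batch, tgt_len, src_len, dim = 4, 20, 30, 256

    dec_state  = tf.placeholder(tf.float32, [batch, tgt_len, dim])  # output of a decoder conv block
    tgt_embed  = tf.placeholder(tf.float32, [batch, tgt_len, dim])  # target embeddings
    enc_keys   = tf.placeholder(tf.float32, [batch, src_len, dim])  # encoder_output_a
    enc_values = tf.placeholder(tf.float32, [batch, src_len, dim])  # encoder_output_b

    attn = AttentionLayerNormalized(in_dim=dim, embed_size=dim,
                                    layer_id=0, add_res=True)
    context = attn(dec_state, tgt_embed, enc_keys, enc_values, None)
    # context: [batch, tgt_len, dim]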
\ No newline at end of file
diff --git a/docs/html/_modules/parts/convs2s/conv_wn_layer.html new file mode 100644 index 000000000..39bf548d9 --- /dev/null +++ b/docs/html/_modules/parts/convs2s/conv_wn_layer.html @@ -0,0 +1,338 @@
+ [Sphinx HTML theme boilerplate omitted; page title: parts.convs2s.conv_wn_layer — OpenSeq2Seq 0.2 documentation]

    Source code for parts.convs2s.conv_wn_layer

    +"""Implementation of a 1d convolutional layer with weight normalization.
    +Inspired from https://github.com/tobyyouup/conv_seq2seq"""
    +
    +from __future__ import absolute_import
    +from __future__ import division
    +from __future__ import print_function
    +from __future__ import unicode_literals
    +
    +import tensorflow as tf
    +import math
    +
    +
    +
    [docs]class Conv1DNetworkNormalized(tf.layers.Layer): + """1D convolutional layer with weight normalization""" + +
    [docs] def __init__(self, in_dim, out_dim, kernel_width, mode, layer_id, + hidden_dropout, conv_padding, decode_padding): + """initializes the 1D convolution layer. + It uses weight normalization (Salimans & Kingma, 2016) w = g * v/2-norm(v) + + Args: + in_dim: int last dimension of the inputs + out_dim: int new dimension for the output + kernel_width: int width of kernel + mode: str the current mode + layer_id: int the id of current convolution layer + hidden_dropout: float the keep-dropout value used on the input. + Give 1.0 if no dropout. + It is used to initialize the weights of convolution. + conv_padding: str the type of padding done for convolution + decode_padding: bool specifies if this convolution layer is in decoder or not + in decoder padding is done explicitly before convolution + """ + + super(Conv1DNetworkNormalized, self).__init__() + self.mode = mode + self.conv_padding = conv_padding + self.decode_padding = decode_padding + self.hidden_dropout = hidden_dropout + self.kernel_width = kernel_width + + with tf.variable_scope("conv_layer_" + str(layer_id)): + V_std = math.sqrt(4.0 * hidden_dropout / (kernel_width * in_dim)) + self.V = tf.get_variable( + 'V', + shape=[kernel_width, in_dim, 2 * out_dim], + initializer=tf.random_normal_initializer(mean=0, stddev=V_std), + trainable=True) + self.V_norm = tf.norm(self.V.initialized_value(), axis=[0, 1]) + self.g = tf.get_variable('g', initializer=self.V_norm, trainable=True) + self.b = tf.get_variable( + 'b', + shape=[2 * out_dim], + initializer=tf.zeros_initializer(), + trainable=True) + + self.W = tf.reshape(self.g, [1, 1, 2 * out_dim]) * tf.nn.l2_normalize( + self.V, [0, 1])
    + +
    [docs] def call(self, input): + """Applies convolution with gated linear units on x. + + Args: + x: A float32 tensor with shape [batch_size, length, in_dim] + + Returns: + float32 tensor with shape [batch_size, length, out_dim]. + """ + x = input + if self.mode == "train": + x = tf.nn.dropout(x, self.hidden_dropout) + + if self.decode_padding: + x = tf.pad( + x, [[0, 0], [self.kernel_width - 1, self.kernel_width - 1], [0, 0]], + "CONSTANT") + + output = tf.nn.bias_add( + tf.nn.conv1d( + value=x, filters=self.W, stride=1, padding=self.conv_padding), + self.b) + + if self.decode_padding and self.kernel_width > 1: + output = output[:, 0:-self.kernel_width + 1, :] + + output = self.gated_linear_units(output) + + return output
    + +
    [docs] def gated_linear_units(self, inputs): + """Gated Linear Units (GLU) on x. + + Args: + x: A float32 tensor with shape [batch_size, length, 2*out_dim] + Returns: + float32 tensor with shape [batch_size, length, out_dim]. + """ + input_shape = inputs.get_shape().as_list() + assert len(input_shape) == 3 + input_pass = inputs[:, :, 0:int(input_shape[2] / 2)] + input_gate = inputs[:, :, int(input_shape[2] / 2):] + input_gate = tf.sigmoid(input_gate) + return tf.multiply(input_pass, input_gate)
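For illustration (all sizes and the padding mode are assumptions), a single weight-normalized convolution with GLU could be applied as follows::

    import tensorflow as tf

    x = tf.placeholder(tf.float32, [16, 50, 256])   # [batch, length, in_dim]

    conv = Conv1DNetworkNormalized(
        in_dim=256, out_dim=512, kernel_width=3,
        mode="train", layer_id=0,
        hidden_dropout=0.9,        # keep probability applied to the input in "train" mode
        conv_padding="SAME",
        decode_padding=False,      # True only inside the decoder (explicit causal padding)
    )
    y = conv(x)                    # GLU halves the 2*out_dim channels -> [16, 50, 512]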
\ No newline at end of file
diff --git a/docs/html/_modules/parts/convs2s/ffn_wn_layer.html new file mode 100644 index 000000000..dca35bdc1 --- /dev/null +++ b/docs/html/_modules/parts/convs2s/ffn_wn_layer.html @@ -0,0 +1,303 @@
+ [Sphinx HTML theme boilerplate omitted; page title: parts.convs2s.ffn_wn_layer — OpenSeq2Seq 0.2 documentation]

    Source code for parts.convs2s.ffn_wn_layer

    +"""Implementation of fully connected network with weight normalization.
    +Inspired from https://github.com/tobyyouup/conv_seq2seq"""
    +
    +from __future__ import absolute_import
    +from __future__ import division
    +from __future__ import print_function
    +from __future__ import unicode_literals
    +
    +import tensorflow as tf
    +import math
    +
    +
    +
    [docs]class FeedFowardNetworkNormalized(tf.layers.Layer): + """Fully connected feedforward network with weight normalization""" + +
    [docs] def __init__(self, in_dim, out_dim, dropout, var_scope_name): + """initializes the linear layer. + This layer projects from in_dim-dimenstional space to out_dim-dimentional space. + It uses weight normalization (Salimans & Kingma, 2016) w = g * v/2-norm(v) + + Args: + in_dim: int last dimension of the inputs + out_dim: int new dimension for the output + dropout: float the keep-dropout value used in the previous layer. + It is used to initialize the weights. Give 1.0 if no dropout. + var_scope_name: str the scope name for the weight variables + """ + super(FeedFowardNetworkNormalized, self).__init__() + self.out_dim = out_dim + self.in_dim = in_dim + + with tf.variable_scope(var_scope_name): + V_initializer = \ + tf.random_normal_initializer(mean=0, stddev=math.sqrt(dropout * 1.0 / in_dim)) + self.V = tf.get_variable( + 'V', + shape=[in_dim, out_dim], + initializer=V_initializer, + trainable=True) + self.V_norm = tf.norm(self.V.initialized_value(), axis=0) + self.g = tf.get_variable('g', initializer=self.V_norm, trainable=True) + self.b = tf.get_variable( + 'b', + shape=[out_dim], + initializer=tf.zeros_initializer(), + trainable=True)
    + +
    [docs] def call(self, x): + """Projects x with its linear transformation. + + Args: + x: A float32 tensor with shape [batch_size, length, in_dim] + + Returns: + float32 tensor with shape [batch_size, length, out_dim]. + """ + batch_size = tf.shape(x)[0] + + x = tf.reshape(x, [-1, self.in_dim]) + output = tf.matmul(x, self.V) + output = tf.reshape(output, [batch_size, -1, self.out_dim]) + + # x*(v*(g/2-norm(v))) + b + scaler = tf.div(self.g, tf.norm(self.V, axis=0)) + output = tf.reshape(scaler, [1, self.out_dim]) * output + \ + tf.reshape(self.b, [1, self.out_dim]) + + return output
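A short usage sketch, with made-up shapes, of the weight-normalized projection defined above::

    import tensorflow as tf

    x = tf.placeholder(tf.float32, [8, 50, 256])    # [batch, length, in_dim]

    # weight-normalized projection 256 -> 512; dropout=1.0 means the previous
    # layer applied no dropout, so the initializer is not rescaled
    ffn = FeedFowardNetworkNormalized(in_dim=256, out_dim=512,
                                      dropout=1.0,
                                      var_scope_name="ffn_example")
    y = ffn(x)                                      # [8, 50, 512]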
\ No newline at end of file
diff --git a/docs/html/_modules/parts/rnns/utils.html index 0752d0e09..0406f0b3e 100644 --- a/docs/html/_modules/parts/rnns/utils.html +++ b/docs/html/_modules/parts/rnns/utils.html @@ -166,137 +166,32 @@

    Source code for parts.rnns.utils

     import tensorflow as tf
     
     
    -
    [docs]def create_rnn_cell(cell_type, - cell_params, - num_layers=1, - dp_input_keep_prob=1.0, - dp_output_keep_prob=1.0, - residual_connections=False, - wrap_to_multi_rnn=True): - """ - TODO: MOVE THIS properly to utils. Write doc - :param cell_type: - :param cell_params: - :param num_layers: - :param dp_input_keep_prob: - :param dp_output_keep_prob: - :param residual_connections: - :return: - """ - def single_cell(cell_params): - # TODO: This method is ugly - redo - size = cell_params["num_units"] - proj_size = None if "proj_size" not in cell_params else cell_params["proj_size"] - - if cell_type == "lstm": - if not residual_connections: - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return tf.nn.rnn_cell.LSTMCell(num_units=size, - num_proj=proj_size, - forget_bias=1.0) - else: - return DropoutWrapper(tf.nn.rnn_cell.LSTMCell(num_units=size, - num_proj=proj_size, - forget_bias=1.0), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob) - else: # residual connection required - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return ResidualWrapper(tf.nn.rnn_cell.LSTMCell(num_units=size, - num_proj=proj_size, - forget_bias=1.0)) - else: - return ResidualWrapper(DropoutWrapper( - tf.nn.rnn_cell.LSTMCell( - num_units=size, - num_proj=proj_size, - forget_bias=1.0, - ), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob, - )) - elif cell_type == "gru": - if not residual_connections: - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return tf.nn.rnn_cell.GRUCell(num_units=size) - else: - return DropoutWrapper( - tf.nn.rnn_cell.GRUCell(num_units=size), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob, - ) - else: # residual connection required - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return ResidualWrapper(tf.nn.rnn_cell.GRUCell(num_units=size)) - else: - return ResidualWrapper(DropoutWrapper( - tf.nn.rnn_cell.GRUCell(num_units=size), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob), - ) - elif cell_type == "glstm": - num_groups = cell_params["num_groups"] - if not residual_connections: - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return GLSTMCell(num_units=size, - number_of_groups=num_groups, - num_proj=proj_size, - forget_bias=1.0) - else: - return DropoutWrapper(GLSTMCell(num_units=size, - number_of_groups=num_groups, - num_proj=proj_size, - forget_bias=1.0), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob) - else: # residual connection required - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return ResidualWrapper(GLSTMCell(num_units=size, - number_of_groups=num_groups, - num_proj=proj_size, - forget_bias=1.0)) - else: - return ResidualWrapper(DropoutWrapper( - GLSTMCell( - num_units=size, - number_of_groups=num_groups, - num_proj=proj_size, - forget_bias=1.0, - ), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob, - )) - elif cell_type == "slstm": - if not residual_connections: - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return BasicSLSTMCell(num_units=size) - else: - return DropoutWrapper(BasicSLSTMCell(num_units=size), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob - ) - else: # residual connection required - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return ResidualWrapper(BasicSLSTMCell(num_units=size)) - else: - return 
ResidualWrapper(DropoutWrapper( - BasicSLSTMCell(num_units=size), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob, - )) - else: - raise ValueError("Unknown RNN cell class: {}".format(cell_type)) - - if num_layers > 1: - if wrap_to_multi_rnn: - return MultiRNNCell([single_cell(cell_params) for _ in range(num_layers)]) - else: - cells = [] # for GNMT-like attention in decoder - for i in range(num_layers): - cells.append(single_cell(cell_params)) - return cells - else: - return single_cell(cell_params)
    +
    [docs]def single_cell(cell_class, + cell_params, + dp_input_keep_prob=1.0, + dp_output_keep_prob=1.0, + residual_connections=False): + """Creates an instance of the rnn cell. + Such cell describes one step one layer and can include residual connection + and/or dropout + + Args: + cell_class: Tensorflow RNN cell class + cell_params (dict): cell parameters + dp_input_keep_prob (float): (default: 1.0) input dropout keep probability + dp_output_keep_prob (float): (default: 1.0) output dropout keep probability + residual_connections (bool): whether to add residual connection + + Returns: + TF RNN instance + """ + cell = cell_class(**cell_params) + if residual_connections: + cell = ResidualWrapper(cell) + if dp_input_keep_prob != 1.0 or dp_output_keep_prob != 1.0: + cell = DropoutWrapper(cell, input_keep_prob=dp_input_keep_prob, + output_keep_prob=dp_output_keep_prob) + return cell
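For illustration (the cell class and sizes are arbitrary), one residual LSTM layer with output dropout could now be built as shown below; stacking layers is left to the caller::

    import tensorflow as tf

    cell = single_cell(
        cell_class=tf.nn.rnn_cell.LSTMCell,
        cell_params={"num_units": 512, "forget_bias": 1.0},
        dp_output_keep_prob=0.8,
        residual_connections=True,
    )

    # stacking multiple layers is now done outside of single_cell
    stacked = tf.nn.rnn_cell.MultiRNNCell(
        [single_cell(tf.nn.rnn_cell.LSTMCell, {"num_units": 512})
         for _ in range(2)])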
diff --git a/docs/html/_modules/parts/transformer/beam_search_test.html deleted file mode 100644 index 3c9e88cfe..000000000 --- a/docs/html/_modules/parts/transformer/beam_search_test.html +++ /dev/null @@ -1,337 +0,0 @@
- [Sphinx HTML theme boilerplate removed along with the page; page title: parts.transformer.beam_search_test — OpenSeq2Seq 0.2 documentation]

    Source code for parts.transformer.beam_search_test

    -# Copyright 2018 MLBenchmark Group. All Rights Reserved.
    -#
    -# Licensed under the Apache License, Version 2.0 (the "License");
    -# you may not use this file except in compliance with the License.
    -# You may obtain a copy of the License at
    -#
    -#     http://www.apache.org/licenses/LICENSE-2.0
    -#
    -# Unless required by applicable law or agreed to in writing, software
    -# distributed under the License is distributed on an "AS IS" BASIS,
    -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -# See the License for the specific language governing permissions and
    -# limitations under the License.
    -# ==============================================================================
    -"""Test beam search helper methods."""
    -
    -from __future__ import absolute_import
    -from __future__ import division
    -from __future__ import print_function
    -
    -import tensorflow as tf
    -
    -from . import beam_search
    -#import beam_search
    -
    -
    -
    [docs]class BeamSearchHelperTests(tf.test.TestCase): - -
    [docs] def test_expand_to_beam_size(self): - x = tf.ones([7, 4, 2, 5]) - x = beam_search._expand_to_beam_size(x, 3) - with self.test_session() as sess: - shape = sess.run(tf.shape(x)) - self.assertAllEqual([7, 3, 4, 2, 5], shape)
    - -
    [docs] def test_shape_list(self): - y = tf.constant(4.0) - x = tf.ones([7, tf.to_int32(tf.sqrt(y)), 2, 5]) - shape = beam_search._shape_list(x) - self.assertIsInstance(shape[0], int) - self.assertIsInstance(shape[1], tf.Tensor) - self.assertIsInstance(shape[2], int) - self.assertIsInstance(shape[3], int)
    - -
    [docs] def test_get_shape_keep_last_dim(self): - y = tf.constant(4.0) - x = tf.ones([7, tf.to_int32(tf.sqrt(y)), 2, 5]) - shape = beam_search._get_shape_keep_last_dim(x) - self.assertAllEqual([None, None, None, 5], - shape.as_list())
    - -
    [docs] def test_flatten_beam_dim(self): - x = tf.ones([7, 4, 2, 5]) - x = beam_search._flatten_beam_dim(x) - with self.test_session() as sess: - shape = sess.run(tf.shape(x)) - self.assertAllEqual([28, 2, 5], shape)
    - -
    [docs] def test_unflatten_beam_dim(self): - x = tf.ones([28, 2, 5]) - x = beam_search._unflatten_beam_dim(x, 7, 4) - with self.test_session() as sess: - shape = sess.run(tf.shape(x)) - self.assertAllEqual([7, 4, 2, 5], shape)
    - -
    [docs] def test_gather_beams(self): - x = tf.reshape(tf.range(24), [2, 3, 4]) - # x looks like: [[[ 0 1 2 3] - # [ 4 5 6 7] - # [ 8 9 10 11]] - # - # [[12 13 14 15] - # [16 17 18 19] - # [20 21 22 23]]] - - y = beam_search._gather_beams(x, [[1, 2], [0, 2]], 2, 2) - with self.test_session() as sess: - y = sess.run(y) - - self.assertAllEqual([[[4, 5, 6, 7], - [8, 9, 10, 11]], - [[12, 13, 14, 15], - [20, 21, 22, 23]]], - y)
    - -
    [docs] def test_gather_topk_beams(self): - x = tf.reshape(tf.range(24), [2, 3, 4]) - x_scores = [[0, 1, 1], [1, 0, 1]] - - y = beam_search._gather_topk_beams(x, x_scores, 2, 2) - with self.test_session() as sess: - y = sess.run(y) - - self.assertAllEqual([[[4, 5, 6, 7], - [8, 9, 10, 11]], - [[12, 13, 14, 15], - [20, 21, 22, 23]]], - y)
    - - -if __name__ == "__main__": - tf.test.main() -
\ No newline at end of file
diff --git a/docs/html/_modules/parts/transformer/embedding_layer.html index f51ff5d3c..5e5cef04e 100644 --- a/docs/html/_modules/parts/transformer/embedding_layer.html +++ b/docs/html/_modules/parts/transformer/embedding_layer.html @@ -181,23 +181,31 @@

    Source code for parts.transformer.embedding_layer

    [docs]class EmbeddingSharedWeights(tf.layers.Layer): """Calculates input embeddings and pre-softmax linear with shared weights.""" - def __init__(self, vocab_size, hidden_size, pad2eight=False): + def __init__(self, vocab_size, hidden_size, pad_vocab_to_eight=False, init_var=None, + embed_scale=True, pad_sym=0, mask_paddings=True): super(EmbeddingSharedWeights, self).__init__() - self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.embed_scale = embed_scale + self.pad_sym = pad_sym + self.mask_paddings = mask_paddings + padf = lambda x: x if x % 8 == 0 else x + 8 - x % 8 - if pad2eight: - self.hidden_size = padf(hidden_size) + if pad_vocab_to_eight: + self.vocab_size = padf(vocab_size) + else: + self.vocab_size = vocab_size + + if init_var is None: + self.init_var = hidden_size ** -0.5 else: - self.hidden_size = hidden_size + self.init_var = init_var
    [docs] def build(self, _): with tf.variable_scope("embedding_and_softmax", reuse=tf.AUTO_REUSE): # Create and initialize weights. The random normal initializer was chosen # randomly, and works well. - self.shared_weights = tf.get_variable( - "weights", [self.vocab_size, self.hidden_size], - initializer=tf.random_normal_initializer( - 0., self.hidden_size ** -0.5)) + self.shared_weights = tf.get_variable("weights", [self.vocab_size, self.hidden_size], + initializer=tf.random_normal_initializer(0., self.init_var)) self.built = True
    @@ -213,18 +221,18 @@

    Source code for parts.transformer.embedding_layer

    """ with tf.name_scope("embedding"): embeddings = tf.gather(self.shared_weights, x) - - # Scale embedding by the sqrt of the hidden size - embeddings *= self.hidden_size ** 0.5 - - # Create binary array of size [batch_size, length] - # where 1 = padding, 0 = not padding - padding = model_utils.get_padding(x) - - # Set all padding embedding values to 0 - #embeddings *= tf.expand_dims(1 - padding, -1) - embeddings *= tf.cast(tf.expand_dims(1 - padding, -1), - dtype=embeddings.dtype) + if self.embed_scale: + # Scale embedding by the sqrt of the hidden size + embeddings *= self.hidden_size ** 0.5 + + if self.mask_paddings: + # Create binary array of size [batch_size, length] + # where 1 = padding, 0 = not padding + padding = model_utils.get_padding(x, padding_value=self.pad_sym) + + # Set all padding embedding values to 0 + #embeddings *= tf.expand_dims(1 - padding, -1) + embeddings *= tf.cast(tf.expand_dims(1.0 - padding, -1), dtype=embeddings.dtype) return embeddings
    [docs] def linear(self, x): diff --git a/docs/html/_modules/parts/transformer/utils.html b/docs/html/_modules/parts/transformer/utils.html index 14c3a5604..166eeefe4 100644 --- a/docs/html/_modules/parts/transformer/utils.html +++ b/docs/html/_modules/parts/transformer/utils.html @@ -230,23 +230,23 @@

    Source code for parts.transformer.utils

       return decoder_bias
    -
    [docs]def get_padding(x, padding_value=0): +
[docs]def get_padding(x, padding_value=0, dtype=tf.float32): """Return float tensor representing the padding values in x. Args: x: int tensor with any shape padding_value: int value that + dtype: type of the output Returns: float tensor with same shape as x containing values 0 or 1. 0 -> non-padding, 1 -> padding """ with tf.name_scope("padding"): - return tf.to_float(tf.equal(x, padding_value))
    - #return tf.cast(tf.equal(x, padding_value), dtype=x.dtype) + return tf.cast(tf.equal(x, padding_value), dtype=dtype)
    -
    [docs]def get_padding_bias(x): +
    [docs]def get_padding_bias(x, res_rank=4, pad_sym=0): """Calculate bias tensor from padding values in tensor. Bias tensor that is added to the pre-softmax multi-headed attention logits, @@ -255,17 +255,25 @@

    Source code for parts.transformer.utils

     
       Args:
         x: int tensor with shape [batch_size, length]
    +    res_rank: int indicates the rank of attention_bias.
    +    dtype: type of the output attention_bias
    +    pad_sym: int the symbol used for padding
     
       Returns:
    -    Attention bias tensor of shape [batch_size, 1, 1, length].
    +    Attention bias tensor of shape
    +    [batch_size, 1, 1, length] if  res_rank = 4 - for Transformer
    +    or [batch_size, 1, length] if res_rank = 3 - for ConvS2S
       """
       with tf.name_scope("attention_bias"):
    -    padding = get_padding(x)
    +    padding = get_padding(x, padding_value=pad_sym)
         attention_bias = padding * _NEG_INF
    -    attention_bias = tf.expand_dims(
    -        tf.expand_dims(attention_bias, axis=1), axis=1)
    +    if res_rank == 4:
    +      attention_bias = tf.expand_dims(tf.expand_dims(attention_bias, axis=1), axis=1)
    +    elif res_rank == 3:
    +      attention_bias = tf.expand_dims(attention_bias, axis=1)
    +    else:
    +      raise ValueError("res_rank should be 3 or 4 but got {}".format(res_rank))
       return attention_bias
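A small sketch of the new res_rank switch; the token ids are made up and 0 is used as the padding symbol (the default pad_sym)::

    import tensorflow as tf

    ids = tf.constant([[5, 7, 0, 0],
                       [3, 0, 0, 0]])                     # 0 marks padding

    bias_transformer = get_padding_bias(ids)              # shape [2, 1, 1, 4]
    bias_convs2s     = get_padding_bias(ids, res_rank=3)  # shape [2, 1, 4]

    with tf.Session() as sess:
        # padded positions get a large negative bias, the rest are 0
        print(sess.run(bias_convs2s))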
    -
    diff --git a/docs/html/_modules/utils/funcs.html b/docs/html/_modules/utils/funcs.html index e18613100..28a2ac808 100644 --- a/docs/html/_modules/utils/funcs.html +++ b/docs/html/_modules/utils/funcs.html @@ -164,7 +164,8 @@

    Source code for utils.funcs

     
     from .hooks import PrintSamplesHook, RunEvaluationHook, PrintLossAndTimeHook, \
                        BroadcastGlobalVariablesHook
    -from open_seq2seq.utils.utils import deco_print, get_results_for_epoch
    +from open_seq2seq.utils.utils import deco_print, get_results_for_epoch, \
    +                                     collect_if_horovod
     from tensorflow.python import debug as tf_debug
     
     
    @@ -272,7 +273,17 @@ 

    Source code for utils.funcs

             break
           tm = time.time()
           try:
    -        fetches_vals = sess.run(fetches)
    +        feed_dict = {}
    +        iter_size = train_model.params.get('iter_size', 1)
    +        if iter_size > 1:
    +          feed_dict[train_model.skip_update_ph] = step % iter_size != 0
    +        if step % iter_size == 0:
    +          fetches_vals = sess.run(fetches, feed_dict)
    +        else:
    +          # necessary to skip "no-update" steps when iter_size > 1
    +          def run_with_no_hooks(step_context):
    +            return step_context.session.run(fetches, feed_dict)
    +          fetches_vals = sess.run_step_fn(run_with_no_hooks)
           except tf.errors.OutOfRangeError:
             break
           if step >= bench_start:
    @@ -280,29 +291,29 @@ 

    Source code for utils.funcs

             if len(fetches) > 1:
               for i in range(train_model.num_gpus):
                 total_objects += np.sum(fetches_vals[i + 1])
    +          if train_model.params['print_bench_info_steps'] is not None:
    +            if step % train_model.params['print_bench_info_steps'] == 0:
    +              total_objects_cur = collect_if_horovod(total_objects, hvd,
    +                                                     mode="sum")
    +              if master_worker:
    +                avg_objects = 1.0 * total_objects_cur / total_time
    +                deco_print("Avg objects per second: {:.3f}".format(avg_objects))
    +
           step += 1
     
    -  if hvd is not None:
    -    deco_print("Finished training on rank {}".format(hvd.rank()))
    -  else:
    -    deco_print("Finished training")
    +  if len(fetches) > 1:
    +    total_objects = collect_if_horovod(total_objects, hvd, mode="sum")
     
    -  if train_model.on_horovod:
    -    ending = " on worker {}".format(hvd.rank())
    -  else:
    -    ending = ""
    -  if step > bench_start:
    -    deco_print(
    -      "Avg time per step{}: {:.3f}s".format(
    -        ending, 1.0 * total_time / (step - bench_start))
    -    )
    -    if len(fetches) > 1:
    -      deco_print(
    -        "Avg objects per second{}: {:.3f}".format(
    -          ending, 1.0 * total_objects / total_time)
    -      )
    -  else:
    -    deco_print("Not enough steps for benchmarking{}".format(ending))
    + if master_worker: + deco_print("Finished training") + if step > bench_start: + avg_time = 1.0 * total_time / (step - bench_start) + deco_print("Avg time per step: {:.3f}s".format(avg_time)) + if len(fetches) > 1: + avg_objects = 1.0 * total_objects / total_time + deco_print("Avg objects per second: {:.3f}".format(avg_objects)) + else: + deco_print("Not enough steps for benchmarking")
    [docs]def restore_and_get_results(model, checkpoint, mode): diff --git a/docs/html/_modules/utils/hooks.html b/docs/html/_modules/utils/hooks.html index 60851a684..e4c74a11a 100644 --- a/docs/html/_modules/utils/hooks.html +++ b/docs/html/_modules/utils/hooks.html @@ -244,10 +244,10 @@

    Source code for utils.hooks

         self._timer.update_last_triggered_step(self._iter_count - 1)
     
         input_values, output_values = results
    -    dict_to_log = self._model.maybe_print_logs(input_values, output_values)
    +    dict_to_log = self._model.maybe_print_logs(input_values, output_values, step)
         # optionally logging to tensorboard any values
         # returned from maybe_print_logs
    -    if dict_to_log:
    +    if self._model.params['save_summaries_steps'] and dict_to_log:
           log_summaries_from_dict(
             dict_to_log,
             self._model.params['logdir'],
    @@ -348,11 +348,12 @@ 

    Source code for utils.hooks

         if not self._model.on_horovod or self._model.hvd.rank() == 0:
           deco_print("Validation loss: {:.4f}".format(total_loss), offset=4)
     
    -      dict_to_log = self._model.finalize_evaluation(results_per_batch)
    +      dict_to_log = self._model.finalize_evaluation(results_per_batch, step)
           dict_to_log['eval_loss'] = total_loss
     
           # saving the best validation model
    -      if total_loss < self._best_eval_loss:
    +      if self._model.params['save_checkpoint_steps'] and \
    +         total_loss < self._best_eval_loss:
             self._best_eval_loss = total_loss
             self._eval_saver.save(
               run_context.session,
    @@ -363,7 +364,7 @@ 

    Source code for utils.hooks

     
           # optionally logging to tensorboard any values
           # returned from maybe_print_logs
    -      if dict_to_log:
    +      if self._model.params['save_summaries_steps']:
             log_summaries_from_dict(
               dict_to_log,
               self._model.params['logdir'],
    diff --git a/docs/html/_modules/utils/utils.html b/docs/html/_modules/utils/utils.html
    index 412bf547e..1dc41f439 100644
    --- a/docs/html/_modules/utils/utils.html
    +++ b/docs/html/_modules/utils/utils.html
    @@ -159,6 +159,7 @@ 

    Source code for utils.utils

     from six.moves import range
     from six import string_types
     
    +import six
     import tensorflow as tf
     import subprocess
     import numpy as np
    @@ -186,6 +187,42 @@ 

    Source code for utils.utils

                                   dense_shape_clipped)
    +
    [docs]def collect_if_horovod(value, hvd, mode='sum'): + """Collects values from all workers if run on Horovod. + Note, that on all workers except first this function will return None. + + Args: + value: value to collect. + hvd: horovod.tensorflow module or None + mode: could be "sum", "mean" or "gather", indicating reduce_sum or gather. + For "sum" and "mean" value has to be numerical, for "gather", value has + to be iterable. + + Returns: + collected results if run on Horovod or value otherwise. + """ + if hvd is None: + return value + + import mpi4py.rc + mpi4py.rc.initialize = False + from mpi4py import MPI + + values = MPI.COMM_WORLD.gather(value) + # synchronize all workers + MPI.COMM_WORLD.Barrier() + + if MPI.COMM_WORLD.Get_rank() != 0: + return None + + if mode == 'sum': + return np.sum(values) + elif mode == 'mean': + return np.mean(values) + elif mode == 'gather': + return [item for sl in values for item in sl]
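A hedged sketch of calling this helper from training code; the per-worker value is made up, and hvd is either the horovod.tensorflow module or None, as the docstring above requires::

    # hvd is the horovod.tensorflow module, or None for a single-process run
    try:
        import horovod.tensorflow as hvd
        hvd.init()
    except ImportError:
        hvd = None

    local_count = 1234.0                      # per-worker value (illustrative)
    total = collect_if_horovod(local_count, hvd, mode="sum")
    if total is not None:                     # only the first worker gets a result
        print("total objects across all workers:", total)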
    + +
    [docs]def clip_last_batch(last_batch, true_size): last_batch_clipped = [] for val in last_batch: @@ -196,179 +233,172 @@

    Source code for utils.utils

       return last_batch_clipped
    -
    [docs]def iterate_data_layer(model, dl_id, sess, compute_loss, mode, verbose): +
    [docs]def iterate_data(model, sess, compute_loss, mode, verbose): total_time = 0.0 bench_start = model.params.get('bench_start', 10) results_per_batch = [] - if model.on_horovod: - data_layer = model.get_data_layer() - if compute_loss: - loss_tensor = model.eval_losses[0] - output_tensors = model.get_output_tensors() - else: - data_layer = model.get_data_layer(dl_id) - if compute_loss: - loss_tensor = model.eval_losses[dl_id] - output_tensors = model.get_output_tensors(dl_id) - - sess.run(data_layer.iterator.initializer) - - fetches = [ - data_layer.input_tensors, - output_tensors, - ] + size_defined = model.get_data_layer().get_size_in_samples() is not None + if size_defined: + dl_sizes = [] if compute_loss: - fetches.append(loss_tensor) total_loss = 0.0 - total_samples = 0.0 - size_defined = data_layer.get_size_in_samples() is not None + total_samples = [] + fetches = [] - if size_defined: - data_size = data_layer.get_size_in_samples() // \ - data_layer.params['batch_size'] - last_batch_size = data_layer.get_size_in_samples() % \ - data_layer.params['batch_size'] + # on horovod num_gpus is 1 + for worker_id in range(model.num_gpus): + cur_fetches = [ + model.get_data_layer(worker_id).input_tensors, + model.get_output_tensors(worker_id), + ] + if compute_loss: + cur_fetches.append(model.eval_losses[worker_id]) + if size_defined: + dl_sizes.append(model.get_data_layer(worker_id).get_size_in_samples()) + try: + total_objects = 0.0 + cur_fetches.append(model.get_num_objects_per_step(worker_id)) + except NotImplementedError: + total_objects = None + deco_print("WARNING: Can't compute number of objects per step, since " + "train model does not define get_num_objects_per_step method.") + fetches.append(cur_fetches) + total_samples.append(0.0) + + sess.run([model.get_data_layer(i).iterator.initializer + for i in range(model.num_gpus)]) + + step = 0 + processed_batches = 0 + if verbose: + if model.on_horovod: + ending = " on worker {}".format(model.hvd.rank()) + else: + ending = "" - if model.on_horovod: - worker_id = model.hvd.rank() - else: - worker_id = dl_id + while True: + tm = time.time() + fetches_vals = {} + if size_defined: + fetches_to_run = {} + # removing finished data layers + for worker_id in range(model.num_gpus): + if total_samples[worker_id] < dl_sizes[worker_id]: + fetches_to_run[worker_id] = fetches[worker_id] + fetches_vals = sess.run(fetches_to_run) + else: + # if size is not defined we have to process fetches sequentially, so not + # to lose data when exception is thrown on one data layer + for worker_id, one_fetch in enumerate(fetches): + try: + fetches_vals[worker_id] = sess.run(one_fetch) + except tf.errors.OutOfRangeError: + continue - cross_over = 0 - if size_defined: - if data_size == 0: - raise ValueError( - "Batch size is bigger than dataset size: {} > {}".format( - data_layer.params['batch_size'], data_layer.get_size_in_samples() - ) - ) - if last_batch_size != 0: - cross_over = 1 - else: - # setting data_size to be infinity and assume - # that tf.errors.OutOfRangeError will be raised - data_size = 1000000000000 + if step >= bench_start: + total_time += time.time() - tm - for step in range(data_size + cross_over): - tm = time.time() - try: + # looping over num_gpus. 
In Horovod case this loop is "dummy", + # since num_gpus = 1 + for worker_id, fetches_val in fetches_vals.items(): if compute_loss: - inputs, outputs, loss = sess.run(fetches) + inputs, outputs, loss = fetches_val[:3] else: - inputs, outputs = sess.run(fetches) - except tf.errors.OutOfRangeError: - break - if step >= bench_start: - total_time += time.time() - tm + inputs, outputs = fetches_val[:2] - # assuming any element of inputs["source_tensors"][ shape[0] is batch size - batch_size = inputs["source_tensors"][0].shape[0] + if total_objects is not None: + total_objects += np.sum(fetches_val[-1]) - if compute_loss: - total_loss += loss * batch_size - total_samples += batch_size + # assuming any element of inputs["source_tensors"] .shape[0] is batch size + batch_size = inputs["source_tensors"][0].shape[0] + total_samples[worker_id] += batch_size - if size_defined and step == data_size: - inputs["source_tensors"] = model.clip_last_batch( - inputs["source_tensors"], last_batch_size, - ) - if 'target_tensors' in inputs: - inputs["target_tensors"] = model.clip_last_batch( - inputs["target_tensors"], last_batch_size, - ) - outputs = model.clip_last_batch(outputs, last_batch_size) - - if mode == 'eval': - results_per_batch.append(model.evaluate(inputs, outputs)) - elif mode == 'infer': - results_per_batch.append(model.infer(inputs, outputs)) - else: - raise ValueError("Unknown mode: {}".format(mode)) + if size_defined: + # this data_layer is at the last batch with few more elements, cutting + if total_samples[worker_id] > dl_sizes[worker_id]: + last_batch_size = dl_sizes[worker_id] % batch_size + for key, value in inputs.items(): + inputs[key] = model.clip_last_batch(value, last_batch_size) + outputs = model.clip_last_batch(outputs, last_batch_size) + + processed_batches += 1 + + if compute_loss: + total_loss += loss * batch_size + + if mode == 'eval': + results_per_batch.append(model.evaluate(inputs, outputs)) + elif mode == 'infer': + results_per_batch.append(model.infer(inputs, outputs)) + else: + raise ValueError("Unknown mode: {}".format(mode)) if verbose: if size_defined: - if data_size > 10 and step % (data_size // 10) == 0: - deco_print("Processed {}/{} batches on worker {}".format( - step + 1, data_size, worker_id)) + data_size = int(np.sum(np.ceil(np.array(dl_sizes) / + model.params['batch_size_per_gpu']))) + if step == 0 or len(fetches_vals) == 0 or \ + (data_size > 10 and processed_batches % (data_size // 10) == 0): + deco_print("Processed {}/{} batches{}".format( + processed_batches, data_size, ending)) else: - deco_print("Processed {} batches".format(step + 1), end='\r') + deco_print("Processed {} batches{}".format(processed_batches, ending), + end='\r') + + if len(fetches_vals) == 0: + break + step += 1 if verbose: if step > bench_start: deco_print( - "Avg time per step: {:.3}s on worker {}".format( - 1.0 * total_time / (step - bench_start), worker_id), + "Avg time per step{}: {:.3}s".format( + ending, 1.0 * total_time / (step - bench_start)), ) + if total_objects is not None: + avg_objects = 1.0 * total_objects / total_time + deco_print("Avg objects per second{}: {:.3f}".format(ending, + avg_objects)) else: deco_print( - "Not enough steps for benchmarking on worker {}".format(worker_id) + "Not enough steps for benchmarking{}".format(ending) ) if compute_loss: - return results_per_batch, total_loss, total_samples + return results_per_batch, total_loss, np.sum(total_samples) else: return results_per_batch
    [docs]def get_results_for_epoch(model, sess, compute_loss, mode, verbose=False): - if model.on_horovod: - if compute_loss: - results_per_batch, total_loss, total_samples = iterate_data_layer( - model, 0, sess, compute_loss, mode, verbose, - ) - else: - results_per_batch = iterate_data_layer( - model, 0, sess, compute_loss, mode, verbose, - ) + if compute_loss: + results_per_batch, total_loss, total_samples = iterate_data( + model, sess, compute_loss, mode, verbose, + ) else: - results_per_batch_all = [] - total_loss_all = [] - total_samples_all = [] - for dl_id in range(model.num_gpus): - if compute_loss: - results_per_batch, total_loss, total_samples = iterate_data_layer( - model, dl_id, sess, compute_loss, mode, verbose, - ) - total_loss_all.append(total_loss) - total_samples_all.append(total_samples) - else: - results_per_batch = iterate_data_layer( - model, dl_id, sess, compute_loss, mode, verbose, - ) - results_per_batch_all.append(results_per_batch) + results_per_batch = iterate_data( + model, sess, compute_loss, mode, verbose, + ) - if model.on_horovod: - import mpi4py.rc - mpi4py.rc.initialize = False - from mpi4py import MPI + if compute_loss: + total_samples = collect_if_horovod(total_samples, model.hvd, 'sum') + total_loss = collect_if_horovod(total_loss, model.hvd, 'sum') + results_per_batch = collect_if_horovod(results_per_batch, model.hvd, 'gather') + if results_per_batch is None: + # returning dummy tuple of correct shape if not in master worker if compute_loss: - total_samples_all = MPI.COMM_WORLD.gather(total_samples) - total_loss_all = MPI.COMM_WORLD.gather(total_loss) - results_per_batch_all = MPI.COMM_WORLD.gather(results_per_batch) - - MPI.COMM_WORLD.Barrier() - if MPI.COMM_WORLD.Get_rank() != 0: - # returning dummy tuple of correct shape - if compute_loss: - return None, None - else: - return None - - if compute_loss: - total_loss = np.sum(total_loss_all) - total_samples = np.sum(total_samples_all) - # moving GPU dimension into the batch dimension - results_per_batch = [item for sl in results_per_batch_all for item in sl] + return None, None + else: + return None if compute_loss: - total_loss /= total_samples - return results_per_batch, total_loss - - return results_per_batch
    + return results_per_batch, total_loss / total_samples + else: + return results_per_batch
    [docs]def log_summaries_from_dict(dict_to_log, output_dir, step): @@ -441,7 +471,14 @@

    Source code for utils.utils

     
    [docs]def nested_update(org_dict, upd_dict): for key, value in upd_dict.items(): if isinstance(value, dict): - nested_update(org_dict[key], value) + if key in org_dict: + if not isinstance(org_dict[key], dict): + raise ValueError( + "Mismatch between org_dict and upd_dict at node {}".format(key) + ) + nested_update(org_dict[key], value) + else: + org_dict[key] = value else: org_dict[key] = value
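A quick illustration of the new merge behaviour with made-up config dictionaries::

    base = {"optimizer_params": {"lr": 0.001, "momentum": 0.9}, "batch_size": 32}
    upd = {"optimizer_params": {"lr": 0.01, "larc_params": {"larc_eta": 0.001}}}

    nested_update(base, upd)
    # base == {"optimizer_params": {"lr": 0.01, "momentum": 0.9,
    #                               "larc_params": {"larc_eta": 0.001}},
    #          "batch_size": 32}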
    @@ -454,7 +491,10 @@

    Source code for utils.utils

     
     
     
    [docs]def deco_print(line, offset=0, start="*** ", end='\n'): - print(start + " " * offset + line, end=end)
    + if six.PY2: + print((start + " " * offset + line).encode('utf-8'), end=end) + else: + print(start + " " * offset + line, end=end)
    [docs]def array_to_string(row, vocab, delim=' '): diff --git a/docs/html/_sources/api-docs/decoders.rst.txt b/docs/html/_sources/api-docs/decoders.rst.txt index 07c22c247..681f5d1e1 100644 --- a/docs/html/_sources/api-docs/decoders.rst.txt +++ b/docs/html/_sources/api-docs/decoders.rst.txt @@ -37,3 +37,11 @@ transformer\_decoders :members: :undoc-members: :show-inheritance: + +convs2s\_decoder +------------------------------------- + +.. automodule:: decoders.convs2s_decoder + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/html/_sources/api-docs/encoders.rst.txt b/docs/html/_sources/api-docs/encoders.rst.txt index 362287eb6..5c11d1f26 100644 --- a/docs/html/_sources/api-docs/encoders.rst.txt +++ b/docs/html/_sources/api-docs/encoders.rst.txt @@ -22,6 +22,14 @@ ds2\_encoder :undoc-members: :show-inheritance: +w2l\_encoder +---------------------------- + +.. automodule:: encoders.w2l_encoder + :members: + :undoc-members: + :show-inheritance: + rnn\_encoders ----------------------------- @@ -38,6 +46,14 @@ transformer\_encoders :undoc-members: :show-inheritance: +convs2s\_encoder +------------------------------------- + +.. automodule:: encoders.convs2s_encoder + :members: + :undoc-members: + :show-inheritance: + resnet\_encoder ---------------------------------- @@ -53,3 +69,12 @@ resnet\_blocks :members: :undoc-members: :show-inheritance: + + +cnn\_encoder +-------------------------------- + +.. automodule:: encoders.cnn_encoder + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/html/_sources/api-docs/parts.cnns.rst.txt b/docs/html/_sources/api-docs/parts.cnns.rst.txt new file mode 100644 index 000000000..631cb86c1 --- /dev/null +++ b/docs/html/_sources/api-docs/parts.cnns.rst.txt @@ -0,0 +1,15 @@ +cnns +======================================= + +.. automodule:: parts.cnns + :members: + :undoc-members: + :show-inheritance: + +conv\_blocks +------------------------------------------------------- + +.. automodule:: parts.cnns.conv_blocks + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/html/_sources/api-docs/parts.convs2s.rst.txt b/docs/html/_sources/api-docs/parts.convs2s.rst.txt new file mode 100644 index 000000000..226652c72 --- /dev/null +++ b/docs/html/_sources/api-docs/parts.convs2s.rst.txt @@ -0,0 +1,31 @@ +convs2s +======================================= + +.. automodule:: parts.convs2s + :members: + :undoc-members: + :show-inheritance: + +attention\_wn\_layer +------------------------------------------------------- + +.. automodule:: parts.convs2s.attention_wn_layer + :members: + :undoc-members: + :show-inheritance: + +conv\_wn\_layer +------------------------------------------------------- + +.. automodule:: parts.convs2s.conv_wn_layer + :members: + :undoc-members: + :show-inheritance: + +ffn\_wn\_layer +------------------------------------------------------- + +.. 
automodule:: parts.convs2s.ffn_wn_layer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/html/_sources/api-docs/parts.rst.txt b/docs/html/_sources/api-docs/parts.rst.txt index 3f85cb82a..6a57d9287 100644 --- a/docs/html/_sources/api-docs/parts.rst.txt +++ b/docs/html/_sources/api-docs/parts.rst.txt @@ -10,3 +10,5 @@ parts parts.rnns parts.transformer + parts.convs2s + parts.cnns \ No newline at end of file diff --git a/docs/html/_sources/api-docs/parts.transformer.rst.txt b/docs/html/_sources/api-docs/parts.transformer.rst.txt index 7dab39e8a..8fa9237fd 100644 --- a/docs/html/_sources/api-docs/parts.transformer.rst.txt +++ b/docs/html/_sources/api-docs/parts.transformer.rst.txt @@ -22,14 +22,6 @@ beam\_search :undoc-members: :show-inheritance: -beam\_search\_test ---------------------------------------------------------- - -.. automodule:: parts.transformer.beam_search_test - :members: - :undoc-members: - :show-inheritance: - common --------------------------------------------- diff --git a/docs/html/_sources/installation-instructions.rst.txt b/docs/html/_sources/installation-instructions.rst.txt index ba09bbccc..0e4c7d110 100644 --- a/docs/html/_sources/installation-instructions.rst.txt +++ b/docs/html/_sources/installation-instructions.rst.txt @@ -32,7 +32,7 @@ run unittests:: python -m unittest discover -s open_seq2seq -p '*_test.py' -It might take up to 10 minutes. You should see a lot of output, but no errors +It might take up to 30 minutes. You should see a lot of output, but no errors in the end. .. _installation_speech: diff --git a/docs/html/_sources/models-and-recipes.rst.txt b/docs/html/_sources/models-and-recipes.rst.txt index da71d6e6a..70de3f5ce 100644 --- a/docs/html/_sources/models-and-recipes.rst.txt +++ b/docs/html/_sources/models-and-recipes.rst.txt @@ -3,18 +3,17 @@ Models and recipes ================== -.. This section will contain information about different models that OpenSeq2Seq -.. supports, exact config parameters to train them, final training/validation/test -.. metrics and links to checkpoints (tensorboards also?) of trained models. .. note:: Currently OpenSeq2Seq has model implementations for machine translation and - automatic speech recognition. All models work both in float32 and mixed precision. - We recommend you use :ref:`mixed precision training ` when training on Volta GPUs. + automatic speech recognition. + All models work both in float32 and mixed precision. + We recommend you use :ref:`mixed precision training ` + when training on Volta GPUs. -To train models you can use the following -commands (don't forget to substitute valid config_file path there and number of GPUs if using Horovod). +To train models you can use the following commands (don't forget to substitute +valid config_file path there and number of GPUs if using Horovod). With Horovod (highly recommended when using multiple GPUs):: @@ -29,6 +28,16 @@ The description of implemented models is available in the next sections: Machine translation ------------------- +The table below contains description and results of +machine translation models available in OpenSeq2Seq. +Currently, we have GNMT-based model, Transformer-based models and +ConvS2S-based models. + +We measure BLEU score on newstest2014.tok.de file using ``multi-bleu.perl`` script from Mosses. +For more details about model descriptions and training setup, +have a look at the `configuration files `_. + + .. 
list-table:: :widths: 1 1 1 1 1 :header-rows: 1 @@ -38,72 +47,87 @@ Machine translation - Training setup and additional comments - Short description of the model - Checkpoint - * - `en-de-nmt-small.py `_ + * - `en-de-nmt-small.py `_ - 20.23 - This model should train on a single GPU such as 1080Ti. It is trained using Adam optimizer. - RNN-based. Bi-directional encoder with 2 layers and. GNMT-like decoder with 2 layers and attention. Uses LSTM cells of size 512. - `link `_ - * - `en-de-gnmt-like-4GPUs.py `_ + * - `en-de-gnmt-like-4GPUs.py `_ - 23.89 - This model was trained on 4 GPUs with Adam optimizer and learning rate decay. - RNN-based. This is GNMT-like model which tries to match the one described in https://arxiv.org/abs/1609.08144 as close as possible. - `link `_ - * - `transformer-big.py `_ + * - `transformer-big.py `_ - 26.17 - This model was trained on 4 GPUs with Adam optimizer and learning rate decay. - Transformer "big" model. This model does not have any RNN layers - `link `_ + * - `en-de-convs2s.py `_ + - xx.xx + - This model was trained on 4 GPUs with Adam optimizer, learning rate decay and warm-up. + - This is an implementation of the ConvS2S model proposed in https://arxiv.org/abs/1705.03122. + - Coming soon. -GNMT model description can be found `here `_. -Transformer model description can be found `here `_. -We measure BLEU score on newstest2014.tok.de file using ``multi-bleu.perl`` script from Mosses. +GNMT model description: https://arxiv.org/abs/1609.08144. + +Transformer model description: https://arxiv.org/abs/1706.03762. + +ConvS2S model description: https://arxiv.org/abs/1705.03122. Speech recognition ------------------ -Deep Speech 2 based models -~~~~~~~~~~~~~~~~~~~~~~~~~~ -Original Deep Speech 2 model description: https://arxiv.org/abs/1512.02595. The table below contains description and results of -Deep Speech 2 based models available in OpenSeq2Seq. +speech recognition models available in OpenSeq2Seq. +Currently, we have DeepSpeech2-based models and Wav2Letter-based models. -WER-512 and WER-2048 is word error rate obtained with beam width of 512 and 2048 -correspondingly on a dev-clean subset of LibriSpeech. For beam width of 2048 we also used ``batch_size_per_gpu = 1`` +WER is the word error rate obtained on a dev-clean subset of LibriSpeech using +greedy decoder (``decoder_params/use_language_model = False``). +For the final evaluation we used ``batch_size_per_gpu = 1`` to eliminate the effect of `cudnn padding issue `_. For more details about model descriptions and training setup, -have a look at the `configuration files `_. +have a look at the `configuration files `_. .. list-table:: - :widths: 1 1 1 1 1 1 + :widths: 1 1 1 1 1 :header-rows: 1 * - Config file - - WER-512 - - WER-2048 + - WER - Training setup and additional comments - Short description of the model - Checkpoint * - `ds2_large_8gpus.py `_ - - 4.90% - - 4.59% + - 9.28% - This model was trained for 50 epochs using SGD with Momentum and LARC on the full LibriSpeech in a few days using Horovod on eight GPUs. - This model has 2 convolutional layers and 5 bidirectional GRU layers with 800 units. - - `link `_ + - `link `_ * - `ds2_medium_4gpus.py `_ - - 6.12% - - 5.49% + - 22.60% - This model was trained for 50 epochs using Adam on the full LibriSpeech in a few days using Horovod on four GPUs. - This model has 3 convolutional layers and 3 unidirectional GRU layers with 1024 units. 
- `link `_ * - `ds2_small_1gpu.py `_ - - 11.77% - - 9.32% + - 39.08% - This model was trained for 12 epochs using Adam on a "clean" subset of LibriSpeech in less than a day using a single GPU. - This model has 2 convolutional layers and 2 bidirectional GRU layers with 512 units. - `link `_ + * - `w2l_large_8gpus.py `_ + - 15.44% + - This model was trained for 18 epochs (with early stopping based on + validation loss) using SGD with Momentum and LARC on + the full LibriSpeech in a few days on eight GPUs. + - The model has 19 convolutional layers (200--1000 units, 7--21 kernel size). + We use batch norm between all layers. + - `link `_ + + +Deep Speech 2 model description: https://arxiv.org/abs/1512.02595. + +Wav2Letter model description: https://arxiv.org/abs/1609.03193, https://arxiv.org/abs/1712.09444. diff --git a/docs/html/api-docs/data.image2label.html b/docs/html/api-docs/data.image2label.html index 53efdff46..27f21f736 100644 --- a/docs/html/api-docs/data.image2label.html +++ b/docs/html/api-docs/data.image2label.html @@ -186,6 +186,108 @@

    image2label

    +
    +class data.image2label.image2label.CifarDataLayer(params, model, num_workers, worker_id)[source]
    +

    Bases: open_seq2seq.data.data_layer.DataLayer

    +
    +
    +build_graph()[source]
    +

    Here all TensorFlow graph construction should happen.

    +
    + +
    +
    +static get_optional_params()[source]
    +

    Static method with description of optional parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that can be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +static get_required_params()[source]
    +

    Static method with description of required parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that have to be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +get_size_in_samples()[source]
    +

    Should return the dataset size in samples. +That is, the number of objects in the dataset. This method is used to +calculate a valid epoch size. If this method is not defined, you will need +to make sure that your dataset for evaluation is created only for +one epoch. You will also not be able to use num_epochs parameter in the +base config.

    + +++ + + + + + +
    Returns:dataset size in samples.
    Return type:int
    +
    + +
    +
    +input_tensors
    +

Dictionary containing input tensors. +This dictionary has to define the following keys: source_tensors, +which should contain all tensors describing the input object (i.e. tensors +that are passed to the encoder, e.g. input sequence and input length). And +when self.params['mode'] != "infer" data layer should also define +target_tensors which is the list of all tensors related to the +corresponding target object (i.e. tensors that are passed to the decoder and +loss, e.g. target sequence and target length). Note that all tensors have +to be created inside self.build_graph() method.
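For illustration, a data layer might expose something like the following sketch (the placeholder shapes and tensor names are hypothetical, not part of the API):

import tensorflow as tf

# Hypothetical tensors, only to make the dictionary structure concrete.
src_sequence = tf.placeholder(tf.float32, [None, None, 161])
src_length = tf.placeholder(tf.int32, [None])

input_tensors = {
    "source_tensors": [src_sequence, src_length],
    # "target_tensors": [...] would be added when self.params['mode'] != "infer"
}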

    +
    + +
    +
    +iterator
    +

    tf.data.Dataset iterator. +Should be created by self.build_graph().

    +
    + +
    +
    +parse_record(raw_record, is_training, num_classes=10)[source]
    +

    Parse CIFAR-10 image and label from a raw record.

    +
    + +
    +
    +preprocess_image(image, is_training)[source]
    +

    Preprocess a single image of layout [height, width, depth].
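For orientation, training-time preprocessing of CIFAR-sized images commonly looks like the sketch below; the pad and crop sizes here are illustrative assumptions, not values read from this class:

import tensorflow as tf

def preprocess_cifar_sketch(image, is_training):
  # image: float32 tensor with layout [height, width, depth], e.g. 32x32x3.
  if is_training:
    image = tf.image.resize_image_with_crop_or_pad(image, 40, 40)  # pad
    image = tf.random_crop(image, [32, 32, 3])                     # random crop
    image = tf.image.random_flip_left_right(image)                 # random flip
  image = tf.image.per_image_standardization(image)                # normalize
  return image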

    +
    + +
    + +
    class data.image2label.image2label.ImagenetDataLayer(params, model, num_workers, worker_id)[source]

    Bases: open_seq2seq.data.data_layer.DataLayer

    @@ -283,14 +385,14 @@

    image2label

    imagenet_preprocessing

    -

    Provides utilities to preprocess images.

    -

    Training images are sampled using the provided bounding boxes, and subsequently +

    Provides utilities to preprocess images. +Training images are sampled using the provided bounding boxes, and subsequently cropped to the sampled bounding box. Images are additionally flipped randomly, -then resized to the target output size (without aspect-ratio preservation).

    -

    Images used during evaluation are resized (with aspect-ratio preservation) and -centrally cropped.

    -

    All images undergo mean color subtraction.

    -

    Note that these steps are colloquially referred to as “ResNet preprocessing,” +then resized to the target output size (without aspect-ratio preservation). +Images used during evaluation are resized (with aspect-ratio preservation) and +centrally cropped. +All images undergo mean color subtraction. +Note that these steps are colloquially referred to as “ResNet preprocessing,” and they differ from “VGG preprocessing,” which does not use bounding boxes and instead does an aspect-preserving resize followed by random crop during training. (These both differ from “Inception preprocessing,” which introduces @@ -345,8 +447,8 @@

    image2label
    data.image2label.imagenet_preprocessing._decode_crop_and_flip(image_buffer, bbox, num_channels)[source]
    -

    Crops the given image to a random part of the image, and randomly flips.

    -

    We use the fused decode_and_crop op, which performs better than the two ops +

    Crops the given image to a random part of the image, and randomly flips. +We use the fused decode_and_crop op, which performs better than the two ops used separately in series, but note that this requires that the image be passed in as an un-decoded string Tensor.

    @@ -370,13 +472,13 @@

    image2label -
    -data.image2label.imagenet_preprocessing._mean_image_subtraction(image, means, num_channels)[source]
    -

    Subtracts the given means from each image channel.

    +
    +data.image2label.imagenet_preprocessing._mean_image_subtraction_and_normalization(image, means, num_channels)[source]
    +

    Subtracts the given means from each image channel and divides by 127.5.

    For example:
    means = [123.68, 116.779, 103.939] -image = _mean_image_subtraction(image, means)
    +image = _mean_image_subtraction_and_normalization(image, means)

    Note that the rank of image must be known.

    @@ -390,7 +492,7 @@

    image2label

    - +equal to number of evaluation batches). +
  • training_step (int) – current training step. Will only be passed if mode +is “train_eval”.
  • + +
    - + - +
    Returns:

    the centered image.

    +
    Returns:

    the centered image and normalized image.

    Raises:

    ValueError – If the rank of image is unknown, if image has a rank other @@ -405,8 +507,8 @@

    image2label
    data.image2label.imagenet_preprocessing._parse_example_proto(example_serialized)[source]
    -

    Parses an Example proto containing a training example of an image.

    -

    The output of the build_image_data.py image preprocessing script is a dataset +

    Parses an Example proto containing a training example of an image. +The output of the build_image_data.py image preprocessing script is a dataset containing serialized Example protocol buffers. Each Example proto contains the following fields (values are included as examples):

    @@ -448,8 +550,8 @@

    image2label
    data.image2label.imagenet_preprocessing._resize_image(image, height, width)[source]
    -

    Simple wrapper around tf.resize_images.

    -

    This is primarily to make sure we use the same ResizeMethod and other +

    Simple wrapper around tf.resize_images. +This is primarily to make sure we use the same ResizeMethod and other details each time.

    @@ -480,8 +582,8 @@

    image2label
    data.image2label.imagenet_preprocessing._smallest_size_at_least(height, width, resize_min)[source]
    -

    Computes new shape with the smallest side equal to smallest_side.

    -

    Computes new shape with the smallest side equal to smallest_side while +

    Computes new shape with the smallest side equal to smallest_side. +Computes new shape with the smallest side equal to smallest_side while preserving the original aspect ratio.

    @@ -508,9 +610,9 @@

    image2label
    -data.image2label.imagenet_preprocessing.parse_record(raw_record, is_training)[source]
    -

    Parses a record containing a training example of an image.

    -

    The input record is parsed into a label and image, and the image is passed +data.image2label.imagenet_preprocessing.parse_record(raw_record, is_training, image_size=224, num_classes=1000)[source] +

    Parses a record containing a training example of an image. +The input record is parsed into a label and image, and the image is passed through preprocessing steps (cropping, flipping, and so on).

    @@ -520,6 +622,8 @@

    image2label
    data.image2label.imagenet_preprocessing.preprocess_image(image_buffer, bbox, output_height, output_width, num_channels, is_training=False)[source]
    -

    Preprocesses the given image.

    -

    Preprocessing includes decoding, cropping, and resizing for both training +

    Preprocesses the given image. +Preprocessing includes decoding, cropping, and resizing for both training and eval images. Training preprocessing, however, introduces some random distortion of the image to improve accuracy.

    diff --git a/docs/html/api-docs/data.speech2text.html b/docs/html/api-docs/data.speech2text.html index 3dbeab305..c6ac82900 100644 --- a/docs/html/api-docs/data.speech2text.html +++ b/docs/html/api-docs/data.speech2text.html @@ -187,12 +187,12 @@

    speech2text

    -class data.speech2text.speech2text.Speech2TextDataLayer(params, model, num_workers=None, worker_id=None)[source]
    +class data.speech2text.speech2text.Speech2TextDataLayer(params, model, num_workers, worker_id)[source]

    Bases: open_seq2seq.data.data_layer.DataLayer

    Speech-to-text data layer class.

    -__init__(params, model, num_workers=None, worker_id=None)[source]
    +__init__(params, model, num_workers, worker_id)[source]

    Speech-to-text data layer constructor.

    See parent class for arguments description.

    Config parameters:

    @@ -223,15 +223,16 @@

    speech2text
    -_parse_audio_element(audio_filename)[source]
    +_parse_audio_element(id_and_audio_filename)[source]

    Parses audio from file and returns array of audio features.

    - + - + @@ -335,8 +336,7 @@

    speech2text
    split_data(data)[source]
    -

    Method that performs data split for evaluation.

    -
    +
    @@ -439,6 +439,12 @@

    speech2text +
    +data.speech2text.speech_utils.normalize_signal(signal)[source]
    +

    Normalize float32 signal to [-1, 1] range

    +
    + diff --git a/docs/html/api-docs/data.text2text.html b/docs/html/api-docs/data.text2text.html index 63610cc2d..2e34a5c9c 100644 --- a/docs/html/api-docs/data.text2text.html +++ b/docs/html/api-docs/data.text2text.html @@ -227,10 +227,12 @@ parallel_interleave, the sloppy argument is used to generate randomness in the order of the examples.

    +
  • Modified slightly to fit OpenSeq2Seq needs

    +
  • -data.text2text.t2t._batch_examples(dataset, batch_size, max_length)[source]
    +data.text2text.t2t._batch_examples(dataset, batch_size, max_length, pad_2_eight=True)[source]

    Group examples by similar lengths, and return batched dataset.

    Each batch of similar-length examples are padded to the same length, and may have different number of elements in each batch, such that:

    @@ -303,13 +305,13 @@
    -data.text2text.t2t._parse_example(serialized_example)[source]
    +data.text2text.t2t._parse_example(serialized_example, pad_2_eight=False)[source]

    Return inputs and targets Tensors from a serialized tf.Example.

    -data.text2text.t2t._read_and_batch_from_files(file_pattern, batch_size, max_length, num_cpu_cores, shuffle, repeat, num_workers, worker_id)[source]
    +data.text2text.t2t._read_and_batch_from_files(file_pattern, batch_size, max_length, num_cpu_cores, shuffle, repeat, num_workers, worker_id, batch_in_tokens, pad2eight=True)[source]

    Create dataset where each item is a dict of “inputs” and “targets”.

    Parameters:audio_filename – audio file name.
    Parameters:id_and_audio_filename – tuple of sample id and corresponding audio file name.
    Returns:source audio features as np.array, length of source sequence,
    Returns:source audio features as np.array, length of source sequence, +sample id.
    Return type:tuple
    @@ -325,6 +327,11 @@ repeated forever.
  • num_workers – Number of workers or number of Horovod workers
  • worker_id – Worker id or Horovod rank
  • +
• batch_in_tokens – whether batch_size means the amount in tokens or the number of sentence pairs. Batching in tokens is more efficient as it reduces the number of PADs; batching in sentence pairs should be used in inference mode since the order of sentences is important.
  • +
  • pad2eight – if True, it will pad both dimensions to be divisible by 8
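Padding both dimensions to a multiple of 8 presumably targets the tile sizes that are efficient for mixed-precision matrix multiplies on Volta GPUs (an inference from the mixed-precision notes elsewhere in these docs, not stated here). A tiny illustration of the rounding involved:

def pad_to_eight(length):
  # Round a dimension up to the next multiple of 8.
  return ((length + 7) // 8) * 8

pad_to_eight(509)  # -> 512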
  • diff --git a/docs/html/api-docs/decoders.html b/docs/html/api-docs/decoders.html index edeafe227..de9b48e7c 100644 --- a/docs/html/api-docs/decoders.html +++ b/docs/html/api-docs/decoders.html @@ -100,6 +100,7 @@
  • fc_decoders
  • rnn_decoders
  • transformer_decoders
  • +
  • convs2s_decoder
  • losses
  • @@ -269,7 +270,8 @@ - @@ -328,6 +336,26 @@
    Returns:dictionary of decoder outputs. Typically this will be just:
    {
       "logits": logits that will be passed to Loss
    -  "samples": actual decoded output, e.g. characters instead of logits
    +  "outputs": list with actual decoded outputs, e.g. characters
    +             instead of logits
     }
     
    @@ -469,7 +471,7 @@
    Returns:dictionary with the following tensors:
    {
       'logits': logits with the shape=[batch_size, output_dim]
    -  'samples': [logits] (same as logits but wrapped in list)
    +  'outputs': [logits] (same as logits but wrapped in list)
     }
     
    @@ -517,7 +519,7 @@
  • tgt_vocab_size (int) — target vocabulary size, i.e. number of output features.
  • logits_to_outputs_func — function that maps produced logits to -decoder samples, i.e. actual text sequences.
  • +decoder outputs, i.e. actual text sequences. @@ -543,7 +545,7 @@
    Returns:dictionary with the following tensors:
    {
       'logits': logits with the shape=[time length, batch_size, tgt_vocab_size]
    -  'samples': logits_to_outputs_func(logits, input_dict)
    +  'outputs': logits_to_outputs_func(logits, input_dict)
     }
     
    @@ -703,8 +705,8 @@
  • END_SYMBOL (int) — END symbol id, must be the same as used in data layer.
  • tgt_emb_size (int) — embedding size to use.
  • -
  • decoder_cell_units (int) - number of units in RNN
  • -
  • decoder_cell_type (string) - RNN type: lstm, gru, glstm, etc.
  • +
  • core_cell_params (dict) - parameters for RNN class
  • +
  • core_cell (string) - RNN class.
  • decoder_dp_input_keep_prob (float) - dropout input keep probability.
  • decoder_dp_output_keep_prob (float) - dropout output keep probability.
  • decoder_use_skip_connections (bool) - use residual connections or not.
  • @@ -814,6 +816,91 @@

    transformer_decoders

    +
    +
    +

    convs2s_decoder

    +
    +
    +class decoders.convs2s_decoder.ConvS2SDecoder(params, model, name='convs2s_decoder', mode='train')[source]
    +

    Bases: decoders.decoder.Decoder

    +
    +
    +_get_symbols_to_logits_fn()[source]
    +

    Returns a decoding function that calculates logits of the next tokens.

    +
    + +
    +
    +decode_pass(targets, encoder_outputs, encoder_outputs_b, inputs_attention_bias)[source]
    +

    Generate logits for each value in the target sequence.

    + +++ + + + + + +
    Parameters:
      +
    • targets – target values for the output sequence. +int tensor with shape [batch_size, target_length]
    • +
• encoder_outputs – continuous representation of input sequence. +float tensor with shape [batch_size, input_length, hidden_size]
    • +
    • encoder_outputs_b – continuous representation of input sequence +which includes the source embeddings. +float tensor with shape [batch_size, input_length, hidden_size]
    • +
    • inputs_attention_bias – float tensor with shape [batch_size, 1, input_length]
    • +
    +
    Returns:

    float32 tensor with shape [batch_size, target_length, vocab_size]

    +
    +
    + +
    +
    +static get_optional_params()[source]
    +

    Static method with description of optional parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that can be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +static get_required_params()[source]
    +

    Static method with description of required parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that have to be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +predict(encoder_outputs, encoder_outputs_b, inputs_attention_bias)[source]
    +

    Return predicted sequence.

    +
    + +
    +
    diff --git a/docs/html/api-docs/encoders.html b/docs/html/api-docs/encoders.html index 64d919f02..eb4f9bd7c 100644 --- a/docs/html/api-docs/encoders.html +++ b/docs/html/api-docs/encoders.html @@ -97,10 +97,13 @@
  • encoders
  • decoders
  • @@ -496,12 +499,6 @@ -
    -
    -encoders.ds2_encoder.conv2d_bn_actv(name, inputs, filters, kernel_size, activation_fn, strides, padding, regularizer, training, data_format, bn_momentum, bn_epsilon)[source]
    -

    Helper function that applies convolution, batch norm and activation.

    -
    -
    encoders.ds2_encoder.rnn_cell(rnn_cell_dim, layer_type, dropout_keep_prob=1.0)[source]
    @@ -514,6 +511,137 @@

    Helper function that applies “row” or “in plane” convolution.

    + +
    +

    w2l_encoder

    +
    +
    +class encoders.w2l_encoder.Wave2LetterEncoder(params, model, name='w2l_encoder', mode='train')[source]
    +

    Bases: encoders.encoder.Encoder

    +

    Wave2Letter like encoder. Fully convolutional model

    +
    +
    +__init__(params, model, name='w2l_encoder', mode='train')[source]
    +

    Wave2Letter like encoder constructor.

    +

    See parent class for arguments description.

    +

    Config parameters:

    +
      +
    • dropout_keep_prop (float) — keep probability for dropout.

      +
    • +
    • convnet_layers (list) — list with the description of convolutional +layers. For example:

      +
      "convnet_layers": [
      +  {
      +    "type": "conv1d", "repeat" : 5,
      +    "kernel_size": [7], "stride": [1],
      +    "num_channels": 250, "padding": "SAME"
      +  },
      +  {
      +    "type": "conv1d", "repeat" : 3,
      +    "kernel_size": [11], "stride": [1],
      +    "num_channels": 500, "padding": "SAME"
      +  },
      +  {
      +    "type": "conv1d", "repeat" : 1,
      +    "kernel_size": [32], "stride": [1],
      +    "num_channels": 1000, "padding": "SAME"
      +  },
      +  {
      +    "type": "conv1d", "repeat" : 1,
      +    "kernel_size": [1], "stride": [1],
      +    "num_channels": 1000, "padding": "SAME"
      +  },
      +]
      +
      +
      +
    • +
    • activation_fn — activation function to use.

      +
    • +
    • data_format (string) — could be either “channels_first” or +“channels_last”. Defaults to “channels_last”.

      +
    • +
    • normalization — normalization to use. Accepts [None, ‘batch_norm’]. +Use None if you don’t want to use normalization. Defaults to ‘batch_norm’.

      +
    • +
    • bn_momentum (float) — momentum for batch norm. Defaults to 0.90.

      +
    • +
    • bn_epsilon (float) — epsilon for batch norm. Defaults to 1e-3.

      +
    • +
    +
    + +
    +
    +_encode(input_dict)[source]
    +

    Creates TensorFlow graph for Wav2Letter like encoder.

    + +++ + + + + + + + +
    Parameters:input_dict (dict) –

    input dictionary that has to contain +the following fields:

    +
    input_dict = {
    +  "source_tensors": [
    +    src_sequence (shape=[batch_size, sequence length, num features]),
    +    src_length (shape=[batch_size])
    +  ]
    +}
    +
    +
    +
    Returns:dictionary with the following tensors:
    {
    +  'outputs': hidden state, shape=[batch_size, sequence length, n_hidden]
    +  'src_length': tensor, shape=[batch_size]
    +}
    +
    +
    +
    Return type:dict
    +
    + +
    +
    +static get_optional_params()[source]
    +

    Static method with description of optional parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that can be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +static get_required_params()[source]
    +

    Static method with description of required parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that have to be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +

    rnn_encoders

    @@ -733,6 +861,98 @@ +
    +
    +class encoders.rnn_encoders.GNMTLikeEncoderWithEmbedding_cuDNN(params, model, name='gnmt_encoder_with_emb_cudnn', mode='train')[source]
    +

    Bases: encoders.encoder.Encoder

    +

    Encoder similar to the one used in +GNMT model: https://arxiv.org/abs/1609.08144. +Must have at least 2 layers. Uses cuDNN RNN blocks for efficiency

    +
    +
    +__init__(params, model, name='gnmt_encoder_with_emb_cudnn', mode='train')[source]
    +

    Encodes data into representation +:param params: a Python dictionary. +Must define:

    +
    +
      +
    • +
      src_inputs - a Tensor of shape [batch_size, time] or [time, batch_size]
      +
      (depending on time_major param)
      +
      +
    • +
    • src_lengths - a Tensor of shape [batch_size]
    • +
    +
    + +++ + + + +
    Returns:a Python dictionary with: +* encoder_outputs - a Tensor of shape
    +
    [batch_size, time, representation_dim]
    +

    or [time, batch_size, representation_dim] +* encoder_state - a Tensor of shape [batch_size, dim] +* src_lengths - (copy ref from input) a Tensor of shape [batch_size]

    +
    +
    + +
    +
    +enc_emb_w
    +
    + +
    +
    +static get_optional_params()[source]
    +

    Static method with description of optional parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that can be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +static get_required_params()[source]
    +

    Static method with description of required parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that have to be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +src_emb_size
    +
    + +
    +
    +src_vocab_size
    +
    + +
    +
    class encoders.rnn_encoders.UnidirectionalRNNEncoderWithEmbedding(params, model, name='unidir_rnn_encoder_with_emb', mode='train')[source]
    @@ -851,6 +1071,63 @@

    transformer_encoders

    +
    +
    +

    convs2s_encoder

    +

    Conv-based encoder

    +
    +
    +class encoders.convs2s_encoder.ConvS2SEncoder(params, model, name='convs2s_encoder_with_emb', mode='train')[source]
    +

    Bases: encoders.encoder.Encoder

    +

    Fully convolutional Encoder of ConvS2S

    +
    +
    +static get_optional_params()[source]
    +

    Static method with description of optional parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that can be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +static get_required_params()[source]
    +

    Static method with description of required parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that have to be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +src_emb_size
    +
    + +
    +
    +src_vocab_size
    +
    + +
    +

    resnet_encoder

    @@ -1138,6 +1415,142 @@

    transformer_encoders +

    +
    +

    cnn_encoder

    +

    This module contains classes and functions to build “general” convolutional +neural networks from the description of arbitrary “layers”.

    +
    +
    +class encoders.cnn_encoder.CNNEncoder(params, model, name='cnn_encoder', mode='train')[source]
    +

    Bases: encoders.encoder.Encoder

    +

    General CNN encoder that can be used to construct various different models.

    +
    +
    +__init__(params, model, name='cnn_encoder', mode='train')[source]
    +

    CNN Encoder constructor.

    +

    See parent class for arguments description.

    +

    Config parameters:

    +
      +
    • cnn_layers (list) — list with the description of “convolutional” +layers. For example:

      +
      "conv_layers": [
      +    (tf.layers.conv2d, {
      +        'filters': 64, 'kernel_size': (11, 11),
      +        'strides': (4, 4), 'padding': 'VALID',
      +        'activation': tf.nn.relu,
      +    }),
      +    (tf.layers.max_pooling2d, {
      +        'pool_size': (3, 3), 'strides': (2, 2),
      +    }),
      +    (tf.layers.conv2d, {
      +        'filters': 192, 'kernel_size': (5, 5),
      +        'strides': (1, 1), 'padding': 'SAME',
      +    }),
      +    (tf.layers.batch_normalization, {'momentum': 0.9, 'epsilon': 0.0001}),
      +    (tf.nn.relu, {}),
      +]
      +
      +
      +

      Note that you don’t need to provide “regularizer”, “training” and +“data_format” parameters since they will be automatically added.

      +
    • +
• fc_layers (list) — list with the description of “fully-connected” +layers. The only difference from convolutional layers is that the input +will be automatically reshaped to 2D (batch size x num features). +For example:

      +
      'fc_layers': [
      +    (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}),
      +    (tf.layers.dropout, {'rate': 0.5}),
      +    (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}),
      +    (tf.layers.dropout, {'rate': 0.5}),
      +],
      +
      +
      +

      Note that you don’t need to provide “regularizer”, “training” and +“data_format” parameters since they will be automatically added.

      +
    • +
    • data_format (string) — could be either “channels_first” or +“channels_last”. Defaults to “channels_first”.

      +
    • +
    +
    + +
    +
    +static get_optional_params()[source]
    +

    Static method with description of optional parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that can be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    +
    +static get_required_params()[source]
    +

    Static method with description of required parameters.

    + +++ + + + + + +
    Returns:Dictionary containing all the parameters that have to be +included into the params parameter of the +class __init__() method.
    Return type:dict
    +
    + +
    + +
    +
    +encoders.cnn_encoder.build_layer(inputs, layer, layer_params, data_format, regularizer, training, verbose=True)[source]
    +

This function builds a layer from the layer function and its parameters.

    +

It will automatically add regularizer parameter to the layer_params if the +layer supports regularization. To check this, it will look for the +“regularizer”, “kernel_regularizer” and “gamma_regularizer” names in this +order in the layer call signature. If one of these parameters is supported, +the regularizer object will be passed as its value. The same +signature-checking technique is used to try to pass the “data_format” and +“training” parameters.

    + +++ + + + + + +
    Parameters:
      +
    • inputs – input Tensor that will be passed to the layer. Note that layer has +to accept input as the first parameter.
    • +
    • layer – layer function or class with __call__ method defined.
    • +
    • layer_params (dict) – parameters passed to the layer.
    • +
• data_format (string) – data format (“channels_first” or “channels_last”) +that will be passed as an additional argument if the layer supports it.
• +
• regularizer – regularizer instance that will be passed as an additional +argument if the layer supports it.
• +
• training (bool) – whether the layer is built in training mode; passed as an +additional argument if the layer supports it.
    • +
    • verbose (bool) – whether to print information about built layers.
    • +
    +
    Returns:

    Tensor with layer output.

    +
    +
    +
    diff --git a/docs/html/api-docs/models.html b/docs/html/api-docs/models.html index 0b49514d6..61c902754 100644 --- a/docs/html/api-docs/models.html +++ b/docs/html/api-docs/models.html @@ -236,6 +236,11 @@
  • print_samples_steps (int or None) — how often to print training samples (input sequences, correct answers and model predictions). Setting it to None disables samples printing.
  • +
  • print_bench_info_steps (int or None) — how often to print training +benchmarking information (average number of objects processed per step). +Setting it to None disables intermediate benchmarking printing, but +the average information across the whole training will always be printed +after the last iteration.
  • save_checkpoint_steps (int or None) — how often to save model checkpoints. Setting it to None disables checkpoint saving.
  • eval_steps (int) — how often to run evaluation during training. @@ -271,14 +276,17 @@
  • max_grad_norm (float) — maximum value of gradient norm. Clipping will be performed if some gradients exceed this value (this is checked for each variable independently).
  • -
  • loss_scale (float) — static loss scale to use. For details see -mixed precision training section in docs.
  • -
  • automatic_loss_scaling — automatic loss scaling mode. Could be -either None, “Backoff” or “Logmax”. For details see -mixed precision training section in docs.
  • +
• loss_scaling — could be float or string. If float, static loss +scaling is applied. If string, the corresponding automatic +loss scaling algorithm is used. Must be one of ‘Backoff’ +or ‘LogMax’ (case insensitive). Only used when dtype=”mixed”. For details +see mixed precision training section in docs.
  • summaries (list) — which summaries to log. Could contain “learning_rate”, “gradients”, “gradient_norm”, “global_gradient_norm”, “variables”, “variable_norm”.
  • +
  • iter_size (int) — use this parameter to emulate large batches. +The gradients will be accumulated for iter_size number of steps before +applying update.
  • larc_params — dictionary with parameters for LARC (or LARS) optimization algorithms. Can contain the following parameters:
    • larc_mode — Could be either “scale” (LARS) or “clip” (LARC). @@ -309,14 +317,14 @@
  • Returns:

    tuple containing loss tensor and samples tensor.

    +
    Returns:

    tuple containing loss tensor and list of outputs tensors.

    Loss tensor will be automatically provided to the optimizer and corresponding train_op will be created.

    Samples tensors are stored in the _outputs attribute and can be accessed by calling get_output_tensors() function. For example, this happens inside utils.hooks.RunEvaluationHook to fetch output values for evaluation.

    -

    Both loss and samples can be None when corresponding part of the graph +

    Both loss and outputs can be None when corresponding part of the graph is not built.

    +
    +
    +_get_num_objects_per_step(worker_id=0)[source]
    +

    Define this method if you need benchmarking functionality. +For example, for translation models, this method should return number of +tokens in current batch, for image recognition model should return number +of images in current batch.

    + +++ + + + + + +
    Parameters:worker_id (int) – id of the worker to get data layer from +(not used for Horovod).
    Returns:tf.Tensor with number of objects in batch.
    +
    +
    clip_last_batch(last_batch, true_size)[source]
    @@ -410,7 +438,7 @@
    -finalize_evaluation(results_per_batch)[source]
    +finalize_evaluation(results_per_batch, training_step=None)[source]

    This method can be used in conjunction with self.evaluate() to calculate evaluation metrics. @@ -432,14 +460,21 @@

    Parameters:results_per_batch (list) – aggregation of values returned from all calls +
    Parameters:
      +
    • results_per_batch (list) – aggregation of values returned from all calls to self.evaluate() method (number of calls will be -equal to number of evaluation batches).
    Returns:dictionary with values that need to be logged to TensorBoard -(can be empty).
    Returns:

    dictionary with values that need to be logged to TensorBoard +(can be empty).

    +
    Return type:dict
    Return type:

    dict

    +
    @@ -491,22 +526,7 @@
    get_num_objects_per_step(worker_id=0)[source]
    -

    Define this method if you need benchmarking functionality. -For example, for translation models, this method should return number of -tokens in current batch, for image recognition model should return number -of images in current batch.

    - --- - - - - - -
    Parameters:worker_id (int) – id of the worker to get data layer from -(not used for Horovod).
    Returns:tf.Tensor with number of objects in batch.
    -
    +

    @@ -619,7 +639,7 @@
    -maybe_print_logs(input_values, output_values)[source]
    +maybe_print_logs(input_values, output_values, training_step)[source]

    This method can be used to print logs that help to visualize training. For example, you can print sample input sequences and their corresponding predictions. This method will be called every print_samples_steps @@ -639,6 +659,7 @@

  • output_values – evaluation of self.get_output_tensors(0), that is, output tensors for one batch on the first GPU.
  • +
  • training_step (int) – Current training step.
  • @@ -770,8 +791,8 @@ Returns:

    tuple containing loss tensor as returned from -loss.compute_loss() and samples tensor, which is taken from -decoder.decode()['samples']. When mode == 'infer', loss will +loss.compute_loss() and list of outputs tensors, which is taken from +decoder.decode()['outputs']. When mode == 'infer', loss will be None.

    @@ -893,6 +914,12 @@
    class models.speech2text.Speech2Text(params, mode='train', hvd=None)[source]

    Bases: models.encoder_decoder.EncoderDecoderModel

    +
    +
    +_get_num_objects_per_step(worker_id=0)[source]
    +

    Returns number of audio frames in current batch.

    +
    +
    evaluate(input_values, output_values)[source]
    @@ -942,7 +969,7 @@
    -finalize_evaluation(results_per_batch)[source]
    +finalize_evaluation(results_per_batch, training_step=None)[source]

    This method can be used in conjunction with self.evaluate() to calculate evaluation metrics. @@ -964,14 +991,21 @@ -Parameters:results_per_batch (list) – aggregation of values returned from all calls +Parameters:

      +
    • results_per_batch (list) – aggregation of values returned from all calls to self.evaluate() method (number of calls will be -equal to number of evaluation batches). +equal to number of evaluation batches).
    • +
    • training_step (int) – current training step. Will only be passed if mode +is “train_eval”.
    • +
    + -Returns:dictionary with values that need to be logged to TensorBoard -(can be empty). +Returns:

    dictionary with values that need to be logged to TensorBoard +(can be empty).

    + -Return type:dict +Return type:

    dict

    + @@ -1000,12 +1034,6 @@
    -
    -
    -get_num_objects_per_step(worker_id=0)[source]
    -

    Returns number of audio frames in current batch.

    -
    -
    infer(input_values, output_values)[source]
    @@ -1043,7 +1071,7 @@
    -maybe_print_logs(input_values, output_values)[source]
    +maybe_print_logs(input_values, output_values, training_step)[source]

    This method can be used to print logs that help to visualize training. For example, you can print sample input sequences and their corresponding predictions. This method will be called every print_samples_steps @@ -1063,6 +1091,7 @@

  • output_values – evaluation of self.get_output_tensors(0), that is, output tensors for one batch on the first GPU.
  • +
  • training_step (int) – Current training step.
  • @@ -1099,6 +1128,12 @@ class models.text2text.Text2Text(params, mode='train', hvd=None)[source]

    Bases: models.encoder_decoder.EncoderDecoderModel

    An example class implementing classical text-to-text model.

    +
    +
    +_get_num_objects_per_step(worker_id=0)[source]
    +

    Returns number of source tokens + number of target tokens in batch.

    +
    +
    evaluate(input_values, output_values)[source]
    @@ -1148,7 +1183,7 @@
    -finalize_evaluation(results_per_batch)[source]
    +finalize_evaluation(results_per_batch, training_step=None)[source]

    This method can be used in conjunction with self.evaluate() to calculate evaluation metrics. @@ -1170,14 +1205,21 @@ -Parameters:results_per_batch (list) – aggregation of values returned from all calls +Parameters:

      +
    • results_per_batch (list) – aggregation of values returned from all calls to self.evaluate() method (number of calls will be -equal to number of evaluation batches). +equal to number of evaluation batches).
    • +
    • training_step (int) – current training step. Will only be passed if mode +is “train_eval”.
    • +
    + -Returns:dictionary with values that need to be logged to TensorBoard -(can be empty). +Returns:

    dictionary with values that need to be logged to TensorBoard +(can be empty).

    + -Return type:dict +Return type:

    dict

    + @@ -1206,12 +1248,6 @@
    -
    -
    -get_num_objects_per_step(worker_id=0)[source]
    -

    Returns number of source tokens + number of target tokens in batch.

    -
    -
    infer(input_values, output_values)[source]
    @@ -1249,7 +1285,7 @@
    -maybe_print_logs(input_values, output_values)[source]
    +maybe_print_logs(input_values, output_values, training_step)[source]

    This method can be used to print logs that help to visualize training. For example, you can print sample input sequences and their corresponding predictions. This method will be called every print_samples_steps @@ -1269,6 +1305,7 @@

  • output_values – evaluation of self.get_output_tensors(0), that is, output tensors for one batch on the first GPU.
  • +
  • training_step (int) – Current training step.
  • @@ -1317,6 +1354,12 @@
    class models.image2label.Image2Label(params, mode='train', hvd=None)[source]

    Bases: models.encoder_decoder.EncoderDecoderModel

    +
    +
    +_get_num_objects_per_step(worker_id=0)[source]
    +

    Returns number of images in current batch, i.e. batch size.

    +
    +
    evaluate(input_values, output_values)[source]
    @@ -1366,7 +1409,7 @@
    -finalize_evaluation(results_per_batch)[source]
    +finalize_evaluation(results_per_batch, training_step=None)[source]

    This method can be used in conjunction with self.evaluate() to calculate evaluation metrics. @@ -1388,28 +1431,29 @@ -Parameters:results_per_batch (list) – aggregation of values returned from all calls +Parameters:

      +
    • results_per_batch (list) – aggregation of values returned from all calls to self.evaluate() method (number of calls will be -equal to number of evaluation batches). +equal to number of evaluation batches).
    • +
    • training_step (int) – current training step. Will only be passed if mode +is “train_eval”.
    • +
    + -Returns:dictionary with values that need to be logged to TensorBoard -(can be empty). +Returns:

    dictionary with values that need to be logged to TensorBoard +(can be empty).

    + -Return type:dict +Return type:

    dict

    +
    -
    -
    -get_num_objects_per_step(worker_id=0)[source]
    -

    Returns number of images in current batch, i.e. batch size.

    -
    -
    -maybe_print_logs(input_values, output_values)[source]
    +maybe_print_logs(input_values, output_values, training_step)[source]

    This method can be used to print logs that help to visualize training. For example, you can print sample input sequences and their corresponding predictions. This method will be called every print_samples_steps @@ -1429,6 +1473,7 @@

  • output_values – evaluation of self.get_output_tensors(0), that is, output tensors for one batch on the first GPU.
  • +
  • training_step (int) – Current training step.
  • diff --git a/docs/html/api-docs/modules.html b/docs/html/api-docs/modules.html index e6e21831a..d3c3a83bc 100644 --- a/docs/html/api-docs/modules.html +++ b/docs/html/api-docs/modules.html @@ -202,10 +202,13 @@
  • encoders
  • decoders
  • losses
  • utils
      diff --git a/docs/html/api-docs/optimizers.html b/docs/html/api-docs/optimizers.html index c73ae40fe..38768754e 100644 --- a/docs/html/api-docs/optimizers.html +++ b/docs/html/api-docs/optimizers.html @@ -179,117 +179,12 @@

      optimizers

      Optimizer ops for use in layers and tf.learn.

      -
      -
      -class optimizers.optimizers.DistributedOptimizer(optimizer, name=None, use_locking=False, device_dense='', device_sparse='')[source]
      -

      Bases: tensorflow.python.training.optimizer.Optimizer

      -

      An optimizer that wraps another tf.Optimizer, using an allreduce to -average gradient values before applying gradients to model weights.

      -
      -
      -__init__(optimizer, name=None, use_locking=False, device_dense='', device_sparse='')[source]
      -

      Construct a new DistributedOptimizer, which uses another optimizer -under the hood for computing single-process gradient values and -applying gradient updates after the gradient values have been averaged -across all the Horovod ranks. -:param optimizer: Optimizer to use for computing gradients and applying updates. -:param name: Optional name prefix for the operations created when applying

      -
      -
      gradients. Defaults to “Distributed” followed by the provided -optimizer type.
      - --- - - - -
      Parameters:
        -
      • use_locking – Whether to use locking when updating variables. -See Optimizer.__init__ for more info.
      • -
      • device_dense – Device to be used for dense tensors. Uses GPU by default -if Horovod was build with HOROVOD_GPU_ALLREDUCE.
      • -
      • device_sparse – Device to be used for sparse tensors. Uses GPU by default -if Horovod was build with HOROVOD_GPU_ALLGATHER.
      • -
      -
      -
      - -
      -
      -apply_gradients(grads_and_vars, global_step=None, name=None)[source]
      -

      Calls this same method on the underlying optimizer.

      -
      - -
      -
      -compute_gradients(*args, **kwargs)[source]
      -

      Compute gradients of all trainable variables. -See Optimizer.compute_gradients() for more info. -In DistributedOptimizer, compute_gradients() is overriden to also -allreduce the gradients before returning them.

      -
      - -
      - -
      -
      -optimizers.optimizers._adaptive_max_norm(norm, std_factor, decay, global_step, epsilon, name)[source]
      -

      Find max_norm given norm and previous average.

      -
      - -
      -
      -optimizers.optimizers._add_scaled_noise_to_gradients(grads_and_vars, gradient_noise_scale)[source]
      -

      Adds scaled noise from a 0-mean normal distribution to gradients.

      -
      -
      optimizers.optimizers._clip_gradients_by_norm(grads_and_vars, clip_gradients)[source]

      Clips gradients by global norm.

      -
      -
      -optimizers.optimizers._multiply_gradients(grads_and_vars, gradient_multipliers)[source]
      -

      Multiply specified gradients.

      -
      - -
      -
      -optimizers.optimizers.adaptive_clipping_fn(std_factor=2.0, decay=0.95, static_max_norm=None, global_step=None, report_summary=False, epsilon=1e-08, name=None)[source]
      -

      Adapt the clipping value using statistics on the norms.

      -

      Implement adaptive gradient as presented in section 3.2.1 of -https://arxiv.org/abs/1412.1602.

      -

      Keeps a moving average of the mean and std of the log(norm) of the gradient. -If the norm exceeds exp(mean + std_factor*std) then all gradients will be -rescaled such that the global norm becomes exp(mean).

      - --- - - - - - -
      Parameters:
        -
      • std_factor – Python scaler (or tensor). -max_norm = exp(mean + std_factor*std)
      • -
      • decay – The smoothing factor of the moving averages.
      • -
      • static_max_norm – If provided, will threshold the norm to this value as an -extra safety.
      • -
      • global_step – Optional global_step. If provided, decay = decay*n/(n+1). -This provides a quicker adaptation of the mean for the first steps.
      • -
      • report_summary – If True, will add histogram summaries of the max_norm.
      • -
      • epsilon – Small value chosen to avoid zero variance.
      • -
      • name – The name for this operation is used to scope operations and summaries.
      • -
      -
      Returns:

      A function for applying gradient clipping.

      -
      -
      -
      optimizers.optimizers.get_regularization_loss(scope=None, name='total_regularization_loss')[source]
      @@ -308,131 +203,59 @@

      optimizers
      -optimizers.optimizers.optimize_loss(loss, optimizer, optimizer_params, learning_rate_decay_fn, global_step=None, dtype=tf.float32, gradient_noise_scale=None, gradient_multipliers=None, clip_gradients=None, update_ops=None, variables=None, name=None, summaries=None, colocate_gradients_with_ops=False, increment_global_step=True, larc_params=None, loss_scale=1.0, automatic_loss_scaling=None, on_horovod=False)[source]
      +optimizers.optimizers.optimize_loss(loss, optimizer, optimizer_params, learning_rate_decay_fn, dtype=tf.float32, clip_gradients=None, summaries=None, larc_params=None, loss_scaling=1.0, on_horovod=False, iter_size=1, skip_update_ph=None)[source]

      Given loss and parameters for optimizer, returns a training op.

      -

      Various ways of passing optimizers include:

      -
        -
      • -
        by string specifying the name of the optimizer. See OPTIMIZER_CLS_NAMES
        -

        for full list. E.g. optimize_loss(…, optimizer=’Adam’).

        -
        -
        -
      • -
      • -
        by function taking learning rate Tensor as argument and returning an
        -

        Optimizer instance. E.g. optimize_loss(…, -optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5)).

        -
        -
        -

        Alternatively, if learning_rate is None, the function takes no -arguments. E.g. `optimize_loss(…, learning_rate=None,

        -
        -

        optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`.

        -
        -
      • -
      • -
        by a subclass of Optimizer having a single-argument constructor
        -

        (the argument is the learning rate), such as AdamOptimizer or -AdagradOptimizer. E.g. optimize_loss(…, -optimizer=tf.train.AdagradOptimizer).

        -
        -
        -
      • -
      • -
        by an instance of a subclass of Optimizer.
        -

        E.g., optimize_loss(…, optimizer=tf.train.AdagradOptimizer(0.5)).

        -
        -
        -
      • -
      - - -
      Parameters:
      • loss – Scalar Tensor.
      • -
      • global_step – Scalar int Tensor, step counter to update on each step -unless increment_global_step is False. If not supplied, -it will be fetched from the default graph (see -tf.train.get_global_step for details). If it has -not been created, no step will be incremented with each weight -update. learning_rate_decay_fn requires global_step.
      • -
      • learning_rate – float or Tensor, magnitude of update per each training -step. Can be None.
      • -
      • optimizer

        string, class or optimizer instance, used as trainer. -string should be name of optimizer, like ‘SGD’,

        -
        -
        ’Adam’, ‘Adagrad’. Full list in OPTIMIZER_CLS_NAMES constant.
        -
        -
        class should be sub-class of tf.Optimizer that implements
        -
        compute_gradients and apply_gradients functions.
        -
        optimizer instance should be instantiation of tf.Optimizer
        -
        sub-class and have compute_gradients and apply_gradients -functions.
        -
        -
      • -
      • gradient_noise_scale – float or None, adds 0-mean normal noise scaled by this -value.
      • -
      • gradient_multipliers – dict of variables or variable names to floats. -If present, gradients for specified -variables will be multiplied by given constant.
      • -
      • clip_gradients – float, callable or None. If float, is provided, a global -clipping is applied to prevent the norm of the gradient to exceed this -value. Alternatively, a callable can be provided e.g.: adaptive_clipping. -This callable takes a list of (gradients, variables) `tuple`s and -returns the same thing with the gradients modified.
      • -
      • learning_rate_decay_fn – function, takes learning_rate and global_step +
      • optimizer – string or class of optimizer, used as trainer. +string should be name of optimizer, like ‘SGD’, +‘Adam’, ‘Adagrad’. Full list in OPTIMIZER_CLS_NAMES constant. +class should be sub-class of tf.Optimizer that implements +compute_gradients and apply_gradients functions.
      • +
      • optimizer_params – parameters of the optimizer.
      • +
      • dtype – model dtype (tf.float16, tf.float32 or “mixed”).
      • +
      • learning_rate_decay_fn – function, takes global_step Tensor`s, returns `Tensor. Can be used to implement any learning rate decay functions. For example: tf.train.exponential_decay. Ignored if learning_rate is not supplied.
      • -
      • update_ops – list of update Operation`s to execute at each step. If `None, -uses elements of UPDATE_OPS collection. The order of execution -between update_ops and loss is non-deterministic.
      • -
      • variables – list of variables to optimize or -None to use all trainable variables.
      • -
      • name – The name for this operation is used to scope operations and summaries.
      • +
      • clip_gradients – float, max gradient norm to clip to.
      • summaries – List of internal quantities to visualize on tensorboard. If not set only the loss and the learning rate will be reported. The complete list is in OPTIMIZER_SUMMARIES.
      • -
      • colocate_gradients_with_ops – If True, try colocating gradients with the -corresponding op.
      • -
      • increment_global_step – Whether to increment global_step. If your model -calls optimize_loss multiple times per training step (e.g. to optimize -different parts of the model), use this arg to avoid incrementing -global_step more times than necessary.
      • -
      • LARC_mode – ‘scale’ or ‘clip’
      • -
      • LARC_nu – If not None, LARC re-scaling will be -applied https://arxiv.org/pdf/1708.03888.pdf with nu=LARC_nu
      • -
      • automatic_loss_scaling – if not None, use the corresponding automatic -loss scaling algorithm. Must be one of ‘Backoff’ -of ‘LogMax’. dtype must be “mixed” to use ALS.
      • +
      • larc_params – If not None, LARC re-scaling will +be applied with corresponding parameters.
      • +
• loss_scaling – could be float or string. If float, static loss scaling +is applied. If string, the corresponding automatic +loss scaling algorithm is used. Must be one of ‘Backoff’ +or ‘LogMax’ (case insensitive). Only used when dtype=”mixed”.
      • +
      • on_horovod – whether the model is run on horovod.
      Returns:

      Training op.

      -
      Raises:

      ValueError – if: -* loss is an invalid type or shape. -* global_step is an invalid type or shape. -* learning_rate is an invalid type or value. -* optimizer has the wrong type. -* clip_gradients is neither float nor callable. -* learning_rate and learning_rate_decay_fn are supplied, but no

      -
      -

      global_step is available.

      -
      -
        -
      • gradients is empty.
      • -
      +
      Returns:

      training op.

      +
      +
      +optimizers.optimizers.post_process_gradients(grads_and_vars, summaries, lr, clip_gradients, larc_params)[source]
      +

      Applies post processing to gradients, i.e. clipping, LARC, summaries.

      +
      + +
      +
      +optimizers.optimizers.reduce_gradients(grads_and_vars, on_horovod)[source]
      +
      +

      mp_wrapper

      @@ -541,7 +364,7 @@

      optimizersobject

      -SUPPORTED_ALGOS = ['Backoff', 'LogMax']
      +SUPPORTED_ALGOS = ['backoff', 'logmax']
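For intuition, the static (float) form of loss_scaling can be sketched as follows; this is an illustrative TF1-style fragment, not the library's implementation:

import tensorflow as tf

loss_scaling = 128.0  # example static scale; comes from the config in practice
opt = tf.train.MomentumOptimizer(0.01, momentum=0.9)

# Toy variable and loss, only to keep the sketch self-contained.
w = tf.get_variable("w", shape=[10], initializer=tf.zeros_initializer())
loss = tf.reduce_sum(tf.square(w - 1.0))

# Scale the loss up before computing gradients, then unscale before the update.
grads_and_vars = opt.compute_gradients(loss * loss_scaling)
grads_and_vars = [(g / loss_scaling, v) for g, v in grads_and_vars]
train_op = opt.apply_gradients(grads_and_vars)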
      diff --git a/docs/html/api-docs/parts.cnns.html b/docs/html/api-docs/parts.cnns.html new file mode 100644 index 000000000..cc9297c39 --- /dev/null +++ b/docs/html/api-docs/parts.cnns.html @@ -0,0 +1,307 @@ + + + + + + + + + + + cnns — OpenSeq2Seq 0.2 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      + + + + +
      + + + + + +
      + +
      + + + + + + + + + + + + + + + + + +
      + + + + +
      +
      +
      +
      + +
      +

      cnns

      +
      +

      conv_blocks

      +
      +
      +parts.cnns.conv_blocks.conv_actv(type, name, inputs, filters, kernel_size, activation_fn, strides, padding, regularizer, training, data_format)[source]
      +

      Helper function that applies convolution and activation.

      + +++ + + + +
      Parameters:type – the following types are supported +‘conv1d’, ‘conv2d’
      +
      + +
      +
      +parts.cnns.conv_blocks.conv_bn_actv(type, name, inputs, filters, kernel_size, activation_fn, strides, padding, regularizer, training, data_format, bn_momentum, bn_epsilon)[source]
      +

      Helper function that applies convolution, batch norm and activation. +Accepts inputs in ‘channels_last’ format only.

      + +++ + + + +
      Parameters:type – the following types are supported +‘conv1d’, ‘conv2d’
      +
      + +
      +
      + + +
      + +
      + + +
      +
      + +
      + +
      + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/html/api-docs/parts.convs2s.html b/docs/html/api-docs/parts.convs2s.html new file mode 100644 index 000000000..264585c1d --- /dev/null +++ b/docs/html/api-docs/parts.convs2s.html @@ -0,0 +1,460 @@ + + + + + + + + + + + convs2s — OpenSeq2Seq 0.2 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      + + + + +
      + + + + + +
      + +
      + + + + + + + + + + + + + + + + + +
      + + + + +
      +
      +
      +
      + +
      +

      convs2s

      +
      +

      attention_wn_layer

      +

      Implementation of the attention layer for convs2s. +Inspired from https://github.com/tobyyouup/conv_seq2seq

      +
      +
      +class parts.convs2s.attention_wn_layer.AttentionLayerNormalized(in_dim, embed_size, layer_id, add_res)[source]
      +

      Bases: tensorflow.python.layers.base.Layer

      +

      Attention layer for convs2s with weight normalization

      +
      +
      +__init__(in_dim, embed_size, layer_id, add_res)[source]
      +

      initializes the attention layer. +It uses weight normalization for linear projections +(Salimans & Kingma, 2016) w = g * v/2-norm(v)

      + +++ + + + +
      Parameters:
        +
      • in_dim – int last dimension of the inputs
      • +
      • embed_size – int target embedding size
      • +
      • layer_id – int the id of current convolution layer
      • +
      • add_res – bool whether residual connection should be added or not
      • +
      +
      +
      + +
      +
      +call(input, target_embed, encoder_output_a, encoder_output_b, input_attention_bias)[source]
      +

      Calculates the attention vectors.

      + +++ + + + + + +
      Parameters:
        +
      • input – A float32 tensor with shape [batch_size, length, in_dim]
      • +
      • target_embed – A float32 tensor with shape [batch_size, length, in_dim] +containing the target embeddings
      • +
      • encoder_output_a – A float32 tensor with shape [batch_size, length, out_dim] +containing the first encoder outputs, uses as the keys
      • +
      • encoder_output_b – A float32 tensor with shape [batch_size, length, src_emb_dim] +containing the second encoder outputs, uses as the values
      • +
      • input_attention_bias – A float32 tensor with shape [batch_size, length, 1] +containing the bias used to mask the paddings
      • +
      +
      Returns:

      float32 tensor with shape [batch_size, length, out_dim].


      conv_wn_layer

      +

Implementation of a 1D convolutional layer with weight normalization. Inspired by https://github.com/tobyyouup/conv_seq2seq

      +
      +
      +class parts.convs2s.conv_wn_layer.Conv1DNetworkNormalized(in_dim, out_dim, kernel_width, mode, layer_id, hidden_dropout, conv_padding, decode_padding)[source]
      +

      Bases: tensorflow.python.layers.base.Layer

      +

      1D convolutional layer with weight normalization

      +
      +
      +__init__(in_dim, out_dim, kernel_width, mode, layer_id, hidden_dropout, conv_padding, decode_padding)[source]
      +

Initializes the 1D convolution layer. It uses weight normalization (Salimans & Kingma, 2016): w = g * v / ||v||_2

Parameters:
• in_dim – int, last dimension of the inputs
• out_dim – int, new dimension for the output
• kernel_width – int, width of the kernel
• mode – str, the current mode
• layer_id – int, the id of the current convolution layer
• hidden_dropout – float, the keep-dropout value used on the input. Give 1.0 if no dropout. It is used to initialize the weights of the convolution.
• conv_padding – str, the type of padding done for the convolution
• decode_padding – bool, specifies whether this convolution layer is in the decoder; in the decoder, padding is done explicitly before the convolution
      + +
      +
      +call(input)[source]
      +

      Applies convolution with gated linear units on x.

Parameters: x – A float32 tensor with shape [batch_size, length, in_dim]
Returns: float32 tensor with shape [batch_size, length, out_dim].
      + +
      +
      +gated_linear_units(inputs)[source]
      +

      Gated Linear Units (GLU) on x.

Parameters: x – A float32 tensor with shape [batch_size, length, 2*out_dim]
Returns: float32 tensor with shape [batch_size, length, out_dim].
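A hedged usage sketch of the convolution block (the mode and padding strings are assumptions, not taken from the source). Per the gated_linear_units docstring above, the GLU maps 2*out_dim channels back to out_dim:

    import tensorflow as tf
    from parts.convs2s.conv_wn_layer import Conv1DNetworkNormalized

    x = tf.random_normal([16, 30, 256])          # [batch, length, in_dim]
    conv = Conv1DNetworkNormalized(
        in_dim=256,
        out_dim=256,
        kernel_width=3,
        mode="train",            # assumed value of the current mode
        layer_id=0,
        hidden_dropout=0.9,      # keep probability applied to the input
        conv_padding="SAME",     # assumed padding type
        decode_padding=False,    # True only inside the decoder
    )
    y = conv(x)                  # GLU output: [16, 30, out_dim]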

      ffn_wn_layer

      +

Implementation of a fully connected network with weight normalization. Inspired by https://github.com/tobyyouup/conv_seq2seq

      +
      +
      +class parts.convs2s.ffn_wn_layer.FeedFowardNetworkNormalized(in_dim, out_dim, dropout, var_scope_name)[source]
      +

      Bases: tensorflow.python.layers.base.Layer

      +

      Fully connected feedforward network with weight normalization

      +
      +
      +__init__(in_dim, out_dim, dropout, var_scope_name)[source]
      +

Initializes the linear layer. This layer projects from in_dim-dimensional space to out_dim-dimensional space. It uses weight normalization (Salimans & Kingma, 2016): w = g * v / ||v||_2

Parameters:
• in_dim – int, last dimension of the inputs
• out_dim – int, new dimension for the output
• dropout – float, the keep-dropout value used in the previous layer. It is used to initialize the weights. Give 1.0 if no dropout.
• var_scope_name – str, the scope name for the weight variables
      + +
      +
      +call(x)[source]
      +

      Projects x with its linear transformation.

Parameters: x – A float32 tensor with shape [batch_size, length, in_dim]
Returns: float32 tensor with shape [batch_size, length, out_dim].
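To make the weight-normalization formula quoted above concrete, the following stand-alone sketch computes w = g * v / ||v||_2 with plain TensorFlow ops (an illustration of the parametrization, not the layer's actual code; normalizing per output unit is an assumption):

    import tensorflow as tf

    in_dim, out_dim = 256, 512
    v = tf.get_variable("v", [in_dim, out_dim])        # direction
    g = tf.get_variable("g", [out_dim])                # learned scale per output unit
    w = g * v / tf.norm(v, ord=2, axis=0)              # w = g * v / ||v||_2

    x = tf.random_normal([16, 30, in_dim])             # [batch, length, in_dim]
    y = tf.tensordot(x, w, axes=[[2], [0]])            # -> [batch, length, out_dim]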
\ No newline at end of file
diff --git a/docs/html/api-docs/parts.html b/docs/html/api-docs/parts.html
index 214851326..06963ed73 100644
--- a/docs/html/api-docs/parts.html
+++ b/docs/html/api-docs/parts.html
@@ -101,6 +101,8 @@
    • parts
    • utils
    • @@ -189,13 +191,22 @@
    • transformer
    • +
    • convs2s +
    • +
    • cnns +
diff --git a/docs/html/api-docs/parts.rnns.html b/docs/html/api-docs/parts.rnns.html
index 1950858be..583c41dea 100644
--- a/docs/html/api-docs/parts.rnns.html
+++ b/docs/html/api-docs/parts.rnns.html
@@ -110,6 +110,8 @@
  • transformer
  • +
  • convs2s
  • +
  • cnns
  • utils
  • @@ -1521,16 +1523,29 @@

    utils

    -
    -parts.rnns.utils.create_rnn_cell(cell_type, cell_params, num_layers=1, dp_input_keep_prob=1.0, dp_output_keep_prob=1.0, residual_connections=False, wrap_to_multi_rnn=True)[source]
    -

    TODO: MOVE THIS properly to utils. Write doc -:param cell_type: -:param cell_params: -:param num_layers: -:param dp_input_keep_prob: -:param dp_output_keep_prob: -:param residual_connections: -:return:

    +
    +parts.rnns.utils.single_cell(cell_class, cell_params, dp_input_keep_prob=1.0, dp_output_keep_prob=1.0, residual_connections=False)[source]
    +

Creates an instance of the RNN cell. Such a cell describes one step of one layer and can include residual connection and/or dropout.

Parameters:
• cell_class – TensorFlow RNN cell class
• cell_params (dict) – cell parameters
• dp_input_keep_prob (float) – (default: 1.0) input dropout keep probability
• dp_output_keep_prob (float) – (default: 1.0) output dropout keep probability
• residual_connections (bool) – whether to add residual connection
Returns: TF RNN instance
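A hedged sketch of building a stacked RNN from single_cell (TensorFlow 1.x cell classes; parameter values are illustrative):

    import tensorflow as tf
    from parts.rnns.utils import single_cell

    def make_cell():
        return single_cell(
            cell_class=tf.nn.rnn_cell.LSTMCell,
            cell_params={"num_units": 512},
            dp_input_keep_prob=1.0,     # no input dropout
            dp_output_keep_prob=0.8,    # 20% output dropout
            residual_connections=False,
        )

    # single_cell describes one step of one layer; stack the layers yourself:
    stacked = tf.nn.rnn_cell.MultiRNNCell([make_cell() for _ in range(2)])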

    +
diff --git a/docs/html/api-docs/parts.transformer.html b/docs/html/api-docs/parts.transformer.html
index 55d5d135c..e891a2849 100644
--- a/docs/html/api-docs/parts.transformer.html
+++ b/docs/html/api-docs/parts.transformer.html
@@ -33,7 +33,7 @@
@@ -103,13 +103,14 @@
  • transformer
  • +
  • convs2s
  • +
  • cnns
  • utils
  • @@ -675,51 +676,6 @@ -
    -
    -

    beam_search_test

    -

    Test beam search helper methods.

    -
    -
    -class parts.transformer.beam_search_test.BeamSearchHelperTests(methodName='runTest')[source]
    -

    Bases: tensorflow.python.framework.test_util.TensorFlowTestCase

    -
    -
    -test_expand_to_beam_size()[source]
    -
    - -
    -
    -test_flatten_beam_dim()[source]
    -
    - -
    -
    -test_gather_beams()[source]
    -
    - -
    -
    -test_gather_topk_beams()[source]
    -
    - -
    -
    -test_get_shape_keep_last_dim()[source]
    -
    - -
    -
    -test_shape_list()[source]
    -
    - -
    -
    -test_unflatten_beam_dim()[source]
    -
    - -
    -

    common

    @@ -770,7 +726,7 @@

    Implementation of embedding layer with shared weights.

    -class parts.transformer.embedding_layer.EmbeddingSharedWeights(vocab_size, hidden_size, pad2eight=False)[source]
    +class parts.transformer.embedding_layer.EmbeddingSharedWeights(vocab_size, hidden_size, pad_vocab_to_eight=False, init_var=None, embed_scale=True, pad_sym=0, mask_paddings=True)[source]

    Bases: tensorflow.python.layers.base.Layer

    Calculates input embeddings and pre-softmax linear with shared weights.
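Given the widened constructor shown in this hunk, a hedged instantiation sketch (flag behaviour is inferred from the argument names; shapes in the comments are the usual embedding-layer shapes, not quoted from the source):

    import tensorflow as tf
    from parts.transformer.embedding_layer import EmbeddingSharedWeights

    emb = EmbeddingSharedWeights(
        vocab_size=32000,
        hidden_size=512,
        pad_vocab_to_eight=False,
        pad_sym=0,                 # id treated as padding
        mask_paddings=True,        # assumed: zero out embeddings at padded positions
    )
    ids = tf.constant([[5, 7, 2, 0]])
    embedded = emb(ids)            # input embeddings, roughly [1, 4, 512]
    logits = emb.linear(embedded)  # pre-softmax projection with the shared weights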

    @@ -875,7 +831,7 @@
    -parts.transformer.utils.get_padding(x, padding_value=0)[source]
    +parts.transformer.utils.get_padding(x, padding_value=0, dtype=tf.float32)[source]

    Return float tensor representing the padding values in x.

    @@ -884,6 +840,7 @@ @@ -901,7 +858,7 @@
    -parts.transformer.utils.get_padding_bias(x)[source]
    +parts.transformer.utils.get_padding_bias(x, res_rank=4, pad_sym=0)[source]

    Calculate bias tensor from padding values in tensor.

    Bias tensor that is added to the pre-softmax multi-headed attention logits, which has shape [batch_size, num_heads, length, length]. The tensor is zero at @@ -910,9 +867,18 @@

    - + - +
    Parameters:
    • x – int tensor with any shape
    • padding_value – int value that
    • +
    • dtype – type of the output
    Parameters:x – int tensor with shape [batch_size, length]
    Parameters:
      +
    • x – int tensor with shape [batch_size, length]
    • +
    • res_rank – int indicates the rank of attention_bias.
    • +
    • dtype – type of the output attention_bias
    • +
    • pad_sym – int the symbol used for padding
    • +
    +
    Returns:Attention bias tensor of shape [batch_size, 1, 1, length].
    Returns:

Attention bias tensor of shape [batch_size, 1, 1, length] if res_rank = 4 (for Transformer) or [batch_size, 1, length] if res_rank = 3 (for ConvS2S).
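A short sketch of the two utilities with their updated arguments (output shapes follow the docstrings; which value marks padding in get_padding's output is not restated here, so the comment stays generic):

    import tensorflow as tf
    from parts.transformer import utils as transformer_utils

    ids = tf.constant([[5, 7, 2, 0],
                       [3, 0, 0, 0]])        # 0 is the padding symbol

    pad = transformer_utils.get_padding(ids, padding_value=0, dtype=tf.float32)
    # float32 tensor of the same shape as ids, marking the padded positions

    bias_t = transformer_utils.get_padding_bias(ids)              # [2, 1, 1, 4], Transformer (res_rank=4)
    bias_c = transformer_utils.get_padding_bias(ids, res_rank=3)  # [2, 1, 4], ConvS2S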

    +
    @@ -955,7 +921,7 @@
    +
    +
    +utils.utils.collect_if_horovod(value, hvd, mode='sum')[source]
    +

Collects values from all workers if run on Horovod. Note that on all workers except the first, this function will return None.

Parameters:
• value – value to collect.
• hvd – horovod.tensorflow module or None
• mode – could be “sum”, “mean” or “gather”, indicating reduce_sum or gather. For “sum” and “mean” the value has to be numerical; for “gather”, the value has to be iterable.
Returns: collected results if run on Horovod or value otherwise.
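A hedged usage sketch (assumes Horovod is installed and initialized; with hvd=None the value is simply returned unchanged):

    import horovod.tensorflow as hvd
    from utils.utils import collect_if_horovod

    hvd.init()
    per_worker_loss = 0.42                                       # numerical value on this worker
    total = collect_if_horovod(per_worker_loss, hvd, mode="sum")
    if total is not None:                                        # only the first worker gets the result
        print("loss summed over all workers:", total)

    # mode="gather" expects an iterable value, e.g. a list of predictions
    all_preds = collect_if_horovod(["pred_a", "pred_b"], hvd, mode="gather")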

    +
    +
    +
    utils.utils.deco_print(line, offset=0, start='*** ', end='\n')[source]
    @@ -536,8 +561,8 @@

    utils

    -
    -utils.utils.iterate_data_layer(model, dl_id, sess, compute_loss, mode, verbose)[source]
    +
    +utils.utils.iterate_data(model, sess, compute_loss, mode, verbose)[source]
    @@ -579,7 +604,7 @@

    utils diff --git a/docs/html/genindex.html b/docs/html/genindex.html index 3e0a46a10..593fd7f67 100644 --- a/docs/html/genindex.html +++ b/docs/html/genindex.html @@ -201,6 +201,8 @@

    _

  • (decoders.rnn_decoders.BeamSearchRNNDecoderWithAttention method)
  • (decoders.rnn_decoders.RNNDecoderWithAttention method) +
  • +
  • (encoders.cnn_encoder.CNNEncoder method)
  • (encoders.ds2_encoder.DeepSpeech2Encoder method)
  • @@ -209,8 +211,12 @@

    _

  • (encoders.rnn_encoders.BidirectionalRNNEncoderWithEmbedding method)
  • (encoders.rnn_encoders.GNMTLikeEncoderWithEmbedding method) +
  • +
  • (encoders.rnn_encoders.GNMTLikeEncoderWithEmbedding_cuDNN method)
  • (encoders.rnn_encoders.UnidirectionalRNNEncoderWithEmbedding method) +
  • +
  • (encoders.w2l_encoder.Wave2LetterEncoder method)
  • (losses.ctc_loss.CTCLoss method)
  • @@ -224,7 +230,11 @@

    _

  • (models.model.Model method)
  • -
  • (optimizers.optimizers.DistributedOptimizer method) +
  • (parts.convs2s.attention_wn_layer.AttentionLayerNormalized method) +
  • +
  • (parts.convs2s.conv_wn_layer.Conv1DNetworkNormalized method) +
  • +
  • (parts.convs2s.ffn_wn_layer.FeedFowardNetworkNormalized method)
  • (parts.rnns.attention_wrapper.AttentionWrapper method)
  • @@ -249,10 +259,6 @@

    _

  • (utils.hooks.BroadcastGlobalVariablesHook method)
  • -
  • _adaptive_max_norm() (in module optimizers.optimizers) -
  • -
  • _add_scaled_noise_to_gradients() (in module optimizers.optimizers) -
  • _aspect_preserving_resize() (in module data.image2label.imagenet_preprocessing)
  • _batch_examples() (in module data.text2text.t2t) @@ -327,6 +333,8 @@

    _

  • (encoders.rnn_encoders.BidirectionalRNNEncoderWithEmbedding method)
  • (encoders.rnn_encoders.UnidirectionalRNNEncoderWithEmbedding method) +
  • +
  • (encoders.w2l_encoder.Wave2LetterEncoder method)
  • _escape_token() (in module data.text2text.tokenizer) @@ -358,6 +366,18 @@

    _

  • _get_new_alive_state() (parts.transformer.beam_search.SequenceBeamSearch method)
  • _get_new_finished_state() (parts.transformer.beam_search.SequenceBeamSearch method) +
  • +
  • _get_num_objects_per_step() (models.image2label.Image2Label method) + +
  • +
  • _get_symbols_to_logits_fn() (decoders.convs2s_decoder.ConvS2SDecoder method)
  • _grow_alive_seq() (parts.transformer.beam_search.SequenceBeamSearch method)
  • @@ -379,11 +399,9 @@

    _

  • _maybe_split_batch_beams() (parts.rnns.rnn_beam_search_decoder.BeamSearchDecoder method)
  • -
  • _mean_image_subtraction() (in module data.image2label.imagenet_preprocessing) +
  • _mean_image_subtraction_and_normalization() (in module data.image2label.imagenet_preprocessing)
  • _merge_batch_beams() (parts.rnns.rnn_beam_search_decoder.BeamSearchDecoder method) -
  • -
  • _multiply_gradients() (in module optimizers.optimizers)
  • _native_to_unicode() (in module data.text2text.tokenizer)
  • @@ -431,8 +449,6 @@

    _

    A

    @@ -701,6 +733,8 @@

    E

      +
    • encoders.encoder (module) +
    • encoders.resnet_blocks (module)
    • encoders.resnet_encoder (module)
    • encoders.rnn_encoders (module) +
    • +
    • encoders.w2l_encoder (module)
    • END_OF_CHOICE (data.text2text.text2text.SpecialTextTokens attribute)
    • @@ -758,6 +798,8 @@

      F

      + -
      • parts.rnns.rnn_beam_search_decoder (module)
      • parts.rnns.slstm (module) @@ -1236,8 +1322,6 @@

        P

      • parts.transformer.attention_layer (module)
      • parts.transformer.beam_search (module) -
      • -
      • parts.transformer.beam_search_test (module)
      • parts.transformer.common (module)
      • @@ -1250,11 +1334,19 @@

        P

      • piecewise_constant() (in module optimizers.lr_policies)
      • poly_decay() (in module optimizers.lr_policies) +
      • +
      • post_process_gradients() (in module optimizers.optimizers) +
      • +
      • predict() (decoders.convs2s_decoder.ConvS2SDecoder method)
      • PrePostProcessingWrapper (class in parts.transformer.common)
      • -
      • preprocess_image() (in module data.image2label.imagenet_preprocessing) +
      • preprocess_image() (data.image2label.image2label.CifarDataLayer method) + +
      • PrintLossAndTimeHook (class in utils.hooks)
      • PrintSamplesHook (class in utils.hooks) @@ -1265,14 +1357,16 @@

        P

        R

      • split_heads() (parts.transformer.attention_layer.Attention method)
      • -
      • src_emb_size (encoders.rnn_encoders.BidirectionalRNNEncoderWithEmbedding attribute) +
      • src_emb_size (encoders.convs2s_encoder.ConvS2SEncoder attribute)
        • -
        • src_vocab_size (encoders.rnn_encoders.BidirectionalRNNEncoderWithEmbedding attribute) +
        • src_vocab_size (encoders.convs2s_encoder.ConvS2SEncoder attribute)
            +
          • (encoders.rnn_encoders.BidirectionalRNNEncoderWithEmbedding attribute) +
          • (encoders.rnn_encoders.GNMTLikeEncoderWithEmbedding attribute) +
          • +
          • (encoders.rnn_encoders.GNMTLikeEncoderWithEmbedding_cuDNN attribute)
          • (encoders.rnn_encoders.UnidirectionalRNNEncoderWithEmbedding attribute)
          • @@ -1357,30 +1461,16 @@

            S

            T

            - +
            • train() (in module utils.funcs)
            • train_input_fn() (in module data.text2text.t2t) @@ -1424,6 +1514,10 @@

              U

              W

              + + + + + + + + + + + + + @@ -392,6 +412,36 @@

              Python Module Index

              + + + + + + + + + + + + + + + + + + - - -
              • write() (utils.utils.Logger method)
              • diff --git a/docs/html/in-depth-tutorials/using-existing-models.html b/docs/html/in-depth-tutorials/using-existing-models.html index 54eb6113a..43480f24c 100644 --- a/docs/html/in-depth-tutorials/using-existing-models.html +++ b/docs/html/in-depth-tutorials/using-existing-models.html @@ -304,6 +304,11 @@

How to run models
mixed precision training section in docs.
• automatic_loss_scaling — automatic loss scaling mode. Could be either None, “Backoff” or “Logmax”. For details see mixed precision training section in docs.
              • +
• loss_scaling — could be float or string. If float, static loss scaling is applied. If string, the corresponding automatic loss scaling algorithm is used. Must be one of ‘Backoff’ or ‘LogMax’ (case insensitive). Only used when dtype="mixed". For details see mixed precision training section in docs (a short config sketch follows this list).
              • summaries (list) — which summaries to log. Could contain “learning_rate”, “gradients”, “gradient_norm”, “global_gradient_norm”, “variables”, “variable_norm”.
              • +
• iter_size (int) — use this parameter to emulate large batches. The gradients will be accumulated for iter_size number of steps before applying the update.
              • larc_params — dictionary with parameters for LARC (or LARS) optimization algorithms. Can contain the following parameters:
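A hedged config fragment illustrating the loss_scaling and iter_size options from this list (keys as documented; values are illustrative and the LARC sub-parameters are omitted):

    base_params = {
        "dtype": "mixed",
        "loss_scaling": "Backoff",   # or a static float such as 512.0, or "LogMax"
        "iter_size": 4,              # accumulate gradients for 4 steps before each update
        "summaries": ["learning_rate", "gradients", "global_gradient_norm"],
        # "larc_params": {...},      # see the LARC parameters listed below
    }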
              decoders
                  + decoders.convs2s_decoder +
                  @@ -266,6 +271,16 @@

              Python Module Index

              encoders
                  + encoders.cnn_encoder +
                  + encoders.convs2s_encoder +
                  @@ -291,6 +306,11 @@

              Python Module Index

                  encoders.rnn_encoders
                  + encoders.w2l_encoder +
               
              l
              parts
                  + parts.cnns +
                  + parts.cnns.conv_blocks +
                  + parts.convs2s +
                  + parts.convs2s.attention_wn_layer +
                  + parts.convs2s.conv_wn_layer +
                  + parts.convs2s.ffn_wn_layer +
                  @@ -447,11 +497,6 @@

              Python Module Index

                  parts.transformer.beam_search
                  - parts.transformer.beam_search_test -
                  diff --git a/docs/html/searchindex.js b/docs/html/searchindex.js index dd1d04595..52eb7af85 100644 --- a/docs/html/searchindex.js +++ b/docs/html/searchindex.js @@ -1 +1 @@ -Search.setIndex({docnames:["api-docs/data","api-docs/data.image2label","api-docs/data.speech2text","api-docs/data.text2text","api-docs/decoders","api-docs/encoders","api-docs/losses","api-docs/models","api-docs/modules","api-docs/optimizers","api-docs/parts","api-docs/parts.rnns","api-docs/parts.transformer","api-docs/utils","distr-training","extending","extending/adding-new-data-layer","extending/adding-new-decoder","extending/adding-new-encoder","extending/adding-new-loss","getting-started","getting-started/asr","getting-started/nmt","in-depth-tutorials","in-depth-tutorials/internal-structure","in-depth-tutorials/using-existing-models","index","installation-instructions","mixed-precision","models-and-recipes"],envversion:53,filenames:["api-docs/data.rst","api-docs/data.image2label.rst","api-docs/data.speech2text.rst","api-docs/data.text2text.rst","api-docs/decoders.rst","api-docs/encoders.rst","api-docs/losses.rst","api-docs/models.rst","api-docs/modules.rst","api-docs/optimizers.rst","api-docs/parts.rst","api-docs/parts.rnns.rst","api-docs/parts.transformer.rst","api-docs/utils.rst","distr-training.rst","extending.rst","extending/adding-new-data-layer.rst","extending/adding-new-decoder.rst","extending/adding-new-encoder.rst","extending/adding-new-loss.rst","getting-started.rst","getting-started/asr.rst","getting-started/nmt.rst","in-depth-tutorials.rst","in-depth-tutorials/internal-structure.rst","in-depth-tutorials/using-existing-models.rst","index.rst","installation-instructions.rst","mixed-precision.rst","models-and-recipes.rst"],objects:{"":{data:[0,0,0,"-"],decoders:[4,0,0,"-"],encoders:[5,0,0,"-"],losses:[6,0,0,"-"],models:[7,0,0,"-"],optimizers:[9,0,0,"-"],parts:[10,0,0,"-"],utils:[13,0,0,"-"]},"data.data_layer":{DataLayer:[0,1,1,""]},"data.data_layer.DataLayer":{__init__:[0,2,1,""],build_graph:[0,2,1,""],get_optional_params:[0,3,1,""],get_required_params:[0,3,1,""],get_size_in_samples:[0,2,1,""],input_tensors:[0,4,1,""],iterator:[0,4,1,""],params:[0,4,1,""]},"data.image2label":{image2label:[1,0,0,"-"],imagenet_preprocessing:[1,0,0,"-"]},"data.image2label.image2label":{ImagenetDataLayer:[1,1,1,""]},"data.image2label.image2label.ImagenetDataLayer":{build_graph:[1,2,1,""],get_optional_params:[1,3,1,""],get_required_params:[1,3,1,""],get_size_in_samples:[1,2,1,""],input_tensors:[1,4,1,""],iterator:[1,4,1,""],split_data:[1,2,1,""]},"data.image2label.imagenet_preprocessing":{_aspect_preserving_resize:[1,5,1,""],_central_crop:[1,5,1,""],_decode_crop_and_flip:[1,5,1,""],_mean_image_subtraction:[1,5,1,""],_parse_example_proto:[1,5,1,""],_resize_image:[1,5,1,""],_smallest_size_at_least:[1,5,1,""],parse_record:[1,5,1,""],preprocess_image:[1,5,1,""]},"data.speech2text":{speech2text:[2,0,0,"-"],speech_utils:[2,0,0,"-"]},"data.speech2text.speech2text":{Speech2TextDataLayer:[2,1,1,""]},"data.speech2text.speech2text.Speech2TextDataLayer":{__init__:[2,2,1,""],_parse_audio_element:[2,2,1,""],_parse_audio_transcript_element:[2,2,1,""],build_graph:[2,2,1,""],get_optional_params:[2,3,1,""],get_required_params:[2,3,1,""],get_size_in_samples:[2,2,1,""],input_tensors:[2,4,1,""],iterator:[2,4,1,""],split_data:[2,2,1,""]},"data.speech2text.speech_utils":{augment_audio_signal:[2,5,1,""],get_speech_features:[2,5,1,""],get_speech_features_from_file:[2,5,1,""]},"data.text2text":{t2t:[3,0,0,"-"],text2text:[3,0,0,"-"],toke
nizer:[3,0,0,"-"]},"data.text2text.t2t":{_batch_examples:[3,5,1,""],_create_min_max_boundaries:[3,5,1,""],_filter_max_length:[3,5,1,""],_get_example_length:[3,5,1,""],_load_records:[3,5,1,""],_parse_example:[3,5,1,""],_read_and_batch_from_files:[3,5,1,""],eval_input_fn:[3,5,1,""],train_input_fn:[3,5,1,""]},"data.text2text.text2text":{ParallelTextDataLayer:[3,1,1,""],SpecialTextTokens:[3,1,1,""],TransformerDataLayer:[3,1,1,""]},"data.text2text.text2text.ParallelTextDataLayer":{build_graph:[3,2,1,""],get_optional_params:[3,3,1,""],get_required_params:[3,3,1,""],get_size_in_samples:[3,2,1,""],input_tensors:[3,4,1,""],iterator:[3,4,1,""]},"data.text2text.text2text.SpecialTextTokens":{END_OF_CHOICE:[3,4,1,""],EOS_ID:[3,4,1,""],OUT_OF_BUCKET:[3,4,1,""],PAD_ID:[3,4,1,""],S_ID:[3,4,1,""],UNK_ID:[3,4,1,""]},"data.text2text.text2text.TransformerDataLayer":{build_graph:[3,2,1,""],get_optional_params:[3,3,1,""],get_required_params:[3,3,1,""],input_tensors:[3,4,1,""],iterator:[3,4,1,""]},"data.text2text.tokenizer":{Subtokenizer:[3,1,1,""],_count_and_gen_subtokens:[3,5,1,""],_count_tokens:[3,5,1,""],_escape_token:[3,5,1,""],_filter_and_bucket_subtokens:[3,5,1,""],_gen_new_subtoken_list:[3,5,1,""],_generate_alphabet_dict:[3,5,1,""],_generate_subtokens:[3,5,1,""],_generate_subtokens_with_target_vocab_size:[3,5,1,""],_join_tokens_to_string:[3,5,1,""],_list_to_index_dict:[3,5,1,""],_load_vocab_file:[3,5,1,""],_native_to_unicode:[3,5,1,""],_save_vocab_file:[3,5,1,""],_split_string_to_tokens:[3,5,1,""],_split_token_to_subtokens:[3,5,1,""],_unescape_token:[3,5,1,""],_unicode_to_native:[3,5,1,""]},"data.text2text.tokenizer.Subtokenizer":{__init__:[3,2,1,""],_subtoken_ids_to_tokens:[3,2,1,""],_token_to_subtoken_ids:[3,2,1,""],decode:[3,2,1,""],encode:[3,2,1,""],init_from_files:[3,3,1,""]},"data.utils":{load_pre_existing_vocabulary:[0,5,1,""],pad_vocab_to_eight:[0,5,1,""]},"decoders.decoder":{Decoder:[4,1,1,""]},"decoders.decoder.Decoder":{__init__:[4,2,1,""],_cast_types:[4,2,1,""],_decode:[4,2,1,""],decode:[4,2,1,""],get_optional_params:[4,3,1,""],get_required_params:[4,3,1,""],mode:[4,4,1,""],name:[4,4,1,""],params:[4,4,1,""]},"decoders.fc_decoders":{FullyConnectedCTCDecoder:[4,1,1,""],FullyConnectedDecoder:[4,1,1,""],FullyConnectedTimeDecoder:[4,1,1,""]},"decoders.fc_decoders.FullyConnectedCTCDecoder":{__init__:[4,2,1,""],get_optional_params:[4,3,1,""],get_required_params:[4,3,1,""]},"decoders.fc_decoders.FullyConnectedDecoder":{__init__:[4,2,1,""],_decode:[4,2,1,""],get_required_params:[4,3,1,""]},"decoders.fc_decoders.FullyConnectedTimeDecoder":{__init__:[4,2,1,""],_decode:[4,2,1,""],get_optional_params:[4,3,1,""],get_required_params:[4,3,1,""]},"decoders.rnn_decoders":{BeamSearchRNNDecoderWithAttention:[4,1,1,""],RNNDecoderWithAttention:[4,1,1,""]},"decoders.rnn_decoders.BeamSearchRNNDecoderWithAttention":{__init__:[4,2,1,""],_decode:[4,2,1,""],get_optional_params:[4,3,1,""]},"decoders.rnn_decoders.RNNDecoderWithAttention":{__init__:[4,2,1,""],_build_attention:[4,2,1,""],_decode:[4,2,1,""],get_optional_params:[4,3,1,""],get_required_params:[4,3,1,""]},"encoders.ds2_encoder":{DeepSpeech2Encoder:[5,1,1,""],conv2d_bn_actv:[5,5,1,""],rnn_cell:[5,5,1,""],row_conv:[5,5,1,""]},"encoders.ds2_encoder.DeepSpeech2Encoder":{__init__:[5,2,1,""],_encode:[5,2,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""]},"encoders.encoder":{Encoder:[5,1,1,""]},"encoders.encoder.Encoder":{__init__:[5,2,1,""],_cast_types:[5,2,1,""],_encode:[5,2,1,""],encode:[5,2,1,""],get_optional_params:[5,3,1,""],get_required_params:
[5,3,1,""],mode:[5,4,1,""],name:[5,4,1,""],params:[5,4,1,""]},"encoders.resnet_blocks":{batch_norm:[5,5,1,""],block_layer:[5,5,1,""],bottleneck_block_v1:[5,5,1,""],bottleneck_block_v2:[5,5,1,""],building_block_v1:[5,5,1,""],building_block_v2:[5,5,1,""],conv2d_fixed_padding:[5,5,1,""],fixed_padding:[5,5,1,""]},"encoders.resnet_encoder":{ResNetEncoder:[5,1,1,""]},"encoders.resnet_encoder.ResNetEncoder":{get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""]},"encoders.rnn_encoders":{BidirectionalRNNEncoderWithEmbedding:[5,1,1,""],GNMTLikeEncoderWithEmbedding:[5,1,1,""],UnidirectionalRNNEncoderWithEmbedding:[5,1,1,""]},"encoders.rnn_encoders.BidirectionalRNNEncoderWithEmbedding":{__init__:[5,2,1,""],_encode:[5,2,1,""],enc_emb_w:[5,4,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""],src_emb_size:[5,4,1,""],src_vocab_size:[5,4,1,""]},"encoders.rnn_encoders.GNMTLikeEncoderWithEmbedding":{__init__:[5,2,1,""],enc_emb_w:[5,4,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""],src_emb_size:[5,4,1,""],src_vocab_size:[5,4,1,""]},"encoders.rnn_encoders.UnidirectionalRNNEncoderWithEmbedding":{__init__:[5,2,1,""],_encode:[5,2,1,""],enc_emb_w:[5,4,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""],src_emb_size:[5,4,1,""],src_vocab_size:[5,4,1,""]},"losses.cross_entropy_loss":{CrossEntropyLoss:[6,1,1,""]},"losses.ctc_loss":{CTCLoss:[6,1,1,""],dense_to_sparse:[6,5,1,""]},"losses.ctc_loss.CTCLoss":{__init__:[6,2,1,""],_compute_loss:[6,2,1,""],get_optional_params:[6,3,1,""]},"losses.loss":{Loss:[6,1,1,""]},"losses.loss.Loss":{__init__:[6,2,1,""],_cast_types:[6,2,1,""],_compute_loss:[6,2,1,""],compute_loss:[6,2,1,""],get_optional_params:[6,3,1,""],get_required_params:[6,3,1,""],name:[6,4,1,""],params:[6,4,1,""]},"losses.sequence_loss":{BasicSequenceLoss:[6,1,1,""],CrossEntropyWithSmoothing:[6,1,1,""],PaddedCrossEntropyLossWithSmoothing:[6,1,1,""]},"losses.sequence_loss.BasicSequenceLoss":{__init__:[6,2,1,""],_compute_loss:[6,2,1,""],get_optional_params:[6,3,1,""],get_required_params:[6,3,1,""]},"losses.sequence_loss.CrossEntropyWithSmoothing":{__init__:[6,2,1,""],_compute_loss:[6,2,1,""],get_optional_params:[6,3,1,""],get_required_params:[6,3,1,""]},"losses.sequence_loss.PaddedCrossEntropyLossWithSmoothing":{get_optional_params:[6,3,1,""]},"models.encoder_decoder":{EncoderDecoderModel:[7,1,1,""]},"models.encoder_decoder.EncoderDecoderModel":{__init__:[7,2,1,""],_build_forward_pass_graph:[7,2,1,""],_create_decoder:[7,2,1,""],_create_encoder:[7,2,1,""],_create_loss:[7,2,1,""],decoder:[7,4,1,""],encoder:[7,4,1,""],get_optional_params:[7,3,1,""],get_required_params:[7,3,1,""],loss_computator:[7,4,1,""]},"models.image2label":{Image2Label:[7,1,1,""]},"models.image2label.Image2Label":{evaluate:[7,2,1,""],finalize_evaluation:[7,2,1,""],get_num_objects_per_step:[7,2,1,""],maybe_print_logs:[7,2,1,""]},"models.model":{Model:[7,1,1,""]},"models.model.Model":{__init__:[7,2,1,""],_build_forward_pass_graph:[7,2,1,""],clip_last_batch:[7,2,1,""],compile:[7,2,1,""],evaluate:[7,2,1,""],finalize_evaluation:[7,2,1,""],finalize_inference:[7,2,1,""],get_data_layer:[7,2,1,""],get_num_objects_per_step:[7,2,1,""],get_optional_params:[7,3,1,""],get_output_tensors:[7,2,1,""],get_required_params:[7,3,1,""],get_tf_dtype:[7,2,1,""],hvd:[7,4,1,""],infer:[7,2,1,""],last_step:[7,4,1,""],maybe_print_logs:[7,2,1,""],mode:[7,4,1,""],num_gpus:[7,4,1,""],on_horovod:[7,4,1,""],params:[7,4,1,""],steps_in_epoch:[7,4,1,""]},"models.speech2text":{Speech2Text:[7,1,1,""],levenshtein:[7,5,1,""],spars
e_tensor_to_chars:[7,5,1,""]},"models.speech2text.Speech2Text":{evaluate:[7,2,1,""],finalize_evaluation:[7,2,1,""],finalize_inference:[7,2,1,""],get_num_objects_per_step:[7,2,1,""],infer:[7,2,1,""],maybe_print_logs:[7,2,1,""]},"models.text2text":{Text2Text:[7,1,1,""],calculate_bleu:[7,5,1,""],transform_for_bleu:[7,5,1,""]},"models.text2text.Text2Text":{evaluate:[7,2,1,""],finalize_evaluation:[7,2,1,""],finalize_inference:[7,2,1,""],get_num_objects_per_step:[7,2,1,""],infer:[7,2,1,""],maybe_print_logs:[7,2,1,""]},"optimizers.automatic_loss_scaler":{AutomaticLossScaler:[9,1,1,""],BackoffScaler:[9,1,1,""],LogMaxScaler:[9,1,1,""]},"optimizers.automatic_loss_scaler.AutomaticLossScaler":{SUPPORTED_ALGOS:[9,4,1,""],check_grads:[9,3,1,""],loss_scale:[9,4,1,""],update_op:[9,2,1,""]},"optimizers.automatic_loss_scaler.BackoffScaler":{loss_scale:[9,4,1,""],update_op:[9,2,1,""]},"optimizers.automatic_loss_scaler.LogMaxScaler":{loss_scale:[9,4,1,""],update_op:[9,2,1,""]},"optimizers.lr_policies":{exp_decay:[9,5,1,""],fixed_lr:[9,5,1,""],piecewise_constant:[9,5,1,""],poly_decay:[9,5,1,""],transformer_policy:[9,5,1,""]},"optimizers.mp_wrapper":{MixedPrecisionOptimizerWrapper:[9,1,1,""],mp_regularizer_wrapper:[9,5,1,""]},"optimizers.mp_wrapper.MixedPrecisionOptimizerWrapper":{apply_gradients:[9,2,1,""],compute_gradients:[9,2,1,""]},"optimizers.optimizers":{DistributedOptimizer:[9,1,1,""],_adaptive_max_norm:[9,5,1,""],_add_scaled_noise_to_gradients:[9,5,1,""],_clip_gradients_by_norm:[9,5,1,""],_multiply_gradients:[9,5,1,""],adaptive_clipping_fn:[9,5,1,""],get_regularization_loss:[9,5,1,""],optimize_loss:[9,5,1,""]},"optimizers.optimizers.DistributedOptimizer":{__init__:[9,2,1,""],apply_gradients:[9,2,1,""],compute_gradients:[9,2,1,""]},"parts.rnns":{attention_wrapper:[11,0,0,"-"],flstm:[11,0,0,"-"],glstm:[11,0,0,"-"],gnmt:[11,0,0,"-"],rnn_beam_search_decoder:[11,0,0,"-"],slstm:[11,0,0,"-"],utils:[11,0,0,"-"]},"parts.rnns.attention_wrapper":{AttentionMechanism:[11,1,1,""],AttentionWrapper:[11,1,1,""],AttentionWrapperState:[11,1,1,""],BahdanauAttention:[11,1,1,""],BahdanauMonotonicAttention:[11,1,1,""],LuongAttention:[11,1,1,""],LuongMonotonicAttention:[11,1,1,""],hardmax:[11,5,1,""],monotonic_attention:[11,5,1,""],safe_cumprod:[11,5,1,""]},"parts.rnns.attention_wrapper.AttentionMechanism":{alignments_size:[11,4,1,""],state_size:[11,4,1,""]},"parts.rnns.attention_wrapper.AttentionWrapper":{__init__:[11,2,1,""],_item_or_tuple:[11,2,1,""],call:[11,2,1,""],output_size:[11,4,1,""],state_size:[11,4,1,""],zero_state:[11,2,1,""]},"parts.rnns.attention_wrapper.AttentionWrapperState":{clone:[11,2,1,""]},"parts.rnns.attention_wrapper.BahdanauAttention":{__init__:[11,2,1,""]},"parts.rnns.attention_wrapper.BahdanauMonotonicAttention":{__init__:[11,2,1,""]},"parts.rnns.attention_wrapper.LuongAttention":{__init__:[11,2,1,""]},"parts.rnns.attention_wrapper.LuongMonotonicAttention":{__init__:[11,2,1,""]},"parts.rnns.flstm":{FLSTMCell:[11,1,1,""]},"parts.rnns.flstm.FLSTMCell":{__init__:[11,2,1,""],call:[11,2,1,""],output_size:[11,4,1,""],state_size:[11,4,1,""]},"parts.rnns.glstm":{GLSTMCell:[11,1,1,""]},"parts.rnns.glstm.GLSTMCell":{__init__:[11,2,1,""],_get_input_for_group:[11,2,1,""],call:[11,2,1,""],output_size:[11,4,1,""],state_size:[11,4,1,""]},"parts.rnns.gnmt":{GNMTAttentionMultiCell:[11,1,1,""],gnmt_residual_fn:[11,5,1,""]},"parts.rnns.gnmt.GNMTAttentionMultiCell":{__init__:[11,2,1,""]},"parts.rnns.rnn_beam_search_decoder":{BeamSearchDecoder:[11,1,1,""],BeamSearchDecoderOutput:[11,1,1,""],BeamSearchDecoderState:[11,1,1
,""],FinalBeamSearchDecoderOutput:[11,1,1,""],tile_batch:[11,5,1,""]},"parts.rnns.rnn_beam_search_decoder.BeamSearchDecoder":{__init__:[11,2,1,""],_maybe_merge_batch_beams:[11,2,1,""],_maybe_split_batch_beams:[11,2,1,""],_merge_batch_beams:[11,2,1,""],_split_batch_beams:[11,2,1,""],batch_size:[11,4,1,""],finalize:[11,2,1,""],initialize:[11,2,1,""],output_dtype:[11,4,1,""],output_size:[11,4,1,""],step:[11,2,1,""],tracks_own_finished:[11,4,1,""]},"parts.rnns.slstm":{BasicSLSTMCell:[11,1,1,""],_linear:[11,5,1,""]},"parts.rnns.slstm.BasicSLSTMCell":{__init__:[11,2,1,""],call:[11,2,1,""],output_size:[11,4,1,""],state_size:[11,4,1,""]},"parts.rnns.utils":{create_rnn_cell:[11,5,1,""]},"parts.transformer":{attention_layer:[12,0,0,"-"],beam_search:[12,0,0,"-"],beam_search_test:[12,0,0,"-"],common:[12,0,0,"-"],embedding_layer:[12,0,0,"-"],ffn_layer:[12,0,0,"-"],utils:[12,0,0,"-"]},"parts.transformer.attention_layer":{Attention:[12,1,1,""],SelfAttention:[12,1,1,""]},"parts.transformer.attention_layer.Attention":{call:[12,2,1,""],combine_heads:[12,2,1,""],split_heads:[12,2,1,""]},"parts.transformer.attention_layer.SelfAttention":{call:[12,2,1,""]},"parts.transformer.beam_search":{SequenceBeamSearch:[12,1,1,""],_StateKeys:[12,1,1,""],_expand_to_beam_size:[12,5,1,""],_flatten_beam_dim:[12,5,1,""],_gather_beams:[12,5,1,""],_gather_topk_beams:[12,5,1,""],_length_normalization:[12,5,1,""],_shape_list:[12,5,1,""],_unflatten_beam_dim:[12,5,1,""],sequence_beam_search:[12,5,1,""]},"parts.transformer.beam_search.SequenceBeamSearch":{_continue_search:[12,2,1,""],_create_initial_state:[12,2,1,""],_get_new_alive_state:[12,2,1,""],_get_new_finished_state:[12,2,1,""],_grow_alive_seq:[12,2,1,""],_search_step:[12,2,1,""],search:[12,2,1,""]},"parts.transformer.beam_search._StateKeys":{ALIVE_CACHE:[12,4,1,""],ALIVE_LOG_PROBS:[12,4,1,""],ALIVE_SEQ:[12,4,1,""],CUR_INDEX:[12,4,1,""],FINISHED_FLAGS:[12,4,1,""],FINISHED_SCORES:[12,4,1,""],FINISHED_SEQ:[12,4,1,""]},"parts.transformer.beam_search_test":{BeamSearchHelperTests:[12,1,1,""]},"parts.transformer.beam_search_test.BeamSearchHelperTests":{test_expand_to_beam_size:[12,2,1,""],test_flatten_beam_dim:[12,2,1,""],test_gather_beams:[12,2,1,""],test_gather_topk_beams:[12,2,1,""],test_get_shape_keep_last_dim:[12,2,1,""],test_shape_list:[12,2,1,""],test_unflatten_beam_dim:[12,2,1,""]},"parts.transformer.common":{LayerNormalization:[12,1,1,""],PrePostProcessingWrapper:[12,1,1,""]},"parts.transformer.common.LayerNormalization":{build:[12,2,1,""],call:[12,2,1,""]},"parts.transformer.embedding_layer":{EmbeddingSharedWeights:[12,1,1,""]},"parts.transformer.embedding_layer.EmbeddingSharedWeights":{build:[12,2,1,""],call:[12,2,1,""],linear:[12,2,1,""]},"parts.transformer.ffn_layer":{FeedFowardNetwork:[12,1,1,""]},"parts.transformer.ffn_layer.FeedFowardNetwork":{call:[12,2,1,""]},"parts.transformer.utils":{get_decoder_self_attention_bias:[12,5,1,""],get_padding:[12,5,1,""],get_padding_bias:[12,5,1,""],get_position_encoding:[12,5,1,""]},"utils.funcs":{evaluate:[13,5,1,""],infer:[13,5,1,""],restore_and_get_results:[13,5,1,""],train:[13,5,1,""]},"utils.hooks":{BroadcastGlobalVariablesHook:[13,1,1,""],PrintLossAndTimeHook:[13,1,1,""],PrintSamplesHook:[13,1,1,""],RunEvaluationHook:[13,1,1,""]},"utils.hooks.BroadcastGlobalVariablesHook":{__init__:[13,2,1,""],after_create_session:[13,2,1,""],begin:[13,2,1,""]},"utils.hooks.PrintLossAndTimeHook":{after_run:[13,2,1,""],before_run:[13,2,1,""],begin:[13,2,1,""]},"utils.hooks.PrintSamplesHook":{after_run:[13,2,1,""],before_run:[13,2,1,""],begin:[13
,2,1,""]},"utils.hooks.RunEvaluationHook":{after_run:[13,2,1,""],before_run:[13,2,1,""],begin:[13,2,1,""]},"utils.utils":{Logger:[13,1,1,""],array_to_string:[13,5,1,""],cast_types:[13,5,1,""],check_params:[13,5,1,""],clip_last_batch:[13,5,1,""],clip_sparse:[13,5,1,""],deco_print:[13,5,1,""],flatten_dict:[13,5,1,""],get_available_gpus:[13,5,1,""],get_git_diff:[13,5,1,""],get_git_hash:[13,5,1,""],get_results_for_epoch:[13,5,1,""],iterate_data_layer:[13,5,1,""],log_summaries_from_dict:[13,5,1,""],mask_nans:[13,5,1,""],nest_dict:[13,5,1,""],nested_update:[13,5,1,""],text_ids_to_string:[13,5,1,""]},"utils.utils.Logger":{flush:[13,2,1,""],write:[13,2,1,""]},data:{data_layer:[0,0,0,"-"],image2label:[1,0,0,"-"],speech2text:[2,0,0,"-"],text2text:[3,0,0,"-"],utils:[0,0,0,"-"]},decoders:{decoder:[4,0,0,"-"],fc_decoders:[4,0,0,"-"],rnn_decoders:[4,0,0,"-"]},encoders:{ds2_encoder:[5,0,0,"-"],encoder:[5,0,0,"-"],resnet_blocks:[5,0,0,"-"],resnet_encoder:[5,0,0,"-"],rnn_encoders:[5,0,0,"-"]},losses:{cross_entropy_loss:[6,0,0,"-"],ctc_loss:[6,0,0,"-"],loss:[6,0,0,"-"],sequence_loss:[6,0,0,"-"]},models:{encoder_decoder:[7,0,0,"-"],image2label:[7,0,0,"-"],model:[7,0,0,"-"],speech2text:[7,0,0,"-"],text2text:[7,0,0,"-"]},optimizers:{automatic_loss_scaler:[9,0,0,"-"],lr_policies:[9,0,0,"-"],mp_wrapper:[9,0,0,"-"],optimizers:[9,0,0,"-"]},parts:{rnns:[11,0,0,"-"],transformer:[12,0,0,"-"]},utils:{funcs:[13,0,0,"-"],hooks:[13,0,0,"-"],utils:[13,0,0,"-"]}},objnames:{"0":["py","module","Python module"],"1":["py","class","Python class"],"2":["py","method","Python method"],"3":["py","staticmethod","Python static method"],"4":["py","attribute","Python attribute"],"5":["py","function","Python function"]},objtypes:{"0":"py:module","1":"py:class","2":"py:method","3":"py:staticmethod","4":"py:attribute","5":"py:function"},terms:{"106gb":21,"1080ti":29,"16xlarg":28,"1e6":3,"1e9":12,"1x1":5,"224gb":21,"2xlarg":28,"4gpu":29,"55gb":21,"5gb":27,"8xlarg":28,"\u03b1":22,"\u03b4":22,"\u03b5":22,"\u03b6":22,"\u03ba":22,"abstract":[0,4,5,6,7],"boolean":[1,5,11],"byte":3,"case":[0,4,5,6,7,11,14,16,25,28],"char":0,"class":[0,1,2,3,4,5,6,7,9,11,12,13,16,25],"default":[3,5,6,9,11,13,25,27,28],"enum":3,"export":21,"final":[4,7,11,13,21,25],"float":[1,2,4,5,7,9,11,12,25,28],"function":[0,2,4,5,6,7,9,11,12,13,25,28],"import":[25,28],"int":[0,1,2,3,4,5,7,9,11,12,25],"long":11,"new":[0,1,3,9,11,12,13,23,25,26],"return":[0,1,2,3,4,5,6,7,9,11,12,13,16,28],"short":[11,25,29],"static":[0,1,2,3,4,5,6,7,9,11,16,25,28],"true":[0,1,3,5,6,7,9,11,12,14,25,29],"try":[9,22,25,27],"while":[1,3,16,22,25,28],ALS:9,AWS:28,Adding:23,And:[0,1,3,28],But:7,EOS:12,For:[1,2,3,4,5,7,9,11,13,16,21,22,25,26,28,29],IDs:3,Its:11,NOT:11,One:[22,25,28],Such:16,That:[0,1,3,4,7],The:[0,1,3,5,7,9,11,12,13,16,21,25,27,28,29],There:25,These:[1,4,7,11,13,28],Used:[3,7,12],Uses:[9,13,29],Using:23,With:[14,16,29],__init__:[0,1,2,3,4,5,6,7,9,11,13,16,25],_adaptive_max_norm:9,_add_scaled_noise_to_gradi:9,_aspect_preserving_res:1,_baseattentionmechan:11,_basemonotonicattentionmechan:11,_batch_exampl:3,_build_attent:4,_build_forward_pass_graph:7,_building_block_v1:5,_building_block_v2:5,_cast_typ:[4,5,6],_central_crop:1,_clip_gradients_by_norm:9,_compute_loss:6,_continue_search:12,_count_and_gen_subtoken:3,_count_token:3,_create_decod:7,_create_encod:7,_create_initial_st:12,_create_loss:7,_create_min_max_boundari:3,_decod:4,_decode_crop_and_flip:1,_distributed_appli:9,_encod:5,_escape_token:3,_expand_to_beam_s:12,_filter_and_bucket_subtoken:3,_filter_max_length:3,_flatten_beam_dim:12
,_gather_beam:12,_gather_topk_beam:12,_gather_tre:11,_gen_new_subtoken_list:3,_generate_alphabet_dict:3,_generate_subtoken:3,_generate_subtokens_with_target_vocab_s:3,_get_example_length:3,_get_input_for_group:11,_get_new_alive_st:12,_get_new_finished_st:12,_grow_alive_seq:12,_item_or_tupl:11,_join_tokens_to_str:3,_length_norm:12,_linear:11,_list_to_index_dict:3,_load_record:3,_load_vocab_fil:3,_maybe_merge_batch_beam:11,_maybe_split_batch_beam:11,_mean_image_subtract:1,_merge_batch_beam:11,_monotonic_probability_fn:11,_multiply_gradi:9,_native_to_unicod:3,_output:7,_parse_audio_el:2,_parse_audio_transcript_el:2,_parse_exampl:3,_parse_example_proto:1,_read_and_batch_from_fil:3,_resize_imag:1,_save_vocab_fil:3,_search_step:12,_shape_list:12,_smallest_size_at_least:1,_split_batch_beam:11,_split_string_to_token:3,_split_token_to_subtoken:3,_statekei:12,_subtoken_ids_to_token:3,_test:27,_token_to_subtoken_id:3,_unescape_token:3,_unflatten_beam_dim:12,_unicode_to_n:3,abl:[0,1,3,21,27],about:[13,29],abov:[11,22,28],abs:[5,9,11,29],absolut:28,acceler:11,accept:[3,11],access:[0,4,5,6,7],accord:11,accuraci:[1,7,27,28],achiev:[11,27],across:[3,6,7,9,25,28],activ:[5,11,28],activation_fn:5,actual:[4,7,11,25],adagrad:[7,9,25],adagradoptim:9,adam:[7,9,25,29],adamoptim:9,adapt:[5,7,9,25],adaptive_clip:9,adaptive_clipping_fn:9,add:[3,4,5,7,9,11,13,25,28],add_eo:3,added:[0,3,7,11,12,13,25],adding:13,addit:[2,7,9,11,12,21,25,28,29],addition:[1,25],adjust:[7,9,22,25,28],after:[0,1,5,9,11,13,21,22,25,27],after_create_sess:13,after_run:13,again:[25,27],aggreg:7,aggregation_method:9,aggregationmethod:9,alben:28,algorithm:[7,9,25,28],align:11,alignment_histori:11,alignments_s:11,aliv:12,alive_cach:12,alive_log_prob:12,alive_seq:12,all:[0,1,2,3,4,5,6,7,9,11,12,13,16,22,25,27,28,29],allow:[3,22,29],allreduc:9,along:[5,11],alpha:12,alphabet:[3,4],alphabet_config_path:4,alreadi:[11,26],also:[0,1,3,7,9,13,14,22,25,26,27,28,29],altern:[9,27,28],alwai:[7,9,28],amax:9,amount:3,analog:7,analysi:2,ani:[3,4,5,7,9,11,12,13,25,28,29],anoth:[3,9,11],answer:[7,25],anymor:13,anyth:9,api:[2,26],appear:3,append:[3,25],appli:[3,5,9,11,12,28],apply_gradi:9,approach:[7,11,14,28],apt:[21,27],arbitrari:9,architectur:[22,28],archiv:21,aren:3,arg:[9,11,13],argmax:11,argument:[2,3,4,5,6,7,9,11,12,13,14,21,25],arithmet:28,around:[1,4,5,6,21,27,28],arrai:[2,3,7],arrang:1,array_op:11,array_to_str:13,articl:11,artifici:4,arxiv:[5,9,11,28,29],aspect:1,assign:[0,4],assum:[11,21,27],assumpt:[7,11],attend:11,attent:[4,11,12,22,29],attention_cel:11,attention_depth:11,attention_dropout:12,attention_lay:[8,10],attention_layer_s:11,attention_mechan:11,attention_or_cell_output:11,attention_st:11,attention_typ:4,attention_wrapp:[8,10],attentioninputwrapp:11,attentionmechan:11,attentionwrapp:11,attentionwrapperst:11,attribut:[7,25],audio:[2,7,21],audio_filenam:2,augment:2,augment_audio_sign:2,auto:6,automat:[4,5,6,7,9,25,27,29],automatic_loss_sc:[7,9,25,28],automatic_loss_scal:8,automaticlossscal:9,autoregress:12,avail:[9,22,25,26,28,29],averag:[6,7,9],average_across_timestep:6,avoid:[9,11],axi:[7,11],back:[12,28],backoff:[7,9,25,28],backoffscal:9,backpropag:28,backslash:3,bahadanau:11,bahdanau:[4,11],bahdanau_norm:4,bahdanauattent:11,bahdanaumonotonicattent:11,bandwidth:28,base:[0,1,2,3,4,5,6,7,9,11,12,13,14,21,22,25,26,27],base_model:25,base_param:[25,28],basic:[6,11,26],basic_sequence_loss:6,basicsequenceloss:6,basicslstmcel:11,batch:[3,4,5,6,7,9,11,12,16,25],batch_norm:5,batch_siz:[2,3,4,5,6,11,12],batch_size_per_gpu:[6,7,22,25,29],batches_per_epoch
:9,bazel:27,bbox:1,beahvior:11,beam:[4,11,12,29],beam_indic:12,beam_search:[8,10],beam_search_decoder_output:11,beam_search_test:[8,10],beam_siz:12,beam_width:[4,11],beamsearch:11,beamsearchdecod:11,beamsearchdecoderoutput:11,beamsearchdecoderst:11,beamsearchhelpertest:12,beamsearchrnndecoderwithattent:4,becaus:[3,12,22,28],becom:[2,9],been:[9,11,12,13,28],befor:[3,5,9,11,13,22,28],before_run:13,begin:[3,11,13,28],begin_decay_at:9,behavior:[11,28],being:[3,23],below:[27,28,29],bench_start:25,bench_step:25,benchmark:[7,25,28],benefici:28,bengio:11,besid:7,best:[11,12,27],beta1:9,beta2:9,better:[1,12],between:[3,7,9,11,12],bhadanau:11,bia:[11,12],bias:11,bias_initi:11,bidir_rnn_encoder_with_emb:5,bidirect:29,bidirectionalrnnencoderwithembed:5,big:[27,29],bigger:28,bin:27,binari:[3,21,27],bleu:[7,29],blob:12,block:[5,25],block_fn:5,block_lay:5,blue:22,bn_epsilon:5,bn_momentum:5,bn_regular:5,bodi:12,bool:[0,4,5,6,7,9,11,12,25],boost:27,bori:28,both:[1,7,11,25,28,29],bottleneck:5,bottleneck_block:5,bottleneck_block_v1:5,bottleneck_block_v2:5,bottom:11,bound:1,boundari:[3,9],boundary_scal:3,box:1,bpe_us:7,broadcast:13,broadcastglobalvariableshook:13,bucket:3,buckets_max:3,buckets_min:3,buffer:1,build:[2,4,9,12,13,20,26,27,28],build_graph:[0,1,2,3],build_image_data:1,build_lm:21,build_pip_packag:27,building_block:5,building_block_v1:5,building_block_v2:5,built:[7,25,26],c_state:11,cach:12,calcul:[0,1,3,7,11,12],calculate_bleu:7,call:[4,5,6,7,9,11,12,13,16,28],callabl:[9,11],callback:13,can:[0,1,2,3,4,5,6,7,9,11,13,14,16,22,25,26,27,28,29],candiat:3,candid:3,cannot:[7,11,12,25],cast:[4,5,6,28],cast_typ:13,cat:22,cell:[4,5,11,29],cell_input_fn:11,cell_param:11,cell_stat:11,cell_typ:11,center:1,central:1,chang:[11,13,14,22,28],channel:[1,5],channels_first:5,channels_last:5,charact:[3,4],check:[7,11,25,26,27,28],check_grad:9,check_param:13,checkpoint:[7,11,13,25,29],child:25,cho:11,choos:[11,28],chosen:9,christoph:11,classic:7,clean:[21,29],cleaned_fil:22,clip:[7,9,25],clip_gradi:9,clip_last_batch:[7,13],clip_spars:13,clone:[11,27],close:[3,11,29],cloud:28,cmake:27,cnn:26,code:[3,7,11,25],coeffici:[9,28],colin:11,collect:[7,9,12,28],colloqui:1,coloc:9,colocate_gradients_with_op:9,color:1,colorspac:1,column:11,com:[12,27],combin:[7,9,12,26],combine_head:12,command:[14,21,22,25,27,29],comment:29,commit:25,common:[8,10,28],commonli:28,compar:[5,28],compat:9,compil:[7,25],complet:[7,9,17,18,19,24,25,27],complex:28,compon:11,compos:11,compress:21,comput:[1,6,7,9,11,12,28],compute_gradi:[9,28],compute_loss:[6,7,13],concat:11,concaten:[7,11],concret:28,config:[0,1,2,3,4,5,6,7,9,13,14,23,27,29],config_fil:[21,22,25,27,29],configur:[7,14,22,25,26,27,29],conflict:11,conjunct:[7,25],connect:[4,5,7,12],conrib:11,consecut:3,consist:[4,13],constant:[9,28],constraint:11,construct:[0,1,3,4,5,6,7,9,11,13],constructor:[0,2,4,5,6,7,9,11,25],consumpt:28,contain:[0,1,2,3,4,5,6,7,9,11,12,13,22,25,27,28,29],content:[1,4,5,6],context:11,continu:[12,25],continue_learn:25,contrib:11,control:[7,11],conv2d:5,conv2d_bn_actv:5,conv2d_fixed_pad:5,conv_lay:5,conveni:25,convent:5,converg:28,convert:[2,3,11,21,28],convolut:[5,29],coord:[1,13],coordin:[1,13],copi:[5,7,11,28],copt:27,core:[3,28],correct:[7,11,12,21,22,25],correctli:[7,27],correspond:[0,1,3,7,9,11,12,16,22,25,27,28],correspondingli:[9,16,29],cosin:12,could:[0,2,4,5,6,7,9,25],count:[0,3,4,25],counter:9,cover:25,cpu:[3,28],creat:[0,1,3,4,5,6,7,9,11,12,13,20,25,28],create_rnn_cel:11,create_toy_data:22,creation:[7,28],crop:1,crop_height:1,crop_width:1,cross:6,cross_entropy_l
oss:8,cross_entropy_with_smooth:6,crossentropyloss:6,crossentropywithsmooth:6,csv:[2,21],ctc:[4,6],ctc_decoder_with_lm:27,ctc_greedy_decod:4,ctc_loss:8,ctcloss:6,cuda:[27,28],cudnn:29,cudnn_gru:5,cudnn_lstm:5,cudnnlstm:11,cumprod:11,cumsum:11,cumul:11,cur_index:12,current:[1,4,5,7,9,11,12,25,29],custom:[27,28],cut:[4,7],d_model:9,dai:29,data:[1,2,3,4,5,6,7,8,11,13,15,21,22,23,25,28],data_fil:3,data_format:5,data_lay:[1,2,3,4,5,6,7,8,25],data_layer_param:[7,25],data_root:22,datalay:[0,1,2,3,7,16,25],dataset:[0,1,2,3,7,16,20,22],dataset_fil:2,david:28,dct:13,debug:25,debug_port:[13,25],debugger_port:25,dec:5,decai:[9,28,29],decay_r:9,decay_step:9,deco_print:13,decod:[0,1,3,5,6,7,8,11,12,15,21,22,23,25,26,29],decode_and_crop:1,decoder_cell_typ:4,decoder_cell_unit:4,decoder_dp_input_keep_prob:4,decoder_dp_output_keep_prob:4,decoder_initial_st:11,decoder_library_path:4,decoder_output:6,decoder_param:[4,7],decoder_use_skip_connect:4,decreas:[3,28],deep:[5,11,28],deepbench:28,deepspeech2encod:[5,25],deepspeech:[5,25,27],defaultdict:3,defin:[0,1,3,4,5,7,9,11,12,22,25,28],definit:5,degre:28,delet:21,delim:[7,13],denomin:[7,25],denot:1,dens:[7,9,11],dense_tensor:6,dense_to_spars:6,depend:[5,11],deprec:11,depth:[1,11,26],deriv:[0,4,5,6,7,25],describ:[0,1,3,4,5,6,7,11,25,27,28,29],descript:[0,1,2,3,4,5,6,7,25,29],design:[26,28],desir:28,detail:[1,2,4,5,7,9,11,25,26,29],determin:[11,12],determinist:9,dev:[21,22,27,29],deviat:11,devic:[9,13],device_dens:9,device_spars:9,diamo:28,dict:[0,1,2,3,4,5,6,7,9,12,25],dict_to_log:13,dictionari:[0,1,2,3,4,5,6,7,12,16,25],did:22,diederik:11,diff:25,differ:[1,3,5,9,11,12,13,22,25,26,27,28],dim:[4,5,6,11],dimens:[1,4,5,11,12],dimension:[9,11],direct:[5,29],directori:[7,25,27],disabl:[7,11,22,25,27,28],discov:27,disk:22,displai:25,distanc:7,distort:1,distribut:[3,7,9,11,25,26,27,28],distributedoptim:9,divid:12,divis:[0,2,7,11],dl_id:13,dnn:28,do_mask:6,doc:[4,5,6,7,11,25],docker:[27,28],docstr:11,document:[11,25,26,28],doe:[1,4,6,7,11,22,25,27,28,29],doesn:1,domain:2,don:[3,12,29],done:[22,27],dot:12,dougla:11,download:[21,22],download_lm:27,downsampl:5,dp_input_keep_prob:11,dp_output_keep_prob:11,draw:12,drawn:3,dropout:[4,5],dropout_keep_prob:5,dropout_keep_prop:5,ds2_encod:[8,25],ds2_large_8gpu:29,ds2_librispeech_larc_config:21,ds2_medium_4gpu:29,ds2_small_1gpu:29,ds2_toy_data_config:[21,27],dtype:[0,4,5,6,7,9,11,13,25,28],due:11,dure:[1,3,4,7,11,12,13,22,25,28],dynam:[7,11,28],dynamic_decod:11,dzmitri:11,each:[0,1,3,6,7,9,11,12,13,16,25,28],eager:9,earli:11,easi:26,easili:22,eck:11,effect:[11,29],effici:[11,26],eight:29,either:[0,2,4,5,6,7,9,11,25,28],element:[2,3,7,9,11],elimin:29,els:[6,9],elsen:28,embed:[4,5,11,12],embedding_lay:[8,10],embedding_lookup:11,embedding_s:12,embeddingsharedweight:12,emit:11,emnlp:11,empti:[7,9,11,16,25],enabl:[7,9,11,14,25,27],enable_log:[25,29],enc_emb_w:5,encod:[0,1,3,4,6,7,8,11,12,15,22,23,25,26,29],encoder_cell_typ:5,encoder_cell_unit:5,encoder_decod:[4,5,6,8,25],encoder_dp_input_keep_prob:5,encoder_dp_output_keep_prob:5,encoder_final_st:11,encoder_lay:5,encoder_output:[4,5,11],encoder_param:[5,7],encoder_sequence_length:4,encoder_st:[5,11],encoder_use_skip_connect:5,encoderdecodermodel:[7,25],encorc:11,end:[3,4,11,12,13,22,27,28],end_compat:9,end_learning_r:9,end_of_choic:3,end_symbol:4,end_token:11,energi:11,enforc:11,english:[3,20],enough:[22,25,28],ensur:[3,11,12,13,28],entri:11,entropi:6,enumer:3,eos:12,eos_id:[3,12,13],epoch:[0,1,3,7,9,25,29],epsilon:[5,7,9,12,25],equal:[1,7,11,25],equival:[9,11],erich:28,error:[7,11,21,27
,29],escap:3,especi:28,essenti:13,estim:28,etc:[4,5,7,16,25,26],etl:16,eval:[1,4,5,7,21,25],eval_input_fn:3,eval_model:13,eval_param:25,eval_step:[7,25],evalu:[0,1,2,3,7,13,21,22,25],evenli:3,event:[25,28],everi:[5,7],every_step:13,everyth:[21,25,26,27],exact:[11,25],exactli:11,exampl:[0,1,2,3,4,5,7,9,11,16,21,22,25,27,28],example_config:[21,22,25,27],example_seri:1,exce:[7,9,25],except:[5,11,13,25,27,28],execut:[7,9,14,22,25],exist:[0,4,11,23,28],exp:[9,11],exp_decai:9,expect:6,experi:[22,25,26],experiment:26,explicit:5,explicitli:[11,28],exponenti:9,exponential_decai:9,express:25,extend:26,extens:28,extra:9,extract:[2,16,21],fact_siz:11,factor:[9,11,12,28],fail:27,fairli:25,fals:[0,1,3,4,5,6,7,9,11,12,13,27],familiar:26,fc_decod:8,featur:[2,4,5,6,26],features_typ:2,fed:16,feed:[11,13],feed_dictionari:16,feedforward:12,feedfowardnetwork:12,feel:21,fetch:[7,9],few:29,ffn_layer:[8,10],field:[1,2,4,5,6,11],file:[0,1,2,3,4,7,21,22,25,27,29],file_byte_limit:3,file_pattern:3,file_with_bpe_segment:22,filenam:[1,2,3],filepath:3,filter:[3,5,9],filter_s:12,final_output:4,final_sequence_length:4,final_st:[4,11],finalbeamdecoderoutput:11,finalbeamsearchdecoderoutput:11,finalize_evalu:7,finalize_infer:7,find:[9,12,26],finish:[11,12,22,25],finished_flag:12,finished_scor:12,finished_seq:12,first:[0,1,3,5,7,9,11,12,14,21,22,25,28],fix:9,fixed_lr:9,fixed_pad:5,flag:[11,12],flaot:12,flat_dict:13,flatten_dict:13,flexibl:26,flip:1,float16:[0,4,5,6,7,25,28],float32:[0,4,5,6,7,9,12,25,28,29],flstm:[8,10],flstmcell:11,flush:13,folder:[21,22,25,27],follow:[0,1,3,4,5,6,7,9,16,21,22,25,27,28,29],forc:25,force_var_reus:7,forev:3,forget:[11,29],forget_bia:11,form:[0,3,11],format:[1,3,5],formul:12,forward:[7,28],found:[3,29],four:29,fp32:28,fraction:3,frame:[2,7],framework:12,free:21,frequenc:[2,3],frequent:3,from:[0,1,2,3,4,5,6,7,9,11,12,13,16,25,27,28,29],ftrl:[7,25],full:[5,9,28,29],fulli:[4,5,12],fully_connected_ctc_decod:4,fully_connected_decod:4,fully_connected_time_decod:4,fullyconnectedctcdecod:[4,21],fullyconnecteddecod:4,fullyconnectedtimedecod:4,func:8,furthermor:28,fuse:1,futur:12,ganesh:28,garcia:28,gate:[9,11],gate_gradi:9,gate_graph:9,gate_non:9,gate_op:9,gather:12,gen_input_tensor:16,gener:[3,4,7,11,12,25],generate_tri:27,geometr:12,german:[3,20],get:[7,9,12,16,21,22,26,27],get_available_gpu:13,get_data_lay:7,get_decoder_self_attention_bia:12,get_git_diff:13,get_git_hash:13,get_global_step:9,get_next:16,get_num_objects_per_step:7,get_optional_param:[0,1,2,3,4,5,6,7,16,25],get_output_tensor:7,get_pad:12,get_padding_bia:12,get_position_encod:12,get_regularization_loss:9,get_required_param:[0,1,2,3,4,5,6,7,16,25],get_results_for_epoch:13,get_size_in_sampl:[0,1,2,3,16],get_speech_featur:2,get_speech_features_from_fil:2,get_tf_dtyp:7,get_vari:28,get_wmt16_en_dt:22,getter:28,ginsburg:[11,28],git:[25,27],github:[12,27],given:[1,9,11,12,28],global:[9,13],global_gradient_norm:[7,25],global_step:9,glstm:[4,8,10],glstmcell:11,gnmt:[4,5,8,10,29],gnmt_encoder_with_emb:5,gnmt_residual_fn:11,gnmt_v2:4,gnmtattentionmulticel:11,gnmtlikeencoderwithembed:5,go_symbol:4,goal:22,going:[4,5,17,18,19,24,25,27],good:25,gpu:[0,6,7,9,13,14,16,22,25,26,27,28,29],gpu_id:[7,25],grad_loss:9,gradient:[7,9,11,25,28],gradient_multipli:9,gradient_noise_scal:9,gradient_norm:[7,25],grads_and_var:9,gram:21,graph:[0,1,2,3,4,5,6,7,9,13,25,28],graphic:28,graphkei:9,greater:5,gregori:28,group:[3,11],group_batch_s:3,group_id:11,group_siz:11,grow:12,gru:[4,5,29],guarante:[3,11],half:28,halv:28,handl:11,happen:[0,1,3,4,5,6,7,16],hard:11,hardmax:1
1,has:[0,1,2,3,4,5,6,7,9,11,12,13,28,29],has_nan:9,hash:25,hat:9,have:[0,1,2,3,4,5,6,7,9,11,12,16,22,25,26,27,28,29],head:12,height:1,height_in:5,help:[7,21,25],helper:[5,12],henc:28,here:[0,1,3,4,5,6,7,11,16,25,28,29],hetland:7,hidden:[4,5,11],hidden_s:12,hieu:11,high:28,higher:11,highest:12,highli:29,histogram:9,histori:11,hold:[9,12],hood:9,hook:[7,8],horovod:[0,3,6,7,9,13,14,25,29],horovod_gpu_allgath:9,horovod_gpu_allreduc:9,horovod_gpu_broadcast:13,hot:[1,6,11],houston:28,how:[7,9,11,12,20,22,23],howev:[1,28],http:[5,7,9,11,12,27,29],human:13,hvd:[7,25],hyperparamet:28,iclr:[11,28],icml:11,ident:[5,11],ids:[0,2,3,7,11,12,13,25],idx2char:7,ignor:[0,7,9,11,14,25],ignore_speci:[7,13],illeg:12,illustr:28,ilsvrc2012_val_00041207:1,imag:[1,5,7],image2label:[0,8,25],image_buff:1,imagenet_preprocess:[0,8],imagenetdatalay:1,implement:[6,7,9,11,12,16,25,29],impli:11,import_librivox:21,improv:[1,3,28],incept:1,includ:[0,1,2,3,4,5,6,7,9,11,21,28],increas:[12,28],increment:9,increment_global_step:9,independ:[5,6,7,25],index:[11,12],indexedslic:9,indic:[1,3,12,13],inf:11,infer:[0,1,3,4,5,7,11,13,21,25],infer_output_fil:[21,22,25],infer_param:25,infin:12,info:9,inform:[11,12,13,25,26],inherit:[0,4,5,6,7,16],init_from_fil:3,initi:[3,4,5,7,9,11,12,13,25,28],initial_cach:12,initial_cell_st:11,initial_id:12,initial_st:11,initializer_param:[4,5,7,25],inner:[11,12],input:[0,1,2,3,4,5,6,7,11,12,25,26],input_dict:[4,5,6,13],input_lay:5,input_s:11,input_sequence_length:11,input_tensor:[0,1,2,3,4,5,6,7],input_typ:2,input_valu:7,insid:[0,1,3,7,22,28],inspect:28,inspir:11,instabl:11,instal:[21,26],instanc:[0,4,5,6,7,9,11,28],instanti:9,instead:[1,4,9,11,27,28],instruct:[21,26],insur:12,int32:[1,11,12],int64:[11,12],intact:5,integ:[1,3,5,11],inter:28,intermedi:28,intern:[9,11,23],introduc:[1,5,28],invalid:9,invalidargu:11,invari:12,invers:3,involv:12,is_train:1,issu:29,item:[3,11,12],iter:[0,1,2,3,7,13,16,28],iterate_data_lay:13,its:[3,11,12,21,25],jian:5,join:3,jointli:11,jonah:28,jpeg:1,jul:5,just:[4,5,27],kaim:5,keep:[4,5,6,9,13,28],kei:[0,1,3,5,7,9,12],kenlm:[21,27],kept:28,kernel:5,kernel_initi:11,kernel_s:5,key_channel:12,keyword:12,kind:25,kingma:11,knee:1,known:[1,3,11],kpu:27,kuchaev:28,kuchaiev:11,kwarg:[9,11,12],kyunghyun:11,label:[1,4,6],lambda:[9,11,28],languag:[4,20],language_model:27,lar:[7,25],larc:[7,9,25,29],larc_eta:[7,25],larc_mod:[7,9,25],larc_nu:9,larc_param:[7,9,25],larg:[11,28],largest:12,last:[4,5,7],last_batch:[7,13],last_step:[7,13],latenc:28,later:28,latter:11,launch:[13,22],layer:[0,1,2,3,4,5,7,9,11,12,15,23,25,28,29],layer_typ:5,layernorm:12,layernorm_lstm:5,lead:27,learn:[5,7,9,11,22,25,29],learnabl:11,learning_r:[7,9,25],learning_rate_decay_fn:9,least:[5,12],left:11,length:[0,1,2,3,4,5,6,11,12],length_i:12,length_penalty_weight:11,length_x:12,less:[3,28,29],level:[25,28],levenshtein:7,libboost:27,libctc_decoder_with_kenlm:27,librari:4,librispeech:[20,29],librivox:21,libsox:21,libtensorflow_cc:27,libtensorflow_framework:27,like:[5,9,12,16,22,29],limit:28,line:[0,3,13,14,22,25],linear:[4,11,12],link:[27,29],list:[0,1,2,3,4,5,6,7,9,11,12,16,25,26],liu:11,live:12,lm_binary_path:4,lm_trie_path:4,lm_weight:4,load:[0,3,16],load_pre_existing_vocabulari:0,locat:[12,22,27],lock:9,log:[7,9,11,12,23,28],log_fil:13,log_max:9,log_summaries_from_dict:13,logdir:[7,22,25],logger:13,logic:[7,12,16],logit:[4,6,11,12],logits_to_outputs_func:4,logmax:[7,9,25,28],logmaxscal:9,lognorm:28,logspac:11,longer:13,longest:3,look:[7,25,26,29],loop:12,lose:28,loss:[0,1,3,4,7,8,9,15,23,25],loss_comput:7,loss_inp
ut_dict:6,loss_param:7,loss_scal:[7,9,25,28],lot:[21,22,27],lower:[3,28],lr_polici:[7,8,25],lr_policy_param:[7,25],ls_dir:21,lst:3,lstm:[4,5,11,29],lstmstatetupl:11,luong:[4,11],luong_scal:4,luongattent:11,luongmonotonicattent:11,m_state:11,machin:[11,20],magnitud:9,mai:[3,11,12],main:[4,5,6,22,25,26],maintain:[12,28],major:11,make:[0,1,3,11,21,22,26,27],malform:9,man:11,mani:28,manner:11,manual:[11,27,28],map:[0,3,4,5,11],mark:[3,11,12],mask:[6,11,12],mask_nan:[6,13],master:[12,28],match:[1,3,5,11,28,29],matric:11,matrix:[11,12],max:3,max_decode_length:12,max_grad_norm:[7,25],max_length:3,max_lr:9,max_norm:9,max_pool2d:5,max_step:[7,25],max_subtoken_length:3,max_tim:11,max_timescal:12,maxim:9,maximum:[3,7,11,12,25,28],mayb:11,maybe_print_log:7,mean:[1,9,28],measur:[22,29],mechan:[4,11,12],memori:[0,11,22,28],memory_sequence_length:11,mention:28,merg:11,method:[0,1,2,3,4,5,6,7,9,11,12,13,16,25,28],methodnam:12,methodolog:28,metric:7,mfcc:2,michael:28,micikeviciu:28,might:[22,25,27,28],milli:2,min:3,min_boundari:3,min_count:3,min_idx:0,min_lr:9,min_timescal:12,min_upd:[7,25],minh:11,mini:16,minibatch:11,minim:[7,9,25],minimum:[0,3,11,12],minumum:3,minut:27,mismanag:11,misspel:21,mix:[4,5,7,9,11,12,25,26,29],mixedprecisionoptimizerwrapp:[9,28],mkdir:[21,27],modal:26,mode:[0,1,3,4,5,7,11,13,14,21,22,25,27,29],model:[0,1,2,3,4,5,6,8,9,11,12,13,20,22,23,26,28],model_param:28,modifi:[4,5,6,9,13,28],modul:[4,7,9,11,25],modular:26,momentum:[5,7,9,25,29],momentumoptim:9,monoton:11,monotonic_attent:11,more:[9,11,12,22,26,28,29],moreov:25,moss:[22,29],most:[3,7,16,25,28],move:[9,11,25],mozilla:27,mp_regularizer_wrapp:[9,28],mp_wrapper:8,mpi4pi:27,mpi:7,mpiexec:29,mpirun:14,msg:13,much:[3,12,22,27],multi:[7,12,14,16,22,26,29],multicel:11,multihead:12,multipl:[9,11,12,29],multipli:[9,11,28],multirnncel:11,must:[0,1,3,4,5,6,9,11,12,25],mutli:14,myfavoriteattentionmechan:11,n03623198:1,n_hidden:5,name:[2,3,4,5,6,7,9,11,25,28],namedtupl:11,nan:6,narang:28,nativ:3,nearli:28,necessari:[0,5,7,9,13,25,27,28],necessarili:12,need:[0,1,3,4,5,7,9,12,14,16,21,22,25,27,28],neg:[11,12],neither:9,nest:[11,12,25],nest_dict:13,nested_upd:13,network:[5,11,12,28],neural:[11,28],new_beam_s:12,new_cach:12,new_height:1,new_log_prob:12,new_seq:12,new_width:1,newli:11,newstest2014:[22,29],next:[7,11,25,27,28,29],next_batch_feed_dict:16,next_input:11,next_stat:11,nmt:[22,29],nmt_revers:22,no_dir_check:25,noam:9,node:[14,26],nois:[9,11],noise_level_max:2,noise_level_min:2,non:[9,11,12],none:[0,2,3,4,5,6,7,9,11,12,13,16,25,28],nor:9,norm:[5,7,9,25],normal:[4,5,9,11,12,28],note:[0,1,3,4,5,6,7,9,11,21,22,25,28],now:[11,21,25,27,28],num:[4,5,6],num_audio_featur:2,num_box:1,num_channel:[1,5],num_cpu_cor:3,num_epoch:[0,1,3,7,9,25],num_featur:[2,4],num_gpu:[7,14,22,25,29],num_head:12,num_iter:3,num_lay:11,num_proj:11,num_rnn_lay:5,num_time_step:2,num_unit:11,num_work:[0,1,2,3,16],number:[0,1,2,3,4,5,6,7,9,11,12,14,25,28,29],number_of_group:11,numer:[7,11,25,28],numpi:2,nvidia:[26,27,28],object:[0,1,3,4,5,6,7,9,11,12,13,16,28],obtain:[27,28,29],occur:11,offici:26,offset:13,offset_target_by_on:6,often:[7,25,28],old:12,oleksii:28,on_horovod:[7,9],onc:[3,11,13,22],one:[0,1,3,4,5,6,7,9,11,12,13,22,25,28,29],ones:26,onli:[0,1,3,4,7,9,11,13,14,22,25,28],onlin:11,open:7,open_seq2seq:[1,2,3,7,27],openseq2seq:[3,14,21,22,25,28,29],oper:[4,5,9,11,13,27,28],ops:[1,9,11,13],opt:27,optim:[7,8,25,29],optimize_loss:9,optimizer_cls_nam:9,optimizer_param:[7,9,25],optimizer_summari:9,option:[0,1,2,3,4,5,6,7,9,11,16,25],optional_dict:13,order:[3,5,9,11,21
,22,27,28],org:[5,7,9,11,29],org_dict:13,origin:[1,3,5,11,12,13,21,29],other:[1,4,7,9,11,13,21,25,27,28],otherwis:[1,7,9,11,22,25,27,28],our:[26,27,28],out:[11,12,22,26],out_of_bucket:3,output:[1,4,5,6,7,11,12,13,16,21,22,25,26,27],output_attent:11,output_dim:[4,11],output_dir:13,output_dtyp:11,output_fil:[7,13],output_height:1,output_lay:11,output_s:11,output_time_major:11,output_valu:7,output_width:1,outsid:3,over:[6,28],overal:13,overcom:28,overflow:28,overflow_std_dev:9,overrid:11,overridden:11,overriden:9,overwrit:[7,25],overwritten:25,own:20,p_choose_i:11,packag:[4,5],pad2eight:12,pad:[0,1,2,3,5,12,16,29],pad_id:[3,13],pad_to:2,pad_vocab_to_eight:0,padded_cross_entropy_with_smooth:6,padded_input_length:3,padded_length:3,padded_target_length:3,paddedcrossentropylosswithsmooth:6,padding_valu:12,page:[26,27],pair:[7,9,11],paper:22,parallel:[3,11,12,22],parallel_interleav:3,paralleltextdatalay:3,param:[0,1,2,3,4,5,6,7,9,11,12,13,16,25],paramet:[0,1,2,3,4,5,6,7,9,11,12,13,14,16,22,23,27,28],parent:[0,2,4,5,6,25],pars:[1,2],parse_record:1,part:[1,4,6,7,8,9,11,12,21,25,28],particular:11,partli:11,pass:[0,1,3,4,5,6,7,9,11,12,13,25,28],past:11,path:[0,2,3,4,7,25,29],pauliu:28,pdf:[5,9],penal:11,per:[2,3,6,9],perform:[1,2,4,5,6,7,11,22,25,28],period:[25,28],perl:[22,29],peter:11,pham:11,piecewis:9,piecewise_const:9,pip:27,pip_packag:27,pipelin:3,place:[3,25],placehold:16,plane:5,pleas:[11,21],point:[3,11,13,22,28],polici:[7,9,25],poly_decai:9,polynomi:9,polynomial_decai:9,popul:[7,9],posit:[5,11,12],possibl:[7,11,25,28,29],post:12,power:[9,11],practic:28,pre:[0,11,12,22],preactiv:5,precis:[4,5,7,25,26,29],pred:7,predict:[7,11,12,13,25],predicted_id:11,preevious_attent:11,prefer:27,prefix:[3,9],prepar:[11,25],prepostprocessingwrapp:12,preprint:28,preprocess:[1,21],preprocess_imag:1,presenc:28,present:9,preserv:1,prevent:9,previou:[9,11,12],previous_attent:11,primarili:1,principl:28,print:[7,13,25],print_loss_step:[7,25],print_samples_step:[7,25],printlossandtimehook:13,printsampleshook:13,prior:[3,11],probability_fn:11,probabl:[4,5,11,12,25,28],problem:[11,22,28],proce:25,process:[0,1,2,3,9,11,12,13,22,27],produc:[4,5,6,11],product:[11,12,26],progress:22,project:[5,11,26],projection_shortcut:5,propag:[11,28],proper:11,properli:11,properti:[11,12],propos:[5,11],proto:1,protocol:1,provabl:12,proven:28,provid:[1,3,4,5,7,9,11,13,21,28],pull:12,put:27,python:[1,3,4,5,7,9,11,12,13,16,21,22,25,27,29],quantiti:9,queri:11,quicker:9,raffel:11,rais:[1,9,11,13],random:[1,3,7,11,13,25],random_se:[7,25],randomli:1,rang:[11,28],rank:[1,3,9,11,13],rare:28,rate:[7,9,25,29],rather:[0,5,7,25,28],ratio:1,raw:[1,2,5],raw_record:1,raw_str:3,reach:12,read:[0,3,11,25],read_char:0,readabl:13,real:22,realli:4,reason:11,receiv:13,recent:28,recip:[26,28],recogn:21,recognit:[5,7,20,27],recommend:[9,11,16,27,28,29],record:[1,3],recov:13,recurr:[11,28],recurs:11,redefin:28,reduc:[11,22],reduce_mean:6,ref:5,refer:[1,3,12],regress:6,regular:[4,5,7,9,25],regularizer_param:[4,5,7,25],relat:[0,1,3,11],relu:5,relu_dropout:12,remov:[7,22],ren:5,reparameter:11,repeat:[3,11],replac:[3,11,21],report:9,report_summari:9,repositori:27,repres:[1,11,12],represent:[4,5],representation_dim:5,request:13,request_stop:13,requir:[0,1,2,3,4,5,6,7,9,11,16,25,27,28],required_dict:13,rescal:9,research:26,reserv:3,reserved_token:3,reshap:[11,12],residu:[4,5,11],residual_connect:11,resiz:1,resize_imag:1,resize_min:1,resized_imag:1,resizemethod:1,resnet:[1,5],resnet_block:8,resnet_encod:8,resnetencod:5,respect:[11,28],rest:27,restor:[11,13,25],restore_a
nd_get_result:13,result:[7,11,12,13,21,22,27,28,29],results_per_batch:7,retriev:28,reus:[11,28],revers:20,rgb:1,right:11,rmsprop:[7,25],rnn:[4,5,8,10,22,26,29],rnn_beam_search_decod:[8,10],rnn_cell:5,rnn_cell_dim:5,rnn_cell_impl:11,rnn_decod:8,rnn_decoder_with_attent:4,rnn_encod:8,rnn_type:5,rnn_unidirect:5,rnncell:11,rnndecoderwithattent:4,robust:28,ron:11,root:[13,29],root_rank:13,row:[5,7,11,13],row_conv:5,row_conv_width:5,rule:28,run:[4,5,7,11,12,13,14,21,23,28,29],run_context:13,run_valu:13,runevaluationhook:[7,13],runtest:12,runtim:11,runtimeerror:9,s_id:[3,13],safe:25,safe_cumprod:11,safeti:9,sai:25,saliman:11,same:[0,1,3,4,5,6,9,11,12,13],sampl:[0,1,3,4,7,11,13,16,25],save:[3,7,25],save_checkpoint_step:[7,25],save_summaries_step:[7,25],scalar:[1,9,11,12],scale:[3,4,7,9,11,12,25],scale_max:9,scale_min:9,scaler:9,scan:11,scheme:[3,9],scope:[4,5,6,9,11],score:[7,11,12,29],score_bias_init:11,score_mask_valu:11,score_or_log_prob:12,script:[1,14,21,22,25,27,29],search:[3,4,11,12],second:[2,3,9,11,13,28],section:[4,5,7,9,12,17,18,19,24,25,26,27,29],sed:22,see:[2,4,5,6,7,9,11,22,25,26,27,28],seed:[7,11,25],select:[7,28],self:[0,1,3,4,5,6,7,11,12,16,25],selfattent:12,semi:3,send:13,separ:[1,25],seq2seq:[6,11],seq:11,sequenc:[0,1,2,3,4,5,6,7,11,12,20,25,26,27],sequence_beam_search:12,sequence_length:[6,11],sequence_loss:8,sequencebeamsearch:12,seri:1,serial:[1,3],serialized_exampl:3,sess:[7,13],session:13,session_run_hook:13,sessionrunarg:13,sessionruncontext:13,sessionrunhook:13,sessionrunvalu:13,set:[3,4,5,6,7,9,11,13,14,25,26,27,28],setup:[21,29],sgd:[7,9,25,29],shaoq:5,shape:[1,2,3,4,5,6,9,11,12,16],sharan:28,share:[12,25,28],shift:28,shortcut:5,shorter:12,should:[0,1,3,4,5,6,7,9,11,12,13,16,21,22,25,27,28,29],shuffl:[0,3,11,16],side:1,sigmoid:11,sigmoid_nois:11,sigmoid_noise_se:11,signal:[2,13],signatur:11,significantli:3,similar:[3,5],simpl:[1,4,11,14,22,25,28],simplest:27,sinc:[7,11,21,25,27,28],sine:12,singl:[3,5,9,11,12,22,29],singleton:6,singular:11,situat:[13,28],size:[0,1,2,3,4,5,6,7,11,12,13,16,21,25,27,28,29],skip:[25,27,28],slice:11,sloppi:3,slowest:11,slstm:[8,10],small:[7,9,11,22,25,28,29],smallest:1,smallest_sid:1,smooth:[6,9],softmax:[6,11,12],solut:11,some:[1,4,5,7,9,11,12,25,26,27,28],someth:27,sometim:25,soon:[11,17,18,19,24,25],sort:3,sourc:[0,1,2,3,4,5,6,7,9,11,12,13,25,27],source_length:2,source_sequ:2,source_tensor:[0,1,2,3,5,7],sox:21,spars:[7,9],sparse_tensor_to_char:7,sparsemax:11,sparsetensorvalu:7,spatial:5,specialtexttoken:3,specif:[2,4,5,25,27],specifi:[2,7,9,12,14,16,25],spectrogram:2,speech2text:[0,8,21,25,27],speech2textdatalay:2,speech:[2,7,20,26],speech_util:[0,8],speed:[3,28],speedup:28,split:[2,3,4,11,12],split_data:[1,2],split_head:12,src:22,src_emb_siz:5,src_input:[4,5],src_length:[4,5,6],src_sequenc:5,src_vocab_s:5,stabil:[7,25],stack:11,stai:28,staircas:9,stamp:25,standard:[5,7,11,28],start:[9,11,12,13,14,22,25,26],start_input:11,start_token:11,state:[4,5,11,12],state_is_tupl:11,state_s:11,static_max_norm:9,statist:[9,28],std:9,std_factor:9,stderr:25,stdout:25,step:[1,7,9,11,13,25,27],step_factor:9,step_window:9,steps_in_epoch:7,steps_per_epoch:9,still:12,stop:[11,13],store:[3,7,11,12],str:[0,2,3,4,5,6,7],stream:13,strength:12,stride:[2,5],string:[1,2,3,4,5,7,9,13,25],structur:[11,12,23],style:11,sub:9,subclass:9,subfold:25,submit:28,subsequ:[1,11,25],subset:[11,29],substitut:29,subtoken:[3,12],subtoken_count:3,subtoken_dict:3,subtoken_list:3,subtract:1,sudo:[21,27],suffer:11,suggest:28,sum:[6,11],sum_i:11,summar:28,summari:[7,9,25],sun:5,suppli:9,s
upport:[0,4,5,6,7,11,14,16,25,26,27,28],supported_algo:9,sure:[0,1,3,21,22],symbol:[0,4],symbols_to_logits_fn:12,symlink:27,synset:1,system:22,t2t:[0,8],tab:25,tabl:29,taht:[0,1,3],take:[4,5,6,9,11,12,13,21,22,27],taken:7,tanh:11,target:[0,1,2,3,4,6,7,13,22],target_length:2,target_s:3,target_sequ:[2,6],target_tensor:[0,1,2,3,4,6,7],target_vocab_s:3,task:20,tensor2tensor:12,tensor:[0,1,2,3,4,5,6,7,9,11,12,13,16,28],tensorarrai:11,tensorboard:[7,9,22,25],tensorflow:[0,1,3,4,5,6,7,9,11,12,13,25,26,28],tensorflow_pkg:27,tensorflowtestcas:12,tensorshap:11,term:[9,11],termin:12,tesla:28,test:[12,22],test_expand_to_beam_s:12,test_flatten_beam_dim:12,test_gather_beam:12,test_gather_topk_beam:12,test_get_shape_keep_last_dim:12,test_shape_list:12,test_unflatten_beam_dim:12,test_util:12,text2text:[0,8,22,25],text:[1,2,3,4,7,13,21,26,27],text_ids_to_str:13,textlinedataset:2,tfrecord:3,tgt:22,tgt_emb_siz:4,tgt_input:4,tgt_length:[4,6],tgt_sequenc:6,tgt_vocab_s:[4,6],than:[1,3,5,9,11,12,22,28,29],thang:11,thei:[1,3,11,28],them:[9,21,27,28],thi:[0,1,3,4,5,6,7,9,11,12,13,16,17,18,19,21,22,24,25,26,27,28,29],thing:[3,9,22,25,28],those:[7,11],thread:13,three:[1,3],threshold:[3,9],through:[1,11,12,26],thu:[3,4,25],tile:[11,12],tile_batch:11,tiled_encoder_final_st:11,tiled_encoder_output:11,tiled_input:11,tiled_sequence_length:11,tim:11,time:[1,3,4,5,6,9,11,13,21,25,27,28],time_major:5,time_stretch_ratio:2,timestep:[6,11,28],titan:28,tmp:27,todo:[11,16,27],togeth:[7,13,28],toi:[20,21,27],tok:[22,29],token:[0,7,8,11,12],token_count:3,tool:27,toolkit:26,top:[3,11,12],topic:25,total:[0,9,28],total_regularization_loss:9,tower:[7,14],toy_text_data:22,tra:3,track:[11,13],tracks_own_finish:11,train:[0,1,3,4,5,7,9,11,12,13,20,22,25,26,27,29],train_ev:[7,21,22,25,27,29],train_input_fn:3,train_model:13,train_op:7,train_param:25,trainabl:9,trainable_vari:9,trainer:9,transform:[3,4,8,9,10,16,22,29],transform_for_bleu:7,transformer_decod:8,transformer_encod:8,transformer_polici:9,transformerdatalay:3,translat:[3,7,11,12,20],transpos:12,treat:[7,9,25],tri:29,trick:11,trie:[4,21,27],true_batch_s:11,true_siz:[7,13],tupl:[1,2,7,9,11,12],tutori:[25,26],twice:3,two:[1,3,11,12,13,14,21,28],txt:[22,27],type:[0,1,2,3,4,5,6,7,9,11,12,28],typeerror:[9,11],typic:[0,4,5,6,28],ubuntu:27,ultim:5,unbatch:3,unchang:[11,12],under:9,underflow:[11,28],undergo:1,underli:[2,9,28],underlin:3,understand:25,unescap:3,uni:5,unicod:3,unidir_rnn_encoder_with_emb:5,unidirect:29,unidirectionalrnnencoderwithembed:5,uniqu:12,unit:[4,5,11,29],unittest:27,unk_id:3,unknown:1,unless:9,unspecifi:11,upcom:13,upd_dict:13,updat:[3,7,9,11,25,28],update_op:9,use:[0,1,3,4,5,6,7,9,11,14,16,21,22,25,27,28,29],use_horovod:[7,14,25,29],use_language_model:[4,27],use_lock:9,use_new_attent:11,use_staircase_decai:9,use_swap_memori:5,used:[0,1,3,4,5,6,7,9,11,12,13,22,25,28,29],useful:[7,25,26],user:[11,28],uses:[6,9,11,21,22,28],using:[1,2,3,5,7,9,11,13,14,16,22,26,27,28,29],usual:[4,11,21,27,28],util:[1,7,8,10,28],utter:21,v100:28,valid:[0,1,3,4,5,7,9,11,13,25,29],valid_word_count_weight:4,valu:[1,3,7,9,11,12,13,25,28],value_channel:12,valueerror:[1,9,11],var_list:9,variabl:[3,4,5,6,7,9,11,12,13,25,28],variable_norm:[7,25],varianc:[9,28],variant:5,varieti:28,variou:[4,5,9,25,26],vector:[1,11],venkatesh:28,verbos:13,veri:[11,22],versa:11,version:[3,4,11,27,28],vgg:1,via:11,vice:11,view:[3,22],visual:[7,9],vocab:[0,3,7,13],vocab_fil:[2,3],vocab_s:12,vocabulari:[0,2,3,4,5,6,13,22],volta:[28,29],wai:[3,7,9,27,28],want:[21,22,25,27],warm:9,warmup_step:9,wave:2,wavelength:12,wei
ght:[4,5,9,11,12,13,28],weiss:11,well:[4,5,7,22],wer:29,were:[5,11,22],what:23,when:[0,1,3,5,7,9,11,12,13,14,25,27,28,29],whenev:[25,28],where:[1,3,7,9,11,12,16,22,25,28],whether:[0,1,3,4,5,6,7,9,11,12,25],which:[0,1,3,4,5,6,7,9,11,12,13,16,22,25,28,29],whl:27,whose:11,width:[1,4,5,29],width_in:5,window:2,window_s:2,window_strid:2,within:5,without:[1,4,5,12,25,28,29],wmt:22,word:[0,3,4,7,29],word_count_weight:4,work:[7,22,25,27,29],worker:[3,6,7,13],worker_id:[0,1,2,3,7,16],workshop:11,wors:27,worst:[11,12],worth:28,wrap:[3,4,9,11,13,28],wrap_to_multi_rnn:11,wrapper:[1,4,5,6,11,12,28],write:[11,13,25],wrong:[9,11],xiangyu:5,xmax:1,xmin:1,ymax:1,ymin:1,yoshua:11,you:[0,1,3,7,9,11,12,13,14,16,21,22,25,26,27,28,29],your:[0,1,3,9,16,20,22,25,27],yourself:28,zero:[7,9,11,12],zero_st:11,zhang:5},titles:["data","image2label","speech2text","text2text","decoders","encoders","losses","models","API documentation","optimizers","parts","rnns","transformer","utils","Distributed training","Adding new models","Adding new data layer","Adding new decoder","Adding new encoder","Adding new loss","Getting started","Speech Recognition","Machine Translation","In-depth tutorials","Internal structure","Using existing models","OpenSeq2Seq","Installation instructions","Mixed precision training","Models and recipes"],titleterms:{"new":[15,16,17,18,19],Adding:[15,16,17,18,19],Using:25,add:27,adventur:22,api:8,attention_lay:12,attention_wrapp:11,automat:28,automatic_loss_scal:9,base:29,beam_search:12,beam_search_test:12,being:25,bleu:22,bpe:22,build:21,clean:22,common:12,comput:22,config:25,creat:22,cross_entropy_loss:6,ctc:27,ctc_loss:6,data:[0,16],data_lay:0,dataset:21,decod:[4,17,27],deep:29,depth:23,detail:28,distribut:14,document:8,download:27,ds2_encod:5,embedding_lay:12,enabl:28,encod:[5,18],encoder_decod:7,english:22,exist:25,fc_decod:4,feel:22,ffn_layer:12,flstm:11,func:13,gener:27,german:22,get:20,glstm:11,gnmt:11,hook:13,horovod:27,how:[21,25,27,28],image2label:[1,7],imagenet_preprocess:1,implement:28,infer:22,instal:27,instruct:27,intern:24,languag:[21,27],layer:16,librispeech:21,log:25,loss:[6,19,28],lr_polici:9,machin:[22,29],mix:28,model:[7,15,21,25,27,29],mp_wrapper:9,openseq2seq:[26,27],optim:[9,28],own:21,paramet:25,part:10,precis:28,prerequisit:28,recip:29,recognit:[21,29],regular:28,resnet_block:5,resnet_encod:5,revers:22,rnn:11,rnn_beam_search_decod:11,rnn_decod:4,rnn_encod:5,run:[22,25,27],scale:28,score:22,segment:22,sequenc:22,sequence_loss:6,slstm:11,speech2text:[2,7],speech:[21,27,29],speech_util:2,start:20,structur:24,t2t:3,task:22,tensorflow:27,test:27,text2text:[3,7],toi:22,token:3,train:[14,21,28],transform:12,transformer_decod:4,transformer_encod:5,translat:[22,29],tutori:23,util:[0,11,12,13],what:25,your:21}}) \ No newline at end of file 
+Search.setIndex({docnames:["api-docs/data","api-docs/data.image2label","api-docs/data.speech2text","api-docs/data.text2text","api-docs/decoders","api-docs/encoders","api-docs/losses","api-docs/models","api-docs/modules","api-docs/optimizers","api-docs/parts","api-docs/parts.cnns","api-docs/parts.convs2s","api-docs/parts.rnns","api-docs/parts.transformer","api-docs/utils","distr-training","extending","extending/adding-new-data-layer","extending/adding-new-decoder","extending/adding-new-encoder","extending/adding-new-loss","getting-started","getting-started/asr","getting-started/nmt","in-depth-tutorials","in-depth-tutorials/internal-structure","in-depth-tutorials/using-existing-models","index","installation-instructions","mixed-precision","models-and-recipes"],envversion:53,filenames:["api-docs/data.rst","api-docs/data.image2label.rst","api-docs/data.speech2text.rst","api-docs/data.text2text.rst","api-docs/decoders.rst","api-docs/encoders.rst","api-docs/losses.rst","api-docs/models.rst","api-docs/modules.rst","api-docs/optimizers.rst","api-docs/parts.rst","api-docs/parts.cnns.rst","api-docs/parts.convs2s.rst","api-docs/parts.rnns.rst","api-docs/parts.transformer.rst","api-docs/utils.rst","distr-training.rst","extending.rst","extending/adding-new-data-layer.rst","extending/adding-new-decoder.rst","extending/adding-new-encoder.rst","extending/adding-new-loss.rst","getting-started.rst","getting-started/asr.rst","getting-started/nmt.rst","in-depth-tutorials.rst","in-depth-tutorials/internal-structure.rst","in-depth-tutorials/using-existing-models.rst","index.rst","installation-instructions.rst","mixed-precision.rst","models-and-recipes.rst"],objects:{"":{data:[0,0,0,"-"],decoders:[4,0,0,"-"],encoders:[5,0,0,"-"],losses:[6,0,0,"-"],models:[7,0,0,"-"],optimizers:[9,0,0,"-"],parts:[10,0,0,"-"],utils:[15,0,0,"-"]},"data.data_layer":{DataLayer:[0,1,1,""]},"data.data_layer.DataLayer":{__init__:[0,2,1,""],build_graph:[0,2,1,""],get_optional_params:[0,3,1,""],get_required_params:[0,3,1,""],get_size_in_samples:[0,2,1,""],input_tensors:[0,4,1,""],iterator:[0,4,1,""],params:[0,4,1,""]},"data.image2label":{image2label:[1,0,0,"-"],imagenet_preprocessing:[1,0,0,"-"]},"data.image2label.image2label":{CifarDataLayer:[1,1,1,""],ImagenetDataLayer:[1,1,1,""]},"data.image2label.image2label.CifarDataLayer":{build_graph:[1,2,1,""],get_optional_params:[1,3,1,""],get_required_params:[1,3,1,""],get_size_in_samples:[1,2,1,""],input_tensors:[1,4,1,""],iterator:[1,4,1,""],parse_record:[1,2,1,""],preprocess_image:[1,2,1,""]},"data.image2label.image2label.ImagenetDataLayer":{build_graph:[1,2,1,""],get_optional_params:[1,3,1,""],get_required_params:[1,3,1,""],get_size_in_samples:[1,2,1,""],input_tensors:[1,4,1,""],iterator:[1,4,1,""],split_data:[1,2,1,""]},"data.image2label.imagenet_preprocessing":{_aspect_preserving_resize:[1,5,1,""],_central_crop:[1,5,1,""],_decode_crop_and_flip:[1,5,1,""],_mean_image_subtraction_and_normalization:[1,5,1,""],_parse_example_proto:[1,5,1,""],_resize_image:[1,5,1,""],_smallest_size_at_least:[1,5,1,""],parse_record:[1,5,1,""],preprocess_image:[1,5,1,""]},"data.speech2text":{speech2text:[2,0,0,"-"],speech_utils:[2,0,0,"-"]},"data.speech2text.speech2text":{Speech2TextDataLayer:[2,1,1,""]},"data.speech2text.speech2text.Speech2TextDataLayer":{__init__:[2,2,1,""],_parse_audio_element:[2,2,1,""],_parse_audio_transcript_element:[2,2,1,""],build_graph:[2,2,1,""],get_optional_params:[2,3,1,""],get_required_params:[2,3,1,""],get_size_in_samples:[2,2,1,""],input_tensors:[2,4,1,""],iterator:[2,4,1,""],split
_data:[2,2,1,""]},"data.speech2text.speech_utils":{augment_audio_signal:[2,5,1,""],get_speech_features:[2,5,1,""],get_speech_features_from_file:[2,5,1,""],normalize_signal:[2,5,1,""]},"data.text2text":{t2t:[3,0,0,"-"],text2text:[3,0,0,"-"],tokenizer:[3,0,0,"-"]},"data.text2text.t2t":{_batch_examples:[3,5,1,""],_create_min_max_boundaries:[3,5,1,""],_filter_max_length:[3,5,1,""],_get_example_length:[3,5,1,""],_load_records:[3,5,1,""],_parse_example:[3,5,1,""],_read_and_batch_from_files:[3,5,1,""],eval_input_fn:[3,5,1,""],train_input_fn:[3,5,1,""]},"data.text2text.text2text":{ParallelTextDataLayer:[3,1,1,""],SpecialTextTokens:[3,1,1,""],TransformerDataLayer:[3,1,1,""]},"data.text2text.text2text.ParallelTextDataLayer":{build_graph:[3,2,1,""],get_optional_params:[3,3,1,""],get_required_params:[3,3,1,""],get_size_in_samples:[3,2,1,""],input_tensors:[3,4,1,""],iterator:[3,4,1,""]},"data.text2text.text2text.SpecialTextTokens":{END_OF_CHOICE:[3,4,1,""],EOS_ID:[3,4,1,""],OUT_OF_BUCKET:[3,4,1,""],PAD_ID:[3,4,1,""],S_ID:[3,4,1,""],UNK_ID:[3,4,1,""]},"data.text2text.text2text.TransformerDataLayer":{build_graph:[3,2,1,""],get_optional_params:[3,3,1,""],get_required_params:[3,3,1,""],input_tensors:[3,4,1,""],iterator:[3,4,1,""]},"data.text2text.tokenizer":{Subtokenizer:[3,1,1,""],_count_and_gen_subtokens:[3,5,1,""],_count_tokens:[3,5,1,""],_escape_token:[3,5,1,""],_filter_and_bucket_subtokens:[3,5,1,""],_gen_new_subtoken_list:[3,5,1,""],_generate_alphabet_dict:[3,5,1,""],_generate_subtokens:[3,5,1,""],_generate_subtokens_with_target_vocab_size:[3,5,1,""],_join_tokens_to_string:[3,5,1,""],_list_to_index_dict:[3,5,1,""],_load_vocab_file:[3,5,1,""],_native_to_unicode:[3,5,1,""],_save_vocab_file:[3,5,1,""],_split_string_to_tokens:[3,5,1,""],_split_token_to_subtokens:[3,5,1,""],_unescape_token:[3,5,1,""],_unicode_to_native:[3,5,1,""]},"data.text2text.tokenizer.Subtokenizer":{__init__:[3,2,1,""],_subtoken_ids_to_tokens:[3,2,1,""],_token_to_subtoken_ids:[3,2,1,""],decode:[3,2,1,""],encode:[3,2,1,""],init_from_files:[3,3,1,""]},"data.utils":{load_pre_existing_vocabulary:[0,5,1,""],pad_vocab_to_eight:[0,5,1,""]},"decoders.convs2s_decoder":{ConvS2SDecoder:[4,1,1,""]},"decoders.convs2s_decoder.ConvS2SDecoder":{_get_symbols_to_logits_fn:[4,2,1,""],decode_pass:[4,2,1,""],get_optional_params:[4,3,1,""],get_required_params:[4,3,1,""],predict:[4,2,1,""]},"decoders.decoder":{Decoder:[4,1,1,""]},"decoders.decoder.Decoder":{__init__:[4,2,1,""],_cast_types:[4,2,1,""],_decode:[4,2,1,""],decode:[4,2,1,""],get_optional_params:[4,3,1,""],get_required_params:[4,3,1,""],mode:[4,4,1,""],name:[4,4,1,""],params:[4,4,1,""]},"decoders.fc_decoders":{FullyConnectedCTCDecoder:[4,1,1,""],FullyConnectedDecoder:[4,1,1,""],FullyConnectedTimeDecoder:[4,1,1,""]},"decoders.fc_decoders.FullyConnectedCTCDecoder":{__init__:[4,2,1,""],get_optional_params:[4,3,1,""],get_required_params:[4,3,1,""]},"decoders.fc_decoders.FullyConnectedDecoder":{__init__:[4,2,1,""],_decode:[4,2,1,""],get_required_params:[4,3,1,""]},"decoders.fc_decoders.FullyConnectedTimeDecoder":{__init__:[4,2,1,""],_decode:[4,2,1,""],get_optional_params:[4,3,1,""],get_required_params:[4,3,1,""]},"decoders.rnn_decoders":{BeamSearchRNNDecoderWithAttention:[4,1,1,""],RNNDecoderWithAttention:[4,1,1,""]},"decoders.rnn_decoders.BeamSearchRNNDecoderWithAttention":{__init__:[4,2,1,""],_decode:[4,2,1,""],get_optional_params:[4,3,1,""]},"decoders.rnn_decoders.RNNDecoderWithAttention":{__init__:[4,2,1,""],_build_attention:[4,2,1,""],_decode:[4,2,1,""],get_optional_params:[4,3,1,""],get_required_
params:[4,3,1,""]},"encoders.cnn_encoder":{CNNEncoder:[5,1,1,""],build_layer:[5,5,1,""]},"encoders.cnn_encoder.CNNEncoder":{__init__:[5,2,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""]},"encoders.convs2s_encoder":{ConvS2SEncoder:[5,1,1,""]},"encoders.convs2s_encoder.ConvS2SEncoder":{get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""],src_emb_size:[5,4,1,""],src_vocab_size:[5,4,1,""]},"encoders.ds2_encoder":{DeepSpeech2Encoder:[5,1,1,""],rnn_cell:[5,5,1,""],row_conv:[5,5,1,""]},"encoders.ds2_encoder.DeepSpeech2Encoder":{__init__:[5,2,1,""],_encode:[5,2,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""]},"encoders.encoder":{Encoder:[5,1,1,""]},"encoders.encoder.Encoder":{__init__:[5,2,1,""],_cast_types:[5,2,1,""],_encode:[5,2,1,""],encode:[5,2,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""],mode:[5,4,1,""],name:[5,4,1,""],params:[5,4,1,""]},"encoders.resnet_blocks":{batch_norm:[5,5,1,""],block_layer:[5,5,1,""],bottleneck_block_v1:[5,5,1,""],bottleneck_block_v2:[5,5,1,""],building_block_v1:[5,5,1,""],building_block_v2:[5,5,1,""],conv2d_fixed_padding:[5,5,1,""],fixed_padding:[5,5,1,""]},"encoders.resnet_encoder":{ResNetEncoder:[5,1,1,""]},"encoders.resnet_encoder.ResNetEncoder":{get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""]},"encoders.rnn_encoders":{BidirectionalRNNEncoderWithEmbedding:[5,1,1,""],GNMTLikeEncoderWithEmbedding:[5,1,1,""],GNMTLikeEncoderWithEmbedding_cuDNN:[5,1,1,""],UnidirectionalRNNEncoderWithEmbedding:[5,1,1,""]},"encoders.rnn_encoders.BidirectionalRNNEncoderWithEmbedding":{__init__:[5,2,1,""],_encode:[5,2,1,""],enc_emb_w:[5,4,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""],src_emb_size:[5,4,1,""],src_vocab_size:[5,4,1,""]},"encoders.rnn_encoders.GNMTLikeEncoderWithEmbedding":{__init__:[5,2,1,""],enc_emb_w:[5,4,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""],src_emb_size:[5,4,1,""],src_vocab_size:[5,4,1,""]},"encoders.rnn_encoders.GNMTLikeEncoderWithEmbedding_cuDNN":{__init__:[5,2,1,""],enc_emb_w:[5,4,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""],src_emb_size:[5,4,1,""],src_vocab_size:[5,4,1,""]},"encoders.rnn_encoders.UnidirectionalRNNEncoderWithEmbedding":{__init__:[5,2,1,""],_encode:[5,2,1,""],enc_emb_w:[5,4,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""],src_emb_size:[5,4,1,""],src_vocab_size:[5,4,1,""]},"encoders.w2l_encoder":{Wave2LetterEncoder:[5,1,1,""]},"encoders.w2l_encoder.Wave2LetterEncoder":{__init__:[5,2,1,""],_encode:[5,2,1,""],get_optional_params:[5,3,1,""],get_required_params:[5,3,1,""]},"losses.cross_entropy_loss":{CrossEntropyLoss:[6,1,1,""]},"losses.ctc_loss":{CTCLoss:[6,1,1,""],dense_to_sparse:[6,5,1,""]},"losses.ctc_loss.CTCLoss":{__init__:[6,2,1,""],_compute_loss:[6,2,1,""],get_optional_params:[6,3,1,""]},"losses.loss":{Loss:[6,1,1,""]},"losses.loss.Loss":{__init__:[6,2,1,""],_cast_types:[6,2,1,""],_compute_loss:[6,2,1,""],compute_loss:[6,2,1,""],get_optional_params:[6,3,1,""],get_required_params:[6,3,1,""],name:[6,4,1,""],params:[6,4,1,""]},"losses.sequence_loss":{BasicSequenceLoss:[6,1,1,""],CrossEntropyWithSmoothing:[6,1,1,""],PaddedCrossEntropyLossWithSmoothing:[6,1,1,""]},"losses.sequence_loss.BasicSequenceLoss":{__init__:[6,2,1,""],_compute_loss:[6,2,1,""],get_optional_params:[6,3,1,""],get_required_params:[6,3,1,""]},"losses.sequence_loss.CrossEntropyWithSmoothing":{__init__:[6,2,1,""],_compute_loss:[6,2,1,""],get_optional_params:[6,3,1,""],get_required_params:[6,3,1,""]},"losses.seque
nce_loss.PaddedCrossEntropyLossWithSmoothing":{get_optional_params:[6,3,1,""]},"models.encoder_decoder":{EncoderDecoderModel:[7,1,1,""]},"models.encoder_decoder.EncoderDecoderModel":{__init__:[7,2,1,""],_build_forward_pass_graph:[7,2,1,""],_create_decoder:[7,2,1,""],_create_encoder:[7,2,1,""],_create_loss:[7,2,1,""],decoder:[7,4,1,""],encoder:[7,4,1,""],get_optional_params:[7,3,1,""],get_required_params:[7,3,1,""],loss_computator:[7,4,1,""]},"models.image2label":{Image2Label:[7,1,1,""]},"models.image2label.Image2Label":{_get_num_objects_per_step:[7,2,1,""],evaluate:[7,2,1,""],finalize_evaluation:[7,2,1,""],maybe_print_logs:[7,2,1,""]},"models.model":{Model:[7,1,1,""]},"models.model.Model":{__init__:[7,2,1,""],_build_forward_pass_graph:[7,2,1,""],_get_num_objects_per_step:[7,2,1,""],clip_last_batch:[7,2,1,""],compile:[7,2,1,""],evaluate:[7,2,1,""],finalize_evaluation:[7,2,1,""],finalize_inference:[7,2,1,""],get_data_layer:[7,2,1,""],get_num_objects_per_step:[7,2,1,""],get_optional_params:[7,3,1,""],get_output_tensors:[7,2,1,""],get_required_params:[7,3,1,""],get_tf_dtype:[7,2,1,""],hvd:[7,4,1,""],infer:[7,2,1,""],last_step:[7,4,1,""],maybe_print_logs:[7,2,1,""],mode:[7,4,1,""],num_gpus:[7,4,1,""],on_horovod:[7,4,1,""],params:[7,4,1,""],steps_in_epoch:[7,4,1,""]},"models.speech2text":{Speech2Text:[7,1,1,""],levenshtein:[7,5,1,""],sparse_tensor_to_chars:[7,5,1,""]},"models.speech2text.Speech2Text":{_get_num_objects_per_step:[7,2,1,""],evaluate:[7,2,1,""],finalize_evaluation:[7,2,1,""],finalize_inference:[7,2,1,""],infer:[7,2,1,""],maybe_print_logs:[7,2,1,""]},"models.text2text":{Text2Text:[7,1,1,""],calculate_bleu:[7,5,1,""],transform_for_bleu:[7,5,1,""]},"models.text2text.Text2Text":{_get_num_objects_per_step:[7,2,1,""],evaluate:[7,2,1,""],finalize_evaluation:[7,2,1,""],finalize_inference:[7,2,1,""],infer:[7,2,1,""],maybe_print_logs:[7,2,1,""]},"optimizers.automatic_loss_scaler":{AutomaticLossScaler:[9,1,1,""],BackoffScaler:[9,1,1,""],LogMaxScaler:[9,1,1,""]},"optimizers.automatic_loss_scaler.AutomaticLossScaler":{SUPPORTED_ALGOS:[9,4,1,""],check_grads:[9,3,1,""],loss_scale:[9,4,1,""],update_op:[9,2,1,""]},"optimizers.automatic_loss_scaler.BackoffScaler":{loss_scale:[9,4,1,""],update_op:[9,2,1,""]},"optimizers.automatic_loss_scaler.LogMaxScaler":{loss_scale:[9,4,1,""],update_op:[9,2,1,""]},"optimizers.lr_policies":{exp_decay:[9,5,1,""],fixed_lr:[9,5,1,""],piecewise_constant:[9,5,1,""],poly_decay:[9,5,1,""],transformer_policy:[9,5,1,""]},"optimizers.mp_wrapper":{MixedPrecisionOptimizerWrapper:[9,1,1,""],mp_regularizer_wrapper:[9,5,1,""]},"optimizers.mp_wrapper.MixedPrecisionOptimizerWrapper":{apply_gradients:[9,2,1,""],compute_gradients:[9,2,1,""]},"optimizers.optimizers":{_clip_gradients_by_norm:[9,5,1,""],get_regularization_loss:[9,5,1,""],optimize_loss:[9,5,1,""],post_process_gradients:[9,5,1,""],reduce_gradients:[9,5,1,""]},"parts.cnns":{conv_blocks:[11,0,0,"-"]},"parts.cnns.conv_blocks":{conv_actv:[11,5,1,""],conv_bn_actv:[11,5,1,""]},"parts.convs2s":{attention_wn_layer:[12,0,0,"-"],conv_wn_layer:[12,0,0,"-"],ffn_wn_layer:[12,0,0,"-"]},"parts.convs2s.attention_wn_layer":{AttentionLayerNormalized:[12,1,1,""]},"parts.convs2s.attention_wn_layer.AttentionLayerNormalized":{__init__:[12,2,1,""],call:[12,2,1,""]},"parts.convs2s.conv_wn_layer":{Conv1DNetworkNormalized:[12,1,1,""]},"parts.convs2s.conv_wn_layer.Conv1DNetworkNormalized":{__init__:[12,2,1,""],call:[12,2,1,""],gated_linear_units:[12,2,1,""]},"parts.convs2s.ffn_wn_layer":{FeedFowardNetworkNormalized:[12,1,1,""]},"parts.convs2s.ffn_wn_l
ayer.FeedFowardNetworkNormalized":{__init__:[12,2,1,""],call:[12,2,1,""]},"parts.rnns":{attention_wrapper:[13,0,0,"-"],flstm:[13,0,0,"-"],glstm:[13,0,0,"-"],gnmt:[13,0,0,"-"],rnn_beam_search_decoder:[13,0,0,"-"],slstm:[13,0,0,"-"],utils:[13,0,0,"-"]},"parts.rnns.attention_wrapper":{AttentionMechanism:[13,1,1,""],AttentionWrapper:[13,1,1,""],AttentionWrapperState:[13,1,1,""],BahdanauAttention:[13,1,1,""],BahdanauMonotonicAttention:[13,1,1,""],LuongAttention:[13,1,1,""],LuongMonotonicAttention:[13,1,1,""],hardmax:[13,5,1,""],monotonic_attention:[13,5,1,""],safe_cumprod:[13,5,1,""]},"parts.rnns.attention_wrapper.AttentionMechanism":{alignments_size:[13,4,1,""],state_size:[13,4,1,""]},"parts.rnns.attention_wrapper.AttentionWrapper":{__init__:[13,2,1,""],_item_or_tuple:[13,2,1,""],call:[13,2,1,""],output_size:[13,4,1,""],state_size:[13,4,1,""],zero_state:[13,2,1,""]},"parts.rnns.attention_wrapper.AttentionWrapperState":{clone:[13,2,1,""]},"parts.rnns.attention_wrapper.BahdanauAttention":{__init__:[13,2,1,""]},"parts.rnns.attention_wrapper.BahdanauMonotonicAttention":{__init__:[13,2,1,""]},"parts.rnns.attention_wrapper.LuongAttention":{__init__:[13,2,1,""]},"parts.rnns.attention_wrapper.LuongMonotonicAttention":{__init__:[13,2,1,""]},"parts.rnns.flstm":{FLSTMCell:[13,1,1,""]},"parts.rnns.flstm.FLSTMCell":{__init__:[13,2,1,""],call:[13,2,1,""],output_size:[13,4,1,""],state_size:[13,4,1,""]},"parts.rnns.glstm":{GLSTMCell:[13,1,1,""]},"parts.rnns.glstm.GLSTMCell":{__init__:[13,2,1,""],_get_input_for_group:[13,2,1,""],call:[13,2,1,""],output_size:[13,4,1,""],state_size:[13,4,1,""]},"parts.rnns.gnmt":{GNMTAttentionMultiCell:[13,1,1,""],gnmt_residual_fn:[13,5,1,""]},"parts.rnns.gnmt.GNMTAttentionMultiCell":{__init__:[13,2,1,""]},"parts.rnns.rnn_beam_search_decoder":{BeamSearchDecoder:[13,1,1,""],BeamSearchDecoderOutput:[13,1,1,""],BeamSearchDecoderState:[13,1,1,""],FinalBeamSearchDecoderOutput:[13,1,1,""],tile_batch:[13,5,1,""]},"parts.rnns.rnn_beam_search_decoder.BeamSearchDecoder":{__init__:[13,2,1,""],_maybe_merge_batch_beams:[13,2,1,""],_maybe_split_batch_beams:[13,2,1,""],_merge_batch_beams:[13,2,1,""],_split_batch_beams:[13,2,1,""],batch_size:[13,4,1,""],finalize:[13,2,1,""],initialize:[13,2,1,""],output_dtype:[13,4,1,""],output_size:[13,4,1,""],step:[13,2,1,""],tracks_own_finished:[13,4,1,""]},"parts.rnns.slstm":{BasicSLSTMCell:[13,1,1,""],_linear:[13,5,1,""]},"parts.rnns.slstm.BasicSLSTMCell":{__init__:[13,2,1,""],call:[13,2,1,""],output_size:[13,4,1,""],state_size:[13,4,1,""]},"parts.rnns.utils":{single_cell:[13,5,1,""]},"parts.transformer":{attention_layer:[14,0,0,"-"],beam_search:[14,0,0,"-"],common:[14,0,0,"-"],embedding_layer:[14,0,0,"-"],ffn_layer:[14,0,0,"-"],utils:[14,0,0,"-"]},"parts.transformer.attention_layer":{Attention:[14,1,1,""],SelfAttention:[14,1,1,""]},"parts.transformer.attention_layer.Attention":{call:[14,2,1,""],combine_heads:[14,2,1,""],split_heads:[14,2,1,""]},"parts.transformer.attention_layer.SelfAttention":{call:[14,2,1,""]},"parts.transformer.beam_search":{SequenceBeamSearch:[14,1,1,""],_StateKeys:[14,1,1,""],_expand_to_beam_size:[14,5,1,""],_flatten_beam_dim:[14,5,1,""],_gather_beams:[14,5,1,""],_gather_topk_beams:[14,5,1,""],_length_normalization:[14,5,1,""],_shape_list:[14,5,1,""],_unflatten_beam_dim:[14,5,1,""],sequence_beam_search:[14,5,1,""]},"parts.transformer.beam_search.SequenceBeamSearch":{_continue_search:[14,2,1,""],_create_initial_state:[14,2,1,""],_get_new_alive_state:[14,2,1,""],_get_new_finished_state:[14,2,1,""],_grow_alive_seq:[14,2,1,""],_search_ste
p:[14,2,1,""],search:[14,2,1,""]},"parts.transformer.beam_search._StateKeys":{ALIVE_CACHE:[14,4,1,""],ALIVE_LOG_PROBS:[14,4,1,""],ALIVE_SEQ:[14,4,1,""],CUR_INDEX:[14,4,1,""],FINISHED_FLAGS:[14,4,1,""],FINISHED_SCORES:[14,4,1,""],FINISHED_SEQ:[14,4,1,""]},"parts.transformer.common":{LayerNormalization:[14,1,1,""],PrePostProcessingWrapper:[14,1,1,""]},"parts.transformer.common.LayerNormalization":{build:[14,2,1,""],call:[14,2,1,""]},"parts.transformer.embedding_layer":{EmbeddingSharedWeights:[14,1,1,""]},"parts.transformer.embedding_layer.EmbeddingSharedWeights":{build:[14,2,1,""],call:[14,2,1,""],linear:[14,2,1,""]},"parts.transformer.ffn_layer":{FeedFowardNetwork:[14,1,1,""]},"parts.transformer.ffn_layer.FeedFowardNetwork":{call:[14,2,1,""]},"parts.transformer.utils":{get_decoder_self_attention_bias:[14,5,1,""],get_padding:[14,5,1,""],get_padding_bias:[14,5,1,""],get_position_encoding:[14,5,1,""]},"utils.funcs":{evaluate:[15,5,1,""],infer:[15,5,1,""],restore_and_get_results:[15,5,1,""],train:[15,5,1,""]},"utils.hooks":{BroadcastGlobalVariablesHook:[15,1,1,""],PrintLossAndTimeHook:[15,1,1,""],PrintSamplesHook:[15,1,1,""],RunEvaluationHook:[15,1,1,""]},"utils.hooks.BroadcastGlobalVariablesHook":{__init__:[15,2,1,""],after_create_session:[15,2,1,""],begin:[15,2,1,""]},"utils.hooks.PrintLossAndTimeHook":{after_run:[15,2,1,""],before_run:[15,2,1,""],begin:[15,2,1,""]},"utils.hooks.PrintSamplesHook":{after_run:[15,2,1,""],before_run:[15,2,1,""],begin:[15,2,1,""]},"utils.hooks.RunEvaluationHook":{after_run:[15,2,1,""],before_run:[15,2,1,""],begin:[15,2,1,""]},"utils.utils":{Logger:[15,1,1,""],array_to_string:[15,5,1,""],cast_types:[15,5,1,""],check_params:[15,5,1,""],clip_last_batch:[15,5,1,""],clip_sparse:[15,5,1,""],collect_if_horovod:[15,5,1,""],deco_print:[15,5,1,""],flatten_dict:[15,5,1,""],get_available_gpus:[15,5,1,""],get_git_diff:[15,5,1,""],get_git_hash:[15,5,1,""],get_results_for_epoch:[15,5,1,""],iterate_data:[15,5,1,""],log_summaries_from_dict:[15,5,1,""],mask_nans:[15,5,1,""],nest_dict:[15,5,1,""],nested_update:[15,5,1,""],text_ids_to_string:[15,5,1,""]},"utils.utils.Logger":{flush:[15,2,1,""],write:[15,2,1,""]},data:{data_layer:[0,0,0,"-"],image2label:[1,0,0,"-"],speech2text:[2,0,0,"-"],text2text:[3,0,0,"-"],utils:[0,0,0,"-"]},decoders:{convs2s_decoder:[4,0,0,"-"],decoder:[4,0,0,"-"],fc_decoders:[4,0,0,"-"],rnn_decoders:[4,0,0,"-"]},encoders:{cnn_encoder:[5,0,0,"-"],convs2s_encoder:[5,0,0,"-"],ds2_encoder:[5,0,0,"-"],encoder:[5,0,0,"-"],resnet_blocks:[5,0,0,"-"],resnet_encoder:[5,0,0,"-"],rnn_encoders:[5,0,0,"-"],w2l_encoder:[5,0,0,"-"]},losses:{cross_entropy_loss:[6,0,0,"-"],ctc_loss:[6,0,0,"-"],loss:[6,0,0,"-"],sequence_loss:[6,0,0,"-"]},models:{encoder_decoder:[7,0,0,"-"],image2label:[7,0,0,"-"],model:[7,0,0,"-"],speech2text:[7,0,0,"-"],text2text:[7,0,0,"-"]},optimizers:{automatic_loss_scaler:[9,0,0,"-"],lr_policies:[9,0,0,"-"],mp_wrapper:[9,0,0,"-"],optimizers:[9,0,0,"-"]},parts:{cnns:[11,0,0,"-"],convs2s:[12,0,0,"-"],rnns:[13,0,0,"-"],transformer:[14,0,0,"-"]},utils:{funcs:[15,0,0,"-"],hooks:[15,0,0,"-"],utils:[15,0,0,"-"]}},objnames:{"0":["py","module","Python module"],"1":["py","class","Python class"],"2":["py","method","Python method"],"3":["py","staticmethod","Python static method"],"4":["py","attribute","Python attribute"],"5":["py","function","Python 
function"]},objtypes:{"0":"py:module","1":"py:class","2":"py:method","3":"py:staticmethod","4":"py:attribute","5":"py:function"},terms:{"106gb":23,"1080ti":31,"16xlarg":30,"1e6":3,"1e9":14,"1x1":5,"224gb":23,"2xlarg":30,"4gpu":31,"55gb":23,"5gb":29,"8xlarg":30,"\u03b1":24,"\u03b4":24,"\u03b5":24,"\u03b6":24,"\u03ba":24,"abstract":[0,4,5,6,7],"boolean":[1,5,13],"byte":3,"case":[0,4,5,6,7,9,13,16,18,27,30],"char":0,"class":[0,1,2,3,4,5,6,7,9,12,13,14,15,18,27],"default":[3,5,6,9,13,15,27,29,30],"enum":3,"export":23,"final":[4,7,13,15,23,27,31],"float":[1,2,4,5,7,9,12,13,14,27,30],"function":[0,2,4,5,6,7,9,11,13,14,15,27,30],"import":[3,27,30],"int":[0,1,2,3,4,5,7,9,12,13,14,27],"long":13,"new":[0,1,3,12,13,14,15,25,27,28],"return":[0,1,2,3,4,5,6,7,9,12,13,14,15,18,30],"short":[13,27,31],"static":[0,1,2,3,4,5,6,7,9,13,18,27,30],"true":[0,1,3,5,6,7,9,13,14,16,27,31],"try":[5,9,24,27,29],"while":[1,3,18,24,27,30],AWS:30,Adding:25,And:[0,1,3,30],But:7,EOS:14,For:[1,2,3,4,5,7,9,13,15,18,23,24,27,28,30,31],IDs:3,Its:13,NOT:13,One:[24,27,30],Such:[13,18],That:[0,1,3,4,7],The:[0,1,3,5,7,9,13,14,15,18,23,27,29,30,31],There:27,These:[1,4,7,13,15,30],Use:5,Used:[3,7,14],Uses:[5,15,31],Using:25,Will:[5,7],With:[16,18,31],__call__:5,__init__:[0,1,2,3,4,5,6,7,12,13,15,18,27],_aspect_preserving_res:1,_baseattentionmechan:13,_basemonotonicattentionmechan:13,_batch_exampl:3,_build_attent:4,_build_forward_pass_graph:7,_building_block_v1:5,_building_block_v2:5,_cast_typ:[4,5,6],_central_crop:1,_clip_gradients_by_norm:9,_compute_loss:6,_continue_search:14,_count_and_gen_subtoken:3,_count_token:3,_create_decod:7,_create_encod:7,_create_initial_st:14,_create_loss:7,_create_min_max_boundari:3,_decod:4,_decode_crop_and_flip:1,_distributed_appli:9,_encod:5,_escape_token:3,_expand_to_beam_s:14,_filter_and_bucket_subtoken:3,_filter_max_length:3,_flatten_beam_dim:14,_gather_beam:14,_gather_topk_beam:14,_gather_tre:13,_gen_new_subtoken_list:3,_generate_alphabet_dict:3,_generate_subtoken:3,_generate_subtokens_with_target_vocab_s:3,_get_example_length:3,_get_input_for_group:13,_get_new_alive_st:14,_get_new_finished_st:14,_get_num_objects_per_step:7,_get_symbols_to_logits_fn:4,_grow_alive_seq:14,_item_or_tupl:13,_join_tokens_to_str:3,_length_norm:14,_linear:13,_list_to_index_dict:3,_load_record:3,_load_vocab_fil:3,_maybe_merge_batch_beam:13,_maybe_split_batch_beam:13,_mean_image_subtraction_and_norm:1,_merge_batch_beam:13,_monotonic_probability_fn:13,_native_to_unicod:3,_output:7,_parse_audio_el:2,_parse_audio_transcript_el:2,_parse_exampl:3,_parse_example_proto:1,_read_and_batch_from_fil:3,_resize_imag:1,_save_vocab_fil:3,_search_step:14,_shape_list:14,_smallest_size_at_least:1,_split_batch_beam:13,_split_string_to_token:3,_split_token_to_subtoken:3,_statekei:14,_subtoken_ids_to_token:3,_test:29,_token_to_subtoken_id:3,_unescape_token:3,_unflatten_beam_dim:14,_unicode_to_n:3,abl:[0,1,3,23,29],about:[5,15,31],abov:[13,24,30],abs:[5,13,31],absolut:30,acceler:13,accept:[3,5,11,13],access:[0,4,5,6,7],accord:13,accumul:[7,27],accuraci:[1,7,29,30],achiev:[13,29],across:[3,6,7,27,30],activ:[5,11,13,30],activation_fn:[5,11],actual:[4,7,13,27],adagrad:[7,9,27],adam:[7,9,27,31],adapt:[5,7,27],add:[3,4,5,7,9,13,15,27,30],add_eo:3,add_r:12,added:[0,3,5,7,12,13,14,15,27],adding:15,addit:[2,5,7,9,13,14,23,27,30,31],addition:[1,27],adjust:[7,9,24,27,30],after:[0,1,5,7,9,13,15,23,24,27,29],after_create_sess:15,after_run:15,again:[27,29],aggreg:7,aggregation_method:9,aggregationmethod:9,alben:30,algorithm:[7,9,27,30],align:13,alignment_his
tori:13,alignments_s:13,aliv:14,alive_cach:14,alive_log_prob:14,alive_seq:14,all:[0,1,2,3,4,5,6,7,13,14,15,18,24,27,29,30,31],allow:[3,24,31],along:[5,13],alpha:14,alphabet:[3,4],alphabet_config_path:4,alreadi:[13,28],also:[0,1,3,7,9,15,16,24,27,28,29,30],altern:[29,30],alwai:[7,9,27,30],amax:9,amount:3,analog:7,analysi:2,ani:[3,4,5,7,9,13,14,15,27,30,31],anoth:[3,13],answer:[7,27],anymor:15,anyth:9,api:[2,28],appear:3,append:[3,27],appli:[3,5,7,9,11,12,13,14,27,30],apply_gradi:9,approach:[7,13,16,30],apt:[23,29],arbitrari:[5,9],architectur:[24,30],archiv:23,aren:3,arg:[13,15],argmax:13,argument:[2,3,4,5,6,7,9,13,14,15,16,23,27],arithmet:30,around:[1,4,5,6,23,29,30],arrai:[2,3,7],arrang:1,array_op:13,array_to_str:15,articl:13,artifici:4,arxiv:[5,9,13,30,31],aspect:1,assign:[0,4],assum:[13,23,29],assumpt:[7,13],attend:13,attent:[4,12,13,14,24,31],attention_bia:14,attention_cel:13,attention_depth:13,attention_dropout:14,attention_lay:[8,10],attention_layer_s:13,attention_mechan:13,attention_or_cell_output:13,attention_st:13,attention_typ:4,attention_wn_lay:[8,10],attention_wrapp:[8,10],attentioninputwrapp:13,attentionlayernorm:12,attentionmechan:13,attentionwrapp:13,attentionwrapperst:13,attribut:[7,27],audio:[2,7,23],augment:2,augment_audio_sign:2,auto:6,automat:[4,5,6,7,9,27,29,31],automatic_loss_sc:30,automatic_loss_scal:8,automaticlossscal:9,autoregress:14,avail:[24,27,28,30,31],averag:[6,7,27],average_across_timestep:6,avoid:13,axi:[7,13],back:[14,30],backoff:[7,9,27,30],backoffscal:9,backpropag:30,backslash:3,bahadanau:13,bahdanau:[4,13],bahdanau_norm:4,bahdanauattent:13,bahdanaumonotonicattent:13,bandwidth:30,base:[0,1,2,3,4,5,6,7,9,12,13,14,15,16,23,24,27,28,29,31],base_model:27,base_param:[27,30],basic:[6,13,28],basic_sequence_loss:6,basicsequenceloss:6,basicslstmcel:13,batch:[3,4,5,6,7,9,11,13,14,18,27,31],batch_in_token:3,batch_norm:5,batch_siz:[2,3,4,5,6,12,13,14],batch_size_per_gpu:[6,7,24,27,31],batches_per_epoch:9,bazel:29,bbox:1,beahvior:13,beam:[4,13,14],beam_indic:14,beam_search:[8,10],beam_search_decoder_output:13,beam_siz:14,beam_width:[4,13],beamsearch:13,beamsearchdecod:13,beamsearchdecoderoutput:13,beamsearchdecoderst:13,beamsearchrnndecoderwithattent:4,becaus:[3,14,24,30],becom:2,been:[9,13,14,15,30],befor:[3,5,7,12,13,15,24,27,30],before_run:15,begin:[3,13,15,30],begin_decay_at:9,behavior:[13,30],being:[3,25],below:[29,30,31],bench_start:27,bench_step:27,benchmark:[7,27,30],benefici:30,bengio:13,besid:7,best:[13,14,29],beta1:9,beta2:9,better:[1,14],between:[3,7,13,14,31],bhadanau:13,bia:[12,13,14],bias:13,bias_initi:13,bidir_rnn_encoder_with_emb:5,bidirect:31,bidirectionalrnnencoderwithembed:5,big:[29,31],bigger:30,bin:29,binari:[3,23,29],bleu:[7,31],blob:14,block:[5,27],block_fn:5,block_lay:5,blue:24,bn_epsilon:[5,11],bn_momentum:[5,11],bn_regular:5,bodi:14,bool:[0,4,5,6,7,9,12,13,14,27],boost:29,bori:30,both:[1,3,7,13,27,30,31],bottleneck:5,bottleneck_block:5,bottleneck_block_v1:5,bottleneck_block_v2:5,bottom:13,bound:1,boundari:[3,9],boundary_scal:3,box:1,bpe_us:7,broadcast:15,broadcastglobalvariableshook:15,bucket:3,buckets_max:3,buckets_min:3,buffer:1,build:[2,4,5,14,15,22,28,29,30],build_graph:[0,1,2,3],build_image_data:1,build_lay:5,build_lm:23,build_pip_packag:29,building_block:5,building_block_v1:5,building_block_v2:5,built:[5,7,27,28],c_state:13,cach:14,calcul:[0,1,3,4,7,12,13,14],calculate_bleu:7,call:[4,5,6,7,9,12,13,14,15,18,30],callabl:[9,13],callback:15,can:[0,1,2,3,4,5,6,7,9,13,15,16,18,24,27,28,29,30,31],candiat:3,candid:3,cannot:[7,13,14,27],cast:[4,
5,6,30],cast_typ:15,cat:24,cell:[4,5,13,31],cell_class:13,cell_input_fn:13,cell_param:13,cell_stat:13,center:1,central:1,chang:[13,15,16,24,30],channel:[1,5],channels_first:5,channels_last:[5,11],charact:[3,4],check:[5,7,13,27,28,29,30],check_grad:9,check_param:15,checkpoint:[7,13,15,27,31],child:27,cho:13,choos:[13,30],christoph:13,cifar:1,cifardatalay:1,classic:7,clean:[23,31],cleaned_fil:24,clip:[7,9,27],clip_gradi:9,clip_last_batch:[7,15],clip_spars:15,clone:[13,29],close:[3,13,31],cloud:30,cmake:29,cnn:[5,8,10,28],cnn_encod:8,cnn_layer:5,cnnencod:5,code:[3,7,13,27],coeffici:[9,30],colin:13,collect:[7,9,14,15,30],collect_if_horovod:15,colloqui:1,coloc:9,colocate_gradients_with_op:9,color:1,colorspac:1,column:13,com:[12,14,29],combin:[7,9,14,28],combine_head:14,come:31,command:[16,23,24,27,29,31],comment:31,commit:27,common:[8,10,30],commonli:30,compar:[5,30],compat:9,compil:[7,27],complet:[7,9,19,20,21,26,27,29],complex:30,compon:13,compos:13,compress:23,comput:[1,6,7,9,13,14,30],compute_gradi:[9,30],compute_loss:[6,7,15],concat:13,concaten:[7,13],concret:30,config:[0,1,2,3,4,5,6,7,9,15,16,25,29,31],config_fil:[23,24,27,29,31],configur:[7,16,24,27,28,29,31],conflict:13,conjunct:[7,27],connect:[4,5,7,12,13,14],conrib:13,consecut:3,consist:[4,15],constant:[9,30],constraint:13,construct:[0,1,3,4,5,6,7,13,15],constructor:[0,2,4,5,6,7,9,13,27],consumpt:30,contain:[0,1,2,3,4,5,6,7,9,12,13,14,15,24,27,29,30,31],content:[1,4,5,6],context:13,continu:[4,14,27],continue_learn:27,contrib:13,control:[7,13],conv1d:[5,11],conv1dnetworknorm:12,conv2d:[5,11],conv2d_fixed_pad:5,conv:5,conv_actv:11,conv_block:[8,10],conv_bn_actv:11,conv_lay:5,conv_pad:12,conv_seq2seq:12,conv_wn_lay:[8,10],conveni:27,convent:5,converg:30,convert:[2,3,13,23,30],convnet_lay:5,convolut:[5,11,12,31],convs2:[5,8,10,14,31],convs2s_decod:8,convs2s_encod:8,convs2s_encoder_with_emb:5,convs2sdecod:4,convs2sencod:5,coord:[1,15],coordin:[1,15],copi:[5,7,13,30],copt:29,core:[3,30],core_cel:4,core_cell_param:4,correct:[7,13,14,23,24,27],correctli:[7,29],correspond:[0,1,2,3,7,9,13,14,18,24,27,29,30],correspondingli:[9,18],cosin:14,could:[0,2,4,5,6,7,9,15,27],count:[0,3,4,27],cover:27,cpu:[3,30],creat:[0,1,3,4,5,6,7,13,14,15,22,27,30],create_toy_data:24,creation:[7,30],crop:1,crop_height:1,crop_width:1,cross:6,cross_entropy_loss:8,cross_entropy_with_smooth:6,crossentropyloss:6,crossentropywithsmooth:6,csv:[2,23],ctc:[4,6],ctc_decoder_with_lm:29,ctc_greedy_decod:4,ctc_loss:8,ctcloss:6,cuda:[29,30],cudnn:[5,31],cudnn_gru:5,cudnn_lstm:5,cudnnlstm:13,cumprod:13,cumsum:13,cumul:13,cur_index:14,current:[1,4,5,7,9,12,13,14,27,31],custom:[29,30],cut:[4,7],d_model:9,dai:31,data:[1,2,3,4,5,6,7,8,13,15,17,23,24,25,27,30],data_fil:3,data_format:[5,11],data_lay:[1,2,3,4,5,6,7,8,27],data_layer_param:[7,27],data_root:24,datalay:[0,1,2,3,7,18,27],dataset:[0,1,2,3,7,18,22,24],dataset_fil:2,david:30,dct:15,debug:27,debug_port:[15,27],debugger_port:27,dec:5,decai:[9,30,31],decay_r:9,decay_step:9,deco_print:15,decod:[0,1,3,5,6,7,8,12,13,14,17,23,24,25,27,28,31],decode_and_crop:1,decode_pad:12,decode_pass:4,decoder_cell_typ:4,decoder_cell_unit:4,decoder_dp_input_keep_prob:4,decoder_dp_output_keep_prob:4,decoder_initial_st:13,decoder_library_path:4,decoder_output:6,decoder_param:[4,7,31],decoder_use_skip_connect:4,decreas:[3,30],deep:[5,13,30,31],deepbench:30,deepspeech2:31,deepspeech2encod:[5,27],deepspeech:[5,27,29],defaultdict:3,defin:[0,1,3,4,5,7,9,13,14,24,27,30],definit:5,degre:30,delet:23,delim:[7,15],denomin:[7,27],denot:1,dens:[5,7,13],dense_tensor:6,
dense_to_spars:6,depend:[5,13],deprec:13,depth:[1,13,28],deriv:[0,4,5,6,7,27],describ:[0,1,3,4,5,6,7,13,27,29,30,31],descript:[0,1,2,3,4,5,6,7,27,31],design:[28,30],desir:30,detail:[1,2,4,5,7,13,27,28,31],determin:[13,14],dev:[23,24,29,31],deviat:13,devic:15,diamo:30,dict:[0,1,2,3,4,5,6,7,13,14,27],dict_to_log:15,dictionari:[0,1,2,3,4,5,6,7,14,18,27],did:24,diederik:13,diff:27,differ:[1,3,5,13,14,15,24,27,28,29,30],dim:[4,5,6,13],dimens:[1,3,4,5,12,13,14],dimension:[9,13],dimenst:12,diment:12,direct:[5,31],directori:[7,27,29],disabl:[7,13,24,27,29,30],discov:29,disk:24,displai:27,distanc:7,distort:1,distribut:[3,7,13,27,28,29,30],divid:[1,14],divis:[0,2,3,7,13],dnn:30,do_mask:6,doc:[4,5,6,7,27],docker:[29,30],docstr:13,document:[13,27,28,30],doe:[1,4,6,7,13,24,27,29,30,31],doesn:1,domain:2,don:[3,5,14,31],done:[12,24,29],dot:14,dougla:13,download:[23,24],download_lm:29,downsampl:5,dp_input_keep_prob:13,dp_output_keep_prob:13,draw:14,drawn:3,dropout:[4,5,12,13],dropout_keep_prob:5,dropout_keep_prop:5,ds2_encod:[8,27],ds2_large_8gpu:31,ds2_librispeech_larc_config:23,ds2_medium_4gpu:31,ds2_small_1gpu:31,ds2_toy_data_config:[23,29],dtype:[0,4,5,6,7,9,13,14,15,27,30],due:13,dure:[1,3,4,7,13,14,15,24,27,30],dynam:[7,13,30],dynamic_decod:13,dzmitri:13,each:[0,1,3,4,6,7,9,13,14,15,18,27,30],eager:9,earli:[13,31],easi:28,easili:24,eck:13,effect:[13,31],effici:[3,5,13,28],eight:31,either:[0,2,4,5,6,7,9,13,27,30],element:[2,3,7,13],elimin:31,els:[6,9],elsen:30,embed:[4,5,12,13,14],embed_s:12,embed_scal:14,embedding_lay:[8,10],embedding_lookup:13,embedding_s:14,embeddingsharedweight:14,emit:13,emnlp:13,empti:[7,13,18,27],emul:[7,27],enabl:[7,9,13,16,27,29],enable_log:[27,31],enc_emb_w:5,encod:[0,1,3,4,6,7,8,12,13,14,17,24,25,27,28,31],encoder_cell_typ:5,encoder_cell_unit:5,encoder_decod:[4,5,6,8,27],encoder_dp_input_keep_prob:5,encoder_dp_output_keep_prob:5,encoder_final_st:13,encoder_lay:5,encoder_output:[4,5,13],encoder_output_a:12,encoder_output_b:12,encoder_outputs_b:4,encoder_param:[5,7],encoder_sequence_length:4,encoder_st:[5,13],encoder_use_skip_connect:5,encoderdecodermodel:[7,27],encorc:13,end:[3,4,13,14,15,24,29,30],end_compat:9,end_learning_r:9,end_of_choic:3,end_symbol:4,end_token:13,energi:13,enforc:13,english:[3,22],enough:[24,27,30],ensur:[3,13,14,15,30],entri:13,entropi:6,enumer:3,eos:14,eos_id:[3,14,15],epoch:[0,1,3,7,9,27,31],epsilon:[5,7,14,27],equal:[1,7,13,27],equival:[9,13],erich:30,error:[7,13,23,29,31],escap:3,especi:30,essenti:15,estim:30,etc:[4,5,7,18,27,28],etl:18,eval:[1,4,5,7,23,27],eval_input_fn:3,eval_model:15,eval_param:27,eval_step:[7,27],evalu:[0,1,3,7,15,23,24,27,31],evenli:3,event:[27,30],everi:[5,7],every_step:15,everyth:[23,27,28,29],exact:[13,27],exactli:13,exampl:[0,1,2,3,4,5,7,9,13,18,23,24,27,29,30],example_config:[23,24,27,29],example_seri:1,exce:[7,27],except:[5,13,15,27,29,30],execut:[7,9,16,24,27],exist:[0,4,13,25,30],exp:13,exp_decai:9,expect:6,experi:[24,27,28],experiment:28,explicit:5,explicitli:[12,13,30],exponenti:9,exponential_decai:9,express:27,extend:28,extens:30,extract:[2,18,23],fact_siz:13,factor:[13,14,30],fail:29,fairli:27,fals:[0,1,3,4,5,6,7,9,13,14,15,29,31],familiar:28,fc_decod:8,fc_layer:5,featur:[2,4,5,6,28],features_typ:2,fed:18,feed:[13,15],feed_dictionari:18,feedforward:[12,14],feedfowardnetwork:14,feedfowardnetworknorm:12,feel:23,fetch:7,few:31,ffn_layer:[8,10],ffn_wn_layer:[8,10],field:[1,2,4,5,6,13],file:[0,1,2,3,4,7,23,24,27,29,31],file_byte_limit:3,file_pattern:3,file_with_bpe_segment:24,filenam:[1,2,3],filepath:3,filter:[3,5,9,11]
,filter_s:14,final_output:4,final_sequence_length:4,final_st:[4,13],finalbeamdecoderoutput:13,finalbeamsearchdecoderoutput:13,finalize_evalu:7,finalize_infer:7,find:[14,28],finish:[13,14,24,27],finished_flag:14,finished_scor:14,finished_seq:14,first:[0,1,3,5,7,9,12,13,14,15,16,23,24,27,30],fit:3,fix:9,fixed_lr:9,fixed_pad:5,flag:[13,14],flaot:14,flat_dict:15,flatten_dict:15,flexibl:28,flip:1,float16:[0,4,5,6,7,9,27,30],float32:[0,2,4,5,6,7,9,12,14,27,30,31],flstm:[8,10],flstmcell:13,flush:15,folder:[23,24,27,29],follow:[0,1,3,4,5,6,7,11,18,23,24,27,29,30,31],forc:27,force_var_reus:7,forev:3,forget:[13,31],forget_bia:13,form:[0,3,13],format:[1,3,5,11],formul:14,forward:[7,30],found:3,four:31,fp32:30,fraction:3,frame:[2,7],free:23,frequenc:[2,3],frequent:3,from:[0,1,2,3,4,5,6,7,9,12,13,14,15,18,27,29,30,31],ftrl:[7,27],full:[5,9,30,31],fulli:[4,5,12,14],fully_connected_ctc_decod:4,fully_connected_decod:4,fully_connected_time_decod:4,fullyconnectedctcdecod:[4,23],fullyconnecteddecod:4,fullyconnectedtimedecod:4,func:8,furthermor:30,fuse:1,futur:14,gamma_regular:5,ganesh:30,garcia:30,gate:[9,12,13],gate_gradi:9,gate_graph:9,gate_non:9,gate_op:9,gated_linear_unit:12,gather:[14,15],gen_input_tensor:18,gener:[3,4,5,7,13,14,27],generate_tri:29,geometr:14,german:[3,22],get:[7,9,14,18,23,24,28,29],get_available_gpu:15,get_data_lay:7,get_decoder_self_attention_bia:14,get_git_diff:15,get_git_hash:15,get_next:18,get_num_objects_per_step:7,get_optional_param:[0,1,2,3,4,5,6,7,18,27],get_output_tensor:7,get_pad:14,get_padding_bia:14,get_position_encod:14,get_regularization_loss:9,get_required_param:[0,1,2,3,4,5,6,7,18,27],get_results_for_epoch:15,get_size_in_sampl:[0,1,2,3,18],get_speech_featur:2,get_speech_features_from_fil:2,get_tf_dtyp:7,get_vari:30,get_wmt16_en_dt:24,getter:30,ginsburg:[13,30],git:[27,29],github:[12,14,29],give:12,given:[1,9,13,14,30],global:[9,15],global_gradient_norm:[7,27],global_step:9,glstm:[4,8,10],glstmcell:13,glu:12,gnmt:[4,5,8,10,31],gnmt_encoder_with_emb:5,gnmt_encoder_with_emb_cudnn:5,gnmt_residual_fn:13,gnmt_v2:4,gnmtattentionmulticel:13,gnmtlikeencoderwithembed:5,gnmtlikeencoderwithembedding_cudnn:5,go_symbol:4,goal:24,going:[4,5,19,20,21,26,27,29],good:27,gpu:[0,6,7,15,16,18,24,27,28,29,30,31],gpu_id:[7,27],grad_loss:9,gradient:[7,9,13,27,30],gradient_norm:[7,27],grads_and_var:9,gram:23,graph:[0,1,2,3,4,5,6,7,9,15,27,30],graphic:30,graphkei:9,greater:5,greedi:31,gregori:30,group:[3,13],group_batch_s:3,group_id:13,group_siz:13,grow:14,gru:[4,5,31],guarante:[3,13],half:30,halv:30,handl:13,happen:[0,1,3,4,5,6,7,18],hard:13,hardmax:13,has:[0,1,2,3,4,5,6,7,13,14,15,30,31],has_nan:9,hash:27,hat:9,have:[0,1,2,3,4,5,6,7,9,13,14,18,24,27,28,29,30,31],head:14,height:1,height_in:5,help:[7,23,27],helper:[5,11,14],henc:30,here:[0,1,3,4,5,6,7,13,18,27,30],hetland:7,hidden:[4,5,13],hidden_dropout:12,hidden_s:[4,14],hieu:13,high:30,higher:13,highest:14,highli:31,histori:13,hold:[9,14],hook:[7,8],horovod:[0,3,6,7,9,15,16,27,31],horovod_gpu_broadcast:15,hot:[1,6,13],houston:30,how:[7,9,13,14,22,24,25],howev:[1,30],http:[5,7,9,12,13,14,29,31],human:15,hvd:[7,15,27],hyperparamet:30,iclr:[13,30],icml:13,id_and_audio_filenam:2,ident:[5,13],ids:[0,2,3,7,13,14,15,27],idx2char:7,ignor:[0,7,9,13,16,27],ignore_speci:[7,15],illeg:14,illustr:30,ilsvrc2012_val_00041207:1,imag:[1,5,7],image2label:[0,8,27],image_buff:1,image_s:1,imagenet_preprocess:[0,8],imagenetdatalay:1,implement:[6,7,9,12,13,14,18,27,31],impli:13,import_librivox:23,improv:[1,3,30],in_dim:12,incept:1,includ:[0,1,2,3,4,5,6,7,13,23,30],i
ncreas:[14,30],increment:9,independ:[5,6,7,27],index:[13,14],indexedslic:9,indic:[1,3,14,15],inf:13,infer:[0,1,3,4,5,7,13,15,23,27],infer_output_fil:[23,24,27],infer_param:27,infin:14,inform:[5,7,13,14,15,27,28],inherit:[0,4,5,6,7,18],init_from_fil:3,init_var:14,initi:[3,4,5,7,9,12,13,14,15,27,30],initial_cach:14,initial_cell_st:13,initial_id:14,initial_st:13,initializer_param:[4,5,7,27],inner:[13,14],input:[0,1,2,3,4,5,6,7,11,12,13,14,27,28],input_attention_bia:12,input_dict:[4,5,6,15],input_lay:5,input_length:4,input_s:13,input_sequence_length:13,input_tensor:[0,1,2,3,4,5,6,7],input_typ:2,input_valu:7,inputs_attention_bia:4,insensit:[7,9,27],insid:[0,1,3,7,24,30],inspect:30,inspir:[12,13],instabl:13,instal:[23,28],instanc:[0,4,5,6,7,13,30],instead:[1,4,9,13,29,30],instruct:[23,28],insur:14,int32:[1,13,14],int64:[13,14],intact:5,integ:[1,3,5,13],inter:30,intermedi:[7,27,30],intern:[9,13,25],introduc:[1,5,30],invalid:9,invalidargu:13,invari:14,invers:3,involv:14,is_train:1,issu:31,item:[3,13,14],iter:[0,1,2,3,7,15,18,27,30],iter_s:[7,9,27],iterate_data:15,its:[3,12,13,14,23,27],jian:5,join:3,jointli:13,jonah:30,jpeg:1,jul:5,just:[4,5,29],kaim:5,keep:[4,5,6,12,13,15,30],kei:[0,1,3,5,7,9,12,14],kenlm:[23,29],kept:30,kernel:[5,12,31],kernel_initi:13,kernel_regular:5,kernel_s:[5,11],kernel_width:12,key_channel:14,keyword:14,kind:27,kingma:[12,13],knee:1,known:[1,3,13],kpu:29,kuchaev:30,kuchaiev:13,kwarg:[13,14],kyunghyun:13,label:[1,4,6],lambda:[13,30],languag:[4,22],language_model:29,lar:[7,27],larc:[7,9,27,31],larc_eta:[7,27],larc_mod:[7,27],larc_param:[7,9,27],larg:[7,13,27,30],largest:14,last:[4,5,7,12,27],last_batch:[7,15],last_step:[7,15],latenc:30,later:30,latter:13,launch:[15,24],layer:[0,1,2,3,4,5,7,9,12,13,14,17,25,27,30,31],layer_id:12,layer_param:5,layer_typ:5,layernorm:14,layernorm_lstm:5,layout:1,lead:29,learn:[5,7,9,13,24,27,31],learnabl:13,learning_r:[7,9,27],learning_rate_decay_fn:9,least:[5,14],left:13,length:[0,1,2,3,4,5,6,12,13,14],length_i:14,length_penalty_weight:13,length_x:14,less:[3,30,31],level:[27,30],levenshtein:7,libboost:29,libctc_decoder_with_kenlm:29,librari:4,librispeech:[22,31],librivox:23,libsox:23,libtensorflow_cc:29,libtensorflow_framework:29,like:[5,9,14,18,24,31],limit:30,line:[0,3,15,16,24,27],linear:[4,12,13,14],link:[29,31],list:[0,1,2,3,4,5,6,7,9,13,14,18,27,28],liu:13,live:14,lm_binary_path:4,lm_trie_path:4,lm_weight:4,load:[0,3,18],load_pre_existing_vocabulari:0,locat:[14,24,29],log:[7,13,14,25,30],log_fil:15,log_max:9,log_summaries_from_dict:15,logdir:[7,24,27],logger:15,logic:[7,14,18],logit:[4,6,13,14],logits_to_outputs_func:4,logmax:[7,9,27,30],logmaxscal:9,lognorm:30,logspac:13,longer:15,longest:3,look:[5,7,27,28,31],loop:14,lose:30,loss:[0,1,3,4,7,8,9,17,25,27,31],loss_comput:7,loss_input_dict:6,loss_param:7,loss_scal:[7,9,27,30],lot:[23,24,29],lower:[3,30],lr_polici:[7,8,27],lr_policy_param:[7,27],ls_dir:23,lst:3,lstm:[4,5,13,31],lstmstatetupl:13,luong:[4,13],luong_scal:4,luongattent:13,luongmonotonicattent:13,m_state:13,machin:[13,22],mai:[3,13,14],main:[4,5,6,24,27,28],maintain:[14,30],major:13,make:[0,1,3,13,23,24,28,29],malform:9,man:13,mani:30,manner:13,manual:[13,29,30],map:[0,3,4,5,13],mark:[3,13,14],mask:[6,12,13,14],mask_nan:[6,15],mask_pad:14,master:[14,30],match:[1,3,5,13,30,31],matric:13,matrix:[13,14],max:[3,9],max_decode_length:14,max_grad_norm:[7,27],max_length:3,max_lr:9,max_pool2d:5,max_pooling2d:5,max_step:[7,27],max_subtoken_length:3,max_tim:13,max_timescal:14,maxim:9,maximum:[3,7,13,14,27,30],mayb:13,maybe_print_log:7,mean:
[1,3,15,30],measur:[24,31],mechan:[4,13,14],memori:[0,13,24,30],memory_sequence_length:13,mention:30,merg:13,method:[0,1,2,3,4,5,6,7,9,13,14,15,18,27,30],methodolog:30,metric:7,mfcc:2,michael:30,micikeviciu:30,might:[24,27,29,30],milli:2,min:3,min_boundari:3,min_count:3,min_idx:0,min_lr:9,min_timescal:14,min_upd:[7,27],minh:13,mini:18,minibatch:13,minim:[7,9,27],minimum:[0,3,13,14],minumum:3,minut:29,mismanag:13,misspel:23,mix:[4,5,7,9,13,14,27,28,31],mixedprecisionoptimizerwrapp:[9,30],mkdir:[23,29],modal:28,mode:[0,1,3,4,5,7,12,13,15,16,23,24,27,29,31],model:[0,1,2,3,4,5,6,8,9,13,14,15,22,24,25,28,30],model_param:30,modifi:[3,4,5,6,15,30],modul:[4,5,7,9,13,15,27],modular:28,momentum:[5,7,27,31],monoton:13,monotonic_attent:13,more:[3,13,14,24,28,30,31],moreov:27,moss:[24,31],most:[3,7,18,27,30],move:27,mozilla:29,mp_regularizer_wrapp:[9,30],mp_wrapper:8,mpi4pi:29,mpi:7,mpiexec:31,mpirun:16,msg:15,much:[3,14,24,29],multi:[7,14,16,18,24,28,31],multicel:13,multihead:14,multipl:[13,14,31],multipli:[9,13,30],multirnncel:13,must:[0,1,3,4,5,6,7,9,13,14,27],mutli:16,myfavoriteattentionmechan:13,n03623198:1,n_hidden:5,name:[2,3,4,5,6,7,9,11,12,13,27,30],namedtupl:13,nan:6,narang:30,nativ:3,nearli:30,necessari:[0,5,7,15,27,29,30],necessarili:14,need:[0,1,3,4,5,7,9,14,16,18,23,24,27,29,30],neg:[13,14],nest:[13,14,27],nest_dict:15,nested_upd:15,network:[5,12,13,14,30],neural:[5,13,30],new_beam_s:14,new_cach:14,new_height:1,new_log_prob:14,new_seq:14,new_width:1,newli:13,newstest2014:[24,31],next:[4,7,13,27,29,30,31],next_batch_feed_dict:18,next_input:13,next_stat:13,nmt:[24,31],nmt_revers:24,no_dir_check:27,noam:9,node:[16,28],nois:13,noise_level_max:2,noise_level_min:2,non:[13,14],none:[0,2,3,4,5,6,7,9,13,14,15,18,27,30],norm:[5,7,9,11,12,27,31],normal:[1,2,4,5,12,13,14,30],normalize_sign:2,note:[0,1,3,4,5,6,7,9,13,15,23,24,27,30],now:[13,23,27,29,30],num:[4,5,6],num_audio_featur:2,num_box:1,num_channel:[1,5],num_class:1,num_cpu_cor:3,num_epoch:[0,1,3,7,9,27],num_featur:[2,4],num_gpu:[7,16,24,27,31],num_head:14,num_iter:3,num_proj:13,num_rnn_lay:5,num_time_step:2,num_unit:13,num_work:[0,1,2,3,18],number:[0,1,2,3,4,5,6,7,9,13,14,16,27,30,31],number_of_group:13,numer:[7,13,15,27,30],numpi:2,nvidia:[28,29,30],object:[0,1,3,4,5,6,7,9,13,14,15,18,27,30],obtain:[29,30,31],occur:13,offici:28,offset:15,offset_target_by_on:6,often:[7,27,30],old:14,oleksii:30,on_horovod:[7,9],onc:[3,13,15,24],one:[0,1,3,4,5,6,7,9,13,14,15,24,27,30,31],ones:28,onli:[0,1,3,4,5,7,9,11,13,15,16,24,27,30],onlin:13,open:7,open_seq2seq:[1,2,3,7,29],openseq2seq:[3,16,23,24,27,30,31],oper:[4,5,9,13,15,29,30],ops:[1,9,13,15],opt:29,optim:[7,8,27,31],optimize_loss:9,optimizer_cls_nam:9,optimizer_param:[7,9,27],optimizer_summari:9,option:[0,1,2,3,4,5,6,7,9,13,18,27],optional_dict:15,order:[3,5,13,23,24,29,30],org:[5,7,9,13,31],org_dict:15,origin:[1,3,5,13,14,15,23],other:[1,4,7,9,13,15,23,27,29,30],otherwis:[1,7,9,13,15,24,27,29,30],our:[28,29,30],out:[13,14,24,28],out_dim:12,out_of_bucket:3,output:[1,4,5,6,7,12,13,14,15,18,23,24,27,28,29],output_attent:13,output_dim:[4,13],output_dir:15,output_dtyp:13,output_fil:[7,15],output_height:1,output_lay:13,output_s:13,output_time_major:13,output_valu:7,output_width:1,outsid:3,over:[6,30],overal:15,overcom:30,overflow:30,overflow_std_dev:9,overrid:13,overridden:13,overwrit:[7,27],overwritten:27,own:22,p_choose_i:13,packag:[4,5],pad2eight:3,pad:[0,1,2,3,5,11,12,14,18,31],pad_2_eight:3,pad_id:[3,15],pad_sym:14,pad_to:2,pad_vocab_to_eight:[0,14],padded_cross_entropy_with_smooth:6,padded_input_length
:3,padded_length:3,padded_target_length:3,paddedcrossentropylosswithsmooth:6,padding_valu:14,page:[28,29],pair:[3,7,9,13],paper:24,parallel:[3,13,14,24],parallel_interleav:3,paralleltextdatalay:3,param:[0,1,2,3,4,5,6,7,9,13,14,15,18,27],paramet:[0,1,2,3,4,5,6,7,9,11,12,13,14,15,16,18,24,25,29,30],parent:[0,2,4,5,6,27],pars:[1,2],parse_record:1,part:[1,4,6,7,8,9,11,12,13,14,23,27,30],particular:13,partli:13,pass:[0,1,3,4,5,6,7,9,13,14,15,27,30],past:13,path:[0,2,3,4,7,27,31],pauliu:30,pdf:[5,9],penal:13,per:[2,3,6,7,27],perform:[1,2,4,5,6,7,13,24,27,30],period:[27,30],perl:[24,31],peter:13,pham:13,piecewis:9,piecewise_const:9,pip:29,pip_packag:29,pipelin:3,place:[3,27],placehold:18,plane:5,pleas:[13,23],point:[3,13,15,24,30],polici:[7,9,27],poly_decai:9,polynomi:9,polynomial_decai:9,pool_siz:5,popul:[7,9],posit:[5,13,14],possibl:[7,13,27,30,31],post:[9,14],post_process_gradi:9,power:[9,13],practic:30,pre:[0,13,14,24],preactiv:5,precis:[4,5,7,27,28,31],pred:7,predict:[4,7,13,14,15,27],predicted_id:13,preevious_attent:13,prefer:29,prefix:3,prepar:[13,27],prepostprocessingwrapp:14,preprint:30,preprocess:[1,23],preprocess_imag:1,presenc:30,present:9,preserv:1,previou:[12,13,14],previous_attent:13,primarili:1,principl:30,print:[5,7,15,27],print_bench_info_step:[7,27],print_loss_step:[7,27],print_samples_step:[7,27],printlossandtimehook:15,printsampleshook:15,prior:[3,13],probability_fn:13,probabl:[4,5,13,14,27,30],problem:[13,24,30],proce:27,process:[0,1,2,3,7,9,13,14,15,24,27,29],produc:[4,5,6,13],product:[13,14,28],progress:24,project:[5,12,13,28],projection_shortcut:5,propag:[13,30],proper:13,properli:13,properti:[13,14],propos:[5,13,31],proto:1,protocol:1,provabl:14,proven:30,provid:[1,3,4,5,7,9,13,15,23,30],pull:14,put:29,python:[1,3,4,5,7,9,12,13,14,15,18,23,24,27,29,31],quantiti:9,queri:13,raffel:13,rais:[1,9,13,15],random:[1,3,7,13,15,27],random_se:[7,27],randomli:1,rang:[2,13,30],rank:[1,3,13,14,15],rare:30,rate:[5,7,9,27,31],rather:[0,5,7,27,30],ratio:1,raw:[1,2,5],raw_record:1,raw_str:3,reach:14,read:[0,3,13,27],read_char:0,readabl:15,real:24,realli:4,reason:13,receiv:15,recent:30,recip:[28,30],recogn:23,recognit:[5,7,22,29],recommend:[9,13,18,29,30,31],record:[1,3],recov:15,recurr:[13,30],recurs:13,redefin:30,reduc:[3,13,24],reduce_gradi:9,reduce_mean:6,reduce_sum:15,ref:5,refer:[1,3,14],regress:6,regular:[4,5,7,9,11,27],regularizer_param:[4,5,7,27],relat:[0,1,3,13],relu:5,relu_dropout:14,remov:[7,24],ren:5,reparameter:13,repeat:[3,5,13],replac:[3,13,23],report:9,repositori:29,repres:[1,13,14],represent:[4,5],representation_dim:5,request:15,request_stop:15,requir:[0,1,2,3,4,5,6,7,13,18,27,29,30],required_dict:15,res_rank:14,research:28,reserv:3,reserved_token:3,reshap:[5,13,14],residu:[4,5,12,13],residual_connect:13,resiz:1,resize_imag:1,resize_min:1,resized_imag:1,resizemethod:1,resnet:[1,5],resnet_block:8,resnet_encod:8,resnetencod:5,respect:[13,30],rest:29,restor:[13,15,27],restore_and_get_result:15,result:[7,13,14,15,23,24,29,30,31],results_per_batch:7,retriev:30,reus:[13,30],revers:22,rgb:1,right:13,rmsprop:[7,27],rnn:[4,5,8,10,24,28,31],rnn_beam_search_decod:[8,10],rnn_cell:5,rnn_cell_dim:5,rnn_cell_impl:13,rnn_decod:8,rnn_decoder_with_attent:4,rnn_encod:8,rnn_type:5,rnn_unidirect:5,rnncell:13,rnndecoderwithattent:4,robust:30,ron:13,root:[15,31],root_rank:15,row:[5,7,13,15],row_conv:5,row_conv_width:5,rule:30,run:[4,5,7,9,13,14,15,16,23,25,30,31],run_context:15,run_valu:15,runevaluationhook:[7,15],runtim:13,runtimeerror:9,s_id:[3,15],safe:27,safe_cumprod:13,sai:27,saliman:[12,13
],same:[0,1,3,4,5,6,9,13,14,15],sampl:[0,1,2,3,7,13,15,18,27],save:[3,7,27],save_checkpoint_step:[7,27],save_summaries_step:[7,27],scalar:[1,9,13,14],scale:[3,4,7,9,13,14,27],scale_max:9,scale_min:9,scan:13,scheme:[3,9],scope:[4,5,6,9,12,13],score:[7,13,14,31],score_bias_init:13,score_mask_valu:13,score_or_log_prob:14,script:[1,16,23,24,27,29,31],search:[3,4,13,14],second:[2,3,9,12,13,15,30],section:[4,5,7,14,19,20,21,26,27,28,29,31],sed:24,see:[2,4,5,6,7,13,24,27,28,29,30],seed:[7,13,27],select:[7,30],self:[0,1,3,4,5,6,7,13,14,18,27],selfattent:14,semi:3,send:15,sentenc:3,separ:[1,27],seq2seq:[6,13],seq:13,sequenc:[0,1,2,3,4,5,6,7,13,14,22,27,28,29],sequence_beam_search:14,sequence_length:[6,13],sequence_loss:8,sequencebeamsearch:14,seri:1,serial:[1,3],serialized_exampl:3,sess:[7,15],session:15,session_run_hook:15,sessionrunarg:15,sessionruncontext:15,sessionrunhook:15,sessionrunvalu:15,set:[3,4,5,6,7,9,13,15,16,27,28,29,30],setup:[23,31],sgd:[7,9,27,31],shaoq:5,shape:[1,2,3,4,5,6,12,13,14,18],sharan:30,share:[14,27,30],shift:30,shortcut:5,shorter:14,should:[0,1,3,4,5,6,7,9,12,13,14,15,18,23,24,27,29,30,31],shuffl:[0,3,13,18],side:1,sigmoid:13,sigmoid_nois:13,sigmoid_noise_se:13,signal:[2,15],signatur:[5,13],significantli:3,similar:[3,5],simpl:[1,4,13,16,24,27,30],simplest:29,sinc:[3,5,7,13,23,27,29,30],sine:14,singl:[1,3,5,13,14,24,31],single_cel:13,singleton:6,singular:13,situat:[15,30],size:[0,1,2,3,4,5,6,7,12,13,14,15,18,23,27,29,30,31],skip:[27,29,30],skip_update_ph:9,slice:13,slightli:3,sloppi:3,slowest:13,slstm:[8,10],small:[7,13,24,27,30,31],smallest:1,smallest_sid:1,smooth:6,softmax:[6,13,14],solut:13,some:[1,4,5,7,9,13,14,27,28,29,30],someth:29,sometim:27,soon:[13,19,20,21,26,27,31],sort:3,sourc:[0,1,2,3,4,5,6,7,9,11,12,13,14,15,27,29],source_length:2,source_sequ:2,source_tensor:[0,1,2,3,5,7],sox:23,space:12,spars:7,sparse_tensor_to_char:7,sparsemax:13,sparsetensorvalu:7,spatial:5,specialtexttoken:3,specif:[2,4,5,27,29],specifi:[2,7,9,12,14,16,18,27],spectrogram:2,speech2text:[0,8,23,27,29],speech2textdatalay:2,speech:[2,7,22,28],speech_util:[0,8],speed:[3,30],speedup:30,split:[3,4,13,14],split_data:[1,2],split_head:14,src:24,src_emb_dim:12,src_emb_siz:5,src_input:[4,5],src_length:[4,5,6],src_sequenc:5,src_vocab_s:5,stabil:[7,27],stack:13,stai:30,staircas:9,stamp:27,standard:[5,7,13,30],start:[9,13,14,15,16,24,27,28],start_input:13,start_token:13,state:[4,5,13,14],state_is_tupl:13,state_s:13,statist:30,stderr:27,stdout:27,step:[1,7,9,13,15,27,29],step_factor:9,step_window:9,steps_in_epoch:7,steps_per_epoch:9,still:14,stop:[13,15,31],store:[3,7,13,14],str:[0,2,3,4,5,6,7,12],stream:15,strength:14,stride:[2,5,11],string:[1,2,3,4,5,7,9,15,27],structur:[13,14,25],style:13,sub:9,subfold:27,submit:30,subsequ:[1,13,27],subset:[13,31],substitut:31,subtoken:[3,14],subtoken_count:3,subtoken_dict:3,subtoken_list:3,subtract:1,sudo:[23,29],suffer:13,suggest:30,sum:[6,13,15],sum_i:13,summar:30,summari:[7,9,27],sun:5,suppli:9,support:[0,4,5,6,7,11,13,16,18,27,28,29,30],supported_algo:9,sure:[0,1,3,23,24],symbol:[0,4,14],symbols_to_logits_fn:14,symlink:29,synset:1,system:24,t2t:[0,8],tab:27,tabl:31,taht:[0,1,3],take:[4,5,6,9,13,14,15,23,24,29],taken:7,tanh:13,target:[0,1,2,3,4,6,7,12,15,24],target_emb:12,target_length:[2,4],target_s:3,target_sequ:[2,6],target_tensor:[0,1,2,3,4,6,7],target_vocab_s:3,task:22,techniqu:5,tensor2tensor:14,tensor:[0,1,2,3,4,5,6,7,9,12,13,14,15,18,30],tensorarrai:13,tensorboard:[7,9,24,27],tensorflow:[0,1,3,4,5,6,7,9,12,13,14,15,27,28,30],tensorflow_pkg:29,tensorshap:13
,term:[9,13],termin:14,tesla:30,test:24,text2text:[0,8,24,27],text:[1,2,3,4,7,15,23,28,29],text_ids_to_str:15,textlinedataset:2,tfrecord:3,tgt:24,tgt_emb_siz:4,tgt_input:4,tgt_length:[4,6],tgt_sequenc:6,tgt_vocab_s:[4,6],than:[1,3,5,9,13,14,24,30,31],thang:13,thei:[1,3,5,13,30],them:[23,29,30],thi:[0,1,3,4,5,6,7,9,12,13,14,15,18,19,20,21,23,24,26,27,28,29,30,31],thing:[3,24,27,30],those:[7,13],thread:15,three:[1,3],threshold:3,through:[1,13,14,28],thu:[3,4,27],tile:[13,14],tile_batch:13,tiled_encoder_final_st:13,tiled_encoder_output:13,tiled_input:13,tiled_sequence_length:13,tim:13,time:[1,3,4,5,6,13,15,23,27,29,30],time_major:5,time_stretch_ratio:2,timestep:[6,13,30],titan:30,tmp:29,tobyyouup:12,todo:[18,29],togeth:[7,15,30],toi:[22,23,29],tok:[24,31],token:[0,4,7,8,13,14],token_count:3,tool:29,toolkit:28,top:[3,13,14],topic:27,total:[0,9,30],total_regularization_loss:9,tower:[7,16],toy_text_data:24,tra:3,track:[13,15],tracks_own_finish:13,train:[0,1,3,4,5,7,9,11,13,14,15,22,24,27,28,29,31],train_ev:[7,23,24,27,29,31],train_input_fn:3,train_model:15,train_op:7,train_param:27,trainable_vari:9,trainer:9,training_step:7,transform:[3,4,8,9,10,12,18,24,31],transform_for_bleu:7,transformer_decod:8,transformer_encod:8,transformer_polici:9,transformerdatalay:3,translat:[3,7,13,14,22],transpos:14,treat:[7,9,27],tri:[5,31],trick:13,trie:[4,23,29],true_batch_s:13,true_siz:[7,15],tupl:[1,2,7,9,13,14],tutori:[27,28],twice:3,two:[1,3,13,14,15,16,23,30],txt:[24,29],type:[0,1,2,3,4,5,6,7,11,12,13,14,30],typeerror:[9,13],typic:[0,4,5,6,30],ubuntu:29,ultim:5,unbatch:3,unchang:[13,14],under:9,underflow:[13,30],undergo:1,underli:[2,30],underlin:3,understand:27,unescap:3,uni:5,unicod:3,unidir_rnn_encoder_with_emb:5,unidirect:31,unidirectionalrnnencoderwithembed:5,uniqu:14,unit:[4,5,12,13,31],unittest:29,unk_id:3,unknown:1,unspecifi:13,upcom:15,upd_dict:15,updat:[3,7,9,13,27,30],update_op:9,use:[0,1,3,4,5,6,7,9,13,16,18,23,24,27,29,30,31],use_horovod:[7,16,27,31],use_language_model:[4,29,31],use_new_attent:13,use_staircase_decai:9,use_swap_memori:5,used:[0,1,3,4,5,6,7,9,12,13,14,15,24,27,30,31],useful:[7,27,28],user:[13,30],uses:[6,12,13,23,24,30],using:[1,2,3,5,7,9,13,15,16,18,24,28,29,30,31],usual:[4,13,23,29,30],util:[1,7,8,10,30],utter:23,v100:30,valid:[0,1,3,4,5,7,9,13,15,27,31],valid_word_count_weight:4,valu:[1,3,4,5,7,9,12,13,14,15,27,30],value_channel:14,valueerror:[1,9,13],var_list:9,var_scope_nam:12,variabl:[3,4,5,6,7,9,12,13,14,15,27,30],variable_norm:[7,27],varianc:30,variant:5,varieti:30,variou:[4,5,9,27,28],vector:[1,12,13],venkatesh:30,verbos:[5,15],veri:[13,24],versa:13,version:[3,4,13,29,30],vgg:1,via:13,vice:13,view:[3,24],visual:[7,9],vocab:[0,3,7,15],vocab_fil:[2,3],vocab_s:[4,14],vocabulari:[0,2,3,4,5,6,15,24],volta:[30,31],w2l_encod:8,w2l_large_8gpu:31,wai:[3,7,29,30],want:[5,23,24,27,29],warm:[9,31],warmup_step:9,wav2lett:[5,31],wave2lett:5,wave2letterencod:5,wave:2,wavelength:14,weight:[4,5,12,13,14,15,30],weiss:13,well:[4,5,7,24],wer:31,were:[5,13,24],what:25,when:[0,1,3,5,7,9,13,14,15,16,27,29,30,31],whenev:[27,30],where:[1,3,7,9,13,14,18,24,27,30],whether:[0,1,3,4,5,6,7,9,12,13,14,27],which:[0,1,3,4,5,6,7,9,13,14,15,18,24,27,30,31],whl:29,whole:[7,27],whose:13,width:[1,4,5,12],width_in:5,window:2,window_s:2,window_strid:2,within:5,without:[1,4,5,14,27,30,31],wmt:24,word:[0,3,4,7,31],word_count_weight:4,work:[7,24,27,29,31],worker:[3,6,7,15],worker_id:[0,1,2,3,7,18],workshop:13,wors:29,worst:[13,14],worth:30,wrap:[3,4,13,15,30],wrapper:[1,4,5,6,13,14,30],write:[15,27],wrong:13,xiangyu:
5,xmax:1,xmin:1,ymax:1,ymin:1,yoshua:13,you:[0,1,3,5,7,9,13,14,15,16,18,23,24,27,28,29,30,31],your:[0,1,3,18,22,24,27,29],yourself:30,zero:[7,13,14],zero_st:13,zhang:5},titles:["data","image2label","speech2text","text2text","decoders","encoders","losses","models","API documentation","optimizers","parts","cnns","convs2s","rnns","transformer","utils","Distributed training","Adding new models","Adding new data layer","Adding new decoder","Adding new encoder","Adding new loss","Getting started","Speech Recognition","Machine Translation","In-depth tutorials","Internal structure","Using existing models","OpenSeq2Seq","Installation instructions","Mixed precision training","Models and recipes"],titleterms:{"new":[17,18,19,20,21],Adding:[17,18,19,20,21],Using:27,add:29,adventur:24,api:8,attention_lay:14,attention_wn_lay:12,attention_wrapp:13,automat:30,automatic_loss_scal:9,beam_search:14,being:27,bleu:24,bpe:24,build:23,clean:24,cnn:11,cnn_encod:5,common:14,comput:24,config:27,conv_block:11,conv_wn_lay:12,convs2:12,convs2s_decod:4,convs2s_encod:5,creat:24,cross_entropy_loss:6,ctc:29,ctc_loss:6,data:[0,18],data_lay:0,dataset:23,decod:[4,19,29],depth:25,detail:30,distribut:16,document:8,download:29,ds2_encod:5,embedding_lay:14,enabl:30,encod:[5,20],encoder_decod:7,english:24,exist:27,fc_decod:4,feel:24,ffn_layer:14,ffn_wn_layer:12,flstm:13,func:15,gener:29,german:24,get:22,glstm:13,gnmt:13,hook:15,horovod:29,how:[23,27,29,30],image2label:[1,7],imagenet_preprocess:1,implement:30,infer:24,instal:29,instruct:29,intern:26,languag:[23,29],layer:18,librispeech:23,log:27,loss:[6,21,30],lr_polici:9,machin:[24,31],mix:30,model:[7,17,23,27,29,31],mp_wrapper:9,openseq2seq:[28,29],optim:[9,30],own:23,paramet:27,part:10,precis:30,prerequisit:30,recip:31,recognit:[23,31],regular:30,resnet_block:5,resnet_encod:5,revers:24,rnn:13,rnn_beam_search_decod:13,rnn_decod:4,rnn_encod:5,run:[24,27,29],scale:30,score:24,segment:24,sequenc:24,sequence_loss:6,slstm:13,speech2text:[2,7],speech:[23,29,31],speech_util:2,start:22,structur:26,t2t:3,task:24,tensorflow:29,test:29,text2text:[3,7],toi:24,token:3,train:[16,23,30],transform:14,transformer_decod:4,transformer_encod:5,translat:[24,31],tutori:25,util:[0,13,14,15],w2l_encod:5,what:27,your:23}}) \ No newline at end of file diff --git a/docs/sources/source/api-docs/decoders.rst b/docs/sources/source/api-docs/decoders.rst index 07c22c247..681f5d1e1 100644 --- a/docs/sources/source/api-docs/decoders.rst +++ b/docs/sources/source/api-docs/decoders.rst @@ -37,3 +37,11 @@ transformer\_decoders :members: :undoc-members: :show-inheritance: + +convs2s\_decoder +------------------------------------- + +.. automodule:: decoders.convs2s_decoder + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/sources/source/api-docs/encoders.rst b/docs/sources/source/api-docs/encoders.rst index 362287eb6..5c11d1f26 100644 --- a/docs/sources/source/api-docs/encoders.rst +++ b/docs/sources/source/api-docs/encoders.rst @@ -22,6 +22,14 @@ ds2\_encoder :undoc-members: :show-inheritance: +w2l\_encoder +---------------------------- + +.. automodule:: encoders.w2l_encoder + :members: + :undoc-members: + :show-inheritance: + rnn\_encoders ----------------------------- @@ -38,6 +46,14 @@ transformer\_encoders :undoc-members: :show-inheritance: +convs2s\_encoder +------------------------------------- + +.. 
automodule:: encoders.convs2s_encoder + :members: + :undoc-members: + :show-inheritance: + resnet\_encoder ---------------------------------- @@ -53,3 +69,12 @@ resnet\_blocks :members: :undoc-members: :show-inheritance: + + +cnn\_encoder +-------------------------------- + +.. automodule:: encoders.cnn_encoder + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/sources/source/api-docs/parts.cnns.rst b/docs/sources/source/api-docs/parts.cnns.rst new file mode 100644 index 000000000..631cb86c1 --- /dev/null +++ b/docs/sources/source/api-docs/parts.cnns.rst @@ -0,0 +1,15 @@ +cnns +======================================= + +.. automodule:: parts.cnns + :members: + :undoc-members: + :show-inheritance: + +conv\_blocks +------------------------------------------------------- + +.. automodule:: parts.cnns.conv_blocks + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/sources/source/api-docs/parts.convs2s.rst b/docs/sources/source/api-docs/parts.convs2s.rst new file mode 100644 index 000000000..226652c72 --- /dev/null +++ b/docs/sources/source/api-docs/parts.convs2s.rst @@ -0,0 +1,31 @@ +convs2s +======================================= + +.. automodule:: parts.convs2s + :members: + :undoc-members: + :show-inheritance: + +attention\_wn\_layer +------------------------------------------------------- + +.. automodule:: parts.convs2s.attention_wn_layer + :members: + :undoc-members: + :show-inheritance: + +conv\_wn\_layer +------------------------------------------------------- + +.. automodule:: parts.convs2s.conv_wn_layer + :members: + :undoc-members: + :show-inheritance: + +ffn\_wn\_layer +------------------------------------------------------- + +.. automodule:: parts.convs2s.ffn_wn_layer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/sources/source/api-docs/parts.rst b/docs/sources/source/api-docs/parts.rst index 3f85cb82a..6a57d9287 100644 --- a/docs/sources/source/api-docs/parts.rst +++ b/docs/sources/source/api-docs/parts.rst @@ -10,3 +10,5 @@ parts parts.rnns parts.transformer + parts.convs2s + parts.cnns \ No newline at end of file diff --git a/docs/sources/source/api-docs/parts.transformer.rst b/docs/sources/source/api-docs/parts.transformer.rst index 7dab39e8a..8fa9237fd 100644 --- a/docs/sources/source/api-docs/parts.transformer.rst +++ b/docs/sources/source/api-docs/parts.transformer.rst @@ -22,14 +22,6 @@ beam\_search :undoc-members: :show-inheritance: -beam\_search\_test ---------------------------------------------------------- - -.. automodule:: parts.transformer.beam_search_test - :members: - :undoc-members: - :show-inheritance: - common --------------------------------------------- diff --git a/docs/sources/source/installation-instructions.rst b/docs/sources/source/installation-instructions.rst index ba09bbccc..0e4c7d110 100644 --- a/docs/sources/source/installation-instructions.rst +++ b/docs/sources/source/installation-instructions.rst @@ -32,7 +32,7 @@ run unittests:: python -m unittest discover -s open_seq2seq -p '*_test.py' -It might take up to 10 minutes. You should see a lot of output, but no errors +It might take up to 30 minutes. You should see a lot of output, but no errors in the end. .. _installation_speech: diff --git a/docs/sources/source/models-and-recipes.rst b/docs/sources/source/models-and-recipes.rst index da71d6e6a..70de3f5ce 100644 --- a/docs/sources/source/models-and-recipes.rst +++ b/docs/sources/source/models-and-recipes.rst @@ -3,18 +3,17 @@ Models and recipes ================== -.. 
This section will contain information about different models that OpenSeq2Seq -.. supports, exact config parameters to train them, final training/validation/test -.. metrics and links to checkpoints (tensorboards also?) of trained models. .. note:: Currently OpenSeq2Seq has model implementations for machine translation and - automatic speech recognition. All models work both in float32 and mixed precision. - We recommend you use :ref:`mixed precision training ` when training on Volta GPUs. + automatic speech recognition. + All models work both in float32 and mixed precision. + We recommend you use :ref:`mixed precision training ` + when training on Volta GPUs. -To train models you can use the following -commands (don't forget to substitute valid config_file path there and number of GPUs if using Horovod). +To train models you can use the following commands (don't forget to substitute +valid config_file path there and number of GPUs if using Horovod). With Horovod (highly recommended when using multiple GPUs):: @@ -29,6 +28,16 @@ The description of implemented models is available in the next sections: Machine translation ------------------- +The table below contains description and results of +machine translation models available in OpenSeq2Seq. +Currently, we have GNMT-based model, Transformer-based models and +ConvS2S-based models. + +We measure BLEU score on newstest2014.tok.de file using ``multi-bleu.perl`` script from Mosses. +For more details about model descriptions and training setup, +have a look at the `configuration files `_. + + .. list-table:: :widths: 1 1 1 1 1 :header-rows: 1 @@ -38,72 +47,87 @@ Machine translation - Training setup and additional comments - Short description of the model - Checkpoint - * - `en-de-nmt-small.py `_ + * - `en-de-nmt-small.py `_ - 20.23 - This model should train on a single GPU such as 1080Ti. It is trained using Adam optimizer. - RNN-based. Bi-directional encoder with 2 layers and. GNMT-like decoder with 2 layers and attention. Uses LSTM cells of size 512. - `link `_ - * - `en-de-gnmt-like-4GPUs.py `_ + * - `en-de-gnmt-like-4GPUs.py `_ - 23.89 - This model was trained on 4 GPUs with Adam optimizer and learning rate decay. - RNN-based. This is GNMT-like model which tries to match the one described in https://arxiv.org/abs/1609.08144 as close as possible. - `link `_ - * - `transformer-big.py `_ + * - `transformer-big.py `_ - 26.17 - This model was trained on 4 GPUs with Adam optimizer and learning rate decay. - Transformer "big" model. This model does not have any RNN layers - `link `_ + * - `en-de-convs2s.py `_ + - xx.xx + - This model was trained on 4 GPUs with Adam optimizer, learning rate decay and warm-up. + - This is an implementation of the ConvS2S model proposed in https://arxiv.org/abs/1705.03122. + - Coming soon. -GNMT model description can be found `here `_. -Transformer model description can be found `here `_. -We measure BLEU score on newstest2014.tok.de file using ``multi-bleu.perl`` script from Mosses. +GNMT model description: https://arxiv.org/abs/1609.08144. + +Transformer model description: https://arxiv.org/abs/1706.03762. + +ConvS2S model description: https://arxiv.org/abs/1705.03122. Speech recognition ------------------ -Deep Speech 2 based models -~~~~~~~~~~~~~~~~~~~~~~~~~~ -Original Deep Speech 2 model description: https://arxiv.org/abs/1512.02595. The table below contains description and results of -Deep Speech 2 based models available in OpenSeq2Seq. +speech recognition models available in OpenSeq2Seq. 
+Currently, we have DeepSpeech2-based models and Wav2Letter-based models. -WER-512 and WER-2048 is word error rate obtained with beam width of 512 and 2048 -correspondingly on a dev-clean subset of LibriSpeech. For beam width of 2048 we also used ``batch_size_per_gpu = 1`` +WER is the word error rate obtained on a dev-clean subset of LibriSpeech using +greedy decoder (``decoder_params/use_language_model = False``). +For the final evaluation we used ``batch_size_per_gpu = 1`` to eliminate the effect of `cudnn padding issue `_. For more details about model descriptions and training setup, -have a look at the `configuration files `_. +have a look at the `configuration files `_. .. list-table:: - :widths: 1 1 1 1 1 1 + :widths: 1 1 1 1 1 :header-rows: 1 * - Config file - - WER-512 - - WER-2048 + - WER - Training setup and additional comments - Short description of the model - Checkpoint * - `ds2_large_8gpus.py `_ - - 4.90% - - 4.59% + - 9.28% - This model was trained for 50 epochs using SGD with Momentum and LARC on the full LibriSpeech in a few days using Horovod on eight GPUs. - This model has 2 convolutional layers and 5 bidirectional GRU layers with 800 units. - - `link `_ + - `link `_ * - `ds2_medium_4gpus.py `_ - - 6.12% - - 5.49% + - 22.60% - This model was trained for 50 epochs using Adam on the full LibriSpeech in a few days using Horovod on four GPUs. - This model has 3 convolutional layers and 3 unidirectional GRU layers with 1024 units. - `link `_ * - `ds2_small_1gpu.py `_ - - 11.77% - - 9.32% + - 39.08% - This model was trained for 12 epochs using Adam on a "clean" subset of LibriSpeech in less than a day using a single GPU. - This model has 2 convolutional layers and 2 bidirectional GRU layers with 512 units. - `link `_ + * - `w2l_large_8gpus.py `_ + - 15.44% + - This model was trained for 18 epochs (with early stopping based on + validation loss) using SGD with Momentum and LARC on + the full LibriSpeech in a few days on eight GPUs. + - The model has 19 convolutional layers (200--1000 units, 7--21 kernel size). + We use batch norm between all layers. + - `link `_ + + +Deep Speech 2 model description: https://arxiv.org/abs/1512.02595. + +Wav2Letter model description: https://arxiv.org/abs/1609.03193, https://arxiv.org/abs/1712.09444. 
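As a concrete illustration of the evaluation setup described above, the fragment below is a minimal sketch (not part of any shipped config; the variable name is only illustrative) of how the greedy-decoding WER measurement could be expressed; the key names mirror the speech2text configs added later in this diff.

# Sketch of the evaluation overrides described above: greedy CTC decoding
# (no language model) and a batch size of one to avoid the cuDNN padding issue.
eval_overrides = {
  "batch_size_per_gpu": 1,
  "decoder_params": {
    "use_language_model": False,
  },
}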
diff --git a/example_configs/image2label/alexnet_owt.py b/example_configs/image2label/alexnet_owt.py new file mode 100644 index 000000000..3dce00b63 --- /dev/null +++ b/example_configs/image2label/alexnet_owt.py @@ -0,0 +1,104 @@ +from open_seq2seq.models import Image2Label +from open_seq2seq.encoders.cnn_encoder import CNNEncoder +from open_seq2seq.decoders import FullyConnectedDecoder +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.data import ImagenetDataLayer +from open_seq2seq.optimizers.lr_policies import poly_decay +import tensorflow as tf + + +base_model = Image2Label + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 120, + + "num_gpus": 4, + "batch_size_per_gpu": 256, + "dtype": tf.float32, + + "save_summaries_steps": 2000, + "print_loss_steps": 100, + "print_samples_steps": 2000, + "eval_steps": 5000, + "save_checkpoint_steps": 5000, + "logdir": "experiments/alexnet-imagenet", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": poly_decay, + "lr_policy_params": { + "learning_rate": 0.04, + "power": 1.0, + }, + + "initializer": tf.variance_scaling_initializer, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0005, + }, + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + "encoder": CNNEncoder, + "encoder_params": { + 'data_format': 'channels_first', + 'cnn_layers': [ + (tf.layers.conv2d, { + 'filters': 64, 'kernel_size': (11, 11), + 'strides': (4, 4), 'padding': 'VALID', + 'activation': tf.nn.relu, + }), + (tf.layers.max_pooling2d, { + 'pool_size': (3, 3), 'strides': (2, 2), + }), + (tf.layers.conv2d, { + 'filters': 192, 'kernel_size': (5, 5), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.max_pooling2d, { + 'pool_size': (3, 3), 'strides': (2, 2), + }), + (tf.layers.conv2d, { + 'filters': 384, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 256, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 256, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.max_pooling2d, { + 'pool_size': (3, 3), 'strides': (2, 2), + }), + ], + 'fc_layers': [ + (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), + (tf.layers.dropout, {'rate': 0.5}), + (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), + (tf.layers.dropout, {'rate': 0.5}), + ], + }, + + "decoder": FullyConnectedDecoder, + "decoder_params": { + "output_dim": 1000, + }, + "loss": CrossEntropyLoss, + "data_layer": ImagenetDataLayer, + "data_layer_params": { + "data_dir": "data/tf-imagenet", + "image_size": 227, + "num_classes": 1000, + }, +} \ No newline at end of file diff --git a/example_configs/image2label/cifar-nv.py b/example_configs/image2label/cifar-nv.py new file mode 100644 index 000000000..c42eb1e18 --- /dev/null +++ b/example_configs/image2label/cifar-nv.py @@ -0,0 +1,115 @@ +from open_seq2seq.models import Image2Label +from open_seq2seq.encoders.cnn_encoder import CNNEncoder +from open_seq2seq.decoders import FullyConnectedDecoder +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.data.image2label.image2label import CifarDataLayer +from open_seq2seq.optimizers.lr_policies import poly_decay +import tensorflow as tf 
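# Illustrative sketch, assuming poly_decay implements the standard polynomial
# decay schedule: with "power": 1.0, as in this config and in alexnet_owt.py
# above, the learning rate falls roughly linearly from its initial value
# towards zero over the scheduled number of training steps.
def _poly_decay_sketch(initial_lr, step, total_steps, power=1.0, end_lr=0.0):
  """Toy polynomial learning-rate decay: lr at `step` out of `total_steps`."""
  frac = min(float(step) / float(total_steps), 1.0)
  return (initial_lr - end_lr) * (1.0 - frac)**power + end_lr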
+ + +base_model = Image2Label + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 200, + + "num_gpus": 1, + "batch_size_per_gpu": 32, + "dtype": tf.float32, + + "save_summaries_steps": 2000, + "print_loss_steps": 100, + "print_samples_steps": 2000, + "eval_steps": 5000, + "save_checkpoint_steps": 5000, + "logdir": "experiments/test-cifar", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": poly_decay, + "lr_policy_params": { + "learning_rate": 0.001, + "power": 1.0, + }, + + "initializer": tf.variance_scaling_initializer, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0002, + }, + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + "encoder": CNNEncoder, + "encoder_params": { + 'data_format': 'channels_first', + 'cnn_layers': [ + # block 1 + (tf.layers.conv2d, { + 'filters': 128, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 128, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 128, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': None, 'use_bias': False, + }), + (tf.layers.batch_normalization, {'momentum': 0.9, 'epsilon': 0.0001}), + (tf.nn.relu, {}), + (tf.layers.max_pooling2d, { + 'pool_size': 3, 'strides': 2, 'padding': 'SAME', + }), + # block 2 + (tf.layers.conv2d, { + 'filters': 256, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 256, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 256, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': None, 'use_bias': False, + }), + (tf.layers.batch_normalization, {'momentum': 0.9, 'epsilon': 0.0001}), + (tf.nn.relu, {}), + (tf.layers.max_pooling2d, { + 'pool_size': 3, 'strides': 2, 'padding': 'SAME', + }), + # block 3 + (tf.layers.conv2d, { + 'filters': 320, 'kernel_size': (3, 3), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + (tf.layers.conv2d, { + 'filters': 320, 'kernel_size': (1, 1), + 'strides': (1, 1), 'padding': 'SAME', + 'activation': tf.nn.relu, + }), + ], + }, + + "decoder": FullyConnectedDecoder, + "decoder_params": { + "output_dim": 10, + }, + "loss": CrossEntropyLoss, + "data_layer": CifarDataLayer, + "data_layer_params": { + "data_dir": "data/cifar-10-batches-bin", + }, +} diff --git a/example_configs/image2label/resnet-50-v2-mp.py b/example_configs/image2label/resnet-50-v2-mp.py new file mode 100644 index 000000000..3666f9b7d --- /dev/null +++ b/example_configs/image2label/resnet-50-v2-mp.py @@ -0,0 +1,64 @@ +from open_seq2seq.models import Image2Label +from open_seq2seq.encoders import ResNetEncoder +from open_seq2seq.decoders import FullyConnectedDecoder +from open_seq2seq.losses import CrossEntropyLoss +from open_seq2seq.data import ImagenetDataLayer +from open_seq2seq.optimizers.lr_policies import piecewise_constant +import tensorflow as tf + + +base_model = Image2Label + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 100, + + "num_gpus": 8, + "batch_size_per_gpu": 32, + "dtype": "mixed", + "loss_scaling": "Backoff", + + "save_summaries_steps": 2000, + "print_loss_steps": 100, + 
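  # The *_steps values in this block only control logging and evaluation
  # frequency (in training steps): how often the loss and sample predictions
  # are printed, how often evaluation runs, and how often summaries and
  # checkpoints are written. They do not change the optimization itself.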
"print_samples_steps": 2000, + "eval_steps": 5000, + "save_checkpoint_steps": 5000, + "logdir": "experiments/resnet50-imagenet", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": piecewise_constant, + "lr_policy_params": { + "learning_rate": 0.1, + "boundaries": [30, 60, 80, 90], + "decay_rates": [0.1, 0.01, 0.001, 1e-4], + }, + + "initializer": tf.variance_scaling_initializer, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0001, + }, + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + "encoder": ResNetEncoder, + "encoder_params": { + 'resnet_size': 50, + "regularize_bn": False, + }, + "decoder": FullyConnectedDecoder, + "decoder_params": { + "output_dim": 1000, + }, + "loss": CrossEntropyLoss, + "data_layer": ImagenetDataLayer, + "data_layer_params": { + "data_dir": "data/tf-imagenet", + "image_size": 224, + "num_classes": 1000, + }, +} diff --git a/example_configs/image2label/resnet-50-v2.py b/example_configs/image2label/resnet-50-v2.py index 856e44f21..43981118c 100644 --- a/example_configs/image2label/resnet-50-v2.py +++ b/example_configs/image2label/resnet-50-v2.py @@ -51,11 +51,12 @@ }, "decoder": FullyConnectedDecoder, "decoder_params": { - "output_dim": 1001, + "output_dim": 1000, }, "loss": CrossEntropyLoss, "data_layer": ImagenetDataLayer, "data_layer_params": { "data_dir": "data/tf-imagenet", + "image_size": 224, }, } diff --git a/example_configs/speech2text/w2l_large_8gpus.py b/example_configs/speech2text/w2l_large_8gpus.py new file mode 100644 index 000000000..e093d06b8 --- /dev/null +++ b/example_configs/speech2text/w2l_large_8gpus.py @@ -0,0 +1,161 @@ +import tensorflow as tf +from open_seq2seq.models import Speech2Text +from open_seq2seq.encoders import Wave2LetterEncoder +from open_seq2seq.decoders import FullyConnectedCTCDecoder +from open_seq2seq.data import Speech2TextDataLayer +from open_seq2seq.losses import CTCLoss +from open_seq2seq.optimizers.lr_policies import poly_decay + + +base_model = Speech2Text + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 50, + + "num_gpus": 8, + "batch_size_per_gpu": 32, + + "save_summaries_steps": 100, + "print_loss_steps": 10, + "print_samples_steps": 2200, + "eval_steps": 2200, + "save_checkpoint_steps": 1000, + "logdir": "w2l_log_folder", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": poly_decay, + "lr_policy_params": { + "learning_rate": 0.001, + "power": 0.5, + }, + "larc_params": { + "larc_eta": 0.001, + }, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0005 + }, + + #"max_grad_norm": 15.0, + "dtype": tf.float32, + + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + "encoder": Wave2LetterEncoder, + "encoder_params": { + "convnet_layers": [ + { + "type": "conv1d", "repeat" : 5, + "kernel_size": [7], "stride": [1], + "num_channels": 200, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [11], "stride": [1], + "num_channels": 400, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [15], "stride": [1], + "num_channels": 400, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [19], "stride": [1], + "num_channels": 400, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" 
: 3, + "kernel_size": [23], "stride": [1], + "num_channels": 600, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 1, + "kernel_size": [29], "stride": [1], + "num_channels": 800, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 1, + "kernel_size": [1], "stride": [1], + "num_channels": 1000, "padding": "SAME" + }, + ], + + "dropout_keep_prob": 0.8, + + "initializer": tf.contrib.layers.xavier_initializer, + "initializer_params": { + 'uniform': False, + }, + "normalization" : "batch_norm", + "activation_fn" : lambda x: tf.minimum(tf.nn.relu(x), 20.0), + "data_format": "channels_last", + }, + + "decoder": FullyConnectedCTCDecoder, + "decoder_params": { + "initializer": tf.contrib.layers.xavier_initializer, + "use_language_model": True, + + # params for decoding the sequence with language model + "beam_width": 512, + "lm_weight": 2.0, + "word_count_weight": 1.5, + "valid_word_count_weight": 2.5, + + "decoder_library_path": "ctc_decoder_with_lm/libctc_decoder_with_kenlm.so", + "lm_binary_path": "language_model/lm.binary", + "lm_trie_path": "language_model/trie", + "alphabet_config_path": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + }, + "loss": CTCLoss, + "loss_params": {}, +} + +train_params = { + "data_layer": Speech2TextDataLayer, + "data_layer_params": { + "num_audio_features": 40, + "input_type": "logfbank", + "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + "dataset_files": [ + "data/librispeech/librivox-train-clean-100.csv", + "data/librispeech/librivox-train-clean-360.csv", + "data/librispeech/librivox-train-other-500.csv", + ], + "shuffle": True, + }, +} + +eval_params = { + "data_layer": Speech2TextDataLayer, + "data_layer_params": { + "num_audio_features": 40, + "input_type": "logfbank", + "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + "dataset_files": [ + "data/librispeech/librivox-dev-clean.csv", + ], + "shuffle": False, + }, +} + +infer_params = { + "data_layer": Speech2TextDataLayer, + "data_layer_params": { + "num_audio_features": 40, + "input_type": "logfbank", + "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + "dataset_files": [ + "data/librispeech/librivox-test-clean.csv", + ], + "shuffle": False, + }, +} diff --git a/example_configs/speech2text/w2l_large_8gpus_mp.py b/example_configs/speech2text/w2l_large_8gpus_mp.py new file mode 100644 index 000000000..b3514f452 --- /dev/null +++ b/example_configs/speech2text/w2l_large_8gpus_mp.py @@ -0,0 +1,162 @@ +import tensorflow as tf +from open_seq2seq.models import Speech2Text +from open_seq2seq.encoders import Wave2LetterEncoder +from open_seq2seq.decoders import FullyConnectedCTCDecoder +from open_seq2seq.data import Speech2TextDataLayer +from open_seq2seq.losses import CTCLoss +from open_seq2seq.optimizers.lr_policies import poly_decay + + +base_model = Speech2Text + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 50, + + "num_gpus": 8, + "batch_size_per_gpu": 32, + + "save_summaries_steps": 100, + "print_loss_steps": 10, + "print_samples_steps": 2200, + "eval_steps": 2200, + "save_checkpoint_steps": 1000, + "logdir": "w2l_log_folder", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": poly_decay, + "lr_policy_params": { + "learning_rate": 0.001, + "power": 0.5, + }, + "larc_params": { + "larc_eta": 0.001, + }, + + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 0.0005 + }, + + #"max_grad_norm": 15.0, + "dtype": "mixed", + 
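  # Rough summary of the two settings around this comment (see the repository's
  # mixed-precision docs for the authoritative description): "mixed" dtype keeps
  # float32 master copies of the weights while most math runs in float16, and
  # "Backoff" loss scaling (next entry) scales the loss before backpropagation,
  # lowers the scale whenever the gradients overflow, and slowly raises it
  # again after a stretch of stable steps.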
"loss_scaling": "Backoff", + + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + "encoder": Wave2LetterEncoder, + "encoder_params": { + "convnet_layers": [ + { + "type": "conv1d", "repeat" : 5, + "kernel_size": [7], "stride": [1], + "num_channels": 200, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [11], "stride": [1], + "num_channels": 400, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [15], "stride": [1], + "num_channels": 400, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [19], "stride": [1], + "num_channels": 400, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [23], "stride": [1], + "num_channels": 600, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 1, + "kernel_size": [29], "stride": [1], + "num_channels": 800, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 1, + "kernel_size": [1], "stride": [1], + "num_channels": 1000, "padding": "SAME" + }, + ], + + "dropout_keep_prob": 0.8, + + "initializer": tf.contrib.layers.xavier_initializer, + "initializer_params": { + 'uniform': False, + }, + "normalization" : "batch_norm", + "activation_fn" : lambda x: tf.minimum(tf.nn.relu(x), 20.0), + "data_format": "channels_last", + }, + + "decoder": FullyConnectedCTCDecoder, + "decoder_params": { + "initializer": tf.contrib.layers.xavier_initializer, + "use_language_model": True, + + # params for decoding the sequence with language model + "beam_width": 512, + "lm_weight": 2.0, + "word_count_weight": 1.5, + "valid_word_count_weight": 2.5, + + "decoder_library_path": "ctc_decoder_with_lm/libctc_decoder_with_kenlm.so", + "lm_binary_path": "language_model/lm.binary", + "lm_trie_path": "language_model/trie", + "alphabet_config_path": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + }, + "loss": CTCLoss, + "loss_params": {}, +} + +train_params = { + "data_layer": Speech2TextDataLayer, + "data_layer_params": { + "num_audio_features": 40, + "input_type": "logfbank", + "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + "dataset_files": [ + "data/librispeech/librivox-train-clean-100.csv", + "data/librispeech/librivox-train-clean-360.csv", + "data/librispeech/librivox-train-other-500.csv", + ], + "shuffle": True, + }, +} + +eval_params = { + "data_layer": Speech2TextDataLayer, + "data_layer_params": { + "num_audio_features": 40, + "input_type": "logfbank", + "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + "dataset_files": [ + "data/librispeech/librivox-dev-clean.csv", + ], + "shuffle": False, + }, +} + +infer_params = { + "data_layer": Speech2TextDataLayer, + "data_layer_params": { + "num_audio_features": 40, + "input_type": "logfbank", + "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + "dataset_files": [ + "data/librispeech/librivox-test-clean.csv", + ], + "shuffle": False, + }, +} diff --git a/example_configs/text2text/en-de/en-de-convs2s.py b/example_configs/text2text/en-de/en-de-convs2s.py new file mode 100644 index 000000000..60ff79ec5 --- /dev/null +++ b/example_configs/text2text/en-de/en-de-convs2s.py @@ -0,0 +1,178 @@ +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals + +import tensorflow as tf + +from open_seq2seq.models import Text2Text +from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer + +from 
open_seq2seq.data.text2text.text2text import SpecialTextTokens +from open_seq2seq.data.text2text.tokenizer import EOS_ID + +from open_seq2seq.encoders import ConvS2SEncoder +from open_seq2seq.decoders import ConvS2SDecoder + +from open_seq2seq.losses import BasicSequenceLoss, PaddedCrossEntropyLossWithSmoothing + +from open_seq2seq.optimizers.lr_policies import transformer_policy + +# REPLACE THIS TO THE PATH WITH YOUR WMT DATA +data_root = "./wmt16_en_dt/" + +base_model = Text2Text +num_layers = 15 +d_model = 768 +max_length = 128 + +batch_size = 64 +num_gpus = 4 +epoch_num = 30 + +base_params = { + "use_horovod": False, + "num_gpus": num_gpus, + # set max_step to achieve the given epoch_num, 4.5M is the size of the dataset + "max_steps": int((4500000 / (num_gpus * batch_size)) * epoch_num), + "batch_size_per_gpu": batch_size, + "save_summaries_steps": 50, + "print_loss_steps": 50, + "print_samples_steps": 50, + "eval_steps": 4000, + "save_checkpoint_steps": 1000, + "logdir": "RealData-CC", + + + "optimizer": "Adam", + "optimizer_params": {}, + "lr_policy": transformer_policy, + "lr_policy_params": { + "learning_rate": 9, + "max_lr": 1e-3, + "warmup_steps": 4000, + "d_model": d_model, + }, + + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + + "max_grad_norm": 0.1, + #"dtype": tf.float32, + "dtype": "mixed", + "loss_scaling": "Backoff", + + "encoder": ConvS2SEncoder, + "encoder_params": { + "encoder_layers": num_layers, + + "src_emb_size": d_model, + "pad_embeddings_2_eight": False, + "att_layer_num": num_layers, + + # original paper + #"conv_nchannels_kwidth": [(512, 3)]*10 + [(768, 3)]*3 + [(2048, 1)]*2, + + # fairseq config + "conv_nchannels_kwidth": [(512, 3)]*9 + [(1024, 3)]*4 + [(2048, 1)]*2, + + "embedding_dropout_keep_prob": 0.8, + "hidden_dropout_keep_prob": 0.8, + + "max_input_length": max_length, + + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + }, + + + "decoder": ConvS2SDecoder, + "decoder_params": { + "decoder_layers": num_layers, + + "shared_embed": True, + "tgt_emb_size": d_model, + "pad_embeddings_2_eight": False, + "out_emb_size": d_model, + + # original paper + #"conv_nchannels_kwidth": [(512, 3)]*10 + [(768, 3)]*3 + [(2048, 1)]*2, + + # fairseq config + "conv_nchannels_kwidth": [(512, 3)]*9 + [(1024, 3)]*4 + [(2048, 1)]*2, + + "embedding_dropout_keep_prob": 0.8, + "hidden_dropout_keep_prob": 0.8, + "out_dropout_keep_prob": 0.8, + + "max_input_length": max_length, + "extra_decode_length": 56, + "beam_size": 5, + "alpha": 0.6, + + "EOS_ID": EOS_ID, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + + }, + + "loss": BasicSequenceLoss, + "loss_params": { + "offset_target_by_one": True, + "average_across_timestep": True, + "do_mask": True + } + +} + +train_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": False, + "src_vocab_file": data_root + "vocab.bpe.32000", + "tgt_vocab_file": data_root + "vocab.bpe.32000", + "source_file": data_root+"train.tok.clean.bpe.32000.en", + "target_file": data_root+"train.tok.clean.bpe.32000.de", + "delimiter": " ", + "shuffle": False, + "repeat": True, + "map_parallel_calls": 8, + "prefetch_buffer_size": 4, + "max_length": max_length, + }, +} + +eval_params = { + "batch_size_per_gpu": 64, + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": False, + "src_vocab_file": 
data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"newstest2014.tok.bpe.32000.en", + "target_file": data_root+"newstest2014.tok.bpe.32000.de", + "delimiter": " ", + "shuffle": False, + "repeat": True, + "max_length": 64, + }, + +} + +infer_params = { + "batch_size_per_gpu": 1, + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "pad_vocab_to_eight": False, + "src_vocab_file": data_root+"vocab.bpe.32000", + "tgt_vocab_file": data_root+"vocab.bpe.32000", + "source_file": data_root+"newstest2013.tok.bpe.32000.en", + # this is intentional to be sure that model is not using target + "target_file": data_root+"newstest2013.tok.bpe.32000.en", + "delimiter": " ", + "shuffle": False, + "repeat": False, + "max_length": max_length, + }, +} + + diff --git a/example_configs/text2text/en-de-gnmt-like-4GPUs.py b/example_configs/text2text/en-de/en-de-gnmt-like-4GPUs.py similarity index 92% rename from example_configs/text2text/en-de-gnmt-like-4GPUs.py rename to example_configs/text2text/en-de/en-de-gnmt-like-4GPUs.py index d6a904dd3..585bf4389 100644 --- a/example_configs/text2text/en-de-gnmt-like-4GPUs.py +++ b/example_configs/text2text/en-de/en-de-gnmt-like-4GPUs.py @@ -50,8 +50,11 @@ "minval": -0.1, "maxval": 0.1, }, - "encoder_cell_type": "lstm", - "encoder_cell_units": 1024, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 1024, + "forget_bias": 1.0, + }, "encoder_layers": 7, "encoder_dp_input_keep_prob": 0.8, "encoder_dp_output_keep_prob": 1.0, @@ -66,8 +69,11 @@ "minval": -0.1, "maxval": 0.1, }, - "decoder_cell_type": "lstm", - "decoder_cell_units": 1024, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 1024, + "forget_bias": 1.0, + }, "decoder_layers": 8, "decoder_dp_input_keep_prob": 0.8, "decoder_dp_output_keep_prob": 1.0, @@ -128,8 +134,11 @@ "decoder_params": { "beam_width": 10, "length_penalty": 1.0, - "decoder_cell_type": "lstm", - "decoder_cell_units": 1024, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 1024, + "forget_bias": 1.0, + }, "decoder_layers": 8, "decoder_dp_input_keep_prob": 0.8, "decoder_dp_output_keep_prob": 1.0, @@ -155,4 +164,4 @@ "repeat": False, "max_length": 512, }, -} +} \ No newline at end of file diff --git a/example_configs/text2text/en-de-nmt-small.py b/example_configs/text2text/en-de/en-de-nmt-small.py similarity index 91% rename from example_configs/text2text/en-de-nmt-small.py rename to example_configs/text2text/en-de/en-de-nmt-small.py index 0e01d82b9..16ae97895 100644 --- a/example_configs/text2text/en-de-nmt-small.py +++ b/example_configs/text2text/en-de/en-de-nmt-small.py @@ -42,8 +42,11 @@ "encoder": BidirectionalRNNEncoderWithEmbedding, "encoder_params": { "initializer": tf.glorot_uniform_initializer, - "encoder_cell_type": "lstm", - "encoder_cell_units": 512, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 512, + "forget_bias": 1.0, + }, "encoder_layers": 2, "encoder_dp_input_keep_prob": 0.8, "encoder_dp_output_keep_prob": 1.0, @@ -55,8 +58,11 @@ "decoder": RNNDecoderWithAttention, "decoder_params": { "initializer": tf.glorot_uniform_initializer, - "decoder_cell_type": "lstm", - "decoder_cell_units": 512, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 512, + "forget_bias": 1.0, + }, "decoder_layers": 2, "decoder_dp_input_keep_prob": 0.8, "decoder_dp_output_keep_prob": 1.0, @@ -116,8 +122,11 @@ "decoder_params": { "beam_width": 10, 
"length_penalty": 1.0, - "decoder_cell_type": "lstm", - "decoder_cell_units": 512, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 512, + "forget_bias": 1.0, + }, "decoder_layers": 2, "decoder_dp_input_keep_prob": 0.8, "decoder_dp_output_keep_prob": 1.0, @@ -142,4 +151,4 @@ "max_length": 256, "prefetch_buffer_size": 1, }, -} +} \ No newline at end of file diff --git a/example_configs/text2text/transformer-big.py b/example_configs/text2text/en-de/transformer-big.py similarity index 96% rename from example_configs/text2text/transformer-big.py rename to example_configs/text2text/en-de/transformer-big.py index fd819bca3..a1a56039c 100644 --- a/example_configs/text2text/transformer-big.py +++ b/example_configs/text2text/en-de/transformer-big.py @@ -18,7 +18,8 @@ d_model = 512 num_layers = 6 -data_root = "[REPLACE THIS TO THE PATH WITH YOUR WMT DATA]" +# REPLACE THIS TO THE PATH WITH YOUR WMT DATA +data_root = "./wmt16_en_dt/" base_params = { "use_horovod": False, @@ -31,9 +32,10 @@ "eval_steps": 4001, "save_checkpoint_steps": 4000, "logdir": "Transformer-FP32", - "dtype": tf.float32, - # "dtype": "mixed", - # "automatic_loss_scaling": "Backoff", + #"dtype": tf.float32, + "dtype": "mixed", + "loss_scaling": "Backoff", + "optimizer": tf.contrib.opt.LazyAdamOptimizer, "optimizer_params": { "beta1": 0.9, diff --git a/example_configs/text2text/toy-reversal/nmt-reversal-CC.py b/example_configs/text2text/toy-reversal/nmt-reversal-CC.py new file mode 100644 index 000000000..a74b94c9d --- /dev/null +++ b/example_configs/text2text/toy-reversal/nmt-reversal-CC.py @@ -0,0 +1,147 @@ +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals + +import tensorflow as tf + +from open_seq2seq.models import Text2Text + +from open_seq2seq.decoders import ConvS2SDecoder +from open_seq2seq.encoders import ConvS2SEncoder + +from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer +from open_seq2seq.losses import BasicSequenceLoss + +from open_seq2seq.data.text2text.text2text import SpecialTextTokens +from open_seq2seq.data.text2text.tokenizer import EOS_ID +from open_seq2seq.optimizers.lr_policies import fixed_lr + +""" +This configuration file describes fully convolutional model (ConvS2S) +on the toy task of reversing sequences +""" + +base_model = Text2Text +d_model = 128 +num_layers = 2 + +base_params = { + "use_horovod": False, + "num_gpus": 1, + "batch_size_per_gpu": 64, + "max_steps": 1000, + "save_summaries_steps": 10, + "print_loss_steps": 10, + "print_samples_steps": 20, + "eval_steps": 50, + "save_checkpoint_steps": 200, + + "logdir": "ReversalTask-CC", + + "optimizer": "Adam", + "optimizer_params": {"epsilon": 1e-9}, + "lr_policy": fixed_lr, + "lr_policy_params": { + 'learning_rate': 1e-3 + }, + + "max_grad_norm": 3.0, + "dtype": tf.float32, + + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + "encoder": ConvS2SEncoder, + "encoder_params": { + "encoder_layers": num_layers, + + "src_emb_size": d_model, + "embedding_dropout_keep_prob": 0.9, + "pad_embeddings_2_eight": False, + "att_layer_num": num_layers, + + "conv_nchannels_kwidth": [(d_model, 3)] * num_layers, + + "hidden_dropout_keep_prob": 0.9, + + "max_input_length": 100, + + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + }, + + "decoder": ConvS2SDecoder, + "decoder_params": { + "decoder_layers": num_layers, + + "shared_embed": True, + "tgt_emb_size": d_model, + 
"embedding_dropout_keep_prob": 0.9, + "pad_embeddings_2_eight": False, + + "conv_nchannels_kwidth": [(d_model, 3)] * num_layers, + + "hidden_dropout_keep_prob": 0.9, + "out_dropout_keep_prob": 0.9, + + "max_input_length": 120, + "extra_decode_length": 10, + "beam_size": 5, + "alpha": 0.6, + + "EOS_ID": EOS_ID, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + }, + + "loss": BasicSequenceLoss, + "loss_params": { + "offset_target_by_one": True, + "average_across_timestep": True, + "do_mask": True + } +} + +train_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "src_vocab_file": "toy_text_data/vocab/source.txt", + "tgt_vocab_file": "toy_text_data/vocab/target.txt", + "source_file": "toy_text_data/train/source.txt", + "target_file": "toy_text_data/train/target.txt", + "shuffle": True, + "repeat": True, + "max_length": 56, + "delimiter": " ", + }, +} + +eval_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "src_vocab_file": "toy_text_data/vocab/source.txt", + "tgt_vocab_file": "toy_text_data/vocab/target.txt", + "source_file": "toy_text_data/dev/source.txt", + "target_file": "toy_text_data/dev/target.txt", + "shuffle": False, + "repeat": True, + "max_length": 56, + "delimiter": " ", + }, +} + + +infer_params = { + "batch_size_per_gpu": 1, + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "src_vocab_file": "toy_text_data/vocab/source.txt", + "tgt_vocab_file": "toy_text_data/vocab/source.txt", + "source_file": "toy_text_data/test/source.txt", + # this is intentional to be sure model is not using ground truth + "target_file": "toy_text_data/test/source.txt", + "shuffle": False, + "repeat": False, + "max_length": 256, + "delimiter": " ", + }, +} diff --git a/example_configs/text2text/toy-reversal/nmt-reversal-CR.py b/example_configs/text2text/toy-reversal/nmt-reversal-CR.py new file mode 100644 index 000000000..cf6ab6e7b --- /dev/null +++ b/example_configs/text2text/toy-reversal/nmt-reversal-CR.py @@ -0,0 +1,160 @@ +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals + +import tensorflow as tf + +from open_seq2seq.models import Text2Text +from open_seq2seq.decoders import RNNDecoderWithAttention, BeamSearchRNNDecoderWithAttention +from open_seq2seq.encoders import ConvS2SEncoder + +from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer +from open_seq2seq.losses import BasicSequenceLoss + +from open_seq2seq.data.text2text.text2text import SpecialTextTokens +from open_seq2seq.optimizers.lr_policies import fixed_lr + +""" +This configuration file describes convolutional encoder and rnn decoder with attention +on the toy task of reversing sequences +""" + +base_model = Text2Text +d_model = 128 +num_layers = 2 + +base_params = { + "use_horovod": False, + "num_gpus": 1, + "batch_size_per_gpu": 64, + "max_steps": 1000, + "save_summaries_steps": 10, + "print_loss_steps": 10, + "print_samples_steps": 20, + "eval_steps": 50, + "save_checkpoint_steps": 200, + + "logdir": "ReversalTask-CR", + + "optimizer": "Adam", + "optimizer_params": {"epsilon": 1e-9}, + "lr_policy": fixed_lr, + "lr_policy_params": { + 'learning_rate': 1e-3 + }, + + "max_grad_norm": 3.0, + "dtype": tf.float32, + + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + "encoder": ConvS2SEncoder, + "encoder_params": { + 
"encoder_layers": num_layers, + + "src_emb_size": d_model, + "att_layer_num": num_layers, + "embedding_dropout_keep_prob": 0.9, + "pad_embeddings_2_eight": True, + + "hidden_dropout_keep_prob": 0.9, + + "conv_nchannels_kwidth": [(d_model, 3)] * num_layers, + + "max_input_length": 100, + + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + }, + + "decoder": RNNDecoderWithAttention, + "decoder_params": { + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": d_model, + }, + "decoder_layers": num_layers, + + "decoder_dp_input_keep_prob": 0.8, + "decoder_dp_output_keep_prob": 1.0, + "decoder_use_skip_connections": False, + + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + + "tgt_emb_size": d_model, + "attention_type": "luong", + "luong_scale": False, + "attention_layer_size": 128, + }, + + "loss": BasicSequenceLoss, + "loss_params": { + "offset_target_by_one": True, + "average_across_timestep": True, + "do_mask": True + } +} + +train_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "src_vocab_file": "toy_text_data/vocab/source.txt", + "tgt_vocab_file": "toy_text_data/vocab/target.txt", + "source_file": "toy_text_data/train/source.txt", + "target_file": "toy_text_data/train/target.txt", + "shuffle": True, + "repeat": True, + "max_length": 56, + "delimiter": " ", + }, +} + +eval_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "src_vocab_file": "toy_text_data/vocab/source.txt", + "tgt_vocab_file": "toy_text_data/vocab/target.txt", + "source_file": "toy_text_data/dev/source.txt", + "target_file": "toy_text_data/dev/target.txt", + "shuffle": False, + "repeat": True, + "max_length": 56, + "delimiter": " ", + }, +} + +infer_params = { + "batch_size_per_gpu": 1, + "decoder": BeamSearchRNNDecoderWithAttention, + "decoder_params": { + "decoder_cell_type": "lstm", + "decoder_cell_units": d_model, + "decoder_layers": num_layers, + "decoder_dp_input_keep_prob": 0.8, + "decoder_dp_output_keep_prob": 1.0, + "decoder_use_skip_connections": False, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + "tgt_emb_size": d_model, + "attention_type": "luong", + "luong_scale": False, + "attention_layer_size": d_model, + "beam_width": 5, + "length_penalty": 1.0, + }, + + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "src_vocab_file": "toy_text_data/vocab/source.txt", + "tgt_vocab_file": "toy_text_data/vocab/source.txt", + "source_file": "toy_text_data/test/source.txt", + "target_file": "toy_text_data/test/source.txt", + "shuffle": False, + "repeat": False, + "max_length": 256, + "delimiter": " ", + }, + +} diff --git a/example_configs/text2text/toy-reversal/nmt-reversal-RC.py b/example_configs/text2text/toy-reversal/nmt-reversal-RC.py new file mode 100644 index 000000000..5ed166fdb --- /dev/null +++ b/example_configs/text2text/toy-reversal/nmt-reversal-RC.py @@ -0,0 +1,140 @@ +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals + +import tensorflow as tf + +from open_seq2seq.models import Text2Text +from open_seq2seq.encoders import BidirectionalRNNEncoderWithEmbedding +from open_seq2seq.decoders import ConvS2SDecoder + +from open_seq2seq.data.text2text.text2text import ParallelTextDataLayer +from open_seq2seq.losses import BasicSequenceLoss + +from open_seq2seq.data.text2text.tokenizer 
import EOS_ID +from open_seq2seq.data.text2text.text2text import SpecialTextTokens +from open_seq2seq.optimizers.lr_policies import fixed_lr + +""" +This configuration file describes bidirectional rnn based encoder and convolutional decoder +on the toy task of reversing sequences +""" + +base_model = Text2Text +d_model = 128 +num_layers = 2 + +base_params = { + "use_horovod": False, + "num_gpus": 1, + "batch_size_per_gpu": 64, + "max_steps": 1000, + "save_summaries_steps": 10, + "print_loss_steps": 10, + "print_samples_steps": 20, + "eval_steps": 50, + "save_checkpoint_steps": 200, + + "logdir": "ReversalTask-RC", + + "optimizer": "Adam", + "optimizer_params": {"epsilon": 1e-9}, + "lr_policy": fixed_lr, + "lr_policy_params": { + 'learning_rate': 1e-3 + }, + + "max_grad_norm": 3.0, + "dtype": tf.float32, + + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + "encoder": BidirectionalRNNEncoderWithEmbedding, + "encoder_params": { + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": int(d_model/2), + }, + + "encoder_layers": num_layers, + "encoder_dp_input_keep_prob": 0.8, + "encoder_dp_output_keep_prob": 1.0, + "encoder_use_skip_connections": False, + "src_emb_size": d_model, + }, + + "decoder": ConvS2SDecoder, + "decoder_params": { + "decoder_layers": num_layers, + + "shared_embed": True, + "tgt_emb_size": d_model, + + "conv_nchannels_kwidth": [(d_model, 3)] * num_layers, + + "embedding_dropout_keep_prob": 0.9, + "hidden_dropout_keep_prob": 0.9, + "out_dropout_keep_prob": 0.9, + + "max_input_length": 100, + "extra_decode_length": 10, + "beam_size": 5, + "alpha": 0.6, + + "EOS_ID": EOS_ID, + "GO_SYMBOL": SpecialTextTokens.S_ID.value, + "END_SYMBOL": SpecialTextTokens.EOS_ID.value, + "PAD_SYMBOL": SpecialTextTokens.PAD_ID.value, + }, + + "loss": BasicSequenceLoss, + "loss_params": { + "offset_target_by_one": True, + "average_across_timestep": True, + "do_mask": True + } +} + +train_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "src_vocab_file": "toy_text_data/vocab/source.txt", + "tgt_vocab_file": "toy_text_data/vocab/target.txt", + "source_file": "toy_text_data/train/source.txt", + "target_file": "toy_text_data/train/target.txt", + "shuffle": True, + "repeat": True, + "max_length": 56, + "delimiter": " ", + }, +} + +eval_params = { + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "src_vocab_file": "toy_text_data/vocab/source.txt", + "tgt_vocab_file": "toy_text_data/vocab/target.txt", + "source_file": "toy_text_data/dev/source.txt", + "target_file": "toy_text_data/dev/target.txt", + "shuffle": False, + "repeat": True, + "max_length": 56, + "delimiter": " ", + }, +} + +infer_params = { + "batch_size_per_gpu": 1, + "data_layer": ParallelTextDataLayer, + "data_layer_params": { + "src_vocab_file": "toy_text_data/vocab/source.txt", + "tgt_vocab_file": "toy_text_data/vocab/source.txt", + "source_file": "toy_text_data/test/source.txt", + # this is intentional to be sure model is not using ground truth + "target_file": "toy_text_data/test/source.txt", + "shuffle": False, + "repeat": False, + "max_length": 256, + "delimiter": " ", + }, +} diff --git a/example_configs/text2text/nmt-reversal-RR.py b/example_configs/text2text/toy-reversal/nmt-reversal-RR.py similarity index 88% rename from example_configs/text2text/nmt-reversal-RR.py rename to example_configs/text2text/toy-reversal/nmt-reversal-RR.py index 8e6b038a7..ba76b2b12 100644 --- 
a/example_configs/text2text/nmt-reversal-RR.py +++ b/example_configs/text2text/toy-reversal/nmt-reversal-RR.py @@ -18,6 +18,7 @@ base_params = { "use_horovod": False, + #"iter_size": 10, # set this to number of available GPUs "num_gpus": 1, "batch_size_per_gpu": 64, @@ -37,11 +38,15 @@ }, "max_grad_norm": 3.0, "dtype": tf.float32, + #"dtype": "mixed", "encoder": BidirectionalRNNEncoderWithEmbedding, "encoder_params": { - "encoder_cell_type": "lstm", - "encoder_cell_units": 128, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 128, + "forget_bias": 1.0, + }, "encoder_layers": 1, "encoder_dp_input_keep_prob": 0.8, "encoder_dp_output_keep_prob": 1.0, @@ -51,8 +56,11 @@ "decoder": RNNDecoderWithAttention, "decoder_params": { - "decoder_cell_type": "lstm", - "decoder_cell_units": 128, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 128, + # "forget_bias": 1.0, + }, "decoder_layers": 1, "decoder_dp_input_keep_prob": 0.8, "decoder_dp_output_keep_prob": 1.0, @@ -106,8 +114,13 @@ "batch_size_per_gpu": 1, "decoder": BeamSearchRNNDecoderWithAttention, "decoder_params": { - "decoder_cell_type": "lstm", - "decoder_cell_units": 128, + #"decoder_cell_type": "lstm", + #"decoder_cell_units": 128, + "core_cell": tf.nn.rnn_cell.LSTMCell, + "core_cell_params": { + "num_units": 128, + "forget_bias": 1.0, + }, "decoder_layers": 1, "decoder_dp_input_keep_prob": 0.8, "decoder_dp_output_keep_prob": 1.0, diff --git a/example_configs/text2text/nmt-reversal-RT.py b/example_configs/text2text/toy-reversal/nmt-reversal-RT.py similarity index 97% rename from example_configs/text2text/nmt-reversal-RT.py rename to example_configs/text2text/toy-reversal/nmt-reversal-RT.py index 0b04d9a3b..0d1247171 100644 --- a/example_configs/text2text/nmt-reversal-RT.py +++ b/example_configs/text2text/toy-reversal/nmt-reversal-RT.py @@ -44,8 +44,10 @@ "encoder": BidirectionalRNNEncoderWithEmbedding, "encoder_params": { - "encoder_cell_type": "lstm", - "encoder_cell_units": 128, + "core_cell_params": { + "num_units": 128, + "forget_bias": 1.0, + }, "encoder_layers": 1, "encoder_dp_input_keep_prob": 0.8, "encoder_dp_output_keep_prob": 1.0, diff --git a/example_configs/text2text/nmt-reversal-TT.py b/example_configs/text2text/toy-reversal/nmt-reversal-TT.py similarity index 98% rename from example_configs/text2text/nmt-reversal-TT.py rename to example_configs/text2text/toy-reversal/nmt-reversal-TT.py index a995d6b6f..61ae37ebb 100644 --- a/example_configs/text2text/nmt-reversal-TT.py +++ b/example_configs/text2text/toy-reversal/nmt-reversal-TT.py @@ -29,8 +29,8 @@ "save_checkpoint_steps": 300, "logdir": "ReversalTask-Transformer-MP", "dtype": tf.float32, - #"dtype": "mixed", - #"automatic_loss_scaling": "Backoff", + # "dtype": "mixed", + # "loss_scaling": "Backoff", "optimizer": tf.contrib.opt.LazyAdamOptimizer, "optimizer_params": { diff --git a/open_seq2seq/data/image2label/cifar10_download_and_extract.py b/open_seq2seq/data/image2label/cifar10_download_and_extract.py new file mode 100644 index 000000000..ee4f48942 --- /dev/null +++ b/open_seq2seq/data/image2label/cifar10_download_and_extract.py @@ -0,0 +1,63 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Downloads and extracts the binary version of the CIFAR-10 dataset.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os +import sys +import tarfile + +from six.moves import urllib +import tensorflow as tf + +DATA_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz' + +parser = argparse.ArgumentParser() + +parser.add_argument( + '--data_dir', type=str, default='data/', + help='Directory to download data and extract the tarball') + + +def main(_): + """Download and extract the tarball from Alex's website.""" + if not os.path.exists(FLAGS.data_dir): + os.makedirs(FLAGS.data_dir) + + filename = DATA_URL.split('/')[-1] + filepath = os.path.join(FLAGS.data_dir, filename) + + if not os.path.exists(filepath): + def _progress(count, block_size, total_size): + sys.stdout.write('\r>> Downloading %s %.1f%%' % ( + filename, 100.0 * count * block_size / total_size)) + sys.stdout.flush() + + filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, _progress) + print() + statinfo = os.stat(filepath) + print('Successfully downloaded', filename, statinfo.st_size, 'bytes.') + + tarfile.open(filepath, 'r:gz').extractall(FLAGS.data_dir) + + +if __name__ == '__main__': + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(argv=[sys.argv[0]] + unparsed) diff --git a/open_seq2seq/data/image2label/image2label.py b/open_seq2seq/data/image2label/image2label.py index c37696772..a16023327 100644 --- a/open_seq2seq/data/image2label/image2label.py +++ b/open_seq2seq/data/image2label/image2label.py @@ -7,11 +7,154 @@ import os import tensorflow as tf +import numpy as np from open_seq2seq.data.data_layer import DataLayer from .imagenet_preprocessing import parse_record +class CifarDataLayer(DataLayer): + _HEIGHT = 28 + _WIDTH = 28 + _NUM_CHANNELS = 3 + _DEFAULT_IMAGE_BYTES = 32 * 32 * 3 + # The record is the image plus a one-byte label + _RECORD_BYTES = _DEFAULT_IMAGE_BYTES + 1 + _NUM_CLASSES = 10 + _NUM_DATA_FILES = 5 + + _NUM_IMAGES = { + 'train': 50000, + 'validation': 10000, + } + + @staticmethod + def get_required_params(): + return dict(DataLayer.get_required_params(), **{ + 'data_dir': str, + }) + + @staticmethod + def get_optional_params(): + return dict(DataLayer.get_optional_params(), **{ + 'num_parallel_calls': int, + 'shuffle_buffer': int, + 'image_size': int, + 'num_classes': int, + }) + + def __init__(self, params, model, num_workers, worker_id): + super(CifarDataLayer, self).__init__(params, model, + num_workers, worker_id) + if self.params['mode'] == 'infer': + raise ValueError('Inference is not supported on CifarDataLayer') + + if self.params['mode'] == 'train': + filenames = [ + os.path.join(self.params['data_dir'], 'data_batch_{}.bin'.format(i)) + for i in range(1, self._NUM_DATA_FILES + 1) + ] + else: + filenames = [os.path.join(self.params['data_dir'], 'test_batch.bin')] + + self.file_names = filenames + self._train_size = 50000 + self._valid_size = 10000 + self._iterator = None + 
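+ # Each record in the CIFAR-10 .bin files listed above is one label byte
+ # followed by a 32 * 32 * 3 = 3072 byte image (_RECORD_BYTES = 3073 in total);
+ # parse_record() below splits off the label and reshapes the image.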
self._input_tensors = None + + def preprocess_image(self, image, is_training): + """Preprocess a single image of layout [height, width, depth].""" + if is_training: + # Resize the image to add four extra pixels on each side. + image = tf.image.resize_image_with_crop_or_pad( + image, self._HEIGHT + 8, self._WIDTH + 8) + + # Randomly crop a [_HEIGHT, _WIDTH] section of the image. + image = tf.random_crop(image, [self._HEIGHT, self._WIDTH, + self._NUM_CHANNELS]) + + # Randomly flip the image horizontally. + image = tf.image.random_flip_left_right(image) + + else: + image = tf.image.resize_image_with_crop_or_pad( + image, self._HEIGHT, self._WIDTH) + + # Subtract off the mean and divide by the variance of the pixels. + image = tf.image.per_image_standardization(image) + + return image + + def parse_record(self, raw_record, is_training, num_classes=10): + """Parse CIFAR-10 image and label from a raw record.""" + # Convert bytes to a vector of uint8 that is record_bytes long. + record_vector = tf.decode_raw(raw_record, tf.uint8) + + # The first byte represents the label, which we convert from uint8 to int32 + # and then to one-hot. + label = tf.cast(record_vector[0], tf.int32) + + # The remaining bytes after the label represent the image, which we reshape + # from [depth * height * width] to [depth, height, width]. + depth_major = tf.reshape(record_vector[1:self._RECORD_BYTES], + [3, 32, 32]) + + # Convert from [depth, height, width] to [height, width, depth], and cast as + # float32. + image = tf.cast(tf.transpose(depth_major, [1, 2, 0]), tf.float32) + + image = self.preprocess_image(image, is_training) + label = tf.one_hot(tf.reshape(label, shape=[]), num_classes) + + return image, label + + def build_graph(self): + dataset = tf.data.FixedLengthRecordDataset(self.file_names, + self._RECORD_BYTES) + + dataset = dataset.prefetch(buffer_size=self.params['batch_size']) + if self.params['shuffle']: + # shuffling images + dataset = dataset.shuffle(buffer_size=self.params.get('shuffle_buffer', + 1500)) + dataset = dataset.repeat() + + dataset = dataset.map( + lambda value: self.parse_record( + raw_record=value, + is_training=self.params['mode'] == 'train', + ), + num_parallel_calls=self.params.get('num_parallel_calls', 16), + ) + + dataset = dataset.batch(self.params['batch_size']) + dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE) + + self._iterator = dataset.make_initializable_iterator() + inputs, labels = self.iterator.get_next() + if self.params['mode'] == 'train': + tf.summary.image('augmented_images', inputs, max_outputs=1) + self._input_tensors = { + 'source_tensors': [inputs], + 'target_tensors': [labels], + } + + @property + def input_tensors(self): + return self._input_tensors + + @property + def iterator(self): + return self._iterator + + def get_size_in_samples(self): + if self.params['mode'] == 'train': + return self._train_size + else: + return len(np.arange(self._valid_size)[self._worker_id::self._num_workers]) + + class ImagenetDataLayer(DataLayer): @staticmethod def get_required_params(): @@ -24,6 +167,8 @@ def get_optional_params(): return dict(DataLayer.get_optional_params(), **{ 'num_parallel_calls': int, 'shuffle_buffer': int, + 'image_size': int, + 'num_classes': int, }) def __init__(self, params, model, num_workers, worker_id): @@ -76,12 +221,17 @@ def build_graph(self): dataset = dataset.repeat() dataset = dataset.map( - lambda value: parse_record(value, self.params['mode'] == 'train'), + lambda value: parse_record( + raw_record=value, + is_training=self.params['mode'] == 
'train', + image_size=self.params.get('image_size', 224), + num_classes=self.params.get('num_classes', 1000), + ), num_parallel_calls=self.params.get('num_parallel_calls', 16), ) dataset = dataset.batch(self.params['batch_size']) - dataset = dataset.prefetch(1) + dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE) self._iterator = dataset.make_initializable_iterator() inputs, labels = self.iterator.get_next() diff --git a/open_seq2seq/data/image2label/imagenet_preprocessing.py b/open_seq2seq/data/image2label/imagenet_preprocessing.py index 559257359..0bcda0c20 100644 --- a/open_seq2seq/data/image2label/imagenet_preprocessing.py +++ b/open_seq2seq/data/image2label/imagenet_preprocessing.py @@ -13,22 +13,17 @@ # limitations under the License. # ============================================================================== """Provides utilities to preprocess images. - Training images are sampled using the provided bounding boxes, and subsequently cropped to the sampled bounding box. Images are additionally flipped randomly, then resized to the target output size (without aspect-ratio preservation). - Images used during evaluation are resized (with aspect-ratio preservation) and centrally cropped. - All images undergo mean color subtraction. - Note that these steps are colloquially referred to as "ResNet preprocessing," and they differ from "VGG preprocessing," which does not use bounding boxes and instead does an aspect-preserving resize followed by random crop during training. (These both differ from "Inception preprocessing," which introduces color distortion steps.) - """ from __future__ import absolute_import @@ -47,14 +42,9 @@ # _RESIZE_MIN x (_RESIZE_MIN * 2). _RESIZE_MIN = 256 -_DEFAULT_IMAGE_SIZE = 224 -_NUM_CHANNELS = 3 -_NUM_CLASSES = 1001 - def _decode_crop_and_flip(image_buffer, bbox, num_channels): """Crops the given image to a random part of the image, and randomly flips. - We use the fused decode_and_crop op, which performs better than the two ops used separately in series, but note that this requires that the image be passed in as an un-decoded string Tensor. @@ -68,7 +58,6 @@ def _decode_crop_and_flip(image_buffer, bbox, num_channels): Returns: 3-D tensor with cropped image. - """ # A large fraction of image datasets contain a human-annotated bounding box # delineating the region of the image containing the object of interest. We @@ -123,12 +112,12 @@ def _central_crop(image, crop_height, crop_width): image, [crop_top, crop_left, 0], [crop_height, crop_width, -1]) -def _mean_image_subtraction(image, means, num_channels): - """Subtracts the given means from each image channel. +def _mean_image_subtraction_and_normalization(image, means, num_channels): + """Subtracts the given means from each image channel and divides by 127.5. For example: means = [123.68, 116.779, 103.939] - image = _mean_image_subtraction(image, means) + image = _mean_image_subtraction_and_normalization(image, means) Note that the rank of `image` must be known. @@ -138,7 +127,7 @@ def _mean_image_subtraction(image, means, num_channels): num_channels: number of color channels in the image that will be distorted. Returns: - the centered image. + the centered image and normalized image. Raises: ValueError: If the rank of `image` is unknown, if `image` has a rank other @@ -154,12 +143,11 @@ def _mean_image_subtraction(image, means, num_channels): # We have a 1-D tensor of means; convert to 3-D. 
means = tf.expand_dims(tf.expand_dims(means, 0), 0) - return image - means + return (image - means) / 127.5 def _smallest_size_at_least(height, width, resize_min): """Computes new shape with the smallest side equal to `smallest_side`. - Computes new shape with the smallest side equal to `smallest_side` while preserving the original aspect ratio. @@ -209,7 +197,6 @@ def _aspect_preserving_resize(image, resize_min): def _resize_image(image, height, width): """Simple wrapper around tf.resize_images. - This is primarily to make sure we use the same `ResizeMethod` and other details each time. @@ -230,7 +217,6 @@ def _resize_image(image, height, width): def preprocess_image(image_buffer, bbox, output_height, output_width, num_channels, is_training=False): """Preprocesses the given image. - Preprocessing includes decoding, cropping, and resizing for both training and eval images. Training preprocessing, however, introduces some random distortion of the image to improve accuracy. @@ -261,16 +247,15 @@ def preprocess_image(image_buffer, bbox, output_height, output_width, image.set_shape([output_height, output_width, num_channels]) - return _mean_image_subtraction(image, _CHANNEL_MEANS, num_channels) + return _mean_image_subtraction_and_normalization(image, _CHANNEL_MEANS, + num_channels) def _parse_example_proto(example_serialized): """Parses an Example proto containing a training example of an image. - The output of the build_image_data.py image preprocessing script is a dataset containing serialized Example protocol buffers. Each Example proto contains the following fields (values are included as examples): - image/height: 462 image/width: 581 image/colorspace: 'RGB' @@ -334,16 +319,17 @@ def _parse_example_proto(example_serialized): return features['image/encoded'], label, bbox -def parse_record(raw_record, is_training): +def parse_record(raw_record, is_training, image_size=224, num_classes=1000): """Parses a record containing a training example of an image. - The input record is parsed into a label and image, and the image is passed through preprocessing steps (cropping, flipping, and so on). Args: raw_record: scalar Tensor tf.string containing a serialized - Example protocol buffer. + Example protocol buffer. is_training: A boolean denoting whether the input is for training. + image_size (int): size that images should be resized to. + num_classes (int): number of output classes. Returns: Tuple with processed image tensor and one-hot-encoded label tensor. 
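Two functional changes land in imagenet_preprocessing.py: preprocessed images are now mean-subtracted and divided by 127.5 (the renamed _mean_image_subtraction_and_normalization above), and labels are shifted from 1-based to 0-based before one-hot encoding (the parse_record hunk below). A minimal NumPy sketch, not part of the patch, using the channel means quoted in the docstring above and a synthetic image, just to make the resulting value range concrete:

import numpy as np

# Synthetic float image in [0, 255] and the RGB channel means from the docstring.
image = np.random.uniform(0.0, 255.0, size=(224, 224, 3)).astype(np.float32)
channel_means = np.array([123.68, 116.779, 103.939], dtype=np.float32)

# Mean subtraction followed by division by 127.5 keeps pixels roughly in [-1, 1.2].
normalized = (image - channel_means) / 127.5
assert -1.0 <= normalized.min() and normalized.max() <= 1.2

# TFRecord labels are 1..1000; subtracting 1 makes them 0-based before one-hot
# encoding with depth num_classes = 1000.
label = 1000
one_hot = np.eye(1000, dtype=np.float32)[label - 1]
assert one_hot.argmax() == 999

With pixel values in [0, 255], the normalized result always lies between roughly -0.97 and 1.19, so downstream models see inputs on an approximately unit scale.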
@@ -353,11 +339,12 @@ def parse_record(raw_record, is_training): image = preprocess_image( image_buffer=image_buffer, bbox=bbox, - output_height=_DEFAULT_IMAGE_SIZE, - output_width=_DEFAULT_IMAGE_SIZE, - num_channels=_NUM_CHANNELS, + output_height=image_size, + output_width=image_size, + num_channels=3, is_training=is_training) - label = tf.one_hot(tf.reshape(label, shape=[]), _NUM_CLASSES) + # subtracting 1 to make labels go from 0 to 999 + label = tf.one_hot(tf.reshape(label - 1, shape=[]), num_classes) - return image, label \ No newline at end of file + return image, label diff --git a/open_seq2seq/data/speech2text/speech2text.py b/open_seq2seq/data/speech2text/speech2text.py index 938338672..7baeb85a8 100644 --- a/open_seq2seq/data/speech2text/speech2text.py +++ b/open_seq2seq/data/speech2text/speech2text.py @@ -20,7 +20,7 @@ class Speech2TextDataLayer(DataLayer): def get_required_params(): return dict(DataLayer.get_required_params(), **{ 'num_audio_features': int, - 'input_type': ['spectrogram', 'mfcc'], + 'input_type': ['spectrogram', 'mfcc', 'logfbank'], 'vocab_file': str, 'dataset_files': list, }) @@ -32,7 +32,7 @@ def get_optional_params(): 'pad_to': int, }) - def __init__(self, params, model, num_workers=None, worker_id=None): + def __init__(self, params, model, num_workers, worker_id): """Speech-to-text data layer constructor. See parent class for arguments description. @@ -86,7 +86,6 @@ def __init__(self, params, model, num_workers=None, worker_id=None): self._input_tensors = None def split_data(self, data): - """Method that performs data split for evaluation.""" if self.params['mode'] != 'train' and self._num_workers is not None: size = len(data) start = size // self._num_workers * self._worker_id @@ -105,12 +104,12 @@ def iterator(self): def build_graph(self): """Builds data processing graph using ``tf.data`` API.""" - self._dataset = tf.data.Dataset.from_tensor_slices(self._files) - if self.params['shuffle']: - self._dataset = self._dataset.shuffle(self._size) - self._dataset = self._dataset.repeat() - if self.params['mode'] != 'infer': + self._dataset = tf.data.Dataset.from_tensor_slices(self._files) + if self.params['shuffle']: + self._dataset = self._dataset.shuffle(self._size) + self._dataset = self._dataset.repeat() + self._dataset = self._dataset.map( lambda line: tf.py_func( self._parse_audio_transcript_element, @@ -125,21 +124,29 @@ def build_graph(self): padded_shapes=([None, self.params['num_audio_features']], 1, [None], 1) ) else: + indices = self.split_data( + np.array(list(map(lambda num: str(num), range(len(self.all_files))))) + ) + self._dataset = tf.data.Dataset.from_tensor_slices( + np.hstack((indices[:, np.newaxis], self._files[:, np.newaxis])) + ) + self._dataset = self._dataset.repeat() self._dataset = self._dataset.map( lambda line: tf.py_func( self._parse_audio_element, [line], - [self.params['dtype'], tf.int32], + [self.params['dtype'], tf.int32, tf.int32], stateful=False, ), num_parallel_calls=8, ) self._dataset = self._dataset.padded_batch( self.params['batch_size'], - padded_shapes=([None, self.params['num_audio_features']], 1) + padded_shapes=([None, self.params['num_audio_features']], 1, 1) ) - self._iterator = self._dataset.prefetch(8).make_initializable_iterator() + self._iterator = self._dataset.prefetch(tf.contrib.data.AUTOTUNE)\ + .make_initializable_iterator() if self.params['mode'] != 'infer': x, x_length, y, y_length = self._iterator.get_next() @@ -148,7 +155,9 @@ def build_graph(self): y.set_shape([self.params['batch_size'], None]) 
y_length = tf.reshape(y_length, [self.params['batch_size']]) else: - x, x_length = self._iterator.get_next() + x, x_length, x_id = self._iterator.get_next() + x_id = tf.reshape(x_id, [self.params['batch_size']]) + x.set_shape([self.params['batch_size'], None, self.params['num_audio_features']]) x_length = tf.reshape(x_length, [self.params['batch_size']]) @@ -157,6 +166,8 @@ def build_graph(self): self._input_tensors["source_tensors"] = [x, x_length] if self.params['mode'] != 'infer': self._input_tensors['target_tensors'] = [y, y_length] + else: + self._input_tensors['source_ids'] = [x_id] def _parse_audio_transcript_element(self, element): """Parses tf.data element from TextLineDataset into audio and text. @@ -183,15 +194,17 @@ def _parse_audio_transcript_element(self, element): np.int32(target), \ np.int32([len(target)]) - def _parse_audio_element(self, audio_filename): + def _parse_audio_element(self, id_and_audio_filename): """Parses audio from file and returns array of audio features. Args: - audio_filename: audio file name. + id_and_audio_filename: tuple of sample id and corresponding audio file name. Returns: tuple: source audio features as ``np.array``, length of source sequence, + sample id. """ + idx, audio_filename = id_and_audio_filename pad_to = self.params.get('pad_to', 8) source = get_speech_features_from_file( audio_filename, self.params['num_audio_features'], pad_to, @@ -199,7 +212,7 @@ def _parse_audio_element(self, audio_filename): augmentation=self.params.get('augmentation', None), ) return source.astype(self.params['dtype'].as_numpy_dtype()), \ - np.int32([len(source)]) + np.int32([len(source)]), np.int32([idx]) @property def input_tensors(self): diff --git a/open_seq2seq/data/speech2text/speech_utils.py b/open_seq2seq/data/speech2text/speech_utils.py index 73bd9f1a9..d93d54348 100644 --- a/open_seq2seq/data/speech2text/speech_utils.py +++ b/open_seq2seq/data/speech2text/speech_utils.py @@ -42,6 +42,13 @@ def get_speech_features_from_file(filename, num_features, pad_to=8, ) +def normalize_signal(signal): + """ + Normalize float32 signal to [-1, 1] range + """ + return signal / np.max(np.abs(signal)) + + def augment_audio_signal(signal, fs, augmentation): """Function that performs audio signal augmentation. @@ -53,7 +60,7 @@ def augment_audio_signal(signal, fs, augmentation): Returns: np.array: np.array with augmented audio signal. 
""" - signal_float = signal.astype(np.float32) / 32768.0 + signal_float = normalize_signal(signal.astype(np.float32)) if augmentation['time_stretch_ratio'] > 0: # time stretch (might be slow) @@ -72,7 +79,7 @@ def augment_audio_signal(signal, fs, augmentation): signal_float += np.random.randn(signal_float.shape[0]) * \ 10.0 ** (noise_level_db / 20.0) - return (signal_float * 32768.0).astype(np.int16) + return (normalize_signal(signal_float) * 32767.0).astype(np.int16) def get_speech_features(signal, fs, num_features, pad_to=8, @@ -118,7 +125,7 @@ def get_speech_features(signal, fs, num_features, pad_to=8, if pad_to > 0: if length % pad_to != 0: pad_size = (pad_to - length % pad_to) * n_window_stride - signal = np.pad(signal, (0, pad_size), mode='reflect') + signal = np.pad(signal, (0, pad_size), mode='constant') if features_type == 'spectrogram': frames = psf.sigproc.framesig(sig=signal, @@ -146,10 +153,22 @@ def get_speech_features(signal, fs, num_features, pad_to=8, preemph=0.97, ceplifter=2*num_features, appendEnergy=False) + + elif features_type == 'logfbank': + features = psf.logfbank(signal=signal, + samplerate=fs, + winlen=window_size, + winstep=window_stride, + nfilt=num_features, + nfft=512, + lowfreq=0, highfreq=fs/2, + preemph=0.97) + else: raise ValueError('Unknown features type: {}'.format(features_type)) - assert features.shape[0] % pad_to == 0 + if pad_to > 0: + assert features.shape[0] % pad_to == 0 m = np.mean(features) s = np.std(features) features = (features - m) / s diff --git a/open_seq2seq/data/speech2text/speech_utils_test.py b/open_seq2seq/data/speech2text/speech_utils_test.py index 6c400877d..33b8ca7ea 100644 --- a/open_seq2seq/data/speech2text/speech_utils_test.py +++ b/open_seq2seq/data/speech2text/speech_utils_test.py @@ -47,7 +47,7 @@ def test_get_speech_features_from_file(self): for num_features in [161, 120]: for window_stride in [10e-3, 5e-3, 40e-3]: for window_size in [20e-3, 30e-3]: - for features_type in ['spectrogram', 'mfcc']: + for features_type in ['spectrogram', 'mfcc', 'logfbank']: fs, signal = wave.read(filename) n_window_size = int(fs * window_size) n_window_stride = int(fs * window_stride) @@ -118,7 +118,7 @@ def test_get_speech_features_with_sine(self): fs = 16000.0 t = np.arange(0, 0.5, 1.0 / fs) signal = np.sin(2 * np.pi * 4000 * t) - features = get_speech_features(signal, fs, 161) + features = get_speech_features(signal, fs, 161, pad_to=0) npt.assert_allclose( np.abs(features - features[0]), np.zeros_like(features), diff --git a/open_seq2seq/data/text2text/__init__.py b/open_seq2seq/data/text2text/__init__.py index e69de29bb..15904c843 100644 --- a/open_seq2seq/data/text2text/__init__.py +++ b/open_seq2seq/data/text2text/__init__.py @@ -0,0 +1 @@ +from . 
import tokenizer \ No newline at end of file diff --git a/open_seq2seq/data/text2text/parse_output.py b/open_seq2seq/data/text2text/parse_output.py new file mode 100644 index 000000000..cfc4b80a9 --- /dev/null +++ b/open_seq2seq/data/text2text/parse_output.py @@ -0,0 +1,50 @@ +# Copyright (c) 2017 NVIDIA Corporation +""" +This file takes output of the inference stage produced using +TransformerDataLayer and converts it to simple tokenized text +""" +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals + +import argparse +import sys +import tokenizer + +def main(argv): + + with open(FLAGS.input_file, 'r') as in_file: + def trim(token): + return token[1:-1] + + print("******Reading from file: {}".format(FLAGS.input_file)) + with open(FLAGS.output_file, 'w') as out_file: + print("******Writing to file: {}".format(FLAGS.output_file)) + for line in in_file: + # merge and split by _ + escaped_tokens = "".join([trim(t) for t in line.strip().split(" ")]) + escaped_tokens = escaped_tokens.split("_") + + # unescape + unescaped_tokens = [] + for token in escaped_tokens: + if token: + unescaped_tokens.append(tokenizer._unescape_token(token)) + + # join and write + out_file.write(tokenizer._join_tokens_to_string(unescaped_tokens)+'\n') + print("******All done!") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--input_file", "-if", type=str, default="", + help="output of the inference stage produced using model with " + "TransformerDataLayer", + metavar="") + parser.add_argument( + "--output_file", "-of", type=str, default="tokenized_output.txt", + help="where to save output", + metavar="") + FLAGS, unparsed = parser.parse_known_args() + main(sys.argv) \ No newline at end of file diff --git a/open_seq2seq/data/text2text/process_data.py b/open_seq2seq/data/text2text/process_data.py new file mode 100644 index 000000000..2c7303f3a --- /dev/null +++ b/open_seq2seq/data/text2text/process_data.py @@ -0,0 +1,433 @@ +# Copyright 2018 MLBenchmark Group. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Download and preprocess WMT17 ende training and evaluation datasets.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os +import random +import sys +import tarfile +import urllib + +import six +import tensorflow as tf +import urllib.request + +import tokenizer + +# Data sources for training/evaluating the transformer translation model. +# If any of the training sources are changed, then either: +# 1) use the flag `--search` to find the best min count or +# 2) update the _TRAIN_DATA_MIN_COUNT constant. +# min_count is the minimum number of times a token must appear in the data +# before it is added to the vocabulary. 
"Best min count" refers to the value +# that generates a vocabulary set that is closest in size to _TARGET_VOCAB_SIZE. +_TRAIN_DATA_SOURCES = [ + { + "url": "http://data.statmt.org/wmt17/translation-task/" + "training-parallel-nc-v12.tgz", + "input": "news-commentary-v12.de-en.en", + "target": "news-commentary-v12.de-en.de", + }, + { + "url": "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz", + "input": "commoncrawl.de-en.en", + "target": "commoncrawl.de-en.de", + }, + { + "url": "http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz", + "input": "europarl-v7.de-en.en", + "target": "europarl-v7.de-en.de", + }, +] +# Use pre-defined minimum count to generate subtoken vocabulary. +_TRAIN_DATA_MIN_COUNT = 6 + + +_EVAL_DATA_SOURCES = [ + { + "url": "http://data.statmt.org/wmt17/translation-task/dev.tgz", + "input": "newstest2013.en", + "target": "newstest2013.de", + } +] + +_TEST_DATA_SOURCES = [ + { + "url": "https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/newstest2014.en", + "input": "newstest2014.en", + "target": "newstest2014.en", + } +] + +# Vocabulary constants +_TARGET_VOCAB_SIZE = 32768 # Number of subtokens in the vocabulary list. +_TARGET_THRESHOLD = 327 # Accept vocabulary if size is within this threshold +_VOCAB_FILE = "vocab.ende.%d" % _TARGET_VOCAB_SIZE + +# Strings to inclue in the generated files. +_PREFIX = "wmt32k" +_TRAIN_TAG = "train" +_EVAL_TAG = "dev" # Following WMT and Tensor2Tensor conventions, in which the + # evaluation datasets are tagged as "dev" for development. +_TEST_TAG = "test" + +# Number of files to split train and evaluation data +_TRAIN_SHARDS = 100 +_EVAL_SHARDS = 1 +_TEST_SHARDS = 1 + +def find_file(path, filename, max_depth=5): + """Returns full filepath if the file is in path or a subdirectory.""" + for root, dirs, files in os.walk(path): + if filename in files: + return os.path.join(root, filename) + + # Don't search past max_depth + depth = root[len(path) + 1:].count(os.sep) + if depth > max_depth: + del dirs[:] # Clear dirs + return None + + +############################################################################### +# Download and extraction functions +############################################################################### +def get_raw_files(raw_dir, data_source): + """Return raw files from source. Downloads/extracts if needed. + + Args: + raw_dir: string directory to store raw files + data_source: dictionary with + {"url": url of compressed dataset containing input and target files + "input": file with data in input language + "target": file with data in target language} + + Returns: + dictionary with + {"inputs": list of files containing data in input language + "targets": list of files containing corresponding data in target language + } + """ + raw_files = { + "inputs": [], + "targets": [], + } # keys + for d in data_source: + input_file, target_file = download_and_extract( + raw_dir, d["url"], d["input"], d["target"]) + raw_files["inputs"].append(input_file) + raw_files["targets"].append(target_file) + return raw_files + + +def download_report_hook(count, block_size, total_size): + """Report hook for download progress. + + Args: + count: current block number + block_size: block size + total_size: total size + """ + percent = int(count * block_size * 100 / total_size) + print("\r%d%%" % percent + " completed", end="\r") + + +def download_from_url(path, url): + """Download content from a url. 
+ + Args: + path: string directory where file will be downloaded + url: string url + + Returns: + Full path to downloaded file + """ + filename = url.split("/")[-1] + found_file = find_file(path, filename, max_depth=0) + if found_file is None: + filename = os.path.join(path, filename) + tf.logging.info("Downloading from %s to %s." % (url, filename)) + inprogress_filepath = filename + ".incomplete" + inprogress_filepath, _ = urllib.request.urlretrieve( + url, inprogress_filepath, reporthook=download_report_hook) + # Print newline to clear the carriage return from the download progress. + print() + tf.gfile.Rename(inprogress_filepath, filename) + return filename + else: + tf.logging.info("Already downloaded: %s (at %s)." % (url, found_file)) + return found_file + + +def download_and_extract(path, url, input_filename, target_filename): + """Extract files from downloaded compressed archive file. + + Args: + path: string directory where the files will be downloaded + url: url containing the compressed input and target files + input_filename: name of file containing data in source language + target_filename: name of file containing data in target language + + Returns: + Full paths to extracted input and target files. + + Raises: + OSError: if the the download/extraction fails. + """ + # Check if extracted files already exist in path + input_file = find_file(path, input_filename) + target_file = find_file(path, target_filename) + if input_file and target_file: + tf.logging.info("Already downloaded and extracted %s." % url) + return input_file, target_file + + # Download archive file if it doesn't already exist. + compressed_file = download_from_url(path, url) + + # Extract compressed files + tf.logging.info("Extracting %s." % compressed_file) + with tarfile.open(compressed_file, "r:gz") as corpus_tar: + corpus_tar.extractall(path) + + # Return filepaths of the requested files. + input_file = find_file(path, input_filename) + target_file = find_file(path, target_filename) + + if input_file and target_file: + return input_file, target_file + + raise OSError("Download/extraction failed for url %s to path %s" % + (url, path)) + + +def txt_line_iterator(path): + """Iterate through lines of file.""" + with tf.gfile.Open(path) as f: + for line in f: + yield line.strip() + + +def compile_files(raw_dir, raw_files, tag): + """Compile raw files into a single file for each language. + + Args: + raw_dir: Directory containing downloaded raw files. + raw_files: Dict containing filenames of input and target data. + {"inputs": list of files containing data in input language + "targets": list of files containing corresponding data in target language + } + tag: String to append to the compiled filename. + + Returns: + Full path of compiled input and target files. + """ + tf.logging.info("Compiling files with tag %s." % tag) + filename = "%s-%s" % (_PREFIX, tag) + input_compiled_file = os.path.join(raw_dir, filename + ".lang1") + target_compiled_file = os.path.join(raw_dir, filename + ".lang2") + + with tf.gfile.Open(input_compiled_file, mode="w") as input_writer: + with tf.gfile.Open(target_compiled_file, mode="w") as target_writer: + for i in range(len(raw_files["inputs"])): + input_file = raw_files["inputs"][i] + target_file = raw_files["targets"][i] + + tf.logging.info("Reading files %s and %s." 
% (input_file, target_file)) + write_file(input_writer, input_file) + write_file(target_writer, target_file) + return input_compiled_file, target_compiled_file + + +def write_file(writer, filename): + """Write all of lines from file using the writer.""" + for line in txt_line_iterator(filename): + writer.write(line) + writer.write("\n") + + +############################################################################### +# Data preprocessing +############################################################################### +def encode_and_save_files( + subtokenizer, data_dir, raw_files, tag, total_shards): + """Save data from files as encoded Examples in TFrecord format. + + Args: + subtokenizer: Subtokenizer object that will be used to encode the strings. + data_dir: The directory in which to write the examples + raw_files: A tuple of (input, target) data files. Each line in the input and + the corresponding line in target file will be saved in a tf.Example. + tag: String that will be added onto the file names. + total_shards: Number of files to divide the data into. + + Returns: + List of all files produced. + """ + # Create a file for each shard. + filepaths = [shard_filename(data_dir, tag, n + 1, total_shards) + for n in range(total_shards)] + + if all_exist(filepaths): + tf.logging.info("Files with tag %s already exist." % tag) + return filepaths + + tf.logging.info("Saving files with tag %s." % tag) + input_file = raw_files[0] + target_file = raw_files[1] + + # Write examples to each shard in round robin order. + tmp_filepaths = [fname + ".incomplete" for fname in filepaths] + writers = [tf.python_io.TFRecordWriter(fname) for fname in tmp_filepaths] + counter, shard = 0, 0 + for counter, (input_line, target_line) in enumerate(zip( + txt_line_iterator(input_file), txt_line_iterator(target_file))): + if counter > 0 and counter % 100000 == 0: + tf.logging.info("\tSaving case %d." 
% counter) + example = dict_to_example( + {"inputs": subtokenizer.encode(input_line, add_eos=True), + "targets": subtokenizer.encode(target_line, add_eos=True)}) + writers[shard].write(example.SerializeToString()) + shard = (shard + 1) % total_shards + for writer in writers: + writer.close() + + for tmp_name, final_name in zip(tmp_filepaths, filepaths): + tf.gfile.Rename(tmp_name, final_name) + + tf.logging.info("Saved %d Examples", counter) + return filepaths + + +def shard_filename(path, tag, shard_num, total_shards): + """Create filename for data shard.""" + return os.path.join( + path, "%s-%s-%.5d-of-%.5d" % (_PREFIX, tag, shard_num, total_shards)) + + +def shuffle_records(fname): + """Shuffle records in a single file.""" + tf.logging.info("Shuffling records in file %s" % fname) + + # Rename file prior to shuffling + tmp_fname = fname + ".unshuffled" + tf.gfile.Rename(fname, tmp_fname) + + reader = tf.python_io.tf_record_iterator(tmp_fname) + records = [] + for record in reader: + records.append(record) + if len(records) % 100000 == 0: + tf.logging.info("\tRead: %d", len(records)) + + random.shuffle(records) + + # Write shuffled records to original file name + with tf.python_io.TFRecordWriter(fname) as w: + for count, record in enumerate(records): + w.write(record) + if count > 0 and count % 100000 == 0: + tf.logging.info("\tWriting record: %d" % count) + + tf.gfile.Remove(tmp_fname) + + +def dict_to_example(dictionary): + """Converts a dictionary of string->int to a tf.Example.""" + features = {} + for k, v in six.iteritems(dictionary): + features[k] = tf.train.Feature(int64_list=tf.train.Int64List(value=v)) + return tf.train.Example(features=tf.train.Features(feature=features)) + + +def all_exist(filepaths): + """Returns true if all files in the list exist.""" + for fname in filepaths: + if not tf.gfile.Exists(fname): + return False + return True + + +def make_dir(path): + if not tf.gfile.Exists(path): + tf.logging.info("Creating directory %s" % path) + tf.gfile.MakeDirs(path) + + +def main(unused_argv): + """Obtain training and evaluation data for the Transformer model.""" + tf.logging.set_verbosity(tf.logging.INFO) + + make_dir(FLAGS.raw_dir) + make_dir(FLAGS.data_dir) + + # Get paths of download/extracted training and evaluation files. + tf.logging.info("Step 1/4: Downloading data from source") + train_files = get_raw_files(FLAGS.raw_dir, _TRAIN_DATA_SOURCES) + eval_files = get_raw_files(FLAGS.raw_dir, _EVAL_DATA_SOURCES) + test_files = get_raw_files(FLAGS.raw_dir, _TEST_DATA_SOURCES) + + # Create subtokenizer based on the training files. + tf.logging.info("Step 2/4: Creating subtokenizer and building vocabulary") + train_files_flat = train_files["inputs"] + train_files["targets"] + vocab_file = os.path.join(FLAGS.data_dir, _VOCAB_FILE) + subtokenizer = tokenizer.Subtokenizer.init_from_files( + vocab_file, train_files_flat, _TARGET_VOCAB_SIZE, _TARGET_THRESHOLD, + min_count=None if FLAGS.search else _TRAIN_DATA_MIN_COUNT) + + tf.logging.info("Step 3/4: Compiling training and evaluation data") + compiled_train_files = compile_files(FLAGS.raw_dir, train_files, _TRAIN_TAG) + compiled_eval_files = compile_files(FLAGS.raw_dir, eval_files, _EVAL_TAG) + compiled_test_files = compile_files(FLAGS.raw_dir, test_files, _TEST_TAG) + + # Tokenize and save data as Examples in the TFRecord format. 
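+  # encode_and_save_files() writes the encoded sentence pairs round-robin
+  # across _TRAIN_SHARDS / _EVAL_SHARDS / _TEST_SHARDS TFRecord files, using
+  # "*.incomplete" names until a shard is fully written; the training shards
+  # are then shuffled in place by shuffle_records() below.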
+ tf.logging.info("Step 4/4: Preprocessing and saving data") + train_tfrecord_files = encode_and_save_files( + subtokenizer, FLAGS.data_dir, compiled_train_files, _TRAIN_TAG, + _TRAIN_SHARDS) + encode_and_save_files( + subtokenizer, FLAGS.data_dir, compiled_eval_files, _EVAL_TAG, + _EVAL_SHARDS) + encode_and_save_files( + subtokenizer, FLAGS.data_dir, compiled_test_files, _TEST_TAG, + _TEST_SHARDS) + + for fname in train_tfrecord_files: + shuffle_records(fname) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--data_dir", "-dd", type=str, default="/tmp/translate_ende", + help="[default: %(default)s] Directory for where the " + "translate_ende_wmt32k dataset is saved.", + metavar="
              ") + parser.add_argument( + "--raw_dir", "-rd", type=str, default="/tmp/translate_ende_raw", + help="[default: %(default)s] Path where the raw data will be downloaded " + "and extracted.", + metavar="") + parser.add_argument( + "--search", action="store_true", + help="If set, use binary search to find the vocabulary set with size" + "closest to the target size (%d)." % _TARGET_VOCAB_SIZE) + + FLAGS, unparsed = parser.parse_known_args() + main(sys.argv) \ No newline at end of file diff --git a/open_seq2seq/data/text2text/t2t.py b/open_seq2seq/data/text2text/t2t.py index c24ac23e6..8a2632c45 100644 --- a/open_seq2seq/data/text2text/t2t.py +++ b/open_seq2seq/data/text2text/t2t.py @@ -45,8 +45,9 @@ is the list of training files. Second, while reading records using `parallel_interleave`, the `sloppy` argument is used to generate randomness in the order of the examples. -""" +3. Modified slightly to fit OpenSeq2Seq needs +""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -72,7 +73,7 @@ def _load_records(filename): return tf.data.TFRecordDataset(filename, buffer_size=_READ_RECORD_BUFFER) -def _parse_example(serialized_example): +def _parse_example(serialized_example, pad_2_eight=False): """Return inputs and targets Tensors from a serialized tf.Example.""" data_fields = { "inputs": tf.VarLenFeature(tf.int64), @@ -81,6 +82,17 @@ def _parse_example(serialized_example): parsed = tf.parse_single_example(serialized_example, data_fields) inputs = tf.sparse_tensor_to_dense(parsed["inputs"]) targets = tf.sparse_tensor_to_dense(parsed["targets"]) + + if pad_2_eight: + inputs = tf.cond(tf.equal(tf.shape(inputs)[0] % 8, 0), + true_fn=lambda: inputs, + false_fn=lambda: tf.pad(inputs, + paddings=[[0, 8 - tf.shape(inputs)[0] % 8]])) + targets = tf.cond(tf.equal(tf.shape(targets)[0] % 8, 0), + true_fn=lambda: targets, + false_fn=lambda: tf.pad(targets, + paddings=[[0, 8 - tf.shape(targets)[0] % 8]])) + return inputs, targets @@ -128,7 +140,7 @@ def _create_min_max_boundaries( return buckets_min, buckets_max -def _batch_examples(dataset, batch_size, max_length): +def _batch_examples(dataset, batch_size, max_length, pad_2_eight=True): """Group examples by similar lengths, and return batched dataset. Each batch of similar-length examples are padded to the same length, and may @@ -154,7 +166,12 @@ def _batch_examples(dataset, batch_size, max_length): # Create list of batch sizes for each bucket_id, so that # bucket_batch_size[bucket_id] * buckets_max[bucket_id] <= batch_size - bucket_batch_sizes = [batch_size // x for x in buckets_max] + if pad_2_eight: # pad to 8 for HMMA + bucket_batch_sizes = [ + batch_size // x if batch_size // x % 8 == 0 else batch_size // x + ( + 8 - batch_size // x % 8) for x in buckets_max] + else: + bucket_batch_sizes = [batch_size // x for x in buckets_max] # bucket_id will be a tensor, so convert this list to a tensor as well. bucket_batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64) @@ -176,7 +193,6 @@ def window_size_fn(bucket_id): def batching_fn(bucket_id, grouped_dataset): """Batch and add padding to a dataset of elements with similar lengths.""" bucket_batch_size = window_size_fn(bucket_id) - # Batch the dataset and add padding so that all input sequences in the # examples have the same length, and all target sequences have the same # lengths as well. Resulting lengths of inputs and targets can differ. 
@@ -191,7 +207,7 @@ def batching_fn(bucket_id, grouped_dataset): def _read_and_batch_from_files( file_pattern, batch_size, max_length, num_cpu_cores, shuffle, repeat, - num_workers, worker_id): + num_workers, worker_id, batch_in_tokens, pad2eight=True): """Create dataset where each item is a dict of "inputs" and "targets". Args: @@ -204,6 +220,11 @@ def _read_and_batch_from_files( repeated forever. num_workers: Number of workers or number of Horovod workers worker_id: Worker id or Horovod rank + batch_in_tokens: whether batch_size counts tokens or sentence + pairs. Batching in tokens is more efficient as it reduces the number of PADs. + Batching in sentences should be used in inference mode since the order of + sentences is important + pad2eight: if True, it will pad both dimensions to be divisible by 8 Returns: tf.data.Dataset object containing examples loaded from the files. @@ -224,14 +245,19 @@ def _read_and_batch_from_files( # Parse each tf.Example into a dictionary + # TODO: Look into prefetch_input_elements for performance optimization. - dataset = dataset.map(_parse_example, + dataset = dataset.map(lambda x: _parse_example(x, pad_2_eight=pad2eight), num_parallel_calls=num_cpu_cores) # Remove examples where the input or target length exceeds the maximum length, dataset = dataset.filter(lambda x, y: _filter_max_length((x, y), max_length)) - # Batch such that each batch has examples of similar length. - dataset = _batch_examples(dataset, batch_size, max_length) + if batch_in_tokens: + # Batch such that each batch has examples of similar length. + dataset = _batch_examples(dataset, batch_size, max_length, + pad_2_eight=pad2eight) + else: + # Examples can have different lengths + dataset = dataset.padded_batch(batch_size, ([None], [None])) dataset = dataset.repeat(repeat) # Prefetch the next element to improve speed of input pipeline.
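To summarize the new batch_in_tokens switch: when it is True, batch_size is a token budget and _batch_examples buckets sentences of similar length to minimize padding; when it is False, batch_size counts sentence pairs and padded_batch preserves the input order, which is what inference mode needs. A self-contained TF 1.x sketch of the sentence-pair branch, not part of the patch, with toy data:

import tensorflow as tf

# Toy "sentence pairs" of different lengths (token ids).
inputs = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
targets = [[1, 2], [3, 4, 5], [6]]

dataset = tf.data.Dataset.from_generator(
    lambda: zip(inputs, targets),
    (tf.int64, tf.int64),
    (tf.TensorShape([None]), tf.TensorShape([None])))

# batch_size counts sentence pairs; each field is padded to the longest
# example in its batch, as in dataset.padded_batch(batch_size, ([None], [None])).
dataset = dataset.padded_batch(2, padded_shapes=([None], [None]))

with tf.Session() as sess:
  batch = dataset.make_one_shot_iterator().get_next()
  # First batch: inputs padded to length 3, targets padded to length 3.
  print(sess.run(batch))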
diff --git a/open_seq2seq/data/text2text/text2text.py b/open_seq2seq/data/text2text/text2text.py index b2dcfcc6e..4f87ae2ad 100644 --- a/open_seq2seq/data/text2text/text2text.py +++ b/open_seq2seq/data/text2text/text2text.py @@ -77,7 +77,8 @@ def __init__(self, params, model, num_workers=1, worker_id=0): self._delimiter = self.params.get('delimiter', ' ') self._map_parallel_calls = self.params.get('map_parallel_calls', 8) self._pad_lengths_to_eight = self.params.get('pad_lengths_to_eight', False) - self._prefetch_buffer_size = self.params.get('prefetch_buffer_size', 4) + self._prefetch_buffer_size = self.params.get('prefetch_buffer_size', + tf.contrib.data.AUTOTUNE) self._num_workers = num_workers self._worker_id = worker_id if self._pad_lengths_to_eight and not (self.params['max_length'] % 8 == 0): @@ -166,7 +167,7 @@ def tgt_token_to_id(line): [SpecialTextTokens.EOS_ID.value], self._pad_lengths_to_eight), dtype="int32") _sources = tf.data.TextLineDataset(self.source_file)\ - .map(lambda line: tf.py_func(func=src_token_to_id,inp=[line], + .map(lambda line: tf.py_func(func=src_token_to_id, inp=[line], Tout=[tf.int32], stateful=False), num_parallel_calls=self._map_parallel_calls) \ .map(lambda tokens: (tokens, tf.size(tokens)), @@ -254,7 +255,8 @@ def get_optional_params(): 'repeat': int, 'num_cpu_cores': int, 'tgt_vocab_file': str, - 'm_padding': bool, + 'pad_data_to_eight': bool, + 'batch_in_tokens': bool, }) def __init__(self, params, model, num_workers=1, worker_id=0): @@ -301,38 +303,13 @@ def build_graph(self): shuffle=self.params['shuffle'], repeat=self.params['repeat'], num_workers=self._num_workers, - worker_id=self._worker_id) + worker_id=self._worker_id, + batch_in_tokens=self.params.get('batch_in_tokens', True), + pad2eight=self.params.get('pad_data_to_eight', False)) self._iterator = self.batched_dataset.make_initializable_iterator() x, y = self.iterator.get_next() - if self.params.get('m_padding', False): - # MAGIC PADDING - x = tf.cond(tf.equal(tf.shape(x)[1] % 8, 0), - true_fn = lambda: x, - false_fn = lambda: tf.pad(x, - paddings=[[0, 0], - [0, 8 - tf.shape(x)[1] % 8]])) - - y = tf.cond(tf.equal(tf.shape(y)[1] % 8, 0), - true_fn = lambda: y, - false_fn = lambda: tf.pad(y, - paddings=[[0, 0], - [0, 8 - tf.shape(y)[1] % 8]])) - - x = tf.cond(tf.equal(tf.shape(x)[0] % 8, 0), - true_fn = lambda: x, - false_fn = lambda: tf.pad(x, - paddings=[[0, 8 - tf.shape(x)[0] % 8], - [0, 0]])) - - y = tf.cond(tf.equal(tf.shape(y)[0] % 8, 0), - true_fn=lambda: y, - false_fn=lambda: tf.pad(y, - paddings=[[0, 8 - tf.shape(y)[0] % 8], - [0, 0]])) - # ENDOF MAGIC PADDING - len_x = tf.count_nonzero(x, axis=1, dtype=tf.int32) len_y = tf.count_nonzero(y, axis=1, dtype=tf.int32) if self.params['mode'] == 'train' or self.params['mode'] == 'eval': diff --git a/open_seq2seq/decoders/__init__.py b/open_seq2seq/decoders/__init__.py index 3601364a7..09c95dedf 100644 --- a/open_seq2seq/decoders/__init__.py +++ b/open_seq2seq/decoders/__init__.py @@ -6,5 +6,10 @@ from .decoder import Decoder from .rnn_decoders import RNNDecoderWithAttention, \ BeamSearchRNNDecoderWithAttention + from .transformer_decoder import TransformerDecoder from .fc_decoders import FullyConnectedCTCDecoder, FullyConnectedDecoder + +from .convs2s_decoder import ConvS2SDecoder +#from .convs2s_decoder_old import ConvS2SDecoder + diff --git a/open_seq2seq/decoders/convs2s_decoder.py b/open_seq2seq/decoders/convs2s_decoder.py new file mode 100644 index 000000000..1944845f8 --- /dev/null +++ b/open_seq2seq/decoders/convs2s_decoder.py @@ 
-0,0 +1,367 @@ +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals + +import tensorflow as tf +import math +from .decoder import Decoder + +from open_seq2seq.parts.transformer import beam_search + +from open_seq2seq.parts.transformer import embedding_layer +from open_seq2seq.parts.transformer.utils import get_padding + +from open_seq2seq.parts.convs2s import ffn_wn_layer, conv_wn_layer, attention_wn_layer + +# Default value used if max_input_length is not given +MAX_INPUT_LENGTH = 128 + + +class ConvS2SDecoder(Decoder): + + @staticmethod + def get_required_params(): + """Static method with description of required parameters. + + Returns: + dict: + Dictionary containing all the parameters that **have to** be + included into the ``params`` parameter of the + class :meth:`__init__` method. + """ + return dict( + Decoder.get_required_params(), **{ + 'batch_size': int, + 'decoder_layers': int, + 'tgt_emb_size': int, + 'tgt_vocab_size': int, + 'shared_embed': bool, + 'embedding_dropout_keep_prob': float, + 'conv_nchannels_kwidth': list, + 'hidden_dropout_keep_prob': float, + 'out_dropout_keep_prob': float, + 'beam_size': int, + 'alpha': float, + 'extra_decode_length': int, + 'EOS_ID': int, + }) + + @staticmethod + def get_optional_params(): + """Static method with description of optional parameters. + + Returns: + dict: + Dictionary containing all the parameters that **can** be + included into the ``params`` parameter of the + class :meth:`__init__` method. + """ + return dict( + Decoder.get_optional_params(), + **{ + 'pad_embeddings_2_eight': bool, + + # if not provided, tgt_emb_size is used as the default value + 'out_emb_size': int, + 'max_input_length': int, + 'GO_SYMBOL': int, + 'PAD_SYMBOL': int, + 'END_SYMBOL': int, + }) + + def _cast_types(self, input_dict): + return input_dict + + def __init__(self, params, model, name="convs2s_decoder", mode='train'): + super(ConvS2SDecoder, self).__init__(params, model, name, mode) + self.embedding_softmax_layer = None + self.position_embedding_layer = None + self.layers = [] + self._tgt_vocab_size = self.params['tgt_vocab_size'] + self._tgt_emb_size = self.params['tgt_emb_size'] + self._mode = mode + self._pad_sym = self.params.get('PAD_SYMBOL', 0) + self._pad2eight = params.get('pad_embeddings_2_eight', False) + + def _decode(self, input_dict): + targets = input_dict['target_tensors'][0] \ + if 'target_tensors' in input_dict else None + + encoder_outputs = input_dict['encoder_output']['outputs'] + encoder_outputs_b = input_dict['encoder_output'].get( + 'outputs_b', encoder_outputs) + + inputs_attention_bias = input_dict['encoder_output'].get( + 'inputs_attention_bias_cs2s', None) + + with tf.name_scope("decode"): + # prepare decoder layers + if len(self.layers) == 0: + knum_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[0] + kwidth_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[1] + + # preparing embedding layers + with tf.variable_scope("embedding"): + if 'embedding_softmax_layer' in input_dict['encoder_output'] \ + and self.params['shared_embed']: + self.embedding_softmax_layer = \ + input_dict['encoder_output']['embedding_softmax_layer'] + else: + self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights( + vocab_size=self._tgt_vocab_size, + hidden_size=self._tgt_emb_size, + pad_vocab_to_eight=self._pad2eight, + init_var=0.1, + embed_scale=False, + pad_sym=self._pad_sym, + mask_paddings=False) + + with tf.variable_scope("pos_embedding"): + if 
'position_embedding_layer' in input_dict['encoder_output'] \ + and self.params['shared_embed']: + self.position_embedding_layer = \ + input_dict['encoder_output']['position_embedding_layer'] + else: + self.position_embedding_layer = embedding_layer.EmbeddingSharedWeights( + vocab_size=self.params.get("max_input_length", + MAX_INPUT_LENGTH), + hidden_size=self._tgt_emb_size, + pad_vocab_to_eight=self._pad2eight, + init_var=0.1, + embed_scale=False, + pad_sym=self._pad_sym, + mask_paddings=False) + + # linear projection before cnn layers + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + self._tgt_emb_size, + knum_list[0], + dropout=self.params["embedding_dropout_keep_prob"], + var_scope_name="linear_mapping_before_cnn_layers")) + + for i in range(self.params['decoder_layers']): + in_dim = knum_list[i] if i == 0 else knum_list[i - 1] + out_dim = knum_list[i] + + # linear projection is needed for residual connections if + # input and output of a cnn layer do not match + if in_dim != out_dim: + linear_proj = ffn_wn_layer.FeedFowardNetworkNormalized( + in_dim, + out_dim, + var_scope_name="linear_mapping_cnn_" + str(i + 1), + dropout=1.0) + else: + linear_proj = None + + conv_layer = conv_wn_layer.Conv1DNetworkNormalized( + in_dim, + out_dim, + kernel_width=kwidth_list[i], + mode=self.mode, + layer_id=i + 1, + hidden_dropout=self.params["hidden_dropout_keep_prob"], + conv_padding="VALID", + decode_padding=True) + + att_layer = attention_wn_layer.AttentionLayerNormalized( + out_dim, + embed_size=self._tgt_emb_size, + layer_id=i + 1, + add_res=True) + + self.layers.append([linear_proj, conv_layer, att_layer]) + + # linear projection after cnn layers + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + knum_list[self.params['decoder_layers'] - 1], + self.params.get("out_emb_size", self._tgt_emb_size), + dropout=1.0, + var_scope_name="linear_mapping_after_cnn_layers")) + + if not self.params['shared_embed']: + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + self.params.get("out_emb_size", self._tgt_emb_size), + self._tgt_vocab_size, + dropout=self.params["out_dropout_keep_prob"], + var_scope_name="linear_mapping_to_vocabspace")) + else: + # if embedding is shared, + # the shared embedding is used as the final linear projection to vocab space + self.layers.append(None) + + if targets is None: + return self.predict(encoder_outputs, encoder_outputs_b, + inputs_attention_bias) + else: + logits = self.decode_pass(targets, encoder_outputs, encoder_outputs_b, + inputs_attention_bias) + return { + "logits": logits, + "outputs": [tf.argmax(logits, axis=-1)], + "final_state": None, + "final_sequence_lengths": None + } + + def decode_pass(self, targets, encoder_outputs, encoder_outputs_b, + inputs_attention_bias): + """Generate logits for each value in the target sequence. + + Args: + targets: target values for the output sequence. + int tensor with shape [batch_size, target_length] + encoder_outputs: continuous representation of input sequence. + float tensor with shape [batch_size, input_length, hidden_size] + float tensor with shape [batch_size, input_length, hidden_size] + encoder_outputs_b: continuous representation of input sequence + which includes the source embeddings. 
+ float tensor with shape [batch_size, input_length, hidden_size] + inputs_attention_bias: float tensor with shape [batch_size, 1, input_length] + + Returns: + float32 tensor with shape [batch_size, target_length, vocab_size] + """ + + # Prepare inputs to decoder layers by applying embedding + # and adding positional encoding. + decoder_inputs = self.embedding_softmax_layer(targets) + + with tf.name_scope("add_pos_encoding"): + pos_input = tf.range( + 0, tf.shape(decoder_inputs)[1], delta=1, dtype=tf.int32, name='range') + pos_encoding = self.position_embedding_layer(pos_input) + decoder_inputs = decoder_inputs + tf.cast( + x=pos_encoding, dtype=decoder_inputs.dtype) + + if self.mode == "train": + decoder_inputs = tf.nn.dropout(decoder_inputs, + self.params["embedding_dropout_keep_prob"]) + + # mask the paddings in the target + inputs_padding = get_padding( + targets, padding_value=self._pad_sym, dtype=decoder_inputs.dtype) + decoder_inputs *= tf.expand_dims(1.0 - inputs_padding, 2) + + # do decode + logits = self._call( + decoder_inputs=decoder_inputs, + encoder_outputs_a=encoder_outputs, + encoder_outputs_b=encoder_outputs_b, + input_attention_bias=inputs_attention_bias) + + return logits + + def _call(self, decoder_inputs, encoder_outputs_a, encoder_outputs_b, + input_attention_bias): + # run input into the decoder layers and returns the logits + target_embed = decoder_inputs + with tf.variable_scope("linear_layer_before_cnn_layers"): + outputs = self.layers[0](decoder_inputs) + + for i in range(1, len(self.layers) - 2): + linear_proj, conv_layer, att_layer = self.layers[i] + + with tf.variable_scope("layer_%d" % i): + if linear_proj is not None: + res_inputs = linear_proj(outputs) + else: + res_inputs = outputs + + with tf.variable_scope("conv_layer"): + outputs = conv_layer(outputs) + + with tf.variable_scope("attention_layer"): + outputs = att_layer(outputs, target_embed, encoder_outputs_a, + encoder_outputs_b, input_attention_bias) + outputs = (outputs + res_inputs) * math.sqrt(0.5) + + with tf.variable_scope("linear_layer_after_cnn_layers"): + outputs = self.layers[-2](outputs) + + if self.mode == "train": + outputs = tf.nn.dropout(outputs, self.params["out_dropout_keep_prob"]) + + with tf.variable_scope("pre_softmax_projection"): + if self.layers[-1] is None: + logits = self.embedding_softmax_layer.linear(outputs) + else: + logits = self.layers[-1](outputs) + + return tf.cast(logits, dtype=tf.float32) + + def predict(self, encoder_outputs, encoder_outputs_b, inputs_attention_bias): + """Return predicted sequence.""" + batch_size = tf.shape(encoder_outputs)[0] + input_length = tf.shape(encoder_outputs)[1] + max_decode_length = input_length + self.params["extra_decode_length"] + + symbols_to_logits_fn = self._get_symbols_to_logits_fn() + + # Create initial set of IDs that will be passed into symbols_to_logits_fn. + initial_ids = tf.zeros( + [batch_size], dtype=tf.int32) + self.params["GO_SYMBOL"] + + cache = {} + # Add encoder outputs and attention bias to the cache. + cache["encoder_outputs"] = encoder_outputs + cache["encoder_outputs_b"] = encoder_outputs_b + if inputs_attention_bias is not None: + cache["inputs_attention_bias"] = inputs_attention_bias + + # Use beam search to find the top beam_size sequences and scores. 
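# A sketch of how the beam-search contract is used here (illustration only, not
# part of this diff): sequence_beam_search repeatedly calls
# symbols_to_logits_fn(ids, i, cache) -> (logits, cache), keeps the beam_size
# best partial hypotheses, applies the length penalty controlled by "alpha",
# and stops beams that emit EOS_ID. A greedy (beam_size == 1) stand-in against
# the same interface would look roughly like the function below;
# greedy_decode_sketch is a hypothetical name, and the Python loop assumes
# max_decode_length is a plain int (the real beam search runs its loop inside
# the TF graph).
import tensorflow as tf

def greedy_decode_sketch(symbols_to_logits_fn, initial_ids, cache,
                         max_decode_length):
  ids = tf.expand_dims(initial_ids, 1)                   # [batch, 1] GO symbols
  for i in range(max_decode_length):
    logits, cache = symbols_to_logits_fn(ids, i, cache)  # [batch, vocab_size]
    next_id = tf.argmax(logits, axis=-1, output_type=tf.int32)
    ids = tf.concat([ids, tf.expand_dims(next_id, 1)], axis=1)
  return ids
# The decoder itself calls beam_search.sequence_beam_search as shown below.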
+ decoded_ids, scores = beam_search.sequence_beam_search( + symbols_to_logits_fn=symbols_to_logits_fn, + initial_ids=initial_ids, + initial_cache=cache, + vocab_size=self.params["tgt_vocab_size"], + beam_size=self.params["beam_size"], + alpha=self.params["alpha"], + max_decode_length=max_decode_length, + eos_id=self.params["EOS_ID"]) + + # Get the top sequence for each batch element + top_decoded_ids = decoded_ids[:, 0, :] + top_scores = scores[:, 0] + + # this isn't particularly efficient + logits = self.decode_pass(top_decoded_ids, encoder_outputs, + encoder_outputs_b, inputs_attention_bias) + + return { + "logits": logits, + "outputs": [top_decoded_ids], + "final_state": None, + "final_sequence_lengths": None + } + + def _get_symbols_to_logits_fn(self): + """Returns a decoding function that calculates logits of the next tokens.""" + + def symbols_to_logits_fn(ids, i, cache): + """Generate logits for next potential IDs. + + Args: + ids: Current decoded sequences. + int tensor with shape [batch_size * beam_size, i - 1] + i: Loop index + cache: dictionary of values storing the encoder output, encoder-decoder + attention bias, and previous decoder attention values. + + Returns: + Tuple of + (logits with shape [batch_size * beam_size, vocab_size], + updated cache values) + """ + + # pass the decoded ids from the beginneing up to the current into the decoder + # not efficient + decoder_outputs = self.decode_pass(ids, cache.get("encoder_outputs"), + cache.get("encoder_outputs_b"), + cache.get("inputs_attention_bias")) + + logits = decoder_outputs[:, i, :] + return logits, cache + + return symbols_to_logits_fn diff --git a/open_seq2seq/decoders/decoder.py b/open_seq2seq/decoders/decoder.py index 2c2ae46fc..4eb811b3b 100644 --- a/open_seq2seq/decoders/decoder.py +++ b/open_seq2seq/decoders/decoder.py @@ -87,24 +87,9 @@ def __init__(self, params, model, name="decoder", mode='train'): else: self._params['dtype'] = tf.float32 - if 'regularizer' not in self._params: - if self._model and 'regularizer' in self._model.params: - self._params['regularizer'] = self._model.params['regularizer'] - self._params['regularizer_params'] = self._model.params['regularizer_params'] - - if 'regularizer' in self._params: - init_dict = self._params.get('regularizer_params', {}) - self._params['regularizer'] = self._params['regularizer'](**init_dict) - if self._params['dtype'] == 'mixed': - self._params['regularizer'] = mp_regularizer_wrapper( - self._params['regularizer'], - ) - - if self._params['dtype'] == 'mixed': - self._params['dtype'] = tf.float16 - self._name = name self._mode = mode + self._compiled = False def decode(self, input_dict): """Wrapper around :meth:`self._decode() <_decode>` method. @@ -117,12 +102,35 @@ def decode(self, input_dict): Returns: see :meth:`self._decode() <_decode>` docs. 
""" + if not self._compiled: + if 'regularizer' not in self._params: + if self._model and 'regularizer' in self._model.params: + self._params['regularizer'] = copy.deepcopy( + self._model.params['regularizer'] + ) + self._params['regularizer_params'] = copy.deepcopy( + self._model.params['regularizer_params'] + ) + + if 'regularizer' in self._params: + init_dict = self._params.get('regularizer_params', {}) + self._params['regularizer'] = self._params['regularizer'](**init_dict) + if self._params['dtype'] == 'mixed': + self._params['regularizer'] = mp_regularizer_wrapper( + self._params['regularizer'], + ) + + if self._params['dtype'] == 'mixed': + self._params['dtype'] = tf.float16 + if 'initializer' in self.params: init_dict = self.params.get('initializer_params', {}) initializer = self.params['initializer'](**init_dict) else: initializer = None + self._compiled = True + with tf.variable_scope(self._name, initializer=initializer, dtype=self.params['dtype']): return self._decode(self._cast_types(input_dict)) @@ -160,7 +168,8 @@ def _decode(self, input_dict): { "logits": logits that will be passed to Loss - "samples": actual decoded output, e.g. characters instead of logits + "outputs": list with actual decoded outputs, e.g. characters + instead of logits } """ pass diff --git a/open_seq2seq/decoders/fc_decoders.py b/open_seq2seq/decoders/fc_decoders.py index 55d59a8f7..46106b135 100644 --- a/open_seq2seq/decoders/fc_decoders.py +++ b/open_seq2seq/decoders/fc_decoders.py @@ -54,7 +54,7 @@ def _decode(self, input_dict): { 'logits': logits with the shape=[batch_size, output_dim] - 'samples': [logits] (same as logits but wrapped in list) + 'outputs': [logits] (same as logits but wrapped in list) } """ inputs = input_dict['encoder_output']['outputs'] @@ -67,7 +67,7 @@ def _decode(self, input_dict): kernel_regularizer=regularizer, name='fully_connected', ) - return {'logits': logits, 'samples': [logits]} + return {'logits': logits, 'outputs': [logits]} class FullyConnectedTimeDecoder(Decoder): @@ -97,7 +97,7 @@ def __init__(self, params, model, * **tgt_vocab_size** (int) --- target vocabulary size, i.e. number of output features. * **logits_to_outputs_func** --- function that maps produced logits to - decoder samples, i.e. actual text sequences. + decoder outputs, i.e. actual text sequences. 
""" super(FullyConnectedTimeDecoder, self).__init__(params, model, name, mode) @@ -119,7 +119,7 @@ def _decode(self, input_dict): { 'logits': logits with the shape=[time length, batch_size, tgt_vocab_size] - 'samples': logits_to_outputs_func(logits, input_dict) + 'outputs': logits_to_outputs_func(logits, input_dict) } """ inputs = input_dict['encoder_output']['outputs'] @@ -146,9 +146,9 @@ def _decode(self, input_dict): logits = tf.transpose(logits, [1, 0, 2]) if 'logits_to_outputs_func' in self.params: - samples = self.params['logits_to_outputs_func'](logits, input_dict) + outputs = self.params['logits_to_outputs_func'](logits, input_dict) return { - 'samples': samples, + 'outputs': outputs, 'logits': logits, 'src_length': input_dict['encoder_output']['src_length'], } diff --git a/open_seq2seq/decoders/rnn_decoders.py b/open_seq2seq/decoders/rnn_decoders.py index a0fa1b10b..096181d89 100644 --- a/open_seq2seq/decoders/rnn_decoders.py +++ b/open_seq2seq/decoders/rnn_decoders.py @@ -10,7 +10,7 @@ from open_seq2seq.parts.rnns.gnmt import GNMTAttentionMultiCell, \ gnmt_residual_fn -from open_seq2seq.parts.rnns.utils import create_rnn_cell +from open_seq2seq.parts.rnns.utils import single_cell from open_seq2seq.parts.rnns.attention_wrapper import BahdanauAttention, \ LuongAttention, \ AttentionWrapper @@ -30,8 +30,7 @@ def get_required_params(): 'tgt_emb_size': int, 'attention_layer_size': int, 'attention_type': ['bahdanau', 'luong', 'gnmt', 'gnmt_v2'], - 'decoder_cell_units': int, - 'decoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'], + 'core_cell': None, 'decoder_layers': int, 'decoder_use_skip_connections': bool, 'batch_size': int, @@ -40,6 +39,7 @@ def get_required_params(): @staticmethod def get_optional_params(): return dict(Decoder.get_optional_params(), **{ + 'core_cell_params': dict, 'bahdanau_normalize': bool, 'luong_scale': bool, 'decoder_dp_input_keep_prob': float, @@ -65,8 +65,8 @@ def __init__(self, params, model, * **END_SYMBOL** (int) --- END symbol id, must be the same as used in data layer. * **tgt_emb_size** (int) --- embedding size to use. - * **decoder_cell_units** (int) - number of units in RNN - * **decoder_cell_type** (string) - RNN type: lstm, gru, glstm, etc. + * **core_cell_params** (dict) - parameters for RNN class + * **core_cell** (string) - RNN class. * **decoder_dp_input_keep_prob** (float) - dropout input keep probability. * **decoder_dp_output_keep_prob** (float) - dropout output keep probability. * **decoder_use_skip_connections** (bool) - use residual connections or not. 
@@ -184,8 +184,8 @@ def _decode(self, input_dict): self._tgt_vocab_size, use_bias=False, ) - cell_params = copy.deepcopy(self.params) - cell_params["num_units"] = self.params['decoder_cell_units'] + #cell_params = copy.deepcopy(self.params) + #cell_params["num_units"] = self.params['decoder_cell_units'] if self._mode == "train": dp_input_keep_prob = self.params['decoder_dp_input_keep_prob'] @@ -194,22 +194,17 @@ def _decode(self, input_dict): dp_input_keep_prob = 1.0 dp_output_keep_prob = 1.0 - if self.params['attention_type'].startswith('gnmt'): - residual_connections = False - wrap_to_multi_rnn = False - else: - residual_connections = self.params['decoder_use_skip_connections'] - wrap_to_multi_rnn = True - - self._decoder_cells = create_rnn_cell( - cell_type=self.params['decoder_cell_type'], - cell_params=cell_params, - num_layers=self.params['decoder_layers'], - dp_input_keep_prob=dp_input_keep_prob, - dp_output_keep_prob=dp_output_keep_prob, - residual_connections=residual_connections, - wrap_to_multi_rnn=wrap_to_multi_rnn, - ) + residual_connections = self.params['decoder_use_skip_connections'] + + # list of cells + self._decoder_cells = [ + single_cell(cell_class=self.params['core_cell'], + cell_params=self.params.get('core_cell_params', {}), + dp_input_keep_prob=dp_input_keep_prob, + dp_output_keep_prob=dp_output_keep_prob, + # residual connections are added a little differently for GNMT + residual_connections=False if self.params['attention_type'].startswith('gnmt') else residual_connections, + ) for _ in range(self.params['decoder_layers'])] attention_mechanism = self._build_attention( encoder_outputs, @@ -217,7 +212,6 @@ def _decode(self, input_dict): ) if self.params['attention_type'].startswith('gnmt'): attention_cell = self._decoder_cells.pop(0) - # attention_cell = tf.contrib.seq2seq.AttentionWrapper( attention_cell = AttentionWrapper( attention_cell, attention_mechanism=attention_mechanism, @@ -225,12 +219,12 @@ def _decode(self, input_dict): output_attention=False, name="gnmt_attention") attentive_decoder_cell = GNMTAttentionMultiCell( - attention_cell, self._add_residual_wrapper(self._decoder_cells), + attention_cell, self._add_residual_wrapper(self._decoder_cells) if residual_connections else self._decoder_cells, use_new_attention=(self.params['attention_type'] == 'gnmt_v2')) else: # attentive_decoder_cell = tf.contrib.seq2seq.AttentionWrapper( attentive_decoder_cell = AttentionWrapper( - cell=self._decoder_cells, + cell=tf.contrib.rnn.MultiRNNCell(self._decoder_cells), attention_mechanism=attention_mechanism, ) if self._mode == "train": @@ -283,8 +277,9 @@ def _decode(self, input_dict): output_time_major=time_major, ) - return {'logits': final_outputs.rnn_output, - 'samples': [tf.argmax(final_outputs.rnn_output, axis=-1)], + return {'logits': final_outputs.rnn_output if not time_major else + tf.transpose(final_outputs.rnn_output, perm=[1, 0, 2]), + 'outputs': [tf.argmax(final_outputs.rnn_output, axis=-1)], 'final_state': final_state, 'final_sequence_lengths': final_sequence_lengths} @@ -371,8 +366,8 @@ def _decode(self, input_dict): self._tgt_vocab_size, use_bias=False, ) - cell_params = copy.deepcopy(self.params) - cell_params["num_units"] = self.params['decoder_cell_units'] + #cell_params = copy.deepcopy(self.params) + #cell_params["num_units"] = self.params['decoder_cell_units'] if self._mode == "train": dp_input_keep_prob = self.params['decoder_dp_input_keep_prob'] @@ -381,22 +376,34 @@ def _decode(self, input_dict): dp_input_keep_prob = 1.0 dp_output_keep_prob = 1.0 
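# single_cell is imported above from open_seq2seq.parts.rnns.utils but its
# implementation is not part of this diff. Based purely on how it is called
# here, it is expected to build one RNN cell of the configured class and wrap
# it with dropout and, optionally, a residual connection. The sketch below is
# an assumption, not the actual helper; single_cell_sketch is a hypothetical
# name.
import tensorflow as tf

def single_cell_sketch(cell_class, cell_params, dp_input_keep_prob=1.0,
                       dp_output_keep_prob=1.0, residual_connections=False):
  cell = cell_class(**cell_params)   # e.g. tf.nn.rnn_cell.LSTMCell(num_units=512)
  if dp_input_keep_prob < 1.0 or dp_output_keep_prob < 1.0:
    cell = tf.nn.rnn_cell.DropoutWrapper(cell,
                                         input_keep_prob=dp_input_keep_prob,
                                         output_keep_prob=dp_output_keep_prob)
  if residual_connections:
    cell = tf.nn.rnn_cell.ResidualWrapper(cell)
  return cell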
- if self.params['attention_type'].startswith('gnmt'): - residual_connections = False - wrap_to_multi_rnn = False - else: - residual_connections = self.params['decoder_use_skip_connections'] - wrap_to_multi_rnn = True - - self._decoder_cells = create_rnn_cell( - cell_type=self.params['decoder_cell_type'], - cell_params=cell_params, - num_layers=self.params['decoder_layers'], - dp_input_keep_prob=dp_input_keep_prob, - dp_output_keep_prob=dp_output_keep_prob, - residual_connections=residual_connections, - wrap_to_multi_rnn=wrap_to_multi_rnn, - ) + #if self.params['attention_type'].startswith('gnmt'): + # residual_connections = False + # wrap_to_multi_rnn = False + #else: + # residual_connections = self.params['decoder_use_skip_connections'] + # wrap_to_multi_rnn = True + + #self._decoder_cells = create_rnn_cell( + # cell_type=self.params['decoder_cell_type'], + # cell_params=cell_params, + # num_layers=self.params['decoder_layers'], + # dp_input_keep_prob=dp_input_keep_prob, + # dp_output_keep_prob=dp_output_keep_prob, + # residual_connections=residual_connections, + # wrap_to_multi_rnn=wrap_to_multi_rnn, + #) + residual_connections = self.params['decoder_use_skip_connections'] + # list of cells + self._decoder_cells = [ + single_cell(cell_class=self.params['core_cell'], + cell_params=self.params.get('core_cell_params', {}), + dp_input_keep_prob=dp_input_keep_prob, + dp_output_keep_prob=dp_output_keep_prob, + # residual connections are added a little differently for GNMT + residual_connections=False if self.params[ + 'attention_type'].startswith( + 'gnmt') else residual_connections, + ) for _ in range(self.params['decoder_layers'])] tiled_enc_outputs = tf.contrib.seq2seq.tile_batch( encoder_outputs, @@ -420,18 +427,18 @@ def _decode(self, input_dict): output_attention=False, name="gnmt_attention") attentive_decoder_cell = GNMTAttentionMultiCell( - attention_cell, self._add_residual_wrapper(self._decoder_cells), + attention_cell, self._add_residual_wrapper(self._decoder_cells) if residual_connections else self._decoder_cells, use_new_attention=(self.params['attention_type'] == 'gnmt_v2')) - else: + else: # non-GNMT attentive_decoder_cell = AttentionWrapper( - cell=self._decoder_cells, + cell=tf.contrib.rnn.MultiRNNCell(self._decoder_cells), attention_mechanism=attention_mechanism, ) batch_size_tensor = tf.constant(self._batch_size) embedding_fn = lambda ids: tf.cast( tf.nn.embedding_lookup(self._dec_emb_w, ids), dtype=self.params['dtype']) - #decoder = tf.contrib.seq2seq.BeamSearchDecoder( + # decoder = tf.contrib.seq2seq.BeamSearchDecoder( decoder = BeamSearchDecoder( cell=attentive_decoder_cell, embedding=embedding_fn, @@ -456,7 +463,8 @@ def _decode(self, input_dict): output_time_major=time_major, ) - return {'logits': final_outputs.predicted_ids[:, :, 0], - 'samples': [final_outputs.predicted_ids[:, :, 0]], + return {'logits': final_outputs.predicted_ids[:, :, 0] if not time_major else + tf.transpose(final_outputs.predicted_ids[:, :, 0], perm=[1, 0, 2]), + 'outputs': [final_outputs.predicted_ids[:, :, 0]], 'final_state': final_state, 'final_sequence_lengths': final_sequence_lengths} diff --git a/open_seq2seq/decoders/transformer_decoder.py b/open_seq2seq/decoders/transformer_decoder.py index 2300b3550..dd7a2080a 100644 --- a/open_seq2seq/decoders/transformer_decoder.py +++ b/open_seq2seq/decoders/transformer_decoder.py @@ -10,7 +10,8 @@ from open_seq2seq.parts.transformer import utils, attention_layer, \ ffn_layer, beam_search from open_seq2seq.parts.transformer.common import 
PrePostProcessingWrapper, \ - LayerNormalization + LayerNormalization + class TransformerDecoder(Decoder): @staticmethod @@ -71,7 +72,7 @@ def __init__(self, params, model, self.layers = [] def _decode(self, input_dict): - #targets = input_dict['tgt_sequence'] + # targets = input_dict['tgt_sequence'] targets = input_dict['target_tensors'][0] if 'target_tensors' \ in input_dict else None encoder_outputs = input_dict['encoder_output']['outputs'] @@ -110,13 +111,12 @@ def _decode(self, input_dict): else: logits = self.decode_pass(targets, encoder_outputs, inputs_attention_bias) return {"logits": logits, - "samples": [tf.argmax(logits, axis=-1)], + "outputs": [tf.argmax(logits, axis=-1)], "final_state": None, "final_sequence_lengths": None} - def _call(self, decoder_inputs, encoder_outputs, decoder_self_attention_bias, - attention_bias, cache=None): + attention_bias, cache=None): for n, layer in enumerate(self.layers): self_attention_layer = layer[0] enc_dec_attention_layer = layer[1] @@ -128,7 +128,7 @@ def _call(self, decoder_inputs, encoder_outputs, decoder_self_attention_bias, with tf.variable_scope(layer_name): with tf.variable_scope("self_attention"): # TODO: Figure out why this is needed - #decoder_self_attention_bias = tf.cast(x=decoder_self_attention_bias, + # decoder_self_attention_bias = tf.cast(x=decoder_self_attention_bias, # dtype=decoder_inputs.dtype) decoder_inputs = self_attention_layer( decoder_inputs, decoder_self_attention_bias, cache=layer_cache) @@ -140,7 +140,6 @@ def _call(self, decoder_inputs, encoder_outputs, decoder_self_attention_bias, return self.output_normalization(decoder_inputs) - def decode_pass(self, targets, encoder_outputs, inputs_attention_bias): """Generate logits for each value in the target sequence. @@ -163,7 +162,7 @@ def decode_pass(self, targets, encoder_outputs, inputs_attention_bias): decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :] with tf.name_scope("add_pos_encoding"): length = tf.shape(decoder_inputs)[1] - #decoder_inputs += utils.get_position_encoding( + # decoder_inputs += utils.get_position_encoding( # length, self.params["hidden_size"]) decoder_inputs += tf.cast(utils.get_position_encoding( length, self.params["hidden_size"]), dtype=self.params['dtype']) @@ -172,11 +171,10 @@ def decode_pass(self, targets, encoder_outputs, inputs_attention_bias): decoder_inputs, 1 - self.params["layer_postprocess_dropout"]) # Run values - #decoder_self_attention_bias = tf.cast(x=utils.get_decoder_self_attention_bias( + # decoder_self_attention_bias = tf.cast(x=utils.get_decoder_self_attention_bias( # length), dtype=decoder_inputs.dtype) decoder_self_attention_bias = utils.get_decoder_self_attention_bias(length) - # do decode outputs = self._call(decoder_inputs=decoder_inputs, encoder_outputs=encoder_outputs, @@ -191,7 +189,7 @@ def _get_symbols_to_logits_fn(self, max_decode_length): timing_signal = utils.get_position_encoding( max_decode_length + 1, self.params["hidden_size"]) - #decoder_self_attention_bias = tf.cast(x=utils.get_decoder_self_attention_bias( + # decoder_self_attention_bias = tf.cast(x=utils.get_decoder_self_attention_bias( # max_decode_length), dtype=self.params['dtype']) decoder_self_attention_bias = utils.get_decoder_self_attention_bias( max_decode_length) @@ -279,8 +277,6 @@ def predict(self, encoder_outputs, encoder_decoder_attention_bias): # tf.shape(top_decoded_ids)[1], # self.params["tgt_vocab_size"]]), "logits": logits, - "samples": [top_decoded_ids], + "outputs": [top_decoded_ids], "final_state": None, "final_sequence_lengths": 
None} - - diff --git a/open_seq2seq/encoders/__init__.py b/open_seq2seq/encoders/__init__.py index 7d0a39923..827ef1c32 100644 --- a/open_seq2seq/encoders/__init__.py +++ b/open_seq2seq/encoders/__init__.py @@ -6,7 +6,11 @@ from .encoder import Encoder from .rnn_encoders import UnidirectionalRNNEncoderWithEmbedding, \ BidirectionalRNNEncoderWithEmbedding, \ - GNMTLikeEncoderWithEmbedding + GNMTLikeEncoderWithEmbedding,\ + GNMTLikeEncoderWithEmbedding_cuDNN from .transformer_encoder import TransformerEncoder from .ds2_encoder import DeepSpeech2Encoder from .resnet_encoder import ResNetEncoder +from .w2l_encoder import Wave2LetterEncoder +from .convs2s_encoder import ConvS2SEncoder +#from .convs2s_encoder_old import ConvS2SEncoder diff --git a/open_seq2seq/encoders/cnn_encoder.py b/open_seq2seq/encoders/cnn_encoder.py new file mode 100644 index 000000000..dc382a586 --- /dev/null +++ b/open_seq2seq/encoders/cnn_encoder.py @@ -0,0 +1,170 @@ +# Copyright (c) 2018 NVIDIA Corporation +""" +This module contains classes and functions to build "general" convolutional +neural networks from the description of arbitrary "layers". +""" +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals +from six.moves import range + +import tensorflow as tf +import copy + +try: + from inspect import signature +except ImportError: + from funcsigs import signature + +from .encoder import Encoder +from open_seq2seq.utils.utils import deco_print + + +def build_layer(inputs, layer, layer_params, data_format, + regularizer, training, verbose=True): + """This function builds a layer from the layer function and its parameters. + + It will automatically add the regularizer parameter to layer_params if the + layer supports regularization. To check this, it will look for the + "regularizer", "kernel_regularizer" and "gamma_regularizer" names, in this + order, in the ``layer`` call signature. If one of these parameters is + supported, the regularizer object is passed as a value for that parameter. + The "data_format" and "training" parameters are added in the same + signature-checking way. + + Args: + inputs: input Tensor that will be passed to the layer. Note that layer has + to accept input as the first parameter. + layer: layer function or class with ``__call__`` method defined. + layer_params (dict): parameters passed to the ``layer``. + data_format (string): data format ("channels_first" or "channels_last") + that is passed as an additional argument if the layer supports it. + regularizer: regularizer instance that is passed as an additional argument + if the layer supports it. + training (bool): whether layer is built in training mode. Passed as an + additional argument if the layer supports it. + verbose (bool): whether to print information about built layers. + + Returns: + Tensor with layer output. 
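    Example (a minimal sketch; the layer choice, parameter values and the
    ``images`` tensor are illustrative only)::

      outputs = build_layer(
          inputs=images,  # e.g. a [batch, height, width, channels] float tensor
          layer=tf.layers.conv2d,
          layer_params={'filters': 64, 'kernel_size': (3, 3), 'padding': 'SAME'},
          data_format='channels_last',
          regularizer=tf.contrib.layers.l2_regularizer(0.0005),
          training=True,
      )
      # "kernel_regularizer" and "data_format" are injected automatically
      # because tf.layers.conv2d accepts them; "training" would be injected
      # for layers such as tf.layers.batch_normalization.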
+ """ + layer_params_cp = copy.deepcopy(layer_params) + for reg_name in ['regularizer', 'kernel_regularizer', 'gamma_regularizer']: + if reg_name not in layer_params_cp and \ + reg_name in signature(layer).parameters: + layer_params_cp.update({reg_name: regularizer}) + + if 'data_format' not in layer_params_cp and \ + 'data_format' in signature(layer).parameters: + layer_params_cp.update({'data_format': data_format}) + + if 'training' not in layer_params_cp and \ + 'training' in signature(layer).parameters: + layer_params_cp.update({'training': training}) + + outputs = layer(inputs, **layer_params_cp) + + if verbose: + if hasattr(layer, '_tf_api_names'): + layer_name = layer._tf_api_names[0] + else: + layer_name = layer + deco_print("Building layer: {}(inputs, {})".format( + layer_name, + ", ".join("{}={}".format(key, value) + for key, value in layer_params_cp.items()) + )) + return outputs + + +class CNNEncoder(Encoder): + """General CNN encoder that can be used to construct various different models. + """ + @staticmethod + def get_required_params(): + return dict(Encoder.get_required_params(), **{ + 'cnn_layers': list, + }) + + @staticmethod + def get_optional_params(): + return dict(Encoder.get_optional_params(), **{ + 'data_format': ['channels_first', 'channels_last'], + 'fc_layers': list, + }) + + def __init__(self, params, model, name="cnn_encoder", mode='train'): + """CNN Encoder constructor. + + See parent class for arguments description. + + Config parameters: + + * **cnn_layers** (list) --- list with the description of "convolutional" + layers. For example:: + "conv_layers": [ + (tf.layers.conv2d, { + 'filters': 64, 'kernel_size': (11, 11), + 'strides': (4, 4), 'padding': 'VALID', + 'activation': tf.nn.relu, + }), + (tf.layers.max_pooling2d, { + 'pool_size': (3, 3), 'strides': (2, 2), + }), + (tf.layers.conv2d, { + 'filters': 192, 'kernel_size': (5, 5), + 'strides': (1, 1), 'padding': 'SAME', + }), + (tf.layers.batch_normalization, {'momentum': 0.9, 'epsilon': 0.0001}), + (tf.nn.relu, {}), + ] + Note that you don't need to provide "regularizer", "training" and + "data_format" parameters since they will be automatically added. + + * **cnn_layers** (list) --- list with the description of "fully-connected" + layers. The only different from convolutional layers is that the input + will be automatically reshaped to 2D (batch size x num features). + For example:: + 'fc_layers': [ + (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), + (tf.layers.dropout, {'rate': 0.5}), + (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}), + (tf.layers.dropout, {'rate': 0.5}), + ], + Note that you don't need to provide "regularizer", "training" and + "data_format" parameters since they will be automatically added. + + * **data_format** (string) --- could be either "channels_first" or + "channels_last". Defaults to "channels_first". 
+ """ + super(CNNEncoder, self).__init__(params, model, name, mode) + + def _encode(self, input_dict): + regularizer = self.params.get('regularizer', None) + data_format = self.params.get('data_format', 'channels_first') + + x = input_dict['source_tensors'][0] + if data_format == 'channels_first': + x = tf.transpose(x, [0, 3, 1, 2]) + + for layer, layer_params in self.params['cnn_layers']: + x = build_layer(x, layer, layer_params, data_format, + regularizer, self.mode == 'train') + + if data_format == 'channels_first': + x = tf.transpose(x, [0, 2, 3, 1]) + + fc_layers = self.params.get('fc_layers', []) + + # if fully connected layers exist, flattening the output and applying them + if fc_layers: + input_shape = x.get_shape().as_list() + num_inputs = input_shape[1] * input_shape[2] * input_shape[3] + x = tf.reshape(x, [-1, num_inputs]) + for layer, layer_params in fc_layers: + x = build_layer(x, layer, layer_params, data_format, regularizer, + self.mode == 'train') + else: + # if there are no fully connected layers, doing average pooling + x = tf.reduce_mean(x, [1, 2]) + + return {'outputs': x} diff --git a/open_seq2seq/encoders/convs2s_encoder.py b/open_seq2seq/encoders/convs2s_encoder.py new file mode 100644 index 000000000..13c9e204e --- /dev/null +++ b/open_seq2seq/encoders/convs2s_encoder.py @@ -0,0 +1,221 @@ +# Copyright (c) 2018 NVIDIA Corporation +""" +Conv-based encoder +""" +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals + +import tensorflow as tf +import math +from .encoder import Encoder + +from open_seq2seq.parts.transformer import embedding_layer +from open_seq2seq.parts.transformer.utils import get_padding_bias, get_padding +from open_seq2seq.parts.convs2s import ffn_wn_layer, conv_wn_layer + +# Default value used if max_input_length is not given +MAX_INPUT_LENGTH = 128 + + +class ConvS2SEncoder(Encoder): + """ + Fully convolutional Encoder of ConvS2S + """ + + @staticmethod + def get_required_params(): + return dict( + Encoder.get_required_params(), **{ + "encoder_layers": int, + "src_emb_size": int, + "src_vocab_size": int, + "pad_embeddings_2_eight": bool, + "conv_nchannels_kwidth": list, + "embedding_dropout_keep_prob": float, + "hidden_dropout_keep_prob": float, + }) + + @staticmethod + def get_optional_params(): + return dict( + Encoder.get_optional_params(), **{ + "att_layer_num": int, + 'max_input_length': int, + 'PAD_SYMBOL': int, + }) + + def __init__(self, + params, + model, + name="convs2s_encoder_with_emb", + mode='train'): + super(ConvS2SEncoder, self).__init__(params, model, name=name, mode=mode) + + self._src_vocab_size = self.params['src_vocab_size'] + self._src_emb_size = self.params['src_emb_size'] + self.layers = [] + self._mode = mode + self._pad_sym = self.params.get('PAD_SYMBOL', 0) + self._pad2eight = params.get('pad_embeddings_2_eight', False) + + def _encode(self, input_dict): + inputs = input_dict['source_tensors'][0] + source_length = input_dict['source_tensors'][1] + + with tf.variable_scope("encode"): + # prepare encoder graph + if len(self.layers) == 0: + knum_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[0] + kwidth_list = list(zip(*self.params.get("conv_nchannels_kwidth")))[1] + + with tf.variable_scope("embedding"): + self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights( + vocab_size=self._src_vocab_size, + hidden_size=self._src_emb_size, + pad_vocab_to_eight=self._pad2eight, + init_var=0.1, + embed_scale=False, + pad_sym=self._pad_sym, + 
mask_paddings=False) + + with tf.variable_scope("pos_embedding"): + self.position_embedding_layer = embedding_layer.EmbeddingSharedWeights( + vocab_size=self.params.get("max_input_length", MAX_INPUT_LENGTH), + hidden_size=self._src_emb_size, + pad_vocab_to_eight=self._pad2eight, + init_var=0.1, + embed_scale=False, + pad_sym=self._pad_sym, + mask_paddings=False) + + # linear projection before cnn layers + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + self._src_emb_size, + knum_list[0], + dropout=self.params["embedding_dropout_keep_prob"], + var_scope_name="linear_mapping_before_cnn_layers")) + + for i in range(self.params['encoder_layers']): + in_dim = knum_list[i] if i == 0 else knum_list[i - 1] + out_dim = knum_list[i] + + # linear projection is needed for residual connections if + # input and output of a cnn layer do not match + if in_dim != out_dim: + linear_proj = ffn_wn_layer.FeedFowardNetworkNormalized( + in_dim, + out_dim, + var_scope_name="linear_mapping_cnn_" + str(i + 1), + dropout=1.0) + else: + linear_proj = None + + conv_layer = conv_wn_layer.Conv1DNetworkNormalized( + in_dim, + out_dim, + kernel_width=kwidth_list[i], + mode=self.mode, + layer_id=i + 1, + hidden_dropout=self.params["hidden_dropout_keep_prob"], + conv_padding="SAME", + decode_padding=False) + + self.layers.append([linear_proj, conv_layer]) + + # linear projection after cnn layers + self.layers.append( + ffn_wn_layer.FeedFowardNetworkNormalized( + knum_list[self.params['encoder_layers'] - 1], + self._src_emb_size, + dropout=1.0, + var_scope_name="linear_mapping_after_cnn_layers")) + + encoder_inputs = self.embedding_softmax_layer(inputs) + inputs_attention_bias = get_padding_bias( + inputs, res_rank=3, pad_sym=self._pad_sym) + + with tf.name_scope("add_pos_encoding"): + pos_input = tf.range( + 0, + tf.shape(encoder_inputs)[1], + delta=1, + dtype=tf.int32, + name='range') + pos_encoding = self.position_embedding_layer(pos_input) + encoder_inputs = encoder_inputs + tf.cast( + x=pos_encoding, dtype=encoder_inputs.dtype) + + if self.mode == "train": + encoder_inputs = tf.nn.dropout( + encoder_inputs, self.params["embedding_dropout_keep_prob"]) + + # mask the paddings in the input given to cnn layers + inputs_padding = get_padding( + inputs, self._pad_sym, dtype=encoder_inputs.dtype) + padding_mask = tf.expand_dims(1 - inputs_padding, 2) + encoder_inputs *= padding_mask + + # disables padding masks in middle layers + # padding_mask = None + outputs, outputs_b, final_state = self._call(encoder_inputs, padding_mask) + + return { + 'outputs': outputs, + 'outputs_b': outputs_b, + 'inputs_attention_bias_cs2s': inputs_attention_bias, + 'state': final_state, + 'src_lengths': source_length, # should it include paddings or not? + 'embedding_softmax_layer': self.embedding_softmax_layer, + # TODO: Should we share position embedding? + # 'position_embedding_layer': self.position_embedding_layer, + 'encoder_input': inputs + } + + def _call(self, encoder_inputs, padding_mask): + # Run inputs through the sublayers. 
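# The residual sums in this method follow the ConvS2S convention of scaling
# the sum of two branches by sqrt(0.5) so that, for two roughly independent
# unit-variance signals, the variance of the result stays near 1
# (back-of-the-envelope only, assuming independence):
#   Var(sqrt(0.5) * (a + b)) = 0.5 * (Var(a) + Var(b)) = 1.0
# The gradient trick further below,
#   outputs = (1 - scale) * tf.stop_gradient(outputs) + scale * outputs,
# keeps the forward value unchanged while multiplying the gradient flowing
# back into the encoder by scale = 1 / (2 * att_layer_num), compensating for
# the fact that every decoder attention layer sends gradients into these
# outputs. conv_nchannels_kwidth is read above as a list of
# (num_channels, kernel_width) pairs, one entry per convolutional block.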
+ with tf.variable_scope("linear_layer_before_cnn_layers"): + outputs = self.layers[0](encoder_inputs) + + for i in range(1, len(self.layers) - 1): + linear_proj, conv_layer = self.layers[i] + + with tf.variable_scope("layer_%d" % i): + if padding_mask is not None: + outputs *= padding_mask + if linear_proj is not None: + res_inputs = linear_proj(outputs) + else: + res_inputs = outputs + outputs = conv_layer(outputs) + outputs = (outputs + res_inputs) * math.sqrt(0.5) + + with tf.variable_scope("linear_layer_after_cnn_layers"): + outputs = self.layers[-1](outputs) + + if padding_mask is not None: + outputs *= padding_mask + + # Gradients are scaled as the gradients from + # all decoder attention layers enters the encoder + scale = 1.0 / ( + 2.0 * self.params.get("att_layer_num", self.params["encoder_layers"])) + outputs = (1.0 - scale) * tf.stop_gradient(outputs) + scale * outputs + + outputs_b = (outputs + encoder_inputs) * math.sqrt(0.5) + + if padding_mask is not None: + outputs_b *= padding_mask + + # Average of the encoder outputs is calculated as the final state of the encoder + # it can be used for decoders which just accept the final state + final_state = tf.reduce_mean(outputs_b, 1) + return outputs, outputs_b, final_state + + @property + def src_vocab_size(self): + return self._src_vocab_size + + @property + def src_emb_size(self): + return self._src_emb_size diff --git a/open_seq2seq/encoders/ds2_encoder.py b/open_seq2seq/encoders/ds2_encoder.py index ec093fbae..0fd455d00 100644 --- a/open_seq2seq/encoders/ds2_encoder.py +++ b/open_seq2seq/encoders/ds2_encoder.py @@ -7,41 +7,14 @@ from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops from .encoder import Encoder - - -def conv2d_bn_actv(name, inputs, filters, kernel_size, activation_fn, strides, - padding, regularizer, training, data_format, bn_momentum, - bn_epsilon): - """Helper function that applies convolution, batch norm and activation.""" - conv = tf.layers.conv2d( - name="{}".format(name), - inputs=inputs, - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=padding, - kernel_regularizer=regularizer, - use_bias=False, - data_format=data_format, - ) - bn = tf.layers.batch_normalization( - name="{}/bn".format(name), - inputs=conv, - gamma_regularizer=regularizer, - training=training, - axis=-1 if data_format == 'channels_last' else 1, - momentum=bn_momentum, - epsilon=bn_epsilon, - ) - output = activation_fn(bn) - return output +from open_seq2seq.parts.cnns.conv_blocks import conv_bn_actv def rnn_cell(rnn_cell_dim, layer_type, dropout_keep_prob=1.0): """Helper function that creates RNN cell.""" if layer_type == "layernorm_lstm": cell = tf.contrib.rnn.LayerNormBasicLSTMCell( - num_units=rnn_cell_dim, dropout_keep_prob=dropout_keep_prob) + num_units=rnn_cell_dim, dropout_keep_prob=dropout_keep_prob) else: if layer_type == "lstm": cell = tf.nn.rnn_cell.BasicLSTMCell(rnn_cell_dim) @@ -55,7 +28,7 @@ def rnn_cell(rnn_cell_dim, layer_type, dropout_keep_prob=1.0): raise ValueError("Error: not supported rnn type:{}".format(layer_type)) cell = tf.nn.rnn_cell.DropoutWrapper( - cell, output_keep_prob=dropout_keep_prob) + cell, output_keep_prob=dropout_keep_prob) return cell @@ -75,28 +48,28 @@ def row_conv(name, input_layer, batch, channels, width, activation_fn, x = tf.cast(x, tf.float32) cast_back = True filters = tf.get_variable( - name+'/w', - shape=[width, 1, channels, 1], - regularizer=regularizer, - dtype=tf.float32, + name + '/w', + shape=[width, 1, channels, 1], + regularizer=regularizer, + 
dtype=tf.float32, ) strides = [1, 1, 1, 1] y = tf.nn.depthwise_conv2d( - name=name + '/conv', - input=x, - filter=filters, - strides=strides, - padding='SAME', - data_format='NHWC' if data_format == 'channels_last' else 'NCHW', + name=name + '/conv', + input=x, + filter=filters, + strides=strides, + padding='SAME', + data_format='NHWC' if data_format == 'channels_last' else 'NCHW', ) bn = tf.layers.batch_normalization( - name="{}/bn".format(name), - inputs=y, - gamma_regularizer=regularizer, - training=training, - axis=-1 if data_format == 'channels_last' else 1, - momentum=bn_momentum, - epsilon=bn_epsilon, + name="{}/bn".format(name), + inputs=y, + gamma_regularizer=regularizer, + training=training, + axis=-1 if data_format == 'channels_last' else 1, + momentum=bn_momentum, + epsilon=bn_epsilon, ) output = activation_fn(bn) if data_format == 'channels_first': @@ -112,25 +85,25 @@ class DeepSpeech2Encoder(Encoder): @staticmethod def get_required_params(): return dict(Encoder.get_required_params(), **{ - 'dropout_keep_prob': float, - 'conv_layers': list, - 'activation_fn': None, # any valid callable - 'num_rnn_layers': int, - 'row_conv': bool, - 'n_hidden': int, - 'use_cudnn_rnn': bool, - 'rnn_cell_dim': int, - 'rnn_type': ['layernorm_lstm', 'lstm', 'gru', 'cudnn_gru', 'cudnn_lstm'], - 'rnn_unidirectional': bool, + 'dropout_keep_prob': float, + 'conv_layers': list, + 'activation_fn': None, # any valid callable + 'num_rnn_layers': int, + 'row_conv': bool, + 'n_hidden': int, + 'use_cudnn_rnn': bool, + 'rnn_cell_dim': int, + 'rnn_type': ['layernorm_lstm', 'lstm', 'gru', 'cudnn_gru', 'cudnn_lstm'], + 'rnn_unidirectional': bool, }) @staticmethod def get_optional_params(): return dict(Encoder.get_optional_params(), **{ - 'row_conv_width': int, - 'data_format': ['channels_first', 'channels_last'], - 'bn_momentum': float, - 'bn_epsilon': float, + 'row_conv_width': int, + 'data_format': ['channels_first', 'channels_last'], + 'bn_momentum': float, + 'bn_epsilon': float, }) def __init__(self, params, model, name="ds2_encoder", mode='train'): @@ -214,8 +187,8 @@ def _encode(self, input_dict): top_layer = input_layer else: top_layer = tf.transpose(input_layer, [0, 3, 1, 2]) - - # ----- Convolutional layers ----------------------------------------------- + + # ----- Convolutional layers --------------------------------------------- conv_layers = self.params['conv_layers'] for idx_conv in range(len(conv_layers)): @@ -229,19 +202,20 @@ def _encode(self, input_dict): else: src_length = (src_length + strides[0] - 1) // strides[0] - top_layer = conv2d_bn_actv( - name="conv{}".format(idx_conv + 1), - inputs=top_layer, - filters=ch_out, - kernel_size=kernel_size, - activation_fn=self.params['activation_fn'], - strides=strides, - padding=padding, - regularizer=regularizer, - training=training, - data_format=data_format, - bn_momentum=bn_momentum, - bn_epsilon=bn_epsilon, + top_layer = conv_bn_actv( + type="conv2d", + name="conv{}".format(idx_conv + 1), + inputs=top_layer, + filters=ch_out, + kernel_size=kernel_size, + activation_fn=self.params['activation_fn'], + strides=strides, + padding=padding, + regularizer=regularizer, + training=training, + data_format=data_format, + bn_momentum=bn_momentum, + bn_epsilon=bn_epsilon, ) if data_format == 'channels_first': top_layer = tf.transpose(top_layer, [0, 2, 3, 1]) @@ -267,55 +241,56 @@ def _encode(self, input_dict): if rnn_type == "cudnn_gru" or rnn_type == "gru": rnn_block = tf.contrib.cudnn_rnn.CudnnGRU( - num_layers=num_rnn_layers, - num_units=rnn_cell_dim, - 
direction=direction, - dropout=1.0 - dropout_keep_prob, - dtype=rnn_input.dtype, - name="cudnn_gru", + num_layers=num_rnn_layers, + num_units=rnn_cell_dim, + direction=direction, + dropout=1.0 - dropout_keep_prob, + dtype=rnn_input.dtype, + name="cudnn_gru", ) elif rnn_type == "cudnn_lstm" or rnn_type == "lstm": rnn_block = tf.contrib.cudnn_rnn.CudnnLSTM( - num_layers=num_rnn_layers, - num_units=rnn_cell_dim, - direction=direction, - dropout=1.0 - dropout_keep_prob, - dtype=rnn_input.dtype, - name="cudnn_lstm", + num_layers=num_rnn_layers, + num_units=rnn_cell_dim, + direction=direction, + dropout=1.0 - dropout_keep_prob, + dtype=rnn_input.dtype, + name="cudnn_lstm", ) else: raise ValueError( - "{} is not a valid rnn_type for cudnn_rnn layers".format(rnn_type) + "{} is not a valid rnn_type for cudnn_rnn layers".format( + rnn_type) ) top_layer, state = rnn_block(rnn_input) top_layer = tf.transpose(top_layer, [1, 0, 2]) else: rnn_input = top_layer multirnn_cell_fw = tf.nn.rnn_cell.MultiRNNCell( - [rnn_cell(rnn_cell_dim=rnn_cell_dim, layer_type=rnn_type, - dropout_keep_prob=dropout_keep_prob) - for _ in range(num_rnn_layers)] + [rnn_cell(rnn_cell_dim=rnn_cell_dim, layer_type=rnn_type, + dropout_keep_prob=dropout_keep_prob) + for _ in range(num_rnn_layers)] ) if self.params['rnn_unidirectional']: top_layer, state = tf.nn.dynamic_rnn( - cell=multirnn_cell_fw, - inputs=rnn_input, - sequence_length=src_length, - dtype=rnn_input.dtype, - time_major=False, + cell=multirnn_cell_fw, + inputs=rnn_input, + sequence_length=src_length, + dtype=rnn_input.dtype, + time_major=False, ) else: multirnn_cell_bw = tf.nn.rnn_cell.MultiRNNCell( - [rnn_cell(rnn_cell_dim=rnn_cell_dim, layer_type=rnn_type, - dropout_keep_prob=dropout_keep_prob) - for _ in range(num_rnn_layers)] + [rnn_cell(rnn_cell_dim=rnn_cell_dim, layer_type=rnn_type, + dropout_keep_prob=dropout_keep_prob) + for _ in range(num_rnn_layers)] ) top_layer, state = tf.nn.bidirectional_dynamic_rnn( - cell_fw=multirnn_cell_fw, cell_bw=multirnn_cell_bw, - inputs=rnn_input, - sequence_length=src_length, - dtype=rnn_input.dtype, - time_major=False + cell_fw=multirnn_cell_fw, cell_bw=multirnn_cell_bw, + inputs=rnn_input, + sequence_length=src_length, + dtype=rnn_input.dtype, + time_major=False ) # concat 2 tensors [B, T, n_cell_dim] --> [B, T, 2*n_cell_dim] top_layer = tf.concat(top_layer, 2) @@ -324,41 +299,41 @@ def _encode(self, input_dict): if self.params['row_conv']: channels = top_layer.get_shape().as_list()[-1] top_layer = row_conv( - name="row_conv", - input_layer=top_layer, - batch=batch_size, - channels=channels, - activation_fn=self.params['activation_fn'], - width=self.params['row_conv_width'], - regularizer=regularizer, - training=training, - data_format=data_format, - bn_momentum=bn_momentum, - bn_epsilon=bn_epsilon, + name="row_conv", + input_layer=top_layer, + batch=batch_size, + channels=channels, + activation_fn=self.params['activation_fn'], + width=self.params['row_conv_width'], + regularizer=regularizer, + training=training, + data_format=data_format, + bn_momentum=bn_momentum, + bn_epsilon=bn_epsilon, ) # Reshape [B, T, C] --> [B*T, C] c = top_layer.get_shape().as_list()[-1] top_layer = tf.reshape(top_layer, [-1, c]) - # --- hidden layer with clipped ReLU activation and dropout----------------- + # --- hidden layer with clipped ReLU activation and dropout--------------- top_layer = tf.layers.dense( - inputs=top_layer, - units=self.params['n_hidden'], - kernel_regularizer=regularizer, - activation=self.params['activation_fn'], - 
name='fully_connected', + inputs=top_layer, + units=self.params['n_hidden'], + kernel_regularizer=regularizer, + activation=self.params['activation_fn'], + name='fully_connected', ) outputs = tf.nn.dropout(x=top_layer, keep_prob=dropout_keep_prob) # reshape from [B*T,A] --> [B, T, A]. # Output shape: [batch_size, n_steps, n_hidden] outputs = tf.reshape( - outputs, - [batch_size, -1, self.params['n_hidden']], + outputs, + [batch_size, -1, self.params['n_hidden']], ) return { - 'outputs': outputs, - 'src_length': src_length, + 'outputs': outputs, + 'src_length': src_length, } diff --git a/open_seq2seq/encoders/encoder.py b/open_seq2seq/encoders/encoder.py index 555b456a8..c2e969c65 100644 --- a/open_seq2seq/encoders/encoder.py +++ b/open_seq2seq/encoders/encoder.py @@ -87,24 +87,9 @@ def __init__(self, params, model, name="encoder", mode='train'): else: self._params['dtype'] = tf.float32 - if 'regularizer' not in self._params: - if self._model and 'regularizer' in self._model.params: - self._params['regularizer'] = self._model.params['regularizer'] - self._params['regularizer_params'] = self._model.params['regularizer_params'] - - if 'regularizer' in self._params: - init_dict = self._params.get('regularizer_params', {}) - self._params['regularizer'] = self._params['regularizer'](**init_dict) - if self._params['dtype'] == 'mixed': - self._params['regularizer'] = mp_regularizer_wrapper( - self._params['regularizer'], - ) - - if self._params['dtype'] == 'mixed': - self._params['dtype'] = tf.float16 - self._name = name self._mode = mode + self._compiled = False def encode(self, input_dict): """Wrapper around :meth:`self._encode() <_encode>` method. @@ -117,11 +102,35 @@ def encode(self, input_dict): Returns: see :meth:`self._encode() <_encode>` docs. """ + if not self._compiled: + if 'regularizer' not in self._params: + if self._model and 'regularizer' in self._model.params: + self._params['regularizer'] = copy.deepcopy( + self._model.params['regularizer'] + ) + self._params['regularizer_params'] = copy.deepcopy( + self._model.params['regularizer_params'] + ) + + if 'regularizer' in self._params: + init_dict = self._params.get('regularizer_params', {}) + self._params['regularizer'] = self._params['regularizer'](**init_dict) + if self._params['dtype'] == 'mixed': + self._params['regularizer'] = mp_regularizer_wrapper( + self._params['regularizer'], + ) + + if self._params['dtype'] == 'mixed': + self._params['dtype'] = tf.float16 + if 'initializer' in self.params: init_dict = self.params.get('initializer_params', {}) initializer = self.params['initializer'](**init_dict) else: initializer = None + + self._compiled = True + with tf.variable_scope(self._name, initializer=initializer, dtype=self.params['dtype']): return self._encode(self._cast_types(input_dict)) diff --git a/open_seq2seq/encoders/rnn_encoders.py b/open_seq2seq/encoders/rnn_encoders.py index 472ba85a5..25838730c 100644 --- a/open_seq2seq/encoders/rnn_encoders.py +++ b/open_seq2seq/encoders/rnn_encoders.py @@ -5,12 +5,11 @@ from __future__ import absolute_import, division, print_function from __future__ import unicode_literals -import copy import tensorflow as tf -from open_seq2seq.parts.rnns.utils import create_rnn_cell +from open_seq2seq.parts.rnns.utils import single_cell from .encoder import Encoder - +from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops class UnidirectionalRNNEncoderWithEmbedding(Encoder): """ @@ -22,8 +21,8 @@ def get_required_params(): return dict(Encoder.get_required_params(), **{ 'src_vocab_size': 
int, 'src_emb_size': int, - 'encoder_cell_units': int, - 'encoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'], + 'core_cell': None, + 'core_cell_params': dict, 'encoder_layers': int, 'encoder_use_skip_connections': bool, }) @@ -87,10 +86,6 @@ def _encode(self, input_dict): source_sequence = input_dict['source_tensors'][0] source_length = input_dict['source_tensors'][1] - - cell_params = copy.deepcopy(self.params) - cell_params["num_units"] = self.params['encoder_cell_units'] - self._enc_emb_w = tf.get_variable( name="EncoderEmbeddingMatrix", shape=[self._src_vocab_size, self._src_emb_size], @@ -104,14 +99,16 @@ def _encode(self, input_dict): dp_input_keep_prob = 1.0 dp_output_keep_prob = 1.0 - self._encoder_cell_fw = create_rnn_cell( - cell_type=self.params['encoder_cell_type'], - cell_params=cell_params, - num_layers=self.params['encoder_layers'], - dp_input_keep_prob=dp_input_keep_prob, - dp_output_keep_prob=dp_output_keep_prob, - residual_connections=self.params['encoder_use_skip_connections'], - ) + fwd_cells = [ + single_cell(cell_class=self.params['core_cell'], + cell_params=self.params.get('core_cell_params', {}), + dp_input_keep_prob=dp_input_keep_prob, + dp_output_keep_prob=dp_output_keep_prob, + residual_connections=self.params[ + 'encoder_use_skip_connections'] + ) for _ in range(self.params['encoder_layers'])] + + self._encoder_cell_fw = tf.contrib.rnn.MultiRNNCell(fwd_cells) time_major = self.params.get("time_major", False) use_swap_memory = self.params.get("use_swap_memory", False) @@ -157,10 +154,10 @@ def get_required_params(): return dict(Encoder.get_required_params(), **{ 'src_vocab_size': int, 'src_emb_size': int, - 'encoder_cell_units': int, - 'encoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'], 'encoder_layers': int, 'encoder_use_skip_connections': bool, + 'core_cell': None, + 'core_cell_params': dict, }) @staticmethod @@ -227,9 +224,6 @@ def _encode(self, input_dict): dtype=tf.float32 ) - cell_params = copy.deepcopy(self.params) - cell_params["num_units"] = self.params['encoder_cell_units'] - if self._mode == "train": dp_input_keep_prob = self.params['encoder_dp_input_keep_prob'] dp_output_keep_prob = self.params['encoder_dp_output_keep_prob'] @@ -237,25 +231,27 @@ def _encode(self, input_dict): dp_input_keep_prob = 1.0 dp_output_keep_prob = 1.0 + fwd_cells = [ + single_cell(cell_class=self.params['core_cell'], + cell_params=self.params.get('core_cell_params', {}), + dp_input_keep_prob=dp_input_keep_prob, + dp_output_keep_prob=dp_output_keep_prob, + residual_connections=self.params['encoder_use_skip_connections'] + ) for _ in range(self.params['encoder_layers'])] + bwd_cells = [ + single_cell(cell_class=self.params['core_cell'], + cell_params=self.params.get('core_cell_params', {}), + dp_input_keep_prob=dp_input_keep_prob, + dp_output_keep_prob=dp_output_keep_prob, + residual_connections=self.params['encoder_use_skip_connections'] + ) for _ in range(self.params['encoder_layers'])] + + with tf.variable_scope("FW"): - self._encoder_cell_fw = create_rnn_cell( - cell_type=self.params['encoder_cell_type'], - cell_params=cell_params, - num_layers=self.params['encoder_layers'], - dp_input_keep_prob=dp_input_keep_prob, - dp_output_keep_prob=dp_output_keep_prob, - residual_connections=self.params['encoder_use_skip_connections'] - ) + self._encoder_cell_fw = tf.contrib.rnn.MultiRNNCell(fwd_cells) with tf.variable_scope("BW"): - self._encoder_cell_bw = create_rnn_cell( - cell_type=self.params['encoder_cell_type'], - cell_params=cell_params, - 
num_layers=self.params['encoder_layers'], - dp_input_keep_prob=dp_input_keep_prob, - dp_output_keep_prob=dp_output_keep_prob, - residual_connections=self.params['encoder_use_skip_connections'] - ) + self._encoder_cell_bw = tf.contrib.rnn.MultiRNNCell(bwd_cells) embedded_inputs = tf.cast(tf.nn.embedding_lookup( self.enc_emb_w, @@ -301,8 +297,10 @@ def get_required_params(): return dict(Encoder.get_required_params(), **{ 'src_vocab_size': int, 'src_emb_size': int, - 'encoder_cell_units': int, - 'encoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'], + 'core_cell': None, + 'core_cell_params': dict, + #'encoder_cell_units': int, + #'encoder_cell_type': ['lstm', 'gru', 'glstm', 'slstm'], 'encoder_layers': int, 'encoder_use_skip_connections': bool, }) @@ -353,27 +351,24 @@ def _encode(self, input_dict): if self.params['encoder_layers'] < 2: raise ValueError("GNMT encoder must have at least 2 layers") - cell_params = copy.deepcopy(self.params) - cell_params["num_units"] = self.params['encoder_cell_units'] + #cell_params = copy.deepcopy(self.params) + #cell_params["num_units"] = self.params['encoder_cell_units'] with tf.variable_scope("Level1FW"): - self._encoder_l1_cell_fw = create_rnn_cell( - cell_type=self.params['encoder_cell_type'], - cell_params=cell_params, - num_layers=1, + self._encoder_l1_cell_fw = single_cell( + cell_class=self.params['core_cell'], + cell_params=self.params.get('core_cell_params', {}), dp_input_keep_prob=1.0, dp_output_keep_prob=1.0, - residual_connections=False, - ) + residual_connections=False) + with tf.variable_scope("Level1BW"): - self._encoder_l1_cell_bw = create_rnn_cell( - cell_type=self.params['encoder_cell_type'], - cell_params=cell_params, - num_layers=1, + self._encoder_l1_cell_bw = single_cell( + cell_class=self.params['core_cell'], + cell_params=self.params.get('core_cell_params', {}), dp_input_keep_prob=1.0, dp_output_keep_prob=1.0, - residual_connections=False, - ) + residual_connections=False) if self._mode == "train": dp_input_keep_prob = self.params['encoder_dp_input_keep_prob'] @@ -383,15 +378,13 @@ def _encode(self, input_dict): dp_output_keep_prob = 1.0 with tf.variable_scope("UniDirLevel"): - self._encoder_cells = create_rnn_cell( - cell_type=self.params['encoder_cell_type'], - cell_params=cell_params, - num_layers=self.params['encoder_layers'] - 1, + self._encoder_cells = [single_cell( + cell_class=self.params['core_cell'], + cell_params=self.params.get('core_cell_params', {}), dp_input_keep_prob=dp_input_keep_prob, dp_output_keep_prob=dp_output_keep_prob, - residual_connections=False, - wrap_to_multi_rnn=False, - ) + residual_connections=False) for _ in range(self.params['encoder_layers'] - 1)] + # add residual connections starting from the third layer for idx, cell in enumerate(self._encoder_cells): if idx > 0: @@ -422,7 +415,7 @@ def _encode(self, input_dict): inputs=encoder_l1_outputs, sequence_length=source_length, swap_memory=use_swap_memory, - time_major = time_major, + time_major=time_major, dtype=encoder_l1_outputs.dtype, ) @@ -442,3 +435,151 @@ def src_emb_size(self): @property def enc_emb_w(self): return self._enc_emb_w + +class GNMTLikeEncoderWithEmbedding_cuDNN(Encoder): + """ + Encoder similar to the one used in + GNMT model: https://arxiv.org/abs/1609.08144. + Must have at least 2 layers. 
Uses cuDNN RNN blocks for efficiency + """ + + @staticmethod + def get_required_params(): + return dict(Encoder.get_required_params(), **{ + 'src_vocab_size': int, + 'src_emb_size': int, + 'encoder_cell_units': int, + 'encoder_cell_type': ['lstm', 'gru'], + 'encoder_layers': int, + #'core_cell': None, + #'core_cell_params': dict, + }) + + @staticmethod + def get_optional_params(): + return dict(Encoder.get_optional_params(), **{ + 'encoder_dp_output_keep_prob': float, + }) + + def __init__(self, params, model, + name="gnmt_encoder_with_emb_cudnn", mode='train'): + """ + Encodes data into representation + :param params: a Python dictionary. + Must define: + * src_inputs - a Tensor of shape [batch_size, time] or [time, batch_size] + (depending on time_major param) + * src_lengths - a Tensor of shape [batch_size] + :return: a Python dictionary with: + * encoder_outputs - a Tensor of shape + [batch_size, time, representation_dim] + or [time, batch_size, representation_dim] + * encoder_state - a Tensor of shape [batch_size, dim] + * src_lengths - (copy ref from input) a Tensor of shape [batch_size] + """ + super(GNMTLikeEncoderWithEmbedding_cuDNN, self).__init__( + params, model, name=name, mode=mode, + ) + + self._src_vocab_size = self.params['src_vocab_size'] + self._src_emb_size = self.params['src_emb_size'] + + def _encode(self, input_dict): + source_sequence = input_dict['source_tensors'][0] + source_length = input_dict['source_tensors'][1] + self._enc_emb_w = tf.get_variable( + name="EncoderEmbeddingMatrix", + shape=[self._src_vocab_size, self._src_emb_size], + dtype=tf.float32 + ) + + if self.params['encoder_layers'] < 2: + raise ValueError("GNMT encoder must have at least 2 layers") + + if self._mode == "train": + dp_output_keep_prob = self.params['encoder_dp_output_keep_prob'] + else: + dp_output_keep_prob = 1.0 + + # source_sequence is of [batch, time] shape + embedded_inputs = tf.cast(tf.nn.embedding_lookup( + self.enc_emb_w, + tf.transpose(source_sequence), # cudnn wants [time, batch, ...] 
+ ), self.params['dtype']) + + with tf.variable_scope("Bi_Directional_Layer"): + direction = cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION + if self.params['encoder_cell_type'] == "gru": + bidirectional_block = tf.contrib.cudnn_rnn.CudnnGRU( + num_layers=1, + num_units=self.params['encoder_cell_units'], + direction=direction, + dropout=0.0, + dtype=self.params['dtype'], + name="cudnn_gru_bidi", + ) + elif self.params['encoder_cell_type'] == "lstm": + bidirectional_block = tf.contrib.cudnn_rnn.CudnnLSTM( + num_layers=1, + num_units=self.params['encoder_cell_units'], + direction=direction, + dropout=0.0, + dtype=self.params['dtype'], + name="cudnn_lstm_bidi", + ) + else: + raise ValueError( + "{} is not a valid rnn_type for cudnn_rnn layers" + .format(self.params['encoder_cell_type']) + ) + bidi_output, bidi_state = bidirectional_block(embedded_inputs) + + with tf.variable_scope("Uni_Directional_Layer"): + direction = cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION + layer_input = bidi_output + for ind in range(self.params['encoder_layers'] - 1): + with tf.variable_scope("uni_layer_{}".format(ind)): + if self.params['encoder_cell_type'] == "gru": + unidirectional_block = tf.contrib.cudnn_rnn.CudnnGRU( + num_layers=1, + num_units=self.params['encoder_cell_units'], + direction=direction, + dropout=1.0 - dp_output_keep_prob, + dtype=self.params['dtype'], + name="cudnn_gru_uni_{}".format(ind), + ) + elif self.params['encoder_cell_type'] == "lstm": + unidirectional_block = tf.contrib.cudnn_rnn.CudnnLSTM( + num_layers=1, + num_units=self.params['encoder_cell_units'], + direction=direction, + dropout=1.0 - dp_output_keep_prob, + dtype=self.params['dtype'], + name="cudnn_lstm_uni_{}".format(ind), + ) + layer_output, encoder_state = unidirectional_block( + layer_input) + if ind > 0: # add residual connection + layer_output = layer_input + layer_output + layer_input = layer_output + + return {'outputs': tf.transpose(layer_input, perm=[1, 0, 2]), + 'state': None, + 'src_lengths': source_length, + 'encoder_input': source_sequence} + + + @property + def src_vocab_size(self): + return self._src_vocab_size + + @property + def src_emb_size(self): + return self._src_emb_size + + @property + def enc_emb_w(self): + return self._enc_emb_w + + + diff --git a/open_seq2seq/encoders/transformer_encoder.py b/open_seq2seq/encoders/transformer_encoder.py index 342b992e4..cd2d6edbd 100644 --- a/open_seq2seq/encoders/transformer_encoder.py +++ b/open_seq2seq/encoders/transformer_encoder.py @@ -79,7 +79,7 @@ def _encode(self, input_dict): # prepare encoder graph self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights( self.params["src_vocab_size"], self.params["hidden_size"], - pad2eight=self.params.get('pad_embeddings_2_eight', False)) + pad_vocab_to_eight=self.params.get('pad_embeddings_2_eight', False)) for _ in range(self.params['encoder_layers']): # Create sublayers for each layer. diff --git a/open_seq2seq/encoders/w2l_encoder.py b/open_seq2seq/encoders/w2l_encoder.py new file mode 100644 index 000000000..75b5b6a56 --- /dev/null +++ b/open_seq2seq/encoders/w2l_encoder.py @@ -0,0 +1,155 @@ +# Copyright (c) 2018 NVIDIA Corporation +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals +from six.moves import range + +import tensorflow as tf + +from .encoder import Encoder +from open_seq2seq.parts.cnns.conv_blocks import * + + +class Wave2LetterEncoder(Encoder): + """Wave2Letter like encoder.
Fully convolutional model""" + + @staticmethod + def get_required_params(): + return dict(Encoder.get_required_params(), **{ + 'dropout_keep_prob': float, + 'convnet_layers': list, + 'activation_fn': None, # any valid callable + }) + + @staticmethod + def get_optional_params(): + return dict(Encoder.get_optional_params(), **{ + 'data_format': ['channels_first', 'channels_last'], + 'normalization': [None, 'batch_norm'], + 'bn_momentum': float, + 'bn_epsilon': float, + }) + + def __init__(self, params, model, name="w2l_encoder", mode='train'): + """Wave2Letter like encoder constructor. + + See parent class for arguments description. + + Config parameters: + + * **dropout_keep_prob** (float) --- keep probability for dropout. + * **convnet_layers** (list) --- list with the description of convolutional + layers. For example:: + "convnet_layers": [ + { + "type": "conv1d", "repeat" : 5, + "kernel_size": [7], "stride": [1], + "num_channels": 250, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [11], "stride": [1], + "num_channels": 500, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 1, + "kernel_size": [32], "stride": [1], + "num_channels": 1000, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 1, + "kernel_size": [1], "stride": [1], + "num_channels": 1000, "padding": "SAME" + }, + ] + * **activation_fn** --- activation function to use. + * **data_format** (string) --- could be either "channels_first" or + "channels_last". Defaults to "channels_last". + * **normalization** --- normalization to use. Accepts [None, 'batch_norm']. + Use None if you don't want to use normalization. Defaults to 'batch_norm'. + * **bn_momentum** (float) --- momentum for batch norm. Defaults to 0.90. + * **bn_epsilon** (float) --- epsilon for batch norm. Defaults to 1e-3. + """ + super(Wave2LetterEncoder, self).__init__(params, model, name, mode) + + def _encode(self, input_dict): + """Creates TensorFlow graph for Wav2Letter like encoder.
+ + Args: + input_dict (dict): input dictionary that has to contain + the following fields:: + input_dict = { + "source_tensors": [ + src_sequence (shape=[batch_size, sequence length, num features]), + src_length (shape=[batch_size]) + ] + } + + Returns: + dict: dictionary with the following tensors:: + + { + 'outputs': hidden state, shape=[batch_size, sequence length, n_hidden] + 'src_length': tensor, shape=[batch_size] + } + """ + + source_sequence, src_length = input_dict['source_tensors'] + + training = (self._mode == "train") + dropout_keep_prob = self.params['dropout_keep_prob'] if training else 1.0 + regularizer = self.params.get('regularizer', None) + data_format = self.params.get('data_format', 'channels_last') + normalization = self.params.get('normalization', 'batch_norm') + + normalization_params = {} + if normalization == None: + conv_block = conv_actv + elif normalization == "batch_norm": + conv_block = conv_bn_actv + normalization_params['bn_momentum'] = self.params.get( + 'bn_momentum', 0.90) + normalization_params['bn_epsilon'] = self.params.get('bn_epsilon', 1e-3) + + conv_inputs = source_sequence + batch_size = conv_inputs.get_shape().as_list()[0] + if data_format == 'channels_last': + conv_feats = conv_inputs # B T F + else: + conv_feats = tf.transpose(conv_inputs, [0, 2, 1]) # B F T + + # ----- Convolutional layers --------------------------------------------- + convnet_layers = self.params['convnet_layers'] + + for idx_convnet in range(len(convnet_layers)): + layer_type = convnet_layers[idx_convnet]['type'] + layer_repeat = convnet_layers[idx_convnet]['repeat'] + ch_out = convnet_layers[idx_convnet]['num_channels'] + kernel_size = convnet_layers[idx_convnet]['kernel_size'] + strides = convnet_layers[idx_convnet]['stride'] + padding = convnet_layers[idx_convnet]['padding'] + + for idx_layer in range(layer_repeat): + conv_feats = conv_block( + type=layer_type, + name="conv{}{}".format( + idx_convnet + 1, idx_layer + 1), + inputs=conv_feats, + filters=ch_out, + kernel_size=kernel_size, + activation_fn=self.params['activation_fn'], + strides=strides, + padding=padding, + regularizer=regularizer, + training=training, + data_format=data_format, + **normalization_params + ) + outputs = tf.nn.dropout(x=conv_feats, keep_prob=dropout_keep_prob) + + if data_format == 'channels_first': + outputs = tf.transpose(outputs, [0, 2, 1]) + + return { + 'outputs': outputs, + 'src_length': src_length, + } diff --git a/open_seq2seq/losses/sequence_loss.py b/open_seq2seq/losses/sequence_loss.py index 718b053c2..f665a05c4 100644 --- a/open_seq2seq/losses/sequence_loss.py +++ b/open_seq2seq/losses/sequence_loss.py @@ -252,11 +252,17 @@ def get_optional_params(): 'batch_size': int, 'tgt_vocab_size': int, 'label_smoothing': float, + 'pad_embeddings_2_eight': bool, }) def __init__(self, params, model, name="padded_cross_entropy_with_smoothing"): super(PaddedCrossEntropyLossWithSmoothing, self).__init__(params, model, name) - self._tgt_vocab_size = self.params["tgt_vocab_size"] + if self.params.get('pad_embeddings_2_eight', False): + self._tgt_vocab_size = self.params["tgt_vocab_size"] if self.params[ + "tgt_vocab_size"] % 8 == 0 else \ + self.params["tgt_vocab_size"] + (8 - self.params["tgt_vocab_size"] % 8) + else: + self._tgt_vocab_size = self.params["tgt_vocab_size"] self._label_smoothing = self.params.get("label_smoothing", 0.0) def _compute_loss(self, input_dict): diff --git a/open_seq2seq/models/encoder_decoder.py b/open_seq2seq/models/encoder_decoder.py index cc74f0871..4a3858b3f 100644 
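# [Editor's sketch -- not part of the patch] The PaddedCrossEntropyLossWithSmoothing
# change above rounds 'tgt_vocab_size' up to the next multiple of 8 when
# 'pad_embeddings_2_eight' is set (presumably to keep the projection dimensions
# friendly to mixed-precision kernels). A minimal stand-alone version of that
# arithmetic; the helper name is hypothetical:
def round_up_to_multiple(vocab_size, multiple=8):
  if vocab_size % multiple == 0:
    return vocab_size
  return vocab_size + (multiple - vocab_size % multiple)

assert round_up_to_multiple(32000) == 32000  # already a multiple of 8
assert round_up_to_multiple(32003) == 32008  # padded up by 5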
--- a/open_seq2seq/models/encoder_decoder.py +++ b/open_seq2seq/models/encoder_decoder.py @@ -130,8 +130,8 @@ def _build_forward_pass_graph(self, input_tensors, gpu_id=0): Returns: tuple: tuple containing loss tensor as returned from - ``loss.compute_loss()`` and samples tensor, which is taken from - ``decoder.decode()['samples']``. When ``mode == 'infer'``, loss will + ``loss.compute_loss()`` and list of outputs tensors, which is taken from + ``decoder.decode()['outputs']``. When ``mode == 'infer'``, loss will be None. """ if not isinstance(input_tensors, dict) or \ @@ -159,7 +159,7 @@ def _build_forward_pass_graph(self, input_tensors, gpu_id=0): if self.mode == "train": decoder_input['target_tensors'] = target_tensors decoder_output = self.decoder.decode(input_dict=decoder_input) - decoder_samples = decoder_output.get("samples", None) + model_outputs = decoder_output.get("outputs", None) if self.mode == "train" or self.mode == "eval": with tf.variable_scope("Loss"): @@ -171,7 +171,7 @@ def _build_forward_pass_graph(self, input_tensors, gpu_id=0): else: deco_print("Inference Mode. Loss part of graph isn't built.") loss = None - return loss, decoder_samples + return loss, model_outputs @property def encoder(self): diff --git a/open_seq2seq/models/image2label.py b/open_seq2seq/models/image2label.py index bc932b6d8..1c8565487 100644 --- a/open_seq2seq/models/image2label.py +++ b/open_seq2seq/models/image2label.py @@ -12,7 +12,7 @@ class Image2Label(EncoderDecoderModel): - def maybe_print_logs(self, input_values, output_values): + def maybe_print_logs(self, input_values, output_values, training_step): labels = input_values['target_tensors'][0] logits = output_values[0] @@ -31,7 +31,7 @@ def maybe_print_logs(self, input_values, output_values): "Train batch top-5": top5, } - def finalize_evaluation(self, results_per_batch): + def finalize_evaluation(self, results_per_batch, training_step=None): top1 = 0.0 top5 = 0.0 total = 0.0 @@ -56,11 +56,12 @@ def evaluate(self, input_values, output_values): labels = np.where(labels == 1)[1] total = logits.shape[0] - top1 = np.sum(np.argmax(logits, axis=1) == labels) - top5 = np.sum(labels[:, np.newaxis] == np.argpartition(logits, -5)[:, -5:]) + top1 = np.sum(np.equal(np.argmax(logits, axis=1), labels)) + top5 = np.sum(np.equal(labels[:, np.newaxis], + np.argpartition(logits, -5)[:, -5:])) return total, top1, top5 - def get_num_objects_per_step(self, worker_id=0): + def _get_num_objects_per_step(self, worker_id=0): """Returns number of images in current batch, i.e. batch size.""" data_layer = self.get_data_layer(worker_id) num_images = tf.shape(data_layer.input_tensors['source_tensors'][0])[0] diff --git a/open_seq2seq/models/model.py b/open_seq2seq/models/model.py index 598fbd6e8..588dd8450 100644 --- a/open_seq2seq/models/model.py +++ b/open_seq2seq/models/model.py @@ -10,6 +10,11 @@ import copy import time +try: + from inspect import signature +except ImportError: + from funcsigs import signature + from open_seq2seq.utils.utils import deco_print, clip_last_batch from open_seq2seq.optimizers import optimize_loss, get_regularization_loss from open_seq2seq.utils.utils import check_params @@ -55,6 +60,7 @@ class :meth:`__init__` method. 
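# [Editor's sketch -- illustrative only] The image2label.py evaluate() change
# above swaps '==' for np.equal but keeps the same top-1 / top-5 counting.
# A self-contained toy example of that counting with made-up logits;
# np.argpartition(logits, -5)[:, -5:] gives the indices of the 5 largest
# logits per row, in no particular order:
import numpy as np

logits = np.random.RandomState(0).randn(4, 10)  # 4 samples, 10 classes
labels = np.array([3, 7, 1, 9])                 # ground-truth class ids

top1 = np.sum(np.equal(np.argmax(logits, axis=1), labels))
top5 = np.sum(np.equal(labels[:, np.newaxis],
                       np.argpartition(logits, -5)[:, -5:]))
print(top1 / float(len(labels)), top5 / float(len(labels)))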
'save_summaries_steps': None, # could be int or None 'print_loss_steps': None, # could be int or None 'print_samples_steps': None, # could be int or None + 'print_bench_info_steps': None, # could be int or None 'save_checkpoint_steps': None, # could be int or None 'eval_steps': int, @@ -75,9 +81,9 @@ class :meth:`__init__` method. 'lr_policy_params': dict, 'max_grad_norm': float, 'larc_params': dict, - 'loss_scale': float, - 'automatic_loss_scaling': [None, 'Backoff', 'LogMax'], + 'loss_scaling': None, # float, "Backoff" or "LogMax" 'summaries': list, + 'iter_size': int, } def __init__(self, params, mode="train", hvd=None): @@ -121,6 +127,11 @@ def __init__(self, params, mode="train", hvd=None): * **print_samples_steps** (int or None) --- how often to print training samples (input sequences, correct answers and model predictions). Setting it to None disables samples printing. + * **print_bench_info_steps** (int or None) --- how often to print training + benchmarking information (average number of objects processed per step). + Setting it to None disables intermediate benchmarking printing, but + the average information across the whole training will always be printed + after the last iteration. * **save_checkpoint_steps** (int or None) --- how often to save model checkpoints. Setting it to None disables checkpoint saving. * **eval_steps** (int) --- how often to run evaluation during training. @@ -156,14 +167,17 @@ class docs. * **max_grad_norm** (float) --- maximum value of gradient norm. Clipping will be performed if some gradients exceed this value (this is checked for each variable independently). - * **loss_scale** (float) --- static loss scale to use. For details see - :ref:`mixed precision training ` section in docs. - * **automatic_loss_scaling** --- automatic loss scaling mode. Could be - either None, "Backoff" or "Logmax". For details see - :ref:`mixed precision training ` section in docs. + * **loss_scaling** --- could be float or string. If float, static loss + scaling is applied. If string, the corresponding automatic + loss scaling algorithm is used. Must be one of 'Backoff' + of 'LogMax' (case insensitive). Only used when dtype="mixed". For details + see :ref:`mixed precision training ` section in docs. * **summaries** (list) --- which summaries to log. Could contain "learning_rate", "gradients", "gradient_norm", "global_gradient_norm", "variables", "variable_norm". + * **iter_size** (int) --- use this parameter to emulate large batches. + The gradients will be accumulated for ``iter_size`` number of steps before + applying update. * **larc_params** --- dictionary with parameters for LARC (or LARS) optimization algorithms. Can contain the following parameters: @@ -180,6 +194,9 @@ class docs. self._params = copy.deepcopy(params) + if self._params.get('iter_size', 1) > 1 and hvd is None: + raise ValueError("iter_size is only supported in Horovod mode") + # parameter checks self._mode = mode if self._mode not in ["train", "infer", "eval"]: @@ -201,6 +218,8 @@ class docs. self._params['save_checkpoint_steps'] = None if 'save_summaries_steps' not in self._params: self._params['save_summaries_steps'] = None + if 'print_bench_info_steps' not in self._params: + self._params['print_bench_info_steps'] = None # checking that frequencies of samples and loss are aligned s_fr = self._params['print_samples_steps'] @@ -266,15 +285,21 @@ class docs. 
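# [Editor's sketch] An illustrative fragment of a model config using the new
# options documented above; the values are made up, and only the keys
# ('print_bench_info_steps', 'loss_scaling', 'iter_size') come from this patch.
base_params = {
    "dtype": "mixed",
    "loss_scaling": "Backoff",     # or a float such as 512.0 for static scaling
    "iter_size": 4,                # accumulate gradients over 4 steps (Horovod only)
    "print_bench_info_steps": 50,  # print objects-per-step benchmarking info
    "print_loss_steps": 10,
    "print_samples_steps": 50,     # the code above checks this is aligned with print_loss_steps
}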
self._steps_in_epoch //= self._hvd.size() else: self._steps_in_epoch //= self.num_gpus + self._steps_in_epoch //= self._params.get('iter_size', 1) + if self._steps_in_epoch == 0: + raise ValueError("Overall batch size is too big for this dataset.") self._last_step = self._params['num_epochs'] * self._steps_in_epoch if self.on_horovod: self._output = None else: self._outputs = [None] * self.num_gpus + self.loss = None self.train_op = None self.eval_losses = None + self._num_objects_per_step = None + self.skip_update_ph = None def compile(self, force_var_reuse=False): """TensorFlow graph is built here.""" @@ -306,7 +331,7 @@ def compile(self, force_var_reuse=False): ) if self._outputs[gpu_cnt] is not None and \ not isinstance(self._outputs[gpu_cnt], list): - raise ValueError('Decoder samples have to be either None or list') + raise ValueError('Decoder outputs have to be either None or list') if self._mode == "train" or self._mode == "eval": losses.append(loss) # end of for gpu_ind loop @@ -332,13 +357,19 @@ def compile(self, force_var_reuse=False): loss, self._output = self._build_forward_pass_graph(input_tensors, gpu_id=0) if self._output is not None and not isinstance(self._output, list): - raise ValueError('Decoder samples have to be either None or list') + raise ValueError('Decoder outputs have to be either None or list') if self._mode == "train": self.loss = loss if self._mode == "eval": self.eval_losses = [loss] + try: + self._num_objects_per_step = [self._get_num_objects_per_step(worker_id) + for worker_id in range(self.num_gpus)] + except NotImplementedError: + pass + if self._mode == "train": if 'lr_policy' not in self.params: lr_policy = None @@ -346,34 +377,31 @@ def compile(self, force_var_reuse=False): lr_params = self.params.get('lr_policy_params', {}) # adding default decay_steps = max_steps if lr_policy supports it and # different value is not provided - if 'decay_steps' in self.params['lr_policy'].__code__.co_varnames and \ - 'decay_steps' not in lr_params: + func_params = signature(self.params['lr_policy']).parameters + if 'decay_steps' in func_params and 'decay_steps' not in lr_params: lr_params['decay_steps'] = self._last_step - if 'steps_per_epoch' in self.params['lr_policy'].__code__.co_varnames and \ + if 'steps_per_epoch' in func_params and \ 'steps_per_epoch' not in lr_params and 'num_epochs' in self.params: lr_params['steps_per_epoch'] = self.steps_in_epoch lr_policy = lambda gs: self.params['lr_policy'](global_step=gs, **lr_params) + if self.params.get('iter_size', 1) > 1: + self.skip_update_ph = tf.placeholder(tf.bool) + self.train_op = optimize_loss( loss=tf.cast(self.loss, tf.float32) + get_regularization_loss(), dtype=self.params['dtype'], optimizer=self.params['optimizer'], optimizer_params=self.params.get('optimizer_params', {}), - gradient_noise_scale=None, - gradient_multipliers=None, clip_gradients=self.params.get('max_grad_norm', None), learning_rate_decay_fn=lr_policy, - update_ops=None, - variables=None, - name="Loss_Optimization", summaries=self.params.get('summaries', None), - colocate_gradients_with_ops=True, - increment_global_step=True, larc_params=self.params.get('larc_params', None), - loss_scale=self.params.get('loss_scale', 1.0), - automatic_loss_scaling=self.params.get('automatic_loss_scaling', None), + loss_scaling=self.params.get('loss_scaling', 1.0), on_horovod=self.on_horovod, + iter_size=self.params.get('iter_size', 1), + skip_update_ph=self.skip_update_ph, ) tf.summary.scalar(name="train_loss", tensor=self.loss) if self.steps_in_epoch: 
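# [Editor's sketch] compile() above now inspects the lr_policy with
# inspect.signature (falling back to funcsigs on Python 2) instead of peeking
# at __code__.co_varnames. A stand-alone illustration of that pattern with a
# dummy policy function (poly_decay here is hypothetical):
try:
  from inspect import signature
except ImportError:
  from funcsigs import signature

def poly_decay(global_step, learning_rate=0.001, decay_steps=None, power=0.5):
  return learning_rate * (1.0 - global_step / float(decay_steps)) ** power

lr_params = {}
func_params = signature(poly_decay).parameters
if 'decay_steps' in func_params and 'decay_steps' not in lr_params:
  lr_params['decay_steps'] = 1000  # e.g. default to the total number of training steps
print(lr_params)  # {'decay_steps': 1000}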
@@ -414,7 +442,7 @@ def _build_forward_pass_graph(self, input_tensors, gpu_id=0): is constructed. For Horovod this is always zero. Returns: - tuple: tuple containing loss tensor and samples tensor. + tuple: tuple containing loss tensor and list of outputs tensors. Loss tensor will be automatically provided to the optimizer and corresponding :attr:`train_op` will be created. @@ -424,12 +452,12 @@ def _build_forward_pass_graph(self, input_tensors, gpu_id=0): this happens inside :class:`utils.hooks.RunEvaluationHook` to fetch output values for evaluation. - Both loss and samples can be None when corresponding part of the graph + Both loss and outputs can be None when corresponding part of the graph is not built. """ pass - def maybe_print_logs(self, input_values, output_values): + def maybe_print_logs(self, input_values, output_values, training_step): """This method can be used to print logs that help to visualize training. For example, you can print sample input sequences and their corresponding predictions. This method will be called every ``print_samples_steps`` @@ -447,6 +475,7 @@ def maybe_print_logs(self, input_values, output_values): output_values: evaluation of :meth:`self.get_output_tensors(0) `, that is, output tensors for one batch on the *first* GPU. + training_step (int): Current training step. Returns: dict: dictionary with values that need to be logged to TensorBoard @@ -491,7 +520,7 @@ def evaluate(self, input_values, output_values): """ return [] - def finalize_evaluation(self, results_per_batch): + def finalize_evaluation(self, results_per_batch, training_step=None): """This method can be used in conjunction with :meth:`self.evaluate()` to calculate evaluation metrics. @@ -514,6 +543,8 @@ def finalize_evaluation(self, results_per_batch): results_per_batch (list): aggregation of values returned from all calls to :meth:`self.evaluate()` method (number of calls will be equal to number of evaluation batches). + training_step (int): current training step. Will only be passed if mode + is "train_eval". Returns: dict: dictionary with values that need to be logged to TensorBoard @@ -621,7 +652,7 @@ def get_tf_dtype(self): else: return self.params['dtype'] - def get_num_objects_per_step(self, worker_id=0): + def _get_num_objects_per_step(self, worker_id=0): """Define this method if you need benchmarking functionality. 
For example, for translation models, this method should return number of tokens in current batch, for image recognition model should return number @@ -636,6 +667,12 @@ def get_num_objects_per_step(self, worker_id=0): """ raise NotImplementedError() + def get_num_objects_per_step(self, worker_id=0): + if self._num_objects_per_step: + return self._num_objects_per_step[worker_id] + else: + raise NotImplementedError() + @property def params(self): """Parameters used to construct the model (dictionary).""" diff --git a/open_seq2seq/models/speech2text.py b/open_seq2seq/models/speech2text.py index 8a5c061e0..62f1cf001 100644 --- a/open_seq2seq/models/speech2text.py +++ b/open_seq2seq/models/speech2text.py @@ -49,7 +49,7 @@ def _create_decoder(self): ) return super(Speech2Text, self)._create_decoder() - def maybe_print_logs(self, input_values, output_values): + def maybe_print_logs(self, input_values, output_values, training_step): y, len_y = input_values['target_tensors'] decoded_sequence = output_values y_one_sample = y[0] @@ -74,7 +74,7 @@ def maybe_print_logs(self, input_values, output_values): 'Sample WER': sample_wer, } - def finalize_evaluation(self, results_per_batch): + def finalize_evaluation(self, results_per_batch, training_step=None): total_word_lev = 0.0 total_word_count = 0.0 for word_lev, word_count in results_per_batch: @@ -120,13 +120,21 @@ def infer(self, input_values, output_values): ) for sample_id in range(len(decoded_texts)): preds.append("".join(decoded_texts[sample_id])) - return preds + return preds, input_values['source_ids'] def finalize_inference(self, results_per_batch, output_file): preds = [] + ids = [] - for result in results_per_batch: + for result, idx in results_per_batch: preds.extend(result) + ids.extend(idx) + + preds = np.array(preds) + ids = np.hstack(ids) + # restoring the correct order + preds = preds[np.argsort(ids)] + pd.DataFrame( { 'wav_filename': self.get_data_layer().all_files, @@ -135,7 +143,7 @@ def finalize_inference(self, results_per_batch, output_file): columns=['wav_filename', 'predicted_transcript'], ).to_csv(output_file, index=False) - def get_num_objects_per_step(self, worker_id=0): + def _get_num_objects_per_step(self, worker_id=0): """Returns number of audio frames in current batch.""" data_layer = self.get_data_layer(worker_id) num_frames = tf.reduce_sum(data_layer.input_tensors['source_tensors'][1]) diff --git a/open_seq2seq/models/speech2text_ds2_test.py b/open_seq2seq/models/speech2text_ds2_test.py new file mode 100644 index 000000000..7a433c109 --- /dev/null +++ b/open_seq2seq/models/speech2text_ds2_test.py @@ -0,0 +1,47 @@ +# Copyright (c) 2017 NVIDIA Corporation +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals + +import tensorflow as tf + +from open_seq2seq.test_utils.test_speech_configs.ds2_test_config import \ + base_params, train_params, eval_params, base_model + +from .speech2text_test import Speech2TextModelTests + + +class DS2ModelTests(Speech2TextModelTests): + + def setUp(self): + self.base_model = base_model + self.base_params = base_params + self.train_params = train_params + self.eval_params = eval_params + + def tearDown(self): + pass + + def test_regularizer(self): + return self.regularizer_test() + + def test_convergence(self): + return self.convergence_test(5.0, 30.0, 0.1) + + def test_convergence_with_iter_size(self): + return self.convergence_with_iter_size_test() + + def test_infer(self): + return self.infer_test() + + def test_mp_collection(self): + return 
self.mp_collection_test(14, 7) + + def test_levenshtein(self): + return self.levenshtein_test() + + def test_maybe_functions(self): + return self.maybe_functions_test() + + +if __name__ == '__main__': + tf.test.main() diff --git a/open_seq2seq/models/speech2text_test.py b/open_seq2seq/models/speech2text_test.py index 687bda883..7493e90ac 100644 --- a/open_seq2seq/models/speech2text_test.py +++ b/open_seq2seq/models/speech2text_test.py @@ -12,26 +12,17 @@ import pandas as pd from .speech2text import levenshtein -from open_seq2seq.test_utils.test_speech_config import base_params, \ - train_params, \ - eval_params, \ - base_model from open_seq2seq.utils import train, evaluate, infer from open_seq2seq.utils.utils import get_available_gpus class Speech2TextModelTests(tf.test.TestCase): - def setUp(self): - pass - def tearDown(self): - pass - - def run_model(self, train_config, eval_config): + def run_model(self, train_config, eval_config, hvd=None): with tf.Graph().as_default() as g: - train_model = base_model(params=train_config, mode="train", hvd=None) + train_model = self.base_model(params=train_config, mode="train", hvd=hvd) train_model.compile() - eval_model = base_model(params=eval_config, mode="eval", hvd=None) + eval_model = self.base_model(params=eval_config, mode="eval", hvd=hvd) eval_model.compile(force_var_reuse=True) train(train_model, eval_model) @@ -50,33 +41,34 @@ def run_model(self, train_config, eval_config): eval_loss = np.mean(eval_losses) weights_new = sess.run(tf.trainable_variables()) - # checking that the weights has not changed from just computing the loss + # checking that the weights has not changed from just computing the + # loss for w, w_new in zip(weights, weights_new): npt.assert_allclose(w, w_new) eval_dict = evaluate(eval_model, checkpoint) return loss, eval_loss, eval_dict def prepare_config(self): - base_params['logdir'] = tempfile.mktemp() - train_config = copy.deepcopy(base_params) - eval_config = copy.deepcopy(base_params) - train_config.update(copy.deepcopy(train_params)) - eval_config.update(copy.deepcopy(eval_params)) + self.base_params['logdir'] = tempfile.mktemp() + train_config = copy.deepcopy(self.base_params) + eval_config = copy.deepcopy(self.base_params) + train_config.update(copy.deepcopy(self.train_params)) + eval_config.update(copy.deepcopy(self.eval_params)) return train_config, eval_config - def test_regularizer(self): + def regularizer_test(self): for dtype in [tf.float16, tf.float32, 'mixed']: train_config, eval_config = self.prepare_config() train_config['num_epochs'] = 60 train_config.update({ - "dtype": dtype, - "regularizer": tf.contrib.layers.l2_regularizer, - "regularizer_params": { - 'scale': 1e4, - }, + "dtype": dtype, + "regularizer": tf.contrib.layers.l2_regularizer, + "regularizer_params": { + 'scale': 1e4, + }, }) eval_config.update({ - "dtype": dtype, + "dtype": dtype, }) loss, eval_loss, eval_dict = self.run_model(train_config, eval_config) @@ -84,24 +76,54 @@ def test_regularizer(self): self.assertGreaterEqual(eval_loss, 500.0) self.assertGreaterEqual(eval_dict['Eval WER'], 0.95) - def test_convergence(self): + def convergence_test(self, train_loss_threshold, eval_loss_threshold, eval_wer_threshold): for dtype in [tf.float32, "mixed"]: train_config, eval_config = self.prepare_config() train_config.update({ - "dtype": dtype, + "dtype": dtype, }) eval_config.update({ - "dtype": dtype, + "dtype": dtype, }) loss, eval_loss, eval_dict = self.run_model(train_config, eval_config) - self.assertLess(loss, 5.0) - 
self.assertLess(eval_loss, 200.0) + self.assertLess(loss, train_loss_threshold) + self.assertLess(eval_loss, eval_loss_threshold) + self.assertLess(eval_dict['Eval WER'], eval_wer_threshold) + + def convergence_with_iter_size_test(self): + try: + import horovod.tensorflow as hvd + hvd.init() + except ImportError: + print("Horovod not installed skipping test_convergence_with_iter_size") + return + + for dtype in [tf.float32, "mixed"]: + train_config, eval_config = self.prepare_config() + train_config.update({ + "dtype": dtype, + "iter_size": 5, + "batch_size_per_gpu": 2, + "use_horovod": True, + "num_epochs": 200, + }) + eval_config.update({ + "dtype": dtype, + "iter_size": 5, + "batch_size_per_gpu": 2, + "use_horovod": True, + }) + loss, eval_loss, eval_dict = self.run_model( + train_config, eval_config, hvd) + + self.assertLess(loss, 10.0) + self.assertLess(eval_loss, 30.0) self.assertLess(eval_dict['Eval WER'], 0.1) - def test_infer(self): + def infer_test(self): train_config, infer_config = self.prepare_config() - train_config['num_epochs'] = 200 + train_config['num_epochs'] = 250 infer_config['batch_size_per_gpu'] = 4 with tf.Graph().as_default() as g: @@ -114,24 +136,26 @@ def test_infer(self): infer_config['num_gpus'] = 1 with tf.Graph().as_default(): - train_model = base_model(params=train_config, mode="train", hvd=None) + train_model = self.base_model( + params=train_config, mode="train", hvd=None) train_model.compile() train(train_model, None) with tf.Graph().as_default(): - infer_model = base_model(params=infer_config, mode="infer", hvd=None) + infer_model = self.base_model( + params=infer_config, mode="infer", hvd=None) infer_model.compile() print(train_model.params['logdir']) output_file = os.path.join(train_model.params['logdir'], 'infer_out.csv') infer( - infer_model, - tf.train.latest_checkpoint(train_model.params['logdir']), - output_file, + infer_model, + tf.train.latest_checkpoint(train_model.params['logdir']), + output_file, ) pred_csv = pd.read_csv(output_file) true_csv = pd.read_csv( - 'open_seq2seq/test_utils/toy_speech_data/toy_data.csv', + 'open_seq2seq/test_utils/toy_speech_data/toy_data.csv', ) for pred_row, true_row in zip(pred_csv.as_matrix(), true_csv.as_matrix()): # checking file name @@ -139,20 +163,20 @@ def test_infer(self): # checking prediction self.assertEqual(pred_row[-1], true_row[-1]) - def test_mp_collection(self): + def mp_collection_test(self, num_train_vars, num_master_copies): train_config, eval_config = self.prepare_config() train_config['dtype'] = 'mixed' with tf.Graph().as_default(): - model = base_model(params=train_config, mode="train", hvd=None) + model = self.base_model(params=train_config, mode="train", hvd=None) model.compile() - self.assertEqual(len(tf.trainable_variables()), 14) + self.assertEqual(len(tf.trainable_variables()), num_train_vars) self.assertEqual( - len(tf.get_collection('FP32_MASTER_COPIES')), - 7, # minus batch norm beta and gamma and row_conv vars + len(tf.get_collection('FP32_MASTER_COPIES')), + num_master_copies, # minus batch norm beta and gamma and row_conv vars ) - def test_levenshtein(self): + def levenshtein_test(self): s1 = 'this is a great day' s2 = 'this is great day' self.assertEqual(levenshtein(s1.split(), s2.split()), 1) @@ -181,28 +205,28 @@ def test_levenshtein(self): self.assertEqual(levenshtein(s1, s2), 11) self.assertEqual(levenshtein(s2, s1), 11) - def test_maybe_functions(self): + def maybe_functions_test(self): train_config, eval_config = self.prepare_config() with tf.Graph().as_default(): - model 
= base_model(params=train_config, mode="train", hvd=None) + model = self.base_model(params=train_config, mode="train", hvd=None) model.compile() model._gpu_ids = range(5) model.params['batch_size_per_gpu'] = 2 char2idx = model.get_data_layer().params['char2idx'] inputs = [ - ['this is a great day', 'london is the capital of great britain'], - ['ooo', 'lll'], - ['a b c\' asdf', 'blah blah bblah'], - ['this is great day', 'london capital gret britain'], - ['aaaaaaaasdfdasdf', 'df d sdf asd fd f sdf df blah\' blah'], + ['this is a great day', 'london is the capital of great britain'], + ['ooo', 'lll'], + ['a b c\' asdf', 'blah blah bblah'], + ['this is great day', 'london capital gret britain'], + ['aaaaaaaasdfdasdf', 'df d sdf asd fd f sdf df blah\' blah'], ] outputs = [ - ['this is great a day', 'london capital gret britain'], - ['ooo', 'lll'], - ['aaaaaaaasdfdasdf', 'df d sdf asd fd f sdf df blah blah'], - ['this is a great day', 'london is the capital of great britain'], - ['a b c\' asdf', 'blah blah\' bblah'], + ['this is great a day', 'london capital gret britain'], + ['ooo', 'lll'], + ['aaaaaaaasdfdasdf', 'df d sdf asd fd f sdf df blah blah'], + ['this is a great day', 'london is the capital of great britain'], + ['a b c\' asdf', 'blah blah\' bblah'], ] y = [None] * len(inputs) len_y = [None] * len(inputs) @@ -219,7 +243,7 @@ def test_maybe_functions(self): len_y[gpu_id][sample_id] = num_letters for letter_id in range(num_letters): y[gpu_id][sample_id, letter_id] = char2idx[ - inputs[gpu_id][sample_id][letter_id] + inputs[gpu_id][sample_id][letter_id] ] num_gpus = len(outputs) @@ -233,7 +257,7 @@ def test_maybe_functions(self): num_letters = len(outputs[gpu_id][sample_id]) for letter_id in range(num_letters): values[gpu_id].append( - char2idx[outputs[gpu_id][sample_id][letter_id]] + char2idx[outputs[gpu_id][sample_id][letter_id]] ) indices[gpu_id].append(np.array([sample_id, letter_id])) values[gpu_id] = np.array(values[gpu_id], dtype=np.int) @@ -243,8 +267,8 @@ def test_maybe_functions(self): len_x = [None] * len(y) input_values = list(zip(x, len_x, y, len_y)) output_values = [ - [tf.SparseTensorValue(indices[i], values[i], dense_shape[i])] - for i in range(num_gpus) + [tf.SparseTensorValue(indices[i], values[i], dense_shape[i])] + for i in range(num_gpus) ] results = [] @@ -272,9 +296,5 @@ def test_maybe_functions(self): inp_dict = {'source_tensors': [input_values[0][0], input_values[0][1]], 'target_tensors': [input_values[0][2], input_values[0][3]]} - output_dict = model.maybe_print_logs(inp_dict, output_values[0]) + output_dict = model.maybe_print_logs(inp_dict, output_values[0], 0) self.assertEqual(output_dict['Sample WER'], 0.4) - - -if __name__ == '__main__': - tf.test.main() diff --git a/open_seq2seq/models/speech2text_w2l_test.py b/open_seq2seq/models/speech2text_w2l_test.py new file mode 100644 index 000000000..de18448a3 --- /dev/null +++ b/open_seq2seq/models/speech2text_w2l_test.py @@ -0,0 +1,31 @@ +# Copyright (c) 2017 NVIDIA Corporation +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals + +import tensorflow as tf + +from open_seq2seq.test_utils.test_speech_configs.w2l_test_config import \ + base_params, train_params, eval_params, base_model +from .speech2text_test import Speech2TextModelTests + + +class W2LModelTests(Speech2TextModelTests): + + def setUp(self): + self.base_model = base_model + self.base_params = base_params + self.train_params = train_params + self.eval_params = eval_params + + def tearDown(self): + 
pass + + def test_convergence(self): + return self.convergence_test(5.0, 30.0, 0.1) + + def test_mp_collection(self): + return self.mp_collection_test(14, 6) + + +if __name__ == '__main__': + tf.test.main() diff --git a/open_seq2seq/models/text2text.py b/open_seq2seq/models/text2text.py index dd563963e..8ef385339 100644 --- a/open_seq2seq/models/text2text.py +++ b/open_seq2seq/models/text2text.py @@ -80,22 +80,23 @@ def infer(self, input_values, output_values): input_strings, output_strings = [], [] input_values = input_values['source_tensors'] for input_sample, output_sample in zip(input_values, output_values): - output_strings.append(text_ids_to_string( - output_sample[0], - self.get_data_layer().params['target_idx2seq'], - S_ID=self.decoder.params['GO_SYMBOL'], - EOS_ID=self.decoder.params['END_SYMBOL'], - PAD_ID=self.decoder.params['PAD_SYMBOL'], - ignore_special=True, delim=' ', - )) - input_strings.append(text_ids_to_string( - input_sample[0], - self.get_data_layer().params['source_idx2seq'], - S_ID=self.decoder.params['GO_SYMBOL'], - EOS_ID=self.decoder.params['END_SYMBOL'], - PAD_ID=self.decoder.params['PAD_SYMBOL'], - ignore_special=True, delim=' ', - )) + for i in range(0, input_sample.shape[0]): # iterate over batch dimension + output_strings.append(text_ids_to_string( + output_sample[i], + self.get_data_layer().params['target_idx2seq'], + S_ID=self.decoder.params['GO_SYMBOL'], + EOS_ID=self.decoder.params['END_SYMBOL'], + PAD_ID=self.decoder.params['PAD_SYMBOL'], + ignore_special=True, delim=' ', + )) + input_strings.append(text_ids_to_string( + input_sample[i], + self.get_data_layer().params['source_idx2seq'], + S_ID=self.decoder.params['GO_SYMBOL'], + EOS_ID=self.decoder.params['END_SYMBOL'], + PAD_ID=self.decoder.params['PAD_SYMBOL'], + ignore_special=True, delim=' ', + )) return input_strings, output_strings def finalize_inference(self, results_per_batch, output_file): @@ -110,7 +111,7 @@ def finalize_inference(self, results_per_batch, output_file): deco_print("") step += 1 - def maybe_print_logs(self, input_values, output_values): + def maybe_print_logs(self, input_values, output_values, training_step): x, len_x = input_values['source_tensors'] y, len_y = input_values['target_tensors'] samples = output_values[0] @@ -200,7 +201,7 @@ def evaluate(self, input_values, output_values): return preds, targets - def finalize_evaluation(self, results_per_batch): + def finalize_evaluation(self, results_per_batch, training_step=None): preds, targets = [], [] for preds_cur, targets_cur in results_per_batch: if self.params.get('eval_using_bleu', True): @@ -214,11 +215,16 @@ def finalize_evaluation(self, results_per_batch): return {} - def get_num_objects_per_step(self, worker_id=0): + def _get_num_objects_per_step(self, worker_id=0): """Returns number of source tokens + number of target tokens in batch.""" data_layer = self.get_data_layer(worker_id) # sum of source length in batch num_tokens = tf.reduce_sum(data_layer.input_tensors['source_tensors'][1]) - # sum of target length in batch - num_tokens += tf.reduce_sum(data_layer.input_tensors['target_tensors'][1]) + if self.mode != "infer": + # sum of target length in batch + num_tokens += tf.reduce_sum(data_layer.input_tensors['target_tensors'][1]) + else: + # TODO: this is not going to be correct when batch size > 1, since it will + # count padding? 
+ num_tokens += tf.reduce_sum(tf.shape(self.get_output_tensors(worker_id)[0])) return num_tokens diff --git a/open_seq2seq/models/text2text_test.py b/open_seq2seq/models/text2text_test.py index 75c77f220..cbd244134 100644 --- a/open_seq2seq/models/text2text_test.py +++ b/open_seq2seq/models/text2text_test.py @@ -21,7 +21,7 @@ def tearDown(self): def test_train(self): config_module = runpy.run_path( - "./example_configs/text2text/nmt-reversal-RR.py") + "./example_configs/text2text/toy-reversal/nmt-reversal-RR.py") train_config = config_module['base_params'] if 'train_params' in config_module: train_config.update(config_module['train_params']) @@ -70,7 +70,7 @@ def test_train(self): print("Attempting BasicSeq2SeqWithAttention on Horovod") hvd.init() config_module = runpy.run_path( - "./example_configs/text2text/nmt-reversal-RR.py") + "./example_configs/text2text/toy-reversal/nmt-reversal-RR.py") train_config = config_module['base_params'] if 'train_params' in config_module: train_config.update(config_module['train_params']) diff --git a/open_seq2seq/optimizers/automatic_loss_scaler.py b/open_seq2seq/optimizers/automatic_loss_scaler.py index 740e63508..5da055d79 100644 --- a/open_seq2seq/optimizers/automatic_loss_scaler.py +++ b/open_seq2seq/optimizers/automatic_loss_scaler.py @@ -8,25 +8,25 @@ import tensorflow as tf -class AutomaticLossScaler: - SUPPORTED_ALGOS = ['Backoff', 'LogMax'] +class AutomaticLossScaler(object): + SUPPORTED_ALGOS = ['backoff', 'logmax'] def __init__(self, algorithm='Backoff', scale_min=1.0, scale_max=2.**24): - if algorithm == 'Backoff': + algorithm = algorithm.lower().strip() + if algorithm == 'backoff': self.scaler = BackoffScaler(scale_min=scale_min, scale_max=scale_max, step_factor=2.0, step_window=2000) - elif algorithm == 'LogMax': + elif algorithm == 'logmax': self.scaler = LogMaxScaler(scale_min=scale_min, scale_max=scale_max, log_max=16., beta1=0.99, beta2=0.999, - overflow_std_dev=3.09) # ppf(.999) + overflow_std_dev=3.09) # ppf(.999) else: - raise ValueError('Unknown dynamic scaling algorithm: %s' - % algorithm_name) + raise ValueError('Unknown scaling algorithm: {}'.format(algorithm)) def update_op(self, has_nan, amax): return self.scaler.update_op(has_nan, amax) @@ -55,7 +55,7 @@ def check_grads(grads_and_vars): return has_nan, amax -class BackoffScaler: +class BackoffScaler(object): def __init__(self, scale_min, scale_max, step_factor, step_window): self.scale_min = scale_min self.scale_max = scale_max @@ -105,7 +105,7 @@ def loss_scale(self): return self.scale -class LogMaxScaler: +class LogMaxScaler(object): def __init__(self, scale_min, scale_max, log_max, beta1, beta2, overflow_std_dev): self.scale_min = scale_min self.scale_max = scale_max diff --git a/open_seq2seq/optimizers/mp_wrapper.py b/open_seq2seq/optimizers/mp_wrapper.py index 633e3a3e5..f947a3e9a 100644 --- a/open_seq2seq/optimizers/mp_wrapper.py +++ b/open_seq2seq/optimizers/mp_wrapper.py @@ -80,7 +80,6 @@ def compute_gradients(self, loss, var_list=None, return grads_and_vars_fp32 def apply_gradients(self, grads_and_vars, global_step=None, name=None): - def apply_ops_wrapper(): update_op = self._optimizer.apply_gradients(grads_and_vars, global_step, name) @@ -101,9 +100,7 @@ def apply_ops_wrapper(): loss_scale_update_op = self._loss_scaler.update_op(grad_has_nans, grad_amax) with tf.control_dependencies([loss_scale_update_op]): - return tf.cond(should_skip_update, - tf.no_op, - apply_ops_wrapper) + return tf.cond(should_skip_update, tf.no_op, apply_ops_wrapper) else: return 
apply_ops_wrapper() diff --git a/open_seq2seq/optimizers/optimizers.py b/open_seq2seq/optimizers/optimizers.py index a98dd4830..9f10fdc61 100644 --- a/open_seq2seq/optimizers/optimizers.py +++ b/open_seq2seq/optimizers/optimizers.py @@ -27,23 +27,8 @@ import six import tensorflow as tf - -from tensorflow.contrib import framework as contrib_framework -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import clip_ops from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import init_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops -from tensorflow.python.ops import variable_scope as vs -from tensorflow.python.ops import variables as vars_ -from tensorflow.python.summary import summary -from tensorflow.python.training import moving_averages -from tensorflow.python.training import optimizer as optimizer_ -from tensorflow.python.training import training as train + from .automatic_loss_scaler import AutomaticLossScaler from .mp_wrapper import MixedPrecisionOptimizerWrapper @@ -51,12 +36,12 @@ OPTIMIZER_CLS_NAMES = { - "Adagrad": train.AdagradOptimizer, - "Adam": train.AdamOptimizer, - "Ftrl": train.FtrlOptimizer, - "Momentum": train.MomentumOptimizer, - "RMSProp": train.RMSPropOptimizer, - "SGD": train.GradientDescentOptimizer, + "Adagrad": tf.train.AdagradOptimizer, + "Adam": tf.train.AdamOptimizer, + "Ftrl": tf.train.FtrlOptimizer, + "Momentum": tf.train.MomentumOptimizer, + "RMSProp": tf.train.RMSPropOptimizer, + "SGD": tf.train.GradientDescentOptimizer, } OPTIMIZER_SUMMARIES = [ @@ -81,531 +66,299 @@ def get_regularization_loss(scope=None, name="total_regularization_loss"): """ losses = tf.losses.get_regularization_losses(scope) if losses: - return math_ops.add_n(list(map(lambda x: tf.cast(x, tf.float32), losses)), - name=name) + return tf.add_n(list(map(lambda x: tf.cast(x, tf.float32), losses)), + name=name) else: - return constant_op.constant(0.0) - - -class DistributedOptimizer(tf.train.Optimizer): - """An optimizer that wraps another tf.Optimizer, using an allreduce to - average gradient values before applying gradients to model weights.""" - - def __init__(self, optimizer, name=None, use_locking=False, device_dense='', - device_sparse=''): - """Construct a new DistributedOptimizer, which uses another optimizer - under the hood for computing single-process gradient values and - applying gradient updates after the gradient values have been averaged - across all the Horovod ranks. - Args: - optimizer: - Optimizer to use for computing gradients and applying updates. - name: - Optional name prefix for the operations created when applying - gradients. Defaults to "Distributed" followed by the provided - optimizer type. - use_locking: - Whether to use locking when updating variables. - See Optimizer.__init__ for more info. - device_dense: - Device to be used for dense tensors. Uses GPU by default - if Horovod was build with HOROVOD_GPU_ALLREDUCE. - device_sparse: - Device to be used for sparse tensors. Uses GPU by default - if Horovod was build with HOROVOD_GPU_ALLGATHER. 
- """ - if name is None: - name = "Distributed{}".format(type(optimizer).__name__) - - self._optimizer = optimizer - self._device_dense = device_dense - self._device_sparse = device_sparse - super(DistributedOptimizer, self).__init__( - name=name, use_locking=use_locking) - - def compute_gradients(self, *args, **kwargs): - """Compute gradients of all trainable variables. - See Optimizer.compute_gradients() for more info. - In DistributedOptimizer, compute_gradients() is overriden to also - allreduce the gradients before returning them. - """ - gradients = self._optimizer.compute_gradients(*args, **kwargs) + return tf.constant(0.0) + + +def reduce_gradients(grads_and_vars, on_horovod): + if on_horovod: from horovod.common import size from horovod.tensorflow import allreduce if size() > 1: - averaged_gradients = [] - with tf.name_scope(self._name + "_Allreduce"): - for grad, var in gradients: + averaged_grads_and_vars = [] + with tf.name_scope("all_reduce"): + for grad, var in grads_and_vars: if grad is not None: - avg_grad = allreduce(grad, device_dense=self._device_dense, - device_sparse=self._device_sparse) - averaged_gradients.append((avg_grad, var)) + avg_grad = allreduce(grad) + averaged_grads_and_vars.append((avg_grad, var)) else: - averaged_gradients.append((None, var)) - return averaged_gradients + averaged_grads_and_vars.append((None, var)) + return averaged_grads_and_vars else: - return gradients - - def apply_gradients(self, grads_and_vars, global_step=None, name=None): - """Calls this same method on the underlying optimizer.""" - return self._optimizer.apply_gradients(grads_and_vars, global_step, name) + return grads_and_vars + else: + raise NotImplementedError("Reduce in tower-mode is not implemented.") def optimize_loss(loss, optimizer, optimizer_params, learning_rate_decay_fn, - global_step=None, dtype=tf.float32, - gradient_noise_scale=None, - gradient_multipliers=None, clip_gradients=None, - update_ops=None, - variables=None, - name=None, summaries=None, - colocate_gradients_with_ops=False, - increment_global_step=True, larc_params=None, - loss_scale=1.0, - automatic_loss_scaling=None, - on_horovod=False): + loss_scaling=1.0, + on_horovod=False, + iter_size=1, + skip_update_ph=None): """Given loss and parameters for optimizer, returns a training op. - Various ways of passing optimizers include: - - - by string specifying the name of the optimizer. See OPTIMIZER_CLS_NAMES - for full list. E.g. `optimize_loss(..., optimizer='Adam')`. - - by function taking learning rate `Tensor` as argument and returning an - `Optimizer` instance. E.g. `optimize_loss(..., - optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5))`. - Alternatively, if `learning_rate` is `None`, the function takes no - arguments. E.g. `optimize_loss(..., learning_rate=None, - optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`. - - by a subclass of `Optimizer` having a single-argument constructor - (the argument is the learning rate), such as AdamOptimizer or - AdagradOptimizer. E.g. `optimize_loss(..., - optimizer=tf.train.AdagradOptimizer)`. - - by an instance of a subclass of `Optimizer`. - E.g., `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`. - Args: loss: Scalar `Tensor`. - global_step: Scalar int `Tensor`, step counter to update on each step - unless `increment_global_step` is `False`. If not supplied, - it will be fetched from the default graph (see - `tf.train.get_global_step` for details). 
If it has - not been created, no step will be incremented with each weight - update. `learning_rate_decay_fn` requires `global_step`. - learning_rate: float or `Tensor`, magnitude of update per each training - step. Can be `None`. - optimizer: string, class or optimizer instance, used as trainer. - string should be name of optimizer, like 'SGD', - 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant. - class should be sub-class of `tf.Optimizer` that implements - `compute_gradients` and `apply_gradients` functions. - optimizer instance should be instantiation of `tf.Optimizer` - sub-class and have `compute_gradients` and `apply_gradients` - functions. - gradient_noise_scale: float or None, adds 0-mean normal noise scaled by this - value. - gradient_multipliers: dict of variables or variable names to floats. - If present, gradients for specified - variables will be multiplied by given constant. - clip_gradients: float, callable or `None`. If float, is provided, a global - clipping is applied to prevent the norm of the gradient to exceed this - value. Alternatively, a callable can be provided e.g.: adaptive_clipping. - This callable takes a `list` of `(gradients, variables)` `tuple`s and - returns the same thing with the gradients modified. - learning_rate_decay_fn: function, takes `learning_rate` and `global_step` - `Tensor`s, returns `Tensor`. - Can be used to implement any learning rate decay - functions. - For example: `tf.train.exponential_decay`. - Ignored if `learning_rate` is not supplied. - update_ops: list of update `Operation`s to execute at each step. If `None`, - uses elements of UPDATE_OPS collection. The order of execution - between `update_ops` and `loss` is non-deterministic. - variables: list of variables to optimize or - `None` to use all trainable variables. - name: The name for this operation is used to scope operations and summaries. + optimizer: string or class of optimizer, used as trainer. + string should be name of optimizer, like 'SGD', + 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant. + class should be sub-class of `tf.Optimizer` that implements + `compute_gradients` and `apply_gradients` functions. + optimizer_params: parameters of the optimizer. + dtype: model dtype (tf.float16, tf.float32 or "mixed"). + learning_rate_decay_fn: function, takes `global_step` + `Tensor`s, returns `Tensor`. + Can be used to implement any learning rate decay + functions. + For example: `tf.train.exponential_decay`. + Ignored if `learning_rate` is not supplied. + clip_gradients: float, max gradient norm to clip to. summaries: List of internal quantities to visualize on tensorboard. If not - set only the loss and the learning rate will be reported. The - complete list is in OPTIMIZER_SUMMARIES. - colocate_gradients_with_ops: If True, try colocating gradients with the - corresponding op. - increment_global_step: Whether to increment `global_step`. If your model - calls `optimize_loss` multiple times per training step (e.g. to optimize - different parts of the model), use this arg to avoid incrementing - `global_step` more times than necessary. - LARC_mode: 'scale' or 'clip' - LARC_nu: If not None, LARC re-scaling will be - applied https://arxiv.org/pdf/1708.03888.pdf with nu=LARC_nu - automatic_loss_scaling: if not None, use the corresponding automatic - loss scaling algorithm. Must be one of 'Backoff' - of 'LogMax'. `dtype` must be "mixed" to use ALS. + set only the loss and the learning rate will be reported. The + complete list is in OPTIMIZER_SUMMARIES. 
+ larc_params: If not None, LARC re-scaling will + be applied with corresponding parameters. + loss_scaling: could be float or string. If float, static loss scaling + is applied. If string, the corresponding automatic + loss scaling algorithm is used. Must be one of 'Backoff' + of 'LogMax' (case insensitive). Only used when dtype="mixed". + on_horovod: whether the model is run on horovod. + Returns: - Training op. - - Raises: - ValueError: if: - * `loss` is an invalid type or shape. - * `global_step` is an invalid type or shape. - * `learning_rate` is an invalid type or value. - * `optimizer` has the wrong type. - * `clip_gradients` is neither float nor callable. - * `learning_rate` and `learning_rate_decay_fn` are supplied, but no - `global_step` is available. - * `gradients` is empty. + training op. """ - loss = ops.convert_to_tensor(loss) - contrib_framework.assert_scalar(loss) - if global_step is None: - global_step = tf.train.get_or_create_global_step() + if summaries is None: + summaries = ["learning_rate", "global_gradient_norm"] else: - tf.train.assert_global_step(global_step) - with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]): - # Update ops take UPDATE_OPS collection if not provided. - if update_ops is None: - update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS)) - # Make sure update ops are ran before computing loss. - if update_ops: - loss = control_flow_ops.with_dependencies(list(update_ops), loss) - - if summaries is None: - summaries = ["learning_rate", "global_gradient_norm"] - else: - for summ in summaries: - if summ not in OPTIMIZER_SUMMARIES: - raise ValueError("Summaries should be one of [%s], you provided %s." % - (", ".join(OPTIMIZER_SUMMARIES), summ)) - if global_step is None: - raise ValueError("global_step is required for learning_rate_decay_fn.") - lr = learning_rate_decay_fn(global_step) + for summ in summaries: + if summ not in OPTIMIZER_SUMMARIES: + raise ValueError( + "Summaries should be one of [{}], you provided {}.".format( + ", ".join(OPTIMIZER_SUMMARIES), summ, + )) + if clip_gradients is not None and larc_params is not None: + raise AttributeError( + "LARC and gradient norm clipping should not be used together" + ) + + global_step = tf.train.get_or_create_global_step() + lr = learning_rate_decay_fn(global_step) + if "learning_rate" in summaries: + tf.summary.scalar("learning_rate", lr) - if "learning_rate" in summaries: - summary.scalar("learning_rate", lr) + with tf.variable_scope("Loss_Optimization"): + update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) + loss = control_flow_ops.with_dependencies(list(update_ops), loss) # Create optimizer, given specified parameters. if isinstance(optimizer, six.string_types): - if lr is None: - raise ValueError("Learning rate is None, but should be specified if " - "optimizer is string (%s)." % optimizer) if optimizer not in OPTIMIZER_CLS_NAMES: raise ValueError( - "Optimizer name should be one of [%s], you provided %s." % - (", ".join(OPTIMIZER_CLS_NAMES), optimizer)) - opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr, **optimizer_params) - elif (isinstance(optimizer, type) and - issubclass(optimizer, optimizer_.Optimizer)): - if lr is None: - raise ValueError("Learning rate is None, but should be specified if " - "optimizer is class (%s)." 
% optimizer) - opt = optimizer(learning_rate=lr, **optimizer_params) - elif isinstance(optimizer, optimizer_.Optimizer): - opt = optimizer - elif callable(optimizer): - if lr is not None: - opt = optimizer(lr, **optimizer_params) - else: - opt = optimizer(**optimizer_params) - if not isinstance(opt, optimizer_.Optimizer): - raise ValueError("Unrecognized optimizer: function should return " - "subclass of Optimizer. Got %s." % str(opt)) - else: - raise ValueError("Unrecognized optimizer: should be string, " - "subclass of Optimizer, instance of " - "subclass of Optimizer or function with one argument. " - "Got %s." % str(optimizer)) - # All trainable variables, if specific variables are not specified. - if variables is None: - variables = vars_.trainable_variables() - - if automatic_loss_scaling is not None: - if automatic_loss_scaling not in AutomaticLossScaler.SUPPORTED_ALGOS: - raise ValueError("Unknown automatic loss scaling algorithm: %s." - % automatic_loss_sclaing) - if dtype != "mixed": - raise ValueError("Automatic loss scaling can be used only with " - "dtype=mixed.") - loss_scale = AutomaticLossScaler(algorithm=automatic_loss_scaling) + "Optimizer name should be one of [{}], you provided {}.".format( + ", ".join(OPTIMIZER_CLS_NAMES), optimizer + )) + optimizer = OPTIMIZER_CLS_NAMES[optimizer] + opt = optimizer(learning_rate=lr, **optimizer_params) + + if isinstance(loss_scaling, six.string_types): + loss_scaling = AutomaticLossScaler(algorithm=loss_scaling) if dtype == 'mixed': - opt = MixedPrecisionOptimizerWrapper(opt, loss_scale=loss_scale) - if on_horovod: - opt = DistributedOptimizer(opt) + opt = MixedPrecisionOptimizerWrapper(opt, loss_scale=loss_scaling) # Compute gradients. - gradients = opt.compute_gradients( - loss, variables, - colocate_gradients_with_ops=colocate_gradients_with_ops, + grads_and_vars = opt.compute_gradients( + loss, colocate_gradients_with_ops=True, ) - # Optionally add gradient noise. - if gradient_noise_scale is not None: - gradients = _add_scaled_noise_to_gradients(gradients, - gradient_noise_scale) - - # Multiply some gradients. - if gradient_multipliers is not None: - gradients = _multiply_gradients(gradients, gradient_multipliers) - if not gradients: - raise ValueError( - "Empty list of (gradient, var) pairs encountered. This is most " - "likely to be caused by an improper value of gradient_multipliers.") - - if "global_gradient_norm" in summaries or "gradient_norm" in summaries: - summary.scalar( - "global_norm/gradient_norm", - clip_ops.global_norm(list(map( - lambda x: tf.cast(x, tf.float32), - list(zip(*gradients))[0]) - )), - ) - - # Optionally clip gradients by global norm. - if clip_gradients is not None and larc_params is not None: - raise AttributeError( - "LARC and gradient norm clipping should not be used together" - ) - if isinstance(clip_gradients, float): - gradients = _clip_gradients_by_norm(gradients, clip_gradients) - elif callable(clip_gradients): - gradients = clip_gradients(gradients) - elif clip_gradients is not None: - raise ValueError( - "Unknown type %s for clip_gradients" % type(clip_gradients)) - - # Add histograms for variables, gradients and gradient norms. 
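# --- Illustrative sketch (not part of the patch): LARC 'clip' mode, in NumPy ---
# The LARC re-scaling applied in post_process_gradients() below multiplies each
# gradient by min(1, larc_eta * ||v|| / (lr * (||g|| + eps))), so the effective
# per-layer update norm stays at roughly larc_eta * ||v|| at most.
import numpy as np

def larc_clip_factor(g, v, lr, larc_eta=0.001, eps=1e-7, min_update=1e-7):
    g_norm = np.linalg.norm(g.astype(np.float32))
    v_norm = np.linalg.norm(v.astype(np.float32))
    factor = max(larc_eta * v_norm / (lr * (g_norm + eps)), min_update)
    return min(factor, 1.0)

# a layer whose gradient norm is large relative to its weight norm gets scaled down
g = np.full((128, 128), 0.5, dtype=np.float32)
v = np.full((128, 128), 0.01, dtype=np.float32)
print(larc_clip_factor(g, v, lr=0.1))  # well below 1.0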
- for gradient, variable in gradients: - if isinstance(gradient, ops.IndexedSlices): - grad_values = gradient.values - else: - grad_values = gradient - - if isinstance(variable, ops.IndexedSlices): - var_values = variable.values + if on_horovod: + if iter_size > 1: + grads_and_vars_accum = [] + accum_ops = [] + for grad, var in grads_and_vars: + # necessary to use tf.Variable directly to instantiate cudnn rnn cells + # which don't have explicit shape. + grad_accum = tf.Variable( + initial_value=tf.zeros_like(var), + name=grad.name.split(":")[0] + "_accum", + expected_shape=var.shape, + dtype=grad.dtype, + trainable=False, + validate_shape=bool(var.get_shape()) + ) + if isinstance(grad, tf.IndexedSlices): + add_grads = tf.scatter_nd_add(grad_accum, grad.indices, + grad.values / iter_size) + else: + add_grads = grad_accum + grad / iter_size + + accum_ops.append(tf.assign(grad_accum, add_grads)) + grads_and_vars_accum.append((grad_accum, var)) + + accum_op = tf.group(accum_ops) + + def update_and_clear_op(): + with tf.control_dependencies([accum_op]): + red_grad_updates = opt.apply_gradients( + post_process_gradients( + reduce_gradients(grads_and_vars_accum, on_horovod=True), + lr=lr, + clip_gradients=clip_gradients, + larc_params=larc_params, + summaries=summaries, + ), + global_step=global_step, + ) + + with tf.control_dependencies([red_grad_updates]): + return tf.group([tf.assign(g, tf.zeros_like(g)) + for g, v in grads_and_vars_accum]) + + grad_updates = tf.cond( + pred=skip_update_ph, + true_fn=lambda: accum_op, + false_fn=update_and_clear_op, + ) else: - var_values = variable - - if grad_values is not None: - var_name = variable.name.replace(":", "_") - if "gradients" in summaries: - summary.histogram("gradients/%s" % var_name, mask_nans(grad_values)) - if "gradient_norm" in summaries: - summary.scalar("gradient_norm/%s" % var_name, - clip_ops.global_norm([grad_values])) - if "variables" in summaries: - summary.histogram("variables/%s" % var_name, var_values) - if "variable_norm" in summaries: - summary.scalar("variable_norm/%s" % var_name, - clip_ops.global_norm([var_values])) - - if clip_gradients is not None and ("global_gradient_norm" in summaries or - "gradient_norm" in summaries): - summary.scalar( - "global_norm/clipped_gradient_norm", - clip_ops.global_norm(list(map( - lambda x: tf.cast(x, tf.float32), - list(zip(*gradients))[0]) - )), - ) - - # LARC gradient re-scaling - if larc_params is not None: - check_params( - config=larc_params, - required_dict={'larc_eta': float}, - optional_dict={ - 'larc_mode': ['clip', 'scale'], - 'min_update': float, - 'epsilon': float - }, + grad_updates = opt.apply_gradients( + post_process_gradients( + reduce_gradients(grads_and_vars, on_horovod=True), + lr=lr, + clip_gradients=clip_gradients, + larc_params=larc_params, + summaries=summaries, + ), + global_step=global_step, + ) + else: + grad_updates = opt.apply_gradients( + post_process_gradients( + grads_and_vars, + lr=lr, + clip_gradients=clip_gradients, + larc_params=larc_params, + summaries=summaries, + ), + global_step=global_step, ) - larc_eta = larc_params['larc_eta'] - larc_mode = larc_params.get('larc_mode', 'clip') - min_update = larc_params.get('min_update', 1e-7) - eps = larc_params.get('epsilon', 1e-7) - - for idx, (g, v) in enumerate(gradients): - var_dtype = v.dtype - v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2) - g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2) - - if larc_mode == 'clip': - larc_grad_update = tf.maximum( - larc_eta * v_norm / (lr * (g_norm + 
eps)), - min_update, - ) - if "larc_summaries" in summaries: - summary.scalar('larc_clip_on/{}'.format(v.name), - tf.cast(tf.less(larc_grad_update, 1.0), tf.int32)) - larc_grad_update = tf.minimum(larc_grad_update, 1.0) - else: - larc_grad_update = tf.maximum( - larc_eta * v_norm / (g_norm + eps), - min_update, - ) - larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype) - gradients[idx] = (larc_grad_update * g, v) - # adding additional summary - if "larc_summaries" in summaries: - summary.scalar('larc_grad_update/{}'.format(v.name), larc_grad_update) - summary.scalar("larc_final_lr/{}".format(v.name), - tf.cast(lr, var_dtype) * larc_grad_update) - - # Create gradient updates. - grad_updates = opt.apply_gradients( - gradients, - global_step=global_step if increment_global_step else None, - name="train") - - # # Ensure the train_tensor computes grad_updates. + # Ensure the train_tensor computes grad_updates. train_tensor = control_flow_ops.with_dependencies([grad_updates], loss) return train_tensor -def _clip_gradients_by_norm(grads_and_vars, clip_gradients): - """Clips gradients by global norm.""" - gradients, variables = zip(*grads_and_vars) - clipped_gradients, _ = clip_ops.clip_by_global_norm(gradients, clip_gradients) - return list(zip(clipped_gradients, variables)) - - -def _adaptive_max_norm(norm, std_factor, decay, global_step, epsilon, name): - """Find max_norm given norm and previous average.""" - with vs.variable_scope(name, "AdaptiveMaxNorm", [norm]): - log_norm = math_ops.log(norm + epsilon) - - def moving_average(name, value, decay): - moving_average_variable = vs.get_variable( - name, - shape=value.get_shape(), - dtype=value.dtype, - initializer=init_ops.zeros_initializer(), - trainable=False) - return moving_averages.assign_moving_average( - moving_average_variable, value, decay, zero_debias=False) - - # quicker adaptation at the beginning - if global_step is not None: - n = math_ops.to_float(global_step) - decay = math_ops.minimum(decay, n / (n + 1.)) - - # update averages - mean = moving_average("mean", log_norm, decay) - sq_mean = moving_average("sq_mean", math_ops.square(log_norm), decay) - - variance = sq_mean - math_ops.square(mean) - std = math_ops.sqrt(math_ops.maximum(epsilon, variance)) - max_norms = math_ops.exp(mean + std_factor * std) - return max_norms, mean - - -def adaptive_clipping_fn(std_factor=2., - decay=0.95, - static_max_norm=None, - global_step=None, - report_summary=False, - epsilon=1e-8, - name=None): - """Adapt the clipping value using statistics on the norms. - - Implement adaptive gradient as presented in section 3.2.1 of - https://arxiv.org/abs/1412.1602. - - Keeps a moving average of the mean and std of the log(norm) of the gradient. - If the norm exceeds `exp(mean + std_factor*std)` then all gradients will be - rescaled such that the global norm becomes `exp(mean)`. - - Args: - std_factor: Python scaler (or tensor). - `max_norm = exp(mean + std_factor*std)` - decay: The smoothing factor of the moving averages. - static_max_norm: If provided, will threshold the norm to this value as an - extra safety. - global_step: Optional global_step. If provided, `decay = decay*n/(n+1)`. - This provides a quicker adaptation of the mean for the first steps. - report_summary: If `True`, will add histogram summaries of the `max_norm`. - epsilon: Small value chosen to avoid zero variance. - name: The name for this operation is used to scope operations and summaries. - - Returns: - A function for applying gradient clipping. 
- """ - - def gradient_clipping(grads_and_vars): - """Internal function for adaptive clipping.""" - grads, variables = zip(*grads_and_vars) - - norm = clip_ops.global_norm(grads) - - max_norm, log_mean = _adaptive_max_norm(norm, std_factor, decay, - global_step, epsilon, name) +def post_process_gradients(grads_and_vars, summaries, lr, + clip_gradients, larc_params): + """Applies post processing to gradients, i.e. clipping, LARC, summaries.""" + if "global_gradient_norm" in summaries: + tf.summary.scalar( + "global_gradient_norm", + _global_norm_with_cast(grads_and_vars), + ) - # reports the max gradient norm for debugging - if report_summary: - summary.scalar("global_norm/adaptive_max_gradient_norm", max_norm) + # Optionally clip gradients by global norm. + if clip_gradients is not None: + grads_and_vars = _clip_gradients_by_norm(grads_and_vars, clip_gradients) - # factor will be 1. if norm is smaller than max_norm - factor = array_ops.where(norm < max_norm, - array_ops.ones_like(norm), - math_ops.exp(log_mean) / norm) + # Add histograms for variables, gradients and gradient norms. + for gradient, variable in grads_and_vars: + if isinstance(gradient, tf.IndexedSlices): + grad_values = gradient.values + else: + grad_values = gradient - if static_max_norm is not None: - factor = math_ops.minimum(static_max_norm / norm, factor) + if isinstance(variable, tf.IndexedSlices): + var_values = variable.values + else: + var_values = variable + + if grad_values is not None: + var_name = variable.name.replace(":", "_") + if "gradients" in summaries: + # need to mask nans for automatic loss scaling + tf.summary.histogram("gradients/%s" % var_name, mask_nans(grad_values)) + if "gradient_norm" in summaries: + tf.summary.scalar("gradient_norm/%s" % var_name, tf.norm(grad_values)) + if "variables" in summaries: + tf.summary.histogram("variables/%s" % var_name, var_values) + if "variable_norm" in summaries: + tf.summary.scalar("variable_norm/%s" % var_name, tf.norm(var_values)) + + if clip_gradients is not None and "global_gradient_norm" in summaries: + tf.summary.scalar( + "global_clipped_gradient_norm", + _global_norm_with_cast(grads_and_vars), + ) - # apply factor - clipped_grads = [] - for grad in grads: - if grad is None: - clipped_grads.append(None) - elif isinstance(grad, ops.IndexedSlices): - clipped_grads.append( - ops.IndexedSlices(grad.values * factor, grad.indices, - grad.dense_shape)) + # LARC gradient re-scaling + if larc_params is not None: + check_params( + config=larc_params, + required_dict={'larc_eta': float}, + optional_dict={ + 'larc_mode': ['clip', 'scale'], + 'min_update': float, + 'epsilon': float + }, + ) + larc_eta = larc_params['larc_eta'] + larc_mode = larc_params.get('larc_mode', 'clip') + min_update = larc_params.get('min_update', 1e-7) + eps = larc_params.get('epsilon', 1e-7) + + grads_and_vars_larc = [None] * len(grads_and_vars) + for idx, (g, v) in enumerate(grads_and_vars): + var_dtype = v.dtype + v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2) + g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2) + + if larc_mode == 'clip': + larc_grad_update = tf.maximum( + larc_eta * v_norm / (lr * (g_norm + eps)), + min_update, + ) + if "larc_summaries" in summaries: + tf.summary.scalar('larc_clip_on/{}'.format(v.name), + tf.cast(tf.less(larc_grad_update, 1.0), tf.int32)) + larc_grad_update = tf.minimum(larc_grad_update, 1.0) else: - clipped_grads.append(grad * factor) - - return list(zip(clipped_grads, variables)) + larc_grad_update = tf.maximum( + larc_eta * v_norm / (g_norm 
+ eps), + min_update, + ) + larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype) + grads_and_vars_larc[idx] = (larc_grad_update * g, v) + + # adding additional summary + if "larc_summaries" in summaries: + tf.summary.scalar('larc_grad_update/{}'.format(v.name), + larc_grad_update) + tf.summary.scalar("larc_final_lr/{}".format(v.name), + tf.cast(lr, var_dtype) * larc_grad_update) + grads_and_vars = grads_and_vars_larc + return grads_and_vars + + +def _global_norm_with_cast(grads_and_vars): + return tf.global_norm(list(map( + lambda x: tf.cast(x, tf.float32), + list(zip(*grads_and_vars))[0]) + )) - return gradient_clipping - -def _add_scaled_noise_to_gradients(grads_and_vars, gradient_noise_scale): - """Adds scaled noise from a 0-mean normal distribution to gradients.""" +def _clip_gradients_by_norm(grads_and_vars, clip_gradients): + """Clips gradients by global norm.""" gradients, variables = zip(*grads_and_vars) - noisy_gradients = [] - for gradient in gradients: - if gradient is None: - noisy_gradients.append(None) - continue - if isinstance(gradient, ops.IndexedSlices): - gradient_shape = gradient.dense_shape - else: - gradient_shape = gradient.get_shape() - noise = random_ops.truncated_normal(gradient_shape) * gradient_noise_scale - noisy_gradients.append(gradient + noise) - return list(zip(noisy_gradients, variables)) - - -def _multiply_gradients(grads_and_vars, gradient_multipliers): - """Multiply specified gradients.""" - multiplied_grads_and_vars = [] - for grad, var in grads_and_vars: - if grad is not None and \ - (var in gradient_multipliers or var.name in gradient_multipliers): - key = var if var in gradient_multipliers else var.name - multiplier = constant_op.constant( - gradient_multipliers[key], dtype=dtypes.float32) - if isinstance(grad, ops.IndexedSlices): - grad_values = grad.values * multiplier - grad = ops.IndexedSlices(grad_values, grad.indices, grad.dense_shape) - else: - grad *= multiplier - multiplied_grads_and_vars.append((grad, var)) - return multiplied_grads_and_vars - + clipped_gradients, _ = tf.clip_by_global_norm(gradients, clip_gradients) + return list(zip(clipped_gradients, variables)) diff --git a/open_seq2seq/optimizers/optimizers_test.py b/open_seq2seq/optimizers/optimizers_test.py new file mode 100644 index 000000000..bf386edf9 --- /dev/null +++ b/open_seq2seq/optimizers/optimizers_test.py @@ -0,0 +1,84 @@ +# Copyright (c) 2017 NVIDIA Corporation +from __future__ import absolute_import, division, print_function +from __future__ import unicode_literals +from six.moves import range + +import tensorflow as tf +import numpy as np +import numpy.testing as npt + +from open_seq2seq.optimizers import optimize_loss +from .lr_policies import fixed_lr + + +class IterSizeTests(tf.test.TestCase): + def setUp(self): + pass + + def tearDown(self): + pass + + def test_updates(self): + try: + import horovod.tensorflow as hvd + hvd.init() + except ImportError: + print("Horovod not installed skipping test_updates") + return + + dtype = tf.float32 + with tf.Graph().as_default() as g: + n_samples = 10 + n_hid = 10 + var_dtype = tf.float32 if dtype == tf.float32 else tf.float16 + + np.random.seed(0) + X = np.random.rand(n_samples, n_hid) + y = np.random.rand(n_samples, 1) + w = np.linalg.solve(X.T.dot(X), X.T.dot(y)) + + x_ph = tf.placeholder(var_dtype, [n_samples, n_hid]) + y_ph = tf.placeholder(var_dtype, [n_samples, 1]) + + y_pred = tf.layers.dense(x_ph, 1, use_bias=False) + loss = tf.losses.mean_squared_error(y_ph, y_pred) + loss += 
tf.losses.get_regularization_loss() + skip_update_ph = tf.placeholder(tf.bool) + iter_size = 8 + train_op = optimize_loss(loss, "SGD", {}, + lambda gs: fixed_lr(gs, 0.1), dtype=dtype, + iter_size=iter_size, on_horovod=True, + skip_update_ph=skip_update_ph) + grad_accum = [var for var in tf.global_variables() if 'accum' in var.name][0] + var = tf.trainable_variables()[0] + with self.test_session(g, use_gpu=True) as sess: + sess.run(tf.global_variables_initializer()) + for _ in range(3): + g, v = sess.run([grad_accum, var]) + npt.assert_allclose(g, np.zeros(g.shape)) + + true_g = 2 * (X.T.dot(X).dot(v) - X.T.dot(y)) / X.shape[0] / iter_size + + sess.run(train_op, {x_ph: X, y_ph: y, skip_update_ph: True}) + g_new, v_new = sess.run([grad_accum, var]) + npt.assert_allclose(g_new, true_g, atol=1e-7) + npt.assert_allclose(v_new, v) + + sess.run(train_op, {x_ph: X, y_ph: y, skip_update_ph: True}) + g_new, v_new = sess.run([grad_accum, var]) + npt.assert_allclose(g_new, true_g * 2, atol=1e-7) + npt.assert_allclose(v_new, v) + + sess.run(train_op, {x_ph: X, y_ph: y, skip_update_ph: True}) + g_new, v_new = sess.run([grad_accum, var]) + npt.assert_allclose(g_new, true_g * 3, atol=1e-7) + npt.assert_allclose(v_new, v) + + sess.run(train_op, {x_ph: X, y_ph: y, skip_update_ph: False}) + g_new, v_new = sess.run([grad_accum, var]) + npt.assert_allclose(g_new, np.zeros(g.shape)) + npt.assert_allclose(v_new, v - 0.1 * true_g * 4, atol=1e-7) + + +if __name__ == '__main__': + tf.test.main() diff --git a/open_seq2seq/parts/cnns/__init__.py b/open_seq2seq/parts/cnns/__init__.py new file mode 100644 index 000000000..856829f6e --- /dev/null +++ b/open_seq2seq/parts/cnns/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2018 NVIDIA Corporation diff --git a/open_seq2seq/parts/cnns/conv_blocks.py b/open_seq2seq/parts/cnns/conv_blocks.py new file mode 100644 index 000000000..31087d6e9 --- /dev/null +++ b/open_seq2seq/parts/cnns/conv_blocks.py @@ -0,0 +1,92 @@ +# Copyright (c) 2018 NVIDIA Corporation +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from six.moves import range + +import tensorflow as tf + + +def conv_actv(type, name, inputs, filters, kernel_size, activation_fn, strides, + padding, regularizer, training, data_format): + """Helper function that applies convolution and activation. + + Args: + type: the following types are supported + 'conv1d', 'conv2d' + """ + if type == "conv1d": + layer = tf.layers.conv1d + elif type == "conv2d": + layer = tf.layers.conv2d + + conv = layer( + name="{}".format(name), + inputs=inputs, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + kernel_regularizer=regularizer, + use_bias=False, + data_format=data_format, + ) + + output = conv + if activation_fn is not None: + output = activation_fn(output) + return output + + +def conv_bn_actv(type, name, inputs, filters, kernel_size, activation_fn, + strides, padding, regularizer, training, data_format, + bn_momentum, bn_epsilon): + """Helper function that applies convolution, batch norm and activation. + Accepts inputs in 'channels_last' format only. 
+ + Args: + type: the following types are supported + 'conv1d', 'conv2d' + """ + if type == "conv1d": + layer = tf.layers.conv1d + elif type == "conv2d": + layer = tf.layers.conv2d + + conv = layer( + name="{}".format(name), + inputs=inputs, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + kernel_regularizer=regularizer, + use_bias=False, + data_format=data_format, + ) + + # trick to make batchnorm work for mixed precision training. + # To-Do check if batchnorm works smoothly for >4 dimensional tensors + squeeze = False + if type == "conv1d": + conv = tf.expand_dims(conv, axis=1) # NWC --> NHWC + squeeze = True + + bn = tf.layers.batch_normalization( + name="{}/bn".format(name), + inputs=conv, + gamma_regularizer=regularizer, + training=training, + axis=-1 if data_format == 'channels_last' else 1, + momentum=bn_momentum, + epsilon=bn_epsilon, + ) + + if squeeze: + bn = tf.squeeze(bn, axis=1) + + output = bn + if activation_fn is not None: + output = activation_fn(output) + return output diff --git a/open_seq2seq/parts/convs2s/__init__.py b/open_seq2seq/parts/convs2s/__init__.py new file mode 100644 index 000000000..f6874261b --- /dev/null +++ b/open_seq2seq/parts/convs2s/__init__.py @@ -0,0 +1,3 @@ +from . import ffn_wn_layer +from . import conv_wn_layer +from . import attention_wn_layer diff --git a/open_seq2seq/parts/convs2s/attention_wn_layer.py b/open_seq2seq/parts/convs2s/attention_wn_layer.py new file mode 100644 index 000000000..89d9c3c6e --- /dev/null +++ b/open_seq2seq/parts/convs2s/attention_wn_layer.py @@ -0,0 +1,90 @@ +"""Implementation of the attention layer for convs2s. +Inspired from https://github.com/tobyyouup/conv_seq2seq""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import tensorflow as tf +import math +from open_seq2seq.parts.convs2s.ffn_wn_layer import FeedFowardNetworkNormalized + + +class AttentionLayerNormalized(tf.layers.Layer): + """Attention layer for convs2s with weight normalization""" + + def __init__(self, in_dim, embed_size, layer_id, add_res): + """initializes the attention layer. + It uses weight normalization for linear projections + (Salimans & Kingma, 2016) w = g * v/2-norm(v) + + Args: + in_dim: int last dimension of the inputs + embed_size: int target embedding size + layer_id: int the id of current convolution layer + add_res: bool whether residual connection should be added or not + """ + super(AttentionLayerNormalized, self).__init__() + + self.add_res = add_res + with tf.variable_scope("attention_layer_" + str(layer_id)): + + # linear projection layer to project the attention input to target space + self.tgt_embed_proj = FeedFowardNetworkNormalized( + in_dim, + embed_size, + dropout=1.0, + var_scope_name="att_linear_mapping_tgt_embed") + + # linear projection layer to project back to the input space + self.out_proj = FeedFowardNetworkNormalized( + embed_size, + in_dim, + dropout=1.0, + var_scope_name="att_linear_mapping_out") + + def call(self, input, target_embed, encoder_output_a, encoder_output_b, + input_attention_bias): + """Calculates the attention vectors. 
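# --- Illustrative sketch (not part of the patch): the score/normalization math used below, in NumPy ---
# call() below computes d = (proj(x) + target_embed) * sqrt(0.5),
# scores = softmax(d @ keys^T + bias), and output = (scores @ values) * L * sqrt(1/L),
# where L is the source length of encoder_output_b.
import numpy as np

def convs2s_attention(d_proj, keys, values, attention_bias=None):
    scores = np.matmul(d_proj, keys.transpose(0, 2, 1))      # [batch, tgt_len, src_len]
    if attention_bias is not None:
        scores = scores + attention_bias                      # large negative bias masks padding
    scores = np.exp(scores - scores.max(axis=-1, keepdims=True))
    scores = scores / scores.sum(axis=-1, keepdims=True)      # softmax over source positions
    src_len = values.shape[1]
    return np.matmul(scores, values) * src_len * np.sqrt(1.0 / src_len)

d = np.random.rand(1, 4, 8)   # projected decoder states combined with target embeddings
k = np.random.rand(1, 6, 8)   # encoder_output_a (keys)
v = np.random.rand(1, 6, 8)   # encoder_output_b (values)
print(convs2s_attention(d, k, v).shape)   # (1, 4, 8)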
+ + Args: + input: A float32 tensor with shape [batch_size, length, in_dim] + target_embed: A float32 tensor with shape [batch_size, length, in_dim] + containing the target embeddings + encoder_output_a: A float32 tensor with shape [batch_size, length, out_dim] + containing the first encoder outputs, uses as the keys + encoder_output_b: A float32 tensor with shape [batch_size, length, src_emb_dim] + containing the second encoder outputs, uses as the values + input_attention_bias: A float32 tensor with shape [batch_size, length, 1] + containing the bias used to mask the paddings + + Returns: + float32 tensor with shape [batch_size, length, out_dim]. + """ + + h_proj = self.tgt_embed_proj(input) + d_proj = (h_proj + target_embed) * math.sqrt(0.5) + att_score = tf.matmul(d_proj, encoder_output_a, transpose_b=True) + + # Masking need to be done in float32. Added to support mixed-precision training. + att_score = tf.cast(x=att_score, dtype=tf.float32) + + # mask out the paddings + if input_attention_bias is not None: + att_score = att_score + input_attention_bias + + att_score = tf.nn.softmax(att_score) + + # Cast back to original type + att_score = tf.cast(x=att_score, dtype=encoder_output_b.dtype) + + length = tf.cast(tf.shape(encoder_output_b), encoder_output_b.dtype) + output = tf.matmul(att_score, encoder_output_b) * \ + length[1] * tf.cast(tf.sqrt(1.0 / length[1]), dtype=encoder_output_b.dtype) + output = self.out_proj(output) + + if self.add_res: + output = (output + input) * math.sqrt(0.5) + + return output diff --git a/open_seq2seq/parts/convs2s/conv_wn_layer.py b/open_seq2seq/parts/convs2s/conv_wn_layer.py new file mode 100644 index 000000000..1c18a3b19 --- /dev/null +++ b/open_seq2seq/parts/convs2s/conv_wn_layer.py @@ -0,0 +1,103 @@ +"""Implementation of a 1d convolutional layer with weight normalization. +Inspired from https://github.com/tobyyouup/conv_seq2seq""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import tensorflow as tf +import math + + +class Conv1DNetworkNormalized(tf.layers.Layer): + """1D convolutional layer with weight normalization""" + + def __init__(self, in_dim, out_dim, kernel_width, mode, layer_id, + hidden_dropout, conv_padding, decode_padding): + """initializes the 1D convolution layer. + It uses weight normalization (Salimans & Kingma, 2016) w = g * v/2-norm(v) + + Args: + in_dim: int last dimension of the inputs + out_dim: int new dimension for the output + kernel_width: int width of kernel + mode: str the current mode + layer_id: int the id of current convolution layer + hidden_dropout: float the keep-dropout value used on the input. + Give 1.0 if no dropout. + It is used to initialize the weights of convolution. 
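# --- Illustrative sketch (not part of the patch): gated linear units, in NumPy ---
# gated_linear_units() below splits the 2*out_dim convolution output in half along
# the channel axis and multiplies the first half by a sigmoid of the second half.
import numpy as np

def glu(x):                                  # x: [batch, length, 2 * out_dim]
    half = x.shape[-1] // 2
    gate = 1.0 / (1.0 + np.exp(-x[..., half:]))
    return x[..., :half] * gate

x = np.ones((2, 5, 8), dtype=np.float32)
print(glu(x).shape)                          # (2, 5, 4)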
+ conv_padding: str the type of padding done for convolution + decode_padding: bool specifies if this convolution layer is in decoder or not + in decoder padding is done explicitly before convolution + """ + + super(Conv1DNetworkNormalized, self).__init__() + self.mode = mode + self.conv_padding = conv_padding + self.decode_padding = decode_padding + self.hidden_dropout = hidden_dropout + self.kernel_width = kernel_width + + with tf.variable_scope("conv_layer_" + str(layer_id)): + V_std = math.sqrt(4.0 * hidden_dropout / (kernel_width * in_dim)) + self.V = tf.get_variable( + 'V', + shape=[kernel_width, in_dim, 2 * out_dim], + initializer=tf.random_normal_initializer(mean=0, stddev=V_std), + trainable=True) + self.V_norm = tf.norm(self.V.initialized_value(), axis=[0, 1]) + self.g = tf.get_variable('g', initializer=self.V_norm, trainable=True) + self.b = tf.get_variable( + 'b', + shape=[2 * out_dim], + initializer=tf.zeros_initializer(), + trainable=True) + + self.W = tf.reshape(self.g, [1, 1, 2 * out_dim]) * tf.nn.l2_normalize( + self.V, [0, 1]) + + def call(self, input): + """Applies convolution with gated linear units on x. + + Args: + x: A float32 tensor with shape [batch_size, length, in_dim] + + Returns: + float32 tensor with shape [batch_size, length, out_dim]. + """ + x = input + if self.mode == "train": + x = tf.nn.dropout(x, self.hidden_dropout) + + if self.decode_padding: + x = tf.pad( + x, [[0, 0], [self.kernel_width - 1, self.kernel_width - 1], [0, 0]], + "CONSTANT") + + output = tf.nn.bias_add( + tf.nn.conv1d( + value=x, filters=self.W, stride=1, padding=self.conv_padding), + self.b) + + if self.decode_padding and self.kernel_width > 1: + output = output[:, 0:-self.kernel_width + 1, :] + + output = self.gated_linear_units(output) + + return output + + def gated_linear_units(self, inputs): + """Gated Linear Units (GLU) on x. + + Args: + x: A float32 tensor with shape [batch_size, length, 2*out_dim] + Returns: + float32 tensor with shape [batch_size, length, out_dim]. + """ + input_shape = inputs.get_shape().as_list() + assert len(input_shape) == 3 + input_pass = inputs[:, :, 0:int(input_shape[2] / 2)] + input_gate = inputs[:, :, int(input_shape[2] / 2):] + input_gate = tf.sigmoid(input_gate) + return tf.multiply(input_pass, input_gate) diff --git a/open_seq2seq/parts/convs2s/ffn_wn_layer.py b/open_seq2seq/parts/convs2s/ffn_wn_layer.py new file mode 100644 index 000000000..27da7a159 --- /dev/null +++ b/open_seq2seq/parts/convs2s/ffn_wn_layer.py @@ -0,0 +1,68 @@ +"""Implementation of fully connected network with weight normalization. +Inspired from https://github.com/tobyyouup/conv_seq2seq""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import tensorflow as tf +import math + + +class FeedFowardNetworkNormalized(tf.layers.Layer): + """Fully connected feedforward network with weight normalization""" + + def __init__(self, in_dim, out_dim, dropout, var_scope_name): + """initializes the linear layer. + This layer projects from in_dim-dimenstional space to out_dim-dimentional space. + It uses weight normalization (Salimans & Kingma, 2016) w = g * v/2-norm(v) + + Args: + in_dim: int last dimension of the inputs + out_dim: int new dimension for the output + dropout: float the keep-dropout value used in the previous layer. + It is used to initialize the weights. Give 1.0 if no dropout. 
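# --- Illustrative sketch (not part of the patch): the weight-normalization identity, in NumPy ---
# FeedFowardNetworkNormalized (and the conv layer above) reparameterize weights as
# w = g * v / ||v||_2, with the norm taken over the input axis and g initialized to
# that norm, so w equals v right after initialization.
import numpy as np

rng = np.random.RandomState(0)
v = rng.randn(64, 128).astype(np.float32)     # [in_dim, out_dim]
g = np.linalg.norm(v, axis=0)                 # initial value of the gain g
w = v * (g / np.linalg.norm(v, axis=0))       # direction from v, scale from g
print(np.allclose(w, v))                      # True at initialization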
+ var_scope_name: str the scope name for the weight variables + """ + super(FeedFowardNetworkNormalized, self).__init__() + self.out_dim = out_dim + self.in_dim = in_dim + + with tf.variable_scope(var_scope_name): + V_initializer = \ + tf.random_normal_initializer(mean=0, stddev=math.sqrt(dropout * 1.0 / in_dim)) + self.V = tf.get_variable( + 'V', + shape=[in_dim, out_dim], + initializer=V_initializer, + trainable=True) + self.V_norm = tf.norm(self.V.initialized_value(), axis=0) + self.g = tf.get_variable('g', initializer=self.V_norm, trainable=True) + self.b = tf.get_variable( + 'b', + shape=[out_dim], + initializer=tf.zeros_initializer(), + trainable=True) + + def call(self, x): + """Projects x with its linear transformation. + + Args: + x: A float32 tensor with shape [batch_size, length, in_dim] + + Returns: + float32 tensor with shape [batch_size, length, out_dim]. + """ + batch_size = tf.shape(x)[0] + + x = tf.reshape(x, [-1, self.in_dim]) + output = tf.matmul(x, self.V) + output = tf.reshape(output, [batch_size, -1, self.out_dim]) + + # x*(v*(g/2-norm(v))) + b + scaler = tf.div(self.g, tf.norm(self.V, axis=0)) + output = tf.reshape(scaler, [1, self.out_dim]) * output + \ + tf.reshape(self.b, [1, self.out_dim]) + + return output diff --git a/open_seq2seq/parts/rnns/utils.py b/open_seq2seq/parts/rnns/utils.py index a16bab936..53e011e64 100644 --- a/open_seq2seq/parts/rnns/utils.py +++ b/open_seq2seq/parts/rnns/utils.py @@ -11,134 +11,29 @@ import tensorflow as tf -def create_rnn_cell(cell_type, - cell_params, - num_layers=1, - dp_input_keep_prob=1.0, - dp_output_keep_prob=1.0, - residual_connections=False, - wrap_to_multi_rnn=True): - """ - TODO: MOVE THIS properly to utils. Write doc - :param cell_type: - :param cell_params: - :param num_layers: - :param dp_input_keep_prob: - :param dp_output_keep_prob: - :param residual_connections: - :return: - """ - def single_cell(cell_params): - # TODO: This method is ugly - redo - size = cell_params["num_units"] - proj_size = None if "proj_size" not in cell_params else cell_params["proj_size"] +def single_cell(cell_class, + cell_params, + dp_input_keep_prob=1.0, + dp_output_keep_prob=1.0, + residual_connections=False): + """Creates an instance of the rnn cell. 
+ Such cell describes one step one layer and can include residual connection + and/or dropout - if cell_type == "lstm": - if not residual_connections: - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return tf.nn.rnn_cell.LSTMCell(num_units=size, - num_proj=proj_size, - forget_bias=1.0) - else: - return DropoutWrapper(tf.nn.rnn_cell.LSTMCell(num_units=size, - num_proj=proj_size, - forget_bias=1.0), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob) - else: # residual connection required - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return ResidualWrapper(tf.nn.rnn_cell.LSTMCell(num_units=size, - num_proj=proj_size, - forget_bias=1.0)) - else: - return ResidualWrapper(DropoutWrapper( - tf.nn.rnn_cell.LSTMCell( - num_units=size, - num_proj=proj_size, - forget_bias=1.0, - ), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob, - )) - elif cell_type == "gru": - if not residual_connections: - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return tf.nn.rnn_cell.GRUCell(num_units=size) - else: - return DropoutWrapper( - tf.nn.rnn_cell.GRUCell(num_units=size), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob, - ) - else: # residual connection required - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return ResidualWrapper(tf.nn.rnn_cell.GRUCell(num_units=size)) - else: - return ResidualWrapper(DropoutWrapper( - tf.nn.rnn_cell.GRUCell(num_units=size), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob), - ) - elif cell_type == "glstm": - num_groups = cell_params["num_groups"] - if not residual_connections: - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return GLSTMCell(num_units=size, - number_of_groups=num_groups, - num_proj=proj_size, - forget_bias=1.0) - else: - return DropoutWrapper(GLSTMCell(num_units=size, - number_of_groups=num_groups, - num_proj=proj_size, - forget_bias=1.0), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob) - else: # residual connection required - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return ResidualWrapper(GLSTMCell(num_units=size, - number_of_groups=num_groups, - num_proj=proj_size, - forget_bias=1.0)) - else: - return ResidualWrapper(DropoutWrapper( - GLSTMCell( - num_units=size, - number_of_groups=num_groups, - num_proj=proj_size, - forget_bias=1.0, - ), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob, - )) - elif cell_type == "slstm": - if not residual_connections: - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return BasicSLSTMCell(num_units=size) - else: - return DropoutWrapper(BasicSLSTMCell(num_units=size), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob - ) - else: # residual connection required - if dp_input_keep_prob == 1.0 and dp_output_keep_prob == 1.0: - return ResidualWrapper(BasicSLSTMCell(num_units=size)) - else: - return ResidualWrapper(DropoutWrapper( - BasicSLSTMCell(num_units=size), - input_keep_prob=dp_input_keep_prob, - output_keep_prob=dp_output_keep_prob, - )) - else: - raise ValueError("Unknown RNN cell class: {}".format(cell_type)) + Args: + cell_class: Tensorflow RNN cell class + cell_params (dict): cell parameters + dp_input_keep_prob (float): (default: 1.0) input dropout keep probability + dp_output_keep_prob (float): (default: 1.0) output dropout keep probability + residual_connections (bool): whether to add residual connection 
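# --- Illustrative usage sketch (not part of the patch); cell class and parameter values are hypothetical ---
# With the refactored helper below, the caller picks the TF cell class, and stacking
# into a MultiRNNCell (formerly handled by create_rnn_cell) is done at the call site.
import tensorflow as tf
from open_seq2seq.parts.rnns.utils import single_cell

cells = [
    single_cell(
        cell_class=tf.nn.rnn_cell.LSTMCell,
        cell_params={"num_units": 512, "forget_bias": 1.0},
        dp_output_keep_prob=0.8,          # wraps the cell in DropoutWrapper
        residual_connections=(i > 0),     # wraps upper layers in ResidualWrapper
    )
    for i in range(3)
]
stacked_cell = tf.nn.rnn_cell.MultiRNNCell(cells)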
- if num_layers > 1: - if wrap_to_multi_rnn: - return MultiRNNCell([single_cell(cell_params) for _ in range(num_layers)]) - else: - cells = [] # for GNMT-like attention in decoder - for i in range(num_layers): - cells.append(single_cell(cell_params)) - return cells - else: - return single_cell(cell_params) + Returns: + TF RNN instance + """ + cell = cell_class(**cell_params) + if residual_connections: + cell = ResidualWrapper(cell) + if dp_input_keep_prob != 1.0 or dp_output_keep_prob != 1.0: + cell = DropoutWrapper(cell, input_keep_prob=dp_input_keep_prob, + output_keep_prob=dp_output_keep_prob) + return cell diff --git a/open_seq2seq/parts/transformer/embedding_layer.py b/open_seq2seq/parts/transformer/embedding_layer.py index a4fef6147..0966cdde7 100644 --- a/open_seq2seq/parts/transformer/embedding_layer.py +++ b/open_seq2seq/parts/transformer/embedding_layer.py @@ -26,23 +26,31 @@ class EmbeddingSharedWeights(tf.layers.Layer): """Calculates input embeddings and pre-softmax linear with shared weights.""" - def __init__(self, vocab_size, hidden_size, pad2eight=False): + def __init__(self, vocab_size, hidden_size, pad_vocab_to_eight=False, init_var=None, + embed_scale=True, pad_sym=0, mask_paddings=True): super(EmbeddingSharedWeights, self).__init__() - self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.embed_scale = embed_scale + self.pad_sym = pad_sym + self.mask_paddings = mask_paddings + padf = lambda x: x if x % 8 == 0 else x + 8 - x % 8 - if pad2eight: - self.hidden_size = padf(hidden_size) + if pad_vocab_to_eight: + self.vocab_size = padf(vocab_size) + else: + self.vocab_size = vocab_size + + if init_var is None: + self.init_var = hidden_size ** -0.5 else: - self.hidden_size = hidden_size + self.init_var = init_var def build(self, _): with tf.variable_scope("embedding_and_softmax", reuse=tf.AUTO_REUSE): # Create and initialize weights. The random normal initializer was chosen # randomly, and works well. 
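# --- Illustrative sketch (not part of the patch) ---
# The padf lambda above rounds the vocabulary size up to a multiple of 8 when
# pad_vocab_to_eight is set; the motivation (e.g. friendlier matrix shapes for
# mixed-precision GEMMs) is an assumption, not something the patch states.
padf = lambda x: x if x % 8 == 0 else x + 8 - x % 8
assert padf(32) == 32
assert padf(33) == 40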
- self.shared_weights = tf.get_variable( - "weights", [self.vocab_size, self.hidden_size], - initializer=tf.random_normal_initializer( - 0., self.hidden_size ** -0.5)) + self.shared_weights = tf.get_variable("weights", [self.vocab_size, self.hidden_size], + initializer=tf.random_normal_initializer(0., self.init_var)) self.built = True @@ -58,18 +66,18 @@ def call(self, x): """ with tf.name_scope("embedding"): embeddings = tf.gather(self.shared_weights, x) - - # Scale embedding by the sqrt of the hidden size - embeddings *= self.hidden_size ** 0.5 - - # Create binary array of size [batch_size, length] - # where 1 = padding, 0 = not padding - padding = model_utils.get_padding(x) - - # Set all padding embedding values to 0 - #embeddings *= tf.expand_dims(1 - padding, -1) - embeddings *= tf.cast(tf.expand_dims(1 - padding, -1), - dtype=embeddings.dtype) + if self.embed_scale: + # Scale embedding by the sqrt of the hidden size + embeddings *= self.hidden_size ** 0.5 + + if self.mask_paddings: + # Create binary array of size [batch_size, length] + # where 1 = padding, 0 = not padding + padding = model_utils.get_padding(x, padding_value=self.pad_sym) + + # Set all padding embedding values to 0 + #embeddings *= tf.expand_dims(1 - padding, -1) + embeddings *= tf.cast(tf.expand_dims(1.0 - padding, -1), dtype=embeddings.dtype) return embeddings def linear(self, x): diff --git a/open_seq2seq/parts/transformer/utils.py b/open_seq2seq/parts/transformer/utils.py index 467ff319a..cef61af1d 100644 --- a/open_seq2seq/parts/transformer/utils.py +++ b/open_seq2seq/parts/transformer/utils.py @@ -75,23 +75,23 @@ def get_decoder_self_attention_bias(length): return decoder_bias -def get_padding(x, padding_value=0): +def get_padding(x, padding_value=0, dtype=tf.float32): """Return float tensor representing the padding values in x. Args: x: int tensor with any shape padding_value: int value that + dtype: type of the output Returns: flaot tensor with same shape as x containing values 0 or 1. 0 -> non-padding, 1 -> padding """ with tf.name_scope("padding"): - return tf.to_float(tf.equal(x, padding_value)) - #return tf.cast(tf.equal(x, padding_value), dtype=x.dtype) + return tf.cast(tf.equal(x, padding_value), dtype=dtype) -def get_padding_bias(x): +def get_padding_bias(x, res_rank=4, pad_sym=0): """Calculate bias tensor from padding values in tensor. Bias tensor that is added to the pre-softmax multi-headed attention logits, @@ -100,14 +100,22 @@ def get_padding_bias(x): Args: x: int tensor with shape [batch_size, length] + res_rank: int indicates the rank of attention_bias. + dtype: type of the output attention_bias + pad_sym: int the symbol used for padding Returns: - Attention bias tensor of shape [batch_size, 1, 1, length]. 
+ Attention bias tensor of shape + [batch_size, 1, 1, length] if res_rank = 4 - for Transformer + or [batch_size, 1, length] if res_rank = 3 - for ConvS2S """ with tf.name_scope("attention_bias"): - padding = get_padding(x) + padding = get_padding(x, padding_value=pad_sym) attention_bias = padding * _NEG_INF - attention_bias = tf.expand_dims( - tf.expand_dims(attention_bias, axis=1), axis=1) + if res_rank == 4: + attention_bias = tf.expand_dims(tf.expand_dims(attention_bias, axis=1), axis=1) + elif res_rank == 3: + attention_bias = tf.expand_dims(attention_bias, axis=1) + else: + raise ValueError("res_rank should be 3 or 4 but got {}".format(res_rank)) return attention_bias - diff --git a/open_seq2seq/test_utils/test_speech_configs/__init__.py b/open_seq2seq/test_utils/test_speech_configs/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/open_seq2seq/test_utils/test_speech_config.py b/open_seq2seq/test_utils/test_speech_configs/ds2_test_config.py similarity index 97% rename from open_seq2seq/test_utils/test_speech_config.py rename to open_seq2seq/test_utils/test_speech_configs/ds2_test_config.py index 3cf326a17..7f1c48e26 100644 --- a/open_seq2seq/test_utils/test_speech_config.py +++ b/open_seq2seq/test_utils/test_speech_configs/ds2_test_config.py @@ -13,7 +13,7 @@ base_params = { "random_seed": 0, "use_horovod": False, - "num_epochs": 111, + "num_epochs": 150, "num_gpus": 1, "batch_size_per_gpu": 10, @@ -32,7 +32,6 @@ "lr_policy_params": { "learning_rate": 0.001, "power": 2, - "decay_steps": 500, }, "larc_params": { "larc_eta": 0.001, @@ -71,7 +70,7 @@ }, "activation_fn": lambda x: tf.minimum(tf.nn.relu(x), 20.0), "data_format": "channels_first", - "bn_momentum": 0.1, + "bn_momentum": 0.001, }, "decoder": FullyConnectedCTCDecoder, diff --git a/open_seq2seq/test_utils/test_speech_configs/w2l_test_config.py b/open_seq2seq/test_utils/test_speech_configs/w2l_test_config.py new file mode 100644 index 000000000..7709154d9 --- /dev/null +++ b/open_seq2seq/test_utils/test_speech_configs/w2l_test_config.py @@ -0,0 +1,103 @@ +from __future__ import absolute_import, division, print_function +import tensorflow as tf +from open_seq2seq.models import Speech2Text +from open_seq2seq.encoders import Wave2LetterEncoder +from open_seq2seq.decoders import FullyConnectedCTCDecoder +from open_seq2seq.data import Speech2TextDataLayer +from open_seq2seq.losses import CTCLoss +from open_seq2seq.optimizers.lr_policies import poly_decay + + +base_model = Speech2Text + +base_params = { + "random_seed": 0, + "use_horovod": False, + "num_epochs": 200, + + "num_gpus": 1, + "batch_size_per_gpu": 10, + "save_summaries_steps": 10, + "print_loss_steps": 10, + "print_samples_steps": 20, + "eval_steps": 50, + "save_checkpoint_steps": 50, + "logdir": "tmp_log_folder", + + "optimizer": "Momentum", + "optimizer_params": { + "momentum": 0.90, + }, + "lr_policy": poly_decay, + "lr_policy_params": { + "learning_rate": 0.001, + "power": 2, + "decay_steps": 500, + }, + "larc_params": { + "larc_eta": 0.001, + }, + "dtype": tf.float32, + "summaries": ['learning_rate', 'variables', 'gradients', 'larc_summaries', + 'variable_norm', 'gradient_norm', 'global_gradient_norm'], + + "encoder": Wave2LetterEncoder, + "encoder_params": { + "convnet_layers": [ + { + "type": "conv1d", "repeat" : 3, + "kernel_size": [7], "stride": [1], + "num_channels": 200, "padding": "SAME" + }, + { + "type": "conv1d", "repeat" : 1, + "kernel_size": [1], "stride": [1], + "num_channels": 400, "padding": "SAME" #n_hidden = num_channels + }, + 
], + + "dropout_keep_prob": 0.9, + + "initializer": tf.contrib.layers.xavier_initializer, + "initializer_params": { + 'uniform': False, + }, + "activation_fn": lambda x: tf.minimum(tf.nn.relu(x), 20.0), + "data_format": "channels_last", + "bn_momentum": 0.001, + }, + + "decoder": FullyConnectedCTCDecoder, + "decoder_params": { + "initializer": tf.contrib.layers.xavier_initializer, + "use_language_model": False, + }, + "loss": CTCLoss, + "loss_params": {}, +} + +train_params = { + "data_layer": Speech2TextDataLayer, + "data_layer_params": { + "num_audio_features": 40, + "input_type": "logfbank", + "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + "dataset_files": [ + "open_seq2seq/test_utils/toy_speech_data/toy_data.csv", + ], + "shuffle": True, + }, +} + +eval_params = { + "data_layer": Speech2TextDataLayer, + "data_layer_params": { + "num_audio_features": 40, + "input_type": "logfbank", + "vocab_file": "open_seq2seq/test_utils/toy_speech_data/vocab.txt", + "dataset_files": [ + "open_seq2seq/test_utils/toy_speech_data/toy_data.csv", + ], + "shuffle": False, + }, +} diff --git a/open_seq2seq/utils/funcs.py b/open_seq2seq/utils/funcs.py index ba8a5d069..de3bf96ae 100644 --- a/open_seq2seq/utils/funcs.py +++ b/open_seq2seq/utils/funcs.py @@ -9,7 +9,8 @@ from .hooks import PrintSamplesHook, RunEvaluationHook, PrintLossAndTimeHook, \ BroadcastGlobalVariablesHook -from open_seq2seq.utils.utils import deco_print, get_results_for_epoch +from open_seq2seq.utils.utils import deco_print, get_results_for_epoch, \ + collect_if_horovod from tensorflow.python import debug as tf_debug @@ -117,7 +118,17 @@ def train(train_model, eval_model=None, debug_port=None): break tm = time.time() try: - fetches_vals = sess.run(fetches) + feed_dict = {} + iter_size = train_model.params.get('iter_size', 1) + if iter_size > 1: + feed_dict[train_model.skip_update_ph] = step % iter_size != 0 + if step % iter_size == 0: + fetches_vals = sess.run(fetches, feed_dict) + else: + # necessary to skip "no-update" steps when iter_size > 1 + def run_with_no_hooks(step_context): + return step_context.session.run(fetches, feed_dict) + fetches_vals = sess.run_step_fn(run_with_no_hooks) except tf.errors.OutOfRangeError: break if step >= bench_start: @@ -125,29 +136,29 @@ def train(train_model, eval_model=None, debug_port=None): if len(fetches) > 1: for i in range(train_model.num_gpus): total_objects += np.sum(fetches_vals[i + 1]) + if train_model.params['print_bench_info_steps'] is not None: + if step % train_model.params['print_bench_info_steps'] == 0: + total_objects_cur = collect_if_horovod(total_objects, hvd, + mode="sum") + if master_worker: + avg_objects = 1.0 * total_objects_cur / total_time + deco_print("Avg objects per second: {:.3f}".format(avg_objects)) + step += 1 - if hvd is not None: - deco_print("Finished training on rank {}".format(hvd.rank())) - else: - deco_print("Finished training") + if len(fetches) > 1: + total_objects = collect_if_horovod(total_objects, hvd, mode="sum") - if train_model.on_horovod: - ending = " on worker {}".format(hvd.rank()) - else: - ending = "" - if step > bench_start: - deco_print( - "Avg time per step{}: {:.3f}s".format( - ending, 1.0 * total_time / (step - bench_start)) - ) - if len(fetches) > 1: - deco_print( - "Avg objects per second{}: {:.3f}".format( - ending, 1.0 * total_objects / total_time) - ) - else: - deco_print("Not enough steps for benchmarking{}".format(ending)) + if master_worker: + deco_print("Finished training") + if step > bench_start: + avg_time = 1.0 * 
total_time / (step - bench_start) + deco_print("Avg time per step: {:.3f}s".format(avg_time)) + if len(fetches) > 1: + avg_objects = 1.0 * total_objects / total_time + deco_print("Avg objects per second: {:.3f}".format(avg_objects)) + else: + deco_print("Not enough steps for benchmarking") def restore_and_get_results(model, checkpoint, mode): diff --git a/open_seq2seq/utils/hooks.py b/open_seq2seq/utils/hooks.py index b841d3945..037ba4346 100644 --- a/open_seq2seq/utils/hooks.py +++ b/open_seq2seq/utils/hooks.py @@ -89,10 +89,10 @@ def after_run(self, run_context, run_values): self._timer.update_last_triggered_step(self._iter_count - 1) input_values, output_values = results - dict_to_log = self._model.maybe_print_logs(input_values, output_values) + dict_to_log = self._model.maybe_print_logs(input_values, output_values, step) # optionally logging to tensorboard any values # returned from maybe_print_logs - if dict_to_log: + if self._model.params['save_summaries_steps'] and dict_to_log: log_summaries_from_dict( dict_to_log, self._model.params['logdir'], @@ -193,11 +193,12 @@ def after_run(self, run_context, run_values): if not self._model.on_horovod or self._model.hvd.rank() == 0: deco_print("Validation loss: {:.4f}".format(total_loss), offset=4) - dict_to_log = self._model.finalize_evaluation(results_per_batch) + dict_to_log = self._model.finalize_evaluation(results_per_batch, step) dict_to_log['eval_loss'] = total_loss # saving the best validation model - if total_loss < self._best_eval_loss: + if self._model.params['save_checkpoint_steps'] and \ + total_loss < self._best_eval_loss: self._best_eval_loss = total_loss self._eval_saver.save( run_context.session, @@ -208,7 +209,7 @@ def after_run(self, run_context, run_values): # optionally logging to tensorboard any values # returned from maybe_print_logs - if dict_to_log: + if self._model.params['save_summaries_steps']: log_summaries_from_dict( dict_to_log, self._model.params['logdir'], diff --git a/open_seq2seq/utils/utils.py b/open_seq2seq/utils/utils.py index e02eae74a..5900783c8 100644 --- a/open_seq2seq/utils/utils.py +++ b/open_seq2seq/utils/utils.py @@ -4,6 +4,7 @@ from six.moves import range from six import string_types +import six import tensorflow as tf import subprocess import numpy as np @@ -31,6 +32,42 @@ def clip_sparse(value, size): dense_shape_clipped) +def collect_if_horovod(value, hvd, mode='sum'): + """Collects values from all workers if run on Horovod. + Note, that on all workers except first this function will return None. + + Args: + value: value to collect. + hvd: horovod.tensorflow module or None + mode: could be "sum", "mean" or "gather", indicating reduce_sum or gather. + For "sum" and "mean" value has to be numerical, for "gather", value has + to be iterable. + + Returns: + collected results if run on Horovod or value otherwise. 
+ """ + if hvd is None: + return value + + import mpi4py.rc + mpi4py.rc.initialize = False + from mpi4py import MPI + + values = MPI.COMM_WORLD.gather(value) + # synchronize all workers + MPI.COMM_WORLD.Barrier() + + if MPI.COMM_WORLD.Get_rank() != 0: + return None + + if mode == 'sum': + return np.sum(values) + elif mode == 'mean': + return np.mean(values) + elif mode == 'gather': + return [item for sl in values for item in sl] + + def clip_last_batch(last_batch, true_size): last_batch_clipped = [] for val in last_batch: @@ -41,179 +78,172 @@ def clip_last_batch(last_batch, true_size): return last_batch_clipped -def iterate_data_layer(model, dl_id, sess, compute_loss, mode, verbose): +def iterate_data(model, sess, compute_loss, mode, verbose): total_time = 0.0 bench_start = model.params.get('bench_start', 10) results_per_batch = [] - if model.on_horovod: - data_layer = model.get_data_layer() - if compute_loss: - loss_tensor = model.eval_losses[0] - output_tensors = model.get_output_tensors() - else: - data_layer = model.get_data_layer(dl_id) - if compute_loss: - loss_tensor = model.eval_losses[dl_id] - output_tensors = model.get_output_tensors(dl_id) - - sess.run(data_layer.iterator.initializer) - - fetches = [ - data_layer.input_tensors, - output_tensors, - ] + size_defined = model.get_data_layer().get_size_in_samples() is not None + if size_defined: + dl_sizes = [] if compute_loss: - fetches.append(loss_tensor) total_loss = 0.0 - total_samples = 0.0 - size_defined = data_layer.get_size_in_samples() is not None + total_samples = [] + fetches = [] - if size_defined: - data_size = data_layer.get_size_in_samples() // \ - data_layer.params['batch_size'] - last_batch_size = data_layer.get_size_in_samples() % \ - data_layer.params['batch_size'] + # on horovod num_gpus is 1 + for worker_id in range(model.num_gpus): + cur_fetches = [ + model.get_data_layer(worker_id).input_tensors, + model.get_output_tensors(worker_id), + ] + if compute_loss: + cur_fetches.append(model.eval_losses[worker_id]) + if size_defined: + dl_sizes.append(model.get_data_layer(worker_id).get_size_in_samples()) + try: + total_objects = 0.0 + cur_fetches.append(model.get_num_objects_per_step(worker_id)) + except NotImplementedError: + total_objects = None + deco_print("WARNING: Can't compute number of objects per step, since " + "train model does not define get_num_objects_per_step method.") + fetches.append(cur_fetches) + total_samples.append(0.0) + + sess.run([model.get_data_layer(i).iterator.initializer + for i in range(model.num_gpus)]) + + step = 0 + processed_batches = 0 + if verbose: + if model.on_horovod: + ending = " on worker {}".format(model.hvd.rank()) + else: + ending = "" - if model.on_horovod: - worker_id = model.hvd.rank() - else: - worker_id = dl_id + while True: + tm = time.time() + fetches_vals = {} + if size_defined: + fetches_to_run = {} + # removing finished data layers + for worker_id in range(model.num_gpus): + if total_samples[worker_id] < dl_sizes[worker_id]: + fetches_to_run[worker_id] = fetches[worker_id] + fetches_vals = sess.run(fetches_to_run) + else: + # if size is not defined we have to process fetches sequentially, so not + # to lose data when exception is thrown on one data layer + for worker_id, one_fetch in enumerate(fetches): + try: + fetches_vals[worker_id] = sess.run(one_fetch) + except tf.errors.OutOfRangeError: + continue - cross_over = 0 - if size_defined: - if data_size == 0: - raise ValueError( - "Batch size is bigger than dataset size: {} > {}".format( - 
data_layer.params['batch_size'], data_layer.get_size_in_samples() - ) - ) - if last_batch_size != 0: - cross_over = 1 - else: - # setting data_size to be infinity and assume - # that tf.errors.OutOfRangeError will be raised - data_size = 1000000000000 + if step >= bench_start: + total_time += time.time() - tm - for step in range(data_size + cross_over): - tm = time.time() - try: + # looping over num_gpus. In Horovod case this loop is "dummy", + # since num_gpus = 1 + for worker_id, fetches_val in fetches_vals.items(): if compute_loss: - inputs, outputs, loss = sess.run(fetches) + inputs, outputs, loss = fetches_val[:3] else: - inputs, outputs = sess.run(fetches) - except tf.errors.OutOfRangeError: - break - if step >= bench_start: - total_time += time.time() - tm + inputs, outputs = fetches_val[:2] - # assuming any element of inputs["source_tensors"][ shape[0] is batch size - batch_size = inputs["source_tensors"][0].shape[0] + if total_objects is not None: + total_objects += np.sum(fetches_val[-1]) - if compute_loss: - total_loss += loss * batch_size - total_samples += batch_size + # assuming any element of inputs["source_tensors"] .shape[0] is batch size + batch_size = inputs["source_tensors"][0].shape[0] + total_samples[worker_id] += batch_size - if size_defined and step == data_size: - inputs["source_tensors"] = model.clip_last_batch( - inputs["source_tensors"], last_batch_size, - ) - if 'target_tensors' in inputs: - inputs["target_tensors"] = model.clip_last_batch( - inputs["target_tensors"], last_batch_size, - ) - outputs = model.clip_last_batch(outputs, last_batch_size) - - if mode == 'eval': - results_per_batch.append(model.evaluate(inputs, outputs)) - elif mode == 'infer': - results_per_batch.append(model.infer(inputs, outputs)) - else: - raise ValueError("Unknown mode: {}".format(mode)) + if size_defined: + # this data_layer is at the last batch with few more elements, cutting + if total_samples[worker_id] > dl_sizes[worker_id]: + last_batch_size = dl_sizes[worker_id] % batch_size + for key, value in inputs.items(): + inputs[key] = model.clip_last_batch(value, last_batch_size) + outputs = model.clip_last_batch(outputs, last_batch_size) + + processed_batches += 1 + + if compute_loss: + total_loss += loss * batch_size + + if mode == 'eval': + results_per_batch.append(model.evaluate(inputs, outputs)) + elif mode == 'infer': + results_per_batch.append(model.infer(inputs, outputs)) + else: + raise ValueError("Unknown mode: {}".format(mode)) if verbose: if size_defined: - if data_size > 10 and step % (data_size // 10) == 0: - deco_print("Processed {}/{} batches on worker {}".format( - step + 1, data_size, worker_id)) + data_size = int(np.sum(np.ceil(np.array(dl_sizes) / + model.params['batch_size_per_gpu']))) + if step == 0 or len(fetches_vals) == 0 or \ + (data_size > 10 and processed_batches % (data_size // 10) == 0): + deco_print("Processed {}/{} batches{}".format( + processed_batches, data_size, ending)) else: - deco_print("Processed {} batches".format(step + 1), end='\r') + deco_print("Processed {} batches{}".format(processed_batches, ending), + end='\r') + + if len(fetches_vals) == 0: + break + step += 1 if verbose: if step > bench_start: deco_print( - "Avg time per step: {:.3}s on worker {}".format( - 1.0 * total_time / (step - bench_start), worker_id), + "Avg time per step{}: {:.3}s".format( + ending, 1.0 * total_time / (step - bench_start)), ) + if total_objects is not None: + avg_objects = 1.0 * total_objects / total_time + deco_print("Avg objects per second{}: 
{:.3f}".format(ending, + avg_objects)) else: deco_print( - "Not enough steps for benchmarking on worker {}".format(worker_id) + "Not enough steps for benchmarking{}".format(ending) ) if compute_loss: - return results_per_batch, total_loss, total_samples + return results_per_batch, total_loss, np.sum(total_samples) else: return results_per_batch def get_results_for_epoch(model, sess, compute_loss, mode, verbose=False): - if model.on_horovod: - if compute_loss: - results_per_batch, total_loss, total_samples = iterate_data_layer( - model, 0, sess, compute_loss, mode, verbose, - ) - else: - results_per_batch = iterate_data_layer( - model, 0, sess, compute_loss, mode, verbose, - ) + if compute_loss: + results_per_batch, total_loss, total_samples = iterate_data( + model, sess, compute_loss, mode, verbose, + ) else: - results_per_batch_all = [] - total_loss_all = [] - total_samples_all = [] - for dl_id in range(model.num_gpus): - if compute_loss: - results_per_batch, total_loss, total_samples = iterate_data_layer( - model, dl_id, sess, compute_loss, mode, verbose, - ) - total_loss_all.append(total_loss) - total_samples_all.append(total_samples) - else: - results_per_batch = iterate_data_layer( - model, dl_id, sess, compute_loss, mode, verbose, - ) - results_per_batch_all.append(results_per_batch) + results_per_batch = iterate_data( + model, sess, compute_loss, mode, verbose, + ) - if model.on_horovod: - import mpi4py.rc - mpi4py.rc.initialize = False - from mpi4py import MPI + if compute_loss: + total_samples = collect_if_horovod(total_samples, model.hvd, 'sum') + total_loss = collect_if_horovod(total_loss, model.hvd, 'sum') + results_per_batch = collect_if_horovod(results_per_batch, model.hvd, 'gather') + if results_per_batch is None: + # returning dummy tuple of correct shape if not in master worker if compute_loss: - total_samples_all = MPI.COMM_WORLD.gather(total_samples) - total_loss_all = MPI.COMM_WORLD.gather(total_loss) - results_per_batch_all = MPI.COMM_WORLD.gather(results_per_batch) - - MPI.COMM_WORLD.Barrier() - if MPI.COMM_WORLD.Get_rank() != 0: - # returning dummy tuple of correct shape - if compute_loss: - return None, None - else: - return None - - if compute_loss: - total_loss = np.sum(total_loss_all) - total_samples = np.sum(total_samples_all) - # moving GPU dimension into the batch dimension - results_per_batch = [item for sl in results_per_batch_all for item in sl] + return None, None + else: + return None if compute_loss: - total_loss /= total_samples - return results_per_batch, total_loss - - return results_per_batch + return results_per_batch, total_loss / total_samples + else: + return results_per_batch def log_summaries_from_dict(dict_to_log, output_dir, step): @@ -286,7 +316,14 @@ def nest_dict(flat_dict): def nested_update(org_dict, upd_dict): for key, value in upd_dict.items(): if isinstance(value, dict): - nested_update(org_dict[key], value) + if key in org_dict: + if not isinstance(org_dict[key], dict): + raise ValueError( + "Mismatch between org_dict and upd_dict at node {}".format(key) + ) + nested_update(org_dict[key], value) + else: + org_dict[key] = value else: org_dict[key] = value @@ -299,7 +336,10 @@ def mask_nans(x): def deco_print(line, offset=0, start="*** ", end='\n'): - print(start + " " * offset + line, end=end) + if six.PY2: + print((start + " " * offset + line).encode('utf-8'), end=end) + else: + print(start + " " * offset + line, end=end) def array_to_string(row, vocab, delim=' '): diff --git a/open_seq2seq/utils/utils_test.py 
diff --git a/open_seq2seq/utils/utils_test.py b/open_seq2seq/utils/utils_test.py
index 65f345d2f..3ab3709ce 100644
--- a/open_seq2seq/utils/utils_test.py
+++ b/open_seq2seq/utils/utils_test.py
@@ -9,14 +9,15 @@
 import numpy as np
 import numpy.testing as npt
 
-from open_seq2seq.test_utils.test_speech_config import base_params, \
-    train_params, \
-    eval_params, \
-    base_model
+from open_seq2seq.test_utils.test_speech_configs.ds2_test_config import base_params, \
+    train_params, \
+    eval_params, \
+    base_model
 from open_seq2seq.utils.utils import get_results_for_epoch, get_available_gpus
 
 
 class UtilsTests(tf.test.TestCase):
+
   def setUp(self):
     base_params['logdir'] = tempfile.mktemp()
     self.train_config = copy.deepcopy(base_params)
@@ -27,7 +28,7 @@ def setUp(self):
   def tearDown(self):
     pass
 
-  def test_get_batches_for_epoch(self):
+  def test_get_results_for_epoch(self):
     # this will take all gpu memory, but that's probably fine for tests
     gpus = get_available_gpus()
     length_list = []
@@ -40,16 +41,19 @@ def test_get_batches_for_epoch(self):
       with tf.Graph().as_default() as g:
         self.eval_config['batch_size_per_gpu'] = bs
         self.eval_config['num_gpus'] = num_gpus
-        model = base_model(params=self.eval_config, mode="eval", hvd=None)
+        model = base_model(params=self.eval_config, mode="infer", hvd=None)
         model.compile()
-        model.evaluate = lambda inputs, outputs: inputs
-        model.finalize_evaluation = lambda results: results
+        model.infer = lambda inputs, outputs: inputs
+        model.finalize_inference = lambda results: results
 
         with self.test_session(g, use_gpu=True) as sess:
           sess.run(tf.global_variables_initializer())
-          inputs_per_batch = get_results_for_epoch(model, sess, False, "eval")
-          length_list.append(np.hstack([inp['source_tensors'][1]
-                                        for inp in inputs_per_batch]))
+          inputs_per_batch = get_results_for_epoch(
+              model, sess, False, "infer")
+          length = np.hstack([inp['source_tensors'][1]
+                              for inp in inputs_per_batch])
+          ids = np.hstack([inp['source_ids'] for inp in inputs_per_batch])
+          length_list.append(length[np.argsort(ids)])
 
     for i in range(len(length_list) - 1):
       npt.assert_allclose(length_list[i], length_list[i + 1])
diff --git a/requirements.txt b/requirements.txt
index 3f6411f5c..e02f835c9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,6 @@ numpy
 nltk==3.2.5
 resampy
 python_speech_features
-pandas
+pandas==0.23.0
 six
 mpi4py
diff --git a/run.py b/run.py
index ec8efe4d6..6597c7da6 100644
--- a/run.py
+++ b/run.py
@@ -3,9 +3,7 @@
 from __future__ import division
 from __future__ import print_function
 from __future__ import unicode_literals
-from six.moves import range
 
-import tensorflow as tf
 import datetime
 import argparse
 import ast
@@ -16,6 +14,11 @@
 import sys
 import shutil
 
+import tensorflow as tf
+from six.moves import range
+from six import string_types
+
+
 from open_seq2seq.utils.utils import deco_print, flatten_dict, \
     nest_dict, nested_update, get_git_diff, \
     get_git_hash, Logger
@@ -66,9 +69,10 @@ def main():
   # with command line arguments that were passed to the script
   parser_unk = argparse.ArgumentParser()
   for pm, value in flatten_dict(base_config).items():
-    if type(value) is int or type(value) is float or type(value) is str:
+    if type(value) == int or type(value) == float or \
+       isinstance(value, string_types):
       parser_unk.add_argument('--' + pm, default=value, type=type(value))
-    elif type(value) is bool:
+    elif type(value) == bool:
       parser_unk.add_argument('--' + pm, default=value, type=ast.literal_eval)
   config_update = parser_unk.parse_args(unknown)
   nested_update(base_config, nest_dict(vars(config_update)))
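The switch from "type(value) is str" to "isinstance(value, string_types)" in the hunk above matters mainly under Python 2, where a config value defined as a unicode literal is not of type str and would previously get no command-line override flag. A rough sketch of the selection logic; the helper name is invented for illustration and is not part of the patch:

import ast

from six import string_types


def override_flag_type(value):
  # Return the argparse `type` to use for a --param override, or None if the
  # parameter cannot be overridden from the command line (sketch only).
  # bool has to be checked before int here because isinstance(True, int) is
  # True; the patch reaches the same result by comparing type() directly.
  if isinstance(value, bool):
    return ast.literal_eval
  if isinstance(value, (int, float)) or isinstance(value, string_types):
    return type(value)
  return None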
@@ -97,8 +101,8 @@ def main():
         checkpoint = tf.train.latest_checkpoint(ckpt_dir)
         if checkpoint is None:
           raise IOError(
-            "There is no valid TensorFlow checkpoint in the "
-            "{} directory. Can't load model".format(ckpt_dir)
+              "There is no valid TensorFlow checkpoint in the "
+              "{} directory. Can't load model".format(ckpt_dir)
           )
       else:
         if args.continue_learning:
@@ -111,12 +115,14 @@ def main():
           checkpoint = tf.train.latest_checkpoint(ckpt_dir)
           if checkpoint is None:
             raise IOError(
-              "There is no valid TensorFlow checkpoint in the "
-              "{} directory. Can't load model".format(ckpt_dir)
+                "There is no valid TensorFlow checkpoint in the "
+                "{} directory. Can't load model".format(ckpt_dir)
             )
         else:
           raise IOError(
-            "{} does not exist or is empty, can't restore model".format(ckpt_dir)
+            "{} does not exist or is empty, can't restore model".format(
+                ckpt_dir
+            )
           )
   except IOError as e:
     if args.no_dir_check:
@@ -140,24 +146,26 @@ def main():
     tm_suf = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
 
     shutil.copy(
-      args.config_file,
-      os.path.join(logdir, 'config_{}.py'.format(tm_suf)),
+        args.config_file,
+        os.path.join(logdir, 'config_{}.py'.format(tm_suf)),
     )
 
-    with open(os.path.join(logdir, 'cmd-args_{}.log'.format(tm_suf)), 'w') as f:
+    with open(os.path.join(logdir, 'cmd-args_{}.log'.format(tm_suf)),
+              'w') as f:
       f.write(" ".join(sys.argv))
 
-    with open(os.path.join(logdir, 'git-info_{}.log'.format(tm_suf)), 'w') as f:
+    with open(os.path.join(logdir, 'git-info_{}.log'.format(tm_suf)),
+              'w') as f:
       f.write('commit hash: {}'.format(get_git_hash()))
       f.write(get_git_diff())
 
     old_stdout = sys.stdout
     old_stderr = sys.stderr
     stdout_log = open(
-      os.path.join(logdir, 'stdout_{}.log'.format(tm_suf)), 'a', 1
+        os.path.join(logdir, 'stdout_{}.log'.format(tm_suf)), 'a', 1
     )
     stderr_log = open(
-      os.path.join(logdir, 'stderr_{}.log'.format(tm_suf)), 'a', 1
+        os.path.join(logdir, 'stderr_{}.log'.format(tm_suf)), 'a', 1
     )
     sys.stdout = Logger(sys.stdout, stdout_log)
     sys.stderr = Logger(sys.stderr, stderr_log)
@@ -170,13 +178,13 @@ def main():
   if args.mode == 'train' or args.mode == 'train_eval':
     if 'train_params' in config_module:
-      train_config.update(copy.deepcopy(config_module['train_params']))
+      nested_update(train_config, copy.deepcopy(config_module['train_params']))
     if hvd is None or hvd.rank() == 0:
       deco_print("Training config:")
       pprint.pprint(train_config)
   if args.mode == 'eval' or args.mode == 'train_eval':
     if 'eval_params' in config_module:
-      eval_config.update(copy.deepcopy(config_module['eval_params']))
+      nested_update(eval_config, copy.deepcopy(config_module['eval_params']))
     if hvd is None or hvd.rank() == 0:
       deco_print("Evaluation config:")
       pprint.pprint(eval_config)
@@ -184,7 +192,9 @@ def main():
     if args.infer_output_file is None:
       raise ValueError("\"infer_output_file\" command line parameter is "
                        "required in inference mode")
-    infer_config.update(copy.deepcopy(config_module['infer_params']))
+    if "infer_params" in config_module:
+      nested_update(infer_config, copy.deepcopy(config_module['infer_params']))
+
     if hvd is None or hvd.rank() == 0:
       deco_print("Inference config:")
       pprint.pprint(infer_config)
@@ -216,7 +226,7 @@ def main():
         deco_print("Starting training from scratch")
       else:
         deco_print(
-          "Restored checkpoint from {}. Resuming training".format(checkpoint),
+            "Restored checkpoint from {}. Resuming training".format(checkpoint),
         )
   elif args.mode == 'eval' or args.mode == 'infer':
     if hvd is None or hvd.rank() == 0:
@@ -250,4 +260,4 @@ def main():
 
 
 if __name__ == '__main__':
-  main()
+  main()
\ No newline at end of file