From 09005c7b078f8da589258596639f078ff4a0e9a8 Mon Sep 17 00:00:00 2001
From: Alexis Asseman <33075224+aasseman@users.noreply.github.com>
Date: Wed, 24 Apr 2019 14:09:31 -0700
Subject: [PATCH 1/7] Fixed DataDefinition of RecurrentNeuralNetwork's output and input state streams

---
 ptp/components/models/recurrent_neural_network.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ptp/components/models/recurrent_neural_network.py b/ptp/components/models/recurrent_neural_network.py
index 053e4ef..21ab9dd 100644
--- a/ptp/components/models/recurrent_neural_network.py
+++ b/ptp/components/models/recurrent_neural_network.py
@@ -198,7 +198,7 @@ def input_data_definitions(self):
 
         # Input hidden state
         if self.initial_state == "Input":
-            d[self.key_input_state] = DataDefinition([-1, 2 if self.cell_type == 'LSTM' else 1, self.input_size, 1, self.hidden_size], [torch.tensor], "Batch of RNN last states")
+            d[self.key_input_state] = DataDefinition([2 if self.cell_type == 'LSTM' else 1, self.num_layers, -1, self.hidden_size], [torch.tensor], "Batch of RNN last states")
 
         return d
 
@@ -218,7 +218,7 @@ def output_data_definitions(self):
 
         # Output hidden state stream
         if self.output_last_state:
-            d[self.key_output_state] = DataDefinition([-1, 2 if self.cell_type == 'LSTM' else 1, self.input_size, 1, self.hidden_size], [torch.tensor], "Batch of RNN last states")
+            d[self.key_output_state] = DataDefinition([2 if self.cell_type == 'LSTM' else 1, self.num_layers, -1, self.hidden_size], [torch.tensor], "Batch of RNN last states")
 
         return d
 

From 0ec24fa9d68af6ce7a1c85de54f2f55163806e3c Mon Sep 17 00:00:00 2001
From: Alexis Asseman <33075224+aasseman@users.noreply.github.com>
Date: Wed, 24 Apr 2019 18:50:57 -0700
Subject: [PATCH 2/7] Added first prototype of Attn_Decoder, with dummy wikitext test

---
 .../components/models/attn_decoder_rnn.yml    |  76 ++++++
 ..._language_modeling_encoder_attndecoder.yml | 174 +++++++++++++
 ptp/components/models/__init__.py             |   2 +
 ptp/components/models/attn_decoder_rnn.py     | 236 ++++++++++++++++++
 4 files changed, 488 insertions(+)
 create mode 100644 configs/default/components/models/attn_decoder_rnn.yml
 create mode 100644 configs/wikitext/wikitext_language_modeling_encoder_attndecoder.yml
 create mode 100644 ptp/components/models/attn_decoder_rnn.py

diff --git a/configs/default/components/models/attn_decoder_rnn.yml b/configs/default/components/models/attn_decoder_rnn.yml
new file mode 100644
index 0000000..75971d4
--- /dev/null
+++ b/configs/default/components/models/attn_decoder_rnn.yml
@@ -0,0 +1,76 @@
+# This file defines the default values for the Attn_Decoder_RNN model.
+
+####################################################################
+# 1. CONFIGURATION PARAMETERS that will be LOADED by the component.
+####################################################################
+
+# Size of the hidden state (LOADED)
+hidden_size: 100
+
+# Whether to include the last hidden state in the outputs
+output_last_state: False
+
+# Type of recurrent cell (LOADED)
+# -> Only GRU is supported
+
+# Number of "stacked" layers (LOADED)
+num_layers: 1
+
+# Dropout rate (LOADED)
+# Default: 0 (means that it is turned off)
+dropout_rate: 0
+
+# Prediction mode (LOADED)
+# Options:
+# * Dense (passes every activation through output layer) |
+# * Last (passes only the last activation through output layer) |
+# * None (all outputs are discarded)
+prediction_mode: Dense
+
+# Enable FFN layer at the output of the RNN (before eventual feed back in the case of autoregression).
+# Useful if the raw outputs of the RNN are needed, for attention encoder-decoder for example.
+ffn_output: True
+
+autoregression_length: 42
+
+# If true, output of the last layer will be additionally processed with Log Softmax (LOADED)
+use_logsoftmax: True
+
+streams:
+  ####################################################################
+  # 2. Keymappings associated with INPUT and OUTPUT streams.
+  ####################################################################
+
+  # Stream containing batch of encoder outputs (INPUT)
+  inputs: inputs
+
+  # Stream containing the initial state of the RNN (INPUT)
+  # The stream will actually be created only if `initial_state: Input`
+  input_state: input_state
+
+  # Stream containing predictions (OUTPUT)
+  predictions: predictions
+
+  # Stream containing the final output state of the RNN (OUTPUT)
+  # The stream will actually be created only if `output_last_state: True`
+  output_state: output_state
+
+globals:
+  ####################################################################
+  # 3. Keymappings of variables that will be RETRIEVED from GLOBALS.
+  ####################################################################
+
+  # Size of the input (RETRIEVED)
+  input_size: input_size
+
+  # Size of the prediction (RETRIEVED)
+  prediction_size: prediction_size
+
+  ####################################################################
+  # 4. Keymappings associated with GLOBAL variables that will be SET.
+  ####################################################################
+
+  ####################################################################
+  # 5. Keymappings associated with statistics that will be ADDED.
+  ####################################################################
+
diff --git a/configs/wikitext/wikitext_language_modeling_encoder_attndecoder.yml b/configs/wikitext/wikitext_language_modeling_encoder_attndecoder.yml
new file mode 100644
index 0000000..2e78505
--- /dev/null
+++ b/configs/wikitext/wikitext_language_modeling_encoder_attndecoder.yml
@@ -0,0 +1,174 @@
+# This pipeline applies seq2seq on wikitext-2 to make word-level predictions.
+# It has been made for test purposes only, as it is doing:
+# [word 0 , ... , word 49] -> [word 1 , ... , word 50] (basically copying most of the input)
+#
+# The seq2seq here is implemented through the use of 2 `RecurrentNeuralNetwork` components.
+
+# Training parameters:
+training:
+  problem:
+    type: &p_type WikiTextLanguageModeling
+    data_folder: &data_folder ~/data/language_modeling/wikitext-2
+    dataset: &dataset wikitext-2
+    subset: train
+    sentence_length: 42
+    batch_size: 64
+
+  # optimizer parameters:
+  optimizer:
+    name: Adam
+    lr: 1.0e-3
+
+  # settings parameters
+  terminal_conditions:
+    loss_stop: 1.0e-2
+    episode_limit: 1000000
+    epoch_limit: 100
+
+# Validation parameters:
+validation:
+  partial_validation_interval: 100
+  problem:
+    type: *p_type
+    data_folder: *data_folder
+    dataset: *dataset
+    subset: valid
+    sentence_length: 42
+    batch_size: 64
+
+# Testing parameters:
+testing:
+  problem:
+    type: *p_type
+    data_folder: *data_folder
+    dataset: *dataset
+    subset: test
+    sentence_length: 42
+    batch_size: 64
+
+pipeline:
+  name: wikitext_language_modeling_encoder_attndecoder
+
+  # Source encoding - model 1.
+  source_sentence_embedding:
+    type: SentenceEmbeddings
+    priority: 1.1
+    embeddings_size: 50
+    pretrained_embeddings: glove.6B.50d.txt
+    data_folder: *data_folder
+    source_vocabulary_files: wiki.train.tokens,wiki.valid.tokens,wiki.test.tokens
+    vocabulary_mappings_file: wiki.all.tokenized_words
+    additional_tokens:
+    export_word_mappings_to_globals: True
+    streams:
+      inputs: sources
+      outputs: embedded_sources
+
+  # Target encoding.
+  target_indexer:
+    type: SentenceIndexer
+    priority: 2.1
+    data_folder: *data_folder
+    import_word_mappings_from_globals: True
+    streams:
+      inputs: targets
+      outputs: indexed_targets
+
+  # GRU Encoder
+  encoder:
+    type: RecurrentNeuralNetwork
+    cell_type: GRU
+    priority: 3
+    initial_state: Trainable
+    hidden_size: 50
+    num_layers: 1
+    use_logsoftmax: False
+    output_last_state: True
+    prediction_mode: Dense
+    ffn_output: False
+    streams:
+      inputs: embedded_sources
+      predictions: s2s_encoder_output
+      output_state: s2s_state_output
+    globals:
+      input_size: embeddings_size
+      prediction_size: embeddings_size
+
+  # Attention GRU Decoder
+  decoder:
+    type: Attn_Decoder_RNN
+    priority: 4
+    hidden_size: 50
+    num_layers: 1
+    use_logsoftmax: False
+    autoregression_length: 42
+    prediction_mode: Dense
+    streams:
+      inputs: s2s_encoder_output
+      predictions: s2s_decoder_output
+      input_state: s2s_state_output
+    globals:
+      input_size: embeddings_size
+      prediction_size: embeddings_size
+
+  # FF, to resize from the output size of the seq2seq to the size of the target vector
+  ff_resize_s2s_output:
+    type: FeedForwardNetwork
+    use_logsoftmax: True
+    dimensions: 3
+    priority: 5
+    streams:
+      inputs: s2s_decoder_output
+    globals:
+      input_size: embeddings_size
+      prediction_size: vocabulary_size
+
+  # Loss
+  nllloss:
+    type: NLLLoss
+    priority: 6
+    num_targets_dims: 2
+    streams:
+      targets: indexed_targets
+      loss: loss
+
+  # Prediction decoding.
+  prediction_decoder:
+    type: SentenceIndexer
+    priority: 10
+    # Reverse mode.
+    reverse: True
+    # Use distributions as inputs.
+    use_input_distributions: True
+    data_folder: *data_folder
+    import_word_mappings_from_globals: True
+    streams:
+      inputs: predictions
+      outputs: prediction_sentences
+
+
+  # Statistics.
+  batch_size:
+    type: BatchSizeStatistics
+    priority: 100.0
+
+  #accuracy:
+  #  type: AccuracyStatistics
+  #  priority: 100.1
+  #  streams:
+  #    targets: indexed_targets
+
+  bleu:
+    type: BLEUStatistics
+    priority: 100.2
+    streams:
+      targets: indexed_targets
+
+
+  # Viewers.
+ viewer: + type: StreamViewer + priority: 100.3 + input_streams: sources,targets,indexed_targets,prediction_sentences + +#: pipeline diff --git a/ptp/components/models/__init__.py b/ptp/components/models/__init__.py index 3451d2f..4b98b7e 100644 --- a/ptp/components/models/__init__.py +++ b/ptp/components/models/__init__.py @@ -7,6 +7,7 @@ from .recurrent_neural_network import RecurrentNeuralNetwork from .sentence_embeddings import SentenceEmbeddings from .seq2seq_rnn import Seq2Seq_RNN +from .attn_decoder_rnn import Attn_Decoder_RNN from .vqa.element_wise_multiplication import ElementWiseMultiplication from .vqa.multimodal_compact_bilinear_pooling import MultimodalCompactBilinearPooling @@ -23,4 +24,5 @@ 'Seq2Seq_RNN', 'ElementWiseMultiplication', 'MultimodalCompactBilinearPooling', + 'Attn_Decoder_RNN' ] diff --git a/ptp/components/models/attn_decoder_rnn.py b/ptp/components/models/attn_decoder_rnn.py new file mode 100644 index 0000000..b28f004 --- /dev/null +++ b/ptp/components/models/attn_decoder_rnn.py @@ -0,0 +1,236 @@ +# Copyright (C) Alexis Asseman, IBM Corporation 2019 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__author__ = "Alexis Asseman" + +import torch + +from ptp.configuration.configuration_error import ConfigurationError +from ptp.components.models.model import Model +from ptp.data_types.data_definition import DataDefinition + + +class Attn_Decoder_RNN(Model): + """ + Simple Classifier consisting of fully connected layer with log softmax non-linearity. + """ + def __init__(self, name, config): + """ + Initializes the model. + + :param config: Dictionary of parameters (read from configuration ``.yaml`` file). + :type config: ``ptp.configuration.ConfigInterface`` + """ + # Call constructors of parent classes. + Model.__init__(self, name, Attn_Decoder_RNN, config) + + # Get input/output mode + self.output_last_state = self.config["output_last_state"] + self.ffn_output = self.config["ffn_output"] + + # Get prediction mode from configuration. + self.prediction_mode = self.config["prediction_mode"] + if self.prediction_mode not in ['Dense','Last', 'None']: + raise ConfigurationError("Invalid 'prediction_mode' (current {}, available {})".format(self.prediction_mode, ['Dense','Last', 'None'])) + + self.autoregression_length = self.config["autoregression_length"] + + # Get number of layers from config. + self.num_layers = self.config["num_layers"] + + # Retrieve input size from global variables. + self.key_input_size = self.global_keys["input_size"] + self.input_size = self.globals["input_size"] + if type(self.input_size) == list: + if len(self.input_size) == 1: + self.input_size = self.input_size[0] + else: + raise ConfigurationError("RNN input size '{}' must be a single dimension (current {})".format(self.key_input_size, self.input_size)) + + # Retrieve output (prediction) size from global params. 
+        self.key_prediction_size = self.global_keys["prediction_size"]
+        self.prediction_size = self.globals["prediction_size"]
+        if type(self.prediction_size) == list:
+            if len(self.prediction_size) == 1:
+                self.prediction_size = self.prediction_size[0]
+            else:
+                raise ConfigurationError("RNN prediction size '{}' must be a single dimension (current {})".format(self.key_prediction_size, self.prediction_size))
+
+        # Retrieve hidden size from configuration.
+        self.hidden_size = self.config["hidden_size"]
+        if type(self.hidden_size) == list:
+            if len(self.hidden_size) == 1:
+                self.hidden_size = self.hidden_size[0]
+            else:
+                raise ConfigurationError("RNN hidden_size must be a single dimension (current {})".format(self.hidden_size))
+
+        # Get dropout rate value from config.
+        dropout_rate = self.config["dropout_rate"]
+
+        # Create dropout layer.
+        self.dropout = torch.nn.Dropout(dropout_rate)
+
+        # Create rnn cell.
+        self.rnn_cell = getattr(torch.nn, "GRU")(self.input_size, self.hidden_size, self.num_layers, dropout=dropout_rate, batch_first=True)
+
+        # Create layers for the attention
+        self.attn = torch.nn.Linear(self.hidden_size * 2, self.autoregression_length)
+        self.attn_combine = torch.nn.Linear(self.hidden_size * 2, self.hidden_size)
+
+        # Create the trainable initial input for the decoder (a trained start-of-sequence token of sorts)
+        self.sos_token = torch.zeros(1, self.input_size)
+        torch.nn.init.xavier_uniform_(self.sos_token)
+        self.sos_token = torch.nn.Parameter(self.sos_token, requires_grad=True)
+
+        # Get key mappings.
+        self.key_inputs = self.stream_keys["inputs"]
+        self.key_predictions = self.stream_keys["predictions"]
+        self.key_input_state = self.stream_keys["input_state"]
+        if self.output_last_state:
+            self.key_output_state = self.stream_keys["output_state"]
+
+        self.logger.info("Initializing RNN with input size = {}, hidden size = {} and prediction size = {}".format(self.input_size, self.hidden_size, self.prediction_size))
+
+        # Create the output layer.
+        self.activation2output_lin = None
+        if(self.ffn_output):
+            self.activation2output_lin = torch.nn.Linear(self.hidden_size, self.prediction_size)
+
+        # Create the final non-linearity.
+        self.use_logsoftmax = self.config["use_logsoftmax"]
+        if self.use_logsoftmax:
+            if self.prediction_mode == "Dense":
+                # Used when returning dense predictions, i.e. every output of the unfolded model.
+                self.log_softmax = torch.nn.LogSoftmax(dim=2)
+            else:
+                # Used when returning only the last output.
+                self.log_softmax = torch.nn.LogSoftmax(dim=1)
+
+    def activation2output(self, activations):
+        output = self.dropout(activations)
+
+        if(self.ffn_output):
+            #output = activations.squeeze(1)
+            shape = activations.shape
+
+            # Reshape to 2D tensor [BATCH_SIZE * SEQ_LEN x HIDDEN_SIZE]
+            output = output.contiguous().view(-1, shape[2])
+
+            # Propagate data through the output layer [BATCH_SIZE * SEQ_LEN x PREDICTION_SIZE]
+            output = self.activation2output_lin(output)
+            #output = output.unsqueeze(1)
+
+            # Reshape back to 3D tensor [BATCH_SIZE x SEQ_LEN x PREDICTION_SIZE]
+            output = output.view(shape[0], shape[1], output.size(1))
+
+        return output
+
+
+    def input_data_definitions(self):
+        """
+        Function returns a dictionary with definitions of input data that are required by the component.
+
+        :return: dictionary containing input data definitions (each of type :py:class:`ptp.utils.DataDefinition`).
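+
+        Note (shape convention): the ``input_state`` stream follows ``torch.nn.GRU``
+        conventions, i.e. it is shaped [NUM_LAYERS x BATCH_SIZE x HIDDEN_SIZE] and is
+        *not* batch-first, unlike the ``inputs`` stream.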
+ """ + d = {} + + d[self.key_inputs] = DataDefinition([-1, -1, self.hidden_size], [torch.Tensor], "Batch of encoder outputs [BATCH_SIZE x SEQ_LEN x INPUT_SIZE]") + + # Input hidden state + d[self.key_input_state] = DataDefinition([self.num_layers, -1, self.hidden_size], [torch.Tensor], "Batch of RNN last states") + + return d + + def output_data_definitions(self): + """ + Function returns a dictionary with definitions of output data produced the component. + + :return: dictionary containing output data definitions (each of type :py:class:`ptp.utils.DataDefinition`). + """ + d = {} + + if self.prediction_mode == "Dense": + d[self.key_predictions] = DataDefinition([-1, -1, self.prediction_size], [torch.Tensor], "Batch of predictions, each represented as probability distribution over classes [BATCH_SIZE x SEQ_LEN x PREDICTION_SIZE]") + elif self.prediction_mode == "Last": # "Last" + # Only last prediction. + d[self.key_predictions] = DataDefinition([-1, self.prediction_size], [torch.Tensor], "Batch of predictions, each represented as probability distribution over classes [BATCH_SIZE x SEQ_LEN x PREDICTION_SIZE]") + + # Output hidden state stream + if self.output_last_state: + d[self.key_output_state] = DataDefinition([self.num_layers, -1, self.hidden_size], [torch.Tensor], "Batch of RNN last states") + + return d + + def forward(self, data_dict): + """ + Forward pass of the model. + + :param data_dict: DataDict({'inputs', 'predictions ...}), where: + + - inputs: expected inputs [BATCH_SIZE x SEQ_LEN x INPUT_SIZE], + - predictions: returned output with predictions (log_probs) [BATCH_SIZE x SEQ_LEN x PREDICTION_SIZE] + """ + + inputs = data_dict[self.key_inputs] + batch_size = inputs.shape[0] + + # Initialize hidden state. + hidden = data_dict[self.key_input_state] + + + activations = [] + + # Autoregressive mode - feed back outputs in the input + activations_partial, hidden = self.rnn_cell(self.sos_token.expand(batch_size, -1).unsqueeze(1), hidden) + activations_partial = self.activation2output(activations_partial) + activations += [activations_partial] + + # Feed back the outputs iteratively + for i in range(self.autoregression_length - 1): + # Do the attention thing + attn_weights = torch.nn.functional.softmax( + self.attn(torch.cat((activations_partial.transpose(0, 1), hidden), 2)), + dim=2 + ) + + attn_applied = torch.bmm(attn_weights.transpose(0, 1), inputs) + + activations_partial = torch.cat((activations_partial, attn_applied), 2) + activations_partial = self.attn_combine(activations_partial) + activations_partial = torch.nn.functional.relu(activations_partial) + + # Fedd through the RNN + activations_partial, hidden = self.rnn_cell(activations_partial, hidden) + + activations_partial = self.activation2output(activations_partial) + + # Add the single step output into list + if self.prediction_mode == "Dense": + activations += [activations_partial] + # Reassemble all the outputs from list into an output sequence + if self.prediction_mode == "Dense": + outputs = torch.cat(activations, 1) + # Log softmax - along PREDICTION dim. + if self.use_logsoftmax: + outputs = self.log_softmax(outputs) + # Add predictions to datadict. + data_dict.extend({self.key_predictions: outputs}) + elif self.prediction_mode == "Last": + if self.use_logsoftmax: + outputs = self.log_softmax(activations_partial.squeeze(1)) + # Add predictions to datadict. 
+            data_dict.extend({self.key_predictions: outputs})
+
+
+        if self.output_last_state:
+            data_dict.extend({self.key_output_state: hidden})

From df60fb46454347746e1ae192b9e8f52a838870d8 Mon Sep 17 00:00:00 2001
From: Alexis Asseman <33075224+aasseman@users.noreply.github.com>
Date: Thu, 25 Apr 2019 18:33:07 -0700
Subject: [PATCH 3/7] Added translation problem

---
 .../text_to_text/translation_pairs.yml        |  48 ++++
 .../eng_fra_translation_enc_attndec.yml       | 179 ++++++++++++++
 .../problems/text_to_text/__init__.py         |   2 +
 .../text_to_text/translation_pairs.py         | 230 ++++++++++++++++++
 4 files changed, 459 insertions(+)
 create mode 100644 configs/default/components/problems/text_to_text/translation_pairs.yml
 create mode 100644 configs/translation/eng_fra_translation_enc_attndec.yml
 create mode 100644 ptp/components/problems/text_to_text/translation_pairs.py

diff --git a/configs/default/components/problems/text_to_text/translation_pairs.yml b/configs/default/components/problems/text_to_text/translation_pairs.yml
new file mode 100644
index 0000000..44c8e73
--- /dev/null
+++ b/configs/default/components/problems/text_to_text/translation_pairs.yml
@@ -0,0 +1,48 @@
+# This file defines the default values for the translation pairs problem.
+
+####################################################################
+# 1. CONFIGURATION PARAMETERS that will be LOADED by the component.
+####################################################################
+
+# Folder where problem will store data (LOADED)
+data_folder: ~/data/language_modeling/translation_pairs
+
+# Defines the dataset that will be used (LOADED)
+# Options: eng-fra
+dataset: eng-fra
+
+# Defines the used subset (LOADED)
+# Options: train | valid | test
+subset: train
+
+# Length of sentence (i.e. number of tokens in input and target sentences)
+sentence_length: 10
+
+streams:
+  ####################################################################
+  # 2. Keymappings associated with INPUT and OUTPUT streams.
+  ####################################################################
+
+  # Stream containing batch of indices (OUTPUT)
+  # Every problem MUST return that stream.
+  indices: indices
+
+  # Stream containing batch of tokenized source sentences (OUTPUT)
+  sources: sources
+
+  # Stream containing batch of tokenized target sentences (OUTPUT)
+  targets: targets
+
+globals:
+  ####################################################################
+  # 3. Keymappings of variables that will be RETRIEVED from GLOBALS.
+  ####################################################################
+
+  ####################################################################
+  # 4. Keymappings associated with GLOBAL variables that will be SET.
+  ####################################################################
+
+  ####################################################################
+  # 5. Keymappings associated with statistics that will be ADDED.
+  ####################################################################
+
diff --git a/configs/translation/eng_fra_translation_enc_attndec.yml b/configs/translation/eng_fra_translation_enc_attndec.yml
new file mode 100644
index 0000000..263af19
--- /dev/null
+++ b/configs/translation/eng_fra_translation_enc_attndec.yml
@@ -0,0 +1,179 @@
+# This pipeline applies seq2seq on wikitext-2 to make word-level prediction.
+# It's been made for test purposes only, as it is doing:
+# [word 0 , ... , word 49] -> [word 1 , ...
, word 50] (basically copying most of the input) +# +# The seq2seq here is implemented throught the use of 2 `RecurrentNeuralNetwork` + +# Training parameters: +training: + problem: + type: &p_type TranslationPairs + data_folder: &data_folder ~/data/language_modeling/translation_pairs + dataset: &dataset eng-fra + subset: train + sentence_length: 10 + batch_size: 64 + + # optimizer parameters: + optimizer: + name: Adam + lr: 1.0e-3 + + # settings parameters + terminal_conditions: + loss_stop: 1.0e-2 + episode_limit: 1000000 + epoch_limit: 100 + +# Validation parameters: +validation: + partial_validation_interval: 100 + problem: + type: *p_type + data_folder: *data_folder + dataset: *dataset + subset: valid + sentence_length: 10 + batch_size: 64 + +# Testing parameters: +testing: + problem: + type: *p_type + data_folder: *data_folder + dataset: *dataset + subset: test + sentence_length: 10 + batch_size: 64 + +pipeline: + name: eng_fra_translation_enc_attndec + + # Source encoding - model 1. + source_sentence_embedding: + type: SentenceEmbeddings + priority: 1.1 + embeddings_size: 50 + pretrained_embeddings: glove.6B.50d.txt + data_folder: *data_folder + source_vocabulary_files: eng-fra/eng.train.txt,eng-fra/eng.valid.txt,eng-fra/eng.test.txt + vocabulary_mappings_file: eng-fra/eng.all.tokenized_words + regenerate: True + additional_tokens: + import_word_mappings_from_globals: False + export_word_mappings_to_globals: False + streams: + inputs: sources + outputs: embedded_sources + + # Target encoding. + target_indexer: + type: SentenceIndexer + priority: 2.1 + data_folder: *data_folder + source_vocabulary_files: eng-fra/fra.train.txt,eng-fra/fra.valid.txt,eng-fra/fra.test.txt + import_word_mappings_from_globals: False + export_word_mappings_to_globals: True + regenerate: True + streams: + inputs: targets + outputs: indexed_targets + + # LSTM Encoder + encoder: + type: RecurrentNeuralNetwork + cell_type: GRU + priority: 3 + initial_state: Trainable + hidden_size: 50 + num_layers: 1 + use_logsoftmax: False + output_last_state: True + prediction_mode: Dense + ffn_output: False + streams: + inputs: embedded_sources + predictions: s2s_encoder_output + output_state: s2s_state_output + globals: + input_size: embeddings_size + prediction_size: embeddings_size + + # LSTM Decoder + decoder: + type: Attn_Decoder_RNN + priority: 4 + hidden_size: 50 + num_layers: 1 + use_logsoftmax: False + autoregression_length: 10 + prediction_mode: Dense + streams: + inputs: s2s_encoder_output + predictions: s2s_decoder_output + input_state: s2s_state_output + globals: + input_size: embeddings_size + prediction_size: embeddings_size + + # FF, to resize the from the output size of the seq2seq to the size of the target vector + ff_resize_s2s_output: + type: FeedForwardNetwork + use_logsoftmax: True + dimensions: 3 + priority: 5 + streams: + inputs: s2s_decoder_output + globals: + input_size: embeddings_size + prediction_size: vocabulary_size + + # Loss + nllloss: + type: NLLLoss + priority: 6 + num_targets_dims: 2 + streams: + targets: indexed_targets + loss: loss + + # Prediction decoding. + prediction_decoder: + type: SentenceIndexer + priority: 10 + # Reverse mode. + reverse: True + # Use distributions as inputs. + use_input_distributions: True + data_folder: *data_folder + import_word_mappings_from_globals: True + streams: + inputs: predictions + outputs: prediction_sentences + + + # Statistics. 
+  batch_size:
+    type: BatchSizeStatistics
+    priority: 100.0
+
+  #accuracy:
+  #  type: AccuracyStatistics
+  #  priority: 100.1
+  #  streams:
+  #    targets: indexed_targets
+
+  bleu:
+    type: BLEUStatistics
+    priority: 100.2
+    streams:
+      targets: indexed_targets
+
+
+  # Viewers.
+  viewer:
+    type: StreamViewer
+    priority: 100.3
+    input_streams: sources,targets,indexed_targets,prediction_sentences
+
+#: pipeline
diff --git a/ptp/components/problems/text_to_text/__init__.py b/ptp/components/problems/text_to_text/__init__.py
index be7cc00..804ae58 100644
--- a/ptp/components/problems/text_to_text/__init__.py
+++ b/ptp/components/problems/text_to_text/__init__.py
@@ -1,5 +1,7 @@
 from .wikitext_language_modeling import WikiTextLanguageModeling
+from .translation_pairs import TranslationPairs
 
 __all__ = [
     'WikiTextLanguageModeling',
+    'TranslationPairs'
     ]
diff --git a/ptp/components/problems/text_to_text/translation_pairs.py b/ptp/components/problems/text_to_text/translation_pairs.py
new file mode 100644
index 0000000..0e2af2b
--- /dev/null
+++ b/ptp/components/problems/text_to_text/translation_pairs.py
@@ -0,0 +1,230 @@
+# Copyright (C) tkornuta, IBM Corporation 2019
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__author__ = "Alexis Asseman"
+
+import os
+import random
+import tempfile
+import unicodedata
+import re
+
+from nltk.tokenize import WhitespaceTokenizer
+
+import ptp.components.utils.io as io
+from ptp.configuration import ConfigurationError
+from ptp.components.problems.problem import Problem
+from ptp.data_types.data_definition import DataDefinition
+
+
+class TranslationPairs(Problem):
+    """
+
+    """
+    def __init__(self, name, config):
+        """
+        The init method downloads the required files, loads the file associated with a given subset (train/valid/test),
+        concatenates all sentences and tokenizes them using NLTK's WhitespaceTokenizer.
+
+        It also stores the intermediate results, so for example, if the file with the tokenized set is found, it simply loads it.
+
+        :param name: Name of the component.
+
+        :param class_type: Class type of the component.
+
+        :param config: Dictionary of parameters (read from configuration ``.yaml`` file).
+        """
+        # Call constructor of parent classes.
+        Problem.__init__(self, name, TranslationPairs, config)
+
+        # Set streams key mappings.
+        self.key_sources = self.stream_keys["sources"]
+        self.key_targets = self.stream_keys["targets"]
+
+        # Get absolute path to data folder.
+        self.data_folder = os.path.expanduser(self.config['data_folder'])
+
+        # Get dataset.
+        if (self.config['dataset'] is None) or (self.config['dataset'] not in ["eng-fra", "eng-pol"]):
+            raise ConfigurationError("Problem supports only 'dataset' options: 'eng-fra', 'eng-pol'")
+        dataset = self.config['dataset']
+
+        # Get (sub)set: train/valid/test.
+        if (self.config['subset'] is None) or (self.config['subset'] not in ['train', 'valid', 'test']):
+            raise ConfigurationError("Problem supports only 'subset' options: 'train', 'valid', 'test'")
+        subset = self.config['subset']
+
+        # Check if file with tokenized words exists.
+        filename_tokenized_words = "translate_"+dataset+"."+self.config['subset']+".tokenized_words"
+
+
+        self.lang_source = self.config['dataset'].split('-')[0]
+        self.lang_target = self.config['dataset'].split('-')[1]
+
+
+        # Names of files used by this problem.
+        filenames = [
+            self.lang_source + ".train.txt",
+            self.lang_target + ".train.txt",
+            self.lang_source + ".valid.txt",
+            self.lang_target + ".valid.txt",
+            self.lang_source + ".test.txt",
+            self.lang_target + ".test.txt"
+        ]
+
+        # Initialize dataset if files do not exist.
+        if not io.check_files_existence(os.path.join(self.data_folder, dataset), filenames):
+            # Set url and source filename depending on dataset.
+            url = "https://www.manythings.org/anki/" + self.lang_target + "-" + self.lang_source + ".zip"
+            zipfile_name = "translate_" + self.lang_target + "_" + self.lang_source + ".zip"
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                # Download and extract the translation zip.
+                io.download_extract_zip_file(self.logger, tmpdirname, url, zipfile_name)
+
+                # Create train, valid, test files from the downloaded file
+                lines = io.load_string_list_from_txt_file(tmpdirname, self.lang_target + ".txt")
+
+                # Shuffle the lines
+                random.seed(42)
+                random.shuffle(lines)
+
+                # Split each tab-separated line into a source and a target sentence
+                lines_source = [self.normalizeString(l.split('\t')[0]) for l in lines]
+                lines_target = [self.normalizeString(l.split('\t')[1]) for l in lines]
+
+                # Cut dataset into train (90%), valid (5%), test (5%) files
+                test_mark = len(lines) // 20
+                valid_mark = test_mark + (len(lines) // 20)
+
+                os.makedirs(os.path.join(self.data_folder, dataset), exist_ok=True)
+
+                with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_source + ".test.txt"), mode='w+') as f:
+                    f.write('\n'.join(lines_source[0:test_mark]))
+                with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_target + ".test.txt"), mode='w+') as f:
+                    f.write('\n'.join(lines_target[0:test_mark]))
+
+                with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_source + ".valid.txt"), mode='w+') as f:
+                    f.write('\n'.join(lines_source[test_mark:valid_mark]))
+                with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_target + ".valid.txt"), mode='w+') as f:
+                    f.write('\n'.join(lines_target[test_mark:valid_mark]))
+
+                with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_source + ".train.txt"), mode='w+') as f:
+                    f.write('\n'.join(lines_source[valid_mark:]))
+                with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_target + ".train.txt"), mode='w+') as f:
+                    f.write('\n'.join(lines_target[valid_mark:]))
+
+        else:
+            self.logger.info("Files {} found in folder '{}'".format(filenames, self.data_folder))
+
+
+        # Load the lines
+        lines_source = io.load_string_list_from_txt_file(os.path.join(self.data_folder, dataset), self.lang_source + "."+subset+".txt")
+        lines_target = io.load_string_list_from_txt_file(os.path.join(self.data_folder, dataset), self.lang_target + "."+subset+".txt")
+
+        # Get the required sample length.
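+        # (Pairs in which either sentence exceeds this limit are filtered out below.)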
+        self.sentence_length = self.config['sentence_length']
+
+        # Separate into src - tgt sentence pairs + tokenize
+        tokenizer = WhitespaceTokenizer()
+        self.sentences_source = []
+        self.sentences_target = []
+        for s_src, s_tgt in zip(lines_source, lines_target):
+            src = tokenizer.tokenize(s_src)
+            tgt = tokenizer.tokenize(s_tgt)
+            # Keep only the pairs that are shorter or equal to the requested length
+            if len(src) <= self.sentence_length and len(tgt) <= self.sentence_length:
+                self.sentences_source += [src]
+                self.sentences_target += [tgt]
+
+        self.logger.info("Loaded text consisting of {} sentences".format(len(self.sentences_source)))
+
+        # Calculate the size of dataset.
+        self.dataset_length = len(self.sentences_source)
+
+        # Display exemplary sample.
+        self.logger.info("Exemplary sample:\n  source: {}\n  target: {}".format(self.sentences_source[0], self.sentences_target[0]))
+
+
+    def output_data_definitions(self):
+        """
+        Function returns a dictionary with definitions of output data produced by the component.
+
+        :return: dictionary containing output data definitions (each of type :py:class:`ptp.utils.DataDefinition`).
+        """
+        return {
+            self.key_indices: DataDefinition([-1, 1], [list, int], "Batch of sample indices [BATCH_SIZE] x [1]"),
+            self.key_sources: DataDefinition([-1, self.sentence_length, 1], [list, list, str], "Batch of input sentences, each consisting of several words [BATCH_SIZE] x [SENTENCE_LENGTH] x [string]"),
+            self.key_targets: DataDefinition([-1, self.sentence_length, 1], [list, list, str], "Batch of target sentences, each consisting of several words [BATCH_SIZE] x [SENTENCE_LENGTH] x [string]")
+            }
+
+    # Turn a Unicode string to plain ASCII, thanks to
+    # https://stackoverflow.com/a/518232/2809427
+    @staticmethod
+    def unicodeToAscii(s):
+        return ''.join(
+            c for c in unicodedata.normalize('NFD', s)
+            if unicodedata.category(c) != 'Mn'
+        )
+
+    # Lowercase, trim, and remove non-letter characters
+    # https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
+    def normalizeString(self, s):
+        s = self.unicodeToAscii(s.lower().strip())
+        s = re.sub(r"([.!?])", r" \1", s)
+        s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
+        return s
+
+    def __len__(self):
+        """
+        Returns the "size" of the "problem" (total number of samples).
+
+        :return: The size of the problem.
+        """
+        return self.dataset_length
+
+
+    def __getitem__(self, index):
+        """
+        Getter method to access the dataset and return a sample.
+
+        :param index: index of the sample to return.
+        :type index: int
+
+        :return: ``DataDict({'indices', 'sources', 'targets'})``
+
+        """
+        # Return data_dict.
+        data_dict = self.create_data_dict(index)
+        data_dict[self.key_sources] = self.sentences_source[index]
+        data_dict[self.key_targets] = self.sentences_target[index]
+        return data_dict
+
+    def collate_fn(self, batch):
+        """
+        Generates a batch of samples from a list of individual samples retrieved by :py:func:`__getitem__`.
+
+        :param batch: List of :py:class:`ptp.utils.DataDict` retrieved by :py:func:`__getitem__`
+        :type batch: list
+
+        :return: DataDict containing the created batch.
+
+        """
+        # Collate indices.
+        data_dict = self.create_data_dict([sample[self.key_indices] for sample in batch])
+        # Collate sources.
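+        # Sources/targets are kept as lists of token lists ([BATCH_SIZE] x [SENTENCE_LENGTH]);
+        # indexing/embedding and padding into tensors happen in downstream components.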
+ data_dict[self.key_sources] = [sample[self.key_sources] for sample in batch] + data_dict[self.key_targets] = [sample[self.key_targets] for sample in batch] + return data_dict + From 04a42d68c90a310e9b7dbd342e231d778ad5154e Mon Sep 17 00:00:00 2001 From: Alexis Asseman <33075224+aasseman@users.noreply.github.com> Date: Thu, 25 Apr 2019 19:42:48 -0700 Subject: [PATCH 4/7] Add fixed padding option to sentence_embeddings, sentence_indexer --- .../components/models/sentence_embeddings.yml | 6 ++++ .../components/text/sentence_indexer.yml | 6 ++++ ptp/components/models/sentence_embeddings.py | 9 +++++ ptp/components/text/sentence_indexer.py | 33 +++++++++++++++++-- ptp/components/utils/word_mappings.py | 18 ++++++++++ 5 files changed, 70 insertions(+), 2 deletions(-) diff --git a/configs/default/components/models/sentence_embeddings.yml b/configs/default/components/models/sentence_embeddings.yml index ab3a6da..7ca8987 100644 --- a/configs/default/components/models/sentence_embeddings.yml +++ b/configs/default/components/models/sentence_embeddings.yml @@ -25,6 +25,12 @@ import_word_mappings_from_globals: False # Flag informing whether word mappings will be exported to globals (LOADED) export_word_mappings_to_globals: False +# Fixed padding length +# -1 -> For each batch, automatically pad to the length of the longest sequence of the batch +# (variable from batch to batch) +# > 0 -> Pad each pad to the chosen length (fixed for all batches) +fixed_padding: -1 + # File containing pretrained embeddings (LOADED) # Empty means that no embeddings will be loaded. pretrained_embeddings_file: '' diff --git a/configs/default/components/text/sentence_indexer.yml b/configs/default/components/text/sentence_indexer.yml index 0921bc7..8ec714f 100644 --- a/configs/default/components/text/sentence_indexer.yml +++ b/configs/default/components/text/sentence_indexer.yml @@ -25,6 +25,12 @@ import_word_mappings_from_globals: False # Flag informing whether word mappings will be exported to globals (LOADED) export_word_mappings_to_globals: False +# Fixed padding length +# -1 -> For each batch, automatically pad to the length of the longest sequence of the batch +# (variable from batch to batch) +# > 0 -> Pad each pad to the chosen length (fixed for all batches) +fixed_padding: -1 + # Operation mode. If 'reverse' is True, then it will change indices into words (LOADED) reverse: False diff --git a/ptp/components/models/sentence_embeddings.py b/ptp/components/models/sentence_embeddings.py index 6004e2a..2426d0c 100644 --- a/ptp/components/models/sentence_embeddings.py +++ b/ptp/components/models/sentence_embeddings.py @@ -25,6 +25,7 @@ from ptp.data_types.data_definition import DataDefinition import ptp.components.utils.embeddings as emb +from ptp.components.utils.word_mappings import pad_list class SentenceEmbeddings(Model, WordMappings): @@ -56,6 +57,9 @@ def __init__(self, name, config): self.key_inputs = self.stream_keys["inputs"] self.key_outputs = self.stream_keys["outputs"] + # Force padding to a fixed length + self.fixed_padding = self.config['fixed_padding'] + # Retrieve embeddings size from configuration and export it to globals. self.embeddings_size = self.config['embeddings_size'] self.globals["embeddings_size"] = self.embeddings_size @@ -120,6 +124,11 @@ def forward(self, data_dict): # Add index to outputs. 
output_sample.append( output_index ) + # Apply fixed padding to all sequences if requested + # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding + if self.fixed_padding > 0: + pad_list(output_sample, self.fixed_padding) + #indices_list.append(self.app_state.FloatTensor(output_sample)) indices_list.append(self.app_state.LongTensor(output_sample)) diff --git a/ptp/components/text/sentence_indexer.py b/ptp/components/text/sentence_indexer.py index 7cb0ece..4c70d05 100644 --- a/ptp/components/text/sentence_indexer.py +++ b/ptp/components/text/sentence_indexer.py @@ -19,6 +19,7 @@ from ptp.components.component import Component from ptp.components.mixins.word_mappings import WordMappings from ptp.data_types.data_definition import DataDefinition +from ptp.components.utils.word_mappings import pad_list class SentenceIndexer(Component, WordMappings): @@ -50,6 +51,9 @@ def __init__(self, name, config): # Read mode from the configuration. self.mode_reverse = self.config['reverse'] + # Force padding to a fixed length + self.fixed_padding = self.config['fixed_padding'] + if self.mode_reverse: # We will need reverse (index:word) mapping. self.ix_to_word = dict((v,k) for k,v in self.word_to_ix.items()) @@ -140,10 +144,16 @@ def sentences_to_tensor(self, data_dict): # Add index to outputs. output_sample.append( output_index ) - outputs_list.append(output_sample) + # Apply fixed padding to all sequences if requested + # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding + if self.fixed_padding > 0: + pad_list(output_sample, self.fixed_padding) + + outputs_list.append(self.app_state.LongTensor(output_sample)) # Transform the list of lists to tensor. - output = self.app_state.LongTensor(outputs_list) + # output = self.app_state.LongTensor(outputs_list) + output = torch.nn.utils.rnn.pad_sequence(outputs_list, batch_first=True) # Create the returned dict. data_dict.extend({self.key_outputs: output}) @@ -172,6 +182,12 @@ def tensor_indices_to_sentences(self, data_dict): output_word = self.ix_to_word[token] # Add index to outputs. output_sample.append( output_word ) + + # Apply fixed padding to all sequences if requested + # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding + if self.fixed_padding > 0: + pad_list(output_sample, self.fixed_padding) + # Add sentence to batch. outputs_list.append(output_sample) @@ -204,8 +220,21 @@ def tensor_distributions_to_sentences(self, data_dict): output_word = self.ix_to_word[token] # Add index to outputs. output_sample.append( output_word ) + + # Apply fixed padding to all sequences if requested + # Otherwise let torch.nn.utils.rnn.pad_sequence handle it and choose a dynamic padding + if self.fixed_padding > 0: + pad_list(output_sample, self.fixed_padding) + # Add sentence to batch. outputs_list.append(output_sample) # Create the returned dict. 
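+        # (The outputs are lists of words, [BATCH_SIZE] x [SEQ_LEN], e.g. for display or BLEU scoring.)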
data_dict.extend({self.key_outputs: outputs_list}) + + @staticmethod + def pad_list(self, l: list, length: int, value = 0): + if len(l) < length: + l.extend([value]*(length-len(l))) + elif len(l) > length: + del l[length:] diff --git a/ptp/components/utils/word_mappings.py b/ptp/components/utils/word_mappings.py index d43abf6..ee1ce20 100644 --- a/ptp/components/utils/word_mappings.py +++ b/ptp/components/utils/word_mappings.py @@ -135,3 +135,21 @@ def save_word_mappings_to_csv_file(logger, folder, filename, word_to_ix, fieldna writer.writerow({fieldnames[0]:k, fieldnames[1]: v}) logger.info("Saved mappings of size {} to file '{}'".format(len(word_to_ix), file_path)) + +def pad_list(l: list, length: int, value = 0): + """ + Will apply padding / clipping to list to meet requested length. + Works on the list in-place. + + :param l: List to manipulate + + :param length: Target length + + :param value: Value to fill when padding. Default is int(0). + + :return: None + """ + if len(l) < length: + l.extend([value]*(length-len(l))) + elif len(l) > length: + del l[length:] From 82a212193862b1a1c7076682e580f9a957888f3f Mon Sep 17 00:00:00 2001 From: Alexis Asseman <33075224+aasseman@users.noreply.github.com> Date: Thu, 25 Apr 2019 19:52:39 -0700 Subject: [PATCH 5/7] Changed translation config for fixed padding compatibility --- configs/translation/eng_fra_translation_enc_attndec.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/configs/translation/eng_fra_translation_enc_attndec.yml b/configs/translation/eng_fra_translation_enc_attndec.yml index 263af19..e00c314 100644 --- a/configs/translation/eng_fra_translation_enc_attndec.yml +++ b/configs/translation/eng_fra_translation_enc_attndec.yml @@ -62,6 +62,7 @@ pipeline: additional_tokens: import_word_mappings_from_globals: False export_word_mappings_to_globals: False + fixed_padding: 10 streams: inputs: sources outputs: embedded_sources @@ -74,6 +75,7 @@ pipeline: source_vocabulary_files: eng-fra/fra.train.txt,eng-fra/fra.valid.txt,eng-fra/fra.test.txt import_word_mappings_from_globals: False export_word_mappings_to_globals: True + fixed_padding: 10 regenerate: True streams: inputs: targets From afaf7dfb8e83ea4a4dfa0e916892f78ebcd8e157 Mon Sep 17 00:00:00 2001 From: Alexis Asseman <33075224+aasseman@users.noreply.github.com> Date: Fri, 26 Apr 2019 10:55:37 -0700 Subject: [PATCH 6/7] Cleaning --- .../components/models/attn_decoder_rnn.yml | 6 ++- .../text_to_text/translation_pairs.yml | 5 ++- .../eng_fra_translation_enc_attndec.yml | 19 +++----- ..._language_modeling_encoder_attndecoder.yml | 4 +- .../wikitext_language_modeling_seq2seq.yml | 25 ++++++----- ptp/components/models/attn_decoder_rnn.py | 43 +++++++++---------- .../text_to_text/translation_pairs.py | 33 +++++++------- 7 files changed, 67 insertions(+), 68 deletions(-) diff --git a/configs/default/components/models/attn_decoder_rnn.yml b/configs/default/components/models/attn_decoder_rnn.yml index 75971d4..e2e372b 100644 --- a/configs/default/components/models/attn_decoder_rnn.yml +++ b/configs/default/components/models/attn_decoder_rnn.yml @@ -14,7 +14,7 @@ output_last_state: False # -> Only GRU is supported # Number of "stacked" layers (LOADED) -num_layers: 1 +# -> Only a single layer is supported # Dropout rate (LOADED) # Default: 0 (means that it is turned off) @@ -31,7 +31,9 @@ prediction_mode: Dense # Useful if the raw outputs of the RNN are needed, for attention encoder-decoder for example. 
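+# When False, the raw GRU activations (of size hidden_size) are returned instead.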
 ffn_output: True
 
-autoregression_length: 42
+# Length of generated output sequence (LOADED)
+# User must set it per task, as it is task specific.
+autoregression_length: 10
 
 # If true, output of the last layer will be additionally processed with Log Softmax (LOADED)
 use_logsoftmax: True
diff --git a/configs/default/components/problems/text_to_text/translation_pairs.yml b/configs/default/components/problems/text_to_text/translation_pairs.yml
index 44c8e73..f48f650 100644
--- a/configs/default/components/problems/text_to_text/translation_pairs.yml
+++ b/configs/default/components/problems/text_to_text/translation_pairs.yml
@@ -8,14 +8,15 @@
 data_folder: ~/data/language_modeling/translation_pairs
 
 # Defines the dataset that will be used (LOADED)
-# Options: eng-fra
+# Options: eng-fra, eng-pol
 dataset: eng-fra
 
 # Defines the used subset (LOADED)
 # Options: train | valid | test
 subset: train
 
-# Length of sentence (i.e. number of tokens in input and target sentences)
+# Length limit of the source and target sentences
+# if < 0, no limit
 sentence_length: 10
 
 streams:
diff --git a/configs/translation/eng_fra_translation_enc_attndec.yml b/configs/translation/eng_fra_translation_enc_attndec.yml
index e00c314..7eab08f 100644
--- a/configs/translation/eng_fra_translation_enc_attndec.yml
+++ b/configs/translation/eng_fra_translation_enc_attndec.yml
@@ -1,8 +1,6 @@
-# This pipeline applies seq2seq on wikitext-2 to make word-level prediction.
-# It's been made for test purposes only, as it is doing:
-# [word 0 , ... , word 49] -> [word 1 , ... , word 50] (basically copying most of the input)
-#
-# The seq2seq here is implemented throught the use of 2 `RecurrentNeuralNetwork`
+# This pipeline applies an encoder-decoder GRU with attention to the open Tatoeba translation sentence pairs.
+# Inspired by https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html .
+# Note that training will be slower than in the tutorial, as teacher forcing is not implemented here.
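+# (With teacher forcing, the decoder would be fed the ground-truth target token at each
+# step instead of its own previous output, which typically speeds up convergence.)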
# Training parameters: training: @@ -81,7 +79,7 @@ pipeline: inputs: targets outputs: indexed_targets - # LSTM Encoder + # Single layer GRU Encoder encoder: type: RecurrentNeuralNetwork cell_type: GRU @@ -101,12 +99,11 @@ pipeline: input_size: embeddings_size prediction_size: embeddings_size - # LSTM Decoder + # Single layer GRU Decoder with attention decoder: type: Attn_Decoder_RNN priority: 4 hidden_size: 50 - num_layers: 1 use_logsoftmax: False autoregression_length: 10 prediction_mode: Dense @@ -159,12 +156,6 @@ pipeline: type: BatchSizeStatistics priority: 100.0 - #accuracy: - # type: AccuracyStatistics - # priority: 100.1 - # streams: - # targets: indexed_targets - bleu: type: BLEUStatistics priority: 100.2 diff --git a/configs/wikitext/wikitext_language_modeling_encoder_attndecoder.yml b/configs/wikitext/wikitext_language_modeling_encoder_attndecoder.yml index 2e78505..244fc4a 100644 --- a/configs/wikitext/wikitext_language_modeling_encoder_attndecoder.yml +++ b/configs/wikitext/wikitext_language_modeling_encoder_attndecoder.yml @@ -16,8 +16,8 @@ training: # optimizer parameters: optimizer: - name: Adam - lr: 1.0e-3 + name: SGD + lr: 1.0e-2 # settings parameters terminal_conditions: diff --git a/configs/wikitext/wikitext_language_modeling_seq2seq.yml b/configs/wikitext/wikitext_language_modeling_seq2seq.yml index d23243c..c2d89a5 100644 --- a/configs/wikitext/wikitext_language_modeling_seq2seq.yml +++ b/configs/wikitext/wikitext_language_modeling_seq2seq.yml @@ -11,7 +11,7 @@ training: data_folder: &data_folder ~/data/language_modeling/wikitext-2 dataset: &dataset wikitext-2 subset: train - sentence_length: 50 + sentence_length: 42 batch_size: 64 # optimizer parameters: @@ -33,7 +33,7 @@ validation: data_folder: *data_folder dataset: *dataset subset: valid - sentence_length: 50 + sentence_length: 42 batch_size: 64 # Testing parameters: @@ -43,7 +43,7 @@ testing: data_folder: *data_folder dataset: *dataset subset: test - sentence_length: 50 + sentence_length: 42 batch_size: 64 pipeline: @@ -77,13 +77,15 @@ pipeline: # LSTM Encoder lstm_encoder: type: RecurrentNeuralNetwork + cell_type: GRU priority: 3 initial_state: Trainable - hidden_size: 300 - num_layers: 3 + hidden_size: 50 + num_layers: 1 use_logsoftmax: False output_last_state: True - prediction_mode: Last + prediction_mode: Dense + ffn_output: False streams: inputs: embedded_sources predictions: s2s_encoder_output @@ -94,14 +96,17 @@ pipeline: # LSTM Decoder lstm_decoder: - type: RecurrentNeuralNetwork + type: Attn_Decoder_RNN priority: 4 - initial_state: Input - hidden_size: 300 - num_layers: 3 + hidden_size: 50 + num_layers: 1 use_logsoftmax: False +<<<<<<< Updated upstream input_mode: Autoregression_First max_autoregression_length: 50 +======= + autoregression_length: 42 +>>>>>>> Stashed changes prediction_mode: Dense streams: inputs: s2s_encoder_output diff --git a/ptp/components/models/attn_decoder_rnn.py b/ptp/components/models/attn_decoder_rnn.py index b28f004..4d558ed 100644 --- a/ptp/components/models/attn_decoder_rnn.py +++ b/ptp/components/models/attn_decoder_rnn.py @@ -23,7 +23,12 @@ class Attn_Decoder_RNN(Model): """ - Simple Classifier consisting of fully connected layer with log softmax non-linearity. + Single layer GRU decoder with attention: + Bahdanau, D., Cho, K., & Bengio, Y. (2014). Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473. 
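+
+    Rough sketch of one decoding step (simplified, batch-first shapes):
+
+        attn_weights = softmax(attn([y_prev ; hidden]))     # [BATCH_SIZE x 1 x SEQ_LEN]
+        context = attn_weights @ encoder_outputs            # [BATCH_SIZE x 1 x HIDDEN_SIZE]
+        y, hidden = GRU(relu(attn_combine([y_prev ; context])), hidden)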
+ + Needs the full sequence of hidden states from the encoder as input, as well as the last hidden state from the encoder as input state. + + Code is based on https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html. """ def __init__(self, name, config): """ @@ -46,9 +51,6 @@ def __init__(self, name, config): self.autoregression_length = self.config["autoregression_length"] - # Get number of layers from config. - self.num_layers = self.config["num_layers"] - # Retrieve input size from global variables. self.key_input_size = self.global_keys["input_size"] self.input_size = self.globals["input_size"] @@ -81,7 +83,7 @@ def __init__(self, name, config): self.dropout = torch.nn.Dropout(dropout_rate) # Create rnn cell. - self.rnn_cell = getattr(torch.nn, "GRU")(self.input_size, self.hidden_size, self.num_layers, dropout=dropout_rate, batch_first=True) + self.rnn_cell = getattr(torch.nn, "GRU")(self.input_size, self.hidden_size, 1, dropout=dropout_rate, batch_first=True) # Create layers for the attention self.attn = torch.nn.Linear(self.hidden_size * 2, self.autoregression_length) @@ -102,9 +104,9 @@ def __init__(self, name, config): self.logger.info("Initializing RNN with input size = {}, hidden size = {} and prediction size = {}".format(self.input_size, self.hidden_size, self.prediction_size)) # Create the output layer. - self.activation2output_lin = None + self.activation2output_layer = None if(self.ffn_output): - self.activation2output_lin = torch.nn.Linear(self.hidden_size, self.prediction_size) + self.activation2output_layer = torch.nn.Linear(self.hidden_size, self.prediction_size) # Create the final non-linearity. self.use_logsoftmax = self.config["use_logsoftmax"] @@ -127,7 +129,7 @@ def activation2output(self, activations): output = output.contiguous().view(-1, shape[2]) # Propagate data through the output layer [BATCH_SIZE * SEQ_LEN x PREDICTION_SIZE] - output = self.activation2output_lin(output) + output = self.activation2output_layer(output) #output = output.unsqueeze(1) # Reshape back to 3D tensor [BATCH_SIZE x SEQ_LEN x PREDICTION_SIZE] @@ -147,7 +149,7 @@ def input_data_definitions(self): d[self.key_inputs] = DataDefinition([-1, -1, self.hidden_size], [torch.Tensor], "Batch of encoder outputs [BATCH_SIZE x SEQ_LEN x INPUT_SIZE]") # Input hidden state - d[self.key_input_state] = DataDefinition([self.num_layers, -1, self.hidden_size], [torch.Tensor], "Batch of RNN last states") + d[self.key_input_state] = DataDefinition([1, -1, self.hidden_size], [torch.Tensor], "Batch of RNN last states") return d @@ -167,7 +169,7 @@ def output_data_definitions(self): # Output hidden state stream if self.output_last_state: - d[self.key_output_state] = DataDefinition([self.num_layers, -1, self.hidden_size], [torch.Tensor], "Batch of RNN last states") + d[self.key_output_state] = DataDefinition([1, -1, self.hidden_size], [torch.Tensor], "Batch of RNN last states") return d @@ -187,37 +189,34 @@ def forward(self, data_dict): # Initialize hidden state. 
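+        # (= the final hidden state of the encoder, received via the `input_state` stream)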
hidden = data_dict[self.key_input_state] - + # List that will contain the output sequence activations = [] - # Autoregressive mode - feed back outputs in the input - activations_partial, hidden = self.rnn_cell(self.sos_token.expand(batch_size, -1).unsqueeze(1), hidden) - activations_partial = self.activation2output(activations_partial) - activations += [activations_partial] + # First input to the decoder - trainable "start of sequence" token + activations_partial = self.sos_token.expand(batch_size, -1).unsqueeze(1) # Feed back the outputs iteratively - for i in range(self.autoregression_length - 1): + for i in range(self.autoregression_length): + # Do the attention thing attn_weights = torch.nn.functional.softmax( self.attn(torch.cat((activations_partial.transpose(0, 1), hidden), 2)), dim=2 ) - attn_applied = torch.bmm(attn_weights.transpose(0, 1), inputs) - activations_partial = torch.cat((activations_partial, attn_applied), 2) activations_partial = self.attn_combine(activations_partial) activations_partial = torch.nn.functional.relu(activations_partial) - # Fedd through the RNN + # Feed through the RNN activations_partial, hidden = self.rnn_cell(activations_partial, hidden) - activations_partial = self.activation2output(activations_partial) # Add the single step output into list if self.prediction_mode == "Dense": activations += [activations_partial] - # Reassemble all the outputs from list into an output sequence + + # Reassemble all the outputs from list into an output tensor if self.prediction_mode == "Dense": outputs = torch.cat(activations, 1) # Log softmax - along PREDICTION dim. @@ -231,6 +230,6 @@ def forward(self, data_dict): # Add predictions to datadict. data_dict.extend({self.key_predictions: outputs}) - + # Output last hidden state, if requested if self.output_last_state: data_dict.extend({self.key_output_state: hidden}) diff --git a/ptp/components/problems/text_to_text/translation_pairs.py b/ptp/components/problems/text_to_text/translation_pairs.py index 0e2af2b..3f72b87 100644 --- a/ptp/components/problems/text_to_text/translation_pairs.py +++ b/ptp/components/problems/text_to_text/translation_pairs.py @@ -30,15 +30,17 @@ class TranslationPairs(Problem): """ - + Bilingual sentence pairs from http://www.manythings.org/anki/. + Only some pairs are included here, but many more are available on the website. + Will download the requested language pair if necessary, normalize and tokenize the sentences, and will cut the data into train, valid, test sets. + + Resulting tokens that are shorter than the specified length are then passed to samples (source/target) as list of tokens (set by the user in configuration file). """ def __init__(self, name, config): """ The init method downloads the required files, loads the file associated with a given subset (train/valid/test), concatenates all sencentes and tokenizes them using NLTK's WhitespaceTokenizer. - It also stores the intermediate results, so for example, it file with tokenized set is found, it simply loads it. - :param name: Name of the component. :param class_type: Class type of the component. @@ -65,10 +67,7 @@ def __init__(self, name, config): raise ConfigurationError("Problem supports one 'subset' options: 'train', 'valid', 'test' ") subset = self.config['subset'] - # Check if file with tokenized words exists. 
- filename_tokenized_words = "translate_"+dataset+"."+self.config['subset']+".tokenized_words" - - + # Extract source and target language name self.lang_source = self.config['dataset'].split('-')[0] self.lang_target = self.config['dataset'].split('-')[1] @@ -105,25 +104,25 @@ def __init__(self, name, config): lines_target = [self.normalizeString(l.split('\t')[1]) for l in lines] # Cut dataset into train (90%), valid (5%), test (5%) files - test_mark = len(lines) // 20 - valid_mark = test_mark + (len(lines) // 20) + test_index = len(lines) // 20 + valid_index = test_index + (len(lines) // 20) os.makedirs(os.path.join(self.data_folder, dataset), exist_ok=True) with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_source + ".test.txt"), mode='w+') as f: - f.write('\n'.join(lines_source[0:test_mark])) + f.write('\n'.join(lines_source[0:test_index])) with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_target + ".test.txt"), mode='w+') as f: - f.write('\n'.join(lines_target[0:test_mark])) + f.write('\n'.join(lines_target[0:test_index])) with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_source + ".valid.txt"), mode='w+') as f: - f.write('\n'.join(lines_source[test_mark:valid_mark])) + f.write('\n'.join(lines_source[test_index:valid_index])) with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_target + ".valid.txt"), mode='w+') as f: - f.write('\n'.join(lines_target[test_mark:valid_mark])) + f.write('\n'.join(lines_target[test_index:valid_index])) with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_source + ".train.txt"), mode='w+') as f: - f.write('\n'.join(lines_source[valid_mark:])) + f.write('\n'.join(lines_source[valid_index:])) with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_target + ".train.txt"), mode='w+') as f: - f.write('\n'.join(lines_target[valid_mark:])) + f.write('\n'.join(lines_target[valid_index:])) else: self.logger.info("Files {} found in folder '{}'".format(filenames, self.data_folder)) @@ -144,7 +143,9 @@ def __init__(self, name, config): src = tokenizer.tokenize(s_src) tgt = tokenizer.tokenize(s_tgt) # Keep only the pairs that are shorter or equal to the requested length - if len(src) <= self.sentence_length and len(tgt) <= self.sentence_length: + # If self.sentence_length < 0, then give all the pairs regardless of length + if (len(src) <= self.sentence_length and len(tgt) <= self.sentence_length) \ + or self.sentence_length < 0: self.sentences_source += [src] self.sentences_target += [tgt] From b8ed220ed3dc354d663f0e28cc597794e53ed190 Mon Sep 17 00:00:00 2001 From: Alexis Asseman <33075224+aasseman@users.noreply.github.com> Date: Fri, 26 Apr 2019 18:23:14 -0700 Subject: [PATCH 7/7] Cleaning --- .../wikitext_language_modeling_seq2seq.yml | 25 ++++++++----------- ptp/components/text/sentence_indexer.py | 7 ------ 2 files changed, 10 insertions(+), 22 deletions(-) diff --git a/configs/wikitext/wikitext_language_modeling_seq2seq.yml b/configs/wikitext/wikitext_language_modeling_seq2seq.yml index c2d89a5..d23243c 100644 --- a/configs/wikitext/wikitext_language_modeling_seq2seq.yml +++ b/configs/wikitext/wikitext_language_modeling_seq2seq.yml @@ -11,7 +11,7 @@ training: data_folder: &data_folder ~/data/language_modeling/wikitext-2 dataset: &dataset wikitext-2 subset: train - sentence_length: 42 + sentence_length: 50 batch_size: 64 # optimizer parameters: @@ -33,7 +33,7 @@ validation: data_folder: *data_folder dataset: *dataset subset: valid - 
sentence_length: 42 + sentence_length: 50 batch_size: 64 # Testing parameters: @@ -43,7 +43,7 @@ testing: data_folder: *data_folder dataset: *dataset subset: test - sentence_length: 42 + sentence_length: 50 batch_size: 64 pipeline: @@ -77,15 +77,13 @@ pipeline: # LSTM Encoder lstm_encoder: type: RecurrentNeuralNetwork - cell_type: GRU priority: 3 initial_state: Trainable - hidden_size: 50 - num_layers: 1 + hidden_size: 300 + num_layers: 3 use_logsoftmax: False output_last_state: True - prediction_mode: Dense - ffn_output: False + prediction_mode: Last streams: inputs: embedded_sources predictions: s2s_encoder_output @@ -96,17 +94,14 @@ pipeline: # LSTM Decoder lstm_decoder: - type: Attn_Decoder_RNN + type: RecurrentNeuralNetwork priority: 4 - hidden_size: 50 - num_layers: 1 + initial_state: Input + hidden_size: 300 + num_layers: 3 use_logsoftmax: False -<<<<<<< Updated upstream input_mode: Autoregression_First max_autoregression_length: 50 -======= - autoregression_length: 42 ->>>>>>> Stashed changes prediction_mode: Dense streams: inputs: s2s_encoder_output diff --git a/ptp/components/text/sentence_indexer.py b/ptp/components/text/sentence_indexer.py index 92c2346..4450e83 100644 --- a/ptp/components/text/sentence_indexer.py +++ b/ptp/components/text/sentence_indexer.py @@ -231,10 +231,3 @@ def tensor_distributions_to_sentences(self, data_dict): # Create the returned dict. data_dict.extend({self.key_outputs: outputs_list}) - - @staticmethod - def pad_list(self, l: list, length: int, value = 0): - if len(l) < length: - l.extend([value]*(length-len(l))) - elif len(l) > length: - del l[length:]