add constraint info on batch size for tar dataset (#5812)
* add constraint info on batch size for tar dataset

Signed-off-by: Yang Zhang <yangzhang@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* style fix

Signed-off-by: Yang Zhang <yangzhang@nvidia.com>

Signed-off-by: Yang Zhang <yangzhang@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
yzhang123 and pre-commit-ci[bot] authored Jan 17, 2023
1 parent 7d202c9 commit 989b07a
Showing 3 changed files with 14 additions and 4 deletions.
9 changes: 8 additions & 1 deletion docs/source/nlp/text_normalization/nn_text_normalization.rst
@@ -108,9 +108,16 @@ Tarred datasets can be created as follows:
    python examples/nlp/duplex_text_normalization/data/create_tarred_dataset.py \
        --input_files = "<trained_processed/output-00099-of-00100>" \
        --input_files = "<trained_processed/output-00098-of-00100>" \
-       --out_dir="<TARRED_DATA_OUTPUT_DIR>"
+       --batch_size = "<batch size>" \
+       --out_dir= "<TARRED_DATA_OUTPUT_DIR>"
+
+.. warning::
+    The batch size used for creating the tarred dataset will be the batch size used in training, regardless of what the user specifies in the configuration yaml file.
+    The number of shards should be divisible by the world size to ensure an even split among workers.
+    If it is not divisible, a warning is logged and training will still proceed, but it will likely hang at the last epoch.


Model Training
--------------

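A minimal sketch of the shard-count check implied by the warning above, assuming the tarred dataset is a directory of plain *.tar shards and world_size is the total number of training processes (check_shard_count and the file layout are illustrative assumptions, not NeMo's actual implementation):

import glob
import warnings

def check_shard_count(tar_dir: str, world_size: int) -> None:
    """Warn if the tar shards cannot be split evenly across workers."""
    shards = glob.glob(f"{tar_dir}/*.tar")
    if not shards:
        raise FileNotFoundError(f"no .tar shards found in {tar_dir}")
    if len(shards) % world_size != 0:
        # Training would still start, but the uneven split can leave some
        # workers waiting at the end of an epoch, i.e. the hang described above.
        warnings.warn(
            f"{len(shards)} shards are not divisible by world size {world_size}; "
            "consider regenerating the tarred dataset with a different --factor"
        )

# e.g. check_shard_count("<TARRED_DATA_OUTPUT_DIR>", world_size=8)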
@@ -118,8 +118,8 @@ decoder_exp_manager:
# Data
data:
  train_ds:
-    data_path: train.tsv # provide the full path to the file
-    batch_size: 64
+    data_path: train.tsv # provide the full path to the file. Ignored when using a tarred dataset; tar_metadata_file is used instead.
+    batch_size: 64 # local training batch size for each worker. Ignored when using a tarred dataset; the batch size of the tarred dataset is used instead.
    shuffle: true
    max_insts: -1 # Maximum number of instances (-1 means no limit)
    # Refer to the text_normalization doc for more information about data augmentation
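A minimal sketch of the precedence described by the updated comments, assuming the tar metadata JSON written at dataset-creation time records the batch size under a batch_size key (effective_batch_size and the metadata schema are illustrative assumptions, not NeMo's actual format):

import json

def effective_batch_size(cfg: dict) -> int:
    """Return the batch size that training will actually use."""
    train_ds = cfg["data"]["train_ds"]
    metadata_file = train_ds.get("tar_metadata_file")
    if metadata_file:
        # Tarred dataset: the yaml batch_size is ignored; the value chosen when
        # the shards were created (create_tarred_dataset.py --batch_size) wins.
        with open(metadata_file) as f:
            return json.load(f)["batch_size"]
    # Non-tarred dataset: the yaml value applies as-is.
    return train_ds["batch_size"]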
@@ -187,7 +187,10 @@ def _write_batches_to_tarfiles(
)
parser.add_argument('--n_jobs', type=int, default=-2, help='The maximum number of concurrently running jobs.')
parser.add_argument(
-    '--batch_size', type=int, default=16, help='Batch size, i.e., number of examples in a single pickle file'
+    '--batch_size',
+    type=int,
+    default=16,
+    help='Batch size, i.e., number of examples in a single pickle file. This batch size will override the training batch size.',
)
parser.add_argument(
    '--factor', default=8, type=int, help='The final number of tar files will be divisible by the "factor" value'
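The new help text is the crux of the constraint: every pickle written into a tar shard holds --batch_size examples, so the value is baked into the data itself and cannot be changed later from the yaml. A minimal sketch of that chunking, independent of the real _write_batches_to_tarfiles (write_shard and the file naming are illustrative assumptions):

import pickle

def write_shard(examples: list, batch_size: int, out_prefix: str) -> None:
    """Pickle the examples in fixed-size chunks, one file per batch."""
    for i in range(0, len(examples), batch_size):
        batch = examples[i : i + batch_size]
        # Each pickle holds one batch of (at most) batch_size examples, so the
        # training batch size is fixed at dataset-creation time.
        with open(f"{out_prefix}.batch_{i // batch_size}.pkl", "wb") as f:
            pickle.dump(batch, f)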