Merge pull request #2 from kaldi-asr/master

pull from master

chenzhehuai committed Feb 6, 2018
2 parents 240f0e4 + 7906590 commit 60f2bcf
Showing 23 changed files with 93 additions and 70 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
# Compiled extensionless executable files in /src/*/
# This stanza must precede wildcard patterns below!
/src/*/*
+!/src/lm/test_data/
!/src/*/?*.*
!/src/doc/*
!/src/*/Makefile
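An aside on why this one-line negation is needed (editor's note, not part of the commit): the /src/*/* wildcard above ignores the whole src/lm/test_data directory, and git cannot re-include a file whose parent directory is excluded, so the ARPA test fixture added later in this commit would be invisible without it. A quick check, assuming a Kaldi checkout:

# Prints the matching .gitignore rule if the fixture is still ignored;
# with the new negation in place it should print nothing (exit status 1).
git check-ignore -v src/lm/test_data/missing_bos.arpa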
Empty file modified egs/fisher_swbd/s5/local/chain/compare_wer_general.sh
100644 → 100755
Empty file modified egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh
100644 → 100755
Empty file modified egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh
100644 → 100755
Empty file modified egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh
100644 → 100755
Empty file modified egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh
100644 → 100755
Empty file modified egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh
100644 → 100755
Empty file modified egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh
100644 → 100755
Empty file modified egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh
100644 → 100755
2 changes: 0 additions & 2 deletions egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh
@@ -173,8 +173,6 @@ if [ $stage -le 15 ]; then
/export/b0{5,6,7,8}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
fi

-touch $dir/egs/.nodelete # keep egs around when that run dies.

steps/nnet3/chain/train.py --stage $train_stage \
--cmd "$decode_cmd" \
--feat.online-ivector-dir $train_ivector_dir \
45 changes: 17 additions & 28 deletions egs/swbd/s5c/local/nnet3/run_ivector_common.sh
@@ -4,49 +4,38 @@
set -e
stage=1
train_stage=-10
-generate_alignments=true # false if doing ctc training
+generate_alignments=true
speed_perturb=true

. ./path.sh
. ./utils/parse_options.sh

-mkdir -p nnet3
-# perturbed data preparation
+mkdir -p exp/nnet3
train_set=train_nodup

if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi

if [ "$speed_perturb" == "true" ]; then
if $speed_perturb; then
if [ $stage -le 1 ]; then
-#Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment
+# Although the nnet will be trained by high resolution data, we still have to perturb the normal data to get the alignments
# _sp stands for speed-perturbed

-for datadir in train_nodup; do
-utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1
-utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2
-utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2
-utils/validate_data_dir.sh --no-feats data/${datadir}_tmp
-rm -r data/temp1 data/temp2

-mfccdir=mfcc_perturbed
-steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \
-data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1;
-steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1;
-utils/fix_data_dir.sh data/${datadir}_tmp

-utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0
-utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0
-utils/fix_data_dir.sh data/${datadir}_sp
-rm -r data/temp0 data/${datadir}_tmp
-done
+echo "$0: preparing directory for speed-perturbed data"
+utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp

+echo "$0: creating MFCC features for low-resolution speed-perturbed data"
+mfccdir=mfcc_perturbed
+steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \
+data/${train_set}_sp exp/make_mfcc/${train_set}_sp $mfccdir
+steps/compute_cmvn_stats.sh data/${train_set}_sp exp/make_mfcc/${train_set}_sp $mfccdir
+utils/fix_data_dir.sh data/${train_set}_sp
fi

-if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then
-#obtain the alignment of the perturbed data
+if [ $stage -le 2 ] && $generate_alignments; then
+# obtain the alignment of the perturbed data
steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
-data/train_nodup_sp data/lang exp/tri4 exp/tri4_ali_nodup_sp || exit 1
+data/${train_set}_sp data/lang exp/tri4 exp/tri4_ali_nodup_sp
fi
-train_set=train_nodup_sp
+train_set=${train_set}_sp
fi

if [ $stage -le 3 ]; then
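An aside on the net effect of this refactor (a sketch; the directory listing is illustrative, not captured output): the old temp1/temp2/temp0 bookkeeping collapses into a single helper call that emits one combined speed-perturbed directory.

# One call writes the 0.9x, 1.0x and 1.1x copies and combines them
# into data/train_nodup_sp (the same paths the new code uses).
utils/data/perturb_data_dir_speed_3way.sh data/train_nodup data/train_nodup_sp
ls data/train_nodup_sp   # e.g. utt2spk, utt2uniq, wav.scp -- features come later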
4 changes: 2 additions & 2 deletions egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh
@@ -72,9 +72,9 @@ fi
nj=$(cat $latdir/num_jobs)
oov=$(cat $lang/oov.int)

-utils/split_data.sh --per-utt $data $nj
+utils/split_data.sh $data $nj

-sdata=$data/split${nj}utt
+sdata=$data/split$nj;

if [ $stage -le 1 ]; then
$cmd JOB=1:$nj $dir/log/get_oracle.JOB.log \
4 changes: 2 additions & 2 deletions egs/wsj/s5/utils/data/get_uniform_subsegments.py
@@ -87,8 +87,8 @@ def run(args):
else:
end = end_time
new_utt = "{utt_id}-{s:08d}-{e:08d}".format(
-utt_id=utt_id, s=int(100 * (start - start_time)),
-e=int(100 * (end - start_time)))
+utt_id=utt_id, s=int(round(100 * (start - start_time))),
+e=int(round(100 * (end - start_time))))
print ("{new_utt} {utt_id} {s} {e}".format(
new_utt=new_utt, utt_id=utt_id, s=start - start_time,
e=end - start_time))
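An aside on why round() matters here (illustration only, not part of the commit): int() truncates toward zero, and the centisecond product can land just below the intended integer in IEEE-754 arithmetic, shifting a subsegment boundary by one frame.

# A 0.29 s offset: the float product falls just short of 29 centiseconds.
python3 -c 'x = 100 * 0.29; print(x, int(x), int(round(x)))'
# prints: 28.999999999999996 28 29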
12 changes: 10 additions & 2 deletions egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh
@@ -38,9 +38,17 @@ utils/data/get_utt2dur.sh ${srcdir}

utils/data/perturb_data_dir_speed.sh 0.9 ${srcdir} ${destdir}_speed0.9 || exit 1
utils/data/perturb_data_dir_speed.sh 1.1 ${srcdir} ${destdir}_speed1.1 || exit 1
-utils/data/combine_data.sh $destdir ${srcdir} ${destdir}_speed0.9 ${destdir}_speed1.1 || exit 1

-rm -r ${destdir}_speed0.9 ${destdir}_speed1.1
+utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- ${srcdir} ${destdir}_speed1.0
+if [ ! -f $srcdir/utt2uniq ]; then
+cat $srcdir/utt2spk | awk '{printf("sp1.0-%s %s\n", $1, $1);}' > ${destdir}_speed1.0/utt2uniq
+else
+cat $srcdir/utt2uniq | awk '{printf("sp1.0-%s %s\n", $1, $2);}' > ${destdir}_speed1.0/utt2uniq
+fi

+utils/data/combine_data.sh $destdir ${destdir}_speed1.0 ${destdir}_speed0.9 ${destdir}_speed1.1 || exit 1

+rm -r ${destdir}_speed0.9 ${destdir}_speed1.1 ${destdir}_speed1.0

echo "$0: generated 3-way speed-perturbed version of data in $srcdir, in $destdir"
utils/validate_data_dir.sh --no-feats --no-text $destdir
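For clarity, here is what the new utt2uniq stanza produces for the 1.0x copy (a sketch with a made-up utterance ID): each prefixed utterance maps back to its original, so downstream tools can tell the three speed copies of a recording are the same underlying audio.

# Hypothetical utt2spk line fed through the first awk branch above:
echo "sw02001-A_000098-001156 sw02001-A" | awk '{printf("sp1.0-%s %s\n", $1, $1);}'
# -> sp1.0-sw02001-A_000098-001156 sw02001-A_000098-001156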
15 changes: 13 additions & 2 deletions src/base/io-funcs.cc
@@ -178,8 +178,14 @@ int PeekToken(std::istream &is, bool binary) {
}
int ans = is.peek();
if (read_bracket) {
-if (!is.unget())
+if (!is.unget()) {
KALDI_WARN << "Error ungetting '<' in PeekToken";
+// Clear the bad bit. It seems to be possible for this code to be
+// reached, and the C++ standard is very vague on whether even a single
+// call to unget() should succeed; see
+// http://www.cplusplus.com/reference/istream/istream/unget/
+is.clear();
+}
}
return ans;
}
@@ -197,7 +203,12 @@ void ExpectToken(std::istream &is, bool binary, const char *token) {
KALDI_ERR << "Failed to read token [started at file position "
<< pos_at_start << "], expected " << token;
}
-if (strcmp(str.c_str(), token) != 0) {
+// The second half of the '&&' expression below is so that if we're expecting
+// "<Foo>", we will accept "Foo>" instead. This is so that the model-reading
+// code will tolerate errors in PeekToken where is.unget() failed; search for
+// is.clear() in PeekToken() for an explanation.
+if (strcmp(str.c_str(), token) != 0 &&
+!(token[0] == '<' && strcmp(str.c_str(), token + 1) == 0)) {
KALDI_ERR << "Expected token \"" << token << "\", got instead \""
<< str <<"\".";
}
29 changes: 1 addition & 28 deletions src/doc/chain.dox
@@ -389,34 +389,7 @@ on the paths.
You might notice in the current example scripts that we use iVectors. We do so
just because they generally help a bit, and because the baseline setup we were
comparing with, uses them. There is no inherent connection with 'chain'
-models, and no fundamental requirement to use them. Actually we want to get rid
-of them (see below).


-\section chain_next_steps Next steps (TODOs) with 'chain' models

-(Note: this list is valid as of Dec 13 2015, but may become out of date).
-Things we need to do (and that we'd like help with) are:
-  - Supply example scripts (and tune them) on a wide range of corpora
-    (It will be interesting to see whether there are scale-dependent effects
-    affecting how well this model works).
-  - Create and tune LSTM and BLSTM versions of the training script. (This
-    may involve some playing around with learning rate schedules and
-    configurations).
-  - Figure out how to speed up the forward-backward part of the computation.
-    (E.g. using state-level pruning, or just by optimizing the current kernels or
-    data structures).

-A longer-term TODO, which Dan should do, is to create an online decoding setup
-for these models. Actually this isn't really distinct from nnet3 online
-decoding in general, since the models are no different from regular nnet3
-acoustic models. But we do have to decide whether to continue to support
-iVectors-- getting rid of them would simplify the setup considerably, and
-would hopefully make it more robust. We are hoping that with LSTMs, since it
-already sees quite a wide acoustic context, iVector adaptation will no longer
-be as helpful and could be dropped. We also have other ideas how to
-incorporate adaptation as part of the neural network, without the use of
-iVectors. This will require some experimentation.
+models, and no fundamental requirement to use them.


*/
14 changes: 14 additions & 0 deletions src/lm/arpa-lm-compiler-test.cc
@@ -204,6 +204,17 @@ bool ScoringTest(bool seps, const string &infile, const string& sentence,
return ok;
}

+bool ThrowsExceptionTest(bool seps, const string &infile) {
+  try {
+    // Make memory cleanup easy in both cases of try-catch block.
+    std::unique_ptr<ArpaLmCompiler> compiler(Compile(seps, infile));
+    return false;
+  } catch (const std::runtime_error&) {
+    // Kaldi throws only std::runtime_error in kaldi-error.cc
+    return true;
+  }
+}

} // namespace kaldi

bool RunAllTests(bool seps) {
@@ -214,6 +225,9 @@ bool RunAllTests(bool seps) {

ok &= kaldi::ScoringTest(seps, "test_data/input.arpa", "b b b a", 59.2649);
ok &= kaldi::ScoringTest(seps, "test_data/input.arpa", "a b", 4.36082);

+ok &= kaldi::ThrowsExceptionTest(seps, "test_data/missing_bos.arpa");

if (!ok) {
KALDI_WARN << "Tests " << (seps ? "with" : "without")
<< " epsilon substitution FAILED";
8 changes: 8 additions & 0 deletions src/lm/arpa-lm-compiler.cc
@@ -360,10 +360,18 @@ void ArpaLmCompiler::RemoveRedundantStates() {
<< fst_.NumStates();
}

+void ArpaLmCompiler::Check() const {
+  if (fst_.Start() == fst::kNoStateId) {
+    KALDI_ERR << "Arpa file did not contain the beginning-of-sentence symbol "
+              << Symbols()->Find(Options().bos_symbol) << ".";
+  }
+}

void ArpaLmCompiler::ReadComplete() {
fst_.SetInputSymbols(Symbols());
fst_.SetOutputSymbols(Symbols());
RemoveRedundantStates();
+Check();
}

} // namespace kaldi
1 change: 1 addition & 0 deletions src/lm/arpa-lm-compiler.h
@@ -52,6 +52,7 @@ class ArpaLmCompiler : public ArpaFileParser {
// this function removes states that only have a backoff arc coming
// out of them.
void RemoveRedundantStates();
+void Check() const;

int sub_eps_;
ArpaLmCompilerImplInterface* impl_; // Owned.
18 changes: 18 additions & 0 deletions src/lm/test_data/missing_bos.arpa
@@ -0,0 +1,18 @@

\data\
ngram 1=3
ngram 2=1
ngram 3=1

\1-grams:
-5.234679 a -3.3
-3.456783 b -3.0
-4.333333 </s>

\2-grams:
-1.45678 a b -3.23

\3-grams:
-0.23940 a b </s>

\end\
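Since the fixture above has no <s> unigram, compiling it should now abort in ArpaLmCompiler::Check() rather than produce an FST with no start state. A manual reproduction, sketched under the assumption that compiled Kaldi binaries are on PATH (arpa2fst is the command-line wrapper around ArpaLmCompiler):

# Should exit nonzero with a KALDI_ERR along the lines of
# "Arpa file did not contain the beginning-of-sentence symbol <s>."
arpa2fst src/lm/test_data/missing_bos.arpa /dev/null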
3 changes: 2 additions & 1 deletion src/nnet3/nnet-chain-training.cc
@@ -37,7 +37,8 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts,
if (opts.nnet_config.zero_component_stats)
ZeroComponentStats(nnet);
KALDI_ASSERT(opts.nnet_config.momentum >= 0.0 &&
-opts.nnet_config.max_param_change >= 0.0);
+opts.nnet_config.max_param_change >= 0.0 &&
+opts.nnet_config.backstitch_training_interval > 0);
delta_nnet_ = nnet_->Copy();
ScaleNnet(0.0, delta_nnet_);
const int32 num_updatable = NumUpdatableComponents(*delta_nnet_);
4 changes: 2 additions & 2 deletions src/nnet3/nnet-simple-component.cc
@@ -2854,8 +2854,8 @@ void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) {
}
std::string token;
ReadToken(is, binary, &token);
if (token != "<NaturalGradientAffineComponent>" &&
token != "</NaturalGradientAffineComponent>")
// the following has to handle a couple variants of
if (token.find("NaturalGradientAffineComponent>") == std::string::npos)
KALDI_ERR << "Expected <NaturalGradientAffineComponent> or "
<< "</NaturalGradientAffineComponent>, got " << token;
SetNaturalGradientConfigs();
3 changes: 2 additions & 1 deletion src/nnet3/nnet-training.cc
@@ -34,7 +34,8 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config,
if (config.zero_component_stats)
ZeroComponentStats(nnet);
KALDI_ASSERT(config.momentum >= 0.0 &&
-config.max_param_change >= 0.0);
+config.max_param_change >= 0.0 &&
+config.backstitch_training_interval > 0);
delta_nnet_ = nnet_->Copy();
ScaleNnet(0.0, delta_nnet_);
const int32 num_updatable = NumUpdatableComponents(*delta_nnet_);
