From 8b415fe52dd3b805a69a0272586e724db1f9e49c Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Sat, 24 Mar 2018 15:45:38 -0500 Subject: [PATCH 01/12] [egs] remove redundant step from chime5 recipe (#2306) --- egs/chime5/s5/run.sh | 8 -------- 1 file changed, 8 deletions(-) diff --git a/egs/chime5/s5/run.sh b/egs/chime5/s5/run.sh index d80172872ed..c63249b086b 100755 --- a/egs/chime5/s5/run.sh +++ b/egs/chime5/s5/run.sh @@ -168,14 +168,6 @@ if [ $stage -le 11 ]; then fi if [ $stage -le 12 ]; then - utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph - for dset in ${test_sets}; do - steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ - exp/tri2/graph data/${dset} exp/tri2/decode_${dset} - done -fi - -if [ $stage -le 13 ]; then utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph for dset in ${test_sets}; do steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ From 9f7e55a0a90481f410e03e91eabf08df081620c8 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 24 Mar 2018 18:57:39 -0400 Subject: [PATCH 02/12] [src] Fix issue with CUDA device initialization if 'wait' specified. Thx: @olix20 (#2295) --- src/cudamatrix/cu-device.cc | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 87e266e1889..c5114ed8b22 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -62,8 +62,10 @@ static bool GetCudaContext(int32 num_gpus, std::string *debug_str) { // Our first attempt to get a device context is: we do cudaFree(0) and see if // that returns no error code. If it succeeds then we have a device // context. Apparently this is the canonical way to get a context. - if (cudaFree(0) == 0) + if (cudaFree(0) == 0) { + cudaGetLastError(); // Clear any error status. return true; + } // The rest of this code represents how we used to get a device context, but // now its purpose is mainly a debugging one. @@ -71,16 +73,18 @@ static bool GetCudaContext(int32 num_gpus, std::string *debug_str) { debug_stream << "num-gpus=" << num_gpus << ". "; for (int32 device = 0; device < num_gpus; device++) { cudaSetDevice(device); - cudaError_t e = cudaDeviceSynchronize(); // << CUDA context gets created here. + cudaError_t e = cudaFree(0); // CUDA context gets created here. if (e == cudaSuccess) { - *debug_str = debug_stream.str(); + if (debug_str) + *debug_str = debug_stream.str(); + cudaGetLastError(); // Make sure the error state doesn't get returned in + // the next cudaGetLastError(). return true; } debug_stream << "Device " << device << ": " << cudaGetErrorString(e) << ". "; - cudaGetLastError(); // Make sure the error state doesn't get returned in - // the next cudaGetLastError(). } - *debug_str = debug_stream.str(); + if (debug_str) + *debug_str = debug_stream.str(); return false; } @@ -164,7 +168,7 @@ void CuDevice::SelectGpuId(std::string use_gpu) { } else { int32 num_times = 0; BaseFloat wait_time = 0.0; - while (! 
got_context) { + while (!got_context) { int32 sec_sleep = 5; if (num_times == 0) KALDI_WARN << "Will try again indefinitely every " << sec_sleep @@ -172,7 +176,7 @@ void CuDevice::SelectGpuId(std::string use_gpu) { num_times++; wait_time += sec_sleep; Sleep(sec_sleep); - got_context = GetCudaContext(num_gpus, &debug_str); + got_context = GetCudaContext(num_gpus, NULL); } KALDI_WARN << "Waited " << wait_time From e03dd12ec7f8f2872708224687868d56beeb1975 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sun, 25 Mar 2018 04:59:43 +0430 Subject: [PATCH 03/12] [scripts,egs] Fix to perturb_speed_to_allowed_lengths.py; egs fix (thanks: @calderma) (#2307) --- egs/wsj/s5/local/e2e/run_end2end_char.sh | 6 ++--- .../data/perturb_speed_to_allowed_lengths.py | 22 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/egs/wsj/s5/local/e2e/run_end2end_char.sh b/egs/wsj/s5/local/e2e/run_end2end_char.sh index 6c3786411cc..303a6456159 100755 --- a/egs/wsj/s5/local/e2e/run_end2end_char.sh +++ b/egs/wsj/s5/local/e2e/run_end2end_char.sh @@ -34,10 +34,10 @@ wsj1=/export/corpora5/LDC/LDC94S13B # _char for character-based dictionary and lang directories. if [ $stage -le 0 ]; then - [[ -f data/train_si284/text ]] || \ + [[ -d data/local/data ]] || \ local/wsj_data_prep.sh $wsj0/??-{?,??}.? $wsj1/??-{?,??}.? [[ -f data/local/dict_nosp/lexicon.txt ]] || \ - local/wsj_prepare_phn_dict.sh --dict-suffix "_nosp" + local/wsj_prepare_dict.sh --dict-suffix "_nosp" local/wsj_prepare_char_dict.sh utils/prepare_lang.sh data/local/dict_char \ @@ -105,7 +105,7 @@ if [ $stage -le 5 ]; then mkdir -p exp/chain/e2e_base/log $train_cmd exp/chain/e2e_base/log/make_char_lm.log \ cat data/$trainset/text \| \ - steps/nnet3/chain/e2e/text_to_phones.py data/lang_char data/local/dict_char/lexicon.txt \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang_char \| \ utils/sym2int.pl -f 2- data/lang_char/phones.txt \| \ chain-est-phone-lm --num-extra-lm-states=2000 \ ark:- exp/chain/e2e_base/char_lm.fst diff --git a/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py b/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py index 83d0b227767..c6bdb95cb2f 100755 --- a/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py +++ b/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2017 Hossein Hadian # Apache 2.0 @@ -113,7 +113,7 @@ def read_kaldi_datadir(dir): else: num_fail += 1 - if len(utterances) / len(wav_scp) < 0.5: + if float(len(utterances)) / len(wav_scp) < 0.5: logger.info("More than half your data is problematic. 
Try " "fixing using fix_data_dir.sh.") sys.exit(1) @@ -128,7 +128,7 @@ def read_kaldi_mapfile(path): """ m = {} - with open(path, 'r') as f: + with open(path, 'r', encoding='latin-1') as f: for line in f: line = line.strip() sp_pos = line.find(' ') @@ -145,19 +145,19 @@ def generate_kaldi_data_files(utterances, outdir): logger.info("Exporting to {}...".format(outdir)) speakers = {} - with open(os.path.join(outdir, 'text'), 'w') as f: + with open(os.path.join(outdir, 'text'), 'w', encoding='latin-1') as f: for utt in utterances: f.write(utt.to_kaldi_utt_str() + "\n") - with open(os.path.join(outdir, 'wav.scp'), 'w') as f: + with open(os.path.join(outdir, 'wav.scp'), 'w', encoding='latin-1') as f: for utt in utterances: f.write(utt.to_kaldi_wave_str() + "\n") - with open(os.path.join(outdir, 'utt2dur'), 'w') as f: + with open(os.path.join(outdir, 'utt2dur'), 'w', encoding='latin-1') as f: for utt in utterances: f.write(utt.to_kaldi_dur_str() + "\n") - with open(os.path.join(outdir, 'utt2spk'), 'w') as f: + with open(os.path.join(outdir, 'utt2spk'), 'w', encoding='latin-1') as f: for utt in utterances: f.write(utt.id + " " + utt.speaker + "\n") if utt.speaker not in speakers: @@ -165,7 +165,7 @@ def generate_kaldi_data_files(utterances, outdir): else: speakers[utt.speaker].append(utt.id) - with open(os.path.join(outdir, 'spk2utt'), 'w') as f: + with open(os.path.join(outdir, 'spk2utt'), 'w', encoding='latin-1') as f: for s in speakers: f.write(s + " ") for utt in speakers[s]: @@ -222,8 +222,8 @@ def find_allowed_durations(start_dur, end_dur, args): allowed_durations = [] d = start_dur - with open(os.path.join(args.dir, 'allowed_durs.txt'), 'wb') as durs_fp, \ - open(os.path.join(args.dir, 'allowed_lengths.txt'), 'wb') as lengths_fp: + with open(os.path.join(args.dir, 'allowed_durs.txt'), 'w', encoding='latin-1') as durs_fp, \ + open(os.path.join(args.dir, 'allowed_lengths.txt'), 'w', encoding='latin-1') as lengths_fp: while d < end_dur: length = int(d * 1000 - args.frame_length) / args.frame_shift + 1 if length % args.frame_subsampling_factor != 0: @@ -233,7 +233,7 @@ def find_allowed_durations(start_dur, end_dur, args): + args.frame_length + args.frame_shift / 2) / 1000.0 allowed_durations.append(d) durs_fp.write("{}\n".format(d)) - lengths_fp.write("{}\n".format(length)) + lengths_fp.write("{}\n".format(int(length))) d *= args.factor return allowed_durations From cc16eecee328757811bcb83218ef33447bfa0e00 Mon Sep 17 00:00:00 2001 From: Development and research at SailLabs <37703153+saillabs1@users.noreply.github.com> Date: Mon, 26 Mar 2018 21:31:40 +0200 Subject: [PATCH 04/12] [windows] fix for compiling on Windows VS2017 (15.5.2) (#2310) - adapted install instructions to reflect current dependencies --- src/ivector/agglomerative-clustering.h | 1 + windows/INSTALL.md | 13 +++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/ivector/agglomerative-clustering.h b/src/ivector/agglomerative-clustering.h index f260c4c3c8b..310a336f8b5 100644 --- a/src/ivector/agglomerative-clustering.h +++ b/src/ivector/agglomerative-clustering.h @@ -25,6 +25,7 @@ #include #include #include +#include #include "base/kaldi-common.h" #include "matrix/matrix-lib.h" #include "util/stl-utils.h" diff --git a/windows/INSTALL.md b/windows/INSTALL.md index 4c670e672b1..cd9c77b1776 100644 --- a/windows/INSTALL.md +++ b/windows/INSTALL.md @@ -34,6 +34,9 @@ For cygwin installation, see the instructions in `../INSTALL`. 
## Steps ## Compiling OpenFST + +Skip this section, if you have downloaded OpenFST project from https://github.com/kkm000/openfst.git and it already contains openfst.sln file in the root folder. If it is present you can directly open it with Visual Studio 17 and you do not need CMake. +------------------------- For compilation of OpenFST, you will need CMake installed. Simply go to https://cmake.org/download/ and download and install. Then, in the command line, run the following commands. Be very careful about writing the commands verbatim! @@ -71,6 +74,8 @@ The last command will generate output looking similarly to this. Do not try to r -- Build files have been written to: C:/Users/jtrmal/Documents/openfst/build64 In the directory `build64`, find the file `openfst.sln` and open it using Visual Studio 17. +------------------------- + **Switch the configuration to `debug|Win64` and build the solution.** **Do the same for configuration `release|Win64`.** @@ -133,15 +138,14 @@ for their processors. It isn't free, but you can get [Community Licensing for In If you plan to use MKL, you can ignore the `OPENBLASDIR` path. If you plan to use OpenBLAS, you can ignore the `MKLDIR` path. - No matter what you plan to use, set both the `OPENFST*` and `PTHREADW` - variables correctly + No matter what you plan to use, set `OPENFST*` variable correctly. 6. For OpenBLAS support, copy the file `kaldiwin_openblas.props` to `kaldiwin.props` 7. For MKL support, copy the `kaldiwin_mkl.props` to `kaldiwin.props` 8. Call the script that generates the MSVC solution - ./generate_solution.pl --vsver [--enable-cuda] [--enable-openblas] [--enable-mkl] + generate_solution.pl --vsver [--enable-cuda] [--enable-openblas] [--enable-mkl] `--enable-mkl` is the default so you shouldn't need to use it. If `--enable-openblas` is passed it disables MKL support. CUDA is disabled by default. The default Visual Studio version is 15.0 (Visual Studio 2017). @@ -160,7 +164,8 @@ for their processors. It isn't free, but you can get [Community Licensing for In (kaldi)/windows$ get_version.pl -10. Open the generated solution in the visual studio and switch to **Debug|x64** (or **Release|x64**) and build. +10. Open the generated solution that was created in a subfolder (kaldi)/kaldiwin_vs_ + in the visual studio and switch to **Debug|x64** (or **Release|x64**) and build. Expect 10 projects to fail, majority of them will fail because of missing include `portaudio.h`. The tests will fail to compile too -- this is because of deficiency of the script generate_solution.pl. We might fix it later on. 
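The revised windows/INSTALL.md above spreads the Windows build across several numbered steps. For orientation, the whole sequence can be sketched as the rough command outline below; this is an illustrative sketch rather than a verbatim copy of the instructions, and the CMake generator string and the exact name of the generated kaldiwin_vs_* folder are assumptions that depend on the local Visual Studio installation.

    REM --- OpenFST: skip the CMake part if openfst.sln already exists in the repo root ---
    git clone https://github.com/kkm000/openfst.git
    cd openfst
    mkdir build64
    cd build64
    REM The generator name below is an assumption for a 64-bit VS2017 build.
    cmake .. -G "Visual Studio 15 2017 Win64"
    REM Open build64\openfst.sln in Visual Studio 2017 and build Debug|Win64, then Release|Win64.

    REM --- Kaldi solution: run from (kaldi)/windows after setting the OPENFST* path variable ---
    copy kaldiwin_mkl.props kaldiwin.props
    REM (for OpenBLAS, copy kaldiwin_openblas.props instead and pass --enable-openblas below)
    generate_solution.pl
    REM (--vsver defaults to 15.0, i.e. Visual Studio 2017, per the notes above; CUDA is off by default)
    get_version.pl
    REM Open the solution generated under (kaldi)/kaldiwin_vs_* and build Debug|x64 or Release|x64.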
From 7352760ef80960137066b022f2dd5ce29e1c2835 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Tue, 27 Mar 2018 12:56:06 +0800 Subject: [PATCH 05/12] [egs] minor fix for fisher_swbd scripts and swbd nnet3 scripts (#2316) --- egs/aspire/s5/local/nnet3/run_tdnn.sh | 2 +- egs/csj/s5/local/nnet3/run_tdnn.sh | 2 +- egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh | 1 - egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh | 1 - egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh | 16 +++++++++------- egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh | 16 +++++++++------- .../s5/local/chain/run_tdnn_lstm_1a.sh | 16 +++++++++------- .../s5/local/chain/run_tdnn_lstm_1b.sh | 16 +++++++++------- .../s5/local/chain/run_tdnn_opgru_1a.sh | 16 +++++++++------- .../s5/local/chain/run_tdnn_opgru_1b.sh | 16 +++++++++------- egs/swbd/s5c/local/nnet3/tuning/run_tdnn_c.sh | 2 +- egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh | 2 +- egs/swbd/s5c/local/nnet3/tuning/run_tdnn_e.sh | 2 +- .../s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh | 2 +- .../s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh | 2 +- .../s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh | 2 +- 16 files changed, 62 insertions(+), 52 deletions(-) diff --git a/egs/aspire/s5/local/nnet3/run_tdnn.sh b/egs/aspire/s5/local/nnet3/run_tdnn.sh index 6dffe45e04f..8e6a45ccbb4 100755 --- a/egs/aspire/s5/local/nnet3/run_tdnn.sh +++ b/egs/aspire/s5/local/nnet3/run_tdnn.sh @@ -52,7 +52,7 @@ if [ $stage -le 7 ]; then relu-renorm-layer name=tdnn4 dim=1248 input=Append(-3,3) relu-renorm-layer name=tdnn5 dim=1248 input=Append(-7,2) relu-renorm-layer name=tdnn6 dim=1248 - output-layer name=output dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec + output-layer name=output dim=$num_targets max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ fi diff --git a/egs/csj/s5/local/nnet3/run_tdnn.sh b/egs/csj/s5/local/nnet3/run_tdnn.sh index 0b8fc368561..e656b825517 100755 --- a/egs/csj/s5/local/nnet3/run_tdnn.sh +++ b/egs/csj/s5/local/nnet3/run_tdnn.sh @@ -72,7 +72,7 @@ if [ $stage -le 9 ]; then relu-renorm-layer name=tdnn5 input=Append(-7,2) dim=1024 relu-renorm-layer name=tdnn6 dim=1024 - output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh index 00c74ee8a56..66f87c8da8f 100755 --- a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh @@ -171,7 +171,6 @@ if [ $stage -le 15 ]; then steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi ) & done fi diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh index 03d362ef552..cbf0ef6cb6c 100755 --- a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh @@ -244,7 +244,6 @@ if [ $stage -le 15 ]; then steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || 
exit 1; - fi ) & done fi diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh index 0150a3b6d03..12b3187a5fa 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh @@ -218,6 +218,7 @@ fi decode_suff=fsh_sw1_tg graph_dir=$dir/graph_fsh_sw1_tg if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true if [ ! -z $decode_iter ]; then iter_opts=" --iter $decode_iter " fi @@ -228,13 +229,16 @@ if [ $stage -le 15 ]; then --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ $graph_dir data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi - ) & + ) || touch $dir/.error & done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi fi test_online_decoding=true @@ -256,11 +260,9 @@ if $test_online_decoding && [ $stage -le 16 ]; then --acwt 1.0 --post-decode-acwt 10.0 \ $graph_dir data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi ) || touch $dir/.error & done wait diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh index 6255ba39457..7d640c3262a 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh @@ -227,6 +227,7 @@ fi decode_suff=fsh_sw1_tg graph_dir=$dir/graph_fsh_sw1_tg if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true if [ ! 
-z $decode_iter ]; then iter_opts=" --iter $decode_iter " fi @@ -237,13 +238,16 @@ if [ $stage -le 15 ]; then --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ $graph_dir data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi - ) & + ) || touch $dir/.error & done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi fi test_online_decoding=true @@ -265,11 +269,9 @@ if $test_online_decoding && [ $stage -le 16 ]; then --acwt 1.0 --post-decode-acwt 10.0 \ $graph_dir data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi ) || touch $dir/.error & done wait diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh index bccd61533d2..07e88b59ddc 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh @@ -238,6 +238,7 @@ fi decode_suff=fsh_sw1_tg graph_dir=$dir/graph_fsh_sw1_tg if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; @@ -256,13 +257,16 @@ if [ $stage -le 15 ]; then --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ $graph_dir data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi - ) & + ) || touch $dir/.error & done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi fi test_online_decoding=true @@ -284,11 +288,9 @@ if $test_online_decoding && [ $stage -le 16 ]; then --acwt 1.0 --post-decode-acwt 10.0 \ $graph_dir data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi ) || touch $dir/.error & done wait diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh index 2272f746ab3..c9d50d1f7bd 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh @@ -248,6 +248,7 @@ fi decode_suff=fsh_sw1_tg graph_dir=$dir/graph_fsh_sw1_tg if [ $stage -le 15 ]; then + rm $dir/.error 
2>/dev/null || true [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; if [ ! -z $decode_iter ]; then @@ -265,13 +266,16 @@ if [ $stage -le 15 ]; then --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ $graph_dir data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi - ) & + ) || touch $dir/.error & done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi fi test_online_decoding=false @@ -293,11 +297,9 @@ if $test_online_decoding && [ $stage -le 16 ]; then --acwt 1.0 --post-decode-acwt 10.0 \ $graph_dir data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi ) || touch $dir/.error & done wait diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh index 737e0571b07..1cce08abeee 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh @@ -245,6 +245,7 @@ fi decode_suff=fsh_sw1_tg graph_dir=$dir/graph_fsh_sw1_tg if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; @@ -263,13 +264,16 @@ if [ $stage -le 15 ]; then --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ $graph_dir data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi - ) & + ) || touch $dir/.error & done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi fi test_online_decoding=true @@ -291,11 +295,9 @@ if $test_online_decoding && [ $stage -le 16 ]; then --acwt 1.0 --post-decode-acwt 10.0 \ $graph_dir data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi ) || touch $dir/.error & done wait diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh index 762db86a8cf..2334c6a1bc1 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh +++ 
b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh @@ -246,6 +246,7 @@ fi decode_suff=fsh_sw1_tg graph_dir=$dir/graph_fsh_sw1_tg if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; if [ ! -z $decode_iter ]; then @@ -263,13 +264,16 @@ if [ $stage -le 15 ]; then --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ $graph_dir data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi - ) & + ) || touch $dir/.error & done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi fi test_online_decoding=true @@ -291,11 +295,9 @@ if $test_online_decoding && [ $stage -le 16 ]; then --acwt 1.0 --post-decode-acwt 10.0 \ $graph_dir data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi ) || touch $dir/.error & done wait diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_c.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_c.sh index fab4b0a03e4..49f8ab62247 100644 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_c.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_c.sh @@ -72,7 +72,7 @@ if [ $stage -le 9 ]; then relu-renorm-layer name=tdnn4 input=Append(-7,2) dim=1024 relu-renorm-layer name=tdnn5 dim=1024 - output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec + output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh index fef5d349867..427678da17b 100755 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh @@ -74,7 +74,7 @@ if [ $stage -le 9 ]; then relu-renorm-layer name=tdnn5 input=Append(-7,2) dim=1024 relu-renorm-layer name=tdnn6 dim=1024 - output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_e.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_e.sh index 9b5338c76ae..974f697d651 100755 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_e.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_e.sh @@ -74,7 +74,7 @@ if [ $stage -le 9 ]; then relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 - output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 
presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec + output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh index a470cc7f06f..02e637286b5 100755 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh @@ -98,7 +98,7 @@ if [ $stage -le 11 ]; then relu-renorm-layer name=tdnn4 input=Append(-7,2) dim=1024 relu-renorm-layer name=tdnn5 dim=1024 - output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec + output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh index dc8dac90aea..67fd3c03d27 100755 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh @@ -100,7 +100,7 @@ if [ $stage -le 11 ]; then relu-renorm-layer name=tdnn5 input=Append(-7,2) dim=1024 relu-renorm-layer name=tdnn6 dim=1024 - output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh index 285328d58eb..260116666a0 100755 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh @@ -97,7 +97,7 @@ if [ $stage -le 11 ]; then relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 - output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec + output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ From e5b6696c00aaa0003c32e82153b40b12c0ce547e Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Wed, 28 Mar 2018 01:18:49 +0430 Subject: [PATCH 06/12] [egs] Add end-to-end OCR recipe for IAM (thanks: @aarora8) (#2311) --- egs/cifar/v1/image/get_allowed_lengths.py | 143 ++++++++++ egs/cifar/v1/image/get_image2num_frames.py | 62 +++++ egs/iam/v1/local/chain/run_cnn_chainali_1c.sh | 246 ++++++++++++++++++ egs/iam/v1/local/chain/run_flatstart_cnn1a.sh | 165 ++++++++++++ egs/iam/v1/local/make_features.py | 56 +++- egs/iam/v1/run_end2end.sh | 76 ++++++ 6 files changed, 740 insertions(+), 8 deletions(-) create mode 100755 egs/cifar/v1/image/get_allowed_lengths.py create mode 100755 egs/cifar/v1/image/get_image2num_frames.py create mode 100755 egs/iam/v1/local/chain/run_cnn_chainali_1c.sh create mode 100755 egs/iam/v1/local/chain/run_flatstart_cnn1a.sh create mode 100755 egs/iam/v1/run_end2end.sh diff --git a/egs/cifar/v1/image/get_allowed_lengths.py b/egs/cifar/v1/image/get_allowed_lengths.py new file mode 100755 index 00000000000..02321fdd2df --- /dev/null +++ b/egs/cifar/v1/image/get_allowed_lengths.py @@ -0,0 +1,143 
@@ +#!/usr/bin/env python3 + +# Copyright 2017 Hossein Hadian +# Apache 2.0 + + +""" This script finds a set of allowed lengths for a given OCR/HWR data dir. + The allowed lengths are spaced by a factor (like 10%) and are written + in an output file named "allowed_lengths.txt" in the output data dir. This + file is later used by make_features.py to pad each image sufficiently so that + they all have an allowed length. This is intended for end2end chain training. +""" + +import argparse +import os +import sys +import copy +import math +import logging + +sys.path.insert(0, 'steps') +import libs.common as common_lib + +logger = logging.getLogger('libs') +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + +def get_args(): + parser = argparse.ArgumentParser(description="""This script finds a set of + allowed lengths for a given OCR/HWR data dir. + Intended for chain training.""") + parser.add_argument('factor', type=float, default=12, + help='Spacing (in percentage) between allowed lengths.') + parser.add_argument('srcdir', type=str, + help='path to source data dir') + parser.add_argument('--coverage-factor', type=float, default=0.05, + help="""Percentage of durations not covered from each + side of duration histogram.""") + parser.add_argument('--frame-subsampling-factor', type=int, default=3, + help="""Chain frame subsampling factor. + See steps/nnet3/chain/train.py""") + + args = parser.parse_args() + return args + + +def read_kaldi_mapfile(path): + """ Read any Kaldi mapping file - like text, .scp files, etc. + """ + + m = {} + with open(path, 'r', encoding='latin-1') as f: + for line in f: + line = line.strip() + sp_pos = line.find(' ') + key = line[:sp_pos] + val = line[sp_pos+1:] + m[key] = val + return m + +def find_duration_range(img2len, coverage_factor): + """Given a list of utterances, find the start and end duration to cover + + If we try to cover + all durations which occur in the training set, the number of + allowed lengths could become very large. + + Returns + ------- + start_dur: int + end_dur: int + """ + durs = [] + for im, imlen in img2len.items(): + durs.append(int(imlen)) + durs.sort() + to_ignore_dur = 0 + tot_dur = sum(durs) + for d in durs: + to_ignore_dur += d + if to_ignore_dur * 100.0 / tot_dur > coverage_factor: + start_dur = d + break + to_ignore_dur = 0 + for d in reversed(durs): + to_ignore_dur += d + if to_ignore_dur * 100.0 / tot_dur > coverage_factor: + end_dur = d + break + if start_dur < 30: + start_dur = 30 # a hard limit to avoid too many allowed lengths --not critical + return start_dur, end_dur + + +def find_allowed_durations(start_len, end_len, args): + """Given the start and end duration, find a set of + allowed durations spaced by args.factor%. Also write + out the list of allowed durations and the corresponding + allowed lengths (in frames) on disk. 
+ + Returns + ------- + allowed_durations: list of allowed durations (in seconds) + """ + + allowed_lengths = [] + length = start_len + with open(os.path.join(args.srcdir, 'allowed_lengths.txt'), 'w', encoding='latin-1') as fp: + while length < end_len: + if length % args.frame_subsampling_factor != 0: + length = (args.frame_subsampling_factor * + (length // args.frame_subsampling_factor)) + allowed_lengths.append(length) + fp.write("{}\n".format(int(length))) + length *= args.factor + return allowed_lengths + + + +def main(): + args = get_args() + args.factor = 1.0 + args.factor / 100.0 + + image2length = read_kaldi_mapfile(os.path.join(args.srcdir, 'image2num_frames')) + + start_dur, end_dur = find_duration_range(image2length, args.coverage_factor) + logger.info("Lengths in the range [{},{}] will be covered. " + "Coverage rate: {}%".format(start_dur, end_dur, + 100.0 - args.coverage_factor * 2)) + logger.info("There will be {} unique allowed lengths " + "for the images.".format(int(math.log(end_dur / start_dur) / + math.log(args.factor)))) + + allowed_durations = find_allowed_durations(start_dur, end_dur, args) + + +if __name__ == '__main__': + main() diff --git a/egs/cifar/v1/image/get_image2num_frames.py b/egs/cifar/v1/image/get_image2num_frames.py new file mode 100755 index 00000000000..3c003bb9947 --- /dev/null +++ b/egs/cifar/v1/image/get_image2num_frames.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Hossein Hadian + + +""" This script computes the image lengths (with padding) in an image data dir. + The output is written to 'image2num_frames' in the given data dir. This + file is later used by image/get_allowed_lengths.py to find a set of allowed lengths + for the data dir. The output format is similar to utt2num_frames + +""" + +import argparse +import os +import sys +import numpy as np +from scipy import misc + +parser = argparse.ArgumentParser(description="""Computes the image lengths (i.e. width) in an image data dir + and writes them (by default) to image2num_frames.""") +parser.add_argument('dir', type=str, + help='Source data directory (containing images.scp)') +parser.add_argument('--out-ark', type=str, default=None, + help='Where to write the output image-to-num_frames info. 
' + 'Default: "dir"/image2num_frames') +parser.add_argument('--feat-dim', type=int, default=40, + help='Size to scale the height of all images') +parser.add_argument('--padding', type=int, default=5, + help='Number of white pixels to pad on the left' + 'and right side of the image.') +args = parser.parse_args() + + +def get_scaled_image_length(im): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale * sx) + return nx + +### main ### +data_list_path = os.path.join(args.dir,'images.scp') + +if not args.out_ark: + args.out_ark = os.path.join(args.dir,'image2num_frames') +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark, 'w', encoding='latin-1') + +with open(data_list_path) as f: + for line in f: + line = line.strip() + line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + im = misc.imread(image_path) + im_len = get_scaled_image_length(im) + (args.padding * 2) + print('{} {}'.format(image_id, im_len), file=out_fh) + +out_fh.close() diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh new file mode 100755 index 00000000000..6ff76490303 --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh @@ -0,0 +1,246 @@ +#!/bin/bash + +# chainali_1c is as chainali_1b except it uses l2-regularize +# local/chain/compare_wer.sh exp/chain/cnn_chainali_1b exp/chain/cnn_chainali_1c +# System cnn_chainali_1b cnn_chainali_1c +# WER 14.67 12.84 +# CER 7.31 6.40 +# Final train prob 0.0042 -0.0120 +# Final valid prob -0.0256 -0.0199 +# Final train prob (xent) -0.6282 -0.9973 +# Final valid prob (xent) -0.9096 -1.1537 +# Parameters 3.96M 3.96M + +# steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1c +# exp/chain/cnn_chainali_1c: num-iters=21 nj=2..4 num-params=4.0M dim=40->369 combine=-0.007->-0.007 (over 1) xent:train/valid[13,20,final]=(-1.44,-1.05,-0.997/-1.53,-1.19,-1.15) logprob:train/valid[13,20,final]=(-0.056,-0.020,-0.012/-0.056,-0.025,-0.020) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +alignment_subsampling_factor=1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_unk +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/$lang_test $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. 
We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.075" + opts_2="l2-regularize=0.075" + opts_3="l2-regularize=0.1" + common1="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $opts_2 + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $opts_2 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $opts_3 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $opts_2 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $opts_3 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh new file mode 100755 index 00000000000..65eeedcc75b --- /dev/null +++ b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh @@ -0,0 +1,165 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. 
from scratch) + +# local/chain/compare_wer.sh exp/chain/cnn_1a exp/chain/cnn_chainali_1c exp/chain/e2e_cnn_1a +# System cnn_1a cnn_chainali_1c e2e_cnn_1a +# WER 18.58 12.84 15.46 +# CER 10.17 6.40 7.21 +# Final train prob -0.0122 -0.0120 -0.0426 +# Final valid prob -0.0999 -0.0199 -0.0724 +# Final train prob (xent) -0.5652 -0.9973 +# Final valid prob (xent) -0.9758 -1.1537 +# Parameters 4.36M 3.96M 9.13M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ +# exp/chain/e2e_cnn_1a/: num-iters=21 nj=2..4 num-params=9.1M dim=40->12640 combine=-0.040->-0.040 (over 1) logprob:train/valid[13,20,final]=(-0.065,-0.046,-0.043/-0.081,-0.073,-0.072) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +num_epochs=4 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=1000000 +cmvn_opts="--norm-means=true --norm-vars=true" +train_set=train_e2e +lang_test=lang_test + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type biphone \ + data/$train_set $lang $treedir + cp exp/chain/e2e_base/phone_lm.fst $treedir/ +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + + opts="l2-regularize=0.075" + opts_2="l2-regularize=0.075" + opts_3="l2-regularize=0.1" + common1="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $opts_2 + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $opts_2 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $opts_3 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
+ + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 30 --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/make_features.py b/egs/iam/v1/local/make_features.py index b998464953f..8cfca5ee830 100755 --- a/egs/iam/v1/local/make_features.py +++ b/egs/iam/v1/local/make_features.py @@ -2,6 +2,7 @@ # Copyright 2017 Chun Chieh Chang # 2017 Ashish Arora +# 2018 Hossein Hadian """ This script converts images to Kaldi-format feature matrices. The input to this script is the path to a data directory, e.g. "data/train". This script @@ -9,6 +10,10 @@ (by default) as Kaldi-formatted matrices (in text form). It also scales the images so they have the same height (via --feat-dim). It can optionally pad the images (on left/right sides) with white pixels. + If an 'image2num_frames' file is found in the data dir, it will be used + to enforce the images to have the specified length in that file by padding + white pixels (the --padding option will be ignored in this case). This relates + to end2end chain training. eg. 
local/make_features.py data/train --feat-dim 40 """ @@ -30,6 +35,8 @@ parser.add_argument('--padding', type=int, default=5, help='Number of white pixels to pad on the left' 'and right side of the image.') + + args = parser.parse_args() @@ -49,7 +56,7 @@ def write_kaldi_matrix(file_handle, matrix, key): file_handle.write("\n") file_handle.write(" ]\n") -def get_scaled_image(im): +def get_scaled_image(im, allowed_lengths = None): scale_size = args.feat_dim sx = im.shape[1] sy = im.shape[0] @@ -57,22 +64,48 @@ def get_scaled_image(im): nx = int(scale_size) ny = int(scale * sx) im = misc.imresize(im, (nx, ny)) - padding_x = args.padding - padding_y = im.shape[0] - im_pad = np.concatenate((255 * np.ones((padding_y, padding_x), + if allowed_lengths is None: + left_padding = right_padding = args.padding + else: # Find an allowed length for the image + imlen = im.shape[1] + allowed_len = 0 + for l in allowed_lengths: + if l > imlen: + allowed_len = l + break + if allowed_len == 0: + # No allowed length was found for the image (the image is too long) + return None + padding = allowed_len - imlen + left_padding = padding // 2 + right_padding = padding - left_padding + dim_y = im.shape[0] + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), dtype=int), im), axis=1) - im_pad1 = np.concatenate((im_pad, 255 * np.ones((padding_y, padding_x), + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), dtype=int)), axis=1) return im_pad1 ### main ### -data_list_path = os.path.join(args.dir,'images.scp') +data_list_path = os.path.join(args.dir, 'images.scp') if args.out_ark == '-': out_fh = sys.stdout else: out_fh = open(args.out_ark,'wb') +allowed_lengths = None +if os.path.isfile(os.path.join(args.dir, 'allowed_lengths.txt')): + print("Found 'allowed_lengths.txt' file...", file=sys.stderr) + allowed_lengths = [] + with open(os.path.join(args.dir,'allowed_lengths.txt')) as f: + for line in f: + allowed_lengths.append(int(line.strip())) + print("Read {} allowed lengths and will apply them to the " + "features.".format(len(allowed_lengths)), file=sys.stderr) + +num_fail = 0 +num_ok = 0 with open(data_list_path) as f: for line in f: line = line.strip() @@ -80,8 +113,15 @@ def get_scaled_image(im): image_id = line_vect[0] image_path = line_vect[1] im = misc.imread(image_path) - im_scale = get_scaled_image(im) + im_scaled = get_scaled_image(im, allowed_lengths) - data = np.transpose(im_scale, (1, 0)) + if im_scaled is None: + num_fail += 1 + continue + data = np.transpose(im_scaled, (1, 0)) data = np.divide(data, 255.0) + num_ok += 1 write_kaldi_matrix(out_fh, data, image_id) + +print('Generated features for {} images. Failed for {} (iamge too ' + 'long).'.format(num_ok, num_fail)) diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh new file mode 100755 index 00000000000..d479bfa2a73 --- /dev/null +++ b/egs/iam/v1/run_end2end.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +set -e +stage=0 +nj=20 +username= +password= +# iam_database points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# like "data/download" and follow the instructions +# in "local/prepare_data.sh" to download the database: +iam_database=/export/corpora5/handwriting_ocr/IAM + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. 
+ + +./local/check_tools.sh + +if [ $stage -le 0 ]; then + echo "$0: Preparing data..." + local/prepare_data.sh --download-dir "$iam_database" \ + --username "$username" --password "$password" +fi +mkdir -p data/{train,test}/data + +if [ $stage -le 1 ]; then + get_image2num_frames.py data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + echo "$0: Preparing the test and train feature files..." + for dataset in train test; do + local/make_features.py data/$dataset --feat-dim 40 | \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp + steps/compute_cmvn_stats.sh data/$dataset + done + utils/fix_data_dir.sh data/train +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + utils/prepare_lang.sh --sil-prob 0.95 \ + data/local/dict "" data/lang/temp data/lang +fi + +if [ $stage -le 3 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_big.arpa.gz \ + data/local/dict/lexicon.txt data/lang_test +fi + + +if [ $stage -le 4 ]; then + echo "$0: estimating phone language model for the denominator graph" + mkdir -p exp/chain/e2e_base/log + $cmd exp/chain/e2e_base/log/make_phone_lm.log \ + cat data/train/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=1000 \ + ark:- exp/chain/e2e_base/phone_lm.fst +fi + +if [ $stage -le 5 ]; then + echo "$0: calling the flat-start chain recipe..." + local/chain/run_flatstart_cnn1a.sh +fi From d7e88902810f48e10a39e6e7f87e4525fe2369ff Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Wed, 28 Mar 2018 01:21:56 +0430 Subject: [PATCH 07/12] [egs] Fix LM/lexicon issues in IAM; Add unk decoding; Update results. (#2315) --- egs/iam/v1/local/chain/compare_wer.sh | 15 ++ egs/iam/v1/local/chain/run_cnn_1a.sh | 14 +- egs/iam/v1/local/chain/run_cnn_chainali_1b.sh | 22 +-- egs/iam/v1/local/prepare_dict.sh | 28 ++-- .../local/remove_test_utterances_from_lob.py | 117 +++++++++++++ egs/iam/v1/local/score.sh | 156 +++++++++++++++++- egs/iam/v1/local/train_lm.sh | 13 +- .../v1/local/unk_arc_post_to_transcription.py | 86 ++++++++++ egs/iam/v1/run.sh | 20 ++- 9 files changed, 428 insertions(+), 43 deletions(-) create mode 100755 egs/iam/v1/local/remove_test_utterances_from_lob.py create mode 100755 egs/iam/v1/local/unk_arc_post_to_transcription.py diff --git a/egs/iam/v1/local/chain/compare_wer.sh b/egs/iam/v1/local/chain/compare_wer.sh index 4eb665fc702..ad90710b13f 100755 --- a/egs/iam/v1/local/chain/compare_wer.sh +++ b/egs/iam/v1/local/chain/compare_wer.sh @@ -11,6 +11,7 @@ if [ $# == 0 ]; then echo "e.g.: $0 exp/chain/cnn{1a,1b}" exit 1 fi +. ./path.sh echo "# $0 $*" used_epochs=false @@ -26,6 +27,13 @@ for x in $*; do done echo +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + if $used_epochs; then exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
fi @@ -57,3 +65,10 @@ for x in $*; do printf "% 10s" $prob done echo + +echo -n "# Parameters " +for x in $*; do + params=$(nnet3-info $x/final.mdl 2>/dev/null | grep num-parameters | cut -d' ' -f2 | awk '{printf "%0.2fM\n",$1/1000000}') + printf "% 10s" $params +done +echo diff --git a/egs/iam/v1/local/chain/run_cnn_1a.sh b/egs/iam/v1/local/chain/run_cnn_1a.sh index 3b1571091c1..05cb9948bd9 100755 --- a/egs/iam/v1/local/chain/run_cnn_1a.sh +++ b/egs/iam/v1/local/chain/run_cnn_1a.sh @@ -7,9 +7,15 @@ # steps/info/chain_dir_info.pl exp/chain/cnn_1a/ # exp/chain/cnn_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.021->-0.015 xent:train/valid[13,20,final]=(-1.05,-0.701,-0.591/-1.30,-1.08,-1.00) logprob:train/valid[13,20,final]=(-0.061,-0.034,-0.030/-0.107,-0.101,-0.098) -# cat exp/chain/cnn_1a/decode_test/scoring_kaldi/best_* -# %WER 5.94 [ 3913 / 65921, 645 ins, 1466 del, 1802 sub ] exp/chain/cnn_1a/decode_test//cer_11_0.0 -# %WER 9.13 [ 1692 / 18542, 162 ins, 487 del, 1043 sub ] exp/chain/cnn_1a/decode_test/wer_11_0.0 +# local/chain/compare_wer.sh exp/chain/cnn_1a/ +# System cnn_1a +# WER 18.58 +# CER 10.17 +# Final train prob -0.0122 +# Final valid prob -0.0999 +# Final train prob (xent) -0.5652 +# Final valid prob (xent) -0.9758 +# Parameters 4.36M set -e -o pipefail @@ -40,7 +46,7 @@ tdnn_dim=450 # training options srand=0 remove_egs=false -lang_test=lang_test +lang_test=lang_unk # End configuration section. echo "$0 $@" # Print the command line for logging diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh index ddf596a6126..d6d0ee780f4 100755 --- a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh +++ b/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh @@ -1,20 +1,20 @@ #!/bin/bash # chainali_1b is as chainali_1a except it has 3 more cnn layers and 1 less tdnn layer. -# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ exp/chain/cnn_chainali_1b/ -# System cnn_chainali_1a cnn_chainali_1b -# WER 6.69 6.25 -# Final train prob -0.0132 -0.0041 -# Final valid prob -0.0509 -0.0337 -# Final train prob (xent) -0.6393 -0.6287 -# Final valid prob (xent) -1.0116 -0.9064 + +# local/chain/compare_wer.sh exp/chain/cnn_1a/ exp/chain/cnn_chainali_1b/ +# System cnn_1a cnn_chainali_1b +# WER 18.58 14.67 +# CER 10.17 7.31 +# Final train prob -0.0122 0.0042 +# Final valid prob -0.0999 -0.0256 +# Final train prob (xent) -0.5652 -0.6282 +# Final valid prob (xent) -0.9758 -0.9096 +# Parameters 4.36M 3.96M # steps/info/chain_dir_info.pl exp/chain/chainali_cnn_1b/ # exp/chain/chainali_cnn_1b/: num-iters=21 nj=2..4 num-params=4.0M dim=40->364 combine=-0.009->-0.005 xent:train/valid[13,20,final]=(-1.47,-0.728,-0.623/-1.69,-1.02,-0.940) logprob:train/valid[13,20,final]=(-0.068,-0.030,-0.011/-0.086,-0.056,-0.038) -# cat exp/chain/cnn_chainali_1b/decode_test/scoring_kaldi/best_* -# %WER 3.94 [ 2600 / 65921, 415 ins, 1285 del, 900 sub ] exp/chain/cnn_chainali_1b/decode_test/cer_10_0.0 -# %WER 6.25 [ 1158 / 18542, 103 ins, 469 del, 586 sub ] exp/chain/cnn_chainali_1b/decode_test/wer_12_0.0 set -e -o pipefail @@ -46,7 +46,7 @@ tdnn_dim=450 # training options srand=0 remove_egs=false -lang_test=lang_test +lang_test=lang_unk # End configuration section. 
echo "$0 $@" # Print the command line for logging diff --git a/egs/iam/v1/local/prepare_dict.sh b/egs/iam/v1/local/prepare_dict.sh index 0c3bb325023..8b981de3abd 100755 --- a/egs/iam/v1/local/prepare_dict.sh +++ b/egs/iam/v1/local/prepare_dict.sh @@ -15,29 +15,27 @@ cat data/train/text | \ perl -ne '@A = split; shift @A; for(@A) {print join("\n", split(//)), "\n";}' | \ sort -u > $dir/nonsilence_phones.txt -# Now list all the unique words (that use only the above letters) -# in data/train/text and LOB+Brown corpora with their comprising -# letters as their transcription. (Letter # is replaced with ) +# Now use the pocolm's wordlist which is the most N frequent words in +# in data/train/text and LOB+Brown corpora (dev and test excluded) with their comprising +# letters as their transcription. Only include words that use the above letters. +# (Letter # is replaced with ) export letters=$(cat $dir/nonsilence_phones.txt | tr -d "\n") -cut -d' ' -f2- data/train/text | \ - cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt \ - data/local/browncorpus/brown.txt - | \ +cat data/local/local_lm/data/wordlist | \ perl -e '$letters=$ENV{letters}; -while(<>){ @A = split; - foreach(@A) { - if(! $seen{$_} && $_ =~ m/^[$letters]+$/){ - $seen{$_} = 1; - $trans = join(" ", split(//)); +while(<>){ + chop; + $w = $_; + if($w =~ m/^[$letters]+$/){ + $trans = join(" ", split(//, $w)); $trans =~ s/#//g; - print "$_ $trans\n"; + print "$w $trans\n"; } - } -}' | sort > $dir/lexicon.txt +}' | sort -u > $dir/lexicon.txt -sed -i '' "s/#//" $dir/nonsilence_phones.txt +sed -i "s/#//" $dir/nonsilence_phones.txt echo ' SIL' >> $dir/lexicon.txt echo ' SIL' >> $dir/lexicon.txt diff --git a/egs/iam/v1/local/remove_test_utterances_from_lob.py b/egs/iam/v1/local/remove_test_utterances_from_lob.py new file mode 100755 index 00000000000..1b414ef47f6 --- /dev/null +++ b/egs/iam/v1/local/remove_test_utterances_from_lob.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +# Copyright 2018 Ashish Arora + +import argparse +import os +import numpy as np +import sys +import re + +parser = argparse.ArgumentParser(description="""Removes dev/test set lines + from the LOB corpus. Reads the + corpus from stdin, and writes it to stdout.""") +parser.add_argument('dev_text', type=str, + help='dev transcription location.') +parser.add_argument('test_text', type=str, + help='test transcription location.') +args = parser.parse_args() + +def remove_punctuations(transcript): + char_list = [] + for char in transcript: + if char.isdigit() or char == '+' or char == '~' or char == '?': + continue + if char == '#' or char == '=' or char == '-' or char == '!': + continue + if char == ',' or char == '.' 
or char == ')' or char == '\'': + continue + if char == '(' or char == ':' or char == ';' or char == '"': + continue + char_list.append(char) + return char_list + + +def remove_special_words(words): + word_list = [] + for word in words: + if word == '' or word == '#': + continue + word_list.append(word) + return word_list + + +# process and add dev/eval transcript in a list +# remove special words, punctuations, spaces between words +# lowercase the characters +def read_utterances(text_file_path): + with open(text_file_path, 'rt') as in_file: + for line in in_file: + words = line.strip().split() + words_wo_sw = remove_special_words(words) + transcript = ''.join(words_wo_sw[1:]) + transcript = transcript.lower() + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + utterance_dict[words_wo_sw[0]] = transcript + + +### main ### + +# read utterances and add it to utterance_dict +utterance_dict = dict() +read_utterances(args.dev_text) +read_utterances(args.test_text) + +# read corpus and add it to below lists +corpus_text_lowercase_wo_sc = list() +corpus_text_wo_sc = list() +original_corpus_text = list() +for line in sys.stdin: + original_corpus_text.append(line) + words = line.strip().split() + words_wo_sw = remove_special_words(words) + + transcript = ''.join(words_wo_sw) + transcript = transcript.lower() + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + corpus_text_lowercase_wo_sc.append(transcript) + + transcript = ''.join(words_wo_sw) + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + corpus_text_wo_sc.append(transcript) + +# find majority of utterances below +# for utterances which were not found +# add them to remaining_utterances +row_to_keep = [True for i in range(len(original_corpus_text))] +remaining_utterances = dict() +for line_id, line_to_find in utterance_dict.items(): + found_line = False + for i in range(1, (len(corpus_text_lowercase_wo_sc) - 2)): + # Combine 3 consecutive lines of the corpus into a single line + prev_words = corpus_text_lowercase_wo_sc[i - 1].strip() + curr_words = corpus_text_lowercase_wo_sc[i].strip() + next_words = corpus_text_lowercase_wo_sc[i + 1].strip() + new_line = prev_words + curr_words + next_words + transcript = ''.join(new_line) + if line_to_find in transcript: + found_line = True + row_to_keep[i-1] = False + row_to_keep[i] = False + row_to_keep[i+1] = False + if not found_line: + remaining_utterances[line_id] = line_to_find + + +for i in range(len(original_corpus_text)): + transcript = original_corpus_text[i].strip() + if row_to_keep[i]: + print(transcript) + +print('Sentences not removed from LOB: {}'.format(remaining_utterances), file=sys.stderr) +print('Total test+dev sentences: {}'.format(len(utterance_dict)), file=sys.stderr) +print('Number of sentences not removed from LOB: {}'. 
format(len(remaining_utterances)), file=sys.stderr) +print('LOB lines: Before: {} After: {}'.format(len(original_corpus_text), + row_to_keep.count(True)), file=sys.stderr) diff --git a/egs/iam/v1/local/score.sh b/egs/iam/v1/local/score.sh index 31564d25326..d964d70206b 100755 --- a/egs/iam/v1/local/score.sh +++ b/egs/iam/v1/local/score.sh @@ -1,5 +1,157 @@ #!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) +# Apache 2.0 +# This script is like steps/scoring/score_kaldi_wer.sh except it transcribes the 's +# using local/unk_arc_post_to_transcription.py and also it calls +# steps/scoring/score_kaldi_cer.sh at the end. -steps/scoring/score_kaldi_wer.sh "$@" -steps/scoring/score_kaldi_cer.sh --stage 2 "$@" +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +stage=0 +decode_mbr=false +stats=true +beam=6 +word_ins_penalty=0.0,0.5,1.0 +min_lmwt=3 +max_lmwt=13 +iter=final +#end configuration section. + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 +model_path=`echo $dir |xargs dirname` +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; +done + + +ref_filtering_cmd="cat" +[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter" +[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter" +hyp_filtering_cmd="cat" +[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter" +[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter" + + +if $decode_mbr ; then + echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty" +else + echo "$0: scoring with word insertion penalty=$word_ins_penalty" +fi + + +mkdir -p $dir/scoring_kaldi +cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1; +if [ $stage -le 0 ]; then + + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + mkdir -p $dir/scoring_kaldi/penalty_$wip/log + + if $decode_mbr ; then + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + acwt=\`perl -e \"print 1.0/LMWT\"\`\; \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-prune --beam=$beam ark:- ark:- \| \ + lattice-mbr-decode --word-symbol-table=$symtab \ + ark:- ark,t:- \| \ + utils/int2sym.pl -f 2- $symtab \| \ + $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + + else + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-1best ark:- ark:- \| \ + lattice-align-words $lang_or_graph/phones/word_boundary.int $model_path/final.mdl ark:- ark:- \| \ + lattice-arc-post $model_path/final.mdl ark:- - \| \ + 
local/unk_arc_post_to_transcription.py $lang_or_graph/phones.txt $lang_or_graph/words.txt data/lang_unk/oov.int \| \ + $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + fi + + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \ + cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \ + compute-wer --text --mode=present \ + "ark:cat $dir/scoring_kaldi/test_filt.txt |" ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; + + done +fi + + + +if [ $stage -le 1 ]; then + + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + for lmwt in $(seq $min_lmwt $max_lmwt); do + # adding /dev/null to the command list below forces grep to output the filename + grep WER $dir/wer_${lmwt}_${wip} /dev/null + done + done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer || exit 1 + + best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer) + best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}') + best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}') + + if [ -z "$best_lmwt" ]; then + echo "$0: we could not get the details of the best WER from the file $dir/wer_*. Probably something went wrong." + exit 1; + fi + + if $stats; then + mkdir -p $dir/scoring_kaldi/wer_details + echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight + echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty + + $cmd $dir/scoring_kaldi/log/stats1.log \ + cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ + align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\ + utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1; + + $cmd $dir/scoring_kaldi/log/stats2.log \ + cat $dir/scoring_kaldi/wer_details/per_utt \| \ + utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ + sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1; + + $cmd $dir/scoring_kaldi/log/wer_bootci.log \ + compute-wer-bootci --mode=present \ + ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ + '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1; + + fi +fi + +steps/scoring/score_kaldi_cer.sh --cmd "$cmd" --stage 2 --min-lmwt $min_lmwt \ + --max-lmwt $max_lmwt --word-ins-penalty $word_ins_penalty \ + $data $lang_or_graph $dir + +# If we got here, the scoring was successful. +# As a small aid to prevent confusion, we remove all wer_{?,??} files; +# these originate from the previous version of the scoring files +# i keep both statement here because it could lead to confusion about +# the capabilities of the script (we don't do cer in the script) +rm $dir/wer_{?,??} 2>/dev/null +rm $dir/cer_{?,??} 2>/dev/null + +exit 0; diff --git a/egs/iam/v1/local/train_lm.sh b/egs/iam/v1/local/train_lm.sh index aa4303d6a28..a673c5b3f2d 100755 --- a/egs/iam/v1/local/train_lm.sh +++ b/egs/iam/v1/local/train_lm.sh @@ -13,6 +13,7 @@ set -e stage=0 +vocab_size=50000 echo "$0 $@" # Print the command line for logging . ./utils/parse_options.sh || exit 1; @@ -57,8 +58,10 @@ if [ $stage -le 0 ]; then rm ${dir}/data/text/* 2>/dev/null || true # Using LOB and brown corpus. 
- cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt > ${dir}/data/text/text.txt - cat data/local/browncorpus/brown.txt >> ${dir}/data/text/text.txt + cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \ + local/remove_test_utterances_from_lob.py data/test/text data/val/text \ + > ${dir}/data/text/lob.txt + cat data/local/browncorpus/brown.txt >> ${dir}/data/text/brown.txt # use the validation data as the dev set. # Note: the name 'dev' is treated specially by pocolm, it automatically @@ -78,8 +81,8 @@ if [ $stage -le 0 ]; then cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt # get the wordlist from IAM text - cat ${dir}/data/text/{iam,text}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count - cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist + cat ${dir}/data/text/{iam,lob,brown}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + head -n $vocab_size ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist fi order=3 @@ -91,7 +94,7 @@ if [ $stage -le 1 ]; then # Note: if you have more than one order, use a certain amount of words as the # vocab and want to restrict max memory for 'sort', echo "$0: training the unpruned LM" - min_counts='train=2 iam=1' + min_counts='brown=2 lob=2 iam=1' wordlist=${dir}/data/wordlist lm_name="`basename ${wordlist}`_${order}" diff --git a/egs/iam/v1/local/unk_arc_post_to_transcription.py b/egs/iam/v1/local/unk_arc_post_to_transcription.py new file mode 100755 index 00000000000..c86d35e4b8a --- /dev/null +++ b/egs/iam/v1/local/unk_arc_post_to_transcription.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python + +# Copyright 2017 Ashish Arora + +import argparse +import sys + +parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""") +parser.add_argument('phones', type=str, help='phones and phonesID') +parser.add_argument('words', type=str, help='word and wordID') +parser.add_argument('unk', type=str, default='-', help='location of unk file') +parser.add_argument('--input-ark', type=str, default='-', help='where to read the input data') +parser.add_argument('--out-ark', type=str, default='-', help='where to write the output data') +args = parser.parse_args() +### main ### +phone_fh = open(args.phones, 'r') +word_fh = open(args.words, 'r') +unk_fh = open(args.unk,'r') +if args.input_ark == '-': + input_fh = sys.stdin +else: + input_fh = open(args.input_ark,'r') +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark,'wb') + +phone_dict = dict()# stores phoneID and phone mapping +phone_data_vect = phone_fh.read().strip().split("\n") +for key_val in phone_data_vect: + key_val = key_val.split(" ") + phone_dict[key_val[1]] = key_val[0] +word_dict = dict() +word_data_vect = word_fh.read().strip().split("\n") +for key_val in word_data_vect: + key_val = key_val.split(" ") + word_dict[key_val[1]] = key_val[0] +unk_val = unk_fh.read().strip().split(" ")[0] + +utt_word_dict = dict() +utt_phone_dict = dict()# stores utteranceID and phoneID +unk_word_dict = dict() +count=0 +for line in input_fh: + line_vect = line.strip().split("\t") + if len(line_vect) < 6: + print "IndexError" + print line_vect + continue + uttID = line_vect[0] + word = line_vect[4] + phones = line_vect[5] + if uttID in utt_word_dict.keys(): + utt_word_dict[uttID][count] = word + utt_phone_dict[uttID][count] = phones + else: + count = 0 + utt_word_dict[uttID] = dict() + utt_phone_dict[uttID] = dict() + 
utt_word_dict[uttID][count] = word + utt_phone_dict[uttID][count] = phones + if word == unk_val: # get character sequence for unk + phone_key_vect = phones.split(" ") + phone_val_vect = list() + for pkey in phone_key_vect: + phone_val_vect.append(phone_dict[pkey]) + phone_2_word = list() + for phone_val in phone_val_vect: + phone_2_word.append(phone_val.split('_')[0]) + phone_2_word = ''.join(phone_2_word) + utt_word_dict[uttID][count] = phone_2_word + else: + if word == '0': + word_val = ' ' + else: + word_val = word_dict[word] + utt_word_dict[uttID][count] = word_val + count += 1 + +transcription = "" +for key in sorted(utt_word_dict.iterkeys()): + transcription = key + for index in sorted(utt_word_dict[key].iterkeys()): + value = utt_word_dict[key][index] + transcription = transcription + " " + value + out_fh.write(transcription + '\n') diff --git a/egs/iam/v1/run.sh b/egs/iam/v1/run.sh index d5f66ca4110..f5c4a2b8f80 100755 --- a/egs/iam/v1/run.sh +++ b/egs/iam/v1/run.sh @@ -21,7 +21,6 @@ iam_database=/export/corpora5/handwriting_ocr/IAM . ./utils/parse_options.sh # e.g. this parses the above options # if supplied. - ./local/check_tools.sh if [ $stage -le 0 ]; then @@ -42,17 +41,26 @@ if [ $stage -le 1 ]; then fi if [ $stage -le 2 ]; then + echo "$0: Estimating a language model for decoding..." + # We do this stage before dict preparation because prepare_dict.sh + # generates the lexicon from pocolm's wordlist + local/train_lm.sh --vocab-size 50000 +fi + +if [ $stage -le 3 ]; then echo "$0: Preparing dictionary and lang..." local/prepare_dict.sh utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ data/local/dict "" data/lang/temp data/lang -fi - -if [ $stage -le 3 ]; then - echo "$0: Estimating a language model for decoding..." - local/train_lm.sh utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_big.arpa.gz \ data/local/dict/lexicon.txt data/lang_test + echo "$0: Preparing the unk model for open-vocab decoding..." + utils/lang/make_unk_lm.sh --ngram-order 4 --num-extra-ngrams 7500 \ + data/local/dict exp/unk_lang_model + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 \ + --unk-fst exp/unk_lang_model/unk_fst.txt \ + data/local/dict "" data/local/temp data/lang_unk + cp data/lang_test/G.fst data/lang_unk/G.fst fi if [ $stage -le 4 ]; then From 785198e3c0538fb2a5ee366ac777516ba731bc20 Mon Sep 17 00:00:00 2001 From: hainan-xv Date: Tue, 27 Mar 2018 17:04:25 -0400 Subject: [PATCH 08/12] [src] Add some asserts in RNNLM code (#2314) --- src/rnnlm/rnnlm-compute-state.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/rnnlm/rnnlm-compute-state.cc b/src/rnnlm/rnnlm-compute-state.cc index 119e3172fbb..4ec5fdd1dd8 100644 --- a/src/rnnlm/rnnlm-compute-state.cc +++ b/src/rnnlm/rnnlm-compute-state.cc @@ -43,6 +43,14 @@ RnnlmComputeStateInfo::RnnlmComputeStateInfo( KALDI_ERR << "Embedding file and nnet have different embedding sizes. 
"; } + if (opts.bos_index <= 0 || opts.bos_index >= word_embedding_mat.NumRows()) { + KALDI_ERR < "--bos-symbol option isn't set correctly."; + } + + if (opts.eos_index <= 0 || opts.eos_index >= word_embedding_mat.NumRows()) { + KALDI_ERR < "--eos-symbol option isn't set correctly."; + } + nnet3::ComputationRequest request1, request2, request3; CreateLoopedComputationRequestSimple(rnnlm, 1, // num_frames @@ -85,6 +93,7 @@ RnnlmComputeState* RnnlmComputeState::GetSuccessorState(int32 next_word) const { } void RnnlmComputeState::AddWord(int32 word_index) { + KALDI_ASSERT(word_index > 0 && word_index < info_.word_embedding_mat.NumRows()); previous_word_ = word_index; AdvanceChunk(); From 749839560f4dc1cd129c4ba3ed8a54fb2519f59a Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 27 Mar 2018 19:35:07 -0400 Subject: [PATCH 09/12] [src] Fix to recent commit RE RNNLM code --- src/rnnlm/rnnlm-compute-state.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rnnlm/rnnlm-compute-state.cc b/src/rnnlm/rnnlm-compute-state.cc index 4ec5fdd1dd8..a8e7a17e50b 100644 --- a/src/rnnlm/rnnlm-compute-state.cc +++ b/src/rnnlm/rnnlm-compute-state.cc @@ -44,11 +44,11 @@ RnnlmComputeStateInfo::RnnlmComputeStateInfo( } if (opts.bos_index <= 0 || opts.bos_index >= word_embedding_mat.NumRows()) { - KALDI_ERR < "--bos-symbol option isn't set correctly."; + KALDI_ERR << "--bos-symbol option isn't set correctly."; } if (opts.eos_index <= 0 || opts.eos_index >= word_embedding_mat.NumRows()) { - KALDI_ERR < "--eos-symbol option isn't set correctly."; + KALDI_ERR << "--eos-symbol option isn't set correctly."; } nnet3::ComputationRequest request1, request2, request3; From 8af60bb68a543f7901934026ab4885a9389c6c17 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 28 Mar 2018 00:16:45 -0400 Subject: [PATCH 10/12] [src] Apply limits prior to chain denominator computation, avoid failures. 
(#2308) --- .../s5c/local/chain/tuning/run_tdnn_7o.sh | 2 +- src/chain/chain-denominator.cc | 5 ++- src/cudamatrix/cu-kernels-ansi.h | 4 ++ src/cudamatrix/cu-kernels.cu | 32 ++++++++++++++++ src/cudamatrix/cu-kernels.h | 8 ++++ src/cudamatrix/cu-matrix-test.cc | 25 ++++++++++++ src/cudamatrix/cu-matrix.cc | 31 +++++++++++++++ src/cudamatrix/cu-matrix.h | 7 ++++ src/nnet3/nnet-general-component.cc | 2 +- src/nnet3/nnet-simple-component.cc | 6 +-- src/nnet3/nnet-simple-component.h | 38 ++++++++++++++++++- 11 files changed, 150 insertions(+), 10 deletions(-) diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh index 753dfc632ba..b927cc86823 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh @@ -18,7 +18,7 @@ # # # local/chain/compare_wer_general.sh --rt03 tdnn7n_sp tdnn7m26o_sp -# System tdnn7n_sp tdnn7m26j_sp +# System tdnn7n_sp tdnn7m26o_sp # WER on train_dev(tg) 12.18 11.74 # WER on train_dev(fg) 11.12 10.69 # WER on eval2000(tg) 14.9 14.6 diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index c936061de26..3a767721c6d 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -64,7 +64,10 @@ DenominatorComputation::DenominatorComputation( nnet_output.NumRows(), kUndefined, kStrideEqualNumCols); exp_nnet_output_transposed_.CopyFromMat(nnet_output, kTrans); - exp_nnet_output_transposed_.ApplyExp(); + // We limit the nnet output to the range [-30,30] before doing the exp; + // this avoids NaNs appearing in the forward-backward computation, which + // is not done in log space. + exp_nnet_output_transposed_.ApplyExpLimited(-30.0, 30.0); } diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index f2926ddc2f1..6b99a77e73b 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -200,6 +200,10 @@ void cudaF_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, MatrixDim d); void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); +void cudaD_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d, + double lower_limit, double upper_limit); +void cudaF_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d, + float lower_limit, float upper_limit); void cudaD_apply_exp_special(dim3 Gr, dim3 Bl, double* out, MatrixDim out_dim, const double* in, int in_stride); void cudaF_apply_exp_special(dim3 Gr, dim3 Bl, float* out, MatrixDim out_dim, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 50dd3d1d0ca..934a860a055 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -400,6 +400,26 @@ static void _apply_exp(Real* mat, MatrixDim d) { } } +template +__global__ +static void _apply_exp_limited(Real* mat, MatrixDim d, + Real lower_limit, Real upper_limit) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; + int32_cuda index = i + j * d.stride; + if (i < d.cols && j < d.rows) { + Real x = mat[index]; + // I'm writing !(x >= lower_limit) instead of (x < lower_limit) so that + // nan's will be set to the lower-limit. 
+ if (!(x >= lower_limit)) + x = lower_limit; + else if (x > upper_limit) + x = upper_limit; + mat[index] = exp(x); + } +} + + template __global__ static void _scale_diag_packed(Real* mat, Real value, int dim) { @@ -3734,6 +3754,11 @@ void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { _apply_exp<<>>(mat,d); } +void cudaF_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d, + float lower_limit, float upper_limit) { + _apply_exp_limited<<>>(mat, d, lower_limit, upper_limit); +} + void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d) { _apply_pow<<>>(mat, power, d); } @@ -4430,6 +4455,13 @@ void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { _apply_exp<<>>(mat,d); } +void cudaD_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d, + double lower_limit, double upper_limit) { + _apply_exp_limited<<>>(mat, d, lower_limit, upper_limit); +} + + + void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d) { _apply_pow<<>>(mat, power, d); } diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index fe706815a44..8f719a8c4a1 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -345,6 +345,14 @@ inline void cuda_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { inline void cuda_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_apply_exp(Gr, Bl, mat, d); } +inline void cuda_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d, + double lower_limit, double upper_limit) { + cudaD_apply_exp_limited(Gr, Bl, mat, d, lower_limit, upper_limit); +} +inline void cuda_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d, + float lower_limit, float upper_limit) { + cudaF_apply_exp_limited(Gr, Bl, mat, d, lower_limit, upper_limit); +} inline void cuda_apply_exp_special(dim3 Gr, dim3 Bl, double* out, MatrixDim out_dim, const double* in, int in_stride) { diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index 33db8b3e625..01030bb8353 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -194,6 +194,30 @@ static void UnitTestCuMatrixApplyExp() { } +template +static void UnitTestCuMatrixApplyExpLimited() { + int32 M = 10 + Rand() % 20, N = 10 + Rand() % 20; + Matrix H(M, N); + H.SetRandn(); + + + BaseFloat lower_limit = -0.2, upper_limit = 0.2; + + CuMatrix D(H); + + D.ApplyExpLimited(lower_limit, upper_limit); + + + H.ApplyFloor(lower_limit); + H.ApplyCeiling(upper_limit); + H.ApplyExp(); + + Matrix H2(D); + + AssertEqual(H,H2); +} + + template static void UnitTestCuMatrixSigmoid() { @@ -2895,6 +2919,7 @@ static void UnitTestCuMatrixEqualElementMask() { template void CudaMatrixUnitTest() { UnitTestCuMatrixApplyExpSpecial(); + UnitTestCuMatrixApplyExpLimited(); UnitTextCuMatrixAddSmatMat(); UnitTextCuMatrixAddMatSmat(); UnitTextCuMatrixAddSmat(); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 34290561cc5..beccd9dc4a5 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -2498,6 +2498,37 @@ void CuMatrixBase::ApplyExp() { } } +template +void CuMatrixBase::ApplyExpLimited(Real lower_limit, Real upper_limit) { + KALDI_ASSERT(upper_limit > lower_limit); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + CuTimer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_apply_exp_limited(dimGrid, dimBlock, data_, Dim(), lower_limit, upper_limit); + 
CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim); + } else +#endif + { + int32 num_rows = num_rows_, num_cols = num_cols_; + for (int32 r = 0; r < num_rows; r++) { + Real *row_data = this->RowData(r); + for (int32 c = 0; c < num_cols; c++) { + Real x = row_data[c]; + if (!(x >= lower_limit)) + x = lower_limit; + if (x > upper_limit) + x = upper_limit; + row_data[c] = Exp(x); + } + } + } +} + + template void CuMatrixBase::ApplyExpSpecial() { #if HAVE_CUDA == 1 diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 86c50cfc485..03e69b639d3 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -399,6 +399,13 @@ class CuMatrixBase { void ApplyCeiling(Real ceiling_val); void ApplyExp(); + + /// This is equivalent to running: + /// ApplyFloor(lower_limit); + /// ApplyCeiling(upper_limit); + /// ApplyExp() + void ApplyExpLimited(Real lower_limit, Real upper_limit); + /// For each element x of the matrix, set it to /// (x < 0 ? exp(x) : x + 1). This function is used /// in our RNNLM training. diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 669e5112793..00a31fa897c 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1414,7 +1414,7 @@ void* DropoutMaskComponent::Propagate( BaseFloat dropout_proportion = dropout_proportion_; KALDI_ASSERT(dropout_proportion >= 0.0 && dropout_proportion <= 1.0); - if (dropout_proportion_ == 0) { + if (dropout_proportion == 0) { out->Set(1.0); return NULL; } diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index f9f286aaed2..4eb078c0fcb 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -3730,15 +3730,11 @@ void NaturalGradientPerElementScaleComponent::InitFromConfig(ConfigLine *cfl) { // for the preconditioner actually exceeds the memory for the // parameters (by "rank"). update_period = 10; - BaseFloat num_samples_history = 2000.0, alpha = 4.0, - max_change_per_minibatch = 0.0; + BaseFloat num_samples_history = 2000.0, alpha = 4.0; cfl->GetValue("rank", &rank); cfl->GetValue("update-period", &update_period); cfl->GetValue("num-samples-history", &num_samples_history); cfl->GetValue("alpha", &alpha); - cfl->GetValue("max-change-per-minibatch", &max_change_per_minibatch); - if (max_change_per_minibatch != 0.0) - KALDI_WARN << "max-change-per-minibatch is now ignored, use 'max-change'"; InitLearningRatesFromConfig(cfl); std::string filename; // Accepts "scales" config (for filename) or "dim" -> random init, for testing. diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 9d438678f5d..3929c253aab 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -1446,6 +1446,19 @@ class PermuteComponent: public Component { trainable scale; it's like a linear component with a diagonal matrix. This version (and its child class NaturalGradientPerElementScaleComponent) requires the input for backprop. See also ScaleAndOffsetComponent. + + Accepted values on its config line, with defaults if applicable: + + vector If specified, the offsets will be read from this file ('vector' + is interpreted as an rxfilename). + + dim The dimension that this component inputs and outputs. + Only required if 'vector' is not specified. + + param-mean=1.0 Mean of randomly initialized offset parameters; should only + be supplied if 'vector' is not supplied. 
+ param-stddev=0.0 Standard deviation of randomly initialized offset parameters; + should only be supplied if 'vector' is not supplied. */ class PerElementScaleComponent: public UpdatableComponent { public: @@ -1670,8 +1683,29 @@ class ConstantFunctionComponent: public UpdatableComponent { -// NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but -// it uses a natural gradient update for the per-element scales. +/** + NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but + it uses a natural gradient update for the per-element scales. + + Accepted values on its config line, with defaults if applicable: + + vector If specified, the offsets will be read from this file ('vector' + is interpreted as an rxfilename). + + dim The dimension that this component inputs and outputs. + Only required if 'vector' is not specified. + + param-mean=1.0 Mean of randomly initialized offset parameters; should only + be supplied if 'vector' is not supplied. + param-stddev=0.0 Standard deviation of randomly initialized offset parameters; + should only be supplied if 'vector' is not supplied. + + And the natural-gradient-related configuration values: + rank=8 + update-period=10 + num-samples-history=2000.0 + alpha=4.0 +*/ class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent { public: From 5e6bd39e0ec0e510cb7202990c22fe8b8b9d817c Mon Sep 17 00:00:00 2001 From: Lucas Jo Date: Thu, 29 Mar 2018 03:20:14 +0900 Subject: [PATCH 11/12] [tools, extras] morfessor installation script (#2299) * added install_morfessor.sh and its symbolic link * deleted symbolic link * retab with size 2 * simplified installation process acc. to psmit's advice --- tools/extras/install_morfessor.sh | 40 +++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100755 tools/extras/install_morfessor.sh diff --git a/tools/extras/install_morfessor.sh b/tools/extras/install_morfessor.sh new file mode 100755 index 00000000000..0722f0fa16a --- /dev/null +++ b/tools/extras/install_morfessor.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright 2017 Atlas Guide (Author : Lucas Jo) +# +# Apache 2.0 +# + +echo "#### installing morfessor" +dirname=morfessor +if [ ! -d ./$dirname ]; then + mkdir -p ./$dirname + git clone https://github.com/aalto-speech/morfessor.git morfessor || + { + echo >&2 "$0: Error git clone operation " + echo >&2 " Failed in cloning the github repository (https://github.com/aalto-speech/morfessor.git)" + exit + } +fi + +# env.sh setup +( + set +u + [ ! -z "${MORFESSOR}" ] && \ + echo >&2 "morfessor variable is aleady defined. undefining..." && \ + unset MORFESSOR + + [ -f ./env.sh ] && . ./env.sh + + [ ! 
-z "${MORFESSOR}" ] && \ + echo >&2 "MORFESSOR config is already in env.sh" && exit + + wd=`pwd` + wd=`readlink -f $wd || pwd` + + echo "export MORFESSOR=\"$wd/morfessor\"" + echo "export PATH=\"\$PATH:\${MORFESSOR}/scripts\"" + echo "export PYTHONPATH=\"\${PYTHONPATH:-}:\$MORFESSOR\"" +) >> env.sh + +echo >&2 "installation of MORFESSOR finished successfully" +echo >&2 "please source tools/env.sh in your path.sh to enable it" From 191b39a14803f3216a443a61cbbbb5278fc47cfe Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 28 Mar 2018 14:20:51 -0400 Subject: [PATCH 12/12] [src,scripts,egs] Semi-supervised training on Fisher English (#2140) --- .../s5/local/nnet3/run_tdnn_multilingual.sh | 1 - .../s5/local/fisher_create_test_lang.sh | 27 +- .../s5/local/fisher_train_lms_pocolm.sh | 160 +++++++ .../s5/local/nnet3/run_ivector_common.sh | 108 ++--- egs/fisher_english/s5/local/run_unk_model.sh | 23 + egs/fisher_english/s5/local/score.sh | 60 +-- .../s5/local/semisup/chain/run_tdnn.sh | 1 + .../chain/run_tdnn_100k_semisupervised.sh | 1 + .../chain/run_tdnn_50k_semisupervised.sh | 1 + .../tuning/run_tdnn_100k_semisupervised_1a.sh | 434 +++++++++++++++++ .../local/semisup/chain/tuning/run_tdnn_1a.sh | 247 ++++++++++ .../tuning/run_tdnn_50k_semisupervised_1a.sh | 453 ++++++++++++++++++ .../s5/local/semisup/run_100k.sh | 219 +++++++++ .../s5/local/semisup/run_50k.sh | 229 +++++++++ egs/fisher_english/s5/local/wer_output_filter | 16 + egs/fisher_english/s5/run.sh | 29 ++ egs/wsj/s5/steps/best_path_weights.sh | 118 +++++ .../s5/steps/libs/nnet3/report/log_parse.py | 4 +- .../nnet3/train/chain_objf/acoustic_model.py | 108 ++++- .../nnet3/train/frame_level_objf/common.py | 17 +- .../steps/libs/nnet3/xconfig/basic_layers.py | 72 ++- .../lmrescore_const_arpa_undeterminized.sh | 105 ++++ egs/wsj/s5/steps/nnet3/chain/build_tree.sh | 2 +- .../chain/build_tree_multiple_sources.sh | 275 +++++++++++ egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 245 +++++++--- .../nnet3/chain/multilingual/combine_egs.sh | 168 +++++++ egs/wsj/s5/steps/nnet3/chain/train.py | 29 +- egs/wsj/s5/steps/nnet3/decode.sh | 2 +- egs/wsj/s5/steps/nnet3/decode_semisup.sh | 190 ++++++++ .../allocate_multilingual_examples.py | 327 +++++-------- .../steps/nnet3/multilingual/combine_egs.sh | 80 ++-- .../s5/steps/nnet3/report/generate_plots.py | 5 +- egs/wsj/s5/steps/subset_ali_dir.sh | 67 +++ src/chain/chain-supervision.cc | 47 +- src/chain/chain-supervision.h | 13 +- src/chainbin/nnet3-chain-combine.cc | 13 +- src/chainbin/nnet3-chain-copy-egs.cc | 195 +++++--- src/chainbin/nnet3-chain-get-egs.cc | 130 ++++- src/chainbin/nnet3-chain-normalize-egs.cc | 14 +- src/lat/lattice-functions.cc | 111 ++++- src/lat/lattice-functions.h | 44 ++ src/latbin/lattice-compose.cc | 32 +- src/latbin/lattice-determinize-non-compact.cc | 95 ---- .../lattice-determinize-phone-pruned.cc | 38 +- src/latbin/lattice-determinize-pruned.cc | 37 +- src/latbin/lattice-scale.cc | 37 +- src/nnet3/nnet-chain-diagnostics.cc | 33 +- src/nnet3/nnet-chain-diagnostics.h | 5 + src/nnet3/nnet-chain-example.cc | 14 +- src/nnet3/nnet-chain-example.h | 6 +- src/nnet3/nnet-diagnostics.cc | 8 +- src/nnet3/nnet-example-utils.cc | 1 - src/nnet3/nnet-example-utils.h | 2 - src/nnet3bin/nnet3-copy-egs.cc | 121 +++-- 54 files changed, 3994 insertions(+), 825 deletions(-) create mode 100755 egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh create mode 100755 egs/fisher_english/s5/local/run_unk_model.sh mode change 100755 => 120000 egs/fisher_english/s5/local/score.sh create mode 120000 
egs/fisher_english/s5/local/semisup/chain/run_tdnn.sh create mode 120000 egs/fisher_english/s5/local/semisup/chain/run_tdnn_100k_semisupervised.sh create mode 120000 egs/fisher_english/s5/local/semisup/chain/run_tdnn_50k_semisupervised.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh create mode 100644 egs/fisher_english/s5/local/semisup/run_100k.sh create mode 100644 egs/fisher_english/s5/local/semisup/run_50k.sh create mode 100755 egs/fisher_english/s5/local/wer_output_filter create mode 100755 egs/wsj/s5/steps/best_path_weights.sh create mode 100755 egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh create mode 100755 egs/wsj/s5/steps/nnet3/chain/build_tree_multiple_sources.sh create mode 100755 egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh create mode 100755 egs/wsj/s5/steps/nnet3/decode_semisup.sh create mode 100755 egs/wsj/s5/steps/subset_ali_dir.sh diff --git a/egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh b/egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh index 65808822db3..22ba636f06a 100755 --- a/egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh +++ b/egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh @@ -247,7 +247,6 @@ if [ $stage -le 10 ] && [ ! -z $megs_dir ]; then common_egs_dir="${multi_egs_dirs[@]} $megs_dir" steps/nnet3/multilingual/combine_egs.sh $egs_opts \ --cmd "$decode_cmd" \ - --samples-per-iter 400000 \ $num_langs ${common_egs_dir[@]} || exit 1; fi diff --git a/egs/fisher_english/s5/local/fisher_create_test_lang.sh b/egs/fisher_english/s5/local/fisher_create_test_lang.sh index f0926d2ceab..ac3e16c9c78 100755 --- a/egs/fisher_english/s5/local/fisher_create_test_lang.sh +++ b/egs/fisher_english/s5/local/fisher_create_test_lang.sh @@ -1,23 +1,25 @@ #!/bin/bash -# -if [ -f path.sh ]; then . ./path.sh; fi - -mkdir -p data/lang_test +# This script formats ARPA LM into G.fst. arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz +dir=data/lang_test + +if [ -f ./path.sh ]; then . ./path.sh; fi +. utils/parse_options.sh + [ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; -mkdir -p data/lang_test -cp -r data/lang/* data/lang_test +mkdir -p $dir +cp -r data/lang/* $dir gunzip -c "$arpa_lm" | \ arpa2fst --disambig-symbol=#0 \ - --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst + --read-symbol-table=$dir/words.txt - $dir/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic $dir/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. @@ -27,22 +29,21 @@ fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/l echo Performing further checks # Checking that G.fst is determinizable. -fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G. +fstdeterminize $dir/G.fst /dev/null || echo Error determinizing G. # Checking that L_disambig.fst is determinizable. -fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L. +fstdeterminize $dir/L_disambig.fst /dev/null || echo Error determinizing L. 
# Checking that disambiguated lexicon times G is determinizable
 # Note: we do this with fstdeterminizestar not fstdeterminize, as
 # fstdeterminize was taking forever (presumbaly relates to a bug
 # in this version of OpenFst that makes determinization slow for
 # some case).
-fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \
+fsttablecompose $dir/L_disambig.fst $dir/G.fst | \
   fstdeterminizestar >/dev/null || echo Error

 # Checking that LG is stochastic:
-fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
+fsttablecompose data/lang/L_disambig.fst $dir/G.fst | \
   fstisstochastic || echo "[log:] LG is not stochastic"
-
 echo "$0 succeeded"
diff --git a/egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh b/egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh
new file mode 100755
index 00000000000..906703953a1
--- /dev/null
+++ b/egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh
@@ -0,0 +1,160 @@
+#!/bin/bash
+
+# Copyright 2016  Vincent Nguyen
+#           2016  Johns Hopkins University (author: Daniel Povey)
+#           2017  Vimal Manohar
+# Apache 2.0
+#
+# This script is used to train LMs using the pocolm toolkit.
+# We use limit-unk-history=true, which truncates the history left of an OOV word.
+# This ensures the graph is compact when using a phone LM to model the OOV word.
+# See the script local/run_unk_model.sh.
+
+set -e
+stage=0
+
+text=data/train/text
+lexicon=data/local/dict/lexicon.txt
+dir=data/local/pocolm
+
+num_ngrams_large=5000000
+num_ngrams_small=2500000
+
+echo "$0 $@" # Print the command line for logging
+. utils/parse_options.sh || exit 1;
+
+lm_dir=${dir}/data
+
+mkdir -p $dir
+. ./path.sh || exit 1; # for KALDI_ROOT
+export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
+( # First make sure the pocolm toolkit is installed.
+  cd $KALDI_ROOT/tools || exit 1;
+  if [ -d pocolm ]; then
+    echo Not installing the pocolm toolkit since it is already there.
+  else
+    echo "$0: Please install the PocoLM toolkit with: "
+    echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
+    exit 1;
+  fi
+) || exit 1;
+
+for f in "$text" "$lexicon"; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+num_dev_sentences=10000
+
+if [ $stage -le 0 ]; then
+  mkdir -p ${dir}/data
+  mkdir -p ${dir}/data/text
+
+  echo "$0: Getting the Data sources"
+
+  rm ${dir}/data/text/* 2>/dev/null || true
+
+  cleantext=$dir/text_all.gz
+
+  cut -d ' ' -f 2- $text | awk -v lex=$lexicon '
+  BEGIN{
+    while((getline<lex) >0) { seen[$1]=1; }
+  }
+  {
+    for(n=1; n<=NF;n++) {
+      if (seen[$n]) {
+        printf("%s ", $n);
+      } else {
+        printf("<unk> ");
+      }
+    }
+    printf("\n");
+  }' | gzip -c > $cleantext || exit 1;
+
+  # This is for reporting perplexities
+  gunzip -c $dir/text_all.gz | head -n $num_dev_sentences > \
+    ${dir}/data/test.txt
+
+  # use a subset of the annotated training data as the dev set.
+  # Note: the name 'dev' is treated specially by pocolm, it automatically
+  # becomes the dev set.
+  gunzip -c $dir/text_all.gz | tail -n +$[num_dev_sentences+1] | \
+    head -n $num_dev_sentences > ${dir}/data/text/dev.txt
+
+  gunzip -c $dir/text_all.gz | tail -n +$[2*num_dev_sentences+1] > \
+    ${dir}/data/text/train.txt
+
+  # for reporting perplexities, we'll use the "real" dev set.
+  # (a subset of the training data is used as ${dir}/data/text/dev.txt to work
+  # out interpolation weights.
+  # note, we can't put it in ${dir}/data/text/, because then pocolm would use
+  # it as one of the data sources.
+  cat data/dev/text data/test/text | cut -d " " -f 2- > ${dir}/data/real_dev_set.txt
+
+  cat $lexicon | awk '{print $1}' | sort | uniq | awk '
+  {
+    if ($1 == "<s>") {
+      print "<s> is in the vocabulary!" | "cat 1>&2"
+      exit 1;
+    }
+    if ($1 == "</s>") {
+      print "</s> is in the vocabulary!" | "cat 1>&2"
+      exit 1;
+    }
+    printf("%s\n", $1);
+  }' > $dir/data/wordlist || exit 1;
+fi
+
+order=4
+wordlist=${dir}/data/wordlist
+
+lm_name="`basename ${wordlist}`_${order}"
+min_counts='train=1'
+if [ -n "${min_counts}" ]; then
+  lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`"
+fi
+
+unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm
+
+if [ $stage -le 1 ]; then
+  # decide on the vocabulary.
+  # Note: you'd use --wordlist if you had a previously determined word-list
+  # that you wanted to use.
+  # Note: if you have more than one order, use a certain amount of words as the
+  # vocab and want to restrict max memory for 'sort',
+  echo "$0: training the unpruned LM"
+  train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \
+              --limit-unk-history=true \
+              --fold-dev-into=train ${bypass_metaparam_optim_opt} \
+              --min-counts="${min_counts}" \
+              ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}
+
+  get_data_prob.py ${dir}/data/test.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' | tee ${unpruned_lm_dir}/perplexity_test.log
+
+  get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' | tee ${unpruned_lm_dir}/perplexity_real_dev_set.log
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: pruning the LM (to larger size)"
+  # Using 5 million n-grams for a big LM for rescoring purposes.
+  prune_lm_dir.py --target-num-ngrams=$num_ngrams_large --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big
+
+  get_data_prob.py ${dir}/data/test.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_big/perplexity_test.log
+
+  get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_big/perplexity_real_dev_set.log
+
+  mkdir -p ${dir}/data/arpa
+  format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz
+fi
+
+if [ $stage -le 3 ]; then
+  echo "$0: pruning the LM (to smaller size)"
+  # Using 2.5 million n-grams for a smaller LM for graph building.
+  # Prune from the bigger-pruned LM, it'll be faster.
+  prune_lm_dir.py --target-num-ngrams=$num_ngrams_small ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small
+
+  get_data_prob.py ${dir}/data/test.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_small/perplexity_test.log
+
+  get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_small/perplexity_real_dev_set.log
+
+  format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz
+fi
diff --git a/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh b/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh
index f6dc67991f5..b203f9638b4 100755
--- a/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh
+++ b/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh
@@ -1,21 +1,22 @@
 #!/bin/bash
+# Copyright 2017  Hossein Hadian
+#           2017  Vimal Manohar
+# Apache 2.0
 . 
./cmd.sh set -e stage=1 -generate_alignments=true # false if doing chain training speed_perturb=true -train_set=train +train_set=train # Supervised training set +ivector_train_set= # data set for training i-vector extractor. + # If not provided, train_set will be used. -lda_train_set=train_100k nnet3_affix= -gmm=tri2_ali # should also contain alignments for $lda_train_set +exp_root=exp . ./path.sh . ./utils/parse_options.sh -gmm_dir=exp/$gmm - # perturbed data preparation if [ "$speed_perturb" == "true" ]; then if [ $stage -le 1 ]; then @@ -23,32 +24,22 @@ if [ "$speed_perturb" == "true" ]; then # to perturb the normal data to get the alignments. # _sp stands for speed-perturbed - for datadir in ${train_set}; do - utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1 - utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2 - utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2 - utils/validate_data_dir.sh --no-feats data/${datadir}_tmp - rm -r data/temp1 data/temp2 + for datadir in ${train_set} ${ivector_train_set}; do + utils/data/perturb_data_dir_speed_3way.sh data/${datadir} data/${datadir}_sp + utils/fix_data_dir.sh data/${datadir}_sp mfccdir=mfcc_perturbed steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ - data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; - utils/fix_data_dir.sh data/${datadir}_tmp - - utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0 - utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0 + data/${datadir}_sp $exp_root/make_mfcc/${datadir}_sp $mfccdir || exit 1; + steps/compute_cmvn_stats.sh \ + data/${datadir}_sp $exp_root/make_mfcc/${datadir}_sp $mfccdir || exit 1; utils/fix_data_dir.sh data/${datadir}_sp - rm -r data/temp0 data/${datadir}_tmp done fi - - if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then - #obtain the alignment of the perturbed data - steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ - data/${train_set}_sp data/lang exp/tri5a exp/tri5a_ali_${train_set}_sp || exit 1 - fi train_set=${train_set}_sp + if ! 
[ -z "$ivector_train_set" ]; then + ivector_train_set=${ivector_train_set}_sp + fi fi if [ $stage -le 3 ]; then @@ -58,28 +49,13 @@ if [ $stage -le 3 ]; then utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/mfcc/fisher_english-$date/s5b/$mfccdir/storage $mfccdir/storage fi - # the 100k directory is copied seperately, as - # we want to use exp/tri2_ali for lda_mllt training - # the main train directory might be speed_perturbed - for dataset in $train_set $lda_train_set; do + for dataset in $ivector_train_set $train_set; do utils/copy_data_dir.sh data/$dataset data/${dataset}_hires - - # scale the waveforms, this is useful as we don't use CMVN - data_dir=data/${dataset}_hires - cat $data_dir/wav.scp | python -c " -import sys, os, subprocess, re, random -scale_low = 1.0/8 -scale_high = 2.0 -for line in sys.stdin.readlines(): - if len(line.strip()) == 0: - continue - print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) -"| sort -k1,1 -u > $data_dir/wav.scp_scaled || exit 1; - mv $data_dir/wav.scp_scaled $data_dir/wav.scp + utils/data/perturb_data_dir_volume.sh data/${dataset}_hires steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir; - steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir; + --cmd "$train_cmd" data/${dataset}_hires $exp_root/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires $exp_root/make_hires/${dataset} $mfccdir; # Remove the small number of utterances that couldn't be extracted for some # reason (e.g. too short; no such file). @@ -90,57 +66,55 @@ for line in sys.stdin.readlines(): # Create MFCCs for the eval set utils/copy_data_dir.sh data/$dataset data/${dataset}_hires steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \ - data/${dataset}_hires exp/make_hires/$dataset $mfccdir; - steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + data/${dataset}_hires $exp_root/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires $exp_root/make_hires/$dataset $mfccdir; utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems done +fi - # Take the first 30k utterances (about 1/8th of the data) this will be used - # for the diagubm training - utils/subset_data_dir.sh --first data/${train_set}_hires 30000 data/${train_set}_30k_hires - utils/data/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires # 33hr +if [ -z "$ivector_train_set" ]; then + ivector_train_set=$train_set fi # ivector extractor training if [ $stage -le 4 ]; then - # We need to build a small system just because we need the LDA+MLLT transform - # to train the diag-UBM on top of. We use --num-iters 13 because after we get - # the transform (12th iter is the last), any further training is pointless. - # this decision is based on fisher_english - steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" \ - 5500 90000 data/${lda_train_set}_hires \ - data/lang $gmm_dir exp/nnet3${nnet3_affix}/tri3a + --max-utts 10000 --subsample 2 \ + data/${ivector_train_set}_hires \ + $exp_root/nnet3${nnet3_affix}/pca_transform fi if [ $stage -le 5 ]; then - # To train a diagonal UBM we don't need very much data, so use the smallest subset. 
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ - data/${train_set}_30k_nodup_hires 512 exp/nnet3${nnet3_affix}/tri3a exp/nnet3${nnet3_affix}/diag_ubm + data/${ivector_train_set}_hires 512 \ + $exp_root/nnet3${nnet3_affix}/pca_transform $exp_root/nnet3${nnet3_affix}/diag_ubm fi if [ $stage -le 6 ]; then - # iVector extractors can be sensitive to the amount of data, but this one has a - # fairly small dim (defaults to 100) so we don't use all of it, we use just the - # 100k subset (just under half the data). steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ - data/${lda_train_set}_hires exp/nnet3${nnet3_affix}/diag_ubm exp/nnet3${nnet3_affix}/extractor || exit 1; + data/${ivector_train_set}_hires $exp_root/nnet3${nnet3_affix}/diag_ubm \ + $exp_root/nnet3${nnet3_affix}/extractor || exit 1; fi if [ $stage -le 7 ]; then # We extract iVectors on all the ${train_set} data, which will be what we # train the system on. - # having a larger number of speakers is helpful for generalization, and to # handle per-utterance decoding well (iVector starts at zero). - steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires + utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \ + data/${ivector_train_set}_hires data/${ivector_train_set}_max2_hires steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ - data/${train_set}_max2_hires exp/nnet3${nnet3_affix}/extractor exp/nnet3${nnet3_affix}/ivectors_${train_set}_hires || exit 1; + data/${ivector_train_set}_max2_hires $exp_root/nnet3${nnet3_affix}/extractor \ + $exp_root/nnet3${nnet3_affix}/ivectors_${ivector_train_set}_hires || exit 1; +fi +if [ $stage -le 8 ]; then for dataset in test dev; do steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ - data/${dataset}_hires exp/nnet3${nnet3_affix}/extractor exp/nnet3${nnet3_affix}/ivectors_${dataset}_hires || exit 1; + data/${dataset}_hires $exp_root/nnet3${nnet3_affix}/extractor \ + $exp_root/nnet3${nnet3_affix}/ivectors_${dataset}_hires || exit 1; done fi diff --git a/egs/fisher_english/s5/local/run_unk_model.sh b/egs/fisher_english/s5/local/run_unk_model.sh new file mode 100755 index 00000000000..1fe658bda79 --- /dev/null +++ b/egs/fisher_english/s5/local/run_unk_model.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar + +# This script prepares lang directory with UNK modeled by a phone LM. + +utils/lang/make_unk_lm.sh data/local/dict exp/unk_lang_model || exit 1 + +utils/prepare_lang.sh \ + --unk-fst exp/unk_lang_model/unk_fst.txt \ + data/local/dict "" data/local/lang data/lang_unk + +# note: it's important that the LM we built in data/lang/G.fst was created using +# pocolm with the option --limit-unk-history=true (see ted_train_lm.sh). This +# keeps the graph compact after adding the unk model (we only have to add one +# copy of it). + +exit 0 + +## Caution: if you use this unk-model stuff, be sure that the scoring script +## does not use lattice-align-words-lexicon, because it's not compatible with +## the unk-model. Instead you should use lattice-align-words (of course, this +## only works if you have position-dependent phones). diff --git a/egs/fisher_english/s5/local/score.sh b/egs/fisher_english/s5/local/score.sh deleted file mode 100755 index c381abf7277..00000000000 --- a/egs/fisher_english/s5/local/score.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash -# Copyright Johns Hopkins University (Author: Daniel Povey) 2012. 
Apache 2.0. - -# begin configuration section. -cmd=run.pl -min_lmwt=5 -max_lmwt=17 -#end configuration section. - -[ -f ./path.sh ] && . ./path.sh -. parse_options.sh || exit 1; - -if [ $# -ne 3 ]; then - echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " - echo " Options:" - echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." - echo " --min_lmwt # minumum LM-weight for lattice rescoring " - echo " --max_lmwt # maximum LM-weight for lattice rescoring " - exit 1; -fi - -data=$1 -lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. -dir=$3 - -model=$dir/../final.mdl # assume model one level up from decoding dir. - -for f in $data/text $lang/words.txt $dir/lat.1.gz; do - [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; -done - -name=`basename $data`; # e.g. eval2000 - -mkdir -p $dir/scoring/log - - -function filter_text { - perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; } - while() { @A = split(" ", $_); $id = shift @A; print "$id "; - foreach $a (@A) { if (!defined $bad{$a}) { print "$a "; }} print "\n"; }' \ - '[NOISE]' '[LAUGHTER]' '[VOCALIZED-NOISE]' '' '%HESITATION' -} - -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \ - lattice-best-path --lm-scale=LMWT --word-symbol-table=$lang/words.txt \ - "ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1; - -for lmwt in `seq $min_lmwt $max_lmwt`; do - utils/int2sym.pl -f 2- $lang/words.txt <$dir/scoring/$lmwt.tra | \ - filter_text > $dir/scoring/$lmwt.txt || exit 1; -done - -filter_text <$data/text >$dir/scoring/text.filt - -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ - compute-wer --text --mode=present \ - ark:$dir/scoring/text.filt ark:$dir/scoring/LMWT.txt ">&" $dir/wer_LMWT || exit 1; - -exit 0 diff --git a/egs/fisher_english/s5/local/score.sh b/egs/fisher_english/s5/local/score.sh new file mode 120000 index 00000000000..6a200b42ed3 --- /dev/null +++ b/egs/fisher_english/s5/local/score.sh @@ -0,0 +1 @@ +../steps/scoring/score_kaldi_wer.sh \ No newline at end of file diff --git a/egs/fisher_english/s5/local/semisup/chain/run_tdnn.sh b/egs/fisher_english/s5/local/semisup/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/fisher_english/s5/local/semisup/chain/run_tdnn_100k_semisupervised.sh b/egs/fisher_english/s5/local/semisup/chain/run_tdnn_100k_semisupervised.sh new file mode 120000 index 00000000000..705b1a1dd12 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/run_tdnn_100k_semisupervised.sh @@ -0,0 +1 @@ +tuning/run_tdnn_100k_semisupervised_1a.sh \ No newline at end of file diff --git a/egs/fisher_english/s5/local/semisup/chain/run_tdnn_50k_semisupervised.sh b/egs/fisher_english/s5/local/semisup/chain/run_tdnn_50k_semisupervised.sh new file mode 120000 index 00000000000..70ebebf3c13 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/run_tdnn_50k_semisupervised.sh @@ -0,0 +1 @@ +tuning/run_tdnn_50k_semisupervised_1a.sh \ No newline at end of file diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh new file mode 100644 index 00000000000..9ba7da6e361 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh @@ -0,0 +1,434 @@ +#!/bin/bash + +# 
Copyright 2017 Vimal Manohar +# Apache 2.0 + +# This script is semi-supervised recipe with 100 hours of supervised data +# and 250 hours unsupervised data with naive splitting. +# Based on "Semi-Supervised Training of Acoustic Models using Lattice-Free MMI", +# Vimal Manohar, Hossein Hadian, Daniel Povey, Sanjeev Khudanpur, ICASSP 2018 +# http://www.danielpovey.com/files/2018_icassp_semisupervised_mmi.pdf +# local/semisup/run_100k.sh shows how to call this. + +# This version of script uses only supervised data for i-vector extractor +# training as against using the combined data as in run_tdnn_50k_semisupervised.sh. +# We use 3-gram LM trained on 100 hours of supervised data. We do not have +# enough data to do 4-gram LM rescoring as in run_tdnn_50k_semisupervised.sh. + +# This script uses phone LM to model UNK. +# This script uses the same tree as that for the seed model. +# See the comments in the script about how to change these. + +# Unsupervised set: train_unsup100k_250k (250 hour subset of Fisher excluding 100 hours for supervised) +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Naive split lattices + +# output-0 and output-1 are for superivsed and unsupervised data respectively. + +# Semi-supervised training train_sup +# WER on dev 18.70 +# WER on test 18.18 +# Final output-0 train prob -0.1345 +# Final output-0 valid prob -0.1547 +# Final output-0 train prob (xent) -1.3683 +# Final output-0 valid prob (xent) -1.4077 +# Final output-1 train prob -0.6856 +# Final output-1 valid prob -0.6815 +# Final output-1 train prob (xent) -1.1224 +# Final output-1 valid prob (xent) -1.2218 + +set -u -e -o pipefail + +stage=0 # Start from -1 for supervised seed system training +train_stage=-100 +nj=80 +test_nj=50 + +# The following 3 options decide the output directory for semi-supervised +# chain system +# dir=${exp_root}/chain${chain_affix}/tdnn${tdnn_affix} + +exp_root=exp/semisup_100k +chain_affix= # affix for chain dir +tdnn_affix=_semisup_1a # affix for semi-supervised chain system + +# Datasets -- Expects data/$supervised_set and data/$unsupervised_set to be +# present +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k + +# Input seed system +sup_chain_dir=exp/semisup_100k/chain/tdnn_1a_sp # supervised chain system +sup_lat_dir=exp/semisup_100k/chain/tri4a_train_sup_unk_lats # Seed model options +sup_tree_dir=exp/semisup_100k/chain/tree_bi_a # tree directory for supervised chain system +ivector_root_dir=exp/semisup_100k/nnet3 # i-vector extractor root directory + +# Semi-supervised options +supervision_weights=1.0,1.0 # Weights for supervised, unsupervised data egs. + # Can be used to scale down the effect of unsupervised data + # by using a smaller scale for it e.g. 1.0,0.3 +lm_weights=3,2 # Weights on phone counts from supervised, unsupervised data for denominator FST creation + +sup_egs_dir= # Supply this to skip supervised egs creation +unsup_egs_dir= # Supply this to skip unsupervised egs creation +unsup_egs_opts= # Extra options to pass to unsupervised egs creation + +# Neural network opts +xent_regularize=0.1 + +decode_iter= # Iteration to decode with + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. 
./utils/parse_options.sh + +# The following can be replaced with the versions that model +# UNK using phone LM. $sup_lat_dir should also ideally be changed. +unsup_decode_lang=data/lang_test_poco_sup100k_unk +unsup_decode_graph_affix=_poco_sup100k_unk +test_lang=data/lang_test_poco_unk +test_graph_affix=_poco_unk + +dir=$exp_root/chain${chain_affix}/tdnn${tdnn_affix} + +if ! cuda-compiled; then + cat < \ +# $sup_chain_dir/best_path_${unsupervised_set_perturbed}/frame_subsampling_factor +# +# # This should be 1 if using a different source for supervised data alignments. +# # However alignments in seed tree directory have already been sub-sampled. +# echo $frame_subsampling_factor > \ +# $sup_tree_dir/frame_subsampling_factor +# +# # Build a new tree using stats from both supervised and unsupervised data +# steps/nnet3/chain/build_tree_multiple_sources.sh \ +# --use-fmllr false --context-opts "--context-width=2 --central-position=1" \ +# --frame-subsampling-factor $frame_subsampling_factor \ +# 7000 $lang \ +# data/${supervised_set_perturbed} \ +# ${sup_tree_dir} \ +# data/${unsupervised_set_perturbed} \ +# $chaindir/best_path_${unsupervised_set_perturbed} \ +# $treedir || exit 1 +# fi +# +# sup_tree_dir=$treedir # Use the new tree dir for further steps + +# Train denominator FST using phone alignments from +# supervised and unsupervised data +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${sup_tree_dir} ${sup_chain_dir}/best_path_${unsupervised_set_perturbed} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=725 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=725 + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=725 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=725 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=725 + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=725 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=725 target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
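  # Worked example with the default xent_regularize=0.1 set above:
  # learning_rate_factor = 0.5 / 0.1 = 5.0, so the xent output layer's learning
  # rate is 5x the base rate; that cancels the down-weighting of the xent
  # objective and leaves its effective learning rate at 0.5x the base rate
  # whatever value of xent_regularize is chosen.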
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=725 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + # We use separate outputs for supervised and unsupervised data + # so we can properly track the train and valid objectives. + + output name=output-0 input=output.affine + output name=output-1 input=output.affine + + output name=output-0-xent input=output-xent.log-softmax + output name=output-1-xent input=output-xent.log-softmax +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +# Get values for $model_left_context, $model_right_context +. $dir/configs/vars + +left_context=$model_left_context +right_context=$model_right_context +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=$(perl -e "print int($left_context + $frame_subsampling_factor / 2)") +egs_right_context=$(perl -e "print int($right_context + $frame_subsampling_factor / 2)") +egs_left_context_initial=$(perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)") +egs_right_context_final=$(perl -e "print int($right_context_final + $frame_subsampling_factor / 2)") + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set_perturbed} + frames_per_eg=$(cat $sup_chain_dir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $sup_ivector_dir \ + --generate-egs-scp true \ + data/${supervised_set_perturbed}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_frames_per_eg=150 # Using a frames-per-eg of 150 for unsupervised data + # was found to be better than allowing smaller chunks + # (160,140,110,80) like for supervised system +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices when + # creating numerator supervision +lattice_prune_beam=4.0 # beam for pruning the lattices prior to getting egs + # for unsupervised data +tolerance=1 # frame-tolerance for chain training + +unsup_lat_dir=${sup_chain_dir}/decode_${unsupervised_set_perturbed} +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set_perturbed} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
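# Roughly, how the options above shape the unsupervised examples generated below:
#   --lattice-prune-beam 4.0  prunes each decoded lattice before it becomes
#                             numerator supervision;
#   --lattice-lm-scale 0.5    keeps the lattice graph scores at half weight in
#                             that supervision, so likelier decoder paths count
#                             for more than unlikely ones;
#   --deriv-weights-scp ...   scales each frame's derivative by the best-path
#                             pdf posterior from weights.scp (a frame the
#                             decoder is only 60% sure about contributes 0.6 of
#                             its gradient);
#   --left/right-tolerance 1  allows the usual one-frame slack when the lattice
#                             supervision is split into fixed-size chunks.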
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh \ + --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --deriv-weights-scp $sup_chain_dir/best_path_${unsupervised_set_perturbed}/weights.scp \ + --online-ivector-dir $ivector_root_dir/ivectors_${unsupervised_set_perturbed}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set_perturbed}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/comb_egs +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + # This is to skip stages of den-fst creation, which was already done. + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $sup_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set_perturbed}_hires \ + --tree-dir $sup_tree_dir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + rm -f $dir/.error + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + if [ $num_jobs -gt $test_nj ]; then num_jobs=$test_nj; fi + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" ${decode_iter:+--iter $decode_iter} \ + --online-ivector-dir $ivector_root_dir/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || touch $dir/.error + ) & + done + wait; + if [ -f $dir/.error ]; then + echo "$0: Decoding failed. 
See $dir/decode${test_graph_affix}_*/log/*" + exit 1 + fi +fi + +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..e76df666e8a --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,247 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +set -e +set -o pipefail + +# This is fisher chain recipe for training a model on a subset of around +# 100-300 hours of supervised data. +# This system uses phone LM to model UNK. +# local/semisup/run_50k.sh and local/semisup/run_100k.sh show how to call this. + +# train_set train_sup15k train_sup50k train_sup +# ivector_train_set semisup15k_100k_250k semisup50k_100k_250k train_sup +# WER on dev 27.75 21.41 19.23 +# WER on test 27.24 21.03 19.01 +# Final train prob -0.0959 -0.1035 -0.1224 +# Final valid prob -0.1823 -0.1667 -0.1503 +# Final train prob (xent) -1.9246 -1.5926 -1.6454 +# Final valid prob (xent) -2.1873 -1.7990 -1.7107 + +# train_set semisup15k_100k_250k semisup50k_100k_250k semisup100k_250k +# ivector_train_set semisup15k_100k_250k semisup50k_100k_250k train_sup +# WER on dev 17.92 17.55 16.97 +# WER on test 17.95 17.72 17.03 +# Final output train prob -0.1145 -0.1155 -0.1196 +# Final output valid prob -0.1370 -0.1510 -0.1469 +# Final output train prob (xent) -1.7449 -1.7458 -1.5487 +# Final output valid prob (xent) -1.7785 -1.9045 -1.6360 + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +exp_root=exp/semisup_100k + +nj=30 +tdnn_affix=_1a +train_set=train_sup +ivector_train_set= # dataset for training i-vector extractor + +nnet3_affix= # affix for nnet3 dir -- relates to i-vector used +chain_affix= # affix for chain dir +tree_affix=bi_a +gmm=tri4a # Expect GMM model in $exp/$gmm for alignment + +# Neural network opts +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 + +remove_egs=false +common_egs_dir= # if provided, will skip egs generation +common_treedir= # if provided, will skip the tree building stage + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ -z "$common_treedir" ]; then + if [ $stage -le 12 ]; then + # Build a tree using our new topology. 
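# The --context-opts below give a left-biphone tree, and 7000 is the target
# number of leaves; the number actually allocated can be checked afterwards
# with the same pipeline stage 13 uses for num_targets:
#   tree-info $treedir/tree | grep num-pdfs | awk '{print $2}'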
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 + fi +else + treedir=$common_treedir +fi + +if [ $stage -le 13 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
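# Two notes on the training call below, as a sketch of how the options interact:
# --egs.opts "--generate-egs-scp true" makes get_egs write egs.scp indexes,
# which is what lets the semi-supervised recipes in this directory later
# re-combine these egs with unsupervised egs via
# steps/nnet3/chain/multilingual/combine_egs.sh; and the per-iteration learning
# rate is the current effective rate times the current number of jobs, so it
# ramps from about 3*0.001 down to 16*0.0001 as the job count grows from 3 to 16.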
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_poco_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 16 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp_root/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_poco_unk_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh new file mode 100755 index 00000000000..ad5d2b106b5 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh @@ -0,0 +1,453 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +# This script is semi-supervised recipe with around 50 hours of supervised data +# and 250 hours unsupervised data with naive splitting. +# Based on "Semi-Supervised Training of Acoustic Models using Lattice-Free MMI", +# Vimal Manohar, Hossein Hadian, Daniel Povey, Sanjeev Khudanpur, ICASSP 2018 +# http://www.danielpovey.com/files/2018_icassp_semisupervised_mmi.pdf +# local/semisup/run_50k.sh shows how to call this. + +# We use the combined data for i-vector extractor training. +# We use 4-gram LM trained on 1250 hours of data excluding the 250 hours +# unsupervised data to create LM for decoding. Rescoring is done with +# a larger 4-gram LM. +# This differs from the case in run_tdnn_100k_semisupervised.sh. + +# This script uses phone LM to model UNK. +# This script uses the same tree as that for the seed model. +# See the comments in the script about how to change these. 
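# For reference, the "_big" rescoring LM mentioned above is a const-arpa build
# of the larger 4-gram, created in local/semisup/run_50k.sh roughly as:
#   utils/build_const_arpa_lm.sh \
#     data/local/pocolm_ex250k/data/arpa/4gram_big.arpa.gz \
#     data/lang_test_poco_ex250k data/lang_test_poco_ex250k_big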
+ +# Unsupervised set: train_unsup100k_250k (250 hour subset of Fisher excluding 100 hours for supervised) +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Naive split lattices + +# Supervised training results train_sup15k train_sup50k +# WER on dev 27.75 21.41 +# WER on test 27.24 21.03 +# Final output train prob -0.0959 -0.1035 +# Final output valid prob -0.1823 -0.1667 +# Final output train prob (xent) -1.9246 -1.5926 +# Final output valid prob (xent) -2.1873 -1.7990 + +# output-0 and output-1 are for superivsed and unsupervised data respectively. + +# Semi-supervised training train_sup15k train_sup50k +# WER on dev 21.31 18.98 +# WER on test 21.00 18.85 +# Final output-0 train prob -0.1577 -0.1381 +# Final output-0 valid prob -0.1761 -0.1723 +# Final output-0 train prob (xent) -1.4744 -1.3676 +# Final output-0 valid prob (xent) -1.5293 -1.4589 +# Final output-1 train prob -0.7305 -0.7671 +# Final output-1 valid prob -0.7319 -0.7714 +# Final output-1 train prob (xent) -1.1681 -1.1480 +# Final output-1 valid prob (xent) -1.2871 -1.2382 + +set -u -e -o pipefail + +stage=0 # Start from -1 for supervised seed system training +train_stage=-100 +nj=80 +test_nj=50 + +# The following 3 options decide the output directory for semi-supervised +# chain system +# dir=${exp_root}/chain${chain_affix}/tdnn${tdnn_affix} + +exp_root=exp/semisup_50k +chain_affix=_semi50k_100k_250k # affix for chain dir + # 50 hour subset out of 100 hours of supervised data + # 250 hour subset out of (1500-100=1400) hours of unsupervised data +tdnn_affix=_semisup_1a + +# Datasets -- Expects data/$supervised_set and data/$unsupervised_set to be +# present +supervised_set=train_sup50k +unsupervised_set=train_unsup100k_250k + +# Input seed system +sup_chain_dir=exp/semisup_50k/chain_semi50k_100k_250k/tdnn_1a_sp # supervised chain system +sup_lat_dir=exp/semisup_50k/chain_semi50k_100k_250k/tri4a_train_sup50k_unk_lats # lattices for supervised set +sup_tree_dir=exp/semisup_50k/chain_semi50k_100k_250k/tree_bi_a # tree directory for supervised chain system +ivector_root_dir=exp/semisup_50k/nnet3_semi50k_100k_250k # i-vector extractor root directory + +# Semi-supervised options +supervision_weights=1.0,1.0 # Weights for supervised, unsupervised data egs. + # Can be used to scale down the effect of unsupervised data + # by using a smaller scale for it e.g. 1.0,0.3 +lm_weights=3,2 # Weights on phone counts from supervised, unsupervised data for denominator FST creation + +sup_egs_dir= # Supply this to skip supervised egs creation +unsup_egs_dir= # Supply this to skip unsupervised egs creation +unsup_egs_opts= # Extra options to pass to unsupervised egs creation + +# Neural network opts +xent_regularize=0.1 + +decode_iter= # Iteration to decode with + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +# The following can be replaced with the versions that do not model +# UNK using phone LM. $sup_lat_dir should also ideally be changed. +unsup_decode_lang=data/lang_test_poco_ex250k_unk +unsup_decode_graph_affix=_poco_ex250k_unk +test_lang=data/lang_test_poco_unk +test_graph_affix=_poco_unk + +unsup_rescore_lang=${unsup_decode_lang}_big + +dir=$exp_root/chain${chain_affix}/tdnn${tdnn_affix} + +if ! 
cuda-compiled; then + cat < \ +# $sup_chain_dir/best_path_${unsupervised_set_perturbed}_big/frame_subsampling_factor +# +# # This should be 1 if using a different source for supervised data alignments. +# # However alignments in seed tree directory have already been sub-sampled. +# echo $frame_subsampling_factor > \ +# $sup_tree_dir/frame_subsampling_factor +# +# # Build a new tree using stats from both supervised and unsupervised data +# steps/nnet3/chain/build_tree_multiple_sources.sh \ +# --use-fmllr false --context-opts "--context-width=2 --central-position=1" \ +# --frame-subsampling-factor $frame_subsampling_factor \ +# 7000 $lang \ +# data/${supervised_set_perturbed} \ +# ${sup_tree_dir} \ +# data/${unsupervised_set_perturbed} \ +# $chaindir/best_path_${unsupervised_set_perturbed} \ +# $treedir || exit 1 +# fi +# +# sup_tree_dir=$treedir # Use the new tree dir for further steps + +# Train denominator FST using phone alignments from +# supervised and unsupervised data +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${sup_tree_dir} ${sup_chain_dir}/best_path_${unsupervised_set_perturbed}_big \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=725 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=725 + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=725 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=725 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=725 + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=725 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=725 target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=725 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + # We use separate outputs for supervised and unsupervised data + # so we can properly track the train and valid objectives. 
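  # (The output-0/output-1 nodes below, and their -xent counterparts, add no
  # new parameters: they re-expose the existing 'output' and 'output-xent'
  # layers under the names the combined supervised/unsupervised egs refer to.)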
+ + output name=output-0 input=output.affine + output name=output-1 input=output.affine + + output name=output-0-xent input=output-xent.log-softmax + output name=output-1-xent input=output-xent.log-softmax +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +# Get values for $model_left_context, $model_right_context +. $dir/configs/vars + +left_context=$model_left_context +right_context=$model_right_context +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=$(perl -e "print int($left_context + $frame_subsampling_factor / 2)") +egs_right_context=$(perl -e "print int($right_context + $frame_subsampling_factor / 2)") +egs_left_context_initial=$(perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)") +egs_right_context_final=$(perl -e "print int($right_context_final + $frame_subsampling_factor / 2)") + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set_perturbed} + frames_per_eg=$(cat $sup_chain_dir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $sup_ivector_dir \ + --generate-egs-scp true \ + data/${supervised_set_perturbed}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_frames_per_eg=150 # Using a frames-per-eg of 150 for unsupervised data + # was found to be better than allowing smaller chunks + # (160,140,110,80) like for supervised system +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices when + # creating numerator supervision +lattice_prune_beam=4.0 # beam for pruning the lattices prior to getting egs + # for unsupervised data +tolerance=1 # frame-tolerance for chain training + +unsup_lat_dir=${sup_chain_dir}/decode_${unsupervised_set_perturbed}_big +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set_perturbed} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
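# The supervised and unsupervised egs are later mixed by
# steps/nnet3/chain/multilingual/combine_egs.sh (stage 14 below), which treats
# the two sets like two languages: --lang2weight $supervision_weights attaches
# a per-set weight to the examples, so setting supervision_weights=1.0,0.3, for
# instance, would scale down every unsupervised example's contribution to the
# gradient to 0.3 while leaving the supervised examples at full weight.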
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh \ + --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --deriv-weights-scp $sup_chain_dir/best_path_${unsupervised_set_perturbed}_big/weights.scp \ + --online-ivector-dir $ivector_root_dir/ivectors_${unsupervised_set_perturbed}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set_perturbed}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/comb_egs +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + # This is to skip stages of den-fst creation, which was already done. + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $sup_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set_perturbed}_hires \ + --tree-dir $sup_tree_dir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + rm -f $dir/.error + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + if [ $num_jobs -gt $test_nj ]; then num_jobs=$test_nj; fi + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" ${decode_iter:+--iter $decode_iter} \ + --online-ivector-dir $ivector_root_dir/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || touch $dir/.error + ) & + done + wait; + if [ -f $dir/.error ]; then + echo "$0: Decoding failed. 
See $dir/decode${test_graph_affix}_*/log/*" + exit 1 + fi +fi + +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/run_100k.sh b/egs/fisher_english/s5/local/semisup/run_100k.sh new file mode 100644 index 00000000000..7657e94b7f2 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/run_100k.sh @@ -0,0 +1,219 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +# This script demonstrates semi-supervised training using 100 hours of +# supervised data and 250 hours of unsupervised data. +# We assume the supervised data is in data/train_sup and unsupervised data +# is in data/train_unsup100k_250k. +# For LM training, we only use the supervised set corresponding to 100 hours as +# opposed to the case in run_50k.sh, where we included part of the +# transcripts in data/train/text. +# This uses only 100 hours supervised set for i-vector extractor training, +# which is different from run_50k.sh, which uses combined supervised + +# unsupervised set. + +. ./cmd.sh +. ./path.sh + +set -o pipefail +exp_root=exp/semisup_100k + +stage=0 + +. utils/parse_options.sh + +for f in data/train_sup/utt2spk data/train_unsup100k_250k/utt2spk \ + data/train_sup/text; do + if [ ! -f $f ]; then + echo "$0: Could not find $f" + exit 1 + fi +done + +############################################################################### +# Prepare the 100 hours supervised set and subsets for initial GMM training +############################################################################### + +if [ $stage -le 0 ]; then + utils/subset_data_dir.sh --shortest data/train_sup 100000 data/train_sup_100kshort + utils/subset_data_dir.sh data/train_sup_100kshort 10000 data/train_sup_10k + utils/data/remove_dup_utts.sh 100 data/train_sup_10k data/train_sup_10k_nodup + utils/subset_data_dir.sh --speakers data/train_sup 30000 data/train_sup_30k +fi + +############################################################################### +# GMM system training using 100 hours supervised data +############################################################################### + +if [ $stage -le 1 ]; then + steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_sup_10k_nodup data/lang $exp_root/mono0a || exit 1 +fi + +if [ $stage -le 2 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup_30k data/lang $exp_root/mono0a $exp_root/mono0a_ali || exit 1 + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train_sup_30k data/lang $exp_root/mono0a_ali $exp_root/tri1 || exit 1 + + (utils/mkgraph.sh data/lang_test $exp_root/tri1 $exp_root/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri1/graph data/dev $exp_root/tri1/decode_dev)& +fi + +if [ $stage -le 3 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup_30k data/lang $exp_root/tri1 $exp_root/tri1_ali || exit 1; + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 20000 data/train_sup_30k data/lang $exp_root/tri1_ali $exp_root/tri2 || exit 1; + + (utils/mkgraph.sh data/lang_test $exp_root/tri2 $exp_root/tri2/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri2/graph data/dev $exp_root/tri2/decode_dev)& +fi + +if [ $stage -le 4 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup data/lang $exp_root/tri2 $exp_root/tri2_ali || exit 1; + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + 5000 40000 data/train_sup data/lang $exp_root/tri2_ali 
$exp_root/tri3a || exit 1; + + ( + utils/mkgraph.sh data/lang_test $exp_root/tri3a $exp_root/tri3a/graph || exit 1; + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri3a/graph data/dev $exp_root/tri3a/decode_dev || exit 1; + )& +fi + +if [ $stage -le 5 ]; then + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup data/lang $exp_root/tri3a $exp_root/tri3a_ali || exit 1; + + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/train_sup data/lang $exp_root/tri3a_ali $exp_root/tri4a || exit 1; + + ( + utils/mkgraph.sh data/lang_test $exp_root/tri4a $exp_root/tri4a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri4a/graph data/dev $exp_root/tri4a/decode_dev + )& +fi + +############################################################################### +# Prepare semi-supervised train set +############################################################################### + +if [ $stage -le 6 ]; then + utils/combine_data.sh data/semisup100k_250k \ + data/train_sup data/train_unsup100k_250k || exit 1 +fi + +############################################################################### +# Train LM on the supervised set +############################################################################### + +if [ $stage -le 7 ]; then + if [ ! -f data/lang_test_poco_sup100k/G.fst ]; then + local/fisher_train_lms_pocolm.sh \ + --text data/train_sup/text \ + --dir data/local/lm_sup100k + + local/fisher_create_test_lang.sh \ + --arpa-lm data/local/pocolm_sup100k/data/arpa/4gram_small.arpa.gz \ + --dir data/lang_test_poco_sup100k + fi +fi + +############################################################################### +# Prepare lang directories with UNK modeled using phone LM +############################################################################### + +if [ $stage -le 8 ]; then + local/run_unk_model.sh || exit 1 + + for lang_dir in data/lang_test_poco_sup100k; do + rm -r ${lang_dir}_unk 2>/dev/null || true + cp -rT data/lang_unk ${lang_dir}_unk + cp ${lang_dir}/G.fst ${lang_dir}_unk/G.fst + done +fi + +############################################################################### +# Train seed chain system using 100 hours supervised data. +# Here we train i-vector extractor on only the supervised set. +############################################################################### + +if [ $stage -le 9 ]; then + local/semisup/chain/run_tdnn.sh \ + --train-set train_sup \ + --ivector-train-set "" \ + --nnet3-affix "" --chain-affix "" \ + --tdnn-affix _1a --tree-affix bi_a \ + --gmm tri4a --exp-root $exp_root || exit 1 + + # WER on dev 19.23 + # WER on test 19.01 + # Final train prob -0.1224 + # Final valid prob -0.1503 + # Final train prob (xent) -1.6454 + # Final valid prob (xent) -1.7107 +fi + +############################################################################### +# Semi-supervised training using 100 hours supervised data and +# 250 hours unsupervised data. We use i-vector extractor, tree, lattices +# and seed chain system from the previous stage. 
+############################################################################### + +if [ $stage -le 10 ]; then + local/semisup/chain/run_tdnn_100k_semisupervised.sh \ + --supervised-set train_sup \ + --unsupervised-set train_unsup100k_250k \ + --sup-chain-dir $exp_root/chain/tdnn_1a_sp \ + --sup-lat-dir $exp_root/chain/tri4a_train_sup_unk_lats \ + --sup-tree-dir $exp_root/chain/tree_bi_a \ + --ivector-root-dir $exp_root/nnet3 \ + --chain-affix "" \ + --tdnn-affix _semisup_1a \ + --exp-root $exp_root || exit 1 + + # WER on dev 18.70 + # WER on test 18.18 + # Final output-0 train prob -0.1345 + # Final output-0 valid prob -0.1547 + # Final output-0 train prob (xent) -1.3683 + # Final output-0 valid prob (xent) -1.4077 + # Final output-1 train prob -0.6856 + # Final output-1 valid prob -0.6815 + # Final output-1 train prob (xent) -1.1224 + # Final output-1 valid prob (xent) -1.2218 +fi + +############################################################################### +# Oracle system trained on combined 350 hours including both supervised and +# unsupervised sets. We use i-vector extractor, tree, and GMM trained +# on only the supervised for fair comparison to semi-supervised experiments. +############################################################################### + +if [ $stage -le 11 ]; then + local/semisup/chain/run_tdnn.sh \ + --train-set semisup100k_250k \ + --nnet3-affix "" --chain-affix "" \ + --common-treedir $exp_root/chain/tree_bi_a \ + --tdnn-affix 1a_oracle --nj 100 \ + --gmm tri4a --exp $exp_root \ + --stage 9 || exit 1 + + # WER on dev 16.97 + # WER on test 17.03 + # Final output train prob -0.1196 + # Final output valid prob -0.1469 + # Final output train prob (xent) -1.5487 + # Final output valid prob (xent) -1.6360 +fi diff --git a/egs/fisher_english/s5/local/semisup/run_50k.sh b/egs/fisher_english/s5/local/semisup/run_50k.sh new file mode 100644 index 00000000000..c2a5c0db7e7 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/run_50k.sh @@ -0,0 +1,229 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +# This script demonstrates semi-supervised training using 50 hours of +# supervised data and 250 hours of unsupervised data. +# We assume the supervised data is in data/train_sup and unsupervised data +# is in data/train_unsup100k_250k. +# For LM training, we assume there is data/train/text, from which +# we will exclude the utterances contained in the unsupervised set. +# We use all 300 hours of semi-supervised data for i-vector extractor training. + +# This differs from run_100k.sh, which uses only 100 hours supervised data for +# both i-vector extractor training and LM training. + +. ./cmd.sh +. ./path.sh + +set -o pipefail +exp_root=exp/semisup_50k + +stage=0 + +. utils/parse_options.sh + +for f in data/train_sup/utt2spk data/train_unsup100k_250k/utt2spk \ + data/train/text; do + if [ ! 
-f $f ]; then + echo "$0: Could not find $f" + exit 1 + fi +done + +############################################################################### +# Prepare the 50 hours supervised set and subsets for initial GMM training +############################################################################### + +if [ $stage -le 0 ]; then + utils/subset_data_dir.sh --speakers data/train_sup 50000 data/train_sup50k || exit 1 + utils/subset_data_dir.sh --shortest data/train_sup50k 25000 data/train_sup50k_short || exit 1 + utils/subset_data_dir.sh --speakers data/train_sup50k 30000 data/train_sup50k_30k || exit 1; +fi + +############################################################################### +# GMM system training using 50 hours supervised data +############################################################################### + +if [ $stage -le 1 ]; then + steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_sup50k_short data/lang $exp_root/mono0a || exit 1 +fi + +if [ $stage -le 2 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k_30k data/lang $exp_root/mono0a $exp_root/mono0a_ali || exit 1 + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train_sup50k_30k data/lang $exp_root/mono0a_ali $exp_root/tri1 || exit 1 + + (utils/mkgraph.sh data/lang_test $exp_root/tri1 $exp_root/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri1/graph data/dev $exp_root/tri1/decode_dev)& +fi + +if [ $stage -le 3 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k_30k data/lang $exp_root/tri1 $exp_root/tri1_ali || exit 1; + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train_sup50k_30k data/lang $exp_root/tri1_ali $exp_root/tri2 || exit 1 + + (utils/mkgraph.sh data/lang_test $exp_root/tri2 $exp_root/tri2/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri2/graph data/dev $exp_root/tri2/decode_dev)& +fi + +if [ $stage -le 4 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k data/lang $exp_root/tri2 $exp_root/tri2_ali || exit 1; + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 30000 data/train_sup50k data/lang $exp_root/tri2_ali $exp_root/tri3a || exit 1; + + (utils/mkgraph.sh data/lang_test $exp_root/tri3a $exp_root/tri3a/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri3a/graph data/dev $exp_root/tri3a/decode_dev)& +fi + +if [ $stage -le 5 ]; then + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k data/lang $exp_root/tri3a $exp_root/tri3a_ali || exit 1; + + steps/train_sat.sh --cmd "$train_cmd" \ + 4000 50000 data/train_sup50k data/lang $exp_root/tri3a_ali $exp_root/tri4a || exit 1; + + ( + utils/mkgraph.sh data/lang_test $exp_root/tri4a $exp_root/tri4a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri4a/graph data/dev $exp_root/tri4a/decode_dev + )& +fi + +############################################################################### +# Prepare semi-supervised train set +############################################################################### + +if [ $stage -le 6 ]; then + utils/combine_data.sh data/semisup50k_100k_250k \ + data/train_sup50k data/train_unsup100k_250k || exit 1 +fi + +############################################################################### +# Train LM on all the text in data/train/text, but excluding the +# utterances in the unsupervised set 
+############################################################################### + +if [ $stage -le 7 ]; then + mkdir -p data/local/pocolm_ex250k + + utils/filter_scp.pl --exclude data/train_unsup100k_250k/utt2spk \ + data/train/text > data/local/pocolm_ex250k/text.tmp + + if [ ! -f data/lang_test_poco_ex250k_big/G.carpa ]; then + local/fisher_train_lms_pocolm.sh \ + --text data/local/pocolm_ex250k/text.tmp \ + --dir data/local/pocolm_ex250k + + local/fisher_create_test_lang.sh \ + --arpa-lm data/local/pocolm_ex250k/data/arpa/4gram_small.arpa.gz \ + --dir data/lang_test_poco_ex250k + + utils/build_const_arpa_lm.sh \ + data/local/pocolm_ex250k/data/arpa/4gram_big.arpa.gz \ + data/lang_test_poco_ex250k data/lang_test_poco_ex250k_big + fi +fi + +############################################################################### +# Prepare lang directories with UNK modeled using phone LM +############################################################################### + +if [ $stage -le 8 ]; then + local/run_unk_model.sh || exit 1 + + for lang_dir in data/lang_test_poco_ex250k; do + rm -r ${lang_dir}_unk ${lang_dir}_unk_big 2>/dev/null || true + cp -rT data/lang_unk ${lang_dir}_unk + cp ${lang_dir}/G.fst ${lang_dir}_unk/G.fst + cp -rT data/lang_unk ${lang_dir}_unk_big + cp ${lang_dir}_big/G.carpa ${lang_dir}_unk_big/G.carpa; + done +fi + +############################################################################### +# Train seed chain system using 50 hours supervised data. +# Here we train i-vector extractor on combined supervised and unsupervised data +############################################################################### + +if [ $stage -le 9 ]; then + local/semisup/chain/run_tdnn.sh \ + --train-set train_sup50k \ + --ivector-train-set semisup50k_100k_250k \ + --nnet3-affix _semi50k_100k_250k \ + --chain-affix _semi50k_100k_250k \ + --tdnn-affix _1a --tree-affix bi_a \ + --gmm tri4a --exp-root $exp_root || exit 1 + + # WER on dev 21.41 + # WER on test 21.03 + # Final train prob -0.1035 + # Final valid prob -0.1667 + # Final train prob (xent) -1.5926 + # Final valid prob (xent) -1.7990 +fi + +############################################################################### +# Semi-supervised training using 50 hours supervised data and +# 250 hours unsupervised data. We use i-vector extractor, tree, lattices +# and seed chain system from the previous stage. 
+############################################################################### + +if [ $stage -le 10 ]; then + local/semisup/chain/run_tdnn_50k_semisupervised.sh \ + --supervised-set train_sup50k \ + --unsupervised-set train_unsup100k_250k \ + --sup-chain-dir $exp_root/chain_semi50k_100k_250k/tdnn_1a_sp \ + --sup-lat-dir $exp_root/chain_semi50k_100k_250k/tri4a_train_sup50k_sp_unk_lats \ + --sup-tree-dir $exp_root/chain_semi50k_100k_250k/tree_bi_a \ + --ivector-root-dir $exp_root/nnet3_semi50k_100k_250k \ + --chain-affix _semi50k_100k_250k \ + --tdnn-affix _semisup_1a \ + --exp-root $exp_root || exit 1 + + # WER on dev 18.98 + # WER on test 18.85 + # Final output-0 train prob -0.1381 + # Final output-0 valid prob -0.1723 + # Final output-0 train prob (xent) -1.3676 + # Final output-0 valid prob (xent) -1.4589 + # Final output-1 train prob -0.7671 + # Final output-1 valid prob -0.7714 + # Final output-1 train prob (xent) -1.1480 + # Final output-1 valid prob (xent) -1.2382 +fi + +############################################################################### +# Oracle system trained on combined 300 hours including both supervised and +# unsupervised sets. We use i-vector extractor, tree, and GMM trained +# on only the supervised for fair comparison to semi-supervised experiments. +############################################################################### + +if [ $stage -le 11 ]; then + local/semisup/chain/run_tdnn.sh \ + --train-set semisup50k_100k_250k \ + --nnet3-affix _semi50k_100k_250k \ + --chain-affix _semi50k_100k_250k \ + --common-treedir $exp_root/chain_semi50k_100k_250k/tree_bi_a \ + --tdnn-affix 1a_oracle --nj 100 \ + --gmm tri4a --exp-root $exp_root \ + --stage 9 || exit 1 + + # WER on dev 17.55 + # WER on test 17.72 + # Final output train prob -0.1155 + # Final output valid prob -0.1510 + # Final output train prob (xent) -1.7458 + # Final output valid prob (xent) -1.9045 +fi diff --git a/egs/fisher_english/s5/local/wer_output_filter b/egs/fisher_english/s5/local/wer_output_filter new file mode 100755 index 00000000000..2514c385038 --- /dev/null +++ b/egs/fisher_english/s5/local/wer_output_filter @@ -0,0 +1,16 @@ +#!/usr/bin/perl + +@filter_words = ('[NOISE]', '[LAUGHTER]', '[VOCALIZED-NOISE]', '', '%HESITATION'); +foreach $w (@filter_words) { + $bad{$w} = 1; $w = lc $w; $bad{$w} = 1; +} +while() { + @A = split(" ", $_); + $id = shift @A; + print "$id "; + + foreach $a (@A) { + if (!defined $bad{$a}) { print "$a "; } + } + print "\n"; +} diff --git a/egs/fisher_english/s5/run.sh b/egs/fisher_english/s5/run.sh index 77e1ea0870d..67c0d5ce638 100755 --- a/egs/fisher_english/s5/run.sh +++ b/egs/fisher_english/s5/run.sh @@ -181,3 +181,32 @@ steps/train_sat.sh --cmd "$train_cmd" \ # # local/run_nnet2.sh # +# This prepares lang directory with UNK modeled by a phone LM +# local/run_unk_model.sh + +# These are semi-supervised training recipes using 50 hrs and 100 hrs +# of supervised data respectively with 250 hrs of unsupervised data. +# run_50k.sh uses i-vector extractor trained on 300 hrs of combined data, +# while run_100.sh uses i-vector extractor trained on 100 hrs of supervised data. +# run_50k.sh uses 4-gram LM trained on 1250 hrs transcripts, +# while run_100k.sh uses 3-gram LM trained on 100 hrs transcripts. 
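# Approximate layout of the data directories these recipes expect; the subset
# commands below create the supervised/unsupervised splits, and the combined
# sets are built inside local/semisup/run_{50k,100k}.sh:
#   data/train_sup             100 hr  supervised subset (100k utts of data/train)
#   data/train_sup50k           50 hr  supervised subset of data/train_sup
#   data/train_unsup100k_250k  250 hr  unsupervised subset, disjoint from train_sup
#   data/semisup50k_100k_250k  300 hr  train_sup50k + train_unsup100k_250k
#   data/semisup100k_250k      350 hr  train_sup + train_unsup100k_250k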
+ +# local/fisher_train_lms_pocolm.sh +# local/fisher_create_test_lang.sh --arpa-lm data/local/pocolm/data/arpa/4gram_small.arpa.gz --dir data/lang_test_poco +# utils/build_const_arpa_lm.sh data/local/pocolm/data/arpa/4gram_big.arpa.gz data/lang_test_poco data/lang_test_poco_big + +# for lang_dir in data/lang_test_poco; do +# rm -r ${lang_dir}_unk ${lang_dir}_unk_big 2>/dev/null || true +# cp -rT data/lang_unk ${lang_dir}_unk +# cp ${lang_dir}/G.fst ${lang_dir}_unk/G.fst +# cp -rT data/lang_unk ${lang_dir}_unk_big +# cp ${lang_dir}_big/G.carpa ${lang_dir}_unk_big/G.carpa; +# done + +# Create supervised and unsupervised data subsets +# utils/subset_data_dir.sh --speakers data/train 100000 data/train_sup +# utils/subset_data_dir.sh --spk-list <(utils/filter_scp.pl --exclude data/train_sup/spk2utt data/train/spk2utt) data/train data/train_unsup100k +# utils/subset_data_dir.sh --speakers data/train_unsup100k 250000 data/train_unsup100k_250k + +# local/semisup/run_50k.sh +# local/semisup/run_100k.sh diff --git a/egs/wsj/s5/steps/best_path_weights.sh b/egs/wsj/s5/steps/best_path_weights.sh new file mode 100755 index 00000000000..d34d574173f --- /dev/null +++ b/egs/wsj/s5/steps/best_path_weights.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +# Copyright 2014-17 Vimal Manohar + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# This script gets from the lattice the best path alignments and frame-level +# posteriors of the pdfs in the best path alignment. +# The output directory has the format of an alignment directory. +# It can optionally read alignments from a directory, in which case, +# the script gets frame-level posteriors of the pdf corresponding to those +# alignments. +# The frame-level posteriors in the form of kaldi vectors and are +# output in weights.scp. + +set -e + +# begin configuration section. +cmd=run.pl +stage=-10 +acwt=0.1 +#end configuration section. + +if [ -f ./path.sh ]; then . ./path.sh; fi +. utils/parse_options.sh || exit 1; + +if [ $# -ne 3 ] && [ $# -ne 4 ]; then + cat < [] + E.g. $0 data/train_unt.seg exp/tri1/decode exp/tri1/best_path + Options: + --cmd (run.pl|queue.pl...) # specify how to run the sub-processes. +EOF + + exit 1; +fi + +data=$1 +decode_dir=$2 +dir=${@: -1} # last argument to the script + +ali_dir=$dir +if [ $# -eq 4 ]; then + ali_dir=$3 +fi + +mkdir -p $dir + +nj=$(cat $decode_dir/num_jobs) +echo $nj > $dir/num_jobs + +if [ $stage -le 1 ]; then + mkdir -p $dir/log + $cmd JOB=1:$nj $dir/log/best_path.JOB.log \ + lattice-best-path --acoustic-scale=$acwt \ + "ark,s,cs:gunzip -c $decode_dir/lat.JOB.gz |" \ + ark:/dev/null "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 +fi + +# Find where the final.mdl is. 
+if [ -f $(dirname $decode_dir)/final.mdl ]; then + src_dir=$(dirname $decode_dir) +else + src_dir=$decode_dir +fi + +cp $src_dir/cmvn_opts $dir/ || exit 1 +for f in final.mat splice_opts frame_subsampling_factor; do + if [ -f $src_dir/$f ]; then cp $src_dir/$f $dir; fi +done + +# make $dir an absolute pathname. +fdir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}) + +model=$src_dir/final.mdl +tree=$src_dir/tree + +for f in $model $decode_dir/lat.1.gz $tree; do + if [ ! -f $f ]; then echo "$0: expecting file $f to exist" && exit 1; fi +done + +cp $model $tree $dir || exit 1 + +ali_nj=$(cat $ali_dir/num_jobs) || exit 1 +if [ $nj -ne $ali_nj ]; then + echo "$0: $decode_dir and $ali_dir have different number of jobs. Redo alignment with $nj jobs." + exit 1 +fi + +if [ $stage -lt 2 ]; then + $cmd JOB=1:$nj $dir/log/get_post.JOB.log \ + lattice-to-post --acoustic-scale=$acwt \ + "ark,s,cs:gunzip -c $decode_dir/lat.JOB.gz|" ark:- \| \ + post-to-pdf-post $model ark,s,cs:- ark:- \| \ + get-post-on-ali ark,s,cs:- \ + "ark,s,cs:gunzip -c $ali_dir/ali.JOB.gz | convert-ali $dir/final.mdl $model $tree ark,s,cs:- ark:- | ali-to-pdf $model ark,s,cs:- ark:- |" \ + "ark,scp:$fdir/weights.JOB.ark,$fdir/weights.JOB.scp" || exit 1 +fi + +for n in `seq $nj`; do + cat $dir/weights.$n.scp +done > $dir/weights.scp + +rm $dir/weights.*.scp + +exit 0 diff --git a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py index afa75eb0296..63b1c12c759 100755 --- a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py @@ -515,7 +515,7 @@ def generate_acc_logprob_report(exp_dir, key="accuracy", output="output"): except: tb = traceback.format_exc() logger.warning("Error getting info from logs, exception was: " + tb) - times = [] + times = {} report = [] report.append("%Iter\tduration\ttrain_objective\tvalid_objective\tdifference") @@ -532,7 +532,7 @@ def generate_acc_logprob_report(exp_dir, key="accuracy", output="output"): try: report.append("%d\t%s\t%g\t%g\t%g" % (x[0], str(times[x[0]]), x[1], x[2], x[2]-x[1])) - except KeyError: + except KeyError, IndexError: continue total_time = 0 diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 5ae7aecd36c..854a37a52b7 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -129,7 +129,8 @@ def train_new_models(dir, iter, srand, num_jobs, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch_str, frame_subsampling_factor, run_opts, train_opts, - backstitch_training_scale=0.0, backstitch_training_interval=1): + backstitch_training_scale=0.0, backstitch_training_interval=1, + use_multitask_egs=False): """ Called from train_one_iteration(), this method trains new models with 'num_jobs' jobs, and @@ -140,6 +141,12 @@ def train_new_models(dir, iter, srand, num_jobs, to use for each job is a little complex, so we spawn each one separately. this is no longer true for RNNs as we use do not use the --frame option but we use the same script for consistency with FF-DNN code + + use_multitask_egs : True, if different examples used to train multiple + tasks or outputs, e.g.multilingual training. + multilingual egs can be generated using get_egs.sh and + steps/nnet3/multilingual/allocate_multilingual_examples.py, + those are the top-level scripts. 
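        As a rough illustration only (this helper is hypothetical, not code
        from this function): switching use_multitask_egs on amounts to reading
        the scp files written by those scripts instead of plain .ark archives,

            def egs_rspecifier(egs_dir, archive_index, use_multitask_egs):
                # scp-based egs for multitask/multilingual training,
                # plain archives otherwise (mirrors the logic further down).
                scp_or_ark = "scp" if use_multitask_egs else "ark"
                return "{0}:{1}/cegs.{2}.{0}".format(
                    scp_or_ark, egs_dir, archive_index)

        together with the extra options that
        common_train_lib.get_multitask_egs_opts() adds to the copy-egs command.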
""" deriv_time_opts = [] @@ -167,6 +174,12 @@ def train_new_models(dir, iter, srand, num_jobs, frame_shift = ((archive_index + k/num_archives) % frame_subsampling_factor) + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="cegs.", + archive_index=archive_index, + use_multitask_egs=use_multitask_egs) + scp_or_ark = "scp" if use_multitask_egs else "ark" cache_io_opts = (("--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) if iter > 0 else "") + @@ -187,9 +200,9 @@ def train_new_models(dir, iter, srand, num_jobs, --l2-regularize-factor={l2_regularize_factor} {train_opts} \ --srand={srand} \ "{raw_model}" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs \ + "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} \ --frame-shift={fr_shft} \ - ark:{egs_dir}/cegs.{archive_index}.ark ark:- | \ + {scp_or_ark}:{egs_dir}/cegs.{archive_index}.{scp_or_ark} ark:- | \ nnet3-chain-shuffle-egs --buffer-size={buf_size} \ --srand={srand} ark:- ark:- | nnet3-chain-merge-egs \ --minibatch-size={num_chunk_per_mb} ark:- ark:- |" \ @@ -213,17 +226,17 @@ def train_new_models(dir, iter, srand, num_jobs, raw_model=raw_model_string, egs_dir=egs_dir, archive_index=archive_index, buf_size=shuffle_buffer_size, - num_chunk_per_mb=num_chunk_per_minibatch_str), + num_chunk_per_mb=num_chunk_per_minibatch_str, + multitask_egs_opts=multitask_egs_opts, + scp_or_ark=scp_or_ark), require_zero_status=True) threads.append(thread) - for thread in threads: thread.join() - def train_one_iteration(dir, iter, srand, egs_dir, num_jobs, num_archives_processed, num_archives, learning_rate, shrinkage_value, @@ -235,7 +248,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, momentum, max_param_change, shuffle_buffer_size, frame_subsampling_factor, run_opts, dropout_edit_string="", train_opts="", - backstitch_training_scale=0.0, backstitch_training_interval=1): + backstitch_training_scale=0.0, backstitch_training_interval=1, + use_multitask_egs=False): """ Called from steps/nnet3/chain/train.py for one iteration for neural network training with LF-MMI objective @@ -265,7 +279,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, compute_train_cv_probabilities( dir=dir, iter=iter, egs_dir=egs_dir, l2_regularize=l2_regularize, xent_regularize=xent_regularize, - leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts) + leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts, + use_multitask_egs=use_multitask_egs) if iter > 0: # Runs in the background @@ -312,7 +327,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, # first few iterations (hard-coded as 15) backstitch_training_scale=(backstitch_training_scale * iter / 15 if iter < 15 else backstitch_training_scale), - backstitch_training_interval=backstitch_training_interval) + backstitch_training_interval=backstitch_training_interval, + use_multitask_egs=use_multitask_egs) [models_to_average, best_model] = common_train_lib.get_successful_models( num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter)) @@ -352,11 +368,13 @@ def train_one_iteration(dir, iter, srand, egs_dir, os.remove("{0}/cache.{1}".format(dir, iter)) -def check_for_required_files(feat_dir, tree_dir, lat_dir): +def check_for_required_files(feat_dir, tree_dir, lat_dir=None): files = ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), - '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir), + '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir)] + if lat_dir is not None: + files += [ '{0}/lat.1.gz'.format(lat_dir), 
'{0}/final.mdl'.format(lat_dir), - '{0}/num_jobs'.format(lat_dir), '{0}/splice_opts'.format(lat_dir)] + '{0}/num_jobs'.format(lat_dir)] for file in files: if not os.path.isfile(file): raise Exception('Expected {0} to exist.'.format(file)) @@ -364,7 +382,7 @@ def check_for_required_files(feat_dir, tree_dir, lat_dir): def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, max_lda_jobs=None, rand_prune=4.0, - lda_opts=None): + lda_opts=None, use_multitask_egs=False): """ Function to estimate and write LDA matrix from cegs This function is exactly similar to the version in module @@ -374,17 +392,28 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, if max_lda_jobs is not None: if num_lda_jobs > max_lda_jobs: num_lda_jobs = max_lda_jobs + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="cegs.", + archive_index="JOB", + use_multitask_egs=use_multitask_egs) + scp_or_ark = "scp" if use_multitask_egs else "ark" + egs_rspecifier = ( + "ark:nnet3-chain-copy-egs {multitask_egs_opts} " + "{scp_or_ark}:{egs_dir}/cegs.JOB.{scp_or_ark} ark:- |" + "".format(egs_dir=egs_dir, scp_or_ark=scp_or_ark, + multitask_egs_opts=multitask_egs_opts)) # Write stats with the same format as stats for LDA. common_lib.execute_command( """{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ nnet3-chain-acc-lda-stats --rand-prune={rand_prune} \ - {dir}/init.raw "ark:{egs_dir}/cegs.JOB.ark" \ + {dir}/init.raw "{egs_rspecifier}" \ {dir}/JOB.lda_stats""".format( command=run_opts.command, num_lda_jobs=num_lda_jobs, dir=dir, - egs_dir=egs_dir, + egs_rspecifier=egs_rspecifier, rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats @@ -445,32 +474,50 @@ def prepare_initial_acoustic_model(dir, run_opts, srand=-1, input_model=None): def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, - run_opts): + run_opts, + use_multitask_egs=False): model = '{0}/{1}.mdl'.format(dir, iter) + scp_or_ark = "scp" if use_multitask_egs else "ark" + egs_suffix = ".scp" if use_multitask_egs else ".cegs" + + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="valid_diagnostic.", + use_multitask_egs=use_multitask_egs) + common_lib.background_command( """{command} {dir}/log/compute_prob_valid.{iter}.log \ nnet3-chain-compute-prob --l2-regularize={l2} \ --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs ark:{egs_dir}/valid_diagnostic.cegs \ + "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/valid_diagnostic{egs_suffix} \ ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \ """.format(command=run_opts.command, dir=dir, iter=iter, model=model, l2=l2_regularize, leaky=leaky_hmm_coefficient, xent_reg=xent_regularize, - egs_dir=egs_dir)) + egs_dir=egs_dir, + multitask_egs_opts=multitask_egs_opts, + scp_or_ark=scp_or_ark, egs_suffix=egs_suffix)) + + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="train_diagnostic.", + use_multitask_egs=use_multitask_egs) common_lib.background_command( """{command} {dir}/log/compute_prob_train.{iter}.log \ nnet3-chain-compute-prob --l2-regularize={l2} \ --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs ark:{egs_dir}/train_diagnostic.cegs \ 
+ "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/train_diagnostic{egs_suffix} \ ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \ """.format(command=run_opts.command, dir=dir, iter=iter, model=model, l2=l2_regularize, leaky=leaky_hmm_coefficient, xent_reg=xent_regularize, - egs_dir=egs_dir)) + egs_dir=egs_dir, + multitask_egs_opts=multitask_egs_opts, + scp_or_ark=scp_or_ark, egs_suffix=egs_suffix)) def compute_progress(dir, iter, run_opts): @@ -510,10 +557,12 @@ def compute_progress(dir, iter, run_opts): model=model)) + def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str, egs_dir, leaky_hmm_coefficient, l2_regularize, xent_regularize, run_opts, - max_objective_evaluations=30): + max_objective_evaluations=30, + use_multitask_egs=False): """ Function to do model combination In the nnet3 setup, the logic @@ -536,6 +585,14 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st print("{0}: warning: model file {1} does not exist " "(final combination)".format(sys.argv[0], model_file)) + scp_or_ark = "scp" if use_multitask_egs else "ark" + egs_suffix = ".scp" if use_multitask_egs else ".cegs" + + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="combine.", + use_multitask_egs=use_multitask_egs) + # We reverse the order of the raw model strings so that the freshest one # goes first. This is important for systems that include batch # normalization-- it means that the freshest batch-norm stats are used. @@ -550,7 +607,7 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st --max-objective-evaluations={max_objective_evaluations} \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ --verbose=3 {combine_gpu_opt} {dir}/den.fst {raw_models} \ - "ark,bg:nnet3-chain-copy-egs ark:{egs_dir}/combine.cegs ark:- | \ + "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/combine{egs_suffix} ark:- | \ nnet3-chain-merge-egs --minibatch-size={num_chunk_per_mb} \ ark:- ark:- |" - \| \ nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl \ @@ -563,7 +620,9 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st dir=dir, raw_models=" ".join(raw_model_strings), num_chunk_per_mb=num_chunk_per_minibatch_str, num_iters=num_iters, - egs_dir=egs_dir)) + egs_dir=egs_dir, + multitask_egs_opts=multitask_egs_opts, + scp_or_ark=scp_or_ark, egs_suffix=egs_suffix)) # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the @@ -572,4 +631,5 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st dir=dir, iter='final', egs_dir=egs_dir, l2_regularize=l2_regularize, xent_regularize=xent_regularize, leaky_hmm_coefficient=leaky_hmm_coefficient, - run_opts=run_opts) + run_opts=run_opts, + use_multitask_egs=use_multitask_egs) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 6b572acb5d7..c18003a626e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -329,21 +329,32 @@ def train_one_iteration(dir, iter, srand, egs_dir, def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, max_lda_jobs=None, rand_prune=4.0, - lda_opts=None): + lda_opts=None, use_multitask_egs=False): if max_lda_jobs is not None: if num_lda_jobs > max_lda_jobs: 
num_lda_jobs = max_lda_jobs + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="egs.", + archive_index="JOB", + use_multitask_egs=use_multitask_egs) + scp_or_ark = "scp" if use_multitask_egs else "ark" + egs_rspecifier = ( + "ark:nnet3-copy-egs {multitask_egs_opts} " + "{scp_or_ark}:{egs_dir}/egs.JOB.{scp_or_ark} ark:- |" + "".format(egs_dir=egs_dir, scp_or_ark=scp_or_ark, + multitask_egs_opts=multitask_egs_opts)) # Write stats with the same format as stats for LDA. common_lib.execute_command( """{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ nnet3-acc-lda-stats --rand-prune={rand_prune} \ - {dir}/init.raw "ark:{egs_dir}/egs.JOB.ark" \ + {dir}/init.raw "{egs_rspecifier}" \ {dir}/JOB.lda_stats""".format( command=run_opts.command, num_lda_jobs=num_lda_jobs, dir=dir, - egs_dir=egs_dir, + egs_rspecifier=egs_rspecifier, rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index eda1461a2ab..99a4fb28ff6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -367,6 +367,14 @@ class XconfigTrivialOutputLayer(XconfigLayerBase): This is for outputs that are not really output "layers" (there is no affine transform or nonlinearity), they just directly map to an output-node in nnet3. + + Parameters of the class, and their defaults: + input='[-1]' : Descriptor giving the input of the layer. + objective-type=linear : the only other choice currently is + 'quadratic', for use in regression problems + output-delay=0 : Can be used to shift the frames on the output, equivalent + to delaying labels by this many frames (positive value increases latency + in online decoding but may help if you're using unidirectional LSTMs. """ def __init__(self, first_token, key_to_value, prev_names=None): @@ -378,11 +386,17 @@ def set_default_configs(self): # note: self.config['input'] is a descriptor, '[-1]' means output # the most recent layer. - self.config = {'input': '[-1]', 'dim': -1} + self.config = {'input': '[-1]', 'dim': -1, + 'objective-type': 'linear', + 'output-delay': 0} def check_configs(self): - pass # nothing to check; descriptor-parsing can't happen in this function. + if self.config['objective-type'] != 'linear' and \ + self.config['objective-type'] != 'quadratic': + raise RuntimeError("In output, objective-type has" + " invalid value {0}" + "".format(self.config['objective-type'])) def output_name(self, auxiliary_outputs=None): @@ -412,11 +426,19 @@ def get_full_config(self): # by 'output-string' we mean a string that can appear in # config-files, i.e. it contains the 'final' names of nodes. 
descriptor_final_str = self.descriptors['input']['final-string'] + objective_type = self.config['objective-type'] + output_delay = self.config['output-delay'] - for config_name in ['init', 'ref', 'final']: + if output_delay != 0: + descriptor_final_str = ( + 'Offset({0}, {1})'.format(descriptor_final_str, output_delay)) + + for config_name in ['ref', 'final']: ans.append((config_name, - 'output-node name={0} input={1}'.format( - self.name, descriptor_final_str))) + 'output-node name={0} input={1} ' + 'objective={2}'.format( + self.name, descriptor_final_str, + objective_type))) return ans @@ -507,28 +529,38 @@ def check_configs(self): " invalid value {0}" "".format(self.config['learning-rate-factor'])) - # you cannot access the output of this layer from other layers... see - # comment in output_name for the reason why. def auxiliary_outputs(self): - return [] + auxiliary_outputs = ['affine'] + if self.config['include-log-softmax']: + auxiliary_outputs.append('log-softmax') - def output_name(self, auxiliary_outputs=None): + return auxiliary_outputs + + def output_name(self, auxiliary_output=None): - # Note: nodes of type output-node in nnet3 may not be accessed in - # Descriptors, so calling this with auxiliary_outputs=None doesn't - # make sense. But it might make sense to make the output of the softmax - # layer and/or the output of the affine layer available as inputs to - # other layers, in some circumstances. - # we'll implement that when it's needed. - raise RuntimeError("Outputs of output-layer may not be used by other" - " layers") + if auxiliary_output is None: + # Note: nodes of type output-node in nnet3 may not be accessed in + # Descriptors, so calling this with auxiliary_outputs=None doesn't + # make sense. + raise RuntimeError("Outputs of output-layer may not be used by other" + " layers") + + if auxiliary_output in self.auxiliary_outputs(): + return '{0}.{1}'.format(self.name, auxiliary_output) + else: + raise RuntimeError("Unknown auxiliary output name {0}" + "".format(auxiliary_output)) def output_dim(self, auxiliary_output=None): - # see comment in output_name(). - raise RuntimeError("Outputs of output-layer may not be used by other" - " layers") + if auxiliary_output is None: + # Note: nodes of type output-node in nnet3 may not be accessed in + # Descriptors, so calling this with auxiliary_outputs=None doesn't + # make sense. + raise RuntimeError("Outputs of output-layer may not be used by other" + " layers") + return self.config['dim'] def get_full_config(self): ans = [] diff --git a/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh b/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh new file mode 100755 index 00000000000..a075b8debe8 --- /dev/null +++ b/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +# Copyright 2014 Guoguo Chen +# 2017 Vimal Manohar +# Apache 2.0 + +# This script rescores non-compact, (possibly) undeterminized lattices with the +# ConstArpaLm format language model. +# This is similar to steps/lmrescore_const_arpa.sh, but expects +# non-compact lattices as input. +# This works by first determinizing the lattice and rescoring it with +# const ARPA LM, followed by composing it with the original lattice to add the +# new LM scores. + +# If you use the option "--write compact false" it outputs non-compact lattices; +# the purpose is to add in LM scores while leaving the frame-by-frame acoustic +# scores in the same position that they were in in the input, undeterminized +# lattices. 
This is important in our 'chain' semi-supervised training recipes, +# where it helps us to split lattices while keeping the scores at the edges of +# the split points correct. + +# Begin configuration section. +cmd=run.pl +skip_scoring=false +stage=1 +scoring_opts= +write_compact=true # If set to false, writes lattice in non-compact format. + # This retains the acoustic scores on the arcs of the lattice. + # Useful for another stage of LM rescoring. +acwt=0.1 # used for pruning and determinization +beam=8.0 # beam used in determinization + +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +. ./utils/parse_options.sh + +if [ $# != 5 ]; then + cat < \\ + + options: [--cmd (run.pl|queue.pl [queue opts])] + See also: steps/lmrescore_const_arpa.sh +EOF + exit 1; +fi + +[ -f path.sh ] && . ./path.sh; + +oldlang=$1 +newlang=$2 +data=$3 +indir=$4 +outdir=$5 + +oldlm=$oldlang/G.fst +newlm=$newlang/G.carpa +! cmp $oldlang/words.txt $newlang/words.txt &&\ + echo "$0: Warning: vocabularies may be incompatible." +[ ! -f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1; +[ ! -f $newlm ] && echo "$0: Missing file $newlm" && exit 1; +! ls $indir/lat.*.gz >/dev/null &&\ + echo "$0: No lattices input directory $indir" && exit 1; + +if ! cmp -s $oldlang/words.txt $newlang/words.txt; then + echo "$0: $oldlang/words.txt and $newlang/words.txt differ: make sure you know what you are doing."; +fi + +oldlmcommand="fstproject --project_output=true $oldlm |" + +mkdir -p $outdir/log +nj=`cat $indir/num_jobs` || exit 1; +cp $indir/num_jobs $outdir + +lats_rspecifier="ark:gunzip -c $indir/lat.JOB.gz |" + +lats_wspecifier="ark:| gzip -c > $outdir/lat.JOB.gz" + +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$beam \ + "ark:gunzip -c $indir/lat.JOB.gz |" ark:- \| \ + lattice-scale --lm-scale=0.0 --acoustic-scale=0.0 ark:- ark:- \| \ + lattice-lmrescore --lm-scale=-1.0 ark:- "$oldlmcommand" ark:- \| \ + lattice-lmrescore-const-arpa --lm-scale=1.0 \ + ark:- "$newlm" ark:- \| \ + lattice-project ark:- ark:- \| \ + lattice-compose --write-compact=$write_compact \ + "$lats_rspecifier" \ + ark,s,cs:- "$lats_wspecifier" || exit 1 +fi + +if ! $skip_scoring && [ $stage -le 2 ]; then + err_msg="Not scoring because local/score.sh does not exist or not executable." + [ ! -x local/score.sh ] && echo $err_msg && exit 1; + local/score.sh --cmd "$cmd" $scoring_opts $data $newlang $outdir +else + echo "Not scoring because requested so..." +fi + +exit 0; diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh index 3c2c770470c..23fb62d7a87 100755 --- a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh @@ -175,7 +175,7 @@ fi if [ $stage -le -1 ]; then # Convert the alignments to the new tree. Note: we likely will not use these - # converted alignments in the CTC system directly, but they could be useful + # converted alignments in the chain system directly, but they could be useful # for other purposes. 
echo "$0: Converting alignments from $alidir to use current tree" $cmd JOB=1:$nj $dir/log/convert.JOB.log \ diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree_multiple_sources.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree_multiple_sources.sh new file mode 100755 index 00000000000..48028634e26 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree_multiple_sources.sh @@ -0,0 +1,275 @@ +#!/bin/bash +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# 2017 Vimal Manohar +# Apache 2.0. + +# This script is similar to steps/nnet3/chain/build_tree.sh but supports +# getting statistics from multiple alignment sources. + + +# Begin configuration section. +stage=-5 +exit_stage=-100 # you can use this to require it to exit at the + # beginning of a specific stage. Not all values are + # supported. +cmd=run.pl +use_fmllr=true # If true, fmllr transforms will be applied from the alignment directories. + # Otherwise, no fmllr will be applied even if alignment directory contains trans.* +context_opts= # e.g. set this to "--context-width 5 --central-position 2" for quinphone. +cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves +frame_subsampling_factor=1 # frame subsampling factor of output w.r.t. to the input features +tree_stats_opts= +cluster_phones_opts= +repeat_frames=false +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -lt 5 ]; then + echo "Usage: steps/nnet3/chain/build_tree_multiple_sources.sh <#leaves> [ ... ] " + echo " e.g.: steps/nnet3/chain/build_tree_multiple_sources.sh 15000 data/lang data/train_sup exp/tri3_ali data/train_unsup exp/tri3/best_path_train_unsup exp/tree_semi" + echo "Main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --config # config containing options" + echo " --stage # stage to do partial re-run from." + echo " --repeat-frames # Only affects alignment conversion at" + echo " # the end. If true, generate an " + echo " # alignment using the frame-subsampled " + echo " # topology that is repeated " + echo " # --frame-subsampling-factor times " + echo " # and interleaved, to be the same " + echo " # length as the original alignment " + echo " # (useful for cross-entropy training " + echo " # of reduced frame rate systems)." + exit 1; +fi + +numleaves=$1 +lang=$2 +dir=${@: -1} # last argument to the script +shift 2; +data_and_alidirs=( $@ ) # read the remaining arguments into an array +unset data_and_alidirs[${#data_and_alidirs[@]}-1] # 'pop' the last argument which is odir +num_sys=$[${#data_and_alidirs[@]}] # number of systems to combine + +if (( $num_sys % 2 != 0 )); then + echo "$0: The data and alignment arguments must be an even number of arguments." + exit 1 +fi + +num_sys=$((num_sys / 2)) + +data=$dir/data_tmp +mkdir -p $data + +mkdir -p $dir +alidir=`echo ${data_and_alidirs[1]}` + +datadirs=() +alidirs=() +for n in `seq 0 $[num_sys-1]`; do + datadirs[$n]=${data_and_alidirs[$[2*n]]} + alidirs[$n]=${data_and_alidirs[$[2*n+1]]} +done + +utils/combine_data.sh $data ${datadirs[@]} || exit 1 + +for f in $data/feats.scp $lang/phones.txt $alidir/final.mdl $alidir/ali.1.gz; do + [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; +done + +oov=`cat $lang/oov.int` +nj=`cat $alidir/num_jobs` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` +ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1; +sdata=$data/split$nj; +splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. +cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` || exit 1 +delta_opts=`cat $alidir/delta_opts 2>/dev/null` + +mkdir -p $dir/log +cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options. +cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $alidir/delta_opts $dir 2>/dev/null # delta option. + +utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1; +cp $lang/phones.txt $dir || exit 1; + +echo $nj >$dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +# Set up features. +if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi + +echo "$0: feature type is $feat_type" + +feats=() +feats_one=() +for n in `seq 0 $[num_sys-1]`; do + this_nj=$(cat ${alidirs[$n]}/num_jobs) || exit 1 + this_sdata=${datadirs[$n]}/split$this_nj + [[ -d $this_sdata && ${datadirs[$n]}/feats.scp -ot $this_sdata ]] || split_data.sh ${datadirs[$n]} $this_nj || exit 1; + ## Set up speaker-independent features. + case $feat_type in + delta) feats[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/JOB/utt2spk scp:$this_sdata/JOB/cmvn.scp scp:$this_sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |" + feats_one[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/1/utt2spk scp:$this_sdata/1/cmvn.scp scp:$this_sdata/1/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; + lda) feats[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/JOB/utt2spk scp:$this_sdata/JOB/cmvn.scp scp:$this_sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" + feats_one[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/1/utt2spk scp:$this_sdata/1/cmvn.scp scp:$this_sdata/1/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" + cp $alidir/final.mat $dir + cp $alidir/full.mat $dir 2>/dev/null + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; + esac + + if $use_fmllr; then + if [ ! -f ${alidirs[$n]}/trans.1 ]; then + echo "$0: Could not find fMLLR transforms in ${alidirs[$n]}" + exit 1 + fi + + echo "$0: Using transforms from ${alidirs[$n]}" + feats[i]="${feats[i]} transform-feats --utt2spk=ark:$this_sdata/JOB/utt2spk ark,s,cs:${alidirs[$n]}/trans.JOB ark:- ark:- |" + feats_one[i]="${feats_one[i]} transform-feats --utt2spk=ark:$this_sdata/1/utt2spk ark,s,cs:${alidirs[$n]}/trans.1 ark:- ark:- |" + fi + + # Do subsampling of feats, if needed + if [ $frame_subsampling_factor -gt 1 ]; then + feats[$n]="${feats[$n]} subsample-feats --n=$frame_subsampling_factor ark:- ark:- |" + feats_one[$n]="${feats_one[$n]} subsample-feats --n=$frame_subsampling_factor ark:- ark:- |" + fi +done + +if [ $stage -le -5 ]; then + echo "$0: Initializing monophone model (for alignment conversion, in case topology changed)" + + [ ! -f $lang/phones/sets.int ] && exit 1; + shared_phones_opt="--shared-phones=$lang/phones/sets.int" + # get feature dimension + example_feats="`echo ${feats[0]} | sed s/JOB/1/g`"; + if ! feat_dim=$(feat-to-dim "$example_feats" - 2>/dev/null) || [ -z $feat_dim ]; then + feat-to-dim "$example_feats" - # to see the error message. 
+ echo "error getting feature dimension" + exit 1; + fi + + for n in `seq 0 $[num_sys-1]`; do + copy-feats "${feats_one[$n]}" ark:- + done | copy-feats ark:- ark:$dir/tmp.ark + + $cmd $dir/log/init_mono.log \ + gmm-init-mono $shared_phones_opt \ + "--train-feats=ark:subset-feats --n=10 ark:$dir/tmp.ark ark:- |" $lang/topo $feat_dim \ + $dir/mono.mdl $dir/mono.tree || exit 1 +fi + + +if [ $stage -le -4 ]; then + # Get tree stats. + + for n in `seq 0 $[num_sys-1]`; do + echo "$0: Accumulating tree stats" + this_data=${datadirs[$n]} + this_alidir=${alidirs[$n]} + this_nj=$(cat $this_alidir/num_jobs) || exit 1 + this_frame_subsampling_factor=1 + if [ -f $this_alidir/frame_subsampling_factor ]; then + this_frame_subsampling_factor=$(cat $this_alidir/frame_subsampling_factor) + fi + + if (( $frame_subsampling_factor % $this_frame_subsampling_factor != 0 )); then + echo "$0: frame-subsampling-factor=$frame_subsampling_factor is not " + echo "divisible by $this_frame_subsampling_factor (that of $this_alidir)" + exit 1 + fi + + this_frame_subsampling_factor=$((frame_subsampling_factor / this_frame_subsampling_factor)) + $cmd JOB=1:$this_nj $dir/log/acc_tree.$n.JOB.log \ + convert-ali --frame-subsampling-factor=$this_frame_subsampling_factor \ + $this_alidir/final.mdl $dir/mono.mdl $dir/mono.tree "ark:gunzip -c $this_alidir/ali.JOB.gz|" ark:- \| \ + acc-tree-stats $context_opts $tree_stats_opts --ci-phones=$ciphonelist $dir/mono.mdl \ + "${feats[$n]}" ark:- $dir/$n.JOB.treeacc || exit 1; + [ "`ls $dir/$n.*.treeacc | wc -w`" -ne "$this_nj" ] && echo "$0: Wrong #tree-accs for data $n $this_data" && exit 1; + done + + $cmd $dir/log/sum_tree_acc.log \ + sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1; + rm $dir/*.treeacc +fi + +if [ $stage -le -3 ] && $train_tree; then + echo "$0: Getting questions for tree clustering." + # preparing questions, roots file... + $cmd $dir/log/questions.log \ + cluster-phones $cluster_phones_opts $context_opts $dir/treeacc \ + $lang/phones/sets.int $dir/questions.int || exit 1; + cat $lang/phones/extra_questions.int >> $dir/questions.int + $cmd $dir/log/compile_questions.log \ + compile-questions \ + $context_opts $lang/topo $dir/questions.int $dir/questions.qst || exit 1; + + echo "$0: Building the tree" + $cmd $dir/log/build_tree.log \ + build-tree $context_opts --verbose=1 --max-leaves=$numleaves \ + --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree || exit 1; +fi + +if [ $stage -le -2 ]; then + echo "$0: Initializing the model" + gmm-init-model --write-occs=$dir/1.occs \ + $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1; + grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning."; + rm $dir/treeacc +fi + +if [ $stage -le -1 ]; then + # Convert the alignments to the new tree. Note: we likely will not use these + # converted alignments in the chain system directly, but they could be useful + # for other purposes. 
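The per-source frame-subsampling arithmetic used when accumulating the tree stats above, and repeated below when converting the alignments, is simple; a minimal Python sketch of it (the function name is illustrative, the script itself keeps this in bash):

    def relative_subsampling_factor(target_factor, source_factor):
        """How much further a source alignment must be subsampled so that it
        matches the target frame rate of the tree being built."""
        if target_factor % source_factor != 0:
            raise ValueError("frame-subsampling-factor=%d is not divisible "
                             "by %d" % (target_factor, source_factor))
        return target_factor // source_factor

    # e.g. chain target factor 3 over alignments produced without subsampling:
    assert relative_subsampling_factor(3, 1) == 3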
+ + for n in `seq 0 $[num_sys-1]`; do + this_alidir=${alidirs[$n]} + this_nj=$(cat $this_alidir/num_jobs) || exit 1 + + this_frame_subsampling_factor=1 + if [ -f $this_alidir/frame_subsampling_factor ]; then + this_frame_subsampling_factor=$(cat $this_alidir/frame_subsampling_factor) + fi + + if (( $frame_subsampling_factor % $this_frame_subsampling_factor != 0 )); then + echo "$0: frame-subsampling-factor=$frame_subsampling_factor is not " + echo "divisible by $this_frame_subsampling_factor (hat of $this_alidir)" + exit 1 + fi + + echo "$0: frame-subsampling-factor for $this_alidir is $this_frame_subsampling_factor" + + this_frame_subsampling_factor=$((frame_subsampling_factor / this_frame_subsampling_factor)) + echo "$0: Converting alignments from $this_alidir to use current tree" + $cmd JOB=1:$this_nj $dir/log/convert.$n.JOB.log \ + convert-ali --repeat-frames=$repeat_frames \ + --frame-subsampling-factor=$this_frame_subsampling_factor \ + $this_alidir/final.mdl $dir/1.mdl $dir/tree "ark:gunzip -c $this_alidir/ali.JOB.gz |" \ + ark,scp:$dir/ali.$n.JOB.ark,$dir/ali.$n.JOB.scp || exit 1 + + for i in `seq $this_nj`; do + cat $dir/ali.$n.$i.scp + done > $dir/ali.$n.scp || exit 1 + done + + for n in `seq 0 $[num_sys-1]`; do + cat $dir/ali.$n.scp + done | sort -k1,1 > $dir/ali.scp || exit 1 + + utils/split_data.sh $data $nj + $cmd JOB=1:$nj $dir/log/copy_alignments.JOB.log \ + copy-int-vector "scp:utils/filter_scp.pl $data/split$nj/JOB/utt2spk $dir/ali.scp |" \ + "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 +fi + +cp $dir/1.mdl $dir/final.mdl + +echo $0: Done building tree diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 0294df0d84a..99e7499bd30 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -46,15 +46,15 @@ frames_per_iter=400000 # each iteration of training, see this many frames per # used. This is just a guideline; it will pick a number # that divides the number of samples in the entire data. -right_tolerance= #CTC right tolerance == max label delay. +right_tolerance= # chain right tolerance == max label delay. left_tolerance= transform_dir= # If supplied, overrides latdir as the place to find fMLLR transforms stage=0 -nj=15 # This should be set to the maximum number of jobs you are - # comfortable to run in parallel; you can increase it if your disk - # speed is greater and you have more machines. +max_jobs_run=15 # This should be set to the maximum number of nnet3-chain-get-egs jobs you are + # comfortable to run in parallel; you can increase it if your disk + # speed is greater and you have more machines. max_shuffle_jobs_run=50 # the shuffle jobs now include the nnet3-chain-normalize-egs command, # which is fairly CPU intensive, so we can run quite a few at once # without overloading the disks. @@ -63,6 +63,17 @@ online_ivector_dir= # can be used if we are including speaker information as iV cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, # it doesn't make sense to use different options than were used as input to the # LDA transform). This is used to turn off CMVN in the online-nnet experiments. +lattice_lm_scale= # If supplied, the graph/lm weight of the lattices will be + # used (with this scale) in generating supervisions + # This is 0 by default for conventional supervised training, + # but may be close to 1 for the unsupervised part of the data + # in semi-supervised training. The optimum is usually + # 0.5 for unsupervised data. 
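When this option is set, the script further down passes it to chain-get-supervision as --lm-scale and puts the complementary weight on the normalization FST. A small Python sketch of that arithmetic (the script itself does this with an inline perl one-liner; the function name here is illustrative):

    def normalization_fst_scale(lattice_lm_scale):
        """Weight left for the normalization FST when the lattice LM scores
        are kept with weight lattice_lm_scale (mirrors the inline check)."""
        if lattice_lm_scale < 0.0 or lattice_lm_scale >= 1.0:
            raise ValueError("Invalid --lattice-lm-scale %s" % lattice_lm_scale)
        return 1.0 - lattice_lm_scale

    # Typical semi-supervised setting: split the weight evenly.
    assert normalization_fst_scale(0.5) == 0.5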
+lattice_prune_beam= # If supplied, the lattices will be pruned to this beam, + # before being used to get supervisions. +acwt=0.1 # For pruning +deriv_weights_scp= +generate_egs_scp=false echo "$0 $@" # Print the command line for logging @@ -80,7 +91,7 @@ if [ $# != 4 ]; then echo "" echo "Main options (for others, see top of script file)" echo " --config # config file containing options" - echo " --nj # The maximum number of jobs you want to run in" + echo " --max-jobs-run # The maximum number of jobs you want to run in" echo " # parallel (increase this only if you have good disk and" echo " # network speed). default=6" echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." @@ -94,8 +105,16 @@ if [ $# != 4 ]; then echo " --left-context-initial # If >= 0, left-context for first chunk of an utterance" echo " --right-context-final # If >= 0, right-context for last chunk of an utterance" echo " --num-egs-diagnostic <#frames;4000> # Number of egs used in computing (train,valid) diagnostics" - echo " --num-valid-egs-combine <#frames;10000> # Number of egss used in getting combination weights at the" + echo " --num-valid-egs-combine <#frames;10000> # Number of egs used in getting combination weights at the" echo " # very end." + echo " --lattice-lm-scale # If supplied, the graph/lm weight of the lattices will be " + echo " # used (with this scale) in generating supervisions" + echo " --lattice-prune-beam # If supplied, the lattices will be pruned to this beam, " + echo " # before being used to get supervisions." + echo " --acwt # Acoustic scale -- affects pruning" + echo " --deriv-weights-scp # If supplied, adds per-frame weights to the supervision." + echo " --generate-egs-scp # Generates scp files -- Required if the egs will be " + echo " # used for multilingual/multitask training." echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." @@ -116,13 +135,13 @@ for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \ [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done +nj=$(cat $latdir/num_jobs) || exit 1 + sdata=$data/split$nj utils/split_data.sh $data $nj mkdir -p $dir/log $dir/info -num_lat_jobs=$(cat $latdir/num_jobs) || exit 1; - # Get list of validation utterances. frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 @@ -186,6 +205,8 @@ if [ -f $dir/trans.scp ]; then train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" fi +tree-info $chaindir/tree | grep num-pdfs | awk '{print $2}' > $dir/info/num_pdfs || exit 1 + if [ ! -z "$online_ivector_dir" ]; then ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; echo $ivector_dim > $dir/info/ivector_dim @@ -259,20 +280,11 @@ if [ -e $dir/storage ]; then done fi -if [ $stage -le 2 ]; then - echo "$0: copying training lattices" - - $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ - lattice-copy "ark:gunzip -c $latdir/lat.JOB.gz|" ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; - - for id in $(seq $num_lat_jobs); do cat $dir/lat.$id.scp; done > $dir/lat.scp -fi - - egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" [ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" [ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" +[ ! 
-z "$deriv_weights_scp" ] && egs_opts="$egs_opts --deriv-weights-rspecifier=scp:$deriv_weights_scp" chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$alignment_subsampling_factor" [ ! -z $right_tolerance ] && \ @@ -281,60 +293,99 @@ chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$ali [ ! -z $left_tolerance ] && \ chain_supervision_all_opts="$chain_supervision_all_opts --left-tolerance=$left_tolerance" +lats_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" +if [ ! -z $lattice_prune_beam ]; then + if [ "$lattice_prune_beam" == "0" ] || [ "$lattice_prune_beam" == "0.0" ]; then + lats_rspecifier="$lats_rspecifier lattice-1best --acoustic-scale=$acwt ark:- ark:- |" + else + lats_rspecifier="$lats_rspecifier lattice-prune --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:- ark:- |" + fi +fi + +normalization_fst_scale=1.0 + +if [ ! -z "$lattice_lm_scale" ]; then + chain_supervision_all_opts="$chain_supervision_all_opts --lm-scale=$lattice_lm_scale" + + normalization_fst_scale=$(perl -e " + if ($lattice_lm_scale >= 1.0 || $lattice_lm_scale < 0) { + print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\"; + exit(1); + } + print (1.0 - $lattice_lm_scale);") || exit 1 +fi + echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context echo $left_context_initial > $dir/info/left_context_initial echo $right_context_final > $dir/info/right_context_final -if [ $stage -le 3 ]; then - echo "$0: Getting validation and training subset examples." +if [ $stage -le 2 ]; then + echo "$0: Getting validation and training subset examples in background." rm $dir/.error 2>/dev/null - echo "$0: ... extracting validation and training-subset alignments." - - # do the filtering just once, as lat.scp may be long. - utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ - <$dir/lat.scp >$dir/lat_special.scp - - $cmd $dir/log/create_valid_subset.log \ - utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ - lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ - chain-get-supervision $chain_supervision_all_opts $chaindir/tree $chaindir/0.trans_mdl \ - ark:- ark:- \| \ - nnet3-chain-get-egs $ivector_opts --srand=$srand \ - $egs_opts $chaindir/normalization.fst \ - "$valid_feats" ark,s,cs:- "ark:$dir/valid_all.cegs" || touch $dir/.error & - $cmd $dir/log/create_train_subset.log \ - utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ - lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ - chain-get-supervision $chain_supervision_all_opts \ - $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ - nnet3-chain-get-egs $ivector_opts --srand=$srand \ - $egs_opts $chaindir/normalization.fst \ - "$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || touch $dir/.error & - wait; - [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 - echo "... Getting subsets of validation examples for diagnostics and combination." 
- $cmd $dir/log/create_valid_subset_combine.log \ - nnet3-chain-subset-egs --n=$num_valid_egs_combine ark:$dir/valid_all.cegs \ - ark:$dir/valid_combine.cegs || touch $dir/.error & - $cmd $dir/log/create_valid_subset_diagnostic.log \ - nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/valid_all.cegs \ - ark:$dir/valid_diagnostic.cegs || touch $dir/.error & - - $cmd $dir/log/create_train_subset_combine.log \ - nnet3-chain-subset-egs --n=$num_train_egs_combine ark:$dir/train_subset_all.cegs \ - ark:$dir/train_combine.cegs || touch $dir/.error & - $cmd $dir/log/create_train_subset_diagnostic.log \ - nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \ - ark:$dir/train_diagnostic.cegs || touch $dir/.error & - wait - sleep 5 # wait for file system to sync. - cat $dir/valid_combine.cegs $dir/train_combine.cegs > $dir/combine.cegs - - for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do - [ ! -s $f ] && echo "No examples in file $f" && exit 1; - done - rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs + + ( + $cmd --max-jobs-run 6 JOB=1:$nj $dir/log/lattice_copy.JOB.log \ + lattice-copy --include="cat $dir/valid_uttlist $dir/train_subset_uttlist |" --ignore-missing \ + "$lats_rspecifier" \ + ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1 + + for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp + + $cmd $dir/log/create_valid_subset.log \ + utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + chain-get-supervision $chain_supervision_all_opts $chaindir/tree $chaindir/0.trans_mdl \ + ark:- ark:- \| \ + nnet3-chain-get-egs $ivector_opts --srand=$srand \ + $egs_opts --normalization-fst-scale=$normalization_fst_scale $chaindir/normalization.fst \ + "$valid_feats" ark,s,cs:- "ark:$dir/valid_all.cegs" || exit 1 + $cmd $dir/log/create_train_subset.log \ + utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + chain-get-supervision $chain_supervision_all_opts \ + $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ + nnet3-chain-get-egs $ivector_opts --srand=$srand \ + $egs_opts --normalization-fst-scale=$normalization_fst_scale $chaindir/normalization.fst \ + "$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || exit 1 + wait + sleep 5 # wait for file system to sync. + echo "... Getting subsets of validation examples for diagnostics and combination." 
+ if $generate_egs_scp; then + valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.cegs,$dir/valid_diagnostic.scp" + train_diagnostic_output="ark,scp:$dir/train_diagnostic.cegs,$dir/train_diagnostic.scp" + else + valid_diagnostic_output="ark:$dir/valid_diagnostic.cegs" + train_diagnostic_output="ark:$dir/train_diagnostic.cegs" + fi + $cmd $dir/log/create_valid_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_valid_egs_combine ark:$dir/valid_all.cegs \ + ark:$dir/valid_combine.cegs || exit 1 + $cmd $dir/log/create_valid_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/valid_all.cegs \ + $valid_diagnostic_output || exit 1 + + $cmd $dir/log/create_train_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_train_egs_combine ark:$dir/train_subset_all.cegs \ + ark:$dir/train_combine.cegs || exit 1 + $cmd $dir/log/create_train_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \ + $train_diagnostic_output || exit 1 + wait + sleep 5 # wait for file system to sync. + if $generate_egs_scp; then + cat $dir/valid_combine.cegs $dir/train_combine.cegs | \ + nnet3-chain-copy-egs ark:- ark,scp:$dir/combine.cegs,$dir/combine.scp + rm $dir/{train,valid}_combine.scp + else + cat $dir/valid_combine.cegs $dir/train_combine.cegs > $dir/combine.cegs + fi + + for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do + [ ! -s $f ] && echo "No examples in file $f" && exit 1; + done + rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs + ) || touch $dir/.error & fi if [ $stage -le 4 ]; then @@ -355,9 +406,10 @@ if [ $stage -le 4 ]; then # there can be too many small files to deal with, because the total number of # files is the product of 'nj' by 'num_archives_intermediate', which might be # quite large. - $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ - utils/filter_scp.pl $sdata/JOB/utt2spk $dir/lat.scp \| \ - lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + + $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/get_egs.JOB.log \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl \ + "$lats_rspecifier" ark:- \| \ chain-get-supervision $chain_supervision_all_opts \ $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=\$[JOB+$srand] $egs_opts \ @@ -366,6 +418,10 @@ if [ $stage -le 4 ]; then nnet3-chain-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1; fi +if [ -f $dir/.error ]; then + echo "Error detected while creating train/valid egs" && exit 1 +fi + if [ $stage -le 5 ]; then echo "$0: recombining and shuffling order of archives on disk" # combine all the "egs_orig.*.JOB.scp" (over the $nj splits of the data) and @@ -378,16 +434,35 @@ if [ $stage -le 5 ]; then done if [ $archives_multiple == 1 ]; then # normal case. 
- $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ - nnet3-chain-normalize-egs $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ - nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:$dir/cegs.JOB.ark || exit 1; + if $generate_egs_scp; then + output_archive="ark,scp:$dir/cegs.JOB.ark,$dir/cegs.JOB.scp" + else + output_archive="ark:$dir/cegs.JOB.ark" + fi + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G \ + JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-normalize-egs --normalization-fst-scale=$normalization_fst_scale \ + $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- $output_archive || exit 1; + + if $generate_egs_scp; then + #concatenate cegs.JOB.scp in single cegs.scp + for j in $(seq $num_archives_intermediate); do + cat $dir/cegs.$j.scp || exit 1; + done > $dir/cegs.scp || exit 1; + for f in $dir/cegs.*.scp; do rm $f; done + fi else # we need to shuffle the 'intermediate archives' and then split into the # final archives. we create soft links to manage this splitting, because # otherwise managing the output names is quite difficult (and we don't want # to submit separate queue jobs for each intermediate archive, because then # the --max-jobs-run option is hard to enforce). - output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/cegs.JOB.$y.ark; done)" + if $generate_egs_scp; then + output_archives="$(for y in $(seq $archives_multiple); do echo ark,scp:$dir/cegs.JOB.$y.ark,$dir/cegs.JOB.$y.scp; done)" + else + output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/cegs.JOB.$y.ark; done)" + fi for x in $(seq $num_archives_intermediate); do for y in $(seq $archives_multiple); do archive_index=$[($x-1)*$archives_multiple+$y] @@ -395,13 +470,31 @@ if [ $stage -le 5 ]; then ln -sf cegs.$archive_index.ark $dir/cegs.$x.$y.ark || exit 1 done done - $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ - nnet3-chain-normalize-egs $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G \ + JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-normalize-egs --normalization-fst-scale=$normalization_fst_scale \ + $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:- \| \ nnet3-chain-copy-egs ark:- $output_archives || exit 1; + + if $generate_egs_scp; then + #concatenate cegs.JOB.scp in single cegs.scp + rm -rf $dir/cegs.scp + for j in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do + cat $dir/cegs.$j.$y.scp || exit 1; + done + done > $dir/cegs.scp || exit 1; + for f in $dir/cegs.*.*.scp; do rm $f; done + fi fi fi +wait +if [ -f $dir/.error ]; then + echo "Error detected while creating train/valid egs" && exit 1 +fi + if [ $stage -le 6 ]; then echo "$0: removing temporary archives" ( @@ -415,8 +508,6 @@ if [ $stage -le 6 ]; then # there are some extra soft links that we should delete. for f in $dir/cegs.*.*.ark; do rm $f; done fi - echo "$0: removing temporary lattices" - rm $dir/lat.* echo "$0: removing temporary alignments and transforms" # Ignore errors below because trans.* might not exist. 
rm $dir/{ali,trans}.{ark,scp} 2>/dev/null diff --git a/egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh b/egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh new file mode 100755 index 00000000000..76793e8fa25 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh @@ -0,0 +1,168 @@ +#!/bin/bash + +# Copyright 2017 Pegah Ghahremani +# 2017-18 Vimal Manohar +# Apache 2.0 + +# This script generates examples for multilingual training of 'chain' +# models using separate input egs dir per language as input. +# This script is similar to steps/nnet3/multilingual/combine_egs.sh, but +# works on 'chain' egs. This is also useful for semi-supervised training, +# where supervised and unsupervised datasets are treated as different +# languages. + +# This scripts produces 3 sets of files -- +# cegs.*.scp, cegs.output.*.ark, cegs.weight.*.ark +# +# cegs.*.scp are the SCP files of the training examples. +# cegs.weight.*.ark map from the key of the example to the language-specific +# weight of that example. +# cegs.output.*.ark map from the key of the example to the name of +# the output-node in the neural net for that specific language, e.g. +# 'output-2'. +# +# Begin configuration section. +cmd=run.pl +block_size=256 # This is the number of consecutive egs that we take from + # each source, and it only affects the locality of disk + # access. +lang2weight= # array of weights one per input languge to scale example's output + # w.r.t its input language during training. +stage=0 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# -lt 3 ]; then + cat < ... + e.g.: $0 [opts] 2 exp/lang1/egs exp/lang2/egs exp/multi/egs + + Options: + --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. + --block-size # it is the number of consecutive egs that we take from + # each source, and it only affects the locality of disk + # access. This does not have to be the actual minibatch size +EOF + exit 1; +fi + +num_langs=$1 + +shift 1 +args=("$@") +megs_dir=${args[-1]} # multilingual directory +mkdir -p $megs_dir +mkdir -p $megs_dir/info +if [ ${#args[@]} != $[$num_langs+1] ]; then + echo "$0: num of input example dirs provided is not compatible with num_langs $num_langs." + echo "Usage:$0 [opts] ... " + echo "Usage:$0 [opts] 2 exp/lang1/egs exp/lang2/egs exp/multi/egs" + exit 1; +fi + +required="cegs.scp combine.scp train_diagnostic.scp valid_diagnostic.scp" +train_scp_list= +train_diagnostic_scp_list= +valid_diagnostic_scp_list= +combine_scp_list= + +# read paramter from $egs_dir[0]/info and cmvn_opts +# to write in multilingual egs_dir. +check_params="info/feat_dim info/ivector_dim info/left_context info/right_context cmvn_opts" +ivec_dim=`cat ${args[0]}/info/ivector_dim` +if [ $ivec_dim -ne 0 ];then check_params="$check_params info/final.ie.id"; fi + +for param in $check_params info/frames_per_eg; do + cat ${args[0]}/$param > $megs_dir/$param || exit 1; +done + +tot_num_archives=0 +for lang in $(seq 0 $[$num_langs-1]);do + multi_egs_dir[$lang]=${args[$lang]} + for f in $required; do + if [ ! -f ${multi_egs_dir[$lang]}/$f ]; then + echo "$0: no such file ${multi_egs_dir[$lang]}/$f." 
&& exit 1; + fi + done + num_archives=$(cat ${multi_egs_dir[$lang]}/info/num_archives) + tot_num_archives=$[tot_num_archives+num_archives] + train_scp_list="$train_scp_list ${args[$lang]}/cegs.scp" + train_diagnostic_scp_list="$train_diagnostic_scp_list ${args[$lang]}/train_diagnostic.scp" + valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${args[$lang]}/valid_diagnostic.scp" + combine_scp_list="$combine_scp_list ${args[$lang]}/combine.scp" + + # check parameter dimension to be the same in all egs dirs + for f in $check_params; do + if [ -f $megs_dir/$f ] && [ -f ${multi_egs_dir[$lang]}/$f ]; then + f1=$(cat $megs_dir/$f) + f2=$(cat ${multi_egs_dir[$lang]}/$f) + if [ "$f1" != "$f2" ] ; then + echo "$0: mismatch for $f in $megs_dir vs. ${multi_egs_dir[$lang]}($f1 vs. $f2)." + exit 1; + fi + else + echo "$0: file $f does not exits in $megs_dir or ${multi_egs_dir[$lang]}/$f ." + fi + done +done + +if [ ! -z "$lang2weight" ]; then + egs_opt="--lang2weight '$lang2weight'" +fi + +if [ $stage -le 0 ]; then + echo "$0: allocating multilingual examples for training." + # Generate cegs.*.scp for multilingual setup. + $cmd $megs_dir/log/allocate_multilingual_examples_train.log \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --num-archives $tot_num_archives \ + --block-size $block_size \ + --egs-prefix "cegs." \ + $train_scp_list $megs_dir || exit 1; +fi + +if [ $stage -le 1 ]; then + echo "$0: combine combine.scp examples from all langs in $megs_dir/combine.scp." + # Generate combine.scp for multilingual setup. + $cmd $megs_dir/log/allocate_multilingual_examples_combine.log \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --num-archives 1 \ + --block-size $block_size \ + --egs-prefix "combine." \ + $combine_scp_list $megs_dir || exit 1; + + echo "$0: combine train_diagnostic.scp examples from all langs in $megs_dir/train_diagnostic.scp." + # Generate train_diagnostic.scp for multilingual setup. + $cmd $megs_dir/log/allocate_multilingual_examples_train_diagnostic.log \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --num-archives 1 \ + --block-size $block_size \ + --egs-prefix "train_diagnostic." \ + $train_diagnostic_scp_list $megs_dir || exit 1; + + + echo "$0: combine valid_diagnostic.scp examples from all langs in $megs_dir/valid_diagnostic.scp." + # Generate valid_diagnostic.scp for multilingual setup. + $cmd $megs_dir/log/allocate_multilingual_examples_valid_diagnostic.log \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --num-archives 1 \ + --block-size $block_size \ + --egs-prefix "valid_diagnostic." \ + $valid_diagnostic_scp_list $megs_dir || exit 1; + +fi +for egs_type in combine train_diagnostic valid_diagnostic; do + mv $megs_dir/${egs_type}.output.1.ark $megs_dir/${egs_type}.output.ark || exit 1; + mv $megs_dir/${egs_type}.weight.1.ark $megs_dir/${egs_type}.weight.ark || exit 1; + mv $megs_dir/${egs_type}.1.scp $megs_dir/${egs_type}.scp || exit 1; +done +mv $megs_dir/info/cegs.num_archives $megs_dir/info/num_archives || exit 1; +mv $megs_dir/info/cegs.num_tasks $megs_dir/info/num_tasks || exit 1; +echo "$0: Finished preparing multilingual training example." 
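To make the intended workflow of the new chain combine_egs.sh concrete, a minimal invocation for a two-"language" semi-supervised setup might look like the following sketch. The directory names and the 1.0/0.3 weights are illustrative only; each input egs dir must already contain cegs.scp, combine.scp, train_diagnostic.scp and valid_diagnostic.scp plus the usual info/ files:

    steps/nnet3/chain/multilingual/combine_egs.sh --cmd run.pl \
      --block-size 256 --lang2weight "1.0,0.3" \
      2 exp/chain/tdnn_sup/egs exp/chain/tdnn_unsup/egs exp/chain/tdnn_semisup/egs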
diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 6a68d9ecb6e..acabf733c94 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -274,7 +274,8 @@ def train(args, run_opts): # Check files chain_lib.check_for_required_files(args.feat_dir, args.tree_dir, - args.lat_dir) + args.lat_dir if args.egs_dir is None + else None) # Copy phones.txt from tree-dir to dir. Later, steps/nnet3/decode.sh will # use it to check compatibility between training and decoding phone-sets. @@ -410,6 +411,15 @@ def train(args, run_opts): logger.info("Copying the properties from {0} to {1}".format(egs_dir, args.dir)) common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) + if not os.path.exists('{0}/valid_diagnostic.cegs'.format(egs_dir)): + if (not os.path.exists('{0}/valid_diagnostic.scp'.format(egs_dir))): + raise Exception('Neither {0}/valid_diagnostic.cegs nor ' + '{0}/valid_diagnostic.scp exist.' + 'This script expects one of them.'.format(egs_dir)) + use_multitask_egs = True + else: + use_multitask_egs = False + if ((args.stage <= -2) and (os.path.exists(args.dir+"/configs/init.config")) and (args.input_model is None)): logger.info('Computing the preconditioning matrix for input features') @@ -417,7 +427,8 @@ def train(args, run_opts): chain_lib.compute_preconditioning_matrix( args.dir, egs_dir, num_archives, run_opts, max_lda_jobs=args.max_lda_jobs, - rand_prune=args.rand_prune) + rand_prune=args.rand_prune, + use_multitask_egs=use_multitask_egs) if (args.stage <= -1): logger.info("Preparing the initial acoustic model.") @@ -526,7 +537,8 @@ def train(args, run_opts): frame_subsampling_factor=args.frame_subsampling_factor, run_opts=run_opts, backstitch_training_scale=args.backstitch_training_scale, - backstitch_training_interval=args.backstitch_training_interval) + backstitch_training_interval=args.backstitch_training_interval, + use_multitask_egs=use_multitask_egs) if args.cleanup: # do a clean up everything but the last 2 models, under certain @@ -561,13 +573,20 @@ def train(args, run_opts): l2_regularize=args.l2_regularize, xent_regularize=args.xent_regularize, run_opts=run_opts, - max_objective_evaluations=args.max_objective_evaluations) + max_objective_evaluations=args.max_objective_evaluations, + use_multitask_egs=use_multitask_egs) else: logger.info("Copying the last-numbered model to final.mdl") common_lib.force_symlink("{0}.mdl".format(num_iters), "{0}/final.mdl".format(args.dir)) + chain_lib.compute_train_cv_probabilities( + dir=args.dir, iter=num_iters, egs_dir=egs_dir, + l2_regularize=l2_regularize, xent_regularize=xent_regularize, + leaky_hmm_coefficient=args.leaky_hmm_coefficient, + run_opts=run_opts, + use_multitask_egs=use_multitask_egs) common_lib.force_symlink("compute_prob_valid.{iter}.log" - "".format(iter=num_iters-1), + "".format(iter=num_iters), "{dir}/log/compute_prob_valid.final.log".format( dir=args.dir)) diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 8c520e0b5e1..37a67b41f94 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -37,7 +37,7 @@ minimize=false echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. -. parse_options.sh || exit 1; +. 
utils/parse_options.sh || exit 1; if [ $# -ne 3 ]; then echo "Usage: $0 [options] " diff --git a/egs/wsj/s5/steps/nnet3/decode_semisup.sh b/egs/wsj/s5/steps/nnet3/decode_semisup.sh new file mode 100755 index 00000000000..b742835f588 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/decode_semisup.sh @@ -0,0 +1,190 @@ +#!/bin/bash + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does decoding with a neural-net. If the neural net was built on +# top of fMLLR transforms from a conventional system, you should provide the +# --transform-dir option. + +# Begin configuration section. +stage=1 +transform_dir= # dir to find fMLLR transforms. +nj=4 # number of decoding jobs. If --transform-dir set, must match that number! +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the + # regular scoring script works. +cmd=run.pl +beam=15.0 +frames_per_chunk=50 +max_active=7000 +min_active=200 +ivector_scale=1.0 +lattice_beam=8.0 # Beam we use in lattice generation. +iter=final +num_threads=1 # if >1, will use gmm-latgen-faster-parallel +scoring_opts= +skip_diagnostics=false +skip_scoring=false +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +online_ivector_dir= +minimize=false +word_determinize=false # If set to true, then output lattice does not retain + # alternate paths a sequence of words (with alternate pronunciations). + # Setting to true is the default in steps/nnet3/decode.sh. + # However, setting this to false + # is useful for generation w of semi-supervised training + # supervision and frame-level confidences. +write_compact=true # If set to false, then writes the lattice in non-compact format, + # retaining the acoustic scores on each arc. This is + # required to be false for LM rescoring undeterminized + # lattices (when --word-determinize is false) + # Useful for semi-supervised training with rescored lattices. +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " + echo "e.g.: steps/nnet3/decode.sh --nj 8 \\" + echo "--online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\" + echo " exp/tri4b/graph_bg data/test_eval92_hires $dir/decode_bg_eval92" + echo "main options (for others, see top of script file)" + echo " --transform-dir # directory of previous decoding" + echo " # where we can find transforms for SAT systems." + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --iter # Iteration of model to decode; default is final." + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + exit 1; +fi + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. +model=$srcdir/$iter.mdl + + +extra_files= +if [ ! 
-z "$online_ivector_dir" ]; then + steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1 + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +fi + +utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1 + +for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj; +cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1; +thread_string= +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + + +## Set up features. +echo "$0: feature type is raw" + +feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -s $transform_dir/num_jobs ] && \ + echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; + nj_orig=$(cat $transform_dir/num_jobs) + + if [ ! -f $transform_dir/raw_trans.1 ]; then + echo "$0: expected $transform_dir/raw_trans.1 to exist (--transform-dir option)" + exit 1; + fi + if [ $nj -ne $nj_orig ]; then + # Copy the transforms into an archive with an index. + for n in $(seq $nj_orig); do cat $transform_dir/raw_trans.$n; done | \ + copy-feats ark:- ark,scp:$dir/raw_trans.ark,$dir/raw_trans.scp || exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/raw_trans.scp ark:- ark:- |" + else + # number of jobs matches with alignment dir. + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |" + fi +elif grep 'transform-feats --utt2spk' $srcdir/log/train.1.log >&/dev/null; then + echo "$0: **WARNING**: you seem to be using a neural net system trained with transforms," + echo " but you are not providing the --transform-dir option in test time." +fi +## + +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +extra_opts= +lat_wspecifier="ark:|" +if ! $write_compact; then + extra_opts="--determinize-lattice=false" + lat_wspecifier="ark:| lattice-determinize-phone-pruned --beam=$lattice_beam --acoustic-scale=$acwt --minimize=$minimize --word-determinize=$word_determinize --write-compact=false $model ark:- ark:- |" +fi + +if [ "$post_decode_acwt" == 1.0 ]; then + lat_wspecifier="$lat_wspecifier gzip -c >$dir/lat.JOB.gz" +else + lat_wspecifier="$lat_wspecifier lattice-scale --acoustic-scale=$post_decode_acwt --write-compact=$write_compact ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. 
for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" +fi + +if [ $stage -le 1 ]; then + $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=$minimize --word-determinize=$word_determinize \ + --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt ${extra_opts} "$model" \ + $graphdir/HCLG.fst "$feats" "$lat_wspecifier" || exit 1; +fi + + +if [ $stage -le 2 ]; then + if ! $skip_diagnostics ; then + [ ! -z $iter ] && iter_opt="--iter $iter" + steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir + fi +fi + + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. +if [ $stage -le 3 ]; then + if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + [ "$iter" != "final" ] && iter_opt="--iter $iter" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; diff --git a/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py b/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py index e5f5f627567..54c65eb5403 100755 --- a/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py +++ b/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py @@ -1,6 +1,7 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 -# Copyright 2017 Pegah Ghahremani +# Copyright 2017 Pegah Ghahremani +# 2018 Hossein Hadian # # Apache 2.0. @@ -15,25 +16,6 @@ the output-node in the neural net for that specific language, e.g. 'output-2'. - This script additionally produces temporary files -- egs.ranges.*.txt, - which are consumed by this script itself. - There is one egs.ranges.*.txt file for each of the egs.*.scp files. - Each line in egs.ranges.*.txt corresponds to ranges of examples - selected from one of the input languages's scp files as: - - - That can be interpreted as selecting examples starting from - line from {lang}_th 'egs' file in "egs_scp_list". - (note that is the zero-based line number.) - - Example lines might look like: - 0 0 256 - 2 1024 256 - - egs.*.scp is generated using egs.ranges.*.txt as following: - "" consecutive examples starting from line "" - from {lang}_th input scp-file is copied to egs.*.scp. - --egs-prefix option can be used to generate train and diagnostics egs files. If --egs-prefix=train_diagnostics. is passed, then the files produced by the script will be named with the prefix as "train_diagnostics." @@ -45,21 +27,17 @@ for validation examples and "combine." for examples used for model combination. + For chain training egs, the --egs-prefix option should be "cegs." 
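For the chain case just mentioned, the wrapper script steps/nnet3/chain/multilingual/combine_egs.sh ends up calling this allocator roughly as in the sketch below; the archive count, block size, weights and paths are illustrative, not prescribed values:

    steps/nnet3/multilingual/allocate_multilingual_examples.py \
      --num-archives 12 --block-size 256 --egs-prefix "cegs." \
      --lang2weight "1.0,0.3" \
      exp/chain/tdnn_sup/egs/cegs.scp exp/chain/tdnn_unsup/egs/cegs.scp \
      exp/chain/tdnn_semisup/egs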
+ You can call this script as (e.g.): allocate_multilingual_examples.py [opts] example-scp-lists multilingual-egs-dir - allocate_multilingual_examples.py --minibatch-size 128 + allocate_multilingual_examples.py --block-size 512 --lang2weight "0.2,0.8" exp/lang1/egs.scp exp/lang2/egs.scp exp/multi/egs - To avoid loading whole scp files from all languages in memory, - input egs.scp files are processed line by line using readline() for input - languages. To have more randomization across different archives, - "num-jobs * num-archives" temporary scp.. files are created - in egs/temp dir and all "num_jobs" scp.*. combined into - egs..scp. """ from __future__ import print_function @@ -68,7 +46,6 @@ import traceback sys.path.insert(0, 'steps') -import libs.common as common_lib logger = logging.getLogger('libs') logger.setLevel(logging.INFO) @@ -95,43 +72,31 @@ def get_args(): 'output-2'.""", epilog="Called by steps/nnet3/multilingual/combine_egs.sh") - parser.add_argument("--samples-per-iter", type=int, default=40000, - help="The target number of egs in each archive of egs, " - "(prior to merging egs). ") - parser.add_argument("--num-jobs", type=int, default=20, - help="This can be used for better randomization in distributing " - "examples for different languages across egs.*.scp files, " - "where egs..*.scp are generated " - "randomly and combined across all jobs in egs.*.scp files.") - parser.add_argument("--random-lang", type=str, action=common_lib.StrToBoolAction, - help="If true, egs.ranges.*.txt are generated " - "randomly w.r.t distribution of remaining examples in " - "each language, otherwise it is generated sequentially.", - default=True, choices = ["false", "true"]) - parser.add_argument("--max-archives", type=int, default=1000, - help="max number of archives used to generate egs.*.scp") - parser.add_argument("--seed", type=int, default=1, - help="Seed for random number generator") - parser.add_argument("--minibatch-size", type=int, default=512, - help="It is the number of consecutive egs that is taken " - "from each input scp source, and it only affects locality " - "of disk access. This does not have to be actual minibatch size.") + parser.add_argument("--num-archives", type=int, default=None, + help="Number of archives to split the data into. (Note: in reality they are not " + "archives, only scp files, but we use this notation by analogy with the " + "conventional egs-creating script).") + parser.add_argument("--block-size", type=int, default=512, + help="This relates to locality of disk access. 'block-size' is" + "the average number of examples that are read consecutively" + "from each input scp file (and are written in the same order to the output scp files)" + "Smaller values lead to more random disk access (during " + "the nnet3 training process).") parser.add_argument("--egs-prefix", type=str, default="egs.", - help="option can be used to generated example scp, weight " - "and output files for training and diagnostics." - "If --egs-prefix=combine. , then files produced " - "by the sript will be named with this prefix as " - "combine.output.*.ark, combine.weight.*.ark, combine.*.scp, " - "combine.ranges.*.ark.") + help="This option can be used to add a prefix to the filenames " + "of the output files. For e.g. " + "if --egs-prefix=combine. , then the files produced " + "by this script will be " + "combine.output.*.ark, combine.weight.*.ark, and combine.*.scp") parser.add_argument("--lang2weight", type=str, - help="comma-separated list of weights, one per language." 
+ help="Comma-separated list of weights, one per language. " "The language order is as egs_scp_lists.") # now the positional arguments parser.add_argument("egs_scp_lists", nargs='+', - help="list of egs.scp files per input language." + help="List of egs.scp files per input language." "e.g. exp/lang1/egs/egs.scp exp/lang2/egs/egs.scp") parser.add_argument("egs_dir", - help="Name of egs directory e.g. exp/tdnn_multilingual_sp/egs") + help="Name of output egs directory e.g. exp/tdnn_multilingual_sp/egs") print(sys.argv, file=sys.stderr) @@ -140,169 +105,119 @@ def get_args(): return args -def select_random_lang(lang_len, tot_egs, random_selection): - """ Returns a random language index w.r.t - amount of examples in each language. - It works based on sampling from a - discrete distribution, where it returns i - with prob(i) = (num_egs in lang(i)/ tot_egs). - tot_egs is sum of lang_len. - """ - assert(tot_egs > 0) - rand_int = random.randint(0, tot_egs - 1) - count = 0 - for l in range(len(lang_len)): - if random_selection: - if rand_int <= (count + lang_len[l]): - return l - else: - count += lang_len[l] - else: - if (lang_len[l] > 0): - return l - return -1 +def read_lines(file_handle, num_lines): + n_read = 0 + lines = [] + while n_read < num_lines: + line = file_handle.readline() + if not line: + break + lines.append(line.strip()) + n_read += 1 + return lines def process_multilingual_egs(args): args = get_args() - random.seed(args.seed) - rand_select = args.random_lang - # read egs.scp for input languages scp_lists = args.egs_scp_lists num_langs = len(scp_lists) - scp_files = [open(scp_lists[lang], 'r') for lang in range(num_langs)] - - lang2len = [0] * num_langs + lang_to_num_examples = [0] * num_langs for lang in range(num_langs): - lang2len[lang] = sum(1 for line in open(scp_lists[lang])) + with open(scp_lists[lang]) as fh: + lang_to_num_examples[lang] = sum([1 for line in fh]) logger.info("Number of examples for language {0} " - "is {1}.".format(lang, lang2len[lang])) + "is {1}.".format(lang, lang_to_num_examples[lang])) # If weights are not provided, the weights are 1.0. if args.lang2weight is None: - lang2weight = [ 1.0 ] * num_langs + lang2weight = [1.0] * num_langs else: lang2weight = args.lang2weight.split(",") assert(len(lang2weight) == num_langs) - if not os.path.exists("{0}/temp".format(args.egs_dir)): - os.makedirs("{0}/temp".format(args.egs_dir)) - num_lang_file = open("{0}/info/{1}num_tasks".format(args.egs_dir, args.egs_prefix), "w") - print("{0}".format(num_langs), file=num_lang_file) - - # Each element of all_egs (one per num_archive * num_jobs) is - # an array of 3-tuples (lang-id, local-start-egs-line, num-egs) - all_egs = [] - lang_len = lang2len[:] - # total num of egs in all languages - tot_num_egs = sum(lang2len[i] for i in range(len(lang2len))) - num_archives = max(1, min(args.max_archives, tot_num_egs // args.samples_per_iter)) - - num_arch_file = open("{0}/info/{1}num_archives".format( - args.egs_dir, - args.egs_prefix), - "w") - print("{0}".format(num_archives), file=num_arch_file) - num_arch_file.close() - this_num_egs_per_archive = tot_num_egs // (num_archives * args.num_jobs) - - logger.info("Generating {0}scp.. 
temporary files used to " - "generate {0}.scp.".format(args.egs_prefix)) - for job in range(args.num_jobs): - for archive_index in range(num_archives): - archfile = open("{0}/temp/{1}scp.{2}.{3}" - "".format(args.egs_dir, args.egs_prefix, - job + 1, archive_index + 1), - "w") - this_egs = [] # this will be array of 2-tuples (lang-id start-frame num-frames) - - num_egs = 0 - while num_egs <= this_num_egs_per_archive: - num_left_egs = sum(num_left_egs_per_lang for - num_left_egs_per_lang in lang_len) - if num_left_egs > 0: - lang_id = select_random_lang(lang_len, num_left_egs, rand_select) - start_egs = lang2len[lang_id] - lang_len[lang_id] - this_egs.append((lang_id, start_egs, args.minibatch_size)) - for scpline in range(args.minibatch_size): - scp_key = scp_files[lang_id].readline().splitlines()[0] - print("{0} {1}".format(scp_key, lang_id), - file=archfile) - - lang_len[lang_id] = lang_len[lang_id] - args.minibatch_size - num_egs = num_egs + args.minibatch_size - # If num of remaining egs in each lang is less than minibatch_size, - # they are discarded. - if lang_len[lang_id] < args.minibatch_size: - lang_len[lang_id] = 0 - logger.info("Done processing data for language {0}".format( - lang_id)) - else: - logger.info("Done processing data for all languages.") - break - all_egs.append(this_egs) - archfile.close() - - logger.info("combining egs..*.scp across all jobs into egs.*.scp file.") - for archive in range(num_archives): - logger.info("Combine {0}job.{1}.scp across all jobs into " - "{0}{1}.scp.".format(args.egs_prefix, archive)) - this_ranges = [] - f = open("{0}/temp/{1}ranges.{2}.txt".format( - args.egs_dir, args.egs_prefix, archive + 1), - 'w') - o = open("{0}/{1}output.{2}.ark".format( - args.egs_dir, args.egs_prefix, archive + 1), - 'w') - w = open("{0}/{1}weight.{2}.ark".format( - args.egs_dir, args.egs_prefix, archive + 1), - 'w') - scp_per_archive_file = open("{0}/{1}{2}.scp" - "".format(args.egs_dir, - args.egs_prefix, archive + 1), - 'w') - - # check files before writing. 
- if f is None: - raise Exception("Error opening file {0}".format(f)) - if o is None: - raise Exception("Error opening file {0}".format(o)) - if w is None: - raise Exception("Error opening file {0}".format(w)) - if scp_per_archive_file is None: - raise Exception("Error opening file {0}".format(scp_per_archive_file)) - - for job in range(args.num_jobs): - scp = ("{0}/temp/{1}scp.{2}.{3}".format(args.egs_dir, args.egs_prefix, - job + 1, archive + 1)) - with open(scp, "r") as scpfile: - for line in scpfile: - scp_line = line.splitlines()[0].split() - print("{0} {1}".format(scp_line[0], scp_line[1]), - file=scp_per_archive_file) - print("{0} output-{1}".format(scp_line[0], scp_line[2]), - file=o) - print("{0} {1}".format( - scp_line[0], - lang2weight[int(scp_line[2])]), - file=w) - os.remove(scp) - - for (lang_id, start_eg_line, num_egs) in all_egs[num_archives * job + archive]: - this_ranges.append((lang_id, start_eg_line, num_egs)) - - # write egs.ranges.*.txt - for (lang_id, start_eg_line, num_egs) in this_ranges: - print("{0} {1} {2}".format(lang_id, start_eg_line, num_egs), file=f) - - f.close() - o.close() - w.close() - scp_per_archive_file.close() - logger.info("finished generating {0}*.scp, {0}output.*.ark " - "and {0}weight.*.ark files.".format(args.egs_prefix)) + if not os.path.exists(os.path.join(args.egs_dir, 'info')): + os.makedirs(os.path.join(args.egs_dir, 'info')) + + with open("{0}/info/{1}num_tasks".format(args.egs_dir, args.egs_prefix), "w") as fh: + print("{0}".format(num_langs), file=fh) + + # Total number of egs in all languages + tot_num_egs = sum(lang_to_num_examples[i] for i in range(num_langs)) + num_archives = args.num_archives + + with open("{0}/info/{1}num_archives".format(args.egs_dir, args.egs_prefix), "w") as fh: + print("{0}".format(num_archives), file=fh) + + logger.info("There are a total of {} examples in the input scp " + "files.".format(tot_num_egs)) + logger.info("Number of blocks in each output archive will be approximately " + "{}, and block-size is {}.".format(int(round(tot_num_egs / num_archives / args.block_size)), + args.block_size)) + for lang in range(num_langs): + blocks_per_archive_this_lang = lang_to_num_examples[lang] / num_archives / args.block_size + warning = "" + if blocks_per_archive_this_lang < 1.0: + warning = ("Warning: This means some of the output archives might " + "not include any examples from this lang.") + logger.info("The proportion of egs from lang {} is {:.2f}. The number of blocks " + "per archive for this lang is approximately {:.2f}. " + "{}".format(lang, lang_to_num_examples[lang] / tot_num_egs, + blocks_per_archive_this_lang, + warning)) + + in_scp_file_handles = [open(scp_lists[lang], 'r') for lang in range(num_langs)] + + num_remaining_egs = tot_num_egs + lang_to_num_remaining_egs = [n for n in lang_to_num_examples] + for archive_index in range(num_archives + 1): # +1 is because we write to the last archive in two rounds + num_remaining_archives = num_archives - archive_index + num_remaining_blocks = num_remaining_egs / args.block_size + + last_round = (archive_index == num_archives) + if not last_round: + num_blocks_this_archive = int(round(num_remaining_blocks / num_remaining_archives)) + logger.info("Generating archive {} containing {} blocks...".format(archive_index, num_blocks_this_archive)) + else: # This is the second round for the last archive. Flush all the remaining egs... 
+ archive_index = num_archives - 1 + num_blocks_this_archive = num_langs + logger.info("Writing all the {} remaining egs to the last archive...".format(num_remaining_egs)) + + out_scp_file_handle = open('{0}/{1}{2}.scp'.format(args.egs_dir, args.egs_prefix, archive_index + 1), + 'a' if last_round else 'w') + eg_to_output_file_handle = open("{0}/{1}output.{2}.ark".format(args.egs_dir, args.egs_prefix, archive_index + 1), + 'a' if last_round else 'w') + eg_to_weight_file_handle = open("{0}/{1}weight.{2}.ark".format(args.egs_dir, args.egs_prefix, archive_index + 1), + 'a' if last_round else 'w') + + + for block_index in range(num_blocks_this_archive): + # Find the lang with the highest proportion of remaining examples + remaining_proportions = [remain / tot for remain, tot in zip(lang_to_num_remaining_egs, lang_to_num_examples)] + lang_index, max_proportion = max(enumerate(remaining_proportions), key=lambda a: a[1]) + + # Read 'block_size' examples from the selected lang and write them to the current output scp file: + example_lines = read_lines(in_scp_file_handles[lang_index], args.block_size) + for eg_line in example_lines: + eg_id = eg_line.split()[0] + print(eg_line, file=out_scp_file_handle) + print("{0} output-{1}".format(eg_id, lang_index), file=eg_to_output_file_handle) + print("{0} {1}".format(eg_id, lang2weight[lang_index]), file=eg_to_weight_file_handle) + + num_remaining_egs -= len(example_lines) + lang_to_num_remaining_egs[lang_index] -= len(example_lines) + + out_scp_file_handle.close() + eg_to_output_file_handle.close() + eg_to_weight_file_handle.close() + + for handle in in_scp_file_handles: + handle.close() + logger.info("Finished generating {0}*.scp, {0}output.*.ark " + "and {0}weight.*.ark files. Wrote a total of {1} examples " + "to {2} archives.".format(args.egs_prefix, + tot_num_egs - num_remaining_egs, num_archives)) def main(): @@ -315,4 +230,4 @@ def main(): if __name__ == "__main__": - main() + main() diff --git a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh index 3826dad11a9..e1aeb0b70d6 100755 --- a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh +++ b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh @@ -1,5 +1,10 @@ #!/bin/bash -# + +# Copyright 2017 Pegah Ghahremani +# 2017-18 Vimal Manohar +# 2018 Hossein Hadian +# Apache 2.0 + # This script generates examples for multilingual training of neural network # using separate input egs dir per language as input. # This scripts produces 3 sets of files -- @@ -14,16 +19,9 @@ # # Begin configuration section. cmd=run.pl -minibatch_size=512 # it is the number of consecutive egs that we take from - # each source, and it only affects the locality of disk - # access. This does not have to be the actual minibatch size; -num_jobs=10 # helps for better randomness across languages - # per archive. -samples_per_iter=400000 # this is the target number of egs in each archive of egs - # (prior to merging egs). We probably should have called - # it egs_per_iter. This is just a guideline; it will pick - # a number that divides the number of samples in the - # entire data. +block_size=256 # This is the number of consecutive egs that we take from + # each source, and it only affects the locality of disk + # access. lang2weight= # array of weights one per input languge to scale example's output # w.r.t its input language during training. stage=0 @@ -33,6 +31,24 @@ echo "$0 $@" # Print the command line for logging if [ -f path.sh ]; then . ./path.sh; fi . 
parse_options.sh || exit 1; +if [ $# -lt 3 ]; then + cat < ... + e.g.: $0 [opts] 2 exp/lang1/egs exp/lang2/egs exp/multi/egs + + Options: + --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. + --block-size # it is the number of consecutive egs that we take from + # each source, and it only affects the locality of disk + # access. This does not have to be the actual minibatch size +EOF + exit 1; +fi + num_langs=$1 shift 1 @@ -55,14 +71,15 @@ combine_scp_list= # read paramter from $egs_dir[0]/info and cmvn_opts # to write in multilingual egs_dir. -check_params="info/feat_dim info/ivector_dim info/left_context info/right_context info/frames_per_eg cmvn_opts" +check_params="info/feat_dim info/ivector_dim info/left_context info/right_context cmvn_opts" ivec_dim=`cat ${args[0]}/info/ivector_dim` if [ $ivec_dim -ne 0 ];then check_params="$check_params info/final.ie.id"; fi -for param in $check_params; do - cat ${args[0]}/$param > $megs_dir/$param || exit 1; +for param in $check_params info/frames_per_eg; do + cat ${args[0]}/$param > $megs_dir/$param || exit 1; done +tot_num_archives=0 for lang in $(seq 0 $[$num_langs-1]);do multi_egs_dir[$lang]=${args[$lang]} for f in $required; do @@ -70,6 +87,8 @@ for lang in $(seq 0 $[$num_langs-1]);do echo "$0: no such file ${multi_egs_dir[$lang]}/$f." && exit 1; fi done + num_archives=$(cat ${multi_egs_dir[$lang]}/info/num_archives) + tot_num_archives=$[tot_num_archives+num_archives] train_scp_list="$train_scp_list ${args[$lang]}/egs.scp" train_diagnostic_scp_list="$train_diagnostic_scp_list ${args[$lang]}/train_diagnostic.scp" valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${args[$lang]}/valid_diagnostic.scp" @@ -90,16 +109,17 @@ for lang in $(seq 0 $[$num_langs-1]);do done done +if [ ! -z "$lang2weight" ]; then + egs_opt="--lang2weight '$lang2weight'" +fi + if [ $stage -le 0 ]; then echo "$0: allocating multilingual examples for training." - if [ ! -z "$lang2weight" ]; then - egs_opt="--lang2weight '$lang2weight'" - fi # Generate egs.*.scp for multilingual setup. $cmd $megs_dir/log/allocate_multilingual_examples_train.log \ - steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ - --minibatch-size $minibatch_size \ - --samples-per-iter $samples_per_iter \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --num-archives $tot_num_archives \ + --block-size $block_size \ $train_scp_list $megs_dir || exit 1; fi @@ -107,20 +127,18 @@ if [ $stage -le 1 ]; then echo "$0: combine combine.scp examples from all langs in $megs_dir/combine.scp." # Generate combine.scp for multilingual setup. $cmd $megs_dir/log/allocate_multilingual_examples_combine.log \ - steps/nnet3/multilingual/allocate_multilingual_examples.py \ - --random-lang false \ - --max-archives 1 --num-jobs 1 \ - --minibatch-size $minibatch_size \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --num-archives 1 \ + --block-size $block_size \ --egs-prefix "combine." \ $combine_scp_list $megs_dir || exit 1; echo "$0: combine train_diagnostic.scp examples from all langs in $megs_dir/train_diagnostic.scp." # Generate train_diagnostic.scp for multilingual setup. 
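For reference, the updated non-chain wrapper is invoked the same way as before, with --block-size replacing the old --minibatch-size, --samples-per-iter and --num-jobs options. A minimal sketch with hypothetical egs directories:

    steps/nnet3/multilingual/combine_egs.sh --cmd run.pl \
      --block-size 256 --lang2weight "0.2,0.8" \
      2 exp/lang1/nnet3/egs exp/lang2/nnet3/egs exp/multi/nnet3/egs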
$cmd $megs_dir/log/allocate_multilingual_examples_train_diagnostic.log \ - steps/nnet3/multilingual/allocate_multilingual_examples.py \ - --random-lang false \ - --max-archives 1 --num-jobs 1 \ - --minibatch-size $minibatch_size \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --num-archives 1 \ + --block-size $block_size \ --egs-prefix "train_diagnostic." \ $train_diagnostic_scp_list $megs_dir || exit 1; @@ -128,9 +146,9 @@ if [ $stage -le 1 ]; then echo "$0: combine valid_diagnostic.scp examples from all langs in $megs_dir/valid_diagnostic.scp." # Generate valid_diagnostic.scp for multilingual setup. $cmd $megs_dir/log/allocate_multilingual_examples_valid_diagnostic.log \ - steps/nnet3/multilingual/allocate_multilingual_examples.py \ - --random-lang false --max-archives 1 --num-jobs 1\ - --minibatch-size $minibatch_size \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --num-archives 1 \ + --block-size $block_size \ --egs-prefix "valid_diagnostic." \ $valid_diagnostic_scp_list $megs_dir || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py index 0e336cdbc11..93cbc940c33 100755 --- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -259,7 +259,7 @@ def plot_a_nonlin_component(fig, dirs, stat_tables_per_component_per_dir, continue data = np.array(iter_stats) - data = data[data[:, 0] >= start_iter, :] + data = data[data[:, 0] >= start_iter, :] ax = plt.subplot(211) lp, = ax.plot(data[:, 0], data[:, gate_index*10+5], color=color_val, @@ -345,7 +345,7 @@ def plot_a_nonlin_component(fig, dirs, stat_tables_per_component_per_dir, bbox_to_anchor=(0.5 , -1.5 + len(dirs) * -0.2), ncol=4, handletextpad = -2, title="[1]:{0}".format(common_prefix), borderaxespad=0.) - plt.grid(True) + plt.grid(True) return lgd @@ -826,6 +826,7 @@ def main(): output_nodes.append(tuple(parts)) elif args.is_chain: output_nodes.append(('output', 'chain')) + output_nodes.append(('output-xent', 'chain')) elif args.is_rnnlm: output_nodes.append(('output', 'rnnlm_objective')) else: diff --git a/egs/wsj/s5/steps/subset_ali_dir.sh b/egs/wsj/s5/steps/subset_ali_dir.sh new file mode 100755 index 00000000000..537d91c1248 --- /dev/null +++ b/egs/wsj/s5/steps/subset_ali_dir.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0. + +cmd=run.pl + +if [ -f ./path.sh ]; then . ./path.sh; fi + +. ./utils/parse_options.sh + +if [ $# -ne 4 ]; then + cat < from the + original alignment directory containing alignments for utterances in + . + + The number of split jobs in the output alignment directory is + equal to the number of jobs in the original alignment directory, + unless the subset data directory has too few speakers. + + Usage: $0 [options] + e.g.: $0 data/train_sp data/train exp/tri3_ali_sp exp/tri3_ali + + Options: + --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. 
+EOF + exit 1 +fi + +data=$1 +subset_data=$2 +ali_dir=$3 +dir=$4 + +nj=$(cat $ali_dir/num_jobs) || exit 1 +utils/split_data.sh $data $nj + +mkdir -p $dir +cp $ali_dir/{final.mdl,*.mat,*_opts,tree} $dir/ || true +cp -r $ali_dir/phones $dir 2>/dev/null || true + +$cmd JOB=1:$nj $dir/log/copy_alignments.JOB.log \ + copy-int-vector "ark:gunzip -c $ali_dir/ali.JOB.gz |" \ + ark,scp:$dir/ali_tmp.JOB.ark,$dir/ali_tmp.JOB.scp || exit 1 + +for n in `seq $nj`; do + cat $dir/ali_tmp.$n.scp +done > $dir/ali_tmp.scp + +num_spk=$(cat $subset_data/spk2utt | wc -l) +if [ $num_spk -lt $nj ]; then + nj=$num_spk +fi + +utils/split_data.sh $subset_data $nj +$cmd JOB=1:$nj $dir/log/filter_alignments.JOB.log \ + copy-int-vector \ + "scp:utils/filter_scp.pl $subset_data/split${nj}/JOB/utt2spk $dir/ali_tmp.scp |" \ + "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 + +echo $nj > $dir/num_jobs + +rm $dir/ali_tmp.*.{ark,scp} $dir/ali_tmp.scp + +exit 0 diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 579e3d7b3e0..30aff50170b 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -74,7 +74,9 @@ void ProtoSupervision::Write(std::ostream &os, bool binary) const { void SupervisionOptions::Check() const { KALDI_ASSERT(left_tolerance >= 0 && right_tolerance >= 0 && frame_subsampling_factor > 0 && - left_tolerance + right_tolerance >= frame_subsampling_factor); + left_tolerance + right_tolerance + 1 >= frame_subsampling_factor); + + KALDI_ASSERT(lm_scale >= 0.0 && lm_scale < 1.0); } bool AlignmentToProtoSupervision(const SupervisionOptions &opts, @@ -142,9 +144,10 @@ bool ProtoSupervision::operator == (const ProtoSupervision &other) const { fst::Equal(fst, other.fst)); } -bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, - const CompactLattice &lat, - ProtoSupervision *proto_supervision) { +bool PhoneLatticeToProtoSupervisionInternal( + const SupervisionOptions &opts, + const CompactLattice &lat, + ProtoSupervision *proto_supervision) { opts.Check(); if (lat.NumStates() == 0) { KALDI_WARN << "Empty lattice provided"; @@ -176,20 +179,24 @@ bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, return false; } proto_supervision->fst.AddArc(state, - fst::StdArc(phone, phone, - fst::TropicalWeight::One(), - lat_arc.nextstate)); + fst::StdArc(phone, phone, + fst::TropicalWeight( + lat_arc.weight.Weight().Value1() + * opts.lm_scale), + lat_arc.nextstate)); + int32 t_begin = std::max(0, (state_time - opts.left_tolerance)), t_end = std::min(num_frames, (next_state_time + opts.right_tolerance)), - t_begin_subsampled = (t_begin + factor - 1)/ factor, - t_end_subsampled = (t_end + factor - 1)/ factor; + t_begin_subsampled = (t_begin + factor - 1)/ factor, + t_end_subsampled = (t_end + factor - 1)/ factor; for (int32 t_subsampled = t_begin_subsampled; t_subsampled < t_end_subsampled; t_subsampled++) proto_supervision->allowed_phones[t_subsampled].push_back(phone); } if (lat.Final(state) != CompactLatticeWeight::Zero()) { - proto_supervision->fst.SetFinal(state, fst::TropicalWeight::One()); + proto_supervision->fst.SetFinal(state, fst::TropicalWeight( + lat.Final(state).Weight().Value1() * opts.lm_scale)); if (state_times[state] != num_frames) { KALDI_WARN << "Time of final state " << state << " in lattice is " << "not equal to number of frames " << num_frames @@ -207,6 +214,18 @@ bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, return true; } +bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, + const 
CompactLattice &lat, + ProtoSupervision *proto_supervision) { + + if (!PhoneLatticeToProtoSupervisionInternal(opts, lat, proto_supervision)) + return false; + if (opts.lm_scale != 0.0) + fst::Push(&(proto_supervision->fst), + fst::REWEIGHT_TO_INITIAL, fst::kDelta, true); + + return true; +} bool TimeEnforcerFst::GetArc(StateId s, Label ilabel, fst::StdArc* oarc) { // the following call will do the range-check on 'ilabel'. @@ -787,8 +806,10 @@ bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, fst::StdVectorFst supervision_fst_noeps(supervision->fst); fst::RmEpsilon(&supervision_fst_noeps); if (!TryDeterminizeMinimize(kSupervisionMaxStates, - &supervision_fst_noeps)) + &supervision_fst_noeps)) { + KALDI_WARN << "Failed to determinize supervision fst"; return false; + } // note: by default, 'Compose' will call 'Connect', so if the // resulting FST is not connected, it will end up empty. @@ -801,8 +822,10 @@ bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, // determinize and minimize to make it as compact as possible. if (!TryDeterminizeMinimize(kSupervisionMaxStates, - &composed_fst)) + &composed_fst)) { + KALDI_WARN << "Failed to determinize normalized supervision fst"; return false; + } supervision->fst = composed_fst; // Make sure the states are numbered in increasing order of time. diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index 13866e2aba6..e52602e1c12 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -50,10 +50,14 @@ struct SupervisionOptions { int32 left_tolerance; int32 right_tolerance; int32 frame_subsampling_factor; + BaseFloat weight; + BaseFloat lm_scale; SupervisionOptions(): left_tolerance(5), right_tolerance(5), - frame_subsampling_factor(1) { } + frame_subsampling_factor(1), + weight(1.0), + lm_scale(0.0) { } void Register(OptionsItf *opts) { opts->Register("left-tolerance", &left_tolerance, "Left tolerance for " @@ -65,6 +69,13 @@ struct SupervisionOptions { "frame-rate of the original alignment. Applied after " "left-tolerance and right-tolerance are applied (so they are " "in terms of the original num-frames."); + opts->Register("weight", &weight, + "Use this to set the supervision weight for training. " + "This can be used to assign different weights to " + "different data sources."); + opts->Register("lm-scale", &lm_scale, "The scale with which the graph/lm " + "weights from the phone lattice are included in the " + "supervision fst."); } void Check() const; }; diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index ca0428553c1..a3222d2285f 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -54,17 +54,16 @@ double ComputeObjf(bool batchnorm_test_mode, bool dropout_test_mode, end = egs.end(); for (; iter != end; ++iter) prob_computer->Compute(*iter); - const ChainObjectiveInfo *objf_info = - prob_computer->GetObjective("output"); - if (objf_info == NULL) - KALDI_ERR << "Error getting objective info (unsuitable egs?)"; - KALDI_ASSERT(objf_info->tot_weight > 0.0); + + double tot_weight = 0.0; + double tot_objf = prob_computer->GetTotalObjective(&tot_weight); + + KALDI_ASSERT(tot_weight > 0.0); // inf/nan tot_objf->return -inf objective. - double tot_objf = objf_info->tot_like + objf_info->tot_l2_term; if (!(tot_objf == tot_objf && tot_objf - tot_objf == 0)) return -std::numeric_limits::infinity(); // we prefer to deal with normalized objective functions. 
- return tot_objf / objf_info->tot_weight; + return tot_objf / tot_weight; } } diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc index 4f26e145ac5..0117fe2200f 100644 --- a/src/chainbin/nnet3-chain-copy-egs.cc +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -1,8 +1,9 @@ // chainbin/nnet3-chain-copy-egs.cc // Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) -// 2014 Vimal Manohar +// 2014-2017 Vimal Manohar // 2016 Gaofeng Cheng +// 2017 Pegah Ghahremani // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -26,6 +27,40 @@ namespace kaldi { namespace nnet3 { +// renames outputs named "output" to new_name +void RenameOutputs(const std::string &new_name, NnetChainExample *eg) { + bool found_output = false; + for (std::vector::iterator it = eg->outputs.begin(); + it != eg->outputs.end(); ++it) { + if (it->name == "output") { + it->name = new_name; + found_output = true; + } + } + + if (!found_output) + KALDI_ERR << "No supervision with name 'output'" + << "exists in eg."; +} + +// scales the supervision for 'output' by a factor of "weight" +void ScaleSupervisionWeight(BaseFloat weight, NnetChainExample *eg) { + if (weight == 1.0) return; + + bool found_output = false; + for (std::vector::iterator it = eg->outputs.begin(); + it != eg->outputs.end(); ++it) { + if (it->name == "output") { + it->supervision.weight *= weight; + found_output = true; + } + } + + if (!found_output) + KALDI_ERR << "No supervision with name 'output'" + << "exists in eg."; +} + // returns an integer randomly drawn with expected value "expected_count" // (will be either floor(expected_count) or ceil(expected_count)). int32 GetCount(double expected_count) { @@ -37,40 +72,31 @@ int32 GetCount(double expected_count) { return ans; } -void FilterExample(const NnetChainExample &eg, - int32 min_input_t, +/** + This function filters the indexes (and associated feature rows) in a + NnetExample, removing any index/row in an NnetIo named "input" with t < + min_input_t or t > max_input_t and any index/row in an NnetIo named "output" with t < + min_output_t or t > max_output_t. + Will crash if filtering removes all Indexes of "input" or "output". + */ +void FilterExample(int32 min_input_t, int32 max_input_t, int32 min_output_t, int32 max_output_t, - NnetChainExample *eg_out) { - eg_out->inputs.clear(); - eg_out->inputs.resize(eg.inputs.size()); - eg_out->outputs.clear(); - eg_out->outputs.resize(eg.outputs.size()); + NnetChainExample *eg) { // process the inputs - for (size_t i = 0; i < eg.inputs.size(); i++) { - bool is_input; + for (size_t i = 0; i < eg->inputs.size(); i++) { int32 min_t, max_t; - const NnetIo &io_in = eg.inputs[i]; - NnetIo &io_out = eg_out->inputs[i]; - const std::string &name = io_in.name; - io_out.name = name; - if (name == "input") { + NnetIo &io = eg->inputs[i]; + if (io.name == "input") { min_t = min_input_t; max_t = max_input_t; - is_input = true; - } else { - is_input = false; - } - if (!is_input) { // Just copy everything. 
- io_out.indexes = io_in.indexes; - io_out.features = io_in.features; - } else { - const std::vector &indexes_in = io_in.indexes; - std::vector &indexes_out = io_out.indexes; + + const std::vector &indexes_in = io.indexes; + std::vector indexes_out; indexes_out.reserve(indexes_in.size()); int32 num_indexes = indexes_in.size(), num_kept = 0; - KALDI_ASSERT(io_in.features.NumRows() == num_indexes); + KALDI_ASSERT(io.features.NumRows() == num_indexes); std::vector keep(num_indexes, false); std::vector::const_iterator iter_in = indexes_in.begin(), end_in = indexes_in.end(); @@ -86,27 +112,26 @@ void FilterExample(const NnetChainExample &eg, } KALDI_ASSERT(iter_out == keep.end()); if (num_kept == 0) - KALDI_ERR << "FilterExample removed all indexes for '" << name << "'"; + KALDI_ERR << "FilterExample removed all indexes for '" << io.name << "'"; + io.indexes = indexes_out; - FilterGeneralMatrixRows(io_in.features, keep, - &io_out.features); - KALDI_ASSERT(io_out.features.NumRows() == num_kept && + GeneralMatrix features_out; + FilterGeneralMatrixRows(io.features, keep, &features_out); + io.features = features_out; + KALDI_ASSERT(io.features.NumRows() == num_kept && indexes_out.size() == static_cast(num_kept)); } } - // process the outputs, we will copy all supervision - // output as default - for (size_t i = 0; i < eg.outputs.size(); i++) { - const NnetChainSupervision &io_in = eg.outputs[i]; - NnetChainSupervision &io_out = eg_out->outputs[i]; - const std::string &name = io_in.name; - io_out.name = name; - io_out.indexes = io_in.indexes; - io_out.supervision = io_in.supervision; - io_out.deriv_weights = io_in.deriv_weights; - } } + +/** Returns true if the "eg" contains just a single example, meaning + that all the "n" values in the indexes are zero, and the example + has NnetIo members named both "input" and "output" + + Also computes the minimum and maximum "t" values in the "input" and + "output" NnetIo members. 
+ */ bool ContainsSingleExample(const NnetChainExample &eg, int32 *min_input_t, int32 *max_input_t, @@ -196,15 +221,14 @@ void CalculateFrameSubsamplingFactor(const NnetChainExample &eg, - eg.outputs[0].indexes[0].t; } -void ModifyChainExampleContext(const NnetChainExample &eg, - int32 left_context, +void ModifyChainExampleContext(int32 left_context, int32 right_context, const int32 frame_subsampling_factor, - NnetChainExample *eg_out) { + NnetChainExample *eg) { static bool warned_left = false, warned_right = false; int32 min_input_t, max_input_t, min_output_t, max_output_t; - if (!ContainsSingleExample(eg, &min_input_t, &max_input_t, + if (!ContainsSingleExample(*eg, &min_input_t, &max_input_t, &min_output_t, &max_output_t)) KALDI_ERR << "Too late to perform frame selection/context reduction on " << "these examples (already merged?)"; @@ -235,11 +259,11 @@ void ModifyChainExampleContext(const NnetChainExample &eg, max_input_t = std::min(max_input_t, max_output_t + right_context); } } - FilterExample(eg, - min_input_t, max_input_t, + FilterExample(min_input_t, max_input_t, min_output_t, max_output_t, - eg_out); + eg); } // ModifyChainExampleContext + } // namespace nnet3 } // namespace kaldi @@ -268,6 +292,8 @@ int main(int argc, char *argv[]) { int32 frame_subsampling_factor = -1; BaseFloat keep_proportion = 1.0; int32 left_context = -1, right_context = -1; + std::string eg_weight_rspecifier, eg_output_name_rspecifier; + ParseOptions po(usage); po.Register("random", &random, "If true, will write frames to output " "archives randomly, not round-robin."); @@ -285,6 +311,15 @@ int main(int argc, char *argv[]) { "feature left-context that we output."); po.Register("right-context", &right_context, "Can be used to truncate the " "feature right-context that we output."); + po.Register("weights", &eg_weight_rspecifier, + "Rspecifier indexed by the key of egs, providing a weight by " + "which we will scale the supervision matrix for that eg. " + "Used in multilingual training."); + po.Register("outputs", &eg_output_name_rspecifier, + "Rspecifier indexed by the key of egs, providing a string-valued " + "output name, e.g. 'output-0'. If provided, the NnetIo with " + "name 'output' will be renamed to the provided name. Used in " + "multilingual training."); po.Read(argc, argv); srand(srand_seed); @@ -298,6 +333,11 @@ int main(int argc, char *argv[]) { SequentialNnetChainExampleReader example_reader(examples_rspecifier); + // In the normal case, these would not be used. These are only applicable + // for multi-task or multilingual training. + RandomAccessTokenReader output_name_reader(eg_output_name_rspecifier); + RandomAccessBaseFloatReader egs_weight_reader(eg_weight_rspecifier); + int32 num_outputs = po.NumArgs() - 1; std::vector example_writers(num_outputs); for (int32 i = 0; i < num_outputs; i++) @@ -307,38 +347,47 @@ int main(int argc, char *argv[]) { // not configurable for now. exclude_names.push_back(std::string("ivector")); - int64 num_read = 0, num_written = 0; - + int64 num_read = 0, num_written = 0, num_err = 0; for (; !example_reader.Done(); example_reader.Next(), num_read++) { + const std::string &key = example_reader.Key(); + NnetChainExample &eg = example_reader.Value(); if (frame_subsampling_factor == -1) - CalculateFrameSubsamplingFactor(example_reader.Value(), + CalculateFrameSubsamplingFactor(eg, &frame_subsampling_factor); // count is normally 1; could be 0, or possibly >1. 
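The new --outputs and --weights options are how the multilingual and semi-supervised recipes consume the cegs.output.*.ark and cegs.weight.*.ark files written by the combine scripts above: each is an rspecifier indexed by the example key, and for every copied example the supervision named 'output' is renamed to the given output name and its weight is scaled. A minimal sketch, with hypothetical archive paths:

    nnet3-chain-copy-egs \
      --outputs=ark:exp/multi/egs/cegs.output.1.ark \
      --weights=ark:exp/multi/egs/cegs.weight.1.ark \
      scp:exp/multi/egs/cegs.1.scp ark:exp/multi/egs/cegs_multi.1.ark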
int32 count = GetCount(keep_proportion); - std::string key = example_reader.Key(); - if (frame_shift == 0 && - left_context == -1 && right_context == -1) { - const NnetChainExample &eg = example_reader.Value(); - for (int32 c = 0; c < count; c++) { - int32 index = (random ? Rand() : num_written) % num_outputs; - example_writers[index]->Write(key, eg); - num_written++; + + if (!eg_weight_rspecifier.empty()) { + BaseFloat weight = 1.0; + if (!egs_weight_reader.HasKey(key)) { + KALDI_WARN << "No weight for example key " << key; + num_err++; + continue; } - } else if (count > 0) { - NnetChainExample eg = example_reader.Value(); - if (frame_shift != 0) - ShiftChainExampleTimes(frame_shift, exclude_names, &eg); - NnetChainExample eg_out; - if (left_context != -1 || right_context != -1) - ModifyChainExampleContext(eg, left_context, right_context, - frame_subsampling_factor, &eg_out); - else - eg_out.Swap(&eg); - for (int32 c = 0; c < count; c++) { - int32 index = (random ? Rand() : num_written) % num_outputs; - example_writers[index]->Write(key, eg_out); - num_written++; + weight = egs_weight_reader.Value(key); + ScaleSupervisionWeight(weight, &eg); + } + + if (!eg_output_name_rspecifier.empty()) { + if (!output_name_reader.HasKey(key)) { + KALDI_WARN << "No new output-name for example key " << key; + num_err++; + continue; } + std::string new_output_name = output_name_reader.Value(key); + RenameOutputs(new_output_name, &eg); + } + + if (frame_shift != 0) + ShiftChainExampleTimes(frame_shift, exclude_names, &eg); + if (left_context != -1 || right_context != -1) + ModifyChainExampleContext(left_context, right_context, + frame_subsampling_factor, &eg); + + for (int32 c = 0; c < count; c++) { + int32 index = (random ? Rand() : num_written) % num_outputs; + example_writers[index]->Write(key, eg); + num_written++; } } for (int32 i = 0; i < num_outputs; i++) diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index c8c251900ec..ef545ab9162 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -36,13 +36,51 @@ namespace nnet3 { supervision objects to 'example_writer'. Note: if normalization_fst is the empty FST (with no states), it skips the final stage of egs preparation and you should do it later with nnet3-chain-normalize-egs. -*/ + + @param [in] normalization_fst A version of denominator FST used to add weights + to the created supervision. It is + actually an FST expected to have the + labels as (pdf-id+1) + @param [in] feats Input feature matrix + @param [in] ivector_feats Online iVector matrix sub-sampled at a + rate of "ivector_period". + If NULL, iVector will not be added + as in input to the egs. + @param [in] ivector_period Number of frames between iVectors in + "ivector_feats" matrix. + @param [in] supervision Supervision for 'chain' training created + from the binary chain-get-supervision. + This is expected to be at a + sub-sampled rate if + --frame-subsampling-factor > 1. + @param [in] deriv_weights Vector of per-frame weights that scale + a frame's gradient during backpropagation. + If NULL, this is equivalent to specifying + a vector of all 1s. + The dimension of the vector is expected + to be the supervision size, which is + at a sub-sampled rate if + --frame-subsampling-factor > 1. + @param [in] supervision_length_tolerance + Tolerance for difference in num-frames-subsampled between + supervision and deriv weights, and also between supervision + and input frames. 
+ @param [in] utt_id Utterance-id + @param [in] compress If true, compresses the feature matrices. + @param [out] utt_splitter Pointer to UtteranceSplitter object, + which helps to split an utterance into + chunks. This also stores some stats. + @param [out] example_writer Pointer to egs writer. + +**/ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, const GeneralMatrix &feats, const MatrixBase *ivector_feats, int32 ivector_period, const chain::Supervision &supervision, + const VectorBase *deriv_weights, + int32 supervision_length_tolerance, const std::string &utt_id, bool compress, UtteranceSplitter *utt_splitter, @@ -51,7 +89,18 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, int32 num_input_frames = feats.NumRows(), num_output_frames = supervision.frames_per_sequence; - if (!utt_splitter->LengthsMatch(utt_id, num_input_frames, num_output_frames)) + int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor; + + if (deriv_weights && (std::abs(deriv_weights->Dim() - num_output_frames) + > supervision_length_tolerance)) { + KALDI_WARN << "For utterance " << utt_id + << ", mismatch between deriv-weights dim and num-output-frames" + << "; " << deriv_weights->Dim() << " vs " << num_output_frames; + return false; + } + + if (!utt_splitter->LengthsMatch(utt_id, num_input_frames, num_output_frames, + supervision_length_tolerance)) return false; // LengthsMatch() will have printed a warning. std::vector chunks; @@ -65,8 +114,6 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, return false; } - int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor; - chain::SupervisionSplitter sup_splitter(supervision); for (size_t c = 0; c < chunks.size(); c++) { @@ -92,19 +139,36 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, int32 first_frame = 0; // we shift the time-indexes of all these parts so // that the supervised part starts from frame 0. + + NnetChainExample nnet_chain_eg; + nnet_chain_eg.outputs.resize(1); SubVector output_weights( &(chunk.output_weights[0]), static_cast(chunk.output_weights.size())); - NnetChainSupervision nnet_supervision("output", supervision_part, - output_weights, - first_frame, - frame_subsampling_factor); + if (!deriv_weights) { + NnetChainSupervision nnet_supervision("output", supervision_part, + output_weights, + first_frame, + frame_subsampling_factor); + nnet_chain_eg.outputs[0].Swap(&nnet_supervision); + } else { + Vector this_deriv_weights(num_frames_subsampled); + for (int32 i = 0; i < num_frames_subsampled; i++) { + int32 t = i + start_frame_subsampled; + if (t < deriv_weights->Dim()) + this_deriv_weights(i) = (*deriv_weights)(t); + } + KALDI_ASSERT(output_weights.Dim() == num_frames_subsampled); + this_deriv_weights.MulElements(output_weights); + NnetChainSupervision nnet_supervision("output", supervision_part, + this_deriv_weights, + first_frame, + frame_subsampling_factor); + nnet_chain_eg.outputs[0].Swap(&nnet_supervision); + } - NnetChainExample nnet_chain_eg; - nnet_chain_eg.outputs.resize(1); - nnet_chain_eg.outputs[0].Swap(&nnet_supervision); nnet_chain_eg.inputs.resize(ivector_feats != NULL ? 
2 : 1); int32 tot_input_frames = chunk.left_context + chunk.num_frames + @@ -176,13 +240,15 @@ int main(int argc, char *argv[]) { "chain-get-supervision.\n"; bool compress = true; - int32 length_tolerance = 100, online_ivector_period = 1; + int32 length_tolerance = 100, online_ivector_period = 1, + supervision_length_tolerance = 1; ExampleGenerationConfig eg_config; // controls num-frames, // left/right-context, etc. + BaseFloat normalization_fst_scale = 1.0; int32 srand_seed = 0; - std::string online_ivector_rspecifier; + std::string online_ivector_rspecifier, deriv_weights_rspecifier; ParseOptions po(usage); po.Register("compress", &compress, "If true, write egs with input features " @@ -200,6 +266,20 @@ int main(int argc, char *argv[]) { po.Register("srand", &srand_seed, "Seed for random number generator "); po.Register("length-tolerance", &length_tolerance, "Tolerance for " "difference in num-frames between feat and ivector matrices"); + po.Register("supervision-length-tolerance", &supervision_length_tolerance, + "Tolerance for difference in num-frames-subsampled between " + "supervision and deriv weights, and also between supervision " + "and input frames."); + po.Register("deriv-weights-rspecifier", &deriv_weights_rspecifier, + "Per-frame weights that scales a frame's gradient during " + "backpropagation. " + "Not specifying this is equivalent to specifying a vector of " + "all 1s."); + po.Register("normalization-fst-scale", &normalization_fst_scale, + "Scale the weights from the " + "'normalization' FST before applying them to the examples. " + "(Useful for semi-supervised training)"); + eg_config.Register(&po); po.Read(argc, argv); @@ -235,6 +315,12 @@ int main(int argc, char *argv[]) { if (!normalization_fst_rxfilename.empty()) { ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); KALDI_ASSERT(normalization_fst.NumStates() > 0); + + if (normalization_fst_scale <= 0.0) + KALDI_ERR << "Invalid scale on normalization FST; must be > 0.0"; + + if (normalization_fst_scale != 1.0) + ApplyProbabilityScale(normalization_fst_scale, &normalization_fst); } // Read as GeneralMatrix so we don't need to un-compress and re-compress @@ -245,6 +331,8 @@ int main(int argc, char *argv[]) { NnetChainExampleWriter example_writer(examples_wspecifier); RandomAccessBaseFloatMatrixReader online_ivector_reader( online_ivector_rspecifier); + RandomAccessBaseFloatVectorReader deriv_weights_reader( + deriv_weights_rspecifier); int32 num_err = 0; @@ -278,10 +366,24 @@ int main(int argc, char *argv[]) { num_err++; continue; } + + const Vector *deriv_weights = NULL; + if (!deriv_weights_rspecifier.empty()) { + if (!deriv_weights_reader.HasKey(key)) { + KALDI_WARN << "No deriv weights for utterance " << key; + num_err++; + continue; + } else { + // this address will be valid until we call HasKey() or Value() + // again. 
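+          // ProcessFile() copies the frames it needs into a per-chunk vector
+          // and multiplies them by the chunk's output weights, so the pointer
+          // is only dereferenced inside the ProcessFile() call below.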
+ deriv_weights = &(deriv_weights_reader.Value(key)); + } + } if (!ProcessFile(normalization_fst, feats, online_ivector_feats, online_ivector_period, - supervision, key, compress, + supervision, deriv_weights, supervision_length_tolerance, + key, compress, &utt_splitter, &example_writer)) num_err++; } diff --git a/src/chainbin/nnet3-chain-normalize-egs.cc b/src/chainbin/nnet3-chain-normalize-egs.cc index 9d3f56f756a..a97797e3246 100644 --- a/src/chainbin/nnet3-chain-normalize-egs.cc +++ b/src/chainbin/nnet3-chain-normalize-egs.cc @@ -41,7 +41,13 @@ int main(int argc, char *argv[]) { "e.g.\n" "nnet3-chain-normalize-egs dir/normalization.fst ark:train_in.cegs ark:train_out.cegs\n"; + BaseFloat normalization_fst_scale = 1.0; + ParseOptions po(usage); + po.Register("normalization-fst-scale", &normalization_fst_scale, + "Scale the weights from the " + "'normalization' FST before applying them to the examples. " + "(Useful for semi-supervised training)"); po.Read(argc, argv); @@ -57,6 +63,12 @@ int main(int argc, char *argv[]) { fst::StdVectorFst normalization_fst; ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); + if (normalization_fst_scale < 0.0) + KALDI_ERR << "Invalid scale on normalization FST; must be >= 0.0"; + + if (normalization_fst_scale != 1.0) + ApplyProbabilityScale(normalization_fst_scale, &normalization_fst); + SequentialNnetChainExampleReader example_reader(examples_rspecifier); NnetChainExampleWriter example_writer(examples_wspecifier); @@ -87,5 +99,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/lat/lattice-functions.cc b/src/lat/lattice-functions.cc index b04b23702fb..54c856a9403 100644 --- a/src/lat/lattice-functions.cc +++ b/src/lat/lattice-functions.cc @@ -431,9 +431,10 @@ void ConvertLatticeToPhones(const TransitionModel &trans, arc.olabel = 0; // remove any word. if ((arc.ilabel != 0) // has a transition-id on input.. && (trans.TransitionIdToHmmState(arc.ilabel) == 0) - && (!trans.IsSelfLoop(arc.ilabel))) + && (!trans.IsSelfLoop(arc.ilabel))) { // && trans.IsFinal(arc.ilabel)) // there is one of these per phone... 
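+        // (a transition-id at HMM-state 0 that is not a self-loop occurs
+        // once per instance of a phone, so each phone gets a single
+        // output label.)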
arc.olabel = trans.TransitionIdToPhone(arc.ilabel); + } aiter.SetValue(arc); } // end looping over arcs } // end looping over states @@ -459,6 +460,8 @@ double ComputeLatticeAlphasAndBetas(const LatticeType &lat, StateId num_states = lat.NumStates(); KALDI_ASSERT(lat.Properties(fst::kTopSorted, true) == fst::kTopSorted); KALDI_ASSERT(lat.Start() == 0); + alpha->clear(); + beta->clear(); alpha->resize(num_states, kLogZeroDouble); beta->resize(num_states, kLogZeroDouble); @@ -1646,4 +1649,110 @@ void ComposeCompactLatticeDeterministic( fst::Connect(composed_clat); } + +void ComputeAcousticScoresMap( + const Lattice &lat, + unordered_map, std::pair, + PairHasher > *acoustic_scores) { + // typedef the arc, weight types + typedef Lattice::Arc Arc; + typedef Arc::Weight LatticeWeight; + typedef Arc::StateId StateId; + + acoustic_scores->clear(); + + std::vector state_times; + LatticeStateTimes(lat, &state_times); // Assumes the input is top sorted + + KALDI_ASSERT(lat.Start() == 0); + + for (StateId s = 0; s < lat.NumStates(); s++) { + int32 t = state_times[s]; + for (fst::ArcIterator aiter(lat, s); !aiter.Done(); + aiter.Next()) { + const Arc &arc = aiter.Value(); + const LatticeWeight &weight = arc.weight; + + int32 tid = arc.ilabel; + + if (tid != 0) { + unordered_map, std::pair, + PairHasher >::iterator it = acoustic_scores->find(std::make_pair(t, tid)); + if (it == acoustic_scores->end()) { + acoustic_scores->insert(std::make_pair(std::make_pair(t, tid), + std::make_pair(weight.Value2(), 1))); + } else { + if (it->second.second == 2 + && it->second.first / it->second.second != weight.Value2()) { + KALDI_VLOG(2) << "Transitions on the same frame have different " + << "acoustic costs for tid " << tid << "; " + << it->second.first / it->second.second + << " vs " << weight.Value2(); + } + it->second.first += weight.Value2(); + it->second.second++; + } + } else { + // Arcs with epsilon input label (tid) must have 0 acoustic cost + KALDI_ASSERT(weight.Value2() == 0); + } + } + + LatticeWeight f = lat.Final(s); + if (f != LatticeWeight::Zero()) { + // Final acoustic cost must be 0 as we are reading from + // non-determinized, non-compact lattice + KALDI_ASSERT(f.Value2() == 0.0); + } + } +} + +void ReplaceAcousticScoresFromMap( + const unordered_map, std::pair, + PairHasher > &acoustic_scores, + Lattice *lat) { + // typedef the arc, weight types + typedef Lattice::Arc Arc; + typedef Arc::Weight LatticeWeight; + typedef Arc::StateId StateId; + + TopSortLatticeIfNeeded(lat); + + std::vector state_times; + LatticeStateTimes(*lat, &state_times); + + KALDI_ASSERT(lat->Start() == 0); + + for (StateId s = 0; s < lat->NumStates(); s++) { + int32 t = state_times[s]; + for (fst::MutableArcIterator aiter(lat, s); + !aiter.Done(); aiter.Next()) { + Arc arc(aiter.Value()); + + int32 tid = arc.ilabel; + if (tid != 0) { + unordered_map, std::pair, + PairHasher >::const_iterator it = acoustic_scores.find(std::make_pair(t, tid)); + if (it == acoustic_scores.end()) { + KALDI_ERR << "Could not find tid " << tid << " at time " << t + << " in the acoustic scores map."; + } else { + arc.weight.SetValue2(it->second.first / it->second.second); + } + } else { + // For epsilon arcs, set acoustic cost to 0.0 + arc.weight.SetValue2(0.0); + } + aiter.SetValue(arc); + } + + LatticeWeight f = lat->Final(s); + if (f != LatticeWeight::Zero()) { + // Set final acoustic cost to 0.0 + f.SetValue2(0.0); + lat->SetFinal(s, f); + } + } +} + } // namespace kaldi diff --git a/src/lat/lattice-functions.h b/src/lat/lattice-functions.h 
index b4b16e6221a..c7fe4833a4a 100644 --- a/src/lat/lattice-functions.h +++ b/src/lat/lattice-functions.h @@ -377,6 +377,50 @@ void ComposeCompactLatticeDeterministic( fst::DeterministicOnDemandFst* det_fst, CompactLattice* composed_clat); +/// This function computes the mapping from the pair +/// (frame-index, transition-id) to the pair +/// (sum-of-acoustic-scores, num-of-occurences) over all occurences of the +/// transition-id in that frame. +/// frame-index in the lattice. +/// This function is useful for retaining the acoustic scores in a +/// non-compact lattice after a process like determinization where the +/// frame-level acoustic scores are typically lost. +/// The function ReplaceAcousticScoresFromMap is used to restore the +/// acoustic scores computed by this function. +/// +/// @param [in] lat Input lattice. Expected to be top-sorted. Otherwise the +/// function will crash. +/// @param [out] acoustic_scores +/// Pointer to a map from the pair (frame-index, +/// transition-id) to a pair (sum-of-acoustic-scores, +/// num-of-occurences). +/// Usually the acoustic scores for a pdf-id (and hence +/// transition-id) on a frame will be the same for all the +/// occurences of the pdf-id in that frame. +/// But if not, we will take the average of the acoustic +/// scores. Hence, we store both the sum-of-acoustic-scores +/// and the num-of-occurences of the transition-id in that +/// frame. +void ComputeAcousticScoresMap( + const Lattice &lat, + unordered_map, std::pair, + PairHasher > *acoustic_scores); + +/// This function restores acoustic scores computed using the function +/// ComputeAcousticScoresMap into the lattice. +/// +/// @param [in] acoustic_scores +/// A map from the pair (frame-index, transition-id) to a +/// pair (sum-of-acoustic-scores, num-of-occurences) of +/// the occurences of the transition-id in that frame. +/// See the comments for ComputeAcousticScoresMap for +/// details. +/// @param [out] lat Pointer to the output lattice. +void ReplaceAcousticScoresFromMap( + const unordered_map, std::pair, + PairHasher > &acoustic_scores, + Lattice *lat); + } // namespace kaldi #endif // KALDI_LAT_LATTICE_FUNCTIONS_H_ diff --git a/src/latbin/lattice-compose.cc b/src/latbin/lattice-compose.cc index b9b261f7d36..df70229bfd8 100644 --- a/src/latbin/lattice-compose.cc +++ b/src/latbin/lattice-compose.cc @@ -22,6 +22,7 @@ #include "util/common-utils.h" #include "fstext/fstext-lib.h" #include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" int main(int argc, char *argv[]) { try { @@ -46,8 +47,10 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); + bool write_compact = true; int32 num_states_cache = 50000; int32 phi_label = fst::kNoLabel; // == -1 + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("phi-label", &phi_label, "If >0, the label on backoff arcs of the LM"); po.Register("num-states-cache", &num_states_cache, "Number of states we cache when mapping LM FST to lattice type. " @@ -67,9 +70,14 @@ int main(int argc, char *argv[]) { int32 n_done = 0, n_fail = 0; SequentialLatticeReader lattice_reader1(lats_rspecifier1); - // Write as compact lattice. 
- CompactLatticeWriter compact_lattice_writer(lats_wspecifier); + + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + if (write_compact) + compact_lattice_writer.Open(lats_wspecifier); + else + lattice_writer.Open(lats_wspecifier); if (ClassifyRspecifier(arg2, NULL, NULL) == kNoRspecifier) { std::string fst_rxfilename = arg2; @@ -102,9 +110,13 @@ int main(int argc, char *argv[]) { KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)"; n_fail++; } else { - CompactLattice clat; - ConvertLattice(composed_lat, &clat); - compact_lattice_writer.Write(key, clat); + if (write_compact) { + CompactLattice clat; + ConvertLattice(composed_lat, &clat); + compact_lattice_writer.Write(key, clat); + } else { + lattice_writer.Write(key, composed_lat); + } n_done++; } } @@ -149,9 +161,13 @@ int main(int argc, char *argv[]) { KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)"; n_fail++; } else { - CompactLattice clat_out; - ConvertLattice(lat_out, &clat_out); - compact_lattice_writer.Write(key, clat_out); + if (write_compact) { + CompactLattice clat_out; + ConvertLattice(lat_out, &clat_out); + compact_lattice_writer.Write(key, clat_out); + } else { + lattice_writer.Write(key, lat_out); + } n_done++; } } diff --git a/src/latbin/lattice-determinize-non-compact.cc b/src/latbin/lattice-determinize-non-compact.cc index 44ae8566f86..cf73e22980d 100644 --- a/src/latbin/lattice-determinize-non-compact.cc +++ b/src/latbin/lattice-determinize-non-compact.cc @@ -90,101 +90,6 @@ bool DeterminizeLatticeWrapper(const Lattice &lat, return false; } -void ComputeAcousticScoresMap( - const Lattice &lat, - unordered_map, std::pair, - PairHasher > *acoustic_scores) { - acoustic_scores->clear(); - - std::vector state_times; - LatticeStateTimes(lat, &state_times); - - KALDI_ASSERT(lat.Start() == 0); - - for (StateId s = 0; s < lat.NumStates(); s++) { - int32 t = state_times[s]; - for (fst::ArcIterator aiter(lat, s); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - const LatticeWeight &weight = arc.weight; - - int32 tid = arc.ilabel; - - if (tid != 0) { - unordered_map, std::pair, - PairHasher >::iterator it = acoustic_scores->find(std::make_pair(t, tid)); - if (it == acoustic_scores->end()) { - acoustic_scores->insert(std::make_pair(std::make_pair(t, tid), - std::make_pair(weight.Value2(), 1))); - } else { - if (it->second.second == 2 - && it->second.first / it->second.second != weight.Value2()) { - KALDI_VLOG(2) << "Transitions on the same frame have different " - << "acoustic costs for tid " << tid << "; " - << it->second.first / it->second.second - << " vs " << weight.Value2(); - } - it->second.first += weight.Value2(); - it->second.second++; - } - } else { - // Arcs with epsilon input label (tid) must have 0 acoustic cost - KALDI_ASSERT(weight.Value2() == 0); - } - } - - LatticeWeight f = lat.Final(s); - if (f != LatticeWeight::Zero()) { - // Final acoustic cost must be 0 as we are reading from - // non-determinized, non-compact lattice - KALDI_ASSERT(f.Value2() == 0.0); - } - } -} - -void ReplaceAcousticScoresFromMap( - const unordered_map, std::pair, - PairHasher > &acoustic_scores, - Lattice *lat) { - fst::TopSort(lat); - - std::vector state_times; - LatticeStateTimes(*lat, &state_times); - - KALDI_ASSERT(lat->Start() == 0); - - for (StateId s = 0; s < lat->NumStates(); s++) { - int32 t = state_times[s]; - for (fst::MutableArcIterator aiter(lat, s); - !aiter.Done(); aiter.Next()) { - Arc arc(aiter.Value()); - - int32 tid = 
arc.ilabel; - if (tid != 0) { - unordered_map, std::pair, - PairHasher >::const_iterator it = acoustic_scores.find(std::make_pair(t, tid)); - if (it == acoustic_scores.end()) { - KALDI_ERR << "Could not find tid " << tid << " at time " << t - << " in the acoustic scores map."; - } else { - arc.weight.SetValue2(it->second.first / it->second.second); - } - } else { - // For epsilon arcs, set acoustic cost to 0.0 - arc.weight.SetValue2(0.0); - } - aiter.SetValue(arc); - } - - LatticeWeight f = lat->Final(s); - if (f != LatticeWeight::Zero()) { - // Set final acoustic cost to 0.0 - f.SetValue2(0.0); - lat->SetFinal(s, f); - } - } -} - } int main(int argc, char *argv[]) { diff --git a/src/latbin/lattice-determinize-phone-pruned.cc b/src/latbin/lattice-determinize-phone-pruned.cc index 0959bcbcd74..94a8530273b 100644 --- a/src/latbin/lattice-determinize-phone-pruned.cc +++ b/src/latbin/lattice-determinize-phone-pruned.cc @@ -1,6 +1,7 @@ // latbin/lattice-determinize-phone-pruned.cc // Copyright 2014 Guoguo Chen +// 2017 Vimal Manohar // See ../../COPYING for clarification regarding multiple authors // @@ -43,11 +44,18 @@ int main(int argc, char *argv[]) { " final.mdl ark:in.lats ark:det.lats\n"; ParseOptions po(usage); + bool write_compact = true; BaseFloat acoustic_scale = 1.0; BaseFloat beam = 10.0; fst::DeterminizeLatticePhonePrunedOptions opts; opts.max_mem = 50000000; + po.Register("write-compact", &write_compact, + "If true, write in normal (compact) form. " + "--write-compact=false allows you to retain frame-level " + "acoustic score information, but this requires the input " + "to be in non-compact form e.g. undeterminized lattice " + "straight from decoding."); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic" " likelihoods."); po.Register("beam", &beam, "Pruning beam [applied after acoustic scaling]."); @@ -70,8 +78,13 @@ int main(int argc, char *argv[]) { // accepts. SequentialLatticeReader lat_reader(lats_rspecifier); - // Writes as compact lattice. 
- CompactLatticeWriter compact_lat_writer(lats_wspecifier); + CompactLatticeWriter compact_lat_writer; + LatticeWriter lat_writer; + + if (write_compact) + compact_lat_writer.Open(lats_wspecifier); + else + lat_writer.Open(lats_wspecifier); int32 n_done = 0, n_warn = 0; @@ -89,6 +102,12 @@ int main(int argc, char *argv[]) { KALDI_VLOG(2) << "Processing lattice " << key; + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + if (!write_compact) + ComputeAcousticScoresMap(lat, &acoustic_scores); + fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale), &lat); CompactLattice det_clat; @@ -106,8 +125,19 @@ int main(int argc, char *argv[]) { sum_depth_out += depth * t; sum_t += t; - fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &det_clat); - compact_lat_writer.Write(key, det_clat); + if (write_compact) { + fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &det_clat); + compact_lat_writer.Write(key, det_clat); + } else { + Lattice out_lat; + fst::ConvertLattice(det_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lat_writer.Write(key, out_lat); + } + n_done++; } diff --git a/src/latbin/lattice-determinize-pruned.cc b/src/latbin/lattice-determinize-pruned.cc index 3e8bca5a3ce..3c6c5796811 100644 --- a/src/latbin/lattice-determinize-pruned.cc +++ b/src/latbin/lattice-determinize-pruned.cc @@ -39,6 +39,7 @@ int main(int argc, char *argv[]) { " e.g.: lattice-determinize-pruned --acoustic-scale=0.1 --beam=6.0 ark:in.lats ark:det.lats\n"; ParseOptions po(usage); + bool write_compact = true; BaseFloat acoustic_scale = 1.0; BaseFloat beam = 10.0; bool minimize = false; @@ -48,6 +49,12 @@ int main(int argc, char *argv[]) { opts.max_mem = 50000000; opts.max_loop = 0; // was 500000; + po.Register("write-compact", &write_compact, + "If true, write in normal (compact) form. " + "--write-compact=false allows you to retain frame-level " + "acoustic score information, but this requires the input " + "to be in non-compact form e.g. undeterminized lattice " + "straight from decoding."); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); po.Register("beam", &beam, "Pruning beam [applied after acoustic scaling]."); @@ -69,8 +76,13 @@ int main(int argc, char *argv[]) { // accepts. SequentialLatticeReader lat_reader(lats_rspecifier); - // Write as compact lattice. - CompactLatticeWriter compact_lat_writer(lats_wspecifier); + CompactLatticeWriter compact_lat_writer; + LatticeWriter lat_writer; + + if (write_compact) + compact_lat_writer.Open(lats_wspecifier); + else + lat_writer.Open(lats_wspecifier); int32 n_done = 0, n_warn = 0; @@ -87,6 +99,12 @@ int main(int argc, char *argv[]) { KALDI_VLOG(2) << "Processing lattice " << key; + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + if (!write_compact) + ComputeAcousticScoresMap(lat, &acoustic_scores); + Invert(&lat); // so word labels are on the input side. 
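+      // Note that the map is computed before applying the acoustic scale, so
+      // ReplaceAcousticScoresFromMap() below restores the original, unscaled
+      // acoustic scores and the non-compact branch does not need to rescale
+      // by 1.0/acoustic_scale.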
lat_reader.FreeCurrent(); fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale), &lat); @@ -121,8 +139,19 @@ int main(int argc, char *argv[]) { sum_depth_out += depth * t; sum_t += t; - fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &det_clat); - compact_lat_writer.Write(key, det_clat); + if (write_compact) { + fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &det_clat); + compact_lat_writer.Write(key, det_clat); + } else { + Lattice out_lat; + fst::ConvertLattice(det_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lat_writer.Write(key, out_lat); + } + n_done++; } diff --git a/src/latbin/lattice-scale.cc b/src/latbin/lattice-scale.cc index 5ca6012d994..f9c61b4c5e3 100644 --- a/src/latbin/lattice-scale.cc +++ b/src/latbin/lattice-scale.cc @@ -39,12 +39,14 @@ int main(int argc, char *argv[]) { " e.g.: lattice-scale --lm-scale=0.0 ark:1.lats ark:scaled.lats\n"; ParseOptions po(usage); + bool write_compact = true; BaseFloat acoustic_scale = 1.0; BaseFloat inv_acoustic_scale = 1.0; BaseFloat lm_scale = 1.0; BaseFloat acoustic2lm_scale = 0.0; BaseFloat lm2acoustic_scale = 0.0; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); po.Register("inv-acoustic-scale", &inv_acoustic_scale, "An alternative way " "of setting the acoustic scale: you can set its inverse."); @@ -62,11 +64,6 @@ int main(int argc, char *argv[]) { std::string lats_rspecifier = po.GetArg(1), lats_wspecifier = po.GetArg(2); - SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier); - - // Write as compact lattice. - CompactLatticeWriter compact_lattice_writer(lats_wspecifier); - int32 n_done = 0; KALDI_ASSERT(acoustic_scale == 1.0 || inv_acoustic_scale == 1.0); @@ -81,12 +78,32 @@ int main(int argc, char *argv[]) { scale[1][0] = lm2acoustic_scale; scale[1][1] = acoustic_scale; - for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) { - CompactLattice lat = compact_lattice_reader.Value(); - ScaleLattice(scale, &lat); - compact_lattice_writer.Write(compact_lattice_reader.Key(), lat); - n_done++; + if (write_compact) { + SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier); + + // Write as compact lattice. + CompactLatticeWriter compact_lattice_writer(lats_wspecifier); + + for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) { + CompactLattice lat = compact_lattice_reader.Value(); + ScaleLattice(scale, &lat); + compact_lattice_writer.Write(compact_lattice_reader.Key(), lat); + n_done++; + } + } else { + SequentialLatticeReader lattice_reader(lats_rspecifier); + + // Write as regular lattice. + LatticeWriter lattice_writer(lats_wspecifier); + + for (; !lattice_reader.Done(); lattice_reader.Next()) { + Lattice lat = lattice_reader.Value(); + ScaleLattice(scale, &lat); + lattice_writer.Write(lattice_reader.Key(), lat); + n_done++; + } } + KALDI_LOG << "Done " << n_done << " lattices."; return (n_done != 0 ? 
0 : 1); } catch(const std::exception &e) { diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index a7e60a5e0c4..81f19c44b5c 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -60,6 +60,7 @@ NnetChainComputeProb::NnetChainComputeProb( deriv_nnet_owned_(false), deriv_nnet_(nnet), num_minibatches_processed_(0) { + KALDI_ASSERT(den_graph_.NumPdfs() > 0); KALDI_ASSERT(nnet_config.store_component_stats && !nnet_config.compute_deriv); } @@ -217,15 +218,43 @@ const ChainObjectiveInfo* NnetChainComputeProb::GetObjective( return NULL; } +double NnetChainComputeProb::GetTotalObjective(double *total_weight) const { + double tot_objectives = 0.0; + double tot_weight = 0.0; + unordered_map::const_iterator + iter = objf_info_.begin(), end = objf_info_.end(); + for (; iter != end; ++iter) { + tot_objectives += iter->second.tot_like + iter->second.tot_l2_term; + tot_weight += iter->second.tot_weight; + } + + if (total_weight) *total_weight = tot_weight; + return tot_objectives; +} + +static bool HasXentOutputs(const Nnet &nnet) { + const std::vector node_names = nnet.GetNodeNames(); + for (std::vector::const_iterator it = node_names.begin(); + it != node_names.end(); ++it) { + int32 node_index = nnet.GetNodeIndex(*it); + if (nnet.IsOutputNode(node_index) && + it->find("-xent") != std::string::npos) { + return true; + } + } + return false; +} + void RecomputeStats(const std::vector &egs, const chain::ChainTrainingOptions &chain_config_in, const fst::StdVectorFst &den_fst, Nnet *nnet) { KALDI_LOG << "Recomputing stats on nnet (affects batch-norm)"; chain::ChainTrainingOptions chain_config(chain_config_in); - if (nnet->GetNodeIndex("output-xent") != -1 && + if (HasXentOutputs(*nnet) && chain_config.xent_regularize == 0) { - // this forces it to compute the output for 'output-xent', which + // this forces it to compute the output for xent outputs, + // usually 'output-xent', which // means that we'll be computing batch-norm stats for any // components in that branch that have batch-norm. chain_config.xent_regularize = 0.1; diff --git a/src/nnet3/nnet-chain-diagnostics.h b/src/nnet3/nnet-chain-diagnostics.h index 4125427c463..49fc5c8f4d8 100644 --- a/src/nnet3/nnet-chain-diagnostics.h +++ b/src/nnet3/nnet-chain-diagnostics.h @@ -83,6 +83,11 @@ class NnetChainComputeProb { // or NULL if there is no such info. const ChainObjectiveInfo *GetObjective(const std::string &output_name) const; + // This function returns the total objective over all output nodes recorded here, and + // outputs to 'tot_weight' the total weight (typically the number of frames) + // corresponding to it. + double GetTotalObjective(double *tot_weight) const; + // if config.compute_deriv == true, returns a reference to the // computed derivative. Otherwise crashes. const Nnet &GetDeriv() const; diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index aad5f83bc80..a05c002c3af 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -31,8 +31,8 @@ void NnetChainSupervision::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, name); WriteIndexVector(os, binary, indexes); supervision.Write(os, binary); - WriteToken(os, binary, ""); // for DerivWeights. Want to save space. 
- WriteVectorAsChar(os, binary, deriv_weights); + WriteToken(os, binary, ""); + deriv_weights.Write(os, binary); WriteToken(os, binary, ""); } @@ -51,8 +51,11 @@ void NnetChainSupervision::Read(std::istream &is, bool binary) { ReadToken(is, binary, &token); // in the future this back-compatibility code can be reworked. if (token != "") { - KALDI_ASSERT(token == ""); - ReadVectorAsChar(is, binary, &deriv_weights); + KALDI_ASSERT(token == "" || token == ""); + if (token == "") + ReadVectorAsChar(is, binary, &deriv_weights); + else + deriv_weights.Read(is, binary); ExpectToken(is, binary, ""); } CheckDim(); @@ -82,8 +85,7 @@ void NnetChainSupervision::CheckDim() const { } if (deriv_weights.Dim() != 0) { KALDI_ASSERT(deriv_weights.Dim() == indexes.size()); - KALDI_ASSERT(deriv_weights.Min() >= 0.0 && - deriv_weights.Max() <= 1.0); + KALDI_ASSERT(deriv_weights.Min() >= 0.0); } } diff --git a/src/nnet3/nnet-chain-example.h b/src/nnet3/nnet-chain-example.h index 2718af746b2..047f30cfc48 100644 --- a/src/nnet3/nnet-chain-example.h +++ b/src/nnet3/nnet-chain-example.h @@ -101,8 +101,8 @@ struct NnetChainSupervision { bool operator == (const NnetChainSupervision &other) const; }; -/// NnetChainExample is like NnetExample, but specialized for CTC training. -/// (actually CCTC training, which is our extension of CTC). +/// NnetChainExample is like NnetExample, but specialized for +/// lattice-free (chain) training. struct NnetChainExample { /// 'inputs' contains the input to the network-- normally just it has just one @@ -110,7 +110,7 @@ struct NnetChainExample { /// "ivector")... this depends on the setup. std::vector inputs; - /// 'outputs' contains the CTC output supervision. There will normally + /// 'outputs' contains the chain output supervision. There will normally /// be just one member with name == "output". 
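+  /// (In multilingual or multi-task setups the output may instead be named
+  /// e.g. 'output-0'; see the --outputs option of nnet3-chain-copy-egs.)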
std::vector outputs; diff --git a/src/nnet3/nnet-diagnostics.cc b/src/nnet3/nnet-diagnostics.cc index 2a6cfe5de6a..ca6124a212f 100644 --- a/src/nnet3/nnet-diagnostics.cc +++ b/src/nnet3/nnet-diagnostics.cc @@ -306,15 +306,17 @@ const SimpleObjectiveInfo* NnetComputeProb::GetObjective( return NULL; } -double NnetComputeProb::GetTotalObjective(double *tot_weight) const { +double NnetComputeProb::GetTotalObjective(double *total_weight) const { double tot_objectives = 0.0; - *tot_weight = 0.0; + double tot_weight = 0.0; unordered_map::const_iterator iter = objf_info_.begin(), end = objf_info_.end(); for (; iter != end; ++iter) { tot_objectives += iter->second.tot_objective; - (*tot_weight) += iter->second.tot_weight; + tot_weight += iter->second.tot_weight; } + + if (total_weight) *total_weight = tot_weight; return tot_objectives; } diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index 65df0c891c1..07112c9d873 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -1265,6 +1265,5 @@ void ExampleMerger::Finish() { stats_.PrintStats(); } - } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index 02620df7485..c93d0dd2c81 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -516,8 +516,6 @@ class ExampleMerger { MapType eg_to_egs_; }; - - } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3bin/nnet3-copy-egs.cc b/src/nnet3bin/nnet3-copy-egs.cc index 02c2e429977..17053ad9b2d 100644 --- a/src/nnet3bin/nnet3-copy-egs.cc +++ b/src/nnet3bin/nnet3-copy-egs.cc @@ -26,39 +26,39 @@ namespace kaldi { namespace nnet3 { -// rename name of NnetIo with old_name to new_name. -void RenameIoNames(const std::string &old_name, - const std::string &new_name, - NnetExample *eg_modified) { - // list of io-names in eg_modified. - std::vector orig_io_list; - int32 io_size = eg_modified->io.size(); - for (int32 io_ind = 0; io_ind < io_size; io_ind++) - orig_io_list.push_back(eg_modified->io[io_ind].name); - - // find the io in eg with name 'old_name'. - int32 rename_io_ind = - std::find(orig_io_list.begin(), orig_io_list.end(), old_name) - - orig_io_list.begin(); - - if (rename_io_ind >= io_size) - KALDI_ERR << "No io-node with name " << old_name + +// renames outputs named "output" to new_name +void RenameOutputs(const std::string &new_name, NnetExample *eg) { + bool found_output = false; + for (std::vector::iterator it = eg->io.begin(); + it != eg->io.end(); ++it) { + if (it->name == "output") { + it->name = new_name; + found_output = true; + } + } + + if (!found_output) + KALDI_ERR << "No io-node with name 'output'" << "exists in eg."; - eg_modified->io[rename_io_ind].name = new_name; } -// ranames NnetIo name with name 'output' to new_output_name -// and scales the supervision for 'output' using weight. -void ScaleAndRenameOutput(BaseFloat weight, - const std::string &new_output_name, - NnetExample *eg) { - // scale the supervision weight for egs - for (int32 i = 0; i < eg->io.size(); i++) - if (eg->io[i].name == "output") - if (weight != 0.0 && weight != 1.0) - eg->io[i].features.Scale(weight); - // rename output io name to 'new_output_name'. 
- RenameIoNames("output", new_output_name, eg); +// scales the supervision for 'output' by a factor of "weight" +void ScaleSupervisionWeight(BaseFloat weight, NnetExample *eg) { + if (weight == 1.0) return; + + bool found_output = false; + for (std::vector::iterator it = eg->io.begin(); + it != eg->io.end(); ++it) { + if (it->name == "output") { + it->features.Scale(weight); + found_output = true; + } + } + + if (!found_output) + KALDI_ERR << "No supervision with name 'output'" + << "exists in eg."; } // returns an integer randomly drawn with expected value "expected_count" @@ -320,7 +320,7 @@ int main(int argc, char *argv[]) { // you can set frame to a number to select a single frame with a particular // offset, or to 'random' to select a random single frame. std::string frame_str, - eg_weight_rspecifier, eg_output_rspecifier; + eg_weight_rspecifier, eg_output_name_rspecifier; ParseOptions po(usage); po.Register("random", &random, "If true, will write frames to output " @@ -347,12 +347,11 @@ int main(int argc, char *argv[]) { "Rspecifier indexed by the key of egs, providing a weight by " "which we will scale the supervision matrix for that eg. " "Used in multilingual training."); - po.Register("outputs", &eg_output_rspecifier, + po.Register("outputs", &eg_output_name_rspecifier, "Rspecifier indexed by the key of egs, providing a string-valued " "output name, e.g. 'output-0'. If provided, the NnetIo with " "name 'output' will be renamed to the provided name. Used in " "multilingual training."); - po.Read(argc, argv); srand(srand_seed); @@ -366,8 +365,11 @@ int main(int argc, char *argv[]) { SequentialNnetExampleReader example_reader(examples_rspecifier); - RandomAccessTokenReader output_reader(eg_output_rspecifier); + // In the normal case, these would not be used. These are only applicable + // for multi-task or multilingual training. + RandomAccessTokenReader output_name_reader(eg_output_name_rspecifier); RandomAccessBaseFloatReader egs_weight_reader(eg_weight_rspecifier); + int32 num_outputs = po.NumArgs() - 1; std::vector example_writers(num_outputs); for (int32 i = 0; i < num_outputs; i++) @@ -376,52 +378,41 @@ int main(int argc, char *argv[]) { int64 num_read = 0, num_written = 0, num_err = 0; for (; !example_reader.Done(); example_reader.Next(), num_read++) { - bool modify_eg_output = !(eg_output_rspecifier.empty() && - eg_weight_rspecifier.empty()); + const std::string &key = example_reader.Key(); + NnetExample &eg = example_reader.Value(); // count is normally 1; could be 0, or possibly >1. int32 count = GetCount(keep_proportion); - std::string key = example_reader.Key(); - NnetExample eg_modified_output; - const NnetExample &eg_orig = example_reader.Value(), - &eg = (modify_eg_output ? eg_modified_output : eg_orig); - // Note: in the normal case we just use 'eg'; eg_modified_output is - // for the case when the --outputs or --weights option is specified - // (only for multilingual training). - BaseFloat weight = 1.0; - std::string new_output_name; - if (modify_eg_output) { // This branch is only taken for multilingual training. 
- eg_modified_output = eg_orig; - if (!eg_weight_rspecifier.empty()) { - if (!egs_weight_reader.HasKey(key)) { - KALDI_WARN << "No weight for example key " << key; - num_err++; - continue; - } - weight = egs_weight_reader.Value(key); + + if (!eg_weight_rspecifier.empty()) { + BaseFloat weight = 1.0; + if (!egs_weight_reader.HasKey(key)) { + KALDI_WARN << "No weight for example key " << key; + num_err++; + continue; } - if (!eg_output_rspecifier.empty()) { - if (!output_reader.HasKey(key)) { - KALDI_WARN << "No new output-name for example key " << key; - num_err++; - continue; - } - new_output_name = output_reader.Value(key); + weight = egs_weight_reader.Value(key); + ScaleSupervisionWeight(weight, &eg); + } + + if (!eg_output_name_rspecifier.empty()) { + if (!output_name_reader.HasKey(key)) { + KALDI_WARN << "No new output-name for example key " << key; + num_err++; + continue; } + std::string new_output_name = output_name_reader.Value(key); + RenameOutputs(new_output_name, &eg); } for (int32 c = 0; c < count; c++) { int32 index = (random ? Rand() : num_written) % num_outputs; if (frame_str == "" && left_context == -1 && right_context == -1 && frame_shift == 0) { - if (modify_eg_output) // Only for multilingual training - ScaleAndRenameOutput(weight, new_output_name, &eg_modified_output); example_writers[index]->Write(key, eg); num_written++; } else { // the --frame option or context options were set. NnetExample eg_modified; if (SelectFromExample(eg, frame_str, left_context, right_context, frame_shift, &eg_modified)) { - if (modify_eg_output) - ScaleAndRenameOutput(weight, new_output_name, &eg_modified); // this branch of the if statement will almost always be taken (should only // not be taken for shorter-than-normal egs from the end of a file. example_writers[index]->Write(key, eg_modified);