diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh index afdaa347055..8bf2b73dada 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh @@ -3,6 +3,10 @@ # same as 1e but uses batchnorm components instead of renorm also adding # proportional-shrink 10, trained with 4 epochs + +# local/chain/tuning/run_tdnn_1f.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned + +# local/chain/compare_wer_general.sh sdm1 tdnn1e_sp_bi_ihmali tdnn1f_sp_bi_ihmali # System tdnn1e_sp_bi_ihmali tdnn1f_sp_bi_ihmali # WER on dev 39.2 37.5 # WER on eval 42.8 41.3 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh index 5900296b671..dfb6dfedee7 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh @@ -3,6 +3,10 @@ # same as 1e but uses batchnorm components instead of renorm also adding # proportional-shrink 10, trained with 6 epochs + +# local/chain/tuning/run_tdnn_1g.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned + +# local/chain/compare_wer_general.sh sdm1 tdnn1e_sp_bi_ihmali tdnn1g_sp_bi_ihmali # System tdnn1e_sp_bi_ihmali tdnn1g_sp_bi_ihmali # WER on dev 39.2 36.9 # WER on eval 42.8 41.0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh index 6cdca0d222b..bee4d997b01 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh @@ -4,13 +4,16 @@ # and using proportional-shrink with value 10, this model uses # 5 epochs to train +# local/chain/tuning/run_tdnn_lstm_1n.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned +# local/chain/compare_wer_general.sh sdm1 tdnn_lstmli_5epoch_sp_bi_ihmali_ld5 tdnn_lstm1n_sp_bi_ihmali_ld5 # System tdnn_lstmli_5epoch_sp_bi_ihmali_ld5 tdnn_lstm1n_sp_bi_ihmali_ld5 -# WER on dev 36.9 34.6 -# WER on eval 40.4 37.9 -# Final train prob -0.0867643 -0.134102 -# Final valid prob -0.266945 -0.234435 -# Final train prob (xent) -1.22333 -1.52368 -# Final valid prob (xent) -2.13335 -2.02384 +# WER on dev 36.9 34.2 +# WER on eval 40.4 37.7 +# Final train prob -0.0867643 -0.132712 +# Final valid prob -0.266945 -0.234348 +# Final train prob (xent) -1.22333 -1.5112 +# Final valid prob (xent) -2.13335 -2.01698 + set -e -o pipefail @@ -188,19 +191,19 @@ if [ $stage -le 15 ]; then fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=1024 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn1 dim=1024 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 - relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) 
dim=1024 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 - relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 - relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 - relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 ## adding the layers for chain branch diff --git a/egs/cifar/v1/local/nnet3/run_cnn_aug_1d.sh b/egs/cifar/v1/local/nnet3/run_cnn_aug_1d.sh new file mode 100755 index 00000000000..1eb448149ba --- /dev/null +++ b/egs/cifar/v1/local/nnet3/run_cnn_aug_1d.sh @@ -0,0 +1,134 @@ +#!/bin/bash + + +# 1d is as 1c but setting num-minibatches-history=40. +# A bit better. + +# local/nnet3/compare.sh exp/cnn_aug_1c_cifar10 exp/cnn_aug_1d_cifar10 +# System cnn_aug_1c_cifar10 cnn_aug_1d_cifar10 +# final test accuracy: 0.8834 0.8857 +# final train accuracy: 0.9644 0.9626 +# final test objf: -0.362241 -0.356861 +# final train objf: -0.114712 -0.114144 +# num-parameters: 2205290 2205290 + +# local/nnet3/compare.sh exp/cnn_aug_1c_cifar100 exp/cnn_aug_1d_cifar100 +# System cnn_aug_1c_cifar100 cnn_aug_1d_cifar100 +# final test accuracy: 0.6219 0.6237 +# final train accuracy: 0.8634 0.8688 +# final test objf: -1.42399 -1.40784 +# final train objf: -0.493349 -0.482047 +# num-parameters: 2251460 2251460 + + +# steps/info/nnet3_dir_info.pl exp/cnn_aug_1d_cifar10{,0} +# exp/cnn_aug_1d_cifar10: num-iters=200 nj=1..2 num-params=2.2M dim=96->10 combine=-0.24->-0.23 loglike:train/valid[132,199,final]=(-0.172,-0.114,-0.114/-0.38,-0.36,-0.36) accuracy:train/valid[132,199,final]=(0.938,0.963,0.963/0.879,0.887,0.886) +# exp/cnn_aug_1d_cifar100: num-iters=200 nj=1..2 num-params=2.3M dim=96->100 combine=-0.90->-0.92 loglike:train/valid[132,199,final]=(-0.63,-0.48,-0.48/-1.43,-1.41,-1.41) accuracy:train/valid[132,199,final]=(0.821,0.868,0.869/0.61,0.62,0.62) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + + + +# training options +stage=0 +train_stage=-10 +dataset=cifar10 +srand=0 +reporting_email= +affix=_aug_1d + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=96 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=32 height-out=32 time-offsets=-1,0,1 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=32 height-out=16 time-offsets=-1,0,1 dropout-proportion=0.25 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=16 height-out=16 time-offsets=-2,0,2 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=16 height-out=8 time-offsets=-2,0,2 dropout-proportion=0.25 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn5 height-in=8 height-out=8 time-offsets=-4,0,4 $common2 + relu-dropout-layer name=fully_connected1 input=Append(2,6,10,14,18,22,26,30) dropout-proportion=0.5 dim=512 + output-layer name=output dim=$num_targets +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 2 ]; then + + steps/nnet3/train_raw_dnn.py --stage=$train_stage \ + --cmd="$train_cmd" \ + --image.augmentation-opts="--horizontal-flip-prob=0.5 --horizontal-shift=0.1 --vertical-shift=0.1 --num-channels=3" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=100 \ + --egs.frames-per-eg=1 \ + --trainer.optimization.num-jobs-initial=1 \ + --trainer.optimization.num-jobs-final=2 \ + --trainer.optimization.initial-effective-lrate=0.003 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.minibatch-size=256,128,64 \ + --trainer.shuffle-buffer-size=2000 \ + --egs.dir="$egs" \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + + +exit 0; diff --git a/egs/cifar/v1/local/nnet3/run_resnet_1c.sh b/egs/cifar/v1/local/nnet3/run_resnet_1c.sh new file mode 100755 index 00000000000..0708b3d6eaa --- /dev/null +++ b/egs/cifar/v1/local/nnet3/run_resnet_1c.sh @@ -0,0 +1,144 @@ +#!/bin/bash + +# 1c is as 1b but setting num-minibatches-history=40.0 in the configs, +# so the Fisher matrix estimates change less fast. +# Seems to be helpfu. + +# local/nnet3/compare.sh exp/resnet1b_cifar10 exp/resnet1c_cifar10 +# System resnet1b_cifar10 resnet1c_cifar10 +# final test accuracy: 0.9481 0.9514 +# final train accuracy: 0.9996 1 +# final test objf: -0.163336 -0.157244 +# final train objf: -0.00788341 -0.00751868 +# num-parameters: 1322730 1322730 + +# local/nnet3/compare.sh exp/resnet1b_cifar100 exp/resnet1c_cifar100 +# System resnet1b_cifar100 resnet1c_cifar100 +# final test accuracy: 0.7602 0.7627 +# final train accuracy: 0.9598 0.96 +# final test objf: -0.888699 -0.862205 +# final train objf: -0.164213 -0.174973 +# num-parameters: 1345860 1345860 +# steps/info/nnet3_dir_info.pl exp/resnet1c_cifar10{,0} +# exp/resnet1c_cifar10: num-iters=133 nj=1..2 num-params=1.3M dim=96->10 combine=-0.02->-0.01 loglike:train/valid[87,132,final]=(-0.115,-0.034,-0.0075/-0.24,-0.21,-0.157) accuracy:train/valid[87,132,final]=(0.960,0.9888,1.0000/0.925,0.938,0.951) +# exp/resnet1c_cifar100: num-iters=133 nj=1..2 num-params=1.3M dim=96->100 combine=-0.24->-0.20 loglike:train/valid[87,132,final]=(-0.75,-0.27,-0.175/-1.20,-1.00,-0.86) accuracy:train/valid[87,132,final]=(0.78,0.923,0.960/0.67,0.73,0.76) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + + + +# training options +stage=0 +train_stage=-10 +dataset=cifar10 +srand=0 +reporting_email= +affix=1c + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=96 name=input + conv-layer name=conv1 $a height-in=32 height-out=32 time-offsets=-1,0,1 required-time-offsets=0 height-offsets=-1,0,1 num-filters-out=$nf1 + res-block name=res2 num-filters=$nf1 height=32 time-period=1 $res_opts + res-block name=res3 num-filters=$nf1 height=32 time-period=1 $res_opts + conv-layer name=conv4 height-in=32 height-out=16 height-subsample-out=2 time-offsets=-1,0,1 $common num-filters-out=$nf2 + res-block name=res5 num-filters=$nf2 height=16 time-period=2 $res_opts + res-block name=res6 num-filters=$nf2 height=16 time-period=2 $res_opts + conv-layer name=conv7 height-in=16 height-out=8 height-subsample-out=2 time-offsets=-2,0,2 $common num-filters-out=$nf3 + res-block name=res8 num-filters=$nf3 num-bottleneck-filters=$nb3 height=8 time-period=4 $res_opts + res-block name=res9 num-filters=$nf3 num-bottleneck-filters=$nb3 height=8 time-period=4 $res_opts + res-block name=res10 num-filters=$nf3 num-bottleneck-filters=$nb3 height=8 time-period=4 $res_opts + channel-average-layer name=channel-average input=Append(2,6,10,14,18,22,24,28) dim=$nf3 + output-layer name=output learning-rate-factor=0.1 dim=$num_targets +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 2 ]; then + + steps/nnet3/train_raw_dnn.py --stage=$train_stage \ + --cmd="$train_cmd" \ + --image.augmentation-opts="--horizontal-flip-prob=0.5 --horizontal-shift=0.1 --vertical-shift=0.1 --num-channels=3" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=100 \ + --egs.frames-per-eg=1 \ + --trainer.optimization.num-jobs-initial=1 \ + --trainer.optimization.num-jobs-final=2 \ + --trainer.optimization.initial-effective-lrate=0.003 \ + --trainer.optimization.final-effective-lrate=0.0003 \ + --trainer.optimization.minibatch-size=256,128,64 \ + --trainer.optimization.proportional-shrink=50.0 \ + --trainer.shuffle-buffer-size=2000 \ + --egs.dir="$egs" \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + + +exit 0; diff --git a/egs/svhn/v1/local/nnet3/run_cnn_aug_1a.sh b/egs/svhn/v1/local/nnet3/run_cnn_aug_1a.sh index 96e7254474a..e89ff125102 100755 --- a/egs/svhn/v1/local/nnet3/run_cnn_aug_1a.sh +++ b/egs/svhn/v1/local/nnet3/run_cnn_aug_1a.sh @@ -2,8 +2,9 @@ # nnet topology similar to 1a but bigger and with more epochs and data augmentation (improved 95 --> 97) -# steps/info/nnet3_dir_info.pl exp/cnn_aug1a: -# exp/cnn_aug1a: num-iters=300 nj=1..2 num-params=2.8M dim=96->10 combine=-0.02->-0.02 loglike:train/valid[199,299,final]=(-0.01,-0.00,-0.00/-0.17,-0.17,-0.17) accuracy:train/valid[199,299,final]=(1.00,1.00,1.00/0.97,0.97,0.97) + +# steps/info/nnet3_dir_info.pl exp/cnn_aug1a +# exp/cnn_aug1a: num-iters=130 nj=2..4 num-params=2.8M dim=96->10 combine=-0.07->-0.06 loglike:train/valid[85,129,final]=(-0.090,-0.060,-0.054/-0.163,-0.110,-0.102) accuracy:train/valid[85,129,final]=(0.9764,0.9868,0.9886/0.958,0.9731,0.9762) # Set -e here so that we catch if any executable fails immediately set -euo pipefail diff --git a/egs/svhn/v1/local/nnet3/run_cnn_aug_1b.sh b/egs/svhn/v1/local/nnet3/run_cnn_aug_1b.sh new file mode 100755 index 00000000000..cf2f92590d2 --- /dev/null +++ b/egs/svhn/v1/local/nnet3/run_cnn_aug_1b.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +# run_cnn_aug_1b.sh is like run_cnn_aug_1a.sh but setting +# num-minibatches-history=40.0 (longer history for 
natural gradient), +# and using the "egs2" examples with more archives, which necessitates +# adjusting the proportional-shrink option (since it should be +# proportional to archive size). + +# result improves 97.62 -> 97.71. + +# steps/info/nnet3_dir_info.pl exp/cnn_aug1b +# exp/cnn_aug1b: num-iters=180 nj=2..4 num-params=2.8M dim=96->10 combine=-0.06->-0.06 loglike:train/valid[119,179,final]=(-0.066,-0.051,-0.049/-0.126,-0.103,-0.100) accuracy:train/valid[119,179,final]=(0.9846,0.9890,0.9900/0.970,0.9760,0.9771) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + + + +# training options +stage=0 +train_stage=-10 +srand=0 +reporting_email= +affix=_aug1b + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=96 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=32 height-out=32 time-offsets=-1,0,1 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=32 height-out=16 time-offsets=-1,0,1 dropout-proportion=0.25 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=16 height-out=16 time-offsets=-2,0,2 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=16 height-out=8 time-offsets=-2,0,2 dropout-proportion=0.25 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn5 height-in=8 height-out=8 time-offsets=-4,0,4 $common2 + relu-dropout-layer name=fully_connected1 input=Append(2,6,10,14,18,22,26,30) dropout-proportion=0.5 dim=512 + output-layer name=output dim=$num_targets +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 2 ]; then + + steps/nnet3/train_raw_dnn.py --stage=$train_stage \ + --cmd="$cmd" \ + --image.augmentation-opts="--horizontal-shift=0.04 --vertical-shift=0.08 --num-channels=3" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=30 \ + --egs.frames-per-eg=1 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.003 \ + --trainer.optimization.final-effective-lrate=0.0003 \ + --trainer.optimization.minibatch-size=256,128,64 \ + --trainer.optimization.proportional-shrink=18.0 \ + --trainer.shuffle-buffer-size=2000 \ + --egs.dir="$egs" \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + + +exit 0; diff --git a/egs/svhn/v1/local/nnet3/run_resnet_1b.sh b/egs/svhn/v1/local/nnet3/run_resnet_1b.sh index 7e6ab60eae3..7f0540e90fe 100755 --- a/egs/svhn/v1/local/nnet3/run_resnet_1b.sh +++ b/egs/svhn/v1/local/nnet3/run_resnet_1b.sh @@ -1,6 +1,6 @@ #!/bin/bash -# exp/resnet1b: num-iters=130 nj=2..4 num-params=1.3M dim=96->10 combine=-0.04->-0.04 loglike:train/valid[85,129,final]=(-0.055,-0.041,-0.035/-0.097,-0.079,-0.074) accuracy:train/valid[85,129,final]=(0.9882,0.9924,0.9946/0.977,0.9817,0.9840) +# exp/resnet1b: num-iters=130 nj=2..4 num-params=1.3M dim=96->10 combine=-0.04->-0.04 loglike:train/valid[85,129,final]=(-0.049,-0.044,-0.036/-0.098,-0.085,-0.076) accuracy:train/valid[85,129,final]=(0.9904,0.9908,0.9940/0.9764,0.9804,0.9831) # This setup is based on the one in cifar/v1/local/nnet3/run_resnet_1{a,b}.sh. 
# We are reducing the number of epochs quite a bit, since there is so much @@ -107,7 +107,7 @@ fi if [ $stage -le 2 ]; then steps/nnet3/train_raw_dnn.py --stage=$train_stage \ - --cmd="$train_cmd" \ + --cmd="$cmd" \ --image.augmentation-opts="--horizontal-shift=0.04 --vertical-shift=0.08 --num-channels=3" \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ diff --git a/egs/svhn/v1/local/nnet3/run_resnet_1c.sh b/egs/svhn/v1/local/nnet3/run_resnet_1c.sh new file mode 100755 index 00000000000..b56ee62b806 --- /dev/null +++ b/egs/svhn/v1/local/nnet3/run_resnet_1c.sh @@ -0,0 +1,133 @@ +#!/bin/bash + +# resnet1c is as resnet1b but adding "num-minibatches-history=40.0" to +# all layers to increase the history size of natural gradient +# (improves optimization), and using the "egs2" egs with more, +# smaller archives. Also changing the proportional-shrink option +# to compensate for the change in archive size (it should vary +# proportionally to the number of egs in the archive). + +# improves 98.31 -> 98.45. + +# exp/resnet1c: num-iters=180 nj=2..4 num-params=1.3M dim=96->10 combine=-0.04->-0.03 loglike:train/valid[119,179,final]=(-0.047,-0.041,-0.034/-0.083,-0.075,-0.071) accuracy:train/valid[119,179,final]=(0.9914,0.9922,0.9944/0.9803,0.9826,0.9845) + + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + + + +# training options +stage=0 +train_stage=-10 +srand=0 +reporting_email= +affix=1b5 + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=96 name=input + conv-layer name=conv1 $a height-in=32 height-out=32 time-offsets=-1,0,1 required-time-offsets=0 height-offsets=-1,0,1 num-filters-out=$nf1 + res-block name=res2 num-filters=$nf1 height=32 time-period=1 $res_opts + res-block name=res3 num-filters=$nf1 height=32 time-period=1 $res_opts + conv-layer name=conv4 height-in=32 height-out=16 height-subsample-out=2 time-offsets=-1,0,1 $common num-filters-out=$nf2 + res-block name=res5 num-filters=$nf2 height=16 time-period=2 $res_opts + res-block name=res6 num-filters=$nf2 height=16 time-period=2 $res_opts + conv-layer name=conv7 height-in=16 height-out=8 height-subsample-out=2 time-offsets=-2,0,2 $common num-filters-out=$nf3 + res-block name=res8 num-filters=$nf3 num-bottleneck-filters=$nb3 height=8 time-period=4 $res_opts + res-block name=res9 num-filters=$nf3 num-bottleneck-filters=$nb3 height=8 time-period=4 $res_opts + res-block name=res10 num-filters=$nf3 num-bottleneck-filters=$nb3 height=8 time-period=4 $res_opts + channel-average-layer name=channel-average input=Append(2,6,10,14,18,22,24,28) dim=$nf3 + output-layer name=output learning-rate-factor=0.1 dim=$num_targets +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 2 ]; then + + steps/nnet3/train_raw_dnn.py --stage=$train_stage \ + --cmd="$cmd" \ + --image.augmentation-opts="--horizontal-shift=0.04 --vertical-shift=0.08 --num-channels=3" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=30 \ + --egs.frames-per-eg=1 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.003 \ + --trainer.optimization.final-effective-lrate=0.0003 \ + --trainer.optimization.minibatch-size=256,128,64 \ + 
--trainer.optimization.proportional-shrink=18.0 \ + --trainer.shuffle-buffer-size=2000 \ + --egs.dir="$egs" \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + + +exit 0; diff --git a/egs/svhn/v1/run.sh b/egs/svhn/v1/run.sh index fc2e2ef7733..720f4a13e29 100755 --- a/egs/svhn/v1/run.sh +++ b/egs/svhn/v1/run.sh @@ -19,3 +19,9 @@ if [ $stage -le 1 ]; then # egs preparation image/nnet3/get_egs.sh --egs-per-archive 50000 --cmd "$cmd" data/train_all data/test exp/egs fi + +if [ $stage -le 2 ]; then + # Making a version of the egs that have more archives with fewer egs each (this seems to + # slightly improve results). Eventually we'll disable the creation of the egs above. + image/nnet3/get_egs.sh --egs-per-archive 35000 --cmd "$cmd" data/train_all data/test exp/egs2 +fi diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1e.sh index 08eeba59c3d..4c578c20ad1 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1e.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1e.sh @@ -3,8 +3,8 @@ # run_tdnn_1e.sh is like run_tdnn_1d.sh but batchnorm components instead of renorm -exp/chain_cleaned/tdnn1d_sp_bi: num-iters=253 nj=2..12 num-params=7.0M dim=40+100->3597 combine=-0.098->-0.097 xent:train/valid[167,252,final]=(-1.40,-1.34,-1.34/-1.50,-1.46,-1.46) logprob:train/valid[167,252,final]=(-0.091,-0.083,-0.083/-0.104,-0.101,-0.101) -exp/chain_cleaned/tdnn1e_sp_bi/: num-iters=253 nj=2..12 num-params=7.0M dim=40+100->3597 combine=-0.095->-0.095 xent:train/valid[167,252,final]=(-1.37,-1.31,-1.31/-1.47,-1.44,-1.44) logprob:train/valid[167,252,final]=(-0.087,-0.078,-0.078/-0.102,-0.099,-0.099) +# exp/chain_cleaned/tdnn1d_sp_bi: num-iters=253 nj=2..12 num-params=7.0M dim=40+100->3597 combine=-0.098->-0.097 xent:train/valid[167,252,final]=(-1.40,-1.34,-1.34/-1.50,-1.46,-1.46) logprob:train/valid[167,252,final]=(-0.091,-0.083,-0.083/-0.104,-0.101,-0.101) +# exp/chain_cleaned/tdnn1e_sp_bi/: num-iters=253 nj=2..12 num-params=7.0M dim=40+100->3597 combine=-0.095->-0.095 xent:train/valid[167,252,final]=(-1.37,-1.31,-1.31/-1.47,-1.44,-1.44) logprob:train/valid[167,252,final]=(-0.087,-0.078,-0.078/-0.102,-0.099,-0.099) # local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn1d_sp_bi exp/chain_cleaned/tdnn1e_sp_bi # System tdnn1d_sp_bi tdnn1e_sp_bi @@ -49,7 +49,7 @@ nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned # are just hardcoded at this level, in the commands below. train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix=1d #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1e #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir= # you can set this to use previously dumped egs. # End configuration section. 
diff --git a/egs/wsj/s5/steps/info/nnet3_dir_info.pl b/egs/wsj/s5/steps/info/nnet3_dir_info.pl index ad4a86e4afd..06d07a63755 100755 --- a/egs/wsj/s5/steps/info/nnet3_dir_info.pl +++ b/egs/wsj/s5/steps/info/nnet3_dir_info.pl @@ -146,9 +146,9 @@ sub get_combine_info { sub number_to_string { my ($value, $name) = @_; my $precision; - if (abs($value) < 0.02 or ($name eq "accuracy" and abs($value) > 0.98)) { + if (abs($value) < 0.02 or ($name eq "accuracy" and abs($value) > 0.97)) { $precision = 4; - } elsif (abs($value) < 0.2 or ($name eq "accuracy" and abs($value) > 0.8)) { + } elsif (abs($value) < 0.2 or ($name eq "accuracy" and abs($value) > 0.7)) { $precision = 3; } else { $precision = 2; diff --git a/src/bin/ali-to-post.cc b/src/bin/ali-to-post.cc index 589d9d64afe..ac87d676c06 100644 --- a/src/bin/ali-to-post.cc +++ b/src/bin/ali-to-post.cc @@ -33,10 +33,18 @@ int main(int argc, char *argv[]) { typedef kaldi::int32 int32; try { const char *usage = - "Convert alignments to posteriors\n" + "Convert alignments to posteriors. This is simply a format change\n" + "from integer vectors to Posteriors, which are vectors of lists of\n" + "pairs (int, float) where the float represents the posterior. The\n" + "floats would all be 1.0 in this case.\n" + "The posteriors will still be in terms of whatever integer index\n" + "the input contained, which will be transition-ids if they came\n" + "directly from decoding, or pdf-ids if they were processed by\n" + "ali-to-post.\n" "Usage: ali-to-post [options] \n" "e.g.:\n" - " ali-to-post ark:1.ali ark:1.post\n"; + " ali-to-post ark:1.ali ark:1.post\n" + "See also: ali-to-pdf, ali-to-phones, show-alignments, post-to-weights\n"; ParseOptions po(usage); @@ -69,5 +77,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/bin/analyze-counts.cc b/src/bin/analyze-counts.cc index 80d43891696..6eeb90d30df 100644 --- a/src/bin/analyze-counts.cc +++ b/src/bin/analyze-counts.cc @@ -43,8 +43,9 @@ int main(int argc, char *argv[]) { "e.g.: \n" " analyze-counts ark:1.ali prior.counts\n" " Show phone counts by:\n" - " ali-to-phone --per-frame=true ark:1.ali ark:- |" - " analyze-counts --verbose=1 ark:- - >/dev/null\n"; + " ali-to-phones --per-frame=true ark:1.ali ark:- |" + " analyze-counts --verbose=1 ark:- - >/dev/null\n" + "Note: this is deprecated, see post-to-tacc.\n"; ParseOptions po(usage); diff --git a/src/configure b/src/configure index 4bfe6bc8470..1d062feccc2 100755 --- a/src/configure +++ b/src/configure @@ -663,7 +663,7 @@ function linux_configure_static { done if [ "$ATLASLIBS" == "" ]; then echo Could not find any libraries $ATLASLIBDIR/{liblapack,liblapack_atlas,libclapack} that seem to be an ATLAS CLAPACK library. 
- return ; + return 1; fi for x in lib${pt}cblas.a libatlas.a lib${pt}f77blas.a; do diff --git a/src/gst-plugin/Makefile b/src/gst-plugin/Makefile index e9dec8f78fe..a9b3a208ff1 100644 --- a/src/gst-plugin/Makefile +++ b/src/gst-plugin/Makefile @@ -17,7 +17,7 @@ EXTRA_LDLIBS += $(shell pkg-config --libs glib-2.0) #Kaldi shared libraries required by the GStreamer plugin EXTRA_LDLIBS += -lkaldi-online -lkaldi-lat -lkaldi-decoder -lkaldi-feat -lkaldi-transform \ -lkaldi-gmm -lkaldi-hmm \ - -lkaldi-tree -lkaldi-matrix -lkaldi-util -lkaldi-base -lkaldi-thread + -lkaldi-tree -lkaldi-matrix -lkaldi-util -lkaldi-base OBJFILES = gst-audio-source.o gst-online-gmm-decode-faster.o diff --git a/src/nnet3/nnet-compile-looped.cc b/src/nnet3/nnet-compile-looped.cc index 5701c010680..b0ca42f15ab 100644 --- a/src/nnet3/nnet-compile-looped.cc +++ b/src/nnet3/nnet-compile-looped.cc @@ -38,13 +38,25 @@ void ModifyNnetIvectorPeriod(int32 ivector_period, KALDI_ASSERT(b && "Could not parse config line."); if (config_line.FirstToken() == "component-node") { std::string whole_line = config_lines[i]; - std::string to_search_for = "ReplaceIndex(ivector, t, 0)"; + std::string to_search_for = "ReplaceIndex("; + std::string::size_type to_search_for_size = to_search_for.size(); std::string::size_type pos = whole_line.find(to_search_for); if (pos != std::string::npos) { - std::ostringstream to_replace_with; - to_replace_with << "Round(ivector, " << ivector_period << ")"; - whole_line.replace(pos, to_search_for.size(), to_replace_with.str()); - config_to_read << whole_line << "\n"; + std::string::size_type comma_pos = whole_line.find(',', pos); + if (comma_pos != std::string::npos) { + // if the line contained ReplaceIndex(ivector, t, 0), + // descriptor_name would now be 'ivector'. + std::string descriptor_name = + whole_line.substr(pos + to_search_for_size, + comma_pos - (pos + to_search_for_size)); + std::string::size_type end_pos = whole_line.find(')', pos); + std::string::size_type expr_size = end_pos + 1 - pos; + // e.g. expr_size would be strlen("ReplaceIndex(ivector, t, 0)"). + std::ostringstream to_replace_with; + to_replace_with << "Round(" << descriptor_name << ", " << ivector_period << ")"; + whole_line.replace(pos, expr_size, to_replace_with.str()); + config_to_read << whole_line << "\n"; + } } } } diff --git a/src/nnet3/nnet-compile-looped.h b/src/nnet3/nnet-compile-looped.h index f6ff47045fe..2ebb371ecc5 100644 --- a/src/nnet3/nnet-compile-looped.h +++ b/src/nnet3/nnet-compile-looped.h @@ -83,12 +83,12 @@ int32 GetChunkSize(const Nnet &nnet, We normally train neural networks that expect to see an iVector at frame zero only; this is because we train on fixed-size chunks and the iVector doesn't change that much within each chunk. However, expecting just one iVector - isn't that convenient for looped recognition because it changes with - time, so we modify the iVector input period in the network by replacing - expressions like ReplaceIndex(ivector, t, 0) or just "t", with - Round(ivector, 10) [assuming ivector_period == 10]. This won't work - in every conceivable network, but it does do what you want in the - cases of interest. + isn't that convenient for looped recognition because it changes with time, so + we modify the iVector input period in the network by replacing expressions + like ReplaceIndex(ivector, t, 0) with Round(ivector, 10) [assuming + ivector_period == 10]. The descriptor doesn't have to be named "ivector", it + would work for ReplaceIndex(foo, t, 0). 
This won't work in every conceivable + network, but it does do what you want in the cases of interest. It does this in a rather simple way, by getting the config lines that correspond to descriptors, and doing a search-and-replace. It's diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 38e2559ac62..4ad2543c602 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -3339,7 +3339,7 @@ class ComputationLoopedOptimizer { // Given a vector of lists, one list for each segment, of the active matrices // at the end of that segment, this function converts those lists into a - // different representation where each matrix is reprented as a pair instead + // different representation where each matrix is represented as a pair instead // of as a single int32. 'active_pairs' will have the same dimensions as // 'active_matrices'. static void ConvertListsToPairLists( @@ -3347,16 +3347,19 @@ class ComputationLoopedOptimizer { const std::vector > &matrix_to_pair, std::vector > > *active_pairs); - // This function modifies the lists of active matrices per segment - // (represented as pairs) in 'active_pairs' by sorting them and - // then subtracting the time-offset of the first pair in each - // list ((*active_pair)[seg][0].second), from all elements in that list. - // It puts the subtracted offset in (*time_offsets)[seg]. This change - // of representation makes it easy to tell whether the sets of active - // matrices for different segments are identical up to a time-offset. - static void NormalizePairLists( - std::vector > > *active_pairs, - std::vector *time_offsets); + // This function, used in FindFirstRepeat, tells us whether the two lists a + // and b are the same except for a possible time-shift. + // Each element of a or b is of the form (matrix-unique-index, time-offset). + // Let's suppose we have two pairs p1=(m1, o1) and p2=(m2, o2). + // For p2 to be equal to p1 except for a possible shift of value 'shift', we + // require m2 == m1 and either o2 == o1 + 'shift' or o2 == o1. + // This function returns true if a.size() == b.size() and for each + // i, b[i].first == a[i].first and b[i].second is either + // a[i].second or a[i].second + shift. + static bool ListsAreEqualExceptForPossibleShift( + const std::vector > &a, + const std::vector > &b, + int32 shift); // This function looks in the matrix 'active_pairs' for the first pair of // identical values, i.e. it is looking for i < j for which @@ -3376,18 +3379,23 @@ class ComputationLoopedOptimizer { // each segment should be shifted relative to the previous segment, by // 'time_shift_per_segment'. static bool FindFirstRepeat( - const std::vector > > &normalized_active_pairs, - const std::vector &time_offsets, + const std::vector > > &active_pairs, int32 time_shift_per_segment, int32 *seg1, int32 *seg2); - // Converts a list of pairs (e.g. one of the elements of the output of - // 'ConvertListsToPairLists)', back into a list of matrix indexes, using the - // map 'pair_to_matrix'. - static void PairListToMatrixList( - const std::vector > &pair_list, + + // 'pair_list1' is the list of active (unique-id, time-offset) pairs for one + // segment of the computation and 'pair_list2' is the same list for a later + // segment. The map 'pair_to_matrix' can convert these back into matrix + // indexes. This function will output two lists of matrices. These will just + // be 'pair_list1' and 'pair_list2' converted back into matrix indexes, + // except we omit pairs which are identical (i.e. 
the time-offset was zero). + static void GetIdentifiedMatrices( + const std::vector > &pair_list1, + const std::vector > &pair_list2, const unordered_map, int32, PairHasher > &pair_to_matrix, - std::vector *matrix_list); + std::vector *matrix_list1, + std::vector *matrix_list2); // This function just does some checking (via asserts), that @@ -3529,7 +3537,7 @@ int32 ComputationLoopedOptimizer::NormalizeCindexes( } if (iter == end) { // this should not happen. - KALDI_ERR << "All t value are kNoTime in matrix."; + KALDI_ERR << "All t values are kNoTime in matrix."; } iter = cindexes->begin(); for (; iter != end; iter++) @@ -3608,49 +3616,41 @@ void ComputationLoopedOptimizer::ConvertListsToPairLists( } // static -void ComputationLoopedOptimizer::NormalizePairLists( - std::vector > > *active_pairs, - std::vector *time_offsets) { - int32 num_segments = active_pairs->size(); - time_offsets->resize(num_segments); - for (int32 seg = 0; seg < num_segments; seg++) { - std::vector > &this_pairs = (*active_pairs)[seg]; - std::sort(this_pairs.begin(), this_pairs.end()); - int32 this_offset; - if (!this_pairs.empty()) { - this_offset = this_pairs[0].second; - } else { - // if this_pairs is empty, produce arbitrary offsets that are increasing - // (this will keep some self-testing code happy). - if (seg == 0) { this_offset = 0; } - else { this_offset = (*time_offsets)[seg - 1] + 1; } - } - (*time_offsets)[seg] = this_offset; - std::vector >::iterator - iter = this_pairs.begin(), end = this_pairs.end(); - for (; iter != end; ++iter) - iter->second -= this_offset; +bool ComputationLoopedOptimizer::ListsAreEqualExceptForPossibleShift( + const std::vector > &a, + const std::vector > &b, + int32 shift) { + size_t size = a.size(); + if (b.size() != size) + return false; + for (size_t i = 0; i < size; i++) { + const std::pair &p1 = a[i], + &p2 = b[i]; + if (p1.first != p2.first) + return false; + if (p2.second != p1.second + shift && p2.second != p1.second) + return false; } + return true; } - // static bool ComputationLoopedOptimizer::FindFirstRepeat( - const std::vector > > &normalized_active_pairs, - const std::vector &time_offsets, + const std::vector > > &active_pairs, int32 time_shift_per_segment, int32 *seg1, int32 *seg2) { - int32 num_segments = normalized_active_pairs.size(); + int32 num_segments = active_pairs.size(); // This algorithm may seem like it would be very slow, but the number of // segments will normally be quite small (e.g. 10), and the comparison of - // elements of 'normalized_active_pairs' should be fast in cases where they + // elements of 'active_pairs' should be fast in cases where they // differ. 
KALDI_ASSERT(num_segments >= 2); for (int32 s = 0; s < num_segments; s++) { for (int32 t = s + 1; t < num_segments; t++) { - if ((time_offsets[t]-time_offsets[s] == (t-s) * time_shift_per_segment) && - normalized_active_pairs[s] == normalized_active_pairs[t]) { + if (ListsAreEqualExceptForPossibleShift(active_pairs[s], + active_pairs[t], + (t - s) * time_shift_per_segment)) { *seg1 = s; *seg2 = t; return true; @@ -3661,22 +3661,35 @@ bool ComputationLoopedOptimizer::FindFirstRepeat( } // static -void ComputationLoopedOptimizer::PairListToMatrixList( - const std::vector > &pair_list, +void ComputationLoopedOptimizer::GetIdentifiedMatrices( + const std::vector > &pair_list1, + const std::vector > &pair_list2, const unordered_map, int32, PairHasher > &pair_to_matrix, - std::vector *matrix_list) { - matrix_list->resize(pair_list.size()); + std::vector *matrix_list1, + std::vector *matrix_list2) { + size_t size = pair_list1.size(); + KALDI_ASSERT(pair_list2.size() == size); + matrix_list1->clear(); + matrix_list2->clear(); + matrix_list1->reserve(size); + matrix_list2->reserve(size); std::vector >::const_iterator - iter = pair_list.begin(), end = pair_list.end(); - std::vector::iterator out_iter = matrix_list->begin(); - for (; iter != end; ++iter, ++out_iter) { + iter1 = pair_list1.begin(), end1 = pair_list1.end(), + iter2 = pair_list2.begin(); + for (; iter1 != end1; ++iter1, ++iter2) { + if (iter1->second == iter2->second) + continue; + // skip those that have no time shift, we won't have to do any swapping for + // those. unordered_map, int32, PairHasher >::const_iterator - map_iter = pair_to_matrix.find(*iter); - if (map_iter == pair_to_matrix.end()) { + map_iter1 = pair_to_matrix.find(*iter1), + map_iter2 = pair_to_matrix.find(*iter2); + if (map_iter1 == pair_to_matrix.end() || + map_iter2 == pair_to_matrix.end()) KALDI_ERR << "Could not find pair in map (code error)"; - } - *out_iter = map_iter->second; + matrix_list1->push_back(map_iter1->second); + matrix_list2->push_back(map_iter2->second); } } @@ -3895,7 +3908,7 @@ bool ComputationLoopedOptimizer::Optimize() { std::vector > matrix_to_pair; CreateMatrixPairs(*computation_, &matrix_to_pair); - // Create the reverse map from pair to matrix index; we'll need it. + // Create the reverse map from pair to matrix index; we'll need it later. unordered_map, int32, PairHasher > pair_to_matrix; GetPairToMatrixMap(matrix_to_pair, &pair_to_matrix); @@ -3904,34 +3917,25 @@ bool ComputationLoopedOptimizer::Optimize() { ConvertListsToPairLists(active_matrices, matrix_to_pair, &pair_lists); - std::vector time_offsets; - NormalizePairLists(&pair_lists, &time_offsets); - // Note: seg1 and seg2 are indexes into 'splice_points', representing // potential splice points (located near the beginnings of segments). int32 seg1, seg2; if (!FindFirstRepeat(pair_lists, - time_offsets, time_shift_per_segment, &seg1, &seg2)) { KALDI_VLOG(2) << "Could not find repeats of variables."; return false; } - // reverse the normalization for segments seg1 and seg2. 
- for (size_t i = 0; i < pair_lists[seg1].size(); i++) - pair_lists[seg1][i].second += time_offsets[seg1]; - for (size_t i = 0; i < pair_lists[seg2].size(); i++) - pair_lists[seg2][i].second += time_offsets[seg2]; std::vector seg1_matrices, seg2_matrices; - PairListToMatrixList(pair_lists[seg1], pair_to_matrix, &seg1_matrices); - PairListToMatrixList(pair_lists[seg2], pair_to_matrix, &seg2_matrices); + GetIdentifiedMatrices(pair_lists[seg1], pair_lists[seg2], + pair_to_matrix, + &seg1_matrices, &seg2_matrices); - int32 time_difference = time_offsets[seg2] - time_offsets[seg1]; + int32 time_difference = time_shift_per_segment * (seg2 - seg1); CheckIdentifiedMatrices(*computation_, seg1_matrices, seg2_matrices, time_difference); - FormInfiniteLoop(splice_points[seg1], splice_points[seg2], computation_); AddMatrixSwapCommands(seg1_matrices, seg2_matrices, computation_); diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index da19b477337..f1e47b2794b 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -2670,8 +2670,8 @@ std::string NaturalGradientAffineComponent::Info() const { PrintParameterStats(stream, "bias", bias_params_, true); stream << ", rank-in=" << rank_in_ << ", rank-out=" << rank_out_ - << ", num_samples_history=" << num_samples_history_ - << ", update_period=" << update_period_ + << ", num-samples-history=" << num_samples_history_ + << ", update-period=" << update_period_ << ", alpha=" << alpha_; return stream.str(); } @@ -5375,7 +5375,8 @@ std::string BatchNormComponent::Info() const { std::ostringstream stream; stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ << ", epsilon=" << epsilon_ << ", target-rms=" << target_rms_ - << ", count=" << count_; + << ", count=" << count_ + << ", test-mode=" << (test_mode_ ? "true" : "false"); if (count_ > 0) { Vector mean(stats_sum_), var(stats_sumsq_); mean.Scale(1.0 / count_); diff --git a/tools/extras/install_mmseg.sh b/tools/extras/install_mmseg.sh index b931b93674f..586740b5cbc 100755 --- a/tools/extras/install_mmseg.sh +++ b/tools/extras/install_mmseg.sh @@ -39,6 +39,7 @@ if [ -d ./mmseg-1.3.0 ] ; then echo >&2 "$0: Warning: old installation of mmseg found. You should manually" echo >&2 " delete the directory tools/mmseg and " echo >&2 " edit the file tools/env.sh and remove manually all references to it" + exit 1 fi if [ ! -d ./mmseg-1.3.0 ] ; then @@ -46,22 +47,39 @@ if [ ! -d ./mmseg-1.3.0 ] ; then tar xf mmseg-1.3.0.tar.gz fi -pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'` -export PYTHONPATH=$PYTHONPATH:`pwd`/mmseg-1.3.0/lib/python${pyver}/site-packages +( cd mmseg-1.3.0 -mkdir -p lib/python${pyver}/site-packages +pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'` +export PYTHONPATH=$PYTHONPATH:$PWD/lib/python${pyver}/site-packages/:$PWD/lib64/python${pyver}/site-packages/ +# we have to create those dir, as the install target does not create it +mkdir -p $PWD/lib/python${pyver}/site-packages/ +mkdir -p $PWD/lib64/python${pyver}/site-packages/ python setup.py build python setup.py install --prefix `pwd` -cd ../ - -( - set +u - pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'` - wd=`pwd` +) - [ -f ./env.sh ] && . 
./env.sh +## we first find the mmseg.py file (the module name which will be imported, +## so that should be pretty reliable) and then we work out the location of +## the site-packages directory (typically it would be one level up from +## the location of the mmseg.py file but using find seems more reliable +mmseg_file_lib=$(find ./mmseg-1.3.0/lib/ -type f -name mmseg.py | head -n1) +mmseg_file_lib64=$(find ./mmseg-1.3.0/lib64/ -type f -name mmseg.py | head -n1) +if [ ! -z ${mmseg_file_lib+x} ]; then + lib_dir=./lib/ +elif [ ! -z ${mmseg_file_lib64+x} ]; then + lib_dir=./lib64/ +else + echo >&2 "$0: ERROR: Didn't find ./mmseg-1.3.0/lib/ or ./mmseg-1.3.0/lib64/" + echo >&2 " Perhaps your python or system installs python modules into" + echo >&2 " a different dir or some other unknown issues arised. Review the output" + echo >&2 " of the script and try to figure out what went wrong." + exit 1 +fi - echo "export PYTHONPATH=\$PYTHONPATH:$wd/mmseg-1.3.0/lib/python${pyver}/site-packages" +site_packages_dir=$(cd ./mmseg-1.3.0; find $lib_dir -name "site-packages" -type d | head -n1) +( + echo "export MMSEG=\"$PWD/mmseg-1.3.0\"" + echo "export PYTHONPATH=\"\${PYTHONPATH:-}:\$MMSEG/${site_packages_dir}\"" ) >> env.sh echo >&2 "Installation of mmseg finished successfully" diff --git a/tools/extras/install_sequitur.sh b/tools/extras/install_sequitur.sh index ba6d028edad..f14057bb494 100755 --- a/tools/extras/install_sequitur.sh +++ b/tools/extras/install_sequitur.sh @@ -11,7 +11,6 @@ fi ! [ `basename $PWD` == tools ] && \ echo "You must call this script from the tools/ directory" && exit 1; - # Install python-devel package if not already available # first, makes sure distutils.sysconfig usable if ! $(python -c "import distutils.sysconfig" &> /dev/null); then @@ -46,6 +45,7 @@ if [ -d ./g2p ] || [ -d sequitur ] ; then echo >&2 "$0: Warning: old installation of Sequitur found. You should manually" echo >&2 " delete the directories tools/sequitur and/or tools/g2p and " echo >&2 " edit the file tools/env.sh and remove manually all references to it" + exit 1 fi if [ ! -d ./sequitur-g2p ] ; then @@ -67,13 +67,12 @@ fi #in a couple of months. ln -sf sequitur-g2p sequitur - +( cd sequitur-g2p make CXX=g++ CC=gcc python setup.py install --prefix `pwd` - -cd ../ - +) +site_packages_dir=$(cd sequitur-g2p; find ./lib{,64} -type d -name site-packages | head -n 1) ( set +u [ ! -z "${SEQUITUR}" ] && \ @@ -88,10 +87,9 @@ cd ../ wd=`pwd` wd=`readlink -f $wd || pwd` - echo "export SEQUITUR=$wd/sequitur-g2p" - echo "export PATH=\$PATH:\${SEQUITUR}/bin" - echo "_site_packages=\`find \${SEQUITUR}/lib -type d -regex '.*python.*/site-packages'\`" - echo "export PYTHONPATH=\${PYTHONPATH:-}:\$_site_packages" + echo "export SEQUITUR=\"$wd/sequitur-g2p\"" + echo "export PATH=\"\$PATH:\${SEQUITUR}/bin\"" + echo "export PYTHONPATH=\"\${PYTHONPATH:-}:\$SEQUITUR/${site_packages_dir}\"" ) >> env.sh echo >&2 "Installation of SEQUITUR finished successfully"
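
Note on the env.sh handling above (illustrative sketch only, not part of the patch): in the new install_sequitur.sh, ${site_packages_dir} is whatever directory the find command locates under sequitur-g2p/lib or lib64, so the lines appended to tools/env.sh would come out roughly like the following; the exact path (lib vs. lib64, python version) depends on the local Python installation and the "/path/to/kaldi" prefix is a placeholder:

    export SEQUITUR="/path/to/kaldi/tools/sequitur-g2p"
    export PATH="$PATH:${SEQUITUR}/bin"
    export PYTHONPATH="${PYTHONPATH:-}:$SEQUITUR/./lib/python2.7/site-packages"

The revised install_mmseg.sh follows the same pattern, exporting MMSEG and appending the discovered site-packages directory under mmseg-1.3.0 to PYTHONPATH, rather than hard-coding the path as before.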