From 8b415fe52dd3b805a69a0272586e724db1f9e49c Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Sat, 24 Mar 2018 15:45:38 -0500 Subject: [PATCH 01/12] [egs] remove redundant step from chime5 recipe (#2306) --- egs/chime5/s5/run.sh | 8 -------- 1 file changed, 8 deletions(-) diff --git a/egs/chime5/s5/run.sh b/egs/chime5/s5/run.sh index d80172872ed..c63249b086b 100755 --- a/egs/chime5/s5/run.sh +++ b/egs/chime5/s5/run.sh @@ -168,14 +168,6 @@ if [ $stage -le 11 ]; then fi if [ $stage -le 12 ]; then - utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph - for dset in ${test_sets}; do - steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ - exp/tri2/graph data/${dset} exp/tri2/decode_${dset} - done -fi - -if [ $stage -le 13 ]; then utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph for dset in ${test_sets}; do steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ From 9f7e55a0a90481f410e03e91eabf08df081620c8 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 24 Mar 2018 18:57:39 -0400 Subject: [PATCH 02/12] [src] Fix issue with CUDA device initialization if 'wait' specified. Thx: @olix20 (#2295) --- src/cudamatrix/cu-device.cc | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 87e266e1889..c5114ed8b22 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -62,8 +62,10 @@ static bool GetCudaContext(int32 num_gpus, std::string *debug_str) { // Our first attempt to get a device context is: we do cudaFree(0) and see if // that returns no error code. If it succeeds then we have a device // context. Apparently this is the canonical way to get a context. - if (cudaFree(0) == 0) + if (cudaFree(0) == 0) { + cudaGetLastError(); // Clear any error status. return true; + } // The rest of this code represents how we used to get a device context, but // now its purpose is mainly a debugging one. @@ -71,16 +73,18 @@ static bool GetCudaContext(int32 num_gpus, std::string *debug_str) { debug_stream << "num-gpus=" << num_gpus << ". "; for (int32 device = 0; device < num_gpus; device++) { cudaSetDevice(device); - cudaError_t e = cudaDeviceSynchronize(); // << CUDA context gets created here. + cudaError_t e = cudaFree(0); // CUDA context gets created here. if (e == cudaSuccess) { - *debug_str = debug_stream.str(); + if (debug_str) + *debug_str = debug_stream.str(); + cudaGetLastError(); // Make sure the error state doesn't get returned in + // the next cudaGetLastError(). return true; } debug_stream << "Device " << device << ": " << cudaGetErrorString(e) << ". "; - cudaGetLastError(); // Make sure the error state doesn't get returned in - // the next cudaGetLastError(). } - *debug_str = debug_stream.str(); + if (debug_str) + *debug_str = debug_stream.str(); return false; } @@ -164,7 +168,7 @@ void CuDevice::SelectGpuId(std::string use_gpu) { } else { int32 num_times = 0; BaseFloat wait_time = 0.0; - while (! 
got_context) { + while (!got_context) { int32 sec_sleep = 5; if (num_times == 0) KALDI_WARN << "Will try again indefinitely every " << sec_sleep @@ -172,7 +176,7 @@ void CuDevice::SelectGpuId(std::string use_gpu) { num_times++; wait_time += sec_sleep; Sleep(sec_sleep); - got_context = GetCudaContext(num_gpus, &debug_str); + got_context = GetCudaContext(num_gpus, NULL); } KALDI_WARN << "Waited " << wait_time From e03dd12ec7f8f2872708224687868d56beeb1975 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sun, 25 Mar 2018 04:59:43 +0430 Subject: [PATCH 03/12] [scripts,egs] Fix to perturb_speed_to_allowed_lengths.py; egs fix (thanks: @calderma) (#2307) --- egs/wsj/s5/local/e2e/run_end2end_char.sh | 6 ++--- .../data/perturb_speed_to_allowed_lengths.py | 22 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/egs/wsj/s5/local/e2e/run_end2end_char.sh b/egs/wsj/s5/local/e2e/run_end2end_char.sh index 6c3786411cc..303a6456159 100755 --- a/egs/wsj/s5/local/e2e/run_end2end_char.sh +++ b/egs/wsj/s5/local/e2e/run_end2end_char.sh @@ -34,10 +34,10 @@ wsj1=/export/corpora5/LDC/LDC94S13B # _char for character-based dictionary and lang directories. if [ $stage -le 0 ]; then - [[ -f data/train_si284/text ]] || \ + [[ -d data/local/data ]] || \ local/wsj_data_prep.sh $wsj0/??-{?,??}.? $wsj1/??-{?,??}.? [[ -f data/local/dict_nosp/lexicon.txt ]] || \ - local/wsj_prepare_phn_dict.sh --dict-suffix "_nosp" + local/wsj_prepare_dict.sh --dict-suffix "_nosp" local/wsj_prepare_char_dict.sh utils/prepare_lang.sh data/local/dict_char \ @@ -105,7 +105,7 @@ if [ $stage -le 5 ]; then mkdir -p exp/chain/e2e_base/log $train_cmd exp/chain/e2e_base/log/make_char_lm.log \ cat data/$trainset/text \| \ - steps/nnet3/chain/e2e/text_to_phones.py data/lang_char data/local/dict_char/lexicon.txt \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang_char \| \ utils/sym2int.pl -f 2- data/lang_char/phones.txt \| \ chain-est-phone-lm --num-extra-lm-states=2000 \ ark:- exp/chain/e2e_base/char_lm.fst diff --git a/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py b/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py index 83d0b227767..c6bdb95cb2f 100755 --- a/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py +++ b/egs/wsj/s5/utils/data/perturb_speed_to_allowed_lengths.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2017 Hossein Hadian # Apache 2.0 @@ -113,7 +113,7 @@ def read_kaldi_datadir(dir): else: num_fail += 1 - if len(utterances) / len(wav_scp) < 0.5: + if float(len(utterances)) / len(wav_scp) < 0.5: logger.info("More than half your data is problematic. 
Try " "fixing using fix_data_dir.sh.") sys.exit(1) @@ -128,7 +128,7 @@ def read_kaldi_mapfile(path): """ m = {} - with open(path, 'r') as f: + with open(path, 'r', encoding='latin-1') as f: for line in f: line = line.strip() sp_pos = line.find(' ') @@ -145,19 +145,19 @@ def generate_kaldi_data_files(utterances, outdir): logger.info("Exporting to {}...".format(outdir)) speakers = {} - with open(os.path.join(outdir, 'text'), 'w') as f: + with open(os.path.join(outdir, 'text'), 'w', encoding='latin-1') as f: for utt in utterances: f.write(utt.to_kaldi_utt_str() + "\n") - with open(os.path.join(outdir, 'wav.scp'), 'w') as f: + with open(os.path.join(outdir, 'wav.scp'), 'w', encoding='latin-1') as f: for utt in utterances: f.write(utt.to_kaldi_wave_str() + "\n") - with open(os.path.join(outdir, 'utt2dur'), 'w') as f: + with open(os.path.join(outdir, 'utt2dur'), 'w', encoding='latin-1') as f: for utt in utterances: f.write(utt.to_kaldi_dur_str() + "\n") - with open(os.path.join(outdir, 'utt2spk'), 'w') as f: + with open(os.path.join(outdir, 'utt2spk'), 'w', encoding='latin-1') as f: for utt in utterances: f.write(utt.id + " " + utt.speaker + "\n") if utt.speaker not in speakers: @@ -165,7 +165,7 @@ def generate_kaldi_data_files(utterances, outdir): else: speakers[utt.speaker].append(utt.id) - with open(os.path.join(outdir, 'spk2utt'), 'w') as f: + with open(os.path.join(outdir, 'spk2utt'), 'w', encoding='latin-1') as f: for s in speakers: f.write(s + " ") for utt in speakers[s]: @@ -222,8 +222,8 @@ def find_allowed_durations(start_dur, end_dur, args): allowed_durations = [] d = start_dur - with open(os.path.join(args.dir, 'allowed_durs.txt'), 'wb') as durs_fp, \ - open(os.path.join(args.dir, 'allowed_lengths.txt'), 'wb') as lengths_fp: + with open(os.path.join(args.dir, 'allowed_durs.txt'), 'w', encoding='latin-1') as durs_fp, \ + open(os.path.join(args.dir, 'allowed_lengths.txt'), 'w', encoding='latin-1') as lengths_fp: while d < end_dur: length = int(d * 1000 - args.frame_length) / args.frame_shift + 1 if length % args.frame_subsampling_factor != 0: @@ -233,7 +233,7 @@ def find_allowed_durations(start_dur, end_dur, args): + args.frame_length + args.frame_shift / 2) / 1000.0 allowed_durations.append(d) durs_fp.write("{}\n".format(d)) - lengths_fp.write("{}\n".format(length)) + lengths_fp.write("{}\n".format(int(length))) d *= args.factor return allowed_durations From cc16eecee328757811bcb83218ef33447bfa0e00 Mon Sep 17 00:00:00 2001 From: Development and research at SailLabs <37703153+saillabs1@users.noreply.github.com> Date: Mon, 26 Mar 2018 21:31:40 +0200 Subject: [PATCH 04/12] [windows] fix for compiling on Windows VS2017 (15.5.2) (#2310) - adapted install instructions to reflect current dependencies --- src/ivector/agglomerative-clustering.h | 1 + windows/INSTALL.md | 13 +++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/ivector/agglomerative-clustering.h b/src/ivector/agglomerative-clustering.h index f260c4c3c8b..310a336f8b5 100644 --- a/src/ivector/agglomerative-clustering.h +++ b/src/ivector/agglomerative-clustering.h @@ -25,6 +25,7 @@ #include #include #include +#include #include "base/kaldi-common.h" #include "matrix/matrix-lib.h" #include "util/stl-utils.h" diff --git a/windows/INSTALL.md b/windows/INSTALL.md index 4c670e672b1..cd9c77b1776 100644 --- a/windows/INSTALL.md +++ b/windows/INSTALL.md @@ -34,6 +34,9 @@ For cygwin installation, see the instructions in `../INSTALL`. 
## Steps ## Compiling OpenFST + +Skip this section, if you have downloaded OpenFST project from https://github.com/kkm000/openfst.git and it already contains openfst.sln file in the root folder. If it is present you can directly open it with Visual Studio 17 and you do not need CMake. +------------------------- For compilation of OpenFST, you will need CMake installed. Simply go to https://cmake.org/download/ and download and install. Then, in the command line, run the following commands. Be very careful about writing the commands verbatim! @@ -71,6 +74,8 @@ The last command will generate output looking similarly to this. Do not try to r -- Build files have been written to: C:/Users/jtrmal/Documents/openfst/build64 In the directory `build64`, find the file `openfst.sln` and open it using Visual Studio 17. +------------------------- + **Switch the configuration to `debug|Win64` and build the solution.** **Do the same for configuration `release|Win64`.** @@ -133,15 +138,14 @@ for their processors. It isn't free, but you can get [Community Licensing for In If you plan to use MKL, you can ignore the `OPENBLASDIR` path. If you plan to use OpenBLAS, you can ignore the `MKLDIR` path. - No matter what you plan to use, set both the `OPENFST*` and `PTHREADW` - variables correctly + No matter what you plan to use, set `OPENFST*` variable correctly. 6. For OpenBLAS support, copy the file `kaldiwin_openblas.props` to `kaldiwin.props` 7. For MKL support, copy the `kaldiwin_mkl.props` to `kaldiwin.props` 8. Call the script that generates the MSVC solution - ./generate_solution.pl --vsver [--enable-cuda] [--enable-openblas] [--enable-mkl] + generate_solution.pl --vsver [--enable-cuda] [--enable-openblas] [--enable-mkl] `--enable-mkl` is the default so you shouldn't need to use it. If `--enable-openblas` is passed it disables MKL support. CUDA is disabled by default. The default Visual Studio version is 15.0 (Visual Studio 2017). @@ -160,7 +164,8 @@ for their processors. It isn't free, but you can get [Community Licensing for In (kaldi)/windows$ get_version.pl -10. Open the generated solution in the visual studio and switch to **Debug|x64** (or **Release|x64**) and build. +10. Open the generated solution that was created in a subfolder (kaldi)/kaldiwin_vs_ + in the visual studio and switch to **Debug|x64** (or **Release|x64**) and build. Expect 10 projects to fail, majority of them will fail because of missing include `portaudio.h`. The tests will fail to compile too -- this is because of deficiency of the script generate_solution.pl. We might fix it later on. 
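The revised windows/INSTALL.md above spreads the Windows build across several numbered steps. For orientation, the whole sequence can be sketched as the rough command outline below; this is an illustrative sketch rather than a verbatim copy of the instructions, and the CMake generator string and the exact name of the generated kaldiwin_vs_* folder are assumptions that depend on the local Visual Studio installation.

    REM --- OpenFST: skip the CMake part if openfst.sln already exists in the repo root ---
    git clone https://github.com/kkm000/openfst.git
    cd openfst
    mkdir build64
    cd build64
    REM The generator name below is an assumption for a 64-bit VS2017 build.
    cmake .. -G "Visual Studio 15 2017 Win64"
    REM Open build64\openfst.sln in Visual Studio 2017 and build Debug|Win64, then Release|Win64.

    REM --- Kaldi solution: run from (kaldi)/windows after setting the OPENFST* path variable ---
    copy kaldiwin_mkl.props kaldiwin.props
    REM (for OpenBLAS, copy kaldiwin_openblas.props instead and pass --enable-openblas below)
    generate_solution.pl
    REM (--vsver defaults to 15.0, i.e. Visual Studio 2017, per the notes above; CUDA is off by default)
    get_version.pl
    REM Open the solution generated under (kaldi)/kaldiwin_vs_* and build Debug|x64 or Release|x64.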
From 7352760ef80960137066b022f2dd5ce29e1c2835 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Tue, 27 Mar 2018 12:56:06 +0800 Subject: [PATCH 05/12] [egs] minor fix for fisher_swbd scripts and swbd nnet3 scripts (#2316) --- egs/aspire/s5/local/nnet3/run_tdnn.sh | 2 +- egs/csj/s5/local/nnet3/run_tdnn.sh | 2 +- egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh | 1 - egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh | 1 - egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh | 16 +++++++++------- egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh | 16 +++++++++------- .../s5/local/chain/run_tdnn_lstm_1a.sh | 16 +++++++++------- .../s5/local/chain/run_tdnn_lstm_1b.sh | 16 +++++++++------- .../s5/local/chain/run_tdnn_opgru_1a.sh | 16 +++++++++------- .../s5/local/chain/run_tdnn_opgru_1b.sh | 16 +++++++++------- egs/swbd/s5c/local/nnet3/tuning/run_tdnn_c.sh | 2 +- egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh | 2 +- egs/swbd/s5c/local/nnet3/tuning/run_tdnn_e.sh | 2 +- .../s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh | 2 +- .../s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh | 2 +- .../s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh | 2 +- 16 files changed, 62 insertions(+), 52 deletions(-) diff --git a/egs/aspire/s5/local/nnet3/run_tdnn.sh b/egs/aspire/s5/local/nnet3/run_tdnn.sh index 6dffe45e04f..8e6a45ccbb4 100755 --- a/egs/aspire/s5/local/nnet3/run_tdnn.sh +++ b/egs/aspire/s5/local/nnet3/run_tdnn.sh @@ -52,7 +52,7 @@ if [ $stage -le 7 ]; then relu-renorm-layer name=tdnn4 dim=1248 input=Append(-3,3) relu-renorm-layer name=tdnn5 dim=1248 input=Append(-7,2) relu-renorm-layer name=tdnn6 dim=1248 - output-layer name=output dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec + output-layer name=output dim=$num_targets max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ fi diff --git a/egs/csj/s5/local/nnet3/run_tdnn.sh b/egs/csj/s5/local/nnet3/run_tdnn.sh index 0b8fc368561..e656b825517 100755 --- a/egs/csj/s5/local/nnet3/run_tdnn.sh +++ b/egs/csj/s5/local/nnet3/run_tdnn.sh @@ -72,7 +72,7 @@ if [ $stage -le 9 ]; then relu-renorm-layer name=tdnn5 input=Append(-7,2) dim=1024 relu-renorm-layer name=tdnn6 dim=1024 - output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh index 00c74ee8a56..66f87c8da8f 100755 --- a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh @@ -171,7 +171,6 @@ if [ $stage -le 15 ]; then steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi ) & done fi diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh index 03d362ef552..cbf0ef6cb6c 100755 --- a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh @@ -244,7 +244,6 @@ if [ $stage -le 15 ]; then steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || 
exit 1; - fi ) & done fi diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh index 0150a3b6d03..12b3187a5fa 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh @@ -218,6 +218,7 @@ fi decode_suff=fsh_sw1_tg graph_dir=$dir/graph_fsh_sw1_tg if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true if [ ! -z $decode_iter ]; then iter_opts=" --iter $decode_iter " fi @@ -228,13 +229,16 @@ if [ $stage -le 15 ]; then --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ $graph_dir data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi - ) & + ) || touch $dir/.error & done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi fi test_online_decoding=true @@ -256,11 +260,9 @@ if $test_online_decoding && [ $stage -le 16 ]; then --acwt 1.0 --post-decode-acwt 10.0 \ $graph_dir data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi ) || touch $dir/.error & done wait diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh index 6255ba39457..7d640c3262a 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh @@ -227,6 +227,7 @@ fi decode_suff=fsh_sw1_tg graph_dir=$dir/graph_fsh_sw1_tg if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true if [ ! 
-z $decode_iter ]; then iter_opts=" --iter $decode_iter " fi @@ -237,13 +238,16 @@ if [ $stage -le 15 ]; then --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ $graph_dir data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi - ) & + ) || touch $dir/.error & done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi fi test_online_decoding=true @@ -265,11 +269,9 @@ if $test_online_decoding && [ $stage -le 16 ]; then --acwt 1.0 --post-decode-acwt 10.0 \ $graph_dir data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi ) || touch $dir/.error & done wait diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh index bccd61533d2..07e88b59ddc 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh @@ -238,6 +238,7 @@ fi decode_suff=fsh_sw1_tg graph_dir=$dir/graph_fsh_sw1_tg if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; @@ -256,13 +257,16 @@ if [ $stage -le 15 ]; then --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ $graph_dir data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi - ) & + ) || touch $dir/.error & done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi fi test_online_decoding=true @@ -284,11 +288,9 @@ if $test_online_decoding && [ $stage -le 16 ]; then --acwt 1.0 --post-decode-acwt 10.0 \ $graph_dir data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi ) || touch $dir/.error & done wait diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh index 2272f746ab3..c9d50d1f7bd 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh @@ -248,6 +248,7 @@ fi decode_suff=fsh_sw1_tg graph_dir=$dir/graph_fsh_sw1_tg if [ $stage -le 15 ]; then + rm $dir/.error 
2>/dev/null || true [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; if [ ! -z $decode_iter ]; then @@ -265,13 +266,16 @@ if [ $stage -le 15 ]; then --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ $graph_dir data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi - ) & + ) || touch $dir/.error & done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi fi test_online_decoding=false @@ -293,11 +297,9 @@ if $test_online_decoding && [ $stage -le 16 ]; then --acwt 1.0 --post-decode-acwt 10.0 \ $graph_dir data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi ) || touch $dir/.error & done wait diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh index 737e0571b07..1cce08abeee 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh @@ -245,6 +245,7 @@ fi decode_suff=fsh_sw1_tg graph_dir=$dir/graph_fsh_sw1_tg if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; @@ -263,13 +264,16 @@ if [ $stage -le 15 ]; then --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ $graph_dir data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi - ) & + ) || touch $dir/.error & done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi fi test_online_decoding=true @@ -291,11 +295,9 @@ if $test_online_decoding && [ $stage -le 16 ]; then --acwt 1.0 --post-decode-acwt 10.0 \ $graph_dir data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi ) || touch $dir/.error & done wait diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh index 762db86a8cf..2334c6a1bc1 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh +++ 
b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh @@ -246,6 +246,7 @@ fi decode_suff=fsh_sw1_tg graph_dir=$dir/graph_fsh_sw1_tg if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; if [ ! -z $decode_iter ]; then @@ -263,13 +264,16 @@ if [ $stage -le 15 ]; then --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ $graph_dir data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi - ) & + ) || touch $dir/.error & done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi fi test_online_decoding=true @@ -291,11 +295,9 @@ if $test_online_decoding && [ $stage -le 16 ]; then --acwt 1.0 --post-decode-acwt 10.0 \ $graph_dir data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; - if $has_fisher; then - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; - fi ) || touch $dir/.error & done wait diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_c.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_c.sh index fab4b0a03e4..49f8ab62247 100644 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_c.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_c.sh @@ -72,7 +72,7 @@ if [ $stage -le 9 ]; then relu-renorm-layer name=tdnn4 input=Append(-7,2) dim=1024 relu-renorm-layer name=tdnn5 dim=1024 - output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec + output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh index fef5d349867..427678da17b 100755 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_d.sh @@ -74,7 +74,7 @@ if [ $stage -le 9 ]; then relu-renorm-layer name=tdnn5 input=Append(-7,2) dim=1024 relu-renorm-layer name=tdnn6 dim=1024 - output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_e.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_e.sh index 9b5338c76ae..974f697d651 100755 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_e.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_e.sh @@ -74,7 +74,7 @@ if [ $stage -le 9 ]; then relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 - output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 
presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec + output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh index a470cc7f06f..02e637286b5 100755 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh @@ -98,7 +98,7 @@ if [ $stage -le 11 ]; then relu-renorm-layer name=tdnn4 input=Append(-7,2) dim=1024 relu-renorm-layer name=tdnn5 dim=1024 - output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec + output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh index dc8dac90aea..67fd3c03d27 100755 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh @@ -100,7 +100,7 @@ if [ $stage -le 11 ]; then relu-renorm-layer name=tdnn5 input=Append(-7,2) dim=1024 relu-renorm-layer name=tdnn6 dim=1024 - output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh index 285328d58eb..260116666a0 100755 --- a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh @@ -97,7 +97,7 @@ if [ $stage -le 11 ]; then relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 - output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec + output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ From e5b6696c00aaa0003c32e82153b40b12c0ce547e Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Wed, 28 Mar 2018 01:18:49 +0430 Subject: [PATCH 06/12] [egs] Add end-to-end OCR recipe for IAM (thanks: @aarora8) (#2311) --- egs/cifar/v1/image/get_allowed_lengths.py | 143 ++++++++++ egs/cifar/v1/image/get_image2num_frames.py | 62 +++++ egs/iam/v1/local/chain/run_cnn_chainali_1c.sh | 246 ++++++++++++++++++ egs/iam/v1/local/chain/run_flatstart_cnn1a.sh | 165 ++++++++++++ egs/iam/v1/local/make_features.py | 56 +++- egs/iam/v1/run_end2end.sh | 76 ++++++ 6 files changed, 740 insertions(+), 8 deletions(-) create mode 100755 egs/cifar/v1/image/get_allowed_lengths.py create mode 100755 egs/cifar/v1/image/get_image2num_frames.py create mode 100755 egs/iam/v1/local/chain/run_cnn_chainali_1c.sh create mode 100755 egs/iam/v1/local/chain/run_flatstart_cnn1a.sh create mode 100755 egs/iam/v1/run_end2end.sh diff --git a/egs/cifar/v1/image/get_allowed_lengths.py b/egs/cifar/v1/image/get_allowed_lengths.py new file mode 100755 index 00000000000..02321fdd2df --- /dev/null +++ b/egs/cifar/v1/image/get_allowed_lengths.py @@ -0,0 +1,143 
@@ +#!/usr/bin/env python3 + +# Copyright 2017 Hossein Hadian +# Apache 2.0 + + +""" This script finds a set of allowed lengths for a given OCR/HWR data dir. + The allowed lengths are spaced by a factor (like 10%) and are written + in an output file named "allowed_lengths.txt" in the output data dir. This + file is later used by make_features.py to pad each image sufficiently so that + they all have an allowed length. This is intended for end2end chain training. +""" + +import argparse +import os +import sys +import copy +import math +import logging + +sys.path.insert(0, 'steps') +import libs.common as common_lib + +logger = logging.getLogger('libs') +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + +def get_args(): + parser = argparse.ArgumentParser(description="""This script finds a set of + allowed lengths for a given OCR/HWR data dir. + Intended for chain training.""") + parser.add_argument('factor', type=float, default=12, + help='Spacing (in percentage) between allowed lengths.') + parser.add_argument('srcdir', type=str, + help='path to source data dir') + parser.add_argument('--coverage-factor', type=float, default=0.05, + help="""Percentage of durations not covered from each + side of duration histogram.""") + parser.add_argument('--frame-subsampling-factor', type=int, default=3, + help="""Chain frame subsampling factor. + See steps/nnet3/chain/train.py""") + + args = parser.parse_args() + return args + + +def read_kaldi_mapfile(path): + """ Read any Kaldi mapping file - like text, .scp files, etc. + """ + + m = {} + with open(path, 'r', encoding='latin-1') as f: + for line in f: + line = line.strip() + sp_pos = line.find(' ') + key = line[:sp_pos] + val = line[sp_pos+1:] + m[key] = val + return m + +def find_duration_range(img2len, coverage_factor): + """Given a list of utterances, find the start and end duration to cover + + If we try to cover + all durations which occur in the training set, the number of + allowed lengths could become very large. + + Returns + ------- + start_dur: int + end_dur: int + """ + durs = [] + for im, imlen in img2len.items(): + durs.append(int(imlen)) + durs.sort() + to_ignore_dur = 0 + tot_dur = sum(durs) + for d in durs: + to_ignore_dur += d + if to_ignore_dur * 100.0 / tot_dur > coverage_factor: + start_dur = d + break + to_ignore_dur = 0 + for d in reversed(durs): + to_ignore_dur += d + if to_ignore_dur * 100.0 / tot_dur > coverage_factor: + end_dur = d + break + if start_dur < 30: + start_dur = 30 # a hard limit to avoid too many allowed lengths --not critical + return start_dur, end_dur + + +def find_allowed_durations(start_len, end_len, args): + """Given the start and end duration, find a set of + allowed durations spaced by args.factor%. Also write + out the list of allowed durations and the corresponding + allowed lengths (in frames) on disk. 
+ + Returns + ------- + allowed_durations: list of allowed durations (in seconds) + """ + + allowed_lengths = [] + length = start_len + with open(os.path.join(args.srcdir, 'allowed_lengths.txt'), 'w', encoding='latin-1') as fp: + while length < end_len: + if length % args.frame_subsampling_factor != 0: + length = (args.frame_subsampling_factor * + (length // args.frame_subsampling_factor)) + allowed_lengths.append(length) + fp.write("{}\n".format(int(length))) + length *= args.factor + return allowed_lengths + + + +def main(): + args = get_args() + args.factor = 1.0 + args.factor / 100.0 + + image2length = read_kaldi_mapfile(os.path.join(args.srcdir, 'image2num_frames')) + + start_dur, end_dur = find_duration_range(image2length, args.coverage_factor) + logger.info("Lengths in the range [{},{}] will be covered. " + "Coverage rate: {}%".format(start_dur, end_dur, + 100.0 - args.coverage_factor * 2)) + logger.info("There will be {} unique allowed lengths " + "for the images.".format(int(math.log(end_dur / start_dur) / + math.log(args.factor)))) + + allowed_durations = find_allowed_durations(start_dur, end_dur, args) + + +if __name__ == '__main__': + main() diff --git a/egs/cifar/v1/image/get_image2num_frames.py b/egs/cifar/v1/image/get_image2num_frames.py new file mode 100755 index 00000000000..3c003bb9947 --- /dev/null +++ b/egs/cifar/v1/image/get_image2num_frames.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Hossein Hadian + + +""" This script computes the image lengths (with padding) in an image data dir. + The output is written to 'image2num_frames' in the given data dir. This + file is later used by image/get_allowed_lengths.py to find a set of allowed lengths + for the data dir. The output format is similar to utt2num_frames + +""" + +import argparse +import os +import sys +import numpy as np +from scipy import misc + +parser = argparse.ArgumentParser(description="""Computes the image lengths (i.e. width) in an image data dir + and writes them (by default) to image2num_frames.""") +parser.add_argument('dir', type=str, + help='Source data directory (containing images.scp)') +parser.add_argument('--out-ark', type=str, default=None, + help='Where to write the output image-to-num_frames info. 
' + 'Default: "dir"/image2num_frames') +parser.add_argument('--feat-dim', type=int, default=40, + help='Size to scale the height of all images') +parser.add_argument('--padding', type=int, default=5, + help='Number of white pixels to pad on the left' + 'and right side of the image.') +args = parser.parse_args() + + +def get_scaled_image_length(im): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale * sx) + return nx + +### main ### +data_list_path = os.path.join(args.dir,'images.scp') + +if not args.out_ark: + args.out_ark = os.path.join(args.dir,'image2num_frames') +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark, 'w', encoding='latin-1') + +with open(data_list_path) as f: + for line in f: + line = line.strip() + line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + im = misc.imread(image_path) + im_len = get_scaled_image_length(im) + (args.padding * 2) + print('{} {}'.format(image_id, im_len), file=out_fh) + +out_fh.close() diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh new file mode 100755 index 00000000000..6ff76490303 --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh @@ -0,0 +1,246 @@ +#!/bin/bash + +# chainali_1c is as chainali_1b except it uses l2-regularize +# local/chain/compare_wer.sh exp/chain/cnn_chainali_1b exp/chain/cnn_chainali_1c +# System cnn_chainali_1b cnn_chainali_1c +# WER 14.67 12.84 +# CER 7.31 6.40 +# Final train prob 0.0042 -0.0120 +# Final valid prob -0.0256 -0.0199 +# Final train prob (xent) -0.6282 -0.9973 +# Final valid prob (xent) -0.9096 -1.1537 +# Parameters 3.96M 3.96M + +# steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1c +# exp/chain/cnn_chainali_1c: num-iters=21 nj=2..4 num-params=4.0M dim=40->369 combine=-0.007->-0.007 (over 1) xent:train/valid[13,20,final]=(-1.44,-1.05,-0.997/-1.53,-1.19,-1.15) logprob:train/valid[13,20,final]=(-0.056,-0.020,-0.012/-0.056,-0.025,-0.020) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +alignment_subsampling_factor=1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_unk +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/$lang_test $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. 
We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.075" + opts_2="l2-regularize=0.075" + opts_3="l2-regularize=0.1" + common1="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $opts_2 + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $opts_2 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $opts_3 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $opts_2 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $opts_3 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh new file mode 100755 index 00000000000..65eeedcc75b --- /dev/null +++ b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh @@ -0,0 +1,165 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. 
from scratch) + +# local/chain/compare_wer.sh exp/chain/cnn_1a exp/chain/cnn_chainali_1c exp/chain/e2e_cnn_1a +# System cnn_1a cnn_chainali_1c e2e_cnn_1a +# WER 18.58 12.84 15.46 +# CER 10.17 6.40 7.21 +# Final train prob -0.0122 -0.0120 -0.0426 +# Final valid prob -0.0999 -0.0199 -0.0724 +# Final train prob (xent) -0.5652 -0.9973 +# Final valid prob (xent) -0.9758 -1.1537 +# Parameters 4.36M 3.96M 9.13M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ +# exp/chain/e2e_cnn_1a/: num-iters=21 nj=2..4 num-params=9.1M dim=40->12640 combine=-0.040->-0.040 (over 1) logprob:train/valid[13,20,final]=(-0.065,-0.046,-0.043/-0.081,-0.073,-0.072) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +num_epochs=4 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=1000000 +cmvn_opts="--norm-means=true --norm-vars=true" +train_set=train_e2e +lang_test=lang_test + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type biphone \ + data/$train_set $lang $treedir + cp exp/chain/e2e_base/phone_lm.fst $treedir/ +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + + opts="l2-regularize=0.075" + opts_2="l2-regularize=0.075" + opts_3="l2-regularize=0.1" + common1="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $opts_2 + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $opts_2 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $opts_3 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
+ + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 30 --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/make_features.py b/egs/iam/v1/local/make_features.py index b998464953f..8cfca5ee830 100755 --- a/egs/iam/v1/local/make_features.py +++ b/egs/iam/v1/local/make_features.py @@ -2,6 +2,7 @@ # Copyright 2017 Chun Chieh Chang # 2017 Ashish Arora +# 2018 Hossein Hadian """ This script converts images to Kaldi-format feature matrices. The input to this script is the path to a data directory, e.g. "data/train". This script @@ -9,6 +10,10 @@ (by default) as Kaldi-formatted matrices (in text form). It also scales the images so they have the same height (via --feat-dim). It can optionally pad the images (on left/right sides) with white pixels. + If an 'image2num_frames' file is found in the data dir, it will be used + to enforce the images to have the specified length in that file by padding + white pixels (the --padding option will be ignored in this case). This relates + to end2end chain training. eg. 
local/make_features.py data/train --feat-dim 40 """ @@ -30,6 +35,8 @@ parser.add_argument('--padding', type=int, default=5, help='Number of white pixels to pad on the left' 'and right side of the image.') + + args = parser.parse_args() @@ -49,7 +56,7 @@ def write_kaldi_matrix(file_handle, matrix, key): file_handle.write("\n") file_handle.write(" ]\n") -def get_scaled_image(im): +def get_scaled_image(im, allowed_lengths = None): scale_size = args.feat_dim sx = im.shape[1] sy = im.shape[0] @@ -57,22 +64,48 @@ def get_scaled_image(im): nx = int(scale_size) ny = int(scale * sx) im = misc.imresize(im, (nx, ny)) - padding_x = args.padding - padding_y = im.shape[0] - im_pad = np.concatenate((255 * np.ones((padding_y, padding_x), + if allowed_lengths is None: + left_padding = right_padding = args.padding + else: # Find an allowed length for the image + imlen = im.shape[1] + allowed_len = 0 + for l in allowed_lengths: + if l > imlen: + allowed_len = l + break + if allowed_len == 0: + # No allowed length was found for the image (the image is too long) + return None + padding = allowed_len - imlen + left_padding = padding // 2 + right_padding = padding - left_padding + dim_y = im.shape[0] + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), dtype=int), im), axis=1) - im_pad1 = np.concatenate((im_pad, 255 * np.ones((padding_y, padding_x), + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), dtype=int)), axis=1) return im_pad1 ### main ### -data_list_path = os.path.join(args.dir,'images.scp') +data_list_path = os.path.join(args.dir, 'images.scp') if args.out_ark == '-': out_fh = sys.stdout else: out_fh = open(args.out_ark,'wb') +allowed_lengths = None +if os.path.isfile(os.path.join(args.dir, 'allowed_lengths.txt')): + print("Found 'allowed_lengths.txt' file...", file=sys.stderr) + allowed_lengths = [] + with open(os.path.join(args.dir,'allowed_lengths.txt')) as f: + for line in f: + allowed_lengths.append(int(line.strip())) + print("Read {} allowed lengths and will apply them to the " + "features.".format(len(allowed_lengths)), file=sys.stderr) + +num_fail = 0 +num_ok = 0 with open(data_list_path) as f: for line in f: line = line.strip() @@ -80,8 +113,15 @@ def get_scaled_image(im): image_id = line_vect[0] image_path = line_vect[1] im = misc.imread(image_path) - im_scale = get_scaled_image(im) + im_scaled = get_scaled_image(im, allowed_lengths) - data = np.transpose(im_scale, (1, 0)) + if im_scaled is None: + num_fail += 1 + continue + data = np.transpose(im_scaled, (1, 0)) data = np.divide(data, 255.0) + num_ok += 1 write_kaldi_matrix(out_fh, data, image_id) + +print('Generated features for {} images. Failed for {} (iamge too ' + 'long).'.format(num_ok, num_fail)) diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh new file mode 100755 index 00000000000..d479bfa2a73 --- /dev/null +++ b/egs/iam/v1/run_end2end.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +set -e +stage=0 +nj=20 +username= +password= +# iam_database points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# like "data/download" and follow the instructions +# in "local/prepare_data.sh" to download the database: +iam_database=/export/corpora5/handwriting_ocr/IAM + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. 
+ + +./local/check_tools.sh + +if [ $stage -le 0 ]; then + echo "$0: Preparing data..." + local/prepare_data.sh --download-dir "$iam_database" \ + --username "$username" --password "$password" +fi +mkdir -p data/{train,test}/data + +if [ $stage -le 1 ]; then + get_image2num_frames.py data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + echo "$0: Preparing the test and train feature files..." + for dataset in train test; do + local/make_features.py data/$dataset --feat-dim 40 | \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp + steps/compute_cmvn_stats.sh data/$dataset + done + utils/fix_data_dir.sh data/train +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + utils/prepare_lang.sh --sil-prob 0.95 \ + data/local/dict "" data/lang/temp data/lang +fi + +if [ $stage -le 3 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_big.arpa.gz \ + data/local/dict/lexicon.txt data/lang_test +fi + + +if [ $stage -le 4 ]; then + echo "$0: estimating phone language model for the denominator graph" + mkdir -p exp/chain/e2e_base/log + $cmd exp/chain/e2e_base/log/make_phone_lm.log \ + cat data/train/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=1000 \ + ark:- exp/chain/e2e_base/phone_lm.fst +fi + +if [ $stage -le 5 ]; then + echo "$0: calling the flat-start chain recipe..." + local/chain/run_flatstart_cnn1a.sh +fi From d7e88902810f48e10a39e6e7f87e4525fe2369ff Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Wed, 28 Mar 2018 01:21:56 +0430 Subject: [PATCH 07/12] [egs] Fix LM/lexicon issues in IAM; Add unk decoding; Update results. (#2315) --- egs/iam/v1/local/chain/compare_wer.sh | 15 ++ egs/iam/v1/local/chain/run_cnn_1a.sh | 14 +- egs/iam/v1/local/chain/run_cnn_chainali_1b.sh | 22 +-- egs/iam/v1/local/prepare_dict.sh | 28 ++-- .../local/remove_test_utterances_from_lob.py | 117 +++++++++++++ egs/iam/v1/local/score.sh | 156 +++++++++++++++++- egs/iam/v1/local/train_lm.sh | 13 +- .../v1/local/unk_arc_post_to_transcription.py | 86 ++++++++++ egs/iam/v1/run.sh | 20 ++- 9 files changed, 428 insertions(+), 43 deletions(-) create mode 100755 egs/iam/v1/local/remove_test_utterances_from_lob.py create mode 100755 egs/iam/v1/local/unk_arc_post_to_transcription.py diff --git a/egs/iam/v1/local/chain/compare_wer.sh b/egs/iam/v1/local/chain/compare_wer.sh index 4eb665fc702..ad90710b13f 100755 --- a/egs/iam/v1/local/chain/compare_wer.sh +++ b/egs/iam/v1/local/chain/compare_wer.sh @@ -11,6 +11,7 @@ if [ $# == 0 ]; then echo "e.g.: $0 exp/chain/cnn{1a,1b}" exit 1 fi +. ./path.sh echo "# $0 $*" used_epochs=false @@ -26,6 +27,13 @@ for x in $*; do done echo +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + if $used_epochs; then exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
fi @@ -57,3 +65,10 @@ for x in $*; do printf "% 10s" $prob done echo + +echo -n "# Parameters " +for x in $*; do + params=$(nnet3-info $x/final.mdl 2>/dev/null | grep num-parameters | cut -d' ' -f2 | awk '{printf "%0.2fM\n",$1/1000000}') + printf "% 10s" $params +done +echo diff --git a/egs/iam/v1/local/chain/run_cnn_1a.sh b/egs/iam/v1/local/chain/run_cnn_1a.sh index 3b1571091c1..05cb9948bd9 100755 --- a/egs/iam/v1/local/chain/run_cnn_1a.sh +++ b/egs/iam/v1/local/chain/run_cnn_1a.sh @@ -7,9 +7,15 @@ # steps/info/chain_dir_info.pl exp/chain/cnn_1a/ # exp/chain/cnn_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.021->-0.015 xent:train/valid[13,20,final]=(-1.05,-0.701,-0.591/-1.30,-1.08,-1.00) logprob:train/valid[13,20,final]=(-0.061,-0.034,-0.030/-0.107,-0.101,-0.098) -# cat exp/chain/cnn_1a/decode_test/scoring_kaldi/best_* -# %WER 5.94 [ 3913 / 65921, 645 ins, 1466 del, 1802 sub ] exp/chain/cnn_1a/decode_test//cer_11_0.0 -# %WER 9.13 [ 1692 / 18542, 162 ins, 487 del, 1043 sub ] exp/chain/cnn_1a/decode_test/wer_11_0.0 +# local/chain/compare_wer.sh exp/chain/cnn_1a/ +# System cnn_1a +# WER 18.58 +# CER 10.17 +# Final train prob -0.0122 +# Final valid prob -0.0999 +# Final train prob (xent) -0.5652 +# Final valid prob (xent) -0.9758 +# Parameters 4.36M set -e -o pipefail @@ -40,7 +46,7 @@ tdnn_dim=450 # training options srand=0 remove_egs=false -lang_test=lang_test +lang_test=lang_unk # End configuration section. echo "$0 $@" # Print the command line for logging diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh index ddf596a6126..d6d0ee780f4 100755 --- a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh +++ b/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh @@ -1,20 +1,20 @@ #!/bin/bash # chainali_1b is as chainali_1a except it has 3 more cnn layers and 1 less tdnn layer. -# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ exp/chain/cnn_chainali_1b/ -# System cnn_chainali_1a cnn_chainali_1b -# WER 6.69 6.25 -# Final train prob -0.0132 -0.0041 -# Final valid prob -0.0509 -0.0337 -# Final train prob (xent) -0.6393 -0.6287 -# Final valid prob (xent) -1.0116 -0.9064 + +# local/chain/compare_wer.sh exp/chain/cnn_1a/ exp/chain/cnn_chainali_1b/ +# System cnn_1a cnn_chainali_1b +# WER 18.58 14.67 +# CER 10.17 7.31 +# Final train prob -0.0122 0.0042 +# Final valid prob -0.0999 -0.0256 +# Final train prob (xent) -0.5652 -0.6282 +# Final valid prob (xent) -0.9758 -0.9096 +# Parameters 4.36M 3.96M # steps/info/chain_dir_info.pl exp/chain/chainali_cnn_1b/ # exp/chain/chainali_cnn_1b/: num-iters=21 nj=2..4 num-params=4.0M dim=40->364 combine=-0.009->-0.005 xent:train/valid[13,20,final]=(-1.47,-0.728,-0.623/-1.69,-1.02,-0.940) logprob:train/valid[13,20,final]=(-0.068,-0.030,-0.011/-0.086,-0.056,-0.038) -# cat exp/chain/cnn_chainali_1b/decode_test/scoring_kaldi/best_* -# %WER 3.94 [ 2600 / 65921, 415 ins, 1285 del, 900 sub ] exp/chain/cnn_chainali_1b/decode_test/cer_10_0.0 -# %WER 6.25 [ 1158 / 18542, 103 ins, 469 del, 586 sub ] exp/chain/cnn_chainali_1b/decode_test/wer_12_0.0 set -e -o pipefail @@ -46,7 +46,7 @@ tdnn_dim=450 # training options srand=0 remove_egs=false -lang_test=lang_test +lang_test=lang_unk # End configuration section. 
echo "$0 $@" # Print the command line for logging diff --git a/egs/iam/v1/local/prepare_dict.sh b/egs/iam/v1/local/prepare_dict.sh index 0c3bb325023..8b981de3abd 100755 --- a/egs/iam/v1/local/prepare_dict.sh +++ b/egs/iam/v1/local/prepare_dict.sh @@ -15,29 +15,27 @@ cat data/train/text | \ perl -ne '@A = split; shift @A; for(@A) {print join("\n", split(//)), "\n";}' | \ sort -u > $dir/nonsilence_phones.txt -# Now list all the unique words (that use only the above letters) -# in data/train/text and LOB+Brown corpora with their comprising -# letters as their transcription. (Letter # is replaced with ) +# Now use the pocolm's wordlist which is the most N frequent words in +# in data/train/text and LOB+Brown corpora (dev and test excluded) with their comprising +# letters as their transcription. Only include words that use the above letters. +# (Letter # is replaced with ) export letters=$(cat $dir/nonsilence_phones.txt | tr -d "\n") -cut -d' ' -f2- data/train/text | \ - cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt \ - data/local/browncorpus/brown.txt - | \ +cat data/local/local_lm/data/wordlist | \ perl -e '$letters=$ENV{letters}; -while(<>){ @A = split; - foreach(@A) { - if(! $seen{$_} && $_ =~ m/^[$letters]+$/){ - $seen{$_} = 1; - $trans = join(" ", split(//)); +while(<>){ + chop; + $w = $_; + if($w =~ m/^[$letters]+$/){ + $trans = join(" ", split(//, $w)); $trans =~ s/#//g; - print "$_ $trans\n"; + print "$w $trans\n"; } - } -}' | sort > $dir/lexicon.txt +}' | sort -u > $dir/lexicon.txt -sed -i '' "s/#//" $dir/nonsilence_phones.txt +sed -i "s/#//" $dir/nonsilence_phones.txt echo ' SIL' >> $dir/lexicon.txt echo ' SIL' >> $dir/lexicon.txt diff --git a/egs/iam/v1/local/remove_test_utterances_from_lob.py b/egs/iam/v1/local/remove_test_utterances_from_lob.py new file mode 100755 index 00000000000..1b414ef47f6 --- /dev/null +++ b/egs/iam/v1/local/remove_test_utterances_from_lob.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +# Copyright 2018 Ashish Arora + +import argparse +import os +import numpy as np +import sys +import re + +parser = argparse.ArgumentParser(description="""Removes dev/test set lines + from the LOB corpus. Reads the + corpus from stdin, and writes it to stdout.""") +parser.add_argument('dev_text', type=str, + help='dev transcription location.') +parser.add_argument('test_text', type=str, + help='test transcription location.') +args = parser.parse_args() + +def remove_punctuations(transcript): + char_list = [] + for char in transcript: + if char.isdigit() or char == '+' or char == '~' or char == '?': + continue + if char == '#' or char == '=' or char == '-' or char == '!': + continue + if char == ',' or char == '.' 
or char == ')' or char == '\'': + continue + if char == '(' or char == ':' or char == ';' or char == '"': + continue + char_list.append(char) + return char_list + + +def remove_special_words(words): + word_list = [] + for word in words: + if word == '' or word == '#': + continue + word_list.append(word) + return word_list + + +# process and add dev/eval transcript in a list +# remove special words, punctuations, spaces between words +# lowercase the characters +def read_utterances(text_file_path): + with open(text_file_path, 'rt') as in_file: + for line in in_file: + words = line.strip().split() + words_wo_sw = remove_special_words(words) + transcript = ''.join(words_wo_sw[1:]) + transcript = transcript.lower() + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + utterance_dict[words_wo_sw[0]] = transcript + + +### main ### + +# read utterances and add it to utterance_dict +utterance_dict = dict() +read_utterances(args.dev_text) +read_utterances(args.test_text) + +# read corpus and add it to below lists +corpus_text_lowercase_wo_sc = list() +corpus_text_wo_sc = list() +original_corpus_text = list() +for line in sys.stdin: + original_corpus_text.append(line) + words = line.strip().split() + words_wo_sw = remove_special_words(words) + + transcript = ''.join(words_wo_sw) + transcript = transcript.lower() + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + corpus_text_lowercase_wo_sc.append(transcript) + + transcript = ''.join(words_wo_sw) + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + corpus_text_wo_sc.append(transcript) + +# find majority of utterances below +# for utterances which were not found +# add them to remaining_utterances +row_to_keep = [True for i in range(len(original_corpus_text))] +remaining_utterances = dict() +for line_id, line_to_find in utterance_dict.items(): + found_line = False + for i in range(1, (len(corpus_text_lowercase_wo_sc) - 2)): + # Combine 3 consecutive lines of the corpus into a single line + prev_words = corpus_text_lowercase_wo_sc[i - 1].strip() + curr_words = corpus_text_lowercase_wo_sc[i].strip() + next_words = corpus_text_lowercase_wo_sc[i + 1].strip() + new_line = prev_words + curr_words + next_words + transcript = ''.join(new_line) + if line_to_find in transcript: + found_line = True + row_to_keep[i-1] = False + row_to_keep[i] = False + row_to_keep[i+1] = False + if not found_line: + remaining_utterances[line_id] = line_to_find + + +for i in range(len(original_corpus_text)): + transcript = original_corpus_text[i].strip() + if row_to_keep[i]: + print(transcript) + +print('Sentences not removed from LOB: {}'.format(remaining_utterances), file=sys.stderr) +print('Total test+dev sentences: {}'.format(len(utterance_dict)), file=sys.stderr) +print('Number of sentences not removed from LOB: {}'. 
format(len(remaining_utterances)), file=sys.stderr) +print('LOB lines: Before: {} After: {}'.format(len(original_corpus_text), + row_to_keep.count(True)), file=sys.stderr) diff --git a/egs/iam/v1/local/score.sh b/egs/iam/v1/local/score.sh index 31564d25326..d964d70206b 100755 --- a/egs/iam/v1/local/score.sh +++ b/egs/iam/v1/local/score.sh @@ -1,5 +1,157 @@ #!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) +# Apache 2.0 +# This script is like steps/scoring/score_kaldi_wer.sh except it transcribes the 's +# using local/unk_arc_post_to_transcription.py and also it calls +# steps/scoring/score_kaldi_cer.sh at the end. -steps/scoring/score_kaldi_wer.sh "$@" -steps/scoring/score_kaldi_cer.sh --stage 2 "$@" +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +stage=0 +decode_mbr=false +stats=true +beam=6 +word_ins_penalty=0.0,0.5,1.0 +min_lmwt=3 +max_lmwt=13 +iter=final +#end configuration section. + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 +model_path=`echo $dir |xargs dirname` +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; +done + + +ref_filtering_cmd="cat" +[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter" +[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter" +hyp_filtering_cmd="cat" +[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter" +[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter" + + +if $decode_mbr ; then + echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty" +else + echo "$0: scoring with word insertion penalty=$word_ins_penalty" +fi + + +mkdir -p $dir/scoring_kaldi +cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1; +if [ $stage -le 0 ]; then + + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + mkdir -p $dir/scoring_kaldi/penalty_$wip/log + + if $decode_mbr ; then + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + acwt=\`perl -e \"print 1.0/LMWT\"\`\; \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-prune --beam=$beam ark:- ark:- \| \ + lattice-mbr-decode --word-symbol-table=$symtab \ + ark:- ark,t:- \| \ + utils/int2sym.pl -f 2- $symtab \| \ + $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + + else + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-1best ark:- ark:- \| \ + lattice-align-words $lang_or_graph/phones/word_boundary.int $model_path/final.mdl ark:- ark:- \| \ + lattice-arc-post $model_path/final.mdl ark:- - \| \ + 
local/unk_arc_post_to_transcription.py $lang_or_graph/phones.txt $lang_or_graph/words.txt data/lang_unk/oov.int \| \ + $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + fi + + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \ + cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \ + compute-wer --text --mode=present \ + "ark:cat $dir/scoring_kaldi/test_filt.txt |" ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; + + done +fi + + + +if [ $stage -le 1 ]; then + + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + for lmwt in $(seq $min_lmwt $max_lmwt); do + # adding /dev/null to the command list below forces grep to output the filename + grep WER $dir/wer_${lmwt}_${wip} /dev/null + done + done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer || exit 1 + + best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer) + best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}') + best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}') + + if [ -z "$best_lmwt" ]; then + echo "$0: we could not get the details of the best WER from the file $dir/wer_*. Probably something went wrong." + exit 1; + fi + + if $stats; then + mkdir -p $dir/scoring_kaldi/wer_details + echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight + echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty + + $cmd $dir/scoring_kaldi/log/stats1.log \ + cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ + align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\ + utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1; + + $cmd $dir/scoring_kaldi/log/stats2.log \ + cat $dir/scoring_kaldi/wer_details/per_utt \| \ + utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ + sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1; + + $cmd $dir/scoring_kaldi/log/wer_bootci.log \ + compute-wer-bootci --mode=present \ + ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ + '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1; + + fi +fi + +steps/scoring/score_kaldi_cer.sh --cmd "$cmd" --stage 2 --min-lmwt $min_lmwt \ + --max-lmwt $max_lmwt --word-ins-penalty $word_ins_penalty \ + $data $lang_or_graph $dir + +# If we got here, the scoring was successful. +# As a small aid to prevent confusion, we remove all wer_{?,??} files; +# these originate from the previous version of the scoring files +# i keep both statement here because it could lead to confusion about +# the capabilities of the script (we don't do cer in the script) +rm $dir/wer_{?,??} 2>/dev/null +rm $dir/cer_{?,??} 2>/dev/null + +exit 0; diff --git a/egs/iam/v1/local/train_lm.sh b/egs/iam/v1/local/train_lm.sh index aa4303d6a28..a673c5b3f2d 100755 --- a/egs/iam/v1/local/train_lm.sh +++ b/egs/iam/v1/local/train_lm.sh @@ -13,6 +13,7 @@ set -e stage=0 +vocab_size=50000 echo "$0 $@" # Print the command line for logging . ./utils/parse_options.sh || exit 1; @@ -57,8 +58,10 @@ if [ $stage -le 0 ]; then rm ${dir}/data/text/* 2>/dev/null || true # Using LOB and brown corpus. 
- cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt > ${dir}/data/text/text.txt - cat data/local/browncorpus/brown.txt >> ${dir}/data/text/text.txt + cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \ + local/remove_test_utterances_from_lob.py data/test/text data/val/text \ + > ${dir}/data/text/lob.txt + cat data/local/browncorpus/brown.txt >> ${dir}/data/text/brown.txt # use the validation data as the dev set. # Note: the name 'dev' is treated specially by pocolm, it automatically @@ -78,8 +81,8 @@ if [ $stage -le 0 ]; then cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt # get the wordlist from IAM text - cat ${dir}/data/text/{iam,text}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count - cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist + cat ${dir}/data/text/{iam,lob,brown}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + head -n $vocab_size ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist fi order=3 @@ -91,7 +94,7 @@ if [ $stage -le 1 ]; then # Note: if you have more than one order, use a certain amount of words as the # vocab and want to restrict max memory for 'sort', echo "$0: training the unpruned LM" - min_counts='train=2 iam=1' + min_counts='brown=2 lob=2 iam=1' wordlist=${dir}/data/wordlist lm_name="`basename ${wordlist}`_${order}" diff --git a/egs/iam/v1/local/unk_arc_post_to_transcription.py b/egs/iam/v1/local/unk_arc_post_to_transcription.py new file mode 100755 index 00000000000..c86d35e4b8a --- /dev/null +++ b/egs/iam/v1/local/unk_arc_post_to_transcription.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python + +# Copyright 2017 Ashish Arora + +import argparse +import sys + +parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""") +parser.add_argument('phones', type=str, help='phones and phonesID') +parser.add_argument('words', type=str, help='word and wordID') +parser.add_argument('unk', type=str, default='-', help='location of unk file') +parser.add_argument('--input-ark', type=str, default='-', help='where to read the input data') +parser.add_argument('--out-ark', type=str, default='-', help='where to write the output data') +args = parser.parse_args() +### main ### +phone_fh = open(args.phones, 'r') +word_fh = open(args.words, 'r') +unk_fh = open(args.unk,'r') +if args.input_ark == '-': + input_fh = sys.stdin +else: + input_fh = open(args.input_ark,'r') +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark,'wb') + +phone_dict = dict()# stores phoneID and phone mapping +phone_data_vect = phone_fh.read().strip().split("\n") +for key_val in phone_data_vect: + key_val = key_val.split(" ") + phone_dict[key_val[1]] = key_val[0] +word_dict = dict() +word_data_vect = word_fh.read().strip().split("\n") +for key_val in word_data_vect: + key_val = key_val.split(" ") + word_dict[key_val[1]] = key_val[0] +unk_val = unk_fh.read().strip().split(" ")[0] + +utt_word_dict = dict() +utt_phone_dict = dict()# stores utteranceID and phoneID +unk_word_dict = dict() +count=0 +for line in input_fh: + line_vect = line.strip().split("\t") + if len(line_vect) < 6: + print "IndexError" + print line_vect + continue + uttID = line_vect[0] + word = line_vect[4] + phones = line_vect[5] + if uttID in utt_word_dict.keys(): + utt_word_dict[uttID][count] = word + utt_phone_dict[uttID][count] = phones + else: + count = 0 + utt_word_dict[uttID] = dict() + utt_phone_dict[uttID] = dict() + 
utt_word_dict[uttID][count] = word + utt_phone_dict[uttID][count] = phones + if word == unk_val: # get character sequence for unk + phone_key_vect = phones.split(" ") + phone_val_vect = list() + for pkey in phone_key_vect: + phone_val_vect.append(phone_dict[pkey]) + phone_2_word = list() + for phone_val in phone_val_vect: + phone_2_word.append(phone_val.split('_')[0]) + phone_2_word = ''.join(phone_2_word) + utt_word_dict[uttID][count] = phone_2_word + else: + if word == '0': + word_val = ' ' + else: + word_val = word_dict[word] + utt_word_dict[uttID][count] = word_val + count += 1 + +transcription = "" +for key in sorted(utt_word_dict.iterkeys()): + transcription = key + for index in sorted(utt_word_dict[key].iterkeys()): + value = utt_word_dict[key][index] + transcription = transcription + " " + value + out_fh.write(transcription + '\n') diff --git a/egs/iam/v1/run.sh b/egs/iam/v1/run.sh index d5f66ca4110..f5c4a2b8f80 100755 --- a/egs/iam/v1/run.sh +++ b/egs/iam/v1/run.sh @@ -21,7 +21,6 @@ iam_database=/export/corpora5/handwriting_ocr/IAM . ./utils/parse_options.sh # e.g. this parses the above options # if supplied. - ./local/check_tools.sh if [ $stage -le 0 ]; then @@ -42,17 +41,26 @@ if [ $stage -le 1 ]; then fi if [ $stage -le 2 ]; then + echo "$0: Estimating a language model for decoding..." + # We do this stage before dict preparation because prepare_dict.sh + # generates the lexicon from pocolm's wordlist + local/train_lm.sh --vocab-size 50000 +fi + +if [ $stage -le 3 ]; then echo "$0: Preparing dictionary and lang..." local/prepare_dict.sh utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ data/local/dict "" data/lang/temp data/lang -fi - -if [ $stage -le 3 ]; then - echo "$0: Estimating a language model for decoding..." - local/train_lm.sh utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_big.arpa.gz \ data/local/dict/lexicon.txt data/lang_test + echo "$0: Preparing the unk model for open-vocab decoding..." + utils/lang/make_unk_lm.sh --ngram-order 4 --num-extra-ngrams 7500 \ + data/local/dict exp/unk_lang_model + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 \ + --unk-fst exp/unk_lang_model/unk_fst.txt \ + data/local/dict "" data/local/temp data/lang_unk + cp data/lang_test/G.fst data/lang_unk/G.fst fi if [ $stage -le 4 ]; then From 785198e3c0538fb2a5ee366ac777516ba731bc20 Mon Sep 17 00:00:00 2001 From: hainan-xv Date: Tue, 27 Mar 2018 17:04:25 -0400 Subject: [PATCH 08/12] [src] Add some asserts in RNNLM code (#2314) --- src/rnnlm/rnnlm-compute-state.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/rnnlm/rnnlm-compute-state.cc b/src/rnnlm/rnnlm-compute-state.cc index 119e3172fbb..4ec5fdd1dd8 100644 --- a/src/rnnlm/rnnlm-compute-state.cc +++ b/src/rnnlm/rnnlm-compute-state.cc @@ -43,6 +43,14 @@ RnnlmComputeStateInfo::RnnlmComputeStateInfo( KALDI_ERR << "Embedding file and nnet have different embedding sizes. 
"; } + if (opts.bos_index <= 0 || opts.bos_index >= word_embedding_mat.NumRows()) { + KALDI_ERR < "--bos-symbol option isn't set correctly."; + } + + if (opts.eos_index <= 0 || opts.eos_index >= word_embedding_mat.NumRows()) { + KALDI_ERR < "--eos-symbol option isn't set correctly."; + } + nnet3::ComputationRequest request1, request2, request3; CreateLoopedComputationRequestSimple(rnnlm, 1, // num_frames @@ -85,6 +93,7 @@ RnnlmComputeState* RnnlmComputeState::GetSuccessorState(int32 next_word) const { } void RnnlmComputeState::AddWord(int32 word_index) { + KALDI_ASSERT(word_index > 0 && word_index < info_.word_embedding_mat.NumRows()); previous_word_ = word_index; AdvanceChunk(); From 749839560f4dc1cd129c4ba3ed8a54fb2519f59a Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 27 Mar 2018 19:35:07 -0400 Subject: [PATCH 09/12] [src] Fix to recent commit RE RNNLM code --- src/rnnlm/rnnlm-compute-state.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rnnlm/rnnlm-compute-state.cc b/src/rnnlm/rnnlm-compute-state.cc index 4ec5fdd1dd8..a8e7a17e50b 100644 --- a/src/rnnlm/rnnlm-compute-state.cc +++ b/src/rnnlm/rnnlm-compute-state.cc @@ -44,11 +44,11 @@ RnnlmComputeStateInfo::RnnlmComputeStateInfo( } if (opts.bos_index <= 0 || opts.bos_index >= word_embedding_mat.NumRows()) { - KALDI_ERR < "--bos-symbol option isn't set correctly."; + KALDI_ERR << "--bos-symbol option isn't set correctly."; } if (opts.eos_index <= 0 || opts.eos_index >= word_embedding_mat.NumRows()) { - KALDI_ERR < "--eos-symbol option isn't set correctly."; + KALDI_ERR << "--eos-symbol option isn't set correctly."; } nnet3::ComputationRequest request1, request2, request3; From 8af60bb68a543f7901934026ab4885a9389c6c17 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 28 Mar 2018 00:16:45 -0400 Subject: [PATCH 10/12] [src] Apply limits prior to chain denominator computation, avoid failures. 
(#2308) --- .../s5c/local/chain/tuning/run_tdnn_7o.sh | 2 +- src/chain/chain-denominator.cc | 5 ++- src/cudamatrix/cu-kernels-ansi.h | 4 ++ src/cudamatrix/cu-kernels.cu | 32 ++++++++++++++++ src/cudamatrix/cu-kernels.h | 8 ++++ src/cudamatrix/cu-matrix-test.cc | 25 ++++++++++++ src/cudamatrix/cu-matrix.cc | 31 +++++++++++++++ src/cudamatrix/cu-matrix.h | 7 ++++ src/nnet3/nnet-general-component.cc | 2 +- src/nnet3/nnet-simple-component.cc | 6 +-- src/nnet3/nnet-simple-component.h | 38 ++++++++++++++++++- 11 files changed, 150 insertions(+), 10 deletions(-) diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh index 753dfc632ba..b927cc86823 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh @@ -18,7 +18,7 @@ # # # local/chain/compare_wer_general.sh --rt03 tdnn7n_sp tdnn7m26o_sp -# System tdnn7n_sp tdnn7m26j_sp +# System tdnn7n_sp tdnn7m26o_sp # WER on train_dev(tg) 12.18 11.74 # WER on train_dev(fg) 11.12 10.69 # WER on eval2000(tg) 14.9 14.6 diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index c936061de26..3a767721c6d 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -64,7 +64,10 @@ DenominatorComputation::DenominatorComputation( nnet_output.NumRows(), kUndefined, kStrideEqualNumCols); exp_nnet_output_transposed_.CopyFromMat(nnet_output, kTrans); - exp_nnet_output_transposed_.ApplyExp(); + // We limit the nnet output to the range [-30,30] before doing the exp; + // this avoids NaNs appearing in the forward-backward computation, which + // is not done in log space. + exp_nnet_output_transposed_.ApplyExpLimited(-30.0, 30.0); } diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index f2926ddc2f1..6b99a77e73b 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -200,6 +200,10 @@ void cudaF_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, MatrixDim d); void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); +void cudaD_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d, + double lower_limit, double upper_limit); +void cudaF_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d, + float lower_limit, float upper_limit); void cudaD_apply_exp_special(dim3 Gr, dim3 Bl, double* out, MatrixDim out_dim, const double* in, int in_stride); void cudaF_apply_exp_special(dim3 Gr, dim3 Bl, float* out, MatrixDim out_dim, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 50dd3d1d0ca..934a860a055 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -400,6 +400,26 @@ static void _apply_exp(Real* mat, MatrixDim d) { } } +template +__global__ +static void _apply_exp_limited(Real* mat, MatrixDim d, + Real lower_limit, Real upper_limit) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; + int32_cuda index = i + j * d.stride; + if (i < d.cols && j < d.rows) { + Real x = mat[index]; + // I'm writing !(x >= lower_limit) instead of (x < lower_limit) so that + // nan's will be set to the lower-limit. 
+ if (!(x >= lower_limit)) + x = lower_limit; + else if (x > upper_limit) + x = upper_limit; + mat[index] = exp(x); + } +} + + template __global__ static void _scale_diag_packed(Real* mat, Real value, int dim) { @@ -3734,6 +3754,11 @@ void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { _apply_exp<<>>(mat,d); } +void cudaF_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d, + float lower_limit, float upper_limit) { + _apply_exp_limited<<>>(mat, d, lower_limit, upper_limit); +} + void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d) { _apply_pow<<>>(mat, power, d); } @@ -4430,6 +4455,13 @@ void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { _apply_exp<<>>(mat,d); } +void cudaD_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d, + double lower_limit, double upper_limit) { + _apply_exp_limited<<>>(mat, d, lower_limit, upper_limit); +} + + + void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d) { _apply_pow<<>>(mat, power, d); } diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index fe706815a44..8f719a8c4a1 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -345,6 +345,14 @@ inline void cuda_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { inline void cuda_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_apply_exp(Gr, Bl, mat, d); } +inline void cuda_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d, + double lower_limit, double upper_limit) { + cudaD_apply_exp_limited(Gr, Bl, mat, d, lower_limit, upper_limit); +} +inline void cuda_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d, + float lower_limit, float upper_limit) { + cudaF_apply_exp_limited(Gr, Bl, mat, d, lower_limit, upper_limit); +} inline void cuda_apply_exp_special(dim3 Gr, dim3 Bl, double* out, MatrixDim out_dim, const double* in, int in_stride) { diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index 33db8b3e625..01030bb8353 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -194,6 +194,30 @@ static void UnitTestCuMatrixApplyExp() { } +template +static void UnitTestCuMatrixApplyExpLimited() { + int32 M = 10 + Rand() % 20, N = 10 + Rand() % 20; + Matrix H(M, N); + H.SetRandn(); + + + BaseFloat lower_limit = -0.2, upper_limit = 0.2; + + CuMatrix D(H); + + D.ApplyExpLimited(lower_limit, upper_limit); + + + H.ApplyFloor(lower_limit); + H.ApplyCeiling(upper_limit); + H.ApplyExp(); + + Matrix H2(D); + + AssertEqual(H,H2); +} + + template static void UnitTestCuMatrixSigmoid() { @@ -2895,6 +2919,7 @@ static void UnitTestCuMatrixEqualElementMask() { template void CudaMatrixUnitTest() { UnitTestCuMatrixApplyExpSpecial(); + UnitTestCuMatrixApplyExpLimited(); UnitTextCuMatrixAddSmatMat(); UnitTextCuMatrixAddMatSmat(); UnitTextCuMatrixAddSmat(); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 34290561cc5..beccd9dc4a5 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -2498,6 +2498,37 @@ void CuMatrixBase::ApplyExp() { } } +template +void CuMatrixBase::ApplyExpLimited(Real lower_limit, Real upper_limit) { + KALDI_ASSERT(upper_limit > lower_limit); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + CuTimer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_apply_exp_limited(dimGrid, dimBlock, data_, Dim(), lower_limit, upper_limit); + 
CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim); + } else +#endif + { + int32 num_rows = num_rows_, num_cols = num_cols_; + for (int32 r = 0; r < num_rows; r++) { + Real *row_data = this->RowData(r); + for (int32 c = 0; c < num_cols; c++) { + Real x = row_data[c]; + if (!(x >= lower_limit)) + x = lower_limit; + if (x > upper_limit) + x = upper_limit; + row_data[c] = Exp(x); + } + } + } +} + + template void CuMatrixBase::ApplyExpSpecial() { #if HAVE_CUDA == 1 diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 86c50cfc485..03e69b639d3 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -399,6 +399,13 @@ class CuMatrixBase { void ApplyCeiling(Real ceiling_val); void ApplyExp(); + + /// This is equivalent to running: + /// ApplyFloor(lower_limit); + /// ApplyCeiling(upper_limit); + /// ApplyExp() + void ApplyExpLimited(Real lower_limit, Real upper_limit); + /// For each element x of the matrix, set it to /// (x < 0 ? exp(x) : x + 1). This function is used /// in our RNNLM training. diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 669e5112793..00a31fa897c 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1414,7 +1414,7 @@ void* DropoutMaskComponent::Propagate( BaseFloat dropout_proportion = dropout_proportion_; KALDI_ASSERT(dropout_proportion >= 0.0 && dropout_proportion <= 1.0); - if (dropout_proportion_ == 0) { + if (dropout_proportion == 0) { out->Set(1.0); return NULL; } diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index f9f286aaed2..4eb078c0fcb 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -3730,15 +3730,11 @@ void NaturalGradientPerElementScaleComponent::InitFromConfig(ConfigLine *cfl) { // for the preconditioner actually exceeds the memory for the // parameters (by "rank"). update_period = 10; - BaseFloat num_samples_history = 2000.0, alpha = 4.0, - max_change_per_minibatch = 0.0; + BaseFloat num_samples_history = 2000.0, alpha = 4.0; cfl->GetValue("rank", &rank); cfl->GetValue("update-period", &update_period); cfl->GetValue("num-samples-history", &num_samples_history); cfl->GetValue("alpha", &alpha); - cfl->GetValue("max-change-per-minibatch", &max_change_per_minibatch); - if (max_change_per_minibatch != 0.0) - KALDI_WARN << "max-change-per-minibatch is now ignored, use 'max-change'"; InitLearningRatesFromConfig(cfl); std::string filename; // Accepts "scales" config (for filename) or "dim" -> random init, for testing. diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 9d438678f5d..3929c253aab 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -1446,6 +1446,19 @@ class PermuteComponent: public Component { trainable scale; it's like a linear component with a diagonal matrix. This version (and its child class NaturalGradientPerElementScaleComponent) requires the input for backprop. See also ScaleAndOffsetComponent. + + Accepted values on its config line, with defaults if applicable: + + vector If specified, the offsets will be read from this file ('vector' + is interpreted as an rxfilename). + + dim The dimension that this component inputs and outputs. + Only required if 'vector' is not specified. + + param-mean=1.0 Mean of randomly initialized offset parameters; should only + be supplied if 'vector' is not supplied. 
+ param-stddev=0.0 Standard deviation of randomly initialized offset parameters; + should only be supplied if 'vector' is not supplied. */ class PerElementScaleComponent: public UpdatableComponent { public: @@ -1670,8 +1683,29 @@ class ConstantFunctionComponent: public UpdatableComponent { -// NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but -// it uses a natural gradient update for the per-element scales. +/** + NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but + it uses a natural gradient update for the per-element scales. + + Accepted values on its config line, with defaults if applicable: + + vector If specified, the offsets will be read from this file ('vector' + is interpreted as an rxfilename). + + dim The dimension that this component inputs and outputs. + Only required if 'vector' is not specified. + + param-mean=1.0 Mean of randomly initialized offset parameters; should only + be supplied if 'vector' is not supplied. + param-stddev=0.0 Standard deviation of randomly initialized offset parameters; + should only be supplied if 'vector' is not supplied. + + And the natural-gradient-related configuration values: + rank=8 + update-period=10 + num-samples-history=2000.0 + alpha=4.0 +*/ class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent { public: From 5e6bd39e0ec0e510cb7202990c22fe8b8b9d817c Mon Sep 17 00:00:00 2001 From: Lucas Jo Date: Thu, 29 Mar 2018 03:20:14 +0900 Subject: [PATCH 11/12] [tools, extras] morfessor installation script (#2299) * added install_morfessor.sh and its symbolic link * deleted symbolic link * retab with size 2 * simplified installation process acc. to psmit's advice --- tools/extras/install_morfessor.sh | 40 +++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100755 tools/extras/install_morfessor.sh diff --git a/tools/extras/install_morfessor.sh b/tools/extras/install_morfessor.sh new file mode 100755 index 00000000000..0722f0fa16a --- /dev/null +++ b/tools/extras/install_morfessor.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright 2017 Atlas Guide (Author : Lucas Jo) +# +# Apache 2.0 +# + +echo "#### installing morfessor" +dirname=morfessor +if [ ! -d ./$dirname ]; then + mkdir -p ./$dirname + git clone https://github.com/aalto-speech/morfessor.git morfessor || + { + echo >&2 "$0: Error git clone operation " + echo >&2 " Failed in cloning the github repository (https://github.com/aalto-speech/morfessor.git)" + exit + } +fi + +# env.sh setup +( + set +u + [ ! -z "${MORFESSOR}" ] && \ + echo >&2 "morfessor variable is aleady defined. undefining..." && \ + unset MORFESSOR + + [ -f ./env.sh ] && . ./env.sh + + [ ! 
-z "${MORFESSOR}" ] && \ + echo >&2 "MORFESSOR config is already in env.sh" && exit + + wd=`pwd` + wd=`readlink -f $wd || pwd` + + echo "export MORFESSOR=\"$wd/morfessor\"" + echo "export PATH=\"\$PATH:\${MORFESSOR}/scripts\"" + echo "export PYTHONPATH=\"\${PYTHONPATH:-}:\$MORFESSOR\"" +) >> env.sh + +echo >&2 "installation of MORFESSOR finished successfully" +echo >&2 "please source tools/env.sh in your path.sh to enable it" From 191b39a14803f3216a443a61cbbbb5278fc47cfe Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 28 Mar 2018 14:20:51 -0400 Subject: [PATCH 12/12] [src,scripts,egs] Semi-supervised training on Fisher English (#2140) --- .../s5/local/nnet3/run_tdnn_multilingual.sh | 1 - .../s5/local/fisher_create_test_lang.sh | 27 +- .../s5/local/fisher_train_lms_pocolm.sh | 160 +++++++ .../s5/local/nnet3/run_ivector_common.sh | 108 ++--- egs/fisher_english/s5/local/run_unk_model.sh | 23 + egs/fisher_english/s5/local/score.sh | 60 +-- .../s5/local/semisup/chain/run_tdnn.sh | 1 + .../chain/run_tdnn_100k_semisupervised.sh | 1 + .../chain/run_tdnn_50k_semisupervised.sh | 1 + .../tuning/run_tdnn_100k_semisupervised_1a.sh | 434 +++++++++++++++++ .../local/semisup/chain/tuning/run_tdnn_1a.sh | 247 ++++++++++ .../tuning/run_tdnn_50k_semisupervised_1a.sh | 453 ++++++++++++++++++ .../s5/local/semisup/run_100k.sh | 219 +++++++++ .../s5/local/semisup/run_50k.sh | 229 +++++++++ egs/fisher_english/s5/local/wer_output_filter | 16 + egs/fisher_english/s5/run.sh | 29 ++ egs/wsj/s5/steps/best_path_weights.sh | 118 +++++ .../s5/steps/libs/nnet3/report/log_parse.py | 4 +- .../nnet3/train/chain_objf/acoustic_model.py | 108 ++++- .../nnet3/train/frame_level_objf/common.py | 17 +- .../steps/libs/nnet3/xconfig/basic_layers.py | 72 ++- .../lmrescore_const_arpa_undeterminized.sh | 105 ++++ egs/wsj/s5/steps/nnet3/chain/build_tree.sh | 2 +- .../chain/build_tree_multiple_sources.sh | 275 +++++++++++ egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 245 +++++++--- .../nnet3/chain/multilingual/combine_egs.sh | 168 +++++++ egs/wsj/s5/steps/nnet3/chain/train.py | 29 +- egs/wsj/s5/steps/nnet3/decode.sh | 2 +- egs/wsj/s5/steps/nnet3/decode_semisup.sh | 190 ++++++++ .../allocate_multilingual_examples.py | 327 +++++-------- .../steps/nnet3/multilingual/combine_egs.sh | 80 ++-- .../s5/steps/nnet3/report/generate_plots.py | 5 +- egs/wsj/s5/steps/subset_ali_dir.sh | 67 +++ src/chain/chain-supervision.cc | 47 +- src/chain/chain-supervision.h | 13 +- src/chainbin/nnet3-chain-combine.cc | 13 +- src/chainbin/nnet3-chain-copy-egs.cc | 195 +++++--- src/chainbin/nnet3-chain-get-egs.cc | 130 ++++- src/chainbin/nnet3-chain-normalize-egs.cc | 14 +- src/lat/lattice-functions.cc | 111 ++++- src/lat/lattice-functions.h | 44 ++ src/latbin/lattice-compose.cc | 32 +- src/latbin/lattice-determinize-non-compact.cc | 95 ---- .../lattice-determinize-phone-pruned.cc | 38 +- src/latbin/lattice-determinize-pruned.cc | 37 +- src/latbin/lattice-scale.cc | 37 +- src/nnet3/nnet-chain-diagnostics.cc | 33 +- src/nnet3/nnet-chain-diagnostics.h | 5 + src/nnet3/nnet-chain-example.cc | 14 +- src/nnet3/nnet-chain-example.h | 6 +- src/nnet3/nnet-diagnostics.cc | 8 +- src/nnet3/nnet-example-utils.cc | 1 - src/nnet3/nnet-example-utils.h | 2 - src/nnet3bin/nnet3-copy-egs.cc | 121 +++-- 54 files changed, 3994 insertions(+), 825 deletions(-) create mode 100755 egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh create mode 100755 egs/fisher_english/s5/local/run_unk_model.sh mode change 100755 => 120000 egs/fisher_english/s5/local/score.sh create mode 120000 
egs/fisher_english/s5/local/semisup/chain/run_tdnn.sh create mode 120000 egs/fisher_english/s5/local/semisup/chain/run_tdnn_100k_semisupervised.sh create mode 120000 egs/fisher_english/s5/local/semisup/chain/run_tdnn_50k_semisupervised.sh create mode 100644 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh create mode 100755 egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh create mode 100644 egs/fisher_english/s5/local/semisup/run_100k.sh create mode 100644 egs/fisher_english/s5/local/semisup/run_50k.sh create mode 100755 egs/fisher_english/s5/local/wer_output_filter create mode 100755 egs/wsj/s5/steps/best_path_weights.sh create mode 100755 egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh create mode 100755 egs/wsj/s5/steps/nnet3/chain/build_tree_multiple_sources.sh create mode 100755 egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh create mode 100755 egs/wsj/s5/steps/nnet3/decode_semisup.sh create mode 100755 egs/wsj/s5/steps/subset_ali_dir.sh diff --git a/egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh b/egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh index 65808822db3..22ba636f06a 100755 --- a/egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh +++ b/egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh @@ -247,7 +247,6 @@ if [ $stage -le 10 ] && [ ! -z $megs_dir ]; then common_egs_dir="${multi_egs_dirs[@]} $megs_dir" steps/nnet3/multilingual/combine_egs.sh $egs_opts \ --cmd "$decode_cmd" \ - --samples-per-iter 400000 \ $num_langs ${common_egs_dir[@]} || exit 1; fi diff --git a/egs/fisher_english/s5/local/fisher_create_test_lang.sh b/egs/fisher_english/s5/local/fisher_create_test_lang.sh index f0926d2ceab..ac3e16c9c78 100755 --- a/egs/fisher_english/s5/local/fisher_create_test_lang.sh +++ b/egs/fisher_english/s5/local/fisher_create_test_lang.sh @@ -1,23 +1,25 @@ #!/bin/bash -# -if [ -f path.sh ]; then . ./path.sh; fi - -mkdir -p data/lang_test +# This script formats ARPA LM into G.fst. arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz +dir=data/lang_test + +if [ -f ./path.sh ]; then . ./path.sh; fi +. utils/parse_options.sh + [ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; -mkdir -p data/lang_test -cp -r data/lang/* data/lang_test +mkdir -p $dir +cp -r data/lang/* $dir gunzip -c "$arpa_lm" | \ arpa2fst --disambig-symbol=#0 \ - --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst + --read-symbol-table=$dir/words.txt - $dir/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic $dir/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. @@ -27,22 +29,21 @@ fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/l echo Performing further checks # Checking that G.fst is determinizable. -fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G. +fstdeterminize $dir/G.fst /dev/null || echo Error determinizing G. # Checking that L_disambig.fst is determinizable. -fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L. +fstdeterminize $dir/L_disambig.fst /dev/null || echo Error determinizing L. 
# Checking that disambiguated lexicon times G is determinizable
 # Note: we do this with fstdeterminizestar not fstdeterminize, as
 # fstdeterminize was taking forever (presumbaly relates to a bug
 # in this version of OpenFst that makes determinization slow for
 # some case).
-fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \
+fsttablecompose $dir/L_disambig.fst $dir/G.fst | \
   fstdeterminizestar >/dev/null || echo Error

 # Checking that LG is stochastic:
-fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
+fsttablecompose data/lang/L_disambig.fst $dir/G.fst | \
   fstisstochastic || echo "[log:] LG is not stochastic"
-
 echo "$0 succeeded"
diff --git a/egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh b/egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh
new file mode 100755
index 00000000000..906703953a1
--- /dev/null
+++ b/egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh
@@ -0,0 +1,160 @@
+#!/bin/bash
+
+# Copyright 2016  Vincent Nguyen
+#           2016  Johns Hopkins University (author: Daniel Povey)
+#           2017  Vimal Manohar
+# Apache 2.0
+#
+# This script is used to train LMs using the pocolm toolkit.
+# We use limit-unk-history=true, which truncates the history left of an OOV word.
+# This ensures the graph is compact when using a phone LM to model the OOV word.
+# See the script local/run_unk_model.sh.
+
+set -e
+stage=0
+
+text=data/train/text
+lexicon=data/local/dict/lexicon.txt
+dir=data/local/pocolm
+
+num_ngrams_large=5000000
+num_ngrams_small=2500000
+
+echo "$0 $@" # Print the command line for logging
+. utils/parse_options.sh || exit 1;
+
+lm_dir=${dir}/data
+
+mkdir -p $dir
+. ./path.sh || exit 1; # for KALDI_ROOT
+export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
+( # First make sure the pocolm toolkit is installed.
+  cd $KALDI_ROOT/tools || exit 1;
+  if [ -d pocolm ]; then
+    echo Not installing the pocolm toolkit since it is already there.
+  else
+    echo "$0: Please install the PocoLM toolkit with: "
+    echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
+    exit 1;
+  fi
+) || exit 1;
+
+for f in "$text" "$lexicon"; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+num_dev_sentences=10000
+
+if [ $stage -le 0 ]; then
+  mkdir -p ${dir}/data
+  mkdir -p ${dir}/data/text
+
+  echo "$0: Getting the Data sources"
+
+  rm ${dir}/data/text/* 2>/dev/null || true
+
+  cleantext=$dir/text_all.gz
+
+  cut -d ' ' -f 2- $text | awk -v lex=$lexicon '
+  BEGIN{
+    while((getline<lex) >0) { seen[$1]=1; }
+  }
+  {
+    for(n=1; n<=NF;n++) {
+      if (seen[$n]) {
+        printf("%s ", $n);
+      } else {
+        printf("<unk> ");
+      }
+    }
+    printf("\n");
+  }' | gzip -c > $cleantext || exit 1;
+
+  # This is for reporting perplexities
+  gunzip -c $dir/text_all.gz | head -n $num_dev_sentences > \
+    ${dir}/data/test.txt
+
+  # use a subset of the annotated training data as the dev set.
+  # Note: the name 'dev' is treated specially by pocolm, it automatically
+  # becomes the dev set.
+  gunzip -c $dir/text_all.gz | tail -n +$[num_dev_sentences+1] | \
+    head -n $num_dev_sentences > ${dir}/data/text/dev.txt
+
+  gunzip -c $dir/text_all.gz | tail -n +$[2*num_dev_sentences+1] > \
+    ${dir}/data/text/train.txt
+
+  # for reporting perplexities, we'll use the "real" dev set.
+  # (a subset of the training data is used as ${dir}/data/text/dev.txt to work
+  # out interpolation weights.
+  # note, we can't put it in ${dir}/data/text/, because then pocolm would use
+  # it as one of the data sources.
+  cat data/dev/text data/test/text | cut -d " " -f 2- > ${dir}/data/real_dev_set.txt
+
+  cat $lexicon | awk '{print $1}' | sort | uniq | awk '
+  {
+    if ($1 == "<s>") {
+      print "<s> is in the vocabulary!" | "cat 1>&2"
+      exit 1;
+    }
+    if ($1 == "</s>") {
+      print "</s> is in the vocabulary!" | "cat 1>&2"
+      exit 1;
+    }
+    printf("%s\n", $1);
+  }' > $dir/data/wordlist || exit 1;
+fi
+
+order=4
+wordlist=${dir}/data/wordlist
+
+lm_name="`basename ${wordlist}`_${order}"
+min_counts='train=1'
+if [ -n "${min_counts}" ]; then
+  lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`"
+fi
+
+unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm
+
+if [ $stage -le 1 ]; then
+  # decide on the vocabulary.
+  # Note: you'd use --wordlist if you had a previously determined word-list
+  # that you wanted to use.
+  # Note: if you have more than one order, use a certain amount of words as the
+  # vocab and want to restrict max memory for 'sort',
+  echo "$0: training the unpruned LM"
+  train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \
+              --limit-unk-history=true \
+              --fold-dev-into=train ${bypass_metaparam_optim_opt} \
+              --min-counts="${min_counts}" \
+              ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}
+
+  get_data_prob.py ${dir}/data/test.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' | tee ${unpruned_lm_dir}/perplexity_test.log
+
+  get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' | tee ${unpruned_lm_dir}/perplexity_real_dev_set.log
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: pruning the LM (to larger size)"
+  # Using 5 million n-grams for a big LM for rescoring purposes.
+  prune_lm_dir.py --target-num-ngrams=$num_ngrams_large --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big
+
+  get_data_prob.py ${dir}/data/test.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_big/perplexity_test.log
+
+  get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_big/perplexity_real_dev_set.log
+
+  mkdir -p ${dir}/data/arpa
+  format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz
+fi
+
+if [ $stage -le 3 ]; then
+  echo "$0: pruning the LM (to smaller size)"
+  # Using 2.5 million n-grams for a smaller LM for graph building.
+  # Prune from the bigger-pruned LM, it'll be faster.
+  prune_lm_dir.py --target-num-ngrams=$num_ngrams_small ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small
+
+  get_data_prob.py ${dir}/data/test.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_small/perplexity_test.log
+
+  get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_small/perplexity_real_dev_set.log
+
+  format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz
+fi
diff --git a/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh b/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh
index f6dc67991f5..b203f9638b4 100755
--- a/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh
+++ b/egs/fisher_english/s5/local/nnet3/run_ivector_common.sh
@@ -1,21 +1,22 @@
 #!/bin/bash
+# Copyright 2017  Hossein Hadian
+#           2017  Vimal Manohar
+# Apache 2.0
 . 
./cmd.sh set -e stage=1 -generate_alignments=true # false if doing chain training speed_perturb=true -train_set=train +train_set=train # Supervised training set +ivector_train_set= # data set for training i-vector extractor. + # If not provided, train_set will be used. -lda_train_set=train_100k nnet3_affix= -gmm=tri2_ali # should also contain alignments for $lda_train_set +exp_root=exp . ./path.sh . ./utils/parse_options.sh -gmm_dir=exp/$gmm - # perturbed data preparation if [ "$speed_perturb" == "true" ]; then if [ $stage -le 1 ]; then @@ -23,32 +24,22 @@ if [ "$speed_perturb" == "true" ]; then # to perturb the normal data to get the alignments. # _sp stands for speed-perturbed - for datadir in ${train_set}; do - utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1 - utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2 - utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2 - utils/validate_data_dir.sh --no-feats data/${datadir}_tmp - rm -r data/temp1 data/temp2 + for datadir in ${train_set} ${ivector_train_set}; do + utils/data/perturb_data_dir_speed_3way.sh data/${datadir} data/${datadir}_sp + utils/fix_data_dir.sh data/${datadir}_sp mfccdir=mfcc_perturbed steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ - data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; - utils/fix_data_dir.sh data/${datadir}_tmp - - utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0 - utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0 + data/${datadir}_sp $exp_root/make_mfcc/${datadir}_sp $mfccdir || exit 1; + steps/compute_cmvn_stats.sh \ + data/${datadir}_sp $exp_root/make_mfcc/${datadir}_sp $mfccdir || exit 1; utils/fix_data_dir.sh data/${datadir}_sp - rm -r data/temp0 data/${datadir}_tmp done fi - - if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then - #obtain the alignment of the perturbed data - steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ - data/${train_set}_sp data/lang exp/tri5a exp/tri5a_ali_${train_set}_sp || exit 1 - fi train_set=${train_set}_sp + if ! 
[ -z "$ivector_train_set" ]; then + ivector_train_set=${ivector_train_set}_sp + fi fi if [ $stage -le 3 ]; then @@ -58,28 +49,13 @@ if [ $stage -le 3 ]; then utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/mfcc/fisher_english-$date/s5b/$mfccdir/storage $mfccdir/storage fi - # the 100k directory is copied seperately, as - # we want to use exp/tri2_ali for lda_mllt training - # the main train directory might be speed_perturbed - for dataset in $train_set $lda_train_set; do + for dataset in $ivector_train_set $train_set; do utils/copy_data_dir.sh data/$dataset data/${dataset}_hires - - # scale the waveforms, this is useful as we don't use CMVN - data_dir=data/${dataset}_hires - cat $data_dir/wav.scp | python -c " -import sys, os, subprocess, re, random -scale_low = 1.0/8 -scale_high = 2.0 -for line in sys.stdin.readlines(): - if len(line.strip()) == 0: - continue - print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) -"| sort -k1,1 -u > $data_dir/wav.scp_scaled || exit 1; - mv $data_dir/wav.scp_scaled $data_dir/wav.scp + utils/data/perturb_data_dir_volume.sh data/${dataset}_hires steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir; - steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir; + --cmd "$train_cmd" data/${dataset}_hires $exp_root/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires $exp_root/make_hires/${dataset} $mfccdir; # Remove the small number of utterances that couldn't be extracted for some # reason (e.g. too short; no such file). @@ -90,57 +66,55 @@ for line in sys.stdin.readlines(): # Create MFCCs for the eval set utils/copy_data_dir.sh data/$dataset data/${dataset}_hires steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \ - data/${dataset}_hires exp/make_hires/$dataset $mfccdir; - steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + data/${dataset}_hires $exp_root/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires $exp_root/make_hires/$dataset $mfccdir; utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems done +fi - # Take the first 30k utterances (about 1/8th of the data) this will be used - # for the diagubm training - utils/subset_data_dir.sh --first data/${train_set}_hires 30000 data/${train_set}_30k_hires - utils/data/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires # 33hr +if [ -z "$ivector_train_set" ]; then + ivector_train_set=$train_set fi # ivector extractor training if [ $stage -le 4 ]; then - # We need to build a small system just because we need the LDA+MLLT transform - # to train the diag-UBM on top of. We use --num-iters 13 because after we get - # the transform (12th iter is the last), any further training is pointless. - # this decision is based on fisher_english - steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" \ - 5500 90000 data/${lda_train_set}_hires \ - data/lang $gmm_dir exp/nnet3${nnet3_affix}/tri3a + --max-utts 10000 --subsample 2 \ + data/${ivector_train_set}_hires \ + $exp_root/nnet3${nnet3_affix}/pca_transform fi if [ $stage -le 5 ]; then - # To train a diagonal UBM we don't need very much data, so use the smallest subset. 
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ - data/${train_set}_30k_nodup_hires 512 exp/nnet3${nnet3_affix}/tri3a exp/nnet3${nnet3_affix}/diag_ubm + data/${ivector_train_set}_hires 512 \ + $exp_root/nnet3${nnet3_affix}/pca_transform $exp_root/nnet3${nnet3_affix}/diag_ubm fi if [ $stage -le 6 ]; then - # iVector extractors can be sensitive to the amount of data, but this one has a - # fairly small dim (defaults to 100) so we don't use all of it, we use just the - # 100k subset (just under half the data). steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ - data/${lda_train_set}_hires exp/nnet3${nnet3_affix}/diag_ubm exp/nnet3${nnet3_affix}/extractor || exit 1; + data/${ivector_train_set}_hires $exp_root/nnet3${nnet3_affix}/diag_ubm \ + $exp_root/nnet3${nnet3_affix}/extractor || exit 1; fi if [ $stage -le 7 ]; then # We extract iVectors on all the ${train_set} data, which will be what we # train the system on. - # having a larger number of speakers is helpful for generalization, and to # handle per-utterance decoding well (iVector starts at zero). - steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires + utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \ + data/${ivector_train_set}_hires data/${ivector_train_set}_max2_hires steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ - data/${train_set}_max2_hires exp/nnet3${nnet3_affix}/extractor exp/nnet3${nnet3_affix}/ivectors_${train_set}_hires || exit 1; + data/${ivector_train_set}_max2_hires $exp_root/nnet3${nnet3_affix}/extractor \ + $exp_root/nnet3${nnet3_affix}/ivectors_${ivector_train_set}_hires || exit 1; +fi +if [ $stage -le 8 ]; then for dataset in test dev; do steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ - data/${dataset}_hires exp/nnet3${nnet3_affix}/extractor exp/nnet3${nnet3_affix}/ivectors_${dataset}_hires || exit 1; + data/${dataset}_hires $exp_root/nnet3${nnet3_affix}/extractor \ + $exp_root/nnet3${nnet3_affix}/ivectors_${dataset}_hires || exit 1; done fi diff --git a/egs/fisher_english/s5/local/run_unk_model.sh b/egs/fisher_english/s5/local/run_unk_model.sh new file mode 100755 index 00000000000..1fe658bda79 --- /dev/null +++ b/egs/fisher_english/s5/local/run_unk_model.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar + +# This script prepares lang directory with UNK modeled by a phone LM. + +utils/lang/make_unk_lm.sh data/local/dict exp/unk_lang_model || exit 1 + +utils/prepare_lang.sh \ + --unk-fst exp/unk_lang_model/unk_fst.txt \ + data/local/dict "" data/local/lang data/lang_unk + +# note: it's important that the LM we built in data/lang/G.fst was created using +# pocolm with the option --limit-unk-history=true (see ted_train_lm.sh). This +# keeps the graph compact after adding the unk model (we only have to add one +# copy of it). + +exit 0 + +## Caution: if you use this unk-model stuff, be sure that the scoring script +## does not use lattice-align-words-lexicon, because it's not compatible with +## the unk-model. Instead you should use lattice-align-words (of course, this +## only works if you have position-dependent phones). diff --git a/egs/fisher_english/s5/local/score.sh b/egs/fisher_english/s5/local/score.sh deleted file mode 100755 index c381abf7277..00000000000 --- a/egs/fisher_english/s5/local/score.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash -# Copyright Johns Hopkins University (Author: Daniel Povey) 2012. 
Apache 2.0. - -# begin configuration section. -cmd=run.pl -min_lmwt=5 -max_lmwt=17 -#end configuration section. - -[ -f ./path.sh ] && . ./path.sh -. parse_options.sh || exit 1; - -if [ $# -ne 3 ]; then - echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " - echo " Options:" - echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." - echo " --min_lmwt # minumum LM-weight for lattice rescoring " - echo " --max_lmwt # maximum LM-weight for lattice rescoring " - exit 1; -fi - -data=$1 -lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. -dir=$3 - -model=$dir/../final.mdl # assume model one level up from decoding dir. - -for f in $data/text $lang/words.txt $dir/lat.1.gz; do - [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; -done - -name=`basename $data`; # e.g. eval2000 - -mkdir -p $dir/scoring/log - - -function filter_text { - perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; } - while() { @A = split(" ", $_); $id = shift @A; print "$id "; - foreach $a (@A) { if (!defined $bad{$a}) { print "$a "; }} print "\n"; }' \ - '[NOISE]' '[LAUGHTER]' '[VOCALIZED-NOISE]' '' '%HESITATION' -} - -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \ - lattice-best-path --lm-scale=LMWT --word-symbol-table=$lang/words.txt \ - "ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1; - -for lmwt in `seq $min_lmwt $max_lmwt`; do - utils/int2sym.pl -f 2- $lang/words.txt <$dir/scoring/$lmwt.tra | \ - filter_text > $dir/scoring/$lmwt.txt || exit 1; -done - -filter_text <$data/text >$dir/scoring/text.filt - -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ - compute-wer --text --mode=present \ - ark:$dir/scoring/text.filt ark:$dir/scoring/LMWT.txt ">&" $dir/wer_LMWT || exit 1; - -exit 0 diff --git a/egs/fisher_english/s5/local/score.sh b/egs/fisher_english/s5/local/score.sh new file mode 120000 index 00000000000..6a200b42ed3 --- /dev/null +++ b/egs/fisher_english/s5/local/score.sh @@ -0,0 +1 @@ +../steps/scoring/score_kaldi_wer.sh \ No newline at end of file diff --git a/egs/fisher_english/s5/local/semisup/chain/run_tdnn.sh b/egs/fisher_english/s5/local/semisup/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/fisher_english/s5/local/semisup/chain/run_tdnn_100k_semisupervised.sh b/egs/fisher_english/s5/local/semisup/chain/run_tdnn_100k_semisupervised.sh new file mode 120000 index 00000000000..705b1a1dd12 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/run_tdnn_100k_semisupervised.sh @@ -0,0 +1 @@ +tuning/run_tdnn_100k_semisupervised_1a.sh \ No newline at end of file diff --git a/egs/fisher_english/s5/local/semisup/chain/run_tdnn_50k_semisupervised.sh b/egs/fisher_english/s5/local/semisup/chain/run_tdnn_50k_semisupervised.sh new file mode 120000 index 00000000000..70ebebf3c13 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/run_tdnn_50k_semisupervised.sh @@ -0,0 +1 @@ +tuning/run_tdnn_50k_semisupervised_1a.sh \ No newline at end of file diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh new file mode 100644 index 00000000000..9ba7da6e361 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh @@ -0,0 +1,434 @@ +#!/bin/bash + +# 
Copyright 2017 Vimal Manohar +# Apache 2.0 + +# This script is semi-supervised recipe with 100 hours of supervised data +# and 250 hours unsupervised data with naive splitting. +# Based on "Semi-Supervised Training of Acoustic Models using Lattice-Free MMI", +# Vimal Manohar, Hossein Hadian, Daniel Povey, Sanjeev Khudanpur, ICASSP 2018 +# http://www.danielpovey.com/files/2018_icassp_semisupervised_mmi.pdf +# local/semisup/run_100k.sh shows how to call this. + +# This version of script uses only supervised data for i-vector extractor +# training as against using the combined data as in run_tdnn_50k_semisupervised.sh. +# We use 3-gram LM trained on 100 hours of supervised data. We do not have +# enough data to do 4-gram LM rescoring as in run_tdnn_50k_semisupervised.sh. + +# This script uses phone LM to model UNK. +# This script uses the same tree as that for the seed model. +# See the comments in the script about how to change these. + +# Unsupervised set: train_unsup100k_250k (250 hour subset of Fisher excluding 100 hours for supervised) +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Naive split lattices + +# output-0 and output-1 are for superivsed and unsupervised data respectively. + +# Semi-supervised training train_sup +# WER on dev 18.70 +# WER on test 18.18 +# Final output-0 train prob -0.1345 +# Final output-0 valid prob -0.1547 +# Final output-0 train prob (xent) -1.3683 +# Final output-0 valid prob (xent) -1.4077 +# Final output-1 train prob -0.6856 +# Final output-1 valid prob -0.6815 +# Final output-1 train prob (xent) -1.1224 +# Final output-1 valid prob (xent) -1.2218 + +set -u -e -o pipefail + +stage=0 # Start from -1 for supervised seed system training +train_stage=-100 +nj=80 +test_nj=50 + +# The following 3 options decide the output directory for semi-supervised +# chain system +# dir=${exp_root}/chain${chain_affix}/tdnn${tdnn_affix} + +exp_root=exp/semisup_100k +chain_affix= # affix for chain dir +tdnn_affix=_semisup_1a # affix for semi-supervised chain system + +# Datasets -- Expects data/$supervised_set and data/$unsupervised_set to be +# present +supervised_set=train_sup +unsupervised_set=train_unsup100k_250k + +# Input seed system +sup_chain_dir=exp/semisup_100k/chain/tdnn_1a_sp # supervised chain system +sup_lat_dir=exp/semisup_100k/chain/tri4a_train_sup_unk_lats # Seed model options +sup_tree_dir=exp/semisup_100k/chain/tree_bi_a # tree directory for supervised chain system +ivector_root_dir=exp/semisup_100k/nnet3 # i-vector extractor root directory + +# Semi-supervised options +supervision_weights=1.0,1.0 # Weights for supervised, unsupervised data egs. + # Can be used to scale down the effect of unsupervised data + # by using a smaller scale for it e.g. 1.0,0.3 +lm_weights=3,2 # Weights on phone counts from supervised, unsupervised data for denominator FST creation + +sup_egs_dir= # Supply this to skip supervised egs creation +unsup_egs_dir= # Supply this to skip unsupervised egs creation +unsup_egs_opts= # Extra options to pass to unsupervised egs creation + +# Neural network opts +xent_regularize=0.1 + +decode_iter= # Iteration to decode with + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. 
./utils/parse_options.sh + +# The following can be replaced with the versions that model +# UNK using phone LM. $sup_lat_dir should also ideally be changed. +unsup_decode_lang=data/lang_test_poco_sup100k_unk +unsup_decode_graph_affix=_poco_sup100k_unk +test_lang=data/lang_test_poco_unk +test_graph_affix=_poco_unk + +dir=$exp_root/chain${chain_affix}/tdnn${tdnn_affix} + +if ! cuda-compiled; then + cat < \ +# $sup_chain_dir/best_path_${unsupervised_set_perturbed}/frame_subsampling_factor +# +# # This should be 1 if using a different source for supervised data alignments. +# # However alignments in seed tree directory have already been sub-sampled. +# echo $frame_subsampling_factor > \ +# $sup_tree_dir/frame_subsampling_factor +# +# # Build a new tree using stats from both supervised and unsupervised data +# steps/nnet3/chain/build_tree_multiple_sources.sh \ +# --use-fmllr false --context-opts "--context-width=2 --central-position=1" \ +# --frame-subsampling-factor $frame_subsampling_factor \ +# 7000 $lang \ +# data/${supervised_set_perturbed} \ +# ${sup_tree_dir} \ +# data/${unsupervised_set_perturbed} \ +# $chaindir/best_path_${unsupervised_set_perturbed} \ +# $treedir || exit 1 +# fi +# +# sup_tree_dir=$treedir # Use the new tree dir for further steps + +# Train denominator FST using phone alignments from +# supervised and unsupervised data +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${sup_tree_dir} ${sup_chain_dir}/best_path_${unsupervised_set_perturbed} \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=725 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=725 + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=725 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=725 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=725 + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=725 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=725 target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
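  # Worked example with the default xent_regularize=0.1 set above:
  # learning_rate_factor = 0.5 / 0.1 = 5.0, so the xent output layer's learning
  # rate is 5x the base rate; that cancels the down-weighting of the xent
  # objective and leaves its effective learning rate at 0.5x the base rate
  # whatever value of xent_regularize is chosen.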
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=725 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + # We use separate outputs for supervised and unsupervised data + # so we can properly track the train and valid objectives. + + output name=output-0 input=output.affine + output name=output-1 input=output.affine + + output name=output-0-xent input=output-xent.log-softmax + output name=output-1-xent input=output-xent.log-softmax +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +# Get values for $model_left_context, $model_right_context +. $dir/configs/vars + +left_context=$model_left_context +right_context=$model_right_context +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=$(perl -e "print int($left_context + $frame_subsampling_factor / 2)") +egs_right_context=$(perl -e "print int($right_context + $frame_subsampling_factor / 2)") +egs_left_context_initial=$(perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)") +egs_right_context_final=$(perl -e "print int($right_context_final + $frame_subsampling_factor / 2)") + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set_perturbed} + frames_per_eg=$(cat $sup_chain_dir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $sup_ivector_dir \ + --generate-egs-scp true \ + data/${supervised_set_perturbed}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_frames_per_eg=150 # Using a frames-per-eg of 150 for unsupervised data + # was found to be better than allowing smaller chunks + # (160,140,110,80) like for supervised system +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices when + # creating numerator supervision +lattice_prune_beam=4.0 # beam for pruning the lattices prior to getting egs + # for unsupervised data +tolerance=1 # frame-tolerance for chain training + +unsup_lat_dir=${sup_chain_dir}/decode_${unsupervised_set_perturbed} +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set_perturbed} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
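# Roughly, how the options above shape the unsupervised examples generated below:
#   --lattice-prune-beam 4.0  prunes each decoded lattice before it becomes
#                             numerator supervision;
#   --lattice-lm-scale 0.5    keeps the lattice graph scores at half weight in
#                             that supervision, so likelier decoder paths count
#                             for more than unlikely ones;
#   --deriv-weights-scp ...   scales each frame's derivative by the best-path
#                             pdf posterior from weights.scp (a frame the
#                             decoder is only 60% sure about contributes 0.6 of
#                             its gradient);
#   --left/right-tolerance 1  allows the usual one-frame slack when the lattice
#                             supervision is split into fixed-size chunks.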
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh \ + --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --deriv-weights-scp $sup_chain_dir/best_path_${unsupervised_set_perturbed}/weights.scp \ + --online-ivector-dir $ivector_root_dir/ivectors_${unsupervised_set_perturbed}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set_perturbed}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/comb_egs +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + # This is to skip stages of den-fst creation, which was already done. + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $sup_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set_perturbed}_hires \ + --tree-dir $sup_tree_dir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + rm -f $dir/.error + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + if [ $num_jobs -gt $test_nj ]; then num_jobs=$test_nj; fi + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" ${decode_iter:+--iter $decode_iter} \ + --online-ivector-dir $ivector_root_dir/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || touch $dir/.error + ) & + done + wait; + if [ -f $dir/.error ]; then + echo "$0: Decoding failed. 
See $dir/decode${test_graph_affix}_*/log/*" + exit 1 + fi +fi + +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..e76df666e8a --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,247 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +set -e +set -o pipefail + +# This is fisher chain recipe for training a model on a subset of around +# 100-300 hours of supervised data. +# This system uses phone LM to model UNK. +# local/semisup/run_50k.sh and local/semisup/run_100k.sh show how to call this. + +# train_set train_sup15k train_sup50k train_sup +# ivector_train_set semisup15k_100k_250k semisup50k_100k_250k train_sup +# WER on dev 27.75 21.41 19.23 +# WER on test 27.24 21.03 19.01 +# Final train prob -0.0959 -0.1035 -0.1224 +# Final valid prob -0.1823 -0.1667 -0.1503 +# Final train prob (xent) -1.9246 -1.5926 -1.6454 +# Final valid prob (xent) -2.1873 -1.7990 -1.7107 + +# train_set semisup15k_100k_250k semisup50k_100k_250k semisup100k_250k +# ivector_train_set semisup15k_100k_250k semisup50k_100k_250k train_sup +# WER on dev 17.92 17.55 16.97 +# WER on test 17.95 17.72 17.03 +# Final output train prob -0.1145 -0.1155 -0.1196 +# Final output valid prob -0.1370 -0.1510 -0.1469 +# Final output train prob (xent) -1.7449 -1.7458 -1.5487 +# Final output valid prob (xent) -1.7785 -1.9045 -1.6360 + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +exp_root=exp/semisup_100k + +nj=30 +tdnn_affix=_1a +train_set=train_sup +ivector_train_set= # dataset for training i-vector extractor + +nnet3_affix= # affix for nnet3 dir -- relates to i-vector used +chain_affix= # affix for chain dir +tree_affix=bi_a +gmm=tri4a # Expect GMM model in $exp/$gmm for alignment + +# Neural network opts +xent_regularize=0.1 +hidden_dim=725 + +# training options +num_epochs=4 + +remove_egs=false +common_egs_dir= # if provided, will skip egs generation +common_treedir= # if provided, will skip the tree building stage + +decode_iter= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ -z "$common_treedir" ]; then + if [ $stage -le 12 ]; then + # Build a tree using our new topology. 
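# The --context-opts below give a left-biphone tree, and 7000 is the target
# number of leaves; the number actually allocated can be checked afterwards
# with the same pipeline stage 13 uses for num_targets:
#   tree-info $treedir/tree | grep num-pdfs | awk '{print $2}'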
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/${train_set}_sp $lang $lat_dir $treedir || exit 1 + fi +else + treedir=$common_treedir +fi + +if [ $stage -le 13 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=$hidden_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=$hidden_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. 
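# Two notes on the training call below, as a sketch of how the options interact:
# --egs.opts "--generate-egs-scp true" makes get_egs write egs.scp indexes,
# which is what lets the semi-supervised recipes in this directory later
# re-combine these egs with unsupervised egs via
# steps/nnet3/chain/multilingual/combine_egs.sh; and the per-iteration learning
# rate is the current effective rate times the current number of jobs, so it
# ramps from about 3*0.001 down to 16*0.0001 as the job count grows from 3 to 16.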
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --egs.chunk-width 160,140,110,80 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $treedir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + +graph_dir=$dir/graph_poco_unk +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_poco_unk $dir $graph_dir +fi + +decode_suff= +if [ $stage -le 16 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir $exp_root/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_poco_unk_${decode_set}${decode_iter:+_$decode_iter}${decode_suff} || exit 1; + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh new file mode 100755 index 00000000000..ad5d2b106b5 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh @@ -0,0 +1,453 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +# This script is semi-supervised recipe with around 50 hours of supervised data +# and 250 hours unsupervised data with naive splitting. +# Based on "Semi-Supervised Training of Acoustic Models using Lattice-Free MMI", +# Vimal Manohar, Hossein Hadian, Daniel Povey, Sanjeev Khudanpur, ICASSP 2018 +# http://www.danielpovey.com/files/2018_icassp_semisupervised_mmi.pdf +# local/semisup/run_50k.sh shows how to call this. + +# We use the combined data for i-vector extractor training. +# We use 4-gram LM trained on 1250 hours of data excluding the 250 hours +# unsupervised data to create LM for decoding. Rescoring is done with +# a larger 4-gram LM. +# This differs from the case in run_tdnn_100k_semisupervised.sh. + +# This script uses phone LM to model UNK. +# This script uses the same tree as that for the seed model. +# See the comments in the script about how to change these. 
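# For reference, the "_big" rescoring LM mentioned above is a const-arpa build
# of the larger 4-gram, created in local/semisup/run_50k.sh roughly as:
#   utils/build_const_arpa_lm.sh \
#     data/local/pocolm_ex250k/data/arpa/4gram_big.arpa.gz \
#     data/lang_test_poco_ex250k data/lang_test_poco_ex250k_big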
+ +# Unsupervised set: train_unsup100k_250k (250 hour subset of Fisher excluding 100 hours for supervised) +# unsup_frames_per_eg=150 +# Deriv weights: Lattice posterior of best path pdf +# Unsupervised weight: 1.0 +# Weights for phone LM (supervised, unsupervised): 3,2 +# LM for decoding unsupervised data: 4gram +# Supervision: Naive split lattices + +# Supervised training results train_sup15k train_sup50k +# WER on dev 27.75 21.41 +# WER on test 27.24 21.03 +# Final output train prob -0.0959 -0.1035 +# Final output valid prob -0.1823 -0.1667 +# Final output train prob (xent) -1.9246 -1.5926 +# Final output valid prob (xent) -2.1873 -1.7990 + +# output-0 and output-1 are for superivsed and unsupervised data respectively. + +# Semi-supervised training train_sup15k train_sup50k +# WER on dev 21.31 18.98 +# WER on test 21.00 18.85 +# Final output-0 train prob -0.1577 -0.1381 +# Final output-0 valid prob -0.1761 -0.1723 +# Final output-0 train prob (xent) -1.4744 -1.3676 +# Final output-0 valid prob (xent) -1.5293 -1.4589 +# Final output-1 train prob -0.7305 -0.7671 +# Final output-1 valid prob -0.7319 -0.7714 +# Final output-1 train prob (xent) -1.1681 -1.1480 +# Final output-1 valid prob (xent) -1.2871 -1.2382 + +set -u -e -o pipefail + +stage=0 # Start from -1 for supervised seed system training +train_stage=-100 +nj=80 +test_nj=50 + +# The following 3 options decide the output directory for semi-supervised +# chain system +# dir=${exp_root}/chain${chain_affix}/tdnn${tdnn_affix} + +exp_root=exp/semisup_50k +chain_affix=_semi50k_100k_250k # affix for chain dir + # 50 hour subset out of 100 hours of supervised data + # 250 hour subset out of (1500-100=1400) hours of unsupervised data +tdnn_affix=_semisup_1a + +# Datasets -- Expects data/$supervised_set and data/$unsupervised_set to be +# present +supervised_set=train_sup50k +unsupervised_set=train_unsup100k_250k + +# Input seed system +sup_chain_dir=exp/semisup_50k/chain_semi50k_100k_250k/tdnn_1a_sp # supervised chain system +sup_lat_dir=exp/semisup_50k/chain_semi50k_100k_250k/tri4a_train_sup50k_unk_lats # lattices for supervised set +sup_tree_dir=exp/semisup_50k/chain_semi50k_100k_250k/tree_bi_a # tree directory for supervised chain system +ivector_root_dir=exp/semisup_50k/nnet3_semi50k_100k_250k # i-vector extractor root directory + +# Semi-supervised options +supervision_weights=1.0,1.0 # Weights for supervised, unsupervised data egs. + # Can be used to scale down the effect of unsupervised data + # by using a smaller scale for it e.g. 1.0,0.3 +lm_weights=3,2 # Weights on phone counts from supervised, unsupervised data for denominator FST creation + +sup_egs_dir= # Supply this to skip supervised egs creation +unsup_egs_dir= # Supply this to skip unsupervised egs creation +unsup_egs_opts= # Extra options to pass to unsupervised egs creation + +# Neural network opts +xent_regularize=0.1 + +decode_iter= # Iteration to decode with + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +# The following can be replaced with the versions that do not model +# UNK using phone LM. $sup_lat_dir should also ideally be changed. +unsup_decode_lang=data/lang_test_poco_ex250k_unk +unsup_decode_graph_affix=_poco_ex250k_unk +test_lang=data/lang_test_poco_unk +test_graph_affix=_poco_unk + +unsup_rescore_lang=${unsup_decode_lang}_big + +dir=$exp_root/chain${chain_affix}/tdnn${tdnn_affix} + +if ! 
cuda-compiled; then + cat < \ +# $sup_chain_dir/best_path_${unsupervised_set_perturbed}_big/frame_subsampling_factor +# +# # This should be 1 if using a different source for supervised data alignments. +# # However alignments in seed tree directory have already been sub-sampled. +# echo $frame_subsampling_factor > \ +# $sup_tree_dir/frame_subsampling_factor +# +# # Build a new tree using stats from both supervised and unsupervised data +# steps/nnet3/chain/build_tree_multiple_sources.sh \ +# --use-fmllr false --context-opts "--context-width=2 --central-position=1" \ +# --frame-subsampling-factor $frame_subsampling_factor \ +# 7000 $lang \ +# data/${supervised_set_perturbed} \ +# ${sup_tree_dir} \ +# data/${unsupervised_set_perturbed} \ +# $chaindir/best_path_${unsupervised_set_perturbed} \ +# $treedir || exit 1 +# fi +# +# sup_tree_dir=$treedir # Use the new tree dir for further steps + +# Train denominator FST using phone alignments from +# supervised and unsupervised data +if [ $stage -le 10 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${sup_tree_dir} ${sup_chain_dir}/best_path_${unsupervised_set_perturbed}_big \ + $dir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=725 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=725 + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=725 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=725 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=725 + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=725 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=725 target-rms=0.5 + output-layer name=output input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=725 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + + # We use separate outputs for supervised and unsupervised data + # so we can properly track the train and valid objectives. 
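  # (The output-0/output-1 nodes below, and their -xent counterparts, add no
  # new parameters: they re-expose the existing 'output' and 'output-xent'
  # layers under the names the combined supervised/unsupervised egs refer to.)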
+ + output name=output-0 input=output.affine + output name=output-1 input=output.affine + + output name=output-0-xent input=output-xent.log-softmax + output name=output-1-xent input=output-xent.log-softmax +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +# Get values for $model_left_context, $model_right_context +. $dir/configs/vars + +left_context=$model_left_context +right_context=$model_right_context +left_context_initial=$model_left_context +right_context_final=$model_right_context + +egs_left_context=$(perl -e "print int($left_context + $frame_subsampling_factor / 2)") +egs_right_context=$(perl -e "print int($right_context + $frame_subsampling_factor / 2)") +egs_left_context_initial=$(perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)") +egs_right_context_final=$(perl -e "print int($right_context_final + $frame_subsampling_factor / 2)") + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set_perturbed} + frames_per_eg=$(cat $sup_chain_dir/egs/info/frames_per_eg) + + if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. + + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $sup_ivector_dir \ + --generate-egs-scp true \ + data/${supervised_set_perturbed}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_frames_per_eg=150 # Using a frames-per-eg of 150 for unsupervised data + # was found to be better than allowing smaller chunks + # (160,140,110,80) like for supervised system +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices when + # creating numerator supervision +lattice_prune_beam=4.0 # beam for pruning the lattices prior to getting egs + # for unsupervised data +tolerance=1 # frame-tolerance for chain training + +unsup_lat_dir=${sup_chain_dir}/decode_${unsupervised_set_perturbed}_big +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set_perturbed} + + if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_english-$(date +'%m_%d_%H_%M')/s5c/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. 
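# The supervised and unsupervised egs are later mixed by
# steps/nnet3/chain/multilingual/combine_egs.sh (stage 14 below), which treats
# the two sets like two languages: --lang2weight $supervision_weights attaches
# a per-set weight to the examples, so setting supervision_weights=1.0,0.3, for
# instance, would scale down every unsupervised example's contribution to the
# gradient to 0.3 while leaving the supervised examples at full weight.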
+ + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh \ + --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --deriv-weights-scp $sup_chain_dir/best_path_${unsupervised_set_perturbed}_big/weights.scp \ + --online-ivector-dir $ivector_root_dir/ivectors_${unsupervised_set_perturbed}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set_perturbed}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/comb_egs +if [ $stage -le 14 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + # This is to skip stages of den-fst creation, which was already done. + train_stage=-4 +fi + +if [ $stage -le 15 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$comb_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $sup_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir data/${supervised_set_perturbed}_hires \ + --tree-dir $sup_tree_dir \ + --lat-dir $sup_lat_dir \ + --dir $dir || exit 1; +fi + +test_graph_dir=$dir/graph${test_graph_affix} +if [ $stage -le 17 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 18 ]; then + rm -f $dir/.error + for decode_set in dev test; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + if [ $num_jobs -gt $test_nj ]; then num_jobs=$test_nj; fi + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" ${decode_iter:+--iter $decode_iter} \ + --online-ivector-dir $ivector_root_dir/ivectors_${decode_set}_hires \ + $test_graph_dir data/${decode_set}_hires \ + $dir/decode${test_graph_affix}_${decode_set}${decode_iter:+_iter$decode_iter} || touch $dir/.error + ) & + done + wait; + if [ -f $dir/.error ]; then + echo "$0: Decoding failed. 
See $dir/decode${test_graph_affix}_*/log/*" + exit 1 + fi +fi + +exit 0; diff --git a/egs/fisher_english/s5/local/semisup/run_100k.sh b/egs/fisher_english/s5/local/semisup/run_100k.sh new file mode 100644 index 00000000000..7657e94b7f2 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/run_100k.sh @@ -0,0 +1,219 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +# This script demonstrates semi-supervised training using 100 hours of +# supervised data and 250 hours of unsupervised data. +# We assume the supervised data is in data/train_sup and unsupervised data +# is in data/train_unsup100k_250k. +# For LM training, we only use the supervised set corresponding to 100 hours as +# opposed to the case in run_50k.sh, where we included part of the +# transcripts in data/train/text. +# This uses only 100 hours supervised set for i-vector extractor training, +# which is different from run_50k.sh, which uses combined supervised + +# unsupervised set. + +. ./cmd.sh +. ./path.sh + +set -o pipefail +exp_root=exp/semisup_100k + +stage=0 + +. utils/parse_options.sh + +for f in data/train_sup/utt2spk data/train_unsup100k_250k/utt2spk \ + data/train_sup/text; do + if [ ! -f $f ]; then + echo "$0: Could not find $f" + exit 1 + fi +done + +############################################################################### +# Prepare the 100 hours supervised set and subsets for initial GMM training +############################################################################### + +if [ $stage -le 0 ]; then + utils/subset_data_dir.sh --shortest data/train_sup 100000 data/train_sup_100kshort + utils/subset_data_dir.sh data/train_sup_100kshort 10000 data/train_sup_10k + utils/data/remove_dup_utts.sh 100 data/train_sup_10k data/train_sup_10k_nodup + utils/subset_data_dir.sh --speakers data/train_sup 30000 data/train_sup_30k +fi + +############################################################################### +# GMM system training using 100 hours supervised data +############################################################################### + +if [ $stage -le 1 ]; then + steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_sup_10k_nodup data/lang $exp_root/mono0a || exit 1 +fi + +if [ $stage -le 2 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup_30k data/lang $exp_root/mono0a $exp_root/mono0a_ali || exit 1 + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train_sup_30k data/lang $exp_root/mono0a_ali $exp_root/tri1 || exit 1 + + (utils/mkgraph.sh data/lang_test $exp_root/tri1 $exp_root/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri1/graph data/dev $exp_root/tri1/decode_dev)& +fi + +if [ $stage -le 3 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup_30k data/lang $exp_root/tri1 $exp_root/tri1_ali || exit 1; + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 20000 data/train_sup_30k data/lang $exp_root/tri1_ali $exp_root/tri2 || exit 1; + + (utils/mkgraph.sh data/lang_test $exp_root/tri2 $exp_root/tri2/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri2/graph data/dev $exp_root/tri2/decode_dev)& +fi + +if [ $stage -le 4 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup data/lang $exp_root/tri2 $exp_root/tri2_ali || exit 1; + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + 5000 40000 data/train_sup data/lang $exp_root/tri2_ali 
$exp_root/tri3a || exit 1; + + ( + utils/mkgraph.sh data/lang_test $exp_root/tri3a $exp_root/tri3a/graph || exit 1; + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri3a/graph data/dev $exp_root/tri3a/decode_dev || exit 1; + )& +fi + +if [ $stage -le 5 ]; then + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup data/lang $exp_root/tri3a $exp_root/tri3a_ali || exit 1; + + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/train_sup data/lang $exp_root/tri3a_ali $exp_root/tri4a || exit 1; + + ( + utils/mkgraph.sh data/lang_test $exp_root/tri4a $exp_root/tri4a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri4a/graph data/dev $exp_root/tri4a/decode_dev + )& +fi + +############################################################################### +# Prepare semi-supervised train set +############################################################################### + +if [ $stage -le 6 ]; then + utils/combine_data.sh data/semisup100k_250k \ + data/train_sup data/train_unsup100k_250k || exit 1 +fi + +############################################################################### +# Train LM on the supervised set +############################################################################### + +if [ $stage -le 7 ]; then + if [ ! -f data/lang_test_poco_sup100k/G.fst ]; then + local/fisher_train_lms_pocolm.sh \ + --text data/train_sup/text \ + --dir data/local/lm_sup100k + + local/fisher_create_test_lang.sh \ + --arpa-lm data/local/pocolm_sup100k/data/arpa/4gram_small.arpa.gz \ + --dir data/lang_test_poco_sup100k + fi +fi + +############################################################################### +# Prepare lang directories with UNK modeled using phone LM +############################################################################### + +if [ $stage -le 8 ]; then + local/run_unk_model.sh || exit 1 + + for lang_dir in data/lang_test_poco_sup100k; do + rm -r ${lang_dir}_unk 2>/dev/null || true + cp -rT data/lang_unk ${lang_dir}_unk + cp ${lang_dir}/G.fst ${lang_dir}_unk/G.fst + done +fi + +############################################################################### +# Train seed chain system using 100 hours supervised data. +# Here we train i-vector extractor on only the supervised set. +############################################################################### + +if [ $stage -le 9 ]; then + local/semisup/chain/run_tdnn.sh \ + --train-set train_sup \ + --ivector-train-set "" \ + --nnet3-affix "" --chain-affix "" \ + --tdnn-affix _1a --tree-affix bi_a \ + --gmm tri4a --exp-root $exp_root || exit 1 + + # WER on dev 19.23 + # WER on test 19.01 + # Final train prob -0.1224 + # Final valid prob -0.1503 + # Final train prob (xent) -1.6454 + # Final valid prob (xent) -1.7107 +fi + +############################################################################### +# Semi-supervised training using 100 hours supervised data and +# 250 hours unsupervised data. We use i-vector extractor, tree, lattices +# and seed chain system from the previous stage. 
+############################################################################### + +if [ $stage -le 10 ]; then + local/semisup/chain/run_tdnn_100k_semisupervised.sh \ + --supervised-set train_sup \ + --unsupervised-set train_unsup100k_250k \ + --sup-chain-dir $exp_root/chain/tdnn_1a_sp \ + --sup-lat-dir $exp_root/chain/tri4a_train_sup_unk_lats \ + --sup-tree-dir $exp_root/chain/tree_bi_a \ + --ivector-root-dir $exp_root/nnet3 \ + --chain-affix "" \ + --tdnn-affix _semisup_1a \ + --exp-root $exp_root || exit 1 + + # WER on dev 18.70 + # WER on test 18.18 + # Final output-0 train prob -0.1345 + # Final output-0 valid prob -0.1547 + # Final output-0 train prob (xent) -1.3683 + # Final output-0 valid prob (xent) -1.4077 + # Final output-1 train prob -0.6856 + # Final output-1 valid prob -0.6815 + # Final output-1 train prob (xent) -1.1224 + # Final output-1 valid prob (xent) -1.2218 +fi + +############################################################################### +# Oracle system trained on combined 350 hours including both supervised and +# unsupervised sets. We use i-vector extractor, tree, and GMM trained +# on only the supervised for fair comparison to semi-supervised experiments. +############################################################################### + +if [ $stage -le 11 ]; then + local/semisup/chain/run_tdnn.sh \ + --train-set semisup100k_250k \ + --nnet3-affix "" --chain-affix "" \ + --common-treedir $exp_root/chain/tree_bi_a \ + --tdnn-affix 1a_oracle --nj 100 \ + --gmm tri4a --exp $exp_root \ + --stage 9 || exit 1 + + # WER on dev 16.97 + # WER on test 17.03 + # Final output train prob -0.1196 + # Final output valid prob -0.1469 + # Final output train prob (xent) -1.5487 + # Final output valid prob (xent) -1.6360 +fi diff --git a/egs/fisher_english/s5/local/semisup/run_50k.sh b/egs/fisher_english/s5/local/semisup/run_50k.sh new file mode 100644 index 00000000000..c2a5c0db7e7 --- /dev/null +++ b/egs/fisher_english/s5/local/semisup/run_50k.sh @@ -0,0 +1,229 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +# This script demonstrates semi-supervised training using 50 hours of +# supervised data and 250 hours of unsupervised data. +# We assume the supervised data is in data/train_sup and unsupervised data +# is in data/train_unsup100k_250k. +# For LM training, we assume there is data/train/text, from which +# we will exclude the utterances contained in the unsupervised set. +# We use all 300 hours of semi-supervised data for i-vector extractor training. + +# This differs from run_100k.sh, which uses only 100 hours supervised data for +# both i-vector extractor training and LM training. + +. ./cmd.sh +. ./path.sh + +set -o pipefail +exp_root=exp/semisup_50k + +stage=0 + +. utils/parse_options.sh + +for f in data/train_sup/utt2spk data/train_unsup100k_250k/utt2spk \ + data/train/text; do + if [ ! 
-f $f ]; then + echo "$0: Could not find $f" + exit 1 + fi +done + +############################################################################### +# Prepare the 50 hours supervised set and subsets for initial GMM training +############################################################################### + +if [ $stage -le 0 ]; then + utils/subset_data_dir.sh --speakers data/train_sup 50000 data/train_sup50k || exit 1 + utils/subset_data_dir.sh --shortest data/train_sup50k 25000 data/train_sup50k_short || exit 1 + utils/subset_data_dir.sh --speakers data/train_sup50k 30000 data/train_sup50k_30k || exit 1; +fi + +############################################################################### +# GMM system training using 50 hours supervised data +############################################################################### + +if [ $stage -le 1 ]; then + steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_sup50k_short data/lang $exp_root/mono0a || exit 1 +fi + +if [ $stage -le 2 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k_30k data/lang $exp_root/mono0a $exp_root/mono0a_ali || exit 1 + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train_sup50k_30k data/lang $exp_root/mono0a_ali $exp_root/tri1 || exit 1 + + (utils/mkgraph.sh data/lang_test $exp_root/tri1 $exp_root/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri1/graph data/dev $exp_root/tri1/decode_dev)& +fi + +if [ $stage -le 3 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k_30k data/lang $exp_root/tri1 $exp_root/tri1_ali || exit 1; + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train_sup50k_30k data/lang $exp_root/tri1_ali $exp_root/tri2 || exit 1 + + (utils/mkgraph.sh data/lang_test $exp_root/tri2 $exp_root/tri2/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri2/graph data/dev $exp_root/tri2/decode_dev)& +fi + +if [ $stage -le 4 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k data/lang $exp_root/tri2 $exp_root/tri2_ali || exit 1; + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 30000 data/train_sup50k data/lang $exp_root/tri2_ali $exp_root/tri3a || exit 1; + + (utils/mkgraph.sh data/lang_test $exp_root/tri3a $exp_root/tri3a/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri3a/graph data/dev $exp_root/tri3a/decode_dev)& +fi + +if [ $stage -le 5 ]; then + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train_sup50k data/lang $exp_root/tri3a $exp_root/tri3a_ali || exit 1; + + steps/train_sat.sh --cmd "$train_cmd" \ + 4000 50000 data/train_sup50k data/lang $exp_root/tri3a_ali $exp_root/tri4a || exit 1; + + ( + utils/mkgraph.sh data/lang_test $exp_root/tri4a $exp_root/tri4a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + $exp_root/tri4a/graph data/dev $exp_root/tri4a/decode_dev + )& +fi + +############################################################################### +# Prepare semi-supervised train set +############################################################################### + +if [ $stage -le 6 ]; then + utils/combine_data.sh data/semisup50k_100k_250k \ + data/train_sup50k data/train_unsup100k_250k || exit 1 +fi + +############################################################################### +# Train LM on all the text in data/train/text, but excluding the +# utterances in the unsupervised set 
+############################################################################### + +if [ $stage -le 7 ]; then + mkdir -p data/local/pocolm_ex250k + + utils/filter_scp.pl --exclude data/train_unsup100k_250k/utt2spk \ + data/train/text > data/local/pocolm_ex250k/text.tmp + + if [ ! -f data/lang_test_poco_ex250k_big/G.carpa ]; then + local/fisher_train_lms_pocolm.sh \ + --text data/local/pocolm_ex250k/text.tmp \ + --dir data/local/pocolm_ex250k + + local/fisher_create_test_lang.sh \ + --arpa-lm data/local/pocolm_ex250k/data/arpa/4gram_small.arpa.gz \ + --dir data/lang_test_poco_ex250k + + utils/build_const_arpa_lm.sh \ + data/local/pocolm_ex250k/data/arpa/4gram_big.arpa.gz \ + data/lang_test_poco_ex250k data/lang_test_poco_ex250k_big + fi +fi + +############################################################################### +# Prepare lang directories with UNK modeled using phone LM +############################################################################### + +if [ $stage -le 8 ]; then + local/run_unk_model.sh || exit 1 + + for lang_dir in data/lang_test_poco_ex250k; do + rm -r ${lang_dir}_unk ${lang_dir}_unk_big 2>/dev/null || true + cp -rT data/lang_unk ${lang_dir}_unk + cp ${lang_dir}/G.fst ${lang_dir}_unk/G.fst + cp -rT data/lang_unk ${lang_dir}_unk_big + cp ${lang_dir}_big/G.carpa ${lang_dir}_unk_big/G.carpa; + done +fi + +############################################################################### +# Train seed chain system using 50 hours supervised data. +# Here we train i-vector extractor on combined supervised and unsupervised data +############################################################################### + +if [ $stage -le 9 ]; then + local/semisup/chain/run_tdnn.sh \ + --train-set train_sup50k \ + --ivector-train-set semisup50k_100k_250k \ + --nnet3-affix _semi50k_100k_250k \ + --chain-affix _semi50k_100k_250k \ + --tdnn-affix _1a --tree-affix bi_a \ + --gmm tri4a --exp-root $exp_root || exit 1 + + # WER on dev 21.41 + # WER on test 21.03 + # Final train prob -0.1035 + # Final valid prob -0.1667 + # Final train prob (xent) -1.5926 + # Final valid prob (xent) -1.7990 +fi + +############################################################################### +# Semi-supervised training using 50 hours supervised data and +# 250 hours unsupervised data. We use i-vector extractor, tree, lattices +# and seed chain system from the previous stage. 
+############################################################################### + +if [ $stage -le 10 ]; then + local/semisup/chain/run_tdnn_50k_semisupervised.sh \ + --supervised-set train_sup50k \ + --unsupervised-set train_unsup100k_250k \ + --sup-chain-dir $exp_root/chain_semi50k_100k_250k/tdnn_1a_sp \ + --sup-lat-dir $exp_root/chain_semi50k_100k_250k/tri4a_train_sup50k_sp_unk_lats \ + --sup-tree-dir $exp_root/chain_semi50k_100k_250k/tree_bi_a \ + --ivector-root-dir $exp_root/nnet3_semi50k_100k_250k \ + --chain-affix _semi50k_100k_250k \ + --tdnn-affix _semisup_1a \ + --exp-root $exp_root || exit 1 + + # WER on dev 18.98 + # WER on test 18.85 + # Final output-0 train prob -0.1381 + # Final output-0 valid prob -0.1723 + # Final output-0 train prob (xent) -1.3676 + # Final output-0 valid prob (xent) -1.4589 + # Final output-1 train prob -0.7671 + # Final output-1 valid prob -0.7714 + # Final output-1 train prob (xent) -1.1480 + # Final output-1 valid prob (xent) -1.2382 +fi + +############################################################################### +# Oracle system trained on combined 300 hours including both supervised and +# unsupervised sets. We use i-vector extractor, tree, and GMM trained +# on only the supervised for fair comparison to semi-supervised experiments. +############################################################################### + +if [ $stage -le 11 ]; then + local/semisup/chain/run_tdnn.sh \ + --train-set semisup50k_100k_250k \ + --nnet3-affix _semi50k_100k_250k \ + --chain-affix _semi50k_100k_250k \ + --common-treedir $exp_root/chain_semi50k_100k_250k/tree_bi_a \ + --tdnn-affix 1a_oracle --nj 100 \ + --gmm tri4a --exp-root $exp_root \ + --stage 9 || exit 1 + + # WER on dev 17.55 + # WER on test 17.72 + # Final output train prob -0.1155 + # Final output valid prob -0.1510 + # Final output train prob (xent) -1.7458 + # Final output valid prob (xent) -1.9045 +fi diff --git a/egs/fisher_english/s5/local/wer_output_filter b/egs/fisher_english/s5/local/wer_output_filter new file mode 100755 index 00000000000..2514c385038 --- /dev/null +++ b/egs/fisher_english/s5/local/wer_output_filter @@ -0,0 +1,16 @@ +#!/usr/bin/perl + +@filter_words = ('[NOISE]', '[LAUGHTER]', '[VOCALIZED-NOISE]', '', '%HESITATION'); +foreach $w (@filter_words) { + $bad{$w} = 1; $w = lc $w; $bad{$w} = 1; +} +while() { + @A = split(" ", $_); + $id = shift @A; + print "$id "; + + foreach $a (@A) { + if (!defined $bad{$a}) { print "$a "; } + } + print "\n"; +} diff --git a/egs/fisher_english/s5/run.sh b/egs/fisher_english/s5/run.sh index 77e1ea0870d..67c0d5ce638 100755 --- a/egs/fisher_english/s5/run.sh +++ b/egs/fisher_english/s5/run.sh @@ -181,3 +181,32 @@ steps/train_sat.sh --cmd "$train_cmd" \ # # local/run_nnet2.sh # +# This prepares lang directory with UNK modeled by a phone LM +# local/run_unk_model.sh + +# These are semi-supervised training recipes using 50 hrs and 100 hrs +# of supervised data respectively with 250 hrs of unsupervised data. +# run_50k.sh uses i-vector extractor trained on 300 hrs of combined data, +# while run_100.sh uses i-vector extractor trained on 100 hrs of supervised data. +# run_50k.sh uses 4-gram LM trained on 1250 hrs transcripts, +# while run_100k.sh uses 3-gram LM trained on 100 hrs transcripts. 
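# Approximate layout of the data directories these recipes expect; the subset
# commands below create the supervised/unsupervised splits, and the combined
# sets are built inside local/semisup/run_{50k,100k}.sh:
#   data/train_sup             100 hr  supervised subset (100k utts of data/train)
#   data/train_sup50k           50 hr  supervised subset of data/train_sup
#   data/train_unsup100k_250k  250 hr  unsupervised subset, disjoint from train_sup
#   data/semisup50k_100k_250k  300 hr  train_sup50k + train_unsup100k_250k
#   data/semisup100k_250k      350 hr  train_sup + train_unsup100k_250k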
+ +# local/fisher_train_lms_pocolm.sh +# local/fisher_create_test_lang.sh --arpa-lm data/local/pocolm/data/arpa/4gram_small.arpa.gz --dir data/lang_test_poco +# utils/build_const_arpa_lm.sh data/local/pocolm/data/arpa/4gram_big.arpa.gz data/lang_test_poco data/lang_test_poco_big + +# for lang_dir in data/lang_test_poco; do +# rm -r ${lang_dir}_unk ${lang_dir}_unk_big 2>/dev/null || true +# cp -rT data/lang_unk ${lang_dir}_unk +# cp ${lang_dir}/G.fst ${lang_dir}_unk/G.fst +# cp -rT data/lang_unk ${lang_dir}_unk_big +# cp ${lang_dir}_big/G.carpa ${lang_dir}_unk_big/G.carpa; +# done + +# Create supervised and unsupervised data subsets +# utils/subset_data_dir.sh --speakers data/train 100000 data/train_sup +# utils/subset_data_dir.sh --spk-list <(utils/filter_scp.pl --exclude data/train_sup/spk2utt data/train/spk2utt) data/train data/train_unsup100k +# utils/subset_data_dir.sh --speakers data/train_unsup100k 250000 data/train_unsup100k_250k + +# local/semisup/run_50k.sh +# local/semisup/run_100k.sh diff --git a/egs/wsj/s5/steps/best_path_weights.sh b/egs/wsj/s5/steps/best_path_weights.sh new file mode 100755 index 00000000000..d34d574173f --- /dev/null +++ b/egs/wsj/s5/steps/best_path_weights.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +# Copyright 2014-17 Vimal Manohar + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# This script gets from the lattice the best path alignments and frame-level +# posteriors of the pdfs in the best path alignment. +# The output directory has the format of an alignment directory. +# It can optionally read alignments from a directory, in which case, +# the script gets frame-level posteriors of the pdf corresponding to those +# alignments. +# The frame-level posteriors in the form of kaldi vectors and are +# output in weights.scp. + +set -e + +# begin configuration section. +cmd=run.pl +stage=-10 +acwt=0.1 +#end configuration section. + +if [ -f ./path.sh ]; then . ./path.sh; fi +. utils/parse_options.sh || exit 1; + +if [ $# -ne 3 ] && [ $# -ne 4 ]; then + cat < [] + E.g. $0 data/train_unt.seg exp/tri1/decode exp/tri1/best_path + Options: + --cmd (run.pl|queue.pl...) # specify how to run the sub-processes. +EOF + + exit 1; +fi + +data=$1 +decode_dir=$2 +dir=${@: -1} # last argument to the script + +ali_dir=$dir +if [ $# -eq 4 ]; then + ali_dir=$3 +fi + +mkdir -p $dir + +nj=$(cat $decode_dir/num_jobs) +echo $nj > $dir/num_jobs + +if [ $stage -le 1 ]; then + mkdir -p $dir/log + $cmd JOB=1:$nj $dir/log/best_path.JOB.log \ + lattice-best-path --acoustic-scale=$acwt \ + "ark,s,cs:gunzip -c $decode_dir/lat.JOB.gz |" \ + ark:/dev/null "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 +fi + +# Find where the final.mdl is. 
+if [ -f $(dirname $decode_dir)/final.mdl ]; then + src_dir=$(dirname $decode_dir) +else + src_dir=$decode_dir +fi + +cp $src_dir/cmvn_opts $dir/ || exit 1 +for f in final.mat splice_opts frame_subsampling_factor; do + if [ -f $src_dir/$f ]; then cp $src_dir/$f $dir; fi +done + +# make $dir an absolute pathname. +fdir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}) + +model=$src_dir/final.mdl +tree=$src_dir/tree + +for f in $model $decode_dir/lat.1.gz $tree; do + if [ ! -f $f ]; then echo "$0: expecting file $f to exist" && exit 1; fi +done + +cp $model $tree $dir || exit 1 + +ali_nj=$(cat $ali_dir/num_jobs) || exit 1 +if [ $nj -ne $ali_nj ]; then + echo "$0: $decode_dir and $ali_dir have different number of jobs. Redo alignment with $nj jobs." + exit 1 +fi + +if [ $stage -lt 2 ]; then + $cmd JOB=1:$nj $dir/log/get_post.JOB.log \ + lattice-to-post --acoustic-scale=$acwt \ + "ark,s,cs:gunzip -c $decode_dir/lat.JOB.gz|" ark:- \| \ + post-to-pdf-post $model ark,s,cs:- ark:- \| \ + get-post-on-ali ark,s,cs:- \ + "ark,s,cs:gunzip -c $ali_dir/ali.JOB.gz | convert-ali $dir/final.mdl $model $tree ark,s,cs:- ark:- | ali-to-pdf $model ark,s,cs:- ark:- |" \ + "ark,scp:$fdir/weights.JOB.ark,$fdir/weights.JOB.scp" || exit 1 +fi + +for n in `seq $nj`; do + cat $dir/weights.$n.scp +done > $dir/weights.scp + +rm $dir/weights.*.scp + +exit 0 diff --git a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py index afa75eb0296..63b1c12c759 100755 --- a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py @@ -515,7 +515,7 @@ def generate_acc_logprob_report(exp_dir, key="accuracy", output="output"): except: tb = traceback.format_exc() logger.warning("Error getting info from logs, exception was: " + tb) - times = [] + times = {} report = [] report.append("%Iter\tduration\ttrain_objective\tvalid_objective\tdifference") @@ -532,7 +532,7 @@ def generate_acc_logprob_report(exp_dir, key="accuracy", output="output"): try: report.append("%d\t%s\t%g\t%g\t%g" % (x[0], str(times[x[0]]), x[1], x[2], x[2]-x[1])) - except KeyError: + except KeyError, IndexError: continue total_time = 0 diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 5ae7aecd36c..854a37a52b7 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -129,7 +129,8 @@ def train_new_models(dir, iter, srand, num_jobs, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch_str, frame_subsampling_factor, run_opts, train_opts, - backstitch_training_scale=0.0, backstitch_training_interval=1): + backstitch_training_scale=0.0, backstitch_training_interval=1, + use_multitask_egs=False): """ Called from train_one_iteration(), this method trains new models with 'num_jobs' jobs, and @@ -140,6 +141,12 @@ def train_new_models(dir, iter, srand, num_jobs, to use for each job is a little complex, so we spawn each one separately. this is no longer true for RNNs as we use do not use the --frame option but we use the same script for consistency with FF-DNN code + + use_multitask_egs : True, if different examples used to train multiple + tasks or outputs, e.g.multilingual training. + multilingual egs can be generated using get_egs.sh and + steps/nnet3/multilingual/allocate_multilingual_examples.py, + those are the top-level scripts. 
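        As a rough illustration only (this helper is hypothetical, not code
        from this function): switching use_multitask_egs on amounts to reading
        the scp files written by those scripts instead of plain .ark archives,

            def egs_rspecifier(egs_dir, archive_index, use_multitask_egs):
                # scp-based egs for multitask/multilingual training,
                # plain archives otherwise (mirrors the logic further down).
                scp_or_ark = "scp" if use_multitask_egs else "ark"
                return "{0}:{1}/cegs.{2}.{0}".format(
                    scp_or_ark, egs_dir, archive_index)

        together with the extra options that
        common_train_lib.get_multitask_egs_opts() adds to the copy-egs command.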
""" deriv_time_opts = [] @@ -167,6 +174,12 @@ def train_new_models(dir, iter, srand, num_jobs, frame_shift = ((archive_index + k/num_archives) % frame_subsampling_factor) + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="cegs.", + archive_index=archive_index, + use_multitask_egs=use_multitask_egs) + scp_or_ark = "scp" if use_multitask_egs else "ark" cache_io_opts = (("--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) if iter > 0 else "") + @@ -187,9 +200,9 @@ def train_new_models(dir, iter, srand, num_jobs, --l2-regularize-factor={l2_regularize_factor} {train_opts} \ --srand={srand} \ "{raw_model}" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs \ + "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} \ --frame-shift={fr_shft} \ - ark:{egs_dir}/cegs.{archive_index}.ark ark:- | \ + {scp_or_ark}:{egs_dir}/cegs.{archive_index}.{scp_or_ark} ark:- | \ nnet3-chain-shuffle-egs --buffer-size={buf_size} \ --srand={srand} ark:- ark:- | nnet3-chain-merge-egs \ --minibatch-size={num_chunk_per_mb} ark:- ark:- |" \ @@ -213,17 +226,17 @@ def train_new_models(dir, iter, srand, num_jobs, raw_model=raw_model_string, egs_dir=egs_dir, archive_index=archive_index, buf_size=shuffle_buffer_size, - num_chunk_per_mb=num_chunk_per_minibatch_str), + num_chunk_per_mb=num_chunk_per_minibatch_str, + multitask_egs_opts=multitask_egs_opts, + scp_or_ark=scp_or_ark), require_zero_status=True) threads.append(thread) - for thread in threads: thread.join() - def train_one_iteration(dir, iter, srand, egs_dir, num_jobs, num_archives_processed, num_archives, learning_rate, shrinkage_value, @@ -235,7 +248,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, momentum, max_param_change, shuffle_buffer_size, frame_subsampling_factor, run_opts, dropout_edit_string="", train_opts="", - backstitch_training_scale=0.0, backstitch_training_interval=1): + backstitch_training_scale=0.0, backstitch_training_interval=1, + use_multitask_egs=False): """ Called from steps/nnet3/chain/train.py for one iteration for neural network training with LF-MMI objective @@ -265,7 +279,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, compute_train_cv_probabilities( dir=dir, iter=iter, egs_dir=egs_dir, l2_regularize=l2_regularize, xent_regularize=xent_regularize, - leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts) + leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts, + use_multitask_egs=use_multitask_egs) if iter > 0: # Runs in the background @@ -312,7 +327,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, # first few iterations (hard-coded as 15) backstitch_training_scale=(backstitch_training_scale * iter / 15 if iter < 15 else backstitch_training_scale), - backstitch_training_interval=backstitch_training_interval) + backstitch_training_interval=backstitch_training_interval, + use_multitask_egs=use_multitask_egs) [models_to_average, best_model] = common_train_lib.get_successful_models( num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter)) @@ -352,11 +368,13 @@ def train_one_iteration(dir, iter, srand, egs_dir, os.remove("{0}/cache.{1}".format(dir, iter)) -def check_for_required_files(feat_dir, tree_dir, lat_dir): +def check_for_required_files(feat_dir, tree_dir, lat_dir=None): files = ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), - '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir), + '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir)] + if lat_dir is not None: + files += [ '{0}/lat.1.gz'.format(lat_dir), 
'{0}/final.mdl'.format(lat_dir), - '{0}/num_jobs'.format(lat_dir), '{0}/splice_opts'.format(lat_dir)] + '{0}/num_jobs'.format(lat_dir)] for file in files: if not os.path.isfile(file): raise Exception('Expected {0} to exist.'.format(file)) @@ -364,7 +382,7 @@ def check_for_required_files(feat_dir, tree_dir, lat_dir): def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, max_lda_jobs=None, rand_prune=4.0, - lda_opts=None): + lda_opts=None, use_multitask_egs=False): """ Function to estimate and write LDA matrix from cegs This function is exactly similar to the version in module @@ -374,17 +392,28 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, if max_lda_jobs is not None: if num_lda_jobs > max_lda_jobs: num_lda_jobs = max_lda_jobs + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="cegs.", + archive_index="JOB", + use_multitask_egs=use_multitask_egs) + scp_or_ark = "scp" if use_multitask_egs else "ark" + egs_rspecifier = ( + "ark:nnet3-chain-copy-egs {multitask_egs_opts} " + "{scp_or_ark}:{egs_dir}/cegs.JOB.{scp_or_ark} ark:- |" + "".format(egs_dir=egs_dir, scp_or_ark=scp_or_ark, + multitask_egs_opts=multitask_egs_opts)) # Write stats with the same format as stats for LDA. common_lib.execute_command( """{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ nnet3-chain-acc-lda-stats --rand-prune={rand_prune} \ - {dir}/init.raw "ark:{egs_dir}/cegs.JOB.ark" \ + {dir}/init.raw "{egs_rspecifier}" \ {dir}/JOB.lda_stats""".format( command=run_opts.command, num_lda_jobs=num_lda_jobs, dir=dir, - egs_dir=egs_dir, + egs_rspecifier=egs_rspecifier, rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats @@ -445,32 +474,50 @@ def prepare_initial_acoustic_model(dir, run_opts, srand=-1, input_model=None): def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, - run_opts): + run_opts, + use_multitask_egs=False): model = '{0}/{1}.mdl'.format(dir, iter) + scp_or_ark = "scp" if use_multitask_egs else "ark" + egs_suffix = ".scp" if use_multitask_egs else ".cegs" + + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="valid_diagnostic.", + use_multitask_egs=use_multitask_egs) + common_lib.background_command( """{command} {dir}/log/compute_prob_valid.{iter}.log \ nnet3-chain-compute-prob --l2-regularize={l2} \ --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs ark:{egs_dir}/valid_diagnostic.cegs \ + "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/valid_diagnostic{egs_suffix} \ ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \ """.format(command=run_opts.command, dir=dir, iter=iter, model=model, l2=l2_regularize, leaky=leaky_hmm_coefficient, xent_reg=xent_regularize, - egs_dir=egs_dir)) + egs_dir=egs_dir, + multitask_egs_opts=multitask_egs_opts, + scp_or_ark=scp_or_ark, egs_suffix=egs_suffix)) + + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="train_diagnostic.", + use_multitask_egs=use_multitask_egs) common_lib.background_command( """{command} {dir}/log/compute_prob_train.{iter}.log \ nnet3-chain-compute-prob --l2-regularize={l2} \ --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs ark:{egs_dir}/train_diagnostic.cegs \ 
+ "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/train_diagnostic{egs_suffix} \ ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \ """.format(command=run_opts.command, dir=dir, iter=iter, model=model, l2=l2_regularize, leaky=leaky_hmm_coefficient, xent_reg=xent_regularize, - egs_dir=egs_dir)) + egs_dir=egs_dir, + multitask_egs_opts=multitask_egs_opts, + scp_or_ark=scp_or_ark, egs_suffix=egs_suffix)) def compute_progress(dir, iter, run_opts): @@ -510,10 +557,12 @@ def compute_progress(dir, iter, run_opts): model=model)) + def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str, egs_dir, leaky_hmm_coefficient, l2_regularize, xent_regularize, run_opts, - max_objective_evaluations=30): + max_objective_evaluations=30, + use_multitask_egs=False): """ Function to do model combination In the nnet3 setup, the logic @@ -536,6 +585,14 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st print("{0}: warning: model file {1} does not exist " "(final combination)".format(sys.argv[0], model_file)) + scp_or_ark = "scp" if use_multitask_egs else "ark" + egs_suffix = ".scp" if use_multitask_egs else ".cegs" + + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="combine.", + use_multitask_egs=use_multitask_egs) + # We reverse the order of the raw model strings so that the freshest one # goes first. This is important for systems that include batch # normalization-- it means that the freshest batch-norm stats are used. @@ -550,7 +607,7 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st --max-objective-evaluations={max_objective_evaluations} \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ --verbose=3 {combine_gpu_opt} {dir}/den.fst {raw_models} \ - "ark,bg:nnet3-chain-copy-egs ark:{egs_dir}/combine.cegs ark:- | \ + "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/combine{egs_suffix} ark:- | \ nnet3-chain-merge-egs --minibatch-size={num_chunk_per_mb} \ ark:- ark:- |" - \| \ nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl \ @@ -563,7 +620,9 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st dir=dir, raw_models=" ".join(raw_model_strings), num_chunk_per_mb=num_chunk_per_minibatch_str, num_iters=num_iters, - egs_dir=egs_dir)) + egs_dir=egs_dir, + multitask_egs_opts=multitask_egs_opts, + scp_or_ark=scp_or_ark, egs_suffix=egs_suffix)) # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the @@ -572,4 +631,5 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st dir=dir, iter='final', egs_dir=egs_dir, l2_regularize=l2_regularize, xent_regularize=xent_regularize, leaky_hmm_coefficient=leaky_hmm_coefficient, - run_opts=run_opts) + run_opts=run_opts, + use_multitask_egs=use_multitask_egs) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 6b572acb5d7..c18003a626e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -329,21 +329,32 @@ def train_one_iteration(dir, iter, srand, egs_dir, def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, max_lda_jobs=None, rand_prune=4.0, - lda_opts=None): + lda_opts=None, use_multitask_egs=False): if max_lda_jobs is not None: if num_lda_jobs > max_lda_jobs: 
num_lda_jobs = max_lda_jobs + multitask_egs_opts = common_train_lib.get_multitask_egs_opts( + egs_dir, + egs_prefix="egs.", + archive_index="JOB", + use_multitask_egs=use_multitask_egs) + scp_or_ark = "scp" if use_multitask_egs else "ark" + egs_rspecifier = ( + "ark:nnet3-copy-egs {multitask_egs_opts} " + "{scp_or_ark}:{egs_dir}/egs.JOB.{scp_or_ark} ark:- |" + "".format(egs_dir=egs_dir, scp_or_ark=scp_or_ark, + multitask_egs_opts=multitask_egs_opts)) # Write stats with the same format as stats for LDA. common_lib.execute_command( """{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ nnet3-acc-lda-stats --rand-prune={rand_prune} \ - {dir}/init.raw "ark:{egs_dir}/egs.JOB.ark" \ + {dir}/init.raw "{egs_rspecifier}" \ {dir}/JOB.lda_stats""".format( command=run_opts.command, num_lda_jobs=num_lda_jobs, dir=dir, - egs_dir=egs_dir, + egs_rspecifier=egs_rspecifier, rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index eda1461a2ab..99a4fb28ff6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -367,6 +367,14 @@ class XconfigTrivialOutputLayer(XconfigLayerBase): This is for outputs that are not really output "layers" (there is no affine transform or nonlinearity), they just directly map to an output-node in nnet3. + + Parameters of the class, and their defaults: + input='[-1]' : Descriptor giving the input of the layer. + objective-type=linear : the only other choice currently is + 'quadratic', for use in regression problems + output-delay=0 : Can be used to shift the frames on the output, equivalent + to delaying labels by this many frames (positive value increases latency + in online decoding but may help if you're using unidirectional LSTMs. """ def __init__(self, first_token, key_to_value, prev_names=None): @@ -378,11 +386,17 @@ def set_default_configs(self): # note: self.config['input'] is a descriptor, '[-1]' means output # the most recent layer. - self.config = {'input': '[-1]', 'dim': -1} + self.config = {'input': '[-1]', 'dim': -1, + 'objective-type': 'linear', + 'output-delay': 0} def check_configs(self): - pass # nothing to check; descriptor-parsing can't happen in this function. + if self.config['objective-type'] != 'linear' and \ + self.config['objective-type'] != 'quadratic': + raise RuntimeError("In output, objective-type has" + " invalid value {0}" + "".format(self.config['objective-type'])) def output_name(self, auxiliary_outputs=None): @@ -412,11 +426,19 @@ def get_full_config(self): # by 'output-string' we mean a string that can appear in # config-files, i.e. it contains the 'final' names of nodes. 
descriptor_final_str = self.descriptors['input']['final-string'] + objective_type = self.config['objective-type'] + output_delay = self.config['output-delay'] - for config_name in ['init', 'ref', 'final']: + if output_delay != 0: + descriptor_final_str = ( + 'Offset({0}, {1})'.format(descriptor_final_str, output_delay)) + + for config_name in ['ref', 'final']: ans.append((config_name, - 'output-node name={0} input={1}'.format( - self.name, descriptor_final_str))) + 'output-node name={0} input={1} ' + 'objective={2}'.format( + self.name, descriptor_final_str, + objective_type))) return ans @@ -507,28 +529,38 @@ def check_configs(self): " invalid value {0}" "".format(self.config['learning-rate-factor'])) - # you cannot access the output of this layer from other layers... see - # comment in output_name for the reason why. def auxiliary_outputs(self): - return [] + auxiliary_outputs = ['affine'] + if self.config['include-log-softmax']: + auxiliary_outputs.append('log-softmax') - def output_name(self, auxiliary_outputs=None): + return auxiliary_outputs + + def output_name(self, auxiliary_output=None): - # Note: nodes of type output-node in nnet3 may not be accessed in - # Descriptors, so calling this with auxiliary_outputs=None doesn't - # make sense. But it might make sense to make the output of the softmax - # layer and/or the output of the affine layer available as inputs to - # other layers, in some circumstances. - # we'll implement that when it's needed. - raise RuntimeError("Outputs of output-layer may not be used by other" - " layers") + if auxiliary_output is None: + # Note: nodes of type output-node in nnet3 may not be accessed in + # Descriptors, so calling this with auxiliary_outputs=None doesn't + # make sense. + raise RuntimeError("Outputs of output-layer may not be used by other" + " layers") + + if auxiliary_output in self.auxiliary_outputs(): + return '{0}.{1}'.format(self.name, auxiliary_output) + else: + raise RuntimeError("Unknown auxiliary output name {0}" + "".format(auxiliary_output)) def output_dim(self, auxiliary_output=None): - # see comment in output_name(). - raise RuntimeError("Outputs of output-layer may not be used by other" - " layers") + if auxiliary_output is None: + # Note: nodes of type output-node in nnet3 may not be accessed in + # Descriptors, so calling this with auxiliary_outputs=None doesn't + # make sense. + raise RuntimeError("Outputs of output-layer may not be used by other" + " layers") + return self.config['dim'] def get_full_config(self): ans = [] diff --git a/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh b/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh new file mode 100755 index 00000000000..a075b8debe8 --- /dev/null +++ b/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +# Copyright 2014 Guoguo Chen +# 2017 Vimal Manohar +# Apache 2.0 + +# This script rescores non-compact, (possibly) undeterminized lattices with the +# ConstArpaLm format language model. +# This is similar to steps/lmrescore_const_arpa.sh, but expects +# non-compact lattices as input. +# This works by first determinizing the lattice and rescoring it with +# const ARPA LM, followed by composing it with the original lattice to add the +# new LM scores. + +# If you use the option "--write compact false" it outputs non-compact lattices; +# the purpose is to add in LM scores while leaving the frame-by-frame acoustic +# scores in the same position that they were in in the input, undeterminized +# lattices. 
This is important in our 'chain' semi-supervised training recipes, +# where it helps us to split lattices while keeping the scores at the edges of +# the split points correct. + +# Begin configuration section. +cmd=run.pl +skip_scoring=false +stage=1 +scoring_opts= +write_compact=true # If set to false, writes lattice in non-compact format. + # This retains the acoustic scores on the arcs of the lattice. + # Useful for another stage of LM rescoring. +acwt=0.1 # used for pruning and determinization +beam=8.0 # beam used in determinization + +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +. ./utils/parse_options.sh + +if [ $# != 5 ]; then + cat < \\ + + options: [--cmd (run.pl|queue.pl [queue opts])] + See also: steps/lmrescore_const_arpa.sh +EOF + exit 1; +fi + +[ -f path.sh ] && . ./path.sh; + +oldlang=$1 +newlang=$2 +data=$3 +indir=$4 +outdir=$5 + +oldlm=$oldlang/G.fst +newlm=$newlang/G.carpa +! cmp $oldlang/words.txt $newlang/words.txt &&\ + echo "$0: Warning: vocabularies may be incompatible." +[ ! -f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1; +[ ! -f $newlm ] && echo "$0: Missing file $newlm" && exit 1; +! ls $indir/lat.*.gz >/dev/null &&\ + echo "$0: No lattices input directory $indir" && exit 1; + +if ! cmp -s $oldlang/words.txt $newlang/words.txt; then + echo "$0: $oldlang/words.txt and $newlang/words.txt differ: make sure you know what you are doing."; +fi + +oldlmcommand="fstproject --project_output=true $oldlm |" + +mkdir -p $outdir/log +nj=`cat $indir/num_jobs` || exit 1; +cp $indir/num_jobs $outdir + +lats_rspecifier="ark:gunzip -c $indir/lat.JOB.gz |" + +lats_wspecifier="ark:| gzip -c > $outdir/lat.JOB.gz" + +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-determinize-pruned --acoustic-scale=$acwt --beam=$beam \ + "ark:gunzip -c $indir/lat.JOB.gz |" ark:- \| \ + lattice-scale --lm-scale=0.0 --acoustic-scale=0.0 ark:- ark:- \| \ + lattice-lmrescore --lm-scale=-1.0 ark:- "$oldlmcommand" ark:- \| \ + lattice-lmrescore-const-arpa --lm-scale=1.0 \ + ark:- "$newlm" ark:- \| \ + lattice-project ark:- ark:- \| \ + lattice-compose --write-compact=$write_compact \ + "$lats_rspecifier" \ + ark,s,cs:- "$lats_wspecifier" || exit 1 +fi + +if ! $skip_scoring && [ $stage -le 2 ]; then + err_msg="Not scoring because local/score.sh does not exist or not executable." + [ ! -x local/score.sh ] && echo $err_msg && exit 1; + local/score.sh --cmd "$cmd" $scoring_opts $data $newlang $outdir +else + echo "Not scoring because requested so..." +fi + +exit 0; diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh index 3c2c770470c..23fb62d7a87 100755 --- a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh @@ -175,7 +175,7 @@ fi if [ $stage -le -1 ]; then # Convert the alignments to the new tree. Note: we likely will not use these - # converted alignments in the CTC system directly, but they could be useful + # converted alignments in the chain system directly, but they could be useful # for other purposes. 
echo "$0: Converting alignments from $alidir to use current tree" $cmd JOB=1:$nj $dir/log/convert.JOB.log \ diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree_multiple_sources.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree_multiple_sources.sh new file mode 100755 index 00000000000..48028634e26 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree_multiple_sources.sh @@ -0,0 +1,275 @@ +#!/bin/bash +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# 2017 Vimal Manohar +# Apache 2.0. + +# This script is similar to steps/nnet3/chain/build_tree.sh but supports +# getting statistics from multiple alignment sources. + + +# Begin configuration section. +stage=-5 +exit_stage=-100 # you can use this to require it to exit at the + # beginning of a specific stage. Not all values are + # supported. +cmd=run.pl +use_fmllr=true # If true, fmllr transforms will be applied from the alignment directories. + # Otherwise, no fmllr will be applied even if alignment directory contains trans.* +context_opts= # e.g. set this to "--context-width 5 --central-position 2" for quinphone. +cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves +frame_subsampling_factor=1 # frame subsampling factor of output w.r.t. to the input features +tree_stats_opts= +cluster_phones_opts= +repeat_frames=false +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -lt 5 ]; then + echo "Usage: steps/nnet3/chain/build_tree_multiple_sources.sh <#leaves> [ ... ] " + echo " e.g.: steps/nnet3/chain/build_tree_multiple_sources.sh 15000 data/lang data/train_sup exp/tri3_ali data/train_unsup exp/tri3/best_path_train_unsup exp/tree_semi" + echo "Main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --config # config containing options" + echo " --stage # stage to do partial re-run from." + echo " --repeat-frames # Only affects alignment conversion at" + echo " # the end. If true, generate an " + echo " # alignment using the frame-subsampled " + echo " # topology that is repeated " + echo " # --frame-subsampling-factor times " + echo " # and interleaved, to be the same " + echo " # length as the original alignment " + echo " # (useful for cross-entropy training " + echo " # of reduced frame rate systems)." + exit 1; +fi + +numleaves=$1 +lang=$2 +dir=${@: -1} # last argument to the script +shift 2; +data_and_alidirs=( $@ ) # read the remaining arguments into an array +unset data_and_alidirs[${#data_and_alidirs[@]}-1] # 'pop' the last argument which is odir +num_sys=$[${#data_and_alidirs[@]}] # number of systems to combine + +if (( $num_sys % 2 != 0 )); then + echo "$0: The data and alignment arguments must be an even number of arguments." + exit 1 +fi + +num_sys=$((num_sys / 2)) + +data=$dir/data_tmp +mkdir -p $data + +mkdir -p $dir +alidir=`echo ${data_and_alidirs[1]}` + +datadirs=() +alidirs=() +for n in `seq 0 $[num_sys-1]`; do + datadirs[$n]=${data_and_alidirs[$[2*n]]} + alidirs[$n]=${data_and_alidirs[$[2*n+1]]} +done + +utils/combine_data.sh $data ${datadirs[@]} || exit 1 + +for f in $data/feats.scp $lang/phones.txt $alidir/final.mdl $alidir/ali.1.gz; do + [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; +done + +oov=`cat $lang/oov.int` +nj=`cat $alidir/num_jobs` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` +ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1; +sdata=$data/split$nj; +splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. +cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` || exit 1 +delta_opts=`cat $alidir/delta_opts 2>/dev/null` + +mkdir -p $dir/log +cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options. +cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $alidir/delta_opts $dir 2>/dev/null # delta option. + +utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1; +cp $lang/phones.txt $dir || exit 1; + +echo $nj >$dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +# Set up features. +if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi + +echo "$0: feature type is $feat_type" + +feats=() +feats_one=() +for n in `seq 0 $[num_sys-1]`; do + this_nj=$(cat ${alidirs[$n]}/num_jobs) || exit 1 + this_sdata=${datadirs[$n]}/split$this_nj + [[ -d $this_sdata && ${datadirs[$n]}/feats.scp -ot $this_sdata ]] || split_data.sh ${datadirs[$n]} $this_nj || exit 1; + ## Set up speaker-independent features. + case $feat_type in + delta) feats[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/JOB/utt2spk scp:$this_sdata/JOB/cmvn.scp scp:$this_sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |" + feats_one[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/1/utt2spk scp:$this_sdata/1/cmvn.scp scp:$this_sdata/1/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; + lda) feats[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/JOB/utt2spk scp:$this_sdata/JOB/cmvn.scp scp:$this_sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" + feats_one[$n]="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$this_sdata/1/utt2spk scp:$this_sdata/1/cmvn.scp scp:$this_sdata/1/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" + cp $alidir/final.mat $dir + cp $alidir/full.mat $dir 2>/dev/null + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; + esac + + if $use_fmllr; then + if [ ! -f ${alidirs[$n]}/trans.1 ]; then + echo "$0: Could not find fMLLR transforms in ${alidirs[$n]}" + exit 1 + fi + + echo "$0: Using transforms from ${alidirs[$n]}" + feats[i]="${feats[i]} transform-feats --utt2spk=ark:$this_sdata/JOB/utt2spk ark,s,cs:${alidirs[$n]}/trans.JOB ark:- ark:- |" + feats_one[i]="${feats_one[i]} transform-feats --utt2spk=ark:$this_sdata/1/utt2spk ark,s,cs:${alidirs[$n]}/trans.1 ark:- ark:- |" + fi + + # Do subsampling of feats, if needed + if [ $frame_subsampling_factor -gt 1 ]; then + feats[$n]="${feats[$n]} subsample-feats --n=$frame_subsampling_factor ark:- ark:- |" + feats_one[$n]="${feats_one[$n]} subsample-feats --n=$frame_subsampling_factor ark:- ark:- |" + fi +done + +if [ $stage -le -5 ]; then + echo "$0: Initializing monophone model (for alignment conversion, in case topology changed)" + + [ ! -f $lang/phones/sets.int ] && exit 1; + shared_phones_opt="--shared-phones=$lang/phones/sets.int" + # get feature dimension + example_feats="`echo ${feats[0]} | sed s/JOB/1/g`"; + if ! feat_dim=$(feat-to-dim "$example_feats" - 2>/dev/null) || [ -z $feat_dim ]; then + feat-to-dim "$example_feats" - # to see the error message. 
+ echo "error getting feature dimension" + exit 1; + fi + + for n in `seq 0 $[num_sys-1]`; do + copy-feats "${feats_one[$n]}" ark:- + done | copy-feats ark:- ark:$dir/tmp.ark + + $cmd $dir/log/init_mono.log \ + gmm-init-mono $shared_phones_opt \ + "--train-feats=ark:subset-feats --n=10 ark:$dir/tmp.ark ark:- |" $lang/topo $feat_dim \ + $dir/mono.mdl $dir/mono.tree || exit 1 +fi + + +if [ $stage -le -4 ]; then + # Get tree stats. + + for n in `seq 0 $[num_sys-1]`; do + echo "$0: Accumulating tree stats" + this_data=${datadirs[$n]} + this_alidir=${alidirs[$n]} + this_nj=$(cat $this_alidir/num_jobs) || exit 1 + this_frame_subsampling_factor=1 + if [ -f $this_alidir/frame_subsampling_factor ]; then + this_frame_subsampling_factor=$(cat $this_alidir/frame_subsampling_factor) + fi + + if (( $frame_subsampling_factor % $this_frame_subsampling_factor != 0 )); then + echo "$0: frame-subsampling-factor=$frame_subsampling_factor is not " + echo "divisible by $this_frame_subsampling_factor (that of $this_alidir)" + exit 1 + fi + + this_frame_subsampling_factor=$((frame_subsampling_factor / this_frame_subsampling_factor)) + $cmd JOB=1:$this_nj $dir/log/acc_tree.$n.JOB.log \ + convert-ali --frame-subsampling-factor=$this_frame_subsampling_factor \ + $this_alidir/final.mdl $dir/mono.mdl $dir/mono.tree "ark:gunzip -c $this_alidir/ali.JOB.gz|" ark:- \| \ + acc-tree-stats $context_opts $tree_stats_opts --ci-phones=$ciphonelist $dir/mono.mdl \ + "${feats[$n]}" ark:- $dir/$n.JOB.treeacc || exit 1; + [ "`ls $dir/$n.*.treeacc | wc -w`" -ne "$this_nj" ] && echo "$0: Wrong #tree-accs for data $n $this_data" && exit 1; + done + + $cmd $dir/log/sum_tree_acc.log \ + sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1; + rm $dir/*.treeacc +fi + +if [ $stage -le -3 ] && $train_tree; then + echo "$0: Getting questions for tree clustering." + # preparing questions, roots file... + $cmd $dir/log/questions.log \ + cluster-phones $cluster_phones_opts $context_opts $dir/treeacc \ + $lang/phones/sets.int $dir/questions.int || exit 1; + cat $lang/phones/extra_questions.int >> $dir/questions.int + $cmd $dir/log/compile_questions.log \ + compile-questions \ + $context_opts $lang/topo $dir/questions.int $dir/questions.qst || exit 1; + + echo "$0: Building the tree" + $cmd $dir/log/build_tree.log \ + build-tree $context_opts --verbose=1 --max-leaves=$numleaves \ + --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree || exit 1; +fi + +if [ $stage -le -2 ]; then + echo "$0: Initializing the model" + gmm-init-model --write-occs=$dir/1.occs \ + $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1; + grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning."; + rm $dir/treeacc +fi + +if [ $stage -le -1 ]; then + # Convert the alignments to the new tree. Note: we likely will not use these + # converted alignments in the chain system directly, but they could be useful + # for other purposes. 
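The per-source frame-subsampling arithmetic used when accumulating the tree stats above, and repeated below when converting the alignments, is simple; a minimal Python sketch of it (the function name is illustrative, the script itself keeps this in bash):

    def relative_subsampling_factor(target_factor, source_factor):
        """How much further a source alignment must be subsampled so that it
        matches the target frame rate of the tree being built."""
        if target_factor % source_factor != 0:
            raise ValueError("frame-subsampling-factor=%d is not divisible "
                             "by %d" % (target_factor, source_factor))
        return target_factor // source_factor

    # e.g. chain target factor 3 over alignments produced without subsampling:
    assert relative_subsampling_factor(3, 1) == 3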
+ + for n in `seq 0 $[num_sys-1]`; do + this_alidir=${alidirs[$n]} + this_nj=$(cat $this_alidir/num_jobs) || exit 1 + + this_frame_subsampling_factor=1 + if [ -f $this_alidir/frame_subsampling_factor ]; then + this_frame_subsampling_factor=$(cat $this_alidir/frame_subsampling_factor) + fi + + if (( $frame_subsampling_factor % $this_frame_subsampling_factor != 0 )); then + echo "$0: frame-subsampling-factor=$frame_subsampling_factor is not " + echo "divisible by $this_frame_subsampling_factor (hat of $this_alidir)" + exit 1 + fi + + echo "$0: frame-subsampling-factor for $this_alidir is $this_frame_subsampling_factor" + + this_frame_subsampling_factor=$((frame_subsampling_factor / this_frame_subsampling_factor)) + echo "$0: Converting alignments from $this_alidir to use current tree" + $cmd JOB=1:$this_nj $dir/log/convert.$n.JOB.log \ + convert-ali --repeat-frames=$repeat_frames \ + --frame-subsampling-factor=$this_frame_subsampling_factor \ + $this_alidir/final.mdl $dir/1.mdl $dir/tree "ark:gunzip -c $this_alidir/ali.JOB.gz |" \ + ark,scp:$dir/ali.$n.JOB.ark,$dir/ali.$n.JOB.scp || exit 1 + + for i in `seq $this_nj`; do + cat $dir/ali.$n.$i.scp + done > $dir/ali.$n.scp || exit 1 + done + + for n in `seq 0 $[num_sys-1]`; do + cat $dir/ali.$n.scp + done | sort -k1,1 > $dir/ali.scp || exit 1 + + utils/split_data.sh $data $nj + $cmd JOB=1:$nj $dir/log/copy_alignments.JOB.log \ + copy-int-vector "scp:utils/filter_scp.pl $data/split$nj/JOB/utt2spk $dir/ali.scp |" \ + "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 +fi + +cp $dir/1.mdl $dir/final.mdl + +echo $0: Done building tree diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 0294df0d84a..99e7499bd30 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -46,15 +46,15 @@ frames_per_iter=400000 # each iteration of training, see this many frames per # used. This is just a guideline; it will pick a number # that divides the number of samples in the entire data. -right_tolerance= #CTC right tolerance == max label delay. +right_tolerance= # chain right tolerance == max label delay. left_tolerance= transform_dir= # If supplied, overrides latdir as the place to find fMLLR transforms stage=0 -nj=15 # This should be set to the maximum number of jobs you are - # comfortable to run in parallel; you can increase it if your disk - # speed is greater and you have more machines. +max_jobs_run=15 # This should be set to the maximum number of nnet3-chain-get-egs jobs you are + # comfortable to run in parallel; you can increase it if your disk + # speed is greater and you have more machines. max_shuffle_jobs_run=50 # the shuffle jobs now include the nnet3-chain-normalize-egs command, # which is fairly CPU intensive, so we can run quite a few at once # without overloading the disks. @@ -63,6 +63,17 @@ online_ivector_dir= # can be used if we are including speaker information as iV cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, # it doesn't make sense to use different options than were used as input to the # LDA transform). This is used to turn off CMVN in the online-nnet experiments. +lattice_lm_scale= # If supplied, the graph/lm weight of the lattices will be + # used (with this scale) in generating supervisions + # This is 0 by default for conventional supervised training, + # but may be close to 1 for the unsupervised part of the data + # in semi-supervised training. The optimum is usually + # 0.5 for unsupervised data. 
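When this option is set, the script further down passes it to chain-get-supervision as --lm-scale and puts the complementary weight on the normalization FST. A small Python sketch of that arithmetic (the script itself does this with an inline perl one-liner; the function name here is illustrative):

    def normalization_fst_scale(lattice_lm_scale):
        """Weight left for the normalization FST when the lattice LM scores
        are kept with weight lattice_lm_scale (mirrors the inline check)."""
        if lattice_lm_scale < 0.0 or lattice_lm_scale >= 1.0:
            raise ValueError("Invalid --lattice-lm-scale %s" % lattice_lm_scale)
        return 1.0 - lattice_lm_scale

    # Typical semi-supervised setting: split the weight evenly.
    assert normalization_fst_scale(0.5) == 0.5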
+lattice_prune_beam= # If supplied, the lattices will be pruned to this beam, + # before being used to get supervisions. +acwt=0.1 # For pruning +deriv_weights_scp= +generate_egs_scp=false echo "$0 $@" # Print the command line for logging @@ -80,7 +91,7 @@ if [ $# != 4 ]; then echo "" echo "Main options (for others, see top of script file)" echo " --config # config file containing options" - echo " --nj # The maximum number of jobs you want to run in" + echo " --max-jobs-run # The maximum number of jobs you want to run in" echo " # parallel (increase this only if you have good disk and" echo " # network speed). default=6" echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." @@ -94,8 +105,16 @@ if [ $# != 4 ]; then echo " --left-context-initial # If >= 0, left-context for first chunk of an utterance" echo " --right-context-final # If >= 0, right-context for last chunk of an utterance" echo " --num-egs-diagnostic <#frames;4000> # Number of egs used in computing (train,valid) diagnostics" - echo " --num-valid-egs-combine <#frames;10000> # Number of egss used in getting combination weights at the" + echo " --num-valid-egs-combine <#frames;10000> # Number of egs used in getting combination weights at the" echo " # very end." + echo " --lattice-lm-scale # If supplied, the graph/lm weight of the lattices will be " + echo " # used (with this scale) in generating supervisions" + echo " --lattice-prune-beam # If supplied, the lattices will be pruned to this beam, " + echo " # before being used to get supervisions." + echo " --acwt # Acoustic scale -- affects pruning" + echo " --deriv-weights-scp # If supplied, adds per-frame weights to the supervision." + echo " --generate-egs-scp # Generates scp files -- Required if the egs will be " + echo " # used for multilingual/multitask training." echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." @@ -116,13 +135,13 @@ for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \ [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done +nj=$(cat $latdir/num_jobs) || exit 1 + sdata=$data/split$nj utils/split_data.sh $data $nj mkdir -p $dir/log $dir/info -num_lat_jobs=$(cat $latdir/num_jobs) || exit 1; - # Get list of validation utterances. frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 @@ -186,6 +205,8 @@ if [ -f $dir/trans.scp ]; then train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" fi +tree-info $chaindir/tree | grep num-pdfs | awk '{print $2}' > $dir/info/num_pdfs || exit 1 + if [ ! -z "$online_ivector_dir" ]; then ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; echo $ivector_dim > $dir/info/ivector_dim @@ -259,20 +280,11 @@ if [ -e $dir/storage ]; then done fi -if [ $stage -le 2 ]; then - echo "$0: copying training lattices" - - $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ - lattice-copy "ark:gunzip -c $latdir/lat.JOB.gz|" ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; - - for id in $(seq $num_lat_jobs); do cat $dir/lat.$id.scp; done > $dir/lat.scp -fi - - egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" [ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" [ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" +[ ! 
-z "$deriv_weights_scp" ] && egs_opts="$egs_opts --deriv-weights-rspecifier=scp:$deriv_weights_scp" chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$alignment_subsampling_factor" [ ! -z $right_tolerance ] && \ @@ -281,60 +293,99 @@ chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$ali [ ! -z $left_tolerance ] && \ chain_supervision_all_opts="$chain_supervision_all_opts --left-tolerance=$left_tolerance" +lats_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" +if [ ! -z $lattice_prune_beam ]; then + if [ "$lattice_prune_beam" == "0" ] || [ "$lattice_prune_beam" == "0.0" ]; then + lats_rspecifier="$lats_rspecifier lattice-1best --acoustic-scale=$acwt ark:- ark:- |" + else + lats_rspecifier="$lats_rspecifier lattice-prune --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:- ark:- |" + fi +fi + +normalization_fst_scale=1.0 + +if [ ! -z "$lattice_lm_scale" ]; then + chain_supervision_all_opts="$chain_supervision_all_opts --lm-scale=$lattice_lm_scale" + + normalization_fst_scale=$(perl -e " + if ($lattice_lm_scale >= 1.0 || $lattice_lm_scale < 0) { + print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\"; + exit(1); + } + print (1.0 - $lattice_lm_scale);") || exit 1 +fi + echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context echo $left_context_initial > $dir/info/left_context_initial echo $right_context_final > $dir/info/right_context_final -if [ $stage -le 3 ]; then - echo "$0: Getting validation and training subset examples." +if [ $stage -le 2 ]; then + echo "$0: Getting validation and training subset examples in background." rm $dir/.error 2>/dev/null - echo "$0: ... extracting validation and training-subset alignments." - - # do the filtering just once, as lat.scp may be long. - utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ - <$dir/lat.scp >$dir/lat_special.scp - - $cmd $dir/log/create_valid_subset.log \ - utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ - lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ - chain-get-supervision $chain_supervision_all_opts $chaindir/tree $chaindir/0.trans_mdl \ - ark:- ark:- \| \ - nnet3-chain-get-egs $ivector_opts --srand=$srand \ - $egs_opts $chaindir/normalization.fst \ - "$valid_feats" ark,s,cs:- "ark:$dir/valid_all.cegs" || touch $dir/.error & - $cmd $dir/log/create_train_subset.log \ - utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ - lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ - chain-get-supervision $chain_supervision_all_opts \ - $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ - nnet3-chain-get-egs $ivector_opts --srand=$srand \ - $egs_opts $chaindir/normalization.fst \ - "$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || touch $dir/.error & - wait; - [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 - echo "... Getting subsets of validation examples for diagnostics and combination." 
- $cmd $dir/log/create_valid_subset_combine.log \ - nnet3-chain-subset-egs --n=$num_valid_egs_combine ark:$dir/valid_all.cegs \ - ark:$dir/valid_combine.cegs || touch $dir/.error & - $cmd $dir/log/create_valid_subset_diagnostic.log \ - nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/valid_all.cegs \ - ark:$dir/valid_diagnostic.cegs || touch $dir/.error & - - $cmd $dir/log/create_train_subset_combine.log \ - nnet3-chain-subset-egs --n=$num_train_egs_combine ark:$dir/train_subset_all.cegs \ - ark:$dir/train_combine.cegs || touch $dir/.error & - $cmd $dir/log/create_train_subset_diagnostic.log \ - nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \ - ark:$dir/train_diagnostic.cegs || touch $dir/.error & - wait - sleep 5 # wait for file system to sync. - cat $dir/valid_combine.cegs $dir/train_combine.cegs > $dir/combine.cegs - - for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do - [ ! -s $f ] && echo "No examples in file $f" && exit 1; - done - rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs + + ( + $cmd --max-jobs-run 6 JOB=1:$nj $dir/log/lattice_copy.JOB.log \ + lattice-copy --include="cat $dir/valid_uttlist $dir/train_subset_uttlist |" --ignore-missing \ + "$lats_rspecifier" \ + ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1 + + for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp + + $cmd $dir/log/create_valid_subset.log \ + utils/filter_scp.pl $dir/valid_uttlist $dir/lat_special.scp \| \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + chain-get-supervision $chain_supervision_all_opts $chaindir/tree $chaindir/0.trans_mdl \ + ark:- ark:- \| \ + nnet3-chain-get-egs $ivector_opts --srand=$srand \ + $egs_opts --normalization-fst-scale=$normalization_fst_scale $chaindir/normalization.fst \ + "$valid_feats" ark,s,cs:- "ark:$dir/valid_all.cegs" || exit 1 + $cmd $dir/log/create_train_subset.log \ + utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + chain-get-supervision $chain_supervision_all_opts \ + $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ + nnet3-chain-get-egs $ivector_opts --srand=$srand \ + $egs_opts --normalization-fst-scale=$normalization_fst_scale $chaindir/normalization.fst \ + "$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || exit 1 + wait + sleep 5 # wait for file system to sync. + echo "... Getting subsets of validation examples for diagnostics and combination." 
+ if $generate_egs_scp; then + valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.cegs,$dir/valid_diagnostic.scp" + train_diagnostic_output="ark,scp:$dir/train_diagnostic.cegs,$dir/train_diagnostic.scp" + else + valid_diagnostic_output="ark:$dir/valid_diagnostic.cegs" + train_diagnostic_output="ark:$dir/train_diagnostic.cegs" + fi + $cmd $dir/log/create_valid_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_valid_egs_combine ark:$dir/valid_all.cegs \ + ark:$dir/valid_combine.cegs || exit 1 + $cmd $dir/log/create_valid_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/valid_all.cegs \ + $valid_diagnostic_output || exit 1 + + $cmd $dir/log/create_train_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_train_egs_combine ark:$dir/train_subset_all.cegs \ + ark:$dir/train_combine.cegs || exit 1 + $cmd $dir/log/create_train_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \ + $train_diagnostic_output || exit 1 + wait + sleep 5 # wait for file system to sync. + if $generate_egs_scp; then + cat $dir/valid_combine.cegs $dir/train_combine.cegs | \ + nnet3-chain-copy-egs ark:- ark,scp:$dir/combine.cegs,$dir/combine.scp + rm $dir/{train,valid}_combine.scp + else + cat $dir/valid_combine.cegs $dir/train_combine.cegs > $dir/combine.cegs + fi + + for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do + [ ! -s $f ] && echo "No examples in file $f" && exit 1; + done + rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs + ) || touch $dir/.error & fi if [ $stage -le 4 ]; then @@ -355,9 +406,10 @@ if [ $stage -le 4 ]; then # there can be too many small files to deal with, because the total number of # files is the product of 'nj' by 'num_archives_intermediate', which might be # quite large. - $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ - utils/filter_scp.pl $sdata/JOB/utt2spk $dir/lat.scp \| \ - lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + + $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/get_egs.JOB.log \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl \ + "$lats_rspecifier" ark:- \| \ chain-get-supervision $chain_supervision_all_opts \ $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=\$[JOB+$srand] $egs_opts \ @@ -366,6 +418,10 @@ if [ $stage -le 4 ]; then nnet3-chain-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1; fi +if [ -f $dir/.error ]; then + echo "Error detected while creating train/valid egs" && exit 1 +fi + if [ $stage -le 5 ]; then echo "$0: recombining and shuffling order of archives on disk" # combine all the "egs_orig.*.JOB.scp" (over the $nj splits of the data) and @@ -378,16 +434,35 @@ if [ $stage -le 5 ]; then done if [ $archives_multiple == 1 ]; then # normal case. 
- $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ - nnet3-chain-normalize-egs $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ - nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:$dir/cegs.JOB.ark || exit 1; + if $generate_egs_scp; then + output_archive="ark,scp:$dir/cegs.JOB.ark,$dir/cegs.JOB.scp" + else + output_archive="ark:$dir/cegs.JOB.ark" + fi + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G \ + JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-normalize-egs --normalization-fst-scale=$normalization_fst_scale \ + $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- $output_archive || exit 1; + + if $generate_egs_scp; then + #concatenate cegs.JOB.scp in single cegs.scp + for j in $(seq $num_archives_intermediate); do + cat $dir/cegs.$j.scp || exit 1; + done > $dir/cegs.scp || exit 1; + for f in $dir/cegs.*.scp; do rm $f; done + fi else # we need to shuffle the 'intermediate archives' and then split into the # final archives. we create soft links to manage this splitting, because # otherwise managing the output names is quite difficult (and we don't want # to submit separate queue jobs for each intermediate archive, because then # the --max-jobs-run option is hard to enforce). - output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/cegs.JOB.$y.ark; done)" + if $generate_egs_scp; then + output_archives="$(for y in $(seq $archives_multiple); do echo ark,scp:$dir/cegs.JOB.$y.ark,$dir/cegs.JOB.$y.scp; done)" + else + output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/cegs.JOB.$y.ark; done)" + fi for x in $(seq $num_archives_intermediate); do for y in $(seq $archives_multiple); do archive_index=$[($x-1)*$archives_multiple+$y] @@ -395,13 +470,31 @@ if [ $stage -le 5 ]; then ln -sf cegs.$archive_index.ark $dir/cegs.$x.$y.ark || exit 1 done done - $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ - nnet3-chain-normalize-egs $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G \ + JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-normalize-egs --normalization-fst-scale=$normalization_fst_scale \ + $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ nnet3-chain-shuffle-egs --srand=\$[JOB+$srand] ark:- ark:- \| \ nnet3-chain-copy-egs ark:- $output_archives || exit 1; + + if $generate_egs_scp; then + #concatenate cegs.JOB.scp in single cegs.scp + rm -rf $dir/cegs.scp + for j in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do + cat $dir/cegs.$j.$y.scp || exit 1; + done + done > $dir/cegs.scp || exit 1; + for f in $dir/cegs.*.*.scp; do rm $f; done + fi fi fi +wait +if [ -f $dir/.error ]; then + echo "Error detected while creating train/valid egs" && exit 1 +fi + if [ $stage -le 6 ]; then echo "$0: removing temporary archives" ( @@ -415,8 +508,6 @@ if [ $stage -le 6 ]; then # there are some extra soft links that we should delete. for f in $dir/cegs.*.*.ark; do rm $f; done fi - echo "$0: removing temporary lattices" - rm $dir/lat.* echo "$0: removing temporary alignments and transforms" # Ignore errors below because trans.* might not exist. 
rm $dir/{ali,trans}.{ark,scp} 2>/dev/null diff --git a/egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh b/egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh new file mode 100755 index 00000000000..76793e8fa25 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh @@ -0,0 +1,168 @@ +#!/bin/bash + +# Copyright 2017 Pegah Ghahremani +# 2017-18 Vimal Manohar +# Apache 2.0 + +# This script generates examples for multilingual training of 'chain' +# models using separate input egs dir per language as input. +# This script is similar to steps/nnet3/multilingual/combine_egs.sh, but +# works on 'chain' egs. This is also useful for semi-supervised training, +# where supervised and unsupervised datasets are treated as different +# languages. + +# This scripts produces 3 sets of files -- +# cegs.*.scp, cegs.output.*.ark, cegs.weight.*.ark +# +# cegs.*.scp are the SCP files of the training examples. +# cegs.weight.*.ark map from the key of the example to the language-specific +# weight of that example. +# cegs.output.*.ark map from the key of the example to the name of +# the output-node in the neural net for that specific language, e.g. +# 'output-2'. +# +# Begin configuration section. +cmd=run.pl +block_size=256 # This is the number of consecutive egs that we take from + # each source, and it only affects the locality of disk + # access. +lang2weight= # array of weights one per input languge to scale example's output + # w.r.t its input language during training. +stage=0 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# -lt 3 ]; then + cat < ... + e.g.: $0 [opts] 2 exp/lang1/egs exp/lang2/egs exp/multi/egs + + Options: + --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. + --block-size # it is the number of consecutive egs that we take from + # each source, and it only affects the locality of disk + # access. This does not have to be the actual minibatch size +EOF + exit 1; +fi + +num_langs=$1 + +shift 1 +args=("$@") +megs_dir=${args[-1]} # multilingual directory +mkdir -p $megs_dir +mkdir -p $megs_dir/info +if [ ${#args[@]} != $[$num_langs+1] ]; then + echo "$0: num of input example dirs provided is not compatible with num_langs $num_langs." + echo "Usage:$0 [opts] ... " + echo "Usage:$0 [opts] 2 exp/lang1/egs exp/lang2/egs exp/multi/egs" + exit 1; +fi + +required="cegs.scp combine.scp train_diagnostic.scp valid_diagnostic.scp" +train_scp_list= +train_diagnostic_scp_list= +valid_diagnostic_scp_list= +combine_scp_list= + +# read paramter from $egs_dir[0]/info and cmvn_opts +# to write in multilingual egs_dir. +check_params="info/feat_dim info/ivector_dim info/left_context info/right_context cmvn_opts" +ivec_dim=`cat ${args[0]}/info/ivector_dim` +if [ $ivec_dim -ne 0 ];then check_params="$check_params info/final.ie.id"; fi + +for param in $check_params info/frames_per_eg; do + cat ${args[0]}/$param > $megs_dir/$param || exit 1; +done + +tot_num_archives=0 +for lang in $(seq 0 $[$num_langs-1]);do + multi_egs_dir[$lang]=${args[$lang]} + for f in $required; do + if [ ! -f ${multi_egs_dir[$lang]}/$f ]; then + echo "$0: no such file ${multi_egs_dir[$lang]}/$f." 
&& exit 1; + fi + done + num_archives=$(cat ${multi_egs_dir[$lang]}/info/num_archives) + tot_num_archives=$[tot_num_archives+num_archives] + train_scp_list="$train_scp_list ${args[$lang]}/cegs.scp" + train_diagnostic_scp_list="$train_diagnostic_scp_list ${args[$lang]}/train_diagnostic.scp" + valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${args[$lang]}/valid_diagnostic.scp" + combine_scp_list="$combine_scp_list ${args[$lang]}/combine.scp" + + # check parameter dimension to be the same in all egs dirs + for f in $check_params; do + if [ -f $megs_dir/$f ] && [ -f ${multi_egs_dir[$lang]}/$f ]; then + f1=$(cat $megs_dir/$f) + f2=$(cat ${multi_egs_dir[$lang]}/$f) + if [ "$f1" != "$f2" ] ; then + echo "$0: mismatch for $f in $megs_dir vs. ${multi_egs_dir[$lang]}($f1 vs. $f2)." + exit 1; + fi + else + echo "$0: file $f does not exits in $megs_dir or ${multi_egs_dir[$lang]}/$f ." + fi + done +done + +if [ ! -z "$lang2weight" ]; then + egs_opt="--lang2weight '$lang2weight'" +fi + +if [ $stage -le 0 ]; then + echo "$0: allocating multilingual examples for training." + # Generate cegs.*.scp for multilingual setup. + $cmd $megs_dir/log/allocate_multilingual_examples_train.log \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --num-archives $tot_num_archives \ + --block-size $block_size \ + --egs-prefix "cegs." \ + $train_scp_list $megs_dir || exit 1; +fi + +if [ $stage -le 1 ]; then + echo "$0: combine combine.scp examples from all langs in $megs_dir/combine.scp." + # Generate combine.scp for multilingual setup. + $cmd $megs_dir/log/allocate_multilingual_examples_combine.log \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --num-archives 1 \ + --block-size $block_size \ + --egs-prefix "combine." \ + $combine_scp_list $megs_dir || exit 1; + + echo "$0: combine train_diagnostic.scp examples from all langs in $megs_dir/train_diagnostic.scp." + # Generate train_diagnostic.scp for multilingual setup. + $cmd $megs_dir/log/allocate_multilingual_examples_train_diagnostic.log \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --num-archives 1 \ + --block-size $block_size \ + --egs-prefix "train_diagnostic." \ + $train_diagnostic_scp_list $megs_dir || exit 1; + + + echo "$0: combine valid_diagnostic.scp examples from all langs in $megs_dir/valid_diagnostic.scp." + # Generate valid_diagnostic.scp for multilingual setup. + $cmd $megs_dir/log/allocate_multilingual_examples_valid_diagnostic.log \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --num-archives 1 \ + --block-size $block_size \ + --egs-prefix "valid_diagnostic." \ + $valid_diagnostic_scp_list $megs_dir || exit 1; + +fi +for egs_type in combine train_diagnostic valid_diagnostic; do + mv $megs_dir/${egs_type}.output.1.ark $megs_dir/${egs_type}.output.ark || exit 1; + mv $megs_dir/${egs_type}.weight.1.ark $megs_dir/${egs_type}.weight.ark || exit 1; + mv $megs_dir/${egs_type}.1.scp $megs_dir/${egs_type}.scp || exit 1; +done +mv $megs_dir/info/cegs.num_archives $megs_dir/info/num_archives || exit 1; +mv $megs_dir/info/cegs.num_tasks $megs_dir/info/num_tasks || exit 1; +echo "$0: Finished preparing multilingual training example." 
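To make the intended workflow of the new chain combine_egs.sh concrete, a minimal invocation for a two-"language" semi-supervised setup might look like the following sketch. The directory names and the 1.0/0.3 weights are illustrative only; each input egs dir must already contain cegs.scp, combine.scp, train_diagnostic.scp and valid_diagnostic.scp plus the usual info/ files:

    steps/nnet3/chain/multilingual/combine_egs.sh --cmd run.pl \
      --block-size 256 --lang2weight "1.0,0.3" \
      2 exp/chain/tdnn_sup/egs exp/chain/tdnn_unsup/egs exp/chain/tdnn_semisup/egs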
diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 6a68d9ecb6e..acabf733c94 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -274,7 +274,8 @@ def train(args, run_opts): # Check files chain_lib.check_for_required_files(args.feat_dir, args.tree_dir, - args.lat_dir) + args.lat_dir if args.egs_dir is None + else None) # Copy phones.txt from tree-dir to dir. Later, steps/nnet3/decode.sh will # use it to check compatibility between training and decoding phone-sets. @@ -410,6 +411,15 @@ def train(args, run_opts): logger.info("Copying the properties from {0} to {1}".format(egs_dir, args.dir)) common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) + if not os.path.exists('{0}/valid_diagnostic.cegs'.format(egs_dir)): + if (not os.path.exists('{0}/valid_diagnostic.scp'.format(egs_dir))): + raise Exception('Neither {0}/valid_diagnostic.cegs nor ' + '{0}/valid_diagnostic.scp exist.' + 'This script expects one of them.'.format(egs_dir)) + use_multitask_egs = True + else: + use_multitask_egs = False + if ((args.stage <= -2) and (os.path.exists(args.dir+"/configs/init.config")) and (args.input_model is None)): logger.info('Computing the preconditioning matrix for input features') @@ -417,7 +427,8 @@ def train(args, run_opts): chain_lib.compute_preconditioning_matrix( args.dir, egs_dir, num_archives, run_opts, max_lda_jobs=args.max_lda_jobs, - rand_prune=args.rand_prune) + rand_prune=args.rand_prune, + use_multitask_egs=use_multitask_egs) if (args.stage <= -1): logger.info("Preparing the initial acoustic model.") @@ -526,7 +537,8 @@ def train(args, run_opts): frame_subsampling_factor=args.frame_subsampling_factor, run_opts=run_opts, backstitch_training_scale=args.backstitch_training_scale, - backstitch_training_interval=args.backstitch_training_interval) + backstitch_training_interval=args.backstitch_training_interval, + use_multitask_egs=use_multitask_egs) if args.cleanup: # do a clean up everything but the last 2 models, under certain @@ -561,13 +573,20 @@ def train(args, run_opts): l2_regularize=args.l2_regularize, xent_regularize=args.xent_regularize, run_opts=run_opts, - max_objective_evaluations=args.max_objective_evaluations) + max_objective_evaluations=args.max_objective_evaluations, + use_multitask_egs=use_multitask_egs) else: logger.info("Copying the last-numbered model to final.mdl") common_lib.force_symlink("{0}.mdl".format(num_iters), "{0}/final.mdl".format(args.dir)) + chain_lib.compute_train_cv_probabilities( + dir=args.dir, iter=num_iters, egs_dir=egs_dir, + l2_regularize=l2_regularize, xent_regularize=xent_regularize, + leaky_hmm_coefficient=args.leaky_hmm_coefficient, + run_opts=run_opts, + use_multitask_egs=use_multitask_egs) common_lib.force_symlink("compute_prob_valid.{iter}.log" - "".format(iter=num_iters-1), + "".format(iter=num_iters), "{dir}/log/compute_prob_valid.final.log".format( dir=args.dir)) diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 8c520e0b5e1..37a67b41f94 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -37,7 +37,7 @@ minimize=false echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. -. parse_options.sh || exit 1; +. 
utils/parse_options.sh || exit 1; if [ $# -ne 3 ]; then echo "Usage: $0 [options] " diff --git a/egs/wsj/s5/steps/nnet3/decode_semisup.sh b/egs/wsj/s5/steps/nnet3/decode_semisup.sh new file mode 100755 index 00000000000..b742835f588 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/decode_semisup.sh @@ -0,0 +1,190 @@ +#!/bin/bash + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does decoding with a neural-net. If the neural net was built on +# top of fMLLR transforms from a conventional system, you should provide the +# --transform-dir option. + +# Begin configuration section. +stage=1 +transform_dir= # dir to find fMLLR transforms. +nj=4 # number of decoding jobs. If --transform-dir set, must match that number! +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the + # regular scoring script works. +cmd=run.pl +beam=15.0 +frames_per_chunk=50 +max_active=7000 +min_active=200 +ivector_scale=1.0 +lattice_beam=8.0 # Beam we use in lattice generation. +iter=final +num_threads=1 # if >1, will use gmm-latgen-faster-parallel +scoring_opts= +skip_diagnostics=false +skip_scoring=false +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +online_ivector_dir= +minimize=false +word_determinize=false # If set to true, then output lattice does not retain + # alternate paths a sequence of words (with alternate pronunciations). + # Setting to true is the default in steps/nnet3/decode.sh. + # However, setting this to false + # is useful for generation w of semi-supervised training + # supervision and frame-level confidences. +write_compact=true # If set to false, then writes the lattice in non-compact format, + # retaining the acoustic scores on each arc. This is + # required to be false for LM rescoring undeterminized + # lattices (when --word-determinize is false) + # Useful for semi-supervised training with rescored lattices. +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " + echo "e.g.: steps/nnet3/decode.sh --nj 8 \\" + echo "--online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\" + echo " exp/tri4b/graph_bg data/test_eval92_hires $dir/decode_bg_eval92" + echo "main options (for others, see top of script file)" + echo " --transform-dir # directory of previous decoding" + echo " # where we can find transforms for SAT systems." + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --iter # Iteration of model to decode; default is final." + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + exit 1; +fi + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. +model=$srcdir/$iter.mdl + + +extra_files= +if [ ! 
-z "$online_ivector_dir" ]; then + steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1 + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +fi + +utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1 + +for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj; +cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1; +thread_string= +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + + +## Set up features. +echo "$0: feature type is raw" + +feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -s $transform_dir/num_jobs ] && \ + echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; + nj_orig=$(cat $transform_dir/num_jobs) + + if [ ! -f $transform_dir/raw_trans.1 ]; then + echo "$0: expected $transform_dir/raw_trans.1 to exist (--transform-dir option)" + exit 1; + fi + if [ $nj -ne $nj_orig ]; then + # Copy the transforms into an archive with an index. + for n in $(seq $nj_orig); do cat $transform_dir/raw_trans.$n; done | \ + copy-feats ark:- ark,scp:$dir/raw_trans.ark,$dir/raw_trans.scp || exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/raw_trans.scp ark:- ark:- |" + else + # number of jobs matches with alignment dir. + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |" + fi +elif grep 'transform-feats --utt2spk' $srcdir/log/train.1.log >&/dev/null; then + echo "$0: **WARNING**: you seem to be using a neural net system trained with transforms," + echo " but you are not providing the --transform-dir option in test time." +fi +## + +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +extra_opts= +lat_wspecifier="ark:|" +if ! $write_compact; then + extra_opts="--determinize-lattice=false" + lat_wspecifier="ark:| lattice-determinize-phone-pruned --beam=$lattice_beam --acoustic-scale=$acwt --minimize=$minimize --word-determinize=$word_determinize --write-compact=false $model ark:- ark:- |" +fi + +if [ "$post_decode_acwt" == 1.0 ]; then + lat_wspecifier="$lat_wspecifier gzip -c >$dir/lat.JOB.gz" +else + lat_wspecifier="$lat_wspecifier lattice-scale --acoustic-scale=$post_decode_acwt --write-compact=$write_compact ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. 
for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" +fi + +if [ $stage -le 1 ]; then + $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=$minimize --word-determinize=$word_determinize \ + --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt ${extra_opts} "$model" \ + $graphdir/HCLG.fst "$feats" "$lat_wspecifier" || exit 1; +fi + + +if [ $stage -le 2 ]; then + if ! $skip_diagnostics ; then + [ ! -z $iter ] && iter_opt="--iter $iter" + steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir + fi +fi + + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. +if [ $stage -le 3 ]; then + if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + [ "$iter" != "final" ] && iter_opt="--iter $iter" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; diff --git a/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py b/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py index e5f5f627567..54c65eb5403 100755 --- a/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py +++ b/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py @@ -1,6 +1,7 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 -# Copyright 2017 Pegah Ghahremani +# Copyright 2017 Pegah Ghahremani +# 2018 Hossein Hadian # # Apache 2.0. @@ -15,25 +16,6 @@ the output-node in the neural net for that specific language, e.g. 'output-2'. - This script additionally produces temporary files -- egs.ranges.*.txt, - which are consumed by this script itself. - There is one egs.ranges.*.txt file for each of the egs.*.scp files. - Each line in egs.ranges.*.txt corresponds to ranges of examples - selected from one of the input languages's scp files as: - - - That can be interpreted as selecting examples starting from - line from {lang}_th 'egs' file in "egs_scp_list". - (note that is the zero-based line number.) - - Example lines might look like: - 0 0 256 - 2 1024 256 - - egs.*.scp is generated using egs.ranges.*.txt as following: - "" consecutive examples starting from line "" - from {lang}_th input scp-file is copied to egs.*.scp. - --egs-prefix option can be used to generate train and diagnostics egs files. If --egs-prefix=train_diagnostics. is passed, then the files produced by the script will be named with the prefix as "train_diagnostics." @@ -45,21 +27,17 @@ for validation examples and "combine." for examples used for model combination. + For chain training egs, the --egs-prefix option should be "cegs." 
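For the chain case just mentioned, the wrapper script steps/nnet3/chain/multilingual/combine_egs.sh ends up calling this allocator roughly as in the sketch below; the archive count, block size, weights and paths are illustrative, not prescribed values:

    steps/nnet3/multilingual/allocate_multilingual_examples.py \
      --num-archives 12 --block-size 256 --egs-prefix "cegs." \
      --lang2weight "1.0,0.3" \
      exp/chain/tdnn_sup/egs/cegs.scp exp/chain/tdnn_unsup/egs/cegs.scp \
      exp/chain/tdnn_semisup/egs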
+ You can call this script as (e.g.): allocate_multilingual_examples.py [opts] example-scp-lists multilingual-egs-dir - allocate_multilingual_examples.py --minibatch-size 128 + allocate_multilingual_examples.py --block-size 512 --lang2weight "0.2,0.8" exp/lang1/egs.scp exp/lang2/egs.scp exp/multi/egs - To avoid loading whole scp files from all languages in memory, - input egs.scp files are processed line by line using readline() for input - languages. To have more randomization across different archives, - "num-jobs * num-archives" temporary scp.. files are created - in egs/temp dir and all "num_jobs" scp.*. combined into - egs..scp. """ from __future__ import print_function @@ -68,7 +46,6 @@ import traceback sys.path.insert(0, 'steps') -import libs.common as common_lib logger = logging.getLogger('libs') logger.setLevel(logging.INFO) @@ -95,43 +72,31 @@ def get_args(): 'output-2'.""", epilog="Called by steps/nnet3/multilingual/combine_egs.sh") - parser.add_argument("--samples-per-iter", type=int, default=40000, - help="The target number of egs in each archive of egs, " - "(prior to merging egs). ") - parser.add_argument("--num-jobs", type=int, default=20, - help="This can be used for better randomization in distributing " - "examples for different languages across egs.*.scp files, " - "where egs..*.scp are generated " - "randomly and combined across all jobs in egs.*.scp files.") - parser.add_argument("--random-lang", type=str, action=common_lib.StrToBoolAction, - help="If true, egs.ranges.*.txt are generated " - "randomly w.r.t distribution of remaining examples in " - "each language, otherwise it is generated sequentially.", - default=True, choices = ["false", "true"]) - parser.add_argument("--max-archives", type=int, default=1000, - help="max number of archives used to generate egs.*.scp") - parser.add_argument("--seed", type=int, default=1, - help="Seed for random number generator") - parser.add_argument("--minibatch-size", type=int, default=512, - help="It is the number of consecutive egs that is taken " - "from each input scp source, and it only affects locality " - "of disk access. This does not have to be actual minibatch size.") + parser.add_argument("--num-archives", type=int, default=None, + help="Number of archives to split the data into. (Note: in reality they are not " + "archives, only scp files, but we use this notation by analogy with the " + "conventional egs-creating script).") + parser.add_argument("--block-size", type=int, default=512, + help="This relates to locality of disk access. 'block-size' is" + "the average number of examples that are read consecutively" + "from each input scp file (and are written in the same order to the output scp files)" + "Smaller values lead to more random disk access (during " + "the nnet3 training process).") parser.add_argument("--egs-prefix", type=str, default="egs.", - help="option can be used to generated example scp, weight " - "and output files for training and diagnostics." - "If --egs-prefix=combine. , then files produced " - "by the sript will be named with this prefix as " - "combine.output.*.ark, combine.weight.*.ark, combine.*.scp, " - "combine.ranges.*.ark.") + help="This option can be used to add a prefix to the filenames " + "of the output files. For e.g. " + "if --egs-prefix=combine. , then the files produced " + "by this script will be " + "combine.output.*.ark, combine.weight.*.ark, and combine.*.scp") parser.add_argument("--lang2weight", type=str, - help="comma-separated list of weights, one per language." 
+ help="Comma-separated list of weights, one per language. " "The language order is as egs_scp_lists.") # now the positional arguments parser.add_argument("egs_scp_lists", nargs='+', - help="list of egs.scp files per input language." + help="List of egs.scp files per input language." "e.g. exp/lang1/egs/egs.scp exp/lang2/egs/egs.scp") parser.add_argument("egs_dir", - help="Name of egs directory e.g. exp/tdnn_multilingual_sp/egs") + help="Name of output egs directory e.g. exp/tdnn_multilingual_sp/egs") print(sys.argv, file=sys.stderr) @@ -140,169 +105,119 @@ def get_args(): return args -def select_random_lang(lang_len, tot_egs, random_selection): - """ Returns a random language index w.r.t - amount of examples in each language. - It works based on sampling from a - discrete distribution, where it returns i - with prob(i) = (num_egs in lang(i)/ tot_egs). - tot_egs is sum of lang_len. - """ - assert(tot_egs > 0) - rand_int = random.randint(0, tot_egs - 1) - count = 0 - for l in range(len(lang_len)): - if random_selection: - if rand_int <= (count + lang_len[l]): - return l - else: - count += lang_len[l] - else: - if (lang_len[l] > 0): - return l - return -1 +def read_lines(file_handle, num_lines): + n_read = 0 + lines = [] + while n_read < num_lines: + line = file_handle.readline() + if not line: + break + lines.append(line.strip()) + n_read += 1 + return lines def process_multilingual_egs(args): args = get_args() - random.seed(args.seed) - rand_select = args.random_lang - # read egs.scp for input languages scp_lists = args.egs_scp_lists num_langs = len(scp_lists) - scp_files = [open(scp_lists[lang], 'r') for lang in range(num_langs)] - - lang2len = [0] * num_langs + lang_to_num_examples = [0] * num_langs for lang in range(num_langs): - lang2len[lang] = sum(1 for line in open(scp_lists[lang])) + with open(scp_lists[lang]) as fh: + lang_to_num_examples[lang] = sum([1 for line in fh]) logger.info("Number of examples for language {0} " - "is {1}.".format(lang, lang2len[lang])) + "is {1}.".format(lang, lang_to_num_examples[lang])) # If weights are not provided, the weights are 1.0. if args.lang2weight is None: - lang2weight = [ 1.0 ] * num_langs + lang2weight = [1.0] * num_langs else: lang2weight = args.lang2weight.split(",") assert(len(lang2weight) == num_langs) - if not os.path.exists("{0}/temp".format(args.egs_dir)): - os.makedirs("{0}/temp".format(args.egs_dir)) - num_lang_file = open("{0}/info/{1}num_tasks".format(args.egs_dir, args.egs_prefix), "w") - print("{0}".format(num_langs), file=num_lang_file) - - # Each element of all_egs (one per num_archive * num_jobs) is - # an array of 3-tuples (lang-id, local-start-egs-line, num-egs) - all_egs = [] - lang_len = lang2len[:] - # total num of egs in all languages - tot_num_egs = sum(lang2len[i] for i in range(len(lang2len))) - num_archives = max(1, min(args.max_archives, tot_num_egs // args.samples_per_iter)) - - num_arch_file = open("{0}/info/{1}num_archives".format( - args.egs_dir, - args.egs_prefix), - "w") - print("{0}".format(num_archives), file=num_arch_file) - num_arch_file.close() - this_num_egs_per_archive = tot_num_egs // (num_archives * args.num_jobs) - - logger.info("Generating {0}scp.. 
temporary files used to " - "generate {0}.scp.".format(args.egs_prefix)) - for job in range(args.num_jobs): - for archive_index in range(num_archives): - archfile = open("{0}/temp/{1}scp.{2}.{3}" - "".format(args.egs_dir, args.egs_prefix, - job + 1, archive_index + 1), - "w") - this_egs = [] # this will be array of 2-tuples (lang-id start-frame num-frames) - - num_egs = 0 - while num_egs <= this_num_egs_per_archive: - num_left_egs = sum(num_left_egs_per_lang for - num_left_egs_per_lang in lang_len) - if num_left_egs > 0: - lang_id = select_random_lang(lang_len, num_left_egs, rand_select) - start_egs = lang2len[lang_id] - lang_len[lang_id] - this_egs.append((lang_id, start_egs, args.minibatch_size)) - for scpline in range(args.minibatch_size): - scp_key = scp_files[lang_id].readline().splitlines()[0] - print("{0} {1}".format(scp_key, lang_id), - file=archfile) - - lang_len[lang_id] = lang_len[lang_id] - args.minibatch_size - num_egs = num_egs + args.minibatch_size - # If num of remaining egs in each lang is less than minibatch_size, - # they are discarded. - if lang_len[lang_id] < args.minibatch_size: - lang_len[lang_id] = 0 - logger.info("Done processing data for language {0}".format( - lang_id)) - else: - logger.info("Done processing data for all languages.") - break - all_egs.append(this_egs) - archfile.close() - - logger.info("combining egs..*.scp across all jobs into egs.*.scp file.") - for archive in range(num_archives): - logger.info("Combine {0}job.{1}.scp across all jobs into " - "{0}{1}.scp.".format(args.egs_prefix, archive)) - this_ranges = [] - f = open("{0}/temp/{1}ranges.{2}.txt".format( - args.egs_dir, args.egs_prefix, archive + 1), - 'w') - o = open("{0}/{1}output.{2}.ark".format( - args.egs_dir, args.egs_prefix, archive + 1), - 'w') - w = open("{0}/{1}weight.{2}.ark".format( - args.egs_dir, args.egs_prefix, archive + 1), - 'w') - scp_per_archive_file = open("{0}/{1}{2}.scp" - "".format(args.egs_dir, - args.egs_prefix, archive + 1), - 'w') - - # check files before writing. 
- if f is None: - raise Exception("Error opening file {0}".format(f)) - if o is None: - raise Exception("Error opening file {0}".format(o)) - if w is None: - raise Exception("Error opening file {0}".format(w)) - if scp_per_archive_file is None: - raise Exception("Error opening file {0}".format(scp_per_archive_file)) - - for job in range(args.num_jobs): - scp = ("{0}/temp/{1}scp.{2}.{3}".format(args.egs_dir, args.egs_prefix, - job + 1, archive + 1)) - with open(scp, "r") as scpfile: - for line in scpfile: - scp_line = line.splitlines()[0].split() - print("{0} {1}".format(scp_line[0], scp_line[1]), - file=scp_per_archive_file) - print("{0} output-{1}".format(scp_line[0], scp_line[2]), - file=o) - print("{0} {1}".format( - scp_line[0], - lang2weight[int(scp_line[2])]), - file=w) - os.remove(scp) - - for (lang_id, start_eg_line, num_egs) in all_egs[num_archives * job + archive]: - this_ranges.append((lang_id, start_eg_line, num_egs)) - - # write egs.ranges.*.txt - for (lang_id, start_eg_line, num_egs) in this_ranges: - print("{0} {1} {2}".format(lang_id, start_eg_line, num_egs), file=f) - - f.close() - o.close() - w.close() - scp_per_archive_file.close() - logger.info("finished generating {0}*.scp, {0}output.*.ark " - "and {0}weight.*.ark files.".format(args.egs_prefix)) + if not os.path.exists(os.path.join(args.egs_dir, 'info')): + os.makedirs(os.path.join(args.egs_dir, 'info')) + + with open("{0}/info/{1}num_tasks".format(args.egs_dir, args.egs_prefix), "w") as fh: + print("{0}".format(num_langs), file=fh) + + # Total number of egs in all languages + tot_num_egs = sum(lang_to_num_examples[i] for i in range(num_langs)) + num_archives = args.num_archives + + with open("{0}/info/{1}num_archives".format(args.egs_dir, args.egs_prefix), "w") as fh: + print("{0}".format(num_archives), file=fh) + + logger.info("There are a total of {} examples in the input scp " + "files.".format(tot_num_egs)) + logger.info("Number of blocks in each output archive will be approximately " + "{}, and block-size is {}.".format(int(round(tot_num_egs / num_archives / args.block_size)), + args.block_size)) + for lang in range(num_langs): + blocks_per_archive_this_lang = lang_to_num_examples[lang] / num_archives / args.block_size + warning = "" + if blocks_per_archive_this_lang < 1.0: + warning = ("Warning: This means some of the output archives might " + "not include any examples from this lang.") + logger.info("The proportion of egs from lang {} is {:.2f}. The number of blocks " + "per archive for this lang is approximately {:.2f}. " + "{}".format(lang, lang_to_num_examples[lang] / tot_num_egs, + blocks_per_archive_this_lang, + warning)) + + in_scp_file_handles = [open(scp_lists[lang], 'r') for lang in range(num_langs)] + + num_remaining_egs = tot_num_egs + lang_to_num_remaining_egs = [n for n in lang_to_num_examples] + for archive_index in range(num_archives + 1): # +1 is because we write to the last archive in two rounds + num_remaining_archives = num_archives - archive_index + num_remaining_blocks = num_remaining_egs / args.block_size + + last_round = (archive_index == num_archives) + if not last_round: + num_blocks_this_archive = int(round(num_remaining_blocks / num_remaining_archives)) + logger.info("Generating archive {} containing {} blocks...".format(archive_index, num_blocks_this_archive)) + else: # This is the second round for the last archive. Flush all the remaining egs... 
+ archive_index = num_archives - 1 + num_blocks_this_archive = num_langs + logger.info("Writing all the {} remaining egs to the last archive...".format(num_remaining_egs)) + + out_scp_file_handle = open('{0}/{1}{2}.scp'.format(args.egs_dir, args.egs_prefix, archive_index + 1), + 'a' if last_round else 'w') + eg_to_output_file_handle = open("{0}/{1}output.{2}.ark".format(args.egs_dir, args.egs_prefix, archive_index + 1), + 'a' if last_round else 'w') + eg_to_weight_file_handle = open("{0}/{1}weight.{2}.ark".format(args.egs_dir, args.egs_prefix, archive_index + 1), + 'a' if last_round else 'w') + + + for block_index in range(num_blocks_this_archive): + # Find the lang with the highest proportion of remaining examples + remaining_proportions = [remain / tot for remain, tot in zip(lang_to_num_remaining_egs, lang_to_num_examples)] + lang_index, max_proportion = max(enumerate(remaining_proportions), key=lambda a: a[1]) + + # Read 'block_size' examples from the selected lang and write them to the current output scp file: + example_lines = read_lines(in_scp_file_handles[lang_index], args.block_size) + for eg_line in example_lines: + eg_id = eg_line.split()[0] + print(eg_line, file=out_scp_file_handle) + print("{0} output-{1}".format(eg_id, lang_index), file=eg_to_output_file_handle) + print("{0} {1}".format(eg_id, lang2weight[lang_index]), file=eg_to_weight_file_handle) + + num_remaining_egs -= len(example_lines) + lang_to_num_remaining_egs[lang_index] -= len(example_lines) + + out_scp_file_handle.close() + eg_to_output_file_handle.close() + eg_to_weight_file_handle.close() + + for handle in in_scp_file_handles: + handle.close() + logger.info("Finished generating {0}*.scp, {0}output.*.ark " + "and {0}weight.*.ark files. Wrote a total of {1} examples " + "to {2} archives.".format(args.egs_prefix, + tot_num_egs - num_remaining_egs, num_archives)) def main(): @@ -315,4 +230,4 @@ def main(): if __name__ == "__main__": - main() + main() diff --git a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh index 3826dad11a9..e1aeb0b70d6 100755 --- a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh +++ b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh @@ -1,5 +1,10 @@ #!/bin/bash -# + +# Copyright 2017 Pegah Ghahremani +# 2017-18 Vimal Manohar +# 2018 Hossein Hadian +# Apache 2.0 + # This script generates examples for multilingual training of neural network # using separate input egs dir per language as input. # This scripts produces 3 sets of files -- @@ -14,16 +19,9 @@ # # Begin configuration section. cmd=run.pl -minibatch_size=512 # it is the number of consecutive egs that we take from - # each source, and it only affects the locality of disk - # access. This does not have to be the actual minibatch size; -num_jobs=10 # helps for better randomness across languages - # per archive. -samples_per_iter=400000 # this is the target number of egs in each archive of egs - # (prior to merging egs). We probably should have called - # it egs_per_iter. This is just a guideline; it will pick - # a number that divides the number of samples in the - # entire data. +block_size=256 # This is the number of consecutive egs that we take from + # each source, and it only affects the locality of disk + # access. lang2weight= # array of weights one per input languge to scale example's output # w.r.t its input language during training. stage=0 @@ -33,6 +31,24 @@ echo "$0 $@" # Print the command line for logging if [ -f path.sh ]; then . ./path.sh; fi . 
parse_options.sh || exit 1; +if [ $# -lt 3 ]; then + cat < ... + e.g.: $0 [opts] 2 exp/lang1/egs exp/lang2/egs exp/multi/egs + + Options: + --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. + --block-size # it is the number of consecutive egs that we take from + # each source, and it only affects the locality of disk + # access. This does not have to be the actual minibatch size +EOF + exit 1; +fi + num_langs=$1 shift 1 @@ -55,14 +71,15 @@ combine_scp_list= # read paramter from $egs_dir[0]/info and cmvn_opts # to write in multilingual egs_dir. -check_params="info/feat_dim info/ivector_dim info/left_context info/right_context info/frames_per_eg cmvn_opts" +check_params="info/feat_dim info/ivector_dim info/left_context info/right_context cmvn_opts" ivec_dim=`cat ${args[0]}/info/ivector_dim` if [ $ivec_dim -ne 0 ];then check_params="$check_params info/final.ie.id"; fi -for param in $check_params; do - cat ${args[0]}/$param > $megs_dir/$param || exit 1; +for param in $check_params info/frames_per_eg; do + cat ${args[0]}/$param > $megs_dir/$param || exit 1; done +tot_num_archives=0 for lang in $(seq 0 $[$num_langs-1]);do multi_egs_dir[$lang]=${args[$lang]} for f in $required; do @@ -70,6 +87,8 @@ for lang in $(seq 0 $[$num_langs-1]);do echo "$0: no such file ${multi_egs_dir[$lang]}/$f." && exit 1; fi done + num_archives=$(cat ${multi_egs_dir[$lang]}/info/num_archives) + tot_num_archives=$[tot_num_archives+num_archives] train_scp_list="$train_scp_list ${args[$lang]}/egs.scp" train_diagnostic_scp_list="$train_diagnostic_scp_list ${args[$lang]}/train_diagnostic.scp" valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${args[$lang]}/valid_diagnostic.scp" @@ -90,16 +109,17 @@ for lang in $(seq 0 $[$num_langs-1]);do done done +if [ ! -z "$lang2weight" ]; then + egs_opt="--lang2weight '$lang2weight'" +fi + if [ $stage -le 0 ]; then echo "$0: allocating multilingual examples for training." - if [ ! -z "$lang2weight" ]; then - egs_opt="--lang2weight '$lang2weight'" - fi # Generate egs.*.scp for multilingual setup. $cmd $megs_dir/log/allocate_multilingual_examples_train.log \ - steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ - --minibatch-size $minibatch_size \ - --samples-per-iter $samples_per_iter \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --num-archives $tot_num_archives \ + --block-size $block_size \ $train_scp_list $megs_dir || exit 1; fi @@ -107,20 +127,18 @@ if [ $stage -le 1 ]; then echo "$0: combine combine.scp examples from all langs in $megs_dir/combine.scp." # Generate combine.scp for multilingual setup. $cmd $megs_dir/log/allocate_multilingual_examples_combine.log \ - steps/nnet3/multilingual/allocate_multilingual_examples.py \ - --random-lang false \ - --max-archives 1 --num-jobs 1 \ - --minibatch-size $minibatch_size \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --num-archives 1 \ + --block-size $block_size \ --egs-prefix "combine." \ $combine_scp_list $megs_dir || exit 1; echo "$0: combine train_diagnostic.scp examples from all langs in $megs_dir/train_diagnostic.scp." # Generate train_diagnostic.scp for multilingual setup. 
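For reference, the updated non-chain wrapper is invoked the same way as before, with --block-size replacing the old --minibatch-size, --samples-per-iter and --num-jobs options. A minimal sketch with hypothetical egs directories:

    steps/nnet3/multilingual/combine_egs.sh --cmd run.pl \
      --block-size 256 --lang2weight "0.2,0.8" \
      2 exp/lang1/nnet3/egs exp/lang2/nnet3/egs exp/multi/nnet3/egs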
$cmd $megs_dir/log/allocate_multilingual_examples_train_diagnostic.log \ - steps/nnet3/multilingual/allocate_multilingual_examples.py \ - --random-lang false \ - --max-archives 1 --num-jobs 1 \ - --minibatch-size $minibatch_size \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --num-archives 1 \ + --block-size $block_size \ --egs-prefix "train_diagnostic." \ $train_diagnostic_scp_list $megs_dir || exit 1; @@ -128,9 +146,9 @@ if [ $stage -le 1 ]; then echo "$0: combine valid_diagnostic.scp examples from all langs in $megs_dir/valid_diagnostic.scp." # Generate valid_diagnostic.scp for multilingual setup. $cmd $megs_dir/log/allocate_multilingual_examples_valid_diagnostic.log \ - steps/nnet3/multilingual/allocate_multilingual_examples.py \ - --random-lang false --max-archives 1 --num-jobs 1\ - --minibatch-size $minibatch_size \ + steps/nnet3/multilingual/allocate_multilingual_examples.py $egs_opt \ + --num-archives 1 \ + --block-size $block_size \ --egs-prefix "valid_diagnostic." \ $valid_diagnostic_scp_list $megs_dir || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py index 0e336cdbc11..93cbc940c33 100755 --- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -259,7 +259,7 @@ def plot_a_nonlin_component(fig, dirs, stat_tables_per_component_per_dir, continue data = np.array(iter_stats) - data = data[data[:, 0] >= start_iter, :] + data = data[data[:, 0] >= start_iter, :] ax = plt.subplot(211) lp, = ax.plot(data[:, 0], data[:, gate_index*10+5], color=color_val, @@ -345,7 +345,7 @@ def plot_a_nonlin_component(fig, dirs, stat_tables_per_component_per_dir, bbox_to_anchor=(0.5 , -1.5 + len(dirs) * -0.2), ncol=4, handletextpad = -2, title="[1]:{0}".format(common_prefix), borderaxespad=0.) - plt.grid(True) + plt.grid(True) return lgd @@ -826,6 +826,7 @@ def main(): output_nodes.append(tuple(parts)) elif args.is_chain: output_nodes.append(('output', 'chain')) + output_nodes.append(('output-xent', 'chain')) elif args.is_rnnlm: output_nodes.append(('output', 'rnnlm_objective')) else: diff --git a/egs/wsj/s5/steps/subset_ali_dir.sh b/egs/wsj/s5/steps/subset_ali_dir.sh new file mode 100755 index 00000000000..537d91c1248 --- /dev/null +++ b/egs/wsj/s5/steps/subset_ali_dir.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0. + +cmd=run.pl + +if [ -f ./path.sh ]; then . ./path.sh; fi + +. ./utils/parse_options.sh + +if [ $# -ne 4 ]; then + cat < from the + original alignment directory containing alignments for utterances in + . + + The number of split jobs in the output alignment directory is + equal to the number of jobs in the original alignment directory, + unless the subset data directory has too few speakers. + + Usage: $0 [options] + e.g.: $0 data/train_sp data/train exp/tri3_ali_sp exp/tri3_ali + + Options: + --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs. 
+EOF + exit 1 +fi + +data=$1 +subset_data=$2 +ali_dir=$3 +dir=$4 + +nj=$(cat $ali_dir/num_jobs) || exit 1 +utils/split_data.sh $data $nj + +mkdir -p $dir +cp $ali_dir/{final.mdl,*.mat,*_opts,tree} $dir/ || true +cp -r $ali_dir/phones $dir 2>/dev/null || true + +$cmd JOB=1:$nj $dir/log/copy_alignments.JOB.log \ + copy-int-vector "ark:gunzip -c $ali_dir/ali.JOB.gz |" \ + ark,scp:$dir/ali_tmp.JOB.ark,$dir/ali_tmp.JOB.scp || exit 1 + +for n in `seq $nj`; do + cat $dir/ali_tmp.$n.scp +done > $dir/ali_tmp.scp + +num_spk=$(cat $subset_data/spk2utt | wc -l) +if [ $num_spk -lt $nj ]; then + nj=$num_spk +fi + +utils/split_data.sh $subset_data $nj +$cmd JOB=1:$nj $dir/log/filter_alignments.JOB.log \ + copy-int-vector \ + "scp:utils/filter_scp.pl $subset_data/split${nj}/JOB/utt2spk $dir/ali_tmp.scp |" \ + "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1 + +echo $nj > $dir/num_jobs + +rm $dir/ali_tmp.*.{ark,scp} $dir/ali_tmp.scp + +exit 0 diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 579e3d7b3e0..30aff50170b 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -74,7 +74,9 @@ void ProtoSupervision::Write(std::ostream &os, bool binary) const { void SupervisionOptions::Check() const { KALDI_ASSERT(left_tolerance >= 0 && right_tolerance >= 0 && frame_subsampling_factor > 0 && - left_tolerance + right_tolerance >= frame_subsampling_factor); + left_tolerance + right_tolerance + 1 >= frame_subsampling_factor); + + KALDI_ASSERT(lm_scale >= 0.0 && lm_scale < 1.0); } bool AlignmentToProtoSupervision(const SupervisionOptions &opts, @@ -142,9 +144,10 @@ bool ProtoSupervision::operator == (const ProtoSupervision &other) const { fst::Equal(fst, other.fst)); } -bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, - const CompactLattice &lat, - ProtoSupervision *proto_supervision) { +bool PhoneLatticeToProtoSupervisionInternal( + const SupervisionOptions &opts, + const CompactLattice &lat, + ProtoSupervision *proto_supervision) { opts.Check(); if (lat.NumStates() == 0) { KALDI_WARN << "Empty lattice provided"; @@ -176,20 +179,24 @@ bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, return false; } proto_supervision->fst.AddArc(state, - fst::StdArc(phone, phone, - fst::TropicalWeight::One(), - lat_arc.nextstate)); + fst::StdArc(phone, phone, + fst::TropicalWeight( + lat_arc.weight.Weight().Value1() + * opts.lm_scale), + lat_arc.nextstate)); + int32 t_begin = std::max(0, (state_time - opts.left_tolerance)), t_end = std::min(num_frames, (next_state_time + opts.right_tolerance)), - t_begin_subsampled = (t_begin + factor - 1)/ factor, - t_end_subsampled = (t_end + factor - 1)/ factor; + t_begin_subsampled = (t_begin + factor - 1)/ factor, + t_end_subsampled = (t_end + factor - 1)/ factor; for (int32 t_subsampled = t_begin_subsampled; t_subsampled < t_end_subsampled; t_subsampled++) proto_supervision->allowed_phones[t_subsampled].push_back(phone); } if (lat.Final(state) != CompactLatticeWeight::Zero()) { - proto_supervision->fst.SetFinal(state, fst::TropicalWeight::One()); + proto_supervision->fst.SetFinal(state, fst::TropicalWeight( + lat.Final(state).Weight().Value1() * opts.lm_scale)); if (state_times[state] != num_frames) { KALDI_WARN << "Time of final state " << state << " in lattice is " << "not equal to number of frames " << num_frames @@ -207,6 +214,18 @@ bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, return true; } +bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, + const 
CompactLattice &lat, + ProtoSupervision *proto_supervision) { + + if (!PhoneLatticeToProtoSupervisionInternal(opts, lat, proto_supervision)) + return false; + if (opts.lm_scale != 0.0) + fst::Push(&(proto_supervision->fst), + fst::REWEIGHT_TO_INITIAL, fst::kDelta, true); + + return true; +} bool TimeEnforcerFst::GetArc(StateId s, Label ilabel, fst::StdArc* oarc) { // the following call will do the range-check on 'ilabel'. @@ -787,8 +806,10 @@ bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, fst::StdVectorFst supervision_fst_noeps(supervision->fst); fst::RmEpsilon(&supervision_fst_noeps); if (!TryDeterminizeMinimize(kSupervisionMaxStates, - &supervision_fst_noeps)) + &supervision_fst_noeps)) { + KALDI_WARN << "Failed to determinize supervision fst"; return false; + } // note: by default, 'Compose' will call 'Connect', so if the // resulting FST is not connected, it will end up empty. @@ -801,8 +822,10 @@ bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, // determinize and minimize to make it as compact as possible. if (!TryDeterminizeMinimize(kSupervisionMaxStates, - &composed_fst)) + &composed_fst)) { + KALDI_WARN << "Failed to determinize normalized supervision fst"; return false; + } supervision->fst = composed_fst; // Make sure the states are numbered in increasing order of time. diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index 13866e2aba6..e52602e1c12 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -50,10 +50,14 @@ struct SupervisionOptions { int32 left_tolerance; int32 right_tolerance; int32 frame_subsampling_factor; + BaseFloat weight; + BaseFloat lm_scale; SupervisionOptions(): left_tolerance(5), right_tolerance(5), - frame_subsampling_factor(1) { } + frame_subsampling_factor(1), + weight(1.0), + lm_scale(0.0) { } void Register(OptionsItf *opts) { opts->Register("left-tolerance", &left_tolerance, "Left tolerance for " @@ -65,6 +69,13 @@ struct SupervisionOptions { "frame-rate of the original alignment. Applied after " "left-tolerance and right-tolerance are applied (so they are " "in terms of the original num-frames."); + opts->Register("weight", &weight, + "Use this to set the supervision weight for training. " + "This can be used to assign different weights to " + "different data sources."); + opts->Register("lm-scale", &lm_scale, "The scale with which the graph/lm " + "weights from the phone lattice are included in the " + "supervision fst."); } void Check() const; }; diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index ca0428553c1..a3222d2285f 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -54,17 +54,16 @@ double ComputeObjf(bool batchnorm_test_mode, bool dropout_test_mode, end = egs.end(); for (; iter != end; ++iter) prob_computer->Compute(*iter); - const ChainObjectiveInfo *objf_info = - prob_computer->GetObjective("output"); - if (objf_info == NULL) - KALDI_ERR << "Error getting objective info (unsuitable egs?)"; - KALDI_ASSERT(objf_info->tot_weight > 0.0); + + double tot_weight = 0.0; + double tot_objf = prob_computer->GetTotalObjective(&tot_weight); + + KALDI_ASSERT(tot_weight > 0.0); // inf/nan tot_objf->return -inf objective. - double tot_objf = objf_info->tot_like + objf_info->tot_l2_term; if (!(tot_objf == tot_objf && tot_objf - tot_objf == 0)) return -std::numeric_limits::infinity(); // we prefer to deal with normalized objective functions. 
- return tot_objf / objf_info->tot_weight; + return tot_objf / tot_weight; } } diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc index 4f26e145ac5..0117fe2200f 100644 --- a/src/chainbin/nnet3-chain-copy-egs.cc +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -1,8 +1,9 @@ // chainbin/nnet3-chain-copy-egs.cc // Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) -// 2014 Vimal Manohar +// 2014-2017 Vimal Manohar // 2016 Gaofeng Cheng +// 2017 Pegah Ghahremani // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -26,6 +27,40 @@ namespace kaldi { namespace nnet3 { +// renames outputs named "output" to new_name +void RenameOutputs(const std::string &new_name, NnetChainExample *eg) { + bool found_output = false; + for (std::vector::iterator it = eg->outputs.begin(); + it != eg->outputs.end(); ++it) { + if (it->name == "output") { + it->name = new_name; + found_output = true; + } + } + + if (!found_output) + KALDI_ERR << "No supervision with name 'output'" + << "exists in eg."; +} + +// scales the supervision for 'output' by a factor of "weight" +void ScaleSupervisionWeight(BaseFloat weight, NnetChainExample *eg) { + if (weight == 1.0) return; + + bool found_output = false; + for (std::vector::iterator it = eg->outputs.begin(); + it != eg->outputs.end(); ++it) { + if (it->name == "output") { + it->supervision.weight *= weight; + found_output = true; + } + } + + if (!found_output) + KALDI_ERR << "No supervision with name 'output'" + << "exists in eg."; +} + // returns an integer randomly drawn with expected value "expected_count" // (will be either floor(expected_count) or ceil(expected_count)). int32 GetCount(double expected_count) { @@ -37,40 +72,31 @@ int32 GetCount(double expected_count) { return ans; } -void FilterExample(const NnetChainExample &eg, - int32 min_input_t, +/** + This function filters the indexes (and associated feature rows) in a + NnetExample, removing any index/row in an NnetIo named "input" with t < + min_input_t or t > max_input_t and any index/row in an NnetIo named "output" with t < + min_output_t or t > max_output_t. + Will crash if filtering removes all Indexes of "input" or "output". + */ +void FilterExample(int32 min_input_t, int32 max_input_t, int32 min_output_t, int32 max_output_t, - NnetChainExample *eg_out) { - eg_out->inputs.clear(); - eg_out->inputs.resize(eg.inputs.size()); - eg_out->outputs.clear(); - eg_out->outputs.resize(eg.outputs.size()); + NnetChainExample *eg) { // process the inputs - for (size_t i = 0; i < eg.inputs.size(); i++) { - bool is_input; + for (size_t i = 0; i < eg->inputs.size(); i++) { int32 min_t, max_t; - const NnetIo &io_in = eg.inputs[i]; - NnetIo &io_out = eg_out->inputs[i]; - const std::string &name = io_in.name; - io_out.name = name; - if (name == "input") { + NnetIo &io = eg->inputs[i]; + if (io.name == "input") { min_t = min_input_t; max_t = max_input_t; - is_input = true; - } else { - is_input = false; - } - if (!is_input) { // Just copy everything. 
- io_out.indexes = io_in.indexes; - io_out.features = io_in.features; - } else { - const std::vector &indexes_in = io_in.indexes; - std::vector &indexes_out = io_out.indexes; + + const std::vector &indexes_in = io.indexes; + std::vector indexes_out; indexes_out.reserve(indexes_in.size()); int32 num_indexes = indexes_in.size(), num_kept = 0; - KALDI_ASSERT(io_in.features.NumRows() == num_indexes); + KALDI_ASSERT(io.features.NumRows() == num_indexes); std::vector keep(num_indexes, false); std::vector::const_iterator iter_in = indexes_in.begin(), end_in = indexes_in.end(); @@ -86,27 +112,26 @@ void FilterExample(const NnetChainExample &eg, } KALDI_ASSERT(iter_out == keep.end()); if (num_kept == 0) - KALDI_ERR << "FilterExample removed all indexes for '" << name << "'"; + KALDI_ERR << "FilterExample removed all indexes for '" << io.name << "'"; + io.indexes = indexes_out; - FilterGeneralMatrixRows(io_in.features, keep, - &io_out.features); - KALDI_ASSERT(io_out.features.NumRows() == num_kept && + GeneralMatrix features_out; + FilterGeneralMatrixRows(io.features, keep, &features_out); + io.features = features_out; + KALDI_ASSERT(io.features.NumRows() == num_kept && indexes_out.size() == static_cast(num_kept)); } } - // process the outputs, we will copy all supervision - // output as default - for (size_t i = 0; i < eg.outputs.size(); i++) { - const NnetChainSupervision &io_in = eg.outputs[i]; - NnetChainSupervision &io_out = eg_out->outputs[i]; - const std::string &name = io_in.name; - io_out.name = name; - io_out.indexes = io_in.indexes; - io_out.supervision = io_in.supervision; - io_out.deriv_weights = io_in.deriv_weights; - } } + +/** Returns true if the "eg" contains just a single example, meaning + that all the "n" values in the indexes are zero, and the example + has NnetIo members named both "input" and "output" + + Also computes the minimum and maximum "t" values in the "input" and + "output" NnetIo members. 
+ */ bool ContainsSingleExample(const NnetChainExample &eg, int32 *min_input_t, int32 *max_input_t, @@ -196,15 +221,14 @@ void CalculateFrameSubsamplingFactor(const NnetChainExample &eg, - eg.outputs[0].indexes[0].t; } -void ModifyChainExampleContext(const NnetChainExample &eg, - int32 left_context, +void ModifyChainExampleContext(int32 left_context, int32 right_context, const int32 frame_subsampling_factor, - NnetChainExample *eg_out) { + NnetChainExample *eg) { static bool warned_left = false, warned_right = false; int32 min_input_t, max_input_t, min_output_t, max_output_t; - if (!ContainsSingleExample(eg, &min_input_t, &max_input_t, + if (!ContainsSingleExample(*eg, &min_input_t, &max_input_t, &min_output_t, &max_output_t)) KALDI_ERR << "Too late to perform frame selection/context reduction on " << "these examples (already merged?)"; @@ -235,11 +259,11 @@ void ModifyChainExampleContext(const NnetChainExample &eg, max_input_t = std::min(max_input_t, max_output_t + right_context); } } - FilterExample(eg, - min_input_t, max_input_t, + FilterExample(min_input_t, max_input_t, min_output_t, max_output_t, - eg_out); + eg); } // ModifyChainExampleContext + } // namespace nnet3 } // namespace kaldi @@ -268,6 +292,8 @@ int main(int argc, char *argv[]) { int32 frame_subsampling_factor = -1; BaseFloat keep_proportion = 1.0; int32 left_context = -1, right_context = -1; + std::string eg_weight_rspecifier, eg_output_name_rspecifier; + ParseOptions po(usage); po.Register("random", &random, "If true, will write frames to output " "archives randomly, not round-robin."); @@ -285,6 +311,15 @@ int main(int argc, char *argv[]) { "feature left-context that we output."); po.Register("right-context", &right_context, "Can be used to truncate the " "feature right-context that we output."); + po.Register("weights", &eg_weight_rspecifier, + "Rspecifier indexed by the key of egs, providing a weight by " + "which we will scale the supervision matrix for that eg. " + "Used in multilingual training."); + po.Register("outputs", &eg_output_name_rspecifier, + "Rspecifier indexed by the key of egs, providing a string-valued " + "output name, e.g. 'output-0'. If provided, the NnetIo with " + "name 'output' will be renamed to the provided name. Used in " + "multilingual training."); po.Read(argc, argv); srand(srand_seed); @@ -298,6 +333,11 @@ int main(int argc, char *argv[]) { SequentialNnetChainExampleReader example_reader(examples_rspecifier); + // In the normal case, these would not be used. These are only applicable + // for multi-task or multilingual training. + RandomAccessTokenReader output_name_reader(eg_output_name_rspecifier); + RandomAccessBaseFloatReader egs_weight_reader(eg_weight_rspecifier); + int32 num_outputs = po.NumArgs() - 1; std::vector example_writers(num_outputs); for (int32 i = 0; i < num_outputs; i++) @@ -307,38 +347,47 @@ int main(int argc, char *argv[]) { // not configurable for now. exclude_names.push_back(std::string("ivector")); - int64 num_read = 0, num_written = 0; - + int64 num_read = 0, num_written = 0, num_err = 0; for (; !example_reader.Done(); example_reader.Next(), num_read++) { + const std::string &key = example_reader.Key(); + NnetChainExample &eg = example_reader.Value(); if (frame_subsampling_factor == -1) - CalculateFrameSubsamplingFactor(example_reader.Value(), + CalculateFrameSubsamplingFactor(eg, &frame_subsampling_factor); // count is normally 1; could be 0, or possibly >1. 
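The new --outputs and --weights options are how the multilingual and semi-supervised recipes consume the cegs.output.*.ark and cegs.weight.*.ark files written by the combine scripts above: each is an rspecifier indexed by the example key, and for every copied example the supervision named 'output' is renamed to the given output name and its weight is scaled. A minimal sketch, with hypothetical archive paths:

    nnet3-chain-copy-egs \
      --outputs=ark:exp/multi/egs/cegs.output.1.ark \
      --weights=ark:exp/multi/egs/cegs.weight.1.ark \
      scp:exp/multi/egs/cegs.1.scp ark:exp/multi/egs/cegs_multi.1.ark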
int32 count = GetCount(keep_proportion); - std::string key = example_reader.Key(); - if (frame_shift == 0 && - left_context == -1 && right_context == -1) { - const NnetChainExample &eg = example_reader.Value(); - for (int32 c = 0; c < count; c++) { - int32 index = (random ? Rand() : num_written) % num_outputs; - example_writers[index]->Write(key, eg); - num_written++; + + if (!eg_weight_rspecifier.empty()) { + BaseFloat weight = 1.0; + if (!egs_weight_reader.HasKey(key)) { + KALDI_WARN << "No weight for example key " << key; + num_err++; + continue; } - } else if (count > 0) { - NnetChainExample eg = example_reader.Value(); - if (frame_shift != 0) - ShiftChainExampleTimes(frame_shift, exclude_names, &eg); - NnetChainExample eg_out; - if (left_context != -1 || right_context != -1) - ModifyChainExampleContext(eg, left_context, right_context, - frame_subsampling_factor, &eg_out); - else - eg_out.Swap(&eg); - for (int32 c = 0; c < count; c++) { - int32 index = (random ? Rand() : num_written) % num_outputs; - example_writers[index]->Write(key, eg_out); - num_written++; + weight = egs_weight_reader.Value(key); + ScaleSupervisionWeight(weight, &eg); + } + + if (!eg_output_name_rspecifier.empty()) { + if (!output_name_reader.HasKey(key)) { + KALDI_WARN << "No new output-name for example key " << key; + num_err++; + continue; } + std::string new_output_name = output_name_reader.Value(key); + RenameOutputs(new_output_name, &eg); + } + + if (frame_shift != 0) + ShiftChainExampleTimes(frame_shift, exclude_names, &eg); + if (left_context != -1 || right_context != -1) + ModifyChainExampleContext(left_context, right_context, + frame_subsampling_factor, &eg); + + for (int32 c = 0; c < count; c++) { + int32 index = (random ? Rand() : num_written) % num_outputs; + example_writers[index]->Write(key, eg); + num_written++; } } for (int32 i = 0; i < num_outputs; i++) diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index c8c251900ec..ef545ab9162 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -36,13 +36,51 @@ namespace nnet3 { supervision objects to 'example_writer'. Note: if normalization_fst is the empty FST (with no states), it skips the final stage of egs preparation and you should do it later with nnet3-chain-normalize-egs. -*/ + + @param [in] normalization_fst A version of denominator FST used to add weights + to the created supervision. It is + actually an FST expected to have the + labels as (pdf-id+1) + @param [in] feats Input feature matrix + @param [in] ivector_feats Online iVector matrix sub-sampled at a + rate of "ivector_period". + If NULL, iVector will not be added + as in input to the egs. + @param [in] ivector_period Number of frames between iVectors in + "ivector_feats" matrix. + @param [in] supervision Supervision for 'chain' training created + from the binary chain-get-supervision. + This is expected to be at a + sub-sampled rate if + --frame-subsampling-factor > 1. + @param [in] deriv_weights Vector of per-frame weights that scale + a frame's gradient during backpropagation. + If NULL, this is equivalent to specifying + a vector of all 1s. + The dimension of the vector is expected + to be the supervision size, which is + at a sub-sampled rate if + --frame-subsampling-factor > 1. + @param [in] supervision_length_tolerance + Tolerance for difference in num-frames-subsampled between + supervision and deriv weights, and also between supervision + and input frames. 
+ @param [in] utt_id Utterance-id + @param [in] compress If true, compresses the feature matrices. + @param [out] utt_splitter Pointer to UtteranceSplitter object, + which helps to split an utterance into + chunks. This also stores some stats. + @param [out] example_writer Pointer to egs writer. + +**/ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, const GeneralMatrix &feats, const MatrixBase *ivector_feats, int32 ivector_period, const chain::Supervision &supervision, + const VectorBase *deriv_weights, + int32 supervision_length_tolerance, const std::string &utt_id, bool compress, UtteranceSplitter *utt_splitter, @@ -51,7 +89,18 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, int32 num_input_frames = feats.NumRows(), num_output_frames = supervision.frames_per_sequence; - if (!utt_splitter->LengthsMatch(utt_id, num_input_frames, num_output_frames)) + int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor; + + if (deriv_weights && (std::abs(deriv_weights->Dim() - num_output_frames) + > supervision_length_tolerance)) { + KALDI_WARN << "For utterance " << utt_id + << ", mismatch between deriv-weights dim and num-output-frames" + << "; " << deriv_weights->Dim() << " vs " << num_output_frames; + return false; + } + + if (!utt_splitter->LengthsMatch(utt_id, num_input_frames, num_output_frames, + supervision_length_tolerance)) return false; // LengthsMatch() will have printed a warning. std::vector chunks; @@ -65,8 +114,6 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, return false; } - int32 frame_subsampling_factor = utt_splitter->Config().frame_subsampling_factor; - chain::SupervisionSplitter sup_splitter(supervision); for (size_t c = 0; c < chunks.size(); c++) { @@ -92,19 +139,36 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, int32 first_frame = 0; // we shift the time-indexes of all these parts so // that the supervised part starts from frame 0. + + NnetChainExample nnet_chain_eg; + nnet_chain_eg.outputs.resize(1); SubVector output_weights( &(chunk.output_weights[0]), static_cast(chunk.output_weights.size())); - NnetChainSupervision nnet_supervision("output", supervision_part, - output_weights, - first_frame, - frame_subsampling_factor); + if (!deriv_weights) { + NnetChainSupervision nnet_supervision("output", supervision_part, + output_weights, + first_frame, + frame_subsampling_factor); + nnet_chain_eg.outputs[0].Swap(&nnet_supervision); + } else { + Vector this_deriv_weights(num_frames_subsampled); + for (int32 i = 0; i < num_frames_subsampled; i++) { + int32 t = i + start_frame_subsampled; + if (t < deriv_weights->Dim()) + this_deriv_weights(i) = (*deriv_weights)(t); + } + KALDI_ASSERT(output_weights.Dim() == num_frames_subsampled); + this_deriv_weights.MulElements(output_weights); + NnetChainSupervision nnet_supervision("output", supervision_part, + this_deriv_weights, + first_frame, + frame_subsampling_factor); + nnet_chain_eg.outputs[0].Swap(&nnet_supervision); + } - NnetChainExample nnet_chain_eg; - nnet_chain_eg.outputs.resize(1); - nnet_chain_eg.outputs[0].Swap(&nnet_supervision); nnet_chain_eg.inputs.resize(ivector_feats != NULL ? 
2 : 1); int32 tot_input_frames = chunk.left_context + chunk.num_frames + @@ -176,13 +240,15 @@ int main(int argc, char *argv[]) { "chain-get-supervision.\n"; bool compress = true; - int32 length_tolerance = 100, online_ivector_period = 1; + int32 length_tolerance = 100, online_ivector_period = 1, + supervision_length_tolerance = 1; ExampleGenerationConfig eg_config; // controls num-frames, // left/right-context, etc. + BaseFloat normalization_fst_scale = 1.0; int32 srand_seed = 0; - std::string online_ivector_rspecifier; + std::string online_ivector_rspecifier, deriv_weights_rspecifier; ParseOptions po(usage); po.Register("compress", &compress, "If true, write egs with input features " @@ -200,6 +266,20 @@ int main(int argc, char *argv[]) { po.Register("srand", &srand_seed, "Seed for random number generator "); po.Register("length-tolerance", &length_tolerance, "Tolerance for " "difference in num-frames between feat and ivector matrices"); + po.Register("supervision-length-tolerance", &supervision_length_tolerance, + "Tolerance for difference in num-frames-subsampled between " + "supervision and deriv weights, and also between supervision " + "and input frames."); + po.Register("deriv-weights-rspecifier", &deriv_weights_rspecifier, + "Per-frame weights that scales a frame's gradient during " + "backpropagation. " + "Not specifying this is equivalent to specifying a vector of " + "all 1s."); + po.Register("normalization-fst-scale", &normalization_fst_scale, + "Scale the weights from the " + "'normalization' FST before applying them to the examples. " + "(Useful for semi-supervised training)"); + eg_config.Register(&po); po.Read(argc, argv); @@ -235,6 +315,12 @@ int main(int argc, char *argv[]) { if (!normalization_fst_rxfilename.empty()) { ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); KALDI_ASSERT(normalization_fst.NumStates() > 0); + + if (normalization_fst_scale <= 0.0) + KALDI_ERR << "Invalid scale on normalization FST; must be > 0.0"; + + if (normalization_fst_scale != 1.0) + ApplyProbabilityScale(normalization_fst_scale, &normalization_fst); } // Read as GeneralMatrix so we don't need to un-compress and re-compress @@ -245,6 +331,8 @@ int main(int argc, char *argv[]) { NnetChainExampleWriter example_writer(examples_wspecifier); RandomAccessBaseFloatMatrixReader online_ivector_reader( online_ivector_rspecifier); + RandomAccessBaseFloatVectorReader deriv_weights_reader( + deriv_weights_rspecifier); int32 num_err = 0; @@ -278,10 +366,24 @@ int main(int argc, char *argv[]) { num_err++; continue; } + + const Vector *deriv_weights = NULL; + if (!deriv_weights_rspecifier.empty()) { + if (!deriv_weights_reader.HasKey(key)) { + KALDI_WARN << "No deriv weights for utterance " << key; + num_err++; + continue; + } else { + // this address will be valid until we call HasKey() or Value() + // again. 
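+          // ProcessFile() copies the frames it needs into a per-chunk vector
+          // and multiplies them by the chunk's output weights, so the pointer
+          // is only dereferenced inside the ProcessFile() call below.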
+ deriv_weights = &(deriv_weights_reader.Value(key)); + } + } if (!ProcessFile(normalization_fst, feats, online_ivector_feats, online_ivector_period, - supervision, key, compress, + supervision, deriv_weights, supervision_length_tolerance, + key, compress, &utt_splitter, &example_writer)) num_err++; } diff --git a/src/chainbin/nnet3-chain-normalize-egs.cc b/src/chainbin/nnet3-chain-normalize-egs.cc index 9d3f56f756a..a97797e3246 100644 --- a/src/chainbin/nnet3-chain-normalize-egs.cc +++ b/src/chainbin/nnet3-chain-normalize-egs.cc @@ -41,7 +41,13 @@ int main(int argc, char *argv[]) { "e.g.\n" "nnet3-chain-normalize-egs dir/normalization.fst ark:train_in.cegs ark:train_out.cegs\n"; + BaseFloat normalization_fst_scale = 1.0; + ParseOptions po(usage); + po.Register("normalization-fst-scale", &normalization_fst_scale, + "Scale the weights from the " + "'normalization' FST before applying them to the examples. " + "(Useful for semi-supervised training)"); po.Read(argc, argv); @@ -57,6 +63,12 @@ int main(int argc, char *argv[]) { fst::StdVectorFst normalization_fst; ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); + if (normalization_fst_scale < 0.0) + KALDI_ERR << "Invalid scale on normalization FST; must be >= 0.0"; + + if (normalization_fst_scale != 1.0) + ApplyProbabilityScale(normalization_fst_scale, &normalization_fst); + SequentialNnetChainExampleReader example_reader(examples_rspecifier); NnetChainExampleWriter example_writer(examples_wspecifier); @@ -87,5 +99,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/lat/lattice-functions.cc b/src/lat/lattice-functions.cc index b04b23702fb..54c856a9403 100644 --- a/src/lat/lattice-functions.cc +++ b/src/lat/lattice-functions.cc @@ -431,9 +431,10 @@ void ConvertLatticeToPhones(const TransitionModel &trans, arc.olabel = 0; // remove any word. if ((arc.ilabel != 0) // has a transition-id on input.. && (trans.TransitionIdToHmmState(arc.ilabel) == 0) - && (!trans.IsSelfLoop(arc.ilabel))) + && (!trans.IsSelfLoop(arc.ilabel))) { // && trans.IsFinal(arc.ilabel)) // there is one of these per phone... 
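+        // (a transition-id at HMM-state 0 that is not a self-loop occurs
+        // once per instance of a phone, so each phone gets a single
+        // output label.)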
arc.olabel = trans.TransitionIdToPhone(arc.ilabel); + } aiter.SetValue(arc); } // end looping over arcs } // end looping over states @@ -459,6 +460,8 @@ double ComputeLatticeAlphasAndBetas(const LatticeType &lat, StateId num_states = lat.NumStates(); KALDI_ASSERT(lat.Properties(fst::kTopSorted, true) == fst::kTopSorted); KALDI_ASSERT(lat.Start() == 0); + alpha->clear(); + beta->clear(); alpha->resize(num_states, kLogZeroDouble); beta->resize(num_states, kLogZeroDouble); @@ -1646,4 +1649,110 @@ void ComposeCompactLatticeDeterministic( fst::Connect(composed_clat); } + +void ComputeAcousticScoresMap( + const Lattice &lat, + unordered_map, std::pair, + PairHasher > *acoustic_scores) { + // typedef the arc, weight types + typedef Lattice::Arc Arc; + typedef Arc::Weight LatticeWeight; + typedef Arc::StateId StateId; + + acoustic_scores->clear(); + + std::vector state_times; + LatticeStateTimes(lat, &state_times); // Assumes the input is top sorted + + KALDI_ASSERT(lat.Start() == 0); + + for (StateId s = 0; s < lat.NumStates(); s++) { + int32 t = state_times[s]; + for (fst::ArcIterator aiter(lat, s); !aiter.Done(); + aiter.Next()) { + const Arc &arc = aiter.Value(); + const LatticeWeight &weight = arc.weight; + + int32 tid = arc.ilabel; + + if (tid != 0) { + unordered_map, std::pair, + PairHasher >::iterator it = acoustic_scores->find(std::make_pair(t, tid)); + if (it == acoustic_scores->end()) { + acoustic_scores->insert(std::make_pair(std::make_pair(t, tid), + std::make_pair(weight.Value2(), 1))); + } else { + if (it->second.second == 2 + && it->second.first / it->second.second != weight.Value2()) { + KALDI_VLOG(2) << "Transitions on the same frame have different " + << "acoustic costs for tid " << tid << "; " + << it->second.first / it->second.second + << " vs " << weight.Value2(); + } + it->second.first += weight.Value2(); + it->second.second++; + } + } else { + // Arcs with epsilon input label (tid) must have 0 acoustic cost + KALDI_ASSERT(weight.Value2() == 0); + } + } + + LatticeWeight f = lat.Final(s); + if (f != LatticeWeight::Zero()) { + // Final acoustic cost must be 0 as we are reading from + // non-determinized, non-compact lattice + KALDI_ASSERT(f.Value2() == 0.0); + } + } +} + +void ReplaceAcousticScoresFromMap( + const unordered_map, std::pair, + PairHasher > &acoustic_scores, + Lattice *lat) { + // typedef the arc, weight types + typedef Lattice::Arc Arc; + typedef Arc::Weight LatticeWeight; + typedef Arc::StateId StateId; + + TopSortLatticeIfNeeded(lat); + + std::vector state_times; + LatticeStateTimes(*lat, &state_times); + + KALDI_ASSERT(lat->Start() == 0); + + for (StateId s = 0; s < lat->NumStates(); s++) { + int32 t = state_times[s]; + for (fst::MutableArcIterator aiter(lat, s); + !aiter.Done(); aiter.Next()) { + Arc arc(aiter.Value()); + + int32 tid = arc.ilabel; + if (tid != 0) { + unordered_map, std::pair, + PairHasher >::const_iterator it = acoustic_scores.find(std::make_pair(t, tid)); + if (it == acoustic_scores.end()) { + KALDI_ERR << "Could not find tid " << tid << " at time " << t + << " in the acoustic scores map."; + } else { + arc.weight.SetValue2(it->second.first / it->second.second); + } + } else { + // For epsilon arcs, set acoustic cost to 0.0 + arc.weight.SetValue2(0.0); + } + aiter.SetValue(arc); + } + + LatticeWeight f = lat->Final(s); + if (f != LatticeWeight::Zero()) { + // Set final acoustic cost to 0.0 + f.SetValue2(0.0); + lat->SetFinal(s, f); + } + } +} + } // namespace kaldi diff --git a/src/lat/lattice-functions.h b/src/lat/lattice-functions.h 
index b4b16e6221a..c7fe4833a4a 100644 --- a/src/lat/lattice-functions.h +++ b/src/lat/lattice-functions.h @@ -377,6 +377,50 @@ void ComposeCompactLatticeDeterministic( fst::DeterministicOnDemandFst* det_fst, CompactLattice* composed_clat); +/// This function computes the mapping from the pair +/// (frame-index, transition-id) to the pair +/// (sum-of-acoustic-scores, num-of-occurences) over all occurences of the +/// transition-id in that frame. +/// frame-index in the lattice. +/// This function is useful for retaining the acoustic scores in a +/// non-compact lattice after a process like determinization where the +/// frame-level acoustic scores are typically lost. +/// The function ReplaceAcousticScoresFromMap is used to restore the +/// acoustic scores computed by this function. +/// +/// @param [in] lat Input lattice. Expected to be top-sorted. Otherwise the +/// function will crash. +/// @param [out] acoustic_scores +/// Pointer to a map from the pair (frame-index, +/// transition-id) to a pair (sum-of-acoustic-scores, +/// num-of-occurences). +/// Usually the acoustic scores for a pdf-id (and hence +/// transition-id) on a frame will be the same for all the +/// occurences of the pdf-id in that frame. +/// But if not, we will take the average of the acoustic +/// scores. Hence, we store both the sum-of-acoustic-scores +/// and the num-of-occurences of the transition-id in that +/// frame. +void ComputeAcousticScoresMap( + const Lattice &lat, + unordered_map, std::pair, + PairHasher > *acoustic_scores); + +/// This function restores acoustic scores computed using the function +/// ComputeAcousticScoresMap into the lattice. +/// +/// @param [in] acoustic_scores +/// A map from the pair (frame-index, transition-id) to a +/// pair (sum-of-acoustic-scores, num-of-occurences) of +/// the occurences of the transition-id in that frame. +/// See the comments for ComputeAcousticScoresMap for +/// details. +/// @param [out] lat Pointer to the output lattice. +void ReplaceAcousticScoresFromMap( + const unordered_map, std::pair, + PairHasher > &acoustic_scores, + Lattice *lat); + } // namespace kaldi #endif // KALDI_LAT_LATTICE_FUNCTIONS_H_ diff --git a/src/latbin/lattice-compose.cc b/src/latbin/lattice-compose.cc index b9b261f7d36..df70229bfd8 100644 --- a/src/latbin/lattice-compose.cc +++ b/src/latbin/lattice-compose.cc @@ -22,6 +22,7 @@ #include "util/common-utils.h" #include "fstext/fstext-lib.h" #include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" int main(int argc, char *argv[]) { try { @@ -46,8 +47,10 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); + bool write_compact = true; int32 num_states_cache = 50000; int32 phi_label = fst::kNoLabel; // == -1 + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("phi-label", &phi_label, "If >0, the label on backoff arcs of the LM"); po.Register("num-states-cache", &num_states_cache, "Number of states we cache when mapping LM FST to lattice type. " @@ -67,9 +70,14 @@ int main(int argc, char *argv[]) { int32 n_done = 0, n_fail = 0; SequentialLatticeReader lattice_reader1(lats_rspecifier1); - // Write as compact lattice. 
- CompactLatticeWriter compact_lattice_writer(lats_wspecifier); + + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + if (write_compact) + compact_lattice_writer.Open(lats_wspecifier); + else + lattice_writer.Open(lats_wspecifier); if (ClassifyRspecifier(arg2, NULL, NULL) == kNoRspecifier) { std::string fst_rxfilename = arg2; @@ -102,9 +110,13 @@ int main(int argc, char *argv[]) { KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)"; n_fail++; } else { - CompactLattice clat; - ConvertLattice(composed_lat, &clat); - compact_lattice_writer.Write(key, clat); + if (write_compact) { + CompactLattice clat; + ConvertLattice(composed_lat, &clat); + compact_lattice_writer.Write(key, clat); + } else { + lattice_writer.Write(key, composed_lat); + } n_done++; } } @@ -149,9 +161,13 @@ int main(int argc, char *argv[]) { KALDI_WARN << "Empty lattice for utterance " << key << " (incompatible LM?)"; n_fail++; } else { - CompactLattice clat_out; - ConvertLattice(lat_out, &clat_out); - compact_lattice_writer.Write(key, clat_out); + if (write_compact) { + CompactLattice clat_out; + ConvertLattice(lat_out, &clat_out); + compact_lattice_writer.Write(key, clat_out); + } else { + lattice_writer.Write(key, lat_out); + } n_done++; } } diff --git a/src/latbin/lattice-determinize-non-compact.cc b/src/latbin/lattice-determinize-non-compact.cc index 44ae8566f86..cf73e22980d 100644 --- a/src/latbin/lattice-determinize-non-compact.cc +++ b/src/latbin/lattice-determinize-non-compact.cc @@ -90,101 +90,6 @@ bool DeterminizeLatticeWrapper(const Lattice &lat, return false; } -void ComputeAcousticScoresMap( - const Lattice &lat, - unordered_map, std::pair, - PairHasher > *acoustic_scores) { - acoustic_scores->clear(); - - std::vector state_times; - LatticeStateTimes(lat, &state_times); - - KALDI_ASSERT(lat.Start() == 0); - - for (StateId s = 0; s < lat.NumStates(); s++) { - int32 t = state_times[s]; - for (fst::ArcIterator aiter(lat, s); !aiter.Done(); - aiter.Next()) { - const Arc &arc = aiter.Value(); - const LatticeWeight &weight = arc.weight; - - int32 tid = arc.ilabel; - - if (tid != 0) { - unordered_map, std::pair, - PairHasher >::iterator it = acoustic_scores->find(std::make_pair(t, tid)); - if (it == acoustic_scores->end()) { - acoustic_scores->insert(std::make_pair(std::make_pair(t, tid), - std::make_pair(weight.Value2(), 1))); - } else { - if (it->second.second == 2 - && it->second.first / it->second.second != weight.Value2()) { - KALDI_VLOG(2) << "Transitions on the same frame have different " - << "acoustic costs for tid " << tid << "; " - << it->second.first / it->second.second - << " vs " << weight.Value2(); - } - it->second.first += weight.Value2(); - it->second.second++; - } - } else { - // Arcs with epsilon input label (tid) must have 0 acoustic cost - KALDI_ASSERT(weight.Value2() == 0); - } - } - - LatticeWeight f = lat.Final(s); - if (f != LatticeWeight::Zero()) { - // Final acoustic cost must be 0 as we are reading from - // non-determinized, non-compact lattice - KALDI_ASSERT(f.Value2() == 0.0); - } - } -} - -void ReplaceAcousticScoresFromMap( - const unordered_map, std::pair, - PairHasher > &acoustic_scores, - Lattice *lat) { - fst::TopSort(lat); - - std::vector state_times; - LatticeStateTimes(*lat, &state_times); - - KALDI_ASSERT(lat->Start() == 0); - - for (StateId s = 0; s < lat->NumStates(); s++) { - int32 t = state_times[s]; - for (fst::MutableArcIterator aiter(lat, s); - !aiter.Done(); aiter.Next()) { - Arc arc(aiter.Value()); - - int32 tid = 
arc.ilabel; - if (tid != 0) { - unordered_map, std::pair, - PairHasher >::const_iterator it = acoustic_scores.find(std::make_pair(t, tid)); - if (it == acoustic_scores.end()) { - KALDI_ERR << "Could not find tid " << tid << " at time " << t - << " in the acoustic scores map."; - } else { - arc.weight.SetValue2(it->second.first / it->second.second); - } - } else { - // For epsilon arcs, set acoustic cost to 0.0 - arc.weight.SetValue2(0.0); - } - aiter.SetValue(arc); - } - - LatticeWeight f = lat->Final(s); - if (f != LatticeWeight::Zero()) { - // Set final acoustic cost to 0.0 - f.SetValue2(0.0); - lat->SetFinal(s, f); - } - } -} - } int main(int argc, char *argv[]) { diff --git a/src/latbin/lattice-determinize-phone-pruned.cc b/src/latbin/lattice-determinize-phone-pruned.cc index 0959bcbcd74..94a8530273b 100644 --- a/src/latbin/lattice-determinize-phone-pruned.cc +++ b/src/latbin/lattice-determinize-phone-pruned.cc @@ -1,6 +1,7 @@ // latbin/lattice-determinize-phone-pruned.cc // Copyright 2014 Guoguo Chen +// 2017 Vimal Manohar // See ../../COPYING for clarification regarding multiple authors // @@ -43,11 +44,18 @@ int main(int argc, char *argv[]) { " final.mdl ark:in.lats ark:det.lats\n"; ParseOptions po(usage); + bool write_compact = true; BaseFloat acoustic_scale = 1.0; BaseFloat beam = 10.0; fst::DeterminizeLatticePhonePrunedOptions opts; opts.max_mem = 50000000; + po.Register("write-compact", &write_compact, + "If true, write in normal (compact) form. " + "--write-compact=false allows you to retain frame-level " + "acoustic score information, but this requires the input " + "to be in non-compact form e.g. undeterminized lattice " + "straight from decoding."); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic" " likelihoods."); po.Register("beam", &beam, "Pruning beam [applied after acoustic scaling]."); @@ -70,8 +78,13 @@ int main(int argc, char *argv[]) { // accepts. SequentialLatticeReader lat_reader(lats_rspecifier); - // Writes as compact lattice. 
- CompactLatticeWriter compact_lat_writer(lats_wspecifier); + CompactLatticeWriter compact_lat_writer; + LatticeWriter lat_writer; + + if (write_compact) + compact_lat_writer.Open(lats_wspecifier); + else + lat_writer.Open(lats_wspecifier); int32 n_done = 0, n_warn = 0; @@ -89,6 +102,12 @@ int main(int argc, char *argv[]) { KALDI_VLOG(2) << "Processing lattice " << key; + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + if (!write_compact) + ComputeAcousticScoresMap(lat, &acoustic_scores); + fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale), &lat); CompactLattice det_clat; @@ -106,8 +125,19 @@ int main(int argc, char *argv[]) { sum_depth_out += depth * t; sum_t += t; - fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &det_clat); - compact_lat_writer.Write(key, det_clat); + if (write_compact) { + fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &det_clat); + compact_lat_writer.Write(key, det_clat); + } else { + Lattice out_lat; + fst::ConvertLattice(det_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lat_writer.Write(key, out_lat); + } + n_done++; } diff --git a/src/latbin/lattice-determinize-pruned.cc b/src/latbin/lattice-determinize-pruned.cc index 3e8bca5a3ce..3c6c5796811 100644 --- a/src/latbin/lattice-determinize-pruned.cc +++ b/src/latbin/lattice-determinize-pruned.cc @@ -39,6 +39,7 @@ int main(int argc, char *argv[]) { " e.g.: lattice-determinize-pruned --acoustic-scale=0.1 --beam=6.0 ark:in.lats ark:det.lats\n"; ParseOptions po(usage); + bool write_compact = true; BaseFloat acoustic_scale = 1.0; BaseFloat beam = 10.0; bool minimize = false; @@ -48,6 +49,12 @@ int main(int argc, char *argv[]) { opts.max_mem = 50000000; opts.max_loop = 0; // was 500000; + po.Register("write-compact", &write_compact, + "If true, write in normal (compact) form. " + "--write-compact=false allows you to retain frame-level " + "acoustic score information, but this requires the input " + "to be in non-compact form e.g. undeterminized lattice " + "straight from decoding."); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); po.Register("beam", &beam, "Pruning beam [applied after acoustic scaling]."); @@ -69,8 +76,13 @@ int main(int argc, char *argv[]) { // accepts. SequentialLatticeReader lat_reader(lats_rspecifier); - // Write as compact lattice. - CompactLatticeWriter compact_lat_writer(lats_wspecifier); + CompactLatticeWriter compact_lat_writer; + LatticeWriter lat_writer; + + if (write_compact) + compact_lat_writer.Open(lats_wspecifier); + else + lat_writer.Open(lats_wspecifier); int32 n_done = 0, n_warn = 0; @@ -87,6 +99,12 @@ int main(int argc, char *argv[]) { KALDI_VLOG(2) << "Processing lattice " << key; + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + if (!write_compact) + ComputeAcousticScoresMap(lat, &acoustic_scores); + Invert(&lat); // so word labels are on the input side. 
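+      // Note that the map is computed before applying the acoustic scale, so
+      // ReplaceAcousticScoresFromMap() below restores the original, unscaled
+      // acoustic scores and the non-compact branch does not need to rescale
+      // by 1.0/acoustic_scale.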
lat_reader.FreeCurrent(); fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale), &lat); @@ -121,8 +139,19 @@ int main(int argc, char *argv[]) { sum_depth_out += depth * t; sum_t += t; - fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &det_clat); - compact_lat_writer.Write(key, det_clat); + if (write_compact) { + fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &det_clat); + compact_lat_writer.Write(key, det_clat); + } else { + Lattice out_lat; + fst::ConvertLattice(det_clat, &out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + lat_writer.Write(key, out_lat); + } + n_done++; } diff --git a/src/latbin/lattice-scale.cc b/src/latbin/lattice-scale.cc index 5ca6012d994..f9c61b4c5e3 100644 --- a/src/latbin/lattice-scale.cc +++ b/src/latbin/lattice-scale.cc @@ -39,12 +39,14 @@ int main(int argc, char *argv[]) { " e.g.: lattice-scale --lm-scale=0.0 ark:1.lats ark:scaled.lats\n"; ParseOptions po(usage); + bool write_compact = true; BaseFloat acoustic_scale = 1.0; BaseFloat inv_acoustic_scale = 1.0; BaseFloat lm_scale = 1.0; BaseFloat acoustic2lm_scale = 0.0; BaseFloat lm2acoustic_scale = 0.0; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); po.Register("inv-acoustic-scale", &inv_acoustic_scale, "An alternative way " "of setting the acoustic scale: you can set its inverse."); @@ -62,11 +64,6 @@ int main(int argc, char *argv[]) { std::string lats_rspecifier = po.GetArg(1), lats_wspecifier = po.GetArg(2); - SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier); - - // Write as compact lattice. - CompactLatticeWriter compact_lattice_writer(lats_wspecifier); - int32 n_done = 0; KALDI_ASSERT(acoustic_scale == 1.0 || inv_acoustic_scale == 1.0); @@ -81,12 +78,32 @@ int main(int argc, char *argv[]) { scale[1][0] = lm2acoustic_scale; scale[1][1] = acoustic_scale; - for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) { - CompactLattice lat = compact_lattice_reader.Value(); - ScaleLattice(scale, &lat); - compact_lattice_writer.Write(compact_lattice_reader.Key(), lat); - n_done++; + if (write_compact) { + SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier); + + // Write as compact lattice. + CompactLatticeWriter compact_lattice_writer(lats_wspecifier); + + for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) { + CompactLattice lat = compact_lattice_reader.Value(); + ScaleLattice(scale, &lat); + compact_lattice_writer.Write(compact_lattice_reader.Key(), lat); + n_done++; + } + } else { + SequentialLatticeReader lattice_reader(lats_rspecifier); + + // Write as regular lattice. + LatticeWriter lattice_writer(lats_wspecifier); + + for (; !lattice_reader.Done(); lattice_reader.Next()) { + Lattice lat = lattice_reader.Value(); + ScaleLattice(scale, &lat); + lattice_writer.Write(lattice_reader.Key(), lat); + n_done++; + } } + KALDI_LOG << "Done " << n_done << " lattices."; return (n_done != 0 ? 
0 : 1); } catch(const std::exception &e) { diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index a7e60a5e0c4..81f19c44b5c 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -60,6 +60,7 @@ NnetChainComputeProb::NnetChainComputeProb( deriv_nnet_owned_(false), deriv_nnet_(nnet), num_minibatches_processed_(0) { + KALDI_ASSERT(den_graph_.NumPdfs() > 0); KALDI_ASSERT(nnet_config.store_component_stats && !nnet_config.compute_deriv); } @@ -217,15 +218,43 @@ const ChainObjectiveInfo* NnetChainComputeProb::GetObjective( return NULL; } +double NnetChainComputeProb::GetTotalObjective(double *total_weight) const { + double tot_objectives = 0.0; + double tot_weight = 0.0; + unordered_map::const_iterator + iter = objf_info_.begin(), end = objf_info_.end(); + for (; iter != end; ++iter) { + tot_objectives += iter->second.tot_like + iter->second.tot_l2_term; + tot_weight += iter->second.tot_weight; + } + + if (total_weight) *total_weight = tot_weight; + return tot_objectives; +} + +static bool HasXentOutputs(const Nnet &nnet) { + const std::vector node_names = nnet.GetNodeNames(); + for (std::vector::const_iterator it = node_names.begin(); + it != node_names.end(); ++it) { + int32 node_index = nnet.GetNodeIndex(*it); + if (nnet.IsOutputNode(node_index) && + it->find("-xent") != std::string::npos) { + return true; + } + } + return false; +} + void RecomputeStats(const std::vector &egs, const chain::ChainTrainingOptions &chain_config_in, const fst::StdVectorFst &den_fst, Nnet *nnet) { KALDI_LOG << "Recomputing stats on nnet (affects batch-norm)"; chain::ChainTrainingOptions chain_config(chain_config_in); - if (nnet->GetNodeIndex("output-xent") != -1 && + if (HasXentOutputs(*nnet) && chain_config.xent_regularize == 0) { - // this forces it to compute the output for 'output-xent', which + // this forces it to compute the output for xent outputs, + // usually 'output-xent', which // means that we'll be computing batch-norm stats for any // components in that branch that have batch-norm. chain_config.xent_regularize = 0.1; diff --git a/src/nnet3/nnet-chain-diagnostics.h b/src/nnet3/nnet-chain-diagnostics.h index 4125427c463..49fc5c8f4d8 100644 --- a/src/nnet3/nnet-chain-diagnostics.h +++ b/src/nnet3/nnet-chain-diagnostics.h @@ -83,6 +83,11 @@ class NnetChainComputeProb { // or NULL if there is no such info. const ChainObjectiveInfo *GetObjective(const std::string &output_name) const; + // This function returns the total objective over all output nodes recorded here, and + // outputs to 'tot_weight' the total weight (typically the number of frames) + // corresponding to it. + double GetTotalObjective(double *tot_weight) const; + // if config.compute_deriv == true, returns a reference to the // computed derivative. Otherwise crashes. const Nnet &GetDeriv() const; diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index aad5f83bc80..a05c002c3af 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -31,8 +31,8 @@ void NnetChainSupervision::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, name); WriteIndexVector(os, binary, indexes); supervision.Write(os, binary); - WriteToken(os, binary, ""); // for DerivWeights. Want to save space. 
- WriteVectorAsChar(os, binary, deriv_weights); + WriteToken(os, binary, ""); + deriv_weights.Write(os, binary); WriteToken(os, binary, ""); } @@ -51,8 +51,11 @@ void NnetChainSupervision::Read(std::istream &is, bool binary) { ReadToken(is, binary, &token); // in the future this back-compatibility code can be reworked. if (token != "") { - KALDI_ASSERT(token == ""); - ReadVectorAsChar(is, binary, &deriv_weights); + KALDI_ASSERT(token == "" || token == ""); + if (token == "") + ReadVectorAsChar(is, binary, &deriv_weights); + else + deriv_weights.Read(is, binary); ExpectToken(is, binary, ""); } CheckDim(); @@ -82,8 +85,7 @@ void NnetChainSupervision::CheckDim() const { } if (deriv_weights.Dim() != 0) { KALDI_ASSERT(deriv_weights.Dim() == indexes.size()); - KALDI_ASSERT(deriv_weights.Min() >= 0.0 && - deriv_weights.Max() <= 1.0); + KALDI_ASSERT(deriv_weights.Min() >= 0.0); } } diff --git a/src/nnet3/nnet-chain-example.h b/src/nnet3/nnet-chain-example.h index 2718af746b2..047f30cfc48 100644 --- a/src/nnet3/nnet-chain-example.h +++ b/src/nnet3/nnet-chain-example.h @@ -101,8 +101,8 @@ struct NnetChainSupervision { bool operator == (const NnetChainSupervision &other) const; }; -/// NnetChainExample is like NnetExample, but specialized for CTC training. -/// (actually CCTC training, which is our extension of CTC). +/// NnetChainExample is like NnetExample, but specialized for +/// lattice-free (chain) training. struct NnetChainExample { /// 'inputs' contains the input to the network-- normally just it has just one @@ -110,7 +110,7 @@ struct NnetChainExample { /// "ivector")... this depends on the setup. std::vector inputs; - /// 'outputs' contains the CTC output supervision. There will normally + /// 'outputs' contains the chain output supervision. There will normally /// be just one member with name == "output". 
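+  /// (In multilingual or multi-task setups the output may instead be named
+  /// e.g. 'output-0'; see the --outputs option of nnet3-chain-copy-egs.)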
std::vector outputs; diff --git a/src/nnet3/nnet-diagnostics.cc b/src/nnet3/nnet-diagnostics.cc index 2a6cfe5de6a..ca6124a212f 100644 --- a/src/nnet3/nnet-diagnostics.cc +++ b/src/nnet3/nnet-diagnostics.cc @@ -306,15 +306,17 @@ const SimpleObjectiveInfo* NnetComputeProb::GetObjective( return NULL; } -double NnetComputeProb::GetTotalObjective(double *tot_weight) const { +double NnetComputeProb::GetTotalObjective(double *total_weight) const { double tot_objectives = 0.0; - *tot_weight = 0.0; + double tot_weight = 0.0; unordered_map::const_iterator iter = objf_info_.begin(), end = objf_info_.end(); for (; iter != end; ++iter) { tot_objectives += iter->second.tot_objective; - (*tot_weight) += iter->second.tot_weight; + tot_weight += iter->second.tot_weight; } + + if (total_weight) *total_weight = tot_weight; return tot_objectives; } diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index 65df0c891c1..07112c9d873 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -1265,6 +1265,5 @@ void ExampleMerger::Finish() { stats_.PrintStats(); } - } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index 02620df7485..c93d0dd2c81 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -516,8 +516,6 @@ class ExampleMerger { MapType eg_to_egs_; }; - - } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3bin/nnet3-copy-egs.cc b/src/nnet3bin/nnet3-copy-egs.cc index 02c2e429977..17053ad9b2d 100644 --- a/src/nnet3bin/nnet3-copy-egs.cc +++ b/src/nnet3bin/nnet3-copy-egs.cc @@ -26,39 +26,39 @@ namespace kaldi { namespace nnet3 { -// rename name of NnetIo with old_name to new_name. -void RenameIoNames(const std::string &old_name, - const std::string &new_name, - NnetExample *eg_modified) { - // list of io-names in eg_modified. - std::vector orig_io_list; - int32 io_size = eg_modified->io.size(); - for (int32 io_ind = 0; io_ind < io_size; io_ind++) - orig_io_list.push_back(eg_modified->io[io_ind].name); - - // find the io in eg with name 'old_name'. - int32 rename_io_ind = - std::find(orig_io_list.begin(), orig_io_list.end(), old_name) - - orig_io_list.begin(); - - if (rename_io_ind >= io_size) - KALDI_ERR << "No io-node with name " << old_name + +// renames outputs named "output" to new_name +void RenameOutputs(const std::string &new_name, NnetExample *eg) { + bool found_output = false; + for (std::vector::iterator it = eg->io.begin(); + it != eg->io.end(); ++it) { + if (it->name == "output") { + it->name = new_name; + found_output = true; + } + } + + if (!found_output) + KALDI_ERR << "No io-node with name 'output'" << "exists in eg."; - eg_modified->io[rename_io_ind].name = new_name; } -// ranames NnetIo name with name 'output' to new_output_name -// and scales the supervision for 'output' using weight. -void ScaleAndRenameOutput(BaseFloat weight, - const std::string &new_output_name, - NnetExample *eg) { - // scale the supervision weight for egs - for (int32 i = 0; i < eg->io.size(); i++) - if (eg->io[i].name == "output") - if (weight != 0.0 && weight != 1.0) - eg->io[i].features.Scale(weight); - // rename output io name to 'new_output_name'. 
- RenameIoNames("output", new_output_name, eg); +// scales the supervision for 'output' by a factor of "weight" +void ScaleSupervisionWeight(BaseFloat weight, NnetExample *eg) { + if (weight == 1.0) return; + + bool found_output = false; + for (std::vector::iterator it = eg->io.begin(); + it != eg->io.end(); ++it) { + if (it->name == "output") { + it->features.Scale(weight); + found_output = true; + } + } + + if (!found_output) + KALDI_ERR << "No supervision with name 'output'" + << "exists in eg."; } // returns an integer randomly drawn with expected value "expected_count" @@ -320,7 +320,7 @@ int main(int argc, char *argv[]) { // you can set frame to a number to select a single frame with a particular // offset, or to 'random' to select a random single frame. std::string frame_str, - eg_weight_rspecifier, eg_output_rspecifier; + eg_weight_rspecifier, eg_output_name_rspecifier; ParseOptions po(usage); po.Register("random", &random, "If true, will write frames to output " @@ -347,12 +347,11 @@ int main(int argc, char *argv[]) { "Rspecifier indexed by the key of egs, providing a weight by " "which we will scale the supervision matrix for that eg. " "Used in multilingual training."); - po.Register("outputs", &eg_output_rspecifier, + po.Register("outputs", &eg_output_name_rspecifier, "Rspecifier indexed by the key of egs, providing a string-valued " "output name, e.g. 'output-0'. If provided, the NnetIo with " "name 'output' will be renamed to the provided name. Used in " "multilingual training."); - po.Read(argc, argv); srand(srand_seed); @@ -366,8 +365,11 @@ int main(int argc, char *argv[]) { SequentialNnetExampleReader example_reader(examples_rspecifier); - RandomAccessTokenReader output_reader(eg_output_rspecifier); + // In the normal case, these would not be used. These are only applicable + // for multi-task or multilingual training. + RandomAccessTokenReader output_name_reader(eg_output_name_rspecifier); RandomAccessBaseFloatReader egs_weight_reader(eg_weight_rspecifier); + int32 num_outputs = po.NumArgs() - 1; std::vector example_writers(num_outputs); for (int32 i = 0; i < num_outputs; i++) @@ -376,52 +378,41 @@ int main(int argc, char *argv[]) { int64 num_read = 0, num_written = 0, num_err = 0; for (; !example_reader.Done(); example_reader.Next(), num_read++) { - bool modify_eg_output = !(eg_output_rspecifier.empty() && - eg_weight_rspecifier.empty()); + const std::string &key = example_reader.Key(); + NnetExample &eg = example_reader.Value(); // count is normally 1; could be 0, or possibly >1. int32 count = GetCount(keep_proportion); - std::string key = example_reader.Key(); - NnetExample eg_modified_output; - const NnetExample &eg_orig = example_reader.Value(), - &eg = (modify_eg_output ? eg_modified_output : eg_orig); - // Note: in the normal case we just use 'eg'; eg_modified_output is - // for the case when the --outputs or --weights option is specified - // (only for multilingual training). - BaseFloat weight = 1.0; - std::string new_output_name; - if (modify_eg_output) { // This branch is only taken for multilingual training. 
- eg_modified_output = eg_orig; - if (!eg_weight_rspecifier.empty()) { - if (!egs_weight_reader.HasKey(key)) { - KALDI_WARN << "No weight for example key " << key; - num_err++; - continue; - } - weight = egs_weight_reader.Value(key); + + if (!eg_weight_rspecifier.empty()) { + BaseFloat weight = 1.0; + if (!egs_weight_reader.HasKey(key)) { + KALDI_WARN << "No weight for example key " << key; + num_err++; + continue; } - if (!eg_output_rspecifier.empty()) { - if (!output_reader.HasKey(key)) { - KALDI_WARN << "No new output-name for example key " << key; - num_err++; - continue; - } - new_output_name = output_reader.Value(key); + weight = egs_weight_reader.Value(key); + ScaleSupervisionWeight(weight, &eg); + } + + if (!eg_output_name_rspecifier.empty()) { + if (!output_name_reader.HasKey(key)) { + KALDI_WARN << "No new output-name for example key " << key; + num_err++; + continue; } + std::string new_output_name = output_name_reader.Value(key); + RenameOutputs(new_output_name, &eg); } for (int32 c = 0; c < count; c++) { int32 index = (random ? Rand() : num_written) % num_outputs; if (frame_str == "" && left_context == -1 && right_context == -1 && frame_shift == 0) { - if (modify_eg_output) // Only for multilingual training - ScaleAndRenameOutput(weight, new_output_name, &eg_modified_output); example_writers[index]->Write(key, eg); num_written++; } else { // the --frame option or context options were set. NnetExample eg_modified; if (SelectFromExample(eg, frame_str, left_context, right_context, frame_shift, &eg_modified)) { - if (modify_eg_output) - ScaleAndRenameOutput(weight, new_output_name, &eg_modified); // this branch of the if statement will almost always be taken (should only // not be taken for shorter-than-normal egs from the end of a file. example_writers[index]->Write(key, eg_modified);