Merge pull request kaldi-asr#8 from kaldi-asr/master

merge
LvHang · Mar 29, 2018 · 6d5e966 · 6d5e966
2 parents ac49815 + 191b39a
commit 6d5e966
Show file tree

Hide file tree

Showing 104 changed files with 5,459 additions and 972 deletions.
diff --git a/egs/aspire/s5/local/nnet3/run_tdnn.sh b/egs/aspire/s5/local/nnet3/run_tdnn.sh
@@ -52,7 +52,7 @@ if [ $stage -le 7 ]; then
   relu-renorm-layer name=tdnn4 dim=1248 input=Append(-3,3)
   relu-renorm-layer name=tdnn5 dim=1248 input=Append(-7,2)
   relu-renorm-layer name=tdnn6 dim=1248
-  output-layer name=output dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec
+  output-layer name=output dim=$num_targets max-change=1.5
 EOF
   steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
 fi

diff --git a/egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh b/egs/babel_multilang/s5/local/nnet3/run_tdnn_multilingual.sh
@@ -247,7 +247,6 @@ if [ $stage -le 10 ] && [ ! -z $megs_dir ]; then
   common_egs_dir="${multi_egs_dirs[@]} $megs_dir"
   steps/nnet3/multilingual/combine_egs.sh $egs_opts \
     --cmd "$decode_cmd" \
-    --samples-per-iter 400000 \
     $num_langs ${common_egs_dir[@]} || exit 1;
 fi
 

diff --git a/egs/chime5/s5/run.sh b/egs/chime5/s5/run.sh
@@ -168,14 +168,6 @@ if [ $stage -le 11 ]; then
 fi
 
 if [ $stage -le 12 ]; then
-  utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph
-  for dset in ${test_sets}; do
-    steps/decode.sh --nj $decode_nj --cmd "$decode_cmd"  --num-threads 4 \
-		    exp/tri2/graph data/${dset} exp/tri2/decode_${dset}
-  done
-fi
-
-if [ $stage -le 13 ]; then
   utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph
   for dset in ${test_sets}; do
     steps/decode.sh --nj $decode_nj --cmd "$decode_cmd"  --num-threads 4 \

diff --git a/egs/cifar/v1/image/get_allowed_lengths.py b/egs/cifar/v1/image/get_allowed_lengths.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+
+# Copyright     2017  Hossein Hadian
+# Apache 2.0
+
+
+""" This script finds a set of allowed lengths for a given OCR/HWR data dir.
+    The allowed lengths are spaced by a factor (like 10%) and are written
+    in an output file named "allowed_lengths.txt" in the output data dir. This
+    file is later used by make_features.py to pad each image sufficiently so that
+    they all have an allowed length. This is intended for end2end chain training.
+"""
+
+import argparse
+import os
+import sys
+import copy
+import math
+import logging
+
+sys.path.insert(0, 'steps')
+import libs.common as common_lib
+
+logger = logging.getLogger('libs')
+logger.setLevel(logging.INFO)
+handler = logging.StreamHandler()
+handler.setLevel(logging.INFO)
+formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
+                              "%(funcName)s - %(levelname)s ] %(message)s")
+handler.setFormatter(formatter)
+logger.addHandler(handler)
+
+def get_args():
+    parser = argparse.ArgumentParser(description="""This script finds a set of
+                                   allowed lengths for a given OCR/HWR data dir.
+                                   Intended for chain training.""")
+    parser.add_argument('factor', type=float, default=12,
+                        help='Spacing (in percentage) between allowed lengths.')
+    parser.add_argument('srcdir', type=str,
+                        help='path to source data dir')
+    parser.add_argument('--coverage-factor', type=float, default=0.05,
+                        help="""Percentage of durations not covered from each
+                             side of duration histogram.""")
+    parser.add_argument('--frame-subsampling-factor', type=int, default=3,
+                        help="""Chain frame subsampling factor.
+                             See steps/nnet3/chain/train.py""")
+
+    args = parser.parse_args()
+    return args
+
+
+def read_kaldi_mapfile(path):
+    """ Read any Kaldi mapping file - like text, .scp files, etc.
+    """
+
+    m = {}
+    with open(path, 'r', encoding='latin-1') as f:
+        for line in f:
+            line = line.strip()
+            sp_pos = line.find(' ')
+            key = line[:sp_pos]
+            val = line[sp_pos+1:]
+            m[key] = val
+    return m
+
+def find_duration_range(img2len, coverage_factor):
+    """Given a map from image id to length, find the start and end duration to cover.
+
+     If we try to cover
+     all durations which occur in the training set, the number of
+     allowed lengths could become very large.
+
+     Returns
+     -------
+     start_dur: int
+     end_dur: int
+    """
+    durs = []
+    for im, imlen in img2len.items():
+        durs.append(int(imlen))
+    durs.sort()
+    to_ignore_dur = 0
+    tot_dur = sum(durs)
+    for d in durs:
+        to_ignore_dur += d
+        if to_ignore_dur * 100.0 / tot_dur > coverage_factor:
+            start_dur = d
+            break
+    to_ignore_dur = 0
+    for d in reversed(durs):
+        to_ignore_dur += d
+        if to_ignore_dur * 100.0 / tot_dur > coverage_factor:
+            end_dur = d
+            break
+    if start_dur < 30:
+        start_dur = 30  # a hard limit to avoid too many allowed lengths --not critical
+    return start_dur, end_dur
+
+
+def find_allowed_durations(start_len, end_len, args):
+    """Given the start and end duration, find a set of
+       allowed durations spaced by args.factor%. Also write
+       out the list of allowed durations and the corresponding
+       allowed lengths (in frames) on disk.
+
+     Returns
+     -------
+     allowed_lengths: list of allowed lengths (in frames)
+    """
+
+    allowed_lengths = []
+    length = start_len
+    with open(os.path.join(args.srcdir, 'allowed_lengths.txt'), 'w', encoding='latin-1') as fp:
+        while length < end_len:
+            if length % args.frame_subsampling_factor != 0:
+                length = (args.frame_subsampling_factor *
+                          (length // args.frame_subsampling_factor))
+            allowed_lengths.append(length)
+            fp.write("{}\n".format(int(length)))
+            length *= args.factor
+    return allowed_lengths
+
+
+
+def main():
+    args = get_args()
+    args.factor = 1.0 + args.factor / 100.0
+
+    image2length = read_kaldi_mapfile(os.path.join(args.srcdir, 'image2num_frames'))
+
+    start_dur, end_dur = find_duration_range(image2length, args.coverage_factor)
+    logger.info("Lengths in the range [{},{}] will be covered. "
+                "Coverage rate: {}%".format(start_dur, end_dur,
+                                      100.0 - args.coverage_factor * 2))
+    logger.info("There will be {} unique allowed lengths "
+                "for the images.".format(int(math.log(end_dur / start_dur) /
+                                             math.log(args.factor))))
+
+    allowed_durations = find_allowed_durations(start_dur, end_dur, args)
+
+
+if __name__ == '__main__':
+      main()
diff --git a/egs/cifar/v1/image/get_image2num_frames.py b/egs/cifar/v1/image/get_image2num_frames.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+
+# Copyright      2018  Hossein Hadian
+
+
+""" This script computes the image lengths (with padding) in an image data dir.
+    The output is written to 'image2num_frames' in the given data dir. This
+    file is later used by image/get_allowed_lengths.py to find a set of allowed lengths
+    for the data dir. The output format is similar to utt2num_frames
+
+"""
+
+import argparse
+import os
+import sys
+import numpy as np
+from scipy import misc
+
+parser = argparse.ArgumentParser(description="""Computes the image lengths (i.e. width) in an image data dir
+                                                and writes them (by default) to image2num_frames.""")
+parser.add_argument('dir', type=str,
+                    help='Source data directory (containing images.scp)')
+parser.add_argument('--out-ark', type=str, default=None,
+                    help='Where to write the output image-to-num_frames info. '
+                    'Default: "dir"/image2num_frames')
+parser.add_argument('--feat-dim', type=int, default=40,
+                    help='Size to scale the height of all images')
+parser.add_argument('--padding', type=int, default=5,
+                    help='Number of white pixels to pad on the left'
+                    'and right side of the image.')
+args = parser.parse_args()
+
+
+def get_scaled_image_length(im):
+    scale_size = args.feat_dim
+    sx = im.shape[1]
+    sy = im.shape[0]
+    scale = (1.0 * scale_size) / sy
+    nx = int(scale * sx)
+    return nx
+
+### main ###
+data_list_path = os.path.join(args.dir,'images.scp')
+
+if not args.out_ark:
+    args.out_ark = os.path.join(args.dir,'image2num_frames')
+if args.out_ark == '-':
+    out_fh = sys.stdout
+else:
+    out_fh = open(args.out_ark, 'w', encoding='latin-1')
+
+with open(data_list_path) as f:
+    for line in f:
+        line = line.strip()
+        line_vect = line.split(' ')
+        image_id = line_vect[0]
+        image_path = line_vect[1]
+        im = misc.imread(image_path)
+        im_len = get_scaled_image_length(im) + (args.padding * 2)
+        print('{} {}'.format(image_id, im_len), file=out_fh)
+
+out_fh.close()
diff --git a/egs/csj/s5/local/nnet3/run_tdnn.sh b/egs/csj/s5/local/nnet3/run_tdnn.sh
@@ -72,7 +72,7 @@ if [ $stage -le 9 ]; then
   relu-renorm-layer name=tdnn5 input=Append(-7,2) dim=1024
   relu-renorm-layer name=tdnn6 dim=1024
 
-  output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec
+  output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5
 EOF
 
   steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/

diff --git a/egs/fisher_english/s5/local/fisher_create_test_lang.sh b/egs/fisher_english/s5/local/fisher_create_test_lang.sh
@@ -1,23 +1,25 @@
 #!/bin/bash
-#
 
-if [ -f path.sh ]; then . ./path.sh; fi
-
-mkdir -p data/lang_test
+# This script formats ARPA LM into G.fst.
 
 arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
+dir=data/lang_test
+
+if [ -f ./path.sh ]; then . ./path.sh; fi
+. utils/parse_options.sh
+
 [ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1;
 
-mkdir -p data/lang_test
-cp -r data/lang/* data/lang_test
+mkdir -p $dir
+cp -r data/lang/* $dir
 
 gunzip -c "$arpa_lm" | \
   arpa2fst --disambig-symbol=#0 \
-           --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
+           --read-symbol-table=$dir/words.txt - $dir/G.fst
 
 
 echo  "Checking how stochastic G is (the first of these numbers should be small):"
-fstisstochastic data/lang_test/G.fst
+fstisstochastic $dir/G.fst
 
 ## Check lexicon.
 ## just have a look and make sure it seems sane.
@@ -27,22 +29,21 @@ fstprint   --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/l
 echo Performing further checks
 
 # Checking that G.fst is determinizable.
-fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G.
+fstdeterminize $dir/G.fst /dev/null || echo Error determinizing G.
 
 # Checking that L_disambig.fst is determinizable.
-fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L.
+fstdeterminize $dir/L_disambig.fst /dev/null || echo Error determinizing L.
 
 # Checking that disambiguated lexicon times G is determinizable
 # Note: we do this with fstdeterminizestar not fstdeterminize, as
 # fstdeterminize was taking forever (presumably relates to a bug
 # in this version of OpenFst that makes determinization slow for
 # some case).
-fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \
+fsttablecompose $dir/L_disambig.fst $dir/G.fst | \
    fstdeterminizestar >/dev/null || echo Error
 
 # Checking that LG is stochastic:
-fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
+fsttablecompose data/lang/L_disambig.fst $dir/G.fst | \
    fstisstochastic || echo "[log:] LG is not stochastic"
 
-
 echo "$0 succeeded"