Commit

Add TIMIT parsing tests
mmcauliffe committed Sep 23, 2015
1 parent d7e0b1f commit 8982f1f
Showing 4 changed files with 143 additions and 21 deletions.
32 changes: 11 additions & 21 deletions polyglotdb/io/standards/timit.py
@@ -25,8 +25,8 @@ def inspect_discourse_timit(word_path):
    list of AnnotationTypes
        Auto-detected AnnotationTypes for TIMIT
    """
-    annotation_types = [AnnotationType('spelling', 'transcription', None, anchor = True),
-                        AnnotationType('transcription', None, 'spelling', base = True, token = True)]
+    annotation_types = [AnnotationType('spelling', 'surface_transcription', None, anchor = True),
+                        AnnotationType('surface_transcription', None, 'spelling', base = True, token = True)]
    return annotation_types

def timit_to_data(word_path, phone_path, annotation_types = None,
@@ -69,7 +69,7 @@ def timit_to_data(word_path, phone_path, annotation_types = None,
                found.append(p)
                if p.end == end:
                    found_all = True
-            n = 'transcription'
+            n = data.base_levels[0]
            level_count = data.level_length(n)
            word.references.append(n)
            word.begins.append(level_count)
@@ -149,7 +149,7 @@ def load_discourse_timit(corpus_context, word_path, phone_path,
                         feature_system_path = None,
                         stop_check = None, call_back = None):
    """
-    Load a discourse from a text file containing interlinear glosses
+    Load a discourse from a TIMIT style corpus

    Parameters
    ----------
@@ -163,7 +163,7 @@ def load_discourse_timit(corpus_context, word_path, phone_path,
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    feature_system_path : str
-        Full path to pickled FeatureMatrix to use with the Corpus
+        Full path to pickled FeatureMatrix to use with the corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
@@ -180,31 +180,21 @@ def read_phones(path):
    sr = 16000
    with open(path,'r') as file_handle:
        for line in file_handle:
-
            l = line.strip().split(' ')
-            start = float(l[0])
-            end = float(l[1])
+            begin = float(l[0]) / sr
+            end = float(l[1]) / sr
            label = l[2]
-            if sr is not None:
-                start /= sr
-                end /= sr
            output.append(BaseAnnotation(label, begin, end))
-
    return output

def read_words(path):
-    output = list()
+    output = []
    sr = 16000
    with open(path,'r') as file_handle:
        for line in file_handle:
-
            l = line.strip().split(' ')
-            start = float(l[0])
-            end = float(l[1])
+            begin = float(l[0]) / sr
+            end = float(l[1]) / sr
            word = l[2]
-            if sr is not None:
-                start /= sr
-                end /= sr
-            output.append({'spelling':word, 'begin':start, 'end':end})
-
+            output.append({'spelling':word, 'begin':begin, 'end':end})
    return output
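
For reference, a minimal sketch of what the reworked reader returns when run against the test.PHN file added below (the path is an assumption, relative to the repository root; TIMIT offsets are samples at a fixed 16 kHz rate, so the reader now yields times in seconds):

from polyglotdb.io.standards.timit import read_phones, BaseAnnotation

# Each .PHN line is "<start_sample> <end_sample> <label>" at 16 kHz,
# so 2400 samples becomes 2400 / 16000 = 0.15 seconds.
phones = read_phones('tests/data/timit/test.PHN')
assert phones[1] == BaseAnnotation('sh', 2400 / 16000, 4260 / 16000)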
43 changes: 43 additions & 0 deletions tests/data/timit/test.PHN
@@ -0,0 +1,43 @@
0 2400 h#
2400 4260 sh
4260 5480 iy
5480 6516 hv
6516 8856 ae
8856 9610 dcl
9610 9859 d
9859 10639 y
10639 12200 axr
12200 13380 dcl
13380 13540 d
13540 15426 aa
15426 16440 r
16440 17330 kcl
17330 17796 k
17796 19860 s
19860 21880 ux
21880 22180 tcl
22180 22511 t
22511 23436 q
23436 24229 ih
24229 25400 n
25400 26130 gcl
26130 26960 g
26960 27750 r
27750 29368 iy
29368 31140 s
31140 32584 iy
32584 33796 w
33796 36845 aa
36845 38417 sh
38417 38833 epi
38833 39742 w
39742 41560 ao
41560 41934 dx
41934 43730 axr
43730 45067 q
45067 47026 aa
47026 48200 l
48200 49996 y
49996 51876 ix
51876 53756 axr
53756 55840 h#
11 changes: 11 additions & 0 deletions tests/data/timit/test.WRD
@@ -0,0 +1,11 @@
2400 5480 she
5480 9859 had
9859 12200 your
12200 17796 dark
17796 22511 suit
22511 25400 in
25400 32584 greasy
32584 38417 wash
38833 43730 water
45067 48200 all
48200 53756 year
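
The .WRD file uses the same sample-offset layout; a matching sketch of what read_words produces for its first line (same repository-root path assumption as above):

from polyglotdb.io.standards.timit import read_words

# "2400 5480 she" -> spelling plus begin/end converted to seconds.
words = read_words('tests/data/timit/test.WRD')
assert words[0] == {'spelling': 'she', 'begin': 2400 / 16000, 'end': 5480 / 16000}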
78 changes: 78 additions & 0 deletions tests/test_io_timit.py
@@ -0,0 +1,78 @@

import pytest
import os

from polyglotdb.io.standards.timit import (read_phones, read_words,
                                           BaseAnnotation,
                                           timit_to_data)

def test_load_phones(timit_test_dir):
    expected_phones = [BaseAnnotation('h#', 0, 2400 / 16000),
                       BaseAnnotation('sh', 2400 / 16000, 4260 / 16000),
                       BaseAnnotation('iy', 4260 / 16000, 5480 / 16000),
                       BaseAnnotation('hv', 5480 / 16000, 6516 / 16000),
                       BaseAnnotation('ae', 6516 / 16000, 8856 / 16000),
                       BaseAnnotation('dcl', 8856 / 16000, 9610 / 16000),
                       BaseAnnotation('d', 9610 / 16000, 9859 / 16000),
                       BaseAnnotation('y', 9859 / 16000, 10639 / 16000),
                       BaseAnnotation('axr', 10639 / 16000, 12200 / 16000),
                       BaseAnnotation('dcl', 12200 / 16000, 13380 / 16000),
                       BaseAnnotation('d', 13380 / 16000, 13540 / 16000),
                       BaseAnnotation('aa', 13540 / 16000, 15426 / 16000),
                       BaseAnnotation('r', 15426 / 16000, 16440 / 16000),
                       BaseAnnotation('kcl', 16440 / 16000, 17330 / 16000),
                       BaseAnnotation('k', 17330 / 16000, 17796 / 16000),
                       BaseAnnotation('s', 17796 / 16000, 19860 / 16000),
                       BaseAnnotation('ux', 19860 / 16000, 21880 / 16000),
                       BaseAnnotation('tcl', 21880 / 16000, 22180 / 16000),
                       BaseAnnotation('t', 22180 / 16000, 22511 / 16000),
                       BaseAnnotation('q', 22511 / 16000, 23436 / 16000),
                       BaseAnnotation('ih', 23436 / 16000, 24229 / 16000),
                       BaseAnnotation('n', 24229 / 16000, 25400 / 16000),
                       BaseAnnotation('gcl', 25400 / 16000, 26130 / 16000),
                       BaseAnnotation('g', 26130 / 16000, 26960 / 16000),
                       BaseAnnotation('r', 26960 / 16000, 27750 / 16000),
                       BaseAnnotation('iy', 27750 / 16000, 29368 / 16000),
                       BaseAnnotation('s', 29368 / 16000, 31140 / 16000),
                       BaseAnnotation('iy', 31140 / 16000, 32584 / 16000),
                       BaseAnnotation('w', 32584 / 16000, 33796 / 16000),
                       BaseAnnotation('aa', 33796 / 16000, 36845 / 16000),
                       BaseAnnotation('sh', 36845 / 16000, 38417 / 16000),
                       BaseAnnotation('epi', 38417 / 16000, 38833 / 16000),
                       BaseAnnotation('w', 38833 / 16000, 39742 / 16000),
                       BaseAnnotation('ao', 39742 / 16000, 41560 / 16000),
                       BaseAnnotation('dx', 41560 / 16000, 41934 / 16000),
                       BaseAnnotation('axr', 41934 / 16000, 43730 / 16000),
                       BaseAnnotation('q', 43730 / 16000, 45067 / 16000),
                       BaseAnnotation('aa', 45067 / 16000, 47026 / 16000),
                       BaseAnnotation('l', 47026 / 16000, 48200 / 16000),
                       BaseAnnotation('y', 48200 / 16000, 49996 / 16000),
                       BaseAnnotation('ix', 49996 / 16000, 51876 / 16000),
                       BaseAnnotation('axr', 51876 / 16000, 53756 / 16000),
                       BaseAnnotation('h#', 53756 / 16000, 55840 / 16000),
                       ]
    phones = read_phones(os.path.join(timit_test_dir, 'test.PHN'))
    for i, p in enumerate(expected_phones):
        assert p == phones[i]

def test_load_words(timit_test_dir):
    words = read_words(os.path.join(timit_test_dir, 'test.WRD'))
    expected_words = [{'spelling': 'she', 'begin': 2400 / 16000, 'end': 5480 / 16000},
                      {'spelling': 'had', 'begin': 5480 / 16000, 'end': 9859 / 16000},
                      {'spelling': 'your', 'begin': 9859 / 16000, 'end': 12200 / 16000},
                      {'spelling': 'dark', 'begin': 12200 / 16000, 'end': 17796 / 16000},
                      {'spelling': 'suit', 'begin': 17796 / 16000, 'end': 22511 / 16000},
                      {'spelling': 'in', 'begin': 22511 / 16000, 'end': 25400 / 16000},
                      {'spelling': 'greasy', 'begin': 25400 / 16000, 'end': 32584 / 16000},
                      {'spelling': 'wash', 'begin': 32584 / 16000, 'end': 38417 / 16000},
                      {'spelling': 'water', 'begin': 38833 / 16000, 'end': 43730 / 16000},
                      {'spelling': 'all', 'begin': 45067 / 16000, 'end': 48200 / 16000},
                      {'spelling': 'year', 'begin': 48200 / 16000, 'end': 53756 / 16000}]
    for i, w in enumerate(expected_words):
        assert w == words[i]

def test_files_to_data(timit_test_dir):
    words = timit_to_data(os.path.join(timit_test_dir, 'test.WRD'),
                          os.path.join(timit_test_dir, 'test.PHN'))
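
The tests above rely on a timit_test_dir fixture that is not among the files changed in this commit. A minimal conftest.py sketch, assuming the fixture simply points at the tests/data/timit directory added here:

import os
import pytest

@pytest.fixture(scope='session')
def timit_test_dir():
    # Assumed location of the TIMIT test data added in this commit.
    return os.path.join(os.path.dirname(__file__), 'data', 'timit')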