More robust labbcat parser and added MAUS parser

MontrealCorpusTools · Mar 21, 2019 · 9417a4a · 9417a4a
1 parent 8c87e9f
commit 9417a4a
Show file tree

Hide file tree

Showing 12 changed files with 307 additions and 16 deletions.
diff --git a/polyglotdb/io/__init__.py b/polyglotdb/io/__init__.py
@@ -3,12 +3,12 @@
 
 from .parsers import (BuckeyeParser, IlgParser, OrthographyTextParser,
                     TranscriptionTextParser, TextgridParser, TimitParser,
-                    MfaParser, LabbCatParser, FaveParser, PartiturParser)
+                    MfaParser, MausParser, LabbCatParser, FaveParser, PartiturParser)
 
 from .inspect import (inspect_buckeye, inspect_orthography,
                     inspect_transcription, inspect_textgrid, inspect_timit,
                     inspect_ilg, inspect_mfa, inspect_labbcat,
-                    inspect_fave, inspect_partitur)
+                    inspect_fave, inspect_partitur, inspect_maus)
 
 from .exporters import save_results
 

diff --git a/polyglotdb/io/inspect/__init__.py b/polyglotdb/io/inspect/__init__.py
@@ -6,6 +6,8 @@
 
 from .mfa import inspect_mfa
 
+from .maus import inspect_maus
+
 from .partitur import inspect_partitur
 
 from .text_orthography import inspect_orthography

diff --git a/polyglotdb/io/inspect/fave.py b/polyglotdb/io/inspect/fave.py
@@ -20,8 +20,8 @@ def inspect_fave(path):
     :class:`~polyglotdb.io.parsers.ilg.FaveParser`
         Autodetected parser for the text file
     """
-    annotation_types = [OrthographyTier('word', 'word'),
-                        OrthographyTier('phone', 'phone')]
+    annotation_types = [OrthographyTier(FaveParser.word_label, 'word'),
+                        OrthographyTier(FaveParser.phone_label, 'phone')]
 
     annotation_types[0].label = True
     annotation_types[1].label = True

diff --git a/polyglotdb/io/inspect/maus.py b/polyglotdb/io/inspect/maus.py
@@ -0,0 +1,31 @@
+from polyglotdb.structure import Hierarchy
+
+from ..types.parsing import *
+
+from ..parsers import MausParser
+
+
+def inspect_maus(path):
+    """
+    Generate an :class:`~polyglotdb.io.parsers.maus.MausParser`
+    for a specified text file for parsing it as a MAUS file
+
+    Parameters
+    ----------
+    path : str
+        Full path to text file
+
+    Returns
+    -------
+    :class:`~polyglotdb.io.parsers.maus.MausParser`
+        Autodetected parser for MAUS TextGrids
+    """
+
+    annotation_types = [OrthographyTier(MausParser.word_label, 'word'),
+                        OrthographyTier(MausParser.phone_label, 'phone')]
+
+    annotation_types[0].label = True
+    annotation_types[1].label = True
+    hierarchy = Hierarchy({'phone': 'word', 'word': None})
+
+    return MausParser(annotation_types, hierarchy)
diff --git a/polyglotdb/io/inspect/mfa.py b/polyglotdb/io/inspect/mfa.py
@@ -7,8 +7,8 @@
 
 def inspect_mfa(path):
     """
-    Generate an :class:`~polyglotdb.io.parsers.ilg.MfaParser`
-    for a specified text file for parsing it as a Mfa file
+    Generate an :class:`~polyglotdb.io.parsers.mfa.MfaParser`
+    for a specified text file for parsing it as a MFA file
 
     Parameters
     ----------
@@ -17,12 +17,12 @@ def inspect_mfa(path):
 
     Returns
     -------
-    :class:`~polyglotdb.io.parsers.ilg.Mfa`
-        Autodetected parser for Mfa
+    :class:`~polyglotdb.io.parsers.mfa.MfaParser`
+        Autodetected parser for MFA
     """
 
-    annotation_types = [OrthographyTier('words', 'word'),
-                        OrthographyTier('phones', 'phone')]
+    annotation_types = [OrthographyTier(MfaParser.word_label, 'word'),
+                        OrthographyTier(MfaParser.phone_label, 'phone')]
 
     annotation_types[0].label = True
     annotation_types[1].label = True

diff --git a/polyglotdb/io/parsers/__init__.py b/polyglotdb/io/parsers/__init__.py
@@ -8,6 +8,8 @@
 
 from .fave import FaveParser
 
+from .maus import MausParser
+
 from .text_orthography import OrthographyTextParser
 
 from .text_transcription import TranscriptionTextParser

diff --git a/polyglotdb/io/parsers/aligner.py b/polyglotdb/io/parsers/aligner.py
@@ -81,13 +81,8 @@ def parse_discourse(self, path, types_only=False):
         :class:`~polyglotdb.io.discoursedata.DiscourseData`
             Parsed data from the file
         '''
-        tg = TextGrid(strict=False)
-        try:
-            tg.read(path)
-        except Exception as e:
-            print('There was an issue parsing {}:'.format(path))
-            raise
 
+        tg = self.load_textgrid(path)
         multiple_speakers, is_valid = self._is_valid(tg)
 
         if not is_valid:

diff --git a/polyglotdb/io/parsers/labbcat.py b/polyglotdb/io/parsers/labbcat.py
@@ -1,3 +1,5 @@
+from collections import Counter
+from textgrid import TextGrid, IntervalTier
 from .aligner import AlignerParser
 
 
@@ -6,3 +8,27 @@ class LabbCatParser(AlignerParser):
     word_label = 'transcript'
     phone_label = 'segment'
     speaker_first = False
+
+    def load_textgrid(self, path):
+        tg = TextGrid(strict=False)
+        try:
+            tg.read(path)
+            new_tiers = []
+            dup_tiers_maxes = {k:0 for k,v in Counter([t.name for t in tg.tiers]).items() if v > 1}
+            dup_tiers_inds = {k:0 for k in dup_tiers_maxes.keys()}
+
+            for i, t in enumerate(tg.tiers):
+                if t.name in dup_tiers_maxes:
+                    if len(t) > dup_tiers_maxes[t.name]:
+                        dup_tiers_maxes[t.name] = len(t)
+                        dup_tiers_inds[t.name] = i
+            for i, t in enumerate(tg.tiers):
+                if t.name in dup_tiers_maxes:
+                    if i != dup_tiers_inds[t.name]:
+                        continue
+                new_tiers.append(t)
+            tg.tiers = new_tiers
+            return tg
+        except Exception as e:
+            print('There was an issue parsing {}:'.format(path))
+            raise
diff --git a/polyglotdb/io/parsers/maus.py b/polyglotdb/io/parsers/maus.py
@@ -0,0 +1,10 @@
+from collections import Counter
+from textgrid import TextGrid, IntervalTier
+from .aligner import AlignerParser
+
+
+class MausParser(AlignerParser):
+    name = 'Maus'
+    word_label = 'ort'
+    phone_label = 'mau'
+
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -73,6 +73,11 @@ def mfa_test_dir(textgrid_test_dir):
     return os.path.join(textgrid_test_dir, 'mfa')
 
 
+@pytest.fixture(scope='session')
+def maus_test_dir(textgrid_test_dir):
+    return os.path.join(textgrid_test_dir, 'maus')
+
+
 @pytest.fixture(scope='session')
 def labbcat_test_dir(textgrid_test_dir):
     return os.path.join(textgrid_test_dir, 'labbcat')

diff --git a/tests/data/textgrids/maus/maus_test.TextGrid b/tests/data/textgrids/maus/maus_test.TextGrid
@@ -0,0 +1,158 @@
+File type = "ooTextFile"
+Object class = "TextGrid"
+
+xmin = 0 
+xmax = 5 
+tiers? <exists> 
+size = 3 
+item []: 
+    item [1]:
+        class = "IntervalTier" 
+        name = "ORT" 
+        xmin = 0 
+        xmax = 5 
+        intervals: size = 6 
+        intervals [1]:
+            xmin = 0 
+            xmax = 0.5359299313286737 
+            text = "ASPIRIN" 
+        intervals [2]:
+            xmin = 0.5359299313286737 
+            xmax = 0.7277498521584862 
+            text = "" 
+        intervals [3]:
+            xmin = 0.7277498521584862 
+            xmax = 1.5463606347329 
+            text = "PLANET" 
+        intervals [4]:
+            xmin = 1.5463606347329 
+            xmax = 2.9389864791485665 
+            text = "" 
+        intervals [5]:
+            xmin = 2.9389864791485665 
+            xmax = 4.29344727410947 
+            text = "JURASSIC" 
+        intervals [6]:
+            xmin = 4.29344727410947 
+            xmax = 5 
+            text = "" 
+    item [2]:
+        class = "IntervalTier" 
+        name = "KAN" 
+        xmin = 0 
+        xmax = 5 
+        intervals: size = 6 
+        intervals [1]:
+            xmin = 0 
+            xmax = 0.5359299313286737 
+            text = "AESPERIHN" 
+        intervals [2]:
+            xmin = 0.5359299313286737 
+            xmax = 0.7277498521584862 
+            text = "" 
+        intervals [3]:
+            xmin = 0.7277498521584862 
+            xmax = 1.5463606347329 
+            text = "PLAENEHT" 
+        intervals [4]:
+            xmin = 1.5463606347329 
+            xmax = 2.9389864791485665 
+            text = "" 
+        intervals [5]:
+            xmin = 2.9389864791485665 
+            xmax = 4.29344727410947 
+            text = "JHERARSIHK" 
+        intervals [6]:
+            xmin = 4.29344727410947 
+            xmax = 5 
+            text = "" 
+    item [3]:
+        class = "IntervalTier" 
+        name = "MAU" 
+        xmin = 0 
+        xmax = 5 
+        intervals: size = 21 
+        intervals [1]:
+            xmin = 0 
+            xmax = 0.08225269466768025 
+            text = "AE" 
+        intervals [2]:
+            xmin = 0.08225269466768025 
+            xmax = 0.14677708921589916 
+            text = "S" 
+        intervals [3]:
+            xmin = 0.14677708921589916 
+            xmax = 0.2024776349370112 
+            text = "P" 
+        intervals [4]:
+            xmin = 0.2024776349370112 
+            xmax = 0.2874071798979148 
+            text = "ER" 
+        intervals [5]:
+            xmin = 0.2874071798979148 
+            xmax = 0.4065291390638574 
+            text = "IH" 
+        intervals [6]:
+            xmin = 0.4065291390638574 
+            xmax = 0.5359299313286737 
+            text = "N" 
+        intervals [7]:
+            xmin = 0.5359299313286737 
+            xmax = 0.7277498521584862 
+            text = "<p:>" 
+        intervals [8]:
+            xmin = 0.7277498521584862 
+            xmax = 0.8227025752063423 
+            text = "P" 
+        intervals [9]:
+            xmin = 0.8227025752063423 
+            xmax = 0.9230738556146828 
+            text = "L" 
+        intervals [10]:
+            xmin = 0.9230738556146828 
+            xmax = 1.0471592297458732 
+            text = "AE" 
+        intervals [11]:
+            xmin = 1.0471592297458732 
+            xmax = 1.2057865782471762 
+            text = "N" 
+        intervals [12]:
+            xmin = 1.2057865782471762 
+            xmax = 1.3988082713401389 
+            text = "EH" 
+        intervals [13]:
+            xmin = 1.3988082713401389 
+            xmax = 1.5463606347329 
+            text = "T" 
+        intervals [14]:
+            xmin = 1.5463606347329 
+            xmax = 2.9389864791485665 
+            text = "<p:>" 
+        intervals [15]:
+            xmin = 2.9389864791485665 
+            xmax = 3.4038857361685495 
+            text = "JH" 
+        intervals [16]:
+            xmin = 3.4038857361685495 
+            xmax = 3.594149976503041 
+            text = "ER" 
+        intervals [17]:
+            xmin = 3.594149976503041 
+            xmax = 3.8572109696611645 
+            text = "AE" 
+        intervals [18]:
+            xmin = 3.8572109696611645 
+            xmax = 4.023007695334491 
+            text = "S" 
+        intervals [19]:
+            xmin = 4.023007695334491 
+            xmax = 4.186800389187662 
+            text = "IH" 
+        intervals [20]:
+            xmin = 4.186800389187662 
+            xmax = 4.29344727410947 
+            text = "K" 
+        intervals [21]:
+            xmin = 4.29344727410947 
+            xmax = 5 
+            text = "<p:>" 
diff --git a/tests/test_io_maus.py b/tests/test_io_maus.py
@@ -0,0 +1,62 @@
+import pytest
+import os
+
+from polyglotdb.io import inspect_maus
+
+from polyglotdb import CorpusContext
+
+from polyglotdb.exceptions import TextGridError, GraphQueryError, ParseError
+from textgrid import TextGrid, IntervalTier
+from polyglotdb.io.parsers.mfa import MfaParser
+
+def test_load_aus(maus_test_dir, graph_db):
+    with CorpusContext('test_mfa', **graph_db) as c:
+        c.reset()
+        testFilePath = os.path.join(maus_test_dir, "maus_test.TextGrid")
+        parser = inspect_maus(testFilePath)
+        print(parser.speaker_parser)
+        c.load(parser, testFilePath)
+        assert (c.hierarchy.has_type_property('word', 'transcription'))
+
+        q = c.query_graph(c.word).filter(c.word.label == 'JURASSIC')
+        print(q)
+        print(q.all())
+        q = q.filter(c.word.speaker.name == 'maus')
+        #print(c.word.speaker.name)
+        print(q.all())
+        q = q.order_by(c.word.begin)
+        print(q.all())
+        q = q.columns(c.word.label)
+        print(q.all())
+        results = q.all()
+        assert (len(results) == 1)
+
+        c.encode_pauses('<SIL>')
+
+        c.encode_utterances(min_pause_length=0)
+
+        q = c.query_graph(c.word).filter(c.word.label == 'PLANET')
+        q = q.filter(c.word.speaker.name == 'maus')
+        q = q.order_by(c.word.begin)
+        q = q.columns(c.word.label, c.word.following.label.column_name('following'))
+        results = q.all()
+        assert (len(results) == 1)
+        assert (results[0]['following'] == 'JURASSIC')
+
+        q = c.query_speakers().filter(c.speaker.name == 'maus')
+        q = q.columns(c.speaker.discourses.name.column_name('discourses'))
+
+        s = q.get()
+
+        assert (len(s['discourses']) == 1)
+        assert (s['discourses'] == ['maus_test'])
+
+
+def test_mismatch_parser(timit_test_dir, graph_db):
+    with CorpusContext('test_mismatch', **graph_db) as c:
+        c.reset()
+        parser = inspect_maus(timit_test_dir)
+        with pytest.raises(ParseError):
+            c.load(parser, timit_test_dir)
+
+