Skip to content

Commit

Permalink
More robust labbcat parser and added MAUS parser
Browse files Browse the repository at this point in the history
  • Loading branch information
mmcauliffe committed Mar 21, 2019
1 parent 8c87e9f commit 9417a4a
Show file tree
Hide file tree
Showing 12 changed files with 307 additions and 16 deletions.
4 changes: 2 additions & 2 deletions polyglotdb/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@

from .parsers import (BuckeyeParser, IlgParser, OrthographyTextParser,
TranscriptionTextParser, TextgridParser, TimitParser,
MfaParser, LabbCatParser, FaveParser, PartiturParser)
MfaParser, MausParser, LabbCatParser, FaveParser, PartiturParser)

from .inspect import (inspect_buckeye, inspect_orthography,
inspect_transcription, inspect_textgrid, inspect_timit,
inspect_ilg, inspect_mfa, inspect_labbcat,
inspect_fave, inspect_partitur)
inspect_fave, inspect_partitur, inspect_maus)

from .exporters import save_results

Expand Down
2 changes: 2 additions & 0 deletions polyglotdb/io/inspect/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

from .mfa import inspect_mfa

from .maus import inspect_maus

from .partitur import inspect_partitur

from .text_orthography import inspect_orthography
Expand Down
4 changes: 2 additions & 2 deletions polyglotdb/io/inspect/fave.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ def inspect_fave(path):
:class:`~polyglotdb.io.parsers.ilg.FaveParser`
Autodetected parser for the text file
"""
annotation_types = [OrthographyTier('word', 'word'),
OrthographyTier('phone', 'phone')]
annotation_types = [OrthographyTier(FaveParser.word_label, 'word'),
OrthographyTier(FaveParser.phone_label, 'phone')]

annotation_types[0].label = True
annotation_types[1].label = True
Expand Down
31 changes: 31 additions & 0 deletions polyglotdb/io/inspect/maus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from polyglotdb.structure import Hierarchy

from ..types.parsing import *

from ..parsers import MausParser


def inspect_maus(path):
    """
    Build a :class:`~polyglotdb.io.parsers.maus.MausParser` set up for
    parsing a TextGrid as a MAUS file

    The parser is assembled from fixed settings; ``path`` is accepted but
    is not read by this function.

    Parameters
    ----------
    path : str
        Full path to the TextGrid file

    Returns
    -------
    :class:`~polyglotdb.io.parsers.maus.MausParser`
        Parser configured with a word tier and a phone tier
    """
    # Tier for the word annotations, flagged as a label tier
    word_tier = OrthographyTier(MausParser.word_label, 'word')
    word_tier.label = True

    # Tier for the phone annotations, flagged as a label tier
    phone_tier = OrthographyTier(MausParser.phone_label, 'phone')
    phone_tier.label = True

    return MausParser([word_tier, phone_tier],
                      Hierarchy({'phone': 'word', 'word': None}))
12 changes: 6 additions & 6 deletions polyglotdb/io/inspect/mfa.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@

def inspect_mfa(path):
"""
Generate an :class:`~polyglotdb.io.parsers.ilg.MfaParser`
for a specified text file for parsing it as a Mfa file
Generate an :class:`~polyglotdb.io.parsers.mfa.MfaParser`
for a specified text file for parsing it as a MFA file
Parameters
----------
Expand All @@ -17,12 +17,12 @@ def inspect_mfa(path):
Returns
-------
:class:`~polyglotdb.io.parsers.ilg.Mfa`
Autodetected parser for Mfa
:class:`~polyglotdb.io.parsers.mfa.MfaParser`
Autodetected parser for MFA
"""

annotation_types = [OrthographyTier('words', 'word'),
OrthographyTier('phones', 'phone')]
annotation_types = [OrthographyTier(MfaParser.word_label, 'word'),
OrthographyTier(MfaParser.phone_label, 'phone')]

annotation_types[0].label = True
annotation_types[1].label = True
Expand Down
2 changes: 2 additions & 0 deletions polyglotdb/io/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

from .fave import FaveParser

from .maus import MausParser

from .text_orthography import OrthographyTextParser

from .text_transcription import TranscriptionTextParser
Expand Down
7 changes: 1 addition & 6 deletions polyglotdb/io/parsers/aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,8 @@ def parse_discourse(self, path, types_only=False):
:class:`~polyglotdb.io.discoursedata.DiscourseData`
Parsed data from the file
'''
tg = TextGrid(strict=False)
try:
tg.read(path)
except Exception as e:
print('There was an issue parsing {}:'.format(path))
raise

tg = self.load_textgrid(path)
multiple_speakers, is_valid = self._is_valid(tg)

if not is_valid:
Expand Down
26 changes: 26 additions & 0 deletions polyglotdb/io/parsers/labbcat.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from collections import Counter
from textgrid import TextGrid, IntervalTier
from .aligner import AlignerParser


Expand All @@ -6,3 +8,27 @@ class LabbCatParser(AlignerParser):
word_label = 'transcript'
phone_label = 'segment'
speaker_first = False

def load_textgrid(self, path):
    """
    Read a TextGrid and collapse tiers that share a name into one tier

    For every tier name that occurs more than once, only the tier with
    the most entries is kept (the earliest one on a tie).  Tiers with a
    unique name are left untouched and the original tier order is
    preserved.

    Parameters
    ----------
    path : str
        Path to the TextGrid file

    Returns
    -------
    TextGrid
        Loaded TextGrid with at most one tier per tier name

    Raises
    ------
    Exception
        Any error raised while loading is re-raised unchanged, after the
        path of the offending file has been printed
    """
    tg = TextGrid(strict=False)
    try:
        tg.read(path)
        name_counts = Counter(t.name for t in tg.tiers)

        # Index of the tier to keep for each duplicated name
        keep_inds = {}
        for i, t in enumerate(tg.tiers):
            if name_counts[t.name] < 2:
                continue
            kept = keep_inds.get(t.name)
            # Seed with the first occurrence so that a name whose tiers
            # are all empty still keeps one of them; afterwards only a
            # strictly longer tier replaces the current choice.
            if kept is None or len(t) > len(tg.tiers[kept]):
                keep_inds[t.name] = i

        # Unique names are absent from keep_inds and therefore always kept
        tg.tiers = [t for i, t in enumerate(tg.tiers)
                    if keep_inds.get(t.name, i) == i]
        return tg
    except Exception:
        print('There was an issue parsing {}:'.format(path))
        raise
10 changes: 10 additions & 0 deletions polyglotdb/io/parsers/maus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from collections import Counter
from textgrid import TextGrid, IntervalTier
from .aligner import AlignerParser


class MausParser(AlignerParser):
    """
    Parser for MAUS TextGrids

    Only the parser name and the tier labels are set here; all parsing
    behaviour comes from :class:`AlignerParser`.
    """
    # Label of the tier used for phone annotations
    phone_label = 'mau'
    # Label of the tier used for word annotations
    word_label = 'ort'
    name = 'Maus'

5 changes: 5 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,11 @@ def mfa_test_dir(textgrid_test_dir):
return os.path.join(textgrid_test_dir, 'mfa')


@pytest.fixture(scope='session')
def maus_test_dir(textgrid_test_dir):
    """Path of the ``maus`` folder inside the TextGrid test directory"""
    maus_dir = os.path.join(textgrid_test_dir, 'maus')
    return maus_dir


@pytest.fixture(scope='session')
def labbcat_test_dir(textgrid_test_dir):
    """Path of the ``labbcat`` folder inside the TextGrid test directory"""
    labbcat_dir = os.path.join(textgrid_test_dir, 'labbcat')
    return labbcat_dir
Expand Down
158 changes: 158 additions & 0 deletions tests/data/textgrids/maus/maus_test.TextGrid
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
File type = "ooTextFile"
Object class = "TextGrid"

xmin = 0
xmax = 5
tiers? <exists>
size = 3
item []:
item [1]:
class = "IntervalTier"
name = "ORT"
xmin = 0
xmax = 5
intervals: size = 6
intervals [1]:
xmin = 0
xmax = 0.5359299313286737
text = "ASPIRIN"
intervals [2]:
xmin = 0.5359299313286737
xmax = 0.7277498521584862
text = ""
intervals [3]:
xmin = 0.7277498521584862
xmax = 1.5463606347329
text = "PLANET"
intervals [4]:
xmin = 1.5463606347329
xmax = 2.9389864791485665
text = ""
intervals [5]:
xmin = 2.9389864791485665
xmax = 4.29344727410947
text = "JURASSIC"
intervals [6]:
xmin = 4.29344727410947
xmax = 5
text = ""
item [2]:
class = "IntervalTier"
name = "KAN"
xmin = 0
xmax = 5
intervals: size = 6
intervals [1]:
xmin = 0
xmax = 0.5359299313286737
text = "AESPERIHN"
intervals [2]:
xmin = 0.5359299313286737
xmax = 0.7277498521584862
text = ""
intervals [3]:
xmin = 0.7277498521584862
xmax = 1.5463606347329
text = "PLAENEHT"
intervals [4]:
xmin = 1.5463606347329
xmax = 2.9389864791485665
text = ""
intervals [5]:
xmin = 2.9389864791485665
xmax = 4.29344727410947
text = "JHERARSIHK"
intervals [6]:
xmin = 4.29344727410947
xmax = 5
text = ""
item [3]:
class = "IntervalTier"
name = "MAU"
xmin = 0
xmax = 5
intervals: size = 21
intervals [1]:
xmin = 0
xmax = 0.08225269466768025
text = "AE"
intervals [2]:
xmin = 0.08225269466768025
xmax = 0.14677708921589916
text = "S"
intervals [3]:
xmin = 0.14677708921589916
xmax = 0.2024776349370112
text = "P"
intervals [4]:
xmin = 0.2024776349370112
xmax = 0.2874071798979148
text = "ER"
intervals [5]:
xmin = 0.2874071798979148
xmax = 0.4065291390638574
text = "IH"
intervals [6]:
xmin = 0.4065291390638574
xmax = 0.5359299313286737
text = "N"
intervals [7]:
xmin = 0.5359299313286737
xmax = 0.7277498521584862
text = "<p:>"
intervals [8]:
xmin = 0.7277498521584862
xmax = 0.8227025752063423
text = "P"
intervals [9]:
xmin = 0.8227025752063423
xmax = 0.9230738556146828
text = "L"
intervals [10]:
xmin = 0.9230738556146828
xmax = 1.0471592297458732
text = "AE"
intervals [11]:
xmin = 1.0471592297458732
xmax = 1.2057865782471762
text = "N"
intervals [12]:
xmin = 1.2057865782471762
xmax = 1.3988082713401389
text = "EH"
intervals [13]:
xmin = 1.3988082713401389
xmax = 1.5463606347329
text = "T"
intervals [14]:
xmin = 1.5463606347329
xmax = 2.9389864791485665
text = "<p:>"
intervals [15]:
xmin = 2.9389864791485665
xmax = 3.4038857361685495
text = "JH"
intervals [16]:
xmin = 3.4038857361685495
xmax = 3.594149976503041
text = "ER"
intervals [17]:
xmin = 3.594149976503041
xmax = 3.8572109696611645
text = "AE"
intervals [18]:
xmin = 3.8572109696611645
xmax = 4.023007695334491
text = "S"
intervals [19]:
xmin = 4.023007695334491
xmax = 4.186800389187662
text = "IH"
intervals [20]:
xmin = 4.186800389187662
xmax = 4.29344727410947
text = "K"
intervals [21]:
xmin = 4.29344727410947
xmax = 5
text = "<p:>"
62 changes: 62 additions & 0 deletions tests/test_io_maus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import pytest
import os

from polyglotdb.io import inspect_maus

from polyglotdb import CorpusContext

from polyglotdb.exceptions import TextGridError, GraphQueryError, ParseError
from textgrid import TextGrid, IntervalTier
from polyglotdb.io.parsers.mfa import MfaParser

def test_load_aus(maus_test_dir, graph_db):
    """
    Load the MAUS test TextGrid and check word, following-word and
    speaker queries against the resulting corpus
    """
    with CorpusContext('test_mfa', **graph_db) as corpus:
        corpus.reset()
        textgrid_path = os.path.join(maus_test_dir, "maus_test.TextGrid")
        maus_parser = inspect_maus(textgrid_path)
        print(maus_parser.speaker_parser)
        corpus.load(maus_parser, textgrid_path)
        assert corpus.hierarchy.has_type_property('word', 'transcription')

        # Narrow the query one step at a time, printing intermediate results
        word_query = corpus.query_graph(corpus.word).filter(corpus.word.label == 'JURASSIC')
        print(word_query)
        print(word_query.all())
        word_query = word_query.filter(corpus.word.speaker.name == 'maus')
        print(word_query.all())
        word_query = word_query.order_by(corpus.word.begin)
        print(word_query.all())
        word_query = word_query.columns(corpus.word.label)
        print(word_query.all())
        results = word_query.all()
        assert len(results) == 1

        corpus.encode_pauses('<SIL>')

        corpus.encode_utterances(min_pause_length=0)

        # The word following 'PLANET' is expected to be 'JURASSIC'
        following_query = (
            corpus.query_graph(corpus.word)
            .filter(corpus.word.label == 'PLANET')
            .filter(corpus.word.speaker.name == 'maus')
            .order_by(corpus.word.begin)
            .columns(corpus.word.label,
                     corpus.word.following.label.column_name('following'))
        )
        results = following_query.all()
        assert len(results) == 1
        assert results[0]['following'] == 'JURASSIC'

        # The 'maus' speaker should be tied to exactly the loaded discourse
        speaker_query = (
            corpus.query_speakers()
            .filter(corpus.speaker.name == 'maus')
            .columns(corpus.speaker.discourses.name.column_name('discourses'))
        )

        speaker = speaker_query.get()

        assert len(speaker['discourses']) == 1
        assert speaker['discourses'] == ['maus_test']


def test_mismatch_parser(timit_test_dir, graph_db):
    """Loading the TIMIT test directory with the MAUS parser must raise ParseError"""
    with CorpusContext('test_mismatch', **graph_db) as corpus:
        corpus.reset()
        maus_parser = inspect_maus(timit_test_dir)
        with pytest.raises(ParseError):
            corpus.load(maus_parser, timit_test_dir)


0 comments on commit 9417a4a

Please sign in to comment.