Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 72 #73

Merged
merged 5 commits into from
Jun 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 0 additions & 28 deletions tools/fst/BuildTestTransducers.sh

This file was deleted.

207 changes: 207 additions & 0 deletions tools/fst/ExtractWordLemmaPairs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-

# Author: Leonel Figueiredo de Alencar
# leonel.de.alencar@ufc.br
# Date: April 20, 2018, updated February 18, 2020

"""This module is the first component of the architecture of a generator of Portuguese diminutives. It extracts possible diminutive formation bases from existing nouns and adjectives encoded in MorphoBr's format, as described in the following paper:

ALENCAR, Leonel Figueiredo de; CUCONATO , Bruno; RADEMAKER, Alexandre. MorphoBr: an open source large-coverage full-form lexicon for morphological analysis of Portuguese. Texto Livre: Linguagem e Tecnologia, Belo Horizonte, v. 11, n. 3, p. 1-25, set.- dez. 2018.
ISSN 1983-3652
DOI: 10.17851/1983-3652.11.3.1-25
http://www.periodicos.letras.ufmg.br/index.php/textolivre/article/view/14294.


Implausible bases are filtered out, see details below. The extracted bases are converted to spaced-text format and written to different files, according to the classification expected by the finite-state grammar in the morphotactic-grammar.lexc file. Some examples may help clarify the pipeline:

Input in MorphoBr's format:

agulhão agulha+N+AUG+M+SG
agulhões agulha+N+AUG+M+PL
agulhona agulha+N+AUG+F+SG
agulhonas agulha+N+AUG+F+PL


Output generated by this module (written to different files):

a g u l h a +N +AUG
a g u l h ã o

a g u l h a +N +AUG
a g u l h õ e s

a g u l h a +N +AUG
a g u l h o n a

a g u l h a +N +AUG
a g u l h o n a s


"""
import os, sys, re

# Tags that disqualify an entry: exclude_tag() reports True for any entry
# containing one of these substrings, and ignore_entry() then rejects it.
EXCLUDE_TAGS=["+DIM","+SUPER"]
# Suffix appended to the name of every output file opened below.
EXTENSION=".stxt"

"""Regex pattern matching itens that can not function as bases for
morphological derivations. This includes one or more consonants before a space
at the beginning of a line, for example:
b b+N+M+SG
c c+N+M+SG
d d+N+M+SG

These items are in fact abbreviations. As such, they cannot feed diminutive formation,
e.g. *bzinho 'little b' is ungrammatical (the correct form is 'bezinho', from 'bê', the name of
letter b).
The regex pattern also matches abbreviations such as 'ha' (for hectare) and chemical symbols ('Ba', 'Ca', etc.).
"""
# Character class of the consonant letters (lower case; ABB adds the
# case-insensitive flag).
CONS="[bcdfghjklmnpqrstvwxyz]"
# Matches either a run of one or more consonants, or a single consonant
# followed by a, e or o, immediately followed by a whitespace character.
# exclude_abbr() applies it with .match(), i.e. anchored at the start of
# the entry, so it tests the word form (the first field of the entry).
ABB=re.compile(r"(?i)(%s{1,}|%s[aeo])\s" % (CONS,CONS))

# Output files, opened for writing in the current working directory as soon
# as the module is loaded and closed at the end of main(). write_entries()
# picks the file from the entry's class and its gender/number tags.

# Entries whose tag string contains "+AUG".
aug_m_sg = open("aug_m_sg%s" % EXTENSION,"w")
aug_m_pl = open("aug_m_pl%s" % EXTENSION,"w")
aug_f_sg = open("aug_f_sg%s" % EXTENSION,"w")
aug_f_pl = open("aug_f_pl%s" % EXTENSION,"w")

# Entries whose word form equals the lemma and ends in "s" (WordLemmaInS).
wdlm_in_s_m_sg = open("wdlm_in_s_m_sg%s" % EXTENSION,"w")
wdlm_in_s_m_pl = open("wdlm_in_s_m_pl%s" % EXTENSION,"w")
wdlm_in_s_f_sg = open("wdlm_in_s_f_sg%s" % EXTENSION,"w")
wdlm_in_s_f_pl = open("wdlm_in_s_f_pl%s" % EXTENSION,"w")

# Entries accepted by NonCanonGendMarker: masculines in -a / feminines in -o.
masc_in_a_sg = open("masc_in_a_sg%s" % EXTENSION,"w")
fem_in_o_sg = open("fem_in_o_sg%s" % EXTENSION,"w")
masc_in_a_pl = open("masc_in_a_pl%s" % EXTENSION,"w")
fem_in_o_pl = open("fem_in_o_pl%s" % EXTENSION,"w")

# Every remaining entry.
other_m_sg = open("other_m_sg%s" % EXTENSION,"w")
other_m_pl = open("other_m_pl%s" % EXTENSION,"w")
other_f_sg = open("other_f_sg%s" % EXTENSION,"w")
other_f_pl = open("other_f_pl%s" % EXTENSION,"w")

def extract_entries(infile):
    """Read a lexicon file and return the entries that should be processed.

    Every line is stripped of surrounding whitespace and kept only when
    ignore_entry() returns True for it. Despite its name, that helper
    returns True for entries that must be *kept* (non-empty, without an
    excluded tag, not an abbreviation).

    Args:
        infile: path of a UTF-8 encoded file holding one entry per line.

    Returns:
        A list of decoded (unicode) strings, one per retained entry, in
        file order.
    """
    # Local import: the module header only imports os, sys and re.
    import io

    # io.open decodes UTF-8 while reading on both Python 2 and Python 3 and
    # translates all newline conventions by default (what "rU" used to do).
    # The with-block closes the handle, which the former one-liner never did.
    with io.open(infile, encoding="utf-8") as handle:
        stripped_lines = (line.strip() for line in handle)
        return [entry for entry in stripped_lines if ignore_entry(entry)]

def split_entry(entry):
    """Split an entry at every run of whitespace and return the pieces.

    For a well-formed entry this yields two fields: the word form and its
    analysis (lemma plus tags).
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.split(entry)

def exclude_abbr(entry):
    """Return True when the entry starts with an abbreviation-like word.

    The decision is made by the module-level ABB pattern, matched at the
    beginning of the entry.
    """
    return ABB.match(entry) is not None

def exclude_tag(entry):
    """Return True when the entry contains any of the EXCLUDE_TAGS."""
    return any(tag in entry for tag in EXCLUDE_TAGS)

def ignore_entry(entry):
    """Tell whether an entry should be kept for processing.

    NOTE: the name is misleading. The function returns True for entries
    that are to be *kept* and False for those to be dropped, which is how
    extract_entries() uses it as a filter.

    An entry is dropped when it is the empty string, carries an excluded
    tag (exclude_tag) or looks like an abbreviation (exclude_abbr).
    """
    if entry == "":
        return False
    if exclude_tag(entry):
        return False
    return not exclude_abbr(entry)

def space(word):
    """Return the characters of word separated by single blanks.

    For instance, "agulha" becomes "a g u l h a"; the empty string stays
    empty.
    """
    # str.join iterates over the characters itself, no list copy needed.
    return " ".join(word)

def convert_entry(word, lemma, tags):
    """Build the two-line spaced-text record for one word/lemma pair.

    Args:
        word: inflected word form (second output line).
        lemma: lemma of the word (first output line, before the tags).
        tags: iterable of tag names without the leading "+". Each item
            becomes one "+TAG" symbol; note that a plain string is iterated
            character by character.

    Returns:
        The spaced lemma followed by the tag symbols, a newline, and the
        spaced word form (no trailing newline).
    """
    spaced_lemma = space(lemma)
    tag_symbols = "+%s" % " +".join(tags)
    spaced_word = space(word)
    return "%s %s\n%s" % (spaced_lemma, tag_symbols, spaced_word)

def parse_entry(entry):
    """Decompose an entry into word form, lemma and tag string.

    The entry is split on whitespace into word and analysis, and the
    analysis is cut at its first "+" into the lemma and the remaining tags
    (e.g. "N+AUG+M+SG", still joined by "+").

    Returns:
        The tuple (word, lemma, tags).

    Raises:
        ValueError: if the entry does not consist of exactly two
            whitespace-separated fields, or the analysis contains no "+".
    """
    fields = split_entry(entry)
    word, analysis = fields
    lemma, tags = re.split(r"\+", analysis, maxsplit=1)
    return word, lemma, tags

def WordLemmaInS(word, lemma):
    """Return True when word and lemma are identical and end in "s".

    Once the two strings are equal, checking the ending of one of them
    covers the other as well.
    """
    return word == lemma and word.endswith("s")


def NonCanonGendMarker(word, tags):
    """Return True when the word ending contradicts its grammatical gender.

    That is: masculine forms ending in -a / -as and feminine forms ending
    in -o / -os. For hyphenated plurals the bare endings -a / -o are
    accepted too (N-N compounds like 'aços-liga' or 'amostras-tipo', whose
    last member stays in the singular).

    Args:
        word: the inflected word form.
        tags: the "+"-joined tag string of the entry, e.g. "N+M+SG".
    """
    hyphenated = "-" in word
    if "+M+SG" in tags and word.endswith("a"):
        return True
    if "+F+SG" in tags and word.endswith("o"):
        return True
    if "+M+PL" in tags:
        if word.endswith("as") or (hyphenated and word.endswith("a")):
            return True
    if "+F+PL" in tags:
        if word.endswith("os") or (hyphenated and word.endswith("o")):
            return True
    return False

def _write_record(stxt, tags, routes, fallback=None):
    """Write one record to the first routed file whose tag pair is in tags.

    routes is a sequence of (tag_pair, file) tuples tested in order; when
    none matches, the record goes to fallback, or is dropped if fallback
    is None.
    """
    for tag_pair, handle in routes:
        if tag_pair in tags:
            handle.write("%s\n\n" % stxt)
            return
    if fallback is not None:
        fallback.write("%s\n\n" % stxt)


def write_entries(entries):
    """Convert entries to spaced-text records and write them to the
    module-level output files.

    Each entry is classified as augmentative ("+AUG" in its tags), as a
    word identical to its lemma and ending in "s", as a word with a
    non-canonical gender marker, or as "other"; within the class the file
    is chosen by the gender/number tags.

    Args:
        entries: iterable of entry strings as returned by
            extract_entries().

    Raises:
        ValueError: propagated from parse_entry() for malformed entries.
    """
    for entry in entries:
        word, lemma, tags = parse_entry(entry)
        tag_names = tags.split("+")

        if "+AUG" in tags:
            # Keep the first two tags (category and, in the expected
            # input, AUG).
            kept_tags = tag_names[:2]
            routes = (("+M+SG", aug_m_sg),
                      ("+M+PL", aug_m_pl),
                      ("+F+SG", aug_f_sg))
            fallback = aug_f_pl
        elif WordLemmaInS(word, lemma):
            # TODO: use tag_names[:-2], excluding gender and number tags
            # but including other tags besides the category tag (this may
            # be useful in the future).
            kept_tags = tag_names[:1]
            routes = (("+M+SG", wdlm_in_s_m_sg),
                      ("+M+PL", wdlm_in_s_m_pl),
                      ("+F+SG", wdlm_in_s_f_sg))
            fallback = wdlm_in_s_f_pl
        elif NonCanonGendMarker(word, tags):
            # TODO: see the comment on the previous branch.
            # NOTE(review): plural forms were once discarded here; the
            # author's note of 23/01/2020 mentions incorrect plurals of
            # compounds like 'cabeça-chata' in this connection. Confirm
            # which variant produced them.
            kept_tags = tag_names[:1]
            routes = (("+M+SG", masc_in_a_sg),
                      ("+F+SG", fem_in_o_sg),
                      ("+F+PL", fem_in_o_pl),
                      ("+M+PL", masc_in_a_pl))
            fallback = None
        else:
            # TODO: tag_names[:-2] (see above).
            kept_tags = tag_names[:1]
            routes = (("+M+SG", other_m_sg),
                      ("+M+PL", other_m_pl),
                      ("+F+SG", other_f_sg))
            fallback = other_f_pl

        # kept_tags is a list of whole tag names. The former tags[0] handed
        # convert_entry() the first *character* of the tag string, which is
        # only right while every category tag is a single letter.
        stxt = convert_entry(word, lemma, kept_tags)
        # Python 2: encode unicode for the byte-oriented output files.
        # Python 3: the files are text files and take str directly.
        if not isinstance(stxt, str):
            stxt = stxt.encode("utf-8")
        # NOTE(review): in the three classes with a fallback, an entry
        # without any recognised gender/number pair also ends up in the
        # *_f_pl file, as before. Confirm that this is intended.
        _write_record(stxt, tags, routes, fallback)

def main():
    """Process every file named on the command line, then close the
    output files.

    The output files are closed (and thereby flushed) even if reading or
    converting an input file raises; the exception still propagates.
    """
    output_files = (
        aug_m_sg, aug_m_pl, aug_f_sg, aug_f_pl,
        wdlm_in_s_m_sg, wdlm_in_s_m_pl, wdlm_in_s_f_sg, wdlm_in_s_f_pl,
        masc_in_a_sg, fem_in_o_sg, masc_in_a_pl, fem_in_o_pl,
        other_m_sg, other_m_pl, other_f_sg, other_f_pl,
    )
    try:
        for infile in sys.argv[1:]:
            entries = extract_entries(infile)
            write_entries(entries)
    finally:
        for handle in output_files:
            handle.close()


if __name__ == '__main__':
    main()
55 changes: 50 additions & 5 deletions tools/fst/alternation-rules.xfst
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Author: Leonel F. de Alencar, Federal University of Ceará
# Date: April 16, 2018
# Author: Leonel F. de Alencar, leonel.de.alencar@ufc.br, Federal University of Ceará
# Date: April 27, 2018, bug corrections February 17, 2020

# Implementation of diminutive formation in Portuguese in the paradigm
# of finite-state morphology (Beesley & Karttunen 2003)
Expand All @@ -15,7 +15,7 @@
# processes in Portuguese. The individual transducers are composed
# into a single transducer encoding all alternation rules.

# Defining a marker for words with stemms ending in s,
# Defining a marker for words with stems ending in s,
# e.g. "lápis", "burguês", etc. In these words,
# z of -zinho suffix is deleted after a stem's s,
# e. g. "lapisinho", "burguesinhos". In other cases,
Expand All @@ -30,6 +30,31 @@ define StemmS %$;
# delete this marker
define DelStemmS StemmS -> 0 ;

# right context defining a non-final hyphen-separated compound member
define Hyph [$"-"] ;

# protect accents in non-final hyphen-separated compound members from being removed by Unaccent rule
define Protect [
[á -> A§ || _ Hyph ]
.o. [é -> E§ || _ Hyph ]
.o. [ê -> E¢ || _ Hyph]
.o. [ó -> O§ || _ Hyph]
.o. [ô -> O¢ || _ Hyph]
.o. [í -> I§ || _ Hyph]
.o. [ú -> U§ || _ Hyph]
.o. [â -> A¢ || _ Hyph]
];

# convert protected letters back into accented letters
# Every rule maps a placeholder produced by Protect back to its accented
# letter. The first rule used to be written the wrong way round
# (á -> A§), so A§ was never restored to á.
define Reconv [[ A§ -> á ]
.o. [ E§ -> é ]
.o. [ E¢ -> ê ]
.o. [ O§ -> ó ]
.o. [ O¢ -> ô ]
.o. [ I§ -> í ]
.o. [ U§ -> ú ]
.o. [ A¢ -> â ]];

# anterior vowels
define AntVow [ e | i ] ;

Expand All @@ -52,6 +77,15 @@ define PhonC [c -> %[ s %] || _ AntVow MorphSep ] ;
# convert back phone [s] to letter c
define OrthC %[ s %] -> c ;


# convert letter g to phone [Z] (SAMPA code for the voiced
# postalveolar fricative [ʒ] in IPA) to prevent rule ChangeG
# from applying in cases like herege^inha (diminutive of herege)
define PhonG [g -> %[ Z %] || _ AntVow MorphSep ] ;

# convert back phone [Z] to letter g
define OrthG %[ Z %] -> g ;

# delete ç before morpheme separator and anterior vowel
define DeleteCedilla [ ç -> c || _ MorphSep AntVow ];

Expand Down Expand Up @@ -83,7 +117,10 @@ define OptDelEStemZ e (->) 0 || [z | s] _ s MorphSep z ;
# words with the stem ending in r,
# e.g. flores^zinhas (diminutive of "flor" 'flower' in plural)
# flores^zinhas => flors^zinhas
define OptDelEStemR e (->) 0 || r _ s MorphSep z ;
define OptDelEStemR e (->) 0 || Vow r _ s MorphSep z ;

# TODO: abdômen => abdômenes => abdomenezinhos
# => abdomenzinhos

# composing the two previous rules in one single FST
define OptDelE OptDelEStemZ .o. OptDelEStemR ;
Expand Down Expand Up @@ -113,24 +150,32 @@ define Unaccent [[á -> a] .o. [é -> e] .o. [ê -> e] .o. [ó -> o]
define AltRules NasalBilabAssim .o.
PhonC
.o.
PhonG
.o.
ThemVowDel
.o.
ChangeC
.o.
OrthC
.o.
ChangeG
ChangeG
.o.
OrthG
.o.
OptDelE
.o.
PluralSDeletion
.o.
SuffZDeletion
.o.
Protect
.o.
IDeletion
.o.
Unaccent
.o.
Reconv
.o.
DeleteCedilla
.o.
DelStemmS
Expand Down
Loading