# Importing the lemmas

In [41]:
from pyfoma import *

In [42]:
# text file holding the Quechua verb lemmas
lemmas_path = 'quechua_lemmas.txt'

In [43]:
# Load the lemmas into a list, one lemma per line of the file.
# The encoding is passed explicitly because the lemmas contain non-ASCII
# letters (e.g. "ñ") and open()'s default encoding depends on the platform.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
# Blank lines are skipped so that no empty lemma ends up in the lexicon.
with open(lemmas_path, 'r', encoding='utf-8') as file:
    lemmas = [line.strip() for line in file if line.strip()]
lemmas

['achalachiy',
 'achalachiy',
 'achhiy',
 'allay',
 'allichiy',
 'allpachay',
 'anchaykachay',
 'antachay',
 'aqtuy',
 'aquchay',
 'arinsay',
 'arphayay',
 'ashiy',
 'ashiy',
 'aspiy',
 'aspiy',
 'astakuy',
 'awankay',
 'awannay',
 'away',
 'awsay',
 'awñiy',
 'aypuriy',
 'añaychay',
 'añiy',
 "ch'antay",
 "ch'aphranay",
 "ch'aqchuy",
 "ch'illuchay",
 "ch'ipchiy",
 "ch'ipuy",
 "ch'isiyay",
 "ch'isiyay",
 "ch'uhtanay",
 "ch'ullchuy",
 "ch'utiy",
 'chantay',
 'chapatiyay',
 'chapiy',
 'chimpay',
 'chinkay',
 'chinruykachiy',
 'chiqachakuy',
 'chiriyachiy',
 'halay',
 'halay',
 'hampiy',
 'hamurpay',
 'hamuy',
 'hanpuy',
 "hanuk'ay",
 'haraway',
 'harchiyay',
 "hat'ansillay",
 'hatunchay',
 'hawiy',
 'hayratachiy',
 "hiq'ipay",
 "huch'uiyay",
 'huhllay',
 'huhyay',
 "hunt'ay",
 'hurquy',
 "husk'ay",
 'ichiy',
 'ichiy',
 'ismuy',
 'kañay',
 'kañiy',
 'kulliyay',
 'kulliyay',
 'kutipakuy',
 'kuyuy',
 "llik'iy",
 "llik'iy",
 'machay',
 'malliy',
 'maqakuy',
 'maqay',
 'maqay',
 "marq'ay",
 "

# All Suffixes and Circumfixes

In [44]:
# For each tag set, I define a dictionary
#  with its suffixes or circumfixes

In [45]:
# Tag set -> suffix for every purely suffixal form.
# The pairs are listed in the same order as before because later cells
# derive lists from the keys of this dictionary.
quechua_suffixes = dict([
    # future
    ('V;FUT;1+EXCL;PL', 'saqku'),
    ('V;FUT;1+INCL;PL', 'sunchik'),
    ('V;FUT;1;SG', 'saq'),
    ('V;FUT;2;PL', 'nkichik'),
    ('V;FUT;2;SG', 'nki'),
    ('V;FUT;3;PL', 'nqaku'),
    ('V;FUT;3;SG', 'nqa'),
    # positive imperative
    ('V;IMP;POS;2;PL', 'ychik'),
    ('V;IMP;POS;2;SG', 'y'),
    ('V;IMP;POS;3;PL', 'chunku'),
    ('V;IMP;POS;3;SG', 'chun'),
    # non-finite forms
    ('V;NFIN', 'y'),
    ('V;NFIN;AGT', 'q'),
    # present
    ('V;PRS;1+EXCL;PL', 'yku'),
    ('V;PRS;1+INCL;PL', 'nchik'),
    ('V;PRS;1;SG', 'ni'),
    ('V;PRS;2;PL', 'nkichik'),
    ('V;PRS;2;SG', 'nki'),
    ('V;PRS;3;PL', 'nku'),
    ('V;PRS;3;SG', 'n'),
    # past, tagged FH
    ('V;PST;FH;1+EXCL;PL', 'rqaniku'),
    ('V;PST;FH;1+INCL;PL', 'rqanchik'),
    ('V;PST;FH;1;SG', 'rqani'),
    ('V;PST;FH;2;PL', 'rqankichik'),
    ('V;PST;FH;2;SG', 'rqanki'),
    ('V;PST;FH;3;PL', 'rqanku'),
    ('V;PST;FH;3;SG', 'rqan'),
    # past, tagged NFH
    ('V;PST;NFH;1+EXCL;PL', 'sqaniku'),
    ('V;PST;NFH;1+INCL;PL', 'sqanchik'),
    ('V;PST;NFH;1;SG', 'sqani'),
    ('V;PST;NFH;2;PL', 'sqankichik'),
    ('V;PST;NFH;2;SG', 'sqanki'),
    ('V;PST;NFH;3;PL', 'sqanku'),
    ('V;PST;NFH;3;SG', 'sqan'),
])

In [46]:
# Tag set -> (prefix, suffix) for the negative imperative forms,
# which all share the prefix "ama".
quechua_circumfixes = {
    tag: ('ama', ending)
    for tag, ending in (
        ('V;IMP;NEG;2;PL', 'ychikchu'),
        ('V;IMP;NEG;2;SG', 'ychu'),
        ('V;IMP;NEG;3;PL', 'chunkuchu'),
        ('V;IMP;NEG;3;SG', 'chunchu'),
    )
}

# Defining the grammar

## Basic lexicon

In [47]:
# I start constructing the lexicon for the FST

In [48]:
# wrap every tag set (suffix tags first, then circumfix tags)
# in square brackets and single quotes
tags = [f"'[{tag}]'" for tag in (*quechua_suffixes, *quechua_circumfixes)]
tags

["'[V;FUT;1+EXCL;PL]'",
 "'[V;FUT;1+INCL;PL]'",
 "'[V;FUT;1;SG]'",
 "'[V;FUT;2;PL]'",
 "'[V;FUT;2;SG]'",
 "'[V;FUT;3;PL]'",
 "'[V;FUT;3;SG]'",
 "'[V;IMP;POS;2;PL]'",
 "'[V;IMP;POS;2;SG]'",
 "'[V;IMP;POS;3;PL]'",
 "'[V;IMP;POS;3;SG]'",
 "'[V;NFIN]'",
 "'[V;NFIN;AGT]'",
 "'[V;PRS;1+EXCL;PL]'",
 "'[V;PRS;1+INCL;PL]'",
 "'[V;PRS;1;SG]'",
 "'[V;PRS;2;PL]'",
 "'[V;PRS;2;SG]'",
 "'[V;PRS;3;PL]'",
 "'[V;PRS;3;SG]'",
 "'[V;PST;FH;1+EXCL;PL]'",
 "'[V;PST;FH;1+INCL;PL]'",
 "'[V;PST;FH;1;SG]'",
 "'[V;PST;FH;2;PL]'",
 "'[V;PST;FH;2;SG]'",
 "'[V;PST;FH;3;PL]'",
 "'[V;PST;FH;3;SG]'",
 "'[V;PST;NFH;1+EXCL;PL]'",
 "'[V;PST;NFH;1+INCL;PL]'",
 "'[V;PST;NFH;1;SG]'",
 "'[V;PST;NFH;2;PL]'",
 "'[V;PST;NFH;2;SG]'",
 "'[V;PST;NFH;3;PL]'",
 "'[V;PST;NFH;3;SG]'",
 "'[V;IMP;NEG;2;PL]'",
 "'[V;IMP;NEG;2;SG]'",
 "'[V;IMP;NEG;3;PL]'",
 "'[V;IMP;NEG;3;SG]'"]

In [49]:
# Grammar handed to FST.rlg with start symbol "S"; the same dictionary also
# stores every FST built in this notebook.
fsts = {
    # the [Prefix] symbol is placed in front of every lemma
    "S": [("'[Prefix]'", "TaggedVerb")],
    # each lemma continues to the tag state
    "TaggedVerb": [(lemma, "Tag") for lemma in lemmas],
    # one tag set closes the word ("#")
    "Tag": [(tag, "#") for tag in tags],
}

fsts['lexicon'] = FST.rlg(fsts, "S")


In [50]:
# Print the lexicon as a paradigm table; at this stage the last column still
# shows the tagged string "[Prefix]lemma[TAGSET]".
# NOTE(review): ".*" is presumably a pattern selecting every entry — confirm
# against the pyfoma Paradigm documentation.
print(Paradigm(fsts['lexicon'], ".*"))

achalachiy     [Prefix][V;FUT;1+EXCL;PL]      [Prefix]achalachiy[V;FUT;1+EXCL;PL]         
achalachiy     [Prefix][V;FUT;1+INCL;PL]      [Prefix]achalachiy[V;FUT;1+INCL;PL]         
achalachiy     [Prefix][V;FUT;1;SG]           [Prefix]achalachiy[V;FUT;1;SG]              
achalachiy     [Prefix][V;FUT;2;PL]           [Prefix]achalachiy[V;FUT;2;PL]              
achalachiy     [Prefix][V;FUT;2;SG]           [Prefix]achalachiy[V;FUT;2;SG]              
achalachiy     [Prefix][V;FUT;3;PL]           [Prefix]achalachiy[V;FUT;3;PL]              
achalachiy     [Prefix][V;FUT;3;SG]           [Prefix]achalachiy[V;FUT;3;SG]              
achalachiy     [Prefix][V;IMP;NEG;2;PL]       [Prefix]achalachiy[V;IMP;NEG;2;PL]          
achalachiy     [Prefix][V;IMP;NEG;2;SG]       [Prefix]achalachiy[V;IMP;NEG;2;SG]          
achalachiy     [Prefix][V;IMP;NEG;3;PL]       [Prefix]achalachiy[V;IMP;NEG;3;PL]          
achalachiy     [Prefix][V;IMP;NEG;3;SG]       [Prefix]achalachiy[V;IMP;NEG;3;SG]          

## Replacement rules for suffixes

In [51]:
# All the tag sets for suffixes are replaced
# I take advantage of the fact that all verbal lemmas
# end in -y, so I rewrite "-y[TAGSET]"

In [52]:
# One alternative per suffixal tag set: the lemma-final "y" together with the
# quoted tag symbol is mapped onto the corresponding suffix.
rewrite_suffix_rules = " | ".join(
    f"(y'[{tag}]'):({suffix})"
    for tag, suffix in quechua_suffixes.items()
)
rewrite_suffix_rules

"(y'[V;FUT;1+EXCL;PL]'):(saqku) | (y'[V;FUT;1+INCL;PL]'):(sunchik) | (y'[V;FUT;1;SG]'):(saq) | (y'[V;FUT;2;PL]'):(nkichik) | (y'[V;FUT;2;SG]'):(nki) | (y'[V;FUT;3;PL]'):(nqaku) | (y'[V;FUT;3;SG]'):(nqa) | (y'[V;IMP;POS;2;PL]'):(ychik) | (y'[V;IMP;POS;2;SG]'):(y) | (y'[V;IMP;POS;3;PL]'):(chunku) | (y'[V;IMP;POS;3;SG]'):(chun) | (y'[V;NFIN]'):(y) | (y'[V;NFIN;AGT]'):(q) | (y'[V;PRS;1+EXCL;PL]'):(yku) | (y'[V;PRS;1+INCL;PL]'):(nchik) | (y'[V;PRS;1;SG]'):(ni) | (y'[V;PRS;2;PL]'):(nkichik) | (y'[V;PRS;2;SG]'):(nki) | (y'[V;PRS;3;PL]'):(nku) | (y'[V;PRS;3;SG]'):(n) | (y'[V;PST;FH;1+EXCL;PL]'):(rqaniku) | (y'[V;PST;FH;1+INCL;PL]'):(rqanchik) | (y'[V;PST;FH;1;SG]'):(rqani) | (y'[V;PST;FH;2;PL]'):(rqankichik) | (y'[V;PST;FH;2;SG]'):(rqanki) | (y'[V;PST;FH;3;PL]'):(rqanku) | (y'[V;PST;FH;3;SG]'):(rqan) | (y'[V;PST;NFH;1+EXCL;PL]'):(sqaniku) | (y'[V;PST;NFH;1+INCL;PL]'):(sqanchik) | (y'[V;PST;NFH;1;SG]'):(sqani) | (y'[V;PST;NFH;2;PL]'):(sqankichik) | (y'[V;PST;NFH;2;SG]'):(sqanki) | (y'[V;PST;NFH

In [53]:
# build one FST from the alternation of suffix rewrite rules
fsts['suffix_rules'] = FST.re(f"$^rewrite({rewrite_suffix_rules})")

In [54]:
# Compose the lexicon FST with the suffix rewrite rules, so that
# "y[TAGSET]" is turned into the matching suffix for every suffixal tag set.
# NOTE(review): the second argument is presumably the dictionary in which the
# "$name" references are looked up — confirm against the pyfoma FST.re docs.
fsts['with_suffixes'] = FST.re("$lexicon @ $suffix_rules", fsts)

In [55]:
# Print the paradigm after the suffix rules; the "[Prefix]" symbol is still
# present and the circumfix tag sets are still unreplaced at this point.
print(Paradigm(fsts['with_suffixes'], ".*"))

achalachiy     [Prefix][V;FUT;1+EXCL;PL]      [Prefix]achalachisaqku                 
achalachiy     [Prefix][V;FUT;1+INCL;PL]      [Prefix]achalachisunchik               
achalachiy     [Prefix][V;FUT;1;SG]           [Prefix]achalachisaq                   
achalachiy     [Prefix][V;FUT;2;PL]           [Prefix]achalachinkichik               
achalachiy     [Prefix][V;FUT;2;SG]           [Prefix]achalachinki                   
achalachiy     [Prefix][V;FUT;3;PL]           [Prefix]achalachinqaku                 
achalachiy     [Prefix][V;FUT;3;SG]           [Prefix]achalachinqa                   
achalachiy     [Prefix][V;IMP;NEG;2;PL]       [Prefix]achalachiy[V;IMP;NEG;2;PL]     
achalachiy     [Prefix][V;IMP;NEG;2;SG]       [Prefix]achalachiy[V;IMP;NEG;2;SG]     
achalachiy     [Prefix][V;IMP;NEG;3;PL]       [Prefix]achalachiy[V;IMP;NEG;3;PL]     
achalachiy     [Prefix][V;IMP;NEG;3;SG]       [Prefix]achalachiy[V;IMP;NEG;3;SG]     
achalachiy     [Prefix][V;IMP;POS;2;PL]       [Prefix]

In [56]:
# Note that the tag sets corresponding to circumfixes
#  have NOT been replaced

## Replacement rules for circumfixes

In [57]:
# Characters of the Quechua alphabet as they occur in the lemmas.
# In this Python literal \' is simply an escaped apostrophe, so the regex
# handed to pyfoma is the class [ñ'a-z]: "ñ", the apostrophe and a-z.
fsts["A"] = FST.re("[ñ\'a-z]") 

In [58]:
# get the tag sets corresponding to circumfixes

In [59]:
# parenthesised alternation of the quoted circumfix tag symbols
circumfix_tags = "(" + "|".join(f"'[{tag}]'" for tag in quechua_circumfixes) + ")"
circumfix_tags

"('[V;IMP;NEG;2;PL]'|'[V;IMP;NEG;2;SG]'|'[V;IMP;NEG;3;PL]'|'[V;IMP;NEG;3;SG]')"

In [60]:
# When there is a tagset for circumfixes 
# at the end of the word, the prefix symbol 
# is replaced by "ama"

In [61]:
# "[Prefix]" becomes "ama" only when followed by alphabet characters ($A+)
# and then one of the circumfix tag sets
fsts['rewrite_prefix_symbol'] = FST.re(
    f"$^rewrite( ('[Prefix]'):(ama) / _ ( $A+ {circumfix_tags} ) )",
    fsts,
)

In [62]:
# As was done for the suffixes:
# the tag sets for circumfixes are replaced
# by their corresponding endings

In [63]:
# One alternative per circumfix tag set: the lemma-final "y" plus the quoted
# tag symbol is mapped onto the suffix half of the circumfix (the prefix half
# is not used here).
rewrite_circumfix_rules = " | ".join(
    f"(y'[{tag}]'):({ending})"
    for tag, (_, ending) in quechua_circumfixes.items()
)
rewrite_circumfix_rules

"(y'[V;IMP;NEG;2;PL]'):(ychikchu) | (y'[V;IMP;NEG;2;SG]'):(ychu) | (y'[V;IMP;NEG;3;PL]'):(chunkuchu) | (y'[V;IMP;NEG;3;SG]'):(chunchu)"

In [64]:
# build one FST from the alternation of circumfix-ending rewrite rules
fsts['circumfix_rules'] = FST.re(f"$^rewrite({rewrite_circumfix_rules})")

In [65]:
# Then, I erase all instances of the prefix symbol.
# Note that it was only left on those words
# that do NOT come from circumfix tag sets

In [66]:
# Delete the prefix symbol: every remaining "[Prefix]" is rewritten to the
# empty string.
fsts['delete_prefix_symbol'] = FST.re("$^rewrite('[Prefix]':'')")

In [67]:
# Compose the FSTs. The order matters: "[Prefix]" must become "ama" while the
# circumfix tag sets are still present (the rule's right context needs them),
# then the tag sets are replaced by their endings, and only afterwards are the
# leftover "[Prefix]" symbols deleted.
fsts['with_circumfixes'] = FST.re("$with_suffixes @ $rewrite_prefix_symbol @ $circumfix_rules @ $delete_prefix_symbol", fsts)

In [68]:
# Print the paradigm of the full pipeline; the last column now holds plain
# surface forms, with "ama" prefixed for the negative imperative tag sets.
print(Paradigm(fsts['with_circumfixes'], ".*"))

achalachiy     [Prefix][V;FUT;1+EXCL;PL]      achalachisaqku            
achalachiy     [Prefix][V;FUT;1+INCL;PL]      achalachisunchik          
achalachiy     [Prefix][V;FUT;1;SG]           achalachisaq              
achalachiy     [Prefix][V;FUT;2;PL]           achalachinkichik          
achalachiy     [Prefix][V;FUT;2;SG]           achalachinki              
achalachiy     [Prefix][V;FUT;3;PL]           achalachinqaku            
achalachiy     [Prefix][V;FUT;3;SG]           achalachinqa              
achalachiy     [Prefix][V;IMP;NEG;2;PL]       amaachalachiychikchu      
achalachiy     [Prefix][V;IMP;NEG;2;SG]       amaachalachiychu          
achalachiy     [Prefix][V;IMP;NEG;3;PL]       amaachalachichunkuchu     
achalachiy     [Prefix][V;IMP;NEG;3;SG]       amaachalachichunchu       
achalachiy     [Prefix][V;IMP;POS;2;PL]       achalachiychik            
achalachiy     [Prefix][V;IMP;POS;2;SG]       achalachiy                
achalachiy     [Prefix][V;IMP;POS;3;PL]       achal

## Final Grammar

In [69]:
# Finally, I obtain the grammar for quechua verbs

In [70]:
# Store the complete pipeline under its final name and keep a short alias.
# The regex is only a reference to $with_circumfixes with no other operator.
# NOTE(review): presumably equivalent to using fsts['with_circumfixes']
# directly, as the trailing comment suggests — confirm.
fsts['quechua_grammar'] =FST.re("$with_circumfixes ", fsts) # fsts['with_circumfixes']
quechua_fst = fsts['quechua_grammar']

In [71]:
# Analyze and generate some examples

In [72]:
# analysis direction: surface form -> tagged string(s) "[Prefix]lemma[TAGSET]"
list(quechua_fst.analyze("achalachichunku"))

['[Prefix]achalachiy[V;IMP;POS;3;PL]']

In [73]:
# generation direction: the input must start with the "[Prefix]" symbol,
# followed by "lemma[TAGSET]"
list(quechua_fst.generate('[Prefix]'+'achalachiy[V;IMP;POS;2;SG]'))

['achalachiy']

In [74]:
# same lemma with the positive third person singular imperative tag set
list(quechua_fst.generate('[Prefix]achalachiy[V;IMP;POS;3;SG]'))

['achalachichun']

# Evaluation

In [75]:
# I evaluate against the dataset from Unimorph

In [76]:
import pandas as pd

In [77]:
# Load the evaluation dataset. The cells below read two of its columns:
# 'root_and_tags' ("lemma[TAGSET]" strings) and 'inflection' (expected forms).
dataset_path = "quechua_dev_verbs.csv"
quechua_verbs_dataset = pd.read_csv(dataset_path)

In [78]:
# example entry: the lemma immediately followed by its bracketed tag set
quechua_verbs_dataset['root_and_tags'][0]

'achalachiy[V;IMP;POS;2;SG]'

In [79]:
#We need to do some preprocessing 
# to be able to feed the entries to the FST

In [80]:
# Surface form produced by the FST for every dataset entry.
# The lexical side of the grammar starts with the "[Prefix]" symbol, so it is
# prepended to each "lemma[TAGSET]" string before generation.
# Only the first generated form is kept. next(..., None) stores None when the
# FST produces nothing for an entry (e.g. a lemma missing from the lexicon)
# instead of raising StopIteration and aborting the whole cell; such entries
# then simply count as mismatches in the evaluation.
generated_by_fst = [
    next(quechua_fst.generate("[Prefix]" + root_and_tag), None)
    for root_and_tag in quechua_verbs_dataset['root_and_tags']
]

In [81]:
# example generated word (entry 10), to compare with the true inflection
generated_by_fst[10]

'arinsarqaniku'

In [82]:
# get all the true inflections from the dataset

In [83]:
# expected surface forms, in dataset order
quechua_verbs_true_inflections = quechua_verbs_dataset['inflection'].tolist()

In [84]:
# true inflection for the same entry (10) shown above for the FST output
quechua_verbs_true_inflections[10]

'arinsarqaniku'

In [85]:
# position of a negative imperative form (with the "ama" prefix) in the dataset
quechua_verbs_true_inflections.index("amakañiychu")

68

In [86]:
# Evaluate

In [87]:
# One boolean per dataset entry: does the FST output equal the true form?
# The loop names follow the zip order (true inflection first, FST output
# second); previously the two names were swapped.
evaluation = [
    true == generated
    for true, generated in zip(
        quechua_verbs_true_inflections,
        generated_by_fst,
    )
]

In [88]:
# count correctly generated words

In [89]:
# proportion of matching and non-matching entries
dict(
    Correct=evaluation.count(True) / len(evaluation),
    Incorrect=evaluation.count(False) / len(evaluation),
)

{'Correct': 0.9930555555555556, 'Incorrect': 0.006944444444444444}

In [90]:
# find the errors

In [91]:
# Mismatching entries as (true inflection, FST output) pairs.
# The tuple order is unchanged; only the loop names were corrected so that
# they match the zip order (they were swapped before, which made the pairs
# read as (generated, true)).
fails = [
    (true, generated)
    for true, generated in zip(
        quechua_verbs_true_inflections,
        generated_by_fst,
    )
    if true != generated
]
fails

[("wich'qan", "wichq'an")]

In [92]:
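# In this pair the first form (wich'qan) is the true inflection from the dataset and the second (wichq'an) is the FST output.
# This looks like the result of a phonological rule involving the distribution of ejective stop consonants.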