In [1]:
# !git clone https://github.com/fadhleryani/malti_arabi_fst.git
# # !git pull

# %pip install pynini
# %pip install pyfoma

In [2]:
# %cd malti_arabi_fst

In [1]:
import pynini as pn
import kenlm
from itertools import product


In [2]:
malti2arabi = pn.string_file('malti2arabi.map')


malti_sigma = pn.project(malti2arabi,'input')
arabi_sigma = pn.project(malti2arabi,'output') 


sigma_input = pn.union(malti_sigma ).closure()

transcriber = pn.union(malti2arabi ).closure()



dediac_cross = pn.string_file('dediac.map')

diacs = 'ًٌٍَُِّْ'
dediac = pn.cdrewrite(dediac_cross,'','',pn.union(arabi_sigma,*diacs).closure())

words = pn.string_file('tn-maghreb-words.txt') @ dediac


In [3]:
# test dediac
('ابَطا' @ dediac).string()

'ابطا'

In [110]:
def get_paths(fst,words_only=False):
    paths = list(fst.paths().items())
    if words_only:
        return [x[1] for x in paths]
    else:
        return paths

# get_paths('merħba' @ rwrs @ transcriber)



fst = ('<BOS>malta<EOS>' )  @ transcriber @ dediac @ words

get_paths(fst)

[('<BOS>malta<EOS>', 'ملت', <tropical Weight 0 at 0x10d311b30>),
 ('<BOS>malta<EOS>', 'ملط', <tropical Weight 0 at 0x10d311b10>),
 ('<BOS>malta<EOS>', 'مالت', <tropical Weight 0 at 0x10d311450>),
 ('<BOS>malta<EOS>', 'مالطا', <tropical Weight 0 at 0x10d311f10>),
 ('<BOS>malta<EOS>', 'مالطا', <tropical Weight 0 at 0x10d311670>)]

In [107]:
wordmodel = kenlm.Model('aggregated_country/lm/word/tn-maghreb.arpa')
charmodel = kenlm.Model('aggregated_country/lm/char/tn-maghreb.arpa')

Loading the LM will be faster if you build a binary file.
Reading /Users/f/ba3sasah/malti_arabi_fst/aggregated_country/lm/word/tn-maghreb.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Loading the LM will be faster if you build a binary file.
Reading /Users/f/ba3sasah/malti_arabi_fst/aggregated_country/lm/char/tn-maghreb.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************


In [138]:
def translit_fst(tok):
    fst = (f'<BOS>{tok}<EOS>')  @ transcriber @ dediac @ pn.cdrewrite(pn.cross('+ ',''),'','',arabi_sigma.closure()) 
    return fst

def rank_backoff(options,cutoff,words_only=False):
    sorted_options = sorted(list({(x,charmodel.score(' '.join(x))) for x in options}),key=lambda x: -x[1])[:cutoff]
    if words_only:
        return [x[0] for x in sorted_options]
    else:
        return sorted_options


    
def translit_sentence(sentence,cutoff=3): #select on merged but return unmerged
    lattice = []
    for tok in sentence.split():
        options_fst = translit_fst(tok) @ words
        if not get_paths(options_fst): # backoff should be character level ranked before it is limited
            options_fst = translit_fst(tok)            
            options = get_paths(options_fst,words_only=True)
            options = rank_backoff(options,cutoff,words_only=True)
            
            if cutoff==0:
                options = ['']
            elif cutoff <0:
                raise Exception('limit_backoff_options must be larger than zero')
        else:
            options = get_paths(options_fst,words_only=True)
        lattice.append(options)
    # return lattice
    return list({' '.join(x) for x in list(product(*lattice))})

rank_backoff(get_paths(translit_fst( 'merħeba'),True),cutoff=5),rank_backoff(get_paths(translit_fst( 'uffiċjalment'),True),cutoff=5)
# translit_sentence('uffiċjalment',cutoff=5)

([('مرحبا', -4.798305034637451),
  ('مرحب', -6.210062026977539),
  ('مرحبه', -6.854912281036377),
  ('مرحبة', -6.998781681060791),
  ('مرحبى', -8.243759155273438)],
 [('فتشيلمانت', -17.73682403564453),
  ('فتشيلمنت', -18.161087036132812),
  ('فتشيالمنت', -18.192659378051758),
  ('وفيتشيلمانت', -18.739917755126953),
  ('افتشيلمانت', -18.87792205810547)])

In [92]:
fst = (f'<BOS>feyn<EOS>')  @ transcriber  @ pn.cdrewrite(pn.cross('+ ',''),'','',arabi_sigma.closure()) @ dediac
get_paths(fst)

[('<BOS>feyn<EOS>', 'فين', <tropical Weight 0 at 0x10d3228b0>),
 ('<BOS>feyn<EOS>', 'فاين', <tropical Weight 0 at 0x10d322c70>)]

In [106]:
def score_generated_sentences(sentences):
    return sorted([(x,model.score(x)) for x in sentences],key=lambda y: -y[1])


score_generated_sentences(translit_sentence('malta magħrufa uffiċjalment bħala',limit_backoff_options=1))


[('مالطا معروف', -11.395761489868164),
 ('مالطا معروفة', -11.442479133605957),
 ('مالطا معرفة', -11.834766387939453),
 ('ملت معروف', -11.97829818725586),
 ('مالت معروف', -11.97829818725586),
 ('ملت معروفة', -12.025016784667969),
 ('مالت معروفة', -12.025016784667969),
 ('ملط معروف', -12.207475662231445),
 ('ملط معروفة', -12.254193305969238),
 ('مالطا مغرفة', -12.271644592285156),
 ('مالت معرفة', -12.417303085327148),
 ('ملت معرفة', -12.417303085327148),
 ('مالطا معروفا', -12.61601734161377),
 ('ملط معرفة', -12.646480560302734),
 ('مالت مغرفة', -12.854182243347168),
 ('ملت مغرفة', -12.854182243347168),
 ('مالطا مغرفه', -12.993379592895508),
 ('ملط مغرفة', -13.083358764648438),
 ('مالت معروفا', -13.198554039001465),
 ('ملت معروفا', -13.198554039001465),
 ('مالطا معرفا', -13.222556114196777),
 ('مالطا ماعرف', -13.222556114196777),
 ('ملط معروفا', -13.42773151397705),
 ('ملت مغرفه', -13.57591724395752),
 ('مالت مغرفه', -13.57591724395752),
 ('مالت معرفا', -13.805093765258789),
 ('ملط مغرفه'

In [90]:
x = translit_fst('għ')
y = x
y,get_paths(y)

(<vector Fst at 0x10d7578b0>,
 [('<BOS>għ<EOS>', 'جح', <tropical Weight 0 at 0x10d322df0>),
  ('<BOS>għ<EOS>', 'جخ', <tropical Weight 0 at 0x10d322890>),
  ('<BOS>għ<EOS>', 'ع', <tropical Weight 0 at 0x10d3228f0>),
  ('<BOS>għ<EOS>', 'غ', <tropical Weight 0 at 0x10d322610>)])

In [99]:
get_paths(translit_fst('uffiċjalment'))

[('<BOS>uffiċjalment<EOS>', 'ففتشيلمنت', <tropical Weight 0 at 0x10d347590>),
 ('<BOS>uffiċjalment<EOS>', 'ففتشيلمنط', <tropical Weight 0 at 0x10d347870>),
 ('<BOS>uffiċjalment<EOS>', 'ففتشيلمانت', <tropical Weight 0 at 0x10d321510>),
 ('<BOS>uffiċjalment<EOS>', 'ففتشيلمانط', <tropical Weight 0 at 0x10d321f50>),
 ('<BOS>uffiċjalment<EOS>', 'ففتشيالمنت', <tropical Weight 0 at 0x10d321c50>),
 ('<BOS>uffiċjalment<EOS>', 'ففتشيالمنط', <tropical Weight 0 at 0x10d3218d0>),
 ('<BOS>uffiċjalment<EOS>', 'ففتشيالمانت', <tropical Weight 0 at 0x10d321fb0>),
 ('<BOS>uffiċjalment<EOS>', 'ففتشيالمانط', <tropical Weight 0 at 0x10d321690>),
 ('<BOS>uffiċjalment<EOS>', 'ففيتشيلمنت', <tropical Weight 0 at 0x10d3214b0>),
 ('<BOS>uffiċjalment<EOS>', 'ففيتشيلمنط', <tropical Weight 0 at 0x10d3210f0>),
 ('<BOS>uffiċjalment<EOS>', 'ففيتشيلمانت', <tropical Weight 0 at 0x10d321830>),
 ('<BOS>uffiċjalment<EOS>', 'ففيتشيلمانط', <tropical Weight 0 at 0x10d3215b0>),
 ('<BOS>uffiċjalment<EOS>', 'ففيتشيالمنت', <tropic