In [65]:
import pandas as pd
from camel_tools.utils.charmap import CharMapper
from camel_tools.utils.transliterate import Transliterator
import pynini as pn
from camel_tools.utils.charsets import SAFEBW_CHARSET
import re

### Convert between Arabic Script and Safebuckwalter

In [2]:
# Instantiate the builtin ar2safebw (Arabic script to Safe Buckwalter) and
# safebw2ar (Safe Buckwalter to Arabic script) CharMappers.
# NOTE: the functions defined later in this cell reuse the names `ar2safebw`
# and `safebw2ar`, so after the cell has run these names refer to the
# functions, not to the CharMapper objects.
ar2safebw = CharMapper.builtin_mapper('ar2safebw')
safebw2ar = CharMapper.builtin_mapper('safebw2ar')

# Instantiate Transliterator with the ar2safebw CharMapper with '@@IGNORE@@' marker (default)
ar2safebw_translit = Transliterator(ar2safebw)
def ar2safebw(ar):
    """Transliterate a string with the module-level ``ar2safebw_translit``.

    ``ar2safebw_translit`` wraps the builtin 'ar2safebw' CharMapper, i.e. the
    Arabic-script to Safe Buckwalter mapping.

    Note: defining this function rebinds the name ``ar2safebw``, which held
    the CharMapper until this point; the Transliterator has already been
    built from that mapper, so it is unaffected.

    Args:
        ar: The string to transliterate.

    Returns:
        The transliterated string. ``strip_markers`` is left at its default,
        so any ignore markers present in the input are not stripped.
    """
    # A single call is enough: the earlier version also computed a
    # strip_markers=True variant and threw the result away.
    return ar2safebw_translit.transliterate(ar)

# Build the Transliterator while `safebw2ar` still names the CharMapper;
# the function definition below rebinds that name.
safebw2ar_translit = Transliterator(safebw2ar)


def safebw2ar(safebw):
    """Transliterate a string with the module-level ``safebw2ar_translit``.

    ``safebw2ar_translit`` wraps the builtin 'safebw2ar' CharMapper, i.e. the
    Safe Buckwalter to Arabic-script mapping.

    Args:
        safebw: The string to transliterate.

    Returns:
        The transliterated string. ``strip_markers`` is left at its default,
        so any ignore markers present in the input are not stripped.
    """
    # A single call is enough: the earlier version also computed a
    # strip_markers=True variant and threw the result away.
    return safebw2ar_translit.transliterate(safebw)

### Load Malti to Arabic map file (and generate symbol tables)

In [69]:
def dict2symboltable(dictfile):
    """Build the input-side and output-side symbol tables for a mapping file.

    Each line of ``dictfile`` is split on tabs. The whitespace-separated
    symbols of the first column are added to the left table, those of the
    second column to the right table; any further columns are ignored.
    Both tables reserve id 0 for ``<eps>``.

    Args:
        dictfile: Path of the tab-separated mapping file.

    Returns:
        A ``(lefttable, righttable)`` pair of ``pn.SymbolTable`` objects.
    """
    lefttable = pn.SymbolTable()
    lefttable.add_symbol("<eps>", 0)
    righttable = pn.SymbolTable()
    righttable.add_symbol("<eps>", 0)

    # Read as UTF-8 explicitly so the symbols do not depend on the
    # platform's default encoding.
    with open(dictfile, 'r', encoding='utf-8') as d:
        for line in d:
            columns = line.strip().split('\t')
            # zip() pairs column 0 with the left table and column 1 with the
            # right table, and stops there if the line has extra columns.
            for table, column in zip((lefttable, righttable), columns):
                for symbol in column.split():
                    table.add_symbol(symbol)

    return lefttable, righttable
                    
# Symbol tables for the two sides of the mapping file: the left column feeds
# `malti_ortho`, the right column feeds `arabi_ortho`.
malti_ortho, arabi_ortho = dict2symboltable('malti2arabi.dict')

# Compile the same mapping file into a transducer whose input and output
# labels are drawn from those tables.
malti2arabi = pn.string_file(
    "malti2arabi.dict",
    input_token_type=malti_ortho,
    output_token_type=arabi_ortho,
).optimize()



### Generate an FST from an input string and get all possible output paths

In [81]:
def space_separate(string):
    """Return ``string`` with a space inserted after each character.

    Newline characters are copied through without a following space, and
    leading/trailing whitespace is stripped from the result.
    """
    pieces = []
    for ch in string:
        # Newlines are the one character left untouched.
        pieces.append(ch if ch == '\n' else ch + ' ')
    return ''.join(pieces).strip()

def create_fst(string,direction='malti2arabi'):
    """Compose an acceptor for ``string`` with the closure of the mapping FST.

    The string is first split into space-separated single-character tokens,
    turned into an acceptor over the source-side symbol table, and composed
    with the closure of ``malti2arabi`` (inverted for 'arabi2malti').

    Args:
        string: The word to transduce, one symbol per character.
        direction: 'malti2arabi' (default) or 'arabi2malti'.

    Returns:
        The optimized composed FST, with its input/output symbol tables set
        to the source-side/target-side tables for the chosen direction.

    Raises:
        ValueError: If ``direction`` is not one of the two supported values.
    """
    tokens = space_separate(string)

    # Fst.invert() and Fst.closure() rewrite the FST they are called on, so
    # work on a copy: otherwise every call would alter the shared
    # `malti2arabi` transducer (and an 'arabi2malti' call would leave it
    # inverted for whoever calls next).
    if direction == 'malti2arabi':
        source_syms, target_syms = malti_ortho, arabi_ortho
        mapping = malti2arabi.copy()
    elif direction == 'arabi2malti':
        source_syms, target_syms = arabi_ortho, malti_ortho
        mapping = malti2arabi.copy().invert()
    else:
        raise ValueError(
            "direction must be 'malti2arabi' or 'arabi2malti', got %r" % (direction,)
        )

    lattice = (pn.accep(tokens, token_type=source_syms) @ mapping.closure()).optimize()
    return lattice.set_input_symbols(source_syms).set_output_symbols(target_syms)

def get_paths(fst,target=None,direction='malti2arabi'):
    """List every path of ``fst`` and optionally check for a target output.

    Args:
        fst: The FST whose paths are enumerated (e.g. from ``create_fst``).
        target: Optional output string to look for. Spaces are removed from
            each path's output before comparing. If given and no path
            matches, 'TARGET NOT IN PATHS' is printed.
        direction: Direction the FST was built for, 'malti2arabi' (default)
            or 'arabi2malti'. It selects which symbol table decodes the
            input side and which decodes the output side.

    Returns:
        A list of the items yielded by ``fst.paths(...).items()``; the check
        above reads index 1 of each item as the output string.

    Raises:
        ValueError: If ``direction`` is not one of the two supported values.
    """
    if direction == 'malti2arabi':
        in_syms, out_syms = malti_ortho, arabi_ortho
    elif direction == 'arabi2malti':
        # The sides are swapped for an FST built in the reverse direction.
        in_syms, out_syms = arabi_ortho, malti_ortho
    else:
        raise ValueError(
            "direction must be 'malti2arabi' or 'arabi2malti', got %r" % (direction,)
        )

    path_items = list(
        fst.paths(input_token_type=in_syms, output_token_type=out_syms).items()
    )
    if target and not any(item[1].replace(' ', '') == target for item in path_items):
        print('target not in paths'.upper())
    return path_items


# Build an FST for 'katab' in each direction. Only the value of the last
# expression in a cell is displayed, so the result of the first call is
# discarded. Both calls go through the shared `malti2arabi` transducer, so
# their order is kept as is.
create_fst('katab',direction='arabi2malti')
create_fst('katab',direction='malti2arabi')
# get_paths(fst = create_fst('katab'),target ='katab')


TARGET NOT IN PATHS


[('d e j j e m', 'd A y A m', <tropical Weight 0 at 0x7ffc5c37d810>),
 ('d e j j e m', 'd A y i m', <tropical Weight 0 at 0x7ffc5c37dbb0>),
 ('d e j j e m', 'd A y a m', <tropical Weight 0 at 0x7ffc5c37d950>),
 ('d e j j e m', 'd i y A m', <tropical Weight 0 at 0x7ffc5c37d750>),
 ('d e j j e m', 'd i y i m', <tropical Weight 0 at 0x7ffc5c37db90>),
 ('d e j j e m', 'd i y a m', <tropical Weight 0 at 0x7ffc5c37dc70>),
 ('d e j j e m', 'd a y A m', <tropical Weight 0 at 0x7ffc5c37dc90>),
 ('d e j j e m', 'd a y i m', <tropical Weight 0 at 0x7ffc5c37d830>),
 ('d e j j e m', 'd a y a m', <tropical Weight 0 at 0x7ffc5c37d7d0>)]

In [82]:
# List every output path for 'dejjem' and check whether 'dAyimuuu' is one of them.
get_paths(
    fst=create_fst('dejjem'),
    target='dAyimuuu',
)

TARGET NOT IN PATHS


[('d e j j e m', 'd A y A m', <tropical Weight 0 at 0x7ffc5c37dbf0>),
 ('d e j j e m', 'd A y i m', <tropical Weight 0 at 0x7ffc5c37d990>),
 ('d e j j e m', 'd A y a m', <tropical Weight 0 at 0x7ffc5c37d890>),
 ('d e j j e m', 'd i y A m', <tropical Weight 0 at 0x7ffc5c37db70>),
 ('d e j j e m', 'd i y i m', <tropical Weight 0 at 0x7ffc5c37d970>),
 ('d e j j e m', 'd i y a m', <tropical Weight 0 at 0x7ffc5c37d8b0>),
 ('d e j j e m', 'd a y A m', <tropical Weight 0 at 0x7ffc5c37d8d0>),
 ('d e j j e m', 'd a y i m', <tropical Weight 0 at 0x7ffc5c37daf0>),
 ('d e j j e m', 'd a y a m', <tropical Weight 0 at 0x7ffc5c37db10>)]