In [1]:
# !git clone https://github.com/fadhleryani/malti_arabi_fst.git
# # !git pull

# %pip install pynini
# %pip install pyfoma

In [2]:
# %cd malti_arabi_fst

In [308]:
import pynini as pn
import kenlm
from itertools import product
import pyconll
import pandas as pd
import numpy as np


In [None]:
# KenLM language models read from ARPA files. `wordmodel` scores whole strings
# of words; `charmodel` is queried elsewhere in this notebook with the
# characters of a word separated by spaces.
wordmodel = kenlm.Model('aggregated_country/lm/word/tn-maghreb.arpa')
charmodel = kenlm.Model('aggregated_country/lm/char/tn-maghreb.arpa')

In [293]:
# Load the treebank and flatten it into a single DataFrame: for every sentence
# one header row (sent_id + raw sentence text) followed by one row per token.
mudt = pyconll.load_from_file('mt_mudt-ud-dev.conllu')

# Token attributes copied into each token row.
keys = ["id","form","lemma","upos","xpos","feats","head","deprel","deps","misc"]

sents = []
for sent in mudt:
    # Header row: only this row carries the 'sent' text, so df['sent'].dropna()
    # below yields exactly one entry per sentence.
    toks = [pd.Series({'sent_id':sent.id,'sent':sent.text})]
    for tok in sent:
        tokdict = {'sent_id':sent.id}
        # getattr is the idiomatic attribute lookup (no direct dunder call).
        tokdict.update({k:getattr(tok,k) for k in keys})
        toks.append(pd.Series(tokdict))

    sents.append(pd.DataFrame(toks))

df = pd.concat(sents)

# Word histogram: whitespace-split sentence text, counted per surface form
# (value_counts returns the most frequent word first).
word_hist = df['sent'].dropna().str.split().explode().value_counts()

# Character histogram over the unique case-folded word types
# (each word type contributes its characters once).
char_hist = pd.DataFrame([y for x in df['sent'].dropna().str.split().explode().str.casefold().unique() for y in x]).value_counts()


In [322]:
# Word list loaded as a single column named 'words' (the file has no header row).
# NOTE(review): read_fwf infers fixed-width column boundaries from the leading
# rows of the file -- confirm that long entries are not split or truncated.
words_df= pd.read_fwf('tn-maghreb-words.txt',header=None).rename(columns={0:'words'})
def dediac_fst(text):
    """Return `text` rewritten by the global `dediac` FST, or NaN on failure.

    Parameters
    ----------
    text : str
        Word to rewrite. Missing values (None / float NaN) are passed
        through as NaN without touching the FST.

    Returns
    -------
    str or float
        The single output string of ``text @ dediac``, or ``np.nan`` when the
        input cannot be compiled/composed or the composition has no string.

    Notes
    -----
    Only failure modes of the composition itself are swallowed. A bare
    ``except`` would also hide a NameError when `dediac` has not been
    defined yet (it is created in a later cell), silently filling the whole
    column with NaN, and would swallow KeyboardInterrupt on long runs.
    """
    # Missing cells cannot be composed with an FST.
    if text is None or isinstance(text, float):
        return np.nan
    try:
        return (text @ dediac).string()
    except (TypeError, pn.FstArgError, pn.FstOpError):
        # Unsupported operand, string-compilation failure, or no output path.
        return np.nan
# Add the rewritten form of every word (NaN where dediac_fst gives up).
# NOTE(review): dediac_fst reads the global `dediac`, which is only created in
# a later cell of this notebook -- on a fresh kernel that cell has to run first.
words_df['dediac'] = words_df['words'].apply(dediac_fst)

words_df

Unnamed: 0,words,dediac
0,يا,يا
1,ديني,ديني
2,محلالي,محلالي
3,فرحو,فرحو
4,❤,
...,...,...
203089,يهبلوا,يهبلوا
203090,لعمرباش,لعمرباش
203091,فالفوضى,فالفوضى
203092,خزانتك,خزانتك


In [377]:
# Closed-class lookup table: first TSV column -> second TSV column.
# The file has no header row and its entries are already unique.
closedclass = dict(
    pd.read_csv('closed_class_mappings.tsv',sep='\t',header=None).values
)


In [396]:
# Mapping FST compiled from the pairs listed in 'malti2arabi.map'.
malti2arabi = pn.string_file('malti2arabi.map')

# Input-side and output-side projections of the mapping.
malti_sigma = pn.project(malti2arabi,'input')
arabi_sigma = pn.project(malti2arabi,'output') 

# Closure over the input side.
# NOTE(review): not referenced by any other visible cell.
sigma_input = pn.union(malti_sigma ).closure()

# Closure of the mapping, so a whole token can be covered by a sequence of
# mapping entries.
# NOTE(review): the single-argument pn.union(...) presumably exists to take a
# copy before closure() is applied -- confirm before simplifying.
transcriber = pn.union(malti2arabi ).closure()

# Rewrite pairs compiled from 'dediac.map'.
dediac_cross = pn.string_file('dediac.map')

# Diacritic characters; the string is unpacked character by character below and
# is also used by filter_edge_diacritics for membership tests.
diacs = 'ًٌٍَُِّْ'
# Context-free rewrite (empty left/right context) of dediac_cross over the
# closure of the output alphabet plus the individual diacritic characters.
dediac = pn.cdrewrite(dediac_cross,'','',pn.union(arabi_sigma,*diacs).closure())

# Word list and closed-class table compiled as FSTs.
# NOTE(review): neither `words` nor `closedclass_fst` is used in the visible cells.
words = pn.string_file('tn-maghreb-words.txt').optimize() @ dediac
closedclass_fst = pn.string_file('closed_class_mappings.tsv').optimize()

In [422]:
def get_paths(fst,words_only=False):
    """List the paths of `fst`.

    Every entry produced by ``fst.paths().items()`` is returned as-is, unless
    `words_only` is true, in which case only element 1 of each entry is kept.
    """
    entries = [*fst.paths().items()]
    return [entry[1] for entry in entries] if words_only else entries
    
def translit_fst(tok):
    """Compose the token, wrapped in <BOS>/<EOS> markers, with the global `transcriber`.

    The composed FST is returned without further processing; the variant that
    also composed with `dediac` is left disabled here.
    """
    wrapped = f'<BOS>{tok}<EOS>'
    return wrapped @ transcriber

def remove_plus(text):
    """Delete every '+ ' (plus sign followed by a space) from `text`.

    Plain string processing is used on purpose: the equivalent cdrewrite-based
    FST was noted to take a long time.
    """
    return ''.join(text.split('+ '))

def filter_edge_diacritics(options,diacritics=None):
    """Keep only the candidates that neither start nor end with a diacritic.

    Parameters
    ----------
    options : iterable of str
        Candidate strings.
    diacritics : str or None
        Characters treated as diacritics. ``None`` (default) uses the
        notebook-level `diacs` string.

    Returns
    -------
    list of str
        The surviving candidates in their original order. Empty candidates
        are dropped instead of raising IndexError on ``y[0]``.
    """
    marks = diacs if diacritics is None else diacritics
    return [y for y in options if y and y[0] not in marks and y[-1] not in marks]


def translit_closed_class(word):
    """Look up each whitespace-separated token of the lower-cased `word` in `closedclass`.

    Tokens without an entry are rendered as '#na'; the results are joined with
    single spaces.
    """
    return ' '.join(closedclass.get(piece,'#na') for piece in word.lower().split())

def translit_word(lowered,cutoff=3,useclosedclass=False):
    """Enumerate candidate transliterations of an already lower-cased word.

    The word is split after every '-' and "'" (and on whitespace). Each piece
    contributes a list of options: its closed-class mapping alone when
    `useclosedclass` is set and a mapping exists, otherwise every path of
    translit_fst(piece) that passes filter_edge_diacritics, rewritten through
    `dediac`. The result is the space-joined cartesian product of these lists,
    i.e. the unmerged forms (selection happens later on the merged forms).

    `cutoff` is accepted but not used by this function.
    """
    pieces = lowered.replace('-','- ').replace("'","' ").split()
    lattice = []
    for piece in pieces:
        mapped = closedclass.get(piece) if useclosedclass else None
        if mapped:
            lattice.append([mapped])
        else:
            candidates = filter_edge_diacritics(
                get_paths(translit_fst(piece),words_only=True)
            )
            lattice.append([(cand @ dediac).string() for cand in candidates])

    return [' '.join(combo) for combo in product(*lattice)]

def translit_and_rank_options(word,useclosedclass=False,cutoff=3,sort_by='charmodel_score'):
    """Transliterate `word` and return its best-scoring candidates as a DataFrame.

    Parameters
    ----------
    word : str
        Surface word as written (case is kept for the 'capitalized' flag).
    useclosedclass : bool
        Forwarded to translit_word; when true the 'translit' column is
        returned under the name 'translit_cc'.
    cutoff : int
        Maximum number of rows returned.
    sort_by : str
        Score column used for ranking, best (highest) first. Defaults to
        'charmodel_score', the ranking that was in effect before;
        'wordmodel_score' is the alternative.

    Returns
    -------
    pandas.DataFrame
        One row per distinct merged candidate with the columns
        original_word, translit (or translit_cc), word, merged,
        wordmodel_score, charmodel_score, capitalized, in_langmodel and
        closed_class.
    """
    lowered = word.lower()
    translit = translit_word(lowered,useclosedclass=useclosedclass)
    options = pd.DataFrame({'original_word':word,'translit':translit})
    options['word'] = lowered.replace('-','- ').replace("'","' ")
    # '+ ' has to be removed literally; read as a regular expression it is
    # invalid, and the default of `regex` differs between pandas versions.
    options['merged'] = options['translit'].str.replace('+ ','',regex=False)
    options['wordmodel_score'] = options['merged'].apply(wordmodel.score)
    # The character model is queried with space-separated characters.
    options['charmodel_score'] = options['merged'].apply(lambda x: charmodel.score(' '.join(x)))
    # Slice instead of index so an empty word yields False instead of IndexError.
    options['capitalized'] = word[:1].isupper() # TODO: what about letter after sink as in 'L-Innu', does it matter?
    # Build the vocabulary once instead of once per candidate row.
    vocab = set(words_df['dediac'])
    options['in_langmodel'] = options['merged'].isin(vocab)
    options['closed_class'] = options['word'].apply(translit_closed_class)
    if useclosedclass:
        options = options.rename(columns={'translit':'translit_cc'})
    # options['fertility'] = options['word'].apply(get_fertility) #TODO
    return options.drop_duplicates('merged').sort_values(sort_by,ascending=False)[:cutoff]


# Example inputs; only the last assignment is in effect when the cell runs.
word = "L-Innu"
word = "Id-dinja"
# Candidates without and with the closed-class lookup, combined with an outer
# merge on the columns the two frames share.
translit_and_rank_options(word,cutoff=10,useclosedclass=False).merge(translit_and_rank_options(word,cutoff=10,useclosedclass=True),how='outer')
# sorted([(wordmodel.score(x),x) for x in merged],key=lambda x: -x[0])


Unnamed: 0,original_word,translit,word,merged,wordmodel_score,charmodel_score,capitalized,in_langmodel,closed_class,translit_cc
0,Id-dinja,ال+ دنيا,id- dinja,الدنيا,-4.473696,-4.370441,True,True,ال+ #na,ال+ دنيا
1,Id-dinja,ال+ دنية,id- dinja,الدنية,-6.431094,-6.43386,True,True,ال+ #na,ال+ دنية
2,Id-dinja,ال+ دينية,id- dinja,الدينية,-6.782957,-6.594251,True,True,ال+ #na,ال+ دينية
3,Id-dinja,ال+ دينيا,id- dinja,الدينيا,-8.091814,-7.463097,True,False,ال+ #na,ال+ دينيا
4,Id-dinja,ال+ دينيه,id- dinja,الدينيه,-8.047945,-7.516132,True,True,ال+ #na,ال+ دينيه
5,Id-dinja,ال+ دنيه,id- dinja,الدنيه,-8.091814,-7.928232,True,False,ال+ #na,ال+ دنيه
6,Id-dinja,يال+ دنيا,id- dinja,يالدنيا,-8.047945,-7.958244,True,True,ال+ #na,
7,Id-dinja,اال+ دنيا,id- dinja,االدنيا,-8.091814,-9.041514,True,False,ال+ #na,
8,Id-dinja,ال+ دنيى,id- dinja,الدنيى,-8.091814,-9.453617,True,False,ال+ #na,ال+ دنيى
9,Id-dinja,ال+ ضنية,id- dinja,الضنية,-8.091814,-9.467267,True,False,ال+ #na,ال+ ضنية


In [None]:
# tuple(original_word,
# boolean(capitalized or not),
# closed-class mapping or nil,
# boolean(exists in the language-model word list or not),
# word-model score,
# character-model score,
# Kurt's fertility)

In [423]:
def translit_sent(sent):
    """Build every candidate transliteration of a whitespace-separated sentence.

    Each word contributes the 'merged' column of
    translit_and_rank_options(word); the result is the space-joined cartesian
    product of those per-word candidate arrays.
    """
    per_word = [
        translit_and_rank_options(token)['merged'].values
        for token in sent.split()
    ]
    return [' '.join(combo) for combo in product(*per_word)]
        
def score_generated_sentences(sentences,model=None):
    """Score candidate sentences and return them best-first.

    Parameters
    ----------
    sentences : iterable of str
        Candidate sentences.
    model : object with a ``score(str)`` method, or None
        Scorer to use. ``None`` (default) uses the notebook-level
        `wordmodel`; the previously referenced global `model` is not defined
        anywhere in this notebook.

    Returns
    -------
    list of (str, float)
        ``(sentence, score)`` pairs ordered by descending score; ties keep
        their input order.
    """
    scorer = wordmodel if model is None else model
    scored = [(sentence,scorer.score(sentence)) for sentence in sentences]
    return sorted(scored,key=lambda pair: pair[1],reverse=True)


# NOTE(review): stale example -- `translit_sentence` and a `cutoff` argument for
# it are not defined in the visible cells of this notebook.
# score_generated_sentences(translit_sentence('malta magħrufa uffiċjalment bħala',cutoff=1))

# Every candidate transliteration of a sample sentence.
sentout = translit_sent("kien bilqiegħda f'dik il-parti")

In [424]:
# Score every candidate sentence with the word-level model and show the
# highest-scoring ones first.
dfsentout = pd.DataFrame({'sent':sentout}).assign(
    score=lambda frame: frame['sent'].apply(wordmodel.score)
)
dfsentout.sort_values('score',ascending=False)

Unnamed: 0,sent,score
0,كان بلقعدة في ديك البارتي,-23.318352
3,كان بلقاعدة في ديك البارتي,-23.318352
6,كان بلقعده في ديك البارتي,-23.318352
1,كان بلقعدة في ديك البرتي,-23.817825
4,كان بلقاعدة في ديك البرتي,-23.817825
7,كان بلقعده في ديك البرتي,-23.817825
2,كان بلقعدة في ديك البرطي,-23.861692
5,كان بلقاعدة في ديك البرطي,-23.861692
8,كان بلقعده في ديك البرطي,-23.861692


In [437]:
# Rank transliterations for the ten most frequent corpus words and show the
# candidates of the most frequent one.
# `word_hist` is the value_counts() Series built above (words in the index,
# most frequent first), so the words are taken from its index. The earlier
# `word_hist['sent']` lookup only worked with a kernel-only reshaped copy of
# `word_hist` and fails on a fresh Restart & Run All.
top_words = pd.Series(word_hist.index[:10],name='sent')
top_words.apply(translit_and_rank_options).iloc[0]

Unnamed: 0,original_word,translit,word,merged,wordmodel_score,charmodel_score,capitalized,in_langmodel,closed_class
0,li,لي,li,لي,-4.859493,-5.063514,False,True,اللي
