In [1]:
# !git clone https://github.com/fadhleryani/malti_arabi_fst.git
# # !git pull

# %pip install pynini
# %pip install pyfoma

In [2]:
# %cd malti_arabi_fst

In [1]:
import pynini as pn
import kenlm
from itertools import product
import pyconll
import pandas as pd
import numpy as np
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pretrained CAMeLBERT-mix tokenizer; it is handed to count_subtokens() to measure
# how many subword pieces each transliteration candidate is split into.
tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-mix")

In [4]:
# KenLM language models read from ARPA files: one over words, one over characters.
# Both are used by translit_and_rank_options() to score transliteration candidates.
# NOTE: KenLM itself reports that loading is faster from a prebuilt binary file.
wordmodel = kenlm.Model('data/arabi_data/arabic_lm/aggregated_country/lm/word/tn-maghreb.arpa')
charmodel = kenlm.Model('data/arabi_data/arabic_lm/aggregated_country/lm/char/tn-maghreb.arpa')

Loading the LM will be faster if you build a binary file.
Reading /Users/f/ba3sasah/malti_arabi_fst/data/arabi_data/arabic_lm/aggregated_country/lm/word/tn-maghreb.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Loading the LM will be faster if you build a binary file.
Reading /Users/f/ba3sasah/malti_arabi_fst/data/arabi_data/arabic_lm/aggregated_country/lm/char/tn-maghreb.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************


In [35]:
def merge_conllu(dataset,charhist=False):
    """Merge the dev/train/test CoNLL-U splits of a dataset into a word histogram.

    Reads ``data/malti_data/{dataset}/{dev,train,test}.conllu``, flattens every
    token of every sentence into one table, and prints the number of sentences,
    the number of distinct word forms and the number of distinct case-folded
    characters found in the ``form`` column.

    Parameters
    ----------
    dataset : str
        Directory name under ``data/malti_data``.
    charhist : bool, default False
        When true, the character histogram is returned alongside the word
        histogram. The default keeps the original single-value return.

    Returns
    -------
    pandas.DataFrame
        Distinct ``form`` values with their counts (``value_counts`` order,
        i.e. most frequent first). If ``charhist`` is true, a
        ``(word_hist, char_hist)`` tuple where ``char_hist`` is a Series of
        per-character counts.
    """
    # Load order (dev, train, test) is kept so the merged sentence order is unchanged.
    splits = [
        pyconll.load_from_file(f'data/malti_data/{dataset}/{split}.conllu')
        for split in ('dev', 'train', 'test')
    ]
    # Iterate the loaded Conll containers instead of reaching into the private
    # ``_sentences`` attribute.
    allsets = [sent for conll in splits for sent in conll]

    print(f'# of sents in {dataset}',len(allsets))

    keys = ["id","form","lemma","upos","xpos","feats","head","deprel","deps","misc"]

    # One flat list of records -> one DataFrame, rather than a DataFrame of
    # Series per sentence followed by a large concat.
    records = [
        {'sent_id': sent.id, **{k: getattr(tok, k) for k in keys}}
        for sent in allsets
        for tok in sent
    ]
    # Explicit columns keep the frame well-formed even when there are no tokens.
    df = pd.DataFrame(records, columns=['sent_id', *keys])

    # word histogram
    word_hist = df['form'].dropna().value_counts().reset_index()
    print(f'# of words in {dataset}',len(word_hist))

    # character histogram over case-folded forms
    chars = [ch for form in df['form'].dropna().str.casefold() for ch in form]
    char_hist = pd.Series(chars, dtype=object).value_counts()
    print(f'# of chars {dataset}',len(char_hist))

    if charhist:
        return word_hist, char_hist
    return word_hist

In [36]:
# Word-frequency tables for the four datasets under data/malti_data.
# Each call also prints the sentence, distinct-word and distinct-character counts.
mlrs = merge_conllu('MLRS POS')
sa = merge_conllu('Sentiment Analysis')
mapa = merge_conllu('MAPA')
mudt = merge_conllu('mt_mudt-ud')

# of sents in MLRS POS 6167
# of words in MLRS POS 15774
# of chars MLRS POS 81
# of sents in Sentiment Analysis 851
# of words in Sentiment Analysis 5425
# of chars Sentiment Analysis 70
# of sents in MAPA 8763
# of words in MAPA 19059
# of chars MAPA 143
# of sents in mt_mudt-ud 2074
# of words in mt_mudt-ud 8471
# of chars mt_mudt-ud 74


In [12]:
# Header-less two-column TSV turned into a {first column: second column} lookup.
# The rows are already unique, so no key is overwritten while building the dict.
closedclass = dict(
    pd.read_csv('mappings/closed_class_mappings.tsv', sep='\t', header=None).values
)



In [190]:

# Compile each mapping file into an optimized FST (one string pair per line of the file).
malti2arabi_2char = pn.string_file('mappings/malti2arabi_2char.map').optimize()
arabic2arabic = pn.string_file('mappings/arabic2arabic.map').optimize()
malti2arabi_1char = pn.string_file('mappings/malti2arabi_1char.map').optimize()
shadda = pn.string_file('mappings/shadda.map').optimize()
final_vowels = pn.string_file('mappings/final_vowels.map').optimize()
special = pn.string_file('mappings/special.map').optimize()
alif_initial = pn.string_file('mappings/alif_initial.map').optimize()
baby_closed_class = pn.string_file('mappings/baby_closed_class.map').optimize()

# Input side of the 1-char map and output side of the Arabic identity map.
# NOTE(review): neither name is referenced again in the cells shown.
sigma_malti = pn.project(malti2arabi_1char,'input')
sigma_arabi = pn.project(arabic2arabic,'output') 

# SIGMA: the symbol inventory the context-dependent rewrites operate over.
# sigma_in collects every input-side string of the listed maps; sigma then adds the
# output sides of `special` and `final_vowels`, plus a literal "-".
sigma_in = pn.project(pn.union(malti2arabi_1char,special,arabic2arabic,final_vowels),'input')
sigma = pn.project(pn.union(sigma_in,special,final_vowels),'output')
sigma = pn.union(sigma,"-").optimize() 

# Rules applied in the first pass (multi-character, shadda, final-vowel and
# initial-alif mappings).
rwr_first_fsts = pn.union(
    malti2arabi_2char,
    shadda,
    final_vowels,
    alif_initial,
).optimize()

# Rewrite with empty left/right contexts, i.e. the rules may fire anywhere in a
# string drawn from sigma*.
rwr_first = pn.cdrewrite(rwr_first_fsts,"","",sigma.closure())

# Mappings applied in the second pass to whatever the first pass produced.
second_fsts = pn.union(
    malti2arabi_1char,
    arabic2arabic, 
    special,
).optimize()

# Full non-deterministic transliterator: first-pass rewrite composed with any
# sequence of second-pass mappings.
translit_fst = rwr_first @ second_fsts.closure()

# Mapping used by translit_deterministic() (the 'det' transliteration path).
malti2arabi_det= pn.string_file('mappings_deterministic/malti2arabi_1char_vowels_short.map').optimize()

# Characters that filter_edge_diacritics() rejects at the start or end of a candidate.
diacs = 'ًٌٍَُِّْ'
# Pairs from dediac.map, applied anywhere in a string over sigma*.
# NOTE(review): presumably this strips diacritics -- the mapping file is not shown here.
dediac_cross = pn.string_file('mappings/dediac.map')
dediac = pn.cdrewrite(dediac_cross,'','',sigma.closure())

# Larger closed-class mapping, used as an extra lookup by apply_translit_fst().
augmented_closed_class = pn.string_file('mappings/augmented_closed_class.map').optimize()

# Word list compiled to an FST and passed through the dediac rewrite.
words = pn.string_file('tn-maghreb-words.txt').optimize() @ dediac

In [8]:
# paths = get_paths(("<BOS>mil-<EOS>"  @ baby_closed_class @ transcriber  )),get_paths(("<BOS>fok<EOS>"   @ transcriber  ))
# paths,len(paths)

In [14]:
def dediac_fst(text):
    """Pass ``text`` through the ``dediac`` rewrite FST and return the output string.

    Square brackets are backslash-escaped first so that pynini compiles them as
    literal characters rather than as bracketed symbols.

    Parameters
    ----------
    text : str
        String to rewrite.

    Returns
    -------
    str or float
        The rewritten string, or ``np.nan`` when composing with ``dediac`` or
        extracting a single output string fails.
    """
    # '\\[' is the same two-character string the old '\[' literal produced, but
    # without relying on an invalid escape sequence (a SyntaxWarning on
    # Python 3.12+).
    text = text.replace('[', '\\[').replace(']', '\\]')
    try:
        return (text @ dediac).string()
    except Exception:
        # Best-effort: any FST failure maps to NaN, but KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by the bare except.
        return np.nan
    
# Language-model vocabulary: the raw word next to its dediac_fst() form.
words_df = (
    pd.read_fwf('tn-maghreb-words.txt', header=None)
    .rename(columns={0: 'words'})
)
words_df['dediac'] = words_df['words'].map(dediac_fst)

# words_df

In [243]:
def get_paths(fst,words_only=False):
    """List every path of ``fst``.

    Returns the items produced by ``fst.paths().items()`` as a list. With
    ``words_only=True`` only element 1 of each item is kept
    (presumably the output string of the path -- confirm against the pynini version in use).
    """
    items = [*fst.paths().items()]
    return [item[1] for item in items] if words_only else items



def apply_translit_fst(tok,backoff_fsts=[baby_closed_class,augmented_closed_class]):
    """Build the FST of transliteration candidates for one token.

    The token has its square brackets escaped and is wrapped as
    ``<BOS>{tok}<EOS>``. If ``backoff_fsts`` is non-empty, the token is first
    looked up in the union of those FSTs; when that lookup yields at least one
    path its result is returned. Otherwise the general ``translit_fst`` is
    used. Either result is composed with ``dediac``.

    Parameters
    ----------
    tok : str
        Token to transliterate.
    backoff_fsts : list of FSTs
        Lookup FSTs tried before the general transliterator. Pass an empty
        list (or ``None``) to skip the lookup. The default list is never
        mutated here.

    Returns
    -------
    The composed pynini FST; it may have no paths if nothing matches.
    """
    # '\\[' equals the old '\[' literal at runtime but is a valid escape sequence.
    tok = tok.replace('[', '\\[').replace(']', '\\]')
    tok = f'<BOS>{tok}<EOS>'
    if backoff_fsts:
        backoff = tok @ pn.union(*backoff_fsts).optimize() @ dediac
        if get_paths(backoff):
            return backoff
    # Single fallback for both "no lookup FSTs given" and "lookup found nothing".
    return tok @ translit_fst @ dediac

def filter_edge_diacritics(options):
    """Drop candidates whose first or last character is one of ``diacs``.

    Raises ``IndexError`` if an option is an empty string.
    """
    kept = []
    for option in options:
        if option[0] in diacs or option[-1] in diacs:
            continue
        kept.append(option)
    return kept

def translit_deterministic(lowered):
    """Transliterate ``lowered`` with the deterministic mapping.

    The token (square brackets escaped) is composed with the closure of
    ``malti2arabi_det`` united with ``special``, then with ``dediac``, and the
    single resulting string is returned. Unlike ``apply_translit_fst`` the
    token is not wrapped in ``<BOS>``/``<EOS>``.

    Returns
    -------
    str
        The transliteration, or the sentinel ``'#na'`` (after printing a
        message) when the composition fails or does not yield exactly one
        string.
    """
    # '\\[' equals the old '\[' literal at runtime but is a valid escape sequence.
    lowered = lowered.replace('[', '\\[').replace(']', '\\]')
    try:
        return (lowered @ pn.union(malti2arabi_det,special).closure().optimize() @ dediac).string()
    except Exception:
        # Still best-effort, but KeyboardInterrupt/SystemExit now propagate.
        print('deterministic fst error on:',lowered)
        return '#na'

def translit_word(lowered_tok,backoffs): #select on merged but return unmerged
    """Return the list of transliteration candidates for one lower-cased token.

    Candidates come from ``apply_translit_fst``; those that start or end with
    a diacritic are dropped, and each survivor is passed through
    ``dediac_fst``.

    Parameters
    ----------
    lowered_tok : str
        Lower-cased token.
    backoffs : list of FSTs
        Forwarded to ``apply_translit_fst`` as ``backoff_fsts``.

    Returns
    -------
    list
        Candidate strings (an entry is ``np.nan`` if ``dediac_fst`` failed on
        it), or ``['#NA']`` when there are no candidates at all.
    """
    tok_fst = apply_translit_fst(lowered_tok,backoffs)
    translit_toks = get_paths(tok_fst,words_only=True)
    if not translit_toks:
        return ['#NA']
    try:
        translit_toks = filter_edge_diacritics(translit_toks)
    except Exception:
        # e.g. IndexError on an empty candidate: report it and continue with the
        # unfiltered list, as before, without swallowing KeyboardInterrupt.
        print('err filtering diacs',translit_toks,lowered_tok)

    if not translit_toks:
        # Every candidate was filtered out. Use the same sentinel as the
        # "no paths" case so callers never receive an empty list, which would
        # break the max()/tokenizer calls downstream.
        return ['#NA']

    return [dediac_fst(x) for x in translit_toks]  # dediacritize
    

# Set of dediac_fst()-processed vocabulary words, for O(1) membership tests.
langmodelset = {*words_df['dediac']}

def count_subtokens(text, tokenizer):
    """Return the tokenizer-reported length of each string in ``text``.

    Trailing ``'+'`` characters are removed from every string before it is
    tokenized. The tokenizer is called without special tokens and with
    ``return_length=True``; its ``"length"`` entry is returned unchanged.
    """
    stripped = []
    for item in text:
        stripped.append(item.rstrip('+'))
    encoded = tokenizer(stripped, add_special_tokens=False, return_length=True)
    return encoded["length"]


def translit_and_rank_options(word,name='translit',fsttype='non-det',backoffs=[baby_closed_class,augmented_closed_class]):
    """Transliterate ``word`` and attach ranking features to every candidate.

    Parameters
    ----------
    word : str
        Token as it appears in the corpus.
    name : str
        Extra key under which the candidate list is stored, so tables from
        different configurations can be told apart after merging.
    fsttype : {'non-det', 'det'}
        ``'det'`` uses ``translit_deterministic`` (one candidate);
        ``'non-det'`` uses ``translit_word`` with ``backoffs``.
    backoffs : list of FSTs
        Forwarded to ``translit_word``; ignored for ``'det'``. Never mutated.

    Returns
    -------
    dict
        Scalars: ``word_raw``, ``word_lowered``, ``capitalized``,
        ``subtokens_lowest_ties``. Parallel per-candidate lists: ``name``,
        ``translit``, ``translit_stripped``, ``wordmodel_score``,
        ``charmodel_score``, ``in_langmodel``, ``subtokens``.

    Raises
    ------
    ValueError
        If ``fsttype`` is neither ``'det'`` nor ``'non-det'``.
    """
    lowered = word.lower()

    if fsttype == 'det':
        translit = [translit_deterministic(lowered)]
    elif fsttype == 'non-det':
        translit = translit_word(lowered,backoffs)
    else:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError('wrong fsttype')

    # Candidates may carry a trailing '+'; scoring uses the form without it.
    stripped = [x.rstrip('+') for x in translit]
    subtokens = count_subtokens(stripped, tokenizer)

    return {
        'word_raw': word,
        'word_lowered': lowered,
        name: translit,
        'translit': translit,  # keep this, in order to merge later
        'translit_stripped': stripped,
        'wordmodel_score': [wordmodel.score(x) for x in stripped],
        # The character model scores the candidate as space-separated characters.
        'charmodel_score': [charmodel.score(' '.join(x)) for x in stripped],
        # word[:1] avoids an IndexError on an empty token.
        # TODO: what about letter after sink as in 'L-Innu', does it matter?
        'capitalized': word[:1].isupper(),
        'in_langmodel': [x in langmodelset for x in stripped],
        'subtokens': subtokens,
        # Number of candidates tied at the LOWEST subtoken count. This used to
        # compare against max(), which counted ties at the highest count and
        # contradicted the field name.
        'subtokens_lowest_ties': sum(np.array(subtokens) == min(subtokens)),
    }
    

# Scratch inputs tried while developing the transliterator. Each assignment
# overwrites the previous one, so only the final value ("il-") is in effect.
word = "t'"
word = "L-Innu"
word = "din"
word = "f'dik"
word = "d-dinja"
word = "f'dil-konferenza d-dinja"
word = "mil-dinja" # check how many tokens it breaks into and how that affects lang model scores
word = "id-"
word = "fil- linja ."
word = "m'" 
word = "a"
word = "uffiċjali"
word = "tielgħa"
word = "il-"
# word = "[għandhomx" 
# translit_and_rank_options(word,cutoff=None,useclosedclass=False).merge(translit_and_rank_options(word,cutoff=None,useclosedclass=True),how='outer').sort_values(['charmodel_score'],ascending=False)
# translit_and_rank_options(word,cutoff=None,useclosedclass=False).merge(translit_and_rank_options(word,cutoff=None,useclosedclass=True),how='outer').sort_values(['wordmodel_score'],ascending=False)
# sorted([(wordmodel.score(x),x) for x in merged],key=lambda x: -x[0])

def generate_table(word):
    """Transliterate ``word`` under three configurations.

    Returns a ``(deterministic, small_cc, augmented_cc)`` tuple of the dicts
    produced by ``translit_and_rank_options``:

    * ``small_cc``      -- non-deterministic, lookup in ``baby_closed_class`` only
    * ``augmented_cc``  -- non-deterministic, lookup in both closed-class FSTs
    * ``deterministic`` -- the deterministic transliterator
    """
    results = {}
    # Evaluation order (small, augmented, deterministic) is kept as-is.
    for label, lookups in (
        ('small_cc', [baby_closed_class]),
        ('augmented_cc', [baby_closed_class, augmented_closed_class]),
    ):
        results[label] = translit_and_rank_options(
            word, name=label, fsttype='non-det', backoffs=lookups
        )
    results['deterministic'] = translit_and_rank_options(
        word, name='deterministic', fsttype='det'
    )

    return (results['deterministic'], results['small_cc'], results['augmented_cc'])
   
# One frame per configuration (deterministic, small_cc, augmented_cc), outer-merged
# on their shared columns and shown best word-model score first.
a, b, c = (pd.DataFrame(table) for table in generate_table(word))
(
    a.merge(b, how='outer')
    .merge(c, how='outer')
    .sort_values('wordmodel_score', ascending=False)
    .loc[:, [
        'translit',
        'deterministic',
        'small_cc',
        'augmented_cc',
        'translit_stripped',
        'word_raw',
        'word_lowered',
        'wordmodel_score',
        'charmodel_score',
        'capitalized',
        'in_langmodel',
        'subtokens',
        'subtokens_lowest_ties',
    ]]
)

Unnamed: 0,translit,deterministic,small_cc,augmented_cc,translit_stripped,word_raw,word_lowered,wordmodel_score,charmodel_score,capitalized,in_langmodel,subtokens,subtokens_lowest_ties
1,ال+,,ال+,ال+,ال,il-,il-,-4.779278,-4.863126,False,True,2,1
0,ل+,ل+,,,ل,il-,il-,-5.295598,-4.732144,False,True,1,1


In [27]:
# word_hist = word_hist
# mudtdev_translit = pd.concat(word_hist['sent'].iloc[:].apply(translit_and_rank_options).values)
def translit_dataset(word_hist):
    """Transliterate every word of a histogram and return one row per candidate.

    Parameters
    ----------
    word_hist : pandas.DataFrame
        Must have exactly two columns, (word, frequency), because each row of
        ``word_hist.values`` is unpacked into two names. The output of
        ``merge_conllu`` has this shape.

    Returns
    -------
    pandas.DataFrame
        For each word, the candidates of the three ``generate_table``
        configurations, exploded to one row per candidate and outer-merged.
        Rows are ordered by descending ``freq`` then ``word_lowered``, with a
        fresh integer index.
    """
    # Same order as the tuple returned by generate_table().
    variant_names = ('deterministic', 'small_cc', 'augmented_cc')
    # Per-candidate list columns that every variant shares.
    shared_list_cols = ['translit_stripped','wordmodel_score','charmodel_score','in_langmodel','subtokens']

    rows = {variant: [] for variant in variant_names}
    for word,freq in word_hist.values:
        for variant, table in zip(variant_names, generate_table(word)):
            table['freq'] = freq
            rows[variant].append(table)

    # Each variant stores its candidates under its own name as well as under 'translit'.
    exploded = [
        pd.DataFrame(rows[variant]).explode(['translit', variant, *shared_list_cols])
        for variant in variant_names
    ]
    merged = exploded[0].merge(exploded[1],how='outer').merge(exploded[2],how='outer')

    column_order = [
        'freq',
        'word_raw',
        'word_lowered',
        'translit',
        'deterministic',
        'small_cc',
        'augmented_cc',
        'translit_stripped',
        'wordmodel_score',
        'charmodel_score',
        'capitalized',
        'in_langmodel',
        'subtokens',
        'subtokens_lowest_ties',
    ]
    words_translit_exploded = (
        merged
        .sort_values('wordmodel_score',ascending=False)[column_order]
        .sort_values(['freq','word_lowered'],ascending=False)
        .reset_index(drop=True)
    )

    # The original built this frame but never returned it, so callers got None.
    return words_translit_exploded

    

In [33]:
# Transliterate each dataset's word histogram. After each one, a tuple of
# (row count, row count, average rows per distinct raw word) is computed.
# Only the last such tuple is the cell's displayed value; the earlier ones
# are evaluated and discarded.
mudt_translit = translit_dataset(mudt)
len(mudt_translit),len(mudt_translit),len(mudt_translit)/len(mudt_translit['word_raw'].unique())
mapa_translit = translit_dataset(mapa)
len(mapa_translit),len(mapa_translit),len(mapa_translit)/len(mapa_translit['word_raw'].unique())
mlrs_translit = translit_dataset(mlrs)
len(mlrs_translit),len(mlrs_translit),len(mlrs_translit)/len(mlrs_translit['word_raw'].unique())
sa_translit = translit_dataset(sa)
len(sa_translit),len(sa_translit),len(sa_translit)/len(sa_translit['word_raw'].unique())

ValueError: too many values to unpack (expected 2)

(278305, 278305, 32.85385432652579)

In [267]:
# Persist the mt_mudt-ud transliteration table as a tab-separated file (index included).
mudt_translit.to_csv('mudt_transliterated_tuples.tsv',sep='\t')

In [83]:
# NOTE(review): `char_hist` is only assigned as a local inside merge_conllu() in the
# cells shown, so on a fresh kernel this line would raise NameError -- confirm where
# the notebook-level `char_hist` is meant to come from.
char_hist.to_clipboard()

In [423]:
def translit_sent(sent):
    """Enumerate every transliteration of a whitespace-tokenized sentence.

    Each token is transliterated with ``translit_and_rank_options`` (default
    settings) and the per-token candidate lists are combined with a Cartesian
    product.

    Parameters
    ----------
    sent : str
        Sentence; tokens are separated by whitespace.

    Returns
    -------
    list of str
        One space-joined sentence per combination of candidates. The list
        grows multiplicatively with the number of candidates per token.
    """
    # translit_and_rank_options() returns a plain dict with no 'merged' entry,
    # so the old `optionsdf['merged'].values` could not work. The '+'-stripped
    # candidates are used because they are the forms the language models score.
    # NOTE(review): confirm 'translit_stripped' is the intended replacement
    # for the former 'merged' column.
    lattice = [
        translit_and_rank_options(word)['translit_stripped']
        for word in sent.split()
    ]
    return [' '.join(combo) for combo in product(*lattice)]
        
def score_generated_sentences(sentences, model=None):
    """Score sentences with a language model and sort them best-first.

    Parameters
    ----------
    sentences : iterable of str
        Candidate sentences.
    model : object with a ``score(str)`` method, optional
        Language model to use. Defaults to the notebook's ``wordmodel``; the
        previous body referenced a global named ``model`` that no visible cell
        defines.

    Returns
    -------
    list of (str, float)
        ``(sentence, score)`` pairs in descending score order; ties keep their
        input order.
    """
    # Resolve the default lazily so the global is only needed when no model is passed.
    lm = wordmodel if model is None else model
    scored = [(sentence, lm.score(sentence)) for sentence in sentences]
    return sorted(scored, key=lambda pair: -pair[1])


# score_generated_sentences(translit_sentence('malta magħrufa uffiċjalment bħala',cutoff=1))

# All candidate transliterations of one example sentence (one string per combination).
sentout = translit_sent("kien bilqiegħda f'dik il-parti")

In [424]:
# Score every candidate sentence with the word-level model and show the best first.
dfsentout = pd.DataFrame({'sent': sentout})
dfsentout['score'] = dfsentout['sent'].map(wordmodel.score)
dfsentout.sort_values(by='score', ascending=False)

Unnamed: 0,sent,score
0,كان بلقعدة في ديك البارتي,-23.318352
3,كان بلقاعدة في ديك البارتي,-23.318352
6,كان بلقعده في ديك البارتي,-23.318352
1,كان بلقعدة في ديك البرتي,-23.817825
4,كان بلقاعدة في ديك البرتي,-23.817825
7,كان بلقعده في ديك البرتي,-23.817825
2,كان بلقعدة في ديك البرطي,-23.861692
5,كان بلقاعدة في ديك البرطي,-23.861692
8,كان بلقعده في ديك البرطي,-23.861692
