In [1]:
# !git clone https://github.com/fadhleryani/malti_arabi_fst.git
# # !git pull

# %pip install pynini
# %pip install pyfoma

In [2]:
# %cd malti_arabi_fst

In [2]:
import pynini as pn
import kenlm
from itertools import product
import pyconll
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
import re
from sklearn.feature_extraction.text import strip_accents_unicode

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-mix")

In [4]:
wordmodel = kenlm.Model('data/arabi_data/arabic_lm/aggregated_country/lm/word/tn-maghreb.arpa')
charmodel = kenlm.Model('data/arabi_data/arabic_lm/aggregated_country/lm/char/tn-maghreb.arpa')

Loading the LM will be faster if you build a binary file.
Reading /Users/f/ba3sasah/malti_arabi_fst/data/arabi_data/arabic_lm/aggregated_country/lm/word/tn-maghreb.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Loading the LM will be faster if you build a binary file.
Reading /Users/f/ba3sasah/malti_arabi_fst/data/arabi_data/arabic_lm/aggregated_country/lm/char/tn-maghreb.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************


In [5]:
def merge_conllu(dataset,charhist=False):
    dev = pyconll.load_from_file(f'data/malti_data/{dataset}/dev.conllu')
    train = pyconll.load_from_file(f'data/malti_data/{dataset}/train.conllu')
    test = pyconll.load_from_file(f'data/malti_data/{dataset}/test.conllu')
    allsets = dev._sentences + train._sentences + test._sentences

    print(f'# of sents in {dataset}',len(allsets))

    keys = ["id","form","lemma","upos","xpos","feats","head","deprel","deps","misc"]

    sents = []
    for sent in allsets:
        # toks = [pd.Series({'sent_id':sent.id,'sent':sent.text})]
        toks = []
        for tok in sent:
            tokdict = {'sent_id':sent.id}
            tokdict.update( {k:tok.__getattribute__(k) for k in keys})
            toks.append (pd.Series(tokdict))
        sents.append(pd.DataFrame(toks))
    df = pd.concat(sents)    
    # word_hist
    
    word_hist = df['form'].dropna().value_counts().reset_index()
    # word_hist.to_clipboard()
    print(f'# of words (uniq) in {dataset}',len(word_hist))
    # # char hist
    char_hist = pd.DataFrame([y for x in df['form'].dropna().str.casefold() for y in x]).value_counts()
    # char_hist.to_clipboard()
    print(f'# of chars (uniq) {dataset}',len(char_hist))
    return df

In [7]:
mlrs = merge_conllu('MLRS POS')
sa = merge_conllu('Sentiment Analysis')
mapa = merge_conllu('MAPA')
mudt = merge_conllu('mt_mudt-ud')

# of sents in MLRS POS 6167
# of words (uniq) in MLRS POS 15774
# of chars (uniq) MLRS POS 81
# of sents in Sentiment Analysis 851
# of words (uniq) in Sentiment Analysis 5425
# of chars (uniq) Sentiment Analysis 70
# of sents in MAPA 8763
# of words (uniq) in MAPA 19059
# of chars (uniq) MAPA 143
# of sents in mt_mudt-ud 2074
# of words (uniq) in mt_mudt-ud 8471
# of chars (uniq) mt_mudt-ud 74


In [8]:
alldata = pd.concat([mlrs,sa,mapa,mudt])
alldata_hist = alldata['form'].dropna().value_counts().reset_index()

In [263]:
# charset
# closedclass = pd.read_csv('mappings/closed_class_mappings.tsv',sep='\t',header=None) # already unique 
# closedclass = dict(closedclass.values)
pd.Series([' '.join(x) for x in alldata['form'].values]).str.lower().str.split().explode().value_counts().to_clipboard()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:

malti2arabi_2char = pn.string_file('mappings/malti2arabi_2char.map').optimize()
arabic2arabic = pn.string_file('mappings/arabic2arabic.map').optimize()
malti2arabi_1char = pn.string_file('mappings/malti2arabi_1char.map').optimize()
shadda = pn.string_file('mappings/shadda.map').optimize()
final_vowels = pn.string_file('mappings/final_vowels.map').optimize()
special = pn.string_file('mappings/special.map').optimize()
everything_else = pn.string_file('mappings/everything_else.map').optimize()
alif_initial = pn.string_file('mappings/alif_initial.map').optimize()
baby_closed_class = pn.string_file('mappings/baby_closed_class.map').optimize()
baby_closed_class_deterministic = pn.string_file('mappings_deterministic/baby_closed_class_deterministic.map').optimize()

sigma_malti = pn.project(malti2arabi_1char,'input')
sigma_arabi = pn.project(arabic2arabic,'output') 

# SIGMA
sigma_in = pn.project(pn.union(malti2arabi_1char,special,arabic2arabic,final_vowels,everything_else),'input')
sigma = pn.project(pn.union(sigma_in,special,final_vowels),'output').optimize()

rwr_first_fsts = pn.union(
    malti2arabi_2char,
    shadda,
    final_vowels,
    alif_initial,
).optimize()

rwr_first = pn.cdrewrite(rwr_first_fsts,"","",sigma.closure())

second_fsts = pn.union(
    malti2arabi_1char,
    arabic2arabic, 
    special,
    everything_else,
).optimize()

translit_fst = (rwr_first @ second_fsts.closure()).optimize()

# deterministic
malti2arabi_det= pn.string_file('mappings_deterministic/malti2arabi_1char_vowels_short.map').optimize()
special_deterministic = pn.string_file('mappings_deterministic/special_deterministic.map').optimize()

diacs = 'ًٌٍَُِّْ'
dediac_cross = pn.string_file('mappings/dediac.map')
dediac = pn.cdrewrite(dediac_cross,'','',sigma.closure())

augmented_closed_class = pn.string_file('mappings/augmented_closed_class.map').optimize()

words = pn.string_file('data/arabi_data/tn-maghreb-words.txt').optimize() @ dediac

In [314]:
# paths = get_paths(("<BOS>mil-<EOS>"  @ baby_closed_class @ transcriber  )),get_paths(("<BOS>fok<EOS>"   @ transcriber  ))
# paths,len(paths)

In [11]:
def dediac_fst(text):
    text = text.replace('[','\[').replace(']','\]')
    try:
        return (text @ dediac).string()
    except:
        return np.nan
    
words_df= pd.read_fwf('data/arabi_data/tn-maghreb-words.txt',header=None).rename(columns={0:'words'})
words_df['dediac'] = pd.Series([dediac_fst(x) for x in words_df['words']])

# words_df

In [12]:
def get_paths(fst,words_only=False):
    paths = list(fst.paths().items())
    if words_only:
        return [x[1] for x in paths]
    else:
        return paths



def apply_translit_fst(tok,backoff_fsts=[baby_closed_class,augmented_closed_class]):
    tok = tok.replace('[','\[').replace(']','\]')
    tok = (f'<BOS>{tok}<EOS>')
    # if type=='det':
    #     return tok @ deterministic_transcriber @ dediac
    if backoff_fsts:
        backoff =  tok @ pn.union(*backoff_fsts).optimize() @ dediac
        if get_paths(backoff):
            return backoff
        else:
            return tok  @ translit_fst @ dediac
    else:
        return tok  @ translit_fst @ dediac

def filter_edge_diacritics(options):
    return [y for y in options if y[0] not in diacs and y[-1] not in diacs]

def translit_deterministic(lowered,backoffs=[]):
    lowered = lowered.replace('[','\[').replace(']','\]') 
    if backoffs:
        backofflowered = f'<BOS>{lowered}<EOS>'
        backoff = backofflowered @ pn.union(*backoffs).optimize() @ dediac
        if len(get_paths(backoff))==1:
            return backoff.string()
        elif len(get_paths(backoff))>1:
            print('error: fst is NOT deterministic on:',lowered)
            return '#na'
    # default
    try:
        maptranslit = (lowered @ pn.union(malti2arabi_det,special_deterministic).closure().optimize() @ dediac).string()
        return maptranslit
    except:
        print('error detfst on:',lowered)
        return '#na'
            

def translit_word(lowered_tok,backoffs): #select on merged but return unmerged

    tok_fst = apply_translit_fst(lowered_tok,backoffs)
    translit_toks = get_paths(tok_fst,words_only=True) 
    if not translit_toks:
        return ['#NA']
    try:
        translit_toks = filter_edge_diacritics(translit_toks) 
    except:
        print('err filtering diacs',translit_toks,lowered_tok)
    
    translit_toks = [ dediac_fst(x) for x in translit_toks]  # dediacritize
    return translit_toks
    

langmodelset =  set(words_df['dediac'])

def count_subtokens(text, tokenizer):
    return tokenizer(text, add_special_tokens=False, return_length=True)["length"]

def strip_plus(x):
    if x == "+":
        return x
    else:
        return x.rstrip("+")
    
def dediacritise_non_malti_accents(text: str, diacritics_to_keep: str = "ċġħż") -> str:
    """
    Removes diacritics from the text.
    This preserves any special symbols which aren't diacritised characters.
    Args:
        text: The text to dediacritise.
        diacritics_to_keep: Optional diacritics to keep in the text.
    Returns:
        The dediacritised text.
    """

    normalised_text = strip_accents_unicode(text)
    if diacritics_to_keep:
        for character in re.finditer(rf"[{diacritics_to_keep}]", text):
            normalised_text = normalised_text[:character.start()] \
                              + character.group() \
                              + normalised_text[character.end():]
    return normalised_text


def translit_and_rank_options(word,name='translit',fsttype='non-det',backoffs=[baby_closed_class,augmented_closed_class]):
    normalized = dediacritise_non_malti_accents(word)
    lowered = normalized.lower()
    translit_dict = {
        'word_raw':word,
        'word_normalized':word,
        # 'word_normalized':lowered,
        }
    
    if fsttype == 'det':
        translit = [translit_deterministic(lowered)]
    elif fsttype == 'non-det':
        translit = translit_word(lowered,backoffs)
    else:
        raise Exception('wrong fsttype')

    translit_dict[name] = translit
    translit_dict['translit'] = translit # keep this, in order to merge later
    translit_dict['translit_stripped'] = [strip_plus(x) for x in translit]
    translit_dict['wordmodel_score'] = [wordmodel.score(x) for x in translit_dict['translit_stripped']]
    translit_dict['charmodel_score'] = [charmodel.score(' '.join(x)) for x in translit_dict['translit_stripped'] ]
    translit_dict['capitalized'] = word[0].isupper() # TODO: what about letter after sink as in 'L-Innu', does it matter?
    translit_dict['in_langmodel'] = [x in langmodelset for x in translit_dict['translit_stripped']]
    translit_dict['subtokens'] = count_subtokens(translit_dict['translit_stripped'], tokenizer)
    translit_dict['subtokens_lowest_ties'] = sum(np.array(translit_dict['subtokens']) == min(translit_dict['subtokens']))

    return translit_dict
    

word = "t'"
word = "L-Innu"
word = "din"
word = "f'dik"
word = "d-dinja"
word = "f'dil-konferenza d-dinja"
word = "mil-dinja" # check how many tokens it breaks into and how that affects lang model scores
word = "id-"
word = "fil- linja ."
word = "m'" 
word = "a"
word = "uffiċjali"
word = "il-"
word = "tielgħa"
# word = "[għandhomx" 
# translit_and_rank_options(word,cutoff=None,useclosedclass=False).merge(translit_and_rank_options(word,cutoff=None,useclosedclass=True),how='outer').sort_values(['charmodel_score'],ascending=False)
# translit_and_rank_options(word,cutoff=None,useclosedclass=False).merge(translit_and_rank_options(word,cutoff=None,useclosedclass=True),how='outer').sort_values(['wordmodel_score'],ascending=False)
# sorted([(wordmodel.score(x),x) for x in merged],key=lambda x: -x[0])



def generate_table(word):
    det = translit_and_rank_options(word,name='det',fsttype='det')    
    det_smallcc = translit_and_rank_options(word,name='det_smallcc',fsttype='det', backoffs=[baby_closed_class_deterministic])    
    det_fullcc = translit_and_rank_options(word,name='det_fullcc',fsttype='det', backoffs=[baby_closed_class_deterministic,augmented_closed_class])    
    nondet = translit_and_rank_options(word,name='nondet',fsttype='non-det')
    nondet_smallcc = translit_and_rank_options(word,name='nondet_smallcc',fsttype='non-det',backoffs=[baby_closed_class])
    nondet_fullcc = translit_and_rank_options(word,name='nondet_fullcc',fsttype='non-det',backoffs=[baby_closed_class,augmented_closed_class])
    det['freq'] = np.nan
    det_smallcc['freq'] = np.nan
    det_fullcc['freq'] = np.nan
    nondet['freq'] = np.nan
    nondet_smallcc['freq'] = np.nan
    nondet_fullcc['freq'] = np.nan
    
    return (det,det_smallcc,det_fullcc,nondet,nondet_smallcc,nondet_fullcc,)
   
det,det_smallcc,det_fullcc,nondet,nondet_smallcc,nondet_fullcc = generate_table(word)
det = pd.DataFrame(det)
det_smallcc = pd.DataFrame(det_smallcc)
det_fullcc = pd.DataFrame(det_fullcc)
nondet = pd.DataFrame(nondet)
nondet_smallcc = pd.DataFrame(nondet_smallcc)
nondet_fullcc = pd.DataFrame(nondet_fullcc)

def merge_multiple(dfs=[det,det_smallcc,det_fullcc,nondet,nondet_smallcc,nondet_fullcc]):
    first = dfs[0]
    for df in dfs[1:]:
        first = first.merge(df,how='outer')
    
    return first.sort_values('wordmodel_score',ascending=False)[[
        'word_raw',
        'word_normalized',
        'freq',
        'translit',
        'det',
        'det_smallcc',
        'det_fullcc',
        'nondet',
        'nondet_smallcc',
        'nondet_fullcc',        
        'translit_stripped',
        'wordmodel_score',
        'charmodel_score',
        'capitalized',
        'in_langmodel',
        'subtokens',
        # 'subtokens_lowest_ties',
        ]]

merged = merge_multiple()
merged

Unnamed: 0,word_raw,word_normalized,freq,translit,det,det_smallcc,det_fullcc,nondet,nondet_smallcc,nondet_fullcc,translit_stripped,wordmodel_score,charmodel_score,capitalized,in_langmodel,subtokens
1,tielgħa,tielgħa,,طالعة,,,,طالعة,,طالعة,طالعة,-6.705869,-6.622174,False,True,2
11,tielgħa,tielgħa,,طالعة,,,,,طالعة,,طالعة,-6.705869,-6.622174,False,True,2
12,tielgħa,tielgħa,,طالعه,,,,,طالعه,,طالعه,-7.818769,-7.795122,False,True,2
0,tielgħa,tielgħa,,تلجح,تلجح,تلجح,تلجح,,,,تلجح,-8.091814,-9.504936,False,False,2
16,tielgħa,tielgħa,,طالغه,,,,,طالغه,,طالغه,-8.091814,-10.538343,False,False,2
15,tielgħa,tielgħa,,طالغة,,,,,طالغة,,طالغة,-8.091814,-10.012894,False,False,2
14,tielgħa,tielgħa,,طالغا,,,,,طالغا,,طالغا,-8.091814,-11.608114,False,False,2
13,tielgħa,tielgħa,,طالعى,,,,,طالعى,,طالعى,-8.091814,-9.582643,False,False,2
10,tielgħa,tielgħa,,طالعا,,,,,طالعا,,طالعا,-8.091814,-11.131311,False,False,2
9,tielgħa,tielgħa,,تالغى,,,,,تالغى,,تالغى,-8.091814,-10.442297,False,False,2


In [317]:
# word_hist = word_hist
# mudtdev_translit = pd.concat(word_hist['sent'].iloc[:].apply(translit_and_rank_options).values)
def translit_dataset(word_hist):
    
    detlist = []
    det_smallcclist = []
    det_fullcclist = []
    nondetlist = []
    nondet_smallcclist = []
    nondet_fullcclist = []

    for word,freq in word_hist.values[:]:
        
        det, det_smallcc, det_fullcc, nondet, nondet_smallcc, nondet_fullcc = generate_table(word)
        det['freq'] = freq
        det_smallcc['freq'] = freq
        det_fullcc['freq'] = freq
        nondet['freq'] = freq
        nondet_smallcc['freq'] = freq
        nondet_fullcc    ['freq'] = freq
        
        detlist.append(det)
        det_smallcclist.append(det_smallcc)
        det_fullcclist.append(det_fullcc)
        nondetlist.append(nondet)
        nondet_smallcclist.append(nondet_smallcc)
        nondet_fullcclist.append(nondet_fullcc)
        
    
    detlistdf = pd.DataFrame(detlist).explode(['translit','det','translit_stripped','wordmodel_score','charmodel_score','in_langmodel','subtokens'])
    det_smallcclistdf = pd.DataFrame(det_smallcclist).explode(['translit','det_smallcc','translit_stripped','wordmodel_score','charmodel_score','in_langmodel','subtokens'])
    det_fullcclistdf = pd.DataFrame(det_fullcclist).explode(['translit','det_fullcc','translit_stripped','wordmodel_score','charmodel_score','in_langmodel','subtokens'])
    nondetlistdf = pd.DataFrame(nondetlist).explode(['translit','nondet','translit_stripped','wordmodel_score','charmodel_score','in_langmodel','subtokens'])
    nondet_smallcclistdf = pd.DataFrame(nondet_smallcclist).explode(['translit','nondet_smallcc','translit_stripped','wordmodel_score','charmodel_score','in_langmodel','subtokens'])
    nondet_fullcclistdf = pd.DataFrame(nondet_fullcclist).explode(['translit','nondet_fullcc','translit_stripped','wordmodel_score','charmodel_score','in_langmodel','subtokens'])

    return merge_multiple(dfs=
                          [
detlistdf,
det_smallcclistdf,
det_fullcclistdf,
nondetlistdf,
nondet_smallcclistdf,
nondet_fullcclistdf,
                          ]
                          )

    

In [168]:
# mudt_translit = translit_dataset(mudt)

# mapa_translit = translit_dataset(mapa)

# mlrs_translit = translit_dataset(mlrs)

# sa_translit = translit_dataset(sa)


# 'mudt',len(mudt_translit),len(mudt_translit),len(mudt_translit)/len(mudt_translit['word_raw'].unique())
# 'mapa',len(mapa_translit),len(mapa_translit),len(mapa_translit)/len(mapa_translit['word_raw'].unique())
# 'mlrs',len(mlrs_translit),len(mlrs_translit),len(mlrs_translit)/len(mlrs_translit['word_raw'].unique())
# 'sa',len(sa_translit),len(sa_translit),len(sa_translit)/len(sa_translit['word_raw'].unique())

alldata_translit = translit_dataset(alldata_hist)
'alldata',len(alldata_translit),len(alldata_translit),len(alldata_translit)/len(alldata_translit['word_raw'].unique())

error detfst on: ’
error detfst on: ’
error detfst on: ’
error detfst on: ‐
error detfst on: ‐
error detfst on: ‐
error detfst on: “
error detfst on: “
error detfst on: “
error detfst on: ”
error detfst on: ”
error detfst on: ”
error detfst on: —
error detfst on: —
error detfst on: —
error detfst on: @
error detfst on: @
error detfst on: @
error detfst on: }
error detfst on: }
error detfst on: }
error detfst on: {
error detfst on: {
error detfst on: {
error detfst on: §
error detfst on: §
error detfst on: §
error detfst on: soċjeta`
error detfst on: soċjeta`
error detfst on: soċjeta`
error detfst on: ×
error detfst on: ×
error detfst on: ×
error detfst on: snipped_english_sentence
error detfst on: snipped_english_sentence
error detfst on: snipped_english_sentence
error detfst on: socjeta`
error detfst on: socjeta`
error detfst on: socjeta`
error detfst on: ­
error detfst on: ­
error detfst on: ­
error detfst on: proprjeta`
error detfst on: proprjeta`
error detfst on: proprjeta`
error d

('alldata', 1878763, 1878763, 59.738092209856916)

In [171]:
alldata_translit.drop_duplicates(['word_raw','translit']).replace(np.nan,'<nan>').to_csv('transliterations/all_transliterated_tuples.tsv',sep='\t',index=False)

In [190]:
# pd.read_csv('transliterations/all_transliterated_tuples.tsv',sep='\t')
pd.read_csv('transliterations/all_transliterated_tuples.tsv',sep='\t').replace(np.nan,'').replace('<nan>',np.nan)

Unnamed: 0,word_raw,word_normalized,freq,translit,det,det_smallcc,det_fullcc,nondet,nondet_smallcc,nondet_fullcc,translit_stripped,wordmodel_score,charmodel_score,capitalized,in_langmodel,subtokens
0,iii,iii,27,,,,,,,,,-2.061620,-4.263644,False,True,0
1,᾽,᾽,1,,,,,,,,,-2.061620,-4.263644,False,False,0
2,OOO,OOO,1,,,,,,,,,-2.061620,-4.263644,True,True,0
3,UE,UE,337,,,,,,,,,-2.061620,-4.263644,True,True,0
4,ai,ai,35,,,,,,,,,-2.061620,-4.263644,False,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1878758,ġaladarba,ġaladarba,3,جا لاداربا,,,,جا لاداربا,,جا لاداربا,جا لاداربا,-11.599415,-15.974298,False,False,4
1878759,Ġaladarba,Ġaladarba,2,جا لاداربا,,,,جا لاداربا,,جا لاداربا,جا لاداربا,-11.599415,-15.974298,True,False,4
1878760,talanqas,talanqas,1,تاع الانقص,,,,تاع الانقص,,تاع الانقص,تاع الانقص,-12.839350,-16.684526,False,False,4
1878761,bħallikieku,bħallikieku,22,بحال اللي كيكو,,,,بحال اللي كيكو,,بحال اللي كيكو,بحال اللي كيكو,-16.675682,-23.698778,False,False,4


In [None]:
mudt_translit.to_csv('transliterations/mudt_transliterated_tuples.tsv',sep='\t',index=False)

In [None]:
mudt_translit.to_csv('transliterations/mudt_transliterated_tuples.tsv',sep='\t',index=False,nan)
mapa_translit.to_csv('transliterations/mapa_transliterated_tuples.tsv',sep='\t',index=False)
mlrs_translit.to_csv('transliterations/mlrs_transliterated_tuples.tsv',sep='\t',index=False)
sa_translit.to_csv('transliterations/sa_transliterated_tuples.tsv',sep='\t',index=False)