# Demo

This notebook demonstrates how to tokenise Maltese text and then transliterate and/or translate it token by token, optionally choosing between the two based on each token's predicted label.

In [1]:
import requests

text = "Il-karozza Porsche tal-2022 għandha speed fenomenali!"

# Tokenise the sentence with the remote MLRS tokeniser service.
# A timeout keeps the notebook from hanging if the service stalls, and
# raise_for_status() surfaces HTTP errors directly instead of letting them
# show up later as a JSON decoding error or a missing "result" key.
response = requests.get(
    "https://mlrs.research.um.edu.mt/tools/mlrsapi/tokenise",
    params={"text": text},
    timeout=30,
)
response.raise_for_status()
tokens = response.json()["result"]
tokens

['Il-',
 'karozza',
 'Porsche',
 'tal-',
 '2022',
 'għandha',
 'speed',
 'fenomenali',
 '!']

## Transliteration

In [2]:
from transliterate import transliterate_sequence
import token_rankers

# Mapping files passed to the transliterator, in the order given here.
token_mappings = ["token_mappings/small_closed_class.map", "token_mappings/additional_closed_class.map"]

# Deliberately NOT named `token_rankers`: that would rebind the imported
# `token_rankers` module to a list, so later code in the kernel could no
# longer reach the module through its own name.
rankers = [
    token_rankers.WordModelScoreRanker("../models/aggregated_country/lm/word/tn-maghreb.arpa"),
    token_rankers.CharacterModelScoreRanker("../models/aggregated_country/lm/char/tn-maghreb.arpa"),
]

# One transliterated string per input token.
transliteration = transliterate_sequence(tokens, token_mappings, rankers)
transliteration

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
Loading the LM will be faster if you build a binary file.
Reading /mnt/c/Users/Kurt/Repositories/external/malti_arabi_fst/models/aggregated_country/lm/word/tn-maghreb.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Loading the LM will be faster if you build a binary file.
Reading /mnt/c/Users/Kurt/Repositories/external/malti_arabi_fst/models/aggregated_country/lm/char/tn-maghreb.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************


['ال', 'كردزة', 'برسكهي', 'تاع ال', '٢٠٢٢', 'عندها', 'صباد', 'فنمنلي', '!']

## Translation

In [3]:
from translate import translate_token

# Maltese -> Arabic, one translate_token call per token (each token is passed on its own).
translation_ar = [
    translate_token(word, "mt-ar")
    for word in tokens
]
translation_ar

['ال', 'ترام', 'بورشه', 'ل', '2022', 'هو', 'سرعة', 'هائل', '!']

In [4]:
# Maltese -> Italian, one translate_token call per token (each token is passed on its own).
translation_it = [
    translate_token(word, "mt-it")
    for word in tokens
]
translation_it

['IL', 'tram', 'Porsche', 'Di', '2022', 'Esso', 'velocità', 'fenomenale', '!']

In [5]:
# Maltese -> English, one translate_token call per token (each token is passed on its own).
translation_en = [
    translate_token(word, "mt-en")
    for word in tokens
]
translation_en

['The', 'streetcar', 'Porsche', 'of', '2022', 'it', 'speed', 'phenomenal', '!']

## Conditional Transliteration/Translation

In [6]:
import pickle
from etymology_classification import featurise

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a model.pickle that comes from a trusted source.
with open("etymology_data/model.pickle", "rb") as model_file:
    model = pickle.load(model_file)

# predict() is given a one-element batch (the featurised token sequence),
# so element 0 of its result holds the per-token labels for that sequence.
labels = model.predict([featurise(tokens)])[0]
labels.tolist()

Loading the LM will be faster if you build a binary file.
Reading /mnt/c/Users/Kurt/Repositories/external/malti_arabi_fst/models/aggregated_country/lm/word/tn-maghreb.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Loading the LM will be faster if you build a binary file.
Reading /mnt/c/Users/Kurt/Repositories/external/malti_arabi_fst/models/aggregated_country/lm/char/tn-maghreb.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************


['Arabic',
 'Non-Arabic',
 'Name',
 'Arabic',
 'Symbol',
 'Arabic',
 'Code-Switching',
 'Non-Arabic',
 'Symbol']

In [7]:
# Transliterate tokens labelled "Arabic"; pass every other token through unchanged.
transliteration_pass = [
    transliteration[idx] if tag in ("Arabic",) else tokens[idx]
    for idx, tag in enumerate(labels)
]
transliteration_pass

['ال',
 'karozza',
 'Porsche',
 'تاع ال',
 '2022',
 'عندها',
 'speed',
 'fenomenali',
 '!']

In [8]:
# Tokens labelled "Arabic" or "Symbol" take the transliteration;
# all remaining tokens take the Arabic translation.
transliteration_translation_ar = [
    transliteration[idx] if tag in ("Arabic", "Symbol",) else translation_ar[idx]
    for idx, tag in enumerate(labels)
]
transliteration_translation_ar

['ال', 'ترام', 'بورشه', 'تاع ال', '٢٠٢٢', 'عندها', 'سرعة', 'هائل', '!']

In [9]:
# Per token, pick which parallel sequence supplies the output:
#   "Arabic"          -> transliteration
#   "Code-Switching"  -> original token, left untouched
#   anything else     -> Italian translation
transliteration_translation_it = []
for idx, tag in enumerate(labels):
    if tag in ("Arabic",):
        source = transliteration
    elif tag in ("Code-Switching",):
        source = tokens
    else:
        source = translation_it
    transliteration_translation_it.append(source[idx])
transliteration_translation_it

['ال',
 'tram',
 'Porsche',
 'تاع ال',
 '2022',
 'عندها',
 'speed',
 'fenomenale',
 '!']

In [10]:
# Per token, pick which parallel sequence supplies the output:
#   "Arabic"          -> transliteration
#   "Code-Switching"  -> original token, left untouched
#   anything else     -> English translation
transliteration_translation_en = []
for idx, tag in enumerate(labels):
    if tag in ("Arabic",):
        source = transliteration
    elif tag in ("Code-Switching",):
        source = tokens
    else:
        source = translation_en
    transliteration_translation_en.append(source[idx])
transliteration_translation_en

['ال',
 'streetcar',
 'Porsche',
 'تاع ال',
 '2022',
 'عندها',
 'speed',
 'phenomenal',
 '!']

## Comparison

In [11]:
import pandas as pd

# Side-by-side view: one row per token, one column per processing strategy.
comparison_columns = {
    "tokens": tokens,
    "labels": labels,
    "transliteration": transliteration,
    "translation_ar": translation_ar,
    "translation_it": translation_it,
    "translation_en": translation_en,
    "transliteration+pass": transliteration_pass,
    "transliteration+translation_ar": transliteration_translation_ar,
    "transliteration+translation_it": transliteration_translation_it,
    "transliteration+translation_en": transliteration_translation_en,
}
pd.DataFrame(comparison_columns)

Unnamed: 0,tokens,labels,transliteration,translation_ar,translation_it,translation_en,transliteration+pass,transliteration+translation_ar,transliteration+translation_it,transliteration+translation_en
0,Il-,Arabic,ال,ال,IL,The,ال,ال,ال,ال
1,karozza,Non-Arabic,كردزة,ترام,tram,streetcar,karozza,ترام,tram,streetcar
2,Porsche,Name,برسكهي,بورشه,Porsche,Porsche,Porsche,بورشه,Porsche,Porsche
3,tal-,Arabic,تاع ال,ل,Di,of,تاع ال,تاع ال,تاع ال,تاع ال
4,2022,Symbol,٢٠٢٢,2022,2022,2022,2022,٢٠٢٢,2022,2022
5,għandha,Arabic,عندها,هو,Esso,it,عندها,عندها,عندها,عندها
6,speed,Code-Switching,صباد,سرعة,velocità,speed,speed,سرعة,speed,speed
7,fenomenali,Non-Arabic,فنمنلي,هائل,fenomenale,phenomenal,fenomenali,هائل,fenomenale,phenomenal
8,!,Symbol,!,!,!,!,!,!,!,!
