# Dictionary models

Here we evaluate the dictionary model.

It does not need training, so it is tested on the entire dataset.

In [1]:
import re
from dictionary_model import DictionaryTranslation

import sys
sys.path.append("../")
import Data
from translation_utils import compute_metrics, print_example

## Entire Dataset, BlankSpace tokenizer

In [2]:
title_re = re.compile(r"[^a-zA-Z ]*((mr)|(ms)|(mrs)|(miss))[^a-zA-Z ]*")
remove_chars = re.compile(r"[126\[\],<>]")
space_norm = re.compile(r" +")
add_unk = re.compile(r"\?")
dash = re.compile(r"(?<=[a-zA-Z0-9])-(?=[a-zA-Z0-9])")

def clean_english(x: str) -> str:
    x = x.lower()
    x = title_re.sub(" ", x)
    x = dash.sub(" ", x)
    x = remove_chars.sub(" ", x)
    x = add_unk.sub(" ", x)
    x = space_norm.sub(" ", x)
    return x.strip()

def clean_etruscan(x: str) -> str:
    x = x.lower()
    x = remove_chars.sub(" ", x)
    x = space_norm.sub(" ", x)
    return x.strip()

In [3]:
et, eng = Data.load_translation_dataset(etruscan_fn=clean_etruscan, english_fn=clean_english)

In [4]:
model = DictionaryTranslation(Data._dir + "ETP_POS.csv", etruscan_fn=clean_etruscan, english_fn=clean_english)
print("Vocabulary size:", len(model.dict))

Vocabulary size: 821


In [5]:
translations = [model.predict(i) for i in et]
scores = compute_metrics(translations, eng)
scores

{'bleu': 0.16741008721357548,
 'chrf': 9.120301935355664,
 'ter': 89.79876701174828}

In [6]:
print_example((et[:5], eng[:5], translations[:5]))

------------------------------------------------------------
Etruscan: cleusinas laris larisal clan
Reference: laris cleusinas son of laris
Prediction: cleusinas   son
------------------------------------------------------------
------------------------------------------------------------
Etruscan: mi karkanas thahvna
Reference: i am the container of karkana
Prediction: i karkana container
------------------------------------------------------------
------------------------------------------------------------
Etruscan: mi ei minpi capi mi numar thevruclnas qupes fulushla
Reference: dont take me i am nunar i am the property of qupe thevrudnas the son of fulu
Prediction: i not me take i  thevruclnas qupe 
------------------------------------------------------------
------------------------------------------------------------
Etruscan: mini muluvanice tetana velkasnas veleliiasi
Reference: tetana velkasnas gave me to velellia
Prediction: me gave tetana velkasnas veleliia
-----------------

## ETP only, BlankSpace Tokenizer

In [7]:
et_etp, eng_etp = Data.load_translation_dataset(subset="etp", etruscan_fn=clean_etruscan, english_fn=clean_english)

In [8]:
model_etp = DictionaryTranslation(Data._dir + "ETP_POS.csv", etruscan_fn=clean_etruscan, english_fn=clean_english)

In [9]:
translations_etp = [model_etp.predict(i) for i in et_etp]
scores_etp = compute_metrics(translations_etp, eng_etp)
scores_etp

{'bleu': 4.505159738640008,
 'chrf': 40.77125596383003,
 'ter': 68.13450760608487}

In [10]:
print_example((et_etp[:5], eng_etp[:5], translations_etp[:5]))

------------------------------------------------------------
Etruscan: cleusinas laris larisal clan
Reference: laris cleusinas son of laris
Prediction: cleusinas   son
------------------------------------------------------------
------------------------------------------------------------
Etruscan: mi karkanas thahvna
Reference: i am the container of karkana
Prediction: i karkana container
------------------------------------------------------------
------------------------------------------------------------
Etruscan: mi ei minpi capi mi numar thevruclnas qupes fulushla
Reference: dont take me i am nunar i am the property of qupe thevrudnas the son of fulu
Prediction: i not me take i  thevruclnas qupe 
------------------------------------------------------------
------------------------------------------------------------
Etruscan: mini muluvanice tetana velkasnas veleliiasi
Reference: tetana velkasnas gave me to velellia
Prediction: me gave tetana velkasnas veleliia
-----------------

## CIEP only, BlankSpace Tokenizer

In [11]:
et_ciep, eng_ciep = Data.load_translation_dataset(subset="ciep", etruscan_fn=clean_etruscan, english_fn=clean_english)

In [12]:
model_ciep = DictionaryTranslation(Data._dir + "ETP_POS.csv", etruscan_fn=clean_etruscan, english_fn=clean_english)

In [13]:
translations_ciep = [model_ciep.predict(i) for i in et_ciep]
scores_ciep = compute_metrics(translations_ciep, eng_ciep)
scores_ciep

{'bleu': 2.4033683935548834e-06,
 'chrf': 1.896459864075202,
 'ter': 98.67191342843088}

In [14]:
print_example((et_ciep[:5], eng_ciep[:5], translations_ciep[:5]))

------------------------------------------------------------
Etruscan: tularspu
Reference: boundaries
Prediction: 
------------------------------------------------------------
------------------------------------------------------------
Etruscan: visl
Reference: of the goddess vipsi
Prediction: 
------------------------------------------------------------
------------------------------------------------------------
Etruscan: atichu
Reference: built
Prediction: 
------------------------------------------------------------
------------------------------------------------------------
Etruscan: cneunas
Reference: cneunas
Prediction: 
------------------------------------------------------------
------------------------------------------------------------
Etruscan: thusathur
Reference: herethe bones
Prediction: 
------------------------------------------------------------


## ETP only, Remove suffixes

In [15]:
suffix_tokenizer = Data.SuffixTokenizer()
def suffix_tokenize(x):
    return suffix_tokenizer(x)[0]

print("Terminal suffixes:", len(suffix_tokenizer._terminal_suffixes))
print("Non-terminal suffixes:", len(suffix_tokenizer._non_terminal_suffixes))

Terminal suffixes: 169
Non-terminal suffixes: 9


In [16]:
model_etp_no_suff = DictionaryTranslation(Data._dir + "ETP_POS.csv", etruscan_fn=clean_etruscan, english_fn=clean_english, tokenize_dictionary=suffix_tokenize, tokenizer=suffix_tokenizer)

In [17]:
translations_etp_no_suff = [model_etp_no_suff.predict(i) for i in et_etp]
scores_etp_no_suff = compute_metrics(translations_etp_no_suff, eng_etp)
scores_etp_no_suff

{'bleu': 1.6054153572561898,
 'chrf': 37.669248578243696,
 'ter': 82.66613290632506}

In [18]:
print_example((et_etp[:5], eng_etp[:5], translations_etp_no_suff[:5]))

------------------------------------------------------------
Etruscan: cleusinas laris larisal clan
Reference: laris cleusinas son of laris
Prediction: cleusinas          son
------------------------------------------------------------
------------------------------------------------------------
Etruscan: mi karkanas thahvna
Reference: i am the container of karkana
Prediction:   karkana  container 
------------------------------------------------------------
------------------------------------------------------------
Etruscan: mi ei minpi capi mi numar thevruclnas qupes fulushla
Reference: dont take me i am nunar i am the property of qupe thevrudnas the son of fulu
Prediction:   this  me pisna take pisna   numna constructed thevruclnas  qupe  fulu 
------------------------------------------------------------
------------------------------------------------------------
Etruscan: mini muluvanice tetana velkasnas veleliiasi
Reference: tetana velkasnas gave me to velellia
Prediction:    g