# Prepare data

This notebook is used to train the tokenizers and to create the dataset used to train the NN models.

## Tokenizer

Steps:
- read the files
- clean
- make word list (Etruscan and English)
- use additional English data (from nltk)
- write the files
- train the tokenizers

In [1]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np

import sys
sys.path.append("../")
sys.path.append("../../")
import Data
import re
import utils

In [2]:
title_re = re.compile(r"[^a-zA-Z ]*((mr)|(ms)|(mrs)|(miss))[^a-zA-Z ]*")
remove_chars = re.compile(r"[126\[\],<>]")
space_norm = re.compile(r" +")
add_unk = re.compile(r"\?")
non_word = re.compile(r"[^a-zA-Z ]")

def clean_english(x: str) -> str:
    x = x.lower()
    x = title_re.sub(" ", x)
    x = remove_chars.sub(" ", x)
    # x = add_unk.sub("<unk>", x)
    x = add_unk.sub(" ", x) # Remove ? from training data -> ? to <unk>
    x = space_norm.sub(" ", x)
    return x.strip()

def clean_etruscan(x: str) -> str:
    x = x.lower()
    x = remove_chars.sub(" ", x)
    x = space_norm.sub(" ", x)
    return x.strip()

In [3]:
et, eng = Data.load_translation_dataset(etruscan_fn=clean_etruscan, english_fn=clean_english)

In [4]:
# Sanity check
et_chars = list(set("".join(et)))
et_chars = sorted(et_chars)
print("".join(et_chars))

 -abcdefghiklmnopqrstuvwxyz


In [5]:
# Sanity check
eng_chars = list(set("".join(eng)))
eng_chars = sorted(eng_chars)
print("".join(eng_chars))

 -abcdefghijklmnopqrstuvwxyz


In [None]:
etruscan_tokenizer = Data.SentencePieceTokenizer(Data.ETRUSCAN)
english_tokenizer = Data.SentencePieceTokenizer(Data.ENGLISH)

In [7]:
with open(Data._dir + "etruscan_tokenizer_data.txt", "wt") as f:
    for i in et:
        f.write(i)
        f.write("\n")

In [8]:
# TODO Integrate with Brown or https://github.com/cltk/grc_text_perseus/tree/master/cltk_json
with open(Data._dir + "english_tokenizer_data.txt", "wt") as f:
    for i in eng:
        f.write(i)
        f.write("\n")

In [9]:
etruscan_tokenizer.train(Data._dir + "etruscan_tokenizer_data.txt", Data._dir + "etruscan", vocab_size=1927)
english_tokenizer.train(Data._dir + "english_tokenizer_data.txt", Data._dir + "english")

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input_format: 
  model_prefix: /media/gianluca/Shared/DKE/MasterThesis/Translation/IthacaLike/../../Data/etruscan_word
  model_type: UNIGRAM
  vocab_size: 1927
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: ▁
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 3
  bos_id: 1
  eos_id: 2
  pad_id: 0
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface: <unk>
  enable_differ

In [10]:
english_tokenizer.load(Data._dir + "english")
etruscan_tokenizer.load(Data._dir + "etruscan")

<Data.tokenizers.SentencePieceTokenizer at 0x7f78d93cbed0>

In [11]:
tmp_chars, tmp_words = english_tokenizer.tokenize(["test", "other test", "another one", "?", "\n"], align=False)
tmp_chars

[[1, 4, 8, 5, 7, 8, 2],
 [1, 4, 13, 8, 11, 5, 12, 4, 8, 5, 7, 8, 2],
 [1, 4, 6, 10, 13, 8, 11, 5, 12, 4, 13, 10, 5, 2],
 [1, 4, 3, 2],
 [1, 2]]

In [12]:
english_tokenizer.detokenize(tmp_words)

['test', 'other test', 'another one', '<unk>', '']

In [13]:
align_chars, align_words = english_tokenizer.tokenize(["test", "other test", "another one", "?", "\n"], align=True)
_, no_align_words = english_tokenizer.tokenize(["test", "other test", "another one", "?", "\n"], align=False)

In [14]:
print(align_words)
print(no_align_words)

[[1, 4, 247, 247, 247, 12, 2], [1, 4, 45, 7, 7, 7, 24, 4, 247, 247, 247, 12, 2], [1, 4, 1197, 1197, 1197, 7, 7, 7, 24, 4, 13, 13, 5, 2], [1, 4, 3, 2], [1, 2]]
[[1, 4, 247, 12, 2], [1, 4, 45, 7, 24, 4, 247, 12, 2], [1, 4, 1197, 7, 24, 4, 13, 5, 2], [1, 4, 3, 2], [1, 2]]


In [15]:
print(english_tokenizer.detokenize(align_words))
print(english_tokenizer.detokenize(no_align_words))
print(english_tokenizer.detokenize(align_chars, word=False))

['testestest', 'othethether testestest', 'anoanoanothethether onone', '<unk>', '']
['test', 'other test', 'another one', '<unk>', '']
['test', 'other test', 'another one', '<unk>', '']


In [16]:
tmp_chars, tmp_words = etruscan_tokenizer.tokenize(["larth", "mini mlakas", "questo è in italiano"], align=False)
tmp_words

[[1, 4, 34, 2],
 [1, 4, 71, 4, 522, 2],
 [1, 4, 505, 23, 112, 4, 3, 4, 39, 4, 207, 5, 104, 5, 355, 2]]

In [17]:
etruscan_tokenizer.detokenize(tmp_words)

['larth', 'mini mlakas', 'questo <unk> in italiano']

## Input tokenizer with multiple languages

In [11]:
multi_lang_tokenizer = Data.SentencePieceTokenizer(Data.ENGLISH)
extended_english_tokenizer = Data.SentencePieceTokenizer(Data.ENGLISH)

In [6]:
with open(Data._dir + "latin_tokenizer_data.txt", "rt") as f:
    lat = f.readlines()
lat = [i.strip().lower() for i in lat]
with open(Data._dir + "greek_tokenizer_data.txt", "rt") as f:
    grk = f.readlines()
grk = [i.strip().lower() for i in grk]

with open(Data._dir + "english_l_tokenizer_data.txt", "rt") as f:
    eng_l = f.readlines()
eng_l = [i.strip().lower() for i in eng_l]
with open(Data._dir + "english_g_tokenizer_data.txt", "rt") as f:
    eng_g = f.readlines()
eng_g = [i.strip().lower() for i in eng_g]

In [None]:
multi_lang_tokenizer.train(et + lat + grk, Data._dir + "multi", vocab_size = 4000)
extended_english_tokenizer.train(eng + eng_l + eng_g, Data._dir + "extended_english")

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input_format: 
  model_prefix: /media/gianluca/Shared/DKE/MasterThesis/Translation/IthacaLike/../../Data/multi_word
  model_type: UNIGRAM
  vocab_size: 4000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: ▁
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 3
  bos_id: 1
  eos_id: 2
  pad_id: 0
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface: <unk>
  enable_different

## One tokenizer for everything

Use all the previous data and the Tatoeba

In [7]:
import unicodedata

In [8]:
# v2021-08-07: the train split does not contain Greek text
(t_train_grc_target,
    t_train_grc_source,
    t_test_grc_target, 
    t_test_grc_source, 
    t_dev_grc_target, 
    t_dev_grc_source
) = Data.load_tatoeba(Data._dir + "Tatoeba/data/release/v2021-08-07/eng-grc")

(t_train_lat_target,
    t_train_lat_source,
    t_test_lat_target, 
    t_test_lat_source, 
    t_dev_lat_target, 
    t_dev_lat_source
) = Data.load_tatoeba(Data._dir + "Tatoeba/data/release/v2021-08-07/eng-lat")

tatoeba_greek = t_test_grc_source + t_dev_grc_source
tatoeba_latin = t_train_lat_source + t_test_lat_source + t_dev_lat_source
tatoeba_english = t_train_lat_target + t_test_lat_target + t_dev_lat_target + t_test_grc_target + t_dev_grc_target

In [9]:
def clean_tatoeba(x: str) -> str:
    x = x.lower()
    
    # Split accents, remove and recombine
    x = unicodedata.normalize('NFD', x)
    x = ''.join(c for c in x if not unicodedata.combining(c))
    x = unicodedata.normalize('NFC', x)
    
    x = x.translate(utils.greek_to_latin)
    x = x.translate(utils.others)
    
    x = non_word.sub(" ", x) # This also remove punctuation
    x = space_norm.sub(" ", x)
    x = x.strip()
    return x

In [10]:
tatoeba_greek_clean = [clean_tatoeba(i) for i in tatoeba_greek]
tatoeba_latin_clean = [clean_tatoeba(i) for i in tatoeba_latin]
tatoeba_english_clean = [clean_tatoeba(i) for i in tatoeba_english]

In [11]:
tatoeba_greek_chars = sorted(list(set("".join(tatoeba_greek_clean))))
print("".join(tatoeba_greek_chars))

tatoeba_latin_chars = sorted(list(set("".join(tatoeba_latin_clean))))
print("".join(tatoeba_latin_chars))

tatoeba_english_chars = sorted(list(set("".join(tatoeba_english_clean))))
print("".join(tatoeba_english_chars))

 abcdefghijklmnoprstuvxyz
 abcdefghijklmnopqrstuvwxyz
 abcdefghijklmnopqrstuvwxyz


In [12]:
all_greek = tatoeba_greek_clean + grk
all_latin = tatoeba_latin_clean + lat
all_english = tatoeba_english_clean + eng_g + eng_l + eng
all_lang = et + all_greek + all_latin + all_english

In [16]:
all_chars = sorted(list(set("".join(all_lang))))
print("".join(all_chars))

 -abcdefghijklmnopqrstuvwxyz


In [17]:
vocab_size = len(set(" ".join(all_lang).split()))
print(vocab_size)

vocab_size = len(set(" ".join(all_greek).split()))
print(vocab_size)

vocab_size = len(set(" ".join(all_latin).split()))
print(vocab_size)

vocab_size = len(set(" ".join(all_english).split()))
print(vocab_size)

vocab_size = len(set(" ".join(et).split()))
print(vocab_size)

486060
99700
226776
201028
3051


In [18]:
big_english_tokenizer = Data.SentencePieceTokenizer(Data.ETRUSCAN)
big_english_tokenizer.train(all_english, name = Data._dir + "all_english", vocab_size=102792) # vocab size: set based on the error messages

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input_format: 
  model_prefix: /media/gianluca/Shared/DKE/MasterThesis/Translation/IthacaLike/../../Data/all_english_word
  model_type: UNIGRAM
  vocab_size: 102792
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: ▁
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 3
  bos_id: 1
  eos_id: 2
  pad_id: 0
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface: <unk>
  enable_d

In [53]:
big_tokenizer = Data.SentencePieceTokenizer(Data.ETRUSCAN)
big_tokenizer.train(all_lang, name = Data._dir + "all", vocab_size=262104) # vocab size: set based on the error messages

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input_format: 
  model_prefix: /media/gianluca/Shared/DKE/MasterThesis/Translation/IthacaLike/../../Data/all_word
  model_type: UNIGRAM
  vocab_size: 262104
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: ▁
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 3
  bos_id: 1
  eos_id: 2
  pad_id: 0
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface: <unk>
  enable_different

In [18]:
big_tokenizer = Data.SentencePieceTokenizer(Data.ETRUSCAN)
big_tokenizer.load(Data._dir + "all")

<Data.tokenizers.SentencePieceTokenizer at 0x7fc7f5b25510>

In [19]:
print(big_tokenizer.tokenize(["abcdedfghijklmnopqrstuvwxyz-§"], align=False)[0])
print(big_tokenizer._sp_chars.unk_id())
print(big_tokenizer.tokenize(["abcdedfghijklmnopqrstuvwxyz-§"], align=False)[0][0].count(3))

[[1, 4, 8, 23, 18, 16, 5, 16, 20, 21, 15, 6, 29, 27, 17, 14, 11, 10, 19, 26, 12, 9, 7, 13, 24, 25, 28, 22, 30, 3, 2]]
3
1


In [20]:
print(big_tokenizer.tokenize(["---§§§"], align=False)[0][0])

[1, 4, 3, 2]


In [21]:
big_tokenizer.detokenize(big_tokenizer.tokenize(["this is some english text", "larth", "ellas", "cane canem"], align=False)[1], word=True)

['this is some english text', 'larth', 'ellas', 'cane canem']

In [22]:
big_tokenizer.detokenize(big_tokenizer.tokenize(["testo in italiano"], align=False)[1], word=True)

['testo in italiano']

In [13]:
small_tokenizer = Data.SentencePieceTokenizer(Data.ETRUSCAN)
small_tokenizer.train(all_lang, name = Data._dir + "all_small", vocab_size=10000) # vocab size: set based on the error messages

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input_format: 
  model_prefix: /media/gianluca/Shared/DKE/MasterThesis/Translation/IthacaLike/../../Data/all_small_word
  model_type: UNIGRAM
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: ▁
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 3
  bos_id: 1
  eos_id: 2
  pad_id: 0
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface: <unk>
  enable_diff

In [15]:
small_tokenizer = Data.SentencePieceTokenizer(Data.ETRUSCAN)
small_tokenizer.load(Data._dir + "all_small")
print(small_tokenizer.tokenize(["abcdedfghijklmnopqrstuvwxyz-§"], align=False)[0])
print(small_tokenizer._sp_chars.unk_id())
print(small_tokenizer.tokenize(["abcdedfghijklmnopqrstuvwxyz-§"], align=False)[0][0].count(3))
print(small_tokenizer.detokenize(small_tokenizer.tokenize(["this is some english text", "larth", "ellas", "cane canem"], align=False)[1], word=True))

[[1, 4, 8, 23, 18, 16, 5, 16, 20, 21, 15, 6, 29, 27, 17, 14, 11, 10, 19, 26, 12, 9, 7, 13, 24, 25, 28, 22, 30, 3, 2]]
3
1
['this is some english text', 'larth', 'ellas', 'cane canem']


## Datasets

Steps:
- read the data
- clean
- tokenize
- ~~save the data (pickle? / tf)~~
- move the functions/classes/etc... to `train.py`

Dataset structure:
- train
    - inputs
    - inputs_char
    - inputs_word
    - targets
    - targets_char
    - targets_word
- test
    - inputs
    - inputs_char
    - inputs_word
    - targets
    - targets_char
    - targets_word

In [18]:
import jax
import jax.numpy as jnp
import datasets
import numpy as np
from typing import Tuple, List, Optional, Union, Dict, Iterator

In [19]:
tmp = pd.read_csv(Data._dir + "Etruscan.csv", index_col=0).dropna(subset=["Translation"]).reset_index()
is_etp = tmp["key"].isna().to_list()

In [20]:
et_encoded_char, et_encoded_word = etruscan_tokenizer.tokenize(et)
# et_encoded_char = etruscan_tokenizer.pad_sequences(et_encoded_char)
# et_encoded_word = etruscan_tokenizer.pad_sequences(et_encoded_word)

eng_encoded_char, eng_encoded_word = english_tokenizer.tokenize(eng)
# eng_encoded_char = english_tokenizer.pad_sequences(eng_encoded_char)
# eng_encoded_word = english_tokenizer.pad_sequences(eng_encoded_word)

In [21]:
ds = datasets.Dataset.from_dict(
    {     
    "inputs": et,
    "inputs_chars": et_encoded_char,
    "inputs_words": et_encoded_word,
    "targets": eng,
    "targets_chars": eng_encoded_char,
    "targets_words": eng_encoded_word,
    "is_etp": is_etp
    },
)
# ds = ds.with_format("jax", device=str(jax.devices()[0]))

In [22]:
split = ds.train_test_split(train_size=0.9, seed=0)

In [23]:
class DataLoader:
    """
    Create the batched and iterate through the datasets
    """

    def __init__(
        self, ds: datasets.Dataset, batch_size: int, cached: bool = False
    ) -> None:
        """
        Args:
            ds: dataset
            batch_size: size of the batches
            cached: whether to immediatly create and store the batched
        """
        self.ds: datasets.Dataset = ds
        self.batch_size: int = batch_size
        self.cached: bool = cached
        # init cache
        iter(self)

    def __next__(self) -> Dict[str, Union[List[str], jax.Array]]:
        d = next(self.iterator)
        if self.cached:
            return d
        return self.make_batch(d)

    def __iter__(self) -> Iterator:
        self.iterator: Iterator = self.ds.iter(self.batch_size)
        if self.cached:
            self.iterator = [self.make_batch(i) for i in self.iterator]
            self.iterator = iter(self.iterator)
        return self

    def make_batch(
        self, d: Dict[str, Union[List[str], List[List[int]]]]
    ) -> Dict[str, Union[List[str], jax.Array]]:
        """
        Create a batch with jax Arrays.

        Args:
            d: batch as a dictionary

        Returns:
            Dict: batch as jax Arrays
        """
        inputs_chars, inputs_words = self.pad(d["inputs_chars"], d["inputs_words"])
        targets_chars, targets_words = self.pad(d["targets_chars"], d["targets_words"])
        return {
            # "inputs": d["inputs"],
            "inputs_chars": inputs_chars,
            "inputs_words": inputs_words,
            # "targets": d["targets"],
            "targets_chars": targets_chars,
            "targets_words": targets_words,
            # "is_etp": d["is_etp"]
        }

    def pad(self, l: List[List[int]], o: Optional[List[List[int]]]=None) -> Union[jax.Array, Tuple[jax.Array, jax.Array]]:
        """
        Pad sequences and cast then to a jax Array.
        Pad with 0

        Args:
            l: list of sequences
            o: other sequence if needed
        Returns:
            jax Array
        """
        l_lens = [len(i) for i in l]
        max_l = max(l_lens)
        if o is not None:
            o_lens = [len(i) for i in o]
            max_l = max(max_l, max(o_lens))

        new_l = jnp.array([np.pad(i, (0, max_l - j)) for i, j in zip(l, l_lens)])
        if o is not None:
            new_o = jnp.array([np.pad(i, (0, max_l - j)) for i, j in zip(l, l_lens)])
            return new_l, new_o
        return new_l

In [24]:
def get_training_data(etruscan_csv: str, etruscan_model: str, english_model: str, batch_size: int, train_size: float=0.9, cached: bool=False):
    # Load & clean
    title_re = re.compile(r"[^a-zA-Z ]*((mr)|(ms)|(mrs)|(miss))[^a-zA-Z ]*")
    remove_chars = re.compile(r"[126\[\],<>]")
    space_norm = re.compile(r" +")
    add_unk = re.compile(r"\?")

    def clean_english(x: str) -> str:
        x = x.lower()
        x = title_re.sub("", x)
        x = remove_chars.sub("", x)
        # x = add_unk.sub("<unk>", x)
        x = add_unk.sub(" ", x) # Remove ? from training data -> ? to <unk>
        x = space_norm.sub(" ", x)
        return x.strip()

    def clean_etruscan(x: str) -> str:
        x = x.lower()
        x = remove_chars.sub("", x)
        x = space_norm.sub(" ", x)
        return x.strip()
    
    et, eng = Data.load_translation_dataset(etruscan_fn=clean_etruscan, english_fn=clean_english)
    tmp = pd.read_csv(etruscan_csv, index_col=0).dropna(subset=["Translation"]).reset_index()
    is_etp = tmp["key"].isna().to_list()
    
    # Tokenizers
    etruscan_tokenizer = Data.SentencePieceTokenizer(Data.ETRUSCAN)
    english_tokenizer = Data.SentencePieceTokenizer(Data.ENGLISH)
    etruscan_tokenizer.load(etruscan_model)
    english_tokenizer.load(english_model)
    
    # Tokenization
    et_encoded_char, et_encoded_word = etruscan_tokenizer.tokenize(et)
    eng_encoded_char, eng_encoded_word = english_tokenizer.tokenize(eng)

    # Dataset & Split
    ds = datasets.Dataset.from_dict(
        {     
        "inputs": et,
        "inputs_chars": et_encoded_char,
        "inputs_words": et_encoded_word,
        "targets": eng,
        "targets_chars": eng_encoded_char,
        "targets_words": eng_encoded_word,
        "is_etp": is_etp
        },
    )
    split = ds.train_test_split(train_size=train_size)

    train = split["train"]
    test = split["test"]

    train_dl = DataLoader(train, batch_size=batch_size, cached=cached)
    test_dl = DataLoader(test, batch_size=batch_size, cached=cached)
    return train_dl, test_dl, etruscan_tokenizer, english_tokenizer
    

In [25]:
train_dl, test_dl, etruscan_tokenizer, english_tokenizer = get_training_data(Data._dir + "Etruscan.csv", Data._dir + "etruscan", Data._dir + "english", 8)

In [26]:
batch = next(train_dl)

In [27]:
batch["inputs_chars"].shape

(8, 21)