In [9]:
from lib import dataloading as dl
from lib import tokenizer as tk
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [10]:
# Root folder holding the per-language train/test CSVs. Built with pathlib and
# forward slashes: the previous plain string literals contained backslash
# sequences such as "\D" and "\T", which are invalid escapes (SyntaxWarning on
# Python 3.12+) and only worked by accident.
# NOTE(review): this is still a machine-specific absolute path.
datasets_dir = Path("D:/Dropbox/Bachelorarbeit/Datasets")


def _load_split(csv_path):
    """Read one dataset CSV and run it through ``dl.preprocess``.

    Args:
        csv_path: Location of the CSV file to load.

    Returns:
        Whatever ``dl.preprocess(..., verbose=True)`` returns for the frame
        read from ``csv_path``.
    """
    # dtype={5: str} is carried over unchanged from the original read_csv calls.
    raw_df = pd.read_csv(csv_path, dtype={5: str})
    return dl.preprocess(raw_df, verbose=True)


# Load order (English train/test, then Turkish train/test) is kept so the
# verbose preprocessing messages appear in the same sequence as before.
english_train_df = _load_split(datasets_dir / "English" / "Train" / "english_train.csv")
english_test_df = _load_split(datasets_dir / "English" / "Test" / "english_test.csv")

turkish_train_df = _load_split(datasets_dir / "Turkish" / "Train" / "turkish_train.csv")
turkish_test_df = _load_split(datasets_dir / "Turkish" / "Test" / "turkish_test.csv")

# Output folders for the trained tokenizer files; created if missing.
english_dir = Path("D:/Dropbox/Bachelorarbeit/Tokenization by Word Tag/Tokenizers/English")
english_dir.mkdir(parents=True, exist_ok=True)

turkish_dir = Path("D:/Dropbox/Bachelorarbeit/Tokenization by Word Tag/Tokenizers/Turkish")
turkish_dir.mkdir(parents=True, exist_ok=True)


Dropped 8 rows with NaN in 'FORM' column.
Dropped 17911 rows with non-UPOS tags 
Tags dropped: ['_']
Dropped 3 rows with NaN in 'FORM' column.
Dropped 4791 rows with non-UPOS tags 
Tags dropped: ['_']
Dropped 6 rows with NaN in 'FORM' column.
Dropped 4541 rows with non-UPOS tags 
Tags dropped: ['_']
Dropped 0 rows with NaN in 'FORM' column.
Dropped 1209 rows with non-UPOS tags 
Tags dropped: ['_']


In [11]:
# Experiment grid shared by every training cell below: each vocabulary size
# is combined with each tokenizer algorithm.
vocab_sizes = [2**12, 2**13, 2**14]  # 4096, 8192, 16384
algorithms = [
    "BPE",
    "WordPiece",
    "Unigram",
]

## All UPOS-tags Tokenizers

In [12]:
# English, one run per (vocab_size, algorithm) pair: call
# tk.train_and_merge_tokenizers on the English training frame without a
# `grouping` argument and hand it a "upos_*" target path inside english_dir.
run_grid = [(size, algo) for size in vocab_sizes for algo in algorithms]

with tqdm(total=len(run_grid), desc="Training Tokenizers") as pbar:
    for vocab_size, algorithm in run_grid:
        pbar.set_description(
            f"Training tokenizer with vocab_size {vocab_size} using the {algorithm} algorithm"
        )
        path = english_dir / f"upos_{algorithm.lower()}_{vocab_size}_tokenizer.json"
        # save_path is forwarded to the helper; presumably it writes the
        # tokenizer file itself -- confirm in lib/tokenizer.
        tokenizer = tk.train_and_merge_tokenizers(
            english_train_df,
            tokenizer_algorithm=algorithm,
            vocab_size=vocab_size,
            allocation="proportional",
            save_path=str(path),
        )
        pbar.update(1)

Training tokenizer with vocab_size 16384 using the Unigram algorithm: 100%|██████████| 9/9 [00:36<00:00,  4.04s/it]  


In [13]:
# Turkish, one run per (vocab_size, algorithm) pair: call
# tk.train_and_merge_tokenizers on the Turkish training frame without a
# `grouping` argument and hand it a "upos_*" target path inside turkish_dir.
run_grid = [(size, algo) for size in vocab_sizes for algo in algorithms]

with tqdm(total=len(run_grid), desc="Training Tokenizers") as pbar:
    for vocab_size, algorithm in run_grid:
        pbar.set_description(
            f"Training tokenizer with vocab_size {vocab_size} using the {algorithm} algorithm"
        )
        path = turkish_dir / f"upos_{algorithm.lower()}_{vocab_size}_tokenizer.json"
        # save_path is forwarded to the helper; presumably it writes the
        # tokenizer file itself -- confirm in lib/tokenizer.
        tokenizer = tk.train_and_merge_tokenizers(
            turkish_train_df,
            tokenizer_algorithm=algorithm,
            vocab_size=vocab_size,
            allocation="proportional",
            save_path=str(path),
        )
        pbar.update(1)

Training tokenizer with vocab_size 16384 using the Unigram algorithm: 100%|██████████| 9/9 [00:56<00:00,  6.30s/it]  


## Baseline Tokenizers

In [14]:
# English baseline, one run per (vocab_size, algorithm) pair: train a single
# tokenizer with tk.train_tokenizer on the values of the "FORM" column and
# save it as "base_*" inside english_dir.
run_grid = [(size, algo) for size in vocab_sizes for algo in algorithms]

with tqdm(total=len(run_grid), desc="Training Tokenizers") as pbar:
    for vocab_size, algorithm in run_grid:
        pbar.set_description(
            f"Training tokenizer with vocab_size {vocab_size} using the {algorithm} algorithm"
        )
        path = english_dir / f"base_{algorithm.lower()}_{vocab_size}_tokenizer.json"
        # A fresh list is built for every run, exactly as before, so each
        # call to the trainer receives its own copy of the forms.
        forms = english_train_df["FORM"].values.tolist()
        tokenizer = tk.train_tokenizer(
            forms,
            vocab_size=vocab_size,
            algorithm=algorithm,
        )
        tokenizer.save(str(path))
        pbar.update(1)

Training tokenizer with vocab_size 16384 using the Unigram algorithm: 100%|██████████| 9/9 [00:23<00:00,  2.63s/it]  


In [15]:
# Turkish baseline, one run per (vocab_size, algorithm) pair: train a single
# tokenizer with tk.train_tokenizer on the values of the "FORM" column and
# save it as "base_*" inside turkish_dir.
run_grid = [(size, algo) for size in vocab_sizes for algo in algorithms]

with tqdm(total=len(run_grid), desc="Training Tokenizers") as pbar:
    for vocab_size, algorithm in run_grid:
        pbar.set_description(
            f"Training tokenizer with vocab_size {vocab_size} using the {algorithm} algorithm"
        )
        path = turkish_dir / f"base_{algorithm.lower()}_{vocab_size}_tokenizer.json"
        # A fresh list is built for every run, exactly as before, so each
        # call to the trainer receives its own copy of the forms.
        forms = turkish_train_df["FORM"].values.tolist()
        tokenizer = tk.train_tokenizer(
            forms,
            vocab_size=vocab_size,
            algorithm=algorithm,
        )
        tokenizer.save(str(path))
        pbar.update(1)

Training tokenizer with vocab_size 16384 using the Unigram algorithm: 100%|██████████| 9/9 [00:56<00:00,  6.24s/it]  


## Lexical - Grammatical - Other Tokenizers

In [16]:
# Three tag groups passed together as the `grouping` argument when training
# the "lex_gram_oth" tokenizers.
lexical = [
    "ADJ",
    "ADV",
    "INTJ",
    "NOUN",
    "PROPN",
    "VERB",
]
grammatical = [
    "ADP",
    "AUX",
    "CCONJ",
    "DET",
    "NUM",
    "PART",
    "PRON",
    "SCONJ",
]
other = [
    "PUNCT",
    "SYM",
    "X",
]

In [17]:
# English, one run per (vocab_size, algorithm) pair: call
# tk.train_and_merge_tokenizers with the three tag groups
# (lexical, grammatical, other) as `grouping` and hand it a "lex_gram_oth_*"
# target path inside english_dir.
run_grid = [(size, algo) for size in vocab_sizes for algo in algorithms]
tag_groups = [lexical, grammatical, other]

with tqdm(total=len(run_grid), desc="Training Tokenizers") as pbar:
    for vocab_size, algorithm in run_grid:
        pbar.set_description(
            f"Training tokenizer with vocab_size {vocab_size} using the {algorithm} algorithm"
        )
        path = english_dir / f"lex_gram_oth_{algorithm.lower()}_{vocab_size}_tokenizer.json"
        tokenizer = tk.train_and_merge_tokenizers(
            english_train_df,
            tokenizer_algorithm=algorithm,
            vocab_size=vocab_size,
            allocation="proportional",
            # A new outer list per call, as in the original cell.
            grouping=list(tag_groups),
            save_path=str(path),
        )
        pbar.update(1)

Training tokenizer with vocab_size 16384 using the Unigram algorithm: 100%|██████████| 9/9 [00:30<00:00,  3.39s/it]  


In [18]:
# Turkish, one run per (vocab_size, algorithm) pair: call
# tk.train_and_merge_tokenizers with the three tag groups
# (lexical, grammatical, other) as `grouping` and hand it a "lex_gram_oth_*"
# target path inside turkish_dir.
run_grid = [(size, algo) for size in vocab_sizes for algo in algorithms]
tag_groups = [lexical, grammatical, other]

with tqdm(total=len(run_grid), desc="Training Tokenizers") as pbar:
    for vocab_size, algorithm in run_grid:
        pbar.set_description(
            f"Training tokenizer with vocab_size {vocab_size} using the {algorithm} algorithm"
        )
        path = turkish_dir / f"lex_gram_oth_{algorithm.lower()}_{vocab_size}_tokenizer.json"
        tokenizer = tk.train_and_merge_tokenizers(
            turkish_train_df,
            tokenizer_algorithm=algorithm,
            vocab_size=vocab_size,
            allocation="proportional",
            # A new outer list per call, as in the original cell.
            grouping=list(tag_groups),
            save_path=str(path),
        )
        pbar.update(1)

Training tokenizer with vocab_size 16384 using the Unigram algorithm: 100%|██████████| 9/9 [00:59<00:00,  6.66s/it]  
