In [1]:
from lib import dataloading as dl
from lib import tokenizer as tk
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [2]:
# All local inputs/outputs hang off one root so the notebook can be relocated
# by editing a single line. Forward slashes are used on purpose: the previous
# backslash literals relied on sequences such as "\D" or "\E" not being
# recognised as escapes (and needed "\\t" to avoid a tab character).
PROJECT_ROOT = Path("D:/Dropbox/Bachelorarbeit")
DATASETS_DIR = PROJECT_ROOT / "Datasets"
TOKENIZERS_DIR = PROJECT_ROOT / "Tokenization by Word Tag" / "Tokenizers"


def load_split(language, split):
    """Read one dataset split from disk and run it through ``dl.preprocess``.

    Args:
        language: Dataset folder name, e.g. ``"English"`` or ``"Turkish"``.
        split: Split folder name, ``"Train"`` or ``"Test"``.

    Returns:
        The result of ``dl.preprocess(..., verbose=True)`` for the file
        ``<DATASETS_DIR>/<language>/<split>/<language>_<split>.csv``
        (file name lower-cased).
    """
    csv_path = DATASETS_DIR / language / split / f"{language.lower()}_{split.lower()}.csv"
    # The column keyed 5 is forced to str, exactly as in the original reads.
    raw_df = pd.read_csv(csv_path, dtype={5: str})
    # verbose=True makes preprocess print a summary of the rows it dropped.
    return dl.preprocess(raw_df, verbose=True)


# Load order is kept (English train/test, then Turkish train/test) so the
# verbose preprocessing log appears in the same sequence as before.
english_train_df = load_split("English", "Train")
english_test_df = load_split("English", "Test")

turkish_train_df = load_split("Turkish", "Train")
turkish_test_df = load_split("Turkish", "Test")

# Output folders for the trained tokenizer files, created if missing.
english_dir = TOKENIZERS_DIR / "English"
english_dir.mkdir(parents=True, exist_ok=True)

turkish_dir = TOKENIZERS_DIR / "Turkish"
turkish_dir.mkdir(parents=True, exist_ok=True)


Dropped 8 rows with NaN in 'FORM' column.
Dropped 17911 rows with non-UPOS tags 
Tags dropped: ['_']
Dropped 3 rows with NaN in 'FORM' column.
Dropped 4791 rows with non-UPOS tags 
Tags dropped: ['_']
Dropped 6 rows with NaN in 'FORM' column.
Dropped 4541 rows with non-UPOS tags 
Tags dropped: ['_']
Dropped 1209 rows with non-UPOS tags 
Tags dropped: ['_']


In [3]:
# Target vocabulary sizes, all powers of two: 2**12, 2**13 and 2**14.
vocab_sizes = [2 ** exponent for exponent in range(12, 15)]

# Names of the tokenizer training algorithms to compare; each one is passed
# verbatim to the training helpers in `tk`.
algorithms = [
    "BPE",
    "WordPiece",
    "Unigram",
]

## All UPOS-tags Tokenizers

In [4]:
# English, UPOS variant: one tokenizer per (vocab size, algorithm) pair,
# trained via tk.train_and_merge_tokenizers with proportional allocation and
# written to english_dir as "upos_<algorithm>_<vocab_size>_tokenizer.json".
english_upos_runs = [(size, algo) for size in vocab_sizes for algo in algorithms]

with tqdm(total=len(english_upos_runs), desc="Training Tokenizers") as progress:
    for size, algo in english_upos_runs:
        progress.set_description(f"Training tokenizer with vocab_size {size} using the {algo} algorithm")
        target_file = english_dir / f"upos_{algo.lower()}_{size}_tokenizer.json"
        # The helper receives save_path, so no explicit save call is made here.
        tokenizer = tk.train_and_merge_tokenizers(
            english_train_df,
            tokenizer_algorithm=algo,
            vocab_size=size,
            allocation="proportional",
            save_path=str(target_file),
        )
        progress.update(1)

Training tokenizer with vocab_size 16384 using the Unigram algorithm: 100%|██████████| 9/9 [00:37<00:00,  4.14s/it]  


In [5]:
# Turkish, UPOS variant: one tokenizer per (vocab size, algorithm) pair,
# trained via tk.train_and_merge_tokenizers with proportional allocation and
# written to turkish_dir as "upos_<algorithm>_<vocab_size>_tokenizer.json".
turkish_upos_runs = [(size, algo) for size in vocab_sizes for algo in algorithms]

with tqdm(total=len(turkish_upos_runs), desc="Training Tokenizers") as progress:
    for size, algo in turkish_upos_runs:
        progress.set_description(f"Training tokenizer with vocab_size {size} using the {algo} algorithm")
        target_file = turkish_dir / f"upos_{algo.lower()}_{size}_tokenizer.json"
        # The helper receives save_path, so no explicit save call is made here.
        tokenizer = tk.train_and_merge_tokenizers(
            turkish_train_df,
            tokenizer_algorithm=algo,
            vocab_size=size,
            allocation="proportional",
            save_path=str(target_file),
        )
        progress.update(1)

Training tokenizer with vocab_size 16384 using the Unigram algorithm: 100%|██████████| 9/9 [01:02<00:00,  6.94s/it]  


## Baseline Tokenizers

In [6]:
# English baseline: one tokenizer per (vocab size, algorithm) pair, trained
# directly on the FORM column with tk.train_tokenizer and saved to english_dir
# as "base_<algorithm>_<vocab_size>_tokenizer.json".
english_baseline_runs = [(size, algo) for size in vocab_sizes for algo in algorithms]

with tqdm(total=len(english_baseline_runs), desc="Training Tokenizers") as progress:
    for size, algo in english_baseline_runs:
        progress.set_description(f"Training tokenizer with vocab_size {size} using the {algo} algorithm")
        target_file = english_dir / f"base_{algo.lower()}_{size}_tokenizer.json"
        # A fresh list of forms is built for every run, as in the original cell.
        tokenizer = tk.train_tokenizer(
            english_train_df["FORM"].values.tolist(),
            vocab_size=size,
            algorithm=algo,
        )
        # train_tokenizer is not given a save path, so the result is saved here.
        tokenizer.save(str(target_file))
        progress.update(1)

Training tokenizer with vocab_size 16384 using the Unigram algorithm: 100%|██████████| 9/9 [00:25<00:00,  2.80s/it]  


In [7]:
# Turkish baseline: one tokenizer per (vocab size, algorithm) pair, trained
# directly on the FORM column with tk.train_tokenizer and saved to turkish_dir
# as "base_<algorithm>_<vocab_size>_tokenizer.json".
turkish_baseline_runs = [(size, algo) for size in vocab_sizes for algo in algorithms]

with tqdm(total=len(turkish_baseline_runs), desc="Training Tokenizers") as progress:
    for size, algo in turkish_baseline_runs:
        progress.set_description(f"Training tokenizer with vocab_size {size} using the {algo} algorithm")
        target_file = turkish_dir / f"base_{algo.lower()}_{size}_tokenizer.json"
        # A fresh list of forms is built for every run, as in the original cell.
        tokenizer = tk.train_tokenizer(
            turkish_train_df["FORM"].values.tolist(),
            vocab_size=size,
            algorithm=algo,
        )
        # train_tokenizer is not given a save path, so the result is saved here.
        tokenizer.save(str(target_file))
        progress.update(1)

Training tokenizer with vocab_size 16384 using the Unigram algorithm: 100%|██████████| 9/9 [01:00<00:00,  6.74s/it]  


## Lexical - Grammatical - Other Tokenizers

In [8]:
# Tag groups used for the grouped ("lego") tokenizers.
lexical = "ADJ ADV INTJ NOUN PROPN VERB".split()
grammatical = "ADP AUX CCONJ DET NUM PART PRON SCONJ".split()
other = "PUNCT SYM X".split()

# Order matters: the weight vectors below are listed in this group order.
grouping = [lexical, grammatical, other]

# Allocation strategies passed to tk.train_and_merge_tokenizers.
allocations = ["proportional", "weighted_proportional"]

# Parallel lists: allocations_weight_names[k] labels allocations_weights[k]
# in the saved file names. Presumably each vector gives one weight per group
# in `grouping` (lexical, grammatical, other) — confirm against `tk`.
allocations_weight_names = ["lexical", "grammatical"]
allocations_weights = [
    [2, 1, 1],
    [1, 2, 1],
]

In [9]:
# English grouped ("lego") tokenizers: every (vocab size, algorithm) pair is
# trained once with plain proportional allocation and once per weight vector
# in `allocations_weights`, using the tag groups in `grouping`.

# The file names below pair each weight vector with a label, so the two
# parallel lists must line up; fail before any training starts if they do not.
if len(allocations_weight_names) != len(allocations_weights):
    raise ValueError(
        "allocations_weight_names and allocations_weights must have the same length"
    )

# Progress total derived from the loops themselves: one proportional pass plus
# one pass per weight vector. (The former `* 1.5 * len(allocations)` formula
# only matched this count for exactly two weight vectors.)
runs_per_pass = len(vocab_sizes) * len(algorithms)
total_runs = runs_per_pass * (1 + len(allocations_weights))

with tqdm(total=total_runs, desc="Training Tokenizers") as pbar:
    # Allocation proportional
    for vocab_size in vocab_sizes:
        for algorithm in algorithms:
            pbar.set_description(f"Training tokenizer with vocab_size {vocab_size} using the {algorithm} algorithm and proportional allocation.")
            path = english_dir / f"lego_{algorithm.lower()}_{vocab_size}_proportional_tokenizer.json"
            tokenizer = tk.train_and_merge_tokenizers(
                english_train_df,
                tokenizer_algorithm=algorithm,
                vocab_size=vocab_size,
                allocation='proportional',
                grouping=grouping,
                save_path=str(path)
            )
            pbar.update(1)
    # Allocation weighted proportional
    for weight_name, weights in zip(allocations_weight_names, allocations_weights):
        for vocab_size in vocab_sizes:
            for algorithm in algorithms:
                pbar.set_description(f"Training tokenizer with vocab_size {vocab_size} using the {algorithm} algorithm and weighted proportional allocation.")
                path = english_dir / f"lego_{algorithm.lower()}_{vocab_size}_{weight_name}_weightedproportional_tokenizer.json"
                tokenizer = tk.train_and_merge_tokenizers(
                    english_train_df,
                    tokenizer_algorithm=algorithm,
                    vocab_size=vocab_size,
                    allocation='weighted_proportional',
                    allocation_weights=weights,
                    grouping=grouping,
                    save_path=str(path)
                )
                pbar.update(1)

Training tokenizer with vocab_size 16384 using the Unigram algorithm and weighted proportional allocation.: 100%|██████████| 27/27 [01:33<00:00,  3.46s/it]  


In [10]:
# Turkish grouped ("lego") tokenizers: every (vocab size, algorithm) pair is
# trained once with plain proportional allocation and once per weight vector
# in `allocations_weights`, using the tag groups in `grouping`.

# The file names below pair each weight vector with a label, so the two
# parallel lists must line up; fail before any training starts if they do not.
if len(allocations_weight_names) != len(allocations_weights):
    raise ValueError(
        "allocations_weight_names and allocations_weights must have the same length"
    )

# Progress total derived from the loops themselves: one proportional pass plus
# one pass per weight vector. (The former `* 1.5 * len(allocations)` formula
# only matched this count for exactly two weight vectors.)
runs_per_pass = len(vocab_sizes) * len(algorithms)
total_runs = runs_per_pass * (1 + len(allocations_weights))

with tqdm(total=total_runs, desc="Training Tokenizers") as pbar:
    # Allocation proportional
    for vocab_size in vocab_sizes:
        for algorithm in algorithms:
            pbar.set_description(f"Training tokenizer with vocab_size {vocab_size} using the {algorithm} algorithm and proportional allocation.")
            path = turkish_dir / f"lego_{algorithm.lower()}_{vocab_size}_proportional_tokenizer.json"
            tokenizer = tk.train_and_merge_tokenizers(
                turkish_train_df,
                tokenizer_algorithm=algorithm,
                vocab_size=vocab_size,
                allocation='proportional',
                grouping=grouping,
                save_path=str(path)
            )
            pbar.update(1)
    # Allocation weighted proportional
    for weight_name, weights in zip(allocations_weight_names, allocations_weights):
        for vocab_size in vocab_sizes:
            for algorithm in algorithms:
                pbar.set_description(f"Training tokenizer with vocab_size {vocab_size} using the {algorithm} algorithm and weighted proportional allocation.")
                path = turkish_dir / f"lego_{algorithm.lower()}_{vocab_size}_{weight_name}_weightedproportional_tokenizer.json"
                tokenizer = tk.train_and_merge_tokenizers(
                    turkish_train_df,
                    tokenizer_algorithm=algorithm,
                    vocab_size=vocab_size,
                    allocation='weighted_proportional',
                    allocation_weights=weights,
                    grouping=grouping,
                    save_path=str(path)
                )
                pbar.update(1)

Training tokenizer with vocab_size 16384 using the Unigram algorithm and weighted proportional allocation.: 100%|██████████| 27/27 [03:11<00:00,  7.11s/it]  
