In [1]:
import logging
from datasets import load_dataset
from transformers import AutoTokenizer
from tokenizers import Tokenizer, models, trainers, normalizers, pre_tokenizers, decoders, processors
from transformers import BertTokenizerFast, GPT2TokenizerFast, AlbertTokenizerFast

In [3]:
# Configure the root logger for the notebook: INFO and above, each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

In [4]:
# Tunable settings shared by the data-loading and tokenizer-training helpers.
CONFIG = dict(
    # Dataset name, configuration and split forwarded to load_dataset.
    dataset_name="wikitext",
    dataset_config="wikitext-2-raw-v1",
    split="train",
    # Target vocabulary size handed to each trainer.
    vocab_size=25000,
    # Number of rows sliced out of the dataset per training batch.
    batch_size=1000,
    # Reserved tokens registered with the WordPiece and Unigram trainers.
    special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"],
)

In [5]:
def load_data(config):
    """Fetch the dataset split described by ``config``.

    Args:
        config: Mapping providing the ``"dataset_name"``, ``"dataset_config"``
            and ``"split"`` entries that are forwarded to ``load_dataset``.

    Returns:
        The object returned by ``load_dataset`` for that name, configuration
        and split. Its length is logged at INFO level.
    """
    logging.info("Loading dataset...")
    # Look the three settings up in one pass, in the order they are used.
    source, subset, split = (
        config[key] for key in ("dataset_name", "dataset_config", "split")
    )
    dataset = load_dataset(source, name=subset, split=split)
    logging.info(f"Loaded dataset with {len(dataset)} samples.")
    return dataset

def batch_iterator(dataset, batch_size):
    """Lazily yield the ``"text"`` entry of consecutive slices of ``dataset``.

    Args:
        dataset: Sized object supporting slice indexing, where each slice
            exposes a ``"text"`` entry.
        batch_size: Step between slice starts and the maximum slice length.

    Yields:
        ``dataset[start:start + batch_size]["text"]`` for every start offset;
        the last slice may be shorter than ``batch_size``.
    """
    yield from (
        dataset[start:start + batch_size]["text"]
        for start in range(0, len(dataset), batch_size)
    )

def _attach_cls_sep_template(tokenizer):
    """Install the ``[CLS] A [SEP]`` / ``[CLS] A [SEP] B [SEP]`` post-processor.

    Args:
        tokenizer: A trained ``tokenizers.Tokenizer`` whose vocabulary is
            expected to contain ``[CLS]`` and ``[SEP]``.

    Raises:
        ValueError: If either token is missing from the trained vocabulary.
    """
    cls_id = tokenizer.token_to_id("[CLS]")
    sep_id = tokenizer.token_to_id("[SEP]")
    if cls_id is None or sep_id is None:
        # Fail with an actionable message instead of passing None ids on.
        raise ValueError(
            'The trained vocabulary must contain "[CLS]" and "[SEP]"; '
            'add them to config["special_tokens"].'
        )
    tokenizer.post_processor = processors.TemplateProcessing(
        single="[CLS]:0 $A:0 [SEP]:0",
        pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
        special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
    )


def train_tokenizer_with_model(model_type, dataset, config):
    """Train a tokenizer of the requested family and wrap it for transformers.

    Args:
        model_type: One of ``"bert"`` (WordPiece), ``"gpt2"`` (byte-level BPE)
            or ``"t5"`` (Unigram with a Metaspace pre-tokenizer).
        dataset: Dataset passed to ``batch_iterator`` to stream training text.
        config: Mapping providing ``"vocab_size"``, ``"batch_size"`` and
            ``"special_tokens"``. The ``"gpt2"`` branch uses its own
            ``<|endoftext|>`` special token instead of ``"special_tokens"``.

    Returns:
        A ``BertTokenizerFast``, ``GPT2TokenizerFast`` or ``AlbertTokenizerFast``
        wrapping the freshly trained tokenizer.

    Raises:
        ValueError: If ``model_type`` is not supported, or if a branch that
            needs ``[CLS]``/``[SEP]`` was trained without them.
    """
    if model_type == "bert":
        raw = Tokenizer(models.WordPiece(unk_token="[UNK]"))
        raw.normalizer = normalizers.BertNormalizer(lowercase=True)
        raw.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
        trainer = trainers.WordPieceTrainer(
            vocab_size=config["vocab_size"],
            special_tokens=config["special_tokens"],
        )
        raw.train_from_iterator(batch_iterator(dataset, config["batch_size"]), trainer=trainer)

        # Post-processing
        _attach_cls_sep_template(raw)
        raw.decoder = decoders.WordPiece(prefix="##")
        return BertTokenizerFast(tokenizer_object=raw)

    if model_type == "gpt2":
        raw = Tokenizer(models.BPE())
        raw.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
        trainer = trainers.BpeTrainer(
            vocab_size=config["vocab_size"],
            special_tokens=["<|endoftext|>"],
            # Seed the vocabulary with every byte-level symbol. This BPE model
            # has no unk token, so a byte never seen during training would
            # otherwise be dropped silently at encode time.
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )
        raw.train_from_iterator(batch_iterator(dataset, config["batch_size"]), trainer=trainer)
        raw.post_processor = processors.ByteLevel(trim_offsets=False)
        raw.decoder = decoders.ByteLevel()
        return GPT2TokenizerFast(tokenizer_object=raw)

    if model_type == "t5":
        # NOTE(review): despite the "t5" label this branch builds an
        # ALBERT-style tokenizer (Unigram + [CLS]/[SEP] template wrapped in
        # AlbertTokenizerFast) -- confirm this is the intended target.
        raw = Tokenizer(models.Unigram())
        raw.normalizer = normalizers.Sequence([
            normalizers.Replace("``", '"'),
            normalizers.Replace("''", '"'),
            normalizers.Lowercase(),
        ])
        raw.pre_tokenizer = pre_tokenizers.Metaspace()
        trainer = trainers.UnigramTrainer(
            vocab_size=config["vocab_size"],
            special_tokens=config["special_tokens"],
            unk_token="<unk>",
        )
        raw.train_from_iterator(batch_iterator(dataset, config["batch_size"]), trainer=trainer)
        _attach_cls_sep_template(raw)
        raw.decoder = decoders.Metaspace()
        # AlbertTokenizerFast defaults to "<pad>", which the trainer never saw;
        # reuse the trained "[PAD]" when it exists so the wrapper does not add
        # a second padding token on top of the trained vocabulary.
        pad_token = "[PAD]" if raw.token_to_id("[PAD]") is not None else "<pad>"
        return AlbertTokenizerFast(
            tokenizer_object=raw,
            unk_token="<unk>",
            pad_token=pad_token,
        )

    raise ValueError(f"Unsupported model type: {model_type}")

def save_tokenizer(tokenizer, output_dir):
    """Persist ``tokenizer`` under ``output_dir`` and log where it went.

    Args:
        tokenizer: Object exposing ``save_pretrained(output_dir)``.
        output_dir: Destination passed straight to ``save_pretrained``.
    """
    destination = output_dir
    tokenizer.save_pretrained(destination)
    logging.info(f"Tokenizer saved at {output_dir}")

def main(model_type="bert", output_dir="my-custom-tokenizer"):
    """Load the configured dataset, train a tokenizer and save it.

    Args:
        model_type: Tokenizer family passed to ``train_tokenizer_with_model``
            (``"bert"``, ``"gpt2"`` or ``"t5"``). Defaults to ``"bert"``.
        output_dir: Directory handed to ``save_tokenizer``. Defaults to
            ``"my-custom-tokenizer"``.
    """
    dataset = load_data(CONFIG)
    tokenizer = train_tokenizer_with_model(model_type, dataset, CONFIG)
    save_tokenizer(tokenizer, output_dir)


if __name__ == "__main__":
    main()

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]






In [6]:
from datasets import load_dataset

# Load the dataset without a `split` argument; the result is indexed by split
# name below.
dataset = load_dataset("wikitext", name="wikitext-2-raw-v1")
# Slicing the train split yields a dict with a "text" list (see output below).
print(dataset["train"][:5])  # Display the first 5 training samples

{'text': ['', ' = Valkyria Chronicles III = \n', '', ' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n', " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game m

In [8]:
def batch_iterator(batch_size=1000):
    """Lazily yield ``"text"`` batches from the notebook-level train split.

    Reads the global ``dataset`` variable rather than taking it as an argument.

    NOTE(review): this redefinition replaces the earlier two-argument
    ``batch_iterator(dataset, batch_size)`` that ``train_tokenizer_with_model``
    calls, so re-running that function after this cell would fail -- consider
    giving this helper a distinct name.

    Args:
        batch_size: Step between slice starts and the maximum slice length.

    Yields:
        ``dataset["train"][start:start + batch_size]["text"]`` for each offset.
    """
    yield from (
        dataset["train"][start:start + batch_size]["text"]
        for start in range(0, len(dataset["train"]), batch_size)
    )


from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, processors, decoders

# Initialize the WordPiece tokenizer
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# Add normalization
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)

# Add pre-tokenization (split inputs into words)
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

# Special tokens and trainer configuration
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

# Train the tokenizer on the dataset
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)

# Add post-processing for [CLS] and [SEP] tokens
cls_id = tokenizer.token_to_id("[CLS]")
sep_id = tokenizer.token_to_id("[SEP]")
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
)

# Add a decoder so decode() merges "##" continuation pieces back into words.
# Without it the saved tokenizer decodes to space-separated sub-word pieces;
# this matches the "bert" branch of train_tokenizer_with_model.
tokenizer.decoder = decoders.WordPiece(prefix="##")

# Save the tokenizer
tokenizer.save("my-wordpiece-tokenizer.json")
print("Tokenizer training complete. Saved as 'my-wordpiece-tokenizer.json'")





Tokenizer training complete. Saved as 'my-wordpiece-tokenizer.json'


In [9]:
# Load the tokenizer back from the JSON file written by the training cell.
# This rebinds the notebook-level `tokenizer` to the reloaded instance.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("my-wordpiece-tokenizer.json")

# Encode some text; the printed tokens below show the text lowercased and
# wrapped in [CLS] ... [SEP].
encoding = tokenizer.encode("This is a test sentence.")
print("Tokens:", encoding.tokens)  # Check the tokenized output
print("IDs:", encoding.ids)  # Check the token IDs


Tokens: ['[CLS]', 'this', 'is', 'a', 'test', 'sentence', '.', '[SEP]']
IDs: [2, 1309, 1188, 43, 3395, 6026, 18, 3]


In [10]:
from transformers import BertTokenizerFast

# Wrap the reloaded `tokenizers.Tokenizer` so it exposes the transformers
# tokenizer API (save_pretrained / from_pretrained / __call__).
hf_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)

# Save it for reuse
hf_tokenizer.save_pretrained("my-wordpiece-tokenizer")

# Reload it for later use; calling it prints input_ids, token_type_ids and
# attention_mask for the sentence (see output below).
reloaded_tokenizer = BertTokenizerFast.from_pretrained("my-wordpiece-tokenizer")
print(reloaded_tokenizer("This is a test sentence."))


{'input_ids': [2, 1309, 1188, 43, 3395, 6026, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
