In [32]:
import re, os

from tqdm.auto import tqdm
from datasets import load_dataset
from tokenizers import (
    Tokenizer, 
    Regex, 
    models, 
    trainers, 
    pre_tokenizers, 
    decoders, 
    normalizers
)

In [33]:
# Number of corpus examples to sample; also embedded in the saved tokenizer's file name.
SIZE = 800_000

In [34]:
def load_corpus(size=300_000, buffer_size=10_000, output='ka_corpus.txt', seed=42):
    """Stream a shuffled sample of the Georgian corpus into a plain-text file.

    Args:
        size: Number of examples taken from the shuffled stream.
        buffer_size: Shuffle-buffer size passed to the streaming dataset.
        output: Path of the UTF-8 text file; opened in write mode, so any
            existing content is replaced.
        seed: Seed for the streaming shuffle.

    Each example's ``doc_content`` field is stripped of surrounding whitespace;
    non-empty documents are written followed by a blank line. Returns ``None``.
    """
    stream = load_dataset('RichNachos/georgian-corpus', split='train', streaming=True)
    sample = stream.shuffle(seed=seed, buffer_size=buffer_size).take(size)

    with open(output, 'w', encoding='utf-8') as sink:
        for record in sample:
            body = record['doc_content'].strip()
            if not body:
                # Skip documents that are empty after stripping.
                continue
            sink.write(f'{body}\n\n')

In [35]:
# Materialise the sampled corpus on disk, using load_corpus's default output path.
load_corpus(size=SIZE)

Resolving data files:   0%|          | 0/120 [00:00<?, ?it/s]

In [37]:
# Central knobs for tokenizer training; edit here rather than inside the cells below.
CONFIG = {
    # Plain-text corpus file read during training.
    "corpus_path": "ka_corpus.txt",
    # Target vocabulary size handed to the BPE trainer.
    "vocab_size": 50_000,
    # Minimum pair frequency handed to the BPE trainer.
    "min_frequency": 2,
    # NOTE(review): not referenced by any visible cell - confirm whether still needed.
    "buffer_size": 100_000,
    # Batch-size knob for corpus preprocessing.
    "preprocess_batch_size": 10_000,
    # Character class covering U+10D0..U+10F0 (raw string: escapes are left for the regex engine).
    "georgian_regex": r"[\u10D0-\u10F0]+",
    # Special tokens registered with the trainer.
    "special_tokens": ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
}

In [38]:
def text_generator():
    """Lazily yield the training corpus one stripped, non-empty line at a time.

    Opens the UTF-8 file at ``CONFIG["corpus_path"]`` and iterates it through a
    tqdm progress bar. Each line is stripped of surrounding whitespace; lines
    that are empty after stripping are skipped because they contain no text
    for the tokenizer to learn from.

    Lines are yielded as soon as they are read. No intermediate list is built:
    the consumer pulls items one by one, so buffering would only add memory
    use without changing what is produced.

    Yields:
        str: The next non-empty, stripped line of the corpus.
    """
    with open(CONFIG["corpus_path"], "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="პრეპროცესინგი"):
            stripped = line.strip()
            if stripped:
                yield stripped

In [39]:
# Start from an untrained BPE model; "[UNK]" is the model's unknown token.
tokenizer = Tokenizer(
    models.BPE(unk_token="[UNK]"),
)

In [40]:
# Pre-tokenization runs in two stages: split on whitespace/punctuation, then
# isolate runs of Georgian letters as their own pieces.
# The pattern has to be wrapped in `Regex`: `Split` treats a bare `str` as
# literal text, which would never match and would make this stage a no-op.
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.Whitespace(),
    pre_tokenizers.Split(
        Regex(CONFIG["georgian_regex"]),
        behavior="isolated",
        invert=False
    )
])

# BPE trainer configured from CONFIG; continuation pieces are marked with "##".
trainer = trainers.BpeTrainer(
    vocab_size=CONFIG["vocab_size"],
    min_frequency=CONFIG["min_frequency"],
    special_tokens=CONFIG["special_tokens"],
    continuing_subword_prefix="##",
    # All 33 letters U+10D0 (ა) .. U+10F0 (ჰ). Built from the code-point range
    # so that no letter can be left out by a typing slip (a hand-typed list
    # previously omitted U+10D9).
    initial_alphabet=[chr(cp) for cp in range(0x10D0, 0x10F1)],
    show_progress=True
)

In [41]:
# Corpus size on disk, in bytes (displayed as the cell's output).
os.stat(CONFIG["corpus_path"]).st_size

5845911266

In [42]:
# Train the tokenizer from the streamed corpus lines.
# `length` is deliberately omitted: it must be the number of sequences the
# iterator yields, so passing the file's byte size would give the progress bar
# a wrong total. The iterator's own tqdm bar already reports lines read.
tokenizer.train_from_iterator(
    iterator=text_generator(),
    trainer=trainer,
)

პრეპროცესინგი: 0it [00:00, ?it/s]

In [43]:
# The trainer marks word-internal pieces with a "##" *prefix*, so decoding needs
# the WordPiece decoder, which joins "##"-prefixed pieces onto the previous
# token and separates other tokens with spaces. BPEDecoder's `suffix` is an
# end-of-word marker that gets replaced by a space, which would turn "##"
# into a space and concatenate whole words with nothing between them.
tokenizer.decoder = decoders.WordPiece(prefix="##")

# Persist the trained tokenizer; the sample size is part of the file name.
tokenizer.save(f"georgian_tokenizer_{SIZE}.json")

In [44]:
# Smoke test: mixed Latin/Georgian text, digits, typographic quotes and punctuation.
test_cases = [
    "GPT-4 → ხელოვნური ინტელექტი 2023",
    "„ქართული ენის დღე“: 14 აპრილი",
    "სამედიცინო ტერმინები: COVID-19, MRI",
    "გამარჯობა, ჩემი სახლია ნიკა",
]

# Print the token sequence produced for each sample, one list per line.
for text in test_cases:
    encoding = tokenizer.encode(text)
    print(encoding.tokens)

['G', '##P', '##T', '-', '4', '→', 'ხელოვნური', 'ინტელექტი', '2023']
['„', 'ქართული', 'ენის', 'დღე', '“:', '14', 'აპრილი']
['სამედიცინო', 'ტერმინები', ':', 'COVID', '-', '19', ',', 'M', '##RI']
['გამარჯობა', ',', 'ჩემი', 'სახლია', 'ნიკა']


In [47]:
from IPython.display import FileLink

# Render a link to the saved tokenizer file as the cell's output.
FileLink(f"georgian_tokenizer_{SIZE}.json")