## Preparation, filtering, and cleaning of SpeakLeash data

### Imports

In [1]:
from speakleash import Speakleash
import os
from transformers import AutoTokenizer
from torch.utils.data import Dataset
import torch
from torch import nn
from torch.utils.data import DataLoader, Subset
import torch.nn.functional as F
import sentencepiece as spm

import json
import re
from collections import Counter

from transformers import PreTrainedTokenizer

from typing import Iterator, List

### Text Data Preparation

In [2]:
# Dataset name passed to Speakleash.get() and the working directories
# shared by every cell below.
TRAINING_DATASET = "wolne_lektury_corpus"
RAW_DATASET_DIR = "./raw_data"
PREPARED_DATASET_DIR = "./prepared_data"
TOKENIZERS_PATH = "./tokenizers"

# Create the working directories up front; exist_ok makes the cell re-runnable.
for _directory in (RAW_DATASET_DIR, PREPARED_DATASET_DIR, TOKENIZERS_PATH):
    os.makedirs(_directory, exist_ok=True)

In [3]:
# Open the SpeakLeash catalogue rooted at RAW_DATASET_DIR and fetch the
# configured dataset.
# NOTE(review): the recorded progress bar suggests the archive is downloaded
# here on first use — confirm against the speakleash documentation.
sl = Speakleash(RAW_DATASET_DIR)
training_speakleash_data = sl.get(TRAINING_DATASET)
# Materialise the documents into a list so they can be reused.
docs = list(training_speakleash_data.data)

print(f"Documents count: {training_speakleash_data.documents}")

100%|██████████| 119M/119M [00:02<00:00, 52.1MiB/s] 


Documents count: 6619


In [3]:
def filter_document(doc: str) -> str:
    """Drop short lines from a document.

    The document is split on newline characters and only lines strictly
    longer than 20 characters are kept. Every kept line is terminated by a
    newline, so the result is either empty or ends with a newline.
    """
    kept_lines = [line for line in doc.split("\n") if len(line) > 20]
    return "".join(f"{line}\n" for line in kept_lines)


def save_text_data(path: str, docs: List[str]):
    """Write ``docs`` to ``path`` as UTF-8, one newline between documents.

    The target file is overwritten. A confirmation message with the path is
    printed once the file has been written.
    """
    # Join before opening the file so a bad element never truncates the target.
    joined = "\n".join(docs)
    with open(path, mode="w", encoding="utf-8") as handle:
        handle.write(joined)

    print(f"Saved data at: {path}")


def load_text_data(path: str) -> str:
    """Return the full contents of the UTF-8 text file at ``path``."""
    with open(path, encoding="utf-8") as handle:
        contents = handle.read()
    return contents

In [None]:
# Filter every document of the loaded dataset (short lines are dropped by
# filter_document).
docs = training_speakleash_data.data
filtered_docs = [filter_document(doc) for doc in docs]

n = len(filtered_docs)

# 90/10 train/eval split in dataset order (no shuffling).
split_idx = int(0.9 * n)
train_docs = filtered_docs[:split_idx]
eval_docs = filtered_docs[split_idx:]

# The two splits must partition the corpus without overlap.
assert len(train_docs) + len(eval_docs) == n

print(f"train_docs.len = {len(train_docs)}")
print(f"eval_docs.len = {len(eval_docs)}")

# Each split goes to its own file; eval.txt must hold the held-out documents,
# not a second copy of the training data.
save_text_data(path=os.path.join(PREPARED_DATASET_DIR, "train.txt"), docs=train_docs)
save_text_data(path=os.path.join(PREPARED_DATASET_DIR, "eval.txt"), docs=eval_docs)

train_docs.len = 5957
train_docs.len = 662
Saved data at: ./prepared_data/train.txt
Saved data at: ./prepared_data/eval.txt


In [4]:
# Read the prepared splits back from PREPARED_DATASET_DIR; the cells below
# work from these strings rather than from the in-memory document lists.
train_text = load_text_data(os.path.join(PREPARED_DATASET_DIR, "train.txt"))
eval_text = load_text_data(os.path.join(PREPARED_DATASET_DIR, "eval.txt"))

## Create and train tokenizers

In [5]:
def load_tokenizer(load_path: str, custom_class=None):
    """Load a tokenizer previously saved under ``load_path``.

    Args:
        load_path: Directory passed to ``from_pretrained``.
        custom_class: Optional class exposing ``from_pretrained``; when it is
            omitted, ``AutoTokenizer`` is used.

    Returns:
        Whatever the selected ``from_pretrained`` returns.
    """
    loader = AutoTokenizer if custom_class is None else custom_class
    loaded = loader.from_pretrained(load_path)
    print(f"Tokenizer loaded from: {load_path}")
    return loaded

### Pretrained tokenizer

In [None]:
# Load the pretrained tokenizer by model id; `tokenizer` is persisted under
# TOKENIZERS_PATH by the next cell.
model_name = "radlab/polish-gpt2-small-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)


VocabSize: 52000
Token: 4598, decoded: To
Token: 406, decoded:  jest
Token: 3097, decoded:  przykład
Token: 7937, decoded:  tekstu
Token: 262, decoded:  w
Token: 4745, decoded:  języku
Token: 6454, decoded:  polskim
Token: 17, decoded: .


In [16]:
# Store a local copy so later cells can load the tokenizer from
# TOKENIZERS_PATH/gpt2-pl via load_tokenizer().
tokenizer.save_pretrained(os.path.join(TOKENIZERS_PATH, "gpt2-pl"))

('./tokenizers/gpt2-pl/tokenizer_config.json',
 './tokenizers/gpt2-pl/special_tokens_map.json',
 './tokenizers/gpt2-pl/vocab.json',
 './tokenizers/gpt2-pl/merges.txt',
 './tokenizers/gpt2-pl/added_tokens.json',
 './tokenizers/gpt2-pl/tokenizer.json')

In [None]:
# Reload the locally saved GPT-2 tokenizer and show how it splits a sample
# sentence.
gpt2_pl_tokenizer = load_tokenizer(os.path.join(TOKENIZERS_PATH, "gpt2-pl"))

# `vocab_size` is reused below as the target vocabulary size for the
# SentencePiece and whitespace tokenizers, so this cell must run before them.
vocab_size = gpt2_pl_tokenizer.vocab_size
print(f"VocabSize: {vocab_size}")

text = "To jest przykład tekstu w języku polskim. Grzegorz Brzęczyszczykiewicz"
encoded = gpt2_pl_tokenizer.encode(text)

# NOTE(review): this prints the same value as the VocabSize line above.
print(f"VocabSize: {gpt2_pl_tokenizer.vocab_size}")

# Decode each id on its own to reveal the individual sub-word pieces.
for token in encoded:
    print(
        f"Token: {token}, decoded: {gpt2_pl_tokenizer.decode(token, skip_special_tokens=False)}"
    )


Tokenizer loaded from: ./tokenizers/gpt2-pl
VocabSize: 52000
VocabSize: 52000
Token: 4598, decoded: To
Token: 406, decoded:  jest
Token: 3097, decoded:  przykład
Token: 7937, decoded:  tekstu
Token: 262, decoded:  w
Token: 4745, decoded:  języku
Token: 6454, decoded:  polskim
Token: 17, decoded: .
Token: 14475, decoded:  Grzegorz
Token: 397, decoded:  B
Token: 1033, decoded: rzę
Token: 373, decoded: czy
Token: 4065, decoded: szczy
Token: 9011, decoded: kiewicz


### SentencePiece tokenizer

In [None]:
# Train straight into TOKENIZERS_PATH/sentencepiece_raw: the cell that wraps
# the model in T5Tokenizer loads
# ./tokenizers/sentencepiece_raw/sentencepiece.model, so the trainer output
# has to land there for a Restart & Run All to work.
SENTENCEPIECE_RAW_DIR = os.path.join(TOKENIZERS_PATH, "sentencepiece_raw")
os.makedirs(SENTENCEPIECE_RAW_DIR, exist_ok=True)

# `vocab_size` comes from the GPT-2 tokenizer cell above, so all tokenizers
# target the same vocabulary size.
spm.SentencePieceTrainer.train(
    input=os.path.join(PREPARED_DATASET_DIR, "train.txt"),
    # The trainer writes <model_prefix>.model and <model_prefix>.vocab.
    model_prefix=os.path.join(SENTENCEPIECE_RAW_DIR, "sentencepiece"),
    vocab_size=vocab_size,
    model_type="unigram",
    character_coverage=1.0,
    pad_id=0,
    bos_id=2,
    eos_id=3,
    unk_id=1,
)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ./prepared_data/train.txt
  input_format: 
  model_prefix: sentencepiece
  model_type: UNIGRAM
  vocab_size: 52000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 1
  bos_id: 2
  eos_id: 3
  pad_id: 0
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_priva

KeyboardInterrupt: 

In [11]:
# NOTE(review): this import sits outside the notebook's top import cell;
# consider moving it there.
from transformers import T5Tokenizer

# Wrap the raw SentencePiece model in a Hugging Face tokenizer so it can be
# saved and reloaded like the other tokenizers in this notebook.
# NOTE(review): the special-token strings below are upper-case, while the
# trainer log above lists the pieces <unk>, <s>, </s> and <pad>; presumably
# the upper-case strings end up as added tokens (added_tokens.json is written
# on save) — confirm this is intended.
sp_tokenizer = T5Tokenizer(
    vocab_file="./tokenizers/sentencepiece_raw/sentencepiece.model",
    unk_token="<UNK>",
    pad_token="<PAD>",
    bos_token="<BOS>",
    eos_token="<EOS>",
)

sp_tokenizer.save_pretrained(os.path.join(TOKENIZERS_PATH, "sentencepiece"))

('./tokenizers/sentencepiece/tokenizer_config.json',
 './tokenizers/sentencepiece/special_tokens_map.json',
 './tokenizers/sentencepiece/spiece.model',
 './tokenizers/sentencepiece/added_tokens.json')

In [37]:
# Reload the saved SentencePiece tokenizer and show how it splits the same
# sample sentence used for the other tokenizers.
sentencepiece_tokenizer = load_tokenizer(os.path.join(TOKENIZERS_PATH, "sentencepiece"))

text = "To jest przykład tekstu w języku polskim. Grzegorz Brzęczyszczykiewicz"
encoded = sentencepiece_tokenizer.encode(text, add_special_tokens=False)

print(f"VocabSize: {sentencepiece_tokenizer.vocab_size}")

# `add_special_tokens` is an encode-side option; the decode-side switch is
# `skip_special_tokens`, matching the GPT-2 demo cell.
for token in encoded:
    print(
        f"Token: {token}, decoded: {sentencepiece_tokenizer.decode(token, skip_special_tokens=False)}"
    )

Tokenizer loaded from: ./tokenizers/sentencepiece
VocabSize: 52100
Token: 111, decoded: To
Token: 32, decoded: jest
Token: 774, decoded: przykład
Token: 18088, decoded: tekstu
Token: 9, decoded: w
Token: 4570, decoded: języku
Token: 7361, decoded: polskim
Token: 5, decoded: .
Token: 5880, decoded: Grzegorz
Token: 49505, decoded: Brzęcz
Token: 18, decoded: y
Token: 5818, decoded: szczy
Token: 7180, decoded: kiewicz


### Whitespace tokenizer

In [None]:
class WhitespaceTokenizer(PreTrainedTokenizer):
    """Word-level tokenizer backed by a plain ``token -> id`` dictionary.

    Text is split on whitespace and each chunk is broken into runs of word
    characters and single punctuation marks. Tokens missing from the
    vocabulary map to the id of ``unk_token``. The vocabulary is built with
    ``build_vocab`` and stored as ``vocab.json`` by ``save_pretrained``.
    """

    def __init__(
        self,
        vocab=None,
        unk_token="<UNK>",
        pad_token="<PAD>",
        bos_token="<BOS>",
        eos_token="<EOS>",
        **kwargs,
    ):
        """Create a tokenizer from an optional ``token -> id`` mapping.

        Args:
            vocab: Mapping from token string to integer id; empty when omitted.
            unk_token, pad_token, bos_token, eos_token: Special-token strings.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.
        """
        # NOTE(review): the vocabulary and special tokens are assigned before
        # the base initialiser runs; presumably the base class consults them
        # during initialisation — confirm before reordering.
        self.vocab = vocab or {}
        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
        self.unk_token = unk_token
        self.pad_token = pad_token
        self.bos_token = bos_token
        self.eos_token = eos_token
        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs,
        )

    def encode(self, text, add_special_tokens=False):
        """Convert ``text`` to a list of token ids.

        When ``add_special_tokens`` is true the ids are wrapped in the BOS and
        EOS ids, which must then be present in the vocabulary (``KeyError``
        otherwise).
        """
        tokens = self._tokenize(text)
        ids = [self._convert_token_to_id(tok) for tok in tokens]
        if add_special_tokens:
            ids = [self.vocab[self.bos_token]] + ids + [self.vocab[self.eos_token]]
        return ids

    def decode(self, token_ids, skip_special_tokens=True):
        """Convert one id or a sequence of ids back to a space-joined string.

        Args:
            token_ids: A single int, a sequence of ints, or any object with a
                ``tolist()`` method (for example a tensor or array).
            skip_special_tokens: Drop PAD/UNK/BOS/EOS tokens from the result.
        """
        # Tensor and array elements do not compare equal to the int keys of
        # ``ids_to_tokens``, so convert such containers to plain Python values.
        if hasattr(token_ids, "tolist"):
            token_ids = token_ids.tolist()
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        tokens = [self._convert_id_to_token(i) for i in token_ids]
        if skip_special_tokens:
            tokens = [
                t
                for t in tokens
                if t
                not in [self.pad_token, self.unk_token, self.bos_token, self.eos_token]
            ]
        return self.convert_tokens_to_string(tokens)

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)

    @property
    def all_special_tokens(self):
        """Special-token strings in the order PAD, UNK, BOS, EOS."""
        return [self.pad_token, self.unk_token, self.bos_token, self.eos_token]

    @property
    def all_special_ids(self):
        """Ids of PAD, UNK, BOS, EOS; raises ``KeyError`` if any is missing."""
        return [
            self.vocab[self.pad_token],
            self.vocab[self.unk_token],
            self.vocab[self.bos_token],
            self.vocab[self.eos_token],
        ]

    def get_vocab(self):
        """Return the ``token -> id`` dictionary itself (not a copy)."""
        return self.vocab

    def _tokenize(self, text: str):
        """Split ``text`` into word-character runs and punctuation marks.

        Characters that are neither word characters nor one of the listed
        punctuation marks are discarded.
        """
        words = text.split()
        tokens = []
        for word in words:
            tokens.extend(re.findall(r'\w+|[.,!?;:()"\']', word))
        return tokens

    def _convert_token_to_id(self, token):
        """Look up ``token``; fall back to the UNK id (``None`` if absent)."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Look up ``index``; unknown ids map to the UNK token string."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Join tokens with single spaces."""
        return " ".join(tokens)

    def build_vocab(self, texts, vocab_size=50000):
        """Rebuild the vocabulary from the most frequent tokens in ``texts``.

        The special tokens take the first ids (PAD, UNK, BOS, EOS in that
        order); the remaining slots up to ``vocab_size`` are filled with the
        most common tokens. Replaces any existing vocabulary.
        """
        counter = Counter()
        for text in texts:
            counter.update(self._tokenize(text))

        special_tokens = [
            self.pad_token,
            self.unk_token,
            self.bos_token,
            self.eos_token,
        ]

        # Reserve room for the special tokens; never request a negative count.
        regular_slots = max(0, vocab_size - len(special_tokens))
        most_common = [t for t, _ in counter.most_common(regular_slots)]

        vocab = {}

        for i, tok in enumerate(special_tokens):
            vocab[tok] = i

        for i, tok in enumerate(most_common, start=len(special_tokens)):
            vocab[tok] = i

        self.vocab = vocab
        self.ids_to_tokens = {v: k for k, v in vocab.items()}

    def save_pretrained(self, save_directory):
        """Write the vocabulary to ``<save_directory>/vocab.json``.

        The directory is created when missing. Only the vocabulary is saved;
        the special-token strings are not.
        """
        vocab_file = os.path.join(save_directory, "vocab.json")
        os.makedirs(save_directory, exist_ok=True)

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False, indent=2)
        print(f"Saved vocabulary to {vocab_file}")

    @classmethod
    def from_pretrained(cls, load_directory):
        """Build a tokenizer from ``<load_directory>/vocab.json``.

        The special tokens are always the class defaults, because
        ``save_pretrained`` stores only the vocabulary.
        """
        vocab_file = os.path.join(load_directory, "vocab.json")
        with open(vocab_file, "r", encoding="utf-8") as f:
            vocab = json.load(f)
        print(f"Loaded vocabulary from {vocab_file}")
        return cls(
            vocab=vocab,
            unk_token="<UNK>",
            pad_token="<PAD>",
            bos_token="<BOS>",
            eos_token="<EOS>",
        )


In [26]:
# Build the word-level vocabulary from the training text, using the same
# target size as the GPT-2 tokenizer (`vocab_size` is set in that cell).
whitespace_tokenizer = WhitespaceTokenizer()
whitespace_tokenizer.build_vocab([train_text], vocab_size=vocab_size)

# Persist the vocabulary so it can be reloaded through load_tokenizer().
whitespace_tokenizer.save_pretrained(
    save_directory=os.path.join(TOKENIZERS_PATH, "whitespace")
)

Saved vocabulary to ./tokenizers/whitespace/vocab.json


In [None]:
# Reload the saved whitespace tokenizer; custom_class is required because
# the vocabulary is loaded by WhitespaceTokenizer.from_pretrained rather
# than by AutoTokenizer.
whitespace_tokenizer = load_tokenizer(
    os.path.join(TOKENIZERS_PATH, "whitespace"), custom_class=WhitespaceTokenizer
)

text = "To jest przykład tekstu w języku polskim. Grzegorz Brzęczyszczykiewicz"
encoded = whitespace_tokenizer.encode(text, add_special_tokens=False)

print(f"VocabSize: {whitespace_tokenizer.vocab_size}")

# The second positional argument is skip_special_tokens=False, so <UNK>
# stays visible in the printed output.
for token in encoded:
    print(f"Token: {token}, decoded: {whitespace_tokenizer.decode(token, False)}")

    # TODO: decide how to handle <UNK> tokens

Loaded vocabulary from ./tokenizers/whitespace/vocab.json
Tokenizer loaded from: ./tokenizers/whitespace
VocabSize: 52000
Token: 81, decoded: To
Token: 23, decoded: jest
Token: 659, decoded: przykład
Token: 10864, decoded: tekstu
Token: 8, decoded: w
Token: 3264, decoded: języku
Token: 5100, decoded: polskim
Token: 5, decoded: .
Token: 6579, decoded: Grzegorz
Token: 1, decoded: <UNK>


## Creating torch dataset

In [None]:
class TokenDataset(Dataset):
    """Non-overlapping next-token-prediction windows over a token sequence.

    Item ``idx`` is the pair ``(x, y)`` where ``x`` is the window of
    ``seq_len`` tokens starting at ``idx * seq_len`` and ``y`` is the same
    window shifted one position to the right.
    """

    def __init__(self, data, seq_len):
        """Store the token sequence and the window length.

        Args:
            data: Sliceable sequence of token ids with a ``len()``.
            seq_len: Number of tokens per window; must be positive.

        Raises:
            ValueError: If ``seq_len`` is not positive.
        """
        if seq_len <= 0:
            raise ValueError(f"seq_len must be positive, got {seq_len}")
        self.data = data
        self.seq_len = seq_len

    def __len__(self):
        """Number of windows for which both ``x`` and ``y`` are full length."""
        # The last target window ends at idx * seq_len + seq_len + 1, so one
        # token beyond the inputs is needed; clamp so short data yields 0
        # instead of a negative length.
        return max(0, (len(self.data) - 1) // self.seq_len)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` pair for window ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            # Also lets plain ``for item in dataset`` loops terminate.
            raise IndexError(f"index out of range for dataset of size {size}")
        i = idx * self.seq_len
        x = self.data[i : i + self.seq_len]
        y = self.data[i + 1 : i + 1 + self.seq_len]
        return x, y


def create_tensor(docs: List[str], tokenizer):
    """Tokenize ``docs`` and concatenate all ids into one 1-D long tensor.

    Documents are separated as follows:

    * if the tokenizer defines ``sep_token``, that string is appended to the
      document text before encoding;
    * otherwise, if it defines ``eos_token_id``, that id is appended after the
      encoded document unless the encoder already ended with it;
    * otherwise documents are concatenated without a separator.

    Args:
        docs: Documents to tokenize.
        tokenizer: Object with ``encode(text) -> list[int]`` and optionally
            ``sep_token`` / ``eos_token_id`` attributes.

    Returns:
        ``torch.Tensor`` of dtype ``torch.long`` holding all token ids.
    """
    tokenized_docs = []
    # A tokenizer without a separator token reports None here, and
    # ``doc + None`` would raise TypeError.
    sep = getattr(tokenizer, "sep_token", None)
    boundary_id = None
    if sep is None:
        boundary_id = getattr(tokenizer, "eos_token_id", None)

    for counter, doc in enumerate(docs):
        if sep is not None:
            tokens = list(tokenizer.encode(doc + sep))
        else:
            tokens = list(tokenizer.encode(doc))
            # Some encoders append EOS themselves; avoid doubling it.
            if boundary_id is not None and (not tokens or tokens[-1] != boundary_id):
                tokens.append(boundary_id)
        tokenized_docs.extend(tokens)

        if counter % 1000 == 0:
            print(f"Parsed: {counter}/{len(docs)} docs")

    data = torch.tensor(tokenized_docs, dtype=torch.long)
    print(f"Data shape: {data.shape}")
    return data