# Tokenization

In [1]:
import torch

from src.tokenizer import TokenizerBPE
from src.utils import saver, loader


from tqdm.notebook import tqdm

## Create Tokenizer

In [15]:
# Training split: highlights and articles from the cleaned CNN/DailyMail pickles.
highlight_train_list, article_train_list = (
    loader("corpus/cnn_dailymail_highlight_train_cleaned.pkl"),
    loader("corpus/cnn_dailymail_article_train_cleaned.pkl"),
)

# Test split, loaded the same way and in the same highlight/article order.
highlight_test_list, article_test_list = (
    loader("corpus/cnn_dailymail_highlight_test_cleaned.pkl"),
    loader("corpus/cnn_dailymail_article_test_cleaned.pkl"),
)


In [16]:
# Text handed to the tokenizer: every highlight and article from both the
# train and test splits, concatenated into one flat sequence of entries.
corpus = highlight_train_list + article_train_list + highlight_test_list + article_test_list

# Count characters by summing per-entry lengths. The total is identical to
# len("".join(corpus)) but no temporary copy of the whole corpus is built.
# This form also avoids nesting double quotes inside a double-quoted f-string,
# which only parses on Python 3.12 and later.
print(f"Total number of characters in the corpus: {sum(len(text) for text in corpus)}")

Total number of characters in the corpus: 1288375458


In [4]:
# Build the BPE tokenizer from the combined corpus assembled above.
# NOTE(review): the output recorded for this cell ends in KeyboardInterrupt, so
# the captured run did not finish — confirm `tokenizer` comes from a completed
# run before it is saved.
tokenizer = TokenizerBPE(corpus=corpus, 
                         num_merges=24000,  # do 24k merges, resulting in ~24k tokens
                         ratio=0.1,         # tokenize 10% of words in corpus
                         verbose=True       # print merge details
                         )

Create character tokenizer


KeyboardInterrupt: 

In [9]:
# NOTE(review): registering "<s>", "<h>", "<b>" with the tokenizer is disabled
# here, yet add_special_tokens() later in this notebook writes those markers
# into the text that gets encoded — confirm the saved tokenizer treats them as
# intended.
#tokenizer.add_special_tokens(["<s>", "<h>", "<b>"])  # add special tokens

# Persist the tokenizer to "cnn_tokenizer.pkl"; the "Tokenize Corpus" section
# reloads it from this same path.
saver("cnn_tokenizer.pkl", tokenizer)

## Tokenize Corpus

In [17]:
# Reload the tokenizer from "cnn_tokenizer.pkl" (the path written in the
# "Create Tokenizer" section) — presumably so this section can run without
# rebuilding it.
tokenizer = loader("cnn_tokenizer.pkl")


In [18]:
def add_special_tokens(corpus_list):
    """Render (highlight, article) pairs into one marked-up string.

    Every entry becomes ``<s><h>{highlight}<b>{article}`` and the rendered
    entries are concatenated back to back, in input order, with no separator.

    Args:
        corpus_list: Iterable of two-item entries ``(highlight, article)``.

    Returns:
        A single string containing all rendered entries; ``""`` when
        ``corpus_list`` is empty.

    Raises:
        ValueError: If an entry does not unpack into exactly two items.
    """
    # Wrap the input once so the progress bar tracks entries as they are rendered.
    progress = tqdm(corpus_list, desc="Adding start and stop tokens")
    return "".join(
        f"<s><h>{summary}<b>{body}" for summary, body in progress
    )

In [19]:
# Pair each training highlight with its article and flatten all pairs into one
# marked-up string.
# zip() stops at the shorter input, so the two lists are assumed to have equal
# length and to be aligned by index — TODO confirm.
corpus_train = add_special_tokens(list(zip(highlight_train_list, article_train_list)))
# Character count of the full training string; the next cell uses it to size a
# quarter-length slice.
length = len(corpus_train)

Adding start and stop tokens:   0%|          | 0/287113 [00:00<?, ?it/s]

In [20]:
# Size of the first quarter of the training string. A slice clamps to the end
# of the string, so len(corpus_train[:k]) equals min(k, len(corpus_train)) for
# k >= 0; computing that directly prints the same number without copying
# hundreds of millions of characters just to measure them.
print(min(length // 4, len(corpus_train)))

310482287


In [None]:
# Encode the entire marked-up training string in a single call (verbose=True).
# NOTE(review): the recorded run of this cell ends in KeyboardInterrupt, so
# corpus_train_tokens may never have been assigned in that session.
corpus_train_tokens = tokenizer.encode(corpus_train, verbose=True)

  0%|          | 0/24000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [11]:
# Encode every training article on its own, wrap each encoding in a torch
# tensor, and save the resulting list of tensors.
# NOTE(review): the recorded run of this cell was interrupted, and an identical
# cell appears again at the end of this section — consider keeping only one.
article_train_tokens = [torch.tensor(tokenizer.encode(entry)) for entry in tqdm(article_train_list)]
saver("corpus/cnn_dailymail_article_train_tokens.pkl", article_train_tokens)

  0%|          | 0/287113 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Encode test highlights one by one into torch tensors and save the list.
# NOTE(review): only the first 100 highlights are encoded ([:100]), but the
# result is saved under the full-dataset filename — confirm the subset is
# intentional and not a leftover from a quick trial run.
highlight_test_tokens = [torch.tensor(tokenizer.encode(entry)) for entry in tqdm(highlight_test_list[:100])]
saver("corpus/cnn_dailymail_highlight_test_tokens.pkl", highlight_test_tokens)

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Encode every training article into a torch tensor and save the list.
# NOTE(review): this cell is identical to an earlier cell in this section (same
# variable, same output path); running both repeats the full encoding pass and
# overwrites the same file — consider removing one of them.
article_train_tokens = [torch.tensor(tokenizer.encode(entry)) for entry in tqdm(article_train_list)]
saver("corpus/cnn_dailymail_article_train_tokens.pkl", article_train_tokens)

## 