# Tokenization

In [2]:
import torch

from src.tokenizer import TokenizerBPE
from src.utils import saver, loader


from tqdm.notebook import tqdm

## Create Tokenizer

In [3]:
# Load the cleaned highlight and article lists for the train split.
# The two lists of a split are zipped pairwise further down, so they are
# expected to be index-aligned (entry i of one belongs to entry i of the other).
highlight_train_list = loader("corpus/cnn_dailymail_highlight_train_cleaned.pkl")
article_train_list = loader("corpus/cnn_dailymail_article_train_cleaned.pkl")

# Same pair of lists for the test split.
highlight_test_list = loader("corpus/cnn_dailymail_highlight_test_cleaned.pkl")
article_test_list = loader("corpus/cnn_dailymail_article_test_cleaned.pkl")


In [4]:
# Tokenizer-training corpus: every highlight and article from both splits,
# concatenated as one flat list of strings.
corpus = highlight_train_list + article_train_list + highlight_test_list + article_test_list

# Sum the per-entry lengths instead of joining everything into one throwaway
# string just to measure it. This also avoids reusing double quotes inside
# the f-string, which only parses on Python 3.12+.
print(f"Total number of characters in the corpus: {sum(map(len, corpus))}")

Total number of characters in the corpus: 1288375458


In [4]:
# Build the BPE tokenizer from the combined corpus assembled above
# (that corpus contains the test split as well as the train split).
# NOTE(review): the meaning of each argument is taken from the inline notes
# below; confirm against src.tokenizer.TokenizerBPE.
tokenizer = TokenizerBPE(corpus=corpus, 
                         num_merges=24000,  # do 24k merges, resulting in ~24k tokens
                         ratio=0.1,         # tokenize 10% of words in corpus
                         verbose=True       # print merge details
                         )

Create character tokenizer


KeyboardInterrupt: 

In [None]:
# Register the four marker strings with the tokenizer. These are the same
# markers that the formatting helpers in the "Tokenize Corpus" section insert
# around every highlight/article entry.
tokenizer.add_special_tokens(["<s>",  # start
                              "</s>", # end
                              "<h>",  # highlight
                              "<b>"]) # body

# Persist the tokenizer; the "Tokenize Corpus" section reloads it from this path.
saver("cnn_tokenizer.pkl", tokenizer)

## Tokenize Corpus

In [5]:
# Reload the tokenizer saved to "cnn_tokenizer.pkl" so the encoding cells
# below can run without rebuilding it.
tokenizer = loader("cnn_tokenizer.pkl")

In [6]:
def _join_with_special_tokens(corpus_list, template):
    """Format every (highlight, article) pair with `template` and concatenate.

    `template` is a `str.format` template with `{highlight}` and `{article}`
    fields. Only the template is parsed for fields; braces inside the
    highlight or article text are inserted verbatim.
    """
    return "".join(
        template.format(highlight=highlight, article=article)
        for highlight, article in tqdm(corpus_list, desc="Adding start and stop tokens")
    )


def add_special_tokens(corpus_list):
    """Wrap each entry as ``<s><h>{highlight}<b>{article}</s>`` (highlight first).

    Parameters
    ----------
    corpus_list : iterable of (highlight, article) pairs

    Returns
    -------
    str
        All wrapped entries concatenated with no separator ("" if empty).
    """
    return _join_with_special_tokens(corpus_list, "<s><h>{highlight}<b>{article}</s>")


def add_special_tokens_HLlast(corpus_list):
    """Wrap each entry as ``<s><b>{article}<h>{highlight}</s>`` (highlight last).

    Parameters
    ----------
    corpus_list : iterable of (highlight, article) pairs

    Returns
    -------
    str
        All wrapped entries concatenated with no separator ("" if empty).
    """
    return _join_with_special_tokens(corpus_list, "<s><b>{article}<h>{highlight}</s>")

## Train Data

In [6]:
# Highlight-first layout: all train entries wrapped in special tokens and
# concatenated into a single string.
corpus_train = add_special_tokens(list(zip(highlight_train_list, article_train_list)))
# Character count of that string; the cells below slice it into quarters.
length = len(corpus_train)

Adding start and stop tokens:   0%|          | 0/287113 [00:00<?, ?it/s]

In [8]:
# Quarter 1 of 4 of the train string: characters [0, length//4).
# NOTE(review): the slice bound is a character offset, so the cut can fall in
# the middle of an entry (or of a marker such as "<s>"); confirm this is acceptable.
corpus_train_tokens = tokenizer.encode(corpus_train[:length//4], verbose=True)
saver("corpus/cnn_dailymail_article_train_tokens1.pkl", corpus_train_tokens)

  0%|          | 0/24000 [00:00<?, ?it/s]

In [9]:
# Quarter 2 of 4 of the train string: characters [length//4, length//2).
# `corpus_train_tokens` is rebound here; the previous quarter lives only on disk.
corpus_train_tokens = tokenizer.encode(corpus_train[length//4:length//2], verbose=True)
saver("corpus/cnn_dailymail_article_train_tokens2.pkl", corpus_train_tokens)

  0%|          | 0/24000 [00:00<?, ?it/s]

In [7]:
# Quarter 3 of 4 of the train string: characters [length//2, 3*length//4).
# `corpus_train_tokens` is rebound here; the previous quarter lives only on disk.
corpus_train_tokens = tokenizer.encode(corpus_train[length//2:3*length//4], verbose=True)
saver("corpus/cnn_dailymail_article_train_tokens3.pkl", corpus_train_tokens)

  0%|          | 0/24000 [00:00<?, ?it/s]

In [8]:
# Quarter 4 of 4 of the train string: characters [3*length//4, end).
# `corpus_train_tokens` is rebound here; the previous quarter lives only on disk.
corpus_train_tokens = tokenizer.encode(corpus_train[3*length//4:], verbose=True)
saver("corpus/cnn_dailymail_article_train_tokens4.pkl", corpus_train_tokens)

  0%|          | 0/24000 [00:00<?, ?it/s]

### Highlight Last

In [13]:
# Highlight-last layout of the same train entries. This rebinds `corpus_train`
# (and `length`), replacing the highlight-first string built earlier.
corpus_train = add_special_tokens_HLlast(list(zip(highlight_train_list, article_train_list)))
# Character count of the new string; the cells below slice it into quarters.
length = len(corpus_train)

Adding start and stop tokens:   0%|          | 0/287113 [00:00<?, ?it/s]

In [14]:
# Quarter 1 of 4 of the highlight-last train string: characters [0, length//4).
# NOTE(review): the slice bound is a character offset, so the cut can fall in
# the middle of an entry (or of a marker such as "<s>"); confirm this is acceptable.
corpus_train_tokens = tokenizer.encode(corpus_train[:length//4], verbose=True)
saver("corpus/cnn_dailymail_HLlast_train_tokens1.pkl", corpus_train_tokens)

  0%|          | 0/24000 [00:00<?, ?it/s]

In [15]:
# Quarter 2 of 4 of the highlight-last train string: characters [length//4, length//2).
# `corpus_train_tokens` is rebound here; the previous quarter lives only on disk.
corpus_train_tokens = tokenizer.encode(corpus_train[length//4:length//2], verbose=True)
saver("corpus/cnn_dailymail_HLlast_train_tokens2.pkl", corpus_train_tokens)

  0%|          | 0/24000 [00:00<?, ?it/s]

In [16]:
# Quarter 3 of 4 of the highlight-last train string: characters [length//2, 3*length//4).
# `corpus_train_tokens` is rebound here; the previous quarter lives only on disk.
corpus_train_tokens = tokenizer.encode(corpus_train[length//2:3*length//4], verbose=True)
saver("corpus/cnn_dailymail_HLlast_train_tokens3.pkl", corpus_train_tokens)

  0%|          | 0/24000 [00:00<?, ?it/s]

In [17]:
# Quarter 4 of 4 of the highlight-last train string: characters [3*length//4, end).
# `corpus_train_tokens` is rebound here; the previous quarter lives only on disk.
corpus_train_tokens = tokenizer.encode(corpus_train[3*length//4:], verbose=True)
saver("corpus/cnn_dailymail_HLlast_train_tokens4.pkl", corpus_train_tokens)

  0%|          | 0/24000 [00:00<?, ?it/s]

## Test Data

In [11]:
# Highlight-first layout: all test entries wrapped in special tokens and
# concatenated into a single string.
corpus_test = add_special_tokens(list(zip(highlight_test_list, article_test_list)))
# Measure the test string built in this cell (not the train string), so that
# `length` describes the corpus currently being processed.
length = len(corpus_test)

Adding start and stop tokens:   0%|          | 0/11490 [00:00<?, ?it/s]

In [17]:
# Encode only the first 1,000,000 characters of the test string and save them.
# NOTE(review): the cutoff is a hard-coded character offset, so the last entry
# is likely truncated mid-text; confirm the 1M limit is intentional.
corpus_test_tokens = tokenizer.encode(corpus_test[:1000000], verbose=True)
saver("corpus/cnn_dailymail_article_test_tokens.pkl", corpus_test_tokens)

  0%|          | 0/24000 [00:00<?, ?it/s]

In [18]:
# Highlight-last layout of the same test entries. This rebinds `corpus_test`,
# replacing the highlight-first string built earlier.
corpus_test = add_special_tokens_HLlast(list(zip(highlight_test_list, article_test_list)))
# Measure the test string built in this cell (not the train string), so that
# `length` describes the corpus currently being processed.
length = len(corpus_test)

Adding start and stop tokens:   0%|          | 0/11490 [00:00<?, ?it/s]

In [19]:
# Encode only the first 1,000,000 characters of the highlight-last test string
# and save them.
# NOTE(review): the cutoff is a hard-coded character offset, so the last entry
# is likely truncated mid-text; confirm the 1M limit is intentional.
corpus_test_tokens = tokenizer.encode(corpus_test[:1000000], verbose=True)
saver("corpus/cnn_dailymail_HLlast_test_tokens.pkl", corpus_test_tokens)

  0%|          | 0/24000 [00:00<?, ?it/s]

## 