# Tokenization

In [1]:
import torch
import pickle as pkl
import matplotlib.pyplot as plt

from transformer_kristianwold.tokenizer import TokenizerBPE
from transformer_kristianwold.utils import saver, loader

from tqdm.notebook import tqdm

## Load cleaned corpus

In [None]:
# Load the cleaned train/test splits; highlights and articles live in separate .pkl files.
# NOTE(review): this cell reads from "../../corpus/", whereas the cells further down
# save to "../corpus/" — confirm which directory is intended.
highlight_train_list = loader("../../corpus/cnn_dailymail_highlight_train_cleaned.pkl")
article_train_list = loader("../../corpus/cnn_dailymail_article_train_cleaned.pkl")

highlight_test_list = loader("../../corpus/cnn_dailymail_highlight_test_cleaned.pkl")
article_test_list = loader("../../corpus/cnn_dailymail_article_test_cleaned.pkl")

FileNotFoundError: [Errno 2] No such file or directory: '../corpus/cnn_dailymail_highlight_train_cleaned.pkl'

In [None]:
# Pool every split (train and test, highlights and articles) into one list of texts.
corpus = highlight_train_list + article_train_list + highlight_test_list + article_test_list

# Sum the per-text lengths instead of joining everything into one throwaway string:
# same number, without a second copy of the whole corpus in memory. Computing it
# outside the f-string also avoids nesting double quotes inside a double-quoted
# f-string, which is a SyntaxError before Python 3.12.
total_chars = sum(len(text) for text in corpus)
print(f"Total number of characters in the corpus: {total_chars}")

## Run Byte Pair Encoding

In [None]:
# Fit the byte-pair-encoding tokenizer on the pooled corpus.
tokenizer = TokenizerBPE(
    corpus=corpus,
    num_merges=24000,  # 24k merges, giving a vocabulary of roughly 24k entries
    ratio=0.1,         # run BPE on a random 10% subset of the corpus words for efficiency
    verbose=True,      # print merge details while training
)

In [None]:
# Register the structural marker tokens, then persist the tokenizer to disk.
tokenizer.add_special_tokens(
    [
        "<s>",   # start
        "</s>",  # end
        "<h>",   # highlight
        "<b>",   # bodytext
    ]
)

saver("../tokenizers/cnn_tokenizer.pkl", tokenizer)

## Load Tokenizer and Define Formatting Helpers

In [8]:
# Reload the tokenizer from the file this notebook saves after adding the special
# tokens, so the encoding cells can run without retraining BPE. The previous target,
# "cnn_tokenizer3.pkl", is never written anywhere in this notebook.
tokenizer = loader("../tokenizers/cnn_tokenizer.pkl")

In [15]:
def add_special_tokens(corpus_list):
    """Serialise (highlight, article) pairs into one highlight-first string.

    Every pair is rendered as ``<s><h>{highlight}<b>{article}</s>`` and the
    rendered entries are concatenated in input order with no separator.

    Args:
        corpus_list: Iterable of two-element (highlight, article) entries.

    Returns:
        One string containing all formatted entries back to back
        (the empty string when ``corpus_list`` is empty).
    """
    return "".join(
        f"<s><h>{highlight}<b>{article}</s>" for highlight, article in corpus_list
    )

def add_special_tokens_HLlast(corpus_list):
    """Serialise (highlight, article) pairs into one highlight-last string.

    Every pair is rendered as ``<s><b>{article}<h>{highlight}</s>``, i.e. the
    article body comes first and the highlight follows it. The rendered
    entries are concatenated in input order with no separator.

    Args:
        corpus_list: Iterable of two-element (highlight, article) entries.

    Returns:
        One string containing all formatted entries back to back
        (the empty string when ``corpus_list`` is empty).
    """
    return "".join(
        f"<s><b>{article}<h>{highlight}</s>" for highlight, article in corpus_list
    )

## Tokenize Corpus

### Highlight First

In [None]:
# Build the highlight-first training text: one long string of "<s><h>...<b>...</s>" entries.
corpus_train = add_special_tokens(list(zip(highlight_train_list, article_train_list)))
# Character count of that string; the chunking cells below slice it into quarters.
length = len(corpus_train)

In [None]:
# Tokenize in four chunks to avoid memory problems

# Chunk 1 of 4: the first quarter of the training string, measured in characters.
# NOTE(review): the cut is at an arbitrary character offset, so it may fall inside a
# word or a special token — confirm the encoding is unaffected at chunk boundaries.
# NOTE(review): the file name says "article_train", but corpus_train here is the
# combined highlight+article text — confirm the naming is intentional.
corpus_train_tokens = tokenizer.encode(corpus_train[:length//4], verbose=True)
saver("../corpus/cnn_dailymail_article_train_tokens1.pkl", corpus_train_tokens)

In [None]:
# Chunk 2 of 4: characters from the one-quarter mark up to the midpoint.
corpus_train_tokens = tokenizer.encode(corpus_train[length//4:length//2], verbose=True)
saver("../corpus/cnn_dailymail_article_train_tokens2.pkl", corpus_train_tokens)

In [None]:
# Chunk 3 of 4: characters from the midpoint up to the three-quarter mark.
corpus_train_tokens = tokenizer.encode(corpus_train[length//2:3*length//4], verbose=True)
saver("../corpus/cnn_dailymail_article_train_tokens3.pkl", corpus_train_tokens)

In [None]:
# Chunk 4 of 4: everything from the three-quarter mark to the end of the string.
corpus_train_tokens = tokenizer.encode(corpus_train[3*length//4:], verbose=True)
saver("../corpus/cnn_dailymail_article_train_tokens4.pkl", corpus_train_tokens)

In [None]:
# Concatenate results

# Reload the four token chunks from "../corpus/", the directory the chunking cells
# write them to (this cell previously read from "corpus/", a different location),
# convert each one to a tensor and join them along dim 0 in chunk order.
corpus_train1 = torch.tensor(loader("../corpus/cnn_dailymail_article_train_tokens1.pkl"))
corpus_train2 = torch.tensor(loader("../corpus/cnn_dailymail_article_train_tokens2.pkl"))
corpus_train3 = torch.tensor(loader("../corpus/cnn_dailymail_article_train_tokens3.pkl"))
corpus_train4 = torch.tensor(loader("../corpus/cnn_dailymail_article_train_tokens4.pkl"))
corpus_train = torch.cat((corpus_train1, corpus_train2, corpus_train3, corpus_train4), dim=0)

# Persist the full highlight-first training tensor.
saver("../corpus/cnn_dailymail_highlight_first_train.pkl", corpus_train)

### Highlight Last

In [None]:
# Build the highlight-last training text: one long string of "<s><b>...<h>...</s>" entries.
corpus_train = add_special_tokens_HLlast(list(zip(highlight_train_list, article_train_list)))
# Character count of that string; the chunking cells below slice it into quarters.
length = len(corpus_train)

In [None]:
# Tokenize in four chunks to avoid memory problems

# Chunk 1 of 4: the first quarter of the highlight-last string, measured in characters.
# NOTE(review): the cut is at an arbitrary character offset, so it may fall inside a
# word or a special token — confirm the encoding is unaffected at chunk boundaries.
corpus_train_tokens = tokenizer.encode(corpus_train[:length//4], verbose=True)
saver("../corpus/cnn_dailymail_HLlast_train_tokens1.pkl", corpus_train_tokens)

In [None]:
# Chunk 2 of 4: characters from the one-quarter mark up to the midpoint.
corpus_train_tokens = tokenizer.encode(corpus_train[length//4:length//2], verbose=True)
saver("../corpus/cnn_dailymail_HLlast_train_tokens2.pkl", corpus_train_tokens)

In [None]:
# Chunk 3 of 4: characters from the midpoint up to the three-quarter mark.
corpus_train_tokens = tokenizer.encode(corpus_train[length//2:3*length//4], verbose=True)
saver("../corpus/cnn_dailymail_HLlast_train_tokens3.pkl", corpus_train_tokens)

In [None]:
# Chunk 4 of 4: everything from the three-quarter mark to the end of the string.
corpus_train_tokens = tokenizer.encode(corpus_train[3*length//4:], verbose=True)
saver("../corpus/cnn_dailymail_HLlast_train_tokens4.pkl", corpus_train_tokens)

In [None]:
# Concatenate results

# Reload the four highlight-last token chunks from "../corpus/", the directory the
# chunking cells write them to (this cell previously read from "corpus/", a different
# location), convert each one to a tensor and join them along dim 0 in chunk order.
corpus_train1 = torch.tensor(loader("../corpus/cnn_dailymail_HLlast_train_tokens1.pkl"))
corpus_train2 = torch.tensor(loader("../corpus/cnn_dailymail_HLlast_train_tokens2.pkl"))
corpus_train3 = torch.tensor(loader("../corpus/cnn_dailymail_HLlast_train_tokens3.pkl"))
corpus_train4 = torch.tensor(loader("../corpus/cnn_dailymail_HLlast_train_tokens4.pkl"))
corpus_train = torch.cat((corpus_train1, corpus_train2, corpus_train3, corpus_train4), dim=0)

# Persist the full highlight-last training tensor.
saver("../corpus/cnn_dailymail_highlight_last_train.pkl", corpus_train)

## Test Data

In [None]:
# Highlight-first test text, formatted with the same helper as the training text.
corpus_test = add_special_tokens(list(zip(highlight_test_list, article_test_list)))

In [None]:
# Encode the highlight-first test text in one pass and store it as a tensor.
# The encoded output gets its own name (mirroring corpus_train_tokens in the training
# cells) so corpus_test keeps holding the raw text; re-running this cell therefore
# re-encodes the text instead of feeding already-encoded tokens back into encode().
corpus_test_tokens = tokenizer.encode(corpus_test, verbose=True)
saver("../corpus/cnn_dailymail_highlight_first_test.pkl", torch.tensor(corpus_test_tokens))

In [None]:
# Highlight-last test text, formatted with the same helper as the highlight-last training text.
corpus_test = add_special_tokens_HLlast(list(zip(highlight_test_list, article_test_list)))

In [None]:
# Encode the highlight-last test text in one pass and store it as a tensor.
# The encoded output gets its own name (mirroring corpus_train_tokens in the training
# cells) so corpus_test keeps holding the raw text; re-running this cell therefore
# re-encodes the text instead of feeding already-encoded tokens back into encode().
corpus_test_tokens = tokenizer.encode(corpus_test, verbose=True)
saver("../corpus/cnn_dailymail_highlight_last_test.pkl", torch.tensor(corpus_test_tokens))

## 