In [495]:
# Dataset

from datasets import load_dataset
ds = load_dataset("iohadrubin/wikitext-103-raw-v1")
ds

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 29567
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 60
    })
    test: Dataset({
        features: ['text'],
        num_rows: 62
    })
})

In [496]:
def add_end_to_each_example(example):
    """Append the ``<end>`` marker to the text of one example.

    Args:
        example: Either a plain string, or a mapping with a ``"text"`` entry
            (the dataset loaded above has ``text`` as its only feature).

    Returns:
        dict: ``{"text": "<original text> <end>"}``.
    """
    from collections.abc import Mapping

    # When this is used as a `map` callback the example is a row mapping, not a
    # string; formatting the mapping itself would embed its repr
    # ("{'text': ...}") in the result instead of the actual text.
    text = example["text"] if isinstance(example, Mapping) else example
    return {
        "text": f"{text} <end>"
    }

add_end_to_each_example("Hi, I am Harsh Agarwal.")

{'text': 'Hi, I am Harsh Agarwal. <end>'}

In [497]:
%%time

ds['train'].map(
    add_end_to_each_example
)

CPU times: user 1.78 ms, sys: 7.15 ms, total: 8.93 ms
Wall time: 13.2 ms


Dataset({
    features: ['text'],
    num_rows: 29567
})

## Processing the text in parallel with `num_proc`

In [498]:
%%time

ds['train'].map(
    add_end_to_each_example,
    num_proc=4
)

CPU times: user 5.37 ms, sys: 6.12 ms, total: 11.5 ms
Wall time: 15.8 ms


Dataset({
    features: ['text'],
    num_rows: 29567
})

In [499]:
ds['train'][0]['text']

'= Valkyria Chronicles III =\nSenjō no Valkyria 3: Unrecorded Chronicles (Japanese: 戦場のヴァルキュリア3, lit. Valkyria of the Battlefield 3), commonly referred to as Valkyria Chronicles III outside Japan, is a tactical role-playing video game developed by Sega and Media.Vision for the PlayStation Portable. Released in January 2011 in Japan, it is the third game in the Valkyria series. Employing the same fusion of tactical and real-time gameplay as its predecessors, the story runs parallel to the first game and follows the "Nameless", a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit "Calamaty Raven".\nThe game began development in 2010, carrying over a large portion of the work done on Valkyria Chronicles II. While it retained the standard features of the series, it also underwent multiple adjustments, such as making the game more forgiving for series newcomers. Character designer Raita 

# Implementation of BPE

### The steps for implementing BPE are:
- Text Normalization
- Pre-tokenization
- Splitting words into characters
- Applying merge rules to reach the desired number of tokens

In [500]:
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [501]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

In [502]:
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(corpus[3]), tokenizer.tokenize(corpus[3])

([('Hopefully', (0, 9)),
  (',', (9, 10)),
  ('Ġyou', (10, 14)),
  ('Ġwill', (14, 19)),
  ('Ġbe', (19, 22)),
  ('Ġable', (22, 27)),
  ('Ġto', (27, 30)),
  ('Ġunderstand', (30, 41)),
  ('Ġhow', (41, 45)),
  ('Ġthey', (45, 50)),
  ('Ġare', (50, 54)),
  ('Ġtrained', (54, 62)),
  ('Ġand', (62, 66)),
  ('Ġgenerate', (66, 75)),
  ('Ġtokens', (75, 82)),
  ('.', (82, 83))],
 ['Hopefully',
  ',',
  'Ġyou',
  'Ġwill',
  'Ġbe',
  'Ġable',
  'Ġto',
  'Ġunderstand',
  'Ġhow',
  'Ġthey',
  'Ġare',
  'Ġtrained',
  'Ġand',
  'Ġgenerate',
  'Ġtokens',
  '.'])

In [503]:
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(corpus[1]), tokenizer.tokenize(corpus[1])

([('This', (0, 4)),
  ('Ġchapter', (4, 12)),
  ('Ġis', (12, 15)),
  ('Ġabout', (15, 21)),
  ('Ġtokenization', (21, 34)),
  ('.', (34, 35))],
 ['This', 'Ġchapter', 'Ġis', 'Ġabout', 'Ġtoken', 'ization', '.'])

In [504]:
# Count how often each pre-tokenized word occurs across the whole corpus.
word_freq = {}

for sent in corpus:
    # pre_tokenize_str yields (word, (start, end)) items; only the word text
    # (index 0) is used as the counting key.
    words = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(sent)
    for word in words:
        if word[0] in word_freq.keys():
            word_freq[word[0]] += 1
        else:
            word_freq[word[0]] = 1

word_freq

{'This': 3,
 'Ġis': 2,
 'Ġthe': 1,
 'ĠHugging': 1,
 'ĠFace': 1,
 'ĠCourse': 1,
 '.': 4,
 'Ġchapter': 1,
 'Ġabout': 1,
 'Ġtokenization': 1,
 'Ġsection': 1,
 'Ġshows': 1,
 'Ġseveral': 1,
 'Ġtokenizer': 1,
 'Ġalgorithms': 1,
 'Hopefully': 1,
 ',': 1,
 'Ġyou': 1,
 'Ġwill': 1,
 'Ġbe': 1,
 'Ġable': 1,
 'Ġto': 1,
 'Ġunderstand': 1,
 'Ġhow': 1,
 'Ġthey': 1,
 'Ġare': 1,
 'Ġtrained': 1,
 'Ġand': 1,
 'Ġgenerate': 1,
 'Ġtokens': 1}

In [505]:
list(list(word_freq.keys())[0])

['T', 'h', 'i', 's']

In [506]:
# Collect every distinct character that appears in the pre-tokenized words.
alphabets = []

for word in word_freq.keys():
    letters = list(word)
    for letter in letters:
        if letter not in alphabets:
            alphabets.append(letter)

# Sort in place so the base alphabet has a deterministic order.
alphabets.sort()

alphabets

[',',
 '.',
 'C',
 'F',
 'H',
 'T',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'y',
 'z',
 'Ġ']

In [507]:
# Base vocabulary: the special end-of-text token followed by the sorted characters.
vocab = ["<|endoftext|>"] + alphabets
vocab

['<|endoftext|>',
 ',',
 '.',
 'C',
 'F',
 'H',
 'T',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'y',
 'z',
 'Ġ']

In [508]:
# Repeat each word as many times as it occurs, then flatten into one list.
t = [[k]*v for k, v in word_freq.items()]
words_list = [k for i in t for k in i]

# Starting point for BPE: one list of single characters per word occurrence.
splits = [[i for i in word] for word in words_list]

words_list, splits

(['This',
  'This',
  'This',
  'Ġis',
  'Ġis',
  'Ġthe',
  'ĠHugging',
  'ĠFace',
  'ĠCourse',
  '.',
  '.',
  '.',
  '.',
  'Ġchapter',
  'Ġabout',
  'Ġtokenization',
  'Ġsection',
  'Ġshows',
  'Ġseveral',
  'Ġtokenizer',
  'Ġalgorithms',
  'Hopefully',
  ',',
  'Ġyou',
  'Ġwill',
  'Ġbe',
  'Ġable',
  'Ġto',
  'Ġunderstand',
  'Ġhow',
  'Ġthey',
  'Ġare',
  'Ġtrained',
  'Ġand',
  'Ġgenerate',
  'Ġtokens'],
 [['T', 'h', 'i', 's'],
  ['T', 'h', 'i', 's'],
  ['T', 'h', 'i', 's'],
  ['Ġ', 'i', 's'],
  ['Ġ', 'i', 's'],
  ['Ġ', 't', 'h', 'e'],
  ['Ġ', 'H', 'u', 'g', 'g', 'i', 'n', 'g'],
  ['Ġ', 'F', 'a', 'c', 'e'],
  ['Ġ', 'C', 'o', 'u', 'r', 's', 'e'],
  ['.'],
  ['.'],
  ['.'],
  ['.'],
  ['Ġ', 'c', 'h', 'a', 'p', 't', 'e', 'r'],
  ['Ġ', 'a', 'b', 'o', 'u', 't'],
  ['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n'],
  ['Ġ', 's', 'e', 'c', 't', 'i', 'o', 'n'],
  ['Ġ', 's', 'h', 'o', 'w', 's'],
  ['Ġ', 's', 'e', 'v', 'e', 'r', 'a', 'l'],
  ['Ġ', 't', 'o', 'k', 'e', 'n', '

In [509]:
# splits = {word: freq*[i for i in word] for word, freq in word_freq.items()}
# splits

In [510]:
# Count every adjacent token pair over all word occurrences in `splits`.
pair_freq = {}

for s in splits:
    for i in range(0, len(s)-1):
        # The key is the pair (s[i], s[i+1]) as a tuple so it can be a dict key.
        key = tuple(s[i:i+2])
        if key in pair_freq.keys():
            pair_freq[key] += 1
        else:
            pair_freq[key] = 1

pair_freq

{('T', 'h'): 3,
 ('h', 'i'): 3,
 ('i', 's'): 5,
 ('Ġ', 'i'): 2,
 ('Ġ', 't'): 7,
 ('t', 'h'): 3,
 ('h', 'e'): 2,
 ('Ġ', 'H'): 1,
 ('H', 'u'): 1,
 ('u', 'g'): 1,
 ('g', 'g'): 1,
 ('g', 'i'): 1,
 ('i', 'n'): 2,
 ('n', 'g'): 1,
 ('Ġ', 'F'): 1,
 ('F', 'a'): 1,
 ('a', 'c'): 1,
 ('c', 'e'): 1,
 ('Ġ', 'C'): 1,
 ('C', 'o'): 1,
 ('o', 'u'): 3,
 ('u', 'r'): 1,
 ('r', 's'): 2,
 ('s', 'e'): 3,
 ('Ġ', 'c'): 1,
 ('c', 'h'): 1,
 ('h', 'a'): 1,
 ('a', 'p'): 1,
 ('p', 't'): 1,
 ('t', 'e'): 2,
 ('e', 'r'): 5,
 ('Ġ', 'a'): 5,
 ('a', 'b'): 2,
 ('b', 'o'): 1,
 ('u', 't'): 1,
 ('t', 'o'): 4,
 ('o', 'k'): 3,
 ('k', 'e'): 3,
 ('e', 'n'): 4,
 ('n', 'i'): 2,
 ('i', 'z'): 2,
 ('z', 'a'): 1,
 ('a', 't'): 2,
 ('t', 'i'): 2,
 ('i', 'o'): 2,
 ('o', 'n'): 2,
 ('Ġ', 's'): 3,
 ('e', 'c'): 1,
 ('c', 't'): 1,
 ('s', 'h'): 1,
 ('h', 'o'): 2,
 ('o', 'w'): 2,
 ('w', 's'): 1,
 ('e', 'v'): 1,
 ('v', 'e'): 1,
 ('r', 'a'): 3,
 ('a', 'l'): 2,
 ('z', 'e'): 1,
 ('l', 'g'): 1,
 ('g', 'o'): 1,
 ('o', 'r'): 1,
 ('r', 'i'): 1,
 ('i', '

In [511]:
# pair_freq = {}

# for k, v in splits.items():
#     for i in range(0, len(v)-1):
#         key = tuple(v[i:i+2])
#         if key in pair_freq.keys():
#             pair_freq[key] += 1
#         else:
#             pair_freq[key] = 1

# pair_freq

In [512]:
# Find the most frequent adjacent pair and its count.
max_counts = 0
max_counts_str = ""

for ks, vs in pair_freq.items():
    # `>=` (not `>`): among pairs tied for the highest count, the one that
    # comes last in the dict's iteration order is kept.
    if vs>=max_counts:
        max_counts = vs
        max_counts_str = ks

max_counts, max_counts_str

(7, ('Ġ', 't'))

In [513]:
# Concatenate the two halves of the winning pair into the new token string.
merged_token = ''.join(list(max_counts_str))
merged_token

'Ġt'

In [514]:
# NOTE: this appends in place, so re-running the cell adds the token again.
vocab.append(merged_token)
vocab

['<|endoftext|>',
 ',',
 '.',
 'C',
 'F',
 'H',
 'T',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'y',
 'z',
 'Ġ',
 'Ġt']

In [515]:
# Exploratory version of the merge step: rebuild every word that contains both
# halves of the winning pair. Words that do not are NOT copied into `sp`, so
# `sp` ends up shorter than `splits`.
sp = []
for s in splits:
    if list(max_counts_str)[0] in s and list(max_counts_str)[1] in s:
        print(s)
        new_values = []
        for i in range(len(s)):
            try:
                # Merge when the previous output token plus the current token
                # spells the token most recently appended to `vocab`.
                if new_values[-1]+s[i] == vocab[-1]:
                    new_values[-1] = new_values[-1]+s[i]
                else:
                    new_values.append(s[i])
            except Exception as e:
                # Reached on the first token, where `new_values` is still empty
                # and `new_values[-1]` raises; any other error lands here too.
                new_values.append(s[i])
        print(new_values)
        sp.append(new_values)

splits, sp

['Ġ', 't', 'h', 'e']
['Ġt', 'h', 'e']
['Ġ', 'c', 'h', 'a', 'p', 't', 'e', 'r']
['Ġ', 'c', 'h', 'a', 'p', 't', 'e', 'r']
['Ġ', 'a', 'b', 'o', 'u', 't']
['Ġ', 'a', 'b', 'o', 'u', 't']
['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n']
['Ġt', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n']
['Ġ', 's', 'e', 'c', 't', 'i', 'o', 'n']
['Ġ', 's', 'e', 'c', 't', 'i', 'o', 'n']
['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r']
['Ġt', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r']
['Ġ', 'a', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's']
['Ġ', 'a', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's']
['Ġ', 't', 'o']
['Ġt', 'o']
['Ġ', 'u', 'n', 'd', 'e', 'r', 's', 't', 'a', 'n', 'd']
['Ġ', 'u', 'n', 'd', 'e', 'r', 's', 't', 'a', 'n', 'd']
['Ġ', 't', 'h', 'e', 'y']
['Ġt', 'h', 'e', 'y']
['Ġ', 't', 'r', 'a', 'i', 'n', 'e', 'd']
['Ġt', 'r', 'a', 'i', 'n', 'e', 'd']
['Ġ', 'g', 'e', 'n', 'e', 'r', 'a', 't', 'e']
['Ġ', 'g', 'e', 'n', 'e', 'r', 'a', 't', 'e']
['Ġ', 't', 'o', 'k', 'e', 'n', 's']
['Ġt

([['T', 'h', 'i', 's'],
  ['T', 'h', 'i', 's'],
  ['T', 'h', 'i', 's'],
  ['Ġ', 'i', 's'],
  ['Ġ', 'i', 's'],
  ['Ġ', 't', 'h', 'e'],
  ['Ġ', 'H', 'u', 'g', 'g', 'i', 'n', 'g'],
  ['Ġ', 'F', 'a', 'c', 'e'],
  ['Ġ', 'C', 'o', 'u', 'r', 's', 'e'],
  ['.'],
  ['.'],
  ['.'],
  ['.'],
  ['Ġ', 'c', 'h', 'a', 'p', 't', 'e', 'r'],
  ['Ġ', 'a', 'b', 'o', 'u', 't'],
  ['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n'],
  ['Ġ', 's', 'e', 'c', 't', 'i', 'o', 'n'],
  ['Ġ', 's', 'h', 'o', 'w', 's'],
  ['Ġ', 's', 'e', 'v', 'e', 'r', 'a', 'l'],
  ['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r'],
  ['Ġ', 'a', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's'],
  ['H', 'o', 'p', 'e', 'f', 'u', 'l', 'l', 'y'],
  [','],
  ['Ġ', 'y', 'o', 'u'],
  ['Ġ', 'w', 'i', 'l', 'l'],
  ['Ġ', 'b', 'e'],
  ['Ġ', 'a', 'b', 'l', 'e'],
  ['Ġ', 't', 'o'],
  ['Ġ', 'u', 'n', 'd', 'e', 'r', 's', 't', 'a', 'n', 'd'],
  ['Ġ', 'h', 'o', 'w'],
  ['Ġ', 't', 'h', 'e', 'y'],
  ['Ġ', 'a', 'r', 'e'],
  ['Ġ', 't', 'r', 'a',

### Putting it together

In [516]:
def pre_tokenization(corpus, tokenizer=None):
    """Count how often each pre-tokenized word occurs in a corpus.

    Args:
        corpus: Iterable of strings (one sentence/document per item).
        tokenizer: Optional tokenizer exposing
            ``backend_tokenizer.pre_tokenizer.pre_tokenize_str``. When omitted,
            the pretrained ``"gpt2"`` tokenizer is loaded, which is what this
            function always did before the parameter existed. Passing one in
            avoids re-loading the tokenizer on every call.

    Returns:
        dict: Maps each word to its number of occurrences, with keys in
        first-seen order.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("gpt2")

    # Look the method up once rather than once per sentence.
    pre_tokenize = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str

    word_freq = {}

    for sent in corpus:
        for item in pre_tokenize(sent):
            # Each item is indexed like (word, offsets); only the word is counted.
            word = item[0]
            word_freq[word] = word_freq.get(word, 0) + 1

    return word_freq

word_freq = pre_tokenization(corpus=corpus)
word_freq

{'This': 3,
 'Ġis': 2,
 'Ġthe': 1,
 'ĠHugging': 1,
 'ĠFace': 1,
 'ĠCourse': 1,
 '.': 4,
 'Ġchapter': 1,
 'Ġabout': 1,
 'Ġtokenization': 1,
 'Ġsection': 1,
 'Ġshows': 1,
 'Ġseveral': 1,
 'Ġtokenizer': 1,
 'Ġalgorithms': 1,
 'Hopefully': 1,
 ',': 1,
 'Ġyou': 1,
 'Ġwill': 1,
 'Ġbe': 1,
 'Ġable': 1,
 'Ġto': 1,
 'Ġunderstand': 1,
 'Ġhow': 1,
 'Ġthey': 1,
 'Ġare': 1,
 'Ġtrained': 1,
 'Ġand': 1,
 'Ġgenerate': 1,
 'Ġtokens': 1}

In [517]:
def initial_vocabulary(word_freq):
    """Build the base vocabulary from the words' characters.

    Args:
        word_freq: Mapping whose keys are the pre-tokenized words.

    Returns:
        list: ``"<|endoftext|>"`` followed by every distinct character found
        in the words, in sorted order.
    """
    distinct_chars = set()
    for token in word_freq.keys():
        # Updating a set with a string adds each of its characters.
        distinct_chars.update(token)

    return ["<|endoftext|>"] + sorted(distinct_chars)

vocab = initial_vocabulary(word_freq)
vocab

['<|endoftext|>',
 ',',
 '.',
 'C',
 'F',
 'H',
 'T',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'y',
 'z',
 'Ġ']

In [518]:
def initial_splits(word_freq):
    """Split every word occurrence into single characters.

    Args:
        word_freq: Mapping of word -> number of occurrences.

    Returns:
        list: One list of characters per occurrence, so a word with count 3
        contributes three separate (equal) character lists, in the mapping's
        iteration order.
    """
    return [
        list(token)
        for token, count in word_freq.items()
        for _ in range(count)
    ]

splits = initial_splits(word_freq)
splits

[['T', 'h', 'i', 's'],
 ['T', 'h', 'i', 's'],
 ['T', 'h', 'i', 's'],
 ['Ġ', 'i', 's'],
 ['Ġ', 'i', 's'],
 ['Ġ', 't', 'h', 'e'],
 ['Ġ', 'H', 'u', 'g', 'g', 'i', 'n', 'g'],
 ['Ġ', 'F', 'a', 'c', 'e'],
 ['Ġ', 'C', 'o', 'u', 'r', 's', 'e'],
 ['.'],
 ['.'],
 ['.'],
 ['.'],
 ['Ġ', 'c', 'h', 'a', 'p', 't', 'e', 'r'],
 ['Ġ', 'a', 'b', 'o', 'u', 't'],
 ['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n'],
 ['Ġ', 's', 'e', 'c', 't', 'i', 'o', 'n'],
 ['Ġ', 's', 'h', 'o', 'w', 's'],
 ['Ġ', 's', 'e', 'v', 'e', 'r', 'a', 'l'],
 ['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r'],
 ['Ġ', 'a', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's'],
 ['H', 'o', 'p', 'e', 'f', 'u', 'l', 'l', 'y'],
 [','],
 ['Ġ', 'y', 'o', 'u'],
 ['Ġ', 'w', 'i', 'l', 'l'],
 ['Ġ', 'b', 'e'],
 ['Ġ', 'a', 'b', 'l', 'e'],
 ['Ġ', 't', 'o'],
 ['Ġ', 'u', 'n', 'd', 'e', 'r', 's', 't', 'a', 'n', 'd'],
 ['Ġ', 'h', 'o', 'w'],
 ['Ġ', 't', 'h', 'e', 'y'],
 ['Ġ', 'a', 'r', 'e'],
 ['Ġ', 't', 'r', 'a', 'i', 'n', 'e', 'd'],
 ['Ġ', 'a',

In [519]:
def get_pair_freq(splits):
    """Count adjacent token pairs across all words.

    Args:
        splits: List of token lists, one per word occurrence.

    Returns:
        dict: Maps each ``(left, right)`` tuple of neighbouring tokens to the
        number of times it occurs, with keys in first-seen order.
    """
    counts = {}

    for tokens in splits:
        # Pair every token with its right-hand neighbour.
        for neighbours in zip(tokens, tokens[1:]):
            counts[neighbours] = counts.get(neighbours, 0) + 1

    return counts

pair_freq = get_pair_freq(splits=splits)
pair_freq

{('T', 'h'): 3,
 ('h', 'i'): 3,
 ('i', 's'): 5,
 ('Ġ', 'i'): 2,
 ('Ġ', 't'): 7,
 ('t', 'h'): 3,
 ('h', 'e'): 2,
 ('Ġ', 'H'): 1,
 ('H', 'u'): 1,
 ('u', 'g'): 1,
 ('g', 'g'): 1,
 ('g', 'i'): 1,
 ('i', 'n'): 2,
 ('n', 'g'): 1,
 ('Ġ', 'F'): 1,
 ('F', 'a'): 1,
 ('a', 'c'): 1,
 ('c', 'e'): 1,
 ('Ġ', 'C'): 1,
 ('C', 'o'): 1,
 ('o', 'u'): 3,
 ('u', 'r'): 1,
 ('r', 's'): 2,
 ('s', 'e'): 3,
 ('Ġ', 'c'): 1,
 ('c', 'h'): 1,
 ('h', 'a'): 1,
 ('a', 'p'): 1,
 ('p', 't'): 1,
 ('t', 'e'): 2,
 ('e', 'r'): 5,
 ('Ġ', 'a'): 5,
 ('a', 'b'): 2,
 ('b', 'o'): 1,
 ('u', 't'): 1,
 ('t', 'o'): 4,
 ('o', 'k'): 3,
 ('k', 'e'): 3,
 ('e', 'n'): 4,
 ('n', 'i'): 2,
 ('i', 'z'): 2,
 ('z', 'a'): 1,
 ('a', 't'): 2,
 ('t', 'i'): 2,
 ('i', 'o'): 2,
 ('o', 'n'): 2,
 ('Ġ', 's'): 3,
 ('e', 'c'): 1,
 ('c', 't'): 1,
 ('s', 'h'): 1,
 ('h', 'o'): 2,
 ('o', 'w'): 2,
 ('w', 's'): 1,
 ('e', 'v'): 1,
 ('v', 'e'): 1,
 ('r', 'a'): 3,
 ('a', 'l'): 2,
 ('z', 'e'): 1,
 ('l', 'g'): 1,
 ('g', 'o'): 1,
 ('o', 'r'): 1,
 ('r', 'i'): 1,
 ('i', '

In [520]:
def getting_max_counts(pair_freq):
    """Return the highest pair count and the pair that has it.

    Args:
        pair_freq: Mapping of pair -> count.

    Returns:
        tuple: ``(count, pair)``. When several pairs share the highest count,
        the one that comes last in iteration order is returned. For an empty
        mapping the result is ``(0, "")``.
    """
    best_count, best_pair = 0, ""

    for pair, count in pair_freq.items():
        if count < best_count:
            continue
        # Equal counts also replace the current best, so later pairs win ties.
        best_count, best_pair = count, pair

    return best_count, best_pair

max_counts, max_counts_str = getting_max_counts(pair_freq=pair_freq)

max_counts_str, max_counts

(('Ġ', 't'), 7)

In [521]:
def updating_vocab(vocab, max_counts_str):
    """Append the token formed by joining a pair to the vocabulary.

    Args:
        vocab: List of tokens; it is extended in place.
        max_counts_str: The pair of token strings to join.

    Returns:
        list: The same ``vocab`` object, now ending with the joined token.
    """
    vocab.append("".join(max_counts_str))
    return vocab

vocab = updating_vocab(vocab=vocab, max_counts_str=max_counts_str)
vocab

['<|endoftext|>',
 ',',
 '.',
 'C',
 'F',
 'H',
 'T',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'y',
 'z',
 'Ġ',
 'Ġt']

In [522]:
def updating_splits(splits, max_counts_str, vocab):
    """Apply one merge to every word: replace each adjacent occurrence of the
    pair with the single joined token.

    Args:
        splits: List of token lists, one per word occurrence.
        max_counts_str: The ``(left, right)`` pair to merge.
        vocab: Accepted for backward compatibility and not read. The merged
            token is derived from ``max_counts_str`` itself, so the result no
            longer depends on the merged token being the last vocab entry.

    Returns:
        list: A new list of token lists. Words that cannot contain the pair
        are passed through as the same list objects; affected words are
        rebuilt. If ``max_counts_str`` is not a 2-item pair (e.g. the empty
        string returned when there are no pairs), a shallow copy of ``splits``
        is returned unchanged.
    """
    if len(max_counts_str) != 2:
        # Nothing to merge; previously this raised IndexError.
        return list(splits)

    left, right = max_counts_str
    merged_token = left + right

    new_splits = []
    for s in splits:
        if left not in s or right not in s:
            new_splits.append(s)
            continue

        new_values = []
        i = 0
        while i < len(s):
            # Compare the actual neighbouring tokens with the pair. Comparing
            # concatenated strings would also merge unrelated neighbours that
            # happen to spell the same text (e.g. 'ab'+'c' for pair ('a','bc')).
            if i + 1 < len(s) and s[i] == left and s[i + 1] == right:
                new_values.append(merged_token)
                i += 2
            else:
                new_values.append(s[i])
                i += 1
        new_splits.append(new_values)

    return new_splits

splits = updating_splits(splits=splits, max_counts_str=max_counts_str, vocab=vocab)
splits

[['T', 'h', 'i', 's'],
 ['T', 'h', 'i', 's'],
 ['T', 'h', 'i', 's'],
 ['Ġ', 'i', 's'],
 ['Ġ', 'i', 's'],
 ['Ġt', 'h', 'e'],
 ['Ġ', 'H', 'u', 'g', 'g', 'i', 'n', 'g'],
 ['Ġ', 'F', 'a', 'c', 'e'],
 ['Ġ', 'C', 'o', 'u', 'r', 's', 'e'],
 ['.'],
 ['.'],
 ['.'],
 ['.'],
 ['Ġ', 'c', 'h', 'a', 'p', 't', 'e', 'r'],
 ['Ġ', 'a', 'b', 'o', 'u', 't'],
 ['Ġt', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n'],
 ['Ġ', 's', 'e', 'c', 't', 'i', 'o', 'n'],
 ['Ġ', 's', 'h', 'o', 'w', 's'],
 ['Ġ', 's', 'e', 'v', 'e', 'r', 'a', 'l'],
 ['Ġt', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r'],
 ['Ġ', 'a', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's'],
 ['H', 'o', 'p', 'e', 'f', 'u', 'l', 'l', 'y'],
 [','],
 ['Ġ', 'y', 'o', 'u'],
 ['Ġ', 'w', 'i', 'l', 'l'],
 ['Ġ', 'b', 'e'],
 ['Ġ', 'a', 'b', 'l', 'e'],
 ['Ġt', 'o'],
 ['Ġ', 'u', 'n', 'd', 'e', 'r', 's', 't', 'a', 'n', 'd'],
 ['Ġ', 'h', 'o', 'w'],
 ['Ġt', 'h', 'e', 'y'],
 ['Ġ', 'a', 'r', 'e'],
 ['Ġt', 'r', 'a', 'i', 'n', 'e', 'd'],
 ['Ġ', 'a', 'n', 'd'],
 ['Ġ', 'g', 

## Training on the whole corpus

In [523]:
ds['train']['text'][:100]

['= Valkyria Chronicles III =\nSenjō no Valkyria 3: Unrecorded Chronicles (Japanese: 戦場のヴァルキュリア3, lit. Valkyria of the Battlefield 3), commonly referred to as Valkyria Chronicles III outside Japan, is a tactical role-playing video game developed by Sega and Media.Vision for the PlayStation Portable. Released in January 2011 in Japan, it is the third game in the Valkyria series. Employing the same fusion of tactical and real-time gameplay as its predecessors, the story runs parallel to the first game and follows the "Nameless", a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit "Calamaty Raven".\nThe game began development in 2010, carrying over a large portion of the work done on Valkyria Chronicles II. While it retained the standard features of the series, it also underwent multiple adjustments, such as making the game more forgiving for series newcomers. Character designer Raita

In [524]:
def bpe_tokenizer(vocab_size, corpus):
    """Learn BPE merges from a corpus.

    Args:
        vocab_size: Number of merge steps to run. NOTE: despite its name this
            counts merges, not total vocabulary entries; the returned
            vocabulary holds the base vocabulary plus one token per merge
            performed.
        corpus: Iterable of strings to train on.

    Returns:
        tuple: ``(vocab, splits, merges)`` where ``vocab`` is the list of
        tokens, ``splits`` is the final token list of every word occurrence
        and ``merges`` is the list of merged pairs in the order they were
        learned.
    """
    merges = []
    word_freq = pre_tokenization(corpus=corpus)
    vocab = initial_vocabulary(word_freq)
    splits = initial_splits(word_freq)

    size = 0
    while size<vocab_size:
        # Progress marker every 100 merges.
        if size%100==0:
            print(f"==={size}===")
        pair_freq = get_pair_freq(splits=splits)
        if not pair_freq:
            # Every word is already a single token, so there is nothing left
            # to merge. Continuing would pass an empty pair to the merge step.
            break
        max_counts, max_counts_str = getting_max_counts(pair_freq=pair_freq)
        merges.append(max_counts_str)
        vocab = updating_vocab(vocab=vocab, max_counts_str=max_counts_str)
        splits = updating_splits(splits=splits, max_counts_str=max_counts_str, vocab=vocab)
        size+=1
    return vocab, splits, merges

vocab, splits, merges = bpe_tokenizer(vocab_size=50, corpus=corpus)
vocab, splits, merges

===0===


(['<|endoftext|>',
  ',',
  '.',
  'C',
  'F',
  'H',
  'T',
  'a',
  'b',
  'c',
  'd',
  'e',
  'f',
  'g',
  'h',
  'i',
  'k',
  'l',
  'm',
  'n',
  'o',
  'p',
  'r',
  's',
  't',
  'u',
  'v',
  'w',
  'y',
  'z',
  'Ġ',
  'Ġt',
  'Ġa',
  'er',
  'is',
  'en',
  'Ġto',
  'nd',
  'Ġs',
  'ken',
  'Ġtoken',
  'ou',
  'his',
  'This',
  'll',
  'era',
  'ow',
  'how',
  'Ġse',
  'on',
  'ion',
  'tion',
  'iz',
  'Ġtokeniz',
  'Ġab',
  'in',
  'he',
  'Ġthe',
  'Ġis',
  'Ġtokens',
  'te',
  'erate',
  'enerate',
  'generate',
  'Ġgenerate',
  'Ġand',
  'ed',
  'ined',
  'ained',
  'rained',
  'Ġtrained',
  're',
  'Ġare',
  'Ġthey',
  'Ġhow',
  'and',
  'tand',
  'stand',
  'erstand',
  'nderstand',
  'understand'],
 [['This'],
  ['This'],
  ['This'],
  ['Ġis'],
  ['Ġis'],
  ['Ġthe'],
  ['Ġ', 'H', 'u', 'g', 'g', 'in', 'g'],
  ['Ġ', 'F', 'a', 'c', 'e'],
  ['Ġ', 'C', 'ou', 'r', 's', 'e'],
  ['.'],
  ['.'],
  ['.'],
  ['.'],
  ['Ġ', 'c', 'h', 'a', 'p', 't', 'er'],
  ['Ġab', 'ou', 't'

In [525]:
vocab, splits, merges = bpe_tokenizer(vocab_size=1000, corpus=ds['train']['text'][:100])
vocab, merges

===0===
===100===
===200===
===300===
===400===
===500===
===600===
===700===
===800===
===900===


(['<|endoftext|>',
  '!',
  '"',
  '#',
  '$',
  '%',
  '&',
  "'",
  '(',
  ')',
  '*',
  '+',
  ',',
  '-',
  '.',
  '/',
  '0',
  '1',
  '2',
  '3',
  '4',
  '5',
  '6',
  '7',
  '8',
  '9',
  ':',
  ';',
  '<',
  '=',
  '>',
  '?',
  '@',
  'A',
  'B',
  'C',
  'D',
  'E',
  'F',
  'G',
  'H',
  'I',
  'J',
  'K',
  'L',
  'M',
  'N',
  'O',
  'P',
  'Q',
  'R',
  'S',
  'T',
  'U',
  'V',
  'W',
  'X',
  'Y',
  'Z',
  '[',
  ']',
  '^',
  '`',
  'a',
  'b',
  'c',
  'd',
  'e',
  'f',
  'g',
  'h',
  'i',
  'j',
  'k',
  'l',
  'm',
  'n',
  'o',
  'p',
  'q',
  'r',
  's',
  't',
  'u',
  'v',
  'w',
  'x',
  'y',
  'z',
  '~',
  '¡',
  '¢',
  '£',
  '¤',
  '¥',
  '¦',
  '§',
  '¨',
  '©',
  'ª',
  '«',
  '®',
  '¯',
  '°',
  '±',
  '²',
  '³',
  '´',
  'µ',
  '¶',
  '·',
  '¸',
  '¹',
  'º',
  '»',
  '¼',
  '½',
  '¾',
  '¿',
  'Â',
  'Ã',
  'Ä',
  'Å',
  'Ç',
  'È',
  'É',
  'Ë',
  'Î',
  'Ï',
  'Ð',
  'Ñ',
  '×',
  'Ø',
  'Ù',
  'à',
  'á',
  'â',
  'ã',
  'ä',
  'å',
  'æ',
 

## Training on an English–Hindi dataset

In [526]:
import datasets

ds = datasets.load_dataset("cfilt/iitb-english-hindi")
ds

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [528]:
# Slice the first N_TRAIN_SAMPLES rows *before* extracting the sentences, so
# only those rows are read instead of building two full-length lists over the
# entire train split and discarding almost all of them.
N_TRAIN_SAMPLES = 50000
train_pairs = ds['train'][:N_TRAIN_SAMPLES]['translation']

# English sentences first, followed by the Hindi side of the same rows.
train_data = [pair['en'] for pair in train_pairs] + \
             [pair['hi'] for pair in train_pairs]
train_data

['Give your application an accessibility workout',
 'Accerciser Accessibility Explorer',
 'The default plugin layout for the bottom panel',
 'The default plugin layout for the top panel',
 'A list of plugins that are disabled by default',
 'Highlight duration',
 'The duration of the highlight box when selecting accessible nodes',
 'Highlight border color',
 'The color and opacity of the highlight border.',
 'Highlight fill color',
 'The color and opacity of the highlight fill.',
 'API Browser',
 'Browse the various methods of the current accessible',
 'Hide private attributes',
 'Method',
 'Property',
 'Value',
 'IPython Console',
 'Interactive console for manipulating currently selected accessible',
 'Event monitor',
 '_ Monitor Events',
 'C _ lear Selection',
 'Everything',
 'Selected application',
 'Selected accessible',
 'Source',
 'Event Monitor',
 'Shows events as they occur from selected types and sources',
 'Highlight last event entry',
 'Start / stop event recording',
 'Clear 

In [529]:
len(train_data)

100000

In [530]:
vocab, splits, merges = bpe_tokenizer(vocab_size=7500, corpus=train_data)
vocab, merges

===0===
===100===
===200===
===300===
===400===
===500===
===600===
===700===
===800===
===900===
===1000===
===1100===
===1200===
===1300===
===1400===
===1500===
===1600===
===1700===
===1800===
===1900===
===2000===
===2100===
===2200===
===2300===
===2400===
===2500===
===2600===
===2700===
===2800===
===2900===
===3000===
===3100===
===3200===
===3300===
===3400===
===3500===
===3600===
===3700===
===3800===
===3900===
===4000===
===4100===
===4200===
===4300===
===4400===
===4500===
===4600===
===4700===
===4800===
===4900===
===5000===
===5100===
===5200===
===5300===
===5400===
===5500===
===5600===
===5700===
===5800===
===5900===
===6000===
===6100===
===6200===
===6300===
===6400===
===6500===
===6600===
===6700===
===6800===
===6900===
===7000===
===7100===
===7200===
===7300===
===7400===


(['<|endoftext|>',
  '!',
  '"',
  '#',
  '$',
  '%',
  '&',
  "'",
  '(',
  ')',
  '*',
  '+',
  ',',
  '-',
  '.',
  '/',
  '0',
  '1',
  '2',
  '3',
  '4',
  '5',
  '6',
  '7',
  '8',
  '9',
  ':',
  ';',
  '<',
  '=',
  '>',
  '?',
  '@',
  'A',
  'B',
  'C',
  'D',
  'E',
  'F',
  'G',
  'H',
  'I',
  'J',
  'K',
  'L',
  'M',
  'N',
  'O',
  'P',
  'Q',
  'R',
  'S',
  'T',
  'U',
  'V',
  'W',
  'X',
  'Y',
  'Z',
  '[',
  '\\',
  ']',
  '^',
  '_',
  '`',
  'a',
  'b',
  'c',
  'd',
  'e',
  'f',
  'g',
  'h',
  'i',
  'j',
  'k',
  'l',
  'm',
  'n',
  'o',
  'p',
  'q',
  'r',
  's',
  't',
  'u',
  'v',
  'w',
  'x',
  'y',
  'z',
  '{',
  '}',
  '~',
  '¡',
  '¢',
  '£',
  '¤',
  '¥',
  '¦',
  '§',
  '¨',
  '©',
  'ª',
  '«',
  '¬',
  '®',
  '¯',
  '°',
  '²',
  'µ',
  '¶',
  '·',
  '¸',
  '¹',
  '¼',
  '¾',
  '¿',
  'Â',
  'Ã',
  'à',
  'â',
  'Ġ',
  'Ģ',
  'ģ',
  'Ĥ',
  'ĥ',
  'Ħ',
  'ħ',
  'Ĩ',
  'ĩ',
  'Ī',
  'ī',
  'Ĭ',
  'ĭ',
  'Į',
  'į',
  'ı',
  'Ĳ',
  'ĳ',
  'ĵ',


### Tokenizing new text with the learned merges

In [538]:
ht = [i['hi'] for i in ds['test']['translation']]
len(ht), ht

(2507,
 ['आपकी कार में ब्लैक बॉक्स?',
  'जबकि अमेरिका के सड़क योजनाकार, ध्वस्त होते हुए हाईवे सिस्टम को सुधारने के लिए धन की कमी से जूझ रहे हैं, वहीं बहुत-से लोग इसका समाधान छोटे से ब्लैक बॉक्स में देख रहे हैं, जो आपकी कार के डैशबोर्ड पर सफ़ाई से फिट हो जाता है।',
  'यह डिवाइस, जो मोटर-चालक द्वारा वाहन चलाए गए प्रत्येक मील को ट्रैक करती है तथा उस सूचना को अधिकारियों को संचारित करती है, आजकल अमेरिका की प्रमुख सड़कों का वित्त-पोषण करने के लिए पुराने हो चुके सिस्टम का जीर्णोद्धार करने के लिए वाशिंगटन और राज्य नियोजन कार्यालय के लिए एक विवादास्पद प्रयास का मुद्दा बन चुका है।',
  'आम तौर पर हाईवे नियोजन जैसा उबाऊ काम भी अचानक गहन बहस तथा जीवंत गठबंधनों का मुद्दा बन गया है।',
  'आपने द्वारा ड्राइव किए गए मील, तथा संभवतः ड्राइव किए गए स्थान का विवरण रखने - और फिर इस सूचना का उपयोग टैक्स बिल तैयार करने के लिए - सरकार को इन ब्लैक बॉक्स का उपयोग करने की अनुमति देने के पक्ष में समर्थन जुटाने के लिए लिबरेटेरियन पर्यावरणीय समूहों के साथ मिल गए हैं।',
  'चाय पार्टी भौचक्की है।',
  'अमेरिकी नागरिक स्

In [544]:
word_freq = pre_tokenization(corpus=ht)
vocab2 = initial_vocabulary(word_freq)
splits = initial_splits(word_freq)
splits

[['à', '¤', 'Ĩ', 'à', '¤', 'ª', 'à', '¤', 'ķ'],
 ['à', '¤', 'Ĩ', 'à', '¤', 'ª', 'à', '¤', 'ķ'],
 ['à', '¤', 'Ĩ', 'à', '¤', 'ª', 'à', '¤', 'ķ'],
 ['à', '¤', 'Ĩ', 'à', '¤', 'ª', 'à', '¤', 'ķ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ'],
 ['à', '¥', 'Ģ']

In [None]:
# Rank of each learned merge: a lower rank means the pair was learned earlier.
# A dict gives constant-time lookups; `pair in merges` scans the whole list for
# every adjacent pair of every word.
merge_rank = {}
for rank, pair in enumerate(merges):
    merge_rank.setdefault(pair, rank)

# Tokenized form of every word in `splits`, in the same order.
ms = []

for x in splits:
    tokens = list(x)  # work on a copy so `splits` is left untouched
    while len(tokens) > 1:
        # Adjacent pairs that match a learned merge, as (rank, position).
        candidates = [
            (merge_rank[sp], pos)
            for pos, sp in enumerate(zip(tokens, tokens[1:]))
            if sp in merge_rank
        ]
        if not candidates:
            break
        # Apply the earliest-learned merge first (leftmost occurrence on ties),
        # then re-scan, because the merge creates new neighbouring pairs.
        _, i = min(candidates)
        tokens[i:i+2] = [tokens[i] + tokens[i+1]]
    ms.append(tokens)

('à', '¤')
('¤', 'Ĩ')
('Ĩ', 'à')
('à', '¤')
('¤', 'ª')
('ª', 'à')
('à', '¤')
('¤', 'ķ')
('à', '¤')
('¤', 'Ĩ')
('Ĩ', 'à')
('à', '¤')
('¤', 'ª')
('ª', 'à')
('à', '¤')
('¤', 'ķ')
('à', '¤')
('¤', 'Ĩ')
('Ĩ', 'à')
('à', '¤')
('¤', 'ª')
('ª', 'à')
('à', '¤')
('¤', 'ķ')
('à', '¤')
('¤', 'Ĩ')
('Ĩ', 'à')
('à', '¤')
('¤', 'ª')
('ª', 'à')
('à', '¤')
('¤', 'ķ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')
('¥', 'Ģ')
('à', '¥')

KeyboardInterrupt: 