In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# The corpus contains non-ASCII text (e.g. Japanese titles), so decode it
# explicitly as UTF-8 instead of relying on the platform's default encoding.
with open('/content/drive/MyDrive/wikitext-2/wikitext-2/wiki.train.tokens', 'r', encoding='utf-8') as file:
    content = file.read()

In [3]:
import re
from collections import defaultdict

In [4]:

def get_stats(vocab):
    """
    Count adjacent symbol pairs across a vocabulary.

    Each key of `vocab` is a word written as whitespace-separated symbols and
    each value is that word's frequency. The returned defaultdict maps every
    (left, right) tuple of neighbouring symbols to the summed frequency of the
    words it occurs in, counted once per occurrence.
    """
    pair_counts = defaultdict(int)
    for entry, count in vocab.items():
        tokens = entry.split()
        # Pair each symbol with its right-hand neighbour.
        for left, right in zip(tokens, tokens[1:]):
            pair_counts[left, right] += count
    return pair_counts

In [5]:
def merge_vocab(pair, v_in):
    """
    Given a pair of symbols and a vocabulary, returns a new vocabulary with the
    pair merged into a single symbol wherever the two appear next to each other.

    pair: 2-tuple of symbols to merge.
    v_in: dict mapping words (symbols separated by single spaces) to frequencies.
    Returns a new dict with the same frequencies; v_in is left unmodified.
    """
    merged = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # The lookarounds make the match start and end on symbol boundaries, so a
    # pair is never matched inside a longer symbol.
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')

    # A callable replacement inserts `merged` literally. A plain replacement
    # string would be parsed for backslash escapes, so a symbol containing
    # '\' would be corrupted (e.g. '\a' -> BEL) or raise "bad escape".
    def literal(_match):
        return merged

    return {p.sub(literal, word): freq for word, freq in v_in.items()}

In [6]:
def get_vocab(data):
    """
    Build the initial character-level vocabulary from an iterable of strings.

    Every whitespace-separated word found in `data` is rewritten with a space
    between each of its characters, and the returned defaultdict maps that
    spaced-out form to the number of times the word occurs.
    """
    counts = defaultdict(int)
    for word in (w for line in data for w in line.split()):
        # Joining a string directly iterates over its characters.
        counts[' '.join(word)] += 1
    return counts

In [7]:
def byte_pair_encoding(data, n):
    """
    Given a list of strings and an integer n, performs up to n BPE merges and
    returns the resulting vocabulary.

    data: iterable of strings; each is split on whitespace into words.
    n: maximum number of merge iterations.
    Returns a dict mapping each word, written as space-separated symbols after
    merging, to its frequency. Note that the merged vocabulary is returned,
    not the list of merged pairs.
    """
    vocab = get_vocab(data)
    for _ in range(n):
        pairs = get_stats(vocab)
        if not pairs:
            # Every word is already a single symbol: nothing is left to merge,
            # and max() on an empty mapping would raise ValueError.
            break
        # On ties max() keeps the first pair encountered.
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
    return vocab

In [8]:
# Split the corpus on whitespace into word tokens; get_vocab splits each
# element again, which is a no-op for single words.
data = content.split()

# Number of merge iterations to run.
n = 1000
# Despite the name, this holds the merged vocabulary dict that
# byte_pair_encoding returns, not a list of merged pairs.
bpe_pairs = byte_pair_encoding(data, n)

Hugging Face


In [9]:
import nltk

# Download the Brown Corpus data
nltk.download('brown')

# Load the Brown Corpus
from nltk.corpus import brown

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [10]:
news_paragraphs = brown.paras(categories='news')

In [11]:
# Turn each paragraph into one string: join the items of every sentence with
# spaces, then join the resulting sentence strings with spaces.
formatted_paragraphs = [
    ' '.join(map(' '.join, paragraph)) for paragraph in news_paragraphs
]

In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [13]:
#compute the frequencies of each word in the corpus as we do the pre-tokenization

from collections import defaultdict

word_freqs = defaultdict(int)

# Look the pre-tokenizer up once instead of on every paragraph.
pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer

for text in formatted_paragraphs:
    # pre_tokenize_str yields (word, offsets) pairs; only the word is counted.
    for word, _offsets in pre_tokenizer.pre_tokenize_str(text):
        word_freqs[word] += 1

# Show a short sample rather than dumping the whole frequency table.
print(f"{len(word_freqs)} distinct pre-tokens; first 10: {list(word_freqs.items())[:10]}")



In [14]:
#compute the base vocabulary, formed by all the characters used in the corpus

# Collect the distinct characters in a set (constant-time membership) instead
# of scanning a growing list for every letter, then sort them into a list.
alphabet = sorted({letter for word in word_freqs for letter in word})

print(alphabet)

['!', '$', '%', '&', "'", '(', ')', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Ġ']


In [15]:
# The vocabulary starts with the model's special token (for GPT-2), followed
# by every character of the alphabet, in a new list independent of `alphabet`.
vocab = ["<|endoftext|>", *alphabet]

In [16]:
# Training starts from each word broken into its individual characters.
splits = {word: list(word) for word in word_freqs}

In [17]:
#computes the frequency of each pair

def compute_pair_freqs(splits, freqs=None):
    """
    Count how often each pair of adjacent symbols occurs in the corpus.

    splits: dict mapping each word to its current list of symbols.
    freqs: optional dict mapping each word to its corpus frequency. When
        omitted, the notebook-level `word_freqs` table is used, which keeps
        existing one-argument calls working.
    Returns a defaultdict mapping (left, right) symbol tuples to the summed
    frequency of the words containing that adjacent pair (once per occurrence).
    Raises KeyError if a word in the frequency table has no entry in `splits`.
    """
    if freqs is None:
        freqs = word_freqs
    pair_freqs = defaultdict(int)
    for word, freq in freqs.items():
        symbols = splits[word]
        # A single-symbol word has no neighbours, so zip yields nothing for it.
        for pair in zip(symbols, symbols[1:]):
            pair_freqs[pair] += freq
    return pair_freqs

In [18]:
#part of this dictionary after the initial splits

pair_freqs = compute_pair_freqs(splits)

# Preview the first six entries in insertion order.
for key in list(pair_freqs)[:6]:
    print(f"{key}: {pair_freqs[key]}")

('T', 'h'): 1189
('h', 'e'): 9767
('Ġ', 'F'): 510
('F', 'u'): 41
('u', 'l'): 1017
('l', 't'): 351


In [19]:
# Most frequent pair: on ties the earliest pair wins, and an empty table
# leaves the "" / None placeholders in place.
best_pair, max_freq = max(
    pair_freqs.items(),
    key=lambda item: item[1],
    default=("", None),
)

print(best_pair, max_freq)

('Ġ', 't') 12102


In [20]:
# The first merge to learn is ('Ġ', 't') -> 'Ġt', the most frequent pair
# printed by the previous cell. Record the rule in `merges` (pair -> merged
# token) and add the new token 'Ġt' to the vocabulary.
# NOTE(review): vocab.append is not idempotent; re-running this cell appends
# 'Ġt' a second time.
merges = {("Ġ", "t"): "Ġt"}
vocab.append("Ġt")

In [21]:
def merge_pair(a, b, splits):
    """
    Merge every adjacent occurrence of symbol `a` followed by `b` into `a + b`.

    a, b: the two symbols to merge.
    splits: dict mapping each word to its list of symbols. It is updated in
        place, and the same dict is returned for convenience.

    Words are taken from `splits` itself rather than from a notebook-level
    table, so the function works on any splits dict passed in. Each word is
    scanned left to right and matches do not overlap, e.g. merging ('a', 'a')
    in ['a', 'a', 'a'] gives ['aa', 'a'].
    """
    merged = a + b
    # Only values are reassigned, so iterating the dict while updating is safe.
    for word, symbols in splits.items():
        if len(symbols) < 2:
            continue
        # Build the new list in one pass instead of re-slicing on every match.
        out = []
        i = 0
        last = len(symbols) - 1
        while i <= last:
            if i < last and symbols[i] == a and symbols[i + 1] == b:
                out.append(merged)
                i += 2
            else:
                out.append(symbols[i])
                i += 1
        splits[word] = out
    return splits

In [22]:
# Result of the first merge: apply ('Ġ', 't') to every word's split and
# inspect one example. merge_pair updates `splits` in place and also returns it.
splits = merge_pair("Ġ", "t", splits)
print(splits["Ġtrained"])

['Ġt', 'r', 'a', 'i', 'n', 'e', 'd']


In [31]:
# Keep learning merges until the vocabulary reaches the target size.
vocab_size = 1000

while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    if not pair_freqs:
        # Every word is already a single token, so no merge can be learned.
        # Stop here instead of calling merge_pair with no pair to unpack.
        break
    # max() keeps the first pair with the highest count, matching a manual
    # scan that only replaces the best pair on a strictly greater frequency.
    best_pair = max(pair_freqs, key=pair_freqs.get)
    max_freq = pair_freqs[best_pair]
    splits = merge_pair(*best_pair, splits)
    merged_token = best_pair[0] + best_pair[1]
    merges[best_pair] = merged_token
    vocab.append(merged_token)

In [32]:
print(merges)

{('Ġ', 't'): 'Ġt', ('h', 'e'): 'he', ('Ġ', 'a'): 'Ġa', ('i', 'n'): 'in', ('Ġt', 'he'): 'Ġthe', ('e', 'r'): 'er', ('o', 'n'): 'on', ('Ġ', ','): 'Ġ,', ('r', 'e'): 're', ('Ġ', 's'): 'Ġs', ('Ġ', 'w'): 'Ġw', ('Ġ', 'o'): 'Ġo', ('e', 'n'): 'en', ('Ġ', '.'): 'Ġ.', ('a', 't'): 'at', ('e', 'd'): 'ed', ('o', 'r'): 'or', ('Ġ', 'c'): 'Ġc', ('Ġ', 'b'): 'Ġb', ('a', 'n'): 'an', ('Ġ', 'f'): 'Ġf', ('i', 's'): 'is', ('i', 't'): 'it', ('e', 's'): 'es', ('a', 'r'): 'ar', ('Ġo', 'f'): 'Ġof', ('Ġ', 'p'): 'Ġp', ('Ġa', 'n'): 'Ġan', ('Ġ', 'in'): 'Ġin', ('a', 'l'): 'al', ('Ġt', 'o'): 'Ġto', ('o', 'u'): 'ou', ('in', 'g'): 'ing', ('a', 's'): 'as', ('Ġ', 'h'): 'Ġh', ('Ġ', 'm'): 'Ġm', ('Ġan', 'd'): 'Ġand', ('i', 'c'): 'ic', ('i', 'on'): 'ion', ('Ġ', 'd'): 'Ġd', ('i', 'l'): 'il', ('o', 'm'): 'om', ('l', 'e'): 'le', ('en', 't'): 'ent', ('Ġt', 'h'): 'Ġth', ('i', 'd'): 'id', ('Ġ', 'n'): 'Ġn', ('r', 'o'): 'ro', ('Ġ', 'S'): 'ĠS', ('Ġ', 'M'): 'ĠM', ('a', 'y'): 'ay', ('e', 'l'): 'el', ('Ġ', 'l'): 'Ġl', ('Ġ', 'C'): 'ĠC', ('s

In [33]:
print(vocab)

['<|endoftext|>', '!', '$', '%', '&', "'", '(', ')', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Ġ', 'Ġt', 'he', 'Ġa', 'in', 'Ġthe', 'er', 'on', 'Ġ,', 're', 'Ġs', 'Ġw', 'Ġo', 'en', 'Ġ.', 'at', 'ed', 'or', 'Ġc', 'Ġb', 'an', 'Ġf', 'is', 'it', 'es', 'ar', 'Ġof', 'Ġp', 'Ġan', 'Ġin', 'al', 'Ġto', 'ou', 'ing', 'as', 'Ġh', 'Ġm', 'Ġand', 'ic', 'ion', 'Ġd', 'il', 'om', 'le', 'ent', 'Ġth', 'id', 'Ġn', 'ro', 'ĠS', 'ĠM', 'ay', 'el', 'Ġl', 'ĠC', 'st', 'Ġre', 'Ġbe', 'Ġg', 'Ġe', 'ol', 'ad', 'ac', 'Ġfor', 'ĠT', 'ut', 'ĠA', 'Ġon', 'et', 'ot', 'un', 'ur', 'am', 'Ġhe', 'ers', 've', 'ir', 'ly', 'ation', 'ig', 'ĠB', 'Ġst', 'ill', 'ĠH', 'Ġthat', 'ec', 'im', 'Ġis', 'ow', 'ĠP', 'Ġat', "Ġ'", '``', 'Ġwas', 'te

In [34]:
#To tokenize a new text, we pre-tokenize it, split it, then apply all the merge rules learned
def tokenize(text):
    """
    Tokenize `text` with the learned merge rules.

    The text is pre-tokenized with the notebook-level `tokenizer`, each
    pre-token is split into characters, and every rule in the notebook-level
    `merges` dict is applied in insertion order.

    text: the string to tokenize.
    Returns a flat list of token strings.
    """
    # Use the public backend_tokenizer accessor, as the training cells do,
    # rather than the private _tokenizer attribute.
    pre_tokenize_result = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    splits = [list(word) for word, _offsets in pre_tokenize_result]
    for pair, merge in merges.items():
        for idx, split in enumerate(splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    # Replace the two symbols with the merged token and stay at
                    # the same index; the merged token cannot match again.
                    split = split[:i] + [merge] + split[i + 2:]
                else:
                    i += 1
            splits[idx] = split

    # Flatten in a single pass; sum(splits, []) would copy the accumulated
    # list once per word.
    return [token for split in splits for token in split]

In [38]:
content[:2000]

' \n = Valkyria Chronicles III = \n \n Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . <unk> the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . \n The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more <unk> for series newcomers .

In [36]:
byte_pairs = tokenize(content[:2000])

In [37]:
byte_pairs

['Ġ',
 'Ċ',
 'Ġ',
 '=',
 'ĠV',
 'alk',
 'y',
 'ri',
 'a',
 'ĠCh',
 'r',
 'on',
 'ic',
 'les',
 'ĠI',
 'I',
 'I',
 'Ġ',
 '=',
 'Ġ',
 'Ċ',
 'Ġ',
 'Ċ',
 'ĠSen',
 'j',
 'Å',
 'į',
 'Ġno',
 'ĠV',
 'alk',
 'y',
 'ri',
 'a',
 'Ġ3',
 'Ġ:',
 'Ġ',
 '<',
 'un',
 'k',
 '>',
 'ĠCh',
 'r',
 'on',
 'ic',
 'les',
 'Ġ(',
 'ĠJ',
 'ap',
 'an',
 'es',
 'e',
 'Ġ:',
 'Ġ',
 'æ',
 'Ī',
 '¦',
 'å',
 'ł',
 '´',
 'ã',
 'ģ',
 '®',
 'ã',
 'ĥ',
 '´',
 'ã',
 'Ĥ',
 '¡',
 'ã',
 'ĥ',
 '«',
 'ã',
 'Ĥ',
 'Ń',
 'ã',
 'ĥ',
 '¥',
 'ã',
 'ĥ',
 'ª',
 'ã',
 'Ĥ',
 '¢',
 '3',
 'Ġ,',
 'Ġl',
 'it',
 'Ġ.',
 'ĠV',
 'alk',
 'y',
 'ri',
 'a',
 'Ġof',
 'Ġthe',
 'ĠB',
 'att',
 'le',
 'f',
 'ield',
 'Ġ3',
 'Ġ)',
 'Ġ,',
 'Ġcomm',
 'on',
 'ly',
 'Ġre',
 'fer',
 'red',
 'Ġto',
 'Ġas',
 'ĠV',
 'alk',
 'y',
 'ri',
 'a',
 'ĠCh',
 'r',
 'on',
 'ic',
 'les',
 'ĠI',
 'I',
 'I',
 'Ġout',
 's',
 'ide',
 'ĠJ',
 'ap',
 'an',
 'Ġ,',
 'Ġis',
 'Ġa',
 'Ġt',
 'act',
 'ical',
 'Ġro',
 'le',
 'Ġ',
 '@',
 '-',
 '@',
 'Ġplay',
 'ing',
 'Ġv',
 'ide',
 'o',
 'Ġ