## LLM — BPE tokenizer training and corpus tokenization

In [88]:
import matplotlib.pyplot as plt
import random
from IPython.display import clear_output
# NOTE: `TokenizerBPE` and `pair_freq` are also defined further down in this
# notebook; once those cells run, the notebook definitions shadow these imports.
from src.tokenizer import TokenizerChar, TokenizerBPE, word_split, normalize_to_ascii, pair_freq

import re
import os
import time
# Set before TensorFlow is imported: restricts the CUDA devices visible to
# this process to device 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import tensorflow as tf
import numpy as np
import pickle as pkl
from tqdm.notebook import tqdm
# NOTE(review): wildcard import — the names it brings in cannot be seen from
# this notebook; prefer explicit names once it is known which ones are used.
from src.transformer import *
from src.data_handling import read_first_n, sample_batch


# Sanity check that TensorFlow can see a GPU.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  1


In [89]:
# Load the first 10000 items of the stories corpus (the exact unit is defined
# by `read_first_n`).
corpus = read_first_n('corpus/stories', 10000)

# Character-level tokenizer fitted on that corpus; used by the exploratory
# cells below.
tokenizer = TokenizerChar(corpus)
vocab_size = tokenizer.vocab_size

In [52]:
# Id of the single space character; passed to pair_freq as the stop token.
stop_token = np.array(tokenizer.tokenize(" "))[0]

# ASCII-normalise every corpus item.
corpus_clean = list(map(normalize_to_ascii, corpus))

# Join everything into one string, split it into word-like chunks
# (word characters and apostrophes) and single punctuation characters, then
# re-join with single spaces so every chunk is space-delimited.
corpus_flatten = " ".join(
    re.findall(r"[\w']+|[^\w\s]", " ".join(corpus_clean))
)

# Character ids of the whole flattened corpus.
corpus_indices = tokenizer.tokenize(corpus_flatten)

In [53]:
# Sanity check: show the first 100 character ids of the flattened corpus.
print(corpus_indices[:100])

tf.Tensor(
[42 85  8 84  1 80 71 71 74 68 74 66 77  1 27  1 54  1 15  1 52  1 15  1
 49 83 70 84 74 69 70 79 85  1 35 66 83 66 68 76  1 48 67 66 78 66  1 88
 66 79 85 84  1 77 66 88 78 66 76 70 83 84  1 85 80  1 88 70 74 72 73  1
 74 79  1 80 79  1 88 73 70 85 73 70 83  1 85 80  1 86 84 70  1 78 74 77
 74 85 66 83], shape=(100,), dtype=int32)


In [80]:
def pair_freq(indices, stop_token, vocab_size):
    """Return the most frequent adjacent token pair that avoids ``stop_token``.

    Every adjacent pair ``(a, b)`` is encoded as the single integer
    ``a + b * vocab_size`` so that all pairs can be counted with one
    ``np.bincount`` call. Ties are resolved in favour of the smallest code.

    Parameters
    ----------
    indices : array-like of int
        1-D sequence of token ids. The pair encoding is only unambiguous when
        every id lies in ``[0, vocab_size)``.
    stop_token : int
        Pairs in which either element equals this id are ignored.
    vocab_size : int
        Base used for the pair encoding.

    Returns
    -------
    tuple
        ``(idx1, idx2)``: the left and right id of the most frequent pair.

    Raises
    ------
    ValueError
        If there is no adjacent pair free of ``stop_token`` (this includes
        inputs with fewer than two tokens).
    """
    indices = np.asarray(indices)
    left, right = indices[:-1], indices[1:]

    # Keep only pairs where neither side is the stop token.
    eligible = np.logical_and(left != stop_token, right != stop_token)

    # Build the pair codes in int64: computing them in the input dtype
    # (e.g. int16/int32) silently wraps around once b * vocab_size no longer
    # fits, which then breaks or corrupts the bincount below.
    pair_codes = right[eligible].astype(np.int64)
    pair_codes *= vocab_size
    pair_codes += left[eligible]

    if pair_codes.size == 0:
        raise ValueError(
            "pair_freq: no adjacent token pair without the stop token"
        )

    best = np.argmax(np.bincount(pair_codes))
    return (best % vocab_size, best // vocab_size)

In [68]:
# Total number of character ids in the flattened corpus.
length = len(corpus_indices)
print(length)

# pair_freq as defined in this notebook requires the vocabulary size as its
# third argument; calling it with two arguments raises TypeError on a fresh
# run.
ost = pair_freq(corpus_indices, stop_token, vocab_size)

43192481


In [71]:
# The pair_freq defined in this notebook returns an (idx1, idx2) tuple.
# A combined code idx1 + idx2 * vocab_size is still decoded, in case `ost`
# was produced by a variant of pair_freq that returns the raw code.
if isinstance(ost, tuple):
    idx1, idx2 = ost
else:
    idx1 = ost % vocab_size
    idx2 = ost // vocab_size
print(tokenizer.vocab[idx1], tokenizer.vocab[idx2])

t h


In [None]:



class TokenizerBPE:
    """Byte-pair-encoding (BPE) tokenizer layered on a character tokenizer.

    Training starts from the vocabulary of a ``TokenizerChar`` fitted on the
    corpus and performs ``num_merges`` merge steps. Each step asks
    ``pair_freq`` for the most frequent adjacent pair of token ids (with the
    id of the space character passed as the stop token), registers the
    concatenation of the two tokens as a new token, and rewrites the corpus
    with the pair replaced by the new id.

    Attributes set by ``__init__``:
        tokenizer        -- the underlying ``TokenizerChar``.
        token_to_idx     -- token string -> id; this is the *same* dict object
                            as ``tokenizer.token_to_idx``, so merged tokens
                            are added to the character tokenizer's mapping too.
        idx_to_token     -- id -> token string.
        vocab_size       -- current number of tokens (grows by one per merge).
        stop_token       -- id of the single space character.
        merge_list       -- list of ``[(idx1, idx2), new_idx]`` in the order
                            the merges were learned.
        table_detokenize -- TensorFlow lookup table id -> token string.
        word_list        -- initialised to ``None``; not used in this class.
    """

    def __init__(self, corpus, num_merges):
        """Fit the character tokenizer on ``corpus`` and learn ``num_merges`` merges.

        ``corpus`` is an iterable of strings; ``num_merges`` is the number of
        merge steps (and therefore of new tokens) to learn.
        """
        self.tokenizer = TokenizerChar(corpus)
        self.token_to_idx = self.tokenizer.token_to_idx
        self.idx_to_token = {v: k for k, v in self.token_to_idx.items()}

        self.vocab_size = self.tokenizer.vocab_size

        self.create_hash()

        self.stop_token = np.array(self.tokenizer.tokenize(" "))[0]

        # ASCII-normalise, then split into word-like chunks (word characters
        # and apostrophes) and single punctuation characters, re-joined with
        # single spaces so every chunk is space-delimited.
        corpus_clean = [normalize_to_ascii(line) for line in corpus]
        corpus_flatten = " ".join(corpus_clean)
        corpus_flatten = " ".join(re.findall(r"[\w']+|[^\w\s]", corpus_flatten))

        corpus_indices = self.tokenizer.tokenize(corpus_flatten)

        self.merge_list = []
        for _ in tqdm(range(num_merges)):
            corpus_indices = self.merge(corpus_indices)

        # Rebuild the lookup tables so they include the merged tokens.
        self.create_hash()
        self.word_list = None

    @staticmethod
    def _merge_pair(indices, idx1, idx2, new_idx):
        """Replace each non-overlapping ``(idx1, idx2)`` pair by ``new_idx``.

        Matches are resolved greedily from left to right. This only matters
        when ``idx1 == idx2``: in a run such as ``a a a`` the candidate
        matches at positions 0 and 1 overlap, and applying both would collapse
        three tokens into one merged token, dropping a token. Here the result
        is ``[new_idx, a]`` instead.

        The input is not modified; a new 1-D array is returned when at least
        one pair was merged, otherwise the input array itself.
        """
        indices = np.asarray(indices)
        if indices.size < 2:
            return indices

        hits = np.flatnonzero(
            np.logical_and(indices[:-1] == idx1, indices[1:] == idx2)
        )
        if hits.size == 0:
            return indices

        if idx1 == idx2 and hits.size > 1:
            # Within every run of consecutive match positions keep only each
            # second one, counted from the start of the run.
            run_starts = np.concatenate(([True], np.diff(hits) != 1))
            run_origin = hits[run_starts][np.cumsum(run_starts) - 1]
            hits = hits[(hits - run_origin) % 2 == 0]

        merged = indices.copy()
        merged[hits] = new_idx
        # Drop the right-hand element of every merged pair.
        return np.delete(merged, hits + 1)

    def tokenize(self, text):
        """Tokenize ``text`` into BPE ids.

        The text is first tokenized by the character tokenizer, then every
        learned merge is applied in the order it was learned.

        Returns an int32 tensor of shape ``(1, n_tokens)``.
        """
        indices = np.array(self.tokenizer.tokenize(text))
        for (idx1, idx2), new_idx in self.merge_list:
            indices = self._merge_pair(indices, idx1, idx2, new_idx)

        return tf.expand_dims(tf.convert_to_tensor(indices, dtype=tf.int32), axis=0)

    def detokenize(self, indices):
        """Map token ids back to text.

        Each id is looked up in ``table_detokenize`` (unknown ids map to the
        empty string) and the resulting strings are joined along the last axis.
        """
        text = self.table_detokenize.lookup(indices)
        text = tf.strings.reduce_join(text, axis=-1, separator="")
        return text

    def merge(self, corpus_indices):
        """Learn one merge and apply it to ``corpus_indices``.

        Finds the most frequent eligible pair via ``pair_freq``, records it in
        ``merge_list``, registers the concatenated token under the next free
        id, and returns the corpus ids with that pair merged.
        """
        # asarray: avoid copying the whole corpus again when it is already a
        # NumPy array (every call after the first).
        corpus_indices = np.asarray(corpus_indices)

        new_idx = self.vocab_size
        idx1, idx2 = pair_freq(corpus_indices, self.stop_token, self.vocab_size)
        self.merge_list.append([(idx1, idx2), new_idx])

        token1 = self.idx_to_token[idx1]
        token2 = self.idx_to_token[idx2]
        # Progress log: one line per learned merge.
        print(token1, token2)
        new_token = token1 + token2
        self.token_to_idx[new_token] = new_idx
        self.idx_to_token[new_idx] = new_token
        self.vocab_size += 1

        return self._merge_pair(corpus_indices, idx1, idx2, new_idx)

    def create_hash(self):
        """(Re)build the lookup tables from the current vocabulary.

        Delegates to the character tokenizer's ``create_hash`` and builds the
        id -> token table used by ``detokenize``.
        """
        vocab = list(self.token_to_idx.keys())
        indices = list(self.token_to_idx.values())

        self.tokenizer.create_hash()
        self.table_detokenize = tf.lookup.StaticHashTable(
            initializer=tf.lookup.KeyValueTensorInitializer(indices, vocab),
            default_value="",
        )

    def destroy_hash(self):
        """Drop the lookup tables; call ``create_hash`` to rebuild them.

        This notebook calls it right before pickling the tokenizer —
        presumably because the TensorFlow tables cannot be pickled (confirm).
        """
        self.tokenizer.destroy_hash()
        self.table_detokenize = None

In [82]:
# Train the BPE tokenizer with 8000 merges (one progress line is printed per
# merge). Note: this rebinds `tokenizer` from the character tokenizer created
# earlier to the BPE tokenizer.
tokenizer = TokenizerBPE(corpus, 8000)

  0%|          | 0/8000 [00:00<?, ?it/s]

t h
i n
e r
a n
th e
o n
r e
a t
e n
e d
o r
s t
a l
t o
in g
o u
a r
i s
e s
i t
o f
a s
an d
i c
h e
l e
i g
o m
i d
i on
i l
en t
a c
s e
ig h
b e
a y
v e
a d
f or
r o
a m
' s
o l
l y
c h
w h
i m
th at
o w
v er
o t
u r
s a
u t
t er
i r
igh t
u n
at ion
d e
T he
a re
c e
sa id
p e
il l
g e
i th
w as
n e
c on
er s
l d
h a
r i
u s
er e
re s
o p
a b
w ith
l i
c t
w e
m o
k e
c om
ou n
at e
ou t
t e
m ent
r a
u l
h igh
l ight
al l
f r
s u
l o
p ro
h is
ou r
i st
a g
high light
s h
ha ve
p l
f e
on e
e x
th er
or t
a p
fr om
d ay
a in
b y
ar t
it y
n ot
h as
s o
e ar
an t
p o
g o
a st
m an
m e
ou ld
d i
0 0
t r
l l
ac k
en d
p ort
om e
w or
es s
g h
i al
wh o
i es
u st
b ut
a k
a u
i ve
u p
d o
th is
of f
q u
e l
ic e
s ay
ac t
c l
y ear
o d
I n
a f
h er
w ill
an s
i f
the y
i e
ar d
o re
C N
CN N
' t
on g
w ere
m er
t s
i a
p r
t im
d ing
the ir
d er
p er
g r
h ad
id e
T h
y ou
o ver
b l
n ow
ab out
be en
in e
oun d
e p
id ent
op le
i v
ou gh
e v
u m
c an
mo re
o k
c r
e m
I t
ar y
ic h


In [84]:
# Drop the TensorFlow lookup tables before pickling, then save the tokenizer.
tokenizer.destroy_hash()
# Context manager so the file is flushed and closed deterministically.
with open("tokenizer_CNN8000_2.pkl", "wb") as tokenizer_file:
    pkl.dump(tokenizer, tokenizer_file)

In [85]:
def _merge_pair_indices(indices, first, second, new_idx):
    """Replace each non-overlapping ``(first, second)`` pair by ``new_idx``.

    Matches are resolved greedily from left to right; this matters only when
    ``first == second``, where neighbouring matches overlap (``a a a`` must
    become ``[new_idx, a]``, not a single ``new_idx``).
    """
    if indices.size < 2:
        return indices

    hits = np.flatnonzero(
        np.logical_and(indices[:-1] == first, indices[1:] == second)
    )
    if hits.size == 0:
        return indices

    if first == second and hits.size > 1:
        # Within every run of consecutive match positions keep only each
        # second one, counted from the start of the run.
        run_starts = np.concatenate(([True], np.diff(hits) != 1))
        run_origin = hits[run_starts][np.cumsum(run_starts) - 1]
        hits = hits[(hits - run_origin) % 2 == 0]

    merged = indices.copy()
    merged[hits] = new_idx
    # Drop the right-hand element of every merged pair.
    return np.delete(merged, hits + 1)


def tokenize(indices, merge_list):
    """Apply learned BPE merges to a sequence of token ids.

    Parameters
    ----------
    indices : array-like of int
        1-D token ids to start from (the notebook passes character ids).
    merge_list : iterable of ``(pair, new_idx)``
        Merges in the order they were learned; ``pair`` is ``(idx1, idx2)``.

    Returns
    -------
    tf.Tensor
        int32 tensor of shape ``(1, n_tokens)`` with all merges applied.
    """
    indices = np.array(indices)
    for pair, new_idx in merge_list:
        indices = _merge_pair_indices(indices, pair[0], pair[1], new_idx)

    return tf.expand_dims(tf.convert_to_tensor(indices, dtype=tf.int32), axis=0)

In [90]:
# Reload the tokenizer saved above. Unpickling executes code from the file,
# so only ever load pickles produced by this notebook / a trusted source.
with open("tokenizer_CNN8000_2.pkl", "rb") as tokenizer_file:
    tokenizer = pkl.load(tokenizer_file)
# The lookup tables were dropped before pickling; rebuild them.
tokenizer.create_hash()

# Character ids for every corpus item, using the underlying char tokenizer.
# NOTE(review): the raw corpus items are tokenized here, without the ASCII
# normalisation and word/punctuation splitting applied during BPE training —
# confirm this difference is intended.
indicies_list = [np.array(tokenizer.tokenizer.tokenize(line)) for line in tqdm(corpus)]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [91]:
# Apply the learned merges, in order, to the character ids of every corpus item.
indicies_merged_list = [tokenize(indicies, tokenizer.merge_list) for indicies in tqdm(indicies_list)]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [92]:
# Persist the tokenized corpus; the context manager guarantees the file is
# flushed and closed.
with open('corpus/CNN_tokenized8000_2.pkl', 'wb') as tokenized_file:
    pkl.dump(indicies_merged_list, tokenized_file)

## 