## LLM

In [1]:
import matplotlib.pyplot as plt
from IPython.display import clear_output
from src.tokenizer import TokenizerChar, word_split, normalize_to_ascii

import os
import time
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import tensorflow as tf
import numpy as np
from tqdm.notebook import tqdm

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [2]:
import os

def read_first_n(directory_path, n):
    # List all entries in the directory
    filenames = os.listdir(directory_path)
    # Filter to only .txt files
    txt_files = [f for f in filenames if f.lower().endswith('.story')]
    # Sort alphabetically (or by any other criteria you like)
    #txt_files.sort()
    # Take the first n
    first_n = txt_files[:n]
    
    contents = []
    for fname in first_n:
        full_path = os.path.join(directory_path, fname)
        with open(full_path, 'r', encoding='utf-8') as f:
            contents.append(normalize_to_ascii(f.read()))
    return contents



In [3]:
corpus = read_first_n('corpus/stories', 1000)

In [4]:
def get_vocabulary(corpus):
    """Return a dict mapping words to their counts."""
    vocab = {}
    for line in corpus:
        for word in line.strip().split():
            vocab[word] = vocab.get(word, 0) + 1
    return vocab

In [5]:
def pair_freq(word_list):
    """Return a dict mapping pairs of words to their counts."""
    pairs = {}
    for word in word_list:
        for i in range(len(word) - 1):
            pair = (word[i], word[i + 1])
            pairs[pair] = pairs.get(pair, 0) + 1
    return pairs

In [None]:
class TokenizerBPE:
    def __init__(self, corpus, num_merges):
        self.tokenizer = TokenizerChar(corpus)
        self.token_to_idx = self.tokenizer.token_to_idx
        self.vocab_size = self.tokenizer.vocab_size

        self.word_list = []
        for line in corpus:
            self.word_list.extend(word_split(line))

        self.merge_list = []
        for i in tqdm(range(num_merges)):
            self.merge()

        vocab = list(self.token_to_idx.keys())
        indicies = list(self.token_to_idx.values())

        self.table_detokenize = tf.lookup.StaticHashTable(initializer=tf.lookup.KeyValueTensorInitializer(indicies, vocab), 
                                                          default_value="")



    def tokenize(self, text):
        indicies = np.array(self.tokenizer.tokenize(text))
        for (idx1, idx2), new_idx in self.merge_list:
            for i in reversed(range(len(indicies) - 1)):
                pair = (indicies[i], indicies[i + 1])
                if pair == (idx1, idx2):
                    indicies[i] = new_idx
                    indicies = np.delete(indicies, i + 1)
        
        return indicies

    def detokenize(self, indices):
        text = self.table_detokenize.lookup(indices)
        text = tf.strings.reduce_join(text, axis=-1, separator="")
        return text

    def merge(self):
        pf = pair_freq(self.word_list)
        key_max = max(pf, key=pf.get)
        token1, token2 = key_max
        new_token = token1 + token2
        self.token_to_idx[new_token] = self.vocab_size

        idx1, idx2 = self.token_to_idx[token1], self.token_to_idx[token2]
        self.merge_list.append([(idx1, idx2), self.vocab_size])

        self.vocab_size += 1

        for word in self.word_list:
            for i in reversed(range(len(word) - 1)):
                pair = (word[i], word[i + 1])
                if pair == key_max:
                    word[i] = new_token
                    word.pop(i + 1)

In [32]:
tokenizer_bpe = TokenizerBPE(corpus[:10], num_merges=150)

  0%|          | 0/150 [00:00<?, ?it/s]

In [33]:
print(tokenizer_bpe.merge_list)

[[(69, 57), 76], [(58, 63), 77], [(54, 67), 78], [(64, 63), 79], [(50, 63), 80], [(50, 69), 81], [(67, 54), 82], [(76, 54), 83], [(54, 63), 84], [(64, 67), 85], [(50, 61), 86], [(54, 53), 87], [(58, 68), 88], [(50, 68), 89], [(50, 67), 90], [(69, 64), 91], [(54, 68), 92], [(64, 70), 93], [(58, 69), 94], [(58, 52), 95], [(64, 55), 96], [(77, 56), 97], [(58, 79), 98], [(61, 54), 99], [(58, 53), 100], [(58, 56), 101], [(80, 53), 102], [(57, 54), 103], [(68, 69), 104], [(58, 61), 105], [(84, 69), 106], [(64, 62), 107], [(68, 50), 108], [(50, 62), 109], [(50, 52), 110], [(67, 58), 111], [(64, 61), 112], [(101, 57), 113], [(55, 85), 114], [(71, 54), 115], [(51, 54), 116], [(50, 53), 117], [(72, 57), 118], [(69, 78), 119], [(82, 68), 120], [(70, 67), 121], [(68, 54), 122], [(108, 100), 123], [(6, 68), 124], [(50, 74), 125], [(81, 98), 126], [(54, 62), 127], [(65, 54), 128], [(76, 81), 129], [(71, 78), 130], [(69, 67), 131], [(76, 78), 132], [(52, 79), 133], [(113, 69), 134], [(50, 56), 135], 

In [34]:
indicies = tf.cast(tokenizer_bpe.tokenize(corpus[0]), tf.int32)

In [18]:
import numpy as np

a = np.array(["3", "4", "5", "6", "7", "8", "9"])
b = np.array(["a", "b", "c", "d", "e", "f", "g"])


c = np.array([a, b])
print(c)
print(c[:,1:])
print(c[:,:-1])
print(c + c)


[['3' '4' '5' '6' '7' '8' '9']
 ['a' 'b' 'c' 'd' 'e' 'f' 'g']]
[['4' '5' '6' '7' '8' '9']
 ['b' 'c' 'd' 'e' 'f' 'g']]
[['3' '4' '5' '6' '7' '8']
 ['a' 'b' 'c' 'd' 'e' 'f']]


UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U1'), dtype('<U1')) -> None