## LLM

In [1]:
import matplotlib.pyplot as plt
from IPython.display import clear_output
from src.tokenizer import TokenizerChar

In [2]:
import os
import time
# Expose only GPU 0 to this process. The variable is set before TensorFlow is
# imported -- presumably so the restriction is already in place when TF
# enumerates devices (TODO confirm).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import tensorflow as tf
# Sanity check: report how many GPUs TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [3]:
import tensorflow as tf
import numpy as np
from tqdm.notebook import tqdm

# Diagnostic output: every device TensorFlow reports, then only the GPUs.
print("Physical devices:", tf.config.list_physical_devices())
print("GPUs:", tf.config.list_physical_devices('GPU'))


Physical devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [4]:
import os
import re
from src.tokenizer import TokenizerChar

def read_first_n(directory_path, n, extension='.story', sort=False):
    """Read the contents of up to ``n`` files that carry a given extension.

    Args:
        directory_path: Directory to scan (top level only, not recursive).
        n: Maximum number of files to read. It is used as a slice bound, so
            ``None`` reads every matching file.
        extension: Filename suffix to keep, compared case-insensitively.
            Defaults to ``'.story'``.
        sort: When true, matching filenames are sorted alphabetically before
            the first ``n`` are taken, which makes the selection reproducible.
            The default (False) keeps the order returned by ``os.listdir``,
            which is arbitrary and may differ between machines.

    Returns:
        A list with the UTF-8 decoded text of each selected file, in
        selection order.
    """
    suffix = extension.lower()
    # Keep only the entries whose name ends with the requested extension.
    matching = [name for name in os.listdir(directory_path)
                if name.lower().endswith(suffix)]
    if sort:
        matching.sort()

    contents = []
    for name in matching[:n]:
        full_path = os.path.join(directory_path, name)
        with open(full_path, 'r', encoding='utf-8') as f:
            contents.append(f.read())
    return contents

import unicodedata

def normalize_to_ascii(s: str) -> str:
    """Return ``s`` reduced to plain ASCII characters.

    The text is first put through NFKD normalization, which splits accented
    characters into a base character plus combining marks. Encoding to ASCII
    with the ``'ignore'`` error handler then drops every character that has
    no ASCII representation (the combining marks included).
    """
    decomposed = unicodedata.normalize('NFKD', s)
    return decomposed.encode('ascii', 'ignore').decode('ascii')

In [5]:
# Load the text of up to 1000 '.story' files found in corpus/stories
# (read_first_n applies the extension filter and the first-n slice).
corpus = read_first_n('corpus/stories', 1000)

In [6]:
def get_vocabulary(corpus):
    """Count whitespace-separated words over every text in ``corpus``.

    Returns a dict keyed by word, with the number of occurrences as value.
    Keys appear in order of first occurrence.
    """
    counts = {}
    for text in corpus:
        for token in text.strip().split():
            if token in counts:
                counts[token] += 1
            else:
                counts[token] = 1
    return counts

In [17]:
def pair_freq(word_list):
    """Count adjacent symbol pairs inside each word of ``word_list``.

    Each word is a sequence of symbols. Every pair of neighbouring symbols
    ``(word[i], word[i + 1])`` is tallied; pairs never span two words.
    Returns a dict mapping each pair tuple to its count.
    """
    counts = {}
    for symbols in word_list:
        for pos in range(1, len(symbols)):
            key = (symbols[pos - 1], symbols[pos])
            counts[key] = counts.get(key, 0) + 1
    return counts

In [31]:
def word_split(line):
    """Break ``line`` into words, each represented as a list of characters.

    The line is normalized to ASCII first, then split on whitespace.
    """
    ascii_line = normalize_to_ascii(line)
    return [list(word) for word in ascii_line.strip().split()]


class TokenizerBPE:
    """Byte-pair-encoding style tokenizer built on top of ``TokenizerChar``.

    Attributes:
        tokenizer: The underlying ``TokenizerChar`` created from the corpus.
        token_to_idx: The character tokenizer's token -> index mapping;
            ``merge()`` adds one entry per learned merge.
        word_list: Every word of the corpus as a list of symbols. Symbols
            start out as single characters and are fused by ``merge()``.
        merge_list: History of merges, one ``[(idx1, idx2), new_token]``
            entry per call to ``merge()`` that performed a merge.
    """

    def __init__(self, corpus, num_merges):
        """Build the character tokenizer and the per-word symbol lists.

        Args:
            corpus: Iterable of text lines/documents.
            num_merges: Intended number of merges to learn.
                NOTE(review): currently unused -- the automatic merge loop
                below is disabled, so call ``merge()`` explicitly.
        """
        self.tokenizer = TokenizerChar(corpus)
        # Same mapping object the char tokenizer exposes (assuming
        # token_to_idx is a plain attribute), so additions made by merge()
        # are visible through self.tokenizer as well.
        self.token_to_idx = self.tokenizer.token_to_idx

        self.word_list = []
        for line in corpus:
            self.word_list.extend(word_split(line))

        self.merge_list = []
        #for i in range(num_merges):
        #    self.merge()


    def tokenize(self, text):
        """Tokenize ``text`` by delegating to the character tokenizer.

        ``merge_list`` is not consulted here.
        """
        return self.tokenizer.tokenize(text)

    def detokenize(self, indices):
        """Look up ``indices`` in the character tokenizer's vocabulary.

        NOTE(review): the accepted index type depends on what
        ``TokenizerChar.get_vocabulary()`` returns -- confirm against it.
        """
        return self.tokenizer.get_vocabulary()[indices]

    def merge(self):
        """Learn one merge: fuse the most frequent adjacent symbol pair.

        The pair's concatenation is registered in ``token_to_idx``, the merge
        is appended to ``merge_list``, and every occurrence of the pair in
        ``word_list`` is replaced in place by the new symbol.

        Does nothing when no word has two or more symbols left.

        Raises:
            KeyError: If either symbol of the chosen pair is missing from
                ``token_to_idx``. No state is modified in that case.
        """
        pf = pair_freq(self.word_list)
        if not pf:
            # No adjacent pairs remain; max() would raise on an empty dict.
            return

        key_max = max(pf, key=pf.get)
        token1, token2 = key_max
        # Resolve both constituents before changing any state, so a missing
        # symbol cannot leave a half-registered token behind.
        idx1, idx2 = self.token_to_idx[token1], self.token_to_idx[token2]

        new_token = token1 + token2
        # Keep the existing index if this string is already a token; assigning
        # len(...) over it would let the next new token reuse the same index.
        self.token_to_idx.setdefault(new_token, len(self.token_to_idx))
        self.merge_list.append([(idx1, idx2), new_token])

        for word in self.word_list:
            # Walk right to left so pop() never shifts a position that is
            # still to be visited.
            for i in reversed(range(len(word) - 1)):
                if (word[i], word[i + 1]) == key_max:
                    word[i] = new_token
                    word.pop(i + 1)



In [32]:
# Build the tokenizer from the first 10 documents only. The constructor's merge
# loop is commented out, so num_merges has no effect at construction time.
tokenizer_bpe = TokenizerBPE(corpus[:10], num_merges=10)

In [33]:
# Inspect the per-word symbol lists (this prints the entire list).
print(tokenizer_bpe.word_list)

[['I', 't', "'", 's'], ['o', 'f', 'f', 'i', 'c', 'i', 'a', 'l', ':'], ['U', '.', 'S', '.'], ['P', 'r', 'e', 's', 'i', 'd', 'e', 'n', 't'], ['B', 'a', 'r', 'a', 'c', 'k'], ['O', 'b', 'a', 'm', 'a'], ['w', 'a', 'n', 't', 's'], ['l', 'a', 'w', 'm', 'a', 'k', 'e', 'r', 's'], ['t', 'o'], ['w', 'e', 'i', 'g', 'h'], ['i', 'n'], ['o', 'n'], ['w', 'h', 'e', 't', 'h', 'e', 'r'], ['t', 'o'], ['u', 's', 'e'], ['m', 'i', 'l', 'i', 't', 'a', 'r', 'y'], ['f', 'o', 'r', 'c', 'e'], ['i', 'n'], ['S', 'y', 'r', 'i', 'a', '.'], ['O', 'b', 'a', 'm', 'a'], ['s', 'e', 'n', 't'], ['a'], ['l', 'e', 't', 't', 'e', 'r'], ['t', 'o'], ['t', 'h', 'e'], ['h', 'e', 'a', 'd', 's'], ['o', 'f'], ['t', 'h', 'e'], ['H', 'o', 'u', 's', 'e'], ['a', 'n', 'd'], ['S', 'e', 'n', 'a', 't', 'e'], ['o', 'n'], ['S', 'a', 't', 'u', 'r', 'd', 'a', 'y'], ['n', 'i', 'g', 'h', 't', ','], ['h', 'o', 'u', 'r', 's'], ['a', 'f', 't', 'e', 'r'], ['a', 'n', 'n', 'o', 'u', 'n', 'c', 'i', 'n', 'g'], ['t', 'h', 'a', 't'], ['h', 'e'], ['b', 'e', 

## 