## LLM

In [1]:
import matplotlib.pyplot as plt
from IPython.display import clear_output

In [2]:
import os
import time
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [3]:
import tensorflow as tf
import numpy as np
from tqdm.notebook import tqdm

print("Physical devices:", tf.config.list_physical_devices())
print("GPUs:", tf.config.list_physical_devices('GPU'))


Physical devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [264]:

    

class DenseLayer():
    def __init__(self, input_dim, output_dim):
        d = tf.sqrt(tf.cast(input_dim, tf.float32))
        self.W = tf.Variable(tf.random.uniform([input_dim, output_dim], -1/d, 1/d))
        self.b = tf.Variable(tf.zeros([output_dim]))

        self.parameter_list = [self.W, self.b]

    def __call__(self, x):
        return tf.linalg.matmul(x, self.W) + self.b

    
class Transformer:
    def __init__(self, 
                 vocab_size, 
                 max_seq_len,
                 heads,
                 embed_dim,
                 key_dim,
                 ffnn_dims,
                 unembed_dims,
                 lr): 
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.heads = heads
        self.max_seq_len = max_seq_len
        self.key_dim = key_dim
        self.ffnn_dims = ffnn_dims
        self.unembed_dims = unembed_dims

        self.head_dim = embed_dim // heads

        d = tf.sqrt(tf.cast(self.embed_dim, tf.float32))

        self.word_embed = tf.Variable(tf.random.uniform([vocab_size, embed_dim], -1/d, 1/d))
        self.pos_embed = tf.Variable(tf.random.uniform([max_seq_len, embed_dim], -1/d, 1/d))

        self.WK =  tf.Variable(tf.random.uniform([heads, key_dim, embed_dim], -1/d, 1/d))
        self.WQ =  tf.Variable(tf.random.uniform([heads, key_dim, embed_dim], -1/d, 1/d))
        self.WV =  tf.Variable(tf.random.uniform([heads, self.head_dim, embed_dim], -1/d, 1/d))


        self.ffnn_dims.insert(0, embed_dim)
        self.ffnn_dims.append(embed_dim)
        self.ffnn_layers = []
        for i in range(len(ffnn_dims)-1):
             self.ffnn_layers.append(DenseLayer(ffnn_dims[i], ffnn_dims[i+1]))

        self.unembed_dims.insert(0, embed_dim)
        self.unembed_dims.append(vocab_size)   
        self.unembed_layers = []
        for i in range(len(unembed_dims)-1):
            self.unembed_layers.append(DenseLayer(unembed_dims[i], unembed_dims[i+1]))



        self.parameter_list = [self.word_embed, self.pos_embed, 
                               self.WK, self.WQ, self.WV]
        for layer in self.ffnn_layers:
            self.parameter_list += layer.parameter_list
        for layer in self.unembed_layers:
            self.parameter_list += layer.parameter_list
        
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    def pred(self, x):

        x_embeds = self.embed(x)
        x_embeds = self.attention(x_embeds)
        x_embeds = self.ffnn(x_embeds)
        y_pred = self.unembed(x_embeds)

        return y_pred
    
    
    def embed(self, x):
        seq = tf.shape(x)[1]
        if seq > self.max_seq_len:
            x = x[-self.max_seq_len:]
            seq = self.max_seq_len
        x_embeds = tf.nn.embedding_lookup(self.word_embed, x)
        x_embeds = x_embeds + tf.expand_dims(self.pos_embed[:seq], axis=0)

        return x_embeds
    

    def attention(self, x_embeds):
        batch = tf.shape(x_embeds)[0]
        seq = tf.shape(x_embeds)[1]

        x_k = tf.einsum('ikl, bjl -> bijk', self.WK, x_embeds)
        x_q = tf.einsum('ikl, bjl -> bijk', self.WQ, x_embeds)
        x_v = tf.einsum('ikl, bjl -> bijk', self.WV, x_embeds)

        inner = tf.einsum('bijl,bikl -> bijk', x_k, x_q)
        mask = tf.linalg.band_part(tf.ones((1, seq, seq), dtype = tf.bool), -1, 0)
        mask = tf.repeat(mask, self.heads, axis=0)

        inner_masked = tf.where(mask, inner, tf.constant(-np.inf))

        dk = tf.sqrt(tf.cast(self.key_dim, tf.float32))
        WA = tf.nn.softmax(inner_masked/dk, axis=-1)

        head_outs = WA @ x_v
        concat   = tf.transpose(head_outs, [0, 2, 1, 3])  # [batch, seq, heads, head_dim]
        out   = tf.reshape(concat, [batch, seq, self.embed_dim])
     
        x_embeds = x_embeds + out

        return x_embeds
    

    def ffnn(self, x_embeds):
        x_up = x_embeds
        for layer in self.ffnn_layers[:-1]:
            x_up = layer(x_up)
            x_up = tf.nn.relu(x_up)
        x_down = self.ffnn_layers[-1](x_up)

        x_embeds = x_embeds + x_down
        
        return x_embeds
        
    
    def unembed(self, x_embeds):
        for layer in self.unembed_layers[:-1]:
            x_embeds = layer(x_embeds)
            x_embeds = tf.nn.relu(x_embeds)
        
        x_embeds = self.unembed_layers[-1](x_embeds)
        y_pred = tf.nn.softmax(x_embeds, axis=-1)
        return y_pred

    
    @tf.function
    def train_step(self, indices, y_true):
        
        with tf.GradientTape() as tape:
            loss = self.evaluate(indices, y_true)

        grads = tape.gradient(loss, self.parameter_list)
        self.optimizer.apply_gradients(zip(grads, self.parameter_list))
        return loss

    def evaluate(self, indices, y_true):
        y_true = y_true[:, 1:]
        y_pred = self.pred(indices)[:,:-1]
        loss = CrossEntropyLoss(y_true, y_pred)
        return loss


    
def CrossEntropyLoss(y_true, y_pred):
    loss = -tf.reduce_mean(y_true * tf.math.log(y_pred + 1e-10))
    return loss


In [265]:
import os
from src.tokenizer import TokenizerChar

def read_first_n(directory_path, n):
    # List all entries in the directory
    filenames = os.listdir(directory_path)
    # Filter to only .txt files
    txt_files = [f for f in filenames if f.lower().endswith('.story')]
    # Sort alphabetically (or by any other criteria you like)
    #txt_files.sort()
    # Take the first n
    first_n = txt_files[:n]
    
    contents = []
    for fname in first_n:
        full_path = os.path.join(directory_path, fname)
        with open(full_path, 'r', encoding='utf-8') as f:
            contents.append(f.read())
    return contents

In [268]:
contents = read_first_n('stories', 1000)

In [285]:
def get_vocabulary(corpus):
    """Return a dict mapping words to their counts."""
    vocab = {}
    for line in corpus:
        for word in line.strip().split():
            vocab[word] = vocab.get(word, 0) + 1
    return vocab

In [286]:
vocab = get_vocabulary(contents)

In [None]:



for key in vocab.keys():
    tokens = [i for i in key]
    tokens[-1] += "</w>"

In [292]:
from collections import Counter

def get_pair_stats(vocab):
    pairs = Counter()
    for symbols, freq in vocab.items():
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i+1])] += freq
    return pairs

In [293]:
pairs = get_pair_stats(vocab)

In [294]:
print(pairs)

Counter({('t', 'h'): 73192, ('h', 'e'): 69701, ('i', 'n'): 62366, ('e', 'r'): 52071, ('a', 'n'): 49633, ('r', 'e'): 44624, ('o', 'n'): 41181, ('a', 't'): 36550, ('o', 'r'): 34559, ('e', 'n'): 34445, ('e', 'd'): 32003, ('e', 's'): 31767, ('t', 'e'): 31012, ('n', 'd'): 30909, ('t', 'o'): 30056, ('a', 'r'): 28946, ('n', 'g'): 28811, ('t', 'i'): 28429, ('s', 't'): 28041, ('a', 'l'): 26192, ('i', 't'): 26110, ('h', 'a'): 26011, ('i', 's'): 25475, ('n', 't'): 24999, ('o', 'u'): 23762, ('a', 's'): 22587, ('l', 'i'): 21574, ('l', 'e'): 21233, ('h', 'i'): 21233, ('v', 'e'): 21127, ('o', 'f'): 20521, ('e', 'a'): 20436, ('m', 'e'): 20317, ('s', 'e'): 19976, ('r', 'i'): 19804, ('d', 'e'): 18669, ('r', 'o'): 17634, ('n', 'e'): 17621, ('c', 'o'): 17531, ('i', 'c'): 17349, ('l', 'l'): 16836, ('i', 'o'): 16355, ('r', 'a'): 16152, ('c', 'e'): 14926, ('o', 'm'): 14490, ('i', 'l'): 14417, ('t', 'a'): 14414, ('i', 'g'): 14212, ('i', 'd'): 14206, ('a', 'i'): 13889, ('c', 'a'): 13507, ('l', 'a'): 13429, ('b

In [295]:
# string to byte

string = 'hello'
byte_string = string.encode('utf-8')
print(byte_string)  # Output: b'hello'



b'hello'


## 