## LLM

In [14]:
import matplotlib.pyplot as plt
import random
from IPython.display import clear_output
from src.tokenizer import TokenizerBPE, word_split, normalize_to_ascii

import os
import time
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import tensorflow as tf
import numpy as np
import pickle as pkl
from tqdm.notebook import tqdm
from src.transformer import *
from src.data_handling import read_first_n, sample_batch


print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))



Num GPUs Available:  1


In [15]:
tf.keras.mixed_precision.set_global_policy('mixed_float16')

In [16]:
tokenizer = pkl.load(open("tokenizers/tokenizer_CNN8000_lowercase.pkl", 'rb'))
tokenizer.create_hash()

random.seed(42)
corpus_indicies = pkl.load(open('corpus/CNN_tokenized8000_lowercase.pkl', 'rb'))
random.shuffle(corpus_indicies)

## Define Model

In [17]:
class WarmUpThenDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self,
                 initial_learning_rate: float,
                 warmup_steps: int,
                 decay_schedule_fn: tf.keras.optimizers.schedules.LearningRateSchedule):
        """
        initial_learning_rate: peak LR reached at end of warmup
        warmup_steps:      # of steps to ramp from 0 → initial_learning_rate
        decay_schedule_fn: a tf.keras schedule to apply *after* warmup
        """
        super().__init__()
        self.initial_lr = initial_learning_rate
        self.warmup_steps = warmup_steps
        self.decay_schedule_fn = decay_schedule_fn

    def __call__(self, step):
        # Cast to float32 for safety in graph mode
        step = tf.cast(step, tf.float32)
        warmup_steps = tf.cast(self.warmup_steps, tf.float32)

        # compute linear warmup: lr = initial_lr * (step / warmup_steps)
        warmup_lr = self.initial_lr * (step / warmup_steps)

        # after warmup_steps, switch to decay schedule (shift step count)
        decay_step = step - warmup_steps
        decay_lr = self.decay_schedule_fn(decay_step)

        # if step < warmup_steps, pick warmup_lr, else decay_lr
        return tf.cond(step < warmup_steps,
                       lambda: warmup_lr,
                       lambda: decay_lr)
    


In [23]:
initial_lr = 1e-3
decay_steps = 10000
decay_rate = 0.5
decay_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=initial_lr,
    decay_steps=decay_steps,
    decay_rate=decay_rate,
    staircase=False)

warmup_steps = 1000
lr_schedule = WarmUpThenDecay(
    initial_learning_rate=initial_lr,
    warmup_steps=warmup_steps,
    decay_schedule_fn=decay_schedule)

np.random.seed(42)
tf.random.set_seed(42)

max_seq_len = 128
embed_dim = 768
tf_blocks = 6
heads = 6
ff_dim = 4*embed_dim
weight_decay = 0.01
dropout = 0.05

unembed_dims = []

model = Transformer(vocab_size=tokenizer.vocab_size,
                    max_seq_len=max_seq_len,
                    embed_dim=embed_dim,
                    tf_blocks=tf_blocks,
                    heads=heads,
                    ff_dim = ff_dim,
                    unembed_dims=unembed_dims,
                    lr=lr_schedule,
                    wd = weight_decay,
                    dropout=dropout,
                    )

losses_train = []
losses_test = []


In [24]:
name = "model_8k_tokens_largeBatch_2"


ckpt = tf.train.Checkpoint(
    optimizer=model.optimizer,
    model=model
)
ckpt_manager = tf.train.CheckpointManager(
    ckpt, 
    directory="checkpoints/" + name,      # folder where ckpts are saved
    max_to_keep=5                         # only keep 5 latest checkpoints
)

In [20]:
#ckpt.restore(ckpt_manager.latest_checkpoint)
#losses_train, losses_test = pkl.load(open("checkpoints/losses_" + name + ".pkl", 'rb'))

In [25]:
total_params = 0
for var in model.parameter_list:
    shape = var.get_shape()
    num_params = 1
    for dim in shape:
        num_params *= dim
    total_params += num_params
print(f"Total number of parameters: {total_params}")

Total number of parameters: 48811396


In [None]:
batch_size = 32
for i in tqdm(range(100000)):
    indices, y_true = sample_batch(corpus_indicies[:20000], 
                                   batch_size, 
                                   tokenizer, 
                                   max_seq_len)
    
    
    loss_train = model.train_step(indices, y_true).numpy()
    losses_train.append(loss_train)
    print("Step: ", i, "Train Loss: ", loss_train)


    indices, y_true = sample_batch(corpus_indicies[20000:], 
                                batch_size//4, 
                                tokenizer, 
                                max_seq_len)
    
    loss_test = model.evaluate(indices, y_true).numpy()
        
    losses_test.append(loss_test)

    if (i+1) % 1000 == 0:
        ckpt_manager.save()
        pkl.dump([losses_train, losses_test], open("checkpoints/losses_" + name + ".pkl", 'wb'))


    lr = model.optimizer._decayed_lr(tf.float32).numpy()
    #"""
    clear_output(wait=True)

    # prepare x-axis for the last 400 steps
    start = max(0, len(losses_train) - 1000)
    x_zoom = np.arange(start, len(losses_train))

    fig, axes = plt.subplots(2, 1, figsize=(10, 8), sharex=False)

    # Top subplot: zoom on last 400 steps
    ax1 = axes[0]
    ax1.plot(x_zoom, losses_test[-1000:], label="Test Loss")
    ax1.plot(x_zoom, losses_train[-1000:], label="Train Loss")

    _min = min(losses_train[-1000:] + losses_test[-1000:])
    _max = max(losses_train[-1000:] + losses_test[-1000:])
    delta = _max - _min
    #ax1.set_ylim(_min - 0.1 * delta, _max + 0.1 * delta)

    ax1.set_title("Training Loss (Last 1000 Steps)")
    ax1.set_xlabel("Step")
    ax1.set_ylabel("Loss")
    ax1.legend()
    ax1.grid(True)

    # Bottom subplot: full series
    ax2 = axes[1]
    ax2.plot(losses_test[10:], label="Test Loss")
    ax2.plot(losses_train[10:], label="Train Loss, lr = {:.2e}".format(lr))

    ax2.set_title("Training Loss (Full Series)")
    ax2.set_xlabel("Step")
    ax2.set_ylabel("Loss")
    ax2.legend()
    ax2.grid(True)

    plt.tight_layout()
    plt.show()
    #"""

KeyboardInterrupt: 

In [None]:
def tokenize(indices, merge_list):
    indices = np.array(indices)
    for pair, new_idx in merge_list:
        slice = np.where(np.logical_and(indices[:-1] == pair[0],  indices[1:] == pair[1]))
        if len(slice[0]) > 0:
            indices[:-1][slice] = new_idx
            indices = np.delete(indices, (slice[0]+1))

    return tf.expand_dims(tf.convert_to_tensor(indices, dtype=tf.int32), axis=0)

In [None]:
text = "obama's"
text = text.lower()

indices = tf.cast(tokenizer.tokenizer.tokenize(text), tf.int32)
indices = tokenize(indices, tokenizer.merge_list)
print(indices)

tf.Tensor([[2318]], shape=(1, 1), dtype=int32)


In [None]:
T = 0.5
tf.random.set_seed(43)
for i in range(10):
    logits = model.call(indices)[0,-1:]
    print(logits)
    idx = tf.cast(tf.random.categorical(logits/T, num_samples=1), tf.int32)
    indices = tf.concat([indices, idx], axis=1)
    text_pred = tokenizer.detokenize(indices)
    text_pred = text_pred.numpy()[0].decode('utf-8').replace("\n", " ")
    print(text_pred, end='\r', flush=True)
    #time.sleep(0.05)


tf.Tensor([[ 0.722  -4.01    0.3157 ...  0.765  -0.439   0.2062]], shape=(1, 8068), dtype=float16)
tf.Tensor([[-0.0224 -3.639   0.3625 ...  0.4646 -0.7905  0.7554]], shape=(1, 8068), dtype=float16)
tf.Tensor([[-0.0667 -3.426   1.136  ...  0.4553 -0.715   0.5737]], shape=(1, 8068), dtype=float16)
tf.Tensor([[ 0.3997 -3.154   0.918  ...  0.887  -1.003   0.1648]], shape=(1, 8068), dtype=float16)
tf.Tensor([[ 1.62   -2.684   1.318  ...  1.4375 -1.097   0.321 ]], shape=(1, 8068), dtype=float16)
tf.Tensor([[ 2.068  -2.271   0.735  ...  1.551  -0.6445 -0.0883]], shape=(1, 8068), dtype=float16)
tf.Tensor([[ 1.715  -2.178   1.016  ...  1.084  -0.4465  0.6475]], shape=(1, 8068), dtype=float16)
tf.Tensor([[ 1.681  -1.813   1.763  ...  1.788  -0.7764  1.059 ]], shape=(1, 8068), dtype=float16)
tf.Tensor([[ 1.69   -1.354   0.866  ...  1.911  -0.3223  0.5347]], shape=(1, 8068), dtype=float16)
tf.Tensor([[ 1.695  -0.8296  0.5176 ...  1.673  -0.3013  0.332 ]], shape=(1, 8068), dtype=float16)
obama'surs

In [None]:
from sklearn.cluster import KMeans

def cosine_similarity(embed_a, embed_b):
    """
    Compute the cosine similarity between two vectors.
    """
    embed_b_T = tf.transpose(embed_b)
    dot_product = embed_a@embed_b_T
    
    norm_a = tf.linalg.norm(embed_a, axis=1, keepdims=True)
    norm_b = tf.linalg.norm(embed_b_T, axis=0, keepdims=True)

    return dot_product / (norm_a * norm_b)


def cluster(X, n_clusters):
    X = X/np.linalg.norm(X, axis=1, keepdims=True)

    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    inertia = kmeans.inertia_
    labels = kmeans.labels_
    clusters = kmeans.cluster_centers_

    return inertia, labels, clusters


class EmbeddingClustering:
    def __init__(self, tokenizer, n_clusters=10):
        
        self.tokenizer = tokenizer
        self.n_clusters = n_clusters

    def fit(self, word_embed):
        inertia, labels, clusters = cluster(word_embed, self.n_clusters)
        self.word_embed = word_embed
        self.inertia = inertia
        self.labels = labels
        self.clusters = tf.convert_to_tensor(clusters, dtype=tf.float32)

        cos_sim = cosine_similarity(self.clusters, word_embed)
        self.idx_list =  tf.argsort(cos_sim, axis=-1, direction='DESCENDING', stable=False, name=None)

    def print_clusters(self, n_words=10):
        for idx in self.idx_list:
            for i in idx[:n_words]:
                word = self.tokenizer.detokenize(tf.expand_dims(tf.cast(i, tf.int32), axis=0))
                word = word.numpy().decode('utf-8')
                print(word)
            print("\n")

In [None]:
word_embed = model.word_embed
embedding_clustering = EmbeddingClustering(tokenizer, n_clusters=100)
embedding_clustering.fit(word_embed)
embedding_clustering.print_clusters(n_words=10)

punish
restrain
assum
accomplish
interven
disagree
commit
adjust
embarrass
proceed


born
connected
speaking
packed
talking
traveling
gathered
working
deployed
struggled


legislative
legal
federal
criminal
formal
constitutional
current
judicial
supreme
professional


oklahom
minnes
matthe
princi
trium
thomp
cele
wim
franch
unte


iet
ork
inois
aste
ool
aff
helicop
uck
ld
igh


vehicles
homes
buildings
stations
airports
boats
towns
equipment
spots
animals


absolutely
really
pretty
always
simply
probably
totally
usually
rarely
essentially


liverpool
midfielder
portugal
england
argentina
barcelona
goalkeeper
soccer
ferrari
striker


genu
^
includ
lished
bollah
apore
estab
massachu
issip
inclu


tun
hurrican
nutr
kne
exhib
var
gg
conven
hay
nam


republicans
governments
observers
leaders
lawmakers
conservatives
candidates
analysts
lawyers
critics


republican
conservative
democratic
hispanic
liberal
communist
former
independent
elected
senior


quiet
beautiful
perfect
genuine
unique
won

# Overlap

In [None]:
word_embed = model.word_embed

text = "queen"
text = text.lower()

idx = tf.cast(tokenizer.tokenizer.tokenize(text), tf.int32)
idx = tokenize(idx, tokenizer.merge_list)
print(idx)
embed1 = tf.expand_dims(word_embed[idx[0][0]], axis=0)


text = "woman"
text = text.lower()

idx = tf.cast(tokenizer.tokenizer.tokenize(text), tf.int32)
idx = tokenize(idx, tokenizer.merge_list)
print(idx)
embed2 = tf.expand_dims(word_embed[idx[0][0]], axis=0)

text = "man"
text = text.lower()

idx = tf.cast(tokenizer.tokenizer.tokenize(text), tf.int32)
idx = tokenize(idx, tokenizer.merge_list)
print(idx)
embed3 = tf.expand_dims(word_embed[idx[0][0]], axis=0)

embed = embed1/tf.norm(embed1) - embed2/tf.norm(embed2) + embed3/tf.norm(embed3)

cosine_sim = embed@tf.transpose(word_embed)
idx = tf.argsort(cosine_sim, axis=-1, 
                 direction='DESCENDING',
                 #direction='ASCENDING', 
                 stable=False, name=None)[0]

for i in idx[:100]:
    i = tf.expand_dims(i, axis=0)
    print(tokenizer.detokenize(i).numpy().decode('utf-8'))

tf.Tensor([[4225]], shape=(1, 1), dtype=int32)
tf.Tensor([[964]], shape=(1, 1), dtype=int32)
tf.Tensor([[187]], shape=(1, 1), dtype=int32)
queen
man
richard
mar
nic
prince
des
ca
van
friend
la
yan
pri
mon
fro
hop
ju
wife
lie
dest
epis
te
star
pa
bra
hi
daniel
bi
mic
jane
p
simp
ar
nor
den
adm
hua
198
lu
fa
miami
ham
bur
u
stron
fol
stat
clark
george
m
ra
legendary
ch
ja
phe
sla
stone
cy
hoff
bron
sha
year's
canad
cop
son
d
cape
hu
glas
isa
smu
rev
dy
(
vo
tan
mu
louis
pic
ti
pra
former
ta
da
bureau
aven
tom
l
rw
de
lou
mary
an
mum
kes
li
leon
photograp
dem
christ


## 