## LLM

In [1]:
import matplotlib.pyplot as plt
import random
from IPython.display import clear_output
from src.tokenizer import TokenizerBPE, word_split, normalize_to_ascii

import os
import time
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import tensorflow as tf
import numpy as np
import pickle as pkl
from tqdm.notebook import tqdm
from src.transformer import *
from src.data_handling import read_first_n, sample_batch


# Report how many GPUs TensorFlow can see; CUDA_VISIBLE_DEVICES is set to "0"
# before TensorFlow is imported in this cell.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))



Num GPUs Available:  1


In [2]:
#tf.keras.mixed_precision.set_global_policy('mixed_float16')

In [None]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file.
# Only load tokenizer/corpus pickles that were produced by this project.
# The files are opened with context managers so the handles are closed
# immediately instead of lingering for the lifetime of the kernel.
with open("tokenizers/tokenizer_CNN16000_lowercase.pkl", 'rb') as tokenizer_file:
    tokenizer = pkl.load(tokenizer_file)
tokenizer.create_hash()

# Seed Python's RNG so the corpus shuffle below is reproducible.
random.seed(42)
with open('corpus/CNN_tokenized16000_lowercase.pkl', 'rb') as corpus_file:
    corpus_indicies = pkl.load(corpus_file)
random.shuffle(corpus_indicies)

## Define Model

In [3]:
class WarmUpThenDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule: linear warm-up followed by a wrapped decay schedule.

    For ``step < warmup_steps`` the rate is
    ``initial_learning_rate * step / warmup_steps``. From ``warmup_steps``
    onwards the rate is ``decay_schedule_fn(step - warmup_steps)``, i.e. the
    decay schedule sees a step counter that starts at zero when warm-up ends.
    """

    def __init__(self,
                 initial_learning_rate: float,
                 warmup_steps: int,
                 decay_schedule_fn: tf.keras.optimizers.schedules.LearningRateSchedule):
        """
        initial_learning_rate: peak LR reached at end of warmup
        warmup_steps:      # of steps to ramp from 0 → initial_learning_rate
        decay_schedule_fn: a tf.keras schedule to apply *after* warmup
        """
        super().__init__()
        self.initial_lr = initial_learning_rate
        self.warmup_steps = warmup_steps
        self.decay_schedule_fn = decay_schedule_fn

    def __call__(self, step):
        """Return the learning rate for optimizer step ``step``."""
        # Cast to float32 for safety in graph mode
        step = tf.cast(step, tf.float32)
        warmup_steps = tf.cast(self.warmup_steps, tf.float32)

        # Each branch is computed inside its own callable so the decay schedule
        # is only evaluated once warm-up has finished, and therefore never sees
        # a negative (shifted) step.
        return tf.cond(
            step < warmup_steps,
            # linear warmup: lr = initial_lr * (step / warmup_steps)
            lambda: self.initial_lr * (step / warmup_steps),
            # after warmup_steps, switch to decay schedule (shifted step count)
            lambda: self.decay_schedule_fn(step - warmup_steps),
        )

    def get_config(self):
        """Return a serializable config.

        The base schedule class leaves ``get_config`` unimplemented, so without
        this override an optimizer using this schedule cannot be serialized.
        """
        return {
            "initial_learning_rate": self.initial_lr,
            "warmup_steps": self.warmup_steps,
            "decay_schedule_fn": tf.keras.optimizers.schedules.serialize(
                self.decay_schedule_fn),
        }

    @classmethod
    def from_config(cls, config):
        """Rebuild the schedule from the dict produced by ``get_config``."""
        config = dict(config)  # do not mutate the caller's dict
        config["decay_schedule_fn"] = tf.keras.optimizers.schedules.deserialize(
            config["decay_schedule_fn"])
        return cls(**config)
    


In [4]:
# --- Learning-rate schedule: linear warm-up, then exponential decay ---------
initial_lr = 1e-3
warmup_steps = 10000
decay_steps = 50000
decay_rate = 0.5

decay_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_lr,
    decay_steps,
    decay_rate,
    staircase=False,
)
lr_schedule = WarmUpThenDecay(
    initial_learning_rate=initial_lr,
    warmup_steps=warmup_steps,
    decay_schedule_fn=decay_schedule,
)

# --- Reproducibility: seed NumPy and TensorFlow before the model is built ---
np.random.seed(42)
tf.random.set_seed(42)

# --- Model hyper-parameters --------------------------------------------------
max_seq_len = 200
embed_dim = 1024
tf_blocks = 8
heads = 8
ff_dim = 4 * embed_dim   # 4x embed_dim
unembed_dims = []

# NOTE(review): `wd` is presumably the weight-decay coefficient; confirm
# against the Transformer definition in src.transformer.
model = Transformer(
    vocab_size=tokenizer.vocab_size,
    max_seq_len=max_seq_len,
    embed_dim=embed_dim,
    tf_blocks=tf_blocks,
    heads=heads,
    ff_dim=ff_dim,
    unembed_dims=unembed_dims,
    lr=lr_schedule,
    wd=0.01,
)

# Loss histories; both start empty and are replaced when a checkpoint is loaded.
losses_train, losses_test = [], []


In [5]:
# Run identifier; it names both the checkpoint folder and the losses pickle.
name = "model_16k_tokens_lowercase_8blocks_wt_higherLR"

# Track the optimizer state together with the model.
ckpt = tf.train.Checkpoint(optimizer=model.optimizer, model=model)

# Checkpoints are written under checkpoints/<name>; only the 5 latest are kept.
ckpt_manager = tf.train.CheckpointManager(
    ckpt,
    directory=f"checkpoints/{name}",
    max_to_keep=5,
)

In [347]:
# Restore the most recent checkpoint tracked by the manager, then reload the
# loss histories saved next to it.
ckpt.restore(ckpt_manager.latest_checkpoint)

# NOTE(review): pickle.load can execute arbitrary code; only load files this
# project wrote. The context manager closes the handle right after reading.
with open("checkpoints/losses_" + name + ".pkl", 'rb') as losses_file:
    losses_train, losses_test = pkl.load(losses_file)

In [6]:
# Sum the element counts of every variable in model.parameter_list.
# np.prod of an empty shape is 1, matching the original loop for scalars.
total_params = 0
for variable in model.parameter_list:
    total_params += int(np.prod(variable.get_shape().as_list()))
print(f"Total number of parameters: {total_params}")

Total number of parameters: 117413574


## Text completion

In [312]:
def tokenize(indices, merge_list):
    """Apply an ordered list of pair merges to a 1-D sequence of token ids.

    Args:
        indices: 1-D sequence (list, NumPy array or tensor) of integer token ids.
        merge_list: iterable of ``(pair, new_idx)`` entries, applied in order.
            Every non-overlapping occurrence of ``pair[0], pair[1]`` is
            replaced by the single id ``new_idx``.

    Returns:
        An int32 tensor of shape ``(1, n_tokens)`` holding the merged ids.

    Matches are taken greedily from left to right. This matters for a pair of
    two identical ids: in a run ``a a a`` the pair ``(a, a)`` matches at
    positions 0 and 1, but only the first can be merged, giving ``new a``.
    Merging both would silently drop a token.
    """
    # Copy into an int32 array: the merges below write in place, and an empty
    # input would otherwise default to float64.
    indices = np.array(indices, dtype=np.int32)

    for pair, new_idx in merge_list:
        first, second = pair[0], pair[1]
        matches = np.flatnonzero((indices[:-1] == first) & (indices[1:] == second))
        if matches.size == 0:
            continue

        # Two matches can only overlap when the pair is (a, a); keep the
        # left-most match of every overlapping chain.
        if first == second and matches.size > 1:
            kept = [matches[0]]
            for pos in matches[1:]:
                if pos != kept[-1] + 1:
                    kept.append(pos)
            matches = np.asarray(kept)

        # Write the merged id over the left element, drop the right element.
        indices[matches] = new_idx
        indices = np.delete(indices, matches + 1)

    return tf.expand_dims(tf.convert_to_tensor(indices, dtype=tf.int32), axis=0)

In [345]:
# Prompt for the completion demo, lower-cased before encoding.
text = "terrorist".lower()

# Two-stage encoding: the inner tokenizer produces the initial ids, then the
# merge list is applied by tokenize(), which returns a (1, n_tokens) tensor.
base_ids = tf.cast(tokenizer.tokenizer.tokenize(text), tf.int32)
indices = tokenize(base_ids, tokenizer.merge_list)
print(indices)

tf.Tensor([[2655]], shape=(1, 1), dtype=int32)


In [346]:
T = 0.5  # sampling temperature: logits are divided by T before sampling
tf.random.set_seed(42)

for i in range(200):
    # Keep only the last position of the first batch element; the `-1:` slice
    # keeps a leading axis because tf.random.categorical expects 2-D logits.
    logits = model.call(indices)[0, -1:]
    sampled = tf.random.categorical(logits / T, num_samples=1)
    idx = tf.cast(sampled, tf.int32)

    # Append the sampled token and re-decode the whole sequence.
    indices = tf.concat([indices, idx], axis=1)
    decoded = tokenizer.detokenize(indices).numpy()[0]
    text_pred = decoded.decode('utf-8').replace("\n", " ")

    # '\r' rewinds to the start of the line so each step overwrites the last.
    print(text_pred, end='\r', flush=True)
    #time.sleep(0.05)


terrorist group in the khy region of afghanistan in the khyber district of khyber, khyber, is a major attack for the khyber district of khyber pakkhtunkhwa pakkhwatunkhwa province in khyberkhwakhwa pakkhwatunkhwa, kwaakwa and kkhyber kwawa kwa khwawawa sew khy ktunkhwa kwa kwa kwawa, kakwawa kwa kkwawa kkk-k  kk kkk kkkkkkkkk kkkkkkkkkkkkkkkkkkkkkk

## Embedding Clustering

In [339]:
from sklearn.cluster import KMeans

def cosine_similarity(embed_a, embed_b):
    """
    Compute the pairwise cosine similarity between the rows of two matrices.

    Args:
        embed_a: 2-D array/tensor of shape (n_a, dim).
        embed_b: 2-D array/tensor of shape (n_b, dim).

    Returns:
        A (n_a, n_b) matrix whose entry [i, j] is the cosine similarity
        between row i of embed_a and row j of embed_b. Pairs involving an
        all-zero row get similarity 0 instead of NaN, so a later sort over
        the result is not corrupted.
    """
    dot_product = embed_a @ tf.transpose(embed_b)

    # Row norms: (n_a, 1) for a, and (1, n_b) for b, so their product
    # broadcasts to the (n_a, n_b) shape of the dot products.
    norm_a = tf.linalg.norm(embed_a, axis=1, keepdims=True)
    norm_b = tf.transpose(tf.linalg.norm(embed_b, axis=1, keepdims=True))

    # divide_no_nan returns 0 wherever the denominator is 0.
    return tf.math.divide_no_nan(dot_product, norm_a * norm_b)


def cluster(X, n_clusters):
    """Run KMeans on the L2-normalised rows of X.

    Args:
        X: 2-D matrix with one sample per row.
        n_clusters: number of clusters to fit.

    Returns:
        Tuple ``(inertia, labels, cluster_centers)`` taken from the fitted
        KMeans estimator.
    """
    # Scale every row to unit length before clustering.
    row_norms = np.linalg.norm(X, axis=1, keepdims=True)
    unit_rows = X / row_norms

    km = KMeans(n_clusters=n_clusters, random_state=0)
    km = km.fit(unit_rows)
    return km.inertia_, km.labels_, km.cluster_centers_


class EmbeddingClustering:
    """Cluster token embeddings and list the tokens nearest each cluster centre."""

    def __init__(self, tokenizer, n_clusters=10):
        """Store the tokenizer used for decoding and the number of clusters."""
        self.tokenizer = tokenizer
        self.n_clusters = n_clusters

    def fit(self, word_embed):
        """Cluster ``word_embed`` and rank all tokens against every centre.

        Sets ``word_embed``, ``inertia``, ``labels``, ``clusters`` (centres as
        a float32 tensor) and ``idx_list`` (per centre, token ids sorted by
        decreasing cosine similarity to that centre).
        """
        fit_result = cluster(word_embed, self.n_clusters)
        self.word_embed = word_embed
        self.inertia, self.labels = fit_result[0], fit_result[1]
        self.clusters = tf.convert_to_tensor(fit_result[2], dtype=tf.float32)

        # One row per centre; argsort in descending order puts the most
        # similar token ids first.
        similarity = cosine_similarity(self.clusters, word_embed)
        self.idx_list = tf.argsort(similarity, axis=-1, direction='DESCENDING')

    def print_clusters(self, n_words=10):
        """Print the ``n_words`` top-ranked tokens of every cluster."""
        for ranked_ids in self.idx_list:
            for token_id in ranked_ids[:n_words]:
                # detokenize is given a length-1 int32 tensor.
                token = tf.expand_dims(tf.cast(token_id, tf.int32), axis=0)
                print(self.tokenizer.detokenize(token).numpy().decode('utf-8'))
            print("\n")

In [None]:
# Cluster the model's token-embedding matrix into 100 groups and print the
# 10 top-ranked tokens of each group.
word_embed = model.word_embed
embedding_clustering = EmbeddingClustering(tokenizer, 100)
embedding_clustering.fit(word_embed)
embedding_clustering.print_clusters(10)

colors
dresses
lips
makeup
dress
jeans
pieces
beauty
buttons
hair


damasc
0s
hurrican
mingham
nairo
sterdam
propof
barcelon
traged
catastro


hurrican
mingham
raik
prede
environ
recor
agric
theless
uguay
eleph


acare
assaul
inev
attem
adjac
ailand
wawrink
emerg
controver
strengthe


citiz
}
invol
dipl
itored
moil
possib
acare
eleph
environ


insurgent
haqqani
insurgency
airstrike
infantry
jihadist
environ
{
streng
abulary


a
the
some
an
more
many
this
one
their
his


hezbol
recor
mingham
environ
raik
hurrican
uguay
eleph
adjac
catastro


citiz
acp
firs
onathan
idency
employe
hift
streng
abulary
mogad


0s
injiang
wawrink
raik
alep
weren
endum
onathan
inev
portugu


hezbol
injiang
sterdam
mogad
wawrink
indust
propof
guate
citiz
assaul


wondering
wondered
think
believe
know
realize
understand
convinced
skeptical
worry


citiz
mogad
signific
injiang
abulary
luscon
immen
portra
hurrican
adjac


chry
discre
condem
nostal
teles
incar
refr
leng
subur
prelim


pleaded
appealed
refused
inst

## Overlap

In [None]:
word_embed = model.word_embed

# Query word, lower-cased before encoding.
text = "republican".lower()

idx = tokenize(tf.cast(tokenizer.tokenizer.tokenize(text), tf.int32),
               tokenizer.merge_list)
print(idx)

# Embedding of the first token of the query, as a (1, dim) row.
embed = tf.expand_dims(word_embed[idx[0][0]], axis=0)

# NOTE: despite the variable name this is the raw dot product ("overlap")
# with every embedding row; no norm division is applied.
cosine_sim = embed @ tf.transpose(word_embed)

# ASCENDING lists the lowest-overlap tokens first; use direction='DESCENDING'
# to list the highest-overlap tokens instead.
idx = tf.argsort(cosine_sim, axis=-1, direction='ASCENDING')[0]

for i in idx[:100]:
    token = tf.expand_dims(i, axis=0)
    print(tokenizer.detokenize(token).numpy().decode('utf-8'))

tf.Tensor([[1583]], shape=(1, 1), dtype=int32)
amput
extin
penetr
dri
accounted
analy
hydr
abdu
coron
bag
dehydr
cru
pray
bodies
extr
injuries
bottles
explo
elev
distur
pret
deton
excav
sco
-
cl
un
bags
cann
shocks
sne
ww
evacu
nab
there
]
fever
susp
scri
orchestr
it
dis
we'd
sc
tons
bath
incorpor
circul
hang
belonged
ext
comb
situ
location
 
sed
satur
searchers
im
supplies
instructions
blankets
uses
respir
desper
ref
am
erup
@
accum
ju
us
photos
dim
sh
exposure
cr
tam
suc
links
"
escap
intend
foc
have
wels
accommod
ats
earthquakes
cups
rocks
orders
returned
sp
evalu
dha
l
origin
als
discovered


## 