In [2]:
from pathlib import Path
import tensorflow as tf
import sys

sys.path.append('/kaggle/input/axiom-utils')
import llm_components as lc

In [3]:
# Show which GPUs TensorFlow can see; the list is rendered as the cell output.
tf.config.list_physical_devices(device_type='GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]

## Paths

In [4]:
# Kaggle input root: data is read from here, outputs go to the working directory.
directory = Path('/kaggle/input')

# WikiText-103 token files, one per split.
data_dir = directory.joinpath('wikitext-103', 'wikitext-103')
train_path, valid_path, test_path = (
    data_dir / filename
    for filename in ('wiki.train.tokens', 'wiki.valid.tokens', 'wiki.test.tokens')
)

# Tokenizer model file stored in the axiom-utils input dataset.
tokenizer_path = directory.joinpath('axiom-utils', 'sp_tokenizer.model')

# Two checkpoint directories are needed: Kaggle reads from the input
# directory but can only save into the working directory.
checkpoint_restore_dir = directory.joinpath('axiom-utils', 'checkpoints')
checkpoint_save_dir = Path('/kaggle/working/checkpoints')

## Hyperparameters

In [5]:
# --- Model architecture (passed to lc.GPT) ---
N_EMBEDS = 512        # embedding width
N_HEADS = 8           # attention heads
N_BLOCKS = 8          # transformer blocks

# --- Data pipeline (passed to lc.LMDatasetLoader) ---
SEQUENCE_LEN = 256    # Context size
SHIFT = 16            # forwarded as `shift`; presumably the window stride - confirm in llm_components
BATCH_SIZE = 64       # sequences per batch

# --- Training loop ---
STEPS_PER_EPOCH = 3000    # training steps that make up one epoch
VAL_STEPS = 200           # validation batches evaluated per epoch

## Loading Data

In [6]:
# Load the tokenizer; the helper is handed the path as a plain string.
sp = lc.load_sp_tokenizer(str(tokenizer_path))

# Dataset factory shared by the train / validation / test splits.
loader = lc.LMDatasetLoader(
    tokenizer=sp,
    seq_len=SEQUENCE_LEN,
    shift=SHIFT,
    batch_size=BATCH_SIZE,
    shuffle_buffer=50_000,
)

In [7]:
# Training split is created with training=True.
train_ds = loader.create(train_path, training=True)

# Validation and test splits are both created with training=False,
# in that order.
valid_ds, test_ds = (
    loader.create(split_path, training=False)
    for split_path in (valid_path, test_path)
)

I0000 00:00:1770733025.909551      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13757 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1770733025.915723      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13757 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [8]:
# Sanity check: print one training batch. In the captured output it is a pair
# of (BATCH_SIZE, SEQUENCE_LEN) int32 tensors, the second being the first
# shifted left by one token.
one_batch = train_ds.take(1)
for item in one_batch:
    print(item)

(<tf.Tensor: shape=(64, 256), dtype=int32, numpy=
array([[11868,  1567,   105, ...,   509,    53,  9695],
       [  346,     6,   255, ...,  1551,  6801,    19],
       [   31,     8,   407, ...,  4343,   981, 10083],
       ...,
       [  582,   298,   340, ...,   451,    92,   679],
       [  108,   394,  2284, ...,  3804,  1404,    53],
       [  199,  9002,  1874, ...,    11,     6,    53]], dtype=int32)>, <tf.Tensor: shape=(64, 256), dtype=int32, numpy=
array([[ 1567,   105,   540, ...,    53,  9695,  2704],
       [    6,   255,    83, ...,  6801,    19,   707],
       [    8,   407,   117, ...,   981, 10083,   133],
       ...,
       [  298,   340,   580, ...,    92,   679,    11],
       [  394,  2284,    73, ...,  1404,    53, 15023],
       [ 9002,  1874,  4435, ...,     6,    53,  2798]], dtype=int32)>)


## Distribution strategy, callbacks, and training-schedule calculations

In [8]:
# Vocabulary size reported by the tokenizer; later passed to the model as `vocab_size`.
vocab_size = sp.get_piece_size()
vocab_size

16000

In [9]:
# Distribution strategy; the model, optimizer and checkpoint objects are
# created inside `strategy.scope()` further down.
strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


In [11]:
# TensorBoard logging, written to the relative `logs` directory.
tensorboard_cb = tf.keras.callbacks.TensorBoard(
    log_dir='logs',
    update_freq=100,      # log every 100 batches
    histogram_freq=1,     # histogram logging enabled at frequency 1
    embeddings_freq=1,    # embedding logging enabled at frequency 1
)

In [12]:
# Derive the learning-rate schedule horizon from a total token budget.
target_tokens = 500_000_000

# Take the value from the hyperparameter constant instead of repeating the
# literal 3000, so the schedule maths and the constant cannot drift apart.
steps_per_epoch = STEPS_PER_EPOCH

token_per_step = BATCH_SIZE * SEQUENCE_LEN            # tokens in one batch
token_per_chunk = steps_per_epoch * token_per_step    # tokens in one epoch ("chunk")
total_steps = target_tokens // token_per_step         # steps needed to reach the budget
warmup_steps = int(total_steps * 0.05)  # 5%

print(f'Total target tokens: {target_tokens:3,}')
print(f'Steps per epoch: {steps_per_epoch:3,}')
print(f'Token per step: {token_per_step:3,}')
print(f'Token per chunk/epoch: {token_per_chunk:3,}')
print(f'Total steps for cosine decay: {total_steps:3,}')
print(f'Warmup steps for cosine decay: {warmup_steps:3,}')

Total target tokens: 500,000,000
Steps per epoch: 3,000
Token per step: 16,384
Token per chunk/epoch: 49,152,000
Total steps for cosine decay: 30,517
Warmup steps for cosine decay: 1,525


## Transformer Model

In [13]:
# Everything that owns variables (model, optimizer, checkpoint) is created
# inside the distribution strategy scope. Statement order matters here: the
# model is built before any checkpoint is restored.
with strategy.scope():
    # Model; sizes come from the hyperparameter cell and the tokenizer.
    model = lc.GPT(
        vocab_size=vocab_size,
        seq_len=SEQUENCE_LEN,
        n_embeds=N_EMBEDS,
        n_heads=N_HEADS,
        n_blocks=N_BLOCKS,
    )

    # Learning-rate schedule driven by the step counts computed earlier.
    lr_schedule = lc.WarmupCosine(
        base_lr=3e-4,
        warmup_steps=warmup_steps,
        total_steps=total_steps,
    )

    # Optimizer using the schedule above, with gradient-norm clipping.
    optimizer = tf.keras.optimizers.AdamW(
        learning_rate=lr_schedule,
        weight_decay=0.01,
        beta_2=0.95,
        clipnorm=1.0,
    )

    # The model outputs raw logits; softmax is handled by the loss function.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    # Compile with the loss and the project's perplexity metric.
    model.compile(
        optimizer=optimizer,
        loss=loss_fn,
        metrics=[lc.Perplexity()],
    )

    # Push a dummy input through the model so that it is built.
    dummy = tf.zeros((1, SEQUENCE_LEN), dtype=tf.int32)
    _ = model(dummy, training=False)

    # Checkpoint object tracking both the model and the optimizer.
    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)

    # Resume from the newest checkpoint in the restore directory, if any.
    latest_ch = tf.train.latest_checkpoint(checkpoint_restore_dir)
    if not latest_ch:
        print('No Checkpoint found, random initialization.')
    else:
        print('Restoring State from', latest_ch)
        checkpoint.restore(latest_ch)
        print(
            'Step:', optimizer.iterations.numpy(),
            'LR:', lr_schedule(optimizer.iterations).numpy()
        )

    # New checkpoints go to the save directory; only the newest one is kept.
    manager = tf.train.CheckpointManager(
        checkpoint,
        checkpoint_save_dir,
        max_to_keep=1,
    )

No Checkpoint found, random initialization.


In [14]:
# Print the summary of the built model.
model.summary()

In [15]:
# Train for a single epoch of `steps_per_epoch` steps, validating on
# VAL_STEPS batches and logging through the TensorBoard callback.
history = model.fit(
    train_ds,
    epochs=1,
    steps_per_epoch=steps_per_epoch,
    validation_data=valid_ds,
    validation_steps=VAL_STEPS,
    callbacks=[tensorboard_cb],
)

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Redu

In [16]:
# Evaluate on the test split; the result unpacks into the loss and the
# compiled perplexity metric.
test_loss, test_perplexity = model.evaluate(x=test_ds)
print(f'{test_loss = }\n{test_perplexity = }')

[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 328ms/step - loss: 4.8143 - perplexity: 135.3956
test_loss = 4.88761043548584
test_perplexity = 147.60020446777344




In [17]:
# Write a checkpoint through the manager; the returned save path is shown as the cell output.
manager.save()

'/kaggle/working/checkpoints/ckpt-1'

In [18]:
import subprocess

# Archive the whole working directory (checkpoints, logs) into a single zip.
# zip's listing is discarded via stdout=DEVNULL.
# check=True makes a non-zero exit status raise CalledProcessError, so a
# failed or incomplete archive is not silently mistaken for a good one.
subprocess.run(
    ['zip', '-r', 'working_dir.zip', '/kaggle/working'],
    stdout=subprocess.DEVNULL,
    check=True,
)

CompletedProcess(args=['zip', '-r', 'working_dir.zip', '/kaggle/working'], returncode=0)