In [2]:
from pathlib import Path
import tensorflow as tf
import sys

# The project helpers (llm_components) are imported from the Kaggle input
# dataset directory below, so that directory is added to sys.path first.
# The append must stay ahead of the `import llm_components` line.
sys.path.append('/kaggle/input/datasets/harshit1234g/axiomlm-utils')
import llm_components as lc

In [3]:
# Show which GPUs TensorFlow can see before any strategy is created.
tf.config.list_physical_devices(device_type='GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]

## Paths

In [4]:
# Root of the Kaggle input datasets, and the two datasets used here.
directory = Path('/kaggle/input/datasets')
data_dir = directory.joinpath('vadimkurochkin', 'wikitext-103', 'wikitext-103')
utils_dir = directory.joinpath('harshit1234g', 'axiomlm-utils')

# WikiText-103 token files, one per split.
train_path = data_dir.joinpath('wiki.train.tokens')
valid_path = data_dir.joinpath('wiki.valid.tokens')
test_path = data_dir.joinpath('wiki.test.tokens')

# Tokenizer model file shipped alongside the helper module.
tokenizer_path = utils_dir.joinpath('sp_tokenizer.model')

# Two checkpoint directories are needed: Kaggle reads from the input
# directory but can only write to the working directory.
checkpoint_restore_dir = utils_dir.joinpath('checkpoints')
checkpoint_save_dir = Path('/kaggle/working/checkpoints')

## Hyperparameters

In [5]:
# --- Data windowing ---
SEQUENCE_LEN = 256          # context size, in tokens
SHIFT = SEQUENCE_LEN        # passed to the dataset loader as `shift`
BATCH_SIZE = 128

# --- Model size ---
N_EMBEDS = 512
N_HEADS = 8
N_BLOCKS = 8

# --- Length of one training session ---
STEPS_PER_EPOCH = 2000
VAL_STEPS = 200

In [6]:
# Token budget for the whole run, and how much of it one step / one
# STEPS_PER_EPOCH-sized chunk consumes.
target_tokens = 600_000_000
token_per_step = SEQUENCE_LEN * BATCH_SIZE
token_per_chunk = token_per_step * STEPS_PER_EPOCH

# Length of the LR schedule: steps needed to reach the budget, rounded to the
# nearest thousand, with the first 5% of those steps used for warmup.
total_steps = round(target_tokens // token_per_step, -3)
warmup_steps = int(0.05 * total_steps)  # 5%

print(f'Total target tokens: {target_tokens:3,}')
print(f'Steps per epoch: {STEPS_PER_EPOCH:3,}')
print(f'Token per step: {token_per_step:3,}')
print(f'Token per chunk/epoch: {token_per_chunk:3,}')
print(f'Total steps for cosine decay: {total_steps:3,}')
print(f'Warmup steps for cosine decay: {warmup_steps:3,}')

Total target tokens: 600,000,000
Steps per epoch: 2,000
Token per step: 32,768
Token per chunk/epoch: 65,536,000
Total steps for cosine decay: 18,000
Warmup steps for cosine decay: 900


## Loading Data

In [7]:
# Tokenizer loaded from the model file in the utils dataset.
sp = lc.load_sp_tokenizer(str(tokenizer_path))

# One loader configuration, reused for the train/valid/test splits.
# NOTE(review): SHIFT == SEQUENCE_LEN; presumably the stride between
# consecutive windows -- confirm in llm_components.
loader = lc.LMDatasetLoader(
    tokenizer=sp,
    seq_len=SEQUENCE_LEN,
    shift=SHIFT,
    batch_size=BATCH_SIZE,
    shuffle_buffer=50_000,
)

In [8]:
# Training split is created with training=True; both evaluation splits are
# created with training=False (validation first, then test).
train_ds = loader.create(train_path, training=True)
valid_ds, test_ds = (
    loader.create(eval_path, training=False)
    for eval_path in (valid_path, test_path)
)

I0000 00:00:1771067152.663018      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13757 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1771067152.669307      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13757 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [9]:
# Sanity check: print a single element of the training dataset.
first_batch_only = train_ds.take(1)
for item in first_batch_only:
    print(item)

(<tf.Tensor: shape=(128, 256), dtype=int32, numpy=
array([[10972, 13867,    36, ...,   215,    50,  1312],
       [ 4319,    37,   943, ...,   262,    11,    36],
       [  165,  2736,    37, ...,  3187,   105,  1524],
       ...,
       [  271,  6726,   113, ...,    11, 15914,     0],
       [   53,   592,   214, ...,  1344,    83,  8341],
       [  179,   314, 12199, ...,  2149,     0,    59]], dtype=int32)>, <tf.Tensor: shape=(128, 256), dtype=int32, numpy=
array([[13867,    36, 13237, ...,    50,  1312,   148],
       [   37,   943,  6430, ...,    11,    36,     8],
       [ 2736,    37,     8, ...,   105,  1524,  5480],
       ...,
       [ 6726,   113,    91, ..., 15914,     0,    31],
       [  592,   214,   582, ...,    83,  8341,   281],
       [  314, 12199,   612, ...,     0,    59,   528]], dtype=int32)>)


## Callbacks, Strategy & Vocab Size

In [9]:
# TensorBoard logging into the relative 'logs' directory.
tensorboard_cb = tf.keras.callbacks.TensorBoard(
    log_dir='logs',
    update_freq=100,      # every 100 batches
    histogram_freq=1,
    embeddings_freq=1,
)

In [10]:
# Distribution strategy used for training; the model, optimizer and checkpoint
# objects are later created inside `strategy.scope()`.
strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


In [11]:
# Vocabulary size as reported by the tokenizer; passed to lc.GPT as vocab_size.
vocab_size = sp.get_piece_size()
vocab_size

16000

## Transformer Model

In [12]:
# Model, optimizer, loss/metric and checkpoint objects are all created inside
# the distribution strategy's scope. The statement order below matters: the
# model is called once, then the optimizer is built, and only then is the
# checkpoint restored.
with strategy.scope():
    # Model: sizes come from the hyperparameter constants defined earlier;
    # vocab_size is the tokenizer's piece count.
    model = lc.GPT(
        vocab_size= vocab_size,
        seq_len= SEQUENCE_LEN,
        n_embeds= N_EMBEDS,
        n_heads= N_HEADS,
        n_blocks= N_BLOCKS
    )

    # LR schedule: warmup_steps / total_steps are the values computed from the
    # token budget earlier in the notebook.
    lr_schedule = lc.WarmupCosine(
        base_lr= 3e-4,
        warmup_steps= warmup_steps,
        total_steps= total_steps
    )

    # Optimizer: AdamW driven by the schedule above, with gradient-norm
    # clipping at 1.0.
    optimizer = tf.keras.optimizers.AdamW(
        learning_rate= lr_schedule,
        weight_decay= 0.1,
        beta_2= 0.95,
        clipnorm= 1.0
    )
    
    # Loss operates on raw logits (from_logits=True).
    # NOTE(review): this assumes lc.GPT outputs logits rather than
    # probabilities -- confirm in llm_components.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits= True     # softmax is handled by loss function
    )

    # Compile with the loss above and the project's Perplexity metric.
    model.compile(
        optimizer= optimizer,
        loss= loss_fn,
        metrics= [lc.Perplexity()]
    )

    # Pass a dummy (1, SEQUENCE_LEN) int32 batch of zeros through the model
    # once so that it is built before checkpointing.
    dummy = tf.zeros((1, SEQUENCE_LEN), dtype= tf.int32)
    _ = model(dummy, training= False)

    # Checkpoint tracks both the model and the optimizer.
    checkpoint = tf.train.Checkpoint(
        model= model,
        optimizer= optimizer
    )

    # Look for the newest checkpoint in the restore directory (under the
    # Kaggle input dataset); resume from it if one exists.
    latest_ch = tf.train.latest_checkpoint(checkpoint_restore_dir)
    if latest_ch:
        print('Restoring State from', latest_ch)
        
        # Build the optimizer against the model's trainable variables before
        # restoring.
        # NOTE(review): presumably this creates the optimizer variables up
        # front so the restore can load into them and
        # assert_existing_objects_matched() can verify them -- confirm against
        # the tf.train.Checkpoint documentation.
        optimizer.build(model.trainable_variables)
        checkpoint.restore(latest_ch).assert_existing_objects_matched()
        
        # Report the restored state: step counter, the LR the schedule yields
        # at that step, and the number of optimizer variables.
        print('Step:', optimizer.iterations.numpy())
        print('LR:', lr_schedule(optimizer.iterations).numpy())
        print('Optimizer Variables:', len(optimizer.variables))

    else:
        print('No Checkpoint found, random initialization.')

    # New checkpoints are written to checkpoint_save_dir (under
    # /kaggle/working), which is a different directory from the restore
    # source; max_to_keep=3 is passed to the manager.
    manager = tf.train.CheckpointManager(
        checkpoint,
        checkpoint_save_dir,
        max_to_keep= 3
    )

Restoring State from /kaggle/input/datasets/harshit1234g/axiomlm-utils/checkpoints/ckpt-1
Step: 2000
LR: 0.0002972526
Optimizer Variables: 185


In [13]:
# Show the summary of the model built in the previous cell.
model.summary()

In [14]:
# Train for one chunk of STEPS_PER_EPOCH steps, validating on VAL_STEPS
# batches of the validation split and logging through the TensorBoard callback.
history = model.fit(
    train_ds,
    epochs=1,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_data=valid_ds,
    validation_steps=VAL_STEPS,
    callbacks=[tensorboard_cb],
)

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Redu



[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3477s[0m 2s/step - loss: 4.1890 - perplexity: 68.7875 - val_loss: 3.2959 - val_perplexity: 45.4204


In [15]:
# Evaluate on the test split; the result unpacks into the loss followed by
# the single compiled metric (perplexity).
test_results = model.evaluate(test_ds)
test_loss, test_perplexity = test_results
print(f'{test_loss = }\n{test_perplexity = }')

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 610ms/step - loss: 3.6598 - perplexity: 44.0334
test_loss = 3.32709002494812
test_perplexity = 45.39208984375


In [16]:
# Write a new checkpoint (model + optimizer, as tracked by `checkpoint`) via
# the CheckpointManager, i.e. into checkpoint_save_dir.
manager.save()

'/kaggle/working/checkpoints/ckpt-2'

In [None]:
# Export the model to 'axiomlm.keras' (relative to the current working directory).
# NOTE(review): this cell has no execution count in the saved notebook, so the
# export was not run in this session -- confirm that lc.GPT saves cleanly in
# the .keras format before relying on it.
model.save('axiomlm.keras')

In [17]:
import subprocess

# Bundle everything under /kaggle/working (which includes the saved
# checkpoints) into a single archive, written relative to the current
# working directory.
# stdout is discarded to keep the per-file listing out of the notebook, so
# check=True is used to make a failed `zip` raise CalledProcessError instead
# of passing silently with only a non-zero returncode.
subprocess.run(
    ['zip', '-r', 'working_dir.zip', '/kaggle/working'],
    stdout=subprocess.DEVNULL,
    check=True,
)

CompletedProcess(args=['zip', '-r', 'working_dir.zip', '/kaggle/working'], returncode=0)