In [2]:
from pathlib import Path
import tensorflow as tf
import sys

# appending llm_components path to sys.path to easily import
sys.path.append('/kaggle/input/datasets/harshit1234g/axiomlm-utils')
import llm_components as lc

In [3]:
# Sanity check: list the GPUs TensorFlow can see before building anything on them.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]

## Paths

In [4]:
# Root of the read-only Kaggle inputs and the two datasets used from it.
directory = Path('/kaggle/input/datasets')
data_dir = directory.joinpath('vadimkurochkin', 'wikitext-103', 'wikitext-103')
utils_dir = directory.joinpath('harshit1234g', 'axiomlm-utils')

# WikiText-103 split files.
train_path = data_dir.joinpath('wiki.train.tokens')
valid_path = data_dir.joinpath('wiki.valid.tokens')
test_path = data_dir.joinpath('wiki.test.tokens')

# Tokenizer model file shipped alongside the utility code.
tokenizer_path = utils_dir.joinpath('sp_tokenizer.model')

# Two checkpoint directories are needed because Kaggle only lets the notebook
# read from the input tree and only lets it write to the working directory:
# earlier checkpoints are restored from the former, new ones are saved to the latter.
checkpoint_restore_dir = utils_dir.joinpath('checkpoints')
checkpoint_save_dir = Path('/kaggle/working/checkpoints')

## Hyperparameters

In [5]:
# Data pipeline
SEQUENCE_LEN = 256      # context size (tokens per training sequence)
SHIFT = SEQUENCE_LEN    # shift equals the sequence length because the dataset is quite large
BATCH_SIZE = 64         # a batch size of 128 was tried first but ran out of memory

# Model size
N_EMBEDS, N_HEADS, N_BLOCKS = 512, 8, 8

# Training loop: steps per fit() "epoch" and validation batches per evaluation
STEPS_PER_EPOCH, VAL_STEPS = 4000, 200

In [6]:
# Token budget for the whole run, and the step counts derived from it.
target_tokens = 600_000_000
token_per_step = SEQUENCE_LEN * BATCH_SIZE            # tokens consumed by one optimizer step
token_per_chunk = token_per_step * STEPS_PER_EPOCH    # tokens consumed by one fit() "epoch"

# Whole steps needed to cover the budget, rounded to the nearest thousand;
# this is the horizon handed to the cosine schedule.
total_steps = round(target_tokens // token_per_step, -3)
warmup_steps = int(total_steps * 0.05)  # 5%

# Single report of every derived quantity (same text as printing line by line).
print('\n'.join(
    f'{label}: {value:3,}'
    for label, value in (
        ('Total target tokens', target_tokens),
        ('Steps per epoch', STEPS_PER_EPOCH),
        ('Token per step', token_per_step),
        ('Token per chunk/epoch', token_per_chunk),
        ('Total steps for cosine decay', total_steps),
        ('Warmup steps for cosine decay', warmup_steps),
    )
))

Total target tokens: 600,000,000
Steps per epoch: 4,000
Token per step: 16,384
Token per chunk/epoch: 65,536,000
Total steps for cosine decay: 37,000
Warmup steps for cosine decay: 1,850


## Loading Data

In [7]:
# Load the tokenizer from its model file (the path is passed as a plain string).
sp = lc.load_sp_tokenizer(str(tokenizer_path))

# Dataset factory shared by all three splits, configured from the
# hyperparameter constants.
loader = lc.LMDatasetLoader(
    tokenizer=sp,
    seq_len=SEQUENCE_LEN,
    shift=SHIFT,
    batch_size=BATCH_SIZE,
    shuffle_buffer=16_000,
)

In [8]:
# One dataset per split; only the training split is created with training=True.
train_ds = loader.create(train_path, training=True)
valid_ds, test_ds = (
    loader.create(split_path, training=False)
    for split_path in (valid_path, test_path)
)

I0000 00:00:1771171490.737045      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13757 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1771171490.742945      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13757 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [9]:
# Peek at a single training batch to confirm the pipeline output.
# In the recorded run this prints a pair of int32 tensors of shape (64, 256),
# where the second tensor is the first one shifted left by one token
# (i.e. inputs and next-token targets).
for item in train_ds.take(1):
    print(item)

(<tf.Tensor: shape=(64, 256), dtype=int32, numpy=
array([[ 1490,   493,  5492, ...,  2029,    11,    84],
       [ 2781,  2285,    31, ...,  6534,  1135,    11],
       [  105,    67,  3257, ..., 15919,  2468,    92],
       ...,
       [ 4549,    11,  7599, ...,  2194,   589,  6583],
       [ 7125, 15920,    53, ..., 11798,   118,    11],
       [  269,  3481,    36, ...,  3683,  2594,   433]], dtype=int32)>, <tf.Tensor: shape=(64, 256), dtype=int32, numpy=
array([[  493,  5492,  1889, ...,    11,    84,     6],
       [ 2285,    31,     8, ...,  1135,    11,  7693],
       [   67,  3257,    84, ...,  2468,    92,   329],
       ...,
       [   11,  7599,    33, ...,   589,  6583,     6],
       [15920,    53,   119, ...,   118,    11,  1749],
       [ 3481,    36,  2159, ...,  2594,   433,     8]], dtype=int32)>)


## Callbacks, Strategy & Vocab size

In [9]:
# TensorBoard logging into ./logs: batch-level scalars every 100 batches,
# weight histograms and embedding snapshots at a frequency of 1
# (measured in epochs per the Keras callback documentation).
tensorboard_cb = tf.keras.callbacks.TensorBoard(
    log_dir='logs',
    update_freq=100,
    histogram_freq=1,
    embeddings_freq=1,
)

In [10]:
# Distribution strategy used for model creation below; in the recorded run it
# picked up both GPUs (GPU:0 and GPU:1).
strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


In [11]:
# Vocabulary size is read from the tokenizer so the model always matches it
# (16000 in the recorded run); the bare name displays the value.
vocab_size = sp.get_piece_size()
vocab_size

16000

## Transformer Model

In [12]:
# Build the model, optimizer, and checkpoint machinery inside the distribution
# strategy's scope, then resume from the latest checkpoint in the (read-only)
# restore directory if one exists. The statement order matters: the model and
# optimizer variables are created BEFORE restore() so the restore can be verified.
with strategy.scope():
    # creating model
    model = lc.GPT(
        vocab_size= vocab_size,
        seq_len= SEQUENCE_LEN,
        n_embeds= N_EMBEDS,
        n_heads= N_HEADS,
        n_blocks= N_BLOCKS
    )

    # lr schedule: warmup followed by cosine decay over `total_steps`
    # (exact shape is defined by lc.WarmupCosine, which is not visible here)
    lr_schedule = lc.WarmupCosine(
        base_lr= 3e-4,
        warmup_steps= warmup_steps,
        total_steps= total_steps
    )

    # optimizer: AdamW driven by the schedule above, with gradient-norm clipping
    optimizer = tf.keras.optimizers.AdamW(
        learning_rate= lr_schedule,
        weight_decay= 0.1,
        beta_2= 0.95,
        clipnorm= 1.0
    )
    
    # The model is expected to output raw logits (from_logits=True).
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits= True     # softmax is handled by loss function
    )

    # compiling
    model.compile(
        optimizer= optimizer,
        loss= loss_fn,
        metrics= [lc.Perplexity()]
    )

    # passing dummy input to build model, so its variables exist before any
    # checkpoint restore is attempted
    dummy = tf.zeros((1, SEQUENCE_LEN), dtype= tf.int32)
    _ = model(dummy, training= False)

    # checkpoint logic: track both the model and the optimizer so training
    # can resume with the step counter and optimizer state intact
    checkpoint = tf.train.Checkpoint(
        model= model,
        optimizer= optimizer
    )

    # Falsy when the restore directory holds no checkpoint.
    latest_ch = tf.train.latest_checkpoint(checkpoint_restore_dir)
    if latest_ch:
        print('Restoring State from', latest_ch)
        
        # Create the optimizer's variables up front so they exist when
        # restore() runs; assert_existing_objects_matched() then fails loudly
        # if any already-built object has no counterpart in the checkpoint.
        optimizer.build(model.trainable_variables)
        checkpoint.restore(latest_ch).assert_existing_objects_matched()
        
        # Sanity output: restored step counter, the learning rate the schedule
        # yields at that step, and how many optimizer variables exist.
        print('Step:', optimizer.iterations.numpy())
        print('LR:', lr_schedule(optimizer.iterations).numpy())
        print('Optimizer Variables:', len(optimizer.variables))    # must be larger than 100

    else:
        print('No Checkpoint found, random initialization.')

    # New checkpoints are written to the writable working directory (not the
    # restore directory); max_to_keep=3 limits how many are retained.
    manager = tf.train.CheckpointManager(
        checkpoint,
        checkpoint_save_dir,
        max_to_keep= 3
    )

Restoring State from /kaggle/input/datasets/harshit1234g/axiomlm-utils/checkpoints/ckpt-1
Step: 4000
LR: 0.0002975152
Optimizer Variables: 185


In [13]:
# Print the layer / parameter overview of the built model.
model.summary()

In [14]:
# Train one chunk: a single fit() "epoch" of STEPS_PER_EPOCH optimizer steps,
# followed by validation on VAL_STEPS batches, with TensorBoard logging.
history = model.fit(
    train_ds,
    epochs=1,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_data=valid_ds,
    validation_steps=VAL_STEPS,
    callbacks=[tensorboard_cb],
)

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Redu



[1m4000/4000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4035s[0m 994ms/step - loss: 4.0012 - perplexity: 56.8622 - val_loss: 3.4518 - val_perplexity: 41.9133


In [15]:
# Evaluate on the held-out test split; evaluate() returns the loss followed by
# the compiled metric (perplexity), which are unpacked and printed.
# NOTE(review): in the recorded output the progress-bar values differ from the
# returned values; the returned values are the ones printed below.
# NOTE(review): exp(3.4565) is about 31.7, yet the recorded perplexity is about
# 41.6, so lc.Perplexity does not appear to be exp(mean loss) -- confirm how it
# aggregates across batches before quoting this number.
test_loss, test_perplexity = model.evaluate(test_ds)
print(f'{test_loss = }\n{test_perplexity = }')

[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 348ms/step - loss: 3.5989 - perplexity: 39.7988
test_loss = 3.456530809402466
test_perplexity = 41.597496032714844


In [16]:
# Write a new checkpoint (model + optimizer) through the CheckpointManager into
# checkpoint_save_dir; the displayed value is the path of the saved checkpoint.
manager.save()

'/kaggle/working/checkpoints/ckpt-2'

In [None]:
# Export the full model to a single .keras file in the current working directory.
# NOTE(review): this cell has no execution count in the saved notebook, so it
# was not run in the recorded session.
model.save('axiomlm.keras')

In [17]:
import subprocess

# Zip the Kaggle working directory so everything can be downloaded as one file.
# stdout is discarded to avoid flooding the cell with one line per archived file.
# check=True makes a failing `zip` raise CalledProcessError, so a Run All stops
# here instead of continuing with a missing or partial archive.
subprocess.run(
    ['zip', '-r', 'working_dir.zip', '/kaggle/working'],
    stdout=subprocess.DEVNULL,
    check=True,
)

CompletedProcess(args=['zip', '-r', 'working_dir.zip', '/kaggle/working'], returncode=0)