In [2]:
from pathlib import Path
import tensorflow as tf
import sys

sys.path.append('/kaggle/input/axiom-utils')
import llm_components as lc

In [3]:
# Show the GPUs TensorFlow can see; the distribution strategy created later
# in this notebook mirrors the model across them.
tf.config.list_physical_devices(device_type='GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]

## Paths

In [4]:
# Root under which Kaggle mounts the attached datasets.
directory = Path('/kaggle/input')

# WikiText-103 token files, one per split.
data_dir = directory.joinpath('wikitext-103', 'wikitext-103')
train_path, valid_path, test_path = (
    data_dir / f'wiki.{split}.tokens' for split in ('train', 'valid', 'test')
)

# The explicit tokenizer path is only needed on Kaggle; when running locally,
# lc.load_sp_tokenizer() can be called without a path.
tokenizer_path = directory.joinpath('axiom-utils', 'sp_tokenizer.model')

## Loading Data

In [5]:
# Load the tokenizer model from disk. tokenizer_path is a pathlib.Path, so it
# is converted to a plain string before being handed to the helper.
# NOTE(review): presumably a SentencePiece model, judging by the file name — confirm in llm_components.
sp = lc.load_sp_tokenizer(str(tokenizer_path))
# Dataset factory bound to that tokenizer; used below to build the train/valid/test datasets.
loader = lc.LMDatasetLoader(sp)

In [6]:
# Training split is created with training=True; the two evaluation splits
# (validation, then test) are created with training=False.
train_ds = loader.create(train_path, training=True)
valid_ds, test_ds = (
    loader.create(eval_path, training=False)
    for eval_path in (valid_path, test_path)
)

I0000 00:00:1770378136.685888      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13757 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1770378136.691699      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13757 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


## Transformer Model

In [7]:
# Number of pieces known to the tokenizer; this sizes both the embedding
# table and the final logits layer of the model defined below.
vocab_size = sp.get_piece_size()
vocab_size

16000

In [8]:
# Synchronous data-parallel training: with no arguments MirroredStrategy
# replicates the model on every GPU visible to TensorFlow.
strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


In [9]:
# steps_per_epoch was computed separately in calculating_steps_per_epoch.ipynb.
steps_per_epoch = 7389

# Fraction of the whole run spent warming the learning rate up.
WARMUP_FRACTION = 0.05

total_steps = lc.N_EPOCHS * steps_per_epoch
warmup_steps = int(WARMUP_FRACTION * total_steps)
total_steps, warmup_steps

(22167, 1108)

In [10]:
# TensorBoard logging.
tensorboard_cb = tf.keras.callbacks.TensorBoard(
    log_dir='logs',          # Keras default, spelled out so the location is visible
    update_freq=100,         # every 100 batch
    histogram_freq=1,        # weight histograms once per epoch
    embeddings_freq=1,       # embedding visualisation once per epoch
)

# Weights-only checkpoint that is overwritten only when the monitored
# quantity improves.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath='axiomlm.weights.h5',
    monitor='val_loss',      # Keras default, spelled out
    save_best_only=True,
    save_weights_only=True,
)

In [11]:
# Learning-rate schedule in two phases:
#   1. warmup: ramp from initial_learning_rate (0) up to warmup_target over warmup_steps;
#   2. decay:  cosine decay over the remaining total_steps - warmup_steps.
# NOTE(review): alpha=0.1 should leave the final rate at 10% of the peak (1e-4) — confirm against the Keras docs.
lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=0.0,
    warmup_steps=warmup_steps,
    warmup_target=1e-3,
    decay_steps=total_steps - warmup_steps,
    alpha=0.1,
)

In [12]:
# Everything that creates variables (model, optimizer) is built inside the
# strategy scope so it is placed according to the distribution strategy.
with strategy.scope():
    # --- Architecture ------------------------------------------------------
    # Layers are instantiated in the same order in which they are stacked.
    layer_stack = [
        tf.keras.Input(shape=(lc.SEQUENCE_LEN,), batch_size=lc.BATCH_SIZE),
        tf.keras.layers.Embedding(vocab_size, lc.N_EMBEDS),
        lc.PositionalEncoding(),
    ]
    layer_stack.extend(lc.TransformerBlock() for _ in range(lc.N_BLOCKS))
    # Final projection to vocabulary logits; no activation here because the
    # loss below is configured with from_logits=True.
    layer_stack.append(tf.keras.layers.Dense(vocab_size))
    model = tf.keras.Sequential(layer_stack)

    # --- Optimizer ---------------------------------------------------------
    # AdamW driven by the warmup + cosine schedule, with gradient-norm clipping.
    optimizer = tf.keras.optimizers.AdamW(
        learning_rate=lr_schedule,
        weight_decay=1e-3,
        clipnorm=1.0,
    )
    # Checkpoint object tracking both the model and the optimizer state.
    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)

    # --- Loss / compile ----------------------------------------------------
    # Integer targets against raw logits; softmax is handled by the loss function.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(
        optimizer=optimizer,
        loss=loss_fn,
        metrics=[lc.perplexity],
    )

In [13]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

In [14]:
# Train for lc.N_EPOCHS epochs of steps_per_epoch batches each, validating on
# valid_ds after every epoch. The callbacks log to TensorBoard and write the
# weights checkpoint.
history = model.fit(
    x=train_ds,
    validation_data=valid_ds,
    epochs=lc.N_EPOCHS,
    steps_per_epoch=steps_per_epoch,
    callbacks=[tensorboard_cb, checkpoint_cb],
)

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Redu



[1m7389/7389[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4735s[0m 636ms/step - loss: 5.3533 - perplexity: 776.4454 - val_loss: 3.5241 - val_perplexity: 43.1321
Epoch 2/3
[1m7389/7389[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4700s[0m 636ms/step - loss: 3.7861 - perplexity: 45.8926 - val_loss: 3.2560 - val_perplexity: 32.3126
Epoch 3/3
[1m7389/7389[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4692s[0m 635ms/step - loss: 3.5507 - perplexity: 36.1185 - val_loss: 3.1536 - val_perplexity: 28.9532


In [15]:
# Held-out evaluation. evaluate() returns the loss followed by the compiled
# metrics, i.e. (loss, perplexity) for this model.
test_loss, test_perplexity = model.evaluate(x=test_ds)
print(f'{test_loss = }\n{test_perplexity = }')

[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 223ms/step - loss: 3.2829 - perplexity: 30.1757
test_loss = 3.162754774093628
test_perplexity = 29.92483139038086


In [16]:
# Persist the trained model as a single native-Keras (.keras) archive.
# NOTE(review): reloading will presumably need the custom lc layers/metric to be importable or registered — confirm.
model.save('AxiomLM.keras')

In [17]:
import pandas as pd

# history.history maps each metric name to its per-epoch values; tabulate it
# (one row per epoch) and write it out without the index column.
df = pd.DataFrame(data=history.history)
df.to_csv('axiomlm_history.csv', index=False)

In [18]:
import subprocess

# Bundle everything under /kaggle/working into a single archive so the run's
# artifacts can be downloaded in one go.
# - stdout is discarded to hide zip's per-file listing; stderr stays visible
#   so warnings are not lost.
# - check=True raises CalledProcessError if zip exits non-zero, instead of
#   silently leaving a missing or incomplete archive behind.
subprocess.run(
    ['zip', '-r', 'working_dir.zip', '/kaggle/working'],
    stdout=subprocess.DEVNULL,
    check=True,
)

CompletedProcess(args=['zip', '-r', 'working_dir.zip', '/kaggle/working'], returncode=0)