In [2]:
from pathlib import Path
import tensorflow as tf
import sys

sys.path.append('/kaggle/input/axiom-utils')
import llm_components as lc

In [3]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]

## Paths

In [4]:
directory = Path('/kaggle/input')
data_dir = directory / 'wikitext-103' / 'wikitext-103'
train_path = data_dir / 'wiki.train.tokens'
valid_path = data_dir / 'wiki.valid.tokens'
test_path = data_dir / 'wiki.test.tokens'
# following path is created only for kaggle
# you can directly run the lc.load_sp_tokenizer() without path, if you are running locally
tokenizer_path = directory / 'axiom-utils' / 'sp_tokenizer.model'

## Loading Data

In [5]:
sp = lc.load_sp_tokenizer(str(tokenizer_path))
loader = lc.LMDatasetLoader(sp, shuffle_buffer= lc.SHUFFLE_BUFFER, cache= True)

In [6]:
train_ds = loader.create(train_path)
valid_ds = loader.create(valid_path)
test_ds = loader.create(test_path)

I0000 00:00:1770288653.511457      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13757 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1770288653.517765      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13757 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


## Transformer Model

In [7]:
vocab_size = sp.get_piece_size()
vocab_size

16000

In [8]:
strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


In [9]:
# these are calculated in counting_train_tokens.ipynb
total_steps = 15_218
steps_per_epoch = 7_609
warmup_steps = 760

In [10]:
def perplexity_metric(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor:
    cross_entropy = tf.keras.losses.sparse_categorical_crossentropy(
        y_true, y_pred, from_logits= True
    )
    mean_cross_entropy = tf.reduce_mean(cross_entropy)
    ppl = tf.exp(mean_cross_entropy)
    return ppl

In [11]:
with strategy.scope():
    model = tf.keras.Sequential([
        tf.keras.Input(shape= (lc.SEQUENCE_LEN,), batch_size= lc.BATCH_SIZE),
        tf.keras.layers.Embedding(vocab_size, lc.N_EMBEDS),
        lc.PositionalEncoding(),
        *[lc.TransformerBlock() for _ in range(lc.N_BLOCKS)],
        tf.keras.layers.Dense(vocab_size)
    ])

    tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir= 'logs')
    checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
        monitor= 'val_loss',
        filepath= 'axiomlm.weights.h5',
        save_weights_only= True,
        save_best_only= True
    )
    
    lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
        initial_learning_rate= 3e-4,
        decay_steps= total_steps,
        warmup_steps= warmup_steps,
        warmup_target= 3e-4,
        alpha= 0.1
    )
    optimizer = tf.keras.optimizers.AdamW(
        learning_rate= lr_schedule,
        beta_1= 0.9,
        beta_2= 0.95,
        epsilon= 1e-8,
        weight_decay= 0.1
    )
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits= True
    )
    
    model.compile(
        optimizer= optimizer,
        loss= loss_fn,
        metrics= [perplexity_metric]
    )

In [12]:
model.summary()

In [13]:
history = model.fit(
    train_ds, 
    epochs= 2,
    validation_data= valid_ds,
    callbacks= [tensorboard_cb, checkpoint_cb],
    steps_per_epoch= steps_per_epoch
)

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Redu



[1m7609/7609[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5995s[0m 781ms/step - loss: 5.4622 - perplexity_metric: 428.7919 - val_loss: 4.0881 - val_perplexity_metric: nan
Epoch 2/2
[1m7609/7609[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5933s[0m 780ms/step - loss: 4.1658 - perplexity_metric: 64.7941 - val_loss: 3.8244 - val_perplexity_metric: nan


In [18]:
test_loss, test_perplexity = model.evaluate(test_ds)
print(f'{test_loss = }\n{test_perplexity = }')

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 270ms/step - loss: 3.7849 - perplexity_metric: 44.0751
test_loss = 3.7943778038024902
test_perplexity = 42.21901321411133


In [19]:
model.save('AxiomLM.keras')

In [20]:
import pandas as pd

df = pd.DataFrame(history.history)
df.to_csv('axiomlm_history.csv', index= False)

In [21]:
import numpy as np

preds = model.predict(test_ds)
np.save('axiomlm_preds.npy', preds)

ResourceExhaustedError: Graph execution error:

Detected at node concat defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.12/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelapp.py", line 712, in start

  File "/usr/local/lib/python3.12/dist-packages/tornado/platform/asyncio.py", line 211, in start

  File "/usr/lib/python3.12/asyncio/base_events.py", line 645, in run_forever

  File "/usr/lib/python3.12/asyncio/base_events.py", line 1999, in _run_once

  File "/usr/lib/python3.12/asyncio/events.py", line 88, in _run

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 499, in process_one

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 730, in execute_request

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/ipkernel.py", line 383, in do_execute

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/zmqshell.py", line 528, in run_cell

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "/tmp/ipykernel_55/474751031.py", line 3, in <cell line: 0>

  File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 566, in predict

  File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 263, in one_step_on_data_distributed

  File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 840, in reduce_per_replica

  File "/usr/local/lib/python3.12/dist-packages/keras/src/tree/tree_api.py", line 192, in map_structure

  File "/usr/local/lib/python3.12/dist-packages/keras/src/tree/optree_impl.py", line 111, in map_structure

  File "/usr/local/lib/python3.12/dist-packages/optree/ops.py", line 766, in tree_map

  File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 826, in _reduce

  File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 881, in concat

OOM when allocating tensor with shape[64,256,16000] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node concat}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_one_step_on_data_distributed_4250012]

In [22]:
import subprocess

subprocess.run(['zip', '-r', 'working_dir.zip', '/kaggle/working'], stdout= subprocess.DEVNULL)

CompletedProcess(args=['zip', '-r', 'working_dir.zip', '/kaggle/working'], returncode=0)