In [1]:
from itertools import islice
from sys import stdout

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from tqdm import tqdm

from lz78 import Sequence, LZ78SPA
from lz_python.lz import LZModel

2025-03-12 11:23:53.048247: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-12 11:23:53.063918: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741803833.082431 1459657 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741803833.088064 1459657 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-12 11:23:53.108615: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up before each cell runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Make no GPU visible to TensorFlow for the rest of this session.
tf.config.set_visible_devices(devices=[], device_type='GPU')

## Data Loading

In [4]:
class PG19DataLoader:
    """Iterate over the books of a PG-19 split as lists of byte values.

    The split is loaded through ``tfds.load('pg19', ...)``, the first
    ``start_index`` examples are skipped, and the remainder is batched and
    prefetched. Iterating the loader yields one ``list[int]`` (values 0-255)
    per book, built from the raw bytes of its ``book_text`` field.

    Note that ``len(loader)`` is the number of *batches*; it equals the number
    of yielded books only when ``batch_size`` is 1 (the default).
    """

    def __init__(self, data_type: str, start_index: int = 0, batch_size: int = 1, normalize: str = 'none'):
        """Load the requested split and build the batched input pipeline.

        Args:
            data_type: Name of the split passed to ``tfds.load``.
            start_index: Number of leading examples to skip.
            batch_size: Number of books grouped into each batch.
            normalize: Currently unused; kept so existing callers that pass
                it keep working.
        """
        self.data = tfds.load('pg19', split=data_type, shuffle_files=False)
        self.dataset = (self.data
                        .skip(start_index)
                        .batch(batch_size)
                        .prefetch(tf.data.AUTOTUNE))
        print(data_type, ": ", len(self.dataset))

    def __len__(self):
        """Return the number of batches in the pipeline."""
        return len(self.dataset)

    def __iter__(self):
        """Yield every book in every batch as a list of byte values."""
        for batch in self.dataset:
            # Walk the whole batch: reading only element 0 would silently drop
            # every other book whenever batch_size > 1.
            for book_text in batch['book_text'].numpy():
                yield np.frombuffer(book_text, dtype=np.uint8).tolist()

## Set Up Models

In [5]:
class ConfigObject:
    """Expose the entries of a mapping as attributes of an object."""

    def __init__(self, config_dict):
        # Copy every key/value pair straight into the instance namespace.
        vars(self).update(config_dict)

# Settings object that is passed to the Python LZ model when it is constructed.
config = ConfigObject(dict(
    top_k=256,
    method="Depth-Guided",  # ensemble
    ensemble_max_num=6,
    min_depth=10,
    vocab_size=256,
    adaptive_gamma="none",
    gamma=1/256,
    lower_bound=1e-5,
    temp=1,
    ensemble_type="depth",
    lb_or_temp="lb_first",
))

In [6]:
# Build the Python LZ model from the `config` settings object.
py_lz = LZModel(config)

In [None]:
# `LZ78SPA` model from the `lz78` package over a 256-symbol alphabet.
rust_lz = LZ78SPA(
    alphabet_size=256,
    gamma=1/256,
    compute_training_loss=False,
)

# Inference-time settings, grouped by theme (keyword order is irrelevant).
rust_lz.set_inference_config(
    # probability post-processing
    lb=1e-5,
    temp=1,
    lb_or_temp_first="lb_first",
    adaptive_gamma="disabled",
    # ensemble
    ensemble_type="depth",
    ensemble_n=6,
    # backshift parsing
    backshift_parsing=True,
    backshift_ctx_len=10,
    backshift_break_at_phrase=True,
)

## Train Models

In [8]:
N_TRAIN = 100  # number of training books used to build each model

stdout.flush()
train_dataloader = PG19DataLoader("train")
rust_lz.reset_state()
# Bound the iteration with islice and give tqdm the real total, so the bar
# runs to 100% instead of stalling at N_TRAIN out of the full split size.
# This also trains on nothing when N_TRAIN is 0.
for batch in tqdm(islice(train_dataloader, N_TRAIN),
                  total=min(N_TRAIN, len(train_dataloader)),
                  desc="Building LZ tree"):
    # build LZ model only 1 epoch
    stdout.flush()
    rust_lz.train_on_block(Sequence(batch, alphabet_size=256))
    # Reset the model state after every book so each one is trained on
    # starting from a fresh state.
    rust_lz.reset_state()

train :  28602


Building LZ tree:   0%|          | 0/28602 [00:00<?, ?it/s]2025-03-12 11:24:00.425640: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:376] The default buffer size is 262144, which is overridden by the user specified `buffer_size` of 8388608
Building LZ tree:   0%|          | 99/28602 [00:05<25:12, 18.84it/s] 


In [9]:
train_dataloader = PG19DataLoader("train")
# Bound the iteration with islice and give tqdm the real total, so the bar
# runs to 100% instead of stalling at N_TRAIN out of the full split size.
# This also trains on nothing when N_TRAIN is 0.
for batch in tqdm(islice(train_dataloader, N_TRAIN),
                  total=min(N_TRAIN, len(train_dataloader)),
                  desc="Building LZ tree"):
    # build LZ model only 1 epoch
    py_lz.build_tree(batch)

train :  28602


Building LZ tree:   0%|          | 99/28602 [00:17<1:25:30,  5.56it/s]


## Evaluate Models

In [10]:
# Load the validation split and keep only the first sequence it yields as
# the evaluation text.
val_dataloader = PG19DataLoader("validation")
test_seq = next(iter(val_dataloader))

validation :  50


In [11]:
# Cut the evaluation text into full 1024-symbol windows, advancing 512 symbols
# at a time so that consecutive windows overlap by half. The range bound
# guarantees that no window is shorter than 1024 symbols.
test_seqs = [
    test_seq[start:start + 1024]
    for start in range(0, len(test_seq) - 1023, 512)
]

In [12]:
stdout.flush()
rust_lz.reset_state()

# The first 512 symbols of each window serve as context; the remaining
# symbols are the ones whose loss is computed.
ctxs = [Sequence(window[:512], alphabet_size=256) for window in test_seqs]
inputs = [Sequence(window[512:], alphabet_size=256) for window in test_seqs]

rust_lz_losses = np.array(
    rust_lz.compute_test_loss_parallel(inputs, ctxs, num_threads=32)
)
# Normalise by 512, the number of scored symbols in a 1024-symbol window.
rust_lz_losses = rust_lz_losses / 512
print(rust_lz_losses)

[2.09570746 2.02709416 1.92888051 2.24193296 1.94838384 2.246818
 2.5534734  1.92000791 2.23535322 2.14238168 2.10381214 2.09425761
 2.42312993 2.09602661 2.1823594  2.19796509 2.17546561 2.59972646
 2.01459331 2.44875539 3.13992654 2.87137981 2.89687572 2.69475711
 2.49312545 3.2878529  2.67884094 2.74689998 2.51736797 2.47719455
 2.66557975 3.1036876  2.91649416 3.15200403 2.82862683 2.15703869
 2.10913151 2.3850443  2.45824447 2.33437695 2.05443431 2.60664526
 2.43794071 2.54737539 2.57753351 2.457468   2.5339743  2.526699
 2.66021902 2.88097546 2.73773511 3.11597866 2.21688838 2.4715863
 2.27117645 2.00110983 2.19616657 3.0135945  2.66753531 2.50578933
 2.40214986 2.89978746 2.53483265 2.66448608 2.65264533 2.62424865
 2.7677693  2.75328829 2.34219973 2.34907167 2.59487538 2.5899051
 2.4779284  2.40448731 2.81881764 2.82428384 2.52666009 2.46482305
 2.47966515 2.31419246 2.15768252 2.53992891 2.59133082 2.41682575
 2.78699062 2.43098817 2.2893838  2.27222993 2.3029444  2.77299561
 

In [None]:
# Score every evaluation window with the Python model and collect one mean
# value per window.
# NOTE(review): the whole window is passed here, whereas the Rust evaluation
# splits each window into a 512-symbol context and a 512-symbol input --
# confirm that the two sets of losses are directly comparable.
py_lz_losses = []
for seq in test_seqs:
    # Only `btb` is used below; `depths` and the third return value are ignored.
    depths, btb, _ = py_lz.get_depth_and_perplexity(seq)
    # Store the mean of `btb` for this window as a plain Python float.
    py_lz_losses.append(float(np.mean(btb)))
print(py_lz_losses)

In [None]:
# NOTE(review): `btb` is the loop variable left over from the preceding
# evaluation loop, so this shows the mean for the last window only and fails
# on a fresh kernel unless that loop has run.
np.mean(btb)