In [1]:
from lz78 import Sequence, LZ78SPA
from lz_python.lz import LZModel

import tensorflow_datasets as tfds
import tensorflow as tf
from sys import stdout
import numpy as np
from tqdm import tqdm

2025-03-16 15:00:51.181430: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-16 15:00:51.197223: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742162451.216409 1807568 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742162451.222366 1807568 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-16 15:00:51.242493: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
tf.config.set_visible_devices([], 'GPU')

## Data Loading

In [4]:
class PG19DataLoader:
    def __init__(self, data_type: str, start_index: int = 0, batch_size: int = 1, normalize: str = 'none'):
        self.data = tfds.load('pg19', split=data_type, shuffle_files=False)
        self.dataset = (self.data
                        .skip(start_index)
                        .batch(batch_size)
                        .prefetch(tf.data.experimental.AUTOTUNE))
        print(data_type, ": ", len(self.dataset))

    def __len__(self):
        return len(self.dataset)

    def __iter__(self):
        for batch in self.dataset:
            text_bytes = np.frombuffer(batch['book_text'].numpy()[0], dtype=np.uint8)
            text_bytes = text_bytes.tolist()
            yield text_bytes

## Set Up Models

In [1]:
class ConfigObject:
    def __init__(self, config_dict):
        self.__dict__.update(config_dict)

config = ConfigObject({
    "top_k": 256,
    "method": "Depth-Guided", # ensemble
    "ensemble_max_num": 6,
    "min_depth": 10,
    "vocab_size": 256,
    "adaptive_gamma": "none",
    "gamma": 1/256,
    "lower_bound": 1e-5,
    "temp": 1,
    "ensemble_type": "depth",
    "lb_or_temp": "lb_first",
})

In [2]:
py_lz = LZModel(config)

NameError: name 'LZModel' is not defined

In [7]:
rust_lz = LZ78SPA(alphabet_size=256, gamma=1/256, compute_training_loss=False)
rust_lz.set_inference_config(
    lb=1e-5,
    temp=1,
    lb_or_temp_first="lb_first",
    ensemble_type="depth",
    ensemble_n=6,
    adaptive_gamma="disabled",
    backshift_parsing=True,
    backshift_ctx_len=10,
    backshift_break_at_phrase=True
)

## Train Models

In [8]:
N_TRAIN = 300

stdout.flush()
train_dataloader = PG19DataLoader("train")
rust_lz.reset_state()
for trn_iter, batch in enumerate(tqdm(train_dataloader, desc="Building LZ tree"), start=1):
    # build LZ model only 1 epoch
    stdout.flush()
    rust_lz.train_on_block(Sequence(batch, alphabet_size=256))
    rust_lz.reset_state()

    if trn_iter >= N_TRAIN:
        break

train :  28602


Building LZ tree:   0%|          | 0/28602 [00:00<?, ?it/s]2025-03-16 15:00:54.191168: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:376] The default buffer size is 262144, which is overridden by the user specified `buffer_size` of 8388608
Building LZ tree:   0%|          | 83/28602 [00:04<23:34, 20.16it/s] 


KeyboardInterrupt: 

In [None]:
train_dataloader = PG19DataLoader("train")
for trn_iter, batch in enumerate(tqdm(train_dataloader, desc="Building LZ tree"), start=1):
    # build LZ model only 1 epoch
    py_lz.build_tree(batch)

    if trn_iter >= N_TRAIN:
        break

train :  28602


Building LZ tree:   1%|          | 299/28602 [00:59<1:33:53,  5.02it/s]


## Evaluate Models

In [None]:
val_dataloader = PG19DataLoader("validation")
test_seq = next(iter(val_dataloader))

In [None]:
test_seqs = []
for i in range(0, len(test_seq)-1023, 512):
    test_seqs.append(test_seq[i:i+1024])

test_seqs = test_seqs[:100]

In [None]:
stdout.flush()

inputs = [Sequence(seq[512:],alphabet_size=256) for seq in test_seqs]
ctxs = [Sequence(seq[:512],alphabet_size=256) for seq in test_seqs]

res = rust_lz.compute_test_loss_parallel(
    inputs, ctxs, num_threads=32, output_prob_dists=False, output_per_symbol_losses=False
)
print(np.array([x["avg_log_loss"] for x in res]))

In [None]:
py_lz_losses = []
for seq in test_seqs:
    depths, btb, _ = py_lz.get_depth_and_perplexity(seq)
    py_lz_losses.append(float(np.mean(btb)))
print(np.array(py_lz_losses))

## Time Full Validation

In [None]:
log_loss = 0
n_seqs = 0
val_dataloader = PG19DataLoader("validation")
for seq in tqdm(val_dataloader):
    stdout.flush()
    test_seqs = []
    for i in range(0, len(seq)-1023, 512):
        test_seqs.append(seq[i:i+1024])

    rust_lz.reset_state()

    inputs = [Sequence(seq[512:],alphabet_size=256) for seq in test_seqs]
    ctxs = [Sequence(seq[:512],alphabet_size=256) for seq in test_seqs]

    res = rust_lz.compute_test_loss_parallel(
        inputs, ctxs, num_threads=32, output_prob_dists=False, output_per_symbol_losses=False
    )

    losses = np.array([x[0] for x in res]) / 512
    log_loss += np.sum(losses)
    n_seqs += len(losses)

In [None]:
print(f"Val PPL: {2**float(log_loss / n_seqs)}")

## Return Patch Information

In [None]:
val_dataloader = PG19DataLoader("validation")
test_seq = next(iter(val_dataloader))[:40]

In [None]:
res = rust_lz.compute_test_loss( # also works for the parallel version!
    Sequence(test_seq, alphabet_size=256), output_prob_dists=False, output_per_symbol_losses=True, output_patch_info=True
)

In [None]:
# This looks reasonable; should maybe debug more
for info in res['patch_info']:
    print(f"{info[0]} through {info[1] - 1}")

In [None]:
import matplotlib.pyplot as plt
plt.stem(np.array(res['log_losses']))
plt.title("Log Loss per Symbol")