In [1]:
from pathlib import Path

import numpy as np
import torch
from datasets import load_from_disk
from transformers import AutoModelForCausalLM, AutoTokenizer

from compression_horizon.inference.generation import generate_from_compression

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%load_ext autoreload
%autoreload 2

# unsloth/Llama-3.2-1B

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running device:", device)

Running device: cuda


In [9]:
checkpoint = "unsloth/Meta-Llama-3.1-8B"
model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype=torch.bfloat16).to(device)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token});

Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 88.72it/s]


In [10]:
experiment_path = Path("../artifacts/experiments/Meta-Llama-3.1-8B_1568_2leading/")
experiment_result_path = experiment_path / "compressed_prefixes"

In [11]:
experiment_result = load_from_disk(experiment_result_path)
experiment_result[0]["model_checkpoint"], experiment_result[0]["num_input_tokens"]

('unsloth/Meta-Llama-3.1-8B', 1568)

In [12]:
sample_texts = []
restored_sample_texts = []
error_positions = []
index_0 = []
index_1 = []
index_2 = []
final_convergence = []
greedy_convergence = []
model.eval()
for sample_id in experiment_result["sample_id"]:
    print(sample_id)
    compression_embeddings = torch.load(experiment_path / f"compression_token_embeddings_{sample_id}.pt", map_location=device).unsqueeze(dim=0)

    sample = experiment_result[sample_id]
    sample_text = sample["text"]
    sample_texts.append(sample_text)
    max_length = sample["num_input_tokens"]
    num_compression_tokens = sample["num_compression_tokens"]
    tokenized_sample = tokenizer(sample_text, truncation=True, max_length=max_length, return_tensors="pt")
    input_ids = tokenized_sample["input_ids"].to(device)
    attention_mask = tokenized_sample["attention_mask"].to(device)
    print("max_length", torch.sum(attention_mask).item(), max_length)
    with torch.no_grad():
        sequence_embeddings = model.get_input_embeddings()(input_ids)
        outputs = model(
            inputs_embeds=torch.cat((compression_embeddings, sequence_embeddings), dim=1),
            attention_mask=torch.cat((torch.ones(1, num_compression_tokens, dtype=torch.long, device=device), attention_mask), dim=1),
        )
        predicted_ids = outputs.logits[:, num_compression_tokens - 1 : -1].argmax(dim=-1)
        convergence_numerator = (predicted_ids == input_ids).sum(dim=-1)
        inference_convergence = (convergence_numerator / attention_mask.sum(dim=-1)).item()
        mismatch_positions = torch.nonzero(predicted_ids != input_ids)[:, 1].tolist()
        error_positions.extend(mismatch_positions)
        print("Training convergence:", sample["final_convergence"], "Inference convergence:", inference_convergence)

    try:
        restored_sample_text, generated_ids = generate_from_compression(model, tokenizer, compression_embeddings, max_length, 1, return_generated_ids=True)
        restored_sample_texts.append(restored_sample_text)
        greedy_generation_convergence = (torch.sum(generated_ids == input_ids) / input_ids.shape[-1]).item()
    except RuntimeError as e:
        print(e)
        greedy_generation_convergence = (torch.sum(generated_ids == input_ids[:, :generated_ids.shape[-1]]) / generated_ids.shape[-1]).item()
    print("Greedy generation convergence:", greedy_generation_convergence)

    index_0.append(0 in mismatch_positions)
    index_1.append(1 in mismatch_positions)
    index_2.append(2 in mismatch_positions)
    final_convergence.append(sample["final_convergence"])
    greedy_convergence.append(greedy_generation_convergence)

print(np.mean(index_0), np.mean(index_1), np.mean(index_2), np.mean(final_convergence), np.mean(greedy_convergence))

0
max_length 1568 1568
Training convergence: 0.9910714030265808 Inference convergence: 0.9910714030265808
The size of tensor a (47) must match the size of tensor b (1568) at non-singleton dimension 1
Greedy generation convergence: 0.021276595070958138
1
max_length 1568 1568
Training convergence: 0.8928571343421936 Inference convergence: 0.8845663070678711
Greedy generation convergence: 0.0012755101779475808
2
max_length 1568 1568
Training convergence: 0.920918345451355 Inference convergence: 0.920918345451355
The size of tensor a (45) must match the size of tensor b (1568) at non-singleton dimension 1
Greedy generation convergence: 0.02222222276031971
3
max_length 1568 1568
Training convergence: 0.9630101919174194 Inference convergence: 0.9630101919174194
The size of tensor a (61) must match the size of tensor b (1568) at non-singleton dimension 1
Greedy generation convergence: 0.016393441706895828
4
max_length 1568 1568
Training convergence: 0.9904336929321289 Inference convergence: 0

# unsloth/Llama-3.2-1B without BOS

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running device:", device)

Running device: cuda


In [None]:
checkpoint = "unsloth/Meta-Llama-3.1-8B"
model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype=torch.bfloat16).to(device)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token});

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 46.73it/s]


In [None]:
experiment_path = Path("../artifacts/experiments/Meta-Llama-3.1-8B_1568_bos/")
experiment_result_path = experiment_path / "compressed_prefixes"

In [25]:
experiment_result = load_from_disk(experiment_result_path)
experiment_result[0]["model_checkpoint"], experiment_result[0]["num_input_tokens"]

('unsloth/Llama-3.2-3B', 1024)

In [26]:
sample_texts = []
restored_sample_texts = []
error_positions = []
index_0 = []
index_1 = []
index_2 = []
final_convergence = []
greedy_convergence = []
model.eval()
for sample_id in experiment_result["sample_id"]:
    print(sample_id)
    compression_embeddings = torch.load(experiment_path / f"compression_token_embeddings_{sample_id}.pt", map_location=device).unsqueeze(dim=0)

    sample = experiment_result[sample_id]
    sample_text = sample["text"]
    sample_texts.append(sample_text)
    max_length = sample["num_input_tokens"]
    num_compression_tokens = sample["num_compression_tokens"]
    tokenized_sample = tokenizer(sample_text, truncation=True, max_length=max_length, return_tensors="pt")
    input_ids = tokenized_sample["input_ids"].to(device)
    attention_mask = tokenized_sample["attention_mask"].to(device)
    print("max_length", torch.sum(attention_mask).item(), max_length)
    with torch.no_grad():
        sequence_embeddings = model.get_input_embeddings()(input_ids)
        outputs = model(
            inputs_embeds=torch.cat((compression_embeddings, sequence_embeddings), dim=1),
            attention_mask=torch.cat((torch.ones(1, num_compression_tokens, dtype=torch.long, device=device), attention_mask), dim=1),
        )
        predicted_ids = outputs.logits[:, num_compression_tokens - 1 : -1].argmax(dim=-1)
        convergence_numerator = (predicted_ids == input_ids).sum(dim=-1)
        inference_convergence = (convergence_numerator / attention_mask.sum(dim=-1)).item()
        mismatch_positions = torch.nonzero(predicted_ids != input_ids)[:, 1].tolist()
        error_positions.extend(mismatch_positions)
        print("Training convergence:", sample["final_convergence"], "Inference convergence:", inference_convergence)

    try:
        restored_sample_text, generated_ids = generate_from_compression(
            model,
            tokenizer,
            torch.cat((compression_embeddings, model.get_input_embeddings()(input_ids[:, :1])), dim=1),
            max_length - 1,
            1,
            return_generated_ids=True,
        )
        restored_sample_texts.append(restored_sample_text)
        greedy_generation_convergence = (torch.sum(generated_ids == input_ids[:, 1:]) / input_ids[:, 1:].shape[-1]).item()
    except RuntimeError as e:
        print(e)
        greedy_generation_convergence = (torch.sum(generated_ids == input_ids[:, 1:generated_ids.shape[-1] + 1]) / generated_ids.shape[-1]).item()
    print("Greedy generation convergence:", greedy_generation_convergence)

    index_0.append(0 in mismatch_positions)
    index_1.append(1 in mismatch_positions)
    index_2.append(2 in mismatch_positions)
    final_convergence.append(sample["final_convergence"])
    greedy_convergence.append(greedy_generation_convergence)

print(np.mean(index_0), np.mean(index_1), np.mean(index_2), np.mean(final_convergence), np.mean(greedy_convergence))

0
max_length 1024 1024
Training convergence: 0.990234375 Inference convergence: 0.990234375
Greedy generation convergence: 0.004887585528194904
1
max_length 1024 1024
Training convergence: 0.8671875 Inference convergence: 0.86328125
The size of tensor a (53) must match the size of tensor b (1023) at non-singleton dimension 1
Greedy generation convergence: 0.01886792480945587
2
max_length 1024 1024
Training convergence: 0.9140625 Inference convergence: 0.9140625
The size of tensor a (59) must match the size of tensor b (1023) at non-singleton dimension 1
Greedy generation convergence: 0.033898305147886276
3
max_length 1024 1024
Training convergence: 0.958984375 Inference convergence: 0.958984375
The size of tensor a (50) must match the size of tensor b (1023) at non-singleton dimension 1
Greedy generation convergence: 0.0
4
max_length 1024 1024
Training convergence: 0.978515625 Inference convergence: 0.978515625
The size of tensor a (99) must match the size of tensor b (1023) at non-sin

# unsloth/Llama-3.2-1B corrected 2 leading tokens

In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running device:", device)

Running device: cuda


In [34]:
checkpoint = "unsloth/Llama-3.2-1B"
model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype=torch.bfloat16).to(device)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token});

In [35]:
experiment_path = Path("../artifacts/experiments/Llama-3.2-1B_512_2leading/")
experiment_result_path = experiment_path / "compressed_prefixes"

In [36]:
experiment_result = load_from_disk(experiment_result_path)
experiment_result

Dataset({
    features: ['sample_id', 'text', 'input_ids', 'attention_mask', 'embedding', 'final_loss', 'final_convergence', 'convergence_after_steps', 'convergence_0.99_after_steps', 'convergence_0.95_after_steps', 'compression_tokens_mean', 'compression_tokens_std', 'num_input_tokens', 'num_compression_tokens', 'hidden_size', 'fix_position_ids', 'loss_type', 'hybrid_alpha', 'dtype', 'num_alignment_layers', 'model_checkpoint', 'max_optimization_steps_per_sample'],
    num_rows: 10
})

In [37]:
sample_texts = []
restored_sample_texts = []
error_positions = []
model.eval()
for sample_id in experiment_result["sample_id"]:
    print(sample_id)
    compression_embeddings = torch.load(experiment_path / f"compression_token_embeddings_{sample_id}.pt", map_location=device).unsqueeze(dim=0)

    sample = experiment_result[sample_id]
    sample_text = sample["text"]
    sample_texts.append(sample_text)
    max_length = sample["num_input_tokens"]
    num_compression_tokens = sample["num_compression_tokens"]
    tokenized_sample = tokenizer(sample_text, truncation=True, max_length=max_length, return_tensors="pt")
    input_ids = tokenized_sample["input_ids"].to(device)
    attention_mask = tokenized_sample["attention_mask"].to(device)
    with torch.no_grad():
        sequence_embeddings = model.get_input_embeddings()(input_ids)
        outputs = model(
            inputs_embeds=torch.cat((compression_embeddings, sequence_embeddings), dim=1),
            attention_mask=torch.cat((torch.ones(1, 1, dtype=torch.long, device=device), attention_mask), dim=1),
        )
        predicted_ids = outputs.logits[:, num_compression_tokens - 1 : -1].argmax(dim=-1)
        convergence_numerator = (predicted_ids == input_ids).sum(dim=-1)
        inference_convergence = (convergence_numerator / attention_mask.sum(dim=-1)).item()
        mismatch_positions = torch.nonzero(predicted_ids != input_ids)
        error_positions.extend(mismatch_positions[:, 1].tolist())
        print("Training convergence:", sample["final_convergence"], "Inference convergence:", inference_convergence)

    try:
        restored_sample_text, generated_ids = generate_from_compression(model, tokenizer, compression_embeddings, max_length, 1, return_generated_ids=True)
        restored_sample_texts.append(restored_sample_text)
        greedy_generation_convergence = (torch.sum(generated_ids == input_ids) / input_ids.shape[-1]).item()
        print("Greedy generation convergence:", greedy_generation_convergence)
    except RuntimeError as e:
        print(e)

0
Training convergence: 0.990234375 Inference convergence: 0.98828125
The size of tensor a (77) must match the size of tensor b (512) at non-singleton dimension 1
1
Training convergence: 0.990234375 Inference convergence: 0.984375
The size of tensor a (55) must match the size of tensor b (512) at non-singleton dimension 1
2
Training convergence: 0.990234375 Inference convergence: 0.98828125
The size of tensor a (60) must match the size of tensor b (512) at non-singleton dimension 1
3
Training convergence: 0.990234375 Inference convergence: 0.990234375
The size of tensor a (57) must match the size of tensor b (512) at non-singleton dimension 1
4
Training convergence: 0.990234375 Inference convergence: 0.986328125
The size of tensor a (53) must match the size of tensor b (512) at non-singleton dimension 1
5
Training convergence: 0.990234375 Inference convergence: 0.98828125
Greedy generation convergence: 0.005859375
6
Training convergence: 0.990234375 Inference convergence: 0.990234375
T

# unsloth/Llama-3.2-3B

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running device:", device)

Running device: cuda


In [4]:
checkpoint = "unsloth/Llama-3.2-3B"
model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype=torch.bfloat16).to(device)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token});

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 54.97it/s]


In [3]:
experiment_path = Path("../artifacts/experiments/Llama-3.2-3B_1024/")
experiment_result_path = experiment_path / "compressed_prefixes"

In [4]:
experiment_result = load_from_disk(experiment_result_path)
experiment_result

Dataset({
    features: ['sample_id', 'text', 'input_ids', 'attention_mask', 'embedding', 'final_loss', 'final_convergence', 'convergence_after_steps', 'convergence_0.99_after_steps', 'convergence_0.95_after_steps', 'compression_tokens_mean', 'compression_tokens_std', 'num_input_tokens', 'num_compression_tokens', 'hidden_size', 'fix_position_ids', 'loss_type', 'hybrid_alpha', 'dtype', 'num_alignment_layers', 'model_checkpoint', 'max_optimization_steps_per_sample'],
    num_rows: 10
})

In [7]:
sample_texts = []
restored_sample_texts = []
model.eval()
for sample_id in experiment_result["sample_id"]:
    print(sample_id)
    compression_embeddings = torch.load(experiment_path / f"compression_token_embeddings_{sample_id}.pt", map_location=device).unsqueeze(dim=0)

    sample = experiment_result[sample_id]
    sample_text = sample["text"]
    sample_texts.append(sample_text)
    max_length = sample["num_input_tokens"]
    tokenized_sample = tokenizer(sample_text, truncation=True, max_length=max_length, return_tensors="pt")
    input_ids = tokenized_sample["input_ids"].to(device)
    attention_mask = tokenized_sample["attention_mask"].to(device)
    with torch.no_grad():
        sequence_embeddings = model.get_input_embeddings()(input_ids)

    with torch.no_grad():
        outputs = model(
            inputs_embeds=torch.cat((compression_embeddings, sequence_embeddings), dim=1),
            attention_mask=torch.cat((torch.ones(1, 1, dtype=torch.long, device=device), attention_mask), dim=1),
        )
        inference_convergence = (torch.sum(outputs.logits[:, :-1].argmax(-1) == input_ids) / input_ids.shape[-1]).item()
        print("Training convergence:", sample["final_convergence"], "Inference convergence:", inference_convergence)

    try:
        restored_sample_text, generated_ids = generate_from_compression(model, tokenizer, compression_embeddings, max_length, 1, return_generated_ids=True)
        restored_sample_texts.append(restored_sample_text)
        greedy_generation_convergence = (torch.sum(generated_ids == input_ids) / input_ids.shape[-1]).item()
        print("Greedy generation convergence:", greedy_generation_convergence)
    except RuntimeError as e:
        print(e)

0
Training convergence: 0.990234375 Inference convergence: 0.990234375
Greedy generation convergence: 0.009765625
1
Training convergence: 0.8779296875 Inference convergence: 0.8759765625
Greedy generation convergence: 0.0
2
Training convergence: 0.919921875 Inference convergence: 0.919921875
Greedy generation convergence: 0.0
3
Training convergence: 0.974609375 Inference convergence: 0.974609375
Greedy generation convergence: 0.0
4
Training convergence: 0.9775390625 Inference convergence: 0.9775390625
Greedy generation convergence: 0.001953125
5
Training convergence: 0.986328125 Inference convergence: 0.986328125
Greedy generation convergence: 0.0
6
Training convergence: 0.990234375 Inference convergence: 0.990234375
Greedy generation convergence: 0.0
7
Training convergence: 0.9541015625 Inference convergence: 0.9541015625
Greedy generation convergence: 0.0029296875
8
Training convergence: 0.96484375 Inference convergence: 0.96484375
The size of tensor a (5) must match the size of tens