<a href="https://colab.research.google.com/github/M1croZavr/compression_horizon/blob/task%2Fhybrid_loss/notebooks/Compression_hybrid_loss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pathlib
import subprocess

import torch
import torch.nn.functional as F
from datasets import load_dataset, load_from_disk
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel

In [None]:
# Print the GPU status when the NVIDIA tools are installed. The command runs
# without a shell: with shell=True and an argument list, any extra list items
# would be handed to the shell instead of nvidia-smi. Without a shell a missing
# binary raises FileNotFoundError, handled here next to a non-zero exit status.
try:
    print(subprocess.check_output(["nvidia-smi"], text=True))
except (FileNotFoundError, subprocess.CalledProcessError):
    print("nvidia-smi is not available")

In [None]:
# Clone the experiment branch only when the checkout is not there yet, so the
# cell can be re-run (git refuses to clone into an existing non-empty directory).
!test -d compression_horizon || git clone --branch task/hybrid_loss https://github.com/M1croZavr/compression_horizon.git

# Launching experiments

In [None]:
# Reload the TensorBoard extension and open it on the hybrid-loss logs stored
# under MyDrive. The commented-out lines are alternative setups / log directories.
# %load_ext tensorboard
%reload_ext tensorboard
# %tensorboard --logdir=/content/compression_horizon/artifacts/experiments/common_loss
# %tensorboard --logdir=/content/compression_horizon/artifacts/experiments/hybrid_loss
%tensorboard --logdir=/content/drive/MyDrive/compression_horizon/22-11-2025/hybrid_loss

## Common loss launches

In [None]:
# Chained with `&&` so the run starts only if `cd` into the cloned repo succeeded.
!cd ./compression_horizon/ && uv run python scripts/hybrid_loss.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --learning_rate 0.01 --max_sequence_length 4 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100

In [None]:
# Chained with `&&` so the run starts only if `cd` into the cloned repo succeeded.
!cd ./compression_horizon/ && uv run python scripts/hybrid_loss.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --learning_rate 0.01 --max_sequence_length 32 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100

In [None]:
# Chained with `&&` so the run starts only if `cd` into the cloned repo succeeded.
!cd ./compression_horizon/ && uv run python scripts/hybrid_loss.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --learning_rate 0.01 --max_sequence_length 128 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100

In [None]:
# Back up the common-loss artifacts under MyDrive. The destination is written as
# an absolute path, like the source and every later read of these results, so
# the copy does not depend on the current working directory.
!cp -R /content/compression_horizon/artifacts/experiments/common_loss /content/drive/MyDrive/compression_horizon/

## Hybrid loss launches

In [None]:
# Chained with `&&` so the run starts only if `cd` into the cloned repo succeeded.
!cd ./compression_horizon/ && uv run --no-dev python scripts/hybrid_loss.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --learning_rate 0.01 --max_sequence_length 4 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100 --loss_type l2 --hybrid_alpha 0.2 --num_alignment_layers 1

In [None]:
# Chained with `&&` so the run starts only if `cd` into the cloned repo succeeded.
!cd ./compression_horizon/ && uv run python scripts/hybrid_loss.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --learning_rate 0.01 --max_sequence_length 32 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100 --loss_type l2 --hybrid_alpha 0.2 --num_alignment_layers 1

In [None]:
# Chained with `&&` so the run starts only if `cd` into the cloned repo succeeded.
!cd ./compression_horizon/ && uv run python scripts/hybrid_loss.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --learning_rate 0.01 --max_sequence_length 128 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100 --loss_type l2 --hybrid_alpha 0.2 --num_alignment_layers 1

In [None]:
# Chained with `&&` so the run starts only if `cd` into the cloned repo succeeded.
!cd ./compression_horizon/ && uv run python scripts/hybrid_loss.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --learning_rate 0.01 --max_sequence_length 128 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100 --loss_type cosine --hybrid_alpha 0.2 --num_alignment_layers 1

In [None]:
# Chained with `&&` so the run starts only if `cd` into the cloned repo succeeded.
!cd ./compression_horizon/ && uv run python scripts/hybrid_loss.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --learning_rate 0.01 --max_sequence_length 128 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100 --loss_type l1 --hybrid_alpha 0.2 --num_alignment_layers 1

In [None]:
# Chained with `&&` so the run starts only if `cd` into the cloned repo succeeded.
!cd ./compression_horizon/ && uv run python scripts/hybrid_loss.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --learning_rate 0.01 --max_sequence_length 128 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100 --loss_type cosine --hybrid_alpha 0.5 --num_alignment_layers 3

In [None]:
# Chained with `&&` so the run starts only if `cd` into the cloned repo succeeded.
!cd ./compression_horizon/ && uv run python scripts/hybrid_loss.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --learning_rate 0.01 --max_sequence_length 128 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100 --loss_type cosine --hybrid_alpha 1 --num_alignment_layers 5

In [None]:
# Chained with `&&` so the run starts only if `cd` into the cloned repo succeeded.
!cd ./compression_horizon/ && uv run --no-dev python scripts/hybrid_loss.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --learning_rate 0.01 --max_sequence_length 128 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100 --loss_type cosine --hybrid_alpha 1 --num_alignment_layers 10

In [None]:
# Chained with `&&` so the run starts only if `cd` into the cloned repo succeeded.
!cd ./compression_horizon/ && uv run --no-dev python scripts/hybrid_loss.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --learning_rate 0.01 --max_sequence_length 128 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100 --loss_type cosine --hybrid_alpha 1 --num_alignment_layers 3

In [None]:
# Chained with `&&` so the run starts only if `cd` into the cloned repo succeeded.
!cd ./compression_horizon/ && uv run --no-dev python scripts/hybrid_loss.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --learning_rate 0.01 --max_sequence_length 128 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100 --loss_type l2 --hybrid_alpha 1 --num_alignment_layers 3

In [None]:
# Chained with `&&` so the run starts only if `cd` into the cloned repo succeeded.
!cd ./compression_horizon/ && uv run --no-dev python scripts/hybrid_loss.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --learning_rate 0.01 --max_sequence_length 128 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100 --loss_type l1 --hybrid_alpha 1 --num_alignment_layers 3

In [None]:
# Chained with `&&` so the run starts only if `cd` into the cloned repo succeeded.
!cd ./compression_horizon/ && uv run --no-dev python scripts/hybrid_loss.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --learning_rate 0.01 --max_sequence_length 128 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100 --loss_type l2 --hybrid_alpha 0.1 --num_alignment_layers 3

In [None]:
# Chained with `&&` so the run starts only if `cd` into the cloned repo succeeded.
!cd ./compression_horizon/ && uv run --no-dev python scripts/hybrid_loss.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --learning_rate 0.01 --max_sequence_length 128 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100 --loss_type l1 --hybrid_alpha 0.1 --num_alignment_layers 3

In [None]:
# Copy the hybrid-loss experiment artifacts into the dated 22-11-2025 folder under MyDrive.
!cp -r /content/compression_horizon/artifacts/experiments/hybrid_loss /content/drive/MyDrive/compression_horizon/22-11-2025/

# Cross-entropy (CE) comparison

In [None]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Running device:", device)

In [None]:
# checkpoint = "HuggingFaceTB/SmolLM2-135M"
checkpoint = "HuggingFaceTB/SmolLM2-1.7B"

# Tokenizer first: register the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})

# Full-precision weights, moved to the selected device in a separate step.
model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype=torch.float32)
model = model.to(device)

In [None]:
# Take sample 0 of the pg19 test split, truncated to at most 128 tokens,
# and embed it with the model's input embedding layer.
raw_dataset = load_dataset("mrsndmn/pg19", split="test")
train_dataset = raw_dataset.select(range(1))
example = tokenizer(
    train_dataset[0]["text"],
    truncation=True,
    max_length=128,
    return_tensors="pt",
)
input_ids, attention_mask = (example[key].to(device) for key in ("input_ids", "attention_mask"))
embedding_layer = model.get_input_embeddings()
with torch.no_grad():
    sequence_embeddings = embedding_layer(input_ids)

In [None]:
def load_compression_embeddings(path: str, device: str | torch.device = "cpu") -> torch.Tensor:
    result = load_from_disk(path)
    compression_embeddings = torch.FloatTensor(result[0]["embedding"]).unsqueeze(dim=0).to(device)
    return compression_embeddings

In [None]:
def calculate_logits(
    compression_embeddings: torch.Tensor,
    sequence_embeddings: torch.Tensor,
    attention_mask: torch.Tensor,
    model: PreTrainedModel,
) -> torch.Tensor:
    """Run the model on the compression embeddings followed by the sequence embeddings.

    Args:
        compression_embeddings: Embeddings placed in front of the sequence; they are
            concatenated with `sequence_embeddings` along dimension 1. Any number of
            compression tokens is supported.
        sequence_embeddings: Input embeddings of the sequence.
        attention_mask: Attention mask of the sequence (without the compression part).
        model: Model called with `inputs_embeds` and `attention_mask`.

    Returns:
        The `logits` attribute of the model output, computed with gradients disabled.
    """
    united_embeddings = torch.cat((compression_embeddings, sequence_embeddings), dim=1)
    # Every compression position is attended to. The mask is sized from the
    # compression embeddings and inherits dtype/device from `attention_mask`,
    # so no global device and no fixed token count are needed.
    batch_size, num_compression_tokens = compression_embeddings.shape[:2]
    compression_mask = attention_mask.new_ones((batch_size, num_compression_tokens))
    united_attention_mask = torch.cat((compression_mask, attention_mask), dim=1)
    with torch.no_grad():
        outputs = model(
            inputs_embeds=united_embeddings,
            attention_mask=united_attention_mask,
        )
    return outputs.logits

In [None]:
compression_embeddings = load_compression_embeddings(
    "/content/drive/MyDrive/compression_horizon/common_loss/"
    "HuggingFaceTB/SmolLM2-1.7B|128|1|bc818cdb-8346-4cf1-beb0-7459ce626638/compressed_prefixes/",
    device,
)
logits = calculate_logits(compression_embeddings, sequence_embeddings, attention_mask, model)
# The last compression position predicts the first real token, so the logits
# aligned with `input_ids` start at index (number of compression tokens - 1)
# and stop one position before the end. With one token this is logits[:, :-1].
num_compression_tokens = compression_embeddings.size(1)
aligned_logits = logits[:, num_compression_tokens - 1 : -1, :]
print(
    "Common loss cross entropy:",
    F.cross_entropy(aligned_logits.flatten(0, 1), input_ids.flatten()).item(),
)

In [None]:
# Cross entropy of every hybrid-loss run saved under the 22-11-2025 results folder.
hybrid_loss_results_root = pathlib.Path("/content/drive/MyDrive/compression_horizon/22-11-2025/hybrid_loss/")
# Sorted so the report order is the same on every run.
for result_dir in sorted(os.listdir(hybrid_loss_results_root)):
    try:
        compression_embeddings = load_compression_embeddings(
            hybrid_loss_results_root / result_dir / "compressed_prefixes",
            device,
        )
    except FileNotFoundError:
        # Entries without saved prefixes are reported and skipped instead of
        # aborting the whole comparison.
        print(result_dir, "\n\tSkipped: compressed_prefixes not found", end="\n\n")
        continue
    logits = calculate_logits(compression_embeddings, sequence_embeddings, attention_mask, model)
    # Logits aligned with `input_ids` start at the last compression position.
    num_compression_tokens = compression_embeddings.size(1)
    aligned_logits = logits[:, num_compression_tokens - 1 : -1, :]
    print(
        result_dir,
        "\n\tHybrid loss cross entropy:",
        F.cross_entropy(aligned_logits.flatten(0, 1), input_ids.flatten()).item(),
        end="\n\n",
    )

In [None]:
# Cross entropy of the hybrid-loss runs from 17-11-2025 whose directory name
# contains the "|128|" field.
hybrid_loss_results_root = pathlib.Path("/content/drive/MyDrive/compression_horizon/17-11-2025/hybrid_loss/HuggingFaceTB/")
# Sorted so the report order is the same on every run.
for result_dir in sorted(os.listdir(hybrid_loss_results_root)):
    if "|128|" not in result_dir:
        continue
    # Only the load is guarded: a run without saved prefixes is reported and
    # skipped, while any other failure still surfaces.
    try:
        compression_embeddings = load_compression_embeddings(
            hybrid_loss_results_root / result_dir / "compressed_prefixes",
            device,
        )
    except FileNotFoundError:
        print(result_dir, "\n\tSkipped: compressed_prefixes not found", end="\n\n")
        continue
    logits = calculate_logits(compression_embeddings, sequence_embeddings, attention_mask, model)
    # Logits aligned with `input_ids` start at the last compression position.
    num_compression_tokens = compression_embeddings.size(1)
    aligned_logits = logits[:, num_compression_tokens - 1 : -1, :]
    print(
        result_dir,
        "\n\tHybrid loss cross entropy:",
        F.cross_entropy(aligned_logits.flatten(0, 1), input_ids.flatten()).item(),
        end="\n\n",
    )

# Generation outside the compressed sequence

In [None]:
from compression_horizon.src.compression_horizon.inference.generation import generate_from_compression

In [None]:
# Reference text: the tokenized sample (at most 128 tokens) decoded back to a string.
ground_truth_text = tokenizer.decode(input_ids[0])
print(ground_truth_text)

## Embeddings trained on common loss

In [None]:
# Saved prefixes of the common-loss run for the 128-token setting.
common_loss_prefixes_path = (
    "/content/drive/MyDrive/compression_horizon/common_loss/HuggingFaceTB/"
    "SmolLM2-1.7B|128|1|bc818cdb-8346-4cf1-beb0-7459ce626638/compressed_prefixes/"
)
compression_embeddings = load_compression_embeddings(common_loss_prefixes_path, device)

In [None]:
# NOTE(review): 256 and 1 are passed positionally; their meaning is defined by
# generate_from_compression - confirm against its signature.
generated_texts = generate_from_compression(model, tokenizer, compression_embeddings, 256, 1)
print(generated_texts[0])

## Embeddings trained on hybrid loss

In [None]:
# Saved prefixes of one hybrid-loss run from the 17-11-2025 results.
hybrid_loss_prefixes_path = (
    "/content/drive/MyDrive/compression_horizon/17-11-2025/hybrid_loss/HuggingFaceTB/"
    "SmolLM2-1.7B|128|1|0.01|cosine|1.0|5|978c86f1-57ac-4eba-a446-6a6dc874d451/compressed_prefixes/"
)
compression_embeddings = load_compression_embeddings(hybrid_loss_prefixes_path, device)

In [None]:
# NOTE(review): 256 and 1 are passed positionally; their meaning is defined by
# generate_from_compression - confirm against its signature.
generated_texts = generate_from_compression(model, tokenizer, compression_embeddings, 256, 1)
print(generated_texts[0])

# Average distance between compression embeddings and the embeddings of the actual sequence vs. a random sequence

In [None]:
def calculate_distances(
    compression_embeddings: torch.Tensor, sequence_embeddings: torch.Tensor
) -> tuple[float, float, float]:
    """Average cosine, L2 and L1 distance between two sets of embeddings.

    Each distance is computed along the last dimension and then averaged over
    all remaining positions. The inputs are combined element-wise, so they must
    have broadcast-compatible shapes.

    Args:
        compression_embeddings: Embeddings to compare.
        sequence_embeddings: Embeddings they are compared against.

    Returns:
        A `(cosine, l2, l1)` tuple of Python floats, where `cosine` is the mean
        of `1 - cosine_similarity`, `l2` the mean Euclidean distance and `l1`
        the mean sum of absolute differences.
    """
    cosine_similarity = F.cosine_similarity(compression_embeddings, sequence_embeddings, dim=-1)
    cosine = (1.0 - cosine_similarity).mean().item()
    # The difference is shared by the L2 and L1 distances, so compute it once.
    difference = sequence_embeddings - compression_embeddings
    l2 = difference.pow(2).sum(dim=-1).sqrt().mean().item()
    l1 = difference.abs().sum(dim=-1).mean().item()
    return cosine, l2, l1

In [None]:
actual_sequence = input_ids
actual_embeddings = sequence_embeddings

# Random baseline: token ids drawn uniformly from [0, tokenizer.vocab_size) with
# the same shape as the real sequence. A dedicated seeded generator makes the
# baseline, and every distance computed from it, identical on each run.
random_generator = torch.Generator(device=device).manual_seed(0)
random_sequence = torch.randint(
    0,
    tokenizer.vocab_size,
    input_ids.size(),
    generator=random_generator,
    device=device,
)
with torch.no_grad():
    random_embeddings = model.get_input_embeddings()(random_sequence)

In [None]:
# Common loss: distances to the actual sequence embeddings first, then to the
# random sequence embeddings.
common_loss_prefixes_path = (
    "/content/drive/MyDrive/compression_horizon/common_loss/HuggingFaceTB/"
    "SmolLM2-1.7B|128|1|bc818cdb-8346-4cf1-beb0-7459ce626638/compressed_prefixes/"
)
compression_embeddings = load_compression_embeddings(common_loss_prefixes_path, device)
for reference_embeddings in (actual_embeddings, random_embeddings):
    cosine, l2, l1 = calculate_distances(compression_embeddings, reference_embeddings)
    print(f"Cosine: {cosine} | l2: {l2} | l1: {l1}")

In [None]:
# Hybrid loss: for every 17-11-2025 run whose directory name contains the
# "|128|" field, print the distances to the actual sequence embeddings (first
# line) and to the random sequence embeddings (second line).
hybrid_loss_results_root = pathlib.Path("/content/drive/MyDrive/compression_horizon/17-11-2025/hybrid_loss/HuggingFaceTB/")
# Sorted so the report order is the same on every run.
for result_dir in sorted(os.listdir(hybrid_loss_results_root)):
    if "|128|" not in result_dir:
        continue
    # Only the load is guarded: a run without saved prefixes is reported and
    # skipped, while any other failure still surfaces.
    try:
        compression_embeddings = load_compression_embeddings(
            hybrid_loss_results_root / result_dir / "compressed_prefixes",
            device,
        )
    except FileNotFoundError:
        print(result_dir, "\n\tSkipped: compressed_prefixes not found", end="\n\n")
        continue
    cosine, l2, l1 = calculate_distances(compression_embeddings, actual_embeddings)
    cosine_r, l2_r, l1_r = calculate_distances(compression_embeddings, random_embeddings)
    print(
        result_dir,
        "\n\tHybrid loss result:",
        f"\n\tCosine: {cosine} | l2: {l2} | l1: {l1}\n\tCosine: {cosine_r} | l2: {l2_r} | l1: {l1_r}",
        end="\n\n",
    )