<a href="https://colab.research.google.com/github/M1croZavr/compression_horizon/blob/task%2Fgeneration_outside/notebooks/Compression_generation_outside.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

from collections.abc import Iterator

import torch
from datasets import Dataset, load_dataset
from matplotlib import pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; later cells copy experiment artifacts there and read them back.
drive.mount('/content/drive')

In [None]:
# Clone the task/generation_outside branch of the project repository into the working directory.
!git clone --branch task/generation_outside https://github.com/M1croZavr/compression_horizon.git

# Launches

In [None]:
# Load (and reload) the TensorBoard extension, then open it on the experiments artifact directory.
%load_ext tensorboard
%reload_ext tensorboard
%tensorboard --logdir=/content/compression_horizon/artifacts/experiments/

## 16

In [None]:
# Sequence length 16, default loss. `&&` skips the run if `cd` fails instead of launching it from the wrong directory.
!cd ./compression_horizon/ && uv run --no-dev python scripts/activation_distillation.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --limit_dataset_items 16 --learning_rate 0.01 --max_sequence_length 16 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100

In [None]:
# Sequence length 16, cosine loss with hybrid_alpha 1 over 5 alignment layers. `&&` skips the run if `cd` fails.
!cd ./compression_horizon/ && uv run --no-dev python scripts/activation_distillation.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --limit_dataset_items 16 --learning_rate 0.01 --max_sequence_length 16 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100 --loss_type cosine --hybrid_alpha 1 --num_alignment_layers 5

## 32

In [None]:
# Sequence length 32, default loss. `&&` skips the run if `cd` fails instead of launching it from the wrong directory.
!cd ./compression_horizon/ && uv run --no-dev python scripts/activation_distillation.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --limit_dataset_items 16 --learning_rate 0.01 --max_sequence_length 32 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100

In [None]:
# Sequence length 32, cosine loss with hybrid_alpha 1 over 5 alignment layers. `&&` skips the run if `cd` fails.
!cd ./compression_horizon/ && uv run --no-dev python scripts/activation_distillation.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --limit_dataset_items 16 --learning_rate 0.01 --max_sequence_length 32 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100 --loss_type cosine --hybrid_alpha 1 --num_alignment_layers 5

## 64

In [None]:
# Sequence length 64, default loss. `&&` skips the run if `cd` fails instead of launching it from the wrong directory.
!cd ./compression_horizon/ && uv run --no-dev python scripts/activation_distillation.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --limit_dataset_items 16 --learning_rate 0.01 --max_sequence_length 64 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100

In [None]:
# Sequence length 64, cosine loss with hybrid_alpha 1 over 5 alignment layers. `&&` skips the run if `cd` fails.
!cd ./compression_horizon/ && uv run --no-dev python scripts/activation_distillation.py --model_checkpoint HuggingFaceTB/SmolLM2-1.7B --limit_dataset_items 16 --learning_rate 0.01 --max_sequence_length 64 --number_of_mem_tokens 1 --max_optimization_steps_per_sample 1000 --warmup_steps 100 --loss_type cosine --hybrid_alpha 1 --num_alignment_layers 5

In [None]:
# Back up all experiment artifacts to Drive. The dated folder is created first because
# `cp` with several sources fails when the destination directory does not exist.
!mkdir -p /content/drive/MyDrive/compression_horizon/20-12-2025/ && cp -r /content/compression_horizon/artifacts/experiments/* /content/drive/MyDrive/compression_horizon/20-12-2025/

# Generation outside the compressed sequence

In [None]:
import pandas as pd
from compression_horizon.src.compression_horizon.inference.generation import generate_from_compression
from compression_horizon.src.compression_horizon.inference.load import load_compression_embeddings
from compression_horizon.src.compression_horizon.metric import calculate_perplexity, calculate_perplexity_logits
from datasets import load_from_disk
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

In [None]:
def evaluate_dataset(
    result_dataset: Dataset,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizerFast | PreTrainedTokenizer,
) -> pd.DataFrame:
    """Score every compressed sample of a result dataset.

    For each sample the text is re-tokenized (truncated to the sample's
    ``num_input_tokens``), a text is generated from the stored compression
    embedding, and two perplexity scores are computed with the project helpers
    ``calculate_perplexity_logits`` and ``calculate_perplexity``.

    Tensors are placed on the notebook-level ``device``, which must be defined
    before this function is called.

    Args:
        result_dataset: Iterable of samples providing the keys ``"text"``,
            ``"num_input_tokens"``, ``"embedding"`` and ``"final_convergence"``.
        model: Causal language model used for embedding lookup, generation and scoring.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        A DataFrame with one row per sample and the columns ``original_text``,
        ``restored_text``, ``n_tokens``, ``convergence``, ``perplexity``,
        ``forward_perplexity`` and ``mean_perplexity``. The columns are present
        even when ``result_dataset`` is empty.
    """
    columns = [
        "original_text",
        "restored_text",
        "n_tokens",
        "convergence",
        "perplexity",
        "forward_perplexity",
        "mean_perplexity",
    ]
    records = []
    for sample in result_dataset:
        sample_text = sample["text"]
        max_length = sample["num_input_tokens"]
        tokenized_sample = tokenizer(sample_text, truncation=True, max_length=max_length, return_tensors="pt")
        input_ids = tokenized_sample["input_ids"].to(device)
        attention_mask = tokenized_sample["attention_mask"].to(device)
        with torch.no_grad():
            sequence_embeddings = model.get_input_embeddings()(input_ids)

        # as_tensor replaces the legacy torch.FloatTensor constructor; unsqueeze adds the leading batch axis.
        compressed_embeddings = (
            torch.as_tensor(sample["embedding"], dtype=torch.float32).unsqueeze(dim=0).to(device)
        )
        restored_text = generate_from_compression(model, tokenizer, compressed_embeddings, max_length, 1)[0]

        perplexity_logits_score = calculate_perplexity_logits(
            model,
            compressed_embeddings,
            input_ids,
            sequence_embeddings,
            attention_mask,
        )
        perplexity_score = calculate_perplexity(
            model,
            tokenizer,
            compressed_embeddings,
            sequence_embeddings,
            attention_mask,
            n=max_length,
        )

        records.append(
            {
                "original_text": sample_text,
                "restored_text": restored_text,
                "n_tokens": max_length,
                "convergence": sample["final_convergence"],
                "perplexity": perplexity_logits_score,
                "forward_perplexity": perplexity_score,
                "mean_perplexity": (perplexity_logits_score + perplexity_score) / 2,
            }
        )

    # Passing the columns explicitly keeps the schema stable when there are no records.
    return pd.DataFrame(records, columns=columns)

In [None]:
# Prefer the GPU when torch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Running device:", device)

In [None]:
# Checkpoint used for every evaluation below; the smaller one is kept commented out as an alternative.
# checkpoint = "HuggingFaceTB/SmolLM2-135M"
checkpoint = "HuggingFaceTB/SmolLM2-1.7B"
# Load the weights in float32 and move the model to the selected device.
model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype=torch.float32).to(device)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Register the EOS token as the padding token; the trailing ";" hides the call's return value in the cell output.
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token});

In [None]:
# Test split of the mrsndmn/pg19 dataset; its first item is tokenized later for the activation analysis.
raw_dataset = load_dataset("mrsndmn/pg19", split="test")

## Embeddings trained on common loss

### 16

In [None]:
# Load the prefixes compressed in the common-loss run with seq_len 16 and score every sample.
common_16_path = "/content/drive/MyDrive/compression_horizon/20-12-2025/model_HuggingFaceTB_SmolLM2-1.7B_mem_1_init_random_seq_len_16/compressed_prefixes"
common_16_result = load_from_disk(common_16_path)
common_16_df = evaluate_dataset(common_16_result, model, tokenizer)

In [None]:
common_16_df.head()

### 32

In [None]:
# Load the prefixes compressed in the common-loss run with seq_len 32 and score every sample.
# common_32_result is reused later when generating samples for the LLM judge.
common_32_path = "/content/drive/MyDrive/compression_horizon/20-12-2025/model_HuggingFaceTB_SmolLM2-1.7B_mem_1_init_random_seq_len_32/compressed_prefixes"
common_32_result = load_from_disk(common_32_path)
common_32_df = evaluate_dataset(common_32_result, model, tokenizer)

In [None]:
common_32_df.head()

### 64

In [None]:
# Load the prefixes compressed in the common-loss run with seq_len 64 and score every sample.
common_64_path = "/content/drive/MyDrive/compression_horizon/20-12-2025/model_HuggingFaceTB_SmolLM2-1.7B_mem_1_init_random_seq_len_64/compressed_prefixes"
common_64_result = load_from_disk(common_64_path)
common_64_df = evaluate_dataset(common_64_result, model, tokenizer)

In [None]:
common_64_df.head()

### Generation with random position ids

In [None]:
# Compression embeddings saved by a common-loss run on Drive, loaded with the project helper
# (the target device is passed as the second argument).
compressed_embeddings = load_compression_embeddings(
    "/content/drive/MyDrive/compression_horizon/common_loss/HuggingFaceTB/"
    "SmolLM2-1.7B|128|1|bc818cdb-8346-4cf1-beb0-7459ce626638/compressed_prefixes/",
    device,
)

In [None]:
# Decode twice from the same compression embeddings: first with the default call (proper
# generation), then with random_position_ids=True. A "#####" line separates the two outputs.
for extra_kwargs, line_end in (({}, "\n#####\n"), ({"random_position_ids": True}, "\n")):
    decoded = generate_from_compression(model, tokenizer, compressed_embeddings, 32, 1, **extra_kwargs)[0]
    print(decoded, end=line_end)

## Embeddings trained on hybrid loss

### 16

In [None]:
# Load the prefixes compressed in the cosine hybrid-loss run (alpha 1.0) with seq_len 16 and score every sample.
hybrid_16_path = "/content/drive/MyDrive/compression_horizon/20-12-2025/model_HuggingFaceTB_SmolLM2-1.7B_mem_1_ch_cosine_hybrid_alpha_1.0_init_random_seq_len_16/compressed_prefixes"
hybrid_16_result = load_from_disk(hybrid_16_path)
hybrid_16_df = evaluate_dataset(hybrid_16_result, model, tokenizer)

In [None]:
hybrid_16_df.head()

### 32

In [None]:
# Load the prefixes compressed in the cosine hybrid-loss run (alpha 1.0) with seq_len 32 and score every sample.
# hybrid_32_result is reused later when generating samples for the LLM judge.
hybrid_32_path = "/content/drive/MyDrive/compression_horizon/20-12-2025/model_HuggingFaceTB_SmolLM2-1.7B_mem_1_ch_cosine_hybrid_alpha_1.0_init_random_seq_len_32/compressed_prefixes"
hybrid_32_result = load_from_disk(hybrid_32_path)
hybrid_32_df = evaluate_dataset(hybrid_32_result, model, tokenizer)

In [None]:
hybrid_32_df.head()

### 64

In [None]:
# Load the prefixes compressed in the cosine hybrid-loss run (alpha 1.0) with seq_len 64 and score every sample.
hybrid_64_path = "/content/drive/MyDrive/compression_horizon/20-12-2025/model_HuggingFaceTB_SmolLM2-1.7B_mem_1_ch_cosine_hybrid_alpha_1.0_init_random_seq_len_64/compressed_prefixes"
hybrid_64_result = load_from_disk(hybrid_64_path)
hybrid_64_df = evaluate_dataset(hybrid_64_result, model, tokenizer)

In [None]:
hybrid_64_df.head()

### Generation with random position ids

In [None]:
# Compression embeddings saved by a hybrid-loss run on Drive, loaded with the project helper
# (the target device is passed as the second argument).
compressed_embeddings = load_compression_embeddings(
    "/content/drive/MyDrive/compression_horizon/17-11-2025/hybrid_loss/HuggingFaceTB/"
    "SmolLM2-1.7B|128|1|0.01|cosine|1.0|5|978c86f1-57ac-4eba-a446-6a6dc874d451/compressed_prefixes/",
    device,
)

In [None]:
# Decode twice from the same compression embeddings: first with the default call (proper
# generation), then with random_position_ids=True. A "#####" line separates the two outputs.
for extra_kwargs, line_end in (({}, "\n#####\n"), ({"random_position_ids": True}, "\n")):
    decoded = generate_from_compression(model, tokenizer, compressed_embeddings, 32, 1, **extra_kwargs)[0]
    print(decoded, end=line_end)

In [None]:
# hybrid_loss_results_root = pathlib.Path("/content/drive/MyDrive/compression_horizon/22-11-2025/hybrid_loss/")
# for result_dir in os.listdir(hybrid_loss_results_root):
#     compressed_embeddings = load_compression_embeddings(
#         hybrid_loss_results_root / result_dir / "compressed_prefixes",
#         device,
#     )
#     perplexity_score = calculate_perplexity(
#         model,
#         tokenizer,
#         compressed_embeddings,
#         sequence_embeddings,
#         attention_mask,
#         n=32,
#     )
#     print(
#         result_dir,
#         "\n\tHybrid loss perplexity (n=32):",
#         perplexity_score,
#         end="\n\n",
#     )

In [None]:
# hybrid_loss_results_root = pathlib.Path("/content/drive/MyDrive/compression_horizon/17-11-2025/hybrid_loss/HuggingFaceTB/")
# for result_dir in os.listdir(hybrid_loss_results_root):
#     if "|128|" in result_dir:
#         try:
#             compressed_embeddings = load_compression_embeddings(
#                 hybrid_loss_results_root / result_dir / "compressed_prefixes",
#                 device,
#             )
#             perplexity_score = calculate_perplexity(
#                 model,
#                 tokenizer,
#                 compressed_embeddings,
#                 sequence_embeddings,
#                 attention_mask,
#                 n=32,
#             )
#             print(
#                 result_dir,
#                 "\n\tHybrid loss perplexity (n=32)",
#                 perplexity_score,
#                 end="\n\n",
#             )
#         except FileNotFoundError:
#             continue

## Result

In [None]:
# Numeric columns of the per-sample evaluation frames that the summaries cover.
metrics = [
    "n_tokens",
    "convergence",
    "perplexity",
    "forward_perplexity",
    "mean_perplexity",
]


def summarize_df(df):
    """Return mean, std, min and max of every metric column (one row per statistic)."""
    return df[metrics].agg(["mean", "std", "min", "max"])


def robust_summary_df(df):
    """Return mean, std, median and the 25%/75% quantiles of every metric column.

    The result has one row per statistic (``mean``, ``std``, ``median``,
    ``q25``, ``q75``) and one column per entry of ``metrics``.

    The quantile rows are added explicitly: ``DataFrame.agg`` only accepts
    keyword arguments of the form ``name=(column, aggfunc)``, so passing
    ``mean="mean"`` or a lambda as a keyword raises ``TypeError``.
    """
    selected = df[metrics]
    summary = selected.agg(["mean", "std", "median"])
    summary.loc["q25"] = selected.quantile(0.25)
    summary.loc["q75"] = selected.quantile(0.75)
    return summary


def tag(df, variant):
    """Return a copy of ``df`` with a ``setup`` column set to ``variant``; ``df`` itself is not modified."""
    df = df.copy()
    df["setup"] = variant
    return df


# Label each per-setup evaluation frame and stack them, in this order, into one long frame.
frames_by_setup = {
    "common_16": common_16_df,
    "hybrid_16": hybrid_16_df,
    "common_32": common_32_df,
    "hybrid_32": hybrid_32_df,
    "common_64": common_64_df,
    "hybrid_64": hybrid_64_df,
}
all_df = pd.concat([tag(frame, setup) for setup, frame in frames_by_setup.items()])


# Mean, standard deviation and median of every metric for each setup.
per_setup_metrics = all_df.groupby("setup")[metrics]
summary_all = per_setup_metrics.agg(["mean", "std", "median"])

summary_all

# LLM as a judge

In [None]:
!pip install langchain-openai --quiet

In [None]:
from pydantic import BaseModel, Field


# Structured-output schema for the judge: the judge LLM is later wrapped with
# `with_structured_output(PrefixRestoreAndContinuationEvaluation)`, so each verdict is an
# instance of this model with five boolean fields.
# NOTE(review): the class docstring and the field descriptions are presumably forwarded to the
# judge as part of the schema — treat edits to them as prompt changes; confirm before rewording.
class PrefixRestoreAndContinuationEvaluation(BaseModel):
    """
    Binary evaluation of a model output consisting of a restored prefix
    followed by a continuation.

    The evaluator is given the original prefix as ground truth.
    Evaluation focuses primarily on the continuation quality, with a
    lightweight check on restored prefix fidelity.
    """

    # Fidelity of the restored prefix with respect to the original prefix.
    prefix_adequate: bool = Field(
        description=(
            "True if the restored prefix is semantically equivalent to the "
            "original prefix, allowing minor paraphrasing or formatting differences."
        )
    )

    # Logical consistency of the continuation with the original prefix.
    continuation_coherent: bool = Field(
        description=(
            "True if the continuation logically follows from the original prefix "
            "without contradictions."
        )
    )

    # Topical relevance of the continuation.
    continuation_relevant: bool = Field(
        description=(
            "True if the continuation meaningfully stays on-topic with respect "
            "to the original prefix."
        )
    )

    # Grammar and naturalness of the continuation.
    continuation_fluent: bool = Field(
        description=(
            "True if the continuation is grammatically correct and linguistically natural."
        )
    )

    # Match of tone, register and style between continuation and original prefix.
    style_consistent: bool = Field(
        description=(
            "True if the continuation matches the tone, register, and style "
            "of the original prefix."
        )
    )

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Judge prompt. The system message sets the evaluator role; the human message lists the five
# True/False criteria and exposes two template variables, {original_prefix} and {generated_text},
# which are filled in when the chain is invoked.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert evaluator of language model outputs. "
            "You evaluate different performances of LLM to figure out which is the best. "
            "The task is to evaluate a restored prefix and its continuation "
            "given the original prefix as reference."
        ),
        (
            "human",
            """
You are given:

1) The ORIGINAL TEXT PREFIX (ground truth).
2) A MODEL OUTPUT consisting of:
   - a restored prefix (which may contain minor differences)
   - followed by a continuation.

Your primary task is to evaluate the QUALITY OF THE CONTINUATION.
You should only penalize the restored prefix if it is meaningfully incorrect
or alters the intended meaning.

Answer True or False for each criterion:

- Prefix adequately restored:
  Is the restored prefix semantically equivalent to the original prefix,
  allowing minor paraphrasing or formatting differences?

- Continuation coherent:
  Does the continuation logically follow from the original prefix?

- Continuation relevant:
  Does the continuation stay on-topic with respect to the original prefix?

- Continuation fluent:
  Is the continuation grammatically correct and natural?

- Style consistent:
  Does the continuation match the tone and style of the original prefix?

ORIGINAL PREFIX:
{original_prefix}

MODEL OUTPUT (restored prefix + continuation):
{generated_text}
"""
        ),
    ]
)

In [None]:
from google.colab import userdata
from langchain_openai import ChatOpenAI

# Judge model reached through the OpenRouter base URL, with the API key read from Colab's
# user secrets. It is wrapped immediately so every call returns a
# PrefixRestoreAndContinuationEvaluation instead of free text.
judge_llm = ChatOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=userdata.get('OpenRouter'),
    model="x-ai/grok-4.1-fast",
    temperature=0,
).with_structured_output(PrefixRestoreAndContinuationEvaluation)

In [None]:
def generate_samples(
    result_dataset: Dataset,
    model,
    tokenizer,
    generation_length: int = 64,
) -> Iterator[tuple[str, str]]:
    """Yield ``(original_text, generated_text)`` for every sample of a result dataset.

    This is a generator: nothing is computed until it is iterated. Tensors are
    placed on the notebook-level ``device``.

    Args:
        result_dataset: Iterable of samples providing the keys ``"text"`` and ``"embedding"``.
        model: Model passed through to ``generate_from_compression``.
        tokenizer: Tokenizer passed through to ``generate_from_compression``.
        generation_length: Forwarded as the fourth positional argument of
            ``generate_from_compression`` (presumably the number of tokens to
            generate — confirm against that helper). Defaults to 64, the value
            that used to be hard-coded.

    Yields:
        The sample's original text and the first text returned by
        ``generate_from_compression`` for its compression embedding.
    """
    for sample in result_dataset:
        # as_tensor replaces the legacy torch.FloatTensor constructor; unsqueeze adds the leading batch axis.
        compressed_embeddings = (
            torch.as_tensor(sample["embedding"], dtype=torch.float32).unsqueeze(dim=0).to(device)
        )
        generated_text = generate_from_compression(
            model, tokenizer, compressed_embeddings, generation_length, 1
        )[0]
        yield sample["text"], generated_text


# Generate from both 32-token result sets: the common-loss set first, then the hybrid-loss set.
common_pairs = list(generate_samples(common_32_result, model, tokenizer))
hybrid_pairs = list(generate_samples(hybrid_32_result, model, tokenizer))

# The original texts are taken from the common-loss set only; the two sets are paired by position.
starts = [original for original, _ in common_pairs]
samples_common = [generated for _, generated in common_pairs]
samples_hybrid = [generated for _, generated in hybrid_pairs]

samples = [
    {
        "text_start": starts[idx],
        "continuation_a": samples_common[idx],
        "continuation_b": samples_hybrid[idx],
    }
    for idx in range(len(starts))
]

In [None]:
import pickle
from pathlib import Path

# Cache of the generated samples, so the expensive generation cell does not have to be
# re-run and the notebook still works on a fresh kernel when no cache exists yet.
samples_cache = Path("./samples.pkl")
if samples_cache.exists():
    # NOTE: pickle.load can execute arbitrary code; only load a cache this notebook wrote itself.
    with samples_cache.open("rb") as file:
        samples = pickle.load(file)
else:
    # No cache yet: persist the samples produced by the generation cell above.
    with samples_cache.open("wb") as file:
        pickle.dump(samples, file)

In [None]:
from typing import List

# Prompt piped into the structured-output judge.
chain = prompt | judge_llm

common_results: List[PrefixRestoreAndContinuationEvaluation] = []
hybrid_results: List[PrefixRestoreAndContinuationEvaluation] = []

# For every sample, judge the common-loss continuation first and the hybrid-loss one second,
# both against the same original prefix. The printed counter shows progress.
judged_variants = (
    ("continuation_a", common_results),
    ("continuation_b", hybrid_results),
)
for i, sample in enumerate(samples, start=1):
    print(i)
    for continuation_key, verdicts in judged_variants:
        verdict = chain.invoke(
            {
                "original_prefix": sample["text_start"],
                "generated_text": sample[continuation_key],
            }
        )
        verdicts.append(verdict)

# with open("./llm_judge.pkl", "wb") as file:
#     pickle.dump({"common": common_results, "hybrid": hybrid_results}, file)

In [None]:


def per_criterion_rates(results):
    """Return, for every criterion of the evaluation model, the fraction of results where it is true.

    Args:
        results: Sequence of ``PrefixRestoreAndContinuationEvaluation`` instances.

    Returns:
        A dict mapping each field name of the model to its rate. When
        ``results`` is empty every rate is ``nan`` instead of raising
        ``ZeroDivisionError``.
    """
    fields = PrefixRestoreAndContinuationEvaluation.model_fields.keys()
    total = len(results)
    if total == 0:
        # Nothing was judged: report "no data" rather than dividing by zero.
        return {field: float("nan") for field in fields}

    return {field: sum(getattr(r, field) for r in results) / total for field in fields}


# Print the per-criterion rates for the common-loss verdicts first, then for the hybrid-loss ones.
for heading, verdicts in (("Common:", common_results), ("Hybrid:", hybrid_results)):
    print(heading)
    rates = per_criterion_rates(verdicts)
    for k, v in rates.items():
        print(f"{k}: {v:.3f}")

# Activations and attentions

In [None]:
from compression_horizon.src.compression_horizon.inference.generation import calculate_outputs

In [None]:
# Rebuild the model inputs for dataset item 0, the sample used in the checkpoint:
# tokenize it (truncated to 128 tokens), move the tensors to the device and look up
# the token embeddings without tracking gradients.
sample = raw_dataset.select(range(1))
example = tokenizer(sample[0]["text"], truncation=True, max_length=128, return_tensors="pt")
input_ids, attention_mask = (example[key].to(device) for key in ("input_ids", "attention_mask"))
with torch.no_grad():
    embedding_layer = model.get_input_embeddings()
    sequence_embeddings = embedding_layer(input_ids)

In [None]:
# Common loss: load the compression embeddings of the common-loss run and run the model on them
# together with the sample's token embeddings via the project helper calculate_outputs.
# The cells below read `attentions` and `hidden_states` from the result.
compression_embeddings = load_compression_embeddings(
    "/content/drive/MyDrive/compression_horizon/common_loss/HuggingFaceTB/"
    "SmolLM2-1.7B|128|1|bc818cdb-8346-4cf1-beb0-7459ce626638/compressed_prefixes/",
    device,
)
outputs = calculate_outputs(model, compression_embeddings, sequence_embeddings, attention_mask)

In [None]:
# Shape of the first layer's attention tensor; the trailing comment records the value seen in a previous run.
outputs.attentions[0].shape  # torch.Size([1, 32, 129, 129])

In [None]:
# One bar chart per layer: the attention every query position pays to key position 0
# (the compressed token), averaged over the first remaining axis (presumably attention heads).
for i, attn_layer in enumerate(outputs.attentions):
    attn_to_t = attn_layer[0, :, :, 0].mean(dim=0)
    query_positions = range(len(attn_to_t))
    fig, ax = plt.subplots(figsize=(10, 3))
    ax.bar(query_positions, attn_to_t.detach().cpu())
    ax.set_title(f"Attention paid TO compressed token, layer{i}")
    ax.set_xlabel("Query token index")
    ax.set_ylabel("Attention weight")
    plt.show()

In [None]:
# Mean hidden-state value per position for the first five hidden-state tensors (common loss).
plt.figure(figsize=(12, 6))
for i in range(5):
    # .detach() mirrors the attention plots: a tensor that tracks gradients cannot be converted for plotting.
    layer_mean = outputs["hidden_states"][i].mean(dim=-1).squeeze(dim=0).detach().cpu()
    plt.plot(layer_mean, label=f"hidden_states[{i}]")
plt.title("Mean hidden-state value per position (common loss)")
plt.xlabel("Token index")
plt.ylabel("Mean over hidden dimension")
plt.legend()
plt.show()

In [None]:
# Hybrid loss: load the compression embeddings of the hybrid-loss run and run the model on them
# together with the sample's token embeddings via the project helper calculate_outputs.
# This rebinds `outputs`, so the plots below show the hybrid-loss run.
compression_embeddings = load_compression_embeddings(
    "/content/drive/MyDrive/compression_horizon/17-11-2025/hybrid_loss/HuggingFaceTB/"
    "SmolLM2-1.7B|128|1|0.01|cosine|1.0|5|978c86f1-57ac-4eba-a446-6a6dc874d451/compressed_prefixes/",
    device,
)
outputs = calculate_outputs(model, compression_embeddings, sequence_embeddings, attention_mask)

In [None]:
# One bar chart per layer: the attention every query position pays to key position 0
# (the compressed token), averaged over the first remaining axis (presumably attention heads).
for i, attn_layer in enumerate(outputs.attentions):
    attn_to_t = attn_layer[0, :, :, 0].mean(dim=0)
    query_positions = range(len(attn_to_t))
    fig, ax = plt.subplots(figsize=(10, 3))
    ax.bar(query_positions, attn_to_t.detach().cpu())
    ax.set_title(f"Attention paid TO compressed token, layer{i}")
    ax.set_xlabel("Query token index")
    ax.set_ylabel("Attention weight")
    plt.show()

In [None]:
# Mean hidden-state value per position for the first five hidden-state tensors (hybrid loss).
plt.figure(figsize=(10, 5))
for i in range(5):
    # .detach() mirrors the attention plots: a tensor that tracks gradients cannot be converted for plotting.
    layer_mean = outputs["hidden_states"][i].mean(dim=-1).squeeze(dim=0).detach().cpu()
    plt.plot(layer_mean, label=f"hidden_states[{i}]")
plt.title("Mean hidden-state value per position (hybrid loss)")
plt.xlabel("Token index")
plt.ylabel("Mean over hidden dimension")
plt.legend()
plt.show()

# Average distance between the compression embedding and the embeddings of the actual vs. a random sequence

In [None]:
from compression_horizon.src.compression_horizon.inference.generation import calculate_outputs
from compression_horizon.src.compression_horizon.metric import calculate_distances

In [None]:
# The real sample: token ids and their embeddings from the tokenization cell.
actual_sequence = input_ids
actual_embeddings = sequence_embeddings

# Baseline: token ids drawn uniformly from [0, tokenizer.vocab_size), same shape as the real
# sequence. A seeded generator makes the baseline (and the distances below) reproducible.
random_generator = torch.Generator(device=device).manual_seed(0)
random_sequence = torch.randint(
    0,
    tokenizer.vocab_size,
    input_ids.size(),
    generator=random_generator,
    device=device,
)
with torch.no_grad():
    random_embeddings = model.get_input_embeddings()(random_sequence)

In [None]:
# Common loss: run the model on the compression embeddings with the real and with the random sequence.
compression_embeddings = load_compression_embeddings(
    "/content/drive/MyDrive/compression_horizon/common_loss/HuggingFaceTB/"
    "SmolLM2-1.7B|128|1|bc818cdb-8346-4cf1-beb0-7459ce626638/compressed_prefixes/",
    device,
)
actual_outputs = calculate_outputs(model, compression_embeddings, sequence_embeddings, attention_mask)
random_outputs = calculate_outputs(model, compression_embeddings, random_embeddings, attention_mask)

# Per layer: distances between position 0 and all later positions,
# printed first for the real sequence and then for the random one.
layer_pairs = zip(actual_outputs["hidden_states"], random_outputs["hidden_states"])
for i, (actual_hidden, random_hidden) in enumerate(layer_pairs):
    print(f"Hidden states layer {i}:")
    for hidden, line_end in ((actual_hidden, "\n"), (random_hidden, "\n\n")):
        cosine, l2, l1 = calculate_distances(hidden[:, :1, :], hidden[:, 1:, :])
        print(f"Cosine: {cosine} | l2: {l2} | l1: {l1}", end=line_end)

In [None]:
# Hybrid loss: run the model on the compression embeddings with the real and with the random sequence.
compression_embeddings = load_compression_embeddings(
    "/content/drive/MyDrive/compression_horizon/17-11-2025/hybrid_loss/HuggingFaceTB/"
    "SmolLM2-1.7B|128|1|0.01|cosine|1.0|5|978c86f1-57ac-4eba-a446-6a6dc874d451/compressed_prefixes/",
    device,
)
actual_outputs = calculate_outputs(model, compression_embeddings, sequence_embeddings, attention_mask)
random_outputs = calculate_outputs(model, compression_embeddings, random_embeddings, attention_mask)

# Per layer: distances between position 0 and all later positions,
# printed first for the real sequence and then for the random one.
layer_pairs = zip(actual_outputs["hidden_states"], random_outputs["hidden_states"])
for i, (actual_hidden, random_hidden) in enumerate(layer_pairs):
    print(f"Hidden states layer {i}:")
    for hidden, line_end in ((actual_hidden, "\n"), (random_hidden, "\n\n")):
        cosine, l2, l1 = calculate_distances(hidden[:, :1, :], hidden[:, 1:, :])
        print(f"Cosine: {cosine} | l2: {l2} | l1: {l1}", end=line_end)