# Geometric Alignment Metrics

We reuse the miniature Transformer from `simpleLM_drift.ipynb` to measure how different prompt styles align with a reference latent manifold. After training the model on country–capital statements, we:

1. Define several prompts (direct question, Chain-of-Thought, scaffolded instructions, noisy context).
2. Extract the mean hidden state for each prompt.
3. Compare each hidden state to a “gold” reference vector and report cosine similarities.

This approximates controllability (how far a prompt moves us toward the concept basin) and provides a geometric alignment metric for prompt design.



In [None]:
import math
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use("seaborn-v0_8")
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)



In [None]:
vocab = [
    "<pad>", "<sos>", "<eos>", "The", "the", "capital", "of", "is", "What", "?", "It", ".", "city",
    "France", "Paris", "Germany", "Berlin", "Italy", "Rome", "Spain", "Madrid",
    "Portugal", "Lisbon", "Greece", "Athens", "UK", "London", "Russia", "Moscow",
    "Japan", "Tokyo", "China", "Beijing", "India", "New", "Delhi", "Brazil", "Brasilia",
    "Canada", "Ottawa", "Australia", "Canberra", "Egypt", "Cairo", "Turkey", "Ankara"
]
word_to_idx = {w: i for i, w in enumerate(vocab)}

countries_capitals = {
    "France": "Paris", "Germany": "Berlin", "Italy": "Rome", "Spain": "Madrid",
    "Portugal": "Lisbon", "Greece": "Athens", "UK": "London", "Russia": "Moscow",
    "Japan": "Tokyo", "China": "Beijing", "India": "New Delhi", "Brazil": "Brasilia",
    "Canada": "Ottawa", "Australia": "Canberra", "Egypt": "Cairo", "Turkey": "Ankara"
}

sentences = []
for country, capital in countries_capitals.items():
    sentences.append(f"The capital of {country} is {capital} .")
    sentences.append(f"What is the capital of {country} ? It is {capital} .")
    sentences.append(f"The capital city of {country} is {capital} .")

sentences *= 3


def tokenize(sentence: str):
    tokens = sentence.replace(".", " .").split()
    input_ids = [word_to_idx["<sos>"]] + [word_to_idx.get(tok, word_to_idx["<pad>"]) for tok in tokens]
    target_ids = [word_to_idx.get(tok, word_to_idx["<pad>"]) for tok in tokens] + [word_to_idx["<eos>"]]
    return torch.tensor(input_ids), torch.tensor(target_ids)


class CapitalDataset(Dataset):
    def __init__(self, sentences):
        self.examples = [tokenize(s) for s in sentences]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]


def collate(batch):
    inputs = pad_sequence([b[0] for b in batch], batch_first=True, padding_value=word_to_idx["<pad>"])
    targets = pad_sequence([b[1] for b in batch], batch_first=True, padding_value=word_to_idx["<pad>"])
    return inputs, targets


dataset = CapitalDataset(sentences)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate)



In [None]:
def generate_square_subsequent_mask(sz):
    mask = torch.triu(torch.ones(sz, sz)) == 1
    mask = mask.transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


class SimpleLM(nn.Module):
    def __init__(self, vocab_size, d_model=48, nhead=3, num_layers=2):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_embedding = nn.Parameter(torch.zeros(512, d_model))
        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward=192)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, src):
        emb = self.embedding(src) * math.sqrt(self.d_model)
        seq_len = src.size(1)
        emb = emb + self.pos_embedding[:seq_len, :]
        emb = emb.transpose(0, 1)
        mask = generate_square_subsequent_mask(seq_len)
        hidden = self.encoder(emb, mask=mask)
        hidden = hidden.transpose(0, 1)
        return self.proj(hidden)



In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleLM(len(vocab)).to(device)
optimizer = optim.Adam(model.parameters(), lr=5e-3)
criterion = nn.CrossEntropyLoss(ignore_index=word_to_idx["<pad>"])

epochs = 8
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.reshape(-1, len(vocab)), targets.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if (epoch + 1) % 2 == 0:
        print(f"Epoch {epoch+1}/{epochs} | Loss: {total_loss / len(dataloader):.4f}")



In [None]:
@torch.no_grad()
def hidden_state(prompt: str) -> torch.Tensor:
    model.eval()
    prompt = prompt.replace(".", " .")
    tokens = prompt.split()
    ids = torch.tensor([[word_to_idx["<sos>"]] + [word_to_idx.get(tok, word_to_idx["<pad>"]) for tok in tokens]], device=device)
    emb = model.embedding(ids) * math.sqrt(model.d_model)
    seq_len = ids.size(1)
    emb = emb + model.pos_embedding[:seq_len, :]
    emb = emb.transpose(0, 1)
    mask = generate_square_subsequent_mask(seq_len).to(device)
    hidden = model.encoder(emb, mask=mask)
    hidden = hidden.transpose(0, 1)
    return hidden.mean(dim=1).squeeze(0).cpu()



In [None]:
question = "The capital of Spain is"
reference_prompt = "The capital of France is Paris. The capital of Germany is Berlin. The capital of Italy is Rome. The capital of Spain is Madrid. Therefore, the capital of Spain is Madrid."
reference_hidden = hidden_state(reference_prompt)

prompt_variants = {
    "Direct": "The capital of Spain is",
    "Chain-of-Thought": "Let's reason carefully. Spain is a country in Europe. Its well-known capital city is Madrid. So the capital of Spain is",
    "Scaffolded": "Instructions: recall European capitals. Fact: France -> Paris, Germany -> Berlin, Italy -> Rome, Spain -> Madrid. Answer: The capital of Spain is",
    "Noisy": "Some people think Barcelona is the capital. Others mention Madrid. The capital of Spain is",
}



In [None]:
import torch.nn.functional as F

rows = []
ref_norm = F.normalize(reference_hidden, dim=0)
for label, prompt in prompt_variants.items():
    vec = hidden_state(prompt)
    similarity = torch.dot(F.normalize(vec, dim=0), ref_norm).item()
    rows.append({"variant": label, "cos_similarity": similarity})

alignment_df = pd.DataFrame(rows).sort_values("cos_similarity", ascending=False)
alignment_df


In [None]:
plt.figure(figsize=(6, 3))
sns.barplot(data=alignment_df, x="cos_similarity", y="variant", palette="viridis")
plt.title("Latent alignment vs. prompt style")
plt.xlabel("Cosine similarity to reference manifold")
plt.ylabel("Prompt variant")
plt.xlim(0, 1)
plt.show()



Chain-of-Thought and scaffolded prompts land much closer to the reference manifold than direct or noisy prompts. This aligns with the controllability framing: richer control inputs (reasoning steps, explicit instructions) steer the hidden state into the desired basin, while noisy context drifts away.
