# Experiment 037: Task-Relative AQ Extraction

**Question:** Does the same input produce different Action Quanta (AQ) under different task framings?

**Hypothesis:** AQ emerge from the interaction between signal and task; they are not intrinsic properties of the signal.

**Key Prediction:** Activations should diverge across tasks: the mean cross-task cosine similarity should fall below 0.9 in late layers.

In [None]:
# Install dependencies (run once)
!pip install transformers torch matplotlib numpy -q

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import Dict, List

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

## 1. Load Model

In [None]:
# Checkpoint to probe. Change to "gpt2-medium" for better results.
MODEL_NAME = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token for padding when no pad token is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the weights, move them to the selected device, then switch to eval mode.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model = model.to(device)
model.eval()

print(f"Loaded {MODEL_NAME} with {model.config.n_layer} layers")

## 2. Define Task Scenarios

In [None]:
# Core test: one fixed input text, queried under several task framings.
INPUT_TEXT = "A red ball on a blue table, next to a green cup."

# Task name -> question that frames INPUT_TEXT for that task.
TASKS = dict(
    color="What color is the ball?",
    location="Where is the ball?",
    existence="Is there a ball?",
    relation="What is the ball next to?",
    count="How many objects are mentioned?",
)

# Echo the setup so the notebook output records what was compared.
print(f"Input: {INPUT_TEXT}")
print("\nTasks:")
for name in TASKS:
    print(f"  {name}: {TASKS[name]}")

## 3. Extract Activations Under Each Task

In [None]:
def extract_activations(
    input_text: str, task_prompt: str, layers: List[int]
) -> Dict[str, object]:
    """Run one forward pass of the task-framed prompt and capture internals.

    The prompt consists of the task prompt, then a "Context:" section holding
    the input text, then a trailing "Answer:" cue. Forward hooks on the
    selected transformer blocks record each block's output during the pass.

    Args:
        input_text: Text placed in the "Context:" section of the prompt.
        task_prompt: Task framing placed at the start of the prompt.
        layers: Indices into ``model.transformer.h`` of the blocks to record.

    Returns:
        A dict with three entries:
        ``'activations'`` maps each requested layer index to the recorded
        block output (detached, on CPU); ``'attentions'`` is a list with the
        model's attention tensors moved to CPU; ``'logits'`` holds the output
        logits on CPU.
    """
    full_prompt = f"{task_prompt}\n\nContext: {input_text}\n\nAnswer:"
    inputs = tokenizer(full_prompt, return_tensors="pt").to(device)

    activations = {}

    def make_hook(layer_idx):
        # Bind layer_idx per hook so every block writes to its own key.
        def hook(module, inp, out):
            # Presumably a block returns a tuple whose first entry is the
            # hidden states; a bare tensor is accepted too so that indexing
            # does not silently strip its leading dimension.
            hidden = out[0] if isinstance(out, (tuple, list)) else out
            activations[layer_idx] = hidden.detach().cpu()
        return hook

    hooks = []
    try:
        # Register hooks
        for idx in layers:
            hooks.append(
                model.transformer.h[idx].register_forward_hook(make_hook(idx))
            )

        # Forward pass
        with torch.no_grad():
            outputs = model(**inputs, output_attentions=True)
    finally:
        # Always detach the hooks, even when registration or the forward pass
        # fails; otherwise they would stay on the shared model and fire again
        # on every later call.
        for h in hooks:
            h.remove()

    return {
        'activations': activations,
        'attentions': [a.cpu() for a in outputs.attentions],
        'logits': outputs.logits.cpu()
    }

# Indices of the transformer blocks whose outputs are recorded.
LAYERS = [0, 2, 4, 6, 8, 10, 11]

# Task name -> extraction result for INPUT_TEXT framed by that task's prompt.
task_extractions = {}
for task_name in TASKS:
    print(f"Extracting for task: {task_name}")
    extraction = extract_activations(INPUT_TEXT, TASKS[task_name], LAYERS)
    task_extractions[task_name] = extraction

print("\nExtraction complete!")

## 4. Compare Activations Across Tasks

In [None]:
def cosine_similarity(a: torch.Tensor, b: torch.Tensor) -> float:
    """Return the cosine similarity of two tensors viewed as flat vectors.

    Both tensors are flattened and cast to float. When their flattened lengths
    differ, each is truncated to the shorter length before the comparison.
    """
    vec_a = a.reshape(-1).float()
    vec_b = b.reshape(-1).float()
    shared_len = min(vec_a.numel(), vec_b.numel())
    # Compare as single-row batches so the reduction runs over the elements.
    similarity = torch.nn.functional.cosine_similarity(
        vec_a[:shared_len].unsqueeze(0),
        vec_b[:shared_len].unsqueeze(0),
    )
    return similarity.item()

# Task order fixes the row/column order of every similarity matrix.
task_names = list(TASKS.keys())
n_tasks = len(task_names)

# Layer index -> (n_tasks x n_tasks) matrix of pairwise cosine similarities.
layer_similarities = {}
for layer in LAYERS:
    # Gather this layer's activation for every task once, in task order.
    per_task = [task_extractions[name]['activations'][layer] for name in task_names]
    matrix = np.zeros((n_tasks, n_tasks))
    for row, row_act in enumerate(per_task):
        for col, col_act in enumerate(per_task):
            matrix[row, col] = cosine_similarity(row_act, col_act)
    layer_similarities[layer] = matrix

print("Similarity matrices computed")

In [None]:
# Plot similarity matrices by layer: one heatmap panel per probed layer.
# The grid grows with the number of layers instead of assuming at most eight.
n_cols = 4
n_rows = max(1, -(-len(LAYERS) // n_cols))  # ceiling division
fig, axes = plt.subplots(
    n_rows,
    n_cols,
    figsize=(16, 4 * n_rows),
    squeeze=False,  # always get a 2-D array so flatten() works for one row
    constrained_layout=True,  # lays out the shared colorbar and the title
)
axes = axes.flatten()

for idx, layer in enumerate(LAYERS):
    ax = axes[idx]
    im = ax.imshow(layer_similarities[layer], cmap='RdYlGn', vmin=0.5, vmax=1.0)
    ax.set_xticks(range(n_tasks))
    ax.set_yticks(range(n_tasks))
    ax.set_xticklabels(task_names, rotation=45, ha='right', fontsize=8)
    ax.set_yticklabels(task_names, fontsize=8)
    ax.set_title(f'Layer {layer}')

# Hide every panel that has no layer assigned, not just the last one.
for unused_ax in axes[len(LAYERS):]:
    unused_ax.axis('off')

# All panels share vmin/vmax, so the last image's colorbar describes them all.
fig.colorbar(im, ax=axes, shrink=0.6, label='Cosine Similarity')
fig.suptitle('Activation Similarity Across Tasks by Layer', fontsize=14)
plt.show()

## 5. Analyze Divergence by Layer

In [None]:
# Mean cross-task similarity per layer. Only the strict upper triangle is
# averaged, which leaves out the self-similarity entries on the diagonal.
upper_triangle = np.triu_indices(n_tasks, k=1)
mean_divergences = []

for layer in LAYERS:
    mean_sim = layer_similarities[layer][upper_triangle].mean()
    mean_divergences.append((layer, mean_sim))
    print(f"Layer {layer:2d}: mean cross-task similarity = {mean_sim:.4f}")

# Split the (layer, similarity) pairs into the two plotted series.
layers_plot = [pair[0] for pair in mean_divergences]
sims_plot = [pair[1] for pair in mean_divergences]

# Similarity versus layer, with the 0.9 threshold drawn for reference.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(layers_plot, sims_plot, 'bo-', markersize=10, linewidth=2)
ax.axhline(y=0.9, color='r', linestyle='--', label='Threshold (0.9)')
ax.set_xlabel('Layer', fontsize=12)
ax.set_ylabel('Mean Cross-Task Similarity', fontsize=12)
ax.set_title('Activation Divergence Across Tasks by Layer', fontsize=14)
ax.set_ylim(0.5, 1.05)
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

# Hypothesis check: the last probed layer must fall below the 0.9 threshold.
late_layer_sim = sims_plot[-1]
banner = '=' * 50
print(f"\n{banner}")
print("HYPOTHESIS TEST")
print(banner)
print(f"Late layer similarity: {late_layer_sim:.4f}")
if not late_layer_sim < 0.9:
    print("RESULT: NOT SUPPORTED - Representations remain similar")
else:
    print("RESULT: SUPPORTED - Tasks produce divergent representations")

## 6. Attention Pattern Analysis

In [None]:
def get_attention_focus(attentions, top_k=5):
    """Return the positions the last token attends to most in the last layer.

    Takes the final entry of ``attentions``, selects its first element,
    averages over the leading (head) axis and keeps the row of the last query
    token. The indices of the largest weights in that row are returned.

    Args:
        attentions: Sequence of attention tensors, one per layer; presumably
            each is shaped (batch, heads, query, key).
        top_k: Maximum number of positions to return. Fewer are returned when
            the row is shorter than ``top_k``.

    Returns:
        A set of integer positions.
    """
    # Last layer attention, averaged over heads, last query token
    last_attn = attentions[-1][0].mean(dim=0)[-1]  # Shape: [seq_len]
    # topk raises when asked for more entries than exist, so clamp k.
    k = min(top_k, last_attn.numel())
    top_positions = last_attn.topk(k).indices.tolist()
    return set(top_positions)

# Task name -> set of most-attended positions for that task's prompt.
attention_focuses = {}
for task_name in task_names:
    attention_focuses[task_name] = get_attention_focus(
        task_extractions[task_name]['attentions']
    )
    print(f"{task_name}: top-5 attended positions = {sorted(attention_focuses[task_name])}")

# Jaccard overlap (intersection size over union size) for every unordered pair.
print("\nAttention Focus Overlap (Jaccard):")
for pos, task_a in enumerate(task_names):
    focus_a = attention_focuses[task_a]
    for task_b in task_names[pos + 1:]:
        focus_b = attention_focuses[task_b]
        shared = focus_a.intersection(focus_b)
        combined = focus_a.union(focus_b)
        print(f"  {task_a} vs {task_b}: {len(shared) / len(combined):.3f}")

## 7. Output Distribution Comparison

In [None]:
# Compare predicted next tokens across tasks
print("Predicted Next Tokens by Task:\n")

for task_name in task_names:
    # Logits at the final position of the first (only) sequence in the batch.
    logits = task_extractions[task_name]['logits'][0, -1, :]
    probs = torch.softmax(logits, dim=0)

    # Top 5 predictions, converted to plain Python values before decoding.
    top_probs, top_ids = probs.topk(5)
    # "token_id" rather than "id" so the builtin id() is not shadowed.
    top_tokens = [tokenizer.decode([token_id]) for token_id in top_ids.tolist()]

    print(f"{task_name}:")
    for token, prob in zip(top_tokens, top_probs.tolist()):
        print(f"  '{token.strip()}': {prob:.4f}")
    print()

## 8. Test with Ambiguous Input

In [None]:
# Test the "Fire!" example: one ambiguous utterance under several contexts.
AMBIGUOUS_INPUT = "Fire!"

# Context name -> scene-setting prompt that frames the utterance.
CONTEXT_TASKS = {
    'emergency': "You are in a crowded building. Someone shouts:",
    'military': "You are a soldier with weapon ready. The commander says:",
    'campfire': "You are camping and someone points at wood saying:",
    'pottery': "You are in a pottery studio. The instructor says:"
}

print(f"Ambiguous input: '{AMBIGUOUS_INPUT}'\n")

context_extractions = {}
for ctx_name, ctx_prompt in CONTEXT_TASKS.items():
    context_extractions[ctx_name] = extract_activations(
        AMBIGUOUS_INPUT, ctx_prompt, LAYERS
    )

# Compare similarities at the last probed layer. Taking it from LAYERS keeps
# this cell valid when LAYERS is edited, instead of assuming index 11 exists.
final_layer = LAYERS[-1]
print(f"Cross-context similarities (Layer {final_layer}):")
ctx_names = list(CONTEXT_TASKS.keys())
for i, ctx_a in enumerate(ctx_names):
    for ctx_b in ctx_names[i+1:]:
        act_a = context_extractions[ctx_a]['activations'][final_layer]
        act_b = context_extractions[ctx_b]['activations'][final_layer]
        sim = cosine_similarity(act_a, act_b)
        print(f"  {ctx_a} vs {ctx_b}: {sim:.4f}")

# Show the single most probable next token per context
print("\nPredicted next tokens by context:")
for ctx_name in ctx_names:
    logits = context_extractions[ctx_name]['logits'][0, -1, :]
    probs = torch.softmax(logits, dim=0)
    # Plain int so the tokenizer receives an ordinary token id.
    top_id = probs.argmax().item()
    top_token = tokenizer.decode([top_id])
    print(f"  {ctx_name}: '{top_token.strip()}' ({probs[top_id]:.3f})")

## 9. Summary

In [None]:
# Final report. Findings 1-3 print values measured earlier in the notebook
# rather than fixed statements, so the summary cannot contradict the data.
print("="*60)
print("EXPERIMENT 037 SUMMARY")
print("="*60)


def _mean_or_nan(values):
    """Return the arithmetic mean of *values*, or NaN for an empty list."""
    return float(np.mean(values)) if values else float("nan")


# Mean cross-task similarity over the probed layers in each range.
early_mean = _mean_or_nan(
    [sim for layer_idx, sim in zip(layers_plot, sims_plot) if layer_idx <= 4]
)
late_mean = _mean_or_nan(
    [sim for layer_idx, sim in zip(layers_plot, sims_plot) if 8 <= layer_idx <= 11]
)

print("\n1. ACTIVATION DIVERGENCE:")
print(f"   Early layers (0-4): mean cross-task similarity = {early_mean:.4f}")
print(f"   Late layers (8-11): mean cross-task similarity = {late_mean:.4f}")
print(f"   Final layer mean similarity: {sims_plot[-1]:.4f}")

# Jaccard overlap of the attended-position sets for every unordered task pair.
pair_overlaps = [
    len(attention_focuses[task_a] & attention_focuses[task_b])
    / len(attention_focuses[task_a] | attention_focuses[task_b])
    for pos, task_a in enumerate(task_names)
    for task_b in task_names[pos + 1:]
]

print("\n2. ATTENTION PATTERNS:")
print(f"   Mean pairwise Jaccard overlap of attended positions: {_mean_or_nan(pair_overlaps):.3f}")

# Most probable next-token id per task, collapsed to the distinct ids.
top1_ids = {
    int(task_extractions[name]['logits'][0, -1, :].argmax())
    for name in task_names
}

print("\n3. OUTPUT DISTRIBUTIONS:")
print(f"   {len(top1_ids)} distinct top-1 next tokens across {len(task_names)} tasks")

print("\n4. VERDICT:")
if sims_plot[-1] < 0.9:
    print("   HYPOTHESIS SUPPORTED")
    print("   Action Quanta ARE task-relative")
    print("   Same input -> Different representations under different tasks")
else:
    print("   HYPOTHESIS NOT SUPPORTED")
    print("   Representations remain similar across tasks")