# Experiment 035E: Cross-Model AQ Validation with Real AQ Prompts

**AKIRA Project - Oscar Goldman - Shogu Research Group @ Datamutant.ai**

---

## Goal

Test whether AQ (Action Quanta) cluster by **action discrimination type** across multiple models.

Unlike previous experiments that tested output format categories (COMPUTE_NUMBER, etc.),
this experiment tests **true AQ** - patterns that enable discrimination between action alternatives:
- FLEE vs STAY
- APPROACH vs AVOID
- URGENT vs DELAYED
- ATTACK vs DEFEND
- SHARE vs KEEP

---

## Hypothesis

If AQ are fundamental to how LLMs represent actionable information:
1. Prompts requiring the same action discrimination should cluster together
2. This clustering should appear across different model families
3. Clustering should be strongest in late layers (where crystallization occurs)

---

## 1. Setup

In [None]:
# Install dependencies for Colab
!pip install transformers torch numpy scikit-learn matplotlib seaborn scipy tqdm -q

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass, field
import warnings
from scipy import stats
import json
from tqdm import tqdm
import gc

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Pick the compute device once; model loading and activation extraction
# both read this module-level constant.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print(f"Device: {DEVICE}")
print(f"PyTorch version: {torch.__version__}")

if DEVICE == "cuda":
    # Report GPU index 0: its name, then its total memory in units of 1e9 bytes.
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"Memory: {gpu_total_bytes / 1e9:.1f} GB")

## 2. Configuration

In [None]:
def _default_models() -> Dict[str, str]:
    """Return a fresh mapping of short model name -> path given to ``from_pretrained``."""
    return {
        "gpt2-medium": "gpt2-medium",
        "pythia-410m": "EleutherAI/pythia-410m",
        "pythia-1.4b": "EleutherAI/pythia-1.4b",
    }


@dataclass
class ExperimentConfig:
    """Settings for the cross-model AQ validation run.

    Constructing an instance also seeds NumPy's and PyTorch's global random
    number generators with ``random_seed`` (see ``__post_init__``).
    """

    # Short display name -> model path; a new dict is built per instance.
    models: Dict[str, str] = field(default_factory=_default_models)

    # Number of prompts generated for each AQ category.
    prompts_per_category: int = 200
    # Number of bootstrap resamples for the silhouette confidence interval.
    n_bootstrap: int = 1000
    # Fraction of samples held out by the train/test split.
    test_size: float = 0.2
    # Seed for the global RNGs, the train/test split and k-means.
    random_seed: int = 42
    # Defaults to an empty list; not read anywhere in the visible notebook cells.
    layers_to_probe: List[int] = field(default_factory=list)

    def __post_init__(self) -> None:
        """Seed the global NumPy and PyTorch RNGs for reproducibility."""
        seed = self.random_seed
        np.random.seed(seed)
        torch.manual_seed(seed)


# Single shared configuration instance used by the cells below.
config = ExperimentConfig()

model_keys = list(config.models.keys())
print(f"Models to test: {model_keys}")
print(f"Prompts per category: {config.prompts_per_category}")
print(f"Bootstrap samples: {config.n_bootstrap}")

## 3. True AQ Prompt Generation

These prompts test **action discriminations**, not output formats:

| AQ Type | Discrimination | Example |
|---------|----------------|----------|
| FLEE_STAY | Should I flee or stay? | "A predator approaches..." |
| APPROACH_AVOID | Should I approach or avoid? | "A strange object appears..." |
| URGENT_DELAYED | Act now or wait? | "The situation is escalating..." |
| ATTACK_DEFEND | Offensive or defensive? | "The enemy is vulnerable..." |
| SHARE_KEEP | Share or keep? | "You have found limited supplies..." |

In [None]:
# TRUE AQ CATEGORIES - Action Discriminations
#
# Maps each AQ category name to a dict with three keys:
#   "description"    - one-sentence summary of the discrimination
#   "discrimination" - short "A vs B" label (printed in the summary below)
#   "templates"      - 20 scenario prompts, each ending in an open action stem
#                      ("You should" / "You must" / "You need to")
# Five categories are defined; the category name is what the prompt generator
# uses as the label for every prompt built from that category's templates.

AQ_CATEGORIES = {
    "FLEE_STAY": {
        "description": "Discriminates between fleeing and staying",
        "discrimination": "FLEE vs STAY",
        "templates": [
            "A predator is approaching rapidly. You should",
            "The building is on fire and spreading. You must",
            "An attacker is coming toward you. You need to",
            "The flood waters are rising quickly. You should",
            "A dangerous snake is nearby. You must",
            "The volcano is erupting. You need to",
            "Toxic gas is leaking into the room. You should",
            "The ice beneath you is cracking. You must",
            "A swarm of bees is heading your way. You need to",
            "The ceiling is about to collapse. You should",
            "A wild bear has spotted you. You must",
            "The tsunami warning has sounded. You need to",
            "Gunshots are getting closer. You should",
            "The car is about to explode. You must",
            "A tornado is forming nearby. You need to",
            "The bridge is collapsing. You should",
            "Lava is flowing toward you. You must",
            "The bomb will detonate soon. You need to",
            "A pack of wolves surrounds you. You should",
            "The radiation levels are rising. You must",
        ]
    },
    "APPROACH_AVOID": {
        "description": "Discriminates between approaching and avoiding",
        "discrimination": "APPROACH vs AVOID",
        "templates": [
            "A strange glowing object appears ahead. You should",
            "An unknown person is waving at you. You need to",
            "A mysterious door has opened. You should",
            "An unfamiliar animal is watching you. You must",
            "A new path has revealed itself. You need to",
            "Something is moving in the shadows. You should",
            "A light flickers in the distance. You must",
            "An unmarked package sits on the ground. You need to",
            "A voice calls from the darkness. You should",
            "A hidden passage has been discovered. You must",
            "An abandoned vehicle blocks the road. You need to",
            "A figure stands motionless ahead. You should",
            "Something shiny catches your eye. You must",
            "A cave entrance appears before you. You need to",
            "An unusual sound comes from nearby. You should",
            "A locked chest sits in the corner. You must",
            "A trail of footprints leads away. You need to",
            "A reflection appears in the water. You should",
            "A signal fire burns on the hill. You must",
            "A message is carved into the tree. You need to",
        ]
    },
    "URGENT_DELAYED": {
        "description": "Discriminates between acting now and waiting",
        "discrimination": "NOW vs LATER",
        "templates": [
            "The window of opportunity is closing. You should",
            "The situation is rapidly escalating. You must",
            "Time is running out to decide. You need to",
            "The moment to act is almost gone. You should",
            "Events are unfolding quickly. You must",
            "The deadline is approaching fast. You need to",
            "Circumstances are changing rapidly. You should",
            "The critical moment is here. You must",
            "Things are moving too fast. You need to",
            "The situation demands immediate action. You should",
            "Everything is happening at once. You must",
            "The pressure is mounting quickly. You need to",
            "Events are spiraling out of control. You should",
            "The turning point has arrived. You must",
            "Momentum is building rapidly. You need to",
            "The situation is reaching a climax. You should",
            "Forces are converging now. You must",
            "The crisis point is imminent. You need to",
            "Everything hangs in the balance. You should",
            "The decisive moment approaches. You must",
        ]
    },
    "ATTACK_DEFEND": {
        "description": "Discriminates between offensive and defensive action",
        "discrimination": "ATTACK vs DEFEND",
        "templates": [
            "The enemy has left an opening. You should",
            "Your opponent is off balance. You must",
            "A weakness has been revealed. You need to",
            "The adversary is distracted. You should",
            "An opportunity to strike appears. You must",
            "The hostile force is vulnerable. You need to",
            "Your rival has made a mistake. You should",
            "The threat has lowered its guard. You must",
            "A gap in the defense appears. You need to",
            "The aggressor is momentarily stunned. You should",
            "Your foe is recovering from a blow. You must",
            "The opposition is in disarray. You need to",
            "A critical vulnerability is exposed. You should",
            "The enemy formation has broken. You must",
            "Your adversary is retreating. You need to",
            "The hostile target is isolated. You should",
            "A flanking opportunity presents itself. You must",
            "The opposing force is overextended. You need to",
            "Your enemy is out of position. You should",
            "The threat has exhausted its resources. You must",
        ]
    },
    "SHARE_KEEP": {
        "description": "Discriminates between sharing and keeping",
        "discrimination": "SHARE vs KEEP",
        "templates": [
            "You have found limited supplies. You should",
            "Resources are scarce for everyone. You must",
            "Others are in need of what you have. You need to",
            "Your provisions could help the group. You should",
            "The community is struggling. You must",
            "You possess something valuable. You need to",
            "Others depend on your generosity. You should",
            "Your reserves could make a difference. You must",
            "The group faces shortage. You need to",
            "What you have could save others. You should",
            "Your surplus could help many. You must",
            "The collective need is great. You need to",
            "Your abundance contrasts with others' lack. You should",
            "Sharing would cost you significantly. You must",
            "Others are watching what you do. You need to",
            "Your choice affects the community. You should",
            "Generosity has consequences here. You must",
            "Self-preservation conflicts with helping. You need to",
            "The moral choice is unclear. You should",
            "Your decision will be remembered. You must",
        ]
    }
}

# Sanity printout: one line per category with its label and template count.
print(f"Defined {len(AQ_CATEGORIES)} AQ categories:")
for name, data in AQ_CATEGORIES.items():
    print(f"  {name}: {data['discrimination']} ({len(data['templates'])} templates)")

In [None]:
def _prompt_variations(template: str) -> List[str]:
    """Return the distinct surface variations of one template, original first.

    Several rewrites are no-ops for a given template (e.g. replacing
    "You must" in a template that ends in "You should"), so duplicates are
    dropped while keeping first-seen order.
    """
    candidates = [
        template,
        template.replace("You should", "You must"),
        template.replace("You should", "You need to"),
        template.replace("You must", "You should"),
        template.replace("You need to", "You should"),
        "Warning: " + template,
        "Alert: " + template,
        template + " quickly",
        template + " immediately",
        template + " now",
    ]
    return list(dict.fromkeys(candidates))


def generate_aq_prompts(
    n_per_category: int = 200,
    categories: Optional[Dict[str, Dict[str, Any]]] = None,
) -> Tuple[List[str], List[str]]:
    """Generate prompts for each AQ category.

    Prompt ``i`` of a category uses template ``i % len(templates)``. The
    variation advances once per full pass over the templates
    (``i // len(templates)``), so successive passes produce different text.
    Indexing the variation with ``i`` directly would tie it to the template
    index whenever the two list lengths share a factor, yielding only
    ``len(templates)`` distinct prompts no matter how many are requested.

    Prompts can still repeat once a template has used all of its distinct
    variations (7 or 8 for the templates in ``AQ_CATEGORIES``).

    Args:
        n_per_category: Number of prompts per AQ category
        categories: Mapping of category name to a dict holding a
            "templates" list. Defaults to the module-level ``AQ_CATEGORIES``.

    Returns:
        Tuple of (prompts, labels); both lists have
        ``n_per_category * len(categories)`` entries, grouped by category in
        mapping order.

    Raises:
        ValueError: If prompts are requested from a category with no templates.
    """
    if categories is None:
        categories = AQ_CATEGORIES

    all_prompts: List[str] = []
    all_labels: List[str] = []

    for category_name, category_data in categories.items():
        templates = category_data["templates"]
        if n_per_category > 0 and not templates:
            raise ValueError(f"AQ category {category_name!r} has no templates")

        # Build each template's variation list once rather than per prompt.
        variants_by_template = [_prompt_variations(t) for t in templates]

        for i in range(n_per_category):
            pass_idx, template_idx = divmod(i, len(templates))
            variants = variants_by_template[template_idx]
            all_prompts.append(variants[pass_idx % len(variants)])
            all_labels.append(category_name)

    return all_prompts, all_labels


# Build the full prompt set once; PROMPTS[i] belongs to category LABELS[i].
PROMPTS, LABELS = generate_aq_prompts(config.prompts_per_category)

distinct_labels = list(set(LABELS))
print(f"\nGenerated {len(PROMPTS)} prompts total")
print(f"Categories: {distinct_labels}")
print("\nExample prompts:")
# Show the first prompt (truncated to 60 characters) of the first three categories.
for cat in list(AQ_CATEGORIES)[:3]:
    first_idx = LABELS.index(cat)
    preview = PROMPTS[first_idx][:60]
    print(f"  {cat}: {preview}...")

## 4. Activation Extraction

In [None]:
def get_model_layers(model_name: str) -> List[int]:
    """Return the layer indices to probe for ``model_name``.

    Names containing "gpt2", "pythia-410m" or "pythia-1.4b" get seven probe
    points from 0 to 23; any other name gets four points from 0 to 12. A new
    list is returned on every call.

    NOTE(review): index 23 presumes the matched model has at least 24 blocks.
    Any name containing "gpt2" matches, so a shallower GPT-2 checkpoint would
    receive out-of-range indices - confirm before adding models.
    """
    deep_families = ("gpt2", "pythia-410m", "pythia-1.4b")
    if any(family in model_name for family in deep_families):
        return [0, 4, 8, 12, 16, 20, 23]
    return [0, 4, 8, 12]


def load_model(model_path: str):
    """Load a causal LM and its tokenizer, ready for inference on DEVICE.

    Args:
        model_path: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        Tuple of (model, tokenizer). The model is moved to ``DEVICE`` and put
        in eval mode; the tokenizer is given a pad token if it has none.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Fall back to the EOS token when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(model_path).to(DEVICE)
    model.eval()

    return model, tokenizer


def get_activation(prompt: str, model, tokenizer, layer: int) -> np.ndarray:
    """Get activation at specified layer for a prompt.

    Runs one forward pass with a temporary forward hook on the chosen block
    and returns that block's output at the last sequence position.

    Args:
        prompt: Text to tokenize and feed to the model.
        model: Causal LM exposing its blocks as ``transformer.h``,
            ``gpt_neox.layers`` or ``model.layers`` (checked in that order).
        tokenizer: Tokenizer called as ``tokenizer(prompt, return_tensors="pt")``.
        layer: Index of the block to hook.

    Returns:
        1-D NumPy array: the hooked block's output for batch item 0 at the
        final position.

    Raises:
        RuntimeError: If the forward pass finished without the hook firing.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)

    captured = {}

    def hook_fn(module, input, output):
        # Some blocks return a tuple whose first element is the hidden state.
        hidden = output[0] if isinstance(output, tuple) else output
        captured["act"] = hidden.detach()

    # Locate the block list for the architectures this notebook supports.
    if hasattr(model, 'transformer'):
        block = model.transformer.h[layer]
    elif hasattr(model, 'gpt_neox'):
        block = model.gpt_neox.layers[layer]
    else:
        block = model.model.layers[layer]

    handle = block.register_forward_hook(hook_fn)
    try:
        with torch.no_grad():
            model(**inputs)
    finally:
        # Always detach the hook, even if the forward pass raises; a leaked
        # hook would keep firing on every later call for this model.
        handle.remove()

    if "act" not in captured:
        raise RuntimeError(
            f"Forward hook on layer {layer} did not fire; no activation captured"
        )

    # Get last token activation
    return captured["act"][0, -1, :].cpu().numpy()


print("Activation extraction functions ready")

## 5. Statistical Analysis Functions

In [None]:
def bootstrap_silhouette(activations: np.ndarray, labels: List[str],
                         n_bootstrap: int = 1000) -> Tuple[float, float, float]:
    """Compute silhouette score with bootstrap confidence interval.

    Each round resamples rows with replacement using NumPy's global RNG and
    scores the resample against its true labels. A round is skipped when the
    resample cannot be scored: ``silhouette_score`` needs at least 2 distinct
    labels and fewer distinct labels than samples.

    Args:
        activations: Array with one row per sample.
        labels: Category label per row, same length as ``activations``.
        n_bootstrap: Number of bootstrap resamples to draw.

    Returns:
        Tuple of (mean score, 2.5th percentile, 97.5th percentile) as built-in
        floats, so the values serialize to JSON as numbers. All three are NaN
        when no resample could be scored.
    """
    activations = np.asarray(activations)
    label_arr = np.asarray(labels)
    n_samples = len(label_arr)
    bootstrap_scores = []

    for _ in range(n_bootstrap):
        indices = np.random.choice(n_samples, n_samples, replace=True)
        boot_labels = label_arr[indices]
        n_distinct = len(np.unique(boot_labels))

        if 1 < n_distinct < n_samples:
            bootstrap_scores.append(
                silhouette_score(activations[indices], boot_labels)
            )

    if not bootstrap_scores:
        nan = float("nan")
        return nan, nan, nan

    ci_low, ci_high = np.percentile(bootstrap_scores, [2.5, 97.5])
    return float(np.mean(bootstrap_scores)), float(ci_low), float(ci_high)


def _pairwise_distance_split(activations: np.ndarray,
                             labels: List[str]) -> Tuple[np.ndarray, np.ndarray]:
    """Split all pairwise Euclidean distances into (within, between) arrays.

    Every unordered pair of samples is counted once. "within" holds distances
    between samples sharing a label, "between" the rest. Distances are computed
    in float64 regardless of the input dtype.

    Raises:
        ValueError: If ``labels`` and ``activations`` differ in length.
    """
    from scipy.spatial.distance import pdist

    points = np.asarray(activations, dtype=np.float64)
    n_samples = len(labels)
    if len(points) != n_samples:
        raise ValueError(
            f"Got {len(points)} activation rows but {n_samples} labels"
        )
    if n_samples < 2:
        # No pairs exist.
        return np.empty(0), np.empty(0)

    distances = pdist(points.reshape(n_samples, -1), metric="euclidean")

    # pdist emits pairs (i, j) with i < j in row-major order, which is exactly
    # the order np.triu_indices(n, k=1) enumerates.
    _, codes = np.unique(np.asarray(labels), return_inverse=True)
    row_idx, col_idx = np.triu_indices(n_samples, k=1)
    same_label = codes[row_idx] == codes[col_idx]

    return distances[same_label], distances[~same_label]


def compute_cohens_d(activations: np.ndarray, labels: List[str]) -> float:
    """Compute Cohen's d for within vs between cluster distances.

    The effect size is ``(mean(between) - mean(within)) / pooled_std``, where
    ``pooled_std`` is the square root of the average of the two population
    variances.

    Args:
        activations: Array with one row per sample.
        labels: Category label per row.

    Returns:
        Effect size as a built-in float. 0.0 when either distance group is
        empty (fewer than two samples, a single category, or no repeated
        category) or when the pooled standard deviation is zero.
    """
    within, between = _pairwise_distance_split(activations, labels)
    if within.size == 0 or between.size == 0:
        return 0.0

    pooled_std = np.sqrt((np.var(within) + np.var(between)) / 2)
    if pooled_std == 0:
        return 0.0

    return float((between.mean() - within.mean()) / pooled_std)


def compute_distance_ratio(activations: np.ndarray, labels: List[str]) -> float:
    """Compute ratio of between-cluster to within-cluster distance.

    Args:
        activations: Array with one row per sample.
        labels: Category label per row.

    Returns:
        ``mean(between) / mean(within)`` as a built-in float. 0.0 when either
        distance group is empty or the mean within-cluster distance is zero.
    """
    within, between = _pairwise_distance_split(activations, labels)
    if within.size == 0 or between.size == 0:
        return 0.0

    within_mean = within.mean()
    if within_mean == 0:
        return 0.0

    return float(between.mean() / within_mean)


print("Statistical functions ready")

## 6. Run Cross-Model Experiment

In [None]:
def run_experiment_for_model(model_name: str, model_path: str,
                              prompts: List[str], labels: List[str]) -> Dict:
    """Run full experiment for a single model.

    For every probed layer: extract one activation per prompt, split into
    stratified train/test sets, then compute clustering metrics against the
    true category labels.

    Args:
        model_name: Short name; printed and used to choose the layers to probe.
        model_path: Identifier handed to ``load_model``.
        prompts: Prompt texts.
        labels: Category label per prompt, same length as ``prompts``.

    Returns:
        ``{"layers": {layer: metrics}}`` where ``metrics`` holds
        "silhouette" (bootstrap mean on the training split), "silhouette_ci"
        ([low, high]), "cohens_d", "distance_ratio", "test_silhouette" and
        "ari" (k-means vs true labels on the training split). Every metric is
        a built-in float so the structure serializes to JSON as numbers.

    The model and tokenizer are released even when a layer raises, so a
    failing model does not keep its memory while the next one is loaded.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Processing: {model_name}")
    print(f"{banner}")

    model, tokenizer = load_model(model_path)
    results = {"layers": {}}

    try:
        layers = get_model_layers(model_name)
        print(f"Model loaded. Layers to probe: {layers}")

        n_clusters = len(set(labels))

        for layer in layers:
            print(f"\n  Layer {layer}...")

            # Extract activations (one forward pass per prompt for this layer)
            activations = np.array([
                get_activation(prompt, model, tokenizer, layer)
                for prompt in tqdm(prompts, desc="    Extracting")
            ])

            # Train/test split, stratified so each split keeps every category
            train_acts, test_acts, train_labels, test_labels = train_test_split(
                activations, labels, test_size=config.test_size,
                random_state=config.random_seed, stratify=labels
            )

            # Compute metrics on training set
            sil_mean, sil_low, sil_high = bootstrap_silhouette(
                train_acts, train_labels, config.n_bootstrap
            )
            cohens_d = compute_cohens_d(train_acts, train_labels)
            dist_ratio = compute_distance_ratio(train_acts, train_labels)

            # Test set validation
            test_sil = silhouette_score(test_acts, test_labels)

            # K-means clustering and ARI
            kmeans = KMeans(n_clusters=n_clusters,
                            random_state=config.random_seed, n_init=10)
            pred_labels = kmeans.fit_predict(train_acts)

            label_to_int = {l: i for i, l in enumerate(sorted(set(train_labels)))}
            true_labels_int = [label_to_int[l] for l in train_labels]
            ari = adjusted_rand_score(true_labels_int, pred_labels)

            # Cast to built-in floats: NumPy float32 scalars are not JSON
            # numbers and would otherwise be written out as strings.
            results["layers"][layer] = {
                "silhouette": float(sil_mean),
                "silhouette_ci": [float(sil_low), float(sil_high)],
                "cohens_d": float(cohens_d),
                "distance_ratio": float(dist_ratio),
                "test_silhouette": float(test_sil),
                "ari": float(ari),
            }

            print(f"    Silhouette: {sil_mean:.3f} [{sil_low:.3f}, {sil_high:.3f}]")
            print(f"    Cohen's d: {cohens_d:.3f}")
            print(f"    Test Silhouette: {test_sil:.3f}")

            # Memory cleanup
            del activations, train_acts, test_acts
            if DEVICE == "cuda":
                torch.cuda.empty_cache()
    finally:
        # Cleanup model; collect garbage first so the freed tensors can
        # actually be returned by empty_cache().
        del model, tokenizer
        gc.collect()
        if DEVICE == "cuda":
            torch.cuda.empty_cache()

    return results


print("Experiment runner ready")

In [None]:
# Run experiments for all models
rule = "=" * 70
print("\n" + rule)
print("RUNNING CROSS-MODEL AQ VALIDATION")
print(rule)

# model name -> results dict; models that raise are reported and left out.
ALL_RESULTS = {}

for model_name, model_path in config.models.items():
    try:
        results = run_experiment_for_model(model_name, model_path, PROMPTS, LABELS)
    except Exception as e:
        # Best effort: one failing model must not stop the remaining ones.
        print(f"Error with {model_name}: {e}")
    else:
        ALL_RESULTS[model_name] = results

print("\nAll models processed.")

## 7. Results Visualization

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(14, 10))


def _best_layer_metrics(model_results):
    """Return the metrics dict of the layer with the highest mean silhouette."""
    per_layer = model_results["layers"]
    top_layer = max(per_layer.keys(), key=lambda l: per_layer[l]["silhouette"])
    return per_layer[top_layer]


def _plot_metric_by_layer(ax, metric, marker):
    """Draw one line per model: ``metric`` against layer index."""
    for model_name, results in ALL_RESULTS.items():
        layers = sorted(results["layers"].keys())
        values = [results["layers"][l][metric] for l in layers]
        ax.plot(layers, values, marker=marker, label=model_name,
                linewidth=2, markersize=8)


# 1. Silhouette by Layer
ax = axes[0, 0]
_plot_metric_by_layer(ax, "silhouette", 'o')
ax.axhline(y=0.15, color='r', linestyle='--', alpha=0.7, label='Threshold (0.15)')
ax.set_xlabel("Layer")
ax.set_ylabel("Silhouette Score")
ax.set_title("Silhouette Score by Layer (Higher = Better Clustering)")
ax.legend()
ax.grid(True, alpha=0.3)

# 2. Effect Size by Layer
ax = axes[0, 1]
_plot_metric_by_layer(ax, "cohens_d", 's')
ax.axhline(y=0.5, color='orange', linestyle='--', alpha=0.7, label='Medium effect (0.5)')
ax.axhline(y=0.8, color='red', linestyle='--', alpha=0.7, label='Large effect (0.8)')
ax.set_xlabel("Layer")
ax.set_ylabel("Cohen's d")
ax.set_title("Effect Size by Layer")
ax.legend()
ax.grid(True, alpha=0.3)

# Panels 3 and 4 both summarize each model at its best-silhouette layer.
model_names = list(ALL_RESULTS.keys())
best_per_model = [_best_layer_metrics(ALL_RESULTS[name]) for name in model_names]

# 3. Best Silhouette per Model
ax = axes[1, 0]
best_sils = [best["silhouette"] for best in best_per_model]
# Error bars are offsets from the bar height down/up to the CI bounds.
ci_lows = [best["silhouette"] - best["silhouette_ci"][0] for best in best_per_model]
ci_highs = [best["silhouette_ci"][1] - best["silhouette"] for best in best_per_model]

ax.bar(model_names, best_sils, yerr=[ci_lows, ci_highs], capsize=5, alpha=0.7)
ax.axhline(y=0.15, color='r', linestyle='--', label='Threshold')
ax.set_ylabel("Silhouette Score")
ax.set_title("Best Silhouette Score per Model (95% CI)")
ax.legend()
ax.tick_params(axis='x', rotation=45)

# 4. Distance Ratio per Model
ax = axes[1, 1]
ratios = [best["distance_ratio"] for best in best_per_model]

ax.bar(model_names, ratios, alpha=0.7, color='green')
ax.axhline(y=1.0, color='r', linestyle='--', label='No separation')
ax.set_ylabel("Between/Within Distance Ratio")
ax.set_title("Cluster Separation Ratio per Model")
ax.legend()
ax.tick_params(axis='x', rotation=45)

plt.suptitle("035E: Cross-Model AQ Validation (Real AQ Prompts)", fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig("035E_real_aq_results.png", dpi=150, bbox_inches='tight')
plt.show()
print("Saved: 035E_real_aq_results.png")

## 8. Summary

In [None]:
heavy_rule = "=" * 80
light_rule = "-" * 100

print("\n" + heavy_rule)
print("CROSS-MODEL COMPARISON SUMMARY")
print(heavy_rule)

print("\nBest Layer Results per Model:")
print(light_rule)
# Column title and left-aligned width, in display order.
columns = [
    ("Model", 15), ("Layer", 6), ("Silhouette", 12), ("95% CI", 20),
    ("Cohen d", 10), ("Ratio", 8), ("Test Sil", 10), ("ARI", 8),
]
print(" ".join(f"{title:<{width}}" for title, width in columns))
print(light_rule)

# One row per model, reporting the layer with the highest mean silhouette.
for model_name, results in ALL_RESULTS.items():
    per_layer = results["layers"]
    best_layer = max(per_layer.keys(), key=lambda l: per_layer[l]["silhouette"])
    best = per_layer[best_layer]
    ci_low, ci_high = best["silhouette_ci"][0], best["silhouette_ci"][1]

    cells = [
        f"{model_name:<15}",
        f"{best_layer:<6}",
        f"{best['silhouette']:<12.3f}",
        f"[{ci_low:.3f}, {ci_high:.3f}]" + " " * 5,
        f"{best['cohens_d']:<10.3f}",
        f"{best['distance_ratio']:<8.3f}",
        f"{best['test_silhouette']:<10.3f}",
        f"{best['ari']:<8.3f}",
    ]
    print(" ".join(cells))

print(light_rule)

def _json_default(obj):
    """Convert values the json module cannot encode natively.

    NumPy scalars (e.g. float32 metrics) become the matching built-in number
    and arrays become nested lists, so they are written as JSON numbers
    rather than strings. Anything else falls back to ``str`` so saving stays
    best-effort.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return str(obj)


# Save results
with open("035E_real_aq_results.json", "w", encoding="utf-8") as f:
    json.dump(ALL_RESULTS, f, indent=2, default=_json_default)

print("\nResults saved to 035E_real_aq_results.json")