# Experiment 038: Three-Level Distinction

**Question:** Can we empirically distinguish Measurements (L1), Inferences (L2), and Action Quanta (L3)?

**Hypothesis:** L3 is a strict subset of L2, more stable, and uniquely load-bearing.

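**Key Prediction:** Ablating the L3 features reduces the target-token probability by more than 50%, whereas ablating an equal number of L2-only (non-L3) features reduces it by less than 20%.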

In [None]:
!pip install transformers torch scikit-learn matplotlib -q

In [None]:
from typing import Dict, List, Set, Tuple

import matplotlib.pyplot as plt
import numpy as np
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from transformers import AutoModelForCausalLM, AutoTokenizer

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

## 1. Load Model

In [None]:
# Checkpoint under study and the index of the transformer block that is probed.
MODEL_NAME = "gpt2"
PROBE_LAYER = 8

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model = model.to(device)
model.eval()  # evaluation mode for all later forward passes
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Reuse EOS as the padding token when the tokenizer ships without one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Loaded {MODEL_NAME}")

## 2. Define Three Levels

In [None]:
# L1: Raw activations (Measurements)
def extract_L1(text: str, layer: int = PROBE_LAYER) -> torch.Tensor:
    """Extract mean-pooled raw activations (L1) from one transformer block.

    Args:
        text: Input text; tokenized as a single example.
        layer: Index into ``model.transformer.h`` of the block to read.

    Returns:
        CPU tensor holding the block's output averaged over dimension 1
        (the sequence axis), with singleton dimensions squeezed out.
    """
    inputs = tokenizer(text, return_tensors="pt").to(device)
    captured = {}

    def hook(module, args, output):
        # The block output is expected to be a tuple whose first item is the
        # hidden states. Accept a bare tensor as well, because indexing a
        # tensor with [0] would silently select the first batch row instead.
        hidden = output[0] if isinstance(output, tuple) else output
        captured['out'] = hidden.detach().cpu()

    handle = model.transformer.h[layer].register_forward_hook(hook)
    try:
        with torch.no_grad():
            model(**inputs)
    finally:
        # Always remove the hook, even if the forward pass raises, so it
        # cannot leak into later calls on the shared model.
        handle.remove()

    return captured['out'].mean(dim=1).squeeze()  # Pool over sequence

# Test L1: smoke-test the extractor on one sentence and report the shape of
# the pooled activation tensor it returns.
test_L1 = extract_L1("This is a test sentence.")
print(f"L1 shape: {test_L1.shape}")

## 3. Train L2 Probes (Inferences)

In [None]:
# Training data for sentiment probe
POSITIVE_TEXTS = [
    "This movie was wonderful and amazing!",
    "I loved every minute, fantastic experience.",
    "Excellent service, delicious food, great atmosphere.",
    "Beautiful day, everything is perfect!",
    "Best thing ever, so happy and grateful.",
    "Brilliant work, truly outstanding performance.",
    "Exceptional quality, exceeded expectations.",
    "Absolutely delightful, highly recommend!",
]

NEGATIVE_TEXTS = [
    "This movie was terrible and boring.",
    "I hated every minute, awful experience.",
    "Horrible service, disgusting food, bad atmosphere.",
    "Terrible day, everything went wrong!",
    "Worst thing ever, so sad and frustrated.",
    "Awful work, completely disappointing performance.",
    "Poor quality, failed to meet expectations.",
    "Absolutely dreadful, do not recommend!",
]

# Extract features: one pooled L1 vector per sentence, positives first.
X_train = np.array(
    [extract_L1(sentence).numpy() for sentence in POSITIVE_TEXTS + NEGATIVE_TEXTS]
)
# Labels follow the same order: 1 = positive, 0 = negative.
y_train = np.array([1] * len(POSITIVE_TEXTS) + [0] * len(NEGATIVE_TEXTS))

print(f"Training data: {X_train.shape}")

In [None]:
# Train sentiment probe (L2): a logistic-regression classifier on L1 features.
sentiment_probe = LogisticRegression(max_iter=1000)

# Cross-validation (4 folds) to estimate how well the probe generalizes.
cv_scores = cross_val_score(sentiment_probe, X_train, y_train, cv=4)
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()
print(f"Probe CV accuracy: {cv_mean:.3f} (+/- {cv_std:.3f})")

# Fit on all data; this fitted probe is what the L2 extraction uses.
sentiment_probe.fit(X_train, y_train)

# L2 extraction function
def extract_L2(text: str) -> Dict:
    """Run the sentiment probe on a text's L1 features.

    Returns:
        Dict with the predicted label ('positive' or 'negative'), the
        probability of the more likely class ('confidence'), and both
        class probabilities.
    """
    # The probe expects a 2-D array, so present the vector as a one-row batch.
    features = extract_L1(text).numpy().reshape(1, -1)
    label = sentiment_probe.predict(features)[0]
    class_probs = sentiment_probe.predict_proba(features)[0]

    if label == 1:
        prediction = 'positive'
    else:
        prediction = 'negative'

    return {
        'prediction': prediction,
        'confidence': class_probs.max(),
        'prob_positive': class_probs[1],
        'prob_negative': class_probs[0]
    }

# Test L2: run the probe on one clearly positive and one clearly negative phrase.
print("\nL2 Test:")
for sample in ('Great movie!', 'Terrible film!'):
    print(f"  '{sample}': {extract_L2(sample)}")

## 4. Identify L3 Features (Load-Bearing)

In [None]:
def compute_feature_importance(text: str, target_token: str) -> torch.Tensor:
    """Compute per-dimension input-embedding importance via gradients.

    The logit of the target token at the last position is differentiated
    with respect to the input embeddings, and the absolute gradient is
    averaged over the batch and sequence axes.

    Args:
        text: Prompt fed to the model.
        target_token: Text whose first token id is scored; the EOS id is
            used if it encodes to no tokens.

    Returns:
        CPU tensor with one importance value per embedding dimension.
    """
    inputs = tokenizer(text, return_tensors="pt").to(device)
    target_ids = tokenizer.encode(target_token, add_special_tokens=False)
    target_id = target_ids[0] if target_ids else tokenizer.eos_token_id

    # Get embeddings with gradients. Detach first so the tensor is a leaf:
    # a clone of the embedding layer's output is a non-leaf whenever the
    # embedding weights require grad, and backward() leaves a non-leaf's
    # ``.grad`` as None.
    embeddings = model.get_input_embeddings()(inputs['input_ids'])
    embeddings = embeddings.detach().clone().requires_grad_(True)

    outputs = model(inputs_embeds=embeddings)
    target_logit = outputs.logits[0, -1, target_id]
    # autograd.grad returns the gradient directly and does not accumulate
    # gradients into the model parameters across repeated calls.
    (grad,) = torch.autograd.grad(target_logit, embeddings)

    # Importance = gradient magnitude per dimension
    importance = grad.abs().mean(dim=(0, 1)).cpu()
    return importance

def identify_L3(text: str, target_token: str, top_k: int = 50) -> Tuple[Set[int], torch.Tensor]:
    """Identify L3 features (most important for target prediction).

    Args:
        text: Prompt fed to the model.
        target_token: Text whose first token is the prediction target.
        top_k: Number of highest-importance dimensions to keep; clamped to
            the number of available dimensions.

    Returns:
        A pair ``(indices, importance)``: the set of selected dimension
        indices and the full importance vector they were chosen from.
    """
    importance = compute_feature_importance(text, target_token)
    # topk raises if k exceeds the tensor size, so clamp it.
    k = min(top_k, importance.numel())
    top_indices = importance.topk(k).indices.tolist()
    return set(top_indices), importance

# Test L3 identification
TEST_INPUT = "This movie was absolutely wonderful, I loved every minute."
TARGET = "positive"

L3_indices, importance = identify_L3(TEST_INPUT, TARGET)
print(f"L3 features (top 50): {len(L3_indices)} dimensions")
# Rank by importance. Sorting the index set would only show the ten
# lowest-numbered dimensions, not the ten most important ones.
top10_indices = importance.topk(min(10, importance.numel())).indices.tolist()
print(f"Top 10 indices: {top10_indices}")

## 5. Test Ablation Effects

In [None]:
def get_output_prob(text: str, target_token: str) -> float:
    """Return the model's next-token probability for the target token.

    Only the first token id of ``target_token`` is scored; the EOS id is
    used when it encodes to no tokens.
    """
    encoded_target = tokenizer.encode(target_token, add_special_tokens=False)
    if encoded_target:
        target_id = encoded_target[0]
    else:
        target_id = tokenizer.eos_token_id

    model_inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        # Logits at the final position give the next-token distribution.
        last_logits = model(**model_inputs).logits[0, -1, :]
        next_token_probs = torch.softmax(last_logits, dim=-1)
    return next_token_probs[target_id].item()

def ablate_features(text: str, target_token: str, feature_indices: Set[int]) -> float:
    """Get the target-token probability after zeroing embedding dimensions.

    Args:
        text: Prompt fed to the model.
        target_token: Text whose first token id is scored; the EOS id is
            used if it encodes to no tokens.
        feature_indices: Input-embedding dimensions to set to zero at every
            position. An empty set leaves the embeddings unchanged.

    Returns:
        Probability of the target token at the last position.
    """
    inputs = tokenizer(text, return_tensors="pt").to(device)
    target_ids = tokenizer.encode(target_token, add_special_tokens=False)
    target_id = target_ids[0] if target_ids else tokenizer.eos_token_id

    # Everything here is inference only, so keep the embedding lookup and the
    # in-place edit under no_grad to avoid building an autograd graph.
    with torch.no_grad():
        embeddings = model.get_input_embeddings()(inputs['input_ids'])

        # Ablate: zero out all requested dimensions in one indexed write.
        dims = torch.tensor(
            sorted(feature_indices), dtype=torch.long, device=embeddings.device
        )
        embeddings[:, :, dims] = 0

        outputs = model(inputs_embeds=embeddings)
        probs = torch.softmax(outputs.logits[0, -1, :], dim=-1)
    return probs[target_id].item()

# Baseline: target-token probability with no ablation; the ablation runs
# below are reported relative to this value.
baseline_prob = get_output_prob(TEST_INPUT, TARGET)
print(f"Baseline probability for '{TARGET}': {baseline_prob:.6f}")

In [None]:
# Compare L3 ablation vs L2-only ablation

# L3: Top 50 most important features
L3_indices, importance = identify_L3(TEST_INPUT, TARGET, top_k=50)

# L2-only: Random features NOT in L3 (representing probed but non-load-bearing)
all_dims = set(range(model.config.n_embd))
L2_only_indices = all_dims - L3_indices
# Draw a reproducible random control sample of the same size as L3.
# Slicing list(set) is not a random draw: it just takes whichever elements
# the set happens to iterate first.
control_rng = np.random.default_rng(0)
control_size = min(len(L3_indices), len(L2_only_indices))
L2_only_sample = {
    int(dim)
    for dim in control_rng.choice(
        sorted(L2_only_indices), size=control_size, replace=False
    )
}

# Ablate L3
prob_after_L3_ablation = ablate_features(TEST_INPUT, TARGET, L3_indices)
# Degradation = relative drop in target probability, in percent.
L3_degradation = (baseline_prob - prob_after_L3_ablation) / baseline_prob * 100

# Ablate L2-only
prob_after_L2_ablation = ablate_features(TEST_INPUT, TARGET, L2_only_sample)
L2_only_degradation = (baseline_prob - prob_after_L2_ablation) / baseline_prob * 100

print(f"\n{'='*50}")
print("ABLATION RESULTS")
print(f"{'='*50}")
print(f"\nBaseline prob: {baseline_prob:.6f}")
print(f"\nL3 Ablation (top 50 important features):")
print(f"  Prob after: {prob_after_L3_ablation:.6f}")
print(f"  Degradation: {L3_degradation:.1f}%")
print(f"\nL2-only Ablation (50 non-important features):")
print(f"  Prob after: {prob_after_L2_ablation:.6f}")
print(f"  Degradation: {L2_only_degradation:.1f}%")
# Floor the denominator at 0.1 so a near-zero or negative control
# degradation does not blow up or flip the sign of the ratio.
print(f"\nRatio: {L3_degradation / max(L2_only_degradation, 0.1):.1f}x")

## 6. Stability Across Paraphrases

In [None]:
# Positive-sentiment rewordings of the test input.
PARAPHRASES = [
    "This movie was absolutely wonderful, I loved every minute.",
    "This film was totally amazing, I enjoyed every second.",
    "What a fantastic movie, loved it from start to finish.",
    "An absolutely brilliant film, thoroughly enjoyable.",
]

print("L2 Stability Across Paraphrases:")
print("-" * 50)

# Get the L2 prediction for each paraphrase and report it as we go.
L2_predictions = []
for text in PARAPHRASES:
    pred = extract_L2(text)
    L2_predictions.append(pred)
    print(f"'{text[:40]}...'")
    print(f"  -> {pred['prediction']} ({pred['confidence']:.3f})")

# Spread of the probe's confidence across paraphrases, mapped to a score in
# (0, 1] where 1 means zero variance.
confidences = [entry['confidence'] for entry in L2_predictions]
variance = np.var(confidences)
stability = 1.0 / (1.0 + 10 * variance)

print(f"\nL2 Confidence variance: {variance:.4f}")
print(f"L2 Stability score: {stability:.3f}")

In [None]:
# Compare L3 feature stability: one top-50 index set per paraphrase.
L3_sets = [identify_L3(text, TARGET, top_k=50)[0] for text in PARAPHRASES]

# Compute Jaccard similarities for every unordered pair of paraphrases.
print("L3 Feature Stability (Jaccard Similarity):")
print("-" * 50)

jaccards = []
for i, first_set in enumerate(L3_sets):
    for j in range(i + 1, len(L3_sets)):
        second_set = L3_sets[j]
        union_size = len(first_set | second_set)
        shared = len(first_set & second_set)
        # Two empty sets have no defined overlap; report 0 for that case.
        jaccard = shared / union_size if union_size > 0 else 0
        jaccards.append(jaccard)
        print(f"  Paraphrase {i+1} vs {j+1}: {jaccard:.3f}")

print(f"\nMean L3 Jaccard similarity: {np.mean(jaccards):.3f}")

## 7. Visualize Three Levels

In [None]:
# One panel per level, side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
ax1, ax2, ax3 = axes[0], axes[1], axes[2]

# L1: histogram of the pooled raw activation values for the test input.
L1_example = extract_L1(TEST_INPUT).numpy()
ax1.hist(L1_example, bins=50, alpha=0.7, color='blue')
ax1.set(
    title='L1: Measurements\n(Raw Activations)',
    xlabel='Activation Value',
    ylabel='Frequency',
)

# L2: probe P(positive) for every sentence the probe was trained on
# (in-sample predictions), coloured by the true label.
L2_preds = [
    extract_L2(text)['prob_positive'] for text in POSITIVE_TEXTS + NEGATIVE_TEXTS
]
labels = ['Positive'] * len(POSITIVE_TEXTS) + ['Negative'] * len(NEGATIVE_TEXTS)
colors = ['green'] * len(POSITIVE_TEXTS) + ['red'] * len(NEGATIVE_TEXTS)
ax2.scatter(range(len(L2_preds)), L2_preds, c=colors, alpha=0.7, s=100)
ax2.axhline(y=0.5, color='black', linestyle='--', label='Decision boundary')
ax2.set(
    title='L2: Inferences\n(Probe Predictions)',
    xlabel='Sample Index',
    ylabel='P(Positive)',
)
ax2.legend()

# L3: the 100 largest importance values in descending order, with the
# top-50 cut-off marked.
_, importance = identify_L3(TEST_INPUT, TARGET)
sorted_imp = torch.sort(importance, descending=True).values[:100].numpy()
ax3.bar(range(len(sorted_imp)), sorted_imp, alpha=0.7, color='purple')
ax3.axvline(x=50, color='red', linestyle='--', label='L3 threshold (top 50)')
ax3.set(
    title='L3: Action Quanta\n(Feature Importance)',
    xlabel='Feature Rank',
    ylabel='Importance',
)
ax3.legend()

plt.tight_layout()
plt.show()

## 8. Summary

In [None]:
# Final report: restates the numbers computed in the earlier cells and
# prints a verdict on the hypothesis.
print("="*60)
print("EXPERIMENT 038 SUMMARY")
print("="*60)

# Use the measured sizes instead of a hard-coded 50 so the report stays
# correct if the L3 set size changes.
l3_size = len(L3_indices)
l1_dim = L1_example.shape[0]

print("\n1. THREE LEVELS IDENTIFIED:")
print(f"   L1 (Measurements): {l1_dim} dimensional raw activations")
print(f"   L2 (Inferences): Semantic probes with {cv_scores.mean():.1%} accuracy")
print(f"   L3 (Action Quanta): Top {l3_size} load-bearing features")

print("\n2. SUBSET RELATIONSHIP:")
print(f"   L3 size: {l3_size} features")
print(f"   L1 size: {l1_dim} features")
print(f"   Ratio: {l3_size/l1_dim:.1%} (L3 << L1)")

print("\n3. ABLATION ASYMMETRY:")
print(f"   L3 ablation degradation: {L3_degradation:.1f}%")
print(f"   L2-only ablation degradation: {L2_only_degradation:.1f}%")
# Floor the denominator at 0.1 to avoid dividing by zero or a negative value.
ratio = L3_degradation / max(L2_only_degradation, 0.1)
print(f"   Ratio: {ratio:.1f}x")
# Report the experiment's stated key prediction explicitly, since the
# verdict below uses a looser 2x comparison.
key_prediction_met = L3_degradation > 50 and L2_only_degradation < 20
print(f"   Key prediction (L3 >50%, L2-only <20%): {'met' if key_prediction_met else 'not met'}")

print("\n4. STABILITY:")
print(f"   L2 confidence variance: {variance:.4f}")
print(f"   L3 mean Jaccard: {np.mean(jaccards):.3f}")

print("\n5. VERDICT:")
# Require a real drop from L3 ablation. Without the sign check, two negative
# "degradations" (ablation raised the probability) could satisfy the 2x
# comparison, e.g. -5 > 2 * -10.
if L3_degradation > 0 and L3_degradation > 2 * max(L2_only_degradation, 0.0):
    print("   HYPOTHESIS SUPPORTED")
    print("   - L3 (AQ) is a distinct, load-bearing subset")
    print("   - Ablating L3 is significantly more damaging")
    print("   - Three levels ARE empirically distinguishable")
else:
    print("   HYPOTHESIS NOT SUPPORTED")
    print("   - No clear distinction between levels")