# Gemma2-27B Basic Weight Susceptibility Analysis

This notebook analyzes how principal-component (PC) vectors from the persona-subspace PCA activate neurons in the layers around the PCA layer differently in the Gemma2-27B base and instruct models.

**Key Questions:**
1. Do PC vectors (especially PC1) show different activation patterns in base vs instruct models?
2. Which layers show the strongest weight differences?
3. How do PC vectors compare to random baselines?

**Approach:**
- Load both base and instruct models
- Compute weight differences
- Load PC vectors from PCA data
- Apply layernorm and project through weight matrices
- Compute cosine distances between base and instruct activations

In [None]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoConfig

# Import from chatspace.analysis
from chatspace.analysis import (
    load_pca_data,
    load_individual_role_vectors,
    load_individual_trait_vectors,
    extract_pc_components,
    normalize_vector,
    gemma2_rmsnorm,
    compute_cosine_distances_batch,
    compute_weight_statistics
)

%matplotlib inline
# Use seaborn's 'whitegrid' style for the figures produced later in this notebook.
sns.set_style('whitegrid')

## 1. Load Gemma2-27B Models

In [None]:
# Hugging Face Hub identifiers of the two checkpoints being compared.
base_model_id = "google/gemma-2-27b"
instruct_model_id = "google/gemma-2-27b-it"

print("Loading models...")
for checkpoint_label, checkpoint_id in (("Base", base_model_id), ("Instruct", instruct_model_id)):
    print(f"  {checkpoint_label}: {checkpoint_id}")
print("\nThis will take several minutes...\n")

# Only the base checkpoint's config is fetched; later cells read
# `config.hidden_size` and `config.num_hidden_layers` from it.
config = AutoConfig.from_pretrained(base_model_id)

print("Model config:")
config_summary = (
    ("Hidden size", config.hidden_size),
    ("Num layers", config.num_hidden_layers),
    ("Intermediate size", config.intermediate_size),
    ("Attention heads", config.num_attention_heads),
    ("KV heads", config.num_key_value_heads),
)
for field_label, field_value in config_summary:
    print(f"  {field_label}: {field_value}")

In [None]:
# Load the base checkpoint on the CPU in bfloat16.
print("Loading base model...")
base_load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    device_map="cpu",
    low_cpu_mem_usage=True,
)
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, **base_load_kwargs)

# Keep a plain name -> CPU tensor mapping so later cells can look weights up by name.
base_state_dict = {}
for param_name, tensor in base_model.state_dict().items():
    base_state_dict[param_name] = tensor.cpu()
print(f"✓ Base model loaded: {len(base_state_dict)} parameters")

In [None]:
# Load the instruction-tuned checkpoint on the CPU in bfloat16.
print("Loading instruct model...")
instruct_load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    device_map="cpu",
    low_cpu_mem_usage=True,
)
instruct_model = AutoModelForCausalLM.from_pretrained(instruct_model_id, **instruct_load_kwargs)

# Keep a plain name -> CPU tensor mapping so later cells can look weights up by name.
instruct_state_dict = {}
for param_name, tensor in instruct_model.state_dict().items():
    instruct_state_dict[param_name] = tensor.cpu()
print(f"✓ Instruct model loaded: {len(instruct_state_dict)} parameters")

## 2. Compute Weight Differences

In [None]:
# Compute per-parameter weight differences (instruct - base).
#
# A parameter is diffed only when it exists in both state dicts with the same
# shape. Anything else is recorded in `skipped_params` and reported, instead of
# being dropped silently (missing name) or raising / broadcasting on the
# subtraction (shape mismatch).
print("Computing weight differences...")
weight_diffs = {}
skipped_params = []
for name, base_tensor in base_state_dict.items():
    instruct_tensor = instruct_state_dict.get(name)
    if instruct_tensor is None or instruct_tensor.shape != base_tensor.shape:
        skipped_params.append(name)
        continue
    # The subtraction runs in the tensors' own dtype (bfloat16 as loaded above).
    weight_diffs[name] = instruct_tensor - base_tensor

print(f"\nComputed {len(weight_diffs)} weight differences")
if skipped_params:
    print(
        f"  Skipped {len(skipped_params)} parameters missing from, or shaped "
        f"differently in, the instruct model (first few: {skipped_params[:5]})"
    )

print(f"\nSample statistics:")
# Show the first five entries only; the norm is taken in float32 for display.
for name, diff in list(weight_diffs.items())[:5]:
    print(f"  {name}: shape={diff.shape}, norm={diff.float().norm().item():.4f}")

## 3. Load PC Vectors and Semantic Data

In [None]:
# Root of the persona-subspace artifacts and the model sub-directory inside it.
persona_data_root = Path("/workspace/persona-data")
gemma_model_name = "gemma-2-27b"

# Layout: <root>/<model>/{roles_240,traits_240}/{pca,vectors}
model_data_dir = persona_data_root / gemma_model_name
roles_dir = model_data_dir / "roles_240"
traits_dir = model_data_dir / "traits_240"
roles_pca_dir, roles_vectors_dir = roles_dir / "pca", roles_dir / "vectors"
traits_pca_dir, traits_vectors_dir = traits_dir / "pca", traits_dir / "vectors"

print("Checking data paths...")
print(f"  Roles PCA: {roles_pca_dir.exists()}")
print(f"  Traits PCA: {traits_pca_dir.exists()}")

# PCA results come from the roles directory; the layer they were fitted on is
# stored under the 'layer' key and drives the rest of the notebook.
pca_data, all_pca_files = load_pca_data(roles_pca_dir)
pca_layer = pca_data['layer']
print(f"\n✓ Loaded PCA data from layer {pca_layer}")

# Request more components than are plotted so the configuration cell can pick freely.
n_pcs_total = 10
pcs_all, variance_all = extract_pc_components(pca_data, n_components=n_pcs_total)
print(f"  Extracted {n_pcs_total} PCs")
print(f"  Variance explained by first 5: {variance_all[:5]}")

# Per-role and per-trait vectors at the PCA layer. According to the original
# notes the loaders default to discriminative variants (roles: pos_3 - default_1
# difference vectors; traits: precomputed pos_neg_50 contrast vectors).
# NOTE(review): those defaults live in chatspace.analysis — confirm there.
role_vectors = load_individual_role_vectors(roles_vectors_dir, pca_layer)
trait_vectors = load_individual_trait_vectors(traits_vectors_dir, pca_layer)

print("\n✓ Loaded semantic vectors:")
print(f"  {len(role_vectors)} role difference vectors")
print(f"  {len(trait_vectors)} trait contrast vectors")

In [None]:
# ============================================================================
# CONFIGURATION: tune the analysis from this cell
# ============================================================================

# Principal components to include, written "PC<k>" with k counted from 1.
plot_pcs = ["PC1", "PC2", "PC3"]
# plot_pcs = ["PC1"]  # Uncomment to focus on PC1 only

# Number of layers on each side of the PCA layer to analyze.
n_layers_context = 5

# Number of random baseline vectors.
n_random_baseline = 20

# Number of role / trait vectors to include.
n_sample_roles = 5
n_sample_traits = 5

config_report = [
    "📋 Analysis Configuration:",
    f"  Analyzing {len(plot_pcs)} PCs: {plot_pcs}",
    f"  Context: ±{n_layers_context} layers around layer {pca_layer}",
    f"  Random baseline: {n_random_baseline} vectors",
    f"  Sampling {n_sample_roles} roles + {n_sample_traits} traits",
]
print("\n".join(config_report))

## 4. Extract Weights and Compute Cosine Distances

In [None]:
# Layers on either side of the PCA layer, clipped to the model's valid range.
# The PCA layer itself is not part of either range.
first_upstream = max(0, pca_layer - n_layers_context)
last_downstream_excl = min(pca_layer + n_layers_context + 1, config.num_hidden_layers)
upstream_layers = range(first_upstream, pca_layer)
downstream_layers = range(pca_layer + 1, last_downstream_excl)
analysis_layers = [*upstream_layers, *downstream_layers]

print(f"Analyzing layers (±{n_layers_context} from layer {pca_layer}):")
print(f"  Upstream: {list(upstream_layers)}")
print(f"  Downstream: {list(downstream_layers)}")

# Keep only the projection matrices of interest that belong to an analyzed layer.
# The trailing '.' in each prefix stops e.g. layer 1 from matching layer 10.
target_weight_types = ['up_proj.weight', 'gate_proj.weight', 'q_proj.weight', 'k_proj.weight', 'v_proj.weight']
layer_prefixes = tuple(f"model.layers.{layer_num}." for layer_num in analysis_layers)
weight_suffixes = tuple(target_weight_types)
relevant_weights = {
    param_name: param_diff
    for param_name, param_diff in weight_diffs.items()
    if param_name.startswith(layer_prefixes) and param_name.endswith(weight_suffixes)
}

print(f"\n✓ Found {len(relevant_weights)} relevant weight matrices")

# Collect the base/instruct norm weights for every analyzed layer, keyed
# "<layer>.<norm name>". Presence is decided by the base state dict.
layernorm_weights = {}
for layer_num in analysis_layers:
    for norm_name in ("input_layernorm", "pre_feedforward_layernorm"):
        norm_param = f"model.layers.{layer_num}.{norm_name}.weight"
        if norm_param in base_state_dict:
            layernorm_weights[f"{layer_num}.{norm_name}"] = {
                'base': base_state_dict[norm_param],
                'instruct': instruct_state_dict[norm_param],
            }

print(f"✓ Extracted {len(layernorm_weights)} layernorm weights")

In [None]:
# Build the batch of test vectors: configured PCs, a few role and trait
# vectors, and a seeded random baseline. `vector_names[i]` labels row i of
# `vectors_batch`.
all_vectors = []
vector_names = []

# Add configured PCs. Labels are 1-based ("PC1" -> index 0). The range check
# matters because a label such as "PC0" would otherwise become index -1 and
# silently select the *last* component through negative indexing.
# NOTE(review): assumes len(pcs_all) is the number of extracted components — confirm.
n_pcs_available = len(pcs_all)
for pc_name in plot_pcs:
    pc_idx = int(pc_name.replace("PC", "")) - 1
    if not 0 <= pc_idx < n_pcs_available:
        raise ValueError(
            f"{pc_name!r} is out of range: expected PC1..PC{n_pcs_available}"
        )
    all_vectors.append(pcs_all[pc_idx])
    vector_names.append(pc_name)

# Role vectors: the first `n_sample_roles` entries in mapping order (not a random draw).
sample_roles = list(role_vectors.items())[:n_sample_roles]
for name, vec in sample_roles:
    all_vectors.append(vec)
    vector_names.append(f"role:{name}")

# Trait vectors: the first `n_sample_traits` entries in mapping order.
sample_traits = list(trait_vectors.items())[:n_sample_traits]
for name, vec in sample_traits:
    all_vectors.append(vec)
    vector_names.append(f"trait:{name}")

# Random baseline: seeded Gaussian directions, normalized and cast to bfloat16.
torch.manual_seed(42)
for i in range(n_random_baseline):
    rand_vec = torch.randn(config.hidden_size, dtype=torch.float32)
    rand_vec = normalize_vector(rand_vec).to(torch.bfloat16)
    all_vectors.append(rand_vec)
    vector_names.append(f"Random{i+1}")

# Stack into a single batch with one row per test vector.
vectors_batch = torch.stack(all_vectors)
print(f"✓ Prepared {len(vector_names)} test vectors:")
print(f"  {len(plot_pcs)} PCs: {plot_pcs}")
print(f"  {len(sample_roles)} roles")
print(f"  {len(sample_traits)} traits")
print(f"  {n_random_baseline} random baseline")

In [None]:
# For every selected weight matrix, push the test vectors through each model's
# own norm + projection and record the cosine distance between the base and
# instruct outputs, one row per (vector, weight matrix).
results = []
skipped_weights = []  # matrices with no matching layernorm entry

print(f"Computing cosine distances for {len(relevant_weights)} weight matrices...")

# Only the parameter names are needed here; the base/instruct tensors are
# looked up directly, so the stored diffs are not touched.
for weight_name in tqdm(relevant_weights, desc="Processing weights"):
    # Names look like "model.layers.<N>.<block>.<proj>.weight".
    parts = weight_name.split('.')
    layer_num = int(parts[2])
    weight_type = parts[-2]  # e.g., 'up_proj', 'gate_proj', etc.

    base_weight = base_state_dict[weight_name]
    instruct_weight = instruct_state_dict[weight_name]

    # MLP projections are paired with the pre-feedforward norm, attention
    # projections with the input norm.
    if weight_type in ('up_proj', 'gate_proj'):
        ln_key = f"{layer_num}.pre_feedforward_layernorm"
    else:  # attention weights
        ln_key = f"{layer_num}.input_layernorm"

    if ln_key not in layernorm_weights:
        # Record the omission so incomplete results are visible afterwards.
        skipped_weights.append(weight_name)
        continue

    ln_base = layernorm_weights[ln_key]['base']
    ln_instruct = layernorm_weights[ln_key]['instruct']

    # Each model gets its own norm weights, so the distance reflects changes
    # in the norm as well as in the projection matrix.
    normed_base = gemma2_rmsnorm(vectors_batch, ln_base)
    normed_instruct = gemma2_rmsnorm(vectors_batch, ln_instruct)

    proj_base = normed_base @ base_weight.T
    proj_instruct = normed_instruct @ instruct_weight.T

    # NOTE(review): presumably returns one distance per row of the batch — confirm in chatspace.analysis.
    cosine_dists = compute_cosine_distances_batch(proj_base, proj_instruct)

    for i, vec_name in enumerate(vector_names):
        results.append({
            'vector': vec_name,
            'layer': layer_num,
            'weight_type': weight_type,
            'cosine_distance': float(cosine_dists[i])
        })

# Pin the column schema so the frame has the expected columns even when no
# weight matrix was processed; later cells group by these columns.
results_df = pd.DataFrame(
    results,
    columns=['vector', 'layer', 'weight_type', 'cosine_distance'],
)
print(f"\n✓ Computed {len(results_df)} cosine distance measurements")
print(f"  Shape: {results_df.shape}")
if skipped_weights:
    print(
        f"  Skipped {len(skipped_weights)} weight matrices without a matching "
        f"layernorm (first few: {skipped_weights[:5]})"
    )

## 5. Visualize Results

In [None]:
# Text summary of the cosine-distance measurements.
banner = "=" * 80
print(banner)
print("SUMMARY STATISTICS")
print(banner)

# Per-vector aggregate statistics, largest mean distance first.
summary = (
    results_df.groupby('vector')['cosine_distance']
    .agg(['mean', 'std', 'min', 'max'])
    .sort_values('mean', ascending=False)
)

print("\nMean cosine distance by vector (top 10):")
print(summary.head(10))

# Mean distance of each configured PC, and of all "Random*" baseline vectors pooled.
distances = results_df['cosine_distance']
vector_labels = results_df['vector']
pc_means = {pc: distances[vector_labels == pc].mean() for pc in plot_pcs}
random_mean = distances[vector_labels.str.startswith('Random')].mean()

print("\nPC vs Random baseline comparison:")
print(f"  Random baseline mean: {random_mean:.6f}")
for pc in plot_pcs:
    pc_mean = pc_means[pc]
    # Ratio above 1 means the PC moved more than a random direction did on average.
    ratio = pc_mean / random_mean
    print(f"  {pc} mean: {pc_mean:.6f} ({ratio:.2f}x random)")

In [None]:
# Two-panel figure: distribution per weight type (left) and mean per layer (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_by_type, ax_by_layer = axes

# Restrict to the configured PCs; the heatmap cell below reuses `pc_data`.
pc_data = results_df[results_df['vector'].isin(plot_pcs)]

# Left panel: box plot per weight type.
sns.boxplot(data=pc_data, x='weight_type', y='cosine_distance', ax=ax_by_type)
ax_by_type.set_title(f'Cosine Distance by Weight Type ({", ".join(plot_pcs)})')
ax_by_type.set_ylabel('Cosine Distance')
ax_by_type.tick_params(axis='x', rotation=45)
ax_by_type.grid(True, alpha=0.3, axis='y')

# Right panel: one line per PC, averaged over weight types within each layer.
pc_data_mean = pc_data.groupby(['layer', 'vector'], as_index=False)['cosine_distance'].mean()
for pc in plot_pcs:
    pc_curve = pc_data_mean[pc_data_mean['vector'] == pc]
    ax_by_layer.plot(
        pc_curve['layer'],
        pc_curve['cosine_distance'],
        marker='o',
        label=pc,
        linewidth=2,
    )

# Mark the layer the PCA was fitted on.
ax_by_layer.axvline(pca_layer, color='red', linestyle=':', alpha=0.5, label=f'PCA layer ({pca_layer})')
ax_by_layer.set_title('Mean Cosine Distance by Layer')
ax_by_layer.set_xlabel('Layer')
ax_by_layer.set_ylabel('Cosine Distance')
ax_by_layer.legend()
ax_by_layer.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Heatmap of the mean cosine distance of the configured PCs:
# rows are weight types, columns are layers.
pivot_data = pc_data.pivot_table(
    values='cosine_distance',
    index='weight_type',
    columns='layer',
    aggfunc='mean',
)

fig, ax = plt.subplots(figsize=(12, 5))
heatmap_options = dict(
    annot=True,
    fmt='.4f',
    cmap='YlOrRd',
    cbar_kws={'label': 'Cosine Distance'},
)
sns.heatmap(pivot_data, ax=ax, **heatmap_options)
ax.set_title(f'Mean Cosine Distance: {", ".join(plot_pcs)} across Layers and Weight Types')
ax.set_xlabel('Layer')
ax.set_ylabel('Weight Type')
plt.tight_layout()
plt.show()

## 6. Key Findings

This analysis reveals which layers and weight types differ most between the base and instruct models when processing PC vectors from the persona subspace.

**Interpretation:**
- **Higher cosine distances** = Instruction tuning significantly altered how that weight (together with the layernorm applied before it) responds to the semantic direction
- **Layer patterns** = Shows where in the network instruction tuning has strongest effect
- **Weight type differences** = Reveals whether MLP (gate/up) or attention (q/k/v) is more affected
- **PC vs Random** = Measures whether PCs are specifically targeted vs general weight changes