# Gemma2-27B Basic Weight Susceptibility Analysis

This notebook analyzes how PC vectors from persona subspace data activate downstream neurons differently between Gemma2-27B base and instruct models.

**Key Questions:**
1. Do PC vectors (especially PC1) show different activation patterns in base vs instruct models?
2. Which layers show the strongest weight differences?
3. How do PC vectors compare to random baselines?

**Approach:**
- Load both base and instruct models
- Compute weight differences
- Load PC vectors from PCA data
- Apply layernorm and project through weight matrices
- Compute cosine distances between base and instruct activations

In [None]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoConfig

# Import from chatspace.analysis
from chatspace.analysis import (
    load_pca_data,
    load_individual_role_vectors,
    load_individual_trait_vectors,
    extract_pc_components,
    normalize_vector,
    gemma2_rmsnorm,
    compute_cosine_distances_batch,
    compute_weight_statistics
)

%matplotlib inline
sns.set_style('whitegrid')

## 1. Load Gemma2-27B Models

In [None]:
# Identifiers of the two checkpoints being compared (base vs instruction-tuned).
base_model_id = "google/gemma-2-27b"
instruct_model_id = "google/gemma-2-27b-it"

for banner_line in (
    "Loading models...",
    f"  Base: {base_model_id}",
    f"  Instruct: {instruct_model_id}",
    "\nThis will take several minutes...\n",
):
    print(banner_line)

# Fetch only the config for the base checkpoint and report the fields
# that describe the architecture.
config = AutoConfig.from_pretrained(base_model_id)
print("Model config:")
for label, attr_name in (
    ("Hidden size", "hidden_size"),
    ("Num layers", "num_hidden_layers"),
    ("Intermediate size", "intermediate_size"),
    ("Attention heads", "num_attention_heads"),
    ("KV heads", "num_key_value_heads"),
):
    print(f"  {label}: {getattr(config, attr_name)}")

In [None]:
# Load the base checkpoint in bfloat16 and snapshot its weights on the CPU.
print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)
# Move every state-dict tensor to the CPU so later cells can compare it
# against the instruct weights regardless of where `device_map` placed it.
base_state_dict = dict(
    (param_name, tensor.cpu())
    for param_name, tensor in base_model.state_dict().items()
)
# The reported count is the number of state-dict entries (tensors).
print(f"✓ Base model loaded: {len(base_state_dict)} parameters")

In [None]:
# Load the instruction-tuned checkpoint in bfloat16 and snapshot its weights on the CPU.
print("Loading instruct model...")
instruct_model = AutoModelForCausalLM.from_pretrained(
    instruct_model_id,
    device_map="auto",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)
# Move every state-dict tensor to the CPU so it can be subtracted from the
# base weights regardless of where `device_map` placed it.
instruct_state_dict = dict(
    (param_name, tensor.cpu())
    for param_name, tensor in instruct_model.state_dict().items()
)
# The reported count is the number of state-dict entries (tensors).
print(f"✓ Instruct model loaded: {len(instruct_state_dict)} parameters")

## 2. Compute Weight Differences

In [None]:
# Element-wise difference (instruct - base) for every tensor present in both
# state dicts; names that exist only in the base model are skipped.
print("Computing weight differences...")
weight_diffs = {
    name: instruct_state_dict[name] - base_state_dict[name]
    for name in base_state_dict
    if name in instruct_state_dict
}

print(f"\nComputed {len(weight_diffs)} weight differences")
print("\nSample statistics:")
# Show the first five entries; the norm is taken after upcasting to float32.
for name, diff in list(weight_diffs.items())[:5]:
    diff_norm = diff.float().norm().item()
    print(f"  {name}: shape={diff.shape}, norm={diff_norm:.4f}")

## 3. Load PC Vectors from Persona Subspace Data

In [None]:
# Root of the persona subspace data and the model-specific subdirectory name.
persona_data_root = Path("/workspace/persona-data")
gemma_model_name = "gemma-2-27b"

# PCA output directories for the roles and traits datasets.
roles_pca_dir = persona_data_root.joinpath(gemma_model_name, "roles_240", "pca")
traits_pca_dir = persona_data_root.joinpath(gemma_model_name, "traits_240", "pca")

print("Checking for PCA data...")
for label, pca_dir in (("Roles", roles_pca_dir), ("Traits", traits_pca_dir)):
    print(f"  {label} PCA dir: {pca_dir} (exists: {pca_dir.exists()})")

In [None]:
# Load PCA data for roles and extract the leading principal components.
#
# Every later cell reads `pca_data` / `pc1`, so a missing directory is fatal.
# Fail here with a clear message instead of printing a warning and letting the
# next cell die with a NameError (or silently reuse a stale `pca_data` left in
# the kernel from an earlier run).
if not roles_pca_dir.exists():
    raise FileNotFoundError(f"PCA directory not found: {roles_pca_dir}")

pca_data, all_pca_files = load_pca_data(roles_pca_dir)
print(f"\n✓ Loaded PCA data from: {all_pca_files[0].name}")
print(f"  Layer: {pca_data.get('layer', 'unknown')}")
print(f"  Number of components: {pca_data.get('n_components', 'unknown')}")
print(f"  Total PCA files available: {len(all_pca_files)}")

# Extract PC components. PC2 / PC3 fall back to None when fewer than three
# components come back, and downstream cells check for None before using them.
pcs, variance_explained = extract_pc_components(pca_data, n_components=3)
pc1 = pcs[0]
pc2 = pcs[1] if len(pcs) > 1 else None
pc3 = pcs[2] if len(pcs) > 2 else None

print("\nPC components extracted:")
for pc_label, pc_vec in (("PC1", pc1), ("PC2", pc2), ("PC3", pc3)):
    if pc_vec is not None:
        print(f"  {pc_label} shape: {pc_vec.shape}, norm: {pc_vec.float().norm().item():.4f}")

print("\nVariance explained by first 5 PCs:")
for i, var in enumerate(variance_explained[:5]):
    print(f"  PC{i+1}: {var:.4f} ({var*100:.2f}%)")

In [None]:
# Load the per-role and per-trait vectors at the same layer as the PCA data.
pca_layer = pca_data['layer']
print(f"\nLoading individual role and trait vectors for layer {pca_layer}...")

# Vector directories sit next to the PCA directories for each dataset.
roles_vectors_dir = persona_data_root.joinpath(gemma_model_name, "roles_240", "vectors")
traits_vectors_dir = persona_data_root.joinpath(gemma_model_name, "traits_240", "vectors")

# The loaders are called with their defaults. Per the messages printed below,
# those are expected to be:
# - Roles: pos_3 - default_1 (difference vectors)
# - Traits: pos_neg_50 (precomputed contrast vectors)
role_vectors = load_individual_role_vectors(roles_vectors_dir, pca_layer)
trait_vectors = load_individual_trait_vectors(traits_vectors_dir, pca_layer)

print(f"  ✓ Loaded {len(role_vectors)} role difference vectors (pos_3 - default_1)")
print(f"  ✓ Loaded {len(trait_vectors)} trait contrast vectors (pos_neg_50)")

# List up to five names from each non-empty collection.
for heading, vectors in (
    ("\nExample roles:", role_vectors),
    ("\nExample traits:", trait_vectors),
):
    if not vectors:
        continue
    print(heading)
    for name in list(vectors.keys())[:5]:
        print(f"  - {name}")

## 4. Extract Relevant Weights and Layernorms

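We'll look at weights from nearby layers (up to 5 layers before and 5 layers after the PCA layer, clipped to the model's layer range; the PCA layer itself is excluded) and extract: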
- MLP weights: `gate_proj`, `up_proj`
- Attention weights: `q_proj`, `k_proj`, `v_proj`
- Layernorms: `input_layernorm` (for attention), `pre_feedforward_layernorm` (for MLP)

In [None]:
# Layer the PCA was computed at; the neighbouring layers are inspected below.
pca_layer = pca_data['layer']
print(f"PCA data is from layer {pca_layer}")

# Projection matrices of interest (MLP input projections and attention Q/K/V).
target_weight_types = ['up_proj.weight', 'gate_proj.weight', 'q_proj.weight', 'k_proj.weight', 'v_proj.weight']

# Window of layers on either side of the PCA layer, clipped to the valid
# layer range. The PCA layer itself is in neither range.
n_layers_context = 5
upstream_layers = range(max(0, pca_layer - n_layers_context), pca_layer)
downstream_layers = range(pca_layer + 1, min(pca_layer + n_layers_context + 1, config.num_hidden_layers))
all_layers = [*upstream_layers, *downstream_layers]

print(f"\nLooking for weights in nearby layers (±{n_layers_context}):")
print(f"  Upstream: {list(upstream_layers)}")
print(f"  Downstream: {list(downstream_layers)}")

# Keep a weight diff when its name starts with one of the selected layer
# prefixes and ends with one of the target weight types. The trailing dot in
# the prefix stops layer 1 from matching layer 10, 11, and so on.
layer_prefixes = tuple(f"model.layers.{layer_num}." for layer_num in all_layers)
weight_suffixes = tuple(target_weight_types)
relevant_weights = {
    name: diff
    for name, diff in weight_diffs.items()
    if name.startswith(layer_prefixes) and name.endswith(weight_suffixes)
}

print(f"\n✓ Found {len(relevant_weights)} relevant weight matrices")

In [None]:
# Collect the base and instruct layernorm weights for every selected layer.
# Keys have the form "<layer>.<layernorm name>", and each value holds the
# 'base' and 'instruct' tensors side by side.
layernorm_weights = {}
for layer_num in all_layers:
    for ln_kind in ("input_layernorm", "pre_feedforward_layernorm"):
        state_key = f"model.layers.{layer_num}.{ln_kind}.weight"
        # Presence is checked against the base model only; the instruct
        # state dict is indexed directly.
        if state_key not in base_state_dict:
            continue
        layernorm_weights[f"{layer_num}.{ln_kind}"] = {
            'base': base_state_dict[state_key],
            'instruct': instruct_state_dict[state_key],
        }

print(f"✓ Extracted {len(layernorm_weights)} layernorm weights")

## 5. Compute Cosine Distances

For each weight matrix, we:
1. Apply appropriate layernorm to input vectors
2. Project through base and instruct weight matrices
3. Compute cosine distance between resulting activation patterns

In [None]:
# Collect the vectors to test: principal components, a sample of role/trait
# vectors, and random unit vectors as a baseline. `all_vectors` and
# `vector_names` are kept index-aligned.
all_vectors = [pc1]
vector_names = ['PC1']

# PC2 / PC3 are None when the PCA returned fewer than three components.
for pc_name, pc_vec in (('PC2', pc2), ('PC3', pc3)):
    if pc_vec is not None:
        all_vectors.append(pc_vec)
        vector_names.append(pc_name)

# Optionally add semantic vectors (first 5 of each kind, for efficiency).
if 'role_vectors' in globals() and len(role_vectors) > 0:
    sample_roles = list(role_vectors.items())[:5]
    for name, vec in sample_roles:
        all_vectors.append(vec)
        vector_names.append(name)

if 'trait_vectors' in globals() and len(trait_vectors) > 0:
    sample_traits = list(trait_vectors.items())[:5]
    for name, vec in sample_traits:
        all_vectors.append(vec)
        vector_names.append(name)

# No visible cell defines `n_random_baseline`, so a fresh Restart & Run All
# raised NameError here. Fall back to 10 only when it has not been set
# already, so a value configured elsewhere still wins.
if 'n_random_baseline' not in globals():
    n_random_baseline = 10

# Add random unit vectors. They are sampled and normalised in float32, then
# cast to bfloat16. The seed is fixed so the baseline is reproducible.
torch.manual_seed(42)
for i in range(n_random_baseline):
    rand_vec = torch.randn(config.hidden_size, dtype=torch.float32)
    rand_vec = rand_vec / rand_vec.norm()
    all_vectors.append(rand_vec.to(torch.bfloat16))
    vector_names.append(f'Random{i+1}')

# Stack into a single batch with one row per entry in `vector_names`.
# NOTE(review): assumes every vector has the same shape as the random ones
# (hidden_size,) — confirm for the loaded PC / role / trait vectors.
vectors_batch = torch.stack(all_vectors)

# NOTE(review): the summary and plotting cells read `results_df`, but no cell
# visible in this notebook builds it from `vectors_batch` — confirm the
# cosine-distance computation cell was not dropped.


## 6. Visualize Results

In [None]:
# Summary statistics of the cosine distances held in `results_df`.
banner = "=" * 80
print("\n" + banner)
print("SUMMARY STATISTICS")
print(banner)

# Per-vector aggregate of cosine distance, largest mean first.
summary = (
    results_df
    .groupby('vector')['cosine_distance']
    .agg(['mean', 'std', 'min', 'max'])
    .sort_values('mean', ascending=False)
)

print("\nMean cosine distance by vector:")
print(summary.head(10))

# Compare PC1 against the pooled random baseline (all vectors named 'Random*').
pc1_mean = results_df.loc[results_df['vector'] == 'PC1', 'cosine_distance'].mean()
random_mean = results_df.loc[
    results_df['vector'].str.startswith('Random'), 'cosine_distance'
].mean()
ratio = pc1_mean / random_mean

print("\nPC1 vs Random baseline:")
print(f"  PC1 mean cosine distance: {pc1_mean:.6f}")
print(f"  Random mean cosine distance: {random_mean:.6f}")
print(f"  Ratio (PC1 / Random): {ratio:.2f}x")

In [None]:
# Plot cosine distance for the PC vectors: distribution per weight type (left)
# and mean per layer (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Restrict to the principal-component rows of the results table.
pc_names = ['PC1', 'PC2', 'PC3']
pc_data = results_df[results_df['vector'].isin(pc_names)]

# By weight type
sns.boxplot(data=pc_data, x='weight_type', y='cosine_distance', ax=axes[0])
axes[0].set_title('Cosine Distance by Weight Type (PCs only)')
axes[0].set_ylabel('Cosine Distance')
axes[0].tick_params(axis='x', rotation=45)

# By layer
pc_data_mean = pc_data.groupby(['layer', 'vector'])['cosine_distance'].mean().reset_index()
for pc in pc_names:
    data = pc_data_mean[pc_data_mean['vector'] == pc]
    # PC2 / PC3 may be missing (earlier cells allow them to be None). Skip
    # them so the legend does not show entries with no plotted points.
    if data.empty:
        continue
    axes[1].plot(data['layer'], data['cosine_distance'], marker='o', label=pc)
axes[1].set_title('Mean Cosine Distance by Layer')
axes[1].set_xlabel('Layer')
axes[1].set_ylabel('Cosine Distance')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Key Findings

This analysis reveals which layers and weight types show the strongest differences between the base and instruct models when processing PC vectors from the persona subspace. Higher cosine distances indicate that instruction tuning has significantly altered how that weight matrix responds to the semantic direction represented by the PC.