# Comparison: Trained Steering Vectors vs Activation Vectors (CAA)

This notebook compares two methods for steering model behavior:
1. **Trained Steering Vectors**: Learned via logistic regression on activation differences
2. **Activation Vectors (CAA)**: Contrastive Activation Addition - simple mean differences
   - Roles: `pos_3 - default_1` at layer 22 (original), evaluated at layer 31
   - Traits: `pos_neg_50` at layer 22 (original), evaluated at layer 31

Both are evaluated as classifiers: hidden states are projected onto each vector, and only a scale and an offset are fitted on the resulting projection.

In [None]:
import json
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Notebook-wide plot styling: seaborn's "whitegrid" theme and a 12x8 default
# figure size for any figure created without an explicit figsize.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

## Load Results

In [None]:
# Metric names copied out of each vector's "metrics" dict, in output-column order.
_METRIC_KEYS = ("test_roc_auc", "test_accuracy", "test_f1", "train_roc_auc")

# Fixed column layout, so an empty or partially populated results directory
# still yields every column that the later cells index into.
_RESULT_COLUMNS = [
    "dataset",
    "dataset_type",
    "n_records",
    "n_positive",
    *(f"steering_{key}" for key in _METRIC_KEYS),
    *(f"activation_{key}" for key in _METRIC_KEYS),
]


def _vector_metrics(data, section, prefix):
    """Return ``{prefix}_{metric}: value`` for one vector section; None where absent."""
    section_data = data.get(section)
    metrics = section_data.get("metrics") if isinstance(section_data, dict) else None
    if not isinstance(metrics, dict):
        # Section or its "metrics" entry is missing/null: report every metric as None.
        metrics = {}
    return {f"{prefix}_{key}": metrics.get(key) for key in _METRIC_KEYS}


def load_results(result_dir):
    """Load all evaluation results from a directory.

    Reads every ``*.json`` file directly inside ``result_dir`` except
    ``eval_summary.json`` and flattens each one into a single row.

    Args:
        result_dir: Directory (str or Path) containing per-dataset JSON files.
            Each file must have "dataset" and "dataset_type" keys; "n_records",
            "n_positive", and the "steering_vector" / "activation_vector"
            sections (each holding a "metrics" dict) are optional.

    Returns:
        pandas.DataFrame with one row per file, in sorted filename order, and
        always the same columns: dataset, dataset_type, n_records, n_positive,
        steering_{test_roc_auc,test_accuracy,test_f1,train_roc_auc} and
        activation_{...}. Missing values are None/NaN. A directory with no
        matching files gives an empty frame that still has these columns.

    Raises:
        KeyError: If a file lacks "dataset" or "dataset_type".
        json.JSONDecodeError: If a file is not valid JSON.
    """
    results = []

    # Sort so row order does not depend on filesystem enumeration order.
    for json_file in sorted(Path(result_dir).glob("*.json")):
        if json_file.name == "eval_summary.json":
            continue

        with open(json_file, encoding="utf-8") as f:
            data = json.load(f)

        # Extract key metrics
        result = {
            "dataset": data["dataset"],
            "dataset_type": data["dataset_type"],
            "n_records": data.get("n_records"),
            "n_positive": data.get("n_positive"),
        }
        # Both vector kinds are handled identically, so every row carries
        # every metric column even when a section is absent from the file.
        result.update(_vector_metrics(data, "steering_vector", "steering"))
        result.update(_vector_metrics(data, "activation_vector", "activation"))

        results.append(result)

    return pd.DataFrame(results, columns=_RESULT_COLUMNS)

# Read the per-dataset evaluation JSONs: one directory for roles, one for traits.
roles_df = load_results("/workspace/classifier_eval_comprehensive/")
traits_df = load_results("/workspace/classifier_eval_comprehensive_traits/")

# Stack both result sets into a single frame with a fresh 0..n-1 index.
df = pd.concat([roles_df, traits_df], ignore_index=True)

# A row counts as having "both vectors" when its activation test ROC-AUC is
# present; every remaining row is reported as steering-only.
has_activation = df['activation_test_roc_auc'].notna()

print(f"Loaded {len(roles_df)} roles and {len(traits_df)} traits")
print(f"Total: {len(df)} evaluations")
print(f"\nWith both vectors: {has_activation.sum()}")
print(f"Steering only: {(~has_activation).sum()}")

In [None]:
# Preview the first rows of the combined roles + traits results table.
df.head()

## Summary Statistics

In [None]:
# Filter to cases where we have both vectors.
# A dataset is only comparable when BOTH methods produced a test ROC-AUC; a
# row missing either one would otherwise inflate the win-percentage
# denominators and make the paired t-test return NaN.
auc_cols = ['steering_test_roc_auc', 'activation_test_roc_auc']
has_both = df['activation_test_roc_auc'].notna() & df['steering_test_roc_auc'].notna()
df_both = df[has_both].copy()
# Force a numeric dtype: a column that was all-missing in one of the source
# frames can otherwise survive the concat as object dtype.
df_both[auc_cols] = df_both[auc_cols].astype(float)

print(f"Comparing {len(df_both)} datasets with both vectors\n")
print("=" * 60)

# Overall statistics, reported separately for each dataset type
for dataset_type in ['role', 'trait']:
    subset = df_both[df_both['dataset_type'] == dataset_type]
    if len(subset) == 0:
        continue

    print(f"\n{dataset_type.upper()}S (n={len(subset)})")
    print("-" * 60)

    # Per-method distribution of test ROC-AUC
    for method, col in [('Trained Steering', 'steering_test_roc_auc'),
                        ('Activation (CAA)', 'activation_test_roc_auc')]:
        values = subset[col].dropna()
        print(f"\n{method}:")
        print(f"  Mean:   {values.mean():.4f}")
        print(f"  Median: {values.median():.4f}")
        print(f"  Std:    {values.std():.4f}")
        print(f"  Min:    {values.min():.4f}")
        print(f"  Max:    {values.max():.4f}")

    # Difference: positive values mean the activation (CAA) vector scored higher
    diff = subset['activation_test_roc_auc'] - subset['steering_test_roc_auc']
    n_pairs = len(diff)
    activation_wins = (diff > 0).sum()
    steering_wins = (diff < 0).sum()
    print(f"\nDifference (Activation - Steering):")
    print(f"  Mean:   {diff.mean():.4f}")
    print(f"  Median: {diff.median():.4f}")
    print(f"  Std:    {diff.std():.4f}")
    print(f"\nActivation wins: {activation_wins} ({100 * activation_wins / n_pairs:.1f}%)")
    print(f"Steering wins:   {steering_wins} ({100 * steering_wins / n_pairs:.1f}%)")
    print(f"Ties:            {(diff == 0).sum()}")

    # Paired t-test (each dataset contributes one activation/steering pair).
    # With a single pair the statistic is undefined, so report that instead
    # of printing NaNs.
    if n_pairs < 2:
        print("\nPaired t-test: skipped (needs at least 2 paired datasets)")
        continue
    t_stat, p_value = stats.ttest_rel(
        subset['activation_test_roc_auc'],
        subset['steering_test_roc_auc']
    )
    print(f"\nPaired t-test: t={t_stat:.3f}, p={p_value:.2e}")

## Visualizations

In [None]:
# Scatter plot: Steering vs Activation
# One panel per dataset type. Each point is one dataset: x is the trained
# steering vector's test ROC-AUC, y is the activation (CAA) vector's.
# Points above the dashed diagonal are datasets where CAA scored higher.
fig, axes = plt.subplots(1, 2, figsize=(16, 7))
auc_cols = ['steering_test_roc_auc', 'activation_test_roc_auc']

for idx, dataset_type in enumerate(['role', 'trait']):
    ax = axes[idx]
    subset = df_both[df_both['dataset_type'] == dataset_type]

    if len(subset) == 0:
        # Keep the panel but say why it is empty
        ax.text(0.5, 0.5, f'No {dataset_type} data yet',
                ha='center', va='center', transform=ax.transAxes)
        continue

    # Scatter
    ax.scatter(subset['steering_test_roc_auc'],
               subset['activation_test_roc_auc'],
               alpha=0.5, s=50)

    # Axis range: [0.5, 1.0] by default, but widen the lower bound when any
    # score falls below 0.5 so that those points are not silently clipped.
    # (A NaN minimum compares False and falls back to 0.5.)
    lowest = subset[auc_cols].min().min()
    lower = max(0.0, lowest - 0.02) if lowest < 0.5 else 0.5
    lims = [lower, 1.0]

    # Diagonal line (equal performance)
    ax.plot(lims, lims, 'r--', alpha=0.5, label='Equal performance')

    # Correlation between the two methods' scores (shown in the title)
    corr = subset['steering_test_roc_auc'].corr(subset['activation_test_roc_auc'])

    ax.set_xlabel('Trained Steering Vector (Test ROC-AUC)', fontsize=12)
    ax.set_ylabel('Activation Vector / CAA (Test ROC-AUC)', fontsize=12)
    ax.set_title(f'{dataset_type.capitalize()}s (n={len(subset)})\nCorrelation: {corr:.3f}',
                 fontsize=14)
    ax.set_xlim(lims)
    ax.set_ylim(lims)
    ax.legend()
    ax.grid(True, alpha=0.3)
    # Same scale on both axes so the diagonal is a true 45-degree line
    ax.set_aspect('equal')

plt.tight_layout()
plt.savefig('/workspace/steering_vs_activation_scatter.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Distribution comparison
# Row per dataset type. Left: overlaid histograms of each method's test
# ROC-AUC. Right: histogram of the per-dataset difference (activation - steering).
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

for row_idx, dataset_type in enumerate(['role', 'trait']):
    subset = df_both[df_both['dataset_type'] == dataset_type]

    if len(subset) == 0:
        # Label the empty panels instead of leaving blank axes
        for ax in axes[row_idx]:
            ax.text(0.5, 0.5, f'No {dataset_type} data yet',
                    ha='center', va='center', transform=ax.transAxes)
        continue

    steering_auc = subset['steering_test_roc_auc'].dropna()
    activation_auc = subset['activation_test_roc_auc'].dropna()

    # Overlaid histograms are only comparable when they share bin edges;
    # passing bins=30 to each call would bin each method over its own range.
    bin_edges = np.histogram_bin_edges(
        pd.concat([steering_auc, activation_auc]).to_numpy(dtype=float), bins=30
    )

    # Histograms
    ax = axes[row_idx, 0]
    ax.hist(steering_auc, bins=bin_edges, alpha=0.5,
            label='Trained Steering', color='blue')
    ax.hist(activation_auc, bins=bin_edges, alpha=0.5,
            label='Activation (CAA)', color='orange')
    ax.set_xlabel('Test ROC-AUC', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.set_title(f'{dataset_type.capitalize()}s - Distribution', fontsize=14)
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Difference histogram: values right of the red line favour the activation vector
    ax = axes[row_idx, 1]
    diff = (subset['activation_test_roc_auc'] - subset['steering_test_roc_auc']).dropna()
    ax.hist(diff, bins=30, alpha=0.7, color='green')
    ax.axvline(0, color='red', linestyle='--', linewidth=2, label='No difference')
    ax.axvline(diff.mean(), color='black', linestyle='-', linewidth=2,
               label=f'Mean: {diff.mean():.4f}')
    ax.set_xlabel('Activation ROC-AUC - Steering ROC-AUC', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.set_title(f'{dataset_type.capitalize()}s - Performance Difference', fontsize=14)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('/workspace/steering_vs_activation_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Box plots for comparison
# One box per (dataset type, method) pair, in the order
# role/steering, role/activation, trait/steering, trait/activation.
fig, ax = plt.subplots(figsize=(12, 8))

plot_data = []
labels = []

for dataset_type in ['role', 'trait']:
    subset = df_both[df_both['dataset_type'] == dataset_type]
    if len(subset) == 0:
        continue

    # Drop NaNs: boxplot statistics computed over NaN draw an empty box
    plot_data.append(subset['steering_test_roc_auc'].dropna().values)
    labels.append(f'{dataset_type.capitalize()}\nSteering')

    plot_data.append(subset['activation_test_roc_auc'].dropna().values)
    labels.append(f'{dataset_type.capitalize()}\nActivation')

if plot_data:
    # Tick labels are set on the axis rather than through boxplot's `labels=`
    # keyword, which Matplotlib 3.9 renamed to `tick_labels`; this form works
    # on both older and newer versions.
    bp = ax.boxplot(plot_data, patch_artist=True)
    ax.set_xticklabels(labels)

    # Color the boxes: blue = steering, coral = activation (alternating)
    colors = ['lightblue', 'lightcoral'] * 2
    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)
else:
    # No group has data yet: say so instead of calling boxplot with nothing
    ax.text(0.5, 0.5, 'No data yet',
            ha='center', va='center', transform=ax.transAxes)

ax.set_ylabel('Test ROC-AUC', fontsize=12)
ax.set_title('Comparison: Trained Steering vs Activation Vectors (CAA)', fontsize=14)
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('/workspace/steering_vs_activation_boxplot.png', dpi=150, bbox_inches='tight')
plt.show()

## Top/Bottom Performers

In [None]:
# Signed per-dataset gap: positive means the activation (CAA) vector scored
# higher than the trained steering vector. Stored on df_both as 'diff'.
df_both['diff'] = df_both['activation_test_roc_auc'].sub(df_both['steering_test_roc_auc'])

# Columns shown in both rankings
report_cols = ['dataset', 'dataset_type',
               'steering_test_roc_auc', 'activation_test_roc_auc', 'diff']
rule = "=" * 80

# Ten largest gaps (activation furthest ahead)
print("Top 10: Activation vectors much better than trained steering")
print(rule)
top10 = df_both.nlargest(10, 'diff')[report_cols]
print(top10.to_string(index=False))

# Ten smallest gaps (steering furthest ahead)
print("\n\nTop 10: Trained steering better than activation vectors")
print(rule)
bottom10 = df_both.nsmallest(10, 'diff')[report_cols]
print(bottom10.to_string(index=False))

## Train vs Test Performance

In [None]:
# Check for overfitting - compare train vs test
# One panel per method. Each point is one dataset; points below the dashed
# diagonal have a lower test ROC-AUC than train ROC-AUC.
fig, axes = plt.subplots(1, 2, figsize=(16, 7))

methods = [('steering', 'Trained Steering'),
           ('activation', 'Activation (CAA)')]

for ax, (prefix, label) in zip(axes, methods):
    train_col = f'{prefix}_train_roc_auc'
    test_col = f'{prefix}_test_roc_auc'

    # Only datasets that have both a train and a test score for this method
    subset = df_both[[train_col, test_col]].dropna()

    ax.scatter(subset[train_col], subset[test_col], alpha=0.5, s=50)

    # Axis range: [0.5, 1.0] by default, but widen the lower bound when any
    # score falls below 0.5 so that those points are not silently clipped.
    # (An empty subset gives a NaN minimum, which falls back to 0.5.)
    lowest = subset.min().min()
    lower = max(0.0, lowest - 0.02) if lowest < 0.5 else 0.5
    lims = [lower, 1.0]

    # Diagonal: train ROC-AUC == test ROC-AUC
    ax.plot(lims, lims, 'r--', alpha=0.5, label='No overfitting')

    ax.set_xlabel('Train ROC-AUC', fontsize=12)
    ax.set_ylabel('Test ROC-AUC', fontsize=12)
    ax.set_title(f'{label}\n(n={len(subset)})', fontsize=14)
    ax.set_xlim(lims)
    ax.set_ylim(lims)
    ax.legend()
    ax.grid(True, alpha=0.3)
    # Same scale on both axes so the diagonal is a true 45-degree line
    ax.set_aspect('equal')

plt.tight_layout()
plt.savefig('/workspace/train_vs_test_overfitting.png', dpi=150, bbox_inches='tight')
plt.show()

## Export Summary Table

In [None]:
# Aggregate per dataset type: spread of each method's test ROC-AUC plus the
# mean/std of their gap. Relies on the 'diff' column that the Top/Bottom
# Performers cell adds to df_both.
auc_stats = ['mean', 'std', 'min', 'max']
agg_spec = {
    'steering_test_roc_auc': auc_stats,
    'activation_test_roc_auc': auc_stats,
    'diff': ['mean', 'std'],
}
grouped = df_both.groupby('dataset_type')
summary = grouped.agg(agg_spec).round(4)

print("\nSummary Table:")
print(summary)

# Write the full per-dataset comparison table (without the index) to CSV
csv_path = '/workspace/steering_vs_activation_comparison.csv'
df_both.to_csv(csv_path, index=False)
print("\nFull comparison saved to: /workspace/steering_vs_activation_comparison.csv")