# Top 10 Features from Alignment Steering Analysis

This notebook displays the top 10 features by total usage from the alignment steering analysis, along with their detailed descriptions from the NeuronpediaCache.

In [19]:
# Import required libraries
import json
import pandas as pd
import os
from pathlib import Path

In [20]:
# Set up file paths
base_path = Path('/home/jazhyc/projects/FSRL/feature-steering-RL')
alignment_data_path = base_path / 'outputs' / 'feature_classification' / 'comprehensive_analysis' / 'alignment_steering_analysis_prompt_chosen_ignore_mask_adapter_feature_usage.json'
descriptions_path = base_path / 'models' / 'NeuronpediaCache' / 'gemma-2-2b' / '12-gemmascope-res-65k__l0-21.json'

print(f"Alignment data path: {alignment_data_path}")
print(f"Alignment data exists: {alignment_data_path.exists()}")
print(f"Descriptions path: {descriptions_path}")
print(f"Descriptions exists: {descriptions_path.exists()}")

Alignment data path: /home/jazhyc/projects/FSRL/feature-steering-RL/outputs/feature_classification/comprehensive_analysis/alignment_steering_analysis_prompt_chosen_ignore_mask_adapter_feature_usage.json
Alignment data exists: True
Descriptions path: /home/jazhyc/projects/FSRL/feature-steering-RL/models/NeuronpediaCache/gemma-2-2b/12-gemmascope-res-65k__l0-21.json
Descriptions exists: True


In [21]:
# Load alignment steering feature usage data
with open(alignment_data_path, 'r') as f:
    alignment_data = json.load(f)

print(f"Loaded {len(alignment_data)} features from alignment data")
print(f"Sample feature keys: {list(alignment_data.keys())[:5]}")

Loaded 2 features from alignment data
Sample feature keys: ['summary', 'feature_usage_details']


In [22]:
# Load feature descriptions
with open(descriptions_path, 'r') as f:
    descriptions_list = json.load(f)

# Convert list to dictionary for easy lookup by index
descriptions_dict = {item['index']: item['description'] for item in descriptions_list}

print(f"Loaded descriptions for {len(descriptions_dict)} features")
print(f"Sample description indices: {list(descriptions_dict.keys())[:5]}")

Loaded descriptions for 65344 features
Sample description indices: ['53844', '53847', '53890', '53924', '53896']


In [23]:
# Calculate total usage for each feature and get top 10
feature_usage_list = alignment_data['feature_usage_details']

# Sort by usage_count and get top 10
top_10_features = sorted(feature_usage_list, key=lambda x: x['usage_count'], reverse=True)[:10]

print("Top 10 Features by Total Usage:")
for i, feature in enumerate(top_10_features, 1):
    print(f"{i}. Feature {feature['feature_index']}: {feature['usage_count']:,} total usage ({feature['usage_percentage']:.4%}) - Classification: {feature['classification']}")

Top 10 Features by Total Usage:
1. Feature 37761: 1,360,962 total usage (14.9685%) - Classification: related
2. Feature 64067: 1,330,427 total usage (14.6327%) - Classification: not-related
3. Feature 60867: 1,301,138 total usage (14.3105%) - Classification: not-related
4. Feature 62715: 1,280,258 total usage (14.0809%) - Classification: not-related
5. Feature 65241: 1,246,377 total usage (13.7082%) - Classification: not-related
6. Feature 32656: 1,235,197 total usage (13.5853%) - Classification: related
7. Feature 48790: 1,218,058 total usage (13.3968%) - Classification: not-related
8. Feature 14472: 1,217,204 total usage (13.3874%) - Classification: not-related
9. Feature 64365: 1,214,469 total usage (13.3573%) - Classification: not-related
10. Feature 64257: 1,206,512 total usage (13.2698%) - Classification: not-related


In [24]:
# Create a detailed display of top 10 features with descriptions
print("\n" + "="*80)
print("TOP 10 FEATURES WITH DESCRIPTIONS")
print("="*80 + "\n")

for i, feature in enumerate(top_10_features, 1):
    feature_idx = feature['feature_index']
    feature_idx_str = str(feature_idx)
    usage_count = feature['usage_count']
    usage_percentage = feature['usage_percentage']
    classification = feature['classification']
    
    # Get description if available
    description = descriptions_dict.get(feature_idx_str, "No description available")
    
    print(f"🏆 RANK {i}: Feature {feature_idx}")
    print(f"   Description: {description}")
    print(f"   Usage Count: {usage_count:,}")
    print(f"   Usage Percentage: {usage_percentage:.4%}")
    print(f"   Classification: {classification}")
    print("-" * 80 + "\n")


TOP 10 FEATURES WITH DESCRIPTIONS

🏆 RANK 1: Feature 37761
   Description:  instances of failure and error indications in the text
   Usage Count: 1,360,962
   Usage Percentage: 14.9685%
   Classification: related
--------------------------------------------------------------------------------

🏆 RANK 2: Feature 64067
   Description:  Japanese words or phrases
   Usage Count: 1,330,427
   Usage Percentage: 14.6327%
   Classification: not-related
--------------------------------------------------------------------------------

🏆 RANK 3: Feature 60867
   Description: technical methodologies and analyses related to data processing and modeling
   Usage Count: 1,301,138
   Usage Percentage: 14.3105%
   Classification: not-related
--------------------------------------------------------------------------------

🏆 RANK 4: Feature 62715
   Description: concepts related to healthcare and employment
   Usage Count: 1,280,258
   Usage Percentage: 14.0809%
   Classification: not-related
--------

In [33]:
# Create a summary table
summary_data = []

for i, feature in enumerate(top_10_features, 1):
    feature_idx_str = str(feature['feature_index'])
    description = descriptions_dict.get(feature_idx_str, "No description available")
    
    summary_data.append({
        'Rank': i,
        'Feature Index': feature['feature_index'],
        'Usage Count': feature['usage_count'],
        'Usage %': f"{feature['usage_percentage']:.2%}",
        'Classification': feature['classification'],
        'Description': description
    })

# Create DataFrame for better display
summary_df = pd.DataFrame(summary_data)

print("📊 SUMMARY TABLE: Top 10 Features")
print("=" * 120)
print(summary_df.to_string(index=False, max_colwidth=200))

📊 SUMMARY TABLE: Top 10 Features
 Rank  Feature Index  Usage Count Usage % Classification                                                                                                         Description
    1          37761      1360962  14.97%        related                                                              instances of failure and error indications in the text
    2          64067      1330427  14.63%    not-related                                                                                           Japanese words or phrases
    3          60867      1301138  14.31%    not-related                                        technical methodologies and analyses related to data processing and modeling
    4          62715      1280258  14.08%    not-related                                                                       concepts related to healthcare and employment
    5          65241      1246377  13.71%    not-related  references to specific chemical compounds or

In [26]:
# Save the results to a CSV file for reference
output_path = base_path / 'outputs' / 'top_10_alignment_features.csv'
summary_df.to_csv(output_path, index=False)
print(f"\n✅ Results saved to: {output_path}")

# Also save full descriptions (not truncated)
full_summary_data = []
for i, feature in enumerate(top_10_features, 1):
    feature_idx_str = str(feature['feature_index'])
    description = descriptions_dict.get(feature_idx_str, "No description available")
    
    full_summary_data.append({
        'Rank': i,
        'Feature Index': feature['feature_index'],
        'Usage Count': feature['usage_count'],
        'Usage Percentage': feature['usage_percentage'],
        'Classification': feature['classification'],
        'Full Description': description
    })

full_output_path = base_path / 'outputs' / 'top_10_alignment_features_full_descriptions.csv'
pd.DataFrame(full_summary_data).to_csv(full_output_path, index=False)
print(f"✅ Full descriptions saved to: {full_output_path}")


✅ Results saved to: /home/jazhyc/projects/FSRL/feature-steering-RL/outputs/top_10_alignment_features.csv
✅ Full descriptions saved to: /home/jazhyc/projects/FSRL/feature-steering-RL/outputs/top_10_alignment_features_full_descriptions.csv


# Top Features by Mean Activation

Now let's analyze features based on their mean activation values - both highest positive and lowest (most negative) mean activations.

In [27]:
# Sort features by mean activation to find top positive and most negative
feature_usage_list = alignment_data['feature_usage_details']

# Top 5 features with highest positive mean activation
top_positive_activation = sorted(feature_usage_list, key=lambda x: x['mean_activation'], reverse=True)[:5]

# Top 5 features with most negative mean activation (lowest values)
top_negative_activation = sorted(feature_usage_list, key=lambda x: x['mean_activation'])[:5]

In [28]:
# Create detailed display with descriptions for top positive activation features
print("\n" + "="*80)
print("🔥 TOP 5 FEATURES WITH HIGHEST POSITIVE MEAN ACTIVATION - WITH DESCRIPTIONS")
print("="*80 + "\n")

for i, feature in enumerate(top_positive_activation, 1):
    feature_idx = feature['feature_index']
    feature_idx_str = str(feature_idx)
    mean_activation = feature['mean_activation']
    usage_count = feature['usage_count']
    usage_percentage = feature['usage_percentage']
    classification = feature['classification']
    firing_rate = feature['firing_rate_percent']
    
    # Get description if available
    description = descriptions_dict.get(feature_idx_str, "No description available")
    
    print(f"🏆 RANK {i}: Feature {feature_idx}")
    print(f"   Description: {description}")
    print(f"   Mean Activation: {mean_activation:.6f}")
    print(f"   Usage Count: {usage_count:,}")
    print(f"   Usage Percentage: {usage_percentage:.4%}")
    print(f"   Firing Rate: {firing_rate:.2f}%")
    print(f"   Classification: {classification}")
    print("-" * 80 + "\n")


🔥 TOP 5 FEATURES WITH HIGHEST POSITIVE MEAN ACTIVATION - WITH DESCRIPTIONS

🏆 RANK 1: Feature 64067
   Description:  Japanese words or phrases
   Mean Activation: 0.061550
   Usage Count: 1,330,427
   Usage Percentage: 14.6327%
   Firing Rate: 91.26%
   Classification: not-related
--------------------------------------------------------------------------------

🏆 RANK 2: Feature 37761
   Description:  instances of failure and error indications in the text
   Mean Activation: 0.055190
   Usage Count: 1,360,962
   Usage Percentage: 14.9685%
   Firing Rate: 93.36%
   Classification: related
--------------------------------------------------------------------------------

🏆 RANK 3: Feature 60867
   Description: technical methodologies and analyses related to data processing and modeling
   Mean Activation: 0.054161
   Usage Count: 1,301,138
   Usage Percentage: 14.3105%
   Firing Rate: 89.25%
   Classification: not-related
------------------------------------------------------------------

In [29]:
# Create detailed display with descriptions for top negative activation features
print("\n" + "="*80)
print("❄️ TOP 5 FEATURES WITH LOWEST (MOST NEGATIVE) MEAN ACTIVATION - WITH DESCRIPTIONS")
print("="*80 + "\n")

for i, feature in enumerate(top_negative_activation, 1):
    feature_idx = feature['feature_index']
    feature_idx_str = str(feature_idx)
    mean_activation = feature['mean_activation']
    usage_count = feature['usage_count']
    usage_percentage = feature['usage_percentage']
    classification = feature['classification']
    firing_rate = feature['firing_rate_percent']
    
    # Get description if available
    description = descriptions_dict.get(feature_idx_str, "No description available")
    
    print(f"🏆 RANK {i}: Feature {feature_idx}")
    print(f"   Description: {description}")
    print(f"   Mean Activation: {mean_activation:.6f}")
    print(f"   Usage Count: {usage_count:,}")
    print(f"   Usage Percentage: {usage_percentage:.4%}")
    print(f"   Firing Rate: {firing_rate:.2f}%")
    print(f"   Classification: {classification}")
    print("-" * 80 + "\n")


❄️ TOP 5 FEATURES WITH LOWEST (MOST NEGATIVE) MEAN ACTIVATION - WITH DESCRIPTIONS

🏆 RANK 1: Feature 62837
   Description:  phrases that include the word "with"
   Mean Activation: -0.033287
   Usage Count: 71,552
   Usage Percentage: 0.7870%
   Firing Rate: 4.91%
   Classification: not-related
--------------------------------------------------------------------------------

🏆 RANK 2: Feature 22069
   Description: specific details about events and their outcomes in narrative contexts
   Mean Activation: -0.033005
   Usage Count: 59,184
   Usage Percentage: 0.6509%
   Firing Rate: 4.06%
   Classification: not-related
--------------------------------------------------------------------------------

🏆 RANK 3: Feature 63256
   Description:  sequences of code or programming syntax
   Mean Activation: -0.030185
   Usage Count: 37,472
   Usage Percentage: 0.4121%
   Firing Rate: 2.57%
   Classification: not-related
-----------------------------------------------------------------------------

In [30]:
# Create summary tables for mean activation analysis
positive_activation_data = []
negative_activation_data = []

# Positive activation summary
for i, feature in enumerate(top_positive_activation, 1):
    feature_idx_str = str(feature['feature_index'])
    description = descriptions_dict.get(feature_idx_str, "No description available")
    
    positive_activation_data.append({
        'Rank': i,
        'Feature Index': feature['feature_index'],
        'Mean Activation': f"{feature['mean_activation']:.6f}",
        'Usage Count': feature['usage_count'],
        'Usage %': f"{feature['usage_percentage']:.2%}",
        'Firing Rate %': f"{feature['firing_rate_percent']:.2f}%",
        'Classification': feature['classification'],
        'Description': description[:80] + '...' if len(description) > 80 else description
    })

# Negative activation summary
for i, feature in enumerate(top_negative_activation, 1):
    feature_idx_str = str(feature['feature_index'])
    description = descriptions_dict.get(feature_idx_str, "No description available")
    
    negative_activation_data.append({
        'Rank': i,
        'Feature Index': feature['feature_index'],
        'Mean Activation': f"{feature['mean_activation']:.6f}",
        'Usage Count': feature['usage_count'],
        'Usage %': f"{feature['usage_percentage']:.2%}",
        'Firing Rate %': f"{feature['firing_rate_percent']:.2f}%",
        'Classification': feature['classification'],
        'Description': description[:80] + '...' if len(description) > 80 else description
    })

# Display tables
positive_df = pd.DataFrame(positive_activation_data)
negative_df = pd.DataFrame(negative_activation_data)

print("📊 SUMMARY TABLE: Top 5 Features by Highest Positive Mean Activation")
print("=" * 140)
print(positive_df.to_string(index=False, max_colwidth=50))

print("\n📊 SUMMARY TABLE: Top 5 Features by Lowest (Most Negative) Mean Activation")
print("=" * 140)
print(negative_df.to_string(index=False, max_colwidth=50))

📊 SUMMARY TABLE: Top 5 Features by Highest Positive Mean Activation
 Rank  Feature Index Mean Activation  Usage Count Usage % Firing Rate % Classification                                        Description
    1          64067        0.061550      1330427  14.63%        91.26%    not-related                          Japanese words or phrases
    2          37761        0.055190      1360962  14.97%        93.36%        related  instances of failure and error indications in ...
    3          60867        0.054161      1301138  14.31%        89.25%    not-related technical methodologies and analyses related to...
    4          65241        0.046128      1246377  13.71%        85.50%    not-related  references to specific chemical compounds or s...
    5          32656        0.044654      1235197  13.59%        84.73%        related  visual content identifiers such as images and ...

📊 SUMMARY TABLE: Top 5 Features by Lowest (Most Negative) Mean Activation
 Rank  Feature Index Mean Act

In [31]:
# Save the mean activation analysis results to CSV files
positive_output_path = base_path / 'outputs' / 'top_5_positive_mean_activation_features.csv'
negative_output_path = base_path / 'outputs' / 'top_5_negative_mean_activation_features.csv'

positive_df.to_csv(positive_output_path, index=False)
negative_df.to_csv(negative_output_path, index=False)

print(f"✅ Positive activation results saved to: {positive_output_path}")
print(f"✅ Negative activation results saved to: {negative_output_path}")

# Also save full descriptions (not truncated) for mean activation analysis
positive_full_data = []
negative_full_data = []

for i, feature in enumerate(top_positive_activation, 1):
    feature_idx_str = str(feature['feature_index'])
    description = descriptions_dict.get(feature_idx_str, "No description available")
    
    positive_full_data.append({
        'Rank': i,
        'Feature Index': feature['feature_index'],
        'Mean Activation': feature['mean_activation'],
        'Usage Count': feature['usage_count'],
        'Usage Percentage': feature['usage_percentage'],
        'Firing Rate Percentage': feature['firing_rate_percent'],
        'Classification': feature['classification'],
        'Full Description': description
    })

for i, feature in enumerate(top_negative_activation, 1):
    feature_idx_str = str(feature['feature_index'])
    description = descriptions_dict.get(feature_idx_str, "No description available")
    
    negative_full_data.append({
        'Rank': i,
        'Feature Index': feature['feature_index'],
        'Mean Activation': feature['mean_activation'],
        'Usage Count': feature['usage_count'],
        'Usage Percentage': feature['usage_percentage'],
        'Firing Rate Percentage': feature['firing_rate_percent'],
        'Classification': feature['classification'],
        'Full Description': description
    })

positive_full_output_path = base_path / 'outputs' / 'top_5_positive_mean_activation_features_full_descriptions.csv'
negative_full_output_path = base_path / 'outputs' / 'top_5_negative_mean_activation_features_full_descriptions.csv'

pd.DataFrame(positive_full_data).to_csv(positive_full_output_path, index=False)
pd.DataFrame(negative_full_data).to_csv(negative_full_output_path, index=False)

print(f"✅ Positive activation full descriptions saved to: {positive_full_output_path}")
print(f"✅ Negative activation full descriptions saved to: {negative_full_output_path}")

✅ Positive activation results saved to: /home/jazhyc/projects/FSRL/feature-steering-RL/outputs/top_5_positive_mean_activation_features.csv
✅ Negative activation results saved to: /home/jazhyc/projects/FSRL/feature-steering-RL/outputs/top_5_negative_mean_activation_features.csv
✅ Positive activation full descriptions saved to: /home/jazhyc/projects/FSRL/feature-steering-RL/outputs/top_5_positive_mean_activation_features_full_descriptions.csv
✅ Negative activation full descriptions saved to: /home/jazhyc/projects/FSRL/feature-steering-RL/outputs/top_5_negative_mean_activation_features_full_descriptions.csv
