# Top 10 Features from Alignment Steering Analysis

This notebook displays the top 10 features by total usage from the alignment steering analysis, along with their detailed descriptions from the NeuronpediaCache.

In [1]:
# Import required libraries
import json
import pandas as pd
import os
from pathlib import Path

In [2]:
# Set up file paths
base_path = Path('/home/jazhyc/projects/FSRL/feature-steering-RL')
alignment_data_path = base_path / 'outputs' / 'feature_classification' / 'comprehensive_analysis' / 'alignment_steering_analysis_prompt_only_ignore_mask_adapter_feature_usage.json'
descriptions_path = base_path / 'models' / 'NeuronpediaCache' / 'gemma-2-2b' / '12-gemmascope-res-65k__l0-21.json'

print(f"Alignment data path: {alignment_data_path}")
print(f"Alignment data exists: {alignment_data_path.exists()}")
print(f"Descriptions path: {descriptions_path}")
print(f"Descriptions exists: {descriptions_path.exists()}")

Alignment data path: /home/jazhyc/projects/FSRL/feature-steering-RL/outputs/feature_classification/comprehensive_analysis/alignment_steering_analysis_prompt_only_ignore_mask_adapter_feature_usage.json
Alignment data exists: True
Descriptions path: /home/jazhyc/projects/FSRL/feature-steering-RL/models/NeuronpediaCache/gemma-2-2b/12-gemmascope-res-65k__l0-21.json
Descriptions exists: True


In [3]:
# Load alignment steering feature usage data
with open(alignment_data_path, 'r') as f:
    alignment_data = json.load(f)

print(f"Loaded {len(alignment_data)} features from alignment data")
print(f"Sample feature keys: {list(alignment_data.keys())[:5]}")

Loaded 2 features from alignment data
Sample feature keys: ['summary', 'feature_usage_details']


In [4]:
# Load feature descriptions
with open(descriptions_path, 'r') as f:
    descriptions_list = json.load(f)

# Convert list to dictionary for easy lookup by index
descriptions_dict = {item['index']: item['description'] for item in descriptions_list}

print(f"Loaded descriptions for {len(descriptions_dict)} features")
print(f"Sample description indices: {list(descriptions_dict.keys())[:5]}")

Loaded descriptions for 65344 features
Sample description indices: ['53844', '53847', '53890', '53924', '53896']


In [5]:
# Calculate total usage for each feature and get top 10
feature_usage_list = alignment_data['feature_usage_details']

# Sort by usage_count and get top 10
top_10_features = sorted(feature_usage_list, key=lambda x: x['usage_count'], reverse=True)[:10]

print("Top 10 Features by Total Usage:")
for i, feature in enumerate(top_10_features, 1):
    print(f"{i}. Feature {feature['feature_index']}: {feature['usage_count']:,} total usage ({feature['usage_percentage']:.4%}) - Classification: {feature['classification']}")

Top 10 Features by Total Usage:
1. Feature 37761: 438,692 total usage (11.6656%) - Classification: related
2. Feature 64067: 433,371 total usage (11.5241%) - Classification: not-related
3. Feature 60867: 421,840 total usage (11.2174%) - Classification: not-related
4. Feature 62715: 415,408 total usage (11.0464%) - Classification: not-related
5. Feature 65241: 408,836 total usage (10.8716%) - Classification: not-related
6. Feature 32656: 406,304 total usage (10.8043%) - Classification: related
7. Feature 48790: 401,773 total usage (10.6838%) - Classification: not-related
8. Feature 14472: 398,370 total usage (10.5933%) - Classification: not-related
9. Feature 64257: 396,790 total usage (10.5513%) - Classification: not-related
10. Feature 64365: 394,366 total usage (10.4869%) - Classification: not-related


In [6]:
# Create a detailed display of top 10 features with descriptions
print("\n" + "="*80)
print("TOP 10 FEATURES WITH DESCRIPTIONS")
print("="*80 + "\n")

for i, feature in enumerate(top_10_features, 1):
    feature_idx = feature['feature_index']
    feature_idx_str = str(feature_idx)
    usage_count = feature['usage_count']
    usage_percentage = feature['usage_percentage']
    classification = feature['classification']
    
    # Get description if available
    description = descriptions_dict.get(feature_idx_str, "No description available")
    
    print(f"🏆 RANK {i}: Feature {feature_idx}")
    print(f"   Description: {description}")
    print(f"   Usage Count: {usage_count:,}")
    print(f"   Usage Percentage: {usage_percentage:.4%}")
    print(f"   Classification: {classification}")
    print("-" * 80 + "\n")


TOP 10 FEATURES WITH DESCRIPTIONS

🏆 RANK 1: Feature 37761
   Description:  instances of failure and error indications in the text
   Usage Count: 438,692
   Usage Percentage: 11.6656%
   Classification: related
--------------------------------------------------------------------------------

🏆 RANK 2: Feature 64067
   Description:  Japanese words or phrases
   Usage Count: 433,371
   Usage Percentage: 11.5241%
   Classification: not-related
--------------------------------------------------------------------------------

🏆 RANK 3: Feature 60867
   Description: technical methodologies and analyses related to data processing and modeling
   Usage Count: 421,840
   Usage Percentage: 11.2174%
   Classification: not-related
--------------------------------------------------------------------------------

🏆 RANK 4: Feature 62715
   Description: concepts related to healthcare and employment
   Usage Count: 415,408
   Usage Percentage: 11.0464%
   Classification: not-related
----------------

In [7]:
# Create a summary table
summary_data = []

for i, feature in enumerate(top_10_features, 1):
    feature_idx_str = str(feature['feature_index'])
    description = descriptions_dict.get(feature_idx_str, "No description available")
    
    summary_data.append({
        'Rank': i,
        'Feature Index': feature['feature_index'],
        'Usage Count': feature['usage_count'],
        'Usage %': f"{feature['usage_percentage']:.2%}",
        'Classification': feature['classification'],
        'Description': description[:100] + '...' if len(description) > 100 else description
    })

# Create DataFrame for better display
summary_df = pd.DataFrame(summary_data)

print("📊 SUMMARY TABLE: Top 10 Features")
print("=" * 120)
print(summary_df.to_string(index=False, max_colwidth=50))

📊 SUMMARY TABLE: Top 10 Features
 Rank  Feature Index  Usage Count Usage % Classification                                        Description
    1          37761       438692  11.67%        related  instances of failure and error indications in ...
    2          64067       433371  11.52%    not-related                          Japanese words or phrases
    3          60867       421840  11.22%    not-related technical methodologies and analyses related to...
    4          62715       415408  11.05%    not-related      concepts related to healthcare and employment
    5          65241       408836  10.87%    not-related  references to specific chemical compounds or s...
    6          32656       406304  10.80%        related  visual content identifiers such as images and ...
    7          48790       401773  10.68%    not-related  key references or identifiers in a structured ...
    8          14472       398370  10.59%    not-related          technical terms and code-related elem

In [8]:
# Save the results to a CSV file for reference
output_path = base_path / 'outputs' / 'top_10_alignment_features.csv'
summary_df.to_csv(output_path, index=False)
print(f"\n✅ Results saved to: {output_path}")

# Also save full descriptions (not truncated)
full_summary_data = []
for i, feature in enumerate(top_10_features, 1):
    feature_idx_str = str(feature['feature_index'])
    description = descriptions_dict.get(feature_idx_str, "No description available")
    
    full_summary_data.append({
        'Rank': i,
        'Feature Index': feature['feature_index'],
        'Usage Count': feature['usage_count'],
        'Usage Percentage': feature['usage_percentage'],
        'Classification': feature['classification'],
        'Full Description': description
    })

full_output_path = base_path / 'outputs' / 'top_10_alignment_features_full_descriptions.csv'
pd.DataFrame(full_summary_data).to_csv(full_output_path, index=False)
print(f"✅ Full descriptions saved to: {full_output_path}")


✅ Results saved to: /home/jazhyc/projects/FSRL/feature-steering-RL/outputs/top_10_alignment_features.csv
✅ Full descriptions saved to: /home/jazhyc/projects/FSRL/feature-steering-RL/outputs/top_10_alignment_features_full_descriptions.csv
