# Top 10 Features from Alignment Steering Analysis

This notebook lists the top 10 features, ranked by total usage count, from the alignment steering analysis, together with the description of each feature looked up in the local Neuronpedia cache (`NeuronpediaCache`).

In [9]:
# Import required libraries
import json
import pandas as pd
import os
from pathlib import Path

In [10]:
# Set up file paths.
# The project root defaults to the checkout this notebook was written against,
# but can be overridden with the FSRL_BASE_PATH environment variable so the
# notebook runs on other machines without editing this cell.
base_path = Path(os.environ.get('FSRL_BASE_PATH', '/home/jazhyc/projects/FSRL/feature-steering-RL'))

# JSON file with the feature usage results of the alignment steering analysis.
alignment_data_path = (
    base_path
    / 'outputs'
    / 'feature_classification'
    / 'comprehensive_analysis'
    / 'alignment_steering_analysis_prompt_only_ignore_mask_feature_usage.json'
)
# JSON file with the cached feature descriptions.
descriptions_path = (
    base_path
    / 'models'
    / 'NeuronpediaCache'
    / 'gemma-2-2b'
    / '12-gemmascope-res-65k__l0-21.json'
)

# Report each input and whether it is present, so a wrong root is obvious
# before the loading cells fail.
for label, path in (("Alignment data", alignment_data_path), ("Descriptions", descriptions_path)):
    print(f"{label} path: {path}")
    print(f"{label} exists: {path.exists()}")

Alignment data path: /home/jazhyc/projects/FSRL/feature-steering-RL/outputs/feature_classification/comprehensive_analysis/alignment_steering_analysis_prompt_only_ignore_mask_feature_usage.json
Alignment data exists: True
Descriptions path: /home/jazhyc/projects/FSRL/feature-steering-RL/models/NeuronpediaCache/gemma-2-2b/12-gemmascope-res-65k__l0-21.json
Descriptions exists: True


In [11]:
# Load alignment steering feature usage data
with open(alignment_data_path, 'r') as f:
    alignment_data = json.load(f)

print(f"Loaded {len(alignment_data)} features from alignment data")
print(f"Sample feature keys: {list(alignment_data.keys())[:5]}")

Loaded 2 features from alignment data
Sample feature keys: ['summary', 'feature_usage_details']


In [12]:
# Load feature descriptions
with open(descriptions_path, 'r') as f:
    descriptions_list = json.load(f)

# Convert list to dictionary for easy lookup by index
descriptions_dict = {item['index']: item['description'] for item in descriptions_list}

print(f"Loaded descriptions for {len(descriptions_dict)} features")
print(f"Sample description indices: {list(descriptions_dict.keys())[:5]}")

Loaded descriptions for 65344 features
Sample description indices: ['53844', '53847', '53890', '53924', '53896']


In [13]:
# Calculate total usage for each feature and get top 10
feature_usage_list = alignment_data['feature_usage_details']

# Sort by usage_count and get top 10
top_10_features = sorted(feature_usage_list, key=lambda x: x['usage_count'], reverse=True)[:10]

print("Top 10 Features by Total Usage:")
for i, feature in enumerate(top_10_features, 1):
    print(f"{i}. Feature {feature['feature_index']}: {feature['usage_count']:,} total usage ({feature['usage_percentage']:.4%}) - Classification: {feature['classification']}")

Top 10 Features by Total Usage:
1. Feature 56750: 203,984 total usage (4.4208%) - Classification: not-related
2. Feature 49896: 198,219 total usage (4.2958%) - Classification: not-related
3. Feature 41564: 196,986 total usage (4.2691%) - Classification: related
4. Feature 41362: 195,708 total usage (4.2414%) - Classification: not-related
5. Feature 44542: 194,954 total usage (4.2251%) - Classification: not-related
6. Feature 30441: 194,662 total usage (4.2187%) - Classification: not-related
7. Feature 40634: 194,476 total usage (4.2147%) - Classification: related
8. Feature 756: 194,231 total usage (4.2094%) - Classification: not-related
9. Feature 62960: 194,122 total usage (4.2070%) - Classification: not-related
10. Feature 38140: 193,913 total usage (4.2025%) - Classification: not-related


In [14]:
# Create a detailed display of top 10 features with descriptions
print("\n" + "="*80)
print("TOP 10 FEATURES WITH DESCRIPTIONS")
print("="*80 + "\n")

for i, feature in enumerate(top_10_features, 1):
    feature_idx = feature['feature_index']
    feature_idx_str = str(feature_idx)
    usage_count = feature['usage_count']
    usage_percentage = feature['usage_percentage']
    classification = feature['classification']
    
    # Get description if available
    description = descriptions_dict.get(feature_idx_str, "No description available")
    
    print(f"🏆 RANK {i}: Feature {feature_idx}")
    print(f"   Description: {description}")
    print(f"   Usage Count: {usage_count:,}")
    print(f"   Usage Percentage: {usage_percentage:.4%}")
    print(f"   Classification: {classification}")
    print("-" * 80 + "\n")


TOP 10 FEATURES WITH DESCRIPTIONS

🏆 RANK 1: Feature 56750
   Description:  geographical and administrative classifications
   Usage Count: 203,984
   Usage Percentage: 4.4208%
   Classification: not-related
--------------------------------------------------------------------------------

🏆 RANK 2: Feature 49896
   Description:  comment and license indicators in source code
   Usage Count: 198,219
   Usage Percentage: 4.2958%
   Classification: not-related
--------------------------------------------------------------------------------

🏆 RANK 3: Feature 41564
   Description: conjunctions and logical operators in sentences
   Usage Count: 196,986
   Usage Percentage: 4.2691%
   Classification: related
--------------------------------------------------------------------------------

🏆 RANK 4: Feature 41362
   Description: technical language related to software development and project management
   Usage Count: 195,708
   Usage Percentage: 4.2414%
   Classification: not-related
--------

In [15]:
# Create a summary table
summary_data = []

for i, feature in enumerate(top_10_features, 1):
    feature_idx_str = str(feature['feature_index'])
    description = descriptions_dict.get(feature_idx_str, "No description available")
    
    summary_data.append({
        'Rank': i,
        'Feature Index': feature['feature_index'],
        'Usage Count': feature['usage_count'],
        'Usage %': f"{feature['usage_percentage']:.2%}",
        'Classification': feature['classification'],
        'Description': description[:100] + '...' if len(description) > 100 else description
    })

# Create DataFrame for better display
summary_df = pd.DataFrame(summary_data)

print("📊 SUMMARY TABLE: Top 10 Features")
print("=" * 120)
print(summary_df.to_string(index=False, max_colwidth=50))

📊 SUMMARY TABLE: Top 10 Features
 Rank  Feature Index  Usage Count Usage % Classification                                        Description
    1          56750       203984   4.42%    not-related    geographical and administrative classifications
    2          49896       198219   4.30%    not-related      comment and license indicators in source code
    3          41564       196986   4.27%        related    conjunctions and logical operators in sentences
    4          41362       195708   4.24%    not-related technical language related to software developm...
    5          44542       194954   4.23%    not-related programming-related terms and structures, parti...
    6          30441       194662   4.22%    not-related  keywords related to programming topics and res...
    7          40634       194476   4.21%        related sentences discussing the outcomes of political ...
    8            756       194231   4.21%    not-related references to programming components or annota

In [16]:
# Write the summary table (with its shortened descriptions) to CSV for reference.
outputs_dir = base_path / 'outputs'
output_path = outputs_dir / 'top_10_alignment_features.csv'
summary_df.to_csv(output_path, index=False)
print(f"\n✅ Results saved to: {output_path}")

# Write a second CSV with the same ranking, but carrying the unformatted
# usage_percentage value and the complete, untruncated description.
full_summary_data = [
    {
        'Rank': rank,
        'Feature Index': entry['feature_index'],
        'Usage Count': entry['usage_count'],
        'Usage Percentage': entry['usage_percentage'],
        'Classification': entry['classification'],
        'Full Description': descriptions_dict.get(
            str(entry['feature_index']), "No description available"
        ),
    }
    for rank, entry in enumerate(top_10_features, start=1)
]

full_output_path = outputs_dir / 'top_10_alignment_features_full_descriptions.csv'
pd.DataFrame(full_summary_data).to_csv(full_output_path, index=False)
print(f"✅ Full descriptions saved to: {full_output_path}")


✅ Results saved to: /home/jazhyc/projects/FSRL/feature-steering-RL/outputs/top_10_alignment_features.csv
✅ Full descriptions saved to: /home/jazhyc/projects/FSRL/feature-steering-RL/outputs/top_10_alignment_features_full_descriptions.csv
