# visualize results from probing tests


In [2]:
import sys
sys.path.append('../')
import pandas as pd
import plotly.express as px
from utils import *
import numpy as np
from scipy.stats import spearmanr
import plotly.graph_objects as go
import tqdm
from plotly.express.colors import qualitative


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import re


def test_process_column_headers() -> None:
    """Test the column header processing functionality."""
    # Sample data
    headers = ["col_1", "data_23", "value_456", "metric_7_8", "item9","10"]
    exclude_list = [23, 7, 10]
    
    # Process headers
    result = process_column_headers(headers, exclude_list)
    
    print(f"Column headers: {headers}")
    print(f"Extracted numbers: {extract_numbers_from_headers(headers)}")
    print(f"Exclude list: {exclude_list}")
    print(f"Result (non-overlapping numbers): {result}")
    
    # Expected result: [1, 456, 8, 9]
    expected = [1, 456, 8, 9]
    assert set(result) == set(expected), f"Expected {expected}, got {result}"
    print("Test passed!")


if __name__ == "__main__":
    test_process_column_headers()

Column headers: ['col_1', 'data_23', 'value_456', 'metric_7_8', 'item9', '10']
Extracted numbers: [1, 23, 456, 7, 8, 9, 10]
Exclude list: [23, 7, 10]
Result (non-overlapping numbers): [1, 456, 8, 9]
Test passed!


In [3]:
def load_result(
    model_name:str,
    dataset:str = 'stas/c4-en-10k',
    data_range_start:int = 0,
    data_range_end:int = 100,
    k = 10
    ):
    save_path = f'./{output_dir}/{model_name}/unigram/{dataset.replace("/","_")}_{data_range_start}-{data_range_end}/k{k}.feather'
    final_df = pd.read_feather(save_path)
    # %%
    final_df['delta_loss'] = final_df['loss_post_ablation'] - final_df['loss']
    final_df['delta_loss_with_frozen_unigram'] = final_df['loss_post_ablation_with_frozen_unigram'] - final_df['loss']
    final_df['abs_delta_loss_post_ablation'] = np.abs(final_df['loss_post_ablation'] - final_df['loss'])
    final_df['abs_delta_loss_post_ablation_with_frozen_unigram'] = np.abs(final_df['loss_post_ablation_with_frozen_unigram'] - final_df['loss'])
    final_df['delta_entropy'] = final_df['entropy_post_ablation'] - final_df['entropy']
    if 'kl_divergence_before' in final_df.columns:
        print('kl_divergence_before found')
        final_df['kl_from_unigram_diff'] = final_df['kl_divergence_after'] - final_df['kl_divergence_before']
        final_df['kl_from_unigram_diff_with_frozen_unigram'] = final_df['kl_divergence_after_frozen_unigram'] - final_df['kl_divergence_before']
        final_df['abs_kl_from_unigram_diff'] = final_df['kl_from_unigram_diff'].abs()
    final_df['abs_kl_from_unigram_diff'] = final_df['kl_from_unigram_diff'].abs()

    
    return final_df 

In [16]:
def select_top_token_frequency_neurons(
    final_df: pd.DataFrame, 
    unigram_kl_threshold: float = 2.0, 
    unigram_mediation_threshold: float = 0.5, 
    top_n: int = 10
) -> dict[str, list[str]]:
    """
    Correctly select top token frequency neurons based on multiple criteria.
    """
    # Calculate the mediation effect
    final_df['mediation_effect'] = (
        1 - final_df['abs_delta_loss_post_ablation_with_frozen_unigram'] 
        / final_df['abs_delta_loss_post_ablation']
    )

    '''
    # Filter neurons based on KL divergence and mediation effect
    filtered_neurons = final_df[
        (final_df['abs_kl_from_unigram_diff'] > unigram_kl_threshold) &
        (final_df['mediation_effect'] > unigram_mediation_threshold)
    ]

    # Rank neurons by their KL divergence and mediation effect
    
    ranked_neurons = filtered_neurons.sort_values(
        by=['abs_kl_from_unigram_diff', 'mediation_effect'], 
        ascending=False
    )

    ranked_neurons = filtered_neurons.sort_values(
        by='mediation_effect', 
        ascending=False
    )
    '''
    ranked_neurons = final_df.sort_values(
        by='mediation_effect', 
        ascending=False
    )
    
    # Select top N neurons, preserving the original sorting
    #top_neurons = ranked_neurons['component_name'].head(top_n).tolist()
    top_neurons = final_df['component_name'].head(top_n).tolist()
    return {
        model_name: top_neurons
    }

def aggregate_result(final_df:pd.DataFrame,unigram_neurons_dict:dict)->pd.DataFrame:

    if len(unigram_neurons_dict)>0:
        unigram_neurons = unigram_neurons_dict.get(model_name, [])
        final_df['is_unigram'] = final_df['component_name'].isin(unigram_neurons).astype(bool)
        
    columns_to_aggregate =list(final_df.columns[8:]) + ['loss']
    print(columns_to_aggregate)
    agg_results = final_df[columns_to_aggregate].groupby('component_name').mean().reset_index()

    # make scatter plot of delta_loss and delta_loss_with_frozen_unigram for each neuron
    agg_results['delta_loss-delta_loss_with_frozen_unigram'] = agg_results['delta_loss'] - agg_results['delta_loss_with_frozen_unigram']
    agg_results['abs_delta_loss-abs_delta_loss_with_frozen_unigram'] = agg_results['abs_delta_loss_post_ablation'] - agg_results['abs_delta_loss_post_ablation_with_frozen_unigram']
    # %%
    # make scatter plot of delta_loss and delta_loss_with_frozen_unigram for each neuron
    agg_results['delta_loss-delta_loss_with_frozen_unigram'] = agg_results['delta_loss'] - agg_results['delta_loss_with_frozen_unigram']
    agg_results['abs_delta_loss-abs_delta_loss_with_frozen_unigram'] = agg_results['abs_delta_loss_post_ablation'] - agg_results['abs_delta_loss_post_ablation_with_frozen_unigram']
    agg_results['1-abs_delta_loss_with_frozen_unigram/abs_delta_loss'] = 1 - agg_results['abs_delta_loss_post_ablation_with_frozen_unigram'] / agg_results['abs_delta_loss_post_ablation']

    return agg_results
    
def plot_top_token_frequency_neurons(
    agg_results: pd.DataFrame, 
    unigram_neurons: list[str], 
    model_name: str
):  # Consider specifying the exact return type of your plotting library
    """
    Create a scatter plot highlighting top token frequency neurons.

    Args:
        agg_results: Aggregated results DataFrame
        unigram_neurons: List of top token frequency neuron names
        model_name: Name of the model being analyzed

    Returns:
        Plotly figure object
    """
    

    # Prepare neuron type column
    conditions = [(agg_results['is_unigram'] == True)]
    choices = ['Token Frequency']
    agg_results['Neuron Type'] = np.select(conditions, choices, default='Normal')

    # Define axes and color mapping
    x_axis = '1-abs_delta_loss_with_frozen_unigram/abs_delta_loss'
    y_axis = 'abs_kl_from_unigram_diff'
    tf_color = qualitative.Plotly[2]

    # Create scatter plot
    fig = px.scatter(
        agg_results, 
        y=y_axis, 
        x=x_axis, 
        hover_data=['component_name'], 
        color='Neuron Type', 
        color_discrete_map={
            'Normal': qualitative.Plotly[0], 
            'Entropy': qualitative.Plotly[1], 
            'Token Frequency': tf_color
        }
    )

    # Add text labels for top token frequency neurons
    entropy_neuron_indices = [
        int(neuron.split('.')[1]) for neuron in unigram_neurons
    ]
    
    for neuron_index in entropy_neuron_indices:
        entropy_df = agg_results[
            agg_results['component_name'] == f'23.{neuron_index}'
        ]
        fig.add_trace(
            go.Scatter(
                x=entropy_df[x_axis]-0.01, 
                y=entropy_df[y_axis], 
                mode='text', 
                text=str(neuron_index), 
                textposition='bottom left', 
                showlegend=False, 
                textfont=dict(color=tf_color)
            )
        )

    # Customize layout
    fig.update_layout(
        legend=dict(
            orientation='h',
            yanchor='bottom',
            y=-0.6,
            xanchor='right',
            x=0.9
        ),
        title=f'(c) Top Token Frequency Neurons',
        margin=dict(l=0, r=3, t=30, b=0),
        width=350, 
        height=275,
        title_font_size=16
    )

    # Update axes
    fig.update_yaxes(title_text='Avg. |Δ D<sub>KL</sub>(P<sub>model</sub>||P<sub>freq</sub>)|')
    fig.update_xaxes(
        title_text='1 - DE<sub>freq</sub>/TE',
        range=[-0.25, 0.62]
    )

    return fig



In [31]:
def select_token_frequency_neurons(
    agg_results: pd.DataFrame, 
    x_threshold: float = 0.2,  # Threshold for indirect effect
    y_threshold: float = 0  # Threshold for KL divergence change
) -> list[str]:
    """
    Dynamically select token frequency neurons based on x and y axis values
    
    Args:
    - agg_results: Aggregated results DataFrame
    - x_threshold: Threshold for 1 - DE_freq/TE (indirect effect)
    - y_threshold: Threshold for KL divergence change
    
    Returns:
    - List of token frequency neuron names
    """
    x_axis = '1-abs_delta_loss_with_frozen_unigram/abs_delta_loss'
    y_axis = 'abs_kl_from_unigram_diff'
    
    token_freq_neurons = agg_results[
        (agg_results[x_axis] > x_threshold) &  # Indirect effect
        (agg_results[y_axis] > y_threshold)    # Significant distribution change
    ]['component_name'].tolist()
    
    return token_freq_neurons

def plot_token_frequency_neurons(
    agg_results: pd.DataFrame, 
    model_name: str
):
    """
    Create a scatter plot dynamically identifying token frequency neurons
    
    Args:
    - agg_results: Aggregated results DataFrame
    - model_name: Name of the model being analyzed
    
    Returns:
    - Plotly figure object
    """
    # Dynamically select token frequency neurons
    token_freq_neurons = select_token_frequency_neurons(agg_results)

    # Prepare neuron type column
    x_axis = '1-abs_delta_loss_with_frozen_unigram/abs_delta_loss'
    y_axis = 'abs_kl_from_unigram_diff'
    
    # Create neuron type column based on dynamic selection
    agg_results['Neuron Type'] = np.where(
        agg_results['component_name'].isin(token_freq_neurons), 
        'Token Frequency', 
        'Normal'
    )
    
    # Create scatter plot
    fig = px.scatter(
        agg_results, 
        y=y_axis, 
        x=x_axis, 
        hover_data=['component_name'], 
        color='Neuron Type', 
        color_discrete_map={
            'Normal': qualitative.Plotly[0],
            'Token Frequency': qualitative.Plotly[2]
        }
    )
    
    # Add text labels for token frequency neurons
    for neuron_name in token_freq_neurons:
        neuron_df = agg_results[
            agg_results['component_name'] == neuron_name
        ]
        fig.add_trace(
            go.Scatter(
                x=neuron_df[x_axis]-0.01, 
                y=neuron_df[y_axis], 
                mode='text', 
                text=neuron_name.split('.')[-1], 
                textposition='bottom left', 
                showlegend=False, 
                textfont=dict(color=qualitative.Plotly[2])
            )
        )
    
    # Customize layout
    fig.update_layout(
        legend=dict(
            orientation='h',
            yanchor='bottom',
            y=-0.6,
            xanchor='right',
            x=0.9
        ),
        title=f'Token Frequency Neurons',
        margin=dict(l=0, r=3, t=30, b=0),
        width=350, 
        height=275,
        title_font_size=16
    )
    
    # Update axes
    fig.update_yaxes(title_text='Avg. |Δ D<sub>KL</sub>(P<sub>model</sub>||P<sub>freq</sub>)|')
    fig.update_xaxes(
        title_text='1 - DE<sub>freq</sub>/TE',
        range=[-0.25, 0.62]
    )
    
    return fig, token_freq_neurons

In [35]:
# %%
output_dir = 'ablations/results'
model_name = 'pythia'
dataset = 'stas/c4-en-10k'
data_range_start = 0
data_range_end = 100
k = 10

In [39]:
token_freq_neurons

['5.1345', '5.207', '5.427', '5.509', '5.585', '5.613']

In [38]:
model_name = 'pythia'
# load data and unigram_neuron dict
final_df = load_result(
    model_name,
    data_range_end = 500

    )

agg_results = aggregate_result(final_df,unigram_neurons_dict)



unigram_neurons_dict = select_top_token_frequency_neurons(
        final_df, 
        unigram_kl_threshold=0.1, 
        unigram_mediation_threshold=0.5, 
        top_n=1000
    )



# plot the figure  
fig, token_freq_neurons = plot_token_frequency_neurons(
        agg_results, 
        model_name
    )
fig.show()


kl_divergence_before found
['entropy', 'top_logit', 'pred', 'loss', 'top_logp', 'ln_final_scale', 'rank_of_correct_token', 'correct_token_rank', 'pred_in_top1', 'pred_in_top5', 'activation', 'component_name', 'loss_post_ablation', 'loss_post_ablation_with_frozen_unigram', 'entropy_post_ablation', 'entropy_post_ablation_with_frozen_unigram', 'kl_divergence_before', 'kl_divergence_after', 'kl_divergence_after_frozen_unigram', 'delta_loss', 'delta_loss_with_frozen_unigram', 'abs_delta_loss_post_ablation', 'abs_delta_loss_post_ablation_with_frozen_unigram', 'delta_entropy', 'kl_from_unigram_diff', 'kl_from_unigram_diff_with_frozen_unigram', 'abs_kl_from_unigram_diff', 'is_unigram', 'loss']


In [None]:
model_name = 'gpt2-small'
# load data and unigram_neuron dict
final_df = load_result(
    model_name,
    data_range_end = 500

    )


unigram_neurons_dict = select_top_token_frequency_neurons(
        final_df, 
        unigram_kl_threshold=0.1, 
        unigram_mediation_threshold=0.5, 
        top_n=1000
    )

agg_results = aggregate_result(final_df,unigram_neurons_dict)


# plot the figure  
fig = plot_top_token_frequency_neurons(
        agg_results, 
        unigram_neurons_dict[model_name], 
        model_name
    )
fig.show()


In [7]:
model_name = 'pythia'
# load data and unigram_neuron dict
final_df = load_result(
    model_name,data_range_end = 100
    )


unigram_neurons_dict = select_top_token_frequency_neurons(
        final_df, 
        unigram_kl_threshold=0.1, 
        unigram_mediation_threshold=0.5, 
        top_n=5
    )

agg_results = aggregate_result(final_df,unigram_neurons_dict)


# plot the figure  
fig = plot_top_token_frequency_neurons(
        agg_results, 
        unigram_neurons_dict[model_name], 
        model_name
    )
fig.show()


kl_divergence_before found
['entropy', 'top_logit', 'pred', 'loss', 'top_logp', 'ln_final_scale', 'rank_of_correct_token', 'correct_token_rank', 'pred_in_top1', 'pred_in_top5', 'activation', 'component_name', 'loss_post_ablation', 'loss_post_ablation_with_frozen_unigram', 'entropy_post_ablation', 'entropy_post_ablation_with_frozen_unigram', 'kl_divergence_before', 'kl_divergence_after', 'kl_divergence_after_frozen_unigram', 'delta_loss', 'delta_loss_with_frozen_unigram', 'abs_delta_loss_post_ablation', 'abs_delta_loss_post_ablation_with_frozen_unigram', 'delta_entropy', 'kl_from_unigram_diff', 'kl_from_unigram_diff_with_frozen_unigram', 'abs_kl_from_unigram_diff', 'mediation_effect', 'is_unigram', 'loss']


In [27]:
# %%
output_dir = 'ablations/results'
model_name = 'gpt2-small'
dataset = 'stas/c4-en-10k'
data_range_start = 0
data_range_end = 500
k = 10

In [47]:
final_df.shape[0]

7492608

In [32]:
model_name = 'gpt2-small'
# load data and unigram_neuron dict
final_df = load_result(
    model_name,
    data_range_end = 500

    )


unigram_neurons_dict = select_top_token_frequency_neurons(
        final_df, 
        unigram_kl_threshold=0.05, 
        unigram_mediation_threshold=0.2, 
        top_n=10
    )

agg_results = aggregate_result(final_df,unigram_neurons_dict)


# plot the figure  
fig = plot_top_token_frequency_neurons(
        agg_results, 
        unigram_neurons_dict[model_name], 
        model_name
    )
fig.show()


kl_divergence_before found
['entropy', 'top_logit', 'pred', 'loss', 'top_logp', 'ln_final_scale', 'rank_of_correct_token', 'correct_token_rank', 'pred_in_top1', 'pred_in_top5', 'activation', 'component_name', 'loss_post_ablation', 'loss_post_ablation_with_frozen_unigram', 'entropy_post_ablation', 'entropy_post_ablation_with_frozen_unigram', 'kl_divergence_before', 'kl_divergence_after', 'kl_divergence_after_frozen_unigram', 'delta_loss', 'delta_loss_with_frozen_unigram', 'abs_delta_loss_post_ablation', 'abs_delta_loss_post_ablation_with_frozen_unigram', 'delta_entropy', 'kl_from_unigram_diff', 'kl_from_unigram_diff_with_frozen_unigram', 'abs_kl_from_unigram_diff', 'mediation_effect', 'is_unigram', 'loss']


In [38]:
model_name = 'pythia'
# load data and unigram_neuron dict
final_df = load_result(
    model_name,
    data_range_end = 500

    )


unigram_neurons_dict = select_top_token_frequency_neurons(
        final_df, 
        unigram_kl_threshold=0.05, 
        unigram_mediation_threshold=0.2, 
        top_n=10
    )

agg_results = aggregate_result(final_df,unigram_neurons_dict)


# plot the figure  
fig = plot_top_token_frequency_neurons(
        agg_results, 
        unigram_neurons_dict[model_name], 
        model_name
    )
fig.show()


kl_divergence_before found
['entropy', 'top_logit', 'pred', 'loss', 'top_logp', 'ln_final_scale', 'rank_of_correct_token', 'correct_token_rank', 'pred_in_top1', 'pred_in_top5', 'activation', 'component_name', 'loss_post_ablation', 'loss_post_ablation_with_frozen_unigram', 'entropy_post_ablation', 'entropy_post_ablation_with_frozen_unigram', 'kl_divergence_before', 'kl_divergence_after', 'kl_divergence_after_frozen_unigram', 'delta_loss', 'delta_loss_with_frozen_unigram', 'abs_delta_loss_post_ablation', 'abs_delta_loss_post_ablation_with_frozen_unigram', 'delta_entropy', 'kl_from_unigram_diff', 'kl_from_unigram_diff_with_frozen_unigram', 'abs_kl_from_unigram_diff', 'mediation_effect', 'is_unigram', 'loss']
