# Set configs


In [7]:
import os
from pathlib import Path

import pandas as pd
from scipy import stats


In [2]:
# load all the predefined functions
from plot_util import SurprisalLoader, SurprisalPlotter, plot_geometry_step, load_kl

# plot and save the results
# Hex colour for each neuron-count key; the per-entry notes are carried over
# from the original annotations.
neuron_colors = dict(
    [
        (0, "#1f77b4"),  # baseline: blue
        (1, "#4589b9"),  # noted as interpolated between 0 and 10
        (2, "#6b9bbe"),  # noted as interpolated between 0 and 10
        (5, "#a5bec6"),  # noted as interpolated between 0 and 10
        (10, "#ff7f0e"),  # orange (unchanged)
        (25, "#c89f1d"),  # noted as interpolated between 10 and 50
        (50, "#2ca02c"),  # green (unchanged)
        (500, "#9467bd"),  # purple (unchanged)
    ]
)

In [3]:
# Project root. Defaults to the original workstation path; set the RAG_ROOT
# environment variable to run the notebook from another machine or checkout
# without editing this cell.
ROOT = Path(os.environ.get("RAG_ROOT", "/Users/jliu/workspace/RAG/"))

# Every directory used below is derived from ROOT.
fig_path = ROOT / "fig"
surprisal_path = ROOT / "results" / "surprisal"
KL_path = ROOT / "results" / "token_freq"
geometry_path = ROOT / "results" / "directions" / "geometry" / "EleutherAI"
freq_path = ROOT / "datasets/freq/EleutherAI/pythia-410m"

In [1]:
import pandas as pd
import typing as t

def rank_neurons(df: pd.DataFrame) -> pd.DataFrame:
    """
    Order neurons from highest to lowest mediation effect.

    Rows that share the same ``mediation_effect`` are further ordered by
    ``abs_kl_from_unigram_diff``, also highest first.

    Args:
        df: DataFrame with ``mediation_effect`` and
            ``abs_kl_from_unigram_diff`` columns

    Returns:
        A sorted copy of ``df``; the original index labels are kept
    """
    sort_keys = ["mediation_effect", "abs_kl_from_unigram_diff"]
    ranked = df.sort_values(by=sort_keys, ascending=False)
    return ranked

# Example usage
def test_ranking() -> None:
    """Test the ranking function with sample data."""
    # Create sample data
    data = {
        "neuron_id": [1, 2, 3, 4],
        "mediation_effect": [0.5, 0.7, 0.5, 0.3],
        "abs_kl_from_unigram_diff": [0.2, 0.1, 0.3, 0.4]
    }
    test_df = pd.DataFrame(data)
    
    # Apply ranking
    result = rank_neurons(test_df)
    print("Ranked neurons:")
    print(result)
    return result 


In [2]:
# Run the ranking example and keep the ranked frame for inspection.
result = test_ranking()

Ranked neurons:
   neuron_id  mediation_effect  abs_kl_from_unigram_diff
1          2               0.7                       0.1
2          3               0.5                       0.3
0          1               0.5                       0.2
3          4               0.3                       0.4


## Surprisal dynamics

In [5]:
# Load the surprisal statistics consumed by the plotting cell below.
stat_path = surprisal_path / "stat_all.csv"
# Set to True to reuse a previously written stat file instead of recomputing.
resume = False
if stat_path.is_file() and resume:
    # Bind the cached table to `stats` too. The plotting cell reads `stats`, and
    # without this assignment it would receive the `scipy.stats` module imported
    # at the top of the notebook (or a stale value from an earlier run).
    # NOTE(review): presumably this CSV has the same layout as the frame returned
    # by get_stat_all() -- confirm, including whether an index column is written.
    stats = pd.read_csv(stat_path)
    stat_frame = stats  # original name for the cached table, kept for compatibility
    print(f"Load from {stat_path}")
else:
    analyzer = SurprisalLoader(surprisal_path)
    # Process all files and get statistics
    # NOTE: this name shadows `scipy.stats`; it is kept because later cells use it.
    stats = analyzer.get_stat_all()


Stat file has been saved to /Users/jliu/workspace/RAG/results/surprisal/stat_all.csv


In [19]:
# Scratch check of list slicing: show the first three items.
test = list(range(1, 6))
test[:3]

[1, 2, 3]

In [15]:
# Read the 500_50.debug file as CSV. The path is built from ROOT, like every
# other path in this notebook, instead of repeating the absolute prefix.
data = pd.read_csv(ROOT / "results" / "directions" / "500_50.debug")

In [1]:
# Long name -> short label pairs.
# NOTE(review): presumably result-column names mapped to display headers; the
# consumer of this mapping is not shown in this notebook -- confirm.
header_dict = dict(
    component_name="top_neurons",
    mediation_effect="med_effect",
    kl_from_unigram_diff="kl_diff",
    delta_loss_post_ablation="delta_loss_post",
    delta_loss_post_ablation_with_frozen_unigram="delta_loss_post_frozen",
)

In [4]:
# Short labels, in the mapping's insertion order.
[*header_dict.values()]

['top_neurons',
 'med_effect',
 'kl_diff',
 'delta_loss_post',
 'delta_loss_post_frozen']

In [11]:
# Largest value in the full_mean_angle_degrees column.
data.loc[:, "full_mean_angle_degrees"].max()

90.23107147216795

In [6]:
# Two-element ranges keyed by evaluation set, then by model size.
# NOTE(review): presumably y-axis limits for the surprisal plots -- confirm
# against SurprisalPlotter.
ylim_dict = dict(
    merged={"70m": [11.5, 15.5], "410m": [12.5, 14]},
    longtail_words={"70m": [13, 17], "410m": [13, 15.5]},
)

# Build the plotter for the 10/50/500 neuron settings.
# The original left an `ablations=["random", "mean", "zero"]` argument
# commented out; it stays disabled here.
surprisal_plotter = SurprisalPlotter(
    df=stats,
    output_dir=fig_path.joinpath("surprisal"),
    neuron_colors=neuron_colors,
    ylim_dict=ylim_dict,
    neurons=[10, 50, 500],
)

# Plot everything for the long-tail evaluation set.
surprisal_plotter.plot_all(eval_set="longtail_words")


## KL distribution


In [118]:
# Collect KL statistics for every (vector, model, neuron count, effect)
# condition and write them to a single CSV.
effect_lst = ["boost", "suppress"]
vec_lst = ["mean", "longtail"]
model_lst = ["70m", "410m"]
neuron_lst = [10, 50, 500]

# Gather the per-condition frames and concatenate once at the end; calling
# pd.concat inside the loop re-copies all previously collected rows each time.
stat_frames = []
for vec in vec_lst:
    for model in model_lst:
        suffix_path = Path(vec) / "EleutherAI" / f"pythia-{model}-deduped"
        # KL lists for the current (vec, model) pair, keyed by neuron count and
        # effect. Nothing in this cell reads it back; kept in case later cells do.
        data_dict = {}
        for neuron in neuron_lst:
            data_dict[neuron] = {}
            for effect in effect_lst:
                # load file
                file_path = KL_path / effect / suffix_path / f"500_{neuron}.csv"
                kl_lst, stat_df = load_kl(file_path)
                data_dict[neuron][effect] = kl_lst
                # Tag every row with its condition. A dedicated name is used so
                # the notebook-level `header_dict` mapping is not overwritten.
                condition_cols = {
                    "vec": vec,
                    "model": model,
                    "neuron": neuron,
                    "effect": effect,
                }
                for header, col in condition_cols.items():
                    stat_df[header] = col
                stat_frames.append(stat_df)

# Index labels of the individual frames are kept as-is, matching the layout of
# the CSV produced by the original incremental concatenation.
stat_frame = pd.concat(stat_frames)
stat_frame.to_csv(KL_path / "kl_stat.csv")

## Geometric difference

In [109]:
# Settings for the geometry plots.
model_lst = ["70m", "410m"]
neuron_lst = [10, 50, 500]
neuron_type_lst = []  # left empty, as in the original cell

# Column names associated with each metric family.
metric_dict = {
    "subspace": ["dim_prop", "sv_decay_rate_2"],
    "orthogonality": ["pct_near_orthogonal", "mean_angle_degrees"],
}
# Metric families to plot, in the order they are declared above.
metric_lst = list(metric_dict)

# Two-element range per column (presumably the y-axis limits -- confirm
# against plot_geometry_step).
ylim_dict = {
    "dim_prop": [0, 1.1],
    "sv_decay_rate_2": [1, 3],
    "pct_near_orthogonal": [0, 100],
    "mean_angle_degrees": [0, 90],
}

# One call per metric family, each writing under its own figure directory.
for metric in metric_lst:
    output_path = fig_path.joinpath(metric)
    plot_geometry_step(
        geometry_path,
        output_path,
        metric,
        model_lst,
        neuron_lst,
        neuron_type_lst,
        ylim_dict,
        metric_dict,
    )
