# Set configs


In [1]:
from pathlib import Path

import pandas as pd
from scipy import stats


In [2]:
# load all the predefined functions
from plot_util import SurprisalLoader, SurprisalPlotter, plot_geometry_step, load_kl

# plot and save the results
# Line colour for each curve, keyed by integer id.
# Key 0 is the colour looked up for the "baseline" curve; every other key is
# matched against the integer value of a curve's neuron label.
neuron_colors = {
    # --- baseline ---
    0: "#1f77b4",  # blue
    # --- intermediate shades placed between the 0 and 10 entries ---
    1: "#4589b9",
    2: "#6b9bbe",
    5: "#a5bec6",
    # --- anchor colour ---
    10: "#ff7f0e",  # orange
    # --- intermediate shade placed between the 10 and 50 entries ---
    25: "#c89f1d",
    # --- anchor colours ---
    50: "#2ca02c",  # green
    500: "#9467bd",  # purple
}

In [44]:
# Project root; every input/output location below is derived from it.
ROOT = Path("/Users/jliu/workspace/RAG/")

# Output directory for figures.
fig_path = ROOT.joinpath("fig")

# Result directories read by the analysis cells.
surprisal_path = ROOT.joinpath("results", "surprisal")
KL_path = ROOT.joinpath("results", "token_freq")
geometry_path = ROOT.joinpath("results", "directions", "geometry", "EleutherAI")

# Dataset directory for the pythia-410m frequency files.
freq_path = ROOT.joinpath("datasets", "freq", "EleutherAI", "pythia-410m")

## Surprisal dynamics

In [45]:
import matplotlib.pyplot as plt

class SurprisalPlotter:
    """Plot surprisal against log training step for ablation conditions.

    One figure is produced per (effect, vec, model, ablation) combination.
    Each figure shows the "base" ablation curve plus one curve per requested
    neuron value, restricted to the log steps both data sets share.

    The data frame is read through the columns ``model``, ``effect``, ``vec``,
    ``ablation``, ``neuron``, ``eval``, ``log_step`` and ``surprisal``.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        output_dir: Path,
        neuron_colors: dict[int, str],
        ylim_dict: dict[str, dict[str, tuple[float, float]]],
        models=None,
        effect_lst=None,
        vec_lst=None,
        ablations=None,
        neurons=None,
    ):
        """Store the data and the plotting configuration.

        Args:
            df: Frame holding the surprisal statistics (columns listed in the
                class docstring).
            output_dir: Root directory under which figures are written.
            neuron_colors: Line colour per neuron value; key 0 is used for
                the baseline curve.
            ylim_dict: Optional y-axis limits, indexed first by evaluation
                set and then by model.
            models: Models to plot; defaults to every value in ``df["model"]``.
            effect_lst: Effects to plot; defaults to every value in
                ``df["effect"]``.
            vec_lst: Vector types to plot; defaults to every value in
                ``df["vec"]``.
            ablations: Ablations to plot; defaults to every value in
                ``df["ablation"]``. ``"base"`` is always dropped because it
                is only drawn as the reference curve.
            neurons: Neuron values to plot; defaults to every value in
                ``df["neuron"]``.
        """
        self.df = df
        self.output_dir = Path(output_dir)
        self.neuron_colors = neuron_colors
        self.ylim_dict = ylim_dict

        # Fall back to whatever the data frame contains when no explicit
        # selection is given.
        self.models = models if models is not None else df["model"].unique().tolist()
        self.effect_lst = effect_lst if effect_lst is not None else df["effect"].unique().tolist()
        self.vec_lst = vec_lst if vec_lst is not None else df["vec"].unique().tolist()
        # "base" is never an ablation of its own, so no "*_base.png" is made.
        ablation_source = ablations if ablations is not None else df["ablation"].unique()
        self.ablations = [a for a in ablation_source if a != "base"]
        self.neurons = neurons if neurons is not None else df["neuron"].unique().tolist()

    def _plot_line(self, data: pd.DataFrame, label: str) -> bool:
        """Draw one surprisal curve on the current figure.

        Args:
            data: Rows of a single condition; the first row of every
                ``log_step`` supplies the plotted value.
            label: Legend label. ``"baseline"`` selects colour key 0, an
                integer-like label selects that key, anything else is black.

        Returns:
            True when a line was drawn, False when there was nothing to plot.
        """
        if data.empty:
            return False

        # One point per log step (first occurrence), ordered along x. Rows
        # without a log step are dropped so x and y always stay aligned.
        points = (
            data.dropna(subset=["log_step"])
            .drop_duplicates(subset="log_step", keep="first")
            .sort_values("log_step")
        )
        if points.empty:
            return False

        if label == "baseline":
            color_key = 0
        else:
            try:
                color_key = int(label)
            except ValueError:
                color_key = None
        color = self.neuron_colors.get(color_key, "black")

        plt.plot(
            points["log_step"].to_numpy(),
            points["surprisal"].to_numpy(),
            color=color,
            linewidth=2,
            label=label,
        )
        return True

    def _plot_condition(
        self,
        eval_set: str,
        effect: str,
        vec: str,
        model: str,
        ablation: str,
        figure_size: tuple[int, int],
    ) -> "Path | None":
        """Render and save the figure of a single condition.

        Returns:
            The path of the written PNG, or None when no curve could be drawn.
        """
        df = self.df
        shared = (df["model"] == model) & (df["eval"] == eval_set) & (df["effect"] == effect)

        # The baseline is selected without a vec filter; the ablated data is
        # specific to this vec and ablation.
        baseline_data = df[shared & (df["ablation"] == "base")]
        model_data = df[shared & (df["vec"] == vec) & (df["ablation"] == ablation)]

        # Compare both data sets on the log steps they have in common.
        common_steps = set(baseline_data["log_step"]) & set(model_data["log_step"])
        if not common_steps:
            return None
        baseline_data = baseline_data[baseline_data["log_step"].isin(common_steps)]
        model_data = model_data[model_data["log_step"].isin(common_steps)]

        plt.figure(figsize=figure_size)
        try:
            # Baseline goes first so that it leads the legend.
            lines_plotted = int(self._plot_line(baseline_data, "baseline"))
            for neuron in self.neurons:
                condition_data = model_data[model_data["neuron"] == neuron]
                lines_plotted += int(self._plot_line(condition_data, str(neuron)))

            if lines_plotted == 0:
                return None

            plt.xlabel("Log step", fontsize=12)
            plt.ylabel("Surprisal", fontsize=12)
            plt.title(f"neuron={effect}, vec={vec}, intervention={ablation}", fontsize=13)
            plt.grid(alpha=0.2)
            plt.legend(loc="lower left")

            # Apply fixed y-limits only when configured for this eval set/model.
            limits = self.ylim_dict.get(eval_set, {}).get(model)
            if limits is not None:
                plt.ylim(limits)

            plt.tight_layout()

            output_path = self.output_dir / effect / eval_set
            output_path.mkdir(parents=True, exist_ok=True)
            output_file = output_path / f"{vec}_{model}_{ablation}.png"
            plt.savefig(output_file, dpi=300, bbox_inches="tight")
            return output_file
        finally:
            # Always release the figure, also when styling or saving fails.
            plt.close()

    def plot_all(self, eval_set: str, figure_size: tuple[int, int] = (10, 8)) -> list[Path]:
        """Plot every configured condition for one evaluation set.

        Args:
            eval_set: Value of the ``eval`` column to plot.
            figure_size: Figure size passed to matplotlib.

        Returns:
            Paths of all figures that were written, in creation order.
        """
        saved_files = []
        for effect in self.effect_lst:
            for vec in self.vec_lst:
                # Nothing is ever saved for the "base" vec, so skip it early.
                if vec == "base":
                    continue
                for model in self.models:
                    for ablation in self.ablations:
                        if ablation == "base":
                            continue
                        output_file = self._plot_condition(
                            eval_set, effect, vec, model, ablation, figure_size
                        )
                        if output_file is not None:
                            saved_files.append(output_file)
        return saved_files



In [46]:
# load file
# Load the surprisal statistics, either from the cached CSV or by
# recomputing them from the raw result files.
stat_path = surprisal_path / "stat_all.csv"
resume = False  # set to True to reuse an existing stat_all.csv
if resume and stat_path.is_file():
    # Bind the same name as the recompute branch: the plotting cell reads
    # `stats`, which would otherwise still be the `scipy.stats` module.
    # NOTE(review): assumes the cached CSV has the same columns as the frame
    # returned by get_stat_all() - confirm.
    stats = pd.read_csv(stat_path)
    stat_frame = stats  # previous name of the cached frame, kept as an alias
    print(f"Load from {stat_path}")
else:
    analyzer = SurprisalLoader(surprisal_path)
    # Process all files and get statistics
    stats = analyzer.get_stat_all()


Stat file has been saved to /Users/jliu/workspace/RAG/results/surprisal/stat_all.csv


In [47]:
# Fixed y-axis ranges, looked up by evaluation set and then by model size.
ylim_dict = dict(
    merged={"70m": [11.5, 15.5], "410m": [12.5, 14]},
    longtail_words={"70m": [13, 17], "410m": [12, 15.5]},
)

# Plot only the 10- and 50-neuron conditions for the mean and zero ablations.
surprisal_plotter = SurprisalPlotter(
    df=stats,
    output_dir=fig_path / "surprisal",
    neuron_colors=neuron_colors,
    ylim_dict=ylim_dict,
    ablations=["mean", "zero"],
    neurons=[10, 50],
)

# Write the figures for the long-tail evaluation set.
surprisal_plotter.plot_all(eval_set="longtail_words")


## KL distribution


In [118]:
# Collect the KL statistics of the different conditions into one table
effect_lst = ["boost", "suppress"]
vec_lst = ["mean", "longtail"]
model_lst = ["70m", "410m"]
neuron_lst = [10, 50, 500]

# Gather the per-condition frames and concatenate once at the end; growing a
# DataFrame inside the loop re-copies all previous rows on every iteration.
stat_frames = []
for vec in vec_lst:
    for model in model_lst:
        suffix_path = Path(vec) / "EleutherAI" / f"pythia-{model}-deduped"
        # KL values of the current (vec, model), indexed by neuron then effect
        data_dict = {}
        for neuron in neuron_lst:
            data_dict[neuron] = {}
            for effect in effect_lst:
                # load file
                file_path = KL_path / effect / suffix_path / f"500_{neuron}.csv"
                # NOTE(review): load_kl is used as returning (KL values,
                # statistics DataFrame) - confirm against plot_util.
                kl_lst, stat_df = load_kl(file_path)
                data_dict[neuron][effect] = kl_lst
                # Tag the statistics with the condition they belong to
                stat_frames.append(
                    stat_df.assign(vec=vec, model=model, neuron=neuron, effect=effect)
                )

stat_frame = pd.concat(stat_frames)
stat_frame.to_csv(KL_path / "kl_stat.csv")

## Geometric difference

In [19]:
import numpy as np



class GeometryLoader:
    """Load geometric metric CSV files and prepare them for plotting.

    A file is read, a ``log_step`` column is derived from its ``step``
    column, early steps are dropped, and the rows are grouped by neuron type
    (subspace metric) or by neuron-type pair (orthogonality metric).
    """

    def __init__(self, min_step: float = 3.5):
        """Keep only rows whose log10 step is greater than ``min_step``."""
        self.min_step = min_step

    @staticmethod
    def convert_log(step: float) -> float:
        """Return log10 of ``step``; a tiny offset keeps step 0 finite."""
        return np.log10(step + 1e-10)

    @staticmethod
    def _rename_neuron_types(names: pd.Series) -> pd.Series:
        """Map raw neuron-type names to the names shown in the plots.

        "sampled_common" must be replaced before "common", otherwise the
        shorter pattern would rewrite part of the longer one.
        """
        renamed = names.str.replace("sampled_common", "random", regex=False)
        return renamed.str.replace("common", "all", regex=False)

    def convert_log_step(self, file_path: Path) -> pd.DataFrame:
        """Read a metric CSV, add ``log_step`` and drop the early steps.

        Args:
            file_path: CSV file containing at least a ``step`` column.

        Returns:
            A copy of the rows whose ``log_step`` exceeds ``self.min_step``.
        """
        data = pd.read_csv(file_path)
        # Vectorised log conversion of the whole column.
        data["log_step"] = self.convert_log(data["step"])
        return data[data["log_step"] > self.min_step].copy()

    def load_subspace(self, data: pd.DataFrame, neuron_type_lst=None) -> pd.core.groupby.DataFrameGroupBy:
        """Prepare subspace data and group it by neuron type.

        Args:
            data: Frame with ``neuron``, ``effective_dim`` and ``total_dim``
                columns. It is not modified.
            neuron_type_lst: Renamed neuron types to exclude; nothing is
                excluded when empty or None.

        Returns:
            The rows, extended by ``dim_prop`` (``effective_dim`` divided by
            ``total_dim``), grouped by ``neuron``.
        """
        df = data.copy()
        df["neuron"] = self._rename_neuron_types(df["neuron"])
        if neuron_type_lst:
            # Copy the filtered slice so the column assignment below acts on
            # an independent frame instead of a view of `df`.
            df = df[~df["neuron"].isin(neuron_type_lst)].copy()
        df["dim_prop"] = df["effective_dim"] / df["total_dim"]
        return df.groupby("neuron")

    def load_orthogonality(
        self, data: pd.DataFrame, neuron_type_lst = None
    ) -> pd.core.groupby.DataFrameGroupBy:
        """Prepare orthogonality data and group it by neuron-type pair.

        Args:
            data: Frame with a ``pair`` column. It is not modified.
            neuron_type_lst: Substrings of renamed pair names; every pair
                containing one of them is excluded. Nothing is excluded when
                empty or None.

        Returns:
            The remaining rows grouped by ``pair``.
        """
        df = data.copy()
        df["pair"] = self._rename_neuron_types(df["pair"])
        if neuron_type_lst:
            excluded = df["pair"].apply(
                lambda pair: any(item in pair for item in neuron_type_lst)
            )
            # Force a boolean mask; `apply` on an empty column need not
            # return a boolean dtype.
            df = df[~excluded.astype(bool)]
        return df.groupby("pair")

    def load_file(self, file_path: Path, metric: str, neuron_type_lst = None) :
        """Load a metric file and prepare it according to ``metric``.

        Args:
            file_path: CSV file to read.
            metric: ``"subspace"`` or ``"orthogonality"``.
            neuron_type_lst: Exclusion list forwarded to the metric-specific
                loader.

        Returns:
            The grouped data, or None when ``metric`` is not one of the two
            supported names.
        """
        data = self.convert_log_step(file_path)
        if metric == "subspace":
            return self.load_subspace(data, neuron_type_lst=neuron_type_lst)
        if metric == "orthogonality":
            return self.load_orthogonality(data, neuron_type_lst=neuron_type_lst)
        return None

In [20]:

def plot_geometry_step(geometry_path,output_path, metric, model_lst, neuron_lst, neuron_type_lst, ylim_dict, metric_dict):
    """Plot geometric metric values against log training step.

    One figure is written per (model, neuron count, metric column), with one
    line per group returned by ``GeometryLoader.load_file``.

    Args:
        geometry_path: Directory holding ``pythia-{model}-deduped/{metric}``
            sub-directories with ``500_{neuron}.csv`` files.
        output_path: Directory under which ``{column}/{model}_{neuron}.png``
            files are written.
        metric: Metric name passed to ``GeometryLoader.load_file``.
        model_lst: Model size names to plot.
        neuron_lst: Neuron counts to plot.
        neuron_type_lst: Exclusion list forwarded to the loader.
        ylim_dict: y-axis limits per metric column.
        metric_dict: Metric columns to plot, per metric name.

    Raises:
        ValueError: If the loader does not support ``metric``.
    """
    # The loader only carries its minimum step, so one instance serves all files.
    geometry_loader = GeometryLoader()
    for model in model_lst:
        for neuron in neuron_lst:
            file_path = geometry_path / f"pythia-{model}-deduped" / metric / f"500_{neuron}.csv"
            data_grouped = geometry_loader.load_file(file_path, metric=metric, neuron_type_lst=neuron_type_lst)
            if data_grouped is None:
                # load_file returns None for metrics it cannot route; fail
                # with a clear message instead of an iteration error below.
                raise ValueError(f"Unsupported metric: {metric!r}")
            for metric_val in metric_dict[metric]:
                # Draw on a fresh figure so nothing leaks in from earlier plots.
                plt.figure()
                try:
                    for neuron_type, data_group in data_grouped:
                        plt.plot(data_group["log_step"], data_group[metric_val], label=neuron_type)
                    plt.grid(alpha=0.2)
                    plt.title(f"{metric_val}: #neuron={neuron}, model={model}")
                    plt.ylim(ylim_dict[metric_val])
                    plt.xlabel("Log step", fontsize=12)
                    plt.ylabel(metric_val, fontsize=12)
                    plt.legend()
                    output_file = output_path / metric_val / f"{model}_{neuron}.png"
                    output_file.parent.mkdir(parents=True, exist_ok=True)
                    plt.savefig(output_file, dpi=300, bbox_inches="tight")
                finally:
                    # Release the figure even when plotting or saving fails.
                    plt.close()


In [29]:
# load file
# Conditions to plot
model_lst = ["70m", "410m"]
neuron_lst = [10, 50, 500]

# Metric to plot in this run; use ["subspace"] for the subspace figures.
metric_lst = ["orthogonality"]

# Columns plotted for each metric
metric_dict = dict(
    subspace=["dim_prop", "sv_decay_rate_2"],
    orthogonality=["full_mean_angle_degrees", "principal_mean_angle_degrees"],
)

# y-axis range of each plotted column
ylim_dict = dict(
    dim_prop=[0, 1.1],
    sv_decay_rate_2=[1, 3],
    full_mean_angle_degrees=[85, 95],
    principal_mean_angle_degrees=[5, 120],
)

# Name fragments handed to the loader as its exclusion list
neuron_type_lst = ["all-", "random_2-"]

# One figure directory per metric
for metric in metric_lst:
    output_path = fig_path / metric
    plot_geometry_step(
        geometry_path,
        output_path,
        metric,
        model_lst,
        neuron_lst,
        neuron_type_lst,
        ylim_dict,
        metric_dict,
    )
