Plots the AUROC heatmap and the per-layer AUROC curves for the paper.

In [None]:
import pandas as pd
import numpy as np
import os
import sys
import plotly.graph_objects as go
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import plotly

In [None]:
# Location of the JSON results file, relative to the current working directory.
# NOTE(review): the file name suggests per-layer AUROC scores for a phi4 run —
# confirm against whatever produces this file.
path: str = "../linear-probes/results/all_datasets_layer_auroc_phi4.json"

# Load the results into `df` only when the file is present.
# NOTE(review): if the file is missing this is a silent no-op, so `df` stays
# undefined (or keeps whatever an earlier run left in it) and the problem only
# surfaces in later cells — confirm this best-effort behaviour is intended.
if os.path.exists(path):
    df = pd.read_json(path)

In [None]:
# Every training set is declared once as (display label, column name used to
# index `df`). The two parallel lists are derived from these pairs so that the
# label at position k always belongs to the column at position k.
_TRAIN_SETS = (
    ("TQA", "TruthfulQADataset"),
    ("DQA", "DishonestQADataset"),
    ("RepEng", "RepEngDataset"),
    ("AmongUs (I/C)", "AmongUsDataset"),
)
train_datasets = [label for label, _column in _TRAIN_SETS]
train_df_names = [column for _label, column in _TRAIN_SETS]

# Labels of the evaluation sets, in plotting order.
eval_datasets = [
    "TQA",
    "DQA",
    "RepEng",
    "AmongUs (lying)",
    "AmongUs (deception)",
]

In [None]:
def _score_at(scores, index=20):
    """Return ``scores[index]``, or ``scores[0]`` when ``scores`` is too short.

    NOTE(review): index 20 is presumably the probe layer reported in the
    paper, and the fallback presumably covers entries that hold a single
    score — confirm both against the results file.
    """
    return scores[index] if len(scores) > index else scores[0]


# Nested mapping: training label -> evaluation label -> one AUROC value.
# `df` is indexed by column name first, then by evaluation label.
aurocs = {
    train_label: {
        eval_label: _score_at(df[column][eval_label])
        for eval_label in eval_datasets
    }
    for train_label, column in zip(train_datasets, train_df_names)
}

In [None]:
# Flatten the nested `aurocs` mapping into a dense float matrix with one row
# per training set and one column per evaluation set, in list order.
aurocs_matrix = np.array(
    [
        [aurocs[row_label][col_label] for col_label in eval_datasets]
        for row_label in train_datasets
    ],
    dtype=float,
)

In [None]:
# Extremes of the matrix and their midpoint; used for the colorbar ticks.
auroc_min = np.min(aurocs_matrix)
auroc_max = np.max(aurocs_matrix)
threshold = (auroc_min + auroc_max) / 2

# Per-cell annotation: the AUROC value formatted to three decimals.
cell_labels = [[f"{val:.3f}" for val in row] for row in aurocs_matrix]

heatmap = go.Heatmap(
    z=aurocs_matrix,
    x=eval_datasets,
    y=train_datasets,
    # Custom three-stop colour scale (lowest -> middle -> highest value).
    # A named scale such as "Viridis" or "Cividis" could be used instead.
    colorscale=[[0, "#b05a44"], [0.5, "#567d46"], [1, "#1a3a5c"]],
    showscale=False,
    autocolorscale=False,
    text=cell_labels,
    texttemplate="%{text}",
    # A single text colour is used for every cell.
    textfont={"color": "white"},
)
fig = go.Figure(data=heatmap)

# Colorbar styling. The trace is created with showscale=False, so these
# settings only become visible if the colour scale is switched back on.
fig.update_traces(
    colorbar={
        "thickness": 20,
        "tickvals": [auroc_min, threshold, auroc_max],
        "ticktext": [
            f"{auroc_min:.3f}",
            f"{threshold:.3f}",
            f"{auroc_max:.3f}",
        ],
    }
)

# Put each word of an evaluation label on its own line so the horizontal
# (angle 0) tick labels stay narrow.
stacked_tick_labels = ["<br>".join(dataset.split()) for dataset in eval_datasets]

fig.update_layout(
    title="",
    xaxis_nticks=36,
    width=550,
    height=450,
    font={"family": "Computer Modern"},
    template="plotly_white",
    xaxis={
        "tickangle": 0,
        "tickmode": "array",
        "tickvals": list(range(len(eval_datasets))),
        "ticktext": stacked_tick_labels,
    },
)

# White backgrounds, and the y axis scale anchored to the x axis.
fig.update_layout(
    paper_bgcolor="white",
    plot_bgcolor="white",
    yaxis={"scaleanchor": "x"},
)

In [None]:
import plotly.io as pio

# Create the output directory up front so that a fresh checkout without a
# "plots" folder does not make the export fail.
os.makedirs("plots", exist_ok=True)

# Defaults for the kaleido export scope; the PNG call below overrides both
# format and scale explicitly, the PDF call relies on them.
pio.kaleido.scope.default_format = "pdf"
pio.kaleido.scope.default_scale = 2

# High-resolution raster version of the heatmap.
pio.write_image(
    fig,
    "plots/auroc.png",
    scale=4,
    width=550,
    height=450,
    format="png",
    engine="kaleido",
)

# PDF version; no explicit size, format or scale is passed here.
pio.write_image(fig, "plots/auroc.pdf")

In [None]:
# Same results file as the one read at the top of the notebook; it is read
# again here so that `df` is available for the per-layer plots below.
path: str = "../linear-probes/results/all_datasets_layer_auroc_phi4.json"

# Load the results into `df` only when the file is present.
# NOTE(review): if the file is missing this is a silent no-op and `df` keeps
# whatever value it already had (or stays undefined) — confirm this is intended.
if os.path.exists(path):
    df = pd.read_json(path)

In [None]:
# Per-layer AUROC curves: one subplot per training dataset (column of `df`),
# one line per evaluation dataset (row of `df`).
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Line colours for the evaluation datasets. The palette is reused cyclically
# when there are more evaluation datasets than colours.
colors = ["#4C72B0", "#55A868", "#C44E52", "#8172B3", "#CCB974"]

# Training datasets are the columns of `df`, evaluation datasets its index
# (this matches the df.loc[eval_dataset, train_dataset] lookup below).
train_datasets = df.columns
eval_datasets = df.index

# One subplot row per training dataset, derived from the data so the figure
# stays consistent with the titles and traces generated from `df.columns`.
num_rows = len(train_datasets)

fig = make_subplots(
    rows=num_rows,
    cols=1,
    subplot_titles=[f"Probe trained on {dataset}" for dataset in train_datasets],
    vertical_spacing=0.05,
    shared_xaxes=True,
)

# Layer indices for the x axis.
# NOTE(review): 40 layers is an assumption carried over from the original
# notebook — confirm it matches the length of the per-layer score lists.
num_layers = 40
layers = list(range(num_layers))

for i, train_dataset in enumerate(train_datasets):
    for j, eval_dataset in enumerate(eval_datasets):
        # AUROC values for this (evaluation, training) pair.
        auroc_values = df.loc[eval_dataset, train_dataset]

        fig.add_trace(
            go.Scatter(
                x=layers,
                y=auroc_values,
                mode="lines+markers",
                name=f"{eval_dataset}",
                # Modulo keeps the lookup in range for any number of rows.
                line=dict(color=colors[j % len(colors)]),
                # Show each evaluation dataset once in the legend (first
                # subplot only) and tie its traces together across subplots.
                showlegend=(i == 0),
                legendgroup=eval_dataset,
            ),
            row=i + 1,
            col=1,
        )

    # Dashed reference line at AUROC = 0.5 (random chance).
    fig.add_shape(
        type="line",
        x0=0,
        x1=num_layers - 1,
        y0=0.5,
        y1=0.5,
        line=dict(color="gray", width=1, dash="dash"),
        row=i + 1,
        col=1,
    )

fig.update_layout(
    height=800,
    width=1000,
    title_text="",
    legend_title_text="",
    template="plotly_white",
    font=dict(family="Computer Modern"),
)

# Axis titles: the x axes are shared, so only the bottom subplot is labelled.
for i in range(num_rows):
    fig.update_xaxes(
        title_text="Layer" if i == num_rows - 1 else None, row=i + 1, col=1
    )
    fig.update_yaxes(title_text="AUROC", range=[0, 1.1], row=i + 1, col=1)

# Horizontal legend centred above the plot area.
fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.05,
        xanchor="center",
        x=0.5,
        title_font=dict(size=24),
        font=dict(size=20),
        bgcolor="rgba(255, 255, 255, 0.8)",
    )
)

fig.show()

In [None]:
# Save the per-layer figure as PNG and PDF.

# Create the output directory up front so that a fresh checkout without a
# "plots" folder does not make the export fail.
os.makedirs("plots", exist_ok=True)

# Defaults for the kaleido export scope; both calls below pass format and
# scale explicitly, so these only matter for later exports without arguments.
pio.kaleido.scope.default_format = "pdf"
pio.kaleido.scope.default_scale = 2

# Both exports share the same size and scale; only the target file and the
# format differ.
pio.write_image(
    fig,
    "plots/auroc_layers.png",
    scale=4,
    width=1000,
    height=800,
    format="png",
    engine="kaleido",
)
pio.write_image(
    fig,
    "plots/auroc_layers.pdf",
    scale=4,
    width=1000,
    height=800,
    format="pdf",
    engine="kaleido",
)