In [1]:
# import pandas as pd

# df = pd.DataFrame(
#     {
#         "statement": [
#             "Alice is not happy.",
#             "Alice is happy.",
#             "Alice is not unhappy.",
#             "Alice is unhappy.",
#             "Bob is not happy.",
#             "Bob is happy.",
#             "Bob is not unhappy.",
#             "Bob is unhappy.",
#         ],
#         "containsAlice": [1, 1, 1, 1, 0, 0, 0, 0],
#         "isHappy": [0, 1, 1, 0, 0, 1, 1, 0],
#         "containsNot": [1, 0, 1, 0, 1, 0, 1, 0],
#     }
# )

# df.to_csv("datasets/happy.csv", index=False)

In [2]:
import os
import torch
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sns

layers = [0, 10, 16, 20, 31]
os.makedirs("figures", exist_ok=True)

In [3]:
df = pd.read_csv("datasets/happy.csv")
statements = df["statement"].values
containsAlice = df["containsAlice"].values
isHappy = df["isHappy"].values
containsNot = df["containsNot"].values

In [4]:
path = "acts/llama-3-8b/happy"
all = np.zeros((len(layers) * 8, 4096))
attAll = np.zeros((len(layers) * 8, 4096))
containsAliceAll = np.zeros((len(layers) * 8, 1))
containsNotAll = np.zeros((len(layers) * 8, 1))
isHappyAll = np.zeros((len(layers) * 8, 1))
statementsAll = []
layersAll = []

for i, layer in enumerate(layers):
    index = range(i * 8, (i + 1) * 8)
    all[index] = torch.load(os.path.join(path, f"layer_{layer}_0.pt")).cpu().numpy()
    attAll[index] = (
        torch.load(os.path.join(path, f"attn_layer_{layer}_0.pt")).cpu().numpy()
    )
    containsAliceAll[index, 0] = containsAlice
    containsNotAll[index, 0] = containsNot
    isHappyAll[index, 0] = isHappy
    statementsAll.extend(statements)
    layersAll.extend([layer] * 8)

In [5]:
# Save heatmap at each layer
for i, layer in enumerate(layers):
    index = range(i * 8, (i + 1) * 8)
    all_normalized = all[index] / np.linalg.norm(all[index], axis=1, keepdims=True)
    plt.figure(figsize=(10, 10))
    sns.heatmap(all_normalized @ all_normalized.T, yticklabels=statements)
    plt.savefig(f"figures/gram_layer_{layer}.png")
    plt.close()

In [8]:
# plot pca
all_pc2 = PCA(n_components=2, whiten=True).fit_transform(all)
px.scatter(
    x = all_pc2[:, 0],
    y = all_pc2[:, 1],
    color=layersAll,
    color_continuous_scale=px.colors.sequential.Blues,
    hover_data={"statements": statementsAll, "layer": layersAll},
)

In [9]:
# plot pca
attAll_pc2 = PCA(n_components=2, whiten=True).fit_transform(attAll)
px.scatter(
    x = attAll_pc2[:, 0],
    y = attAll_pc2[:, 1],
    color=layersAll,
    color_continuous_scale=px.colors.sequential.Blues,
    hover_data={"statements": statementsAll, "layer": layersAll},
)

1. Clusters are formed by layers. The first layer and last layer activations are clustered in parallel lines.
2. First layer attention is a line.

In [21]:
torch.Tensor(all).cuda().shape, torch.Tensor(containsNotAll).cuda().shape

(torch.Size([40, 4096]), torch.Size([40, 1]))

In [24]:
# probe

from probes import LRProbe

# containsAlice
all_cuda = torch.Tensor(all).cuda()
containsAlice_cuda = torch.Tensor(containsAliceAll).cuda().flatten()
probe = LRProbe.from_data(all_cuda, containsAlice_cuda, bias=True, device="cuda")
train_acc = (probe(all_cuda).round() == containsAlice_cuda).float().mean().item()
train_acc

# containsNot
containsNot_cuda = torch.Tensor(containsNotAll).cuda().flatten()
probe = LRProbe.from_data(all_cuda, containsNot_cuda, bias=True, device="cuda")
train_acc = (probe(all_cuda).round() == containsNot_cuda).float().mean().item()
train_acc

1.0