In [None]:
import pandas as pd

# Load the steering-vector table. Later cells read its "dataset",
# "steering_vector" and "trained_or_activation" columns.
df = pd.read_parquet("/workspace/steering_vectors_trained_activation.parquet")


In [None]:
# Show the freshly loaded frame as the cell output.
df

In [None]:
import numpy as np

# Convert every stored steering vector to a NumPy array, element by element,
# so the column can later be stacked into a 2-D matrix.
df["steering_vector"] = df["steering_vector"].apply(np.array)


In [None]:
# Show the frame again, now that "steering_vector" holds NumPy arrays.
df

In [None]:
# Split the table on the flag column: rows where it is True go to df_trained,
# the remaining rows go to df_activation.
is_trained = df["trained_or_activation"]
df_trained = df.loc[is_trained]
df_activation = df.loc[~is_trained]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Sort both halves by dataset name so that row i of each frame refers to the
# same dataset.
df_trained_sorted = df_trained.sort_values("dataset").reset_index(drop=True)
df_activation_sorted = df_activation.sort_values("dataset").reset_index(drop=True)

# Sanity check: both frames must list exactly the same datasets in the same order.
# np.array_equal also returns False when the lengths differ; an element-wise `==`
# would instead broadcast or fail before the assertion message could be shown.
assert np.array_equal(
    df_trained_sorted["dataset"].to_numpy(),
    df_activation_sorted["dataset"].to_numpy(),
), "Datasets do not align!"

# One row per dataset, in the sorted order established above.
trained_vectors = np.stack(df_trained_sorted["steering_vector"].values)
activation_vectors = np.stack(df_activation_sorted["steering_vector"].values)

# Cosine similarity between each trained vector and the activation vector of
# the same dataset (row i with row i).
cos_sims = np.array([
    cosine_similarity(trained_vec.reshape(1, -1), activation_vec.reshape(1, -1))[0, 0]
    for trained_vec, activation_vec in zip(trained_vectors, activation_vectors)
])

# Collect the per-dataset scores in a DataFrame for inspection.
df_similarity = pd.DataFrame({
    "dataset": df_trained_sorted["dataset"].values,
    "cosine_similarity": cos_sims
})

df_similarity



In [None]:
import matplotlib.pyplot as plt
# Distribution of the per-dataset cosine similarities stored in cos_sims.
fig, ax = plt.subplots()
ax.hist(cos_sims, bins=20, edgecolor='black')
ax.set_title("Histogram of Cosine Similarity")
ax.set_xlabel("Cosine Similarity")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
# Distribution of the norms of the trained vectors (one norm per row).
trained_magnitudes = np.linalg.norm(trained_vectors, axis=1)

fig, ax = plt.subplots()
ax.hist(trained_magnitudes, bins=20, edgecolor='black')
ax.set(title="Histogram of Vector Magnitude", xlabel="Magnitude", ylabel="Frequency")
plt.show()


In [None]:
# Distribution of the norms of the activation vectors (one norm per row).
activation_magnitudes = np.linalg.norm(activation_vectors, axis=1)

fig, ax = plt.subplots()
ax.hist(activation_magnitudes, bins=20, edgecolor='black')
ax.set(
    title="Histogram of Activation Vector Magnitude",
    xlabel="Magnitude",
    ylabel="Frequency",
)
plt.show()


In [None]:
from scipy.stats import pearsonr, spearmanr

# Norm of every trained / activation vector (one value per row).
trained_norms = np.linalg.norm(trained_vectors, axis=1)
activation_norms = np.linalg.norm(activation_vectors, axis=1)

# Linear (Pearson) relationship between the two sets of norms.
pearson_corr, pearson_p = pearsonr(trained_norms, activation_norms)
print(f"Pearson correlation between vector norms: r={pearson_corr:.4f}, p={pearson_p:.4g}")

# Rank-based (Spearman) relationship between the two sets of norms.
spearman_corr, spearman_p = spearmanr(trained_norms, activation_norms)
print(f"Spearman correlation between vector norms: r={spearman_corr:.4f}, p={spearman_p:.4g}")

# Visual check of the same relationship: one point per row.
fig, ax = plt.subplots()
ax.scatter(trained_norms, activation_norms, alpha=0.6)
ax.set_xlabel("Trained Vector Norm")
ax.set_ylabel("Activation Vector Norm")
ax.set_title("Scatter Plot of Vector Norms")
plt.show()


In [None]:
from typing import Optional, Dict
import torch
from pathlib import Path

from functools import lru_cache

PERSONA_ROOT = Path("/workspace/persona-data")


@lru_cache(maxsize=8)
def _load_vector_file(vec_file: Path) -> Dict[str, torch.Tensor]:
    """Read one ``.pt`` vector file onto the CPU, memoizing recently used files.

    Callers that request many layers of the same dataset in a row would
    otherwise re-read the same file from disk once per layer.
    """
    # NOTE(review): torch.load unpickles the file, so only point this at trusted
    # data (consider weights_only=True if the installed torch supports it).
    return torch.load(vec_file, map_location="cpu")


def load_activation_vector(
    dataset: str, target_layer: int, root: Optional[Path] = None
) -> Optional[torch.Tensor]:
    """Return the activation vector for ``dataset`` at ``target_layer``.

    The dataset name selects both the file and how the vector is built:

    * ``"<model>__trait__<trait>"`` reads
      ``<root>/<model>/traits_240/vectors/<trait>.pt`` and returns its
      ``"pos_neg_50"`` entry at ``target_layer``.
    * ``"<anything>__role__<role>"`` reads
      ``<root>/qwen-3-32b/roles_240/vectors/<role>.pt`` (the prefix in the name
      is ignored) and returns ``"pos_3"`` minus ``"default_1"`` at
      ``target_layer``.

    Args:
        dataset: Dataset name containing ``__trait__`` or ``__role__``.
        target_layer: Index used to pick the layer inside each stored entry.
        root: Directory holding the persona data; defaults to ``PERSONA_ROOT``.

    Returns:
        The selected vector converted with ``.float()``, or ``None`` when the
        name matches neither pattern or the vector file does not exist.
        Errors from a missing key or an invalid layer index propagate.

    Note:
        File contents are memoized, so a returned tensor may share memory with
        other results from the same file; clone it before modifying in place.
    """
    base = PERSONA_ROOT if root is None else Path(root)

    if "__trait__" in dataset:
        model_prefix, trait = dataset.split("__trait__", 1)
        vec_file = base / f"{model_prefix}/traits_240/vectors/{trait}.pt"
        if not vec_file.exists():
            return None
        data = _load_vector_file(vec_file)
        return data["pos_neg_50"][target_layer].float()

    if "__role__" in dataset:
        model_prefix = "qwen-3-32b"  # use Qwen activation vectors for roles
        role = dataset.split("__role__", 1)[1]
        vec_file = base / f"{model_prefix}/roles_240/vectors/{role}.pt"
        if not vec_file.exists():
            return None
        data = _load_vector_file(vec_file)
        # Contrast the role-conditioned entry against the default entry.
        vec_contrast = data["pos_3"][target_layer] - data["default_1"][target_layer]
        return vec_contrast.float()

    return None

In [None]:
# Load activation vectors for all datasets for all layers

from tqdm import tqdm

def load_all_activation_vectors(datasets, num_layers=64):
    """
    Collect the per-layer activation vectors of every dataset.

    Each dataset is probed with ``load_activation_vector`` for the layer
    indices ``0 .. num_layers - 1``. Layers for which the loader returns
    ``None`` are left out, so a dataset without any vector maps to an
    empty dict.

    Args:
        datasets (iterable of str): Dataset names to load.
        num_layers (int): Number of layer indices to try (default: 64).

    Returns:
        dict[str, dict[int, torch.Tensor]]:
            Mapping from dataset name to {layer_idx: activation_vector}.
    """
    all_vectors = {}
    for dataset in tqdm(datasets):
        # Lazily query the loader layer by layer, in ascending layer order.
        candidates = (
            (layer_idx, load_activation_vector(dataset, layer_idx))
            for layer_idx in range(num_layers)
        )
        all_vectors[dataset] = {
            layer_idx: vec for layer_idx, vec in candidates if vec is not None
        }
    return all_vectors

# Load the activation vectors of every distinct dataset in df.
# num_layers=32 overrides the function's default; later cells index layers 0-31.
all_activation_vectors = load_all_activation_vectors(df["dataset"].unique(), num_layers=32)


In [None]:
# Sanity check: the distinct numbers of loaded layers across datasets.
# A single value means every dataset has the same number of layers.
{len(layer_vectors) for layer_vectors in all_activation_vectors.values()}

In [None]:
from tqdm import trange
# Compare each dataset's trained vector with its activation vector at every layer.
datasets_in_order = df["dataset"].unique()

# The trained vector does not depend on the layer, so look it up once per
# dataset instead of re-filtering df_trained inside the layer loop.
trained_vector_by_dataset = {
    dataset: df_trained.loc[df_trained["dataset"] == dataset, "steering_vector"].values[0]
    for dataset in datasets_in_order
}

# cosine_sims_by_layer[layer][dataset] -> cosine similarity at that layer.
cosine_sims_by_layer = []
for layer in trange(32):
    cosine_sims = {}
    for dataset in datasets_in_order:
        trained_vector = trained_vector_by_dataset[dataset]
        activation_vector = all_activation_vectors[dataset][layer]
        cosine_sim = cosine_similarity(
            trained_vector.reshape(1, -1), activation_vector.reshape(1, -1)
        )[0, 0]
        cosine_sims[dataset] = cosine_sim
    cosine_sims_by_layer.append(cosine_sims)




In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Average the per-dataset cosine similarities within each layer.
mean_cosine_sims = []
for layer_dict in cosine_sims_by_layer:
    mean_cosine_sims.append(np.mean(list(layer_dict.values())))

# Mean similarity per layer index, with the trained layer marked by a dashed line.
fig, ax = plt.subplots(figsize=(8, 5))
ax.axvline(x=22, color='red', linestyle='--', label='Trained Layer (22)')
ax.plot(range(32), mean_cosine_sims, marker='o')
ax.legend()
ax.set_xlabel("Layer")
ax.set_ylabel("Mean Cosine Similarity")
ax.set_title("Mean Cosine Similarity between Trained and Activation Vectors vs Layer")
ax.grid(True)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Compare, per dataset, the activation vector at index layer_n with the one at
# index 31.
# NOTE(review): layer_n is used directly as a 0-based index, while index 31 is
# labelled "Layer 32" in the plot text - confirm which numbering the labels
# are meant to use.
layer_n = 22  # You can change this to any layer index you want

datasets_in_order = df["dataset"].unique()
cosine_sims_n_vs_32 = [
    cosine_similarity(
        all_activation_vectors[dataset][layer_n].reshape(1, -1),
        all_activation_vectors[dataset][31].reshape(1, -1),
    )[0, 0]
    for dataset in datasets_in_order
]

# Histogram of the per-dataset similarities.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(cosine_sims_n_vs_32, bins=30, color='skyblue', edgecolor='k')
ax.set_xlabel(f"Cosine Similarity (Layer {layer_n} vs Layer 32)")
ax.set_ylabel("Count")
ax.set_title(f"Cosine Similarity between Activation Vectors @ Layer {layer_n} and @ Layer 32")
ax.grid(True)
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# For each layer, compute the mean cosine similarity between that layer's
# activation vector and the activation vector at index 31 (shown as "Layer 32"),
# averaged over all datasets.
num_layers = 32
mean_cosine_sims_vs_32 = []

for layer_idx in range(num_layers):
    sims = []
    for dataset in datasets_in_order:
        vec_n = all_activation_vectors[dataset][layer_idx]
        vec_32 = all_activation_vectors[dataset][31]
        sim = cosine_similarity(
            vec_n.reshape(1, -1), vec_32.reshape(1, -1)
        )[0, 0]
        sims.append(sim)
    mean_cosine_sims_vs_32.append(np.mean(sims))

plt.figure(figsize=(8, 5))
# x positions are 1-based: index 0 is drawn at x=1 and index 31 at x=32.
# The line carries a label so that plt.legend() below has an entry to show;
# without one, matplotlib warns and draws an empty legend.
plt.plot(
    range(1, num_layers + 1),
    mean_cosine_sims_vs_32,
    marker='o',
    label="Mean over datasets",
)
plt.xlabel("Layer")
plt.ylabel("Mean Cosine Similarity to Layer 32")
plt.title("Mean Cosine Similarity: Activation Vector @ Layer N vs Layer 32 (Averaged Over Datasets)")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Layer 22: cosine similarity of every activation vector with every trained vector.
datasets_in_order = df["dataset"].unique()

# Row i of both matrices belongs to datasets_in_order[i].
trained_rows = []
activation_rows = []
for dataset in datasets_in_order:
    trained_rows.append(
        df_trained[df_trained["dataset"] == dataset]["steering_vector"].values[0]
    )
    activation_rows.append(all_activation_vectors[dataset][22])
trained_vectors = np.stack(trained_rows)
activation_vectors = np.stack(activation_rows)

# sim_matrix[i, j]: activation vector of dataset i vs trained vector of dataset j.
sim_matrix = cosine_similarity(activation_vectors, trained_vectors)

activation_labels = [f"{ds} (activation)" for ds in datasets_in_order]
trained_labels = [f"{ds} (trained)" for ds in datasets_in_order]
sim_df = pd.DataFrame(sim_matrix, index=activation_labels, columns=trained_labels)

# Empirical CDF over all entries of the matrix (every activation/trained pair).
all_sims = sim_matrix.flatten()
sorted_sims = np.sort(all_sims)
cdf = np.arange(1, len(sorted_sims) + 1) / len(sorted_sims)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(sorted_sims, cdf, label="CDF of Cosine Similarities")
ax.set_xlabel("Cosine Similarity")
ax.set_ylabel("Cumulative Probability")
ax.set_title("CDF of Cosine Similarities (Activation vs Trained, Layer 22)")
ax.grid(True)
ax.legend()
plt.show()


In [None]:
# Variance of each vector across its own components (last axis).
trained_var = trained_vectors.var(axis=-1)
activation_var = activation_vectors.var(axis=-1)

# One point per row: trained variance on x, activation variance on y.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(trained_var, activation_var, alpha=0.6)
ax.set_xlabel("Trained Vector Variance")
ax.set_ylabel("Activation Vector Variance")
ax.set_title("Variance of Trained and Activation Vectors")
plt.show()

In [None]:
# Mean of each vector across its own components (last axis).
trained_mean = np.mean(trained_vectors, axis=-1)
activation_mean = np.mean(activation_vectors, axis=-1)

# One point per row: trained mean on x, activation mean on y.
plt.figure(figsize=(8, 5))
plt.scatter(trained_mean, activation_mean, alpha=0.6)
plt.xlabel("Trained Vector Mean")
plt.ylabel("Activation Vector Mean")