In [None]:
# This setup takes roughly 15 mins to run
#!pip install -q accelerate
#!pip install -q -i https://pypi.org/simple/ bitsandbytes
#!git clone https://github.com/saprmarks/geometry-of-truth.git
import torch
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm   # Change this
from transformers import AutoModelForCausalLM, AutoTokenizer

# Directory holding the CSV datasets (presumably the checkout produced by the
# commented-out `git clone` above -- confirm it exists before running).
dataset_dir = 'geometry-of-truth/datasets'

# Root output directory, read from the environment so no absolute path is hardcoded.
OUTPUT_DIR = os.getenv("OUTPUT_DIR_MSC")
if OUTPUT_DIR is None:
    # os.path.join(None, ...) would otherwise fail with an opaque TypeError.
    raise EnvironmentError(
        "Environment variable OUTPUT_DIR_MSC is not set; it must point to the "
        "directory under which the Hugging Face cache is stored."
    )
cache_dir = os.path.join(
    OUTPUT_DIR, "cache_dir", "huggingface"
)

# Single source of truth for the checkpoint so model and tokenizer cannot drift apart.
MODEL_NAME = "meta-llama/Llama-2-13b-hf"

# NOTE(review): passing `load_in_4bit` directly appears to be deprecated in recent
# transformers releases in favour of `quantization_config=BitsAndBytesConfig(...)`
# -- confirm against the installed version before changing it.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto", load_in_4bit=True, cache_dir=cache_dir)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=cache_dir)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m711.6 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 MB[0m [31m751.7 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━

In [None]:
# Print the inner model (`model.model`) first, then its first layer on its own.
# Source code: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L693
for module in (model.model, model.model.layers[0]):
  print(module)

In [None]:
from typing import List
import transformers


# NumPy dtype in which captured activations are stored (used by Hook.__call__).
dtype = np.float16


class Hook:
  """Forward hook that records one token position of a module's hidden states.

  Inspired by https://github.com/saprmarks/geometry-of-truth/blob/main/generate_acts.py

  Args:
    idx: Index along the second-to-last (token) axis of the hidden states that
      is kept; the default -1 selects the last token.
    out_dtype: NumPy dtype used to store the activations. When None, the
      module-level `dtype` is used.

  Attributes:
    activations: One NumPy array per hooked forward pass, in call order.
  """

  def __init__(self, idx=-1, out_dtype=None):
    self.activations = []
    self.idx = idx   # The index of the token we look at the internal state for
    self.out_dtype = out_dtype

  def __call__(self, module, args, output, **kwargs):
    """Append `hidden_states[..., idx, :]` of the hooked module to `activations`.

    Args:
      module: The module the hook is attached to (unused).
      args: Positional inputs of the forward call (unused).
      output: Either the hidden-state tensor itself, or a sequence whose first
        element is that tensor (e.g. `(hidden_states, cache)`).

    Raises:
      TypeError: If no hidden-state tensor can be found in `output`.
    """
    # The previous check demanded exactly `(Tensor, DynamicCache)`; a bare tensor
    # or a tuple of another length is accepted here as well.
    # NOTE(review): the layer return layout seems to differ between transformers
    # versions / `use_cache` settings -- confirm for the installed version.
    hidden = output if isinstance(output, torch.Tensor) else output[0]
    if not isinstance(hidden, torch.Tensor):
      raise TypeError(
        f"Expected a tensor or a sequence starting with a tensor, got {type(output).__name__}"
      )
    target_dtype = dtype if self.out_dtype is None else self.out_dtype
    # Go through float32 before `.numpy()`: NumPy has no bfloat16, and the round
    # trip is exact for float16/float32 inputs.
    o = hidden[..., self.idx, :].detach().cpu().float().numpy().astype(target_dtype)
    self.activations.append(o)


def compute_activations(statements: List[str], model: torch.nn.Module) -> np.ndarray:
  """
    Run every statement through `model` and collect the hooked activations of
    each layer.

    One `Hook` is registered on every entry of `model.model.layers`. Each
    statement is encoded with the module-level `tokenizer` and passed through
    the model in its own forward pass. The hooks are always removed again, even
    if a forward pass raises or is interrupted.

    Args:
    - statements: Texts to encode, one forward pass per statement.
    - model: Model exposing its layers as `model.model.layers`.

    Returns:
    - Activations of shape [num_layers, num_samples, n_hidden_dim]
  """
  hooks = [Hook() for _ in model.model.layers]
  handles = []
  try:
    for layer, hook in zip(model.model.layers, hooks):
      handles.append(layer.register_forward_hook(hook))

    # Inference only: the hooks detach what they store, so there is no reason to
    # build an autograd graph for every forward pass.
    with torch.no_grad():
      for statement in tqdm(statements):
        tokens = tokenizer.encode(statement, return_tensors='pt').cuda()
        _ = model(tokens)
  finally:
    # Without this, a failed or interrupted run leaves hooks attached that keep
    # recording on every later forward pass.
    for handle in handles:
      handle.remove()

  # Per layer: stack the per-statement rows; then stack the layers on a new first axis.
  return np.stack([np.vstack(hook.activations) for hook in hooks], axis=0)


# Defensive cleanup: drop any forward hooks still registered on the layers
# (e.g. left behind by an earlier run that did not finish).
for layer in model.model.layers:
  leftover_hooks = layer._forward_hooks
  leftover_hooks.clear()
  assert len(leftover_hooks) == 0

In [None]:
# Number of statements kept per label value.
idx = 100 #50
# Alternative dataset: 'larger_than.csv' -- Larger than produces obvious separability.... In most layers!!
csv_path = os.path.join(dataset_dir, 'common_claim_true_false.csv')
df = pd.read_csv(csv_path)
# First `idx` rows with label 0, followed by the first `idx` rows with label 1.
rows_label_0 = df.loc[df['label'] == 0].iloc[:idx]
rows_label_1 = df.loc[df['label'] == 1].iloc[:idx]
df_concat = pd.concat((rows_label_0, rows_label_1))
# Result shape per the compute_activations docstring: [num_layers, num_samples, n_hidden_dim]
activations = compute_activations(df_concat['statement'], model)

The separation is not very clear for `common_claim_true_false.csv` (logistic regression accuracy of 0.615 in CV), but there does seem to be some separability in Llama-2-13b. Judging from the paper, this dataset may also simply not be clearly separated in the model.<br>
For `larger_than.csv`, the separation is very clear, but the representation then seems to be present basically everywhere in the model.<br>
With `cities.csv`, the separation is very clear and logistic regression reaches an accuracy of 0.995 in CV.

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt


def plot_pca(layer, n_components):
    """Scatter-plot every pair of leading principal components for one layer.

    Reads the module-level `activations` array and the labels in `df_concat`,
    fits a PCA on `activations[layer]`, and draws an
    `n_components x n_components` grid. The panel in row i, column j shows
    component i on the x-axis against component j on the y-axis, with label 0
    drawn as 'False' and label 1 as 'True'.

    Args:
        layer: Index into the first axis of `activations`.
        n_components: Number of principal components to compute and plot.
    """
    X, y = activations[layer,...], df_concat['label'].to_numpy()
    # Fixed seed so repeated runs give the same projection when PCA selects a
    # randomized solver.
    pca = PCA(n_components=n_components, random_state=0)
    X_pca = pca.fit_transform(X)

    # squeeze=False keeps `ax` two-dimensional even when n_components == 1;
    # otherwise a bare Axes is returned and `ax[i, j]` fails.
    fig, ax = plt.subplots(n_components, n_components, figsize=(20,20), squeeze=False)
    for x_component in range(n_components):
        for y_component in range(n_components):
            panel = ax[x_component, y_component]
            panel.scatter(X_pca[y==0, x_component], X_pca[y==0, y_component], label='False')
            panel.scatter(X_pca[y==1, x_component], X_pca[y==1, y_component], label='True')
            panel.set_title(f'x: {x_component}, y: {y_component}')
    # The scatter labels are only shown once a legend is drawn; one panel suffices
    # because every panel uses the same colours.
    ax[0, 0].legend()
    plt.show()
    # Release the figure so calling this in a loop does not accumulate open figures.
    plt.close(fig)

In [None]:
# Pairwise scatter of the first 5 principal components for layer index 14.
plot_pca(14, n_components=5)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

# Layer (index into the first axis of `activations`) probed with a linear classifier.
layer = 14
X, y = activations[layer,...], df_concat['label'].to_numpy()
accs = []
# Shuffle before splitting: the samples are ordered by label (all label-0 rows,
# then all label-1 rows), so contiguous folds would be dominated by one class
# and bias the accuracy estimate. The seed keeps the split reproducible.
for train_idx, test_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(X, y):
    X_train, y_train, X_test, y_test = X[train_idx,...], y[train_idx], X[test_idx,...], y[test_idx]
    # Fit the scaler on the training fold only so no test statistics leak in.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    log_reg = LogisticRegression(max_iter=1000)
    log_reg.fit(X_train, y_train)
    y_pred = log_reg.predict(X_test)
    accs.append(accuracy_score(y_test, y_pred))
# Mean accuracy over the 5 folds.
print(np.mean(accs))

In [None]:
# Iterate over every layer present in `activations` (first axis) rather than a
# hard-coded count of 40, so the loop stays correct for other models.
for layer in range(activations.shape[0]):
  print(f"Layer {layer+1}")
  plot_pca(layer, n_components=5)