In [2]:
import os
# Root directory for all project files. The default is the original cluster location;
# export MSC_DRIVE_DIR before starting the kernel to run the notebook elsewhere
# without editing this cell.
drive_dir = os.environ.get('MSC_DRIVE_DIR', '/work3/s184399/msc')
# Directory holding the input CSV files.
data_dir = os.path.join(drive_dir, 'data')
# Directory passed as `cache_dir` when loading the model and tokenizer.
cache_dir = os.path.join(drive_dir, "cache_dir", "huggingface")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
from typing import List, Tuple
import torch
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm


class ITI:
  """
    Per-head adapter for Inference-Time Intervention, after Li et al. 2023 (arXiv:2306.03341v5).

    An instance registers itself as a forward pre-hook on `self_attention.o_proj` and looks at
    the slice of that module's input that belongs to one attention head (columns
    `head_idx*head_dim` up to `(head_idx+1)*head_dim`) at token position `idx`.

    Intended workflow:
    1) Construct the adapter with a self-attention module and a head index (this attaches the hook)
    2) Run a lot of data through the model; in LOGGING mode every forward pass is recorded
    3) Call `.fit` with the labels to train a probe on the recorded activations
    4) Intervention itself is not implemented yet: `intervene` raises NotImplementedError

    Modes: LOGGING (-1, the default) records activations, INACTIVE (0) turns the hook into a
    no-op and ACTIVE (1) requests intervention. Use `activate()` / `deactivate()` to switch, and
    `reset()` to go back to LOGGING with an empty log and no probe.
  """
  _probe = None
  _scaler = None        # StandardScaler fitted together with the probe (None until `fit` is called)
  _hook_handle = None
  _log: List
  _mode: int
  _head_idx: int
  _head_dim: int
  _head_start_idx: int
  _head_end_idx: int
  strength: float = 0.  # Intervention strength; 0 means "do not intervene" even in ACTIVE mode
  LOGGING, INACTIVE, ACTIVE = -1,0,1

  def __init__(self, self_attention, head_idx, clf_cls, clf_kwargs=None):
    """
      self_attention: attention module exposing `head_dim` and an `o_proj` submodule to hook.
      head_idx:       index of the head whose slice of the `o_proj` input is observed.
      clf_cls:        classifier class; instantiated as `clf_cls(**clf_kwargs)` on every `fit`.
      clf_kwargs:     keyword arguments for `clf_cls`; None (the default) means no arguments.
    """
    self.idx = -1   # The index of the token we look at the internal state for
    self._head_idx = head_idx
    self._head_dim = self_attention.head_dim
    self._head_start_idx = self._head_idx * self._head_dim
    self._head_end_idx = self._head_start_idx + self._head_dim
    self._attn = self_attention
    self.attach_hook(self_attention)
    self.reset()
    self.clf_cls = clf_cls
    # Every adapter owns its kwargs dict, so changing one adapter never leaks into another.
    self.clf_kwargs = {} if clf_kwargs is None else dict(clf_kwargs)

  def reset(self):
    """Return to LOGGING mode and drop the recorded activations, the probe and its scaler."""
    self._mode = self.LOGGING
    self._log = []
    self._probe = None
    self._scaler = None

  def deactivate(self):
    """Switch to INACTIVE mode: the hook neither logs nor intervenes."""
    self._mode = self.INACTIVE

  def activate(self):
    """Switch to ACTIVE mode (intervention is requested on every forward pass)."""
    self._mode = self.ACTIVE

  def log(self, input):
    """
      Record this head's activation at token `idx` from the hooked module's positional inputs.

      `input` is the tuple of positional arguments of `o_proj`; exactly one tensor is expected.
      The slice is stored as a float16 numpy array appended to the internal log.
    """
    assert len(input) == 1 and isinstance(input[0], torch.Tensor)
    o = input[0][...,self.idx,self._head_start_idx:self._head_end_idx].detach().cpu().numpy().astype(np.float16)
    self._log.append(o)

  def fit(self, y, train_idx=None, eval_idx=None):
    """
      Train a probe on the logged activations.

      y:         labels, one per logged row (in logging order). Indexed by position.
      train_idx: positions used for training; None means all of them.
      eval_idx:  positions used for evaluation; requires `train_idx`.

      Returns the accuracy on `eval_idx`, or -1 when no `eval_idx` is given.
      Raises ValueError when `eval_idx` is given without `train_idx` or nothing has been logged.
    """
    if train_idx is None:
      if eval_idx is not None:
        raise ValueError("eval_idx was given without train_idx")
      train_idx = np.arange(len(y))
    if not self._log:
      raise ValueError("No activations have been logged; run data through the model before calling fit")

    # Positional indexing keeps the labels aligned with the rows of X even when `y` is a
    # pandas Series whose index is not 0..n-1.
    y = np.asarray(y)
    X = np.vstack(self._log)
    X_train, y_train = X[train_idx], y[train_idx]
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    clf = self.clf_cls(**self.clf_kwargs)
    clf.fit(X_train, y_train)
    self._probe = clf
    self._scaler = scaler   # The probe only makes sense on inputs scaled the same way

    if eval_idx is not None:
      X_test, y_test = X[eval_idx], y[eval_idx]
      X_test = scaler.transform(X_test)
      y_pred = self._probe.predict(X_test)
      return accuracy_score(y_test, y_pred)
    return -1

  def intervene(self, input):
    """
      Shift the head activation along the probe direction. Not implemented yet.

      Sketch of the intended implementation (needs `_std_dev` and `_direction`, which nothing
      estimates so far):
        input = (input[0].clone(),)
        o = input[0][...,self.idx,self._head_start_idx:self._head_end_idx]
        o = o + self.strength*self._std_dev*self._direction
        input[0][...,self.idx,self._head_start_idx:self._head_end_idx] = o
        return input
    """
    raise NotImplementedError("ITI.intervene is not implemented yet")

  def __call__(self, module, input, **kwargs):
    """
      Forward pre-hook entry point. Logs in LOGGING mode, does nothing in INACTIVE mode or when
      `strength` is 0, and otherwise delegates to `intervene`.
    """
    if self._mode == self.LOGGING:
      return self.log(input)
    if (self._mode == self.INACTIVE) or self.strength == 0.:
      return None
    # ACTIVE with a non-zero strength: fail loudly instead of silently leaving the model untouched.
    return self.intervene(input)

  def attach_hook(self, self_attention):
    """Register this adapter as a forward pre-hook on `self_attention.o_proj`."""
    self._hook_handle = self_attention.o_proj.register_forward_pre_hook(self)

  def clear_hook(self):
    """Remove the hook registered by `attach_hook`; safe to call when no hook is attached."""
    if self._hook_handle is not None:
      self._hook_handle.remove()
      self._hook_handle = None

In [50]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

df = pd.read_csv(os.path.join(data_dir, 'politicians_edge_substitutions.csv'))

# Give every distinct value of the `name` column an integer id, stored in `Group`, so that all
# rows about the same politician can be kept in the same cross-validation fold.
# (The encoder is deliberately not called `ord`: that would shadow the builtin for every later cell.)
group_encoder = OrdinalEncoder()
groups = group_encoder.fit_transform(np.array(df['name']).reshape(-1,1))
df['Group'] = groups.astype(np.uint32)
df = df.sort_values(by='Group', axis=0)

# Only the groups with id 0..n_groups-1 are used.
n_groups=500

# Balance the classes inside every group: keep equally many false (isTrue == 0) and
# true (isTrue == 1) rows, false rows first.
selected_rows = []
for i in range(n_groups):
  sel = df[df['Group']==i]
  n_true = sel['isTrue'].sum()
  n_false = (1-sel['isTrue']).sum()
  n_sel = min(n_true, n_false)
  selected_rows.append(sel[sel['isTrue']==0][:n_sel])
  selected_rows.append(sel[sel['isTrue']==1][:n_sel])
df_concat = pd.concat(selected_rows)
# The old row labels are kept as an `index` column; the new index is 0..len-1, so positional
# fold indices and index labels coincide.
df_concat = df_concat.reset_index()
df_concat

Unnamed: 0.1,index,Unnamed: 0,name,nameLabel,education,educationLabel,countryLabel,isTrue,Questions,Statements,Group
0,27755,28362,Q1000053,Vasily Nebenzya,Q4407713,St. Petersburg Institute of Mechanical Enginee...,Russia,0,Did Vasily Nebenzya attend St. Petersburg Inst...,Vasily Nebenzya attended St. Petersburg Instit...,0
1,27750,28357,Q1000053,Vasily Nebenzya,Q322964,Moscow State Institute of International Relations,Russia,1,Did Vasily Nebenzya attend Moscow State Instit...,Vasily Nebenzya attended Moscow State Institut...,0
2,4487,4639,Q100218,Martina Bunge,Q504723,University of Siegen,Germany,0,Did Martina Bunge attend University of Siegen?,Martina Bunge attended University of Siegen.,1
3,4482,4634,Q100218,Martina Bunge,Q159895,University of Rostock,Germany,1,Did Martina Bunge attend University of Rostock?,Martina Bunge attended University of Rostock.,1
4,4493,4646,Q100234,Marion Schick,Q28971028,United Nations University (Germany),Germany,0,Did Marion Schick attend United Nations Univer...,Marion Schick attended United Nations Universi...,2
...,...,...,...,...,...,...,...,...,...,...,...
1199,7774,8003,Q120001,Jean-Jacques Rey-Bellet,Q262760,Swiss Federal Institute of Technology in Lausanne,Switzerland,1,Did Jean-Jacques Rey-Bellet attend Swiss Feder...,Jean-Jacques Rey-Bellet attended Swiss Federal...,497
1200,29097,29734,Q1200058,Derek Mackay,Q17040338,University of Kent – School of Sport & Exercis...,United Kingdom,0,Did Derek Mackay attend University of Kent – S...,Derek Mackay attended University of Kent – Sch...,498
1201,29094,29731,Q1200058,Derek Mackay,Q192775,University of Glasgow,United Kingdom,1,Did Derek Mackay attend University of Glasgow?,Derek Mackay attended University of Glasgow.,498
1202,29101,29738,Q1200134,Derek Vaughan,Q4731673,Allen Hall Seminary,United Kingdom,0,Did Derek Vaughan attend Allen Hall Seminary?,Derek Vaughan attended Allen Hall Seminary.,499


# LLaMa-3-8b

In [6]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Hugging Face hub id of the checkpoint that is probed in this notebook.
model_name = "meta-llama/Meta-Llama-3-8B"

# Load the weights in 4 bit, with float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# Both the model and the tokenizer are cached under `cache_dir`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    cache_dir=cache_dir,
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_dir,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
from sklearn.model_selection import GroupKFold, KFold

# Test if we can even apply ITI to LLaMa
def setup_iti(model, clf_cls, clf_kwargs=None):
  """
    Attach one ITI adapter to every attention head of every layer in `model.model.layers`.

    model:      model whose decoder layers each expose `self_attn` with a `num_heads` attribute.
    clf_cls:    classifier class handed to every adapter.
    clf_kwargs: keyword arguments for `clf_cls`; None (the default) means no arguments.

    Returns a nested list indexed as `iti_adapters[layer][head]`.
  """
  clf_kwargs = {} if clf_kwargs is None else clf_kwargs
  iti_adapters = []
  for layer in model.model.layers:
    att = layer.self_attn
    # Each adapter gets its own copy of the kwargs so they can be changed independently later.
    iti_layer_adapters = [ITI(att, head_idx, clf_cls, dict(clf_kwargs)) for head_idx in range(att.num_heads)]
    iti_adapters.append(iti_layer_adapters)
  return iti_adapters

def clear_hooks(model):
  """
    Drop every forward pre-hook registered on the `o_proj` module of each layer's self-attention.

    This empties the module's hook registry directly, so hooks registered by anything else on
    the same module are removed as well.
  """
  for decoder_layer in model.model.layers:
    pre_hooks = decoder_layer.self_attn.o_proj._forward_pre_hooks
    pre_hooks.clear()
    assert len(pre_hooks) == 0

def compute_activations(model, data):
  """
    Run every entry of `data['Statements']` through `model`, one statement per forward pass.

    The model outputs are discarded; the function exists for the side effects of whatever
    forward hooks are attached to the model. Uses the notebook-level `tokenizer` and moves the
    encoded tokens to the default CUDA device.
  """
  statements = data['Statements']
  # Nothing is back-propagated here, so skip building the autograd graph to save memory.
  with torch.no_grad():
    for statement in tqdm(statements, desc="Computing activations", leave=False):
      tokens = tokenizer.encode(statement, return_tensors='pt').cuda()
      model(tokens)

def train_all_iti_adapters(iti_adapters, y, train_idx=None, eval_idx=None):
  """
    Fit every adapter in the nested `iti_adapters[layer][head]` list and collect the results.

    Assumes the adapters have already logged their data. `y`, `train_idx` and `eval_idx` are
    forwarded unchanged to each adapter's `fit`.

    Returns a nested list with the same layout holding each `fit` return value.
  """
  return [
    [
      head_adapter.fit(y, train_idx, eval_idx)
      for head_adapter in tqdm(layer_adapters, desc="Heads", leave=False)
    ]
    for layer_adapters in tqdm(iti_adapters, desc="Layers", leave=False)
  ]

def cross_val_all_iti_adapters(iti_adapters, data, n_splits=5, verbose=False, random_state=None):
  """
    Cross-validate every adapter and return the mean validation accuracy per head.

    iti_adapters: nested list `[layer][head]` of adapters that have already logged their data.
    data:         DataFrame with an 'isTrue' label column. If it also has a 'Group' column the
                  folds are built with GroupKFold, so a group never spans train and validation;
                  otherwise a shuffled KFold is used.
    n_splits:     number of folds.
    verbose:      print which splitter is used and, for grouped data, the validation groups.
    random_state: seed for the shuffled KFold (ignored for grouped data); None keeps the
                  previous unseeded behaviour.

    Returns an array of shape (n_layers, n_heads) with the accuracies averaged over the folds.
  """
  y = data['isTrue']
  has_groups = 'Group' in data.columns
  if has_groups:
    if verbose:
      print("Using GroupedKFold")
    k_fold = GroupKFold(n_splits=n_splits)
    groups = data['Group']
  else:
    if verbose:
      print("Using KFold")
    k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    groups = None

  fold_accs = []
  for train_idx, val_idx in tqdm(k_fold.split(np.zeros((len(y),1)), groups=groups), desc="CV fold", total=n_splits, leave=False):
    # Only grouped data has a 'Group' column to report; looking it up otherwise raises KeyError.
    if verbose and has_groups:
      print(f"Evaluating on groups: {data.iloc[val_idx]['Group'].unique()}")
    acc = train_all_iti_adapters(iti_adapters, y, train_idx=train_idx, eval_idx=val_idx)
    fold_accs.append(np.array(acc))
  # Stack the per-fold (n_layers, n_heads) matrices along a new last axis and average it away.
  return np.mean(np.stack(fold_accs, axis=2), axis=2)

def two_layer_cv_for_head_assessment(iti_adapters, data_set, n_outer_splits=5, n_inner_splits=5, verbose=False):
  """
    Two-layer (nested) grouped cross-validation for ranking attention heads.

    For every outer fold the development part is split again by an inner GroupKFold. The inner
    folds give a mean validation accuracy per head, which is used to rank the heads from best to
    worst. All adapters are then refitted on the whole development part and evaluated on the
    outer test part, and those test accuracies are stored in ranking order.

    data_set must provide the 'isTrue' labels and the 'Group' column. `verbose` is accepted but
    not used.

    Returns (mean_perf, order, mean_inner_accs):
      mean_perf:       array of shape (n_layers*n_heads, 1); entry k is the outer-test accuracy of
                       the k-th ranked head, averaged over the outer folds.
      order:           one list per outer fold of (layer, head, test accuracy) tuples, in
                       decreasing order of inner-CV accuracy.
      mean_inner_accs: array of shape (n_layers, n_heads), the inner-CV accuracies averaged over
                       the outer folds (i.e. a mean of means).
  """
  labels = data_set['isTrue']
  all_groups = data_set['Group']
  outer_cv = GroupKFold(n_splits=n_outer_splits)
  inner_cv = GroupKFold(n_splits=n_inner_splits)

  perf, order, inner_accs = [], [], []
  outer_splits = outer_cv.split(np.zeros((len(data_set),1)), groups=all_groups)
  for dev_idx, test_idx in tqdm(outer_splits, desc="Outer fold", leave=False, total=n_outer_splits):
    # Inner CV: score every head on validation folds carved out of the development set.
    dev_groups = data_set.iloc[dev_idx]['Group']
    inner_splits = inner_cv.split(np.zeros((len(dev_idx),1)), groups=dev_groups)
    fold_accs = []
    for inner_train, inner_val in tqdm(inner_splits, desc="Inner fold", leave=False, total=n_inner_splits):
      # The inner splitter returns positions within dev_idx; map them back to data_set positions.
      fold_acc = train_all_iti_adapters(iti_adapters, labels, train_idx=dev_idx[inner_train], eval_idx=dev_idx[inner_val])
      fold_accs.append(np.array(fold_acc))

    accs = np.stack(fold_accs, axis=2).mean(axis=2)
    inner_accs.append(accs)
    assert accs.ndim == 2

    # Rank the heads by inner-CV accuracy, best first: sort the flattened matrix ascending,
    # reverse it, and turn the flat positions back into (layer, head) coordinates.
    n_layers, n_heads = accs.shape
    flat_desc = np.argsort(accs, axis=None)[::-1]
    r, c = np.divmod(flat_desc, n_heads)
    ranked = accs[r,c]
    assert len(ranked) == accs.shape[0]*accs.shape[1]
    assert np.all(ranked[:-1] >= ranked[1:])      # Check if sorted.

    # Outer evaluation: refit on the full development set and record the generalization
    # accuracy of each head in ranking order. (Open question from the original author: how to
    # pick the number of heads and the intervention strength.)
    test_acc = np.array(train_all_iti_adapters(iti_adapters, labels, train_idx=dev_idx, eval_idx=test_idx))[r,c]
    order.append(list(zip(r, c, test_acc)))
    perf.append(np.expand_dims(test_acc, axis=1))

  perf = np.array(perf)
  inner_accs = np.array(inner_accs)
  assert inner_accs.ndim == 3

  mean_perf = np.mean(perf, axis=0)                # Mean test accuracy per rank, across the outer folds.
  mean_inner_accs = np.mean(inner_accs, axis=0)    # Mean of the inner-fold means, across the outer folds.
  assert mean_inner_accs.shape == (n_layers, n_heads)
  return mean_perf, order, mean_inner_accs

def plot_performance_matrix(performance_matrix):
  """
    Show a (n_layers, n_heads) accuracy matrix as an annotated heat map.

    Rows are layers and columns are heads; every cell is labelled with its value rounded to
    two decimals.
  """
  # Imported here because the notebook only imports matplotlib in a later cell, while this
  # function is already called before that cell runs.
  import matplotlib.pyplot as plt

  n_layers, n_heads = performance_matrix.shape
  fig, ax = plt.subplots(figsize=(20,20))
  ax.imshow(performance_matrix)

  # Rotate the tick labels and set their alignment.
  plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
          rotation_mode="anchor")

  # Loop over data dimensions and create text annotations.
  for l in range(n_layers):
      for h in range(n_heads):
          ax.text(h, l, f"{performance_matrix[l, h]:.2f}",
                  ha="center", va="center", color="w")

  ax.set_title("Accuracies for linear probes")
  ax.set_xlabel("head")
  ax.set_ylabel("layer")
  fig.tight_layout()
  plt.show()


#####################################
## Two-layer cross validation test ##
#####################################
class TestAdapter:
  """
    Stand-in for an adapter used to test the cross-validation plumbing.

    `fit` trains nothing: it records the arguments of every call and returns the adapter's id,
    so the id plays the role of the accuracy.
  """
  def __init__(self, id):
    self.id = id
    # Call history, one entry per `fit` call, in call order.
    self.train_idxs, self.eval_idxs, self.ys = [], [], []

  def fit(self, y, train_idx, eval_idx):
    """Remember the arguments of this call and return the adapter's id."""
    for history, value in ((self.train_idxs, train_idx), (self.eval_idxs, eval_idx), (self.ys, y)):
      history.append(value)
    return self.id

  def __str__(self):
    return f"TestAdapter({self.id})"

  def __repr__(self):
    return str(self)

# Build a 5-layer x 7-head grid of recording adapters; each id is its flat position l*7+h.
test_adapters = []
for l in range(5):
  layer_test_adapters = []
  for h in range(7):
    layer_test_adapters.append(TestAdapter(id=l*7+h))
  test_adapters.append(layer_test_adapters)

# With 2 outer and 3 inner splits every adapter receives 8 fit calls:
# calls 0-2 are the inner folds of the first outer fold, call 3 is its outer evaluation,
# calls 4-6 are the inner folds of the second outer fold and call 7 is its outer evaluation.
mean_perf, order, mean_inner_accs = two_layer_cv_for_head_assessment(test_adapters, df_concat, n_outer_splits=2, n_inner_splits=3)

# Two arbitrarily chosen adapters
ada1 = test_adapters[0][3]
ada2 = test_adapters[3][2]
assert len(ada1.train_idxs) > 0
assert len(ada1.eval_idxs) > 0
assert len(ada1.ys) > 0

## Check that all heads see the same data
assert ada1.train_idxs == ada2.train_idxs
assert ada1.eval_idxs == ada2.eval_idxs
assert ada1.ys == ada2.ys

## Check that the data indices are correct
# Validation indices of the inner folds belonging to the first and the second outer fold.
first_inner_all_eval = np.concatenate(ada1.eval_idxs[0:3], axis=0)
second_inner_all_eval = np.concatenate(ada1.eval_idxs[4:7], axis=0)
# Within one outer fold, no index is validated on more than once.
assert len(np.unique(first_inner_all_eval)) == len(first_inner_all_eval)
assert len(np.unique(second_inner_all_eval)) == len(second_inner_all_eval)
assert set(first_inner_all_eval).intersection(set(second_inner_all_eval)) == set()   # The model is never evaluated on the same indices twice in the inner loop.

# Train/eval indices of the two outer evaluations (fit calls 3 and 7).
first_outer_all_train = ada1.train_idxs[3]
second_outer_all_train = ada1.train_idxs[7]
first_outer_all_eval = ada1.eval_idxs[3]
second_outer_all_eval = ada1.eval_idxs[7]

# The two outer folds must not share training indices or evaluation indices.
assert set(first_outer_all_train).intersection(set(second_outer_all_train)) == set()
assert set(first_outer_all_eval).intersection(set(second_outer_all_eval)) == set()

Outer fold:   0%|          | 0/2 [00:00<?, ?it/s]

Inner fold:   0%|          | 0/3 [00:00<?, ?it/s]

Layers:   0%|          | 0/5 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Layers:   0%|          | 0/5 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Layers:   0%|          | 0/5 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Layers:   0%|          | 0/5 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Inner fold:   0%|          | 0/3 [00:00<?, ?it/s]

Layers:   0%|          | 0/5 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Layers:   0%|          | 0/5 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Layers:   0%|          | 0/5 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Layers:   0%|          | 0/5 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

Heads:   0%|          | 0/7 [00:00<?, ?it/s]

In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.gaussian_process import GaussianProcessClassifier

# Start from a clean model: remove pre-hooks left over from earlier runs of this cell,
# attach a fresh adapter (with a Gaussian-process probe) to every head, and log the
# activations for all statements in df_concat.
clear_hooks(model)
iti_adapters = setup_iti(model, GaussianProcessClassifier)
compute_activations(model, df_concat)

Computing activations:   0%|          | 0/1204 [00:00<?, ?it/s]

In [None]:
# Cross-validated accuracy of the current probe for every (layer, head), shown as a heat map.
performance_matrix = cross_val_all_iti_adapters(iti_adapters, df_concat, verbose=True)
plot_performance_matrix(performance_matrix)

Using GroupedKFold


CV fold:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluating on groups: [  0   7  11  13  18  26  33  34  39  44  50  57  63  64  69  74  77  80
  84  90  92  97 101 110 123 125 129 134 136 139 144 148 152 156 161 164
 171 173 175 181 183 186 190 195 207 215 220 224 226 229 233 239 247 248
 253 260 263 268 274 280 281 285 290 291 300 305 308 313 319 326 333 335
 340 345 353 355 361 363 368 374 380 383 387 388 393 398 406 412 417 422
 427 433 438 439 444 455 458 459 480 487]


Layers:   0%|          | 0/32 [00:00<?, ?it/s]

Heads:   0%|          | 0/32 [00:00<?, ?it/s]

Heads:   0%|          | 0/32 [00:00<?, ?it/s]

Heads:   0%|          | 0/32 [00:00<?, ?it/s]

Heads:   0%|          | 0/32 [00:00<?, ?it/s]

Heads:   0%|          | 0/32 [00:00<?, ?it/s]

Heads:   0%|          | 0/32 [00:00<?, ?it/s]

Heads:   0%|          | 0/32 [00:00<?, ?it/s]

In [None]:
def change_adapter_classifier_cls(iti_adapters, new_clf_cls, new_clf_kwargs=None):
  """
    Swap the classifier used by every adapter, so the already logged activations can be reused.

    iti_adapters:   nested list `[layer][head]` of adapters.
    new_clf_cls:    classifier class assigned to each adapter's `clf_cls`.
    new_clf_kwargs: keyword arguments assigned to each adapter's `clf_kwargs`;
                    None (the default) means no arguments.

    Only the two attributes are changed; nothing is refitted.
  """
  new_clf_kwargs = {} if new_clf_kwargs is None else new_clf_kwargs
  for iti_layer_adapters in iti_adapters:
    for adapter in iti_layer_adapters:
      adapter.clf_cls = new_clf_cls
      # A separate copy per adapter: editing one adapter's kwargs must not affect the others.
      adapter.clf_kwargs = dict(new_clf_kwargs)

In [None]:
from sklearn.svm import SVC

# Repeat the per-head cross-validation with an SVC probe, reusing the logged activations.
change_adapter_classifier_cls(iti_adapters, SVC)
performance_matrix = cross_val_all_iti_adapters(iti_adapters, df_concat, verbose=True)
plot_performance_matrix(performance_matrix)

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt


# Inspect a single head: layer 9, head 28.
target_adapter = iti_adapters[9][28]

# One row per logged forward pass, paired with the truth labels of df_concat.
X = np.vstack(target_adapter._log)
y = df_concat['isTrue']

n_components = 5

pca = PCA(n_components=n_components)    # Maybe plot this another way? E.g. difference in means, remove that direction, then difference in means again...
X_pca = pca.fit_transform(X)

# Grid of pairwise scatter plots of the principal components, coloured by label.
fig, ax = plt.subplots(n_components, n_components, figsize=(20,20))
for x_component in range(n_components):
  for y_component in range(n_components):
    ax[x_component, y_component].scatter(X_pca[(y==0), x_component], X_pca[(y==0), y_component], label="False", color='tab:blue')
    ax[x_component, y_component].scatter(X_pca[(y==1), x_component], X_pca[(y==1), y_component], label="True", color='tab:orange')
    ax[x_component, y_component].set_title(f'x: {x_component}, y: {y_component}')
plt.legend(loc=(1.04, n_components))

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validated accuracy of a default SVC on the activations in X (from the previous cell).
# NOTE(review): no groups are passed here, so rows of the same politician may end up in both
# the training and the validation part of a fold — confirm this is intended.
estimator = SVC()
cross_val_score(estimator, X, y, scoring='accuracy')

In [None]:
# Nested (5 outer x 5 inner) grouped cross-validation on the real adapters.
# This overwrites the mean_perf / order / mean_inner_accs produced by the TestAdapter self-test.
mean_perf, order, mean_inner_accs = two_layer_cv_for_head_assessment(iti_adapters, df_concat, n_outer_splits=5, n_inner_splits=5)

In [None]:
# Heat map of the inner-CV accuracies per (layer, head), averaged over the outer folds.
plot_performance_matrix(mean_inner_accs)