In [None]:
# Execution-environment switch read by the setup cell below: a falsy value or the
# literal string "false" selects the Google Colab setup; any other value selects the DTU HPC paths.
dtu_hpc = "false"

In [2]:
# Environment setup: pick the data/cache directories (and install dependencies on Colab)
# depending on where the notebook runs. `dtu_hpc` counts as "not HPC" when it is falsy
# or equal to the string "false".
if (not dtu_hpc) or (dtu_hpc == "false"):
  # NOTE(review): `userdata` is imported but not referenced in the visible cells.
  from google.colab import drive, userdata
  import os
  print("Running on Google Colab")
  # Mount Google Drive; datasets and the model cache are read from / written to it.
  drive.mount('/content/drive')
  drive_dir = '/content/drive/My Drive/'
  data_dir = os.path.join(drive_dir, 'Detection-datasets')
  cache_dir = os.path.join(drive_dir, 'model_cache')
  # Install third-party dependencies (bitsandbytes explicitly from the main PyPI index).
  !pip install -q accelerate datasets
  !pip install -q -i https://pypi.org/simple/ bitsandbytes
  !pip install -q seaborn
  !pip install -q optuna
  # NOTE(review): `disable_pbar` is set here but not referenced in the visible cells.
  disable_pbar = False

else:
  import os
  print("Running at DTU HPC")
  # Fixed work directory on the cluster; datasets and the Hugging Face cache live below it.
  drive_dir = '/work3/s184399/msc'
  data_dir = os.path.join(drive_dir, 'Detection-datasets')
  cache_dir = os.path.join(drive_dir, "cache_dir", "huggingface")
  disable_pbar = True

Mounted at /content/drive


# TruthfulQA

In [4]:
from datasets import load_dataset
import pandas as pd

# Flatten TruthfulQA ("generation" config, validation split) into one row per
# (question, answer) pair and save it as `TruthfulQA_prepared.csv` in `data_dir`.
ds = load_dataset('truthful_qa', 'generation')
df = ds['validation'].to_pandas()

# Collect plain records and build the frame once. The previous version concatenated one
# single-row DataFrame per answer and then called `.reindex()` without arguments, which is
# a no-op: every row kept index 0 and the saved CSV got an all-zero index column.
# Building from records yields a proper 0..n-1 index.
records = []
for i in range(len(df)):
    row = df.iloc[i]
    # Correct answers first (isTrue=1), then incorrect ones (isTrue=0), as before.
    for answers, is_true in ((row['correct_answers'], 1), (row['incorrect_answers'], 0)):
        for answer in answers:
            records.append({
                "Statements": f"{row['question']} {answer}.",
                "isTrue": is_true,
                'Type': row['type'],
                'Category': row['category'],
                # All statements derived from the same question share one group id.
                'Group': i,
            })

# Passing `columns` keeps the schema stable even if no records were produced.
TruthfulQA_df = pd.DataFrame(records, columns=['Statements', 'isTrue', 'Type', 'Category', 'Group'])
TruthfulQA_df.to_csv(os.path.join(data_dir, 'TruthfulQA_prepared.csv'))

Downloading readme:   0%|          | 0.00/9.59k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/223k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/817 [00:00<?, ? examples/s]

# Politicians

In [5]:
"""import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

df = pd.read_csv(os.path.join(data_dir, 'politicians_edge_substitutions.csv'))
ord = OrdinalEncoder()
groups = ord.fit_transform(np.array(df['name']).reshape(-1,1))
df['Group'] = groups.astype(np.uint32)
df = df.sort_values(by='Group', axis=0)

n_groups = 1000

selected_rows = []
for i in range(n_groups):
  sel = df[df['Group']==i]
  n_true = sel['isTrue'].sum()
  n_false = (1-sel['isTrue']).sum()
  n_sel = min(n_true, n_false)
  selected_rows.append(sel[sel['isTrue']==0][:n_sel])
  selected_rows.append(sel[sel['isTrue']==1][:n_sel])
politicians_df = pd.concat(selected_rows)
politicians_df = politicians_df.reset_index()
politicians_df.to_csv(os.path.join(data_dir, 'Politicians_prepared.csv'))
politicians_df"""

"import pandas as pd\nfrom sklearn.preprocessing import OrdinalEncoder\nimport numpy as np\n\ndf = pd.read_csv(os.path.join(data_dir, 'politicians_edge_substitutions.csv'))\nord = OrdinalEncoder()\ngroups = ord.fit_transform(np.array(df['name']).reshape(-1,1))\ndf['Group'] = groups.astype(np.uint32)\ndf = df.sort_values(by='Group', axis=0)\n\nn_groups = 1000\n\nselected_rows = []\nfor i in range(n_groups):\n  sel = df[df['Group']==i]\n  n_true = sel['isTrue'].sum()\n  n_false = (1-sel['isTrue']).sum()\n  n_sel = min(n_true, n_false)\n  selected_rows.append(sel[sel['isTrue']==0][:n_sel])\n  selected_rows.append(sel[sel['isTrue']==1][:n_sel])\npoliticians_df = pd.concat(selected_rows)\npoliticians_df = politicians_df.reset_index()\npoliticians_df.to_csv(os.path.join(data_dir, 'Politicians_prepared.csv'))\npoliticians_df"

# Tegmark's datasets
Only the "neg_cities", "common_claim_true_false", and "cities" datasets are used here. Put their CSV files in the `Tegmark` folder inside the data directory.

In [6]:
import os
import pandas as pd

# Rename the columns of every CSV in the `Tegmark` folder to the names used by the rest of
# the notebook and write the result to `data_dir` as `<name>_prepared.csv`.
tegmark_dir = os.path.join(data_dir, 'Tegmark')
# Sorted so the processing order does not depend on the file system's listing order.
for ds in sorted(os.listdir(tegmark_dir)):
  # Skip anything that is not a CSV file (sub-folders, checkpoints, hidden files, ...);
  # previously such entries either crashed `pd.read_csv` or were written back under
  # their unchanged name.
  if not ds.endswith('.csv') or not os.path.isfile(os.path.join(tegmark_dir, ds)):
    continue
  df = pd.read_csv(os.path.join(tegmark_dir, ds))
  # Replace only the trailing extension.
  ds_new_name = ds[:-len('.csv')] + '_prepared.csv'
  df = df.rename({'statement': 'Statements', 'label': 'isTrue'}, axis=1)
  df.to_csv(os.path.join(data_dir, ds_new_name))

# Compute activations

In [7]:
from typing import List
import transformers
from tqdm.notebook import tqdm
import numpy as np
import torch

# Precision in which captured activations are stored.
dtype = np.float16


class Hook:
  """
    Forward pre-hook that records the input of the module it is attached to.

    For every call, the slice at token position `idx` (default: the last token) of the
    single positional input tensor is copied to the CPU, converted to `dtype` and
    appended to `activations`.
  """
  # Inspired by https://github.com/saprmarks/geometry-of-truth/blob/main/generate_acts.py
  def __init__(self):
    self.idx = -1   # The index of the token we look at the internal state for
    self.activations = []

  def __call__(self, module, input, **kwargs):
    # The hooked module must receive exactly one positional argument, and it must be a tensor.
    assert len(input) == 1
    assert isinstance(input[0], torch.Tensor)
    hidden = input[0]
    token_state = hidden[..., self.idx, :]
    captured = token_state.detach().cpu().numpy().astype(dtype)
    self.activations.append(captured)


def clear_hooks(model):
  """Remove every forward pre-hook registered on the `self_attn.o_proj` of each layer in `model.model.layers`."""
  for decoder_layer in model.model.layers:
    registered = decoder_layer.self_attn.o_proj._forward_pre_hooks
    registered.clear()
    # Make sure nothing is left behind on this projection.
    assert len(decoder_layer.self_attn.o_proj._forward_pre_hooks) == 0


def add_hooks(model):
  """
    Attach a fresh `Hook` as forward pre-hook to the `self_attn.o_proj` of every layer in
    `model.model.layers`.

    Returns:
    - (hooks, handles): the Hook objects and the matching removable handles, both in layer order.
  """
  hooks, handles = [], []
  for decoder_layer in model.model.layers:
    layer_hook = Hook()
    layer_handle = decoder_layer.self_attn.o_proj.register_forward_pre_hook(layer_hook)
    hooks.append(layer_hook)
    handles.append(layer_handle)
  return hooks, handles


def compute_activations(statements: List[str], model: torch.nn.Module, tokenizer) -> np.ndarray:
  """
    Run every statement through `model` (one statement per forward pass) and collect, for each
    layer, the input of the attention output projection (`self_attn.o_proj`) at the token
    position recorded by `Hook` (the last token by default).

    Args:
    - statements: Sized iterable of strings (e.g. a list or a pandas Series).
    - model: Model exposing `model.model.layers[i].self_attn` with `o_proj`, `num_heads` and `head_dim`.
    - tokenizer: Object whose `encode(text, return_tensors='pt')` returns the token tensor fed to the
      model (assumed to be of shape [1, seq_len] - TODO confirm for non-Hugging-Face tokenizers).

    Returns:
    - Activations of shape [num_samples, num_layers, num_heads, head_dim]
      (an empty array with that layout when `statements` is empty).
  """
  num_layers = len(model.model.layers)
  num_samples = len(statements)
  num_heads = model.model.layers[0].self_attn.num_heads
  head_dim = model.model.layers[0].self_attn.head_dim

  # Empty forward hooks just in case something happened.
  clear_hooks(model)

  # Nothing to run: np.vstack would fail on an empty list, so return an empty result directly.
  if num_samples == 0:
    return np.empty((0, num_layers, num_heads, head_dim), dtype=dtype)

  # Feed the inputs to wherever the model's first parameter lives instead of hard-coding CUDA.
  device = next(model.parameters()).device

  hooks, handles = add_hooks(model)
  try:
    # Only the hooked activations are needed, so no autograd graph has to be built.
    with torch.no_grad():
      for statement in tqdm(statements, leave=False, desc="Computing activations for each statement"):
        # Open question from the original author: for chat models the statement is encoded as-is,
        # i.e. without an instruction template (<inst> Question </inst> answer).
        tokens = tokenizer.encode(statement, return_tensors='pt').to(device)
        _ = model(tokens)
  finally:
    # Always detach the hooks, also when a forward pass fails.
    for handle in handles:
      handle.remove()

  per_layer = [np.vstack(hook.activations) for hook in hooks]
  activations = np.stack(per_layer, axis=0)                                     # [num_layers, num_samples, n_hidden_dim]

  out = activations.reshape(num_layers, num_samples, num_heads, head_dim)       # [num_layers, num_samples, num_heads, head_dim]
  out = out.transpose(1,0,2,3)                                                  # [num_samples, num_layers, num_heads, head_dim]

  # Layout sanity check on the last sample/layer/head. These indices always exist, unlike the
  # previously hard-coded [2, 4, 3], which raised IndexError for small inputs or small models.
  s, l, h = num_samples - 1, num_layers - 1, num_heads - 1
  assert np.array_equal(out[s, l, h, :], activations[l, s, (h*head_dim):((h+1)*head_dim)], equal_nan=True)
  return out                                                                    # [num_samples, num_layers, num_heads, head_dim]

In [8]:
import torch, gc
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from typing import List, Tuple
import pandas as pd
from tqdm.notebook import tqdm

def compute_activations_on_datasets(model_name: str, datasets: List[Tuple[str, pd.DataFrame]], model_setup_fn=None):
  """
    Load `model_name` once (4-bit quantized) and compute and save the activations for every
    dataset in `datasets`.

    The loop runs over datasets, not models, so the same model does not have to be re-loaded
    multiple times - this is expensive.

    Args:
    - model_name: Model id passed to `from_pretrained`; its last '/'-separated part is used in the file names.
    - datasets: (dataset_name, dataframe) pairs. Each dataframe needs the columns "Statements" and
      "isTrue"; a "Group" column is saved as well when present.
    - model_setup_fn: Optional callable `(model, tokenizer) -> (model, tokenizer)` applied right after loading.

    Side effects:
    - Writes `Activations_{dataset_name}_{model}.npz` to `data_dir` with the entries
      `activations`, `labels` and `group`, and re-reads each file to verify it.
    - Releases the model and the CUDA cache afterwards, also when an error occurs.
  """

  # Load 4 bit quantized model
  quantization_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_compute_dtype=torch.float16,
  )
  model = None
  tokenizer = None
  try:
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", cache_dir=cache_dir, quantization_config=quantization_config, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
    if model_setup_fn is not None:
      model, tokenizer = model_setup_fn(model, tokenizer)

    # Compute activations and save
    for dataset_name, df in tqdm(datasets, leave=False, desc="Datasets"):
      file_name_str = f"Activations_{dataset_name}_{model_name.split('/')[-1]}.npz"
      file_path = os.path.join(data_dir, file_name_str)
      activations = compute_activations(df["Statements"], model, tokenizer)
      # NOTE(review): without a 'Group' column `None` is stored under 'group'; presumably that
      # entry then needs `allow_pickle=True` to be read back - confirm against the consumers.
      np.savez(file_path, activations=activations, labels=df['isTrue'], group=(df['Group'] if 'Group' in df.columns else None))

      # Check if correctly saved (the context manager closes the archive again).
      with np.load(file_path) as saved:
        assert np.all(activations == saved['activations'])
  finally:
    # Clean up model, even if loading or a dataset failed.
    if model is not None:
      model.cpu()
    del model, tokenizer
    # Collect unreachable objects first so their GPU memory can actually be handed back
    # when the CUDA cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()


#compute_activations_on_datasets("meta-llama/Meta-Llama-3-8B", [("Politicians", df_concat_politicians)])

In [9]:
# Note: Chat templates are not used right now, so none are defined here. They are needed for fine-tuning, hence maybe also for this task later on.
def opt_setup_fn(model, tokenizer):
  """
    Patch an OPT model so it exposes the attribute names the activation code relies on (this one is needed!):
    `model.model.layers` is pointed at the decoder layers and every layer's attention gets an
    `o_proj` attribute referring to its `out_proj`.

    Returns:
    - The patched model and the unchanged tokenizer.
  """
  decoder_layers = model.model.decoder.layers
  model.model.layers = decoder_layers
  for decoder_layer in model.model.layers:
    attention = decoder_layer.self_attn
    attention.o_proj = attention.out_proj
  return model, tokenizer

In [None]:
# Prepared CSVs to run through every model; the dataset name is the file name without the
# "_prepared.csv" suffix. (Alternative: pick up every '.csv' file found via os.listdir(data_dir).)
prepared_files = [
  "TruthfulQA_prepared.csv",
  "Politicians_prepared.csv",
  "neg_cities_prepared.csv",
  "common_claim_true_false_prepared.csv",
  "cities_prepared.csv",
]

datasets = []
for ds_file_name in prepared_files:
  prepared_df = pd.read_csv(os.path.join(data_dir, ds_file_name))
  # Keep at most the first 4000 rows of each dataset.
  if len(prepared_df) > 4000:
    prepared_df = prepared_df[0:4000]
  datasets.append((ds_file_name.replace('_prepared.csv', ''), prepared_df))

models = [
  "facebook/opt-125m",
  "facebook/opt-350m",
  "facebook/opt-2.7b",
  "meta-llama/Llama-2-7b-chat-hf",
  "microsoft/Phi-3-mini-4k-instruct",
  "meta-llama/Meta-Llama-3-8B",
]
# Setup function per model, aligned by position with `models`.
# Remember model setup function! For opt: Set model.model.layers = model.model.decoder.layers
model_setup_fns = [opt_setup_fn, opt_setup_fn, opt_setup_fn, None, None, None]

for model, model_setup_fn in tqdm(list(zip(models, model_setup_fns)), leave=False, desc="Models"):
  compute_activations_on_datasets(model, datasets, model_setup_fn=model_setup_fn)

Models:   0%|          | 0/6 [00:00<?, ?it/s]

Datasets:   0%|          | 0/5 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/1496 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/1496 [00:00<?, ?it/s]

Datasets:   0%|          | 0/5 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/1496 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/1496 [00:00<?, ?it/s]

Datasets:   0%|          | 0/5 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/1496 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/1496 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Datasets:   0%|          | 0/5 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/1496 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/1496 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/931 [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Datasets:   0%|          | 0/5 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/4000 [00:00<?, ?it/s]



Computing activations for each statement:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/1496 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/1496 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Datasets:   0%|          | 0/5 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/4000 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/1496 [00:00<?, ?it/s]

Computing activations for each statement:   0%|          | 0/4000 [00:00<?, ?it/s]

In [None]:
# Show the names of the datasets that were processed.
[dataset_name for dataset_name, _frame in datasets]

In [None]:
# Release the Colab runtime once everything has finished. Uses the same check as the setup
# cell: `google.colab` only exists on Colab, so an unconditional import would fail on DTU HPC.
if (not dtu_hpc) or (dtu_hpc == "false"):
  from google.colab import runtime
  runtime.unassign()  # Closes runtime.