In [2]:
import sys
import os
import pandas as pd
import numpy as np
import itertools
sys.path.append('..')
sys.path.append('../src')
sys.path.append('../data')
# from Src.config import hf_access_token, hf_model_cache_dir # noqa: E402
# os.environ["HF_HOME"] = hf_model_cache_dir 
# import torch

%load_ext autoreload
%autoreload 2

Load the model "gpt2" or "EleutherAI/pythia-6.9b"

In [3]:
from Src.model import ModelFactory
model = ModelFactory.create("gpt2")




Loaded pretrained model gpt2 into HookedTransformer


Then load the dataset (Warning, select the right dataset for the model you loaded). Load also the ablator class


In [4]:
from Src.dataset import BaseDataset
from Src.experiment import Ablator


dataset = BaseDataset(path = "../data/full_data_sampled_gpt2.json",
                      model = model,
                      experiment="copyVSfact",
                      no_subject=True)
ablator = Ablator(model=model, dataset=dataset, experiment="copyVSfact", batch_size=20)

[91m No subject found in the dataset [0m, proceeding with no subject data


Tokenizing and computing lengths: 100%|██████████| 10000/10000 [00:44<00:00, 227.10it/s]


Set the heads that you want to modify, the value to multiply the heads and the token position that you want to modify (all, or attribute):

In [5]:
cofa_heads = [(7, 10), (9, 9), (9, 6), (10, 0)]
fa_heads = [(10, 7), (11, 10), (11, 3)]

betas = [1, 0]
alphas = [1, 5]

In [58]:
# ablator.set_heads(heads=[(10, 3)], value=2, position='attribute')
# print(ablator.model.model.hook_dict.values())


Run the attention modification:


- mem --> logit of the factual token


- cp --> logit of the counterfactual token

- mem win --> number of factual predictions

- cp win --> number of counterfactual predictions

In [65]:
result_boost = []
for alpha in alphas:
    # all subsets of fa heads
    for fa_max in range(1, len(fa_heads)+1):
        for fa_heads_subset in itertools.combinations(fa_heads, fa_max):
            print(f'alpha: {alpha}, fa_heads_subset: {fa_heads_subset}')
            ablator.set_heads(heads=list(fa_heads_subset), value=alpha, position='attribute')
            
            cur_df = ablator.run()
            cur_df['alpha'] = alpha
            cur_df['heads'] = str(fa_heads_subset)
            cur_df['experiment'] = 'boost'
            result_boost.append(cur_df)

result_boost = pd.concat(result_boost)


alpha: 1, fa_heads_subset: ((10, 7),)


Ablating:   0%|          | 0/24 [00:00<?, ?it/s]

Ablating: 100%|██████████| 24/24 [01:19<00:00,  3.30s/it]


alpha: 1, fa_heads_subset: ((11, 10),)


Ablating: 100%|██████████| 24/24 [01:19<00:00,  3.31s/it]


alpha: 1, fa_heads_subset: ((11, 3),)


Ablating: 100%|██████████| 24/24 [01:19<00:00,  3.32s/it]


alpha: 1, fa_heads_subset: ((10, 7), (11, 10))


Ablating: 100%|██████████| 24/24 [01:51<00:00,  4.65s/it]


alpha: 1, fa_heads_subset: ((10, 7), (11, 3))


Ablating: 100%|██████████| 24/24 [01:51<00:00,  4.64s/it]


alpha: 1, fa_heads_subset: ((11, 10), (11, 3))


Ablating: 100%|██████████| 24/24 [01:52<00:00,  4.67s/it]


alpha: 1, fa_heads_subset: ((10, 7), (11, 10), (11, 3))


Ablating: 100%|██████████| 24/24 [02:24<00:00,  6.03s/it]


alpha: 5, fa_heads_subset: ((10, 7),)


Ablating: 100%|██████████| 24/24 [01:21<00:00,  3.41s/it]


alpha: 5, fa_heads_subset: ((11, 10),)


Ablating: 100%|██████████| 24/24 [01:20<00:00,  3.36s/it]


alpha: 5, fa_heads_subset: ((11, 3),)


Ablating: 100%|██████████| 24/24 [01:25<00:00,  3.54s/it]


alpha: 5, fa_heads_subset: ((10, 7), (11, 10))


Ablating: 100%|██████████| 24/24 [01:55<00:00,  4.83s/it]


alpha: 5, fa_heads_subset: ((10, 7), (11, 3))


Ablating: 100%|██████████| 24/24 [01:57<00:00,  4.88s/it]


alpha: 5, fa_heads_subset: ((11, 10), (11, 3))


Ablating: 100%|██████████| 24/24 [02:06<00:00,  5.29s/it]


alpha: 5, fa_heads_subset: ((10, 7), (11, 10), (11, 3))


Ablating: 100%|██████████| 24/24 [02:43<00:00,  6.81s/it]


In [66]:
print(result_boost) 

         mem         cp      diff   mem_std    cp_std  diff_std  mem_win  \
0  13.981473  17.561539 -3.580064  1.674214  2.685192  2.665300    413.0   
0  13.981473  17.561539 -3.580064  1.674214  2.685192  2.665300    413.0   
0  13.981473  17.561539 -3.580064  1.674214  2.685192  2.665300    413.0   
0  13.981473  17.561539 -3.580064  1.674214  2.685192  2.665300    413.0   
0  13.981473  17.561539 -3.580064  1.674214  2.685192  2.665300    413.0   
0  13.981473  17.561539 -3.580064  1.674214  2.685192  2.665300    413.0   
0  13.981473  17.561539 -3.580064  1.674214  2.685192  2.665300    413.0   
0  14.456228  16.385796 -1.929568  1.665862  2.691089  2.686700   2392.0   
0  13.594149  15.910871 -2.316722  1.651094  2.787582  2.762573   1864.0   
0  13.966662  17.532597 -3.565933  1.673361  2.688501  2.663716    421.0   
0  13.857364  14.130378 -0.273015  1.666047  2.702236  2.729101   5014.0   
0  14.439142  16.357597 -1.918455  1.665223  2.691837  2.685108   2419.0   
0  13.578178

In [68]:
result_surpress = []
# for beta in betas:
    # all subsets of cofa heads
for cofa_max in range(1, len(cofa_heads)+1):
    for cofa_heads_subset in itertools.combinations(cofa_heads, cofa_max):
        print(f'beta: {0}, cofa_heads_subset: {cofa_heads_subset}')
        ablator.set_heads(heads=list(cofa_heads_subset), value=0, position='attribute')
        cur_df = ablator.run()
        cur_df['beta'] = 0
        cur_df['heads'] = str(cofa_heads_subset)
        cur_df['experiment'] = 'surpress'
        result_surpress.append(cur_df)

result_surpress = pd.concat(result_surpress)

beta: 0, cofa_heads_subset: ((7, 10),)


Ablating: 100%|██████████| 24/24 [01:20<00:00,  3.36s/it]


beta: 0, cofa_heads_subset: ((9, 9),)


Ablating: 100%|██████████| 24/24 [01:23<00:00,  3.50s/it]


beta: 0, cofa_heads_subset: ((9, 6),)


Ablating: 100%|██████████| 24/24 [01:21<00:00,  3.41s/it]


beta: 0, cofa_heads_subset: ((10, 0),)


Ablating: 100%|██████████| 24/24 [01:20<00:00,  3.35s/it]


beta: 0, cofa_heads_subset: ((7, 10), (9, 9))


Ablating: 100%|██████████| 24/24 [01:52<00:00,  4.70s/it]


beta: 0, cofa_heads_subset: ((7, 10), (9, 6))


Ablating: 100%|██████████| 24/24 [01:52<00:00,  4.70s/it]


beta: 0, cofa_heads_subset: ((7, 10), (10, 0))


Ablating: 100%|██████████| 24/24 [01:53<00:00,  4.74s/it]


beta: 0, cofa_heads_subset: ((9, 9), (9, 6))


Ablating: 100%|██████████| 24/24 [01:53<00:00,  4.74s/it]


beta: 0, cofa_heads_subset: ((9, 9), (10, 0))


Ablating: 100%|██████████| 24/24 [01:52<00:00,  4.71s/it]


beta: 0, cofa_heads_subset: ((9, 6), (10, 0))


Ablating: 100%|██████████| 24/24 [01:51<00:00,  4.64s/it]


beta: 0, cofa_heads_subset: ((7, 10), (9, 9), (9, 6))


Ablating: 100%|██████████| 24/24 [02:18<00:00,  5.78s/it]


beta: 0, cofa_heads_subset: ((7, 10), (9, 9), (10, 0))


Ablating: 100%|██████████| 24/24 [02:25<00:00,  6.08s/it]


beta: 0, cofa_heads_subset: ((7, 10), (9, 6), (10, 0))


Ablating: 100%|██████████| 24/24 [02:45<00:00,  6.89s/it]


beta: 0, cofa_heads_subset: ((9, 9), (9, 6), (10, 0))


Ablating: 100%|██████████| 24/24 [03:23<00:00,  8.47s/it]


beta: 0, cofa_heads_subset: ((7, 10), (9, 9), (9, 6), (10, 0))


Ablating: 100%|██████████| 24/24 [02:56<00:00,  7.34s/it]


In [69]:
data = pd.concat([result_boost, result_surpress])
data.to_csv('results_boost_surpress.csv', index=False)
data

Unnamed: 0,mem,cp,diff,mem_std,cp_std,diff_std,mem_win,cp_win,alpha,heads,experiment,beta
0,13.981473,17.561539,-3.580064,1.674214,2.685192,2.6653,413.0,9547.0,1.0,"((10, 7),)",boost,
0,13.981473,17.561539,-3.580064,1.674214,2.685192,2.6653,413.0,9547.0,1.0,"((11, 10),)",boost,
0,13.981473,17.561539,-3.580064,1.674214,2.685192,2.6653,413.0,9547.0,1.0,"((11, 3),)",boost,
0,13.981473,17.561539,-3.580064,1.674214,2.685192,2.6653,413.0,9547.0,1.0,"((10, 7), (11, 10))",boost,
0,13.981473,17.561539,-3.580064,1.674214,2.685192,2.6653,413.0,9547.0,1.0,"((10, 7), (11, 3))",boost,
0,13.981473,17.561539,-3.580064,1.674214,2.685192,2.6653,413.0,9547.0,1.0,"((11, 10), (11, 3))",boost,
0,13.981473,17.561539,-3.580064,1.674214,2.685192,2.6653,413.0,9547.0,1.0,"((10, 7), (11, 10), (11, 3))",boost,
0,14.456228,16.385796,-1.929568,1.665862,2.691089,2.6867,2392.0,7495.0,5.0,"((10, 7),)",boost,
0,13.594149,15.910871,-2.316722,1.651094,2.787582,2.762573,1864.0,8009.0,5.0,"((11, 10),)",boost,
0,13.966662,17.532597,-3.565933,1.673361,2.688501,2.663716,421.0,9540.0,5.0,"((11, 3),)",boost,


In [74]:
best_boost = [((10, 7), (11, 10), (11, 3)), ((10, 7), (11, 10))]
best_surpress = [((7, 10), (9, 9), (9, 6), (10, 0)), ((7, 10), (9, 9), (10, 0)), ((9, 9), (9, 6), (10, 0))]
combined_result = []
for bost in best_boost:
    for surpress in best_surpress:
        ablator.set_heads(heads=list(bost), value=5, position='attribute')
        ablator.set_heads(heads=list(surpress), value=0, position='attribute', reset=False)
        cur_df = ablator.run()
        cur_df['experiment'] = 'combined'
        cur_df['heads_boost'] = str(bost)
        cur_df['beta'] = 0
        cur_df['alpha'] = 5
        cur_df['heads_surpress'] = str(surpress)
        combined_result.append(cur_df)

combined_result = pd.concat(combined_result)

Ablating: 100%|██████████| 24/24 [05:01<00:00, 12.57s/it]
Ablating: 100%|██████████| 24/24 [04:34<00:00, 11.46s/it]
Ablating: 100%|██████████| 24/24 [04:32<00:00, 11.37s/it]
Ablating: 100%|██████████| 24/24 [04:27<00:00, 11.16s/it]
Ablating: 100%|██████████| 24/24 [03:12<00:00,  8.02s/it]
Ablating: 100%|██████████| 24/24 [03:12<00:00,  8.00s/it]


In [75]:
data = pd.concat([result_boost, result_surpress, combined_result])
data.to_csv('results_boost_surpress.csv', index=False)
data

Unnamed: 0,mem,cp,diff,mem_std,cp_std,diff_std,mem_win,cp_win,alpha,heads,experiment,beta,heads_boost,heads_surpress
0,13.981473,17.561539,-3.580064,1.674214,2.685192,2.6653,413.0,9547.0,1.0,"((10, 7),)",boost,,,
0,13.981473,17.561539,-3.580064,1.674214,2.685192,2.6653,413.0,9547.0,1.0,"((11, 10),)",boost,,,
0,13.981473,17.561539,-3.580064,1.674214,2.685192,2.6653,413.0,9547.0,1.0,"((11, 3),)",boost,,,
0,13.981473,17.561539,-3.580064,1.674214,2.685192,2.6653,413.0,9547.0,1.0,"((10, 7), (11, 10))",boost,,,
0,13.981473,17.561539,-3.580064,1.674214,2.685192,2.6653,413.0,9547.0,1.0,"((10, 7), (11, 3))",boost,,,
0,13.981473,17.561539,-3.580064,1.674214,2.685192,2.6653,413.0,9547.0,1.0,"((11, 10), (11, 3))",boost,,,
0,13.981473,17.561539,-3.580064,1.674214,2.685192,2.6653,413.0,9547.0,1.0,"((10, 7), (11, 10), (11, 3))",boost,,,
0,14.456228,16.385796,-1.929568,1.665862,2.691089,2.6867,2392.0,7495.0,5.0,"((10, 7),)",boost,,,
0,13.594149,15.910871,-2.316722,1.651094,2.787582,2.762573,1864.0,8009.0,5.0,"((11, 10),)",boost,,,
0,13.966662,17.532597,-3.565933,1.673361,2.688501,2.663716,421.0,9540.0,5.0,"((11, 3),)",boost,,,


In [32]:
data.to_csv("../data/ablation_results.csv", index=False)

In [8]:
individual = []
ablator.set_heads(heads=[(10, 0)], value=0, position="attribute") #counter
individual.append(ablator.run())


ablator.set_heads(heads=[(11, 3)], value=5, position="attribute") #fact
individual.append(ablator.run())


ablator.set_heads(heads=[(10, 0)], value=0, position="attribute")
ablator.set_heads(heads=[(11, 3)], value=5, position="attribute", reset=False)
individual.append(ablator.run())

individual_data = pd.concat(individual)
individual_data["experiment"] = "individual"
individual_data["fa_alpha"] = [1, 5, 5]
individual_data["cofa_alpha"] = [0, 1, 0]
individual_data['percentage'] = individual_data['mem_win'] / (individual_data['mem_win'] + individual_data['cp_win']).astype(float)

individual_data



Ablating: 100%|██████████| 24/24 [01:14<00:00,  3.12s/it]
Ablating: 100%|██████████| 24/24 [01:14<00:00,  3.09s/it]
Ablating: 100%|██████████| 24/24 [01:46<00:00,  4.42s/it]


Unnamed: 0,mem,cp,diff,mem_std,cp_std,diff_std,mem_win,cp_win,experiment,fa_alpha,cofa_alpha,percentage
0,14.5323,17.067745,-2.535445,1.582479,2.740251,2.591399,1274.0,8660.0,individual,1,0,0.128246
0,13.966662,17.532597,-3.565933,1.673361,2.688501,2.663716,421.0,9540.0,individual,5,1,0.042265
0,14.5096,17.066998,-2.557397,1.58819,2.716855,2.574206,1205.0,8733.0,individual,5,0,0.121252


## Random boosting

You can pass to the model your own modification function. You can pass a list of hook (string, function) to the model using set_hooks method. The hooks should be consistent with the hook of the transformer lens library (https://neelnanda-io.github.io/TransformerLens/index.html)

In [7]:
import numpy as np
np.random.seed(34)
random_early_heads = np.random.choice(range(5), (10, 2))

random_late_heads = np.random.choice(range(8, 13), (10, 2))


print(random_early_heads)
print(random_late_heads)

random = []
for head in random_early_heads:
    ablator.set_heads(heads=[head], value=5, position="attribute") #fact
    df = ablator.run()
    df["experiment"] = "random"
    df["heads"] = str(head)
    random.append(df)

for head in random_late_heads:
    ablator.set_heads(heads=[head], value=5, position="attribute") #fact
    df = ablator.run()
    df["experiment"] = "random"
    df["heads"] = str(head)
    random.append(df)

random_data = pd.concat(random)
random_data["fa_alpha"] = 5
random_data["cofa_alpha"] = 0

    

[[1 2]
 [2 1]
 [4 3]
 [3 2]
 [2 1]
 [0 4]
 [0 4]
 [3 3]
 [0 0]
 [2 4]]
[[ 8 10]
 [12 10]
 [10 10]
 [10 11]
 [10  9]
 [10  8]
 [10 10]
 [ 8 11]
 [12  8]
 [12 12]]


Ablating:   0%|          | 0/24 [00:00<?, ?it/s]


TypeError: HookedTransformer.forward() got an unexpected keyword argument 'ruturn_type'