In [None]:
import os
import sys

import torch.utils.data

# Expose only GPU 0 to this process. NOTE(review): this has to happen before
# CUDA is first initialised to take effect — confirm nothing touches CUDA earlier.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
# Make modules in the `sae_visualizer` directory importable (relative path,
# so it is resolved against the current working directory).
sys.path.append("sae_visualizer")

In [None]:
import sys
# NOTE(review): hard-coded absolute paths under /tmp — they only resolve on the
# machine this notebook was written on. TODO: derive them from a configurable
# project root instead.
sys.path.append('/tmp/pycharm_project_349/ioi_subspaces')
sys.path.append('/tmp/pycharm_project_451')
from new_codebase import Circuit
from load_sae import load_head_sae, is_available


# Circuit object whose `heads` / `nodes` are read by the SAE-loading cells below.
c = Circuit()

In [None]:
def load_name_mover_sae(c):
    """Load the SAEs attached to the name-mover heads of a circuit.

    The ``zs['end']`` node of every head in ``c.heads`` whose ``head_class`` is
    ``'nm'`` is collected. Each collected node accepted by ``is_available`` is
    printed, its SAE is loaded with ``load_head_sae`` (sanity check disabled)
    and the SAE's gradients are switched off.

    Returns:
        ``(ioi_nodes, saes)`` — the list of all name-mover nodes (including
        those for which no SAE is available) and a dict mapping each available
        node to its SAE.
    """
    ioi_nodes = []
    for _, head_data in c.heads.items():
        if head_data.head_class == 'nm':
            ioi_nodes.append(head_data.zs['end'])

    saes = {}
    for node in ioi_nodes:
        if not is_available(node):
            continue
        print(node)
        sae = load_head_sae(node.layer, node.head, node.component_name, perform_sanity_check=False)
        # turn off gradients
        sae.requires_grad_(False)
        saes[node] = sae
    return ioi_nodes, saes

In [None]:
def load_saes(c, project_name='sae-all-ioi-heads', excluded_d_hidden=1000):
    """Load and freeze the SAE of every circuit node that has one available.

    Args:
        c: object exposing an iterable ``nodes``; each node provides
            ``component_name``, ``layer`` and ``head``.
        project_name: project the SAEs are looked up in and loaded from
            (forwarded to ``is_available`` and ``load_head_sae``).
        excluded_d_hidden: SAEs whose ``cfg['d_hidden']`` equals this value are
            loaded but then discarded. Pass ``None`` to keep every SAE.
            NOTE(review): the reason width 1000 is excluded by default is not
            visible in this notebook — confirm.

    Returns:
        ``(ioi_nodes, saes)`` — the nodes whose SAE was kept, in ``c.nodes``
        order, and a dict mapping each of them to its SAE with gradients off.
    """
    saes = {}
    ioi_nodes = []
    for node in c.nodes:
        if not is_available(node, project_name=project_name):
            continue
        print(node)
        sae = load_head_sae(node.layer, node.head, node.component_name,
                            perform_sanity_check=False, project_name=project_name)
        if sae.cfg['d_hidden'] == excluded_d_hidden:
            continue
        # turn off gradients
        sae.requires_grad_(False)
        ioi_nodes.append(node)
        saes[node] = sae
    return ioi_nodes, saes
# Module-level node list and node -> SAE mapping used by all cells below.
ioi_nodes, saes = load_saes(c)

In [None]:
from transformer_lens import HookedTransformer

# GPT-2 small on the GPU, with gradients switched off for all parameters.
model = HookedTransformer.from_pretrained('gpt2-small', device='cuda')
model.requires_grad_(False)
# Presumably here so the cell does not echo the return value of the call above — confirm.
print('')

# Load IOI dataset

In [None]:
from ioi_utils import PromptDataset, PromptDistribution, NAMES, PLACES, OBJECTS, PREFIXES, TEMPLATES


def sample_ioi(model, prompt_distribution, patterns, samples_per_combination: int):
    """Sample an IOI prompt dataset with a fixed number of prompts per pattern.

    For each pattern in ``patterns`` (in the given order),
    ``samples_per_combination`` prompts are drawn with
    ``prompt_distribution.sample_one(pattern, model)``. All prompts are wrapped
    in a single ``PromptDataset``, which is returned.
    """
    prompts = [
        prompt_distribution.sample_one(pattern, model)
        for pattern in patterns
        for _ in range(samples_per_combination)
    ]
    return PromptDataset(prompts, model)


# Prompts sampled per pattern for each split, prompts per batch, and the
# prompt patterns both splits are sampled from.
SAMPLES_PER_PATTERN = 10000
IOI_BATCH_SIZE = 10
IOI_PATTERNS = ['ABB', 'BAB']


def make_sequential_loader(dataset, batch_size=IOI_BATCH_SIZE):
    """Return a DataLoader that walks `dataset` in order, one index batch per step.

    The BatchSampler is passed as `sampler` together with `batch_size=None`.
    This disables the DataLoader's automatic batching, so the dataset is indexed
    with a whole list of indices at once and builds the batch itself. The last,
    possibly smaller, batch is kept (`drop_last=False`).
    """
    batch_sampler = torch.utils.data.BatchSampler(
        torch.utils.data.SequentialSampler(dataset),
        batch_size=batch_size,
        drop_last=False,
    )
    return torch.utils.data.DataLoader(
        dataset,
        sampler=batch_sampler,
        batch_size=None,
    )


ioi_distribution = PromptDistribution(
    prefix_len=2,
    templates=TEMPLATES[:2],
    names=NAMES,
    places=PLACES,
    objects=OBJECTS,
    prefixes=PREFIXES
)

ioi_dataset = sample_ioi(model, ioi_distribution, IOI_PATTERNS, SAMPLES_PER_PATTERN)
ioi_loader = make_sequential_loader(ioi_dataset)

# for test set, we need the same names etc, just different combinations
# NOTE(review): no random seed is set in the visible cells, so both splits
# change on every run and individual prompts may occur in both — confirm that
# this is acceptable.
ioi_test_dataset = sample_ioi(model, ioi_distribution, IOI_PATTERNS, SAMPLES_PER_PATTERN)
ioi_test_loader = make_sequential_loader(ioi_test_dataset)

# Calculate alive and ioi features

In [None]:
from autoscoring import get_alive_neurons
from IPython.display import clear_output


# Per node: the indices returned by get_alive_neurons for its SAE, and how many
# of them there are.
features_alive = {node: get_alive_neurons(encoder, model) for node, encoder in saes.items()}
n_alive = {node: alive_idx.numel() for node, alive_idx in features_alive.items()}
# wipe the output produced while computing, so only the counts are shown
clear_output()
n_alive

In [None]:
# save features_alive
import pickle

# Cache the alive-feature indices on disk so they can be reloaded without recomputing.
with open('../data/features_alive.pkl', 'wb') as f:
    pickle.dump(features_alive, f)

In [None]:
# load features_alive
import pickle

# NOTE(review): pickle.load can execute arbitrary code — only load files that
# were written by this notebook.
with open('../data/features_alive.pkl', 'rb') as f:
    features_alive = pickle.load(f)

In [None]:
from autoscoring import get_ioi_neurons
# Per node: the indices returned by get_ioi_neurons for its SAE on the IOI
# loader (limited to 1000 batches), and how many of them there are.
ioi_neurons_idx = {
    node: get_ioi_neurons(encoder, model, ioi_loader, node, max_batches=1000)
    for node, encoder in saes.items()
}
n_ioi = {node: ioi_idx.numel() for node, ioi_idx in ioi_neurons_idx.items()}
n_ioi

In [None]:
import pickle

# Cache the IOI feature indices on disk so they can be reloaded without recomputing.
with open('../data/ioi_neurons_idx.pkl', 'wb') as f:
    pickle.dump(ioi_neurons_idx, f)

In [None]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code — only load files that
# were written by this notebook.
with open('../data/ioi_neurons_idx.pkl', 'rb') as f:
    ioi_neurons_idx = pickle.load(f)

# Extract feature activations and count them

In [None]:
import torch
from tqdm import tqdm
from research.autoscoring import run_with_cache, LabeledTensor


# Labels of the statistics tensor filled by get_actv_counts:
#   'count' - number of prompts on which a feature was active (> 0)
#   'sum'   - summed feature activation
#   'total' - number of prompts seen
measures = ['count', 'sum', 'total', ]
patterns = ['ABB', 'BAB']
roles = ['s', 'io']
# The feature axis is sized for the widest SAE among the nodes.
num_neurons = max([saes[node].cfg['d_hidden'] for node in ioi_nodes])
# Axis order: (measure, node, pattern, role, name, feature)
shape = (len(measures), len(ioi_nodes), len(patterns), len(roles), len(NAMES), num_neurons)


@torch.no_grad()
def get_actv_counts(model, ioi_loader, saes, ioi_nodes, shape):
    """Accumulate per-name SAE feature statistics over an IOI data loader.

    For every batch, the activations of all ``ioi_nodes`` are obtained with
    ``run_with_cache`` and encoded by the node's SAE. For each prompt, the
    resulting feature vector is added to the entries selected by the prompt's
    pattern and by its subject name (role ``'s'``) and indirect-object name
    (role ``'io'``):

    * ``'count'`` += 1 for every feature with activation > 0,
    * ``'sum'``   += the feature activations,
    * ``'total'`` += 1.

    The axis labels are taken from the module-level ``measures``, ``patterns``,
    ``roles`` and ``NAMES``.

    Returns:
        The filled ``LabeledTensor`` of shape ``shape`` (created on the CPU).
    """
    actv_counts = LabeledTensor(shape, measures=measures, ioi_nodes=ioi_nodes, patterns=patterns,
                                roles=roles, names=NAMES, device='cpu')
    for batch in tqdm(ioi_loader):
        cache = run_with_cache(model, batch, ioi_nodes)
        for node_idx, node in enumerate(ioi_nodes):
            # cache entries follow the order of ioi_nodes; (batch, neurons) per the original note
            feature_actvs = saes[node].encoder(cache[node_idx])
            fired = feature_actvs > 0
            for prompt_idx, prompt in enumerate(batch.prompts):
                pattern = prompt.pattern
                fired_row = fired[prompt_idx].int().cpu()
                actv_row = feature_actvs[prompt_idx].cpu()
                # the same per-prompt vectors are credited to both names of the prompt
                for role, name in (('s', prompt.s_name), ('io', prompt.io_name)):
                    actv_counts['count', node, pattern, role, name] += fired_row
                    actv_counts['sum', node, pattern, role, name] += actv_row
                    actv_counts['total', node, pattern, role, name] += 1

    return actv_counts

In [None]:
# Accumulate the feature statistics separately for the train and the test loader.
actv_counts = get_actv_counts(model, ioi_loader, saes, ioi_nodes, shape)
test_actv_counts = get_actv_counts(model, ioi_test_loader, saes, ioi_nodes, shape)

# Persist both so the cell below can reload them without re-running the model.
actv_counts.save('../data/feature_actvs.pt')
test_actv_counts.save('../data/test_feature_actvs.pt')

In [None]:
# Reload the statistics saved by the previous cell.
actv_counts = LabeledTensor.load('../data/feature_actvs.pt')
test_actv_counts = LabeledTensor.load('../data/test_feature_actvs.pt')

# Feature Scoring based on activation counts
      

In [None]:
from research.autoscoring import get_genders


# Gender information for NAMES, passed as `is_male` to the gender scoring functions below.
# Presumably one male/female indicator per name, aligned with NAMES — TODO confirm.
is_male = get_genders(NAMES)

In [None]:
import pandas as pd


# One row per IOI node. The rows are collected first and the frame is built in
# a single call instead of being grown row by row, and the id string is no
# longer stored in a module-level variable that shadows the builtin `id`.
component_rows = []
for node in ioi_nodes:
    component_rows.append({
        'component': node.component_name,
        'layer': node.layer,
        'head': node.head,
        # built from component, layer and head only, so nodes that differ
        # only in sequence position share the same id
        'id': f'({node.component_name}, {node.layer}, {node.head})',
        'seq_pos': node.seq_pos,
        'node': node,
    })
ioi_components = pd.DataFrame(component_rows, columns=['component', 'layer', 'head', 'id', 'seq_pos', 'node'])
ioi_components

In [None]:
from functools import partial
import pandas as pd
from tqdm.auto import tqdm

from research.autoscoring import io_name_score, s_name_score, contains_name_score, first_name_score, second_name_score, name_x_pos, gender_x_role_score, name_score, io_pos_score, gender_score, context_position_score, name_x_context_pos_score


# Registers the `progress_apply` methods on pandas objects (used further below).
tqdm.pandas()


# these feature types are only well-defined over s2 and end nodes
# you can't calculate them on io or s1 nodes because their sequence position isn't fixed which is an assumption that these functions make
#
# Maps a feature-type label to the scoring function used for it. The insertion
# order is the order in which describe_features appends the scores.
name_feature_types = {
    'io_name': io_name_score,
    's_name': s_name_score,
    'contains_name': contains_name_score,
    'first_name': first_name_score,
    'second_name': second_name_score,
    # name-by-position types: each fixes one pattern and one role
    'first_name_S': partial(name_x_pos, pattern='BAB', role='s'),
    'second_name_IO': partial(name_x_pos, pattern='BAB', role='io'),
    'first_name_IO': partial(name_x_pos, pattern='ABB', role='io'),
    'second_name_S': partial(name_x_pos, pattern='ABB', role='s'),
    # gender-by-role types
    's_is_male': partial(gender_x_role_score, role='s', gender='M', is_male=is_male),
    's_is_female': partial(gender_x_role_score, role='s', gender='F', is_male=is_male),
    'io_is_male': partial(gender_x_role_score, role='io', gender='M', is_male=is_male),
    'io_is_female': partial(gender_x_role_score, role='io', gender='F', is_male=is_male),

}

def describe_features(df):
    """Score every SAE feature of one component against each candidate description.

    Meant to be applied per group of ``ioi_components.groupby('id')``: ``df``
    holds the rows (one per node / sequence position) sharing a component id.

    For each feature index and each node at sequence position ``'s2'`` or
    ``'end'``:

    * a zero-scored ``'dead'`` row is emitted if the feature is not in
      ``features_alive[node]``;
    * otherwise a zero-scored ``'not_ioi'`` row is emitted if it is not in
      ``ioi_neurons_idx[node]``;
    * otherwise the feature is scored with every entry of
      ``name_feature_types``, with ``io_pos_score`` for each pattern and with
      ``gender_x_role_score`` for each role/gender pair.

    If any row has ``'io'`` in its ``seq_pos``, each feature is additionally
    scored on the ``'io'``/``'s1'`` node pair (name, gender, context position
    and name-by-context-position); those records are attributed to the io node.

    Reads the module-level ``saes``, ``ioi_nodes``, ``features_alive``,
    ``ioi_neurons_idx``, ``actv_counts``, ``test_actv_counts``, ``patterns``,
    ``roles``, ``is_male`` and ``name_feature_types``.

    Returns:
        A DataFrame built from all collected score records.
    """
    num_neurons = max([saes[node].cfg['d_hidden'] for node in ioi_nodes])
    component_id = df['id'].iloc[0]
    feature_descriptions = []

    def placeholder(node, feature_id, feature_type):
        # zero-scored record for a feature that is not scored at all
        return {'node': node, 'feature_id': feature_id, 'feature_type': feature_type, 'topk': 0, 'names': '',
                'recall': 0, 'precision': 0, 'f_score': 0, 'component': component_id}

    def record(score, node, feature_id, feature_type=None):
        # Tag a score object and store it. Without an explicit feature_type the
        # key is left untouched (presumably the scoring function sets it itself).
        score['node'] = node
        score['feature_id'] = feature_id
        if feature_type is not None:
            score['feature_type'] = feature_type
        score['component'] = component_id
        feature_descriptions.append(score)

    def train_stats(node, feature_id):
        return actv_counts[:, node, :, :, :, feature_id]

    def test_stats(node, feature_id):
        return test_actv_counts[:, node, :, :, :, feature_id]

    def train_stats_all_nodes(feature_id):
        return actv_counts[:, :, :, :, :, feature_id]

    def test_stats_all_nodes(feature_id):
        return test_actv_counts[:, :, :, :, :, feature_id]

    for feature_id in range(num_neurons):
        fixed_position_rows = df[df.seq_pos.isin(['s2', 'end'])]
        for _, component_row in fixed_position_rows.iterrows():
            node = component_row['node']

            # DEAD NEURONS
            if feature_id not in features_alive[node]:
                feature_descriptions.append(placeholder(node, feature_id, 'dead'))
                continue

            # NEURONS THAT NEVER FIRE ON IOI PROMPTS
            if feature_id not in ioi_neurons_idx[node]:
                feature_descriptions.append(placeholder(node, feature_id, 'not_ioi'))
                continue

            # FEATURES ON S2 AND END NODES
            for feature_type, func in name_feature_types.items():
                scores = func(train_stats(node, feature_id), test_actv_counts=test_stats(node, feature_id))
                record(scores, node, feature_id, feature_type)

            for pattern in patterns:
                # select the number of activations at the io position
                score = io_pos_score(train_stats(node, feature_id), pattern,
                                     test_actv_counts=test_stats(node, feature_id))
                record(score, node, feature_id)

            for role in roles:
                for gender in ['M', 'F']:
                    score = gender_x_role_score(train_stats(node, feature_id), role, gender, is_male=is_male,
                                                test_actv_counts=test_stats(node, feature_id))
                    record(score, node, feature_id)

        # FEATURES ON IO AND S1 NODES
        if not df.seq_pos.str.contains('io').any():
            continue
        io_node = df.loc[df.seq_pos == 'io', 'node'].iloc[0]
        s_node = df.loc[df.seq_pos == 's1', 'node'].iloc[0]

        score = name_score(train_stats_all_nodes(feature_id), io_node, s_node,
                           test_actv_counts=test_stats_all_nodes(feature_id))
        record(score, io_node, feature_id, 'current_name')

        for gender in ['M', 'F']:
            score = gender_score(train_stats_all_nodes(feature_id), io_node, s_node, gender, is_male,
                                 test_actv_counts=test_stats_all_nodes(feature_id))
            record(score, io_node, feature_id)

        for position in [1, 2]:
            score = context_position_score(train_stats_all_nodes(feature_id), io_node, s_node, position,
                                           test_actv_counts=test_stats_all_nodes(feature_id))
            record(score, io_node, feature_id)

            score = name_x_context_pos_score(train_stats_all_nodes(feature_id), io_node, s_node, position,
                                             test_actv_counts=test_stats_all_nodes(feature_id))
            record(score, io_node, feature_id, f'current_name_pos_{position}')

    return pd.DataFrame(feature_descriptions)


In [None]:
# without parallelization
# Runs describe_features once per component id, with a progress bar.
# The result replaces any earlier `df` in the notebook namespace.
df = ioi_components.groupby('id').progress_apply(describe_features)

In [None]:
# for each SAE neuron, accept the feature with the best training f_score
def _best_description(candidates):
    """Return the candidate row with the highest f_score (the first one on ties)."""
    return candidates.loc[candidates.f_score.idxmax()]


final_df = df.groupby(['node', 'feature_id']).apply(_best_description)
final_df.to_csv('../data/feature_descriptions.csv')
# show only the confidently described features
final_df[final_df.f_score > 0.75]

In [None]:
# Summary: per node, the number of features of each type whose f_score exceeds 0.4
for node in ioi_nodes:
    print(node)
    for feature_type in final_df.feature_type.unique():
        has_type = final_df.feature_type == feature_type
        scores_well = final_df.f_score > 0.4
        on_node = final_df.node == node
        print(feature_type, len(final_df[has_type & scores_well & on_node]))
    print()

# Random Plots

In [None]:
# Long format: one row per (feature, metric), so the three metrics can be plotted as separate facets.
melt = pd.melt(final_df.reset_index(drop=True), id_vars=['node', 'feature_id', 'feature_type', 'topk', 'names'], 
        value_vars=['recall', 'precision', 'f_score'],
        value_name='value', var_name='metric')


In [None]:
import seaborn as sns

# One histogram per metric, coloured by node; zero-valued entries are left out.
sns.displot(data=melt[melt['value'] > 0.], x='value', col='metric', hue='node', bins=100, 
            alpha=0.2, kind='hist', col_wrap=3, legend=False, facet_kws={'sharex': False, 'sharey': False})


In [None]:
import matplotlib.pyplot as plt

# Feature type shown in the histogram. The labels are derived from it so they
# cannot disagree with the plotted data (they used to say "IO name" while the
# selected rows were the 'first_name_S' features).
# NOTE(review): if the IO-name features were the intended subject, change this
# constant rather than the labels.
PLOTTED_FEATURE_TYPE = 'first_name_S'

ax = sns.histplot(final_df[final_df.feature_type == PLOTTED_FEATURE_TYPE], x='topk', hue='node', bins=30)
ax.set_xlabel(f'Number of names in the {PLOTTED_FEATURE_TYPE} feature')
ax.set_title(f'{PLOTTED_FEATURE_TYPE} features')

# Alternative pandas implementation: readable, but very slow

The slowness comes from how pandas looks up values in a multi-level index. This is a known issue that is being worked on upstream, see e.g. https://github.com/pandas-dev/pandas/issues/38650

In [None]:
import pandas as pd
import numpy as np

measure = pd.Categorical(['count', 'sum', 'total'], ordered=True)
pattern = pd.Categorical(['ABB', 'BAB'], ordered=True)
role = pd.Categorical(['s', 'io'])
# NOTE(review): the feature axis is sized from the first node's SAE only, so
# this section assumes that all SAEs have the same d_hidden — confirm.
multi_index = pd.MultiIndex.from_product(
    [measure, pd.Categorical(ioi_nodes), pattern, role, pd.Categorical(NAMES),
     np.arange(saes[ioi_nodes[0]].cfg['d_hidden'])],
    names=['measure', 'node', 'pattern', 'role', 'name', 'neuron'],
)
# Float zeros rather than integer zeros: the 'sum' rows accumulate float
# activations, and writing floats into an integer column is an
# incompatible-dtype assignment that pandas has deprecated.
df = pd.DataFrame(0.0, index=multi_index, columns=['data'])
df.info()
from tqdm import tqdm
from research.autoscoring import run_with_cache


# Same accumulation as get_actv_counts, but written against the MultiIndex frame.
for batch in tqdm(ioi_loader):
    cache = run_with_cache(model, batch, ioi_nodes)
    for node_idx, node in enumerate(ioi_nodes):
        # cache entries follow the order of ioi_nodes; (batch, neurons) per the original note
        feature_actvs = saes[node].encoder(cache[node_idx])
        fired = feature_actvs > 0
        # this is slow because we can't batch it but the code is much more readable by using pandas
        for prompt_idx, prompt in enumerate(batch.prompts):
            fired_row = fired[prompt_idx].int().cpu().numpy()
            actv_row = feature_actvs[prompt_idx].cpu().numpy()
            # the same per-prompt vectors are credited to both names of the prompt
            for role_label, name in (('s', prompt.s_name), ('io', prompt.io_name)):
                df.loc[('count', node, prompt.pattern, role_label, [name]), 'data'] += fired_row
                df.loc[('sum', node, prompt.pattern, role_label, [name]), 'data'] += actv_row
                df.loc[('total', node, prompt.pattern, role_label, [name]), 'data'] += 1
df[df.data>0]
# normalize the dataframe: divide sum by total and divide count by total, then remove total
df = df.unstack(level='measure')
totals = df[('data', 'total')]
# Fixed: 'sum' used to be computed from the 'count' column, which made it an
# exact copy of the normalized count.
# Entries with a total of 0 come out as NaN (0 / 0).
df['sum'] = df[('data', 'sum')] / totals
df['count'] = df[('data', 'count')] / totals
df = df.drop(columns='data')
df.head()
feature_types = pd.Categorical(['io_name', 's_name'], ordered=True)
feature_idx = np.arange(saes[ioi_nodes[0]].cfg['d_hidden'])
multi_index = pd.MultiIndex.from_product(
    [pd.Categorical(ioi_nodes), feature_types, pd.Categorical(NAMES), feature_idx],
    names=['node', 'feature_type', 'name', 'feature_idx'],
)
# NOTE(review): the column starts out as integer zeros but receives float
# scores below — consider initialising it with 0.0.
feature_scores = pd.DataFrame(0, index=multi_index, columns=['score'])
feature_scores = feature_scores.sort_index(level=['node', 'feature_type', 'name', 'feature_idx'])
feature_scores


def _normalized_counts(node, pattern_label, role_label, name):
    """Normalized activation counts of all features for one (node, pattern, role, name)."""
    return df.loc[(node, pattern_label, role_label, name, feature_idx), 'count'].to_numpy()


# A name feature should fire when the name has the matching role in both
# patterns, and is penalised for firing when the name has the other role.
for node in ioi_nodes:
    for name in tqdm(NAMES):
        io_abb = _normalized_counts(node, 'ABB', 'io', name)
        io_bab = _normalized_counts(node, 'BAB', 'io', name)
        s_abb = _normalized_counts(node, 'ABB', 's', name)
        s_bab = _normalized_counts(node, 'BAB', 's', name)
        io_name_scores = io_bab * io_abb - (s_abb + s_bab)
        s_name_scores = s_abb * s_bab - (io_bab + io_abb)
        feature_scores.loc[(node, 'io_name', name, feature_idx), 'score'] = io_name_scores
        feature_scores.loc[(node, 's_name', name, feature_idx), 'score'] = s_name_scores
feature_scores.to_csv('../data/feature_scores.csv')