In [1]:
import sys
import pickle
import os

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

sys.path.append(os.getenv('SPARSE_PROBING_ROOT'))
from sparse_probing_paper.probing_datasets import wikidata
from sparse_probing_paper.config import ExperimentConfig, FeatureDatasetConfig
from sparse_probing_paper.load import *
from sparse_probing_paper.analysis.load_results import *
from sparse_probing_paper.analysis.plots.sparsity_probes import *

## Load Results

In [2]:
# Dataset selection: exactly one (dataset_property, dataset_size) pair is
# active; the remaining pairs are kept as commented-out alternatives.
# dataset_property = "sex_or_gender"
# dataset_size = 6000
dataset_property = "occupation"
dataset_size = 6000
# dataset_property = 'political_party'
# dataset_size = 3000
# dataset_property = 'is_alive'
# dataset_size = 6000
# dataset_property = 'occupation_athlete'
# dataset_size = 5000

model_sizes = ['6.9b']

# prefix = 'wikidata_full'
prefix = 'wikidata_sorted'

# Keyword arguments forwarded to load_probing_experiment_results for every
# model size. os.getenv yields None when RESULTS_DIR is unset.
experiment_info = {
    'results_dir': os.getenv('RESULTS_DIR'),
    'experiment_name': f'wikidata/{dataset_property}',
    'dataset_name': f'{prefix}_{dataset_property}.pyth.128.{dataset_size}',
    'inner_loop': 'telescopic_sparsity_sweep',
}

# Build one frame per model size, indexed by (model_size, feature, layer, k),
# then stack them into a single `rdf`.
all_results = []
for s in model_sizes:
    results_dict, config = load_probing_experiment_results(**experiment_info, model_name=f'pythia-{s}')

    # Keep only the 'max'-aggregation entries and flatten the nested per-k
    # results into (feature, layer, k) keys.
    # NOTE(review): hook_loc is dropped from the key, so entries that differ
    # only by hook_loc would overwrite each other -- confirm there is a single
    # hook location per (feature, layer).
    flattened_results = {
        (feature, layer, k): value[k]
        for (feature, layer, aggregation, hook_loc), value in results_dict.items()
        if aggregation == 'max'
        for k in value.keys()
    }
    rdf = pd.DataFrame(flattened_results).T.rename_axis(index=['feature', 'layer', 'k']).sort_index()
    # Prepend the model size as the outermost index level.
    rdf.index = pd.MultiIndex.from_tuples(
        [(s, *i) for i in rdf.index.values],
        names=['model_size', 'feature', 'layer', 'k'])

    all_results.append(rdf)
rdf = pd.concat(all_results)
rdf.shape

(3360, 20)

In [3]:
# Top probes per model at any sparsity level k, ranked by test MCC.
# (Despite its name, `all_pr_auc` collects the MCC-ranked rows.)
n_per_model = 5
cols = ['test_mcc', 'test_f1_score',]
all_pr_auc = []
for s in model_sizes:
    ranked = rdf.query(f'model_size=="{s}"').sort_values('test_mcc', ascending=False)
    model_rdf = ranked[cols].head(n_per_model)
    all_pr_auc.append(model_rdf)
pd.concat(all_pr_auc)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,test_mcc,test_f1_score
model_size,feature,layer,k,Unnamed: 4_level_1,Unnamed: 5_level_1
6.9b,association football player,19,4,0.882738,0.90183
6.9b,association football player,19,3,0.882738,0.90183
6.9b,association football player,19,5,0.87913,0.899174
6.9b,association football player,19,2,0.878739,0.898502
6.9b,association football player,19,1,0.877478,0.898026


In [4]:
# Top single-neuron (k == 1) probes per model, ranked by test MCC; the
# probe coefficient is shown alongside the metrics.
# (Despite its name, `all_pr_auc` collects the MCC-ranked rows.)
n_per_model = 5
cols = ['test_mcc', 'test_f1_score', 'coef']
all_pr_auc = []
for s in model_sizes:
    ranked = rdf.query(f'model_size=="{s}" and k==1').sort_values('test_mcc', ascending=False)
    model_rdf = ranked[cols].head(n_per_model)
    all_pr_auc.append(model_rdf)
pd.concat(all_pr_auc)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,test_mcc,test_f1_score,coef
model_size,feature,layer,k,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6.9b,association football player,19,1,0.877478,0.898026,[13.818521]
6.9b,association football player,15,1,0.869818,0.891803,[13.8263445]
6.9b,association football player,11,1,0.850729,0.876221,[13.777381]
6.9b,athlete,9,1,0.807188,0.833631,[11.684192]
6.9b,association football player,21,1,0.806448,0.83304,[12.206711]


## Sparsity Curves

In [5]:
# for s in model_sizes:
#     plot_layer_metric_over_sparsity_per_feature(rdf.query(f'model_size=="{s}"').reset_index(level=0, drop=True), metric='test_mcc', n_cols=2)
#     plt.suptitle(f'pythia-{s}', y=1.0)

In [6]:
# for s in model_sizes:
#     plot_metric_over_sparsity_per_layer(rdf.query(f'model_size=="{s}"').reset_index(level=0, drop=True), metric='test_f1_score')
#     plt.suptitle(f'pythia-{s}', y=1.0)

## Monosemantic Neurons

In [7]:
# Load the per-neuron "enumerate_monosemantic" results into
#   em_results[model_size][feature][layer] -> list of
#   (key, test_mcc, test_f1_score, coef) tuples, best test_mcc first.
# The keys of each pickled dict are presumably neuron indices (later cells
# unpack them as `nix`) -- confirm against the experiment writer.
em_results = {}
for s in model_sizes:
    em_results[s] = {}
    em_path = os.path.join(os.environ['RESULTS_DIR'], experiment_info['experiment_name'], f'pythia-{s}', experiment_info['dataset_name'], 'enumerate_monosemantic')
    # os.listdir returns entries in arbitrary order; sorting makes the dict
    # insertion order (and hence tie-breaking in later stable sorts)
    # reproducible across runs and machines.
    for file_name in sorted(os.listdir(em_path)):
        if file_name == 'config.json':
            continue

        # File names must have exactly seven dot-separated fields; the second
        # is the feature name and the sixth the layer number. Any other shape
        # raises ValueError here.
        _, feature, _, _, _, layer, _ = file_name.split('.')
        # NOTE(review): pickle.load can execute arbitrary code -- only point
        # RESULTS_DIR at trusted experiment outputs.
        with open(os.path.join(em_path, file_name), 'rb') as result_file:
            d = pickle.load(result_file)
        # v['coef'] must be a single-element array/tensor for .item() to work.
        scores = sorted([(k, v['test_mcc'], v['test_f1_score'], v['coef'].item()) for k, v in d.items()], key=lambda x: x[1], reverse=True)

        em_results[s].setdefault(feature, {})[int(layer)] = scores


In [8]:
# Single best neuron per feature for each model, searched over the first
# `n_per_layer` entries of every layer's score list.
num_top_neurons = 1
n_per_layer = 30
features = sorted(rdf.index.get_level_values(1).unique())
for s in model_sizes:
    print(f'pythia-{s}')
    for f in features:
        print(f'\t{f}')
        layer_scores = em_results[s][f]
        # Rows are (layer, key, mcc, f1, coef): rank-major, then layer.
        candidates = [
            (layer, *layer_scores[layer][rank])
            for rank in range(n_per_layer)
            for layer in layer_scores
        ]
        # Rank by F1 (position 3 of each row), highest first.
        candidates.sort(key=lambda row: row[3], reverse=True)
        top_neurons = candidates[:num_top_neurons]
        for lix, nix, mcc, f1, coef in top_neurons:
            print(f'\t\t({lix}, {nix}): mcc={mcc:.3f}, f1={f1:.3f}, coef={coef:.3f}')

pythia-6.9b
	actor
		(9, 4502): mcc=0.662, f1=0.714, coef=15.877
	association football player
		(19, 10761): mcc=0.886, f1=0.901, coef=19.048
	athlete
		(9, 12997): mcc=0.802, f1=0.832, coef=16.932
	journalist
		(9, 3974): mcc=0.577, f1=0.640, coef=12.740
	politician
		(11, 12520): mcc=0.533, f1=0.604, coef=11.011
	researcher
		(12, 11379): mcc=0.529, f1=0.597, coef=-14.224
	singer
		(23, 8865): mcc=0.718, f1=0.761, coef=13.583


In [9]:
# Best neuron of every layer, per feature, emitted as CSV-style rows:
# model,dataset,feature,layer,key,mcc,f1,coef
# num_top_neurons is set high enough that no layer is cut off.
num_top_neurons = 999
n_per_layer = 1
features = sorted(rdf.index.get_level_values(1).unique())
for s in model_sizes:
    print(f'\tpythia-{s}')
    for f in features:
        print(f'\t\t{f}')
        layer_scores = em_results[s][f]
        # Rows are (layer, key, mcc, f1, coef): rank-major, then layer.
        candidates = [
            (layer, *layer_scores[layer][rank])
            for rank in range(n_per_layer)
            for layer in layer_scores
        ]
        # Rank by F1 (position 3 of each row), highest first.
        candidates.sort(key=lambda row: row[3], reverse=True)
        top_neurons = candidates[:num_top_neurons]
        for lix, nix, mcc, f1, coef in top_neurons:
            print(f'pythia-{s},{experiment_info["dataset_name"]},{f},{lix},{nix},{mcc:.3f},{f1:.3f},{coef:.3f}')
        print()

	pythia-6.9b
		actor
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,actor,9,4502,0.662,0.714,15.877
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,actor,11,12827,0.633,0.695,14.174
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,actor,19,2285,0.604,0.671,15.233
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,actor,8,2581,0.597,0.665,14.405
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,actor,15,10563,0.599,0.664,13.258
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,actor,10,7731,0.572,0.642,11.190
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,actor,13,6293,0.567,0.640,12.322
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,actor,14,11981,0.565,0.639,14.081
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,actor,7,1655,0.559,0.634,13.420
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,actor,23,14315,0.558,0.631,13.782
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,actor,16,1333,0.550,0.624,11.132
pythia-6.9b,wikidata_sorted_occu

In [10]:
# Top `num_top_neurons` neurons per feature across all layers, emitted as
# CSV-style rows (model,dataset,feature,layer,key,mcc,f1,coef).
# No per-model / per-feature header lines are printed here so the output can
# be pasted directly as CSV.
num_top_neurons = 5
n_per_layer = 30
features = sorted(rdf.index.get_level_values(1).unique())
for s in model_sizes:
    for f in features:
        layer_scores = em_results[s][f]
        # Rows are (layer, key, mcc, f1, coef): rank-major, then layer.
        candidates = [
            (layer, *layer_scores[layer][rank])
            for rank in range(n_per_layer)
            for layer in layer_scores
        ]
        # Rank by F1 (position 3 of each row), highest first.
        candidates.sort(key=lambda row: row[3], reverse=True)
        top_neurons = candidates[:num_top_neurons]
        for lix, nix, mcc, f1, coef in top_neurons:
            print(f'pythia-{s},{experiment_info["dataset_name"]},{f},{lix},{nix},{mcc:.3f},{f1:.3f},{coef:.3f}')

pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,actor,9,4502,0.662,0.714,15.877
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,actor,11,12827,0.633,0.695,14.174
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,actor,9,4815,0.629,0.691,14.781
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,actor,9,8260,0.619,0.683,13.714
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,actor,19,2285,0.604,0.671,15.233
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,association football player,19,10761,0.886,0.901,19.048
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,association football player,19,3514,0.878,0.899,18.969
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,association football player,15,2298,0.865,0.888,18.749
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,association football player,11,6071,0.842,0.870,19.073
pythia-6.9b,wikidata_sorted_occupation.pyth.128.6000,association football player,21,5514,0.809,0.835,18.933
pythia-6.9b,wikidata_sorted_occ