In [1]:
import os
import torch

# Expose only GPU 0 to this process; this runs before the first CUDA query below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Use the GPU when one is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Model selector: later cells branch on the substrings "opt" / "llama".
# Alternative left by the author: 'llama3-8b'.
model_name = 'opt-6.7b'
# Local checkpoint directory, derived from model_name so the two cannot drift apart.
checkpoint_path = '../models/' + model_name

# Step 1: Load Model

In [2]:
from transformers import OPTForCausalLM,AutoModelForCausalLM
from transformers import AutoTokenizer

# Load the causal LM in half precision, placed on `device` via device_map.
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, device_map=device, torch_dtype=torch.float16)
# A tokenizer has no device placement, so `device_map` is not passed here;
# the encoded inputs are moved to `device` where they are created.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

# Step 2: Core-neuron similarity evaluation

In [6]:
import torch
import re
import numpy as np
import matplotlib.pyplot as plt

# token_sparsity: fraction of a token's positively activated neurons kept as
#                 that token's top neurons.
# sparsity:       fraction of a sentence's distinct top neurons (most frequently
#                 selected first) kept as the sentence's core set.
token_sparsity, sparsity = 0.2, 0.4


def get_activation(name, activation_dict):
    """Build a forward hook that records a module's first positional input.

    Args:
        name: Key under which the captured tensor is stored.
        activation_dict: Dict that receives the tensor. An existing entry
            under ``name`` is overwritten each time the hook fires.

    Returns:
        A callable with the ``(module, inputs, output)`` forward-hook
        signature. It stores ``inputs[0]`` (the tensor fed INTO the hooked
        module, not its output), detached from autograd and copied to the CPU.
    """
    def hook(module, inputs, output):
        first_input = inputs[0]
        activation_dict[name] = first_input.detach().cpu()
    return hook


def register_act_hooks(model, activation_dict, act_types=(torch.nn.ReLU, torch.nn.SiLU)):
    """Attach an input-capturing forward hook to every activation module of ``model``.

    This notebook defines ``register_act_hooks`` in more than one cell, and
    every caller sees whichever definition was executed last. To keep the
    result independent of cell execution order, both ``torch.nn.ReLU`` and
    ``torch.nn.SiLU`` modules are hooked by default. Hooking an activation
    type the model does not contain is a no-op.

    Args:
        model: Module whose ``named_modules()`` are scanned.
        activation_dict: Dict filled by the hooks, keyed by the module's
            qualified name from ``named_modules()``.
        act_types: Module class, or iterable of classes, to hook.

    Returns:
        List of hook handles; call ``.remove()`` on each one (see
        ``remove_hooks``) once the forward pass is done.
    """
    # isinstance() needs a class or a tuple of classes, so normalise lists/sets.
    if not isinstance(act_types, type):
        act_types = tuple(act_types)

    hooks = []
    for name, layer in model.named_modules():
        if isinstance(layer, act_types):
            hooks.append(layer.register_forward_hook(get_activation(name, activation_dict)))
    return hooks


def remove_hooks(hooks):
    """Detach every hook handle in ``hooks`` by calling its ``remove()`` method."""
    for handle in hooks:
        handle.remove()


def activation_opt_similarity(sentence1, sentence2, layer_num=25):
    """Score how many "core neurons" two sentences share in one OPT decoder layer.

    Each sentence is stripped of every character other than ASCII letters,
    digits, whitespace and ``.,!?;:``, tokenized (truncated to 256 tokens) and
    run through the global ``model`` with activation hooks attached. From the
    tensor captured for ``model.decoder.layers.<layer_num>.activation_fn``:

    1. Per token, count the positive entries and keep the indices of the
       ``round(count * token_sparsity)`` largest values.
    2. Per sentence, count how often each neuron index was kept, rank the
       indices by that frequency and keep the first
       ``int(sparsity * n_distinct)`` as the sentence's core set.
    3. Return ``|core(sentence1) & core(sentence2)| / |core(sentence2)|``.
       The score is therefore not symmetric in its arguments.

    Reads the module-level ``model``, ``tokenizer``, ``device``,
    ``token_sparsity`` and ``sparsity``, and relies on ``register_act_hooks``
    and ``remove_hooks``.

    Args:
        sentence1: First text.
        sentence2: Second text; its core set is the denominator.
        layer_num: Index of the decoder layer to inspect.

    Returns:
        Overlap ratio as a float; ``0.0`` when the core set of ``sentence2``
        is empty.

    Raises:
        KeyError: If no activation was captured for the requested layer.
    """
    layer_key = f"model.decoder.layers.{layer_num}.activation_fn"

    core_neurons = []
    for prompt in (sentence1, sentence2):
        cleaned_text = re.sub(r'[^a-zA-Z0-9\s.,!?;:]', '', prompt)
        tokenized_input = tokenizer(cleaned_text, return_tensors="pt", max_length=256, truncation=True).to(device)

        activation_dict = {}
        hooks = register_act_hooks(model, activation_dict)
        try:
            with torch.no_grad():
                model(**tokenized_input)
        finally:
            # Detach the hooks even when the forward pass fails; otherwise they
            # pile up on the shared model and fire on every later call.
            remove_hooks(hooks)

        if layer_key not in activation_dict:
            raise KeyError(
                f"No activation captured for {layer_key!r} "
                f"({len(activation_dict)} module(s) were hooked). Check layer_num and that "
                "the active register_act_hooks matches this model's activation type."
            )

        # One row per token, one column per neuron. Collapsing every leading
        # axis keeps this correct whether or not the capture has a batch axis.
        token_acts = activation_dict[layer_key]
        token_acts = token_acts.reshape(-1, token_acts.size(-1))
        del activation_dict  # the captures of all other layers are not needed

        positive_counts = (token_acts > 0).sum(dim=1)
        _, ranked_neurons = torch.sort(token_acts, dim=1, descending=True)

        # Step 1: per-token top neurons, pooled over the whole sentence.
        selected = []
        for row in range(token_acts.size(0)):
            keep = int(torch.round(positive_counts[row] * token_sparsity))
            selected.extend(ranked_neurons[row, :keep].tolist())

        # Step 2: most frequently selected neurons form the core set.
        neuron_ids, frequencies = torch.tensor(selected).unique(return_counts=True, sorted=True)
        by_frequency = torch.argsort(frequencies, descending=True)
        n_core = int(sparsity * len(neuron_ids))
        core_neurons.append(neuron_ids[by_frequency][:n_core].numpy())

    # Step 3: overlap relative to the second sentence's core set.
    reference_size = len(core_neurons[1])
    if reference_size == 0:
        # Nothing to compare against; avoid dividing by zero.
        return 0.0
    return len(np.intersect1d(core_neurons[0], core_neurons[1])) / reference_size




In [4]:
import torch
import re
import numpy as np
import matplotlib.pyplot as plt

# token_sparsity: fraction of a token's positively activated neurons kept as
#                 that token's top neurons.
# sparsity:       fraction of a sentence's distinct top neurons (most frequently
#                 selected first) kept as the sentence's core set.
token_sparsity, sparsity = 0.2, 0.4

def get_activation(name, activation_dict):
    """Return a forward hook that saves the hooked module's first input.

    Args:
        name: Dictionary key used for the saved tensor.
        activation_dict: Destination dict; the entry for ``name`` is replaced
            every time the hook fires.

    Returns:
        A ``(module, inputs, output)`` callable that writes ``inputs[0]``
        (the tensor entering the module, not the one leaving it) into
        ``activation_dict`` after detaching it and copying it to the CPU.
    """
    def hook(module, inputs, output):
        entering = inputs[0]
        activation_dict[name] = entering.detach().cpu()
    return hook


def register_act_hooks(model, activation_dict, act_types=(torch.nn.ReLU, torch.nn.SiLU)):
    """Attach an input-capturing forward hook to every activation module of ``model``.

    This notebook defines ``register_act_hooks`` in more than one cell, and
    every caller sees whichever definition was executed last. To keep the
    result independent of cell execution order, both ``torch.nn.SiLU`` and
    ``torch.nn.ReLU`` modules are hooked by default. Hooking an activation
    type the model does not contain is a no-op.

    Args:
        model: Module whose ``named_modules()`` are scanned.
        activation_dict: Dict filled by the hooks, keyed by the module's
            qualified name from ``named_modules()``.
        act_types: Module class, or iterable of classes, to hook.

    Returns:
        List of hook handles; call ``.remove()`` on each one (see
        ``remove_hooks``) once the forward pass is done.
    """
    # isinstance() needs a class or a tuple of classes, so normalise lists/sets.
    if not isinstance(act_types, type):
        act_types = tuple(act_types)

    hooks = []
    for name, layer in model.named_modules():
        if isinstance(layer, act_types):
            hooks.append(layer.register_forward_hook(get_activation(name, activation_dict)))
    return hooks


def remove_hooks(hooks):
    """Call ``remove()`` on each hook handle in ``hooks``."""
    for handle in hooks:
        handle.remove()


def activation_llama_similarity(sentence1, sentence2, layer_num=70):
    """Score how many "core neurons" two sentences share in one LLaMA decoder layer.

    Each sentence is tokenized as-is (no cleaning, no truncation) and run
    through the global ``model`` with activation hooks attached. From the
    tensor captured for ``model.layers.<layer_num>.mlp.act_fn``:

    1. Per token, count the positive entries and keep the indices of the
       ``round(count * token_sparsity)`` largest values.
    2. Per sentence, count how often each neuron index was kept, rank the
       indices by that frequency and keep the first
       ``int(sparsity * n_distinct)`` as the sentence's core set.
    3. Return ``|core(sentence1) & core(sentence2)| / |core(sentence2)|``.
       The score is therefore not symmetric in its arguments.

    Reads the module-level ``model``, ``tokenizer``, ``device``,
    ``token_sparsity`` and ``sparsity``, and relies on ``register_act_hooks``
    and ``remove_hooks``.

    Args:
        sentence1: First text.
        sentence2: Second text; its core set is the denominator.
        layer_num: Index of the decoder layer to inspect.
            NOTE(review): the default of 70 only exists in checkpoints with
            more than 70 decoder layers; the 'llama3-8b' checkpoint mentioned
            in the config cell is presumably shallower -- pass a valid index
            for the loaded model.

    Returns:
        Overlap ratio as a float; ``0.0`` when the core set of ``sentence2``
        is empty.

    Raises:
        KeyError: If no activation was captured for the requested layer.
    """
    layer_key = f"model.layers.{layer_num}.mlp.act_fn"

    core_neurons = []
    for prompt in (sentence1, sentence2):
        tokenized_input = tokenizer(prompt, return_tensors="pt").to(device)

        activation_dict = {}
        hooks = register_act_hooks(model, activation_dict)
        try:
            with torch.no_grad():
                model(**tokenized_input)
        finally:
            # Detach the hooks even when the forward pass fails; otherwise they
            # pile up on the shared model and fire on every later call.
            remove_hooks(hooks)

        if layer_key not in activation_dict:
            raise KeyError(
                f"No activation captured for {layer_key!r} "
                f"({len(activation_dict)} module(s) were hooked). Check that layer_num is "
                "within the model's depth and that the active register_act_hooks matches "
                "this model's activation type."
            )

        # One row per token, one column per neuron. Collapsing every leading
        # axis drops the batch axis of the captured tensor.
        token_acts = activation_dict[layer_key]
        token_acts = token_acts.reshape(-1, token_acts.size(-1))
        del activation_dict  # the captures of all other layers are not needed

        positive_counts = (token_acts > 0).sum(dim=1)
        _, ranked_neurons = torch.sort(token_acts, dim=1, descending=True)

        # Step 1: per-token top neurons, pooled over the whole sentence.
        selected = []
        for row in range(token_acts.size(0)):
            keep = int(torch.round(positive_counts[row] * token_sparsity))
            selected.extend(ranked_neurons[row, :keep].tolist())

        # Step 2: most frequently selected neurons form the core set.
        neuron_ids, frequencies = torch.tensor(selected).unique(return_counts=True, sorted=True)
        by_frequency = torch.argsort(frequencies, descending=True)
        n_core = int(sparsity * len(neuron_ids))
        core_neurons.append(neuron_ids[by_frequency][:n_core].numpy())

    # Step 3: overlap relative to the second sentence's core set.
    reference_size = len(core_neurons[1])
    if reference_size == 0:
        # Nothing to compare against; avoid dividing by zero.
        return 0.0
    return len(np.intersect1d(core_neurons[0], core_neurons[1])) / reference_size


# Step 3: Spearman rank correlation on STS-B

In [32]:
from datasets import load_dataset
# Load all splits of the STS-B benchmark and print the split/column layout.
dataset = load_dataset('mteb/stsbenchmark-sts')
print("Dataset Info:", dataset, sep="\n")

Dataset Info:
DatasetDict({
    train: Dataset({
        features: ['split', 'genre', 'dataset', 'year', 'sid', 'score', 'sentence1', 'sentence2'],
        num_rows: 5749
    })
    validation: Dataset({
        features: ['split', 'genre', 'dataset', 'year', 'sid', 'score', 'sentence1', 'sentence2'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['split', 'genre', 'dataset', 'year', 'sid', 'score', 'sentence1', 'sentence2'],
        num_rows: 1379
    })
})


In [33]:
# Pick the scorer once, up front. An unrecognised model_name now fails loudly
# instead of leaving the score undefined (or silently reusing a stale value
# left in the kernel by an earlier run).
if "opt" in model_name:
    similarity_fn = activation_opt_similarity
elif "llama" in model_name:
    similarity_fn = activation_llama_similarity
else:
    raise ValueError(
        f"Unsupported model_name {model_name!r}: expected a name containing 'opt' or 'llama'"
    )

# Score every STS-B validation pair; sim_all[i] and true_all[i] describe the same pair.
data = dataset['validation']
sim_all = []   # activation-overlap score per pair
true_all = []  # gold similarity label per pair
for row in data:
    sim_all.append(similarity_fn(row['sentence1'], row['sentence2']))
    true_all.append(row['score'])
    

In [34]:
from scipy.stats import spearmanr

# Rank correlation between the activation-overlap scores and the gold labels.
correlation, p_value = spearmanr(sim_all, true_all)
print(f"Spearman correlation: {correlation}\nP-value: {p_value}")

Spearman correlation: 0.5874475337292442
P-value: 7.283538395503928e-140


# Step 4: Spearman rank correlation on SICK

In [5]:
from datasets import load_dataset
# Load all splits of the SICK relatedness data and print the split/column layout.
# NOTE(review): 'maximedb/sick_nl' looks like a Dutch release of SICK -- confirm
# that sentence_A / sentence_B are in the language this evaluation intends.
dataset = load_dataset('maximedb/sick_nl')
print("Dataset Info:", dataset, sep="\n")

Dataset Info:
DatasetDict({
    train: Dataset({
        features: ['pair_ID', 'sentence_A', 'sentence_B', 'entailment_label', 'relatedness_score', 'entailment_AB', 'entailment_BA', 'sentence_A_original', 'sentence_B_original', 'sentence_A_dataset', 'sentence_B_dataset', 'SemEval_set', 'label', 'label_seq2seq'],
        num_rows: 4439
    })
    validation: Dataset({
        features: ['pair_ID', 'sentence_A', 'sentence_B', 'entailment_label', 'relatedness_score', 'entailment_AB', 'entailment_BA', 'sentence_A_original', 'sentence_B_original', 'sentence_A_dataset', 'sentence_B_dataset', 'SemEval_set', 'label', 'label_seq2seq'],
        num_rows: 495
    })
    test: Dataset({
        features: ['pair_ID', 'sentence_A', 'sentence_B', 'entailment_label', 'relatedness_score', 'entailment_AB', 'entailment_BA', 'sentence_A_original', 'sentence_B_original', 'sentence_A_dataset', 'sentence_B_dataset', 'SemEval_set', 'label', 'label_seq2seq'],
        num_rows: 4906
    })
})


In [7]:
# Pick the scorer once, up front. An unrecognised model_name now fails loudly
# instead of leaving the score undefined (or silently reusing a stale value
# left in the kernel by an earlier run).
if "opt" in model_name:
    similarity_fn = activation_opt_similarity
elif "llama" in model_name:
    similarity_fn = activation_llama_similarity
else:
    raise ValueError(
        f"Unsupported model_name {model_name!r}: expected a name containing 'opt' or 'llama'"
    )

# Score every SICK validation pair; sim_all[i] and true_all[i] describe the same pair.
data = dataset['validation']
sim_all = []   # activation-overlap score per pair
true_all = []  # gold relatedness score per pair
for row in data:
    sim_all.append(similarity_fn(row['sentence_A'], row['sentence_B']))
    true_all.append(row['relatedness_score'])

In [8]:
from scipy.stats import spearmanr

# Rank correlation between the activation-overlap scores and the gold relatedness scores.
correlation, p_value = spearmanr(sim_all, true_all)
print(f"Spearman correlation: {correlation}\nP-value: {p_value}")

Spearman correlation: 0.3683946806722181
P-value: 2.3418180334252915e-17
