### Notebook to convert the activations obtained by running Llama 3 8B on the LiReF paper datasets to direction vectors using difference of means and logistic regression probes

In [35]:
import torch
# Directory that holds the saved activations (.pt) and the sampled dataset (.json).
# The files themselves are read in a later cell via os.path.join(base, ...).
base = "../../../../ignored/"
# NOTE(review): a leftover commented-out alternative pointed at
# inputs/chess/interventions/liref_data_activations.pt through an absolute,
# machine-specific path; keep paths relative and go through `base` instead.


In [37]:
def get_difference_of_means_directions(hs_cache_no_cot, model_layers_num, mlp_dim_num, reason_indices, memory_indices):
    """Compute one difference-of-means direction per layer.

    For every layer the activations selected by ``reason_indices`` and by
    ``memory_indices`` are averaged separately and the memory mean is
    subtracted from the reasoning mean.

    Args:
        hs_cache_no_cot: Per-layer activations. ``hs_cache_no_cot[layer]`` must
            support ``[indices, :]`` indexing and yield rows of width
            ``mlp_dim_num`` (presumably shaped ``(num_samples, mlp_dim_num)``).
        model_layers_num: Number of layers to process (layers ``0..n-1``).
        mlp_dim_num: Width of each activation vector / returned direction.
        reason_indices: Sample indices belonging to the "reasoning" group.
        memory_indices: Sample indices belonging to the "memory" group.

    Returns:
        A float64 CPU tensor of shape ``(model_layers_num, mlp_dim_num)`` whose
        row ``l`` is ``mean(reason activations) - mean(memory activations)``
        for layer ``l``.

    Raises:
        ValueError: If either group selects no samples. The mean of an empty
            selection is NaN, which would otherwise silently propagate into
            the returned directions.
    """
    candidate_directions = torch.zeros((model_layers_num, mlp_dim_num), dtype=torch.float64, device='cpu')

    for layer in range(model_layers_num):
        layer_activations = hs_cache_no_cot[layer]

        # Upcast to float64 before averaging to avoid precision loss when the
        # cached activations are stored in a lower-precision dtype.
        reason_activations = layer_activations[reason_indices, :].to(torch.float64)
        memory_activations = layer_activations[memory_indices, :].to(torch.float64)
        # NOTE (from the original author): the reasoning group is larger than
        # the memory group; the dataset should be scaled up to balance them.

        if reason_activations.shape[0] == 0 or memory_activations.shape[0] == 0:
            raise ValueError(
                f"Cannot compute a difference of means for layer {layer}: "
                f"got {reason_activations.shape[0]} reasoning sample(s) and "
                f"{memory_activations.shape[0]} memory sample(s); both groups must be non-empty."
            )

        # Each mean collapses the sample dimension, so the difference is a
        # single vector of width mlp_dim_num for this layer.
        candidate_directions[layer] = reason_activations.mean(dim=0) - memory_activations.mean(dim=0)

    return candidate_directions

In [38]:
import json
import os
# Sizes passed to get_difference_of_means_directions for the direction matrix.
NUM_LAYERS = 32
ACTIVATION_DIM = 4096
# Samples whose 'memory_reason_score' is above this go to the reasoning group;
# samples at or below it go to the memory group.
REASON_SCORE_THRESHOLD = 0.5

# map_location='cpu' lets the file load on machines that lack the device the
# tensors were saved from; the direction matrix is built on the CPU anyway.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
loaded_dict = torch.load(os.path.join(base, 'liref_data_activations.pt'), map_location='cpu')
print(loaded_dict.keys())
hs_cache_no_cot = loaded_dict['mmlu-pro_3000samples']

with open(os.path.join(base, 'mmlu-pro-3000samples.json'), 'r', encoding='utf-8') as f:
    sampled_data = json.load(f)

# Split sample positions into the two groups by their memory/reason score.
# NOTE(review): assumes sampled_data is ordered like the cached activations -- confirm.
reason_indices = [ix for ix, sample in enumerate(sampled_data) if sample['memory_reason_score'] > REASON_SCORE_THRESHOLD]
memory_indices = [ix for ix, sample in enumerate(sampled_data) if sample['memory_reason_score'] <= REASON_SCORE_THRESHOLD]

diff_of_means_dirs = get_difference_of_means_directions(hs_cache_no_cot, NUM_LAYERS, ACTIVATION_DIM, reason_indices, memory_indices)

dict_keys(['mmlu-pro_3000samples'])


In [39]:
# Save the difference-of-means directions to a JSON file with the following format:
# {
#     "layer_0": [0.1, 0.2, 0.3],
#     "layer_1": [0.4, 0.5, 0.6],
#     ...
# }
# Iterating over the 2-D tensor yields one row per layer; .tolist() turns each
# row into a plain list of Python floats so it can be JSON-serialised.
directions_dict = {
    f"layer_{layer}": layer_direction.tolist()
    for layer, layer_direction in enumerate(diff_of_means_dirs)
}

# Create the target directory if needed so the write cannot fail with
# FileNotFoundError after the directions have already been computed.
output_dir = '../../../../inputs/chess/interventions/'
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, 'liref_reasoning_directions.json')
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(directions_dict, f)

print(f"Saved reasoning directions to {output_path}")


Saved reasoning directions to ../../../../inputs/chess/interventions/liref_reasoning_directions.json
