In [1]:
import os

# Move to the project root so that the `src` package imported by this notebook resolves.
# The guard makes the cell idempotent: re-running it (or starting the kernel in the
# project root) no longer climbs one directory too far.
if not os.path.isdir("src"):
    os.chdir("..")

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from src.geometry import calcurate_concept_matrics_rank
from src.utils.model_analysis import (
    compute_inner_product_LOO,
    get_concept_vector,
    get_hidden_layer_n,
)
from src.utils.preprocess_data import get_counterfactual_pairs
from src.utils.visualization import show_histogram_LOO

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# --- Experiment configuration: everything a reader might tune ---

# Hugging Face checkpoint id and the lower-cased repository name derived from it.
model_path = "meta-llama/Llama-3.2-3B-Instruct"
model_name = model_path.lower().split("/")[1]

# Dataset and prompt selection.
dataset_type = "valuenet"
prompt_type = "explicit_schwartz"
norm_type = "base"
num_sample = 1000

# Concept-vector settings.
concept_direction_type = "pos2neg"
concept_vectorize_strategy = "embedding"

# Embedding extraction settings.
embedding_strategy = "last"
embedding_batch_size = 4
target_layers = list(range(1, 29))  # layer indices 1..28

# Index of the CUDA device the model is moved to.
device_id = 4

In [4]:
# Load the causal LM onto the configured CUDA device, together with its tokenizer.
device = torch.device(f"cuda:{device_id}")
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Use the EOS token for padding (presumably the tokenizer ships without a pad token — confirm).
tokenizer.pad_token = tokenizer.eos_token

num_hidden_layers: int = model.config.num_hidden_layers
# lm_head weight matrix, detached from the autograd graph.
unembedding = model.lm_head.weight.detach()

# Read one stripped entry per line. The context manager closes the file handle,
# which a bare open(...) inside the comprehension never does explicitly.
# NOTE(review): absolute, user-specific path — consider a path relative to the project root.
with open("/home/itai/research/PersonalValuesGeometry/datasets/values.txt") as values_file:
    values_list_str: list[str] = [line.strip() for line in values_file]

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.66it/s]


In [5]:
# Hidden-layer index passed as `n_layer` when embeddings are extracted.
target_layer: int = 2

In [6]:
# Echo the run configuration with a single print call. Each entry uses the f-string
# "=" specifier, which renders as "<expression>=<repr(value)>"; sep="\n" puts every
# entry on its own line.
print(
    "\n*===== Args =====*",
    f"{model_path=}",
    f"{dataset_type=}",
    f"{concept_direction_type=}",
    f"{norm_type=}",
    f"{prompt_type=}",
    f"{embedding_strategy=}",
    f"{target_layer=}",
    f"{num_sample=}",
    f"!{concept_vectorize_strategy=}",
    "*===============*\n",
    sep="\n",
)



*===== Args =====*
model_path='meta-llama/Llama-3.2-3B-Instruct'
dataset_type='valuenet'
concept_direction_type='pos2neg'
norm_type='base'
prompt_type='explicit_schwartz'
embedding_strategy='last'
target_layer=2
num_sample=1000
!concept_vectorize_strategy='embedding'



In [7]:
# Text file holding the "random" pairs; the sub-directory is selected by `norm_type`.
random_txt_path = f"/home/itai/research/PersonalValuesGeometry/datasets/ValueNet/schwartz/random_pairs/{norm_type}/random_1000_pairs.txt"

# Load the pairs; the helper's result is unpacked into positive and negative sequences.
print("[Random Pair] random文書pairを取得 ...")
random_positive_sequences, random_negative_sequences = get_counterfactual_pairs(
    random_txt_path,
    num_sample=int(num_sample),
    prompt_type=prompt_type,
)

[Random Pair] random文書pairを取得 ...


In [8]:
# Keyword arguments shared by both extraction calls; only `sequences` differs.
embedding_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    n_layer=target_layer,
    embedding_strategy=embedding_strategy,
    batch_size=embedding_batch_size,
)

# Positive side of each random pair.
print("[Random Pair] positive文章のembeddingを計算 ...")
random_positive_embeddings = get_hidden_layer_n(
    sequences=random_positive_sequences, **embedding_kwargs
)
# Show the first embedding as a quick sanity check.
print(f"{random_positive_embeddings[0]=}")

# Negative side of each random pair.
print("[Random Pair] negative文章のembeddingを計算 ...")
random_negative_embeddings = get_hidden_layer_n(
    sequences=random_negative_sequences, **embedding_kwargs
)

[Random Pair] positive文章のembeddingを計算 ...


Processing embeddings: 100%|██████████| 250/250 [01:55<00:00,  2.16batch/s]


random_positive_embeddings[0]=tensor([ 0.0387, -0.0101,  0.0102,  ..., -0.0458, -0.0251,  0.0107])
[Random Pair] negative文章のembeddingを計算 ...


Processing embeddings: 100%|██████████| 250/250 [01:58<00:00,  2.11batch/s]


In [9]:
# Element-wise difference between the positive and negative embeddings of each pair.
# NOTE(review): at review time the recorded output of this cell was all zeros in the
# displayed rows, i.e. positive and negative embeddings coincide. Confirm that the
# embedding extraction really picks a sequence-dependent token before trusting
# anything computed from this tensor.
random_diff_embeddings = torch.sub(random_positive_embeddings, random_negative_embeddings)
random_diff_embeddings

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [17]:
# Display the positive embeddings for inspection.
# NOTE(review): in the recorded output every displayed row is identical, which is
# consistent with the all-zero differences seen earlier — verify the upstream
# embedding extraction.
random_positive_embeddings

tensor([[ 0.0387, -0.0101,  0.0102,  ..., -0.0458, -0.0251,  0.0107],
        [ 0.0387, -0.0101,  0.0102,  ..., -0.0458, -0.0251,  0.0107],
        [ 0.0387, -0.0101,  0.0102,  ..., -0.0458, -0.0251,  0.0107],
        ...,
        [ 0.0387, -0.0101,  0.0102,  ..., -0.0458, -0.0251,  0.0107],
        [ 0.0387, -0.0101,  0.0102,  ..., -0.0458, -0.0251,  0.0107],
        [ 0.0387, -0.0101,  0.0102,  ..., -0.0458, -0.0251,  0.0107]])

In [20]:
from sklearn.decomposition import FastICA, PCA

def apply_pca(embeddings: torch.Tensor, n_components=10) -> torch.Tensor:
    """Reduce the dimensionality of embeddings with PCA (not ICA).

    A fresh ``sklearn.decomposition.PCA`` is fitted on ``embeddings`` and used to
    transform the same data, so results from two separate calls live in different
    bases and must not be compared coordinate-wise.

    Args:
        embeddings: 2-D data of shape (n_samples, n_features), as required by
            ``PCA.fit_transform``. Tensors may live on any device and may track
            gradients; they are detached, moved to the CPU and cast to float64
            before being handed to scikit-learn.
        n_components: Number of principal components to keep.

    Returns:
        A new CPU tensor of shape (n_samples, n_components) built from the
        scikit-learn output (float64 for tensor inputs).
    """
    if isinstance(embeddings, torch.Tensor):
        # scikit-learn cannot consume CUDA or grad-tracking tensors directly.
        # float64 matches the dtype this function already returned for tensor input.
        embeddings = embeddings.detach().cpu().double().numpy()
    pca = PCA(n_components=n_components)
    reduced_embeddings = pca.fit_transform(embeddings)
    return torch.tensor(reduced_embeddings)

In [21]:
# Fit ONE PCA on the stacked positive and negative embeddings so both sets are
# projected onto the same principal axes. Fitting a separate PCA per set (as this
# cell did before) yields two unrelated bases, and subtracting coordinates expressed
# in different bases is not meaningful.
num_positive = len(random_positive_embeddings)
reduced_random_embeddings = apply_pca(
    torch.cat([random_positive_embeddings, random_negative_embeddings], dim=0)
)

# Split the jointly reduced rows back into their positive / negative halves.
reduced_random_positive_embeddings = reduced_random_embeddings[:num_positive]
print("applied PCA to random positive embeddings")

reduced_random_negative_embeddings = reduced_random_embeddings[num_positive:]
print("applied PCA to random negative embeddings")

# Pairwise difference in the shared PCA space.
reduced_random_diff_embeddings = reduced_random_positive_embeddings - reduced_random_negative_embeddings
reduced_random_diff_embeddings

applied ICA to random positive embeddings
applied ICA to random negative embeddings


  self.explained_variance_ratio_ = self.explained_variance_ / total_var
  self.explained_variance_ratio_ = self.explained_variance_ / total_var


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)

In [22]:
# Display the PCA-reduced positive embeddings for inspection.
# NOTE(review): the recorded output is all zeros in the displayed rows and scikit-learn
# emitted a warning on the `explained_variance_ / total_var` division, which looks like
# zero-variance input — confirm the embeddings upstream are not degenerate.
reduced_random_positive_embeddings

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)