In [1]:
import os
import torch
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sns
from generate_acts_and_attn import load_model

In [2]:
model_name = "llama-3-8b"
tokenizer, model, all_layers, all_attn_layers = load_model(
    model_name=model_name, device="cuda"
)

Loading model llama-3-8b...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
words = ["healthy", "happy", "kind"]
names = ["Alice", "Bob", "Charlie"]

for word in words:
    for name in names:
        seqs = [
            f"{name} is {word}. Is {name} {word}?",
            f"{name} is {word}. Is {name} un{word}?",
        ]
        for seq in seqs:
            tokens = tokenizer(seq, return_tensors="pt").to(model.device).input_ids
            with torch.no_grad():
                outputs = model.generate(
                    tokens, max_new_tokens=1, pad_token_id=tokenizer.eos_token_id
                )
            print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Alice is healthy. Is Alice healthy? Yes
Alice is healthy. Is Alice unhealthy? No
Bob is healthy. Is Bob healthy? Yes
Bob is healthy. Is Bob unhealthy? No
Charlie is healthy. Is Charlie healthy? Yes
Charlie is healthy. Is Charlie unhealthy? No
Alice is happy. Is Alice happy? Yes
Alice is happy. Is Alice unhappy? No
Bob is happy. Is Bob happy? Yes
Bob is happy. Is Bob unhappy? No
Charlie is happy. Is Charlie happy? Yes
Charlie is happy. Is Charlie unhappy? No
Alice is kind. Is Alice kind? Yes
Alice is kind. Is Alice unkind? No
Bob is kind. Is Bob kind? Yes
Bob is kind. Is Bob unkind? No
Charlie is kind. Is Charlie kind? Yes
Charlie is kind. Is Charlie unkind? No


In [4]:
words = ["healthy", "happy", "kind"]
names = ["Alice", "Bob", "Charlie"]

for word in words:
    for name in names:
        seqs = [
            f"{name} is not un{word}. Is {name} {word}?",
            f"{name} is not {word}. Is {name} {word}?",
        ]
        for seq in seqs:
            tokens = tokenizer(seq, return_tensors="pt").to(model.device).input_ids
            with torch.no_grad():
                outputs = model.generate(
                    tokens, max_new_tokens=1, pad_token_id=tokenizer.eos_token_id
                )
            print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Alice is not unhealthy. Is Alice healthy? No
Alice is not healthy. Is Alice healthy? No
Bob is not unhealthy. Is Bob healthy? No
Bob is not healthy. Is Bob healthy? No
Charlie is not unhealthy. Is Charlie healthy? No
Charlie is not healthy. Is Charlie healthy? No
Alice is not unhappy. Is Alice happy? No
Alice is not happy. Is Alice happy? No
Bob is not unhappy. Is Bob happy? No
Bob is not happy. Is Bob happy? No
Charlie is not unhappy. Is Charlie happy? No
Charlie is not happy. Is Charlie happy? No
Alice is not unkind. Is Alice kind? No
Alice is not kind. Is Alice kind? No
Bob is not unkind. Is Bob kind? No
Bob is not kind. Is Bob kind? No
Charlie is not unkind. Is Charlie kind? No
Charlie is not kind. Is Charlie kind? No


In [5]:
all_attn_layers[0]

LlamaAttention(
  (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
  (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
  (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (rotary_emb): LlamaRotaryEmbedding()
)

In [6]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm)

In [None]:
# check the attention of the model

In [3]:
model_name = "llama-2-7b"
tokenizer, model, all_layers, all_attn_layers = load_model(
    model_name=model_name, device="cuda"
)

Loading model llama-2-7b...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
words = ["healthy", "happy", "kind"]
names = ["Alice", "Bob", "Charlie"]

for word in words:
    for name in names:
        seqs = [
            f"{name} is not un{word}. Is {name} {word}?",
            f"{name} is not {word}. Is {name} {word}?",
        ]
        for seq in seqs:
            tokens = tokenizer(seq, return_tensors="pt").to(model.device).input_ids
            with torch.no_grad():
                outputs = model.generate(tokens, max_new_tokens=3)
            print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Alice is not unhealthy. Is Alice healthy?
Alice
Alice is not healthy. Is Alice healthy?
The two
Bob is not unhealthy. Is Bob healthy? Is Bob not
Bob is not healthy. Is Bob healthy?
Bob is
Charlie is not unhealthy. Is Charlie healthy?
Charlie
Charlie is not healthy. Is Charlie healthy?
Charlie
Alice is not unhappy. Is Alice happy?
Alice
Alice is not happy. Is Alice happy? No. Alice
Bob is not unhappy. Is Bob happy?
Bob is
Bob is not happy. Is Bob happy?
I'
Charlie is not unhappy. Is Charlie happy? Charlie is happy
Charlie is not happy. Is Charlie happy?
I’
Alice is not unkind. Is Alice kind? Is Alice un
Alice is not kind. Is Alice kind?
Alice
Bob is not unkind. Is Bob kind?
Bob is
Bob is not kind. Is Bob kind?
I'
Charlie is not unkind. Is Charlie kind? Is Charlie cruel
Charlie is not kind. Is Charlie kind? Is Charlie not
