# 1. Settings

Let's start by importing all the required packages and defining a helper function that selects the compute `device`:

In [None]:
import os
from dotenv import load_dotenv
import torch

from huggingface_hub import login

In [None]:
from AdversarialPromptGenerator import AdversarialPromptGenerator

from integrated_gradients import integrated_gradients

from our_base import LocalModel, HuggingFaceEmbeddings
from our_token_shap import TokenizerSplitter, TokenSHAP

In [None]:
def get_device():
    """Return the preferred torch device: CUDA first, then Apple MPS, then CPU."""
    # Not every torch build ships an MPS backend, so look it up defensively.
    mps_backend = getattr(torch.backends, "mps", None)

    if torch.cuda.is_available():
        device_name = "cuda"
    elif mps_backend and mps_backend.is_available():
        device_name = "mps"
    else:
        device_name = "cpu"
    return torch.device(device_name)

DEVICE = get_device()

# 2. Hugging Face

First, retrieve the Hugging Face API key and log in:

In [None]:
# Load variables from a local .env file (if present) before reading the key.
load_dotenv()

hf_api_key = os.environ.get("HUGGINGFACE_API_KEY")
if hf_api_key is None or hf_api_key == "":
    # Fail fast with an actionable message instead of attempting a login without a token.
    raise RuntimeError("Missing HUGGINGFACE_API_KEY. Set it in your environment or .env file.")

login(token=hf_api_key)

# 3. TokenSHAP

Then, instantiate TokenSHAP on top of a local Hugging Face model, specifically `meta-llama/Llama-3.2-1B-Instruct`:

In [None]:
# Hugging Face model identifier handed to LocalModel.
model_path = "meta-llama/Llama-3.2-1B-Instruct"

# Local generation model, configured with max_new_tokens=1 and temperature=None.
local_model = LocalModel(
    model_name=model_path,
    max_new_tokens=1,
    temperature=None,
    device=DEVICE,
)

# Embedding vectorizer and tokenizer-based splitter that TokenSHAP is built from.
hf_embedding = HuggingFaceEmbeddings(device=DEVICE)
splitter = TokenizerSplitter(local_model.tokenizer)

token_shap_local = TokenSHAP(
    model=local_model,
    splitter=splitter,
    vectorizer=hf_embedding,
    debug=True,
)

In [None]:
# Show which device the loaded model reports (rendered as the cell's output).
local_model.device

Instantiate the `AdversarialPromptGenerator` to retrieve the adversarial prompts:

In [None]:
# Build the adversarial prompts from the saved suffix file.
# NOTE(review): `get_from` presumably returns an indexable collection of prompt
# strings (it is indexed and iterated by the later cells) — confirm.
adv_prompt_generator = AdversarialPromptGenerator()
adversarial_suffix_path = "./adv_suffixes.pt" # tensor of all 100 suffixes
all_prompts = adv_prompt_generator.get_from(adversarial_suffix_path)

In [None]:
# from transformers import AutoTokenizer, AutoModelForCausalLM

# # tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
# tokenizer = local_model.tokenizer
# # model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
# model = local_model.model
# # model = local_model
# for prompt in all_prompts:
# 	messages = [
# 		{"role": "user", "content": prompt},
# 	]
# 	inputs = tokenizer.apply_chat_template(
# 		messages,
# 		add_generation_prompt=True,
# 		tokenize=True,
# 		return_dict=True,
# 		return_tensors="pt",
# 	).to(model.device)

# 	# print(inputs)

# 	outputs = model.generate(
# 		**inputs,
# 		max_new_tokens=1,
# 		do_sample=False,
# 		temperature=None,
# 		top_p=None,
# 		pad_token_id=tokenizer.eos_token_id
# 	)
# 	print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

##### Test of Specific Functions of TokenSHAP

In [None]:
# Sanity check: run TokenSHAP's private baseline step on a single prompt (index 3).
token_shap_local._calculate_baseline(all_prompts[3])

In [None]:
# Sanity check: run TokenSHAP's private per-combination step on prompt index 3.
# NOTE(review): the positional 0.0 is presumably the sampling ratio — confirm.
token_shap_local._get_result_per_combination(all_prompts[3], 0.0)

In [None]:
# Full TokenSHAP analysis of prompt index 3 with sampling_ratio=0.0,
# then render the result through print_colored_text.
df_local = token_shap_local.analyze(all_prompts[3], sampling_ratio=0.0)
token_shap_local.print_colored_text()

##### Full Loop to Analyse All 100 Prompts

In [None]:
# Analyze every prompt in turn and print its colored-text rendering.
# NOTE: `df_local` is rebound on each iteration, so after the loop it only
# holds the result for the last prompt; earlier results are not kept.
for prompt in all_prompts:
    df_local = token_shap_local.analyze(prompt, sampling_ratio=0.0)
    token_shap_local.print_colored_text()

# 4. Integrated Gradients

In [None]:
# Run the project's integrated_gradients helper on prompt index 3 with 50 steps.
result = integrated_gradients(
    model=local_model.model,
    tokenizer=local_model.tokenizer,
    content=all_prompts[3],
    steps=50,
    device=DEVICE
)

# The result is read via its "tokens" and "attributions" entries; print one
# right-aligned token and its score (4 decimals) per line.
for tok, score in zip(result["tokens"], result["attributions"]):
    print(f"{tok:>10s} : {score:.4f}")

# 5. Integrated Gradients (Captum)

In [None]:
from captum.attr import LayerIntegratedGradients
import torch

In [None]:
def captum_integrated_gradients(model, tokenizer, content, device, steps=50):
    """Compute per-token Integrated Gradients attributions with Captum.

    The prompt is wrapped in a single-turn user chat message, the model's
    greedy next token is taken as the attribution target, and
    ``LayerIntegratedGradients`` attributes that token's logit to the output
    of the input-embedding layer, against an all-pad (or all-EOS) baseline.

    Args:
        model: Causal LM exposing ``eval``, ``zero_grad``,
            ``get_input_embeddings`` and a forward pass returning ``.logits``.
        tokenizer: Tokenizer matching ``model``; must support
            ``apply_chat_template``.
        content: User message text to attribute.
        device: Device the tokenized inputs are moved to; assumed to be the
            device ``model`` lives on.
        steps: Number of interpolation steps used to approximate the integral.

    Returns:
        A ``(tokens, attributions)`` pair: the token strings and a 1-D CPU
        tensor with one score per token (attributions summed over the
        embedding dimension). Both are restricted to the user-content tokens
        when those can be located inside the templated sequence; otherwise
        they cover the whole templated sequence.
    """
    model.eval()
    model.zero_grad()

    # 1. Tokenize using chat template
    prompt = [{"role": "user", "content": content}]
    inputs = tokenizer.apply_chat_template(
        prompt,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device)
    input_ids = inputs["input_ids"]

    # Match user tokens for coherence with previous implementation
    user_ids = tokenizer(
        content,
        add_special_tokens=False,
        return_tensors="pt"
    )["input_ids"][0].to(device)

    def find_subsequence(sequence, subseq):
        """Return (start, end) of the first occurrence of subseq, or (None, None)."""
        width = len(subseq)
        for start in range(len(sequence) - width + 1):
            if torch.equal(sequence[start:start + width], subseq):
                return start, start + width
        return None, None

    user_start, user_end = find_subsequence(input_ids[0], user_ids)

    # 2. Identify target token (greedy)
    with torch.no_grad():
        outputs = model(input_ids)
        target_token_id = outputs.logits[0, -1].argmax().item()

    # 3. Define forward function for Captum
    # LayerIntegratedGradients calls forward_func with the *model inputs*
    # (token ids) and hooks `embed_layer` to substitute the interpolated
    # embeddings. The ids must therefore go through the normal `input_ids`
    # path so the hooked embedding layer actually runs; passing them as
    # `inputs_embeds` would bypass the layer entirely.
    embed_layer = model.get_input_embeddings()

    def forward_func(token_ids):
        logits = model(input_ids=token_ids).logits
        # One scalar per batch row, as Captum expects when no `target` is given.
        return logits[:, -1, target_token_id]

    lig = LayerIntegratedGradients(forward_func, embed_layer)

    # 4. Baselines: same shape as the input, every position set to pad (or EOS
    # when the tokenizer defines no pad token).
    baseline_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
    baseline_ids = torch.full_like(input_ids, baseline_token_id)

    # 5. Attribute
    # Attributions are computed w.r.t. the embedding layer's output;
    # internal_batch_size=1 evaluates one interpolation step per forward pass.
    attributions = lig.attribute(inputs=input_ids,
                                 baselines=baseline_ids,
                                 n_steps=steps,
                                 internal_batch_size=1)

    # Sum over hidden dimension to get one score per token position
    attributions = attributions.sum(dim=-1).squeeze(0)
    attributions = attributions.detach().cpu()

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

    # Filter to user tokens if found; otherwise keep the full templated sequence
    if user_start is not None:
        tokens = tokens[user_start:user_end]
        attributions = attributions[user_start:user_end]

    return tokens, attributions


In [None]:
# Run the Captum-based attribution on prompt index 3 with 50 steps.
tokens_cap, attrs_cap = captum_integrated_gradients(
    model=local_model.model,
    tokenizer=local_model.tokenizer,
    content=all_prompts[3],
    device=DEVICE,
    steps=50
)

# Print one right-aligned token and its attribution score (4 decimals) per line.
print("Captum Integrated Gradients Results:")
for tok, score in zip(tokens_cap, attrs_cap):
    print(f"{tok:>10s} : {score:.4f}")
