# llama_3.1 8B Instruct with Logits

This model could ben downloaded and deploy locally, despite its slow runtime. 

# Extract Logits

In [1]:
from transformers import LlamaForCausalLM, AutoTokenizer
import torch
from huggingface_hub import snapshot_download, login
import numpy as np
import os
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
login(token=os.environ["HF_TOKEN"])

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [22]:
# Load tokenizer and model
local_dir = snapshot_download(repo_id="meta-llama/Llama-3.2-3B-Instruct")
model = LlamaForCausalLM.from_pretrained(local_dir, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto")
tokeniser = AutoTokenizer.from_pretrained(local_dir)

Fetching 16 files: 100%|██████████| 16/16 [00:00<00:00, 90321.49it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:16<00:00,  8.48s/it]


In [23]:
def response_tokeniser_with_sgc(message_lst: list[dict], tokeniser: AutoTokenizer, model: LlamaForCausalLM, max_new_tokens = 5) -> tuple:
    flat_msg_list = []
    for x in range(len(message_lst)): flat_msg_list += [{"role":k , "content": v} for k, v in message_lst[x].items()]
    inputs = tokeniser(tokeniser.apply_chat_template(flat_msg_list, tokenize=False, add_generation_prompt=True), return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(
            inputs['input_ids'], 
            max_new_tokens=max_new_tokens, 
            return_dict_in_generate=True, 
            output_logits=True,
            output_scores=True,
            top_p =0.9
        )
    return "".join(tokeniser.batch_decode(outputs.sequences[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)), float(np.exp(model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True).numpy(force=True)).mean())

In [24]:
msgs = [{"user": "What is the most populated city in Australia?"}, {"system": "provide the answer as concisely as possible"}]
multi_response = [response_tokeniser_with_sgc(msgs, tokeniser, model) for _ in range(50)]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end gene

In [76]:
multi_response

[('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney', 0.7071358561515808),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.9696481227874756),
 ('Sydney.', 0.

In [66]:
model_name="all-MiniLM-L6-v2"
distance_threshold = 0.3
lnll_lst = [(x)[1] for x in multi_response]
response_list = [x[0] for x in multi_response]
embeddings = SentenceTransformer(model_name).encode(response_list)
clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=distance_threshold, metric="cosine", linkage="average")
labels = clustering.fit_predict(embeddings)

# 1. Single Generation Confidence (SGC)

In [78]:
def single_generation_confidence(lnll_lst, response_list, labels):
    return response_list[0], lnll_lst[0]

In [79]:
single_generation_confidence(lnll_lst, response_list, labels)

('Sydney.', 0.9696481227874756)

# 2. Empirical Semantic Confidence (E-SC)

From the $m$ responses grouped into $k$ clusters, we select the final output by identifying the most confident cluster $C^*$ and choosing the response within it with the highest $\text{LN-LL: } y^*= \text{argmax}_{y∈C^*}\bar{l}(y |x).$

In [80]:
def empirical_semantic_confidence(lnll_lst, response_list, labels):
    counts = Counter(labels)
    opt_cluster, opt_conf = max([(int(cluster_id), count/sum(counts.values())) for cluster_id, count in counts.items()], key=lambda x: x[1])
    optimal_response = max([(response_list[i], lnll_lst[i]) for i, label in enumerate(labels) if label == opt_cluster], key=lambda x: x[1])[0]
    return optimal_response, opt_conf

In [81]:
empirical_semantic_confidence(lnll_lst, response_list, labels)

('Sydney.', 1.0)

# 3. Likelihood-based Semantic Confidence (L-SC)

In [82]:
def likelihood_based_semantic_confidence(lnll_lst, response_list, labels):
    clustered = []
    # for each cluster calculat the following:
    for c in np.unique(labels):
        sum_ci = sum([lnll_lst[i] for i in range(len(lnll_lst)) if labels[i] == c]) # s(Ci | x) = sum(LN-LL)
        clustered.append((int(c), sum_ci))
    total_lsc = sum([x[1] for x in clustered])
    lsc = [(c[0], c[1] / total_lsc) for c in clustered]
    
    opt_cluster, opt_conf = max(lsc, key=lambda x: x[1])
    optimal_response = max([(response_list[i], lnll_lst[i]) for i, label in enumerate(labels) if label == opt_cluster], key=lambda x: x[1])[0]

    return optimal_response, opt_conf

In [83]:
likelihood_based_semantic_confidence(lnll_lst, response_list, labels)

('Sydney.', 1.0)

# 4. Mean Likelihood-based Semantic Confidence (ML-SC)

In [84]:
def mean_likelihood_based_semantic_confidence(lnll_lst, response_list, labels):
    clustered = []
    # for each cluster calculate s_bar(Ci | x):
    for c in np.unique(labels):
        sum_ci = sum([lnll_lst[i] for i in range(len(lnll_lst)) if labels[i] == c]) # s_bar(Ci | x) = sum(LN-LL) in cluster Ci
        clustered.append((int(c), sum_ci / Counter(labels)[c])) # s_bar(Ci | x) = sum(LN-LL) / count(Ci)
    total_mlsc = sum([x[1] for x in clustered])
    mlsc = [(c[0], c[1] / total_mlsc) for c in clustered]
    
    opt_cluster, opt_conf = max(mlsc, key=lambda x: x[1])
    optimal_response = max([(response_list[i], lnll_lst[i]) for i, label in enumerate(labels) if label == opt_cluster], key=lambda x: x[1])[0]

    return optimal_response, opt_conf

In [85]:
mean_likelihood_based_semantic_confidence(lnll_lst, response_list, labels)

('Sydney.', 1.0)

# 5. Bayesian Semantic Confidence (B-SC)

In [86]:
def bayesian_semantic_confidence(lnll_lst, response_list, labels):
    clustered = []
    for c in np.unique(labels):
        pi = Counter(labels)[c] / len(labels) # pi = count(Ci) / count
        joint_lnll = np.prod([lnll_lst[i] for i in range(len(lnll_lst)) if labels[i] == c]) # joint lnll = ∏(LN-LL) in cluster Ci
        clustered.append((int(c), joint_lnll * pi)) # joint_lnll * pi
    
    total_bsc = sum([x[1] for x in clustered])
    bsc = [(c[0], float(c[1] / total_bsc)) for c in clustered]

    opt_cluster, opt_conf = max(bsc, key=lambda x: x[1])
    optimal_response = max([(response_list[i], lnll_lst[i]) for i, label in enumerate(labels) if label == opt_cluster], key=lambda x: x[1])[0]

    return optimal_response, opt_conf

In [87]:
bayesian_semantic_confidence(lnll_lst, response_list, labels)

('Sydney.', 1.0)