# Llama 3.2 3B Instruct

This model can be downloaded and deployed locally, despite its slow runtime.

# Pipeline

In [1]:
import json
import torch
import transformers
from huggingface_hub import snapshot_download, login
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Authenticate with the Hugging Face Hub using the token stored in the
# HF_TOKEN environment variable (raises KeyError if the variable is unset).
login(token=os.environ["HF_TOKEN"])

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
# Fetch the model repository snapshot from the Hugging Face Hub and keep the
# returned local path; later cells load the model from this directory.
local_dir = snapshot_download(repo_id="meta-llama/Llama-3.2-3B-Instruct")
# Bare expression so the notebook displays the resolved path.
local_dir

Fetching 16 files: 100%|██████████| 16/16 [00:00<00:00, 132104.06it/s]


'/Users/ivan/.cache/huggingface/hub/models--meta-llama--Llama-3.2-3B-Instruct/snapshots/0cb88a4f764b7a12671c53f0838cd831a0843b95'

In [5]:
# Build a text-generation pipeline from the local snapshot, loading the
# weights in float16 with low_cpu_mem_usage enabled and letting
# device_map="auto" decide where the weights are placed.
# The annotation uses the `transformers.Pipeline` class; `transformers.pipeline`
# is the factory function, not a type.
pipeline: transformers.Pipeline = transformers.pipeline(
    "text-generation",
    model=local_dir,
    device_map="auto",
    model_kwargs={"torch_dtype": torch.float16, "low_cpu_mem_usage": True},
)
# Token ids that end generation: the tokenizer's EOS token and the id of the
# "<|eot_id|>" token.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.33s/it]
Some parameters are on the meta device because they were offloaded to the disk.
Device set to use mps


In [6]:
def _generate_from_messages(messages, max_tokens, temperature, top_p):
    """Render chat messages into a prompt, sample a completion, and return only the new text.

    Uses the notebook-level `pipeline` and `terminators` objects.

    Args:
        messages: list of {"role": ..., "content": ...} dicts in conversation order.
        max_tokens: upper bound passed to the pipeline as `max_new_tokens`.
        temperature: sampling temperature.
        top_p: nucleus-sampling threshold.

    Returns:
        The generated text with the leading prompt removed.
    """
    prompt = pipeline.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    outputs = pipeline(
        prompt,
        max_new_tokens=max_tokens,
        eos_token_id=terminators,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )
    # `generated_text` starts with the prompt itself, so drop that prefix.
    return outputs[0]["generated_text"][len(prompt):]


# takes a str query and returns str responses
def get_response(query, message_history=None, max_tokens=2048, temperature=0.6, top_p=0.9):
    """Send a single user query (optionally preceded by earlier messages) and return the reply.

    Args:
        query: the user's message text.
        message_history: optional earlier messages as {"role", "content"} dicts;
            it is copied, never modified. Defaults to no history.
        max_tokens: maximum number of new tokens to generate.
        temperature: sampling temperature.
        top_p: nucleus-sampling threshold.

    Returns:
        The model's reply as a string.
    """
    # `None` default instead of a shared mutable `[]`; copy so the caller's list is untouched.
    messages = list(message_history or []) + [{"role": "user", "content": query}]
    return _generate_from_messages(messages, max_tokens, temperature, top_p)


# take a list of messages (dict[str, any] -> role, content) and a response
def response_from_msg_list(msg_list: list[dict], message_history=None, max_tokens=2048, temperature=0.6, top_p=0.9):
    """Generate a reply for a conversation given as compact {role: content} dicts.

    Each dict in `msg_list` is expanded into one {"role", "content"} message per
    key/value pair, in order, and appended after `message_history`.

    Args:
        msg_list: list of dicts mapping a role name to that role's message text.
        message_history: optional earlier messages as {"role", "content"} dicts;
            it is copied, never modified. Defaults to no history.
        max_tokens: maximum number of new tokens to generate.
        temperature: sampling temperature.
        top_p: nucleus-sampling threshold.

    Returns:
        The model's reply as a string.
    """
    flat_msg_list = [
        {"role": role, "content": content}
        for msg in msg_list
        for role, content in msg.items()
    ]
    messages = list(message_history or []) + flat_msg_list
    # Debug aid kept from the original: show exactly what is sent to the model.
    print("message list:", messages)
    return _generate_from_messages(messages, max_tokens, temperature, top_p)

In [7]:
# Smoke test of get_response: ask a factual question and request a
# self-reported confidence percentage alongside the answer.
get_response("What is the capital of Canada in one word and what's your confidence as a percentage in parentheses that this is the correct answer?")

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


'Ottawa (95%)'

# Extract Logits

In [257]:
from transformers import LlamaTokenizer, LlamaForCausalLM, AutoTokenizer
import torch
from huggingface_hub import snapshot_download
import torch.nn.functional as F
import numpy as np
import math

In [5]:
# Load tokenizer and model
# Resolve the local snapshot directory for the checkpoint (same repo id as the
# Pipeline section above).
local_dir = snapshot_download(repo_id="meta-llama/Llama-3.2-3B-Instruct")
# Load the weights in float16 with low_cpu_mem_usage enabled, then load the
# tokenizer from the same snapshot directory.
model = LlamaForCausalLM.from_pretrained(local_dir, torch_dtype=torch.float16, low_cpu_mem_usage=True)
tokenizer = AutoTokenizer.from_pretrained(local_dir)

Fetching 16 files: 100%|██████████| 16/16 [00:00<00:00, 271695.81it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:16<00:00,  8.31s/it]


In [324]:
# Tokenize a raw prompt (no chat template) and generate a short continuation,
# keeping the per-step logits so token probabilities can be inspected afterwards.
text = "Whats the capital of Canada in as few words as possible?"
inputs = tokenizer(text, return_tensors="pt")

# Inference only: no gradients needed.
with torch.no_grad():
    outputs = model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly; generate() warns that results may be
        # unreliable when it has to guess it.
        attention_mask=inputs["attention_mask"],
        max_new_tokens=5,
        return_dict_in_generate=True,  # return an object exposing .sequences / .logits
        output_logits=True,
        output_scores=True,
        top_p=0.9,
    )

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [325]:
# Decode only the newly generated tokens: slice off the first
# inputs["input_ids"].shape[1] positions (the prompt) from each sequence.
tokenizer.batch_decode(outputs.sequences[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)

[' \nOttawa.']

In [326]:
def single_generation_confidence(logits):
    """Return the geometric mean of the top-token probability over all generation steps.

    For each step the logits are turned into log-probabilities over the last
    (vocabulary) axis and the largest one is taken; the mean of those values is
    exponentiated. Note that this scores the most likely token at each step,
    which is not necessarily the token that was actually sampled.

    Args:
        logits: sequence of tensors, one per generated step, whose last axis
            indexes the vocabulary (e.g. `outputs.logits` from `generate`).
            Intended for a single generation; with several rows per step the
            maximum is taken over all of them.

    Returns:
        A float in (0, 1].

    Raises:
        ZeroDivisionError: if `logits` is empty.
    """
    n = len(logits)
    total_log_prob = 0.0
    for step_logits in logits:
        # dim=-1 is always the vocabulary axis. The previous dim=len(logit)
        # only matched it when the leading (batch) dimension happened to be 1.
        # Upcast so the log-softmax and the running sum are not computed in
        # half precision if the model produced float16 logits.
        total_log_prob += torch.log_softmax(step_logits.float(), dim=-1).max().item()
    return math.exp(total_log_prob / n)

In [327]:
# Score the generation above from its per-step logits.
single_generation_confidence(outputs.logits)

0.6341904959597433

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# Sample texts
# The list needs at least `num_clusters` entries or KMeans.fit raises; these
# are the six sentences shown in this cell's saved output.
texts = [
    "I enjoy cooking pasta",
    "Pasta is delicious",
    "I love programming in Python",
    "Python is a great programming language",
    "Data science and machine learning are interesting",
    "I like reading about artificial intelligence",
]

# Load pre-trained SentenceTransformer model
# NOTE: this rebinds `model`, which the logit-extraction cells use for the
# Llama checkpoint; re-run the model-loading cell before generating again.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for all texts
embeddings = model.encode(texts)

# Perform K-means clustering (fixed random_state so cluster assignment is repeatable)
num_clusters = 2
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(embeddings)

# Assign texts to clusters: labels_[i] is the cluster index of texts[i]
clusters = kmeans.labels_

# Print out the texts in each cluster
for cluster_num in range(num_clusters):
    print(f"Cluster {cluster_num}:")
    for i, label in enumerate(clusters):
        if label == cluster_num:
            print(f"- {texts[i]}")
    print("\n")

Cluster 0:
- I enjoy cooking pasta
- Pasta is delicious


Cluster 1:
- I love programming in Python
- Python is a great programming language
- Data science and machine learning are interesting
- I like reading about artificial intelligence




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
