In [8]:
import torch
from transformers import pipeline, AutoModel, AutoTokenizer, AutoModelForCausalLM

In [6]:
# Hub checkpoint used for the quick text-generation demo.
model_id = "meta-llama/Llama-3.2-1B"

# Build a text-generation pipeline in full (float32) precision.
# device_map="auto" leaves weight placement to the library; the captured log
# for this cell reports that some parameters were offloaded to the CPU.
pipe = pipeline(
    task="text-generation",
    model=model_id,
    device_map="auto",
    torch_dtype=torch.float32,
)

# Smoke test: continue a short prompt and display the generated text.
pipe("The key to life is")

Some parameters are on the meta device because they were offloaded to the cpu.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': 'The key to life is to be happy and to enjoy the moment. The key to happiness is to be in the moment.\n'}]

In [7]:
# Display the pipeline object's repr to confirm which pipeline class was built.
pipe

<transformers.pipelines.text_generation.TextGenerationPipeline at 0x1f16f323a60>

In [12]:
# Hub checkpoint from which the tokenizer and the model are loaded.
model_name = "meta-llama/Llama-3.2-1B"

# Batched tokenization with padding needs a pad token; fall back to the EOS
# token when the tokenizer does not define one.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Loading options: automatic device placement, float16 weights, and 8-bit
# loading explicitly switched off.
model_kwargs = dict(
    device_map="auto",
    torch_dtype=torch.float16,
    load_in_8bit=False,
)

# Load the model through AutoModel with the options above.
model = AutoModel.from_pretrained(model_name, **model_kwargs)

In [14]:
# Prefer the GPU when CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# A single-sentence batch to embed.
input_text = ["A painting of a squirrel eating a burger"]

# Tokenize into PyTorch tensors, padding the batch and truncating any
# sequence longer than 512 tokens.
inputs = tokenizer(
    input_text,
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors="pt",
)
# Move the encoded tensors onto the selected device.
inputs = inputs.to(device)

# Shape of the token-id tensor.
inputs["input_ids"].shape

torch.Size([1, 9])

In [16]:
# Run one forward pass without gradient tracking and mean-pool the final
# hidden states into one embedding vector per input sequence.
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

    # Last entry of the hidden-states tuple returned by the model.
    last_hidden_state = outputs.hidden_states[-1]
    print(last_hidden_state.shape)

    # Masked average pooling over the sequence dimension (padding excluded).
    attention_mask = inputs['attention_mask'].unsqueeze(-1)

    # Pool in float32: the model is requested in float16 at load time, and
    # summing half-precision activations over the sequence loses precision
    # and can overflow. As a result `embeddings` is float32.
    mask = attention_mask.to(torch.float32)
    summed = (last_hidden_state.to(torch.float32) * mask).sum(dim=1)

    # Clamp the token count so a fully masked row yields zeros rather than
    # NaN from a 0/0 division.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)

    embeddings = summed / token_counts
    print(embeddings.shape)

torch.Size([1, 9, 2048])
torch.Size([1, 2048])
