# Classify with Llama

In [None]:
#!python -m pip install --upgrade pip
#!pip install dotenv

In [1]:
import transformers
import torch
from transformers import AutoTokenizer
import os
from dotenv import load_dotenv

load_dotenv()
token = os.getenv("HUGGINGFACE_TOKEN")

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'dotenv'

In [2]:
# Set Device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [3]:
token = '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'
model_id = "meta-llama/Meta-Llama-3-8B"
#model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
#model_id = "meta-llama/Meta-Llama-3-70B"
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
print("Tokenizer loaded successfully!")

Tokenizer loaded successfully!


In [4]:
from transformers import pipeline

pipeline = transformers.pipeline("text-generation", # LLM task
                                 model=model_id,
                                 model_kwargs={"torch_dtype": torch.float16},
                                 device=device,
                                 token=token,
                                 )

print("Model loaded successfully on GPU!" if device == "cuda" else "Model running on CPU.")

Downloading shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [06:21<00:00, 95.49s/it]
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.50it/s]


Model loaded successfully on GPU!


In [5]:
prompt_original = "Extract the verbs from the following sentence. \nReturn a list of verbs formatted as ['verb1', 'verb2', 'verb3', and so on]. If no verbs are present within the sentence, return []. \nSentence: \"a user may actuate the latch 300 by providing a force to the latch 300 directed away from the seat plate 204.\" \nOutput:"
prompt_no_line_breaks = "Extract the verbs from the following sentence. Output a list of verbs formatted as ['verb1', 'verb2', 'verb3', and so on]. If no verbs are present within the sentence, the output is []. Sentence: \"a user may actuate the latch 300 by providing a force to the latch 300 directed away from the seat plate 204.\" Output:"
prompt_base = "Extract all the verbs from the following sentence. \nSentence: \"a user may actuate the latch 300 by providing a force to the latch 300 directed away from the seat plate 204.\". \nReturn only the list of verbs."
print(prompt_base)

Extract all the verbs from the following sentence. 
Sentence: "a user may actuate the latch 300 by providing a force to the latch 300 directed away from the seat plate 204.". 
Return only the list of verbs.


In [6]:
response = pipeline(prompt_base)
response[0]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{'generated_text': 'Extract all the verbs from the following sentence. \nSentence: "a user may actuate the latch 300 by providing a force to the latch 300 directed away from the seat plate 204.". \nReturn only the list of verbs. \n'}

In [7]:
print(response[0]['generated_text'])

Extract all the verbs from the following sentence. 
Sentence: "a user may actuate the latch 300 by providing a force to the latch 300 directed away from the seat plate 204.". 
Return only the list of verbs. 



In [8]:
response_1 = pipeline(prompt_no_line_breaks)
print(response_1[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Extract the verbs from the following sentence. Output a list of verbs formatted as ['verb1', 'verb2', 'verb3', and so on]. If no verbs are present within the sentence, the output is []. Sentence: "a user may actuate the latch 300 by providing a force to the latch 300 directed away from the seat plate 204." Output: ['actuate']



In [13]:
import transformers
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import os

# Define the model ID
model_id = "meta-llama/Meta-Llama-3-8B"

device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(
        model_id, 
        torch_dtype=torch.float16, 
        local_files_only=True  # Force loading from local cache
    )

tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        local_files_only=True  # Prevent re-downloading
    )
print("Model successfully loaded from local cache!")

# Create a text generation pipeline
text_gen_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if device == "cuda" else -1
)


# Test the pipeline
output = text_gen_pipeline("What is the meaning of life?", max_length=100)
print(output)

Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.99it/s]


Model successfully loaded from local cache!


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': 'What is the meaning of life? This is a question that has been pondered by philosophers, theologians, and scientists for centuries. There is no one answer that is universally accepted, but there are many different perspectives on the matter. Some people believe that life has no meaning, while others believe that it is up to each individual to find their own meaning. There are also those who believe that life has a specific purpose or meaning, which is determined by a higher power or by nature itself.\n'}]


In [28]:
# Define a wrapper function over the method pipeline()
def get_llama_response(prompt: str) -> None:
    """
    Generate a response from the Llama model.
    Parameters:
        prompt (str): The user's input/question for the model.
    Returns:
        None: Prints the model's response.
    """
    sequences = pipeline(
        prompt,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=50,
    )
    print("Chatbot:", sequences[0]['generated_text'])

In [29]:
get_llama_response(prompt_base)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Chatbot: Extract all the verbs from the following sentence. 
Sentence: "a user may actuate the latch 300 by providing a force to the latch 300 directed away from the seat plate 204.". 
Return only the list of verbs. 


