In [1]:
!pip install -q huggingface_hub

In [None]:
!pip install ipywidgets

In [7]:
import os
from huggingface_hub import InferenceClient

# Read the Hugging Face access token from a local text file, trimming surrounding whitespace
with open("tokens/huggingface.txt", "r") as token_file:
    hf_token = token_file.read().strip()

# Publish the token through the environment, then build the inference client with it
os.environ["HF_TOKEN"] = hf_token
client = InferenceClient(model="meta-llama/Llama-3.2-3B-Instruct", token=hf_token)

# Alternative public endpoint if needed
# client = InferenceClient("https://jc26mwg228mkj8dw.us-east-1.aws.endpoints.huggingface.cloud")

In [9]:
# Plain text completion: the sentence is sent as-is, with no chat formatting around it
output = client.text_generation(prompt="The capital of Italy is", max_new_tokens=10)
print(output)

 Rome, but the capital of Italy is actually Rome


In [10]:
# Same question, now wrapped in the Llama 3.2 special tokens that mark a user turn
# followed by the start of the assistant turn; the completion becomes the expected one.
prompt = (
    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n"
    "\n"
    "The capital of Italy is<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
    "\n"
)
output = client.text_generation(prompt, max_new_tokens=100)
print(output)

Rome


Without these tokens, the model might interpret the input as part of a continuous paragraph or a loose prompt, not a chat. So it could:

Repeat your input

Add extra explanations

Start a new dialogue

But with the special tokens, it strictly follows the turn-based chat format that the model was fine-tuned on, and just gives the assistant's turn — in this case, just the capital: Rome.

In [14]:
# Same question through the chat-completions API: role/content messages are passed
# instead of a hand-formatted prompt string.
output = client.chat.completions.create(
    max_tokens=1024,
    stream=False,
    messages=[{"role": "user", "content": "The capital of Italy is"}],
)

# Show the full list of choices first, then only the text of the first one
print(output.choices)
print(output.choices[0].message.content)

[ChatCompletionOutputComplete(finish_reason='stop', index=0, message=ChatCompletionOutputMessage(role='assistant', content='Rome.', tool_calls=None), logprobs=None)]
Rome.


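Normally you would use **chat.completions**, which takes role-based messages; the rest of this notebook uses **text_generation** instead, so that the special tokens and stop sequences can be written by hand.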

## Dummy Agent

In [15]:
# Here we suppose that the textual description of the tools has already been appended
SYSTEM_PROMPT = """Answer the following questions as best you can. You have access to the following tools:

get_weather: Get the current weather in a given location

The way you use the tools is by specifying a json blob.
Specifically, this json should have a `action` key (with the name of the tool to use) and a `action_input` key (with the input to the tool going here).

The only values that should be in the "action" field are:
get_weather: Get the current weather in a given location, args: {"location": {"type": "string"}}
example use :
```
{{
  "action": "get_weather",
  "action_input": {"location": "New York"}
}}

ALWAYS use the following format:

Question: the input question you must answer
Thought: you should always think about one action to take. Only one action at a time in this format:
Action:
```
$JSON_BLOB
```
Observation: the result of the action. This Observation is unique, complete, and the source of truth.
... (this Thought/Action/Observation can repeat N times, you should take several steps when needed. The $JSON_BLOB must be formatted as markdown and only use a SINGLE action at a time.)

You must always end your output with the following format:

Thought: I now know the final answer
Final Answer: the final answer to the original input question

Now begin! Reminder to ALWAYS use the exact characters `Final Answer:` when you provide a definitive answer. """

In [16]:
# text_generation sends raw text, so the special tokens that delimit the system,
# user and assistant turns are written out by hand around the prompt contents.
prompt = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
    f"{SYSTEM_PROMPT}\n"
    "<|eot_id|><|start_header_id|>user<|end_header_id|>\n"
    "What's the weather in London ?\n"
    "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
)

This is equivalent to the following code that happens inside the chat method :

messages=[
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": "What's the weather in London ?"},
]
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

tokenizer.apply_chat_template(messages, tokenize=False,add_generation_prompt=True)

If you run just the following, the model hallucinates the observation, since no real observation about London has been supplied.
output = client.text_generation(
    prompt,
    max_new_tokens=200,
)

print(output)

In [17]:
# Left to run freely, the model hallucinates the observation. Generation is therefore
# cut at "Observation:" so the function can actually be executed before continuing.
output = client.text_generation(prompt, max_new_tokens=200, stop=["Observation:"])
print(output)

Action:
```
{
  "action": "get_weather",
  "action_input": {"location": "London"}
}
```
Observation:


In [19]:
def get_weather(location):
    """Return a canned weather sentence for `location`.

    Dummy stand-in: a real implementation would call a weather API here.
    The returned sentence always ends with a space and a newline.
    """
    return "the weather in {} is sunny with low temperatures. \n".format(location)

get_weather('London')

'the weather in London is sunny with low temperatures. \n'

In [20]:
# Stitch together, in order: the base prompt, the model's completion up to the point
# where the function must run, and the function's result acting as the Observation.
new_prompt = "".join([prompt, output, get_weather('London')])
print(new_prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Answer the following questions as best you can. You have access to the following tools:

get_weather: Get the current weather in a given location

The way you use the tools is by specifying a json blob.
Specifically, this json should have a `action` key (with the name of the tool to use) and a `action_input` key (with the input to the tool going here).

The only values that should be in the "action" field are:
get_weather: Get the current weather in a given location, args: {"location": {"type": "string"}}
example use :
```
{{
  "action": "get_weather",
  "action_input": {"location": "New York"}
}}

ALWAYS use the following format:

Question: the input question you must answer
Thought: you should always think about one action to take. Only one action at a time in this format:
Action:
```
$JSON_BLOB
```
Observation: the result of the action. This Observation is unique, complete, and the source of truth.
... (this Thought/Action/

In [21]:
# Resume generation from the prompt that now contains the observation
final_output = client.text_generation(new_prompt, max_new_tokens=200)
print(final_output)

Final Answer: The current weather in London is sunny with low temperatures.


## Brooklyn Kiddo AI Agent spitting bullshit weather

In [None]:
import os
import json
from huggingface_hub import InferenceClient

# Read the Hugging Face token from a text file, trimming surrounding whitespace
with open("tokens/huggingface.txt", "r") as token_file:
    hf_token = token_file.read().strip()

# Inference client for the instruct model, authenticated with that token
client = InferenceClient(model="meta-llama/Llama-3.2-3B-Instruct", token=hf_token)


In [33]:
# System prompt with instructions
SYSTEM_PROMPT = """Answer the following questions as best you can. You have access to the following tools:

get_weather: Get the current weather in a given location

The way you use the tools is by specifying a json blob.
Specifically, this json should have a `action` key (with the name of the tool to use) and a `action_input` key (with the input to the tool going here).

The only values that should be in the "action" field are:
get_weather: Get the current weather in a given location, args: {"location": {"type": "string"}}
example use :
```
{{
"action": "get_weather",
"action_input": {"location": "New York"}
}}

ALWAYS use the following format:

Question: the input question you must answer
Thought: you should always think about one action to take. Only one action at a time in this format:
Action:
```
$JSON_BLOB
```
Observation: the result of the action. This Observation is unique, complete, and the source of truth.
... (this Thought/Action/Observation can repeat N times, you should take several steps when needed. The $JSON_BLOB must be formatted as markdown and only use a SINGLE action at a time.)

You must always end your output with the following format:

Thought: I now know the final answer
Final Answer: Say the exact same Observation, but rewrite it like you're a streetwise kid from Brooklyn. Keep every detail, but talk with attitude.
Use slang, contractions, and some flair — but never shorten or remove anything from the original meaning.

Now begin! Reminder to ALWAYS use the exact characters `Final Answer:` when you provide a definitive answer. """

# Template for the made-up weather generator; `{location}` is filled in with str.format().
FAKE_WEATHER_PROMPT = (
    "\n"
    "You are a surreal and absurd weather oracle.\n"
    "\n"
    "Always respond with ONE sentence describing an impossible, magical, or ridiculous weather scenario for the given location.\n"
    "\n"
    "Do not include any bullet points, lists, or multiple lines. Just ONE imaginative sentence.\n"
    "\n"
    "Location: {location}\n"
)

In [34]:
def run_agent(question):
    """Answer `question` with one tool-calling round trip against the model.

    The model is first asked which action to take (generation stops at
    ``Observation:``). The JSON action blob is parsed out of that reply, an
    observation is produced for it, and generation is resumed with the
    observation appended so the model can write its final answer. Each stage
    is printed; nothing is returned.

    Args:
        question: Text placed in the user turn of the prompt.

    Raises:
        ValueError: If no JSON object can be parsed from the model's first reply.
    """
    # Step 1: Build the base prompt to call a tool
    prompt = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
        f"{SYSTEM_PROMPT}\n"
        "<|eot_id|><|start_header_id|>user<|end_header_id|>\n"
        f"{question}\n"
        "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
    )

    # Ask the model what action to take
    stop_marker = "Observation:"
    first_response = client.text_generation(prompt, max_new_tokens=500, stop=[stop_marker])
    print("=== Agent Action Decision ===")
    print(first_response)

    # Step 2: Extract JSON (everything from the first '{' to the last '}')
    try:
        start = first_response.index('{')
        end = first_response.rindex('}') + 1
        action_blob = json.loads(first_response[start:end])
    except Exception as e:
        raise ValueError("❌ Failed to parse action JSON") from e

    print("action_blob[action]", action_blob["action"])
    # Step 3: Generate absurd weather using LLM
    if action_blob["action"] == "get_weather":
        location = action_blob["action_input"]["location"]

        # Instruct LLM to create surreal weather
        absurd_prompt = FAKE_WEATHER_PROMPT.format(location=location)
        absurd_response = client.text_generation(absurd_prompt, max_new_tokens=80).strip()

        # Remove quotes and leading garbage
        absurd_response = absurd_response.strip(' "\n')

        # Close the sentence only when the model did not already do so,
        # otherwise the observation ends with a doubled period.
        if not absurd_response.endswith((".", "!", "?")):
            absurd_response += "."

        # Join into observation
        observation = f"The weather in {location} is {absurd_response}"
        print("=== Absurd Observation ===")
        print(observation)
    else:
        observation = "Unknown action"

    # Step 4: Resume generation
    # The reply can end with the stop sequence itself; drop it so that
    # "Observation:" appears only once in the resumed prompt.
    transcript = first_response.rstrip()
    if transcript.endswith(stop_marker):
        transcript = transcript[: -len(stop_marker)].rstrip()

    # Parenthesised so every piece, including the trailing "Final Answer:",
    # is really part of the prompt (a string on its own line is a no-op statement).
    new_prompt = (
        prompt
        + transcript
        + f"\nObservation: {observation}\n"
        + "Thought: I now know the final answer\n"
        + "Final Answer:"
    )
    final_response = client.text_generation(new_prompt, max_new_tokens=100)

    print("=== Final Answer ===")
    print(final_response)




In [35]:
# Run the agent once on a sample question; run_agent prints each stage of the exchange
run_agent("What's the weather in China?")

=== Agent Action Decision ===
Question: What's the weather like in China?

Thought: I'ma check the weather in China, 'cause I wanna know what's poppin' over there.

Action:
```
{"action": "get_weather", "action_input": {"location": "China"}}
```
Observation:
action_blob[action] get_weather
=== Absurd Observation ===
The weather in China is A thick fog will descend upon the Great Wall of China, but instead of obscuring the ancient stones, it will transform them into a giant, glittering disco ball that reflects the light of a thousand fireflies..
=== Final Answer ===
Final Answer: Ah, China's weather is straight fire, foo'! It's like a thick fog is gonna roll in and turn the Great Wall into a sparkly disco ball, ya feel me?
