In [1]:
!pip install numexpr text_generation -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
import os
import numpy as np
import pandas as pd
import datasets
import numexpr  # noqa: F401
import math
from transformers import Tool
from dotenv import load_dotenv

load_dotenv(override=True)

True

In [4]:
def split_answer(row):
    splitted = row["answer"].split("####")
    row["true_reasoning"] = splitted[0]
    row["true_answer"] = float(splitted[1].strip())
    return row


math_dataset = (
    datasets.load_dataset("gsm8k", "main")["train"].shuffle(seed=42).select(range(100))
)
math_dataset = pd.DataFrame(math_dataset)
math_dataset = math_dataset.apply(split_answer, axis=1)
math_dataset = math_dataset.drop(columns=["answer"])
math_dataset = datasets.Dataset.from_pandas(math_dataset)

eval_dataset = math_dataset.select(range(30))
eval_df = pd.DataFrame(eval_dataset)

In [5]:
def fake_llm(prompt: str, stop=None) -> str:
    if "special_marker" not in prompt:
        return """
Thought: I should multiply 2 by 3.6452. special_marker
Action:
{
    "action": "calculator",
    "action_input": {"expression": "2*3.6452"}
}
        """
    else:  # We're at step 2
        return """
Thought: I can now answer the initial question
Action:
{
    "action": "final_answer",
    "action_input": {"expression": "3.14159"}
}
            """

In [6]:
class CalculatorTool(Tool):
    name = "calculator"
    description = "This is a tool that calculates. It can be used to perform simple arithmetic operations. The variables used CANNOT be placeholders like 'x' or 'mike's age', they must be numbers"

    inputs = {"expression": str}
    outputs = {"calculator_result": str}

    def __call__(self, expression):
        local_dict = {"pi": math.pi, "e": math.e}
        output = str(
            numexpr.evaluate(
                expression.strip(),
                global_dict={},  # restrict access to globals
                local_dict=local_dict,  # add common mathematical functions
            )
        )
        return output

In [7]:
from langchain_community.llms import HuggingFaceEndpoint
from langchain.chat_models import ChatHuggingFace

llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    max_new_tokens=3000,
)
llm_engine = ChatHuggingFace(llm=llm)


def call_llm(input: str, stop=["Observation", "Final Answer"]) -> str:
    return llm_engine.invoke(input, stop=stop).content

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/aymeric/.cache/huggingface/token
Login successful


### Evaluation functions

In [8]:
import numpy as np
import re


def extract_numbers(output):
    if isinstance(output, float) or isinstance(output, int):
        return [output]
    try:
        found_strings = [el.strip() for el in re.findall(r"(?:[,\d]+.?\d*)", output)]

        found_strings = [
            "".join(ch for ch in el if (ch.isalnum() or ch == "."))
            for el in found_strings
            if el[0].isdigit() or el[0] == "."
        ]
        found_strings = [float(el) for el in found_strings if len(el) > 0]

        return found_strings

    except Exception as e:
        print("Error when extracting string:", e)
        return []


def score_any_match(prediction, true_answer):
    extracted_numbers = extract_numbers(prediction)
    found_match = any(
        [
            np.isclose(extracted_number, true_answer, rtol=0.1)
            for extracted_number in extracted_numbers
        ]
    )
    return found_match


def score_last_match(prediction, true_answer):
    extracted_numbers = extract_numbers(prediction)
    if len(extracted_numbers) == 0:
        return False
    else:
        extracted_number = extracted_numbers[-1]
    return np.isclose(extracted_number, true_answer, rtol=0.1)

# Test Code Agent

In [9]:
from transformers import CodeAgent

code_agent = CodeAgent(llm_callable=call_llm, toolbox=[CalculatorTool()])

In [10]:
output = code_agent.run("What is 2 multiplied by 3.6452?")
output



=====Parse output
==Explanation from the agent==
I will use the following  tool: `calculator` to perform the arithmetic operation.


==Code generated by the agent==
result = calculator(expression="2 * 3.6452")
print(f"The result is {result}.")


==Execution==
The result is 7.2904.


'7.2904'

In [11]:
def get_answers(agent, questions: pd.Series):
    return questions.apply(lambda x: agent.run(x))


answers_code = get_answers(code_agent, eval_df["question"])



=====Parse output
==Explanation from the agent==
I will use the following  tools: `calculator` to perform the necessary calculations and find the total number of seashells Kyle found, then use `calculator` again to find one-third of that number, which is the number of seashells Leigh grabbed.


==Code generated by the agent==
mimi_shells = 2 * 12  # Mimi picked up 2 dozen seashells
kyle_shells = 2 * mimi_shells  # Kyle found twice as many shells as Mimi
leigh_shells = kyle_shells / 3  # Leigh grabbed one-third of the shells Kyle found
print(f"Leigh had {leigh_shells} seashells.")


==Execution==
Evaluation of the code stopped at line 0 before the end because of the following error:
BinOp is not supported.


=====Parse output
==Explanation from the agent==
I will use the following  tool: `calculator` to perform the necessary arithmetic operations to solve the problem.


==Code generated by the agent==
# Let x be the number of cats
# Frankie has six more snakes than he has cats, so he 

In [12]:
eval_df["predictions"] = answers_code
answer_any_match = eval_df.apply(
    lambda row: score_any_match(row["predictions"], row["true_answer"]), axis=1
)
answer_last_match = eval_df.apply(
    lambda row: score_last_match(row["predictions"], row["true_answer"]), axis=1
)
print("\nResults:")
answer_any_match.mean(), answer_last_match.mean()

Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Erro

(0.2, 0.2)

# Test ReAct Agent

In [13]:
from transformers import ReactAgent

react_agent = ReactAgent(
    llm_callable=call_llm, toolbox=[CalculatorTool()], max_iterations=7
)

In [14]:
output = react_agent.run("What is 2 multiplied by 3.6452?")
output

=====Calling LLM with this prompt:=====
Solve the following task as best you can. You have access to the following tools:

- calculator: This is a tool that calculates. It can be used to perform simple arithmetic operations. The variables used CANNOT be placeholders like 'x' or 'mike's age', they must be numbers
     Takes inputs: {'expression': <class 'str'>}

- final_answer: Provides a final answer to the given problem
     Takes inputs: {'answer': <class 'str'>}


The way you use the tools is by specifying a json blob.
Specifically, this json should have a `action` key (name of the tool to use) and a `action_input` key (input to the tool).

The value in the "action" field should belong to this list: calculator, final_answer.

The $ACTION_JSON_BLOB should only contain a SINGLE action, do NOT return a list of multiple actions. It should be formatted in markdown. Do not try to escape scpecial characters. Here is an example of a valid $ACTION_JSON_BLOB:
```json
{
  "action": $TOOL_NAME,

'7.2904'

In [17]:
answers_react = get_answers(react_agent, eval_df["question"])

=====Calling LLM with this prompt:=====
Solve the following task as best you can. You have access to the following tools:

- calculator: This is a tool that calculates. It can be used to perform simple arithmetic operations. The variables used CANNOT be placeholders like 'x' or 'mike's age', they must be numbers
     Takes inputs: {'expression': <class 'str'>}

- final_answer: Provides a final answer to the given problem
     Takes inputs: {'answer': <class 'str'>}


The way you use the tools is by specifying a json blob.
Specifically, this json should have a `action` key (name of the tool to use) and a `action_input` key (input to the tool).

The value in the "action" field should belong to this list: calculator, final_answer.

The $ACTION_JSON_BLOB should only contain a SINGLE action, do NOT return a list of multiple actions. It should be formatted in markdown. Do not try to escape scpecial characters. Here is an example of a valid $ACTION_JSON_BLOB:
```json
{
  "action": $TOOL_NAME,

In [19]:
eval_df["predictions"] = answers_react
answer_any_match = eval_df.apply(
    lambda row: score_any_match(row["predictions"], row["true_answer"]), axis=1
)
answer_last_match = eval_df.apply(
    lambda row: score_last_match(row["predictions"], row["true_answer"]), axis=1
)
print("\nResults:")
answer_any_match.mean(), answer_last_match.mean()

Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'
Error when extracting string: expected string or bytes-like object, got 'NoneType'

Res

(0.6333333333333333, 0.6)

# Baseline: LangChain agent

In [20]:
SYSTEM_PROMPT = """Answer the following questions as best you can. You have access to the following tools:

{tool_description_with_args}

The way you use the tools is by specifying a json blob.
Specifically, this json should have a `action` key (name of the tool to use) and a `action_input` key (input to the tool).

The only values that should be in the "action" field are: {tool_names}

The $JSON_BLOB should only contain a SINGLE action and MUST be formatted as markdown, do NOT return a list of multiple actions. Here is an example of a valid $JSON_BLOB:

```
{{
  "action": $TOOL_NAME,
  "action_input": $INPUT
}}
```
Make sure to have the $INPUT in the right format for the tool you are using, and do not put variable names as input if you can find the right values.

You will be given:

Question: the input question you must answer

You should ALWAYS use the following format:

Thought: you should always think about one action to take. Then use the action as follows:
Action:
```
$JSON_BLOB
```
Observation: the result of the action
... (this Thought/Action/Observation can repeat N times, you should take several steps when needed. The $JSON_BLOB must only use a SINGLE action at a time.)

You must always end your output with the following format:

Thought: I now know the final answer.
Final Answer: the final answer to the original input question

ALWAYS use the exact characters `Final Answer:` when you provide a definitive answer, and provide no additional explanations in the final answer: only the answer. MAKE SURE TO PROVIDE ONLY ONE ANSWER IN THE PROPER UNIT.

Now begin! """


HUMAN_PROMPT = "Question: {input}"

SCRATCHPAD_PROMPT = "{agent_scratchpad}"

In [21]:
from langchain.agents import AgentExecutor, load_tools
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    MessagesPlaceholder,
)
from langchain.schema import HumanMessage

from langchain.agents.output_parsers import ReActJsonSingleInputOutputParser
from langchain.tools.render import render_text_description_and_args
from langchain.agents.format_scratchpad import format_log_to_str


def init_tools_with_llm(llm):
    tools = load_tools(["llm-math"], llm=llm)
    tools[0].name = "calculator"
    return tools


def build_hf_agent_with_tools(
    hf_endpoint_url: str = None, repo_id: str = None
) -> AgentExecutor:
    """
    Build a zero-shot ReAct chat agent from HF endpoint.

    Args:
        hf_endpoint_url (str): The endpoint URL for the Hugging Face model.

    Returns:
        AgentExecutor: An agent executor object that can be used to run the agent.

    """
    assert hf_endpoint_url or repo_id, "hf_endpoint_url or repo_id must be provided."
    assert not (
        hf_endpoint_url and repo_id
    ), "Only one of hf_endpoint_url or repo_id can be provided."

    # instantiate LLM and chat model
    if hf_endpoint_url:
        llm = HuggingFaceEndpoint(
            endpoint_url=hf_endpoint_url,
            task="text-generation",
            max_new_tokens=512,
            do_sample=False,
            repetition_penalty=1.03,
        )
    else:
        llm = HuggingFaceEndpoint(
            repo_id=repo_id,
            task="text-generation",
            max_new_tokens=512,
            do_sample=False,
            repetition_penalty=1.03,
        )

    chat_model = ChatHuggingFace(llm=llm)
    tools = init_tools_with_llm(llm)

    prompt = ChatPromptTemplate.from_messages(
        [
            HumanMessagePromptTemplate.from_template(
                SYSTEM_PROMPT + "\nSo, here is my question:" + HUMAN_PROMPT
            ),
            AIMessagePromptTemplate.from_template(SCRATCHPAD_PROMPT),
            HumanMessage(content="Now give your next thoughts: "),
        ]
    )

    prompt = prompt.partial(
        tool_description_with_args=render_text_description_and_args(tools),
        tool_names=", ".join([t.name for t in tools]),
    )

    # define the agent
    chat_model_with_stop = chat_model.bind(stop=["\nObservation"])
    agent = (
        {
            "input": lambda x: x["input"],
            "agent_scratchpad": lambda x: format_log_to_str(x["intermediate_steps"]),
        }
        | prompt
        | chat_model_with_stop
        | ReActJsonSingleInputOutputParser()
    )

    return AgentExecutor(
        agent=agent,
        tools=tools,
        verbose=True,
        return_intermediate_steps=True,
        handle_parsing_errors=True,
        max_iterations=5,
    )

In [22]:
langchain_agent = build_hf_agent_with_tools(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1"
)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/aymeric/.cache/huggingface/token
Login successful


In [23]:
def call_langchain(x):
    try:
        return langchain_agent.invoke({"input": x})["output"]
    except Exception as e:
        return ""


answers_langchain = eval_df["question"].apply(call_langchain)



[1m> Entering new AgentExecutor chain...[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m Thought: This question involves multiple types of pets and a relationship between their numbers. To answer it, I first need to understand the relationships between the number of snakes, cats, and parrots. Then, I can calculate the total number of pets, including the dogs.

Action:
```json
{
  "action": "calculator",
  "action_input": {
    "expression": "6 (snakes) - 1 (parrot) = cats"
  }
}
```[0m[36;1m[1;3mAnswer: 5[0m[32;1m[1;3m Thought: Now I know that Frankie has 5 cats. Using this information, I can calculate the number of snakes and parrots. After that, I can add the number of dogs to find out the total number of pets.

Action:
```json
{
  "action": "calculator",
  "action_input": {
    "expression": "cats + 1 (parrot) = number of birds"
  }
}
```[0m[36;1m[1;3mAnswer: 3[0m[32;1m[1;3m Thought: Now I know that Frankie has 3 parrots and 5 cats. I can now calcula

In [25]:
eval_df["predictions_langchain"] = answers_langchain
answer_any_match = eval_df.apply(
    lambda row: score_any_match(row["predictions_langchain"], row["true_answer"]),
    axis=1,
)
answer_last_match = eval_df.apply(
    lambda row: score_last_match(row["predictions_langchain"], row["true_answer"]),
    axis=1,
)
print("\nResults:")
answer_any_match.mean(), answer_last_match.mean()


Results:


(0.43333333333333335, 0.4)

##### Results as of 29 February
- Langchain Agent: any_match = 0.43, last_match = 0.4
- HF ReactAgent: any_match = 0.63, last_match = 0.6
- HF CodeAgent: any_match = 0.20, last_match = 0.20