In [1]:
# Libraries

from transformers import AutoModelForCausalLM, AutoTokenizer
import re

In [2]:
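# Agent Loop Components (ReAct-style, adapted here for self-reflection)

# Thought (Reason)
# Action (Tools) -> replaced in this notebook by GENERATION (draft answer; no tools are called)
# Observation (Tool Output) -> replaced in this notebook by REFLECTION (numeric self-score, 1-10)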

In [3]:
# Load the chat model and its matching tokenizer from the same checkpoint
model_name = "Qwen/Qwen2.5-7B-Instruct"

# "auto" leaves the dtype choice and the device placement to the library
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


In [4]:
# System prompt for the agent.
#
# It tells the model to emit exactly one step per turn (THOUGHT, GENERATION,
# REFLECTION or ANSWER) and to stop after each one, and it includes two worked
# examples of the full sequence. The driver loop in this notebook branches on
# replies that start with "REFLECTION:" or "ANSWER:", so those labels must stay
# in sync with the loop. The trailing .strip() drops the leading and trailing
# newlines introduced by the triple-quoted literal.

system_prompt = """
You are to proceed one step at a time and stop after each step. Each step should be completed individually. After generating "THOUGHT," "GENERATION," or "REFLECTION," you should **WAIT** until prompted to continue.

**Process**:
1. Start with **THOUGHT:** to describe the next logical step. WAIT afterward.
2. Proceed to **GENERATION:** if prompted, using the previous thought. WAIT afterward.
3. Move to **REFLECTION:** to rate the output on a **scale from 1 to 10** based on its quality or correctness (1 = very poor, 10 = perfect). Only provide the numeric score with no additional text. WAIT afterward.
4. End with **ANSWER:** only when prompted and confident that the final answer is ready.

**DO NOT proceed automatically** to the next step without waiting. Each step must end with **WAIT.**

Examples:

**Query:** "What is the capital of France?"

THOUGHT: I need to recall the capital city of France.
WAIT

**Prompt to continue**

GENERATION: The capital of France is Paris.
WAIT

**Prompt to continue**

REFLECTION: 10
WAIT

**Prompt to continue**

ANSWER: Paris

**Query:** "Solve 8 + 15."

THOUGHT: I should perform the addition of 8 and 15.
WAIT

**Prompt to continue**

GENERATION: The result of 8 + 15 is 23.
WAIT

**Prompt to continue**

REFLECTION: 10
WAIT

**Prompt to continue**

ANSWER: 23

Now, proceed with one step at a time, using **WAIT** after each.
""".strip()



In [5]:
# Numerical Self Reflection Based Agent 

class SelfReflectionAgent:
    """Stateful chat wrapper around a causal language model and its tokenizer.

    The agent keeps the whole conversation in ``self.messages`` (a list of
    ``{"role": ..., "content": ...}`` dicts) and re-renders it through the
    tokenizer's chat template on every call, so each reply is conditioned on
    all previous turns.

    Args:
        model: Object exposing ``.device`` and ``.generate(**inputs, ...)``.
        tokenizer: Object exposing ``apply_chat_template``, ``batch_decode``
            and a ``__call__`` that returns a batch with ``input_ids``.
        system: Optional system prompt; when non-empty it becomes the first
            message of the history.
        max_tokens: Upper bound passed to ``generate`` as ``max_new_tokens``.
    """

    # Sentinel handed back to the caller when generation fails.
    ERROR_RESPONSE = "Error: Unable to generate a response."

    def __init__(self, model, tokenizer, system: str = "", max_tokens: int = 512) -> None:
        self.model = model
        self.tokenizer = tokenizer
        self.system = system
        self.max_tokens = max_tokens
        self.messages: list = []

        # Initialize with system message if provided
        if self.system:
            self.messages.append({"role": "system", "content": system})

    def __call__(self, message: str) -> str:
        """Send one user turn and return the model's reply.

        An empty ``message`` adds no user turn; the model is simply asked to
        continue from the current history.

        On success both the user turn and the reply are kept in the history.
        On failure the history is restored to what it was before the call and
        ``ERROR_RESPONSE`` is returned, so the error sentinel is never fed
        back to the model as if it had said it, and the caller can retry.
        """
        user_turn_added = bool(message)
        if user_turn_added:
            self.messages.append({"role": "user", "content": message})

        try:
            result = self._generate()
        except Exception as e:
            print(f"Error during execution: {e}")
            if user_turn_added:
                # Roll back the unanswered user turn so a retry does not
                # leave two copies of it in the transcript.
                self.messages.pop()
            return self.ERROR_RESPONSE

        # Append assistant's response to message history
        self.messages.append({"role": "assistant", "content": result})
        return result

    def execute(self) -> str:
        """Generates the next response from the model based on the conversation history.

        Does not modify ``self.messages``. Any exception is reported with
        ``print`` and converted into ``ERROR_RESPONSE``.
        """
        try:
            return self._generate()
        except Exception as e:
            print(f"Error during execution: {e}")
            return self.ERROR_RESPONSE

    def _generate(self) -> str:
        """Run one generation pass over the current history; raises on failure."""
        # Prepare input text and tokenize
        text = self.prepare_input_text()
        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)

        # Generate response
        generated_ids = self.model.generate(
            **model_inputs,
            temperature=0.1,
            max_new_tokens=self.max_tokens
        )

        # generate() returns prompt + completion; slice the prompt off each row
        completions = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        # Decode the single sequence in the batch
        return self.tokenizer.batch_decode(completions, skip_special_tokens=True)[0]

    def prepare_input_text(self) -> str:
        """Prepares the input text using the tokenizer's chat template."""
        return self.tokenizer.apply_chat_template(
            self.messages,
            tokenize=False,
            add_generation_prompt=True
        )

In [6]:
def run_agent_loop(query, max_llm_calls=5, agent_fn=None, min_score=8):
    """Drive the THOUGHT / GENERATION / REFLECTION / ANSWER loop for one query.

    The agent is first called with ``query``. After that, each reply decides
    the next prompt:

    * a reply starting with ``"ANSWER:"`` ends the loop and is returned;
    * a reply starting with ``"REFLECTION:"`` is parsed for its numeric
      score: at or above ``min_score`` the agent is told
      ``"proceed to answer"``, otherwise (or when no number can be found)
      ``"proceed to thought"``;
    * any other reply gets a plain ``"proceed"``.

    Every follow-up reply is printed so the trace is visible in the cell
    output.

    Args:
        query: The user question to send as the first message.
        max_llm_calls: Maximum number of follow-up calls after the initial one.
        agent_fn: Callable mapping a prompt string to a reply string. When
            ``None``, the notebook-level ``agent`` object is used.
        min_score: Lowest reflection score that is accepted as good enough
            to move on to the final answer.

    Returns:
        The ``"ANSWER: ..."`` reply, or ``None`` if the call budget ran out
        before the agent produced one.
    """
    ask = agent_fn if agent_fn is not None else agent

    response = ask(query)
    llm_calls = 0
    while True:
        # Check for the answer first so that a reply produced by the very
        # last allowed call is still honored.
        if response.startswith("ANSWER:"):
            return response
        if llm_calls >= max_llm_calls:
            return None

        if response.startswith("REFLECTION:"):
            # Take the first integer after the label; tolerates "8/10", "7." etc.
            found = re.search(r"\d+", response[len("REFLECTION:"):])
            if found is not None and int(found.group()) >= min_score:
                follow_up = "proceed to answer"
            else:
                follow_up = "proceed to thought"
        else:
            follow_up = "proceed"

        # Exactly one call per iteration; its reply drives the next iteration.
        llm_calls += 1
        response = ask(follow_up)
        print(response)

In [7]:
# Ask the agent a sample question and keep the final reply

initial_query = "What is 3.95 times diameter of earth squared?"

# A fresh agent per run, so the conversation history starts from the system prompt only
agent = SelfReflectionAgent(
    model=model,
    tokenizer=tokenizer,
    system=system_prompt,
    max_tokens=512,
)
answer = run_agent_loop(initial_query)

GENERATION: The Earth's diameter is approximately 12,742 kilometers. Squaring this value gives \(12,742^2 = 162,312,564\) square kilometers. Now, multiplying this by 3.95 gives \(3.95 \times 162,312,564 = 641,226,545.8\) square kilometers.
WAIT
REFLECTION: 10
WAIT
ANSWER: 641,226,545.8


In [8]:
# Show the value returned by run_agent_loop for the query above
answer

'ANSWER: 641,226,545.8'