In [1]:
import os

# Move to the repository root so that the `MLAgentBench_v2` package is importable.
# This notebook lives three levels below the root (MLAgentBench_v2/agents/Planner).
# Re-running the cell must not climb any further, so only go up when the package
# directory is not already visible from the current working directory.
if os.path.isdir(os.path.join(os.getcwd(), 'MLAgentBench_v2')):
    new_working_directory = os.getcwd()  # Already at the MLAgentBench root
else:
    # abspath collapses the '..' components so the printed path is readable
    new_working_directory = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..'))  # Set to MLAgentBench
print("New working directory: ", new_working_directory)

# Change the current working directory
os.chdir(new_working_directory)
print("New Working Directory:", os.getcwd()) # Should be ...\MLAgentBench

New working directory:  c:\Users\kevihuang\OneDrive - Microsoft\Desktop\projects\MLAgentBench\MLAgentBench_v2\agents\Planner\..\..\..
New Working Directory: c:\Users\kevihuang\OneDrive - Microsoft\Desktop\projects\MLAgentBench


In [2]:
# General Environment and Agent imports
from MLAgentBench_v2.agents.agent import Agent
from types import SimpleNamespace
from MLAgentBench_v2.environment import Environment
from datetime import date

# Agent specific imports
import json
import os

No module named 'helm'
Could not load CRFM API key crfm_api_key.txt.
[Errno 2] No such file or directory: 'claude_api_key.txt'
Could not load anthropic API key claude_api_key.txt.


In [3]:
# Instantiate an environment
# Run configuration: the values below are passed to Environment further down in
# this cell and are also combined into the name of the log directory.
TASK = 'ai-mathematical-olympiad-prize'  # passed to the environment as `task`
MODEL = 'gpt-3.5-turbo-0125'  # passed to the environment as `llm_name`
AGENT_NAME = 'planner'  # appears in the log directory name
VERSION = '0'  # appears in the log directory name as v<VERSION>
# Task description handed to the environment as `research_problem`.
# The text is kept verbatim; edit it only if the competition description changes.
RESEARCH_PROBLEM = '''
Overview
The goal of this competition is to create algorithms and models that can solve tricky math problems written in LaTeX format. Your participation will help to advance AI models’ mathematical reasoning skills and drive frontier knowledge.

Description
The ability to reason mathematically is a critical milestone for AI. Mathematical reasoning is the foundation for solving many complex problems, from engineering marvels to intricate financial models. However, current AI capabilities are limited in this area.

The AI Mathematical Olympiad (AIMO) Prize is a new $10mn prize fund to spur the open development of AI models capable of performing as well as top human participants in the International Mathematical Olympiad (IMO).
This competition includes 110 problems similar to an intermediate-level high school math challenge. The Gemma 7B benchmark for these problems is 3/50 on the public and private test sets.

The assessment of AI models' mathematical reasoning skills faces a significant hurdle, the issue of train-test leakage. Models trained on Internet-scale datasets may inadvertently encounter test questions during training, skewing the evaluation process.

To address this challenge, this competition uses a dataset of 110 novel math problems, created by an international team of problem solvers, recognizing the need for a transparent and fair evaluation framework. The dataset encompasses a range of difficulty levels, from simple arithmetic to algebraic thinking and geometric reasoning. This will help to strengthen the benchmarks for assessing AI models' mathematical reasoning skills, without the risk of contamination from training data.

This competition offers an exciting opportunity to benchmark open AI models against each other and foster healthy competition and innovation in the field. By addressing this initial benchmarking problem, you will contribute to advancing AI capabilities and help to ensure that its potential benefits outweigh the risks.

Join us as we work towards a future where AI models’ mathematical reasoning skills are accurately and reliably assessed, driving progress and innovation across industries.

Evaluation
Submissions are evaluated on the [accuracy](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html) between their predicted labels and the ground-truth labels. In other words, submissions are ranked by the fraction of predicted labels that exactly match the ground-truth labels.

In this competition, every ground-truth label is an integer between 0 and 999, inclusive.

Submitting
You must submit to this competition using the provided Python evaluation API, which serves test set instances one-by-one in random order. To use the API, follow the template in [this notebook](https://www.kaggle.com/code/ryanholbrook/aimo-submission-example).
''' # Manually define the goal for the agent based on Kaggle competition description

# One log directory per run, named date__task__agent__llm__version
run_log_dir = f'logs/{date.today().strftime("%Y-%m-%d")}__{TASK}__{AGENT_NAME}__{MODEL}__v{VERSION}'

# Collect the settings in the order the environment should receive them,
# then wrap them in a namespace and build the environment from it.
env_args = SimpleNamespace(
    task=TASK,
    task_type='kaggle',
    log_dir=run_log_dir,
    work_dir='workspace',
    llm_name=MODEL,
    research_problem=RESEARCH_PROBLEM,
)
env = Environment(env_args)

--- Initializing environment ---
--- Environment args ---
 namespace(task='ai-mathematical-olympiad-prize', task_type='kaggle', log_dir='logs/2024-05-05__ai-mathematical-olympiad-prize__planner__gpt-3.5-turbo-0125__v0', work_dir='workspace', llm_name='gpt-3.5-turbo-0125', research_problem="\nOverview\nThe goal of this competition is to create algorithms and models that can solve tricky math problems written in LaTeX format. Your participation will help to advance AI models’ mathematical reasoning skills and drive frontier knowledge.\n\nDescription\nThe ability to reason mathematically is a critical milestone for AI. Mathematical reasoning is the foundation for solving many complex problems, from engineering marvels to intricate financial models. However, current AI capabilities are limited in this area.\n\nThe AI Mathematical Olympiad (AIMO) Prize is a new $10mn prize fund to spur the open development of AI models capable of performing as well as top human participants in the Internati

In [4]:
class PlannerAgent(Agent):
    '''Agent that creates a high level plan of actions to take, and then a human executes the step of the plan and the planner updates the plan accordingly.

    The conversation is kept in ``self.history`` as a list of
    ``{"role": ..., "content": ...}`` dicts. It is written to ``history.json``
    inside ``self.log_dir`` after every completion, and ``run()`` reloads that
    file when it exists so an interrupted session can be resumed.

    NOTE(review): ``research_problem``, ``log_dir``, ``main_log_path``, ``model``
    and ``query`` are not defined in this class; presumably the ``Agent`` base
    class provides them -- confirm against ``Agent``.
    '''

    # Heading printed before each replayed message, keyed by the message role.
    _ROLE_HEADINGS = {
        "system": "SYSTEM_PROMPT:",
        "user": "OBSERVATION:",
        "assistant": "REASONING AND PLAN:",
    }

    def __init__(self, env):
        '''Initialise the base agent with ``env`` and build the two prompt templates.'''
        super().__init__(env)
        self.env = env

        # Log a history of messages to use as context for the agent (like in swe-agent)
        self.system_template_planner = '''SETTING: You are an autonomous programmer trying to solve a machine learning task as effectively as possible.

RESPONSE FORMAT:
You need to format your output using two fields; discussion and plan.
Your output should always include _one_ discussion and _one_plan field EXACTLY as in the following example:
DISCUSSION
<insert your discussion here>
PLAN
1. <insert step 1>
2. <insert step 2>
3. <insert step 3>
...

You should include a full plan in the plan section and then wait for an executor observation response before continuing with more discussion and plans. Everything you include in the DISCUSSION section will be saved for future reference.
'''
        # First user message of a fresh conversation. It previously said
        # "without our workspace", which contradicted the instructions below
        # ("You're already in a workspace"); it now says "within".
        self.instance_template_planner = f'''We're currently solving the following task within our workspace. Here's the task text:
TASK:
{self.research_problem}

INSTRUCTIONS:
Now, you're going to solve this issue on your own. You're already in a workspace with the inital task files downloaded. You can plan on using any actions for the executor to help you, but you may need to adjust your actions and granularity of your plan based on the executor's observation and quality of their execution.

Observation: None
'''

    def _history_path(self):
        '''Return the path of the JSON file the conversation is saved to.'''
        return os.path.join(self.log_dir, 'history.json')

    def _load_saved_history(self):
        '''Return the saved conversation, or None when there is nothing to resume.

        A missing file and a file holding an empty list are both treated as
        "nothing to resume". Malformed JSON still raises, so a damaged history
        is never silently discarded and then overwritten.
        '''
        history_path = self._history_path()
        if not os.path.exists(history_path):
            return None
        with open(history_path, "r", encoding="utf-8") as f:
            saved_history = json.load(f)
        return saved_history or None

    def _save_history(self):
        '''Write the whole conversation to history.json, replacing the previous copy.'''
        with open(self._history_path(), "w", encoding="utf-8") as f:
            f.write(json.dumps(self.history, indent=4))

    def _append_to_log(self, *chunks):
        '''Append the given text chunks, in order, to the main log file.

        The file is opened as UTF-8 explicitly: model output may contain
        characters that the platform default encoding cannot represent, which
        would otherwise raise UnicodeEncodeError in the middle of a step.
        '''
        with open(self.main_log_path, "a", buffering=1, encoding="utf-8") as log_file:
            for chunk in chunks:
                log_file.write(chunk)

    def run(self, max_steps=100):
        '''Run the interactive plan / observe loop.

        Each iteration sends one user message to the model (the task text on a
        fresh conversation, otherwise the observation typed by the human
        executor), then logs and saves the model's reply.

        Args:
            max_steps: Maximum number of model queries in this call. Defaults
                to 100, the value that used to be hard-coded.

        Returns:
            None. The loop ends after ``max_steps`` iterations, or earlier when
            the executor enters "exit" or an empty line; whitespace around the
            input is ignored for that check.
        '''
        print("--- Starting to run Planner Agent ---")

        # Load saved history in if it exists
        saved_history = self._load_saved_history()
        if saved_history:
            self.history = saved_history
            # Replay the stored conversation so the executor sees the context
            for msg in self.history:
                heading = self._ROLE_HEADINGS.get(msg['role'])
                if heading is not None:
                    print(heading)
                print(msg['content'])
        else:
            self.history = [{"role": "system", "content": self.system_template_planner}]

        for _ in range(max_steps):
            # Select the observation template based on what prior observation was
            if self.history[-1]["role"] == "system":
                # Fresh conversation: log the system prompt and send the task text
                print(f"SYSTEM PROMPT: \n{self.system_template_planner}")
                self._append_to_log("SYSTEM PROMPT: ", self.system_template_planner, "\n")

                observation_template = self.instance_template_planner
            else:
                # Get the observation from the human executor
                observation = input("Please execute step 1 of the plan and report back your observation (type exit to exit): ")
                if observation.strip() in ("", "exit"):
                    break # gracefully break while preserving history

                observation_template = f"OBSERVATION: \n{observation}"
            self.history.append({"role": "user", "content": observation_template})
            completion = self.query(self.history, self.model)

            # Log completion and history
            self._append_to_log(
                "\nOBSERVATION: \n",
                observation_template,
                "\n",
                "\nREASONING AND PLAN: \n",
                completion,
                "\n",
            )
            self.history.append({"role": "assistant", "content": completion})
            self._save_history()

            print(f"\nOBSERVATION: \n{observation_template}")
            print(f"\nREASONING AND PLAN: \n{completion}")

# Build the planner agent on top of the environment created in the earlier cell
agent = PlannerAgent(env)

In [5]:
# Start the interactive planning loop. After the first model reply it waits on
# input() for the executor's observation; enter "exit" or an empty line to stop.
agent.run()

--- Starting to run Planner Agent ---
SYSTEM_PROMPT:
SETTING: You are an autonomous programmer trying to solve a machine learning task as effectively as possible.

RESPONSE FORMAT:
You need to format your output using two fields; discussion and plan.
Your output should always include _one_ discussion and _one_plan field EXACTLY as in the following example:
DISCUSSION
<insert your discussion here>
PLAN
1. <insert step 1>
2. <insert step 2>
3. <insert step 3>
...

You should include a full plan in the plan section and then wait for an executor observation response before continuing with more discussion and plans. Everything you include in the DISCUSSION section will be saved for future reference.

OBSERVATION:
We're currently solving the following task without our workspace. Here's the task text:
TASK:

Overview
The goal of this competition is to create algorithms and models that can solve tricky math problems written in LaTeX format. Your participation will help to advance AI models’ ma