In [None]:
# Imports and Setup

import asyncio
import logging
import os
from pathlib import Path

import weave
from dotenv import load_dotenv
from nest_asyncio import apply

from agent import zero_shot_solver, rag_solver, rag_solver_with_reflection
from retriever import Retriever
from utils import Problem, FAST_LLM, STRONG_LLM
from utils import check_correctness, load_problem

In [None]:
# Configure INFO-level logging so progress is visible while the notebook runs
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s : %(levelname)s : %(message)s",
)

logger = logging.getLogger(__name__)

# Apply the nest_asyncio patch (imported above as `apply`)
apply()
# Read environment variables from a local `.env` file.
# The `OPENAI_API_KEY` and `WANDB_API_KEY` variables may need to be set there
# or in your shell environment.
load_dotenv()


In [None]:
# Weave Setup
#
# The project name is read from the `WEAVE_PROJECT` environment variable when it
# is set to a non-empty value (for example through the `.env` file loaded
# earlier in this notebook); otherwise the default below is used.

WEAVE_PROJECT = os.environ.get("WEAVE_PROJECT") or "parambharat/hackercup"  # REPLACE WITH YOUR PROJECT NAME
weave_client = weave.init(WEAVE_PROJECT)

In [None]:
# Quick sanity check that code evaluation works.
# The same helper is used to check the programs that the agents generate.

timeout = 2
input_data = ""
program_code = "print('hello, world!')"
expected_output = "hello, world!"

# Example 1: the program prints exactly the expected output
test_result = check_correctness(program_code, input_data, expected_output, timeout)
print("Example 1: ", test_result)

# Example 2: the program output differs from the expected output
test_result = check_correctness("print('goodbye')", input_data, "hi there", timeout)
print("Example 2: ", test_result)

In [None]:
# NOTE: DOWNLOAD THE DATASET BEFORE RUNNING THIS
# See README.MD for details or
# check out `starter-kits/submit_first_solution/download.py`


dataset_dir = Path("data/dataset/2023/practice")

# Problems are discovered through their `*.in` files; the file stem is the
# problem name. The names are materialised as a sorted list so that the order is
# deterministic and the variable can be reused after this cell has run.
problem_names = sorted(path.stem for path in dataset_dir.rglob("*.in"))
problems = [load_problem(name, dataset_dir) for name in problem_names]

# Fail early with a clear message instead of evaluating an empty dataset later
assert problems, f"No problems found under {dataset_dir}; download the dataset first"

In [None]:
# Build a simple weave evaluation for the agents.
# Even a small evaluation like this one becomes very powerful and scales to
# much more complex workflows.
# The agent workflow already runs the generated code, evaluates the solution
# against the expected output of the sample test cases, and returns the report
# in the model output.
# A solved problem is expected to have `test_report` equal to "passed" in the
# agent output, and that is what the evaluation checks.


# Evaluation dataset: one row per problem, paired with the expected report
examples = [dict(problem=problem, expected="passed") for problem in problems]


# This is a simple scorer that checks if the agent output has passed the test
@weave.op
def scorer(expected: str, model_output: dict) -> dict:
    """Score one agent output against the expected test report.

    Args:
        expected: The report value that counts as success.
        model_output: The agent output; its `test_report` entry is compared
            with `expected`. May be `None` when no output was produced.

    Returns:
        A dict with a single boolean entry `passed`.
    """
    # Without an output there is nothing to subscript; score it as not passed
    # instead of raising a TypeError from inside the scorer.
    if model_output is None:
        return {'passed': False}
    return {'passed': expected == model_output['test_report']}


# Bundle the dataset and the scorer into a weave evaluation that checks whether
# the agent output has passed the test.
# NOTE: the name `eval` shadows Python's built-in `eval` from this cell onwards.
eval = weave.Evaluation(dataset=examples, scorers=[scorer])

In [None]:
# A simple agent that uses the zero-shot solver
# Nothing fancy here, just a model that takes in a problem and returns a solution

class ZeroshotAgent(weave.Model):
    """Weave model that solves a problem with `zero_shot_solver`.

    Attributes:
        model: Name of the LLM forwarded to the solver (defaults to `FAST_LLM`).
        temperature: Sampling temperature forwarded to the solver.
        timeout: Timeout forwarded to the solver.
    """

    model: str = FAST_LLM
    temperature: float = 0.0
    timeout: int = 30

    @weave.op
    async def predict(self, problem: Problem):
        """Solve one problem and return the solver's output.

        Args:
            problem: Either a `Problem` instance or a mapping of `Problem`
                fields.

        Returns:
            Whatever `zero_shot_solver` returns for the problem.
        """
        # A ready-made `Problem` is used as-is; anything else is treated as a
        # mapping of fields and unpacked, exactly as before.
        if not isinstance(problem, Problem):
            problem = Problem(**problem)
        return await zero_shot_solver(
            problem,
            model=self.model,
            temperature=self.temperature,
            timeout=self.timeout)



In [None]:
# Uncomment the following line to run the zero shot agent directly
# sample_zeroshot_result = await zero_shot_solver(problems[0], timeout=30)

In [None]:
# Evaluate the zero shot agent for all the models and temperatures

eval_models = [FAST_LLM, STRONG_LLM]
eval_temperatures = [0.0, 0.5, 1.0]
tasks = []
for LLM in eval_models:
    for temperature in eval_temperatures:
        zeroshot_agent = ZeroshotAgent(model=LLM, temperature=temperature, timeout=30)
        zeroshot_results = eval.evaluate(zeroshot_agent)
        tasks.append(zeroshot_results)

# Phew that's 2(models)*3(temps)*5(problems) = 30 evaluations

zeroshot_results = await asyncio.gather(*tasks)

In [None]:
# The RAG agents use the same evaluation framework,
# but the retriever has to be loaded first.
# You might need to process the data and create the retriever beforehand;
# check out `starter-kits/rag/retriever.py` for more details.
# A `weave.dataset` for the retriever will be shared soon, and
# you will be able to use that to load the retriever instead.


logger.info("Loading retriever ... this may take a while ...")
retriever = Retriever.load("data/cache/retriever")

In [None]:
# A RAG agent is a model that takes in a problem and returns a solution,
# using the retriever to retrieve similar problems and their solutions
# and then using the model to generate a new solution

class RAGAgent(weave.Model):
    """Weave model that solves a problem with `rag_solver`.

    Attributes:
        retriever: Retriever forwarded to the solver.
        model: Name of the LLM forwarded to the solver (defaults to `FAST_LLM`).
        temperature: Sampling temperature forwarded to the solver.
        timeout: Timeout forwarded to the solver.
    """

    retriever: Retriever
    model: str = FAST_LLM
    temperature: float = 0.0
    timeout: int = 30

    @weave.op
    async def predict(self, problem: Problem):
        """Solve one problem and return the solver's output.

        Args:
            problem: Either a `Problem` instance or a mapping of `Problem`
                fields.

        Returns:
            Whatever `rag_solver` returns for the problem.
        """
        # A ready-made `Problem` is used as-is; anything else is treated as a
        # mapping of fields and unpacked, exactly as before.
        if not isinstance(problem, Problem):
            problem = Problem(**problem)
        return await rag_solver(
            retriever=self.retriever,
            problem=problem,
            model=self.model,
            temperature=self.temperature,
            timeout=self.timeout,
        )

In [None]:
# Uncomment the following line to run the RAG agent directly
# sample_rag_result = await rag_solver(retriever, problems[0], timeout=30)

In [None]:
# Evaluate the RAG agent for all the models and temperatures

tasks = []
for LLM in eval_models:
    for temperature in eval_temperatures:
        rag_agent = RAGAgent(retriever=retriever, model=LLM, temperature=temperature, timeout=30)
        rag_results = eval.evaluate(rag_agent)
        tasks.append(rag_results)

# Again, 30 evals for the RAG agent with different models and temperatures

rag_results = await asyncio.gather(*tasks)

In [None]:
# Now, a more complex agent that uses reflection
# This agent will try to solve the problem using the retriever
# and if it fails, it will ask the model to reflect on the problem
# and then re-work the solution,
# repeating this process until the solution is correct
# or the iteration limit is reached

class RAGReflectionAgent(weave.Model):
    """Weave model that solves a problem with `rag_solver_with_reflection`.

    Attributes:
        retriever: Retriever forwarded to the solver.
        max_iterations: Iteration limit forwarded to the solver.
        timeout: Timeout forwarded to the solver.
        model: Name of the LLM forwarded to the solver (defaults to
            `STRONG_LLM`).
        temperature: Sampling temperature forwarded to the solver.
    """

    retriever: Retriever
    max_iterations: int = 2
    timeout: int = 30
    model: str = STRONG_LLM
    temperature: float = 0.0

    @weave.op
    async def predict(self, problem: Problem):
        """Solve one problem and return the solver's output.

        Args:
            problem: Either a `Problem` instance or a mapping of `Problem`
                fields.

        Returns:
            Whatever `rag_solver_with_reflection` returns for the problem.
        """
        # A ready-made `Problem` is used as-is; anything else is treated as a
        # mapping of fields and unpacked, exactly as before.
        if not isinstance(problem, Problem):
            problem = Problem(**problem)
        return await rag_solver_with_reflection(
            self.retriever,
            problem,
            model=self.model,
            temperature=self.temperature,
            max_iterations=self.max_iterations,
            timeout=self.timeout)

In [None]:
# Uncomment the following line to run the RAG reflection agent directly
# sample_rag_reflection_result = await rag_solver_with_reflection(retriever, problems[0], max_iterations=2, timeout=30)

In [None]:
# Evaluate the RAG reflection agent for all the models and temperatures
tasks = []
for LLM in eval_models:
    for temperature in eval_temperatures:
        rag_reflection_agent = RAGReflectionAgent(
            retriever=retriever, model=LLM,
            temperature=temperature, timeout=30)
        rag_reflection_results = eval.evaluate(rag_reflection_agent)
        tasks.append(rag_reflection_results)
rag_reflection_results = await asyncio.gather(*tasks)