In [1]:
import os 
import re

import pandas as pd
from tqdm import tqdm
from IPython.display import Latex

In [2]:
import minsearch

In [3]:
DATAPATH = "data"

# Data

In [4]:
train = pd.read_csv(os.path.join(DATAPATH, "raw/train.csv"), index_col="problem_id")
test = pd.read_csv(os.path.join(DATAPATH, "raw/test.csv"), index_col="problem_id")
submission = pd.read_csv(os.path.join(DATAPATH, "raw/sample_submission.csv"))

In [5]:
train.to_dict("records")[0]

{'problem_text': 'Find the value of the expression $\\dfrac{17}{5} :\\dfrac{34}{3} +1.3$.',
 'answer': '1.6'}

# Index

In [6]:
index = minsearch.Index(
    text_fields=["problem_text", "answer"],
    keyword_fields=[""]
)

In [7]:
index.fit(train.to_dict("records"));

# Query

In [8]:
query = test.iloc[0]["problem_text"]

In [9]:
Latex(query)

<IPython.core.display.Latex object>

In [10]:
boost = {'problem_text': 1.0}

results = index.search(
    query=query,
    boost_dict=boost,
    num_results=5
)

In [11]:
for top_index, text in enumerate(results):
    print(f"Problem #{top_index}:")
    display(Latex(text["problem_text"]))
    print("\n")

Problem #0:


<IPython.core.display.Latex object>



Problem #1:


<IPython.core.display.Latex object>



Problem #2:


<IPython.core.display.Latex object>



Problem #3:


<IPython.core.display.Latex object>



Problem #4:


<IPython.core.display.Latex object>





# LLM

In [12]:
from openai import OpenAI

In [13]:
client = OpenAI(api_key=os.environ["OPENAI_KEY"])

# Promt

In [14]:
def build_prompt(query, search_results):
    example = "" 
    for doc in search_results:
        example = example + f"problem: {doc['problem_text']}\nanswer: {doc['answer']}\n\n"

    prompt_template = f"""Role:
You are an advanced AI system with exceptional mathematical reasoning and problem-solving capabilities, specifically designed to solve tricky math problems (whose answer is a non-negative integer) written in LaTeX format from the AI Mathematical Olympiad (AIMO) competition. Your task is to accurately analyze and solve intricate mathematical problems, demonstrating a deep understanding of mathematical concepts and a strong ability to apply logical reasoning strategies.

Instruction:
1. Carefully read and comprehend the problem statement provided in the "Problem" section.
2. In the "Similar problem" section, scrutinise similar tasks and the correct answers. These examples will help you to check your approach to solving the problem.
3. In the "Solution" section, provide a solution of the problem with detailed explanation of your logical reasoning process. Keep in mind that answer must be a non-negative integer number.
4. Check that your approach to solving the problem will help you solve similar problem. 
5. If the answers of your approach to similar problems do not agree, find the error in your solution and correct your solution.
6. At the end, create a "Answer" section where you will state only the final numerical or algebraic answer, without any additional text or narrative.

Problem:
{query}

Similar Tasks:
{example}

Solution:
...

Answer:
...
Step-by-step solution and final answer:"""
    return prompt_template

In [15]:
results = index.search(
    query=query,
    boost_dict=boost,
    num_results=5
)

In [16]:
promt = build_prompt(query, results)

In [17]:
Latex(promt)

<IPython.core.display.Latex object>

In [18]:
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role": "user", "content": promt}]
)

In [19]:
answer = response.choices[0].message.content

In [20]:
def extract_numerical_answer(text):
    # Look for patterns like "Final answer: X" or "The answer is X" at the end of the text
    match = re.search(r'(?:final answer|the answer is)[:\s]*([+-]?\d*\.?\d+)', text, re.IGNORECASE)
    if match:
        return float(match.group(1))
    else:
        # If no clear final answer, look for the last number in the text
        numbers = re.findall(r'[+-]?\d*\.?\d+', text)
        return float(numbers[-1]) if numbers else 1.0

In [21]:
Latex(answer)

<IPython.core.display.Latex object>

In [22]:
extract_numerical_answer(answer)

12.0

# Rag

In [23]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [24]:
def rag(query):
    similar_problem = index.search(
        query=query,
        boost_dict=boost,
        num_results=5
    )
    
    prompt = build_prompt(query, similar_problem[1:])
    response = llm(prompt)
    answer = extract_numerical_answer(response)
    return answer, response

In [25]:
Latex(query)

<IPython.core.display.Latex object>

In [26]:
answer, _ = rag(query)

In [27]:
answer

12.0

# Train processing

In [28]:
from concurrent.futures import ThreadPoolExecutor

In [29]:
pool = ThreadPoolExecutor(max_workers=3)

In [30]:
def parallel_execute(pool, tasks, function):
    answers = []
    responses = []

    with tqdm(total=len(tasks)) as progress:
        futures = []

        for task in tasks:
            query = task["problem_text"]
            future = pool.submit(function, query)
            future.add_done_callback(lambda p: progress.update())
            futures.append(future)

        for future in futures:
            answer, response = future.result()
            answers.append(answer)
            responses.append(response)

    return answers, responses

In [31]:
answers, responses = parallel_execute(pool, train.to_dict("records"), rag)

100%|██████████| 100/100 [04:48<00:00,  2.88s/it]
  2%|▏         | 2/100 [00:05<04:04,  2.49s/it]

In [None]:
train["predict"] = answers
train["response"] = responses

# Metrics

In [38]:
from sklearn.metrics import accuracy_score

In [39]:
def fix_suffix(value):
    if value.endswith('.0'):
        return value[:-2]
    else:
        return value

In [40]:
train["predict"] = train["predict"].astype(str).apply(fix_suffix)

In [41]:
accuracy_score(train["answer"], train["predict"])

0.79

# Test

In [49]:
import time

In [None]:
def rag(query):
    similar_problem = index.search(
        query=query,
        boost_dict=boost,
        num_results=5
    )
    
    prompt = build_prompt(query, similar_problem)
    response = llm(prompt)
    answer = extract_numerical_answer(response)
    return answer, response

In [51]:
answers_test = []
responses_test = []
for row in tqdm(test.to_dict("records")):
    query = row["problem_text"]
    time.sleep(30)
    answer, response = rag(query)
    answers_test.append(answer)
    responses_test.append(response)

100%|██████████| 100/100 [1:01:54<00:00, 37.14s/it]


In [59]:
test["predict"] = answers_test
test["response"] = responses_test

In [60]:
test["predict"] = test["predict"].astype(str).apply(fix_suffix)

In [61]:
submission = test.rename(columns={"predict": "answer"})["answer"].reset_index()

In [62]:
submission

Unnamed: 0,problem_id,answer
0,11919,12
1,8513,285
2,7887,4
3,5272,6
4,8295,13
...,...,...
95,3519,35
96,7934,12
97,9390,217000
98,7137,22


In [63]:
submission.to_csv("result/rag_baseline.csv", index=False)

# Save

In [64]:
train.to_csv("data/rag/train.csv", index=False)

In [65]:
test.to_csv("data/rag/test.csv", index=False)