In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import yaml
import math
import warnings
from openai import OpenAI
from groq import Groq
from tqdm import tqdm
# Silence every Python warning for the rest of the kernel session. This also
# hides deprecation notices from the imported libraries, so remove it when
# debugging unexpected behavior.
warnings.filterwarnings("ignore")

In [2]:
# Load API credentials from config.yaml in the current working directory so
# that no secret is hardcoded in the notebook. Expected layout:
#   openai:
#     api_key: ...
#     organization: ...   (optional)
config_path = os.path.join(os.getcwd(), "config.yaml")

# Read as UTF-8 explicitly; the default text encoding of open() depends on the
# platform locale, which makes the notebook behave differently across machines.
with open(config_path, "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

openai_api_key = config["openai"]["api_key"]
# A config without an organization yields None instead of raising KeyError;
# the value is only forwarded to the OpenAI client constructor.
openai_organization = config["openai"].get("organization")

In [3]:
# Single shared client; get_response_openai reads it as a module-level global.
client_openai = OpenAI(
    organization=openai_organization,
    api_key=openai_api_key,
)

def get_response_openai(model, prompt, temperature=0.7, top_p=0.95, max_tokens=2048):
    """Send ``prompt`` as one user message and return the reply text.

    Args:
        model: Model name forwarded to the chat completions call.
        prompt: Text placed in the single ``user`` message.
        temperature: Forwarded unchanged to the API call.
        top_p: Forwarded unchanged to the API call.
        max_tokens: Forwarded unchanged to the API call.

    Returns:
        The ``message.content`` of the first choice of the completion.
    """
    request_kwargs = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
    }
    # Uses the module-level client_openai created alongside this function.
    first_choice = client_openai.chat.completions.create(**request_kwargs).choices[0]
    return first_choice.message.content

In [None]:
# Read the misconception table; the generation loop later uses its
# MisconceptionId and MisconceptionName columns.
mapping_csv_path = "./comp_data/misconception_mapping.csv"
misconception_mapping = pd.read_csv(mapping_csv_path)
print(misconception_mapping.shape)
misconception_mapping.head()

In [5]:
# Prompt template sent to the generator model. The literal {{MISTAKE_TYPE}}
# placeholder is filled in with str.replace (not str.format), so the doubled
# braces are intentional. The generation loop parses the <question>,
# <correct_answer> and <incorrect_answer> tags out of the model's reply.
prompt = """You are tasked with designing a complex math-related question, its correct answer, and an incorrect answer that demonstrates a specific type of mistake. Follow these instructions carefully:

1. Create a complex math question suitable for high school or early college-level students. The question should be challenging but solvable with standard mathematical knowledge.

2. Determine the correct answer to your question. Generate your solution process in <correct_solution> tags, your solution must be correct, clear and logical.

3. Generate an incorrect answer that demonstrates the following type of mistake:
<mistake_type>
{{MISTAKE_TYPE}}
</mistake_type>

4. Format your output as follows:
   - Enclose the question within <question> tags
   - Enclose the correct answer within <correct_answer> tags (only the correct answer, do not contain the solution process)
   - Enclose the incorrect answer within <incorrect_answer> tags (only the incorrect answer)
   - Include a brief explanation in 50 words of how the incorrect answer demonstrates the specified mistake type within <explanation> tags

Remember to make the question sufficiently complex and ensure that the incorrect answer clearly demonstrates the specified mistake type. Be creative in your question design while maintaining mathematical accuracy and relevance."""

In [None]:
# Empty frame that accumulates one row per generated question.
output_columns = [
    "QuestionText",
    "CorrectAnswer",
    "IncorrectAnswer",
    "MisconceptionId",
    "MisconceptionName",
    "UsedModel",
    "input_prompt",
]
new_data = pd.DataFrame(columns=output_columns)
new_data.head()

In [7]:
# Keep only the first five misconceptions, which bounds the number of API
# calls made by the generation loop.
misconception_mapping = misconception_mapping.head(5)

In [None]:
def _extract_tag(text, tag):
    """Return the stripped content of the first ``<tag>...</tag>`` pair in ``text``.

    Args:
        text: Raw model reply.
        tag: Tag name without angle brackets, e.g. ``"question"``.

    Returns:
        The text between the opening and closing tag, whitespace-stripped.

    Raises:
        ValueError: If ``text`` is not a string or the tag pair is absent, so
            the caller can report exactly which section was missing.
    """
    if not isinstance(text, str):
        raise ValueError(f"model returned no text (got {type(text).__name__})")
    name = re.escape(tag)
    match = re.search(rf"<{name}>(.*?)</{name}>", text, re.DOTALL)
    if match is None:
        raise ValueError(f"response has no <{tag}>...</{tag}> section")
    return match.group(1).strip()


# (model, temperature, top_p) combinations queried for every misconception.
generation_configs = [("gpt-4o-mini", 0.0, 0.95)]

# Rows are gathered in a plain list and appended to new_data with one concat;
# concatenating inside the loop re-copies the whole frame for every row.
generated_rows = []
try:
    for idx, row in tqdm(misconception_mapping.iterrows(), total=len(misconception_mapping)):
        mistake_type = row["MisconceptionName"]
        input_prompt = prompt.replace("{{MISTAKE_TYPE}}", mistake_type)
        for model, temperature, top_p in generation_configs:
            # The try sits inside the config loop so that one failed call or
            # malformed reply skips only that attempt, not the remaining configs.
            try:
                response = get_response_openai(
                    model=model,
                    prompt=input_prompt,
                    temperature=temperature,
                    top_p=top_p,
                )
                generated_rows.append({
                    "QuestionText": _extract_tag(response, "question"),
                    "CorrectAnswer": _extract_tag(response, "correct_answer"),
                    "IncorrectAnswer": _extract_tag(response, "incorrect_answer"),
                    "MisconceptionId": row["MisconceptionId"],
                    "MisconceptionName": row["MisconceptionName"],
                    "UsedModel": model,
                    "input_prompt": input_prompt,
                })
            except Exception as e:
                print(f"Error for misconception {row['MisconceptionName']}: {e}")
finally:
    # Runs even when the loop is interrupted, so rows fetched so far are kept.
    if generated_rows:
        new_data = pd.concat([new_data, pd.DataFrame(generated_rows)], ignore_index=True)

In [None]:
# Number of distinct generated questions (missing values counted as a value).
new_data["QuestionText"].nunique(dropna=False)

In [None]:
# (rows, columns) of the generated data; compare the row count with the
# distinct-question count to spot duplicated questions.
new_data.shape

In [None]:
# Preview the first generated rows for a manual sanity check.
new_data.head()

In [12]:
# Persist the generated rows. The target folder is created first because
# DataFrame.to_csv does not create missing parent directories and would
# otherwise raise OSError.
output_csv_path = "./adv_data/new_generated_data.csv"
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
new_data.to_csv(output_csv_path, index=False)