In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import yaml
import math
import warnings
from openai import OpenAI
from groq import Groq
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [2]:
# Credentials are kept out of the notebook: they are read from a config.yaml
# located in the current working directory.
config_path = os.path.join(os.getcwd(), "config.yaml")
with open(config_path, "r") as file:
    config = yaml.safe_load(file)

# Both OpenAI settings come from the "openai" section of the parsed config.
openai_settings = config["openai"]
openai_api_key = openai_settings["api_key"]
openai_organization = openai_settings["organization"]

In [3]:
# Module-level client used by get_response_openai.
client_openai = OpenAI(
    organization=openai_organization,
    api_key=openai_api_key,
)

def get_response_openai(model, prompt, temperature=0.0, top_p=0.95, max_tokens=2048):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        model: Name of the chat model to call.
        prompt: Text placed in the one user-role message of the request.
        temperature: Sampling temperature forwarded to the API.
        top_p: `top_p` value forwarded to the API.
        max_tokens: `max_tokens` value forwarded to the API.

    Returns:
        The `message.content` of the first choice in the completion.
    """
    request_kwargs = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
    }
    completion = client_openai.chat.completions.create(**request_kwargs)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Input rows for the chain-of-thought generation, read from a parquet file.
source_path = "./adv_data/new_generated_data_4o-mini_all.parquet"
df = pd.read_parquet(source_path)
print(df.shape)
df.head()

In [None]:
# Empty result table: the fields copied from the input rows, the model name,
# and the prompt / extracted reply pair produced for each row.
cot_columns = [
    "QuestionText",
    "CorrectAnswer",
    "IncorrectAnswer",
    "MisconceptionId",
    "MisconceptionName",
    "UsedModel",
    "cot_input",
    "cot_output",
]
new_data_cot = pd.DataFrame(columns=cot_columns)
new_data_cot.head()

In [6]:
# Prompt template for the chain-of-thought generation.
# The {{QuestionText}}, {{CorrectAnswerText}}, {{IncorrectAnswerText}} and
# {{MisconceptionName}} placeholders are filled in per row with str.replace by
# the generation loop. The template asks the model to wrap its reply in
# <analyze> tags, which the loop relies on to extract the analysis text.
cot_prompt = """You are an excellent teacher with a keen ability to understand students' thought processes and identify misconceptions. Your task is to analyze a student's incorrect answer to a question and explain the reasoning behind their mistake.

You will be given the following information:
<question>{{QuestionText}}</question>
<correct_answer>{{CorrectAnswerText}}</correct_answer>
<student_answer>{{IncorrectAnswerText}}</student_answer>

Your job is to carefully analyze the student's answer and explain why they might have given this incorrect response. Compare the student's answer with the correct answer to identify the exact error(s) made.

Provide your analysis within <analyze> tags. In your analysis:
1. Break down the student's thought process step by step.
2. Explain how the student's approach differs from the correct method.
3. Suggest what concepts or skills the student might be struggling with.
4. Control the length of your analysis to be within 300 words.

Be thorough and empathetic in your analysis, considering various possible reasons for the student's mistake. Remember to view the problem from the student's perspective.

After your detailed analysis, conclude with the following sentence:
"So the misconception the student made is: {{MisconceptionName}}"

Your entire response should be structured as follows:

<analyze>
[Your detailed analysis here]

So the misconception the student made is: {{MisconceptionName}}
</analyze>"""

In [7]:
# Optional smoke test: uncomment to run the generation loop on only the first 10 rows.
# df = df[:10]

In [None]:
model = "gpt-4o-mini"

# Compiled once up front instead of being looked up on every iteration.
_ANALYZE_RE = re.compile(r"<analyze>(.*?)</analyze>", re.DOTALL)


def _extract_analysis(raw_response):
    """Return the text inside the first <analyze>...</analyze> pair, stripped.

    A reply is not guaranteed to contain a complete tag pair (it can be cut off
    by the max_tokens limit or ignore the requested format), and it can be None.
    Instead of raising and aborting the whole run, this falls back to:
      * "" when the reply is None or empty;
      * the whole reply with any unmatched <analyze> / </analyze> tag removed
        when no complete pair is found.
    """
    if not raw_response:
        return ""
    match = _ANALYZE_RE.search(raw_response)
    if match is not None:
        return match.group(1).strip()
    return raw_response.replace("<analyze>", "").replace("</analyze>", "").strip()


# Rows are gathered in a plain list and appended to new_data_cot with a single
# concat; concatenating inside the loop re-copies the whole frame per row.
cot_records = []
try:
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        # Fill the four template placeholders with this row's values.
        cot_input = (
            cot_prompt
            .replace("{{QuestionText}}", row["QuestionText"])
            .replace("{{CorrectAnswerText}}", row["CorrectAnswer"])
            .replace("{{IncorrectAnswerText}}", row["IncorrectAnswer"])
            .replace("{{MisconceptionName}}", row["MisconceptionName"])
        )
        response = _extract_analysis(get_response_openai(model=model, prompt=cot_input))
        cot_records.append({
            "QuestionText": row["QuestionText"],
            "CorrectAnswer": row["CorrectAnswer"],
            "IncorrectAnswer": row["IncorrectAnswer"],
            "MisconceptionId": row["MisconceptionId"],
            "MisconceptionName": row["MisconceptionName"],
            "UsedModel": model,
            "cot_input": cot_input,
            "cot_output": response
        })
finally:
    # Runs on success and on failure (e.g. an API error part-way through), so
    # rows produced before an exception still end up in new_data_cot.
    if cot_records:
        new_data_cot = pd.concat([new_data_cot, pd.DataFrame(cot_records)], ignore_index=True)

In [None]:
# Count the distinct values in the cot_output column.
distinct_cot_outputs = new_data_cot["cot_output"].unique()
len(distinct_cot_outputs)

In [None]:
# (rows, columns) of the collected results.
new_data_cot.shape

In [None]:
# Preview the first rows of the collected results.
new_data_cot.head()

In [None]:
# Inspect one complete cot_output value (key 0).
cot_outputs = new_data_cot["cot_output"]
cot_outputs[0]

In [11]:
# Persist the collected results; the DataFrame index is not written.
output_path = "./adv_data/new_generated_data_4o-mini_all_cot.parquet"
new_data_cot.to_parquet(output_path, index=False)