In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import yaml
import math
import warnings
from openai import OpenAI
from groq import Groq
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [2]:
# Credentials are read from config.yaml in the current working directory so that
# no secret has to be written into the notebook itself.
config_path = os.path.join(os.getcwd(), "config.yaml")
with open(config_path, "r") as file:
    config = yaml.safe_load(file)

# The file must provide an "openai" section (api_key, organization) and a
# "groq" section (api_key); a missing key raises KeyError right here.
openai_settings = config["openai"]
openai_api_key, openai_organization = (
    openai_settings["api_key"],
    openai_settings["organization"],
)
groq_api_key = config["groq"]["api_key"]

In [3]:
# One API client per provider, created once and reused for every request
# issued by get_response_openai / get_response_groq.
client_openai = OpenAI(
    api_key=openai_api_key,
    organization=openai_organization,
)
client_groq = Groq(
    api_key=groq_api_key,
)

def _chat_completion(client, model, prompt, temperature, top_p, max_tokens):
    """Send ``prompt`` as a single user message and return the first choice's text.

    Shared by ``get_response_openai`` and ``get_response_groq``; both SDK
    clients are called through the same ``chat.completions.create`` method,
    so the request logic lives in one place.

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        model: Model identifier passed through to the API.
        prompt: Text sent as the only (user) message of the conversation.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The ``message.content`` of the first returned choice.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )
    return completion.choices[0].message.content


def get_response_openai(model, prompt, temperature=0.7, top_p=0.95, max_tokens=1024, client=None):
    """Query an OpenAI chat model with a single user prompt.

    Args:
        model: OpenAI model name.
        prompt: Text sent as the user message.
        temperature: Sampling temperature (default 0.7).
        top_p: Nucleus-sampling probability mass (default 0.95).
        max_tokens: Maximum number of generated tokens (default 1024).
        client: Optional client to use instead of the module-level
            ``client_openai`` (useful for tests or alternative credentials).

    Returns:
        The text content of the first completion choice.
    """
    if client is None:
        # Resolved at call time so the notebook-level client stays the default.
        client = client_openai
    return _chat_completion(client, model, prompt, temperature, top_p, max_tokens)


def get_response_groq(model, prompt, temperature=0.7, top_p=0.95, max_tokens=1024, client=None):
    """Query a Groq-hosted chat model with a single user prompt.

    Args:
        model: Groq model name.
        prompt: Text sent as the user message.
        temperature: Sampling temperature (default 0.7).
        top_p: Nucleus-sampling probability mass (default 0.95).
        max_tokens: Maximum number of generated tokens (default 1024).
        client: Optional client to use instead of the module-level
            ``client_groq`` (useful for tests or alternative credentials).

    Returns:
        The text content of the first completion choice.
    """
    if client is None:
        # Resolved at call time so the notebook-level client stays the default.
        client = client_groq
    return _chat_completion(client, model, prompt, temperature, top_p, max_tokens)

In [None]:
# Question table. The generation loops read these columns from it: QuestionId,
# ConstructId/ConstructName, SubjectId/SubjectName, QuestionText, CorrectAnswer,
# Answer{A-D}Text and Misconception{A-D}Id.
train = pd.read_csv("./comp_data/train.csv")
train.head()

In [None]:
# Lookup table used to translate a MisconceptionId into its MisconceptionName.
misconception_mapping = pd.read_csv("./comp_data/misconception_mapping.csv")
misconception_mapping.head()

In [None]:
# Size check: (number of rows, number of columns) of the question table.
train.shape

In [None]:
# Number of distinct ConstructId values present in the question table.
len(train["ConstructId"].unique())

In [None]:
# Schema of the generated chain-of-thought dataset: one row per
# (question, wrong answer) pair, holding the prompt sent to the model
# (cot_input) and the extracted analysis it returned (cot_inference).
cot_columns = [
    "QuestionId",
    "ConstructId",
    "ConstructName",
    "SubjectId",
    "SubjectName",
    "QuestionText",
    "CorrectAnswer",
    "AnswerText",
    "IsCorrect",
    "MisconceptionId",
    "MisconceptionName",
    "cot_input",
    "cot_inference",
]
cot_data = pd.DataFrame(columns=cot_columns)
cot_data.head()

In [9]:
def lowercase_first_letter(text):
    """Return ``text`` with only its first character converted to lower case.

    Falsy input (empty string, ``None``) is handed back unchanged. Used so a
    construct name reads naturally in the middle of a prompt sentence.
    """
    if text:
        first_char, remainder = text[0], text[1:]
        return first_char.lower() + remainder
    return text

In [10]:
# Prompt template for the chain-of-thought generation loops. The placeholders
# {{QuestionText}}, {{CorrectAnswerText}}, {{IncorrectAnswerText}},
# {{ConstructName}} and {{MisconceptionName}} are filled in with str.replace,
# and the model's reply is parsed by extracting the text between the
# <analyze> ... </analyze> tags that the template asks for.
cot_prompt = """You are an excellent teacher with a keen ability to understand students' thought processes and identify misconceptions. Your task is to analyze a student's incorrect answer to a question and explain the reasoning behind their mistake.

You will be given the following information:
<question>{{QuestionText}}</question>
<correct_answer>{{CorrectAnswerText}}</correct_answer>
<student_answer>{{IncorrectAnswerText}}</student_answer>

This question requires the student to {{ConstructName}}.

Your job is to carefully analyze the student's answer and explain why they might have given this incorrect response. Compare the student's answer with the correct answer to identify the exact error(s) made.

Provide your analysis within <analyze> tags. In your analysis:
1. Break down the student's thought process step by step.
2. Explain how the student's approach differs from the correct method.
3. Suggest what concepts or skills the student might be struggling with.
4. Control the length of your analysis to be within 300 words.

Be thorough and empathetic in your analysis, considering various possible reasons for the student's mistake. Remember to view the problem from the student's perspective.

After your detailed analysis, conclude with the following sentence:
"So the misconception the student made is: {{MisconceptionName}}"

Your entire response should be structured as follows:

<analyze>
[Your detailed analysis here]

So the misconception the student made is: {{MisconceptionName}}
</analyze>"""

In [None]:
# Sanity check: show the Python type of the MisconceptionName looked up for
# MisconceptionId 1 (the prompt-building code passes this value to str.replace).
type(misconception_mapping[misconception_mapping["MisconceptionId"] == 1]["MisconceptionName"].values[0])

In [12]:
# train = train[:]

#### Use OpenAI Models

In [None]:
# Generate chain-of-thought explanations with an OpenAI model.
#
# For every question, each wrong answer option (A-D, excluding the correct one)
# that has a labelled misconception produces one prompt, one model call and one
# row appended to ``cot_data``.

OPENAI_COT_MODEL = "gpt-4o-mini"
ANSWER_LETTERS = ("A", "B", "C", "D")
ANALYZE_PATTERN = re.compile(r"<analyze>(.*?)</analyze>", re.DOTALL)


def build_cot_record(row, wrong_letter, misconception_name, get_response, model):
    """Build one chain-of-thought record for a single wrong answer option.

    Args:
        row: One row of ``train`` (question text, answer texts, ids).
        wrong_letter: Letter ("A"-"D") of the incorrect option to explain.
        misconception_name: Name of the misconception labelled for that option.
        get_response: Callable invoked as ``get_response(model=..., prompt=...)``
            that returns the model's reply text.
        model: Model identifier forwarded to ``get_response``.

    Returns:
        dict keyed by the ``cot_data`` column names.

    Raises:
        ValueError: If the reply contains no ``<analyze>...</analyze>`` block
            (for example because the generation was cut off by ``max_tokens``).
    """
    correct_text = row[f"Answer{row['CorrectAnswer']}Text"]
    wrong_text = row[f"Answer{wrong_letter}Text"]

    # Placeholders are substituted in a fixed order; each filled-in value is
    # plain text, so later replacements only touch the remaining placeholders
    # unless the data itself happens to contain a "{{...}}" token.
    cot_input = (
        cot_prompt
        .replace("{{ConstructName}}", lowercase_first_letter(row["ConstructName"]))
        .replace("{{QuestionText}}", row["QuestionText"])
        .replace("{{CorrectAnswerText}}", correct_text)
        .replace("{{IncorrectAnswerText}}", wrong_text)
        .replace("{{MisconceptionName}}", misconception_name)
    )

    raw_response = get_response(model=model, prompt=cot_input)
    # ``raw_response`` may be None when the API returns no text content.
    match = ANALYZE_PATTERN.search(raw_response or "")
    if match is None:
        raise ValueError("model response contains no <analyze>...</analyze> block")

    return {
        "QuestionId": row["QuestionId"],
        "ConstructId": row["ConstructId"],
        "ConstructName": row["ConstructName"],
        "SubjectId": row["SubjectId"],
        "SubjectName": row["SubjectName"],
        "QuestionText": row["QuestionText"],
        "CorrectAnswer": correct_text,
        "AnswerText": wrong_text,
        "IsCorrect": 0,
        "MisconceptionId": row[f"Misconception{wrong_letter}Id"],
        "MisconceptionName": misconception_name,
        "cot_input": cot_input,
        "cot_inference": match.group(1).strip(),
    }


# MisconceptionId -> MisconceptionName, built once instead of filtering the
# mapping frame for every prompt. drop_duplicates keeps the first occurrence,
# which matches the previous ``.values[0]`` lookup.
misconception_names = (
    misconception_mapping
    .drop_duplicates("MisconceptionId")
    .set_index("MisconceptionId")["MisconceptionName"]
    .to_dict()
)

# Records are collected in a list and concatenated once: growing a DataFrame
# with pd.concat inside the loop copies the whole frame on every iteration.
cot_records = []
try:
    for _, row in tqdm(train.iterrows(), total=len(train)):
        correct_letter = row["CorrectAnswer"]
        if correct_letter not in ANSWER_LETTERS:
            continue  # unknown answer key: nothing to generate for this question

        for letter in ANSWER_LETTERS:
            if letter == correct_letter:
                continue
            # Errors are handled per answer option so that one failed request
            # does not discard the other options of the same question.
            try:
                misconception_id = row[f"Misconception{letter}Id"]
                if pd.isna(misconception_id):
                    continue  # this distractor has no labelled misconception
                misconception_name = misconception_names[int(misconception_id)]
                cot_records.append(
                    build_cot_record(
                        row,
                        letter,
                        misconception_name,
                        get_response=get_response_openai,
                        model=OPENAI_COT_MODEL,
                    )
                )
            except Exception as e:
                print(f"An error occurred (QuestionId={row.get('QuestionId')}, answer {letter}): {e}")
finally:
    # Runs even if the loop is interrupted, so already-generated (and already
    # billed) responses are kept in ``cot_data``.
    if cot_records:
        cot_data = pd.concat([cot_data, pd.DataFrame(cot_records)], ignore_index=True)

#### Use Groq Models

In [None]:
for idx, row in tqdm(train.iterrows(), total=len(train)):
    try:
        if row["CorrectAnswer"] == "A":
            if math.isnan(row["MisconceptionBId"]) is False:
                cot_input = cot_prompt.replace("{{ConstructName}}", lowercase_first_letter(row["ConstructName"])).replace("{{QuestionText}}", row["QuestionText"]).replace("{{CorrectAnswerText}}", row["AnswerAText"]) \
                .replace("{{IncorrectAnswerText}}", row["AnswerBText"]).replace("{{MisconceptionName}}", misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionBId"])]["MisconceptionName"].values[0])
                response = get_response_groq(model="llama-3.1-70b-versatile", prompt=cot_input)
                response = re.search(r"<analyze>(.*?)</analyze>", response, re.DOTALL).group(1).strip()
                new_row = pd.DataFrame([{
                    "QuestionId": row["QuestionId"],
                    "ConstructId": row["ConstructId"],
                    "ConstructName": row["ConstructName"],
                    "SubjectId": row["SubjectId"],
                    "SubjectName": row["SubjectName"],
                    "CorrectAnswer": row["AnswerAText"],
                    "QuestionText": row["QuestionText"],
                    "AnswerText": row["AnswerBText"],
                    "IsCorrect": 0,
                    "MisconceptionId": row["MisconceptionBId"],
                    "MisconceptionName": misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionBId"])]["MisconceptionName"].values[0],
                    "cot_input": cot_input,
                    "cot_inference": response
                }])
                cot_data = pd.concat([cot_data, new_row], ignore_index=True)
            if math.isnan(row["MisconceptionCId"]) is False:
                cot_input = cot_prompt.replace("{{ConstructName}}", lowercase_first_letter(row["ConstructName"])).replace("{{QuestionText}}", row["QuestionText"]).replace("{{CorrectAnswerText}}", row["AnswerAText"]) \
                .replace("{{IncorrectAnswerText}}", row["AnswerCText"]).replace("{{MisconceptionName}}", misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionCId"])]["MisconceptionName"].values[0])
                response = get_response_groq(model="llama-3.1-70b-versatile", prompt=cot_input)
                response = re.search(r"<analyze>(.*?)</analyze>", response, re.DOTALL).group(1).strip()
                new_row = pd.DataFrame([{
                    "QuestionId": row["QuestionId"],
                    "ConstructId": row["ConstructId"],
                    "ConstructName": row["ConstructName"],
                    "SubjectId": row["SubjectId"],
                    "SubjectName": row["SubjectName"],
                    "CorrectAnswer": row["AnswerAText"],
                    "QuestionText": row["QuestionText"],
                    "AnswerText": row["AnswerCText"],
                    "IsCorrect": 0,
                    "MisconceptionId": row["MisconceptionCId"],
                    "MisconceptionName": misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionCId"])]["MisconceptionName"].values[0],
                    "cot_input": cot_input,
                    "cot_inference": response
                }])
                cot_data = pd.concat([cot_data, new_row], ignore_index=True)
            if math.isnan(row["MisconceptionDId"]) is False:
                cot_input = cot_prompt.replace("{{ConstructName}}", lowercase_first_letter(row["ConstructName"])).replace("{{QuestionText}}", row["QuestionText"]).replace("{{CorrectAnswerText}}", row["AnswerAText"]) \
                .replace("{{IncorrectAnswerText}}", row["AnswerDText"]).replace("{{MisconceptionName}}", misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionDId"])]["MisconceptionName"].values[0])
                response = get_response_groq(model="llama-3.1-70b-versatile", prompt=cot_input)
                response = re.search(r"<analyze>(.*?)</analyze>", response, re.DOTALL).group(1).strip()
                new_row = pd.DataFrame([{
                    "QuestionId": row["QuestionId"],
                    "ConstructId": row["ConstructId"],
                    "ConstructName": row["ConstructName"],
                    "SubjectId": row["SubjectId"],
                    "SubjectName": row["SubjectName"],
                    "CorrectAnswer": row["AnswerAText"],
                    "QuestionText": row["QuestionText"],
                    "AnswerText": row["AnswerDText"],
                    "IsCorrect": 0,
                    "MisconceptionId": row["MisconceptionDId"],
                    "MisconceptionName": misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionDId"])]["MisconceptionName"].values[0],
                    "cot_input": cot_input,
                    "cot_inference": response
                }])
                cot_data = pd.concat([cot_data, new_row], ignore_index=True)
        elif row["CorrectAnswer"] == "B":
            if math.isnan(row["MisconceptionAId"]) is False:
                cot_input = cot_prompt.replace("{{ConstructName}}", lowercase_first_letter(row["ConstructName"])).replace("{{QuestionText}}", row["QuestionText"]).replace("{{CorrectAnswerText}}", row["AnswerBText"]) \
                .replace("{{IncorrectAnswerText}}", row["AnswerAText"]).replace("{{MisconceptionName}}", misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionAId"])]["MisconceptionName"].values[0])
                response = get_response_groq(model="llama-3.1-70b-versatile", prompt=cot_input)
                response = re.search(r"<analyze>(.*?)</analyze>", response, re.DOTALL).group(1).strip()
                new_row = pd.DataFrame([{
                    "QuestionId": row["QuestionId"],
                    "ConstructId": row["ConstructId"],
                    "ConstructName": row["ConstructName"],
                    "SubjectId": row["SubjectId"],
                    "SubjectName": row["SubjectName"],
                    "CorrectAnswer": row["AnswerBText"],
                    "QuestionText": row["QuestionText"],
                    "AnswerText": row["AnswerAText"],
                    "IsCorrect": 0,
                    "MisconceptionId": row["MisconceptionAId"],
                    "MisconceptionName": misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionAId"])]["MisconceptionName"].values[0],
                    "cot_input": cot_input,
                    "cot_inference": response
                }])
                cot_data = pd.concat([cot_data, new_row], ignore_index=True)
            # The correct answer text used here is option B's. Options C and D were
            # handled by two copy-pasted blocks; a single loop keeps the prompt and
            # row construction in one place so the variants cannot drift apart.
            for wrong_option in ("C", "D"):
                wrong_misconception_id = row[f"Misconception{wrong_option}Id"]
                # pd.notna gives the same answer as `math.isnan(...) is False` for
                # floats, without raising TypeError on None / non-float cells.
                if not pd.notna(wrong_misconception_id):
                    continue

                # Look the misconception name up once and reuse it for both the
                # prompt and the stored row (still raises if the id is unmapped).
                wrong_misconception_name = misconception_mapping.loc[
                    misconception_mapping["MisconceptionId"] == int(wrong_misconception_id),
                    "MisconceptionName",
                ].values[0]

                # Placeholder substitution order is kept identical to the original.
                cot_input = (
                    cot_prompt
                    .replace("{{ConstructName}}", lowercase_first_letter(row["ConstructName"]))
                    .replace("{{QuestionText}}", row["QuestionText"])
                    .replace("{{CorrectAnswerText}}", row["AnswerBText"])
                    .replace("{{IncorrectAnswerText}}", row[f"Answer{wrong_option}Text"])
                    .replace("{{MisconceptionName}}", wrong_misconception_name)
                )
                response = get_response_groq(model="llama-3.1-70b-versatile", prompt=cot_input)

                analysis_match = re.search(r"<analyze>(.*?)</analyze>", response, re.DOTALL)
                if analysis_match is None:
                    # Previously `.group(1)` on None raised an opaque AttributeError
                    # that also discarded the remaining options of this question.
                    # Skip only the malformed response and say which one it was.
                    print(f"No <analyze> block in response for QuestionId {row['QuestionId']} option {wrong_option}; skipping this option")
                    continue
                response = analysis_match.group(1).strip()

                new_row = pd.DataFrame([{
                    "QuestionId": row["QuestionId"],
                    "ConstructId": row["ConstructId"],
                    "ConstructName": row["ConstructName"],
                    "SubjectId": row["SubjectId"],
                    "SubjectName": row["SubjectName"],
                    "CorrectAnswer": row["AnswerBText"],
                    "QuestionText": row["QuestionText"],
                    "AnswerText": row[f"Answer{wrong_option}Text"],
                    "IsCorrect": 0,
                    "MisconceptionId": wrong_misconception_id,
                    "MisconceptionName": wrong_misconception_name,
                    "cot_input": cot_input,
                    "cot_inference": response
                }])
                cot_data = pd.concat([cot_data, new_row], ignore_index=True)
        elif row["CorrectAnswer"] == "C":
            if math.isnan(row["MisconceptionAId"]) is False:
                cot_input = cot_prompt.replace("{{ConstructName}}", lowercase_first_letter(row["ConstructName"])).replace("{{QuestionText}}", row["QuestionText"]).replace("{{CorrectAnswerText}}", row["AnswerCText"]) \
                .replace("{{IncorrectAnswerText}}", row["AnswerAText"]).replace("{{MisconceptionName}}", misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionAId"])]["MisconceptionName"].values[0])
                response = get_response_groq(model="llama-3.1-70b-versatile", prompt=cot_input)
                response = re.search(r"<analyze>(.*?)</analyze>", response, re.DOTALL).group(1).strip()
                new_row = pd.DataFrame([{
                    "QuestionId": row["QuestionId"],
                    "ConstructId": row["ConstructId"],
                    "ConstructName": row["ConstructName"],
                    "SubjectId": row["SubjectId"],
                    "SubjectName": row["SubjectName"],
                    "CorrectAnswer": row["AnswerCText"],
                    "QuestionText": row["QuestionText"],
                    "AnswerText": row["AnswerAText"],
                    "IsCorrect": 0,
                    "MisconceptionId": row["MisconceptionAId"],
                    "MisconceptionName": misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionAId"])]["MisconceptionName"].values[0],
                    "cot_input": cot_input,
                    "cot_inference": response
                }])
                cot_data = pd.concat([cot_data, new_row], ignore_index=True)
            if math.isnan(row["MisconceptionBId"]) is False:
                cot_input = cot_prompt.replace("{{ConstructName}}", lowercase_first_letter(row["ConstructName"])).replace("{{QuestionText}}", row["QuestionText"]).replace("{{CorrectAnswerText}}", row["AnswerCText"]) \
                .replace("{{IncorrectAnswerText}}", row["AnswerBText"]).replace("{{MisconceptionName}}", misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionBId"])]["MisconceptionName"].values[0])
                response = get_response_groq(model="llama-3.1-70b-versatile", prompt=cot_input)
                response = re.search(r"<analyze>(.*?)</analyze>", response, re.DOTALL).group(1).strip()
                new_row = pd.DataFrame([{
                    "QuestionId": row["QuestionId"],
                    "ConstructId": row["ConstructId"],
                    "ConstructName": row["ConstructName"],
                    "SubjectId": row["SubjectId"],
                    "SubjectName": row["SubjectName"],
                    "CorrectAnswer": row["AnswerCText"],
                    "QuestionText": row["QuestionText"],
                    "AnswerText": row["AnswerBText"],
                    "IsCorrect": 0,
                    "MisconceptionId": row["MisconceptionBId"],
                    "MisconceptionName": misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionBId"])]["MisconceptionName"].values[0],
                    "cot_input": cot_input,
                    "cot_inference": response
                }])
                cot_data = pd.concat([cot_data, new_row], ignore_index=True)
            if math.isnan(row["MisconceptionDId"]) is False:
                cot_input = cot_prompt.replace("{{ConstructName}}", lowercase_first_letter(row["ConstructName"])).replace("{{QuestionText}}", row["QuestionText"]).replace("{{CorrectAnswerText}}", row["AnswerCText"]) \
                .replace("{{IncorrectAnswerText}}", row["AnswerDText"]).replace("{{MisconceptionName}}", misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionDId"])]["MisconceptionName"].values[0])
                response = get_response_groq(model="llama-3.1-70b-versatile", prompt=cot_input)
                response = re.search(r"<analyze>(.*?)</analyze>", response, re.DOTALL).group(1).strip()
                new_row = pd.DataFrame([{
                    "QuestionId": row["QuestionId"],
                    "ConstructId": row["ConstructId"],
                    "ConstructName": row["ConstructName"],
                    "SubjectId": row["SubjectId"],
                    "SubjectName": row["SubjectName"],
                    "CorrectAnswer": row["AnswerCText"],
                    "QuestionText": row["QuestionText"],
                    "AnswerText": row["AnswerDText"],
                    "IsCorrect": 0,
                    "MisconceptionId": row["MisconceptionDId"],
                    "MisconceptionName": misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionDId"])]["MisconceptionName"].values[0],
                    "cot_input": cot_input,
                    "cot_inference": response
                }])
                cot_data = pd.concat([cot_data, new_row], ignore_index=True)
        elif row["CorrectAnswer"] == "D":
            if math.isnan(row["MisconceptionAId"]) is False:
                cot_input = cot_prompt.replace("{{ConstructName}}", lowercase_first_letter(row["ConstructName"])).replace("{{QuestionText}}", row["QuestionText"]).replace("{{CorrectAnswerText}}", row["AnswerDText"]) \
                .replace("{{IncorrectAnswerText}}", row["AnswerAText"]).replace("{{MisconceptionName}}", misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionAId"])]["MisconceptionName"].values[0])
                response = get_response_groq(model="llama-3.1-70b-versatile", prompt=cot_input)
                response = re.search(r"<analyze>(.*?)</analyze>", response, re.DOTALL).group(1).strip()
                new_row = pd.DataFrame([{
                    "QuestionId": row["QuestionId"],
                    "ConstructId": row["ConstructId"],
                    "ConstructName": row["ConstructName"],
                    "SubjectId": row["SubjectId"],
                    "SubjectName": row["SubjectName"],
                    "CorrectAnswer": row["AnswerDText"],
                    "QuestionText": row["QuestionText"],
                    "AnswerText": row["AnswerAText"],
                    "IsCorrect": 0,
                    "MisconceptionId": row["MisconceptionAId"],
                    "MisconceptionName": misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionAId"])]["MisconceptionName"].values[0],
                    "cot_input": cot_input,
                    "cot_inference": response
                }])
                cot_data = pd.concat([cot_data, new_row], ignore_index=True)
            if math.isnan(row["MisconceptionBId"]) is False:
                cot_input = cot_prompt.replace("{{ConstructName}}", lowercase_first_letter(row["ConstructName"])).replace("{{QuestionText}}", row["QuestionText"]).replace("{{CorrectAnswerText}}", row["AnswerDText"]) \
                .replace("{{IncorrectAnswerText}}", row["AnswerBText"]).replace("{{MisconceptionName}}", misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionBId"])]["MisconceptionName"].values[0])
                response = get_response_groq(model="llama-3.1-70b-versatile", prompt=cot_input)
                response = re.search(r"<analyze>(.*?)</analyze>", response, re.DOTALL).group(1).strip()
                new_row = pd.DataFrame([{
                    "QuestionId": row["QuestionId"],
                    "ConstructId": row["ConstructId"],
                    "ConstructName": row["ConstructName"],
                    "SubjectId": row["SubjectId"],
                    "SubjectName": row["SubjectName"],
                    "CorrectAnswer": row["AnswerDText"],
                    "QuestionText": row["QuestionText"],
                    "AnswerText": row["AnswerBText"],
                    "IsCorrect": 0,
                    "MisconceptionId": row["MisconceptionBId"],
                    "MisconceptionName": misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionBId"])]["MisconceptionName"].values[0],
                    "cot_input": cot_input,
                    "cot_inference": response
                }])
                cot_data = pd.concat([cot_data, new_row], ignore_index=True)
            if math.isnan(row["MisconceptionCId"]) is False:
                cot_input = cot_prompt.replace("{{ConstructName}}", lowercase_first_letter(row["ConstructName"])).replace("{{QuestionText}}", row["QuestionText"]).replace("{{CorrectAnswerText}}", row["AnswerDText"]) \
                .replace("{{IncorrectAnswerText}}", row["AnswerCText"]).replace("{{MisconceptionName}}", misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionCId"])]["MisconceptionName"].values[0])
                response = get_response_groq(model="llama-3.1-70b-versatile", prompt=cot_input)
                response = re.search(r"<analyze>(.*?)</analyze>", response, re.DOTALL).group(1).strip()
                new_row = pd.DataFrame([{
                    "QuestionId": row["QuestionId"],
                    "ConstructId": row["ConstructId"],
                    "ConstructName": row["ConstructName"],
                    "SubjectId": row["SubjectId"],
                    "SubjectName": row["SubjectName"],
                    "CorrectAnswer": row["AnswerDText"],
                    "QuestionText": row["QuestionText"],
                    "AnswerText": row["AnswerCText"],
                    "IsCorrect": 0,
                    "MisconceptionId": row["MisconceptionCId"],
                    "MisconceptionName": misconception_mapping[misconception_mapping["MisconceptionId"] == int(row["MisconceptionCId"])]["MisconceptionName"].values[0],
                    "cot_input": cot_input,
                    "cot_inference": response
                }])
                cot_data = pd.concat([cot_data, new_row], ignore_index=True)
    except Exception as e:
        # print(response)
        print(f"An error occurred: {e}")

In [None]:
# Preview the first rows of the chain-of-thought table accumulated by the
# generation loop, as a quick sanity check of the columns that were filled in.
cot_data.head()

In [None]:
# (rows, columns) of the generated table; each row is one incorrect answer
# option for which an analysis was produced and appended.
cot_data.shape

In [None]:
# Display the analysis text stored for the row labelled 0, in full.
cot_data.loc[0, "cot_inference"]

In [18]:
# Persist the generated chain-of-thought table.
# NOTE(review): the file name says "4o-mini", but the generation calls visible in
# this notebook use "llama-3.1-70b-versatile" via Groq — confirm the name is
# intended before relying on it downstream. The path itself is left unchanged.
cot_output_path = "./adv_data/cot_data_version2_4o-mini_20240923.parquet"
# to_parquet does not create missing parent directories; make sure the target
# folder exists so a long generation run is not followed by a failed write.
os.makedirs(os.path.dirname(cot_output_path), exist_ok=True)
cot_data.to_parquet(cot_output_path, index=False)