# In [None]:
# MisstepMath Dataset Generation Notebook
# This Colab-friendly version organizes and comments the original Azure Functions script.

import openai
import json
import csv
import os
from openai import OpenAI

# === CONFIGURATION ===
# Placeholder credentials: replace "<organization-id>" and "<secret-id>" with
# real values before running.
# NOTE(review): avoid committing real secrets with the notebook; consider
# loading them from the environment or a secrets store instead.
openai.organization = "<organization-id>"
openai.api_key = "<secret-id>"
# Mirror the key into the process environment. Presumably this is how the
# OpenAI() client below obtains it, since no api_key argument is passed to the
# constructor -- confirm against the installed openai SDK version.
os.environ["OPENAI_API_KEY"] = openai.api_key

# Module-level client instance; the generation step calls
# client.chat.completions.create(...) on it.
client = OpenAI(organization=openai.organization)

# === STEP 1: Curriculum Map ===
# Nested lookup table: grade -> topic -> list of sub-topic descriptions.
# Paste your full curriculum topic map here as a dictionary:
math_topic_data = {
    "K5": {
        "Data Analysis": [
            "solve real world word problems by referring to line plots",
        ],
    },
    # Add rest of the grades and topics as needed
}

# === STEP 2: Retrieve existing entries from CSV ===
def retrieve_relevant_context(grade, topic, sub_topic, challenge_type,
                              rag_file_path="seed_dataset.csv"):
    """Return the seed examples for one curriculum slot as a JSON string.

    The seed CSV is scanned row by row and every row whose "Grade", "Topic",
    "Sub Topic" and "Challenge Type" columns all equal the requested values is
    converted to the dictionary layout used in the generation prompt.

    Args:
        grade: Grade label to match; compared against the CSV value as
            ``str(grade)``.
        topic: Exact topic name to match.
        sub_topic: Exact sub-topic description to match.
        challenge_type: Exact challenge type to match.
        rag_file_path: Path of the seed CSV file. Defaults to
            ``"seed_dataset.csv"`` in the current working directory.

    Returns:
        A JSON array (as a string) of the matching entries, or ``""`` when no
        row matches.

    Raises:
        FileNotFoundError: If the seed CSV does not exist.
        KeyError: If a row is missing one of the columns used for matching.
    """
    wanted_grade = str(grade)
    relevant_data = []
    # newline="" is what the csv module requires so that line breaks embedded
    # in quoted cells are preserved as written. "utf-8-sig" drops a leading
    # byte-order mark (when present) so the first header is not read as
    # "\ufeffGrade"; files without a BOM decode exactly as with "utf-8".
    with open(rag_file_path, mode="r", encoding="utf-8-sig", newline="") as file:
        for row in csv.DictReader(file):
            if not (row["Grade"] == wanted_grade and
                    row["Topic"] == topic and
                    row["Sub Topic"] == sub_topic and
                    row["Challenge Type"] == challenge_type):
                continue
            # The text resolution is emitted twice: once on its own and once
            # paired with the whiteboard content.
            resolution_text = row.get("Teacher's Resolution - Text", "")
            relevant_data.append({
                "challenge_type": row["Challenge Type"],
                "challenge_faced": row.get("Challenge Faced", ""),
                "example": row.get("Example", ""),
                "student_mistake": row.get("Student's Mistake", ""),
                "teachers_resolution_text_only": resolution_text,
                "teacher_response_whiteboard": {
                    "whiteboard": row.get("Teacher's Resolution - Whiteboard", ""),
                    "text": resolution_text,
                },
            })
    return json.dumps(relevant_data) if relevant_data else ""

# === STEP 3: Run ChatGPT Generation ===
def _extract_json_payload(content):
    """Return the JSON text of a model reply, with or without a Markdown fence."""
    tail = content.split("```json")[-1]
    pieces = tail.split("```")
    # Fenced reply: the payload is the segment just before the closing fence.
    # Unfenced reply: there is nothing to strip, so use the text unchanged
    # (indexing [-2] unconditionally would raise IndexError here).
    return pieces[-2] if len(pieces) >= 2 else pieces[0]


def generate_student_mistakes(k_class, k_topic, cur_sub_topic, challenge_type):
    """Ask the chat model for new student-mistake entries for one curriculum slot.

    Matching seed examples are looked up with ``retrieve_relevant_context`` and
    embedded in the prompt as inspiration. The prompt is sent as a single user
    message to the "gpt-4o" model at temperature 0.7, and the reply is parsed
    as JSON.

    Args:
        k_class: Grade label (for example "K5").
        k_topic: Topic name within the grade.
        cur_sub_topic: Sub-topic description within the topic.
        challenge_type: Challenge type the generated mistakes should belong to.

    Returns:
        A list of generated entries. A reply consisting of one JSON object is
        wrapped in a one-element list. An empty list is returned (after
        printing the error) when the API call fails or the reply is not a JSON
        array/object.

    Raises:
        Whatever ``retrieve_relevant_context`` raises (for example when the
        seed CSV is missing); that lookup is not covered by the error handling.
    """
    previous_data = retrieve_relevant_context(k_class, k_topic, cur_sub_topic, challenge_type)

    query = f"""
    Generate 5-10 diverse student mistakes that follow the pattern of existing examples below.
    Grade: {k_class}
    Topic: {k_topic}
    Sub-topic: {cur_sub_topic}
    Challenge Type: {challenge_type}

    Use the following previous examples as inspiration (do not repeat them, but produce similarly styled new examples): {previous_data}

    Use this format:
    {{
        "challenge_type": "...",
        "challenge_faced": "...",
        "example": "...",
        "student_mistake": "...",
        "teachers_resolution_text_only": "...",
        "teacher_response_whiteboard": {{"whiteboard": "...", "text": "..."}}
    }}
    Return a VALID JSON array.
    """

    # Service boundary: any failure of the request itself is reported and
    # turned into an empty result so a batch run can continue.
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": query}],
            temperature=0.7,
        )
        content = response.choices[0].message.content
    except Exception as e:
        print(f"Error generating for {challenge_type}:", e)
        return []

    try:
        # `content or ""` makes a missing reply fail as a normal parse error.
        parsed = json.loads(_extract_json_payload(content or ""))
    except json.JSONDecodeError as e:
        print(f"Error generating for {challenge_type}:", e)
        return []

    if isinstance(parsed, dict):
        # A lone object would otherwise be iterated key-by-key by callers that
        # extend a list with the result.
        return [parsed]
    if not isinstance(parsed, list):
        print(f"Error generating for {challenge_type}:", "reply is not a JSON array")
        return []
    return parsed

# === STEP 4: Example Run ===
# Pick one grade / topic / sub-topic out of the curriculum map.
k_class, k_topic = "K5", "Data Analysis"
sub_topic_index = 0
cur_sub_topic = math_topic_data[k_class][k_topic][sub_topic_index]

# Challenge types to generate for, processed in this order.
challenge_types = ["Misconception", "Attention", "Language Barrier"]

# Collect the generated entries of every challenge type into one list.
final_output = []
for challenge_type in challenge_types:
    final_output += generate_student_mistakes(k_class, k_topic, cur_sub_topic, challenge_type)

# Save to JSONL: one JSON document per line.
with open("generated_misstepmath_data.jsonl", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(entry) + "\n" for entry in final_output)

print(f"Generated {len(final_output)} entries for {k_class} - {k_topic} - {cur_sub_topic}.")
