In [None]:
# Install necessary libraries
!pip install openai
!pip install openai pandas faiss-cpu sentence-transformers

# Import required libraries
import openai
import pandas as pd
import faiss
import numpy as np
import json
import time
import os
from openai import OpenAI
from sentence_transformers import SentenceTransformer

# Set OpenAI API credentials.
# The values are read from the environment so that a real key never has to be
# written into (and accidentally committed with) the notebook, and so that a
# key already exported as OPENAI_API_KEY is no longer overwritten by an empty
# literal. When the variables are unset both values fall back to '' exactly as
# before.
secret_key = os.environ.get("OPENAI_API_KEY", "")
organization_id = os.environ.get("OPENAI_ORG_ID", "")
openai.organization = organization_id
openai.api_key = secret_key
os.environ["OPENAI_API_KEY"] = secret_key
# Pass None rather than an empty string when no organization is configured, so
# the client uses its own default instead of an empty organization value.
client = OpenAI(organization=organization_id or None)

# Sentence-embedding model, shared by the index build below and by query-time retrieval
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Input and output locations used throughout the notebook
train_dataset_path = "data/train_dataset.json"
selected_combinations_path = "data/selected_50_unique_rows_teacher_model.csv"
output_csv_path = "data/student_rag_results.csv"
output_copy_csv_path = "data/student_rag_results_copy.csv"

# Read the RAG corpus (JSON) into a DataFrame, and the CSV of combinations to generate for
train_data = {}
with open(train_dataset_path, "r", encoding="utf-8") as fh:
    train_data = json.load(fh)
rag_df = pd.DataFrame(train_data)
selected_combinations = pd.read_csv(selected_combinations_path)

# Embed every stored mistake prompt and load the vectors into a flat L2 FAISS index
mistake_prompts = rag_df["Student's mistake prompt"].tolist()
embeddings = embedding_model.encode(mistake_prompts, convert_to_numpy=True)
dimension = embeddings.shape[1]  # width of one embedding vector
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Function to retrieve relevant context from RAG database
def retrieve_context(query, top_k=3):
    """Return the RAG rows whose mistake prompts are nearest to *query*.

    Args:
        query: Free-text query; it is embedded with ``embedding_model``.
        top_k: Maximum number of rows to return.

    Returns:
        A list of at most ``top_k`` dicts with the keys "Challenge Type",
        "Problem" and "Student's mistake prompt", in the order FAISS returned
        them. The list is empty when the index is empty or ``top_k`` is not
        positive.
    """
    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with -1, and `iloc[-1]` would silently return the LAST row as if
    # it were a match.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_embedding, k)

    # Defensive filter: drop any remaining -1 placeholders.
    row_positions = [int(i) for i in indices[0] if i >= 0]
    columns = ["Challenge Type", "Problem", "Student's mistake prompt"]
    return rag_df.iloc[row_positions][columns].to_dict(orient='records')

# Function to generate student mistakes with RAG

def _extract_json_payload(text):
    """Return the JSON text of a model reply, unwrapping a Markdown code fence if present."""
    # Prefer an explicit ```json fence; fall back to a bare ``` fence, which
    # previously went unhandled and made json.loads fail.
    for fence in ("```json", "```"):
        if fence in text:
            return text.split(fence, 1)[1].split("```", 1)[0]
    return text


def generate_student_mistake_rag(grade, topic, sub_topic):
    """Ask the chat model for one new student mistake, primed with retrieved examples.

    Args:
        grade: Grade label interpolated into the retrieval query and the prompt.
        topic: Topic name interpolated into the retrieval query and the prompt.
        sub_topic: Sub-topic name interpolated into the retrieval query and the prompt.

    Returns:
        A dict containing at least the keys "challenge_type", "question" and
        "mistake", or ``None`` when the request fails, the reply is not valid
        JSON, or the JSON does not have that shape. Errors are printed, not
        raised.
    """
    query = f"Common mistakes in {sub_topic} within {topic} for grade {grade} students."
    retrieved_contexts = retrieve_context(query)
    context_str = "\n".join(json.dumps(ctx) for ctx in retrieved_contexts)

    # NOTE(review): the prompt text contains typos ("divervent", "stduent") and
    # has no separator before "Output only JSON". It is deliberately left
    # byte-identical so already-generated data stays comparable.
    prompt = (
        f"Given the following examples of student mistakes: {context_str}\n"
        f"Generate a divervent diverse stduent mistake for a grade:{grade} student with topic:{topic} in sub-topic:{sub_topic}."
        f"Output only JSON format (valid): {{\"challenge_type\":\"<type>\", \"question\":\"<math problem>\", \"mistake\":\"<student's mistake>\"}}"
    )

    required_keys = ("challenge_type", "question", "mistake")
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",  # Use GPT-4 or another model
            messages=[{"role": "user", "content": prompt}],
            max_tokens=500
        )
        output = response.choices[0].message.content.strip()
        parsed = json.loads(_extract_json_payload(output))
    except Exception as e:
        # Best-effort boundary: one failed sample must not abort the whole run.
        print(f"Error: {e}")
        return None

    # Validate the shape here so callers can index the result without crashing.
    if not isinstance(parsed, dict):
        print(f"Error: expected a JSON object, got {type(parsed).__name__}")
        return None
    missing = [key for key in required_keys if key not in parsed]
    if missing:
        print(f"Error: response is missing keys {missing}")
        return None
    return parsed

# Generate dataset using RAG-enhanced model.
# Every accepted sample is appended to `output_csv_path` immediately, so
# partial progress survives an interruption. The full in-memory list is also
# written once at the end to `output_copy_csv_path`.
math_data = []
num_samples = 100  # generation attempts per (grade, topic, sub-topic) row

for _, row in selected_combinations.iterrows():
    grade, topic, sub_topic = row["Grade"], row["Topic"], row["Sub Topic"]

    for _ in range(num_samples):
        print(f"📝 Generating sample for: {grade}, {topic}, {sub_topic}")
        response = generate_student_mistake_rag(grade, topic, sub_topic)

        if response:
            print(response)
            try:
                challenge_type = response["challenge_type"]
                question = response["question"]
                mistake = response["mistake"]
            except (KeyError, TypeError) as e:
                # The model returned JSON of the wrong shape; skip this sample
                # instead of aborting the whole (long-running) generation.
                print(f"Skipping malformed response: {e!r}")
            else:
                math_data.append([grade, topic, sub_topic, challenge_type, question, mistake])

                extracted_data = pd.DataFrame([{"Grade":grade, "Topic":topic, "Sub Topic":sub_topic, "Challenge Type":challenge_type, "Problem":question, "Student Mistake Prompt":mistake}])

                # Append mode creates the file if needed; write the header only
                # when the file does not exist yet.
                extracted_data.to_csv(
                    output_csv_path,
                    mode='a',
                    header=not os.path.exists(output_csv_path),
                    index=False,
                )

        time.sleep(1)  # Avoid rate limits

# Convert to DataFrame
math_df = pd.DataFrame(math_data, columns=["Grade", "Topic", "Sub Topic", "Challenge Type", "Problem", "Student Mistake Prompt"])

# Save the in-memory copy of this run's samples
math_df.to_csv(output_copy_csv_path, index=False)
print(f"Dataset generation complete. Saved at: {output_copy_csv_path}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{'challenge_type': 'Attention Error', 'question': 'Measure the length of a crayon using a ruler.', 'mistake': 'Student holds the ruler upside down and measures in centimeters instead of inches.'}
📝 Generating sample for: K3, Measurement, measuring lengths (including in fractions of an inch)
{'challenge_type': 'Attention Error', 'question': 'Measure the length of a toy car using a ruler.', 'mistake': 'Student measures the length of the ruler itself instead of the toy car.'}
📝 Generating sample for: K3, Measurement, measuring lengths (including in fractions of an inch)
{'challenge_type': 'Attention Error', 'question': 'Use a ruler to measure the length of a crayon.', 'mistake': 'Student measures the length and width of the crayon instead of just the length.'}
📝 Generating sample for: K3, Measurement, measuring lengths (including in fractions of an inch)
{'challenge_type': 'Conceptual Error', 'question': 'Measure the length 