# **Notebook for synthetic data generation**

In [None]:
# Imports
from openai import OpenAI
import os
import json

In [None]:
# Retrieve the system prompt.
# The prompt text contains non-ASCII characters (e.g. "’", "é", "–"), so the
# file is decoded explicitly as UTF-8 instead of relying on the platform's
# default encoding, which is not UTF-8 everywhere.
with open("prompt/data-synthetization.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# The system prompt is stored under the top-level "prompt" key.
prompt = data["prompt"]
print(prompt)

Generate a realistic and diverse list of JSON objects simulating synthetic users’ interactions with ScholéAI, an AI-powered personalized online learning platform for Data Science learners. Each object must represent both explicit feedback and implicit behavior during platform usage. The structure of each object must follow exactly the schema below:

{
  "user_id": int,
  "explicit_data": {
    "ratings_on_modules": { "module_X": int (1–5), ... },
    "approval_of_content_modifications": [
      {
        "change_id": str,
        "change": str,
        "status": "approved" | "rejected"
      }
    ],
    "explicit_learning_goals": str,
    "drag_and_drop_curriculum_edits": [
      {
        "module": str,
        "from_index": int,
        "to_index": int
      }
    ],
    "curriculum_editing_feedback": str,
    "preferred_content_format": "text" | "video" | "audio",
    "reflection_inputs": str,
    "satisfaction_surveys": {
      "overall_satisfaction": int (1–5),
      "interface_u

In [None]:
# Read the API key from the environment only, so that no credential ever needs
# to be pasted into the notebook, and fail early with a clear message instead
# of sending a placeholder string to the API.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this cell."
    )

client = OpenAI(api_key=api_key)
model = "gpt-4o-mini"

# `instructions` carries the system prompt loaded earlier in the notebook;
# `input` is the user message asking for a single synthetic sample.
response = client.responses.create(
    instructions=prompt,
    model=model,
    input="Generate 1 sample.",
)

# Display the response's output items as plain Python dicts.
response.to_dict()["output"]


[{'id': 'msg_681795ac28988191bf9eebb1d09c289d0c15dc02627e95db',
  'content': [{'annotations': [],
    'text': '```json\n{\n  "user_id": 1,\n  "explicit_data": {\n    "ratings_on_modules": { "Introduction to Data Science": 5, "Machine Learning Basics": 4, "Data Visualization": 3 },\n    "approval_of_content_modifications": [\n      {\n        "change_id": "mod123",\n        "change": "Added more examples to Data Visualization",\n        "status": "approved"\n      }\n    ],\n    "explicit_learning_goals": "To gain a solid understanding of machine learning algorithms and improve data visualization skills.",\n    "drag_and_drop_curriculum_edits": [\n      {\n        "module": "Machine Learning Basics",\n        "from_index": 1,\n        "to_index": 0\n      }\n    ],\n    "curriculum_editing_feedback": "Would prefer more practical examples and case studies.",\n    "preferred_content_format": "video",\n    "reflection_inputs": "I find it useful to see real-world applications of concepts.",