In [1]:
import json
import os
from collections import OrderedDict
from functools import partial
from pathlib import Path
from typing import Annotated, Literal, Optional
from uuid import uuid4

import anthropic
import instructor
import openai
from chromadb import Collection as ChromaCollection
from dotenv import load_dotenv
from langsmith import traceable
from langsmith.wrappers import wrap_openai
from pydantic import (
    AfterValidator,
    BaseModel,
    Field,
    ValidationInfo,
    field_validator,
    model_validator,
)
from tenacity import RetryError, Retrying, stop_after_attempt, wait_random_exponential

from dreamai.ai import ModelName, system_message, user_message
from dreamai.chroma import chroma_collection, query_collection, traverse_ids
from dreamai.pdf import pdf_to_collection
from dreamai.utils import deindent

load_dotenv()

ask_oai = instructor.from_openai(wrap_openai(openai.OpenAI()))
ask_cld = instructor.from_anthropic(anthropic.Anthropic())

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# from collections import OrderedDict

# topics = json.load(open("math_102_app_topics.json"))
# topics = OrderedDict(
#     {
#         topic["name"]: OrderedDict(
#             {
#                 subtopic["name"]: {
#                     concept["concept"]: concept["question_ids"]
#                     for concept in subtopic["concepts"]
#                 }
#                 for subtopic in topic["subtopics"]
#             }
#         )
#         for topic in topics
#     }
# )

In [3]:
# data_dir = Path("/media/hamza/data2/MATH/train/")
# questions_dir = Path("math_102_questions")
# question_id = 0
# for folder in data_dir.iterdir():
#     if folder.is_dir() and folder.name != "counting_and_probability":
#         folder_questions = []
#         for question_file in folder.glob("*.json"):
#             question = json.loads(question_file.read_text())
#             if "5" in question["level"]:
#                 dest = questions_dir / f"{folder.name}/{question_id}.json"
#                 os.makedirs(dest.parent, exist_ok=True)
#                 with open(dest, "w") as f:
#                     json.dump(
#                         {
#                             "id": str(question_id),
#                             "problem": question["problem"],
#                             "solution": question["solution"],
#                         },
#                         f,
#                         indent=2,
#                     )
#                 question_id += 1

In [4]:
# Defaults shared by the LLM helper functions defined in later cells.
MODEL = ModelName.GPT_3
MAX_TOKENS = 2048
ATTEMPTS = 3

# Upper bound on the number of question files loaded from each questions folder.
QUESTIONS_PER_FOLDER = 30
# Minimum number of words a generated concept name must contain.
CONCEPT_WORD_COUNT = 3
# NOTE(review): MIN_SUBQUESTIONS / MAX_SUBQUESTIONS are not referenced in the visible
# cells; SubQuestionsWithTopicSubtopic hard-codes its own bounds (3 and 4).
MIN_SUBQUESTIONS = 3
MAX_SUBQUESTIONS = 5
# Only interpolated into the Topics field description; the matching length
# constraints on that field are commented out.
MIN_TOPICS = 10
MAX_TOPICS = 15
# Length bounds applied to the subtopic and concept containers of the models.
MIN_SUBTOPICS = 3
MAX_SUBTOPICS = 5
MIN_CONCEPTS = 2
MAX_CONCEPTS = 5

In [5]:
def dict_to_ordereddict(d: dict | OrderedDict) -> OrderedDict:
    return OrderedDict(d)


# A subtopic as extracted by the LLM: a name plus a flat list of concept names.
class CourseSubtopic(BaseModel):
    name: str
    # The text must be passed as `description=`: the first positional argument of
    # `Field` is the default value, which would turn this into an optional field
    # with a string default and no description in the generated schema.
    concepts: list[str] = Field(
        description=f"{MIN_CONCEPTS}-{MAX_CONCEPTS} concepts covered in the subtopic.",
        min_length=MIN_CONCEPTS,
        max_length=MAX_CONCEPTS,
    )


# A topic as extracted by the LLM: a name plus an ordered list of subtopics.
class CourseTopic(BaseModel):
    name: str
    # The text must be passed as `description=`: the first positional argument of
    # `Field` is the default value, which would turn this into an optional field
    # with a string default and no description in the generated schema.
    subtopics: list[CourseSubtopic] = Field(
        description=f"{MIN_SUBTOPICS}-{MAX_SUBTOPICS} ordered subtopics with concepts.",
        min_length=MIN_SUBTOPICS,
        max_length=MAX_SUBTOPICS,
    )


# A concept name together with the IDs of the questions attached to it.
class ConceptWithQuestionIDs(BaseModel):
    concept: str
    # Defaults to a fresh empty list; the Optional annotation also admits an explicit None.
    question_ids: Optional[list[str]] = Field(default_factory=list)


# A named subtopic whose concepts are keyed by concept name.
class Subtopic(BaseModel):
    name: str
    # Validated as a dict and then converted to an OrderedDict by the AfterValidator.
    # No default is declared, so `concepts` must be supplied when validating.
    concepts: Annotated[
        dict[str, ConceptWithQuestionIDs], AfterValidator(dict_to_ordereddict)
    ] = Field(
        description=f"{MIN_CONCEPTS}-{MAX_CONCEPTS} ordered concepts with question IDs.",
        min_length=MIN_CONCEPTS,
        max_length=MAX_CONCEPTS,
    )


# A named topic whose subtopics are keyed by subtopic name.
class Topic(BaseModel):
    name: str
    # Validated as a dict and then converted to an OrderedDict by the AfterValidator.
    subtopics: Annotated[dict[str, Subtopic], AfterValidator(dict_to_ordereddict)] = (
        Field(
            description=f"{MIN_SUBTOPICS}-{MAX_SUBTOPICS} ordered subtopics with concepts.",
            min_length=MIN_SUBTOPICS,
            max_length=MAX_SUBTOPICS,
        )
    )

    @classmethod
    def from_course_topic(cls, course_topic: CourseTopic) -> "Topic":
        """Convert a list-based ``CourseTopic`` into the name-keyed ``Topic`` form.

        Every concept starts out with the default (empty) list of question IDs.
        """
        subtopics_by_name: dict[str, Subtopic] = {}
        for course_subtopic in course_topic.subtopics:
            concepts_by_name: dict[str, ConceptWithQuestionIDs] = {}
            for concept_name in course_subtopic.concepts:
                concepts_by_name[concept_name] = ConceptWithQuestionIDs(
                    concept=concept_name
                )
            subtopics_by_name[course_subtopic.name] = Subtopic(
                name=course_subtopic.name, concepts=concepts_by_name
            )
        return cls(name=course_topic.name, subtopics=subtopics_by_name)


# The full course outline: topics keyed by topic name.
class Topics(BaseModel):
    topics: Annotated[dict[str, Topic], AfterValidator(dict_to_ordereddict)] = Field(
        description=f"{MIN_TOPICS}-{MAX_TOPICS} ordered topics with subtopics.",
        # min_length=MIN_TOPICS,
        # max_length=MAX_TOPICS,
    )

    @model_validator(mode="after")  # type: ignore
    def create_groups(self) -> "Topics":
        """Number every (topic, subtopic, concept) triple and store the result on ``_groups``."""
        triples = (
            (topic_name, subtopic_name, concept_name)
            for topic_name, topic in self.topics.items()
            for subtopic_name, subtopic in topic.subtopics.items()
            for concept_name in subtopic.concepts.keys()
        )
        # Group ids are assigned sequentially from 0 in iteration order.
        self._groups = {
            group_id: {
                "id": group_id,
                "topic": topic_name,
                "subtopic": subtopic_name,
                "concept": concept_name,
            }
            for group_id, (topic_name, subtopic_name, concept_name) in enumerate(
                triples
            )
        }
        return self

In [6]:
# Hand-written CourseTopic used to try out Topic.from_course_topic in the next cell.
course_topic = CourseTopic(
    name="Counting and Probability",
    subtopics=[
        CourseSubtopic(name="Counting", concepts=["Permutations", "Combinations"]),
        CourseSubtopic(
            name="Probability", concepts=["Conditional Probability", "Bayes' Theorem"]
        ),
        CourseSubtopic(
            name="Random Variables", concepts=["Expected Value", "Variance"]
        ),
    ],
)

In [7]:
Topic.from_course_topic(course_topic)

Topic(name='Counting and Probability', subtopics=OrderedDict([('Counting', Subtopic(name='Counting', concepts=OrderedDict([('Permutations', ConceptWithQuestionIDs(concept='Permutations', question_ids=[])), ('Combinations', ConceptWithQuestionIDs(concept='Combinations', question_ids=[]))]))), ('Probability', Subtopic(name='Probability', concepts=OrderedDict([('Conditional Probability', ConceptWithQuestionIDs(concept='Conditional Probability', question_ids=[])), ("Bayes' Theorem", ConceptWithQuestionIDs(concept="Bayes' Theorem", question_ids=[]))]))), ('Random Variables', Subtopic(name='Random Variables', concepts=OrderedDict([('Expected Value', ConceptWithQuestionIDs(concept='Expected Value', question_ids=[])), ('Variance', ConceptWithQuestionIDs(concept='Variance', question_ids=[]))])))]))

In [8]:
@traceable(name="102_topics")
def create_topic(
    text: str, model: ModelName = MODEL, attempts: int = ATTEMPTS
) -> Topic | None:
    """Ask an LLM to extract a topic outline from ``text``.

    The OpenAI client is used when the model name contains "gpt", the Anthropic
    client otherwise. Any exception raised while requesting or converting the
    response is printed and ``None`` is returned instead.
    """
    sys_message = deindent(
        f"""
            You are a world class math course instructor. Extract the topic, subtopics, and concepts. Feel free to use your knowledge of the subject.
            {MIN_SUBTOPICS}-{MAX_SUBTOPICS} subtopics and {MIN_CONCEPTS}-{MAX_CONCEPTS} concepts each.
            """
    )

    # Both message lists are built up front, outside the try block.
    oai_messages = [system_message(sys_message), user_message(text)]
    cld_messages = [user_message(text)]
    request_kwargs = dict(
        model=model,
        max_retries=attempts,
        max_tokens=MAX_TOKENS,
        response_model=CourseTopic,
    )
    try:
        if "gpt" in model.lower():
            extracted = ask_oai.create(messages=oai_messages, **request_kwargs)  # type: ignore
        else:
            extracted = ask_cld.create(
                system=sys_message,
                messages=cld_messages,  # type: ignore
                **request_kwargs,  # type: ignore[all]
            )
        return Topic.from_course_topic(extracted)
    except Exception as e:
        print(e)
        return None

In [9]:
# Raw course outline text; a later cell feeds it line by line to create_topic.
math_102_text = Path("math_102.txt").read_text()
print(math_102_text)

Set, its different representations and types of sets
Complex numbers their addition, subtraction, multiplication and division and Modulus of a complex number
Mapping and their types, function and their types, composite and inverse of a function, addition, subtraction, multiplication and division of functions
Quadratic functions and quadratic formula with the types of solutions
Matrices with their addition, subtraction  and multiplication , finding the inverse of a matrix using augmented matrix and co-factors. Application of matrices in solving system of linear equations, Crammer rule Determinants
Arithmetic and Geometric Sequence with their nth terms and Series sum.
Permutations and Combinations with their applications on some real life scenarios
Binomial theorem and its applications in generalizing the formulas for higher powers 
In coordinate geometry find  the distance between two points, the slope of a line,condition for parallel and perpendicular lines,
Circle and equation of circ

In [10]:
# One create_topic call per line of the outline; entries are None where
# create_topic caught an exception.
created_topics = [
    create_topic(topic, model=ModelName.HAIKU, attempts=2)
    for topic in math_102_text.splitlines()
]

In [None]:
# Key the successfully created topics by name; falsy entries (failed extractions) are skipped.
topics = Topics(
    topics={
        topic.name: Topic(**topic.model_dump()) for topic in created_topics if topic
    }
)

In [None]:
def validate_word_count(text: str, word_count: int = 3, text_name: str = "Text") -> str:
    """Return ``text`` unchanged if it has at least ``word_count`` whitespace-separated words.

    Raises:
        ValueError: if ``text`` contains fewer than ``word_count`` words; the
            message is prefixed with ``text_name``.
    """
    words = text.split()
    if len(words) >= word_count:
        return text
    raise ValueError(f"{text_name} should be at least {word_count} words long")


def validate_topic_subtopic(cls, info: "ValidationInfo"):
    """Check that ``cls.topic`` / ``cls.subtopic`` exist in the topics from the validation context.

    Args:
        cls: the model instance being validated (despite the name, not a class);
            it must expose ``topic`` and ``subtopic`` attributes.
        info: pydantic validation info; ``info.context["topics"]`` is expected to
            expose a ``topics`` mapping of topic name to an object with a
            ``subtopics`` mapping.

    Returns:
        ``cls`` unchanged. The check is skipped when there is no context or the
        context carries no ``"topics"`` entry.

    Raises:
        ValueError: if the topic, or the subtopic within that topic, is unknown.
    """
    if not info.context:
        return cls
    topics = info.context.get("topics")
    # A context may carry other keys only (e.g. "main_id"); without topics there
    # is nothing to validate against, so skip instead of failing on None.
    if topics is None:
        return cls
    topic = topics.topics.get(cls.topic)
    if not topic:
        raise ValueError(f"Topic {cls.topic} not found in topics")
    if not topic.subtopics.get(cls.subtopic):
        raise ValueError(f"Subtopic {cls.subtopic} not found in {cls.topic}")
    return cls


# A question identified by `id`, with its problem statement and solution.
class Question(BaseModel):
    id: str
    problem: str
    solution: str

    @field_validator("problem")
    @classmethod
    def validate_problem(cls, problem: str, info: ValidationInfo) -> str:
        """Reject a problem identical to the context's ``main_problem``, if one is given."""
        context = info.context or {}
        main_problem = context.get("main_problem")  # type: ignore
        if main_problem is not None and problem == main_problem:
            raise ValueError("Subquestions should be different from the main question.")
        return problem


# A (topic, subtopic) assignment; used as the response model when classifying a question.
class TopicSubtopic(BaseModel):
    topic: str
    subtopic: str

    @model_validator(mode="after")  # type: ignore
    def validate_topic_subtopic(self, info: ValidationInfo) -> "TopicSubtopic":
        """Delegate to the module-level ``validate_topic_subtopic`` helper."""
        return validate_topic_subtopic(self, info)


# A Question extended with the topic and subtopic it belongs to.
class QuestionWithTopicSubtopic(Question):
    topic: str
    subtopic: str

    @model_validator(mode="after")  # type: ignore
    def validate_topic_subtopic(
        self, info: ValidationInfo
    ) -> "QuestionWithTopicSubtopic":
        """Delegate to the module-level ``validate_topic_subtopic`` helper."""
        return validate_topic_subtopic(self, info)


# Response model holding the subquestions generated for one main question.
class SubQuestionsWithTopicSubtopic(BaseModel):
    subquestions: list[QuestionWithTopicSubtopic] = Field(
        ..., min_length=3, max_length=4
    )

    @field_validator("subquestions")
    @classmethod
    def set_ids(
        cls, subquestions: list[QuestionWithTopicSubtopic], info: ValidationInfo
    ) -> list[QuestionWithTopicSubtopic]:
        """Renumber the subquestions as ``<main_id>_<position>`` when a context is given.

        ``main_id`` comes from the validation context and falls back to a fresh
        UUID; positions start at 1. Without a context the list is returned untouched.
        """
        context = info.context
        if context:
            main_id = context.get("main_id", str(uuid4()))  # type: ignore
            for position, subquestion in enumerate(subquestions, start=1):
                subquestion.id = f"{main_id}_{position}"
        return subquestions


# A classified main question together with its classified subquestions.
class QuestionWithTopicSubtopicAndSubquestions(QuestionWithTopicSubtopic):
    subquestions: list[QuestionWithTopicSubtopic]


# Response model for a single concept name.
class Concept(BaseModel):
    # The name must contain at least CONCEPT_WORD_COUNT words; shorter values
    # raise a ValueError from validate_word_count.
    concept: Annotated[
        str,
        AfterValidator(
            partial(
                validate_word_count, word_count=CONCEPT_WORD_COUNT, text_name="Concept"
            )
        ),
    ]


class QuestionWithConcept(QuestionWithTopicSubtopic):
    concept: str

    @model_validator(mode="after")  # type: ignore
    def find_duplicate(self, info: ValidationInfo) -> "QuestionWithConcept":
        if not info.context:
            return self
        topics: Topics = info.context.get("topics")  # type: ignore
        questions: dict[str, QuestionWithConcept] = info.context.get("questions")  # type: ignore
        if topics is None or questions is None:
            return self
        concepts = topics.topics[self.topic].subtopics[self.subtopic].concepts
        concept = concepts.get(
            self.concept, ConceptWithQuestionIDs(concept=self.concept)
        )
        concept_questions = "\n\n".join(
            [
                questions[id].model_dump_json(include={"id", "problem", "solution"})
                for id in concept.question_ids
            ]
        )
        if not concept.question_ids:
            return self
        new_id = "NEW"
        response_model = Literal[*concept.question_ids, new_id]  # type: ignore
        questions_message = user_message(f"EXISTING QUESTIONS\n\n{concept_questions}")
        sys_message = system_message(
            deindent(
                f"""
                You are a world class validation model, your job is to check if a question is similar to any of the questions in the list.
                If it's similar to any of the questions return the ID of that question.
                If it's not similar to any of the questions return {new_id}.
                By similar we mean that both questions are asking the same thing but in different ways.
                So there would be no point in having both questions in the same set.
                """
            )
        )
        question_message = user_message(f"\n\nQUESTION\n\n{self.problem}")
        messages = [sys_message, questions_message, question_message]
        try:
            unique_res = ask_oai.create(
                messages=messages,  # type: ignore
                model=MODEL,
                response_model=response_model,
                max_retries=ATTEMPTS,
            )
            print(messages)
            if unique_res == new_id:
                return self
            sim_question = questions[unique_res]
            return sim_question.model_copy()
        except Exception as e:
            print(e)
            return self


# A classified main question (no concept yet) whose subquestions already carry concepts.
class QuestionWithTopicSubtopicAndConceptSubquestions(QuestionWithTopicSubtopic):
    subquestions: list[QuestionWithConcept]


# A main question with its own concept and concept-tagged subquestions.
class QuestionWithConceptAndSubquestions(QuestionWithConcept):
    subquestions: list[QuestionWithConcept]


# Questions keyed by ID; defaults to an empty OrderedDict.
class Questions(BaseModel):
    questions: Annotated[dict[str, Question], AfterValidator(dict_to_ordereddict)] = (
        Field(default_factory=OrderedDict)
    )

In [None]:
# Hand-written sample questions keyed by id, all under the same topic, subtopic and concept.
# No validation context is passed here, so the context-dependent validators return early.
questions = {
    "1": QuestionWithConcept(
        id="1",
        problem="What is 2+2?",
        solution="4",
        topic="Math",
        subtopic="Operations",
        concept="Single digits",
    ),
    "2": QuestionWithConcept(
        id="2",
        problem="What is 9-3?",
        solution="6",
        topic="Math",
        subtopic="Operations",
        concept="Single digits",
    ),
    "3": QuestionWithConcept(
        id="3",
        problem="What is 3*3?",
        solution="9",
        topic="Math",
        subtopic="Operations",
        concept="Single digits",
    ),
    # "4": QuestionWithConcept(
    #     id="4",
    #     problem="What is 8/2?",
    #     solution="4",
    #     topic="Math",
    #     subtopic="Operations",
    #     concept="Single digits",
    # ),
}

# Sample topic tree matching the sample questions above.
# NOTE(review): Subtopic.concepts is declared without a default and with
# min_length=MIN_CONCEPTS, and Topic.subtopics with min_length=MIN_SUBTOPICS.
# With `concepts` commented out and a single subtopic, this cell presumably
# fails validation as written - confirm and either restore the concepts or
# build the sample with model_construct.
topics = Topics(
    topics=OrderedDict(
        {
            "Math": Topic(
                name="Math",
                subtopics=dict(
                    {
                        "Operations": Subtopic(
                            name="Operations",
                            # concepts=dict(
                            #     {
                            #         "Single digits": ConceptWithQuestionIDs(
                            #             concept="Single digits",
                            #             question_ids=["1", "2", "3"],
                            #         )
                            #     }
                            # ),
                        )
                    }
                ),
            )
        }
    )
)

In [None]:
def topics_str(topics: Topics) -> str:
    """Render every topic as indented JSON under a ``TOPICS:`` heading."""
    dumped_topics = [
        topic.model_dump_json(indent=2) for topic in topics.topics.values()
    ]
    # Individual topic dumps are separated by one blank line.
    topic_dumps = "\n\n".join(dumped_topics)
    return deindent(
        f"""
TOPICS:

{topic_dumps}
"""
    )

In [None]:
print(topics_str(topics))

In [None]:
# NOTE(review): this redefines topics_str, which an earlier cell already defines
# with the same body; one of the two definitions can be dropped.
def topics_str(topics: Topics) -> str:
    """Render every topic as indented JSON under a ``TOPICS:`` heading."""
    dumped_topics = [
        topic.model_dump_json(indent=2) for topic in topics.topics.values()
    ]
    # Individual topic dumps are separated by one blank line.
    topic_dumps = "\n\n".join(dumped_topics)
    return deindent(
        f"""
TOPICS:

{topic_dumps}
"""
    )


def topic_subtopic_message(topics: Topics) -> str:
    """Build the system prompt asking for a 'topic' and 'subtopic' for a question.

    The rendered topic tree from ``topics_str(topics)`` is appended to the instructions.
    """
    return deindent(
        f"""
You are a world class math course instructor.
You will be given a question with a 'problem' and a 'solution'.
Given these topics and subtopics below, assign a 'topic' and a 'subtopic' to the question.
The 'subtopic' must be one of the subtopics of the 'topic'.

{topics_str(topics)}
    """
    )


def subqs_message(topics: Topics, book_pages: str = "") -> str:
    """Build the system prompt for splitting a main question into subquestions.

    The prompt is the fixed instructions, then (only when ``book_pages`` is
    non-empty) a reference section with those pages, then the rendered topics.
    """
    instructions = deindent(
        """
You are a world class math course instructor.
You will be given a question with a 'problem', a 'solution', a 'topic', and a 'subtopic'.
Based on the main question's problem and solution, break the question down into 3-4 smaller subquestions.
Answering these questions in sequence should lead to the solution of the main question.
So the subquestions are basically the steps to solve the main question.
And if a student can solve the main question, we can assume that they have learned the underlying concepts of the subquestions.
No 2 subquestions can have the same concept.
For each subquestion:
    1. Don't repeat the main question's 'problem' or 'solution'.
    2. Define the 'problem'. Keep the wording general and avoid using specific numbers. For example, if the main question has a formula, use variables instead of numbers.
    3. Give a detailed 'solution'. This should be specifically for the subquestion and its variables.
    4. Given these topics below, assign a 'topic' and a 'subtopic' to the subquestion.
       The 'subtopic' must be one of the subtopics of the 'topic'.
"""
    )

    sections = [instructions]
    if book_pages:
        sections.append(
            f"\nYou can use these book pages for reference:\n\n{book_pages}"
        )
    sections.append(f"\n\n{topics_str(topics)}")
    return "".join(sections)


def concepts_str(concepts: list[str]) -> str:
    """Render ``concepts`` as a bulleted list preceded by a usage hint."""
    bullet_lines = [f"- {concept}" for concept in concepts]
    concepts_list = "\n".join(bullet_lines)
    return deindent(
        f"""
You can use a concept from the list below or come up with a new one if needed.

CONCEPTS:

{concepts_list}
"""
    )


def question_w_concept_message(concepts: list[str]) -> str:
    """Build the system prompt asking for a 'concept' for a single question.

    The list of existing concepts is appended only when ``concepts`` is non-empty.
    """
    instructions = deindent(
        f"""
You are a world class math course instructor.
You will be given a question with a 'problem', a 'solution', a 'topic', and a 'subtopic'.
Assign a 'concept' to the question. The 'concept' should have at least {CONCEPT_WORD_COUNT} words.
Solving this question should help students understand this concept.
"""
    )
    if not concepts:
        return instructions
    return instructions + concepts_str(concepts)


def question_w_subqs_concept_message(concepts: list[str]) -> str:
    """Build the system prompt asking for the main question's 'concept' given its subquestions.

    The list of existing concepts is appended only when ``concepts`` is non-empty.
    """
    instructions = deindent(
        f"""
You are a world class math course instructor.
You will be given a question with a 'problem', a 'solution', a 'topic', and a 'subtopic'.
It will also have 3-5 subquestions. Each subquestion will have a 'problem', a 'solution', a 'topic', a 'subtopic', and a 'concept'.
Based on the subquestions, assign a 'concept' to the main question. The 'concept' should have at least {CONCEPT_WORD_COUNT} words.
Try not to repeat the concepts of the subquestions. Because the subquestions are the steps to solve the main question.
Solving this question should help students understand this concept.
"""
    )
    if not concepts:
        return instructions
    return instructions + concepts_str(concepts)

In [None]:
def create_question_w_topic_subtopic(
    question: Question, topics: Topics, model: ModelName = MODEL, attempts: int = 1
) -> QuestionWithTopicSubtopic:
    """Ask an LLM to assign a topic and subtopic to ``question``.

    The Anthropic client is used for HAIKU/SONNET/OPUS, the OpenAI client
    otherwise. ``topics`` is passed as validation context for the response. The
    result is assembled with ``model_construct``, i.e. without re-validating.
    """
    system_prompt = topic_subtopic_message(topics)
    user_prompt = deindent(f"Question:\n\n{question.model_dump_json(indent=2)}")
    request_kwargs = dict(
        max_tokens=MAX_TOKENS,
        model=model,
        response_model=TopicSubtopic,
        max_retries=attempts,
        validation_context={"topics": topics},
    )
    uses_claude = model in [ModelName.HAIKU, ModelName.SONNET, ModelName.OPUS]
    if uses_claude:
        assignment = ask_cld.create(
            system=system_prompt,
            messages=[user_message(content=user_prompt)],  # type: ignore
            **request_kwargs,  # type: ignore
        )
    else:
        assignment = ask_oai.create(
            messages=[
                system_message(system_prompt),
                user_message(content=user_prompt),  # type: ignore
            ],
            **request_kwargs,  # type: ignore[all]
        )
    return QuestionWithTopicSubtopic.model_construct(
        **question.model_dump(), **assignment.model_dump()
    )


def create_subquestions_w_topic_subtopic(
    question_w_topic_subtopic: QuestionWithTopicSubtopic,
    topics: Topics,
    model: ModelName = MODEL,
    attempts: int = 1,
    pdf_collection: ChromaCollection | None = None,
    n_results: int = 3,
    n_next_links: int = 2,
    n_prev_links: int = 2,
) -> list[QuestionWithTopicSubtopic]:
    """Ask an LLM to break a classified question into classified subquestions.

    When ``pdf_collection`` is given, it is queried with the question (minus its
    id) and the returned documents are added to the prompt as reference pages.
    The validation context carries the topics plus the main question's id and
    problem, which the response validators read.
    """
    book_pages = ""
    if pdf_collection is not None:
        query_text = question_w_topic_subtopic.model_dump_json(exclude={"id"}, indent=2)
        question_res, _ = query_collection(
            query_text=query_text,
            collection=pdf_collection,
            n_results=n_results,
            n_next_links=n_next_links,
            n_prev_links=n_prev_links,
        )
        # One text block per result, documents within a result joined by newlines.
        page_texts = ["\n".join(res["documents"]) for res in question_res]  # type: ignore
        book_pages = "\n\n".join(page_texts)
    user_prompt = deindent(
        f"Question with Topic and Subtopic:\n\n{question_w_topic_subtopic.model_dump_json(indent=2)}"  # type: ignore
    )
    sys_message = subqs_message(topics=topics, book_pages=book_pages)
    request_kwargs = dict(
        max_tokens=MAX_TOKENS,
        model=model,
        max_retries=attempts,
        response_model=SubQuestionsWithTopicSubtopic,
        validation_context={
            "topics": topics,
            "main_id": question_w_topic_subtopic.id,
            "main_problem": question_w_topic_subtopic.problem,
        },
    )
    if model in [ModelName.HAIKU, ModelName.SONNET, ModelName.OPUS]:
        response = ask_cld.create(
            system=sys_message,
            messages=[user_message(content=user_prompt)],  # type: ignore
            **request_kwargs,  # type: ignore
        )
    else:
        response = ask_oai.create(
            messages=[
                system_message(sys_message),
                user_message(content=user_prompt),  # type: ignore
            ],
            **request_kwargs,  # type: ignore
        )
    return response.subquestions


# def get_topic_subtopic_from_question(
#     question: QuestionWithTopicSubtopic, topics: Topics
# ) -> tuple[Topic | None, Subtopic | None, int | None, int | None]:
#     topic_filter = [
#         (i, topic) for i, topic in enumerate(topics) if topic.name == question.topic
#     ]
#     if len(topic_filter) == 0:
#         return None, None, None, None
#     topic_id, topic = topic_filter[0]
#     subtopic_filter = [
#         (i, subtopic)
#         for i, subtopic in enumerate(topic.subtopics)
#         if subtopic.name == question.subtopic
#     ]
#     if len(subtopic_filter) == 0:
#         return topic, None, topic_id, None
#     subtopic_id, subtopic = subtopic_filter[0]
#     return topic, subtopic, topic_id, subtopic_id


def create_subquestions_concepts(
    subquestions: list[QuestionWithTopicSubtopic],
    topics: Topics,
    model: ModelName = MODEL,
    attempts: int = 1,
) -> tuple[list[QuestionWithConcept], Topics]:
    """Ask an LLM for a concept for each subquestion and register it in ``topics``.

    For every subquestion the concepts already known for its subtopic are offered
    to the model. The returned concept is recorded under that subtopic together
    with the subquestion's id (a new entry is created when the concept is new).
    ``topics`` is mutated in place and also returned.

    Subquestions whose concept request still fails after the retries are logged
    and left out of the returned list.

    Returns:
        The subquestions that received a concept (built with ``model_construct``)
        and the updated ``topics``.
    """
    uses_claude = model in [ModelName.HAIKU, ModelName.SONNET, ModelName.OPUS]
    subquestions_w_concepts: list[QuestionWithConcept] = []
    for subquestion_idx, subquestion in enumerate(subquestions):
        subtopic_concepts = (
            topics.topics[subquestion.topic].subtopics[subquestion.subtopic].concepts
        )
        concept_prompt = question_w_concept_message(
            concepts=list(subtopic_concepts.keys())
        )
        subquestion_message = deindent(
            f"Question:\n\n{subquestion.model_dump_json(indent=2)}"
        )
        try:
            for attempt in Retrying(
                wait=wait_random_exponential(min=30, max=60), stop=stop_after_attempt(3)
            ):
                with attempt:
                    ask_kwargs = dict(
                        max_tokens=MAX_TOKENS,
                        model=model,
                        max_retries=attempts,
                        validation_context={"topics": topics},
                    )
                    if uses_claude:
                        subquestion_concept = ask_cld.create(
                            system=concept_prompt,
                            messages=[user_message(content=subquestion_message)],  # type: ignore
                            response_model=Concept,
                            **ask_kwargs,  # type: ignore
                        ).concept
                    else:
                        subquestion_concept = ask_oai.create(
                            messages=[
                                system_message(concept_prompt),
                                user_message(content=subquestion_message),  # type: ignore
                            ],
                            response_model=Concept,
                            **ask_kwargs,  # type: ignore
                        ).concept
        except RetryError as e:
            print(f"Failed to generate concept for subquestion: {subquestion_idx}: {e}")
            continue
        existing = subtopic_concepts.get(subquestion_concept)
        if existing is None:
            subtopic_concepts[subquestion_concept] = ConceptWithQuestionIDs(
                concept=subquestion_concept, question_ids=[subquestion.id]
            )
        else:
            # Also record the id under an already-known concept, so the concept's
            # question list reflects every question assigned to it.
            if existing.question_ids is None:
                existing.question_ids = []
            existing.question_ids.append(subquestion.id)
        # Pair the concept with its own subquestion right here: zipping two
        # separate lists afterwards misaligns them as soon as one request fails.
        subquestions_w_concepts.append(
            QuestionWithConcept.model_construct(
                **subquestion.model_dump(), concept=subquestion_concept
            )
        )
    return subquestions_w_concepts, topics


def construct_question_w_topic_subtopic_and_subquestions_w_concepts(
    question_w_topic_subtopic: QuestionWithTopicSubtopic,
    subquestions_w_concepts: list[QuestionWithConcept],
) -> QuestionWithTopicSubtopicAndConceptSubquestions:
    """Attach concept-tagged subquestions to a classified main question.

    The result is built with ``model_construct``, i.e. without running validation.
    """
    main_fields = question_w_topic_subtopic.model_dump()
    combined_cls = QuestionWithTopicSubtopicAndConceptSubquestions
    return combined_cls.model_construct(
        **main_fields, subquestions=subquestions_w_concepts
    )


def create_question_w_concept_and_subquestions(
    question: QuestionWithTopicSubtopicAndConceptSubquestions,
    topics: Topics,
    model: ModelName = MODEL,
    attempts: int = 1,
) -> QuestionWithConceptAndSubquestions:
    """Ask an LLM for the main question's concept, given its concept-tagged subquestions.

    The subtopic is looked up directly in the name-keyed ``topics`` structure
    (the same access pattern ``create_subquestions_concepts`` uses), and the
    concepts already known for it are offered to the model.

    Raises:
        ValueError: if the question's topic or subtopic is not present in ``topics``.
    """
    topic = topics.topics.get(question.topic)
    subtopic = topic.subtopics.get(question.subtopic) if topic is not None else None
    if subtopic is None:
        raise ValueError("Subtopic not found in topics")
    # Concepts are keyed by name, so the keys are the concept names.
    subtopic_concepts = list(subtopic.concepts.keys())
    concept_prompt = question_w_subqs_concept_message(concepts=subtopic_concepts)
    question_message = deindent(
        f"Question with Subquestions:\n\n{question.model_dump_json(indent=2)}"
    )
    ask_kwargs = dict(
        max_tokens=MAX_TOKENS,
        model=model,
        max_retries=attempts,
        validation_context={"topics": topics},
    )
    if model in [ModelName.HAIKU, ModelName.SONNET, ModelName.OPUS]:
        question_concept = ask_cld.create(
            system=concept_prompt,
            messages=[user_message(content=question_message)],  # type: ignore
            response_model=Concept,
            **ask_kwargs,  # type: ignore
        ).concept
    else:
        question_concept = ask_oai.create(
            messages=[
                system_message(concept_prompt),
                user_message(content=question_message),  # type: ignore
            ],
            response_model=Concept,
            **ask_kwargs,  # type: ignore
        ).concept
    return QuestionWithConceptAndSubquestions.model_construct(
        **question.model_dump(exclude={"subquestions"}),
        concept=question_concept,
        subquestions=question.subquestions,
    )


def update_topic_subtopic_concepts(
    question: QuestionWithConceptAndSubquestions, topics: Topics
) -> Topics:
    """Register ``question`` under its concept in the matching subtopic of ``topics``.

    A new concept entry is created when the concept is not known yet; otherwise
    the question id is appended to the existing entry. ``topics`` is mutated in
    place and returned; it is returned unchanged when the question's topic or
    subtopic cannot be found.
    """
    topic = topics.topics.get(question.topic)
    subtopic = topic.subtopics.get(question.subtopic) if topic is not None else None
    if subtopic is None:
        return topics
    existing = subtopic.concepts.get(question.concept)
    if existing is None:
        subtopic.concepts[question.concept] = ConceptWithQuestionIDs(
            concept=question.concept,
            question_ids=[question.id],
        )
    else:
        # question_ids is Optional, so make sure there is a list to append to.
        if existing.question_ids is None:
            existing.question_ids = []
        existing.question_ids.append(question.id)
    return topics

In [None]:
# questions_dir = Path("math_102_final_questions")
# for file in questions_dir.glob("*.json"):
#     question = QuestionWithConceptAndSubquestions.model_construct(
#         **json.loads(file.read_text())
#     )
#     topic, subtopic, topic_id, subtopic_id = get_topic_subtopic_from_question(
#         question=question, topics=topics
#     )
#     if subtopic is None:
#         continue
#     subtopic_concepts = [c.concept for c in subtopic.concepts]
#     if question.concept not in subtopic_concepts:
#         subtopic.concepts.append(
#             ConceptWithQuestionIDs(
#                 concept=question.concept,
#                 question_ids=[question.id],
#             )
#         )
#     topics[topic_id].subtopics[subtopic_id] = subtopic  # type: ignore

In [None]:
# with open("math_102_app_topics.json", "w") as f:
#     json.dump([topic.model_dump() for topic in topics], f, indent=2)

In [None]:
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
pdf_file = "/media/hamza/data2/algebra.pdf"
collection_name = "algebra_collection"

In [None]:
# pdf_collection = pdf_to_collection(
#     pdf_file,
#     collection_name=collection_name,
#     chunk_size=4000,
#     chunk_overlap=200,
#     device="cuda",
#     delete_existing=True,
# )

In [None]:
# Open the collection by name without deleting it (presumably the one built by the
# commented-out pdf_to_collection cell - confirm) and display its count.
pdf_collection = chroma_collection(name=collection_name, delete_existing=False)
pdf_collection.count()

In [None]:
# Load the topic outline (a list of {"name", "subtopics": [subtopic names]}) into
# the name-keyed Topics structure that the pipeline functions read via `topics.topics`.
topics_data = json.loads(Path("math_102_topics.json").read_text())
# model_construct is used on purpose: the subtopics start without any concepts
# (they are added while questions are processed), which the validated
# constructors would reject because of the minimum-length constraints.
topics = Topics.model_construct(
    topics=OrderedDict(
        (
            topic["name"],
            Topic.model_construct(
                name=topic["name"],
                subtopics=OrderedDict(
                    (
                        subtopic,
                        Subtopic.model_construct(
                            name=subtopic, concepts=OrderedDict()
                        ),
                    )
                    for subtopic in topic["subtopics"]
                ),
            ),
        )
        for topic in topics_data
    )
)
# Show the first topic as a quick sanity check.
next(iter(topics.topics.values())).model_dump()

In [None]:
questions_dir = Path("math_102_questions")
# Directory listings come back in arbitrary order, so sort both levels to make
# the QUESTIONS_PER_FOLDER selection reproducible; non-directory entries are skipped.
questions = [
    Question.model_construct(**json.loads(question_file.read_text()))
    for folder in sorted(questions_dir.iterdir())
    if folder.is_dir()
    for question_file in sorted(folder.glob("*.json"))[:QUESTIONS_PER_FOLDER]
]

In [None]:
# Run the full pipeline for every question that has no output file yet, saving
# the question, the id->question map and the updated topics after each success.
final_dir = Path("math_102_final_questions")
os.makedirs(final_dir, exist_ok=True)
final_topics = Path("math_102_final_topics.json")
id_to_questions_file = Path("math_102_id_to_questions.json")
id_to_questions = {}
missed_questions = []
# IDs finished by earlier runs; read from disk once and kept in sync below
# instead of re-listing the directory for every question.
done_ids = {int(q.stem) for q in final_dir.glob("*.json")}
for i, question in enumerate(questions):
    if int(question.id) in done_ids:
        continue
    try:
        question = create_question_w_topic_subtopic(
            question=question, topics=topics, attempts=ATTEMPTS
        )
        subquestions = create_subquestions_w_topic_subtopic(
            question_w_topic_subtopic=question,
            topics=topics,
            attempts=ATTEMPTS,
            pdf_collection=pdf_collection,
        )
        subquestions, topics = create_subquestions_concepts(
            subquestions=subquestions, topics=topics, attempts=ATTEMPTS
        )
        question = construct_question_w_topic_subtopic_and_subquestions_w_concepts(
            question_w_topic_subtopic=question, subquestions_w_concepts=subquestions
        )
        question = create_question_w_concept_and_subquestions(
            question=question, topics=topics, attempts=ATTEMPTS
        )
        topics = update_topic_subtopic_concepts(question=question, topics=topics)
        dest = final_dir / f"{question.id}.json"
        id_to_questions[question.id] = question.model_dump()
        with open(id_to_questions_file, "w") as f:
            json.dump(id_to_questions, f, indent=2)
        with open(dest, "w") as f:
            json.dump(question.model_dump(), f, indent=2)
        # `topics` is a Topics model in the pipeline; iterating the model itself
        # would yield field tuples, so dump its Topic values. A plain list of
        # Topic objects is still accepted.
        topic_models = topics.topics.values() if isinstance(topics, Topics) else topics
        with open(final_topics, "w") as f:
            json.dump(
                [topic.model_dump() for topic in topic_models],
                f,
                indent=2,
            )
        done_ids.add(int(question.id))
    except Exception as e:
        # Best effort: remember the failure and carry on with the next question.
        missed_questions.append([i, e])
        print(f"Failed to generate question: {i}: {e}")

In [None]:
def _values(container):
    """Return the values of a dict, or the container itself when it is not a dict."""
    return container.values() if isinstance(container, dict) else container


# Read via Path so the file is closed again. Subtopics and concepts may be stored
# either as lists or as name-keyed dicts, so both shapes are accepted.
final_topics_data = json.loads(Path("math_102_final_topics.json").read_text())
# topic name -> subtopic name -> concept name -> question ids
topics2 = OrderedDict(
    (
        topic["name"],
        OrderedDict(
            (
                subtopic["name"],
                OrderedDict(
                    (concept["concept"], concept["question_ids"])
                    for concept in _values(subtopic["concepts"])
                ),
            )
            for subtopic in _values(topic["subtopics"])
        ),
    )
    for topic in _values(final_topics_data)
)

# question id (as string) -> question record, one per saved question file
questions2 = {
    str(q["id"]): q
    for q in (
        json.loads(f.read_text())
        for f in Path("math_102_final_questions").glob("*.json")
    )
}

In [None]:
# Write the flattened topics and the id-keyed questions as indented JSON files.
with open("math_102_app_topics.json", "w") as f:
    json.dump(topics2, f, indent=2)

with open("math_102_app_questions.json", "w") as f:
    json.dump(questions2, f, indent=2)