In [None]:
import instructor
import llama_cpp

# from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
from pydantic import BaseModel

# Settings for the local GGUF model, kept in one place so they are easy to tune.
# NOTE(review): the file name suggests a Hermes 2 Pro fine-tune while
# chat_format is "llama-3" — confirm this matches the model's prompt template.
LLAMA_SETTINGS = {
    "model_path": "../models/Hermes-2-Pro-Llama-3-8B-Q4.gguf",
    "n_gpu_layers": -1,  # -1: offload all layers to the GPU (llama-cpp-python convention)
    "n_ctx": 2048,
    "chat_format": "llama-3",
    "verbose": False,
}
llama = llama_cpp.Llama(**LLAMA_SETTINGS)


# Patch the OpenAI-style chat completion method with instructor (JSON_SCHEMA
# mode) so that calls to `create` can take a `response_model` argument.
create = instructor.patch(
    mode=instructor.Mode.JSON_SCHEMA,
    create=llama.create_chat_completion_openai_v1,
)

In [None]:
from parser.model import QuestionClassification, QuestionDescription


def extract_question_description(
    primary_text: str, sub_text: str | None = None
) -> QuestionDescription:
    """Ask the model what an exam question requires the student to provide.

    Args:
        primary_text: Text of the main question.
        sub_text: Optional text of a sub-question. When given, it is appended
            to the user prompt after the main question.

    Returns:
        The ``QuestionDescription`` parsed from the model response by the
        module-level ``create`` callable.
    """
    # Each piece of question text is wrapped in <text> tags in the user message.
    prompt: str = (
        "Extract the expected input type for the following exam question: "
        f"Main Question: <text>{primary_text}</text>"
    )
    if sub_text is not None:
        prompt += f" Sub Question: <text>{sub_text}</text>"

    # Adjacent string literals are joined verbatim, so every sentence except
    # the last must carry its own trailing space; without it the model sees
    # run-together text such as "asking for.Focus on".
    system_prompt: str = (
        "You are an expert at analyzing exam questions. "
        "Your task is to determine what the exam question is asking for. "
        "Focus on identifying what the user would have to input to correctly answer the question. "
        "Do not attempt to solve the question itself."
    )

    extraction: QuestionDescription = create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        response_model=QuestionDescription,
    )
    return extraction


def extract_question_data(
    primary_text: str, sub_text: str | None = None
) -> QuestionClassification:
    """Classify the type of input an exam question expects.

    Args:
        primary_text: Text of the main question.
        sub_text: Optional text of a sub-question; appended to the user
            prompt when it is not ``None``.

    Returns:
        The ``QuestionClassification`` parsed from the model response by the
        module-level ``create`` callable.
    """
    # Assemble the user prompt piece by piece; the sub-question part is only
    # present when a sub-question was supplied.
    prompt_parts = [
        "Extract the expected input type for the following exam question: ",
        f"Main Question: <text>{primary_text}</text>",
    ]
    if sub_text is not None:
        prompt_parts.append(" Sub Question: <text>" + sub_text + "</text>")
    user_message = {"role": "user", "content": "".join(prompt_parts)}

    # Fixed instructions: classify the expected input, never solve the question.
    system_message = {
        "role": "system",
        "content": (
            "You are an expert at analyzing exam questions. "
            "Your task is to determine the type of input expected for each question. "
            "Focus on identifying the expected input type based on the question's context. "
            "Do not attempt to solve the question itself."
        ),
    }

    return create(
        messages=[system_message, user_message],
        response_model=QuestionClassification,
    )

In [None]:
from parser.model import Section
from parser.parse import main
from typing import List

# Parse the exam PDF into sections using the parser's `main` entry point.
sections: List[Section] = main("../fe_files/exams/FE-Aug23.pdf")


# Classify every question with the local model. The lemmatized question text
# produced by the NLP preprocessing step is what gets sent as the main question.
for section in sections:
    for question in section.questions:
        question.metadata.run_nlp_preprocessing()
        question_text = question.metadata.lemmatized_text
        print(question_text)

        sub_questions = question.sub_questions
        if sub_questions is None or len(sub_questions) == 0:
            # Standalone question: classify it, describe it, then classify
            # again using only the generated description.
            question.metadata.classification = extract_question_data(question_text)

            question_description: QuestionDescription = extract_question_description(
                question_text
            )
            question.metadata.description = question_description

            question.metadata.classification_on_description = extract_question_data(
                question_description.description
            )
            continue

        # Question with sub-questions: classify each sub-question in the
        # context of the main question text.
        # NOTE(review): the result is stored on the sub-question itself, while
        # standalone questions store theirs under `.metadata` — confirm this
        # asymmetry is intended.
        # TODO: description extraction and classification-on-description are
        # currently only run for standalone questions, not for sub-questions.
        for sub_question in sub_questions:
            sub_question.classification = extract_question_data(
                question_text, sub_question.text
            )


class Document(BaseModel):
    # Root container so the full list of parsed sections can be serialized
    # as a single pydantic model.
    sections: list[Section]


document = Document(sections=sections)

# Write pydantic models to JSON file.
# The encoding is pinned to UTF-8: without it, open() falls back to the
# platform's locale encoding, which can fail on (or mis-encode) non-ASCII
# characters in the exam text and makes the output machine-dependent.
with open(
    "document_preprocessed_classified.json", "w", encoding="utf-8"
) as json_file:
    json_file.write(document.model_dump_json())