In [None]:
import instructor
import llama_cpp

# from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
from pydantic import BaseModel, Field

# Load the local GGUF checkpoint through llama-cpp-python.
# NOTE(review): Hermes-2-Pro checkpoints are usually prompted with ChatML;
# confirm that "llama-3" is the intended chat_format for this model file.
llama = llama_cpp.Llama(
    model_path="../models/Hermes-2-Pro-Llama-3-8B-Q4.gguf",
    chat_format="llama-3",
    n_ctx=4096,
    n_gpu_layers=-1,  # per llama-cpp-python docs, -1 offloads all layers to the GPU
    verbose=False,
)


# Wrap llama's OpenAI-style chat-completion function with instructor so that
# callers can pass `response_model=` (the extract_* helpers in this notebook
# rely on that keyword).
create = instructor.patch(
    mode=instructor.Mode.JSON_SCHEMA,
    create=llama.create_chat_completion_openai_v1,
)

In [None]:
from typing import List

from parser.dataset.dataloader import DataLoader
from parser.model import Section
from parser.parse import main

# Parse a single exam PDF into its sections.
# NOTE(review): the second argument looks like the path the extracted JSON is
# written to — confirm against parser.parse.main.
sections: List[Section] = main(
    "../fe_files/exams/FE-Aug23.pdf",
    "./FE-Aug23_extracted.json",
)


# Load the exams from the exams directory and list which ones were picked up.
# NOTE(review): the meaning of DataLoader's second argument (None) is not
# visible from this notebook.
data_loader = DataLoader("../fe_files/exams/", None)
data_loader.load_data()

for exam in data_loader.exams:
    # `!s` forces str() on each value, matching what print(a, b) would emit.
    print(f"{exam.semester!s} {exam.year!s}")

In [None]:
class QuestionExtraction(BaseModel):
    # Response schema used by extract_question_data: the model's reasoning plus
    # the question's code split into "student writes this" and "already given".
    #
    # Documented with comments rather than a class docstring on purpose:
    # pydantic copies a model's docstring into its JSON schema, which would
    # change the schema handed to the LLM.
    #
    # Field order is significant for the generated schema; keep
    # chain_of_thought first.

    # Free-form reasoning emitted before the two code fields.
    chain_of_thought: str = Field(
        default=...,
        description="The chain of thought that led to the prediction.",
    )

    # Code the student is expected to write (e.g. empty function bodies).
    user_input_code: str = Field(
        default=...,
        description=(
            "Code that the user will implement, such as blank function "
            "implementations for the user to implement. Use the question "
            "prompt to help determine what the user needs to implement."
        ),
    )

    # Code the question already supplies (imports, struct definitions, ...).
    predefined_code: str = Field(
        default=...,
        description=(
            "Code that is predefined by the system, such as imports, "
            "function definitions, struct definitions, etc."
        ),
    )


def extract_question_data(data: str) -> QuestionExtraction:
    """Split an exam question's code into given and to-be-written parts.

    Sends a fixed system prompt plus the question text to the
    instructor-patched ``create`` callable and asks for a
    ``QuestionExtraction`` back.

    Args:
        data: Text of a single exam question. It is embedded verbatim between
            ``<text>`` tags in the user message.

    Returns:
        The ``QuestionExtraction`` produced by the ``create`` call.
    """
    system_prompt = (
        "You are an expert at extracting information from exams. "
        "You will be given a question from a Computer Science exam, "
        "and you will need to extract metadata about the question. "
        "Focus on identifying the user_input_code and predefined_code. "
        "Do not solve the question. "
        "For example, if the question provides a function prototype, "
        "extract it as user_input_code. If there are any predefined "
        "function implementations or imports, extract them as predefined_code."
    )
    user_prompt = (
        "Extract user_input_code and predefined_code from the following text: "
        f"<text>{data}</text>"
    )

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    return create(messages=conversation, response_model=QuestionExtraction)


# Try the extractor on one hand-picked question: the question at index 1 of the
# section at index 1 of the parsed exam (both indices are hard-coded).
input_question = sections[1].questions[1].filtered_text

# Show the text that is about to be sent to the model.
print(input_question)

extraction = extract_question_data(input_question)

# Dump the structured result as indented JSON for inspection.
print(extraction.model_dump_json(indent=2))

In [None]:
from copy import deepcopy
from typing import List

from pydantic import BaseModel, Field


class SubQuestionExtraction(BaseModel):
    # Response schema used by extract_sub_questions.
    #
    # Documented with comments rather than a class docstring on purpose:
    # pydantic copies a model's docstring into its JSON schema, which would
    # change the schema handed to the LLM.

    # Required flag: does the question break down into separate parts?
    has_sub_questions: bool = Field(
        default=...,
        description="Indicates whether the question contains sub-questions.",
    )

    # Optional list of the parts found; defaults to a fresh empty list.
    sub_questions: List[str] = Field(
        description="List of sub-questions extracted from the main question.",
        default_factory=list,
    )


def extract_sub_questions(data: str) -> SubQuestionExtraction:
    """Ask the model whether a question contains distinct sub-questions.

    Sends a fixed system prompt (which includes a worked example) plus the
    question text to the instructor-patched ``create`` callable and asks for a
    ``SubQuestionExtraction`` back.

    Args:
        data: Text of the question to analyse. It is embedded verbatim between
            ``<text>`` tags in the user message.

    Returns:
        The ``SubQuestionExtraction`` produced by the ``create`` call.
    """
    system_prompt = (
        "You are an expert at analyzing exam questions. "
        "Your task is to determine if a question contains distinct sub-questions. "
        "Sub-questions are separate tasks or parts that require distinct answers, "
        "often labeled as 'a)', 'b)', 'c)', or numbered parts like '1.', '2.', '3.'. "
        "Instructions, assumptions, or clarifications within a single task do not count as sub-questions. "
        "For example, consider the following:\n"
        "Example with sub-questions:\n"
        "Give the hash code produced for each of the following strings:\n"
        "hash(“Not”) = ________\n"
        "hash(“Know”) = ________\n"
        "traversal:   18   14   12   9   31   24   19   22   23   36\n"
        "traversal:   9   12   14   23   22   19   24   36   31   18\n"
        "Each line represents a distinct sub-question because it requires a separate answer."
    )
    user_prompt = (
        "Analyze the following question and determine if it contains sub-questions: "
        f"<text>{data}</text>"
    )

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    return create(messages=conversation, response_model=SubQuestionExtraction)


# Run sub-question detection over every question of every loaded exam and
# print each result as indented JSON.
for exam in data_loader.exams:
    # NOTE(review): nothing in this loop appears to mutate the exam, so the
    # deep copy may be unnecessary — confirm before removing.
    exam_copy = deepcopy(exam)
    print(exam_copy.semester, exam_copy.year)
    for section in exam_copy.sections:
        for question in section.questions:
            print(f"Question {question.question_number}, Page # {question.pages[0]}")

            # No pre-split parts: analyse the question's full text instead.
            if question.sub_questions is None or len(question.sub_questions) == 0:
                extracted_sub_questions = extract_sub_questions(question.filtered_text)
                print(extracted_sub_questions.model_dump_json(indent=2))
                continue

            # Parts already exist: analyse each one on its own.
            # NOTE(review): extract_sub_questions expects a str; the element
            # type of question.sub_questions is not visible here — verify.
            for sub_question in question.sub_questions:
                sub_extraction = extract_sub_questions(sub_question)
                print(sub_extraction.model_dump_json(indent=2))


# print(input_question)

# extraction = extract_sub_questions(input_question)

# print(extraction.model_dump_json(indent=2))