In [6]:
import llama_cpp
import instructor

#from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
from pydantic import BaseModel, Field


llama = llama_cpp.Llama(
    model_path="../models/Hermes-2-Pro-Llama-3-8B-Q4.gguf",
    n_gpu_layers=-1,
    n_ctx=2048,
    chat_format="llama-3",
)


create = instructor.patch(
    create=llama.create_chat_completion_openai_v1,
    mode=instructor.Mode.JSON_SCHEMA, 
)



llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from ../models/Hermes-2-Pro-Llama-3-8B-Q4.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = Hermes-2-Pro-Llama-3-8B
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
llama_model_loader: - kv   7:              llama.attent

llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128003
llama_model_loader: - kv  20:            tokenizer.ggml.padding_token_id u32              = 128001
llama_model_loader: - kv  21:                    tokenizer.chat_template str              = {{bos_token}}{% for message in messag...
llama_model_loader: - kv  22:               general.quantization_version u32              = 2
llama_model_loader: - type  f32:   65 tensors
llama_model_loader: - type q4_K:  193 tensors
llama_model_loader: - type q6_K:   33 tensors
llm_load_vocab: special tokens cache size = 288
llm_load_vocab: token to piece cache size = 0.8007 MB
llm_load_print_meta: format           = GGUF V3 (latest)
llm_load_print_meta: arch             = llama
llm_load_p

In [7]:
class QuestionExtraction(BaseModel):
    chain_of_thought: str = Field(
        ...,
        description="The chain of thought that led to the prediction.",
    )
    user_input_code: str = Field(
        ...,
        description="Code that the user will implement, such as blank function implementations for the user to implement. Use the question prompt to help determine what the user needs to implement.",
    )
    predefined_code: str = Field(
        ...,
        description="Code that is predefined by the system, such as imports, function definitions, struct definitions, etc.",
    )


def extract_question_data(data: str) -> QuestionExtraction:
    extraction: QuestionExtraction = create(
        messages=[
            {
                "role": "system",
                "content": (
                    "You are an expert at extracting information from exams. "
                    "You will be given a question from a Computer Science exam, "
                    "and you will need to extract metadata about the question. "
                    "Focus on identifying the user_input_code and predefined_code. "
                    "Do not solve the question. "
                    "For example, if the question provides a function prototype, "
                    "extract it as user_input_code. If there are any predefined "
                    "function implementations or imports, extract them as predefined_code."
                ),
            },
            {
                "role": "user",
                "content": (
                    f"Extract user_input_code and predefined_code from the following text: "
                    f"<text>{data}</text>"
                ),
            },
        ],
        response_model=QuestionExtraction,
    )
    return extraction


In [8]:
from typing import List
from parser.model import Section
from parser.parse import main

sections: List[Section] = main("../fe_files/exams/FE-Aug23.pdf")

input_question = sections[1].questions[1].text;

print(input_question)

extraction = extract_question_data(input_question)

print(extraction.model_dump_json(indent=2))

A mininum  heap  is typically implemented with an array, with the root node ( minimum value ) being 
stored in index 1 of the array. To insert a new value into a heap, it’s originally placed in the first open slot, 
followed by running a “percolate up” operation. Write a function that inserts a value into a heap in this 
manner. You may assume that the array is allocated to be big enough to store the new ly inserted value. 
The function prototype is as follows:  
 
void insert(int* heap, int curSize, int newVal);  
 
heap is a pointer to an array which currently stores curSize  number of values (but has room for at 
least 1 more). newVal  is the new number to be ins erted into the heap. Write this function which inserts 
the value newVal into this minimum heap . Take care to avoid infinite loops or array out of bounds issues. 
You may assume that index curSize+1 is in bounds for the array heap. Also, remember that index 0 of the 
array heap is unused.  You may not write any helper func


llama_print_timings:        load time =     349.65 ms
llama_print_timings:      sample time =    1141.86 ms /   104 runs   (   10.98 ms per token,    91.08 tokens per second)
llama_print_timings: prompt eval time =     434.01 ms /   635 tokens (    0.68 ms per token,  1463.08 tokens per second)
llama_print_timings:        eval time =    1995.69 ms /   103 runs   (   19.38 ms per token,    51.61 tokens per second)
llama_print_timings:       total time =    3934.56 ms /   738 tokens


{
  "chain_of_thought": "The user needs to implement a function that inserts a new value into a minimum heap using an array. The function prototype is provided, and the user should use the prompt to understand what to implement.",
  "user_input_code": "void insert(int* heap, int curSize, int newVal);",
  "predefined_code": "void insert(int* heap, int curSize, int newVal) {\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n}"
}
