In [206]:
import json
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import pymupdf4llm
from collections import defaultdict
# from huggingface_hub import login
# login()

In [None]:
# Checkpoint identifier handed to both `from_pretrained` calls below.
# NOTE(review): presumably a gated repository — the commented-out `login()` in the
# import cell hints that authentication may be required; confirm before a fresh run.
model_name = "meta-llama/Llama-2-7b-chat-hf"
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): device_map="auto" presumably leaves device placement to the library
# rather than pinning a device here — confirm against the runtime environment.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
# Create a generation pipeline
# `generator` is the module-level callable that generate_qa_pairs() invokes.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [222]:
def generate_qa_pairs(chunk, max_new_tokens=256, temperature=0.7):
    """Ask the language model for one multiple-choice question about ``chunk``.

    Args:
        chunk: Passage text (str) appended to the instruction prompt.
        max_new_tokens: Upper bound on the number of generated tokens, not
            counting the prompt.
        temperature: Sampling temperature forwarded to the pipeline.

    Returns:
        The model's completion as a string, without the echoed prompt.
    """
    # A prompt crafted to encourage generating QA pairs in a clear, educational tone
    prompt = (
        "You are a financial education assistant for youths. Read the following passage and generate a clear, concise, and engaging "
        "multiple-choice question (4 options) that tests a key financial concept. Do not ask about the main idea, since the overall context is unknown. "
        "Keep both the question and answers short and simple, using language appropriate for young learners. Format your output exactly as follows: "
        "question:, a:, b:, c:, d:, correct: {a, b, c, d}.\n\n"
        "Passage:\n" + chunk
    )
    # `max_length` also counts the prompt tokens, so a long passage could leave
    # little or no room for the answer; `max_new_tokens` budgets the completion only.
    # `return_full_text=False` keeps the prompt (whose format line contains
    # "a:", "b:", ... and "correct:") out of the text handed to the parser.
    result = generator(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        return_full_text=False,
    )
    return result[0]["generated_text"]


In [223]:
def to_json(text):
    """Parse one multiple-choice question out of free-form model output.

    Recognised layout (case-insensitive, one item per line; leading bullets,
    numbering or markdown markers are tolerated):

        Question: <text>          (or "question) <text>")
        A) <text>  /  a: <text>   (letters A-D)
        Correct answer: <letter>  (or "correct: <letter>")

    Args:
        text: Raw text produced by the model (may include the echoed prompt).

    Returns:
        A dict with the key "question", one lower-case key per choice letter
        found ("a".."d"), and "correct_answer" (lower-case letter); or None
        when the question, every choice, or the correct answer is missing.
    """
    flags = re.IGNORECASE | re.MULTILINE

    # Anchor to the start of a line so that the word "question:" inside the
    # echoed prompt's format description is not mistaken for the real question.
    question_match = re.search(r"^[^A-Za-z\n]*question[:\)]\s*(.*)", text, flags)
    if not question_match:
        return None
    data = {"question": question_match.group(1).strip()}

    # A choice is a single letter A-D that is the first letter on its line,
    # followed by ')' or ':'. Anchoring prevents matches on word endings such
    # as "card)" and on the "Correct answer: B) ..." line, which previously
    # overwrote real options.
    choices = re.findall(r"^[^A-Za-z\n]*([A-Da-d])[\):]\s*(.*)", text, flags)
    if not choices:
        return None
    # Lower-case keys keep the JSON consistent regardless of the model's casing.
    for letter, choice in choices:
        data[letter.lower()] = choice.strip()

    # Accept both "Correct answer: B" and the prompt's own "correct: b" form.
    # The trailing \b rejects words that merely start with A-D ("Budget").
    correct_match = re.search(
        r"\bcorrect(?:\s+answer)?\s*:\s*\(?([A-Da-d])\b", text, re.IGNORECASE
    )
    if not correct_match:
        return None
    data["correct_answer"] = correct_match.group(1).lower()

    return data

In [260]:
def parse_md(md_text):
    """Collect the text that follows each ``**SAY**`` marker in a markdown string.

    A segment runs from just after ``**SAY**`` up to the next line that begins
    with ``*`` (or the end of the string). Blank segments are dropped; in the
    rest, newlines become spaces, '¡' characters are removed and whitespace
    runs are collapsed to a single space.

    Returns:
        list[str]: the cleaned segments, in document order.
    """
    say_block = re.compile(r'(?s)\*\*SAY\*\*(.*?)(?=\n\*|$)')
    segments = []
    for raw in say_block.findall(md_text):
        body = raw.strip()
        if not body:
            # Marker with nothing but whitespace after it.
            continue
        flattened = body.replace('\n', ' ').replace('¡', '')
        segments.append(re.sub(r'\s+', ' ', flattened))
    return segments
    
def generate_segments(files):
    """Convert each PDF to markdown and extract its ``**SAY**`` segments.

    Args:
        files: Iterable of PDF base names (no extension); each is read from
            ``./data/<name>.pdf``.

    Returns:
        dict mapping each base name to the list returned by ``parse_md`` for
        that document's markdown.
    """
    return {
        stem: parse_md(pymupdf4llm.to_markdown("./data/" + stem + ".pdf"))
        for stem in files
    }
    
def generate_questions(segments_map, starting_point=3, max_attempts=5):
    """Generate one parsed multiple-choice question per text segment.

    Args:
        segments_map: dict mapping a document key to its list of text segments.
        starting_point: Index of the first segment to use in every document;
            earlier segments are skipped. Defaults to 3, the value that was
            previously hard-coded.
        max_attempts: How many times the model is queried for a segment before
            that segment is given up on.

    Returns:
        dict mapping each document key to ``{"questions": [...]}``, where each
        entry is a dict produced by ``to_json``. Segments for which no
        parseable output was obtained within ``max_attempts`` are omitted.
    """
    questions_json = {}
    for key, segments in segments_map.items():
        questions = []
        for index in range(starting_point, len(segments)):
            question = None
            # Bounded retry: sampling is stochastic, so a malformed answer is
            # worth retrying, but an unbounded loop hangs the notebook when a
            # segment never yields parseable output.
            for _ in range(max_attempts):
                question = to_json(generate_qa_pairs(segments[index]))
                if question is not None:
                    break
            if question is None:
                print(
                    f"Skipping segment {index} of '{key}': "
                    f"no parseable question after {max_attempts} attempts"
                )
                continue
            questions.append(question)
        questions_json[key] = {"questions": questions}
    return questions_json
        

In [254]:
# Base names of the source PDFs (no extension); generate_segments() reads
# each one from ./data/<name>.pdf.
files = [
    "borrowing",
    "buy_car",
    "credit",
    "financial_decision",
    "financial_institution",
    "income",
    "living_on_your_own",
    "saving_plan",
]
# Maps each base name to the list of text segments extracted from its PDF.
segments = generate_segments(files)

In [261]:
# Build the question set for every document in `segments`; this invokes the
# language model for each segment that is used.
questions = generate_questions(segments)
# Output location, relative to the current working directory.
file_path = "./questions.json"
# Mode 'w' replaces any existing file at that path.
with open(file_path, 'w') as json_file:
    json.dump(questions, json_file, indent=4)