In [1]:
import pandas as pd
from docling.document_converter import DocumentConverter
from tqdm.notebook import tqdm_notebook
import re
from typing import Any
from solutions import sol_day_1, sol_day_2, sol_day_3
from utils import convert_qa_to_prompt_answer_format, convert_conversation_to_completion_format

In [2]:
# Single docling converter instance; the conversion loop below calls
# converter.convert() once per PDF path.
converter = DocumentConverter()

In [3]:
# PDF files to convert, given as paths relative to the notebook's working directory.
task_documents = [
    'AeP-F2011-1.Tag.pdf',
    'AeP-F2011-2.Tag.pdf',
    'AeP-F2011-3.Tag.pdf',
]

In [4]:
# Convert every PDF in task_documents and keep two parallel lists (same order as task_documents):
#   documents - the objects returned by converter.convert()
#   outputs   - the Markdown export of each converted document
# Both lists are re-created here, so re-running the cell does not accumulate duplicates.
outputs = []
documents = []
for path in tqdm_notebook(task_documents):
    doc = converter.convert(path)
    documents.append(doc)
    outputs.append(doc.document.export_to_markdown())


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
def _choice_label(index: int) -> str:
    """
    Converts a zero-based position into a spreadsheet-style letter label.

    :param index: Zero-based position of the answer choice (0, 1, 2, ...).
    :return: 'A' ... 'Z' for positions 0-25, then 'AA', 'AB', ... for later positions.
    """
    label = ''
    remaining = index + 1
    while remaining > 0:
        remaining, offset = divmod(remaining - 1, 26)
        label = chr(ord('A') + offset) + label
    return label


def clean_answers(unclean_answers: list[str]) -> dict[str, Any]:
    """
    Cleans a list of raw question and answer strings by stripping whitespace and formatting
    the answers into a dictionary keyed by letter choices (A, B, C, etc.).

    Choice letters are assigned by position, not read from the text. Any number of answer
    chunks is accepted (labels continue with 'AA', 'AB', ... after 'Z'), so a malformed
    section with many chunks yields an oversized 'choices' dict instead of raising.

    :param unclean_answers: A list of strings where the first element is the question and the
                            later elements are the unprocessed answer choices.
    :return: A dictionary with two keys:
             - 'question': the question string with surrounding '-' characters and
               whitespace removed.
             - 'choices': a dictionary mapping choice letters ('A', 'B', etc.) to answer strings.
    :raises IndexError: if ``unclean_answers`` is empty (there is no question element).
    """
    question = unclean_answers[0].strip('-').strip()
    # Only the first line of each answer chunk is kept; anything after the first
    # line break inside a chunk is discarded.
    cleaned_answers = [raw_answer.strip().split('\n', 1)[0] for raw_answer in unclean_answers[1:]]
    choices = {_choice_label(position): answer for position, answer in enumerate(cleaned_answers)}
    return {'question': question, 'choices': choices}


def extract_singular_question_qa_pairs(output: str, solution_map: dict[int, str],
                                       expected_choice_count: int = 5) -> list[dict[str, Any]]:
    """
    Extracts individual question-answer pairs from a formatted output string and matches each question
    with the correct answer letter using a provided solution map.

    The section between the 'Einzelaufgaben' heading and the first 'Fallstudie Nr' marker is split
    into questions at every '- <number> ' list marker. Questions are numbered by their position
    in that section, not by the number printed in the text.

    :param output: The full raw text, which contains a section of individual questions and answers.
    :param solution_map: A dictionary mapping question numbers (1-based index) to the correct
                         answer letters (e.g., 'A', 'B', etc.).
    :param expected_choice_count: Number of answer choices a question must have to be kept
                                  (defaults to 5).
    :return: A list of dictionaries, each containing:
             - 'question': the question string,
             - 'choices': a dict mapping answer letters to the answer text,
             - 'correct_answer_letter': the answer letter looked up in ``solution_map``,
             - 'question_id': a numeric ID starting from 1.
             Questions whose number of choices differs from ``expected_choice_count`` or that
             have no entry in ``solution_map`` are left out of the final list.
    :raises IndexError: if ``output`` does not contain the 'Einzelaufgaben' heading followed by
                        a blank line.
    """
    singular_qa_part = output.split('Fallstudie Nr')[0].split('Einzelaufgaben\n\n')[1]
    # The first question marker has no preceding newline when the section starts directly
    # with '- 1 ', so the pattern also has to match at the very start of the text;
    # otherwise question 1 keeps its number as a prefix of the question string.
    isolated_qa_sections = re.split(r'(?:\A|\n)- \d+ ', singular_qa_part)
    # A marker at the very start leaves a blank chunk in front of question 1. Drop it so
    # that positional numbering still begins at the first real question.
    if isolated_qa_sections and not isolated_qa_sections[0].strip():
        isolated_qa_sections = isolated_qa_sections[1:]
    qa_data = [clean_answers(re.split(r'\s*\([A-Z]\)', qa_text)) for qa_text in isolated_qa_sections]
    for question_id, qa_data_point in enumerate(qa_data, start=1):
        if len(qa_data_point['choices']) == expected_choice_count:
            correct_answer_letter = solution_map.get(question_id)
        else:
            # Wrong number of choices: treat the section as unusable.
            correct_answer_letter = None
        qa_data_point['correct_answer_letter'] = correct_answer_letter
        qa_data_point['question_id'] = question_id
    return [data_point for data_point in qa_data if data_point['correct_answer_letter'] is not None]



In [6]:
# Extract the QA pairs of every converted document, pairing each Markdown output with
# its solution map by position, and collect everything in one flat list.
# NOTE(review): extract_singular_question_qa_pairs numbers questions from 1 for every
# document, so 'question_id' is not unique within final_qa_data and the source document
# is not recorded - confirm this is intended.
final_qa_data = []
for out, solution in zip(outputs, [sol_day_1, sol_day_2, sol_day_3]):
    output_data = extract_singular_question_qa_pairs(out, solution)
    final_qa_data.extend(output_data)

In [7]:
# Number of QA pairs kept across all documents.
len(final_qa_data)

136

In [8]:
# Tabulate the QA records: one row per question, one column per record key.
df = pd.DataFrame(final_qa_data)

In [9]:
# Save the multiple-choice table as a Feather file (path is relative to the working directory).
df.to_feather('german_medical_exam_multiple_choice.feather')

In [10]:
# Build the 'conversation' column row by row from the question, its choices and the
# correct answer letter, using the project helper convert_qa_to_prompt_answer_format.
df['conversation'] = df.apply(
    lambda row: convert_qa_to_prompt_answer_format(
        row['question'],
        row['choices'],
        row['correct_answer_letter'],
    ),
    axis=1,
)

In [11]:
# Give every element of each list-like 'conversation' entry its own row; the other
# columns are repeated for each new row.
# NOTE(review): explode() keeps the original index labels, so index values repeat and
# the frame no longer has a default RangeIndex after this cell.
df = df.explode('conversation')

In [12]:
# Save the exploded table as a Feather file.
# NOTE(review): df carries the repeated index produced by explode(); older pandas
# releases refuse to write a non-default index with to_feather() - confirm the pandas
# version in use or reset the index before saving.
df.to_feather('german_medical_exam_boolean_questions.feather')

In [13]:
# Pass the exploded table through the project helper
# convert_conversation_to_completion_format (its output schema is not visible here)
# and save the result.
# NOTE(review): the file name says "multiple_choice" although df at this point is
# derived from the exploded frame saved as "boolean_questions" - confirm the naming.
df = convert_conversation_to_completion_format(df)
df.to_feather('german_medical_exam_multiple_choice_conversation_format.feather')

In [14]:
# Final table size as (rows, columns).
df.shape

(680, 7)