# Demo for MedAgent - Question Dataset Parsing
This is a manual testing playground for trying out basic workflows that are later implemented properly in the MedAgent repository.

This file focuses on transforming the questions currently stored in an Excel sheet into something accessible via MongoDB. Additionally, the current state of the dataset is visualized.

In [None]:
## SETUP
import os
import sys

import pandas as pd

sys.path.append(os.path.abspath("../src"))
from general.data_model.question_dataset import all_question_classes, all_supercategories
from general.helper.mongodb_interactor import MongoDBInterface, CollectionName
from scripts.QuestionDataset.question_setup import insert_question_classes
from scripts.QuestionDataset.question_setup import insert_csv_entry_to_db
from scripts.QuestionDataset.question_analysis import *

# MongoDB connection string; the MONGO_URL environment variable takes precedence over the default.
mongo_url = os.getenv("MONGO_URL", "mongodb://mongo:mongo@host.docker.internal:27017/")

# Input dataset and the output artifacts written by this notebook.
question_dataset_csv_location = "input/question-dataset/question_dataset.csv"
statistics_doc = "output/question_dataset/evaluation/statistics_document.txt"
questions_per_types = "output/question_dataset/evaluation/questions_per_types.png"
questions_per_gl = "output/question_dataset/evaluation/questions_per_gl.png"

# Create the parent directory of every output file up front so later writes cannot fail on a missing folder.
for file_or_dir in (statistics_doc, questions_per_types, questions_per_gl):
    os.makedirs(os.path.dirname(file_or_dir), exist_ok=True)

# Figure dimensions: screen_* is applied before showing figures inline, width/height when saving them.
screen_width = 650
screen_height = 450
width = 750
height = 500

# Open the database interface and register every collection this notebook touches.
dbi = MongoDBInterface(mongo_url)
collections_in_use = (
    CollectionName.QUESTION_TYPES,
    CollectionName.CORRECT_ANSWERS,
    CollectionName.QUESTIONS,
    CollectionName.GUIDELINES,
)
dbi.register_collections(*collections_in_use)

## Setup and insert question types

In [None]:
# Pass all known question classes together with the QUESTION_TYPES collection to the setup helper.
question_types_collection = dbi.get_collection(CollectionName.QUESTION_TYPES)
insert_question_classes(
    question_types_collection=question_types_collection,
    classes=all_question_classes,
)

## Load csv-file and transform to entries
Important: this step assumes that the CSV content is valid, meaning:
- No invalid categories (only existing ones)
- The guideline exists and is OMS-related
- The answer text appears in the guideline on the correct page; only negative examples may have an empty answer

In [None]:
# Read the question dataset.
# NOTE(review): encoding_errors='ignore' silently drops bytes that are not valid UTF-8,
# so mis-encoded characters disappear from the questions without any warning.
df = pd.read_csv(question_dataset_csv_location, encoding="utf-8", encoding_errors='ignore')

# Insert row by row so a single bad entry does not abort the whole import.
# Failures are collected (instead of being swallowed) so they can be inspected afterwards.
counter_success, counter_failed = 0, 0
failed_entries = []
for row_index, csv_entry in df.iterrows():
    try:
        insert_csv_entry_to_db(dbi=dbi, entry=csv_entry)
    except Exception as e:  # deliberately broad: best-effort import, every failure is reported below
        counter_failed += 1
        failed_entries.append((row_index, e))
    else:
        counter_success += 1

print(f"Successful: {counter_success}, Failed: {counter_failed}")
for row_index, error in failed_entries:
    print(f"  Row {row_index} failed with {type(error).__name__}: {error}")

## Analyze questions
We will now look at some statistics for the created and translated question dataset.

In [None]:
# Overall number of questions, used as the denominator for the percentages below.
total_number_questions = get_total_question_count(dbi.get_collection(CollectionName.QUESTIONS))

# Each analysis helper yields a (table, figure) pair; the tables feed the numbers below,
# the figures are displayed and saved in later cells.
df__question_dist, img__question_dist = analyze_and_visualize_question_distribution(
    questions_collection=dbi.get_collection(CollectionName.QUESTIONS),
    question_type_collection=dbi.get_collection(CollectionName.QUESTION_TYPES),
    all_classes=all_question_classes,
    supercategories=all_supercategories
)
df__quest_per_gl, img__quest_per_gl = analyze_and_visualize_question_per_guideline(
    questions_coll=dbi.get_collection(CollectionName.QUESTIONS),
    question_type_coll=dbi.get_collection(CollectionName.QUESTION_TYPES),
    gl_coll=dbi.get_collection(CollectionName.GUIDELINES),
    answer_coll=dbi.get_collection(CollectionName.CORRECT_ANSWERS)
)

# Question count per supercategory: sum the "count" column over the rows of that
# supercategory whose subcategory is missing.
number_questions = {
    cat.value: int(df__question_dist.query(f"supercategory == '{cat.value}' and subcategory.isna()")["count"].sum())
    for cat in all_supercategories
}

# Share of each supercategory in percent. An empty question collection would otherwise
# raise ZeroDivisionError, so report 0.0 in that case.
percentages = {
    cat: (count / total_number_questions * 100) if total_number_questions else 0.0
    for cat, count in number_questions.items()
}

print(f"Total number of questions: {total_number_questions}")
for supercat in all_supercategories:
    print(
        f"Number of {supercat.value} questions: {number_questions[supercat.value]} (-> {percentages[supercat.value]:.2f}%)")

In [None]:
# Resize both figures to the on-screen dimensions first, then display them in the same order.
figures_to_show = (img__question_dist, img__quest_per_gl)

for figure in figures_to_show:
    figure.update_layout(width=screen_width, height=screen_height)

for figure in figures_to_show:
    figure.show()

Alternatively, the images and numbers can also be saved to disk.

In [None]:
# Write the same summary that was printed above into the statistics document (overwriting it).
with open(statistics_doc, "w", encoding="utf-8") as statistics_file:
    header_line = f"Total number of questions: {total_number_questions}\n"
    category_lines = "".join(
        f"Number of {category.value} questions: {number_questions[category.value]} (-> {percentages[category.value]:.2f}%)\n"
        for category in all_supercategories
    )
    statistics_file.write(header_line + category_lines)

# Export each figure to its image file using the save dimensions.
image_exports = (
    (img__question_dist, questions_per_types),
    (img__quest_per_gl, questions_per_gl),
)
for figure, target_path in image_exports:
    figure.write_image(target_path, width=width, height=height)

## Shutdown

In [None]:
# Final step of the notebook: close the database interface created in the setup cell.
# NOTE(review): presumably this releases the underlying MongoDB connection — confirm in MongoDBInterface.
dbi.close()