In [None]:
import re
import io
import PIL
import enum
import requests
import dataclasses
import collections
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from tqdm.auto import tqdm
from typing import Callable
from bs4 import BeautifulSoup

## Get the question and answers

In [None]:
# Define the structure to store the questions as a dataclass.
@dataclasses.dataclass
class QuestionData:
    number: int
    catalog_identifier: str  # number or state code
    question: str
    options: list[str]
    answer_index: int
    question_image: np.ndarray | None
    failure_rate: float | None = None

    @property
    def answer(self) -> str:
        return self.options[self.answer_index]

    @property
    def num_with_text(self) -> str:
        return f'Q{self.number}: {self.question}'

In [None]:
# Root of the website the questions are scraped from.
_BASE_URL = 'https://www.lebenindeutschland.eu'
# Catalog pages to scrape: the numbered pages '1'..'10' followed by the 'by' page.
_QUESTION_CATALOG_IDS = [str(page) for page in range(1, 11)] + ['by']


def get_questions(question_catalogs_ids: list[str]) -> list[QuestionData]:
    """Scrape every question found on the given catalog pages.

    Args:
        question_catalogs_ids: Page identifiers appended to
            ``{_BASE_URL}/fragenkatalog/``.

    Returns:
        One ``QuestionData`` per question block, in page order.

    Raises:
        requests.HTTPError: If a catalog page or a question image cannot be fetched.
        ValueError: If no option of a question is marked as the correct one.
    """
    # `import PIL` alone does not load the `Image` submodule; import it explicitly
    # instead of relying on another library having loaded it as a side effect.
    import PIL.Image

    request_timeout = 10  # Seconds; applies to pages and images alike.
    questions = []

    for catalog_id in tqdm(question_catalogs_ids):
        url = f'{_BASE_URL}/fragenkatalog/{catalog_id}'
        r = requests.get(url, timeout=request_timeout)
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')

        question_blocks = soup.select("div[id^='frage-']")
        for block in question_blocks:
            question_full_text = block.select_one('h3').get_text(strip=True)
            # Drop the thin space (U+2009) that appears inside the heading.
            question_full_text = question_full_text.replace('\u2009', '')
            question_number, question_text = question_full_text.split(': ', 1)
            question_number = int(question_number.split('№')[-1])

            options_list = []
            # Reset per question so an unmarked question can never inherit the
            # index found for the previous one.
            answer_index = None
            option_elements = block.select('div.flex.flex-col.gap-1 > div.py-1.pr-2.flex.gap-2')
            for option_div in option_elements:
                # Extract the text from the second child div (the actual answer text)
                option_text_element = option_div.select_one("div:nth-child(2)")
                if option_text_element:
                    option_text = option_text_element.get_text(strip=True)
                    options_list.append(option_text)

                    # Check if this is the correct answer div (has the green background class)
                    if 'bg-green-100' in option_div.get('class', []):
                        answer_index = len(options_list) - 1

            if answer_index is None:
                raise ValueError(
                    f'no correct option found for question {question_number} '
                    f'in catalog {catalog_id!r}'
                )

            img_element = block.select_one('div.mb-6 img')
            if img_element:
                img_src = img_element.get('src')
                full_img_url = _BASE_URL + img_src
                img_response = requests.get(full_img_url, timeout=request_timeout)
                img_response.raise_for_status()
                raw_img = img_response.content
                img = np.array(PIL.Image.open(io.BytesIO(raw_img)))
            else:
                img = None

            questions.append(
                QuestionData(
                    question_number,
                    catalog_id,
                    question_text,
                    options_list,
                    answer_index,
                    img,
                )
            )
    return questions


# Index every scraped question by a number that is unique across all catalogs.
question_number_to_question_data = {}
for question in get_questions(_QUESTION_CATALOG_IDS):
    if question.catalog_identifier == 'by':
        # State specific questions start again at 1; shift them by 300 so they
        # do not collide with the numbers used by the other catalogs.
        question.number = question.number + 300
    question_number_to_question_data[question.number] = question

total_questions = len(question_number_to_question_data)
print('total questions = ', total_questions)

  0%|          | 0/11 [00:00<?, ?it/s]

total questions =  310


## Practice test

In [None]:
def conduct_test(
    question_numbers: list[int],
    question_number_to_question_data: dict[int, QuestionData]
):
    """Run an interactive practice test on the console.

    Each question is printed with its options in random order. The user is
    prompted until a valid option number is entered, and a summary of the wrongly
    answered question numbers is printed at the end.

    Args:
        question_numbers: Numbers of the questions to ask, in asking order.
        question_number_to_question_data: Lookup from question number to its data.
    """
    correct_count, wrongly_answered = 0, []
    for question_index, question_number in enumerate(question_numbers, start=1):
        question_data = question_number_to_question_data[question_number]
        options = np.array(question_data.options)
        num_options = len(options)
        # Randomize the option sequence but keep track of the correct choice.
        random_indices = list(np.random.permutation(num_options))
        print(f'{question_index}) {question_data.num_with_text}')
        for i, option in enumerate(options[random_indices]):
            print(f'{i+1}: {option}')
        while True:
            option = input(f'enter option[1-{num_options}]: ')
            # isdecimal() only accepts characters int() can parse; the accepted
            # range follows the number of options actually shown.
            if option.isdecimal() and 1 <= int(option) <= num_options:
                break
        selected_option = int(option) - 1
        # Position at which the correct option was displayed after shuffling.
        correct_option = random_indices.index(question_data.answer_index)
        if selected_option == correct_option:
            print('\033[34mcorrect\033[0m', end='\n\n')
            correct_count += 1
        else:
            print(f'\033[31mwrong\033[0m, the correct option is {correct_option+1}', end='\n\n')
            wrongly_answered.append(question_number)
    print(f'correctly answered {correct_count} out of {len(question_numbers)}')
    print(f'{len(wrongly_answered)} answers were wrong, the wrongly answered question numbers are:')
    print(sorted(list(map(int, wrongly_answered))))


# Draw 3 distinct questions from every block of 30 consecutive question numbers
# (10 blocks), plus 3 distinct state specific questions from the last 10 numbers.
general_picks = [
    block_index * 30 + np.random.choice(np.arange(30), 3, replace=False)
    for block_index in range(10)
]
state_picks = 300 + np.random.choice(np.arange(10), 3, replace=False)  # State specific questions.
# The picks are 0-based offsets; question numbers start at 1.
question_numbers = np.concatenate(general_picks + [state_picks]) + 1
# Shuffle only the first 30 entries in place; the state specific ones stay last.
np.random.shuffle(question_numbers[:30])
# conduct_test(question_numbers, question_number_to_question_data)

## Data Analysis

### Always correct words

The option is correct if it contains any of these words

In [None]:
def get_always_correct_word_to_question_numbers(
        question_number_to_question_data: dict[int, QuestionData]
) -> dict[str, list[int]]:
    """Find words that only ever occur in correct options.

    A word qualifies when it appears in at least one correct answer and in no
    wrong option of any question. Words are whitespace-separated tokens, so
    punctuation stays attached to them.

    Args:
        question_number_to_question_data: Lookup from question number to its data.

    Returns:
        Mapping from each qualifying word to the numbers of the questions whose
        correct answer contains it, sorted by how many questions the word
        covers (most first). Ties keep the order in which the words were first
        encountered, so the result is the same on every run.
    """
    words_in_correct_option, words_in_wrong_options = set(), set()
    for question_data in question_number_to_question_data.values():
        words_in_correct_option.update(question_data.answer.split())
        for option in question_data.options:
            if option != question_data.answer:
                words_in_wrong_options.update(option.split())
    words_only_in_correct_option = words_in_correct_option - words_in_wrong_options

    always_correct_word_to_question_numbers = collections.defaultdict(list)
    for question_number, question_data in question_number_to_question_data.items():
        # Walk the answer's own words (de-duplicated, order preserved) instead of
        # iterating the candidate set: set iteration order for strings changes
        # between interpreter runs, and this avoids scanning every candidate.
        for word in dict.fromkeys(question_data.answer.split()):
            if word in words_only_in_correct_option:
                always_correct_word_to_question_numbers[word].append(question_number)

    # Return the dict sorted by the number of questions each word can answer.
    # sorted() is stable, so equally frequent words keep their insertion order.
    return dict(
        sorted(
            always_correct_word_to_question_numbers.items(),
            key=lambda item: len(item[1]),
            reverse=True
        )
    )

always_correct_word_to_question_numbers = get_always_correct_word_to_question_numbers(
    question_number_to_question_data
)
# Distinct questions that can be answered through at least one such word.
questions_covered = set().union(*always_correct_word_to_question_numbers.values())
print(len(questions_covered))

# print top 10 results
# The loop variable is deliberately not called `question_numbers`: that name
# holds the practice-test selection and must not be overwritten here.
for word, covered_question_numbers in list(always_correct_word_to_question_numbers.items())[:10]:
    print(f'{word}: {covered_question_numbers}')

203
18: [94, 108, 246, 304]
Geldstrafe: [10, 96]
Sozialversicherung: [45, 97]
Volke: [52, 61]
zählen: [105, 106]
geheim.: [112, 122]
Thüringen: [197, 201]
Opfer: [206, 220]
Nationalsozialismus: [206, 220]
Luxemburg: [229, 238]


### Questions containing images

In [None]:
def show_image_questions(questions: list[QuestionData]):
    """Display every question that has an image, followed by its options.

    The correct option is printed in yellow (ANSI escape code 33); questions
    without an image are skipped.
    """
    for question in questions:
        image = question.question_image
        if image is None:
            continue
        # Header line first, then the picture, then the options.
        print(f'Q{question.number}: {question.question}')
        plt.imshow(image)
        plt.show()
        for option in question.options:
            is_correct = option == question.answer
            print(f'\033[33m{option}\033[0m' if is_correct else option)
        print()

# show_image_questions(question_number_to_question_data.values())

### Frequency of shortest/longest option being the correct answer

In [None]:
def get_correct_option_lengths(questions: list[QuestionData]):
    """Count how often the correct answer is the shortest or the longest option.

    Lengths are measured in characters. When several options share the minimum
    (or maximum) length, only the first of them counts as the shortest
    (longest), and "shortest" is checked before "longest".

    Returns:
        A tuple ``(shortest_correct, longest_correct, other)``.
    """
    tally = {'shortest': 0, 'longest': 0, 'other': 0}
    for question in questions:
        lengths = [len(option) for option in question.options]
        answer_position = question.answer_index
        if answer_position == np.argmin(lengths):
            bucket = 'shortest'
        elif answer_position == np.argmax(lengths):
            bucket = 'longest'
        else:
            bucket = 'other'
        tally[bucket] += 1
    # Every question must land in exactly one bucket.
    assert sum(tally.values()) == len(questions)
    return tally['shortest'], tally['longest'], tally['other']

total_shortest_correct, total_longest_correct, other = get_correct_option_lengths(
    question_number_to_question_data.values()
)
# Report every bucket as an absolute count and as a share of all questions.
for _label, _count in (
    ('shortest_correct', total_shortest_correct),
    ('longest_correct', total_longest_correct),
    ('other', other),
):
    print(f'{_label} = {_count} = {_count/total_questions*100:.2f}%')

shortest_correct = 77 = 24.84%
longest_correct = 74 = 23.87%
other = 159 = 51.29%


In [None]:
def get_option_to_count(questions: list[QuestionData]):
    """Count how many questions share each correct-answer text.

    Returns:
        A ``defaultdict(int)`` mapping answer text to its number of occurrences.
    """
    option_to_count = collections.defaultdict(int)
    for answer in (question.answer for question in questions):
        option_to_count[answer] = option_to_count[answer] + 1
    return option_to_count

option_to_count = get_option_to_count(question_number_to_question_data.values())
# Keep only answers shared by more than one question, most frequent first.
option_to_count = {
    option: count
    for option, count in sorted(
        option_to_count.items(), key=lambda item: item[1], reverse=True
    )
    if count > 1
}
print(option_to_count)

{'18': 3, '4': 3, 'Grundgesetz': 2, '1': 2, 'Ministerpräsident / Ministerpräsidentin': 2, 'Sozialversicherung': 2, 'der Bundespräsident / die Bundespräsidentin': 2, 'der Bundestag': 2, 'Sie zählen die Stimmen nach dem Ende der Wahl.': 2, '4 Jahre': 2, 'Judikative.': 2, '1933 bis 1945': 2, '1949': 2, '1961': 2, '5': 2, '2': 2}


Not a very useful result :/

### Filter question based on words

In [None]:
class FilterOption(enum.StrEnum):
    QUES_ONLY = enum.auto()
    OPT_ONLY_SINGLE = enum.auto()
    OPT_ONLY_COLLECTIVE = enum.auto()
    CORRECT_ONLY = enum.auto()
    FAILURE_RATE = enum.auto()

def filter_questions(
    questions: list[QuestionData],
    filter_func: Callable[[str], bool],
    filter_option: FilterOption,
) -> list[int]:
    filtered_question_numbers = []
    for question in questions:
        if filter_option == FilterOption.QUES_ONLY:
            if filter_func(question.question):
                filtered_question_numbers.append(question.number)
        elif filter_option == FilterOption.OPT_ONLY_SINGLE:
            if any(map(filter_func, question.options)):
                filtered_question_numbers.append(question.number)
        elif filter_option == FilterOption.OPT_ONLY_COLLECTIVE:
            if filter_func(question.options):
                filtered_question_numbers.append(question.number)
        elif filter_option == FilterOption.CORRECT_ONLY:
            if filter_func(question.answer):
                filtered_question_numbers.append(question.number)
        elif filter_option == FilterOption.FAILURE_RATE:
            if filter_func(question.failure_rate):
                filtered_question_numbers.append(question.number)
    return np.array(filtered_question_numbers)

In [None]:
def word_matcher(option):
    """True if the text contains 'nicht'."""
    return 'nicht' in option


def is_digit(option):
    """True if the text consists of digits only."""
    return option.isdigit()


def has_year(option):
    """True if any word of the text (dots act as separators) is a 4-digit number."""
    words = option.replace('.', ' ').split()
    return any(len(word) == 4 and word.isdigit() for word in words)


def has_digit(options):
    """True if every option contains at least one digit. Use with collective options."""
    return all(
        any(char.isdigit() for char in option) for option in options
    )
def is_name(option):
    """True if the text is exactly two title-cased words, like a first and last name."""
    parts = option.split()
    if len(parts) != 2:
        return False
    first, last = parts
    return first.istitle() and last.istitle()

# Detects a single option that stands out by length. Use with collective options only.
def relative_longer_than(options: list[str], ratio:float=1.7):
    """True if the longest option exceeds `ratio` times the mean option length."""
    lengths = np.array([len(option) for option in options])
    longest, average = lengths.max(), lengths.mean()
    return longest > ratio * average

# Try one filter: questions whose text contains 'nicht'.
question_numbers = filter_questions(
    question_number_to_question_data.values(), word_matcher, FilterOption.QUES_ONLY
)
print(question_numbers)

[  8  34  44  47  48  51  63  65 100 104 159 245 267 268 272 274 277 278
 281 285 289 290 291 310]


In [None]:
# Regular-expression patterns searched for in the question text.
ques_words_of_interest = [
    'nicht',  # For questions that ask the negation as a confusion tactic.
    'heiß',   # For questions that ask the name of something.
    'verb.*?t',  # 'verbiet' or 'verbot' for questions regarding forbidding.
    'Abkürzung',  # Short forms.
    '„.*?“', # Quoted questions,
    'minister',
    'Regierung',
    'Bund',
    'Hitler',
    'Weltkrieg',
    'Arbeit',
    'jüd',
    'Mauer',
    'Kind',
    'Schul',  # The comma matters: without it this fuses with 'bedeut'.
    'bedeut',
    'Ehe',  # Questions about marriage.
    'Nachbar',  # Neighboring countries.
]

# Patterns searched for in the individual answer options.
options_words_of_interest = [
    'Hitler', 'Gemein', 'Berlin',
    'Bund', 'spruch', 'Abend',
    'Eltern', 'wirtschaft', 'gleich',
]

Find question indices for words of interest

In [None]:
# A plain `lambda option: word in option` would also work as `filter_func`, but a
# regex search is more flexible because the words of interest may be patterns.
#
# Keys: 'no filter', then 'q_<pattern>' (matched against the question text),
# then 'o_<pattern>' (matched against each option), then the special filters.
filter_name_to_question_numbers = {
    'no filter': filter_questions(
        question_number_to_question_data.values(),
        filter_func=lambda option: True,
        filter_option=FilterOption.QUES_ONLY,
    ),
    **{
        f'q_{word}': filter_questions(
            question_number_to_question_data.values(),
            filter_func=lambda option: bool(re.search(word, option)),
            filter_option=FilterOption.QUES_ONLY,
        )
        for word in ques_words_of_interest
    },
    **{
        f'o_{word}': filter_questions(
            question_number_to_question_data.values(),
            filter_func=lambda option: bool(re.search(word, option)),
            filter_option=FilterOption.OPT_ONLY_SINGLE,
        )
        for word in options_words_of_interest
    },
    'digits in options': filter_questions(
        question_number_to_question_data.values(),
        filter_func=has_digit,
        filter_option=FilterOption.OPT_ONLY_COLLECTIVE,
    ),
    'one long option': filter_questions(
        question_number_to_question_data.values(),
        filter_func=relative_longer_than,
        filter_option=FilterOption.OPT_ONLY_COLLECTIVE,
    ),
    'is name': filter_questions(
        question_number_to_question_data.values(),
        filter_func=is_name,
        filter_option=FilterOption.CORRECT_ONLY,
    ),
}

# Number of questions matched by each filter.
filter_name_to_counts = {
    filter_name: len(matched_numbers)
    for filter_name, matched_numbers in filter_name_to_question_numbers.items()
}
print(filter_name_to_counts)

{'no filter': 310, 'q_nicht': 24, 'q_heiß': 13, 'q_verb.*?t': 4, 'q_Abkürzung': 7, 'q_„.*?“': 26, 'q_minister': 0, 'q_Regierung': 5, 'q_Bund': 72, 'q_Hitler': 3, 'q_Weltkrieg': 6, 'q_Arbeit': 12, 'q_jüd': 6, 'q_Mauer': 4, 'q_Kind': 11, 'q_Schulbedeut': 0, 'q_Ehe': 3, 'q_Nachbar': 5, 'o_Hitler': 4, 'o_Gemein': 8, 'o_Berlin': 8, 'o_Bund': 52, 'o_spruch': 3, 'o_Abend': 2, 'o_Eltern': 8, 'o_wirtschaft': 4, 'o_gleich': 5, 'digits in options': 38, 'one long option': 15, 'is name': 9}


### Visualize

In [None]:
def scatter_questions(
    question_number_to_data: dict[int, QuestionData],
    filter_name_to_question_numbers: dict[str, list[int]]
):
    """Show the matching question numbers of each filter as an interactive scatter plot.

    One trace is created per filter, and a dropdown switches between them.
    Question numbers are laid out in columns of 30 consecutive numbers: the
    column is ``(number - 1) // 30`` and the height inside it is
    ``(number - 1) % 30``. Hovering a number shows the question with its
    options, the correct one highlighted.

    Args:
        question_number_to_data: Lookup from question number to its data.
        filter_name_to_question_numbers: Filter name mapped to the question
            numbers it matched (an integer array or list).
    """
    fig = go.Figure()

    for fig_index, (filter_name, question_numbers) in enumerate(
        filter_name_to_question_numbers.items()
    ):
        question_numbers = np.asarray(question_numbers)
        if question_numbers.size == 0:  # No questions found for this word.
            # Keep a placeholder so trace indices stay aligned with the dropdown.
            fig.add_trace(go.Scatter(x=[], y=[], visible=False))
            continue

        # Both coordinates are derived from the 0-based number so that the
        # numbers 1..30 share column 0, 31..60 share column 1, and so on.
        bin_indices = (question_numbers - 1) // 30
        heights = (question_numbers - 1) % 30
        x_positions = 30 * (bin_indices + 0.5)  # Centre of the column.
        hovertexts = []

        for question_number in question_numbers:
            question_data = question_number_to_data[question_number]
            hovertext = question_data.num_with_text + '<br><br>'
            for option in question_data.options:
                if option == question_data.answer:
                    hovertext += f'<span style="color: yellow;">{option}</span><br>'
                else:
                    hovertext += f'{option}<br>'
            hovertexts.append(hovertext)

        fig.add_trace(
            go.Scatter(
                x=x_positions, y=heights,
                mode='text', text=question_numbers.astype(str), textfont=dict(size=14),
                hovertext=hovertexts, hoverinfo='text',
                visible=(fig_index==0)  # Only first trace visible initially.
            )
        )

    dropdown_buttons = []  # Create dropdown buttons for each word of interest.
    for i, filter_name in enumerate(filter_name_to_question_numbers):
        visibility = [False] * len(filter_name_to_question_numbers)
        visibility[i] = True

        if filter_name.startswith('q_'):
            title = f'Questions containing "{filter_name[2:]}"'
        elif filter_name.startswith('o_'):
            title = f'Options containing "{filter_name[2:]}"'
        elif 'option' in filter_name:  # Special case for numeric options.
            title = f'Questions containing {filter_name}'
        else:
            title = filter_name
        dropdown_buttons.append(
            dict(
                label=filter_name,
                method='update',
                args=[{'visible': visibility}, {'title': title}],
            )
        )

    tick_vals = 30 * np.arange(1, 11)
    fig.update_layout(
        xaxis=dict(title='Question number', tickvals=tick_vals, ticktext=tick_vals),
        yaxis=dict(visible=False), updatemenus=[dict(buttons=dropdown_buttons, x=0.0)],
        title=dropdown_buttons[0]['args'][1]['title'] if dropdown_buttons else '',
    )

    fig.show()


# Render the interactive scatter plot for all filters collected so far.
scatter_questions(question_number_to_question_data, filter_name_to_question_numbers)

Questions involving year

In [None]:
def get_year_to_question_indices(question_number_to_data: dict[int, QuestionData]):
    """Group questions by the 4-digit numbers that occur in their correct answer.

    Dots in the answer are treated as word separators. A question is listed
    under every 4-digit word of its answer.

    Returns:
        A ``defaultdict(list)`` mapping each such number (as int) to the
        question numbers whose correct answer contains it.
    """
    year_to_question_numbers = collections.defaultdict(list)
    numbers_with_year = filter_questions(
        question_number_to_data.values(), has_year, FilterOption.CORRECT_ONLY
    )
    for question_number in numbers_with_year:
        tokens = question_number_to_data[question_number].answer.replace('.', ' ').split()
        for year in (int(token) for token in tokens if len(token) == 4 and token.isdigit()):
            year_to_question_numbers[year].append(question_number)
    return year_to_question_numbers


year_to_question_numbers = get_year_to_question_indices(question_number_to_question_data)
# sort based on key
year_to_question_numbers = dict(sorted(year_to_question_numbers.items()))
print(list(map(int, year_to_question_numbers)))
# 1700 is a false positive. pop() with a default keeps the cell working even if
# the scraped data no longer contains it (`del` would raise KeyError).
year_to_question_numbers.pop(1700, None)
# TODO: Add 219.

[1700, 1933, 1938, 1944, 1945, 1949, 1961, 1989, 1990, 2002]


In [None]:
def plot_chronological_questions(
    year_to_question_numbers: dict[int, list[int]],
    question_number_to_data: dict[int, QuestionData],
):
    """Plot question numbers over the year found in their correct answer.

    Years go on a categorical x axis in ascending order. Questions that share
    a year are stacked vertically. Hovering a number shows the question with
    its options, the correct one highlighted.
    """
    x_pos_year, y_positions, all_question_numbers, hovertexts = [], [], [], []
    for year in sorted(year_to_question_numbers):
        for stack_index, question_number in enumerate(year_to_question_numbers[year]):
            question = question_number_to_data[question_number]
            option_lines = [
                f'<span style="color: yellow;">{option}</span><br>'
                if option == question.answer
                else f'{option}<br>'
                for option in question.options
            ]
            x_pos_year.append(year)
            y_positions.append(stack_index)
            all_question_numbers.append(str(question_number))
            hovertexts.append(question.num_with_text + '<br><br>' + ''.join(option_lines))

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=x_pos_year, y=y_positions,
            mode='text', text=all_question_numbers, textfont=dict(size=14),
            hovertext=hovertexts, hoverinfo='text',
        )
    )
    fig.update_layout(
        title='Questions with Year',
        xaxis=dict(title='Year', type='category', categoryorder='category ascending'),
        yaxis=dict(visible=False),
    )

    fig.show()

# Render the timeline of the questions whose correct answer contains a year.
plot_chronological_questions(year_to_question_numbers, question_number_to_question_data)

## Get the failure rate

Please don't call `get_failure_rates()` too many times, or else you will push all the failure rates towards 0.75 (3/4), since it picks one of the 4 choices at random for every question.

I have already saved the precomputed failure rates and assigned them to the `question_number_to_failure_rate` dictionary below. Use that instead.

In [None]:
def get_failure_rates():
    """Collect the failure rate of every question by repeatedly taking the online test.

    Each pass opens a fresh session and answers the 33 questions of one mock
    test with a randomly chosen option. The page shown after each answer
    reports the question number and its failure rate ("Fehlerquote"). Passes
    repeat until a rate has been recorded for all questions.

    NOTE: every call submits random answers to the website, so call it sparingly.

    Returns:
        Array of length 310 with rates in [0, 1]. Index ``i`` belongs to
        question number ``i + 1``; state specific questions are offset by 300.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    total_questions = 310
    request_timeout = 10  # Seconds, for every request.
    # -1 marks "not seen yet"; real rates are never negative.
    failure_rates = -np.ones(total_questions)
    url = 'https://www.lebenindeutschland.eu/test/by'

    failure_rates_pbar = tqdm(total=total_questions, desc='questions seen')
    try:
        while not np.all(failure_rates >= 0):
            # The context manager releases the session's connections after each pass.
            with requests.Session() as session:
                # Mocks the exam which has 33 questions. The last 3 are state specific.
                for question_num in tqdm(range(33), desc='taking the test', leave=False):
                    # Get the current question page
                    response_get = session.get(url, timeout=request_timeout)
                    soup_before = BeautifulSoup(response_get.content, 'html.parser')

                    # Find the form and extract data
                    form = soup_before.select_one('form[method="post"]')
                    form_action = form.get('action')

                    hidden_inputs = {}
                    for input_tag in form.select('input[type="hidden"]'):
                        hidden_inputs[input_tag.get('name')] = input_tag.get('value')

                    options = form.select('input[type="radio"][name="question-answer"]')
                    # Select a random option among those actually offered.
                    selected_answer_value = options[np.random.randint(len(options))].get('value')

                    # Prepare POST data (copied so the hidden inputs stay untouched).
                    post_data = dict(hidden_inputs)
                    post_data['question-answer'] = selected_answer_value

                    # Submit the answer
                    response_post = session.post(
                        form_action, data=post_data, timeout=request_timeout
                    )
                    soup_after = BeautifulSoup(response_post.content, 'html.parser')

                    # Extract Fehlerquote from the response page, this element appears after submission.
                    fehlerquote_element = soup_after.select_one('p.text-right.text-sm')
                    # 'Frage №\u200929.\n            Fehlerquote: 6,4%'
                    fehlerquote_and_ques = fehlerquote_element.get_text(strip=True)
                    question_text, fehlerquote_text = fehlerquote_and_ques.split('Fehlerquote: ')
                    # Subtract 1 from ques num because arr indices start from 0.
                    question_index = int(question_text.split('\u2009')[1].split('.\n')[0]) - 1
                    if question_num >= 30:  # Last 3 questions are state specific.
                        question_index += 300
                    # Drop the trailing '%'.
                    fehlerquote_text = fehlerquote_text[:-1]
                    # German has comma ',' instead of point '.'.
                    fehlerquote_value = float(fehlerquote_text.replace(',', '.')) / 100
                    failure_rates[question_index] = fehlerquote_value

                    # Updates the progress bar.
                    filled_count = np.sum(failure_rates >= 0)
                    failure_rates_pbar.n = filled_count
                    failure_rates_pbar.refresh()
    finally:
        failure_rates_pbar.close()
    return failure_rates

# failure_rates = get_failure_rates()
# print(list(map(float, np.round(failure_rates, 15))))

In [None]:
# Precomputed failure rates.
question_number_to_failure_rate = {1: 0.209, 2: 0.152, 3: 0.114, 4: 0.08, 5: 0.204, 6: 0.217, 7: 0.174, 8: 0.185, 9: 0.068, 10: 0.113, 11: 0.271, 12: 0.208, 13: 0.461, 14: 0.308, 15: 0.241, 16: 0.279, 17: 0.312, 18: 0.469, 19: 0.286, 20: 0.315, 21: 0.098, 22: 0.105, 23: 0.224, 24: 0.093, 25: 0.242, 26: 0.081, 27: 0.156, 28: 0.157, 29: 0.064, 30: 0.243, 31: 0.293, 32: 0.183, 33: 0.343, 34: 0.09, 35: 0.085, 36: 0.136, 37: 0.147, 38: 0.065, 39: 0.283, 40: 0.156, 41: 0.122, 42: 0.273, 43: 0.213, 44: 0.364, 45: 0.107, 46: 0.12, 47: 0.132, 48: 0.409, 49: 0.196, 50: 0.324, 51: 0.236, 52: 0.174, 53: 0.35, 54: 0.399, 55: 0.255, 56: 0.221, 57: 0.407, 58: 0.354, 59: 0.264, 60: 0.409, 61: 0.25, 62: 0.471, 63: 0.336, 64: 0.229, 65: 0.473, 66: 0.111, 67: 0.281, 68: 0.31, 69: 0.259, 70: 0.295, 71: 0.093, 72: 0.055, 73: 0.235, 74: 0.124, 75: 0.095, 76: 0.133, 77: 0.143, 78: 0.157, 79: 0.147, 80: 0.287, 81: 0.346, 82: 0.368, 83: 0.315, 84: 0.298, 85: 0.601, 86: 0.58, 87: 0.394, 88: 0.344, 89: 0.269, 90: 0.602, 91: 0.453, 92: 0.12, 93: 0.325, 94: 0.114, 95: 0.095, 96: 0.225, 97: 0.098, 98: 0.46, 99: 0.199, 100: 0.233, 101: 0.289, 102: 0.416, 103: 0.287, 104: 0.151, 105: 0.194, 106: 0.184, 107: 0.056, 108: 0.099, 109: 0.198, 110: 0.229, 111: 0.285, 112: 0.135, 113: 0.276, 114: 0.153, 115: 0.236, 116: 0.174, 117: 0.086, 118: 0.193, 119: 0.213, 120: 0.249, 121: 0.095, 122: 0.215, 123: 0.245, 124: 0.474, 125: 0.347, 126: 0.335, 127: 0.424, 128: 0.406, 129: 0.331, 130: 0.105, 131: 0.194, 132: 0.139, 133: 0.148, 134: 0.262, 135: 0.12, 136: 0.065, 137: 0.123, 138: 0.21, 139: 0.21, 140: 0.228, 141: 0.287, 142: 0.287, 143: 0.205, 144: 0.197, 145: 0.261, 146: 0.18, 147: 0.202, 148: 0.251, 149: 0.212, 150: 0.26, 151: 0.119, 152: 0.067, 153: 0.138, 154: 0.153, 155: 0.129, 156: 0.162, 157: 0.073, 158: 0.177, 159: 0.325, 160: 0.054, 161: 0.305, 162: 0.211, 163: 0.241, 164: 0.272, 165: 0.219, 166: 0.23, 167: 0.17, 168: 0.166, 169: 0.297, 170: 0.38, 171: 0.282, 172: 0.097, 173: 0.144, 174: 0.22, 
175: 0.296, 176: 0.336, 177: 0.089, 178: 0.226, 179: 0.298, 180: 0.204, 181: 0.182, 182: 0.113, 183: 0.243, 184: 0.363, 185: 0.286, 186: 0.273, 187: 0.135, 188: 0.205, 189: 0.176, 190: 0.064, 191: 0.166, 192: 0.239, 193: 0.116, 194: 0.19, 195: 0.213, 196: 0.193, 197: 0.172, 198: 0.252, 199: 0.324, 200: 0.244, 201: 0.215, 202: 0.164, 203: 0.406, 204: 0.306, 205: 0.345, 206: 0.199, 207: 0.175, 208: 0.238, 209: 0.338, 210: 0.432, 211: 0.402, 212: 0.092, 213: 0.096, 214: 0.16, 215: 0.392, 216: 0.176, 217: 0.276, 218: 0.344, 219: 0.272, 220: 0.336, 221: 0.212, 222: 0.048, 223: 0.053, 224: 0.076, 225: 0.076, 226: 0.019, 227: 0.127, 228: 0.339, 229: 0.071, 230: 0.096, 231: 0.144, 232: 0.307, 233: 0.089, 234: 0.153, 235: 0.213, 236: 0.196, 237: 0.224, 238: 0.052, 239: 0.26, 240: 0.135, 241: 0.136, 242: 0.115, 243: 0.162, 244: 0.087, 245: 0.033, 246: 0.076, 247: 0.12, 248: 0.09, 249: 0.072, 250: 0.1, 251: 0.166, 252: 0.201, 253: 0.215, 254: 0.189, 255: 0.114, 256: 0.162, 257: 0.136, 258: 0.159, 259: 0.273, 260: 0.28, 261: 0.128, 262: 0.273, 263: 0.276, 264: 0.197, 265: 0.184, 266: 0.031, 267: 0.086, 268: 0.074, 269: 0.073, 270: 0.182, 271: 0.105, 272: 0.092, 273: 0.146, 274: 0.188, 275: 0.184, 276: 0.105, 277: 0.086, 278: 0.124, 279: 0.199, 280: 0.216, 281: 0.212, 282: 0.198, 283: 0.261, 284: 0.205, 285: 0.271, 286: 0.229, 287: 0.17, 288: 0.305, 289: 0.135, 290: 0.198, 291: 0.27, 292: 0.108, 293: 0.125, 294: 0.147, 295: 0.047, 296: 0.148, 297: 0.158, 298: 0.203, 299: 0.145, 300: 0.274, 301: 0.033, 302: 0.131, 303: 0.283, 304: 0.076, 305: 0.058, 306: 0.169, 307: 0.016, 308: 0.03, 309: 0.16, 310: 0.113}
# Report the hardest question(s): every question number that reaches the maximum failure rate.
max_failure_rate = max(question_number_to_failure_rate.values())
print('question numbers with max failure rate of', max_failure_rate)
print(
    *(
        number
        for number, rate in question_number_to_failure_rate.items()
        if rate == max_failure_rate
    ),
    sep='\n',
)

# Attach the precomputed failure rate to each scraped question.
for question_number, failure_rate in question_number_to_failure_rate.items():
    question_number_to_question_data[question_number].failure_rate = failure_rate

# Extra filter: questions that more than 40% of the test takers get wrong.
filter_name_to_question_numbers['tough questions'] = filter_questions(
    question_number_to_question_data.values(),
    filter_func=lambda failure_rate: failure_rate > 0.4,
    filter_option=FilterOption.FAILURE_RATE,
)

question numbers with max failure rate of 0.602
90


Using a heatmap since the data has a matrix form. \
Still, the visualization above is more useful because its hover text is shown only for the plotted values.

In [None]:
def plot_questions_heatmap(
    question_number_to_data: dict[int, QuestionData],
    filter_name_to_question_numbers: dict[str, list[int]]
):
    """Show the matching questions of each filter as a heatmap coloured by failure rate.

    One heatmap trace is created per filter, and a dropdown switches between
    them. The grid has 30 rows and 11 columns: a question sits in column
    ``(number - 1) // 30`` and row ``(number - 1) % 30``. Cells of matching
    questions carry the question number as text and the failure rate as
    colour; all other cells stay at 0.

    Args:
        question_number_to_data: Lookup from question number to its data.
        filter_name_to_question_numbers: Filter name mapped to the question
            numbers it matched (an integer array or list).
    """
    fig = go.Figure()

    grid_shape = (30, 11)  # Rows x columns; column k holds numbers 30k+1 .. 30k+30.

    # Create a heatmap trace for each filter
    for fig_index, (filter_name, question_numbers) in enumerate(
        filter_name_to_question_numbers.items()
    ):
        question_numbers = np.asarray(question_numbers)
        if question_numbers.size == 0:  # No questions found for this word.
            # Keep a placeholder so trace indices stay aligned with the dropdown.
            fig.add_trace(go.Heatmap(z=np.zeros(grid_shape), visible=False))
            continue

        bin_indices = (question_numbers - 1) // 30
        heights = (question_numbers - 1) % 30

        # Fresh matrices for every trace, so no trace depends on buffers that a
        # later iteration overwrites.
        heatmap = np.zeros(grid_shape)
        textmap = np.full(grid_shape, '', dtype='object')
        hovertextmap = np.full(grid_shape, '', dtype='object')

        for row, column, question_number in zip(heights, bin_indices, question_numbers):
            question_data = question_number_to_data[question_number]
            hovertext = question_data.num_with_text + '<br><br>'
            for option in question_data.options:
                if option == question_data.answer:
                    hovertext += f'<span style="color: yellow;">{option}</span><br>'
                else:
                    hovertext += f'{option}<br>'
            hovertext += '<br>Failure rate: ' + str(question_data.failure_rate)

            # Assign to arrays
            textmap[row, column] = question_data.number
            hovertextmap[row, column] = hovertext
            # Store the rate as a number; a missing rate becomes NaN instead of
            # failing on the text 'None'.
            failure_rate = question_data.failure_rate
            heatmap[row, column] = np.nan if failure_rate is None else failure_rate

        # Add heatmap trace
        fig.add_trace(
            go.Heatmap(
                z=heatmap,
                x=30 * np.arange(grid_shape[1]),  # One x coordinate per column.
                text=textmap,
                texttemplate="%{text}",
                colorscale='Viridis',
                colorbar=dict(title='Failure rate'),
                textfont=dict(size=14),
                hovertext=hovertextmap,
                hoverinfo='text',
                visible=(fig_index==0)  # Only first trace visible initially
            )
        )

    # Create dropdown buttons
    dropdown_buttons = []
    for i, filter_name in enumerate(filter_name_to_question_numbers):
        visibility = [False] * len(filter_name_to_question_numbers)
        visibility[i] = True

        if filter_name.startswith('q_'):
            title = f'Questions containing "{filter_name[2:]}"'
        elif filter_name.startswith('o_'):
            title = f'Options containing "{filter_name[2:]}"'
        elif 'option' in filter_name:  # Special case for numeric options.
            title = f'Questions containing {filter_name}'
        else:
            title = filter_name
        dropdown_buttons.append(
            dict(
                label=filter_name,
                method='update',
                args=[{'visible': visibility}, {'title': title}],
            )
        )

    tick_vals = 30 * np.arange(11)
    fig.update_layout(
        xaxis=dict(title='Question number', tickvals=tick_vals, ticktext=tick_vals),
        yaxis=dict(visible=False), updatemenus=[dict(buttons=dropdown_buttons, x=0.0)],
        title=dropdown_buttons[0]['args'][1]['title'] if dropdown_buttons else '',
    )
    fig.show()


# Render the failure-rate heatmap for all filters, including 'tough questions'.
plot_questions_heatmap(question_number_to_question_data, filter_name_to_question_numbers)