In [10]:
import xml.etree.ElementTree as ET

# Data placeholders that carry a group index; the template group uses index 1.
_DATA_PLACEHOLDER_PREFIXES = (
    "PLACEHOLDER_QUESTION_",
    "PLACEHOLDER_ANSWER_",
    "PLACEHOLDER_CONTEXT_",
    "PLACEHOLDER_ID_",
    "PLACEHOLDER_LANG_",
)
# Question-code stems that carry a group index (e.g. "ans1", "qid1").
_CODE_PLACEHOLDER_PREFIXES = ("ans", "qid", "langq", "comment")


def _child_text(row, tag):
    """Return the text of ``row``'s child ``tag``, or None when the child is absent."""
    child = row.find(tag)
    return None if child is None else child.text


def _clone_row(row):
    """Return an independent copy of ``row`` via an XML serialize/parse round trip."""
    return ET.fromstring(ET.tostring(row))


def _shift_order(row, tag, offset):
    """Add ``offset`` to the numeric text of child ``tag``; leave missing/empty/non-numeric values alone."""
    child = row.find(tag)
    if child is not None and child.text and child.text.isdigit():
        child.text = str(int(child.text) + offset)


def _renumber_child(row, tag, number, prefixes):
    """Rewrite every ``<prefix>1`` in the text of child ``tag`` to ``<prefix><number>``."""
    child = row.find(tag)
    if child is None or not child.text:
        return
    text = child.text
    for prefix in prefixes:
        text = text.replace(f"{prefix}1", f"{prefix}{number}")
    child.text = text


def _rows_by_qid(rows_elem):
    """Group the ``<row>`` children of ``rows_elem`` by the text of their ``<qid>`` child."""
    grouped = {}
    for row in rows_elem.findall("row"):
        grouped.setdefault(_child_text(row, "qid"), []).append(row)
    return grouped


def add_question_groups(
    input_lss: str,
    output_lss: str,
    original_group_id: str,
    new_group_count: int,
    gid_start: int,
    qid_start: int,
    questions_per_group: int
):
    """
    Clone one group of a LimeSurvey .lss file until ``new_group_count`` groups exist.

    The group with gid ``original_group_id`` is the template (index 1). Every copy
    receives a fresh gid, fresh qids for all of its questions and subquestions,
    copies of the matching answer and question-attribute rows, and has the index
    in its placeholders ("PLACEHOLDER_QUESTION_1", "ans1", "qid1", "langq1",
    "comment1", ...) rewritten from 1 to the copy's own index.

    Args:
        input_lss: Path of the .lss file to read.
        output_lss: Path the extended .lss file is written to.
        original_group_id: gid of the template group (compared as text).
        new_group_count: Total number of groups wanted, template included, so
            ``new_group_count - 1`` copies are created.
        gid_start: gid given to the first copy; later copies count up from it.
        qid_start: qid given to the first cloned question; later ones count up.
        questions_per_group: Number of consecutive groups (template included)
            that share one ``randomgroup`` value in ``<grelevance>``.

    Raises:
        ValueError: If ``questions_per_group`` is smaller than 1, if a required
            section or its ``<rows>`` child is missing, or if the template
            group cannot be found.
    """
    if questions_per_group < 1:
        raise ValueError("questions_per_group must be a positive integer.")
    original_group_id = str(original_group_id)

    # 1) Parse the LSS and locate the main sections.
    tree = ET.parse(input_lss)
    root = tree.getroot()

    # Compare against None explicitly: an Element without children is falsy, so
    # truth-testing would reject a perfectly valid but empty <rows> element.
    sections = [
        root.find(f"./{name}")
        for name in ("groups", "questions", "subquestions", "answers")
    ]
    if any(section is None for section in sections):
        raise ValueError("Could not locate <groups>, <questions>, <subquestions>, or <answers> in the LSS.")

    row_containers = [section.find("rows") for section in sections]
    if any(container is None for container in row_containers):
        raise ValueError("One of the <rows> sections is missing inside <groups>, <questions>, <subquestions>, or <answers>.")
    group_rows, question_rows, subquestion_rows, answer_rows = row_containers

    # question_attributes is optional (it carries random_order, hidden, ...).
    qattributes_elem = root.find("./question_attributes")
    qattr_rows = qattributes_elem.find("rows") if qattributes_elem is not None else None

    # 2) Template group rows (one per language).
    original_group_entries = [
        row for row in group_rows.findall("row")
        if _child_text(row, "gid") == original_group_id
    ]
    if not original_group_entries:
        raise ValueError(f"Could not find group with gid={original_group_id}.")

    # 3) Questions and subquestions that belong to the template group.
    these_question_rows = [
        row for row in question_rows.findall("row")
        if _child_text(row, "gid") == original_group_id
    ]
    these_subquestion_rows = [
        row for row in subquestion_rows.findall("row")
        if _child_text(row, "gid") == original_group_id
    ]
    # Sorted so that qid assignment and the order of cloned rows are reproducible.
    original_qids = sorted({
        row.find("qid").text
        for row in these_question_rows + these_subquestion_rows
    })

    # Index answers/attributes once; new rows are only attached after the loop,
    # so these lookups always see the original rows.
    answers_by_qid = _rows_by_qid(answer_rows)
    attrs_by_qid = _rows_by_qid(qattr_rows) if qattr_rows is not None else {}

    # 4) One template already exists, so new_group_count=2 means one extra copy.
    additional_copies = max(new_group_count - 1, 0)
    next_qid = qid_start

    all_new_group_rows = []
    all_new_question_rows = []
    all_new_subquestion_rows = []
    all_new_answer_rows = []
    all_new_qattr_rows = []

    text_prefixes = _DATA_PLACEHOLDER_PREFIXES + _CODE_PLACEHOLDER_PREFIXES

    for i in range(1, additional_copies + 1):
        # Template is index 1, so copy i carries index i + 1 ("ans1" -> "ans2" for i=1).
        placeholder_number = i + 1
        # The template sits at position 0 and copy i at position i; every run of
        # questions_per_group positions shares one randomgroup value, starting at 1.
        randomgroup_val = (i // questions_per_group) + 1
        new_gid = str(gid_start + i - 1)

        # Fresh, consecutive qids for this copy.
        qid_map = {
            old_qid: str(next_qid + offset)
            for offset, old_qid in enumerate(original_qids)
        }
        next_qid += len(original_qids)

        # (A) Group rows, one per language.
        for orig_grp in original_group_entries:
            new_grp = _clone_row(orig_grp)
            new_grp.find("gid").text = new_gid
            _shift_order(new_grp, "group_order", i)
            _renumber_child(new_grp, "description", placeholder_number, text_prefixes)

            grel_el = new_grp.find("grelevance")
            if grel_el is not None:
                old_expr = grel_el.text or ""
                if "==1" in old_expr:
                    grel_el.text = old_expr.replace("==1", f"=={randomgroup_val}")
                elif not old_expr.strip():
                    grel_el.text = f"randomgroup=={randomgroup_val}"
                else:
                    # Unrecognised expression: keep it untouched.
                    grel_el.text = old_expr

            all_new_group_rows.append(new_grp)

        # (B) Questions.
        for qrow in these_question_rows:
            new_q = _clone_row(qrow)
            new_q.find("gid").text = new_gid
            new_q.find("qid").text = qid_map[qrow.find("qid").text]
            _shift_order(new_q, "question_order", i)
            # Titles are question codes, so only the code stems are renumbered there.
            _renumber_child(new_q, "title", placeholder_number, _CODE_PLACEHOLDER_PREFIXES)
            _renumber_child(new_q, "question", placeholder_number, text_prefixes)
            all_new_question_rows.append(new_q)

        # (C) Subquestions (their parent must belong to the same group).
        for sqrow in these_subquestion_rows:
            new_sq = _clone_row(sqrow)
            new_sq.find("qid").text = qid_map[sqrow.find("qid").text]
            new_sq.find("parent_qid").text = qid_map[sqrow.find("parent_qid").text]
            new_sq.find("gid").text = new_gid
            _shift_order(new_sq, "question_order", i)
            _renumber_child(new_sq, "title", placeholder_number, _CODE_PLACEHOLDER_PREFIXES)
            _renumber_child(new_sq, "question", placeholder_number, text_prefixes)
            all_new_subquestion_rows.append(new_sq)

        # (D) Answers and (E) question attributes of every cloned qid.
        for old_qid in original_qids:
            new_qid = qid_map[old_qid]

            for ansrow in answers_by_qid.get(old_qid, []):
                new_ans = _clone_row(ansrow)
                new_ans.find("qid").text = new_qid
                _renumber_child(new_ans, "answer", placeholder_number, text_prefixes)
                all_new_answer_rows.append(new_ans)

            for attrrow in attrs_by_qid.get(old_qid, []):
                new_attr = _clone_row(attrrow)
                new_attr.find("qid").text = new_qid
                all_new_qattr_rows.append(new_attr)

    # 5) Attach the new rows to their sections.
    group_rows.extend(all_new_group_rows)
    question_rows.extend(all_new_question_rows)
    subquestion_rows.extend(all_new_subquestion_rows)
    answer_rows.extend(all_new_answer_rows)
    if qattr_rows is not None:
        qattr_rows.extend(all_new_qattr_rows)

    # 6) Write the extended survey to disk.
    tree.write(output_lss, encoding="utf-8", xml_declaration=True)
    print(f"Done! Created {additional_copies} new copy/copies of group {original_group_id}.")


In [2]:
import pandas as pd
from markdown import markdown
import re

def convert_markdown_to_html_with_target_blank(df, markdown_column, html_column):
    """
    Render a Markdown column as HTML whose links open in a new browser tab.

    The result is written into ``df[html_column]`` on the DataFrame that was
    passed in; that same DataFrame is returned for convenience.

    Args:
        df (pd.DataFrame): Frame holding the Markdown text.
        markdown_column (str): Column to read Markdown from.
        html_column (str): Column that receives the rendered HTML.

    Returns:
        pd.DataFrame: ``df`` itself, now containing ``html_column``.

    Raises:
        ValueError: If ``markdown_column`` is not a column of ``df``.
    """
    if markdown_column not in df.columns:
        raise ValueError(f"Column '{markdown_column}' not found in the DataFrame.")

    # Matches the opening of an anchor tag up to and including its href value.
    anchor_open = re.compile(r'(<a href="[^"]+")')

    def render_cell(cell):
        # Non-string cells (e.g. missing values) are passed through unchanged.
        if not isinstance(cell, str):
            return cell
        rendered = markdown(cell)
        return anchor_open.sub(r'\1 target="_blank"', rendered)

    df[html_column] = df[markdown_column].apply(render_cell)
    return df


def extract_context_links(df, context_column, output_column, no_context_text):
    """
    Turn the source URLs named in a context column into an HTML bullet list.

    A URL is picked up when it directly follows the literal text
    'Information taken from:'. Cells without such a URL, and cells that are
    not strings, get ``<p>no_context_text</p>`` instead. The result is stored
    in ``df[output_column]`` on the DataFrame that was passed in.

    Args:
        df (pd.DataFrame): Frame holding the chatbot context.
        context_column (str): Column to read the context text from.
        output_column (str): Column that receives the HTML.
        no_context_text (str): Message shown when no link is available.

    Returns:
        pd.DataFrame: ``df`` itself, now containing ``output_column``.
    """
    fallback_html = f"<p>{no_context_text}</p>"
    source_pattern = re.compile(r'Information taken from:(https?://[^\s]+)')

    def to_link_list(cell):
        if not isinstance(cell, str):
            return fallback_html
        urls = source_pattern.findall(cell)
        if not urls:
            return fallback_html
        items = [
            f'<li><a href="{url}" target="_blank">{url}</a></li>'
            for url in urls
        ]
        return "<ul>" + "".join(items) + "</ul>"

    df[output_column] = df[context_column].apply(to_link_list)
    return df

In [8]:
import pandas as pd
import xml.etree.ElementTree as ET
from random import seed as py_seed

def build_shuffled_questions_and_clone(
    df_en: pd.DataFrame,
    df_de: pd.DataFrame,
    input_lss: str,
    output_lss: str,
    original_group_id: str,
    gid_start: int,
    qid_start: int,
    questions_per_group: int,
    shuffle_seed: int = 42
):
    """
    Build one survey group per question and fill it with that question's data.

    Steps:
      1. Reduce the English and German frames to the shared columns
         [question_text, answer_html, context_html, question_id, language].
      2. Stack both frames (English rows first) into one frame.
      3. Shuffle the stacked frame with ``shuffle_seed``; passing ``None``
         keeps the stacked order.
      4. Call ``add_question_groups`` so the .lss holds as many groups as the
         stacked frame has rows.
      5. Call ``_fill_placeholders_with_data`` so the PLACEHOLDER_*_i tokens of
         group i are replaced by the values of row i-1.

    Args:
        df_en: English questions; must provide english_question_text_q,
            chatbot_answer_en_html, formatted_context_en_html, question_id_q
            and question_language_q.
        df_de: German questions; must provide german_question_text_q,
            chatbot_answer_de_html, formatted_context_de_html, question_id_q
            and question_language_q.
        input_lss: Path of the template .lss file.
        output_lss: Path the finished .lss file is written to.
        original_group_id: gid of the template group to clone.
        gid_start: First gid used for cloned groups.
        qid_start: First qid used for cloned questions.
        questions_per_group: Forwarded to ``add_question_groups``.
        shuffle_seed: Seed for the shuffle, or ``None`` to skip shuffling.

    Returns:
        pd.DataFrame: The stacked (and possibly shuffled) frame, for inspection.
    """
    # Source column -> unified column, per language.
    en_columns = {
        "english_question_text_q": "question_text",
        "chatbot_answer_en_html": "answer_html",
        "formatted_context_en_html": "context_html",
        "question_id_q": "question_id",
        "question_language_q": "language",
    }
    de_columns = {
        "german_question_text_q": "question_text",
        "chatbot_answer_de_html": "answer_html",
        "formatted_context_de_html": "context_html",
        "question_id_q": "question_id",
        "question_language_q": "language",
    }

    unified_frames = [
        frame[list(columns)].rename(columns=columns)
        for frame, columns in ((df_en, en_columns), (df_de, de_columns))
    ]
    df_together = pd.concat(unified_frames, ignore_index=True)

    if shuffle_seed is not None:
        shuffled = df_together.sample(frac=1, random_state=shuffle_seed)
        df_together = shuffled.reset_index(drop=True)

    # One survey group per question row.
    add_question_groups(
        input_lss=input_lss,
        output_lss=output_lss,
        original_group_id=original_group_id,
        new_group_count=len(df_together),
        gid_start=gid_start,
        qid_start=qid_start,
        questions_per_group=questions_per_group
    )

    # The cloned groups still contain placeholders; swap in the real content.
    _fill_placeholders_with_data(output_lss, df_together)

    return df_together


def _fill_placeholders_with_data(output_lss: str, df_together: pd.DataFrame):
    """
    Internal helper to parse the newly generated LSS (output_lss) 
    and replace placeholder text with real data from df_together.

    For i in [1..N], we replace:
      PLACEHOLDER_QUESTION_{i} -> df_together.iloc[i-1]["question_text"]
      PLACEHOLDER_ANSWER_{i}   -> df_together.iloc[i-1]["answer_html"]
      PLACEHOLDER_CONTEXT_{i}  -> df_together.iloc[i-1]["context_html"]
      PLACEHOLDER_ID_{i}       -> df_together.iloc[i-1]["question_id"]
      PLACEHOLDER_LANG_{i}     -> df_together.iloc[i-1]["language"]

    The function overwrites the same .lss on disk.
    """
    import xml.etree.ElementTree as ET

    # Load the .lss we just created
    tree = ET.parse(output_lss)
    root = tree.getroot()

    # We define a small helper that does multi-line text replacement:
    def replace_placeholders_in_text(original_text):
        if not original_text:
            return original_text
        new_text = original_text

        # **Key change**: we do replacements in descending order:
        #   i = len(df_together), len(df_together)-1, ..., 2, 1
        for i in range(len(df_together), 0, -1):
            row = df_together.iloc[i - 1]
            idx_str = str(i)

            new_text = new_text.replace(f"PLACEHOLDER_QUESTION_{idx_str}", str(row["question_text"]))
            new_text = new_text.replace(f"PLACEHOLDER_ANSWER_{idx_str}",   str(row["answer_html"]))
            new_text = new_text.replace(f"PLACEHOLDER_CONTEXT_{idx_str}",  str(row["context_html"]))
            new_text = new_text.replace(f"PLACEHOLDER_ID_{idx_str}",       str(row["question_id"]))
            new_text = new_text.replace(f"PLACEHOLDER_LANG_{idx_str}",     str(row["language"]))

        return new_text

    # Replace in <groups> -> <rows> -> <row> -> <description>
    groups_elem = root.find("./groups")
    if groups_elem is not None:
        rows_el = groups_elem.find("rows")
        if rows_el is not None:
            for row_el in rows_el.findall("row"):
                desc_el = row_el.find("description")
                if desc_el is not None:
                    desc_el.text = replace_placeholders_in_text(desc_el.text)

    # Replace in <questions> -> <rows> -> <row> -> <question>/<title>
    questions_elem = root.find("./questions")
    if questions_elem is not None:
        qrows_el = questions_elem.find("rows")
        if qrows_el is not None:
            for row_el in qrows_el.findall("row"):
                question_el = row_el.find("question")
                if question_el is not None:
                    question_el.text = replace_placeholders_in_text(question_el.text)

                title_el = row_el.find("title")
                if title_el is not None:
                    title_el.text = replace_placeholders_in_text(title_el.text)

    # Replace in <subquestions> -> <rows> -> <row> -> <question>/<title>
    subquestions_elem = root.find("./subquestions")
    if subquestions_elem is not None:
        sqrows_el = subquestions_elem.find("rows")
        if sqrows_el is not None:
            for row_el in sqrows_el.findall("row"):
                question_el = row_el.find("question")
                if question_el is not None:
                    question_el.text = replace_placeholders_in_text(question_el.text)

                title_el = row_el.find("title")
                if title_el is not None:
                    title_el.text = replace_placeholders_in_text(title_el.text)

    # Replace in <answers> -> <rows> -> <row> -> <answer>
    answers_elem = root.find("./answers")
    if answers_elem is not None:
        arows_el = answers_elem.find("rows")
        if arows_el is not None:
            for row_el in arows_el.findall("row"):
                ans_el = row_el.find("answer")
                if ans_el is not None:
                    ans_el.text = replace_placeholders_in_text(ans_el.text)

    # Finally, write back to the same .lss
    tree.write(output_lss, encoding="utf-8", xml_declaration=True)



In [None]:
# -----------------------------------------------------
# DATA: load each language and derive the HTML columns
# -----------------------------------------------------
df_en = (
    pd.read_csv("../../data/final_merged_dataset_short_en_2.csv")
    .pipe(
        convert_markdown_to_html_with_target_blank,
        markdown_column="chatbot_answer_en",
        html_column="chatbot_answer_en_html",
    )
    .pipe(
        extract_context_links,
        context_column="chatbot_context_en",
        output_column="formatted_context_en_html",
        no_context_text="No context was used by the chatbot.",
    )
)

df_de = (
    pd.read_csv("../../data/final_merged_dataset_short_de_2.csv")
    .pipe(
        convert_markdown_to_html_with_target_blank,
        markdown_column="chatbot_answer_de",
        html_column="chatbot_answer_de_html",
    )
    .pipe(
        extract_context_links,
        context_column="chatbot_context_de",
        output_column="formatted_context_de_html",
        no_context_text="Der Chatbot hat keinen Kontext verwendet.",
    )
)

# -----------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------
# Template survey to read and destination of the generated survey.
input_lss = "../../../survey/eval/eval_test_2.lss"
output_lss = "../../data/human_eval/survey_output.lss"

# gid of the group that serves as the template for every question.
original_group_id = "99600"
# Number of consecutive groups that share one randomgroup value.
questions_per_group = 3

# First ids handed out to cloned groups and questions.
gid_start = 200000
qid_start = 300000

print(f"german length: {len(df_de)}", f"english length: {len(df_en)}")

# -----------------------------------------------------
# BUILD: clone the template group once per question and fill it
# -----------------------------------------------------
# shuffle_seed=None keeps the stacked order (English rows, then German rows).
df_final = build_shuffled_questions_and_clone(
    df_en=df_en,
    df_de=df_de,
    input_lss=input_lss,
    output_lss=output_lss,
    original_group_id=original_group_id,
    gid_start=gid_start,
    qid_start=qid_start,
    questions_per_group=questions_per_group,
    shuffle_seed=None,
)


german length: 18 english length: 18
Done! Created 35 new copy/copies of group 99600.


In [None]:
# Using the equation from https://mat.uab.cat/matmat_antiga/PDFv2014/v2014n02.pdf [(Ferrante & Saltalamacchia, 2006)]
import numpy as np
from scipy.integrate import quad
from math import factorial, exp

def Sm(t, m):
    r"""
    Compute the truncated exponential series S_m(t) = \sum_{k=0}^{m-1} (t^k / k!).

    Parameters:
        t: Point at which the series is evaluated.
        m: int - Number of terms to add up (k runs from 0 to m-1).

    Returns:
        The sum of the first m terms; 0 when m is 0.
    """
    # The raw docstring keeps the LaTeX backslash from being read as an
    # (invalid) string escape sequence.
    return sum((t**k) / factorial(k) for k in range(m))

def integrand(t, N, m):
    """
    Evaluate 1 - (1 - S_m(t) * e^(-t))^N, the function integrated by expected_draws.

    Parameters:
        t: Integration variable.
        N: int - Exponent applied to the inner term.
        m: int - Number of series terms used for S_m(t).
    """
    complement = 1 - Sm(t, m) * exp(-t)
    return 1 - complement**N

def expected_draws(N, m):
    """
    Expected number of draws needed to complete m collections of N items.

    Computed as N times the integral of ``integrand(t, N, m)`` over t from 0
    to infinity, evaluated numerically.

    Parameters:
        N: int - Number of unique items (coupons).
        m: int - Number of collections.

    Returns:
        float - Expected number of draws.
    """
    # quad returns (value, error estimate); only the value is needed.
    integral_value = quad(integrand, 0, np.inf, args=(N, m))[0]
    return N * integral_value

# Example: N unique items, each of which has to be collected m times.
N, m = 7, 3

expected = expected_draws(N, m)
print(f"Expected number of draws to complete {m} collections of {N} items: {expected:.4f}")

Expected number of draws to complete 3 collections of 7 items: 39.5895


In [66]:
import mpmath as mp

def S_m(t, m):
    """
    Truncated exponential series: S_m(t) = sum_{k=0}^{m-1} [t^k / k!].

    The terms are added one after another onto an mpmath zero, so the result
    is an mpmath number even when m is 0.
    """
    series_terms = ((t**k) / mp.factorial(k) for k in range(m))
    return sum(series_terms, mp.mpf('0'))

def expected_time_m_collections(N, m):
    """
    Expected number of coupons that must be drawn to complete m full sets
    of N equally likely coupon types.

    Formula:
      E[X] = N * ∫_{0 to ∞} [ 1 - (1 - S_m(t)*e^-t)^N ] dt

    The integral is evaluated numerically with mpmath.
    """
    def one_minus_power(t):
        # S_m(t) * e^-t is the quantity raised to the N-th power (after complement).
        weighted_series = S_m(t, m) * mp.e**(-t)
        return 1 - (1 - weighted_series)**N

    integral_value = mp.quad(one_minus_power, [0, mp.inf])
    return N * integral_value

# Small worked example: two complete sets of three coupon types.
N_example, m_example = 3, 2

est = expected_time_m_collections(N_example, m_example)
print(f"E[X] for N={N_example}, m={m_example} is approximately {est}")


E[X] for N=3, m=2 is approximately 9.63888888888889
