In [None]:
# Script to create .lss files for the survey questions AFTER the lottery ADDITION, using only unanswered questions

import random
import pandas as pd


# --- Inputs -----------------------------------------------------------------
# Location of the question dataset (CSV).
file_path = '../../data/question_survey/question_dataset.csv'

# Question IDs that were already covered by the first 6 surveys.
# Dropping them should not be strictly necessary, because shuffling only
# happens inside a batch, but it is done anyway to be safe.
answered_question_ids = [336, 9, 203, 194, 153, 105, 129, 351, 212, 143, 123, 315, 328, 347, 111, 83, 213, 230]

# Read every question, then keep only the rows whose ID is not in the
# already-answered list.
all_questions = pd.read_csv(file_path)
is_unanswered = ~all_questions['question_id_q'].isin(answered_question_ids)
interleaved_questions = all_questions.loc[is_unanswered]

# Read the .lss template into memory as a single string; it is copied and
# filled with concrete questions for each generated survey.
lss_template_path = '../../data/answer_survey/survey_lss/placeholder_survey_answers_gs.lss'
with open(lss_template_path, encoding='utf-8') as template_file:
    lss_template = template_file.read()

# Counters for the number of surveys that include at least one question from
# each role (plus one for surveys covering all three roles of interest).
prospective_count = 0
enrolled_count = 0
international_count = 0
others_count = 0
all_roles_count = 0

# Surveys 1-6 already exist, so numbering of the new surveys continues at 7.
surveys_already_created = 6
batch_size = 3

# Survey access tokens are drawn from the OS entropy source instead of the
# default (predictable) Mersenne Twister generator.
token_rng = random.SystemRandom()
existing_tokens = set()  # tokens handed out so far in this run

# One dict per generated survey; turned into a DataFrame once after the loop
# (avoids re-copying the whole frame with pd.concat on every iteration).
survey_rows = []
output_files = []

# Iterate through the questions in batches of three and create one .lss file
# per full batch.
for i in range(0, len(interleaved_questions), batch_size):
    batch = interleaved_questions.iloc[i:i + batch_size]
    if len(batch) < batch_size:
        break  # An incomplete trailing batch is not turned into a survey

    # Randomise the order of the questions inside the batch.
    batch = batch.sample(frac=1).reset_index(drop=True)

    # Record which roles are represented in this batch.
    roles_in_batch = set(batch['role_q'])
    if 'prospective' in roles_in_batch:
        prospective_count += 1
    if 'enrolled' in roles_in_batch:
        enrolled_count += 1
    if 'international' in roles_in_batch:
        international_count += 1
    if 'other' in roles_in_batch:
        others_count += 1
    if {'prospective', 'enrolled', 'international'} <= roles_in_batch:
        all_roles_count += 1

    # Fill the question-text placeholders (PLACEHOLDER1..3).
    # NOTE(review): the text is inserted verbatim; if the template does not
    # wrap the placeholders in CDATA, characters such as '&' or '<' in a
    # question would produce invalid XML - confirm against the template.
    updated_lss = lss_template
    for j, question in enumerate(batch['german_question_text_q'], start=1):
        updated_lss = updated_lss.replace(f"PLACEHOLDER{j}", question)

    # Fill the original question-ID placeholders (PLACEHOLDERID1..3).
    for j, question_id in enumerate(batch['question_id_q'], start=1):
        updated_lss = updated_lss.replace(f"PLACEHOLDERID{j}", f"{question_id}")

    # Replace the generic "GS" title with a numbered one.
    survey_number = (i // batch_size) + 1 + surveys_already_created
    updated_lss = updated_lss.replace(
        "<surveyls_title><![CDATA[Chatbot der Universität Osnabrück – Erstellung von Referenzantworten GS]]></surveyls_title>",
        f"<surveyls_title><![CDATA[Chatbot der Universität Osnabrück – Erstellung von Referenzantworten {survey_number}]]></surveyls_title>"
    )

    # Draw a 7-digit token that has not been used for another survey yet.
    while True:
        token = token_rng.randint(1000000, 9999999)
        if token not in existing_tokens:
            break
    existing_tokens.add(token)

    # NOTE(review): 'question_id' holds a NumPy array, so the exported CSV
    # cell contains its string form - confirm downstream readers expect that.
    survey_rows.append({
        'survey_number': survey_number,
        'token': token,
        'url': 'PLACEHOLDER',
        'question_id': batch['question_id_q'].values,
    })

    # Write the filled-in survey next to the template.
    output_path = f"../../data/answer_survey/survey_lss/survey_lottery{survey_number}.lss"
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write(updated_lss)

    output_files.append(output_path)

# Survey number, token, url and question IDs for every generated survey.
# Passing the columns explicitly keeps the schema even when no survey was made.
survey_info = pd.DataFrame(
    survey_rows,
    columns=['survey_number', 'token', 'url', 'question_id'],
)

# Add two bookkeeping columns, both initialised to False: whether the survey
# has been distributed and whether it has been filled in.
for flag_column in ('survey_distributed', 'survey_filled'):
    survey_info[flag_column] = False

# Persist the survey overview as CSV; quoting=1 is csv.QUOTE_ALL, i.e. every
# field is written inside quotes.
survey_info.to_csv(
    '../../data/answer_survey/survey_info_gs.csv',
    index=False,
    quoting=1,
)

# Report how many surveys contain questions from each role.
summary_lines = [
    f"Surveys with prospective questions: {prospective_count}",
    f"Surveys with enrolled questions: {enrolled_count}",
    f"Surveys with international questions: {international_count}",
    f"Surveys with other questions: {others_count}",
    f"Surveys with all three roles of interest: {all_roles_count}",
]
print("\n".join(summary_lines))

# Last expression of the cell: shows the list of generated files.
output_files

Surveys with prospective questions: 40
Surveys with enrolled questions: 90
Surveys with international questions: 27
Surveys with other questions: 16
Surveys with all three roles of interest: 27


['data/answer_surveys/survey_lss/survey_lottery7.lss',
 'data/answer_surveys/survey_lss/survey_lottery8.lss',
 'data/answer_surveys/survey_lss/survey_lottery9.lss',
 'data/answer_surveys/survey_lss/survey_lottery10.lss',
 'data/answer_surveys/survey_lss/survey_lottery11.lss',
 'data/answer_surveys/survey_lss/survey_lottery12.lss',
 'data/answer_surveys/survey_lss/survey_lottery13.lss',
 'data/answer_surveys/survey_lss/survey_lottery14.lss',
 'data/answer_surveys/survey_lss/survey_lottery15.lss',
 'data/answer_surveys/survey_lss/survey_lottery16.lss',
 'data/answer_surveys/survey_lss/survey_lottery17.lss',
 'data/answer_surveys/survey_lss/survey_lottery18.lss',
 'data/answer_surveys/survey_lss/survey_lottery19.lss',
 'data/answer_surveys/survey_lss/survey_lottery20.lss',
 'data/answer_surveys/survey_lss/survey_lottery21.lss',
 'data/answer_surveys/survey_lss/survey_lottery22.lss',
 'data/answer_surveys/survey_lss/survey_lottery23.lss',
 'data/answer_surveys/survey_lss/survey_lottery24.l