In [1]:
import pandas as pd

# Load the reviewed, translated question dataset.
# Expecting a dataframe with at least 4 columns:
# 'question_id_q', 'role_q', 'german_question_text_q', 'english_question_text_q'
file_path = '../../data/question_survey/questions_dataset_reviewed_translated_de_en.csv'
csv_data = pd.read_csv(file_path)

# Shuffle the rows; the fixed seed keeps the order reproducible.
# `sample` returns a new frame, so `csv_data` itself is left untouched.
german_questions = csv_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the rows by role so that consecutive rows can cycle through
# prospective -> enrolled -> international (for the survey later).
main_roles = ['prospective', 'enrolled', 'international']
prospective_rows = german_questions[german_questions['role_q'] == 'prospective']
enrolled_rows = german_questions[german_questions['role_q'] == 'enrolled']
international_rows = german_questions[german_questions['role_q'] == 'international']
# Everything else (any other role value, including missing ones) goes to the end.
other_rows = german_questions[~german_questions['role_q'].isin(main_roles)]

# Round-robin over the three role groups. Index labels are collected instead of
# row Series so that the rows can be selected in a single `.loc` call: this keeps
# the column dtypes and the column set intact even when no row has a main role.
role_groups = (prospective_rows, enrolled_rows, international_rows)
max_len = max(len(rows) for rows in role_groups)
interleaved_labels = [
    rows.index[i]
    for i in range(max_len)
    for rows in role_groups
    if i < len(rows)  # a shorter group simply drops out of later rounds
]
interleaved_questions = german_questions.loc[interleaved_labels]

# Append the remaining rows and renumber the final DataFrame.
interleaved_questions = pd.concat([interleaved_questions, other_rows]).reset_index(drop=True)

# The role groups partition the shuffled frame, so no row may be lost or duplicated.
assert len(interleaved_questions) == len(german_questions), "interleaving changed the row count"

# Save the interleaved DataFrame to a new CSV file (quoting=1 is csv.QUOTE_ALL).
interleaved_questions.to_csv('../../data/question_survey/question_dataset.csv', index=False, quoting=1)

In [1]:
import pandas as pd
# Read back the interleaved question dataset from disk
file_path = '../../data/question_survey/question_dataset.csv'
interleaved_questions = pd.read_csv(file_path)

# Never truncate cell contents, so the full question texts are visible
pd.options.display.max_colwidth = None

# Preview the first seven rows
interleaved_questions.iloc[:7]

Unnamed: 0,german_question_text_q,english_question_text_q,role_q,program_q,participant_id_q,question_id_q,question_language_q,age_q,gender_q,gender[other]_q,educationlevel_q,educationstatus_q,familiar_q,interviewtime_q,demographics_time_q,question_time_q,email_time_q,translation_done_q
0,Welche Jobs kann man mit welchen Studienfächern aufnehmen?,What jobs can you take up with which fields of study?,prospective,,119,356,de,A1,A1,,A2,A2,A3,180.71,41.57,133.58,5.56,True
1,Ist die Universität Osnabrück ein internationaler studentenfreundlicher Campus?,Is University of Osnabrück an international student friendly campus?,enrolled,Cognitive Science,17,53,en,A2,A2,,A4,A4,A2,415.44,38.97,356.76,19.71,True
2,"Wie kann ich anfangen, mich über Austauschprogramme zu informieren?",How can I start looking into exchange programs?,international,Cognitive Science,83,340,en,A2,A2,,A4,A5,A4,483.54,71.2,349.62,62.72,True
3,Welche Studiengänge gibt es an der Universität Osnabrück?,What courses of study are available at the University of Osnabrück?,prospective,,85,86,de,A1,A2,,A4,A2,A5,658.58,153.64,496.42,8.52,True
4,Wie lege ich Geld auf meine Campus-Karte?,How do I put money in my campus card,enrolled,Cognitive Science,50,177,en,A3,A2,,A4,A4,A2,236.81,29.64,192.72,14.45,True
5,"Wie sollte ich entscheiden, für welches Stipendium ich mich angesichts meiner persönlichen Situation bewerben sollte?",How should I decide which scholarship grant to apply to given my personal situation?,international,Cognitive Science,83,85,en,A2,A2,,A4,A5,A4,483.54,71.2,349.62,62.72,True
6,Besteht die Möglichkeit an Auslandsaufenthalten teilzunehmen?,Is there the possibility to participate in stays abroad?,prospective,,86,87,de,A1,A2,,A4,A2,A3,460.74,57.33,389.96,13.45,True


In [6]:
import random
# Seed for reproducibility: drives the token generator and the per-survey shuffles
SEED = 42
random.seed(SEED)

# Load the interleaved question dataset
file_path = '../../data/question_survey/question_dataset.csv'
interleaved_questions = pd.read_csv(file_path)

# Load the .lss template file
lss_template_path = '../../data/answer_survey/survey_lss/placeholder_survey_answers.lss'
with open(lss_template_path, 'r', encoding='utf-8') as file:
    lss_template = file.read()

# Counters for the number of surveys that include questions from each role
prospective_count = 0
enrolled_count = 0
international_count = 0
others_count = 0
all_roles_count = 0

# One record per survey (survey number, token, url, question ids); the DataFrame
# is built once after the loop instead of being re-concatenated on every iteration.
survey_records = []
# Tokens handed out so far, used to keep every token unique
existing_tokens = set()

# Iterate through questions in batches of three and create .lss files
output_files = []
batch_size = 3
for i in range(0, len(interleaved_questions), batch_size):
    # Get the current batch of questions
    batch = interleaved_questions.iloc[i:i+batch_size]
    if len(batch) < batch_size:
        break  # Stop: fewer than `batch_size` questions remain (only possible for the last batch)

    survey_number = (i // batch_size) + 1

    # Reshuffle the batch. The seed is derived from the survey number because a
    # constant random_state would apply the very same permutation to every batch,
    # i.e. the question order would not actually vary between surveys.
    batch = batch.sample(frac=1, random_state=SEED + survey_number).reset_index(drop=True)

    # Check for the presence of roles in the batch
    batch_roles = set(batch['role_q'])
    if 'prospective' in batch_roles:
        prospective_count += 1
    if 'enrolled' in batch_roles:
        enrolled_count += 1
    if 'international' in batch_roles:
        international_count += 1
    # Note: only the literal role value 'other' is counted here
    if 'other' in batch_roles:
        others_count += 1
    # Check if all three roles are present in the batch
    if {'prospective', 'enrolled', 'international'}.issubset(batch_roles):
        all_roles_count += 1

    # Replace the question placeholders (PLACEHOLDER1..3) with the batch questions
    updated_lss = lss_template
    for position, question in enumerate(batch['german_question_text_q'], start=1):
        updated_lss = updated_lss.replace(f"PLACEHOLDER{position}", question)

    # Replace the id placeholders (PLACEHOLDERID1..3) with the original question IDs
    for position, question_id in enumerate(batch['question_id_q'], start=1):
        updated_lss = updated_lss.replace(f"PLACEHOLDERID{position}", f"{question_id}")

    # Update survey title by appending the survey number
    updated_lss = updated_lss.replace(
        "<surveyls_title><![CDATA[Chatbot der Universität Osnabrück – Erstellung von Referenzantworten]]></surveyls_title>",
        f"<surveyls_title><![CDATA[Chatbot der Universität Osnabrück – Erstellung von Referenzantworten {survey_number}]]></surveyls_title>"
    )

    # Generate a unique 7-digit random token for the survey
    while True:
        token = random.randint(1000000, 9999999)
        if token not in existing_tokens:
            break
    existing_tokens.add(token)

    # Record the survey; 'url' stays a placeholder at this stage
    survey_records.append({
        'survey_number': survey_number,
        'token': token,
        'url': 'PLACEHOLDER',
        'question_id': batch['question_id_q'].values
    })

    # Save the updated .lss file
    output_path = f"../../data/answer_survey/survey_lss/survey_{survey_number}.lss"
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write(updated_lss)

    output_files.append(output_path)

# Build the survey overview in one go (the columns are kept even if no survey was created)
survey_info = pd.DataFrame(survey_records, columns=['survey_number', 'token', 'url', 'question_id'])

# Add a column named 'survey_distributed', initialised to False
survey_info['survey_distributed'] = False
# Add a column named 'survey_filled', initialised to False
survey_info['survey_filled'] = False

# Save the survey_info dataframe to a new CSV file (quoting=1 is csv.QUOTE_ALL)
survey_info.to_csv('../../data/answer_survey/survey_info_filled.csv', index=False, quoting=1)

# Output the results
print(f"Surveys with prospective questions: {prospective_count}")
print(f"Surveys with enrolled questions: {enrolled_count}")
print(f"Surveys with international questions: {international_count}")
print(f"Surveys with other questions: {others_count}")
print(f"Surveys with all three roles of interest: {all_roles_count}")

output_files  # List of generated files

Surveys with prospective questions: 46
Surveys with enrolled questions: 96
Surveys with international questions: 33
Surveys with other questions: 16
Surveys with all three roles of interest: 33


['data/answer_surveys/survey_1.lss',
 'data/answer_surveys/survey_2.lss',
 'data/answer_surveys/survey_3.lss',
 'data/answer_surveys/survey_4.lss',
 'data/answer_surveys/survey_5.lss',
 'data/answer_surveys/survey_6.lss',
 'data/answer_surveys/survey_7.lss',
 'data/answer_surveys/survey_8.lss',
 'data/answer_surveys/survey_9.lss',
 'data/answer_surveys/survey_10.lss',
 'data/answer_surveys/survey_11.lss',
 'data/answer_surveys/survey_12.lss',
 'data/answer_surveys/survey_13.lss',
 'data/answer_surveys/survey_14.lss',
 'data/answer_surveys/survey_15.lss',
 'data/answer_surveys/survey_16.lss',
 'data/answer_surveys/survey_17.lss',
 'data/answer_surveys/survey_18.lss',
 'data/answer_surveys/survey_19.lss',
 'data/answer_surveys/survey_20.lss',
 'data/answer_surveys/survey_21.lss',
 'data/answer_surveys/survey_22.lss',
 'data/answer_surveys/survey_23.lss',
 'data/answer_surveys/survey_24.lss',
 'data/answer_surveys/survey_25.lss',
 'data/answer_surveys/survey_26.lss',
 'data/answer_surveys