In [1]:
import csv

import pandas as pd
# Read the three input tables in one pass. The paths are listed in the same
# order as the names on the left-hand side:
#   df_chatbot   - chatbot answers
#   df_human     - human answers
#   df_questions - main question dataset
df_chatbot, df_human, df_questions = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/chatbot_data/chatbot_answers_first_20_de_en_test_2.csv',
        '../../data/answer_survey/answers_25_12_long_with_english.csv',
        '../../data/question_survey/question_dataset_interleaved_en_de.csv',
    )
)

In [2]:
# Columns taken over from df_chatbot. 'question_id_q' is the join key that
# df_questions and df_chatbot have in common; the rest are the German ('_de')
# and English ('_en') chatbot columns.
columns_to_merge = [
    'question_id_q',
    'chatbot_answer_de',
    'chatbot_visited_urls_de',
    'chatbot_answer_en',
    'chatbot_visited_urls_en',
    'chatbot_context_en',
    'chatbot_search_query_en',
    'latency_en',
    'chatbot_context_de',
    'chatbot_search_query_de',
    'latency_de',
]

# Use a copy of the questions DataFrame as the foundation and left-join the
# chatbot columns onto it, so that all questions remain even when chatbot
# data is missing for them.
df_merged = df_questions.copy().merge(
    df_chatbot[columns_to_merge],
    how='left',
    on='question_id_q',
)

# Attach the human answers and tidy up the result in a single chain:
#   1. left-join df_human; the key is called 'question_id_q' in df_merged and
#      'question_id_a' in df_human, hence left_on / right_on
#   2. give the human answer columns more descriptive names
#   3. drop 'question_id_a' and 'translation_done_q', which are no longer
#      needed after the merge (errors='ignore' tolerates their absence)
df_merged = (
    df_merged
    .merge(
        df_human,
        how='left',
        left_on='question_id_q',
        right_on='question_id_a',
    )
    .rename(columns={
        'answer_text_a': 'human_answer_de',
        'answer_links_a': 'human_answer_links_de',
        'english_answer_text_a': 'human_answer_en',
        'english_answer_links_a': 'human_answer_links_en',
    })
    .drop(columns=['question_id_a', 'translation_done_q'], errors='ignore')
)

# Build one reduced DataFrame per language, each holding only a limited set
# of columns from df_merged.

# German subset ('chatbot_search_query_de' is left out of this list).
columns_to_merge_de = [
    'german_question_text_q',
    'chatbot_answer_de',
    'chatbot_visited_urls_de',
    'chatbot_context_de',
    'human_answer_de',
    'human_answer_links_de',
    'question_id_q',
    'participant_id_a',
    'participant_id_q',
    'question_language_q',
    'latency_de',
]
# Keep only the rows that have both a chatbot answer and a human answer.
df_merged_short_de = df_merged[columns_to_merge_de].dropna(
    subset=['chatbot_answer_de', 'human_answer_de']
)

# English subset ('chatbot_search_query_en' is left out of this list).
columns_to_merge_en = [
    'english_question_text_q',
    'chatbot_answer_en',
    'chatbot_visited_urls_en',
    'chatbot_context_en',
    'human_answer_en',
    'human_answer_links_en',
    'question_id_q',
    'participant_id_a',
    'participant_id_q',
    'question_language_q',
    'latency_en',
]
# Keep only the rows that have both a chatbot answer and a human answer.
df_merged_short_en = df_merged[columns_to_merge_en].dropna(
    subset=['chatbot_answer_en', 'human_answer_en']
)


# Save the full merged dataset and the two per-language subsets to CSV.
# index=False omits the DataFrame index from the files; csv.QUOTE_ALL makes
# the writer quote every field (named constant instead of the bare literal 1).
output_path = '../../data/final_merged_dataset_de_en_2.csv'
df_merged.to_csv(output_path, index=False, quoting=csv.QUOTE_ALL)
df_merged_short_de.to_csv(
    '../../data/final_merged_dataset_short_de_2.csv',
    index=False,
    quoting=csv.QUOTE_ALL,
)
df_merged_short_en.to_csv(
    '../../data/final_merged_dataset_short_en_2.csv',
    index=False,
    quoting=csv.QUOTE_ALL,
)
print(f"Merged dataset saved to: {output_path}")

Merged dataset saved to: ../../data/final_merged_dataset_de_en_2.csv
