# Creating the Code Books

This notebook creates the codebooks for future researchers. [Codebooks](https://www.icpsr.umich.edu/web/ICPSR/cms/1983) often accompany social science datasets. We make a big latex table at the end to convert the dataset info to a PDF.

In [1]:
# Setup
import pandas as pd
import numpy as np
from src.utils.helper_funcs import find_project_root
from src.utils.data_loader import load_data, unnest_columns

# Set up paths
PROJECT_ROOT = find_project_root()

# Load data
data_dict = load_data()

# Load the raw qualtrics survey mapping
survey_mapping = pd.read_json(
    PROJECT_ROOT / "data" / "storage" / "mappings" / "survey_question_mapping.json"
)

header_usecases = pd.read_csv(
    PROJECT_ROOT / "data" / "storage" / "mappings" / "header_usecases_mapping.csv"
)

header_prefs = pd.read_csv(
    PROJECT_ROOT / "data" / "storage" / "mappings" / "header_prefs_mapping.csv"
)


# Create a dictionary
survey_mapping_dict = survey_mapping.to_dict()

In [2]:
display(header_usecases.head(4))

display(header_prefs.head(4))

Unnamed: 0,header,survey_numeric,survey_text
0,shared_opener,,Which of the following scenarios best describe...
1,homework_assistance,1.0,**Homework Assistance**: Getting help with sch...
2,research,2.0,**Research**: Fact-checking or gaining overvie...
3,source_suggestions,3.0,**Source Suggestions**: Creating or finding bi...


Unnamed: 0,header,survey_numeric,survey_text,conversations_performance_factors,conversations_choice_factors
0,slider_values,,Strongly Disagree - Strongly Agree,Performed very poorly - Performed very well,Very unimportant - Very important
1,shared_opener,,It is important that an AI language model... -,<b>This response</b>,<b>I chose this response</b>
2,values,1.0,...reflects my values or cultural perspectives,...reflected my values or cultural perspective,...because it reflected my values or cultural ...
3,creativity,10.0,...produces responses that are creative and in...,...was creative and inspiring,...because it was creative and inspiring


## General Functions

In [3]:
def return_codebook_entry(
    ENTRIES,
    df,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
    show_values=True,
):
    if "dict" in variable_type:
        num_missing, num_unique = "-", "-"
    else:
        # Get num missing
        num_missing = df[variable_name].isna().sum()
        # Get num unique
        num_unique = df[variable_name].nunique()

    # Get values depending on variable type
    if show_values:
        if variable_type in ["int", "float"]:
            values_dict = {}
            for stat in ["mean", "std", "min", "max"]:
                values_dict[stat] = np.round(
                    df[variable_name].describe().to_dict()[stat], 1
                )
        elif variable_type == "string":
            values_dict = {}
            for stat in ["mean", "std", "min", "max"]:
                values_dict[f"{stat} chars"] = np.round(
                    df[variable_name].str.len().describe().to_dict()[stat], 1
                )
        elif variable_type in ["categorical", "binary"]:
            values_dict = df[variable_name].value_counts(dropna=False).to_dict()
        elif variable_type == "datetime":
            values_dict = {}
            min_date = df[variable_name].min()
            max_date = df[variable_name].max()
            values_dict["earliest date"] = min_date
            values_dict["latest_date"] = max_date
        else:
            values_dict = {"-": "-"}
    else:
        values_dict = {"Too many values to show": "-"}

    # Create the entry
    entry = {
        "variable_name": variable_name,
        "variable_label": variable_label,
        "variable_category": variable_category,
        "variable_type": variable_type,
        "question_text": question_text,
        "num_missing": num_missing,
        "num_unique": num_unique,
        "values": values_dict,
        "notes": notes,
    }

    print(entry)
    # Append to entries
    ENTRIES.append(entry)

In [4]:
def return_codebook_entry_nested(
    ENTRIES,
    df,
    header_df,
    header_col,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
    show_values=True,
):
    keys = df[variable_name].iloc[0].keys()
    cols = [f"{variable_name}_{c}" for c in keys]

    num_missing = 0
    for _, row in df.iterrows():
        response_dict = row[variable_name]
        num_missing_row = 0
        for k, v in response_dict.items():
            if k != "other_text":
                if pd.isna(v):
                    num_missing_row += 1
        if num_missing_row > 0:
            num_missing += 1
    num_unique = df[variable_name].astype(str).nunique()  # unique combos

    entry = {
        "variable_name": variable_name,
        "variable_label": variable_label,
        "variable_category": variable_category,
        "variable_type": variable_type,
        "question_text": question_text,
        "num_missing": num_missing,
        "num_unique": num_unique,
    }

    if show_values:
        values = []
        # INNER LEVEL
        for c in keys:
            inner_dict = {}
            inner_variable_name = c
            inner_variable_label = header_df[header_df["header"] == c][header_col].iloc[
                0
            ]

            if c == "other":
                inner_variable_label = "Other (selected)"
            elif c == "other_text":
                inner_variable_label = "Other (typed text)"

            match_column = f"{variable_name}_{c}"
            inner_values_dict = {}
            if "other_text" in inner_variable_name:
                for stat in ["mean", "std", "min", "max"]:
                    inner_values_dict[f"{stat} chars"] = np.round(
                        df[match_column].str.len().describe().to_dict()[stat], 1
                    )
            else:
                if variable_name == "lm_usecases":  # binary
                    inner_values_dict = (
                        df[match_column]
                        .astype(bool)
                        .value_counts(dropna=False)
                        .to_dict()
                    )
                elif variable_name in [
                    "stated_prefs",
                    "performance_attributes",
                    "choice_attributes",
                ]:
                    for stat in ["mean", "std", "min", "max"]:
                        inner_values_dict[stat] = np.round(
                            df[match_column].describe().to_dict()[stat], 1
                        )

            inner_dict["variable_name"] = inner_variable_name
            inner_dict["variable_label"] = inner_variable_label
            for k, v in inner_values_dict.items():
                inner_dict[k] = v

            values.append(inner_dict)
        entry["values"] = {"nested_values": values}

    else:
        entry["values"] = {"-": "-"}

    entry["notes"] = notes
    ENTRIES.append(entry)
    print(entry)

## The Survey

In [5]:
# Start new entry set
ENTRIES = []

survey = data_dict["survey"]

print(survey.columns)

# Unnest the columns
survey = unnest_columns(data_dict["survey"])

print(survey.columns)

Index(['user_id', 'survey_only', 'num_completed_conversations',
       'timing_duration_s', 'timing_duration_mins', 'generated_datetime',
       'consent', 'consent_age', 'lm_familiarity', 'lm_indirect_use',
       'lm_direct_use', 'lm_frequency_use', 'self_description',
       'system_string', 'age', 'gender', 'employment_status', 'education',
       'marital_status', 'english_proficiency', 'study_id', 'study_locale',
       'religion', 'ethnicity', 'location', 'lm_usecases', 'stated_prefs',
       'order_lm_usecases', 'order_stated_prefs', 'included_in_US_REP',
       'included_in_UK_REP', 'included_in_balanced_subset'],
      dtype='object')
Index(['user_id', 'survey_only', 'num_completed_conversations',
       'timing_duration_s', 'timing_duration_mins', 'generated_datetime',
       'consent', 'consent_age', 'lm_familiarity', 'lm_indirect_use',
       ...
       'order_lm_usecases_other', 'order_stated_prefs_values',
       'order_stated_prefs_creativity', 'order_stated_prefs_fluen

### Base columns

In [6]:
# Base columns
variable_name = "user_id"
variable_label = "Unique participant identifier"
variable_category = "meta"
variable_type = "string id"
question_text = "-"
notes = "Pseudonymized from Prolific worker ID. Used to link survey data to conversation data. In our paper, we refer to `users' as `participants'."
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

variable_name = "survey_only"
variable_label = "Indicator if participant only completed the survey, or also completed conversations"
variable_category = "meta"
variable_type = "binary"
question_text = "-"
notes = ""
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

variable_name = "num_completed_conversations"
variable_label = "Number of conversations that a participant completed"
variable_category = "meta"
variable_type = "int"
question_text = "-"
notes = ""
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'user_id', 'variable_label': 'Unique participant identifier', 'variable_category': 'meta', 'variable_type': 'string id', 'question_text': '-', 'num_missing': 0, 'num_unique': 1500, 'values': {'-': '-'}, 'notes': "Pseudonymized from Prolific worker ID. Used to link survey data to conversation data. In our paper, we refer to `users' as `participants'."}
{'variable_name': 'survey_only', 'variable_label': 'Indicator if participant only completed the survey, or also completed conversations', 'variable_category': 'meta', 'variable_type': 'binary', 'question_text': '-', 'num_missing': 0, 'num_unique': 2, 'values': {False: 1396, True: 104}, 'notes': ''}
{'variable_name': 'num_completed_conversations', 'variable_label': 'Number of conversations that a participant completed', 'variable_category': 'meta', 'variable_type': 'int', 'question_text': '-', 'num_missing': 0, 'num_unique': 8, 'values': {'mean': 5.3, 'std': 1.7, 'min': 0.0, 'max': 7.0}, 'notes': ''}


In [7]:
# Timing
variable_name = "timing_duration_s"
variable_label = "Duration of the survey session (in seconds)"
variable_category = "meta"
variable_type = "float"
question_text = "-"
notes = (
    "Extreme values are caused by participants completing task in multiple sessions."
)
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)


variable_name = "timing_duration_mins"
variable_label = "Duration of the survey session (in minutes)"
variable_category = "constructed"
variable_type = "float"
notes = "timing_duration_s / 60. Extreme values are caused by participants completing task in multiple sessions."
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'timing_duration_s', 'variable_label': 'Duration of the survey session (in seconds)', 'variable_category': 'meta', 'variable_type': 'float', 'question_text': '-', 'num_missing': 0, 'num_unique': 977, 'values': {'mean': 2154.2, 'std': 20557.1, 'min': 160.0, 'max': 529927.0}, 'notes': 'Extreme values are caused by participants completing task in multiple sessions.'}
{'variable_name': 'timing_duration_mins', 'variable_label': 'Duration of the survey session (in minutes)', 'variable_category': 'constructed', 'variable_type': 'float', 'question_text': '-', 'num_missing': 0, 'num_unique': 977, 'values': {'mean': 35.9, 'std': 342.6, 'min': 2.7, 'max': 8832.1}, 'notes': 'timing_duration_s / 60. Extreme values are caused by participants completing task in multiple sessions.'}


In [8]:
# Generated datetime
variable_name = "generated_datetime"
variable_label = "Recorded date of the survey completion"
variable_category = "meta"
variable_type = "datetime"
question_text = "-"
notes = "End time, not start time"
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'generated_datetime', 'variable_label': 'Recorded date of the survey completion', 'variable_category': 'meta', 'variable_type': 'datetime', 'question_text': '-', 'num_missing': 0, 'num_unique': 1492, 'values': {'earliest date': '2023-11-22 15:48:46', 'latest_date': '2023-12-22 06:56:27'}, 'notes': 'End time, not start time'}


In [9]:
# Consent
variable_name = "consent"
variable_label = "Participant informed consent confirmation"
variable_category = "direct"
variable_type = "categorical"
question_text = survey_mapping_dict["consent"]["full_question"]
notes = "See full informed consent document for details"
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

# Consent age
variable_name = "consent_age"
variable_label = "Participant age confirmation"
variable_category = "direct"
variable_type = "categorical"
question_text = survey_mapping_dict["consent_age"]["full_question"]
notes = "See full informed consent document for details"
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'consent', 'variable_label': 'Participant informed consent confirmation', 'variable_category': 'direct', 'variable_type': 'categorical', 'question_text': 'If you have read the information above and agree to participate with the understanding that the data (including any personal data) you submit will be processed accordingly, please select the box below to start.', 'num_missing': 0, 'num_unique': 1, 'values': {'Yes, I consent to take part': 1500}, 'notes': 'See full informed consent document for details'}
{'variable_name': 'consent_age', 'variable_label': 'Participant age confirmation', 'variable_category': 'direct', 'variable_type': 'categorical', 'question_text': 'Please note that you may only participate in this survey if you are 18 years of age or over.', 'num_missing': 0, 'num_unique': 1, 'values': {'I certify that I am 18 years of age or over': 1500}, 'notes': 'See full informed consent document for details'}


### LLM columns

In [10]:
# LLM familiarity
variable_name = "lm_familiarity"
variable_label = "Familiarity with LLMs"
variable_category = "direct"
variable_type = "categorical"
question_text = survey_mapping_dict["lm_familiarity"]["full_question"]
notes = ""
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

# Direct use
variable_name = "lm_direct_use"
variable_label = "Direct use of LLMs"
variable_category = "direct"
variable_type = "categorical"
question_text = survey_mapping_dict["lm_direct_use"]["full_question"]
notes = ""
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

# Indirect use
variable_name = "lm_indirect_use"
variable_label = "Direct use of LLMs"
variable_category = "direct"
variable_type = "categorical"
question_text = survey_mapping_dict["lm_direct_use"]["full_question"]
notes = ""
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

# Frequency of use
variable_name = "lm_frequency_use"
variable_label = "Frequency of using Large Language Models"
variable_category = "direct"
variable_type = "categorical"
question_text = survey_mapping_dict["lm_frequency_use"]["full_question"]
notes = "Only shown if lm_indirect_use==1 OR lm_direct_use==1. Null indicates particiant did not see question."
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'lm_familiarity', 'variable_label': 'Familiarity with LLMs', 'variable_category': 'direct', 'variable_type': 'categorical', 'question_text': 'How familiar are you with AI language models like ChatGPT?', 'num_missing': 0, 'num_unique': 3, 'values': {'Somewhat familiar': 920, 'Very familiar': 424, 'Not familiar at all': 156}, 'notes': ''}
{'variable_name': 'lm_direct_use', 'variable_label': 'Direct use of LLMs', 'variable_category': 'direct', 'variable_type': 'categorical', 'question_text': 'Have you directly used or communicated with an AI language model, such as asking questions to ChatGPT, BARD, Claude or other similar models?', 'num_missing': 0, 'num_unique': 3, 'values': {'Yes': 1162, 'No': 259, 'Unsure': 79}, 'notes': ''}
{'variable_name': 'lm_indirect_use', 'variable_label': 'Direct use of LLMs', 'variable_category': 'direct', 'variable_type': 'categorical', 'question_text': 'Have you directly used or communicated with an AI language model, such as asking questio

### Nested usecase and prefs columns

In [11]:
variable_name = "lm_usecases"
variable_label = "Use cases of LLMs"
variable_category = "direct"
variable_type = "dict"
header_df = header_usecases
header_col = "survey_text"
question_text = (
    header_df["survey_text"][header_df["header"] == "shared_opener"]
    .iloc[0]
    .split("-")[0]
    .strip()
)
notes = """Question only show if lm_direct_use==1 OR lm_indirect_use==1.
N Missing indicates the participants who have at least one missing value in the usecases (besides from 'other_text').
N Unique indicates the unique combinations of use cases selected by participants.
On 'other_text', Null indicates participant did not type anything.
On all other keys, 0 indicates participant saw question and did not select usecase. Null indicates participant did not see question."""
return_codebook_entry_nested(
    ENTRIES,
    survey,
    header_df,
    header_col,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'lm_usecases', 'variable_label': 'Use cases of LLMs', 'variable_category': 'direct', 'variable_type': 'dict', 'question_text': 'Which of the following scenarios best describe how and why you use AI language models? Select all that apply.', 'num_missing': 247, 'num_unique': 853, 'values': {'nested_values': [{'variable_name': 'homework_assistance', 'variable_label': '**Homework Assistance**: Getting help with school or university assignments.', False: 967, True: 533}, {'variable_name': 'research', 'variable_label': '**Research**: Fact-checking or gaining overviews on specific topics.', True: 864, False: 636}, {'variable_name': 'source_suggestions', 'variable_label': '**Source Suggestions**: Creating or finding bibliographies, information sources or reading lists.', False: 1036, True: 464}, {'variable_name': 'professional_work', 'variable_label': '**Professional Work**: Assisting in drafting, editing, or brainstorming content for work.', False: 784, True: 716}, {'variabl

In [12]:
variable_name = "order_lm_usecases"
variable_label = "Use cases of LLMs (order of options presented in survey)"
variable_category = "meta"
variable_type = "dict"
header_df = header_usecases
header_col = "survey_text"
question_text = "-"
notes = """Integer 1-18 indicating random order that usecase option was presented to participant.
For 'other', option is always shown last so will always be 19.
Null indicates participant did not see question.
The usecases as the same as in lm_usecases."""
return_codebook_entry_nested(
    ENTRIES,
    survey,
    header_df,
    header_col,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
    show_values=False,
)

{'variable_name': 'order_lm_usecases', 'variable_label': 'Use cases of LLMs (order of options presented in survey)', 'variable_category': 'meta', 'variable_type': 'dict', 'question_text': '-', 'num_missing': 247, 'num_unique': 1254, 'values': {'-': '-'}, 'notes': "Integer 1-18 indicating random order that usecase option was presented to participant.\nFor 'other', option is always shown last so will always be 19.\nNull indicates participant did not see question.\nThe usecases as the same as in lm_usecases."}


In [13]:
variable_name = "stated_prefs"
variable_label = "Stated preferences over LLM behaviours"
variable_category = "direct"
variable_type = "dict"
header_df = header_prefs
header_col = "survey_text"
question_text = """Rate each of the following statements about your opinion on the importance of different AI language model behaviors or traits.
It is important that an AI language model..."""
notes = """Sliders from [Strongly disagree] to [Strongly agree] are recorded on a 0-100 scale. Participant does not see numeric value.
N Missing indicates the participants who have at least one missing value in the attributes (besides from 'other_text').
N Unique indicates the unique combinations of use cases selected by participants.
On 'other_text', Null indicates participant did not type anything.
Note that this scale (on Qualtrics) runs 0-100. The Conversations rating scales (for choice_attributes, performance_attributes on Dynabench) run 1-100.
"""

return_codebook_entry_nested(
    ENTRIES,
    survey,
    header_df,
    header_col,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'stated_prefs', 'variable_label': 'Stated preferences over LLM behaviours', 'variable_category': 'direct', 'variable_type': 'dict', 'question_text': 'Rate each of the following statements about your opinion on the importance of different AI language model behaviors or traits.\nIt is important that an AI language model...', 'num_missing': 0, 'num_unique': 1475, 'values': {'nested_values': [{'variable_name': 'values', 'variable_label': '...reflects my values or cultural perspectives', 'mean': 54.3, 'std': 26.3, 'min': 0.0, 'max': 100.0}, {'variable_name': 'creativity', 'variable_label': '...produces responses that are creative and inspiring', 'mean': 69.6, 'std': 22.1, 'min': 0.0, 'max': 100.0}, {'variable_name': 'fluency', 'variable_label': '...produces responses that are well-written and coherent', 'mean': 86.7, 'std': 16.3, 'min': 2.0, 'max': 100.0}, {'variable_name': 'factuality', 'variable_label': '...produces factual and informative responses', 'mean': 88.7, 'std'

In [14]:
variable_name = "order_stated_prefs"
variable_label = (
    "Stated preferences over LLM behaviours (order of options presented in survey)"
)
variable_category = "meta"
variable_type = "dict"
header_df = header_prefs
header_col = "survey_text"
question_text = "-"
notes = """Integer 1-8 indicating random order that attribute slider was presented to participant.
For 'other', option is always shown last so will always be 9.
Null indicates participant did not see question.
The attributes as the same as in stated_prefs."""
return_codebook_entry_nested(
    ENTRIES,
    survey,
    header_df,
    header_col,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
    show_values=False,
)

{'variable_name': 'order_stated_prefs', 'variable_label': 'Stated preferences over LLM behaviours (order of options presented in survey)', 'variable_category': 'meta', 'variable_type': 'dict', 'question_text': '-', 'num_missing': 0, 'num_unique': 1467, 'values': {'-': '-'}, 'notes': "Integer 1-8 indicating random order that attribute slider was presented to participant.\nFor 'other', option is always shown last so will always be 9.\nNull indicates participant did not see question.\nThe attributes as the same as in stated_prefs."}


### Text columns

In [15]:
variable_name = "self_description"
variable_label = "Participant self-written profile describing themself"
variable_category = "direct"
variable_type = "string"
question_text = survey_mapping_dict["self_description"]["full_question"]
notes = ""
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

variable_name = "system_string"
variable_label = "Participant self-written system string, constitution or custom instructions for an LLM"
variable_category = "direct"
variable_type = "string"
question_text = survey_mapping_dict["system_string"]["full_question"]
notes = ""
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'self_description', 'variable_label': 'Participant self-written profile describing themself', 'variable_category': 'direct', 'variable_type': 'string', 'question_text': "Please briefly describe your values, core beliefs, guiding principles in life, or other things that are important to you. For example, you might include values you'd want to teach to your children or qualities you look for in friends. There are no right or wrong answers. Please do not provide any personally identifiable details like your name, address or email. Please write 2-5 sentences in your own words.", 'num_missing': 0, 'num_unique': 1500, 'values': {'mean chars': 241.3, 'std chars': 134.6, 'min chars': 3.0, 'max chars': 1547.0}, 'notes': ''}
{'variable_name': 'system_string', 'variable_label': 'Participant self-written system string, constitution or custom instructions for an LLM', 'variable_category': 'direct', 'variable_type': 'string', 'question_text': "Imagine you are instructing an AI lang

### Demographic columns

In [16]:
# Basic demographics
cols = [
    "age",
    "education",
    "employment_status",
    "marital_status",
    "english_proficiency",
]
for c in cols:
    variable_name = c
    variable_label = c.replace("_", " ").title()
    variable_category = "direct"
    variable_type = "categorical"
    question_text = survey_mapping_dict[c]["full_question"]
    notes = ""
    return_codebook_entry(
        ENTRIES,
        survey,
        variable_name,
        variable_label,
        variable_category,
        variable_type,
        question_text,
        notes,
    )

# Gender
variable_name = "gender"
variable_label = "Gender"
variable_category = "constructed"
variable_type = "categorical"
question_text = survey_mapping_dict[c]["full_question"]
notes = "Participants could chose Male, Female, Non-binary / third Gender, Prefer not to say, or write in their own response. Two independent annotators then categorised the self-describe responses only when abundantly clear they fit another category. See paper for details."
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'age', 'variable_label': 'Age', 'variable_category': 'direct', 'variable_type': 'categorical', 'question_text': 'How old are you?', 'num_missing': 0, 'num_unique': 7, 'values': {'25-34 years old': 454, '18-24 years old': 297, '35-44 years old': 237, '45-54 years old': 208, '55-64 years old': 197, '65+ years old': 106, 'Prefer not to say': 1}, 'notes': ''}
{'variable_name': 'education', 'variable_label': 'Education', 'variable_category': 'direct', 'variable_type': 'categorical', 'question_text': 'What is the highest level of education you have completed?', 'num_missing': 0, 'num_unique': 9, 'values': {'University Bachelors Degree': 637, 'Graduate / Professional degree': 241, 'Some University but no degree': 236, 'Completed Secondary School': 209, 'Vocational': 125, 'Some Secondary': 24, 'Completed Primary School': 16, 'Prefer not to say': 9, 'Some Primary': 3}, 'notes': ''}
{'variable_name': 'employment_status', 'variable_label': 'Employment Status', 'variable_category

In [17]:
# Religion and Ethnicity (nested)
for c in ["religion", "ethnicity"]:
    variable_name = c
    variable_label = f"Dictionary of {c} information."
    variable_category = "NA"
    variable_type = "dict"
    question_text = "-"
    notes = "Keys explained below."
    return_codebook_entry(
        ENTRIES,
        survey,
        variable_name,
        variable_label,
        variable_category,
        variable_type,
        question_text,
        notes,
    )

    variable_name = f"{c}_self_described"
    variable_label = "Participant {c} self-description"
    variable_category = "direct"
    variable_type = "string"
    question_text = (
        survey_mapping_dict[f"{c}_1_text"]["full_question"].split("-")[0].strip()
    )
    notes = (
        "Participant had option to type and Self Describe or select Prefer not to say."
    )
    return_codebook_entry(
        ENTRIES,
        survey,
        variable_name,
        variable_label,
        variable_category,
        variable_type,
        question_text,
        notes,
    )

    variable_name = f"{c}_categorised"
    variable_label = f"Granular categories of participant {c}"
    variable_category = "constructed"
    variable_type = "categorical"
    question_text = "-"
    notes = "Two independent annotators manually verified all automated classifications (gpt-4-turbo) of the self-describe string. See paper for details."
    return_codebook_entry(
        ENTRIES,
        survey,
        variable_name,
        variable_label,
        variable_category,
        variable_type,
        question_text,
        notes,
    )

    variable_name = f"{c}_simplified"
    variable_label = f"Simplified categories of participant {c}"
    variable_category = "constructed"
    variable_type = "categorical"
    question_text = "-"
    notes = f"Simplified version of {c}_categorised for more aggregate analysis."
    return_codebook_entry(
        ENTRIES,
        survey,
        variable_name,
        variable_label,
        variable_category,
        variable_type,
        question_text,
        notes,
    )

{'variable_name': 'religion', 'variable_label': 'Dictionary of religion information.', 'variable_category': 'NA', 'variable_type': 'dict', 'question_text': '-', 'num_missing': '-', 'num_unique': '-', 'values': {'-': '-'}, 'notes': 'Keys explained below.'}
{'variable_name': 'religion_self_described', 'variable_label': 'Participant {c} self-description', 'variable_category': 'direct', 'variable_type': 'string', 'question_text': 'What is your religious affiliation?', 'num_missing': 0, 'num_unique': 137, 'values': {'mean chars': 12.2, 'std chars': 5.7, 'min chars': 2.0, 'max chars': 112.0}, 'notes': 'Participant had option to type and Self Describe or select Prefer not to say.'}
{'variable_name': 'religion_categorised', 'variable_label': 'Granular categories of participant religion', 'variable_category': 'constructed', 'variable_type': 'categorical', 'question_text': '-', 'num_missing': 0, 'num_unique': 12, 'values': {'Non-religious': 762, 'Christian': 487, 'Agnostic': 71, 'Prefer not to s

In [18]:
variable_name = "location"
variable_label = "Dictionary of location information."
variable_category = "NA"
variable_type = "dict"
question_text = "-"
notes = "Keys explained below."
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)


for c, l in zip(["birth", "reside"], ["birth", "residence"]):
    # Location (nested)
    variable_name = f"location_{c}_country"
    variable_label = f"Participant country of {l}"
    variable_category = "direct"
    variable_type = "categorical"
    if c == "birth":
        question_text = "In which country were you born?"
    elif c == "reside":
        question_text = "In which country do you currently reside?"
    notes = "Selected from standardised dropdown country list."
    return_codebook_entry(
        ENTRIES,
        survey,
        variable_name,
        variable_label,
        variable_category,
        variable_type,
        question_text,
        notes,
        show_values=False,
    )

    variable_name = f"location_{c}_countryISO"
    variable_label = f"ISO 3166-1 alpha-3 code for the country of {l}"
    variable_category = "constructed"
    variable_type = "categorical"
    question_text = "-"
    notes = ""
    return_codebook_entry(
        ENTRIES,
        survey,
        variable_name,
        variable_label,
        variable_category,
        variable_type,
        question_text,
        notes,
        show_values=False,
    )

    variable_name = f"location_{c}_subregion"
    variable_label = f"Participant sub-region of {l}"
    variable_category = "constructed"
    variable_type = "categorical"
    question_text = "-"
    notes = f"Mapped from country of {l}, based on United Nations defined subregions."
    return_codebook_entry(
        ENTRIES,
        survey,
        variable_name,
        variable_label,
        variable_category,
        variable_type,
        question_text,
        notes,
        show_values=False,
    )


variable_name = "location_same_birth_reside_country"
variable_label = "Whether the participant was born and resides in the same country"
variable_category = "constructed"
variable_type = "binary"
question_text = "-"
notes = ""
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

variable_name = "location_special_region"
variable_label = "Adjusted regional categories for unique sample properties"
variable_category = "constructed"
variable_type = "categorical"
question_text = "-"
notes = "Within regions and sub-regions, some countries are split out to better represent sample density (e.g., treating UK and US samples seperately from Europe and North America)."
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'location', 'variable_label': 'Dictionary of location information.', 'variable_category': 'NA', 'variable_type': 'dict', 'question_text': '-', 'num_missing': '-', 'num_unique': '-', 'values': {'-': '-'}, 'notes': 'Keys explained below.'}
{'variable_name': 'location_birth_country', 'variable_label': 'Participant country of birth', 'variable_category': 'direct', 'variable_type': 'categorical', 'question_text': 'In which country were you born?', 'num_missing': 0, 'num_unique': 75, 'values': {'Too many values to show': '-'}, 'notes': 'Selected from standardised dropdown country list.'}
{'variable_name': 'location_birth_countryISO', 'variable_label': 'ISO 3166-1 alpha-3 code for the country of birth', 'variable_category': 'constructed', 'variable_type': 'categorical', 'question_text': '-', 'num_missing': 0, 'num_unique': 75, 'values': {'Too many values to show': '-'}, 'notes': ''}
{'variable_name': 'location_birth_subregion', 'variable_label': 'Participant sub-region of bi

### Study columns

In [19]:
# Study params
variable_name = "study_id"
variable_label = "Unique study idenfitier on Prolific"
variable_category = "meta"
variable_type = "string id"
question_text = "-"
notes = ""

return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

variable_name = "study_locale"
variable_label = "Recruitment country of Prolific study"
variable_category = "meta"
variable_type = "categorical"
question_text = "-"
notes = ""
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
    show_values=False,
)

{'variable_name': 'study_id', 'variable_label': 'Unique study idenfitier on Prolific', 'variable_category': 'meta', 'variable_type': 'string id', 'question_text': '-', 'num_missing': 0, 'num_unique': 51, 'values': {'-': '-'}, 'notes': ''}
{'variable_name': 'study_locale', 'variable_label': 'Recruitment country of Prolific study', 'variable_category': 'meta', 'variable_type': 'categorical', 'question_text': '-', 'num_missing': 0, 'num_unique': 33, 'values': {'Too many values to show': '-'}, 'notes': ''}


### Other constructed columns

In [20]:
# Other constructed columns
for r in ["UK", "US"]:
    variable_name = f"included_in_{r}_REP"
    variable_label = f"Indicator if participant was included in the rebalanced {r} representative sample"
    variable_category = "constructed"
    variable_type = "binary"
    question_text = "-"
    notes = "Census-representative samples were rebalanced to mitigate sampling issues. See paper for details."
    return_codebook_entry(
        ENTRIES,
        survey,
        variable_name,
        variable_label,
        variable_category,
        variable_type,
        question_text,
        notes,
    )

variable_name = "included_in_balanced_subset"
variable_label = (
    "Indicator if participant's conversations are included in the balanced subset"
)
variable_category = "constructed"
variable_type = "binary"
question_text = "-"
notes = """Balanced subset was created to equally sample conversations of three types (unguided, values, controversy).
We only include participants who have at least one of each conversation type, and then ensure equal numbers of each type are retained.
See paper for details."""
return_codebook_entry(
    ENTRIES,
    survey,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'included_in_UK_REP', 'variable_label': 'Indicator if participant was included in the rebalanced UK representative sample', 'variable_category': 'constructed', 'variable_type': 'binary', 'question_text': '-', 'num_missing': 0, 'num_unique': 2, 'values': {False: 1257, True: 243}, 'notes': 'Census-representative samples were rebalanced to mitigate sampling issues. See paper for details.'}
{'variable_name': 'included_in_US_REP', 'variable_label': 'Indicator if participant was included in the rebalanced US representative sample', 'variable_category': 'constructed', 'variable_type': 'binary', 'question_text': '-', 'num_missing': 0, 'num_unique': 2, 'values': {False: 1270, True: 230}, 'notes': 'Census-representative samples were rebalanced to mitigate sampling issues. See paper for details.'}
{'variable_name': 'included_in_balanced_subset', 'variable_label': "Indicator if participant's conversations are included in the balanced subset", 'variable_category': 'constructed', '

In [21]:
SURVEY_ENTRIES = ENTRIES
print(f"There are {len(SURVEY_ENTRIES)} entries in the survey codebook.")

There are 46 entries in the survey codebook.


## The Conversations

In [22]:
# Start new entry set for conversations
ENTRIES = []
conversations = data_dict["conversations"]
print(conversations.columns)

# Unnest the columns
nested_columns = ["performance_attributes", "choice_attributes"]
conversations = unnest_columns(conversations, nested_columns)

print(conversations.columns)

Index(['conversation_id', 'user_id', 'included_in_balanced_subset',
       'generated_datetime', 'timing_duration_s', 'timing_duration_mins',
       'conversation_type', 'opening_prompt', 'conversation_turns',
       'conversation_history', 'performance_attributes', 'choice_attributes',
       'open_feedback'],
      dtype='object')
Index(['conversation_id', 'user_id', 'included_in_balanced_subset',
       'generated_datetime', 'timing_duration_s', 'timing_duration_mins',
       'conversation_type', 'opening_prompt', 'conversation_turns',
       'conversation_history', 'performance_attributes', 'choice_attributes',
       'open_feedback', 'performance_attributes_values',
       'performance_attributes_fluency', 'performance_attributes_factuality',
       'performance_attributes_safety', 'performance_attributes_diversity',
       'performance_attributes_creativity',
       'performance_attributes_helpfulness', 'choice_attributes_values',
       'choice_attributes_fluency', 'choice_attri

### Base columns

In [23]:
# Base columns
variable_name = "user_id"
variable_label = "Unique participant identifier"
variable_category = "meta"
variable_type = "string id"
question_text = "-"
notes = "Pseudonymized from Prolific worker ID. Used to link conversation data to survey data."
return_codebook_entry(
    ENTRIES,
    conversations,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

variable_name = "conversation_id"
variable_label = "Unique conversation identifier"
variable_category = "meta"
variable_type = "string id"
question_text = "-"
notes = ""
return_codebook_entry(
    ENTRIES,
    conversations,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

variable_name = "included_in_balanced_subset"
variable_label = (
    "Indicator if participant's conversations are included in the balanced subset"
)
variable_category = "constructed"
variable_type = "binary"
question_text = "-"
notes = """Balanced subset was created to equally sample conversations of three types (unguided, values, controversy).
We only include participants who have at least one of each conversation type, and then ensure equal numbers of each type are retained.
See paper for details."""
return_codebook_entry(
    ENTRIES,
    conversations,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'user_id', 'variable_label': 'Unique participant identifier', 'variable_category': 'meta', 'variable_type': 'string id', 'question_text': '-', 'num_missing': 0, 'num_unique': 1396, 'values': {'-': '-'}, 'notes': 'Pseudonymized from Prolific worker ID. Used to link conversation data to survey data.'}
{'variable_name': 'conversation_id', 'variable_label': 'Unique conversation identifier', 'variable_category': 'meta', 'variable_type': 'string id', 'question_text': '-', 'num_missing': 0, 'num_unique': 8011, 'values': {'-': '-'}, 'notes': ''}
{'variable_name': 'included_in_balanced_subset', 'variable_label': "Indicator if participant's conversations are included in the balanced subset", 'variable_category': 'constructed', 'variable_type': 'binary', 'question_text': '-', 'num_missing': 0, 'num_unique': 2, 'values': {True: 6696, False: 1315}, 'notes': 'Balanced subset was created to equally sample conversations of three types (unguided, values, controversy).\nWe only include

In [24]:
# Timing
variable_name = "timing_duration_s"
variable_label = "Duration of the conversation (in seconds)"
variable_category = "meta"
variable_type = "float"
question_text = "-"
notes = (
    "Extreme values are caused by participants completing task in multiple sessions."
)
return_codebook_entry(
    ENTRIES,
    conversations,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)


variable_name = "timing_duration_mins"
variable_label = "Duration of the conversation (in minutes)"
variable_category = "constructed"
variable_type = "float"
notes = "timing_duration_s / 60. Extreme values are caused by participants completing task in multiple sessions."
return_codebook_entry(
    ENTRIES,
    conversations,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'timing_duration_s', 'variable_label': 'Duration of the conversation (in seconds)', 'variable_category': 'meta', 'variable_type': 'float', 'question_text': '-', 'num_missing': 0, 'num_unique': 7656, 'values': {'mean': 555.9, 'std': 422.1, 'min': 73.5, 'max': 17145.8}, 'notes': 'Extreme values are caused by participants completing task in multiple sessions.'}
{'variable_name': 'timing_duration_mins', 'variable_label': 'Duration of the conversation (in minutes)', 'variable_category': 'constructed', 'variable_type': 'float', 'question_text': '-', 'num_missing': 0, 'num_unique': 1948, 'values': {'mean': 9.3, 'std': 7.0, 'min': 1.2, 'max': 285.8}, 'notes': 'timing_duration_s / 60. Extreme values are caused by participants completing task in multiple sessions.'}


In [25]:
# Generated datetime
variable_name = "generated_datetime"
variable_label = "Recorded date of the conversation completion"
variable_category = "meta"
variable_type = "datetime"
question_text = "-"
notes = "Recorded at end of conversation, before fine-grained feedback page shown."
return_codebook_entry(
    ENTRIES,
    conversations,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'generated_datetime', 'variable_label': 'Recorded date of the conversation completion', 'variable_category': 'meta', 'variable_type': 'datetime', 'question_text': '-', 'num_missing': 0, 'num_unique': 7820, 'values': {'earliest date': '2023-11-22 15:55:46', 'latest_date': '2023-12-22 08:04:46'}, 'notes': 'Recorded at end of conversation, before fine-grained feedback page shown.'}


### Text columns

In [26]:
variable_name = "opening_prompt"
variable_label = "Opening human-written prompt of the conversation"
variable_category = "direct"
variable_type = "string"
question_text = "Now start the conversation with your question, request or statement."
notes = """We provide the following soft guidance:
Need some inspiration? You can request help with a task (like writing a recipe, organising an activity or event, completing an assignment)... You can chitchat, have casual conversation or seek personal advice. You can ask questions about the world, current events or your viewpoints."""
return_codebook_entry(
    ENTRIES,
    conversations,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

variable_name = "open_feedback"
variable_label = "Participant written feedback on the conversation as a whole."
variable_category = "direct"
variable_type = "string"
question_text = """Give the model some feedback on the conversation as whole. Hypothetically, what would an ideal interaction for you look like here? What was good and what was bad? What (if anything) was missing? What would you change to make the conversation better?
Please write 2-5 sentences in your own words."""
notes = "Entry box reads: Enter text here. Do not copy and paste."
return_codebook_entry(
    ENTRIES,
    conversations,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'opening_prompt', 'variable_label': 'Opening human-written prompt of the conversation', 'variable_category': 'direct', 'variable_type': 'string', 'question_text': 'Now start the conversation with your question, request or statement.', 'num_missing': 0, 'num_unique': 7811, 'values': {'mean chars': 65.7, 'std chars': 59.2, 'min chars': 2.0, 'max chars': 1195.0}, 'notes': 'We provide the following soft guidance:\nNeed some inspiration? You can request help with a task (like writing a recipe, organising an activity or event, completing an assignment)... You can chitchat, have casual conversation or seek personal advice. You can ask questions about the world, current events or your viewpoints.'}
{'variable_name': 'open_feedback', 'variable_label': 'Participant written feedback on the conversation as a whole.', 'variable_category': 'direct', 'variable_type': 'string', 'question_text': 'Give the model some feedback on the conversation as whole. Hypothetically, what would an 

### Convo columns

In [27]:
variable_name = "conversation_type"
variable_label = "Type of conversation (from pre-defined categories)"
variable_category = "direct"
variable_type = "categorical"
question_text = """Choose what type of conversation you want to have."""
notes = """Participants pick from the following radio buttons:
Unguided. Ask, request or talk to the model about anything . It is up to you!
Values guided. Ask, request or talk to the model about something important to you or that represents your values. This could be related to work, religion, family and relationship, politics or culture.
Controversy guided. Ask, request or talk to the model about something controversial or where people would disagree in your community, culture or country.
We also provide the additional instruction: Remember if you are here as a paid study participant, you need to do two of each type. If you are here as a volunteer, then take your pick!
"""
return_codebook_entry(
    ENTRIES,
    conversations,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'conversation_type', 'variable_label': 'Type of conversation (from pre-defined categories)', 'variable_category': 'direct', 'variable_type': 'categorical', 'question_text': 'Choose what type of conversation you want to have.', 'num_missing': 0, 'num_unique': 3, 'values': {'unguided': 3113, 'values guided': 2460, 'controversy guided': 2438}, 'notes': 'Participants pick from the following radio buttons:\nUnguided. Ask, request or talk to the model about anything . It is up to you!\nValues guided. Ask, request or talk to the model about something important to you or that represents your values. This could be related to work, religion, family and relationship, politics or culture.\nControversy guided. Ask, request or talk to the model about something controversial or where people would disagree in your community, culture or country.\nWe also provide the additional instruction: Remember if you are here as a paid study participant, you need to do two of each type. If you ar

In [28]:
variable_name = "conversation_turns"
variable_label = "Number of human-model turns (back-and-forths) in the conversation."
variable_category = "meta"
variable_type = "int"
question_text = "-"
notes = """We force 2 turns as the minimum. After the opening turn, we give the instruction:
Now continue the conversation. Conversations can be between 2 and 10 turns. Try to vary the length. When you're done, click Finish."""
return_codebook_entry(
    ENTRIES,
    conversations,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'conversation_turns', 'variable_label': 'Number of human-model turns (back-and-forths) in the conversation.', 'variable_category': 'meta', 'variable_type': 'int', 'question_text': '-', 'num_missing': 0, 'num_unique': 13, 'values': {'mean': 3.4, 'std': 1.6, 'min': 2.0, 'max': 22.0}, 'notes': "We force 2 turns as the minimum. After the opening turn, we give the instruction:\nNow continue the conversation. Conversations can be between 2 and 10 turns. Try to vary the length. When you're done, click Finish."}


In [29]:
variable_name = "conversation_history"
variable_label = "Full conversation history (human and model messages, with scores and model metadata)"
variable_category = "direct"
variable_type = "dict"
question_text = "-"
notes = """We provide an example of what this nested conversation history looks like below."""
return_codebook_entry(
    ENTRIES,
    conversations,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
    show_values=False,
)

{'variable_name': 'conversation_history', 'variable_label': 'Full conversation history (human and model messages, with scores and model metadata)', 'variable_category': 'direct', 'variable_type': 'dict', 'question_text': '-', 'num_missing': '-', 'num_unique': '-', 'values': {'Too many values to show': '-'}, 'notes': 'We provide an example of what this nested conversation history looks like below.'}


### Nested attribute columns

In [30]:
variable_name = "performance_attributes"
variable_label = (
    "How well the top-rated model response performed across different attributes"
)
variable_category = "nested"
variable_type = "dict"
question_text = """Tell us how the model performed. Consider your first message and the top-rated response.
Rate the following statements about the performance across different attributes.
This response..."""
notes = """Sliders from [Performed very poorly] to [Performed very well] are recorded on a 1-100 scale. Participant does not see numeric value.
Note that the attributes align choice_attributes, as well as with the stated preference ratings from The Survey.
Participants had option to select N/A, which is recorded as Null.
N Missing indicates the number of participants who have at least one missing value in the nested columns.
N Unique indicates the unique combinations of use cases selected by participants.
There was no option for 'other'.
Note, these sliders run from 1-100 (on Dynabench). The sliders for stated_prefs (in Survey on Qualtrics) run 0-100."""
header_df = header_prefs
header_col = "conversations_performance_factors"
return_codebook_entry_nested(
    ENTRIES,
    conversations,
    header_df,
    header_col,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'performance_attributes', 'variable_label': 'How well the top-rated model response performed across different attributes', 'variable_category': 'nested', 'variable_type': 'dict', 'question_text': 'Tell us how the model performed. Consider your first message and the top-rated response.\nRate the following statements about the performance across different attributes.\nThis response...', 'num_missing': 1824, 'num_unique': 7532, 'values': {'nested_values': [{'variable_name': 'values', 'variable_label': '...reflected my values or cultural perspective', 'mean': 74.1, 'std': 22.2, 'min': 1.0, 'max': 100.0}, {'variable_name': 'fluency', 'variable_label': '...was well-written and coherent', 'mean': 84.3, 'std': 18.3, 'min': 1.0, 'max': 100.0}, {'variable_name': 'factuality', 'variable_label': '...was factual and informative', 'mean': 79.2, 'std': 21.5, 'min': 1.0, 'max': 100.0}, {'variable_name': 'safety', 'variable_label': "...was safe and doesn't risk harm to myself and othe

In [31]:
variable_name = "choice_attributes"
variable_label = "How different attributes influenced the participant's choice of the top-rated model response"
variable_category = "direct"
variable_type = "dict"
question_text = """Tell us why you chose this response over others. Consider your first message and top-rated response compared to other responses.
Rate the following statements about the importance of different attributes in your decision.
I chose this response..."""
notes = """Sliders from [Very unimportant] to [Very important] are recorded on a 1-100 scale. Participant does not see numeric value.
Note that the attributes align with performance_attributes, as well as the stated preference ratings from The Survey.
Participants had option to select N/A, which is recorded as Null.
num_missing indicates the number of participants who have at least one missing value in the nested columns.
num_unique indicates the unique combinations of use cases selected by participants.
There was no option for 'other'.
Note, these sliders run from 1-100 (on Dynabench). The sliders for stated_prefs (in Survey on Qualtrics) run 0-100."""
header_df = header_prefs
header_col = "conversations_performance_factors"
return_codebook_entry_nested(
    ENTRIES,
    conversations,
    header_df,
    header_col,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'choice_attributes', 'variable_label': "How different attributes influenced the participant's choice of the top-rated model response", 'variable_category': 'direct', 'variable_type': 'dict', 'question_text': 'Tell us why you chose this response over others. Consider your first message and top-rated response compared to other responses.\nRate the following statements about the importance of different attributes in your decision.\nI chose this response...', 'num_missing': 1740, 'num_unique': 7526, 'values': {'nested_values': [{'variable_name': 'values', 'variable_label': '...reflected my values or cultural perspective', 'mean': 66.9, 'std': 27.2, 'min': 1.0, 'max': 100.0}, {'variable_name': 'fluency', 'variable_label': '...was well-written and coherent', 'mean': 82.5, 'std': 18.5, 'min': 1.0, 'max': 100.0}, {'variable_name': 'factuality', 'variable_label': '...was factual and informative', 'mean': 79.3, 'std': 21.0, 'min': 1.0, 'max': 100.0}, {'variable_name': 'safety',

In [32]:
CONVO_ENTRIES = ENTRIES
print(f"There are {len(CONVO_ENTRIES)} entries in the convos codebook.")

There are 13 entries in the convos codebook.


## The Utterances

In [33]:
# Start new entry set for utterances
ENTRIES = []
utterances = data_dict["utterances"]
print(utterances.columns)

Index(['utterance_id', 'interaction_id', 'conversation_id', 'user_id', 'turn',
       'within_turn_id', 'included_in_balanced_subset', 'conversation_type',
       'user_prompt', 'model_response', 'model_name', 'model_provider',
       'score', 'if_chosen'],
      dtype='object')


### Basic columns

In [34]:
# Base columns
variable_name = "user_id"
variable_label = "Unique participant identifier"
variable_category = "meta"
variable_type = "string id"
question_text = "-"
notes = (
    "Pseudonymized from Prolific worker ID. Used to link utterance data to survey data."
)
return_codebook_entry(
    ENTRIES,
    utterances,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

variable_name = "conversation_id"
variable_label = "Unique conversation identifier"
variable_category = "meta"
variable_type = "string id"
question_text = "-"
notes = "Used to link utterance data to conversation data."
return_codebook_entry(
    ENTRIES,
    utterances,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

variable_name = "interaction_id"
variable_label = "Unique interaction identifier, where an interaction is a turn within a conversation (single human message with multiple model responses)"
variable_category = "meta"
variable_type = "string id"
question_text = "-"
notes = ""
return_codebook_entry(
    ENTRIES,
    utterances,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

variable_name = "utterance_id"
variable_label = "Unique utterance identifier, where an utterance is a single human message - single model response pair"
variable_category = "meta"
variable_type = "string id"
question_text = "-"
notes = ""
return_codebook_entry(
    ENTRIES,
    utterances,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

variable_name = "within_turn_id"
variable_label = (
    "Within turn identifier of up to four model responses to a single human message"
)
variable_category = "meta"
variable_type = "string id"
question_text = "-"
notes = "Order is random, not based on score or presentation in interface"
return_codebook_entry(
    ENTRIES,
    utterances,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

variable_name = "included_in_balanced_subset"
variable_label = (
    "Indicator if participant's conversations are included in the balanced subset"
)
variable_category = "constructed"
variable_type = "binary"
question_text = "-"
notes = """Balanced subset was created to equally sample conversations of three types (unguided, values, controversy).
We only include participants who have at least one of each conversation type, and then ensure equal numbers of each type are retained.
See paper for details."""
return_codebook_entry(
    ENTRIES,
    utterances,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'user_id', 'variable_label': 'Unique participant identifier', 'variable_category': 'meta', 'variable_type': 'string id', 'question_text': '-', 'num_missing': 0, 'num_unique': 1396, 'values': {'-': '-'}, 'notes': 'Pseudonymized from Prolific worker ID. Used to link utterance data to survey data.'}
{'variable_name': 'conversation_id', 'variable_label': 'Unique conversation identifier', 'variable_category': 'meta', 'variable_type': 'string id', 'question_text': '-', 'num_missing': 0, 'num_unique': 8011, 'values': {'-': '-'}, 'notes': 'Used to link utterance data to conversation data.'}
{'variable_name': 'interaction_id', 'variable_label': 'Unique interaction identifier, where an interaction is a turn within a conversation (single human message with multiple model responses)', 'variable_category': 'meta', 'variable_type': 'string id', 'question_text': '-', 'num_missing': 0, 'num_unique': 27172, 'values': {'-': '-'}, 'notes': ''}
{'variable_name': 'utterance_id', 'variable

### Utterance columns

In [35]:
variable_name = "conversation_type"
variable_label = "Type of conversation (from pre-defined categories)"
variable_category = "direct"
variable_type = "categorical"
question_text = """Choose what type of conversation you want to have."""
notes = """Participants pick from the following radio buttons:
Unguided. Ask, request or talk to the model about anything . It is up to you!
Values guided. Ask, request or talk to the model about something important to you or that represents your values . This could be related to work, religion, family and relationship, politics or culture.
Controversy guided. Ask, request or talk to the model about something controversial or where people would disagree in your community, culture or country.
We also provide the additional instruction: Remember if you are here as a paid study participant, you need to do two of each type. If you are here as a volunteer, then take your pick!
"""
return_codebook_entry(
    ENTRIES,
    conversations,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'conversation_type', 'variable_label': 'Type of conversation (from pre-defined categories)', 'variable_category': 'direct', 'variable_type': 'categorical', 'question_text': 'Choose what type of conversation you want to have.', 'num_missing': 0, 'num_unique': 3, 'values': {'unguided': 3113, 'values guided': 2460, 'controversy guided': 2438}, 'notes': 'Participants pick from the following radio buttons:\nUnguided. Ask, request or talk to the model about anything . It is up to you!\nValues guided. Ask, request or talk to the model about something important to you or that represents your values . This could be related to work, religion, family and relationship, politics or culture.\nControversy guided. Ask, request or talk to the model about something controversial or where people would disagree in your community, culture or country.\nWe also provide the additional instruction: Remember if you are here as a paid study participant, you need to do two of each type. If you a

In [36]:
variable_name = "turn"
variable_label = "Turn of conversation when prompt was entered"
variable_category = "meta"
variable_type = "int"
question_text = "-"
notes = "In the paper, we refer to the first turn as T=1. Here, we index the first turn as 0."
return_codebook_entry(
    ENTRIES,
    utterances,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'turn', 'variable_label': 'Turn of conversation when prompt was entered', 'variable_category': 'meta', 'variable_type': 'int', 'question_text': '-', 'num_missing': 0, 'num_unique': 22, 'values': {'mean': 1.2, 'std': 1.6, 'min': 0.0, 'max': 21.0}, 'notes': 'In the paper, we refer to the first turn as T=1. Here, we index the first turn as 0.'}


### Model metadata

In [37]:
variable_name = "model_name"
variable_label = "Name of LLM"
variable_category = "meta"
variable_type = "categorical"
question_text = "-"
notes = """We provide the long name as it appeared on our backend.
We provide a mapping of long names to shorter more familiar names on our Github or in the paper."""
return_codebook_entry(
    ENTRIES,
    utterances,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'model_name', 'variable_label': 'Name of LLM', 'variable_category': 'meta', 'variable_type': 'categorical', 'question_text': '-', 'num_missing': 0, 'num_unique': 21, 'values': {'command': 4812, 'claude-instant-1': 4292, 'models/chat-bison-001': 4168, 'HuggingFaceH4/zephyr-7b-beta': 4133, 'meta-llama/Llama-2-7b-chat-hf': 3995, 'command-light': 3929, 'command-nightly': 3816, 'gpt-4-1106-preview': 3735, 'gpt-4': 3515, 'meta-llama/Llama-2-70b-chat-hf': 3493, 'gpt-3.5-turbo': 3471, 'timdettmers/guanaco-33b-merged': 3468, 'claude-2.1': 3338, 'mistralai/Mistral-7B-Instruct-v0.1': 3261, 'claude-2': 3209, 'tiiuae/falcon-7b-instruct': 2608, 'OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5': 2314, 'meta-llama/Llama-2-13b-chat-hf': 1744, 'luminous-supreme-control': 1722, 'google/flan-t5-xxl': 1715, 'luminous-extended-control': 1633}, 'notes': 'We provide the long name as it appeared on our backend.\nWe provide a mapping of long names to shorter more familiar names on our Github or

In [38]:
variable_name = "model_provider"
variable_label = "Provider of the LLM"
variable_category = "meta"
variable_type = "categorical"
question_text = "-"
notes = """Note for open-access LLMs, HuggingFace API is always listed as the source and does not imply they built the model."""
return_codebook_entry(
    ENTRIES,
    utterances,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'model_provider', 'variable_label': 'Provider of the LLM', 'variable_category': 'meta', 'variable_type': 'categorical', 'question_text': '-', 'num_missing': 0, 'num_unique': 6, 'values': {'huggingface_api': 26731, 'cohere': 12557, 'anthropic': 10839, 'openai': 10721, 'google': 4168, 'aleph': 3355}, 'notes': 'Note for open-access LLMs, HuggingFace API is always listed as the source and does not imply they built the model.'}


### Text columns

In [39]:
variable_name = "user_prompt"
variable_label = "Human-written message."
variable_category = "direct"
variable_type = "string"
question_text = "-"
notes = ""
return_codebook_entry(
    ENTRIES,
    utterances,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'user_prompt', 'variable_label': 'Human-written message.', 'variable_category': 'direct', 'variable_type': 'string', 'question_text': '-', 'num_missing': 0, 'num_unique': 26673, 'values': {'mean chars': 69.9, 'std chars': 62.0, 'min chars': 1.0, 'max chars': 1311.0}, 'notes': ''}


In [40]:
variable_name = "model_response"
variable_label = "Model-generated response"
variable_category = "direct"
variable_type = "string"
question_text = "-"
notes = "An empty string is stored as `EMPTY STRING'."
return_codebook_entry(
    ENTRIES,
    utterances,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'model_response', 'variable_label': 'Model-generated response', 'variable_category': 'direct', 'variable_type': 'string', 'question_text': '-', 'num_missing': 0, 'num_unique': 66614, 'values': {'mean chars': 565.3, 'std chars': 387.9, 'min chars': 1.0, 'max chars': 4630.0}, 'notes': "An empty string is stored as `EMPTY STRING'."}


### Score related columns

In [41]:
variable_name = "score"
variable_label = "Score of the model response"
variable_category = "direct"
variable_type = "int"
question_text = "Rate the model responses. There are no right or wrong answers. Use your subjective judgement."
notes = """Sliders from [Terrible] to [Perfect] are recorded on a 1-100 scale. Participant does not see numeric value."""
return_codebook_entry(
    ENTRIES,
    utterances,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'score', 'variable_label': 'Score of the model response', 'variable_category': 'direct', 'variable_type': 'int', 'question_text': 'Rate the model responses. There are no right or wrong answers. Use your subjective judgement.', 'num_missing': 0, 'num_unique': 100, 'values': {'mean': 65.1, 'std': 29.3, 'min': 1.0, 'max': 100.0}, 'notes': 'Sliders from [Terrible] to [Perfect] are recorded on a 1-100 scale. Participant does not see numeric value.'}


In [42]:
variable_name = "if_chosen"
variable_label = "Whether model response was highest-rated by participant"
variable_category = "constructed"
variable_type = "binary"
question_text = "-"
notes = "In case of a tie, a random response is chosen."
return_codebook_entry(
    ENTRIES,
    utterances,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'if_chosen', 'variable_label': 'Whether model response was highest-rated by participant', 'variable_category': 'constructed', 'variable_type': 'binary', 'question_text': '-', 'num_missing': 0, 'num_unique': 2, 'values': {False: 40934, True: 27437}, 'notes': 'In case of a tie, a random response is chosen.'}


In [43]:
UTTER_ENTRIES = ENTRIES
print(f"There are {len(UTTER_ENTRIES)} entries in the utterances codebook.")

There are 14 entries in the utterances codebook.


## The MetaData

In [44]:
# Start new entry set for metadata
ENTRIES = []

metadata = data_dict["metadata"]
metadata.columns

Index(['column_id', 'user_id', 'conversation_id', 'interaction_id',
       'utterance_id', 'pii_flag', 'pii_manual_flag', 'language_flag',
       'en_flag', 'moderation_flag'],
      dtype='object')

### Base columns

In [45]:
# Base columns
variable_name = "column_id"
variable_label = "Source of text utterance"
variable_category = "meta"
variable_type = "categorical"
question_text = "-"
notes = ""
return_codebook_entry(
    ENTRIES,
    metadata,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)


variable_name = "user_id"
variable_label = "Unique participant identifier"
variable_category = "meta"
variable_type = "string id"
question_text = "-"
notes = "Pseudonymized from Prolific worker ID. Used to link metadata to main data."
return_codebook_entry(
    ENTRIES,
    metadata,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

variable_name = "conversation_id"
variable_label = "Unique conversation identifier"
variable_category = "meta"
variable_type = "string id"
question_text = "-"
notes = "Used to link metadata to main data."
return_codebook_entry(
    ENTRIES,
    metadata,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

variable_name = "interaction_id"
variable_label = "Unique interaction identifier, where an interaction is a turn within a conversation (single human message with multiple model responses)"
variable_category = "meta"
variable_type = "string id"
question_text = "-"
notes = "Used to link metadata to main data."
return_codebook_entry(
    ENTRIES,
    metadata,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

variable_name = "utterance_id"
variable_label = "Unique utterance identifier, where an utterance is a single human message - single model response pair"
variable_category = "meta"
variable_type = "string id"
question_text = "-"
notes = "Used to link metadata to main data."
return_codebook_entry(
    ENTRIES,
    metadata,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'column_id', 'variable_label': 'Source of text utterance', 'variable_category': 'meta', 'variable_type': 'categorical', 'question_text': '-', 'num_missing': 0, 'num_unique': 5, 'values': {'model_response': 68371, 'user_prompt': 27172, 'open_feedback': 8011, 'self_description': 1500, 'system_string': 1500}, 'notes': ''}
{'variable_name': 'user_id', 'variable_label': 'Unique participant identifier', 'variable_category': 'meta', 'variable_type': 'string id', 'question_text': '-', 'num_missing': 0, 'num_unique': 1500, 'values': {'-': '-'}, 'notes': 'Pseudonymized from Prolific worker ID. Used to link metadata to main data.'}
{'variable_name': 'conversation_id', 'variable_label': 'Unique conversation identifier', 'variable_category': 'meta', 'variable_type': 'string id', 'question_text': '-', 'num_missing': 3000, 'num_unique': 8011, 'values': {'-': '-'}, 'notes': 'Used to link metadata to main data.'}
{'variable_name': 'interaction_id', 'variable_label': 'Unique interactio

### Flag columns

In [46]:
variable_name = "pii_flag"
variable_label = "Automated flag for personally identifiable information"
variable_category = "meta"
variable_type = "binary"
question_text = "-"
notes = """Uses scrubadub https://scrubadub.readthedocs.io/en/stable/ to find PII. There may be some misclassifications.
Many of the inspected positives were false positives. All positive human-written texts checked.
See pii_manual_flag."""
return_codebook_entry(
    ENTRIES,
    metadata,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'pii_flag', 'variable_label': 'Automated flag for personally identifiable information', 'variable_category': 'meta', 'variable_type': 'binary', 'question_text': '-', 'num_missing': 0, 'num_unique': 2, 'values': {False: 105443, True: 1111}, 'notes': 'Uses scrubadub https://scrubadub.readthedocs.io/en/stable/ to find PII. There may be some misclassifications.\nMany of the inspected positives were false positives. All positive human-written texts checked.\nSee pii_manual_flag.'}


In [47]:
variable_name = "pii_manual_flag"
variable_label = (
    "Manual verification of personally identifiable information in human-written texts"
)
variable_category = "meta"
variable_type = "binary"
question_text = "-"
notes = """For any automated PII flags, we manually checked the human-written text for PII.
All were false positives so this flag overules the automated flag. We did not check model-generated text for PII.
NaN indicates entry was not manually checked."""
return_codebook_entry(
    ENTRIES,
    metadata,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'pii_manual_flag', 'variable_label': 'Manual verification of personally identifiable information in human-written texts', 'variable_category': 'meta', 'variable_type': 'binary', 'question_text': '-', 'num_missing': 106387, 'num_unique': 1, 'values': {nan: 106387, 0.0: 167}, 'notes': 'For any automated PII flags, we manually checked the human-written text for PII.\nAll were false positives so this flag overules the automated flag. We did not check model-generated text for PII.\nNaN indicates entry was not manually checked.'}


In [48]:
variable_name = "language_flag"
variable_label = "Automated language detection"
variable_category = "meta"
variable_type = "categorical"
question_text = "-"
notes = """Uses langid. There may be some misclassifications."""
return_codebook_entry(
    ENTRIES,
    metadata,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
    show_values=False,
)

{'variable_name': 'language_flag', 'variable_label': 'Automated language detection', 'variable_category': 'meta', 'variable_type': 'categorical', 'question_text': '-', 'num_missing': 0, 'num_unique': 59, 'values': {'Too many values to show': '-'}, 'notes': 'Uses langid. There may be some misclassifications.'}


In [49]:
variable_name = "en_flag"
variable_label = "Whether detected language is English"
variable_category = "meta"
variable_type = "binary"
question_text = "-"
notes = """Constructed based on automated language detection."""
return_codebook_entry(
    ENTRIES,
    metadata,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
    show_values=False,
)

{'variable_name': 'en_flag', 'variable_label': 'Whether detected language is English', 'variable_category': 'meta', 'variable_type': 'binary', 'question_text': '-', 'num_missing': 0, 'num_unique': 2, 'values': {'Too many values to show': '-'}, 'notes': 'Constructed based on automated language detection.'}


In [50]:
variable_name = "moderation_flag"
variable_label = "Automated flag for moderation"
variable_category = "meta"
variable_type = "nested dict"
question_text = "-"
notes = """Uses OpenAI moderation API. There may be some misclassifications.
Nested dictionary with binary flags and probabilities for sub-categories of harm."""
return_codebook_entry(
    ENTRIES,
    metadata,
    variable_name,
    variable_label,
    variable_category,
    variable_type,
    question_text,
    notes,
)

{'variable_name': 'moderation_flag', 'variable_label': 'Automated flag for moderation', 'variable_category': 'meta', 'variable_type': 'nested dict', 'question_text': '-', 'num_missing': '-', 'num_unique': '-', 'values': {'-': '-'}, 'notes': 'Uses OpenAI moderation API. There may be some misclassifications.\nNested dictionary with binary flags and probabilities for sub-categories of harm.'}


In [51]:
META_ENTRIES = ENTRIES
print(f"There are {len(META_ENTRIES)} entries in the utterances codebook.")

There are 10 entries in the utterances codebook.


## Table Functions

In [52]:
def dict_depth(d, level=1):
    if not isinstance(d, dict) or not d:
        return level
    return max(dict_depth(v, level + 1) for k, v in d.items())

In [53]:
# Put into dataframe


def restructure_data(ENTRIES):
    df = pd.DataFrame(ENTRIES)

    display(df.head(3))

    new_rows = []

    for idx, row in df.iterrows():
        # First row entries appended as is
        header_row = ["HEADER"] + row[0:4].to_list()
        assert len(header_row) == 5
        new_rows.append([idx] + header_row)
        idx = ""
        # Now append the question text
        if row["question_text"] != "-":
            text_row = (
                ["TEXT"]
                + ["Question text:"]
                + [row["question_text"]]
                + ["*QEND*"]
                + [""]
            )
            assert len(text_row) == 5
            new_rows.append([idx] + text_row)
        # Now append the overall values
        for col, label in zip(
            ["num_missing", "num_unique"], ["N Missing:", "N Unique:"]
        ):
            if row[col] != "-":
                row_to_add = ["OVERALL"] + [""] + [""] + [label] + [row[col]]
                assert len(row_to_add) == 5
                new_rows.append([idx] + row_to_add)
        # Now append the nested values
        values = row["values"]
        if pd.isna(values) or "-" not in values.keys():
            depth = dict_depth(values)
            if "nested_values" not in values.keys():
                for k, v in values.items():
                    tag = "NESTED"
                    nested_row = [tag] + [""] + [f"*VSTART:*{k}"] + ["*VEND*"] + [v]
                    assert len(nested_row) == 5
                    new_rows.append([idx] + nested_row)
            elif "nested_values" in values.keys():
                list_of_values = values["nested_values"]
                for subitem_dict in list_of_values:
                    variable_name = subitem_dict["variable_name"]
                    variable_label = subitem_dict["variable_label"].replace("**", "")
                    tag = "NESTED HEADER"
                    nested_row = (
                        [tag]
                        + [variable_name]
                        + [f"*VHSTART:*{variable_label}"]
                        + ["*VHEND*"]
                        + [""]
                    )
                    new_rows.append([idx] + nested_row)
                    # Now add other items
                    for k, v in subitem_dict.items():
                        if k in ["variable_name", "variable_label"]:
                            continue
                        else:
                            tag = "NESTED"
                            nested_row = (
                                [tag] + [""] + [f"*VSTART:*{k}"] + ["*VEND*"] + [v]
                            )
                            new_rows.append([idx] + nested_row)

        if (row["notes"] == "") or (row["notes"] == "-"):
            continue
        else:
            notes_row = ["NOTES"] + ["Notes:"] + [row["notes"]] + ["*NEND*"] + [""]
            assert len(notes_row) == 5
            new_rows.append([idx] + notes_row)

    restructured_df = pd.DataFrame(
        new_rows,
        columns=["", "Tag", "VARIABLE", "LABEL", "CATEGORY", "TYPE"],
    )

    display(restructured_df.head(2))

    return restructured_df

In [54]:
# Style
def custom_format(val):
    if isinstance(val, (float)):
        # Format numeric values to two decimal places
        return f"{val:.1f}"
    else:
        # Return non-numeric values unaltered
        return val


# Function to insert custom LaTeX commands for highlighting rows
def midline_rows_latex(df):
    new_rows = []
    for index, row in df.iterrows():
        if row["Tag"] == "HEADER":
            # Inserting midrule before and after the header row
            new_rows.append([""] * 2 + ["MIDRULE"] + [""] * (df.shape[1] - 3))
            new_rows.append(row.values)
            new_rows.append([""] * 2 + ["MIDRULE"] + [""] * (df.shape[1] - 3))
        elif row["Tag"] == "TEXT":
            new_rows.append(row.values)
            new_rows.append([""] * 2 + ["CLINE"] + [""] * (df.shape[1] - 3))
        elif row["Tag"] == "NESTED HEADER":
            new_rows.append([""] * 2 + ["CLINE"] + [""] * (df.shape[1] - 3))
            new_rows.append(row.values)
        elif row["Tag"] == "NOTES":
            new_rows.append(row.values)
            new_rows.append([""] * 2 + ["CLINE"] + [""] * (df.shape[1] - 3))
        else:
            new_rows.append(row.values)
    return pd.DataFrame(new_rows, columns=df.columns)


def style_latex(restructured_df):

    # Apply midline rows
    lines_df = midline_rows_latex(restructured_df)

    # if header, make whole row bold
    def highlight_header(s):
        if s["Tag"] in ["HEADER", "NESTED_HEADER", "OVERALL"]:
            return ["font-weight: bold" for v in s]
        else:
            return ["" for v in s]

    # Apply the custom_format() function only to the Var Type column
    styled_df = lines_df.style.format(custom_format, subset="TYPE")

    # Apply bold styling
    styled_df = styled_df.apply(highlight_header, axis=1)

    # Bold the first row
    styled_df = styled_df.set_properties(
        **{"font-weight": "bold"}, subset=pd.IndexSlice[0, :]
    )

    # Color the cells in the Var Category column
    color_map = {
        "meta": "myred",
        "direct": "myyellow",
        "constructed": "mygreen",
        "string id": "myoat",
        "string": "myoat",
        "categorical": "myoat",
        "float": "myoat",
        "int": "myoat",
        "datetime": "myoat",
        "binary": "myoat",
        "dict": "myoat",
    }
    # Apply the colormap
    styled_df = styled_df.map(
        lambda val: f"background-color: {color_map[val]}" if val in color_map else ""
    )

    # Only display Var Name, Var Label, Var Category, Var Type
    styled_df = styled_df.hide(["Tag"], axis=1)
    styled_df = styled_df.hide(axis=0)

    # Escape latex characters
    styled_df = styled_df.format(escape="latex", subset=["VARIABLE", "LABEL"])

    # Convert to LaTeX
    latex_str = styled_df.to_latex(
        hrules=True,
        convert_css=True,
        environment="longtable",
    )

    # Edit the alignment
    latex_str = latex_str.replace(
        "lllll",
        "p{0.005\\textwidth}p{0.20\\textwidth}p{0.40\\textwidth}p{0.1\\textwidth}p{0.1\\textwidth}",
    )

    # Edit the midrule
    latex_str = "\\fontsize{6pt}{6pt}\\selectfont\n" + latex_str.replace(
        "& MIDRULE &  &  &  \\\\", "\\midrule"
    )

    # Edit the clins
    latex_str = latex_str.replace("CLINE &  &  &  \\\\", "\\cline{1-4}")

    # Replace question text
    latex_str = (
        latex_str.replace(
            "Question text: &",
            "\\multicolumn{4}{p{0.9\\textwidth}}{\\textit{Question text:}",
        )
    ).replace("& *QEND* &", "}")

    # Replace notes text
    latex_str = latex_str.replace(
        "Notes: &", "\\multicolumn{4}{p{0.9\\textwidth}}{\\textit{Notes:"
    ).replace("& *NEND* &", "}}")

    # Replace values text
    latex_str = latex_str.replace("*VSTART:*", "\\multicolumn{2}{r}{").replace(
        "& *VEND*", "}"
    )

    # Replace nested values text
    latex_str = latex_str.replace(
        "*VHSTART:*", "\\multicolumn{3}{p{0.6\\textwidth}}{"
    ).replace("& *VHEND* & ", "}")

    print(latex_str)

## Generate Codebooks

In [55]:
style_latex(restructure_data(SURVEY_ENTRIES))

Unnamed: 0,variable_name,variable_label,variable_category,variable_type,question_text,num_missing,num_unique,values,notes
0,user_id,Unique participant identifier,meta,string id,-,0,1500,{'-': '-'},Pseudonymized from Prolific worker ID. Used to...
1,survey_only,Indicator if participant only completed the su...,meta,binary,-,0,2,"{False: 1396, True: 104}",
2,num_completed_conversations,Number of conversations that a participant com...,meta,int,-,0,8,"{'mean': 5.3, 'std': 1.7, 'min': 0.0, 'max': 7.0}",


Unnamed: 0,Unnamed: 1,Tag,VARIABLE,LABEL,CATEGORY,TYPE
0,0.0,HEADER,user_id,Unique participant identifier,meta,string id
1,,OVERALL,,,N Missing:,0


\fontsize{6pt}{6pt}\selectfont
\begin{longtable}{p{0.005\textwidth}p{0.20\textwidth}p{0.40\textwidth}p{0.1\textwidth}p{0.1\textwidth}}
\toprule
 & VARIABLE & LABEL & CATEGORY & TYPE \\
\midrule
\endfirsthead
\toprule
 & VARIABLE & LABEL & CATEGORY & TYPE \\
\midrule
\endhead
\midrule
\multicolumn{5}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
\bfseries  & \bfseries MIDRULE & \bfseries  & \bfseries  & \bfseries  \\
\bfseries 0 & \bfseries user\_id & \bfseries Unique participant identifier & \bfseries {\cellcolor{myred}} meta & \bfseries {\cellcolor{myoat}} string id \\
 \midrule
\bfseries  & \bfseries  & \bfseries  & \bfseries N Missing: & \bfseries 0 \\
\bfseries  & \bfseries  & \bfseries  & \bfseries N Unique: & \bfseries 1500 \\
 & \multicolumn{4}{p{0.9\textwidth}}{\textit{Notes: Pseudonymized from Prolific worker ID. Used to link survey data to conversation data. In our paper, we refer to `users' as `participants'. }}  \\
 & \cline{1-4}
 \midrule
\bfseri

In [56]:
style_latex(restructure_data(CONVO_ENTRIES))

Unnamed: 0,variable_name,variable_label,variable_category,variable_type,question_text,num_missing,num_unique,values,notes
0,user_id,Unique participant identifier,meta,string id,-,0,1396,{'-': '-'},Pseudonymized from Prolific worker ID. Used to...
1,conversation_id,Unique conversation identifier,meta,string id,-,0,8011,{'-': '-'},
2,included_in_balanced_subset,Indicator if participant's conversations are i...,constructed,binary,-,0,2,"{True: 6696, False: 1315}",Balanced subset was created to equally sample ...


Unnamed: 0,Unnamed: 1,Tag,VARIABLE,LABEL,CATEGORY,TYPE
0,0.0,HEADER,user_id,Unique participant identifier,meta,string id
1,,OVERALL,,,N Missing:,0


\fontsize{6pt}{6pt}\selectfont
\begin{longtable}{p{0.005\textwidth}p{0.20\textwidth}p{0.40\textwidth}p{0.1\textwidth}p{0.1\textwidth}}
\toprule
 & VARIABLE & LABEL & CATEGORY & TYPE \\
\midrule
\endfirsthead
\toprule
 & VARIABLE & LABEL & CATEGORY & TYPE \\
\midrule
\endhead
\midrule
\multicolumn{5}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
\bfseries  & \bfseries MIDRULE & \bfseries  & \bfseries  & \bfseries  \\
\bfseries 0 & \bfseries user\_id & \bfseries Unique participant identifier & \bfseries {\cellcolor{myred}} meta & \bfseries {\cellcolor{myoat}} string id \\
 \midrule
\bfseries  & \bfseries  & \bfseries  & \bfseries N Missing: & \bfseries 0 \\
\bfseries  & \bfseries  & \bfseries  & \bfseries N Unique: & \bfseries 1396 \\
 & \multicolumn{4}{p{0.9\textwidth}}{\textit{Notes: Pseudonymized from Prolific worker ID. Used to link conversation data to survey data. }}  \\
 & \cline{1-4}
 \midrule
\bfseries 1 & \bfseries conversation\_id & \bfseries Unique 

In [57]:
style_latex(restructure_data(UTTER_ENTRIES))

Unnamed: 0,variable_name,variable_label,variable_category,variable_type,question_text,num_missing,num_unique,values,notes
0,user_id,Unique participant identifier,meta,string id,-,0,1396,{'-': '-'},Pseudonymized from Prolific worker ID. Used to...
1,conversation_id,Unique conversation identifier,meta,string id,-,0,8011,{'-': '-'},Used to link utterance data to conversation data.
2,interaction_id,"Unique interaction identifier, where an intera...",meta,string id,-,0,27172,{'-': '-'},


Unnamed: 0,Unnamed: 1,Tag,VARIABLE,LABEL,CATEGORY,TYPE
0,0.0,HEADER,user_id,Unique participant identifier,meta,string id
1,,OVERALL,,,N Missing:,0


\fontsize{6pt}{6pt}\selectfont
\begin{longtable}{p{0.005\textwidth}p{0.20\textwidth}p{0.40\textwidth}p{0.1\textwidth}p{0.1\textwidth}}
\toprule
 & VARIABLE & LABEL & CATEGORY & TYPE \\
\midrule
\endfirsthead
\toprule
 & VARIABLE & LABEL & CATEGORY & TYPE \\
\midrule
\endhead
\midrule
\multicolumn{5}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
\bfseries  & \bfseries MIDRULE & \bfseries  & \bfseries  & \bfseries  \\
\bfseries 0 & \bfseries user\_id & \bfseries Unique participant identifier & \bfseries {\cellcolor{myred}} meta & \bfseries {\cellcolor{myoat}} string id \\
 \midrule
\bfseries  & \bfseries  & \bfseries  & \bfseries N Missing: & \bfseries 0 \\
\bfseries  & \bfseries  & \bfseries  & \bfseries N Unique: & \bfseries 1396 \\
 & \multicolumn{4}{p{0.9\textwidth}}{\textit{Notes: Pseudonymized from Prolific worker ID. Used to link utterance data to survey data. }}  \\
 & \cline{1-4}
 \midrule
\bfseries 1 & \bfseries conversation\_id & \bfseries Unique con

In [58]:
style_latex(restructure_data(META_ENTRIES))

Unnamed: 0,variable_name,variable_label,variable_category,variable_type,question_text,num_missing,num_unique,values,notes
0,column_id,Source of text utterance,meta,categorical,-,0,5,"{'model_response': 68371, 'user_prompt': 27172...",
1,user_id,Unique participant identifier,meta,string id,-,0,1500,{'-': '-'},Pseudonymized from Prolific worker ID. Used to...
2,conversation_id,Unique conversation identifier,meta,string id,-,3000,8011,{'-': '-'},Used to link metadata to main data.


Unnamed: 0,Unnamed: 1,Tag,VARIABLE,LABEL,CATEGORY,TYPE
0,0.0,HEADER,column_id,Source of text utterance,meta,categorical
1,,OVERALL,,,N Missing:,0


\fontsize{6pt}{6pt}\selectfont
\begin{longtable}{p{0.005\textwidth}p{0.20\textwidth}p{0.40\textwidth}p{0.1\textwidth}p{0.1\textwidth}}
\toprule
 & VARIABLE & LABEL & CATEGORY & TYPE \\
\midrule
\endfirsthead
\toprule
 & VARIABLE & LABEL & CATEGORY & TYPE \\
\midrule
\endhead
\midrule
\multicolumn{5}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
\bfseries  & \bfseries MIDRULE & \bfseries  & \bfseries  & \bfseries  \\
\bfseries 0 & \bfseries column\_id & \bfseries Source of text utterance & \bfseries {\cellcolor{myred}} meta & \bfseries {\cellcolor{myoat}} categorical \\
 \midrule
\bfseries  & \bfseries  & \bfseries  & \bfseries N Missing: & \bfseries 0 \\
\bfseries  & \bfseries  & \bfseries  & \bfseries N Unique: & \bfseries 5 \\
 &  & \multicolumn{2}{r}{model\_response } & 68371 \\
 &  & \multicolumn{2}{r}{user\_prompt } & 27172 \\
 &  & \multicolumn{2}{r}{open\_feedback } & 8011 \\
 &  & \multicolumn{2}{r}{self\_description } & 1500 \\
 &  & \multicolumn{2}{