# Prepare Human Annotation

This notebook prepares the sheets for in-house secondary human evaluation. Please read the Supplementary Materials of our paper for more details!

In [1]:
import pandas as pd
from src.utils.helper_funcs import find_project_root, ensure_dir_exists, save_as_jsonl
from src.utils.data_loader import unnest_columns
from src.data_processing.lm_label_cleaner import simple_text_clean, get_labels

PROJECT_ROOT = find_project_root()

OUTPUT_PATH = PROJECT_ROOT / "data" / "annotation"

In [2]:
def add_annotation_cols(df, n_annotators=2):
    for i in range(n_annotators):
        df[f"A{i+1}_label"] = None
        df[f"A{i+1}_comment"] = None
    return df


def add_verification_cols(df, n_annotators=2):
    for i in range(n_annotators):
        df[f"A{i+1}_correct"] = None
        df[f"A{i+1}_edit"] = None
        df[f"A{i+1}_comment"] = None
    return df

## Examine Self-Described Labels

### Ethnicity

For some individuals, we have additional data on their ethnicity from Prolific, but only where they filled in this optional screening criteria. We will include that information in our annotation sheets for additional context when creating a simplified annotation category for ethnicity.

In [3]:
# Load survey with prolific details
survey_w_prolific_details = pd.read_csv(
    f"{PROJECT_ROOT}/data/interim/merged_survey.csv"
)
survey_w_prolific_details = survey_w_prolific_details.rename(
    columns={"ethnicity_simplified": "ethnicity_prolific"}
)

# Load prolific id mappings
mapping = pd.read_csv(f"{PROJECT_ROOT}/data/interim/prolific_id_mapping.csv")
survey_w_prolific_details["user_id"] = survey_w_prolific_details["participant_id"].map(
    lambda x: mapping.loc[mapping["prolific_id"] == x, "user_id"].values[0]
)

# Load cleaned survey
survey = pd.read_json(f"{PROJECT_ROOT}/data/survey.jsonl", lines=True)
nested_columns = ["location", "religion", "ethnicity"]
survey = unnest_columns(survey, nested_columns)

# Merge
combined = survey.merge(
    survey_w_prolific_details[["user_id", "ethnicity_prolific"]],
    on="user_id",
    how="left",
)

# Get number of unique ethnicities
print(
    f"There are {combined['ethnicity_self_described'].nunique()} unique self-described ethnicities"
)

# Clean
for col in ["ethnicity_self_described", "ethnicity_prolific"]:
    combined[col] = combined[col].apply(lambda x: simple_text_clean(x))

# Ethnicity data only
eth = combined[
    [
        "user_id",
        "ethnicity_self_described",
        "ethnicity_prolific",
        "ethnicity_gpt4_categorised",
    ]
]

# Group by "ethnicity_self_described", "ethnicity_prolific" and store user_ids as list
grp_eth = eth.groupby(
    ["ethnicity_self_described", "ethnicity_prolific", "ethnicity_gpt4_categorised"]
)["user_id"].apply(list)

# Reset index
grp_eth = grp_eth.reset_index()

# Add n users
grp_eth["n_users"] = grp_eth["user_id"].apply(lambda x: len(x))

# Add annotation columns
grp_eth = add_verification_cols(grp_eth)

display(grp_eth.sample(10, random_state=10))

print(f"There are {len(grp_eth)} rows for manual annotation")

# Export for annotation
grp_eth.to_clipboard()

There are 264 unique self-described ethnicities


Unnamed: 0,ethnicity_self_described,ethnicity_prolific,ethnicity_gpt4_categorised,user_id,n_users,A1_correct,A1_edit,A1_comment,A2_correct,A2_edit,A2_comment
229,nz european,mixed,White,[user382],1,,,,,,
80,caucatian,white,White,[user71],1,,,,,,
326,white/american,white,White,[user1480],1,,,,,,
6,african america,black,Black / African,[user838],1,,,,,,
309,white hungarian,white,White,[user254],1,,,,,,
300,white caucasian australian,white,White,[user533],1,,,,,,
113,european-american,white,White,[user1405],1,,,,,,
217,multiracial,mixed,Mixed,[user987],1,,,,,,
64,british indian,asian,Mixed,[user871],1,,,,,,
142,human,white,Other,[user1445],1,,,,,,


There are 343 rows for manual annotation


### Religion

For Religion, we don't have any Prolific-native data.

In [4]:
# Religion data only
rel = combined[["user_id", "religion_self_described", "religion_gpt4_categorised"]]

# Get number of unique ethnicities
print(
    f"There are {combined['religion_self_described'].nunique()} unique self-described religions"
)

# Group by "religion_self_described" and store user_ids as list
grp_rel = rel.groupby(["religion_self_described", "religion_gpt4_categorised"])[
    "user_id"
].apply(list)

# Reset index
grp_rel = grp_rel.reset_index()

# Add n users
grp_rel["n_users"] = grp_rel["user_id"].apply(lambda x: len(x))

# Add annotation columns
grp_rel = add_verification_cols(grp_rel)

display(grp_rel.sample(10, random_state=10))

# Export for annotation
grp_rel.to_clipboard()


print(f"There are {len(grp_rel)} rows for manual annotation")

There are 137 unique self-described religions


Unnamed: 0,religion_self_described,religion_gpt4_categorised,user_id,n_users,A1_correct,A1_edit,A1_comment,A2_correct,A2_edit,A2_comment
75,gnostic satanist,Other,[user1231],1,,,,,,
133,spirtitual not religious,Spiritual,[user1320],1,,,,,,
41,chrisitian,Christian,[user767],1,,,,,,
102,non practising christian,Christian,[user518],1,,,,,,
129,spiritual but not religious,Spiritual,[user1388],1,,,,,,
70,discordian,Other,[user1169],1,,,,,,
101,no affiliation,Non-religious,"[user4, user9, user10, user11, user12, user13,...",682,,,,,,
59,christian values,Christian,[user394],1,,,,,,
79,humanist,Non-religious,[user451],1,,,,,,
98,methodist,Christian,"[user310, user541, user742, user765, user768, ...",7,,,,,,


There are 137 rows for manual annotation


### Gender

We also had an option to self-describe gender, so we manually annotate these textual responses for very clear cut cases.

In [5]:
# Group by "gender" and store user_ids as list
gen = survey_w_prolific_details[["user_id", "gender"]]
grp_gen = gen.groupby("gender")["user_id"].apply(list)

# Reset index
grp_gen = grp_gen.reset_index()

# Add n users
grp_gen["n_users"] = grp_gen["user_id"].apply(lambda x: len(x))

# Add annotation columns
grp_gen = add_annotation_cols(grp_gen)

# Copy to google sheets for annotation
grp_gen.to_clipboard()

print(f"There are {len(grp_gen)} rows for manual annotation")

There are 7 rows for manual annotation


## Reassigning Labels Post-Annotation

Now we load back in the annotations from the two annotators who independenty rated each row, then met to discuss and resolve disagreements.

In [6]:
relabel_store = {}
for attribute in ["Gender", "Ethnicity", "Religion"]:
    print(f"Loading {attribute} annotation sheet")
    # Load annotation sheet from file
    annotations = pd.read_csv(
        f"{PROJECT_ROOT}/data/interim/Annotation Sheets - {attribute}.csv", index_col=0
    )
    # Check the annotations do indeed now match (after disagreements have been resolved)
    if attribute == "Gender":  # direct annotation, not verification
        assert annotations["A1_label"].equals(annotations["A2_label"])
        annotations["label"] = annotations["A1_label"].map(lambda x: x.strip())
        # Drop the other columns now
        annotations = annotations.drop(columns=["A1_label", "A2_label"])
    else:
        # Check that the verification columns match
        assert annotations["A1_correct"].equals(annotations["A2_correct"])
        assert annotations["A1_edit"].equals(annotations["A2_edit"])
        annotations["correct"] = annotations["A1_correct"].copy()
        annotations["edit"] = annotations["A1_edit"].copy()
        annotations["label"] = annotations.apply(
            lambda row: (
                row[f"{attribute.lower()}_gpt4_categorised"].strip()
                if row["correct"] == 1
                else row["edit"].strip()
            ),
            axis=1,
        )
        # Summarise the number of correct
        print(annotations["correct"].value_counts(normalize=True))
        # Drop the other columns now
        annotations = annotations.drop(
            columns=["A1_correct", "A1_edit", "A2_correct", "A2_edit", "edit"]
        )

    # Create mapping
    mapping = annotations.set_index(annotations.columns[0])["label"].to_dict()

    # Store
    relabel_store[attribute.lower()] = mapping

Loading Gender annotation sheet
Loading Ethnicity annotation sheet
correct
1    0.860058
0    0.139942
Name: proportion, dtype: float64
Loading Religion annotation sheet
correct
1    0.948905
0    0.051095
Name: proportion, dtype: float64


### Reassign to Survey

In [7]:
# Load survey from file to ensure clean version
survey = pd.read_json(f"{PROJECT_ROOT}/data/survey.jsonl", lines=True)

# Unnest
nested_columns = ["religion", "ethnicity"]
survey = unnest_columns(survey, nested_columns)
survey.head(2)

Unnamed: 0,user_id,survey_only,num_completed_conversations,timing_duration_s,timing_duration_mins,generated_datetime,consent,consent_age,lm_familiarity,lm_indirect_use,...,ethnicity,location,lm_usecases,stated_prefs,order_lm_usecases,order_stated_prefs,religion_self_described,religion_gpt4_categorised,ethnicity_self_described,ethnicity_gpt4_categorised
0,user0,False,1,266,4.43,2023-11-22 15:48:46,"Yes, I consent to take part",I certify that I am 18 years of age or over,Somewhat familiar,Yes,...,"{'self_described': 'prefer not to say', 'gpt4_...","{'birth_country': 'Canada', 'birth_countryISO'...","{'homework_assistance': 0, 'research': 0, 'sou...","{'values': 83, 'creativity': 100, 'fluency': 1...","{'homework_assistance': 17, 'research': 6, 'so...","{'values': 6, 'creativity': 3, 'fluency': 1, '...",prefer not to say,Prefer not to say,prefer not to say,Prefer not to say
1,user1,False,6,632,10.53,2023-11-22 15:56:10,"Yes, I consent to take part",I certify that I am 18 years of age or over,Somewhat familiar,Yes,...,"{'self_described': 'caucasian', 'gpt4_categori...","{'birth_country': 'Canada', 'birth_countryISO'...","{'homework_assistance': 0, 'research': 1, 'sou...","{'values': 19, 'creativity': 73, 'fluency': 86...","{'homework_assistance': 11, 'research': 16, 's...","{'values': 4, 'creativity': 5, 'fluency': 1, '...",prefer not to say,Prefer not to say,caucasian,White


In [8]:
print(relabel_store.keys())

dict_keys(['gender', 'ethnicity', 'religion'])


In [9]:
for idx, row in survey.iterrows():
    for attribute in relabel_store:
        mapping = relabel_store[attribute]
        # Produce one gender column
        if attribute == "gender":
            survey.at[idx, f"{attribute}_categorised"] = mapping[row[attribute]]
        else:
            survey.at[idx, f"{attribute}_categorised"] = mapping[
                row[f"{attribute}_self_described"]
            ]

for attribute in relabel_store:
    print(f"\nChecking {attribute} mapping")
    print(survey[f"{attribute}_categorised"].value_counts(normalize=True, dropna=False))


Checking gender mapping
gender_categorised
Male                         0.504667
Female                       0.478667
Non-binary / third gender    0.014000
Prefer not to say            0.002667
Name: proportion, dtype: float64

Checking ethnicity mapping
ethnicity_categorised
White                         0.646000
Black / African               0.081333
Hispanic / Latino             0.080667
Asian                         0.063333
Prefer not to say             0.057333
Mixed                         0.045333
Other                         0.011333
Middle Eastern / Arab         0.009333
Indigenous / First Peoples    0.005333
Name: proportion, dtype: float64

Checking religion mapping
religion_categorised
Non-religious        0.508000
Christian            0.324667
Agnostic             0.047333
Prefer not to say    0.039333
Jewish               0.028000
Muslim               0.020667
Spiritual            0.012000
Buddhist             0.008000
Folk religion        0.004000
Hindu              

### Add further simplified categorisation

In [10]:
# Additional string cleaning for very aggregated categories
ethnicity_mapping = {
    "White": "White",
    "Hispanic / Latino": "Hispanic",
    "Black / African": "Black",
    "Asian": "Asian",
    "Mixed": "Mixed",
    "Other": "Other",
    "Middle Eastern / Arab": "Other",
    "Indigenous / First Peoples": "Other",
    "Prefer not to say": "Prefer not to say",
}

religion_mapping = {
    "Christian": "Christian",
    "Non-religious": "No Affiliation",
    "Agnostic": "No Affiliation",
    "Jewish": "Jewish",
    "Muslim": "Muslim",
    "Spiritual": "No Affiliation",
    "Hindu": "Other",
    "Sikh": "Other",
    "Buddhist": "Other",
    "Folk religion": "Other",
    "Other": "Other",
    "Prefer not to say": "Prefer not to say",
}

simplified_mappings = {"ethnicity": ethnicity_mapping, "religion": religion_mapping}

for attribute in ["ethnicity", "religion"]:
    survey[f"{attribute}_simplified"] = survey[f"{attribute}_categorised"].map(
        lambda x: simplified_mappings[attribute][x]
    )

    # Check on the simplified mappings
    print(f"\nChecking {attribute} simplified mapping")
    print(survey[f"{attribute}_simplified"].value_counts(normalize=True, dropna=False))


Checking ethnicity simplified mapping
ethnicity_simplified
White                0.646000
Black                0.081333
Hispanic             0.080667
Asian                0.063333
Prefer not to say    0.057333
Mixed                0.045333
Other                0.026000
Name: proportion, dtype: float64

Checking religion simplified mapping
religion_simplified
No Affiliation       0.567333
Christian            0.324667
Prefer not to say    0.039333
Jewish               0.028000
Muslim               0.020667
Other                0.020000
Name: proportion, dtype: float64


### Prepare for Release

In [11]:
def reformat_values(row, attribute):
    reformatted = {}
    reformatted["self_described"] = row[f"{attribute}_self_described"]
    reformatted["categorised"] = row[f"{attribute}_categorised"]
    reformatted["simplified"] = row[f"{attribute}_simplified"]

    return reformatted


# For ethnicity and religion, renest the columns into a dictionary
for attribute in ["ethnicity", "religion"]:
    survey[attribute] = survey.apply(
        lambda row: reformat_values(row, attribute), axis=1
    )
    # Drop all the other rows
    cols_to_drop = [c for c in survey.columns if c.startswith(f"{attribute}_")]
    survey = survey.drop(columns=cols_to_drop)

# For gender, replace the column
for attribute in ["gender"]:
    survey[attribute] = survey[f"{attribute}_categorised"].copy()
    survey = survey.drop(columns=[f"{attribute}_categorised"])

### Resave

In [12]:
SAVE_PATH = PROJECT_ROOT / "data"
ensure_dir_exists(SAVE_PATH)
save_as_jsonl(survey, f"{SAVE_PATH}/survey.jsonl", is_already_records=False)