# Analysing results of user studies

## All responses

In [315]:
import re
import pandas as pd
import plotly.express as px

# Labels assigned, by position, to the survey's column groups. Each label is
# "<template type>_<variable name>": everything before the last underscore is
# used as the template type ("th"), the last token as the variable ("giv").
H_ORDER = [
    "regular_ReputationVariable", "regular_PersonalityVariable",
    "study_mod_GenderVariable", "study_mod_IncentivesVariable",
    "var_mod_LeadershipVariable", "var_mod_IdentificationVariable"]
# Regex patterns per template type, tried in order; the first pattern that
# matches a (lower-cased, pre-processed) hypothesis is used.
# NOTE(review): the trailing "." of the study_mod / var_mod patterns is not
# escaped, so it matches any character, not only a full stop — confirm whether
# that is intended.
TEMPLATES_REGEX = {
    "regular": [
        r"cooperation is significantly (.+) when (.+) is (.+) compared to when (.+) is (.+)\."
    ],
    "study_mod": [
        r"when comparing studies where (.+) is (.+) and studies where (.+) is (.+), cooperation is significantly (.+) when (.+) is (.+) compared to when (.+) has another value."
    ],
    "var_mod": [
        r"when comparing studies where (.+) is (.+) and studies where (.+) is (.+), cooperation from studies involving (.+) as (.+) is significantly (.+) than cooperation from studies involving (.+) as (.+).",
        r"when comparing studies where (.+) is (.+) and studies where (.+) is (.+), cooperation from (.+) as (.+) is significantly (.+) than cooperation from studies involving (.+) as (.+).",
        r"when comparing studies where (.+) is (.+) and studies where (.+) is (.+), cooperation from studies involving (.+) as (.+) is (.+) than studies involving (.+) as (.+)."
    ]
}
# Column name given to each capture group of the patterns above, in group
# order. A name that appears twice is filled from the later group (the values
# are collected into a dict, so the last occurrence wins).
TEMPLATES_VAR = {
    "regular": ["comparative", "siv", "sivv1", "siv", "sivv2"],
    "study_mod": ["siv", "sivv1", "siv", "sivv2", "comparative", "mod", "mod_val", "mod"],
    "var_mod": ["siv", "sivv1", "siv", "sivv2", "mod1", "mod", "comparative", "mod2", "mod"],
}
# Every variable column written for a matched hypothesis; names that the
# matching template does not define are set to None.
COLS_VAR = ["comparative", "siv", "sivv1", "sivv2", "mod", "mod_val", "mod1", "mod2"]
# Suffixes of the 11 ranked items per section: "A".."E" followed by "H0".."H5".
COLS_ALL_H = ["A", "B", "C", "D", "E"] + [f"H{i}" for i in range(6)]

In [316]:
# Read the survey export and keep only the columns between the first one and
# the last eight (both excluded), selected by position.
all_results = pd.read_csv("../../user_study_responses.csv").iloc[:, 1:-8].copy()
# The row index doubles as the participant identifier.
all_results["user_id"] = all_results.index
all_results.head(3)

Unnamed: 0,"Please formulate the top 5 hypotheses that are the most relevant or interesting to investigate. Use the ""Templated Hypothesis"" as a reference. Each hypothesis should come with a score from 1 to 5, with 1 being the lowest and 5 the best. You should use only some of the concepts that are provided in the tab and that you think are the most relevant.\n\nYour answer should be similar to:\n```output\nA- First hypothesis (score)\nB- Second hypothesis (score)\nC- Third hypothesis (score)\nD- Fourth hypothesis (score)\nE- Fifth hypothesis (score)\n```","Please rank the following 6 hypotheses on a scale of 1 to 5, in terms of relevance or interestingness to investigate. 1 is the worst score and 5 the best. [H1: Cooperation is significantly higher when anonymity manipulation is low compared to when anonymity manipulation is medium.]","Please rank the following 6 hypotheses on a scale of 1 to 5, in terms of relevance or interestingness to investigate. 1 is the worst score and 5 the best. [H2: Cooperation is significantly higher when Knowledge of partner's prior behavior is cooperative compared to when Knowledge of partner's prior behavior is absent.]","Please rank the following 6 hypotheses on a scale of 1 to 5, in terms of relevance or interestingness to investigate. 1 is the worst score and 5 the best. [H3: Cooperation is significantly lower when knowledge of partner's prior behavior is absent compared to when knowledge of partner's prior behavior is present.]","Please rank the following 6 hypotheses on a scale of 1 to 5, in terms of relevance or interestingness to investigate. 1 is the worst score and 5 the best. [H4: Cooperation is significantly lower when partner selection is present compared to when partner selection is absent.]","Please rank the following 6 hypotheses on a scale of 1 to 5, in terms of relevance or interestingness to investigate. 1 is the worst score and 5 the best. 
[H5: Cooperation is significantly higher when Knowledge of partner's prior behavior is cooperative compared to when Knowledge of partner's prior behavior is noncooperative.]","Please rank the following 6 hypotheses on a scale of 1 to 5, in terms of relevance or interestingness to investigate. 1 is the worst score and 5 the best. [H6: Cooperation is significantly lower when anonymity manipulation is low compared to when anonymity manipulation is medium.]","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [A]","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [B]","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [C]",...,"Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [C].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [D].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [E].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [H1].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. 
[H2].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [H3].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [H4].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [H5].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [H6].5",user_id
0,A- Cooperation is significantly higer when Ano...,3,2,4,5,1,4,7,2,8,...,2,6,4,3,1,9,5,7,8,0
1,A- Cooperation is significantly higher when go...,2,1,4,2,3,4,8,7,1,...,3,5,7,4,8,10,9,11,6,1


## Human-generated hypotheses

In [317]:
# Free-text answers: every column whose header starts with the question text.
human_h_start_col = "Please formulate the top 5 hypotheses that are the most relevant or interesting to investigate."
human_h_orig = all_results[
    [col for col in all_results.columns if col.startswith(human_h_start_col)]
].set_axis(H_ORDER, axis=1)
# Empty frame that fixes the leading column order of the parsed hypotheses.
human_h = pd.DataFrame(columns=["hypothesis", "user_id", "th", "giv", "nb", *COLS_VAR])

def preprocess_h(text, pattern):
    """ Normalise a raw hypothesis so that spelling variants compare equal.

    Everything matching `pattern` is removed, the text is lower-cased, stray
    spaces before full stops and a known typo are fixed, typographic
    apostrophes become plain ones, and two known paraphrases are mapped to
    their canonical wording.

    :param text: raw hypothesis string
    :param pattern: regex whose matches are stripped from the text
    :return: the cleaned hypothesis string
    """
    cleaned = re.sub(pattern, '', text)
    cleaned = cleaned.replace(". .", ".").lower()
    # Character-level fixes; applied in this exact order.
    for old, new in (("higer", "higher"), (" .", "."), ("’", "'")):
        cleaned = cleaned.replace(old, new)
    cleaned = cleaned.strip()

    # Wording variants mapped onto the canonical variable names.
    canonical_wording = (
        ("knowledge of the partner's behavior", "knowledge of partner's prior behavior"),
        ("participant's gender is known", "partner's gender is known"),
    )
    for old, new in canonical_wording:
        cleaned = cleaned.replace(old, new)
    return cleaned

def update_row(row, val):
    """ Parse one raw answer line and fill in the derived columns.

    :param row: row holding the raw line in "hypothesis"; modified and returned
    :param val: section label "<template type>_<variable>"
    :return: the same row with th, giv, nb, score, the cleaned hypothesis and,
        when a template matches, the columns listed in COLS_VAR
    """
    # Template type is everything before the last underscore, variable after it.
    th, _sep, giv = val.rpartition("_")
    row["th"] = th
    row["giv"] = giv

    # Lines look like "<label>- <text>": split on the first hyphen only.
    label, _dash, remainder = row["hypothesis"].partition("-")
    row["nb"] = label
    row["hypothesis"] = remainder.strip()

    # The first parenthesised integer is read as the score, then stripped
    # from the text by the pre-processing step.
    pattern = r'\((\d+)\)'
    score = re.search(pattern, row["hypothesis"])
    row["score"] = int(score.group(1)) if score else None
    row["hypothesis"] = preprocess_h(row["hypothesis"], pattern)

    # First template that matches wins; its groups are named via TEMPLATES_VAR.
    for regex in TEMPLATES_REGEX[th]:
        found = re.search(regex, row["hypothesis"], re.MULTILINE)
        if found is None:
            continue
        vals = dict(zip(TEMPLATES_VAR[th], found.groups()))
        for col in COLS_VAR:
            row[col] = vals.get(col, None)
        break

    return row

def _split_lines(answer):
    """ Split a free-text answer into its non-empty lines. """
    return [line for line in answer.split("\n") if line]


# Parse every section into one frame per section, then concatenate once
# (calling pd.concat inside the loop re-copies all previous rows each time).
human_frames = []
for val in H_ORDER:
    # Rebind instead of writing through `.loc` into a frame that was selected
    # from `all_results`; the column ends up holding lists of lines either way.
    human_h_orig = human_h_orig.assign(**{val: human_h_orig[val].apply(_split_lines)})
    # One row per answer line; the exploded index is still the participant id.
    curr_df = human_h_orig[[val]].explode(val).rename(columns={val: "hypothesis"})
    curr_df["user_id"] = curr_df.index
    curr_df = curr_df.apply(lambda row: update_row(row, val), axis=1)
    human_frames.append(curr_df)

# The empty `human_h` created earlier in this cell goes first so that its
# column order is kept.
human_h = pd.concat([human_h, *human_frames])

human_h.reset_index(drop=True).sample(3)

Unnamed: 0,hypothesis,user_id,th,giv,nb,comparative,siv,sivv1,sivv2,mod,mod_val,mod1,mod2,score
18,cooperation is significantly higher when indiv...,1,regular,PersonalityVariable,D,higher,individual difference,collectivism,power,,,,,4.0
3,cooperation is significantly higher when knowl...,0,regular,ReputationVariable,D,higher,knowledge of partner's prior behavior,cooperative,noncooperative,,,,,1.0
39,when comparing studies where show-up fee is tr...,1,study_mod,IncentivesVariable,E,higher,show-up fee,true,false,deception,True,,,2.0


In [318]:
# Hypotheses for which no comparative was extracted.
human_h.loc[human_h["comparative"].isna()]

Unnamed: 0,hypothesis,user_id,th,giv,nb,comparative,siv,sivv1,sivv2,mod,mod_val,mod1,mod2,score


In [319]:
# Number of hypotheses per participant and comparative.
human_h.groupby(["user_id", "comparative"])["hypothesis"].count().reset_index()

Unnamed: 0,user_id,comparative,hypothesis
0,0,higher,26
1,0,lower,4
2,1,higher,23
3,1,lower,7


In [320]:
# One row per participant, one column per comparative, cells = number of
# hypotheses. Counting directly in the pivot replaces the former
# groupby -> pivot(mean) -> column-flattening sequence.
df_comparative = human_h.pivot_table(
    index="user_id", columns="comparative", values="hypothesis", aggfunc="count"
).rename_axis(columns=None)
# Per-participant ratio of "higher" to "lower" hypotheses.
df_comparative["div"] = df_comparative["higher"] / df_comparative["lower"]

des = f"""
On average, participants used the comparative ``higher'' {round(df_comparative["div"].mean(), 1)} more times than ``lower'', resulting in {int(df_comparative["higher"].sum()):,}  and {int(df_comparative["lower"].sum()):,} hypotheses with the comparative ``higher'' and ``lower'' respectively.
"""
print(des)


On average, participants used the comparative ``higher'' 4.9 more times than ``lower'', resulting in 49  and 11 hypotheses with the comparative ``higher'' and ``lower'' respectively.



In [321]:
# Three randomly drawn parsed hypotheses, for a quick visual check.
human_h.sample(n=3)

Unnamed: 0,hypothesis,user_id,th,giv,nb,comparative,siv,sivv1,sivv2,mod,mod_val,mod1,mod2,score
1,when comparing studies where knowledge of grou...,1,var_mod,IdentificationVariable,B,lower,knowledge of group membership,common,unknown,partner's group membership,,ingroup and outgroup,ingroup,5.0
1,when comparing studies where leader's behavior...,1,var_mod,LeadershipVariable,E,lower,leader's behavior,cooperative,non-cooperative,leader's characteristic,,weak,strong,3.0
0,cooperation is significantly higher when partn...,0,regular,ReputationVariable,E,higher,partner selection,present,absent,,,,,4.0


In [322]:
# Average of the per-participant mean scores, rounded to one decimal.
print(f"""
On average, participants rated their hypotheses with a score of {round(human_h.groupby("user_id")["score"].mean().mean(), 1)}.
""")


On average, participants rated their hypotheses with a score of 3.3.



In [331]:
# For each template type, group the hypotheses by their extracted template
# variables and count how many distinct participants wrote each combination.
# Frames are collected and concatenated once instead of growing `df_hist`
# with pd.concat on every iteration.
hist_frames = []
for th, var_names in TEMPLATES_VAR.items():
    vals = human_h[human_h.th == th].groupby(var_names).agg({"user_id": "nunique"}).user_id.values
    hist_frames.append(pd.DataFrame({"th": th, "nb": vals}, columns=["th", "nb"]))
df_hist = pd.concat(hist_frames, ignore_index=True)

color_palette = px.colors.qualitative.Safe
# Bin count is set to the number of distinct participants.
fig = px.histogram(df_hist, x="nb", histnorm="", nbins=human_h.user_id.nunique(), color="th",
                   color_discrete_sequence=color_palette, opacity=0.75)
fig.update_layout(barmode='group')
fig.write_image("../visualisations/user_studies_common_h.pdf", format='pdf')
fig.show()

In [332]:
# Histogram of how many distinct participants used each (template type, siv) pair.
color_palette = px.colors.qualitative.Safe
fig = px.histogram(
    human_h.groupby(["th", "siv"])["user_id"].nunique().reset_index(),
    x="user_id",
    color="th",
    histnorm="",
    nbins=len(human_h.user_id.unique()),
    opacity=0.75,
    color_discrete_sequence=color_palette,
)
fig.update_layout(barmode='group')
fig.write_image("../visualisations/user_studies_common_siv.pdf", format='pdf')
fig.show()

In [333]:
# Histogram of how many distinct participants used each (template type, mod) pair.
color_palette = px.colors.qualitative.Safe
fig = px.histogram(
    human_h.groupby(["th", "mod"])["user_id"].nunique().reset_index(),
    x="user_id",
    color="th",
    histnorm="",
    nbins=len(human_h.user_id.unique()),
    opacity=0.75,
    color_discrete_sequence=color_palette,
)
fig.update_layout(barmode='group')
fig.write_image("../visualisations/user_studies_common_mod.pdf", format='pdf')
fig.show()

## AI-generated hypotheses

In [334]:
# Score columns: every column whose header starts with the rating question,
# relabelled by position as "<section>_<0..5>".
ai_h_start_col = "Please rank the following 6 hypotheses on a scale of 1 to 5"
ai_h_orig = all_results[
    [col for col in all_results.columns if col.startswith(ai_h_start_col)]
].set_axis([f"{ho}_{i}" for ho in H_ORDER for i in range(6)], axis=1)

# Empty frame that fixes the column order of the long-format scores.
ai_h = pd.DataFrame(columns=["user_id", "score", "th", "giv", "i_shuffled"])

def update_row_ai(row, val):
    """ Store the template type and variable encoded in `val` on `row`.

    :param row: mapping-like row; modified in place and returned
    :param val: section label "<template type>_<variable>"
    :return: `row` with "th" (text before the last underscore) and "giv"
        (text after it) set
    """
    prefix, _sep, suffix = val.rpartition("_")
    row["th"] = prefix
    row["giv"] = suffix
    return row

# Reshape the wide score table into long format: one row per
# (participant, section, shuffled hypothesis index).
ai_frames = []
for ho in H_ORDER:
    # th / giv depend only on the section label, so derive them once per
    # section instead of once per row through a row-wise apply.
    section_meta = update_row_ai({}, ho)
    for i in range(6):
        curr_df = ai_h_orig[[f"{ho}_{i}"]].rename(columns={f"{ho}_{i}": "score"})
        curr_df = curr_df.assign(user_id=curr_df.index, i_shuffled=i, **section_meta)
        ai_frames.append(curr_df)

# Single concatenation; the empty `ai_h` created earlier in this cell goes
# first so that its column order is kept.
ai_h = pd.concat([ai_h, *ai_frames]).reset_index(drop=True)
# Attach the per-hypothesis metadata stored in hypotheses.csv; scores without
# a matching metadata row are kept (left join).
ai_h_metadata = pd.read_csv("hypotheses.csv", index_col=0)
ai_h = pd.merge(ai_h, ai_h_metadata, on=['th', 'giv', 'i_shuffled'], how='left')
ai_h.sample(3)

Unnamed: 0,user_id,score,th,giv,i_shuffled,m,i_orig,h
71,1,4,var_mod,IdentificationVariable,5,anyburl,5,When comparing studies where partner's group m...
37,1,4,study_mod,IncentivesVariable,0,anyburl,4,When comparing studies where game incentive is...
47,1,1,study_mod,IncentivesVariable,5,classification,1,When comparing studies where game incentive is...


In [336]:
# Box plot of the scores given to AI-generated hypotheses, per template type
# (x axis) and per value of column "m" (colour), with all points overlaid.
color_palette = px.colors.qualitative.Safe
fig = px.box(
    ai_h,
    x='th',
    y='score',
    color='m',
    points='all',
    color_discrete_sequence=color_palette,
)
fig.update_layout(title='Distribution of Scores by Model and Type of Hypothesis')
fig.write_image("../visualisations/user_studies_ai_h_scores.pdf", format='pdf')
fig.show()

## Comparing human-generated and AI-generated hypotheses

In [337]:
# Rank columns: every column whose header starts with the ranking question,
# relabelled by position as "<section>_<A..E | H0..H5>".
comparison_h_start_col = "Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11)."
comparison_h_orig = all_results[
    [name for name in all_results.columns if name.startswith(comparison_h_start_col)]
].set_axis([f"{ho}_{col}" for ho in H_ORDER for col in COLS_ALL_H], axis=1)
comparison_h_orig

Unnamed: 0,regular_ReputationVariable_A,regular_ReputationVariable_B,regular_ReputationVariable_C,regular_ReputationVariable_D,regular_ReputationVariable_E,regular_ReputationVariable_H0,regular_ReputationVariable_H1,regular_ReputationVariable_H2,regular_ReputationVariable_H3,regular_ReputationVariable_H4,...,var_mod_IdentificationVariable_B,var_mod_IdentificationVariable_C,var_mod_IdentificationVariable_D,var_mod_IdentificationVariable_E,var_mod_IdentificationVariable_H0,var_mod_IdentificationVariable_H1,var_mod_IdentificationVariable_H2,var_mod_IdentificationVariable_H3,var_mod_IdentificationVariable_H4,var_mod_IdentificationVariable_H5
0,7,2,8,11,5,6,9,3,1,10,...,10,2,6,4,3,1,9,5,7,8
1,8,7,1,5,3,10,11,2,9,6,...,2,3,5,7,4,8,10,9,11,6


In [338]:
# Empty frame that fixes the column order of the long-format ranks.
comparison_h = pd.DataFrame(columns=["user_id", "rank", "th", "giv", "i_shuffled"])

# Reshape the wide rank table into long format: one row per
# (participant, section, ranked item). The former second definition of
# `update_row_ai` in this cell duplicated the one from the AI-scores cell and
# is no longer needed: th / giv are constant per section and set column-wise.
comparison_frames = []
for ho in H_ORDER:
    # Template type is everything before the last underscore, variable after it.
    section_parts = ho.rpartition("_")
    th, giv = section_parts[0], section_parts[2]
    for i in COLS_ALL_H:
        curr_df = comparison_h_orig[[f"{ho}_{i}"]].rename(columns={f"{ho}_{i}": "rank"})
        curr_df = curr_df.assign(
            user_id=curr_df.index,
            # "H0".."H5" become "0".."5"; "A".."E" are kept as they are.
            i_shuffled=i.replace("H", ""),
            th=th,
            giv=giv,
        )
        comparison_frames.append(curr_df)

# Single concatenation; the empty template goes first to keep its column order.
comparison_h = pd.concat([comparison_h, *comparison_frames]).reset_index(drop=True)
# i_shuffled is a string here ("A".."E", "0".."5"), so the metadata key is cast
# to str before merging.
ai_h_metadata.i_shuffled = ai_h_metadata.i_shuffled.astype(str)
comparison_h = pd.merge(comparison_h, ai_h_metadata, on=['th', 'giv', 'i_shuffled'], how='left')
# Rows without a metadata match get "human" as their value in column "m".
comparison_h["m"] = comparison_h["m"].fillna("human")
comparison_h.sample(3)

Unnamed: 0,user_id,rank,th,giv,i_shuffled,m,i_orig,h
108,0,5,var_mod,LeadershipVariable,5,llm_zero_shot_prompting,3.0,When comparing studies where Leadership assign...
72,0,5,study_mod,IncentivesVariable,D,human,,
19,1,6,regular,ReputationVariable,4,llm_zero_shot_prompting,2.0,Cooperation is significantly higher when Know...


In [340]:
# Box plot of the ranks given to all hypotheses, per template type (x axis)
# and per value of column "m" (colour), with all points overlaid.
color_palette = px.colors.qualitative.Safe
fig = px.box(
    comparison_h,
    x='th',
    y='rank',
    color='m',
    points='all',
    color_discrete_sequence=color_palette,
)
# The y axis shows ranks, not scores: the title had been copied unchanged
# from the scores figure.
fig.update_layout(title='Distribution of Ranks by Model and Type of Hypothesis')
fig.write_image("../visualisations/user_studies_ai_human_h_ranks.pdf", format='pdf')
fig.show()