# Analysing results of user studies

## All responses

In [465]:
import re
import pandas as pd
import plotly.express as px

# Names given to the survey questions, in the order their columns appear in the export.
# Each entry is "<template type>_<variable>": the text before the last underscore is the
# key used in TEMPLATES_REGEX / TEMPLATES_VAR, the text after it is stored as `giv`.
H_ORDER = [
    "regular_ReputationVariable", "regular_PersonalityVariable",
    "study_mod_GenderVariable", "study_mod_IncentivesVariable",
    "var_mod_LeadershipVariable", "var_mod_IdentificationVariable"]

# Regexes tried, in order, against the lower-cased, pre-processed hypothesis text.
# The final full stop is escaped in every pattern: an unescaped "." matches any
# character, so a hypothesis written without a closing full stop would still match and
# silently lose the last character of its last captured value. With "\." such a
# hypothesis does not match at all and shows up in the "comparative is NaN" check.
TEMPLATES_REGEX = {
    "regular": [
        r"cooperation is significantly (.+) when (.+) is (.+) compared to when (.+) is (.+)\."
    ],
    "study_mod": [
        r"when comparing studies where (.+) is (.+) and studies where (.+) is (.+), cooperation is significantly (.+) when (.+) is (.+) compared to when (.+) has another value\."
    ],
    "var_mod": [
        r"when comparing studies where (.+) is (.+) and studies where (.+) is (.+), cooperation from studies involving (.+) as (.+) is significantly (.+) than cooperation from studies involving (.+) as (.+)\.",
        r"when comparing studies where (.+) is (.+) and studies where (.+) is (.+), cooperation from (.+) as (.+) is significantly (.+) than cooperation from studies involving (.+) as (.+)\.",
        r"when comparing studies where (.+) is (.+) and studies where (.+) is (.+), cooperation from studies involving (.+) as (.+) is (.+) than studies involving (.+) as (.+)\."
    ]
}

# Column name for each capture group of the patterns above, position by position.
# A name listed twice (e.g. "siv") is filled from its last occurrence.
TEMPLATES_VAR = {
    "regular": ["comparative", "siv", "sivv1", "siv", "sivv2"],
    "study_mod": ["siv", "sivv1", "siv", "sivv2", "comparative", "mod", "mod_val", "mod"],
    "var_mod": ["siv", "sivv1", "siv", "sivv2", "mod1", "mod", "comparative", "mod2", "mod"],
}

# Every template-variable column a parsed hypothesis row can carry.
COLS_VAR = ["comparative", "siv", "sivv1", "sivv2", "mod", "mod_val", "mod1", "mod2"]

# Suffixes of the 11 ranked items per question: A-E followed by H0-H5.
COLS_ALL_H = ["A", "B", "C", "D", "E"] + [f"H{i}" for i in range(6)]

In [466]:
# Read the survey export, keep the columns from the second one up to (but excluding)
# the last eight, and record each respondent's row index as `user_id`.
# NOTE(review): the 1 / -8 offsets are positional; confirm them against the CSV header
# whenever the form changes.
survey_raw = pd.read_csv("../../user_study_responses.csv")
kept_cols = list(survey_raw.columns[1:-8])
all_results = survey_raw[kept_cols].assign(user_id=lambda frame: frame.index)
all_results.head(3)

Unnamed: 0,"Please formulate the top 5 hypotheses that are the most relevant or interesting to investigate. Use the ""Templated Hypothesis"" as a reference. Each hypothesis should come with a score from 1 to 5, with 1 being the lowest and 5 the best. You should use only some of the concepts that are provided in the tab and that you think are the most relevant.\n\nYour answer should be similar to:\n```output\nA- First hypothesis (score)\nB- Second hypothesis (score)\nC- Third hypothesis (score)\nD- Fourth hypothesis (score)\nE- Fifth hypothesis (score)\n```","Please rank the following 6 hypotheses on a scale of 1 to 5, in terms of relevance or interestingness to investigate. 1 is the worst score and 5 the best. [H1: Cooperation is significantly higher when anonymity manipulation is low compared to when anonymity manipulation is medium.]","Please rank the following 6 hypotheses on a scale of 1 to 5, in terms of relevance or interestingness to investigate. 1 is the worst score and 5 the best. [H2: Cooperation is significantly higher when Knowledge of partner's prior behavior is cooperative compared to when Knowledge of partner's prior behavior is absent.]","Please rank the following 6 hypotheses on a scale of 1 to 5, in terms of relevance or interestingness to investigate. 1 is the worst score and 5 the best. [H3: Cooperation is significantly lower when knowledge of partner's prior behavior is absent compared to when knowledge of partner's prior behavior is present.]","Please rank the following 6 hypotheses on a scale of 1 to 5, in terms of relevance or interestingness to investigate. 1 is the worst score and 5 the best. [H4: Cooperation is significantly lower when partner selection is present compared to when partner selection is absent.]","Please rank the following 6 hypotheses on a scale of 1 to 5, in terms of relevance or interestingness to investigate. 1 is the worst score and 5 the best. 
[H5: Cooperation is significantly higher when Knowledge of partner's prior behavior is cooperative compared to when Knowledge of partner's prior behavior is noncooperative.]","Please rank the following 6 hypotheses on a scale of 1 to 5, in terms of relevance or interestingness to investigate. 1 is the worst score and 5 the best. [H6: Cooperation is significantly lower when anonymity manipulation is low compared to when anonymity manipulation is medium.]","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [A]","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [B]","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [C]",...,"Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [C].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [D].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [E].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [H1].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. 
[H2].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [H3].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [H4].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [H5].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [H6].5",user_id
0,A- Cooperation is significantly higer when Ano...,3,2,4,5,1,4,7,2,8,...,2,6,4,3,1,9,5,7,8,0
1,A- Cooperation is significantly higher when go...,2,1,4,2,3,4,8,7,1,...,3,5,7,4,8,10,9,11,6,1


## Human-generated hypotheses

In [467]:
# Select the free-text questions (one column per entry of H_ORDER) and rename them.
human_h_start_col = "Please formulate the top 5 hypotheses that are the most relevant or interesting to investigate."
human_h_cols = [col for col in all_results.columns if col.startswith(human_h_start_col)]
human_h_orig = all_results[human_h_cols]
human_h_orig.columns = H_ORDER

# Empty frame that fixes the column order of the parsed hypotheses appended below.
human_h_columns = ["hypothesis", "user_id", "th", "m", "giv", "i_shuffled"] + COLS_VAR
human_h = pd.DataFrame(columns=human_h_columns)

def preprocess_h(text, pattern):
    """Normalise a free-text hypothesis before it is matched against the templates.

    Every match of `pattern` is removed, the text is lower-cased, the "higer" typo is
    corrected, stray spaces before full stops and curly apostrophes are tidied, the
    result is stripped, and two known alternative wordings are mapped to one form.

    Returns the cleaned string.
    """
    cleaned = re.sub(pattern, '', text)
    cleaned = cleaned.replace(". .", ".")
    cleaned = cleaned.lower()
    cleaned = cleaned.replace("higer", "higher")
    cleaned = cleaned.replace(" .", ".")
    cleaned = cleaned.replace("’", "'")
    cleaned = cleaned.strip()

    # Alternative wordings used by participants -> wording used for comparison.
    rewordings = {
        "knowledge of the partner's behavior": "knowledge of partner's prior behavior",
        "participant's gender is known": "partner's gender is known",
    }
    for variant, canonical in rewordings.items():
        cleaned = cleaned.replace(variant, canonical)
    return cleaned

def update_row(row, val):
    """Parse one participant-written hypothesis line and add its metadata to `row`.

    Parameters
    ----------
    row : mapping with a "hypothesis" entry of the form "<label>- <text> (<score>)".
    val : question name "<template type>_<variable>" (an entry of H_ORDER).

    Sets on `row`: "th" (template type), "m" ("human"), "giv" (variable),
    "i_shuffled" (the label before the first "-"), "score" (integer found in
    parentheses, or None), "hypothesis" (pre-processed text) and every column of
    COLS_VAR (None unless a template of TEMPLATES_REGEX[th] matches).

    Returns the same `row` object. Raises KeyError if the template type is unknown.
    """
    th, _, giv = val.rpartition("_")
    label, _, body = row["hypothesis"].partition("-")
    body = body.strip()

    row["th"] = th
    row["m"] = "human"
    row["giv"] = giv
    # Stripped so that "A -" or " A-" still yields the key "A" used in later merges.
    row["i_shuffled"] = label.strip()

    pattern = r'\((\d+)\)'
    score = re.search(pattern, body)
    row["score"] = int(score.group(1)) if score else None
    row["hypothesis"] = preprocess_h(body, pattern)

    # Always create the template columns so that rows matching no template have the
    # same shape as the others.
    for col in COLS_VAR:
        row[col] = None

    for regex in TEMPLATES_REGEX[th]:
        match = re.search(regex, row["hypothesis"], re.MULTILINE)
        if match:
            # When a name occurs twice in TEMPLATES_VAR[th], the later group wins.
            vals = dict(zip(TEMPLATES_VAR[th], match.groups()))
            for col in COLS_VAR:
                row[col] = vals.get(col)
            break

    return row

# One row per hypothesis line written by a participant, for every question.
# The answers are split without modifying `human_h_orig`, unanswered questions and
# blank lines are skipped, and the per-question frames are concatenated once.
parsed_frames = []
for val in H_ORDER:
    lines = human_h_orig[val].dropna().str.split("\n").explode()
    lines = lines[lines.notna() & (lines != "")]
    curr_df = lines.rename("hypothesis").to_frame()
    curr_df["user_id"] = curr_df.index
    parsed_frames.append(curr_df.apply(lambda row: update_row(row, val), axis=1))

# `human_h` (still empty here) comes first so that it fixes the column order.
human_h = pd.concat([human_h, *parsed_frames])

# Fixed seed so the displayed sample is the same on every run.
human_h.reset_index(drop=True).sample(3, random_state=0)

Unnamed: 0,hypothesis,user_id,th,m,giv,i_shuffled,comparative,siv,sivv1,sivv2,mod,mod_val,mod1,mod2,score
17,cooperation is significantly higher when indiv...,1,regular,human,PersonalityVariable,C,higher,individual difference,pro-environmentalism,narcissism,,,,,3.0
31,when comparing studies where game incentive is...,0,study_mod,human,IncentivesVariable,B,higher,game incentive,non-monetary social,hypothetical,acquaintance,acquaintance,,,5.0
45,when comparing studies where leadership is tru...,1,var_mod,human,LeadershipVariable,A,higher,leadership,true,false,endogenous leadership,,endogenous,exogenous,4.0


In [468]:
# Hypotheses whose `comparative` is missing (expected to be empty).
human_h.loc[human_h["comparative"].isna()]

Unnamed: 0,hypothesis,user_id,th,m,giv,i_shuffled,comparative,siv,sivv1,sivv2,mod,mod_val,mod1,mod2,score


In [469]:
# Number of hypotheses per participant and comparative.
human_h.groupby(["user_id", "comparative"])["hypothesis"].count().reset_index()

Unnamed: 0,user_id,comparative,hypothesis
0,0,higher,26
1,0,lower,4
2,1,higher,23
3,1,lower,7


In [470]:
# Per participant: how many hypotheses use each comparative, and the higher/lower ratio.
comparative_counts = (
    human_h
    .groupby(["user_id", "comparative"])
    .agg({"hypothesis": "count"})
    .reset_index()
)
df_comparative = comparative_counts.pivot_table(index='user_id', columns='comparative')
# Keep only the comparative label of each (value, comparative) column pair.
df_comparative.columns = [pair[1] for pair in df_comparative.columns]
df_comparative["div"] = df_comparative["higher"] / df_comparative["lower"]

mean_ratio = round(df_comparative["div"].mean(), 1)
n_higher = int(df_comparative["higher"].sum())
n_lower = int(df_comparative["lower"].sum())

des = f"""
On average, participants used the comparative ``higher'' {mean_ratio} more times than ``lower'', resulting in {n_higher:,}  and {n_lower:,} hypotheses with the comparative ``higher'' and ``lower'' respectively.
"""
print(des)


On average, participants used the comparative ``higher'' 4.9 more times than ``lower'', resulting in 49  and 11 hypotheses with the comparative ``higher'' and ``lower'' respectively.



In [471]:
# Fixed seed so the displayed sample is the same on every run.
human_h.sample(3, random_state=0)

Unnamed: 0,hypothesis,user_id,th,m,giv,i_shuffled,comparative,siv,sivv1,sivv2,mod,mod_val,mod1,mod2,score
0,when comparing studies where identification le...,0,var_mod,human,IdentificationVariable,A,higher,identification level,low,high,group type,,natural group,experimentally induced group,1.0
0,when comparing studies where leadership is tru...,0,var_mod,human,LeadershipVariable,B,higher,leadership,true,false,leader's characteristic,,strong,weak,3.0
1,when comparing studies where leadership is tru...,1,var_mod,human,LeadershipVariable,A,higher,leadership,true,false,endogenous leadership,,endogenous,exogenous,4.0


In [472]:
# Mean of the per-participant mean self-assigned score.
mean_score_per_user = human_h.groupby("user_id").agg({"score": "mean"}).score
print(f"""
On average, participants rated their hypotheses with a score of {round(mean_score_per_user.mean(), 1)}.
""")


On average, participants rated their hypotheses with a score of 3.3.



In [473]:
# For each template type, count how many distinct participants wrote each distinct
# hypothesis (same values for all template variables), then plot the distribution.
hist_frames = []
for th, template_vars in TEMPLATES_VAR.items():
    # Template variables repeat some names; grouping once per name is enough.
    group_cols = list(dict.fromkeys(template_vars))
    vals = (
        human_h[human_h.th == th]
        .groupby(group_cols)
        .agg({"user_id": "nunique"})
        .user_id.values
    )
    hist_frames.append(pd.DataFrame({"th": th, "nb": vals}, columns=["th", "nb"]))
# Single concatenation of non-empty frames keeps `nb` numeric.
df_hist = pd.concat(hist_frames, ignore_index=True)

color_palette = px.colors.qualitative.Safe
fig = px.histogram(df_hist, x="nb", histnorm="", nbins=human_h.user_id.unique().shape[0], color="th",
                   color_discrete_sequence=color_palette, opacity=0.75)
fig.update_layout(barmode='group')
fig.write_image("../visualisations/user_studies_common_h.pdf", format='pdf')
fig.show()

In [474]:
# Distribution of the number of distinct participants per (template type, siv) pair.
color_palette = px.colors.qualitative.Safe
siv_user_counts = human_h.groupby(["th", "siv"]).agg({"user_id": "nunique"}).reset_index()
n_users = human_h.user_id.unique().shape[0]
fig = px.histogram(
    siv_user_counts, x="user_id", histnorm="", nbins=n_users, color="th",
    color_discrete_sequence=color_palette, opacity=0.75,
).update_layout(barmode='group')
fig.write_image("../visualisations/user_studies_common_siv.pdf", format='pdf')
fig.show()

In [475]:
# Distribution of the number of distinct participants per (template type, mod) pair.
color_palette = px.colors.qualitative.Safe
mod_user_counts = human_h.groupby(["th", "mod"]).agg({"user_id": "nunique"}).reset_index()
n_users = human_h.user_id.unique().shape[0]
fig = px.histogram(
    mod_user_counts, x="user_id", histnorm="", nbins=n_users, color="th",
    color_discrete_sequence=color_palette, opacity=0.75,
).update_layout(barmode='group')
fig.write_image("../visualisations/user_studies_common_mod.pdf", format='pdf')
fig.show()

## AI-generated hypotheses

In [477]:
# Select the 1-5 rating questions and rename them "<H_ORDER entry>_<0..5>".
ai_h_start_col = "Please rank the following 6 hypotheses on a scale of 1 to 5"
ai_score_cols = [col for col in all_results.columns if col.startswith(ai_h_start_col)]
ai_h_orig = all_results[ai_score_cols]
ai_h_orig.columns = [f"{ho}_{i}" for ho in H_ORDER for i in range(6)]

# Empty frame that fixes the column order of the long-format scores built below.
ai_h_columns = ["user_id", "score", "th", "giv", "i_shuffled"]
ai_h = pd.DataFrame(columns=ai_h_columns)

def update_row_ai(row, val):
    """Split `val` at its last underscore and store both parts on `row`.

    The text before the last underscore goes to row["th"], the text after it to
    row["giv"]. Returns the same `row` object.
    """
    template_type, _, variable = val.rpartition("_")
    row["th"] = template_type
    row["giv"] = variable
    return row

# Reshape the wide rating columns into one row per (participant, question, hypothesis).
# Constant columns are assigned directly and the frames are concatenated once.
score_frames = []
for ho in H_ORDER:
    th, _, giv = ho.rpartition("_")
    for i in range(6):
        curr_df = ai_h_orig[[f"{ho}_{i}"]].rename(columns={f"{ho}_{i}": "score"})
        score_frames.append(
            curr_df.assign(user_id=curr_df.index, i_shuffled=str(i), th=th, giv=giv)
        )

# `ai_h` (still empty here) comes first so that it fixes the column order.
ai_h = pd.concat([ai_h, *score_frames]).reset_index(drop=True)

# Attach the metadata of each AI hypothesis (read from hypotheses.csv).
ai_h_metadata = pd.read_csv("hypotheses.csv", index_col=0)
# String keys, to match the string `i_shuffled` built above.
ai_h_metadata["i_shuffled"] = ai_h_metadata["i_shuffled"].astype(str)
ai_h = pd.merge(ai_h, ai_h_metadata, on=['th', 'giv', 'i_shuffled'], how='left')

# Fixed seed so the displayed sample is the same on every run.
ai_h.sample(3, random_state=0)

Unnamed: 0,user_id,score,th,giv,i_shuffled,m,i_orig,h
62,0,5,var_mod,IdentificationVariable,1,llm_zero_shot_prompting,3,When comparing studies where Partner's group m...
11,1,4,regular,ReputationVariable,5,anyburl,5,Cooperation is significantly lower when anonym...
60,0,4,var_mod,IdentificationVariable,0,anyburl,4,When comparing studies where identification le...


In [478]:
# Box plot of the participants' scores for the AI hypotheses, per template type and model.
color_palette = px.colors.qualitative.Safe
box_options = dict(
    x='th', y='score', color='m', points='all',
    color_discrete_sequence=color_palette,
)
fig = px.box(ai_h, **box_options).update_layout(
    title='Distribution of Scores by Model and Type of Hypothesis')
fig.write_image("../visualisations/user_studies_ai_h_scores.pdf", format='pdf')
fig.show()

In [479]:
# Stack human and AI scores on their shared columns and plot them together.
color_palette = px.colors.qualitative.Safe
columns = ["user_id", "score", "th", "giv", "i_shuffled", "m"]
df_score_h_ai_human = pd.concat([frame[columns] for frame in (human_h, ai_h)])

box_options = dict(
    x='th', y='score', color='m', points='all',
    color_discrete_sequence=color_palette,
)
fig = px.box(df_score_h_ai_human, **box_options).update_layout(
    title='Distribution of Scores by Model and Type of Hypothesis')
fig.write_image("../visualisations/user_studies_ai_human_h_scores.pdf", format='pdf')
fig.show()

## Comparing human-generated and AI-generated hypotheses

In [480]:
# Select the 1-11 ranking questions and rename them "<H_ORDER entry>_<A..E|H0..H5>".
comparison_h_start_col = "Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11)."
ranking_cols = [col for col in all_results.columns if col.startswith(comparison_h_start_col)]
comparison_h_orig = all_results[ranking_cols]
comparison_h_orig.columns = [f"{ho}_{col}" for ho in H_ORDER for col in COLS_ALL_H]
comparison_h_orig

Unnamed: 0,regular_ReputationVariable_A,regular_ReputationVariable_B,regular_ReputationVariable_C,regular_ReputationVariable_D,regular_ReputationVariable_E,regular_ReputationVariable_H0,regular_ReputationVariable_H1,regular_ReputationVariable_H2,regular_ReputationVariable_H3,regular_ReputationVariable_H4,...,var_mod_IdentificationVariable_B,var_mod_IdentificationVariable_C,var_mod_IdentificationVariable_D,var_mod_IdentificationVariable_E,var_mod_IdentificationVariable_H0,var_mod_IdentificationVariable_H1,var_mod_IdentificationVariable_H2,var_mod_IdentificationVariable_H3,var_mod_IdentificationVariable_H4,var_mod_IdentificationVariable_H5
0,7,2,8,11,5,6,9,3,1,10,...,10,2,6,4,3,1,9,5,7,8
1,8,7,1,5,3,10,11,2,9,6,...,2,3,5,7,4,8,10,9,11,6


In [481]:
# Empty frame that fixes the column order of the long-format ranks appended below.
comparison_h = pd.DataFrame(columns=["user_id", "rank", "th", "giv", "i_shuffled"])

def update_row_ai(row, val):
    """Split `val` at its last underscore and store both parts on `row`.

    The text before the last underscore goes to row["th"], the text after it to
    row["giv"]. Returns the same `row` object.
    """
    template_type, _, variable = val.rpartition("_")
    row["th"] = template_type
    row["giv"] = variable
    return row

# Reshape the wide ranking columns into one row per (participant, question, item).
# Constant columns are assigned directly and the frames are concatenated once.
rank_frames = []
for ho in H_ORDER:
    th, _, giv = ho.rpartition("_")
    for label in COLS_ALL_H:
        col = f"{ho}_{label}"
        curr_df = comparison_h_orig[[col]].rename(columns={col: "rank"})
        # "H3" -> "3" to match the metadata keys; "A".."E" are kept as they are.
        rank_frames.append(
            curr_df.assign(
                user_id=curr_df.index, i_shuffled=label.replace("H", ""), th=th, giv=giv
            )
        )

# `comparison_h` (still empty here) comes first so that it fixes the column order.
comparison_h = pd.concat([comparison_h, *rank_frames]).reset_index(drop=True)

# NOTE: relies on `ai_h_metadata` loaded in the AI-generated hypotheses section.
ai_h_metadata["i_shuffled"] = ai_h_metadata["i_shuffled"].astype(str)
comparison_h = pd.merge(comparison_h, ai_h_metadata, on=['th', 'giv', 'i_shuffled'], how='left')
# Items without metadata are the participant's own hypotheses (A-E).
comparison_h["m"] = comparison_h["m"].fillna("human")

# Fixed seed so the displayed sample is the same on every run.
comparison_h.sample(3, random_state=0)

Unnamed: 0,user_id,rank,th,giv,i_shuffled,m,i_orig,h
128,0,7,var_mod,IdentificationVariable,4,llm_zero_shot_prompting,2.0,When comparing studies where Knowledge of grou...
56,0,7,study_mod,GenderVariable,1,classification,1.0,When comparing studies where gender is female ...
27,1,3,regular,PersonalityVariable,C,human,,


In [482]:
# Box plot of the 1-11 ranks per template type and model (human included).
color_palette = px.colors.qualitative.Safe
fig = px.box(comparison_h, x='th', y='rank', color='m', points='all',
             #labels={'Score': 'Scores', 'Model': 'Models', 'Type of Hypothesis': 'Type of Hypothesis'}
             color_discrete_sequence=color_palette
             )
# The y axis is `rank`, so the title says "Ranks" (it previously said "Scores").
fig.update_layout(title='Distribution of Ranks by Model and Type of Hypothesis')
fig.write_image("../visualisations/user_studies_ai_human_h_ranks.pdf", format='pdf')
fig.show()

In [499]:
# Mean score and mean rank per model and template type, one row per model.
columns = ['th', 'giv', 'm', 'user_id', 'i_shuffled']
ranks = comparison_h[columns + ['rank']]
scores = df_score_h_ai_human[columns + ['score']]
mean_by_model_th = (
    pd.merge(ranks, scores, on=columns, how='left')
    .groupby(["m", "th"])
    .agg({"score": "mean", "rank": "mean"})
    .reset_index()
)
data_table = mean_by_model_th.pivot_table(index="m", columns="th")
# Flatten the (metric, template type) column pairs into "<metric>_<template type>".
data_table.columns = ["_".join(pair) for pair in data_table.columns]
data_table

Unnamed: 0_level_0,rank_regular,rank_study_mod,rank_var_mod,score_regular,score_study_mod,score_var_mod
m,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
anyburl,6.375,8.25,6.125,3.125,1.875,3.125
classification,5.375,9.125,7.375,3.25,1.25,3.0
human,4.9,4.1,4.8,3.15,3.2,3.5
llm_zero_shot_prompting,9.0,5.375,7.5,1.875,3.375,2.875


In [505]:
def get_data_overleaf(df):
    """Format the per-model means as the body rows of a LaTeX table.

    `df` is indexed by model name and has "<metric>_<template type>" columns for the
    metrics "score" and "rank" and the template types "regular", "study_mod" and
    "var_mod". Each output row holds the model name up to its first underscore,
    followed by score and rank (rounded to one decimal) for each template type.

    Returns the rows as one string, each terminated by "\\\\ " and a newline.
    """
    models = ("human", "classification", "anyburl", "llm_zero_shot_prompting")
    column_order = [
        f"{metric}_{th}"
        for th in ("regular", "study_mod", "var_mod")
        for metric in ("score", "rank")
    ]

    lines = []
    for model in models:
        model_rows = df[df.index == model]
        cells = [model.split("_")[0]]
        cells += [str(round(model_rows[col].values[0], 1)) for col in column_order]
        lines.append(" & ".join(cells))
    return "".join(line + "\\\\ \n" for line in lines)

# Print the table rows as plain text so they can be pasted into the LaTeX source.
print(get_data_overleaf(data_table))

human & 3.1 & 4.9 & 3.2 & 4.1 & 3.5 & 4.8\\ 
classification & 3.2 & 5.4 & 1.2 & 9.1 & 3.0 & 7.4\\ 
anyburl & 3.1 & 6.4 & 1.9 & 8.2 & 3.1 & 6.1\\ 
llm & 1.9 & 9.0 & 3.4 & 5.4 & 2.9 & 7.5\\ 



In [403]:
# Debug check: same aggregation as `data_table`, but with an inner join so that only
# rows present in both the rank and the score tables contribute.
# NOTE(review): this cell's execution count (403) is older than the cells above it and
# its stored output lists only the "human" rows; re-run it or remove it.
columns = ['th', 'giv', 'm', 'user_id', 'i_shuffled']
ranks = comparison_h[columns + ['rank']]
scores = df_score_h_ai_human[columns + ['score']]
pd.merge(ranks, scores, on=columns, how='inner').groupby(["m", "th"]).agg({"score": "mean", "rank": "mean"})

Unnamed: 0_level_0,Unnamed: 1_level_0,score,rank
m,th,Unnamed: 2_level_1,Unnamed: 3_level_1
human,regular,3.15,4.9
human,study_mod,3.2,4.1
human,var_mod,3.5,4.8


In [400]:
# Debug view of the score table used in the score/rank merges.
# NOTE(review): uses the global `columns` set by an earlier cell, and this cell's
# execution count (400) is out of sequence; re-run it or remove it.
df_score_h_ai_human[columns + ['score']]

Unnamed: 0,th,giv,m,user_id,i_shuffled,score
0,regular,ReputationVariable,human,0,A,3.0
0,regular,ReputationVariable,human,0,B,5.0
0,regular,ReputationVariable,human,0,C,2.0
0,regular,ReputationVariable,human,0,D,1.0
0,regular,ReputationVariable,human,0,E,4.0
...,...,...,...,...,...,...
67,var_mod,IdentificationVariable,classification,1,3,3
68,var_mod,IdentificationVariable,llm_zero_shot_prompting,0,4,3
69,var_mod,IdentificationVariable,llm_zero_shot_prompting,1,4,2
70,var_mod,IdentificationVariable,anyburl,0,5,3


In [401]:
# Debug check: rows of the inner score/rank merge that still lack a score.
# NOTE(review): uses the global `columns` set by an earlier cell, and this cell's
# execution count (401) is out of sequence; re-run it or remove it.
ranks = comparison_h[columns + ['rank']]
scores = df_score_h_ai_human[columns + ['score']]
test = pd.merge(ranks, scores, on=columns, how='inner')
test.loc[test["score"].isna()]

Unnamed: 0,th,giv,m,user_id,i_shuffled,rank,score
