# Analysing results of user studies

## All responses

In [21]:
import re
import pandas as pd
import plotly.express as px
from agreement.utils.kernels import linear_kernel, quadratic_kernel
from agreement.utils.transform import pivot_table_frequency
from agreement.metrics import cohens_kappa, krippendorffs_alpha

H_ORDER = [
    "regular_ReputationVariable", "regular_PersonalityVariable",
    "study_mod_GenderVariable", "study_mod_IncentivesVariable",
    "var_mod_LeadershipVariable", "var_mod_IdentificationVariable"]
TEMPLATES_REGEX = {
    "regular": [
        r"cooperation is significantly (.+) when (.+) is (.+) compared to when (.+) is (.+)\.",
        r"cooperation is significantly (.+) when (.+) is (.+) compared to when (.+) is (.+)"
    ],
    "study_mod": [
        r"when comparing studies where (.+) is (.+) and studies where (.+) is (.+), cooperation is significantly (.+) when (.+) is (.+) compared to when (.+) has another value.",
        r"when comparing studies where (.+) is (.+) and studies where (.+) is (.+), cooperation is significantly (.+) when (.+) is (.+) compared to when (.+) is .+",
        r"when comparing studies where (.+) is (.+) and studies where (.+) is (.+), cooperation is significantly (.+) when (.+) was (.+) compared to when (.+) was .+",
        r"when comparing studies where (.+) is (.+) and studies where (.+) is (.+), cooperation is significantly (.+) when (.+) was (.+) than when (.+) was .+",
        r"when comparing studies where (.+) is (.+) and where (.+) is (.+), cooperation is significantly (.+) when (.+) is (.+) compared to when (.+) has another value.",
        r"when comparing studies where (.+) is (.+) and studies where (.+) is (.+), .+ is (.+) when (.+) is (.+) compared to when (.+) is .+"
    ],
    "var_mod": [
        r"when comparing studies where (.+) is (.+) and studies where (.+) is (.+), cooperation from studies involving (.+) as (.+) is significantly (.+) than cooperation from studies involving (.+) as (.+).",
        r"when comparing studies where (.+) is (.+) and studies where (.+) is (.+), cooperation from (.+) as (.+) is significantly (.+) than cooperation from studies involving (.+) as (.+).",
        r"when comparing studies where (.+) is (.+) and studies where (.+) is (.+), cooperation from studies involving (.+) as (.+) is (.+) than studies involving (.+) as (.+).",
        r"when comparing studies where (.+) is (.+) and studies where (.+) is (.+), .+ is (.+) when (.+) is (.+) compared to when (.+) is (.+)"
    ]
}
TEMPLATES_VAR = {
    "regular": ["comparative", "siv", "sivv1", "siv", "sivv2"],
    "study_mod": ["siv", "sivv1", "siv", "sivv2", "comparative", "mod", "mod_val", "mod"],
    "var_mod": ["siv", "sivv1", "siv", "sivv2", "mod1", "mod", "comparative", "mod2", "mod"],
}
COLS_VAR = ["comparative", "siv", "sivv1", "sivv2", "mod", "mod_val", "mod1", "mod2"]
COLS_ALL_H = ["A", "B", "C", "D", "E"] + [f"H{i}" for i in range(6)]

In [22]:
SAFE_COLORS = px.colors.qualitative.Safe
COLOR_PALETTE = {
    'human': SAFE_COLORS[0],            # blue
    'classification': SAFE_COLORS[1],   # red
    'llm_zero_shot_prompting': SAFE_COLORS[2], # yellow
    'anyburl': SAFE_COLORS[3],           # green,
    'regular': SAFE_COLORS[0],
    'study_mod': SAFE_COLORS[1], 
    'var_mod': SAFE_COLORS[2]
}

In [23]:
all_results = pd.read_csv("../../user_study_responses.csv")
all_results = all_results[[x for x in all_results.columns[1:-8]]]
all_results["user_id"] = all_results.index
all_results.head(3)

Unnamed: 0,"Please formulate the top 5 hypotheses that are the most relevant or interesting to investigate. Use the ""Templated Hypothesis"" as a reference. Each hypothesis should come with a score from 1 to 5, with 1 being the lowest and 5 the best. You should use only some of the concepts that are provided in the tab and that you think are the most relevant.\n\nYour answer should be similar to:\n```output\nA- First hypothesis (score)\nB- Second hypothesis (score)\nC- Third hypothesis (score)\nD- Fourth hypothesis (score)\nE- Fifth hypothesis (score)\n```","Please rank the following 6 hypotheses on a scale of 1 to 5, in terms of relevance or interestingness to investigate. 1 is the worst score and 5 the best. [H1: Cooperation is significantly higher when anonymity manipulation is low compared to when anonymity manipulation is medium.]","Please rank the following 6 hypotheses on a scale of 1 to 5, in terms of relevance or interestingness to investigate. 1 is the worst score and 5 the best. [H2: Cooperation is significantly higher when Knowledge of partner's prior behavior is cooperative compared to when Knowledge of partner's prior behavior is absent.]","Please rank the following 6 hypotheses on a scale of 1 to 5, in terms of relevance or interestingness to investigate. 1 is the worst score and 5 the best. [H3: Cooperation is significantly lower when knowledge of partner's prior behavior is absent compared to when knowledge of partner's prior behavior is present.]","Please rank the following 6 hypotheses on a scale of 1 to 5, in terms of relevance or interestingness to investigate. 1 is the worst score and 5 the best. [H4: Cooperation is significantly lower when partner selection is present compared to when partner selection is absent.]","Please rank the following 6 hypotheses on a scale of 1 to 5, in terms of relevance or interestingness to investigate. 1 is the worst score and 5 the best. [H5: Cooperation is significantly higher when Knowledge of partner's prior behavior is cooperative compared to when Knowledge of partner's prior behavior is noncooperative.]","Please rank the following 6 hypotheses on a scale of 1 to 5, in terms of relevance or interestingness to investigate. 1 is the worst score and 5 the best. [H6: Cooperation is significantly lower when anonymity manipulation is low compared to when anonymity manipulation is medium.]","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [A]","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [B]","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [C]",...,"Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [C].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [D].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [E].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [H1].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [H2].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [H3].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [H4].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [H5].5","Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11). The hypotheses contain the 5 hypotheses you generated, and the 6 generated by AI methods. [H6].5",user_id
0,A- Cooperation is significantly higer when Ano...,3,2,4,5,1,4,7,2,8,...,2,6,4,3,1,9,5,7,8,0
1,A- Cooperation is significantly higher when go...,2,1,4,2,3,4,8,7,1,...,3,5,7,4,8,10,9,11,6,1
2,A-\tCooperation is significantly higher when A...,3,4,1,1,4,1,3,8,2,...,3,4,8,9,10,5,6,11,7,2


## Human-generated hypotheses

### Generic

In [24]:
human_h_start_col = "Please formulate the top 5 hypotheses that are the most relevant or interesting to investigate."
human_h_orig = all_results[[x for x in all_results.columns if x.startswith(human_h_start_col)]]
human_h_orig.columns = H_ORDER
human_h = pd.DataFrame(columns=["hypothesis", "user_id", "th", "m", "giv", "i_shuffled"] + COLS_VAR)

def preprocess_h(text, pattern):
    """ Pre-processing hypotheses (sometimes there is a difference across writings) """
    text = text.replace("\\t", " ")
    text = re.sub(pattern, '', text) \
        .replace(". .", ".").lower().replace("higer", "higher") \
            .replace(" .", ".").replace("’", "'").strip()
    to_replace = [
        ("knowledge of the partner's behavior", "knowledge of partner's prior behavior"),
        ("participant's gender is known", "partner's gender is known"),
        ('game incentives', 'game incentive'),
        ("leader's behaviour", "leader's behavior"),
        ('non-cooperative', 'noncooperative'),
        ('endogenous leadershi', 'endogenous leadership'),
        ('entitativity leve', 'entitativity level'),
        ('group typ', 'group type'),
        ('identification leve', 'identification level'),
        ('knowledge of group membershi', 'knowledge of group membership'),
        ('leadership assignment rul', 'leadership assignment rule'),
        ("leader's assignment rul", 'leadership assignment rule'),
        ("leader's characteristi", "leader's characteristic"),
        ('leadership rol', 'leadership role'),
        ('in-group', 'ingroup'),
        ("academic", "academic discipline"),
        ("academic discipline discipline", "academic discipline")
    ]
    for old, new in to_replace:
        text = text.replace(old, new)
    return text

def format_h(text):
    for letter in ["A", "B", "C", "D", "E"]:
        text = text.replace(f"({letter})", f"{letter}-")
    return text

def update_row(row, val):
    """ Add info to df """
    th = "_".join(val.split("_")[:-1])
    row["th"] = th
    row["m"] = "human"
    row["giv"] = val.split("_")[-1]
    row["i_shuffled"] = str(row["hypothesis"].split("-")[0])
    row["hypothesis"] = "-".join(row["hypothesis"].split("-")[1:]).strip()
    pattern = r'\((\d+)\)'
    score = re.search(pattern, row["hypothesis"])
    if score:
        row["score"] = int(score.group(1))
    else:
        row["score"] = None
    row["hypothesis"] = preprocess_h(row["hypothesis"], pattern)
    
    for regex in TEMPLATES_REGEX[th]:
        matches = list(re.finditer(regex, row["hypothesis"], re.MULTILINE))
        if matches:
            grouped = list(matches)[0].groups()
            vals = {var: grouped[index] for index, var in enumerate(TEMPLATES_VAR[th])}
            for col in COLS_VAR:
                row[col] = vals.get(col, None)
            break

    return row

for val in H_ORDER:
    human_h_orig.loc[:, val] = human_h_orig[val].apply(lambda x: [format_h(y) for y in x.split("\n") if y])
    curr_df = human_h_orig[[val]].explode(val).rename(columns={val: "hypothesis"})
    curr_df["user_id"] = curr_df.index
    curr_df = curr_df.apply(lambda row: update_row(row, val), axis=1)
    human_h = pd.concat([human_h, curr_df])

human_h.reset_index(drop=True).sample(3)

Unnamed: 0,hypothesis,user_id,th,m,giv,i_shuffled,comparative,siv,sivv1,sivv2,mod,mod_val,mod1,mod2,score
86,when comparing studies where game incentive is...,2,study_mod,human,IncentivesVariable,B,higher,game incentive,monetary,hypothetical,one-shot vs repeated,repeated,,,5.0
82,when comparing studies where lottery incentive...,1,study_mod,human,IncentivesVariable,C,lower,lottery incentive,true,false,acquaintance,strangers,,,3.0
57,when comparing studies where partner's gender ...,1,study_mod,human,GenderVariable,C,higher,partner's gender is known,true,false,matching,partner,,,3.0


In [25]:
human_h[human_h.comparative.isna()]

Unnamed: 0,hypothesis,user_id,th,m,giv,i_shuffled,comparative,siv,sivv1,sivv2,mod,mod_val,mod1,mod2,score


In [26]:
human_h[human_h.comparative=="elected"]

Unnamed: 0,hypothesis,user_id,th,m,giv,i_shuffled,comparative,siv,sivv1,sivv2,mod,mod_val,mod1,mod2,score
4,when comparing studies where leadership is tru...,4,var_mod,human,LeadershipVariable,A,elected,leadership,True,False,random appointment,,larger,leadership assignment rulee,5.0


In [27]:
human_h[human_h.comparative=="elected"].hypothesis.values

array(['when comparing studies where leadership is true and studies where leadership is false, the treatment effect [true - false] is larger when leadership assignment rulee is elected compared to when leadership assignment rulee is random appointment'],
      dtype=object)

In [28]:
human_h.comparative.unique()

array(['higher', 'lower', 'smaller', 'larger', 'elected', 'ingroup',
       'cooperative', 'strong', 'rotating', 'common', 'high',
       'natural group'], dtype=object)

In [29]:
mappings = {"smaller": "lower", "larger": "higher"}
no_swap_cols = [col for col in human_h.columns if col not in ["comparative", "mod", "mod1", "mod", "mod2"] + list(mappings.keys())]
human_h["comparative"] = human_h["comparative"].apply(lambda x: mappings.get(x, x))

def swap_row(row):
    if row["comparative"] not in ["higher", "lower"]:
        new_row = {x: row[x] for x in no_swap_cols}
        new_row.update({
            'comparative': row['mod1'],
            'mod': row['mod'],
            'mod1': row['comparative'],
            'mod': row['mod2'],
            'mod2': row['mod']})
        return pd.Series(new_row)
    return row

human_h = human_h.apply(swap_row, axis=1)
human_h.groupby(["user_id", "comparative"]).agg({"hypothesis": "count"}).reset_index()

Unnamed: 0,user_id,comparative,hypothesis
0,0,higher,26
1,0,lower,4
2,1,higher,23
3,1,lower,7
4,2,higher,30
5,3,higher,27
6,3,lower,3
7,4,higher,19
8,4,larger,10
9,4,lower,1


In [30]:
df_comparative = human_h.groupby(["user_id", "comparative"]).agg({"hypothesis": "count"}).reset_index().pivot_table(index='user_id', columns='comparative').fillna(1)
df_comparative.columns = [x[1] for x in df_comparative.columns]
df_comparative["div"] = df_comparative["higher"] / df_comparative["lower"]
df_comparative.reset_index(drop=True)

des = f"""
On average, participants used the comparative ``higher'' {round(df_comparative["div"].mean(), 1)} more times than ``lower'', resulting in {int(df_comparative["higher"].sum()):,}  and {int(df_comparative["lower"].sum()):,} hypotheses with the comparative ``higher'' and ``lower'' respectively.
"""
print(des)


On average, participants used the comparative ``higher'' 13.6 more times than ``lower'', resulting in 125  and 16 hypotheses with the comparative ``higher'' and ``lower'' respectively.



In [31]:
human_h.sample(3)

Unnamed: 0,comparative,giv,hypothesis,i_shuffled,m,mod,mod1,mod2,mod_val,score,siv,sivv1,sivv2,th,user_id
2,higher,IncentivesVariable,when comparing studies where game incentive is...,B,human,one-shot vs repeated,,,repeated,5.0,game incentive,monetary,hypothetical,study_mod,2
1,higher,PersonalityVariable,cooperation is significantly higher when indiv...,E,human,,,,,4.0,individual difference,locus of control,aggression,regular,1
3,higher,LeadershipVariable,when comparing studies where leader's behavior...,3,human,leadership role,leader,follower,,4.0,leader's behavior,cooperative,noncooperative,var_mod,3


In [32]:
print(f"""
On average, participants rated their hypotheses with a score of {round(human_h.groupby("user_id").agg({"score": "mean"}).score.mean(), 1)}.
""")


On average, participants rated their hypotheses with a score of 3.9.



### Analysis of Hypothesis

In [33]:
df_hist = pd.DataFrame(columns=["th", "nb"])
for th in TEMPLATES_VAR.keys():
    vals = human_h[human_h.th == th].groupby(TEMPLATES_VAR[th]).agg({"user_id": "nunique"}).user_id.values
    curr_df = pd.DataFrame({"th": th, "nb": vals}, columns=["th", "nb"])
    df_hist = pd.concat([df_hist, curr_df])

fig = px.histogram(df_hist, x="nb", histnorm="", nbins=human_h.user_id.unique().shape[0], color="th",
                   color_discrete_map=COLOR_PALETTE, opacity=0.75)
fig.update_layout(barmode='group')
fig.write_image("../visualisations/user_studies_common_h.pdf", format='pdf')
fig.show()

In [59]:
fig = px.histogram(
    human_h.groupby(["th", "siv"]).agg({"user_id": "nunique"}).reset_index(),
    x="user_id", histnorm="", nbins=human_h.user_id.unique().shape[0], color="th",
    color_discrete_map=COLOR_PALETTE, opacity=0.75)
fig.update_layout(barmode='group')
fig.write_image("../visualisations/user_studies_common_siv.pdf", format='pdf')
fig.show()

In [58]:
color_palette = px.colors.qualitative.Safe
fig = px.histogram(
    human_h.groupby(["th", "mod"]).agg({"user_id": "nunique"}).reset_index(),
    x="user_id", histnorm="", nbins=human_h.user_id.unique().shape[0], color="th",
    color_discrete_map=COLOR_PALETTE, opacity=0.75)
fig.update_layout(barmode='group')
fig.write_image("../visualisations/user_studies_common_mod.pdf", format='pdf')
fig.show()

In [36]:
human_h.groupby(["siv"]).agg({"hypothesis": "count"}).sort_values(by="hypothesis", ascending=False)

Unnamed: 0_level_0,hypothesis
siv,Unnamed: 1_level_1
game incentive,18
leader's behavior,15
gender,13
svo type,11
partner's group membership,11
knowledge of partner's prior behavior,9
leadership,8
identification levell,7
anonymity manipulation,6
gossip,5


In [37]:
human_h.groupby(["mod"]).agg({"hypothesis": "count"}).sort_values(by="hypothesis", ascending=False)

Unnamed: 0_level_0,hypothesis
mod,Unnamed: 1_level_1
leader's characteristicc,9
sanction,8
partner's group membership,6
discussion,5
matching,5
entitativity levell,5
leader's characteristic,5
academic discipline,4
group typee,4
game incentive,4


## AI-generated hypotheses

In [38]:
ai_h_start_col = "Please rank the following 6 hypotheses on a scale of 1 to 5"
ai_h_orig = all_results[[x for x in all_results.columns if x.startswith(ai_h_start_col)]]
ai_h_orig.columns = [f"{ho}_{i}" for ho in H_ORDER for i in range(6)]

ai_h = pd.DataFrame(columns=["user_id", "score", "th", "giv", "i_shuffled"])

def update_row_ai(row, val):
    th = "_".join(val.split("_")[:-1])
    row["th"] = th
    row["giv"] = val.split("_")[-1]

    return row

def update_row_ai_2(row):
    row["h"] = row["h"].lower()
    th = row["th"]
    for regex in TEMPLATES_REGEX[th]:
        matches = list(re.finditer(regex, row["h"], re.MULTILINE))
        if matches:
            grouped = list(matches)[0].groups()
            vals = {var: grouped[index] for index, var in enumerate(TEMPLATES_VAR[th])}
            for col in COLS_VAR:
                row[col] = vals.get(col, None)
            break

    return row

for ho in H_ORDER:
    for i in range(6):
        curr_df = ai_h_orig[[f"{ho}_{i}"]].rename(columns={f"{ho}_{i}": "score"})
        curr_df["user_id"] = curr_df.index
        curr_df["i_shuffled"] = str(i)
        curr_df = curr_df.apply(lambda row: update_row_ai(row, ho), axis=1)
        ai_h = pd.concat([ai_h, curr_df])
    
ai_h = ai_h.reset_index(drop=True)
ai_h_metadata = pd.read_csv("hypotheses.csv", index_col=0)
ai_h_metadata.i_shuffled = ai_h_metadata.i_shuffled.astype(str)
ai_h = pd.merge(ai_h, ai_h_metadata, on=['th', 'giv', 'i_shuffled'], how='left')
ai_h = ai_h.apply(update_row_ai_2, axis=1).rename(columns={"h": "hypothesis"})
ai_h.sample(3)

Unnamed: 0,comparative,giv,hypothesis,i_orig,i_shuffled,m,mod,mod1,mod2,mod_val,score,siv,sivv1,sivv2,th,user_id
69,lower,GenderVariable,when comparing studies where gender is female ...,1,1,classification,acquaintance,,,acquaintance,2,gender,female,male,study_mod,4
86,higher,GenderVariable,when comparing studies where gender is female ...,4,5,anyburl,has country,,,can,1,gender,female,male,study_mod,1
94,lower,IncentivesVariable,when comparing studies where game incentive is...,4,0,anyburl,has country,,,usa,1,game incentive,monetary,non-monetary material,study_mod,4


In [39]:
df_comparison_siv_mod = pd.DataFrame(columns=["name", "type", "h_nb", "h_perc", "m"])

def helper(x):
    if x[-1] == x[-2] and not x.endswith("fee"):
        return x[:-1]
    return x

for df, m in [(ai_h[ai_h.user_id==0], "ai"), (human_h, "human")]:
    nb = df.shape[0]
    for t in ["siv", "mod"]:
        curr_df = df.groupby([t]).agg({"hypothesis": "count"}).reset_index() \
            .rename(columns={t: "name", "hypothesis": "h_nb"})
        curr_df["name"] = curr_df["name"].apply(helper)
        curr_df["type"] = t
        curr_df["h_perc"] = 100 * curr_df["h_nb"] / nb
        curr_df["m"] = m
        df_comparison_siv_mod = pd.concat([df_comparison_siv_mod, curr_df])
df_comparison_siv_mod


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



Unnamed: 0,name,type,h_nb,h_perc,m
0,anonymity manipulation,siv,2,5.555556,ai
1,game incentive,siv,6,16.666667,ai
2,gender,siv,4,11.111111,ai
3,heterogeneous gender,siv,2,5.555556,ai
4,identification level,siv,2,5.555556,ai
...,...,...,...,...,...
29,recruitment method,mod,3,2.000000,human
30,sanction,mod,8,5.333333,human
31,show-up fee,mod,2,1.333333,human
32,student sample,mod,1,0.666667,human


In [71]:
df_comparison_siv_mod[df_comparison_siv_mod.name=="gender"]

Unnamed: 0,name,type,h_nb,h_perc,m
2,gender,siv,4,11.111111,ai
6,gender,siv,13,8.666667,human


In [40]:
df_comparison_siv_mod_pivot = df_comparison_siv_mod.pivot_table(index=['name', 'type'], columns='m', values='h_perc').reset_index().fillna(0)
df_comparison_siv_mod_pivot["diff"] = abs(df_comparison_siv_mod_pivot["ai"]-df_comparison_siv_mod_pivot["human"])
df_comparison_siv_mod_pivot[(df_comparison_siv_mod_pivot.ai>5) | (df_comparison_siv_mod_pivot.human>5)].sort_values(by="diff")

m,name,type,ai,human,diff
24,identification level,siv,5.555556,4.666667,0.888889
2,anonymity manipulation,siv,5.555556,4.0,1.555556
47,partner's group membership,siv,5.555556,7.333333,1.777778
16,gender,siv,11.111111,8.666667,2.444444
13,game incentive,mod,5.555556,2.666667,2.888889
28,knowledge of group membership,siv,5.555556,2.666667,2.888889
29,knowledge of partner's prior behavior,siv,2.777778,6.0,3.222222
21,heterogeneous gender,siv,5.555556,2.0,3.555556
33,leader's characteristic,mod,8.333333,4.666667,3.666667
55,svo type,siv,11.111111,7.333333,3.777778


In [41]:
ai_h[ai_h.user_id==0].groupby(["siv"]).agg({"hypothesis": "count"}).sort_values(by="hypothesis", ascending=False)

Unnamed: 0_level_0,hypothesis
siv,Unnamed: 1_level_1
game incentive,6
leadership assignment rule,6
gender,4
svo type,4
anonymity manipulation,2
heterogeneous gender,2
identification level,2
knowledge of group membership,2
partner's group membership,2
knowledge of partner's prior behavior,1


In [43]:
ai_h[ai_h.user_id==0].groupby(["mod"]).agg({"hypothesis": "count"}).sort_values(by="hypothesis", ascending=False)

Unnamed: 0_level_0,hypothesis
mod,Unnamed: 1_level_1
partner's group membership,4
source of country or region,4
endogenous leadership,3
leader's characteristic,3
game incentive,2
has country,2
symmetry,2
acquaintance,1
identification level,1
knowledge of group membership,1


In [44]:
data_agreement = ai_h[["th", "giv", "i_shuffled", "m", "score", "user_id"]]
data_agreement["q_id"] = data_agreement["th"] + '_' + data_agreement["giv"] + '_' + \
    data_agreement["i_shuffled"] + '_' + data_agreement["m"]
data_agreement = data_agreement[["q_id", "user_id", "score"]].values

questions_answers_table = pivot_table_frequency(data_agreement[:, 0], data_agreement[:, 2])
users_answers_table = pivot_table_frequency(data_agreement[:, 1], data_agreement[:, 2])

kappa = cohens_kappa(questions_answers_table, users_answers_table)
weighted_kappa = cohens_kappa(questions_answers_table, users_answers_table, weights_kernel=quadratic_kernel)
print(kappa, weighted_kappa)


0.1463670126654604 0.28631741253350035




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [45]:
fig = px.box(ai_h, x='th', y='score', color='m', points='all',
             #labels={'Score': 'Scores', 'Model': 'Models', 'Type of Hypothesis': 'Type of Hypothesis'}
             color_discrete_map=COLOR_PALETTE
             )
fig.update_layout(title='Distribution of Scores by Model and Type of Hypothesis')
fig.write_image("../visualisations/user_studies_ai_h_scores.pdf", format='pdf')
fig.show()

In [46]:
columns = ["user_id", "score", "th", "giv", "i_shuffled", "m"]
fig = px.box(ai_h[columns].groupby(["th", "giv", "i_shuffled", "m"]).agg({"score": "mean"}).reset_index(), x='th', y='score', color='m', points='all',
             #labels={'Score': 'Scores', 'Model': 'Models', 'Type of Hypothesis': 'Type of Hypothesis'}
             color_discrete_map=COLOR_PALETTE
             )
fig.update_layout(title='Distribution of Scores by Model and Type of Hypothesis')
fig.write_image("../visualisations/user_studies_ai_h_scores_mean.pdf", format='pdf')
fig.show()

In [60]:
columns = ["user_id", "score", "th", "giv", "i_shuffled", "m"]
df_score_h_ai_human = pd.concat([human_h[columns], ai_h[columns]])

fig = px.box(df_score_h_ai_human, x='th', y='score', color='m', points='all',
             #labels={'Score': 'Scores', 'Model': 'Models', 'Type of Hypothesis': 'Type of Hypothesis'}
             color_discrete_map=COLOR_PALETTE
             )
fig.update_layout(title='Distribution of Scores by Model and Type of Hypothesis')
fig.write_image("../visualisations/user_studies_ai_human_h_scores.pdf", format='pdf')
fig.show()

In [48]:
ai_h

Unnamed: 0,comparative,giv,hypothesis,i_orig,i_shuffled,m,mod,mod1,mod2,mod_val,score,siv,sivv1,sivv2,th,user_id
0,higher,ReputationVariable,cooperation is significantly higher when anony...,0,0,classification,,,,,3,anonymity manipulation,low,medium,regular,0
1,higher,ReputationVariable,cooperation is significantly higher when anony...,0,0,classification,,,,,2,anonymity manipulation,low,medium,regular,1
2,higher,ReputationVariable,cooperation is significantly higher when anony...,0,0,classification,,,,,3,anonymity manipulation,low,medium,regular,2
3,higher,ReputationVariable,cooperation is significantly higher when anony...,0,0,classification,,,,,3,anonymity manipulation,low,medium,regular,3
4,higher,ReputationVariable,cooperation is significantly higher when anony...,0,0,classification,,,,,3,anonymity manipulation,low,medium,regular,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,higher,IdentificationVariable,when comparing studies where partner's group m...,5,5,anyburl,identification level,high,low,,3,partner's group membership,ingroup,stranger,var_mod,0
176,higher,IdentificationVariable,when comparing studies where partner's group m...,5,5,anyburl,identification level,high,low,,4,partner's group membership,ingroup,stranger,var_mod,1
177,higher,IdentificationVariable,when comparing studies where partner's group m...,5,5,anyburl,identification level,high,low,,5,partner's group membership,ingroup,stranger,var_mod,2
178,higher,IdentificationVariable,when comparing studies where partner's group m...,5,5,anyburl,identification level,high,low,,4,partner's group membership,ingroup,stranger,var_mod,3


In [49]:
ai_h.groupby(["siv"]).agg({"hypothesis": "count"}).sort_values(by="hypothesis", ascending=False)

Unnamed: 0_level_0,hypothesis
siv,Unnamed: 1_level_1
game incentive,30
leadership assignment rule,30
gender,20
svo type,20
anonymity manipulation,10
heterogeneous gender,10
identification level,10
knowledge of group membership,10
partner's group membership,10
knowledge of partner's prior behavior,5


## Comparing human-generated hypotheses and ai-generated hypotheses

In [50]:
comparison_h_start_col = "Please rank all the hypotheses, from the best one (rank = 1) to the worst one (rank = 11)."
comparison_h_orig = all_results[[x for x in all_results.columns if x.startswith(comparison_h_start_col)]]
comparison_h_orig.columns = [f"{ho}_{col}" for ho in H_ORDER for col in COLS_ALL_H]
comparison_h_orig

Unnamed: 0,regular_ReputationVariable_A,regular_ReputationVariable_B,regular_ReputationVariable_C,regular_ReputationVariable_D,regular_ReputationVariable_E,regular_ReputationVariable_H0,regular_ReputationVariable_H1,regular_ReputationVariable_H2,regular_ReputationVariable_H3,regular_ReputationVariable_H4,...,var_mod_IdentificationVariable_B,var_mod_IdentificationVariable_C,var_mod_IdentificationVariable_D,var_mod_IdentificationVariable_E,var_mod_IdentificationVariable_H0,var_mod_IdentificationVariable_H1,var_mod_IdentificationVariable_H2,var_mod_IdentificationVariable_H3,var_mod_IdentificationVariable_H4,var_mod_IdentificationVariable_H5
0,7,2,8,11,5,6,9,3,1,10,...,10,2,6,4,3,1,9,5,7,8
1,8,7,1,5,3,10,11,2,9,6,...,2,3,5,7,4,8,10,9,11,6
2,3,8,2,5,1,6,7,9,10,4,...,2,3,4,8,9,10,5,6,11,7
3,5,1,4,2,7,10,11,8,3,6,...,2,3,4,5,11,6,7,8,9,10
4,9,10,6,7,11,4,2,8,1,5,...,10,9,8,7,5,4,3,6,2,1


In [51]:
comparison_h = pd.DataFrame(columns=["user_id", "rank", "th", "giv", "i_shuffled"])

def update_row_ai(row, val):
    th = "_".join(val.split("_")[:-1])
    row["th"] = th
    row["giv"] = val.split("_")[-1]
    return row

for ho in H_ORDER:
    for i in COLS_ALL_H:
        curr_df = comparison_h_orig[[f"{ho}_{i}"]].rename(columns={f"{ho}_{i}": "rank"})
        curr_df["user_id"] = curr_df.index
        curr_df["i_shuffled"] = i.replace("H", "")
        curr_df = curr_df.apply(lambda row: update_row_ai(row, ho), axis=1)
        comparison_h = pd.concat([comparison_h, curr_df])
    
comparison_h = comparison_h.reset_index(drop=True)
ai_h_metadata.i_shuffled = ai_h_metadata.i_shuffled.astype(str)
comparison_h = pd.merge(comparison_h, ai_h_metadata, on=['th', 'giv', 'i_shuffled'], how='left')
comparison_h["m"] = comparison_h["m"].fillna("human")
comparison_h.sample(3)

Unnamed: 0,user_id,rank,th,giv,i_shuffled,m,i_orig,h
267,2,10,var_mod,LeadershipVariable,4,classification,1.0,When comparing studies where leadership assign...
61,1,4,regular,PersonalityVariable,B,human,,
203,3,9,study_mod,IncentivesVariable,2,anyburl,5.0,When comparing studies where game incentive is...


In [52]:
data_agreement = comparison_h[comparison_h.m!="human"][["th", "giv", "i_shuffled", "m", "rank", "user_id"]]
data_agreement["q_id"] = data_agreement["th"] + '_' + data_agreement["giv"] + '_' + \
    data_agreement["i_shuffled"] + '_' + data_agreement["m"]
data_agreement = data_agreement[["q_id", "user_id", "rank"]].values

questions_answers_table = pivot_table_frequency(data_agreement[:, 0], data_agreement[:, 2])
users_answers_table = pivot_table_frequency(data_agreement[:, 1], data_agreement[:, 2])

kappa = cohens_kappa(questions_answers_table, users_answers_table)
weighted_kappa = cohens_kappa(questions_answers_table, users_answers_table, weights_kernel=quadratic_kernel)
print(kappa, weighted_kappa)


0.0345412882966986 0.015436339243862368


In [53]:
fig = px.box(comparison_h[comparison_h.m!="human"].groupby(["th", "giv", "i_shuffled", "m"]).agg({"rank": "mean"}).reset_index(), x='th', y='rank', color='m', points='all',
             #labels={'Score': 'Scores', 'Model': 'Models', 'Type of Hypothesis': 'Type of Hypothesis'}
             color_discrete_map=COLOR_PALETTE
             )
fig.update_layout(title='Distribution of Scores by Model and Type of Hypothesis')
fig.write_image("../visualisations/user_studies_ai_h_ranks_mean.pdf", format='pdf')
fig.show()

In [54]:
fig = px.box(comparison_h, x='th', y='rank', color='m', points='all',
             #labels={'Score': 'Scores', 'Model': 'Models', 'Type of Hypothesis': 'Type of Hypothesis'}
             color_discrete_map=COLOR_PALETTE
             )
fig.update_layout(title='Distribution of Scores by Model and Type of Hypothesis')
fig.write_image("../visualisations/user_studies_ai_human_h_ranks.pdf", format='pdf')
fig.show()

In [55]:
comparison_h[comparison_h["rank"]<=3].groupby(["m", "user_id"]).agg({"user_id": "count"})

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id
m,user_id,Unnamed: 2_level_1
anyburl,0,2
anyburl,1,2
anyburl,3,3
anyburl,4,8
classification,0,3
classification,1,1
classification,4,2
human,0,10
human,1,15
human,2,18


In [56]:
columns = ['th', 'giv', 'm', 'user_id', 'i_shuffled']
data_table = pd.merge(comparison_h[columns + ['rank']], df_score_h_ai_human[columns + ['score']], on=columns, how='left').groupby(["m", "th"]).agg({"score": "mean", "rank": "mean"}).reset_index().pivot_table(index="m", columns="th")
data_table.columns = ["_".join(x) for x in data_table.columns]
data_table

Unnamed: 0_level_0,rank_regular,rank_study_mod,rank_var_mod,score_regular,score_study_mod,score_var_mod
m,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
anyburl,6.45,7.55,5.7,3.0,1.85,3.1
classification,6.5,7.9,6.65,3.5,1.55,3.35
human,4.98,4.64,5.58,3.777778,3.6,4.05
llm_zero_shot_prompting,7.6,5.95,6.7,2.05,2.75,2.9


In [57]:
def get_data_overleaf(df):
    res = []
    for model in ["human", "classification", "anyburl", "llm_zero_shot_prompting"]:
        curr_res = [model.split("_")[0]]
        curr_df = df[df.index==model]
        for th in ["regular", "study_mod", "var_mod"]:
            for metric in ["score", "rank"]:
                curr_res.append(str(round(curr_df[f"{metric}_{th}"].values[0], 1)))
        res.append(" & ".join(curr_res))
    return "\\\\ \n".join(res) + "\\\\ \n"

print(get_data_overleaf(data_table))

human & 3.8 & 5.0 & 3.6 & 4.6 & 4.0 & 5.6\\ 
classification & 3.5 & 6.5 & 1.6 & 7.9 & 3.4 & 6.7\\ 
anyburl & 3.0 & 6.5 & 1.8 & 7.5 & 3.1 & 5.7\\ 
llm & 2.0 & 7.6 & 2.8 & 6.0 & 2.9 & 6.7\\ 

