# Choosing User Studies GIV 

Based on quantitative criteria

In [2]:
import os
import random
import textwrap
import pandas as pd
import plotly.express as px
from IPython.display import display

## Parameters 
Only change the `HYPOTHESIS` variable, can be one of the followings:
[`regular`, `study_mod`, `variable_mod`]

In [40]:
HYPOTHESIS = "regular"
ES = "d"
METHODS = ["classification", "llm", "anyburl"]
FOLDERS = {
    "classification": f"./classification/final/h_{HYPOTHESIS}_es_{ES}/outputs",
    "llm": f"./llm_zero_shot_prompting/h_{HYPOTHESIS}_es_{ES}/outputs",
    "anyburl": f"./anyburl/final/h_{HYPOTHESIS}_es_{ES}/outputs"
}
FOLDERS

{'classification': './classification/final/h_regular_es_d/outputs',
 'llm': './llm_zero_shot_prompting/h_regular_es_d/outputs',
 'anyburl': './anyburl/final/h_regular_es_d/outputs'}

## GIV where there is at least 5 hypotheses for each of the 3 methods

In [41]:
def get_nb_h(method, folder):
    """ Get nb of hypotheses per method per giv """
    files = [x for x in os.listdir(folder) if x.endswith(".csv")]
    res = []
    for f in files:
        data = pd.read_csv(os.path.join(folder, f), index_col=0)
        res.append((method, f.replace(".csv", "").replace("_csv", ""), data.shape[0]))
    return pd.DataFrame(res, columns=["method", "giv", "nb"])

info_h = pd.concat([get_nb_h(m, f) for m, f in FOLDERS.items()])
info_h.head(5)

Unnamed: 0,method,giv,nb
0,classification,Game_DurationVariable,1
1,classification,MatchingVariable,3
2,classification,EducationVariable,2
3,classification,AcquaintanceVariable,2
4,classification,ChoicesVariable,1


In [42]:
fig = px.histogram(info_h, x="nb", color="method", nbins=50)
fig.show()

In [43]:
info_giv_method = info_h[info_h.nb >= 5].groupby("giv").agg({"method": "nunique", "nb": ["min", "mean", "max"]}).reset_index()
info_giv_method.columns = ["_".join(x) for x in info_giv_method.columns]
print(f"Total method+h: {info_h.shape[0]}")
print(f"Total method+h (# h >= 5): {info_h[info_h.nb >= 5].shape[0]}")
print(f"Total giv (# h >= 5 | # m = 3): {info_giv_method[info_giv_method.method_nunique == 3].shape[0]}")
display(info_giv_method[info_giv_method.method_nunique == 3])
# fig = px.histogram(info_giv_method, x="method_nunique", color="giv_", nbins=3)
# fig.show()

Total method+h: 153
Total method+h (# h >= 5): 60
Total giv (# h >= 5 | # m = 3): 12


Unnamed: 0,giv_,method_nunique,nb_min,nb_mean,nb_max
5,FeedbackVariable,3,5,9.666667,12
6,FramingVariable,3,5,9.333333,13
12,IdentificationVariable,3,5,12.666667,17
13,Intergroup_Competition_Variable,3,5,5.333333,6
14,LeadershipVariable,3,5,7.0,11
17,Partner(s)'_strategiesVariable,3,5,8.666667,11
20,PersonalityVariable,3,5,10.333333,14
23,Public_Goods_GameVariable,3,5,12.0,18
24,PunishmentVariable,3,5,20.333333,28
25,ReputationVariable,3,5,7.0,8


In [44]:
filtered_giv = info_giv_method[info_giv_method.method_nunique == 3].giv_.values
filtered_giv

array(['FeedbackVariable', 'FramingVariable', 'IdentificationVariable',
       'Intergroup_Competition_Variable', 'LeadershipVariable',
       "Partner(s)'_strategiesVariable", 'PersonalityVariable',
       'Public_Goods_GameVariable', 'PunishmentVariable',
       'ReputationVariable', 'RewardVariable', 'UncertaintyVariable'],
      dtype=object)

## GIV from ontology information

In [45]:
df = pd.read_csv("../data/coda_kg.csv")
df = df[df.predicate.str.endswith("subPropertyOf")]
df = df[df['object'].apply(lambda x: any(x.endswith(suffix) for suffix in filtered_giv))]
df["object"] = df["object"].apply(lambda x: x.split("/")[-1])
df.groupby("object").agg({"subject": "nunique"}).reset_index().sort_values(by="subject")

Unnamed: 0,object,subject
3,Intergroup_Competition_Variable,2
12,UncertaintyVariable,3
0,FeedbackVariable,4
1,FramingVariable,6
9,ReputationVariable,6
11,Step-level_Public_Goods_GameVariable,6
5,Partner(s)'_strategiesVariable,6
6,PersonalityVariable,7
4,LeadershipVariable,7
2,IdentificationVariable,8


In [46]:
ont_info = df.groupby("object").agg({"subject": "nunique"}).reset_index().sort_values(by="subject")
pd.merge(info_giv_method, ont_info, left_on='giv_', right_on='object', how='inner').drop(columns=['method_nunique', 'object']).sort_values(by=['subject', 'nb_mean'])

Unnamed: 0,giv_,nb_min,nb_mean,nb_max,subject
3,Intergroup_Competition_Variable,5,5.333333,6,2
12,UncertaintyVariable,5,8.333333,11,3
0,FeedbackVariable,5,9.666667,12,4
11,Step-level_Public_Goods_GameVariable,5,5.0,5,6
9,ReputationVariable,5,7.0,8,6
5,Partner(s)'_strategiesVariable,5,8.666667,11,6
1,FramingVariable,5,9.333333,13,6
4,LeadershipVariable,5,7.0,11,7
6,PersonalityVariable,5,10.333333,14,7
2,IdentificationVariable,5,12.666667,17,8


## Final GIVs

For `var_mod`: the ones with the least hypotheses, since you need two specific independent variables -> picking one where there is a lot of options from the ontology (8 siv)
* `Group_SizeVariable`
* `IdentificationVariable`

For `study_mod`: there are 16 categorical study moderators, which represent many options already. So we prefer to take a GIV with fewer options in the ontology (4 siv)
* `GenderVariable`
* `IncentivesVariable`

For `regular`: medium-level (6 siv)
* `ReputationVariable`
* `Partner(s)'_strategiesVariable`