# Choosing User Studies GIV 

Based on quantitative criteria

In [18]:
import os
import json
import random
import textwrap
import pandas as pd
import plotly.express as px
from IPython.display import display
from kglab.helpers.kg_query import run_query
from kglab.helpers.variables import HEADERS_CSV

## Parameters 
Only change the `HYPOTHESIS` variable, can be one of the followings:
[`regular`, `study_mod`, `variable_mod`]

In [19]:
HYPOTHESIS = "regular"
ES = "d"
METHODS = ["classification", "llm_zero_shot_prompting", "anyburl"]
FOLDERS = {
    m: f"./{m}/final/h_{HYPOTHESIS}_es_{ES}/outputs" for m in METHODS
}
FOLDERS

{'classification': './classification/final/h_regular_es_d/outputs',
 'llm_zero_shot_prompting': './llm_zero_shot_prompting/final/h_regular_es_d/outputs',
 'anyburl': './anyburl/final/h_regular_es_d/outputs'}

## GIV where there is at least 5 hypotheses for each of the 3 methods

In [20]:
def get_nb_h(method, folder):
    """ Get nb of hypotheses per method per giv """
    files = [x for x in os.listdir(folder) if x.endswith(".csv")]
    res = []
    for f in files:
        data = pd.read_csv(os.path.join(folder, f), index_col=0)
        res.append((method, f.replace(".csv", ""), data.shape[0]))
    return pd.DataFrame(res, columns=["method", "giv", "nb"])

info_h = pd.concat([get_nb_h(m, f) for m, f in FOLDERS.items()])
info_h.head(5)

Unnamed: 0,method,giv,nb
0,classification,Game_DurationVariable,1
1,classification,MatchingVariable,3
2,classification,EducationVariable,2
3,classification,AcquaintanceVariable,2
4,classification,ChoicesVariable,1


In [21]:
fig = px.histogram(info_h, x="nb", color="method", nbins=50)
fig.show()

In [22]:
info_giv_method = info_h[info_h.nb >= 5].groupby("giv").agg({"method": "nunique", "nb": ["min", "mean", "max"]}).reset_index()
info_giv_method.columns = ["_".join(x) for x in info_giv_method.columns]
print(f"Total method+h: {info_h.shape[0]}")
print(f"Total method+h (# h >= 5): {info_h[info_h.nb >= 5].shape[0]}")
print(f"Total giv (# h >= 5 | # m = 3): {info_giv_method[info_giv_method.method_nunique == 3].shape[0]}")
display(info_giv_method[info_giv_method.method_nunique == 3])
# fig = px.histogram(info_giv_method, x="method_nunique", color="giv_", nbins=3)
# fig.show()

Total method+h: 153
Total method+h (# h >= 5): 72
Total giv (# h >= 5 | # m = 3): 15


Unnamed: 0,giv_,method_nunique,nb_min,nb_mean,nb_max
4,Degree_of_conflicting_interestsVariable,3,5,11.333333,15
8,FeedbackVariable,3,5,9.666667,12
9,FramingVariable,3,5,9.333333,13
12,GenderVariable,3,5,14.666667,21
13,Group_SizeVariable,3,5,8.666667,11
14,IdentificationVariable,3,5,12.666667,17
16,Intergroup_Competition_Variable,3,5,5.333333,6
17,LeadershipVariable,3,5,7.0,11
21,Partner(s)'_strategiesVariable,3,5,8.666667,11
24,PersonalityVariable,3,5,10.333333,14


In [23]:
filtered_giv = info_giv_method[info_giv_method.method_nunique == 3].giv_.values
filtered_giv

array(['Degree_of_conflicting_interestsVariable', 'FeedbackVariable',
       'FramingVariable', 'GenderVariable', 'Group_SizeVariable',
       'IdentificationVariable', 'Intergroup_Competition_Variable',
       'LeadershipVariable', "Partner(s)'_strategiesVariable",
       'PersonalityVariable', 'Public_Goods_GameVariable',
       'PunishmentVariable', 'ReputationVariable', 'RewardVariable',
       'UncertaintyVariable'], dtype=object)

## GIV from ontology information

In [24]:
with open("./cat_moderators.json", "r", encoding="utf-8") as openfile:
    cat_mods = json.load(openfile)

In [25]:
df = pd.read_csv("../data/coda_kg.csv")
df = df[df.predicate.str.endswith("subPropertyOf")]
df = df[df.subject != df.object]
df = df[df['object'].apply(lambda x: any(x.endswith(suffix) for suffix in filtered_giv))]
df["tm"] = df["subject"].apply(lambda x: cat_mods[x])
df = df[df.tm == "categorical"]
df.head(3)

Unnamed: 0,subject,predicate,object,tm
224744,https://data.cooperationdatabank.org/vocab/pro...,http://www.w3.org/2000/01/rdf-schema#subProper...,https://data.cooperationdatabank.org/vocab/pro...,categorical
224756,https://data.cooperationdatabank.org/vocab/pro...,http://www.w3.org/2000/01/rdf-schema#subProper...,https://data.cooperationdatabank.org/vocab/pro...,categorical
224760,https://data.cooperationdatabank.org/vocab/pro...,http://www.w3.org/2000/01/rdf-schema#subProper...,https://data.cooperationdatabank.org/vocab/pro...,categorical


In [26]:
df["object"] = df["object"].apply(lambda x: x.split("/")[-1])
df.groupby("object").agg({"subject": "nunique"}).reset_index().sort_values(by="subject")

Unnamed: 0,object,subject
6,Intergroup_Competition_Variable,1
0,Degree_of_conflicting_interestsVariable,2
15,UncertaintyVariable,2
3,GenderVariable,3
4,Group_SizeVariable,3
14,Step-level_Public_Goods_GameVariable,3
8,Partner(s)'_strategiesVariable,3
1,FeedbackVariable,3
12,ReputationVariable,5
9,PersonalityVariable,5


In [27]:
ont_info = df.groupby("object").agg({"subject": "nunique"}).reset_index().sort_values(by="subject")
pd.merge(info_giv_method, ont_info, left_on='giv_', right_on='object', how='inner').drop(columns=['method_nunique', 'object']).sort_values(by=['subject', 'nb_mean'])

Unnamed: 0,giv_,nb_min,nb_mean,nb_max,subject
6,Intergroup_Competition_Variable,5,5.333333,6,1
15,UncertaintyVariable,5,8.333333,11,2
0,Degree_of_conflicting_interestsVariable,5,11.333333,15,2
14,Step-level_Public_Goods_GameVariable,5,5.0,5,3
4,Group_SizeVariable,5,8.666667,11,3
8,Partner(s)'_strategiesVariable,5,8.666667,11,3
1,FeedbackVariable,5,9.666667,12,3
3,GenderVariable,5,14.666667,21,3
12,ReputationVariable,5,7.0,8,5
2,FramingVariable,5,9.333333,13,5


## Final GIVs

We consider the categorical moderators only, to align with our training data.

For `var_mod`: the ones with the least hypotheses, since you need two specific independent variables -> picking one where there is a lot of options from the ontology (7 siv)
* `LeadershipVariable`
* `IdentificationVariable`

For `study_mod`: there are 16 categorical study moderators, which represent many options already. So we prefer to take a GIV with fewer options in the ontology 3 siv)
* `GenderVariable`
* `IncentivesVariable`

For `regular`: medium-level (5 siv)
* `ReputationVariable`
* `PersonalityVariable`

In [28]:
SPARQL_ENDPOINT = "http://localhost:7200/repositories/coda"
# Query to get the documentation for regular/variable moderators
QUERY_TEMPLATE = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
SELECT ?iv_label ?iv_des ?siv_label ?siv_des WHERE {
    ?iv rdfs:subPropertyOf <https://data.cooperationdatabank.org/vocab/prop/[giv_var]> ;
        rdfs:label ?iv_label ;
        dc:description ?iv_des ;
        rdfs:range ?range .
    OPTIONAL {
        ?siv rdf:type ?range ;
         rdfs:label ?siv_label ;
         dc:description ?siv_des .
    }
    
}
"""
QUERY_STUDY = """
PREFIX cc: <https://data.cooperationdatabank.org/vocab/class/>
PREFIX cp: <https://data.cooperationdatabank.org/vocab/prop/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX xml: <http://www.w3.org/2001/XMLSchema#>
select distinct ?study_mod ?study_mod_des ?val ?val_des where { 
	?study_mod rdfs:domain cc:Study ;
               rdfs:range ?range .
               #rdf:type owl:DatatypeProperty .
	OPTIONAL {?study_mod dct:description ?study_mod_des .}
    FILTER (?study_mod NOT IN (cp:comments, cp:descriptionIV, cp:label, cp:type, cp:otherVariables, cp:reportsEffect, cp:studyOtherDilemmaType, cp:studySequentiality))
            FILTER (?range NOT IN (cc:ValueRange, xml:double, xml:integer))
    ?study ?study_mod ?val .
    OPTIONAL {?val dct:description ?val_des .}
} 
"""
# GIV_VARS

GIV_VAR = [
    "ReputationVariable", "PersonalityVariable",  # regular
    "LeadershipVariable", "IdentificationVariable",  # var_mod
    "GenderVariable", "IncentivesVariable",  # study_mod
]

In [29]:
# for giv in GIV_VAR:
#     print(QUERY_TEMPLATE.replace("[giv_var]", giv))
#     df_des = run_query(query=QUERY_TEMPLATE.replace("[giv_var]", giv),
#                        sparql_endpoint=SPARQL_ENDPOINT,
#                        headers=HEADERS_CSV)

In [30]:
## Retrieving hypotheses
NB = 2
I_TO_M = {}
for index, method in enumerate(METHODS):
    for _ in range(NB):
        curr_index = max(I_TO_M.keys()) + 1 if I_TO_M else 0
        I_TO_M.update({curr_index: method})
COLS = ["th", "giv", "m", "i_orig", "i_shuffled", "h"]
DF_H = pd.DataFrame(columns=COLS)

for (th, giv) in [
    ('regular', 'ReputationVariable'), ('regular', 'PersonalityVariable'),
    ('var_mod', 'LeadershipVariable'), ('var_mod', 'IdentificationVariable'),
    ('study_mod', 'GenderVariable'), ('study_mod', 'IncentivesVariable'),
]:
    files = [f"./{m}/final/h_{th}_es_d/outputs/{giv}.txt" for m in METHODS]
    h = []
    for f in files:
        lines = open(f, 'r', encoding='utf-8').readlines()[:NB]
        h.extend([x.replace("\n", "") for x in lines])
    indexes = list(range(NB*len(METHODS)))
    random.shuffle(indexes)
    mapping = {val: index for index, val in enumerate(indexes)}
    df = pd.DataFrame([(th, giv, I_TO_M[val], val, mapping[val], h[val]) for val in sorted(I_TO_M.keys())], columns=COLS)
    DF_H = pd.concat([DF_H, df])

DF_H.to_csv("./user_studies/hypotheses.csv")
DF_H.sample(3)

Unnamed: 0,th,giv,m,i_orig,i_shuffled,h
4,regular,PersonalityVariable,anyburl,4,0,Cooperation is significantly lower when svo ty...
1,regular,PersonalityVariable,classification,1,2,Cooperation is significantly lower when svo ty...
4,regular,ReputationVariable,anyburl,4,3,Cooperation is significantly lower when partne...


In [31]:
DF_H.columns

Index(['th', 'giv', 'm', 'i_orig', 'i_shuffled', 'h'], dtype='object')

In [32]:
f_helper = open("./user_studies/hyposeses_form.txt", "w", encoding="utf-8")
for th in DF_H.th.unique():
    curr_df = DF_H[DF_H.th == th]
    for giv in curr_df.giv.unique():
        h_df = curr_df[curr_df.giv == giv].sort_values(by="i_shuffled")
        f_helper.write(f"{th} | {giv}"+ "\n\n" + '\n'.join(h_df.h.values) + "\n==========\n")
f_helper.close()