In [267]:
import os
import math
import pandas as pd
from tqdm import tqdm
import numpy as np
import plotly.express as px

In [268]:
# Maps each higher-level category name to the list of variable identifiers
# grouped under it. The identifiers are the `giv` values that are looked up
# through HIGHER_CATEGORIES_R below.
#
# NOTE(review): two identifiers are listed under more than one category:
#   - 'Partner_typeVariable': "Social and Group Dynamics" and
#     "Behavioral and Strategic Factors"
#   - 'Shadow_of_the_FutureVariable': "Game and Experimental Conditions" and
#     "Environmental and External Factors"
# Confirm which category each one is meant to belong to.
HIGHER_CATEGORIES = {
    "Demographic and Personal Characteristics": ['AgeVariable', 'EducationVariable', 'EthnicityVariable', 
                                                'GenderVariable', 'PersonalityVariable', 'HormonesVariable', 
                                                'ReligiosityVariable'],
    
    "Social and Group Dynamics": ['AcquaintanceVariable', 'Group_SizeVariable', 'LeadershipVariable', 
                                  'MonitoringVariable', 'MatchingVariable', 'OstracismVariable', 
                                  'Partner_realVariable', 'Partner_typeVariable', 
                                  'Intergroup_Competition_Variable', 'Perception_of_the_partner(s)Variable', 
                                  "Partner(s)'_strategiesVariable", 'SymmetryVariable'],
    
    "Psychological and Cognitive Factors": ['Cognitive_loadVariable', 'EmotionsVariable', 'ExpectationsVariable', 
                                           'Game_ComprehensionVariable', 'Motivational_OrientationVariable', 
                                           'Preferences_for_Conditional_CooperationVariable', 'State_TrustVariable'],
    
    "Game and Experimental Conditions": ['Behavior_in_the_gameVariable', 'ChoicesVariable', 'Experimental_SettingVariable', 
                                        'FeedbackVariable', 'FramingVariable', 'Game_DurationVariable', 
                                        'Game_TypeVariable', 'PeriodVariable', 'SequentialityVariable', 
                                        'Shadow_of_the_FutureVariable', 'Time_PressureVariable'],
    
    "Institutional and Structural Factors": ['AnchorVariable', 'CriticalityVariable', 'Degree_of_conflicting_interestsVariable', 
                                            'IncentivesVariable', 'Institutional_ChoiceVariable', 'Institution__Variable', 
                                            'TaxationVariable'],
    
    "Game Types and Scenarios": ['Public_Goods_GameVariable', 'Resource_Dilemma_GameVariable', 
                                 'Step-level_Public_Goods_GameVariable'],
    
    "Behavioral and Strategic Factors": ['CommunicationVariable', 'IdentificationVariable', 'Normative_BehaviorVariable', 
                                        'Partner_typeVariable', 'ReputationVariable', 'SynchronyVariable'],
    
    "Control and Influence Mechanisms": ['PowerVariable', 'PunishmentVariable', 'RewardVariable', 'PrimingVariable'],
    
    "Environmental and External Factors": ['Physical_ProximityVariable', 'UncertaintyVariable', 
                                          'Shadow_of_the_FutureVariable']
}

# Reverse lookup: variable identifier -> category name.
# When an identifier appears under several categories, the category that comes
# later in HIGHER_CATEGORIES overwrites the earlier one (dict keys are unique),
# so 'Partner_typeVariable' resolves to "Behavioral and Strategic Factors" and
# 'Shadow_of_the_FutureVariable' to "Environmental and External Factors".
HIGHER_CATEGORIES_R = {y: x for x, l in HIGHER_CATEGORIES.items() for y in l}

In [269]:
def type_of_effect(row, threshold=0.2):
    """Categorize an effect as 'positive', 'negative' or 'noEffect'.

    Parameters
    ----------
    row : object with ``ES``, ``ESLower`` and ``ESUpper`` attributes
        Typically one DataFrame row passed by ``DataFrame.apply(axis=1)``.
        ``ES`` is the effect size, ``ESLower`` / ``ESUpper`` the bounds of its
        interval (either may be NaN when unavailable).
    threshold : float, default 0.2
        Magnitude cut-off used only when the interval is missing: effect sizes
        strictly inside ``(-threshold, threshold)`` count as 'noEffect'.

    Returns
    -------
    str
        * 'noEffect' if the effect size itself is NaN (an unknown effect is
          neither supporting nor contradicting evidence),
        * otherwise, when a bound is missing: 'noEffect' / 'positive' /
          'negative' according to ``threshold``,
        * otherwise: 'noEffect' if the interval contains 0, else the sign of
          the effect size.
    """
    es = float(row.ES)
    # Every comparison against NaN is False, so without this guard a missing
    # effect size would fall through to the final `else` and count as 'negative'.
    if math.isnan(es):
        return 'noEffect'

    if math.isnan(row.ESLower) or math.isnan(row.ESUpper):
        # No interval available: fall back to a magnitude threshold.
        if -threshold < es < threshold:
            return 'noEffect'
        return 'positive' if es >= threshold else 'negative'

    # Interval available: an interval that contains zero means no clear effect.
    if row.ESLower <= 0 <= row.ESUpper:
        return 'noEffect'
    return 'positive' if es > 0 else 'negative'

def get_output_csv(path, giv):
    """Read one output CSV and tag all of its rows with `giv`.

    The first CSV column becomes the index; the returned frame carries an
    extra ``giv`` column holding the given value on every row.
    """
    return pd.read_csv(path, index_col=0).assign(giv=giv)

def get_input_csv(path):
    """Read an input CSV and label every row with its effect category.

    The first CSV column becomes the index. An ``effect`` column is added by
    applying `type_of_effect` row by row, with a tqdm progress bar.
    """
    tqdm.pandas()  # registers `progress_apply` on pandas objects
    effects = pd.read_csv(path, index_col=0)
    effects["effect"] = effects.progress_apply(type_of_effect, axis=1)
    return effects

In [270]:
# Hypothesis types handled by this notebook; each one has its own input CSV.
THS = ["regular", "study_mod", "var_mod"]
# Hypothesis type -> input frame with the derived `effect` column.
DATA_IN = dict(
    (th, get_input_csv(f"../../data/hypotheses/entry/h_{th}_es_d.csv"))
    for th in THS
)

  0%|          | 0/4463 [00:00<?, ?it/s]

100%|██████████| 4463/4463 [00:00<00:00, 30159.73it/s]
100%|██████████| 91156/91156 [00:02<00:00, 38292.11it/s]
100%|██████████| 846/846 [00:00<00:00, 35698.72it/s]


In [271]:
def format_data_out(fo):
    """Load every ``*.csv`` in folder `fo` into one frame and annotate it.

    Each file is read with `get_output_csv`; the file name without its
    ``.csv`` extension is stored in the ``giv`` column. The ``dependent``
    column is prefixed with the dependent-variable base URI and a ``category``
    column is derived from ``giv`` through `HIGHER_CATEGORIES_R`.

    Parameters
    ----------
    fo : str
        Path of the folder containing the output CSV files.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files, in sorted file-name order.

    Raises
    ------
    ValueError
        If `fo` contains no ``.csv`` file.
    KeyError
        If a file name does not correspond to a key of `HIGHER_CATEGORIES_R`.
    """
    # os.listdir returns entries in arbitrary order; sort so that the row
    # order of the result is reproducible across machines and runs.
    files = sorted(x for x in os.listdir(fo) if x.endswith(".csv"))
    if not files:
        raise ValueError(f"No .csv files found in {fo!r}")

    # splitext strips only the trailing extension, whereas
    # str.replace(".csv", "") would also remove ".csv" inside the name.
    data_out = pd.concat(
        [get_output_csv(path=os.path.join(fo, x), giv=os.path.splitext(x)[0]) for x in files],
        axis=0,
    )
    data_out["dependent"] = "https://data.cooperationdatabank.org/id/dependentvariable/" + data_out["dependent"]

    unknown = sorted(set(data_out["giv"]) - set(HIGHER_CATEGORIES_R))
    if unknown:
        raise KeyError(f"No higher category defined for: {unknown}")
    data_out["category"] = data_out["giv"].map(HIGHER_CATEGORIES_R)
    return data_out

# Hypothesis type -> concatenated, annotated output frame.
DATA_OUT = dict(
    (th, format_data_out(f"./final/h_{th}_es_d/outputs"))
    for th in THS
)
# Print the number of input rows next to the number of output rows per type.
for k, v in DATA_IN.items():
    n_in, n_out = v.shape[0], DATA_OUT[k].shape[0]
    print(f"{k}\t# h_data: {n_in}\t# h_output: {n_out}")


regular	# h_data: 4463	# h_output: 243
study_mod	# h_data: 91156	# h_output: 290
var_mod	# h_data: 846	# h_output: 60


In [272]:
def get_evidence_data(cols, data, row):
    """Return the rows of `data` that agree with `row` on every column in `cols`.

    The filter is applied one column at a time; with an empty `cols` the
    input frame is returned unchanged.
    """
    subset = data
    for name in cols:
        keep = subset[name] == row[name]
        subset = subset.loc[keep]
    return subset

def get_ratio(cols, data, row):
    """Count the evidence for and against the hypothesis described by `row`.

    The rows of `data` matching `row` on every column in `cols` are grouped by
    their ``effect`` label, counting non-null ``obs`` values per label.

    Parameters
    ----------
    cols : list of str
        Columns on which `data` must equal `row`.
    data : pandas.DataFrame
        Evidence table; must provide the ``effect`` and ``obs`` columns.
    row : pandas.Series
        Hypothesis row; must provide ``comparative`` and every column in `cols`.

    Returns
    -------
    pandas.Series
        `row` with four added entries:
        ``evidence_plus``  - count of effects in the hypothesised direction,
        ``evidence_minus`` - count of effects in the opposite direction,
        ``acc``            - evidence_plus / (evidence_plus + evidence_minus),
                             NaN when there is no evidence either way,
        ``diff``           - evidence_plus - evidence_minus.
    """
    evidence = get_evidence_data(cols, data, row)
    counts = evidence.groupby('effect')['obs'].count()

    if row.comparative == "higher":  # supporting evidence = positive effects
        supporting, opposing = 'positive', 'negative'
    else:  # any other comparative (e.g. "lower"): supporting = negative effects
        supporting, opposing = 'negative', 'positive'

    plus = counts.get(supporting, 0)
    minus = counts.get(opposing, 0)
    total = plus + minus

    row["evidence_plus"] = plus
    row["evidence_minus"] = minus
    # NaN (not the string "N/A") keeps the column numeric, so that dropna(),
    # aggregations and numeric plot axes treat "no evidence" as missing.
    row["acc"] = plus / total if total else float("nan")
    row["diff"] = plus - minus
    return row

def add_info(df, th):
    """Attach the hypothesis type and the evidence counts to every row of `df`.

    `df` receives a ``th`` column (set in place), then each row is passed to
    `get_ratio` together with the matching input data ``DATA_IN[th]``. Rows
    are matched on ``dependent`` plus every column whose name ends in
    ``_label``. Returns the frame produced by the row-wise apply.
    """
    label_cols = [name for name in df.columns if name.endswith("_label")]
    cols_filter = ["dependent"] + label_cols
    df["th"] = th
    tqdm.pandas()  # registers `progress_apply` on pandas objects
    return df.progress_apply(
        lambda row: get_ratio(cols_filter, DATA_IN[th], row),
        axis=1,
    )

# Replace each output frame by its version enriched with evidence counts.
DATA_OUT = dict((name, add_info(frame, name)) for name, frame in DATA_OUT.items())
# Display a random sample of ten enriched rows.
DATA_OUT["regular"].sample(10)


100%|██████████| 243/243 [00:01<00:00, 159.30it/s]
100%|██████████| 290/290 [00:07<00:00, 38.31it/s]
100%|██████████| 60/60 [00:00<00:00, 163.91it/s]


Unnamed: 0,dependent,iv,iv_label,cat_t1,cat_t1_label,cat_t2,cat_t2_label,comparative,giv,category,th,evidence_plus,evidence_minus,acc,diff
4,https://data.cooperationdatabank.org/id/depend...,heterogeneousEthnicity,Heterogeneous ethnicity,False,False,True,True,higher,EthnicityVariable,Demographic and Personal Characteristics,regular,0,0,,0
0,https://data.cooperationdatabank.org/id/depend...,gameComprehensionLevel,Comprehension of the game level,low,Low,high,High,higher,Game_ComprehensionVariable,Psychological and Cognitive Factors,regular,0,0,,0
0,https://data.cooperationdatabank.org/id/depend...,rebateorRefund,Rebate vs refund,absent,Absent,full_refund,Full Refund,higher,Institution__Variable,Institutional and Structural Factors,regular,0,2,0.0,-2
1,https://data.cooperationdatabank.org/id/depend...,feedbackContent,Feedback content,choice,Choice,earnings,Earnings,higher,FeedbackVariable,Game and Experimental Conditions,regular,7,4,0.636364,3
3,https://data.cooperationdatabank.org/id/depend...,communicationContent,Communication content,irrelevant,Irrelevant,promise,Promise,higher,CommunicationVariable,Behavioral and Strategic Factors,regular,0,3,0.0,-3
2,https://data.cooperationdatabank.org/id/depend...,matching,Matching,partner,Partner,partner's_choice,Partner's choice,lower,MatchingVariable,Social and Group Dynamics,regular,3,0,1.0,3
2,https://data.cooperationdatabank.org/id/depend...,conflictIndexLevel,Conflict level,high,High,medium,Medium,higher,Degree_of_conflicting_interestsVariable,Institutional and Structural Factors,regular,9,30,0.230769,-21
4,https://data.cooperationdatabank.org/id/depend...,preferenceConditionalCooperation,Preference for conditional cooperation,freeriders,Freeriders,hump-shaped_contributors,Hump-shaped contributors,higher,Preferences_for_Conditional_CooperationVariable,Psychological and Cognitive Factors,regular,0,1,0.0,-1
2,https://data.cooperationdatabank.org/id/depend...,institutionalChoice,Institutional choice,endogenous,Endogenous,exogenous,Exogenous,higher,Institutional_ChoiceVariable,Institutional and Structural Factors,regular,19,7,0.730769,12
2,https://data.cooperationdatabank.org/id/depend...,emotion,Emotion,guilt,Guilt,neutral,Neutral,higher,EmotionsVariable,Psychological and Cognitive Factors,regular,1,0,1.0,1


In [273]:
# Columns needed by the visualisations below.
COLS_KEEP = ["giv", "category", "th", "evidence_plus", "evidence_minus", "acc", "diff"]
# Stack the selected columns of all hypothesis types into a single frame.
DATA_VIS = pd.concat(
    [frame[COLS_KEEP] for frame in DATA_OUT.values()],
    axis=0,
)
DATA_VIS.sample(3)

Unnamed: 0,giv,category,th,evidence_plus,evidence_minus,acc,diff
2,Game_ComprehensionVariable,Psychological and Cognitive Factors,study_mod,0,0,,0
4,Public_Goods_GameVariable,Game Types and Scenarios,study_mod,1,3,0.25,-2
2,PowerVariable,Control and Influence Mechanisms,study_mod,3,2,0.6,1


In [274]:
def _coefficient_of_variation(values):
    """Return std(values) / |mean(values)|, or NaN when the mean is zero.

    NaN (rather than a placeholder string) keeps the aggregated column numeric
    and lets ``dropna()`` discard undefined values before plotting.
    """
    mean = np.mean(values)
    # NOTE(review): np.std defaults to ddof=0 while the "std" aggregation below
    # uses pandas' default ddof=1 - confirm which one is intended for the cv.
    return np.std(values) / abs(mean) if mean else np.nan


# Mean, standard deviation and coefficient of variation of `diff`
# for every (th, giv, category) group.
DO_MEAN_VAR = (
    DATA_VIS.groupby(["th", "giv", "category"])
    .agg({"diff": ["mean", "std", ("cv", _coefficient_of_variation)]})
    .reset_index()
)
# Flatten the two-level column index: ("diff", "mean") -> "diff_mean",
# ("th", "") -> "th_", and so on.
DO_MEAN_VAR.columns = ['_'.join(x) for x in DO_MEAN_VAR.columns]

color_palette = px.colors.qualitative.Safe
# Groups with an undefined std or cv are NaN and are removed by dropna().
fig = px.scatter(DO_MEAN_VAR.dropna(), x="diff_mean", y="diff_cv", color="th_", hover_data=["giv_"],
                 color_discrete_sequence=color_palette)
fig.write_image("../visualisations/llm_mean_diff_vs_coeff_var_diff.pdf", format='pdf')
fig.show()

In [275]:
# Histogram of `acc`, one colour per hypothesis type, bars drawn side by side.
color_palette = px.colors.qualitative.Safe
fig = px.histogram(
    DATA_VIS,
    x="acc",
    color="th",
    nbins=30,
    histnorm="",
    opacity=0.75,
    color_discrete_sequence=color_palette,
)
fig.update_layout(barmode='group')
fig.write_image("../visualisations/llm_hist_acc.pdf", format='pdf')
fig.show()

In [276]:
# Number of variables to keep at each end of the ranking.
top = 5
# Mean `diff` per giv over all hypothesis types, computed once and reused.
mean_diff_per_giv = DATA_VIS.groupby("giv").agg({"diff": "mean"}).reset_index()
# The `top` givs with the highest / lowest mean diff.
pos_dif = mean_diff_per_giv.sort_values(by="diff", ascending=False).giv.values[:top]
neg_dif = mean_diff_per_giv.sort_values(by="diff", ascending=True).giv.values[:top]

In [277]:
# Box plot of `diff` for the givs selected in `pos_dif`, coloured by hypothesis type.
color_palette = px.colors.qualitative.Safe
fig = px.box(
    DATA_VIS[DATA_VIS.giv.isin(pos_dif)],
    x="giv",
    y="diff",
    color="th",
    points='all',
    color_discrete_sequence=color_palette,
)
# NOTE(review): `categoryorder` presumably only affects categorical axes, and
# y ("diff") is numeric here - check whether `xaxis` was intended.
fig.update_layout(
    yaxis={'categoryorder': 'total descending'},
    width=600,
    height=600,
    showlegend=False,
)
fig.write_image("../visualisations/llm_pos_diff_giv_all.pdf", format='pdf')
fig.show()

In [278]:
# Box plot of `diff` for the givs selected in `neg_dif`, coloured by hypothesis type.
color_palette = px.colors.qualitative.Safe
fig = px.box(
    DATA_VIS[DATA_VIS.giv.isin(neg_dif)],
    x="giv",
    y="diff",
    color="th",
    points='all',
    color_discrete_sequence=color_palette,
)
# NOTE(review): `categoryorder` presumably only affects categorical axes, and
# y ("diff") is numeric here - check whether `xaxis` was intended.
fig.update_layout(
    yaxis={'categoryorder': 'total descending'},
    width=600,
    height=600,
    showlegend=False,
)
fig.write_image("../visualisations/llm_neg_diff_giv_all.pdf", format='pdf')
fig.show()

In [279]:
# Number of variables to keep at each end of the ranking, per hypothesis type.
top = 5
# NOTE(review): these two lists are never filled in this cell; kept in case a
# cell outside this view relies on the names.
pos_dif_all = []
neg_diff_all = []
# One frame per hypothesis type, restricted to its own top / bottom givs.
data_vis_pos = []
data_vis_neg = []
for th in DATA_VIS.th.unique():
    # Filter and aggregate once per hypothesis type, then reuse.
    data_th = DATA_VIS[DATA_VIS.th == th]
    mean_diff_th = data_th.groupby("giv").agg({"diff": "mean"}).reset_index()
    pos_diff = mean_diff_th.sort_values(by="diff", ascending=False).giv.values.tolist()[:top]
    neg_diff = mean_diff_th.sort_values(by="diff", ascending=True).giv.values.tolist()[:top]
    data_vis_pos.append(data_th[data_th.giv.isin(pos_diff)])
    data_vis_neg.append(data_th[data_th.giv.isin(neg_diff)])

In [280]:
# Box plot of `diff` for the givs collected in `data_vis_pos`, coloured by hypothesis type.
color_palette = px.colors.qualitative.Safe
fig = px.box(
    pd.concat(data_vis_pos, axis=0),
    x="giv",
    y="diff",
    color="th",
    points='all',
    color_discrete_sequence=color_palette,
)
# NOTE(review): `categoryorder` presumably only affects categorical axes, and
# y ("diff") is numeric here - check whether `xaxis` was intended.
fig.update_layout(
    yaxis={'categoryorder': 'total descending'},
    width=600,
    height=600,
    showlegend=False,
)
fig.write_image("../visualisations/llm_pos_diff_giv_distinct_per_th.pdf", format='pdf')
fig.show()

In [281]:
# Box plot of `diff` for the givs collected in `data_vis_neg`, coloured by hypothesis type.
color_palette = px.colors.qualitative.Safe
fig = px.box(
    pd.concat(data_vis_neg, axis=0),
    x="giv",
    y="diff",
    color="th",
    points='all',
    color_discrete_sequence=color_palette,
)
# NOTE(review): `categoryorder` presumably only affects categorical axes, and
# y ("diff") is numeric here - check whether `xaxis` was intended.
fig.update_layout(
    yaxis={'categoryorder': 'total descending'},
    width=600,
    height=600,
    showlegend=False,
)
fig.write_image("../visualisations/llm_neg_diff_giv_distinct_per_th.pdf", format='pdf')
fig.show()