In [71]:
from scipy.stats import pearsonr, spearmanr, kendalltau
import plotly.express as px
import pandas as pd
import numpy as np

In [84]:
HYPOTHESIS = "var_mod"
ES = "r"

In [85]:
df = pd.read_csv(f"classification/hp_search/h_{HYPOTHESIS}_es_{ES}_metrics.csv", index_col=0).fillna(-1)
features = ['criterion', 'splitter', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features']
metrics = ["acc_train", "acc_val"] + \
    [f"{x}_{y}_{z}" for x in ["precision", "recall", "f1"] \
        for y in ["macro", "micro", "weighted"] \
            for z in ["train", "val"]]
print(f"{df.shape[0]} experiments")
for col in features:
    print(f"features `{col}`: {sorted(df[col].unique())}")
for m in metrics:
    print(f"Metric `{m}`: min/mean/max\t {round(np.min(df[m].values), 3)}/{round(np.mean(df[m].values), 3)}/{round(np.max(df[m].values), 3)}")
df.head(3)

864 experiments
features `criterion`: ['entropy', 'gini', 'log_loss']
features `splitter`: ['best', 'random']
features `max_depth`: [-1.0, 5.0, 10.0, 15.0]
features `min_samples_split`: [2, 5, 10]
features `min_samples_leaf`: [1, 2, 4]
features `max_features`: ['0.5', '0.7', 'log2', 'sqrt']
Metric `acc_train`: min/mean/max	 0.587/0.685/0.703
Metric `acc_val`: min/mean/max	 0.447/0.581/0.684
Metric `precision_macro_train`: min/mean/max	 0.605/0.707/0.731
Metric `precision_macro_val`: min/mean/max	 0.506/0.624/0.701
Metric `precision_micro_train`: min/mean/max	 0.587/0.685/0.703
Metric `precision_micro_val`: min/mean/max	 0.447/0.581/0.684
Metric `precision_weighted_train`: min/mean/max	 0.611/0.706/0.732
Metric `precision_weighted_val`: min/mean/max	 0.51/0.632/0.712
Metric `recall_macro_train`: min/mean/max	 0.581/0.665/0.687
Metric `recall_macro_val`: min/mean/max	 0.454/0.598/0.692
Metric `recall_micro_train`: min/mean/max	 0.587/0.685/0.703
Metric `recall_micro_val`: min/mean/max	 0

Unnamed: 0,criterion,max_depth,max_features,min_samples_leaf,min_samples_split,random_state,splitter,acc_train,acc_val,precision_macro_train,...,recall_micro_train,recall_micro_val,recall_weighted_train,recall_weighted_val,f1_macro_train,f1_macro_val,f1_micro_train,f1_micro_val,f1_weighted_train,f1_weighted_val
0,gini,-1.0,sqrt,1,2,23,best,0.702797,0.578947,0.721942,...,0.702797,0.578947,0.702797,0.578947,0.689002,0.582459,0.702797,0.578947,0.698148,0.576758
1,gini,-1.0,sqrt,1,2,23,random,0.702797,0.578947,0.721942,...,0.702797,0.578947,0.702797,0.578947,0.689002,0.582459,0.702797,0.578947,0.698148,0.576758
2,gini,-1.0,sqrt,1,5,23,best,0.702797,0.578947,0.721942,...,0.702797,0.578947,0.702797,0.578947,0.689002,0.582459,0.702797,0.578947,0.698148,0.576758


In [86]:
df[df["f1_macro_train"]==np.max(df["f1_macro_train"].values)]

Unnamed: 0,criterion,max_depth,max_features,min_samples_leaf,min_samples_split,random_state,splitter,acc_train,acc_val,precision_macro_train,...,recall_micro_train,recall_micro_val,recall_weighted_train,recall_weighted_val,f1_macro_train,f1_macro_val,f1_micro_train,f1_micro_val,f1_weighted_train,f1_weighted_val
328,entropy,-1.0,0.5,1,10,23,best,0.702797,0.578947,0.712858,...,0.702797,0.578947,0.702797,0.578947,0.691894,0.582459,0.702797,0.578947,0.700053,0.576758
334,entropy,-1.0,0.5,2,10,23,best,0.702797,0.578947,0.712858,...,0.702797,0.578947,0.702797,0.578947,0.691894,0.582459,0.702797,0.578947,0.700053,0.576758
340,entropy,-1.0,0.5,4,10,23,best,0.702797,0.578947,0.712858,...,0.702797,0.578947,0.702797,0.578947,0.691894,0.582459,0.702797,0.578947,0.700053,0.576758
472,entropy,10.0,0.5,1,10,23,best,0.702797,0.578947,0.712858,...,0.702797,0.578947,0.702797,0.578947,0.691894,0.582459,0.702797,0.578947,0.700053,0.576758
478,entropy,10.0,0.5,2,10,23,best,0.702797,0.578947,0.712858,...,0.702797,0.578947,0.702797,0.578947,0.691894,0.582459,0.702797,0.578947,0.700053,0.576758
484,entropy,10.0,0.5,4,10,23,best,0.702797,0.578947,0.712858,...,0.702797,0.578947,0.702797,0.578947,0.691894,0.582459,0.702797,0.578947,0.700053,0.576758
544,entropy,15.0,0.5,1,10,23,best,0.702797,0.578947,0.712858,...,0.702797,0.578947,0.702797,0.578947,0.691894,0.582459,0.702797,0.578947,0.700053,0.576758
550,entropy,15.0,0.5,2,10,23,best,0.702797,0.578947,0.712858,...,0.702797,0.578947,0.702797,0.578947,0.691894,0.582459,0.702797,0.578947,0.700053,0.576758
556,entropy,15.0,0.5,4,10,23,best,0.702797,0.578947,0.712858,...,0.702797,0.578947,0.702797,0.578947,0.691894,0.582459,0.702797,0.578947,0.700053,0.576758
616,log_loss,-1.0,0.5,1,10,23,best,0.702797,0.578947,0.712858,...,0.702797,0.578947,0.702797,0.578947,0.691894,0.582459,0.702797,0.578947,0.700053,0.576758


In [87]:
method_to_func = {"pearson": pearsonr, "kendall": kendalltau, "spearman": spearmanr}

def build_cols(metrics):
    """ Columns for df """
    columns = []
    for x in metrics:
        columns += [f"{x}_corr", f"{x}_pval"]
    return columns

def get_correlations(df, method, params, metrics):
    """ Retrieve correlations between params and metrics """
    data, mappings = [], {}
    for p in params:
        curr_data = []
        for m in metrics:
            if not isinstance(df[p].values[0], str):
                x = list(df[p].values)
            else:
                curr_mapping = {val: index for index, val in enumerate(df[p].unique())}
                mappings[p] = curr_mapping
                x = [curr_mapping[elt] for elt in df[p].values]
            curr_data += list(method_to_func[method](x, list(df[m].values)))
        data.append(curr_data)
    return pd.DataFrame(data, columns=build_cols(metrics), index=params), mappings

In [89]:
corrs, mappings = get_correlations(df, 'pearson', features, metrics)
corrs, mappings = get_correlations(
    df[(df.splitter == "best") & (df.max_depth == 15)],
    'pearson', features, metrics)
print(mappings)
corrs

{'criterion': {'gini': 0, 'entropy': 1, 'log_loss': 2}, 'splitter': {'best': 0}, 'max_features': {'sqrt': 0, 'log2': 1, '0.5': 2, '0.7': 3}}



An input array is constant; the correlation coefficient is not defined.



Unnamed: 0,acc_train_corr,acc_train_pval,acc_val_corr,acc_val_pval,precision_macro_train_corr,precision_macro_train_pval,precision_macro_val_corr,precision_macro_val_pval,precision_micro_train_corr,precision_micro_train_pval,...,f1_macro_val_corr,f1_macro_val_pval,f1_micro_train_corr,f1_micro_train_pval,f1_micro_val_corr,f1_micro_val_pval,f1_weighted_train_corr,f1_weighted_train_pval,f1_weighted_val_corr,f1_weighted_val_pval
criterion,-2.2551410000000003e-17,1.0,-0.1867718,0.052932,0.07799164,0.422377,-0.2492594,0.009283,-2.2551410000000003e-17,1.0,...,-0.1847264,0.055633,-2.2551410000000003e-17,1.0,-0.1867718,0.052932,-0.01520187,0.8759123,-0.191042,0.047642
splitter,,,,,,,,,,,...,,,,,,,,,,
max_depth,,,,,,,,,,,...,,,,,,,,,,
min_samples_split,-0.6217016,6.953483e-13,-0.2772843,0.003669,-0.4362057,2e-06,-0.1825128,0.058682,-0.6217016,6.953483e-13,...,-0.2815722,0.003156,-0.6217016,6.953483e-13,-0.2772843,0.003669,-0.5594659,3.095302e-10,-0.2785725,0.003508
min_samples_leaf,3.469447e-18,1.0,1.257675e-17,1.0,-9.107298e-18,1.0,7.372575e-18,1.0,3.469447e-18,1.0,...,-1.0408340000000001e-17,1.0,3.469447e-18,1.0,1.257675e-17,1.0,-1.8214600000000002e-17,1.0,1.387779e-17,1.0
max_features,0.2085144,0.03034225,0.2500646,0.009051,0.1588385,0.100604,0.3026142,0.001456,0.2085144,0.03034225,...,0.2456639,0.010386,0.2085144,0.03034225,0.2500646,0.009051,0.1917701,0.04678546,0.244418,0.010794


In [25]:
col_fig_parallel = []
for p in features:
    if p not in mappings:
        col_fig_parallel.append(p)
    else:
        df[f"{p}_cat"] = df[p].apply(lambda x: mappings[p][x])
        col_fig_parallel.append(f"{p}_cat")

for k, v in mappings.items():
    print(f"{k}\t{v}")

metric_for_fig = "f1_macro_train"
fig = px.parallel_coordinates(df, color=metric_for_fig,
                              range_color=[np.min(df[metric_for_fig].values), np.max(df[metric_for_fig].values)],
                              dimensions=col_fig_parallel+[metric_for_fig],
                              color_continuous_scale=px.colors.diverging.Tealrose,
                              color_continuous_midpoint=2)
fig.show()

criterion	{'gini': 0, 'entropy': 1, 'log_loss': 2}
splitter	{'best': 0, 'random': 1}
max_features	{'sqrt': 0, 'log2': 1, '0.5': 2, '0.7': 3}


In [52]:
# 'criterion', 'splitter', 'max_depth', 
# 'min_samples_split', 'min_samples_leaf', 'max_features']
metric_for_fig = "f1_macro_train"
dim = "criterion_cat"
curr_df = df
curr_df = df[(df.splitter == "best") & (df.max_depth == 15)]
fig = px.parallel_coordinates(curr_df, color=metric_for_fig,
                              range_color=[np.min(curr_df[metric_for_fig].values), np.max(curr_df[metric_for_fig].values)],
                              dimensions=[dim]+[metric_for_fig],
                              color_continuous_scale=px.colors.diverging.Tealrose,
                              color_continuous_midpoint=2)
fig.show()

## Final Hypotheses

In [61]:
df = pd.read_csv("classification/final/h_regular_es_d.csv", index_col=0)
df.head(3)

Unnamed: 0,giv_prop,iv,iv_label,cat_t1,cat_t1_label,iv.1,iv_label.1,cat_t2,cat_t2_label,dependent,...,td,pred_num,pred_readable,comparative,pred_true,score_0,score_1,score_2,max_score,hypothesis
0,https://data.cooperationdatabank.org/vocab/pro...,https://data.cooperationdatabank.org/vocab/pro...,SVO type,https://data.cooperationdatabank.org/id/svotyp...,Prosocial,https://data.cooperationdatabank.org/vocab/pro...,SVO type,https://data.cooperationdatabank.org/id/svotyp...,Proself,https://data.cooperationdatabank.org/id/depend...,...,train,2.0,positive,higher,2.0,0.035714,0.428571,0.535714,0.535714,Contributions are significantly higher when sv...
1,https://data.cooperationdatabank.org/vocab/pro...,https://data.cooperationdatabank.org/vocab/pro...,Power level,https://data.cooperationdatabank.org/id/powerl...,High,https://data.cooperationdatabank.org/vocab/pro...,Power level,https://data.cooperationdatabank.org/id/powerl...,Low,https://data.cooperationdatabank.org/id/depend...,...,train,2.0,positive,higher,0.0,0.277778,0.111111,0.611111,0.611111,Contributions are significantly higher when po...
2,https://data.cooperationdatabank.org/vocab/pro...,https://data.cooperationdatabank.org/vocab/pro...,Power level,https://data.cooperationdatabank.org/id/powerl...,High,https://data.cooperationdatabank.org/vocab/pro...,Power level,https://data.cooperationdatabank.org/id/powerl...,Low,https://data.cooperationdatabank.org/id/depend...,...,train,2.0,positive,higher,1.0,0.277778,0.111111,0.611111,0.611111,Contributions are significantly higher when po...


In [62]:
df.groupby("pred_num").agg({"hypothesis": "count"})

Unnamed: 0_level_0,hypothesis
pred_num,Unnamed: 1_level_1
0.0,1458
1.0,1167
2.0,1048


In [63]:
df.groupby("pred_true").agg({"hypothesis": "count"})

Unnamed: 0_level_0,hypothesis
pred_true,Unnamed: 1_level_1
0.0,1128
1.0,1335
2.0,1210


In [70]:
df[(df.giv_prop == 'https://data.cooperationdatabank.org/vocab/prop/Game_TypeVariable') & (df.td == "test") & (df.pred_num.isin([0, 2]))].sort_values(by="max_score", ascending=False)

Unnamed: 0,giv_prop,iv,iv_label,cat_t1,cat_t1_label,iv.1,iv_label.1,cat_t2,cat_t2_label,dependent,...,td,pred_num,pred_readable,comparative,pred_true,score_0,score_1,score_2,max_score,hypothesis
698,https://data.cooperationdatabank.org/vocab/pro...,https://data.cooperationdatabank.org/vocab/pro...,Game type,https://data.cooperationdatabank.org/id/gamety...,Continuous Public Goods Game,https://data.cooperationdatabank.org/vocab/pro...,Game type,https://data.cooperationdatabank.org/id/gamety...,Resource Dilemma,https://data.cooperationdatabank.org/id/depend...,...,test,0.0,negative,lower,0.0,1.0,0.0,0.0,1.0,Contributions are significantly lower when gam...
1061,https://data.cooperationdatabank.org/vocab/pro...,https://data.cooperationdatabank.org/vocab/pro...,Game type,https://data.cooperationdatabank.org/id/gamety...,Resource Dilemma,https://data.cooperationdatabank.org/vocab/pro...,Game type,https://data.cooperationdatabank.org/id/gamety...,Other Game,https://data.cooperationdatabank.org/id/depend...,...,test,0.0,negative,lower,0.0,1.0,0.0,0.0,1.0,Withdrawals are significantly lower when game ...
2615,https://data.cooperationdatabank.org/vocab/pro...,https://data.cooperationdatabank.org/vocab/pro...,Game type,https://data.cooperationdatabank.org/id/gamety...,Prisoner's Dilemma Game,https://data.cooperationdatabank.org/vocab/pro...,Game type,https://data.cooperationdatabank.org/id/gamety...,Other Game,https://data.cooperationdatabank.org/id/depend...,...,test,2.0,positive,higher,0.0,0.230769,0.076923,0.692308,0.692308,Cooperation are significantly higher when game...
2667,https://data.cooperationdatabank.org/vocab/pro...,https://data.cooperationdatabank.org/vocab/pro...,Game type,https://data.cooperationdatabank.org/id/gamety...,Prisoner's Dilemma Game,https://data.cooperationdatabank.org/vocab/pro...,Game type,https://data.cooperationdatabank.org/id/gamety...,Other Game,https://data.cooperationdatabank.org/id/depend...,...,test,2.0,positive,higher,2.0,0.230769,0.076923,0.692308,0.692308,Cooperation are significantly higher when game...
