In [57]:
from scipy.stats import pearsonr, spearmanr, kendalltau
import plotly.express as px
import pandas as pd
import numpy as np

In [67]:
# Load the hyper-parameter sweep results (first CSV column is the row index)
# and shorten the MRR column name.
df = pd.read_csv("results_kg_embedding_hp.csv", index_col=0).rename(
    columns={"mean_reciprocal_rank": "mrr"}
)

# Columns whose name starts with "hits" or "mrr" are evaluation metrics;
# every other column is treated as a hyper-parameter.
metrics = [x for x in df.columns if x.startswith(("hits", "mrr"))]
params = [x for x in df.columns if not x.startswith(("hits", "mrr"))]

print(f"{len(df)} experiments")
for col in params:
    # Distinct values explored for this hyper-parameter.
    print(f"Param `{col}`: {sorted(df[col].unique())}")
for m in metrics:
    # min / mean / max of the metric over all experiments, rounded to 3 decimals.
    lo, avg, hi = (round(stat(df[m].values), 3) for stat in (np.min, np.mean, np.max))
    print(f"Metric `{m}`: min/mean/max\t {lo}/{avg}/{hi}")
df.head(3)

624 experiments
Param `embedding_dim`: [16, 64, 128, 256, 512]
Param `epochs`: [100, 200, 300, 400, 500]
Param `lr`: [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1]
Param `model`: ['distmult', 'rgcn']
Param `num_negs_per_pos`: [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
Metric `hits@1`: min/mean/max	 0.0/0.085/0.244
Metric `hits@3`: min/mean/max	 0.0/0.165/0.365
Metric `hits@10`: min/mean/max	 0.0/0.268/0.486
Metric `mrr`: min/mean/max	 0.001/0.15/0.323


Unnamed: 0,embedding_dim,epochs,lr,model,num_negs_per_pos,hits@1,hits@3,hits@10,mrr
0,16,100,0.01,rgcn,50,0.09009,0.132883,0.225225,0.139202
1,16,200,0.01,rgcn,50,0.066441,0.13964,0.272523,0.137036
2,64,100,0.001,rgcn,30,0.105856,0.148649,0.230856,0.15194


In [53]:
# Dispatch table: correlation method name -> scipy.stats function computing it.
method_to_func = {
    "pearson": pearsonr,
    "kendall": kendalltau,
    "spearman": spearmanr,
}

def build_cols(metrics):
    """Return the result-table column names for the given metrics.

    Each metric contributes two consecutive names, `<metric>_corr` followed by
    `<metric>_pval`, in the same order as `metrics`.
    """
    return [f"{name}_{suffix}" for name in metrics for suffix in ("corr", "pval")]

def get_correlations(df, method, params, metrics):
    """Correlate every hyper-parameter column with every metric column.

    Args:
        df: DataFrame containing the columns named in `params` and `metrics`.
        method: Key of `method_to_func` selecting the correlation function.
        params: Column names used as the first argument of the correlation.
        metrics: Column names each param is correlated against.

    Returns:
        A `(corrs, mappings)` tuple. `corrs` is a DataFrame indexed by `params`
        with the columns produced by `build_cols(metrics)`; for each metric the
        values yielded by the correlation function's result fill the
        `<metric>_corr` / `<metric>_pval` pair. `mappings` maps every
        non-numeric param to the `{value: integer code}` dict used to encode it.

    Raises:
        KeyError: if `method` is not a key of `method_to_func`.

    Note:
        Non-numeric params are replaced by integer codes assigned in order of
        first appearance in `df`. The sign of the resulting coefficient
        therefore depends on row order, and for a param with more than two
        distinct values the codes impose an arbitrary ordering.
    """
    # Resolve the correlation function once; an unknown method fails up front.
    corr_func = method_to_func[method]
    # The metric vectors are identical for every param, so build them once.
    metric_values = {m: list(df[m].values) for m in metrics}

    data, mappings = [], {}
    for p in params:
        series = df[p]
        # Decide on the column dtype rather than on the first value only, so an
        # empty column or a leading missing value cannot misclassify the param.
        if pd.api.types.is_numeric_dtype(series):
            x = list(series.values)
        else:
            encoding = {val: code for code, val in enumerate(series.unique())}
            mappings[p] = encoding
            x = [encoding[val] for val in series.values]

        row = []
        for m in metrics:
            row.extend(corr_func(x, metric_values[m]))
        data.append(row)
    return pd.DataFrame(data, columns=build_cols(metrics), index=params), mappings

In [68]:
# Pearson correlation of each hyper-parameter with each metric; `mappings`
# holds the integer codes assigned to non-numeric params.
corrs, mappings = get_correlations(
    df=df,
    method="pearson",
    params=params,
    metrics=metrics,
)
print(mappings)
corrs

{'model': {'rgcn': 0, 'distmult': 1}}


Unnamed: 0,hits@1_corr,hits@1_pval,hits@3_corr,hits@3_pval,hits@10_corr,hits@10_pval,mrr_corr,mrr_pval
embedding_dim,0.054142,0.1767802,0.212676,8.164157e-08,0.328533,3.5988850000000004e-17,0.192045,1.346758e-06
epochs,0.053822,0.1793511,0.203019,3.144197e-07,0.316911,5.040022e-16,0.191085,1.523624e-06
lr,-0.488082,1.1546009999999999e-38,-0.497652,2.349835e-40,-0.466542,4.778901e-35,-0.514173,2.099746e-43
model,0.397245,5.078201e-25,0.43608,2.363738e-30,0.324711,8.682504e-17,0.393438,1.554849e-24
num_negs_per_pos,0.011416,0.7759461,0.067237,0.09332305,0.093658,0.01928194,0.05862,0.1435609


Param Notes
* `embedding_dim`: higher is better (hits@10~0.23 | mrr~0.14)
* `epochs`: higher is better (hits@10~0.20 | mrr~0.09)
* `lr`: lower is better (hits@10~-0.35 | mrr~-0.39)
* `model`: distmult is better (hits@10~0.32 | mrr~0.38)
* `num_negs_per_pos`: no strong correlations