In [1]:
import os
import torch as th
import numpy as np
from tqdm import tqdm
from ray import tune
from ray.tune.search.hyperopt import HyperOptSearch
from ray.air.config import RunConfig
from hyperopt import hp

import setproctitle
# Give the kernel process a recognisable title so it can be told apart in process listings.
setproctitle.setproctitle("RelEns@yueling")

In [2]:
# Benchmark whose triples and rank files are loaded below (wn18rr, fb15k_237, nell).
dataset_name = 'wn18rr'

# Base models to ensemble; the position in this list is the index i of weight "w_i".
model_names = [
    'TransE',
    'RotatE',
    'ComplEx',
    'ConvE',
    'compgcn',
    'house',
]

# Directory holding the per-model rank files, and the cache file for the learned weights.
rank_dir = "ranks"
checkpoint_path = f"weights/{dataset_name}_rel_weights.npy"

# Search budget handed to Ray Tune / HyperOpt for each relation.
max_concurrent_trials = 8   # trials allowed to run at the same time
n_samples = 300             # total trials per relation
n_initial_points = 64       # passed to HyperOptSearch as n_initial_points

In [10]:
# Each split is an (N, 3) array with columns: head, relation, tail
print(f"Loading dataset: {dataset_name}")
triplets = {
    split: np.load(f"dataset/{dataset_name}/{split}_triples.npy")
    for split in ('test', 'valid')
}
print(triplets['test'].shape)

# The third column of every triple is used as the index of the correct answer.
pos_idx = {split: triples[:, 2] for split, triples in triplets.items()}

Loading dataset: wn18rr
(6268, 3)


In [12]:
# For each split, one array per base model, in the same order as `model_names`.
# Row i of every array belongs to triple i of that split
# (e.g. (6268, 40943) for the wn18rr test split).
all_ranks = {
    split: [
        np.load(f"{rank_dir}/{dataset_name}/{m}_{split}_ranks.npy")
        for m in model_names
    ]
    for split in ('test', 'valid')
}
print(all_ranks['test'][0].shape)

# Number of base models taking part in the ensemble.
n_model = len(all_ranks['test'])
print(n_model)

(6268, 40943)
6


In [5]:
def eval_model(pos_idx, ranks):
    '''
        Compute ranking metrics for a batch of queries.

        pos_idx: np.ndarray, index of the correct candidate for each row of `ranks`
        ranks: np.ndarray, one row per query; rows are sorted ascending along
               dim 1, so a smaller value places a candidate earlier

        Returns a dict with keys 'mrr', 'mr', 'hits@1', 'hits@3', 'hits@10',
        each rounded to 4 decimals.
    '''
    target = th.from_numpy(pos_idx).unsqueeze(1)
    order = th.argsort(th.from_numpy(ranks), dim=1, descending=False)

    # Second coordinate of every match = 0-based place of the target in its sorted row;
    # add 1 to make it a 1-based position.
    matches = th.nonzero(order == target)
    all_ranking = np.array(matches[:, 1].cpu().numpy() + 1)

    def _mean4(values):
        # mean rounded to 4 decimals, as reported for every metric
        return round(np.mean(values), 4)

    # calculate metrics
    metrics = {
        'mrr': _mean4(1 / all_ranking),
        'mr': _mean4(all_ranking),
    }
    for k in (1, 3, 10):
        metrics[f'hits@{k}'] = _mean4(all_ranking <= k)

    return metrics

def objective(config, data):
    '''
        Evaluate one set of ensemble weights.

        config: dict holding a weight under "w_0", "w_1", ... (one per model)
        data: tuple (sub_pos_idx, sub_ranks); sub_ranks is a sequence with one
              rank array per model

        Returns the full metrics dict produced by eval_model (not only the MRR).
    '''
    sub_pos_idx, sub_ranks = data

    # One weight per entry of sub_ranks, looked up by position.
    weights = [config[f"w_{idx}"] for idx, _ in enumerate(sub_ranks)]

    # Weighted mean across the model axis gives a single blended rank array.
    blended = np.average(sub_ranks, weights=weights, axis=0)

    return eval_model(sub_pos_idx, blended)

In [6]:
# Relation id of every triple (second column), per split.
relations = {split: triplets[split][:, 1] for split in ('test', 'valid')}

# NOTE(review): the relation count is taken from the valid split only; this assumes
# no test triple uses a larger relation id - confirm for each dataset.
num_relation = relations['valid'].max() + 1

# split -> relation_id -> np array of row positions of that relation's triples
rel_indexes = {
    split: {
        relation_id: np.where(relations[split] == relation_id)[0]
        for relation_id in range(num_relation)
    }
    for split in ('valid', 'test')
}

In [9]:
# Uniform weights, used for relations that have no validation triples to tune on.
default_config = {f'w_{i}': 1/n_model for i in range(n_model)}

if checkpoint_path and os.path.exists(checkpoint_path):
    # Reuse previously searched weights instead of repeating the expensive search.
    print("Load existing models")
    rel_weights = np.load(checkpoint_path)
else:
    print("Searching for ensemble weights")
    # Row r holds the weights (w_0 .. w_{n_model-1}) chosen for relation r.
    rel_weights = np.zeros((num_relation, n_model))

    search_space = {f"w_{i}": hp.uniform(f"w_{i}", 0, 1) for i in range(n_model)}

    for rel_id in range(num_relation):
        valid_rows = rel_indexes['valid'][rel_id]
        if len(valid_rows) == 0:
            # default weights (read by key, kept in float64 like the rest of the table)
            rel_weights[rel_id] = [default_config[f"w_{i}"] for i in range(n_model)]
            continue

        subranks = [model_rank[valid_rows] for model_rank in all_ranks['valid']]
        sub_pos_idx = pos_idx['valid'][valid_rows]

        # A fresh searcher per relation: every relation is an independent optimisation
        # problem, so trial history from one relation must not steer the next one.
        hyperopt_search = HyperOptSearch(search_space, metric="mrr", mode="max", n_initial_points=n_initial_points)

        tuner = tune.Tuner(
            tune.with_parameters(objective, data=(sub_pos_idx, subranks)),
            param_space=search_space,
            tune_config=tune.TuneConfig(
                num_samples=n_samples,
                search_alg=hyperopt_search,
                max_concurrent_trials=max_concurrent_trials,
            ),
        )
        results = tuner.fit()

        # Read the weights by key so the result does not depend on the ordering
        # (or on any extra entries) of the returned config dict.
        best_config = results.get_best_result(metric="mrr", mode="max").config
        rel_weights[rel_id] = [best_config[f"w_{i}"] for i in range(n_model)]

    # Persist the result where the load branch above looks for it, so a re-run
    # of the notebook does not repeat the search.
    if checkpoint_path:
        checkpoint_dir = os.path.dirname(checkpoint_path)
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)
        np.save(checkpoint_path, rel_weights)

Load existing models


## Calculate relation performance

In [8]:
print("Evaluating")

mode = 'test'
# Metrics that are aggregated over relations and printed at the end.
reported_metrics = ['mrr', 'hits@1', 'hits@3', 'hits@10']

# One metrics dict per relation; stays all-zero for relations without `mode` triples.
rel_res = {
    'rel_ensemble': [
        dict.fromkeys(['mrr', 'mr', 'hits@1', 'hits@3', 'hits@10'], 0)
        for _ in range(num_relation)
    ]
}

for rel_id in tqdm(range(num_relation)):
    rows = rel_indexes[mode][rel_id]
    if len(rows) > 0:
        sub_ranks = [model_rank[rows] for model_rank in all_ranks[mode]]
        sub_pos_idx = pos_idx[mode][rows]

        # Re-use the tuning objective with this relation's stored weights.
        config = {f"w_{i}": rel_weights[rel_id][i] for i in range(n_model)}
        rel_res['rel_ensemble'][rel_id] = objective(config, (sub_pos_idx, sub_ranks))

# Sum of per-relation metrics, each weighted by that relation's triple count.
final_res = {
    'rel_ensemble': {name: 0 for name in reported_metrics}
}

for rel_id, per_relation in enumerate(rel_res['rel_ensemble']):
    n_rows = len(rel_indexes[mode][rel_id])
    for name in reported_metrics:
        final_res['rel_ensemble'][name] += per_relation[name] * n_rows

print("rel_ensemble")
n_triples = len(relations[mode])
for k, v in final_res['rel_ensemble'].items():
    # dividing by the total triple count turns the weighted sum into a weighted mean
    print(f"{k}: {v / n_triples:.4f}")

Evaluating


100%|██████████| 22/22 [00:30<00:00,  1.41s/it]

rel_ensemble
mrr: 0.5201
hits@1: 0.4770
hits@3: 0.5375
hits@10: 0.6039



