# utils

In [3]:
import json
import numpy as np
import random
from tqdm.auto import tqdm
import itertools
import os
from copy import deepcopy
import matplotlib.pyplot as plt

def build_dicts(entities):
    """Build index <-> name lookup tables for a collection of names.

    Duplicates are ignored: each distinct name gets the index of its first
    occurrence among the distinct names seen so far.

    Args:
        entities: iterable of hashable names (e.g. "<e_12>").

    Returns:
        (ind2entity, entity2ind) where ``ind2entity`` is the list of distinct
        names in first-seen order and ``entity2ind`` maps each name back to
        its position in that list.
    """
    entity2ind = dict()
    ind2entity = []
    for entity in entities:
        # Membership is tested on the dict (O(1)) rather than on the list
        # (O(n)), which keeps the whole build linear in len(entities).
        if entity not in entity2ind:
            entity2ind[entity] = len(ind2entity)
            ind2entity.append(entity)
    return ind2entity, entity2ind

def choose(arr, ratio_or_count):
    """Randomly pick a subset of ``arr`` without replacement.

    Args:
        arr: sequence to sample from.
        ratio_or_count: a float is ALWAYS interpreted as a ratio of
            ``len(arr)`` (even when it is larger than 1); an integer is
            interpreted as an absolute number of items. Callers that compute
            a count with float arithmetic must round it to an int first.

    Returns:
        ``arr`` itself (not a copy) when the requested size is at least
        ``len(arr)``; otherwise a new list of the sampled items, in the
        random order drawn by ``np.random.choice``.

    Raises:
        TypeError: if ``ratio_or_count`` is neither a float nor an integer.
    """
    # bool is a subclass of int; reject it explicitly rather than silently
    # treating True/False as the counts 1/0.
    if isinstance(ratio_or_count, bool):
        raise TypeError("ratio_or_count must be a float ratio or an int count, got bool")
    if isinstance(ratio_or_count, float):  # also matches np.float64
        num = int(round(float(ratio_or_count) * len(arr)))
    elif isinstance(ratio_or_count, (int, np.integer)):
        num = int(ratio_or_count)
    else:
        # An explicit raise survives `python -O`, unlike `assert False`.
        raise TypeError(
            "ratio_or_count must be a float ratio or an int count, got {}".format(
                type(ratio_or_count).__name__
            )
        )
    if num >= len(arr):
        return arr
    rand_inds = np.random.choice(len(arr), num, replace=False).tolist()
    return [arr[i] for i in rand_inds]
    
def split(arr, ratio_or_count):
    """Randomly partition ``arr`` into a selected part and the remainder.

    Args:
        arr: sequence to partition.
        ratio_or_count: a float is interpreted as a ratio of ``len(arr)``;
            an integer as the absolute number of items to select.

    Returns:
        ``[selected, rest]``: two lists that together contain every item of
        ``arr`` exactly once, each keeping the original relative order.

    Raises:
        TypeError: if ``ratio_or_count`` is neither a float nor an integer.
    """
    if isinstance(ratio_or_count, bool):
        raise TypeError("ratio_or_count must be a float ratio or an int count, got bool")
    if isinstance(ratio_or_count, float):  # also matches np.float64
        num = int(round(float(ratio_or_count) * len(arr)))
    elif isinstance(ratio_or_count, (int, np.integer)):
        num = int(ratio_or_count)
    else:
        # An explicit raise survives `python -O`, unlike `assert False`.
        raise TypeError(
            "ratio_or_count must be a float ratio or an int count, got {}".format(
                type(ratio_or_count).__name__
            )
        )
    # A set makes each membership test O(1); with a list the partition below
    # would cost O(len(arr) * num). The pass is now linear and fast enough
    # that no progress bar is needed.
    selected_inds = set(np.random.choice(len(arr), num, replace=False).tolist())
    train = [item for i, item in enumerate(arr) if i in selected_inds]
    test = [item for i, item in enumerate(arr) if i not in selected_inds]
    return [train, test]

def form_items(c, t):
    """
    Turn a query (sequence of tokens ``c``) and its answer token ``t`` into
    one training/evaluation example.

    return format:
        {
            "input_text": "<e_736><r_120>", 
            "target_text": "<e_736><r_120><e_1544></a>",
        }
    """
    prompt = "".join(c)
    # The target repeats the prompt, then the answer, then the "</a>" end marker.
    return {
        "input_text": prompt,
        "target_text": prompt + t + "</a>",
    }

# Base Configuration

In [22]:
def build_base_dataset(num_entities, num_relations, out_degree=20):
    """Generate a random knowledge graph and its 2-hop compositions.

    Every entity gets ``out_degree`` outgoing atomic facts (h, r, t): the
    relations are distinct per head entity, the tail is a uniformly random
    entity (possibly the head itself). 5% of the atomic facts are marked
    OOD, the rest ID. Each 2-hop path (ent -r1-> b -r2-> t) is then bucketed
    by whether its first / second hop is ID ("i") or OOD ("o"); II paths are
    further split at random into train (~95%) and test (~5%).

    Args:
        num_entities: number of entity tokens "<e_i>".
        num_relations: number of relation tokens "<r_i>"; must be at least
            ``out_degree`` because relations are drawn without replacement.
        out_degree: number of outgoing facts per entity.

    Returns:
        Tuple ``(entities, relations, id_atomic_facts, ood_atomic_facts,
        train_2hop_ii, test_2hop_ii, test_2hop_io, test_2hop_oi,
        test_2hop_oo)``. The first two are token lists; all others are lists
        of ``form_items`` dicts.
    """
    entities = ["<e_{}>".format(i) for i in range(num_entities)]
    ind2entity, _ = build_dicts(entities)

    relations = ["<r_{}>".format(i) for i in range(num_relations)]
    ind2relation, _ = build_dicts(relations)

    # {h1: [(r1, t1), (r2, t2), ...], ...}; pre-filled so that an entity
    # without outgoing facts (out_degree=0) yields no paths instead of KeyError.
    atomic_dict = {entity: [] for entity in entities}
    atomics = []   # [(h1,r1,t1), (h2,r2,t2), ...]

    for i in tqdm(range(num_entities)):
        # for each subject entity, randomly select some outgoing relations to some random object entity
        selected_rows = np.random.choice(num_relations, size=out_degree, replace=False).tolist()
        for row_idx in selected_rows:
            col_idx = np.random.randint(num_entities)  # pick some random tail entity for each selected (h,r)
            h, r, t = ind2entity[i], ind2relation[row_idx], ind2entity[col_idx]  # h and t might be same here
            atomics.append((h, r, t))
            atomic_dict[h].append((r, t))

    # split ID/OOD
    OOD_ratio = 0.05
    OOD_facts, ID_facts = split(atomics, round(len(atomics) * OOD_ratio))  # randomly
    OOD_facts, ID_facts = set(OOD_facts), set(ID_facts)

    id_atomic_facts = [form_items([h, r], t) for (h, r, t) in ID_facts]
    ood_atomic_facts = [form_items([h, r], t) for (h, r, t) in OOD_facts]

    train_2hop_ii, test_2hop_ii, test_2hop_io, test_2hop_oi, test_2hop_oo = [], [], [], [], []

    # Every atomic fact is in exactly one of ID_facts / OOD_facts, so
    # "not in ID_facts" is equivalent to "in OOD_facts" for the hops below.
    for ent in tqdm(entities, desc="2-hop: "):
        for (r1, b) in atomic_dict[ent]:
            first_in_id = (ent, r1, b) in ID_facts
            for (r2, t) in atomic_dict[b]:
                second_in_id = (b, r2, t) in ID_facts
                item = form_items([ent, r1, r2], t)
                if first_in_id and second_in_id:
                    # Only II paths consume a random draw for the train/test split.
                    if np.random.uniform() > 0.05:
                        train_2hop_ii.append(item)
                    else:
                        test_2hop_ii.append(item)
                elif first_in_id:
                    test_2hop_io.append(item)
                elif second_in_id:
                    test_2hop_oi.append(item)
                else:
                    test_2hop_oo.append(item)

    return (
        entities, relations,  # vocab
        id_atomic_facts, ood_atomic_facts,
        
        # 2-hop
        train_2hop_ii,  # train
        test_2hop_ii, test_2hop_io, test_2hop_oi, test_2hop_oo,  # test
    )

In [None]:
# Graph size for the base configuration.
NUM_ENTITY_IN, NUM_RELATION = 2000, 200

# Generate the graph and unpack the 9-tuple returned by build_base_dataset.
(
    entities, relations,                                      # vocab
    id_atomic_facts, ood_atomic_facts,                        # atomic facts
    train_2hop_ii,                                            # 2-hop train
    test_2hop_ii, test_2hop_io, test_2hop_oi, test_2hop_oo,   # 2-hop test
) = build_base_dataset(num_entities=NUM_ENTITY_IN, num_relations=NUM_RELATION)

In [17]:
# Vocabulary: entity tokens, relation tokens, then the special tokens.
# actually only "</a>" used as <eos>
vocab = entities + relations + ["<mask>", "<sep>", "<a>", "</a>", "<q>", "</q>"]

# For predict_during_training: cut test group
test_size = 3000
# atomic
test_id_atomic = choose(id_atomic_facts, test_size)
test_ood_atomic = choose(ood_atomic_facts, test_size)
# 2-hop
test_2hop_ii = choose(test_2hop_ii, test_size)
test_2hop_io = choose(test_2hop_io, test_size)
test_2hop_oi = choose(test_2hop_oi, test_size)
test_2hop_oo = choose(test_2hop_oo, test_size)

train_atomics = id_atomic_facts + ood_atomic_facts

phi = 7.2  # Train-II / ID Triples
# choose() treats every float as a *ratio* of the pool, so the float product
# phi * len(...) would select the whole pool and phi would have no effect.
# Round it to an int so it is interpreted as an absolute count.
train_2hop_ii = choose(train_2hop_ii, round(phi * len(id_atomic_facts)))

dataset_name = "base_configuration.{}.{}.{}".format(NUM_ENTITY_IN, NUM_RELATION, phi)
os.makedirs("../data/{}".format(dataset_name), exist_ok=True)

# Probe set: each group is tagged with a "type" label; order is preserved.
probe_groups = [
    ("ID Triples", test_id_atomic),
    ("OOD Triples", test_ood_atomic),
    ("Train-II", choose(train_2hop_ii, test_size)),
    ("Test-II", test_2hop_ii),
    ("Test-IO", test_2hop_io),
    ("Test-OI", test_2hop_oi),
    ("Test-OO", test_2hop_oo),
]
probes = []
for probe_type, items in probe_groups:
    for item in items:
        probe = deepcopy(item)  # do not add "type" to the shared train/test dicts
        probe["type"] = probe_type
        probes.append(probe)

outputs = [
    ("train.json", train_atomics + train_2hop_ii),
    ("valid.json", test_2hop_oo),  # evaluate_during_training
    ("test.json", probes),         # predict_during_training
    ("vocab.json", vocab),         # add vocab
]
for filename, payload in outputs:
    with open("../data/{}/{}".format(dataset_name, filename), "w", encoding='utf-8') as f:
        json.dump(payload, f)

# 3-hop

In [13]:
def build_3_hop(num_entities, num_relations, out_degree=10):
    """Generate a random knowledge graph and its 3-hop compositions.

    Every entity gets ``out_degree`` outgoing atomic facts (h, r, t) with
    distinct relations and uniformly random tails. 20% of the atomic facts
    are marked OOD, the rest ID. Each 3-hop path is bucketed by the ID ("i")
    / OOD ("o") status of its three hops; III paths are further split at
    random into train (~95%) and test (~5%).

    Args:
        num_entities: number of entity tokens "<e_i>".
        num_relations: number of relation tokens "<r_i>"; must be at least
            ``out_degree`` because relations are drawn without replacement.
        out_degree: number of outgoing facts per entity.

    Returns:
        Tuple ``(entities, relations, id_atomic_facts, ood_atomic_facts,
        train_3hop_iii, test_3hop_iii, test_3hop_iio, test_3hop_ioi,
        test_3hop_ioo, test_3hop_oii, test_3hop_oio, test_3hop_ooi,
        test_3hop_ooo)``. The first two are token lists; all others are lists
        of ``form_items`` dicts.
    """
    entities = ["<e_{}>".format(i) for i in range(num_entities)]
    ind2entity, _ = build_dicts(entities)

    relations = ["<r_{}>".format(i) for i in range(num_relations)]
    ind2relation, _ = build_dicts(relations)

    # {h1: [(r1, t1), (r2, t2), ...], ...}; pre-filled so that an entity
    # without outgoing facts (out_degree=0) yields no paths instead of KeyError.
    atomic_dict = {entity: [] for entity in entities}
    atomics = []   # [(h1,r1,t1), (h2,r2,t2), ...]

    for i in tqdm(range(num_entities)):
        # for each subject entity, randomly select some outgoing relations to some random object entity
        selected_rows = np.random.choice(num_relations, size=out_degree, replace=False).tolist()
        for row_idx in selected_rows:
            col_idx = np.random.randint(num_entities)  # pick some random tail entity for each selected (h,r)
            h, r, t = ind2entity[i], ind2relation[row_idx], ind2entity[col_idx]  # h and t might be same here
            atomics.append((h, r, t))
            atomic_dict[h].append((r, t))

    # split ID/OOD
    OOD_ratio = 0.2  # This ratio can't be too low in 3-hop dataset
    OOD_facts, ID_facts = split(atomics, round(len(atomics) * OOD_ratio))  # randomly
    OOD_facts, ID_facts = set(OOD_facts), set(ID_facts)

    id_atomic_facts = [form_items([h, r], t) for (h, r, t) in ID_facts]
    ood_atomic_facts = [form_items([h, r], t) for (h, r, t) in OOD_facts]

    # Test buckets keyed by the i/o pattern of the three hops.
    # Expected sizes for 1000 entities, out_degree 10 (1,000,000 paths):
    #   train iii 486400, test iii 25600,
    #   iio / ioi / oii 128000 each, ioo / oio / ooi 32000 each, ooo 8000.
    train_3hop_iii = []
    test_buckets = {
        pattern: []
        for pattern in ("iii", "iio", "ioi", "ioo", "oii", "oio", "ooi", "ooo")
    }

    # Every atomic fact is in exactly one of ID_facts / OOD_facts, so a hop
    # that is not in ID_facts is an OOD hop. Each hop is classified once, at
    # the loop level where it becomes known.
    for ent in tqdm(entities, desc="3-hop: "):
        for (r1, b1) in atomic_dict[ent]:
            tag1 = "i" if (ent, r1, b1) in ID_facts else "o"
            for (r2, b2) in atomic_dict[b1]:
                tag12 = tag1 + ("i" if (b1, r2, b2) in ID_facts else "o")
                for (r3, t) in atomic_dict[b2]:
                    pattern = tag12 + ("i" if (b2, r3, t) in ID_facts else "o")
                    item = form_items([ent, r1, r2, r3], t)
                    # Short-circuit: only III paths consume a random draw.
                    if pattern == "iii" and np.random.uniform() > 0.05:
                        train_3hop_iii.append(item)
                    else:
                        test_buckets[pattern].append(item)

    return (
        entities, relations,  # vocab
        id_atomic_facts, ood_atomic_facts,

        # 3-hop
        train_3hop_iii,  # train
        test_buckets["iii"], test_buckets["iio"], test_buckets["ioi"], test_buckets["ioo"],
        test_buckets["oii"], test_buckets["oio"], test_buckets["ooi"], test_buckets["ooo"],  # test
    )

In [None]:
# Graph size for the 3-hop configuration.
NUM_ENTITY_IN, NUM_RELATION = 1000, 100

# Generate the graph and unpack the 13-tuple returned by build_3_hop.
(
    entities, relations,                                          # vocab
    id_atomic_facts, ood_atomic_facts,                            # atomic facts
    train_3hop_iii,                                               # 3-hop train
    test_3hop_iii, test_3hop_iio, test_3hop_ioi, test_3hop_ioo,   # 3-hop test, first hop ID
    test_3hop_oii, test_3hop_oio, test_3hop_ooi, test_3hop_ooo,   # 3-hop test, first hop OOD
) = build_3_hop(num_entities=NUM_ENTITY_IN, num_relations=NUM_RELATION)

In [15]:
# Vocabulary: entity tokens, relation tokens, then the special tokens
# (only "</a>" is actually used, as <eos>).
vocab = entities + relations + ["<mask>", "<sep>", "<a>", "</a>", "<q>", "</q>"]

print("vocab size:", len(vocab))

# For predict_during_training: cap every probe group at test_size items.
test_size = 1000
# atomic
test_id_atomic = choose(id_atomic_facts, test_size)
test_ood_atomic = choose(ood_atomic_facts, test_size)

# 3-hop
test_3hop_iii = choose(test_3hop_iii, test_size)
test_3hop_iio = choose(test_3hop_iio, test_size)
test_3hop_ioi = choose(test_3hop_ioi, test_size)
test_3hop_ioo = choose(test_3hop_ioo, test_size)
test_3hop_oii = choose(test_3hop_oii, test_size)
test_3hop_oio = choose(test_3hop_oio, test_size)
test_3hop_ooi = choose(test_3hop_ooi, test_size)
test_3hop_ooo = choose(test_3hop_ooo, test_size)

train_atomics = id_atomic_facts + ood_atomic_facts

# Number of Train-III items kept = phi * number of atomic triples in train_atomics.
phi = 12
dataset_name = "3hop.{}.{}.{}".format(NUM_ENTITY_IN, NUM_RELATION, phi)
os.makedirs("../data/{}".format(dataset_name), exist_ok=True)

train_3hop_iii_cut = choose(train_3hop_iii, round(phi * len(train_atomics)))

# Probe set: each group is tagged with a "type" label; order is preserved.
probe_groups = [
    ("ID Triples", test_id_atomic),
    ("OOD Triples", test_ood_atomic),
    ("Train-III", choose(train_3hop_iii_cut, test_size)),
    ("Test-III", test_3hop_iii),
    ("Test-IIO", test_3hop_iio),
    ("Test-IOI", test_3hop_ioi),
    ("Test-IOO", test_3hop_ioo),
    ("Test-OII", test_3hop_oii),
    ("Test-OIO", test_3hop_oio),
    ("Test-OOI", test_3hop_ooi),
    ("Test-OOO", test_3hop_ooo),
]
probes = []
for probe_type, items in probe_groups:
    for item in items:
        probe = deepcopy(item)  # keep the original example dicts free of "type"
        probe["type"] = probe_type
        probes.append(probe)

outputs = [
    ("train.json", train_atomics + train_3hop_iii_cut),
    ("valid.json", test_3hop_ooo),  # evaluate_during_training
    ("test.json", probes),          # predict_during_training
    ("vocab.json", vocab),          # add vocab
]
for filename, payload in outputs:
    with open("../data/{}/{}".format(dataset_name, filename), "w", encoding='utf-8') as f:
        json.dump(payload, f)

vocab size: 1106


# Base Configuration with Second-Hop Ablation

In [10]:
def build_second_hop_ablation(num_entities, num_relations, out_degree=20):
    """Generate a 2-hop dataset where some ID facts never serve as second hop in training.

    Atomic facts are generated as in the base configuration and 5% are marked
    OOD. A further 5% of the ID facts are marked "second-hop restricted":
    every 2-hop path whose second hop is such a fact goes to a dedicated test
    set. The remaining ID/ID paths are split at random into train (~99.5%)
    and test (~0.5%). Paths that touch an OOD fact are discarded.

    Args:
        num_entities: number of entity tokens "<e_i>".
        num_relations: number of relation tokens "<r_i>"; must be at least
            ``out_degree`` because relations are drawn without replacement.
        out_degree: number of outgoing facts per entity.

    Returns:
        Tuple ``(entities, relations, id_atomic_facts, ood_atomic_facts,
        train_2hop_ii, test_2hop_ii, test_2hop_second_hop_restricted)``.
        ``id_atomic_facts`` contains all ID facts, restricted ones included.
    """
    entities = ["<e_{}>".format(i) for i in range(num_entities)]
    ind2entity, _ = build_dicts(entities)

    relations = ["<r_{}>".format(i) for i in range(num_relations)]
    ind2relation, _ = build_dicts(relations)

    # {h1: [(r1, t1), (r2, t2), ...], ...}; pre-filled so that an entity
    # without outgoing facts (out_degree=0) yields no paths instead of KeyError.
    atomic_dict = {entity: [] for entity in entities}
    atomics = []   # [(h1,r1,t1), (h2,r2,t2), ...]

    for i in tqdm(range(num_entities)):
        # for each subject entity, randomly select some outgoing relations to some random object entity
        selected_rows = np.random.choice(num_relations, size=out_degree, replace=False).tolist()
        for row_idx in selected_rows:
            col_idx = np.random.randint(num_entities)  # pick some random tail entity for each selected (h,r)
            h, r, t = ind2entity[i], ind2relation[row_idx], ind2entity[col_idx]  # h and t might be same here
            atomics.append((h, r, t))
            atomic_dict[h].append((r, t))

    # split ID/OOD
    OOD_ratio = 0.05
    OOD_facts, ID_facts = split(atomics, round(len(atomics) * OOD_ratio))  # randomly
    OOD_facts = set(OOD_facts)

    # Carve the restricted subset out of the ID facts (still a list here,
    # because split() needs positional indexing).
    second_hop_restricted_ratio = 0.05
    second_hop_restricted_id, _ = split(ID_facts, round(len(ID_facts) * second_hop_restricted_ratio))
    second_hop_restricted_id = set(second_hop_restricted_id)
    ID_facts = set(ID_facts)

    id_atomic_facts = [form_items([h, r], t) for (h, r, t) in ID_facts]
    ood_atomic_facts = [form_items([h, r], t) for (h, r, t) in OOD_facts]

    # We don't involve OOD here
    train_2hop_ii, test_2hop_ii, test_2hop_second_hop_restricted = [], [], []

    for ent in tqdm(entities, desc="2-hop: "):
        for (r1, b) in atomic_dict[ent]:
            if (ent, r1, b) in OOD_facts:
                continue  # the first hop alone already disqualifies every path through it
            for (r2, t) in atomic_dict[b]:
                if (b, r2, t) in OOD_facts:
                    continue

                # The first hop may be restricted or normal; only the second hop decides.
                item = form_items([ent, r1, r2], t)
                if (b, r2, t) in second_hop_restricted_id:
                    test_2hop_second_hop_restricted.append(item)
                elif np.random.uniform() > 0.005:
                    train_2hop_ii.append(item)
                else:
                    test_2hop_ii.append(item)

    return (
        entities, relations,  # vocab
        id_atomic_facts, ood_atomic_facts,
        
        # 2-hop
        train_2hop_ii,  # train
        test_2hop_ii, test_2hop_second_hop_restricted,  # test
    )

In [None]:
# Graph size for the second-hop ablation configuration.
NUM_ENTITY_IN, NUM_RELATION = 2000, 200

# Generate the graph and unpack the 7-tuple returned by build_second_hop_ablation.
(
    entities, relations,                              # vocab
    id_atomic_facts, ood_atomic_facts,                # atomic facts
    train_2hop_ii,                                    # 2-hop train
    test_2hop_ii, test_2hop_second_hop_restricted,    # 2-hop test
) = build_second_hop_ablation(num_entities=NUM_ENTITY_IN, num_relations=NUM_RELATION)

In [12]:
# Vocabulary: entity tokens, relation tokens, then the special tokens.
# actually only "</a>" used as <eos>
vocab = entities + relations + ["<mask>", "<sep>", "<a>", "</a>", "<q>", "</q>"]

# For predict_during_training: cut test group
test_size = 3000
# atomic
test_id_atomic = choose(id_atomic_facts, test_size)
test_ood_atomic = choose(ood_atomic_facts, test_size)
# 2-hop
test_2hop_ii = choose(test_2hop_ii, test_size)
test_2hop_second_hop_restricted = choose(test_2hop_second_hop_restricted, test_size)

train_atomics = id_atomic_facts + ood_atomic_facts

phi = 7.2  # Train-II / ID Triples
# choose() treats every float as a *ratio* of the pool, so the float product
# phi * len(...) would select the whole pool and phi would have no effect.
# Round it to an int so it is interpreted as an absolute count.
train_2hop_ii = choose(train_2hop_ii, round(phi * len(id_atomic_facts)))

dataset_name = "second_hop_ablation_configuration.{}.{}.{}".format(NUM_ENTITY_IN, NUM_RELATION, phi)
os.makedirs("../data/{}".format(dataset_name), exist_ok=True)

# Probe set: each group is tagged with a "type" label; order is preserved.
probe_groups = [
    ("ID Triples", test_id_atomic),
    ("OOD Triples", test_ood_atomic),
    ("Train-II", choose(train_2hop_ii, test_size)),
    ("Test-II", test_2hop_ii),
    ("Test-II-SR", test_2hop_second_hop_restricted),
]
probes = []
for probe_type, items in probe_groups:
    for item in items:
        probe = deepcopy(item)  # do not add "type" to the shared train/test dicts
        probe["type"] = probe_type
        probes.append(probe)

# This configuration builds no OO test set. The previous code dumped
# `test_2hop_oo`, a leftover global from another cell (or a NameError on a
# fresh kernel). The second-hop-restricted set is used as the validation set
# instead, as the held-out counterpart of OO here.
# NOTE(review): confirm this is the intended validation split.
outputs = [
    ("train.json", train_atomics + train_2hop_ii),
    ("valid.json", test_2hop_second_hop_restricted),  # evaluate_during_training
    ("test.json", probes),                            # predict_during_training
    ("vocab.json", vocab),                            # add vocab
]
for filename, payload in outputs:
    with open("../data/{}/{}".format(dataset_name, filename), "w", encoding='utf-8') as f:
        json.dump(payload, f)

# Unanchored OOD Configuration & Decoding Preference Configuration
&emsp; without ID Triples

In [None]:
# Graph size for the configuration without ID triples.
NUM_ENTITY_IN, NUM_RELATION = 2000, 200

# Reuse build_base_dataset and unpack its 9-tuple.
(
    entities, relations,                                      # vocab
    id_atomic_facts, ood_atomic_facts,                        # atomic facts
    train_2hop_ii,                                            # 2-hop train
    test_2hop_ii, test_2hop_io, test_2hop_oi, test_2hop_oo,   # 2-hop test
) = build_base_dataset(num_entities=NUM_ENTITY_IN, num_relations=NUM_RELATION)

In [12]:
# Vocabulary: entity tokens, relation tokens, then the special tokens.
# actually only "</a>" used as <eos>
vocab = entities + relations + ["<mask>", "<sep>", "<a>", "</a>", "<q>", "</q>"]

# For predict_during_training: cut test group
test_size = 3000
# atomic
test_id_atomic = choose(id_atomic_facts, test_size)
test_ood_atomic = choose(ood_atomic_facts, test_size)
# 2-hop
test_2hop_ii = choose(test_2hop_ii, test_size)
test_2hop_io = choose(test_2hop_io, test_size)
test_2hop_oi = choose(test_2hop_oi, test_size)
test_2hop_oo = choose(test_2hop_oo, test_size)

# ensure that ID triples do not appear in the training set
train_atomics = ood_atomic_facts

phi = 7.2  # Train-II / ID Triples
# choose() treats every float as a *ratio* of the pool, so the float product
# phi * len(...) would select the whole pool and phi would have no effect.
# Round it to an int so it is interpreted as an absolute count.
train_2hop_ii = choose(train_2hop_ii, round(phi * len(id_atomic_facts)))

dataset_name = "without_id.{}.{}.{}".format(NUM_ENTITY_IN, NUM_RELATION, phi)
os.makedirs("../data/{}".format(dataset_name), exist_ok=True)

# Probe set: each group is tagged with a "type" label; order is preserved.
probe_groups = [
    ("ID Triples", test_id_atomic),
    ("OOD Triples", test_ood_atomic),
    ("Train-II", choose(train_2hop_ii, test_size)),
    ("Test-II", test_2hop_ii),
    ("Test-IO", test_2hop_io),
    ("Test-OI", test_2hop_oi),
    ("Test-OO", test_2hop_oo),
]
probes = []
for probe_type, items in probe_groups:
    for item in items:
        probe = deepcopy(item)  # do not add "type" to the shared train/test dicts
        probe["type"] = probe_type
        probes.append(probe)

outputs = [
    ("train.json", train_atomics + train_2hop_ii),
    ("valid.json", test_2hop_oo),  # evaluate_during_training
    ("test.json", probes),         # predict_during_training
    ("vocab.json", vocab),         # add vocab
]
for filename, payload in outputs:
    with open("../data/{}/{}".format(dataset_name, filename), "w", encoding='utf-8') as f:
        json.dump(payload, f)

# Held-out ID Triple Configuration

In [6]:
def build_held_out_id(num_entities, num_relations, out_degree=20):
    """Generate the base 2-hop dataset plus a held-out subset of the ID facts.

    Atomic facts are generated as in the base configuration and 5% are marked
    OOD. 5% of the ID facts are additionally labelled "held-out" and returned
    separately from the "retained" ones. This labelling does not affect the
    2-hop buckets, which use the full ID set. Each 2-hop path is bucketed by
    the ID ("i") / OOD ("o") status of its two hops; II paths are split at
    random into train (~95%) and test (~5%).

    Args:
        num_entities: number of entity tokens "<e_i>".
        num_relations: number of relation tokens "<r_i>"; must be at least
            ``out_degree`` because relations are drawn without replacement.
        out_degree: number of outgoing facts per entity.

    Returns:
        Tuple ``(entities, relations, held_out_id_atomic_facts,
        retained_id_atomic_facts, ood_atomic_facts, train_2hop_ii,
        test_2hop_ii, test_2hop_io, test_2hop_oi, test_2hop_oo)``.
    """
    entities = ["<e_{}>".format(i) for i in range(num_entities)]
    ind2entity, _ = build_dicts(entities)

    relations = ["<r_{}>".format(i) for i in range(num_relations)]
    ind2relation, _ = build_dicts(relations)

    # {h1: [(r1, t1), (r2, t2), ...], ...}; pre-filled so that an entity
    # without outgoing facts (out_degree=0) yields no paths instead of KeyError.
    atomic_dict = {entity: [] for entity in entities}
    atomics = []   # [(h1,r1,t1), (h2,r2,t2), ...]

    for i in tqdm(range(num_entities)):
        # for each subject entity, randomly select some outgoing relations to some random object entity
        selected_rows = np.random.choice(num_relations, size=out_degree, replace=False).tolist()
        for row_idx in selected_rows:
            col_idx = np.random.randint(num_entities)  # pick some random tail entity for each selected (h,r)
            h, r, t = ind2entity[i], ind2relation[row_idx], ind2entity[col_idx]  # h and t might be same here
            atomics.append((h, r, t))
            atomic_dict[h].append((r, t))

    # split ID/OOD
    OOD_ratio = 0.05
    OOD_facts, ID_facts = split(atomics, round(len(atomics) * OOD_ratio))  # randomly
    OOD_facts = set(OOD_facts)

    # Label part of the ID facts as held-out (ID_facts is still a list here,
    # because split() needs positional indexing).
    held_out_ratio = 0.05
    held_out_id, retained_id = split(ID_facts, round(len(ID_facts) * held_out_ratio))
    held_out_id, retained_id = set(held_out_id), set(retained_id)

    ID_facts = set(ID_facts)

    held_out_id_atomic_facts = [form_items([h, r], t) for (h, r, t) in held_out_id]
    retained_id_atomic_facts = [form_items([h, r], t) for (h, r, t) in retained_id]
    ood_atomic_facts = [form_items([h, r], t) for (h, r, t) in OOD_facts]

    train_2hop_ii, test_2hop_ii, test_2hop_io, test_2hop_oi, test_2hop_oo = [], [], [], [], []

    # Every atomic fact is in exactly one of ID_facts / OOD_facts, so
    # "not in ID_facts" is equivalent to "in OOD_facts" for the hops below.
    for ent in tqdm(entities, desc="2-hop: "):
        for (r1, b) in atomic_dict[ent]:
            first_in_id = (ent, r1, b) in ID_facts
            for (r2, t) in atomic_dict[b]:
                second_in_id = (b, r2, t) in ID_facts
                item = form_items([ent, r1, r2], t)
                if first_in_id and second_in_id:
                    # Only II paths consume a random draw for the train/test split.
                    if np.random.uniform() > 0.05:
                        train_2hop_ii.append(item)
                    else:
                        test_2hop_ii.append(item)
                elif first_in_id:
                    test_2hop_io.append(item)
                elif second_in_id:
                    test_2hop_oi.append(item)
                else:
                    test_2hop_oo.append(item)

    return (
        entities, relations,  # vocab
        held_out_id_atomic_facts, retained_id_atomic_facts, ood_atomic_facts,
        
        # 2-hop
        train_2hop_ii,  # train
        test_2hop_ii, test_2hop_io, test_2hop_oi, test_2hop_oo,  # test
    )

In [None]:
# Graph size for the held-out ID triple configuration.
NUM_ENTITY_IN, NUM_RELATION = 2000, 200

# Generate the graph and unpack the 10-tuple returned by build_held_out_id.
(
    entities, relations,                                                    # vocab
    held_out_id_atomic_facts, retained_id_atomic_facts, ood_atomic_facts,   # atomic facts
    train_2hop_ii,                                                          # 2-hop train
    test_2hop_ii, test_2hop_io, test_2hop_oi, test_2hop_oo,                 # 2-hop test
) = build_held_out_id(num_entities=NUM_ENTITY_IN, num_relations=NUM_RELATION)

In [9]:
# Vocabulary: entity tokens, relation tokens, then the special tokens.
# actually only "</a>" used as <eos>
vocab = entities + relations + ["<mask>", "<sep>", "<a>", "</a>", "<q>", "</q>"]

# For predict_during_training: cut test group
test_size = 3000
# atomic
test_held_out_id_atomic = choose(held_out_id_atomic_facts, test_size)
test_retained_id_atomic = choose(retained_id_atomic_facts, test_size)
test_ood_atomic = choose(ood_atomic_facts, test_size)
# 2-hop
test_2hop_ii = choose(test_2hop_ii, test_size)
test_2hop_io = choose(test_2hop_io, test_size)
test_2hop_oi = choose(test_2hop_oi, test_size)
test_2hop_oo = choose(test_2hop_oo, test_size)

# ensure that held_out_id_atomic_facts do not appear in the training set
train_atomics = retained_id_atomic_facts + ood_atomic_facts

phi = 7.2  # Train-II / ID Triples
# choose() treats every float as a *ratio* of the pool, so the float product
# phi * len(...) would select the whole pool and phi would have no effect.
# Round it to an int so it is interpreted as an absolute count. The ID triple
# count covers retained and held-out facts alike.
num_id_triples = len(retained_id_atomic_facts) + len(held_out_id_atomic_facts)
train_2hop_ii = choose(train_2hop_ii, round(phi * num_id_triples))

dataset_name = "held_out_id_configuration.{}.{}.{}".format(NUM_ENTITY_IN, NUM_RELATION, phi)
os.makedirs("../data/{}".format(dataset_name), exist_ok=True)

# Probe set: each group is tagged with a "type" label; order is preserved.
probe_groups = [
    ("Held-out ID Triples", test_held_out_id_atomic),
    ("Retained ID Triples", test_retained_id_atomic),
    ("OOD Triples", test_ood_atomic),
    ("Train-II", choose(train_2hop_ii, test_size)),
    ("Test-II", test_2hop_ii),
    ("Test-IO", test_2hop_io),
    ("Test-OI", test_2hop_oi),
    ("Test-OO", test_2hop_oo),
]
probes = []
for probe_type, items in probe_groups:
    for item in items:
        probe = deepcopy(item)  # do not add "type" to the shared train/test dicts
        probe["type"] = probe_type
        probes.append(probe)

outputs = [
    ("train.json", train_atomics + train_2hop_ii),
    ("valid.json", test_2hop_oo),  # evaluate_during_training
    ("test.json", probes),         # predict_during_training
    ("vocab.json", vocab),         # add vocab
]
for filename, payload in outputs:
    with open("../data/{}/{}".format(dataset_name, filename), "w", encoding='utf-8') as f:
        json.dump(payload, f)