In [None]:
import os 
from collections import defaultdict

from tqdm import tqdm
import numpy as np
import pandas as pd

# Directory holding the tab-separated training files read below.
in_dir = "/home/jupyter/unity_jointly_rec_and_search/datasets/unified_kgc/unified_train/"

# One mapping per file: head id -> list of positive tail ids.
a2sp = defaultdict(list)
a2cp = defaultdict(list)
q2p = defaultdict(list)

fn_to_data = {
    "a2sp.train.tsv": a2sp,
    "a2cp.train.tsv": a2cp,
    "max2_qorient_q2p.train.tsv": q2p,
}
for file_name, head_to_tails in fn_to_data.items():
    with open(os.path.join(in_dir, file_name)) as fin:
        for row in fin:
            fields = row.strip().split("\t")
            # Each row must have four columns; only the first two are used here.
            assert len(fields) == 4
            head_to_tails[int(fields[0])].append(int(fields[1]))

# Number of distinct heads per file, then the mean number of positives per head.
loaded = (a2sp, a2cp, q2p)
print(*(len(mapping) for mapping in loaded))
print(*(np.mean([len(tails) for tails in mapping.values()]) for mapping in loaded))

In [None]:
def create_self_triples(hid, pos_tid, hid_to_postids, sampler, max_attempts=10_000):
    """Build one ``(hid, pos_tid, neg_tid)`` triple by rejection sampling.

    A candidate is drawn uniformly from ``sampler[hid]`` and redrawn while it
    is one of the head's positives.

    Args:
        hid: Head id; key into both ``hid_to_postids`` and ``sampler``.
        pos_tid: Positive tail id, copied into the triple unchanged.
        hid_to_postids: Mapping from head id to its positive tail ids.
        sampler: Mapping from head id to a sequence of candidate tail ids.
        max_attempts: Maximum number of draws before giving up.

    Returns:
        Tuple ``(hid, pos_tid, neg_tid)`` with ``neg_tid`` taken from
        ``sampler[hid]`` and absent from ``hid_to_postids[hid]``.

    Raises:
        ValueError: If no eligible negative is drawn within ``max_attempts``
            draws, e.g. because every candidate is a positive. Without this
            bound the loop would never terminate in that situation.
    """
    candidates = sampler[hid]
    # Set lookup keeps the rejection test O(1) regardless of how many positives exist.
    positives = set(hid_to_postids[hid])
    for _ in range(max_attempts):
        neg_tid = random.sample(candidates, k=1)[0]
        if neg_tid not in positives:
            return (hid, pos_tid, neg_tid)
    raise ValueError(
        f"no negative found for hid={hid} after {max_attempts} draws; "
        "the candidate pool may contain only positives"
    )

def create_self_n_triples(hid, pos_tid, hid_to_postids, sampler, n=4, max_attempts=10_000):
    """Build ``n`` triples sharing one positive, each with a distinct negative.

    Negatives are drawn uniformly from ``sampler[hid]``; a draw is discarded
    when it is one of the head's positives or was already selected.

    Args:
        hid: Head id; key into both ``hid_to_postids`` and ``sampler``.
        pos_tid: Positive tail id, copied into every triple unchanged.
        hid_to_postids: Mapping from head id to its positive tail ids.
        sampler: Mapping from head id to a sequence of candidate tail ids.
        n: Number of distinct negatives (and therefore triples) to produce.
        max_attempts: Maximum total number of draws for this call.

    Returns:
        List of ``n`` tuples ``(hid, pos_tid, neg_tid)``; empty when ``n <= 0``.

    Raises:
        ValueError: If ``n`` distinct eligible negatives are not collected
            within ``max_attempts`` draws, e.g. because the pool holds fewer
            than ``n`` non-positive ids. Without this bound the loop would
            never terminate in that situation.
    """
    candidates = sampler[hid]
    # Set lookup keeps the rejection test O(1) regardless of how many positives exist.
    positives = set(hid_to_postids[hid])
    neg_tids = set()
    attempts = 0
    while len(neg_tids) < n:
        if attempts >= max_attempts:
            raise ValueError(
                f"only {len(neg_tids)} of {n} negatives found for hid={hid} "
                f"after {max_attempts} draws; the candidate pool may be too small"
            )
        attempts += 1
        neg_tid = random.sample(candidates, k=1)[0]
        # Adding an already-selected id is a no-op, so duplicates just cost a draw.
        if neg_tid not in positives:
            neg_tids.add(neg_tid)

    return [(hid, pos_tid, neg_tid) for neg_tid in neg_tids]


data_dir = "/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/"

# Entity id -> text, read from a two-column tab-separated file.
eid_to_text = {}
with open(os.path.join(data_dir, "all_entities.tsv")) as fin:
    for row in fin:
        raw_eid, entity_text = row.strip().split("\t")
        eid_to_text[int(raw_eid)] = entity_text

# Other experiment directories that were used with this cell:
# "experiment_09-05_201710" "experiment_09-06_014836" "experiment_09-06_144636" "experiment_09-07_005742"
exp_dir = "/home/jupyter/unity_jointly_rec_and_search/experiments/unified_kgc/phase_1/experiment_09-07_005742"
run_path = os.path.join(exp_dir, "runs/checkpoint_latest.all.run")

# Run file: tab-separated, no header, columns hid / tid / rank / score.
df = pd.read_csv(run_path, sep="\t", names=["hid", "tid", "rank", "score"])
number_of_group = len(df["hid"].unique())

# Head id -> candidate tail ids, kept in the order they appear in the run file.
self_hid_to_tids = {}
for head, rows in tqdm(df.groupby("hid"), total=number_of_group):
    candidates = rows["tid"].tolist()
    # Every head is expected to come with exactly 200 candidates.
    assert len(candidates) == 200
    self_hid_to_tids[int(head)] = list(map(int, candidates))


In [43]:
# Per head, keep the candidates from position 100 onward.
# NOTE(review): presumably the run file lists candidates by rank, so this drops
# the top-100 retrieved ids — confirm against how the run file is written.
hard_hid_to_tids = {head: cands[100:] for head, cands in self_hid_to_tids.items()}

# Per head, the pool is the integer ids 0 .. 1_999_999.
rand_hid_to_tids = {head: range(2_000_000) for head in self_hid_to_tids}

In [44]:
import random
from itertools import chain

def _build_triples(hid_to_postids, n_neg):
    """Create training triples for every (head, positive tail) pair.

    For each pair, ``n_neg // 2`` negatives are drawn from ``hard_hid_to_tids``
    and the remaining ``n_neg - n_neg // 2`` from ``rand_hid_to_tids``, so an
    odd ``n_neg`` still yields ``n_neg`` triples per pair.

    Args:
        hid_to_postids: Mapping from head id to its list of positive tail ids.
        n_neg: Total number of negatives to generate per positive pair.

    Returns:
        List of ``(hid, pos_tid, neg_tid)`` tuples; for each pair, the triples
        from the first pool come before those from the second.
    """
    n_hard = n_neg // 2
    n_rand = n_neg - n_hard
    triples = []
    for hid, pos_tids in hid_to_postids.items():
        for pos_tid in pos_tids:
            hard = create_self_n_triples(hid, pos_tid, hid_to_postids, hard_hid_to_tids, n_hard)
            rand = create_self_n_triples(hid, pos_tid, hid_to_postids, rand_hid_to_tids, n_rand)
            # The two pools are sampled independently, so a negative may
            # appear in both halves; such duplicates are not filtered.
            triples.extend(chain(hard, rand))
    return triples


n_neg = 4

# NOTE(review): no random seed is set in this cell, so the sampled negatives
# differ between runs.
a2sp_triples = _build_triples(a2sp, n_neg)
a2cp_triples = _build_triples(a2cp, n_neg)
q2p_triples = _build_triples(q2p, n_neg)

SIM_RELATION = "is_similar_to"
COMPL_RELATION = "is_complementary_to"
REL_RELATION = "is_relevant_to"

# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
out_dir = os.path.join(exp_dir, f"self_train_{n_neg}neg")
os.makedirs(out_dir, exist_ok=True)

fn_to_data = {
    "a2sp.train.tsv": (a2sp_triples, SIM_RELATION),
    "a2cp.train.tsv": (a2cp_triples, COMPL_RELATION),
    "q2p.train.tsv": (q2p_triples, REL_RELATION),
}

# One output row per triple: hid, pos_tid, neg_tid, relation (tab-separated).
for fn, (triples, relation) in fn_to_data.items():
    fn = os.path.join(out_dir, fn)
    with open(fn, "w") as fout:
        for hid, pos_tid, neg_tid in triples:
            fout.write(f"{hid}\t{pos_tid}\t{neg_tid}\t{relation}\n")

In [None]:
# sanity check
for path in os.listdir(out_dir):
    path = os.path.join(out_dir, path)
    ! wc -l $path
    ! head -n 8 $path
    ! tail -n 8 $path
    print("="*100)

In [54]:
# Print the text of the head, positive and negative entity of one example triple.
hid, pos_tid, neg_tid = 2853868, 102466, 680632
for entity_id in (hid, pos_tid, neg_tid):
    print(eid_to_text[entity_id])

file crates
Style Selections Plastics crates 17-in W x 11-in H x 14-in D Black Plastic Milk Crate ; Storage Bins & Baskets
Hastings Home Plastic Storage Tray ; Tool Storage Accessories


In [41]:
# Scratch check: draw two distinct values from range(10).
random.sample(range(10), k=2)

[6, 7]