In [1]:
import os 
from collections import defaultdict

from tqdm import tqdm
import numpy as np
import pandas as pd

in_dir = "/home/jupyter/unity_jointly_rec_and_search/datasets/unified_kgc/unified_train/"

# One "head id -> list of positive tail ids" mapping per training file.
a2sp = defaultdict(list)
a2cp = defaultdict(list)
q2p = defaultdict(list)

fn_to_data = {
    "a2sp.train.tsv": a2sp,
    "a2cp.train.tsv": a2cp,
    "max2_qorient_q2p.train.tsv": q2p,
}
for file_name, mapping in fn_to_data.items():
    train_path = os.path.join(in_dir, file_name)
    with open(train_path) as fin:
        for row in fin:
            columns = row.strip().split("\t")
            # Every row must carry exactly four tab-separated fields; only the
            # first two (head id, positive tail id) are consumed here.
            assert len(columns) == 4
            head_id, tail_id = int(columns[0]), int(columns[1])
            mapping[head_id].append(tail_id)

# Number of distinct heads per relation, then mean positives per head.
relation_maps = (a2sp, a2cp, q2p)
print(*(len(mapping) for mapping in relation_maps))
print(*(np.mean([len(tids) for tids in mapping.values()]) for mapping in relation_maps))

199507 80042 832482
2.8232693589698608 2.40278853601859 1.4414726084167586


In [2]:
def create_self_triples(hid, pos_tid, hid_to_postids, sampler):
    """Build one (head, positive, negative) training triple.

    The negative tail is drawn uniformly at random from ``sampler[hid]``,
    excluding every id listed in ``hid_to_postids[hid]``.

    Args:
        hid: Head id; used as the key into both mappings.
        pos_tid: Positive tail id, copied unchanged into the result.
        hid_to_postids: Mapping from head id to its positive tail ids.
        sampler: Mapping from head id to a sequence of candidate tail ids.

    Returns:
        The tuple ``(hid, pos_tid, neg_tid)``.

    Raises:
        ValueError: If ``sampler[hid]`` is empty or contains only positives.
            A retry-until-negative loop would never terminate in the latter
            case, so it is reported explicitly instead.
    """
    candidates = sampler[hid]
    positives = hid_to_postids[hid]
    if not candidates:
        raise ValueError(f"no negative candidates available for head {hid}")

    # Fast path: a single draw is almost always a valid negative.
    neg_tid = random.choice(candidates)
    if neg_tid in positives:
        # Slow path: draw from the filtered pool instead of retrying blindly.
        # Duplicates are kept, so the overall distribution over valid
        # negatives is the same as with rejection sampling.
        positive_set = set(positives)
        remaining = [tid for tid in candidates if tid not in positive_set]
        if not remaining:
            raise ValueError(
                f"every candidate for head {hid} is a positive; cannot sample a negative"
            )
        neg_tid = random.choice(remaining)
    return (hid, pos_tid, neg_tid)

data_dir = "/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/"

# Entity id -> entity text, parsed from a two-column tab-separated file.
eid_to_text = {}
with open(os.path.join(data_dir, "all_entities.tsv")) as fin:
    for row in fin:
        raw_eid, entity_text = row.strip().split("\t")
        eid_to_text[int(raw_eid)] = entity_text

# Other experiment directory names (presumably alternative runs - confirm):
# "experiment_09-05_201710" "experiment_09-06_014836" "experiment_09-06_144636" "experiment_09-07_005742"
exp_dir = "/home/jupyter/unity_jointly_rec_and_search/experiments/unified_kgc/experiment_09-07_005742"
run_path = os.path.join(exp_dir, "runs/checkpoint_latest.all.run")
df = pd.read_csv(run_path, sep="\t", names=["hid", "tid", "rank", "score"])

# Head id -> candidate tail ids, kept in the order they appear in the run file.
number_of_group = len(df["hid"].unique())
self_hid_to_tids = {}
for head_id, rows in tqdm(df.groupby("hid"), total=number_of_group):
    ranked_tids = rows["tid"].values
    # The run file is expected to hold exactly 200 candidates per head.
    assert len(ranked_tids) == 200
    self_hid_to_tids[int(head_id)] = list(map(int, ranked_tids))

100%|██████████| 3214651/3214651 [07:55<00:00, 6764.30it/s]


In [3]:
import random

def _sample_triples(hid_to_postids, sampler):
    """Return one (hid, pos_tid, neg_tid) triple per positive pair in ``hid_to_postids``."""
    return [
        create_self_triples(hid, pos_tid, hid_to_postids, sampler)
        for hid, pos_tids in hid_to_postids.items()
        for pos_tid in pos_tids
    ]


# Negatives for every relation are drawn from the same candidate lists.
# Order matters for the random stream: a2sp, then a2cp, then q2p.
# NOTE(review): no random seed is set in the visible cells, so each run
# produces different negatives - seed `random` upstream if that matters.
a2sp_triples = _sample_triples(a2sp, self_hid_to_tids)
a2cp_triples = _sample_triples(a2cp, self_hid_to_tids)
q2p_triples = _sample_triples(q2p, self_hid_to_tids)

SIM_RELATION = "is_similar_to"
COMPL_RELATION = "is_complementary_to"
REL_RELATION = "is_relevant_to"

out_dir = os.path.join(exp_dir, "self_train")
# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(out_dir, exist_ok=True)

# Output file name -> (triples to write, relation label stored in column 4).
fn_to_data = {
    "a2sp.train.tsv": (a2sp_triples, SIM_RELATION),
    "a2cp.train.tsv": (a2cp_triples, COMPL_RELATION),
    "q2p.train.tsv": (q2p_triples, REL_RELATION),
}

for fn, (triples, relation) in fn_to_data.items():
    fn = os.path.join(out_dir, fn)
    with open(fn, "w") as fout:
        # One tab-separated row per triple: head, positive, negative, relation.
        for hid, pos_tid, neg_tid in triples:
            fout.write(f"{hid}\t{pos_tid}\t{neg_tid}\t{relation}\n")

In [6]:
# Sanity check: for every entry in the output directory, show its line count
# (`wc -l`) followed by its first and last three rows (`head` / `tail`), then
# print a separator line.
# The `!` lines are IPython shell escapes, so this cell only runs inside
# IPython/Jupyter; `$path` is filled in from the Python variable `path`.
for path in os.listdir(out_dir):
    path = os.path.join(out_dir, path)
    ! wc -l $path
    ! head -n 3 $path
    ! tail -n 3 $path
    print("="*100)

1200000 /home/jupyter/unity_jointly_rec_and_search/experiments/unified_kgc/experiment_09-07_005742/self_train/q2p.train.tsv
2853868	102466	1400885	is_relevant_to
2705355	2002433	2034175	is_relevant_to
3041465	1096710	2198693	is_relevant_to
2531648	291192	8625	is_relevant_to
2474011	731124	1338460	is_relevant_to
2517222	1829280	1495872	is_relevant_to
563262 /home/jupyter/unity_jointly_rec_and_search/experiments/unified_kgc/experiment_09-07_005742/self_train/a2sp.train.tsv
1048567	331101	2037830	is_similar_to
1048567	1144496	910290	is_similar_to
1144496	1777815	1117287	is_similar_to
1891760	2233677	557509	is_similar_to
352886	1068443	2147908	is_similar_to
615570	1017607	1068443	is_similar_to
192324 /home/jupyter/unity_jointly_rec_and_search/experiments/unified_kgc/experiment_09-07_005742/self_train/a2cp.train.tsv
745636	1983214	1926294	is_complementary_to
745636	1337474	1709339	is_complementary_to
745636	1173319	605170	is_complementary_to
364149	41477	833619	is_complementary_to
187314	12

In [11]:
# Spot-check one written triple: print the text of its head, its positive
# tail and its sampled negative tail, one per line.
hid, pos_tid, neg_tid = (745636,1337474,1709339)
for entity_id in (hid, pos_tid, neg_tid):
    print(eid_to_text[entity_id])

Broan 30-in Ducted Black Undercabinet Range Hood ; Undercabinet Range Hoods
Broan Undercabinet Range Hood Damper Kit (Aluminum) ; Range Hood Parts
Broan Duct-Free Wall-mounted Range Hood Liner (Aluminum) ; Range Hood Parts
