In [34]:
import os 
from collections import defaultdict

# Relation label written as the 4th column of the output training file.
REL_RELATION = "is_relevant_to"

in_dir = "/home/jupyter/jointly_rec_and_search/datasets/unified_kgc/unified_train/"

qorien_path = os.path.join(in_dir, "max2_qorient_q2p.train.tsv")
porien_path = os.path.join(in_dir, "q2a.train.tsv")


def _load_q2p_triples(path, q2ps, q2p2negp):
    """Read one 4-column TSV of (qid, pid, neg_pid, <ignored>) rows.

    Appends each pid to ``q2ps[qid]`` (file order and duplicates are kept)
    and records ``q2p2negp[qid][pid] = neg_pid``. A later row for the same
    (qid, pid) pair overwrites the previously stored negative.

    Args:
        path: TSV file to read.
        q2ps: mapping qid -> list of pids, updated in place (expects a
            ``defaultdict(list)``).
        q2p2negp: mapping qid -> {pid -> neg_pid}, updated in place.

    Raises:
        ValueError: if a non-blank line does not contain exactly four
            tab-separated fields.
    """
    with open(path) as fin:
        for lineno, raw_line in enumerate(fin, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no record.
                continue
            fields = line.split("\t")
            if len(fields) != 4:
                raise ValueError(
                    f"{path}:{lineno}: expected 4 tab-separated fields, got {len(fields)}"
                )
            qid, pid, neg_pid, _ = fields
            q2ps[qid].append(pid)
            q2p2negp.setdefault(qid, {})[pid] = neg_pid


qorien_q2ps = defaultdict(list)
porien_q2ps = defaultdict(list)
# Shared across both files: the second file's negative wins when both files
# contain the same (qid, pid) pair.
q2p2negp = {}
_load_q2p_triples(qorien_path, qorien_q2ps, q2p2negp)
_load_q2p_triples(porien_path, porien_q2ps, q2p2negp)

# Distinct positive passage ids referenced by each source, across all queries.
qorien_unique_ps = set().union(*qorien_q2ps.values())
porien_unique_ps = set().union(*porien_q2ps.values())
print(
    "qorien_q2ps, unique queries = {:,}, unique passages = {:,}".format(
        len(qorien_q2ps), len(qorien_unique_ps)
    )
)
print(
    "porien_q2ps, unique queries = {:,}, unique passages = {:,}".format(
        len(porien_q2ps), len(porien_unique_ps)
    )
)

# Union the positives of both sources per query.
#
# dict.fromkeys() de-duplicates while keeping first-seen order. The previous
# list(set(...)) produced an order that depends on string hash randomization,
# so the written training file differed between kernel restarts, and only
# queries present in BOTH sources were de-duplicated. Every path now
# de-duplicates and builds a fresh list, so merge_q2ps never aliases the
# per-source lists.
merge_q2ps = defaultdict(list)
for q, ps in qorien_q2ps.items():
    # .get() (not []) so the lookup cannot insert empty entries into the
    # porien defaultdict for queries it does not contain.
    merge_q2ps[q] = list(dict.fromkeys(ps + porien_q2ps.get(q, [])))

# Queries that only appear in the second source.
for q, ps in porien_q2ps.items():
    if q not in merge_q2ps:
        merge_q2ps[q] = list(dict.fromkeys(ps))

merge_unique_ps = {p for ps in merge_q2ps.values() for p in ps}
print("merge_q2ps, unique queries = {:,}, unique passages = {:,}".format(len(merge_q2ps), len(merge_unique_ps)))

# Flatten the merged mapping into (qid, positive pid, negative pid) triples,
# pairing every positive with the negative recorded for that (qid, pid).
train_triples = [
    (query_id, passage_id, q2p2negp[query_id][passage_id])
    for query_id, passage_ids in merge_q2ps.items()
    for passage_id in passage_ids
]
print("number of train examples = {:,}".format(len(train_triples)))

# Persist the triples as a 4-column TSV: qid, positive pid, negative pid, relation.
with open(os.path.join(in_dir, "complete_q2p.train.tsv"), "w") as fout:
    fout.writelines(
        f"{qid}\t{pos_pid}\t{neg_pid}\t{REL_RELATION}\n"
        for qid, pos_pid, neg_pid in train_triples
    )

qorien_q2ps, unique queries = 716,991, unique passages = 176,811
porien_q2ps, unique queries = 289,415, unique passages = 330,387
merge_q2ps, unique queries = 716,991, unique passages = 330,387
number of train examples = 1,808,056


In [35]:
# Sanity check on complete_q2p.train.tsv: count its lines, then preview the
# first 10 rows.
! wc -l /home/jupyter/jointly_rec_and_search/datasets/unified_kgc/unified_train/complete_q2p.train.tsv
! head -n 10 /home/jupyter/jointly_rec_and_search/datasets/unified_kgc/unified_train/complete_q2p.train.tsv

1808056 /home/jupyter/jointly_rec_and_search/datasets/unified_kgc/unified_train/complete_q2p.train.tsv
3006068	785329	2019535	is_relevant_to
3006068	1517384	809219	is_relevant_to
2608168	1732982	409623	is_relevant_to
2608168	551815	2174606	is_relevant_to
2588656	934774	1698316	is_relevant_to
2588656	541860	855023	is_relevant_to
2588656	477227	1675319	is_relevant_to
2588656	1560403	137815	is_relevant_to
2588656	1391645	148715	is_relevant_to
2588656	1331297	791279	is_relevant_to


In [38]:
# Spot-check a single triple: the ids below are hard-coded (head id,
# positive tail id, negative tail id).
hid, pos_tid, neg_tid = (2608168,1732982,409623)

# IPython expands {name} inside `!` commands. Each grep prints the rows of
# all_entities.tsv that start with the given id followed by a tab.
! grep -P "^{hid}\t" "/home/jupyter/jointly_rec_and_search/datasets/unified_kgc/all_entities.tsv"
! grep -P "^{pos_tid}\t" "/home/jupyter/jointly_rec_and_search/datasets/unified_kgc/all_entities.tsv"
! grep -P "^{neg_tid}\t" "/home/jupyter/jointly_rec_and_search/datasets/unified_kgc/all_entities.tsv"

2608168	brick oven
1732982	Forno Venetzia Pronto Brick Hearth Wood-fired Outdoor Pizza Oven ; Outdoor Pizza Ovens
409623	Antico Elements Faux Brick Panels 5-in x 7-in Storm Brick Veneer Individual Piece 0.3 Square ; Brick Veneer


In [36]:
# Display the first ten in-memory (qid, pos_pid, neg_pid) triples.
train_triples[:10]

[('3006068', '785329', '2019535'),
 ('3006068', '1517384', '809219'),
 ('2608168', '1732982', '409623'),
 ('2608168', '551815', '2174606'),
 ('2588656', '934774', '1698316'),
 ('2588656', '541860', '855023'),
 ('2588656', '477227', '1675319'),
 ('2588656', '1560403', '137815'),
 ('2588656', '1391645', '148715'),
 ('2588656', '1331297', '791279')]

In [31]:
# Look up the negative pid stored for one (qid, pid) pair.
q2p2negp["3006068"]["785329"]

'2019535'