In [10]:
import os 
import random

import ujson 
import numpy as np
import copy

np.random.seed(4680)
MAX_PIDS = 2_000_000

out_dir = "/home/jupyter/jointly_rec_and_search/datasets/rec_search/rec_compl/train/"
in_dir = "/home/jupyter/jointly_rec_and_search/datasets/rec_search/rec_compl/"
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

one_hop_examples = []
with open(os.path.join(in_dir, "one_hop_examples.jsonl")) as fin:
    for line in fin:
        example = ujson.loads(line.rstrip())
        one_hop_examples.append(example)

no_enough_simpids = 0
for topk in [2,5]:
    train_examples = []
    for example in one_hop_examples:
        train_exp = {"compl_pids": [], "sim_pids": []}
        assert len(example) == 2, example
        for key, vals in example.items():
            if key == "compl_pids":
                compl_pids = [int(pid) for pid in vals.keys()]
                train_exp["compl_pids"] = compl_pids
                random.shuffle(compl_pids)
                train_exp["compl_pids"] = compl_pids[:topk]
        for key, vals in example.items():
            if key != "compl_pids":
                train_exp["aid"] = int(key)
                sim_pids = [int(pid) for pid in example[key]["sim_pids"]]
                random.shuffle(sim_pids)
                train_exp["sim_pids"] = sim_pids[:len(train_exp["compl_pids"])]
                
                if len(sim_pids) < len(train_exp["compl_pids"]):
                    no_enough_simpids += 1
        train_examples.append(train_exp)
        
    with open(os.path.join(out_dir, f"train_{topk}compl_{topk}sim.json"), "w") as fout:
        for train_exp in train_examples:
                fout.write(ujson.dumps(train_exp) + "\n")
    with open(os.path.join(out_dir, f"top{topk}_triples.tsv"), "w") as fout:
        for train_exp in train_examples:
            aid = train_exp["aid"]
            pos_neg_pairs = zip(train_exp["compl_pids"], train_exp["sim_pids"])
            for pos_pid, neg_pid in pos_neg_pairs:
                fout.write(f"{aid}\t{pos_pid}\t{neg_pid}\n")
                
    print(f"no_enough_simpids for top-{topk}: ", no_enough_simpids)
            
    

no_enough_simpids for top-2:  21159
no_enough_simpids for top-5:  46699


In [5]:
# sanity check
for path in os.listdir(out_dir):
    path = os.path.join(out_dir, path)
    ! wc -l $path
    ! head -n 3 $path
    ! tail -n 3 $path
    print("="*100)

86870 /home/jupyter/jointly_rec_and_search/datasets/rec_search/rec_compl/train/train_5compl_5sim.json
{"compl_pids":[996049],"sim_pids":[],"aid":1533887}
{"compl_pids":[1742890,138280,137963,1737192,2220361],"sim_pids":[203190,700862,1665306,1679435,1329645],"aid":1890880}
{"compl_pids":[2218053,1361171],"sim_pids":[],"aid":1747057}
{"compl_pids":[2066839],"sim_pids":[],"aid":319334}
{"compl_pids":[658077,1691210,551638,1649508,1405724],"sim_pids":[684267,2193403,454906,263070,2189016],"aid":1083335}
{"compl_pids":[398038,240925,90666,2204991,1099519],"sim_pids":[1083335,820391,454906,684466,163550],"aid":2193403}
86870 /home/jupyter/jointly_rec_and_search/datasets/rec_search/rec_compl/train/train_2compl_2sim.json
{"compl_pids":[996049],"sim_pids":[],"aid":1533887}
{"compl_pids":[137963,2220361],"sim_pids":[700862,1665306],"aid":1890880}
{"compl_pids":[1361171,2218053],"sim_pids":[],"aid":1747057}
{"compl_pids":[2066839],"sim_pids":[],"aid":319334}
{"compl_pids":[1405724,1879519],"sim_

In [11]:
! grep -P "^1533887\t" "/home/jupyter/jointly_rec_and_search/datasets/rec_search/rec_compl/collection_title.tsv"

1533887	SIMPSON 72-in x 80-in Wood Full Lite Right-Hand Inswing Brown Unfinished Double Front Door Solid Core


In [None]:
path = "/home/jupyter/jointly_rec_and_search/preprocess/rec_compl/dataset//"