In [19]:
import os 
import random

import ujson 
import numpy as np
import copy
import pandas as pd
from tqdm import tqdm

np.random.seed(4680)
random.seed(4680)
MAX_PIDS = 2_000_000

out_dir = "/home/jupyter/jointly_rec_and_search/datasets/rec_search/rec_compl/train/"
in_dir = "/home/jupyter/jointly_rec_and_search/datasets/rec_search/rec_compl/"
run_path = "/home/jupyter/jointly_rec_and_search/datasets/rec_search/rec_compl/runs/bm25.train.run"

if not os.path.exists(out_dir):
    os.mkdir(out_dir)
    
df = pd.read_csv(run_path, sep=" ", names=["aid", "q0", "pid", "rank", "score", "model_name"])
bm25_aid_to_pids = {}
ignore_aids = set()
for aid, group in df.groupby("aid"):
    cand_pids = list(group.pid.values)
    if len(cand_pids) < 10:
        ignore_aids.add(int(aid))
    else:
        bm25_aid_to_pids[int(aid)] = [int(x) for x in cand_pids]
        
assert len(ignore_aids) == 0

train_aids = set()
with open(os.path.join(in_dir, "anchors_title.train.tsv")) as fin:
    for line in fin:
        aid, title = line.rstrip().split("\t")
        train_aids.add(int(aid))

one_hop_examples = []
with open(os.path.join(in_dir, "one_hop_examples.jsonl")) as fin:
    for line in fin:
        example = ujson.loads(line.rstrip())
        for key in example:
            assert len(example) == 2
            if key != "compl_pids":
                aid = int(key)
                
        if aid in train_aids:
            one_hop_examples.append(example)

ignore_num = 0
no_enough_simpids = 0
for topk in [1, 2, 5]:
    train_examples = []
    for example in tqdm(one_hop_examples, total=len(one_hop_examples)):
        train_exp = {"compl_pids": [], "sim_pids": []}
        assert len(example) == 2, example
        for key, vals in example.items():
            if key == "compl_pids":
                compl_pids = [int(pid) for pid in vals.keys()]
                train_exp["compl_pids"] = compl_pids
                random.shuffle(compl_pids)
                train_exp["compl_pids"] = compl_pids[:topk]
        for key, vals in example.items():
            if key != "compl_pids":
                train_exp["aid"] = int(key)
                sim_pids = [int(pid) for pid in example[key]["sim_pids"]]
                random.shuffle(sim_pids)
                train_exp["sim_pids"] = sim_pids[:len(train_exp["compl_pids"])]
                
                if len(train_exp["sim_pids"]) < len(train_exp["compl_pids"]):
                    no_enough_simpids += 1
                    cand_pids = bm25_aid_to_pids[train_exp["aid"]]
                    random.shuffle(cand_pids)
                    
                    remain_pids = []      
                    for pid in cand_pids:
                        if pid in compl_pids:
                            continue
                        remain_pids.append(pid)
                        if len(remain_pids) >= len(train_exp["compl_pids"]) - len(train_exp["sim_pids"]):
                            break
                    train_exp["sim_pids"] += remain_pids
                        
                    
        train_examples.append(train_exp)
        
    with open(os.path.join(out_dir, f"train_{topk}compl_{topk}sim.json"), "w") as fout:
        for train_exp in train_examples:
                fout.write(ujson.dumps(train_exp) + "\n")
    with open(os.path.join(out_dir, f"top{topk}_triples.tsv"), "w") as fout:
        for train_exp in train_examples:
            aid = train_exp["aid"]
            pos_neg_pairs = zip(train_exp["compl_pids"], train_exp["sim_pids"])
            for pos_pid, neg_pid in pos_neg_pairs:
                fout.write(f"{aid}\t{pos_pid}\t{neg_pid}\n")
                
    print(f"no_enough_simpids for top-{topk}: ", no_enough_simpids)
            
    

100%|██████████| 69496/69496 [00:10<00:00, 6684.77it/s] 


no_enough_simpids for top-1:  14304


100%|██████████| 69496/69496 [00:10<00:00, 6916.90it/s] 


no_enough_simpids for top-2:  31276


100%|██████████| 69496/69496 [00:13<00:00, 5020.00it/s]


no_enough_simpids for top-5:  51773


In [20]:
# sanity check
for path in os.listdir(out_dir):
    path = os.path.join(out_dir, path)
    ! wc -l $path
    ! head -n 3 $path
    ! tail -n 3 $path
    print("="*100)

69496 /home/jupyter/jointly_rec_and_search/datasets/rec_search/rec_compl/train/train_5compl_5sim.json
{"compl_pids":[2186809],"sim_pids":[1296125],"aid":501931}
{"compl_pids":[1960065,774261],"sim_pids":[500345,2128382],"aid":220006}
{"compl_pids":[855403,1654671,565304,1939891,1185215],"sim_pids":[1952428,70274,393439,414956,351113],"aid":1970525}
{"compl_pids":[273456],"sim_pids":[672057],"aid":1822620}
{"compl_pids":[533708,1043030],"sim_pids":[485389,1381526],"aid":132127}
{"compl_pids":[1870322],"sim_pids":[1889462],"aid":385303}
69496 /home/jupyter/jointly_rec_and_search/datasets/rec_search/rec_compl/train/train_2compl_2sim.json
{"compl_pids":[2186809],"sim_pids":[1890315],"aid":501931}
{"compl_pids":[774261,1960065],"sim_pids":[1702313,563604],"aid":220006}
{"compl_pids":[2257172,586807],"sim_pids":[393439,1952428],"aid":1970525}
{"compl_pids":[273456],"sim_pids":[971697],"aid":1822620}
{"compl_pids":[533708,1043030],"sim_pids":[1078725,485389],"aid":132127}
{"compl_pids":[18703

In [4]:
! grep -P "^132127\t" "/home/jupyter/jointly_rec_and_search/datasets/rec_search/rec_compl/collection_title_catalog.tsv"
! grep -P "^1043030\t" "/home/jupyter/jointly_rec_and_search/datasets/rec_search/rec_compl/collection_title_catalog.tsv"
! grep -P "^1381526\t" "/home/jupyter/jointly_rec_and_search/datasets/rec_search/rec_compl/collection_title_catalog.tsv"

132127	Jacuzzi 1500-Watt Inline Heater [SEP] Whirlpool Tub & Air Bath Parts
1043030	Jacuzzi 1.5-in Brushed Nickel Foot Lock Drain with Plastic Pipe [SEP] Bathtub Drains
1381526	WaterTECH Whirpool 110 Volt tub heater [SEP] Whirlpool Tub & Air Bath Parts


In [None]:
path = "/home/jupyter/jointly_rec_and_search/preprocess/rec_compl/dataset//"