In [27]:
import os 
import pandas as pd 
import numpy as np 
import ujson
import networkx as nx
from collections import defaultdict
from tqdm import tqdm
from copy import deepcopy

base_dir = "/home/jupyter/jointly_rec_and_search/datasets/rec_search/"
test_qrel_path = os.path.join(base_dir, "search/qrels.test.tsv")

G = nx.Graph()
aid_to_relpids = defaultdict(list)
with open(os.path.join(base_dir, "rec/arels.train.tsv")) as fin:
    for line in fin:
        aid, _, pid, _ = line.strip().split("\t")
        G.add_edge(aid, pid)
        aid_to_relpids[aid].append(pid)
with open(os.path.join(base_dir, "rec/arels.val.tsv")) as fin:
    for line in fin:
        aid, _, pid, _ = line.strip().split("\t")
        G.add_edge(aid, pid)
        aid_to_relpids[aid].append(pid)
with open(os.path.join(base_dir, "rec/arels.test.tsv")) as fin:
    for line in fin:
        aid, _, pid, _ = line.strip().split("\t")
        G.add_edge(aid, pid)
        aid_to_relpids[aid].append(pid)
        
print("G number of nodes, edges, connected_components = {}, {}, {}".format(G.number_of_nodes(), G.number_of_edges(), 
                                                                               nx.number_connected_components(G)))
tmps = [len(xs) for xs in list(aid_to_relpids.values())]
print("median and mean number of interaction for aid = {:.3f}, {:.3f}".format(np.median(tmps), np.mean(tmps)))

qid_to_relpids = defaultdict(list)
with open(test_qrel_path) as fin:
    for line in fin:
        qid, _, pid, _ = line.strip().split("\t")
        qid_to_relpids[qid].append(pid)

di_ext_dist  = []
di_ext_qid_to_relpids = {}
for qid, relpids in tqdm(qid_to_relpids.items()):
    ext_pids = set(relpids)
    for pid in relpids:
        if pid not in G:
            continue
        nodes = set(nx.shortest_path(G, pid).keys())
        ext_pids = ext_pids.union(nodes)

    di_ext_dist.append(len(ext_pids) - len(relpids))
    di_ext_qid_to_relpids[qid] = list(ext_pids)
    
un_ext_dist = []
un_ext_qid_to_relpids = {}
for qid, relpids in tqdm(qid_to_relpids.items()):
    ext_pids = set(relpids)
    for pid in relpids:
        if pid in aid_to_relpids:
            new_pids = set(aid_to_relpids[pid])
            ext_pids = ext_pids.union(new_pids)
    un_ext_dist.append(len(ext_pids) - len(relpids))
    un_ext_qid_to_relpids[qid] = list(ext_pids)

G number of nodes, edges, connected_components = 82191, 180152, 8124
median and mean number of interaction for aid = 2.000, 4.188


100%|██████████| 32530/32530 [00:20<00:00, 1581.59it/s]
100%|██████████| 32530/32530 [00:00<00:00, 121519.13it/s]


In [28]:
train_qids = set()
with open(os.path.join(base_dir, "search/queries.train.tsv")) as fin:
    for line in fin:
        qid, _ = line.strip().split("\t")
        train_qids.add(qid)

test_qids = set()
with open(os.path.join(base_dir, "search/queries.test.tsv")) as fin:
    for line in fin:
        qid, _  = line.strip().split("\t")
        test_qids.add(qid)
with open(os.path.join(base_dir, "search/ext_qrels.test.tsv"), "w") as fout:
    for qid in list(test_qids):
        for pid in un_ext_qid_to_relpids[qid]:
            fout.write(f"{qid}\tQ0\t{pid}\t{1}\n")
            
test_qids = set()
with open(os.path.join(base_dir, "search/queries.test.exclude.tsv")) as fin:
    for line in fin:
        qid, _  = line.strip().split("\t")
        test_qids.add(qid)
with open(os.path.join(base_dir, "search/ext_qrels.test.exclude.tsv"), "w") as fout:
    for qid in list(test_qids):
        for pid in un_ext_qid_to_relpids[qid]:
            fout.write(f"{qid}\tQ0\t{pid}\t{1}\n")

In [29]:
# sanity check
for path in os.listdir(os.path.join(base_dir, "search")):
    path = os.path.join(os.path.join(base_dir, "search"), path)
    if "qrels.test" in path:
        ! wc -l $path
        ! head -n 3 $path
        ! tail -n 3 $path
        print("="*100)

985 /home/jupyter/jointly_rec_and_search/datasets/rec_search/search/qrels.test.exclude.tsv
1044	Q0	1379373	1
1103	Q0	2247719	1
1702	Q0	389059	1
193548	Q0	2079325	1
193879	Q0	2069763	1
194602	Q0	991352	1
445586 /home/jupyter/jointly_rec_and_search/datasets/rec_search/search/ext_qrels.test.tsv
194164	Q0	1696419	1
194164	Q0	330724	1
194164	Q0	313573	1
20561	Q0	2207306	1
20561	Q0	2207303	1
20561	Q0	2207186	1
6244 /home/jupyter/jointly_rec_and_search/datasets/rec_search/search/ext_qrels.test.exclude.tsv
67572	Q0	151535	1
67572	Q0	1804839	1
67572	Q0	366443	1
60277	Q0	112843	1
60277	Q0	109769	1
60277	Q0	2245777	1
107087 /home/jupyter/jointly_rec_and_search/datasets/rec_search/search/qrels.test.tsv
11	Q0	2126343	1
43	Q0	2126661	1
69	Q0	87856	1
194615	Q0	961160	1
194615	Q0	2032048	1
194615	Q0	1679595	1


In [25]:
idxes = np.random.randint(0,30000, 5)

sample_1 = [list(qid_to_relpids.items())[idx] for idx in idxes]
sample_2 = [list(di_ext_qid_to_relpids.items())[idx] for idx in idxes]
sample_3 = [list(un_ext_qid_to_relpids.items())[idx] for idx in idxes]
samples = [sample_1, sample_2, sample_3]
for i in range(3):
    print(f"sample_{i}")
    if i==1:
        continue
    for qid, pids in samples[i]:
        #print(qid_to_query[qid], [pid_to_prodocutid[_pid] for _pid in pids])
        print(qid_to_query[qid], [pid_to_title[_pid] for _pid in pids])
        print("-"*50)
    print("="*100)

sample_0
outdoor daybed ['Safavieh Cadeo Rattan Outdoor Daybed with Off-white Cushion(S) and Iron Frame', 'Best Selling Home Decor Ottavio Outdoor Daybed with Gray Cushion(S) and Acacia Frame']
--------------------------------------------------
12 foot step ladder ['Werner 7400 Fiberglass 12-ft Type 1AA- 375-lb Capacity Step Ladder', 'Werner 370 Aluminum 10-ft Type 1A- 300-lb Capacity Step Ladder', 'Werner NXT1A Fiberglass 12-ft Type 1A- 300 lbs. Capacity Step Ladder']
--------------------------------------------------
clorox pool filter sand ['Clorox Pool&Spa 25-lb Sand Pool Filter Aid']
--------------------------------------------------
ultrasonic rodent repellent ['Victor PestChaser Pest Repeller', 'Victor Heavy Sonic PestChaser Rodent Repellent']
--------------------------------------------------
roof cap vent ['Broan Universal Roof Cap (Black)']
--------------------------------------------------
sample_1
sample_2
outdoor daybed ['Haven Way Hana Wicker Outdoor Sectional with White 

(3003, array([  2.,   2.,   2.,   4.,   9., 195.]), 5.226440226440227)