In [12]:
import os 

import pandas as pd
import numpy as np 
from tqdm import tqdm

in_dir = "/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/"

datas = []
fns = [
    os.path.join(in_dir, "train_sim_recs.csv"),
    os.path.join(in_dir, "test_sim_recs.csv"),
    os.path.join(in_dir, "train_compl_recs.csv"),
    os.path.join(in_dir, "test_compl_recs.csv"),
    os.path.join(in_dir, "train_searchs.csv"),
    os.path.join(in_dir, "test_searchs.csv"),
]

for fn in fns:
    datas.append(pd.read_csv(fn, index_col=0))
    
train_sim_data, test_sim_data, train_compl_data, test_compl_data, train_search_data, test_search_data = datas
sim_data = pd.concat([train_sim_data, test_sim_data])
compl_data = pd.concat([train_compl_data, test_compl_data])
search_data = pd.concat([train_search_data, test_search_data])
assert len(sim_data) == len(train_sim_data) + len(test_sim_data) 
assert len(compl_data) == len(train_compl_data) + len(test_compl_data) 
assert len(search_data) == len(train_search_data) + len(test_search_data)

  mask |= (ar1 == a)


In [47]:
import random
from collections import defaultdict
import pickle as pkl
import pandas as pd
random.seed(4680)

out_dir = os.path.join(in_dir, "zero_shot")
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

max_user_num = 10_000
selected_sim_users = random.sample(list(sim_data.uid), k=min(max_user_num, int(len(sim_data.uid.unique())*0.1)))
selected_compl_users = random.sample(list(compl_data.uid), k=min(max_user_num, int(len(compl_data.uid.unique())*0.1)))
selected_search_users = random.sample(list(search_data.uid), k=min(max_user_num, int(len(search_data.uid.unique())*0.1)))

print("number of selected sim, compl, search users {:,}, {:,}, {:,}".format(len(selected_sim_users), len(selected_compl_users), 
                                                                            len(selected_search_users)))

test_sim_data = sim_data[np.in1d(sim_data.uid, selected_sim_users)]
test_compl_data = compl_data[np.in1d(compl_data.uid, selected_compl_users)]
test_search_data = search_data[np.in1d(search_data.uid, selected_search_users)]

train_sim_data = sim_data[~np.in1d(sim_data.uid, selected_sim_users)]
train_compl_data = compl_data[~np.in1d(compl_data.uid, selected_compl_users)]
train_search_data = search_data[~np.in1d(search_data.uid, selected_search_users)]

assert len(test_sim_data) + len(train_sim_data) == len(sim_data) and len(test_compl_data) + len(train_compl_data) == len(compl_data)
assert len(test_search_data) + len(train_search_data) == len(search_data)

train_aid_to_simpids, train_aid_to_complpids, train_qid_to_pids = defaultdict(set), defaultdict(set), defaultdict(set)
test_aid_to_simpids, test_aid_to_complpids, test_qid_to_pids = defaultdict(set), defaultdict(set), defaultdict(set)
for aid, simpids in zip(train_sim_data.aid, train_sim_data.sim_pids):
    train_aid_to_simpids[aid].update(eval(simpids))
for aid, simpids in zip(test_sim_data.aid, test_sim_data.sim_pids):
    test_aid_to_simpids[aid].update(eval(simpids))
for aid, complpids in zip(train_compl_data.aid, train_compl_data.compl_pids):
    train_aid_to_complpids[aid].update(eval(complpids))
for aid, complpids in zip(test_compl_data.aid, test_compl_data.compl_pids):
    test_aid_to_complpids[aid].update(eval(complpids))
for qid, relpids in zip(train_search_data.qid, train_search_data.rel_pids):
    train_qid_to_pids[qid].update(eval(relpids))
for qid, relpids in tqdm(zip(test_search_data.qid, test_search_data.rel_pids), total=len(test_search_data)):
    test_qid_to_pids[qid].update(eval(relpids))
    

print("train sim_arels, compl_arels, search_qrels = {:,}, {:,}, {:,}".format(
    sum([len(x) for x in train_aid_to_simpids.values()]), sum([len(x) for x in train_aid_to_complpids.values()]), 
    sum([len(x) for x in train_qid_to_pids.values()])
))
print("test sim_arels, compl_arels, search_qrels = {:,}, {:,}, {:,}".format(
    sum([len(x) for x in test_aid_to_simpids.values()]), sum([len(x) for x in test_aid_to_complpids.values()]), 
    sum([len(x) for x in test_qid_to_pids.values()])
))

exclude_aid_to_simpids, exclude_aid_to_complpids, exclude_qid_to_pids = {}, {}, {}
for aid, simpids in test_aid_to_simpids.items():
    if aid in train_aid_to_simpids:
        exclude_pids = simpids.difference(train_aid_to_simpids[aid])
    else:
        exclude_pids = simpids
    exclude_aid_to_simpids[aid] = exclude_pids
for aid, complpids in test_aid_to_complpids.items():
    if aid in train_aid_to_complpids:
        exclude_pids = complpids.difference(train_aid_to_complpids[aid])
    else:
        exclude_pids = complpids
    exclude_aid_to_complpids[aid] = exclude_pids
for qid, pids in test_qid_to_pids.items():
    if qid in train_qid_to_pids:
        exclude_pids = pids.difference(train_qid_to_pids[qid])
    else:
        exclude_pids = pids
    exclude_qid_to_pids[qid] = exclude_pids
    
print("after difference, test sim_arels, compl_arels, search_qrels = {:,}, {:,}, {:,}".format(
    sum([len(x) for x in exclude_aid_to_simpids.values()]), sum([len(x) for x in exclude_aid_to_complpids.values()]), 
    sum([len(x) for x in exclude_qid_to_pids.values()])
))

fn_to_data = {
    os.path.join(out_dir, "exclude_aid_to_simpids.pkl"): exclude_aid_to_simpids,
    os.path.join(out_dir, "exclude_aid_to_complpids.pkl"): exclude_aid_to_complpids,
    os.path.join(out_dir, "exclude_qid_to_relpids.pkl"): exclude_qid_to_pids,
}
for fn, data in fn_to_data.items():
    with open(fn, "wb") as fout:
        pkl.dump(data, fout)

number of selected sim, compl, search users 8,166, 1,262, 10,000


100%|██████████| 330924/330924 [00:01<00:00, 177863.04it/s]


train sim_arels, compl_arels, search_qrels = 321,960, 58,493, 2,463,484
test sim_arels, compl_arels, search_qrels = 71,517, 10,521, 203,891
after difference, test sim_arels, compl_arels, search_qrels = 24,429, 5,246, 19,858


In [None]:
def create_neg_value_ids(query_ids, pos_value_ids, miss_qids, sampler=None):
    assert type(sampler) == dict
    assert len(query_ids) == len(pos_value_ids)
    neg_value_ids = []
    for qid, pos_vid in zip(query_ids, pos_value_ids):
        if qid not in sampler:
            miss_qids.add(qid)
            neg_vid = random.sample(range(2_000_000), k=1)[0]
            while neg_vid == pos_vid:
                neg_vid = random.sample(range(2_000_000), k=1)[0]
            neg_value_ids.append(neg_vid)
        else:
            neg_vid = random.sample(sampler[qid], k=1)[0]
            while neg_vid == pos_vid:
                neg_vid = random.sample(range(2_000_000), k=1)[0]
            neg_value_ids.append(neg_vid)
    
    assert len(neg_value_ids) == len(pos_value_ids)
    
    return neg_value_ids

run_path = os.path.join(in_dir, "runs/bm25.all.run")
df = pd.read_csv(run_path, sep=" ", names=["hid", "q0", "tid", "rank", "score", "model_name"])
bm25_hid_to_tids = {}
ignore_hids = set()
for hid, group in df.groupby("hid"):
    cand_tids = list(group.tid.values)
    if len(cand_tids) < 10:
        ignore_hids.add(int(hid))
    else:
        bm25_hid_to_tids[int(hid)] = [int(x) for x in cand_tids]
        
print("number of ignore hids = {}".format(len(ignore_hids)))

number of ignore hids = 6644


In [55]:
def get_sequential_examples(data, prefix, bm25_hid_to_tids, is_train=True):
    seq_examples = []
    miss_hids = set()
    for uid, group in tqdm(data.groupby("uid")):
        if "search_sequential" in prefix:
            qids = list(group.qid)
            group_rel_pids = group.rel_pids 
        elif "sim_rec_sequential" in prefix:
            qids = list(group.aid)
            group_rel_pids = group.sim_pids
        elif "compl_rec_sequential" in prefix:
            qids = list(group.aid)
            group_rel_pids = group.compl_pids
        else:
            raise ValueError(f"{prefix} not valid.")
            
        rel_pids = []
        for xs in group_rel_pids:
            rel_pids.append(random.sample(eval(xs), k=1)[0]) # only sample 1 relpid 
        assert len(qids) == len(rel_pids) == len(group)
        
        if is_train:
            target_value_ids = [int(x) for x in rel_pids[1:]]
        else:
            target_value_ids = [eval(xs) for xs in group_rel_pids][1:]
            target_value_ids = [[int(x) for x in xs] for xs in target_value_ids]

        uid = int(uid)
        qids = [int(x) for x in qids]
        rel_pids = [int(x) for x in rel_pids]

        query_ids = qids[1:]
        context_key_ids = qids[:-1]
        context_value_ids = rel_pids[:-1]
            
        assert len(query_ids) == len(context_key_ids) == len(context_value_ids) == len(target_value_ids)
        
        if is_train:
            if "sim_rec_sequential" in prefix:
                neg_value_ids = random.sample(range(2_000_000), k=len(target_value_ids))
            elif "search_sequential" in prefix or "compl_rec_sequential" in prefix:
                neg_value_ids = create_neg_value_ids(query_ids=query_ids, 
                                                     pos_value_ids=target_value_ids, 
                                                     miss_qids=miss_hids, 
                                                     sampler=bm25_hid_to_tids)
            else:
                raise ValueError(f"prefix: {prefix} is not valid.")
            example = {"uid": uid, "query_ids": query_ids, "context_key_ids": context_key_ids, "context_value_ids": context_value_ids,
                        "target_value_ids": target_value_ids, "neg_value_ids": neg_value_ids}
        else:
            example = {"uid": uid, "query_ids": query_ids, "context_key_ids": context_key_ids, "context_value_ids": context_value_ids,
                        "target_value_ids": target_value_ids}
        
        seq_examples.append(example)
    print(f"miss_hids for {prefix}: {len(miss_hids):,}.")
        
    return seq_examples

train_search_examples = get_sequential_examples(train_search_data, "search_sequential", bm25_hid_to_tids, is_train=True)
test_search_examples = get_sequential_examples(test_search_data, "search_sequential", bm25_hid_to_tids, is_train=False)
train_sim_examples = get_sequential_examples(train_sim_data, "sim_rec_sequential", bm25_hid_to_tids, is_train=True)
test_sim_examples = get_sequential_examples(test_sim_data, "sim_rec_sequential", bm25_hid_to_tids, is_train=False)
train_compl_examples = get_sequential_examples(train_compl_data, "compl_rec_sequential", bm25_hid_to_tids, is_train=True)
test_compl_examples = get_sequential_examples(test_compl_data, "compl_rec_sequential", bm25_hid_to_tids, is_train=False)


100%|██████████| 805946/805946 [05:38<00:00, 2381.33it/s] 


miss_hids for search_sequential: 45,210.


100%|██████████| 9886/9886 [00:06<00:00, 1577.35it/s]


miss_hids for search_sequential: 0.


100%|██████████| 74036/74036 [00:22<00:00, 3358.05it/s]


miss_hids for sim_rec_sequential: 0.


100%|██████████| 7628/7628 [00:03<00:00, 2487.95it/s]


miss_hids for sim_rec_sequential: 0.


100%|██████████| 11424/11424 [00:02<00:00, 4267.27it/s]


miss_hids for compl_rec_sequential: 0.


100%|██████████| 1204/1204 [00:00<00:00, 3693.08it/s]

miss_hids for compl_rec_sequential: 0.





In [90]:
import ujson

hlen = 4
dest_dir = os.path.join(out_dir, f"sequential_train_test_hlen_{hlen}_bm25")
if not os.path.exists(dest_dir):
    os.mkdir(dest_dir)
    
SIM_RELATION = "is_similar_to"
COMPL_RELATION = "is_complementary_to"
REL_RELATION = "is_relevant_to"
    
prefix_to_data = {
    "search_sequential": (test_search_examples, REL_RELATION),
    "sim_rec_sequential": (test_sim_examples, SIM_RELATION),
    "compl_rec_sequential": (test_compl_examples, COMPL_RELATION)
}
for prefix, (test_examples, relation) in prefix_to_data.items():
    num_j = 3
    test_id = 0
    out_examples = []
    tid_to_pospids = {}
    for example in tqdm(test_examples, total=len(test_examples)):
        max_len = len(example["query_ids"])
        for j in range(num_j):
            end_idx = max_len - j
            start_idx = max(0, max_len - j - hlen)
            assert end_idx > 0 and end_idx > start_idx

            out_example = {"uid": example["uid"], 'query_ids': example['query_ids'][start_idx:end_idx], 
                           'context_key_ids': example['context_key_ids'][start_idx:end_idx], 
                           'context_value_ids': example['context_value_ids'][start_idx:end_idx], "relation": relation,
                          "test_id": test_id}
            out_examples.append(out_example)
            tid_to_pospids[test_id] = example["target_value_ids"][end_idx-1]
            test_id += 1
            
    with open(os.path.join(dest_dir, f"{prefix}.test.json"), "w") as fout:
        for example in out_examples:
            fout.write(ujson.dumps(example) + "\n")
    with open(os.path.join(dest_dir, f"{prefix}.trels.tsv"), "w") as fout:
        for tid, pospid in tid_to_pospids.items():
            for pospid in pospid:
                fout.write(f"{tid}\tQ0\t{pospid}\t{1}\n")
    
    
fn_to_data = {
    os.path.join(dest_dir, "search_sequential.train.json"): (train_search_examples, REL_RELATION),
    os.path.join(dest_dir, "sim_rec_sequential.train.json"): (train_sim_examples, SIM_RELATION),
    os.path.join(dest_dir, "compl_rec_sequential.train.json"): (train_compl_examples, COMPL_RELATION)
}
for fn, (train_examples,relation) in fn_to_data.items():
    out_examples = []
    for example in tqdm(train_examples, total=len(train_examples)):
        max_len = len(example["query_ids"])
        start_idx = max(0, max_len-hlen)
        
        out_example = {"uid": example["uid"], 'query_ids': example['query_ids'][start_idx:], 
                           'context_key_ids': example['context_key_ids'][start_idx:], 
                           'context_value_ids': example['context_value_ids'][start_idx:],
                            'target_value_ids': example['target_value_ids'][start_idx:],
                              'neg_value_ids': example['neg_value_ids'][start_idx:], "relation": relation}
        out_examples.append(out_example)
        
    with open(fn, "w") as fout:
        for example in out_examples:
            fout.write(ujson.dumps(example) + "\n")

100%|██████████| 9886/9886 [00:00<00:00, 96231.02it/s]
100%|██████████| 7628/7628 [00:00<00:00, 115473.59it/s]
100%|██████████| 1204/1204 [00:00<00:00, 124742.29it/s]
100%|██████████| 805946/805946 [00:24<00:00, 33062.43it/s] 
100%|██████████| 74036/74036 [00:00<00:00, 313172.04it/s]
100%|██████████| 11424/11424 [00:00<00:00, 261217.94it/s]


In [92]:
# sanity check
for fn in os.listdir(dest_dir):
    fn = os.path.join(dest_dir, fn)
    if fn.endswith(".test.json"):
        continue
    ! wc -l $fn
    ! head -n 5 $fn
    print(75*"=")

31637 /home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/zero_shot/sequential_train_test_hlen_4_bm25/sim_rec_sequential.trels.tsv
0	Q0	1042730	1
1	Q0	2054815	1
2	Q0	2054815	1
3	Q0	2024162	1
4	Q0	2024162	1
29658 /home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/zero_shot/sequential_train_test_hlen_4_bm25/search_sequential.test.json
{"uid":76,"query_ids":[2475509,2536403,2987930,3008398],"context_key_ids":[2483941,2475509,2536403,2987930],"context_value_ids":[102099,490215,534086,157105],"relation":"is_relevant_to","test_id":0}
{"uid":76,"query_ids":[2483941,2475509,2536403,2987930],"context_key_ids":[2654742,2483941,2475509,2536403],"context_value_ids":[266488,102099,490215,534086],"relation":"is_relevant_to","test_id":1}
{"uid":76,"query_ids":[2654742,2483941,2475509,2536403],"context_key_ids":[2824399,2654742,2483941,2475509],"context_value_ids":[1121930,266488,102099,490215],"relation":"is_relevant_to","test_id":2}
{"uid":112,"query_ids":[2298242,2840926,

In [88]:
uid = 211
for example in test_sim_examples:
    if example["uid"] == uid:
        print(example)
query_ids = [2237551,2237551,986063,1040357]
target_vids = [1666533,1666533,257280,1452546]
neg_vids = [2088791,828212,1127530,1943751]

for qid, tvid, nvid in zip(query_ids, target_vids, neg_vids):
    print("query: {}\ntarget_item: {}\nneg_item: {}\n".format(eid_to_text[qid], eid_to_text[tvid], eid_to_text[nvid]))
    print(75*"=")

{'uid': 211, 'query_ids': [455596, 1890709, 1890709, 1223138, 1223138, 1707153, 1223138, 841737, 1938671, 1938671, 2172629, 1595941], 'context_key_ids': [1403791, 455596, 1890709, 1890709, 1223138, 1223138, 1707153, 1223138, 841737, 1938671, 1938671, 2172629], 'context_value_ids': [662654, 187543, 187543, 187543, 187543, 187543, 187543, 187543, 713893, 2054815, 2054815, 2054815], 'target_value_ids': [[187543], [187543], [187543], [187543], [187543], [187543, 1223138], [187543], [713893], [2054815], [2054815], [2054815], [1042730]]}
query: Pit Boss Pit Boss Pro Series 4 Series Vertical Smoker ; Pellet Smokers
target_item: Eaton 15-Amp Residential Duplex Outlet, White ; Electrical Outlets
neg_item: RACO Gray Metal New Work Deep Square Ceiling/Wall Electrical Box ; Electrical Boxes

query: Pit Boss Pit Boss Pro Series 4 Series Vertical Smoker ; Pellet Smokers
target_item: Eaton 15-Amp Residential Duplex Outlet, White ; Electrical Outlets
neg_item: CARLON 4-Gang Blue Plastic New Work Stand