In [17]:
import os 
import pickle 

import pandas as pd
import numpy as np

SIM_RELATION = "is_similar_to"
COMPL_RELATION = "is_complementary_to"
REL_RELATION = "is_relevant_to"


in_dir = "/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/"
    
train_sim_data, train_compl_data, train_search_data = None, None, None
data_fns = [
    os.path.join(in_dir, "train_sim_recs.csv"),
    os.path.join(in_dir, "train_compl_recs.csv"),
    os.path.join(in_dir, "train_searchs.csv"),
]
datas = []
for fn in data_fns:
    datas.append(pd.read_csv(fn, index_col=0))
train_sim_data, train_compl_data, train_search_data = datas


datas = []
test_sim_data, test_compl_data, test_search_data = None, None, None
selected_dir = os.path.join(in_dir, "selected_test_user")
data_fns = [
    os.path.join(selected_dir, "selected_sim_data.test.pkl"),
    os.path.join(selected_dir, "selected_compl_data.test.pkl"),
    os.path.join(selected_dir, "selected_search_data.test.pkl"),
]
for fn in data_fns:
    with open(fn, "rb") as fin:
        datas.append(pickle.load(fin))
test_sim_data, test_compl_data, test_search_data = datas
datas = None

root_dir="/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/"
eid_to_text = {}
with open(os.path.join(root_dir, "all_entities.tsv")) as fin:
    for line in fin:
        eid, text = line.strip().split("\t")
        eid_to_text[int(eid)] = text
        
train_sim_data["relation"] = SIM_RELATION
test_sim_data["relation"] = SIM_RELATION
train_compl_data["relation"] = COMPL_RELATION
test_compl_data["relation"] = COMPL_RELATION
train_search_data["relation"] = REL_RELATION
test_search_data["relation"] = REL_RELATION

train_sim_data.rename({"aid": "hid", "sim_pids": "tids"}, axis=1, inplace=True)
test_sim_data.rename({"aid": "hid", "sim_pids": "tids"}, axis=1, inplace=True)
train_compl_data.rename({"aid": "hid", "compl_pids": "tids"}, axis=1, inplace=True)
test_compl_data.rename({"aid": "hid", "compl_pids": "tids"}, axis=1, inplace=True)
train_search_data.rename({"qid": "hid", "rel_pids": "tids"}, axis=1, inplace=True)
test_search_data.rename({"qid": "hid", "rel_pids": "tids"}, axis=1, inplace=True)

train_merge_data = pd.concat([train_sim_data, train_compl_data, train_search_data])
train_merge_data["date_time"] = pd.to_datetime(train_merge_data["date_time"])
train_merge_data = train_merge_data.sort_values(by=["uid", "date_time"])

print("length of sim_rec train and test = {:,}, {:,}".format(len(train_sim_data), len(test_sim_data)))
print("length of compl_rec train and test = {:,}, {:,}".format(len(train_compl_data), len(test_compl_data)))
print("length of search train and test = {:,}, {:,}".format(len(train_search_data), len(test_search_data)))
print("length of train_merge_data = {:,}".format(len(train_merge_data)))
print("number of entites = {:,}".format(len(eid_to_text)))

assert set(test_sim_data.uid).issubset(set(train_sim_data.uid)) \
and set(test_compl_data.uid).issubset(set(train_compl_data.uid)) \
and set(test_search_data.uid).issubset(set(train_search_data.uid))
assert len(train_merge_data) == len(train_sim_data) + len(train_compl_data) + len(train_search_data)
print("test users for each data are subset of their corresponding train users.")

  mask |= (ar1 == a)


length of sim_rec train and test = 1,017,800, 10,000
length of compl_rec train and test = 67,310, 10,000
length of search train and test = 13,726,249, 10,000
length of train_merge_data = 14,811,359
number of entites = 3,214,651
test users for each data are subset of their corresponding train users.


In [68]:
import random
from tqdm import tqdm
import ujson 

out_dir = os.path.join(in_dir, "mixture_sequential_train_test/")
if not os.path.exists(out_dir):
    os.mkdir(out_dir) 

seq_examples_list = []
prefixes_to_datas= {
    os.path.join(out_dir, "search_sequential"): (train_search_data, test_search_data, "urels.search.test.tsv"),
    os.path.join(out_dir, "sim_rec_sequential"): (train_sim_data, test_sim_data, "urels.sim.test.tsv"),
    os.path.join(out_dir, "compl_rec_sequential"): (train_compl_data, test_compl_data, "urels.compl.test.tsv"),
}

for prefix, (train_data, test_data, urel_path) in prefixes_to_datas.items():
    train_seq_examples = []
    test_seq_examples = []
    test_uid_to_pospids = {}
    for uid, g in tqdm(train_data.groupby("uid"), desc=prefix.split("/")[-1]):
        last_time = g.iloc[-1].date_time
        group = train_merge_data[train_merge_data.uid==uid]
        group = group[group.date_time <= last_time]
        
        qids = list(group.hid)
        group_rel_pids = list(group.tids)
        relations = list(group.relation)
        
        rel_pids = []
        for xs in group_rel_pids:
            rel_pids.append(random.sample(eval(xs), k=1)[0]) # only sample 1 relpid 
        assert len(qids) == len(rel_pids) == len(group)

        uid = int(uid)
        qids = [int(x) for x in qids]
        rel_pids = [int(x) for x in rel_pids]

        query_ids = qids[1:]
        context_key_ids = qids[:-1]
        context_value_ids = rel_pids[:-1]
        target_value_ids = rel_pids[1:]
        relations = relations[1:]
        assert len(query_ids) == len(context_key_ids) == len(context_value_ids) == len(target_value_ids) == len(relations)

        example = {"uid": uid, "query_ids": query_ids, "context_key_ids": context_key_ids, "context_value_ids": context_value_ids,
                    "target_value_ids": target_value_ids, "relations": relations}
        train_seq_examples.append(example)

        # for test
        test_row = test_data[test_data.uid == uid]
        if len(test_row) == 0:
            continue
        assert len(test_row) == 1, test_row
        
        test_qid = int(test_row.iloc[0].hid)
        test_relation = str(test_row.iloc[0].relation)

        test_query_ids = qids[1:] + [test_qid]
        test_context_key_ids = qids 
        test_context_value_ids = rel_pids
        relations = relations[1:] + [test_relation]
        assert len(test_query_ids) == len(test_context_key_ids) == len(test_context_value_ids), (len(test_query_ids), 
                                                                                len(test_context_key_ids), len(test_context_value_ids))

        example = {"uid": uid, "query_ids": test_query_ids, "context_key_ids": test_context_key_ids, 
                   "context_value_ids": test_context_value_ids, "relations": relations}
        test_seq_examples.append(example)

        if "search_sequential" in prefix:
            test_uid_to_pospids[uid] = test_row.iloc[0].tids
        elif "sim_rec_sequential" in prefix:
            test_uid_to_pospids[uid] = test_row.iloc[0].tids
        elif "compl_rec_sequential" in prefix:
            test_uid_to_pospids[uid] = test_row.iloc[0].tids
        else:
            raise ValueError(f"{prefix} not valid.")
        

    with open(prefix + ".train.json", "w") as fout:
        for line in train_seq_examples:
            fout.write(ujson.dumps(line) + "\n")
    with open(prefix + ".test.json", "w") as fout:
        for line in test_seq_examples:
            fout.write(ujson.dumps(line) + "\n")
    with open(os.path.join(out_dir, urel_path), "w") as fout:
        for uid, pos_pids in test_uid_to_pospids.items():
            for pos_pid in pos_pids:
                fout.write(f"{uid}\tQ0\t{pos_pid}\t{1}\n")


search_sequential:   0%|          | 0/815832 [00:04<?, ?it/s]


AssertionError: 

In [65]:
PIDS = []
with open(os.path.join(in_dir, "collection_title.tsv")) as fin:
    for line in fin:
        PIDS.append(int(line.strip().split("\t")[0]))
print(f"max pids = {max(PIDS)}")

def create_neg_value_ids(query_ids, pos_value_ids, relations, miss_qids, sampler=None):
    assert type(sampler) == dict
    assert len(query_ids) == len(pos_value_ids) == len(relations), (len(query_ids), len(pos_value_ids), len(relations))
    neg_value_ids = []
    for qid, pos_vid, relation in zip(query_ids, pos_value_ids, relations):
        if relation == SIM_RELATION:
            neg_ivd = random.sample(range(2_000_000), k=1)[0]
            while neg_vid == pos_vid:
                neg_vid = random.sample(range(2_000_000), k=1)[0]
        elif relation in [COMPL_RELATION, REL_RELATION]:
            if qid not in sampler:
                miss_qids.add(qid)
                neg_vid = random.sample(range(2_000_000), k=1)[0]
                while neg_vid == pos_vid:
                    neg_vid = random.sample(range(2_000_000), k=1)[0]
                neg_value_ids.append(neg_vid)
            else:
                neg_vid = random.sample(sampler[qid], k=1)[0]
                while neg_vid == pos_vid:
                    neg_vid = random.sample(range(2_000_000), k=1)[0]
                neg_value_ids.append(neg_vid)
        else:
            raise ValueError("relation = {} is not valid".format(relation))
    
    assert len(neg_value_ids) == len(pos_value_ids)
    
    return neg_value_ids


run_path = os.path.join(in_dir, "runs/bm25.all.run")
df = pd.read_csv(run_path, sep=" ", names=["hid", "q0", "tid", "rank", "score", "model_name"])
bm25_hid_to_tids = {}
ignore_hids = set()
for hid, group in df.groupby("hid"):
    cand_tids = list(group.tid.values)
    if len(cand_tids) < 10:
        ignore_hids.add(int(hid))
    else:
        bm25_hid_to_tids[int(hid)] = [int(x) for x in cand_tids]
        
print("number of ignore hids = {}".format(len(ignore_hids)))

max pids = 2260877


'\nrun_path = os.path.join(in_dir, "runs/bm25.all.run")\ndf = pd.read_csv(run_path, sep=" ", names=["hid", "q0", "tid", "rank", "score", "model_name"])\nbm25_hid_to_tids = {}\nignore_hids = set()\nfor hid, group in df.groupby("hid"):\n    cand_tids = list(group.tid.values)\n    if len(cand_tids) < 10:\n        ignore_hids.add(int(hid))\n    else:\n        bm25_hid_to_tids[int(hid)] = [int(x) for x in cand_tids]\n        \nprint("number of ignore hids = {}".format(len(ignore_hids)))\n'

In [67]:
import ujson
from tqdm import tqdm 
import random

train_search_examples, test_search_examples, train_sim_rec_examples, test_sim_rec_examples, train_compl_rec_examples, \
test_compl_rec_examples = [],[],[],[],[],[]
fn_to_example = {
    os.path.join(out_dir, "search_sequential.train.json"): train_search_examples,
    os.path.join(out_dir, "search_sequential.test.json"): test_search_examples,
    os.path.join(out_dir, "sim_rec_sequential.train.json"): train_sim_rec_examples,
    os.path.join(out_dir, "sim_rec_sequential.test.json"): test_sim_rec_examples,
    os.path.join(out_dir, "compl_rec_sequential.train.json"): train_compl_rec_examples,
    os.path.join(out_dir, "compl_rec_sequential.test.json"): test_compl_rec_examples,
}

for fn, data_examples in fn_to_example.items():
    with open(fn) as fin:
        for line in fin:
            data_examples.append(ujson.loads(line))

miss_hids = set()

suffix_to_examples = {
    "search_sequential.train.json": (train_search_examples),
    "search_sequential.test.json": (test_search_examples),
    "sim_rec_sequential.train.json": (train_sim_rec_examples),
    "sim_rec_sequential.test.json": (test_sim_rec_examples),
    "compl_rec_sequential.train.json": (train_compl_rec_examples),
    "compl_rec_sequential.test.json": (test_compl_rec_examples),
}

history_lengths = [4, 8]
for hist_len in history_lengths:
    for dest_signature in ["hlen_{}_bm25".format(hist_len)]:
        dest_dir = os.path.join(out_dir, dest_signature)
        if not os.path.exists(dest_dir):
            os.mkdir(dest_dir)
        for suffix, data_examples in suffix_to_examples.items():
            dest_fn = os.path.join(dest_dir, suffix)
            with open(dest_fn, "w") as fout:
                for example in tqdm(data_examples, desc=suffix):
                    if "train.json" in dest_fn:
                        start_idx = max(0, len(example["query_ids"])-hist_len)
                    elif "test.json":
                        start_idx = max(0, len(example["query_ids"])-hist_len-1)
                    else:
                        raise ValueError(f"{suffix} is not valid.")

                    query_ids = example["query_ids"][start_idx:]
                    context_key_ids = example["context_key_ids"][start_idx:]
                    context_value_ids = example["context_value_ids"][start_idx:]
                    relations = example["relations"][start_idx:]
                    print(len(example["query_ids"]), len(example["relations"]))
                    if "train.json" in dest_fn:
                        target_value_ids = example["target_value_ids"][start_idx:]
                        assert "bm25" in dest_signature
                        neg_value_ids = create_neg_value_ids(query_ids=query_ids, 
                                                             pos_value_ids=target_value_ids,
                                                             relations=relations,
                                                             miss_qids=miss_hids, 
                                                             sampler=bm25_hid_to_tids)
                        dest_example = {"uid": example["uid"], "query_ids": query_ids, "context_key_ids": context_key_ids,
                                    "context_value_ids": context_value_ids, 
                                    "target_value_ids": target_value_ids, "neg_value_ids": neg_value_ids, "relations": relations}
                    elif "test.json" in dest_fn:
                        dest_example = {"uid": example["uid"], "query_ids": query_ids, "context_key_ids": context_key_ids,
                                    "context_value_ids": context_value_ids, "relations": relations}
                    else:
                        raise ValueError(f"{suffix} is not valid.")
                    fout.write(ujson.dumps(dest_example) + "\n")
            if "bm25" in dest_signature:
                if "search_sequential" in suffix or "compl_rec_sequential" in suffix:
                    print("bm25 suffix: {}'s miss_hids = {}".format(suffix, len(miss_hids)))

search_sequential.train.json:   0%|          | 0/815832 [00:00<?, ?it/s]

9 10





AssertionError: (4, 4, 5)

In [67]:
#out_dir="/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/sequential_train_test/"
check_dir = os.path.join(out_dir, "hlen_4_bm25")

for fn in os.listdir(check_dir):
    fn = os.path.join(check_dir, fn)
    if ".test.json" in fn:
        continue
    if "search_sequential" not in fn:
        continue
    ! wc -l $fn 
    ! head -n 2 $fn
    print(100*"=")


"""
for fn in os.listdir(out_dir):
    if "search" not in fn or "small" in fn:
        continue 
    fn = os.path.join(out_dir, fn)
    
    ! wc -l $fn 
    ! head -n 2 $fn
    print(100*"=")

print(100*"=.")

check_dir = os.path.join(out_dir, "without_context")

for fn in os.listdir(check_dir):
    fn = os.path.join(check_dir, fn)
    
    ! wc -l $fn 
    ! head -n 2 $fn
    print(100*"=")
"""

815832 /home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/sequential_train_test/hlen_4_bm25/search_sequential.train.json
{"uid":0,"query_ids":[2938044,2441693,2382801,2676693],"context_key_ids":[2791482,2938044,2441693,2382801],"context_value_ids":[1558005,246217,438838,479119],"target_value_ids":[246217,438838,479119,264598],"neg_value_ids":[1554910,1363026,1865963,640542],"relation":"is_relevant_to"}
{"uid":1,"query_ids":[3000257,2736901,3000257,2736901],"context_key_ids":[2461556,3000257,2736901,3000257],"context_value_ids":[618773,2065684,1698588,1846797],"target_value_ids":[2065684,1698588,1846797,1698588],"neg_value_ids":[1851723,2256340,1424601,1445720],"relation":"is_relevant_to"}


'\nfor fn in os.listdir(out_dir):\n    if "search" not in fn or "small" in fn:\n        continue \n    fn = os.path.join(out_dir, fn)\n    \n    ! wc -l $fn \n    ! head -n 2 $fn\n    print(100*"=")\n\nprint(100*"=.")\n\ncheck_dir = os.path.join(out_dir, "without_context")\n\nfor fn in os.listdir(check_dir):\n    fn = os.path.join(check_dir, fn)\n    \n    ! wc -l $fn \n    ! head -n 2 $fn\n    print(100*"=")\n'

In [68]:
uid = 148
for example in test_search_examples:
    if example["uid"] == uid:
        print(example)
query_ids = [2938044,2441693,2382801,2676693]
target_vids = [246217,438838,479119,264598]
neg_vids = [1554910,1363026,1865963,640542]

for qid, tvid, nvid in zip(query_ids, target_vids, neg_vids):
    print("query: {}\ntarget_item: {}\nneg_item: {}\n".format(eid_to_text[qid], eid_to_text[tvid], eid_to_text[nvid]))
    print(75*"=")

{'uid': 148, 'query_ids': [3120745, 2299257, 3110146, 3169024, 2582666, 3191904, 2875418, 2978514, 3156648], 'context_key_ids': [2504195, 3120745, 2299257, 3110146, 3169024, 2582666, 3191904, 2875418, 2978514], 'context_value_ids': [818764, 538461, 1146204, 1738420, 926207, 727621, 2167844, 2217383, 663385]}
query: venom steel gloves
target_item: Venom Steel Unisex 2-Layer Rip Resistant Nitrile Multipurpose Gloves, One Size Fits All (25-Pairs) ; Work Gloves
neg_item: Buckle-Down Venom Face Icon Black/Reds/White Black Dog Collar, Medium (26- 40 Lbs.) ; Pet Collars & Harnesses

query: edger attachment
target_item: TrimmerPlus LE720 Edger Attachment ; String Trimmer Attachments
neg_item: Badger Link-on Cultivator Attachment Tiller Attachment ; String Trimmer Attachments

query: drill bit extensions
target_item: Southwire 3/16-in Round 54-in Drill Bit Extension ; Drill Bit Extensions
neg_item: Drill America 6-in Cobalt Twist Drill Bit ; Twist Drill Bits

query: air compressors
target_item:

In [44]:
root_dir="/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/
with open(os.path.join(root_dir, "all_entities.tsv")) as fin:
    for line in fin:
        eid, text = line.strip().split("\t")
        eid_to_text[int(eid)] = text

query_ids = [2417516,2374200, 2576318, 2836633 ]
context_value_ids = [202549,1582436,1400531,520284]
target_value_ids = [1582436,1400531,520284,27714]

for qid, context_pid, target_pid in zip(query_ids, context_value_ids, target_value_ids):
    print("qid: {}\n ctxpid : {} \n relpid : {}".format(eid_to_text[qid],eid_to_text[context_pid], eid_to_text[target_pid]))
    print("="*75)
print("hi")

SyntaxError: EOL while scanning string literal (1838640968.py, line 1)

In [63]:
! ls /home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/sequential_train_test/

compl_rec_sequential.test.json	 search_sequential.train.json
compl_rec_sequential.train.json  sim_rec_sequential.test.json
hlen_4_bm25			 sim_rec_sequential.train.json
hlen_4_randneg			 urels.compl.test.tsv
hlen_8_bm25			 urels.search.test.tsv
hlen_8_randneg			 urels.sim.test.tsv
search_sequential.test.json	 without_context


In [64]:
! ls /home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/sequential_train_test/hlen_4_bm25/

compl_rec_sequential.test.json	 search_sequential.train.json
compl_rec_sequential.train.json  sim_rec_sequential.test.json
search_sequential.test.json	 sim_rec_sequential.train.json
