In [4]:
import os 
import pickle as pkl

import pandas as pd
import numpy as np

# Root of the Amazon review dumps; narrowed to a single product category below.
in_dir = "/home/jupyter/unity_jointly_rec_and_search/datasets/amazon_review_datasets/"
dataset_name = "Cell_Phones_and_Accessories"
in_dir = os.path.join(in_dir, dataset_name)

# Pickled train / test review tables for the search task.
# NOTE(review): unpickling runs arbitrary code -- only load files you trust.
train_pkl_path = os.path.join(in_dir, "train_user_review_df.pkl")
with open(train_pkl_path, "rb") as fin:
    train_search_data = pkl.load(fin)

test_pkl_path = os.path.join(in_dir, "test_user_review_df.pkl")
with open(test_pkl_path, "rb") as fin:
    test_search_data = pkl.load(fin)

# all_entities.tsv is read as "<integer entity id>\t<text>" records, one per line.
eid_to_text = {}
entities_path = os.path.join(in_dir, "all_entities.tsv")
with open(entities_path) as fin:
    for record in fin:
        # Exactly two tab-separated fields are expected; anything else fails the unpack.
        eid, text = record.strip().split("\t")
        eid_to_text[int(eid)] = text

split_sizes_msg = "length of search train and test = {:,}, {:,}".format(
    len(train_search_data), len(test_search_data)
)
print(split_sizes_msg)
print("number of entites = {:,}".format(len(eid_to_text)))


length of search train and test = 1,368,611, 215,338
number of entites = 534,613


In [8]:
import random
from tqdm import tqdm
import ujson 

# Build per-user sequential examples for the search task and write:
#   <prefix>.train.json  -- one JSON object per user with shifted query/context/target lists
#   <prefix>.test.json   -- one JSON object per user that also has a held-out test row
#   <urel_path>          -- "<uid>\tQ0\t<pid>\t1" relevance lines for the held-out items
out_dir = os.path.join(in_dir, "sequential_train_test/")
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

seq_examples_list = []  # not used in this cell; kept in case other cells refer to it
prefixes_to_datas = {
    os.path.join(out_dir, "search_sequential"): (train_search_data, test_search_data, "urels.search.test.tsv"),
}

for prefix, (train_data, test_data, urel_path) in prefixes_to_datas.items():
    # Only the search task is supported; fail before doing any work for other prefixes.
    if "search_sequential" not in prefix:
        raise ValueError(f"{prefix} not valid.")

    # Index the held-out rows by uid once. Filtering `test_data` with a boolean mask
    # inside the per-user loop scans the whole test table for every user.
    test_rows_by_uid = {}
    for t_uid, t_qid, t_pid in zip(test_data.uid, test_data.qid, test_data.pid):
        test_rows_by_uid.setdefault(t_uid, []).append((t_qid, t_pid))

    train_seq_examples = []
    test_seq_examples = []
    test_uid_to_pospids = {}
    for uid, group in tqdm(train_data.groupby("uid"), desc=os.path.basename(prefix)):
        # Interaction order is whatever row order the group has in `train_data`.
        qids = [int(x) for x in group.qid]
        rel_pids = [int(x) for x in group.pid]
        assert len(qids) == len(rel_pids) == len(group)

        uid = int(uid)

        # Position i of the training example pairs the i-th (query, item) as context
        # with the (i+1)-th query and its item as the target; all four lists
        # therefore have len(qids) - 1 entries.
        train_seq_examples.append({
            "uid": uid,
            "query_ids": qids[1:],
            "context_key_ids": qids[:-1],
            "context_value_ids": rel_pids[:-1],
            "target_value_ids": rel_pids[1:],
        })

        # Users without a held-out row contribute a training example only.
        held_out = test_rows_by_uid.get(uid)
        if not held_out:
            continue
        assert len(held_out) == 1, held_out
        raw_qid, raw_pid = held_out[0]
        test_qid = int(raw_qid)

        # The test example uses the full training history as context and appends
        # the held-out query as the final query.
        test_query_ids = qids[1:] + [test_qid]
        assert len(test_query_ids) == len(qids) == len(rel_pids), (
            len(test_query_ids), len(qids), len(rel_pids))
        test_seq_examples.append({
            "uid": uid,
            "query_ids": test_query_ids,
            "context_key_ids": qids,
            "context_value_ids": rel_pids,
        })
        test_uid_to_pospids[uid] = int(raw_pid)

    with open(prefix + ".train.json", "w") as fout:
        for line in train_seq_examples:
            fout.write(ujson.dumps(line) + "\n")
    with open(prefix + ".test.json", "w") as fout:
        for line in test_seq_examples:
            fout.write(ujson.dumps(line) + "\n")
    with open(os.path.join(out_dir, urel_path), "w") as fout:
        for uid, pos_pid in test_uid_to_pospids.items():
            fout.write(f"{uid}\tQ0\t{pos_pid}\t1\n")
    

search_sequential: 100%|██████████| 215338/215338 [03:23<00:00, 1056.17it/s]


In [9]:
# The first tab-separated column of every line in collection_title.tsv is parsed
# as an integer id and collected, in file order, into PIDS.
collection_path = os.path.join(in_dir, "collection_title.tsv")
with open(collection_path) as fin:
    PIDS = [int(row.strip().split("\t")[0]) for row in fin]
print(f"max pids = {max(PIDS)}")

def _draw_pid_excluding(pool, excluded, max_tries=1000):
    """Return a random element of `pool` that differs from `excluded`.

    Raises ValueError when `pool` is empty or no alternative is found within
    `max_tries` draws (e.g. the pool only contains `excluded`), instead of
    looping forever.
    """
    if len(pool) == 0:
        raise ValueError("cannot sample a negative from an empty pid pool")
    for _ in range(max_tries):
        candidate = random.choice(pool)
        if candidate != excluded:
            return candidate
    raise ValueError(f"no pid different from {excluded} found in the pool")


def create_neg_value_ids(query_ids, pos_value_ids, miss_qids, sampler=None, all_pids=None):
    """Pick one negative item id for every (query id, positive item id) pair.

    Args:
        query_ids: query ids, aligned position by position with `pos_value_ids`.
        pos_value_ids: the positive item id of each query; never returned as
            the negative for that position.
        miss_qids: set that is updated in place with every query id that has
            no entry in `sampler`.
        sampler: dict mapping a query id to a sequence of candidate negative
            ids. `None` is treated as an empty dict, so every negative is then
            drawn at random.
        all_pids: sequence of item ids used for random negatives. Defaults to
            the module-level `PIDS` list.

    Returns:
        A list of negative ids with the same length as `query_ids`.

    Raises:
        AssertionError: if `sampler` is not a dict or the inputs differ in length.
        ValueError: if no id different from the positive can be drawn.
    """
    if sampler is None:
        sampler = {}
    assert isinstance(sampler, dict)
    assert len(query_ids) == len(pos_value_ids)

    # Random negatives are real ids taken from the pool, not positions in it.
    pool = PIDS if all_pids is None else all_pids

    neg_value_ids = []
    for qid, pos_vid in zip(query_ids, pos_value_ids):
        if qid not in sampler:
            miss_qids.add(qid)
            neg_value_ids.append(_draw_pid_excluding(pool, pos_vid))
            continue

        candidates = sampler[qid]
        neg_vid = random.choice(candidates) if len(candidates) > 0 else pos_vid
        if neg_vid == pos_vid:
            # Collision with the positive: stay within this query's candidates
            # when possible, and only fall back to a random id when the
            # candidates offer nothing else.
            remaining = [cand for cand in candidates if cand != pos_vid]
            if remaining:
                neg_vid = random.choice(remaining)
            else:
                neg_vid = _draw_pid_excluding(pool, pos_vid)
        neg_value_ids.append(neg_vid)

    assert len(neg_value_ids) == len(pos_value_ids)

    return neg_value_ids

# Read the BM25 run file (space separated, six columns per line) and collect,
# per "hid", the list of retrieved "tid" values in file order.
run_path = os.path.join(in_dir, "runs/bm25.all.run")
run_columns = ["hid", "q0", "tid", "rank", "score", "model_name"]
df = pd.read_csv(run_path, sep=" ", names=run_columns)

bm25_hid_to_tids = {}
ignore_hids = set()
for hid, group in df.groupby("hid"):
    hid_key = int(hid)
    raw_tids = group["tid"].values
    if len(raw_tids) >= 10:
        bm25_hid_to_tids[hid_key] = [int(tid) for tid in raw_tids]
    else:
        # Fewer than 10 candidates: leave the id out of the sampler and record it.
        ignore_hids.add(hid_key)

print("number of ignore hids = {}".format(len(ignore_hids)))

max pids = 533870
number of ignore hids = 0


In [10]:
import ujson
from tqdm import tqdm 
import random

REL_RELATION = "is_relevant_to"

# Read the JSON-lines files back from disk into two in-memory lists.
out_dir = os.path.join(in_dir, "sequential_train_test/")
train_search_examples = []
test_search_examples = []
fn_to_example = {
    os.path.join(out_dir, "search_sequential.train.json"): train_search_examples,
    os.path.join(out_dir, "search_sequential.test.json"): test_search_examples,
}

for fn, data_examples in fn_to_example.items():
    with open(fn) as fin:
        # One JSON object per line; appended to the list bound to this file name.
        data_examples.extend(ujson.loads(raw) for raw in fin)
            
# For every history length, write truncated copies of the train / test examples
# into <out_dir>/hlen_<N>_bm25/. Training examples additionally get one negative
# item per position, sampled via `create_neg_value_ids` from the BM25 candidates.
miss_hids = set()  # query ids without BM25 candidates; shared across all runs below

suffix_to_examples = {
    "search_sequential.train.json": (train_search_examples, REL_RELATION),
    "search_sequential.test.json": (test_search_examples, REL_RELATION),
}

history_lengths = [4, 8]
for hist_len in history_lengths:
    for dest_signature in ["hlen_{}_bm25".format(hist_len)]:
        dest_dir = os.path.join(out_dir, dest_signature)
        if not os.path.exists(dest_dir):
            os.mkdir(dest_dir)
        for suffix, (data_examples, relation) in suffix_to_examples.items():
            dest_fn = os.path.join(dest_dir, suffix)

            # Decide the split once, from the file name itself. A bare
            # `elif "test.json":` is always true, which makes the error branch
            # for unknown suffixes unreachable.
            is_train = "train.json" in suffix
            is_test = "test.json" in suffix
            if not (is_train or is_test):
                raise ValueError(f"{suffix} is not valid.")
            if is_train:
                # Negatives can only be built for the BM25 / search configuration.
                if "bm25" not in dest_signature:
                    raise ValueError(f"dest signature: {dest_signature} is not valid.")
                if "search_sequential" not in suffix:
                    raise ValueError(f"suffix: {suffix} is not valid.")

            # Train keeps the last `hist_len` positions, test keeps `hist_len + 1`.
            # NOTE(review): presumably the extra test position accounts for the
            # held-out query appended at the end -- confirm.
            keep_last = hist_len if is_train else hist_len + 1

            with open(dest_fn, "w") as fout:
                for example in tqdm(data_examples, desc=suffix):
                    start_idx = max(0, len(example["query_ids"]) - keep_last)

                    query_ids = example["query_ids"][start_idx:]
                    context_key_ids = example["context_key_ids"][start_idx:]
                    context_value_ids = example["context_value_ids"][start_idx:]

                    if is_train:
                        target_value_ids = example["target_value_ids"][start_idx:]
                        neg_value_ids = create_neg_value_ids(query_ids=query_ids,
                                                             pos_value_ids=target_value_ids,
                                                             miss_qids=miss_hids,
                                                             sampler=bm25_hid_to_tids)
                        dest_example = {"uid": example["uid"], "query_ids": query_ids,
                                        "context_key_ids": context_key_ids,
                                        "context_value_ids": context_value_ids,
                                        "target_value_ids": target_value_ids,
                                        "neg_value_ids": neg_value_ids, "relation": relation}
                    else:
                        dest_example = {"uid": example["uid"], "query_ids": query_ids,
                                        "context_key_ids": context_key_ids,
                                        "context_value_ids": context_value_ids, "relation": relation}
                    fout.write(ujson.dumps(dest_example) + "\n")

            if "bm25" in dest_signature:
                if "search_sequential" in suffix or "compl_rec_sequential" in suffix:
                    print("bm25 suffix: {}'s miss_hids = {}".format(suffix, len(miss_hids)))


# Write "<uid>\t<text of the last query>\t<relation>" for every test example.
dest_dir = os.path.join(out_dir, "without_context/")
if not os.path.exists(dest_dir):
    os.mkdir(dest_dir)
fn_to_example = {
    os.path.join(dest_dir, "uid_queries.test.search.tsv"): (test_search_examples, REL_RELATION)
}
for fn, (test_examples, relation) in fn_to_example.items():
    with open(fn, "w") as fout:
        for example in test_examples:
            uid = example["uid"]
            # Only the final query id of the example is looked up and written.
            last_query_id = example["query_ids"][-1]
            query = eid_to_text[last_query_id]
            fout.write("\t".join([str(uid), str(query), str(relation)]) + "\n")
        

search_sequential.train.json: 100%|██████████| 215338/215338 [00:03<00:00, 58500.74it/s]


bm25 suffix: search_sequential.train.json's miss_hids = 0


search_sequential.test.json: 100%|██████████| 215338/215338 [00:00<00:00, 219833.84it/s]


bm25 suffix: search_sequential.test.json's miss_hids = 0


search_sequential.train.json: 100%|██████████| 215338/215338 [00:04<00:00, 48351.11it/s]


bm25 suffix: search_sequential.train.json's miss_hids = 0


search_sequential.test.json: 100%|██████████| 215338/215338 [00:01<00:00, 209526.29it/s]


bm25 suffix: search_sequential.test.json's miss_hids = 0


In [16]:
# Inspection cell: prints the line count and the first two lines of generated files.
# `check_dir` is only referenced inside the string-quoted (disabled) snippets below;
# the active loop lists `out_dir` directly.
check_dir = os.path.join(out_dir, "hlen_4_bm25")
# The triple-quoted string below is disabled code (it is a bare string expression,
# not executed): it would inspect the search training file(s) inside `check_dir`.
"""
for fn in os.listdir(check_dir):
    fn = os.path.join(check_dir, fn)
    if ".test.json" in fn:
        continue
    if "search_sequential" not in fn:
        continue
    ! wc -l $fn 
    ! head -n 2 $fn
    print(100*"=")

"""

# Active check: every entry of `out_dir` whose name contains "search" and does not
# contain "small" is passed to the shell commands `wc -l` and `head -n 2`
# (IPython `!` syntax, `$fn` is interpolated from the Python variable).
for fn in os.listdir(out_dir):
    if "search" not in fn or "small" in fn:
        continue 
    fn = os.path.join(out_dir, fn)
    
    ! wc -l $fn 
    ! head -n 2 $fn
    print(100*"=")

print(100*"=.")
# Second disabled snippet (same checks for the "without_context" directory).
# Being the last expression of the cell, this bare string is what the cell
# displays as its result.
"""
check_dir = os.path.join(out_dir, "without_context")

for fn in os.listdir(check_dir):
    fn = os.path.join(check_dir, fn)
    
    ! wc -l $fn 
    ! head -n 2 $fn
    print(100*"=")
"""

215338 /home/jupyter/unity_jointly_rec_and_search/datasets/amazon_review_datasets/Cell_Phones_and_Accessories/sequential_train_test/search_sequential.train.json
{"uid":0,"query_ids":[533872,533879,533872],"context_key_ids":[533872,533872,533879],"context_value_ids":[36258,181323,273874],"target_value_ids":[181323,273874,317230]}
{"uid":1,"query_ids":[533872,533871,533884,533871],"context_key_ids":[533883,533872,533871,533884],"context_value_ids":[29109,123568,266605,60890],"target_value_ids":[123568,266605,60890,193511]}
215338 /home/jupyter/unity_jointly_rec_and_search/datasets/amazon_review_datasets/Cell_Phones_and_Accessories/sequential_train_test/urels.search.test.tsv
0	Q0	306281	1
1	Q0	50876	1
215338 /home/jupyter/unity_jointly_rec_and_search/datasets/amazon_review_datasets/Cell_Phones_and_Accessories/sequential_train_test/search_sequential.test.json
{"uid":0,"query_ids":[533872,533879,533872,533872],"context_key_ids":[533872,533872,533879,533872],"context_value_ids":[36258,181323

'\ncheck_dir = os.path.join(out_dir, "without_context")\n\nfor fn in os.listdir(check_dir):\n    fn = os.path.join(check_dir, fn)\n    \n    ! wc -l $fn \n    ! head -n 2 $fn\n    print(100*"=")\n'

In [18]:
# Spot check 1: print every loaded test example that belongs to one user.
uid = 148
matching_examples = [example for example in test_search_examples if example["uid"] == uid]
for example in matching_examples:
    print(example)

# Spot check 2: show the text behind a few (query, target item, negative item) ids.
# NOTE(review): these ids are hard-coded and are not derived from `uid` above.
query_ids = [533872,533879,533872,533872]
target_vids = [181323,273874,317230, 306281]
neg_vids = [241366,207444,185388, 2222]

triple_template = "query: {}\ntarget_item: {}\nneg_item: {}\n"
separator = 75*"="
for qid, tvid, nvid in zip(query_ids, target_vids, neg_vids):
    query_text = eid_to_text[qid]
    target_text = eid_to_text[tvid]
    neg_text = eid_to_text[nvid]
    print(triple_template.format(query_text, target_text, neg_text))
    print(separator)

{'uid': 148, 'query_ids': [533888, 533871, 533891, 533874, 533879, 533875, 533879, 533879, 533893, 533882, 533904, 533902, 533882, 533879, 533894, 533894], 'context_key_ids': [533888, 533888, 533871, 533891, 533874, 533879, 533875, 533879, 533879, 533893, 533882, 533904, 533902, 533882, 533879, 533894], 'context_value_ids': [192667, 216313, 220932, 470258, 386194, 435309, 435610, 396247, 426200, 488058, 424134, 503340, 295929, 320473, 499373, 375161]}
query: query: cell phones accessories cases holsters sleeves
target_item: product: iPhone 5S Case, Caseology [Fusion Series] Scratch-Resistant Clearback Cover [Black] [Dual Bumper] for Apple iPhone 5S / 5 (2013) &amp; iPhone SE (2016) - Black ; Best brand for cases! This case is not only nice looking but it does what it says it does!! I had it for a year and my phone is still the same as when I put the case on. I have drop it so many times from different heights and different surfaces and my phone was always protected!! It is so nice that