In [17]:
import os 
import pickle 

import pandas as pd
import numpy as np

# Root folder holding every train/test artifact read by this notebook.
in_dir = "/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/"

# Training interactions: one CSV per task, always handled in the order
# (similar-rec, complementary-rec, search). Column 0 of each CSV is the index.
data_fns = [
    os.path.join(in_dir, csv_name)
    for csv_name in ("train_sim_recs.csv", "train_compl_recs.csv", "train_searchs.csv")
]
datas = [pd.read_csv(csv_path, index_col=0) for csv_path in data_fns]
train_sim_data, train_compl_data, train_search_data = datas

# Test rows are stored as pickles under "selected_test_user", same task order.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
selected_dir = os.path.join(in_dir, "selected_test_user")
data_fns = [
    os.path.join(selected_dir, pkl_name)
    for pkl_name in (
        "selected_sim_data.test.pkl",
        "selected_compl_data.test.pkl",
        "selected_search_data.test.pkl",
    )
]
datas = []
for pkl_path in data_fns:
    with open(pkl_path, "rb") as fin:
        datas.append(pickle.load(fin))
test_sim_data, test_compl_data, test_search_data = datas
datas = None  # release the temporary list

# Entity id -> entity text, parsed from "<id>\t<text>" lines.
root_dir="/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/"
eid_to_text = {}
with open(os.path.join(root_dir, "all_entities.tsv")) as fin:
    for record in fin:
        eid, text = record.strip().split("\t")
        eid_to_text[int(eid)] = text

# Size summary for each task and for the entity table.
print("length of sim_rec train and test = {:,}, {:,}".format(len(train_sim_data), len(test_sim_data)))
print("length of compl_rec train and test = {:,}, {:,}".format(len(train_compl_data), len(test_compl_data)))
print("length of search train and test = {:,}, {:,}".format(len(train_search_data), len(test_search_data)))
print("number of entites = {:,}".format(len(eid_to_text)))

# Every test uid must also appear among the train uids of the same task.
train_test_pairs = (
    (train_sim_data, test_sim_data),
    (train_compl_data, test_compl_data),
    (train_search_data, test_search_data),
)
assert all(
    set(test_df.uid).issubset(set(train_df.uid))
    for train_df, test_df in train_test_pairs
)
print("test users for each data are subset of their corresponding train users.")

  mask |= (ar1 == a)


length of sim_rec train and test = 1,017,800, 10,000
length of compl_rec train and test = 67,310, 10,000
length of search train and test = 13,726,249, 10,000
number of entites = 3,214,651
test users for each data are subset of their corresponding train users.


In [28]:
import random
from tqdm import tqdm
import ujson 

# Build per-user sequential examples for each task and write them out as
# json-lines (train / test) plus a qrels-style file of test positives.
out_dir = os.path.join(in_dir, "sequential_train_test/")
os.makedirs(out_dir, exist_ok=True)

seq_examples_list = []
prefixes_to_datas= {
    os.path.join(out_dir, "search_sequential"): (train_search_data, test_search_data, "urels.search.test.tsv"),
    os.path.join(out_dir, "sim_rec_sequential"): (train_sim_data, test_sim_data, "urels.sim.test.tsv"),
    os.path.join(out_dir, "compl_rec_sequential"): (train_compl_data, test_compl_data, "urels.compl.test.tsv"),
}

# Task name -> (column holding the query/anchor id, column holding the positives).
# Replaces the three copies of the prefix if/elif chain.
task_columns = {
    "search_sequential": ("qid", "rel_pids"),
    "sim_rec_sequential": ("aid", "sim_pids"),
    "compl_rec_sequential": ("aid", "compl_pids"),
}

for prefix, (train_data, test_data, urel_path) in prefixes_to_datas.items():
    task_name = os.path.basename(prefix)
    if task_name not in task_columns:
        raise ValueError(f"{prefix} not valid.")
    query_col, pids_col = task_columns[task_name]

    # Index the test rows by uid once; masking the whole test frame for
    # every train user costs O(#train users * #test rows).
    test_rows_by_uid = {key: rows for key, rows in test_data.groupby("uid")}

    train_seq_examples = []
    test_seq_examples = []
    test_uid_to_pospids = {}
    for uid, group in tqdm(train_data.groupby("uid"), desc=task_name):
        uid = int(uid)
        qids = [int(x) for x in group[query_col]]

        # Keep exactly one randomly chosen positive per interaction.
        # NOTE(security): eval() runs whatever expression is stored in the CSV
        # cell; this is only acceptable because the CSVs are produced locally.
        # Consider ast.literal_eval if the cells are plain list literals.
        rel_pids = [int(random.sample(eval(xs), k=1)[0]) for xs in group[pids_col]]
        assert len(qids) == len(rel_pids) == len(group)

        # Train example: step t predicts rel_pids[t] for query qids[t] given
        # the previous (query, positive) pair, hence the one-step shift.
        query_ids = qids[1:]
        context_key_ids = qids[:-1]
        context_value_ids = rel_pids[:-1]
        target_value_ids = rel_pids[1:]
        assert len(query_ids) == len(context_key_ids) == len(context_value_ids) == len(target_value_ids)

        example = {"uid": uid, "query_ids": query_ids, "context_key_ids": context_key_ids, "context_value_ids": context_value_ids,
                    "target_value_ids": target_value_ids}
        train_seq_examples.append(example)

        # Test example: only for users that have a (single) test row.
        test_row = test_rows_by_uid.get(uid)
        if test_row is None:
            continue
        assert len(test_row) == 1, test_row
        test_record = test_row.iloc[0]
        test_qid = int(test_record[query_col])

        # The whole train history becomes context; the test query is appended last.
        test_query_ids = qids[1:] + [test_qid]
        test_context_key_ids = qids
        test_context_value_ids = rel_pids
        assert len(test_query_ids) == len(test_context_key_ids) == len(test_context_value_ids), (len(test_query_ids),
                                                                                len(test_context_key_ids), len(test_context_value_ids))

        example = {"uid": uid, "query_ids": test_query_ids, "context_key_ids": test_context_key_ids, "context_value_ids": test_context_value_ids}
        test_seq_examples.append(example)

        # assumes the pickled test column already holds an iterable of pids
        # (not its string form) -- TODO confirm against the pickle producer.
        test_uid_to_pospids[uid] = test_record[pids_col]

    with open(prefix + ".train.json", "w") as fout:
        for line in train_seq_examples:
            fout.write(ujson.dumps(line) + "\n")
    with open(prefix + ".test.json", "w") as fout:
        for line in test_seq_examples:
            fout.write(ujson.dumps(line) + "\n")
    # One "<uid>\tQ0\t<pid>\t1" line per test positive.
    with open(os.path.join(out_dir, urel_path), "w") as fout:
        for uid, pos_pids in test_uid_to_pospids.items():
            for pos_pid in pos_pids:
                fout.write(f"{uid}\tQ0\t{pos_pid}\t{1}\n")


search_sequential: 100%|██████████| 815832/815832 [09:31<00:00, 1428.49it/s]
sim_rec_sequential: 100%|██████████| 81664/81664 [00:57<00:00, 1415.14it/s]
compl_rec_sequential: 100%|██████████| 12628/12628 [00:10<00:00, 1188.69it/s]


In [29]:
def create_neg_value_ids(query_ids, pos_value_ids, miss_qids, sampler=None, num_values=2_000_000):
    """Pick one negative value id for every (query id, positive value id) pair.

    For a query id present in ``sampler`` the negative is drawn from that
    query's candidate list, excluding the positive. Only when the query id is
    missing from ``sampler``, or its list holds nothing but the positive, is
    the negative drawn uniformly from ``range(num_values)``.

    Args:
        query_ids: sequence of query ids.
        pos_value_ids: sequence of positive value ids, aligned with ``query_ids``.
        miss_qids: set that is updated in place with every query id that has
            no entry in ``sampler``.
        sampler: dict mapping query id -> list of candidate value ids.
        num_values: size of the id range used for the random fallback.

    Returns:
        A list of negative value ids, one per input pair; each differs from
        the positive at the same position.

    Raises:
        AssertionError: if ``sampler`` is not a dict or the two id sequences
            differ in length.
    """
    assert isinstance(sampler, dict)
    assert len(query_ids) == len(pos_value_ids)

    def _random_negative(pos_vid):
        # Uniform draw over the whole id range, rejecting the positive.
        neg_vid = random.randrange(num_values)
        while neg_vid == pos_vid:
            neg_vid = random.randrange(num_values)
        return neg_vid

    neg_value_ids = []
    for qid, pos_vid in zip(query_ids, pos_value_ids):
        if qid not in sampler:
            miss_qids.add(qid)
            neg_value_ids.append(_random_negative(pos_vid))
            continue

        candidates = sampler[qid]
        if not candidates:
            neg_value_ids.append(_random_negative(pos_vid))
            continue

        neg_vid = random.choice(candidates)
        if neg_vid == pos_vid:
            # Redraw from the same candidate list instead of silently
            # degrading to a random id, so the negative stays a hard one.
            others = [vid for vid in candidates if vid != pos_vid]
            neg_vid = random.choice(others) if others else _random_negative(pos_vid)
        neg_value_ids.append(neg_vid)

    assert len(neg_value_ids) == len(pos_value_ids)

    return neg_value_ids

# Read the BM25 run file (space-separated, no header row) and collect, for
# every "hid", the list of "tid" candidates it was given.
run_path = os.path.join(in_dir, "runs/bm25.all.run")
run_columns = ["hid", "q0", "tid", "rank", "score", "model_name"]
df = pd.read_csv(run_path, sep=" ", names=run_columns)

bm25_hid_to_tids = {}
ignore_hids = set()
for hid, group in df.groupby("hid"):
    cand_tids = group.tid.values
    if len(cand_tids) >= 10:
        bm25_hid_to_tids[int(hid)] = [int(tid) for tid in cand_tids]
    else:
        # Fewer than 10 candidates: leave this hid out of the sampler.
        ignore_hids.add(int(hid))

print("number of ignore hids = {}".format(len(ignore_hids)))

number of ignore hids = 6644


In [36]:
import ujson
from tqdm import tqdm 
import random

# Relation labels attached to every exported example.
SIM_RELATION = "is_similar_to"
COMPL_RELATION = "is_complementary_to"
REL_RELATION = "is_relevant_to"

in_dir = "/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/"
out_dir = os.path.join(in_dir, "sequential_train_test/")

# One list per (task, split); filled below from the json-lines files.
train_search_examples = []
test_search_examples = []
train_sim_rec_examples = []
test_sim_rec_examples = []
train_compl_rec_examples = []
test_compl_rec_examples = []

# File name -> (examples list, relation label).
suffix_to_examples = {
    "search_sequential.train.json": (train_search_examples,REL_RELATION),
    "search_sequential.test.json": (test_search_examples,REL_RELATION) ,
    "sim_rec_sequential.train.json": (train_sim_rec_examples,SIM_RELATION),
    "sim_rec_sequential.test.json": (test_sim_rec_examples,SIM_RELATION),
    "compl_rec_sequential.train.json": (train_compl_rec_examples,COMPL_RELATION),
    "compl_rec_sequential.test.json": (test_compl_rec_examples,COMPL_RELATION),
}

# Full path under out_dir -> the same list objects as above.
fn_to_example = {
    os.path.join(out_dir, suffix): examples
    for suffix, (examples, _relation) in suffix_to_examples.items()
}

# Parse each json-lines file into its list, one example per line.
for fn, data_examples in fn_to_example.items():
    with open(fn) as fin:
        data_examples.extend(ujson.loads(raw_line) for raw_line in fin)

# Query ids that had no BM25 candidates; filled in while sampling negatives.
miss_hids = set()

# For every history length and negative-sampling strategy, write a truncated
# copy of each sequential train/test file into its own sub-directory of out_dir.
history_lengths = [4, 8]
for hist_len in history_lengths:
    for dest_signature in ["hlen_{}_randneg".format(hist_len), "hlen_{}_bm25".format(hist_len)]:
        dest_dir = os.path.join(out_dir, dest_signature)
        os.makedirs(dest_dir, exist_ok=True)
        for suffix, (data_examples, relation) in suffix_to_examples.items():
            dest_fn = os.path.join(dest_dir, suffix)

            # Decide train vs. test once per file. The original
            # `elif "test.json":` was always true, so an invalid suffix was
            # silently treated as a test file and the error branch was dead.
            is_train = "train.json" in dest_fn
            is_test = "test.json" in dest_fn
            if not (is_train or is_test):
                raise ValueError(f"{suffix} is not valid.")

            with open(dest_fn, "w") as fout:
                for example in tqdm(data_examples, desc=suffix):
                    # Keep the last `hist_len` positions for train and one more
                    # for test. Presumably the extra slot is the held-out test
                    # query appended when the test file was built -- confirm.
                    if is_train:
                        start_idx = max(0, len(example["query_ids"])-hist_len)
                    else:
                        start_idx = max(0, len(example["query_ids"])-hist_len-1)

                    query_ids = example["query_ids"][start_idx:]
                    context_key_ids = example["context_key_ids"][start_idx:]
                    context_value_ids = example["context_value_ids"][start_idx:]
                    if is_train:
                        target_value_ids = example["target_value_ids"][start_idx:]
                        if "randneg" in dest_signature:
                            # NOTE: these random ids are not checked against the positives.
                            neg_value_ids = random.sample(range(2_000_000), k=len(target_value_ids))
                        elif "bm25" in dest_signature:
                            if "sim_rec_sequential" in suffix:
                                # The similar-rec task uses random negatives even in the bm25 variant.
                                neg_value_ids = random.sample(range(2_000_000), k=len(target_value_ids))
                            elif "search_sequential" in suffix or "compl_rec_sequential" in suffix:
                                neg_value_ids = create_neg_value_ids(query_ids=query_ids,
                                                                     pos_value_ids=target_value_ids,
                                                                     miss_qids=miss_hids,
                                                                     sampler=bm25_hid_to_tids)
                            else:
                                # Previously fell through, leaving neg_value_ids
                                # unbound or stale from the previous example.
                                raise ValueError(f"{suffix} is not valid.")
                        else:
                            raise ValueError(f"dest signature: {dest_signature} is not valid.")
                        dest_example = {"uid": example["uid"], "query_ids": query_ids, "context_key_ids": context_key_ids,
                                    "context_value_ids": context_value_ids,
                                    "target_value_ids": target_value_ids, "neg_value_ids": neg_value_ids, "relation": relation}
                    else:
                        dest_example = {"uid": example["uid"], "query_ids": query_ids, "context_key_ids": context_key_ids,
                                    "context_value_ids": context_value_ids, "relation": relation}
                    fout.write(ujson.dumps(dest_example) + "\n")
            if "bm25" in dest_signature:
                if "search_sequential" in suffix or "compl_rec_sequential" in suffix:
                    # miss_hids is never reset, so this count is cumulative
                    # across all files and signatures processed so far.
                    print("bm25 suffix: {}'s miss_hids = {}".format(suffix, len(miss_hids)))


# Reload the entity id -> text table from "<id>\t<text>" lines.
root_dir="/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/"
eid_to_text = {}
with open(os.path.join(root_dir, "all_entities.tsv")) as fin:
    for record in fin:
        eid, text = record.strip().split("\t")
        eid_to_text[int(eid)] = text

# Context-free test files: each line is "<uid>\t<text of last query id>\t<relation>".
dest_dir = os.path.join(out_dir, "without_context/")
dest_dir_present = os.path.exists(dest_dir)
if not dest_dir_present:
    os.mkdir(dest_dir)

fn_to_example = {
    os.path.join(dest_dir, "uid_anchors.test.sim.tsv"): (test_sim_rec_examples, SIM_RELATION),
    os.path.join(dest_dir, "uid_anchors.test.compl.tsv"): (test_compl_rec_examples, COMPL_RELATION),
    os.path.join(dest_dir, "uid_queries.test.search.tsv"): (test_search_examples, REL_RELATION)
}
for fn, (test_examples, relation) in fn_to_example.items():
    with open(fn, "w") as fout:
        for example in test_examples:
            uid = example["uid"]
            last_query_id = example["query_ids"][-1]
            query = eid_to_text[last_query_id]
            fout.write(f"{uid}\t{query}\t{relation}\n")
        

search_sequential.train.json: 100%|██████████| 815832/815832 [00:10<00:00, 79697.56it/s]
search_sequential.test.json: 100%|██████████| 10000/10000 [00:00<00:00, 182879.47it/s]
sim_rec_sequential.train.json: 100%|██████████| 81664/81664 [00:00<00:00, 83854.00it/s]
sim_rec_sequential.test.json: 100%|██████████| 10000/10000 [00:00<00:00, 219623.51it/s]
compl_rec_sequential.train.json: 100%|██████████| 12628/12628 [00:00<00:00, 87230.37it/s]
compl_rec_sequential.test.json: 100%|██████████| 10000/10000 [00:00<00:00, 202789.92it/s]
search_sequential.train.json: 100%|██████████| 815832/815832 [00:18<00:00, 43379.93it/s]


bm25 suffix: search_sequential.train.json's miss_hids = 25114


search_sequential.test.json: 100%|██████████| 10000/10000 [00:00<00:00, 211321.24it/s]


bm25 suffix: search_sequential.test.json's miss_hids = 25114


sim_rec_sequential.train.json: 100%|██████████| 81664/81664 [00:01<00:00, 75563.93it/s]
sim_rec_sequential.test.json: 100%|██████████| 10000/10000 [00:00<00:00, 152583.95it/s]
compl_rec_sequential.train.json: 100%|██████████| 12628/12628 [00:00<00:00, 49535.67it/s]


bm25 suffix: compl_rec_sequential.train.json's miss_hids = 25114


compl_rec_sequential.test.json: 100%|██████████| 10000/10000 [00:00<00:00, 226594.20it/s]


bm25 suffix: compl_rec_sequential.test.json's miss_hids = 25114


search_sequential.train.json: 100%|██████████| 815832/815832 [00:14<00:00, 55988.87it/s]
search_sequential.test.json: 100%|██████████| 10000/10000 [00:00<00:00, 169429.86it/s]
sim_rec_sequential.train.json: 100%|██████████| 81664/81664 [00:01<00:00, 59523.23it/s]
sim_rec_sequential.test.json: 100%|██████████| 10000/10000 [00:00<00:00, 175142.87it/s]
compl_rec_sequential.train.json: 100%|██████████| 12628/12628 [00:00<00:00, 78450.46it/s]
compl_rec_sequential.test.json: 100%|██████████| 10000/10000 [00:00<00:00, 218129.55it/s]
search_sequential.train.json: 100%|██████████| 815832/815832 [00:32<00:00, 25448.43it/s]


bm25 suffix: search_sequential.train.json's miss_hids = 33608


FileNotFoundError: [Errno 2] No such file or directory: '/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/sequential_train_test/hlen_8_bm25/search_sequential.test.json'

In [7]:
# The triple-quoted block below is a disabled earlier check kept as a bare
# string statement: it is never executed. It listed the "search" files (skipping
# names containing "small") in out_dir/hlen_4_randneg and in out_dir itself.
"""
check_dir = os.path.join(out_dir, "hlen_4_randneg")

for fn in os.listdir(check_dir):
    if "search" not in fn or "small" in fn:
        continue 
    fn = os.path.join(check_dir, fn)
    
    ! wc -l $fn 
    ! head -n 2 $fn
    print(100*"=")

for fn in os.listdir(out_dir):
    if "search" not in fn or "small" in fn:
        continue 
    fn = os.path.join(out_dir, fn)
    
    ! wc -l $fn 
    ! head -n 2 $fn
    print(100*"=")

print(100*"=.")
"""
# Active check: for every file in out_dir/without_context, show its line count
# and first two lines, followed by a separator row.
check_dir = os.path.join(out_dir, "without_context")

for fn in os.listdir(check_dir):
    fn = os.path.join(check_dir, fn)
    
    # The "!" lines are IPython shell escapes; $fn expands to the Python variable fn.
    ! wc -l $fn 
    ! head -n 2 $fn
    print(100*"=")


12628 /work/hzeng_umass_edu/ir-research/joint_modeling_search_and_rec/datasets/unified_kgc/unified_user/sequential_train_test/without_context/uid_anchors.test.compl.tsv
6	General Shale Providence series 50-Pack Carbon 1/2-in x 8-in Tumbled Ceramic Brick Look Wall Tile ; Tile	is_complementary_to
92	Legrand Plastic RCA to F-Type Wall Jack ; Audio & Video Wall Jacks	is_complementary_to
10000 /work/hzeng_umass_edu/ir-research/joint_modeling_search_and_rec/datasets/unified_kgc/unified_user/sequential_train_test/without_context/uid_anchors.test.sim.small.tsv
376229	LG Smart Wi-Fi Enabled 4.5-cu ft High Efficiency Stackable Steam Cycle Front-Load Washer (Graphite Steel) ENERGY STAR ; Front-Load Washers	is_similar_to
490831	ClosetMaid BrightWood 5-ft to 10-ft W x 6.85-ft H White Wood Closet Kit ; Wood Closet Kits	is_similar_to
10000 /work/hzeng_umass_edu/ir-research/joint_modeling_search_and_rec/datasets/unified_kgc/unified_user/sequential_train_test/without_context/uid_queries.test.search.sma

In [13]:
# Spot-check: print every test search example of one user, then the text of
# one entity id.
uid = 146571
examples_for_uid = (ex for ex in test_search_examples if ex["uid"] == uid)
for example in examples_for_uid:
    print(example)
print(eid_to_text[2809002])

{'uid': 146571, 'query_ids': [2981289, 2658084, 2619697, 2438241, 2438241, 3012079, 2494185, 2730445, 2282185, 2650790, 3012079, 3012079, 3059960, 3170758, 2809002], 'context_key_ids': [2467143, 2981289, 2658084, 2619697, 2438241, 2438241, 3012079, 2494185, 2730445, 2282185, 2650790, 3012079, 3012079, 3059960, 3170758], 'context_value_ids': [420438, 1470982, 1430343, 200653, 2251545, 1302684, 1921262, 1276207, 1572750, 656803, 804622, 1545826, 1921262, 1461566, 555962]}
bathroom exhaust fan motor


In [4]:
# Spot-check: print the text of a few hand-picked query / context / target ids.
# The closing quote of the path literal was missing, which is a SyntaxError.
root_dir="/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/"
# Refresh eid_to_text in place; the dict itself must already exist
# (this cell does not create it).
with open(os.path.join(root_dir, "all_entities.tsv")) as fin:
    for line in fin:
        eid, text = line.strip().split("\t")
        eid_to_text[int(eid)] = text

query_ids = [2417516,2374200, 2576318, 2836633 ]
context_value_ids = [202549,1582436,1400531,520284]
target_value_ids = [1582436,1400531,520284,27714]

# The three lists are walked in step: one query with its context and target id.
for qid, context_pid, target_pid in zip(query_ids, context_value_ids, target_value_ids):
    print("qid: {}\n ctxpid : {} \n relpid : {}".format(eid_to_text[qid],eid_to_text[context_pid], eid_to_text[target_pid]))
    print("="*75)
print("hi")  # leftover end-of-cell marker

qid: sputnik light
 ctxpid : ReliaBilt 30001 Series 31.75-in x 15.75-in x 3-in Jamb Tilting Vinyl Replacement White Basement Hopper Window ; Basement Hopper Windows 
 relpid : allen + roth Grayford 9-Light Brushed Nickel Mid-century Sputnik Pendant Light ; Pendant Lighting
qid: pantry cabinet
 ctxpid : allen + roth Grayford 9-Light Brushed Nickel Mid-century Sputnik Pendant Light ; Pendant Lighting 
 relpid : Project Source 18-in W x 84-in H x 23.75-in D Natural Unfinished Oak Door Pantry Fully Assembled Stock Cabinet (Square Door Style) ; Kitchen Cabinets
qid: linoleum sheet flooring
 ctxpid : Project Source 18-in W x 84-in H x 23.75-in D Natural Unfinished Oak Door Pantry Fully Assembled Stock Cabinet (Square Door Style) ; Kitchen Cabinets 
 relpid : Armstrong Flooring Pickwick Landing I 12-ft W Cut-to-Length Bear Path Oak Dark Brown Wood Look Low-Gloss Finish Sheet Vinyl ; Sheet Vinyl (Cut-to-Length)
qid: shop heater
 ctxpid : Armstrong Flooring Pickwick Landing I 12-ft W Cut-to-Len