In [3]:
import os 
import pickle 

import pandas as pd
import numpy as np

in_dir = "/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/"
    
train_sim_data, train_compl_data, train_search_data = None, None, None
data_fns = [
    os.path.join(in_dir, "train_sim_recs.csv"),
    os.path.join(in_dir, "train_compl_recs.csv"),
    os.path.join(in_dir, "train_searchs.csv"),
]
datas = []
for fn in data_fns:
    datas.append(pd.read_csv(fn, index_col=0))
train_sim_data, train_compl_data, train_search_data = datas


datas = []
test_sim_data, test_compl_data, test_search_data = None, None, None
data_fns = [
    os.path.join(in_dir, "test_sim_recs.csv"),
    os.path.join(in_dir, "test_compl_recs.csv"),
    os.path.join(in_dir, "test_searchs.csv"),
]
datas = []
for fn in data_fns:
    datas.append(pd.read_csv(fn, index_col=0))
test_sim_data, test_compl_data, test_search_data = datas
datas = None

root_dir="/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/"
eid_to_text = {}
with open(os.path.join(root_dir, "all_entities.tsv")) as fin:
    for line in fin:
        eid, text = line.strip().split("\t")
        eid_to_text[int(eid)] = text

print("length of sim_rec train and test = {:,}, {:,}".format(len(train_sim_data), len(test_sim_data)))
print("length of compl_rec train and test = {:,}, {:,}".format(len(train_compl_data), len(test_compl_data)))
print("length of search train and test = {:,}, {:,}".format(len(train_search_data), len(test_search_data)))
print("number of entites = {:,}".format(len(eid_to_text)))

assert set(test_sim_data.uid).issubset(set(train_sim_data.uid)) \
and set(test_compl_data.uid).issubset(set(train_compl_data.uid)) \
and set(test_search_data.uid).issubset(set(train_search_data.uid))
print("test users for each data are subset of their corresponding train users.")

length of sim_rec train and test = 1,017,800, 81,664
length of compl_rec train and test = 67,310, 12,628
length of search train and test = 13,726,249, 815,832
number of entites = 3,214,651
test users for each data are subset of their corresponding train users.


In [15]:
search_uids = np.array(train_search_data.uid)
unique_search_uids, uid_counts = np.unique(search_uids, return_counts=True)
print(np.quantile(a=uid_counts, q=[0.1, 0.25, 0.5, 0.75, 0.9]), np.sort(uid_counts))

sim_uids = np.array(train_sim_data.uid)
unique_sim_uids, sim_uid_counts = np.unique(sim_uids, return_counts=True)
print(np.quantile(a=sim_uid_counts, q=[0.1, 0.25, 0.5, 0.75, 0.9]), np.sort(sim_uid_counts))

compl_uids = np.array(train_compl_data.uid)
unique_compl_uids, compl_uid_counts = np.unique(compl_uids, return_counts=True)
print(np.quantile(a=compl_uid_counts, q=[0.1, 0.25, 0.5, 0.75, 0.9]), np.sort(compl_uid_counts))

[ 9. 10. 13. 18. 28.] [   9    9    9 ... 1840 1955 2520]
[ 9.  9. 11. 13. 17.] [  9   9   9 ... 688 854 886]
[4. 4. 4. 6. 8.] [ 4  4  4 ... 41 43 50]


In [27]:
search_ranges = [
(9,12), (13,16), (17,20), (21, 24), (25, np.max(uid_counts))]

sim_ranges = [
(9,11), (12,14), (15, 17), (18,20), (21, np.max(sim_uid_counts))]   

compl_ranges = [
(4,6), (7,9), (10,12), (13,15), (16, np.max(compl_uid_counts))]

search_uid_groups, sim_uid_groups, compl_uid_groups = [], [], []

# search
for low_num, up_num in search_ranges:
    mask = np.logical_and(uid_counts <= up_num, uid_counts >= low_num)
    search_uid_groups.append(unique_search_uids[mask])
assert np.sum(len(xs) for xs in search_uid_groups) == len(unique_search_uids)

# sim_rec
for low_num, up_num in sim_ranges:
    mask = np.logical_and(sim_uid_counts <= up_num, sim_uid_counts >= low_num)
    sim_uid_groups.append(unique_sim_uids[mask])
assert np.sum(len(xs) for xs in sim_uid_groups) == len(unique_sim_uids)

# compl_rec
for low_num, up_num in compl_ranges:
    mask = np.logical_and(compl_uid_counts <= up_num, compl_uid_counts >= low_num)
    compl_uid_groups.append(unique_compl_uids[mask])
assert np.sum(len(xs) for xs in compl_uid_groups) == len(unique_compl_uids)

  


In [38]:
import random
from tqdm import tqdm
import ujson 


SIM_RELATION = "is_similar_to"
COMPL_RELATION = "is_complementary_to"
REL_RELATION = "is_relevant_to"

MAX_LEN=10
out_dir = os.path.join(in_dir, "users_divided_by_group")
if not os.path.exists(out_dir):
    os.mkdir(out_dir) 

seq_examples_list = []
prefixes_to_datas= {
    os.path.join(out_dir, "search_sequential"): (train_search_data, test_search_data, search_uid_groups, "urels.search.test.tsv"),
    os.path.join(out_dir, "sim_rec_sequential"): (train_sim_data, test_sim_data, sim_uid_groups, "urels.sim.test.tsv"),
    os.path.join(out_dir, "compl_rec_sequential"): (train_compl_data, test_compl_data, compl_uid_groups, "urels.compl.test.tsv"),
}

for prefix, (train_data, test_data, uid_groups, urel_path) in prefixes_to_datas.items():
    for group_id in range(5):
        train_seq_examples = []
        test_seq_examples = []
        test_uid_to_pospids = {}
        for uid, group in tqdm(train_data.groupby("uid"), desc=prefix.split("/")[-1]):
            if uid not in uid_groups[group_id]:
                continue
            if "search_sequential" in prefix:
                qids = list(group.qid)
                group_rel_pids = group.rel_pids 
            elif "sim_rec_sequential" in prefix:
                qids = list(group.aid)
                group_rel_pids = group.sim_pids
            elif "compl_rec_sequential" in prefix:
                qids = list(group.aid)
                group_rel_pids = group.compl_pids
            else:
                raise ValueError(f"{prefix} not valid.")

            rel_pids = []
            for xs in group_rel_pids:
                rel_pids.append(random.sample(eval(xs), k=1)[0]) # only sample 1 relpid 
            assert len(qids) == len(rel_pids) == len(group)

            uid = int(uid)
            qids = [int(x) for x in qids]
            rel_pids = [int(x) for x in rel_pids]

            query_ids = qids[1:]
            context_key_ids = qids[:-1]
            context_value_ids = rel_pids[:-1]
            target_value_ids = rel_pids[1:]
            assert len(query_ids) == len(context_key_ids) == len(context_value_ids) == len(target_value_ids)

            # for test
            test_row = test_data[test_data.uid == uid]
            assert len(test_row) != 0
            assert len(test_row) == 1, test_row

            if "search_sequential" in prefix:
                test_qid = int(test_row.iloc[0].qid)
                relations = len(qids) * [REL_RELATION]
            elif "sim_rec_sequential" in prefix:
                test_qid = int(test_row.iloc[0].aid)
                relations = len(qids) * [SIM_RELATION]
            elif "compl_rec_sequential" in prefix:
                test_qid = int(test_row.iloc[0].aid)
                relations = len(qids) * [COMPL_RELATION]
            else:
                raise ValueError(f"{prefix} not valid.") 

            test_query_ids = qids[1:] + [test_qid]
            test_context_key_ids = qids 
            test_context_value_ids = rel_pids
            assert len(test_query_ids) == len(test_context_key_ids) == len(test_context_value_ids), (len(test_query_ids), 
                                                                                    len(test_context_key_ids), len(test_context_value_ids))
            assert len(relations) == len(test_query_ids)

            example = {"uid": uid, "query_ids": test_query_ids[:MAX_LEN], "context_key_ids": test_context_key_ids[:MAX_LEN], 
                       "context_value_ids": test_context_value_ids[:MAX_LEN], "relations": relations[:MAX_LEN]}
            test_seq_examples.append(example)

            if "search_sequential" in prefix:
                test_uid_to_pospids[uid] = eval(test_row.iloc[0].rel_pids)
            elif "sim_rec_sequential" in prefix:
                test_uid_to_pospids[uid] = eval(test_row.iloc[0].sim_pids)
            elif "compl_rec_sequential" in prefix:
                test_uid_to_pospids[uid] = eval(test_row.iloc[0].compl_pids)
            else:
                raise ValueError(f"{prefix} not valid.")


        with open(prefix + f"_group{group_id}.test.json", "w") as fout:
            for line in test_seq_examples:
                fout.write(ujson.dumps(line) + "\n")
        with open(os.path.join(out_dir, f"group_{group_id}_"+urel_path), "w") as fout:
            for uid, pos_pids in test_uid_to_pospids.items():
                for pos_pid in pos_pids:
                    fout.write(f"{uid}\tQ0\t{pos_pid}\t{1}\n")


search_sequential: 100%|██████████| 815832/815832 [13:21<00:00, 1017.75it/s]
search_sequential: 100%|██████████| 815832/815832 [05:56<00:00, 2290.86it/s]
search_sequential: 100%|██████████| 815832/815832 [03:09<00:00, 4313.65it/s]
search_sequential: 100%|██████████| 815832/815832 [01:52<00:00, 7282.66it/s]
search_sequential: 100%|██████████| 815832/815832 [04:08<00:00, 3281.12it/s]
sim_rec_sequential: 100%|██████████| 81664/81664 [00:53<00:00, 1532.86it/s]
sim_rec_sequential: 100%|██████████| 81664/81664 [00:19<00:00, 4214.93it/s]
sim_rec_sequential: 100%|██████████| 81664/81664 [00:09<00:00, 8994.76it/s] 
sim_rec_sequential: 100%|██████████| 81664/81664 [00:04<00:00, 17315.24it/s]
sim_rec_sequential: 100%|██████████| 81664/81664 [00:06<00:00, 12006.40it/s]
compl_rec_sequential: 100%|██████████| 12628/12628 [00:09<00:00, 1346.19it/s]
compl_rec_sequential: 100%|██████████| 12628/12628 [00:01<00:00, 8334.72it/s]
compl_rec_sequential: 100%|██████████| 12628/12628 [00:00<00:00, 24522.02it/

In [34]:
eval(test_row.iloc[0].rel_pids)

[536313, 36596]

In [36]:
! ls $out_dir

compl_rec_sequential_group0.test.json  group_3_urels.search.test.tsv
compl_rec_sequential_group1.test.json  group_3_urels.sim.test.tsv
compl_rec_sequential_group2.test.json  group_4_urels.compl.test.tsv
compl_rec_sequential_group3.test.json  group_4_urels.search.test.tsv
compl_rec_sequential_group4.test.json  group_4_urels.sim.test.tsv
group_0_urels.compl.test.tsv	       search_sequential_group0.test.json
group_0_urels.search.test.tsv	       search_sequential_group1.test.json
group_0_urels.sim.test.tsv	       search_sequential_group2.test.json
group_1_urels.compl.test.tsv	       search_sequential_group3.test.json
group_1_urels.search.test.tsv	       search_sequential_group4.test.json
group_1_urels.sim.test.tsv	       sim_rec_sequential_group0.test.json
group_2_urels.compl.test.tsv	       sim_rec_sequential_group1.test.json
group_2_urels.search.test.tsv	       sim_rec_sequential_group2.test.json
group_2_urels.sim.test.tsv	       sim_rec_sequential_group3.test.json
group_3_urels.compl.te

In [37]:
out_dir

'/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/users_divided_by_group'