In [None]:
import os 
import pickle 

import pandas as pd
import numpy as np

SIM_RELATION = "is_similar_to"
COMPL_RELATION = "is_complementary_to"
REL_RELATION = "is_relevant_to"

in_dir = "/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/"
    
train_sim_data, train_compl_data, train_search_data = None, None, None
data_fns = [
    os.path.join(in_dir, "train_sim_recs.csv"),
    os.path.join(in_dir, "train_compl_recs.csv"),
    os.path.join(in_dir, "train_searchs.csv"),
]
datas = []
for fn in data_fns:
    datas.append(pd.read_csv(fn, index_col=0))
train_sim_data, train_compl_data, train_search_data = datas


datas = []
test_sim_data, test_compl_data, test_search_data = None, None, None
data_fns = [
    os.path.join(in_dir, "test_sim_recs.csv"),
    os.path.join(in_dir, "test_compl_recs.csv"),
    os.path.join(in_dir, "test_searchs.csv"),
]
datas = []
for fn in data_fns:
    datas.append(pd.read_csv(fn, index_col=0))
test_sim_data, test_compl_data, test_search_data = datas
datas = None

root_dir="/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/"
eid_to_text = {}
with open(os.path.join(root_dir, "all_entities.tsv")) as fin:
    for line in fin:
        eid, text = line.strip().split("\t")
        eid_to_text[int(eid)] = text

train_sim_data["relation"] = SIM_RELATION
test_sim_data["relation"] = SIM_RELATION
train_compl_data["relation"] = COMPL_RELATION
test_compl_data["relation"] = COMPL_RELATION
train_search_data["relation"] = REL_RELATION
test_search_data["relation"] = REL_RELATION

train_sim_data.rename({"aid": "hid", "sim_pids": "tids"}, axis=1, inplace=True)
test_sim_data.rename({"aid": "hid", "sim_pids": "tids"}, axis=1, inplace=True)
train_compl_data.rename({"aid": "hid", "compl_pids": "tids"}, axis=1, inplace=True)
test_compl_data.rename({"aid": "hid", "compl_pids": "tids"}, axis=1, inplace=True)
train_search_data.rename({"qid": "hid", "rel_pids": "tids"}, axis=1, inplace=True)
test_search_data.rename({"qid": "hid", "rel_pids": "tids"}, axis=1, inplace=True)

train_merge_data = pd.concat([train_sim_data, train_compl_data, train_search_data])
train_merge_data["date_time"] = pd.to_datetime(train_merge_data["date_time"])
train_merge_data = train_merge_data.sort_values(by=["uid", "date_time"])

print("length of sim_rec train and test = {:,}, {:,}".format(len(train_sim_data), len(test_sim_data)))
print("length of compl_rec train and test = {:,}, {:,}".format(len(train_compl_data), len(test_compl_data)))
print("length of search train and test = {:,}, {:,}".format(len(train_search_data), len(test_search_data)))
print("length of train_merge_data = {:,}".format(len(train_merge_data)))
print("number of entites = {:,}".format(len(eid_to_text)))

assert set(test_sim_data.uid).issubset(set(train_sim_data.uid)) \
and set(test_compl_data.uid).issubset(set(train_compl_data.uid)) \
and set(test_search_data.uid).issubset(set(train_search_data.uid))
assert len(train_merge_data) == len(train_sim_data) + len(train_compl_data) + len(train_search_data)
print("test users for each data are subset of their corresponding train users.")

In [2]:
search_uids = np.array(train_search_data.uid)
unique_search_uids, uid_counts = np.unique(search_uids, return_counts=True)
print(np.quantile(a=uid_counts, q=[0.02, 0.2, 0.4, 0.6, 0.8, 0.975]), np.sort(uid_counts))

sim_uids = np.array(train_sim_data.uid)
unique_sim_uids, sim_uid_counts = np.unique(sim_uids, return_counts=True)
print(np.quantile(a=sim_uid_counts, q=[0.02, 0.2, 0.4, 0.6, 0.8, 0.975]), np.sort(sim_uid_counts))

compl_uids = np.array(train_compl_data.uid)
unique_compl_uids, compl_uid_counts = np.unique(compl_uids, return_counts=True)
print(np.quantile(a=compl_uid_counts, q=[0.02, 0.2, 0.4, 0.6, 0.8, 0.975]), np.sort(compl_uid_counts))

[ 9. 10. 11. 14. 20. 49.] [   9    9    9 ... 1840 1955 2520]
[ 9.  9. 10. 11. 14. 25.] [  9   9   9 ... 688 854 886]
[ 4.  4.  4.  5.  6. 12.] [ 4  4  4 ... 41 43 50]


In [3]:
search_ranges = [
(9,10), (11,12), (13, 14), (15, 20), (21, np.max(uid_counts))]

sim_ranges = [
(9,10), (11,12), (13, 14), (15,16), (17, np.max(sim_uid_counts))]   

compl_ranges = [
(4,5), (6,7), (8,9), (10,11), (12, np.max(compl_uid_counts))]

search_uid_groups, sim_uid_groups, compl_uid_groups = [], [], []

# search
for (low_num, up_num) in search_ranges:
    mask = np.logical_and(uid_counts <= up_num, uid_counts >= low_num)
    search_uid_groups.append(unique_search_uids[mask])
assert np.sum(len(xs) for xs in search_uid_groups) == len(unique_search_uids)

# sim_rec
for low_num, up_num in sim_ranges:
    mask = np.logical_and(sim_uid_counts <= up_num, sim_uid_counts >= low_num)
    sim_uid_groups.append(unique_sim_uids[mask])
assert np.sum(len(xs) for xs in sim_uid_groups) == len(unique_sim_uids)

# compl_rec
for low_num, up_num itest_data_ranges:
    mask = np.logical_and(compl_uid_counts <= up_num, compl_uid_counts >= low_num)
    compl_uid_groups.append(unique_compl_uids[mask])
assert np.sum(len(xs) for xs in compl_uid_groups) == len(unique_compl_uids)

SyntaxError: invalid syntax (1963251253.py, line 25)

In [None]:
import random
from tqdm import tqdm
import ujson 


SIM_RELATION = "is_similar_to"
COMPL_RELATION = "is_complementary_to"
REL_RELATION = "is_relevant_to"

MAX_LEN=20
out_dir = os.path.join(in_dir, "users_divided_by_group")
if not os.path.exists(out_dir):
    os.mkdir(out_dir) 

seq_examples_list = []
prefixes_to_datas= {
    os.path.join(out_dir, "search_sequential"): (train_search_data, test_search_data, search_uid_groups, "urels.search.test.tsv"),
    os.path.join(out_dir, "sim_rec_sequential"): (train_sim_data, test_sim_data, sim_uid_groups, "urels.sim.test.tsv"),
    os.path.join(out_dir, "compl_rec_sequential"): (train_compl_data, test_compl_data, compl_uid_groups, "urels.compl.test.tsv"),
}

for prefix, (train_data, test_data, uid_groups, urel_path) in prefixes_to_datas.items():
    for group_id in range(5):
        train_seq_examples = []
        test_seq_examples = []
        test_uid_to_pospids = {}
        for uid, g in tqdm(train_data.groupby("uid"), desc=prefix.split("/")[-1]):
            if uid not in uid_groups[group_id]:
                continue
            last_time = g.iloc[-1].date_time
            group = train_merge_data[train_merge_data.uid==uid]
            group = group[group.date_time <= last_time]
            
            qids = list(group.hid)
            group_rel_pids = list(group.tids)
            relations = list(group.relation)

            rel_pids = []
            for xs in group_rel_pids:
                rel_pids.append(random.sample(eval(xs), k=1)[0]) # only sample 1 relpid 
            assert len(qids) == len(rel_pids) == len(group)

            uid = int(uid)
            qids = [int(x) for x in qids]
            rel_pids = [int(x) for x in rel_pids]

            # for test
            test_row = test_data[test_data.uid == uid]
            if len(test_row) == 0:
                continue
            assert len(test_row) == 1, test_row

            test_qid = int(test_row.iloc[0].hid)
            test_relation = str(test_row.iloc[0].relation)

            test_query_ids = qids[1:] + [test_qid]
            test_context_key_ids = qids 
            test_context_value_ids = rel_pids
            relations = relations[1:] + [test_relation]
            assert len(test_query_ids) == len(test_context_key_ids) == len(test_context_value_ids), (len(test_query_ids), 
                                                                                    len(test_context_key_ids), len(test_context_value_ids))
            assert len(test_query_ids) == len(relations), (len(test_query_ids), len(relations))

            example = {"uid": uid, "query_ids": test_query_ids[-MAX_LEN:], "context_key_ids": test_context_key_ids[-MAX_LEN:], 
                       "context_value_ids": test_context_value_ids[-MAX_LEN:], "relations": relations[-MAX_LEN:]}
            test_seq_examples.append(example)

            if "search_sequential" in prefix:
                test_uid_to_pospids[uid] = test_row.iloc[0].tids
            elif "sim_rec_sequential" in prefix:
                test_uid_to_pospids[uid] = test_row.iloc[0].tids
            elif "compl_rec_sequential" in prefix:
                test_uid_to_pospids[uid] = test_row.iloc[0].tids
            else:
                raise ValueError(f"{prefix} not valid.")


        with open(prefix + f"_group{group_id}.test.json", "w") as fout:
            for line in test_seq_examples:
                fout.write(ujson.dumps(line) + "\n")
        with open(os.path.join(out_dir, f"group_{group_id}_"+urel_path), "w") as fout:
            for uid, pos_pids in test_uid_to_pospids.items():
                for pos_pid in pos_pids:
                    fout.write(f"{uid}\tQ0\t{pos_pid}\t{1}\n")


search_sequential: 100%|██████████| 815832/815832 [1:15:55<00:00, 179.10it/s]
search_sequential:  29%|██▉       | 238229/815832 [13:37<35:09, 273.83it/s]  

In [5]:
import random
from tqdm import tqdm
import ujson 


SIM_RELATION = "is_similar_to"
COMPL_RELATION = "is_complementary_to"
REL_RELATION = "is_relevant_to"

MAX_LEN=20
out_dir = os.path.join(in_dir, "users_divided_by_group")
if not os.path.exists(out_dir):
    os.mkdir(out_dir) 

seq_examples_list = []
prefixes_to_datas= {
    os.path.join(out_dir, "search_sequential"): (train_search_data, test_search_data,  "urels.search.test.tsv"),
    os.path.join(out_dir, "sim_rec_sequential"): (train_sim_data, test_sim_data,  "urels.sim.test.tsv"),
    os.path.join(out_dir, "compl_rec_sequential"): (train_compl_data, test_compl_data,  "urels.compl.test.tsv"),
}
for prefix, (train_data, test_data, urel_path) in prefixes_to_datas.items():
    test_uid_to_pospids = {}
    for idx, test_row in tqdm(test_data.iterrows(), total=len(test_data)):
        uid = test_row.uid
            
        if "search_sequential" in prefix:
            test_uid_to_pospids[uid] = eval(test_row.tids)
        elif "sim_rec_sequential" in prefix:
            test_uid_to_pospids[uid] = eval(test_row.tids)
        elif "compl_rec_sequential" in prefix:
            test_uid_to_pospids[uid] = eval(test_row.tids)
        else:
            raise ValueError(f"{prefix} not valid.")
            
    with open(os.path.join(out_dir, urel_path), "w") as fout:
        for uid, pos_pids in test_uid_to_pospids.items():
            for pos_pid in pos_pids:
                fout.write(f"{uid}\tQ0\t{pos_pid}\t{1}\n")

100%|██████████| 815832/815832 [00:56<00:00, 14341.34it/s]
100%|██████████| 81664/81664 [00:05<00:00, 16295.19it/s]
100%|██████████| 12628/12628 [00:00<00:00, 16114.30it/s]


In [13]:
test_row.tids

'[648908]'

In [None]:
out_dir