In [2]:
import os
import pickle
import glob

import pandas as pd 
import numpy as np

bq_in_dir="/home/jupyter/data_transfer/data/"
with open(os.path.join(bq_in_dir, "time_anchor_10core_sim_rec_bytime.pkl"), "rb") as fin:
    user_sim_rec_df = pickle.load(fin)

with open(os.path.join(bq_in_dir, "time_anchor_5core_compl_rec_bytime.pkl"), "rb") as fin:
    user_compl_rec_df = pickle.load(fin)

user_search_subdfs = []
for fn in glob.glob(os.path.join(bq_in_dir, "time_query_10core_search_bytime_[0-9][0-9].pkl")):
    with open(fn, "rb") as fin:
        user_search_subdfs.append(pickle.load(fin))
        
user_search_df = pd.concat(user_search_subdfs)




In [None]:

from tqdm import tqdm
in_dir = "/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/"
if not os.path.exists(in_dir):
    os.mkdir(in_dir)

print("size of user_sim_rec_df = {:,}, user_compl_rec_df = {:,}, user_search_df = {:,}".format(
    len(user_sim_rec_df), len(user_compl_rec_df), len(user_search_df),
))

all_users = set(
    list(user_sim_rec_df.customer_id.unique()) + list(user_compl_rec_df.customer_id.unique()) + list(user_search_df.customer_id.unique()))
interacted_ivms = set()
interacted_queries = set()
for i, row in tqdm(user_sim_rec_df.iterrows(), total=len(user_sim_rec_df)):
    for sim_record in row.sim_records:
        sim_ivms = [meta_sim_ivms["ivm"] for meta_sim_ivms in sim_record["sim_ivms"]]
        interacted_ivms.update(sim_ivms)
        interacted_ivms.add(sim_record["anchor"])
        
for i, row in tqdm(user_compl_rec_df.iterrows(), total=len(user_compl_rec_df)):
    for compl_record in row.compl_records:
        compl_ivms = [meta_compl_ivm["ivm"] for meta_compl_ivm in compl_record["compl_ivms"]]
        interacted_ivms.update(compl_ivms)
        interacted_ivms.add(compl_record["anchor"])
        
for i, row in tqdm(user_search_df.iterrows(), total=len(user_search_df)):
    for search_record in row.search_records:
        rel_ivms = [meta_rel_ivm["ivm"] for meta_rel_ivm in search_record["rel_ivms"]]
        interacted_ivms.update(rel_ivms)
        interacted_queries.add(search_record["query"])
        
print("all_users = {:,}, interacted_ivms = {:,}, interacted_queries = {:,}".format(len(all_users), len(interacted_ivms), len(interacted_queries)))

# read exisiting ivm, query map
with open(os.path.join(in_dir, "ivm_to_pid.pkl"), "rb") as fin:
    ivm_to_pid = pickle.load(fin)

with open(os.path.join(in_dir, "query_to_qid.pkl"), "rb") as fin:
    query_to_qid = pickle.load(fin)

print("interacted_ivms is subset of all: ", interacted_ivms.issubset(set(ivm_to_pid.keys())))
print("interacted_queries is subset of all: ", interacted_queries.issubset(set(query_to_qid.keys())))

In [None]:
user_to_uid = {user:uid for uid, user in enumerate(list(all_users))}

train_sim_recs = []
test_sim_recs = []

train_compl_recs = []
test_compl_recs = []

train_searchs = []
test_searchs = []

for i, row in tqdm(user_sim_rec_df.iterrows(), total=len(user_sim_rec_df)):
    records = []
    uid = user_to_uid[row.customer_id]
    for sim_record in row.sim_records:
        aid = ivm_to_pid[sim_record["anchor"]]
        sim_pids = [ivm_to_pid[meta_sim_ivms["ivm"]] for meta_sim_ivms in sim_record["sim_ivms"]]
        visit_ids = [meta_sim_ivms["visit_id"] for meta_sim_ivms in sim_record["sim_ivms"]]
        assert len(set(visit_ids)) == 1, visit_ids
        
        records.append([uid, aid, sim_pids, sim_record["date_time"], visit_ids[0]])
    train_sim_recs += records[:-1]
    test_sim_recs.append(records[-1])
    
for i, row in tqdm(user_compl_rec_df.iterrows(), total=len(user_compl_rec_df)):
    records = []
    uid = user_to_uid[row.customer_id]
    for compl_record in row.compl_records:
        aid = ivm_to_pid[compl_record["anchor"]]
        compl_pids = [ivm_to_pid[meta_compl_ivm["ivm"]] for meta_compl_ivm in compl_record["compl_ivms"]]
        visit_ids = [meta_compl_ivm["visit_id"] for meta_compl_ivm in compl_record["compl_ivms"]]
        assert len(set(visit_ids)) == 1, visit_ids
        
        records.append([uid, aid, compl_pids, compl_record["date_time"], visit_ids[0]])
    train_compl_recs += records[:-1]
    test_compl_recs.append(records[-1])
    
for i, row in tqdm(user_search_df.iterrows(), total=len(user_search_df)):
    records = []
    uid = user_to_uid[row.customer_id]
    for search_record in row.search_records:
        qid = query_to_qid[search_record["query"]]
        rel_pids = [ivm_to_pid[meta_rel_ivm["ivm"]] for meta_rel_ivm in search_record["rel_ivms"]]
        visit_ids = [meta_rel_ivm["visit_id"] for meta_rel_ivm in search_record["rel_ivms"]]
        #assert len(set(visit_ids)) == 1, visit_ids
        
        records.append([uid, qid, rel_pids, search_record["date_time"], visit_ids[0]])
    train_searchs += records[:-1]
    test_searchs.append(records[-1])


In [None]:
train_sim_data = pd.DataFrame(train_sim_recs, columns=["uid", "aid", "sim_pids", "date_time", "visit_id"])
test_sim_data = pd.DataFrame(test_sim_recs, columns=["uid", "aid", "sim_pids", "date_time", "visit_id"])

train_compl_data = pd.DataFrame(train_compl_recs, columns=["uid", "aid", "compl_pids", "date_time", "visit_id"])
test_compl_data = pd.DataFrame(test_compl_recs, columns=["uid", "aid", "compl_pids", "date_time", "visit_id"])

train_search_data = pd.DataFrame(train_searchs, columns=["uid", "qid", "rel_pids", "date_time", "visit_id"])
test_search_data = pd.DataFrame(test_searchs, columns=["uid", "qid", "rel_pids", "date_time", "visit_id"])

print("number of train_sim_data = {:,}, test_sim_data = {:,}".format(len(train_sim_data), len(test_sim_data)))
print("number of train_compl_data = {:,}, test_compl_data = {:,}".format(len(train_compl_data), len(test_compl_data)))
print("number of train_search_data = {:,}, test_search_Data = {:,}".format(len(train_search_data), len(test_search_data)))
print("number of users in sim_rec, compl_rec, search = {:,}, {:,}, {:,}".format(len(train_sim_data.uid.unique()),
                                                                               len(train_compl_data.uid.unique()),
                                                                               len(train_search_data.uid.unique())))
assert len(train_sim_data.uid.unique()) == len(test_sim_data.uid.unique())
assert len(train_compl_data.uid.unique()) == len(test_compl_data.uid.unique())
assert len(train_search_data.uid.unique()) == len(test_search_data.uid.unique())

In [8]:
import pickle

out_dir = "/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/"
if not os.path.exists(out_dir):
    os.mkdir(out_dir)
    
fn_to_data = {
    os.path.join(out_dir, "train_sim_recs.csv"): train_sim_data,
    os.path.join(out_dir, "test_sim_recs.csv"): test_sim_data,
    os.path.join(out_dir, "train_compl_recs.csv"): train_compl_data,
    os.path.join(out_dir, "test_compl_recs.csv"): test_compl_data,
    os.path.join(out_dir, "train_searchs.csv"): train_search_data,
    os.path.join(out_dir, "test_searchs.csv"): test_search_data,
}
for fn, pd_data in fn_to_data.items():
    pd_data.to_csv(fn)

with open(os.path.join(out_dir, "user_to_uid.pkl"), "wb") as fout:
    pickle.dump(user_to_uid, fout)

In [9]:
import numpy as np

# statistics
sim_rlens, compl_rlens, search_rlens = [], [], []
for i, group in train_sim_data.groupby("uid"):
    sim_rlens.append(len(group))
for i, group in train_compl_data.groupby("uid"):
    compl_rlens.append(len(group))
for i, group in train_search_data.groupby("uid"):
    search_rlens.append(len(group))  

rlens = [sim_rlens, compl_rlens, search_rlens]
for rlen in rlens:
    print(sum(rlen)/len(rlen))
    
sim_pid_lens = np.array([len(x) for x in list(train_sim_data.sim_pids)])
compl_pid_lens = np.array([len(x) for x in list(train_compl_data.compl_pids)])
search_pid_lens = np.array([len(x) for x in list(train_search_data.rel_pids)])
print(np.mean(sim_pid_lens), np.mean(compl_pid_lens), np.mean(search_pid_lens))
for lens in [sim_pid_lens, compl_pid_lens, search_pid_lens]:
    print(np.quantile(lens, [0.25, 0.5, 0.75, 0.9]))

12.463264106583072
5.330218561925879
16.82484751762618
1.4388484967577126 2.0612093299658296 1.2722863325588805
[1. 1. 2. 2.]
[1. 2. 3. 4.]
[1. 1. 1. 2.]


In [32]:
import random
from collections import defaultdict
import pickle as pkl
import pandas as pd
random.seed(4680)

selected_dir = os.path.join(in_dir, "selected_test_user")
if not os.path.exists(selected_dir):
    os.mkdir(selected_dir)

user_num = 10_000
selected_sim_users = random.sample(list(test_sim_data.uid), k=user_num)
selected_compl_users = random.sample(list(test_compl_data.uid), k=user_num)
selected_search_users = random.sample(list(test_search_data.uid), k=user_num)

selected_sim_data = test_sim_data[np.in1d(test_sim_data.uid, selected_sim_users)]
selected_compl_data = test_compl_data[np.in1d(test_compl_data.uid, selected_compl_users)]
selected_search_data = test_search_data[np.in1d(test_search_data.uid, selected_search_users)]

rm_aid_to_simpids, rm_aid_to_complpids, rm_qid_to_relpids = defaultdict(set), defaultdict(set), defaultdict(set)

for aid, sim_pids in zip(selected_sim_data.aid, selected_sim_data.sim_pids):
    rm_aid_to_simpids[aid].update(sim_pids)
    
for aid, compl_pids in zip(selected_compl_data.aid, selected_compl_data.compl_pids):
    rm_aid_to_complpids[aid].update(compl_pids)

for qid, rel_pids in zip(selected_search_data.qid, selected_search_data.rel_pids):
    rm_qid_to_relpids[qid].update(rel_pids)
    
print("unique excluded sim_aids, compl_aids and qids = {:,}, {:,}, {:,}".format(
    len(rm_qid_to_relpids), len(rm_aid_to_complpids), len(rm_aid_to_simpids)))
    
exclude_sim_aids = list(rm_aid_to_simpids.keys())
exclude_compl_aids = list(rm_aid_to_complpids.keys())
exclude_qids = list(rm_qid_to_relpids.keys())

fn_to_data = {
    "selected_sim_data.test.pkl": selected_sim_data,
    "selected_compl_data.test.pkl": selected_compl_data,
    "selected_search_data.test.pkl": selected_search_data,
}
for fn, data in fn_to_data.items():
    fn = os.path.join(selected_dir, fn)
    data.to_pickle(fn)
    
fn_to_data = {
    "aid_to_simpid.test.tsv": rm_aid_to_simpids,
    "aid_to_complpid.test.tsv": rm_aid_to_complpids,
    "qid_to_relpid.test.tsv": rm_qid_to_relpids,
}
for fn, data in fn_to_data.items():
    fn = os.path.join(selected_dir, fn)
    with open(fn, "w") as fout:
        for qid, pos_pids in data.items():
            for pid in pos_pids:
                fout.write(f"{qid}\t{pid}\n")
                
eid_to_text = {}
with open("/home/jupyter/unity_jointly_rec_and_search/datasets/unified_user/all_entities.tsv") as fin:
    for line in fin:
        eid, text = line.strip().split("\t")
        eid_to_text[int(eid)] = text
                
fn_to_data = {
    "anchors.sim.test.tsv": exclude_sim_aids,
    "anchors.compl.test.tsv": exclude_compl_aids,
    "queries.search.test.tsv": exclude_qids,
}

for fn, data in fn_to_data.items():
    fn = os.path.join(selected_dir, fn)
    with open(fn, "w") as fout:
        for eid in data:
            fout.write(f"{eid}\t{eid_to_text[eid]}\n")

unique excluded sim_aids, compl_aids and qids = 7,274, 6,720, 8,273


In [None]:
# sanity check
for fn in os.listdir(selected_dir):
    fn = os.path.join(selected_dir, fn)
    if fn.endswith(".pkl"):
        continue
    ! wc -l $fn
    ! head -n 2 $fn
    print(75*"=")

In [39]:
eid_to_text[278428], eid_to_text[2174624]

('Freedom Newport 3-ft H x 8-ft W White Vinyl Gothic Fence Panel ; Vinyl Fencing',
 'Freedom 6-ft H x 3-in W White Vinyl Fence Gate Kit ; Vinyl Fencing')