In [3]:
import os

from google.cloud import bigquery
import pandas as pd 
import numpy as np

# No project or credentials are passed explicitly; the client resolves them itself.
client = bigquery.Client()
print("Client creating using default project: {}".format(client.project))

# All three interaction tables live in the same dataset and are pulled in full.
DATASET = "gcp-ushi-digital-ds-qa.hansi_user_dataset"
QUERY_TEMPLATE = """
    SELECT *
    FROM `{table}`;
"""


def _fetch_table(table_name):
    """Run ``SELECT *`` against ``DATASET.table_name`` and return the rows as a DataFrame."""
    sql = QUERY_TEMPLATE.format(table=f"{DATASET}.{table_name}")
    return client.query(sql).to_dataframe()


# One frame per interaction type: similar-item recs, complementary-item recs, searches.
user_sim_rec_df = _fetch_table("time_anchor_10core_sim_rec_bytime")
user_compl_rec_df = _fetch_table("time_anchor_5core_compl_rec_bytime")
user_search_df = _fetch_table("time_query_10core_search_bytime")




Client creating using default project: gcp-ushi-digital-ds-qa


In [27]:
import pickle
from tqdm import tqdm
in_dir = "/home/jupyter/jointly_rec_and_search/datasets/unified_kgc/"

print("size of user_sim_rec_df = {:,}, user_compl_rec_df = {:,}, user_search_df = {:,}".format(
    len(user_sim_rec_df), len(user_compl_rec_df), len(user_search_df),
))

# Every customer that appears in at least one of the three interaction tables.
all_users = set()
for frame in (user_sim_rec_df, user_compl_rec_df, user_search_df):
    all_users.update(frame.customer_id.unique())

# Items (ivms) and query strings that occur anywhere in the interaction records.
interacted_ivms = set()
interacted_queries = set()

# Similar-item records: the recommended ivms plus the anchor ivm they were shown for.
for sim_records in tqdm(user_sim_rec_df.sim_records, total=len(user_sim_rec_df)):
    for sim_record in sim_records:
        interacted_ivms.update(item["ivm"] for item in sim_record["sim_ivms"])
        interacted_ivms.add(sim_record["anchor"])

# Complementary-item records: same layout, under the "compl_ivms" key.
for compl_records in tqdm(user_compl_rec_df.compl_records, total=len(user_compl_rec_df)):
    for compl_record in compl_records:
        interacted_ivms.update(item["ivm"] for item in compl_record["compl_ivms"])
        interacted_ivms.add(compl_record["anchor"])

# Search records: relevant ivms go to the item set, the query text to the query set.
for search_records in tqdm(user_search_df.search_records, total=len(user_search_df)):
    for search_record in search_records:
        interacted_ivms.update(item["ivm"] for item in search_record["rel_ivms"])
        interacted_queries.add(search_record["query"])

print("all_users = {:,}, interacted_ivms = {:,}, interacted_queries = {:,}".format(len(all_users), len(interacted_ivms), len(interacted_queries)))


def _load_pickle(name):
    """Unpickle ``name`` from ``in_dir``.

    NOTE(review): pickle executes arbitrary code on load; these files are
    presumably produced by an earlier step of this project -- confirm they are trusted.
    """
    with open(os.path.join(in_dir, name), "rb") as fin:
        return pickle.load(fin)


# read existing ivm -> pid and query -> qid maps
ivm_to_pid = _load_pickle("ivm_to_pid.pkl")
query_to_qid = _load_pickle("query_to_qid.pkl")

# Every interacted item / query must already have an id in the existing maps.
print("interacted_ivms is subset of all: ", interacted_ivms.issubset(ivm_to_pid.keys()))
print("interacted_queries is subset of all: ", interacted_queries.issubset(query_to_qid.keys()))

size of user_sim_rec_df = 81,664, user_compl_rec_df = 12,628, user_search_df = 815,832


100%|██████████| 81664/81664 [00:05<00:00, 14005.18it/s]
100%|██████████| 12628/12628 [00:00<00:00, 16698.56it/s]
100%|██████████| 815832/815832 [01:11<00:00, 11333.66it/s]


all_users = 893,619, interacted_ivms = 298,660, interacted_queries = 644,817
interacted_ivms is subset of all:  True
interacted_queries is subset of all:  True


In [46]:
# Give every customer a dense integer uid. The users are sorted first because
# iterating a set yields an arbitrary order, which would make the persisted
# uid assignment differ from one kernel session to the next.
user_to_uid = {user: uid for uid, user in enumerate(sorted(all_users))}


def _leave_last_out(df, records_col, items_key, head_key, head_to_id, require_single_visit=True):
    """Split each user's interaction records into train (all but last) and test (last).

    Args:
        df: frame with a ``customer_id`` column and a ``records_col`` column that
            holds, per user, a sequence of record dicts.
        records_col: name of the per-user records column.
        items_key: key of the record entry holding the interacted items; each item
            is a dict with ``"ivm"`` and ``"visit_id"`` entries.
        head_key: key of the record entry holding the anchor ivm / query string.
        head_to_id: mapping from that anchor / query to its integer id.
        require_single_visit: when true, assert that all items of one record share
            a single visit id.

    Returns:
        ``(train, test)``: two lists of ``[uid, head_id, pids, date_time, visit_id]``
        rows. ``test`` gets exactly one row per user.

    Raises:
        IndexError: if a user has no records, or a record has no items.

    The split is purely positional, so it assumes each user's records are already
    in chronological order -- TODO confirm against the source tables.
    """
    train, test = [], []
    for customer_id, user_records in tqdm(zip(df["customer_id"], df[records_col]), total=len(df)):
        uid = user_to_uid[customer_id]
        records = []
        for record in user_records:
            head_id = head_to_id[record[head_key]]
            items = record[items_key]
            pids = [ivm_to_pid[item["ivm"]] for item in items]
            visit_ids = [item["visit_id"] for item in items]
            if require_single_visit:
                assert len(set(visit_ids)) == 1, visit_ids

            # The first item's visit id stands in for the whole record.
            records.append([uid, head_id, pids, record["date_time"], visit_ids[0]])
        train += records[:-1]
        test.append(records[-1])
    return train, test


train_sim_recs, test_sim_recs = _leave_last_out(
    user_sim_rec_df, "sim_records", "sim_ivms", "anchor", ivm_to_pid)

train_compl_recs, test_compl_recs = _leave_last_out(
    user_compl_rec_df, "compl_records", "compl_ivms", "anchor", ivm_to_pid)

# The single-visit check is not enforced for search records; only the first
# item's visit id is kept.
train_searchs, test_searchs = _leave_last_out(
    user_search_df, "search_records", "rel_ivms", "query", query_to_qid,
    require_single_visit=False)


100%|██████████| 81664/81664 [00:27<00:00, 2989.09it/s] 
100%|██████████| 12628/12628 [00:01<00:00, 11560.93it/s]
100%|██████████| 815832/815832 [02:22<00:00, 5733.28it/s]


In [50]:
def _records_to_frame(rows, head_col, tail_col):
    """Wrap split rows in a DataFrame; only the head-id and pid-list column names vary."""
    return pd.DataFrame(rows, columns=["uid", head_col, tail_col, "date_time", "visit_id"])


train_sim_data = _records_to_frame(train_sim_recs, "aid", "sim_pids")
test_sim_data = _records_to_frame(test_sim_recs, "aid", "sim_pids")

train_compl_data = _records_to_frame(train_compl_recs, "aid", "compl_pids")
test_compl_data = _records_to_frame(test_compl_recs, "aid", "compl_pids")

train_search_data = _records_to_frame(train_searchs, "qid", "rel_pids")
test_search_data = _records_to_frame(test_searchs, "qid", "rel_pids")

print("number of train_sim_data = {:,}, test_sim_data = {:,}".format(len(train_sim_data), len(test_sim_data)))
print("number of train_compl_data = {:,}, test_compl_data = {:,}".format(len(train_compl_data), len(test_compl_data)))
print("number of train_search_data = {:,}, test_search_Data = {:,}".format(len(train_search_data), len(test_search_data)))

# Distinct users per task, counted on the training side.
n_sim_users = train_sim_data.uid.nunique()
n_compl_users = train_compl_data.uid.nunique()
n_search_users = train_search_data.uid.nunique()
print("number of users in sim_rec, compl_rec, search = {:,}, {:,}, {:,}".format(n_sim_users,
                                                                               n_compl_users,
                                                                               n_search_users))

# Train and test must cover the same number of distinct users for each task.
assert n_sim_users == test_sim_data.uid.nunique()
assert n_compl_users == test_compl_data.uid.nunique()
assert n_search_users == test_search_data.uid.nunique()

number of train_sim_data = 1,017,800, test_sim_data = 81,664
number of train_compl_data = 67,310, test_compl_data = 12,628
number of train_search_data = 13,726,249, test_search_Data = 815,832
number of users in sim_rec, compl_rec, search = 81,664, 12,628, 815,832


In [80]:
import pickle

out_dir = "/home/jupyter/jointly_rec_and_search/datasets/unified_kgc/unified_user/"
# Create the output directory up front so the writes below do not fail on a
# fresh checkout where it does not exist yet.
os.makedirs(out_dir, exist_ok=True)

# Output path -> frame to persist (train/test split for each of the three tasks).
fn_to_data = {
    os.path.join(out_dir, "train_sim_recs.csv"): train_sim_data,
    os.path.join(out_dir, "test_sim_recs.csv"): test_sim_data,
    os.path.join(out_dir, "train_compl_recs.csv"): train_compl_data,
    os.path.join(out_dir, "test_compl_recs.csv"): test_compl_data,
    os.path.join(out_dir, "train_searchs.csv"): train_search_data,
    os.path.join(out_dir, "test_searchs.csv"): test_search_data,
}
# Written with pandas' defaults, so the row index is included as the first column.
for fn, pd_data in fn_to_data.items():
    pd_data.to_csv(fn)

# Persist the customer_id -> uid mapping alongside the CSVs that use those uids.
with open(os.path.join(out_dir, "user_to_uid.pkl"), "wb") as fout:
    pickle.dump(user_to_uid, fout)

In [81]:
import numpy as np

# statistics
def _rows_per_user(frame):
    """Number of training rows for each uid, as a plain list of ints."""
    return frame.groupby("uid").size().tolist()


sim_rlens = _rows_per_user(train_sim_data)
compl_rlens = _rows_per_user(train_compl_data)
search_rlens = _rows_per_user(train_search_data)

# Average number of training records per user, one line per task.
rlens = [sim_rlens, compl_rlens, search_rlens]
for rlen in rlens:
    print(sum(rlen)/len(rlen))

# Length of the target pid list of every training record.
sim_pid_lens = np.array([len(pids) for pids in train_sim_data.sim_pids])
compl_pid_lens = np.array([len(pids) for pids in train_compl_data.compl_pids])
search_pid_lens = np.array([len(pids) for pids in train_search_data.rel_pids])
print(np.mean(sim_pid_lens), np.mean(compl_pid_lens), np.mean(search_pid_lens))
for lens in [sim_pid_lens, compl_pid_lens, search_pid_lens]:
    print(np.quantile(lens, [0.25, 0.5, 0.75, 0.9]))

12.463264106583072
5.330218561925879
16.82484751762618


In [130]:
# create static test set
import random

SIM_RELATION = "is_similar_to"
COMPL_RELATION = "is_complementary_to"
REL_RELATION = "is_relevant_to"

# Upper bound on the number of users in the ".small" test files, and the seed
# that makes the drawn subset repeatable.
SMALL_TEST_SIZE = 10_000
SMALL_TEST_SEED = 42

# Entity id -> display text, one "<id>\t<text>" pair per line.
eid_to_text = {}
with open(os.path.join(in_dir, "all_entities.tsv")) as fin:
    for line in fin:
        eid, text = line.strip().split("\t")
        eid_to_text[int(eid)] = text

out_dir = "/home/jupyter/jointly_rec_and_search/datasets/unified_kgc/unified_user/"
test_dir = os.path.join(out_dir, "without_context")
os.makedirs(test_dir, exist_ok=True)

# uid -> (head id, relevant pids) for the single held-out record of each user.
test_uid_to_aidsimpids = {
    uid: (aid, simpids)
    for uid, aid, simpids in zip(test_sim_data.uid, test_sim_data.aid, test_sim_data.sim_pids)}
test_uid_to_aidcomplpids = {
    uid: (aid, complpids)
    for uid, aid, complpids in zip(test_compl_data.uid, test_compl_data.aid, test_compl_data.compl_pids)}
test_uid_to_qidrelpids = {
    uid: (qid, relpids)
    for uid, qid, relpids in zip(test_search_data.uid, test_search_data.qid, test_search_data.rel_pids)}

# Each user must own exactly one test row; check this before anything is written,
# otherwise the dicts above would have silently dropped rows.
assert len(test_uid_to_aidsimpids) == len(test_sim_data) and len(test_uid_to_aidcomplpids) == len(test_compl_data) and \
        len(test_uid_to_qidrelpids) == len(test_search_data)

# Relevance judgments, one "<uid>\tQ0\t<pid>\t1" line per relevant item.
test_fn_to_urel = {
    os.path.join(test_dir, "urels.test.sim.tsv"): test_uid_to_aidsimpids,
    os.path.join(test_dir, "urels.test.compl.tsv"): test_uid_to_aidcomplpids,
    os.path.join(test_dir, "urels.test.search.tsv"): test_uid_to_qidrelpids}
for fn, urel_data in test_fn_to_urel.items():
    with open(fn, "w") as fout:
        for uid, (_, tids) in urel_data.items():
            for tid in tids:
                fout.write(f"{uid}\tQ0\t{tid}\t{1}\n")

# A private generator keeps the draw repeatable without touching the global
# `random` state used elsewhere.
_small_rng = random.Random(SMALL_TEST_SEED)


def _sample_users(uid_map, k=SMALL_TEST_SIZE):
    """Return a random sub-dict of ``uid_map`` with at most ``k`` users.

    ``random.sample`` needs a sequence (dict views are rejected on Python 3.11+),
    so the uids are sorted into a list first; sorting also makes the draw
    independent of the dict's insertion order.
    """
    chosen = _small_rng.sample(sorted(uid_map), k=min(k, len(uid_map)))
    return {uid: uid_map[uid] for uid in chosen}


# Model inputs: "<uid>\t<anchor or query text>\t<relation>" per test user,
# as full files plus down-sampled ".small" variants for sim and search.
test_fn_to_uids = {
    os.path.join(test_dir, "uid_anchors.test.sim.tsv"): (test_uid_to_aidsimpids, SIM_RELATION),
    os.path.join(test_dir, "uid_anchors.test.compl.tsv"): (test_uid_to_aidcomplpids, COMPL_RELATION),
    os.path.join(test_dir, "uid_queries.test.search.tsv"): (test_uid_to_qidrelpids, REL_RELATION),

    os.path.join(test_dir, "uid_anchors.test.sim.small.tsv"): (_sample_users(test_uid_to_aidsimpids), SIM_RELATION),
    os.path.join(test_dir, "uid_queries.test.search.small.tsv"): (_sample_users(test_uid_to_qidrelpids), REL_RELATION),
}

for fn, (uid_data, relation) in test_fn_to_uids.items():
    with open(fn, "w") as fout:
        for uid, (hid, _) in uid_data.items():
            fout.write(f"{uid}\t{eid_to_text[hid]}\t{relation}\n")



In [131]:
# Eyeball every file in test_dir: line count, first three and last three lines.
for path in os.listdir(test_dir):
    path = os.path.join(test_dir, path)
    # The `!` lines are IPython shell escapes; $path is expanded from the Python variable.
    ! wc -l $path
    ! head -n 3 $path
    ! tail -n 3 $path
    # Visual separator between files.
    print("="*100)

10000 /home/jupyter/jointly_rec_and_search/datasets/unified_kgc/unified_user/without_context/uid_queries.test.search.small.tsv
888571	umbrella	is_relevant_to
212310	chlorine tablets	is_relevant_to
560882	lemon balm	is_relevant_to
830399	blue hawk shelves	is_relevant_to
534593	kitchen sinks	is_relevant_to
567050	sun screen fabric shade	is_relevant_to
815832 /home/jupyter/jointly_rec_and_search/datasets/unified_kgc/unified_user/without_context/uid_queries.test.search.tsv
212921	kneeling pad	is_relevant_to
508485	2" wood blinds	is_relevant_to
39217	soffit	is_relevant_to
215819	air conditioner window	is_relevant_to
481849	mapei frost	is_relevant_to
356344	composite decking boards	is_relevant_to
10000 /home/jupyter/jointly_rec_and_search/datasets/unified_kgc/unified_user/without_context/uid_anchors.test.sim.small.tsv
585573	URREA 19in Metal Tool Box for Sets and General Use 379 in3 Storage Capacity ; Portable Tool Boxes	is_similar_to
298982	RCA 2.7-cu ft Capacity White Ventless All-in-One W

In [None]:
# Create anchor-level (arel) and query-level (qrel) test files, i.e. relevance
# keyed by the anchor item / query instead of by user.

out_dir = "/home/jupyter/jointly_rec_and_search/datasets/unified_kgc/unified_user/"
aqrel_test_dir = os.path.join(out_dir, "aqrel/")
os.makedirs(aqrel_test_dir, exist_ok=True)


def _merge_relevance(head_ids, pid_lists):
    """Map each head id to the union of the pids of all test rows that share it.

    Several users can have the same anchor or query as their test record. A plain
    dict comprehension would keep only the last user's list and drop the other
    judgments, so the lists are merged instead, keeping first-seen order and
    removing duplicates.
    """
    merged = {}
    for hid, pids in zip(head_ids, pid_lists):
        # A dict doubles as an insertion-ordered set of pids.
        merged.setdefault(hid, {}).update(dict.fromkeys(pids))
    return {hid: list(pids) for hid, pids in merged.items()}


test_aid_to_simpids = _merge_relevance(test_sim_data.aid, test_sim_data.sim_pids)
test_aid_to_complpids = _merge_relevance(test_compl_data.aid, test_compl_data.compl_pids)
test_qid_to_relpids = _merge_relevance(test_search_data.qid, test_search_data.rel_pids)

# Relevance judgments, one "<head id>\tQ0\t<pid>\t1" line per relevant item.
fn_to_aqrel = {
    os.path.join(aqrel_test_dir, "arels.test.sim.tsv"): test_aid_to_simpids,
    os.path.join(aqrel_test_dir, "arels.test.compl.tsv"): test_aid_to_complpids,
    os.path.join(aqrel_test_dir, "qrels.test.search.tsv"): test_qid_to_relpids,
    }

for fn, aqrel_data in fn_to_aqrel.items():
    with open(fn, "w") as fout:
        for hid, tids in aqrel_data.items():
            for tid in tids:
                fout.write(f"{hid}\tQ0\t{tid}\t{1}\n")

# Seeded private generator so the ".small" subsets are repeatable; the sample
# size is capped at the population size so a small test set cannot raise.
_aqrel_rng = random.Random(42)


def _sample_heads(head_map, k=10_000):
    """Draw at most ``k`` head ids from ``head_map`` (sorted first for a stable draw)."""
    return _aqrel_rng.sample(sorted(head_map), k=min(k, len(head_map)))


# Model inputs: "<head id>\t<anchor or query text>\t<relation>" per head.
fn_to_hid = {
    os.path.join(aqrel_test_dir, "anchors.test.sim.tsv"): (list(test_aid_to_simpids.keys()), SIM_RELATION),
    os.path.join(aqrel_test_dir, "anchors.test.compl.tsv"): (list(test_aid_to_complpids.keys()), COMPL_RELATION),
    os.path.join(aqrel_test_dir, "queries.test.search.tsv"): (list(test_qid_to_relpids.keys()), REL_RELATION),

    os.path.join(aqrel_test_dir, "anchors.test.sim.small.tsv"): (_sample_heads(test_aid_to_simpids),
                                                                 SIM_RELATION),
    os.path.join(aqrel_test_dir, "queries.test.search.small.tsv"): (_sample_heads(test_qid_to_relpids),
                                                                    REL_RELATION),
    }
for fn, (hids, relation) in fn_to_hid.items():
    with open(fn, "w") as fout:
        for hid in hids:
            fout.write(f"{hid}\t{eid_to_text[hid]}\t{relation}\n")