In [1]:
import os

from google.cloud import bigquery
import pandas as pd 
import numpy as np

# "project.dataset" prefix shared by every table loaded in this cell.
BQ_DATASET = "gcp-ushi-digital-ds-qa.hansi_user_dataset"


def load_table(bq_client, table_name, dataset=BQ_DATASET):
    """Download every row of one BigQuery table into a pandas DataFrame.

    Args:
        bq_client: `bigquery.Client` used to run the query.
        table_name: Name of the table inside `dataset` (no project/dataset prefix).
        dataset: "project.dataset" prefix used to qualify `table_name`.

    Returns:
        The DataFrame produced by `SELECT *` on the fully-qualified table.
    """
    # Table names come from the constants below, never from user input.
    query = "SELECT * FROM `{}.{}`".format(dataset, table_name)
    return bq_client.query(query).to_dataframe()


client = bigquery.Client()
print("Client creating using default project: {}".format(client.project))

# One frame per interaction table; the names are reused by the cells below.
user_sim_rec_df = load_table(client, "hansi_10core_user_sim_rec_bytime")
# NOTE(review): this table is "5core" while the other three are "10core" -- confirm this is intentional.
user_compl_rec_df = load_table(client, "hansi_5core_user_compl_rec_bytime")
user_search_df = load_table(client, "hansi_10core_user_search_bytime")
user_rec_search_df = load_table(client, "hansi_10core_user_rec_search_bytime")


Client creating using default project: gcp-ushi-digital-ds-qa


In [None]:
import pickle
from tqdm import tqdm
in_dir = "/home/jupyter/jointly_rec_and_search/datasets/unified_kgc/"

print("size of user_sim_rec_df = {:,}, user_compl_rec_df = {:,}, user_search_df = {:,}, user_rec_search_df = {:,}".format(
    len(user_sim_rec_df), len(user_compl_rec_df), len(user_search_df), len(user_rec_search_df)
))


def _iter_records(frame, column):
    """Yield every record stored in the list-valued `column` of `frame`, row by row.

    Iterating the column directly avoids materialising a Series per row,
    which is what `DataFrame.iterrows()` does.
    """
    for records in tqdm(frame[column], total=len(frame), desc=column):
        yield from records


# user_rec_search_df is included as well: later cells look its customers up in
# `user_to_uid` and its ivms/queries up in `ivm_to_pid` / `query_to_qid`, so they
# have to be part of the user set and of the coverage check below.
all_users = set()
for frame in (user_sim_rec_df, user_compl_rec_df, user_search_df, user_rec_search_df):
    all_users.update(frame.customer_id.unique())

interacted_ivms = set()
interacted_queries = set()
for frame in (user_sim_rec_df, user_rec_search_df):
    for sim_record in _iter_records(frame, "sim_records"):
        interacted_ivms.update((sim_record["anchor"], sim_record["ivm"]))
for frame in (user_compl_rec_df, user_rec_search_df):
    for compl_record in _iter_records(frame, "compl_records"):
        interacted_ivms.update((compl_record["anchor"], compl_record["ivm"]))
for frame in (user_search_df, user_rec_search_df):
    for search_record in _iter_records(frame, "search_records"):
        interacted_ivms.add(search_record["ivm"])
        interacted_queries.add(search_record["query"])
print("all_users = {:,}, interacted_ivms = {:,}, interacted_queries = {:,}".format(len(all_users), len(interacted_ivms), len(interacted_queries)))

# Read the existing ivm -> pid and query -> qid maps.
# SECURITY: pickle.load can execute arbitrary code; only load files from a trusted location.
with open(os.path.join(in_dir, "ivm_to_pid.pkl"), "rb") as fin:
    ivm_to_pid = pickle.load(fin)

with open(os.path.join(in_dir, "query_to_qid.pkl"), "rb") as fin:
    query_to_qid = pickle.load(fin)

# Both checks must print True, otherwise the id lookups in the following cells raise KeyError.
print("interacted_ivms is subset of all: ", interacted_ivms.issubset(ivm_to_pid.keys()))
print("interacted_queries is subset of all: ", interacted_queries.issubset(query_to_qid.keys()))

In [None]:
# Enumerate a sorted list rather than the raw set: set iteration order is not
# guaranteed to be the same across kernel sessions (string hashing is randomised
# per process), which would silently change every uid between runs.
# key=str keeps the ordering well defined even if the ids are not all of one type.
user_to_uid = {user: uid for uid, user in enumerate(sorted(all_users, key=str))}


def _build_rec_examples(frame, column):
    """Flatten a recommendation column into (uid, aid, pid, date_time, visit_id) tuples.

    Args:
        frame: DataFrame with a `customer_id` column and a list-valued `column`
            whose records have "anchor", "ivm", "date_time" and "visit_id" keys.
        column: Name of the record column ("sim_records" or "compl_records").

    Returns:
        List of tuples, in row order and then record order.

    Raises:
        KeyError: if a customer or ivm is missing from `user_to_uid` / `ivm_to_pid`.
    """
    examples = []
    rows = zip(frame["customer_id"], frame[column])
    for customer_id, records in tqdm(rows, total=len(frame), desc=column):
        uid = user_to_uid[customer_id]  # one lookup per user instead of one per record
        for record in records:
            aid, pid = ivm_to_pid[record["anchor"]], ivm_to_pid[record["ivm"]]
            examples.append((uid, aid, pid, record["date_time"], record["visit_id"]))
    return examples


def _build_search_examples(frame, column="search_records"):
    """Flatten a search column into (uid, qid, pid, date_time, visit_id) tuples.

    Args:
        frame: DataFrame with a `customer_id` column and a list-valued `column`
            whose records have "query", "ivm", "date_time" and "visit_id" keys.
        column: Name of the record column.

    Returns:
        List of tuples, in row order and then record order.

    Raises:
        KeyError: if a customer, query or ivm is missing from the id maps.
    """
    examples = []
    rows = zip(frame["customer_id"], frame[column])
    for customer_id, records in tqdm(rows, total=len(frame), desc=column):
        uid = user_to_uid[customer_id]
        for record in records:
            qid, pid = query_to_qid[record["query"]], ivm_to_pid[record["ivm"]]
            examples.append((uid, qid, pid, record["date_time"], record["visit_id"]))
    return examples


# Examples taken from the combined rec + search frame.
rs_sim_rec_examples = _build_rec_examples(user_rec_search_df, "sim_records")
rs_compl_rec_examples = _build_rec_examples(user_rec_search_df, "compl_records")
rs_search_examples = _build_search_examples(user_rec_search_df)

# Examples taken from the single-behaviour frames.
r_sim_rec_examples = _build_rec_examples(user_sim_rec_df, "sim_records")
r_compl_rec_examples = _build_rec_examples(user_compl_rec_df, "compl_records")
s_search_examples = _build_search_examples(user_search_df)

print("rec_search's user have number of sim_rec, compl_rec, search examples = {:,}, {:,}, {:,}".format(
    len(rs_sim_rec_examples), len(rs_compl_rec_examples), len(rs_search_examples)))
print("only rec or search's user have number of sim_rec, compl_rec, search examples = {:,}, {:,}, {:,}".format(
    len(r_sim_rec_examples), len(r_compl_rec_examples), len(s_search_examples)))

In [26]:
# Share of each user's records (taken from the front of the list) used for training.
TRAIN_FRACTION = 0.8


def _rec_example(record):
    """Map a sim/compl record to an (aid, pid, date_time, visit_id) tuple."""
    return (ivm_to_pid[record["anchor"]], ivm_to_pid[record["ivm"]],
            record["date_time"], record["visit_id"])


def _search_example(record):
    """Map a search record to a (qid, pid, date_time, visit_id) tuple."""
    return (query_to_qid[record["query"]], ivm_to_pid[record["ivm"]],
            record["date_time"], record["visit_id"])


def _split_records(records, to_example, train_fraction=TRAIN_FRACTION):
    """Convert `records` with `to_example` and split them into (train, test) lists.

    The first int(len(records) * train_fraction) examples form the train list and
    the remainder the test list; the input order is preserved.
    NOTE(review): presumably the records are already in chronological order
    (the source tables are named `*_bytime`) -- confirm against the table definitions.
    """
    examples = [to_example(record) for record in records]
    train_idx = int(len(examples) * train_fraction)
    return examples[:train_idx], examples[train_idx:]


train_rs_sim_recs, test_rs_sim_recs = {}, {}
train_rs_compl_recs, test_rs_compl_recs = {}, {}
train_rs_searchs, test_rs_searchs = {}, {}

rs_rows = zip(
    user_rec_search_df["customer_id"],
    user_rec_search_df["sim_records"],
    user_rec_search_df["compl_records"],
    user_rec_search_df["search_records"],
)
for customer_id, sim_records, compl_records, search_records in rs_rows:
    uid = user_to_uid[customer_id]

    train_rs_sim_recs[uid], test_rs_sim_recs[uid] = _split_records(sim_records, _rec_example)
    assert len(train_rs_sim_recs[uid]) + len(test_rs_sim_recs[uid]) == len(sim_records), \
        "sim_records split lost or duplicated records for uid {}".format(uid)

    train_rs_compl_recs[uid], test_rs_compl_recs[uid] = _split_records(compl_records, _rec_example)
    assert len(train_rs_compl_recs[uid]) + len(test_rs_compl_recs[uid]) == len(compl_records), \
        "compl_records split lost or duplicated records for uid {}".format(uid)

    train_rs_searchs[uid], test_rs_searchs[uid] = _split_records(search_records, _search_example)
    assert len(train_rs_searchs[uid]) + len(test_rs_searchs[uid]) == len(search_records), \
        "search_records split lost or duplicated records for uid {}".format(uid)
    # Every user must keep at least one held-out search interaction.
    assert len(test_rs_searchs[uid]) != 0, "uid {} has no test search records".format(uid)

In [32]:
import ujson

out_dir = "/home/jupyter/jointly_rec_and_search/datasets/unified_kgc/unified_user/"
# Create the target directory up front so that open(..., "w") below cannot fail
# with FileNotFoundError on a machine where it does not exist yet.
os.makedirs(out_dir, exist_ok=True)


def _as_json_lines(uid_to_records):
    """Turn a {uid: records} dict into a list of {"uid": ..., "records": ...} dicts."""
    return [{"uid": uid, "records": records} for uid, records in uid_to_records.items()]


# Output path -> list of per-user objects, one JSON document per line in the file.
fn_to_data = {
    os.path.join(out_dir, filename): _as_json_lines(split)
    for filename, split in [
        ("train_rs_sim_recs.json", train_rs_sim_recs),
        ("test_rs_sim_recs.json", test_rs_sim_recs),
        ("train_rs_compl_recs.json", train_rs_compl_recs),
        ("test_rs_compl_recs.json", test_rs_compl_recs),
        ("train_rs_search.json", train_rs_searchs),
        ("test_rs_search.json", test_rs_searchs),
    ]
}

for fn, data in fn_to_data.items():
    with open(fn, "w") as fout:
        fout.writelines(ujson.dumps(line) + "\n" for line in data)

In [None]:
for fn in os.listdir(out_dir):
    fn = os.path.join(out_dir, fn)
    if not fn.endswith(".json"):
        continue
    ! wc -l $fn
    ! head -n 2 $fn
    print(75*"=")