In [1]:
import os

from google.cloud import bigquery
import pandas as pd 
import numpy as np

client = bigquery.Client()
print("Client creating using default project: {}".format(client.project))

query = """
    SELECT * 
    FROM `gcp-ushi-digital-ds-qa.new_hansi_dataset.comp_rec_ClicksData_2core`;
    """
query_job = client.query(query)
compl_rec_df = query_job.to_dataframe()

query = """
    SELECT * 
    FROM `gcp-ushi-digital-ds-qa.new_hansi_dataset.hansi_rec_ClicksData_5core`;
    """
query_job = client.query(query)
sim_rec_df = query_job.to_dataframe()

query = """
    SELECT *
    FROM `gcp-ushi-digital-ds-qa.new_hansi_dataset.search_ClicksData_1year_5core`;
"""
query_job = client.query(query)
search_df = query_job.to_dataframe()

query = """
    SELECT * 
    FROM `gcp-ushi-digital-ds-qa.hansi_dataset.all_products_info`;
    """
query_job = client.query(query)
product_df = query_job.to_dataframe()
print("product_df = {:,}".format(len(product_df)))

all_products = set(product_df.product_id)
anchors = set(compl_rec_df.anchor)
compl_ivms = set(compl_rec_df.ivm)
all_compl_ivms = anchors.union(compl_ivms)

print("================================ For anchor_to_compl_ivms: ===================================")
print("number of unique product = {}, anchors = {:,}, complementary_compl_ivms = {:,}".format(len(all_products), len(anchors), len(compl_ivms)))
assert len(all_products & anchors) == len(anchors) and len(all_products & compl_ivms) == len(compl_ivms),(
    len(all_products & anchors), len(anchors), len(all_products & compl_ivms), len(compl_ivms)
)

all_sim_ivms = set(sim_rec_df.anchor).union(set(sim_rec_df.ivm))
print("================================ After updating anchor_to_similar_ivms: ===================================")
print("all_compl_ivms = {:,}, all_sim_ivms = {:,}".format(len(all_compl_ivms), len(all_sim_ivms)))
print("sim_compl_intersect = {:,} ({:.3f})".format(len(all_compl_ivms & all_sim_ivms), len(all_compl_ivms & all_sim_ivms) / len(all_compl_ivms)))
print("all_ivms = {:,}".format(len(all_compl_ivms | all_sim_ivms)))
all_ivms = all_compl_ivms | all_sim_ivms

assert len(all_products & all_ivms) == len(all_ivms), (len(all_products & all_ivms), len(all_ivms))

Client creating using default project: gcp-ushi-digital-ds-qa
product_df = 2,260,878
number of unique product = 2260878, anchors = 86,870, complementary_compl_ivms = 65,561
all_compl_ivms = 109,758, all_sim_ivms = 256,765
sim_compl_intersect = 87,425 (0.797)
all_ivms = 279,098


In [2]:
query_to_ivms = search_df.groupby("query")["ivm"].apply(list)
ivm_to_tmp_queries = search_df.groupby("ivm")["query"].apply(list)
query_lengths = np.array([len(x) for x in ivm_to_tmp_queries.values])
all_queries = set(search_df["query"])
print("all queries = {}".format(len(all_queries)))
print("total ivms (queries) = {:,}, length >=3 = {:,}, length >= 5 = {:,}".format(
    len(query_lengths), np.sum(query_lengths >=3), np.sum(query_lengths >= 5) ))

anchor_to_compl_ivms = compl_rec_df.groupby("anchor")["ivm"].apply(list)
compl_ivms_length = np.array([len(x) for x in anchor_to_compl_ivms.values])
print("================================ For anchor_to_compl_ivms: ===================================")
print("total_compl_ivms = {:,}, length >=3 = {:,}, length >= 5 = {:,}".format(len(compl_ivms_length), np.sum(compl_ivms_length >=3), np.sum(compl_ivms_length >= 5) ))

anchor_to_sim_ivms = sim_rec_df.groupby("anchor")["ivm"].apply(list)

all queries = 953773
total ivms (queries) = 360,744, length >=3 = 196,481, length >= 5 = 142,527
total_compl_ivms = 86,870, length >=3 = 35,837, length >= 5 = 22,121


In [3]:
# map product --> text
from tqdm import tqdm 

ivm_to_title = {}
ivm_to_bullet = {}
ivm_to_catalog = {}
no_bulletin_ivms = set()
no_title_ivms = set()
no_catalog_ivms = set()

def preprocess_text(in_text):
    in_text = in_text.replace("\t", " ")
    in_text = in_text.replace("\n", " ")
    return in_text

for idx, row in tqdm(product_df.iterrows(), total=len(product_df)):
    product_id = row.product_id
    title = row.product_name if row.product_name != None else "No title"
    bullets = row.bullets if row.bullets != None else "No bullets"
    catalog = row.catalog_name if row.catalog_name != None else "No catalog"
    
    if row.product_name == None:
        no_title_ivms.add(product_id)
    if row.bullets == None:
        no_bulletin_ivms.add(product_id)
    if row.catalog_name == None:
        no_catalog_ivms.add(product_id)
    
    title = preprocess_text(title)
    bullets = preprocess_text(bullets)
    catalog = preprocess_text(catalog)
    
    ivm_to_title[product_id] = title
    ivm_to_bullet[product_id] = bullets
    ivm_to_catalog[product_id] = catalog

# sanity check
print("ivm_to_title = {:,}, ivm_to_bullet = {:,}, ivm_to_catalog = {:,}, products no bulletin = {:,}, no title = {:,}, no catalog = {:,}".format(
    len(ivm_to_title), len(ivm_to_bullet), len(ivm_to_catalog), len(no_bulletin_ivms), len(no_title_ivms), len(no_catalog_ivms)
))

assert len(ivm_to_title) == len(ivm_to_bullet) == len(ivm_to_catalog) == len(product_df)

100%|██████████| 2260878/2260878 [03:45<00:00, 10042.65it/s]

ivm_to_title = 2,260,878, ivm_to_bullet = 2,260,878, ivm_to_catalog = 2,260,878, products no bulletin = 0, no title = 21, no catalog = 4,519





In [4]:
import ujson
from collections import defaultdict

# map to pid and qid
ivm_to_pid = {ivm: pid for pid, ivm in enumerate(list(all_products))}
pid_to_ivm = {pid: ivm for ivm, pid in ivm_to_pid.items()}
query_to_qid = {query: qid + len(ivm_to_pid) for qid, query in enumerate(list(all_queries))}
qid_to_query = {qid: query for query, qid in query_to_qid.items()}
start_qid = len(ivm_to_pid)

pid_to_title = {ivm_to_pid[ivm]: title for ivm, title in ivm_to_title.items()}
pid_to_bullet = {ivm_to_pid[ivm]: bullet for ivm, bullet in ivm_to_bullet.items()}
pid_to_catalog = {ivm_to_pid[ivm]: catalog for ivm, catalog in ivm_to_catalog.items()}

aid_to_sim_pids = {ivm_to_pid[anchor]: [ivm_to_pid[ivm] for ivm in sim_ivms] for anchor, sim_ivms in anchor_to_sim_ivms.items()}
aid_to_compl_pids = {ivm_to_pid[anchor]: [ivm_to_pid[ivm] for ivm in compl_ivms] for anchor, compl_ivms in anchor_to_compl_ivms.items()}
qid_to_pids = {query_to_qid[query]: [ivm_to_pid[ivm] for ivm in ivms] for query, ivms in query_to_ivms.items()}
pid_to_tmp_qids = {ivm_to_pid[ivm]: [query_to_qid[_query] for _query in queries] for ivm, queries in ivm_to_tmp_queries.items()}

In [5]:
import numpy as np 
from itertools import chain
np.random.seed(4680)

val_test_indices = np.random.choice(np.arange(0, len(qid_to_pids)), int(0.2*len(qid_to_pids)), replace=False)
val_indices = val_test_indices[:int(0.5*len(val_test_indices))]
test_indices = val_test_indices[int(0.5*len(val_test_indices)):]

train_qid_to_pids, val_qid_to_pids, test_qid_to_pids = {}, {}, {}
exclude_qids = set()
for idx, (qid, pids) in tqdm(enumerate(qid_to_pids.items()), total=len(qid_to_pids)):
    if idx in val_indices:
        val_qid_to_pids[qid] = pids
        exclude_qids.add(qid)
    elif idx in test_indices:
        test_qid_to_pids[qid] = pids
        exclude_qids.add(qid)
    else:
        train_qid_to_pids[qid] = pids 
print("number of train, val, test qid_to_pids = {:,}, {:,}, {:,}. number of exclude_qids = {:,}".format(
        len(train_qid_to_pids), len(val_qid_to_pids), len(test_qid_to_pids), len(exclude_qids)))

item_sim_dict = {}
item_rel_dict = {}
item_compl_dict = {}
exclude_qids = {qid: 1 for qid in list(exclude_qids)}
for qid, pids in tqdm(train_qid_to_pids.items(), total=len(train_qid_to_pids)):
    for pid in pids:
        if pid in aid_to_sim_pids:
            item_sim_dict[pid] = aid_to_sim_pids[pid]
        if pid in aid_to_compl_pids:
            item_compl_dict[pid] = aid_to_compl_pids[pid]
        if pid in pid_to_tmp_qids:
            valid_qids = [ _qid for _qid in pid_to_tmp_qids[pid] if _qid not in exclude_qids and _qid != qid]
            if len(valid_qids) != 0:
                item_rel_dict[pid] = valid_qids
                
# sanity check
for pid, qids in tqdm(item_rel_dict.items(), total=len(item_rel_dict)):
    assert sum([_q in exclude_qids for _q in qids]) == 0, [_q in exclude_qids for _q in qids]

# statistics
print("number of labeled_pids = {:,}, number of labeld_pids has similar = {:,} ({:.3f})".format(
    len(set([x for xs in train_qid_to_pids.values() for x in xs])), len(item_sim_dict), 
    len(item_sim_dict) / len(set([x for xs in train_qid_to_pids.values() for x in xs]))
))
print("number of labeled_pids = {:,}, number of labeld_pids has compl = {:,} ({:.3f})".format(
    len(set([x for xs in train_qid_to_pids.values() for x in xs])), len(item_compl_dict), 
    len(item_compl_dict) / len(set([x for xs in train_qid_to_pids.values() for x in xs]))
))
print("number of labeled_pids = {:,}, number of labeld_pids has relevant = {:,} ({:.3f})".format(
    len(set([x for xs in train_qid_to_pids.values() for x in xs])), len(item_rel_dict), 
    len(item_rel_dict) / len(set([x for xs in train_qid_to_pids.values() for x in xs]))
))

100%|██████████| 953773/953773 [01:12<00:00, 13141.44it/s]


number of train, val, test qid_to_pids = 763,019, 95,377, 95,377. number of exclude_qids = 190,754


100%|██████████| 763019/763019 [00:42<00:00, 17818.12it/s]
100%|██████████| 226048/226048 [00:00<00:00, 392087.25it/s]


number of labeled_pids = 336,073, number of labeld_pids has similar = 172,703 (0.514)
number of labeled_pids = 336,073, number of labeld_pids has compl = 77,170 (0.230)
number of labeled_pids = 336,073, number of labeld_pids has relevant = 226,048 (0.673)


In [6]:
# write to disk
import os 
import copy
import pickle

SIM_RELATION = "is_similar_to"
COMPL_RELATION = "is_complementary_to"
REL_RELATION = "is_relevant_to"

out_dir = "/home/jupyter/jointly_rec_and_search/datasets/kgc_search/"
if not os.path.exists(out_dir):
    os.mkdir(out_dir)
    
with open(os.path.join(out_dir, "collection_title.tsv"), "w") as fout:
    for pid, title in pid_to_title.items():
        fout.write(f"{pid}\t{title}\n")
        
with open(os.path.join(out_dir, "collection_bullet.tsv"), "w") as fout:
    for pid, bullet in pid_to_bullet.items():
        fout.write(f"{pid}\t{bullet}\n")
        
with open(os.path.join(out_dir, "collection_catalog.tsv"), "w") as fout:
    for pid, catalog in pid_to_catalog.items():
        fout.write(f"{pid}\t{catalog}\n")
        
        
with open(os.path.join(out_dir, "product.jsonl"), "w") as fout:
    with open(os.path.join(out_dir, "collection_title_catalog.tsv"), "w") as fout2:
        for aid in pid_to_title:
            text = pid_to_title[aid] + " ; " +  pid_to_catalog[aid]
            example = {"id": aid, "contents": text}
            fout.write(ujson.dumps(example) + "\n")
            fout2.write(f"{aid}\t{text}\n")
            
with open(os.path.join(out_dir, "one_hop_relation.pkl"), "wb") as fout:
    out_obj = {"item_sim": item_sim_dict, "item_rel": item_rel_dict, "item_compl": item_compl_dict}
    pickle.dump(out_obj, fout)
          
with open(os.path.join(out_dir, "queries.train.tsv"), "w") as fout:
    with open(os.path.join(out_dir, "qrels.train.tsv"), "w") as fout2:
        for qid, pids in train_qid_to_pids.items():
            fout.write(f"{qid}\t{qid_to_query[qid]}\t{REL_RELATION}\n")
            for pid in pids:
                fout2.write(f"{qid}\tQ0\t{pid}\t{1}\n")
            
with open(os.path.join(out_dir, "queries.val.tsv"), "w") as fout:
    with open(os.path.join(out_dir, "qrels.val.tsv"), "w") as fout2:
        for qid, pids in val_qid_to_pids.items():
            fout.write(f"{qid}\t{qid_to_query[qid]}\t{REL_RELATION}\n")
            for pid in pids:
                fout2.write(f"{qid}\tQ0\t{pid}\t{1}\n")
            
with open(os.path.join(out_dir, "queries.test.tsv"), "w") as fout:
    with open(os.path.join(out_dir, "qrels.test.tsv"), "w") as fout2:
        for qid, pids in test_qid_to_pids.items():
            fout.write(f"{qid}\t{qid_to_query[qid]}\t{REL_RELATION}\n")
            for pid in pids:
                fout2.write(f"{qid}\tQ0\t{pid}\t{1}\n")
            
assert len(set(train_qid_to_pids.keys()) & set(val_qid_to_pids.keys()) & set(test_qid_to_pids.keys())) == 0

train_pids = set()
train_qids = set()
train_qids.update(set(train_qid_to_pids.keys()))
for pid in chain(item_sim_dict.keys(), item_compl_dict.keys()):
    train_pids.update([pid])
for pid, qids in item_rel_dict.items():
    train_qids.update(qids)
    
with open(os.path.join(out_dir, "entities.train.tsv"), "w") as fout:
    for eid in list(train_pids):
        text = pid_to_title[eid] + " ; " +  pid_to_catalog[eid]
        fout.write(f"{eid}\t{text}\n")
    for eid in list(train_qids):
        fout.write(f"{eid}\t{qid_to_query[eid]}\n")
        

with open(os.path.join(out_dir, "all_entites.tsv"), "w") as fout:
    for aid in pid_to_title:
        text = pid_to_title[aid] + " ; " +  pid_to_catalog[aid]
        fout.write(f"{aid}\t{text}\n")
    for query, qid in query_to_qid.items():
        fout.write(f"{qid}\t{query}\n")

In [7]:
# sanity check
for path in os.listdir(out_dir):
    path = os.path.join(out_dir, path)
    ! wc -l $path
    ! head -n 3 $path
    ! tail -n 3 $path
    print("="*100)

3214651 /home/jupyter/jointly_rec_and_search/datasets/kgc_search/all_entites.tsv
182082	Harris Dry Up 4-Count Mouse Killer ; No catalog
1977826	Westmore by ELK Lighting Huldra 16-in W 1-Light Antique Bronze Modern/Contemporary Wall Sconce ; No catalog
1085636	Pet Life White Dog/Cat Slicker Brush ; No catalog
3214648	vinyl flooring with padding
3214649	american standard 4385a
3214650	flicker carpet
95377 /home/jupyter/jointly_rec_and_search/datasets/kgc_search/queries.test.tsv
2831003	# 26506	is_relevant_to
2431965	# 6 awg wire	is_relevant_to
2853884	# 6/3 wire	is_relevant_to
2806221	■ 749404	is_relevant_to
2511414	■ 771790	is_relevant_to
2382999	■ 896777	is_relevant_to
95377 /home/jupyter/jointly_rec_and_search/datasets/kgc_search/queries.val.tsv
2934890	"kn95"	is_relevant_to
2327846	"o" rings	is_relevant_to
2780200	# 2 thhn	is_relevant_to
2914404	■ 771788	is_relevant_to
2276097	■ 884139	is_relevant_to
2390696	■ 936291	is_relevant_to
wc: /home/jupyter/jointly_rec_and_search/datasets/kg