In [1]:
import os

from google.cloud import bigquery
import pandas as pd 
import numpy as np

client = bigquery.Client()
print("Client creating using default project: {}".format(client.project))

query = """
    SELECT * 
    FROM `gcp-ushi-digital-ds-qa.new_hansi_dataset.search_ClicksData_w_core_sim_ivms_and_comp_ivms`;
    """
query_job = client.query(query)
search_df = query_job.to_dataframe()

query = """
    SELECT * 
    FROM `gcp-ushi-digital-ds-qa.hansi_dataset.all_products_info`;
    """
query_job = client.query(query)
product_df = query_job.to_dataframe()
print("product_df = {:,}".format(len(product_df)))

all_products = set(product_df.product_id)
similar_ivms = set(np.concatenate(list(search_df.similar_ivms.values)))
ivms = set(search_df.ivm.values)

print("number of unique product = {}, ivms = {}, similar_ivms = {}".format(len(all_products), len(ivms), len(similar_ivms)))
assert len(all_products & similar_ivms) == len(similar_ivms) and len(all_products & ivms) == len(ivms)

Client creating using default project: gcp-ushi-digital-ds-qa
product_df = 2,260,878
number of unique product = 2260878, ivms = 159380, similar_ivms = 76572


In [2]:
# split train, val, test set 
np.random.seed(4680)

HEAD_COUNT=850
TORSO_COUNT=10
np.set_printoptions(suppress=True)

query_to_clicknum = search_df.groupby(["query"])["clicked_numbers"].sum()
print("clicknum of 37.5% and 97.5% quantiles = {}".format(np.quantile(query_to_clicknum.values, [.375, .975])))
print("TORSO COUNT = {}, HEAD_COUNT = {}".format(TORSO_COUNT, HEAD_COUNT))

head_mask = query_to_clicknum.values > HEAD_COUNT
torso_mask = np.logical_and(query_to_clicknum.values > TORSO_COUNT, query_to_clicknum <= HEAD_COUNT)
tail_mask = query_to_clicknum <= TORSO_COUNT
all_queries = np.array([query for query, _ in query_to_clicknum.iteritems()])
head_queries = all_queries[head_mask]
torso_queries = all_queries[torso_mask]
tail_queries = all_queries[tail_mask]
print("number of head, torso, tail queries = {}, {}, {}".format(len(head_queries), len(torso_queries), len(tail_queries)))
assert len(head_queries) + len(torso_queries) + len(tail_queries) == len(query_to_clicknum)

cand_head_queries = np.random.choice(head_queries, size=1000, replace=False)
val_head_queries, test_head_queries = cand_head_queries[:500], cand_head_queries[500:]

cand_torso_queries = np.random.choice(torso_queries, size=2000, replace=False)
val_torso_queries, test_torso_queries = cand_torso_queries[:1000], cand_torso_queries[1000:]

cand_tail_queries = np.random.choice(tail_queries, size = 2000, replace=False)
val_tail_queries, test_tail_queries = cand_tail_queries[:1000], cand_tail_queries[1000:]

print( [len(x) for x in [val_head_queries, val_torso_queries, val_tail_queries]] )
val_queries = np.concatenate([val_head_queries, val_torso_queries, val_tail_queries])
test_queries = np.concatenate([test_head_queries, test_torso_queries, test_tail_queries])
assert len(np.unique(val_queries)) == 2500 and len(np.unique(test_queries)) == 2500, (len(np.unique(val_queries)), len(np.unique(test_queries)))
assert len(set(val_queries) & set(test_queries)) == 0

val_mask = np.in1d(search_df["query"], val_queries)
test_mask = np.in1d(search_df["query"], test_queries)
train_mask = np.logical_and(~val_mask, ~test_mask)
assert np.sum(np.logical_and(np.logical_and(val_mask, test_mask), train_mask)) == 0
assert np.sum(val_mask) + np.sum(test_mask) + np.sum(train_mask) == len(search_df)

train_df = search_df[train_mask]
val_df = search_df[val_mask]
test_df = search_df[test_mask]

clicknum of 37.5% and 97.5% quantiles = [ 11. 853.]
TORSO COUNT = 10, HEAD_COUNT = 850
number of head, torso, tail queries = 5697, 138743, 82493
[500, 1000, 1000]


In [3]:
# map product --> text

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print("sep token = {}".format(tokenizer.sep_token))
from tqdm import tqdm 

product_to_title = {}
product_to_text = {}
no_bulletin_product = set()
no_title_product = set()

def preprocess_text(in_text):
    in_text = in_text.replace("\t", " ")
    in_text = in_text.replace("\n", " ")
    return in_text

for idx, row in tqdm(product_df.iterrows(), total=len(product_df)):
    product_id = row.product_id
    title = row.product_name if row.product_name != None else "No title"
    bullets = row.bullets if row.bullets != None else "No bullets"
    
    if row.product_name == None:
        no_title_product.add(product_id)
    if row.bullets == None:
        no_bulletin_product.add(product_id)
    
    title = preprocess_text(title)
    bullets = preprocess_text(bullets)
    prd_text = title + " " + tokenizer.sep_token + " " + bullets
    assert "\t" not in prd_text and "\n" not in prd_text, prd_text
    
    product_to_title[product_id] = title
    product_to_text[product_id] = prd_text

# sanity check
print("product_to_title and _to_text = {:,}, {:,}, no bulletin product = {:,}, no title product = {:,}".format(
    len(product_to_title), len(product_to_text), len(no_bulletin_product), len(no_title_product)
))

assert len(product_to_title) == len(product_to_text) and len(product_to_text) == len(product_df), (len(product_to_text), len(product_df))

sep token = [SEP]


100%|██████████| 2260878/2260878 [03:11<00:00, 11810.01it/s]

product_to_title and _to_text = 2,260,878, 2,260,878, no bulletin product = 0, no title product = 21





In [4]:
# map: product --> pid, query --> qid
all_uni_queries = np.unique(search_df["query"])
#sanity check
print("search_df = {}".format(search_df.head()))
print("size = {:,}, query_to_clicked_product = {:.3f}".format(len(search_df), len(search_df)/len(all_uni_queries)))
assert len(search_df) == len(train_df) + len(val_df) + len(test_df)
assert len(all_products) == 2260878, len(all_products)
print("all unique queries = {:,}".format(len(all_uni_queries)))
assert len(all_uni_queries) < len(search_df) - 1000

query_to_qid = {query: qid for qid, query in enumerate(all_uni_queries)}
product_to_pid = {product_id: pid for pid, product_id in enumerate(all_products)} 
pid_to_title = {product_to_pid[product_id]: title for product_id, title in product_to_title.items()}
pid_to_text = {product_to_pid[product_id]: text for product_id, text in product_to_text.items()}

assert len(pid_to_text) == len(pid_to_text) == len(product_to_text) == len(product_to_title)

qid_to_query = {qid: query for query, qid in query_to_qid.items()}
assert len(qid_to_query) == len(query_to_qid)

search_df =                   query                        ivm  clicked_numbers  \
0  leisure made blakely   2415613-96143-639288-BLU                9   
1     chain by the foot       637441-273-BH0990856                6   
2         hammock chair   2547144-104571-LY-HCS-SS               10   
3      12x18 gable vent    157012-10065-GLFC1218WH                6   
4     bypass door track  4914182-111316-DZ4TGH060C                6   

  similar_ivms complement_ivms  
0           []              []  
1           []              []  
2           []              []  
3           []              []  
4           []              []  
size = 915,725, query_to_clicked_product = 4.035
all unique queries = 226,933


In [5]:
# write 
import os 
import ujson

out_dir = "/home/jupyter/jointly_rec_and_search/datasets/rec_search/search_compl/"
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

# for train
train_qids = set()
# train: queries.train.tsv, qrels.train.tsv
with open(os.path.join(out_dir, "queries.train.tsv"), "w") as fout:
    with open(os.path.join(out_dir, "qrels.train.tsv"), "w") as fout2:
        for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
            query, product_id = row["query"], row["ivm"]
            assert row["clicked_numbers"] >= 5
            assert "\n" not in query, query
            
            qid = query_to_qid[query]
            pid = product_to_pid[product_id]
            
            if qid not in train_qids:
                fout.write(f"{qid}\t{query}\n")
                train_qids.add(qid)
            fout2.write(f"{qid}\tQ0\t{pid}\t{1}\n")
            
            

# for val
with open(os.path.join(out_dir, "queries.val.tsv"), "w") as fout:
    for query in val_queries:
        fout.write(f"{query_to_qid[query]}\t{query}\n")
with open(os.path.join(out_dir, "qrels.val.tsv"), "w") as fout:
    for idx, row in tqdm(val_df.iterrows(), total=len(val_df)):
        query, product_id = row["query"], row["ivm"]
        assert row["clicked_numbers"] >= 5
        assert "\n" not in query, query
        
        qid = query_to_qid[query]
        pid = product_to_pid[product_id]
        
        assert qid not in train_qids
        fout.write(f"{qid}\tQ0\t{pid}\t{1}\n")
        

# for test
with open(os.path.join(out_dir, "queries.test.tsv"), "w") as fout:
    for query in test_queries:
        fout.write(f"{query_to_qid[query]}\t{query}\n")
with open(os.path.join(out_dir, "queries.test.head.tsv"), "w") as fout:
    for query in test_head_queries:
        fout.write(f"{query_to_qid[query]}\t{query}\n")
with open(os.path.join(out_dir, "queries.test.torso.tsv"), "w") as fout:
    for query in test_torso_queries:
        fout.write(f"{query_to_qid[query]}\t{query}\n")
with open(os.path.join(out_dir, "queries.test.tail.tsv"), "w") as fout:
    for query in test_tail_queries:
        fout.write(f"{query_to_qid[query]}\t{query}\n")
with open(os.path.join(out_dir, "qrels.test.tsv"), "w") as fout:
    for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
        query, product_id = row["query"], row["ivm"]
        assert row["clicked_numbers"] >= 5
        assert "\n" not in query, query
        
        qid = query_to_qid[query]
        pid = product_to_pid[product_id]
        
        assert qid not in train_qids
        fout.write(f"{qid}\tQ0\t{pid}\t{1}\n")
        
with open(os.path.join(out_dir, "qrels.test.head.tsv"), "w") as fout1:
    with open(os.path.join(out_dir, "qrels.test.torso.tsv"), "w") as fout2:
        with open(os.path.join(out_dir, "qrels.test.tail.tsv"), "w") as fout3:
            for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
                query, product_id = row["query"], row["ivm"]
                assert row["clicked_numbers"] >= 5
                assert "\n" not in query, query

                qid = query_to_qid[query]
                pid = product_to_pid[product_id]

                assert qid not in train_qids
                if query in test_head_queries:
                    fout1.write(f"{qid}\tQ0\t{pid}\t{1}\n")
                elif query in test_torso_queries:
                    fout2.write(f"{qid}\tQ0\t{pid}\t{1}\n")
                elif query in test_tail_queries:
                    fout3.write(f"{qid}\tQ0\t{pid}\t{1}\n")
                else:
                    raise ValueError(f"{query} not in test_queries")
            
# collection
with open(os.path.join(out_dir, "collection_title.tsv"), "w") as fout:
    for pid, title in pid_to_title.items():
        assert "\n" not in title, title
        fout.write(f"{pid}\t{title}\n")

with open(os.path.join(out_dir, "collection.tsv"), "w") as fout:
    for pid, text in pid_to_text.items():
        assert "\n" not in text, text 
        fout.write(f"{pid}\t{text}\n")
        
with open(os.path.join(out_dir, "product.jsonl"), "w") as fout:
    for pid, text in pid_to_text.items():
        example = {"id": pid, "contents": text}
        fout.write(ujson.dumps(example) + "\n")
        
# product_to_pid
with open(os.path.join(out_dir, "pid_productid_map.tsv"), "w") as fout:
    for product_id, pid in product_to_pid.items():
        fout.write(f"{pid}\t{product_id}\n")

# anchors.train.tsv, arels.train.tsv
aid_to_simpids = {}
for idx, row in train_df.iterrows():
    ivm, similar_ivms = row["ivm"], row["similar_ivms"]
    
    if len(similar_ivms) == 0:
        continue
    aid = product_to_pid[ivm]
    pids = [product_to_pid[_ivm] for _ivm in similar_ivms]
    
    if aid not in aid_to_simpids:
        aid_to_simpids[aid] = set()
    
    for pid in pids:
        aid_to_simpids[aid].add(pid)

aid_to_complpids = {}
for idx, row in train_df.iterrows():
    ivm, compl_ivms = row["ivm"], row["complement_ivms"]
    
    if len(compl_ivms) == 0:
        continue
    aid = product_to_pid[ivm]
    pids = [product_to_pid[_ivm] for _ivm in compl_ivms]
    
    if aid not in aid_to_complpids:
        aid_to_complpids[aid] = set()
    
    for pid in pids:
        aid_to_complpids[aid].add(pid)

with open(os.path.join(out_dir, "anchors.similar.train.tsv"), "w") as fout:
    with open(os.path.join(out_dir, "arels.similar.train.tsv"), "w") as fout2:
        for aid in aid_to_simpids:
            fout.write(f"{aid}\t{pid_to_title[aid]}\n")
            for pid in list(aid_to_simpids[aid]):
                fout2.write(f"{aid}\tQ0\t{pid}\t{1}\n")
                
with open(os.path.join(out_dir, "anchors.compl.train.tsv"), "w") as fout:
    with open(os.path.join(out_dir, "arels.compl.train.tsv"), "w") as fout2:
        for aid in aid_to_complpids:
            fout.write(f"{aid}\t{pid_to_title[aid]}\n")
            for pid in list(aid_to_complpids[aid]):
                fout2.write(f"{aid}\tQ0\t{pid}\t{1}\n")

100%|██████████| 870971/870971 [00:41<00:00, 21002.81it/s]
100%|██████████| 23265/23265 [00:01<00:00, 18931.23it/s]
100%|██████████| 21489/21489 [00:01<00:00, 20963.33it/s]
100%|██████████| 21489/21489 [00:01<00:00, 14421.43it/s]


In [6]:
# sanity check
for path in os.listdir(out_dir):
    path = os.path.join(out_dir, path)
    ! wc -l $path
    ! head -n 3 $path
    ! tail -n 3 $path
    print("="*100)

2260878 /home/jupyter/jointly_rec_and_search/datasets/rec_search/search_compl/pid_productid_map.tsv
0	2231837-105765-BB8343TH68
1	4120837-101988-AC-422-GFF-11X14
2	1379979-35985-MC-1/16-GAL-ORB
2260875	1777306-56240-GVWOT20X2801SFUPI
2260876	95021-1621-3733
2260877	283430-69572-681561
83440 /home/jupyter/jointly_rec_and_search/datasets/rec_search/search_compl/anchors.similar.train.tsv
234090	Eagle Clear High-gloss Acrylic Waterproofer (1-Gallon)
81625	Eagle Clear High-gloss Transparent Acrylic Waterproofer (5-Gallon)
1067107	Bigfoot Systems 29.5-in Footing Form
409112	Lithonia Lighting LTIKMSBK LED Series Track Kit 3-Light 44-in Brushed Nickel Dimmable LED Linear Track Lighting Kit
355862	Kwikset Signature Series Halifax Matte Black Universal Privacy
63588	WRIGHT PRODUCTS 3.94-in Surface Mounted Sliding Patio Door Handle
2500 /home/jupyter/jointly_rec_and_search/datasets/rec_search/search_compl/queries.test.tsv
189079	small refrigerator without freezer
210369	vegetable seeds
97919	fluo

In [7]:
with open(os.path.join(out_dir, "qrels.test.head.tsv"), "w") as fout1:
    with open(os.path.join(out_dir, "qrels.test.torso.tsv"), "w") as fout2:
        with open(os.path.join(out_dir, "qrels.test.tail.tsv"), "w") as fout3:
            for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
                query, product_id = row["query"], row["ivm"]
                assert row["clicked_numbers"] >= 5
                assert "\n" not in query, query

                qid = query_to_qid[query]
                pid = product_to_pid[product_id]

                assert qid not in train_qids
                if query in test_head_queries:
                    fout1.write(f"{qid}\tQ0\t{pid}\t{1}\n")
                elif query in test_torso_queries:
                    fout2.write(f"{qid}\tQ0\t{pid}\t{1}\n")
                elif query in test_tail_queries:
                    fout3.write(f"{qid}\tQ0\t{pid}\t{1}\n")
                else:
                    raise ValueError(f"{query} not in test_queries")

100%|██████████| 21489/21489 [00:01<00:00, 13354.77it/s]
