In [1]:
import os

from google.cloud import bigquery
import pandas as pd 
import numpy as np

# BigQuery client built without explicit arguments, so project and credentials
# come from the environment's defaults.
client = bigquery.Client()
print("Client creating using default project: {}".format(client.project))


def _load_split(split):
    """Download one click-data split ("train", "val" or "test") as a DataFrame.

    The query job is submitted first, then a progress line is printed, then the
    result is materialised with ``to_dataframe()``.
    """
    split_job = client.query(
        """
    SELECT * 
    FROM `gcp-ushi-digital-ds-qa.hansi_dataset.{}_search_ClicksData_5core`;
    """.format(split)
    )
    print("load {}_df".format(split))
    return split_job.to_dataframe()


# Splits are fetched one after another, in train -> val -> test order.
train_df = _load_split("train")
val_df = _load_split("val")
test_df = _load_split("test")

print("train_df, val_df, test_df = {:,}, {:,}, {:,}".format(len(train_df), len(val_df), len(test_df)))

# Fetch every row of the product-info table and materialise it as a DataFrame.
product_df = client.query(
    """
    SELECT * 
    FROM `gcp-ushi-digital-ds-qa.hansi_dataset.all_products_info`;
    """
).to_dataframe()
print("product_df = {:,}".format(len(product_df)))

# Every clicked product (column `ivm` of the three splits) must also appear in
# the product table, otherwise it cannot be resolved to product text.
# np.union1d already returns a sorted, de-duplicated array, so wrapping it in
# np.unique again is unnecessary.
clicked_products = np.union1d(np.union1d(train_df.ivm, val_df.ivm), test_df.ivm)
all_products = np.unique(product_df.product_id)

# Compute the overlap once and reuse it for the report and the assertion.
num_clicked_in_catalog = len(np.intersect1d(clicked_products, all_products))
clicked_is_subset = num_clicked_in_catalog == len(clicked_products)
print("unique clicked_products = {:,}, all_products = {:,}, is subset of = {} ({:,}/{:,})".format(
    len(clicked_products), len(all_products), clicked_is_subset,
    num_clicked_in_catalog, len(clicked_products)
))
# The message is only evaluated on failure; it lists a few offending ids so the
# mismatch can be investigated directly.
assert clicked_is_subset, (
    num_clicked_in_catalog,
    len(clicked_products),
    np.setdiff1d(clicked_products, all_products)[:5].tolist(),
    "STOPPPPPPPPPPP",
)

Client creating using default project: gcp-ushi-digital-ds-qa
load train_df
load val_df


KeyboardInterrupt: 

In [None]:
! pip install transformers
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print("sep token = {}".format(tokenizer.sep_token))

In [None]:
# map product --> text
from tqdm import tqdm 

# Lookup tables keyed by the raw product_id: title only, and title + bullets.
product_to_title, product_to_text = dict(), dict()
# Ids of products that have no bullets / no title in the product table.
no_bulletin_product, no_title_product = set(), set()

def preprocess_text(in_text):
    """Flatten a text field so it can be stored in one cell of a TSV line.

    Each tab, line feed and carriage return is replaced by a single space.
    Carriage returns are included because text with Windows-style line endings
    would otherwise keep a stray carriage return, which line-oriented readers
    using universal newlines treat as a line break.

    Args:
        in_text: the string to clean (must not be None).

    Returns:
        The cleaned string; its length is unchanged because every replaced
        character maps to exactly one space.
    """
    for separator in ("\t", "\n", "\r"):
        in_text = in_text.replace(separator, " ")
    return in_text

# Fill the product_id -> title and product_id -> "title <sep> bullets" tables.
# Only three columns are needed, so they are zipped directly; this avoids
# DataFrame.iterrows, which builds a Series object for every row.
product_columns = zip(product_df["product_id"], product_df["product_name"], product_df["bullets"])
for product_id, raw_title, raw_bullets in tqdm(product_columns, total=len(product_df)):
    # pd.isna recognises None as well as NaN / pd.NA, so a missing value can never
    # reach preprocess_text (which would fail on a non-string).
    title_missing = pd.isna(raw_title)
    bullets_missing = pd.isna(raw_bullets)

    if title_missing:
        no_title_product.add(product_id)
    if bullets_missing:
        no_bulletin_product.add(product_id)

    # Missing fields get a fixed placeholder so every product still has text.
    title = preprocess_text("No title" if title_missing else raw_title)
    bullets = preprocess_text("No bullets" if bullets_missing else raw_bullets)
    prd_text = title + " " + tokenizer.sep_token + " " + bullets
    # The text is later written as one TSV cell, so it must be free of separators.
    assert "\t" not in prd_text and "\n" not in prd_text, prd_text

    product_to_title[product_id] = title
    product_to_text[product_id] = prd_text

# sanity check
print("product_to_title and _to_text = {:,}, {:,}, no bulletin product = {:,}, no title procut = {:,}".format(
    len(product_to_title), len(product_to_text), len(no_bulletin_product), len(no_title_product)
))

# Equal sizes also imply that product_id is unique in product_df (a duplicate id
# would overwrite its dict entry and make the tables smaller than the frame).
assert len(product_to_title) == len(product_to_text) and len(product_to_text) == len(product_df), (len(product_to_text), len(product_df))

In [None]:
# map: product --> pid, query --> qid
# All three splits share one id space, so ids are assigned over their union.
merge_df = pd.concat([train_df, val_df, test_df], ignore_index=True)
all_uni_queries = np.unique(merge_df["query"])

# sanity check
print("merge_df = {}".format(merge_df.head()))
print("size = {:,}, query_to_clicked_product = {:.3f}".format(len(merge_df), len(merge_df)/len(all_uni_queries)))
assert len(merge_df) == len(train_df) + len(val_df) + len(test_df)
# NOTE(review): hard-coded catalogue size, presumably that of the table snapshot
# this notebook was written against; update it if the product table changes.
assert len(all_products) == 2260878, len(all_products)
print("all unique queries = {:,}".format(len(all_uni_queries)))
# Query strings are expected to repeat across click rows; the margin of 1000 is
# a heuristic threshold (TODO confirm it is still appropriate).
assert len(all_uni_queries) < len(merge_df) - 1000

# Ids are positions in the sorted unique arrays returned by np.unique.
query_to_qid = {query: qid for qid, query in enumerate(all_uni_queries)}
product_to_pid = {product_id: pid for pid, product_id in enumerate(all_products)} 
pid_to_title = {product_to_pid[product_id]: title for product_id, title in product_to_title.items()}
pid_to_text = {product_to_pid[product_id]: text for product_id, text in product_to_text.items()}

# Fixed: the first operand used to be len(pid_to_text) compared with itself, so
# pid_to_title was never actually verified.
assert len(pid_to_title) == len(pid_to_text) == len(product_to_text) == len(product_to_title)

qid_to_query = {qid: query for query, qid in query_to_qid.items()}
assert len(qid_to_query) == len(query_to_qid)


In [None]:
# write 
import os 
import ujson

# Directory that receives every file written by this cell.
out_dir = "/home/jupyter/jointly_rec_and_search/datasets/rec_search/search/"
# makedirs also creates missing parent directories (os.mkdir would raise), and
# exist_ok=True makes the call safe to re-run without a separate existence check.
os.makedirs(out_dir, exist_ok=True)

# train: queries.train.tsv, qrels.train.tsv
# queries.train.tsv gets one "qid<TAB>query" line per distinct query;
# qrels.train.tsv gets one "qid<TAB>Q0<TAB>pid<TAB>1" line per click row.
seen_train_qids = set()
with open(os.path.join(out_dir, "queries.train.tsv"), "w") as queries_out, \
        open(os.path.join(out_dir, "qrels.train.tsv"), "w") as qrels_out:
    for _, click in tqdm(train_df.iterrows(), total=len(train_df)):
        query_text, clicked_product = click["query"], click["ivm"]
        assert click["clicked_numbers"] >= 5
        # A line break inside the query would corrupt the one-record-per-line file.
        assert "\n" not in query_text, query_text

        qid = query_to_qid[query_text]
        pid = product_to_pid[clicked_product]

        # Emit each query only the first time it is encountered.
        if qid not in seen_train_qids:
            queries_out.write(f"{qid}\t{query_text}\n")
            seen_train_qids.add(qid)
        qrels_out.write(f"{qid}\tQ0\t{pid}\t{1}\n")

# Distinct training query strings; used to detect queries not present in training.
train_queries = set(train_df["query"].to_numpy())
print("train_queries = {}".format(len(train_queries)))

def _write_eval_split(split_name, split_df):
    """Write the query and qrels TSV files for one evaluation split.

    Four files are created in ``out_dir``:
      * ``queries.<split>.tsv`` / ``qrels.<split>.tsv`` cover every row of
        ``split_df``;
      * ``queries.<split>.exclude.tsv`` / ``qrels.<split>.exclude.tsv`` cover
        only rows whose query string is not in ``train_queries``.

    Query files hold one ``qid<TAB>query`` line per distinct query; qrels files
    hold one ``qid<TAB>Q0<TAB>pid<TAB>1`` line per row.

    Args:
        split_name: name inserted into the file names ("val" or "test").
        split_df: DataFrame with the columns ``query``, ``ivm`` and
            ``clicked_numbers``.

    Returns:
        A pair ``(all_qids, exclude_qids)``: the qids written to the full query
        file and to the "exclude" query file.
    """
    written_qids, written_exclude_qids = set(), set()
    with open(os.path.join(out_dir, f"queries.{split_name}.tsv"), "w") as queries_out, \
            open(os.path.join(out_dir, f"qrels.{split_name}.tsv"), "w") as qrels_out, \
            open(os.path.join(out_dir, f"queries.{split_name}.exclude.tsv"), "w") as ex_queries_out, \
            open(os.path.join(out_dir, f"qrels.{split_name}.exclude.tsv"), "w") as ex_qrels_out:
        for _, row in split_df.iterrows():
            query, product_id = row["query"], row["ivm"]
            assert row["clicked_numbers"] >= 5
            # A line break inside the query would corrupt the line-based file.
            assert "\n" not in query, query

            qid = query_to_qid[query]
            pid = product_to_pid[product_id]
            qrel_line = f"{qid}\tQ0\t{pid}\t{1}\n"

            # Each query is listed once, every click row yields a qrels line.
            if qid not in written_qids:
                queries_out.write(f"{qid}\t{query}\n")
                written_qids.add(qid)
            qrels_out.write(qrel_line)

            # Same bookkeeping, restricted to queries absent from training.
            if query not in train_queries:
                if qid not in written_exclude_qids:
                    ex_queries_out.write(f"{qid}\t{query}\n")
                    written_exclude_qids.add(qid)
                ex_qrels_out.write(qrel_line)
    return written_qids, written_exclude_qids


# val: queries.val.tsv, qrels.val.tsv, queries.val.exclude.tsv, qrels.val.exclude.tsv
all_qids, exclude_qids = _write_eval_split("val", val_df)

# test: queries.test.tsv, qrels.test.tsv, queries.test.exclude.tsv, qrels.test.exclude.tsv
all_qids, exclude_qids = _write_eval_split("test", test_df)
                        
# collection
# collection_title.tsv: "pid<TAB>title", one product per line.
title_path = os.path.join(out_dir, "collection_title.tsv")
with open(title_path, "w") as title_out:
    for title_pid, product_title in pid_to_title.items():
        assert "\n" not in product_title, product_title
        title_out.write(f"{title_pid}\t{product_title}\n")

# collection.tsv: "pid<TAB>text", one product per line.
text_path = os.path.join(out_dir, "collection.tsv")
with open(text_path, "w") as text_out:
    for text_pid, product_text in pid_to_text.items():
        assert "\n" not in product_text, product_text 
        text_out.write(f"{text_pid}\t{product_text}\n")

# product.jsonl: the same pid/text pairs as JSON objects with keys "id" and "contents".
jsonl_path = os.path.join(out_dir, "product.jsonl")
with open(jsonl_path, "w") as jsonl_out:
    for json_pid, json_text in pid_to_text.items():
        jsonl_out.write(ujson.dumps({"id": json_pid, "contents": json_text}) + "\n")

# product_to_pid
# pid_productid_map.tsv: "pid<TAB>product_id", to translate pids back to product ids.
map_path = os.path.join(out_dir, "pid_productid_map.tsv")
with open(map_path, "w") as map_out:
    for raw_product_id, mapped_pid in product_to_pid.items():
        map_out.write(f"{mapped_pid}\t{raw_product_id}\n")

In [None]:
# sanity check
for path in os.listdir(out_dir):
    path = os.path.join(out_dir, path)
    ! wc -l $path
    ! head -n 3 $path
    ! tail -n 3 $path
    print("="*100)
    

In [None]:
# Display the number of distinct query strings in the training split.
len(train_queries)