In [1]:
import os

from google.cloud import bigquery
import pandas as pd 
import numpy as np

# BigQuery client; the project is whatever the environment resolves as default.
client = bigquery.Client()
print("Client creating using default project: {}".format(client.project))

# Search data: the columns used in this notebook are `query`, `ivm`,
# `similar_ivms` and `complement_ivms`.
search_query = """
    SELECT * 
    FROM `gcp-ushi-digital-ds-qa.new_hansi_dataset.search_ClicksData_w_core_sim_ivms_and_comp_ivms`;
    """
search_df = client.query(search_query).to_dataframe()

# Product catalogue: the columns used in this notebook are `product_id`,
# `product_name` and `bullets`.
product_query = """
    SELECT * 
    FROM `gcp-ushi-digital-ds-qa.hansi_dataset.all_products_info`;
    """
product_df = client.query(product_query).to_dataframe()
print("product_df = {:,}".format(len(product_df)))

# Unique ids on each side: catalogue products, searched ivms, and the
# flattened per-row `similar_ivms` collections.
all_products = set(product_df["product_id"])
similar_ivms = set(np.concatenate(search_df["similar_ivms"].tolist()))
ivms = set(search_df["ivm"].values)

print(
    "number of unique product = {}, ivms = {}, similar_ivms = {}".format(
        len(all_products), len(ivms), len(similar_ivms)
    )
)
# Every id that appears in the search data must exist in the catalogue.
assert similar_ivms <= all_products and ivms <= all_products

Client creating using default project: gcp-ushi-digital-ds-qa
product_df = 2,260,878
number of unique product = 2260878, ivms = 159380, similar_ivms = 76572


In [2]:
# map product --> text
from tqdm import tqdm
from transformers import AutoTokenizer

# Only the tokenizer's separator token is used below, to join title and bullets.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print("sep token = {}".format(tokenizer.sep_token))

# product_id -> cleaned title / cleaned "title <sep> bullets" text
product_to_title, product_to_text = {}, {}
# product ids whose bullets / title were missing in the catalogue
no_bulletin_product, no_title_product = set(), set()

def preprocess_text(in_text):
    """Return ``in_text`` with every tab and newline replaced by a single space.

    Args:
        in_text: String to clean.

    Returns:
        A string of the same length with no ``"\\t"`` or ``"\\n"`` characters.
    """
    return in_text.replace("\t", " ").replace("\n", " ")

# Build, for every catalogue product, its cleaned title and the combined
# "title <sep> bullets" text. The three needed columns are iterated directly
# instead of via `iterrows`, which avoids building a Series per row.
product_columns = zip(product_df["product_id"], product_df["product_name"], product_df["bullets"])
for product_id, raw_title, raw_bullets in tqdm(product_columns, total=len(product_df)):
    # `pd.isna` is True for None as well as NaN / pd.NA. Comparing with
    # `== None` lets NaN / pd.NA through, which then fails further down.
    title_missing = pd.isna(raw_title)
    bullets_missing = pd.isna(raw_bullets)

    if title_missing:
        no_title_product.add(product_id)
    if bullets_missing:
        no_bulletin_product.add(product_id)

    # Missing fields fall back to a fixed placeholder string.
    title = preprocess_text("No title" if title_missing else raw_title)
    bullets = preprocess_text("No bullets" if bullets_missing else raw_bullets)
    prd_text = title + " " + tokenizer.sep_token + " " + bullets
    # The text must stay free of tabs and newlines.
    assert "\t" not in prd_text and "\n" not in prd_text, prd_text

    product_to_title[product_id] = title
    product_to_text[product_id] = prd_text

# sanity check
print("product_to_title and _to_text = {:,}, {:,}, no bulletin product = {:,}, no title product = {:,}".format(
    len(product_to_title), len(product_to_text), len(no_bulletin_product), len(no_title_product)
))

# One entry per catalogue row; this fails if product_id values repeat.
assert len(product_to_title) == len(product_to_text) and len(product_to_text) == len(product_df), (len(product_to_text), len(product_df))

sep token = [SEP]


100%|██████████| 2260878/2260878 [03:14<00:00, 11645.89it/s]

product_to_title and _to_text = 2,260,878, 2,260,878, no bulletin product = 0, no title product = 21





In [4]:
# map: product --> pid, query --> qid
# np.unique returns the sorted unique values, so qids follow sorted query order.
all_uni_queries = np.unique(search_df["query"])

query_to_qid = {query: qid for qid, query in enumerate(all_uni_queries)}
# NOTE(review): all_products is a set, so the pid given to each product follows
# set iteration order. If pids must match files written by an earlier kernel
# session, confirm this order is stable (or enumerate a sorted list instead).
product_to_pid = {product_id: pid for pid, product_id in enumerate(all_products)}
# Re-key the product text maps by integer pid.
pid_to_title = {product_to_pid[product_id]: title for product_id, title in product_to_title.items()}
pid_to_text = {product_to_pid[product_id]: text for product_id, text in product_to_text.items()}

# Both pid-keyed maps must keep every product. The original compared
# len(pid_to_text) with itself, so pid_to_title was never checked.
assert len(pid_to_title) == len(pid_to_text) == len(product_to_text) == len(product_to_title), (
    len(pid_to_title), len(pid_to_text), len(product_to_text), len(product_to_title)
)

# Inverse lookup; equal sizes confirm that no two queries share a qid.
qid_to_query = {qid: query for query, qid in query_to_qid.items()}
assert len(qid_to_query) == len(query_to_qid)

In [32]:
# read all anchor_ivm from train_qrel
in_dir = "/home/jupyter/jointly_rec_and_search/datasets/rec_search/search/"
# Inverse of product_to_pid: integer pid -> product id.
pid_to_productid = dict(zip(product_to_pid.values(), product_to_pid.keys()))

anchor_ivms = set()
with open(os.path.join(in_dir, "anchors.train.tsv")) as fin:
    for line in fin:
        # Each line has two tab-separated fields; only the first (the pid) is used.
        pid, _ = line.rstrip().split("\t")
        anchor_ivms.add(pid_to_productid[int(pid)])
print("length of anchor_ivms = {}".format(len(anchor_ivms)))

# anchor product id -> set of its complement product ids
anchor_to_compls = {}
# anchors that have complements but are absent from the training anchors
not_in_train = set()
for row_no, record in tqdm(search_df.iterrows(), total=len(search_df)):
    a_ivm = record["ivm"]
    compl_ivms = record["complement_ivms"]

    # Rows without complements contribute nothing.
    if len(compl_ivms) == 0:
        continue

    # Only anchors that are part of the training anchors are kept.
    if a_ivm not in anchor_ivms:
        not_in_train.add(a_ivm)
        continue

    anchor_to_compls.setdefault(a_ivm, set()).update(compl_ivms)

length of anchor_ivms = 83440


100%|██████████| 915725/915725 [00:38<00:00, 23694.39it/s]


In [28]:
# (number of anchors with complements, total anchor -> complement pairs).
# The built-in sum is used because np.sum over a generator is deprecated and
# emits a DeprecationWarning.
len(anchor_to_compls), sum(len(xs) for xs in anchor_to_compls.values())

  """Entry point for launching an IPython kernel.


(1950, 8251)

In [33]:
# Anchors skipped for being absent from the training anchors, next to the
# number of training anchors.
n_not_in_train = len(not_in_train)
n_anchor_ivms = len(anchor_ivms)
n_not_in_train, n_anchor_ivms

(49568, 83440)