In [1]:
import os

from google.cloud import bigquery
import pandas as pd 
import numpy as np

client = bigquery.Client()
print("Client creating using default project: {}".format(client.project))

for set_name in ["train", "val", "test"]:
    query = """
    SELECT * 
    FROM `gcp-ushi-digital-ds-qa.hansi_rec_dataset.{}_rec_ClicksData_5core`;
    """.format(set_name)
    query_job = client.query(query)
    if set_name == "train":
        print("load train_df")
        train_df = query_job.to_dataframe()
    elif set_name == "val":
        print("load val_df")
        val_df = query_job.to_dataframe()
    else:
        print("load test_df")
        test_df = query_job.to_dataframe()
        
query = """
    SELECT * 
    FROM `gcp-ushi-digital-ds-qa.hansi_dataset.all_products_info`;
    """
query_job = client.query(query)
product_df = query_job.to_dataframe()
print("product_df = {:,}".format(len(product_df)))
all_products = np.unique(product_df.product_id)

print("train_df, val_df, test_df = {:,}, {:,}, {:,}".format(len(train_df), len(val_df), len(test_df)))

clicked_products = np.unique(np.union1d(np.union1d(train_df.ivm, val_df.ivm), test_df.ivm))
print("unique clicked_products = {:,}".format(len(clicked_products)))

Client creating using default project: gcp-ushi-digital-ds-qa
load train_df
load val_df
load test_df
product_df = 2,260,878
train_df, val_df, test_df = 240,315, 30,936, 27,733
unique clicked_products = 54,651


In [2]:
productid_to_pid = {}
pid_map_path = "/home/jupyter/jointly_rec_and_search/datasets/rec_search/search/pid_productid_map.tsv"
with open(pid_map_path) as fin:
    for line in fin:
        pid, *product_id = line.rstrip().split("\t")
        if len(product_id) >= 2:
            tmp_product_id = "\t".join(product_id)
            product_id = tmp_product_id
        else:
            product_id = product_id[0]
        productid_to_pid[product_id] = pid 
        
print(list(productid_to_pid.items())[:10])

pid_to_title = {}
with open("/home/jupyter/jointly_rec_and_search/datasets/rec_search/search/collection_title.tsv") as fin:
    for line in fin:
        pid, title = line.rstrip().split("\t")
        pid_to_title[pid] = title

[('100-199-7643', '0'), ('1000000-11813-LO1000000', '1'), ('1000001-11813-LO1000001', '2'), ('1000002-11813-LO1000002', '3'), ('1000003-11813-LO1000003', '4'), ('1000004-11813-LO1000004', '5'), ('1000005-11813-LO1000005', '6'), ('1000006-11813-LO1000006', '7'), ('1000007-11813-LO1000007', '8'), ('1000008-11813-LO1000008', '9')]


In [3]:
import os

train_aids = set()
train_examples = []
remove_train = 0
for idx, row in train_df.iterrows():
    anchor, product_id = row["anchor"], row["ivm"]
    
    if anchor not in productid_to_pid:
        remove_train += 1
        continue
    aid = productid_to_pid[anchor]
    pid = productid_to_pid[product_id]
    
    train_aids.add(aid)
    train_examples.append((aid, pid))

val_aids = set()
exclude_val_aids = set()
val_examples = []
exclude_val_examples = []
remove_val = 0
for idx, row in val_df.iterrows():
    anchor, product_id = row["anchor"], row["ivm"]
    
    if anchor not in productid_to_pid:
        remove_val += 1
        continue
    aid = productid_to_pid[anchor]
    pid = productid_to_pid[product_id]
    
    val_aids.add(aid)
    val_examples.append((aid, pid))
    
    if aid not in train_aids:
        exclude_val_aids.add(aid)
        exclude_val_examples.append((aid, pid))
        
test_aids = set()
exclude_test_aids = set()
test_examples = []
exclude_test_examples = []
remove_test = 0
for idx, row in test_df.iterrows():
    anchor, product_id = row["anchor"], row["ivm"]
    
    if anchor not in productid_to_pid:
        remove_test += 1
        continue
    aid = productid_to_pid[anchor]
    pid = productid_to_pid[product_id]
    
    test_aids.add(aid)
    test_examples.append((aid, pid))
    
    if aid not in train_aids:
        exclude_test_aids.add(aid)
        exclude_test_examples.append((aid, pid))
        

print("remove train_aids = {}, val_aids = {}, test_aids = {}".format(remove_train, remove_val, remove_test))

out_dir = "/home/jupyter/jointly_rec_and_search/datasets/rec_search/rec/"
if not os.path.exists(out_dir):
    os.mkdir(out_dir)
    
with open(os.path.join(out_dir, "anchors.train.tsv"), "w") as fout:
    with open(os.path.join(out_dir, "arels.train.tsv"), "w") as fout2:
        for aid in list(train_aids):
            fout.write(f"{aid}\t{pid_to_title[aid]}\n")
        for (aid, pid) in train_examples:
            fout2.write(f"{aid}\tQ0\t{pid}\t{1}\n")
            
with open(os.path.join(out_dir, "anchors.val.tsv"), "w") as fout:
    with open(os.path.join(out_dir, "arels.val.tsv"), "w") as fout2:
        with open(os.path.join(out_dir, "anchors.val.exclude.tsv"), "w") as fout3:
            with open(os.path.join(out_dir, "arels.val.exclude.tsv"), "w") as fout4:
                for aid in list(val_aids):
                    fout.write(f"{aid}\t{pid_to_title[aid]}\n")
                for (aid, pid) in val_examples:
                    fout2.write(f"{aid}\tQ0\t{pid}\t{1}\n")
                for aid in list(exclude_val_aids):
                    fout3.write(f"{aid}\t{pid_to_title[aid]}\n")
                for (aid, pid) in exclude_val_examples:
                    fout4.write(f"{aid}\tQ0\t{pid}\t{1}\n")

with open(os.path.join(out_dir, "anchors.test.tsv"), "w") as fout:
    with open(os.path.join(out_dir, "arels.test.tsv"), "w") as fout2:
        with open(os.path.join(out_dir, "anchors.test.exclude.tsv"), "w") as fout3:
            with open(os.path.join(out_dir, "arels.test.exclude.tsv"), "w") as fout4:
                for aid in list(test_aids):
                    fout.write(f"{aid}\t{pid_to_title[aid]}\n")
                for (aid, pid) in test_examples:
                    fout2.write(f"{aid}\tQ0\t{pid}\t{1}\n")
                for aid in list(exclude_test_aids):
                    fout3.write(f"{aid}\t{pid_to_title[aid]}\n")
                for (aid, pid) in exclude_test_examples:
                    fout4.write(f"{aid}\tQ0\t{pid}\t{1}\n")

remove train_aids = 289, val_aids = 31, test_aids = 19


In [4]:
for path in os.listdir(out_dir):
    path = os.path.join(out_dir, path)
    ! wc -l $path
    ! head -n 3 $path
    ! tail -n 3 $path
    print("="*100)

70755 /home/jupyter/jointly_rec_and_search/datasets/rec_search/rec/anchors.train.tsv
1202993	Ring Smart Lighting Outdoor Lights Transformer (2021)
1295191	Beyond Paint All-in-One Furniture-Cabinets-and-Countertop Flat Nantucket Cabinet & Furniture Paint (1-Gallon)
221497	Samsung StormWash 42-Decibel Top Control 24-in Built-In Dishwasher (Fingerprint Resistant Stainless Steel) ENERGY STAR
1751759	Bonide Captain Jack's 16-oz Concentrate Organic Natural Garden Insect Killer
2160030	ReliaBilt 1-in x 12-in x 12-ft Medium-density Primed MDF
396173	Goof Off 4-fl oz Adhesive Remover
607 /home/jupyter/jointly_rec_and_search/datasets/rec_search/rec/arels.test.exclude.tsv
11360	Q0	932343	1
12997	Q0	2106379	1
16406	Q0	92216	1
2247523	Q0	2247719	1
2248157	Q0	326986	1
2251176	Q0	1702335	1
240026 /home/jupyter/jointly_rec_and_search/datasets/rec_search/rec/arels.train.tsv
32	Q0	5369	1
32	Q0	5344	1
32	Q0	2075636	1
2260706	Q0	2260717	1
2260717	Q0	5594	1
2260717	Q0	2260706	1
12155 /home/jupyter/jointly_