In [1]:
import os
import re 
from collections import defaultdict

import pandas as pd 
from tqdm import tqdm

# Folder holding the raw ESCI CSV files; the output cell below reuses it as its target folder.
root_dir = "/home/jupyter/unity_jointly_rec_and_search/datasets/amazon_esci_dataset/data/processed/public/task_1_query_product_ranking/"

# preprocess product_df
# Column names of the product catalogue.
col_product_id = "product_id"
col_product_title = "product_title"
col_product_description = "product_description"
col_product_bullet = 'product_bullet_point'
col_product_brand = 'product_brand'
col_product_color = "product_color_name"
col_product_locale = "product_locale"

product_catalogue_path = os.path.join(root_dir, "product_catalogue-v0.3.csv")

# Character class of punctuation plus newline. The raw string yields the same value as
# before but avoids the invalid "\." escape of a normal string literal.
# NOTE(review): this pattern is not applied anywhere in the visible notebook.
replace_pattern = re.escape(r";+/\.?")
replace_pattern = f"[{replace_pattern}\n]+"

# Matches an opening or a closing tag such as <b> or </li>. Both forms are replaced by a
# single space, exactly as the former two-step "<\w+>" / "</\w+>" replacement did.
html_tag_pattern = r"</?\w+>"

product_df = pd.read_csv(product_catalogue_path)
# Restrict to the "us" locale first so the regex cleanup only touches rows that are kept.
# .copy() makes the result an independent frame that is safe to assign columns on.
product_df = product_df[product_df[col_product_locale] == "us"].copy()
# Missing values in every column become the literal string 'unknown'.
product_df = product_df.fillna('unknown')
for text_col in (col_product_description, col_product_bullet):
    product_df[text_col] = (
        product_df[text_col]
        .str.replace(html_tag_pattern, " ", regex=True)
        .str.strip()
    )


# Integer ids ("pid") are assigned to the distinct catalogue product ids ("ivm") in the
# order pandas' unique() returns them; ivm_to_pid is the inverse lookup.
pid_to_ivm = dict(enumerate(product_df.product_id.unique()))
ivm_to_pid = {ivm: pid for pid, ivm in pid_to_ivm.items()}

# Integer id of every catalogue row, aligned with the row order of product_df.
row_pids = [ivm_to_pid[ivm] for ivm in product_df.product_id]

# Per-attribute lookups keyed by the integer id.
pid_to_title = dict(zip(row_pids, product_df.product_title))
pid_to_desc = dict(zip(row_pids, product_df.product_description))
pid_to_brand = dict(zip(row_pids, product_df.product_brand))
pid_to_color = dict(zip(row_pids, product_df.product_color_name))

# All four sizes agree only when every catalogue row has its own product id.
sizes = {len(product_df), len(pid_to_ivm), len(pid_to_title), len(ivm_to_pid)}
assert len(sizes) == 1

# preprocess train_df
train_df = pd.read_csv(os.path.join(root_dir, "train-v0.3.csv"))
# .copy(): a column is added below, and assigning onto a filtered slice of the original
# frame triggers pandas' SettingWithCopyWarning.
train_df = train_df[train_df.query_locale == "us"].copy()
# Query ids are offset by the number of products, so they start right after the integer
# product ids built above and both kinds of id can share one file (see all_entities.tsv).
query_to_qid = {query: qid + len(product_df) for qid, query in enumerate(train_df["query"].unique())}
# Every query is a key of query_to_qid by construction, so a dict-based map is a plain lookup.
train_df["query_id"] = train_df["query"].map(query_to_qid)

# Per-query sets of integer product ids, one mapping per ESCI label that is kept.
qid_to_relpids = defaultdict(set)    # label "exact"
qid_to_simpids = defaultdict(set)    # label "substitute"
qid_to_complpids = defaultdict(set)  # label "complement"

label_to_bucket = {
    "exact": qid_to_relpids,
    "substitute": qid_to_simpids,
    "complement": qid_to_complpids,
}

# Walk the three needed columns in lockstep. DataFrame.iterrows() would build a full
# Series object for every row, which is far slower and gives the same values here.
judgements = zip(train_df["query_id"], train_df["product_id"], train_df["esci_label"])
for qid, ivm, label in tqdm(judgements, total=len(train_df)):
    # Looked up for every row (including "irrelevant" ones): a KeyError means the judged
    # product is absent from the filtered catalogue.
    pid = ivm_to_pid[ivm]
    bucket = label_to_bucket.get(label)
    if bucket is not None:
        bucket[qid].add(pid)
    else:
        # "irrelevant" rows are dropped; any other label is unexpected.
        assert label == "irrelevant", label

100%|██████████| 419730/419730 [00:24<00:00, 17404.50it/s]


In [2]:
import numpy as np 
import pickle as pkl
import random
# Fixed seed so the random splits drawn further down in this cell are repeatable.
random.seed(4680)

# Anchor-item view of the labels: every "exact" product of a query (the anchor, "aid")
# inherits that query's substitute products and complement products.
aid_to_simpids = defaultdict(set)
aid_to_complpids = defaultdict(set)

for qid, anchor_pids in tqdm(qid_to_relpids.items(), total=len(qid_to_relpids)):
    # Test membership before indexing: indexing a defaultdict would insert an empty entry.
    has_sim = qid in qid_to_simpids
    has_compl = qid in qid_to_complpids
    for aid in anchor_pids:
        if has_sim:
            aid_to_simpids[aid].update(qid_to_simpids[qid])
        if has_compl:
            aid_to_complpids[aid].update(qid_to_complpids[qid])


def _mean_set_size(mapping):
    """Average number of elements per value of `mapping`."""
    return np.mean([len(xs) for xs in mapping.values()])


print("number of aids for sim_rec, compl_rec = {:,}, {:,}, qids for search = {:,}".format(
    len(aid_to_simpids), len(aid_to_complpids), len(qid_to_relpids)))
print("average rels for sim_rec, compl_rec, search = {:.3f}, {:.3f}, {:.3f}".format(
    _mean_set_size(aid_to_simpids), _mean_set_size(aid_to_complpids), _mean_set_size(qid_to_relpids)))


# train & valid & test split
def _split_by_anchor(aid_to_pids, holdout_frac=0.2):
    """Randomly split an anchor -> pids mapping into train / val / test dicts.

    A `holdout_frac` share of the anchors is sampled with the global `random` state; the
    first half of that sample becomes validation, the rest test, and every other anchor
    goes to train. Each returned dict keeps the key order of `aid_to_pids`.

    Returns:
        (train, val, test) tuple of dicts sharing the value objects of `aid_to_pids`.
    """
    # random.sample requires a sequence: passing dict keys directly raises TypeError on
    # Python >= 3.11. Materialising the keys as a list draws the same sample.
    holdout = random.sample(list(aid_to_pids.keys()), int(holdout_frac * len(aid_to_pids)))
    half = len(holdout) // 2
    # Sets give O(1) membership tests; the former list lookups made the loop quadratic.
    val_aids, test_aids = set(holdout[:half]), set(holdout[half:])

    train, val, test = {}, {}, {}
    for aid, pids in aid_to_pids.items():
        if aid in val_aids:
            val[aid] = pids
        elif aid in test_aids:
            test[aid] = pids
        else:
            train[aid] = pids
    return train, val, test


# for similar items (drawn first so the random stream is consumed in the original order)
train_aid_to_simpids, val_aid_to_simpids, test_aid_to_simpids = _split_by_anchor(aid_to_simpids)

# for complementary items
train_aid_to_complpids, val_aid_to_complpids, test_aid_to_complpids = _split_by_anchor(aid_to_complpids)

# for queries
# Shuffle the (qid, relevant pids) pairs once, then cut the list at 80% and 90%.
qid_pids_pairs = list(qid_to_relpids.items())
random.shuffle(qid_pids_pairs)

n_pairs = len(qid_pids_pairs)
train_end = int(0.8 * n_pairs)
val_end = int(0.9 * n_pairs)

train_qid_to_relpids = dict(qid_pids_pairs[:train_end])
val_qid_to_relpids = dict(qid_pids_pairs[train_end:val_end])
test_qid_to_relpids = dict(qid_pids_pairs[val_end:])

# Report how many keys ended up in each split of the three tasks.
split_report = [
    ("number of aid_to_simpids  train = {:,}, val = {:,}, test = {:,}",
     (train_aid_to_simpids, val_aid_to_simpids, test_aid_to_simpids)),
    ("number of aid_to_complpids train = {:,}, val = {:,}, test = {:,}",
     (train_aid_to_complpids, val_aid_to_complpids, test_aid_to_complpids)),
    ("number of qid_to_relpids train = {:,}, val = {:,}, test = {:,}",
     (train_qid_to_relpids, val_qid_to_relpids, test_qid_to_relpids)),
]
for template, splits in split_report:
    print(template.format(*(len(split) for split in splits)))
def _assert_disjoint_splits(train, val, test):
    """Assert that no key is shared by any two of the three split dicts.

    The splits must be pairwise disjoint. Checking only the three-way intersection
    (train & val & test) would still pass when, say, train and val share keys that
    test does not contain.
    """
    train_keys, val_keys, test_keys = set(train), set(val), set(test)
    assert train_keys.isdisjoint(val_keys), "train/val overlap"
    assert train_keys.isdisjoint(test_keys), "train/test overlap"
    assert val_keys.isdisjoint(test_keys), "val/test overlap"


_assert_disjoint_splits(train_aid_to_simpids, val_aid_to_simpids, test_aid_to_simpids)
_assert_disjoint_splits(train_aid_to_complpids, val_aid_to_complpids, test_aid_to_complpids)
_assert_disjoint_splits(train_qid_to_relpids, val_qid_to_relpids, test_qid_to_relpids)

100%|██████████| 20888/20888 [00:00<00:00, 37745.40it/s]


number of aids for sim_rec, compl_rec = 146,300, 48,842, qids for search = 20,888
average rels for sim_rec, compl_rec, search = 8.218, 3.587, 8.706


100%|██████████| 146300/146300 [01:41<00:00, 1448.45it/s]
100%|██████████| 48842/48842 [00:09<00:00, 4940.66it/s]


number of aid_to_simpids  train = 117,040, val = 14,630, test = 14,630
number of aid_to_complpids train = 39,074, val = 4,884, test = 4,884
number of qid_to_relpids train = 16,710, val = 2,089, test = 2,089


In [1]:
# Back-of-envelope count of (anchor, similar item) pairs: the number of sim_rec anchors
# times the average number of similar items per anchor, both copied from the printed stats.
146_300 * 8.218

1202293.4

In [3]:
import ujson 

# Outputs are written into the same folder the raw data was read from.
out_dir = root_dir
# makedirs(exist_ok=True) creates any missing parent folders and is a no-op when the
# folder already exists, without a separate exists() check that could race.
os.makedirs(out_dir, exist_ok=True)
    
# Characters that would break the "one record per line, two tab-separated fields" layout.
_TSV_BREAKERS = re.compile(r"[\t\r\n]+")


def _write_tsv(filename, id_text_pairs):
    """Write `id<TAB>text` lines to `filename` inside `out_dir`.

    Runs of tabs, carriage returns and newlines inside the text are collapsed to a single
    space so each record stays on one line with exactly two fields. The file is written
    as UTF-8 regardless of the platform default encoding.
    """
    with open(os.path.join(out_dir, filename), "w", encoding="utf-8") as fout:
        for entity_id, text in id_text_pairs:
            clean_text = _TSV_BREAKERS.sub(" ", str(text))
            fout.write(f"{entity_id}\t{clean_text}\n")


# One collection file per product attribute, keyed by the integer product id.
_write_tsv("collection_title.tsv", pid_to_title.items())
_write_tsv("collection_description.tsv", pid_to_desc.items())
_write_tsv("collection_brand.tsv", pid_to_brand.items())
_write_tsv("collection_color.tsv", pid_to_color.items())

# Product titles as JSON lines with "id" / "contents" fields.
with open(os.path.join(out_dir, "product.jsonl"), "w") as fout:
    for pid, title in pid_to_title.items():
        example = {"id": pid, "contents": title}
        fout.write(ujson.dumps(example) + "\n")

# query_to_qid maps text -> id, so the pairs are flipped to (id, text) for writing.
qid_query_pairs = [(qid, query) for query, qid in query_to_qid.items()]
_write_tsv("all_queries.tsv", qid_query_pairs)

# Products (by title) followed by queries in a single file.
_write_tsv("all_entities.tsv", list(pid_to_title.items()) + qid_query_pairs)
        
def _dump_pickle(obj, filename):
    """Pickle `obj` into `filename` inside `out_dir`."""
    with open(os.path.join(out_dir, filename), "wb") as fout:
        pkl.dump(obj, fout)


# Id lookup tables.
_dump_pickle(ivm_to_pid, "ivm_to_pid.pkl")
_dump_pickle(query_to_qid, "query_to_qid.pkl")

# Train / val / test splits, file name -> data.
# NOTE(review): "test_aid_to_complpids" and "test_qid_to_relpids" have no ".pkl" suffix,
# unlike their siblings. The names are kept as they are because renaming them would break
# anything that loads the existing files; confirm against the consumers before changing.
fn_to_data = {
    "train_aid_to_simpids.pkl": train_aid_to_simpids,
    "val_aid_to_simpids.pkl": val_aid_to_simpids,
    "test_aid_to_simpids.pkl": test_aid_to_simpids,

    "train_aid_to_complpids.pkl": train_aid_to_complpids,
    "val_aid_to_complpids.pkl": val_aid_to_complpids,
    "test_aid_to_complpids": test_aid_to_complpids,

    "train_qid_to_relpids.pkl": train_qid_to_relpids,
    "val_qid_to_relpids.pkl": val_qid_to_relpids,
    "test_qid_to_relpids": test_qid_to_relpids,
}
for split_filename, split_data in fn_to_data.items():
    _dump_pickle(split_data, split_filename)

# The processed frames are saved with pandas' default to_csv settings.
for frame, csv_name in ((product_df, "processed_product_df.csv"), (train_df, "processed_train_df.csv")):
    frame.to_csv(os.path.join(out_dir, csv_name))

In [11]:
# sanity check
for path in os.listdir(out_dir):
    if path.endswith("tsv") or  path.endswith("jsonl") :
        path = os.path.join(out_dir, path)
        ! wc -l $path
        ! head -n 3 $path
        ! tail -n 3 $path
        print("="*100)

482198 /home/jupyter/unity_jointly_rec_and_search/datasets/amazon_esci_dataset/data/processed/public/task_1_query-product_ranking/collection_color.tsv
0	Yellow
1	12-count
2	Yellow
482195	6pcs Maple Leaf Stems
482196	unknown
482197	Red
482198 /home/jupyter/unity_jointly_rec_and_search/datasets/amazon_esci_dataset/data/processed/public/task_1_query-product_ranking/product.jsonl
{"id":0,"contents":"Amazon Basics Woodcased #2 Pencils, Unsharpened, HB Lead - Box of 144, Bulk Box"}
{"id":1,"contents":"BAZIC Pencil #2 HB Pencils, Latex Free Eraser, Wood Free Yellow Unsharpened Pencils for Exam School Office (12\/Pack), 1-Pack"}
{"id":2,"contents":"Emraw Pre Sharpened Round Primary Size No 2 Jumbo Pencils for Preschoolers, Elementary Kids - Pack of 8 Premium Fat Pencils"}
{"id":482195,"contents":"Maple Leaves Branches, Arowner 6PCS Artificial Fall Leaf Bush Silk Autumn Foliage Shrubs Fake Plants for Indoor Outdoor Home Kitchen Festival Thanksgiving Table Centerpieces Arrangement Decor"}
{"id":

In [5]:
# Spot check: the title of product 160 next to the titles of 383368 and 174, two of the
# ids listed for anchor 160 in the recorded aid_to_simpids sample.
tuple(pid_to_title[pid] for pid in (383368, 160, 174))

('Price Tags with String Attached, 1000pcs White Smooth Surface Marking Merchandise Strung Tags Writable Label Hang Tags for Pricing Gift Jewelry Clothing Yard Sale Garage Supplies 1.75 x 1.093 inch',
 'KC Store Fixtures 08902 Perforated Merchandise Tags without Strings, 1-3/4" x 2-7/8", Orange (Pack of 1000)',
 'White Paper Tags, Jewelry Price Tags with string (3/8" x 7/8")')

In [6]:
# Peek at five consecutive anchor -> similar-items entries (positions 100 to 104 in
# insertion order).
[(aid, aid_to_simpids[aid]) for aid in list(aid_to_simpids)[100:105]]

[(152, {145, 156, 157, 359351}),
 (153, {145, 156, 157, 359351}),
 (160,
  {159,
   165,
   167,
   173,
   174,
   176,
   177,
   178,
   179,
   180,
   255586,
   255588,
   255602,
   383364,
   383365,
   383368,
   383376,
   383383,
   411563}),
 (161,
  {159,
   165,
   167,
   173,
   174,
   176,
   177,
   178,
   179,
   180,
   255586,
   255588,
   255602,
   383364,
   383365,
   383368,
   383376,
   383383,
   411563}),
 (162,
  {159,
   165,
   167,
   173,
   174,
   176,
   177,
   178,
   179,
   180,
   255586,
   255588,
   255602,
   383364,
   383365,
   383368,
   383376,
   383383,
   411563})]

In [17]:
# Spot check of one product's title and description.
# NOTE(review): the brand is looked up for a different id (317231) than the title and
# description (383376); confirm whether that is intended.
(
    pid_to_title[383376],
    pid_to_desc[383376],
    pid_to_brand[317231],
)

('500Pcs Price Tags with String Attached by Divine Light, 0.91 x 0.55 inches Premium Writable Jewelry Tags, Paper Sale Tags with String Pricing Tags - for Anything You Need to Identify or Price',
 'Please check the size carefully before you order it, The size of our price tags is 0.91 x 0.55 inches.    Divine Light sale tags with string are ideal for labeling party favors, using for gift tags, craft projects, identifying household items, or garage sale price tags   ► Widely use - Great for jewelry or any small crafts!   ► Enough space to write - The size of price tags is 0.91 x 0.55 inches, give you enough space for decoration or writing a big name or a short message  ► Easy to Use and remove - Knotted string lets you add and remove the rectangular price tag easily without tying knots, durable and convenient   Specifications  Size: 0.91 x 0.55 inches Color: White Material: Paper   Package includes  500 x Good quality Price tags   CLICK ‘ADD TO CART’ GET IT NOW!',
 'Zephyr')