In [2]:
import os

from google.cloud import bigquery
import pandas as pd 

client = bigquery.Client()
print("Client creating using default project: {}".format(client.project))


def _read_full_table(table_ref):
    """Run ``SELECT *`` on ``table_ref`` with the notebook's client and return a DataFrame."""
    sql = "\nSELECT * \nFROM `" + table_ref + "`;\n"
    return client.query(sql).to_dataframe()


# Interaction log; later cells read its `rec_record`, `search_record`,
# `feed_date` and `feed_type` columns.
df = _read_full_table("gcp-ushi-digital-ds-qa.digital_ds.hansi_recs_search_data")

# Core products; later cells read its `ivm` column.
core_prd_df = _read_full_table("gcp-ushi-digital-ds-qa.np_digital_ds.hansi_srch_recs_5_core_products")

# Product text metadata; later cells read `product_id`, `product_name` and `bulletins`.
product_df = _read_full_table("gcp-ushi-digital-ds-qa.np_digital_ds.hansi_product_text_metas")

    

Client creating using default project: gcp-ushi-digital-ds-qa


In [3]:
# read all pids 
from tqdm import tqdm

# Lookup tables keyed by the raw product id found in `product_df`.
prdid_to_title = {}  # product_id -> tab-free title
prdid_to_text = {}   # product_id -> title + " " + description
prdids = []          # product ids in row order
# Products that had no title / no bulletins and received a placeholder instead.
no_bulletin_product = set()
no_title_product = set()

for idx, row in tqdm(product_df.iterrows(), total=len(product_df)):
    product_id = row["product_id"]

    old_title = row["product_name"]
    if old_title is None:
        no_title_product.add(product_id)
        title = "no title"
    else:
        # Tabs are replaced because the title is later written into
        # tab-separated files.
        title = old_title.replace("\t", " ")
        if old_title != title:
            print("old title = {}, new title = {}".format(old_title, title))

    bulletins = row["bulletins"]
    if bulletins is None:
        no_bulletin_product.add(product_id)
        description = "no description"
    else:
        # Keep every description on a single, tab-free line.
        description = bulletins.replace("\t", " ").replace("\n", " ")

    prdid_to_title[product_id] = title
    prdid_to_text[product_id] = title + " " + description
    prdids.append(product_id)

# Every product id must appear exactly once; later cells derive dense ids
# from the position of each id in `prdids`.
assert len(prdids) == len(set(prdids))
print("number of products = {}, products without title = {}, without bulletins = {}".format(len(prdids), len(no_title_product), len(no_bulletin_product)))
if prdids:
    # Preview the last product processed. Guarded so that an empty
    # `product_df` does not raise NameError on the loop variable.
    last_product_id = prdids[-1]
    print("pid_to_title = {} \n pid_to_text = {}".format(prdid_to_title[last_product_id], prdid_to_text[last_product_id]))

 62%|██████▏   | 1228136/1987188 [01:10<00:54, 13860.34it/s]

old title = Square D Terminal	 Aluminum Ground Bar Kit, new title = Square D Terminal  Aluminum Ground Bar Kit
old title = Square D 7 Terminal	 Aluminum Ground Bar Kit, new title = Square D 7 Terminal  Aluminum Ground Bar Kit
old title = Blackburn 2 Terminal	 CopperBar, new title = Blackburn 2 Terminal  CopperBar


 63%|██████▎   | 1256057/1987188 [01:12<00:52, 13946.13it/s]

old title = Blackburn 1 Terminal	 CopperBar, new title = Blackburn 1 Terminal  CopperBar
old title = Eaton Terminal	 Galvanized Ground Bar Kit, new title = Eaton Terminal  Galvanized Ground Bar Kit


 65%|██████▌   | 1299847/1987188 [01:15<00:39, 17620.14it/s]

old title = Blackburn 1 Terminal	 Copper Rods, new title = Blackburn 1 Terminal  Copper Rods
old title = Eaton 5 Terminal	 Galvanized Ground Bar Kit, new title = Eaton 5 Terminal  Galvanized Ground Bar Kit


 71%|███████   | 1408101/1987188 [01:21<00:33, 17071.06it/s]

old title = Square D 18 Terminal	 Aluminum Ground Bar Kit, new title = Square D 18 Terminal  Aluminum Ground Bar Kit


 72%|███████▏  | 1436550/1987188 [01:23<00:30, 17762.71it/s]

old title = Square D 3 Terminal	 Aluminum Ground Bar Kit, new title = Square D 3 Terminal  Aluminum Ground Bar Kit


 78%|███████▊  | 1558556/1987188 [01:30<00:24, 17751.08it/s]

old title = Square D 4 Terminal	 Aluminum Ground Bar Kit, new title = Square D 4 Terminal  Aluminum Ground Bar Kit


 80%|███████▉  | 1579985/1987188 [01:31<00:22, 17866.60it/s]

old title = Eaton 21 Terminal	 Galvanized Ground Bar Kit, new title = Eaton 21 Terminal  Galvanized Ground Bar Kit
old title = Eaton 14 Terminal	 Galvanized Ground Bar Kit, new title = Eaton 14 Terminal  Galvanized Ground Bar Kit
old title = Blackburn 1 Terminal	 Copper Rods, new title = Blackburn 1 Terminal  Copper Rods


100%|██████████| 1987188/1987188 [01:54<00:00, 17338.81it/s]


number of products = 1987188, products without title = 182, without bulletins = 778
pid_to_title = Korky Red 3-in Flush Valve Seal for American Standard, Kohler 
 pid_to_text = Korky Red 3-in Flush Valve Seal for American Standard, Kohler Replaces American Standard Seal: 738651-0070A Exclusive longest lasting red rubber resists chlorine Made in the USA Includes: (1) Kohler tower style seal, (1) American Standard&#174; tower style seal Replaces Kohler Seal: GP1059291 5 Year warranty Easy installation


In [4]:
from tqdm import tqdm 
from copy import deepcopy

# Membership tables (dicts used as hash sets) for O(1) `in` checks.
core_products = dict.fromkeys(core_prd_df.ivm, 1)
all_products = dict.fromkeys(prdids, 1)

# Bookkeeping that is reported by the sanity-check cell.
remove_rec_examples = 0
remove_search_examples = 0
total_rec_examples = 0
total_search_examples = 0
remove_rows = 0
remove_ivms = set()
total_ivms = set()


def _keep_valid_ivms(ivms):
    """Return the ivms present in both product tables.

    Every ivm seen is recorded in `total_ivms`; every rejected one is
    recorded in `remove_ivms`.
    """
    kept = []
    for ivm in ivms:
        total_ivms.add(ivm)
        if ivm in core_products and ivm in all_products:
            kept.append(ivm)
        else:
            remove_ivms.add(ivm)
    return kept


kept_rows = []
for idx, df_row in tqdm(df.iterrows(), total=len(df)):
    # Work on a copy so the source frame is never modified.
    row = deepcopy(df_row)

    # Recommendation records: the anchor must be a known product and at
    # least one interacted ivm must survive the filter.
    new_rec_records = []
    for rec_record in row["rec_record"]:
        total_rec_examples += 1
        anchor, ivms = rec_record["anchor"], rec_record["interacted_ivms"]
        if anchor not in all_products:
            remove_rec_examples += 1
            continue
        new_ivms = _keep_valid_ivms(ivms)
        if new_ivms:
            assert anchor in all_products
            new_rec_records.append({"anchor": anchor, "interacted_ivms": new_ivms})
        else:
            remove_rec_examples += 1

    # Search records: only the interacted ivms are filtered.
    new_search_records = []
    for search_record in row["search_record"]:
        total_search_examples += 1
        query, ivms = search_record["query"], search_record["interacted_ivms"]
        new_ivms = _keep_valid_ivms(ivms)
        if new_ivms:
            new_search_records.append({"query": query, "interacted_ivms": new_ivms})
        else:
            remove_search_examples += 1

    row["rec_record"] = new_rec_records
    row["search_record"] = new_search_records
    if not new_rec_records and not new_search_records:
        # Nothing usable is left in this row.
        remove_rows += 1
        continue
    kept_rows.append(row)

filtered_df = pd.DataFrame(data=kept_rows).sort_values(by="feed_date")

100%|██████████| 759677/759677 [01:38<00:00, 7676.11it/s]


In [5]:
# Sanity check: report how much the filtering step removed, as counts and fractions.
rec_removed_frac = remove_rec_examples / total_rec_examples
search_removed_frac = remove_search_examples / total_search_examples
print("remove rec_example = {} ({:.3f}), remove search example = {} ({:.3f})".format(
    remove_rec_examples, rec_removed_frac, remove_search_examples, search_removed_frac))

rows_removed_frac = remove_rows / len(df)
ivms_removed_frac = len(remove_ivms) / len(total_ivms)
print("remove rows = {} ({:.3f}), remove ivms = {} ({:.3f})".format(
    remove_rows, rows_removed_frac, len(remove_ivms), ivms_removed_frac))

print("original total rec example = {}, search example = {}, ivms = {}".format(
    total_rec_examples, total_search_examples, len(total_ivms)))
    

remove rec_example = 65827 (0.061), remove search example = 81082 (0.069)
remove rows = 7970 (0.010), remove ivms = 66302 (0.294)
original total rec example = 1087786, search example = 1179024, ivms = 225539


In [6]:
# Flatten the filtered rows into one training example per
# (anchor, product) and per (query, product) interaction.
rec_examples = []
search_examples = []
for idx, df_row in tqdm(filtered_df.iterrows(), total=len(filtered_df)):
    row = deepcopy(df_row)

    for rec_record in row["rec_record"]:
        anchor = rec_record["anchor"]
        ivms = rec_record["interacted_ivms"]
        for ivm in ivms:
            # The filtering cell should already guarantee this.
            assert ivm in core_products and ivm in all_products and anchor in all_products, anchor
            rec_examples.append({
                "feed_date": row["feed_date"],
                "feed_type": row["feed_type"],
                "anchor": anchor,
                "product_id": ivm,
            })

    for search_record in row["search_record"]:
        query = search_record["query"]
        ivms = search_record["interacted_ivms"]
        for ivm in ivms:
            assert ivm in core_products and ivm in all_products
            search_examples.append({
                "feed_date": row["feed_date"],
                "feed_type": row["feed_type"],
                "query": query,
                "product_id": ivm,
            })

print("rec_example = {}, search_example = {}".format(len(rec_examples), len(search_examples)))
# Fraction of all observed ivms that do not appear in the product list.
ivms_in_products = total_ivms & set(prdids)
missing_ivm_count = len(total_ivms) - len(ivms_in_products)
print("# remove_imvs since not in all_products = {:.3f}".format(missing_ivm_count / len(total_ivms)))

100%|██████████| 751707/751707 [01:17<00:00, 9692.95it/s]


rec_example = 1267877, search_example = 1420720
# remove_imvs since not in all_products = 0.059


In [8]:
import numpy as np

# Chronological split boundaries: the first 80% of the distinct feed dates
# are for training, the next 10% for validation and the last 10% for test.
unique_dates = np.unique(filtered_df.feed_date)  # sorted distinct dates
start_date, end_date = unique_dates[0], unique_dates[-1]
val_start_date = unique_dates[int(len(unique_dates) * 0.8)]
test_start_date = unique_dates[int(len(unique_dates) * 0.9)]
# The original template had no placeholder for test_start_date, so its
# value was silently dropped from the output.
print("start_date = {}, end_date = {}, val_start_date = {}, test_start_date = {}".format(start_date, end_date, val_start_date, test_start_date))
len(unique_dates)

start_date = 2022-03-05, end_date = 2022-05-06, val_start_date = 2022-04-24, test_start_date


63

In [10]:
# write files 
import os 
import pickle

import ujson
import numpy as np

base_dir = "/home/jupyter/jointly_rec_and_search/datasets/jointly_rec_and_search/"

# Dense integer ids: a product's pid is its position in `prdids`; a query's
# qid is the order in which it is first seen in `search_examples`.
prdid_to_pid = {prdid: pid for pid, prdid in enumerate(prdids)}
query_to_qid = {}
for search_exp in search_examples:
    query_to_qid.setdefault(search_exp["query"], len(query_to_qid))

# Inverse maps; equal sizes confirm that both mappings are one-to-one.
pid_to_prdid = {pid: prdid for prdid, pid in prdid_to_pid.items()}
qid_to_query = {qid: query for query, qid in query_to_qid.items()}
assert len(pid_to_prdid) == len(prdid_to_pid) and len(query_to_qid) == len(qid_to_query)

with open(os.path.join(base_dir, "all_queries.tsv"), "w") as queries_out:
    queries_out.writelines(f"{qid}\t{query}\n" for qid, query in qid_to_query.items())

with open(os.path.join(base_dir, "pid_to_product_id.tsv"), "w") as id_map_out:
    id_map_out.writelines(f"{pid}\t{product_id}\n" for pid, product_id in pid_to_prdid.items())

with open(os.path.join(base_dir, "pid_to_product_id.pkl"), "wb") as id_map_pkl:
    pickle.dump(pid_to_prdid, id_map_pkl)

# collection.tsv holds the full product text, collection_title.tsv only the
# title; the same content is also kept in memory keyed by pid.
pid_to_text, pid_to_title = {}, {}
with open(os.path.join(base_dir, "collection.tsv"), "w") as text_out, \
        open(os.path.join(base_dir, "collection_title.tsv"), "w") as title_out:
    for pid, product_id in pid_to_prdid.items():
        pid_to_text[pid] = prdid_to_text[product_id]
        pid_to_title[pid] = prdid_to_title[product_id]
        text_out.write(f"{pid}\t{pid_to_text[pid]}\n")
        title_out.write(f"{pid}\t{pid_to_title[pid]}\n")

print("len pid_to_text = {} pid_to_title = {}".format(len(pid_to_text), len(pid_to_title)))
# Split train, val, test set chronologically: the first 80% of the distinct
# feed dates are train, the next 10% validation, the last 10% test.
unique_dates = np.unique(filtered_df.feed_date)
start_date, end_date = unique_dates[0], unique_dates[-1]
val_start_date = unique_dates[int(len(unique_dates) * 0.8)]
test_start_date = unique_dates[int(len(unique_dates) * 0.9)]
# The original template had no placeholder for test_start_date, so its
# value never appeared in the output.
print("start_date = {}, end_date = {}, val_start_date = {}, test_start_date = {}".format(start_date, end_date, val_start_date, test_start_date))


def _split_by_feed_date(examples):
    """Partition example dicts into (train, val, test) lists by "feed_date".

    Each dict's "feed_date" is compared against `val_start_date` and
    `test_start_date`, then replaced in place by its "%Y-%m-%d" string so
    the example can be serialised. Input order is preserved in each list.
    """
    train, val, test = [], [], []
    for example in examples:
        feed_date = example["feed_date"]
        if feed_date < val_start_date:
            target = train
        elif feed_date < test_start_date:
            target = val
        else:
            assert feed_date >= test_start_date
            target = test
        example["feed_date"] = feed_date.strftime("%Y-%m-%d")
        target.append(example)
    return train, val, test


# Recommendation examples, with product ids mapped to dense pids.
train_rec_examples, val_rec_examples, test_rec_examples = _split_by_feed_date(
    {
        "feed_date": rec_row["feed_date"],
        "feed_type": rec_row["feed_type"],
        "anchor_pid": prdid_to_pid[rec_row["anchor"]],
        "pid": prdid_to_pid[rec_row["product_id"]],
    }
    for rec_row in rec_examples
)
assert len(train_rec_examples) + len(val_rec_examples) + len(test_rec_examples) == len(rec_examples), (len(train_rec_examples), len(val_rec_examples), len(test_rec_examples))

# Search examples, with queries mapped to qids and product ids to pids.
train_search_examples, val_search_examples, test_search_examples = _split_by_feed_date(
    {
        "feed_date": search_row["feed_date"],
        "feed_type": search_row["feed_type"],
        "qid": query_to_qid[search_row["query"]],
        "pid": prdid_to_pid[search_row["product_id"]],
    }
    for search_row in search_examples
)
assert len(train_search_examples) + len(val_search_examples) + len(test_search_examples) == len(search_examples), (len(train_search_examples), len(val_search_examples), len(test_search_examples))

def _write_rec_split(split, examples, show_progress=False):
    """Write the four recommendation files of one split into `base_dir`.

    Files written, where <split> is the `split` argument:
      <split>_rec_examples.json          one JSON example per line
      <split>_anchors.tsv                anchor_pid <TAB> anchor title, once per anchor
      <split>_anchor_qrels.tsv           anchor_pid Q0 pid 1, once per unique pair
      <split>_anchor_qrels_clicknum.tsv  anchor_pid <TAB> pid <TAB> click count

    Args:
        split: file-name prefix, e.g. "train", "val" or "test".
        examples: list of dicts holding at least "anchor_pid" and "pid".
        show_progress: wrap the example loop in a tqdm progress bar.

    Returns:
        dict mapping anchor_pid -> {pid: click count} for this split.
    """
    anchor_qrels = {}
    with open(os.path.join(base_dir, f"{split}_rec_examples.json"), "w") as examples_out, \
            open(os.path.join(base_dir, f"{split}_anchors.tsv"), "w") as anchors_out, \
            open(os.path.join(base_dir, f"{split}_anchor_qrels.tsv"), "w") as qrels_out, \
            open(os.path.join(base_dir, f"{split}_anchor_qrels_clicknum.tsv"), "w") as clicknum_out:
        for rec_exp in (tqdm(examples) if show_progress else examples):
            examples_out.write(ujson.dumps(rec_exp) + "\n")

            # Read the clicked product from the example itself. The original
            # code never did this and counted clicks against a stale `pid`
            # left over from an earlier loop.
            anchor_pid, pid = rec_exp["anchor_pid"], rec_exp["pid"]

            if anchor_pid not in anchor_qrels:
                # First occurrence of this anchor: emit its title once.
                anchor_qrels[anchor_pid] = {}
                title = pid_to_title[anchor_pid]
                anchors_out.write(f"{anchor_pid}\t{title}\n")

            clicks = anchor_qrels[anchor_pid]
            clicks[pid] = clicks.get(pid, 0) + 1

        for anchor_pid, clicks in anchor_qrels.items():
            for pid, click_num in clicks.items():
                qrels_out.write(f"{anchor_pid}\tQ0\t{pid}\t1\n")
                clicknum_out.write(f"{anchor_pid}\t{pid}\t{click_num}\n")
    return anchor_qrels


# Only the (large) train split shows a progress bar, as before.
for split, examples in (("train", train_rec_examples),
                        ("val", val_rec_examples),
                        ("test", test_rec_examples)):
    anchor_qrels = _write_rec_split(split, examples, show_progress=(split == "train"))
# Keep the module-level name the original cell left behind (anchors of the
# last split written).
anchor_pid_dict = set(anchor_qrels)

def _write_search_split(split, examples, show_progress=False):
    """Write the four search files of one split into `base_dir`.

    Files written, where <split> is the `split` argument:
      <split>_search_examples.json      one JSON example per line
      <split>_queries.tsv               qid <TAB> query text, once per qid
      <split>_query_qrels.tsv           qid Q0 pid 1, once per unique pair
      <split>_query_qrels_clicknum.tsv  qid <TAB> pid <TAB> click count

    Args:
        split: file-name prefix, e.g. "train", "val" or "test".
        examples: list of dicts holding at least "qid" and "pid".
        show_progress: wrap the example loop in a tqdm progress bar.

    Returns:
        dict mapping qid -> {pid: click count} for this split.
    """
    qid_qrels = {}
    with open(os.path.join(base_dir, f"{split}_search_examples.json"), "w") as examples_out, \
            open(os.path.join(base_dir, f"{split}_queries.tsv"), "w") as queries_out, \
            open(os.path.join(base_dir, f"{split}_query_qrels.tsv"), "w") as qrels_out, \
            open(os.path.join(base_dir, f"{split}_query_qrels_clicknum.tsv"), "w") as clicknum_out:
        for search_exp in (tqdm(examples) if show_progress else examples):
            examples_out.write(ujson.dumps(search_exp) + "\n")

            # Read the clicked product from the example itself. The original
            # val/test loops never did this and reused a stale `pid`.
            qid, pid = search_exp["qid"], search_exp["pid"]

            if qid not in qid_qrels:
                # First occurrence of this query: emit its text once.
                qid_qrels[qid] = {}
                query = qid_to_query[qid]
                queries_out.write(f"{qid}\t{query}\n")

            clicks = qid_qrels[qid]
            clicks[pid] = clicks.get(pid, 0) + 1

        # Qrels are written once per unique (qid, pid) pair. The original
        # train loop also wrote a line per example, which duplicated every
        # pair in train_query_qrels.tsv.
        for qid, clicks in qid_qrels.items():
            for pid, click_num in clicks.items():
                qrels_out.write(f"{qid}\tQ0\t{pid}\t1\n")
                clicknum_out.write(f"{qid}\t{pid}\t{click_num}\n")
    return qid_qrels


# Only the (large) train split shows a progress bar, as before.
for split, examples in (("train", train_search_examples),
                        ("val", val_search_examples),
                        ("test", test_search_examples)):
    qid_qrels = _write_search_split(split, examples, show_progress=(split == "train"))
# Keep the module-level name the original cell left behind (qids of the
# last split written).
qid_dict = set(qid_qrels)

    




len pid_to_text = 1987188 pid_to_title = 1987188
start_date = 2022-03-05, end_date = 2022-05-06, val_start_date = 2022-04-24, test_start_date


100%|██████████| 1000992/1000992 [00:02<00:00, 387949.79it/s]
100%|██████████| 1118512/1118512 [00:03<00:00, 310884.28it/s]


In [12]:
# sanity check
path = os.path.join(base_dir, "train_rec_examples.json")
! wc -l $path
! head -n 3 $path
! tail -n 3 $path
print("="*100)
path = os.path.join(base_dir, "val_rec_examples.json")
! wc -l $path
! head -n 3 $path
! tail -n 3 $path
print("="*100)
path = os.path.join(base_dir, "test_rec_examples.json")
! wc -l $path
! head -n 3 $path
! tail -n 3 $path
print("="*100)
path = os.path.join(base_dir, "train_search_examples.json")
! wc -l $path
! head -n 3 $path
! tail -n 3 $path
print("="*100)
path = os.path.join(base_dir, "val_search_examples.json")
! wc -l $path
! head -n 3 $path
! tail -n 3 $path
print("="*100)
path = os.path.join(base_dir, "test_search_examples.json")
! wc -l $path
! head -n 3 $path
! tail -n 3 $path
print("="*100)
print("all search examples = {}".format(len(search_examples)))
path = os.path.join(base_dir, "collection.tsv")
! wc -l $path
! head -n 3 $path
! tail -n 3 $path
print("="*100)
path = os.path.join(base_dir, "collection_title.tsv")
! wc -l $path
! head -n 3 $path
! tail -n 3 $path
print("="*100)
path = os.path.join(base_dir, "all_queries.tsv")
! wc -l $path
! head -n 3 $path
! tail -n 3 $path
print("="*100)

list_fns = ["train_queries.tsv", "val_queries.tsv", "test_queries.tsv"]
for fn in list_fns:
    path = os.path.join(base_dir, fn)
    ! wc -l $path
    ! head -n 3 $path
    ! tail -n 3 $path
    print("="*100)
    
list_fns = ["train_query_qrels.tsv", "val_query_qrels.tsv", "test_query_qrels.tsv", 
            "train_query_qrels_clicknum.tsv", "val_query_qrels_clicknum.tsv", "test_query_qrels_clicknum.tsv"]
for fn in list_fns:
    path = os.path.join(base_dir, fn)
    ! wc -l $path
    ! head -n 3 $path
    ! tail -n 3 $path
    print("="*100)
    
list_fns = ["train_anchors.tsv", "val_anchors.tsv", "test_anchors.tsv"]
for fn in list_fns:
    path = os.path.join(base_dir, fn)
    ! wc -l $path
    ! head -n 3 $path
    ! tail -n 3 $path
    print("="*100)
    
list_fns = ["train_anchor_qrels.tsv", "val_anchor_qrels.tsv", "test_anchor_qrels.tsv",
           "train_anchor_qrels_clicknum.tsv", "val_anchor_qrels_clicknum.tsv", "test_anchor_qrels_clicknum.tsv"]
for fn in list_fns:
    path = os.path.join(base_dir, fn)
    ! wc -l $path
    ! head -n 3 $path
    ! tail -n 3 $path
    print("="*100)

1000992 /home/jupyter/jointly_rec_and_search/datasets/jointly_rec_and_search/train_rec_examples.json
{"feed_date":"2022-03-05","feed_type":"desktop","anchor_pid":1908294,"pid":1691387}
{"feed_date":"2022-03-05","feed_type":"desktop","anchor_pid":1684969,"pid":1810591}
{"feed_date":"2022-03-05","feed_type":"desktop","anchor_pid":1745463,"pid":1545588}
{"feed_date":"2022-04-23","feed_type":"desktop","anchor_pid":1726171,"pid":1884870}
{"feed_date":"2022-04-23","feed_type":"desktop","anchor_pid":1726171,"pid":1957647}
{"feed_date":"2022-04-23","feed_type":"mobile","anchor_pid":945617,"pid":945616}
130335 /home/jupyter/jointly_rec_and_search/datasets/jointly_rec_and_search/val_rec_examples.json
{"feed_date":"2022-04-24","feed_type":"desktop","anchor_pid":983902,"pid":732894}
{"feed_date":"2022-04-24","feed_type":"desktop","anchor_pid":1891227,"pid":1891228}
{"feed_date":"2022-04-24","feed_type":"mobile","anchor_pid":770457,"pid":888654}
{"feed_date":"2022-04-29","feed_type":"desktop","anch

In [None]:
import numpy as np 
# Scratch check: display the feed date found 90% of the way through the
# sorted distinct dates.
unique_dates = np.unique(filtered_df.feed_date)
test_cut_index = int(len(unique_dates) * 0.9)
unique_dates[test_cut_index]

In [None]:
# Scratch check: the query<->qid maps are inverses only if their sizes match.
len(qid_to_query) == len(query_to_qid)