In [1]:
import os 

from google.cloud import bigquery
import pandas as pd 
import numpy as np

client = bigquery.Client()
print("Client creating using default project: {}".format(client.project))


def _load_table(bq_client, table):
    """Download every row of a BigQuery table into a pandas DataFrame.

    Args:
        bq_client: BigQuery client used to submit the query.
        table: Fully qualified table name (``project.dataset.table``).

    Returns:
        The DataFrame returned by the query job's ``to_dataframe()``.
    """
    return bq_client.query("SELECT * FROM `{}`".format(table)).to_dataframe()


# Complementary-recommendation rows; later code reads the `anchor` and `ivm` columns.
compl_rec_df = _load_table(client, "gcp-ushi-digital-ds-qa.new_hansi_dataset.comp_rec_ClicksData_2core")
# Similar-recommendation rows; later code reads the `anchor` and `ivm` columns.
sim_rec_df = _load_table(client, "gcp-ushi-digital-ds-qa.new_hansi_dataset.hansi_rec_ClicksData_5core")
# Search rows; later code reads the `query` and `ivm` columns.
search_df = _load_table(client, "gcp-ushi-digital-ds-qa.new_hansi_dataset.search_ClicksData_1year_5core")
# Product catalogue; later code reads the `product_id` column.
product_df = _load_table(client, "gcp-ushi-digital-ds-qa.hansi_dataset.all_products_info")
print("product_df = {:,}".format(len(product_df)))

# Id sets used to check that every recommended item exists in the product catalogue.
all_products = set(product_df["product_id"])
anchors = set(compl_rec_df["anchor"])
compl_ivms = set(compl_rec_df["ivm"])
all_compl_ivms = anchors | compl_ivms

print("================================ For anchor_to_compl_ivms: ===================================")
print("number of unique product = {:,}, anchors = {:,}, complementary_compl_ivms = {:,}".format(
    len(all_products), len(anchors), len(compl_ivms)))
# Every anchor and every complementary item must be a known product.
known_anchor_count = len(all_products & anchors)
known_compl_count = len(all_products & compl_ivms)
assert known_anchor_count == len(anchors) and known_compl_count == len(compl_ivms), (
    known_anchor_count, len(anchors), known_compl_count, len(compl_ivms)
)

# Same bookkeeping for the similar-item table, then the union of both item sets.
all_sim_ivms = set(sim_rec_df["anchor"]) | set(sim_rec_df["ivm"])
sim_compl_overlap = all_compl_ivms & all_sim_ivms
print("================================ After updating anchor_to_similar_ivms: ===================================")
print("all_compl_ivms = {:,}, all_sim_ivms = {:,}".format(len(all_compl_ivms), len(all_sim_ivms)))
print("sim_compl_intersect = {:,} ({:.3f})".format(len(sim_compl_overlap), len(sim_compl_overlap) / len(all_compl_ivms)))
print("all_ivms = {:,}".format(len(all_compl_ivms | all_sim_ivms)))
all_ivms = all_compl_ivms | all_sim_ivms

known_ivm_count = len(all_products & all_ivms)
assert known_ivm_count == len(all_ivms), (known_ivm_count, len(all_ivms))

# Search lookups in both directions: query -> clicked items and item -> queries.
query_to_ivms = search_df.groupby("query")["ivm"].apply(list)
ivm_to_queries = search_df.groupby("ivm")["query"].apply(list)
# Despite its name this holds, per item, the number of queries grouped under it.
query_lengths = np.array(list(map(len, ivm_to_queries.values)))
all_queries = set(search_df["query"])
print("all queries = {:,}".format(len(all_queries)))
assert len(all_queries) == len(query_to_ivms), len(query_to_ivms)
print("total ivms (queries) = {:,}, length >=3 = {:,}, length >= 5 = {:,}".format(
    len(query_lengths), (query_lengths >= 3).sum(), (query_lengths >= 5).sum()))

# Complementary items grouped per anchor, plus the distribution of list lengths.
anchor_to_compl_ivms = compl_rec_df.groupby("anchor")["ivm"].apply(list)
compl_ivms_length = np.array(list(map(len, anchor_to_compl_ivms.values)))
print("================================ For anchor_to_compl_ivms: ===================================")
print("total_compl_ivms = {:,}, length >=3 = {:,}, length >= 5 = {:,}".format(
    len(compl_ivms_length), (compl_ivms_length >= 3).sum(), (compl_ivms_length >= 5).sum()))

# Similar items grouped per anchor.
anchor_to_sim_ivms = sim_rec_df.groupby("anchor")["ivm"].apply(list)


Client creating using default project: gcp-ushi-digital-ds-qa
product_df = 2,260,878
number of unique product = 2,260,878, anchors = 86,870, complementary_compl_ivms = 65,561
all_compl_ivms = 109,758, all_sim_ivms = 256,765
sim_compl_intersect = 87,425 (0.797)
all_ivms = 279,098
all queries = 953,773
total ivms (queries) = 360,744, length >=3 = 196,481, length >= 5 = 142,527
total_compl_ivms = 86,870, length >=3 = 35,837, length >= 5 = 22,121


In [2]:
import pickle

in_dir = "/home/jupyter/jointly_rec_and_search/datasets/unified_kgc/"


def _read_pickle(filename):
    """Unpickle `filename` from `in_dir` and return the stored object."""
    # NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
    with open(os.path.join(in_dir, filename), "rb") as fin:
        return pickle.load(fin)


ivm_to_pid = _read_pickle("ivm_to_pid.pkl")
query_to_qid = _read_pickle("query_to_qid.pkl")


def _as_pids(ivms):
    """Translate a list of item identifiers into their integer pids."""
    return [ivm_to_pid[ivm] for ivm in ivms]


def _as_qids(queries):
    """Translate a list of query strings into their integer qids."""
    return [query_to_qid[query] for query in queries]


# Re-key every lookup from raw identifiers to the pid/qid id space.
pid_to_qids = {ivm_to_pid[ivm]: _as_qids(queries) for ivm, queries in ivm_to_queries.items()}
qid_to_pids = {query_to_qid[query]: _as_pids(ivms) for query, ivms in query_to_ivms.items()}
aid_to_complpids = {ivm_to_pid[anchor]: _as_pids(products) for anchor, products in anchor_to_compl_ivms.items()}
aid_to_simpids = {ivm_to_pid[anchor]: _as_pids(products) for anchor, products in anchor_to_sim_ivms.items()}

In [3]:
# Build the train/val/test splits and the training graph

import random 
from tqdm import tqdm
import networkx as nx
from collections import defaultdict
random.seed(4680)

def _split_anchors(aid_to_pids, holdout_frac=0.2):
    """Randomly partition an anchor -> items mapping into train/val/test dicts.

    A fraction `holdout_frac` of the anchors is drawn with `random.sample`;
    the first half of the draw becomes validation, the second half test, and
    all remaining anchors go to training.

    Args:
        aid_to_pids: dict mapping an anchor id to its list of item ids.
        holdout_frac: fraction of anchors held out for validation + test.

    Returns:
        Tuple `(train, val, test)` of dicts with the same value objects.
    """
    # list(...): random.sample requires a sequence; passing a dict keys view
    # is deprecated since Python 3.9 and raises TypeError from 3.11 on.
    holdout_aids = random.sample(list(aid_to_pids.keys()), int(holdout_frac * len(aid_to_pids)))
    half = int(0.5 * len(holdout_aids))
    # Sets give O(1) membership tests; testing against the lists made this
    # loop quadratic in the number of anchors.
    val_aids = set(holdout_aids[:half])
    test_aids = set(holdout_aids[half:])

    train, val, test = {}, {}, {}
    for aid, pids in tqdm(aid_to_pids.items(), total=len(aid_to_pids)):
        if aid in val_aids:
            val[aid] = pids
        elif aid in test_aids:
            test[aid] = pids
        else:
            train[aid] = pids
    return train, val, test


# for similar items
train_aid_to_simpids, val_aid_to_simpids, test_aid_to_simpids = _split_anchors(aid_to_simpids)

# for complementary items (drawn after the similar-item split, so RNG call order is unchanged)
train_aid_to_complpids, val_aid_to_complpids, test_aid_to_complpids = _split_anchors(aid_to_complpids)

# for queries: shuffle the (qid, pids) pairs, then cut 80% / 10% / 10%
qid_pids_pairs = list(qid_to_pids.items())
random.shuffle(qid_pids_pairs)
train_end = int(0.8*len(qid_pids_pairs))
val_end = int(0.9*len(qid_pids_pairs))
train_qid_to_pids = dict(qid_pids_pairs[:train_end])
val_qid_to_pids = dict(qid_pids_pairs[train_end:val_end])
test_qid_to_pids = dict(qid_pids_pairs[val_end:])


print("number of aid_to_simpids  train = {:,}, val = {:,}, test = {:,}".format(len(train_aid_to_simpids), 
                                                                              len(val_aid_to_simpids), len(test_aid_to_simpids)))
print("number of aid_to_complpids train = {:,}, val = {:,}, test = {:,}".format(len(train_aid_to_complpids), 
                                                                              len(val_aid_to_complpids), len(test_aid_to_complpids)))
print("number of qid_to_pids train = {:,}, val = {:,}, test = {:,}".format(len(train_qid_to_pids), 
                                                                              len(val_qid_to_pids), len(test_qid_to_pids)))


def _assert_disjoint_splits(name, train, val, test):
    """Fail if any key appears in more than one of the three split dicts.

    The three-way intersection `train & val & test` is empty even when two
    splits share keys, so each pair is checked separately.
    """
    train_keys, val_keys, test_keys = set(train), set(val), set(test)
    assert train_keys.isdisjoint(val_keys), "{}: train/val overlap".format(name)
    assert train_keys.isdisjoint(test_keys), "{}: train/test overlap".format(name)
    assert val_keys.isdisjoint(test_keys), "{}: val/test overlap".format(name)


_assert_disjoint_splits("aid_to_simpids", train_aid_to_simpids, val_aid_to_simpids, test_aid_to_simpids)
_assert_disjoint_splits("aid_to_complpids", train_aid_to_complpids, val_aid_to_complpids, test_aid_to_complpids)
_assert_disjoint_splits("qid_to_pids", train_qid_to_pids, val_qid_to_pids, test_qid_to_pids)

# Edge labels stored under the "type" attribute of every edge.
SIM_RELATION = "is_similar_to"
COMPL_RELATION = "is_complementary_to"
REL_RELATION = "is_relevant_to"

# Training graph: a directed multigraph, so one node pair may carry several relations.
G = nx.MultiDiGraph()

# anchor -> similar item
G.add_edges_from(
    (aid, sim_pid, {"type": SIM_RELATION})
    for aid, sim_pids in train_aid_to_simpids.items()
    for sim_pid in sim_pids
)

# anchor -> complementary item
G.add_edges_from(
    (aid, compl_pid, {"type": COMPL_RELATION})
    for aid, compl_pids in train_aid_to_complpids.items()
    for compl_pid in compl_pids
)

# item -> query (note the direction: the item is the head of the edge)
G.add_edges_from(
    (pid, qid, {"type": REL_RELATION})
    for qid, pids in train_qid_to_pids.items()
    for pid in pids
)

# Collect node pairs connected by two parallel edges; more than two is unexpected.
multi_edge_pairs = []
for node, neighbours in tqdm(G.adj.items(), total=G.number_of_nodes()):
    for neighbour, parallel_edges in neighbours.items():
        parallel_count = len(parallel_edges)
        assert parallel_count in (1, 2)
        if parallel_count == 2:
            multi_edge_pairs.append((node, neighbour))

edge_total = G.number_of_edges()
print("number of edges = {:,}, number of multi-attr edges = {:,}, ({:.3f})".format(
    edge_total, len(multi_edge_pairs), len(multi_edge_pairs)/edge_total))

100%|██████████| 216238/216238 [05:42<00:00, 631.42it/s]
100%|██████████| 86870/86870 [00:54<00:00, 1586.94it/s]


number of aid_to_simpids  train = 172,991, val = 21,623, test = 21,624
number of aid_to_complpids train = 69,496, val = 8,687, test = 8,687
number of qid_to_pids train = 763,018, val = 95,377, test = 95,378


100%|██████████| 1160206/1160206 [00:07<00:00, 149296.78it/s]


number of edges = 4,301,194, number of multi-attr edges = 12,074, (0.003)


In [4]:
def create_triples(hid, pos_tid, miss_hids, duplicate_pairs, eid_to_text, sampler=None, num_entities=2_000_000):
    """Build one (head, positive tail, negative tail) training triple.

    Args:
        hid: id of the head entity.
        pos_tid: id of the positive tail entity.
        miss_hids: list mutated in place; `hid` is appended when no negative
            can be drawn for it from `sampler`.
        duplicate_pairs: list mutated in place; `(hid, pos_tid)` is appended
            when both entities have identical text.
        eid_to_text: dict mapping entity id to its text.
        sampler: optional dict mapping a head id to candidate negative tail
            ids. When None, negatives are drawn uniformly from
            `range(num_entities)`.
        num_entities: size of the id range used when `sampler` is None.

    Returns:
        `(hid, pos_tid, neg_tid)` with `neg_tid != pos_tid`, or the sentinel
        `0` when the pair is skipped.
    """
    if sampler is not None:
        assert isinstance(sampler, dict), type(sampler)
        if hid not in sampler:
            miss_hids.append(hid)
            return 0
    # Skip pairs whose head and tail have identical text.
    if eid_to_text[hid] == eid_to_text[pos_tid]:
        duplicate_pairs.append((hid, pos_tid))
        return 0

    if sampler is not None:
        candidates = sampler[hid]
        # Without a candidate different from pos_tid the rejection loop below
        # would never terminate (or random.sample would raise on an empty
        # list); treat the head as having no usable negatives.
        if all(tid == pos_tid for tid in candidates):
            miss_hids.append(hid)
            return 0
    else:
        candidates = range(num_entities)

    # Rejection sampling: redraw until the negative differs from the positive.
    neg_tid = random.sample(candidates, k=1)[0]
    while neg_tid == pos_tid:
        neg_tid = random.sample(candidates, k=1)[0]

    return (hid, pos_tid, neg_tid)


# Entity id -> text, read from a two-column tab-separated file.
with open(os.path.join(in_dir, "all_entities.tsv")) as fin:
    entity_rows = (line.strip().split("\t") for line in fin)
    eid_to_text = {int(eid): text for eid, text in entity_rows}

# BM25 run file: space-separated, one retrieved candidate per row.
run_path = os.path.join(in_dir, "runs/bm25.all.run")
df = pd.read_csv(run_path, sep=" ", names=["hid", "q0", "tid", "rank", "score", "model_name"])
bm25_hid_to_tids = {}
ignore_hids = set()
for raw_hid, rows in df.groupby("hid"):
    hid = int(raw_hid)
    tid_values = rows["tid"].values
    # Heads with fewer than 10 retrieved candidates are not used as negative samplers.
    if len(tid_values) >= 10:
        bm25_hid_to_tids[hid] = [int(tid) for tid in tid_values]
    else:
        ignore_hids.add(hid)

print("number of ignore hids = {}".format(len(ignore_hids)))

number of ignore hids = 6644


In [5]:
# For every head node of the training graph keep at most five randomly chosen
# tails per relation type.
max5_h2sp = {}  # head -> up to 5 similar items
max5_h2cp = {}  # head -> up to 5 complementary items
max5_h2q = {}   # head (item) -> up to 5 queries it is relevant to

for head_node, nbrs_dict in tqdm(G.adj.items(), total=G.number_of_nodes()):
    sim_pids = []
    compl_pids = []
    rel_qids = []
    for tail_node, edge_attrs in nbrs_dict.items():
        assert len(edge_attrs) == 1 or len(edge_attrs) == 2
        for edge_attr in edge_attrs.values():
            rel = edge_attr["type"]
            # Compare with ==: `rel in SIM_RELATION` is a substring test on a
            # string and would also match any fragment of the relation name.
            if rel == SIM_RELATION:
                sim_pids.append(tail_node)
            elif rel == COMPL_RELATION:
                compl_pids.append(tail_node)
            elif rel == REL_RELATION:
                rel_qids.append(tail_node)
            else:
                raise AssertionError(rel)
    # random.sample(x, k=len(x)) is a full shuffle; it is kept (rather than
    # sampling 5 directly) so the seeded random stream is unchanged.
    if len(sim_pids) != 0:
        max5_h2sp[head_node] = random.sample(sim_pids, k=len(sim_pids))[:5]
    if len(compl_pids) != 0:
        max5_h2cp[head_node] = random.sample(compl_pids, k=len(compl_pids))[:5]
    if len(rel_qids) != 0:
        max5_h2q[head_node] = random.sample(rel_qids, k=len(rel_qids))[:5]
        
# Shared accumulators: create_triples appends to them, so the printed counts are cumulative.
miss_hids = []
duplicate_pairs = []


def _collect_triples(head_pos_pairs, sampler=None):
    """Run create_triples over (head, positive) pairs and keep the non-skipped results."""
    kept = []
    for head, positive in head_pos_pairs:
        triple = create_triples(head, positive, miss_hids, duplicate_pairs, eid_to_text, sampler=sampler)
        if triple != 0:
            kept.append(triple)
    return kept


def _report_skips():
    """Print the running totals of skipped heads and duplicate-text pairs."""
    print("miss_hids = {:,}, duplicate_pairs = {:,}".format(len(miss_hids), len(duplicate_pairs)))
    print("="*75)


# anchor -> similar item: negatives drawn from the whole id range (no sampler)
h2sp_triples = _collect_triples(
    (hid, pos_tid) for hid, tail_ids in max5_h2sp.items() for pos_tid in tail_ids
)
_report_skips()

# anchor -> complementary item: negatives drawn from the anchor's BM25 candidates
h2cp_triples = _collect_triples(
    ((hid, pos_tid) for hid, tail_ids in max5_h2cp.items() for pos_tid in tail_ids),
    sampler=bm25_hid_to_tids,
)
_report_skips()

# query -> item: max5_h2q is keyed by item, so the query becomes the head here
q2h_triples = _collect_triples(
    ((hid, pos_tid) for pos_tid, head_ids in max5_h2q.items() for hid in head_ids),
    sampler=bm25_hid_to_tids,
)
_report_skips()

100%|██████████| 1160206/1160206 [00:21<00:00, 54410.64it/s] 


miss_hids = 0, duplicate_pairs = 26,772
miss_hids = 0, duplicate_pairs = 27,103
miss_hids = 34,511, duplicate_pairs = 27,103


In [29]:
# Query-oriented sampling: per training query keep at most 5 (resp. 2) clicked items.
max5_q2p, max2_q2p = {}, {}
for qid, pids in train_qid_to_pids.items():
    # Two independent shuffles per query (max2 is not a subset of max5);
    # the draw order max5-then-max2 is kept so seeded runs stay identical.
    max5_q2p[qid] = random.sample(pids, k=len(pids))[:5]
    max2_q2p[qid] = random.sample(pids, k=len(pids))[:2]


def _bm25_triples_for(q2p):
    """Create triples for every (query, item) pair of `q2p` with BM25 negatives."""
    produced = []
    for query_id, positives in q2p.items():
        for positive in positives:
            triple = create_triples(query_id, positive, miss_hids, duplicate_pairs, eid_to_text,
                                    sampler=bm25_hid_to_tids)
            if triple != 0:
                produced.append(triple)
    return produced


max5_q2p_triples = _bm25_triples_for(max5_q2p)
print("miss_hids = {:,}, duplicate_pairs = {:,}".format(len(miss_hids), len(duplicate_pairs)))
print("="*75)

max2_q2p_triples = _bm25_triples_for(max2_q2p)
print("miss_hids = {:,}, duplicate_pairs = {:,}".format(len(miss_hids), len(duplicate_pairs)))
print("="*75)

miss_hids = 187,065, duplicate_pairs = 27,103
miss_hids = 249,763, duplicate_pairs = 27,103


In [6]:
import pickle

# Output directory for the training graph and the training triples.
out_dir = os.path.join(in_dir, "unified_train/")
# exist_ok avoids the check-then-create race and makes the cell safely re-runnable.
os.makedirs(out_dir, exist_ok=True)

# Persist the training graph itself.
with open(os.path.join(out_dir, "train_graph.pkl"), "wb") as fout:
    pickle.dump(G, fout)

# file name -> (triples, relation label written in the 4th column).
# The random.sample calls run in dict-literal order when this dict is built.
fn_to_tripleNrel = {
    "a2sp.train.tsv": (h2sp_triples, SIM_RELATION),
    "a2cp.train.tsv": (h2cp_triples, COMPL_RELATION),
    "q2a.train.tsv": (q2h_triples, REL_RELATION),
    
    # random 50% / 17% subsets of the query triples
    "q2a.50.train.tsv": (random.sample(q2h_triples, k=int(0.5*len(q2h_triples))), REL_RELATION),
    "q2a.17.train.tsv": (random.sample(q2h_triples, k=int(0.17*len(q2h_triples))), REL_RELATION),
    
    # random 50% subset of the similar-item triples
    "a2sp.50.train.tsv": (random.sample(h2sp_triples, k=int(0.5*len(h2sp_triples))), SIM_RELATION)
}

# One line per triple: head, positive tail, negative tail, relation (tab-separated).
for fn, (triples, relation) in fn_to_tripleNrel.items():
    with open(os.path.join(out_dir, fn), "w") as fout:
        for (hid, pos_tid, neg_tid) in triples:
            fout.write(f"{hid}\t{pos_tid}\t{neg_tid}\t{relation}\n")
            
# Output directory for the anchor/query lists and their relevance judgements.
out_dir = os.path.join(in_dir, "unified_test/")
os.makedirs(out_dir, exist_ok=True)

# Upper bound on the size of the "small" test subsets.
SMALL_TEST_SIZE = 10000


def _write_entities(fn, eids, relation):
    """Write `<eid> TAB <text> TAB <relation>` lines for `eids` to `out_dir/fn`."""
    with open(os.path.join(out_dir, fn), "w") as fout:
        for eid in eids:
            text = eid_to_text[eid]
            fout.write(f"{eid}\t{text}\t{relation}\n")


def _write_rels(fn, head_to_tails):
    """Write one `<head> TAB Q0 TAB <tail> TAB 1` line per (head, tail) pair to `out_dir/fn`."""
    with open(os.path.join(out_dir, fn), "w") as fout:
        for head, tails in head_to_tails.items():
            for tail in tails:
                fout.write(f"{head}\tQ0\t{tail}\t1\n")


def _small_subset(ids):
    """Random subset of at most SMALL_TEST_SIZE ids.

    Capping k at len(ids) avoids the ValueError random.sample raises when the
    split holds fewer than SMALL_TEST_SIZE ids.
    """
    return random.sample(ids, k=min(SMALL_TEST_SIZE, len(ids)))


# for similar items
sim_splits = {"train": train_aid_to_simpids, "val": val_aid_to_simpids, "test": test_aid_to_simpids}
# Drawn before the query subset below, matching the original random call order.
small_sim_test_aids = _small_subset(list(test_aid_to_simpids.keys()))
for split, aid_to_pids in sim_splits.items():
    _write_entities(f"anchors.{split}.sim.tsv", list(aid_to_pids.keys()), SIM_RELATION)
_write_entities("anchors.test.sim.small.tsv", small_sim_test_aids, SIM_RELATION)
for split, aid_to_pids in sim_splits.items():
    _write_rels(f"arels.{split}.sim.tsv", aid_to_pids)

# for complementary items
compl_splits = {"train": train_aid_to_complpids, "val": val_aid_to_complpids, "test": test_aid_to_complpids}
for split, aid_to_pids in compl_splits.items():
    _write_entities(f"anchors.{split}.compl.tsv", list(aid_to_pids.keys()), COMPL_RELATION)
for split, aid_to_pids in compl_splits.items():
    _write_rels(f"arels.{split}.compl.tsv", aid_to_pids)

# for queries
query_splits = {"train": train_qid_to_pids, "val": val_qid_to_pids, "test": test_qid_to_pids}
small_test_qids = _small_subset(list(test_qid_to_pids.keys()))
for split, qid_to_split_pids in query_splits.items():
    _write_entities(f"queries.{split}.tsv", list(qid_to_split_pids.keys()), REL_RELATION)
_write_entities("queries.test.small.tsv", small_test_qids, REL_RELATION)
for split, qid_to_split_pids in query_splits.items():
    _write_rels(f"qrels.{split}.tsv", qid_to_split_pids)

In [30]:
# Write the query-oriented triple sets next to the other training files.
out_dir = os.path.join(in_dir, "unified_train/")
extra_fn_to_tripleNrel = {
    "max2_qorient_q2p.train.tsv": (max2_q2p_triples, REL_RELATION),
    "max5_qorient_q2p.train.tsv": (max5_q2p_triples, REL_RELATION),
}
for fn, (triples, relation) in extra_fn_to_tripleNrel.items():
    target_path = os.path.join(out_dir, fn)
    with open(target_path, "w") as fout:
        # One tab-separated line per triple: head, positive, negative, relation.
        fout.writelines(
            f"{hid}\t{pos_tid}\t{neg_tid}\t{relation}\n"
            for (hid, pos_tid, neg_tid) in triples
        )

In [32]:
def _unique_heads_and_positives(triples):
    """Return (#distinct first elements, #distinct second elements) of the triples."""
    heads = {head for (head, _, _) in triples}
    positives = {positive for (_, positive, _) in triples}
    return len(heads), len(positives)


# Coverage of queries and items under each sampling strategy.
print("query oriented sampling max2, unique queries = {:,}, unique items = {:,}".format(
    *_unique_heads_and_positives(max2_q2p_triples)))
print("query oriented sampling max5, unique queries = {:,}, unique items = {:,}".format(
    *_unique_heads_and_positives(max5_q2p_triples)))
print("item oriented sampling, unique queries = {:,}, unique items = {:,}".format(
    *_unique_heads_and_positives(q2h_triples)))

query oriented sampling max2, unique queries = 716,991, unique items = 176,811
query oriented sampling max5, unique queries = 716,991, unique items = 230,181
item oriented sampling, unique queries = 289,415, unique items = 330,387


In [33]:
# sanity check
out_dir = os.path.join(in_dir, "unified_train/")
for path in os.listdir(out_dir):
    path = os.path.join(out_dir, path)
    ! wc -l $path
    ! head -n 3 $path
    ! tail -n 3 $path
    print("="*100)



240311 /home/jupyter/jointly_rec_and_search/datasets/unified_kgc/unified_train/a2sp.50.train.tsv
2089647	2113209	1591240	is_similar_to
1377891	2242286	286924	is_similar_to
1985999	860476	489794	is_similar_to
1505255	1899151	1616734	is_similar_to
389480	1679776	1083986	is_similar_to
1539323	1293969	1628774	is_similar_to
1749158 /home/jupyter/jointly_rec_and_search/datasets/unified_kgc/unified_train/max5_qorient_q2p.train.tsv
3006068	1517384	1271680	is_relevant_to
3006068	785329	1252745	is_relevant_to
2608168	1732982	1878737	is_relevant_to
2381059	789716	2257556	is_relevant_to
3211172	1326017	287868	is_relevant_to
3211172	412940	50449	is_relevant_to
471087 /home/jupyter/jointly_rec_and_search/datasets/unified_kgc/unified_train/train_graph.pkl
�cnetworkx.classes.multidigraph
MultiDiGraph
q )�q}q(X   edge_key_dict_factoryqcbuiltins
 jo�- J.� j��- uJ��" }r0� J� jGI sJڦ0 }r0� (J�|  j�� JU� jV�M J�� jt� uJ�2* }r0� Jr jr� sJ�?, }r 0� (JU& j�v1 J( j�w1 uJ��* }r!0� Jy j��

In [34]:
# Spot check of one hand-picked (head, positive, negative) id triple: print the
# all_entities.tsv line of each id so the texts can be compared by eye.
hid, pos_tid, neg_tid = (2381059,2117076,437646)

# IPython expands {name} inside shell lines; "^<id>\t" anchors the match to the id column.
! grep -P "^{hid}\t" "/home/jupyter/jointly_rec_and_search/datasets/unified_kgc/all_entities.tsv"
! grep -P "^{pos_tid}\t" "/home/jupyter/jointly_rec_and_search/datasets/unified_kgc/all_entities.tsv"
! grep -P "^{neg_tid}\t" "/home/jupyter/jointly_rec_and_search/datasets/unified_kgc/all_entities.tsv"


2381059	goal zero solar generator
2117076	Goal Zero Yeti 500X 500-Watt Hour Portable Solar Generator ; Portable Solar Generators
437646	NATURE'S GENERATOR Elite Series 1200-Watt Hour Portable Solar Generator ; Portable Solar Generators


In [9]:
out_dir = os.path.join(in_dir, "unified_test/")
for path in os.listdir(out_dir):
    path = os.path.join(out_dir, path)
    ! wc -l $path
    ! head -n 3 $path
    ! tail -n 3 $path
    print("="*100)

8687 /home/jupyter/jointly_rec_and_search/datasets/unified_kgc/unified_test/anchors.val.compl.tsv
1330281	Bruce America's Best Choice Gunstock Oak 2-1/4-in Wide x 3/4-in Thick Smooth/Traditional Solid Hardwood Flooring (20-sq ft) ; Hardwood Flooring	is_complementary_to
776084	CARLON Gray Weatherproof Pvc Junction Box ; Junction Boxes	is_complementary_to
1199209	Kobalt 3000 Series 45-in W x 47-in H 7-Drawer Steel Rolling Tool Cabinet (Black) ; Bottom Tool Cabinets	is_complementary_to
891090	Gorilla Playsets Fort Highlander Residential Wood Playset ; Wood Playsets & Swing Sets	is_complementary_to
45217	Royal Mouldings Limited 2-in x 12-ft Finished PVC Crown Moulding ; Crown Moulding	is_complementary_to
2017169	ToughRock 5/8-in 4-ft x 8-ft Fireguard x Regular Drywall Panel ; Drywall Panels	is_complementary_to
95453 /home/jupyter/jointly_rec_and_search/datasets/unified_kgc/unified_test/arels.val.sim.tsv
554796	Q0	1363017	1
554796	Q0	1902026	1
554796	Q0	624049	1
2208349	Q0	794350	1
2208349	

In [10]:
# Number of head nodes with at least one relevant-query edge / at least one similar-item edge.
len(max5_h2q), len(max5_h2sp)

(336188, 172991)