In [1]:
import os

from google.cloud import bigquery
import pandas as pd 
import numpy as np

client = bigquery.Client()
print("Client creating using default project: {}".format(client.project))

query = """
    SELECT * 
    FROM `gcp-ushi-digital-ds-qa.new_hansi_dataset.comp_rec_ClicksData_2core`;
    """
query_job = client.query(query)
compl_rec_df = query_job.to_dataframe()

query = """
    SELECT * 
    FROM `gcp-ushi-digital-ds-qa.new_hansi_dataset.hansi_rec_ClicksData_5core`;
    """
query_job = client.query(query)
sim_rec_df = query_job.to_dataframe()

query = """
    SELECT *
    FROM `gcp-ushi-digital-ds-qa.new_hansi_dataset.search_ClicksData_1year_5core`;
"""
query_job = client.query(query)
search_df = query_job.to_dataframe()

query = """
    SELECT * 
    FROM `gcp-ushi-digital-ds-qa.hansi_dataset.all_products_info`;
    """
query_job = client.query(query)
product_df = query_job.to_dataframe()
print("product_df = {:,}".format(len(product_df)))

all_products = set(product_df.product_id)
anchors = set(compl_rec_df.anchor)
compl_ivms = set(compl_rec_df.ivm)
all_compl_ivms = anchors.union(compl_ivms)

print("================================ For anchor_to_compl_ivms: ===================================")
print("number of unique product = {}, anchors = {:,}, complementary_compl_ivms = {:,}".format(len(all_products), len(anchors), len(compl_ivms)))
assert len(all_products & anchors) == len(anchors) and len(all_products & compl_ivms) == len(compl_ivms),(
    len(all_products & anchors), len(anchors), len(all_products & compl_ivms), len(compl_ivms)
)

all_sim_ivms = set(sim_rec_df.anchor).union(set(sim_rec_df.ivm))
print("================================ After updating anchor_to_similar_ivms: ===================================")
print("all_compl_ivms = {:,}, all_sim_ivms = {:,}".format(len(all_compl_ivms), len(all_sim_ivms)))
print("sim_compl_intersect = {:,} ({:.3f})".format(len(all_compl_ivms & all_sim_ivms), len(all_compl_ivms & all_sim_ivms) / len(all_compl_ivms)))
print("all_ivms = {:,}".format(len(all_compl_ivms | all_sim_ivms)))
all_ivms = all_compl_ivms | all_sim_ivms

assert len(all_products & all_ivms) == len(all_ivms), (len(all_products & all_ivms), len(all_ivms))

Client creating using default project: gcp-ushi-digital-ds-qa
product_df = 2,260,878
number of unique product = 2260878, anchors = 86,870, complementary_compl_ivms = 65,561
all_compl_ivms = 109,758, all_sim_ivms = 256,765
sim_compl_intersect = 87,425 (0.797)
all_ivms = 279,098


In [2]:
# statistics
ivm_to_queries = search_df.groupby("ivm")["query"].apply(list)
query_lengths = np.array([len(x) for x in ivm_to_queries.values])
all_queries = set(search_df["query"])
print("all queries = {}".format(len(all_queries)))
print("total ivms (queries) = {:,}, length >=3 = {:,}, length >= 5 = {:,}".format(
    len(query_lengths), np.sum(query_lengths >=3), np.sum(query_lengths >= 5) ))

anchor_to_compl_ivms = compl_rec_df.groupby("anchor")["ivm"].apply(list)
compl_ivms_length = np.array([len(x) for x in anchor_to_compl_ivms.values])
print("================================ For anchor_to_compl_ivms: ===================================")
print("total_compl_ivms = {:,}, length >=3 = {:,}, length >= 5 = {:,}".format(len(compl_ivms_length), np.sum(compl_ivms_length >=3), np.sum(compl_ivms_length >= 5) ))

anchor_to_sim_ivms = sim_rec_df.groupby("anchor")["ivm"].apply(list)


all queries = 953773
total ivms (queries) = 360,744, length >=3 = 196,481, length >= 5 = 142,527
total_compl_ivms = 86,870, length >=3 = 35,837, length >= 5 = 22,121


In [3]:
# map product --> text

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print("sep token = {}".format(tokenizer.sep_token))
from tqdm import tqdm 

ivm_to_title = {}
ivm_to_bullet = {}
ivm_to_catalog = {}
no_bulletin_ivms = set()
no_title_ivms = set()
no_catalog_ivms = set()

def preprocess_text(in_text):
    in_text = in_text.replace("\t", " ")
    in_text = in_text.replace("\n", " ")
    return in_text

for idx, row in tqdm(product_df.iterrows(), total=len(product_df)):
    product_id = row.product_id
    title = row.product_name if row.product_name != None else "No title"
    bullets = row.bullets if row.bullets != None else "No bullets"
    catalog = row.catalog_name if row.catalog_name != None else "No catalog"
    
    if row.product_name == None:
        no_title_ivms.add(product_id)
    if row.bullets == None:
        no_bulletin_ivms.add(product_id)
    if row.catalog_name == None:
        no_catalog_ivms.add(product_id)
    
    title = preprocess_text(title)
    bullets = preprocess_text(bullets)
    catalog = preprocess_text(catalog)
    
    ivm_to_title[product_id] = title
    ivm_to_bullet[product_id] = bullets
    ivm_to_catalog[product_id] = catalog

# sanity check
print("ivm_to_title = {:,}, ivm_to_bullet = {:,}, ivm_to_catalog = {:,}, products no bulletin = {:,}, no title = {:,}, no catalog = {:,}".format(
    len(ivm_to_title), len(ivm_to_bullet), len(ivm_to_catalog), len(no_bulletin_ivms), len(no_title_ivms), len(no_catalog_ivms)
))

assert len(ivm_to_title) == len(ivm_to_bullet) == len(ivm_to_catalog) == len(product_df)

sep token = [SEP]


100%|██████████| 2260878/2260878 [03:45<00:00, 10027.42it/s]

ivm_to_title = 2,260,878, ivm_to_bullet = 2,260,878, ivm_to_catalog = 2,260,878, products no bulletin = 0, no title = 21, no catalog = 4,519





In [4]:
# anchor_to_compl_ivms
# [{anchor_A: {sim_ivms: {}, queries: {}}, compl_ivms: {sim_ivm: {}, query: {}}}, ..., {...}]
import ujson

from tqdm import tqdm

# map to pid and qid
ivm_to_pid = {ivm: pid for pid, ivm in enumerate(list(all_products))}
pid_to_ivm = {pid: ivm for ivm, pid in ivm_to_pid.items()}
query_to_qid = {query: qid for qid, query in enumerate(list(all_queries))}
qid_to_query = {qid: query for query, qid in query_to_qid.items()}

pid_to_title = {ivm_to_pid[ivm]: title for ivm, title in ivm_to_title.items()}
pid_to_bullet = {ivm_to_pid[ivm]: bullet for ivm, bullet in ivm_to_bullet.items()}
pid_to_catalog = {ivm_to_pid[ivm]: catalog for ivm, catalog in ivm_to_catalog.items()}

# start to one-hop examples construction
#anchor_to_sim_ivms = anchor_to_sim_ivms.to_dict()
#anchor_to_compl_ivms = anchor_to_compl_ivms.to_dict()
#ivm_to_queries = ivm_to_queries.to_dict()

aid_to_sim_pids = {ivm_to_pid[anchor]: [ivm_to_pid[ivm] for ivm in sim_ivms] for anchor, sim_ivms in anchor_to_sim_ivms.items()}
aid_to_compl_pids = {ivm_to_pid[anchor]: [ivm_to_pid[ivm] for ivm in compl_ivms] for anchor, compl_ivms in anchor_to_compl_ivms.items()}
pid_to_qids = {ivm_to_pid[anchor]: [query_to_qid[query] for query in _queries] for anchor, _queries in ivm_to_queries.items()}

one_hop_examples = []
for aid, compl_pids in tqdm(aid_to_compl_pids.items(), ):
    example = {aid: {}, "compl_pids": {}}
    if aid in aid_to_sim_pids:
        example[aid]["sim_pids"] = aid_to_sim_pids[aid]
    else:
        example[aid]["sim_pids"] = []
    
    if aid in pid_to_qids:
        example[aid]["qids"] = pid_to_qids[aid]
    else:
        example[aid]["qids"] = []
    
    for compl_pid in compl_pids:
        example["compl_pids"][compl_pid] = {}
        if compl_pid in aid_to_sim_pids:
            example["compl_pids"][compl_pid]["sim_pids"] = aid_to_sim_pids[compl_pid]
        else:
            example["compl_pids"][compl_pid]["sim_pids"] = []

        if compl_pid in pid_to_qids:
            example["compl_pids"][compl_pid]["qids"] = pid_to_qids[compl_pid]
        else:
            example["compl_pids"][compl_pid]["qids"] = []
        
        if compl_pid in aid_to_compl_pids:
            example["compl_pids"][compl_pid]["compl_pids"] = aid_to_compl_pids[compl_pid]
        else:
            example["compl_pids"][compl_pid]["compl_pids"] = []
    
    one_hop_examples.append(example)
    
# statistics
cur_obj = None
# sim_ivms length 
ohop_sim_ivms_lengths = [len(exp[anc]["sim_pids"]) for exp in one_hop_examples for anc in exp.keys() if anc != "compl_pids"] 
cur_obj = ohop_sim_ivms_lengths
print("sim_ivms length: ")
print("min = {}, max = {}, 25% = {:.1f}, 50% = {:.1f}, 75% = {:.1f}".format(min(cur_obj), max(cur_obj), np.quantile(cur_obj, 0.25),
                                                                           np.quantile(cur_obj, 0.55), np.quantile(cur_obj, 0.75)
                                                                           ))
print("="*100)
# number of query per anchor
print("number of query per anchor: ")
ohop_anc_queries_lengths = [len(exp[anc]["qids"]) for exp in one_hop_examples for anc in exp.keys() if anc != "compl_pids"] 
cur_obj = ohop_anc_queries_lengths
print("min = {}, max = {}, 25% = {:.1f}, 50% = {:.1f}, 75% = {:.1f}".format(min(cur_obj), max(cur_obj), np.quantile(cur_obj, 0.25),
                                                                           np.quantile(cur_obj, 0.55), np.quantile(cur_obj, 0.75)
                                                                           ))
print("="*100)
# number of compl_ivms 
print("number of compl_pids: ")
ohop_anc_queries_lengths = []
for exp in one_hop_examples:
    ohop_anc_queries_lengths.append(len(exp["compl_pids"]))
cur_obj = ohop_anc_queries_lengths
print("min = {}, max = {}, 25% = {:.1f}, 50% = {:.1f}, 75% = {:.1f}".format(min(cur_obj), max(cur_obj), np.quantile(cur_obj, 0.25),
                                                                           np.quantile(cur_obj, 0.55), np.quantile(cur_obj, 0.75)
                                                                           ))
print("="*100)
# number of queries per compl_ivm
print("number of queries per compl_pid: ")
ohop_num_queries_per_compl_pids = []
for exp in one_hop_examples:
    for compl_pid in exp["compl_pids"]:
        ohop_num_queries_per_compl_pids.append(len(exp["compl_pids"][compl_pid]["qids"]))
cur_obj = ohop_num_queries_per_compl_pids
print("min = {}, max = {}, 25% = {:.1f}, 50% = {:.1f}, 75% = {:.1f}".format(min(cur_obj), max(cur_obj), np.quantile(cur_obj, 0.25),
                                                                           np.quantile(cur_obj, 0.55), np.quantile(cur_obj, 0.75)
                                                                           ))
print("="*100)
# number of queries (compl_ivm's) per anchor 
print("number of queries (compl_pid's) per anchor : ")
ohop_num_compl_queries_per_anchor = []
for exp in one_hop_examples:
    tmp = []
    for compl_pid in exp["compl_pids"]:
        tmp.append(len(exp["compl_pids"][compl_pid]["qids"]))
    ohop_num_compl_queries_per_anchor.append(sum(tmp))
cur_obj = ohop_num_compl_queries_per_anchor
print("min = {}, max = {}, 25% = {:.1f}, 50% = {:.1f}, 75% = {:.1f}".format(min(cur_obj), max(cur_obj), np.quantile(cur_obj, 0.25),
                                                                           np.quantile(cur_obj, 0.55), np.quantile(cur_obj, 0.75)
                                                                           ))
print("="*100)

print("="*100)
# number of compl_pids per compl_ivm
print("number of compl_pids per compl_pid: ")
ohop_num_queries_per_compl_pids = []
for exp in one_hop_examples:
    for compl_pid in exp["compl_pids"]:
        ohop_num_queries_per_compl_pids.append(len(exp["compl_pids"][compl_pid]["compl_pids"]))
cur_obj = ohop_num_queries_per_compl_pids
print("min = {}, max = {}, 25% = {:.1f}, 50% = {:.1f}, 75% = {:.1f}".format(min(cur_obj), max(cur_obj), np.quantile(cur_obj, 0.25),
                                                                           np.quantile(cur_obj, 0.55), np.quantile(cur_obj, 0.75)
                                                                           ))
print("="*100)
# number of compl_pids (compl_ivm's) per anchor 
print("number of compl_pids (compl_pid's) per anchor : ")
ohop_num_compl_queries_per_anchor = []
for exp in one_hop_examples:
    tmp = []
    for compl_pid in exp["compl_pids"]:
        tmp.append(len(exp["compl_pids"][compl_pid]["compl_pids"]))
    ohop_num_compl_queries_per_anchor.append(sum(tmp))
cur_obj = ohop_num_compl_queries_per_anchor
print("min = {}, max = {}, 25% = {:.1f}, 50% = {:.1f}, 75% = {:.1f}".format(min(cur_obj), max(cur_obj), np.quantile(cur_obj, 0.25),
                                                                           np.quantile(cur_obj, 0.55), np.quantile(cur_obj, 0.75)
                                                                           ))
print("="*100)

100%|██████████| 86870/86870 [00:01<00:00, 54987.47it/s] 


sim_ivms length: 
min = 0, max = 47, 25% = 1.0, 50% = 5.0, 75% = 9.0
number of query per anchor: 
min = 0, max = 1415, 25% = 4.0, 50% = 16.0, 75% = 36.0
number of compl_pids: 
min = 1, max = 74, 25% = 1.0, 50% = 2.0, 75% = 5.0
number of queries per compl_pid: 
min = 0, max = 1415, 25% = 15.0, 50% = 51.0, 75% = 99.0
number of queries (compl_pid's) per anchor : 
min = 0, max = 11420, 25% = 14.0, 50% = 100.0, 75% = 306.0
number of compl_pids per compl_pid: 
min = 0, max = 74, 25% = 3.0, 50% = 11.0, 75% = 17.0
number of compl_pids (compl_pid's) per anchor : 
min = 0, max = 817, 25% = 2.0, 50% = 16.0, 75% = 43.0


In [5]:
# split train, val, test 
import numpy as np 
np.random.seed(4680)

val_test_indices = np.random.choice(np.arange(0, len(one_hop_examples)), int(0.2*len(one_hop_examples)), replace=False)
val_indices = val_test_indices[:int(0.5*len(val_test_indices))]
test_indices = val_test_indices[int(0.5*len(val_test_indices)):]

train_ohop_examples = []
val_ohop_examples = []
test_ohop_examples = []

for idx, example in enumerate(one_hop_examples):
    if idx in val_indices:
        val_ohop_examples.append(example)
    elif idx in test_indices:
        test_ohop_examples.append(example)
    else:
        train_ohop_examples.append(example)
print("train example = {:,}, val example = {:,}, test example = {:,}".format(len(train_ohop_examples), len(val_ohop_examples), len(test_ohop_examples)))
assert len(train_ohop_examples) + len(val_ohop_examples) + len(test_ohop_examples) == len(one_hop_examples)

train example = 69,496, val example = 8,687, test example = 8,687


In [6]:
# write to disk
import os 
import copy

def example_qids_to_queries(example):
    _exp = copy.deepcopy(example)
    new_exp = {}
    for key_1, vals_1 in _exp.items():
        if key_1 != "compl_pids":
            new_exp[key_1] = {}
            for key_2, vals_2 in _exp[key_1].items():
                if key_2 == "qids":
                    vals_2 = [qid_to_query[qid] for qid in vals_2]
                new_exp[key_1][key_2] = vals_2
        else:
            new_exp[key_1] = {}
            for key_2, vals_2 in _exp[key_1].items():
                new_exp[key_1][key_2] = {}
                for key_3, val_3 in vals_2.items():
                    if key_3 == "qids":
                        val_3 = [qid_to_query[qid] for qid in val_3]
                    new_exp[key_1][key_2][key_3] = val_3
    return new_exp

out_dir = "/home/jupyter/jointly_rec_and_search/datasets/rec_search/rec_compl/"
if not os.path.exists(out_dir):
    os.mkdir(out_dir)
    
with open(os.path.join(out_dir, "collection_title.tsv"), "w") as fout:
    for pid, title in pid_to_title.items():
        fout.write(f"{pid}\t{title}\n")
        
with open(os.path.join(out_dir, "collection_bullet.tsv"), "w") as fout:
    for pid, bullet in pid_to_bullet.items():
        fout.write(f"{pid}\t{bullet}\n")
        
with open(os.path.join(out_dir, "collection_catalog.tsv"), "w") as fout:
    for pid, catalog in pid_to_catalog.items():
        fout.write(f"{pid}\t{catalog}\n")
        
with open(os.path.join(out_dir, "product.jsonl"), "w") as fout:
    for aid in pid_to_title:
        text = pid_to_title[aid] + " " + tokenizer.sep_token + " " + pid_to_catalog[aid] + " " + tokenizer.sep_token + " " + pid_to_bullet[aid]
        example = {"id": aid, "contents": text}
        fout.write(ujson.dumps(example) + "\n")
        
with open(os.path.join(out_dir, "one_hop_examples.jsonl"), "w") as fout:
    for example in one_hop_examples:
        fout.write(ujson.dumps(example) + "\n")

with open(os.path.join(out_dir, "one_hop_examples_queries.jsonl"), "w") as fout:
    for example in one_hop_examples:
        fout.write(ujson.dumps(example_qids_to_queries(example)) + "\n")
        
with open(os.path.join(out_dir, "queries.tsv"), "w") as fout:
    for query, qid in query_to_qid.items():
        fout.write(f"{qid}\t{query}\n")
        
with open(os.path.join(out_dir, "anchors_title.train.tsv"), "w") as fout:
    with open(os.path.join(out_dir, "anchors_title_catalog.train.tsv"), "w") as fout2:
        with open(os.path.join(out_dir, "anchors_tcb.train.tsv"), "w") as fout3:
            for example in train_ohop_examples:
                for key in example:
                    if key == "compl_pids":
                        continue
                    aid = key
                    _title = pid_to_title[aid]
                    _tc = _title + " " + tokenizer.sep_token + " " + pid_to_catalog[aid]
                    _tcb = _tc + " " + tokenizer.sep_token + " " + pid_to_bullet[aid]
                    fout.write(f"{aid}\t{_title}\n")
                    fout2.write(f"{aid}\t{_tc}\n")
                    fout3.write(f"{aid}\t{_tcb}\n")
        
with open(os.path.join(out_dir, "anchors_title.val.tsv"), "w") as fout:
    with open(os.path.join(out_dir, "anchors_title_catalog.val.tsv"), "w") as fout2:
        with open(os.path.join(out_dir, "anchors_tcb.val.tsv"), "w") as fout3:
            for example in val_ohop_examples:
                for key in example:
                    if key == "compl_pids":
                        continue
                    aid = key
                    _title = pid_to_title[aid]
                    _tc = _title + " " + tokenizer.sep_token + " " + pid_to_catalog[aid]
                    _tcb = _tc + " " + tokenizer.sep_token + " " + pid_to_bullet[aid]
                    fout.write(f"{aid}\t{_title}\n")
                    fout2.write(f"{aid}\t{_tc}\n")
                    fout3.write(f"{aid}\t{_tcb}\n")
        
with open(os.path.join(out_dir, "anchors_title.test.tsv"), "w") as fout:
    with open(os.path.join(out_dir, "anchors_title_catalog.test.tsv"), "w") as fout2:
        with open(os.path.join(out_dir, "anchors_tcb.test.tsv"), "w") as fout3:
            for example in test_ohop_examples:
                for key in example:
                    if key == "compl_pids":
                        continue
                    aid = key
                    _title = pid_to_title[aid]
                    _tc = _title + " " + tokenizer.sep_token + " " + pid_to_catalog[aid]
                    _tcb = _tc + " " + tokenizer.sep_token + " " + pid_to_bullet[aid]
                    fout.write(f"{aid}\t{_title}\n")
                    fout2.write(f"{aid}\t{_tc}\n")
                    fout3.write(f"{aid}\t{_tcb}\n")
                
with open(os.path.join(out_dir, "arels.compl.train.tsv"), "w") as fout:
    for example in train_ohop_examples:
        for key in example:
            if key == "compl_pids":
                continue
            aid = key 
            compl_pids = example["compl_pids"]
            for compl_pid in compl_pids:
                fout.write(f"{aid}\tQ0\t{compl_pid}\t{1}\n")

with open(os.path.join(out_dir, "arels.compl.val.tsv"), "w") as fout:
    for example in val_ohop_examples:
        for key in example:
            if key == "compl_pids":
                continue
            aid = key 
            compl_pids = example["compl_pids"]
            for compl_pid in compl_pids:
                fout.write(f"{aid}\tQ0\t{compl_pid}\t{1}\n")
                
with open(os.path.join(out_dir, "arels.compl.test.tsv"), "w") as fout:
    for example in test_ohop_examples:
        for key in example:
            if key == "compl_pids":
                continue
            aid = key 
            compl_pids = example["compl_pids"]
            for compl_pid in compl_pids:
                fout.write(f"{aid}\tQ0\t{compl_pid}\t{1}\n")

In [7]:
# sanity check
for path in os.listdir(out_dir):
    path = os.path.join(out_dir, path)
    ! wc -l $path
    ! head -n 3 $path
    ! tail -n 3 $path
    print("="*100)

69496 /home/jupyter/jointly_rec_and_search/datasets/rec_search/rec_compl/anchors_title_catalog.train.tsv
501931	SIMPSON 72-in x 80-in Wood Full Lite Right-Hand Inswing Brown Unfinished Double Front Door Solid Core [SEP] Front Doors
220006	ReliaBilt Continental 24-in x 80-in Primed 2-Panel Round Top Hollow Core Primed Molded Composite Right Hand Inswing Single Prehung Interior Door [SEP] Prehung Interior Doors
1970525	Kraloy Gray PVC Weatherproof New Work/Old Work Standard Enclosure Exterior Electrical Box [SEP] Electrical Boxes
1822620	ToughRock 5/8-in 4-ft x 8-ft Fireguard x Regular Drywall Panel [SEP] Drywall Panels
132127	Jacuzzi 1500-Watt Inline Heater [SEP] Whirlpool Tub & Air Bath Parts
385303	Simply Put 11.1875-in W x 19.375-in H 2-Tier Pull Out Metal Cleaning Caddy [SEP] Cleaning Caddies
2260878 /home/jupyter/jointly_rec_and_search/datasets/rec_search/rec_compl/product.jsonl
{"id":1600705,"contents":"ReliaBilt Madison 32-in x 80-in Cream-n-sugar 1-Panel Hollow Core Prefinished 

In [None]:
# examples for retrieval model
# 1, anchor: {text: (product_name, catalog_name)}
# 2, compl_ivm_1: {text: (product_name)}

In [None]:
#train_ohop_examples[3]

In [None]:
catalogs = product_df.catalog_name.values
print(len(catalogs))
catalogs = catalogs[catalogs != None]
print(len(catalogs))
print(len(np.unique(catalogs)))
print(len(catalogs) / len(np.unique(catalogs)) )

In [None]:
# Q: Is it catalog_name a predictive factor for complementary item. 
from collections import defaultdict
import matplotlib.pyplot as plt

cata_to_complcatalogs = defaultdict(lambda: defaultdict(int))
prod_to_catalog = {}

products = product_df.product_id.values
catalogs = product_df.catalog_name.values
keep_mask = catalogs != None
products = products[keep_mask]
catalogs = catalogs[keep_mask]
print("number of products = {}, catalogs = {}, unique catalogs = {}".format(len(products), len(catalogs), len(np.unique(catalogs))))
prod_to_catalog = dict(zip(products, catalogs))

for anchor, compl_ivms in anchor_to_compl_ivms.items():
    if anchor not in prod_to_catalog:
        continue
    anc_catalog = prod_to_catalog[anchor]
    for compl_ivm in compl_ivms:
        if compl_ivm not in prod_to_catalog:
            continue
        compl_catalog = prod_to_catalog[compl_ivm]
        cata_to_complcatalogs[anc_catalog][compl_catalog] += 1
        
# complementary items length
compl_items_length = [len(val) for val in cata_to_complcatalogs.values()]
bins = np.append(np.arange(1,11), max(compl_items_length))
print(np.histogram(compl_items_length, bins=bins))

# the population percentage of top-k complementary items 
topks = [1,3,5,7,10]
for topk in topks:
    topk_compls_perc = []
    for anc_catalog in cata_to_complcatalogs:
        freqs = np.array(list(cata_to_complcatalogs[anc_catalog].values()))
        if len(freqs) < topk:
            topk_freqs = freqs
        else:
            topk_freqs = freqs[np.argpartition(freqs, -topk)[-topk:]]
        topk_compls_perc.append(np.sum(topk_freqs) / np.sum(freqs))
    bins = np.linspace(0,1,21)
    print("="*50)
    print(f"topk = {topk}", np.histogram(topk_compls_perc, bins=bins)[0])
    
# A: catalog_name is a predictive factor. Since the top-k complementary category accounts for a large poportion for each anchor_catagory

In [None]:
# What is the proportion of (anchor_ivm --> compl_ivm)  reversible ?
anchor_to_ivm = set()
rever_anchor_to_ivm = set()
for anc_ivm, compl_ivms in anchor_to_compl_ivms.items():
    for compl_ivm in compl_ivms:
        anchor_to_ivm.add((anc_ivm, compl_ivm))
        if compl_ivm in anchor_to_compl_ivms:
            if anc_ivm in anchor_to_compl_ivms[compl_ivm]:
                rever_anchor_to_ivm.add((anc_ivm, compl_ivm))
print("anchor_to_compl ivm pairs = {:,}, rever pairs = {:,}, proportion = {:.3f}".format(len(anchor_to_ivm), len(rever_anchor_to_ivm), 
                                                                                        len(rever_anchor_to_ivm)/len(anchor_to_ivm)))

In [None]:
import copy
_exp = one_hop_examples[-10]
_exp = copy.deepcopy(_exp)
new_exp = {}
for key_1, vals_1 in _exp.items():
    if key_1 != "compl_pids":
        new_exp[key_1] = {}
        for key_2, vals_2 in _exp[key_1].items():
            if key_2 == "qids":
                vals_2 = [qid_to_query[qid] for qid in vals_2]
            new_exp[key_1][key_2] = vals_2
    else:
        new_exp[key_1] = {}
        for key_2, vals_2 in _exp[key_1].items():
            new_exp[key_1][key_2] = {}
            for key_3, val_3 in vals_2.items():
                if key_3 == "qids":
                    val_3 = [qid_to_query[qid] for qid in val_3]
                new_exp[key_1][key_2][key_3] = val_3
new_exp

In [None]:
len(anchor_to_compl_ivms)