In [1]:
import os
import pickle
import glob

import pandas as pd 
import numpy as np

# Category of the Amazon review dump to process; every later path hangs off `in_dir`.
dataset_name = "Cell_Phones_and_Accessories"
in_dir = os.path.join(
    "/home/jupyter/unity_jointly_rec_and_search/datasets/amazon_review_datasets/",
    dataset_name,
)

# Two pickled lookup tables stored in the dataset folder.
# NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
asin_pickle_path = os.path.join(in_dir, "asin_to_pid.pkl")
with open(asin_pickle_path, "rb") as asin_file:
    ivm_to_pid = pickle.load(asin_file)

query_pickle_path = os.path.join(in_dir, "query_to_qid.pkl")
with open(query_pickle_path, "rb") as query_file:
    query_to_qid = pickle.load(query_file)


In [4]:
# start create graph

import random 
from tqdm import tqdm
import networkx as nx
from collections import defaultdict

# Pickled train/test splits, listed in the exact order they are unpacked below.
fns = [
    "train_aid_to_simpids.pkl",
    "test_aid_to_simpids.pkl",
    "train_aid_to_complpids.pkl",
    "test_aid_to_complpids.pkl",
    "train_qid_to_relpids.pkl",
    "test_qid_to_relpids.pkl",
]

datas = []
for pickle_name in fns:
    with open(os.path.join(in_dir, pickle_name), "rb") as handle:
        datas.append(pickle.load(handle))

(
    train_aid_to_simpids,
    test_aid_to_simpids,
    train_aid_to_complpids,
    test_aid_to_complpids,
    train_qid_to_pids,
    test_qid_to_pids,
) = datas

# Report how many head ids each split holds.
split_reports = (
    ("number of aid_to_simpids  train = {:,}, test = {:,}", train_aid_to_simpids, test_aid_to_simpids),
    ("number of aid_to_complpids train = {:,}, test = {:,}", train_aid_to_complpids, test_aid_to_complpids),
    ("number of qid_to_pids train = {:,}, test = {:,}", train_qid_to_pids, test_qid_to_pids),
)
for template, train_split, test_split in split_reports:
    print(template.format(len(train_split), len(test_split)))

# A head id must never appear in both the train and the test half of a split.
assert set(train_aid_to_simpids).isdisjoint(test_aid_to_simpids)
assert set(train_aid_to_complpids).isdisjoint(test_aid_to_complpids)
assert set(train_qid_to_pids).isdisjoint(test_qid_to_pids)

G = nx.MultiDiGraph()
SIM_RELATION = "is_similar_to"
COMPL_RELATION = "is_complementary_to"
REL_RELATION = "is_relevant_to"

# Anchor -> product edges for the two item-to-item relations.
for anchor_to_pids, relation in (
    (train_aid_to_simpids, SIM_RELATION),
    (train_aid_to_complpids, COMPL_RELATION),
):
    for aid, tail_pids in anchor_to_pids.items():
        G.add_edges_from([(aid, tail_pid, {"type": relation}) for tail_pid in tail_pids])

# Relevance edges point the other way round: product -> query.
for qid, pids in train_qid_to_pids.items():
    G.add_edges_from([(pid, qid, {"type": REL_RELATION}) for pid in pids])

# Collect ordered node pairs that are linked by two parallel edges.
multi_edge_pairs = []
for node, neighbours in tqdm(G.adj.items(), total=G.number_of_nodes()):
    for neighbour, parallel_edges in neighbours.items():
        n_parallel = len(parallel_edges)
        assert n_parallel in (1, 2)
        if n_parallel == 2:
            multi_edge_pairs.append((node, neighbour))

n_edges = G.number_of_edges()
print("number of edges = {:,}, number of multi-attr edges = {:,}, ({:.3f})".format(
    n_edges, len(multi_edge_pairs), len(multi_edge_pairs) / n_edges))

number of aid_to_simpids  train = 24,024, test = 3,003
number of aid_to_complpids train = 21,624, test = 2,703
number of qid_to_pids train = 593, test = 75


100%|██████████| 469578/469578 [00:01<00:00, 278603.62it/s]


number of edges = 659,201, number of multi-attr edges = 0, (0.000)


In [9]:
# Product ids are the first tab-separated column of collection_title.tsv.
collection_path = os.path.join(in_dir, "collection_title.tsv")
with open(collection_path) as collection_file:
    PIDS = [int(row.strip().split("\t")[0]) for row in collection_file]
print("max pids = {}".format(max(PIDS)))

def create_triples(hid, pos_tid, miss_hids, duplicate_pairs, eid_to_text, sampler=None):
    """Build one (head, positive tail, negative tail) training triple.

    Args:
        hid: id of the head entity.
        pos_tid: id of the positive tail entity.
        miss_hids: list, mutated in place. Receives `hid` whenever no negative
            can be drawn for it: the head is absent from `sampler`, or every
            available candidate equals `pos_tid`.
        duplicate_pairs: list, mutated in place. Receives `(hid, pos_tid)` when
            head and positive tail have identical text in `eid_to_text`.
        eid_to_text: mapping entity id -> text; must contain `hid` and `pos_tid`.
        sampler: optional dict mapping a head id to a sequence of candidate
            negative ids. When None, the negative is drawn uniformly from the
            module-level `PIDS` list.

    Returns:
        The tuple `(hid, pos_tid, neg_tid)`, or the integer 0 when the pair is
        skipped (callers test `triple != 0`).
    """
    if sampler is not None:
        assert isinstance(sampler, dict), type(sampler)
        if hid not in sampler:
            miss_hids.append(hid)
            return 0
    if eid_to_text[hid] == eid_to_text[pos_tid]:
        duplicate_pairs.append((hid, pos_tid))
        return 0

    # Draw from the ids themselves: sampling positions (range(len(PIDS))) is only
    # correct when the ids happen to be exactly 0..len(PIDS)-1.
    candidates = PIDS if sampler is None else sampler[hid]

    # The rejection loop below cannot terminate unless at least one candidate
    # differs from the positive; all() short-circuits on the first such candidate,
    # so this guard is cheap in the common case. It also covers an empty pool.
    if all(tid == pos_tid for tid in candidates):
        miss_hids.append(hid)
        return 0

    neg_tid = random.choice(candidates)
    while neg_tid == pos_tid:
        neg_tid = random.choice(candidates)

    return (hid, pos_tid, neg_tid)


# Entity id -> text, one "<id>\t<text>" record per line of all_entities.tsv.
entities_path = os.path.join(in_dir, "all_entities.tsv")
eid_to_text = {}
with open(entities_path) as entities_file:
    for raw_line in entities_file:
        # Unpacking raises ValueError unless the line has exactly two fields.
        entity_id, entity_text = raw_line.strip().split("\t")
        eid_to_text[int(entity_id)] = entity_text


# BM25 run file: space-separated columns, no header row.
run_path = os.path.join(in_dir, "runs/bm25.all.run")
run_columns = ["hid", "q0", "tid", "rank", "score", "model_name"]
df = pd.read_csv(run_path, sep=" ", names=run_columns)

# Heads with fewer than this many retrieved candidates are set aside.
min_candidates = 10
bm25_hid_to_tids = {}
ignore_hids = set()
for head_id, tid_series in df.groupby("hid")["tid"]:
    retrieved = list(tid_series.values)
    if len(retrieved) >= min_candidates:
        bm25_hid_to_tids[int(head_id)] = [int(tid) for tid in retrieved]
    else:
        ignore_hids.add(int(head_id))

print("number of ignore hids = {}".format(len(ignore_hids)))

max pids = 533870
number of ignore hids = 0


In [7]:
# Size and mean number of tails per head for the three training splits,
# always in the order: queries, similar-item anchors, complementary-item anchors.
train_splits = (train_qid_to_pids, train_aid_to_simpids, train_aid_to_complpids)

head_counts = [len(split) for split in train_splits]
print("unique train qids, sim_aids, compl_aids = {:,}, {:,}, {:,}".format(*head_counts))

mean_tail_counts = [np.mean([len(tails) for tails in split.values()]) for split in train_splits]
print("avg length for relpids, simpids, complpids = {:.3f}, {:.3f}, {:.3f}".format(*mean_tail_counts))

unique train qids, sim_aids, compl_aids = 593, 24,024, 21,624
avg length for relpids, simpids, complpids = 752.678, 5.073, 4.208


In [11]:
# Shared bookkeeping lists; create_triples appends to both across all three passes,
# so the counts printed after each pass are cumulative.
miss_hids, duplicate_pairs = [], []
h2sp_triples, h2cp_triples, q2p_triples = [], [], []

# (head -> positive tails, output list, negative sampler) for each relation.
# The similar-item pass passes no sampler; the other two draw negatives from BM25 candidates.
triple_jobs = (
    (train_aid_to_simpids, h2sp_triples, None),
    (train_aid_to_complpids, h2cp_triples, bm25_hid_to_tids),
    (train_qid_to_pids, q2p_triples, bm25_hid_to_tids),
)
for head_to_tails, bucket, negative_pool in triple_jobs:
    for hid, tail_ids in head_to_tails.items():
        for pos_tid in tail_ids:
            triple = create_triples(hid, pos_tid, miss_hids, duplicate_pairs, eid_to_text,
                                    sampler=negative_pool)
            # create_triples signals a skipped pair with 0.
            if triple != 0:
                bucket.append(triple)
    print("miss_hids = {:,}, duplicate_pairs = {:,}".format(len(miss_hids), len(duplicate_pairs)))
    print("="*75)

print("number of h2sp, h2cp, q2p triples = {:,}, {:,}, {:,}".format(
    len(h2sp_triples), len(h2cp_triples), len(q2p_triples)
))

miss_hids = 0, duplicate_pairs = 75
miss_hids = 0, duplicate_pairs = 85
miss_hids = 0, duplicate_pairs = 85
number of h2sp, h2cp, q2p triples = 121,800, 90,978, 446,338


In [12]:
import pickle

kgc_dir = in_dir


def _write_tsv(path, rows):
    """Write every row (an iterable of fields) to `path` as one tab-separated line."""
    with open(path, "w") as fout:
        for row in rows:
            fout.write("\t".join(str(field) for field in row) + "\n")


# ---- training triples: "<head>\t<positive>\t<negative>\t<relation>" ----
out_dir = os.path.join(kgc_dir, "unified_train/")
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no check-then-create
# race, and missing parent directories are created as well.
os.makedirs(out_dir, exist_ok=True)

fn_to_tripleNrel = {
    "a2sp.train.tsv": (h2sp_triples, SIM_RELATION),
    "a2cp.train.tsv": (h2cp_triples, COMPL_RELATION),
    "q2p.train.tsv": (q2p_triples, REL_RELATION),
}

for fn, (triples, relation) in fn_to_tripleNrel.items():
    _write_tsv(
        os.path.join(out_dir, fn),
        ((hid, pos_tid, neg_tid, relation) for (hid, pos_tid, neg_tid) in triples),
    )

# ---- test heads and their relevance judgements ----
out_dir = os.path.join(kgc_dir, "unified_test/")
os.makedirs(out_dir, exist_ok=True)

# (heads file, judgements file, head -> positive pids, relation), one entry for
# similar items, complementary items and queries respectively.
test_exports = (
    ("anchors.test.sim.tsv", "arels.test.sim.tsv", test_aid_to_simpids, SIM_RELATION),
    ("anchors.test.compl.tsv", "arels.test.compl.tsv", test_aid_to_complpids, COMPL_RELATION),
    ("queries.test.tsv", "qrels.test.tsv", test_qid_to_pids, REL_RELATION),
)
for heads_fn, rels_fn, head_to_pids, relation in test_exports:
    # Heads file: "<head id>\t<head text>\t<relation>"
    _write_tsv(
        os.path.join(out_dir, heads_fn),
        ((head_id, eid_to_text[head_id], relation) for head_id in head_to_pids),
    )
    # Judgements file: "<head id>\tQ0\t<pid>\t1", one line per positive pair.
    _write_tsv(
        os.path.join(out_dir, rels_fn),
        ((head_id, "Q0", pid, 1) for head_id, pids in head_to_pids.items() for pid in pids),
    )

In [13]:
# sanity check
out_dir = os.path.join(kgc_dir, "unified_train/")
for path in os.listdir(out_dir):
    path = os.path.join(out_dir, path)
    ! wc -l $path
    ! head -n 3 $path
    ! tail -n 3 $path
    print("="*100)

446338 /home/jupyter/unity_jointly_rec_and_search/datasets/amazon_review_datasets/Cell_Phones_and_Accessories/unified_train/q2p.train.tsv
534497	413480	375531	is_relevant_to
534220	124354	519984	is_relevant_to
534502	415624	510037	is_relevant_to
534278	180645	488919	is_relevant_to
534072	384170	411126	is_relevant_to
534380	278164	37553	is_relevant_to
121800 /home/jupyter/unity_jointly_rec_and_search/datasets/amazon_review_datasets/Cell_Phones_and_Accessories/unified_train/a2sp.train.tsv
166	390657	334639	is_similar_to
166	257135	492614	is_similar_to
274	530133	402672	is_similar_to
533854	430577	172037	is_similar_to
533854	494818	409806	is_similar_to
533854	478166	112368	is_similar_to
90978 /home/jupyter/unity_jointly_rec_and_search/datasets/amazon_review_datasets/Cell_Phones_and_Accessories/unified_train/a2cp.train.tsv
64	2902	45045	is_complementary_to
219	352188	340285	is_complementary_to
288	354524	301	is_complementary_to
533863	422474	38497	is_complementary_to
533863	487604	315914	i

In [21]:
# Spot-check one triple: print the id and text of its head, positive and negative.
hid, pos_tid, neg_tid = 533903, 474745, 516305
for entity_id in (hid, pos_tid, neg_tid):
    print(entity_id, eid_to_text[entity_id])


533903 query: cell phones accessories power adapters solar chargers
474745 product: Solar Panel Foldable Portable Universal 24W High Efficiency Dual-Port Solar Charger - by iURJA
516305 product: Solar Charger Eco-daily 10000mah Dual USB Solar Charger Waterproof Solar Power Bank / Backup Battery Charger for Android Cell Phone, Solar Power Bank for Iphone 6 Ipad Blue


In [20]:
# Show the last 100 (head, positive, negative) entries of q2p_triples as the cell output.
q2p_triples[-100:]

[(533903, 149009, 242680),
 (533903, 69147, 503725),
 (533903, 97830, 516303),
 (533903, 50731, 428699),
 (533903, 11824, 524995),
 (533903, 138837, 526850),
 (533903, 202340, 733),
 (533903, 497255, 510667),
 (533903, 18023, 531986),
 (533903, 73328, 181038),
 (533903, 99953, 489590),
 (533903, 474745, 516305),
 (533903, 16008, 374224),
 (533903, 7817, 382905),
 (533903, 276105, 277644),
 (533903, 276107, 138837),
 (533903, 444055, 300908),
 (533903, 329371, 17391),
 (533903, 421531, 289628),
 (533903, 458396, 374231),
 (533903, 480928, 516726),
 (533903, 476839, 474308),
 (533903, 530096, 516729),
 (533903, 259766, 479177),
 (533903, 5816, 360071),
 (533903, 272061, 448586),
 (533903, 366285, 388597),
 (533903, 16103, 6754),
 (533903, 311029, 456170),
 (533903, 173831, 447482),
 (533903, 390930, 516595),
 (533903, 519978, 289628),
 (533903, 530218, 508272),
 (533903, 417581, 148116),
 (533903, 296754, 532587),
 (533903, 491316, 286162),
 (533903, 229180, 331389),
 (533903, 390977, 31

In [34]:
# sanity check
import os

#test_dir = os.path.join(in_dir, "selected_test_user")
# NOTE(review): this rebinds kgc_dir to .../datasets/unified_kgc/ and reads
# max2_qorient_q2p.train.tsv, neither of which is written by the export cell in
# this notebook (it writes under in_dir and names the file q2p.train.tsv).
# Confirm this check targets the intended dataset.
kgc_dir = "/home/jupyter/unity_jointly_rec_and_search/datasets/unified_kgc/"

# test_fns[i] is checked against train_fns[i]; keep the two lists aligned.
test_fns = [
    os.path.join(kgc_dir, rel_path)
    for rel_path in (
        "unified_test/arels.test.sim.tsv",
        "unified_test/arels.test.compl.tsv",
        "unified_test/qrels.test.tsv",
    )
]
train_fns = [
    os.path.join(kgc_dir, rel_path)
    for rel_path in (
        "unified_train/a2sp.train.tsv",
        "unified_train/a2cp.train.tsv",
        "unified_train/max2_qorient_q2p.train.tsv",
    )
]


def _first_column_ids(tsv_path):
    """Return the set of first-column values of a TSV file whose lines all have 4 fields."""
    ids = set()
    with open(tsv_path) as fin:
        for line in fin:
            fields = line.rstrip().split("\t")
            assert len(fields) == 4
            ids.add(fields[0])
    return ids


for test_fn, train_fn in zip(test_fns, train_fns):
    test_qids = _first_column_ids(test_fn)
    train_qids = _first_column_ids(train_fn)
    # No head id may occur in both the test and the train file of a pair.
    assert test_qids.isdisjoint(train_qids)
    print(f"test_fn: {test_fn}, size: {len(test_qids):,}")
    print(f"train_fn: {train_fn}, size: {len(train_qids):,}")
    print(75*"=")
print("SUCCESS: test and train qids not overlap.")

test_fn: /home/jupyter/unity_jointly_rec_and_search/datasets/unified_kgc/unified_test/arels.test.sim.tsv, size: 8,273
train_fn: /home/jupyter/unity_jointly_rec_and_search/datasets/unified_kgc/unified_train/a2sp.train.tsv, size: 199,507
test_fn: /home/jupyter/unity_jointly_rec_and_search/datasets/unified_kgc/unified_test/arels.test.compl.tsv, size: 6,720
train_fn: /home/jupyter/unity_jointly_rec_and_search/datasets/unified_kgc/unified_train/a2cp.train.tsv, size: 80,042
test_fn: /home/jupyter/unity_jointly_rec_and_search/datasets/unified_kgc/unified_test/qrels.test.tsv, size: 7,274
train_fn: /home/jupyter/unity_jointly_rec_and_search/datasets/unified_kgc/unified_train/max2_qorient_q2p.train.tsv, size: 832,482
SUCCESS: test and train qids not overlap.
