In [7]:
import networkx as nx


In [40]:
kgs = [(12, 2, {"type": "is_relevant_to"}), (3, 2, {"type": "is_similar_to"}), 
       (4,8, {"type": "is_similar_to"}), (4, 9, {"type": "is_complementary_to"}),
      (4,8, {"type": "is_complemtary_to"})]
G = nx.MultiDiGraph()
G.add_edges_from(kgs)

print(G.edges(data="type"))
edges_to_delete = []
for n, nbrs in G.adj.items():
    for nbr, edge_attrs in nbrs.items():
        print(nbr, edge_attrs)
        if n == 4 and edge_attrs["type"] == "is_similar_to":
            edges_to_delete.append((n, nbr))
G.remove_edges_from(edges_to_delete)
print(G.edges(data="type"))

[(12, 2, 'is_relevant_to'), (3, 2, 'is_similar_to'), (4, 8, 'is_similar_to'), (4, 8, 'is_complemtary_to'), (4, 9, 'is_complementary_to')]
2 {0: {'type': 'is_relevant_to'}}
2 {0: {'type': 'is_similar_to'}}
8 {0: {'type': 'is_similar_to'}, 1: {'type': 'is_complemtary_to'}}


KeyError: 'type'

In [42]:
G[4][8]

AtlasView({0: {'type': 'is_similar_to'}, 1: {'type': 'is_complemtary_to'}})

In [43]:
import os

from google.cloud import bigquery
import pandas as pd 
import numpy as np

client = bigquery.Client()
print("Client creating using default project: {}".format(client.project))

query = """
    SELECT * 
    FROM `gcp-ushi-digital-ds-qa.new_hansi_dataset.comp_rec_ClicksData_2core`;
    """
query_job = client.query(query)
compl_rec_df = query_job.to_dataframe()

query = """
    SELECT * 
    FROM `gcp-ushi-digital-ds-qa.new_hansi_dataset.hansi_rec_ClicksData_5core`;
    """
query_job = client.query(query)
sim_rec_df = query_job.to_dataframe()

query = """
    SELECT *
    FROM `gcp-ushi-digital-ds-qa.new_hansi_dataset.search_ClicksData_1year_5core`;
"""
query_job = client.query(query)
search_df = query_job.to_dataframe()

query = """
    SELECT * 
    FROM `gcp-ushi-digital-ds-qa.hansi_dataset.all_products_info`;
    """
query_job = client.query(query)
product_df = query_job.to_dataframe()
print("product_df = {:,}".format(len(product_df)))

all_products = set(product_df.product_id)
anchors = set(compl_rec_df.anchor)
compl_ivms = set(compl_rec_df.ivm)
all_compl_ivms = anchors.union(compl_ivms)

print("================================ For anchor_to_compl_ivms: ===================================")
print("number of unique product = {:,}, anchors = {:,}, complementary_compl_ivms = {:,}".format(len(all_products), len(anchors), len(compl_ivms)))
assert len(all_products & anchors) == len(anchors) and len(all_products & compl_ivms) == len(compl_ivms),(
    len(all_products & anchors), len(anchors), len(all_products & compl_ivms), len(compl_ivms)
)

all_sim_ivms = set(sim_rec_df.anchor).union(set(sim_rec_df.ivm))
print("================================ After updating anchor_to_similar_ivms: ===================================")
print("all_compl_ivms = {:,}, all_sim_ivms = {:,}".format(len(all_compl_ivms), len(all_sim_ivms)))
print("sim_compl_intersect = {:,} ({:.3f})".format(len(all_compl_ivms & all_sim_ivms), len(all_compl_ivms & all_sim_ivms) / len(all_compl_ivms)))
print("all_ivms = {:,}".format(len(all_compl_ivms | all_sim_ivms)))
all_ivms = all_compl_ivms | all_sim_ivms

assert len(all_products & all_ivms) == len(all_ivms), (len(all_products & all_ivms), len(all_ivms))

query_to_ivms = search_df.groupby("query")["ivm"].apply(list)
ivm_to_tmp_queries = search_df.groupby("ivm")["query"].apply(list)
query_lengths = np.array([len(x) for x in ivm_to_tmp_queries.values])
all_queries = set(search_df["query"])
print("all queries = {:,}".format(len(all_queries)))
print("total ivms (queries) = {:,}, length >=3 = {:,}, length >= 5 = {:,}".format(
    len(query_lengths), np.sum(query_lengths >=3), np.sum(query_lengths >= 5) ))

anchor_to_compl_ivms = compl_rec_df.groupby("anchor")["ivm"].apply(list)
compl_ivms_length = np.array([len(x) for x in anchor_to_compl_ivms.values])
print("================================ For anchor_to_compl_ivms: ===================================")
print("total_compl_ivms = {:,}, length >=3 = {:,}, length >= 5 = {:,}".format(len(compl_ivms_length), np.sum(compl_ivms_length >=3), np.sum(compl_ivms_length >= 5) ))

anchor_to_sim_ivms = sim_rec_df.groupby("anchor")["ivm"].apply(list)

# map product --> text
from tqdm import tqdm 

ivm_to_title = {}
ivm_to_bullet = {}
ivm_to_catalog = {}
no_bulletin_ivms = set()
no_title_ivms = set()
no_catalog_ivms = set()

def preprocess_text(in_text):
    in_text = in_text.replace("\t", " ")
    in_text = in_text.replace("\n", " ")
    return in_text

for idx, row in tqdm(product_df.iterrows(), total=len(product_df)):
    product_id = row.product_id
    title = row.product_name if row.product_name != None else "No title"
    bullets = row.bullets if row.bullets != None else "No bullets"
    catalog = row.catalog_name if row.catalog_name != None else "No catalog"
    
    if row.product_name == None:
        no_title_ivms.add(product_id)
    if row.bullets == None:
        no_bulletin_ivms.add(product_id)
    if row.catalog_name == None:
        no_catalog_ivms.add(product_id)
    
    title = preprocess_text(title)
    bullets = preprocess_text(bullets)
    catalog = preprocess_text(catalog)
    
    ivm_to_title[product_id] = title
    ivm_to_bullet[product_id] = bullets
    ivm_to_catalog[product_id] = catalog

# sanity check
print("ivm_to_title = {:,}, ivm_to_bullet = {:,}, ivm_to_catalog = {:,}, products no bulletin = {:,}, no title = {:,}, no catalog = {:,}".format(
    len(ivm_to_title), len(ivm_to_bullet), len(ivm_to_catalog), len(no_bulletin_ivms), len(no_title_ivms), len(no_catalog_ivms)
))

assert len(ivm_to_title) == len(ivm_to_bullet) == len(ivm_to_catalog) == len(product_df)

import ujson
from collections import defaultdict

# map to pid and qid
ivm_to_pid = {ivm: pid for pid, ivm in enumerate(list(all_products))}
pid_to_ivm = {pid: ivm for ivm, pid in ivm_to_pid.items()}
query_to_qid = {query: qid + len(ivm_to_pid) for qid, query in enumerate(list(all_queries))}
qid_to_query = {qid: query for query, qid in query_to_qid.items()}
start_qid = len(ivm_to_pid)

pid_to_title = {ivm_to_pid[ivm]: title for ivm, title in ivm_to_title.items()}
pid_to_bullet = {ivm_to_pid[ivm]: bullet for ivm, bullet in ivm_to_bullet.items()}
pid_to_catalog = {ivm_to_pid[ivm]: catalog for ivm, catalog in ivm_to_catalog.items()}

aid_to_sim_pids = {ivm_to_pid[anchor]: [ivm_to_pid[ivm] for ivm in sim_ivms] for anchor, sim_ivms in anchor_to_sim_ivms.items()}
aid_to_compl_pids = {ivm_to_pid[anchor]: [ivm_to_pid[ivm] for ivm in compl_ivms] for anchor, compl_ivms in anchor_to_compl_ivms.items()}
qid_to_pids = {query_to_qid[query]: [ivm_to_pid[ivm] for ivm in ivms] for query, ivms in query_to_ivms.items()}
pid_to_tmp_qids = {ivm_to_pid[ivm]: [query_to_qid[_query] for _query in queries] for ivm, queries in ivm_to_tmp_queries.items()}

Client creating using default project: gcp-ushi-digital-ds-qa
product_df = 2,260,878
number of unique product = 2260878, anchors = 86,870, complementary_compl_ivms = 65,561
all_compl_ivms = 109,758, all_sim_ivms = 256,765
sim_compl_intersect = 87,425 (0.797)
all_ivms = 279,098
all queries = 953773
total ivms (queries) = 360,744, length >=3 = 196,481, length >= 5 = 142,527
total_compl_ivms = 86,870, length >=3 = 35,837, length >= 5 = 22,121


100%|██████████| 2260878/2260878 [05:29<00:00, 6871.00it/s]


ivm_to_title = 2,260,878, ivm_to_bullet = 2,260,878, ivm_to_catalog = 2,260,878, products no bulletin = 0, no title = 21, no catalog = 4,519


In [63]:
G = nx.MultiDiGraph()
SIM_RELATION = "is_similar_to"
COMPL_RELATION = "is_complementary_to"
REL_RELATION = "is_relevant_to"

val_test_indices = np.random.choice(np.arange(0, len(aid_to_sim_pids)), int(0.2*len(aid_to_sim_pids)), replace=False)
val_indices = val_test_indices[:int(0.5*len(val_test_indices))]
test_indices = val_test_indices[int(0.5*len(val_test_indices)):]
train_aid_to_simpids, val_aid_to_simpids, test_aid_to_simpids = {}, {}, {}
for idx, (aid, simpids) in enumerate(aid_to_sim_pids.items()):
    if idx in val_indices:
        val_aid_to_simpids[aid] = simpids
    elif idx in test_indices:
        test_aid_to_simpids[aid] = simpids
    else:
        train_aid_to_simpids[aid] = simpids
print("number of train_aid_to_simpids = {:,}, val = {:,}, test = {:,}".format(len(train_aid_to_simpids), len(val_aid_to_simpids), 
                                                                            len(test_aid_to_simpids)))

for aid, sim_pids in train_aid_to_simpids.items():
    triples = [(aid, sim_pid, {"type":SIM_RELATION}) for sim_pid in sim_pids]
    G.add_edges_from(triples)
    
for aid, compl_pids in aid_to_compl_pids.items():
    triples = [(aid, compl_pid, {"type":COMPL_RELATION}) for compl_pid in compl_pids]
    G.add_edges_from(triples)
    
for pid, qids in pid_to_tmp_qids.items():
    triples = [(qid, pid, {"type": REL_RELATION}) for qid in qids]
    G.add_edges_from(triples)
    
    

number of train_aid_to_simpids = 172,991, val = 21,623, test = 21,624


In [64]:
multi_edge_pairs = []
for n, nbrs_dict in G.adj.items():
    for nbr_node, edge_attrs in nbrs_dict.items():
        assert len(edge_attrs) == 1 or len(edge_attrs) == 2
        if len(edge_attrs) == 2:
            multi_edge_pairs.append((n, nbr_node))
            
print("number of edges = {:,}, number of multi-attr edges = {:,}, ({:.3f})".format(G.number_of_edges(), len(multi_edge_pairs), 
                                                                                   len(multi_edge_pairs)/G.number_of_edges()))

number of edges = 5,181,469, number of multi-attr edges = 14,845, (0.003)


In [75]:
for i, (src_node, nbrs_dict) in enumerate(G.adj.items()):
    if i >= 1_000_000 and i < 1_000_000 + 10:
        print(i)
        print(src_node, nbrs_dict)

1000000
2511605 {2998: {0: {'type': 'is_relevant_to'}}, 529757: {0: {'type': 'is_relevant_to'}}}
1000001
2456733 {2998: {0: {'type': 'is_relevant_to'}}}
1000002
2965582 {2998: {0: {'type': 'is_relevant_to'}}, 1635048: {0: {'type': 'is_relevant_to'}}, 2237450: {0: {'type': 'is_relevant_to'}}, 1963674: {0: {'type': 'is_relevant_to'}}, 1217412: {0: {'type': 'is_relevant_to'}}, 221005: {0: {'type': 'is_relevant_to'}}, 1429565: {0: {'type': 'is_relevant_to'}}}
1000003
2651531 {337451: {0: {'type': 'is_relevant_to'}}}
1000004
2888930 {337451: {0: {'type': 'is_relevant_to'}}, 2240811: {0: {'type': 'is_relevant_to'}}}
1000005
2816955 {2010514: {0: {'type': 'is_relevant_to'}}, 2011310: {0: {'type': 'is_relevant_to'}}}
1000006
2608146 {2011310: {0: {'type': 'is_relevant_to'}}, 603729: {0: {'type': 'is_relevant_to'}}}
1000007
2190661 {}
1000008
975195 {}
1000009
3134000 {1134889: {0: {'type': 'is_relevant_to'}}, 753559: {0: {'type': 'is_relevant_to'}}}


In [80]:
for src_node, nbrs_dict in G.adj.items():
    if 975195 in nbrs_dict:
        print(src_node, nbrs_dict[975195])

2931719 {0: {'type': 'is_relevant_to'}}
