In [14]:
# %%
import argparse

import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm.contrib.concurrent import thread_map

from util.mongodb import Mongo
import os

import networkx as nx
from pyvis.network import Network

In [2]:


def get_triples(meta, reviews):
    # has reviewed relations
    # reviewerID has_reviewed productID
    triples = set()

    triples.add((meta.get('asin'), 'price', meta.get('price', '')))

    has_wrote_summary = set()
    summary_is_for = set()
    for review in reviews:
        if 'summary' in review:
            has_wrote_summary.add(
                (review["reviewerID"], "has_wrote_summary", review["summary"])
            )
            if meta['asin']:
                summary_is_for.add((review["summary"], "summary_is_for", meta["asin"]))
    triples.update(has_wrote_summary)

    ranked = set()
    for review in reviews:
        ranked.add((review["reviewerID"], f"ranked_{int(review['overall'])}", review["asin"]))
    triples.update(ranked)
        
    if "category" in meta:
        is_an_instance_of = set()
        for category in meta["category"]:
            is_an_instance_of.add((meta["asin"], "is_an_instance_of", category))
        triples.update(is_an_instance_of)
        
    if "also_buy" in meta:
        also_buy = set()
        for asin in meta["also_buy"]:
            also_buy.add((meta["asin"], "also_buy", asin))
        triples.update(also_buy)

    if "also_view" in meta:
        also_view = set()
        for asin in meta["also_view"]:
            also_view.add((meta["asin"], "also_view", asin))
        triples.update(also_view)

    # if 'image' in meta:
    #     has_such_number_of_images = set()
    #     has_such_number_of_images.add((meta['asin'], 'has_such_number_of_images', len(meta['image'])))
    #     triples.update(has_such_number_of_images)

    if "brand" in meta:
        is_of_brand = set()
        if meta['brand']:
            is_of_brand.add((meta["asin"], "is_of_brand", meta["brand"]))
        triples.update(is_of_brand)

    if "details" in meta:

        if "Discontinued by manufacturer:" in meta["details"]:
            is_discontinued = set()
            is_discontinued.add(
                (
                    meta["asin"],
                    "is_discontinued",
                    meta["details"]["Discontinued by manufacturer:"],
                )
            )
            triples.update(is_discontinued)
    return triples

In [29]:

mongo = Mongo(host="52.53.202.9", port=27017, username="mongo", password="avengers", database="amazon_product_review")

triples = set()

query = {"$and":[
    {"main_cat": "All Beauty"},
    {"asin": {
        "$in": ['B00KXVY7M8', 'B00CQGUT3E', 'B0070WVEWE']
        }
        }]}
# query = {"$and":[
#     {"main_cat": "All Beauty"},
#     {"asin": {
#         "$in": ['B00KXVY7M8']
#         }
#         }]}
# total = mongo.meta.count_documents(query)
total = 5000
metas = mongo.get_meta(query, limit=total)

def create_triples(meta):
    reviews = list(mongo.get_review_by_asin(meta["asin"]))
    triples.update(get_triples(meta, reviews))
    
_ = thread_map(create_triples, metas, max_workers=30, total=total)

  0%|          | 0/5000 [00:00<?, ?it/s]

In [38]:
G = nx.Graph()
edge_types = {
    'also_view':"#000000", # black
    'has_wrote_summary':"#555555", # grey
    'is_an_instance_of':"#0000ff",# blue
    'is_of_brand':"#ff00ff",# pink
    'price':"#00ffff",# cyan
    'ranked_1':"#ff0000",# red
    'ranked_2':"#ff0000", # red
    'ranked_3':"#ff0000", # red
    'ranked_4':"#00ff00", # green
    'ranked_5':"#00ff00" # green
}
for node in ['B00KXVY7M8', 'B00CQGUT3E', 'B0070WVEWE']:
    G.add_node(node, color = "#FFFF00", size=40) # yellow
for triple in triples:
    if triple[1] == 'also_view':
        G.add_node(triple[2], color = "#FFFF00", size=20) # yellow
    if triple[1] == 'has_wrote_summary':
        G.add_node(triple[0], color = "#00FFFF", size=40) # cyan
    G.add_edge(triple[0], triple[2], group=triple[1], color=edge_types[triple[1]])

In [39]:
edge_types

{'also_view': '#000000',
 'has_wrote_summary': '#555555',
 'is_an_instance_of': '#0000ff',
 'is_of_brand': '#ff00ff',
 'price': '#00ffff',
 'ranked_1': '#ff0000',
 'ranked_2': '#ff0000',
 'ranked_3': '#ff0000',
 'ranked_4': '#00ff00',
 'ranked_5': '#00ff00'}

In [41]:
net = Network("100%", "70%")

net.from_nx(G)
net.show_buttons()
net.force_atlas_2based()
net.show("graph.html")

In [26]:
len(triples)

2612