In [None]:
import os
import json

EXPLORE PATH

In [None]:
# Directory that is listed for the raw category ".json" dumps; the derived
# "*.processed.json" files and "final.db.json" are written into it as well.
OUT_DIR = "./categories/json"

In [None]:
def explore_with_depth(
    node: dict, name: str, path: str, parent: str, ancestor: str, depth: int = 0
):
    """Recursively annotate a category tree with naming and depth metadata.

    Args:
        node: Raw category node; its "url" and "inner" entries are read.
        name: Name recorded for this node.
        path: Slash-separated path recorded for this node.
        parent: Name recorded as this node's parent.
        ancestor: Name of the tree's top-level category, passed unchanged to
            every descendant.
        depth: Distance from the root (the root itself is 0).

    Returns:
        A dict with the keys "name", "parent", "ancestor", "is_leaf", "depth",
        "url", "path" and "inner" (in that order). "inner" maps each child
        name to the annotated child and is empty for leaves. "ancestor" is
        None for nodes whose depth is not positive.
    """
    url = node["url"]
    children = node["inner"]
    has_no_children = not children

    if has_no_children:
        annotated_children = {}
    else:
        # Children extend the path by their own name and report this node as
        # their parent, while the top-level ancestor is carried along as-is.
        annotated_children = {
            child_name: explore_with_depth(
                children[child_name],
                child_name,
                f"{path}/{child_name}",
                name,
                ancestor,
                depth + 1,
            )
            for child_name in children
        }

    return {
        "name": name,
        "parent": parent,
        "ancestor": None if depth <= 0 else ancestor,
        "is_leaf": has_no_children,
        "depth": depth,
        "url": url,
        "path": path,
        "inner": annotated_children,
    }


# Raw category dumps to process.
# - Match ".json" as a real suffix so stray files such as "x.json.bak" or
#   "x.json~" are not picked up and fed to json.load.
# - Skip names containing "processed" or "final": the later cells write
#   "*.processed.json" and "final.db.json" into this same directory.
# - Skip names containing U+F07C, a private-use code point.
#   NOTE(review): presumably an icon glyph leaked into scraped names - confirm.
# - Sort so the processing order does not depend on os.listdir's arbitrary order.
filenames = sorted(
    os.path.join(OUT_DIR, filename)
    for filename in os.listdir(OUT_DIR)
    if filename.endswith(".json")
    and "\uf07c" not in filename
    and "processed" not in filename
    and "final" not in filename
)

In [None]:
# Maps each top-level category name (the "path" field of its JSON file) to the
# annotated tree produced by explore_with_depth.
dict_data = {}
for old_filename in filenames:
    # splitext removes only the trailing extension. Splitting on ".json" would
    # cut at the first occurrence anywhere in the path (e.g. a directory name).
    new_filename = os.path.splitext(old_filename)[0] + ".processed.json"

    # Parse the input completely before opening the output, so a malformed
    # input file does not leave a truncated "*.processed.json" behind.
    with open(old_filename, "r", encoding="utf-8") as read_file:
        jsondata = json.load(read_file)

    # The root node is its own name, path, parent and ancestor.
    category_name = jsondata["path"]
    processed_jsondata = explore_with_depth(
        jsondata,
        category_name,
        category_name,
        category_name,
        category_name,
    )
    dict_data[category_name] = processed_jsondata

    with open(new_filename, "w", encoding="utf-8") as write_file:
        json.dump(
            processed_jsondata,
            write_file,
            indent=2,
        )

In [None]:
# Flat list of rows, one per emitted node, filled from the processed trees.
lst_data = []
# Dedup keys ("[depth=..][name=..]") of nodes that have already been emitted.
processed = set()


def process_node(node):
    """Flatten an annotated category node and its descendants into rows.

    Each row is ``[name, depth, ancestor, parent, path, url, is_leaf]``.
    Rows are emitted parent-first, followed by the rows of each child subtree.

    A node is emitted at most once per (depth, name) pair, tracked in the
    module-level ``processed`` set. When a node is skipped as a duplicate, its
    whole subtree is skipped too.
    NOTE(review): same-named categories at the same depth under different
    parents are therefore collapsed - confirm this is intended.

    Args:
        node: Annotated node dict, or an empty/None value.

    Returns:
        The list of rows for this subtree; ``[]`` for an empty node or for a
        node whose dedup key was already seen.
    """
    # This guard must run before the key is built: an empty node has no
    # "depth"/"name" entries, so reading them first would raise instead of
    # returning an empty result.
    if not node:
        return []

    key = f"[depth={node['depth']}][name={node['name']}]"
    if key in processed:
        return []
    processed.add(key)

    row = [
        node["name"],
        node["depth"],
        node["ancestor"],
        node["parent"],
        node["path"],
        node["url"],
        node["is_leaf"],
    ]
    output = [row]

    for child in node["inner"].values():
        output.extend(process_node(child))
    return output


# Flatten every processed category tree into lst_data, one row per node.
for node in dict_data.values():
    lst_data.extend(process_node(node))


# Persist the flattened rows next to the per-category files.
with open(
    os.path.join(OUT_DIR, "final.db.json"), "w", encoding="utf-8"
) as write_file:
    json.dump(lst_data, write_file, indent=2)

In [None]:
query = """
INSERT INTO "scraping"."amazon_categories" (
    name, depth, ancestor, parent, path, url, is_leaf
) VALUES ($1, $2, $3, $4, $5, $6, $7);
"""
import asyncpg

async with asyncpg.create_pool(dsn=os.getenv("POSTGRESQL_CONN_STR")) as pool:
    async with pool.acquire() as conn:
        await conn.executemany(query, lst_data)