In [13]:
import os
import json

from bs4 import BeautifulSoup
from data_gathering.data_extraction.tree_modification import simplify_body

In [14]:
with open("../dataset_creation_config.json", "r") as file:
    config = json.load(file)

tags_to_include = set(config["tags"])
text_formatting_tags = set(config["text_formatting_tags"])


In [15]:
class_id_names = ['supplementary']

In [16]:
DIR_PATH = "../web_pages/all_domains/pages"

In [17]:
files = os.listdir(DIR_PATH)

In [18]:
results = {name: set() for name in files}

In [19]:
import copy
from typing import List, Optional, Union

from bs4 import NavigableString, Tag


def handle_text(node: str) -> Optional[str]:
    text = node.strip().replace("\n", " ")

    if len(text):
        return f" {text} "


def handle_tag(node: Tag, text_formatting_tags: set, tags_to_include: set, filename: str):
    child = None

    if node.name == "a":
        child = node
    elif node.name in text_formatting_tags:
        if node.string:
            child = handle_text(node.string)
        elif len(node.contents):
            elements = simplify_body(
                soup=node,
                text_formatting_tags=text_formatting_tags,
                tags_to_include=tags_to_include,
                filename=filename
            )
            return elements
    elif node.name in tags_to_include:
        # Check is class or id includes nav
        class_str = " ".join(node.attrs.get("class", [])).lower()
        id_str = node.attrs.get("id", "").lower()

        if 'nav' in class_str or 'nav' in id_str:
            return None

        for name in class_id_names:
            if name in class_str or name in id_str:
                results[filename].add(name)

        child = simplify_body(
            soup=node,
            text_formatting_tags=text_formatting_tags,
            tags_to_include=tags_to_include,
            filename=filename,
        )

    return child


def handle_tag_a(elem: Tag, soup: Tag, text_formatting_tags: set, tags_to_include: set, filename:str):
    child = simplify_body(
        soup=elem,
        text_formatting_tags=text_formatting_tags,
        tags_to_include=tags_to_include,
        filename=filename
    )
    if child:
        for node in child.contents:
            soup.append(node)


def simplify_body(
    soup: Tag, text_formatting_tags: set, tags_to_include: set, filename: str
) -> Optional[Union[List[str], Tag]]:
    """
    Simplifies DOM tree.

    :param soup: a Tag Object
    :type soup: BeautifulSoup4 Tag
    :param text_formatting_tags: a set containing tags that should remain in the tree
    :type text_formatting_tags: set
    :param tags_to_include: a set containing text formatting tags (like "strong")
    :type tags_to_include: set

    :return: Returns different types which allows to recurrently clean the tree.
    :rtype: Optional[Union[List[str], Tag]]
    """
    if (
        soup.name in tags_to_include
        or soup.name == "a"
        or soup.name in text_formatting_tags
    ):
        children = []

        for node in soup.contents:
            child = None
            if isinstance(node, Tag):
                res = handle_tag(
                    node=node,
                    text_formatting_tags=text_formatting_tags,
                    tags_to_include=tags_to_include,
                    filename=filename
                )

                if type(res) == list:
                    children += res
                else:
                    child = res
            elif isinstance(node, NavigableString) and type(node) == NavigableString:
                child = handle_text(node)

            if child:
                children.append(child)

        if len(children) and soup.name in text_formatting_tags:
            return children
        elif len(children):
            soup.contents = []

            for elem in children:
                if isinstance(elem, Tag) and elem.name == "a":
                    handle_tag_a(
                        elem=elem,
                        soup=soup,
                        text_formatting_tags=text_formatting_tags,
                        tags_to_include=tags_to_include,
                        filename=filename
                    )
                else:
                    soup.append(elem)

            soup = copy.copy(soup)
            soup.smooth()

            return soup


In [20]:
for file in files:
    with open(f"{DIR_PATH}/{file}", "r") as f:
        html = f.read()

    soup = BeautifulSoup(html, "html.parser")

    simplified_soup_body = simplify_body(
        soup=soup.body,
        text_formatting_tags=text_formatting_tags,
        tags_to_include=tags_to_include,
        filename=file
    )


In [21]:
results


{'english_autoanything_2.html': {'supplementary'},
 'polish_easytoys_83.html': set(),
 'english_barnesandnoble_2139.html': set(),
 'polish_morele_2894.html': set(),
 'polish_doz_2700.html': set(),
 'english_footlocker_912.html': set(),
 'polish_ebutik_3957.html': set(),
 'english_lovinglane_3157.html': set(),
 'english_deguns_1403.html': set(),
 'english_hallmark_1578.html': set(),
 'polish_decathlon_4505.html': set(),
 'english_baublebar_1262.html': set(),
 'polish_zooart_94.html': set(),
 'english_vibemushrooms_4.html': set(),
 'english_target_2203.html': set(),
 'english_microcenter_4297.html': set(),
 'english_estellecoloredglass_60.html': set(),
 'english_baublebar_112.html': set(),
 'english_charmingcharlie_3922.html': set(),
 'english_guns_683.html': set(),
 'polish_eobuwie_812.html': set(),
 'english_ebay_4298.html': set(),
 'english_craftonlineusa_4872.html': set(),
 'english_thellegance_19.html': set(),
 'english_monoprice_2794.html': set(),
 'polish_answear_268.html': set(),

In [22]:
res = {key: list(val) for key, val in results.items()}

In [23]:
res = dict(sorted(res.items()))

In [24]:
with open("class_id_results_new.json", "w") as f:
    json.dump(res, f, indent=4)
