In [7]:
import os
import json
import random

from bs4 import BeautifulSoup
import html2text

from tree_modification import simplify_body, textify_simplified_head
from utils import get_binary_dicts_templates

In [8]:
# web_pages = os.listdir("web_pages/all_domains/pages")
# File names found in the two category directories (products / other).
# These lists hold bare names; the directory is passed separately to the functions below.
web_pages_products = os.listdir("../../../../studia/master-thesis/data/02_intermediate/web_pages/products")
web_pages_other = os.listdir("../../../../studia/master-thesis/data/02_intermediate/web_pages/other")
# len(web_pages)

In [3]:
# Keep a reproducible 10% sample of each category.
# os.listdir returns names in arbitrary order, so the lists are sorted first, and a
# dedicated seeded generator is used so a fresh run selects the same pages.
# NOTE: re-running this cell samples the already-reduced lists again.
_sampling_rng = random.Random(42)
web_pages_products = _sampling_rng.sample(sorted(web_pages_products), len(web_pages_products) // 10)
web_pages_other = _sampling_rng.sample(sorted(web_pages_other), len(web_pages_other) // 10)

In [4]:
# Number of page file names currently held in each list (products, other).
len(web_pages_products), len(web_pages_other)

(40079, 14836)

In [5]:
# web_pages

In [9]:
# Read the dataset-creation settings; the names defined here are used by later cells.
with open("./dataset_creation_config.json", "r") as file:
    config = json.load(file)

# The two tag collections are turned into sets; the other entries are kept as loaded.
tags_to_include, text_formatting_tags = set(config["tags"]), set(config["text_formatting_tags"])
meta_values, abbreviations = config["meta_values"], config["abbreviations"]

# Unpack the pair returned by get_binary_dicts_templates for this config.
available_tags_binary_dict, available_attributes_values_binary_dict = get_binary_dicts_templates(config)

In [10]:
from markdownify import MarkdownConverter, ATX, BACKSLASH
from bs4 import Tag

# Factory for a one-argument soup -> Markdown shorthand
def create_md(**options):
    """Return a function that converts a BeautifulSoup ``Tag`` to Markdown.

    A single ``MarkdownConverter`` is built from ``options`` and reused by the
    returned function for every call.
    """
    soup_converter = MarkdownConverter(**options)

    def md(soup: Tag):
        """Convert ``soup`` with the converter captured from ``create_md``."""
        markdown_text = soup_converter.convert_soup(soup)
        return markdown_text

    return md

In [11]:
# Module-level converter; later cells call it as md(<simplified body>).
md = create_md(heading_style=ATX, newline_style=BACKSLASH)

In [9]:
def compare_strings_line_by_line(str1, str2):
    """Return True when both strings consist of the same lines.

    Each string is stripped as a whole, split into lines, and every line is
    stripped again, so differences in indentation or surrounding blank space
    are ignored.

    The normalised line lists are compared as a whole, so strings with a
    different number of lines are reported as different. Previously a longer
    ``str2`` sharing a prefix with ``str1`` compared equal, and a shorter
    ``str2`` raised ``IndexError``.

    Args:
        str1: First text to compare.
        str2: Second text to compare.

    Returns:
        bool: True if the normalised line sequences are identical.
    """
    str1_lines = [line.strip() for line in str1.strip().splitlines()]
    str2_lines = [line.strip() for line in str2.strip().splitlines()]

    return str1_lines == str2_lines

In [9]:
def compare_files(web_pages, path):
    """Collect the pages whose simplified body differs between the two parsers.

    Every file ``{path}/{name}`` is parsed once with ``lxml`` and once with
    ``html.parser``; both bodies go through ``simplify_body`` and their
    prettified forms are compared with ``compare_strings_line_by_line``.

    Args:
        web_pages: Iterable of file names located in ``path``.
        path: Directory containing the page files.

    Returns:
        list: Names of pages that compared unequal or whose comparison raised.
    """

    def _simplified_body(markup, parser):
        # Parse with the requested backend and simplify the resulting <body>.
        return simplify_body(
            soup=BeautifulSoup(markup, parser).body,
            text_formatting_tags=text_formatting_tags,
            tags_to_include=tags_to_include,
        )

    mismatched = []

    for page_name in web_pages:
        with open(f"{path}/{page_name}", "r") as handle:
            markup = handle.read()

        body_lxml = _simplified_body(markup, "lxml")
        body_html_parser = _simplified_body(markup, "html.parser")

        try:
            identical = compare_strings_line_by_line(body_lxml.prettify(), body_html_parser.prettify())
        except Exception:
            # A failure while prettifying or comparing counts as a difference.
            identical = False

        if not identical:
            mismatched.append(page_name)

    return mismatched

In [10]:
import cchardet

# Run the parser comparison for each category; each value is the list returned by compare_files.
different_files_dict = {
    'products': compare_files(web_pages_products, '../../../../studia/master-thesis/data/02_intermediate/web_pages/products'),
    'other': compare_files(web_pages_other, '../../../../studia/master-thesis/data/02_intermediate/web_pages/other')
}

# Persist the result as JSON in the working directory.
with open("different_files.json", "w") as file:
    json.dump(different_files_dict, file, indent=4)



# Without nav class or id
Later, repeat the check with nav.

In [12]:
from transformers import AutoTokenizer
import os

# Tokenizer used by get_times() to measure the length of each page text in tokens.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")


In [13]:
from time import time
# import cchardet
import re

# Pattern for URL-like substrings; matches are removed from the page text before tokenizing.
url_regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

# Module-level accumulators appended to by get_times(); they are never reset, so
# calling get_times() again keeps adding to the same lists.
# Elapsed seconds per page for the simplify step and the Markdown step, per parser.
simplify_lxml_times = []
simplify_html_parser_times = []
md_lxml_times = []
md_html_parser_times = []

# Token count of the final page text per page, per parser.
sizes_lxml = []
sizes_html_parser = []

def get_times(web_pages, path):
    """Benchmark body simplification and Markdown conversion for both HTML parsers.

    For every file ``{path}/{name}`` the page is parsed with ``lxml`` and then
    with ``html.parser``. For each parser the function times ``simplify_body``
    and the Markdown conversion, then records the token count of the resulting
    page text (head text + body text with URLs removed). Results are appended
    to the module-level lists ``simplify_*_times``, ``md_*_times`` and
    ``sizes_*``.

    The lxml pipeline converts with ``md``; the html.parser pipeline converts
    with ``html2text`` on the prettified body. The head text is computed once,
    from the lxml soup, and reused for both pipelines.

    A page that raises anywhere in the lxml pipeline is skipped entirely, so
    it is not measured under html.parser either. Values appended before the
    failing stage stay in the lists.

    Args:
        web_pages: Iterable of file names located in ``path``.
        path: Directory containing the page files.

    Returns:
        None. Results are reported through the module-level lists.
    """
    # perf_counter is the high-resolution monotonic clock intended for measuring
    # short intervals; time() is a wall clock and can be coarse or adjusted.
    from time import perf_counter

    def _benchmark(soup, to_markdown, head_text, simplify_times, md_times, sizes):
        """Time one parser pipeline for one page; return False if any stage raised."""
        try:
            start = perf_counter()
            simplified_body = simplify_body(
                soup=soup.body,
                text_formatting_tags=text_formatting_tags,
                tags_to_include=tags_to_include,
            )
            simplify_times.append(perf_counter() - start)

            start = perf_counter()
            body_text = to_markdown(simplified_body)
            md_times.append(perf_counter() - start)

            page_text = re.sub(url_regex, "", f"{head_text}\n{body_text}")

            inputs = tokenizer(page_text.strip(), return_tensors="pt")
            # Read the sequence length from the last axis: squeeze() would collapse
            # a one-token sequence to a 0-d tensor and make shape[0] fail.
            sizes.append(inputs.input_ids.shape[-1])
        except Exception:
            return False
        return True

    for web_page in web_pages:
        with open(f"{path}/{web_page}", "r") as f:
            html = f.read()

        soup = BeautifulSoup(html, "lxml")
        simplified_soup_head_text = textify_simplified_head(soup=soup.head, meta_acceptable_values=meta_values)

        lxml_ok = _benchmark(
            soup,
            md,
            simplified_soup_head_text,
            simplify_lxml_times,
            md_lxml_times,
            sizes_lxml,
        )
        if not lxml_ok:
            # Unchanged behaviour: a page failing under lxml is not measured with html.parser.
            continue

        soup = BeautifulSoup(html, "html.parser")
        _benchmark(
            soup,
            # The timed conversion includes prettify(), as in the original measurement.
            lambda body: html2text.html2text(body.prettify()),
            simplified_soup_head_text,
            simplify_html_parser_times,
            md_html_parser_times,
            sizes_html_parser,
        )

In [None]:
# Fill the timing and size accumulators for both page categories.
get_times(web_pages_products, '../../../../studia/master-thesis/data/02_intermediate/web_pages/products')
get_times(web_pages_other, '../../../../studia/master-thesis/data/02_intermediate/web_pages/other')

Token indices sequence length is longer than the specified maximum sequence length for this model (1275 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Mean elapsed seconds per measured page for each stage and parser.
# Each line divides by the list length, so an empty list raises ZeroDivisionError.
print(f"lxml simplify time: {sum(simplify_lxml_times)/len(simplify_lxml_times)}")
print(f"html parser simplify time: {sum(simplify_html_parser_times)/len(simplify_html_parser_times)}")
print(f"lxml md time: {sum(md_lxml_times)/len(md_lxml_times)}")
print(f"html parser md time: {sum(md_html_parser_times)/len(md_html_parser_times)}")

In [None]:
# Mean token count per measured page for each parser (an empty list raises ZeroDivisionError).
print(f"lxml size: {sum(sizes_lxml)/len(sizes_lxml)}")
print(f"html parser size: {sum(sizes_html_parser)/len(sizes_html_parser)}")

## Old results

lxml simplify time: 0.18664660945099995
html parser simplify time: 0.10447529040329413
lxml md time: 0.0016778466235480365
html parser md time: 0.017234264529081146

In [5]:
import re

# Pattern for URL-like substrings; matches are removed from the page text before it is written.
url_regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

# For every saved page, write two files under the same name:
#   results/     head text + Markdown body, with URLs removed
#   simplified/  prettified simplified body
# Both output directories must already exist; nothing here creates them.
for web_page in os.listdir("web_pages/all_domains/pages"):
    with open(f"web_pages/all_domains/pages/{web_page}", "r") as f:
        html = f.read()

    soup = BeautifulSoup(html, "lxml")

    try:
        simplified_soup_body = simplify_body(
            soup=soup.body,
            text_formatting_tags=text_formatting_tags,
            tags_to_include=tags_to_include,
        )
        simplified_body_text = md(simplified_soup_body)
        # From here on the name holds the prettified string rather than the tag.
        simplified_soup_body = simplified_soup_body.prettify()
    except Exception:
        # Any failure in the body pipeline yields empty outputs for this page.
        simplified_soup_body = ""
        simplified_body_text = ""

    try:
        simplified_soup_head_text = textify_simplified_head(
            soup=soup.head, meta_acceptable_values=meta_values
        )
    except Exception:
        # A failing head falls back to an empty string; the body text is still written.
        simplified_soup_head_text = ""

    full_page_text = f"{simplified_soup_head_text}\n{simplified_body_text}"
    full_page_text = re.sub(url_regex, "", full_page_text)

    with open(f"web_pages/all_domains/results/{web_page}", "w") as f:
        f.write(full_page_text)

    with open(f"web_pages/all_domains/simplified/{web_page}", "w") as f:
        f.write(simplified_soup_body)



In [6]:
# Sanity check: names present in results/ but missing from simplified/ (empty set when none).
a = set(os.listdir("web_pages/all_domains/results"))
b = set(os.listdir("web_pages/all_domains/simplified"))
a.difference(b)

set()

In [16]:
# import filecmp
#
# for web_page in web_pages:
#     res = filecmp.cmp(f"web_pages/all_domains/simplified/{web_page}", f"web_pages/all_domains/simpl_html_parser/{web_page}")
#     if not res:
#         print(res, web_page)


In [2]:
# with open("different_files.json", "r") as file:
#     different_files_dict = json.load(file)
#
# different_files_products = different_files_dict['products']
# different_files_other = different_files_dict['other']


In [7]:
# from shutil import copyfile
#
# for file in different_files_products:
#     copyfile(f"../../../../studia/master-thesis/data/02_intermediate/web_pages/products/{file}", f"web_pages/all_domains/broken_pages/{file}")
#
# for file in different_files_other:
#     copyfile(f"../../../../studia/master-thesis/data/02_intermediate/web_pages/other/{file}", f"web_pages/all_domains/broken_pages/{file}")