In [None]:
import re
import json

from bs4 import BeautifulSoup, NavigableString, Tag
from transformers import T5TokenizerFast
from markdownify import ATX, BACKSLASH

from data_gathering.data_extraction.tree_modification import simplify_body, textify_simplified_head
from data_gathering.data_extraction.html2markdown import create_md
from data_gathering.data_extraction.utils import get_binary_dicts_templates

In [94]:
# Fast T5 tokenizer for the "google/flan-t5-base" checkpoint; every token count in this notebook is taken with it.
tokenizer = T5TokenizerFast.from_pretrained("google/flan-t5-base")

In [95]:
# Case-insensitive pattern for URLs (http/https schemes, www-prefixed hosts, or bare domains followed by "/").
# Used with re.sub to strip links from the page text before it is tokenized.
url_regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

In [96]:
# Load the dataset-creation settings once; the values below are shared by later cells.
# The JSON is decoded as UTF-8 explicitly so the result does not depend on the platform's default encoding.
with open("../dataset_creation_config.json", "r", encoding="utf-8") as file:
    config = json.load(file)

# Passed to simplify_body as `tags_to_include` / `text_formatting_tags`.
tags_to_include = set(config["tags"])
text_formatting_tags = set(config["text_formatting_tags"])
# Passed to textify_simplified_head as `meta_acceptable_values`.
meta_values = config["meta_values"]
abbreviations = config["abbreviations"]
# Template dicts derived from the config by the project helper.
(
    available_tags_binary_dict,
    available_attributes_values_binary_dict,
) = get_binary_dicts_templates(config)

In [97]:
# HTML -> Markdown converter callable from the project's create_md helper, configured with
# markdownify's ATX heading style and BACKSLASH newline style. It is applied to soup objects below.
md = create_md(heading_style=ATX, newline_style=BACKSLASH)

In [98]:
# Instruction text for the extraction task. Its token count is reserved out of the model input budget,
# so any change here changes how much page text fits in one input.
prompt = "Extract all information about the product from this text from a web store. Extract product title, brand, current price, old price, specifications (including material and dimensions), description, all sizes, all colors, all variants, categories, breadcrumbs, ratings (including number per each rating), opinions, number of available items (for each size) if provided else return null. Also extract similar products, products I may also like, related products, recommended for me but only return product name, brand and price. Categorize the product yourself (provide one general and one specific category). For each opinion return only the opinion text itself. If there's no information return: null. Answer should be in a proper JSON format. The text:"

In [99]:
# Tokenize the instruction prompt on its own so its token cost can be measured.
# NOTE(review): `input` shadows the Python built-in of the same name; the following cell reads `input.input_ids`.
input = tokenizer(prompt)

In [100]:
# Number of token ids the prompt occupies; subtracted from the model budget by get_max_input_size.
prompt_desc_size = len(input.input_ids)

In [101]:
# Total token budget for one model input; get_max_input_size subtracts the prompt and header sizes from it.
# NOTE(review): presumably the input limit chosen for the target model — confirm.
model_max_input_size = 2048

## Max input length (in tokens) that can still be added after the prompt and header

model_max_input_size = 2048

In [102]:
def get_max_input_size(model_max_input_size: int, prompt_desc_size: int, header_md: str, tokenizer: T5TokenizerFast) -> int:
    """Return the number of tokens left for page content in a single model input.

    Args:
        model_max_input_size: Total token budget of one model input.
        prompt_desc_size: Token count already taken by the instruction prompt.
        header_md: Header text that accompanies every chunk; it is tokenized
            here to measure its cost.
        tokenizer: Callable whose result exposes ``input_ids``.

    Returns:
        ``model_max_input_size`` minus the prompt size and the header's token
        count. The value is not clamped, so it can be negative when the
        reserved parts alone exceed the budget.
    """
    header_token_count = len(tokenizer(header_md).input_ids)
    reserved_tokens = prompt_desc_size + header_token_count
    return model_max_input_size - reserved_tokens

In [137]:
def join_tags(elems, tag_name):
    """Concatenate HTML fragments and wrap the result in a single tag.

    Args:
        elems: Iterable of ``(html_fragment, size)`` pairs; only the fragment
            is used here, the size is ignored.
        tag_name: Name of the element that wraps the concatenated fragments.

    Returns:
        The string ``<tag_name>...</tag_name>`` containing all fragments in
        their original order. An empty ``elems`` yields an empty element.

    Side effects:
        Prints a short debug preview (first 100 characters) of the result.
    """
    inner_html = "".join(fragment for fragment, _ in elems)
    wrapped = f"<{tag_name}>{inner_html}</{tag_name}>"

    # Debug trace of what was just assembled.
    separator = "=" * 20
    print(separator)
    print("html", wrapped[:100])
    print(separator)

    return wrapped

def join_as_many_tags_as_possible(parts, max_size: int, tag_name="body"):
    """Greedily pack consecutive HTML parts into chunks of at most ``max_size``.

    Parts are taken in order and accumulated into the current chunk while the
    sum of their sizes stays within ``max_size``. When the next part would
    overflow, the current chunk is closed (wrapped via ``join_tags``) and a
    new chunk starts with that part.

    Args:
        parts: Iterable of ``(html_fragment, size)`` pairs.
        max_size: Upper bound for the summed size of one chunk.
        tag_name: Tag used by ``join_tags`` to wrap every chunk.

    Returns:
        List of ``(chunk_html, chunk_size)`` tuples, where ``chunk_size`` is
        the sum of the sizes of the parts in the chunk. A part that is larger
        than ``max_size`` on its own still becomes a chunk by itself, so a
        chunk can exceed ``max_size``.
    """
    chunks = []
    pending = []
    pending_size = 0

    for fragment, fragment_size in parts:
        # Close the current chunk first if this fragment would overflow it.
        if pending and pending_size + fragment_size > max_size:
            chunks.append((join_tags(pending, tag_name), pending_size))
            pending = []
            pending_size = 0

        pending.append((fragment, fragment_size))
        pending_size += fragment_size

    # Flush whatever is left after the last fragment.
    if pending:
        chunks.append((join_tags(pending, tag_name), pending_size))

    return chunks

def divide_html_version(soup, transform_html_func, max_size):
    """Recursively split an HTML (sub)tree into pieces that fit a token budget.

    The size of ``soup`` is the number of token ids the notebook-level
    ``tokenizer`` produces for ``transform_html_func(soup)``. A node that fits
    is returned whole. Otherwise each child is handled separately: ``Tag``
    children are split recursively, any other child is measured on its raw
    ``str()`` form (not through ``transform_html_func``). The collected parts
    are then re-packed by ``join_as_many_tags_as_possible`` and every chunk is
    wrapped in ``soup.name``.

    Args:
        soup: Node to split; must expose ``contents`` and ``name``.
        transform_html_func: Callable turning a node into the text that is
            tokenized to measure it.
        max_size: Maximum number of tokens allowed per piece.

    Returns:
        ``(str(soup), size)`` when the node fits as a whole; otherwise a list
        of ``(html, size)`` tuples. ``None`` is passed through if a recursive
        call returns ``None``.

    Notes:
        * A non-``Tag`` child bigger than ``max_size`` is still emitted as a
          part, so a resulting chunk may exceed ``max_size``.
        * Progress is traced to stdout with ``print``.
    """
    token_ids = tokenizer(transform_html_func(soup)).input_ids
    size = len(token_ids)
    print("size", size)

    # Small enough: keep the node intact.
    if size <= max_size:
        return str(soup), size

    node_banner = "=" * 30
    print()
    print(node_banner)
    print(str(soup)[:100])
    print(node_banner)
    print()

    child_banner = "+" * 20
    parts = []

    for child in soup.contents:
        child_html = str(child)
        print(child_banner)
        print(child_html[:100])
        print(child_banner)

        if isinstance(child, Tag):
            # Element nodes may themselves need splitting.
            child_result = divide_html_version(
                soup=child,
                transform_html_func=transform_html_func,
                max_size=max_size,
            )
        else:
            # Text-like nodes are atomic: measure the raw string as is.
            child_result = (child_html, len(tokenizer(child_html).input_ids))

        if child_result is None:
            return None

        if isinstance(child_result, list):
            # The child was split into several pieces; keep them all.
            print("res len", len(child_result))
            parts.extend(child_result)
        else:
            print("res len", 1)
            parts.append(child_result)

    return join_as_many_tags_as_possible(parts=parts, max_size=max_size, tag_name=soup.name)

In [138]:
# Load one saved product page from the "8000_tokens" sample folder and parse it with the
# built-in "html.parser" backend; this page is the test input for the splitting code above.
with open("../stats/original_pages_html/8000_tokens/english_craftonlineusa_626.html", "r") as f:
    html = f.read()

soup = BeautifulSoup(html, "html.parser")

In [139]:
# Turn the <head> into text (presumably keeping only the configured meta values — confirm in
# textify_simplified_head) and simplify the <body> tree, then render the simplified body to Markdown.
simplified_soup_head_text = textify_simplified_head(soup=soup.head, meta_acceptable_values=meta_values)
simplified_soup_body = simplify_body(
    soup=soup.body,
    text_formatting_tags=text_formatting_tags,
    tags_to_include=tags_to_include,
)
simplified_body_text = md(simplified_soup_body)

# Unsplit baseline: head text followed by the body Markdown, with URLs removed.
full_page_text = f"{simplified_soup_head_text}\n{simplified_body_text}"
full_page_text = re.sub(url_regex, "", full_page_text)

In [140]:
# Token ids of the whole, unsplit page text (prompt not included).
full_input = tokenizer(full_page_text).input_ids

In [141]:
# Token count of the unsplit page, for comparison with the per-input budget computed below.
len(full_input)

8268

In [142]:
# Persist the intermediate artefacts for manual inspection.
# UTF-8 is set explicitly so non-ASCII page text is written the same way on every platform
# instead of depending on (and possibly failing under) the locale's default encoding.
with open("simplified.html", "w", encoding="utf-8") as f:
    f.write(simplified_soup_body.prettify())

with open("simplified_head_txt.html", "w", encoding="utf-8") as f:
    f.write(simplified_soup_head_text)

In [143]:
# Tokens left for body content in one input once the prompt and the head text are accounted for.
max_input_size = get_max_input_size(model_max_input_size, prompt_desc_size, simplified_soup_head_text, tokenizer)

In [144]:
# Display the per-chunk token budget used as `max_size` for the split below.
max_input_size

1617

In [145]:
# Split the simplified body, measuring sizes on its Markdown rendering.
# `res` is a single (html, size) tuple when the body already fits, otherwise a list of such tuples.
res = divide_html_version(simplified_soup_body, transform_html_func=md, max_size=max_input_size)

size 8097

<body class="templateProduct"><div class="boxes-wrapper"><div id="page-body"><div id="body-content">

++++++++++++++++++++
<div class="boxes-wrapper"><div id="page-body"><div id="body-content"><div class="container"><div id
++++++++++++++++++++
size 8097

<div class="boxes-wrapper"><div id="page-body"><div id="body-content"><div class="container"><div id

++++++++++++++++++++
<div id="page-body"><div id="body-content"><div class="container"><div id="main-content"><div class=
++++++++++++++++++++
size 8077

<div id="page-body"><div id="body-content"><div class="container"><div id="main-content"><div class=

++++++++++++++++++++
<div id="body-content"><div class="container"><div id="main-content"><div class="main-content"><div 
++++++++++++++++++++
size 8077

<div id="body-content"><div class="container"><div id="main-content"><div class="main-content"><div 

++++++++++++++++++++
<div class="container"><div id="main-content"><div class="main-content"><div itemscope="" itemtype

In [147]:
# Re-measure every chunk the way it would be fed to the model (head text + chunk Markdown,
# URLs stripped, plus the prompt size) and dump each chunk to its own file for inspection.
if isinstance(res, list):
    for i, elem in enumerate(res):
        # Each element is an (html, size) tuple; re-parse the HTML of the chunk.
        elem_soup = BeautifulSoup(elem[0], "html.parser")

        simplified_elem_text = md(elem_soup)

        full_page_text_curr = f"{simplified_soup_head_text}\n{simplified_elem_text}"
        full_page_text_curr = re.sub(url_regex, "", full_page_text_curr)

        # Deliberately not named `input`: that name holds the prompt encoding read by the
        # `prompt_desc_size` cell (and is also a Python built-in), so it must not be overwritten.
        chunk_input_ids = tokenizer(full_page_text_curr).input_ids
        print(f"divided_{i} size", len(chunk_input_ids) + prompt_desc_size)

        print(str(elem_soup)[:100])
        # Explicit UTF-8 keeps the dump independent of the platform's default encoding.
        with open(f"divided_{i}.html", "w", encoding="utf-8") as f:
            f.write(elem_soup.body.prettify())
else:
    # The body fitted as a whole: `res` is a single (html, size) tuple.
    with open("divided_0.html", "w", encoding="utf-8") as f:
        f.write(BeautifulSoup(res[0], "html.parser").prettify())


divided_0 size 1391
<body><div><div><div><div><div><div><div><span class="hide" itemprop="name"> Reminisce Collection Ki
divided_1 size 1459
<body><div><div><div><div><div><div><div><div><div><div><div><div><div class="product-wrapper"><div 
divided_2 size 2977
<body><div><div><div><div><div><div><div><div><div><div><div><div><div><div><div><div><div><span> {"
divided_3 size 373
<body><div><div><div><div><div><div><div><div><div><div><div><div><div><div class="product-content" 
divided_4 size 2052
<body><div><div><div><div><div><div><div><div><div><div><div><div><div><div><div><div><div><span> {"
divided_5 size 1757
<body><div><div><div><div><div><div><div><div><div><div><div><div><div><div class="product-content" 
