In [1]:
import openai
import os
import json

from markdownify import ATX, BACKSLASH
from transformers import T5TokenizerFast
from bs4 import BeautifulSoup, Tag

from data_gathering.data_extraction.utils import get_binary_dicts_templates
# from data_gathering.data_extraction.html2markdown import create_md
from data_gathering.data_extraction.tree_modification import simplify_body, textify_meta_tags

In [2]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment so the API key
# does not have to be written into the notebook.
load_dotenv()

# os.getenv returns None when OPEN_AI_API_KEY is not set; nothing here checks for that.
API_KEY = os.getenv('OPEN_AI_API_KEY')

# Set the key on the openai module so later API calls pick it up.
openai.api_key = API_KEY

In [3]:
# Tokenizer used for token counting in this notebook (prompt size, HTML chunk sizes).
# NOTE(review): this is the FLAN-T5 vocabulary, so the counts presumably only
# approximate what an OpenAI chat model would count — confirm the margin is acceptable.
tokenizer = T5TokenizerFast.from_pretrained("google/flan-t5-base")

In [2]:
# Load the dataset-creation config; the path is relative to the notebook's working directory.
with open("../dataset_creation_config.json", "r") as file:
    config = json.load(file)

# Passed to simplify_body when pages are simplified.
tags_to_include = set(config["tags"])
text_formatting_tags = set(config["text_formatting_tags"])
# Passed to get_attributes_values inside CustomMarkdownConverter.process_tag.
abbreviations = config["abbreviations"]
# Passed to textify_meta_tags as meta_acceptable_values.
meta_values = config["meta_values"]
# NOTE(review): neither of these two templates is referenced by any other cell visible here.
(
    available_tags_binary_dict,
    available_attributes_values_binary_dict,
) = get_binary_dicts_templates(config)

In [4]:
# md = create_md(heading_style=ATX, newline_style=BACKSLASH)

In [3]:
# Keywords handed to simplify_body as `class_id_to_exclude`.
# Presumably elements whose class/id contains one of these are removed as page
# chrome (navigation, checkout, cookie banners, ...) — confirm against simplify_body.
class_id_to_exclude = [
    "nav",
    "tooltip",
    "banner",
    "footer",
    "shipping",
    "popup",
    "checkout",
    "payments",
    "returns",
    "delivery",
    "support",
    "warranty",
    "privacy",
    "benefit",
    "sign-in",
    "sign-up",
    "login",
    "password",
    "add-to-cart",
    "announcement",
    "notification",
    "wishlist",
    "social",
    "json",
    "alert",
    "disclaimer",
    "contact",
    "cookies",
    "newsletter",
    "basket",
    "askforproduct",
    "ask-about-product",
    "mobile-header",
    "header-menu",
    "customer"
]

In [4]:
def join_tags(elems, tag_name):
    """Concatenate HTML fragments and wrap them in one ``tag_name`` element.

    Args:
        elems: iterable of ``(html_fragment, size)`` pairs; the size is ignored.
        tag_name: name used for the opening and closing wrapper tag.

    Returns:
        The string ``<tag_name>`` + all fragments in order + ``</tag_name>``.
    """
    inner = "".join(fragment for fragment, _size in elems)
    return f"<{tag_name}>{inner}</{tag_name}>"

def join_as_many_tags_as_possible(parts, max_size: int, tag_name="body"):
    """Greedily pack consecutive ``(html, size)`` parts into groups of at most ``max_size``.

    Parts are taken in order. A part joins the open group while the summed size
    stays within ``max_size``; otherwise the open group is closed and the part
    starts a new one. A part that is larger than ``max_size`` on its own still
    becomes a group by itself.

    Args:
        parts: iterable of ``(html_fragment, size)`` pairs.
        max_size: upper bound for the summed size of one group.
        tag_name: wrapper tag applied to every group via ``join_tags``.

    Returns:
        List of ``(wrapped_html, summed_size)`` tuples; empty when ``parts`` is empty.
        The summed size covers the parts only, not the wrapper tag.
    """
    groups = []
    pending = []
    pending_size = 0

    for fragment, fragment_size in parts:
        # Close the open group first if this fragment would push it over the limit.
        if pending and pending_size + fragment_size > max_size:
            groups.append((join_tags(pending, tag_name), pending_size))
            pending = []
            pending_size = 0

        pending.append((fragment, fragment_size))
        pending_size += fragment_size

    # Whatever is still open becomes the final group.
    if pending:
        groups.append((join_tags(pending, tag_name), pending_size))

    return groups

def divide_html_version(soup, transform_html_func, max_size):
    """Split an HTML element into pieces whose transformed text fits in ``max_size`` tokens.

    The element is rendered with ``transform_html_func`` and measured with the
    notebook-level ``tokenizer``. If it fits, it is returned whole. Otherwise
    every child is processed (Tags recursively, other nodes as raw strings) and
    neighbouring pieces are re-packed under ``soup.name`` by
    ``join_as_many_tags_as_possible``.

    Args:
        soup: a BeautifulSoup ``Tag`` (or soup object) to split.
        transform_html_func: callable turning an element into the text that is tokenized.
        max_size: token limit for one piece.

    Returns:
        A single ``(html, size)`` tuple when ``soup`` fits as a whole, otherwise
        a list of ``(html, size)`` tuples. A child that cannot be split any
        further (e.g. one long text node) may still exceed ``max_size``.
    """
    size = len(tokenizer(transform_html_func(soup)).input_ids)

    if size <= max_size:
        return str(soup), size

    parts = []

    for elem in soup.contents:
        if isinstance(elem, Tag):
            res = divide_html_version(soup=elem, transform_html_func=transform_html_func, max_size=max_size)
        else:
            # Non-Tag nodes are measured on their raw string, not through transform_html_func.
            raw = str(elem)
            res = (raw, len(tokenizer(raw).input_ids))

        # The recursive call returns a list only when the child itself had to be split.
        if isinstance(res, list):
            parts.extend(res)
        else:
            parts.append(res)

    return join_as_many_tags_as_possible(parts=parts, max_size=max_size, tag_name=soup.name)

In [7]:
def get_max_input_size(model_max_input_size: int, prompt_desc_size: int, header_md: str, tokenizer: T5TokenizerFast) -> int:
    """Return the token budget left for page content in one request.

    Args:
        model_max_input_size: total token budget for the request.
        prompt_desc_size: token count of the fixed instruction prompt.
        header_md: header text sent with every piece; tokenized here to get its size.
        tokenizer: tokenizer used to count the header's tokens.

    Returns:
        ``model_max_input_size`` minus the prompt size and the header's token count.
    """
    header_size = len(tokenizer(header_md).input_ids)
    reserved = prompt_desc_size + header_size
    return model_max_input_size - reserved

In [8]:
# prompt = """
# The text given below was extracted from a product website page. Extract the following information about the product from the text:
# - title - string
# - brand - string
# - current price - dictionary with the price as number and the currency as string
# - old price - number
# - product features - return only features describing the product itself as dictionary, don't include description
# - description - string
# - colors variants - list of strings
# - size variants - list of strings excluding dimensions
# - customer ratings - dictionary with average_rating and number of ratings of each rating only if such information is strictly provided
# - customer reviews - list of strings
# - product categories - list of strings, if not provided then always suggest your own, try to be as general as possible
# - similar or related products, or products recommended for me - as a list of strings named similar products
# Don't mistake similar products for customer reviews. Don't return any size tables information.
# Never guess, if there's no information return: null.
# Return only a proper JSON format using snake case for properties. The text:
# """

In [5]:
# Fixed instruction text for the extraction request; the wording ends with
# "The text:" so the page text is expected to follow it.
# Its token length is stored in prompt_desc_size.
prompt = """
The text given below was extracted from a product website page. Extract the following product information from the text:
- title - string
- brand - string
- current price - dictionary with the price as number and the currency as string
- old price - number
- product features - list of dictionaries, each dictionary should contain the feature and its value, do not include things provided in other points
- short description - string
- colors variants - list of strings
- size variants - list of strings excluding dimensions
- product categories - as list of strings, if not provided then always suggest your own, try to be as general as possible - must always be provided
- customer ratings - dictionary with average_rating and number of ratings of each rating only if such information is strictly provided
- customer reviews - list of strings
- similar or related products - list of strings
- products recommended for me - list of strings
Don't return any size table information.
Never guess, if there's no information return: null.
Return only a proper JSON format using snake case for properties. The text:
"""

In [9]:
# Token length of the fixed instruction prompt; get_max_input_size takes it as prompt_desc_size.
# Needs `tokenizer` and `prompt` to be defined already (run those cells first).
# The intermediate result is not bound to `input`, which would shadow the builtin input().
prompt_desc_size = len(tokenizer(prompt).input_ids)

NameError: name 'tokenizer' is not defined

In [None]:
# Display the prompt's token count.
prompt_desc_size

In [6]:
# Total token budget per request; get_max_input_size subtracts the prompt and header sizes from it.
# NOTE(review): no explicit allowance is made for the model's response tokens — confirm this is intended.
model_max_input_size = 16000

In [8]:
from data_gathering.data_extraction.html_features import get_attributes_values
from bs4 import Tag
from markdownify import MarkdownConverter

class CustomMarkdownConverter(MarkdownConverter):
    """MarkdownConverter that labels a tag's output with words derived from the tag.

    The words come from ``get_attributes_values(node, abbreviations)``, where
    ``abbreviations`` is the notebook-level config value.
    """

    def process_tag(self, node, convert_as_inline, children_only=False):
        """Convert ``node`` as the base class does, optionally prefixed by a label line.

        When ``get_attributes_values`` yields a non-empty label, the result is
        ``"\\n- <label>:\\n"`` followed by the base conversion; otherwise the
        base conversion is returned unchanged.
        """
        label = " ".join(get_attributes_values(node, abbreviations))
        converted = super().process_tag(node, convert_as_inline, children_only)

        if not label:
            return converted

        return f"\n- {label}:\n{converted}"

def create_md(**options):
    """Build a soup-to-markdown function backed by one ``CustomMarkdownConverter``.

    Args:
        **options: forwarded unchanged to the ``CustomMarkdownConverter`` constructor.

    Returns:
        A callable taking a soup/Tag and returning the converter's
        ``convert_soup`` result for it. The same converter instance is reused
        on every call.
    """
    md_converter = CustomMarkdownConverter(**options)

    def to_markdown(soup: Tag):
        return md_converter.convert_soup(soup)

    return to_markdown

In [9]:
# Shared converter for all pages, configured with markdownify's ATX heading style
# and BACKSLASH newline style.
md = create_md(heading_style=ATX, newline_style=BACKSLASH)

In [38]:
# Single-page run: read one saved page, simplify its <body> and render it as markdown.
# open() is called without an encoding, so the platform default is used.
with open("../web_pages/all_domains/pages/english_fashionnova_4646.html", "r") as file:
    html = file.read()

soup = BeautifulSoup(html, "html.parser")

# Only the <body> is processed here; <head> is not used by the active code in this cell.
simplified_soup_body = simplify_body(
    soup=soup.body,
    text_formatting_tags=text_formatting_tags,
    tags_to_include=tags_to_include,
    class_id_to_exclude=class_id_to_exclude,
)
# Render the simplified tree with the custom markdown converter.
simplified_body_text = md(simplified_soup_body)

# meta_tags_str = textify_meta_tags(soup=soup.head, meta_acceptable_values=meta_values)
# full_page_text_curr = f'{meta_tags_str}\n\n{simplified_body_text}'.strip()
#
# if not os.path.exists(f'results'):
#     os.makedirs(f'results')
#
# max_input_size = get_max_input_size(model_max_input_size, prompt_desc_size, meta_tags_str, tokenizer)
# print('max_input_size', max_input_size)
#
# temps = [0.6, 0.7]
# for temp in temps:
#     subdir_name = str(temp).replace('.', '_')
#     if not os.path.exists(f'results/{subdir_name}'):
#         os.makedirs(f'results/{subdir_name}')
#     #
#     # if f'results/{dir_name}/{subdir_name}/part_{i}.json' in os.listdir(f'results/{dir_name}/{subdir_name}'):
#     #     continue
#
#     print(f"Temperature: {temp}")
#
#     chat_gpt_prompt = f"{prompt}\n{full_page_text_curr}"
#
#     response = openai.ChatCompletion.create(
#       model="gpt-3.5-turbo-16k",
#       temperature=temp,
#       messages=[
#         {
#             "role": "user",
#             "content": chat_gpt_prompt
#         }
#       ]
#     )
#
#     # save the response to txt file
#     with open(f'results/{subdir_name}/part_0.json', "w") as f:
#         f.write(response['choices'][0]['message']['content'])
#
# # with open(f'simplified.txt', "w") as f:
# #     f.write(f'{meta_tags_str}\n{simplified_body_text}')

In [10]:
# Convert every saved page to its text form and write it to text_v/<page>/part_0.txt.
DIR_PATH = "../web_pages/all_domains/pages"

# Sorted so the processing order (and the printed log) is the same on every run.
files = sorted(os.listdir(DIR_PATH))

for file_name in files:
    print(f"File: {file_name}")

    # One output directory per page, e.g. "page.html" -> "text_v/page_html".
    dir_name = file_name.replace('.', '_')
    out_dir = f'text_v/{dir_name}'
    # exist_ok replaces the separate existence check and keeps re-runs safe.
    os.makedirs(out_dir, exist_ok=True)

    # Files are opened without an explicit encoding, so the platform default applies.
    with open(f"{DIR_PATH}/{file_name}", "r") as f:
        html = f.read()

    soup = BeautifulSoup(html, "html.parser")

    simplified_soup_body = simplify_body(
        soup=soup.body,
        text_formatting_tags=text_formatting_tags,
        tags_to_include=tags_to_include,
        class_id_to_exclude=class_id_to_exclude,
    )
    simplified_body_text = md(simplified_soup_body)

    meta_tags_str = textify_meta_tags(soup=soup.head, meta_acceptable_values=meta_values)

    # Meta-tag text first, then the body text, separated by a blank line.
    full_page_text_curr = f"{meta_tags_str}\n\n{simplified_body_text}".strip()

    # save to file
    with open(f'{out_dir}/part_0.txt', "w") as f:
        f.write(full_page_text_curr)

File: english_barnesandnoble_2011.html
File: english_barnesandnoble_2139.html
File: english_baublebar_112.html
File: english_baublebar_1262.html
File: english_charmingcharlie_3922.html
File: english_charmingcharlie_4756.html
File: english_colourpop_1227.html
File: english_colourpop_397.html
File: english_craftonlineusa_1222.html
File: english_craftonlineusa_4872.html
File: english_dunhamssports_3818.html
File: english_dunhamssports_3937.html
File: english_ebay_423.html
File: english_ebay_4298.html
File: english_estellecoloredglass_60.html
File: english_estellecoloredglass_69.html
File: english_fashionnova_4646.html
File: english_fashionnova_513.html
File: english_fnp_2182.html
File: english_fnp_4636.html
File: english_footlocker_3436.html
File: english_footlocker_912.html
File: english_hallmark_1163.html
File: english_hallmark_1578.html
File: english_lovinglane_3157.html
File: english_lovinglane_3251.html
File: english_marksandspencer_4081.html
File: english_marksandspencer_4352.html
F

In [15]:
# NOTE(review): attrs_words_counts is not assigned in any cell visible in this notebook,
# so this cell depends on leftover kernel state — restore the cell that builds it.
attrs_words_counts

{'page': 176,
 'pdp': 76,
 'responsive': 140,
 'bncom': 2,
 'hidden': 161,
 'browser': 2,
 'notice': 2,
 'outdated': 2,
 'ie': 2,
 'x': 120,
 'whole': 4,
 'bg': 12,
 'color': 81,
 'site': 22,
 'invisible': 2,
 'schema': 125,
 'container': 507,
 'product': 2341,
 'entity': 40,
 'badge': 167,
 'bn': 4,
 'added': 5,
 'size': 230,
 'main': 143,
 'music': 2,
 'book': 8,
 'http': 125,
 'org': 125,
 'content': 523,
 'section': 354,
 'placeholder': 5,
 'hero': 8,
 'sku': 71,
 'detail': 537,
 'base': 16,
 'pt': 16,
 'item': 1696,
 'wrap': 381,
 'flex': 108,
 'm': 170,
 'd': 168,
 'spacing': 14,
 'p': 80,
 'thumbnail': 23,
 'image': 296,
 'row': 301,
 'img': 41,
 'pr': 43,
 'column': 717,
 'lg': 55,
 'sm': 181,
 'pl': 6,
 'carousel': 113,
 'pre': 8,
 'init': 2,
 'slide': 89,
 'pa': 2,
 'cont': 59,
 'commerce': 4,
 'header': 493,
 's': 616,
 'sticky': 42,
 'zone': 24,
 'for': 23,
 'margin': 47,
 'left': 145,
 'right': 108,
 'spinner': 2,
 'promo': 13,
 'block': 195,
 'none': 59,
 'price': 881,
 '

In [16]:
import pandas as pd

# One row per key of attrs_words_counts (keys become the index), values in a single "count" column.
df = pd.DataFrame.from_dict(attrs_words_counts, orient='index', columns=['count'])

In [17]:
# Display the counts table in its original (insertion) order.
df

Unnamed: 0,count
page,176
pdp,76
responsive,140
bncom,2
hidden,161
...,...
intro,1
nothing,1
place,1
char,1


In [18]:
# Display the words ordered by count, highest first; df itself is not modified.
df.sort_values(by='count', ascending=False)

Unnamed: 0,count
product,2341
item,1696
ty,1019
menu,953
review,919
...,...
nightwear,1
disco,1
zero,1
combiner,1


## Names to include:
product, review, detail, title, info, label, rating, offer, size, description, opinion, value, feedback, star, main, option, answer, count, breadcrumb, category, additional, data, color, comment, score, summary, search, related, availability, quantity, similar, money, variant, feature, primary???, reviewer, type, average, basic, regular, specific, entity, currency, property, compare, aggregated, amount, question, specification, information, number, spec???, available, material, recommend, measurement

1. Alternative: instead of hand-picking words, keep any attribute word that is a valid English word and is at least 4 characters long.

In [19]:
# DIR_PATH = "../web_pages/all_domains/pages"
#
# files = os.listdir(DIR_PATH)
# files.sort()
#
# for i, file_name in enumerate(files):
#     print(f"File: {file_name}")
#
#     dir_name = file_name.replace('.', '_')
#     if not os.path.exists(f'results/{dir_name}'):
#         os.makedirs(f'results/{dir_name}')
#
#     with open(f"{DIR_PATH}/{file_name}", "r") as f:
#         html = f.read()
#
#     soup = BeautifulSoup(html, "html.parser")
#
#     simplified_soup_body = simplify_body(
#         soup=soup.body,
#         text_formatting_tags=text_formatting_tags,
#         tags_to_include=tags_to_include,
#         class_id_to_exclude=class_id_to_exclude,
#     )
#     simplified_body_text = md(simplified_soup_body)
#
#     meta_tags_str = textify_meta_tags(soup=soup.head, meta_acceptable_values=meta_values)
#
#     max_input_size = get_max_input_size(model_max_input_size, prompt_desc_size, meta_tags_str, tokenizer)
#
#     res = divide_html_version(simplified_soup_body, transform_html_func=md, max_size=max_input_size)
#
#     if isinstance(res, list):
#         pass
#     else:
#         res = [res]
#
#     for i, elem in enumerate(res):
#         print("Webpage part", i)
#         elem_soup = BeautifulSoup(elem[0], "html.parser")
#
#         simplified_elem_text = md(elem_soup)
#         # full_page_text_curr = f"{simplified_soup_head_text}\n{simplified_elem_text}"
#         full_page_text_curr = f"{meta_tags_str}\n\n{simplified_elem_text}".strip()
#
#         chat_gpt_prompt = f"{prompt}\n{full_page_text_curr}"
#
#         temps = [0.6, 0.7]
#         for temp in temps:
#             subdir_name = str(temp).replace('.', '_')
#             if not os.path.exists(f'results/{dir_name}/{subdir_name}'):
#                 os.makedirs(f'results/{dir_name}/{subdir_name}')
#
#             if f'results/{dir_name}/{subdir_name}/part_{i}.json' in os.listdir(f'results/{dir_name}/{subdir_name}'):
#                 continue
#
#             print(f"Temperature: {temp}")
#             response = openai.ChatCompletion.create(
#               model="gpt-3.5-turbo",
#               temperature=temp,
#               messages=[
#                 {
#                     "role": "user",
#                     "content": chat_gpt_prompt
#                 }
#               ]
#             )
#
#             # save the response to txt file
#             with open(f'results/{dir_name}/{subdir_name}/part_{i}.json', "w") as f:
#                 f.write(response['choices'][0]['message']['content'])


File: english_autoanything_1942.html
Webpage part 0
Temperature: 0.5
Temperature: 0.6
Temperature: 0.7
Temperature: 0.8
File: english_autoanything_2.html
Webpage part 0
Temperature: 0.5
Temperature: 0.6
Temperature: 0.7
Temperature: 0.8
Webpage part 1
Temperature: 0.5
Temperature: 0.6
Temperature: 0.7
Temperature: 0.8
File: english_barnesandnoble_2011.html
Webpage part 0
Temperature: 0.5
Temperature: 0.6
Temperature: 0.7
Temperature: 0.8
File: english_barnesandnoble_2139.html
Webpage part 0
Temperature: 0.5
Temperature: 0.6
Temperature: 0.7
Temperature: 0.8
File: english_baublebar_112.html
Webpage part 0
Temperature: 0.5


RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 08208626c6321bac1b23aee683b97899 in your message.)