In [1]:
import re

import openai
import os
import json

from markdownify import ATX, BACKSLASH
from transformers import T5TokenizerFast
from bs4 import BeautifulSoup, Tag

from data_gathering.data_extraction.utils import get_binary_dicts_templates
from data_gathering.data_extraction.tree_modification import simplify_body

In [2]:
from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment so the
# key never has to be written into the notebook itself.
load_dotenv()

# os.getenv returns None when OPEN_AI_API_KEY is not set; in that case
# openai.api_key is assigned None as well.
API_KEY = os.getenv('OPEN_AI_API_KEY')

openai.api_key = API_KEY

In [3]:
# Tokenizer used in this notebook to measure sizes in tokens (the prompt length
# and the HTML fragments handled by divide_html_version).
tokenizer = T5TokenizerFast.from_pretrained("google/flan-t5-base")

In [2]:
# Load the dataset-creation settings used by the simplification and conversion
# cells below.
# The encoding is passed explicitly so the read does not depend on the
# platform's locale default; JSON files are UTF-8 by specification.
with open("../dataset_creation_config.json", "r", encoding="utf-8") as file:
    config = json.load(file)

# Sets give constant-time membership checks for the tag names.
tags_to_include = set(config["tags"])
text_formatting_tags = set(config["text_formatting_tags"])
abbreviations = config["abbreviations"]
meta_values = config["meta_values"]
# get_binary_dicts_templates returns a pair of templates built from the config;
# the names below follow the order in which it returns them.
(
    available_tags_binary_dict,
    available_attributes_values_binary_dict,
) = get_binary_dicts_templates(config)

In [3]:
# Markers handed to simplify_body() as `class_id_to_exclude` in the page
# conversion cells. Presumably elements whose class/id contains one of these
# strings are dropped as non-product content — TODO confirm in tree_modification.
class_id_to_exclude = [
    "nav",
    "tooltip",
    "banner",
    "footer",
    "shipping",
    "popup",
    "checkout",
    "payments",
    "returns",
    "delivery",
    "support",
    "warranty",
    "privacy",
    "benefit",
    "sign-in",
    "sign-up",
    "login",
    "password",
    "add-to-cart",
    "announcement",
    "notification",
    "wishlist",
    "social",
    "json",
    "alert",
    "disclaimer",
    "contact",
    "cookies",
    "newsletter",
    "basket",
    "askforproduct",
    "ask-about-product",
    "mobile-header",
    "header-menu",
    "customer"
]

In [4]:
def join_tags(elems, tag_name):
    """Concatenate HTML fragments and wrap them in a single `tag_name` element.

    Args:
        elems: iterable of `(html_fragment, size)` pairs; only the fragment is
            used, the size is ignored here.
        tag_name: name of the wrapping element. The wrapper is emitted without
            any attributes.

    Returns:
        The string `<tag_name>…fragments in order…</tag_name>`.
    """
    inner = "".join(fragment for fragment, _ in elems)
    return f"<{tag_name}>{inner}</{tag_name}>"

def join_as_many_tags_as_possible(parts, max_size: int, tag_name="body"):
    """Greedily pack consecutive fragments into groups of at most `max_size`.

    Fragments are taken in order; a group is closed as soon as the next
    fragment would push its summed size above `max_size`. Each closed group is
    wrapped in `tag_name` via `join_tags`.

    Args:
        parts: iterable of `(html_fragment, size)` pairs.
        max_size: upper bound for the summed size of one group.
        tag_name: element name used to wrap every group.

    Returns:
        A list of `(wrapped_html, summed_size)` pairs. The size is the sum of
        the fragment sizes only; the wrapper tag itself is not counted. A
        fragment that is larger than `max_size` on its own is still emitted,
        alone in its group. An empty `parts` yields an empty list.
    """
    chunks = []
    pending = []
    pending_size = 0

    for part, size in parts:
        # Close the open group first if this fragment would overflow it.
        if pending and pending_size + size > max_size:
            chunks.append((join_tags(pending, tag_name), pending_size))
            pending, pending_size = [], 0

        pending.append((part, size))
        pending_size += size

    # Flush whatever is left in the last, still-open group.
    if pending:
        chunks.append((join_tags(pending, tag_name), pending_size))

    return chunks

def divide_html_version(soup, transform_html_func, max_size):
    """Split an HTML subtree into pieces of at most `max_size` tokens.

    Args:
        soup: node whose `.contents` and `.name` are read; children that are
            `Tag` instances are split recursively.
        transform_html_func: callable applied to `soup`; its result is what
            gets tokenized to measure the subtree.
        max_size: token budget for one piece.

    Returns:
        A single `(html, size)` tuple when the whole subtree fits, otherwise a
        list of `(html, size)` tuples produced by
        `join_as_many_tags_as_possible`, each wrapped in `soup.name`.

    Note:
        Sizes of `Tag` nodes are measured on the transformed text, while
        non-`Tag` children are measured on their raw string form.
    """
    token_count = len(tokenizer(transform_html_func(soup)).input_ids)

    # Base case: the whole subtree fits, so it is returned unsplit.
    if token_count <= max_size:
        return str(soup), token_count

    fragments = []

    for child in soup.contents:
        if isinstance(child, Tag):
            outcome = divide_html_version(soup=child, transform_html_func=transform_html_func, max_size=max_size)
        else:
            raw = str(child)
            outcome = (raw, len(tokenizer(raw).input_ids))

        # No return path in this function currently yields None; the guard is
        # kept so a None from a child still propagates unchanged.
        if outcome is None:
            return None

        # A list means the child was split further; a tuple means it fit.
        if isinstance(outcome, list):
            fragments.extend(outcome)
        else:
            fragments.append(outcome)

    return join_as_many_tags_as_possible(parts=fragments, max_size=max_size, tag_name=soup.name)

In [6]:
def get_max_input_size(model_max_input_size: int, prompt_desc_size: int, header_md: str, tokenizer: T5TokenizerFast) -> int:
    """Return the token budget left for page content.

    Args:
        model_max_input_size: total number of tokens available.
        prompt_desc_size: tokens already taken by the instruction prompt.
        header_md: header text whose token count is also reserved.
        tokenizer: callable whose result exposes `input_ids`; used to count the
            tokens of `header_md`.

    Returns:
        `model_max_input_size` minus the prompt and header token counts. The
        value is not clamped, so it can be negative.
    """
    header_size = len(tokenizer(header_md).input_ids)
    reserved = prompt_desc_size + header_size
    return model_max_input_size - reserved

In [13]:
# prompt = """
# The text given below was extracted from a product website page. Extract the following product information from the text:
# - title - a string
# - brand - a string
# - current price - a number
# - old price - a number
# - currency - a string
# - product features and specifications - returns all product features and specifications included in text, those could be for example but not necessarily materials, design, genres, model, labels and other things like that. Return a list of dictionaries containing present product features and specifications. Each dictionary should contain the feature name and its value (string or array), split features into separate points, do not provide a description
# - short description - a string as a meaningful short description of the product
# - available colors - a  list of strings
# - available size variants - a list of strings. Don't include any dimensions, quantity, and size table information information. If products come in only one universal size return word default.
# - product categories - a list of strings. Use categories from the text to come up with your own categories. If no information is provided in the text create your own categories. Try to be as general and also precise as possible.
# - customer ratings - a dictionary with the average rating and each separate rating with their number (if that information is strictly provided)
# - customer reviews - return all customer reviews (opinions) as a list of strings
# - similar or related products - a list of strings consisting of similar products
# - products recommended for me - a list of strings consisting of products recommended for me
# Don't return any size table information or delivery information.
# Never guess unless you're asked to, if there's no information return: null.
# Return only a proper JSON format using snake case for properties. The text:
# """

In [14]:
# Instruction text placed in front of the page text in the extraction request.
# It asks for a fixed set of product fields as snake_case JSON, with null for
# anything that is missing.
# NOTE(review): the wording contains small typos (e.g. "information information");
# they are left untouched because editing the string changes the model input
# and the token count derived from it.
prompt = """
The text given below was extracted from a product website page. Extract the following product information from the text:
- title - a string
- brand - a string
- current price - a number
- old price - a number
- currency - a string
- product features and specifications - returns all product features and specifications included in text and relevant to the product itself (describing it). Return a list of dictionaries containing present product features and specifications. Each dictionary should contain the feature name as "feature" and its value as "value". The important thing is that you must provide all the features and specifications you can find. Split features into separate points. Remember to never provide a description. Name this field "product_features".
- short description - a string as a meaningful short description of the product
- available colors - a  list of strings
- available size variants - a list of strings. Don't ever include any product dimensions, quantity, and size table information information. If products come in only one universal size return a word (string) default.
- product categories - a list of strings. Use categories from the text to come up with your own categories. If no information is provided in the text create your own categories. Try to be as general and precise as possible. Remember, this field must always be provided.
- customer ratings - a dictionary with the average rating and each separate rating with their number (if that information is strictly provided otherwise if each rating with a number is not provided return only the mean rating and overall number of ratings)
- customer reviews - return all customer reviews (opinions) as a list of strings
- similar or related products - a list of strings consisting of similar products
- products recommended for me - a list of strings consisting of products recommended for me
Don't return any size table information or delivery information. Return each number as a number, not a string.
Never guess unless you're asked to, if there's no information return: null.
Return only a proper JSON format using snake case for properties. The text:
"""

In [15]:
# Number of tokens taken by the instruction prompt alone; it is subtracted from
# the model budget when computing how much page text fits.
# The encoding is stored under its own name instead of `input`, which would
# shadow the builtin input() for every later cell in the kernel session.
prompt_encoding = tokenizer(prompt)
prompt_desc_size = len(prompt_encoding.input_ids)

NameError: name 'tokenizer' is not defined

In [16]:
# Show the prompt's token count computed in the previous cell.
prompt_desc_size

NameError: name 'prompt_desc_size' is not defined

In [17]:
# Token budget for a single request. NOTE(review): presumably the 16k context
# window of the "gpt-3.5-turbo-16k" model named in the commented-out request
# code further down — confirm. Sizes in this notebook are measured with the
# flan-t5 tokenizer, which may not match the OpenAI model's own token counts.
model_max_input_size = 16384

In [5]:
from transformers import pipeline

import torch

# Translation pipeline handed to the markdown converter and used for files
# flagged for translation to English.
# Run on the first GPU when CUDA is available and fall back to the CPU
# otherwise, so the cell no longer fails on machines without a GPU. The timing
# notes further down record roughly 76 s on CPU versus 25 s on GPU for one page.
translation_device = "cuda:0" if torch.cuda.is_available() else "cpu"
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-pl-en", device=translation_device)

In [6]:
import spacy

# Load the small Polish model with the listed components disabled, then switch
# on `senter` (spaCy's sentence segmenter). The pipe_names check in the next
# cell shows `senter` as the only active component.
nlp = spacy.load("pl_core_news_sm", disable=['tagger', 'ner', 'morphologizer', 'lemmatizer', 'attribute_ruler', 'parser', 'tok2vec'])
nlp.enable_pipe("senter")
# Earlier variant that kept the parser and tok2vec enabled:
# nlp = spacy.load("pl_core_news_sm", disable=['tagger', 'ner', 'morphologizer', 'lemmatizer', 'attribute_ruler'])

In [7]:
# Sanity check: list the components that remain active in the spaCy pipeline.
print(nlp.pipe_names)

['senter']


In [8]:
from data_gathering.data_extraction.html2markdown import HtmlAttrsAndTranslationMarkdownConverter

# Converter that turns the simplified HTML into text. It is handed the
# abbreviations and accepted meta values from the config, plus the translation
# pipeline and the spaCy pipeline created in the cells above.
# ATX / BACKSLASH are markdownify style constants — presumably forwarded to the
# underlying markdownify converter (TODO confirm in html2markdown).
markdown_converter = HtmlAttrsAndTranslationMarkdownConverter(
    abbreviations=abbreviations,
    meta_acceptable_values=meta_values,
    translation_pipeline=translator,
    nlp=nlp,
    heading_style=ATX,
    newline_style=BACKSLASH
)

In [9]:
from time import time

# Page to convert; the file-name prefix also decides whether it is translated.
file_name = "polish_trendhim_1602.html"

with open(f"../web_pages/all_domains/pages/{file_name}", "r") as file:
    html = file.read()

# The timer starts after the file has been read, so exec_time covers parsing,
# simplification and both textify calls, but not disk I/O.
start = time()

soup = BeautifulSoup(html, "html.parser")

# Reduce the <body> using the tag sets from the config and the exclusion list.
simplified_soup_body = simplify_body(
    soup=soup.body,
    text_formatting_tags=text_formatting_tags,
    tags_to_include=tags_to_include,
    class_id_to_exclude=class_id_to_exclude,
)

# Only files whose name starts with 'polish' are flagged for translation.
should_translate_to_english = file_name.startswith('polish')

simplified_body_text = markdown_converter.textify_body(simplified_soup_body, should_translate_to_english)

meta_tags_str = markdown_converter.textify_meta_tags(soup=soup, should_translate_to_english=should_translate_to_english)

# Final text: meta-tag text, a blank line, then the body text; surrounding
# whitespace is stripped.
full_page_text_curr = f'{meta_tags_str}\n\n{simplified_body_text}'.strip()

# Elapsed wall-clock time in seconds.
exec_time = time() - start

In [10]:
# Save the converted page text to "<file_name>.txt" in the current working
# directory.
# The text may contain non-ASCII characters, so the encoding is fixed to UTF-8
# instead of relying on the platform default, which can raise
# UnicodeEncodeError on some systems.
with open(f"{file_name}.txt", "w", encoding="utf-8") as file:
    file.write(full_page_text_curr)

In [11]:
# Seconds spent parsing, simplifying and converting the page (file read excluded).
exec_time

25.210716485977173

CPU time: 76.38324189186096 s
GPU time: 25.474854946136475 s

In [None]:
# NOTE(review): this cell repeats the page-conversion cell above statement for
# statement (same file, same calls); the commented-out code that follows it is
# the only part that differs. Consider turning the shared steps into a function.
from time import time

file_name = "polish_trendhim_1602.html"

with open(f"../web_pages/all_domains/pages/{file_name}", "r") as file:
    html = file.read()

# The timer starts after the file has been read, so exec_time excludes disk I/O.
start = time()

soup = BeautifulSoup(html, "html.parser")

simplified_soup_body = simplify_body(
    soup=soup.body,
    text_formatting_tags=text_formatting_tags,
    tags_to_include=tags_to_include,
    class_id_to_exclude=class_id_to_exclude,
)

# Only files whose name starts with 'polish' are flagged for translation.
should_translate_to_english = file_name.startswith('polish')

simplified_body_text = markdown_converter.textify_body(simplified_soup_body, should_translate_to_english)

meta_tags_str = markdown_converter.textify_meta_tags(soup=soup, should_translate_to_english=should_translate_to_english)
# Final text: meta-tag text, a blank line, then the body text, stripped.
full_page_text_curr = f'{meta_tags_str}\n\n{simplified_body_text}'.strip()

# Elapsed wall-clock time in seconds.
exec_time = time() - start

# with open(f"{file_name}.txt", "w") as file:
#     file.write(full_page_text_curr)

# if not os.path.exists(f'results'):
#     os.makedirs(f'results')
# 
# max_input_size = get_max_input_size(model_max_input_size, prompt_desc_size, meta_tags_str, tokenizer)
# print('max_input_size', max_input_size)

# temps = [0.7]
# for temp in temps:
#     subdir_name = str(temp).replace('.', '_')
#     if not os.path.exists(f'results/{subdir_name}'):
#         os.makedirs(f'results/{subdir_name}')
#     
#     # if f'results/{dir_name}/{subdir_name}/part_{i}.json' in os.listdir(f'results/{dir_name}/{subdir_name}'):
#     #     continue
# 
#     print(f"Temperature: {temp}")
# 
#     chat_gpt_prompt = f"{prompt}\n{full_page_text_curr}"
#     print(chat_gpt_prompt)
# 
#     response = openai.ChatCompletion.create(
#       model="gpt-3.5-turbo-16k",
#       temperature=temp,
#       messages=[
#         {
#             "role": "user",
#             "content": chat_gpt_prompt
#         }
#       ]
#     )
# 
#     # save the response to txt file
#     with open(f'result.json', "w") as f:
#         f.write(response['choices'][0]['message']['content'])

# with open(f'simplified.txt', "w") as f:
#     f.write(f'{meta_tags_str}\n{simplified_body_text}')


## Collect all feature names (keys) and count their occurrences

In [19]:
# Top-level keys under which a result file may store its feature list, in the
# order they are tried (the prompt asks for "product_features"; the two longer
# spellings are accepted as well).
FEATURE_LIST_KEYS = (
    'product_features_and_specifications',
    'product_features_specifications',
    'product_features',
)
# Keys under which a single feature entry may store its name, in lookup order.
FEATURE_NAME_KEYS = ('feature', 'feature_name', 'name')

# Maps feature name -> number of occurrences across all result files.
all_features = {}

for name in os.listdir('results'):
    # Explicit UTF-8 so non-ASCII feature names decode the same on every platform.
    with open(f'results/{name}/0_7/part_0.json', "r", encoding="utf-8") as file:
        data = json.load(file)

    list_key = next((candidate for candidate in FEATURE_LIST_KEYS if candidate in data), None)
    if list_key is None:
        # Report the file and move on; indexing a missing key here would raise
        # KeyError and abort the whole scan.
        print(f'features has some other name for {name}')
        continue

    # The prompt allows null for missing information, so the list may be None.
    features = data[list_key] or []

    for elem in features:
        name_key = next((candidate for candidate in FEATURE_NAME_KEYS if candidate in elem), None)

        if name_key is not None:
            key = elem[name_key]
        elif elem:
            # Entry shaped like {"<feature name>": "<value>"}: show it and use
            # its first key as the feature name.
            print(elem)
            key = next(iter(elem))
        else:
            # Empty entry: there is nothing to count.
            continue

        all_features[key] = all_features.get(key, 0) + 1

{'Earring Type': 'Drop Earrings'}
{'Fine or Fashion': 'Fashion'}
{'Item Type': 'Earrings'}
{'Style': 'Trendy'}
{'Metals Type': 'Copper'}
{'Gender': 'Women'}
{'Material': 'Cubic Zirconia'}
{'Marka': 'Obsessive'}
{'Kod producenta': '5901688221129'}
{'Rozmiar': 'L/XL'}
{'Kolor': 'Bordowy'}
{'Nazwa dostawcy': 'Miamor figi kolor: bordowy L/XL'}
{'Okazje': 'Dzień Kobiet'}
{'Długość towaru w centymetrach': '15'}
{'Szerokość towaru w centymetrach': '10'}
{'Wysokość towaru w centymetrach': '7'}
{'Suggested Age': '22 Years and Up'}
{'Number of Pages': 320}
{'Format': 'Hardcover'}
{'Genre': 'Medical'}
{'Sub-Genre': 'History'}
{'Publisher': 'MIT Press'}
{'Author': 'Mikkael A Sekeres'}
{'Language': 'English'}
{'Street Date': 'September 27, 2022'}
{'TCIN': '86094342'}
{'UPC': '9780262047319'}
{'Item Number (DPCI)': '247-19-1506'}
{'Origin': 'Made in the USA or Imported'}
{'Release Date': '06/23/2015'}
{'Label': 'Prophecy'}
{'UPC': '0884388716315'}
{'catalogNumber': '163'}
{'Rank': 76940}
{'wzór_domi

In [21]:
import pandas as pd

# One row per feature name (the index) with its occurrence count, ordered from
# the most to the least frequent.
df = pd.DataFrame.from_dict(all_features, orient='index', columns=['count']).sort_values(by='count', ascending=False)

In [22]:
# Display the feature-name frequency table (most frequent first).
df

Unnamed: 0,count
Material,13
category,11
Materiał,10
Color,10
Kolor,8
...,...
Mattress size,1
Bed size,1
Bag size,1
Crafted from,1
