In [1]:
import re

import openai
import os
import json

import pandas as pd
from markdownify import ATX, BACKSLASH
from transformers import T5TokenizerFast
from bs4 import BeautifulSoup, Tag

from data_gathering.data_extraction.utils import get_binary_dicts_templates
from data_gathering.data_extraction.tree_modification import simplify_body, create_meta_tags_texifier
from data_gathering.data_extraction.html2markdown import create_md

In [2]:
from dotenv import load_dotenv

load_dotenv()

API_KEY = os.getenv('OPEN_AI_API_KEY')

openai.api_key = API_KEY

In [3]:
tokenizer = T5TokenizerFast.from_pretrained("google/flan-t5-base")

In [4]:
with open("../dataset_creation_config.json", "r") as file:
    config = json.load(file)

tags_to_include = set(config["tags"])
text_formatting_tags = set(config["text_formatting_tags"])
abbreviations = config["abbreviations"]
meta_values = config["meta_values"]
(
    available_tags_binary_dict,
    available_attributes_values_binary_dict,
) = get_binary_dicts_templates(config)

In [5]:
class_id_to_exclude = [
    "nav",
    "tooltip",
    "banner",
    "footer",
    "shipping",
    "popup",
    "checkout",
    "payments",
    "returns",
    "delivery",
    "support",
    "warranty",
    "privacy",
    "benefit",
    "sign-in",
    "sign-up",
    "login",
    "password",
    "add-to-cart",
    "announcement",
    "notification",
    "wishlist",
    "social",
    "json",
    "alert",
    "disclaimer",
    "contact",
    "cookies",
    "newsletter",
    "basket",
    "askforproduct",
    "ask-about-product",
    "mobile-header",
    "header-menu",
    "customer"
]

In [6]:
def join_tags(elems, tag_name):
    html = ""

    for partial_html, _ in elems:
        html += partial_html

    html = f"<{tag_name}>{html}</{tag_name}>"

    return html

def join_as_many_tags_as_possible(parts, max_size: int, tag_name="body"):
    final = []
    temp = []
    curr_len = 0

    for part, size in parts:
        if temp:
            if curr_len + size <= max_size:
                temp.append((part, size))
                curr_len += size
            else:
                final.append((join_tags(temp, tag_name), curr_len))
                temp = [(part, size)]
                curr_len = size
        else:
            temp = [(part, size)]
            curr_len = size

    if temp:
        final.append((join_tags(temp, tag_name), curr_len))

    return final

def divide_html_version(soup, transform_html_func, max_size):
    text = tokenizer(transform_html_func(soup)).input_ids
    size = len(text)

    if size <= max_size:
        return str(soup), size

    parts = []

    for elem in soup.contents:
        res = divide_html_version(soup=elem, transform_html_func=transform_html_func, max_size=max_size) if isinstance(elem, Tag) else (str(elem), len(tokenizer(str(elem)).input_ids))

        if res is None:
            return None
        elif isinstance(res, list):
            parts += res
        else:
            parts.append(res)

    return join_as_many_tags_as_possible(parts=parts, max_size=max_size, tag_name=soup.name)

In [7]:
def get_max_input_size(model_max_input_size: int, prompt_desc_size: int, header_md: str, tokenizer: T5TokenizerFast) -> int:
    return model_max_input_size - prompt_desc_size - len(tokenizer(header_md).input_ids)

In [8]:
# prompt = """
# The text given below was extracted from a product website page. Extract the following product information from the text:
# - title - a string
# - brand - a string
# - current price - a number
# - old price - a number
# - currency - a string
# - product features and specifications - returns all product features and specifications included in text, those could be for example but not necessarily materials, design, genres, model, labels and other things like that. Return a list of dictionaries containing present product features and specifications. Each dictionary should contain the feature name and its value (string or array), split features into separate points, do not provide a description
# - short description - a string as a meaningful short description of the product
# - available colors - a  list of strings
# - available size variants - a list of strings. Don't include any dimensions, quantity, and size table information information. If products come in only one universal size return word default.
# - product categories - a list of strings. Use categories from the text to come up with your own categories. If no information is provided in the text create your own categories. Try to be as general and also precise as possible.
# - customer ratings - a dictionary with the average rating and each separate rating with their number (if that information is strictly provided)
# - customer reviews - return all customer reviews (opinions) as a list of strings
# - similar or related products - a list of strings consisting of similar products
# - products recommended for me - a list of strings consisting of products recommended for me
# Don't return any size table information or delivery information.
# Never guess unless you're asked to, if there's no information return: null.
# Return only a proper JSON format using snake case for properties. The text:
# """

In [9]:
prompt = """
The text given below was extracted from a product website page. Extract the following product information from the text:
- title - a string
- brand - a string
- current price - a number
- old price - a number
- currency - a string
- product features and specifications - returns all product features and specifications included in text and relevant to the product itself (describing it). Return a list of dictionaries containing present product features and specifications. Each dictionary should contain the feature name as "feature" and its value as "value". The important thing is that you must provide all the features and specifications you can find. Split features into separate points. Remember to never provide a description. Name this field "product_features".
- short description - a string as a meaningful short description of the product
- available colors - a  list of strings
- available size variants - a list of strings. Don't ever include any product dimensions, quantity, and size table information information. If products come in only one universal size return a word (string) default.
- product categories - a list of strings. Use categories from the text to come up with your own categories. If no information is provided in the text create your own categories. Try to be as general and precise as possible. Remember, this field must always be provided.
- customer ratings - a dictionary with the average rating and each separate rating with their number (if that information is strictly provided otherwise if each rating with a number is not provided return only the mean rating and overall number of ratings)
- customer reviews - return all customer reviews (opinions) as a list of strings
- similar or related products - a list of strings consisting of similar products
- products recommended for me - a list of strings consisting of products recommended for me
Don't return any size table information or delivery information. Return each number as a number, not a string.
Never guess unless you're asked to, if there's no information return: null.
Return only a proper JSON format using snake case for properties. The text:
"""

In [10]:
input = tokenizer(prompt)
prompt_desc_size = len(input.input_ids)

In [11]:
prompt_desc_size

474

In [12]:
model_max_input_size = 16384

In [13]:
# from transformers import pipeline
# 
# translator = pipeline("translation", model="Helsinki-NLP/opus-mt-pl-en")

In [2]:
from transformers import pipeline, Pipeline, MarianMTModel, AutoTokenizer

src = "pl"  # source language
trg = "en"  # target language

model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"
model = MarianMTModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# translator = pipeline("translation", model="Helsinki-NLP/opus-mt-pl-en")

In [12]:
def translate(text):
    batch = tokenizer(text, return_tensors="pt", padding=True)
    generated_ids = model.generate(**batch, max_length=512, num_beams=1)
    print(generated_ids)
    return tokenizer.batch_decode(generated_ids)

In [13]:
import regex
unicode_letters_re = regex.compile(r'\p{L}+')

In [14]:
from nltk.tokenize import sent_tokenize

In [15]:
text = "Życie nigdy się nie kończy – przygotuj się zatem na ciąg dalszy. Zasilany twoją energią zegarek z widocznym mechanizmem Mads Dante dopasuje się do ciebie, tempo do tempa. Zrób dziś to, czego inni nie zrobią. Dzięki temu jutro będziesz mógł zrobić to, czego inni nie mogą. Dzięki szkieletowej tarczy i wystawowemu dekielkowi będziesz mieć miejsce w pierwszym rzędzie w samym sercu zegarka. Dzięki wytrzymałemu kryształowi szafiru nie musisz się martwić zadrapaniami. Ten męski zegarek wykorzystuje energię poprzez naturalny ruch twojego nadgarstka lub można go ręcznie nakręcić. Oznacza to, że już nigdy nie kupisz (i będziesz miał trudności z włożeniem) baterii do zegarka… pozostawiając sobie więcej czasu na wykorzystanie tej chwili. Wykonany z odpornej na rdzę stali nierdzewnej w złocistym odcieniu z pasującą czarno-złocistą bransoletą ze stali nierdzewnej. Mechanizm jest w kolorze szarym i chroniony mocnym szafirowym kryształem. Zaprojektowany w Danii i wysyłany w charakterystycznym pudełku upominkowym."

In [17]:
translate(text)

tensor([[63429,  7157,   522, 10126,    15,     0]])


['<pad> Life never ends </s>']

In [206]:
sent_tokenize(text)

['Życie nigdy się nie kończy – przygotuj się zatem na ciąg dalszy.',
 'Zasilany twoją energią zegarek z widocznym mechanizmem Mads Dante dopasuje się do ciebie, tempo do tempa.',
 'Zrób dziś to, czego inni nie zrobią.',
 'Dzięki temu jutro będziesz mógł zrobić to, czego inni nie mogą.',
 'Dzięki szkieletowej tarczy i wystawowemu dekielkowi będziesz mieć miejsce w pierwszym rzędzie w samym sercu zegarka.',
 'Dzięki wytrzymałemu kryształowi szafiru nie musisz się martwić zadrapaniami.',
 'Ten męski zegarek wykorzystuje energię poprzez naturalny ruch twojego nadgarstka lub można go ręcznie nakręcić.',
 'Oznacza to, że już nigdy nie kupisz (i będziesz miał trudności z włożeniem) baterii do zegarka… pozostawiając sobie więcej czasu na wykorzystanie tej chwili.',
 'Wykonany z odpornej na rdzę stali nierdzewnej w złocistym odcieniu z pasującą czarno-złocistą bransoletą ze stali nierdzewnej.',
 'Mechanizm jest w kolorze szarym i chroniony mocnym szafirowym kryształem.',
 'Zaprojektowany w Dani

In [207]:
translator([text], max_length=512, num_beams=2)

[{'translation_text': 'Life will never end, therefore, prepare yourself for the continuation. With your energy-powered watch with the visible Mads Dante mechanism, you will fit into your pace. Do what others will not do today. This will allow you to do what others cannot do tomorrow. Thanks to your skeletal shield and display decelebrity you will have a place in the first row in the heart of the watch. Thanks to the durable sapphire crystal you do not have to worry about scratching. This male watch uses energy through the natural movement of your wrist or can manually film it. This means that you will never buy (and you will have difficulty putting) a battery into your watch... leaving yourself more time to use this moment. Made of rust-resistant stainless steel in a golden shade with a matching black-golden stainless steel bracelet. The mechanism is in grey color and protected by a strong sapphire crystal. Designed in Denmark and sent in a natural gift box.'}]

In [199]:
from typing import Callable, Dict, List

from bs4 import Tag
from markdownify import MarkdownConverter
from transformers import Pipeline
import regex

from data_gathering.data_extraction.html_features import get_attributes_values

class TranslationBatchingException(Exception):
    pass

class MissingTranslationTextException(Exception):
    pass

class HtmlAttrsAndTranslationMarkdownConverter(MarkdownConverter):
    def __init__(
        self,
        abbreviations: Dict[str, str],
        translation_pipeline: Pipeline,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.abbreviations = abbreviations
        self.translation_pipeline = translation_pipeline
        self.should_translate_to_english = False
        self.text_lines_to_translate = []
    
    def __call__(self, soup: Tag, should_translate_to_english: bool = False):
        self._reset(should_translate_to_english)
        
        result = self.convert_soup(soup)
    
        if not self.should_translate_to_english:
            return result
        
        if not self.text_lines_to_translate:
            raise MissingTranslationTextException('No text to translate')

        translated_text = self.translation_pipeline(self.text_lines_to_translate, max_length=512, num_beams=1)
        
        # for elem in zip(self.text_lines_to_translate, translated_text):
        #     # result = result.replace('{}', elem[1], 1)
        #     print(elem[0])
        #     print(elem[1]['translation_text'])
        #     print()
        #     print('-'*30)
        #     print()
        
        return result.format(*self._extract_all_translations(translated_text))
    
    def _reset(self, should_translate_to_english: bool = False):
        self.should_translate_to_english = should_translate_to_english
        self.text_lines_to_translate = []
    
    def _extract_all_translations(self, translations: List[Dict[str, str]]) -> List[str]:
        return [elem['translation_text'] for elem in translations]
    
    def _batch_translation_text(self) -> List[str]:
        """
        From the list of text lines to translate, create batches of text lines to translate.
        Add as much text as you can to the batch, but don't exceed the max input size.
        Each text line is split by ` [TS] ` (translation separator) token and spaces around it.
        """
        batch = []
        
        for line in self.text_lines_to_translate:                
            if len(batch) == 0:
                text_ids = self.tokenizer(line, return_tensors="pt")['input_ids']
                batch.append([line, text_ids.shape[1]])
            else:
                text = ' <T> ' + line
                text_ids = self.tokenizer(text, return_tensors="pt")['input_ids']
                text_len = text_ids.shape[1]
                
                if text_len + batch[-1][1] > self.translation_model.config.max_length:
                    batch.append([line, text_len])
                else:
                    batch[-1][0] += text
                    batch[-1][1] += text_len
                    
        return [elem[0] for elem in batch]

    def convert_soup(self, soup: Tag) -> str:
        return self.process_tag(
            soup,
            convert_as_inline=False,
            children_only=True,
        )

    def process_tag(self, node: Tag, convert_as_inline: bool, children_only: bool = False) -> str:
        attrs_words = get_attributes_values(node, self.abbreviations)
        start = " ".join(attrs_words)

        text = super().process_tag(node, convert_as_inline, children_only)

        if start:
            return f"\n- {start}:\n{text}"
        
        return text

    def process_text(self, el: Tag):
        text = super().process_text(el)
        
        if self.should_translate_to_english and text and bool(regex.findall(unicode_letters_re, text.strip())):
            self.text_lines_to_translate.append(text)
            return '{}'
        
        return text.strip()
    

In [200]:
md = HtmlAttrsAndTranslationMarkdownConverter(
    abbreviations=abbreviations,
    translation_pipeline=translator,
    heading_style=ATX,
    newline_style=BACKSLASH
)

In [201]:
# textify_meta_tags = create_meta_tags_texifier(
#     meta_acceptable_values=meta_values,
#     translator=translator,
#     extract_translation=extract_translation,
# )

In [202]:
file_name = "polish_trendhim_1602.html"

with open(f"../web_pages/all_domains/pages/{file_name}", "r") as file:
    html = file.read()

soup = BeautifulSoup(html, "html.parser")

simplified_soup_body = simplify_body(
    soup=soup.body,
    text_formatting_tags=text_formatting_tags,
    tags_to_include=tags_to_include,
    class_id_to_exclude=class_id_to_exclude,
)

should_translate_to_english = file_name.startswith('polish')

simplified_body_text = md(simplified_soup_body, should_translate_to_english)

In [203]:
simplified_body_text

'\n- full store view product page:\n\n- wrapper main currency page:\n\n- header:\n\n- main header wrap:\n\n- main header:\n\n- main header wrap:\n\n- main header wrap:\n\n- text icon header:\nWishlist\n- include wrap cart header:\n\n- text icon header:\nBasket\n- wrapper main content:\n\n- main content:\n\n- main:\n\n- breadcrumb_list wrap:\n\n- wrap list element list_item item:\n\n- name:\nWatches\n- last crumb list wrap element list_item item:\n\n- line split:\n/\n- name:\nHand watches\n- gallery wrap:\n\n- gallery:\n\n- gallery wrap:\n\n- gallery list:\n\n- content gallery list:\n\n- gallery content list slider thumb:\n\n- content list slider gallery item thumb video:\n\n- icon outer wrap:\nFilm\n- gallery bottom:\n\n- hidden gallery good full:\n\n- gallery full:\n\n- zoom reset:\nZoom in\n- gallery bottom:\n\n- wrap:\n\n- wrap brand:\n\n- price wrap title:\n\n- main product title:\n# Mads Dante I ed.\n\n\n- price wrap product title:\n\n- price wrap:\n\n- price product piece:\n999 z

In [204]:
with open(f'simplified.txt', "w") as f:
    f.write(simplified_body_text)

In [None]:

file_name = "polish_trendhim_1602.html"

with open(f"../web_pages/all_domains/pages/{file_name}", "r") as file:
    html = file.read()

soup = BeautifulSoup(html, "html.parser")

simplified_soup_body = simplify_body(
    soup=soup.body,
    text_formatting_tags=text_formatting_tags,
    tags_to_include=tags_to_include,
    class_id_to_exclude=class_id_to_exclude,
)

should_translate_to_english = file_name.startswith('polish')

simplified_body_text = md(simplified_soup_body, should_translate_to_english)

# meta_tags_str = textify_meta_tags(soup=soup)
# full_page_text_curr = f'{meta_tags_str}\n\n{simplified_body_text}'.strip()

# with open(f"{file_name}.txt", "w") as file:
#     file.write(full_page_text_curr)
# 
# if not os.path.exists(f'results'):
#     os.makedirs(f'results')
# 
# max_input_size = get_max_input_size(model_max_input_size, prompt_desc_size, meta_tags_str, tokenizer)
# print('max_input_size', max_input_size)

# temps = [0.7]
# for temp in temps:
#     subdir_name = str(temp).replace('.', '_')
#     if not os.path.exists(f'results/{subdir_name}'):
#         os.makedirs(f'results/{subdir_name}')
#     
#     # if f'results/{dir_name}/{subdir_name}/part_{i}.json' in os.listdir(f'results/{dir_name}/{subdir_name}'):
#     #     continue
# 
#     print(f"Temperature: {temp}")
# 
#     chat_gpt_prompt = f"{prompt}\n{full_page_text_curr}"
#     print(chat_gpt_prompt)
# 
#     response = openai.ChatCompletion.create(
#       model="gpt-3.5-turbo-16k",
#       temperature=temp,
#       messages=[
#         {
#             "role": "user",
#             "content": chat_gpt_prompt
#         }
#       ]
#     )
# 
#     # save the response to txt file
#     with open(f'result.json', "w") as f:
#         f.write(response['choices'][0]['message']['content'])

# with open(f'simplified.txt', "w") as f:
#     f.write(f'{meta_tags_str}\n{simplified_body_text}')


## Get all features keys/names etc.

In [19]:
all_features = {}

for name in os.listdir('results'):
    with open(f'results/{name}/0_7/part_0.json', "r") as file:
        data = json.load(file)
    
    if 'product_features_and_specifications' not in data and "product_features_specifications" not in data and 'product_features' not in data:
        print(f'features has some other name for {name}')
        
    features = data['product_features_and_specifications'] if 'product_features_and_specifications' in data else (data['product_features_specifications'] if 'product_features_specifications' in data else data['product_features'])
    
    for elem in features:
        if 'feature' in elem:
            key = elem['feature']
        elif 'feature_name' in elem:
            key = elem['feature_name']
        elif 'name' in elem:
            key = elem['name']
        else:
            print(elem)
            key = list(elem.keys())[0]
            
        all_features[key] = all_features.get(key, 0) + 1

{'Earring Type': 'Drop Earrings'}
{'Fine or Fashion': 'Fashion'}
{'Item Type': 'Earrings'}
{'Style': 'Trendy'}
{'Metals Type': 'Copper'}
{'Gender': 'Women'}
{'Material': 'Cubic Zirconia'}
{'Marka': 'Obsessive'}
{'Kod producenta': '5901688221129'}
{'Rozmiar': 'L/XL'}
{'Kolor': 'Bordowy'}
{'Nazwa dostawcy': 'Miamor figi kolor: bordowy L/XL'}
{'Okazje': 'Dzień Kobiet'}
{'Długość towaru w centymetrach': '15'}
{'Szerokość towaru w centymetrach': '10'}
{'Wysokość towaru w centymetrach': '7'}
{'Suggested Age': '22 Years and Up'}
{'Number of Pages': 320}
{'Format': 'Hardcover'}
{'Genre': 'Medical'}
{'Sub-Genre': 'History'}
{'Publisher': 'MIT Press'}
{'Author': 'Mikkael A Sekeres'}
{'Language': 'English'}
{'Street Date': 'September 27, 2022'}
{'TCIN': '86094342'}
{'UPC': '9780262047319'}
{'Item Number (DPCI)': '247-19-1506'}
{'Origin': 'Made in the USA or Imported'}
{'Release Date': '06/23/2015'}
{'Label': 'Prophecy'}
{'UPC': '0884388716315'}
{'catalogNumber': '163'}
{'Rank': 76940}
{'wzór_domi

In [21]:
import pandas as pd

df = pd.DataFrame.from_dict(all_features, orient='index', columns=['count']).sort_values(by='count', ascending=False)

In [22]:
df

Unnamed: 0,count
Material,13
category,11
Materiał,10
Color,10
Kolor,8
...,...
Mattress size,1
Bed size,1
Bag size,1
Crafted from,1


In [53]:
from transformers import MarianTokenizer

In [54]:
from transformers import MarianTokenizer

src = "pl"  # source language
trg = "en"  # target language

model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"

tokenizer = MarianTokenizer.from_pretrained(model_name)

In [55]:
tok = MarianTokenizer.from_pretrained(model_name)

In [59]:
tok.model_max_length

512