In [1]:
import os
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options

In [62]:
# Product page opened by the crawler. The literal is split at the path and at
# each query parameter purely for readability; adjacent string literals are
# concatenated by Python, so the value is one single URL.
BASE_URL = (
    "https://www.amazon.com.br/Tablet-Samsung-Galaxy-Wi-Fi-Octa-Core/dp/B08YLDRSYC"
    "/ref=gbps_tit_m-4_19e9_c427f22e"
    "?smid=A1ZZFT5FULY4LN"
    "&pf_rd_p=cdc5728b-732b-443b-88ba-bfcb09ed19e9"
    "&pf_rd_s=merchandised-search-4"
    "&pf_rd_t=101"
    "&pf_rd_i=16209062011"
    "&pf_rd_m=A1ZZFT5FULY4LN"
    "&pf_rd_r=EPK2XTK4Y6ZKF9B9BJ49"
)

In [63]:
class Crawler:
    """Owns a Selenium Chrome driver together with its launch options.

    Intended call order: construct, optionally ``set_driver_options``, then
    ``start_crawler``, ``navigate_to_url`` as needed, and finally ``quit``.
    """

    def __init__(self, driver_path: str) -> None:
        """Remember where chromedriver lives; no browser is launched here.

        Args:
            driver_path: Path to the chromedriver executable.
        """
        self.DRIVER_PATH: str = driver_path
        self.driver_options: Options = Options()
        # None until start_crawler() has run, and None again after quit().
        self.driver: Optional[webdriver.Chrome] = None

    def set_driver_options(self, opts: List[str]):
        """Add every argument in ``opts`` to the Chrome options.

        The options are only read when the driver is created, so call this
        before ``start_crawler``.
        """
        for opt in opts:
            self.driver_options.add_argument(opt)

    def start_crawler(self, implicit_wait_time: int):
        """Launch Chrome, apply the implicit wait and maximize the window.

        Args:
            implicit_wait_time: Value handed to the driver's ``implicitly_wait``.
        """
        # Re-running this (easy to do in a notebook) would otherwise overwrite
        # the reference to a live browser and leave its process running.
        self.quit()
        self.driver = self._start_driver()
        self._set_driver_implicit_wait(implicit_wait_time)
        self._maximize_driver_window()

    def _start_driver(self) -> "webdriver.Chrome":
        """Create the Chrome driver from the stored path and options."""
        return webdriver.Chrome(executable_path=self.DRIVER_PATH, options=self.driver_options)

    def _set_driver_implicit_wait(self, wait_time: int):
        """Forward ``wait_time`` to the driver's ``implicitly_wait``."""
        self.driver.implicitly_wait(wait_time)

    def _maximize_driver_window(self):
        """Maximize the browser window."""
        self.driver.maximize_window()

    def navigate_to_url(self, url: str):
        """Load ``url`` in the running browser (requires ``start_crawler`` first)."""
        self.driver.get(url)

    def quit(self):
        """Shut the browser down if one is running; safe to call repeatedly."""
        # Drop the reference first so the crawler is left in the "not started"
        # state even if the driver's own quit() raises.
        driver, self.driver = self.driver, None
        if driver is not None:
            driver.quit()
        


In [30]:
class HtmlDataExtractor:
    """Reads price and discount text from the page loaded in a ``Crawler``.

    The public ``get_*`` methods are best-effort: a missing element or text
    that cannot be parsed yields ``None`` instead of an exception.
    """

    # Locator strategy strings understood by ``WebDriver.find_element``; they
    # are the values of selenium's ``By.XPATH`` and ``By.CLASS_NAME``. Going
    # through ``find_element(by, value)`` avoids the ``find_element_by_*``
    # helpers, which newer Selenium releases no longer provide.
    _BY_XPATH = "xpath"
    _BY_CLASS_NAME = "class name"

    def __init__(self, crawler):
        """Keep a reference to the crawler whose ``driver`` is queried."""
        self.crawler: Crawler = crawler

    def find_price_identifier(
        self, identifier_map: Dict[str, List[str]]
    ) -> Tuple[Optional[str], Optional[str]]:
        """Return the first locator in ``identifier_map`` that matches the page.

        Args:
            identifier_map: Maps a locator kind (``"xpath"`` or ``"class"``) to
                the candidate locators of that kind, tried in order. Entries
                under any other key are ignored.

        Returns:
            ``(kind, locator)`` for the first candidate found on the page, or
            ``(None, None)`` when nothing matches.
        """
        strategies = {"xpath": self._BY_XPATH, "class": self._BY_CLASS_NAME}

        for key, identifiers in identifier_map.items():
            strategy = strategies.get(key)
            if strategy is None:
                continue

            for identifier in identifiers:
                try:
                    self.crawler.driver.find_element(strategy, identifier)
                except NoSuchElementException:
                    # Not on this page; try the next candidate.
                    continue

                print("Achou identificador ", identifier)
                return key, identifier

        return None, None

    def get_product_price_by_xpath(self, price_path: str) -> Optional[int]:
        """Parsed price of the element at ``price_path``, or ``None`` on failure."""
        return self._read_price(self._get_element_text_by_xpath, price_path)

    def _get_element_text_by_xpath(self, xpath: str) -> str:
        """Text of the first element matching ``xpath``.

        Raises:
            NoSuchElementException: when no element matches.
        """
        return self.crawler.driver.find_element(self._BY_XPATH, xpath).text

    def _price_string_to_float(self, price_string: str) -> int:
        """Convert a displayed price such as ``"R$ 1.469,90"`` to an ``int``.

        Despite the historical name, the result is an integer: the currency
        marker, both separators and any whitespace are dropped and the
        remaining digits are parsed as-is, so ``"R$ 1.469,90"`` becomes
        ``146990``. Text without a decimal part is therefore not scaled.

        Raises:
            ValueError: when anything other than digits is left over.
        """
        # \s also covers non-breaking spaces and the line breaks that can show
        # up inside the text of a multi-part price element.
        digits = re.sub(r"[\s.,]", "", price_string.replace("R$", ""))
        return int(digits)

    def get_original_price_if_exists(self, original_price_class: str) -> Optional[int]:
        """Parsed price of the element with class ``original_price_class``, else ``None``."""
        return self._read_price(self._get_element_text_by_class_name, original_price_class)

    def get_discount_percentage_if_exists(self, discount_class: str) -> Optional[str]:
        """Extract a percentage such as ``"8%"`` from the element's text.

        Returns:
            The first percentage found; the element's (empty) text unchanged
            when it has no text; ``None`` when the element is missing, the text
            holds no percentage, or any other error occurs.
        """
        try:
            discount_value = self._get_element_text_by_class_name(discount_class)

            if discount_value:
                return self._extract_percentage_string(discount_value)

            return discount_value
        except NoSuchElementException:
            return None
        except IndexError:
            # Text was present but contained no percentage.
            return None
        except Exception as e:
            print(f"Ocorreu o seguinte erro: {e}")
            return None

    def _get_element_text_by_class_name(self, class_name: str) -> str:
        """Text of the first element with class ``class_name``.

        Raises:
            NoSuchElementException: when no element matches.
        """
        return self.crawler.driver.find_element(self._BY_CLASS_NAME, class_name).text

    def _extract_percentage_string(self, original_string: str) -> str:
        """Return the first ``<digits>%`` token in ``original_string``.

        Raises:
            IndexError: when the string contains no such token.
        """
        # Raw string avoids the invalid-escape warning; \d+ keeps "100%" whole
        # instead of truncating it to its last two digits.
        matches = re.findall(r"\d+%", original_string)
        return matches[0]

    def get_current_price_by_class(self, class_name: str) -> Optional[int]:
        """Parsed price of the element with class ``class_name``, or ``None`` on failure."""
        return self._read_price(self._get_element_text_by_class_name, class_name)

    def _read_price(self, read_text, locator: str) -> Optional[int]:
        """Shared best-effort pipeline: fetch text via ``read_text`` and parse it.

        Args:
            read_text: Callable taking ``locator`` and returning element text.
            locator: XPath or class name, whichever ``read_text`` expects.

        Returns:
            The parsed price, or ``None`` if the element is missing, the text
            is not a price, or any other error occurs.
        """
        try:
            return self._price_string_to_float(read_text(locator))
        except NoSuchElementException:
            return None
        except ValueError:
            print("Erro na conversão de preço para inteiro")
            return None
        except Exception as e:
            print(f"Ocorreu o seguinte erro: {e}")
            return None

In [5]:
@dataclass
class Product:
    """Scraped pricing data for one product page.

    Every field may be ``None``: the extractor returns ``None`` whenever the
    corresponding element is missing or its text cannot be parsed.
    """

    # Current price as the integer produced by the extractor's price parser.
    price: Optional[int]
    # Pre-discount price, parsed the same way as ``price``.
    original_price: Optional[int]
    # Discount as displayed text (e.g. "8%").
    discount: Optional[str]

In [6]:
# Build the crawler around the chromedriver binary in ../driver. The relative
# path is resolved against the current working directory, so this cell depends
# on where the notebook kernel was started.
crawler = Crawler(
    driver_path=os.path.abspath('../driver/chromedriver')
)

In [7]:
# Command-line arguments added to the Chrome options before the browser starts.
# NOTE: set_driver_options appends, so re-running this cell adds the same
# arguments a second time.
crawler.set_driver_options([
    "--disable-extensions",
    "--disable-gpu",
    "--no-sandbox",
    # "--headless"  (left disabled; uncomment to pass this flag as well)
])

In [8]:
# Launch Chrome; the 10 is forwarded to the driver's implicitly_wait
# (seconds, per Selenium's documentation) and the window is maximized.
crawler.start_crawler(implicit_wait_time=10)

In [64]:
# Load the product page; every extraction cell below reads from this page.
crawler.navigate_to_url(BASE_URL)

In [31]:
# The extractor queries whatever page the crawler's driver currently shows.
extractor = HtmlDataExtractor(crawler=crawler)

In [47]:
# Candidate locators for the current price, grouped by locator kind.
# find_price_identifier tries them in the order written here and stops at the
# first one present on the page, so more specific locators should come first.
current_price_identifiers = {
    "xpath": [
        '//span[@id="priceblock_ourprice"]',
        '//span[@id="priceblock_dealprice"]',
        '//span[@id="priceblock_saleprice"]',
        # contains() is a substring test on the class attribute, so the
        # entries below are much broader than the id-based ones above.
        '//span[contains(@class, "a-price")]',
        '//span[contains(@class, "a-size-medium")]',
        '//span[contains(@class, "apexPriceToPay")]',
        '//span[contains(@class, "a-text-price")]'
    ],
    "class": [
        # NOTE(review): this is the same class the notebook later reads the
        # discount text from; confirm it is really meant as a price fallback.
        "priceBlockSavingsString"
    ]
}

In [82]:
# Find the first locator that matches the loaded page, then read the price
# with the getter that corresponds to that locator kind.
(
    current_price_identifier_type,
    current_price_identifier_path,
) = extractor.find_price_identifier(current_price_identifiers)

current_price = None  # stays None when no locator matched
if current_price_identifier_type == "xpath":
    current_price = extractor.get_product_price_by_xpath(current_price_identifier_path)
elif current_price_identifier_type == "class":
    current_price = extractor.get_current_price_by_class(current_price_identifier_path)

# Both lookups are best-effort and give None when the element is absent.
original_price = extractor.get_original_price_if_exists('priceBlockStrikePriceString')
discount = extractor.get_discount_percentage_if_exists('priceBlockSavingsString')

Achou identificador  //span[contains(@class, "a-price")]


In [89]:
# Exploratory probe: raw text of the first element with class "a-color-price"
# (the recorded run returned an empty string).
# NOTE(review): find_element_by_class_name is not available in newer Selenium
# releases; this cell only runs on versions that still ship it.
# extractor.get_discount_percentage_if_exists('a-color-price')
crawler.driver.find_element_by_class_name('a-color-price').text

''

In [83]:
# Fallback chain for the original price: first the struck-through price
# element, then, if that is missing too, the current price itself.
if original_price is None:
    original_price = extractor.get_product_price_by_xpath('//span[contains(@data-a-strike, "true")]')
if original_price is None:
    original_price = current_price

In [84]:
# Bundle the scraped values; any of them may be None if extraction failed.
product = Product(
    price=current_price,
    original_price=original_price,
    discount=discount
)

In [85]:
# Show the dataclass repr of the scraped product.
print(product)

Product(price=146990, original_price=159900, discount=None)


In [23]:
# Close the browser and end the driver session once scraping is done.
crawler.quit()