In [36]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import re
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC



In [None]:
# Scraper setup: target URLs, result accumulators and the Chrome driver.
BASE_URL = "https://www.jolse.com"
PRODUCT_LIST_URL = "https://www.jolse.com/category/skincare/1018/?page={}"

# Filled further down: product page URLs first, then one dict per product.
product_urls = []
products = []

# Configure Selenium's Chrome options.
options = Options()
for chrome_flag in (
    '--headless',  # run Chrome without a visible browser window
    '--no-sandbox',
    '--disable-dev-shm-usage',
):
    options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=options)

# FIRST PASS: store the product URLs found on every listing page.
for page_number in range(1, 34):  # widen this range to cover more pages
    listing_url = PRODUCT_LIST_URL.format(page_number)
    driver.get(listing_url)
    time.sleep(3)  # fixed pause after loading, before reading the listing

    anchors = driver.find_elements(By.CSS_SELECTOR, 'ul.prdList li div.thumbnail a')
    hrefs = (anchor.get_attribute('href') for anchor in anchors)
    # Anchors without an href (None or empty string) are skipped.
    product_urls.extend(href for href in hrefs if href)

# SECOND PASS: visit every product URL and extract its data.
#
# For each URL one dict is appended to `products` with the keys
# name, brand, price, ingredients, product_url and image_url. Any field that
# cannot be extracted is stored as None.

# Selectors tried in order for the product title. The second one is a
# fallback used when the first is absent from the page or has no text.
NAME_SELECTORS = ('div.xans-product-detail h2', 'div.prd-detail-header h2')

# Compiled once here instead of once per candidate <div> on every page.
INGREDIENTS_PATTERN = re.compile(r'Ingredients\s*[:\-]?\s*(.+)', flags=re.IGNORECASE)

for product_url in product_urls:
    try:
        driver.get(product_url)
    except Exception as e:
        # A single page that fails to load must not abort the run and throw
        # away everything scraped so far; report it and move on.
        print(f"❌ Error cargando {product_url}: {e}")
        continue
    time.sleep(3)  # fixed pause after loading, before reading the page

    # NAME
    # find_elements returns an empty list when nothing matches, so a missing
    # first selector no longer prevents the fallback selector from being tried.
    name = None
    try:
        for selector in NAME_SELECTORS:
            for heading in driver.find_elements(By.CSS_SELECTOR, selector):
                heading_text = heading.text.strip()
                if heading_text:
                    name = heading_text
                    break
            if name:
                break
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C still stops the run.
        name = None
    if not name:
        print(f"⚠️ Nombre no encontrado en {product_url}")

    # BRAND: the <td> that follows the <th> whose text is "brand".
    brand = None
    try:
        for th in driver.find_elements(By.TAG_NAME, 'th'):
            if th.text.strip().lower() == 'brand':
                brand = th.find_element(By.XPATH, 'following-sibling::td').text.strip()
                break
    except Exception:
        brand = None

    # PRICE
    try:
        price = driver.find_element(By.ID, 'span_product_price_text').text.strip()
    except Exception:
        price = None

    # INGREDIENTS
    ingredients = None
    try:
        # Wait up to 10 s for at least one element with class "cont".
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'cont'))
        )
        for block in driver.find_elements(By.CLASS_NAME, 'cont'):
            match = INGREDIENTS_PATTERN.search(block.text.strip())
            if not match:
                continue
            candidate = match.group(1).strip()
            # Accept the match only if it contains more than five commas,
            # i.e. it looks like a list rather than a passing mention.
            if candidate.count(',') > 5:
                ingredients = candidate
                break
    except Exception as e:
        print(f"❌ Error buscando ingredientes: {e}")

    # IMAGE
    try:
        image = driver.find_element(By.CSS_SELECTOR, 'div.keyImg img').get_attribute('src')
    except Exception:
        image = None

    products.append({
        'name': name,
        'brand': brand,
        'price': price,
        'ingredients': ingredients,
        'product_url': product_url,
        'image_url': image,
    })

    print(f"✔️ Scrapeado: {product_url}")

# Close the browser before writing the results to disk.
driver.quit()

# Save the CSV. The file name is defined once so the confirmation message
# always reports the file that was actually written (the old message named
# a different file than the one passed to to_csv).
OUTPUT_CSV = 'jolse_products.csv'
df = pd.DataFrame(products)
# utf-8-sig writes a BOM at the start of the file.
df.to_csv(OUTPUT_CSV, index=False, encoding='utf-8-sig')
print(f"✅ Archivo guardado como {OUTPUT_CSV}")

✔️ Scrapeado: https://www.jolse.com/product/skin1004-madagascar-centella-light-cleansing-oil-200ml/28746/category/1018/display/2/
✔️ Scrapeado: https://www.jolse.com/product/b-lab-matcha-hydrating-foam-cleanser-120ml/33721/category/1018/display/2/
✔️ Scrapeado: https://www.jolse.com/product/round-lab-birch-juice-moisturizing-sunscreen-spf-50-pa-50ml/40013/category/1018/display/2/
✔️ Scrapeado: https://www.jolse.com/product/cosrx-aloe-soothing-sun-cream-spf50-pa-50ml/3400/category/1018/display/2/
✔️ Scrapeado: https://www.jolse.com/product/anua-heartleaf-pore-control-cleansing-oil-200ml/65381/category/1018/display/2/
✔️ Scrapeado: https://www.jolse.com/product/skin1004-madagascar-centella-poremizing-quick-clay-stick-mask-27g/69432/category/1018/display/1/
✔️ Scrapeado: https://www.jolse.com/product/skin1004-madagascar-centella-tone-brightening-capsule-ampoule-100ml/41719/category/1018/display/1/
✔️ Scrapeado: https://www.jolse.com/product/skin1004-madagascar-centella-hyalu-cica-water-fi