In [1]:
from playwright.async_api import async_playwright 

In [2]:
import pandas as pd

In [3]:
import json

In [8]:
import asyncio
import nest_asyncio

In [21]:
from playwright_stealth import stealth

In [23]:
import random

In [11]:
# Start a Playwright driver and open a visible (headless=False) Chromium window.
# The resulting `page` is the one reused by the later top-level cells that call
# `page.goto(...)` / `page.locator(...)`.
# NOTE(review): no visible cell closes `browser` or stops `playwright` — confirm
# they are cleaned up elsewhere or add an explicit teardown cell.
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=False)
page = await browser.new_page()

In [29]:
async def scrape_sephora():
    """Collect product links from Sephora's foundation listing page.

    Opens the listing in a visible Chromium window, scrolls the page with
    ``auto_scroll`` so additional products get a chance to load, and then reads
    the ``href`` of every element matching ``.css-ix8km1``.

    Returns:
        list: The ``href`` values that were found (empty when the selector
        matches nothing).

    NOTE(review): ``.css-ix8km1`` looks like a generated class name and the
    recorded run of this cell reported 0 URLs — re-check the selector against
    the live page.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)  # Run in headful mode for debugging
        try:
            context = await browser.new_context()
            page = await context.new_page()

            print("🔎 Navigating to Sephora...")
            await page.goto(
                "https://www.sephora.com/shop/foundation-makeup?pageSize=300",
                wait_until="domcontentloaded",
            )

            # Scroll first so content that only appears after scrolling is in the DOM.
            print("📜 Scrolling down to load all products...")
            await auto_scroll(page)

            # Read the link targets in the browser and hand them back as a list.
            product_urls = await page.evaluate('''() => {
                return Array.from(document.querySelectorAll(".css-ix8km1")).map(item => item.href);
            }''')

            print(f"✅ Found {len(product_urls)} product URLs!")
            return product_urls
        finally:
            # Close the browser even when navigation, scrolling or extraction fails.
            await browser.close()

async def auto_scroll(page, max_scrolls=100):
    """Scroll ``page`` down until its height stops growing.

    Each round scrolls by the full current document height (i.e. jumps to the
    bottom), waits a random 2-4 seconds, and compares
    ``document.body.scrollHeight`` with the value seen before the scroll.
    Scrolling stops as soon as a round leaves the height unchanged.

    Args:
        page: Object exposing async ``evaluate(expression)`` and
            ``wait_for_timeout(ms)`` (a Playwright page).
        max_scrolls: Upper bound on the number of scroll rounds so a page that
            keeps growing cannot hang the notebook. Pass ``None`` to scroll
            without a limit.
    """
    last_height = await page.evaluate("document.body.scrollHeight")
    rounds_done = 0
    while max_scrolls is None or rounds_done < max_scrolls:
        await page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
        # Random delay to mimic human behavior and give new content time to load.
        await page.wait_for_timeout(random.randint(2000, 4000))
        rounds_done += 1

        new_height = await page.evaluate("document.body.scrollHeight")
        if new_height == last_height:
            break  # No new content was loaded by the last scroll.
        last_height = new_height

# Run the listing scraper to completion.
# NOTE(review): asyncio.run() raises "cannot be called from a running event loop"
# inside a notebook kernel unless nest_asyncio.apply() has already been executed;
# in this notebook that call only appears in a later cell, so this cell depends on
# execution order.
asyncio.run(scrape_sephora())


🔎 Navigating to Sephora...
📜 Scrolling down to load all products...
✅ Found 0 product URLs!


In [15]:
# Foundation listing URL used by the exploratory cells below.
# presumably pageSize=300 asks the site for up to 300 products per page — confirm.
url = "https://www.sephora.com/shop/foundation-makeup?pageSize=300"

In [16]:
await page.goto(url)
try:
    brand = await page.locator("h1 a span").inner_text()
except:
    brand = "Unknown"


In [17]:
try:
        product = await page.locator(".css-1wd4e6l span:nth-child(2)").inner_text()
except:
        product = "Unknown"

In [None]:
# Fix the "RuntimeError: asyncio.run() cannot be called from a running event loop"
# nest_asyncio patches asyncio so a loop can be driven from inside the kernel's
# already-running event loop.
nest_asyncio.apply()

# Module-level accumulator: get_color_swatches() appends one record per product
# page, and loop_through_urls() dumps the whole list to sephora.json.
results = []

async def get_color_swatches(url, page):
    """Scrape brand, product name and colour swatches from one product page.

    Navigates ``page`` to ``url``, reads the brand and product name (each falls
    back to ``"Unknown"`` when the element cannot be read), and collects one
    entry per element matching ``.css-1j1jwa4, .css-cl742e``.

    Args:
        url: Address of the product page to open.
        page: Playwright page used for navigation and DOM queries.

    Returns:
        dict: ``{"brand", "product", "url", "swatches"}`` where ``swatches`` is
        a list of ``{"description", "imgSrc", "imgAlt"}`` dicts. The same dict
        is also appended to the module-level ``results`` list.
    """
    await page.goto(url)

    try:
        brand = await page.locator("h1 a span").inner_text()
    except Exception:
        # Best-effort: keep going with a placeholder, but do not swallow
        # KeyboardInterrupt / task cancellation like a bare except would.
        brand = "Unknown"

    try:
        product = await page.locator(".css-1wd4e6l span:nth-child(2)").inner_text()
    except Exception:
        product = "Unknown"

    # evaluate_all() runs a JavaScript expression in the browser; it must be
    # passed as a string (a Python callable cannot be sent to the page).
    swatches = await page.locator(".css-1j1jwa4, .css-cl742e").evaluate_all(
        """elements => elements.map(el => {
            const img = el.querySelector("img");
            return {
                description: el.getAttribute("aria-label"),
                imgSrc: img ? img.getAttribute("src") : null,
                imgAlt: img ? img.getAttribute("alt") : null
            };
        })"""
    )

    full = {"brand": brand, "product": product, "url": url, "swatches": swatches}
    results.append(full)
    return full

async def loop_through_urls(urls, page):
    """Scrape every product URL in turn, then dump the collected records to JSON.

    Each URL is handed to ``get_color_swatches`` using the shared ``page``; a
    failure on one URL is printed and the loop moves on to the next one. After
    the last URL, the module-level ``results`` list is written to
    ``sephora.json``.

    Args:
        urls: Iterable of product page addresses.
        page: Playwright page reused for every product.
    """
    for product_url in urls:
        try:
            await get_color_swatches(product_url, page)
            # Short pause between products to avoid being blocked.
            await page.wait_for_timeout(1000)
        except Exception as err:
            print(f"Error scraping {product_url}: {err}")

    # Persist everything gathered so far, including records from earlier calls
    # that are still present in `results`.
    with open("sephora.json", "w", encoding="utf-8") as out_file:
        json.dump(results, out_file, indent=4)

    print("✅ Scraping complete. Data saved to sephora.json.")

async def find_product_url():
    """Find product URLs on Sephora's foundation page and scrape each of them.

    Launches headless Chromium, opens the listing, resizes the viewport to the
    document's full scroll size, waits until more than 10 ``.css-dkxsdo``
    elements exist, collects the ``href`` of every ``.css-ix8km1`` element and
    passes the list to ``loop_through_urls`` (which writes ``sephora.json``).

    NOTE(review): the wait condition and the extraction use different class
    selectors (``.css-dkxsdo`` vs ``.css-ix8km1``) — confirm both still match
    the live page.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)  # Change to False for debugging
        try:
            page = await browser.new_page()
            await page.goto("https://www.sephora.com/shop/foundation-makeup?pageSize=300")

            # Adjust viewport dynamically so the whole document is inside it.
            body_width = await page.evaluate("document.body.scrollWidth")
            body_height = await page.evaluate("document.body.scrollHeight")
            await page.set_viewport_size({"width": body_width, "height": body_height})

            # Wait for product items to load
            await page.wait_for_function("document.querySelectorAll('.css-dkxsdo').length > 10")

            # Extract product URLs. evaluate_all() takes a JavaScript expression
            # string that is executed in the browser, not a Python callable.
            product_urls = await page.locator(".css-ix8km1").evaluate_all(
                "items => items.map(item => item.href)"
            )

            print(f"🔎 Found {len(product_urls)} product URLs. Scraping data now...")

            # Start scraping product pages (reuses the listing page for navigation).
            await loop_through_urls(product_urls, page)
        finally:
            # Close the browser even when a wait times out or scraping fails.
            await browser.close()

# ✅ FIX: Instead of asyncio.run(), use an event loop in Jupyter
# Drives find_product_url() to completion on the current event loop. This relies on
# the nest_asyncio.apply() call earlier in this cell so run_until_complete() can be
# used while the kernel's loop is already running.
asyncio.get_event_loop().run_until_complete(find_product_url())
