In [16]:
!pip install -q playwright
!playwright install chromium
!pip install tqdm
!pip install aiohttp

Collecting aiohttp
  Downloading aiohttp-3.12.11-cp313-cp313-macosx_11_0_arm64.whl.metadata (7.6 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp)
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.1.2 (from aiohttp)
  Downloading aiosignal-1.3.2-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting attrs>=17.3.0 (from aiohttp)
  Downloading attrs-25.3.0-py3-none-any.whl.metadata (10 kB)
Collecting frozenlist>=1.1.1 (from aiohttp)
  Downloading frozenlist-1.6.2-cp313-cp313-macosx_11_0_arm64.whl.metadata (17 kB)
Collecting multidict<7.0,>=4.5 (from aiohttp)
  Downloading multidict-6.4.4-cp313-cp313-macosx_11_0_arm64.whl.metadata (5.3 kB)
Collecting propcache>=0.2.0 (from aiohttp)
  Downloading propcache-0.3.1-cp313-cp313-macosx_11_0_arm64.whl.metadata (10 kB)
Collecting yarl<2.0,>=1.17.0 (from aiohttp)
  Downloading yarl-1.20.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (72 kB)
Collecting idna>=2.0 (from yarl<2.0,>=1.17.0->aiohttp)
  Download

In [None]:
import asyncio
import json
import os
from pathlib import Path
from urllib.parse import urlparse

import aiohttp
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from playwright.async_api import async_playwright

# Root of every URL this notebook requests.
BASE_URL = "https://www.redfin.com"

# City name -> numeric id; both are interpolated into the
# ``/city/{id}/PA/{name}`` search URL.
CITIES = dict(
    Bethlehem="1616",
    Easton="5583",
    Allentown="514",
)

# Upper bound on how many listings are collected for each city.
MAX_LISTINGS_PER_CITY = 5


# Characters that must never reach a filename: every ASCII whitespace
# character, both path separators, and the characters Windows reserves.
_DASHED_FILENAME_CHARS = ' \t\n\r\x0b\x0c/\\:*?"<>|'
_FILENAME_TABLE = str.maketrans(
    {**dict.fromkeys(_DASHED_FILENAME_CHARS, "-"), ",": None}
)


def sanitize_filename(text):
    """Turn free-form text (e.g. a scraped street address) into a filename token.

    Commas are removed. Each whitespace character, ``/``, ``\\`` and each of
    ``: * ? " < > |`` is replaced by a single ``-``. All other characters are
    kept unchanged, so ordinary addresses map to the same name as before.

    Args:
        text: The string to sanitise.

    Returns:
        The sanitised string; it contains no path separators.
    """
    return text.translate(_FILENAME_TABLE)


async def download_images(image_urls, address, city, session, out_dir="images"):
    """Download listing photos into ``out_dir`` and return the paths written.

    Files are named ``{city}_{sanitised address}_{index}{ext}``, where ``ext``
    is the extension found in the URL path, or ``.jpg`` when there is none.
    The download is best-effort: a URL that fails or does not answer with
    HTTP 200 is reported on stdout and skipped.

    Args:
        image_urls: Iterable of image URLs to fetch.
        address: Listing address used to build the filenames.
        city: City name used as the filename prefix.
        session: Object whose ``get(url)`` is an async context manager yielding
            a response with ``status`` and ``read()`` (an
            ``aiohttp.ClientSession`` in this notebook).
        out_dir: Directory to write into; created if missing.

    Returns:
        List of file paths that were written successfully, in download order.
    """
    os.makedirs(out_dir, exist_ok=True)
    # The address is the same for every image, so sanitise it once.
    safe_address = sanitize_filename(address)
    saved_paths = []

    for idx, url in enumerate(image_urls):
        ext = os.path.splitext(urlparse(url).path)[1] or ".jpg"
        filepath = os.path.join(out_dir, f"{city}_{safe_address}_{idx}{ext}")

        try:
            async with session.get(url) as resp:
                if resp.status != 200:
                    print(f"⚠️ Skipping {url}: HTTP {resp.status}")
                    continue
                # Read the full body before touching the disk so a failed
                # transfer cannot leave an empty or truncated file behind.
                payload = await resp.read()
            with open(filepath, "wb") as f:
                f.write(payload)
            saved_paths.append(filepath)
        except Exception as e:
            print(f"❌ Failed to download {url}: {e}")

    return saved_paths


async def add_stealth(page):
    """Register an init script on ``page`` that overrides browser properties.

    The script redefines ``navigator.webdriver`` (to ``undefined``),
    ``navigator.languages`` and ``navigator.plugins``, and assigns a stub
    object to ``window.chrome``.

    Args:
        page: Object exposing an awaitable ``add_init_script(script)``.
    """
    stealth_js = """
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    window.chrome = { runtime: {} };
    Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
    """
    await page.add_init_script(stealth_js)


async def _inner_text_or(page, selector, default=None):
    """Return the inner text of the first match for ``selector``, else ``default``."""
    element = await page.query_selector(selector)
    return await element.inner_text() if element else default


async def scrape_listing(page, url, city_name, session):
    """Scrape one listing page and download its photos.

    Args:
        page: Playwright page used for navigation and DOM queries.
        url: Absolute URL of the listing.
        city_name: City label, forwarded to ``download_images`` as the
            filename prefix.
        session: HTTP session forwarded to ``download_images``.

    Returns:
        A dict with keys ``url``, ``address``, ``description``, ``beds``,
        ``baths``, ``sqft``, ``year_built``, ``property_type``,
        ``price_per_sqft`` and ``image_paths``. Returns ``None`` if anything
        raised while loading or parsing; the error is printed.
    """
    try:
        await page.goto(url, timeout=30000)
        # Fixed pause after navigation before the DOM is queried.
        await page.wait_for_timeout(1500)

        # Normalise the address once so the record and the image filenames
        # are built from the same, whitespace-trimmed text.
        raw_address = await page.text_content("div.street-address")
        address = raw_address.strip() if raw_address else ""

        description = await _inner_text_or(
            page, 'div[data-rf-test-id="listingRemarks"]', ""
        )

        # Stats (beds, baths, sqft)
        beds = await _inner_text_or(page, 'div[data-rf-test-id="abp-beds"] .statsValue')
        baths = await _inner_text_or(page, 'div[data-rf-test-id="abp-baths"] .statsValue')
        sqft = await _inner_text_or(page, 'div[data-rf-test-id="abp-sqFt"] .statsValue')

        # Key Details: keep only the labels we report; the last matching row wins.
        key_details = {"Year Built": None, "Property Type": None, "Price/Sq.Ft.": None}
        key_rows = await page.query_selector_all("div.KeyDetailsTable div.keyDetails-row")
        for row in key_rows:
            label_el = await row.query_selector("span.valueType")
            value_el = await row.query_selector("span.valueText")
            if not label_el or not value_el:
                continue
            label = (await label_el.inner_text()).strip()
            if label in key_details:
                key_details[label] = (await value_el.inner_text()).strip()

        # Image scraping: collect every non-empty src of the media-card images.
        img_tags = await page.query_selector_all("a[data-rf-test-id^='MB-image-card-'] img")
        image_urls = []
        for img in img_tags:
            src = await img.get_attribute("src")
            if src:
                image_urls.append(src)

        local_image_paths = await download_images(
            image_urls, address or "unknown", city_name, session
        )

        return {
            "url": url,
            "address": address,
            "description": description.strip(),
            "beds": beds,
            "baths": baths,
            "sqft": sqft,
            "year_built": key_details["Year Built"],
            "property_type": key_details["Property Type"],
            "price_per_sqft": key_details["Price/Sq.Ft."],
            "image_paths": local_image_paths
        }
    except Exception as e:
        # Best-effort: one bad listing must not stop the crawl.
        print(f"❌ Error parsing {url}: {e}")
        return None


async def scrape_city(page, city_name, city_id, max_listings, session):
    """Collect up to ``max_listings`` listings for one city.

    Walks the paginated search results (``.../page-1``, ``.../page-2``, ...),
    gathers the listing links on each page and scrapes them one by one with
    ``scrape_listing``. Stops when enough listings were collected, a page
    cannot be loaded, a page shows no cards, or a page yields no URL that has
    not been seen already.

    Args:
        page: Playwright page reused for every navigation.
        city_name: City name; used in the URL, log messages and each record.
        city_id: City id used in the URL.
        max_listings: Maximum number of listing records to return.
        session: HTTP session forwarded to ``scrape_listing``.

    Returns:
        List of listing dicts, each with an added ``"city"`` key.
    """
    print(f"\n🔍 Scraping {city_name} (max {max_listings})...")
    listings = []
    seen_urls = set()  # listing URLs already queued, across all result pages
    page_num = 1
    max_attempts = 2

    while len(listings) < max_listings:
        url = f"{BASE_URL}/city/{city_id}/PA/{city_name}/page-{page_num}"
        for attempt in range(max_attempts):
            try:
                await page.goto(url, timeout=60000)
                break
            # Playwright raises its own TimeoutError, which is not a subclass
            # of the built-in one, so both have to be named for the retry to run.
            except (PlaywrightTimeoutError, TimeoutError):
                print(f"⚠️ Timeout on {url}, retrying ({attempt+1}/{max_attempts})...")
                await page.wait_for_timeout(3000)
        else:
            print(f"❌ Failed to load {url} after {max_attempts} attempts.")
            break

        # Pause, scroll to the bottom of the page, then pause again before
        # looking for the result cards.
        await page.wait_for_timeout(2000)
        await page.evaluate("() => window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(2000)

        try:
            await page.wait_for_selector("div.MapHomeCard", timeout=15000)
        except Exception as e:
            print(f"⚠️ Timeout waiting for listings on {city_name} page {page_num}: {e}")
            break

        cards = await page.query_selector_all("div.MapHomeCard a[href*='/PA/']")
        print(f"🔎 Page {page_num}: found {len(cards)} cards")

        if not cards:
            # Keep the HTML for inspection; explicit UTF-8 so the dump does not
            # depend on the platform's default encoding.
            html = await page.content()
            with open(f"debug_{city_name.lower()}_page{page_num}.html", "w", encoding="utf-8") as f:
                f.write(html)
            break

        # Keep page order but drop URLs already seen on this or an earlier page.
        hrefs = []
        for card in cards:
            href = await card.get_attribute("href")
            if not href:
                continue
            full_url = BASE_URL + href
            if full_url not in seen_urls:
                seen_urls.add(full_url)
                hrefs.append(full_url)

        if not hrefs:
            # Nothing new here: further pages would only repeat results.
            print(f"⚠️ No new listings on {city_name} page {page_num}; stopping.")
            break

        for href in hrefs:
            if len(listings) >= max_listings:
                break
            data = await scrape_listing(page, href, city_name, session)
            if data:
                data["city"] = city_name
                listings.append(data)

        page_num += 1

    print(f"✅ {city_name}: Collected {len(listings)} listings.")
    return listings


async def main():
    """Scrape every city in ``CITIES`` and write the records to JSON.

    Launches a visible Chromium window, applies ``add_stealth`` to the page,
    scrapes each city with ``scrape_city`` and finally writes everything that
    was collected to ``tri_city_listings.json``. A city that raises is
    reported and skipped, so listings gathered for the other cities are
    still saved.
    """
    all_data = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=False,
            args=["--no-sandbox", "--disable-blink-features=AutomationControlled"]
        )
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
                viewport={"width": 1280, "height": 800},
                locale="en-US",
                extra_http_headers={"Accept-Language": "en-US,en;q=0.9"}
            )
            page = await context.new_page()
            # Register the init script before the first navigation; the helper
            # was defined in this notebook but never applied.
            await add_stealth(page)

            async with aiohttp.ClientSession() as session:
                for city, city_id in CITIES.items():
                    try:
                        listings = await scrape_city(page, city, city_id, MAX_LISTINGS_PER_CITY, session)
                    except Exception as e:
                        # Same best-effort policy as scrape_listing: report
                        # the failure and keep what the other cities produced.
                        print(f"❌ {city}: scraping aborted: {e}")
                        continue
                    all_data.extend(listings)
        finally:
            # Close the browser even when setup or scraping raised.
            await browser.close()

    with open("tri_city_listings.json", "w", encoding="utf-8") as f:
        json.dump(all_data, f, indent=2)
    print(f"\n🎉 Saved {len(all_data)} total listings to tri_city_listings.json")


# Top-level ``await`` works here because the notebook kernel already runs an
# event loop; a plain Python script would need ``asyncio.run(main())`` instead.
await main()


🔍 Scraping Bethlehem (max 5)...
🔎 Page 1: found 41 cards
✅ Bethlehem: Collected 5 listings.

🔍 Scraping Easton (max 5)...
🔎 Page 1: found 41 cards
✅ Easton: Collected 5 listings.

🔍 Scraping Allentown (max 5)...
🔎 Page 1: found 41 cards
✅ Allentown: Collected 5 listings.

🎉 Saved 15 total listings to tri_city_listings.json
