In [55]:
import asyncio
import json
import time
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# -------------------------------
# CONFIGURATION
# -------------------------------
# Search-results URL that scrape_otodom() paginates by appending "?page=N".
BASE_URL = "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/mazowieckie/warszawa"
# CSS class of the <div> that wraps all listings; used both for the Selenium
# wait and for the BeautifulSoup lookup.
# NOTE(review): the "css-…" name looks auto-generated, so it presumably changes
# between site builds — verify against the live page before running.
LISTINGS_CONTAINER_CLASS = "css-1pkwj40"  # **MAY NEED UPDATING!**
# CSS class of each individual listing <div> inside the container.
LISTING_CLASS = "css-19ucd76"  # **MAY NEED UPDATING!**
# Default path of the CSV file written by save_to_csv().
OUTPUT_FILE = "listings.csv"

# -------------------------------
# Initialize Chrome WebDriver
# -------------------------------
def init_chrome_driver(user_agent=None):
    """
    Create a headless Chrome WebDriver configured for scraping.

    Args:
        user_agent: Optional User-Agent string passed to Chrome via
            ``--user-agent``. When omitted, Chrome's own default is used.

    Returns:
        A ``webdriver.Chrome`` instance with a 10-second implicit wait.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run Chrome in the background
    chrome_options.add_argument("--disable-gpu")
    # Chrome expects "width,height" here; the former "1920x1080" spelling is
    # not a valid value for this switch.
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--log-level=3")  # Reduce logging
    # Hides the Blink "AutomationControlled" feature that some sites probe for.
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    if user_agent:
        chrome_options.add_argument(f"--user-agent={user_agent}")

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )
    driver.implicitly_wait(10)
    return driver

# -------------------------------
# Dismiss Cookie Banner
# -------------------------------
def dismiss_cookie_banner(driver):
    """
    Accept the cookie-consent dialog when one shows up.

    Waits up to 5 seconds for a clickable button whose text contains
    'Akceptuję' and clicks it. Any failure along the way is reported with a
    printed notice instead of being raised, so the call is best-effort.
    """
    try:
        consent_locator = (By.XPATH, "//button[contains(text(), 'Akceptuję')]")
        waiter = WebDriverWait(driver, 5)
        waiter.until(EC.element_to_be_clickable(consent_locator)).click()
        print("✅ Cookie banner dismissed.")
    except Exception:
        print("⚠️ No cookie banner found or already dismissed.")

# -------------------------------
# Scroll to Load JavaScript Listings
# -------------------------------
def scroll_to_load(driver, scrolls=5, pause=2):
    """
    Scroll to the bottom of the page repeatedly to trigger lazy loading.

    Args:
        driver: WebDriver (or any object exposing ``execute_script``).
        scrolls: How many times to jump to the bottom of the page.
        pause: Seconds to sleep after each scroll so page scripts can
            append more content before the next jump.
    """
    for _ in range(scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # Allow JavaScript to load content

# -------------------------------
# Load Listings Page
# -------------------------------
def get_listings_page(driver, url):
    """
    Open *url* and return the rendered page as a BeautifulSoup tree.

    The page is loaded, the cookie banner is dismissed and the page is
    scrolled; afterwards the function waits up to 15 seconds for the listings
    container to be present. If that wait fails for any reason, the first
    5000 characters of the page source are printed and None is returned.
    """
    driver.get(url)
    dismiss_cookie_banner(driver)
    scroll_to_load(driver)

    try:
        container_present = EC.presence_of_element_located(
            (By.CLASS_NAME, LISTINGS_CONTAINER_CLASS)
        )
        WebDriverWait(driver, 15).until(container_present)
        print("✅ Listings successfully loaded.")
    except Exception:
        print("❌ Listings did not load in time.")
        # Dump only the head of the document to keep the debug output readable.
        html_head = driver.page_source[:5000]
        print("📜 Debugging Page Source:\n", html_head)
        return None

    rendered_html = driver.page_source
    return BeautifulSoup(rendered_html, "html.parser")

# -------------------------------
# Extract Listings from Page
# -------------------------------
def extract_listings(soup):
    """
    Extract listings from the parsed HTML.

    Args:
        soup: BeautifulSoup tree of a results page.

    Returns:
        A list of dicts with the keys "title", "price", "location" and "url".
        Fields that cannot be found get a "No ..." placeholder string. The
        list is empty when the listings container is missing.
    """
    listings = []

    container = soup.find("div", class_=LISTINGS_CONTAINER_CLASS)
    if container is None:
        print("❌ Listing container not found.")
        return listings

    for listing in container.find_all("div", class_=LISTING_CLASS):
        # Look each element up once instead of once for the test and once
        # for the value.
        title_tag = listing.find("h3")
        price_tag = listing.find("span", class_="css-1wi2w6s")
        location_tag = listing.find("p", class_="css-1pgwcoa")
        # href=True only matches anchors that carry an href attribute, so the
        # subscript below cannot raise KeyError on a bare <a>.
        link_tag = listing.find("a", href=True)

        url = link_tag["href"] if link_tag else "No URL"

        listings.append({
            "title": title_tag.text.strip() if title_tag else "No title",
            "price": price_tag.text.strip() if price_tag else "No price",
            "location": location_tag.text.strip() if location_tag else "No location",
            # Site-relative links are made absolute.
            "url": f"https://www.otodom.pl{url}" if url.startswith("/") else url,
        })

    return listings

# -------------------------------
# Save to CSV
# -------------------------------
def save_to_csv(data, filename=OUTPUT_FILE):
    """
    Write *data* to *filename* as a UTF-8 CSV without the index column.

    *data* is handed to ``pd.DataFrame`` unchanged; a confirmation line is
    printed once the file has been written.
    """
    pd.DataFrame(data).to_csv(filename, index=False, encoding="utf-8")
    print(f"💾 Listings saved to {filename}")

# -------------------------------
# Main Scraper Function
# -------------------------------
def scrape_otodom(max_pages=5, delay=2):
    """
    Scrape Otodom result pages and save the listings to a CSV file.

    Args:
        max_pages: Maximum number of result pages to visit, starting at 1.
        delay: Seconds to sleep between consecutive page requests.

    Scraping stops early when a page fails to load or yields no listings.
    The browser is always closed, even if an error occurs.
    """
    driver = init_chrome_driver()
    all_listings = []

    try:
        for page in range(1, max_pages + 1):
            print(f"🔄 Scraping page {page}...")
            page_url = f"{BASE_URL}?page={page}"
            soup = get_listings_page(driver, page_url)
            if not soup:
                break

            listings = extract_listings(soup)
            if not listings:
                print(f"❌ No listings found on page {page}. Stopping.")
                break

            all_listings.extend(listings)
            # Be polite between requests; no need to wait after the last page.
            if page < max_pages:
                time.sleep(delay)

        # Save results
        if all_listings:
            save_to_csv(all_listings)
            print(f"✅ Successfully scraped {len(all_listings)} listings!")
        else:
            print("❌ No listings found at all.")

    finally:
        driver.quit()

# -------------------------------
# Run the Scraper
# -------------------------------
# Runs immediately when this cell/module executes: starts Chrome, scrapes the
# result pages and writes the CSV if any listings were found.
scrape_otodom()

🔄 Scraping page 1...
⚠️ No cookie banner found or already dismissed.
❌ Listings did not load in time.
📜 Debugging Page Source:
 <html><head><meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<title>ERROR: The request could not be satisfied</title>
</head><body>
<h1>403 ERROR</h1>
<h2>The request could not be satisfied.</h2>
<hr noshade="" size="1px">
Request blocked.
We can't connect to the server for this app or website at this time. There might be too much traffic or a configuration error. Try again later, or contact the app or website owner.
<br clear="all">
If you provide content to customers through CloudFront, you can find steps to troubleshoot and help prevent this error by reviewing the CloudFront documentation.
<br clear="all">
<hr noshade="" size="1px">
<pre>Generated by cloudfront (CloudFront)
Request ID: GBflfbaLWPhOG9XpQcS-C4SZVgtsbExyeShPUMRDwfp_h9BFV0483w==
</pre>
<address>
</address>
</body></html>
❌ No listings found at all.


In [None]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

# `from lib2to3.pgen2 import driver` bound the lib2to3 parser module, which has
# no `page_source` attribute. A real WebDriver is needed to inspect the page,
# so create one, load the results page, and always close the browser again.
driver = init_chrome_driver()
try:
    driver.get(BASE_URL)
    print(driver.page_source)
finally:
    driver.quit()

In [56]:
import requests
from bs4 import BeautifulSoup

URL = "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/mazowieckie/warszawa"

# Browser-like User-Agent sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
}

# requests has no default timeout; without one a stalled connection would
# block this cell indefinitely.
response = requests.get(URL, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Print all divs (to analyze structure)
for div in soup.find_all("div"):
    print(div.attrs)

{'id': '__next'}
{'style': 'position:fixed;z-index:9999'}
{'class': ['css-1bx5ylf', 'e50rtj23']}
{'class': ['css-11az3rb', 'etqjiy20']}
{'id': 'baxter-l-nav-top', 'class': [], 'style': 'display:none', 'data-cy': 'baxter-slot-baxter-l-nav-top'}
{'id': 'baxter-l-scr', 'class': [], 'style': 'display:none', 'data-cy': 'baxter-slot-baxter-l-scr'}
{'class': ['css-szdbo9', 'e1r4zmi50']}
{'class': ['e1r4zmi52', 'css-1bc7oiu']}
{'class': ['ehhk7110', 'css-1d7ri1z']}
{'class': ['ehhk7111', 'css-70n85g']}
{'data-expanded': 'false', 'class': ['css-ivlwbd', 'e12g1ibn0']}
{'id': 'sell-container', 'class': ['css-1bbw7v7', 'emfe5dh0']}
{'data-cy': 'sellDesktopSubmenu', 'class': ['css-156h79b', 'e1h9b5vx0']}
{'class': ['e1h9b5vx1', 'css-oftajj']}
{'id': 'rent-container', 'class': ['css-1bbw7v7', 'emfe5dh0']}
{'data-cy': 'rentDesktopSubmenu', 'class': ['css-156h79b', 'e1h9b5vx0']}
{'class': ['e1h9b5vx1', 'css-oftajj']}
{'id': 'sellers-container', 'class': ['css-1bbw7v7', 'emfe5dh0']}
{'data-cy': 'seller

In [57]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Initialize Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# try/finally guarantees the Chrome process is shut down even when loading or
# querying the page raises.
try:
    # Open Otodom
    driver.get("https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/mazowieckie/warszawa")

    # Wait and Find Listings: the implicit wait makes find_elements poll for up
    # to 5 seconds before returning an empty list.
    driver.implicitly_wait(5)
    listings = driver.find_elements(By.CSS_SELECTOR, "div[class*='listing']")

    # Print all elements found
    for listing in listings:
        print(listing.text)
finally:
    driver.quit()