In [1]:
from construct import list_
from numpy import save
import requests
import os
import time
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
def setup_driver():
    """Create a headless Chrome WebDriver.

    The chromedriver binary is resolved through ``ChromeDriverManager().install()``
    and Chrome is started with the ``--headless``, ``--disable-gpu`` and
    ``--no-sandbox`` flags.

    Returns:
        The ``webdriver.Chrome`` instance. This function does not close it;
        the caller has to call ``quit()`` when done.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--headless', '--disable-gpu', '--no-sandbox'):
        chrome_options.add_argument(flag)
    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

def download_image(image_url, location, base_url='https://www.estately.com',
                   save_dir='../data/estately_images/queens', timeout=30):
    """Download one listing image and save it as ``<sanitized location>.jpg``.

    Args:
        image_url: Absolute URL (starting with ``http``) or a path that is
            appended to ``base_url``. ``None`` or an empty string means there
            is nothing to download.
        location: Listing address used to build the file name; only letters,
            digits and spaces are kept, and spaces become underscores.
        base_url: Prefix added to ``image_url`` when it is not absolute.
        save_dir: Target directory; created when it does not exist.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled server cannot block the whole scrape.

    Returns:
        The file name (not the full path) when the image was written, or
        ``None`` when there was no URL, the request failed, or the response
        status was not 200.
    """
    # Nothing to fetch: bail out before touching the file system or network.
    if not image_url:
        return None
    if not image_url.startswith('http'):
        image_url = base_url + image_url

    # exist_ok avoids the check-then-create race of a separate os.path.exists().
    os.makedirs(save_dir, exist_ok=True)

    filename = "".join([c for c in location if c.isalpha() or c.isdigit() or c == ' ']).rstrip()
    filename = filename.replace(' ', '_') + '.jpg'
    file_path = os.path.join(save_dir, filename)
    try:
        response = requests.get(image_url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Best effort: a failed download must not abort the scrape.
        return None
    if response.status_code != 200:
        return None
    with open(file_path, 'wb') as file:
        file.write(response.content)
    return filename

def fetch_properties(driver, max_images=300, area = 'NY/New_York'):
    """Parse the listing cards of the page currently loaded in ``driver``.

    Every ``div`` whose class attribute is ``result-item-info clearfix``
    becomes one record. When the card has an image, it is downloaded with
    ``download_image`` into ``../data/estately/img/<area>``.

    Args:
        driver: Object exposing ``page_source`` (the Selenium driver).
        max_images: Maximum number of cards to process.
        area: Area path such as ``'NY/New_York'``; used only to build the
            image directory.

    Returns:
        A list of dicts with the keys ``Location``, ``Detail Link``,
        ``Image Name``, ``Price``, ``Property Type``, ``Photo Count``,
        ``Broker``, ``Beds``, ``Baths``, ``Sqft``, ``Lot Size`` and
        ``Days on Site``. Basic details that are absent on a card are ``None``.
    """
    properties = []
    image_count = 0
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    property_divs = soup.find_all('div', class_='result-item-info clearfix')
    for div in property_divs:
        if image_count >= max_images:
            break

        # Guard the heading lookup: chaining .find() on a missing <h2> would
        # raise AttributeError and abort the whole page.
        address_tag = div.find('h2', class_='result-address')
        location_tag = address_tag.find('a') if address_tag else None
        location = location_tag.text.strip() if location_tag else "No location provided"
        href = location_tag.get('href') if location_tag else None
        detail_link = 'https://www.estately.com' + href if href else None

        # Price extraction
        price_tag = div.find('p', class_='result-price')
        price = price_tag.text.strip() if price_tag else "Price not listed"

        # Property type extraction
        property_type_tag = address_tag.find('small') if address_tag else None
        property_type = property_type_tag.text.strip() if property_type_tag else "Property type not listed"

        photo_count_tag = div.parent.find('div', class_='photo-count-small')
        photo_count = photo_count_tag.text.strip() if photo_count_tag else "No photo"

        # NOTE(review): this takes the first <p> under the card's parent. The
        # output saved in this notebook shows Broker equal to Property Type,
        # so the selector is probably too broad; confirm against the page.
        broker_tag = div.parent.find('p')
        broker = broker_tag.text.strip() if broker_tag else "No broker listed"

        # Only call the downloader when the card really carries an image URL;
        # a placeholder string must not be requested as if it were a URL.
        image_tag = div.find('img', class_='listing-card-image')
        image_url = (image_tag.get('data-src') or image_tag.get('src')) if image_tag else None
        image_name = None
        if image_url:
            image_name = download_image(image_url, location, save_dir=f'../data/estately/img/{area}')

        # Basic details extraction. Every field is reset per card so a value
        # can never leak from the previous listing (days_on_site used to be
        # unset here, which raised UnboundLocalError or reused stale data).
        basics_grid = div.find('ul', class_='result-basics-grid')
        beds = baths = sqft = lot_size = days_on_site = None
        if basics_grid:
            for li in basics_grid.find_all('li'):
                text = li.text.strip()
                lowered = text.lower()
                if 'bed' in lowered:
                    beds = text.split()[0]
                elif 'bath' in lowered:
                    baths = text.split()[0]
                elif 'sqft' in lowered and 'lot' not in lowered:
                    sqft = text.split()[0]
                elif 'lot' in lowered:
                    lot_size = text.split()[0]
                elif 'on site' in lowered:
                    days_on_site = text.split()[0]
        properties.append({
            'Location': location,
            'Detail Link': detail_link,
            'Image Name': image_name,
            'Price': price,
            'Property Type': property_type,
            'Photo Count': photo_count,
            'Broker': broker,
            'Beds': beds,
            'Baths': baths,
            'Sqft': sqft,
            'Lot Size': lot_size,
            'Days on Site': days_on_site
        })
        image_count += 1
        if image_count % 20 == 0:
            print(f"Scraped {image_count} images.")
    return properties

def main(max_images=300, area = 'NY/New_York'):
    """Scrape the first results page for ``area`` into a DataFrame.

    Args:
        max_images: Maximum number of listings handed to ``fetch_properties``.
        area: Path appended to ``https://www.estately.com/``, e.g. ``'NY/Queens'``.

    Returns:
        ``pandas.DataFrame`` built from the records returned by
        ``fetch_properties``.
    """
    url = f'https://www.estately.com/{area}'
    driver = setup_driver()
    try:
        driver.get(url)
        time.sleep(5)
        # Scroll to the bottom, then pause again before reading the page source
        # (presumably to let lazily loaded content appear; confirm).
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)
        properties = fetch_properties(driver, max_images, area)
    finally:
        # Always release the browser, even when loading or parsing fails.
        driver.quit()
    return pd.DataFrame(properties)

In [12]:
# Area to scrape, written as '<STATE>/<City>'. Pick one entry of list_areas by index.
list_areas = ['NY/Queens', 'NY/Brooklyn', 'MD/Cumberland']
area = list_areas[2]
# Same identifier with the slash replaced, so it can be used in a file name.
loc = '_'.join(area.split('/'))
print(loc)

MD_Cumberland


In [13]:
# Scrape the selected area (at most 300 listings) and print a short summary
# framed by two separator lines.
df = main(300, area)
print('=' * 50, f"Scraped {len(df)} properties in {area}.", '=' * 50, sep='\n')

Scraped 20 images.
Scraped 40 images.
Scraped 60 images.
Scraped 80 images.
Scraped 100 images.
Scraped 105 properties in MD/Cumberland.


In [14]:
# Persist the scraped listings. The target folder is created first because
# DataFrame.to_csv does not create missing parent directories.
os.makedirs('../data/estately/property_data', exist_ok=True)
df.to_csv(f'../data/estately/property_data/{loc}.csv', index=False)

In [15]:
# Render the scraped listings table as this cell's output.
df

Unnamed: 0,Location,Detail Link,Image Name,Price,Property Type,Photo Count,Broker,Beds,Baths,Sqft,Lot Size
0,"504 Warren Street, CUMBERLAND, MD",https://www.estately.com/listings/info/504-war...,504_Warren_Street_CUMBERLAND_MD.jpg,"$190,000",House For Sale,3 photos,House For Sale,3,2,1200,4500
1,"808 Louisiana Avenue, CUMBERLAND, MD",https://www.estately.com/listings/info/808-lou...,808_Louisiana_Avenue_CUMBERLAND_MD.jpg,"$200,000",House For Sale,22 photos,House For Sale,3,2,1455,8450
2,"1212 Lafayette Ave, CUMBERLAND, MD",https://www.estately.com/listings/info/1212-la...,1212_Lafayette_Ave_CUMBERLAND_MD.jpg,"$218,500",House For Sale,112 photos,House For Sale,3,3,1600,8276
3,"903 Maryland Avenue, CUMBERLAND, MD",https://www.estately.com/listings/info/903-mar...,903_Maryland_Avenue_CUMBERLAND_MD.jpg,"$67,900",House For Sale,24 photos,House For Sale,2,1,728,1589
4,"136 Arch Street, CUMBERLAND, MD",https://www.estately.com/listings/info/136-arc...,136_Arch_Street_CUMBERLAND_MD.jpg,"$138,500",Multifamily For Sale,32 photos,Multifamily For Sale,‚Äì,‚Äì,2296,3950
...,...,...,...,...,...,...,...,...,...,...,...
100,"447 Williams Street, CUMBERLAND, MD",https://www.estately.com/listings/info/447-wil...,447_Williams_Street_CUMBERLAND_MD.jpg,"$34,500",Land For Sale,1 photo,Land For Sale,‚Äì,‚Äì,‚Äì,
101,"126 Seymour Street, CUMBERLAND, MD",https://www.estately.com/listings/info/126-sey...,126_Seymour_Street_CUMBERLAND_MD.jpg,"$66,000",House For Sale,31 photos,House For Sale,4,2,2876,3886
102,"12528 Willowbrook Road, CUMBERLAND, MD",https://www.estately.com/listings/info/12528-w...,12528_Willowbrook_Road_CUMBERLAND_MD.jpg,"$1,800,000",Land For Sale,1 photo,Land For Sale,‚Äì,‚Äì,‚Äì,
103,"2 12 Patterson Avenue, CUMBERLAND, MD",https://www.estately.com/listings/info/2-12-pa...,2_12_Patterson_Avenue_CUMBERLAND_MD.jpg,"$19,900",Land For Sale,9 photos,Land For Sale,‚Äì,‚Äì,‚Äì,


In [4]:
# Exploratory cell: load one search page and parse its HTML for inspection.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
# url = 'https://www.estately.com/NY/Brooklyn/house,condo,townhouse,multifamily'
# NOTE(review): the path below looks like a bounding box (two lat,lng pairs); confirm.
url = 'https://www.estately.com/33.4219,-119.3132,34.3295,-117.6653'
try:
    driver.get(url)
    html = driver.page_source
finally:
    # Only the captured HTML is used afterwards, so close the browser here
    # (also when the page load fails) instead of leaving Chrome running.
    driver.quit()
soup = BeautifulSoup(html, 'html.parser')

In [5]:
# Dump the parsed document to inspect the page structure; this prints the
# whole page markup, so the output can be very long.
print(soup)

<html class="map" lang="en"><head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# estately: http://ogp.me/ns/fb/estately#">
<meta charset="utf-8"/>
<meta content="width=980, maximum-scale=1, user-scalable=no" name="viewport"/>
<title>Recently Sold Real Estate &amp; Homes - Estately</title>
<link href="https://images.estately.net" rel="preconnect"/>
<link href="https://www.google-analytics.com" rel="preconnect"/>
<script async="" nonce="" src="https://www.googletagmanager.com/gtag/js?id=G-CX2LBLNSX6&amp;l=dataLayer&amp;cx=c" type="text/javascript"></script><script async="" src="https://www.googletagmanager.com/gtm.js?id=GTM-TFGTLBP"></script><script nonce="">
//<![CDATA[
window.estately={data:{"controller":"MapController","git_sha":"569a421bcfe866fe7d528ca00f71ba4dfbad78d0","is_user_logged_in":false,"is_mobile":false,"is_csr":null,"property":{"id":null,"ln":null,"city":null,"county":null,"state":null,"zip":null,"mls":{"acronym":null}}},analytics:{"id":"UA-1073029-1","events":[],"

In [11]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import os
from bs4 import BeautifulSoup
import requests

def setup_driver():
    """Build the headless Chrome WebDriver used by the scraper.

    Chrome runs with ``--headless``, ``--disable-gpu`` and ``--no-sandbox``;
    the driver binary path comes from ``ChromeDriverManager().install()``.

    Returns:
        A ``webdriver.Chrome`` instance that the caller must ``quit()``.
    """
    chrome_options = webdriver.ChromeOptions()
    for argument in ('--headless', '--disable-gpu', '--no-sandbox'):
        chrome_options.add_argument(argument)
    driver_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=driver_service, options=chrome_options)

def get_total_pages(driver, results_per_page=200):
    """Return the number of result pages for the search loaded in ``driver``.

    The count is read from the element with id ``js-map-results-count``; the
    first whitespace-separated token of its text is parsed as the total number
    of results (thousands separators are accepted).

    Args:
        driver: Selenium WebDriver positioned on a results page.
        results_per_page: Listings per page used for the ceiling division.

    Returns:
        The page count, or ``0`` when the counter element is missing, empty,
        or does not start with a number.
    """
    # find_elements returns an empty list when nothing matches, whereas
    # find_element raises NoSuchElementException, which made the original
    # "return 0" fallback unreachable.
    matches = driver.find_elements(By.ID, "js-map-results-count")
    if not matches:
        return 0
    tokens = matches[0].text.split()
    if not tokens:
        return 0
    try:
        # Strip thousands separators such as "1,234" before converting.
        total_results = int(tokens[0].replace(',', ''))
    except ValueError:
        return 0
    print("total result: ", total_results)
    # Ceiling division: a partially filled last page still counts as a page.
    return (total_results + results_per_page - 1) // results_per_page
def fetch_properties(driver):
    """Parse every listing card of the page currently loaded in ``driver``.

    Every ``div`` whose class attribute is ``result-item-info clearfix``
    becomes one record. Image download is disabled in this variant, so
    ``Image Name`` is always ``None``.

    Args:
        driver: Object exposing ``page_source`` (the Selenium driver).

    Returns:
        A list of dicts with the keys ``Location``, ``Detail Link``,
        ``Image Name``, ``Price``, ``Property Type``, ``Photo Count``,
        ``Broker``, ``Beds``, ``Baths``, ``Sqft``, ``Lot Size`` and
        ``Days on Site``. Details that are absent on a card are ``None``.
    """
    properties = []
    image_count = 0
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    property_divs = soup.find_all('div', class_='result-item-info clearfix')
    for div in property_divs:
        # Guard the heading lookup: chaining .find() on a missing <h2> would
        # raise AttributeError and abort the whole page.
        address_tag = div.find('h2', class_='result-address')
        location_tag = address_tag.find('a') if address_tag else None
        location = location_tag.text.strip() if location_tag else "No location provided"
        href = location_tag.get('href') if location_tag else None
        detail_link = 'https://www.estately.com' + href if href else None

        # Price extraction
        price_tag = div.find('p', class_='result-price')
        price = price_tag.text.strip() if price_tag else "Price not listed"

        # Property type extraction
        property_type_tag = address_tag.find('small') if address_tag else None
        property_type = property_type_tag.text.strip() if property_type_tag else "Property type not listed"

        photo_count_tag = div.parent.find('div', class_='photo-count-small')
        photo_count = photo_count_tag.text.strip() if photo_count_tag else "No photo"

        # NOTE(review): first <p> under the card's parent; verify that this
        # really is the broker line on the page.
        broker_tag = div.parent.find('p')
        broker = broker_tag.text.strip() if broker_tag else "No broker listed"

        # Text of the 'result-basics-row' list, stored as "Days on Site".
        # It is no longer reset to None by the basics initialisation below.
        basics_row = div.find('ul', class_='result-basics-row')
        days_on_site = basics_row.text.strip() if basics_row else None

        image_name = None

        # Basic details extraction
        basics_grid = div.find('ul', class_='result-basics-grid')
        beds = baths = sqft = lot_size = None
        if basics_grid:
            for li in basics_grid.find_all('li'):
                text = li.text.strip()
                lowered = text.lower()
                if 'bed' in lowered:
                    beds = text.split()[0]
                elif 'bath' in lowered:
                    baths = text.split()[0]
                elif 'sqft' in lowered and 'lot' not in lowered:
                    sqft = text.split()[0]
                elif 'lot' in lowered:
                    lot_size = text.split()[0]
        properties.append({
            'Location': location,
            'Detail Link': detail_link,
            'Image Name': image_name,
            'Price': price,
            'Property Type': property_type,
            'Photo Count': photo_count,
            'Broker': broker,
            'Beds': beds,
            'Baths': baths,
            'Sqft': sqft,
            'Lot Size': lot_size,
            'Days on Site': days_on_site
        })
        image_count += 1
        if image_count % 20 == 0:
            print(f"Scraped {image_count} images.")
        # The stray debugging `break` that stopped after the first card is
        # gone, so all cards on the page are collected.
    return properties

def scrape_all_pages(area):
    """Scrape all result pages for ``area`` and return the combined records.

    Args:
        area: Path appended to ``https://www.estately.com/``, e.g. ``'NY/Queens'``.

    Returns:
        A list with the records of every page, in page order. The list is
        empty when ``get_total_pages`` reports zero pages.
    """
    url = f'https://www.estately.com/{area}'
    all_properties = []
    driver = setup_driver()
    try:
        driver.get(url)
        time.sleep(5)  # Wait for dynamic content to load

        total_pages = get_total_pages(driver)
        for page in range(1, total_pages + 1):
            # Navigate to the correct page
            driver.get(f"{url}?page={page}")
            time.sleep(5)

            # Fetch properties from this page
            all_properties.extend(fetch_properties(driver))
    finally:
        # Close the browser on every path, including the zero-page case and
        # failures while loading or parsing, which previously leaked it.
        driver.quit()
    return all_properties

def main(area='NY/New_York'):
    """Scrape every result page for ``area`` and print the collected records.

    Writing the records to CSV is currently disabled; the value returned by
    ``scrape_all_pages`` is printed as-is.
    """
    scraped = scrape_all_pages(area)
    # Disabled persistence step, kept for reference:
    #     df = pd.DataFrame(properties)
    #     loc = area.replace('/', '_')
    #     df.to_csv(f'../data/estately/property_data/{loc}.csv', index=False)
    #     print(f"Scraped {len(df)} properties in {area}.")
    print(scraped)

# Example usage: run the multi-page scraper for the 'NY/Queens' area.
main(area='NY/Queens')


BadZipFile: File is not a zip file