In [None]:
!pip install requests selenium

In [None]:
import os
import csv
import time
import json
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [40]:
import os
import json
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def init_driver(headless=False):
    """Build and return a Chrome WebDriver.

    Args:
        headless: When truthy, Chrome is started with the ``--headless=new``
            argument; otherwise no extra arguments are set.

    Returns:
        The ``webdriver.Chrome`` instance created with those options.
    """
    chrome_options = Options()
    extra_args = ["--headless=new"] if headless else []
    for arg in extra_args:
        chrome_options.add_argument(arg)
    return webdriver.Chrome(options=chrome_options)

def click_tab(driver, tab_text):
    """Click the first displayed tab link whose ``<span>`` text contains ``tab_text``.

    Candidates are ``<a>`` elements with a child ``<span>`` containing the
    given text. After a successful click the function sleeps for 2 seconds.
    If no candidate is displayed, a warning is printed and nothing is clicked.

    Note: ``tab_text`` is interpolated into the XPath inside single quotes, so
    a value containing an apostrophe would produce an invalid expression.

    Args:
        driver: Selenium WebDriver to search in.
        tab_text: Text fragment to look for inside the tab's ``<span>``.
    """
    xpath = f"//a[span[contains(text(), '{tab_text}')]]"
    candidates = driver.find_elements(By.XPATH, xpath)

    # Lazily pick the first candidate that is actually visible.
    visible_tab = next((link for link in candidates if link.is_displayed()), None)
    if visible_tab is None:
        print(f"[WARN] Tab with text '{tab_text}' not found.")
        return

    visible_tab.click()
    time.sleep(2)

def scrape_residential_tab(driver):
    """Read the key/value rows of the ``#Residential`` table into a dict.

    Waits up to 5 seconds for an element with id ``Residential`` to be
    present. Every ``<tr>`` beneath it that has at least two ``<td>`` cells
    contributes one entry: the first cell's text (stripped, with ``:``
    removed) is the key and the second cell's stripped text is the value.
    Rows with an empty key are skipped; a repeated key keeps the last value.

    Args:
        driver: Selenium WebDriver positioned on a record page.

    Returns:
        dict mapping label text to value text, or ``{}`` (after printing a
        warning) when the wait for the table fails.
    """
    try:
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, "Residential")))
    except Exception:
        # Best-effort: any WebDriver failure is reported as "not found".
        # ``Exception`` (not a bare ``except``) so Ctrl-C / SystemExit still propagate.
        print("[WARN] Residential table not found.")
        return {}

    data = {}
    rows = driver.find_elements(By.CSS_SELECTOR, "#Residential tr")
    for row in rows:
        cells = row.find_elements(By.TAG_NAME, "td")
        if len(cells) >= 2:
            key = cells[0].text.strip().replace(":", "")
            value = cells[1].text.strip()
            if key:
                data[key] = value
    return data

def wait_for_photoDetails(driver, timeout=10, poll_interval=0.1):
    """Poll the page until the JavaScript global ``photoDetails`` is non-empty.

    Repeatedly evaluates
    ``typeof photoDetails !== 'undefined' && photoDetails.length > 0`` in the
    browser until it is truthy or ``timeout`` seconds have elapsed.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium WebDriver).
        timeout: Maximum time to wait, in seconds. May be an int or a float;
            a value of 0 returns ``False`` without polling.
        poll_interval: Seconds to sleep between polls.

    Returns:
        ``True`` as soon as the script evaluates truthy, ``False`` on timeout.
    """
    # A wall-clock deadline keeps the total wait bounded by ``timeout`` even
    # when each script call is slow, and works for non-integer timeouts.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            result = driver.execute_script("return typeof photoDetails !== 'undefined' && photoDetails.length > 0")
            if result:
                return True
        except Exception:
            # Script errors are treated as "not ready yet" and retried.
            # ``Exception`` (not a bare ``except``) so Ctrl-C / SystemExit still propagate.
            pass
        time.sleep(poll_interval)
    return False

def scrape_and_download_photos_from_photoDetails(driver, address_folder, timeout=30):
    """Download every photo listed in the page's ``photoDetails`` JS variable.

    After ``wait_for_photoDetails`` succeeds, ``photoDetails`` is serialised
    in the browser with ``JSON.stringify`` and parsed here. For each entry a
    "standard" document URL is built from its ``Id`` and the image is saved
    as ``photo_<n>.jpg`` (1-based, by position) inside ``address_folder``.

    Args:
        driver: Selenium WebDriver positioned on the Photos tab.
        address_folder: Existing directory to write the image files into.
        timeout: Seconds allowed for each HTTP request.

    Returns:
        list of the URLs that were built (including ones whose download
        failed); ``[]`` when ``photoDetails`` is missing, empty or unreadable.
    """
    if not wait_for_photoDetails(driver):
        print("[WARN] photoDetails not found or empty.")
        return []

    try:
        photo_details_json = driver.execute_script("return JSON.stringify(photoDetails);")
        photo_details = json.loads(photo_details_json)
    except Exception as e:
        print(f"[ERROR] Couldn't extract photoDetails: {e}")
        return []

    photo_urls = []
    for idx, photo in enumerate(photo_details):
        try:
            # NOTE(review): the access token is hard-coded in the URL. If it is
            # session-specific or expires, it should be read from the page or
            # from configuration instead -- confirm with the site's behaviour.
            std_url = f"https://www.ncpub.org/_web/api/document/{photo['Id']}/standard?token=RnNBOFBQNFhzakRDS3dzVVFPYm1wVHpMMFhZR2FvVGZSWEFmRkc5SDE0az0="
            photo_urls.append(std_url)

            # A timeout prevents one stalled request from hanging the scrape.
            response = requests.get(std_url, timeout=timeout)
            # Do not save an HTTP error page under a .jpg name.
            response.raise_for_status()

            with open(os.path.join(address_folder, f"photo_{idx+1}.jpg"), 'wb') as f:
                f.write(response.content)
        except Exception as e:
            # Covers a malformed entry (no 'Id'), network/HTTP errors and file
            # errors; the remaining photos are still attempted.
            print(f"[WARN] Failed to download image {idx+1}: {e}")
    return photo_urls

def get_total_record_count(driver):
    """Return the total number of records shown by the detail navigator.

    Reads the ``value`` attribute of the element with id
    ``DTLNavigator_txtFromTo``, takes the text after the last ``" of "`` and
    converts it to ``int`` (presumably the value looks like ``"1 of 34"`` --
    confirm against the page).

    Args:
        driver: Selenium WebDriver positioned on a record page.

    Returns:
        int: the parsed total, or ``1`` when the element is missing or its
        value cannot be parsed.
    """
    try:
        txt = driver.find_element(By.ID, "DTLNavigator_txtFromTo").get_attribute("value")
        return int(txt.split(" of ")[-1])
    except Exception:
        # Fall back to a single record. ``Exception`` (not a bare ``except``)
        # so Ctrl-C / SystemExit still propagate.
        return 1


def scrape_sketch_details(driver):
    """
    Scrapes the sketch grid inside the page's first iframe into a dictionary.

    The driver is switched into the first ``<iframe>`` on the page. After
    waiting (up to 20 seconds) for ``div.rgDataDiv table.rgMasterTable`` to be
    present, the rows of ``div#RadGrid1_GridData table`` are read: for every
    row with at least four ``<td>`` cells, the third cell's text becomes the
    key and the fourth cell's text (commas removed) is parsed as an integer
    value. Rows whose fourth cell is not an integer are skipped; a repeated
    key keeps the last value.

    On success the driver is intentionally LEFT inside the iframe (the caller
    in this file takes the sketch screenshot next, and ``scrape_sketch_image``
    switches back). If anything fails after entering the iframe, the driver
    is switched back to the default content before the exception propagates.

    Args:
        driver: Selenium WebDriver object, assumed to be on the target page.

    Returns:
        dict: { third_td_text: int(fourth_td_text) }

    Raises:
        Whatever Selenium raises when the iframe or table cannot be found or
        the wait times out.
    """
    details = {}

    iframe = driver.find_element(By.TAG_NAME, "iframe")
    driver.switch_to.frame(iframe)

    try:
        # Wait for the table inside div.rgDataDiv to load
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.rgDataDiv table.rgMasterTable"))
        )

        table = driver.find_element(By.CSS_SELECTOR, "div#RadGrid1_GridData table")
        rows = table.find_elements(By.TAG_NAME, "tr")

        print(f"[INFO] {len(rows)} rows found.")

        for row in rows:
            tds = row.find_elements(By.TAG_NAME, "td")
            if len(tds) >= 4:
                key = tds[2].text.strip()
                value_text = tds[3].text.strip()
                try:
                    value = int(value_text.replace(',', ''))  # Remove commas if numbers are formatted
                    details[key] = value
                except ValueError:
                    continue  # Skip rows where the fourth td is not an integer
    except Exception:
        # Do not leave the driver stranded inside the iframe on failure.
        driver.switch_to.default_content()
        raise

    return details

def scrape_sketch_image(driver, address_folder):
    """
    Takes a screenshot of the sketch image element and saves it as
    ``sketch.png`` in address_folder, then switches back to the default content.

    Waits up to 10 seconds for the element with id ``BinImage`` to be present,
    sleeps a further 2 seconds (presumably to let overlays finish rendering),
    and screenshots that element. The driver is switched back to the default
    content whether or not the screenshot succeeds.

    Args:
        driver: Selenium WebDriver object, assumed to already be inside the
            frame that contains ``#BinImage``.
        address_folder: str, directory to save the screenshot in.

    Raises:
        Whatever Selenium raises when the element never appears or cannot be
        captured.
    """
    try:
        # Wait for image to load (max 10 seconds)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "BinImage"))
        )

        time.sleep(2)

        element = driver.find_element(By.ID, "BinImage")
        element.screenshot(os.path.join(address_folder, "sketch.png"))
    finally:
        # Always leave the frame, even on timeout, so later tab lookups run
        # against the top-level document.
        driver.switch_to.default_content()
    

def scrape_all_records_on_street(driver, street_name, output_dir):
    """Search for a street and scrape every record in the result navigator.

    Steps:
      1. Open the address search page and click the ``btAgree`` button if it
         is there.
      2. Enter ``street_name`` in ``inpStreet``, submit, and open the first
         row of ``#searchResults``.
      3. For each of the records reported by ``get_total_record_count``:
         scrape the Residential tab, download photos, scrape the sketch table
         and screenshot, write everything to ``data.json`` in a per-record
         folder under ``output_dir``, then click the "next" navigator button.

    The per-record folder is named after ``data["address"]`` when that key
    exists, otherwise ``<street_name>_<index>``; spaces become underscores.

    Any exception while processing a record prints a warning and STOPS the
    loop (remaining records are not attempted).

    Args:
        driver: Selenium WebDriver to drive.
        street_name: Street name typed into the search form.
        output_dir: Directory under which per-record folders are created.

    Returns:
        None.
    """
    base_url = "https://www.ncpub.org/_web/search/commonsearch.aspx?mode=address"
    driver.get(base_url)
    time.sleep(2)

    try:
        driver.find_element(By.ID, "btAgree").click()
        time.sleep(2)
    except Exception:
        # The agreement button is only shown sometimes; its absence is fine.
        # ``Exception`` (not a bare ``except``) so Ctrl-C / SystemExit still propagate.
        pass

    driver.find_element(By.ID, "inpStreet").send_keys(street_name)
    driver.find_element(By.ID, "btSearch").click()
    time.sleep(3)

    # Click the first result
    try:
        result_links = driver.find_elements(By.CSS_SELECTOR, "#searchResults tr.SearchResults")
        if not result_links:
            print(f"[INFO] No results found for street: {street_name}")
            return
        result_links[0].click()
        time.sleep(2)
    except Exception as e:
        print(f"[ERROR] Could not click initial search result: {e}")
        return

    total = get_total_record_count(driver)
    print(f"[INFO] Found {total} records for {street_name}")

    for i in range(total):
        try:
            click_tab(driver, "Residential")
            data = scrape_residential_tab(driver)

            click_tab(driver, "Photos")
            folder = os.path.join(output_dir, data.get("address", f"{street_name}_{i}").replace(" ", "_"))
            os.makedirs(folder, exist_ok=True)
            scrape_and_download_photos_from_photoDetails(driver, folder)

            # scrape_sketch_details enters the sketch iframe; scrape_sketch_image
            # leaves it again, so the two calls must stay in this order.
            click_tab(driver, "Sketch")
            sketch_data = scrape_sketch_details(driver)
            scrape_sketch_image(driver, folder)

            data["sketch_data"] = sketch_data

            out_json = os.path.join(folder, "data.json")

            with open(out_json, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2)

            print(f"[SAVED] {data.get('address', 'Unknown')}")

            # Advance to the next record unless this was the last one.
            if i < total - 1:
                driver.find_element(By.ID, "DTLNavigator_imageNext").click()
                time.sleep(2)
        except Exception as e:
            # The page state is unknown after a failure (the "next" click may
            # not have happened), so stop rather than re-scrape the same record.
            print(f"[WARN] Skipped record {i} due to error: {e}")
            break

In [41]:
# Launch a visible Chrome session and scrape every configured street.
driver = init_driver(headless=False)
try:
    # Implicit wait is set to 0; the scraping functions use explicit waits and sleeps.
    driver.implicitly_wait(0)
    output_dir = "scraped_properties"
    os.makedirs(output_dir, exist_ok=True)

    STREETS = ["RAMBEAU RD"]

    for street in STREETS:
        scrape_all_records_on_street(driver, street, output_dir)
finally:
    # Close the browser even if scraping raises or the cell is interrupted,
    # so no Chrome process is left running.
    driver.quit()

[INFO] Found 34 records for RAMBEAU RD
[INFO] Saved current page HTML to debug_page.html
[INFO] 8 rows found.
[REMOVE ME] Data: {'Main Building': 900, 'A1 - 11:OFP OPEN FRAME PORCH': 270, 'A2 - 13:FR GR FRAME GARAGE': 576, 'A3 - 16:FROVR FRAME OVERHANG': 72, 'VINYL POOL - RP2:PREFABRICATED VINYL POOL': 512}
[SAVED] Unknown
[WARN] Tab with text 'Residential' not found.
[WARN] Residential table not found.
[WARN] Tab with text 'Photos' not found.


KeyboardInterrupt: 