In [1]:
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# suburbs to look up; the main loop reads the "suburb" and "state" columns
suburb_list = pd.read_csv("suburbs.csv")

# accumulators filled in while scraping: one row per recorded sale, plus the
# names of the suburbs that had a usable sales history
sale_data = pd.DataFrame(columns=["suburb", "year", "type", "price"])
valid_suburbs = []

# configure Chrome to run without a visible window (saves time)
chrome_options = Options()
for flag in ("--start-maximized", "--headless"):
    chrome_options.add_argument(flag)

# single shared browser session used by every scraping function below
driver = webdriver.Chrome(options=chrome_options)

def search_for_suburb(suburb, state):
    ''' Search the research page for a suburb and open its matching suggestion.

    Loads the property-and-past-sales page, types ``suburb`` into the search
    field and walks the autocomplete suggestions in order.  The first
    suggestion whose text contains "<suburb>," and ", <state>," and whose icon
    file is "division-icon.png" is clicked, after which ``check_suburb`` is
    called for the same suburb/state.  If no suggestion matches, nothing
    further happens.

    Args:
        suburb: suburb name to type into the search field.
        state: state abbreviation expected in the suggestion text.

    Errors raised by the waits or element lookups are not handled here and
    propagate to the caller.
    '''
    # navigate to search screen
    driver.get("https://www.allhomes.com.au/ah/research/property-and-past-sales")
    search_field = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "ss"))
    )
    # input suburb name
    search_field.clear()
    search_field.send_keys(suburb)
    # gather suggestions
    location_suggestions = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#ui-id-1 .ui-menu-item"))
    )
    # check suggestions in order, click the first correct one
    for suggestion in location_suggestions:
        text = suggestion.find_element(By.TAG_NAME, "div").get_attribute("innerText")
        icon = suggestion.find_element(By.TAG_NAME, "img").get_attribute("src") or ""
        # compare the icon's file name (path tail without the query string)
        # instead of slicing at a fixed character offset, so the check does
        # not depend on the exact length of the URL prefix
        icon_name = icon.split("?", 1)[0].rsplit("/", 1)[-1]
        if (f"{suburb}," in text) and (f", {state}," in text) and (icon_name == "division-icon.png"):
            suggestion.click()
            check_suburb(suburb, state)
            return
        
def check_suburb(suburb, state):
    ''' Check whether a suburb has a sales history and, if so, record it.

    Looks in the second "four_column_wrapper" section for the first link whose
    text contains ``suburb`` and "(<state>", clicks it, and then inspects the
    first year button of the sales history.  When that button is a link the
    suburb is appended to ``valid_suburbs`` and the sales behind the first
    year button and behind the ``:nth-child(14)`` year button are recorded
    (labelled "2007" and "2020" respectively).

    In every other case -- no matching link, the first year button is not a
    link, or any error while locating/clicking elements or recording sales --
    the suburb is reported as "<suburb> is invalid".

    Args:
        suburb: suburb name to look for in the link text.
        state: state abbreviation expected in the link text.
    '''
    # fixed pause before reading the page
    # NOTE(review): presumably gives the page time to load after the click
    # made in search_for_suburb -- confirm
    time.sleep(1)
    try:
        suburb_section = driver.find_elements(By.CLASS_NAME, "four_column_wrapper")[1]
        suburb_options = suburb_section.find_elements(By.CSS_SELECTOR, "dd a")
        for link in suburb_options:
            text = link.get_attribute("innerText")
            if (suburb in text) and (f"({state}" in text):
                link.click()
                break
        else:
            # no link matched (or there were no links at all)
            print(suburb, "is invalid")
            return

        sales_2007 = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".sales-history-wrapper .research-year-button"))
        )
        # only a year button rendered as an <a> element is treated as usable
        if sales_2007.get_attribute("nodeName").lower() != "a":
            print(suburb, "is invalid")
            return

        print("sales recorded in", suburb)
        valid_suburbs.append(suburb)
        sales_2007.click()
        record_sales(suburb, "2007")
        print("2007 sales recorded")
        # return to the suburb page to reach the other year button
        driver.back()
        sales_2020 = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".sales-history-wrapper .research-year-button:nth-child(14)"))
        )
        sales_2020.click()
        record_sales(suburb, "2020")
        print("2020 sales recorded")
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the run
        print(suburb, "is invalid")

def record_sales(suburb, year):
    ''' Append one row to ``sale_data`` for every sale table on the current page.

    Waits for the ``tbody`` elements of the page and, for each one, reads the
    fourth ".expanded-details-col2 span.research-details-value" text (stored
    in the "type" column) and the text of the "boldFont" element (stored in
    the "price" column).

    Args:
        suburb: value written to the "suburb" column.
        year: value written to the "year" column.

    Lookup errors (for example a ``tbody`` with fewer than four detail values)
    are not handled here and propagate to the caller.
    '''
    sales = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.TAG_NAME, "tbody"))
    )
    for sale in sales:
        details = sale.find_elements(By.CSS_SELECTOR, ".expanded-details-col2 span.research-details-value")
        property_type = details[3].get_attribute("innerText")
        price = sale.find_element(By.CLASS_NAME, "boldFont").get_attribute("innerText")
        # len(sale_data) is the next unused row label, so labels run 0, 1, 2, ...
        sale_data.loc[len(sale_data)] = [suburb, year, property_type, price]

# iterate through suburbs; the browser is always closed, even if scraping fails
try:
    for row in suburb_list.itertuples(index=False):
        search_for_suburb(row.suburb, row.state)
        # rewrite the CSV after every suburb so collected data survives a crash
        sale_data.to_csv("sale_data.csv", index=False)

    print(valid_suburbs)
finally:
    driver.quit()

Acton is invalid
sales recorded in Ainslie
2007 sales recorded
2020 sales recorded
sales recorded in Amaroo
2007 sales recorded
2020 sales recorded
sales recorded in Aranda
2007 sales recorded
2020 sales recorded
sales recorded in Banks
2007 sales recorded
2020 sales recorded
sales recorded in Barton
2007 sales recorded
2020 sales recorded
Beard is invalid
sales recorded in Belconnen
2007 sales recorded
2020 sales recorded
sales recorded in Bonner
2007 sales recorded
2020 sales recorded
sales recorded in Bonython
2007 sales recorded
2020 sales recorded
sales recorded in Braddon
2007 sales recorded
2020 sales recorded
sales recorded in Bruce
2007 sales recorded
2020 sales recorded
sales recorded in Calwell
2007 sales recorded
2020 sales recorded
sales recorded in Campbell
2007 sales recorded
2020 sales recorded
Canberra Airport is invalid
Capital Hill is invalid
Casey is invalid
sales recorded in Chapman
2007 sales recorded
2020 sales recorded
sales recorded in Charnwood
2007 sales reco