# Scraper

This scraping code is built upon the provided `scrape.py` code.

All scraping complies with Domain's [robots.txt](https://www.domain.com.au/robots.txt).

The scraping takes a long time, and its results depend on the date of the scrape. Because of this, a pre-scraped file is provided.

In [None]:
import re
from bs4 import BeautifulSoup
import requests
import pandas as pd
from time import sleep
from IPython.display import display


# Do not truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None


# Root URL of the site being scraped.
BASE_URL = "https://www.domain.com.au"

# Folder for the raw data files (the value carries its own trailing slash).
DIR_RAW = "../data/raw/"

# Browser-like User-Agent header sent with every request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/81.0.4044.141 Safari/537.36"
    )
}

# Percent values used for progress reporting: 1, 6, 11, ..., 96.
PERCENTAGES = range(1, 101, 5)

# Seconds to pause between consecutive HTTP requests.
REQUEST_WAIT_TIME = 0.2

# Load the postcode table, keep only the Victorian rows, and collect the
# distinct postcodes to iterate over.
df_postcodes = pd.read_csv(DIR_RAW + "postcodes.csv")
vic_rows = df_postcodes["state"] == "VIC"
df_postcodes = df_postcodes[vic_rows]
postcodes = set(df_postcodes["postcode"].unique())

In [None]:
def get_display_percentages(xs, percentages=None):
    """Return the item counts at which a progress message should be printed.

    Each percent value is converted to an item count relative to ``len(xs)``.
    The first milestone is then forced to 1 and the last to ``len(xs)``, so
    the first and the final processed item always trigger a report.

    Args:
        xs: Sized collection that is going to be processed.
        percentages: Iterable of percent values. Defaults to the module-level
            ``PERCENTAGES`` when omitted.

    Returns:
        A list of item counts, one per percent value. The list is empty when
        no percent values are supplied.
    """
    if percentages is None:
        percentages = PERCENTAGES

    total = len(xs)
    milestones = [round(total * (percentage / 100)) for percentage in percentages]

    # Guard the index writes: an empty percentage list has no first/last slot.
    if milestones:
        milestones[0] = 1
        milestones[-1] = total
    return milestones

## Collect property URLs by postcodes

In [None]:
# Distinct listing URLs gathered across all postcodes.
property_urls = set()

# Seconds to wait for a response before a request is abandoned.
REQUEST_TIMEOUT = 30

# Compiled once for all pages; BASE_URL is escaped so its dots match literally.
base_url_pattern = re.compile(re.escape(BASE_URL))

display_percentage = get_display_percentages(postcodes)

for postcode_num, postcode in enumerate(postcodes, start=1):

    page_num = 1
    while True:

        page_url = f"{BASE_URL}/rent/?postcode={postcode}&page={page_num}"
        try:
            response = requests.get(page_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as error:
            # A network failure ends this postcode instead of aborting the whole run.
            print(f"Request failed for {page_url}: {error}")
            break
        content = BeautifulSoup(response.text, "html.parser")

        # stop paging this postcode once a page has no results list
        results = content.find("ul", {"data-testid": "results"})
        if results is None:
            break

        # collect all candidate property links on page
        links = results.find_all("a", href=base_url_pattern)
        if not links:
            break

        for link in links:
            # .get() avoids a KeyError on anchors that carry no class attribute
            if "address" in link.get("class", []):
                property_urls.add(link["href"])

        page_num += 1

        # wait a little between requests to avoid overloading the server
        sleep(REQUEST_WAIT_TIME)

    if postcode_num in display_percentage:
        print(f"{round(postcode_num / len(postcodes) * 100)}% of VIC postcode URLs scraped")

## Scrape each URL collected

In [None]:
# Seconds to wait for a response before a request is abandoned.
REQUEST_TIMEOUT = 30

# (field name, pattern) pairs applied to every summary list item, in this order.
_SUMMARY_PATTERNS = (
    ("bond", re.compile(r"([bB]ond \$[0-9,\.]+)")),
    ("internal_area", re.compile(r"([iI]nternal area .+)")),
    ("land_area", re.compile(r"([lL]and area .+)")),
)


def _set_text(record, key, node):
    """Store the text of ``node`` under ``key`` in ``record`` when the node was found."""
    if node is not None:
        record[key] = node.get_text()


def _set_left_right(record, node, left_key, right_key):
    """Store the left-value / right-value span texts found inside ``node``."""
    _set_text(record, left_key, node.find("span", {"data-testid": "left-value"}))
    _set_text(record, right_key, node.find("span", {"data-testid": "right-value"}))


def _parse_listing(content):
    """Extract the fields present on one parsed property listing page.

    Args:
        content: BeautifulSoup document of a single property page.

    Returns:
        Dict mapping field name to the raw text scraped from the page. A field
        is omitted when its element is not found on the page.
    """
    fields = {}

    _set_text(fields, "price", content.find("div", {"data-testid": "listing-details__summary-title"}))
    _set_text(fields, "address", content.find("h1", {"class": "css-164r41r"}))

    # feature spans are read by position; zip() stops at however many are present
    features = content.find("div", {"data-testid": "property-features"})
    if features is not None:
        feature_spans = features.find_all("span", {"data-testid": "property-features-text-container"})
        for key, span in zip(("num_beds", "num_bath", "num_car"), feature_spans):
            fields[key] = span.get_text()

    _set_text(fields, "property_type", content.find("div", {"data-testid": "listing-summary-property-type"}))
    _set_text(fields, "agent", content.find("a", {"data-testid": "listing-details__agent-details-agent-company-name"}))

    summary = content.find("div", {"data-testid": "strip-content-list"})
    if summary is not None:
        for entry in summary.find_all("li"):
            entry_text = entry.get_text()
            for key, pattern in _SUMMARY_PATTERNS:
                found = pattern.search(entry_text)
                if found:
                    fields[key] = found.group(1)

    _set_text(fields, "domain_says", content.find("p", {"data-testid": "listing-details__domain-says-text"}))

    if content.find("section", {"data-testid": "neighbourhood-insights"}) is not None:
        # NOTE: these lookups search the whole page, not only the insights section.
        age_rows = content.find_all("tr", {"data-testid": "neighbourhood-insights__age-brackets-row"})
        long_term_residents = content.find("div", {"data-testid": "single-value-doughnut-graph"})
        split_bars = content.find_all("div", {"class": "css-14hea9r"})

        if len(age_rows) >= 4:
            age_keys = (
                "neighbourhood_under_20",
                "neighbourhood_20_to_39",
                "neighbourhood_40_to_59",
                "neighbourhood_above_60",
            )
            for key, row in zip(age_keys, age_rows):
                _set_text(fields, key, row.find("div", {"data-testid": "bar-value"}))

        _set_text(fields, "neighbourhood_long_term_residents", long_term_residents)

        # at least four matches are required even though only the first two are read
        if len(split_bars) >= 4:
            _set_left_right(fields, split_bars[0], "neighbourhood_owners", "neighbourhood_renter")
            _set_left_right(fields, split_bars[1], "neighbourhood_family", "neighbourhood_single")

    stats = content.find("div", {"data-testid": "listing-details__suburb-insights"})
    if stats is not None:
        values = stats.find_all("div", {"class": "css-35ezg3"})
        if len(values) >= 6:
            value_keys = (
                "performance_median_price",
                "performance_auction_clearance",
                "performance_sold_this_year",
                "performance_avg_days_on_market",
                "demographic_population",
                "demographic_average_age",
            )
            for key, value in zip(value_keys, values):
                fields[key] = value.get_text()

        occupancy = stats.find("div", {"data-testid": "suburb-insights__occupancy"})
        if occupancy is not None:
            _set_left_right(fields, occupancy, "demographic_owner", "demographic_renter")

        household = stats.find("div", {"data-testid": "suburb-insights__household"})
        if household is not None:
            _set_left_right(fields, household, "demographic_family", "demographic_single")

    # NOTE(review): the "noreferer" spelling is kept exactly as it was; confirm it
    # matches the rel value used in the page markup.
    directions_link = content.find("a", {"target": "_blank", "rel": "noopener noreferer"})
    if directions_link is not None:
        found = re.search(r"destination=([-\s,\d\.]+)", directions_link.get("href", ""))
        if found:
            coordinates = found.group(1).split(",")
            # the matched text may lack a comma; only store a complete pair
            if len(coordinates) >= 2:
                fields["latitude"] = coordinates[0]
                fields["longitude"] = coordinates[1]

    return fields


# One dict per property URL; every dict has at least the "url" key.
data = []

display_percentage = get_display_percentages(property_urls)

for property_url_num, property_url in enumerate(property_urls, start=1):

    current_scrape = {"url": property_url}

    try:
        response = requests.get(property_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        # keep the URL-only record so the failed fetch stays visible in the output
        print(f"Request failed for {property_url}: {error}")
    else:
        content = BeautifulSoup(response.text, "html.parser")
        current_scrape.update(_parse_listing(content))

    data.append(current_scrape)

    # wait a little between requests to avoid overloading the server
    sleep(REQUEST_WAIT_TIME)

    if property_url_num in display_percentage:
        print(f"{round(property_url_num / len(property_urls) * 100)}% of property URLs scraped")

## Save scraped data

In [None]:
# Flatten the list of scraped records into a table and preview the first rows.
df_scrape = pd.json_normalize(data)
display(df_scrape.head(10))

# DIR_RAW already ends with "/", so no extra separator is inserted here
# (same path style as the postcodes.csv read).
df_scrape.to_csv(f"{DIR_RAW}scraped_properties.csv", index=False)