# Scraping domain.com.au

In [1]:
from seleniumwire import webdriver
from bs4 import BeautifulSoup
import os
import json
import requests
import re

In [2]:
# Fetch the webpage
response = requests.get("https://postcodes-australia.com/state-postcodes/vic")

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Extract all the text
extracted_text = soup.get_text()

# Define the regex pattern to extract postcodes and suburbs
pattern = re.compile(r"(\d{4})\s+([A-Za-z\s\n]+)")

matches = pattern.findall(extracted_text)

# Initialize dictionary to store postcode to suburb mapping
postcode_dict = {}

# Populate the dictionary
for match in matches:
    postcode = int(match[0])
    suburbs = match[1].strip().split('\n')  # Split suburbs by newline character
    suburbs = [suburb.strip() for suburb in suburbs if suburb]  # Clean up whitespace and remove empty strings
    if postcode in postcode_dict:
        postcode_dict[postcode].extend(suburbs)  # Add to existing list if postcode already exists
    else:
        postcode_dict[postcode] = suburbs  # Create new list of suburbs for postcode

# Convert dictionary to JSON format
with open("postcodes.json", 'w') as f:
    json.dump(postcode_dict, f, indent=4)

## Listing URLs

In [3]:
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

def getUrls(suburb, postcode, page):
    driver = webdriver.Chrome(seleniumwire_options={}, options=options)
    # Go to the Domain listings page
    driver.get(f'https://www.domain.com.au/rent/{suburb}-vic-{postcode}/?ssubs=0&sort=price-desc&page={page}')

    page_source = driver.page_source

    # Close the browser
    driver.quit()

    soup = BeautifulSoup(page_source, 'lxml')

    listings = soup.find_all('a', class_='address')

    return [listing['href'] for listing in listings]

In [6]:
def scrape_postcode(postcode):

    suburbs = postcode_dict[postcode]

    print("Scraping postcode", postcode)

    if not os.path.exists('domain_urls.json'):
        with open('domain_urls.json', 'w') as f:
            f.write("{}")

    with open('domain_urls.json') as f:
        data = json.load(f)
    
    data[postcode] = []

    for suburb in suburbs:
        for i in range(50):
            try:
                print("Scraping", postcode, "urls - page", i+1, end="\r")
                new_data = getUrls(suburb, postcode, i+1)
                if not new_data:
                    print("\nNo data for page", i+1,)
                    break
                data[postcode] += new_data
                with open('domain_urls.json', 'w') as f:
                    f.write(json.dumps(data, indent=4))
            except:
                break

    print(f"Done. Scraped {i} pages from {suburb}")

# EXAMPLE: scrape_postcode(3067)

In [8]:
def scrape_listing_data(url):
    # Initialize the WebDriver with ChromeOptions
    driver = webdriver.Chrome(options=options, seleniumwire_options={})
    
    # Go to the listing page
    driver.get(url)

    # Get the page source
    page_source = driver.page_source

    # Close the browser
    driver.quit()

    # Parse the page source with BeautifulSoup
    soup = BeautifulSoup(page_source, 'lxml')

    # Find the script tag containing the digitalData
    script_tag = soup.find('script', text=lambda text: text and 'digitalData' in text)
    
    # Extract the content of the script tag
    script_content = script_tag.string

    # Extract JSON data from the script content
    json_data = script_content.split('var digitalData = ')[1].split(';')[0]
    
    # Parse JSON data
    data = json.loads(json_data)

    with open('listings.json') as f:
        listings = json.load(f)
    
    listings[url] = data['page']['pageInfo']

    with open('listings.json', 'w') as f:
        f.write(json.dumps(listings, indent=4))


    # print("DATA\n", json.dumps(data, indent=4), '\n\n\n')

    # # Extract relevant information
    # property_info = data['page']['pageInfo']['property']
    # listing_data = {
    #     'address': property_info.get('address'),
    #     'price': property_info.get('price'),
    #     'bedrooms': property_info.get('bedrooms'),
    #     'bathrooms': property_info.get('bathrooms'),
    #     'agency': property_info.get('agency'),
    #     'agent': property_info.get('agentNames'),
    #     'images': property_info.get('images'),
    #     'dateListed': property_info.get('dateListed'),
    #     # Add more fields as needed
    # }

    # return listing_data

# EXAMPLE: data = scrape_listing_data('https://www.domain.com.au/906-238-flinders-street-melbourne-vic-3000-14781843')

## SCRAPE URLS FOR RENTAL PROPERTIES IN ALL OF VICTORIA

In [9]:
for postcode in postcode_dict.keys():
    scrape_postcode(postcode)

Scraping postcode 3000
Scraping 3000 urls - page 4

  self.log(traceback.format_exc(), "error")


Scraping 3000 urls - page 5

Exception ignored in: <function Reply.__del__ at 0x7af213594ea0>
Traceback (most recent call last):
  File "/home/julianbonitz/.local/lib/python3.12/site-packages/seleniumwire/thirdparty/mitmproxy/controller.py", line 132, in __del__
    raise exceptions.ControlException("Uncommitted reply: %s" % self.obj)
seleniumwire.thirdparty.mitmproxy.exceptions.ControlException: Uncommitted reply: TlsLayer(inactive)


Scraping 3000 urls - page 23

Exception ignored in: <function Reply.__del__ at 0x7af213594ea0>
Traceback (most recent call last):
  File "/home/julianbonitz/.local/lib/python3.12/site-packages/seleniumwire/thirdparty/mitmproxy/controller.py", line 132, in __del__
    raise exceptions.ControlException("Uncommitted reply: %s" % self.obj)
seleniumwire.thirdparty.mitmproxy.exceptions.ControlException: Uncommitted reply: TlsLayer(client and server)


Scraping 3000 urls - page 35

Exception ignored in: <function Reply.__del__ at 0x7af213594ea0>
Traceback (most recent call last):
  File "/home/julianbonitz/.local/lib/python3.12/site-packages/seleniumwire/thirdparty/mitmproxy/controller.py", line 132, in __del__
    raise exceptions.ControlException("Uncommitted reply: %s" % self.obj)
seleniumwire.thirdparty.mitmproxy.exceptions.ControlException: Uncommitted reply: <HTTPFlow
  request = Request(CONNECT sync-t1.taboola.com:443)
  client_conn = <ClientConnection: 127.0.0.1:34402>
  server_conn = <ServerConnection: <no address>>>


Scraping 3000 urls - page 46
No data for page 46
Done. Scraped 45 pages from Melbourne
Scraping postcode 3001
Scraping 3001 urls - page 1
No data for page 1
Done. Scraped 0 pages from Melbourne
Scraping postcode 3002
Scraping 3002 urls - page 1
No data for page 1
Done. Scraped 0 pages from East Melbourne
Scraping postcode 3003
Scraping 3003 urls - page 1
No data for page 1
Done. Scraped 0 pages from West Melbourne
Scraping postcode 3004
Scraping 3004 urls - page 6
No data for page 6
Done. Scraped 5 pages from Melbourne
Scraping postcode 3005
Scraping 3005 urls - page 1
No data for page 1
Done. Scraped 0 pages from World Trade Centre
Scraping postcode 3006
Scraping 3006 urls - page 1

Exception ignored in: <function Reply.__del__ at 0x7af213594ea0>
Traceback (most recent call last):
  File "/home/julianbonitz/.local/lib/python3.12/site-packages/seleniumwire/thirdparty/mitmproxy/controller.py", line 132, in __del__
    raise exceptions.ControlException("Uncommitted reply: %s" % self.obj)
seleniumwire.thirdparty.mitmproxy.exceptions.ControlException: Uncommitted reply: <HTTPFlow
  request = Request(CONNECT x.bidswitch.net:443)
  client_conn = <ClientConnection: 127.0.0.1:58706>
  server_conn = <ServerConnection: <no address>>>


Scraping 3006 urls - page 21
No data for page 21
Done. Scraped 20 pages from Southbank
Scraping postcode 3008
Scraping 3008 urls - page 11
No data for page 11
Done. Scraped 10 pages from Docklands
Scraping postcode 3010
Scraping 3010 urls - page 1
No data for page 1
Done. Scraped 0 pages from University Of Melbourne
Scraping postcode 3011
Scraping 3011 urls - page 9
No data for page 9
Scraping 3011 urls - page 2
No data for page 2
Done. Scraped 1 pages from Seddon
Scraping postcode 3012
Scraping 3012 urls - page 2
No data for page 2
Scraping 3012 urls - page 2
No data for page 2
Scraping 3012 urls - page 3
No data for page 3
Scraping 3012 urls - page 1
No data for page 1
Scraping 3012 urls - page 1
No data for page 1
Done. Scraped 0 pages from West Footscray
Scraping postcode 3013
Scraping 3013 urls - page 4
No data for page 4
Done. Scraped 3 pages from Yarraville
Scraping postcode 3015
Scraping 3015 urls - page 3
No data for page 3
Scraping 3015 urls - page 1
No data for page 1
Scrapi

Exception ignored in: <function Reply.__del__ at 0x7af213594ea0>
Traceback (most recent call last):
  File "/home/julianbonitz/.local/lib/python3.12/site-packages/seleniumwire/thirdparty/mitmproxy/controller.py", line 132, in __del__
    raise exceptions.ControlException("Uncommitted reply: %s" % self.obj)
seleniumwire.thirdparty.mitmproxy.exceptions.ControlException: Uncommitted reply: TlsLayer(client and server)


Scraping 3016 urls - page 4
No data for page 4
Scraping 3016 urls - page 1
No data for page 1
Done. Scraped 0 pages from Williamstown North
Scraping postcode 3018
Scraping 3018 urls - page 3
No data for page 3
Scraping 3018 urls - page 2
No data for page 2
Done. Scraped 1 pages from Seaholme
Scraping postcode 3019
Scraping 3019 urls - page 2
No data for page 2
Done. Scraped 1 pages from Braybrook
Scraping postcode 3020
Scraping 3020 urls - page 2
No data for page 2
Scraping 3020 urls - page 1
No data for page 1
Scraping 3020 urls - page 1
No data for page 1
Done. Scraped 0 pages from Sunshine West
Scraping postcode 3021
Scraping 3021 urls - page 2
No data for page 2
Scraping 3021 urls - page 2
No data for page 2
Scraping 3021 urls - page 1
No data for page 1
Scraping 3021 urls - page 1
No data for page 1
Done. Scraped 0 pages from St Albans
Scraping postcode 3022
Done. Scraped 0 pages from Ardeer
Scraping postcode 3023
Scraping 3023 urls - page 1

## Data from each listing

# Scraping realestate.com.au

In [75]:
import requests
from bs4 import BeautifulSoup
import json
from fake_useragent import UserAgent

ua = UserAgent()

url_stem = 'https://www.realestate.com.au/rent/in-victoria/list-'
page = '1'

cookies = {
    # Extract the `list1` cURL request from Chrome Dev Tools
    # -> Network @ https://www.realestate.com.au/rent/in-victoria/list-1
    # Use https://curlconverter.com/ to retrieve this cookie session token
    'KP_UIDz-ssn': '0CrCafki39X6xXWdV46WLyzxgC7g2k4zNGaNu4W9z1umOYQ7J4CdFuuwR7LgmrlOcV5QgUC9LxRCVorsvoh5aWvVTZ47dyB8fUkm8gdu0JQ9eKOCpH8REXxHVUIxBcPBSNCyaeNvzmqz95b5B4JV7OZtV8bmVcf27oPvwmac',
}

headers = {
    # Similarly extract your user-agent header from the cURL request ^
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
}

def getPageOfRentals(page):
    url = url_stem+str(page)
    print(url, end='\r')

    response = requests.get(url, cookies=cookies, headers=headers)

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extracting the JSON data embedded in the script tag
    json_data = None
    for script in soup.find_all('script', type='application/ld+json'):
        try:
            json_data = json.loads(script.string)
            break  # Assuming we only need the first JSON found
        except json.JSONDecodeError:
            continue

    # Assuming json_data is a list of dictionaries with house details
    houses = []
    if json_data:
        for item in json_data:
            if item.get('@type') == 'Event':  # Filtering for house listings
                house_details = {
                    'name': item['location']['name'],
                    'address': item['location']['address']['streetAddress'],
                    'locality': item['location']['address']['addressLocality'],
                    'region': item['location']['address']['addressRegion'],
                    'postalCode': item['location']['address']['postalCode'],
                    'startDate': item['startDate'],
                    'endDate': item['endDate'],
                    'url': item['url']
                }
                houses.append(house_details)

    return houses

In [96]:
print(
    getPageOfRentals(1)
)

https://www.realestate.com.au/rent/in-victoria/list-1
[]


In [77]:
pages = []

i = 1

for i in range(52, 80):
    try:
        new_page = getPageOfRentals(i+1)
        if not new_page:
            print("\nEMPTY PAGE",i+1)
            break
        pages.append(new_page)
    except Exception as e:
        print("\nFAILED GET PAGE", i+1, e)
        break

print("\nDone")

https://www.realestate.com.au/rent/in-victoria/list-53
https://www.realestate.com.au/rent/in-victoria/list-54
https://www.realestate.com.au/rent/in-victoria/list-55
https://www.realestate.com.au/rent/in-victoria/list-56
https://www.realestate.com.au/rent/in-victoria/list-57
https://www.realestate.com.au/rent/in-victoria/list-58
https://www.realestate.com.au/rent/in-victoria/list-59
https://www.realestate.com.au/rent/in-victoria/list-60
https://www.realestate.com.au/rent/in-victoria/list-61
https://www.realestate.com.au/rent/in-victoria/list-62
https://www.realestate.com.au/rent/in-victoria/list-63
https://www.realestate.com.au/rent/in-victoria/list-64
https://www.realestate.com.au/rent/in-victoria/list-65
https://www.realestate.com.au/rent/in-victoria/list-66
https://www.realestate.com.au/rent/in-victoria/list-67
https://www.realestate.com.au/rent/in-victoria/list-68
https://www.realestate.com.au/rent/in-victoria/list-69
https://www.realestate.com.au/rent/in-victoria/list-70
https://ww

In [92]:
# with open('saved_pages.json', 'w') as f:
#     f.write(json.dumps(pages, indent=4))

In [95]:
urls = []
for page in pages:
    for house in page:
        urls.append(house['url'])
        
print(urls)

['https://www.realestate.com.au/property-apartment-vic-doncaster-437896292', 'https://www.realestate.com.au/property-townhouse-vic-box+hill+south-439595188', 'https://www.realestate.com.au/property-house-vic-brunswick-439997368', 'https://www.realestate.com.au/property-house-vic-west+footscray-439997292', 'https://www.realestate.com.au/property-townhouse-vic-reservoir-439804068', 'https://www.realestate.com.au/property-townhouse-vic-reservoir-439804552', 'https://www.realestate.com.au/property-townhouse-vic-reservoir-439987380', 'https://www.realestate.com.au/property-house-vic-bundoora-439997180', 'https://www.realestate.com.au/property-apartment-vic-port+melbourne-439997152', 'https://www.realestate.com.au/property-townhouse-vic-thornhill+park-439997136', 'https://www.realestate.com.au/property-house-vic-wantirna-439997096', 'https://www.realestate.com.au/property-apartment-vic-box+hill-439997008', 'https://www.realestate.com.au/property-house-vic-northcote-439896100', 'https://www.r