# Crawling Domain Datasets

## Load Library

In [2]:
import requests
from bs4 import BeautifulSoup
import csv

## Crawling Domain Website

In [5]:
# Define the Initial URL of Domain
base_url = 'https://www.domain.com.au/rent/vic/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Assume the number of pages to traverse
max_pages = 5

# The list to Store all URLs
page_urls = []

# Traverse all pages
for page_number in range(1, max_pages + 1):
    if page_number == 1:
        page_url = base_url
    else:
        page_url = f"{base_url}&page={page_number}"
    page_urls.append(page_url)

print(page_urls)


['https://www.domain.com.au/rent/vic/', 'https://www.domain.com.au/rent/vic/&page=2', 'https://www.domain.com.au/rent/vic/&page=3', 'https://www.domain.com.au/rent/vic/&page=4', 'https://www.domain.com.au/rent/vic/&page=5']


In [6]:
# Initializes a list to store all property links
property_list = []

# Iterate through each page's pagination links
for page_url in page_urls:
    page = requests.get(page_url, headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')
    listings = soup.find_all('li', class_="css-1qp9106")
    
    # Find the listing link and add them to the property_list
    for listing in listings:
        property_link = listing.find('a', href=True)
        if property_link:
            property_url = property_link['href']
            if property_url.startswith('/'):
                property_url = 'https://www.domain.com.au' + property_url
            property_list.append(property_url)


print(property_list)

['https://www.domain.com.au/31-154-rathmines-road-hawthorn-east-vic-3123-17245256', 'https://www.domain.com.au/406-422-collins-street-melbourne-vic-3000-17245214', 'https://www.domain.com.au/2008-65-dudley-street-west-melbourne-vic-3003-17245191', 'https://www.domain.com.au/4-798-warrigal-road-malvern-east-vic-3145-15783166', 'https://www.domain.com.au/6a-victoria-street-st-kilda-vic-3182-17245062', 'https://www.domain.com.au/53-marriage-road-brighton-east-vic-3187-17245034', 'https://www.domain.com.au/8-orford-street-moonee-ponds-vic-3039-17245022', 'https://www.domain.com.au/32-bridport-street-south-melbourne-vic-3205-17244999', 'https://www.domain.com.au/709-480-riversdale-road-hawthorn-vic-3122-17244953', 'https://www.domain.com.au/2316-65-dudley-street-west-melbourne-vic-3003-17244942', 'https://www.domain.com.au/310-408-lonsdale-street-melbourne-vic-3000-16040182', 'https://www.domain.com.au/102-35-spring-street-melbourne-vic-3000-17244878', 'https://www.domain.com.au/205e-93-119

In [7]:
# Initialise a list to store the details of each listing
property_details = []

for property_url in property_list:
    page = requests.get(property_url, headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')
    
    # Extract Rent
    price_span = soup.find('div', {'data-testid': 'listing-details__summary-title'}).find('span')
    price = price_span.text.strip() if price_span else 'No price found'
    
    # Extract Address
    address_h1 = soup.find('h1', class_="css-164r41r")
    address = address_h1.text.strip() if address_h1 else 'No address found'

    # Extract the Number of bedrooms, bathrooms and parkings
    beds = soup.find('span', {'data-testid': 'property-features-text'}, string=['Bed', 'Beds'])
    beds_number = beds.find_previous('span', {'class': 'css-lvv8is'}).text.strip() if beds else 'No beds info'

    baths = soup.find('span', {'data-testid': 'property-features-text'}, string=['Bath', 'Baths'])
    baths_number = baths.find_previous('span', {'class': 'css-lvv8is'}).text.strip() if baths else 'No baths info'

    parking = soup.find('span', {'data-testid': 'property-features-text'}, string='Parking')
    parking_number = parking.find_previous('span', {'class': 'css-lvv8is'}).text.strip() if parking else 'No parking info'
  
    # Store the information into the dictionary
    property_info = {
        'URL': property_url,
        'Price': price,
        'Address': address,
        'Beds': beds_number,
        'Baths': baths_number,
        'Parking': parking_number,
    }
    
    property_details.append(property_info)

for detail in property_details:
    print(detail)

{'URL': 'https://www.domain.com.au/31-154-rathmines-road-hawthorn-east-vic-3123-17245256', 'Price': '$570.00', 'Address': '31/154 Rathmines Road, Hawthorn East VIC 3123', 'Beds': '2 Beds', 'Baths': '1 Bath', 'Parking': '2 Parking'}
{'URL': 'https://www.domain.com.au/406-422-collins-street-melbourne-vic-3000-17245214', 'Price': '$575 per week', 'Address': '406/422 Collins Street, Melbourne VIC 3000', 'Beds': '2 Beds', 'Baths': '1 Bath', 'Parking': '− Parking'}
{'URL': 'https://www.domain.com.au/2008-65-dudley-street-west-melbourne-vic-3003-17245191', 'Price': '$630 per Week', 'Address': '2008/65 Dudley Street, West Melbourne VIC 3003', 'Beds': '1 Bed', 'Baths': '1 Bath', 'Parking': '− Parking'}
{'URL': 'https://www.domain.com.au/4-798-warrigal-road-malvern-east-vic-3145-15783166', 'Price': '$350', 'Address': '4/798 Warrigal Road, Malvern East VIC 3145', 'Beds': '1 Bed', 'Baths': '1 Bath', 'Parking': '1 Parking'}
{'URL': 'https://www.domain.com.au/6a-victoria-street-st-kilda-vic-3182-172

## Store in CSV Document

In [8]:
# Define Field Names for CSV
fieldnames = ["Price", "Address", "Beds", "Baths", "Parking"]

# Create and write into CSV
with open("../Data/combined_house", "w", newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for property in property_details:
        writer.writerow({"Price": property["Price"], "Address": property["Address"], "Beds": property["Beds"], "Baths": property["Baths"], "Parking": property["Parking"]})