### This section uses beautiful soup to retrieve current data on domain.com.au

Reference https://www.xbyte.io/how-to-use-python-to-scrape-real-estate-website-data-using-web-scraping-and-making-data-wrangling.php

### 1. Scraping

In [1]:
# built-in imports
import re
from json import dump

from collections import defaultdict

# user packages
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests

#### Find how many properties are avaiable under this postcode (trial using postcode = 3000)

In [39]:
BASE_URL = "https://www.domain.com.au"
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.47"}

url = BASE_URL + f"/rent/?postcode=3000&sort=dateupdated-desc"
bs_object = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")
title_tag = bs_object.find("h1")
r1 = "<strong>(\d+)\s"
pat = re.compile(r1)
how_many = pat.findall(str(title_tag))[0]
print(int(how_many) // 20)
type(how_many)

29


str

#### Now for each postcode, compute the number of end page by property number // 20, e.g. postcode 3000 has 595 properties, so end page 595//20 = 29

In [46]:
# constants
BASE_URL = "https://www.domain.com.au"

# begin code
url_links = []
property_metadata = defaultdict(dict)
# acts as an authentic browser
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.47"}

for code in range(3000, 3050):
    url = BASE_URL + f"/rent/?postcode={code}&sort=dateupdated-desc"
    bs_object = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")
    
    # Find how many properties are avaiable under this postcode
    title_tag = bs_object.find("h1")
    r1 = "<strong>(\d+)\s"
    pat = re.compile(r1)
    how_many = pat.findall(str(title_tag))[0]
    # if postcode invalid, e.g. 3001, then end page = 0
    end_page = int(how_many) // 20
    print(f"postcode = {code} end page = {end_page}")
    N_PAGES = range(1, end_page+1)

    # generate list of urls to visit
    for page in N_PAGES:
        url = url + f"&page={page}"
        bs_object = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")
        # find the unordered list (ul) elements which are the results, then
        # find all href (a) tags that are from the base_url website.

        index_links = bs_object \
            .find(
                "ul",
                {"data-testid": "results"}
            ) \
            .findAll(
                "a",
                href=re.compile(f"{BASE_URL}/*") # the `*` denotes wildcard any
            )

        for link in index_links:
            # if its a property address, add it to the list
            if 'address' in link['class']:
                url_links.append(link['href'])

end page = 29
end page = 0
end page = 2
end page = 3
end page = 4
end page = 0
end page = 10
end page = 0
end page = 6
end page = 0
end page = 0
end page = 10
end page = 6
end page = 4
end page = 0
end page = 3
end page = 3
end page = 0
end page = 2
end page = 1
end page = 6
end page = 6
end page = 0
end page = 5
end page = 5
end page = 2
end page = 0
end page = 1
end page = 4
end page = 13
end page = 13
end page = 5
end page = 7
end page = 1
end page = 1
end page = 0
end page = 0
end page = 2
end page = 0
end page = 4
end page = 5
end page = 1
end page = 2
end page = 2
end page = 4
end page = 0
end page = 7
end page = 3
end page = 1
end page = 0


In [47]:
print(len(url_links))
url_links

3660


['https://www.domain.com.au/2504-36-la-trobe-street-melbourne-vic-3000-15220874',
 'https://www.domain.com.au/2606-157-a-beckett-street-melbourne-vic-3000-15043424',
 'https://www.domain.com.au/818-139-lonsdale-street-melbourne-vic-3000-15284899',
 'https://www.domain.com.au/304-58-la-trobe-street-melbourne-vic-3000-16069105',
 'https://www.domain.com.au/1716-135-a-beckett-street-melbourne-vic-3000-15295747',
 'https://www.domain.com.au/701-101-therry-st-melbourne-vic-3000-16069017',
 'https://www.domain.com.au/205-36-la-trobe-street-melbourne-vic-3000-16068935',
 'https://www.domain.com.au/1407-601-little-lonsdale-street-melbourne-vic-3000-16068933',
 'https://www.domain.com.au/1610-157-a-beckett-street-melbourne-vic-3000-14657393',
 'https://www.domain.com.au/208-565-flinders-street-melbourne-vic-3000-6173345',
 'https://www.domain.com.au/4507-318-russell-street-melbourne-vic-3000-16068548',
 'https://www.domain.com.au/1412-25-therry-street-melbourne-vic-3000-16068459',
 'https://www

#### Now, we also have the URLs about every property of Melbourne, Australia. Every URL is exclusive for every property in Melbourne. So, the next step would be, going inside every URL as well as scrape prices, total bedrooms, total bathrooms, total parking, addresses and locations.
##### Note: Ignore the output if it seems too dizzy, only for debugging purpose

In [48]:
# removing duplicate links while maintaining the order of urls
abc_links = [] 
for i in url_links: 
    if i not in abc_links: 
        abc_links.append(i) 
        
# defining required regural expression for data extraction     
pat_feature = re.compile(r'text\">(.+?)<\/span>|-->(.?)<\/span>')
pat_value = re.compile(r">(.+?)<!")
pattern1 = re.compile(r'>(.+)<.')
pattern2 = re.compile(r'destination=(.+)" rel=.')
basic_feature_list = []
# loop to iterate through each url
for property_url in abc_links:
    
    bsobj = BeautifulSoup(requests.get(property_url, headers=headers).text, "html.parser")
    
    # extracting address/name of property
    property_name = bsobj.find("h1", {"class": "css-164r41r"})
    
    # extracting baths, rooms, parking etc
    all_basic_features = bsobj.find("div", {"data-testid": "property-features-wrapper"}).findAll("span", {"data-testid": "property-features-text-container"})
    print(f"length of all basic features = {len(all_basic_features)}")
    print(all_basic_features)
    # extracting property price
    property_price = bsobj.find("div", {"data-testid": "listing-details__summary-title"})
    
    # extracting latitudes and longitudes
    lat_long = bsobj.find("a", {"target": "_blank", 'rel': "noopener noreferer"})
    
    # dictionary to store temporary data
    basic_feature_dict = {}
    
    # few properties does not contain all the 4 features such as rooms, baths, parkings, area. So need to check
    # how many features they contain
    if len(all_basic_features) == 4:
        basic_feature_dict[pat_feature.findall(str(all_basic_features[0]))[0][0]] = pat_value.findall(str(all_basic_features[0]))[0]
        basic_feature_dict[pat_feature.findall(str(all_basic_features[1]))[0][0]] = pat_value.findall(str(all_basic_features[1]))[0]
        basic_feature_dict[pat_feature.findall(str(all_basic_features[2]))[0][0]] = pat_value.findall(str(all_basic_features[2]))[0]
        basic_feature_dict['area'] = pat_value.findall(str(all_basic_features[3]))[0]
        
    elif len(all_basic_features) == 3:
        basic_feature_dict[pat_feature.findall(str(all_basic_features[0]))[0][0]] = pat_value.findall(str(all_basic_features[0]))[0]
        basic_feature_dict[pat_feature.findall(str(all_basic_features[1]))[0][0]] = pat_value.findall(str(all_basic_features[1]))[0]
        basic_feature_dict[pat_feature.findall(str(all_basic_features[2]))[0][0]] = pat_value.findall(str(all_basic_features[2]))[0]
        
    elif len(all_basic_features) == 2:
        basic_feature_dict[pat_feature.findall(str(all_basic_features[0]))[0][0]] = pat_value.findall(str(all_basic_features[0]))[0]
        basic_feature_dict[pat_feature.findall(str(all_basic_features[1]))[0][0]] = pat_value.findall(str(all_basic_features[1]))[0]
        
    elif len(all_basic_features) == 1:
        basic_feature_dict[pat_feature.findall(str(all_basic_features[0]))[0][0]] = pat_value.findall(str(all_basic_features[0]))[0]
# putting 'none' if price is missing    
    if property_price is None:
        basic_feature_dict['price'] = None
        
    else:
        basic_feature_dict['price'] = pattern1.findall(str(property_price))[0]
        
    # putting 'none' if property name/address is missing       
    if property_name is None:
        basic_feature_dict['name'] = None
        
    else:
        basic_feature_dict['name'] = pattern1.findall(str(property_name))[0]
        
    # putting 'none' if latitude and logitude are missing        
    if lat_long is None:
        basic_feature_dict['lat'] = None
        basic_feature_dict['long'] = None
        
    else:
        basic_feature_dict['lat'] = pattern2.findall(str(lat_long))[0].split(',')[0]
        basic_feature_dict['long'] = pattern2.findall(str(lat_long))[0].split(',')[1]
# appending all the data into a list
    basic_feature_list.append(basic_feature_dict)

length of all basic features = 3
[<span class="css-lvv8is" data-testid="property-features-text-container">2<!-- --> <span class="css-9fxapx" data-testid="property-features-text">Beds</span></span>, <span class="css-lvv8is" data-testid="property-features-text-container">1<!-- --> <span class="css-9fxapx" data-testid="property-features-text">Bath</span></span>, <span class="css-lvv8is" data-testid="property-features-text-container">−<!-- --> <span class="css-12a1b0h" data-testid="property-features-text">Parking</span></span>]
length of all basic features = 3
[<span class="css-lvv8is" data-testid="property-features-text-container">2<!-- --> <span class="css-9fxapx" data-testid="property-features-text">Beds</span></span>, <span class="css-lvv8is" data-testid="property-features-text-container">1<!-- --> <span class="css-9fxapx" data-testid="property-features-text">Bath</span></span>, <span class="css-lvv8is" data-testid="property-features-text-container">−<!-- --> <span class="css-12a1b0h" 

In [49]:
basic_feature_list

[{'Beds': '2',
  'Bath': '1',
  'Parking': '−',
  'price': '$540/week',
  'name': '2504/36 La Trobe Street Melbourne VIC 3000',
  'lat': '-37.8081637',
  'long': '144.967594'},
 {'Beds': '2',
  'Bath': '1',
  'Parking': '−',
  'price': '$690/w',
  'name': "2606/157 A'beckett street Melbourne VIC 3000",
  'lat': '-37.8099156',
  'long': '144.9591846'},
 {'Beds': '2',
  'Bath': '1',
  'Parking': '−',
  'price': '$380 pw',
  'name': '818/139 Lonsdale Street Melbourne VIC 3000',
  'lat': '-37.8107954',
  'long': '144.9683039'},
 {'Bed': '1',
  'Bath': '1',
  'Parking': '−',
  'price': '$380 per week',
  'name': '304/58 La Trobe Street Melbourne VIC 3000',
  'lat': '-37.8084287',
  'long': '144.9670537'},
 {'Beds': '2',
  'Bath': '1',
  'Parking': '1',
  'price': '$630 per week',
  'name': "1716/135 A'Beckett Street Melbourne VIC 3000",
  'lat': '-37.8097879',
  'long': '144.9596466'},
 {'Bed': '1',
  'Bath': '1',
  'Parking': '−',
  'price': 'NEW corner apartment Keep your bond',
  'name':

#### Now, an output of a given code provides us the listing of dictionaries having all the accessible scraped data. Here, we would convert that into different individual lists as we need to do a bit more cleaning as well as scraping of above-mined data as well as it would become easier to perform in the lists.

In [50]:
# creating empty lists
beds_list = []
baths_list = []
parking_list = []
area_list = []
name_list = []
lat_list = []
long_list = []
price_list = []
# interating through list created above with data
for row in basic_feature_list:
    
    # checking if the row cointains 'Beds', 'Bed' or nothing
    if 'Beds' in row:
        beds_list.append(row['Beds'])
    elif 'Bed' in row:
        beds_list.append(row['Bed'])
    elif 'beds' in row:
        beds_list.append(row['beds'])
    elif 'bed' in row:
        beds_list.append(row['bed'])
    else:
        beds_list.append(None)
        
    # checking if the row cointains 'Baths', 'Bath' or nothing    
    if 'Baths' in row:
        baths_list.append(row['Baths'])
    elif 'Bath ' in row:
        baths_list.append(row['Bath'])
    elif 'baths' in row:
        baths_list.append(row['baths'])
    elif 'bath' in row:
        baths_list.append(row['bath'])
    else:
        baths_list.append(None)
        
    # checking if the row cointains 'Parking', '-' or nothing     
    if 'Parking' in row and row['Parking'] != '−':
        parking_list.append(row['Parking'])
    else:
        parking_list.append(None)
        
    # checking if the row cointains area, or empty string. Because empty string (i.e. '') reprsents area  
    if 'area' in row:
        area_list.append(row['area'])
    elif '' in row:
        area_list.append(row[''])
    else:
        area_list.append(None)
        
# checking if the row cointains 'name' that is address of property         
    if 'name' in row:
        name_list.append(row['name'])
    else:
        name_list.append(None)
    
    # checking if the row cointains 'price'         
    if 'price' in row:
        price_list.append(row['price'])
    else:
        price_list.append(None)        
    
    # checking if the row cointains 'lat' that is lattitude of property         
    if 'lat' in row:
        lat_list.append(row['lat'])
    else:
        lat_list.append(None)  
        
    # checking if the row cointains 'long' that is lattitude of property             
    if 'long' in row:
        long_list.append(row['long'])
    else:
        long_list.append(None)

In [51]:
price_list

['$540/week',
 '$690/w',
 '$380 pw',
 '$380 per week',
 '$630 per week',
 'NEW corner apartment Keep your bond',
 '$450',
 '$750 per week',
 '$490 per week',
 '$400',
 '$650 per week',
 '$500 per week',
 'New apartment No bond needed Pets welcome',
 '$460/week',
 '$650',
 '$450 per week',
 '$550.00',
 '$500',
 '400 per week',
 '$630 Per Week',
 '$500 per week',
 '$590',
 '$360 PER WEEK',
 '$650 per week',
 '$480.00',
 '$750 per week',
 '$450.00',
 '$400.00',
 '$800 per week',
 'Furnished $600/week',
 '$550 per week',
 '$440',
 '$540/week',
 '$530 per week',
 '$580 per week *Fully Furnished*',
 '$500 per week',
 '$500',
 '$400 per week',
 '$620 per week',
 '$630',
 '$600 per week',
 '$530 per week',
 '$525 Per Week',
 '$450 per week',
 '$530 pw *Plus Study',
 '$550 per week',
 '$295.00 per week',
 '$700/W Furnished',
 'Unfurnished $500/week',
 '$320',
 '$700',
 '$400 PER WEEK',
 '$650/week',
 '$580',
 '$450 per week *Unfurnished*',
 '$410 per week',
 '$450',
 '$775/week',
 '$515 per wee

### Save the raw scraped data

In [67]:
import pandas as pd
house_dict = {}
house_dict['bedrooms'] = beds_list
house_dict['bathrooms'] = baths_list
house_dict['carspaces'] = parking_list
house_dict['building_area'] = area_list
house_dict['street_address'] = name_list
house_dict['latitude'] = lat_list
house_dict['longitude'] = long_list
house_dict['price'] = price_list # uncleaned price, pure price text
house_df = pd.DataFrame(house_dict)
house_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3660 entries, 0 to 3659
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   bedrooms        3614 non-null   object
 1   bathrooms       1585 non-null   object
 2   carspaces       2827 non-null   object
 3   building_area   145 non-null    object
 4   street_address  3660 non-null   object
 5   latitude        3660 non-null   object
 6   longitude       3660 non-null   object
 7   price           3660 non-null   object
dtypes: object(8)
memory usage: 228.9+ KB


In [68]:
house_df.to_csv('BS_Raw_PropertiesInfo.csv', index=False)

In [69]:
# raw house_df
print(house_df.shape)
house_df.head(60)

(3660, 8)


Unnamed: 0,bedrooms,bathrooms,carspaces,building_area,street_address,latitude,longitude,price
0,2.0,,,,2504/36 La Trobe Street Melbourne VIC 3000,-37.8081637,144.967594,$540/week
1,2.0,,,,2606/157 A'beckett street Melbourne VIC 3000,-37.8099156,144.9591846,$690/w
2,2.0,,,,818/139 Lonsdale Street Melbourne VIC 3000,-37.8107954,144.9683039,$380 pw
3,1.0,,,,304/58 La Trobe Street Melbourne VIC 3000,-37.8084287,144.9670537,$380 per week
4,2.0,,1.0,,1716/135 A'Beckett Street Melbourne VIC 3000,-37.8097879,144.9596466,$630 per week
5,1.0,,,,701/101 Therry St Melbourne VIC 3000,-37.8074149,144.9591227,NEW corner apartment Keep your bond
6,1.0,,,,205/36 La Trobe Street Melbourne VIC 3000,-37.8081637,144.967594,$450
7,2.0,2.0,1.0,,1407/601 Little Lonsdale Street Melbourne VIC ...,-37.8137564,144.9537143,$750 per week
8,1.0,,,,1610/157 A'Beckett Street Melbourne VIC 3000,-37.8099156,144.9591846,$490 per week
9,1.0,,,,208/565 Flinders Street Melbourne VIC 3000,-37.8210586,144.9559072,$400
