# Domain Scrape

In [120]:
from bs4 import BeautifulSoup
import requests

import unicodedata
from csv import writer
import re

import pandas as pd
import numpy as np

import json
from pyspark.sql import SparkSession
import os

In [74]:
headers = {"User-Agent":
           "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}

# home url of domian.com australia
home_url = "https://www.domain.com.au"

In [78]:
#configuration
postcode_range = [line.rstrip() for line in open('postcode.txt')]

#postcode of properties
postcode_list = [i for i in range(int(postcode_list[0]),int(postcode_list[1]))]

In [87]:
# return unique extracted link for given postcode
def link_by_postcode(postcode):
    
    # list to store all the urls of properties
    list_of_links = []
    
    # store unique link
    abc_links = [] 
    
    # number of pages of search result are 50
    page_numbers = list(range(50))[1:50]
    
    # for loop for all 50 search(melbourne region) pages
    for page in page_numbers:

        # extracting html document of search page
        url = home_url + f"/rent/melbourne-vic-{postcode}/?excludedeposittaken=1&page={page}"

        # parsing html document to 'lxml' format
        bs_object = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")
        
        #break if current page not exist or no links found
        if ((bs_object is None) | (bs_object.find("ul", {"data-testid": "results"}) is None)):
            break
        
        #for each page, finding all the links available in 'ul' tag whos 'data-testid' is 'results'
        all_links = bs_object.find(
            "ul", {"data-testid": "results"}).findAll("a", href=re.compile("https://www.domain.com.au/*"))

        # inner loop to find links inside each property page because few properties are project so they have more properties inside their project page
        for link1 in all_links:
            # checking if it is a project and then performing similar above
            if 'project' in link1.attrs['href']:
                inner1_url = link1.attrs['href']
                inner1_bsobj = BeautifulSoup(requests.get(inner1_url, headers=headers).text, "html.parser")
                for link2 in inner1_bsobj.find("div", {"name": "listing-details__other-listings"}).findAll("a", href=re.compile("https://www.domain.com.au/*")):
                    if 'href' in link2.attrs:
                        list_of_links.append(link2.attrs['href'])
            else:
                list_of_links.append(link1.attrs['href'])
   
    # remove deplicated links
    for i in list_of_links: 
        if i not in abc_links: 
            abc_links.append(i) 
    return abc_links


In [88]:
def links_for_postcodes(postcode_list):
    all_links = []
    for postcode in postcode_list:
        current_postcode_links = link_by_postcode(postcode)
        all_links.extend(current_postcode_links)
    return list(set(all_links))
        

In [91]:
all_links = links_for_postcodes(postcode_list)

In [32]:
# defining required regural expression for data extraction     
pattern = re.compile(r'>(.+)<!.*>(.+?)</span>.*')
pattern1 = re.compile(r'>(.+)<.')
pattern2 = re.compile(r'destination=(.+)" rel=.')
#school
pattern_school = re.compile(r'>(.+)<')
pattern_distance = re.compile(r'">(.+)<!')
#street information
pattern_address1 = re.compile(r'>(.+)<!.*>')
pattern_address2 = re.compile(r'>(.+)<.')

In [106]:
basic_feature_list = []


# loop to iterate through each url
for link in all_links:
    
    # opening urls
    bs_object = BeautifulSoup(requests.get(link, headers=headers).text, "html.parser")
    
    # extracting address/name of property
    property_name = bs_object.find("h1", {"class": "css-164r41r"})
    
    # extracting baths, rooms, parking etc
    all_basic_features = bs_object.find("div", {"class": "css-ghc6s4"}).findAll("span", {"data-testid": "property-features-text-container"})
    
    # extracting property price
    property_price = bs_object.find("div", {"data-testid": "listing-details__summary-title"})
    
    # extracting latitudes and longitudes
    lat_long = bs_object.find("a", {"target": "_blank", 'rel': "noopener noreferer"})
    
    #extract schoolname, school distance
    # school= bs_object.find("div", {"data-testid": "listing-details__school-catchment"}).findAll("h5",{"class": "css-5w5cop"})
    school_data = json.loads(bs_object.find("script", {"id": "__NEXT_DATA__"}).text)
    school = [s['name'] for s in school_data['props']['pageProps']['componentProps']['schoolCatchment']['schools']]
    school_distance= [s['distance'] for s in school_data['props']['pageProps']['componentProps']['schoolCatchment']['schools']]
  
    # address info
    address1 = bs_object.find("span", {"data-testid": "address-line1"})
    address2 = bs_object.find("span", {"data-testid": "address-line2"})
    
    #recent rented
    recent_rented_number = bs_object.find("div", {"class": "css-12uo2x5"})
    
    
    #neighbourhood family vs single ratio
    neighb_family = bs_object.find("span", {"class": "css-1ruej2h"})
    neighb_single = bs_object.find("span", {"class": "css-1ymya4c"})
    
    
    #street owner vs renter
    street_renter = bs_object.find("span", {"class": "css-1ejmyle"})
    street_owner = bs_object.find("span", {"class": "css-1jhlvvv"})

    
    #long term resident percentage
    long_term_resi= bs_object.find("div", {"class": "css-ibsnk8"})
    
    
    # dictionary to store temporary data
    basic_feature_dict = {}
    
    # few properties does not contain all the 4 features such as rooms, baths, parkings, area. So need to check
    # how many features they contain
    if len(all_basic_features) == 4:
        basic_feature_dict[pattern.findall(str(all_basic_features[0]))[0][1]] = pattern.findall(str(all_basic_features[0]))[0][0]
        basic_feature_dict[pattern.findall(str(all_basic_features[1]))[0][1]] = pattern.findall(str(all_basic_features[1]))[0][0]
        basic_feature_dict[pattern.findall(str(all_basic_features[2]))[0][1]] = pattern.findall(str(all_basic_features[2]))[0][0]
        basic_feature_dict[pattern.findall(str(all_basic_features[3]))[0][1]] = pattern.findall(str(all_basic_features[3]))[0][0]
        
    elif len(all_basic_features) == 3:
        basic_feature_dict[pattern.findall(str(all_basic_features[0]))[0][1]] = pattern.findall(str(all_basic_features[0]))[0][0]
        basic_feature_dict[pattern.findall(str(all_basic_features[1]))[0][1]] = pattern.findall(str(all_basic_features[1]))[0][0]
        basic_feature_dict[pattern.findall(str(all_basic_features[2]))[0][1]] = pattern.findall(str(all_basic_features[2]))[0][0]
        
    elif len(all_basic_features) == 2:
        basic_feature_dict[pattern.findall(str(all_basic_features[0]))[0][1]] = pattern.findall(str(all_basic_features[0]))[0][0]
        basic_feature_dict[pattern.findall(str(all_basic_features[1]))[0][1]] = pattern.findall(str(all_basic_features[1]))[0][0]
        
    elif len(all_basic_features) == 1:
        basic_feature_dict[pattern.findall(str(all_basic_features[0]))[0][1]] = pattern.findall(str(all_basic_features[0]))[0][0]

        
        
    # putting 'none' if school is missing    
    if school is None:
        basic_feature_dict['school'] = None
        
    else:
        basic_feature_dict['school'] = school
    
    # putting 'none' if school distance is missing   
    if school_distance is None:
        basic_feature_dict['school_distance'] = None
        
    else:
        basic_feature_dict['school_distance'] = school_distance
     
    
    
    # putting 'none' if address(street) distance is missing   
    if address1 is None:
        basic_feature_dict['address1'] = None
        
    else:
        basic_feature_dict['address1'] = pattern_address1.findall(str(address1))
        
    # putting 'none' if address2 is missing   
    if address2 is None:
        basic_feature_dict['address2'] = None
        
    else:
        basic_feature_dict['address2'] = sum([pattern_address2.findall(str(i)) for i in address2],[])
    
    # putting 'none' if recent_rented number is missing   
    if recent_rented_number is None:
        basic_feature_dict['recent_rented_number'] = None
        
    else:
        basic_feature_dict['recent_rented_number'] = pattern1.findall(str(recent_rented_number))[0]
        
    
    
    # putting 'none' if longterm resident number is missing   
    if long_term_resi is None:
        basic_feature_dict['long_term_resi_percentage'] = None
        
    else:
        basic_feature_dict['long_term_resi_percentage']  = float(str(long_term_resi.text[:-1]))/100

    
    
    # putting 'none' if neighbourhood family/single number is missing   
    if (neighb_family is None) |(neighb_single is None) :
        basic_feature_dict['neighb_family_single_ratio'] = None
        
    else:
        #to avoid zero division
        add_on = 0.000001
        basic_feature_dict['neighb_family_single_ratio'] = float(str(neighb_family.text[:-1]))/(float(str(neighb_single.text[:-1]))+add_on)
        

        
    
    # putting 'none' if street owener/renter number is missing   
    if (street_owner is None) | (street_renter is None):
        basic_feature_dict['street_renter_owner_ratio'] = None
             
    else:
        #to avoid zero division
        add_on = 0.000001
        basic_feature_dict['street_renter_owner_ratio'] = float(str(street_renter.text[:-1]))/(float(str(street_owner.text[:-1]))+add_on)


        
    # putting 'none' if price is missing    
    if property_price is None:
        basic_feature_dict['price'] = None
        
    else:
        basic_feature_dict['price'] = pattern1.findall(str(property_price))[0]
        
    # putting 'none' if property name/address is missing       
    if property_name is None:
        basic_feature_dict['name'] = None
        
    else:
        basic_feature_dict['name'] = pattern1.findall(str(property_name))[0]
        
    # putting 'none' if latitude and logitude are missing        
    if lat_long is None:
        basic_feature_dict['lat'] = None
        basic_feature_dict['long'] = None
        
    else:
        basic_feature_dict['lat'] = pattern2.findall(str(lat_long))[0].split(',')[0]
        basic_feature_dict['long'] = pattern2.findall(str(lat_long))[0].split(',')[1]
        
    # appending all the data into a list
    basic_feature_list.append(basic_feature_dict)

In [107]:
# creating empty lists
beds_list = []
baths_list = []
parking_list = []
area_list = []
name_list = []
lat_list = []
long_list = []
price_list = []
school_list = []
school_distance_list =[]
address1_list = []
address2_list = []
recent_rented_number_list = []

street_renter_owner_ratio_list = []
neighb_family_single_ratio_list = []
long_term_resi_percentage_list =[]



# interating through list created above with data
for row in basic_feature_list:
    
    # checking if the row cointains 'Beds', 'Bed' or nothing
    if 'Beds' in row:
        beds_list.append(row['Beds'])
    elif 'bed' in row:
        beds_list.append(row['Bed'])
    else:
        beds_list.append(None)
        
    # checking if the row cointains 'Baths', 'Bath' or nothing    
    if 'Baths' in row:
        baths_list.append(row['Baths'])
    elif 'Bath ' in row:
        baths_list.append(row['Bath'])
    else:
        baths_list.append(None)
        
    # checking if the row cointains 'Parking', '-' or nothing     
    if 'Parking' in row and row['Parking'] != '−':
        parking_list.append(row['Parking'])
    else:
        parking_list.append(None)
        
    # checking if the row cointains ' ', or nothing. Because empty space (i.e. ' ') reprsents area  
    if ' ' in row:
        area_list.append(row[' '])
    else:
        area_list.append(None)
    # checking if the row cointains 'name' that is address of property         
    if 'name' in row:
        name_list.append(row['name'])
    else:
        name_list.append(None)
    
    # checking if the row cointains 'price'         
    if 'price' in row:
        price_list.append(row['price'])
    else:
        price_list.append(None)        
    
    # checking if the row cointains 'lat' that is lattitude of property         
    if 'lat' in row:
        lat_list.append(row['lat'])
    else:
        lat_list.append(None)  
        
    # checking if the row cointains 'long' that is lattitude of property             
    if 'long' in row:
        long_list.append(row['long'])
    else:
        long_list.append(None)
        
    # checking if the row cointains 'school' that is lattitude of property             
    if 'school' in row:
        school_list.append(row['school'])
    else:
        school_list.append(None)
      
    # checking if the row cointains 'school_distance'            
    if 'school_distance' in row:
        school_distance_list.append(row['school_distance'])
    else:
        school_distance_list.append(None)
        
    # checking if the row cointains 'address1'
    if 'address1' in row:
        address1_list.append(row['address1'])
    else:
        address1_list.append(None)
        
    # checking if the row cointains 'address2'      
    if 'address2' in row:
        address2_list.append(row['address2'])
    else:
        address2_list.append(None)
        
    # checking if the row cointains 'recent_rented_number'            
    if 'recent_rented_number' in row:
        recent_rented_number_list.append(row['recent_rented_number'])
    else:
        recent_rented_number_list.append(None)
    
        
    # checking if the row cointains 'long_term_resi_percentage'            
    if 'long_term_resi_percentage' in row:
        long_term_resi_percentage_list.append(row['long_term_resi_percentage'])
    else:
        long_term_resi_percentage_list.append(None)
        
    # checking if the row cointains 'street_renter_owner_ratio'            
    if 'street_renter_owner_ratio' in row:
        street_renter_owner_ratio_list.append(row['street_renter_owner_ratio'])
    else:
        recent_rented_price_list.append(None)
        
    # checking if the row cointains 'neighb_family_single_ratio'            
    if 'neighb_family_single_ratio' in row:
        neighb_family_single_ratio_list.append(row['neighb_family_single_ratio'])
    else:
        neighb_family_single_ratio.append(None)



        
        
        
        


In [108]:
#create dataframe 
house_dict = {}
house_dict['Beds'] = beds_list
house_dict['Baths'] = baths_list
house_dict['Parking'] = parking_list
house_dict['Area'] = area_list
house_dict['Address'] = name_list
house_dict['Latitude'] = lat_list
house_dict['Longitude'] = long_list
house_dict['Rent'] = price_list

house_dict['School'] = school_list
house_dict['School_distance'] = school_distance_list

house_dict['Location_1'] = address1_list
house_dict['Location_2'] = address2_list
house_dict['recent_rented_number'] = recent_rented_number_list
house_dict['recent_rented_price'] = recent_rented_price_list

house_dict['recent_rented_number'] = recent_rented_number_list
house_dict['recent_rented_price'] = recent_rented_price_list

house_dict['street_renter_owner_ratio'] = street_renter_owner_ratio_list 
house_dict['neighb_family_single_ratio'] = neighb_family_single_ratio_list
house_dict['long_term_resi_percentage'] = long_term_resi_percentage_list


house_df = pd.DataFrame(house_dict)

In [109]:
def remove_chars(s):
    return re.sub('[^0-9.]+', '', s) 

def remove_squaremeter(s):
    if s != None:
        return remove_chars(s)
    else:
        return None

In [110]:
#find pattern for week/annual rent
year_pattern = r"p.a.|annually|annual"
week_pattern = r"pw|week"

regex_week_pattern = re.compile(week_pattern)
regex_year_pattern = re.compile(year_pattern)

#flag type of rent with 1(weekly), 0(annul), None(unknown)
def type_flag(s):
    # week rent
    if (len(regex_week_pattern.findall(s.lower())) != 0 )& (len(regex_year_pattern.findall(s.lower())) == 0) :
        return 1
    # year rent
    elif (len(regex_week_pattern.findall(s.lower())) == 0) & (len(regex_year_pattern.findall(s.lower())) != 0) :
        return 0
    # not speficied
    else:
        return None

#convert rent to weekly rent
#if not sure rent type, it will be considered as weekly rent
def convert_rent(s):
    # week rent
    if (len(regex_week_pattern.findall(s.lower())) != 0 )& (len(regex_year_pattern.findall(s.lower())) == 0) :
        return remove_chars(s)
    # year rent
    elif (len(regex_week_pattern.findall(s.lower())) == 0) & (len(regex_year_pattern.findall(s.lower())) != 0) :
        return float(remove_chars(s))//12
    # not speficied
    else:
        return remove_chars(s)


house_df['Converted_Rent'] = [convert_rent(i) for i in price_list]
house_df['Type'] = [type_flag(i) for i in price_list]

In [111]:
#extract post code
house_df['Postcode'] = house_df['Address'].str[-4:]

#clean area
house_df['Area'] = [remove_squaremeter(i) for i in area_list]

In [113]:
house_df

Unnamed: 0,Beds,Baths,Parking,Area,Address,Latitude,Longitude,Rent,School,School_distance,Location_1,Location_2,recent_rented_number,recent_rented_price,street_renter_owner_ratio,neighb_family_single_ratio,long_term_resi_percentage,Converted_Rent,Type,Postcode
0,2,,1,,4202/568 Collins Street Melbourne VIC 3000,-37.8183759,144.9554361,$625,"[Eltham College - King Street Campus, Hester H...","[136.86393519280682, 226.0278132952064, 624.70...",[568 Collins St],"[Melbourne, VIC, 3000]",859,,0.538462,0.470588,0.20,625,,3000
1,,,,,1107/5 Sutherland Street Melbourne VIC 3000,-37.811416,144.9608504,$450.00,"[Eltham College - Lonsdale Street Campus, Ozfo...","[132.6662987018676, 576.7496829668854, 577.438...",[5 Sutherland St],"[Melbourne, VIC, 3000]",215,,1.857143,0.282051,0.26,450.00,,3000
2,2,2,2,,51A O'Connell Street North Melbourne VIC 3051,-37.8045402,144.9575597,"$1,500 Per Week","[River Nile School, St Joseph's Flexible Learn...","[232.48676074968782, 271.88497579047674, 441.4...",,,,,,0.250000,0.15,1500,1.0,3051
3,2,2,1,,307/604 St Kilda Road Melbourne 3004 VIC 3004,-37.851214,144.9800936,$500 per week,"[Wesley College, Wesley College - St Kilda Roa...","[320.2856003614696, 367.8343763865302, 807.187...",,,,,,0.587302,0.17,500,1.0,3004
4,2,2,1,,67/73 River Street Richmond VIC 3121,-37.8191545,145.0130619,$530 per week,"[Melbourne Girls College, Hawthorn West Primar...","[388.595286935481, 479.89505564717956, 560.703...",[73 River St],"[Richmond, VIC, 3121]",211,,0.818182,0.818182,,530,1.0,3121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1426,2,,,,605/455 Elizabeth Street Melbourne VIC 3000,-37.8084293,144.9602064,$500 per week,"[Eltham College - Lonsdale Street Campus, Rive...","[463.3611599786358, 588.5331765906494, 695.270...",[455 Elizabeth St],"[Melbourne, VIC, 3000]",81,,1.500000,0.388889,,500,1.0,3000
1427,2,,,,602/315 La Trobe Street Melbourne VIC 3000,-37.8109646,144.959936,$555/week,"[Eltham College - Lonsdale Street Campus, Ozfo...","[189.6404316139417, 516.9761052571154, 517.772...",[315 La Trobe St],"[Melbourne, VIC, 3000]",150,,0.538462,0.282051,0.26,555,1.0,3000
1428,3,,1,,21/8 Wellington Crescent East Melbourne VIC 3002,-37.8172238,144.9776702,$700 per week,"[Stott's Colleges, Holmes Grammar School, Acad...","[889.2292925744988, 903.4745562053921, 1527.07...",[8 Wellington Cr],"[East Melbourne, VIC, 3002]",69,,0.428571,0.818182,0.36,700,1.0,3002
1429,2,,,,1605/31 A'beckett Street Melbourne VIC 3000,-37.8088893,144.9625546,$580 per week,"[Eltham College - Lonsdale Street Campus, Stot...","[446.07142441130526, 765.5201411209711, 786.57...",[31 Abeckett St],"[Melbourne, VIC, 3000]",341,,1.500000,0.162791,,580,1.0,3000


In [121]:
# save the property_df to parquet format
postcode_start = postcode_range[0]
postcode_end = postcode_range[1]


# check if it exists as it makedir will raise an error if it does exist
raw_output_dir = '../data/raw'

if not os.path.exists(raw_output_dir):
    os.makedirs(raw_output_dir)

In [122]:
# Create a spark session

spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

22/09/02 13:58:55 WARN Utils: Your hostname, haixindeMacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.14 instead (on interface en0)
22/09/02 13:58:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/02 13:58:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [126]:
filename = f'rental_info_{postcode_start}-{postcode_end}.parquet'
output_dir = f"{raw_output_dir}/{filename}"

In [None]:
house_spark = spark.createDataFrame(house_df, schema = schema)

In [124]:
house_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1431 entries, 0 to 1430
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Beds                        910 non-null    object 
 1   Baths                       397 non-null    object 
 2   Parking                     768 non-null    object 
 3   Area                        20 non-null     object 
 4   Address                     1431 non-null   object 
 5   Latitude                    1431 non-null   object 
 6   Longitude                   1431 non-null   object 
 7   Rent                        1431 non-null   object 
 8   School                      1431 non-null   object 
 9   School_distance             1431 non-null   object 
 10  Location_1                  1180 non-null   object 
 11  Location_2                  1180 non-null   object 
 12  recent_rented_number        1180 non-null   object 
 13  recent_rented_price         0 non

In [None]:
house_spark.write.format("parquet").mode("overwrite").save(f"{otuput_dir}")