In [1]:
from bs4 import BeautifulSoup
import requests

import unicodedata
from csv import writer
import re

import pandas as pd
import numpy as np

import json
from pyspark.sql import SparkSession
import os

In [2]:
# Home URL of domain.com.au (Australia); every search URL is built from it.
home_url = "https://www.domain.com.au"

# Request headers sent with every HTTP call: a browser-style User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/81.0.4044.141 Safari/537.36"
    ),
}

In [9]:
# return the unique listing links extracted for the given postcode
def link_by_postcode(postcode, max_pages=50):
    """Collect the listing URLs found on the rental search pages of a postcode.

    Search result pages 1..max_pages are requested one after another. Links
    whose URL contains 'project' are opened as well, and the links found in
    their "other listings" section are collected instead of the project link
    itself.

    Parameters
    ----------
    postcode : str or int
        Postcode interpolated into the ``melbourne-vic-<postcode>`` search URL.
    max_pages : int, optional
        Number of search result pages to request (default 50).

    Returns
    -------
    list of str
        Listing URLs in first-seen order, without duplicates.
    """
    # only anchors pointing back at the site itself are of interest
    link_pattern = re.compile("https://www.domain.com.au/*")

    # every href seen, in page order (may contain repeats until the end)
    collected_links = []

    # pages are 1-based: 1, 2, ..., max_pages
    for page in range(1, max_pages + 1):
        url = home_url + f"/rent/melbourne-vic-{postcode}/?excludedeposittaken=1&page={page}"

        # parse the search page with Python's built-in 'html.parser'
        bs_object = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")

        # skip pages that do not contain the result list
        results = bs_object.find("ul", {"data-testid": "results"})
        if results is None:
            continue

        for link1 in results.findAll("a", href=link_pattern):
            href = link1.attrs['href']

            # ordinary listing: keep the link as it is
            if 'project' not in href:
                collected_links.append(href)
                continue

            # project page: the individual listings are linked from inside it
            project_page = BeautifulSoup(requests.get(href, headers=headers).text, "html.parser")
            other_listings = project_page.find("div", {"name": "listing-details__other-listings"})
            if other_listings is None:
                # nothing to expand; previously this raised AttributeError
                continue
            for link2 in other_listings.findAll("a", href=link_pattern):
                if 'href' in link2.attrs:
                    collected_links.append(link2.attrs['href'])

    # dict.fromkeys drops repeated links while keeping first-seen order
    return list(dict.fromkeys(collected_links))



In [3]:
# Patterns that tell whether a listed rent is quoted per week or per year.
# The dots of "p.a." are escaped so that they match literal dots only; left
# unescaped they match any character, which made ordinary words such as
# "place" or "plan" count as an annual-rent marker.
year_pattern = r"p\.a\.|annually|annual"
week_pattern = r"pw|week"

# compiled once here and shared by type_flag / convert_rent
regex_week_pattern = re.compile(week_pattern)
regex_year_pattern = re.compile(year_pattern)


def remove_chars(s):
    """Return `s` keeping only the digits 0-9 and '.' characters, in order."""
    allowed = '0123456789.'
    return ''.join(ch for ch in s if ch in allowed)

#flag type of rent with 1 (weekly), 0 (annual), None (unknown)
def type_flag(s):
    """Classify a listing price string as weekly or annual rent.

    Parameters
    ----------
    s : str or None
        Price text, e.g. "$360 per week".

    Returns
    -------
    int or None
        1 when only a weekly marker is found, 0 when only an annual marker is
        found, None when neither or both are found or when `s` is not a string
        (a listing without a price is stored as None).
    """
    # a missing price cannot be classified; previously this raised AttributeError
    if not isinstance(s, str):
        return None

    text = s.lower()
    is_weekly = regex_week_pattern.search(text) is not None
    is_annual = regex_year_pattern.search(text) is not None

    if is_weekly and not is_annual:
        return 1
    if is_annual and not is_weekly:
        return 0
    # no marker at all, or contradictory markers
    return None

#convert rent to weekly rent
#if the rent type is unknown or ambiguous, the amount is treated as weekly rent
def convert_rent(s):
    """Convert a listing price string to a weekly rent amount.

    The first number in the text is taken as the amount (commas are treated as
    thousands separators). When the text carries only an annual marker the
    amount is divided by 52 weeks; in every other case it is returned as is.

    Parameters
    ----------
    s : str or None
        Price text, e.g. "$2,050" or "$26,000 annually".

    Returns
    -------
    float or None
        Weekly rent, or None when `s` is not a string or contains no number.
    """
    # a listing without a price has nothing to convert
    if not isinstance(s, str):
        return None

    # first number only, so trailing dots ("p.w.") or a second figure
    # ("$500 - $600") cannot corrupt the parsed amount
    amount_match = re.search(r"[0-9][0-9,]*(?:\.[0-9]+)?", s)
    if amount_match is None:
        return None
    amount = float(amount_match.group().replace(",", ""))

    text = s.lower()
    is_weekly = regex_week_pattern.search(text) is not None
    is_annual = regex_year_pattern.search(text) is not None

    # annual -> weekly: 52 weeks per year (the old "// 12" gave a floored monthly figure)
    if is_annual and not is_weekly:
        return amount / 52
    return amount

In [4]:



#input a list of listing urls, return a list of feature dicts (one per url)
def input_url_output_df(url_list):
    """Scrape the basic features of every listing page in `url_list`.

    Each URL is requested and parsed, and the elements holding the recently
    rented count, the neighbourhood family/single split, the street
    renter/owner split, the long-term resident percentage and the price are
    looked up by their CSS class / data-testid.

    Parameters
    ----------
    url_list : iterable
        Listing page URLs.

    Returns
    -------
    list of dict
        One dict per URL, in input order, with the keys
        'recent_rented_number', 'long_term_resi_percentage',
        'neighb_family_single_ratio', 'street_renter_owner_ratio' and 'price'.
        A value is None when its element is missing or cannot be parsed.
    """
    # captures the text between the opening and the closing tag of a stringified element
    inner_text_pattern = re.compile(r'>(.+)<.')
    # small offset added to ratio denominators to avoid division by zero
    add_on = 0.000001

    def inner_text(tag):
        # Text captured from the stringified element, or None if absent/empty.
        if tag is None:
            return None
        found = inner_text_pattern.findall(str(tag))
        # an element without inner text yields no match (was an IndexError)
        return found[0] if found else None

    def percent_number(tag):
        # Numeric part of an element text such as "12%", or None if unparseable.
        if tag is None:
            return None
        try:
            # the last character (the trailing '%') is dropped before parsing
            return float(str(tag.text[:-1]))
        except ValueError:
            return None

    def ratio(numerator_tag, denominator_tag):
        # numerator / (denominator + add_on), or None if either side is unavailable.
        numerator = percent_number(numerator_tag)
        denominator = percent_number(denominator_tag)
        if numerator is None or denominator is None:
            return None
        return numerator / (denominator + add_on)

    basic_feature_list = []
    for url in url_list:
        # download and parse the listing page
        bs_object = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")

        # recently rented count
        recent_rented_number = bs_object.find("div", {"class": "css-12uo2x5"})

        # neighbourhood family vs single
        neighb_family = bs_object.find("span", {"class": "css-1ruej2h"})
        neighb_single = bs_object.find("span", {"class": "css-1ymya4c"})

        # street renter vs owner
        street_renter = bs_object.find("span", {"class": "css-1ejmyle"})
        street_owner = bs_object.find("span", {"class": "css-1jhlvvv"})

        # long-term resident percentage
        long_term_resi = percent_number(bs_object.find("div", {"class": "css-ibsnk8"}))

        # listed price text
        property_price = bs_object.find("div", {"data-testid": "listing-details__summary-title"})

        basic_feature_list.append({
            'recent_rented_number': inner_text(recent_rented_number),
            'long_term_resi_percentage': None if long_term_resi is None else long_term_resi / 100,
            'neighb_family_single_ratio': ratio(neighb_family, neighb_single),
            'street_renter_owner_ratio': ratio(street_renter, street_owner),
            'price': inner_text(property_price),
        })

    return basic_feature_list
        
        

In [5]:
def list_to_df(basic_feature_list):
    """Build the rental DataFrame from the scraped per-listing feature dicts.

    Parameters
    ----------
    basic_feature_list : list of dict
        Records with (some of) the keys 'price', 'street_renter_owner_ratio',
        'neighb_family_single_ratio' and 'long_term_resi_percentage'. A missing
        key is treated like a None value.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns 'Rent',
        'street_renter_owner_ratio', 'neighb_family_single_ratio',
        'long_term_resi_percentage', 'Converted_Rent' (numeric, NaN when not
        convertible) and 'Type' (result of type_flag).

    NOTE(review): 'recent_rented_number' is present in the records but has
    never been carried into the frame; the column set is left unchanged here.
    """
    feature_columns = (
        'street_renter_owner_ratio',
        'neighb_family_single_ratio',
        'long_term_resi_percentage',
    )

    # dict.get yields None for an absent key, replacing the per-key if/else
    # chains (one of which appended to an undefined name and raised NameError)
    price_list = [row.get('price') for row in basic_feature_list]

    house_dict = {'Rent': price_list}
    for column in feature_columns:
        house_dict[column] = [row.get(column) for row in basic_feature_list]
    house_df = pd.DataFrame(house_dict)

    # a listing without a price has nothing to convert or classify
    converted = [None if price is None else convert_rent(price) for price in price_list]
    # coerce to one numeric dtype; values that cannot be parsed become NaN
    house_df['Converted_Rent'] = pd.to_numeric(pd.Series(converted, dtype=object), errors="coerce")
    house_df['Type'] = [None if price is None else type_flag(price) for price in price_list]
    return house_df



In [10]:
# Collect the listing links for postcode 3000 (performs live HTTP requests).
test_links = link_by_postcode('3000')

In [11]:
# Number of links collected for postcode 3000.
len(test_links)

1002

In [7]:
# Scrape the feature dict of every collected listing (one HTTP request per link).
output_list = input_url_output_df(test_links)
# Preview the first few records only instead of dumping the whole list into the output.
output_list[:5]

[{'recent_rented_number': None,
  'long_term_resi_percentage': 0.12,
  'neighb_family_single_ratio': 0.24999999687500005,
  'street_renter_owner_ratio': None,
  'price': '$800'},
 {'recent_rented_number': '161',
  'long_term_resi_percentage': 0.18,
  'neighb_family_single_ratio': 0.4925373060815328,
  'street_renter_owner_ratio': 0.9999999800000005,
  'price': '$500.00'},
 {'recent_rented_number': '20',
  'long_term_resi_percentage': 0.3,
  'neighb_family_single_ratio': 0.58730157797934,
  'street_renter_owner_ratio': 0.9999999800000005,
  'price': '$360 per week'},
 {'recent_rented_number': '245',
  'long_term_resi_percentage': 0.38,
  'neighb_family_single_ratio': 0.8181818033057854,
  'street_renter_owner_ratio': 0.33333332888888895,
  'price': '$2,050'},
 {'recent_rented_number': '335',
  'long_term_resi_percentage': 0.51,
  'neighb_family_single_ratio': 0.20481927464073163,
  'street_renter_owner_ratio': 0.11111110987654323,
  'price': '$800 per week'},
 {'recent_rented_number': '

In [8]:
# Turn the scraped records into a DataFrame including the converted rent and rent type columns.
list_to_df(output_list)

Unnamed: 0,Rent,street_renter_owner_ratio,neighb_family_single_ratio,long_term_resi_percentage,Converted_Rent,Type
0,$800,,0.25,0.12,800.0,
1,$500.00,1.0,0.492537,0.18,500.0,
2,$360 per week,1.0,0.587302,0.3,360.0,1.0
3,"$2,050",0.333333,0.818182,0.38,2050.0,
4,$800 per week,0.111111,0.204819,0.51,800.0,1.0
5,$200,1.222222,0.25,0.29,200.0,
6,$495 Per Week,0.818182,0.960784,0.41,495.0,1.0
7,$680,1.5,0.388889,,680.0,


In [5]:
# NOTE(review): scratch cell unrelated to the scraping pipeline; candidate for deletion.
sorted([1,1,2,-1])


[-1, 1, 1, 2]