In [65]:
import re
from json import dump
import requests

from collections import defaultdict

# user packages
from bs4 import BeautifulSoup
from urllib.request import urlopen
# Request headers passed to every `requests.get` call below; the User-Agent
# is a desktop Chrome browser string.
headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}

# constants
# Root of the site; the search-result URLs are built on top of it and it is
# also used to filter the links found on each result page.
BASE_URL = "https://www.domain.com.au"
# Result page numbers to visit: 1 to 50 inclusive.
N_PAGES = range(1, 51) # update this to your liking

# begin code
url_links = []
property_metadata = defaultdict(dict)

# generate list of urls to visit
for page in N_PAGES:
    url = BASE_URL + f"/rent/melbourne-region-vic/?sort=price-desc&page={page}"
    bs_object = BeautifulSoup(
        requests.get(url, headers=headers).text, "html.parser")

    # find the unordered list (ul) element which holds the results
    results_list = bs_object.find("ul", {"data-testid": "results"})
    if results_list is None:
        # the page came back without a results list; report it and carry on
        # with the next page instead of failing with AttributeError on None
        print(f"no results list found on {url}, skipping")
        continue

    # find all anchor (a) tags whose href matches the pattern. BeautifulSoup
    # applies a compiled pattern with a regex *search*, so any href that
    # contains BASE_URL is kept; the trailing `/*` means "zero or more
    # slashes", it is not a glob-style wildcard.
    index_links = results_list.find_all(
        "a",
        href=re.compile(f"{BASE_URL}/*")
    )

    for link in index_links:
        # if its a property address, add it to the list; anchors without a
        # class attribute are ignored rather than raising KeyError
        if 'address' in link.get('class', []):
            url_links.append(link['href'])

# for each url, scrape some basic metadata
# NOTE(review): the [1:] slice leaves out the first collected link; it is kept
# from the original code - confirm that skipping it is intentional.
for property_url in url_links[1:]:
    bs_object = BeautifulSoup(
        requests.get(property_url, headers=headers).text, "html.parser")

    # locate every element the record depends on before reading from it, so
    # a listing with a different layout is skipped as a whole instead of
    # aborting the scrape and leaving a half-filled record behind
    name_tag = bs_object.find("h1", {"class": "css-164r41r"})
    cost_tag = bs_object.find(
        "div", {"data-testid": "listing-details__summary-title"})
    features_box = bs_object.find("div", {"data-testid": "property-features"})
    directions_link = bs_object.find(
        "a",
        {"target": "_blank", 'rel': "noopener noreferer"}
    )

    # the coordinates are the comma-separated numbers that follow
    # "destination=" in the href of that link
    destinations = []
    if directions_link is not None:
        destinations = re.findall(
            r'destination=([-\s,\d\.]+)', # use regex101.com here if you need to
            directions_link.get('href', '')
        )

    if (name_tag is None or cost_tag is None or features_box is None
            or not destinations):
        print(f"missing expected elements on {property_url}, skipping")
        continue

    coordinates = [float(coord) for coord in destinations[0].split(',')]

    # one list of "<count> <label>" matches per feature; `\d+` keeps counts
    # of ten or more intact (a single `\d` would turn "12 Beds" into "2 Beds")
    rooms = [
        re.findall(r'\d+\s[A-Za-z]+', feature.text)
        for feature in features_box.find_all(
            "span", {"data-testid": "property-features-text-container"})
    ]

    # optional block, stored as None when the page does not have it
    information = bs_object.find("div", {"data-testid": "strip-content-list"})
    external_information = None if information is None else information.text

    # description: html of the first paragraph with <br/> turned into
    # newlines and the enclosing <p ...> / </p> tags removed. str.strip('</p>')
    # is not used because it strips a *set of characters* from both ends and
    # would also eat leading/trailing 'p', '/', '<' and '>' of the text.
    paragraph = bs_object.find("p")
    if paragraph is None:
        desc = None
    else:
        desc = re.sub(r'<br\/>', '\n', str(paragraph))
        desc = re.sub(r'^<p[^>]*>|</p>$', '', desc)

    # same keys, in the same order, as before
    property_metadata[property_url] = {
        'name': name_tag.text,
        'cost_text': cost_tag.text,
        'coordinates': coordinates,
        'rooms': rooms,
        'external_information': external_information,
        'desc': desc,
    }

# write everything collected in property_metadata to the example json in data/raw/
output_path = '../data/raw/example.json'
with open(output_path, 'w') as json_file:
    dump(property_metadata, json_file)

In [None]:
import json
import pandas as pd
  
# Read the scraped metadata back from disk. The `with` block closes the file
# as soon as the JSON has been parsed (the handle used to stay open).
with open('../data/raw/example.json') as f:
    # json.load returns the JSON object as a dictionary
    data = json.load(f)

# one row per top-level key of the dictionary; the keys become the index
df2 = pd.DataFrame.from_dict(data, orient="index")

In [None]:
# Keep only the rows whose 'rooms' entry is non-empty.
# The values are iterated directly: `df2['rooms'][i]` asked a Series that is
# labelled by the JSON keys for an integer key, which only works through
# pandas' deprecated positional fallback.
l = [len(rooms) != 0 for rooms in df2['rooms']]
df3 = df2[l]

In [None]:
# move the index into a regular column and call that column 'link'
df3 = df3.reset_index().rename(columns={'index': 'link'})

In [None]:
import re
def _feature_count(features, position):
    """Return the first run of digits of the feature at `position`, as a string.

    `features` is one value of the 'rooms' column: a list that holds one list
    of regex matches per feature. '0' is returned when there is no feature at
    `position`, when its match list is empty, or when the match has no digits
    (previously only an empty parking entry was tolerated; anything else
    raised IndexError or AttributeError).
    """
    if position >= len(features) or len(features[position]) == 0:
        return '0'
    digits = re.search(r'\d+', features[position][0])
    return digits.group() if digits else '0'


# positions 0, 1 and 2 of every 'rooms' value are read as the bedroom,
# bathroom and parking counts respectively (kept as strings, as before)
bedroom_list = [_feature_count(features, 0) for features in df3['rooms']]
bathroom_list = [_feature_count(features, 1) for features in df3['rooms']]
parking_list = [_feature_count(features, 2) for features in df3['rooms']]

In [None]:
# add the three parsed lists to the frame, one column each
for column, values in (
    ('bedroom', bedroom_list),
    ('bathroom', bathroom_list),
    ('parking', parking_list),
):
    df3[column] = values

In [None]:
# display the current state of df3 as the cell output
df3

In [None]:
def amount(text):
    """Extract a weekly rent figure from a listing's price text.

    Figures written with a dollar sign (e.g. "$650", "$1,200.50") are used
    when the text has any; only if there are none does every bare number in
    the text count. Several figures (a range such as "$5,000 - $6,000") are
    averaged. When the text contains 'Annually' the figure is divided by 52.

    Args:
        text: the price text of a listing.

    Returns:
        The weekly rent as a float, or None when `text` is not a string or
        holds no number.
    """
    if not isinstance(text, str):
        return None

    # digits with optional thousands separators and an optional decimal part;
    # the decimal point is escaped so dates or neighbouring numbers cannot be
    # glued into one token that float() would reject
    number = r'\d+(?:,\d{3})*(?:\.\d+)?'

    figures = re.findall(r'\$\s*(' + number + r')', text)
    if not figures:
        figures = re.findall(number, text)
    if not figures:
        return None

    values = [float(figure.replace(',', '')) for figure in figures]
    rent = sum(values) / len(values)

    if 'Annually' in text:
        return rent / 52
    return rent
        
        


In [None]:
# run amount() over every cost_text value and keep the result as weekly_rent
df3['weekly_rent'] = df3['cost_text'].apply(amount)

In [None]:
def postcode(text):
    """Return the digits that follow the first 'VIC ' in `text`.

    The result is a string; None is returned when `text` has no
    'VIC <digits>' occurrence.
    """
    first_hit = re.search(r'VIC \d+', text)
    if first_hit is None:
        return None
    # the match is always "VIC" + one space + digits
    _, digits = first_hit.group().split(' ')
    return digits
   

In [None]:
# run postcode() over every listing name and keep the result as postcode
df3['postcode'] = df3['name'].apply(postcode)

In [None]:
# display the raw external_information strings as the cell output
df3['external_information']

0      Date Available: Available NowInternal area 337m²
1              Date Available: Available NowBond $36000
2     Available from Friday, 23rd December 2022Bond ...
3     Available from Tuesday, 1st November 2022Bond ...
4              Date Available: Available NowBond $27000
                            ...                        
93              Date Available: Available NowBond $7821
94              Date Available: Available NowBond $7817
95              Date Available: Available NowBond $7604
96              Date Available: Available NowBond $7584
97              Date Available: Available NowBond $9354
Name: external_information, Length: 98, dtype: object

In [None]:
def bond(text):
    """Extract the bond amount from a listing's external-information text.

    Looks for the first "Bond $<number>" in `text`; thousands separators and
    a decimal part are accepted (e.g. "Bond $36,000").

    Args:
        text: the external-information string of a listing, or None.

    Returns:
        The bond as a float, or None when `text` is not a string (None, NaN)
        or contains no "Bond $<number>". The result is always a float or
        None - never a list - so the column built from it stays numeric.
    """
    if not isinstance(text, str):
        return None

    found = re.search(r'Bond\s*\$\s*(\d+(?:,\d{3})*(?:\.\d+)?)', text)
    if found is None:
        return None

    # float() does not accept thousands separators, so drop them first
    return float(found.group(1).replace(',', ''))


In [None]:
# run bond() over every external_information value and keep the result as bond
df3['bond'] = df3['external_information'].apply(bond)
