In [1]:
import os
os.chdir('../scripts')

from scrape_domain import *
from constants import *
from utils import *

In [2]:
#find pattern for week/annual rent
year_pattern = r"p.a.|annually|annual"
week_pattern = r"pw|week"

regex_week_pattern = re.compile(week_pattern)
regex_year_pattern = re.compile(year_pattern)


def remove_chars(s):
    return re.sub('[^0-9.]+', '', s) 

#flag type of rent with 1(weekly), 0(annul), None(unknown)
def type_flag(s):
    # week rent
    if (len(regex_week_pattern.findall(s.lower())) != 0 )& (len(regex_year_pattern.findall(s.lower())) == 0) :
        return 1
    # year rent
    elif (len(regex_week_pattern.findall(s.lower())) == 0) & (len(regex_year_pattern.findall(s.lower())) != 0) :
        return 0
    # not speficied
    else:
        return None

#convert rent to weekly rent
#if not sure rent type, it will be considered as weekly rent
def convert_rent(s):
    # week rent
    if (len(regex_week_pattern.findall(s.lower())) != 0 )& (len(regex_year_pattern.findall(s.lower())) == 0) :
        return remove_chars(s)
    # year rent
    elif (len(regex_week_pattern.findall(s.lower())) == 0) & (len(regex_year_pattern.findall(s.lower())) != 0) :
        return float(remove_chars(s))//12
    # not speficied
    else:
        return remove_chars(s)

In [3]:

#input url return dict
def input_url_output_df(url_list):
    basic_feature_list = []
    for url in url_list  :
        
        link = url
        
        # opening urls
        bs_object = BeautifulSoup(requests.get(link, headers=headers).text, "html.parser")

        #recent rented
        recent_rented_number = bs_object.find("div", {"class": "css-12uo2x5"})

        #neighbourhood family vs single ratio
        neighb_family = bs_object.find("span", {"class": "css-1ruej2h"})
        neighb_single = bs_object.find("span", {"class": "css-1ymya4c"})


        #street owner vs renter
        street_renter = bs_object.find("span", {"class": "css-1ejmyle"})
        street_owner = bs_object.find("span", {"class": "css-1jhlvvv"})


        #long term resident percentage
        long_term_resi= bs_object.find("div", {"class": "css-ibsnk8"})

        # extracting property price
        property_price = bs_object.find("div", {"data-testid": "listing-details__summary-title"})



        # dictionary to store temporary data
        basic_feature_dict = {}
        pattern1 = re.compile(r'>(.+)<.')

        # putting 'none' if recent_rented number is missing   
        if recent_rented_number is None:
            basic_feature_dict['recent_rented_number'] = None

        else:
            basic_feature_dict['recent_rented_number'] = pattern1.findall(str(recent_rented_number))[0]



        # putting 'none' if longterm resident number is missing   
        if long_term_resi is None:
            basic_feature_dict['long_term_resi_percentage'] = None

        else:
            basic_feature_dict['long_term_resi_percentage']  = float(str(long_term_resi.text[:-1]))/100



        # putting 'none' if neighbourhood family/single number is missing   
        if (neighb_family is None) |(neighb_single is None) :
            basic_feature_dict['neighb_family_single_ratio'] = None

        else:
            #to avoid zero division
            add_on = 0.000001
            basic_feature_dict['neighb_family_single_ratio'] = float(str(neighb_family.text[:-1]))/(float(str(neighb_single.text[:-1]))+add_on)




        # putting 'none' if street owener/renter number is missing   
        if (street_owner is None) | (street_renter is None):
            basic_feature_dict['street_renter_owner_ratio'] = None

        else:
            #to avoid zero division
            add_on = 0.000001
            basic_feature_dict['street_renter_owner_ratio'] = float(str(street_renter.text[:-1]))/(float(str(street_owner.text[:-1]))+add_on)



        # putting 'none' if price is missing    
        if property_price is None:
            basic_feature_dict['price'] = None

        else:
            basic_feature_dict['price'] = pattern1.findall(str(property_price))[0]
            
         # appending all the data into a list
        basic_feature_list.append(basic_feature_dict)
        
    return basic_feature_list
        
        

In [4]:
def list_to_df(basic_feature_list):
    
    recent_rented_number_list = []
    street_renter_owner_ratio_list = []
    neighb_family_single_ratio_list = []
    long_term_resi_percentage_list =[]
    price_list = []

    for row in basic_feature_list:
        # checking if the row cointains 'price'         
        if 'price' in row:
            price_list.append(row['price'])
        else:
            price_list.append(None)        

        # checking if the row cointains 'recent_rented_number'            
        if 'recent_rented_number' in row:
            recent_rented_number_list.append(row['recent_rented_number'])
        else:
            recent_rented_number_list.append(None)


        # checking if the row cointains 'long_term_resi_percentage'            
        if 'long_term_resi_percentage' in row:
            long_term_resi_percentage_list.append(row['long_term_resi_percentage'])
        else:
            long_term_resi_percentage_list.append(None)

        # checking if the row cointains 'street_renter_owner_ratio'            
        if 'street_renter_owner_ratio' in row:
            street_renter_owner_ratio_list.append(row['street_renter_owner_ratio'])
        else:
            street_renter_owner_ratio_list.append(None)

        # checking if the row cointains 'neighb_family_single_ratio'            
        if 'neighb_family_single_ratio' in row:
            neighb_family_single_ratio_list.append(row['neighb_family_single_ratio'])
        else:
            neighb_family_single_ratio.append(None)
        
        
    house_dict = {}    
    house_dict['Rent'] = price_list
    house_dict['street_renter_owner_ratio'] = street_renter_owner_ratio_list 
    house_dict['neighb_family_single_ratio'] = neighb_family_single_ratio_list
    house_dict['long_term_resi_percentage'] = long_term_resi_percentage_list
    house_df = pd.DataFrame(house_dict)
    
    house_df['Converted_Rent'] = [convert_rent(i) for i in price_list]
    house_df['Type'] = [type_flag(i) for i in price_list]
    return house_df



In [5]:
vic_postcodes = constants.postcodes['VIC_sample']

domain_links = set()

for i in tqdm(range(len(vic_postcodes))):
    current_links = domain_property_links(vic_postcodes[i])
    domain_links = domain_links | current_links
    
    print(len(current_links))

0it [00:00, ?it/s]/100 [00:00<?, ?it/s]
  1%|          | 1/100 [00:00<00:27,  3.57it/s]

0


0it [00:00, ?it/s]
  2%|▏         | 2/100 [00:00<00:24,  3.93it/s]

0


0it [00:00, ?it/s]
  3%|▎         | 3/100 [00:00<00:32,  2.96it/s]

0


100%|██████████| 1/1 [00:01<00:00,  1.05s/it]
  4%|▍         | 4/100 [00:02<01:09,  1.38it/s]

2


100%|██████████| 1/1 [00:00<00:00,  1.05it/s]
  5%|▌         | 5/100 [00:03<01:33,  1.02it/s]

4


0it [00:00, ?it/s]
  6%|▌         | 6/100 [00:03<01:08,  1.37it/s]

0


0it [00:00, ?it/s]
  7%|▋         | 7/100 [00:04<01:10,  1.32it/s]

0


0it [00:00, ?it/s]
  8%|▊         | 8/100 [00:04<00:53,  1.71it/s]

0


0it [00:00, ?it/s]
  9%|▉         | 9/100 [00:05<00:45,  2.00it/s]

0


100%|██████████| 1/1 [00:00<00:00,  1.89it/s]
 10%|█         | 10/100 [00:06<00:55,  1.62it/s]

14


100%|██████████| 1/1 [00:00<00:00,  2.60it/s]
 11%|█         | 11/100 [00:06<00:59,  1.50it/s]

19


100%|██████████| 1/1 [00:00<00:00,  3.13it/s]
 12%|█▏        | 12/100 [00:07<00:59,  1.48it/s]

1


100%|██████████| 1/1 [00:00<00:00,  2.65it/s]
 13%|█▎        | 13/100 [00:08<01:00,  1.44it/s]

12


100%|██████████| 1/1 [00:00<00:00,  3.93it/s]
 14%|█▍        | 14/100 [00:08<00:55,  1.56it/s]

2


0it [00:00, ?it/s]
 15%|█▌        | 15/100 [00:09<00:43,  1.94it/s]

0


100%|██████████| 1/1 [00:00<00:00,  1.72it/s]
 16%|█▌        | 16/100 [00:10<00:54,  1.54it/s]

14


0it [00:00, ?it/s]
 17%|█▋        | 17/100 [00:10<00:44,  1.85it/s]

0


0it [00:00, ?it/s]
 18%|█▊        | 18/100 [00:10<00:37,  2.16it/s]

0


100%|██████████| 1/1 [00:00<00:00,  3.54it/s]
 19%|█▉        | 19/100 [00:11<00:40,  1.98it/s]

2


100%|██████████| 1/1 [00:00<00:00,  3.17it/s]
 20%|██        | 20/100 [00:11<00:45,  1.76it/s]

1


0it [00:00, ?it/s]
 21%|██        | 21/100 [00:12<00:38,  2.06it/s]

0


0it [00:00, ?it/s]
 22%|██▏       | 22/100 [00:12<00:33,  2.30it/s]

0


100%|██████████| 2/2 [00:01<00:00,  1.30it/s]
 23%|██▎       | 23/100 [00:15<01:22,  1.07s/it]

22


0it [00:00, ?it/s]
 24%|██▍       | 24/100 [00:15<01:03,  1.19it/s]

0


0it [00:00, ?it/s]
 25%|██▌       | 25/100 [00:15<00:48,  1.55it/s]

0


100%|██████████| 1/1 [00:00<00:00,  3.97it/s]
 26%|██▌       | 26/100 [00:16<00:47,  1.57it/s]

4


100%|██████████| 1/1 [00:00<00:00,  2.13it/s]
 27%|██▋       | 27/100 [00:17<00:54,  1.34it/s]

8


100%|██████████| 1/1 [00:00<00:00,  4.25it/s]
 28%|██▊       | 28/100 [00:17<00:47,  1.50it/s]

1


0it [00:00, ?it/s]
 29%|██▉       | 29/100 [00:18<00:45,  1.57it/s]

0


100%|██████████| 1/1 [00:00<00:00,  3.01it/s]
 30%|███       | 30/100 [00:18<00:45,  1.54it/s]

2


100%|██████████| 1/1 [00:00<00:00,  2.05it/s]
 31%|███       | 31/100 [00:19<00:50,  1.37it/s]

19


100%|██████████| 2/2 [00:00<00:00,  2.83it/s]
 32%|███▏      | 32/100 [00:21<01:02,  1.09it/s]

28


0it [00:00, ?it/s]
 33%|███▎      | 33/100 [00:21<00:46,  1.43it/s]

0


0it [00:00, ?it/s]
 34%|███▍      | 34/100 [00:21<00:35,  1.84it/s]

0


100%|██████████| 1/1 [00:00<00:00,  1.61it/s]
 35%|███▌      | 35/100 [00:22<00:47,  1.38it/s]

14


0it [00:00, ?it/s]
 36%|███▌      | 36/100 [00:23<00:39,  1.64it/s]

0


100%|██████████| 2/2 [00:01<00:00,  1.75it/s]
 37%|███▋      | 37/100 [00:24<00:56,  1.11it/s]

26


100%|██████████| 1/1 [00:00<00:00,  2.25it/s]
 38%|███▊      | 38/100 [00:25<00:53,  1.17it/s]

3


100%|██████████| 1/1 [00:00<00:00,  2.36it/s]
 39%|███▉      | 39/100 [00:26<00:53,  1.15it/s]

7


100%|██████████| 2/2 [00:00<00:00,  2.15it/s]
 40%|████      | 40/100 [00:27<01:03,  1.06s/it]

27


100%|██████████| 1/1 [00:00<00:00,  1.88it/s]
 41%|████      | 41/100 [00:29<01:13,  1.24s/it]

20


100%|██████████| 2/2 [00:01<00:00,  1.40it/s]
 42%|████▏     | 42/100 [00:31<01:28,  1.52s/it]

34


100%|██████████| 1/1 [00:00<00:00,  2.24it/s]
 43%|████▎     | 43/100 [00:32<01:20,  1.42s/it]

8


100%|██████████| 1/1 [00:00<00:00,  2.64it/s]
 44%|████▍     | 44/100 [00:33<01:07,  1.21s/it]

7


100%|██████████| 1/1 [00:00<00:00,  2.59it/s]
 45%|████▌     | 45/100 [00:34<01:03,  1.15s/it]

5


0it [00:00, ?it/s]
 46%|████▌     | 46/100 [00:34<00:47,  1.14it/s]

0


100%|██████████| 1/1 [00:00<00:00,  3.17it/s]
 47%|████▋     | 47/100 [00:36<00:55,  1.05s/it]

1


0it [00:00, ?it/s]
 48%|████▊     | 48/100 [00:36<00:41,  1.25it/s]

0


0it [00:00, ?it/s]
 49%|████▉     | 49/100 [00:38<00:54,  1.08s/it]

0


0it [00:00, ?it/s]
 50%|█████     | 50/100 [00:38<00:40,  1.22it/s]

0


100%|██████████| 1/1 [00:00<00:00,  3.14it/s]
 51%|█████     | 51/100 [00:39<00:37,  1.32it/s]

4


0it [00:00, ?it/s]
 52%|█████▏    | 52/100 [00:39<00:29,  1.62it/s]

0


0it [00:00, ?it/s]
 53%|█████▎    | 53/100 [00:39<00:23,  2.00it/s]

0


100%|██████████| 1/1 [00:00<00:00,  2.06it/s]
 54%|█████▍    | 54/100 [00:40<00:26,  1.73it/s]

5


0it [00:00, ?it/s]
 55%|█████▌    | 55/100 [00:40<00:24,  1.85it/s]

0


0it [00:00, ?it/s]
 56%|█████▌    | 56/100 [00:41<00:22,  1.93it/s]

0


100%|██████████| 1/1 [00:00<00:00,  2.60it/s]
 57%|█████▋    | 57/100 [00:41<00:25,  1.72it/s]

8


0it [00:00, ?it/s]
 58%|█████▊    | 58/100 [00:42<00:29,  1.42it/s]

0


0it [00:00, ?it/s]
 59%|█████▉    | 59/100 [00:43<00:24,  1.64it/s]

0


100%|██████████| 1/1 [00:00<00:00,  4.09it/s]
 60%|██████    | 60/100 [00:44<00:28,  1.39it/s]

1


100%|██████████| 1/1 [00:00<00:00,  1.57it/s]
 61%|██████    | 61/100 [00:45<00:31,  1.23it/s]

5


0it [00:00, ?it/s]
 62%|██████▏   | 62/100 [00:45<00:24,  1.57it/s]

0


0it [00:00, ?it/s]
 63%|██████▎   | 63/100 [00:46<00:22,  1.63it/s]

0


0it [00:00, ?it/s]
 64%|██████▍   | 64/100 [00:46<00:18,  1.94it/s]

0


0it [00:00, ?it/s]
 65%|██████▌   | 65/100 [00:47<00:23,  1.48it/s]

0


0it [00:00, ?it/s]
 66%|██████▌   | 66/100 [00:47<00:19,  1.79it/s]

0


0it [00:00, ?it/s]
 67%|██████▋   | 67/100 [00:48<00:15,  2.14it/s]

0


0it [00:00, ?it/s]
 68%|██████▊   | 68/100 [00:48<00:12,  2.48it/s]

0


0it [00:00, ?it/s]
 69%|██████▉   | 69/100 [00:48<00:10,  2.90it/s]

0


0it [00:00, ?it/s]
 70%|███████   | 70/100 [00:49<00:15,  1.93it/s]

0


0it [00:00, ?it/s]
 71%|███████   | 71/100 [00:49<00:12,  2.27it/s]

0


100%|██████████| 1/1 [00:00<00:00,  2.95it/s]
 72%|███████▏  | 72/100 [00:50<00:15,  1.85it/s]

3


0it [00:00, ?it/s]
 73%|███████▎  | 73/100 [00:50<00:12,  2.10it/s]

0


0it [00:00, ?it/s]
 74%|███████▍  | 74/100 [00:51<00:10,  2.37it/s]

0


0it [00:00, ?it/s]
 75%|███████▌  | 75/100 [00:51<00:09,  2.76it/s]

0


100%|██████████| 1/1 [00:00<00:00,  2.32it/s]
 76%|███████▌  | 76/100 [00:53<00:18,  1.30it/s]

19


100%|██████████| 2/2 [00:01<00:00,  1.81it/s]
 77%|███████▋  | 77/100 [00:54<00:23,  1.00s/it]

24


100%|██████████| 7/7 [00:04<00:00,  1.65it/s]
 78%|███████▊  | 78/100 [00:59<00:46,  2.12s/it]

129


100%|██████████| 4/4 [00:01<00:00,  2.26it/s]
 79%|███████▉  | 79/100 [01:01<00:45,  2.18s/it]

74


0it [00:00, ?it/s]
 80%|████████  | 80/100 [01:01<00:32,  1.62s/it]

0


100%|██████████| 1/1 [00:00<00:00,  3.61it/s]
 81%|████████  | 81/100 [01:02<00:24,  1.30s/it]

2


100%|██████████| 1/1 [00:00<00:00,  4.40it/s]
 82%|████████▏ | 82/100 [01:03<00:19,  1.11s/it]

2


0it [00:00, ?it/s]
 83%|████████▎ | 83/100 [01:03<00:14,  1.20it/s]

0


0it [00:00, ?it/s]
 84%|████████▍ | 84/100 [01:03<00:10,  1.49it/s]

0


100%|██████████| 1/1 [00:00<00:00,  2.24it/s]
 85%|████████▌ | 85/100 [01:04<00:11,  1.36it/s]

11


0it [00:00, ?it/s]
 86%|████████▌ | 86/100 [01:04<00:08,  1.66it/s]

0


0it [00:00, ?it/s]
 87%|████████▋ | 87/100 [01:05<00:08,  1.60it/s]

0


100%|██████████| 1/1 [00:00<00:00,  1.09it/s]
 88%|████████▊ | 88/100 [01:06<00:10,  1.19it/s]

2


0it [00:00, ?it/s]
 89%|████████▉ | 89/100 [01:07<00:08,  1.35it/s]

0


0it [00:00, ?it/s]
 90%|█████████ | 90/100 [01:07<00:05,  1.74it/s]

0


0it [00:00, ?it/s]
 91%|█████████ | 91/100 [01:07<00:04,  2.14it/s]

0


0it [00:00, ?it/s]
 92%|█████████▏| 92/100 [01:07<00:03,  2.58it/s]

0


100%|██████████| 1/1 [00:00<00:00,  1.88it/s]
 93%|█████████▎| 93/100 [01:08<00:03,  1.97it/s]

2


0it [00:00, ?it/s]
 94%|█████████▍| 94/100 [01:09<00:02,  2.16it/s]

0


0it [00:00, ?it/s]
 95%|█████████▌| 95/100 [01:09<00:01,  2.61it/s]

0


100%|██████████| 2/2 [00:01<00:00,  1.42it/s]
 96%|█████████▌| 96/100 [01:11<00:03,  1.21it/s]

22


100%|██████████| 1/1 [00:00<00:00,  1.66it/s]
 97%|█████████▋| 97/100 [01:12<00:02,  1.18it/s]

9


0it [00:00, ?it/s]
 98%|█████████▊| 98/100 [01:12<00:01,  1.41it/s]

0


0it [00:00, ?it/s]
 99%|█████████▉| 99/100 [01:12<00:00,  1.80it/s]

0


0it [00:00, ?it/s]
100%|██████████| 100/100 [01:12<00:00,  1.37it/s]

0





In [6]:
with open('2022-09-05.VIC.domain.pickle', 'wb') as file:
    pickle.dump(domain_links, file , protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
with open('2022-09-05.VIC.domain.pickle', 'rb') as file:
    links = pickle.load(file)

len(links)

629

In [8]:
links

{'https://www.domain.com.au/0-cnr-watson-rd-mcpherson-rd-leongatha-vic-3953-15592673',
 'https://www.domain.com.au/1-1-haven-court-cranbourne-vic-3977-16066375',
 'https://www.domain.com.au/1-10-bay-street-port-albert-vic-3971-9745101',
 'https://www.domain.com.au/1-12-william-street-cranbourne-vic-3977-16063928',
 'https://www.domain.com.au/1-14-meikle-street-meeniyan-vic-3956-16004995',
 'https://www.domain.com.au/1-15-roadknight-street-lakes-entrance-vic-3909-16061234',
 'https://www.domain.com.au/1-16-kent-street-mornington-vic-3931-16061709',
 'https://www.domain.com.au/1-16-mckitterick-street-meeniyan-vic-3956-15920145',
 'https://www.domain.com.au/1-171-comers-road-kalimna-west-vic-3909-16072320',
 'https://www.domain.com.au/1-18-bayport-drive-langwarrin-vic-3910-16072133',
 'https://www.domain.com.au/1-18-johnson-street-capel-sound-vic-3940-16039887',
 'https://www.domain.com.au/1-18-stawell-street-cranbourne-vic-3977-16032807',
 'https://www.domain.com.au/1-181-dromana-parade-

In [9]:
test_links = links

In [10]:
output_list = input_url_output_df(test_links)

In [11]:
df = list_to_df(output_list)

In [12]:
df

Unnamed: 0,Rent,street_renter_owner_ratio,neighb_family_single_ratio,long_term_resi_percentage,Converted_Rent,Type
0,$850 per week,0.052632,6.142857,0.65,850,1.0
1,$450,0.250000,1.439024,0.60,450,
2,$360.00 pw,0.250000,2.571428,0.70,360.00,1.0
3,$600 per week,0.052632,7.333333,0.73,600,1.0
4,UNDER APPLICATION Pending Commencement Date.,0.176471,2.030303,0.67,.,
...,...,...,...,...,...,...
624,Under Application,0.250000,2.846154,0.57,,
625,$580 wk,,5.250000,0.77,580,
626,$575.00,,11.499999,0.68,575.00,
627,$420 per week,0.428571,4.555555,,420,1.0
