In [1]:
import os
os.chdir('../scripts')

from scrape_domain import *
from constants import *
from utils import *

In [2]:
#find pattern for week/annual rent
year_pattern = r"p.a.|annually|annual"
week_pattern = r"pw|week"

regex_week_pattern = re.compile(week_pattern)
regex_year_pattern = re.compile(year_pattern)


def remove_chars(s):
    return re.sub('[^0-9.]+', '', s) 

#flag type of rent with 1(weekly), 0(annul), None(unknown)
def type_flag(s):
    # week rent
    if (len(regex_week_pattern.findall(s.lower())) != 0 )& (len(regex_year_pattern.findall(s.lower())) == 0) :
        return 1
    # year rent
    elif (len(regex_week_pattern.findall(s.lower())) == 0) & (len(regex_year_pattern.findall(s.lower())) != 0) :
        return 0
    # not speficied
    else:
        return None

#convert rent to weekly rent
#if not sure rent type, it will be considered as weekly rent
def convert_rent(s):
    # week rent
    if (len(regex_week_pattern.findall(s.lower())) != 0 )& (len(regex_year_pattern.findall(s.lower())) == 0) :
        return remove_chars(s)
    # year rent
    elif (len(regex_week_pattern.findall(s.lower())) == 0) & (len(regex_year_pattern.findall(s.lower())) != 0) :
        return float(remove_chars(s))//12
    # not speficied
    else:
        return remove_chars(s)

In [3]:

#input url return dict
def input_url_output_df(url_list):
    basic_feature_list = []
    for url in url_list  :
        
        link = url
        
        # opening urls
        bs_object = BeautifulSoup(requests.get(link, headers=headers).text, "html.parser")

        #recent rented
        recent_rented_number = bs_object.find("div", {"class": "css-12uo2x5"})

        #neighbourhood family vs single ratio
        neighb_family = bs_object.find("span", {"class": "css-1ruej2h"})
        neighb_single = bs_object.find("span", {"class": "css-1ymya4c"})


        #street owner vs renter
        street_renter = bs_object.find("span", {"class": "css-1ejmyle"})
        street_owner = bs_object.find("span", {"class": "css-1jhlvvv"})


        #long term resident percentage
        long_term_resi= bs_object.find("div", {"class": "css-ibsnk8"})

        # extracting property price
        property_price = bs_object.find("div", {"data-testid": "listing-details__summary-title"})



        # dictionary to store temporary data
        basic_feature_dict = {}
        pattern1 = re.compile(r'>(.+)<.')

        # putting 'none' if recent_rented number is missing   
        if recent_rented_number is None:
            basic_feature_dict['recent_rented_number'] = None

        else:
            basic_feature_dict['recent_rented_number'] = pattern1.findall(str(recent_rented_number))[0]



        # putting 'none' if longterm resident number is missing   
        if long_term_resi is None:
            basic_feature_dict['long_term_resi_percentage'] = None

        else:
            basic_feature_dict['long_term_resi_percentage']  = float(str(long_term_resi.text[:-1]))/100



        # putting 'none' if neighbourhood family/single number is missing   
        if (neighb_family is None) |(neighb_single is None) :
            basic_feature_dict['neighb_family_single_ratio'] = None

        else:
            #to avoid zero division
            add_on = 0.000001
            basic_feature_dict['neighb_family_single_ratio'] = float(str(neighb_family.text[:-1]))/(float(str(neighb_single.text[:-1]))+add_on)




        # putting 'none' if street owener/renter number is missing   
        if (street_owner is None) | (street_renter is None):
            basic_feature_dict['street_renter_owner_ratio'] = None

        else:
            #to avoid zero division
            add_on = 0.000001
            basic_feature_dict['street_renter_owner_ratio'] = float(str(street_renter.text[:-1]))/(float(str(street_owner.text[:-1]))+add_on)



        # putting 'none' if price is missing    
        if property_price is None:
            basic_feature_dict['price'] = None

        else:
            basic_feature_dict['price'] = pattern1.findall(str(property_price))[0]
            
         # appending all the data into a list
        basic_feature_list.append(basic_feature_dict)
        
    return basic_feature_list
        
        

In [4]:
def list_to_df(basic_feature_list):
    
    recent_rented_number_list = []
    street_renter_owner_ratio_list = []
    neighb_family_single_ratio_list = []
    long_term_resi_percentage_list =[]
    price_list = []

    for row in basic_feature_list:
        # checking if the row cointains 'price'         
        if 'price' in row:
            price_list.append(row['price'])
        else:
            price_list.append(None)        

        # checking if the row cointains 'recent_rented_number'            
        if 'recent_rented_number' in row:
            recent_rented_number_list.append(row['recent_rented_number'])
        else:
            recent_rented_number_list.append(None)


        # checking if the row cointains 'long_term_resi_percentage'            
        if 'long_term_resi_percentage' in row:
            long_term_resi_percentage_list.append(row['long_term_resi_percentage'])
        else:
            long_term_resi_percentage_list.append(None)

        # checking if the row cointains 'street_renter_owner_ratio'            
        if 'street_renter_owner_ratio' in row:
            street_renter_owner_ratio_list.append(row['street_renter_owner_ratio'])
        else:
            street_renter_owner_ratio_list.append(None)

        # checking if the row cointains 'neighb_family_single_ratio'            
        if 'neighb_family_single_ratio' in row:
            neighb_family_single_ratio_list.append(row['neighb_family_single_ratio'])
        else:
            neighb_family_single_ratio.append(None)
        
        
    house_dict = {}    
    house_dict['Rent'] = price_list
    house_dict['street_renter_owner_ratio'] = street_renter_owner_ratio_list 
    house_dict['neighb_family_single_ratio'] = neighb_family_single_ratio_list
    house_dict['long_term_resi_percentage'] = long_term_resi_percentage_list
    house_df = pd.DataFrame(house_dict)
    
    house_df['Converted_Rent'] = [convert_rent(i) for i in price_list]
    house_df['Type'] = [type_flag(i) for i in price_list]
    return house_df



In [5]:
vic_postcodes = constants.postcodes['VIC_sample']

domain_links = set()

for i in tqdm(range(len(vic_postcodes))):
    current_links = domain_property_links(vic_postcodes[i])
    domain_links = domain_links | current_links
    
    print(len(current_links))

0it [00:00, ?it/s]/5 [00:00<?, ?it/s]
 20%|██        | 1/5 [00:00<00:01,  3.88it/s]

0


0it [00:00, ?it/s]
 40%|████      | 2/5 [00:00<00:00,  4.25it/s]

0


0it [00:00, ?it/s]
 60%|██████    | 3/5 [00:00<00:00,  4.26it/s]

0


100%|██████████| 1/1 [00:00<00:00,  4.31it/s]
 80%|████████  | 4/5 [00:01<00:00,  2.60it/s]

2


100%|██████████| 1/1 [00:00<00:00,  2.35it/s]
100%|██████████| 5/5 [00:02<00:00,  2.41it/s]

4





In [6]:
with open('2022-09-05.VIC.domain.pickle', 'wb') as file:
    pickle.dump(domain_links, file , protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
with open('2022-09-05.VIC.domain.pickle', 'rb') as file:
    links = pickle.load(file)

len(links)

6

In [13]:
df = pd.DataFrame(links)
df.to_csv(r'../data/links_sample.csv')

In [8]:
test_links = links

In [9]:
output_list = input_url_output_df(test_links)

In [10]:
df = list_to_df(output_list)

In [11]:
df

Unnamed: 0,Rent,street_renter_owner_ratio,neighb_family_single_ratio,long_term_resi_percentage,Converted_Rent,Type
0,600 p/w,0.538462,4.263158,0.52,600.0,
1,$550 p/w,0.052632,4.263158,0.52,550.0,
2,$565 per week,0.0,4.263158,0.52,565.0,1.0
3,$310.00 per week,,5.25,0.59,310.0,1.0
4,$310 Per week,,5.25,0.59,310.0,1.0
5,$510.00 per week,0.111111,4.263158,0.52,510.0,1.0
