In [4]:
import pandas as pd
import re
import numpy as np
from ast import literal_eval

In [5]:
# Load the scraped rental listings (primary dataset)
rental_df = pd.read_csv("../data/curated/All_Houses_Scraped.csv")

# Build the list of Victorian suburb names: lowercased, with any
# parenthesised qualifier removed and surrounding whitespace trimmed
state_suburbs_df = pd.read_csv("../data/raw/georef-australia-state-suburb.zip")
is_victoria = state_suburbs_df["Official Name State"] == "Victoria"
state_suburbs_df = state_suburbs_df[is_victoria]
suburbs = [
    re.sub(r'\(.*?\)', '', official_name.lower()).strip()
    for official_name in state_suburbs_df["Official Name Suburb"]
]

# Additional names appended to the official list
extra_suburbs = ["preston west", "prahran east", "sanctuary lakes", "mallacoota"]
suburbs.extend(extra_suburbs)
    
# Convert address to a list of space-separated tokens
rental_df["name"] = rental_df["name"].str.split(' ')

# Create new column for postcodes (the last token of the address).
# The .str accessor yields NaN for a missing address instead of raising.
rental_df["postcode"] = rental_df["name"].str[-1]


def _match_suburb(tokens, known_suburbs):
    """Return the known suburb name that sits just before the last two tokens.

    The last two tokens are skipped: the final one is used as the postcode,
    and the one before it is presumably the state abbreviation (TODO confirm
    against the scraped address format). Candidates made of three, two and
    one word(s) are tried in that order, so a multi-word suburb wins over a
    shorter suburb that happens to share its trailing word(s).

    Args:
        tokens: List of address tokens, or a missing value.
        known_suburbs: Collection of lowercase suburb names.

    Returns:
        The matched lowercase suburb name, or NaN when the address is
        missing, too short, or contains no known suburb.
    """
    if not isinstance(tokens, list):
        return np.nan
    for n_words in (3, 2, 1):
        # Slicing never raises on short addresses; it just yields fewer words.
        candidate = ' '.join(tokens[-(n_words + 2):-2]).lower()
        if candidate and candidate in known_suburbs:
            return candidate
    return np.nan


# A set gives constant-time membership tests for every candidate.
known_suburbs = set(suburbs)

# Always creates the "suburb" column, independent of the frame's index labels.
rental_df["suburb"] = rental_df["name"].map(
    lambda tokens: _match_suburb(tokens, known_suburbs)
)

# Drop rows containing at least one null value in primary dataset
# (this includes rows whose suburb could not be matched)
rental_df = rental_df.dropna(axis=0, how="any")

In [6]:
# Treat the literal "[]" (an empty room list) as missing, then discard
# those rows and renumber the index from zero
rooms_or_nan = rental_df["rooms"].replace("[]", np.nan)
rental_df["rooms"] = rooms_or_nan
rental_df = rental_df.dropna(how="any", subset=["rooms"])
rental_df = rental_df.reset_index(drop=True)

# Set up cases for extracting room information.
# Each pattern captures the number that precedes a room keyword; between the
# number and the keyword it tolerates whitespace, digits and the characters
# "-", "+" and "/".
info_one = '|'.join(["bed"])
info_two = '|'.join(["bath"])
info_three = '|'.join(["park"])
# Raw string: "\d" inside a normal string literal is an invalid escape
# sequence and triggers a warning on current Python versions.
numbers = r'\d+[.,]?\d*'
case_one = fr'({numbers})(?:[\s\d\-\+\/]*)(?:{info_one})'
case_two = fr'({numbers})(?:[\s\d\-\+\/]*)(?:{info_two})'
case_three = fr'({numbers})(?:[\s\d\-\+\/]*)(?:{info_three})'
pattern_one = re.compile(case_one)      # bedrooms
pattern_two = re.compile(case_two)      # bathrooms
pattern_three = re.compile(case_three)  # parking spaces

# Create columns for the number of bedrooms, bathrooms and parking spaces.
# Patterns are tried in this order for every room description; the first one
# that matches decides which column the captured number goes to.
room_patterns = (
    ("bedrooms", pattern_one),
    ("bathrooms", pattern_two),
    ("parking_spaces", pattern_three),
)

parsed_rooms = []
room_counts = {column: [] for column, _ in room_patterns}
for raw_rooms in rental_df["rooms"]:
    # The column holds the string form of a list; accept an already-parsed
    # list as well so the cell can be re-run without failing.
    room_list = literal_eval(raw_rooms) if isinstance(raw_rooms, str) else raw_rooms
    room_items = [item.lower() for item in room_list]
    parsed_rooms.append(room_items)

    row_counts = dict.fromkeys(room_counts, np.nan)
    for item in room_items:
        for column, pattern in room_patterns:
            match = pattern.search(item)
            if match:
                # Store the captured number text (first match only) rather
                # than the list returned by findall, which cannot be written
                # to a single cell when it holds zero or several entries.
                row_counts[column] = match.group(1)
                break
        # Descriptions matching none of the patterns are ignored.
    for column, value in row_counts.items():
        room_counts[column].append(value)

# Assign whole columns at once instead of chained `df[col].loc[i] = ...`
# writes, which are not guaranteed to reach the DataFrame.
rental_df["rooms"] = pd.Series(parsed_rooms, index=rental_df.index, dtype=object)
for column, values in room_counts.items():
    rental_df[column] = pd.Series(values, index=rental_df.index, dtype=object)

In [8]:
# Write the feature-engineered listings to disk without the index column
output_path = "../data/curated/Feature_Engineered_Scraped_Dataset.csv"
rental_df.to_csv(output_path, index=False)