# Import libraries

In [1]:
# import libraries
import json
import pandas as pd
import re
import time
import os
import geopandas as gpd
from IPython.display import display
import googlemaps
from datetime import datetime
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import geopy
from geopy.exc import GeocoderTimedOut

# Helper function

In [8]:
''' Get number of bed room, bath room, car park information from property dataset
    Put each property dataset in then
    Return room information
'''
def get_room_info(each_property):
    # find the number of bedroom, bathroom and parking area
    num_bed = re.findall(r'(\d|-|−)\s?Bed', each_property)
    num_bath = re.findall(r'(\d|-|−)\s?Bath', each_property)
    num_park = re.findall(r'(\d|-|−)\s?Park', each_property)

    # if no information, set the number to zero
    if (len(num_bed) == 0) or (num_bed[0].isdigit() == False):
        num_bed = ["0"]
    if (len(num_bath) == 0) or (num_bath[0].isdigit() == False):
        num_bath = ["0"]
    if (len(num_park) == 0) or (num_park[0].isdigit() == False):
        num_park = ["0"]

    # return room information
    return int(num_bed[0]), int(num_bath[0]), int(num_park[0])

In [9]:
''' Get rent of property in unit of per week
    Put each property data in
    Return cleaned rent in unit of per week
'''
def get_rent_info(each_property_rent):
    # get the price
    price = each_property_rent.lower()
    rent = re.findall(r"[$](\d+\.?,?\d*)", price)

    # find the annually rent
    if ("p.a." in price) or ("annually" in price) or ("pa." in price):
        each_property_rent = float(rent[0].replace(",", ""))/52

    # find the fortnightly rent
    elif "fortnightly" in price:
        each_property_rent = float(rent[0].replace(",", ""))/2

    # find the nightly rent
    elif "night" in price:
        if len(rent) == 0:
            rent = re.findall(r"(\d+\.?,?\d*)", price)
        each_property_rent = float(rent[0].replace(",", "")) * 7

    # find the monthly rent
    elif ("pcm" in price) or ("pm" in price) or ("month" in price):

        # check whether contain weekly rent
        if re.findall(r"(p([a-z]*|\/*)\s?w)", price) != "":
            if len(rent) == 0:
                rent = re.findall(r"(\d+\.?,?\d*)", price)
            each_property_rent = float(rent[0].replace(",", ""))
            
        # if only contain monthly rent, calculate the weekly rent
        else:
            if len(rent) == 0:
                rent = re.findall(r"(\d+\.?,?\d*)", price)
            rent = (float(rent[0].replace(",", "")) / 30) * 7
            each_property_rent = rent
        
    # find the weekly rent
    else:
        if len(rent) == 0:
            rent = re.findall(r"(\d+\.?,?\d*)", price)

            # set the rent without any price data as zero
            if len(rent) == 0:
                each_property_rent = 0
            else:
                each_property_rent = float(rent[0].replace(",", ""))
        
        elif len(rent) == 2:
            num1 = int(float(rent[0].replace(",", "")))
            num2 = int(float(rent[1].replace(",", "")))

            # check for normal weekly rent range
            if len(str(num1)) != len(str(num2)):
                # check whether the larger price is twice than the lower price, 
                # if yes, using lower price, if no, using the mean price
                if num1 * 2 < num2:
                    rent = num1
                else:
                    rent = (num1 + num2)/2
            else:
                rent = (num1 + num2)/2
            each_property_rent = rent

        else:
            each_property_rent = float(rent[0].replace(",", ""))

    return each_property_rent

In [10]:
''' Get floor of each property
    Put each property data in
    Return the property at which floor
'''
def get_floor_info(each_property_address):

    floor = -1

    # change the building that only have one floor only (such as Villa)
    if (each_property_address == "House") or (each_property_address == "Villa") \
        or (each_property_address == "New House & Land") or \
            (each_property_address == "Semi-Detached") or \
                (each_property_address == "Vacant land"):
        floor = 1

    elif (each_property_address == "Rural") or (each_property_address == "Farm"):
        floor = 1
    else:
        location = each_property_address.split(" ")[0]
        if "/" in location:  
            floor_str = location.split("/")
            # create temporary variables to store the floor information
            f=''
            fff=''
            # process the floor address is not digital.
            if floor_str[0].isdigit() == False: 
                # special case
                # ground floor
                if ("G" in floor_str[0]) or ("Ground" in floor_str[0]):
                    floor = 1
                if ("LG" in floor_str[0]):
                    floor = 1
                # switch the format of the floor address from various form to uniform digitals.
                if ("Level" in floor_str[0]) or ("L" in floor_str[0]):  
                    if len(re.findall('[0-9]', floor_str[0])) != 0:
                        floor = int(f.join(re.findall("[0-9]", floor_str[0])))
                elif ("&" in floor_str[0]):
                    floor = floor_str[0][0]
                elif ("." in floor_str[0]):
                    if ("AP" in floor_str[0]):
                        floor = floor_str[0].split(".")[0][-1] 
                    else:
                        if len(re.findall('[0-9]', floor_str[0].split('.')[0])) != 0:
                            floor = int(fff.join(re.findall('[0-9]', floor_str[0].split('.')[0])))
                else:
                    # Determine the floor number when there is a letter on the end of the floor address such as '1014B'
                    if re.search('(\\d+)[A-Za-z]', each_property_address) != None:
                        tmp_floor = re.search('(\\d+)[A-Za-z]', each_property_address).group()[:-1]
                        if len(tmp_floor) == 1 or len(tmp_floor) == 2:
                            floor = tmp_floor
                        elif len(tmp_floor) == 3 and int(tmp_floor[1]) > 1:
                            floor = tmp_floor[:2]
                        elif len(tmp_floor) == 3 and int(tmp_floor[1]) <= 1:
                            floor = tmp_floor[0]
                        elif len(tmp_floor) == 4:
                            floor = tmp_floor[:2]       
                                        
            # determine which number is the floor. for example: 319 is 3th floor and 3109 is the 31th floor.
            else:
                if len(floor_str[0]) == 1 or len(floor_str[0]) == 2:
                    floor = floor_str[0]
                elif len(floor_str[0]) == 3 and int(floor_str[0][1]) > 1:
                    floor = floor_str[0][:2]
                elif len(floor_str[0]) == 3 and int(floor_str[0][1]) <= 1:
                    floor = floor_str[0][0]
                elif len(floor_str[0]) == 4:
                    floor = floor_str[0][:2]

        # process the address without '/'
        else: 
            # check the floors that have "," on the tail.       
            if ',' in each_property_address:
                floor_str111 = each_property_address.split(",")[0]
                if len(floor_str111.split(' '))>1:
                    FL = floor_str111.split(' ')[0]
                    FF = floor_str111.split(' ')[1]
                    if ("Level" in FL) or ("Lvl" in FL):  
                        floor = FF
            # Determine the floor number when there is a letter on the end of the floor address such as '1014B'
            if re.search('(\\d+)[A-Za-z]', each_property_address) != None:
                tmp_floor = re.search('(\\d+)[A-Za-z]', each_property_address).group()[:-1]
                if len(tmp_floor) == 1 or len(tmp_floor) == 2:
                    floor = tmp_floor
                elif len(tmp_floor) == 3 and int(tmp_floor[1]) > 1:
                    floor = tmp_floor[:2]
                elif len(tmp_floor) == 3 and int(tmp_floor[1]) <= 1:
                    floor = tmp_floor[0]
                elif len(tmp_floor) == 4:
                    floor = tmp_floor[:2]
    return floor

# Clean property features

In [2]:
# read in property json file
property_df = pd.read_json("../data/raw/property.json").transpose().reset_index()

# rename columns
property_df.columns = ['url', 'address', 'rent', 'features', 'type', 'furnitured', 'pool', 'gym', 'coordinates','desc']

# drop url column
property_df = property_df.drop(columns = ['url'])

In [3]:
# drop duplicate properties
unique_property_df = property_df.iloc[property_df.astype(str).drop_duplicates(subset=None, keep='first', inplace=False).index]

# index for graph
# property_df["index"] = property_df.index.to_list()

In [4]:
'''Get latitue from coordinate'''
def latitude(coord):
    return coord[0]

'''Get longtitude from coordinate'''
def longtitude(coord):
    return coord[1]

In [5]:
# put latitude and longititude information in dataframe
property_df["Latitude"] = property_df['coordinates'].apply(latitude)
property_df["Longtitude"] = property_df['coordinates'].apply(longtitude)

In [6]:
# check the structure of property data
unique_property_df.head()

Unnamed: 0,address,rent,features,type,furnitured,pool,gym,coordinates,desc
0,1414/218-228 A'Beckett Street Melbourne VIC 3000,$400 Per Week,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8102832, 144.9566691]",South Melbourne Leasing
1,11a/131 Lonsdale Sreet Melbourne VIC 3000,$350 per week,1 Bed1 Bath− Parking,Studio,Yes,No,No,"[-37.810779, 144.9685513]",Wimpie Santoso
2,911/408 Lonsdale Street Melbourne VIC 3000,$330 per week,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8125979, 144.9604012]",Ender Gok
3,918/422 Collins St Melbourne VIC 3000,$600 Per week fully furnished,2 Beds1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8170971, 144.9601487]",Precinct Leasing
4,602/118 Franklin Street Melbourne VIC 3000,$330,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8082052, 144.9589035]",Manuel Flores


In [7]:
# check and drop duplicate properties
unique_property_df.shape

(15075, 9)

In [8]:
# get postcode for all properties from the address
unique_property_df["postcode"] = unique_property_df["address"].\
                                apply(lambda x: int(x[-4:]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_property_df["postcode"] = unique_property_df["address"].\


## Combine external dataset and property data

In [9]:
# reverse the coordinates to address
def reverse_coord_geopy(coordinates):
    #access server
    geopy.geocoders.options.default_user_agent = "my"
    geolocator = Nominatim(user_agent="my")
    try:
        reverse = RateLimiter(geolocator.reverse, min_delay_seconds=2)
        location=reverse(coordinates,language='en',exactly_one=True, timeout=60)  #find location
        address = location.raw["address"]   #extract address
        return address
    except GeocoderTimedOut as e:    #if meet time out, wait for 2 second and reverse again
        time.sleep(2)
        return reverse_coord_geopy(coordinates)
    
def generate_suburb_postcode_geopy(df):
    #reverse
    df["Reverse_Coord"]= df["coordinates"].apply(reverse_coord_geopy)
    return df

In [11]:
# reverse geolocation and get address
property_address = pd.DataFrame()
# reverse the coordinates by small size due to request limitation
i = 0
while i < 16000:
    if i < 15000:
        property_address_i = generate_suburb_postcode_geopy(property_df.copy().iloc[i:i+1000])
        property_address = pd.concat([property_address,property_address_i],ignore_index=True)
    else:
        property_address_i = generate_suburb_postcode_geopy(property_df.copy().iloc[i:])
        property_address = pd.concat([property_address,property_address_i],ignore_index=True)
    print(i)
    i+=1000
# save GNR Data
path = os.getcwd().replace("notebooks","") + "data/curated/"
property_address.to_csv(path+'property_suburb.csv',index=False)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000


# Data cleaning

In [2]:
# read in combined dataset
property_data = pd.read_csv("../data/curated/property_suburb.csv")
property_data.head()

Unnamed: 0,address,rent,features,type,furnitured,pool,gym,coordinates,desc,Latitude,Longtitude,Reverse_Coord
0,1414/218-228 A'Beckett Street Melbourne VIC 3000,$400 Per Week,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8102832, 144.9566691]",South Melbourne Leasing,-37.810283,144.956669,"{'building': 'The Istana', 'road': ""A'Beckett ..."
1,11a/131 Lonsdale Sreet Melbourne VIC 3000,$350 per week,1 Bed1 Bath− Parking,Studio,Yes,No,No,"[-37.810779, 144.9685513]",Wimpie Santoso,-37.810779,144.968551,"{'building': 'Ozford House', 'road': 'Lacey Pl..."
2,911/408 Lonsdale Street Melbourne VIC 3000,$330 per week,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8125979, 144.9604012]",Ender Gok,-37.812598,144.960401,"{'highway': 'Hardware Lane/Lonsdale Street', '..."
3,918/422 Collins St Melbourne VIC 3000,$600 Per week fully furnished,2 Beds1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8170971, 144.9601487]",Precinct Leasing,-37.817097,144.960149,"{'house_number': '422', 'road': 'Collins Stree..."
4,602/118 Franklin Street Melbourne VIC 3000,$330,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8082052, 144.9589035]",Manuel Flores,-37.808205,144.958903,"{'house_number': '118', 'road': 'Franklin Stre..."


In [3]:
# drop the property with empty features
property_data = property_data[property_data["features"] != ""].reset_index(drop=True)
property_data = property_data.dropna(how='any').reset_index(drop=True)

# check the size of the data
property_data.shape

In [5]:
# initiate rent dataframe
rent_df_new = pd.DataFrame()
rent_df_new["rent_weekly"] = property_data["rent"]

Unnamed: 0,rent_weekly
0,$400 Per Week
1,$350 per week
2,$330 per week
3,$600 Per week fully furnished
4,$330


In [6]:
# initiate floor dataframe
floor_df_new = pd.DataFrame()

# initial number of the floor is -1
floor = []
for i in range(len(property_data["address"])):
    floor.append('-1')

# create the floor feature
floor_df_new["floor"] = floor

Unnamed: 0,floor
0,-1
1,-1
2,-1
3,-1
4,-1


In [11]:
# ready to save room information
num_bed_list = []
num_bath_list = []
num_park_list = []

for i in range(len(property_data)):

    # for debug
    # print(property_data['features'][i])

    # get room information from property "features"
    num_bed, num_bath, num_park = get_room_info(property_data['features'][i])
    num_bed_list.append(num_bed)
    num_bath_list.append(num_bath)
    num_park_list.append(num_park)

    # get rent per week from property "rent"
    cleaned_rent_info = get_rent_info(property_data['rent'][i])
    rent_df_new["rent_weekly"][i] = cleaned_rent_info

    # get floor  from property "address"
    cleaned_floor_info = get_floor_info(property_data['address'][i])
    floor_df_new["floor"][i] = cleaned_floor_info


In [12]:
# add room information in property dataset
property_data["num_bed"] = num_bed_list
property_data["num_bath"] = num_bath_list
property_data["num_car_park"] = num_park_list


# drop the property with no bedroom
property_data = property_data[property_data["num_bed"] != 0].reset_index(drop=True)

In [13]:
# transfer the rent data type to integer
rent_df_new = rent_df_new.astype(int)

# add the processed rent data into the whole property data set
property_data["rent_weekly"] = rent_df_new["rent_weekly"]

# clean the zero rent
property_data = property_data[property_data["rent_weekly"] != 0]
property_data = property_data.reset_index()

In [14]:
# transfer the floor data type to integer
floor_df_new = floor_df_new.astype(int)

# add the processed floor data into the whole property data set
property_data["floor"] = floor_df_new["floor"]

In [15]:
property_data.head()

Unnamed: 0,address,rent,features,type,furnitured,pool,gym,coordinates,desc,Latitude,Longtitude,Reverse_Coord,num_bed,num_bath,num_car_park,rent_weekly,floor
0,1414/218-228 A'Beckett Street Melbourne VIC 3000,$400 Per Week,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8102832, 144.9566691]",South Melbourne Leasing,-37.810283,144.956669,"{'building': 'The Istana', 'road': ""A'Beckett ...",1,1,0,400,14
1,11a/131 Lonsdale Sreet Melbourne VIC 3000,$350 per week,1 Bed1 Bath− Parking,Studio,Yes,No,No,"[-37.810779, 144.9685513]",Wimpie Santoso,-37.810779,144.968551,"{'building': 'Ozford House', 'road': 'Lacey Pl...",1,1,0,350,11
2,911/408 Lonsdale Street Melbourne VIC 3000,$330 per week,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8125979, 144.9604012]",Ender Gok,-37.812598,144.960401,"{'highway': 'Hardware Lane/Lonsdale Street', '...",1,1,0,330,9
3,918/422 Collins St Melbourne VIC 3000,$600 Per week fully furnished,2 Beds1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8170971, 144.9601487]",Precinct Leasing,-37.817097,144.960149,"{'house_number': '422', 'road': 'Collins Stree...",2,1,0,600,9
4,602/118 Franklin Street Melbourne VIC 3000,$330,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8082052, 144.9589035]",Manuel Flores,-37.808205,144.958903,"{'house_number': '118', 'road': 'Franklin Stre...",1,1,0,330,6


In [None]:
# save the cleaned property dataframe to csv
property_data.to_csv('../data/curated/cleaned_property_data.csv', index = None)