# Import libraries

In [61]:
# import libraries
import json
import pandas as pd
import re
import time
import os
import geopandas as gpd
from IPython.display import display
from datetime import datetime


# Helper function

In [62]:
def get_room_info(each_property):
    ''' Get number of bed room, bath room, car park information from property dataset

    Args:
        property dataset
    
    Return:
        room information
    '''
    
    # find the number of bedroom, bathroom and parking area
    num_bed = re.findall(r'(\d|-|−)\s?Bed', each_property)
    num_bath = re.findall(r'(\d|-|−)\s?Bath', each_property)
    num_park = re.findall(r'(\d|-|−)\s?Park', each_property)

    # if no information, set the number to zero
    if (len(num_bed) == 0) or (num_bed[0].isdigit() == False):
        num_bed = ["0"]
    if (len(num_bath) == 0) or (num_bath[0].isdigit() == False):
        num_bath = ["0"]
    if (len(num_park) == 0) or (num_park[0].isdigit() == False):
        num_park = ["0"]

    # return room information
    return int(num_bed[0]), int(num_bath[0]), int(num_park[0])

In [63]:
def get_rent_info(each_property_rent):
    ''' Get rent of property in unit of per week

    Args:
        one property data

    Return:
        cleaned rent and the unit is per week
    '''
    # get the price
    price = each_property_rent.lower()
    rent = re.findall(r"[$](\d+\.?,?\d*)", price)

    # find the annually rent
    if ("p.a." in price) or ("annually" in price) or ("pa." in price):
        each_property_rent = float(rent[0].replace(",", ""))/52

    # find the fortnightly rent
    elif "fortnightly" in price:
        each_property_rent = float(rent[0].replace(",", ""))/2

    # find the nightly rent
    elif "night" in price:
        if len(rent) == 0:
            rent = re.findall(r"(\d+\.?,?\d*)", price)
        each_property_rent = float(rent[0].replace(",", "")) * 7

    # find the monthly rent
    elif ("pcm" in price) or ("pm" in price) or ("month" in price):

        # check whether contain weekly rent
        if re.findall(r"(p([a-z]*|\/*)\s?w)", price) != "":
            if len(rent) == 0:
                rent = re.findall(r"(\d+\.?,?\d*)", price)
            each_property_rent = float(rent[0].replace(",", ""))
            
        # if only contain monthly rent, calculate the weekly rent
        else:
            if len(rent) == 0:
                rent = re.findall(r"(\d+\.?,?\d*)", price)
            rent = (float(rent[0].replace(",", "")) / 30) * 7
            each_property_rent = rent
        
    # find the weekly rent
    else:
        if len(rent) == 0:
            rent = re.findall(r"(\d+\.?,?\d*)", price)

            # set the rent without any price data as zero
            if len(rent) == 0:
                each_property_rent = 0
            else:
                each_property_rent = float(rent[0].replace(",", ""))
        
        elif len(rent) == 2:
            num1 = int(float(rent[0].replace(",", "")))
            num2 = int(float(rent[1].replace(",", "")))

            # check for normal weekly rent range
            if len(str(num1)) != len(str(num2)):

                # check whether the larger price is twice than the lower price, 
                # if yes, using lower price, otherwise using the mean price
                if num1 * 2 < num2:
                    rent = num1
                else:
                    rent = (num1 + num2)/2
            else:
                rent = (num1 + num2)/2
            each_property_rent = rent

        else:
            each_property_rent = float(rent[0].replace(",", ""))

    # return cleaned rent
    return each_property_rent

In [64]:
def get_floor_info(i, property_df):
    ''' Get floor of each property

    Args:
        one property data
        
    Return:
        the floor of property
    '''

    # change the building that only have one floor only (such as Villa)
    if (property_df["type"][i] == "House") or (property_df["type"][i] == "Villa")\
        or (property_df["type"][i] == "New House & Land") or \
            (property_df["type"][i] == "Semi-Detached") or\
            (property_df["type"][i] == "Vacant land"):
        property_df.loc[[i],['floor']]  = 1

    # get floor for house
    elif (property_df["type"][i] == "Rural") or (property_df["type"][i] == "Farm"):
        property_df.loc[[i],['floor']]  = 1

    # for apartment
    else:

        location = property_df["address"][i].split(" ")[0]
        if "/" in location:  
            floor_str = location.split("/")

            # create temporary variables to store the floor information
            f=''
            fff=''

            # process the floor address is not digital.
            if floor_str[0].isdigit() == False: 

                # special case
                # ground floor
                if ("G" in floor_str[0]) or ("Ground" in floor_str[0]):
                    property_df.loc[[i],['floor']]  = 1
                if ("LG" in floor_str[0]):
                    property_df.loc[[i],['floor']]  = 1

                # switch the format of the floor address from various form to uniform digitals.
                if ("Level" in floor_str[0]) or ("L" in floor_str[0]):  
                    if len(re.findall('[0-9]', floor_str[0])) != 0:
                        property_df.loc[[i],['floor']] = int(f.join(re.\
                            findall("[0-9]", floor_str[0])))
                elif ("&" in floor_str[0]):
                    property_df.loc[[i],['floor']]  = floor_str[0][0]
                elif ("." in floor_str[0]):
                    if ("AP" in floor_str[0]):
                        property_df.loc[[i],['floor']]  = floor_str[0].split(".")[0][-1] 
                    else:
                        if len(re.findall('[0-9]', floor_str[0].split('.')[0])) != 0:
                            property_df.loc[[i],['floor']]  = int(\
                                fff.join(re.findall('[0-9]', floor_str[0].split('.')[0])))
                else:

                    # Determine the floor number when there is a letter on the end of 
                    # the floor address such as '1014B'
                    if re.search('(\\d+)[A-Za-z]',property_df["address"][i]) != None:
                        tmp_floor = re.search('(\\d+)[A-Za-z]',\
                            property_df["address"][i]).group()[:-1]
                        if len(tmp_floor) == 1 or len(tmp_floor) == 2:
                            property_df.loc[[i],['floor']] = tmp_floor
                        elif len(tmp_floor) == 3 and int(tmp_floor[1]) > 1:
                            property_df.loc[[i],['floor']] = tmp_floor[:2]
                        elif len(tmp_floor) == 3 and int(tmp_floor[1]) <= 1:
                            property_df.loc[[i],['floor']] = tmp_floor[0]
                        elif len(tmp_floor) == 4:
                            property_df.loc[[i],['floor']] = tmp_floor[:2]       
                                        
            # determine which number is the floor. for example: 319 is 3th 
            # floor and 3109 is the 31th floor.
            else:
                if len(floor_str[0]) == 1 or len(floor_str[0]) == 2:
                    property_df.loc[[i],['floor']]  = floor_str[0]
                elif len(floor_str[0]) == 3 and int(floor_str[0][1]) > 1:
                    property_df.loc[[i],['floor']]  = floor_str[0][:2]
                elif len(floor_str[0]) == 3 and int(floor_str[0][1]) <= 1:
                    property_df.loc[[i],['floor']]  = floor_str[0][0]
                elif len(floor_str[0]) == 4:
                    property_df.loc[[i],['floor']]  = floor_str[0][:2]

        # process the address without '/'
        else: 

            # check the floors that have "," on the tail.       
            if ',' in property_df["address"][i]:
                floor_str111 = property_df["address"][i].split(",")[0]
                if len(floor_str111.split(' '))>1:
                    FL = floor_str111.split(' ')[0]
                    FF = floor_str111.split(' ')[1]
                    if ("Level" in FL) or ("Lvl" in FL):  
                        property_df.loc[[i],['floor']] = FF

            # Determine the floor number when there is a letter on the end of the
            #  floor address such as '1014B'
            if re.search('(\\d+)[A-Za-z]',property_df["address"][i]) != None:
                tmp_floor = re.search('(\\d+)[A-Za-z]',property_df["address"][i]).group()[:-1]
                if len(tmp_floor) == 1 or len(tmp_floor) == 2:
                    property_df.loc[[i],['floor']]  = tmp_floor
                elif len(tmp_floor) == 3 and int(tmp_floor[1]) > 1:
                    property_df.loc[[i],['floor']]  = tmp_floor[:2]
                elif len(tmp_floor) == 3 and int(tmp_floor[1]) <= 1:
                    property_df.loc[[i],['floor']]  = tmp_floor[0]
                elif len(tmp_floor) == 4:
                    property_df.loc[[i],['floor']]  = tmp_floor[:2]
    
    # return the dataframe with cleaned floor
    return property_df

# Clean property features

In [65]:
# read in property json file
property_df = pd.read_json("../data/raw/property.json").transpose().reset_index()

# rename columns
property_df.columns = ['url', 'address', 'rent', 'features', 'type', \
    'furnitured', 'pool', 'gym', 'coordinates','desc']

# drop url column
property_df = property_df.drop(columns = ['url'])

In [66]:
# drop duplicate properties
unique_property_df = property_df.iloc[property_df.astype(str)\
    .drop_duplicates(subset=None, keep='first', inplace=False).index]

In [67]:
def latitude(coord):
    ''' Get latitue from coordinate

    Args:
        coordinate
    
    Return:
        latitude
    '''

    # return latidude
    return coord[0]

def longtitude(coord):
    ''' Get longtitude from coordinate

    Args:
        coordinate

    Return:
        longitude
    '''
    
    # return longtitude
    return coord[1]

In [68]:
# put latitude and longititude information in dataframe
property_df["Latitude"] = property_df['coordinates'].apply(latitude)
property_df["Longtitude"] = property_df['coordinates'].apply(longtitude)

In [69]:
# check the structure of property data
unique_property_df.head()

Unnamed: 0,address,rent,features,type,furnitured,pool,gym,coordinates,desc
0,1414/218-228 A'Beckett Street Melbourne VIC 3000,$400 Per Week,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8102832, 144.9566691]",South Melbourne Leasing
1,11a/131 Lonsdale Sreet Melbourne VIC 3000,$350 per week,1 Bed1 Bath− Parking,Studio,Yes,No,No,"[-37.810779, 144.9685513]",Wimpie Santoso
2,911/408 Lonsdale Street Melbourne VIC 3000,$330 per week,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8125979, 144.9604012]",Ender Gok
3,918/422 Collins St Melbourne VIC 3000,$600 Per week fully furnished,2 Beds1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8170971, 144.9601487]",Precinct Leasing
4,602/118 Franklin Street Melbourne VIC 3000,$330,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8082052, 144.9589035]",Manuel Flores


In [70]:
# check and drop duplicate properties
unique_property_df.shape

(15075, 9)

In [71]:
# get postcode for all properties from the address
unique_property_df["postcode"] = unique_property_df["address"].\
                                apply(lambda x: int(x[-4:]))

# save as csv
unique_property_df.to_csv("../data/curated/property.csv", index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_property_df["postcode"] = unique_property_df["address"].\


# Data cleaning

In [72]:
# read in combined dataset
property_data = pd.read_csv("../data/curated/property.csv")
property_data.head()

Unnamed: 0,address,rent,features,type,furnitured,pool,gym,coordinates,desc,postcode
0,1414/218-228 A'Beckett Street Melbourne VIC 3000,$400 Per Week,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8102832, 144.9566691]",South Melbourne Leasing,3000
1,11a/131 Lonsdale Sreet Melbourne VIC 3000,$350 per week,1 Bed1 Bath− Parking,Studio,Yes,No,No,"[-37.810779, 144.9685513]",Wimpie Santoso,3000
2,911/408 Lonsdale Street Melbourne VIC 3000,$330 per week,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8125979, 144.9604012]",Ender Gok,3000
3,918/422 Collins St Melbourne VIC 3000,$600 Per week fully furnished,2 Beds1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8170971, 144.9601487]",Precinct Leasing,3000
4,602/118 Franklin Street Melbourne VIC 3000,$330,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8082052, 144.9589035]",Manuel Flores,3000


In [73]:
# drop the property with empty features
property_data = property_data[property_data["features"] != ""].reset_index(drop=True)
property_data = property_data.dropna(how='any').reset_index(drop=True)

# check the size of the data
property_data.shape

(15041, 10)

In [74]:
# initiate rent dataframe
rent_df_new = pd.DataFrame()

# initiate rent_weekly column
rent_df_new["rent_weekly"] = property_data["rent"]

In [75]:
# initiate floor dataframe
floor_df_new = property_data.copy()

# initial number of the floor is -1
floor = []
for i in range(len(property_data["address"])):
    floor.append('-1')

# create the floor feature
floor_df_new["floor"] = floor

In [76]:
# ready to save room information
num_bed_list = []
num_bath_list = []
num_park_list = []

for i in range(len(property_data)):

    # for debug
    # print(property_data['features'][i])

    # get room information from property "features"
    num_bed, num_bath, num_park = get_room_info(property_data['features'][i])

    # append features in the list
    num_bed_list.append(num_bed)
    num_bath_list.append(num_bath)
    num_park_list.append(num_park)

    # get rent per week from property "rent"
    cleaned_rent_info = get_rent_info(property_data['rent'][i])
    rent_df_new["rent_weekly"][i] = cleaned_rent_info

    # get floor  from property "address"
    floor_df_new = get_floor_info(i, floor_df_new)


In [77]:
# update property data floor with cleaned floor
property_data['floor'] = floor_df_new['floor']

In [79]:
# add room information in property dataset
property_data["num_bed"] = num_bed_list
property_data["num_bath"] = num_bath_list
property_data["num_car_park"] = num_park_list


# drop the property with no bedroom
property_data = property_data[property_data["num_bed"] != 0].reset_index(drop=True)

In [80]:
# transfer the rent data type to integer
rent_df_new = rent_df_new.astype(int)

# add the processed rent data into the whole property data set
property_data["rent_weekly"] = rent_df_new["rent_weekly"]

# clean the zero rent
property_data = property_data[property_data["rent_weekly"] != 0]

# reset index of porperty data
property_data = property_data.reset_index()

In [None]:
# show proerty data
property_data.head()

In [None]:
# save the cleaned property dataframe to csv
property_data.to_csv('../data/curated/cleaned_property_data.csv', index = None)